text_alignment 0.6 → 0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/align_annotations +4 -19
- data/lib/text_alignment/mixed_alignment.rb +74 -4
- data/lib/text_alignment/text_alignment.rb +199 -168
- data/lib/text_alignment/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 40796a906cbc366312741d3b410aaee15d28b778980674bb79d19ea1b6364d02
|
4
|
+
data.tar.gz: eb0121440bc005232b3d35eb1867b46004eb14c894b630f2ef49d311c4d99a26
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e29037021763c9d4f581b278b72d319d364431e8abad48b45f21d68df102928eec993bbba582383a8a01b6ba7855f3542d2cc620ffa055a90aea08c2f5371117
|
7
|
+
data.tar.gz: 63c0e5b712b89012082197d3c4400cd705283e6d908f40003a399cb2efdc2b3aa4f6e445eb90e1ab4d7d39a9d14421881f945e0483c14ec52c1af7176ca07e81
|
data/bin/align_annotations
CHANGED
@@ -105,9 +105,7 @@ lost_annotations = []
|
|
105
105
|
target_annotations = if source_annotations.class == Array
|
106
106
|
align_mdoc(source_annotations, {text: target_text})
|
107
107
|
else
|
108
|
-
alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text)
|
109
|
-
|
110
|
-
# pp alignment
|
108
|
+
alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text, source_annotations[:denotations])
|
111
109
|
|
112
110
|
# verification
|
113
111
|
# source_text = source_annotations[:text]
|
@@ -142,22 +140,7 @@ else
|
|
142
140
|
puts "====="
|
143
141
|
# exit
|
144
142
|
|
145
|
-
# verification of source denotations
|
146
|
-
puts "[Invalid source denotations]"
|
147
|
-
source_annotations[:denotations] do |d|
|
148
|
-
p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < source_text.length
|
149
|
-
end
|
150
|
-
puts "====="
|
151
|
-
puts
|
152
|
-
|
153
143
|
denotations = alignment.transform_hdenotations(source_annotations[:denotations])
|
154
|
-
puts "[Invalid transformation]"
|
155
|
-
denotations.each do |d|
|
156
|
-
p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < target_text.length
|
157
|
-
end
|
158
|
-
puts "====="
|
159
|
-
puts
|
160
|
-
|
161
144
|
lost_annotations += alignment.lost_annotations if alignment.lost_annotations
|
162
145
|
|
163
146
|
source_annotations.merge({text:target_text, denotations:denotations})
|
@@ -194,7 +177,9 @@ warn "denotations:\t#{target_annotations[:denotations].nil? ? 0 : target_annotat
|
|
194
177
|
|
195
178
|
if lost_annotations
|
196
179
|
warn "\n[lost annotations]"
|
197
|
-
|
180
|
+
lost_annotations.each do |a|
|
181
|
+
p a
|
182
|
+
end
|
198
183
|
end
|
199
184
|
|
200
185
|
#puts target_annotations.to_json
|
@@ -17,9 +17,10 @@ class TextAlignment::MixedAlignment
|
|
17
17
|
attr_reader :similarity
|
18
18
|
attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final
|
19
19
|
|
20
|
-
def initialize(
|
21
|
-
raise ArgumentError, "nil string" if
|
22
|
-
|
20
|
+
def initialize(_str1, _str2)
|
21
|
+
raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
|
22
|
+
|
23
|
+
str1, str2, mappings = string_preprocessing(_str1, _str2)
|
23
24
|
|
24
25
|
_compute_mixed_alignment(str1, str2, mappings)
|
25
26
|
end
|
@@ -62,7 +63,7 @@ class TextAlignment::MixedAlignment
|
|
62
63
|
end
|
63
64
|
|
64
65
|
cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
|
65
|
-
@similarity =
|
66
|
+
@similarity = compute_similarity(str1, str2, @sdiff)
|
66
67
|
@str1_match_initial = cmp.str1_match_initial
|
67
68
|
@str1_match_final = cmp.str1_match_final
|
68
69
|
@str2_match_initial = cmp.str2_match_initial
|
@@ -137,4 +138,73 @@ class TextAlignment::MixedAlignment
|
|
137
138
|
@position_map_begin = posmap_begin.sort.to_h
|
138
139
|
@position_map_end = posmap_end.sort.to_h
|
139
140
|
end
|
141
|
+
|
142
|
+
private
|
143
|
+
|
144
|
+
def string_preprocessing(_str1, _str2)
|
145
|
+
str1 = _str1.dup
|
146
|
+
str2 = _str2.dup
|
147
|
+
mappings = TextAlignment::MAPPINGS.dup
|
148
|
+
|
149
|
+
## single character mappings
|
150
|
+
character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
|
151
|
+
characters_from = character_mappings.collect{|m| m[0]}.join
|
152
|
+
characters_to = character_mappings.collect{|m| m[1]}.join
|
153
|
+
characters_to.gsub!(/-/, '\-')
|
154
|
+
|
155
|
+
str1.tr!(characters_from, characters_to)
|
156
|
+
str2.tr!(characters_from, characters_to)
|
157
|
+
|
158
|
+
mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
|
159
|
+
|
160
|
+
## long to one character mappings
|
161
|
+
pletters = TextAlignment::PADDING_LETTERS
|
162
|
+
|
163
|
+
# find the padding letter for str1
|
164
|
+
@padding_letter1 = begin
|
165
|
+
i = pletters.index{|l| str2.index(l).nil?}
|
166
|
+
raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
|
167
|
+
TextAlignment::PADDING_LETTERS[i]
|
168
|
+
end
|
169
|
+
|
170
|
+
# find the padding letter for str2
|
171
|
+
@padding_letter2 = begin
|
172
|
+
i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
|
173
|
+
raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
|
174
|
+
TextAlignment::PADDING_LETTERS[i]
|
175
|
+
end
|
176
|
+
|
177
|
+
# ASCII foldings
|
178
|
+
ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
|
179
|
+
ascii_foldings.each do |f|
|
180
|
+
from = f[1]
|
181
|
+
|
182
|
+
if str2.index(f[0])
|
183
|
+
to = f[0] + (@padding_letter1 * (f[1].length - 1))
|
184
|
+
str1.gsub!(from, to)
|
185
|
+
end
|
186
|
+
|
187
|
+
if str1.index(f[0])
|
188
|
+
to = f[0] + (@padding_letter2 * (f[1].length - 1))
|
189
|
+
str2.gsub!(from, to)
|
190
|
+
end
|
191
|
+
end
|
192
|
+
mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
|
193
|
+
|
194
|
+
[str1, str2, mappings]
|
195
|
+
end
|
196
|
+
|
197
|
+
def compute_similarity(_s1, _s2, sdiff)
|
198
|
+
return 0 if sdiff.nil?
|
199
|
+
|
200
|
+
# compute the lcs only with non-whitespace letters
|
201
|
+
lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
|
202
|
+
return 0 if lcs == 0
|
203
|
+
|
204
|
+
s1 = _s1.tr(@padding_letter1, ' ')
|
205
|
+
s2 = _s2.tr(@padding_letter2, ' ')
|
206
|
+
|
207
|
+
similarity = lcs / [s1.scan(/\S/).count, s2.scan(/\S/).count].min.to_f
|
208
|
+
end
|
209
|
+
|
140
210
|
end
|
@@ -12,45 +12,46 @@ class TextAlignment::TextAlignment
|
|
12
12
|
attr_reader :similarity
|
13
13
|
attr_reader :lost_annotations
|
14
14
|
|
15
|
-
def initialize(
|
16
|
-
raise ArgumentError, "nil string" if
|
15
|
+
def initialize(str1, str2, denotations = nil, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
|
16
|
+
raise ArgumentError, "nil string" if str1.nil? || str2.nil?
|
17
17
|
|
18
|
-
@block_alignment = {source_text:
|
18
|
+
@block_alignment = {source_text:str1, target_text:str2}
|
19
|
+
@str1 = str1
|
20
|
+
@str2 = str2
|
19
21
|
|
20
|
-
|
21
|
-
|
22
|
-
# try exact match
|
22
|
+
## Block exact match
|
23
23
|
block_begin = str2.index(str1)
|
24
24
|
unless block_begin.nil?
|
25
25
|
@block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
26
|
-
return
|
26
|
+
return
|
27
27
|
end
|
28
28
|
|
29
|
-
# try exact match
|
30
29
|
block_begin = str2.downcase.index(str1.downcase)
|
31
30
|
unless block_begin.nil?
|
32
31
|
@block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
33
|
-
return
|
32
|
+
return
|
34
33
|
end
|
35
34
|
|
35
|
+
|
36
|
+
## to find block alignments
|
36
37
|
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
|
37
38
|
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
last
|
42
|
-
|
43
|
-
last[:
|
44
|
-
last[:target][:end] = anchor[:target][:end]
|
39
|
+
blocks = []
|
40
|
+
while block = anchor_finder.get_next_anchor
|
41
|
+
last = blocks.last
|
42
|
+
if last && (block[:source][:begin] == last[:source][:end] + 1) && (block[:target][:begin] == last[:target][:end] + 1)
|
43
|
+
last[:source][:end] = block[:source][:end]
|
44
|
+
last[:target][:end] = block[:target][:end]
|
45
45
|
else
|
46
|
-
|
46
|
+
blocks << block.merge(alignment: :block, delta: block[:target][:begin] - block[:source][:begin])
|
47
47
|
end
|
48
48
|
end
|
49
49
|
|
50
|
-
# pp
|
50
|
+
# pp blocks
|
51
51
|
# puts "-----"
|
52
52
|
# puts
|
53
|
-
#
|
53
|
+
# exit
|
54
|
+
# blocks.each do |b|
|
54
55
|
# p [b[:source], b[:target]]
|
55
56
|
# puts "---"
|
56
57
|
# puts str1[b[:source][:begin] ... b[:source][:end]]
|
@@ -62,114 +63,196 @@ class TextAlignment::TextAlignment
|
|
62
63
|
# puts "-=-=-=-=-"
|
63
64
|
# puts
|
64
65
|
|
65
|
-
##
|
66
|
-
|
67
|
-
|
66
|
+
## to fill the gaps
|
67
|
+
last_block = nil
|
68
|
+
blocks2 = blocks.inject([]) do |sum, block|
|
69
|
+
b1 = last_block ? last_block[:source][:end] : 0
|
70
|
+
e1 = block[:source][:begin]
|
68
71
|
|
69
|
-
|
70
|
-
|
71
|
-
e1 = mblocks[0][:source][:begin]
|
72
|
-
e2 = mblocks[0][:target][:begin]
|
73
|
-
|
74
|
-
if mblocks[0][:target][:begin] == 0
|
75
|
-
@block_alignment[:blocks] << {source:{begin:0, end:e1}, target:{begin:0, end:0}, alignment: :empty}
|
72
|
+
sum += if b1 == e1
|
73
|
+
[block]
|
76
74
|
else
|
77
|
-
|
78
|
-
|
75
|
+
b2 = last_block ? last_block[:target][:end] : 0
|
76
|
+
e2 = block[:target][:begin]
|
77
|
+
|
78
|
+
if b2 == e2
|
79
|
+
[
|
80
|
+
{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
|
81
|
+
block
|
82
|
+
]
|
83
|
+
else
|
84
|
+
if b1 == 0 && b2 == 0
|
85
|
+
len_buffer = (e1 * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
86
|
+
b2 = e2 - len_buffer if e2 > len_buffer
|
87
|
+
end
|
88
|
+
|
89
|
+
_str1 = str1[b1 ... e1]
|
90
|
+
_str2 = str2[b2 ... e2]
|
79
91
|
|
80
|
-
|
81
|
-
|
82
|
-
|
92
|
+
if _str1.strip.empty? || _str2.strip.empty?
|
93
|
+
[
|
94
|
+
{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
|
95
|
+
block
|
96
|
+
]
|
83
97
|
else
|
84
|
-
|
85
|
-
len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
86
|
-
b1 = _str1.length < len_buffer ? 0 : e1 - len_buffer
|
87
|
-
b2 = _str2.length < len_buffer ? 0 : e2 - len_buffer
|
88
|
-
|
89
|
-
@block_alignment[:blocks] << {source:{begin:0, end:b1}, target:{begin:0, end:b2}, alignment: :empty} if b1 > 0
|
90
|
-
|
91
|
-
_str1 = str1[b1 ... e1]
|
92
|
-
_str2 = str2[b2 ... e2]
|
93
|
-
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
|
94
|
-
if alignment.similarity < 0.6
|
95
|
-
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
|
96
|
-
else
|
97
|
-
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment:alignment}
|
98
|
-
end
|
98
|
+
local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations) << block
|
99
99
|
end
|
100
100
|
end
|
101
101
|
end
|
102
|
+
|
103
|
+
last_block = block
|
104
|
+
sum
|
102
105
|
end
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
if
|
114
|
-
|
106
|
+
|
107
|
+
# the last step
|
108
|
+
blocks2 += if last_block.nil?
|
109
|
+
local_alignment_blocks(str1, 0, str1.length, str2, 0, str2.length, denotations)
|
110
|
+
else
|
111
|
+
b1 = last_block[:source][:end]
|
112
|
+
if b1 < str1.length
|
113
|
+
e1 = str1.length
|
114
|
+
|
115
|
+
b2 = last_block[:target][:end]
|
116
|
+
if b2 < str2.length
|
117
|
+
len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
118
|
+
e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
|
119
|
+
local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
|
115
120
|
else
|
116
|
-
|
117
|
-
if alignment.similarity < 0.6
|
118
|
-
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
|
119
|
-
else
|
120
|
-
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
|
121
|
-
end
|
121
|
+
[{source:{begin:last_block[:source][:end], end:str1.length}, alignment: :empty}]
|
122
122
|
end
|
123
123
|
end
|
124
|
-
@block_alignment[:blocks] << mblocks[i].merge(alignment: :block)
|
125
124
|
end
|
126
125
|
|
127
|
-
|
128
|
-
|
129
|
-
b1 = mblocks[-1][:source][:end]
|
130
|
-
b2 = mblocks[-1][:target][:end]
|
131
|
-
_str1 = str1[b1 ... str1.length]
|
132
|
-
_str2 = str2[b2 ... str2.length]
|
126
|
+
@block_alignment[:blocks] = blocks2
|
127
|
+
end
|
133
128
|
|
134
|
-
|
135
|
-
|
136
|
-
|
129
|
+
def local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations = nil)
|
130
|
+
block2 = str2[b2 ... e2]
|
131
|
+
|
132
|
+
## term-based alignment
|
133
|
+
tblocks = if denotations
|
134
|
+
ds_in_scope = denotations.select{|d| d[:span][:begin] >= b1 && d[:span][:end] <= e1}.
|
135
|
+
sort{|d1, d2| d1[:span][:begin] <=> d2[:span][:begin] || d2[:span][:end] <=> d1[:span][:end] }.
|
136
|
+
map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
|
137
|
+
|
138
|
+
position = 0
|
139
|
+
tblocks = ds_in_scope.map do |term|
|
140
|
+
lex = term[:lex]
|
141
|
+
r = block2.index(lex, position)
|
142
|
+
if r.nil?
|
143
|
+
position = nil
|
144
|
+
break
|
145
|
+
end
|
146
|
+
position = r + lex.length
|
147
|
+
{source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, delta: r - term[:span][:begin]}
|
148
|
+
end
|
149
|
+
|
150
|
+
# missing term found
|
151
|
+
tblocks = [] if position.nil?
|
152
|
+
|
153
|
+
# redundant matching found
|
154
|
+
unless position.nil?
|
155
|
+
ds_in_scope.each do |term|
|
156
|
+
lex = term[:lex]
|
157
|
+
look_forward = block2.index(lex, position)
|
158
|
+
unless look_forward.nil?
|
159
|
+
puts lex
|
160
|
+
tblocks = []
|
161
|
+
break
|
162
|
+
end
|
163
|
+
end
|
164
|
+
end
|
165
|
+
|
166
|
+
tblocks
|
167
|
+
end
|
168
|
+
|
169
|
+
if tblocks.empty?
|
170
|
+
if b1 == 0 && e1 == str1.length
|
171
|
+
if str2.length > 2000
|
172
|
+
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
137
173
|
else
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
174
|
+
block1 = str1[b1 ... e1]
|
175
|
+
block2 = str2[b2 ... e2]
|
176
|
+
|
177
|
+
## character-based alignment
|
178
|
+
alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
|
179
|
+
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}]
|
180
|
+
# alignment = :alignment
|
181
|
+
# [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :alignment}]
|
182
|
+
end
|
183
|
+
else
|
184
|
+
block1 = str1[b1 ... e1]
|
185
|
+
block2 = str2[b2 ... e2]
|
186
|
+
|
187
|
+
## character-based alignment
|
188
|
+
alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
|
189
|
+
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}]
|
190
|
+
# alignmnet = :alignment
|
191
|
+
# [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :alignment}]
|
192
|
+
end
|
193
|
+
else
|
194
|
+
last_tblock = nil
|
195
|
+
lblocks = tblocks.inject([]) do |sum, tblock|
|
196
|
+
tb1 = last_tblock ? last_tblock[:source][:end] : b1
|
197
|
+
te1 = tblock[:source][:begin]
|
144
198
|
|
145
|
-
|
146
|
-
|
147
|
-
|
199
|
+
sum += if te1 == tb1
|
200
|
+
[tblock]
|
201
|
+
else
|
202
|
+
tb2 = last_tblock ? tlast_block[:target][:end] : b2
|
203
|
+
te2 = tblock[:target][:begin]
|
204
|
+
|
205
|
+
if b2 == e2
|
206
|
+
[
|
207
|
+
{source:{begin:tb1, end:te1}, alignment: :empty},
|
208
|
+
tblock
|
209
|
+
]
|
148
210
|
else
|
149
|
-
|
211
|
+
[
|
212
|
+
{source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty},
|
213
|
+
tblock
|
214
|
+
]
|
150
215
|
end
|
216
|
+
end
|
151
217
|
|
152
|
-
|
218
|
+
last_tblock = tblock
|
219
|
+
sum
|
220
|
+
end
|
221
|
+
|
222
|
+
if last_tblock[:source][:end] < e1
|
223
|
+
if last_tblock[:target][:end] < e2
|
224
|
+
lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, target:{begin:last_tblock[:target][:end], end:e2}, alignment: :empty}
|
225
|
+
else
|
226
|
+
lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, alignment: :empty}
|
153
227
|
end
|
154
228
|
end
|
155
|
-
end
|
156
229
|
|
157
|
-
|
158
|
-
a[:delta] = a[:target][:begin] - a[:source][:begin]
|
230
|
+
lblocks
|
159
231
|
end
|
160
232
|
end
|
161
233
|
|
234
|
+
|
235
|
+
def indices(str, target)
|
236
|
+
position = 0
|
237
|
+
len = target.len
|
238
|
+
Enumerator.new do |yielder|
|
239
|
+
while idx = str.index(target, position)
|
240
|
+
yielder << idx
|
241
|
+
position = idx + len
|
242
|
+
end
|
243
|
+
end
|
244
|
+
end
|
245
|
+
|
162
246
|
def transform_begin_position(begin_position)
|
163
247
|
i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
|
164
248
|
block = @block_alignment[:blocks][i]
|
165
249
|
|
166
|
-
b = if block[:alignment] == :block
|
250
|
+
b = if block[:alignment] == :block || block[:alignment] == :term
|
167
251
|
begin_position + block[:delta]
|
168
252
|
elsif block[:alignment] == :empty
|
169
253
|
if begin_position == block[:source][:begin]
|
170
254
|
block[:target][:begin]
|
171
255
|
else
|
172
|
-
# raise "lost annotation"
|
173
256
|
nil
|
174
257
|
end
|
175
258
|
else
|
@@ -182,13 +265,12 @@ class TextAlignment::TextAlignment
|
|
182
265
|
i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
|
183
266
|
block = @block_alignment[:blocks][i]
|
184
267
|
|
185
|
-
e = if block[:alignment] == :block
|
268
|
+
e = if block[:alignment] == :block || block[:alignment] == :term
|
186
269
|
end_position + block[:delta]
|
187
270
|
elsif block[:alignment] == :empty
|
188
271
|
if end_position == block[:source][:end]
|
189
272
|
block[:target][:end]
|
190
273
|
else
|
191
|
-
# raise "lost annotation"
|
192
274
|
nil
|
193
275
|
end
|
194
276
|
else
|
@@ -210,14 +292,14 @@ class TextAlignment::TextAlignment
|
|
210
292
|
@lost_annotations = []
|
211
293
|
|
212
294
|
denotations.each do |d|
|
213
|
-
begin
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
end
|
295
|
+
source = {begin:d.begin, end:d.end}
|
296
|
+
d.begin = transform_begin_position(d.begin);
|
297
|
+
d.end = transform_end_position(d.end);
|
298
|
+
raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @str2.length
|
299
|
+
rescue
|
300
|
+
@lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
|
301
|
+
d.begin = nil
|
302
|
+
d.end = nil
|
221
303
|
end
|
222
304
|
|
223
305
|
@lost_annotations
|
@@ -228,12 +310,12 @@ class TextAlignment::TextAlignment
|
|
228
310
|
@lost_annotations = []
|
229
311
|
|
230
312
|
r = hdenotations.collect do |d|
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
313
|
+
t = transform_a_span(d[:span])
|
314
|
+
raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @str2.length
|
315
|
+
new_d = d.dup.merge({span:t})
|
316
|
+
rescue
|
317
|
+
@lost_annotations << {source: d[:span], target:t}
|
318
|
+
nil
|
237
319
|
end.compact
|
238
320
|
|
239
321
|
r
|
@@ -247,12 +329,16 @@ class TextAlignment::TextAlignment
|
|
247
329
|
@block_alignment[:blocks].each do |a|
|
248
330
|
show += case a[:alignment]
|
249
331
|
when :block
|
250
|
-
"===== common
|
332
|
+
"===== common (block) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
333
|
+
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
|
334
|
+
when :term
|
335
|
+
"===== common (term) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
251
336
|
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
|
252
337
|
when :empty
|
253
|
-
"
|
338
|
+
"xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
|
339
|
+
"<<<<< string 1 [#{a[:source][:begin]} - #{a[:source][:end]}]\n" +
|
254
340
|
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
|
255
|
-
">>>>> string 2\n" +
|
341
|
+
">>>>> string 2 [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
256
342
|
ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
|
257
343
|
else
|
258
344
|
astr1 = ''
|
@@ -286,7 +372,7 @@ class TextAlignment::TextAlignment
|
|
286
372
|
end
|
287
373
|
end.join('')
|
288
374
|
|
289
|
-
"***** local mismatch\n" +
|
375
|
+
"***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
|
290
376
|
"[#{astr1}]\n" +
|
291
377
|
"[#{astr2}]\n\n"
|
292
378
|
end
|
@@ -294,59 +380,4 @@ class TextAlignment::TextAlignment
|
|
294
380
|
show
|
295
381
|
end
|
296
382
|
|
297
|
-
private
|
298
|
-
|
299
|
-
def string_preprocessing(_str1, _str2)
|
300
|
-
str1 = _str1.dup
|
301
|
-
str2 = _str2.dup
|
302
|
-
mappings = TextAlignment::MAPPINGS.dup
|
303
|
-
|
304
|
-
## single character mappings
|
305
|
-
character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
|
306
|
-
characters_from = character_mappings.collect{|m| m[0]}.join
|
307
|
-
characters_to = character_mappings.collect{|m| m[1]}.join
|
308
|
-
characters_to.gsub!(/-/, '\-')
|
309
|
-
|
310
|
-
str1.tr!(characters_from, characters_to)
|
311
|
-
str2.tr!(characters_from, characters_to)
|
312
|
-
|
313
|
-
mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
|
314
|
-
|
315
|
-
## long to one character mappings
|
316
|
-
pletters = TextAlignment::PADDING_LETTERS
|
317
|
-
|
318
|
-
# find the padding letter for str1
|
319
|
-
padding_letter1 = begin
|
320
|
-
i = pletters.index{|l| str2.index(l).nil?}
|
321
|
-
raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
|
322
|
-
TextAlignment::PADDING_LETTERS[i]
|
323
|
-
end
|
324
|
-
|
325
|
-
# find the padding letter for str2
|
326
|
-
padding_letter2 = begin
|
327
|
-
i = pletters.index{|l| l != padding_letter1 && str1.index(l).nil?}
|
328
|
-
raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
|
329
|
-
TextAlignment::PADDING_LETTERS[i]
|
330
|
-
end
|
331
|
-
|
332
|
-
# ASCII foldings
|
333
|
-
ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
|
334
|
-
ascii_foldings.each do |f|
|
335
|
-
from = f[1]
|
336
|
-
|
337
|
-
if str2.index(f[0])
|
338
|
-
to = f[0] + (padding_letter1 * (f[1].length - 1))
|
339
|
-
str1.gsub!(from, to)
|
340
|
-
end
|
341
|
-
|
342
|
-
if str1.index(f[0])
|
343
|
-
to = f[0] + (padding_letter2 * (f[1].length - 1))
|
344
|
-
str2.gsub!(from, to)
|
345
|
-
end
|
346
|
-
end
|
347
|
-
mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
|
348
|
-
|
349
|
-
[str1, str2, mappings]
|
350
|
-
end
|
351
|
-
|
352
383
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '0.
|
4
|
+
version: '0.7'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-10-
|
11
|
+
date: 2020-10-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|