text_alignment 0.6.4 → 0.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/align_annotations +4 -19
- data/lib/text_alignment/text_alignment.rb +193 -106
- data/lib/text_alignment/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 40796a906cbc366312741d3b410aaee15d28b778980674bb79d19ea1b6364d02
|
4
|
+
data.tar.gz: eb0121440bc005232b3d35eb1867b46004eb14c894b630f2ef49d311c4d99a26
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e29037021763c9d4f581b278b72d319d364431e8abad48b45f21d68df102928eec993bbba582383a8a01b6ba7855f3542d2cc620ffa055a90aea08c2f5371117
|
7
|
+
data.tar.gz: 63c0e5b712b89012082197d3c4400cd705283e6d908f40003a399cb2efdc2b3aa4f6e445eb90e1ab4d7d39a9d14421881f945e0483c14ec52c1af7176ca07e81
|
data/bin/align_annotations
CHANGED
@@ -105,9 +105,7 @@ lost_annotations = []
|
|
105
105
|
target_annotations = if source_annotations.class == Array
|
106
106
|
align_mdoc(source_annotations, {text: target_text})
|
107
107
|
else
|
108
|
-
alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text)
|
109
|
-
|
110
|
-
# pp alignment
|
108
|
+
alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text, source_annotations[:denotations])
|
111
109
|
|
112
110
|
# verification
|
113
111
|
# source_text = source_annotations[:text]
|
@@ -142,22 +140,7 @@ else
|
|
142
140
|
puts "====="
|
143
141
|
# exit
|
144
142
|
|
145
|
-
# verification of source denotations
|
146
|
-
puts "[Invalid source denotations]"
|
147
|
-
source_annotations[:denotations] do |d|
|
148
|
-
p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < source_text.length
|
149
|
-
end
|
150
|
-
puts "====="
|
151
|
-
puts
|
152
|
-
|
153
143
|
denotations = alignment.transform_hdenotations(source_annotations[:denotations])
|
154
|
-
puts "[Invalid transformation]"
|
155
|
-
denotations.each do |d|
|
156
|
-
p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < target_text.length
|
157
|
-
end
|
158
|
-
puts "====="
|
159
|
-
puts
|
160
|
-
|
161
144
|
lost_annotations += alignment.lost_annotations if alignment.lost_annotations
|
162
145
|
|
163
146
|
source_annotations.merge({text:target_text, denotations:denotations})
|
@@ -194,7 +177,9 @@ warn "denotations:\t#{target_annotations[:denotations].nil? ? 0 : target_annotat
|
|
194
177
|
|
195
178
|
if lost_annotations
|
196
179
|
warn "\n[lost annotations]"
|
197
|
-
|
180
|
+
lost_annotations.each do |a|
|
181
|
+
p a
|
182
|
+
end
|
198
183
|
end
|
199
184
|
|
200
185
|
#puts target_annotations.to_json
|
@@ -12,43 +12,46 @@ class TextAlignment::TextAlignment
|
|
12
12
|
attr_reader :similarity
|
13
13
|
attr_reader :lost_annotations
|
14
14
|
|
15
|
-
def initialize(str1, str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
|
15
|
+
def initialize(str1, str2, denotations = nil, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
|
16
16
|
raise ArgumentError, "nil string" if str1.nil? || str2.nil?
|
17
17
|
|
18
18
|
@block_alignment = {source_text:str1, target_text:str2}
|
19
|
+
@str1 = str1
|
20
|
+
@str2 = str2
|
19
21
|
|
20
|
-
|
22
|
+
## Block exact match
|
21
23
|
block_begin = str2.index(str1)
|
22
24
|
unless block_begin.nil?
|
23
25
|
@block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
24
|
-
return
|
26
|
+
return
|
25
27
|
end
|
26
28
|
|
27
|
-
# try exact match
|
28
29
|
block_begin = str2.downcase.index(str1.downcase)
|
29
30
|
unless block_begin.nil?
|
30
31
|
@block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
31
|
-
return
|
32
|
+
return
|
32
33
|
end
|
33
34
|
|
35
|
+
|
36
|
+
## to find block alignments
|
34
37
|
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
|
35
38
|
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
last
|
40
|
-
|
41
|
-
last[:
|
42
|
-
last[:target][:end] = anchor[:target][:end]
|
39
|
+
blocks = []
|
40
|
+
while block = anchor_finder.get_next_anchor
|
41
|
+
last = blocks.last
|
42
|
+
if last && (block[:source][:begin] == last[:source][:end] + 1) && (block[:target][:begin] == last[:target][:end] + 1)
|
43
|
+
last[:source][:end] = block[:source][:end]
|
44
|
+
last[:target][:end] = block[:target][:end]
|
43
45
|
else
|
44
|
-
|
46
|
+
blocks << block.merge(alignment: :block, delta: block[:target][:begin] - block[:source][:begin])
|
45
47
|
end
|
46
48
|
end
|
47
49
|
|
48
|
-
# pp
|
50
|
+
# pp blocks
|
49
51
|
# puts "-----"
|
50
52
|
# puts
|
51
|
-
#
|
53
|
+
# exit
|
54
|
+
# blocks.each do |b|
|
52
55
|
# p [b[:source], b[:target]]
|
53
56
|
# puts "---"
|
54
57
|
# puts str1[b[:source][:begin] ... b[:source][:end]]
|
@@ -60,114 +63,196 @@ class TextAlignment::TextAlignment
|
|
60
63
|
# puts "-=-=-=-=-"
|
61
64
|
# puts
|
62
65
|
|
63
|
-
##
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
if mblocks[0][:source][:begin] > 0
|
69
|
-
e1 = mblocks[0][:source][:begin]
|
70
|
-
e2 = mblocks[0][:target][:begin]
|
66
|
+
## to fill the gaps
|
67
|
+
last_block = nil
|
68
|
+
blocks2 = blocks.inject([]) do |sum, block|
|
69
|
+
b1 = last_block ? last_block[:source][:end] : 0
|
70
|
+
e1 = block[:source][:begin]
|
71
71
|
|
72
|
-
if
|
73
|
-
|
72
|
+
sum += if b1 == e1
|
73
|
+
[block]
|
74
74
|
else
|
75
|
-
|
76
|
-
|
75
|
+
b2 = last_block ? last_block[:target][:end] : 0
|
76
|
+
e2 = block[:target][:begin]
|
77
|
+
|
78
|
+
if b2 == e2
|
79
|
+
[
|
80
|
+
{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
|
81
|
+
block
|
82
|
+
]
|
83
|
+
else
|
84
|
+
if b1 == 0 && b2 == 0
|
85
|
+
len_buffer = (e1 * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
86
|
+
b2 = e2 - len_buffer if e2 > len_buffer
|
87
|
+
end
|
77
88
|
|
78
|
-
|
79
|
-
|
80
|
-
|
89
|
+
_str1 = str1[b1 ... e1]
|
90
|
+
_str2 = str2[b2 ... e2]
|
91
|
+
|
92
|
+
if _str1.strip.empty? || _str2.strip.empty?
|
93
|
+
[
|
94
|
+
{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
|
95
|
+
block
|
96
|
+
]
|
81
97
|
else
|
82
|
-
|
83
|
-
len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
84
|
-
b1 = _str1.length < len_buffer ? 0 : e1 - len_buffer
|
85
|
-
b2 = _str2.length < len_buffer ? 0 : e2 - len_buffer
|
86
|
-
|
87
|
-
@block_alignment[:blocks] << {source:{begin:0, end:b1}, target:{begin:0, end:b2}, alignment: :empty} if b1 > 0
|
88
|
-
|
89
|
-
_str1 = str1[b1 ... e1]
|
90
|
-
_str2 = str2[b2 ... e2]
|
91
|
-
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase)
|
92
|
-
if alignment.similarity < 0.5
|
93
|
-
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: alignment.similarity}
|
94
|
-
else
|
95
|
-
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}
|
96
|
-
end
|
98
|
+
local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations) << block
|
97
99
|
end
|
98
100
|
end
|
99
101
|
end
|
102
|
+
|
103
|
+
last_block = block
|
104
|
+
sum
|
100
105
|
end
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
if
|
112
|
-
|
106
|
+
|
107
|
+
# the last step
|
108
|
+
blocks2 += if last_block.nil?
|
109
|
+
local_alignment_blocks(str1, 0, str1.length, str2, 0, str2.length, denotations)
|
110
|
+
else
|
111
|
+
b1 = last_block[:source][:end]
|
112
|
+
if b1 < str1.length
|
113
|
+
e1 = str1.length
|
114
|
+
|
115
|
+
b2 = last_block[:target][:end]
|
116
|
+
if b2 < str2.length
|
117
|
+
len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
118
|
+
e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
|
119
|
+
local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
|
113
120
|
else
|
114
|
-
|
115
|
-
if alignment.similarity < 0.5
|
116
|
-
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: alignment.similarity}
|
117
|
-
else
|
118
|
-
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}
|
119
|
-
end
|
121
|
+
[{source:{begin:last_block[:source][:end], end:str1.length}, alignment: :empty}]
|
120
122
|
end
|
121
123
|
end
|
122
|
-
@block_alignment[:blocks] << mblocks[i].merge(alignment: :block)
|
123
124
|
end
|
124
125
|
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
126
|
+
@block_alignment[:blocks] = blocks2
|
127
|
+
end
|
128
|
+
|
129
|
+
# Aligns the source region str1[b1...e1] with the target region str2[b2...e2]
# and returns the alignment blocks found for that pair of regions.
#
# Two strategies are tried in order:
#
# 1. Term-based alignment (only when +denotations+ is given). Every denotation
#    whose span lies inside [b1, e1] is looked up, in source order, as a
#    literal string in the target region. The result is used only if every
#    term is found and no term occurs again after the last match (otherwise
#    the matching is ambiguous). Matched terms become +:term+ blocks and the
#    source text between them becomes +:empty+ blocks.
# 2. Character-based alignment via TextAlignment::MixedAlignment on the
#    downcased regions, used when term-based alignment yields nothing.
#
# @param str1 [String] the whole source text
# @param b1 [Integer] begin offset of the source region
# @param e1 [Integer] end offset (exclusive) of the source region
# @param str2 [String] the whole target text
# @param b2 [Integer] begin offset of the target region
# @param e2 [Integer] end offset (exclusive) of the target region
# @param denotations [Array<Hash>, nil] annotations shaped like
#   {span: {begin: Integer, end: Integer}, ...}, with offsets relative to str1
# @return [Array<Hash>] blocks with a :source span, an :alignment
#   (:term, :empty or a MixedAlignment object), usually a :target span, a
#   :delta for :term blocks and a :similarity for character-based blocks
def local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations = nil)
  block2 = str2[b2...e2]

  ## term-based alignment
  # Start from an empty list so that a missing denotations argument simply
  # falls through to the character-based alignment below.
  tblocks = []
  if denotations
    # Order by begin offset; for equal begins the longer span comes first.
    ds_in_scope = denotations
      .select { |d| d[:span][:begin] >= b1 && d[:span][:end] <= e1 }
      .sort_by { |d| [d[:span][:begin], -d[:span][:end]] }
      .map { |d| d.merge(lex: str1[d[:span][:begin]...d[:span][:end]]) }

    position = 0
    ds_in_scope.each do |term|
      lex = term[:lex]
      r = block2.index(lex, position)
      if r.nil?
        # missing term found: term-based alignment is not applicable
        tblocks = []
        position = nil
        break
      end
      position = r + lex.length
      # r is relative to block2, so b2 must be added to obtain offsets in str2.
      # delta is what the transform methods add to a source offset.
      target_begin = r + b2
      tblocks << {
        source: term[:span],
        target: {begin: target_begin, end: target_begin + lex.length},
        alignment: :term,
        delta: target_begin - term[:span][:begin]
      }
    end

    # redundant matching found: a term occurring again after the last match
    # makes the term-to-text assignment ambiguous, so discard it
    if position && ds_in_scope.any? { |term| block2.index(term[:lex], position) }
      tblocks = []
    end
  end

  if tblocks.empty?
    # The whole source is unanchored and the target is long (hard-coded
    # limit of 2000 characters): give up without a character alignment.
    # NOTE(review): presumably a cost guard for MixedAlignment - confirm.
    if b1 == 0 && e1 == str1.length && str2.length > 2000
      return [{source: {begin: b1, end: e1}, target: {begin: b2, end: e2}, alignment: :empty}]
    end

    ## character-based alignment
    alignment = TextAlignment::MixedAlignment.new(str1[b1...e1].downcase, block2.downcase)
    return [{source: {begin: b1, end: e1}, target: {begin: b2, end: e2}, alignment: alignment, similarity: alignment.similarity}]
  end

  ## fill the gaps between the matched terms with :empty blocks
  last_tblock = nil
  lblocks = []
  tblocks.each do |tblock|
    tb1 = last_tblock ? last_tblock[:source][:end] : b1
    te1 = tblock[:source][:begin]

    if te1 != tb1
      tb2 = last_tblock ? last_tblock[:target][:end] : b2
      te2 = tblock[:target][:begin]

      # NOTE(review): this tests the bounds of the whole target region, as the
      # published code does; the gap bounds tb2/te2 may have been intended.
      lblocks << if b2 == e2
        {source: {begin: tb1, end: te1}, alignment: :empty}
      else
        {source: {begin: tb1, end: te1}, target: {begin: tb2, end: te2}, alignment: :empty}
      end
    end

    lblocks << tblock
    last_tblock = tblock
  end

  # the source text remaining after the last matched term
  if last_tblock[:source][:end] < e1
    lblocks << if last_tblock[:target][:end] < e2
      {source: {begin: last_tblock[:source][:end], end: e1}, target: {begin: last_tblock[:target][:end], end: e2}, alignment: :empty}
    else
      # the target region is exhausted: there is nothing to map onto
      {source: {begin: last_tblock[:source][:end], end: e1}, alignment: :empty}
    end
  end

  lblocks
end
|
159
233
|
|
234
|
+
|
235
|
+
# Enumerates the start offsets of successive, non-overlapping occurrences of
# +target+ in +str+, scanning from left to right.
#
# @param str [String] the text to search in
# @param target [String] the substring to look for
# @return [Enumerator] yields the Integer offset of each occurrence; the
#   search runs lazily and restarts from offset 0 on every enumeration
def indices(str, target)
  # Advance by at least one character so that an empty target cannot stall
  # the scan at the same offset forever.
  step = [target.length, 1].max

  Enumerator.new do |yielder|
    # The cursor lives inside the block so the enumerator can be reused.
    position = 0
    while (idx = str.index(target, position))
      yielder << idx
      position = idx + step
    end
  end
end
|
245
|
+
|
160
246
|
def transform_begin_position(begin_position)
|
161
247
|
i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
|
162
248
|
block = @block_alignment[:blocks][i]
|
163
249
|
|
164
|
-
b = if block[:alignment] == :block
|
250
|
+
b = if block[:alignment] == :block || block[:alignment] == :term
|
165
251
|
begin_position + block[:delta]
|
166
252
|
elsif block[:alignment] == :empty
|
167
253
|
if begin_position == block[:source][:begin]
|
168
254
|
block[:target][:begin]
|
169
255
|
else
|
170
|
-
# raise "lost annotation"
|
171
256
|
nil
|
172
257
|
end
|
173
258
|
else
|
@@ -180,13 +265,12 @@ class TextAlignment::TextAlignment
|
|
180
265
|
i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
|
181
266
|
block = @block_alignment[:blocks][i]
|
182
267
|
|
183
|
-
e = if block[:alignment] == :block
|
268
|
+
e = if block[:alignment] == :block || block[:alignment] == :term
|
184
269
|
end_position + block[:delta]
|
185
270
|
elsif block[:alignment] == :empty
|
186
271
|
if end_position == block[:source][:end]
|
187
272
|
block[:target][:end]
|
188
273
|
else
|
189
|
-
# raise "lost annotation"
|
190
274
|
nil
|
191
275
|
end
|
192
276
|
else
|
@@ -208,14 +292,14 @@ class TextAlignment::TextAlignment
|
|
208
292
|
@lost_annotations = []
|
209
293
|
|
210
294
|
denotations.each do |d|
|
211
|
-
begin
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
end
|
295
|
+
source = {begin:d.begin, end:d.end}
|
296
|
+
d.begin = transform_begin_position(d.begin);
|
297
|
+
d.end = transform_end_position(d.end);
|
298
|
+
raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @str2.length
|
299
|
+
rescue
|
300
|
+
@lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
|
301
|
+
d.begin = nil
|
302
|
+
d.end = nil
|
219
303
|
end
|
220
304
|
|
221
305
|
@lost_annotations
|
@@ -226,12 +310,12 @@ class TextAlignment::TextAlignment
|
|
226
310
|
@lost_annotations = []
|
227
311
|
|
228
312
|
r = hdenotations.collect do |d|
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
313
|
+
t = transform_a_span(d[:span])
|
314
|
+
raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @str2.length
|
315
|
+
new_d = d.dup.merge({span:t})
|
316
|
+
rescue
|
317
|
+
@lost_annotations << {source: d[:span], target:t}
|
318
|
+
nil
|
235
319
|
end.compact
|
236
320
|
|
237
321
|
r
|
@@ -245,7 +329,10 @@ class TextAlignment::TextAlignment
|
|
245
329
|
@block_alignment[:blocks].each do |a|
|
246
330
|
show += case a[:alignment]
|
247
331
|
when :block
|
248
|
-
"===== common ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
332
|
+
"===== common (block) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
333
|
+
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
|
334
|
+
when :term
|
335
|
+
"===== common (term) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
249
336
|
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
|
250
337
|
when :empty
|
251
338
|
"xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: '0.7'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-10-
|
11
|
+
date: 2020-10-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|