text_alignment 0.6.4 → 0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/align_annotations +4 -19
- data/lib/text_alignment/text_alignment.rb +193 -106
- data/lib/text_alignment/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 40796a906cbc366312741d3b410aaee15d28b778980674bb79d19ea1b6364d02
|
4
|
+
data.tar.gz: eb0121440bc005232b3d35eb1867b46004eb14c894b630f2ef49d311c4d99a26
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e29037021763c9d4f581b278b72d319d364431e8abad48b45f21d68df102928eec993bbba582383a8a01b6ba7855f3542d2cc620ffa055a90aea08c2f5371117
|
7
|
+
data.tar.gz: 63c0e5b712b89012082197d3c4400cd705283e6d908f40003a399cb2efdc2b3aa4f6e445eb90e1ab4d7d39a9d14421881f945e0483c14ec52c1af7176ca07e81
|
data/bin/align_annotations
CHANGED
@@ -105,9 +105,7 @@ lost_annotations = []
|
|
105
105
|
target_annotations = if source_annotations.class == Array
|
106
106
|
align_mdoc(source_annotations, {text: target_text})
|
107
107
|
else
|
108
|
-
alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text)
|
109
|
-
|
110
|
-
# pp alignment
|
108
|
+
alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text, source_annotations[:denotations])
|
111
109
|
|
112
110
|
# verification
|
113
111
|
# source_text = source_annotations[:text]
|
@@ -142,22 +140,7 @@ else
|
|
142
140
|
puts "====="
|
143
141
|
# exit
|
144
142
|
|
145
|
-
# verification of source denotations
|
146
|
-
puts "[Invalid source denotations]"
|
147
|
-
source_annotations[:denotations] do |d|
|
148
|
-
p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < source_text.length
|
149
|
-
end
|
150
|
-
puts "====="
|
151
|
-
puts
|
152
|
-
|
153
143
|
denotations = alignment.transform_hdenotations(source_annotations[:denotations])
|
154
|
-
puts "[Invalid transformation]"
|
155
|
-
denotations.each do |d|
|
156
|
-
p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < target_text.length
|
157
|
-
end
|
158
|
-
puts "====="
|
159
|
-
puts
|
160
|
-
|
161
144
|
lost_annotations += alignment.lost_annotations if alignment.lost_annotations
|
162
145
|
|
163
146
|
source_annotations.merge({text:target_text, denotations:denotations})
|
@@ -194,7 +177,9 @@ warn "denotations:\t#{target_annotations[:denotations].nil? ? 0 : target_annotat
|
|
194
177
|
|
195
178
|
if lost_annotations
|
196
179
|
warn "\n[lost annotations]"
|
197
|
-
|
180
|
+
lost_annotations.each do |a|
|
181
|
+
p a
|
182
|
+
end
|
198
183
|
end
|
199
184
|
|
200
185
|
#puts target_annotations.to_json
|
@@ -12,43 +12,46 @@ class TextAlignment::TextAlignment
|
|
12
12
|
attr_reader :similarity
|
13
13
|
attr_reader :lost_annotations
|
14
14
|
|
15
|
-
def initialize(str1, str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
|
15
|
+
def initialize(str1, str2, denotations = nil, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
|
16
16
|
raise ArgumentError, "nil string" if str1.nil? || str2.nil?
|
17
17
|
|
18
18
|
@block_alignment = {source_text:str1, target_text:str2}
|
19
|
+
@str1 = str1
|
20
|
+
@str2 = str2
|
19
21
|
|
20
|
-
|
22
|
+
## Block exact match
|
21
23
|
block_begin = str2.index(str1)
|
22
24
|
unless block_begin.nil?
|
23
25
|
@block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
24
|
-
return
|
26
|
+
return
|
25
27
|
end
|
26
28
|
|
27
|
-
# try exact match
|
28
29
|
block_begin = str2.downcase.index(str1.downcase)
|
29
30
|
unless block_begin.nil?
|
30
31
|
@block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
31
|
-
return
|
32
|
+
return
|
32
33
|
end
|
33
34
|
|
35
|
+
|
36
|
+
## to find block alignments
|
34
37
|
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
|
35
38
|
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
last
|
40
|
-
|
41
|
-
last[:
|
42
|
-
last[:target][:end] = anchor[:target][:end]
|
39
|
+
blocks = []
|
40
|
+
while block = anchor_finder.get_next_anchor
|
41
|
+
last = blocks.last
|
42
|
+
if last && (block[:source][:begin] == last[:source][:end] + 1) && (block[:target][:begin] == last[:target][:end] + 1)
|
43
|
+
last[:source][:end] = block[:source][:end]
|
44
|
+
last[:target][:end] = block[:target][:end]
|
43
45
|
else
|
44
|
-
|
46
|
+
blocks << block.merge(alignment: :block, delta: block[:target][:begin] - block[:source][:begin])
|
45
47
|
end
|
46
48
|
end
|
47
49
|
|
48
|
-
# pp
|
50
|
+
# pp blocks
|
49
51
|
# puts "-----"
|
50
52
|
# puts
|
51
|
-
#
|
53
|
+
# exit
|
54
|
+
# blocks.each do |b|
|
52
55
|
# p [b[:source], b[:target]]
|
53
56
|
# puts "---"
|
54
57
|
# puts str1[b[:source][:begin] ... b[:source][:end]]
|
@@ -60,114 +63,196 @@ class TextAlignment::TextAlignment
|
|
60
63
|
# puts "-=-=-=-=-"
|
61
64
|
# puts
|
62
65
|
|
63
|
-
##
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
if mblocks[0][:source][:begin] > 0
|
69
|
-
e1 = mblocks[0][:source][:begin]
|
70
|
-
e2 = mblocks[0][:target][:begin]
|
66
|
+
## to fill the gaps
|
67
|
+
last_block = nil
|
68
|
+
blocks2 = blocks.inject([]) do |sum, block|
|
69
|
+
b1 = last_block ? last_block[:source][:end] : 0
|
70
|
+
e1 = block[:source][:begin]
|
71
71
|
|
72
|
-
if
|
73
|
-
|
72
|
+
sum += if b1 == e1
|
73
|
+
[block]
|
74
74
|
else
|
75
|
-
|
76
|
-
|
75
|
+
b2 = last_block ? last_block[:target][:end] : 0
|
76
|
+
e2 = block[:target][:begin]
|
77
|
+
|
78
|
+
if b2 == e2
|
79
|
+
[
|
80
|
+
{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
|
81
|
+
block
|
82
|
+
]
|
83
|
+
else
|
84
|
+
if b1 == 0 && b2 == 0
|
85
|
+
len_buffer = (e1 * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
86
|
+
b2 = e2 - len_buffer if e2 > len_buffer
|
87
|
+
end
|
77
88
|
|
78
|
-
|
79
|
-
|
80
|
-
|
89
|
+
_str1 = str1[b1 ... e1]
|
90
|
+
_str2 = str2[b2 ... e2]
|
91
|
+
|
92
|
+
if _str1.strip.empty? || _str2.strip.empty?
|
93
|
+
[
|
94
|
+
{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
|
95
|
+
block
|
96
|
+
]
|
81
97
|
else
|
82
|
-
|
83
|
-
len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
84
|
-
b1 = _str1.length < len_buffer ? 0 : e1 - len_buffer
|
85
|
-
b2 = _str2.length < len_buffer ? 0 : e2 - len_buffer
|
86
|
-
|
87
|
-
@block_alignment[:blocks] << {source:{begin:0, end:b1}, target:{begin:0, end:b2}, alignment: :empty} if b1 > 0
|
88
|
-
|
89
|
-
_str1 = str1[b1 ... e1]
|
90
|
-
_str2 = str2[b2 ... e2]
|
91
|
-
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase)
|
92
|
-
if alignment.similarity < 0.5
|
93
|
-
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: alignment.similarity}
|
94
|
-
else
|
95
|
-
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}
|
96
|
-
end
|
98
|
+
local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations) << block
|
97
99
|
end
|
98
100
|
end
|
99
101
|
end
|
102
|
+
|
103
|
+
last_block = block
|
104
|
+
sum
|
100
105
|
end
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
if
|
112
|
-
|
106
|
+
|
107
|
+
# the last step
|
108
|
+
blocks2 += if last_block.nil?
|
109
|
+
local_alignment_blocks(str1, 0, str1.length, str2, 0, str2.length, denotations)
|
110
|
+
else
|
111
|
+
b1 = last_block[:source][:end]
|
112
|
+
if b1 < str1.length
|
113
|
+
e1 = str1.length
|
114
|
+
|
115
|
+
b2 = last_block[:target][:end]
|
116
|
+
if b2 < str2.length
|
117
|
+
len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
118
|
+
e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
|
119
|
+
local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
|
113
120
|
else
|
114
|
-
|
115
|
-
if alignment.similarity < 0.5
|
116
|
-
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: alignment.similarity}
|
117
|
-
else
|
118
|
-
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}
|
119
|
-
end
|
121
|
+
[{source:{begin:last_block[:source][:end], end:str1.length}, alignment: :empty}]
|
120
122
|
end
|
121
123
|
end
|
122
|
-
@block_alignment[:blocks] << mblocks[i].merge(alignment: :block)
|
123
124
|
end
|
124
125
|
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
126
|
+
@block_alignment[:blocks] = blocks2
|
127
|
+
end
|
128
|
+
|
129
|
+
def local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations = nil)
|
130
|
+
block2 = str2[b2 ... e2]
|
131
|
+
|
132
|
+
## term-based alignment
|
133
|
+
tblocks = if denotations
|
134
|
+
ds_in_scope = denotations.select{|d| d[:span][:begin] >= b1 && d[:span][:end] <= e1}.
|
135
|
+
sort{|d1, d2| d1[:span][:begin] <=> d2[:span][:begin] || d2[:span][:end] <=> d1[:span][:end] }.
|
136
|
+
map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
|
137
|
+
|
138
|
+
position = 0
|
139
|
+
tblocks = ds_in_scope.map do |term|
|
140
|
+
lex = term[:lex]
|
141
|
+
r = block2.index(lex, position)
|
142
|
+
if r.nil?
|
143
|
+
position = nil
|
144
|
+
break
|
145
|
+
end
|
146
|
+
position = r + lex.length
|
147
|
+
{source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, delta: r - term[:span][:begin]}
|
148
|
+
end
|
149
|
+
|
150
|
+
# missing term found
|
151
|
+
tblocks = [] if position.nil?
|
152
|
+
|
153
|
+
# redundant matching found
|
154
|
+
unless position.nil?
|
155
|
+
ds_in_scope.each do |term|
|
156
|
+
lex = term[:lex]
|
157
|
+
look_forward = block2.index(lex, position)
|
158
|
+
unless look_forward.nil?
|
159
|
+
puts lex
|
160
|
+
tblocks = []
|
161
|
+
break
|
162
|
+
end
|
163
|
+
end
|
164
|
+
end
|
165
|
+
|
166
|
+
tblocks
|
167
|
+
end
|
131
168
|
|
132
|
-
|
133
|
-
|
134
|
-
|
169
|
+
if tblocks.empty?
|
170
|
+
if b1 == 0 && e1 == str1.length
|
171
|
+
if str2.length > 2000
|
172
|
+
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
135
173
|
else
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
174
|
+
block1 = str1[b1 ... e1]
|
175
|
+
block2 = str2[b2 ... e2]
|
176
|
+
|
177
|
+
## character-based alignment
|
178
|
+
alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
|
179
|
+
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}]
|
180
|
+
# alignment = :alignment
|
181
|
+
# [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :alignment}]
|
182
|
+
end
|
183
|
+
else
|
184
|
+
block1 = str1[b1 ... e1]
|
185
|
+
block2 = str2[b2 ... e2]
|
186
|
+
|
187
|
+
## character-based alignment
|
188
|
+
alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
|
189
|
+
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}]
|
190
|
+
# alignmnet = :alignment
|
191
|
+
# [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :alignment}]
|
192
|
+
end
|
193
|
+
else
|
194
|
+
last_tblock = nil
|
195
|
+
lblocks = tblocks.inject([]) do |sum, tblock|
|
196
|
+
tb1 = last_tblock ? last_tblock[:source][:end] : b1
|
197
|
+
te1 = tblock[:source][:begin]
|
142
198
|
|
143
|
-
|
144
|
-
|
145
|
-
|
199
|
+
sum += if te1 == tb1
|
200
|
+
[tblock]
|
201
|
+
else
|
202
|
+
tb2 = last_tblock ? tlast_block[:target][:end] : b2
|
203
|
+
te2 = tblock[:target][:begin]
|
204
|
+
|
205
|
+
if b2 == e2
|
206
|
+
[
|
207
|
+
{source:{begin:tb1, end:te1}, alignment: :empty},
|
208
|
+
tblock
|
209
|
+
]
|
146
210
|
else
|
147
|
-
|
211
|
+
[
|
212
|
+
{source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty},
|
213
|
+
tblock
|
214
|
+
]
|
148
215
|
end
|
216
|
+
end
|
217
|
+
|
218
|
+
last_tblock = tblock
|
219
|
+
sum
|
220
|
+
end
|
149
221
|
|
150
|
-
|
222
|
+
if last_tblock[:source][:end] < e1
|
223
|
+
if last_tblock[:target][:end] < e2
|
224
|
+
lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, target:{begin:last_tblock[:target][:end], end:e2}, alignment: :empty}
|
225
|
+
else
|
226
|
+
lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, alignment: :empty}
|
151
227
|
end
|
152
228
|
end
|
153
|
-
end
|
154
229
|
|
155
|
-
|
156
|
-
a[:delta] = a[:target][:begin] - a[:source][:begin]
|
230
|
+
lblocks
|
157
231
|
end
|
158
232
|
end
|
159
233
|
|
234
|
+
|
235
|
+
def indices(str, target)
|
236
|
+
position = 0
|
237
|
+
len = target.len
|
238
|
+
Enumerator.new do |yielder|
|
239
|
+
while idx = str.index(target, position)
|
240
|
+
yielder << idx
|
241
|
+
position = idx + len
|
242
|
+
end
|
243
|
+
end
|
244
|
+
end
|
245
|
+
|
160
246
|
def transform_begin_position(begin_position)
|
161
247
|
i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
|
162
248
|
block = @block_alignment[:blocks][i]
|
163
249
|
|
164
|
-
b = if block[:alignment] == :block
|
250
|
+
b = if block[:alignment] == :block || block[:alignment] == :term
|
165
251
|
begin_position + block[:delta]
|
166
252
|
elsif block[:alignment] == :empty
|
167
253
|
if begin_position == block[:source][:begin]
|
168
254
|
block[:target][:begin]
|
169
255
|
else
|
170
|
-
# raise "lost annotation"
|
171
256
|
nil
|
172
257
|
end
|
173
258
|
else
|
@@ -180,13 +265,12 @@ class TextAlignment::TextAlignment
|
|
180
265
|
i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
|
181
266
|
block = @block_alignment[:blocks][i]
|
182
267
|
|
183
|
-
e = if block[:alignment] == :block
|
268
|
+
e = if block[:alignment] == :block || block[:alignment] == :term
|
184
269
|
end_position + block[:delta]
|
185
270
|
elsif block[:alignment] == :empty
|
186
271
|
if end_position == block[:source][:end]
|
187
272
|
block[:target][:end]
|
188
273
|
else
|
189
|
-
# raise "lost annotation"
|
190
274
|
nil
|
191
275
|
end
|
192
276
|
else
|
@@ -208,14 +292,14 @@ class TextAlignment::TextAlignment
|
|
208
292
|
@lost_annotations = []
|
209
293
|
|
210
294
|
denotations.each do |d|
|
211
|
-
begin
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
end
|
295
|
+
source = {begin:d.begin, end:d.end}
|
296
|
+
d.begin = transform_begin_position(d.begin);
|
297
|
+
d.end = transform_end_position(d.end);
|
298
|
+
raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @str2.length
|
299
|
+
rescue
|
300
|
+
@lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
|
301
|
+
d.begin = nil
|
302
|
+
d.end = nil
|
219
303
|
end
|
220
304
|
|
221
305
|
@lost_annotations
|
@@ -226,12 +310,12 @@ class TextAlignment::TextAlignment
|
|
226
310
|
@lost_annotations = []
|
227
311
|
|
228
312
|
r = hdenotations.collect do |d|
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
313
|
+
t = transform_a_span(d[:span])
|
314
|
+
raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @str2.length
|
315
|
+
new_d = d.dup.merge({span:t})
|
316
|
+
rescue
|
317
|
+
@lost_annotations << {source: d[:span], target:t}
|
318
|
+
nil
|
235
319
|
end.compact
|
236
320
|
|
237
321
|
r
|
@@ -245,7 +329,10 @@ class TextAlignment::TextAlignment
|
|
245
329
|
@block_alignment[:blocks].each do |a|
|
246
330
|
show += case a[:alignment]
|
247
331
|
when :block
|
248
|
-
"===== common ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
332
|
+
"===== common (block) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
333
|
+
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
|
334
|
+
when :term
|
335
|
+
"===== common (term) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
249
336
|
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
|
250
337
|
when :empty
|
251
338
|
"xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: '0.7'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-10-
|
11
|
+
date: 2020-10-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|