text_alignment 0.4.3 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/align_annotations +32 -37
- data/lib/text_alignment/lcs_comparison.rb +3 -1
- data/lib/text_alignment/text_alignment.rb +124 -48
- data/lib/text_alignment/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fb5dd06236d0b1a8a9c8c5fcb92807a62bdd30e0648bcbd636b95b2a8a45b9b4
|
4
|
+
data.tar.gz: 9266b852993bfee999daa92e3f38ec93e2aec77171fee27c1fea6ac2a17e4d23
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7ee2a590fb31bcc27121a4a227d7fcefe2e8e80646bea3898bb86729ca3ca299e0aebcf23bea30e2391687e6ec0d6573c04a4605f728562482c7edbd0c0285e0
|
7
|
+
data.tar.gz: 73612c185fe533b0daa22d44e7776ed610025cb1bd874f05d95761079f95d1e8a06ead68c88b84bab4d33e8a676edff1e98880912254d9a7ecb5c4ead5eb01fb
|
data/bin/align_annotations
CHANGED
@@ -35,6 +35,10 @@ def align_mdoc(source_annotations, target_annotations)
|
|
35
35
|
source_annotations.each do |annotations|
|
36
36
|
alignment = TextAlignment::TextAlignment.new(annotations[:text], target_annotations[:text])
|
37
37
|
|
38
|
+
puts alignment.alignment_show
|
39
|
+
puts "-----"
|
40
|
+
puts
|
41
|
+
|
38
42
|
# alignment.block_alignments.each do |a|
|
39
43
|
# p {source:a[:source], target:a[:target]}
|
40
44
|
# puts "--"
|
@@ -103,48 +107,39 @@ target_annotations = if source_annotations.class == Array
|
|
103
107
|
else
|
104
108
|
alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text)
|
105
109
|
|
106
|
-
pp alignment
|
110
|
+
# pp alignment
|
107
111
|
|
108
112
|
# verification
|
109
|
-
source_text = source_annotations[:text]
|
110
|
-
puts "=====BEGIN"
|
111
|
-
(0 ... source_text.rstrip.length).each do |p|
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
end
|
119
|
-
puts
|
120
|
-
puts "=====END"
|
121
|
-
|
122
|
-
puts "=====BEGIN"
|
123
|
-
(0 .. source_text.rstrip.length).each do |p|
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
else
|
128
|
-
print '.'
|
129
|
-
end
|
130
|
-
end
|
131
|
-
puts
|
132
|
-
puts "=====END"
|
133
|
-
|
134
|
-
# alignment.block_alignments.each do |a|
|
135
|
-
# if a[:alignment].nil? || a[:alignment] == :empty
|
136
|
-
# # p [a[:source], a[:target]]
|
137
|
-
# # p a[:alignment]
|
113
|
+
# source_text = source_annotations[:text]
|
114
|
+
# puts "=====BEGIN"
|
115
|
+
# (0 ... source_text.rstrip.length).each do |p|
|
116
|
+
# t = alignment.transform_begin_position(p)
|
117
|
+
# if t.nil?
|
118
|
+
# print source_text[p]
|
119
|
+
# else
|
120
|
+
# print '.'
|
121
|
+
# end
|
122
|
+
# end
|
123
|
+
# puts
|
124
|
+
# puts "=====END"
|
125
|
+
|
126
|
+
# puts "=====BEGIN"
|
127
|
+
# (0 .. source_text.rstrip.length).each do |p|
|
128
|
+
# t = alignment.transform_end_position(p)
|
129
|
+
# if t.nil?
|
130
|
+
# print source_text[p]
|
138
131
|
# else
|
139
|
-
#
|
140
|
-
# p a[:alignment].similarity
|
141
|
-
# puts "--"
|
142
|
-
# puts source_annotations[:text][a[:source][:begin] ... a[:source][:end]]
|
143
|
-
# puts "--"
|
144
|
-
# puts target_text[a[:target][:begin] ... a[:target][:end]]
|
145
|
-
# puts "======"
|
132
|
+
# print '.'
|
146
133
|
# end
|
147
134
|
# end
|
135
|
+
# puts
|
136
|
+
# puts "=====END"
|
137
|
+
|
138
|
+
source_text = source_annotations[:text]
|
139
|
+
|
140
|
+
puts "[block alignment]"
|
141
|
+
puts alignment.alignment_show
|
142
|
+
puts "====="
|
148
143
|
# exit
|
149
144
|
|
150
145
|
# verification of source denotations
|
@@ -33,7 +33,9 @@ class TextAlignment::LCSComparison
|
|
33
33
|
@str2_match_initial = sdiff[match_initial].new_position
|
34
34
|
@str1_match_final = sdiff[match_final].old_position
|
35
35
|
@str2_match_final = sdiff[match_final].new_position
|
36
|
-
|
36
|
+
mlcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
|
37
|
+
@similarity = 2 * mlcs / (str1[@str1_match_initial .. @str1_match_final].scan(/\S/).count + str2[@str2_match_initial .. @str2_match_final].scan(/\S/).count).to_f
|
38
|
+
# @similarity = 2 * lcs / (str1[@str1_match_initial .. @str1_match_final].length + str2[@str2_match_initial .. @str2_match_final].length).to_f
|
37
39
|
else
|
38
40
|
@str1_match_initial = 0
|
39
41
|
@str2_match_initial = 0
|
@@ -8,20 +8,29 @@ module TextAlignment; end unless defined? TextAlignment
|
|
8
8
|
TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
|
9
9
|
|
10
10
|
class TextAlignment::TextAlignment
|
11
|
-
attr_reader :
|
11
|
+
attr_reader :block_alignment
|
12
12
|
attr_reader :similarity
|
13
13
|
attr_reader :lost_annotations
|
14
14
|
|
15
15
|
def initialize(_str1, _str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
|
16
16
|
raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
|
17
17
|
|
18
|
+
@block_alignment = {source_text:_str1, target_text:_str2}
|
19
|
+
|
18
20
|
str1, str2, mappings = string_preprocessing(_str1, _str2)
|
19
21
|
|
20
22
|
# try exact match
|
21
23
|
block_begin = str2.index(str1)
|
22
24
|
unless block_begin.nil?
|
23
|
-
@
|
24
|
-
return @
|
25
|
+
@block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
26
|
+
return @block_alignment
|
27
|
+
end
|
28
|
+
|
29
|
+
# try exact match
|
30
|
+
block_begin = str2.downcase.index(str1.downcase)
|
31
|
+
unless block_begin.nil?
|
32
|
+
@block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
33
|
+
return @block_alignment
|
25
34
|
end
|
26
35
|
|
27
36
|
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
|
@@ -54,7 +63,7 @@ class TextAlignment::TextAlignment
|
|
54
63
|
# puts
|
55
64
|
|
56
65
|
## To find block alignments
|
57
|
-
@
|
66
|
+
@block_alignment[:blocks] = []
|
58
67
|
return if mblocks.empty?
|
59
68
|
|
60
69
|
# Initial step
|
@@ -63,35 +72,36 @@ class TextAlignment::TextAlignment
|
|
63
72
|
e2 = mblocks[0][:target][:begin]
|
64
73
|
|
65
74
|
if mblocks[0][:target][:begin] == 0
|
66
|
-
@
|
75
|
+
@block_alignment[:blocks] << {source:{begin:0, end:e1}, target:{begin:0, end:0}, alignment: :empty}
|
67
76
|
else
|
68
77
|
_str1 = str1[0 ... e1]
|
69
78
|
_str2 = str2[0 ... e2]
|
70
79
|
|
71
80
|
unless _str1.strip.empty?
|
72
81
|
if _str2.strip.empty?
|
73
|
-
@
|
82
|
+
@block_alignment[:blocks] << {source:{begin:0, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
|
74
83
|
else
|
75
84
|
len_min = [_str1.length, _str2.length].min
|
76
85
|
len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
77
86
|
b1 = _str1.length < len_buffer ? 0 : e1 - len_buffer
|
78
87
|
b2 = _str2.length < len_buffer ? 0 : e2 - len_buffer
|
79
88
|
|
80
|
-
@
|
89
|
+
@block_alignment[:blocks] << {source:{begin:0, end:b1}, target:{begin:0, end:b2}, alignment: :empty} if b1 > 0
|
81
90
|
|
82
91
|
_str1 = str1[b1 ... e1]
|
83
92
|
_str2 = str2[b2 ... e2]
|
84
93
|
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
|
85
|
-
|
86
|
-
|
94
|
+
similarity = alignment_similarity(_str1, _str2, alignment)
|
95
|
+
if similarity < 0.6
|
96
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment: :empty, similarity: similarity}
|
87
97
|
else
|
88
|
-
@
|
98
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment:alignment}
|
89
99
|
end
|
90
100
|
end
|
91
101
|
end
|
92
102
|
end
|
93
103
|
end
|
94
|
-
@
|
104
|
+
@block_alignment[:blocks] << mblocks[0].merge(alignment: :block)
|
95
105
|
|
96
106
|
(1 ... mblocks.length).each do |i|
|
97
107
|
b1 = mblocks[i - 1][:source][:end]
|
@@ -102,17 +112,18 @@ class TextAlignment::TextAlignment
|
|
102
112
|
_str2 = str2[b2 ... e2]
|
103
113
|
unless _str1.strip.empty?
|
104
114
|
if _str2.strip.empty?
|
105
|
-
@
|
115
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
|
106
116
|
else
|
107
117
|
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
|
108
|
-
|
109
|
-
|
118
|
+
similarity = alignment_similarity(_str1, _str2, alignment)
|
119
|
+
if similarity < 0.6
|
120
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: similarity}
|
110
121
|
else
|
111
|
-
@
|
122
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
|
112
123
|
end
|
113
124
|
end
|
114
125
|
end
|
115
|
-
@
|
126
|
+
@block_alignment[:blocks] << mblocks[i].merge(alignment: :block)
|
116
127
|
end
|
117
128
|
|
118
129
|
# Final step
|
@@ -124,7 +135,7 @@ class TextAlignment::TextAlignment
|
|
124
135
|
|
125
136
|
unless _str1.strip.empty?
|
126
137
|
if _str2.strip.empty?
|
127
|
-
@
|
138
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:str1.length}, target:{begin:b2, end:str2.length}, alignment: :empty}
|
128
139
|
else
|
129
140
|
len_min = [_str1.length, _str2.length].min
|
130
141
|
len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
@@ -134,57 +145,58 @@ class TextAlignment::TextAlignment
|
|
134
145
|
_str2 = str2[b2 ... e2]
|
135
146
|
|
136
147
|
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
|
137
|
-
|
138
|
-
|
148
|
+
similarity = alignment_similarity(_str1, _str2, alignment)
|
149
|
+
if similarity < 0.6
|
150
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: similarity}
|
139
151
|
else
|
140
|
-
@
|
152
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
|
141
153
|
end
|
142
154
|
|
143
|
-
@
|
155
|
+
@block_alignment[:blocks] << {source:{begin:e1, end:-1}, target:{begin:e2, end:-1}, alignment: :empty} if e1 < str1.length
|
144
156
|
end
|
145
157
|
end
|
146
158
|
end
|
147
159
|
|
148
|
-
@
|
160
|
+
@block_alignment[:blocks].each do |a|
|
149
161
|
a[:delta] = a[:target][:begin] - a[:source][:begin]
|
150
162
|
end
|
151
163
|
end
|
152
164
|
|
153
165
|
def transform_begin_position(begin_position)
|
154
|
-
i = @
|
155
|
-
|
156
|
-
|
157
|
-
b = if
|
158
|
-
begin_position +
|
159
|
-
elsif
|
160
|
-
if begin_position ==
|
161
|
-
|
166
|
+
i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
|
167
|
+
block = @block_alignment[:blocks][i]
|
168
|
+
|
169
|
+
b = if block[:alignment] == :block
|
170
|
+
begin_position + block[:delta]
|
171
|
+
elsif block[:alignment] == :empty
|
172
|
+
if begin_position == block[:source][:begin]
|
173
|
+
block[:target][:begin]
|
162
174
|
else
|
163
175
|
# raise "lost annotation"
|
164
176
|
nil
|
165
177
|
end
|
166
178
|
else
|
167
|
-
r =
|
168
|
-
r.nil? ? nil : r +
|
179
|
+
r = block[:alignment].transform_begin_position(begin_position - block[:source][:begin])
|
180
|
+
r.nil? ? nil : r + block[:target][:begin]
|
169
181
|
end
|
170
182
|
end
|
171
183
|
|
172
184
|
def transform_end_position(end_position)
|
173
|
-
i = @
|
174
|
-
|
175
|
-
|
176
|
-
e = if
|
177
|
-
end_position +
|
178
|
-
elsif
|
179
|
-
if end_position ==
|
180
|
-
|
185
|
+
i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
|
186
|
+
block = @block_alignment[:blocks][i]
|
187
|
+
|
188
|
+
e = if block[:alignment] == :block
|
189
|
+
end_position + block[:delta]
|
190
|
+
elsif block[:alignment] == :empty
|
191
|
+
if end_position == block[:source][:end]
|
192
|
+
block[:target][:end]
|
181
193
|
else
|
182
194
|
# raise "lost annotation"
|
183
195
|
nil
|
184
196
|
end
|
185
197
|
else
|
186
|
-
r =
|
187
|
-
r.nil? ? nil : r +
|
198
|
+
r = block[:alignment].transform_end_position(end_position - block[:source][:begin])
|
199
|
+
r.nil? ? nil : r + block[:target][:begin]
|
188
200
|
end
|
189
201
|
end
|
190
202
|
|
@@ -230,8 +242,63 @@ class TextAlignment::TextAlignment
|
|
230
242
|
r
|
231
243
|
end
|
232
244
|
|
233
|
-
|
245
|
+
def alignment_show
|
246
|
+
stext = @block_alignment[:source_text]
|
247
|
+
ttext = @block_alignment[:target_text]
|
248
|
+
|
249
|
+
show = ''
|
250
|
+
@block_alignment[:blocks].each do |a|
|
251
|
+
show += case a[:alignment]
|
252
|
+
when :block
|
253
|
+
"===== common =====\n" +
|
254
|
+
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
|
255
|
+
when :empty
|
256
|
+
"xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
|
257
|
+
"<<<<< string 1\n" +
|
258
|
+
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
|
259
|
+
">>>>> string 2\n" +
|
260
|
+
ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
|
261
|
+
else
|
262
|
+
astr1 = ''
|
263
|
+
astr2 = ''
|
264
|
+
|
265
|
+
base = a[:source][:begin]
|
266
|
+
astr1 = a[:alignment].sdiff.map do |c|
|
267
|
+
case c.action
|
268
|
+
when '='
|
269
|
+
stext[c.old_position + base]
|
270
|
+
when '+'
|
271
|
+
'_'
|
272
|
+
when '-'
|
273
|
+
stext[c.old_position + base]
|
274
|
+
when '!'
|
275
|
+
stext[c.old_position + base] + '_'
|
276
|
+
end
|
277
|
+
end.join('')
|
278
|
+
|
279
|
+
base = a[:target][:begin]
|
280
|
+
astr2 = a[:alignment].sdiff.map do |c|
|
281
|
+
case c.action
|
282
|
+
when '='
|
283
|
+
ttext[c.new_position + base]
|
284
|
+
when '+'
|
285
|
+
ttext[c.new_position + base]
|
286
|
+
when '-'
|
287
|
+
'_'
|
288
|
+
when '!'
|
289
|
+
'_' + ttext[c.new_position + base]
|
290
|
+
end
|
291
|
+
end.join('')
|
234
292
|
|
293
|
+
"***** local mismatch\n" +
|
294
|
+
"[#{astr1}]\n" +
|
295
|
+
"[#{astr2}]\n\n"
|
296
|
+
end
|
297
|
+
end
|
298
|
+
show
|
299
|
+
end
|
300
|
+
|
301
|
+
private
|
235
302
|
|
236
303
|
def string_preprocessing(_str1, _str2)
|
237
304
|
str1 = _str1.dup
|
@@ -253,15 +320,15 @@ class TextAlignment::TextAlignment
|
|
253
320
|
pletters = TextAlignment::PADDING_LETTERS
|
254
321
|
|
255
322
|
# find the padding letter for str1
|
256
|
-
padding_letter1 = begin
|
323
|
+
@padding_letter1 = begin
|
257
324
|
i = pletters.index{|l| str2.index(l).nil?}
|
258
325
|
raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
|
259
326
|
TextAlignment::PADDING_LETTERS[i]
|
260
327
|
end
|
261
328
|
|
262
329
|
# find the padding letter for str2
|
263
|
-
padding_letter2 = begin
|
264
|
-
i = pletters.index{|l| l != padding_letter1 && str1.index(l).nil?}
|
330
|
+
@padding_letter2 = begin
|
331
|
+
i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
|
265
332
|
raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
|
266
333
|
TextAlignment::PADDING_LETTERS[i]
|
267
334
|
end
|
@@ -272,12 +339,12 @@ class TextAlignment::TextAlignment
|
|
272
339
|
from = f[1]
|
273
340
|
|
274
341
|
if str2.index(f[0])
|
275
|
-
to = f[0] + (padding_letter1 * (f[1].length - 1))
|
342
|
+
to = f[0] + (@padding_letter1 * (f[1].length - 1))
|
276
343
|
str1.gsub!(from, to)
|
277
344
|
end
|
278
345
|
|
279
346
|
if str1.index(f[0])
|
280
|
-
to = f[0] + (padding_letter2 * (f[1].length - 1))
|
347
|
+
to = f[0] + (@padding_letter2 * (f[1].length - 1))
|
281
348
|
str2.gsub!(from, to)
|
282
349
|
end
|
283
350
|
end
|
@@ -286,4 +353,13 @@ class TextAlignment::TextAlignment
|
|
286
353
|
[str1, str2, mappings]
|
287
354
|
end
|
288
355
|
|
356
|
+
def alignment_similarity(_s1, _s2, alignment)
|
357
|
+
# compute the lcs only with non-whitespace letters
|
358
|
+
lcs = alignment.sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
|
359
|
+
|
360
|
+
s1 = _s1.tr(@padding_letter1, ' ')
|
361
|
+
s2 = _s2.tr(@padding_letter2, ' ')
|
362
|
+
similarity = 2 * lcs / (s1.scan(/\S/).count + s2.scan(/\S/).count).to_f
|
363
|
+
end
|
364
|
+
|
289
365
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.6.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-10-
|
11
|
+
date: 2020-10-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|