text_alignment 0.4.3 → 0.6.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/align_annotations +32 -37
- data/lib/text_alignment/lcs_comparison.rb +3 -1
- data/lib/text_alignment/text_alignment.rb +124 -48
- data/lib/text_alignment/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fb5dd06236d0b1a8a9c8c5fcb92807a62bdd30e0648bcbd636b95b2a8a45b9b4
|
4
|
+
data.tar.gz: 9266b852993bfee999daa92e3f38ec93e2aec77171fee27c1fea6ac2a17e4d23
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7ee2a590fb31bcc27121a4a227d7fcefe2e8e80646bea3898bb86729ca3ca299e0aebcf23bea30e2391687e6ec0d6573c04a4605f728562482c7edbd0c0285e0
|
7
|
+
data.tar.gz: 73612c185fe533b0daa22d44e7776ed610025cb1bd874f05d95761079f95d1e8a06ead68c88b84bab4d33e8a676edff1e98880912254d9a7ecb5c4ead5eb01fb
|
data/bin/align_annotations
CHANGED
@@ -35,6 +35,10 @@ def align_mdoc(source_annotations, target_annotations)
|
|
35
35
|
source_annotations.each do |annotations|
|
36
36
|
alignment = TextAlignment::TextAlignment.new(annotations[:text], target_annotations[:text])
|
37
37
|
|
38
|
+
puts alignment.alignment_show
|
39
|
+
puts "-----"
|
40
|
+
puts
|
41
|
+
|
38
42
|
# alignment.block_alignments.each do |a|
|
39
43
|
# p {source:a[:source], target:a[:target]}
|
40
44
|
# puts "--"
|
@@ -103,48 +107,39 @@ target_annotations = if source_annotations.class == Array
|
|
103
107
|
else
|
104
108
|
alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text)
|
105
109
|
|
106
|
-
pp alignment
|
110
|
+
# pp alignment
|
107
111
|
|
108
112
|
# verification
|
109
|
-
source_text = source_annotations[:text]
|
110
|
-
puts "=====BEGIN"
|
111
|
-
(0 ... source_text.rstrip.length).each do |p|
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
end
|
119
|
-
puts
|
120
|
-
puts "=====END"
|
121
|
-
|
122
|
-
puts "=====BEGIN"
|
123
|
-
(0 .. source_text.rstrip.length).each do |p|
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
else
|
128
|
-
print '.'
|
129
|
-
end
|
130
|
-
end
|
131
|
-
puts
|
132
|
-
puts "=====END"
|
133
|
-
|
134
|
-
# alignment.block_alignments.each do |a|
|
135
|
-
# if a[:alignment].nil? || a[:alignment] == :empty
|
136
|
-
# # p [a[:source], a[:target]]
|
137
|
-
# # p a[:alignment]
|
113
|
+
# source_text = source_annotations[:text]
|
114
|
+
# puts "=====BEGIN"
|
115
|
+
# (0 ... source_text.rstrip.length).each do |p|
|
116
|
+
# t = alignment.transform_begin_position(p)
|
117
|
+
# if t.nil?
|
118
|
+
# print source_text[p]
|
119
|
+
# else
|
120
|
+
# print '.'
|
121
|
+
# end
|
122
|
+
# end
|
123
|
+
# puts
|
124
|
+
# puts "=====END"
|
125
|
+
|
126
|
+
# puts "=====BEGIN"
|
127
|
+
# (0 .. source_text.rstrip.length).each do |p|
|
128
|
+
# t = alignment.transform_end_position(p)
|
129
|
+
# if t.nil?
|
130
|
+
# print source_text[p]
|
138
131
|
# else
|
139
|
-
#
|
140
|
-
# p a[:alignment].similarity
|
141
|
-
# puts "--"
|
142
|
-
# puts source_annotations[:text][a[:source][:begin] ... a[:source][:end]]
|
143
|
-
# puts "--"
|
144
|
-
# puts target_text[a[:target][:begin] ... a[:target][:end]]
|
145
|
-
# puts "======"
|
132
|
+
# print '.'
|
146
133
|
# end
|
147
134
|
# end
|
135
|
+
# puts
|
136
|
+
# puts "=====END"
|
137
|
+
|
138
|
+
source_text = source_annotations[:text]
|
139
|
+
|
140
|
+
puts "[block alignment]"
|
141
|
+
puts alignment.alignment_show
|
142
|
+
puts "====="
|
148
143
|
# exit
|
149
144
|
|
150
145
|
# verification of source denotations
|
@@ -33,7 +33,9 @@ class TextAlignment::LCSComparison
|
|
33
33
|
@str2_match_initial = sdiff[match_initial].new_position
|
34
34
|
@str1_match_final = sdiff[match_final].old_position
|
35
35
|
@str2_match_final = sdiff[match_final].new_position
|
36
|
-
|
36
|
+
mlcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
|
37
|
+
@similarity = 2 * mlcs / (str1[@str1_match_initial .. @str1_match_final].scan(/\S/).count + str2[@str2_match_initial .. @str2_match_final].scan(/\S/).count).to_f
|
38
|
+
# @similarity = 2 * lcs / (str1[@str1_match_initial .. @str1_match_final].length + str2[@str2_match_initial .. @str2_match_final].length).to_f
|
37
39
|
else
|
38
40
|
@str1_match_initial = 0
|
39
41
|
@str2_match_initial = 0
|
@@ -8,20 +8,29 @@ module TextAlignment; end unless defined? TextAlignment
|
|
8
8
|
TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
|
9
9
|
|
10
10
|
class TextAlignment::TextAlignment
|
11
|
-
attr_reader :
|
11
|
+
attr_reader :block_alignment
|
12
12
|
attr_reader :similarity
|
13
13
|
attr_reader :lost_annotations
|
14
14
|
|
15
15
|
def initialize(_str1, _str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
|
16
16
|
raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
|
17
17
|
|
18
|
+
@block_alignment = {source_text:_str1, target_text:_str2}
|
19
|
+
|
18
20
|
str1, str2, mappings = string_preprocessing(_str1, _str2)
|
19
21
|
|
20
22
|
# try exact match
|
21
23
|
block_begin = str2.index(str1)
|
22
24
|
unless block_begin.nil?
|
23
|
-
@
|
24
|
-
return @
|
25
|
+
@block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
26
|
+
return @block_alignment
|
27
|
+
end
|
28
|
+
|
29
|
+
# try exact match
|
30
|
+
block_begin = str2.downcase.index(str1.downcase)
|
31
|
+
unless block_begin.nil?
|
32
|
+
@block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
33
|
+
return @block_alignment
|
25
34
|
end
|
26
35
|
|
27
36
|
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
|
@@ -54,7 +63,7 @@ class TextAlignment::TextAlignment
|
|
54
63
|
# puts
|
55
64
|
|
56
65
|
## To find block alignments
|
57
|
-
@
|
66
|
+
@block_alignment[:blocks] = []
|
58
67
|
return if mblocks.empty?
|
59
68
|
|
60
69
|
# Initial step
|
@@ -63,35 +72,36 @@ class TextAlignment::TextAlignment
|
|
63
72
|
e2 = mblocks[0][:target][:begin]
|
64
73
|
|
65
74
|
if mblocks[0][:target][:begin] == 0
|
66
|
-
@
|
75
|
+
@block_alignment[:blocks] << {source:{begin:0, end:e1}, target:{begin:0, end:0}, alignment: :empty}
|
67
76
|
else
|
68
77
|
_str1 = str1[0 ... e1]
|
69
78
|
_str2 = str2[0 ... e2]
|
70
79
|
|
71
80
|
unless _str1.strip.empty?
|
72
81
|
if _str2.strip.empty?
|
73
|
-
@
|
82
|
+
@block_alignment[:blocks] << {source:{begin:0, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
|
74
83
|
else
|
75
84
|
len_min = [_str1.length, _str2.length].min
|
76
85
|
len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
77
86
|
b1 = _str1.length < len_buffer ? 0 : e1 - len_buffer
|
78
87
|
b2 = _str2.length < len_buffer ? 0 : e2 - len_buffer
|
79
88
|
|
80
|
-
@
|
89
|
+
@block_alignment[:blocks] << {source:{begin:0, end:b1}, target:{begin:0, end:b2}, alignment: :empty} if b1 > 0
|
81
90
|
|
82
91
|
_str1 = str1[b1 ... e1]
|
83
92
|
_str2 = str2[b2 ... e2]
|
84
93
|
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
|
85
|
-
|
86
|
-
|
94
|
+
similarity = alignment_similarity(_str1, _str2, alignment)
|
95
|
+
if similarity < 0.6
|
96
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment: :empty, similarity: similarity}
|
87
97
|
else
|
88
|
-
@
|
98
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment:alignment}
|
89
99
|
end
|
90
100
|
end
|
91
101
|
end
|
92
102
|
end
|
93
103
|
end
|
94
|
-
@
|
104
|
+
@block_alignment[:blocks] << mblocks[0].merge(alignment: :block)
|
95
105
|
|
96
106
|
(1 ... mblocks.length).each do |i|
|
97
107
|
b1 = mblocks[i - 1][:source][:end]
|
@@ -102,17 +112,18 @@ class TextAlignment::TextAlignment
|
|
102
112
|
_str2 = str2[b2 ... e2]
|
103
113
|
unless _str1.strip.empty?
|
104
114
|
if _str2.strip.empty?
|
105
|
-
@
|
115
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
|
106
116
|
else
|
107
117
|
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
|
108
|
-
|
109
|
-
|
118
|
+
similarity = alignment_similarity(_str1, _str2, alignment)
|
119
|
+
if similarity < 0.6
|
120
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: similarity}
|
110
121
|
else
|
111
|
-
@
|
122
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
|
112
123
|
end
|
113
124
|
end
|
114
125
|
end
|
115
|
-
@
|
126
|
+
@block_alignment[:blocks] << mblocks[i].merge(alignment: :block)
|
116
127
|
end
|
117
128
|
|
118
129
|
# Final step
|
@@ -124,7 +135,7 @@ class TextAlignment::TextAlignment
|
|
124
135
|
|
125
136
|
unless _str1.strip.empty?
|
126
137
|
if _str2.strip.empty?
|
127
|
-
@
|
138
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:str1.length}, target:{begin:b2, end:str2.length}, alignment: :empty}
|
128
139
|
else
|
129
140
|
len_min = [_str1.length, _str2.length].min
|
130
141
|
len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
@@ -134,57 +145,58 @@ class TextAlignment::TextAlignment
|
|
134
145
|
_str2 = str2[b2 ... e2]
|
135
146
|
|
136
147
|
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
|
137
|
-
|
138
|
-
|
148
|
+
similarity = alignment_similarity(_str1, _str2, alignment)
|
149
|
+
if similarity < 0.6
|
150
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: similarity}
|
139
151
|
else
|
140
|
-
@
|
152
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
|
141
153
|
end
|
142
154
|
|
143
|
-
@
|
155
|
+
@block_alignment[:blocks] << {source:{begin:e1, end:-1}, target:{begin:e2, end:-1}, alignment: :empty} if e1 < str1.length
|
144
156
|
end
|
145
157
|
end
|
146
158
|
end
|
147
159
|
|
148
|
-
@
|
160
|
+
@block_alignment[:blocks].each do |a|
|
149
161
|
a[:delta] = a[:target][:begin] - a[:source][:begin]
|
150
162
|
end
|
151
163
|
end
|
152
164
|
|
153
165
|
def transform_begin_position(begin_position)
|
154
|
-
i = @
|
155
|
-
|
156
|
-
|
157
|
-
b = if
|
158
|
-
begin_position +
|
159
|
-
elsif
|
160
|
-
if begin_position ==
|
161
|
-
|
166
|
+
i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
|
167
|
+
block = @block_alignment[:blocks][i]
|
168
|
+
|
169
|
+
b = if block[:alignment] == :block
|
170
|
+
begin_position + block[:delta]
|
171
|
+
elsif block[:alignment] == :empty
|
172
|
+
if begin_position == block[:source][:begin]
|
173
|
+
block[:target][:begin]
|
162
174
|
else
|
163
175
|
# raise "lost annotation"
|
164
176
|
nil
|
165
177
|
end
|
166
178
|
else
|
167
|
-
r =
|
168
|
-
r.nil? ? nil : r +
|
179
|
+
r = block[:alignment].transform_begin_position(begin_position - block[:source][:begin])
|
180
|
+
r.nil? ? nil : r + block[:target][:begin]
|
169
181
|
end
|
170
182
|
end
|
171
183
|
|
172
184
|
def transform_end_position(end_position)
|
173
|
-
i = @
|
174
|
-
|
175
|
-
|
176
|
-
e = if
|
177
|
-
end_position +
|
178
|
-
elsif
|
179
|
-
if end_position ==
|
180
|
-
|
185
|
+
i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
|
186
|
+
block = @block_alignment[:blocks][i]
|
187
|
+
|
188
|
+
e = if block[:alignment] == :block
|
189
|
+
end_position + block[:delta]
|
190
|
+
elsif block[:alignment] == :empty
|
191
|
+
if end_position == block[:source][:end]
|
192
|
+
block[:target][:end]
|
181
193
|
else
|
182
194
|
# raise "lost annotation"
|
183
195
|
nil
|
184
196
|
end
|
185
197
|
else
|
186
|
-
r =
|
187
|
-
r.nil? ? nil : r +
|
198
|
+
r = block[:alignment].transform_end_position(end_position - block[:source][:begin])
|
199
|
+
r.nil? ? nil : r + block[:target][:begin]
|
188
200
|
end
|
189
201
|
end
|
190
202
|
|
@@ -230,8 +242,63 @@ class TextAlignment::TextAlignment
|
|
230
242
|
r
|
231
243
|
end
|
232
244
|
|
233
|
-
|
245
|
+
def alignment_show
|
246
|
+
stext = @block_alignment[:source_text]
|
247
|
+
ttext = @block_alignment[:target_text]
|
248
|
+
|
249
|
+
show = ''
|
250
|
+
@block_alignment[:blocks].each do |a|
|
251
|
+
show += case a[:alignment]
|
252
|
+
when :block
|
253
|
+
"===== common =====\n" +
|
254
|
+
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
|
255
|
+
when :empty
|
256
|
+
"xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
|
257
|
+
"<<<<< string 1\n" +
|
258
|
+
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
|
259
|
+
">>>>> string 2\n" +
|
260
|
+
ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
|
261
|
+
else
|
262
|
+
astr1 = ''
|
263
|
+
astr2 = ''
|
264
|
+
|
265
|
+
base = a[:source][:begin]
|
266
|
+
astr1 = a[:alignment].sdiff.map do |c|
|
267
|
+
case c.action
|
268
|
+
when '='
|
269
|
+
stext[c.old_position + base]
|
270
|
+
when '+'
|
271
|
+
'_'
|
272
|
+
when '-'
|
273
|
+
stext[c.old_position + base]
|
274
|
+
when '!'
|
275
|
+
stext[c.old_position + base] + '_'
|
276
|
+
end
|
277
|
+
end.join('')
|
278
|
+
|
279
|
+
base = a[:target][:begin]
|
280
|
+
astr2 = a[:alignment].sdiff.map do |c|
|
281
|
+
case c.action
|
282
|
+
when '='
|
283
|
+
ttext[c.new_position + base]
|
284
|
+
when '+'
|
285
|
+
ttext[c.new_position + base]
|
286
|
+
when '-'
|
287
|
+
'_'
|
288
|
+
when '!'
|
289
|
+
'_' + ttext[c.new_position + base]
|
290
|
+
end
|
291
|
+
end.join('')
|
234
292
|
|
293
|
+
"***** local mismatch\n" +
|
294
|
+
"[#{astr1}]\n" +
|
295
|
+
"[#{astr2}]\n\n"
|
296
|
+
end
|
297
|
+
end
|
298
|
+
show
|
299
|
+
end
|
300
|
+
|
301
|
+
private
|
235
302
|
|
236
303
|
def string_preprocessing(_str1, _str2)
|
237
304
|
str1 = _str1.dup
|
@@ -253,15 +320,15 @@ class TextAlignment::TextAlignment
|
|
253
320
|
pletters = TextAlignment::PADDING_LETTERS
|
254
321
|
|
255
322
|
# find the padding letter for str1
|
256
|
-
padding_letter1 = begin
|
323
|
+
@padding_letter1 = begin
|
257
324
|
i = pletters.index{|l| str2.index(l).nil?}
|
258
325
|
raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
|
259
326
|
TextAlignment::PADDING_LETTERS[i]
|
260
327
|
end
|
261
328
|
|
262
329
|
# find the padding letter for str2
|
263
|
-
padding_letter2 = begin
|
264
|
-
i = pletters.index{|l| l != padding_letter1 && str1.index(l).nil?}
|
330
|
+
@padding_letter2 = begin
|
331
|
+
i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
|
265
332
|
raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
|
266
333
|
TextAlignment::PADDING_LETTERS[i]
|
267
334
|
end
|
@@ -272,12 +339,12 @@ class TextAlignment::TextAlignment
|
|
272
339
|
from = f[1]
|
273
340
|
|
274
341
|
if str2.index(f[0])
|
275
|
-
to = f[0] + (padding_letter1 * (f[1].length - 1))
|
342
|
+
to = f[0] + (@padding_letter1 * (f[1].length - 1))
|
276
343
|
str1.gsub!(from, to)
|
277
344
|
end
|
278
345
|
|
279
346
|
if str1.index(f[0])
|
280
|
-
to = f[0] + (padding_letter2 * (f[1].length - 1))
|
347
|
+
to = f[0] + (@padding_letter2 * (f[1].length - 1))
|
281
348
|
str2.gsub!(from, to)
|
282
349
|
end
|
283
350
|
end
|
@@ -286,4 +353,13 @@ class TextAlignment::TextAlignment
|
|
286
353
|
[str1, str2, mappings]
|
287
354
|
end
|
288
355
|
|
356
|
+
def alignment_similarity(_s1, _s2, alignment)
|
357
|
+
# compute the lcs only with non-whitespace letters
|
358
|
+
lcs = alignment.sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
|
359
|
+
|
360
|
+
s1 = _s1.tr(@padding_letter1, ' ')
|
361
|
+
s2 = _s2.tr(@padding_letter2, ' ')
|
362
|
+
similarity = 2 * lcs / (s1.scan(/\S/).count + s2.scan(/\S/).count).to_f
|
363
|
+
end
|
364
|
+
|
289
365
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.6.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-10-
|
11
|
+
date: 2020-10-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|