text_alignment 0.6.2 → 0.6.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/text_alignment/mixed_alignment.rb +74 -4
- data/lib/text_alignment/text_alignment.rb +19 -91
- data/lib/text_alignment/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6bed1eba72da626227ab727ce22129d226539bcfae5ca22006ac26258b184d8c
|
4
|
+
data.tar.gz: d2c121ea072186fd25fd61fb90c5ffacb886c1d109b82c044a1666220b8f7d8b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6e526995325e79fdde8ecd729c04e2e6a21e13f0166acc39b341133055275a1bbd5a3318f78dd5af4a72237c140fa8eb06270441a16e2426e58a57183b91ca6a
|
7
|
+
data.tar.gz: ec423d59036b1ee5595141428fe320f0e9ca16b8b2660d46a0f59f376c3845ad70196d006c2f83390ac12f98b35ff14a1098fcd24cda0ee1c6534f36915def81
|
@@ -17,9 +17,10 @@ class TextAlignment::MixedAlignment
|
|
17
17
|
attr_reader :similarity
|
18
18
|
attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final
|
19
19
|
|
20
|
-
def initialize(
|
21
|
-
raise ArgumentError, "nil string" if
|
22
|
-
|
20
|
+
def initialize(_str1, _str2)
|
21
|
+
raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
|
22
|
+
|
23
|
+
str1, str2, mappings = string_preprocessing(_str1, _str2)
|
23
24
|
|
24
25
|
_compute_mixed_alignment(str1, str2, mappings)
|
25
26
|
end
|
@@ -62,7 +63,7 @@ class TextAlignment::MixedAlignment
|
|
62
63
|
end
|
63
64
|
|
64
65
|
cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
|
65
|
-
@similarity =
|
66
|
+
@similarity = compute_similarity(str1, str2, @sdiff)
|
66
67
|
@str1_match_initial = cmp.str1_match_initial
|
67
68
|
@str1_match_final = cmp.str1_match_final
|
68
69
|
@str2_match_initial = cmp.str2_match_initial
|
@@ -137,4 +138,73 @@ class TextAlignment::MixedAlignment
|
|
137
138
|
@position_map_begin = posmap_begin.sort.to_h
|
138
139
|
@position_map_end = posmap_end.sort.to_h
|
139
140
|
end
|
141
|
+
|
142
|
+
private
|
143
|
+
|
144
|
+
def string_preprocessing(_str1, _str2)
|
145
|
+
str1 = _str1.dup
|
146
|
+
str2 = _str2.dup
|
147
|
+
mappings = TextAlignment::MAPPINGS.dup
|
148
|
+
|
149
|
+
## single character mappings
|
150
|
+
character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
|
151
|
+
characters_from = character_mappings.collect{|m| m[0]}.join
|
152
|
+
characters_to = character_mappings.collect{|m| m[1]}.join
|
153
|
+
characters_to.gsub!(/-/, '\-')
|
154
|
+
|
155
|
+
str1.tr!(characters_from, characters_to)
|
156
|
+
str2.tr!(characters_from, characters_to)
|
157
|
+
|
158
|
+
mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
|
159
|
+
|
160
|
+
## long to one character mappings
|
161
|
+
pletters = TextAlignment::PADDING_LETTERS
|
162
|
+
|
163
|
+
# find the padding letter for str1
|
164
|
+
@padding_letter1 = begin
|
165
|
+
i = pletters.index{|l| str2.index(l).nil?}
|
166
|
+
raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
|
167
|
+
TextAlignment::PADDING_LETTERS[i]
|
168
|
+
end
|
169
|
+
|
170
|
+
# find the padding letter for str2
|
171
|
+
@padding_letter2 = begin
|
172
|
+
i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
|
173
|
+
raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
|
174
|
+
TextAlignment::PADDING_LETTERS[i]
|
175
|
+
end
|
176
|
+
|
177
|
+
# ASCII foldings
|
178
|
+
ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
|
179
|
+
ascii_foldings.each do |f|
|
180
|
+
from = f[1]
|
181
|
+
|
182
|
+
if str2.index(f[0])
|
183
|
+
to = f[0] + (@padding_letter1 * (f[1].length - 1))
|
184
|
+
str1.gsub!(from, to)
|
185
|
+
end
|
186
|
+
|
187
|
+
if str1.index(f[0])
|
188
|
+
to = f[0] + (@padding_letter2 * (f[1].length - 1))
|
189
|
+
str2.gsub!(from, to)
|
190
|
+
end
|
191
|
+
end
|
192
|
+
mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
|
193
|
+
|
194
|
+
[str1, str2, mappings]
|
195
|
+
end
|
196
|
+
|
197
|
+
def compute_similarity(_s1, _s2, sdiff)
|
198
|
+
return 0 if sdiff.nil?
|
199
|
+
|
200
|
+
# compute the lcs only with non-whitespace letters
|
201
|
+
lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
|
202
|
+
return 0 if lcs == 0
|
203
|
+
|
204
|
+
s1 = _s1.tr(@padding_letter1, ' ')
|
205
|
+
s2 = _s2.tr(@padding_letter2, ' ')
|
206
|
+
|
207
|
+
similarity = lcs / [s1.scan(/\S/).count, s2.scan(/\S/).count].min.to_f
|
208
|
+
end
|
209
|
+
|
140
210
|
end
|
@@ -12,12 +12,10 @@ class TextAlignment::TextAlignment
|
|
12
12
|
attr_reader :similarity
|
13
13
|
attr_reader :lost_annotations
|
14
14
|
|
15
|
-
def initialize(
|
16
|
-
raise ArgumentError, "nil string" if
|
15
|
+
def initialize(str1, str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
|
16
|
+
raise ArgumentError, "nil string" if str1.nil? || str2.nil?
|
17
17
|
|
18
|
-
@block_alignment = {source_text:
|
19
|
-
|
20
|
-
str1, str2, mappings = string_preprocessing(_str1, _str2)
|
18
|
+
@block_alignment = {source_text:str1, target_text:str2}
|
21
19
|
|
22
20
|
# try exact match
|
23
21
|
block_begin = str2.index(str1)
|
@@ -90,12 +88,11 @@ class TextAlignment::TextAlignment
|
|
90
88
|
|
91
89
|
_str1 = str1[b1 ... e1]
|
92
90
|
_str2 = str2[b2 ... e2]
|
93
|
-
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase
|
94
|
-
similarity
|
95
|
-
|
96
|
-
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment: :empty, similarity: similarity}
|
91
|
+
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase)
|
92
|
+
if alignment.similarity < 0.5
|
93
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment: :empty, similarity: alignment.similarity}
|
97
94
|
else
|
98
|
-
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment:alignment}
|
95
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment:alignment, similarity: alignment.similarity}
|
99
96
|
end
|
100
97
|
end
|
101
98
|
end
|
@@ -114,12 +111,11 @@ class TextAlignment::TextAlignment
|
|
114
111
|
if _str2.strip.empty?
|
115
112
|
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
|
116
113
|
else
|
117
|
-
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase
|
118
|
-
similarity
|
119
|
-
|
120
|
-
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: similarity}
|
114
|
+
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase)
|
115
|
+
if alignment.similarity < 0.5
|
116
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: alignment.similarity}
|
121
117
|
else
|
122
|
-
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
|
118
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}
|
123
119
|
end
|
124
120
|
end
|
125
121
|
end
|
@@ -144,12 +140,11 @@ class TextAlignment::TextAlignment
|
|
144
140
|
_str1 = str1[b1 ... e1]
|
145
141
|
_str2 = str2[b2 ... e2]
|
146
142
|
|
147
|
-
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase
|
148
|
-
similarity
|
149
|
-
|
150
|
-
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: similarity}
|
143
|
+
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase)
|
144
|
+
if alignment.similarity < 0.5
|
145
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: alignment.similarity}
|
151
146
|
else
|
152
|
-
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
|
147
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}
|
153
148
|
end
|
154
149
|
|
155
150
|
@block_alignment[:blocks] << {source:{begin:e1, end:-1}, target:{begin:e2, end:-1}, alignment: :empty} if e1 < str1.length
|
@@ -250,13 +245,13 @@ class TextAlignment::TextAlignment
|
|
250
245
|
@block_alignment[:blocks].each do |a|
|
251
246
|
show += case a[:alignment]
|
252
247
|
when :block
|
253
|
-
"===== common
|
248
|
+
"===== common ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
254
249
|
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
|
255
250
|
when :empty
|
256
251
|
"xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
|
257
|
-
"<<<<< string 1\n" +
|
252
|
+
"<<<<< string 1 [#{a[:source][:begin]} - #{a[:source][:end]}]\n" +
|
258
253
|
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
|
259
|
-
">>>>> string 2\n" +
|
254
|
+
">>>>> string 2 [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
260
255
|
ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
|
261
256
|
else
|
262
257
|
astr1 = ''
|
@@ -290,7 +285,7 @@ class TextAlignment::TextAlignment
|
|
290
285
|
end
|
291
286
|
end.join('')
|
292
287
|
|
293
|
-
"***** local mismatch\n" +
|
288
|
+
"***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
|
294
289
|
"[#{astr1}]\n" +
|
295
290
|
"[#{astr2}]\n\n"
|
296
291
|
end
|
@@ -298,71 +293,4 @@ class TextAlignment::TextAlignment
|
|
298
293
|
show
|
299
294
|
end
|
300
295
|
|
301
|
-
private
|
302
|
-
|
303
|
-
def string_preprocessing(_str1, _str2)
|
304
|
-
str1 = _str1.dup
|
305
|
-
str2 = _str2.dup
|
306
|
-
mappings = TextAlignment::MAPPINGS.dup
|
307
|
-
|
308
|
-
## single character mappings
|
309
|
-
character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
|
310
|
-
characters_from = character_mappings.collect{|m| m[0]}.join
|
311
|
-
characters_to = character_mappings.collect{|m| m[1]}.join
|
312
|
-
characters_to.gsub!(/-/, '\-')
|
313
|
-
|
314
|
-
str1.tr!(characters_from, characters_to)
|
315
|
-
str2.tr!(characters_from, characters_to)
|
316
|
-
|
317
|
-
mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
|
318
|
-
|
319
|
-
## long to one character mappings
|
320
|
-
pletters = TextAlignment::PADDING_LETTERS
|
321
|
-
|
322
|
-
# find the padding letter for str1
|
323
|
-
@padding_letter1 = begin
|
324
|
-
i = pletters.index{|l| str2.index(l).nil?}
|
325
|
-
raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
|
326
|
-
TextAlignment::PADDING_LETTERS[i]
|
327
|
-
end
|
328
|
-
|
329
|
-
# find the padding letter for str2
|
330
|
-
@padding_letter2 = begin
|
331
|
-
i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
|
332
|
-
raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
|
333
|
-
TextAlignment::PADDING_LETTERS[i]
|
334
|
-
end
|
335
|
-
|
336
|
-
# ASCII foldings
|
337
|
-
ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
|
338
|
-
ascii_foldings.each do |f|
|
339
|
-
from = f[1]
|
340
|
-
|
341
|
-
if str2.index(f[0])
|
342
|
-
to = f[0] + (@padding_letter1 * (f[1].length - 1))
|
343
|
-
str1.gsub!(from, to)
|
344
|
-
end
|
345
|
-
|
346
|
-
if str1.index(f[0])
|
347
|
-
to = f[0] + (@padding_letter2 * (f[1].length - 1))
|
348
|
-
str2.gsub!(from, to)
|
349
|
-
end
|
350
|
-
end
|
351
|
-
mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
|
352
|
-
|
353
|
-
[str1, str2, mappings]
|
354
|
-
end
|
355
|
-
|
356
|
-
def alignment_similarity(_s1, _s2, alignment)
|
357
|
-
return 0 if alignment.sdiff.nil?
|
358
|
-
|
359
|
-
# compute the lcs only with non-whitespace letters
|
360
|
-
lcs = alignment.sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
|
361
|
-
|
362
|
-
s1 = _s1.tr(@padding_letter1, ' ')
|
363
|
-
s2 = _s2.tr(@padding_letter2, ' ')
|
364
|
-
|
365
|
-
similarity = 2 * lcs / (s1.scan(/\S/).count + s2.scan(/\S/).count).to_f
|
366
|
-
end
|
367
|
-
|
368
296
|
end
|