text_alignment 0.6.2 → 0.6.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/text_alignment/mixed_alignment.rb +74 -4
- data/lib/text_alignment/text_alignment.rb +19 -91
- data/lib/text_alignment/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6bed1eba72da626227ab727ce22129d226539bcfae5ca22006ac26258b184d8c
|
4
|
+
data.tar.gz: d2c121ea072186fd25fd61fb90c5ffacb886c1d109b82c044a1666220b8f7d8b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6e526995325e79fdde8ecd729c04e2e6a21e13f0166acc39b341133055275a1bbd5a3318f78dd5af4a72237c140fa8eb06270441a16e2426e58a57183b91ca6a
|
7
|
+
data.tar.gz: ec423d59036b1ee5595141428fe320f0e9ca16b8b2660d46a0f59f376c3845ad70196d006c2f83390ac12f98b35ff14a1098fcd24cda0ee1c6534f36915def81
|
@@ -17,9 +17,10 @@ class TextAlignment::MixedAlignment
|
|
17
17
|
attr_reader :similarity
|
18
18
|
attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final
|
19
19
|
|
20
|
-
def initialize(
|
21
|
-
raise ArgumentError, "nil string" if
|
22
|
-
|
20
|
+
def initialize(_str1, _str2)
|
21
|
+
raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
|
22
|
+
|
23
|
+
str1, str2, mappings = string_preprocessing(_str1, _str2)
|
23
24
|
|
24
25
|
_compute_mixed_alignment(str1, str2, mappings)
|
25
26
|
end
|
@@ -62,7 +63,7 @@ class TextAlignment::MixedAlignment
|
|
62
63
|
end
|
63
64
|
|
64
65
|
cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
|
65
|
-
@similarity =
|
66
|
+
@similarity = compute_similarity(str1, str2, @sdiff)
|
66
67
|
@str1_match_initial = cmp.str1_match_initial
|
67
68
|
@str1_match_final = cmp.str1_match_final
|
68
69
|
@str2_match_initial = cmp.str2_match_initial
|
@@ -137,4 +138,73 @@ class TextAlignment::MixedAlignment
|
|
137
138
|
@position_map_begin = posmap_begin.sort.to_h
|
138
139
|
@position_map_end = posmap_end.sort.to_h
|
139
140
|
end
|
141
|
+
|
142
|
+
private
|
143
|
+
|
144
|
+
def string_preprocessing(_str1, _str2)
|
145
|
+
str1 = _str1.dup
|
146
|
+
str2 = _str2.dup
|
147
|
+
mappings = TextAlignment::MAPPINGS.dup
|
148
|
+
|
149
|
+
## single character mappings
|
150
|
+
character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
|
151
|
+
characters_from = character_mappings.collect{|m| m[0]}.join
|
152
|
+
characters_to = character_mappings.collect{|m| m[1]}.join
|
153
|
+
characters_to.gsub!(/-/, '\-')
|
154
|
+
|
155
|
+
str1.tr!(characters_from, characters_to)
|
156
|
+
str2.tr!(characters_from, characters_to)
|
157
|
+
|
158
|
+
mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
|
159
|
+
|
160
|
+
## long to one character mappings
|
161
|
+
pletters = TextAlignment::PADDING_LETTERS
|
162
|
+
|
163
|
+
# find the padding letter for str1
|
164
|
+
@padding_letter1 = begin
|
165
|
+
i = pletters.index{|l| str2.index(l).nil?}
|
166
|
+
raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
|
167
|
+
TextAlignment::PADDING_LETTERS[i]
|
168
|
+
end
|
169
|
+
|
170
|
+
# find the padding letter for str2
|
171
|
+
@padding_letter2 = begin
|
172
|
+
i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
|
173
|
+
raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
|
174
|
+
TextAlignment::PADDING_LETTERS[i]
|
175
|
+
end
|
176
|
+
|
177
|
+
# ASCII foldings
|
178
|
+
ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
|
179
|
+
ascii_foldings.each do |f|
|
180
|
+
from = f[1]
|
181
|
+
|
182
|
+
if str2.index(f[0])
|
183
|
+
to = f[0] + (@padding_letter1 * (f[1].length - 1))
|
184
|
+
str1.gsub!(from, to)
|
185
|
+
end
|
186
|
+
|
187
|
+
if str1.index(f[0])
|
188
|
+
to = f[0] + (@padding_letter2 * (f[1].length - 1))
|
189
|
+
str2.gsub!(from, to)
|
190
|
+
end
|
191
|
+
end
|
192
|
+
mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
|
193
|
+
|
194
|
+
[str1, str2, mappings]
|
195
|
+
end
|
196
|
+
|
197
|
+
def compute_similarity(_s1, _s2, sdiff)
|
198
|
+
return 0 if sdiff.nil?
|
199
|
+
|
200
|
+
# compute the lcs only with non-whitespace letters
|
201
|
+
lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
|
202
|
+
return 0 if lcs == 0
|
203
|
+
|
204
|
+
s1 = _s1.tr(@padding_letter1, ' ')
|
205
|
+
s2 = _s2.tr(@padding_letter2, ' ')
|
206
|
+
|
207
|
+
similarity = lcs / [s1.scan(/\S/).count, s2.scan(/\S/).count].min.to_f
|
208
|
+
end
|
209
|
+
|
140
210
|
end
|
@@ -12,12 +12,10 @@ class TextAlignment::TextAlignment
|
|
12
12
|
attr_reader :similarity
|
13
13
|
attr_reader :lost_annotations
|
14
14
|
|
15
|
-
def initialize(
|
16
|
-
raise ArgumentError, "nil string" if
|
15
|
+
def initialize(str1, str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
|
16
|
+
raise ArgumentError, "nil string" if str1.nil? || str2.nil?
|
17
17
|
|
18
|
-
@block_alignment = {source_text:
|
19
|
-
|
20
|
-
str1, str2, mappings = string_preprocessing(_str1, _str2)
|
18
|
+
@block_alignment = {source_text:str1, target_text:str2}
|
21
19
|
|
22
20
|
# try exact match
|
23
21
|
block_begin = str2.index(str1)
|
@@ -90,12 +88,11 @@ class TextAlignment::TextAlignment
|
|
90
88
|
|
91
89
|
_str1 = str1[b1 ... e1]
|
92
90
|
_str2 = str2[b2 ... e2]
|
93
|
-
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase
|
94
|
-
similarity
|
95
|
-
|
96
|
-
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment: :empty, similarity: similarity}
|
91
|
+
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase)
|
92
|
+
if alignment.similarity < 0.5
|
93
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment: :empty, similarity: alignment.similarity}
|
97
94
|
else
|
98
|
-
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment:alignment}
|
95
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment:alignment, similarity: alignment.similarity}
|
99
96
|
end
|
100
97
|
end
|
101
98
|
end
|
@@ -114,12 +111,11 @@ class TextAlignment::TextAlignment
|
|
114
111
|
if _str2.strip.empty?
|
115
112
|
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
|
116
113
|
else
|
117
|
-
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase
|
118
|
-
similarity
|
119
|
-
|
120
|
-
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: similarity}
|
114
|
+
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase)
|
115
|
+
if alignment.similarity < 0.5
|
116
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: alignment.similarity}
|
121
117
|
else
|
122
|
-
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
|
118
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}
|
123
119
|
end
|
124
120
|
end
|
125
121
|
end
|
@@ -144,12 +140,11 @@ class TextAlignment::TextAlignment
|
|
144
140
|
_str1 = str1[b1 ... e1]
|
145
141
|
_str2 = str2[b2 ... e2]
|
146
142
|
|
147
|
-
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase
|
148
|
-
similarity
|
149
|
-
|
150
|
-
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: similarity}
|
143
|
+
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase)
|
144
|
+
if alignment.similarity < 0.5
|
145
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: alignment.similarity}
|
151
146
|
else
|
152
|
-
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
|
147
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}
|
153
148
|
end
|
154
149
|
|
155
150
|
@block_alignment[:blocks] << {source:{begin:e1, end:-1}, target:{begin:e2, end:-1}, alignment: :empty} if e1 < str1.length
|
@@ -250,13 +245,13 @@ class TextAlignment::TextAlignment
|
|
250
245
|
@block_alignment[:blocks].each do |a|
|
251
246
|
show += case a[:alignment]
|
252
247
|
when :block
|
253
|
-
"===== common
|
248
|
+
"===== common ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
254
249
|
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
|
255
250
|
when :empty
|
256
251
|
"xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
|
257
|
-
"<<<<< string 1\n" +
|
252
|
+
"<<<<< string 1 [#{a[:source][:begin]} - #{a[:source][:end]}]\n" +
|
258
253
|
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
|
259
|
-
">>>>> string 2\n" +
|
254
|
+
">>>>> string 2 [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
260
255
|
ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
|
261
256
|
else
|
262
257
|
astr1 = ''
|
@@ -290,7 +285,7 @@ class TextAlignment::TextAlignment
|
|
290
285
|
end
|
291
286
|
end.join('')
|
292
287
|
|
293
|
-
"***** local mismatch\n" +
|
288
|
+
"***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
|
294
289
|
"[#{astr1}]\n" +
|
295
290
|
"[#{astr2}]\n\n"
|
296
291
|
end
|
@@ -298,71 +293,4 @@ class TextAlignment::TextAlignment
|
|
298
293
|
show
|
299
294
|
end
|
300
295
|
|
301
|
-
private
|
302
|
-
|
303
|
-
def string_preprocessing(_str1, _str2)
|
304
|
-
str1 = _str1.dup
|
305
|
-
str2 = _str2.dup
|
306
|
-
mappings = TextAlignment::MAPPINGS.dup
|
307
|
-
|
308
|
-
## single character mappings
|
309
|
-
character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
|
310
|
-
characters_from = character_mappings.collect{|m| m[0]}.join
|
311
|
-
characters_to = character_mappings.collect{|m| m[1]}.join
|
312
|
-
characters_to.gsub!(/-/, '\-')
|
313
|
-
|
314
|
-
str1.tr!(characters_from, characters_to)
|
315
|
-
str2.tr!(characters_from, characters_to)
|
316
|
-
|
317
|
-
mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
|
318
|
-
|
319
|
-
## long to one character mappings
|
320
|
-
pletters = TextAlignment::PADDING_LETTERS
|
321
|
-
|
322
|
-
# find the padding letter for str1
|
323
|
-
@padding_letter1 = begin
|
324
|
-
i = pletters.index{|l| str2.index(l).nil?}
|
325
|
-
raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
|
326
|
-
TextAlignment::PADDING_LETTERS[i]
|
327
|
-
end
|
328
|
-
|
329
|
-
# find the padding letter for str2
|
330
|
-
@padding_letter2 = begin
|
331
|
-
i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
|
332
|
-
raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
|
333
|
-
TextAlignment::PADDING_LETTERS[i]
|
334
|
-
end
|
335
|
-
|
336
|
-
# ASCII foldings
|
337
|
-
ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
|
338
|
-
ascii_foldings.each do |f|
|
339
|
-
from = f[1]
|
340
|
-
|
341
|
-
if str2.index(f[0])
|
342
|
-
to = f[0] + (@padding_letter1 * (f[1].length - 1))
|
343
|
-
str1.gsub!(from, to)
|
344
|
-
end
|
345
|
-
|
346
|
-
if str1.index(f[0])
|
347
|
-
to = f[0] + (@padding_letter2 * (f[1].length - 1))
|
348
|
-
str2.gsub!(from, to)
|
349
|
-
end
|
350
|
-
end
|
351
|
-
mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
|
352
|
-
|
353
|
-
[str1, str2, mappings]
|
354
|
-
end
|
355
|
-
|
356
|
-
def alignment_similarity(_s1, _s2, alignment)
|
357
|
-
return 0 if alignment.sdiff.nil?
|
358
|
-
|
359
|
-
# compute the lcs only with non-whitespace letters
|
360
|
-
lcs = alignment.sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
|
361
|
-
|
362
|
-
s1 = _s1.tr(@padding_letter1, ' ')
|
363
|
-
s2 = _s2.tr(@padding_letter2, ' ')
|
364
|
-
|
365
|
-
similarity = 2 * lcs / (s1.scan(/\S/).count + s2.scan(/\S/).count).to_f
|
366
|
-
end
|
367
|
-
|
368
296
|
end
|