text_alignment 0.6 → 0.6.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/text_alignment/text_alignment.rb +24 -11
- data/lib/text_alignment/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fb5dd06236d0b1a8a9c8c5fcb92807a62bdd30e0648bcbd636b95b2a8a45b9b4
|
4
|
+
data.tar.gz: 9266b852993bfee999daa92e3f38ec93e2aec77171fee27c1fea6ac2a17e4d23
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7ee2a590fb31bcc27121a4a227d7fcefe2e8e80646bea3898bb86729ca3ca299e0aebcf23bea30e2391687e6ec0d6573c04a4605f728562482c7edbd0c0285e0
|
7
|
+
data.tar.gz: 73612c185fe533b0daa22d44e7776ed610025cb1bd874f05d95761079f95d1e8a06ead68c88b84bab4d33e8a676edff1e98880912254d9a7ecb5c4ead5eb01fb
|
@@ -91,8 +91,9 @@ class TextAlignment::TextAlignment
|
|
91
91
|
_str1 = str1[b1 ... e1]
|
92
92
|
_str2 = str2[b2 ... e2]
|
93
93
|
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
|
94
|
-
|
95
|
-
|
94
|
+
similarity = alignment_similarity(_str1, _str2, alignment)
|
95
|
+
if similarity < 0.6
|
96
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment: :empty, similarity: similarity}
|
96
97
|
else
|
97
98
|
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment:alignment}
|
98
99
|
end
|
@@ -114,8 +115,9 @@ class TextAlignment::TextAlignment
|
|
114
115
|
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
|
115
116
|
else
|
116
117
|
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
|
117
|
-
|
118
|
-
|
118
|
+
similarity = alignment_similarity(_str1, _str2, alignment)
|
119
|
+
if similarity < 0.6
|
120
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: similarity}
|
119
121
|
else
|
120
122
|
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
|
121
123
|
end
|
@@ -143,8 +145,9 @@ class TextAlignment::TextAlignment
|
|
143
145
|
_str2 = str2[b2 ... e2]
|
144
146
|
|
145
147
|
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
|
146
|
-
|
147
|
-
|
148
|
+
similarity = alignment_similarity(_str1, _str2, alignment)
|
149
|
+
if similarity < 0.6
|
150
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: similarity}
|
148
151
|
else
|
149
152
|
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
|
150
153
|
end
|
@@ -250,6 +253,7 @@ class TextAlignment::TextAlignment
|
|
250
253
|
"===== common =====\n" +
|
251
254
|
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
|
252
255
|
when :empty
|
256
|
+
"xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
|
253
257
|
"<<<<< string 1\n" +
|
254
258
|
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
|
255
259
|
">>>>> string 2\n" +
|
@@ -316,15 +320,15 @@ class TextAlignment::TextAlignment
|
|
316
320
|
pletters = TextAlignment::PADDING_LETTERS
|
317
321
|
|
318
322
|
# find the padding letter for str1
|
319
|
-
padding_letter1 = begin
|
323
|
+
@padding_letter1 = begin
|
320
324
|
i = pletters.index{|l| str2.index(l).nil?}
|
321
325
|
raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
|
322
326
|
TextAlignment::PADDING_LETTERS[i]
|
323
327
|
end
|
324
328
|
|
325
329
|
# find the padding letter for str2
|
326
|
-
padding_letter2 = begin
|
327
|
-
i = pletters.index{|l| l != padding_letter1 && str1.index(l).nil?}
|
330
|
+
@padding_letter2 = begin
|
331
|
+
i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
|
328
332
|
raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
|
329
333
|
TextAlignment::PADDING_LETTERS[i]
|
330
334
|
end
|
@@ -335,12 +339,12 @@ class TextAlignment::TextAlignment
|
|
335
339
|
from = f[1]
|
336
340
|
|
337
341
|
if str2.index(f[0])
|
338
|
-
to = f[0] + (padding_letter1 * (f[1].length - 1))
|
342
|
+
to = f[0] + (@padding_letter1 * (f[1].length - 1))
|
339
343
|
str1.gsub!(from, to)
|
340
344
|
end
|
341
345
|
|
342
346
|
if str1.index(f[0])
|
343
|
-
to = f[0] + (padding_letter2 * (f[1].length - 1))
|
347
|
+
to = f[0] + (@padding_letter2 * (f[1].length - 1))
|
344
348
|
str2.gsub!(from, to)
|
345
349
|
end
|
346
350
|
end
|
@@ -349,4 +353,13 @@ class TextAlignment::TextAlignment
|
|
349
353
|
[str1, str2, mappings]
|
350
354
|
end
|
351
355
|
|
356
|
+
def alignment_similarity(_s1, _s2, alignment)
|
357
|
+
# compute the lcs only with non-whitespace letters
|
358
|
+
lcs = alignment.sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
|
359
|
+
|
360
|
+
s1 = _s1.tr(@padding_letter1, ' ')
|
361
|
+
s2 = _s2.tr(@padding_letter2, ' ')
|
362
|
+
similarity = 2 * lcs / (s1.scan(/\S/).count + s2.scan(/\S/).count).to_f
|
363
|
+
end
|
364
|
+
|
352
365
|
end
|