text_alignment 0.6 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/text_alignment/text_alignment.rb +24 -11
- data/lib/text_alignment/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fb5dd06236d0b1a8a9c8c5fcb92807a62bdd30e0648bcbd636b95b2a8a45b9b4
|
4
|
+
data.tar.gz: 9266b852993bfee999daa92e3f38ec93e2aec77171fee27c1fea6ac2a17e4d23
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7ee2a590fb31bcc27121a4a227d7fcefe2e8e80646bea3898bb86729ca3ca299e0aebcf23bea30e2391687e6ec0d6573c04a4605f728562482c7edbd0c0285e0
|
7
|
+
data.tar.gz: 73612c185fe533b0daa22d44e7776ed610025cb1bd874f05d95761079f95d1e8a06ead68c88b84bab4d33e8a676edff1e98880912254d9a7ecb5c4ead5eb01fb
|
@@ -91,8 +91,9 @@ class TextAlignment::TextAlignment
|
|
91
91
|
_str1 = str1[b1 ... e1]
|
92
92
|
_str2 = str2[b2 ... e2]
|
93
93
|
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
|
94
|
-
|
95
|
-
|
94
|
+
similarity = alignment_similarity(_str1, _str2, alignment)
|
95
|
+
if similarity < 0.6
|
96
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment: :empty, similarity: similarity}
|
96
97
|
else
|
97
98
|
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment:alignment}
|
98
99
|
end
|
@@ -114,8 +115,9 @@ class TextAlignment::TextAlignment
|
|
114
115
|
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
|
115
116
|
else
|
116
117
|
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
|
117
|
-
|
118
|
-
|
118
|
+
similarity = alignment_similarity(_str1, _str2, alignment)
|
119
|
+
if similarity < 0.6
|
120
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: similarity}
|
119
121
|
else
|
120
122
|
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
|
121
123
|
end
|
@@ -143,8 +145,9 @@ class TextAlignment::TextAlignment
|
|
143
145
|
_str2 = str2[b2 ... e2]
|
144
146
|
|
145
147
|
alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
|
146
|
-
|
147
|
-
|
148
|
+
similarity = alignment_similarity(_str1, _str2, alignment)
|
149
|
+
if similarity < 0.6
|
150
|
+
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: similarity}
|
148
151
|
else
|
149
152
|
@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
|
150
153
|
end
|
@@ -250,6 +253,7 @@ class TextAlignment::TextAlignment
|
|
250
253
|
"===== common =====\n" +
|
251
254
|
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
|
252
255
|
when :empty
|
256
|
+
"xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
|
253
257
|
"<<<<< string 1\n" +
|
254
258
|
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
|
255
259
|
">>>>> string 2\n" +
|
@@ -316,15 +320,15 @@ class TextAlignment::TextAlignment
|
|
316
320
|
pletters = TextAlignment::PADDING_LETTERS
|
317
321
|
|
318
322
|
# find the padding letter for str1
|
319
|
-
padding_letter1 = begin
|
323
|
+
@padding_letter1 = begin
|
320
324
|
i = pletters.index{|l| str2.index(l).nil?}
|
321
325
|
raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
|
322
326
|
TextAlignment::PADDING_LETTERS[i]
|
323
327
|
end
|
324
328
|
|
325
329
|
# find the padding letter for str2
|
326
|
-
padding_letter2 = begin
|
327
|
-
i = pletters.index{|l| l != padding_letter1 && str1.index(l).nil?}
|
330
|
+
@padding_letter2 = begin
|
331
|
+
i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
|
328
332
|
raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
|
329
333
|
TextAlignment::PADDING_LETTERS[i]
|
330
334
|
end
|
@@ -335,12 +339,12 @@ class TextAlignment::TextAlignment
|
|
335
339
|
from = f[1]
|
336
340
|
|
337
341
|
if str2.index(f[0])
|
338
|
-
to = f[0] + (padding_letter1 * (f[1].length - 1))
|
342
|
+
to = f[0] + (@padding_letter1 * (f[1].length - 1))
|
339
343
|
str1.gsub!(from, to)
|
340
344
|
end
|
341
345
|
|
342
346
|
if str1.index(f[0])
|
343
|
-
to = f[0] + (padding_letter2 * (f[1].length - 1))
|
347
|
+
to = f[0] + (@padding_letter2 * (f[1].length - 1))
|
344
348
|
str2.gsub!(from, to)
|
345
349
|
end
|
346
350
|
end
|
@@ -349,4 +353,13 @@ class TextAlignment::TextAlignment
|
|
349
353
|
[str1, str2, mappings]
|
350
354
|
end
|
351
355
|
|
356
|
+
# Computes a Dice-style similarity score between two aligned text blocks:
#
#   2 * (matched non-whitespace characters) / (letters in _s1 + letters in _s2)
#
# Whitespace and the padding letters (@padding_letter1 for _s1,
# @padding_letter2 for _s2) are excluded from the letter counts.
#
# @param _s1 [String] the first text block
# @param _s2 [String] the second text block
# @param alignment [#sdiff] an alignment whose +sdiff+ yields elements
#   responding to +action+, +old_element+ and +new_element+
# @return [Float] the similarity score; 1.0 when neither block contains
#   any letter to compare
def alignment_similarity(_s1, _s2, alignment)
  # compute the lcs only with non-whitespace letters
  lcs = alignment.sdiff.count do |d|
    d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/
  end

  # Count the letters directly instead of routing the padding letter through
  # String#tr, which has its own mini-language ('^', '-', '\') and raises
  # when the padding letter has not been set.
  count_letters = lambda do |str, padding|
    str.each_char.count { |c| c != padding && c =~ /\S/ }
  end

  total = count_letters.call(_s1, @padding_letter1) + count_letters.call(_s2, @padding_letter2)

  # Without this guard the division is 0 / 0.0, i.e. NaN, and NaN compares
  # false against any threshold. Two blocks with nothing to compare are
  # treated as fully similar.
  return 1.0 if total.zero?

  2 * lcs / total.to_f
end
|
364
|
+
|
352
365
|
end
|