text_alignment 0.6.2 → 0.6.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c4b2cdf0c257b74c6bec90b93d1907787f3c102108046731c2755684a1b156e9
4
- data.tar.gz: 85334dad09a046432503183e3d3ad83841612299038f2f2dac1f9d5d208e1939
3
+ metadata.gz: 6bed1eba72da626227ab727ce22129d226539bcfae5ca22006ac26258b184d8c
4
+ data.tar.gz: d2c121ea072186fd25fd61fb90c5ffacb886c1d109b82c044a1666220b8f7d8b
5
5
  SHA512:
6
- metadata.gz: 9272bdd6c56717b53d39b3f2009259accb608ea86b99758b6a7ee9cee1e7b275330db55af4e0eba1eba80ee69275a21a3179243394d24139b3018996f659abe1
7
- data.tar.gz: a6a9d97d2bf81ac0c2972fd6e9d5202116156d8ff2e5e81a9bf0306e313dbc601522f887bcbcebff8b9d888cc06826a8ce69ba908dce29fa8decad85d53008af
6
+ metadata.gz: 6e526995325e79fdde8ecd729c04e2e6a21e13f0166acc39b341133055275a1bbd5a3318f78dd5af4a72237c140fa8eb06270441a16e2426e58a57183b91ca6a
7
+ data.tar.gz: ec423d59036b1ee5595141428fe320f0e9ca16b8b2660d46a0f59f376c3845ad70196d006c2f83390ac12f98b35ff14a1098fcd24cda0ee1c6534f36915def81
@@ -17,9 +17,10 @@ class TextAlignment::MixedAlignment
17
17
  attr_reader :similarity
18
18
  attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final
19
19
 
20
- def initialize(str1, str2, mappings = [])
21
- raise ArgumentError, "nil string" if str1.nil? || str2.nil?
22
- mappings ||= []
20
+ def initialize(_str1, _str2)
21
+ raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
22
+
23
+ str1, str2, mappings = string_preprocessing(_str1, _str2)
23
24
 
24
25
  _compute_mixed_alignment(str1, str2, mappings)
25
26
  end
@@ -62,7 +63,7 @@ class TextAlignment::MixedAlignment
62
63
  end
63
64
 
64
65
  cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
65
- @similarity = cmp.similarity
66
+ @similarity = compute_similarity(str1, str2, @sdiff)
66
67
  @str1_match_initial = cmp.str1_match_initial
67
68
  @str1_match_final = cmp.str1_match_final
68
69
  @str2_match_initial = cmp.str2_match_initial
@@ -137,4 +138,73 @@ class TextAlignment::MixedAlignment
137
138
  @position_map_begin = posmap_begin.sort.to_h
138
139
  @position_map_end = posmap_end.sort.to_h
139
140
  end
141
+
142
+ private
143
+
144
+ def string_preprocessing(_str1, _str2)
145
+ str1 = _str1.dup
146
+ str2 = _str2.dup
147
+ mappings = TextAlignment::MAPPINGS.dup
148
+
149
+ ## single character mappings
150
+ character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
151
+ characters_from = character_mappings.collect{|m| m[0]}.join
152
+ characters_to = character_mappings.collect{|m| m[1]}.join
153
+ characters_to.gsub!(/-/, '\-')
154
+
155
+ str1.tr!(characters_from, characters_to)
156
+ str2.tr!(characters_from, characters_to)
157
+
158
+ mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
159
+
160
+ ## long to one character mappings
161
+ pletters = TextAlignment::PADDING_LETTERS
162
+
163
+ # find the padding letter for str1
164
+ @padding_letter1 = begin
165
+ i = pletters.index{|l| str2.index(l).nil?}
166
+ raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
167
+ TextAlignment::PADDING_LETTERS[i]
168
+ end
169
+
170
+ # find the padding letter for str2
171
+ @padding_letter2 = begin
172
+ i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
173
+ raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
174
+ TextAlignment::PADDING_LETTERS[i]
175
+ end
176
+
177
+ # ASCII foldings
178
+ ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
179
+ ascii_foldings.each do |f|
180
+ from = f[1]
181
+
182
+ if str2.index(f[0])
183
+ to = f[0] + (@padding_letter1 * (f[1].length - 1))
184
+ str1.gsub!(from, to)
185
+ end
186
+
187
+ if str1.index(f[0])
188
+ to = f[0] + (@padding_letter2 * (f[1].length - 1))
189
+ str2.gsub!(from, to)
190
+ end
191
+ end
192
+ mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
193
+
194
+ [str1, str2, mappings]
195
+ end
196
+
197
+ def compute_similarity(_s1, _s2, sdiff)
198
+ return 0 if sdiff.nil?
199
+
200
+ # compute the lcs only with non-whitespace letters
201
+ lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
202
+ return 0 if lcs == 0
203
+
204
+ s1 = _s1.tr(@padding_letter1, ' ')
205
+ s2 = _s2.tr(@padding_letter2, ' ')
206
+
207
+ similarity = lcs / [s1.scan(/\S/).count, s2.scan(/\S/).count].min.to_f
208
+ end
209
+
140
210
  end
@@ -12,12 +12,10 @@ class TextAlignment::TextAlignment
12
12
  attr_reader :similarity
13
13
  attr_reader :lost_annotations
14
14
 
15
- def initialize(_str1, _str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
16
- raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
15
+ def initialize(str1, str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
16
+ raise ArgumentError, "nil string" if str1.nil? || str2.nil?
17
17
 
18
- @block_alignment = {source_text:_str1, target_text:_str2}
19
-
20
- str1, str2, mappings = string_preprocessing(_str1, _str2)
18
+ @block_alignment = {source_text:str1, target_text:str2}
21
19
 
22
20
  # try exact match
23
21
  block_begin = str2.index(str1)
@@ -90,12 +88,11 @@ class TextAlignment::TextAlignment
90
88
 
91
89
  _str1 = str1[b1 ... e1]
92
90
  _str2 = str2[b2 ... e2]
93
- alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
94
- similarity = alignment_similarity(_str1, _str2, alignment)
95
- if similarity < 0.6
96
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment: :empty, similarity: similarity}
91
+ alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase)
92
+ if alignment.similarity < 0.5
93
+ @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment: :empty, similarity: alignment.similarity}
97
94
  else
98
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment:alignment}
95
+ @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment:alignment, similarity: alignment.similarity}
99
96
  end
100
97
  end
101
98
  end
@@ -114,12 +111,11 @@ class TextAlignment::TextAlignment
114
111
  if _str2.strip.empty?
115
112
  @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
116
113
  else
117
- alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
118
- similarity = alignment_similarity(_str1, _str2, alignment)
119
- if similarity < 0.6
120
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: similarity}
114
+ alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase)
115
+ if alignment.similarity < 0.5
116
+ @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: alignment.similarity}
121
117
  else
122
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
118
+ @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}
123
119
  end
124
120
  end
125
121
  end
@@ -144,12 +140,11 @@ class TextAlignment::TextAlignment
144
140
  _str1 = str1[b1 ... e1]
145
141
  _str2 = str2[b2 ... e2]
146
142
 
147
- alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
148
- similarity = alignment_similarity(_str1, _str2, alignment)
149
- if similarity < 0.6
150
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: similarity}
143
+ alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase)
144
+ if alignment.similarity < 0.5
145
+ @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: alignment.similarity}
151
146
  else
152
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
147
+ @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}
153
148
  end
154
149
 
155
150
  @block_alignment[:blocks] << {source:{begin:e1, end:-1}, target:{begin:e2, end:-1}, alignment: :empty} if e1 < str1.length
@@ -250,13 +245,13 @@ class TextAlignment::TextAlignment
250
245
  @block_alignment[:blocks].each do |a|
251
246
  show += case a[:alignment]
252
247
  when :block
253
- "===== common =====\n" +
248
+ "===== common ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
254
249
  stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
255
250
  when :empty
256
251
  "xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
257
- "<<<<< string 1\n" +
252
+ "<<<<< string 1 [#{a[:source][:begin]} - #{a[:source][:end]}]\n" +
258
253
  stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
259
- ">>>>> string 2\n" +
254
+ ">>>>> string 2 [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
260
255
  ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
261
256
  else
262
257
  astr1 = ''
@@ -290,7 +285,7 @@ class TextAlignment::TextAlignment
290
285
  end
291
286
  end.join('')
292
287
 
293
- "***** local mismatch\n" +
288
+ "***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
294
289
  "[#{astr1}]\n" +
295
290
  "[#{astr2}]\n\n"
296
291
  end
@@ -298,71 +293,4 @@ class TextAlignment::TextAlignment
298
293
  show
299
294
  end
300
295
 
301
- private
302
-
303
- def string_preprocessing(_str1, _str2)
304
- str1 = _str1.dup
305
- str2 = _str2.dup
306
- mappings = TextAlignment::MAPPINGS.dup
307
-
308
- ## single character mappings
309
- character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
310
- characters_from = character_mappings.collect{|m| m[0]}.join
311
- characters_to = character_mappings.collect{|m| m[1]}.join
312
- characters_to.gsub!(/-/, '\-')
313
-
314
- str1.tr!(characters_from, characters_to)
315
- str2.tr!(characters_from, characters_to)
316
-
317
- mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
318
-
319
- ## long to one character mappings
320
- pletters = TextAlignment::PADDING_LETTERS
321
-
322
- # find the padding letter for str1
323
- @padding_letter1 = begin
324
- i = pletters.index{|l| str2.index(l).nil?}
325
- raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
326
- TextAlignment::PADDING_LETTERS[i]
327
- end
328
-
329
- # find the padding letter for str2
330
- @padding_letter2 = begin
331
- i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
332
- raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
333
- TextAlignment::PADDING_LETTERS[i]
334
- end
335
-
336
- # ASCII foldings
337
- ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
338
- ascii_foldings.each do |f|
339
- from = f[1]
340
-
341
- if str2.index(f[0])
342
- to = f[0] + (@padding_letter1 * (f[1].length - 1))
343
- str1.gsub!(from, to)
344
- end
345
-
346
- if str1.index(f[0])
347
- to = f[0] + (@padding_letter2 * (f[1].length - 1))
348
- str2.gsub!(from, to)
349
- end
350
- end
351
- mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
352
-
353
- [str1, str2, mappings]
354
- end
355
-
356
- def alignment_similarity(_s1, _s2, alignment)
357
- return 0 if alignment.sdiff.nil?
358
-
359
- # compute the lcs only with non-whitespace letters
360
- lcs = alignment.sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
361
-
362
- s1 = _s1.tr(@padding_letter1, ' ')
363
- s2 = _s2.tr(@padding_letter2, ' ')
364
-
365
- similarity = 2 * lcs / (s1.scan(/\S/).count + s2.scan(/\S/).count).to_f
366
- end
367
-
368
296
  end
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.6.2'
2
+ VERSION = '0.6.3'
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.2
4
+ version: 0.6.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim