text_alignment 0.6.2 → 0.6.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c4b2cdf0c257b74c6bec90b93d1907787f3c102108046731c2755684a1b156e9
4
- data.tar.gz: 85334dad09a046432503183e3d3ad83841612299038f2f2dac1f9d5d208e1939
3
+ metadata.gz: 6bed1eba72da626227ab727ce22129d226539bcfae5ca22006ac26258b184d8c
4
+ data.tar.gz: d2c121ea072186fd25fd61fb90c5ffacb886c1d109b82c044a1666220b8f7d8b
5
5
  SHA512:
6
- metadata.gz: 9272bdd6c56717b53d39b3f2009259accb608ea86b99758b6a7ee9cee1e7b275330db55af4e0eba1eba80ee69275a21a3179243394d24139b3018996f659abe1
7
- data.tar.gz: a6a9d97d2bf81ac0c2972fd6e9d5202116156d8ff2e5e81a9bf0306e313dbc601522f887bcbcebff8b9d888cc06826a8ce69ba908dce29fa8decad85d53008af
6
+ metadata.gz: 6e526995325e79fdde8ecd729c04e2e6a21e13f0166acc39b341133055275a1bbd5a3318f78dd5af4a72237c140fa8eb06270441a16e2426e58a57183b91ca6a
7
+ data.tar.gz: ec423d59036b1ee5595141428fe320f0e9ca16b8b2660d46a0f59f376c3845ad70196d006c2f83390ac12f98b35ff14a1098fcd24cda0ee1c6534f36915def81
@@ -17,9 +17,10 @@ class TextAlignment::MixedAlignment
17
17
  attr_reader :similarity
18
18
  attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final
19
19
 
20
- def initialize(str1, str2, mappings = [])
21
- raise ArgumentError, "nil string" if str1.nil? || str2.nil?
22
- mappings ||= []
20
+ def initialize(_str1, _str2)
21
+ raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
22
+
23
+ str1, str2, mappings = string_preprocessing(_str1, _str2)
23
24
 
24
25
  _compute_mixed_alignment(str1, str2, mappings)
25
26
  end
@@ -62,7 +63,7 @@ class TextAlignment::MixedAlignment
62
63
  end
63
64
 
64
65
  cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
65
- @similarity = cmp.similarity
66
+ @similarity = compute_similarity(str1, str2, @sdiff)
66
67
  @str1_match_initial = cmp.str1_match_initial
67
68
  @str1_match_final = cmp.str1_match_final
68
69
  @str2_match_initial = cmp.str2_match_initial
@@ -137,4 +138,73 @@ class TextAlignment::MixedAlignment
137
138
  @position_map_begin = posmap_begin.sort.to_h
138
139
  @position_map_end = posmap_end.sort.to_h
139
140
  end
141
+
142
+ private
143
+
144
+ def string_preprocessing(_str1, _str2)
145
+ str1 = _str1.dup
146
+ str2 = _str2.dup
147
+ mappings = TextAlignment::MAPPINGS.dup
148
+
149
+ ## single character mappings
150
+ character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
151
+ characters_from = character_mappings.collect{|m| m[0]}.join
152
+ characters_to = character_mappings.collect{|m| m[1]}.join
153
+ characters_to.gsub!(/-/, '\-')
154
+
155
+ str1.tr!(characters_from, characters_to)
156
+ str2.tr!(characters_from, characters_to)
157
+
158
+ mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
159
+
160
+ ## long to one character mappings
161
+ pletters = TextAlignment::PADDING_LETTERS
162
+
163
+ # find the padding letter for str1
164
+ @padding_letter1 = begin
165
+ i = pletters.index{|l| str2.index(l).nil?}
166
+ raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
167
+ TextAlignment::PADDING_LETTERS[i]
168
+ end
169
+
170
+ # find the padding letter for str2
171
+ @padding_letter2 = begin
172
+ i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
173
+ raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
174
+ TextAlignment::PADDING_LETTERS[i]
175
+ end
176
+
177
+ # ASCII foldings
178
+ ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
179
+ ascii_foldings.each do |f|
180
+ from = f[1]
181
+
182
+ if str2.index(f[0])
183
+ to = f[0] + (@padding_letter1 * (f[1].length - 1))
184
+ str1.gsub!(from, to)
185
+ end
186
+
187
+ if str1.index(f[0])
188
+ to = f[0] + (@padding_letter2 * (f[1].length - 1))
189
+ str2.gsub!(from, to)
190
+ end
191
+ end
192
+ mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
193
+
194
+ [str1, str2, mappings]
195
+ end
196
+
197
+ def compute_similarity(_s1, _s2, sdiff)
198
+ return 0 if sdiff.nil?
199
+
200
+ # compute the lcs only with non-whitespace letters
201
+ lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
202
+ return 0 if lcs == 0
203
+
204
+ s1 = _s1.tr(@padding_letter1, ' ')
205
+ s2 = _s2.tr(@padding_letter2, ' ')
206
+
207
+ similarity = lcs / [s1.scan(/\S/).count, s2.scan(/\S/).count].min.to_f
208
+ end
209
+
140
210
  end
@@ -12,12 +12,10 @@ class TextAlignment::TextAlignment
12
12
  attr_reader :similarity
13
13
  attr_reader :lost_annotations
14
14
 
15
- def initialize(_str1, _str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
16
- raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
15
+ def initialize(str1, str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
16
+ raise ArgumentError, "nil string" if str1.nil? || str2.nil?
17
17
 
18
- @block_alignment = {source_text:_str1, target_text:_str2}
19
-
20
- str1, str2, mappings = string_preprocessing(_str1, _str2)
18
+ @block_alignment = {source_text:str1, target_text:str2}
21
19
 
22
20
  # try exact match
23
21
  block_begin = str2.index(str1)
@@ -90,12 +88,11 @@ class TextAlignment::TextAlignment
90
88
 
91
89
  _str1 = str1[b1 ... e1]
92
90
  _str2 = str2[b2 ... e2]
93
- alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
94
- similarity = alignment_similarity(_str1, _str2, alignment)
95
- if similarity < 0.6
96
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment: :empty, similarity: similarity}
91
+ alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase)
92
+ if alignment.similarity < 0.5
93
+ @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment: :empty, similarity: alignment.similarity}
97
94
  else
98
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment:alignment}
95
+ @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment:alignment, similarity: alignment.similarity}
99
96
  end
100
97
  end
101
98
  end
@@ -114,12 +111,11 @@ class TextAlignment::TextAlignment
114
111
  if _str2.strip.empty?
115
112
  @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
116
113
  else
117
- alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
118
- similarity = alignment_similarity(_str1, _str2, alignment)
119
- if similarity < 0.6
120
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: similarity}
114
+ alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase)
115
+ if alignment.similarity < 0.5
116
+ @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: alignment.similarity}
121
117
  else
122
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
118
+ @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}
123
119
  end
124
120
  end
125
121
  end
@@ -144,12 +140,11 @@ class TextAlignment::TextAlignment
144
140
  _str1 = str1[b1 ... e1]
145
141
  _str2 = str2[b2 ... e2]
146
142
 
147
- alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
148
- similarity = alignment_similarity(_str1, _str2, alignment)
149
- if similarity < 0.6
150
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: similarity}
143
+ alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase)
144
+ if alignment.similarity < 0.5
145
+ @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: alignment.similarity}
151
146
  else
152
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
147
+ @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}
153
148
  end
154
149
 
155
150
  @block_alignment[:blocks] << {source:{begin:e1, end:-1}, target:{begin:e2, end:-1}, alignment: :empty} if e1 < str1.length
@@ -250,13 +245,13 @@ class TextAlignment::TextAlignment
250
245
  @block_alignment[:blocks].each do |a|
251
246
  show += case a[:alignment]
252
247
  when :block
253
- "===== common =====\n" +
248
+ "===== common ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
254
249
  stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
255
250
  when :empty
256
251
  "xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
257
- "<<<<< string 1\n" +
252
+ "<<<<< string 1 [#{a[:source][:begin]} - #{a[:source][:end]}]\n" +
258
253
  stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
259
- ">>>>> string 2\n" +
254
+ ">>>>> string 2 [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
260
255
  ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
261
256
  else
262
257
  astr1 = ''
@@ -290,7 +285,7 @@ class TextAlignment::TextAlignment
290
285
  end
291
286
  end.join('')
292
287
 
293
- "***** local mismatch\n" +
288
+ "***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
294
289
  "[#{astr1}]\n" +
295
290
  "[#{astr2}]\n\n"
296
291
  end
@@ -298,71 +293,4 @@ class TextAlignment::TextAlignment
298
293
  show
299
294
  end
300
295
 
301
- private
302
-
303
- def string_preprocessing(_str1, _str2)
304
- str1 = _str1.dup
305
- str2 = _str2.dup
306
- mappings = TextAlignment::MAPPINGS.dup
307
-
308
- ## single character mappings
309
- character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
310
- characters_from = character_mappings.collect{|m| m[0]}.join
311
- characters_to = character_mappings.collect{|m| m[1]}.join
312
- characters_to.gsub!(/-/, '\-')
313
-
314
- str1.tr!(characters_from, characters_to)
315
- str2.tr!(characters_from, characters_to)
316
-
317
- mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
318
-
319
- ## long to one character mappings
320
- pletters = TextAlignment::PADDING_LETTERS
321
-
322
- # find the padding letter for str1
323
- @padding_letter1 = begin
324
- i = pletters.index{|l| str2.index(l).nil?}
325
- raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
326
- TextAlignment::PADDING_LETTERS[i]
327
- end
328
-
329
- # find the padding letter for str2
330
- @padding_letter2 = begin
331
- i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
332
- raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
333
- TextAlignment::PADDING_LETTERS[i]
334
- end
335
-
336
- # ASCII foldings
337
- ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
338
- ascii_foldings.each do |f|
339
- from = f[1]
340
-
341
- if str2.index(f[0])
342
- to = f[0] + (@padding_letter1 * (f[1].length - 1))
343
- str1.gsub!(from, to)
344
- end
345
-
346
- if str1.index(f[0])
347
- to = f[0] + (@padding_letter2 * (f[1].length - 1))
348
- str2.gsub!(from, to)
349
- end
350
- end
351
- mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
352
-
353
- [str1, str2, mappings]
354
- end
355
-
356
- def alignment_similarity(_s1, _s2, alignment)
357
- return 0 if alignment.sdiff.nil?
358
-
359
- # compute the lcs only with non-whitespace letters
360
- lcs = alignment.sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
361
-
362
- s1 = _s1.tr(@padding_letter1, ' ')
363
- s2 = _s2.tr(@padding_letter2, ' ')
364
-
365
- similarity = 2 * lcs / (s1.scan(/\S/).count + s2.scan(/\S/).count).to_f
366
- end
367
-
368
296
  end
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.6.2'
2
+ VERSION = '0.6.3'
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.2
4
+ version: 0.6.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim