text_alignment 0.4.3 → 0.6.1

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 52bc948955e2df858e397b14eabb4411f73b3ff1e4d879ff4b7015d3b5e03308
4
- data.tar.gz: fd20caec51c95bdc475e0698a52bb7fdebc9e22c43bb47267a883bcc75862268
3
+ metadata.gz: fb5dd06236d0b1a8a9c8c5fcb92807a62bdd30e0648bcbd636b95b2a8a45b9b4
4
+ data.tar.gz: 9266b852993bfee999daa92e3f38ec93e2aec77171fee27c1fea6ac2a17e4d23
5
5
  SHA512:
6
- metadata.gz: dbcb7ab70a64d4a398a5c5761cc5b2f5de6835ccc0e2d0854556f03ef91d0c0294986cc2ff1273788e6b7b0c73dfdf86fd16ee1ef8ce35ecc11d61f8eaab9521
7
- data.tar.gz: 01d21cdcc0ab81d61e08ff1f52360ba35973756fd5060ce866391ff622d4cf87da945dba43a622a0581d63ee96c8723e1cb28991bfa02f4e1e803896bdc64d7f
6
+ metadata.gz: 7ee2a590fb31bcc27121a4a227d7fcefe2e8e80646bea3898bb86729ca3ca299e0aebcf23bea30e2391687e6ec0d6573c04a4605f728562482c7edbd0c0285e0
7
+ data.tar.gz: 73612c185fe533b0daa22d44e7776ed610025cb1bd874f05d95761079f95d1e8a06ead68c88b84bab4d33e8a676edff1e98880912254d9a7ecb5c4ead5eb01fb
@@ -35,6 +35,10 @@ def align_mdoc(source_annotations, target_annotations)
35
35
  source_annotations.each do |annotations|
36
36
  alignment = TextAlignment::TextAlignment.new(annotations[:text], target_annotations[:text])
37
37
 
38
+ puts alignment.alignment_show
39
+ puts "-----"
40
+ puts
41
+
38
42
  # alignment.block_alignments.each do |a|
39
43
  # p {source:a[:source], target:a[:target]}
40
44
  # puts "--"
@@ -103,48 +107,39 @@ target_annotations = if source_annotations.class == Array
103
107
  else
104
108
  alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text)
105
109
 
106
- pp alignment
110
+ # pp alignment
107
111
 
108
112
  # verification
109
- source_text = source_annotations[:text]
110
- puts "=====BEGIN"
111
- (0 ... source_text.rstrip.length).each do |p|
112
- t = alignment.transform_begin_position(p)
113
- if t.nil?
114
- print source_text[p]
115
- else
116
- print '.'
117
- end
118
- end
119
- puts
120
- puts "=====END"
121
-
122
- puts "=====BEGIN"
123
- (0 .. source_text.rstrip.length).each do |p|
124
- t = alignment.transform_end_position(p)
125
- if t.nil?
126
- print source_text[p]
127
- else
128
- print '.'
129
- end
130
- end
131
- puts
132
- puts "=====END"
133
-
134
- # alignment.block_alignments.each do |a|
135
- # if a[:alignment].nil? || a[:alignment] == :empty
136
- # # p [a[:source], a[:target]]
137
- # # p a[:alignment]
113
+ # source_text = source_annotations[:text]
114
+ # puts "=====BEGIN"
115
+ # (0 ... source_text.rstrip.length).each do |p|
116
+ # t = alignment.transform_begin_position(p)
117
+ # if t.nil?
118
+ # print source_text[p]
119
+ # else
120
+ # print '.'
121
+ # end
122
+ # end
123
+ # puts
124
+ # puts "=====END"
125
+
126
+ # puts "=====BEGIN"
127
+ # (0 .. source_text.rstrip.length).each do |p|
128
+ # t = alignment.transform_end_position(p)
129
+ # if t.nil?
130
+ # print source_text[p]
138
131
  # else
139
- # p [a[:source], a[:target]]
140
- # p a[:alignment].similarity
141
- # puts "--"
142
- # puts source_annotations[:text][a[:source][:begin] ... a[:source][:end]]
143
- # puts "--"
144
- # puts target_text[a[:target][:begin] ... a[:target][:end]]
145
- # puts "======"
132
+ # print '.'
146
133
  # end
147
134
  # end
135
+ # puts
136
+ # puts "=====END"
137
+
138
+ source_text = source_annotations[:text]
139
+
140
+ puts "[block alignment]"
141
+ puts alignment.alignment_show
142
+ puts "====="
148
143
  # exit
149
144
 
150
145
  # verification of source denotations
@@ -33,7 +33,9 @@ class TextAlignment::LCSComparison
33
33
  @str2_match_initial = sdiff[match_initial].new_position
34
34
  @str1_match_final = sdiff[match_final].old_position
35
35
  @str2_match_final = sdiff[match_final].new_position
36
- @similarity = 2 * lcs / ((@str1_match_final - @str1_match_initial + 1) + (@str2_match_final - @str2_match_initial + 1)).to_f
36
+ mlcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
37
+ @similarity = 2 * mlcs / (str1[@str1_match_initial .. @str1_match_final].scan(/\S/).count + str2[@str2_match_initial .. @str2_match_final].scan(/\S/).count).to_f
38
+ # @similarity = 2 * lcs / (str1[@str1_match_initial .. @str1_match_final].length + str2[@str2_match_initial .. @str2_match_final].length).to_f
37
39
  else
38
40
  @str1_match_initial = 0
39
41
  @str2_match_initial = 0
@@ -8,20 +8,29 @@ module TextAlignment; end unless defined? TextAlignment
8
8
  TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
9
9
 
10
10
  class TextAlignment::TextAlignment
11
- attr_reader :block_alignments
11
+ attr_reader :block_alignment
12
12
  attr_reader :similarity
13
13
  attr_reader :lost_annotations
14
14
 
15
15
  def initialize(_str1, _str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
16
16
  raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
17
17
 
18
+ @block_alignment = {source_text:_str1, target_text:_str2}
19
+
18
20
  str1, str2, mappings = string_preprocessing(_str1, _str2)
19
21
 
20
22
  # try exact match
21
23
  block_begin = str2.index(str1)
22
24
  unless block_begin.nil?
23
- @block_alignments = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin}]
24
- return @block_alignments
25
+ @block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
26
+ return @block_alignment
27
+ end
28
+
29
+ # try exact match
30
+ block_begin = str2.downcase.index(str1.downcase)
31
+ unless block_begin.nil?
32
+ @block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
33
+ return @block_alignment
25
34
  end
26
35
 
27
36
  anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
@@ -54,7 +63,7 @@ class TextAlignment::TextAlignment
54
63
  # puts
55
64
 
56
65
  ## To find block alignments
57
- @block_alignments = []
66
+ @block_alignment[:blocks] = []
58
67
  return if mblocks.empty?
59
68
 
60
69
  # Initial step
@@ -63,35 +72,36 @@ class TextAlignment::TextAlignment
63
72
  e2 = mblocks[0][:target][:begin]
64
73
 
65
74
  if mblocks[0][:target][:begin] == 0
66
- @block_alignments << {source:{begin:0, end:e1}, target:{begin:0, end:0}, alignment: :empty}
75
+ @block_alignment[:blocks] << {source:{begin:0, end:e1}, target:{begin:0, end:0}, alignment: :empty}
67
76
  else
68
77
  _str1 = str1[0 ... e1]
69
78
  _str2 = str2[0 ... e2]
70
79
 
71
80
  unless _str1.strip.empty?
72
81
  if _str2.strip.empty?
73
- @block_alignments << {source:{begin:0, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
82
+ @block_alignment[:blocks] << {source:{begin:0, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
74
83
  else
75
84
  len_min = [_str1.length, _str2.length].min
76
85
  len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
77
86
  b1 = _str1.length < len_buffer ? 0 : e1 - len_buffer
78
87
  b2 = _str2.length < len_buffer ? 0 : e2 - len_buffer
79
88
 
80
- @block_alignments << {source:{begin:0, end:b1}, target:{begin:0, end:b2}, alignment: :empty} if b1 > 0
89
+ @block_alignment[:blocks] << {source:{begin:0, end:b1}, target:{begin:0, end:b2}, alignment: :empty} if b1 > 0
81
90
 
82
91
  _str1 = str1[b1 ... e1]
83
92
  _str2 = str2[b2 ... e2]
84
93
  alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
85
- if alignment.similarity < 0.6
86
- @block_alignments << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
94
+ similarity = alignment_similarity(_str1, _str2, alignment)
95
+ if similarity < 0.6
96
+ @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment: :empty, similarity: similarity}
87
97
  else
88
- @block_alignments << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment:alignment}
98
+ @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment:alignment}
89
99
  end
90
100
  end
91
101
  end
92
102
  end
93
103
  end
94
- @block_alignments << mblocks[0]
104
+ @block_alignment[:blocks] << mblocks[0].merge(alignment: :block)
95
105
 
96
106
  (1 ... mblocks.length).each do |i|
97
107
  b1 = mblocks[i - 1][:source][:end]
@@ -102,17 +112,18 @@ class TextAlignment::TextAlignment
102
112
  _str2 = str2[b2 ... e2]
103
113
  unless _str1.strip.empty?
104
114
  if _str2.strip.empty?
105
- @block_alignments << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
115
+ @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
106
116
  else
107
117
  alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
108
- if alignment.similarity < 0.6
109
- @block_alignments << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
118
+ similarity = alignment_similarity(_str1, _str2, alignment)
119
+ if similarity < 0.6
120
+ @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: similarity}
110
121
  else
111
- @block_alignments << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
122
+ @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
112
123
  end
113
124
  end
114
125
  end
115
- @block_alignments << mblocks[i]
126
+ @block_alignment[:blocks] << mblocks[i].merge(alignment: :block)
116
127
  end
117
128
 
118
129
  # Final step
@@ -124,7 +135,7 @@ class TextAlignment::TextAlignment
124
135
 
125
136
  unless _str1.strip.empty?
126
137
  if _str2.strip.empty?
127
- @block_alignments << {source:{begin:b1, end:str1.length}, target:{begin:b2, end:str2.length}, alignment: :empty}
138
+ @block_alignment[:blocks] << {source:{begin:b1, end:str1.length}, target:{begin:b2, end:str2.length}, alignment: :empty}
128
139
  else
129
140
  len_min = [_str1.length, _str2.length].min
130
141
  len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
@@ -134,57 +145,58 @@ class TextAlignment::TextAlignment
134
145
  _str2 = str2[b2 ... e2]
135
146
 
136
147
  alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
137
- if alignment.similarity < 0.6
138
- @block_alignments << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
148
+ similarity = alignment_similarity(_str1, _str2, alignment)
149
+ if similarity < 0.6
150
+ @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: similarity}
139
151
  else
140
- @block_alignments << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
152
+ @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
141
153
  end
142
154
 
143
- @block_alignments << {source:{begin:e1, end:-1}, target:{begin:e2, end:-1}, alignment: :empty} if e1 < str1.length
155
+ @block_alignment[:blocks] << {source:{begin:e1, end:-1}, target:{begin:e2, end:-1}, alignment: :empty} if e1 < str1.length
144
156
  end
145
157
  end
146
158
  end
147
159
 
148
- @block_alignments.each do |a|
160
+ @block_alignment[:blocks].each do |a|
149
161
  a[:delta] = a[:target][:begin] - a[:source][:begin]
150
162
  end
151
163
  end
152
164
 
153
165
  def transform_begin_position(begin_position)
154
- i = @block_alignments.index{|b| b[:source][:end] > begin_position}
155
- block_alignment = @block_alignments[i]
156
-
157
- b = if block_alignment[:alignment].nil?
158
- begin_position + block_alignment[:delta]
159
- elsif block_alignment[:alignment] == :empty
160
- if begin_position == block_alignment[:source][:begin]
161
- block_alignment[:target][:begin]
166
+ i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
167
+ block = @block_alignment[:blocks][i]
168
+
169
+ b = if block[:alignment] == :block
170
+ begin_position + block[:delta]
171
+ elsif block[:alignment] == :empty
172
+ if begin_position == block[:source][:begin]
173
+ block[:target][:begin]
162
174
  else
163
175
  # raise "lost annotation"
164
176
  nil
165
177
  end
166
178
  else
167
- r = block_alignment[:alignment].transform_begin_position(begin_position - block_alignment[:source][:begin])
168
- r.nil? ? nil : r + block_alignment[:target][:begin]
179
+ r = block[:alignment].transform_begin_position(begin_position - block[:source][:begin])
180
+ r.nil? ? nil : r + block[:target][:begin]
169
181
  end
170
182
  end
171
183
 
172
184
  def transform_end_position(end_position)
173
- i = @block_alignments.index{|b| b[:source][:end] >= end_position}
174
- block_alignment = @block_alignments[i]
175
-
176
- e = if block_alignment[:alignment].nil?
177
- end_position + block_alignment[:delta]
178
- elsif block_alignment[:alignment] == :empty
179
- if end_position == block_alignment[:source][:end]
180
- block_alignment[:target][:end]
185
+ i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
186
+ block = @block_alignment[:blocks][i]
187
+
188
+ e = if block[:alignment] == :block
189
+ end_position + block[:delta]
190
+ elsif block[:alignment] == :empty
191
+ if end_position == block[:source][:end]
192
+ block[:target][:end]
181
193
  else
182
194
  # raise "lost annotation"
183
195
  nil
184
196
  end
185
197
  else
186
- r = block_alignment[:alignment].transform_end_position(end_position - block_alignment[:source][:begin])
187
- r.nil? ? nil : r + block_alignment[:target][:begin]
198
+ r = block[:alignment].transform_end_position(end_position - block[:source][:begin])
199
+ r.nil? ? nil : r + block[:target][:begin]
188
200
  end
189
201
  end
190
202
 
@@ -230,8 +242,63 @@ class TextAlignment::TextAlignment
230
242
  r
231
243
  end
232
244
 
233
- private
245
+ def alignment_show
246
+ stext = @block_alignment[:source_text]
247
+ ttext = @block_alignment[:target_text]
248
+
249
+ show = ''
250
+ @block_alignment[:blocks].each do |a|
251
+ show += case a[:alignment]
252
+ when :block
253
+ "===== common =====\n" +
254
+ stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
255
+ when :empty
256
+ "xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
257
+ "<<<<< string 1\n" +
258
+ stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
259
+ ">>>>> string 2\n" +
260
+ ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
261
+ else
262
+ astr1 = ''
263
+ astr2 = ''
264
+
265
+ base = a[:source][:begin]
266
+ astr1 = a[:alignment].sdiff.map do |c|
267
+ case c.action
268
+ when '='
269
+ stext[c.old_position + base]
270
+ when '+'
271
+ '_'
272
+ when '-'
273
+ stext[c.old_position + base]
274
+ when '!'
275
+ stext[c.old_position + base] + '_'
276
+ end
277
+ end.join('')
278
+
279
+ base = a[:target][:begin]
280
+ astr2 = a[:alignment].sdiff.map do |c|
281
+ case c.action
282
+ when '='
283
+ ttext[c.new_position + base]
284
+ when '+'
285
+ ttext[c.new_position + base]
286
+ when '-'
287
+ '_'
288
+ when '!'
289
+ '_' + ttext[c.new_position + base]
290
+ end
291
+ end.join('')
234
292
 
293
+ "***** local mismatch\n" +
294
+ "[#{astr1}]\n" +
295
+ "[#{astr2}]\n\n"
296
+ end
297
+ end
298
+ show
299
+ end
300
+
301
+ private
235
302
 
236
303
  def string_preprocessing(_str1, _str2)
237
304
  str1 = _str1.dup
@@ -253,15 +320,15 @@ class TextAlignment::TextAlignment
253
320
  pletters = TextAlignment::PADDING_LETTERS
254
321
 
255
322
  # find the padding letter for str1
256
- padding_letter1 = begin
323
+ @padding_letter1 = begin
257
324
  i = pletters.index{|l| str2.index(l).nil?}
258
325
  raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
259
326
  TextAlignment::PADDING_LETTERS[i]
260
327
  end
261
328
 
262
329
  # find the padding letter for str2
263
- padding_letter2 = begin
264
- i = pletters.index{|l| l != padding_letter1 && str1.index(l).nil?}
330
+ @padding_letter2 = begin
331
+ i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
265
332
  raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
266
333
  TextAlignment::PADDING_LETTERS[i]
267
334
  end
@@ -272,12 +339,12 @@ class TextAlignment::TextAlignment
272
339
  from = f[1]
273
340
 
274
341
  if str2.index(f[0])
275
- to = f[0] + (padding_letter1 * (f[1].length - 1))
342
+ to = f[0] + (@padding_letter1 * (f[1].length - 1))
276
343
  str1.gsub!(from, to)
277
344
  end
278
345
 
279
346
  if str1.index(f[0])
280
- to = f[0] + (padding_letter2 * (f[1].length - 1))
347
+ to = f[0] + (@padding_letter2 * (f[1].length - 1))
281
348
  str2.gsub!(from, to)
282
349
  end
283
350
  end
@@ -286,4 +353,13 @@ class TextAlignment::TextAlignment
286
353
  [str1, str2, mappings]
287
354
  end
288
355
 
356
+ def alignment_similarity(_s1, _s2, alignment)
357
+ # compute the lcs only with non-whitespace letters
358
+ lcs = alignment.sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
359
+
360
+ s1 = _s1.tr(@padding_letter1, ' ')
361
+ s2 = _s2.tr(@padding_letter2, ' ')
362
+ similarity = 2 * lcs / (s1.scan(/\S/).count + s2.scan(/\S/).count).to_f
363
+ end
364
+
289
365
  end
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.4.3'
2
+ VERSION = '0.6.1'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.3
4
+ version: 0.6.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-10-02 00:00:00.000000000 Z
11
+ date: 2020-10-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary