text_alignment 0.4.3 → 0.6.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 52bc948955e2df858e397b14eabb4411f73b3ff1e4d879ff4b7015d3b5e03308
4
- data.tar.gz: fd20caec51c95bdc475e0698a52bb7fdebc9e22c43bb47267a883bcc75862268
3
+ metadata.gz: fb5dd06236d0b1a8a9c8c5fcb92807a62bdd30e0648bcbd636b95b2a8a45b9b4
4
+ data.tar.gz: 9266b852993bfee999daa92e3f38ec93e2aec77171fee27c1fea6ac2a17e4d23
5
5
  SHA512:
6
- metadata.gz: dbcb7ab70a64d4a398a5c5761cc5b2f5de6835ccc0e2d0854556f03ef91d0c0294986cc2ff1273788e6b7b0c73dfdf86fd16ee1ef8ce35ecc11d61f8eaab9521
7
- data.tar.gz: 01d21cdcc0ab81d61e08ff1f52360ba35973756fd5060ce866391ff622d4cf87da945dba43a622a0581d63ee96c8723e1cb28991bfa02f4e1e803896bdc64d7f
6
+ metadata.gz: 7ee2a590fb31bcc27121a4a227d7fcefe2e8e80646bea3898bb86729ca3ca299e0aebcf23bea30e2391687e6ec0d6573c04a4605f728562482c7edbd0c0285e0
7
+ data.tar.gz: 73612c185fe533b0daa22d44e7776ed610025cb1bd874f05d95761079f95d1e8a06ead68c88b84bab4d33e8a676edff1e98880912254d9a7ecb5c4ead5eb01fb
@@ -35,6 +35,10 @@ def align_mdoc(source_annotations, target_annotations)
35
35
  source_annotations.each do |annotations|
36
36
  alignment = TextAlignment::TextAlignment.new(annotations[:text], target_annotations[:text])
37
37
 
38
+ puts alignment.alignment_show
39
+ puts "-----"
40
+ puts
41
+
38
42
  # alignment.block_alignments.each do |a|
39
43
  # p {source:a[:source], target:a[:target]}
40
44
  # puts "--"
@@ -103,48 +107,39 @@ target_annotations = if source_annotations.class == Array
103
107
  else
104
108
  alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text)
105
109
 
106
- pp alignment
110
+ # pp alignment
107
111
 
108
112
  # verification
109
- source_text = source_annotations[:text]
110
- puts "=====BEGIN"
111
- (0 ... source_text.rstrip.length).each do |p|
112
- t = alignment.transform_begin_position(p)
113
- if t.nil?
114
- print source_text[p]
115
- else
116
- print '.'
117
- end
118
- end
119
- puts
120
- puts "=====END"
121
-
122
- puts "=====BEGIN"
123
- (0 .. source_text.rstrip.length).each do |p|
124
- t = alignment.transform_end_position(p)
125
- if t.nil?
126
- print source_text[p]
127
- else
128
- print '.'
129
- end
130
- end
131
- puts
132
- puts "=====END"
133
-
134
- # alignment.block_alignments.each do |a|
135
- # if a[:alignment].nil? || a[:alignment] == :empty
136
- # # p [a[:source], a[:target]]
137
- # # p a[:alignment]
113
+ # source_text = source_annotations[:text]
114
+ # puts "=====BEGIN"
115
+ # (0 ... source_text.rstrip.length).each do |p|
116
+ # t = alignment.transform_begin_position(p)
117
+ # if t.nil?
118
+ # print source_text[p]
119
+ # else
120
+ # print '.'
121
+ # end
122
+ # end
123
+ # puts
124
+ # puts "=====END"
125
+
126
+ # puts "=====BEGIN"
127
+ # (0 .. source_text.rstrip.length).each do |p|
128
+ # t = alignment.transform_end_position(p)
129
+ # if t.nil?
130
+ # print source_text[p]
138
131
  # else
139
- # p [a[:source], a[:target]]
140
- # p a[:alignment].similarity
141
- # puts "--"
142
- # puts source_annotations[:text][a[:source][:begin] ... a[:source][:end]]
143
- # puts "--"
144
- # puts target_text[a[:target][:begin] ... a[:target][:end]]
145
- # puts "======"
132
+ # print '.'
146
133
  # end
147
134
  # end
135
+ # puts
136
+ # puts "=====END"
137
+
138
+ source_text = source_annotations[:text]
139
+
140
+ puts "[block alignment]"
141
+ puts alignment.alignment_show
142
+ puts "====="
148
143
  # exit
149
144
 
150
145
  # verification of source denotations
@@ -33,7 +33,9 @@ class TextAlignment::LCSComparison
33
33
  @str2_match_initial = sdiff[match_initial].new_position
34
34
  @str1_match_final = sdiff[match_final].old_position
35
35
  @str2_match_final = sdiff[match_final].new_position
36
- @similarity = 2 * lcs / ((@str1_match_final - @str1_match_initial + 1) + (@str2_match_final - @str2_match_initial + 1)).to_f
36
+ mlcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
37
+ @similarity = 2 * mlcs / (str1[@str1_match_initial .. @str1_match_final].scan(/\S/).count + str2[@str2_match_initial .. @str2_match_final].scan(/\S/).count).to_f
38
+ # @similarity = 2 * lcs / (str1[@str1_match_initial .. @str1_match_final].length + str2[@str2_match_initial .. @str2_match_final].length).to_f
37
39
  else
38
40
  @str1_match_initial = 0
39
41
  @str2_match_initial = 0
@@ -8,20 +8,29 @@ module TextAlignment; end unless defined? TextAlignment
8
8
  TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
9
9
 
10
10
  class TextAlignment::TextAlignment
11
- attr_reader :block_alignments
11
+ attr_reader :block_alignment
12
12
  attr_reader :similarity
13
13
  attr_reader :lost_annotations
14
14
 
15
15
  def initialize(_str1, _str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
16
16
  raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
17
17
 
18
+ @block_alignment = {source_text:_str1, target_text:_str2}
19
+
18
20
  str1, str2, mappings = string_preprocessing(_str1, _str2)
19
21
 
20
22
  # try exact match
21
23
  block_begin = str2.index(str1)
22
24
  unless block_begin.nil?
23
- @block_alignments = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin}]
24
- return @block_alignments
25
+ @block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
26
+ return @block_alignment
27
+ end
28
+
29
+ # try exact match
30
+ block_begin = str2.downcase.index(str1.downcase)
31
+ unless block_begin.nil?
32
+ @block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
33
+ return @block_alignment
25
34
  end
26
35
 
27
36
  anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
@@ -54,7 +63,7 @@ class TextAlignment::TextAlignment
54
63
  # puts
55
64
 
56
65
  ## To find block alignments
57
- @block_alignments = []
66
+ @block_alignment[:blocks] = []
58
67
  return if mblocks.empty?
59
68
 
60
69
  # Initial step
@@ -63,35 +72,36 @@ class TextAlignment::TextAlignment
63
72
  e2 = mblocks[0][:target][:begin]
64
73
 
65
74
  if mblocks[0][:target][:begin] == 0
66
- @block_alignments << {source:{begin:0, end:e1}, target:{begin:0, end:0}, alignment: :empty}
75
+ @block_alignment[:blocks] << {source:{begin:0, end:e1}, target:{begin:0, end:0}, alignment: :empty}
67
76
  else
68
77
  _str1 = str1[0 ... e1]
69
78
  _str2 = str2[0 ... e2]
70
79
 
71
80
  unless _str1.strip.empty?
72
81
  if _str2.strip.empty?
73
- @block_alignments << {source:{begin:0, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
82
+ @block_alignment[:blocks] << {source:{begin:0, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
74
83
  else
75
84
  len_min = [_str1.length, _str2.length].min
76
85
  len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
77
86
  b1 = _str1.length < len_buffer ? 0 : e1 - len_buffer
78
87
  b2 = _str2.length < len_buffer ? 0 : e2 - len_buffer
79
88
 
80
- @block_alignments << {source:{begin:0, end:b1}, target:{begin:0, end:b2}, alignment: :empty} if b1 > 0
89
+ @block_alignment[:blocks] << {source:{begin:0, end:b1}, target:{begin:0, end:b2}, alignment: :empty} if b1 > 0
81
90
 
82
91
  _str1 = str1[b1 ... e1]
83
92
  _str2 = str2[b2 ... e2]
84
93
  alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
85
- if alignment.similarity < 0.6
86
- @block_alignments << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
94
+ similarity = alignment_similarity(_str1, _str2, alignment)
95
+ if similarity < 0.6
96
+ @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment: :empty, similarity: similarity}
87
97
  else
88
- @block_alignments << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment:alignment}
98
+ @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment:alignment}
89
99
  end
90
100
  end
91
101
  end
92
102
  end
93
103
  end
94
- @block_alignments << mblocks[0]
104
+ @block_alignment[:blocks] << mblocks[0].merge(alignment: :block)
95
105
 
96
106
  (1 ... mblocks.length).each do |i|
97
107
  b1 = mblocks[i - 1][:source][:end]
@@ -102,17 +112,18 @@ class TextAlignment::TextAlignment
102
112
  _str2 = str2[b2 ... e2]
103
113
  unless _str1.strip.empty?
104
114
  if _str2.strip.empty?
105
- @block_alignments << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
115
+ @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
106
116
  else
107
117
  alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
108
- if alignment.similarity < 0.6
109
- @block_alignments << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
118
+ similarity = alignment_similarity(_str1, _str2, alignment)
119
+ if similarity < 0.6
120
+ @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: similarity}
110
121
  else
111
- @block_alignments << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
122
+ @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
112
123
  end
113
124
  end
114
125
  end
115
- @block_alignments << mblocks[i]
126
+ @block_alignment[:blocks] << mblocks[i].merge(alignment: :block)
116
127
  end
117
128
 
118
129
  # Final step
@@ -124,7 +135,7 @@ class TextAlignment::TextAlignment
124
135
 
125
136
  unless _str1.strip.empty?
126
137
  if _str2.strip.empty?
127
- @block_alignments << {source:{begin:b1, end:str1.length}, target:{begin:b2, end:str2.length}, alignment: :empty}
138
+ @block_alignment[:blocks] << {source:{begin:b1, end:str1.length}, target:{begin:b2, end:str2.length}, alignment: :empty}
128
139
  else
129
140
  len_min = [_str1.length, _str2.length].min
130
141
  len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
@@ -134,57 +145,58 @@ class TextAlignment::TextAlignment
134
145
  _str2 = str2[b2 ... e2]
135
146
 
136
147
  alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
137
- if alignment.similarity < 0.6
138
- @block_alignments << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
148
+ similarity = alignment_similarity(_str1, _str2, alignment)
149
+ if similarity < 0.6
150
+ @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: similarity}
139
151
  else
140
- @block_alignments << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
152
+ @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
141
153
  end
142
154
 
143
- @block_alignments << {source:{begin:e1, end:-1}, target:{begin:e2, end:-1}, alignment: :empty} if e1 < str1.length
155
+ @block_alignment[:blocks] << {source:{begin:e1, end:-1}, target:{begin:e2, end:-1}, alignment: :empty} if e1 < str1.length
144
156
  end
145
157
  end
146
158
  end
147
159
 
148
- @block_alignments.each do |a|
160
+ @block_alignment[:blocks].each do |a|
149
161
  a[:delta] = a[:target][:begin] - a[:source][:begin]
150
162
  end
151
163
  end
152
164
 
153
165
  def transform_begin_position(begin_position)
154
- i = @block_alignments.index{|b| b[:source][:end] > begin_position}
155
- block_alignment = @block_alignments[i]
156
-
157
- b = if block_alignment[:alignment].nil?
158
- begin_position + block_alignment[:delta]
159
- elsif block_alignment[:alignment] == :empty
160
- if begin_position == block_alignment[:source][:begin]
161
- block_alignment[:target][:begin]
166
+ i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
167
+ block = @block_alignment[:blocks][i]
168
+
169
+ b = if block[:alignment] == :block
170
+ begin_position + block[:delta]
171
+ elsif block[:alignment] == :empty
172
+ if begin_position == block[:source][:begin]
173
+ block[:target][:begin]
162
174
  else
163
175
  # raise "lost annotation"
164
176
  nil
165
177
  end
166
178
  else
167
- r = block_alignment[:alignment].transform_begin_position(begin_position - block_alignment[:source][:begin])
168
- r.nil? ? nil : r + block_alignment[:target][:begin]
179
+ r = block[:alignment].transform_begin_position(begin_position - block[:source][:begin])
180
+ r.nil? ? nil : r + block[:target][:begin]
169
181
  end
170
182
  end
171
183
 
172
184
  def transform_end_position(end_position)
173
- i = @block_alignments.index{|b| b[:source][:end] >= end_position}
174
- block_alignment = @block_alignments[i]
175
-
176
- e = if block_alignment[:alignment].nil?
177
- end_position + block_alignment[:delta]
178
- elsif block_alignment[:alignment] == :empty
179
- if end_position == block_alignment[:source][:end]
180
- block_alignment[:target][:end]
185
+ i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
186
+ block = @block_alignment[:blocks][i]
187
+
188
+ e = if block[:alignment] == :block
189
+ end_position + block[:delta]
190
+ elsif block[:alignment] == :empty
191
+ if end_position == block[:source][:end]
192
+ block[:target][:end]
181
193
  else
182
194
  # raise "lost annotation"
183
195
  nil
184
196
  end
185
197
  else
186
- r = block_alignment[:alignment].transform_end_position(end_position - block_alignment[:source][:begin])
187
- r.nil? ? nil : r + block_alignment[:target][:begin]
198
+ r = block[:alignment].transform_end_position(end_position - block[:source][:begin])
199
+ r.nil? ? nil : r + block[:target][:begin]
188
200
  end
189
201
  end
190
202
 
@@ -230,8 +242,63 @@ class TextAlignment::TextAlignment
230
242
  r
231
243
  end
232
244
 
233
- private
245
+ def alignment_show
246
+ stext = @block_alignment[:source_text]
247
+ ttext = @block_alignment[:target_text]
248
+
249
+ show = ''
250
+ @block_alignment[:blocks].each do |a|
251
+ show += case a[:alignment]
252
+ when :block
253
+ "===== common =====\n" +
254
+ stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
255
+ when :empty
256
+ "xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
257
+ "<<<<< string 1\n" +
258
+ stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
259
+ ">>>>> string 2\n" +
260
+ ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
261
+ else
262
+ astr1 = ''
263
+ astr2 = ''
264
+
265
+ base = a[:source][:begin]
266
+ astr1 = a[:alignment].sdiff.map do |c|
267
+ case c.action
268
+ when '='
269
+ stext[c.old_position + base]
270
+ when '+'
271
+ '_'
272
+ when '-'
273
+ stext[c.old_position + base]
274
+ when '!'
275
+ stext[c.old_position + base] + '_'
276
+ end
277
+ end.join('')
278
+
279
+ base = a[:target][:begin]
280
+ astr2 = a[:alignment].sdiff.map do |c|
281
+ case c.action
282
+ when '='
283
+ ttext[c.new_position + base]
284
+ when '+'
285
+ ttext[c.new_position + base]
286
+ when '-'
287
+ '_'
288
+ when '!'
289
+ '_' + ttext[c.new_position + base]
290
+ end
291
+ end.join('')
234
292
 
293
+ "***** local mismatch\n" +
294
+ "[#{astr1}]\n" +
295
+ "[#{astr2}]\n\n"
296
+ end
297
+ end
298
+ show
299
+ end
300
+
301
+ private
235
302
 
236
303
  def string_preprocessing(_str1, _str2)
237
304
  str1 = _str1.dup
@@ -253,15 +320,15 @@ class TextAlignment::TextAlignment
253
320
  pletters = TextAlignment::PADDING_LETTERS
254
321
 
255
322
  # find the padding letter for str1
256
- padding_letter1 = begin
323
+ @padding_letter1 = begin
257
324
  i = pletters.index{|l| str2.index(l).nil?}
258
325
  raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
259
326
  TextAlignment::PADDING_LETTERS[i]
260
327
  end
261
328
 
262
329
  # find the padding letter for str2
263
- padding_letter2 = begin
264
- i = pletters.index{|l| l != padding_letter1 && str1.index(l).nil?}
330
+ @padding_letter2 = begin
331
+ i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
265
332
  raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
266
333
  TextAlignment::PADDING_LETTERS[i]
267
334
  end
@@ -272,12 +339,12 @@ class TextAlignment::TextAlignment
272
339
  from = f[1]
273
340
 
274
341
  if str2.index(f[0])
275
- to = f[0] + (padding_letter1 * (f[1].length - 1))
342
+ to = f[0] + (@padding_letter1 * (f[1].length - 1))
276
343
  str1.gsub!(from, to)
277
344
  end
278
345
 
279
346
  if str1.index(f[0])
280
- to = f[0] + (padding_letter2 * (f[1].length - 1))
347
+ to = f[0] + (@padding_letter2 * (f[1].length - 1))
281
348
  str2.gsub!(from, to)
282
349
  end
283
350
  end
@@ -286,4 +353,13 @@ class TextAlignment::TextAlignment
286
353
  [str1, str2, mappings]
287
354
  end
288
355
 
356
+ def alignment_similarity(_s1, _s2, alignment)
357
+ # compute the lcs only with non-whitespace letters
358
+ lcs = alignment.sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
359
+
360
+ s1 = _s1.tr(@padding_letter1, ' ')
361
+ s2 = _s2.tr(@padding_letter2, ' ')
362
+ similarity = 2 * lcs / (s1.scan(/\S/).count + s2.scan(/\S/).count).to_f
363
+ end
364
+
289
365
  end
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.4.3'
2
+ VERSION = '0.6.1'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.3
4
+ version: 0.6.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-10-02 00:00:00.000000000 Z
11
+ date: 2020-10-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary