text_alignment 0.6.1 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: fb5dd06236d0b1a8a9c8c5fcb92807a62bdd30e0648bcbd636b95b2a8a45b9b4
4
- data.tar.gz: 9266b852993bfee999daa92e3f38ec93e2aec77171fee27c1fea6ac2a17e4d23
3
+ metadata.gz: '088ea92f4ca68c574cbd04bbf932aa70014b58cbbad82028b7161e0af35cdb4c'
4
+ data.tar.gz: 455e6b53a846e7ebf0a90e724b93b78b61025ccb2d02c1e167f1969946b292e2
5
5
  SHA512:
6
- metadata.gz: 7ee2a590fb31bcc27121a4a227d7fcefe2e8e80646bea3898bb86729ca3ca299e0aebcf23bea30e2391687e6ec0d6573c04a4605f728562482c7edbd0c0285e0
7
- data.tar.gz: 73612c185fe533b0daa22d44e7776ed610025cb1bd874f05d95761079f95d1e8a06ead68c88b84bab4d33e8a676edff1e98880912254d9a7ecb5c4ead5eb01fb
6
+ metadata.gz: 02cdb75cc9b95415c0e86d78bc0278ab8a0cf5a7afa1870ba5c10f0137e59ea782b92e8130ea2f9b3e16fd43b08c8c72da0d2b2ecf4546cb6a46a72ad62957ef
7
+ data.tar.gz: 1216190e0b3880acedc6b735b70a3611469f4fcb4506f059271acbef8d84ffdee793234029c8e5f9e9d1f10cdf088ed44a8c54c93986b19757036aceaf938247
@@ -105,9 +105,7 @@ lost_annotations = []
105
105
  target_annotations = if source_annotations.class == Array
106
106
  align_mdoc(source_annotations, {text: target_text})
107
107
  else
108
- alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text)
109
-
110
- # pp alignment
108
+ alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text, source_annotations[:denotations])
111
109
 
112
110
  # verification
113
111
  # source_text = source_annotations[:text]
@@ -142,22 +140,7 @@ else
142
140
  puts "====="
143
141
  # exit
144
142
 
145
- # verification of source denotations
146
- puts "[Invalid source denotations]"
147
- source_annotations[:denotations] do |d|
148
- p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < source_text.length
149
- end
150
- puts "====="
151
- puts
152
-
153
143
  denotations = alignment.transform_hdenotations(source_annotations[:denotations])
154
- puts "[Invalid transformation]"
155
- denotations.each do |d|
156
- p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < target_text.length
157
- end
158
- puts "====="
159
- puts
160
-
161
144
  lost_annotations += alignment.lost_annotations if alignment.lost_annotations
162
145
 
163
146
  source_annotations.merge({text:target_text, denotations:denotations})
@@ -194,7 +177,9 @@ warn "denotations:\t#{target_annotations[:denotations].nil? ? 0 : target_annotat
194
177
 
195
178
  if lost_annotations
196
179
  warn "\n[lost annotations]"
197
- warn "#{lost_annotations.length}"
180
+ lost_annotations.each do |a|
181
+ p a
182
+ end
198
183
  end
199
184
 
200
185
  #puts target_annotations.to_json
@@ -17,9 +17,10 @@ class TextAlignment::MixedAlignment
17
17
  attr_reader :similarity
18
18
  attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final
19
19
 
20
- def initialize(str1, str2, mappings = [])
21
- raise ArgumentError, "nil string" if str1.nil? || str2.nil?
22
- mappings ||= []
20
+ def initialize(_str1, _str2)
21
+ raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
22
+
23
+ str1, str2, mappings = string_preprocessing(_str1, _str2)
23
24
 
24
25
  _compute_mixed_alignment(str1, str2, mappings)
25
26
  end
@@ -62,7 +63,7 @@ class TextAlignment::MixedAlignment
62
63
  end
63
64
 
64
65
  cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
65
- @similarity = cmp.similarity
66
+ @similarity = compute_similarity(str1, str2, @sdiff)
66
67
  @str1_match_initial = cmp.str1_match_initial
67
68
  @str1_match_final = cmp.str1_match_final
68
69
  @str2_match_initial = cmp.str2_match_initial
@@ -137,4 +138,73 @@ class TextAlignment::MixedAlignment
137
138
  @position_map_begin = posmap_begin.sort.to_h
138
139
  @position_map_end = posmap_end.sort.to_h
139
140
  end
141
+
142
+ private
143
+
144
+ def string_preprocessing(_str1, _str2)
145
+ str1 = _str1.dup
146
+ str2 = _str2.dup
147
+ mappings = TextAlignment::MAPPINGS.dup
148
+
149
+ ## single character mappings
150
+ character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
151
+ characters_from = character_mappings.collect{|m| m[0]}.join
152
+ characters_to = character_mappings.collect{|m| m[1]}.join
153
+ characters_to.gsub!(/-/, '\-')
154
+
155
+ str1.tr!(characters_from, characters_to)
156
+ str2.tr!(characters_from, characters_to)
157
+
158
+ mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
159
+
160
+ ## long to one character mappings
161
+ pletters = TextAlignment::PADDING_LETTERS
162
+
163
+ # find the padding letter for str1
164
+ @padding_letter1 = begin
165
+ i = pletters.index{|l| str2.index(l).nil?}
166
+ raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
167
+ TextAlignment::PADDING_LETTERS[i]
168
+ end
169
+
170
+ # find the padding letter for str2
171
+ @padding_letter2 = begin
172
+ i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
173
+ raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
174
+ TextAlignment::PADDING_LETTERS[i]
175
+ end
176
+
177
+ # ASCII foldings
178
+ ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
179
+ ascii_foldings.each do |f|
180
+ from = f[1]
181
+
182
+ if str2.index(f[0])
183
+ to = f[0] + (@padding_letter1 * (f[1].length - 1))
184
+ str1.gsub!(from, to)
185
+ end
186
+
187
+ if str1.index(f[0])
188
+ to = f[0] + (@padding_letter2 * (f[1].length - 1))
189
+ str2.gsub!(from, to)
190
+ end
191
+ end
192
+ mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
193
+
194
+ [str1, str2, mappings]
195
+ end
196
+
197
+ def compute_similarity(_s1, _s2, sdiff)
198
+ return 0 if sdiff.nil?
199
+
200
+ # compute the lcs only with non-whitespace letters
201
+ lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
202
+ return 0 if lcs == 0
203
+
204
+ s1 = _s1.tr(@padding_letter1, ' ')
205
+ s2 = _s2.tr(@padding_letter2, ' ')
206
+
207
+ similarity = lcs / [s1.scan(/\S/).count, s2.scan(/\S/).count].min.to_f
208
+ end
209
+
140
210
  end
@@ -12,45 +12,46 @@ class TextAlignment::TextAlignment
12
12
  attr_reader :similarity
13
13
  attr_reader :lost_annotations
14
14
 
15
- def initialize(_str1, _str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
16
- raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
15
+ def initialize(str1, str2, denotations = nil, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
16
+ raise ArgumentError, "nil string" if str1.nil? || str2.nil?
17
17
 
18
- @block_alignment = {source_text:_str1, target_text:_str2}
18
+ @block_alignment = {source_text:str1, target_text:str2}
19
+ @str1 = str1
20
+ @str2 = str2
19
21
 
20
- str1, str2, mappings = string_preprocessing(_str1, _str2)
21
-
22
- # try exact match
22
+ ## Block exact match
23
23
  block_begin = str2.index(str1)
24
24
  unless block_begin.nil?
25
25
  @block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
26
- return @block_alignment
26
+ return
27
27
  end
28
28
 
29
- # try exact match
30
29
  block_begin = str2.downcase.index(str1.downcase)
31
30
  unless block_begin.nil?
32
31
  @block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
33
- return @block_alignment
32
+ return
34
33
  end
35
34
 
35
+
36
+ ## to find block alignments
36
37
  anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
37
38
 
38
- # To collect matched blocks
39
- mblocks = []
40
- while anchor = anchor_finder.get_next_anchor
41
- last = mblocks.last
42
- if last && (anchor[:source][:begin] == last[:source][:end] + 1) && (anchor[:target][:begin] == last[:target][:end] + 1)
43
- last[:source][:end] = anchor[:source][:end]
44
- last[:target][:end] = anchor[:target][:end]
39
+ blocks = []
40
+ while block = anchor_finder.get_next_anchor
41
+ last = blocks.last
42
+ if last && (block[:source][:begin] == last[:source][:end] + 1) && (block[:target][:begin] == last[:target][:end] + 1)
43
+ last[:source][:end] = block[:source][:end]
44
+ last[:target][:end] = block[:target][:end]
45
45
  else
46
- mblocks << anchor
46
+ blocks << block.merge(alignment: :block, delta: block[:target][:begin] - block[:source][:begin])
47
47
  end
48
48
  end
49
49
 
50
- # pp mblocks
50
+ # pp blocks
51
51
  # puts "-----"
52
52
  # puts
53
- # mblocks.each do |b|
53
+ # exit
54
+ # blocks.each do |b|
54
55
  # p [b[:source], b[:target]]
55
56
  # puts "---"
56
57
  # puts str1[b[:source][:begin] ... b[:source][:end]]
@@ -62,117 +63,198 @@ class TextAlignment::TextAlignment
62
63
  # puts "-=-=-=-=-"
63
64
  # puts
64
65
 
65
- ## To find block alignments
66
- @block_alignment[:blocks] = []
67
- return if mblocks.empty?
68
-
69
- # Initial step
70
- if mblocks[0][:source][:begin] > 0
71
- e1 = mblocks[0][:source][:begin]
72
- e2 = mblocks[0][:target][:begin]
66
+ ## to fill the gaps
67
+ last_block = nil
68
+ blocks2 = blocks.inject([]) do |sum, block|
69
+ b1 = last_block ? last_block[:source][:end] : 0
70
+ e1 = block[:source][:begin]
73
71
 
74
- if mblocks[0][:target][:begin] == 0
75
- @block_alignment[:blocks] << {source:{begin:0, end:e1}, target:{begin:0, end:0}, alignment: :empty}
72
+ sum += if b1 == e1
73
+ [block]
76
74
  else
77
- _str1 = str1[0 ... e1]
78
- _str2 = str2[0 ... e2]
75
+ b2 = last_block ? last_block[:target][:end] : 0
76
+ e2 = block[:target][:begin]
77
+
78
+ if b2 == e2
79
+ [
80
+ {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
81
+ block
82
+ ]
83
+ else
84
+ if b1 == 0 && b2 == 0
85
+ len_buffer = (e1 * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
86
+ b2 = e2 - len_buffer if e2 > len_buffer
87
+ end
79
88
 
80
- unless _str1.strip.empty?
81
- if _str2.strip.empty?
82
- @block_alignment[:blocks] << {source:{begin:0, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
89
+ _str1 = str1[b1 ... e1]
90
+ _str2 = str2[b2 ... e2]
91
+
92
+ if _str1.strip.empty? || _str2.strip.empty?
93
+ [
94
+ {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
95
+ block
96
+ ]
83
97
  else
84
- len_min = [_str1.length, _str2.length].min
85
- len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
86
- b1 = _str1.length < len_buffer ? 0 : e1 - len_buffer
87
- b2 = _str2.length < len_buffer ? 0 : e2 - len_buffer
88
-
89
- @block_alignment[:blocks] << {source:{begin:0, end:b1}, target:{begin:0, end:b2}, alignment: :empty} if b1 > 0
90
-
91
- _str1 = str1[b1 ... e1]
92
- _str2 = str2[b2 ... e2]
93
- alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
94
- similarity = alignment_similarity(_str1, _str2, alignment)
95
- if similarity < 0.6
96
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment: :empty, similarity: similarity}
97
- else
98
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment:alignment}
99
- end
98
+ local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations) << block
100
99
  end
101
100
  end
102
101
  end
102
+
103
+ last_block = block
104
+ sum
103
105
  end
104
- @block_alignment[:blocks] << mblocks[0].merge(alignment: :block)
105
-
106
- (1 ... mblocks.length).each do |i|
107
- b1 = mblocks[i - 1][:source][:end]
108
- b2 = mblocks[i - 1][:target][:end]
109
- e1 = mblocks[i][:source][:begin]
110
- e2 = mblocks[i][:target][:begin]
111
- _str1 = str1[b1 ... e1]
112
- _str2 = str2[b2 ... e2]
113
- unless _str1.strip.empty?
114
- if _str2.strip.empty?
115
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
106
+
107
+ # the last step
108
+ blocks2 += if last_block.nil?
109
+ local_alignment_blocks(str1, 0, str1.length, str2, 0, str2.length, denotations)
110
+ else
111
+ b1 = last_block[:source][:end]
112
+ if b1 < str1.length
113
+ e1 = str1.length
114
+
115
+ b2 = last_block[:target][:end]
116
+ if b2 < str2.length
117
+ len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
118
+ e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
119
+ local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
116
120
  else
117
- alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
118
- similarity = alignment_similarity(_str1, _str2, alignment)
119
- if similarity < 0.6
120
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: similarity}
121
- else
122
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
121
+ [{source:{begin:last_block[:source][:end], end:str1.length}, alignment: :empty}]
122
+ end
123
+ else
124
+ []
125
+ end
126
+ end
127
+
128
+ @block_alignment[:blocks] = blocks2
129
+ end
130
+
131
+ def local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations = nil)
132
+ block2 = str2[b2 ... e2]
133
+
134
+ ## term-based alignment
135
+ tblocks = if denotations
136
+ ds_in_scope = denotations.select{|d| d[:span][:begin] >= b1 && d[:span][:end] <= e1}.
137
+ sort{|d1, d2| d1[:span][:begin] <=> d2[:span][:begin] || d2[:span][:end] <=> d1[:span][:end] }.
138
+ map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
139
+
140
+ position = 0
141
+ tblocks = ds_in_scope.map do |term|
142
+ lex = term[:lex]
143
+ r = block2.index(lex, position)
144
+ if r.nil?
145
+ position = nil
146
+ break
147
+ end
148
+ position = r + lex.length
149
+ {source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, delta: r - term[:span][:begin]}
150
+ end
151
+
152
+ # missing term found
153
+ tblocks = [] if position.nil?
154
+
155
+ # redundant matching found
156
+ unless position.nil?
157
+ ds_in_scope.each do |term|
158
+ lex = term[:lex]
159
+ look_forward = block2.index(lex, position)
160
+ unless look_forward.nil?
161
+ puts lex
162
+ tblocks = []
163
+ break
123
164
  end
124
165
  end
125
166
  end
126
- @block_alignment[:blocks] << mblocks[i].merge(alignment: :block)
167
+
168
+ tblocks
127
169
  end
128
170
 
129
- # Final step
130
- if mblocks[-1][:source][:end] < str1.length && mblocks[-1][:target][:end] < str2.length
131
- b1 = mblocks[-1][:source][:end]
132
- b2 = mblocks[-1][:target][:end]
133
- _str1 = str1[b1 ... str1.length]
134
- _str2 = str2[b2 ... str2.length]
171
+ if tblocks.empty?
172
+ if b1 == 0 && e1 == str1.length
173
+ block1 = str1[b1 ... e1]
174
+ block2 = str2[b2 ... e2]
135
175
 
136
- unless _str1.strip.empty?
137
- if _str2.strip.empty?
138
- @block_alignment[:blocks] << {source:{begin:b1, end:str1.length}, target:{begin:b2, end:str2.length}, alignment: :empty}
176
+ ## character-based alignment
177
+ alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
178
+ if alignment.sdiff.nil?
179
+ [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
139
180
  else
140
- len_min = [_str1.length, _str2.length].min
141
- len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
142
- e1 = _str1.length < len_buffer ? str1.length : b1 + len_buffer
143
- e2 = _str2.length < len_buffer ? str2.length : b2 + len_buffer
144
- _str1 = str1[b1 ... e1]
145
- _str2 = str2[b2 ... e2]
181
+ [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
182
+ end
183
+ else
184
+ block1 = str1[b1 ... e1]
185
+ block2 = str2[b2 ... e2]
186
+
187
+ ## character-based alignment
188
+ alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
189
+ if alignment.sdiff.nil?
190
+ [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
191
+ else
192
+ [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
193
+ end
194
+ end
195
+ else
196
+ last_tblock = nil
197
+ lblocks = tblocks.inject([]) do |sum, tblock|
198
+ tb1 = last_tblock ? last_tblock[:source][:end] : b1
199
+ te1 = tblock[:source][:begin]
146
200
 
147
- alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
148
- similarity = alignment_similarity(_str1, _str2, alignment)
149
- if similarity < 0.6
150
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: similarity}
201
+ sum += if te1 == tb1
202
+ [tblock]
203
+ else
204
+ tb2 = last_tblock ? last_tblock[:target][:end] : b2
205
+ te2 = tblock[:target][:begin]
206
+
207
+ if b2 == e2
208
+ [
209
+ {source:{begin:tb1, end:te1}, alignment: :empty},
210
+ tblock
211
+ ]
151
212
  else
152
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
213
+ [
214
+ {source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty},
215
+ tblock
216
+ ]
153
217
  end
218
+ end
154
219
 
155
- @block_alignment[:blocks] << {source:{begin:e1, end:-1}, target:{begin:e2, end:-1}, alignment: :empty} if e1 < str1.length
220
+ last_tblock = tblock
221
+ sum
222
+ end
223
+
224
+ if last_tblock[:source][:end] < e1
225
+ if last_tblock[:target][:end] < e2
226
+ lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, target:{begin:last_tblock[:target][:end], end:e2}, alignment: :empty}
227
+ else
228
+ lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, alignment: :empty}
156
229
  end
157
230
  end
158
- end
159
231
 
160
- @block_alignment[:blocks].each do |a|
161
- a[:delta] = a[:target][:begin] - a[:source][:begin]
232
+ lblocks
162
233
  end
163
234
  end
164
235
 
236
+
237
+ def indices(str, target)
238
+ position = 0
239
+ len = target.len
240
+ Enumerator.new do |yielder|
241
+ while idx = str.index(target, position)
242
+ yielder << idx
243
+ position = idx + len
244
+ end
245
+ end
246
+ end
247
+
165
248
  def transform_begin_position(begin_position)
166
249
  i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
167
250
  block = @block_alignment[:blocks][i]
168
251
 
169
- b = if block[:alignment] == :block
252
+ b = if block[:alignment] == :block || block[:alignment] == :term
170
253
  begin_position + block[:delta]
171
254
  elsif block[:alignment] == :empty
172
255
  if begin_position == block[:source][:begin]
173
256
  block[:target][:begin]
174
257
  else
175
- # raise "lost annotation"
176
258
  nil
177
259
  end
178
260
  else
@@ -185,13 +267,12 @@ class TextAlignment::TextAlignment
185
267
  i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
186
268
  block = @block_alignment[:blocks][i]
187
269
 
188
- e = if block[:alignment] == :block
270
+ e = if block[:alignment] == :block || block[:alignment] == :term
189
271
  end_position + block[:delta]
190
272
  elsif block[:alignment] == :empty
191
273
  if end_position == block[:source][:end]
192
274
  block[:target][:end]
193
275
  else
194
- # raise "lost annotation"
195
276
  nil
196
277
  end
197
278
  else
@@ -213,14 +294,14 @@ class TextAlignment::TextAlignment
213
294
  @lost_annotations = []
214
295
 
215
296
  denotations.each do |d|
216
- begin
217
- d.begin = transform_begin_position(d.begin);
218
- d.end = transform_end_position(d.end);
219
- rescue
220
- @lost_annotations << d
221
- d.begin = nil
222
- d.end = nil
223
- end
297
+ source = {begin:d.begin, end:d.end}
298
+ d.begin = transform_begin_position(d.begin);
299
+ d.end = transform_end_position(d.end);
300
+ raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @str2.length
301
+ rescue
302
+ @lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
303
+ d.begin = nil
304
+ d.end = nil
224
305
  end
225
306
 
226
307
  @lost_annotations
@@ -231,12 +312,12 @@ class TextAlignment::TextAlignment
231
312
  @lost_annotations = []
232
313
 
233
314
  r = hdenotations.collect do |d|
234
- new_d = begin
235
- d.dup.merge({span:transform_a_span(d[:span])})
236
- rescue
237
- @lost_annotations << d
238
- nil
239
- end
315
+ t = transform_a_span(d[:span])
316
+ raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @str2.length
317
+ new_d = d.dup.merge({span:t})
318
+ rescue
319
+ @lost_annotations << {source: d[:span], target:t}
320
+ nil
240
321
  end.compact
241
322
 
242
323
  r
@@ -250,13 +331,16 @@ class TextAlignment::TextAlignment
250
331
  @block_alignment[:blocks].each do |a|
251
332
  show += case a[:alignment]
252
333
  when :block
253
- "===== common =====\n" +
334
+ "===== common (block) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
335
+ stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
336
+ when :term
337
+ "===== common (term) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
254
338
  stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
255
339
  when :empty
256
340
  "xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
257
- "<<<<< string 1\n" +
341
+ "<<<<< string 1 [#{a[:source][:begin]} - #{a[:source][:end]}]\n" +
258
342
  stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
259
- ">>>>> string 2\n" +
343
+ ">>>>> string 2 [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
260
344
  ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
261
345
  else
262
346
  astr1 = ''
@@ -290,7 +374,7 @@ class TextAlignment::TextAlignment
290
374
  end
291
375
  end.join('')
292
376
 
293
- "***** local mismatch\n" +
377
+ "***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
294
378
  "[#{astr1}]\n" +
295
379
  "[#{astr2}]\n\n"
296
380
  end
@@ -298,68 +382,4 @@ class TextAlignment::TextAlignment
298
382
  show
299
383
  end
300
384
 
301
- private
302
-
303
- def string_preprocessing(_str1, _str2)
304
- str1 = _str1.dup
305
- str2 = _str2.dup
306
- mappings = TextAlignment::MAPPINGS.dup
307
-
308
- ## single character mappings
309
- character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
310
- characters_from = character_mappings.collect{|m| m[0]}.join
311
- characters_to = character_mappings.collect{|m| m[1]}.join
312
- characters_to.gsub!(/-/, '\-')
313
-
314
- str1.tr!(characters_from, characters_to)
315
- str2.tr!(characters_from, characters_to)
316
-
317
- mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
318
-
319
- ## long to one character mappings
320
- pletters = TextAlignment::PADDING_LETTERS
321
-
322
- # find the padding letter for str1
323
- @padding_letter1 = begin
324
- i = pletters.index{|l| str2.index(l).nil?}
325
- raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
326
- TextAlignment::PADDING_LETTERS[i]
327
- end
328
-
329
- # find the padding letter for str2
330
- @padding_letter2 = begin
331
- i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
332
- raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
333
- TextAlignment::PADDING_LETTERS[i]
334
- end
335
-
336
- # ASCII foldings
337
- ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
338
- ascii_foldings.each do |f|
339
- from = f[1]
340
-
341
- if str2.index(f[0])
342
- to = f[0] + (@padding_letter1 * (f[1].length - 1))
343
- str1.gsub!(from, to)
344
- end
345
-
346
- if str1.index(f[0])
347
- to = f[0] + (@padding_letter2 * (f[1].length - 1))
348
- str2.gsub!(from, to)
349
- end
350
- end
351
- mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
352
-
353
- [str1, str2, mappings]
354
- end
355
-
356
- def alignment_similarity(_s1, _s2, alignment)
357
- # compute the lcs only with non-whitespace letters
358
- lcs = alignment.sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
359
-
360
- s1 = _s1.tr(@padding_letter1, ' ')
361
- s2 = _s2.tr(@padding_letter2, ' ')
362
- similarity = 2 * lcs / (s1.scan(/\S/).count + s2.scan(/\S/).count).to_f
363
- end
364
-
365
385
  end
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.6.1'
2
+ VERSION = '0.7.1'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.1
4
+ version: 0.7.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-10-06 00:00:00.000000000 Z
11
+ date: 2020-10-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary