text_alignment 0.6 → 0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: dc820991f5f694f154b94c369158909ccba3760829e0d881c7fd2e6ef7ddd149
4
- data.tar.gz: 40ae6f2e388405426a77682bd1a3fb7a3c853076eced9b7301b632081dfd0a57
3
+ metadata.gz: 40796a906cbc366312741d3b410aaee15d28b778980674bb79d19ea1b6364d02
4
+ data.tar.gz: eb0121440bc005232b3d35eb1867b46004eb14c894b630f2ef49d311c4d99a26
5
5
  SHA512:
6
- metadata.gz: 5802241b4a8394d3c570c1d4b8f5e1d7706c72852e2d6e6fb23bda2f6e2972fa09f7001695db026667144e2af982eeb91ed0b700bd8151af6df794c98e3c069b
7
- data.tar.gz: 8d7c93acbef6ab12bb2a0291444a7bcc73b0236bb5b0d06d274e95aa30c9ffc829965653b58270686147a9ac30ccf570518b3ad266120b320dfb20cd1620f5f9
6
+ metadata.gz: e29037021763c9d4f581b278b72d319d364431e8abad48b45f21d68df102928eec993bbba582383a8a01b6ba7855f3542d2cc620ffa055a90aea08c2f5371117
7
+ data.tar.gz: 63c0e5b712b89012082197d3c4400cd705283e6d908f40003a399cb2efdc2b3aa4f6e445eb90e1ab4d7d39a9d14421881f945e0483c14ec52c1af7176ca07e81
@@ -105,9 +105,7 @@ lost_annotations = []
105
105
  target_annotations = if source_annotations.class == Array
106
106
  align_mdoc(source_annotations, {text: target_text})
107
107
  else
108
- alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text)
109
-
110
- # pp alignment
108
+ alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text, source_annotations[:denotations])
111
109
 
112
110
  # verification
113
111
  # source_text = source_annotations[:text]
@@ -142,22 +140,7 @@ else
142
140
  puts "====="
143
141
  # exit
144
142
 
145
- # verification of source denotations
146
- puts "[Invalid source denotations]"
147
- source_annotations[:denotations] do |d|
148
- p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < source_text.length
149
- end
150
- puts "====="
151
- puts
152
-
153
143
  denotations = alignment.transform_hdenotations(source_annotations[:denotations])
154
- puts "[Invalid transformation]"
155
- denotations.each do |d|
156
- p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < target_text.length
157
- end
158
- puts "====="
159
- puts
160
-
161
144
  lost_annotations += alignment.lost_annotations if alignment.lost_annotations
162
145
 
163
146
  source_annotations.merge({text:target_text, denotations:denotations})
@@ -194,7 +177,9 @@ warn "denotations:\t#{target_annotations[:denotations].nil? ? 0 : target_annotat
194
177
 
195
178
  if lost_annotations
196
179
  warn "\n[lost annotations]"
197
- warn "#{lost_annotations.length}"
180
+ lost_annotations.each do |a|
181
+ p a
182
+ end
198
183
  end
199
184
 
200
185
  #puts target_annotations.to_json
@@ -17,9 +17,10 @@ class TextAlignment::MixedAlignment
17
17
  attr_reader :similarity
18
18
  attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final
19
19
 
20
- def initialize(str1, str2, mappings = [])
21
- raise ArgumentError, "nil string" if str1.nil? || str2.nil?
22
- mappings ||= []
20
+ def initialize(_str1, _str2)
21
+ raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
22
+
23
+ str1, str2, mappings = string_preprocessing(_str1, _str2)
23
24
 
24
25
  _compute_mixed_alignment(str1, str2, mappings)
25
26
  end
@@ -62,7 +63,7 @@ class TextAlignment::MixedAlignment
62
63
  end
63
64
 
64
65
  cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
65
- @similarity = cmp.similarity
66
+ @similarity = compute_similarity(str1, str2, @sdiff)
66
67
  @str1_match_initial = cmp.str1_match_initial
67
68
  @str1_match_final = cmp.str1_match_final
68
69
  @str2_match_initial = cmp.str2_match_initial
@@ -137,4 +138,73 @@ class TextAlignment::MixedAlignment
137
138
  @position_map_begin = posmap_begin.sort.to_h
138
139
  @position_map_end = posmap_end.sort.to_h
139
140
  end
141
+
142
+ private
143
+
144
# Normalizes copies of the two texts so that they can be compared character
# by character, using the replacement pairs in TextAlignment::MAPPINGS.
#
# Three kinds of pairs are distinguished by the lengths of their elements:
# * one character -> one character: replaced directly in both texts;
# * one character -> several characters ("ASCII folding"): the long form is
#   rewritten as the single character followed by padding letters, so the
#   rewritten text keeps exactly the length of the text it replaces;
# * everything else: returned untouched to the caller.
#
# Side effect: stores the chosen padding letters in @padding_letter1 and
# @padding_letter2.
#
# @param _str1 [String] first text (not modified)
# @param _str2 [String] second text (not modified)
# @return [Array(String, String, Array)] the preprocessed copies and the
#   mappings that were not consumed here
# @raise [RuntimeError] when no usable padding letter is available
def string_preprocessing(_str1, _str2)
  str1 = _str1.dup
  str2 = _str2.dup

  single_chars, remaining = TextAlignment::MAPPINGS.partition { |m| m[0].length == 1 && m[1].length == 1 }
  ascii_foldings, mappings = remaining.partition { |m| m[0].length == 1 && m[1].length > 1 }

  ## single character mappings
  # A lookup table is used instead of String#tr: tr gives '^', '-' and '\'
  # a special meaning, so a mapping that involves one of them would silently
  # change the whole translation.
  unless single_chars.empty?
    table = single_chars.each_with_object({}) { |m, h| h[m[0]] = m[1] }
    pattern = Regexp.union(table.keys)
    str1.gsub!(pattern, table)
    str2.gsub!(pattern, table)
  end

  ## long to one character mappings
  pletters = TextAlignment::PADDING_LETTERS

  # the padding letter for str1 must not occur in str2
  letter1 = pletters.find { |l| str2.index(l).nil? }
  raise RuntimeError, "Could not find a padding letter for str1" if letter1.nil?
  @padding_letter1 = letter1

  # the padding letter for str2 must differ from the first one and must not occur in str1
  letter2 = pletters.find { |l| l != letter1 && str1.index(l).nil? }
  raise RuntimeError, "Could not find a padding letter for str2" if letter2.nil?
  @padding_letter2 = letter2

  # ASCII foldings
  ascii_foldings.each do |m|
    folded = m[0]
    expanded = m[1]
    pad = expanded.length - 1

    str1.gsub!(expanded, folded + (@padding_letter1 * pad)) if str2.index(folded)
    # This check runs after str1 may have been rewritten just above, so it
    # also fires when the folded character was only introduced by that rewrite.
    str2.gsub!(expanded, folded + (@padding_letter2 * pad)) if str1.index(folded)
  end

  [str1, str2, mappings]
end
196
+
197
# Computes how similar two strings are from their character-level sdiff.
#
# The score is the number of unchanged ('=') diff entries whose character is
# not whitespace, divided by the non-whitespace length of the shorter string.
# Padding letters (@padding_letter1 / @padding_letter2) are turned into
# spaces first, so they do not count towards either length.
#
# @param _s1 [String] first string
# @param _s2 [String] second string
# @param sdiff [Array, nil] diff entries responding to #action, #old_element
#   and #new_element
# @return [Numeric] 0 when there is no diff or nothing matches, otherwise a Float
def compute_similarity(_s1, _s2, sdiff)
  return 0 if sdiff.nil?

  # compute the lcs only with non-whitespace letters
  lcs = sdiff.count { |d| d.action == '=' && d.old_element.match?(/\S/) && d.new_element.match?(/\S/) }
  return 0 if lcs.zero?

  s1 = _s1.tr(@padding_letter1, ' ')
  s2 = _s2.tr(@padding_letter2, ' ')

  shorter = [s1.scan(/\S/).size, s2.scan(/\S/).size].min
  # Defensive: avoids returning Infinity if a string has no countable letters.
  return 0 if shorter.zero?

  lcs / shorter.to_f
end
209
+
140
210
  end
@@ -12,45 +12,46 @@ class TextAlignment::TextAlignment
12
12
  attr_reader :similarity
13
13
  attr_reader :lost_annotations
14
14
 
15
- def initialize(_str1, _str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
16
- raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
15
+ def initialize(str1, str2, denotations = nil, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
16
+ raise ArgumentError, "nil string" if str1.nil? || str2.nil?
17
17
 
18
- @block_alignment = {source_text:_str1, target_text:_str2}
18
+ @block_alignment = {source_text:str1, target_text:str2}
19
+ @str1 = str1
20
+ @str2 = str2
19
21
 
20
- str1, str2, mappings = string_preprocessing(_str1, _str2)
21
-
22
- # try exact match
22
+ ## Block exact match
23
23
  block_begin = str2.index(str1)
24
24
  unless block_begin.nil?
25
25
  @block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
26
- return @block_alignment
26
+ return
27
27
  end
28
28
 
29
- # try exact match
30
29
  block_begin = str2.downcase.index(str1.downcase)
31
30
  unless block_begin.nil?
32
31
  @block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
33
- return @block_alignment
32
+ return
34
33
  end
35
34
 
35
+
36
+ ## to find block alignments
36
37
  anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
37
38
 
38
- # To collect matched blocks
39
- mblocks = []
40
- while anchor = anchor_finder.get_next_anchor
41
- last = mblocks.last
42
- if last && (anchor[:source][:begin] == last[:source][:end] + 1) && (anchor[:target][:begin] == last[:target][:end] + 1)
43
- last[:source][:end] = anchor[:source][:end]
44
- last[:target][:end] = anchor[:target][:end]
39
+ blocks = []
40
+ while block = anchor_finder.get_next_anchor
41
+ last = blocks.last
42
+ if last && (block[:source][:begin] == last[:source][:end] + 1) && (block[:target][:begin] == last[:target][:end] + 1)
43
+ last[:source][:end] = block[:source][:end]
44
+ last[:target][:end] = block[:target][:end]
45
45
  else
46
- mblocks << anchor
46
+ blocks << block.merge(alignment: :block, delta: block[:target][:begin] - block[:source][:begin])
47
47
  end
48
48
  end
49
49
 
50
- # pp mblocks
50
+ # pp blocks
51
51
  # puts "-----"
52
52
  # puts
53
- # mblocks.each do |b|
53
+ # exit
54
+ # blocks.each do |b|
54
55
  # p [b[:source], b[:target]]
55
56
  # puts "---"
56
57
  # puts str1[b[:source][:begin] ... b[:source][:end]]
@@ -62,114 +63,196 @@ class TextAlignment::TextAlignment
62
63
  # puts "-=-=-=-=-"
63
64
  # puts
64
65
 
65
- ## To find block alignments
66
- @block_alignment[:blocks] = []
67
- return if mblocks.empty?
66
+ ## to fill the gaps
67
+ last_block = nil
68
+ blocks2 = blocks.inject([]) do |sum, block|
69
+ b1 = last_block ? last_block[:source][:end] : 0
70
+ e1 = block[:source][:begin]
68
71
 
69
- # Initial step
70
- if mblocks[0][:source][:begin] > 0
71
- e1 = mblocks[0][:source][:begin]
72
- e2 = mblocks[0][:target][:begin]
73
-
74
- if mblocks[0][:target][:begin] == 0
75
- @block_alignment[:blocks] << {source:{begin:0, end:e1}, target:{begin:0, end:0}, alignment: :empty}
72
+ sum += if b1 == e1
73
+ [block]
76
74
  else
77
- _str1 = str1[0 ... e1]
78
- _str2 = str2[0 ... e2]
75
+ b2 = last_block ? last_block[:target][:end] : 0
76
+ e2 = block[:target][:begin]
77
+
78
+ if b2 == e2
79
+ [
80
+ {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
81
+ block
82
+ ]
83
+ else
84
+ if b1 == 0 && b2 == 0
85
+ len_buffer = (e1 * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
86
+ b2 = e2 - len_buffer if e2 > len_buffer
87
+ end
88
+
89
+ _str1 = str1[b1 ... e1]
90
+ _str2 = str2[b2 ... e2]
79
91
 
80
- unless _str1.strip.empty?
81
- if _str2.strip.empty?
82
- @block_alignment[:blocks] << {source:{begin:0, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
92
+ if _str1.strip.empty? || _str2.strip.empty?
93
+ [
94
+ {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
95
+ block
96
+ ]
83
97
  else
84
- len_min = [_str1.length, _str2.length].min
85
- len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
86
- b1 = _str1.length < len_buffer ? 0 : e1 - len_buffer
87
- b2 = _str2.length < len_buffer ? 0 : e2 - len_buffer
88
-
89
- @block_alignment[:blocks] << {source:{begin:0, end:b1}, target:{begin:0, end:b2}, alignment: :empty} if b1 > 0
90
-
91
- _str1 = str1[b1 ... e1]
92
- _str2 = str2[b2 ... e2]
93
- alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
94
- if alignment.similarity < 0.6
95
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
96
- else
97
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment:alignment}
98
- end
98
+ local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations) << block
99
99
  end
100
100
  end
101
101
  end
102
+
103
+ last_block = block
104
+ sum
102
105
  end
103
- @block_alignment[:blocks] << mblocks[0].merge(alignment: :block)
104
-
105
- (1 ... mblocks.length).each do |i|
106
- b1 = mblocks[i - 1][:source][:end]
107
- b2 = mblocks[i - 1][:target][:end]
108
- e1 = mblocks[i][:source][:begin]
109
- e2 = mblocks[i][:target][:begin]
110
- _str1 = str1[b1 ... e1]
111
- _str2 = str2[b2 ... e2]
112
- unless _str1.strip.empty?
113
- if _str2.strip.empty?
114
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
106
+
107
+ # the last step
108
+ blocks2 += if last_block.nil?
109
+ local_alignment_blocks(str1, 0, str1.length, str2, 0, str2.length, denotations)
110
+ else
111
+ b1 = last_block[:source][:end]
112
+ if b1 < str1.length
113
+ e1 = str1.length
114
+
115
+ b2 = last_block[:target][:end]
116
+ if b2 < str2.length
117
+ len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
118
+ e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
119
+ local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
115
120
  else
116
- alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
117
- if alignment.similarity < 0.6
118
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
119
- else
120
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
121
- end
121
+ [{source:{begin:last_block[:source][:end], end:str1.length}, alignment: :empty}]
122
122
  end
123
123
  end
124
- @block_alignment[:blocks] << mblocks[i].merge(alignment: :block)
125
124
  end
126
125
 
127
- # Final step
128
- if mblocks[-1][:source][:end] < str1.length && mblocks[-1][:target][:end] < str2.length
129
- b1 = mblocks[-1][:source][:end]
130
- b2 = mblocks[-1][:target][:end]
131
- _str1 = str1[b1 ... str1.length]
132
- _str2 = str2[b2 ... str2.length]
126
+ @block_alignment[:blocks] = blocks2
127
+ end
133
128
 
134
- unless _str1.strip.empty?
135
- if _str2.strip.empty?
136
- @block_alignment[:blocks] << {source:{begin:b1, end:str1.length}, target:{begin:b2, end:str2.length}, alignment: :empty}
129
# Aligns the region str1[b1...e1] with the region str2[b2...e2] and returns
# the result as a list of alignment blocks.
#
# When denotations are given, a term-based alignment is tried first: every
# denotation that lies inside the source region must be found, in order, in
# the target region. The attempt is abandoned when a term is missing, when
# terms overlap in the source, or when any term text occurs again after the
# last match (the match would be ambiguous). If the term-based attempt
# yields nothing, the two regions are aligned character by character.
#
# @param str1 [String] whole source text
# @param b1 [Integer] begin of the source region (inclusive)
# @param e1 [Integer] end of the source region (exclusive)
# @param str2 [String] whole target text
# @param b2 [Integer] begin of the target region (inclusive)
# @param e2 [Integer] end of the target region (exclusive)
# @param denotations [Array<Hash>, nil] hashes with a :span of {begin:, end:}
#   given as source-text offsets
# @return [Array<Hash>] blocks with :source and :target ranges and an
#   :alignment of :term (plus :delta), :empty, or a MixedAlignment instance
#   (plus :similarity)
def local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations = nil)
  block2 = str2[b2...e2]

  ## term-based alignment
  # Starts as an empty list so that a nil `denotations` falls through to the
  # character-based alignment instead of failing on nil.
  tblocks = []
  if denotations
    # Order by begin, and by descending end on ties (outer spans first).
    terms = denotations
            .select { |d| d[:span][:begin] >= b1 && d[:span][:end] <= e1 }
            .sort_by { |d| [d[:span][:begin], -d[:span][:end]] }
            .map { |d| [d[:span], str1[d[:span][:begin]...d[:span][:end]]] }

    position = 0
    source_end = b1
    terms.each do |span, lex|
      r = block2.index(lex, position)

      # missing term, or a term overlapping the previous one: give up
      if r.nil? || span[:begin] < source_end
        tblocks = []
        position = nil
        break
      end

      position = r + lex.length
      source_end = span[:end]
      # r is relative to block2, so the absolute target offset is r + b2;
      # delta must be computed from that absolute offset as well.
      tblocks << {source: span, target: {begin: r + b2, end: r + b2 + lex.length}, alignment: :term, delta: r + b2 - span[:begin]}
    end

    # redundant matching found: a term text occurs again after the last match
    tblocks = [] if position && terms.any? { |_span, lex| block2.index(lex, position) }
  end

  if tblocks.empty?
    # NOTE(review): 2000 looks like a cut-off that skips character alignment
    # of a whole source text against a long target - confirm the intent.
    if b1 == 0 && e1 == str1.length && str2.length > 2000
      return [{source: {begin: b1, end: e1}, target: {begin: b2, end: e2}, alignment: :empty}]
    end

    ## character-based alignment
    alignment = TextAlignment::MixedAlignment.new(str1[b1...e1].downcase, block2.downcase)
    return [{source: {begin: b1, end: e1}, target: {begin: b2, end: e2}, alignment: alignment, similarity: alignment.similarity}]
  end

  ## fill the gaps between the term blocks
  # Every gap block carries a :target range (possibly zero-length), so code
  # that reads block[:target] never has to deal with a missing key.
  lblocks = []
  source_cursor = b1
  target_cursor = b2
  tblocks.each do |tblock|
    if tblock[:source][:begin] != source_cursor
      lblocks << {
        source: {begin: source_cursor, end: tblock[:source][:begin]},
        target: {begin: target_cursor, end: tblock[:target][:begin]},
        alignment: :empty
      }
    end
    lblocks << tblock
    source_cursor = tblock[:source][:end]
    target_cursor = tblock[:target][:end]
  end

  # trailing part of the source region after the last term
  if source_cursor < e1
    lblocks << {source: {begin: source_cursor, end: e1}, target: {begin: target_cursor, end: e2}, alignment: :empty}
  end

  lblocks
end
161
233
 
234
+
235
# Lazily enumerates the start offsets of the non-overlapping occurrences of
# +target+ in +str+, scanning from left to right.
#
# @param str [String] text to search in
# @param target [String] substring to look for
# @return [Enumerator] yields Integer offsets in ascending order
def indices(str, target)
  # String has no #len method, so #length is used. The step is at least one
  # character, so an empty target cannot keep matching the same position forever.
  step = [target.length, 1].max

  Enumerator.new do |yielder|
    # The cursor lives inside the block so that each enumeration starts over.
    position = 0
    while (idx = str.index(target, position))
      yielder << idx
      position = idx + step
    end
  end
end
245
+
162
246
  def transform_begin_position(begin_position)
163
247
  i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
164
248
  block = @block_alignment[:blocks][i]
165
249
 
166
- b = if block[:alignment] == :block
250
+ b = if block[:alignment] == :block || block[:alignment] == :term
167
251
  begin_position + block[:delta]
168
252
  elsif block[:alignment] == :empty
169
253
  if begin_position == block[:source][:begin]
170
254
  block[:target][:begin]
171
255
  else
172
- # raise "lost annotation"
173
256
  nil
174
257
  end
175
258
  else
@@ -182,13 +265,12 @@ class TextAlignment::TextAlignment
182
265
  i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
183
266
  block = @block_alignment[:blocks][i]
184
267
 
185
- e = if block[:alignment] == :block
268
+ e = if block[:alignment] == :block || block[:alignment] == :term
186
269
  end_position + block[:delta]
187
270
  elsif block[:alignment] == :empty
188
271
  if end_position == block[:source][:end]
189
272
  block[:target][:end]
190
273
  else
191
- # raise "lost annotation"
192
274
  nil
193
275
  end
194
276
  else
@@ -210,14 +292,14 @@ class TextAlignment::TextAlignment
210
292
  @lost_annotations = []
211
293
 
212
294
  denotations.each do |d|
213
- begin
214
- d.begin = transform_begin_position(d.begin);
215
- d.end = transform_end_position(d.end);
216
- rescue
217
- @lost_annotations << d
218
- d.begin = nil
219
- d.end = nil
220
- end
295
+ source = {begin:d.begin, end:d.end}
296
+ d.begin = transform_begin_position(d.begin);
297
+ d.end = transform_end_position(d.end);
298
+ raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @str2.length
299
+ rescue
300
+ @lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
301
+ d.begin = nil
302
+ d.end = nil
221
303
  end
222
304
 
223
305
  @lost_annotations
@@ -228,12 +310,12 @@ class TextAlignment::TextAlignment
228
310
  @lost_annotations = []
229
311
 
230
312
  r = hdenotations.collect do |d|
231
- new_d = begin
232
- d.dup.merge({span:transform_a_span(d[:span])})
233
- rescue
234
- @lost_annotations << d
235
- nil
236
- end
313
+ t = transform_a_span(d[:span])
314
+ raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @str2.length
315
+ new_d = d.dup.merge({span:t})
316
+ rescue
317
+ @lost_annotations << {source: d[:span], target:t}
318
+ nil
237
319
  end.compact
238
320
 
239
321
  r
@@ -247,12 +329,16 @@ class TextAlignment::TextAlignment
247
329
  @block_alignment[:blocks].each do |a|
248
330
  show += case a[:alignment]
249
331
  when :block
250
- "===== common =====\n" +
332
+ "===== common (block) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
333
+ stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
334
+ when :term
335
+ "===== common (term) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
251
336
  stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
252
337
  when :empty
253
- "<<<<< string 1\n" +
338
+ "xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
339
+ "<<<<< string 1 [#{a[:source][:begin]} - #{a[:source][:end]}]\n" +
254
340
  stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
255
- ">>>>> string 2\n" +
341
+ ">>>>> string 2 [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
256
342
  ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
257
343
  else
258
344
  astr1 = ''
@@ -286,7 +372,7 @@ class TextAlignment::TextAlignment
286
372
  end
287
373
  end.join('')
288
374
 
289
- "***** local mismatch\n" +
375
+ "***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
290
376
  "[#{astr1}]\n" +
291
377
  "[#{astr2}]\n\n"
292
378
  end
@@ -294,59 +380,4 @@ class TextAlignment::TextAlignment
294
380
  show
295
381
  end
296
382
 
297
- private
298
-
299
- def string_preprocessing(_str1, _str2)
300
- str1 = _str1.dup
301
- str2 = _str2.dup
302
- mappings = TextAlignment::MAPPINGS.dup
303
-
304
- ## single character mappings
305
- character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
306
- characters_from = character_mappings.collect{|m| m[0]}.join
307
- characters_to = character_mappings.collect{|m| m[1]}.join
308
- characters_to.gsub!(/-/, '\-')
309
-
310
- str1.tr!(characters_from, characters_to)
311
- str2.tr!(characters_from, characters_to)
312
-
313
- mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
314
-
315
- ## long to one character mappings
316
- pletters = TextAlignment::PADDING_LETTERS
317
-
318
- # find the padding letter for str1
319
- padding_letter1 = begin
320
- i = pletters.index{|l| str2.index(l).nil?}
321
- raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
322
- TextAlignment::PADDING_LETTERS[i]
323
- end
324
-
325
- # find the padding letter for str2
326
- padding_letter2 = begin
327
- i = pletters.index{|l| l != padding_letter1 && str1.index(l).nil?}
328
- raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
329
- TextAlignment::PADDING_LETTERS[i]
330
- end
331
-
332
- # ASCII foldings
333
- ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
334
- ascii_foldings.each do |f|
335
- from = f[1]
336
-
337
- if str2.index(f[0])
338
- to = f[0] + (padding_letter1 * (f[1].length - 1))
339
- str1.gsub!(from, to)
340
- end
341
-
342
- if str1.index(f[0])
343
- to = f[0] + (padding_letter2 * (f[1].length - 1))
344
- str2.gsub!(from, to)
345
- end
346
- end
347
- mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
348
-
349
- [str1, str2, mappings]
350
- end
351
-
352
383
  end
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.6'
2
+ VERSION = '0.7'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.6'
4
+ version: '0.7'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-10-06 00:00:00.000000000 Z
11
+ date: 2020-10-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary