text_alignment 0.6 → 0.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: dc820991f5f694f154b94c369158909ccba3760829e0d881c7fd2e6ef7ddd149
4
- data.tar.gz: 40ae6f2e388405426a77682bd1a3fb7a3c853076eced9b7301b632081dfd0a57
3
+ metadata.gz: 40796a906cbc366312741d3b410aaee15d28b778980674bb79d19ea1b6364d02
4
+ data.tar.gz: eb0121440bc005232b3d35eb1867b46004eb14c894b630f2ef49d311c4d99a26
5
5
  SHA512:
6
- metadata.gz: 5802241b4a8394d3c570c1d4b8f5e1d7706c72852e2d6e6fb23bda2f6e2972fa09f7001695db026667144e2af982eeb91ed0b700bd8151af6df794c98e3c069b
7
- data.tar.gz: 8d7c93acbef6ab12bb2a0291444a7bcc73b0236bb5b0d06d274e95aa30c9ffc829965653b58270686147a9ac30ccf570518b3ad266120b320dfb20cd1620f5f9
6
+ metadata.gz: e29037021763c9d4f581b278b72d319d364431e8abad48b45f21d68df102928eec993bbba582383a8a01b6ba7855f3542d2cc620ffa055a90aea08c2f5371117
7
+ data.tar.gz: 63c0e5b712b89012082197d3c4400cd705283e6d908f40003a399cb2efdc2b3aa4f6e445eb90e1ab4d7d39a9d14421881f945e0483c14ec52c1af7176ca07e81
@@ -105,9 +105,7 @@ lost_annotations = []
105
105
  target_annotations = if source_annotations.class == Array
106
106
  align_mdoc(source_annotations, {text: target_text})
107
107
  else
108
- alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text)
109
-
110
- # pp alignment
108
+ alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text, source_annotations[:denotations])
111
109
 
112
110
  # verification
113
111
  # source_text = source_annotations[:text]
@@ -142,22 +140,7 @@ else
142
140
  puts "====="
143
141
  # exit
144
142
 
145
- # verification of source denotations
146
- puts "[Invalid source denotations]"
147
- source_annotations[:denotations] do |d|
148
- p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < source_text.length
149
- end
150
- puts "====="
151
- puts
152
-
153
143
  denotations = alignment.transform_hdenotations(source_annotations[:denotations])
154
- puts "[Invalid transformation]"
155
- denotations.each do |d|
156
- p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < target_text.length
157
- end
158
- puts "====="
159
- puts
160
-
161
144
  lost_annotations += alignment.lost_annotations if alignment.lost_annotations
162
145
 
163
146
  source_annotations.merge({text:target_text, denotations:denotations})
@@ -194,7 +177,9 @@ warn "denotations:\t#{target_annotations[:denotations].nil? ? 0 : target_annotat
194
177
 
195
178
  if lost_annotations
196
179
  warn "\n[lost annotations]"
197
- warn "#{lost_annotations.length}"
180
+ lost_annotations.each do |a|
181
+ p a
182
+ end
198
183
  end
199
184
 
200
185
  #puts target_annotations.to_json
@@ -17,9 +17,10 @@ class TextAlignment::MixedAlignment
17
17
  attr_reader :similarity
18
18
  attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final
19
19
 
20
- def initialize(str1, str2, mappings = [])
21
- raise ArgumentError, "nil string" if str1.nil? || str2.nil?
22
- mappings ||= []
20
+ def initialize(_str1, _str2)
21
+ raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
22
+
23
+ str1, str2, mappings = string_preprocessing(_str1, _str2)
23
24
 
24
25
  _compute_mixed_alignment(str1, str2, mappings)
25
26
  end
@@ -62,7 +63,7 @@ class TextAlignment::MixedAlignment
62
63
  end
63
64
 
64
65
  cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
65
- @similarity = cmp.similarity
66
+ @similarity = compute_similarity(str1, str2, @sdiff)
66
67
  @str1_match_initial = cmp.str1_match_initial
67
68
  @str1_match_final = cmp.str1_match_final
68
69
  @str2_match_initial = cmp.str2_match_initial
@@ -137,4 +138,73 @@ class TextAlignment::MixedAlignment
137
138
  @position_map_begin = posmap_begin.sort.to_h
138
139
  @position_map_end = posmap_end.sort.to_h
139
140
  end
141
+
142
+ private
143
+
144
+ def string_preprocessing(_str1, _str2)
145
+ str1 = _str1.dup
146
+ str2 = _str2.dup
147
+ mappings = TextAlignment::MAPPINGS.dup
148
+
149
+ ## single character mappings
150
+ character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
151
+ characters_from = character_mappings.collect{|m| m[0]}.join
152
+ characters_to = character_mappings.collect{|m| m[1]}.join
153
+ characters_to.gsub!(/-/, '\-')
154
+
155
+ str1.tr!(characters_from, characters_to)
156
+ str2.tr!(characters_from, characters_to)
157
+
158
+ mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
159
+
160
+ ## long to one character mappings
161
+ pletters = TextAlignment::PADDING_LETTERS
162
+
163
+ # find the padding letter for str1
164
+ @padding_letter1 = begin
165
+ i = pletters.index{|l| str2.index(l).nil?}
166
+ raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
167
+ TextAlignment::PADDING_LETTERS[i]
168
+ end
169
+
170
+ # find the padding letter for str2
171
+ @padding_letter2 = begin
172
+ i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
173
+ raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
174
+ TextAlignment::PADDING_LETTERS[i]
175
+ end
176
+
177
+ # ASCII foldings
178
+ ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
179
+ ascii_foldings.each do |f|
180
+ from = f[1]
181
+
182
+ if str2.index(f[0])
183
+ to = f[0] + (@padding_letter1 * (f[1].length - 1))
184
+ str1.gsub!(from, to)
185
+ end
186
+
187
+ if str1.index(f[0])
188
+ to = f[0] + (@padding_letter2 * (f[1].length - 1))
189
+ str2.gsub!(from, to)
190
+ end
191
+ end
192
+ mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
193
+
194
+ [str1, str2, mappings]
195
+ end
196
+
197
+ def compute_similarity(_s1, _s2, sdiff)
198
+ return 0 if sdiff.nil?
199
+
200
+ # compute the lcs only with non-whitespace letters
201
+ lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
202
+ return 0 if lcs == 0
203
+
204
+ s1 = _s1.tr(@padding_letter1, ' ')
205
+ s2 = _s2.tr(@padding_letter2, ' ')
206
+
207
+ similarity = lcs / [s1.scan(/\S/).count, s2.scan(/\S/).count].min.to_f
208
+ end
209
+
140
210
  end
@@ -12,45 +12,46 @@ class TextAlignment::TextAlignment
12
12
  attr_reader :similarity
13
13
  attr_reader :lost_annotations
14
14
 
15
- def initialize(_str1, _str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
16
- raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
15
+ def initialize(str1, str2, denotations = nil, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
16
+ raise ArgumentError, "nil string" if str1.nil? || str2.nil?
17
17
 
18
- @block_alignment = {source_text:_str1, target_text:_str2}
18
+ @block_alignment = {source_text:str1, target_text:str2}
19
+ @str1 = str1
20
+ @str2 = str2
19
21
 
20
- str1, str2, mappings = string_preprocessing(_str1, _str2)
21
-
22
- # try exact match
22
+ ## Block exact match
23
23
  block_begin = str2.index(str1)
24
24
  unless block_begin.nil?
25
25
  @block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
26
- return @block_alignment
26
+ return
27
27
  end
28
28
 
29
- # try exact match
30
29
  block_begin = str2.downcase.index(str1.downcase)
31
30
  unless block_begin.nil?
32
31
  @block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
33
- return @block_alignment
32
+ return
34
33
  end
35
34
 
35
+
36
+ ## to find block alignments
36
37
  anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
37
38
 
38
- # To collect matched blocks
39
- mblocks = []
40
- while anchor = anchor_finder.get_next_anchor
41
- last = mblocks.last
42
- if last && (anchor[:source][:begin] == last[:source][:end] + 1) && (anchor[:target][:begin] == last[:target][:end] + 1)
43
- last[:source][:end] = anchor[:source][:end]
44
- last[:target][:end] = anchor[:target][:end]
39
+ blocks = []
40
+ while block = anchor_finder.get_next_anchor
41
+ last = blocks.last
42
+ if last && (block[:source][:begin] == last[:source][:end] + 1) && (block[:target][:begin] == last[:target][:end] + 1)
43
+ last[:source][:end] = block[:source][:end]
44
+ last[:target][:end] = block[:target][:end]
45
45
  else
46
- mblocks << anchor
46
+ blocks << block.merge(alignment: :block, delta: block[:target][:begin] - block[:source][:begin])
47
47
  end
48
48
  end
49
49
 
50
- # pp mblocks
50
+ # pp blocks
51
51
  # puts "-----"
52
52
  # puts
53
- # mblocks.each do |b|
53
+ # exit
54
+ # blocks.each do |b|
54
55
  # p [b[:source], b[:target]]
55
56
  # puts "---"
56
57
  # puts str1[b[:source][:begin] ... b[:source][:end]]
@@ -62,114 +63,196 @@ class TextAlignment::TextAlignment
62
63
  # puts "-=-=-=-=-"
63
64
  # puts
64
65
 
65
- ## To find block alignments
66
- @block_alignment[:blocks] = []
67
- return if mblocks.empty?
66
+ ## to fill the gaps
67
+ last_block = nil
68
+ blocks2 = blocks.inject([]) do |sum, block|
69
+ b1 = last_block ? last_block[:source][:end] : 0
70
+ e1 = block[:source][:begin]
68
71
 
69
- # Initial step
70
- if mblocks[0][:source][:begin] > 0
71
- e1 = mblocks[0][:source][:begin]
72
- e2 = mblocks[0][:target][:begin]
73
-
74
- if mblocks[0][:target][:begin] == 0
75
- @block_alignment[:blocks] << {source:{begin:0, end:e1}, target:{begin:0, end:0}, alignment: :empty}
72
+ sum += if b1 == e1
73
+ [block]
76
74
  else
77
- _str1 = str1[0 ... e1]
78
- _str2 = str2[0 ... e2]
75
+ b2 = last_block ? last_block[:target][:end] : 0
76
+ e2 = block[:target][:begin]
77
+
78
+ if b2 == e2
79
+ [
80
+ {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
81
+ block
82
+ ]
83
+ else
84
+ if b1 == 0 && b2 == 0
85
+ len_buffer = (e1 * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
86
+ b2 = e2 - len_buffer if e2 > len_buffer
87
+ end
88
+
89
+ _str1 = str1[b1 ... e1]
90
+ _str2 = str2[b2 ... e2]
79
91
 
80
- unless _str1.strip.empty?
81
- if _str2.strip.empty?
82
- @block_alignment[:blocks] << {source:{begin:0, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
92
+ if _str1.strip.empty? || _str2.strip.empty?
93
+ [
94
+ {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
95
+ block
96
+ ]
83
97
  else
84
- len_min = [_str1.length, _str2.length].min
85
- len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
86
- b1 = _str1.length < len_buffer ? 0 : e1 - len_buffer
87
- b2 = _str2.length < len_buffer ? 0 : e2 - len_buffer
88
-
89
- @block_alignment[:blocks] << {source:{begin:0, end:b1}, target:{begin:0, end:b2}, alignment: :empty} if b1 > 0
90
-
91
- _str1 = str1[b1 ... e1]
92
- _str2 = str2[b2 ... e2]
93
- alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
94
- if alignment.similarity < 0.6
95
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
96
- else
97
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment:alignment}
98
- end
98
+ local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations) << block
99
99
  end
100
100
  end
101
101
  end
102
+
103
+ last_block = block
104
+ sum
102
105
  end
103
- @block_alignment[:blocks] << mblocks[0].merge(alignment: :block)
104
-
105
- (1 ... mblocks.length).each do |i|
106
- b1 = mblocks[i - 1][:source][:end]
107
- b2 = mblocks[i - 1][:target][:end]
108
- e1 = mblocks[i][:source][:begin]
109
- e2 = mblocks[i][:target][:begin]
110
- _str1 = str1[b1 ... e1]
111
- _str2 = str2[b2 ... e2]
112
- unless _str1.strip.empty?
113
- if _str2.strip.empty?
114
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
106
+
107
+ # the last step
108
+ blocks2 += if last_block.nil?
109
+ local_alignment_blocks(str1, 0, str1.length, str2, 0, str2.length, denotations)
110
+ else
111
+ b1 = last_block[:source][:end]
112
+ if b1 < str1.length
113
+ e1 = str1.length
114
+
115
+ b2 = last_block[:target][:end]
116
+ if b2 < str2.length
117
+ len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
118
+ e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
119
+ local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
115
120
  else
116
- alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
117
- if alignment.similarity < 0.6
118
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
119
- else
120
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
121
- end
121
+ [{source:{begin:last_block[:source][:end], end:str1.length}, alignment: :empty}]
122
122
  end
123
123
  end
124
- @block_alignment[:blocks] << mblocks[i].merge(alignment: :block)
125
124
  end
126
125
 
127
- # Final step
128
- if mblocks[-1][:source][:end] < str1.length && mblocks[-1][:target][:end] < str2.length
129
- b1 = mblocks[-1][:source][:end]
130
- b2 = mblocks[-1][:target][:end]
131
- _str1 = str1[b1 ... str1.length]
132
- _str2 = str2[b2 ... str2.length]
126
+ @block_alignment[:blocks] = blocks2
127
+ end
133
128
 
134
- unless _str1.strip.empty?
135
- if _str2.strip.empty?
136
- @block_alignment[:blocks] << {source:{begin:b1, end:str1.length}, target:{begin:b2, end:str2.length}, alignment: :empty}
129
+ def local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations = nil)
130
+ block2 = str2[b2 ... e2]
131
+
132
+ ## term-based alignment
133
+ tblocks = if denotations
134
+ ds_in_scope = denotations.select{|d| d[:span][:begin] >= b1 && d[:span][:end] <= e1}.
135
+ sort{|d1, d2| d1[:span][:begin] <=> d2[:span][:begin] || d2[:span][:end] <=> d1[:span][:end] }.
136
+ map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
137
+
138
+ position = 0
139
+ tblocks = ds_in_scope.map do |term|
140
+ lex = term[:lex]
141
+ r = block2.index(lex, position)
142
+ if r.nil?
143
+ position = nil
144
+ break
145
+ end
146
+ position = r + lex.length
147
+ {source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, delta: r - term[:span][:begin]}
148
+ end
149
+
150
+ # missing term found
151
+ tblocks = [] if position.nil?
152
+
153
+ # redundant matching found
154
+ unless position.nil?
155
+ ds_in_scope.each do |term|
156
+ lex = term[:lex]
157
+ look_forward = block2.index(lex, position)
158
+ unless look_forward.nil?
159
+ puts lex
160
+ tblocks = []
161
+ break
162
+ end
163
+ end
164
+ end
165
+
166
+ tblocks
167
+ end
168
+
169
+ if tblocks.empty?
170
+ if b1 == 0 && e1 == str1.length
171
+ if str2.length > 2000
172
+ [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
137
173
  else
138
- len_min = [_str1.length, _str2.length].min
139
- len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
140
- e1 = _str1.length < len_buffer ? str1.length : b1 + len_buffer
141
- e2 = _str2.length < len_buffer ? str2.length : b2 + len_buffer
142
- _str1 = str1[b1 ... e1]
143
- _str2 = str2[b2 ... e2]
174
+ block1 = str1[b1 ... e1]
175
+ block2 = str2[b2 ... e2]
176
+
177
+ ## character-based alignment
178
+ alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
179
+ [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}]
180
+ # alignment = :alignment
181
+ # [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :alignment}]
182
+ end
183
+ else
184
+ block1 = str1[b1 ... e1]
185
+ block2 = str2[b2 ... e2]
186
+
187
+ ## character-based alignment
188
+ alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
189
+ [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}]
190
+ # alignmnet = :alignment
191
+ # [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :alignment}]
192
+ end
193
+ else
194
+ last_tblock = nil
195
+ lblocks = tblocks.inject([]) do |sum, tblock|
196
+ tb1 = last_tblock ? last_tblock[:source][:end] : b1
197
+ te1 = tblock[:source][:begin]
144
198
 
145
- alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
146
- if alignment.similarity < 0.6
147
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
199
+ sum += if te1 == tb1
200
+ [tblock]
201
+ else
202
+ tb2 = last_tblock ? tlast_block[:target][:end] : b2
203
+ te2 = tblock[:target][:begin]
204
+
205
+ if b2 == e2
206
+ [
207
+ {source:{begin:tb1, end:te1}, alignment: :empty},
208
+ tblock
209
+ ]
148
210
  else
149
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
211
+ [
212
+ {source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty},
213
+ tblock
214
+ ]
150
215
  end
216
+ end
151
217
 
152
- @block_alignment[:blocks] << {source:{begin:e1, end:-1}, target:{begin:e2, end:-1}, alignment: :empty} if e1 < str1.length
218
+ last_tblock = tblock
219
+ sum
220
+ end
221
+
222
+ if last_tblock[:source][:end] < e1
223
+ if last_tblock[:target][:end] < e2
224
+ lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, target:{begin:last_tblock[:target][:end], end:e2}, alignment: :empty}
225
+ else
226
+ lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, alignment: :empty}
153
227
  end
154
228
  end
155
- end
156
229
 
157
- @block_alignment[:blocks].each do |a|
158
- a[:delta] = a[:target][:begin] - a[:source][:begin]
230
+ lblocks
159
231
  end
160
232
  end
161
233
 
234
+
235
+ def indices(str, target)
236
+ position = 0
237
+ len = target.len
238
+ Enumerator.new do |yielder|
239
+ while idx = str.index(target, position)
240
+ yielder << idx
241
+ position = idx + len
242
+ end
243
+ end
244
+ end
245
+
162
246
  def transform_begin_position(begin_position)
163
247
  i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
164
248
  block = @block_alignment[:blocks][i]
165
249
 
166
- b = if block[:alignment] == :block
250
+ b = if block[:alignment] == :block || block[:alignment] == :term
167
251
  begin_position + block[:delta]
168
252
  elsif block[:alignment] == :empty
169
253
  if begin_position == block[:source][:begin]
170
254
  block[:target][:begin]
171
255
  else
172
- # raise "lost annotation"
173
256
  nil
174
257
  end
175
258
  else
@@ -182,13 +265,12 @@ class TextAlignment::TextAlignment
182
265
  i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
183
266
  block = @block_alignment[:blocks][i]
184
267
 
185
- e = if block[:alignment] == :block
268
+ e = if block[:alignment] == :block || block[:alignment] == :term
186
269
  end_position + block[:delta]
187
270
  elsif block[:alignment] == :empty
188
271
  if end_position == block[:source][:end]
189
272
  block[:target][:end]
190
273
  else
191
- # raise "lost annotation"
192
274
  nil
193
275
  end
194
276
  else
@@ -210,14 +292,14 @@ class TextAlignment::TextAlignment
210
292
  @lost_annotations = []
211
293
 
212
294
  denotations.each do |d|
213
- begin
214
- d.begin = transform_begin_position(d.begin);
215
- d.end = transform_end_position(d.end);
216
- rescue
217
- @lost_annotations << d
218
- d.begin = nil
219
- d.end = nil
220
- end
295
+ source = {begin:d.begin, end:d.end}
296
+ d.begin = transform_begin_position(d.begin);
297
+ d.end = transform_end_position(d.end);
298
+ raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @str2.length
299
+ rescue
300
+ @lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
301
+ d.begin = nil
302
+ d.end = nil
221
303
  end
222
304
 
223
305
  @lost_annotations
@@ -228,12 +310,12 @@ class TextAlignment::TextAlignment
228
310
  @lost_annotations = []
229
311
 
230
312
  r = hdenotations.collect do |d|
231
- new_d = begin
232
- d.dup.merge({span:transform_a_span(d[:span])})
233
- rescue
234
- @lost_annotations << d
235
- nil
236
- end
313
+ t = transform_a_span(d[:span])
314
+ raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @str2.length
315
+ new_d = d.dup.merge({span:t})
316
+ rescue
317
+ @lost_annotations << {source: d[:span], target:t}
318
+ nil
237
319
  end.compact
238
320
 
239
321
  r
@@ -247,12 +329,16 @@ class TextAlignment::TextAlignment
247
329
  @block_alignment[:blocks].each do |a|
248
330
  show += case a[:alignment]
249
331
  when :block
250
- "===== common =====\n" +
332
+ "===== common (block) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
333
+ stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
334
+ when :term
335
+ "===== common (term) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
251
336
  stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
252
337
  when :empty
253
- "<<<<< string 1\n" +
338
+ "xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
339
+ "<<<<< string 1 [#{a[:source][:begin]} - #{a[:source][:end]}]\n" +
254
340
  stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
255
- ">>>>> string 2\n" +
341
+ ">>>>> string 2 [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
256
342
  ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
257
343
  else
258
344
  astr1 = ''
@@ -286,7 +372,7 @@ class TextAlignment::TextAlignment
286
372
  end
287
373
  end.join('')
288
374
 
289
- "***** local mismatch\n" +
375
+ "***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
290
376
  "[#{astr1}]\n" +
291
377
  "[#{astr2}]\n\n"
292
378
  end
@@ -294,59 +380,4 @@ class TextAlignment::TextAlignment
294
380
  show
295
381
  end
296
382
 
297
- private
298
-
299
- def string_preprocessing(_str1, _str2)
300
- str1 = _str1.dup
301
- str2 = _str2.dup
302
- mappings = TextAlignment::MAPPINGS.dup
303
-
304
- ## single character mappings
305
- character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
306
- characters_from = character_mappings.collect{|m| m[0]}.join
307
- characters_to = character_mappings.collect{|m| m[1]}.join
308
- characters_to.gsub!(/-/, '\-')
309
-
310
- str1.tr!(characters_from, characters_to)
311
- str2.tr!(characters_from, characters_to)
312
-
313
- mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
314
-
315
- ## long to one character mappings
316
- pletters = TextAlignment::PADDING_LETTERS
317
-
318
- # find the padding letter for str1
319
- padding_letter1 = begin
320
- i = pletters.index{|l| str2.index(l).nil?}
321
- raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
322
- TextAlignment::PADDING_LETTERS[i]
323
- end
324
-
325
- # find the padding letter for str2
326
- padding_letter2 = begin
327
- i = pletters.index{|l| l != padding_letter1 && str1.index(l).nil?}
328
- raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
329
- TextAlignment::PADDING_LETTERS[i]
330
- end
331
-
332
- # ASCII foldings
333
- ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
334
- ascii_foldings.each do |f|
335
- from = f[1]
336
-
337
- if str2.index(f[0])
338
- to = f[0] + (padding_letter1 * (f[1].length - 1))
339
- str1.gsub!(from, to)
340
- end
341
-
342
- if str1.index(f[0])
343
- to = f[0] + (padding_letter2 * (f[1].length - 1))
344
- str2.gsub!(from, to)
345
- end
346
- end
347
- mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
348
-
349
- [str1, str2, mappings]
350
- end
351
-
352
383
  end
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.6'
2
+ VERSION = '0.7'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.6'
4
+ version: '0.7'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-10-06 00:00:00.000000000 Z
11
+ date: 2020-10-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary