text_alignment 0.6.2 → 0.7.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c4b2cdf0c257b74c6bec90b93d1907787f3c102108046731c2755684a1b156e9
4
- data.tar.gz: 85334dad09a046432503183e3d3ad83841612299038f2f2dac1f9d5d208e1939
3
+ metadata.gz: 972c5735de6aa85f5f9cd289e965f3ec3b8c38c492085e203686bc0ea897a293
4
+ data.tar.gz: fc0abe3043562c82af5a3c0cf1178586ffcee7921d7f11dbd5cdb93311cbd52a
5
5
  SHA512:
6
- metadata.gz: 9272bdd6c56717b53d39b3f2009259accb608ea86b99758b6a7ee9cee1e7b275330db55af4e0eba1eba80ee69275a21a3179243394d24139b3018996f659abe1
7
- data.tar.gz: a6a9d97d2bf81ac0c2972fd6e9d5202116156d8ff2e5e81a9bf0306e313dbc601522f887bcbcebff8b9d888cc06826a8ce69ba908dce29fa8decad85d53008af
6
+ metadata.gz: cfb1e21285616819cea937dce0f8422cddcd2ddb6ccf70d19bf2fd5851a33eede0760b4ed956049dfb3fb1cdfb7758d5bfbf19cff14ffedc2e1ffd80928200e0
7
+ data.tar.gz: 3fb72b7abe05c1a67db6c18448a0f601260b7d3f733e9b5e9fbe3ba5d9ec791e940bbdf70e00193658139480d320c2f9675426faff5e7e90d80eb9d8b07b074a
@@ -105,9 +105,7 @@ lost_annotations = []
105
105
  target_annotations = if source_annotations.class == Array
106
106
  align_mdoc(source_annotations, {text: target_text})
107
107
  else
108
- alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text)
109
-
110
- # pp alignment
108
+ alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text, source_annotations[:denotations])
111
109
 
112
110
  # verification
113
111
  # source_text = source_annotations[:text]
@@ -142,22 +140,7 @@ else
142
140
  puts "====="
143
141
  # exit
144
142
 
145
- # verification of source denotations
146
- puts "[Invalid source denotations]"
147
- source_annotations[:denotations] do |d|
148
- p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < source_text.length
149
- end
150
- puts "====="
151
- puts
152
-
153
143
  denotations = alignment.transform_hdenotations(source_annotations[:denotations])
154
- puts "[Invalid transformation]"
155
- denotations.each do |d|
156
- p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < target_text.length
157
- end
158
- puts "====="
159
- puts
160
-
161
144
  lost_annotations += alignment.lost_annotations if alignment.lost_annotations
162
145
 
163
146
  source_annotations.merge({text:target_text, denotations:denotations})
@@ -194,7 +177,9 @@ warn "denotations:\t#{target_annotations[:denotations].nil? ? 0 : target_annotat
194
177
 
195
178
  if lost_annotations
196
179
  warn "\n[lost annotations]"
197
- warn "#{lost_annotations.length}"
180
+ lost_annotations.each do |a|
181
+ p a
182
+ end
198
183
  end
199
184
 
200
185
  #puts target_annotations.to_json
@@ -17,9 +17,10 @@ class TextAlignment::MixedAlignment
17
17
  attr_reader :similarity
18
18
  attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final
19
19
 
20
# Builds the mixed alignment of two strings.
#
# The inputs are first run through string_preprocessing; the preprocessed
# strings and the mappings it returns are then handed to
# _compute_mixed_alignment.
#
# @param _str1 [String] first string
# @param _str2 [String] second string
# @raise [ArgumentError] if either string is nil
def initialize(_str1, _str2)
  raise ArgumentError, "nil string" if [_str1, _str2].any?(&:nil?)

  prepared1, prepared2, remaining_mappings = string_preprocessing(_str1, _str2)
  _compute_mixed_alignment(prepared1, prepared2, remaining_mappings)
end
@@ -62,7 +63,7 @@ class TextAlignment::MixedAlignment
62
63
  end
63
64
 
64
65
  cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
65
- @similarity = cmp.similarity
66
+ @similarity = compute_similarity(str1, str2, @sdiff)
66
67
  @str1_match_initial = cmp.str1_match_initial
67
68
  @str1_match_final = cmp.str1_match_final
68
69
  @str2_match_initial = cmp.str2_match_initial
@@ -137,4 +138,73 @@ class TextAlignment::MixedAlignment
137
138
  @position_map_begin = posmap_begin.sort.to_h
138
139
  @position_map_end = posmap_end.sort.to_h
139
140
  end
141
+
142
+ private
143
+
144
# Normalizes copies of the two strings using TextAlignment::MAPPINGS.
#
# Steps, in order:
# 1. one-character-to-one-character mappings are applied to both copies with
#    String#tr!;
# 2. a padding letter is chosen for each string from
#    TextAlignment::PADDING_LETTERS (stored in @padding_letter1 and
#    @padding_letter2);
# 3. for every one-character-to-many-characters mapping, the long form in one
#    string is replaced by the single character followed by padding letters
#    (keeping the length unchanged) when the other string contains that
#    single character.
#
# The arguments themselves are not modified.
#
# @param _str1 [String] first string
# @param _str2 [String] second string
# @return [Array] [processed str1, processed str2, mappings not consumed above]
# @raise [RuntimeError] if no padding letter can be found for either string
def string_preprocessing(_str1, _str2)
  str1 = _str1.dup
  str2 = _str2.dup
  mappings = TextAlignment::MAPPINGS.dup

  one_to_one = ->(m) { m[0].length == 1 && m[1].length == 1 }
  one_to_many = ->(m) { m[0].length == 1 && m[1].length > 1 }

  ## single character mappings
  char_pairs = mappings.select(&one_to_one)
  characters_from = char_pairs.map { |m| m[0] }.join
  characters_to = char_pairs.map { |m| m[1] }.join
  # a bare hyphen would be read as a range by tr
  characters_to.gsub!(/-/, '\-')

  [str1, str2].each { |s| s.tr!(characters_from, characters_to) }
  mappings.delete_if(&one_to_one)

  ## long to one character mappings
  candidates = TextAlignment::PADDING_LETTERS

  # the padding letter for str1 must not occur in str2
  letter1 = candidates.find { |l| str2.index(l).nil? }
  raise RuntimeError, "Could not find a padding letter for str1" if letter1.nil?
  @padding_letter1 = letter1

  # the padding letter for str2 must differ from the first and not occur in str1
  letter2 = candidates.find { |l| l != @padding_letter1 && str1.index(l).nil? }
  raise RuntimeError, "Could not find a padding letter for str2" if letter2.nil?
  @padding_letter2 = letter2

  # ASCII foldings
  mappings.select(&one_to_many).each do |folding|
    folded = folding[0]
    expanded = folding[1]
    pad_count = expanded.length - 1

    # The second test deliberately runs after str1 may have been rewritten.
    str1.gsub!(expanded, folded + (@padding_letter1 * pad_count)) if str2.index(folded)
    str2.gsub!(expanded, folded + (@padding_letter2 * pad_count)) if str1.index(folded)
  end
  mappings.delete_if(&one_to_many)

  [str1, str2, mappings]
end
196
+
197
# Scores how similar two strings are, based on their sdiff.
#
# The score is the number of matched ('=') sdiff entries whose old and new
# elements are both non-whitespace, divided by the non-whitespace length of
# the shorter string. Padding letters (@padding_letter1 in _s1,
# @padding_letter2 in _s2) are treated as whitespace when measuring length.
#
# @param _s1 [String] first string
# @param _s2 [String] second string
# @param sdiff [Enumerable, nil] entries responding to action, old_element
#   and new_element
# @return [Numeric] 0 when sdiff is nil or nothing matches, otherwise a Float
def compute_similarity(_s1, _s2, sdiff)
  return 0 if sdiff.nil?

  # only matches between non-whitespace letters count
  matched = sdiff.select { |d| d.action == '=' }
                 .count { |d| d.old_element =~ /\S/ && d.new_element =~ /\S/ }
  return 0 if matched.zero?

  visible1 = _s1.tr(@padding_letter1, ' ').scan(/\S/).count
  visible2 = _s2.tr(@padding_letter2, ' ').scan(/\S/).count
  shorter = visible1 < visible2 ? visible1 : visible2

  matched / shorter.to_f
end
209
+
140
210
  end
@@ -12,45 +12,46 @@ class TextAlignment::TextAlignment
12
12
  attr_reader :similarity
13
13
  attr_reader :lost_annotations
14
14
 
15
- def initialize(_str1, _str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
16
- raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
15
+ def initialize(str1, str2, denotations = nil, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
16
+ raise ArgumentError, "nil string" if str1.nil? || str2.nil?
17
17
 
18
- @block_alignment = {source_text:_str1, target_text:_str2}
18
+ @block_alignment = {source_text:str1, target_text:str2}
19
+ @str1 = str1
20
+ @str2 = str2
19
21
 
20
- str1, str2, mappings = string_preprocessing(_str1, _str2)
21
-
22
- # try exact match
22
+ ## Block exact match
23
23
  block_begin = str2.index(str1)
24
24
  unless block_begin.nil?
25
25
  @block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
26
- return @block_alignment
26
+ return
27
27
  end
28
28
 
29
- # try exact match
30
29
  block_begin = str2.downcase.index(str1.downcase)
31
30
  unless block_begin.nil?
32
31
  @block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
33
- return @block_alignment
32
+ return
34
33
  end
35
34
 
35
+
36
+ ## to find block alignments
36
37
  anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
37
38
 
38
- # To collect matched blocks
39
- mblocks = []
40
- while anchor = anchor_finder.get_next_anchor
41
- last = mblocks.last
42
- if last && (anchor[:source][:begin] == last[:source][:end] + 1) && (anchor[:target][:begin] == last[:target][:end] + 1)
43
- last[:source][:end] = anchor[:source][:end]
44
- last[:target][:end] = anchor[:target][:end]
39
+ blocks = []
40
+ while block = anchor_finder.get_next_anchor
41
+ last = blocks.last
42
+ if last && (block[:source][:begin] == last[:source][:end] + 1) && (block[:target][:begin] == last[:target][:end] + 1)
43
+ last[:source][:end] = block[:source][:end]
44
+ last[:target][:end] = block[:target][:end]
45
45
  else
46
- mblocks << anchor
46
+ blocks << block.merge(alignment: :block, delta: block[:target][:begin] - block[:source][:begin])
47
47
  end
48
48
  end
49
49
 
50
- # pp mblocks
50
+ # pp blocks
51
51
  # puts "-----"
52
52
  # puts
53
- # mblocks.each do |b|
53
+ # exit
54
+ # blocks.each do |b|
54
55
  # p [b[:source], b[:target]]
55
56
  # puts "---"
56
57
  # puts str1[b[:source][:begin] ... b[:source][:end]]
@@ -62,117 +63,202 @@ class TextAlignment::TextAlignment
62
63
  # puts "-=-=-=-=-"
63
64
  # puts
64
65
 
65
- ## To find block alignments
66
- @block_alignment[:blocks] = []
67
- return if mblocks.empty?
68
-
69
- # Initial step
70
- if mblocks[0][:source][:begin] > 0
71
- e1 = mblocks[0][:source][:begin]
72
- e2 = mblocks[0][:target][:begin]
66
+ ## to fill the gaps
67
+ last_block = nil
68
+ blocks2 = blocks.inject([]) do |sum, block|
69
+ b1 = last_block ? last_block[:source][:end] : 0
70
+ e1 = block[:source][:begin]
73
71
 
74
- if mblocks[0][:target][:begin] == 0
75
- @block_alignment[:blocks] << {source:{begin:0, end:e1}, target:{begin:0, end:0}, alignment: :empty}
72
+ sum += if b1 == e1
73
+ [block]
76
74
  else
77
- _str1 = str1[0 ... e1]
78
- _str2 = str2[0 ... e2]
75
+ b2 = last_block ? last_block[:target][:end] : 0
76
+ e2 = block[:target][:begin]
77
+
78
+ if b2 == e2
79
+ [
80
+ {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
81
+ block
82
+ ]
83
+ else
84
+ if b1 == 0 && b2 == 0
85
+ len_buffer = (e1 * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
86
+ b2 = e2 - len_buffer if e2 > len_buffer
87
+ end
79
88
 
80
- unless _str1.strip.empty?
81
- if _str2.strip.empty?
82
- @block_alignment[:blocks] << {source:{begin:0, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
89
+ _str1 = str1[b1 ... e1]
90
+ _str2 = str2[b2 ... e2]
91
+
92
+ if _str1.strip.empty? || _str2.strip.empty?
93
+ [
94
+ {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
95
+ block
96
+ ]
83
97
  else
84
- len_min = [_str1.length, _str2.length].min
85
- len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
86
- b1 = _str1.length < len_buffer ? 0 : e1 - len_buffer
87
- b2 = _str2.length < len_buffer ? 0 : e2 - len_buffer
88
-
89
- @block_alignment[:blocks] << {source:{begin:0, end:b1}, target:{begin:0, end:b2}, alignment: :empty} if b1 > 0
90
-
91
- _str1 = str1[b1 ... e1]
92
- _str2 = str2[b2 ... e2]
93
- alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
94
- similarity = alignment_similarity(_str1, _str2, alignment)
95
- if similarity < 0.6
96
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment: :empty, similarity: similarity}
97
- else
98
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment:alignment}
99
- end
98
+ local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations) << block
100
99
  end
101
100
  end
102
101
  end
102
+
103
+ last_block = block
104
+ sum
103
105
  end
104
- @block_alignment[:blocks] << mblocks[0].merge(alignment: :block)
105
-
106
- (1 ... mblocks.length).each do |i|
107
- b1 = mblocks[i - 1][:source][:end]
108
- b2 = mblocks[i - 1][:target][:end]
109
- e1 = mblocks[i][:source][:begin]
110
- e2 = mblocks[i][:target][:begin]
111
- _str1 = str1[b1 ... e1]
112
- _str2 = str2[b2 ... e2]
113
- unless _str1.strip.empty?
114
- if _str2.strip.empty?
115
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
106
+
107
+ # the last step
108
+ blocks2 += if last_block.nil?
109
+ local_alignment_blocks(str1, 0, str1.length, str2, 0, str2.length, denotations)
110
+ else
111
+ b1 = last_block[:source][:end]
112
+ if b1 < str1.length
113
+ e1 = str1.length
114
+
115
+ b2 = last_block[:target][:end]
116
+ if b2 < str2.length
117
+ len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
118
+ e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
119
+ local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
116
120
  else
117
- alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
118
- similarity = alignment_similarity(_str1, _str2, alignment)
119
- if similarity < 0.6
120
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: similarity}
121
- else
122
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
121
+ [{source:{begin:last_block[:source][:end], end:str1.length}, alignment: :empty}]
122
+ end
123
+ else
124
+ []
125
+ end
126
+ end
127
+
128
+ @block_alignment[:blocks] = blocks2
129
+ end
130
+
131
# Aligns the source window str1[b1...e1] with the target window str2[b2...e2]
# and returns the resulting list of alignment blocks.
#
# Strategy:
# 1. Term-based alignment: when denotations are given, every denotation inside
#    the source window is looked up, in order, in the target window. If every
#    term is found exactly once, each becomes a :term block and the text
#    between terms becomes :empty blocks.
# 2. Otherwise character-based alignment of the two windows is attempted
#    (see character_alignment_blocks).
#
# All positions in the returned blocks are absolute offsets into str1/str2.
#
# @param str1 [String] whole source text
# @param b1 [Integer] begin of the source window
# @param e1 [Integer] end (exclusive) of the source window
# @param str2 [String] whole target text
# @param b2 [Integer] begin of the target window
# @param e2 [Integer] end (exclusive) of the target window
# @param denotations [Array<Hash>, nil] hashes with a :span of {begin:, end:}
#   into str1; nil (or no usable term match) selects character-based alignment
# @return [Array<Hash>] blocks with :source, usually :target, :alignment and,
#   for :term blocks, :delta
def local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations = nil)
  tblocks = term_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
  return character_alignment_blocks(str1, b1, e1, str2, b2, e2) if tblocks.empty?

  # interleave the term blocks with :empty blocks covering the gaps before them
  last_tblock = nil
  lblocks = []
  tblocks.each do |tblock|
    tb1 = last_tblock ? last_tblock[:source][:end] : b1
    te1 = tblock[:source][:begin]

    unless te1 == tb1
      tb2 = last_tblock ? last_tblock[:target][:end] : b2
      te2 = tblock[:target][:begin]
      lblocks << {source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty}
    end

    lblocks << tblock
    last_tblock = tblock
  end

  # source text left after the last term
  if last_tblock[:source][:end] < e1
    if last_tblock[:target][:end] < e2
      lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, target:{begin:last_tblock[:target][:end], end:e2}, alignment: :empty}
    else
      # nothing is left in the target window, so the block carries no :target
      lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, alignment: :empty}
    end
  end

  lblocks
end

# Helper of local_alignment_blocks: term-based alignment.
#
# Returns one :term block per denotation located inside the source window, or
# an empty array when there are no denotations, when a term cannot be found in
# the target window, or when a term occurs again after the last match
# (ambiguous mapping).
def term_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
  return [] if denotations.nil?

  block2 = str2[b2 ... e2]

  # terms in reading order; for equal begins the longer span comes first
  spans = denotations.map { |d| d[:span] }.
    select { |span| span[:begin] >= b1 && span[:end] <= e1 }.
    sort_by { |span| [span[:begin], -span[:end]] }
  lexes = spans.map { |span| str1[span[:begin] ... span[:end]] }

  position = 0
  tblocks = []
  spans.each_with_index do |span, i|
    lex = lexes[i]
    r = block2.index(lex, position)

    # missing term found
    return [] if r.nil?

    position = r + lex.length
    target_begin = r + b2 # r is relative to the target window
    tblocks << {source:span, target:{begin:target_begin, end:target_begin + lex.length}, alignment: :term, delta: target_begin - span[:begin]}
  end

  # redundant matching found
  return [] if lexes.any? { |lex| block2.index(lex, position) }

  tblocks
end

# Helper of local_alignment_blocks: character-based alignment.
#
# Returns a single block covering both windows. The block is :empty when the
# source window is the whole source text and either end offset exceeds 1000
# (no alignment is attempted), or when the computed alignment has no sdiff;
# otherwise it carries the MixedAlignment and its similarity.
def character_alignment_blocks(str1, b1, e1, str2, b2, e2)
  unaligned = [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
  return unaligned if b1 == 0 && e1 == str1.length && ((e1 > 1000) || (e2 > 1000))

  block1 = str1[b1 ... e1]
  block2 = str2[b2 ... e2]

  alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
  return unaligned if alignment.sdiff.nil?

  [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
end

private :term_alignment_blocks, :character_alignment_blocks
164
239
 
240
+
241
# Enumerates the start offsets of the non-overlapping occurrences of target
# in str, from left to right.
#
# The search cursor lives inside the enumerator, so the returned Enumerator
# can be iterated more than once. An empty target matches at every offset
# from 0 to str.length; the cursor always advances by at least one so the
# enumeration terminates.
#
# @param str [String] text to search
# @param target [String] substring to look for
# @return [Enumerator<Integer>] offsets into str
def indices(str, target)
  step = [target.length, 1].max

  Enumerator.new do |yielder|
    position = 0
    while (idx = str.index(target, position))
      yielder << idx
      position = idx + step
    end
  end
end
251
+
165
252
  def transform_begin_position(begin_position)
166
253
  i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
167
254
  block = @block_alignment[:blocks][i]
168
255
 
169
- b = if block[:alignment] == :block
256
+ b = if block[:alignment] == :block || block[:alignment] == :term
170
257
  begin_position + block[:delta]
171
258
  elsif block[:alignment] == :empty
172
259
  if begin_position == block[:source][:begin]
173
260
  block[:target][:begin]
174
261
  else
175
- # raise "lost annotation"
176
262
  nil
177
263
  end
178
264
  else
@@ -185,13 +271,12 @@ class TextAlignment::TextAlignment
185
271
  i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
186
272
  block = @block_alignment[:blocks][i]
187
273
 
188
- e = if block[:alignment] == :block
274
+ e = if block[:alignment] == :block || block[:alignment] == :term
189
275
  end_position + block[:delta]
190
276
  elsif block[:alignment] == :empty
191
277
  if end_position == block[:source][:end]
192
278
  block[:target][:end]
193
279
  else
194
- # raise "lost annotation"
195
280
  nil
196
281
  end
197
282
  else
@@ -213,14 +298,14 @@ class TextAlignment::TextAlignment
213
298
  @lost_annotations = []
214
299
 
215
300
  denotations.each do |d|
216
- begin
217
- d.begin = transform_begin_position(d.begin);
218
- d.end = transform_end_position(d.end);
219
- rescue
220
- @lost_annotations << d
221
- d.begin = nil
222
- d.end = nil
223
- end
301
+ source = {begin:d.begin, end:d.end}
302
+ d.begin = transform_begin_position(d.begin);
303
+ d.end = transform_end_position(d.end);
304
+ raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @str2.length
305
+ rescue
306
+ @lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
307
+ d.begin = nil
308
+ d.end = nil
224
309
  end
225
310
 
226
311
  @lost_annotations
@@ -231,12 +316,12 @@ class TextAlignment::TextAlignment
231
316
  @lost_annotations = []
232
317
 
233
318
  r = hdenotations.collect do |d|
234
- new_d = begin
235
- d.dup.merge({span:transform_a_span(d[:span])})
236
- rescue
237
- @lost_annotations << d
238
- nil
239
- end
319
+ t = transform_a_span(d[:span])
320
+ raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @str2.length
321
+ new_d = d.dup.merge({span:t})
322
+ rescue
323
+ @lost_annotations << {source: d[:span], target:t}
324
+ nil
240
325
  end.compact
241
326
 
242
327
  r
@@ -250,13 +335,16 @@ class TextAlignment::TextAlignment
250
335
  @block_alignment[:blocks].each do |a|
251
336
  show += case a[:alignment]
252
337
  when :block
253
- "===== common =====\n" +
338
+ "===== common (block) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
339
+ stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
340
+ when :term
341
+ "===== common (term) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
254
342
  stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
255
343
  when :empty
256
344
  "xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
257
- "<<<<< string 1\n" +
345
+ "<<<<< string 1 [#{a[:source][:begin]} - #{a[:source][:end]}]\n" +
258
346
  stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
259
- ">>>>> string 2\n" +
347
+ ">>>>> string 2 [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
260
348
  ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
261
349
  else
262
350
  astr1 = ''
@@ -290,7 +378,7 @@ class TextAlignment::TextAlignment
290
378
  end
291
379
  end.join('')
292
380
 
293
- "***** local mismatch\n" +
381
+ "***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
294
382
  "[#{astr1}]\n" +
295
383
  "[#{astr2}]\n\n"
296
384
  end
@@ -298,71 +386,4 @@ class TextAlignment::TextAlignment
298
386
  show
299
387
  end
300
388
 
301
- private
302
-
303
- def string_preprocessing(_str1, _str2)
304
- str1 = _str1.dup
305
- str2 = _str2.dup
306
- mappings = TextAlignment::MAPPINGS.dup
307
-
308
- ## single character mappings
309
- character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
310
- characters_from = character_mappings.collect{|m| m[0]}.join
311
- characters_to = character_mappings.collect{|m| m[1]}.join
312
- characters_to.gsub!(/-/, '\-')
313
-
314
- str1.tr!(characters_from, characters_to)
315
- str2.tr!(characters_from, characters_to)
316
-
317
- mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
318
-
319
- ## long to one character mappings
320
- pletters = TextAlignment::PADDING_LETTERS
321
-
322
- # find the padding letter for str1
323
- @padding_letter1 = begin
324
- i = pletters.index{|l| str2.index(l).nil?}
325
- raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
326
- TextAlignment::PADDING_LETTERS[i]
327
- end
328
-
329
- # find the padding letter for str2
330
- @padding_letter2 = begin
331
- i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
332
- raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
333
- TextAlignment::PADDING_LETTERS[i]
334
- end
335
-
336
- # ASCII foldings
337
- ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
338
- ascii_foldings.each do |f|
339
- from = f[1]
340
-
341
- if str2.index(f[0])
342
- to = f[0] + (@padding_letter1 * (f[1].length - 1))
343
- str1.gsub!(from, to)
344
- end
345
-
346
- if str1.index(f[0])
347
- to = f[0] + (@padding_letter2 * (f[1].length - 1))
348
- str2.gsub!(from, to)
349
- end
350
- end
351
- mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
352
-
353
- [str1, str2, mappings]
354
- end
355
-
356
- def alignment_similarity(_s1, _s2, alignment)
357
- return 0 if alignment.sdiff.nil?
358
-
359
- # compute the lcs only with non-whitespace letters
360
- lcs = alignment.sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
361
-
362
- s1 = _s1.tr(@padding_letter1, ' ')
363
- s2 = _s2.tr(@padding_letter2, ' ')
364
-
365
- similarity = 2 * lcs / (s1.scan(/\S/).count + s2.scan(/\S/).count).to_f
366
- end
367
-
368
389
  end
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.6.2'
2
+ VERSION = '0.7.2'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.2
4
+ version: 0.7.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-10-06 00:00:00.000000000 Z
11
+ date: 2020-10-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary