text_alignment 0.6.2 → 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c4b2cdf0c257b74c6bec90b93d1907787f3c102108046731c2755684a1b156e9
4
- data.tar.gz: 85334dad09a046432503183e3d3ad83841612299038f2f2dac1f9d5d208e1939
3
+ metadata.gz: 972c5735de6aa85f5f9cd289e965f3ec3b8c38c492085e203686bc0ea897a293
4
+ data.tar.gz: fc0abe3043562c82af5a3c0cf1178586ffcee7921d7f11dbd5cdb93311cbd52a
5
5
  SHA512:
6
- metadata.gz: 9272bdd6c56717b53d39b3f2009259accb608ea86b99758b6a7ee9cee1e7b275330db55af4e0eba1eba80ee69275a21a3179243394d24139b3018996f659abe1
7
- data.tar.gz: a6a9d97d2bf81ac0c2972fd6e9d5202116156d8ff2e5e81a9bf0306e313dbc601522f887bcbcebff8b9d888cc06826a8ce69ba908dce29fa8decad85d53008af
6
+ metadata.gz: cfb1e21285616819cea937dce0f8422cddcd2ddb6ccf70d19bf2fd5851a33eede0760b4ed956049dfb3fb1cdfb7758d5bfbf19cff14ffedc2e1ffd80928200e0
7
+ data.tar.gz: 3fb72b7abe05c1a67db6c18448a0f601260b7d3f733e9b5e9fbe3ba5d9ec791e940bbdf70e00193658139480d320c2f9675426faff5e7e90d80eb9d8b07b074a
@@ -105,9 +105,7 @@ lost_annotations = []
105
105
  target_annotations = if source_annotations.class == Array
106
106
  align_mdoc(source_annotations, {text: target_text})
107
107
  else
108
- alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text)
109
-
110
- # pp alignment
108
+ alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text, source_annotations[:denotations])
111
109
 
112
110
  # verification
113
111
  # source_text = source_annotations[:text]
@@ -142,22 +140,7 @@ else
142
140
  puts "====="
143
141
  # exit
144
142
 
145
- # verification of source denotations
146
- puts "[Invalid source denotations]"
147
- source_annotations[:denotations] do |d|
148
- p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < source_text.length
149
- end
150
- puts "====="
151
- puts
152
-
153
143
  denotations = alignment.transform_hdenotations(source_annotations[:denotations])
154
- puts "[Invalid transformation]"
155
- denotations.each do |d|
156
- p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < target_text.length
157
- end
158
- puts "====="
159
- puts
160
-
161
144
  lost_annotations += alignment.lost_annotations if alignment.lost_annotations
162
145
 
163
146
  source_annotations.merge({text:target_text, denotations:denotations})
@@ -194,7 +177,9 @@ warn "denotations:\t#{target_annotations[:denotations].nil? ? 0 : target_annotat
194
177
 
195
178
  if lost_annotations
196
179
  warn "\n[lost annotations]"
197
- warn "#{lost_annotations.length}"
180
+ lost_annotations.each do |a|
181
+ p a
182
+ end
198
183
  end
199
184
 
200
185
  #puts target_annotations.to_json
@@ -17,9 +17,10 @@ class TextAlignment::MixedAlignment
17
17
  attr_reader :similarity
18
18
  attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final
19
19
 
20
- def initialize(str1, str2, mappings = [])
21
- raise ArgumentError, "nil string" if str1.nil? || str2.nil?
22
- mappings ||= []
20
+ def initialize(_str1, _str2)
21
+ raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
22
+
23
+ str1, str2, mappings = string_preprocessing(_str1, _str2)
23
24
 
24
25
  _compute_mixed_alignment(str1, str2, mappings)
25
26
  end
@@ -62,7 +63,7 @@ class TextAlignment::MixedAlignment
62
63
  end
63
64
 
64
65
  cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
65
- @similarity = cmp.similarity
66
+ @similarity = compute_similarity(str1, str2, @sdiff)
66
67
  @str1_match_initial = cmp.str1_match_initial
67
68
  @str1_match_final = cmp.str1_match_final
68
69
  @str2_match_initial = cmp.str2_match_initial
@@ -137,4 +138,73 @@ class TextAlignment::MixedAlignment
137
138
  @position_map_begin = posmap_begin.sort.to_h
138
139
  @position_map_end = posmap_end.sort.to_h
139
140
  end
141
+
142
+ private
143
+
144
+ def string_preprocessing(_str1, _str2)
145
+ str1 = _str1.dup
146
+ str2 = _str2.dup
147
+ mappings = TextAlignment::MAPPINGS.dup
148
+
149
+ ## single character mappings
150
+ character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
151
+ characters_from = character_mappings.collect{|m| m[0]}.join
152
+ characters_to = character_mappings.collect{|m| m[1]}.join
153
+ characters_to.gsub!(/-/, '\-')
154
+
155
+ str1.tr!(characters_from, characters_to)
156
+ str2.tr!(characters_from, characters_to)
157
+
158
+ mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
159
+
160
+ ## long to one character mappings
161
+ pletters = TextAlignment::PADDING_LETTERS
162
+
163
+ # find the padding letter for str1
164
+ @padding_letter1 = begin
165
+ i = pletters.index{|l| str2.index(l).nil?}
166
+ raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
167
+ TextAlignment::PADDING_LETTERS[i]
168
+ end
169
+
170
+ # find the padding letter for str2
171
+ @padding_letter2 = begin
172
+ i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
173
+ raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
174
+ TextAlignment::PADDING_LETTERS[i]
175
+ end
176
+
177
+ # ASCII foldings
178
+ ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
179
+ ascii_foldings.each do |f|
180
+ from = f[1]
181
+
182
+ if str2.index(f[0])
183
+ to = f[0] + (@padding_letter1 * (f[1].length - 1))
184
+ str1.gsub!(from, to)
185
+ end
186
+
187
+ if str1.index(f[0])
188
+ to = f[0] + (@padding_letter2 * (f[1].length - 1))
189
+ str2.gsub!(from, to)
190
+ end
191
+ end
192
+ mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
193
+
194
+ [str1, str2, mappings]
195
+ end
196
+
197
+ def compute_similarity(_s1, _s2, sdiff)
198
+ return 0 if sdiff.nil?
199
+
200
+ # compute the lcs only with non-whitespace letters
201
+ lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
202
+ return 0 if lcs == 0
203
+
204
+ s1 = _s1.tr(@padding_letter1, ' ')
205
+ s2 = _s2.tr(@padding_letter2, ' ')
206
+
207
+ similarity = lcs / [s1.scan(/\S/).count, s2.scan(/\S/).count].min.to_f
208
+ end
209
+
140
210
  end
@@ -12,45 +12,46 @@ class TextAlignment::TextAlignment
12
12
  attr_reader :similarity
13
13
  attr_reader :lost_annotations
14
14
 
15
- def initialize(_str1, _str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
16
- raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
15
+ def initialize(str1, str2, denotations = nil, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
16
+ raise ArgumentError, "nil string" if str1.nil? || str2.nil?
17
17
 
18
- @block_alignment = {source_text:_str1, target_text:_str2}
18
+ @block_alignment = {source_text:str1, target_text:str2}
19
+ @str1 = str1
20
+ @str2 = str2
19
21
 
20
- str1, str2, mappings = string_preprocessing(_str1, _str2)
21
-
22
- # try exact match
22
+ ## Block exact match
23
23
  block_begin = str2.index(str1)
24
24
  unless block_begin.nil?
25
25
  @block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
26
- return @block_alignment
26
+ return
27
27
  end
28
28
 
29
- # try exact match
30
29
  block_begin = str2.downcase.index(str1.downcase)
31
30
  unless block_begin.nil?
32
31
  @block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
33
- return @block_alignment
32
+ return
34
33
  end
35
34
 
35
+
36
+ ## to find block alignments
36
37
  anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
37
38
 
38
- # To collect matched blocks
39
- mblocks = []
40
- while anchor = anchor_finder.get_next_anchor
41
- last = mblocks.last
42
- if last && (anchor[:source][:begin] == last[:source][:end] + 1) && (anchor[:target][:begin] == last[:target][:end] + 1)
43
- last[:source][:end] = anchor[:source][:end]
44
- last[:target][:end] = anchor[:target][:end]
39
+ blocks = []
40
+ while block = anchor_finder.get_next_anchor
41
+ last = blocks.last
42
+ if last && (block[:source][:begin] == last[:source][:end] + 1) && (block[:target][:begin] == last[:target][:end] + 1)
43
+ last[:source][:end] = block[:source][:end]
44
+ last[:target][:end] = block[:target][:end]
45
45
  else
46
- mblocks << anchor
46
+ blocks << block.merge(alignment: :block, delta: block[:target][:begin] - block[:source][:begin])
47
47
  end
48
48
  end
49
49
 
50
- # pp mblocks
50
+ # pp blocks
51
51
  # puts "-----"
52
52
  # puts
53
- # mblocks.each do |b|
53
+ # exit
54
+ # blocks.each do |b|
54
55
  # p [b[:source], b[:target]]
55
56
  # puts "---"
56
57
  # puts str1[b[:source][:begin] ... b[:source][:end]]
@@ -62,117 +63,202 @@ class TextAlignment::TextAlignment
62
63
  # puts "-=-=-=-=-"
63
64
  # puts
64
65
 
65
- ## To find block alignments
66
- @block_alignment[:blocks] = []
67
- return if mblocks.empty?
68
-
69
- # Initial step
70
- if mblocks[0][:source][:begin] > 0
71
- e1 = mblocks[0][:source][:begin]
72
- e2 = mblocks[0][:target][:begin]
66
+ ## to fill the gaps
67
+ last_block = nil
68
+ blocks2 = blocks.inject([]) do |sum, block|
69
+ b1 = last_block ? last_block[:source][:end] : 0
70
+ e1 = block[:source][:begin]
73
71
 
74
- if mblocks[0][:target][:begin] == 0
75
- @block_alignment[:blocks] << {source:{begin:0, end:e1}, target:{begin:0, end:0}, alignment: :empty}
72
+ sum += if b1 == e1
73
+ [block]
76
74
  else
77
- _str1 = str1[0 ... e1]
78
- _str2 = str2[0 ... e2]
75
+ b2 = last_block ? last_block[:target][:end] : 0
76
+ e2 = block[:target][:begin]
77
+
78
+ if b2 == e2
79
+ [
80
+ {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
81
+ block
82
+ ]
83
+ else
84
+ if b1 == 0 && b2 == 0
85
+ len_buffer = (e1 * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
86
+ b2 = e2 - len_buffer if e2 > len_buffer
87
+ end
79
88
 
80
- unless _str1.strip.empty?
81
- if _str2.strip.empty?
82
- @block_alignment[:blocks] << {source:{begin:0, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
89
+ _str1 = str1[b1 ... e1]
90
+ _str2 = str2[b2 ... e2]
91
+
92
+ if _str1.strip.empty? || _str2.strip.empty?
93
+ [
94
+ {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
95
+ block
96
+ ]
83
97
  else
84
- len_min = [_str1.length, _str2.length].min
85
- len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
86
- b1 = _str1.length < len_buffer ? 0 : e1 - len_buffer
87
- b2 = _str2.length < len_buffer ? 0 : e2 - len_buffer
88
-
89
- @block_alignment[:blocks] << {source:{begin:0, end:b1}, target:{begin:0, end:b2}, alignment: :empty} if b1 > 0
90
-
91
- _str1 = str1[b1 ... e1]
92
- _str2 = str2[b2 ... e2]
93
- alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
94
- similarity = alignment_similarity(_str1, _str2, alignment)
95
- if similarity < 0.6
96
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment: :empty, similarity: similarity}
97
- else
98
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment:alignment}
99
- end
98
+ local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations) << block
100
99
  end
101
100
  end
102
101
  end
102
+
103
+ last_block = block
104
+ sum
103
105
  end
104
- @block_alignment[:blocks] << mblocks[0].merge(alignment: :block)
105
-
106
- (1 ... mblocks.length).each do |i|
107
- b1 = mblocks[i - 1][:source][:end]
108
- b2 = mblocks[i - 1][:target][:end]
109
- e1 = mblocks[i][:source][:begin]
110
- e2 = mblocks[i][:target][:begin]
111
- _str1 = str1[b1 ... e1]
112
- _str2 = str2[b2 ... e2]
113
- unless _str1.strip.empty?
114
- if _str2.strip.empty?
115
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
106
+
107
+ # the last step
108
+ blocks2 += if last_block.nil?
109
+ local_alignment_blocks(str1, 0, str1.length, str2, 0, str2.length, denotations)
110
+ else
111
+ b1 = last_block[:source][:end]
112
+ if b1 < str1.length
113
+ e1 = str1.length
114
+
115
+ b2 = last_block[:target][:end]
116
+ if b2 < str2.length
117
+ len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
118
+ e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
119
+ local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
116
120
  else
117
- alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
118
- similarity = alignment_similarity(_str1, _str2, alignment)
119
- if similarity < 0.6
120
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: similarity}
121
- else
122
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
121
+ [{source:{begin:last_block[:source][:end], end:str1.length}, alignment: :empty}]
122
+ end
123
+ else
124
+ []
125
+ end
126
+ end
127
+
128
+ @block_alignment[:blocks] = blocks2
129
+ end
130
+
131
+ def local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations = nil)
132
+ block2 = str2[b2 ... e2]
133
+
134
+ ## term-based alignment
135
+ tblocks = if denotations
136
+ ds_in_scope = denotations.select{|d| d[:span][:begin] >= b1 && d[:span][:end] <= e1}.
137
+ sort{|d1, d2| d1[:span][:begin] <=> d2[:span][:begin] || d2[:span][:end] <=> d1[:span][:end] }.
138
+ map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
139
+
140
+ position = 0
141
+ tblocks = ds_in_scope.map do |term|
142
+ lex = term[:lex]
143
+ r = block2.index(lex, position)
144
+ if r.nil?
145
+ position = nil
146
+ break
147
+ end
148
+ position = r + lex.length
149
+ {source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, delta: r - term[:span][:begin]}
150
+ end
151
+
152
+ # missing term found
153
+ tblocks = [] if position.nil?
154
+
155
+ # redundant matching found
156
+ unless position.nil?
157
+ ds_in_scope.each do |term|
158
+ lex = term[:lex]
159
+ look_forward = block2.index(lex, position)
160
+ unless look_forward.nil?
161
+ puts lex
162
+ tblocks = []
163
+ break
123
164
  end
124
165
  end
125
166
  end
126
- @block_alignment[:blocks] << mblocks[i].merge(alignment: :block)
167
+
168
+ tblocks
127
169
  end
128
170
 
129
- # Final step
130
- if mblocks[-1][:source][:end] < str1.length && mblocks[-1][:target][:end] < str2.length
131
- b1 = mblocks[-1][:source][:end]
132
- b2 = mblocks[-1][:target][:end]
133
- _str1 = str1[b1 ... str1.length]
134
- _str2 = str2[b2 ... str2.length]
171
+ if tblocks.empty?
172
+ if b1 == 0 && e1 == str1.length
173
+ if (e1 > 1000) || (e2 > 1000)
174
+ [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
175
+ else
176
+ block1 = str1[b1 ... e1]
177
+ block2 = str2[b2 ... e2]
178
+
179
+ ## character-based alignment
180
+ alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
181
+ if alignment.sdiff.nil?
182
+ [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
183
+ else
184
+ [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
185
+ end
186
+ end
187
+ else
188
+ block1 = str1[b1 ... e1]
189
+ block2 = str2[b2 ... e2]
135
190
 
136
- unless _str1.strip.empty?
137
- if _str2.strip.empty?
138
- @block_alignment[:blocks] << {source:{begin:b1, end:str1.length}, target:{begin:b2, end:str2.length}, alignment: :empty}
191
+ ## character-based alignment
192
+ alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
193
+ if alignment.sdiff.nil?
194
+ [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
139
195
  else
140
- len_min = [_str1.length, _str2.length].min
141
- len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
142
- e1 = _str1.length < len_buffer ? str1.length : b1 + len_buffer
143
- e2 = _str2.length < len_buffer ? str2.length : b2 + len_buffer
144
- _str1 = str1[b1 ... e1]
145
- _str2 = str2[b2 ... e2]
196
+ [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
197
+ end
198
+ end
199
+ else
200
+ last_tblock = nil
201
+ lblocks = tblocks.inject([]) do |sum, tblock|
202
+ tb1 = last_tblock ? last_tblock[:source][:end] : b1
203
+ te1 = tblock[:source][:begin]
146
204
 
147
- alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
148
- similarity = alignment_similarity(_str1, _str2, alignment)
149
- if similarity < 0.6
150
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: similarity}
205
+ sum += if te1 == tb1
206
+ [tblock]
207
+ else
208
+ tb2 = last_tblock ? last_tblock[:target][:end] : b2
209
+ te2 = tblock[:target][:begin]
210
+
211
+ if b2 == e2
212
+ [
213
+ {source:{begin:tb1, end:te1}, alignment: :empty},
214
+ tblock
215
+ ]
151
216
  else
152
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
217
+ [
218
+ {source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty},
219
+ tblock
220
+ ]
153
221
  end
222
+ end
154
223
 
155
- @block_alignment[:blocks] << {source:{begin:e1, end:-1}, target:{begin:e2, end:-1}, alignment: :empty} if e1 < str1.length
224
+ last_tblock = tblock
225
+ sum
226
+ end
227
+
228
+ if last_tblock[:source][:end] < e1
229
+ if last_tblock[:target][:end] < e2
230
+ lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, target:{begin:last_tblock[:target][:end], end:e2}, alignment: :empty}
231
+ else
232
+ lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, alignment: :empty}
156
233
  end
157
234
  end
158
- end
159
235
 
160
- @block_alignment[:blocks].each do |a|
161
- a[:delta] = a[:target][:begin] - a[:source][:begin]
236
+ lblocks
162
237
  end
163
238
  end
164
239
 
240
+
241
+ def indices(str, target)
242
+ position = 0
243
+ len = target.len
244
+ Enumerator.new do |yielder|
245
+ while idx = str.index(target, position)
246
+ yielder << idx
247
+ position = idx + len
248
+ end
249
+ end
250
+ end
251
+
165
252
  def transform_begin_position(begin_position)
166
253
  i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
167
254
  block = @block_alignment[:blocks][i]
168
255
 
169
- b = if block[:alignment] == :block
256
+ b = if block[:alignment] == :block || block[:alignment] == :term
170
257
  begin_position + block[:delta]
171
258
  elsif block[:alignment] == :empty
172
259
  if begin_position == block[:source][:begin]
173
260
  block[:target][:begin]
174
261
  else
175
- # raise "lost annotation"
176
262
  nil
177
263
  end
178
264
  else
@@ -185,13 +271,12 @@ class TextAlignment::TextAlignment
185
271
  i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
186
272
  block = @block_alignment[:blocks][i]
187
273
 
188
- e = if block[:alignment] == :block
274
+ e = if block[:alignment] == :block || block[:alignment] == :term
189
275
  end_position + block[:delta]
190
276
  elsif block[:alignment] == :empty
191
277
  if end_position == block[:source][:end]
192
278
  block[:target][:end]
193
279
  else
194
- # raise "lost annotation"
195
280
  nil
196
281
  end
197
282
  else
@@ -213,14 +298,14 @@ class TextAlignment::TextAlignment
213
298
  @lost_annotations = []
214
299
 
215
300
  denotations.each do |d|
216
- begin
217
- d.begin = transform_begin_position(d.begin);
218
- d.end = transform_end_position(d.end);
219
- rescue
220
- @lost_annotations << d
221
- d.begin = nil
222
- d.end = nil
223
- end
301
+ source = {begin:d.begin, end:d.end}
302
+ d.begin = transform_begin_position(d.begin);
303
+ d.end = transform_end_position(d.end);
304
+ raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @str2.length
305
+ rescue
306
+ @lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
307
+ d.begin = nil
308
+ d.end = nil
224
309
  end
225
310
 
226
311
  @lost_annotations
@@ -231,12 +316,12 @@ class TextAlignment::TextAlignment
231
316
  @lost_annotations = []
232
317
 
233
318
  r = hdenotations.collect do |d|
234
- new_d = begin
235
- d.dup.merge({span:transform_a_span(d[:span])})
236
- rescue
237
- @lost_annotations << d
238
- nil
239
- end
319
+ t = transform_a_span(d[:span])
320
+ raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @str2.length
321
+ new_d = d.dup.merge({span:t})
322
+ rescue
323
+ @lost_annotations << {source: d[:span], target:t}
324
+ nil
240
325
  end.compact
241
326
 
242
327
  r
@@ -250,13 +335,16 @@ class TextAlignment::TextAlignment
250
335
  @block_alignment[:blocks].each do |a|
251
336
  show += case a[:alignment]
252
337
  when :block
253
- "===== common =====\n" +
338
+ "===== common (block) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
339
+ stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
340
+ when :term
341
+ "===== common (term) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
254
342
  stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
255
343
  when :empty
256
344
  "xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
257
- "<<<<< string 1\n" +
345
+ "<<<<< string 1 [#{a[:source][:begin]} - #{a[:source][:end]}]\n" +
258
346
  stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
259
- ">>>>> string 2\n" +
347
+ ">>>>> string 2 [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
260
348
  ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
261
349
  else
262
350
  astr1 = ''
@@ -290,7 +378,7 @@ class TextAlignment::TextAlignment
290
378
  end
291
379
  end.join('')
292
380
 
293
- "***** local mismatch\n" +
381
+ "***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
294
382
  "[#{astr1}]\n" +
295
383
  "[#{astr2}]\n\n"
296
384
  end
@@ -298,71 +386,4 @@ class TextAlignment::TextAlignment
298
386
  show
299
387
  end
300
388
 
301
- private
302
-
303
- def string_preprocessing(_str1, _str2)
304
- str1 = _str1.dup
305
- str2 = _str2.dup
306
- mappings = TextAlignment::MAPPINGS.dup
307
-
308
- ## single character mappings
309
- character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
310
- characters_from = character_mappings.collect{|m| m[0]}.join
311
- characters_to = character_mappings.collect{|m| m[1]}.join
312
- characters_to.gsub!(/-/, '\-')
313
-
314
- str1.tr!(characters_from, characters_to)
315
- str2.tr!(characters_from, characters_to)
316
-
317
- mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
318
-
319
- ## long to one character mappings
320
- pletters = TextAlignment::PADDING_LETTERS
321
-
322
- # find the padding letter for str1
323
- @padding_letter1 = begin
324
- i = pletters.index{|l| str2.index(l).nil?}
325
- raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
326
- TextAlignment::PADDING_LETTERS[i]
327
- end
328
-
329
- # find the padding letter for str2
330
- @padding_letter2 = begin
331
- i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
332
- raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
333
- TextAlignment::PADDING_LETTERS[i]
334
- end
335
-
336
- # ASCII foldings
337
- ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
338
- ascii_foldings.each do |f|
339
- from = f[1]
340
-
341
- if str2.index(f[0])
342
- to = f[0] + (@padding_letter1 * (f[1].length - 1))
343
- str1.gsub!(from, to)
344
- end
345
-
346
- if str1.index(f[0])
347
- to = f[0] + (@padding_letter2 * (f[1].length - 1))
348
- str2.gsub!(from, to)
349
- end
350
- end
351
- mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
352
-
353
- [str1, str2, mappings]
354
- end
355
-
356
- def alignment_similarity(_s1, _s2, alignment)
357
- return 0 if alignment.sdiff.nil?
358
-
359
- # compute the lcs only with non-whitespace letters
360
- lcs = alignment.sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
361
-
362
- s1 = _s1.tr(@padding_letter1, ' ')
363
- s2 = _s2.tr(@padding_letter2, ' ')
364
-
365
- similarity = 2 * lcs / (s1.scan(/\S/).count + s2.scan(/\S/).count).to_f
366
- end
367
-
368
389
  end
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.6.2'
2
+ VERSION = '0.7.2'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.2
4
+ version: 0.7.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-10-06 00:00:00.000000000 Z
11
+ date: 2020-10-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary