text_alignment 0.6.4 → 0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 87f945e356349ed709996d88ed39c8ba5b83622bde1c7fd7b9e5ff63504615c2
4
- data.tar.gz: acb6e716113238c39b59a8358928de1bd936382308961a57e2c60e7bc462726f
3
+ metadata.gz: 40796a906cbc366312741d3b410aaee15d28b778980674bb79d19ea1b6364d02
4
+ data.tar.gz: eb0121440bc005232b3d35eb1867b46004eb14c894b630f2ef49d311c4d99a26
5
5
  SHA512:
6
- metadata.gz: 4d5b862bb50b4111c6bd390e458d6761303dc394f2fa7dc9d6b821ee7461541705aecac925f700e5124eb282112567e52a51a9f15b84fa8349da25baaf68fdd9
7
- data.tar.gz: a044608a58181e98664a26f410a7d59927dc4d39db8d49a147666f64254e23728ceccaa781a590712b7a74b57222cc449c37eb43a709d3f16da60aa3a55c2e6f
6
+ metadata.gz: e29037021763c9d4f581b278b72d319d364431e8abad48b45f21d68df102928eec993bbba582383a8a01b6ba7855f3542d2cc620ffa055a90aea08c2f5371117
7
+ data.tar.gz: 63c0e5b712b89012082197d3c4400cd705283e6d908f40003a399cb2efdc2b3aa4f6e445eb90e1ab4d7d39a9d14421881f945e0483c14ec52c1af7176ca07e81
@@ -105,9 +105,7 @@ lost_annotations = []
105
105
  target_annotations = if source_annotations.class == Array
106
106
  align_mdoc(source_annotations, {text: target_text})
107
107
  else
108
- alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text)
109
-
110
- # pp alignment
108
+ alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text, source_annotations[:denotations])
111
109
 
112
110
  # verification
113
111
  # source_text = source_annotations[:text]
@@ -142,22 +140,7 @@ else
142
140
  puts "====="
143
141
  # exit
144
142
 
145
- # verification of source denotations
146
- puts "[Invalid source denotations]"
147
- source_annotations[:denotations] do |d|
148
- p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < source_text.length
149
- end
150
- puts "====="
151
- puts
152
-
153
143
  denotations = alignment.transform_hdenotations(source_annotations[:denotations])
154
- puts "[Invalid transformation]"
155
- denotations.each do |d|
156
- p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < target_text.length
157
- end
158
- puts "====="
159
- puts
160
-
161
144
  lost_annotations += alignment.lost_annotations if alignment.lost_annotations
162
145
 
163
146
  source_annotations.merge({text:target_text, denotations:denotations})
@@ -194,7 +177,9 @@ warn "denotations:\t#{target_annotations[:denotations].nil? ? 0 : target_annotat
194
177
 
195
178
  if lost_annotations
196
179
  warn "\n[lost annotations]"
197
- warn "#{lost_annotations.length}"
180
+ lost_annotations.each do |a|
181
+ p a
182
+ end
198
183
  end
199
184
 
200
185
  #puts target_annotations.to_json
@@ -12,43 +12,46 @@ class TextAlignment::TextAlignment
12
12
  attr_reader :similarity
13
13
  attr_reader :lost_annotations
14
14
 
15
- def initialize(str1, str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
15
+ def initialize(str1, str2, denotations = nil, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
16
16
  raise ArgumentError, "nil string" if str1.nil? || str2.nil?
17
17
 
18
18
  @block_alignment = {source_text:str1, target_text:str2}
19
+ @str1 = str1
20
+ @str2 = str2
19
21
 
20
- # try exact match
22
+ ## Block exact match
21
23
  block_begin = str2.index(str1)
22
24
  unless block_begin.nil?
23
25
  @block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
24
- return @block_alignment
26
+ return
25
27
  end
26
28
 
27
- # try exact match
28
29
  block_begin = str2.downcase.index(str1.downcase)
29
30
  unless block_begin.nil?
30
31
  @block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
31
- return @block_alignment
32
+ return
32
33
  end
33
34
 
35
+
36
+ ## to find block alignments
34
37
  anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
35
38
 
36
- # To collect matched blocks
37
- mblocks = []
38
- while anchor = anchor_finder.get_next_anchor
39
- last = mblocks.last
40
- if last && (anchor[:source][:begin] == last[:source][:end] + 1) && (anchor[:target][:begin] == last[:target][:end] + 1)
41
- last[:source][:end] = anchor[:source][:end]
42
- last[:target][:end] = anchor[:target][:end]
39
+ blocks = []
40
+ while block = anchor_finder.get_next_anchor
41
+ last = blocks.last
42
+ if last && (block[:source][:begin] == last[:source][:end] + 1) && (block[:target][:begin] == last[:target][:end] + 1)
43
+ last[:source][:end] = block[:source][:end]
44
+ last[:target][:end] = block[:target][:end]
43
45
  else
44
- mblocks << anchor
46
+ blocks << block.merge(alignment: :block, delta: block[:target][:begin] - block[:source][:begin])
45
47
  end
46
48
  end
47
49
 
48
- # pp mblocks
50
+ # pp blocks
49
51
  # puts "-----"
50
52
  # puts
51
- # mblocks.each do |b|
53
+ # exit
54
+ # blocks.each do |b|
52
55
  # p [b[:source], b[:target]]
53
56
  # puts "---"
54
57
  # puts str1[b[:source][:begin] ... b[:source][:end]]
@@ -60,114 +63,196 @@ class TextAlignment::TextAlignment
60
63
  # puts "-=-=-=-=-"
61
64
  # puts
62
65
 
63
- ## To find block alignments
64
- @block_alignment[:blocks] = []
65
- return if mblocks.empty?
66
-
67
- # Initial step
68
- if mblocks[0][:source][:begin] > 0
69
- e1 = mblocks[0][:source][:begin]
70
- e2 = mblocks[0][:target][:begin]
66
+ ## to fill the gaps
67
+ last_block = nil
68
+ blocks2 = blocks.inject([]) do |sum, block|
69
+ b1 = last_block ? last_block[:source][:end] : 0
70
+ e1 = block[:source][:begin]
71
71
 
72
- if mblocks[0][:target][:begin] == 0
73
- @block_alignment[:blocks] << {source:{begin:0, end:e1}, target:{begin:0, end:0}, alignment: :empty}
72
+ sum += if b1 == e1
73
+ [block]
74
74
  else
75
- _str1 = str1[0 ... e1]
76
- _str2 = str2[0 ... e2]
75
+ b2 = last_block ? last_block[:target][:end] : 0
76
+ e2 = block[:target][:begin]
77
+
78
+ if b2 == e2
79
+ [
80
+ {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
81
+ block
82
+ ]
83
+ else
84
+ if b1 == 0 && b2 == 0
85
+ len_buffer = (e1 * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
86
+ b2 = e2 - len_buffer if e2 > len_buffer
87
+ end
77
88
 
78
- unless _str1.strip.empty?
79
- if _str2.strip.empty?
80
- @block_alignment[:blocks] << {source:{begin:0, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
89
+ _str1 = str1[b1 ... e1]
90
+ _str2 = str2[b2 ... e2]
91
+
92
+ if _str1.strip.empty? || _str2.strip.empty?
93
+ [
94
+ {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
95
+ block
96
+ ]
81
97
  else
82
- len_min = [_str1.length, _str2.length].min
83
- len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
84
- b1 = _str1.length < len_buffer ? 0 : e1 - len_buffer
85
- b2 = _str2.length < len_buffer ? 0 : e2 - len_buffer
86
-
87
- @block_alignment[:blocks] << {source:{begin:0, end:b1}, target:{begin:0, end:b2}, alignment: :empty} if b1 > 0
88
-
89
- _str1 = str1[b1 ... e1]
90
- _str2 = str2[b2 ... e2]
91
- alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase)
92
- if alignment.similarity < 0.5
93
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: alignment.similarity}
94
- else
95
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}
96
- end
98
+ local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations) << block
97
99
  end
98
100
  end
99
101
  end
102
+
103
+ last_block = block
104
+ sum
100
105
  end
101
- @block_alignment[:blocks] << mblocks[0].merge(alignment: :block)
102
-
103
- (1 ... mblocks.length).each do |i|
104
- b1 = mblocks[i - 1][:source][:end]
105
- b2 = mblocks[i - 1][:target][:end]
106
- e1 = mblocks[i][:source][:begin]
107
- e2 = mblocks[i][:target][:begin]
108
- _str1 = str1[b1 ... e1]
109
- _str2 = str2[b2 ... e2]
110
- unless _str1.strip.empty?
111
- if _str2.strip.empty?
112
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
106
+
107
+ # the last step
108
+ blocks2 += if last_block.nil?
109
+ local_alignment_blocks(str1, 0, str1.length, str2, 0, str2.length, denotations)
110
+ else
111
+ b1 = last_block[:source][:end]
112
+ if b1 < str1.length
113
+ e1 = str1.length
114
+
115
+ b2 = last_block[:target][:end]
116
+ if b2 < str2.length
117
+ len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
118
+ e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
119
+ local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
113
120
  else
114
- alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase)
115
- if alignment.similarity < 0.5
116
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: alignment.similarity}
117
- else
118
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}
119
- end
121
+ [{source:{begin:last_block[:source][:end], end:str1.length}, alignment: :empty}]
120
122
  end
121
123
  end
122
- @block_alignment[:blocks] << mblocks[i].merge(alignment: :block)
123
124
  end
124
125
 
125
- # Final step
126
- if mblocks[-1][:source][:end] < str1.length && mblocks[-1][:target][:end] < str2.length
127
- b1 = mblocks[-1][:source][:end]
128
- b2 = mblocks[-1][:target][:end]
129
- _str1 = str1[b1 ... str1.length]
130
- _str2 = str2[b2 ... str2.length]
126
+ @block_alignment[:blocks] = blocks2
127
+ end
128
+
129
+ def local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations = nil)
130
+ block2 = str2[b2 ... e2]
131
+
132
+ ## term-based alignment
133
+ tblocks = if denotations
134
+ ds_in_scope = denotations.select{|d| d[:span][:begin] >= b1 && d[:span][:end] <= e1}.
135
+ sort{|d1, d2| d1[:span][:begin] <=> d2[:span][:begin] || d2[:span][:end] <=> d1[:span][:end] }.
136
+ map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
137
+
138
+ position = 0
139
+ tblocks = ds_in_scope.map do |term|
140
+ lex = term[:lex]
141
+ r = block2.index(lex, position)
142
+ if r.nil?
143
+ position = nil
144
+ break
145
+ end
146
+ position = r + lex.length
147
+ {source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, delta: r - term[:span][:begin]}
148
+ end
149
+
150
+ # missing term found
151
+ tblocks = [] if position.nil?
152
+
153
+ # redundant matching found
154
+ unless position.nil?
155
+ ds_in_scope.each do |term|
156
+ lex = term[:lex]
157
+ look_forward = block2.index(lex, position)
158
+ unless look_forward.nil?
159
+ puts lex
160
+ tblocks = []
161
+ break
162
+ end
163
+ end
164
+ end
165
+
166
+ tblocks
167
+ end
131
168
 
132
- unless _str1.strip.empty?
133
- if _str2.strip.empty?
134
- @block_alignment[:blocks] << {source:{begin:b1, end:str1.length}, target:{begin:b2, end:str2.length}, alignment: :empty}
169
+ if tblocks.empty?
170
+ if b1 == 0 && e1 == str1.length
171
+ if str2.length > 2000
172
+ [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
135
173
  else
136
- len_min = [_str1.length, _str2.length].min
137
- len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
138
- e1 = _str1.length < len_buffer ? str1.length : b1 + len_buffer
139
- e2 = _str2.length < len_buffer ? str2.length : b2 + len_buffer
140
- _str1 = str1[b1 ... e1]
141
- _str2 = str2[b2 ... e2]
174
+ block1 = str1[b1 ... e1]
175
+ block2 = str2[b2 ... e2]
176
+
177
+ ## character-based alignment
178
+ alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
179
+ [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}]
180
+ # alignment = :alignment
181
+ # [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :alignment}]
182
+ end
183
+ else
184
+ block1 = str1[b1 ... e1]
185
+ block2 = str2[b2 ... e2]
186
+
187
+ ## character-based alignment
188
+ alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
189
+ [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}]
190
+ # alignmnet = :alignment
191
+ # [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :alignment}]
192
+ end
193
+ else
194
+ last_tblock = nil
195
+ lblocks = tblocks.inject([]) do |sum, tblock|
196
+ tb1 = last_tblock ? last_tblock[:source][:end] : b1
197
+ te1 = tblock[:source][:begin]
142
198
 
143
- alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase)
144
- if alignment.similarity < 0.5
145
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: alignment.similarity}
199
+ sum += if te1 == tb1
200
+ [tblock]
201
+ else
202
+ tb2 = last_tblock ? tlast_block[:target][:end] : b2
203
+ te2 = tblock[:target][:begin]
204
+
205
+ if b2 == e2
206
+ [
207
+ {source:{begin:tb1, end:te1}, alignment: :empty},
208
+ tblock
209
+ ]
146
210
  else
147
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}
211
+ [
212
+ {source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty},
213
+ tblock
214
+ ]
148
215
  end
216
+ end
217
+
218
+ last_tblock = tblock
219
+ sum
220
+ end
149
221
 
150
- @block_alignment[:blocks] << {source:{begin:e1, end:-1}, target:{begin:e2, end:-1}, alignment: :empty} if e1 < str1.length
222
+ if last_tblock[:source][:end] < e1
223
+ if last_tblock[:target][:end] < e2
224
+ lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, target:{begin:last_tblock[:target][:end], end:e2}, alignment: :empty}
225
+ else
226
+ lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, alignment: :empty}
151
227
  end
152
228
  end
153
- end
154
229
 
155
- @block_alignment[:blocks].each do |a|
156
- a[:delta] = a[:target][:begin] - a[:source][:begin]
230
+ lblocks
157
231
  end
158
232
  end
159
233
 
234
+
235
+ def indices(str, target)
236
+ position = 0
237
+ len = target.len
238
+ Enumerator.new do |yielder|
239
+ while idx = str.index(target, position)
240
+ yielder << idx
241
+ position = idx + len
242
+ end
243
+ end
244
+ end
245
+
160
246
  def transform_begin_position(begin_position)
161
247
  i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
162
248
  block = @block_alignment[:blocks][i]
163
249
 
164
- b = if block[:alignment] == :block
250
+ b = if block[:alignment] == :block || block[:alignment] == :term
165
251
  begin_position + block[:delta]
166
252
  elsif block[:alignment] == :empty
167
253
  if begin_position == block[:source][:begin]
168
254
  block[:target][:begin]
169
255
  else
170
- # raise "lost annotation"
171
256
  nil
172
257
  end
173
258
  else
@@ -180,13 +265,12 @@ class TextAlignment::TextAlignment
180
265
  i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
181
266
  block = @block_alignment[:blocks][i]
182
267
 
183
- e = if block[:alignment] == :block
268
+ e = if block[:alignment] == :block || block[:alignment] == :term
184
269
  end_position + block[:delta]
185
270
  elsif block[:alignment] == :empty
186
271
  if end_position == block[:source][:end]
187
272
  block[:target][:end]
188
273
  else
189
- # raise "lost annotation"
190
274
  nil
191
275
  end
192
276
  else
@@ -208,14 +292,14 @@ class TextAlignment::TextAlignment
208
292
  @lost_annotations = []
209
293
 
210
294
  denotations.each do |d|
211
- begin
212
- d.begin = transform_begin_position(d.begin);
213
- d.end = transform_end_position(d.end);
214
- rescue
215
- @lost_annotations << d
216
- d.begin = nil
217
- d.end = nil
218
- end
295
+ source = {begin:d.begin, end:d.end}
296
+ d.begin = transform_begin_position(d.begin);
297
+ d.end = transform_end_position(d.end);
298
+ raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @str2.length
299
+ rescue
300
+ @lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
301
+ d.begin = nil
302
+ d.end = nil
219
303
  end
220
304
 
221
305
  @lost_annotations
@@ -226,12 +310,12 @@ class TextAlignment::TextAlignment
226
310
  @lost_annotations = []
227
311
 
228
312
  r = hdenotations.collect do |d|
229
- new_d = begin
230
- d.dup.merge({span:transform_a_span(d[:span])})
231
- rescue
232
- @lost_annotations << d
233
- nil
234
- end
313
+ t = transform_a_span(d[:span])
314
+ raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @str2.length
315
+ new_d = d.dup.merge({span:t})
316
+ rescue
317
+ @lost_annotations << {source: d[:span], target:t}
318
+ nil
235
319
  end.compact
236
320
 
237
321
  r
@@ -245,7 +329,10 @@ class TextAlignment::TextAlignment
245
329
  @block_alignment[:blocks].each do |a|
246
330
  show += case a[:alignment]
247
331
  when :block
248
- "===== common ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
332
+ "===== common (block) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
333
+ stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
334
+ when :term
335
+ "===== common (term) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
249
336
  stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
250
337
  when :empty
251
338
  "xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.6.4'
2
+ VERSION = '0.7'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.4
4
+ version: '0.7'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-10-06 00:00:00.000000000 Z
11
+ date: 2020-10-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary