text_alignment 0.6.4 → 0.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 87f945e356349ed709996d88ed39c8ba5b83622bde1c7fd7b9e5ff63504615c2
4
- data.tar.gz: acb6e716113238c39b59a8358928de1bd936382308961a57e2c60e7bc462726f
3
+ metadata.gz: 40796a906cbc366312741d3b410aaee15d28b778980674bb79d19ea1b6364d02
4
+ data.tar.gz: eb0121440bc005232b3d35eb1867b46004eb14c894b630f2ef49d311c4d99a26
5
5
  SHA512:
6
- metadata.gz: 4d5b862bb50b4111c6bd390e458d6761303dc394f2fa7dc9d6b821ee7461541705aecac925f700e5124eb282112567e52a51a9f15b84fa8349da25baaf68fdd9
7
- data.tar.gz: a044608a58181e98664a26f410a7d59927dc4d39db8d49a147666f64254e23728ceccaa781a590712b7a74b57222cc449c37eb43a709d3f16da60aa3a55c2e6f
6
+ metadata.gz: e29037021763c9d4f581b278b72d319d364431e8abad48b45f21d68df102928eec993bbba582383a8a01b6ba7855f3542d2cc620ffa055a90aea08c2f5371117
7
+ data.tar.gz: 63c0e5b712b89012082197d3c4400cd705283e6d908f40003a399cb2efdc2b3aa4f6e445eb90e1ab4d7d39a9d14421881f945e0483c14ec52c1af7176ca07e81
@@ -105,9 +105,7 @@ lost_annotations = []
105
105
  target_annotations = if source_annotations.class == Array
106
106
  align_mdoc(source_annotations, {text: target_text})
107
107
  else
108
- alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text)
109
-
110
- # pp alignment
108
+ alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text, source_annotations[:denotations])
111
109
 
112
110
  # verification
113
111
  # source_text = source_annotations[:text]
@@ -142,22 +140,7 @@ else
142
140
  puts "====="
143
141
  # exit
144
142
 
145
- # verification of source denotations
146
- puts "[Invalid source denotations]"
147
- source_annotations[:denotations] do |d|
148
- p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < source_text.length
149
- end
150
- puts "====="
151
- puts
152
-
153
143
  denotations = alignment.transform_hdenotations(source_annotations[:denotations])
154
- puts "[Invalid transformation]"
155
- denotations.each do |d|
156
- p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < target_text.length
157
- end
158
- puts "====="
159
- puts
160
-
161
144
  lost_annotations += alignment.lost_annotations if alignment.lost_annotations
162
145
 
163
146
  source_annotations.merge({text:target_text, denotations:denotations})
@@ -194,7 +177,9 @@ warn "denotations:\t#{target_annotations[:denotations].nil? ? 0 : target_annotat
194
177
 
195
178
  if lost_annotations
196
179
  warn "\n[lost annotations]"
197
- warn "#{lost_annotations.length}"
180
+ lost_annotations.each do |a|
181
+ p a
182
+ end
198
183
  end
199
184
 
200
185
  #puts target_annotations.to_json
@@ -12,43 +12,46 @@ class TextAlignment::TextAlignment
12
12
  attr_reader :similarity
13
13
  attr_reader :lost_annotations
14
14
 
15
- def initialize(str1, str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
15
+ def initialize(str1, str2, denotations = nil, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
16
16
  raise ArgumentError, "nil string" if str1.nil? || str2.nil?
17
17
 
18
18
  @block_alignment = {source_text:str1, target_text:str2}
19
+ @str1 = str1
20
+ @str2 = str2
19
21
 
20
- # try exact match
22
+ ## Block exact match
21
23
  block_begin = str2.index(str1)
22
24
  unless block_begin.nil?
23
25
  @block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
24
- return @block_alignment
26
+ return
25
27
  end
26
28
 
27
- # try exact match
28
29
  block_begin = str2.downcase.index(str1.downcase)
29
30
  unless block_begin.nil?
30
31
  @block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
31
- return @block_alignment
32
+ return
32
33
  end
33
34
 
35
+
36
+ ## to find block alignments
34
37
  anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
35
38
 
36
- # To collect matched blocks
37
- mblocks = []
38
- while anchor = anchor_finder.get_next_anchor
39
- last = mblocks.last
40
- if last && (anchor[:source][:begin] == last[:source][:end] + 1) && (anchor[:target][:begin] == last[:target][:end] + 1)
41
- last[:source][:end] = anchor[:source][:end]
42
- last[:target][:end] = anchor[:target][:end]
39
+ blocks = []
40
+ while block = anchor_finder.get_next_anchor
41
+ last = blocks.last
42
+ if last && (block[:source][:begin] == last[:source][:end] + 1) && (block[:target][:begin] == last[:target][:end] + 1)
43
+ last[:source][:end] = block[:source][:end]
44
+ last[:target][:end] = block[:target][:end]
43
45
  else
44
- mblocks << anchor
46
+ blocks << block.merge(alignment: :block, delta: block[:target][:begin] - block[:source][:begin])
45
47
  end
46
48
  end
47
49
 
48
- # pp mblocks
50
+ # pp blocks
49
51
  # puts "-----"
50
52
  # puts
51
- # mblocks.each do |b|
53
+ # exit
54
+ # blocks.each do |b|
52
55
  # p [b[:source], b[:target]]
53
56
  # puts "---"
54
57
  # puts str1[b[:source][:begin] ... b[:source][:end]]
@@ -60,114 +63,196 @@ class TextAlignment::TextAlignment
60
63
  # puts "-=-=-=-=-"
61
64
  # puts
62
65
 
63
- ## To find block alignments
64
- @block_alignment[:blocks] = []
65
- return if mblocks.empty?
66
-
67
- # Initial step
68
- if mblocks[0][:source][:begin] > 0
69
- e1 = mblocks[0][:source][:begin]
70
- e2 = mblocks[0][:target][:begin]
66
+ ## to fill the gaps
67
+ last_block = nil
68
+ blocks2 = blocks.inject([]) do |sum, block|
69
+ b1 = last_block ? last_block[:source][:end] : 0
70
+ e1 = block[:source][:begin]
71
71
 
72
- if mblocks[0][:target][:begin] == 0
73
- @block_alignment[:blocks] << {source:{begin:0, end:e1}, target:{begin:0, end:0}, alignment: :empty}
72
+ sum += if b1 == e1
73
+ [block]
74
74
  else
75
- _str1 = str1[0 ... e1]
76
- _str2 = str2[0 ... e2]
75
+ b2 = last_block ? last_block[:target][:end] : 0
76
+ e2 = block[:target][:begin]
77
+
78
+ if b2 == e2
79
+ [
80
+ {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
81
+ block
82
+ ]
83
+ else
84
+ if b1 == 0 && b2 == 0
85
+ len_buffer = (e1 * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
86
+ b2 = e2 - len_buffer if e2 > len_buffer
87
+ end
77
88
 
78
- unless _str1.strip.empty?
79
- if _str2.strip.empty?
80
- @block_alignment[:blocks] << {source:{begin:0, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
89
+ _str1 = str1[b1 ... e1]
90
+ _str2 = str2[b2 ... e2]
91
+
92
+ if _str1.strip.empty? || _str2.strip.empty?
93
+ [
94
+ {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
95
+ block
96
+ ]
81
97
  else
82
- len_min = [_str1.length, _str2.length].min
83
- len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
84
- b1 = _str1.length < len_buffer ? 0 : e1 - len_buffer
85
- b2 = _str2.length < len_buffer ? 0 : e2 - len_buffer
86
-
87
- @block_alignment[:blocks] << {source:{begin:0, end:b1}, target:{begin:0, end:b2}, alignment: :empty} if b1 > 0
88
-
89
- _str1 = str1[b1 ... e1]
90
- _str2 = str2[b2 ... e2]
91
- alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase)
92
- if alignment.similarity < 0.5
93
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: alignment.similarity}
94
- else
95
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}
96
- end
98
+ local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations) << block
97
99
  end
98
100
  end
99
101
  end
102
+
103
+ last_block = block
104
+ sum
100
105
  end
101
- @block_alignment[:blocks] << mblocks[0].merge(alignment: :block)
102
-
103
- (1 ... mblocks.length).each do |i|
104
- b1 = mblocks[i - 1][:source][:end]
105
- b2 = mblocks[i - 1][:target][:end]
106
- e1 = mblocks[i][:source][:begin]
107
- e2 = mblocks[i][:target][:begin]
108
- _str1 = str1[b1 ... e1]
109
- _str2 = str2[b2 ... e2]
110
- unless _str1.strip.empty?
111
- if _str2.strip.empty?
112
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
106
+
107
+ # the last step
108
+ blocks2 += if last_block.nil?
109
+ local_alignment_blocks(str1, 0, str1.length, str2, 0, str2.length, denotations)
110
+ else
111
+ b1 = last_block[:source][:end]
112
+ if b1 < str1.length
113
+ e1 = str1.length
114
+
115
+ b2 = last_block[:target][:end]
116
+ if b2 < str2.length
117
+ len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
118
+ e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
119
+ local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
113
120
  else
114
- alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase)
115
- if alignment.similarity < 0.5
116
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: alignment.similarity}
117
- else
118
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}
119
- end
121
+ [{source:{begin:last_block[:source][:end], end:str1.length}, alignment: :empty}]
120
122
  end
121
123
  end
122
- @block_alignment[:blocks] << mblocks[i].merge(alignment: :block)
123
124
  end
124
125
 
125
- # Final step
126
- if mblocks[-1][:source][:end] < str1.length && mblocks[-1][:target][:end] < str2.length
127
- b1 = mblocks[-1][:source][:end]
128
- b2 = mblocks[-1][:target][:end]
129
- _str1 = str1[b1 ... str1.length]
130
- _str2 = str2[b2 ... str2.length]
126
+ @block_alignment[:blocks] = blocks2
127
+ end
128
+
129
+ def local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations = nil)
130
+ block2 = str2[b2 ... e2]
131
+
132
+ ## term-based alignment
133
+ tblocks = if denotations
134
+ ds_in_scope = denotations.select{|d| d[:span][:begin] >= b1 && d[:span][:end] <= e1}.
135
+ sort{|d1, d2| d1[:span][:begin] <=> d2[:span][:begin] || d2[:span][:end] <=> d1[:span][:end] }.
136
+ map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
137
+
138
+ position = 0
139
+ tblocks = ds_in_scope.map do |term|
140
+ lex = term[:lex]
141
+ r = block2.index(lex, position)
142
+ if r.nil?
143
+ position = nil
144
+ break
145
+ end
146
+ position = r + lex.length
147
+ {source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, delta: r - term[:span][:begin]}
148
+ end
149
+
150
+ # missing term found
151
+ tblocks = [] if position.nil?
152
+
153
+ # redundant matching found
154
+ unless position.nil?
155
+ ds_in_scope.each do |term|
156
+ lex = term[:lex]
157
+ look_forward = block2.index(lex, position)
158
+ unless look_forward.nil?
159
+ puts lex
160
+ tblocks = []
161
+ break
162
+ end
163
+ end
164
+ end
165
+
166
+ tblocks
167
+ end
131
168
 
132
- unless _str1.strip.empty?
133
- if _str2.strip.empty?
134
- @block_alignment[:blocks] << {source:{begin:b1, end:str1.length}, target:{begin:b2, end:str2.length}, alignment: :empty}
169
+ if tblocks.empty?
170
+ if b1 == 0 && e1 == str1.length
171
+ if str2.length > 2000
172
+ [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
135
173
  else
136
- len_min = [_str1.length, _str2.length].min
137
- len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
138
- e1 = _str1.length < len_buffer ? str1.length : b1 + len_buffer
139
- e2 = _str2.length < len_buffer ? str2.length : b2 + len_buffer
140
- _str1 = str1[b1 ... e1]
141
- _str2 = str2[b2 ... e2]
174
+ block1 = str1[b1 ... e1]
175
+ block2 = str2[b2 ... e2]
176
+
177
+ ## character-based alignment
178
+ alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
179
+ [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}]
180
+ # alignment = :alignment
181
+ # [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :alignment}]
182
+ end
183
+ else
184
+ block1 = str1[b1 ... e1]
185
+ block2 = str2[b2 ... e2]
186
+
187
+ ## character-based alignment
188
+ alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
189
+ [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}]
190
+ # alignmnet = :alignment
191
+ # [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :alignment}]
192
+ end
193
+ else
194
+ last_tblock = nil
195
+ lblocks = tblocks.inject([]) do |sum, tblock|
196
+ tb1 = last_tblock ? last_tblock[:source][:end] : b1
197
+ te1 = tblock[:source][:begin]
142
198
 
143
- alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase)
144
- if alignment.similarity < 0.5
145
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: alignment.similarity}
199
+ sum += if te1 == tb1
200
+ [tblock]
201
+ else
202
+ tb2 = last_tblock ? tlast_block[:target][:end] : b2
203
+ te2 = tblock[:target][:begin]
204
+
205
+ if b2 == e2
206
+ [
207
+ {source:{begin:tb1, end:te1}, alignment: :empty},
208
+ tblock
209
+ ]
146
210
  else
147
- @block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}
211
+ [
212
+ {source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty},
213
+ tblock
214
+ ]
148
215
  end
216
+ end
217
+
218
+ last_tblock = tblock
219
+ sum
220
+ end
149
221
 
150
- @block_alignment[:blocks] << {source:{begin:e1, end:-1}, target:{begin:e2, end:-1}, alignment: :empty} if e1 < str1.length
222
+ if last_tblock[:source][:end] < e1
223
+ if last_tblock[:target][:end] < e2
224
+ lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, target:{begin:last_tblock[:target][:end], end:e2}, alignment: :empty}
225
+ else
226
+ lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, alignment: :empty}
151
227
  end
152
228
  end
153
- end
154
229
 
155
- @block_alignment[:blocks].each do |a|
156
- a[:delta] = a[:target][:begin] - a[:source][:begin]
230
+ lblocks
157
231
  end
158
232
  end
159
233
 
234
+
235
+ def indices(str, target)
236
+ position = 0
237
+ len = target.len
238
+ Enumerator.new do |yielder|
239
+ while idx = str.index(target, position)
240
+ yielder << idx
241
+ position = idx + len
242
+ end
243
+ end
244
+ end
245
+
160
246
  def transform_begin_position(begin_position)
161
247
  i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
162
248
  block = @block_alignment[:blocks][i]
163
249
 
164
- b = if block[:alignment] == :block
250
+ b = if block[:alignment] == :block || block[:alignment] == :term
165
251
  begin_position + block[:delta]
166
252
  elsif block[:alignment] == :empty
167
253
  if begin_position == block[:source][:begin]
168
254
  block[:target][:begin]
169
255
  else
170
- # raise "lost annotation"
171
256
  nil
172
257
  end
173
258
  else
@@ -180,13 +265,12 @@ class TextAlignment::TextAlignment
180
265
  i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
181
266
  block = @block_alignment[:blocks][i]
182
267
 
183
- e = if block[:alignment] == :block
268
+ e = if block[:alignment] == :block || block[:alignment] == :term
184
269
  end_position + block[:delta]
185
270
  elsif block[:alignment] == :empty
186
271
  if end_position == block[:source][:end]
187
272
  block[:target][:end]
188
273
  else
189
- # raise "lost annotation"
190
274
  nil
191
275
  end
192
276
  else
@@ -208,14 +292,14 @@ class TextAlignment::TextAlignment
208
292
  @lost_annotations = []
209
293
 
210
294
  denotations.each do |d|
211
- begin
212
- d.begin = transform_begin_position(d.begin);
213
- d.end = transform_end_position(d.end);
214
- rescue
215
- @lost_annotations << d
216
- d.begin = nil
217
- d.end = nil
218
- end
295
+ source = {begin:d.begin, end:d.end}
296
+ d.begin = transform_begin_position(d.begin);
297
+ d.end = transform_end_position(d.end);
298
+ raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @str2.length
299
+ rescue
300
+ @lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
301
+ d.begin = nil
302
+ d.end = nil
219
303
  end
220
304
 
221
305
  @lost_annotations
@@ -226,12 +310,12 @@ class TextAlignment::TextAlignment
226
310
  @lost_annotations = []
227
311
 
228
312
  r = hdenotations.collect do |d|
229
- new_d = begin
230
- d.dup.merge({span:transform_a_span(d[:span])})
231
- rescue
232
- @lost_annotations << d
233
- nil
234
- end
313
+ t = transform_a_span(d[:span])
314
+ raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @str2.length
315
+ new_d = d.dup.merge({span:t})
316
+ rescue
317
+ @lost_annotations << {source: d[:span], target:t}
318
+ nil
235
319
  end.compact
236
320
 
237
321
  r
@@ -245,7 +329,10 @@ class TextAlignment::TextAlignment
245
329
  @block_alignment[:blocks].each do |a|
246
330
  show += case a[:alignment]
247
331
  when :block
248
- "===== common ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
332
+ "===== common (block) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
333
+ stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
334
+ when :term
335
+ "===== common (term) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
249
336
  stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
250
337
  when :empty
251
338
  "xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.6.4'
2
+ VERSION = '0.7'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.4
4
+ version: '0.7'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-10-06 00:00:00.000000000 Z
11
+ date: 2020-10-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary