text_alignment 0.11.0 → 0.11.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4d626b64acdca0630dc344e2f0f4c2152e481ca7c8209ac1aa8c025fb1ae7c0f
4
- data.tar.gz: 972222f1f3a575cdb30cd83f886d6d3f36109b79a35cc2c970aa89cbe8fd007d
3
+ metadata.gz: ef59c0cd578ed453a67edeb3b29059f4b76c4c541f777fa35a06e76b299e2564
4
+ data.tar.gz: 5594c0f6eb1d52cc331c210fdf936e7cb09e30277f21933027e55b0c8cfa0e24
5
5
  SHA512:
6
- metadata.gz: 288691e9ee650af3b9dfe5b47f1a3d05ff952292958ef733a5a5f47aeeaa73e6e1b7b0591afe3549b6cbabf04817bf406d3ca2ee55faf65b7e3816dfe52be644
7
- data.tar.gz: 7f45baecc36f00310f868a596ca60bce709078bdc51a5f9438a916cd42062bb93dacadafb64e90e42276d0633914c978045fed4b597ea7411c8762761be6bd36
6
+ metadata.gz: ef5bae492d7e7b840c00943ac24e571392c4c992a085c6a63c8cb6db126ed9137ba94f8cd7af42a91e17aa327cb9a5ce24b909944c157f21beb8a88c8ce7528b
7
+ data.tar.gz: b6d84dcdc35399f91a0f6a5a24d84b11169de9d861a249acca52f0fec86e6f1fbf1bb4dbb47a5d43654fa43735b181aa64034447fb71ca675090818dcf67133a
@@ -77,7 +77,9 @@ def align_mannotations(source_annotations, reference_text, alignment, debug = fa
77
77
  annotations[:relations].each do |r|
78
78
  reid = 'R' + (idnum_relations += 1).to_s
79
79
  ididx[r[:id]] = reid
80
- target_annotations[:relations] << r.dup.merge({id:reid, subj:ididx[r[:subj]], obj:ididx[r[:obj]]})
80
+ sid = ididx[r[:subj]]
81
+ oid = ididx[r[:obj]]
82
+ target_annotations[:relations] << r.dup.merge({id:reid, subj:sid, obj:oid}) unless sid.nil? || oid.nil?
81
83
  end
82
84
  end
83
85
 
@@ -86,7 +88,8 @@ def align_mannotations(source_annotations, reference_text, alignment, debug = fa
86
88
  annotations[:attributes].each do |a|
87
89
  reid = 'A' + (idnum_attributes += 1).to_s
88
90
  ididx[a[:id]] = reid
89
- target_annotations[:attributes] << a.dup.merge({id:reid, subj:ididx[a[:subj]]})
91
+ sid = ididx[a[:subj]]
92
+ target_annotations[:attributes] << a.dup.merge({id:reid, subj:sid}) unless sid.nil?
90
93
  end
91
94
  end
92
95
 
@@ -95,7 +98,8 @@ def align_mannotations(source_annotations, reference_text, alignment, debug = fa
95
98
  annotations[:modifications].each do |m|
96
99
  reid = 'M' + (idnum_modifications += 1).to_s
97
100
  ididx[m[:id]] = reid
98
- target_annotations[:modifications] << m.dup.merge({id:reid, obj:ididx[m[:obj]]})
101
+ oid = ididx[m[:obj]]
102
+ target_annotations[:modifications] << m.dup.merge({id:reid, obj:oid}) unless oid.nil?
99
103
  end
100
104
  end
101
105
  end
@@ -115,11 +119,13 @@ reference_text = read_text(ARGV[1])
115
119
  alignment = TextAlignment::TextAlignment.new(reference_text, true)
116
120
 
117
121
  target_annotations = if source_annotations.class == Array
122
+ # align_mannotations(source_annotations, reference_text, alignment, true)
118
123
  align_mannotations(source_annotations, reference_text, alignment, false)
119
124
  else
120
- denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment)
121
- # denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], reference_text, true)
125
+ # denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment)
126
+ denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment, false)
122
127
  source_annotations.merge({text:reference_text, denotations:denotations})
123
128
  end
124
129
 
130
+ # pp alignment.block_alignment
125
131
  puts target_annotations.to_json
@@ -7,11 +7,8 @@ module TextAlignment; end unless defined? TextAlignment
7
7
  class TextAlignment::AnchorFinder
8
8
 
9
9
  def initialize(source_str, target_str, cultivation_map)
10
- @s1, @s2 = if reverse?(source_str, target_str)
11
- [target_str.downcase, source_str.downcase]
12
- else
13
- [source_str.downcase, target_str.downcase]
14
- end
10
+ @s1 = source_str.downcase
11
+ @s2 = target_str.downcase
15
12
 
16
13
  @cultivation_map = cultivation_map
17
14
 
@@ -19,19 +16,13 @@ class TextAlignment::AnchorFinder
19
16
  @size_window = TextAlignment::SIZE_WINDOW
20
17
  @sim_threshold = TextAlignment::TEXT_SIMILARITY_THRESHOLD
21
18
  @pos_s1_final_possible_begin = @s1.length - @size_ngram - 1
19
+ @pos_s2_final_possible_end = @s2.length
22
20
 
23
21
  # positions of last match
24
22
  @pos_s1_last_match = 0
25
23
  @pos_s2_last_match = 0
26
24
  end
27
25
 
28
- def reverse?(source_str = nil, target_str = nil)
29
- unless source_str.nil?
30
- @reverse_p = target_str.length < source_str.length
31
- end
32
- @reverse_p
33
- end
34
-
35
26
  def get_next_anchor
36
27
  # To find the beginning positions of an anchor ngram in s1 and s2, beginning from the last positions matched
37
28
  beg_s2 = for beg_s1 in @pos_s1_last_match .. @pos_s1_final_possible_begin
@@ -49,26 +40,23 @@ class TextAlignment::AnchorFinder
49
40
  # To extend the block to the left
50
41
  b1 = beg_s1
51
42
  b2 = beg_s2
52
- while b1 >= @pos_s1_last_match && b2 >= @pos_s2_last_match && @s1[b1] == @s2[b2]
43
+ left_boundary_b2 = [@pos_s2_last_match, (@cultivation_map.last_cultivated_position(b2) || 0)].max
44
+ while b1 > @pos_s1_last_match && b2 > left_boundary_b2 && @s1[b1 - 1] == @s2[b2 - 1]
53
45
  b1 -= 1; b2 -= 1
54
46
  end
55
- b1 += 1; b2 += 1
56
47
 
57
48
  # To extend the block to the right
58
49
  e1 = beg_s1 + @size_ngram
59
50
  e2 = beg_s2 + @size_ngram
60
- while @s1[e1] && @s1[e1] == @s2[e2]
51
+ right_boundary_b2 = @cultivation_map.next_cultivated_position(e2) || @pos_s2_final_possible_end
52
+ while @s1[e1] && e2 < right_boundary_b2 && @s1[e1] == @s2[e2]
61
53
  e1 += 1; e2 += 1
62
54
  end
63
55
 
64
56
  @pos_s1_last_match = e1
65
57
  @pos_s2_last_match = e2
66
58
 
67
- if reverse?
68
- {source:{begin:b2 , end:e2}, target:{begin:b1, end:e1}}
69
- else
70
- {source:{begin:b1 , end:e1}, target:{begin:b2, end:e2}}
71
- end
59
+ {source:{begin:b1 , end:e1}, target:{begin:b2, end:e2}}
72
60
  end
73
61
 
74
62
  private
@@ -91,13 +79,7 @@ class TextAlignment::AnchorFinder
91
79
  # return nil if the anchor is too much frequent
92
80
  def find_beg_s2_candidates(anchor, search_position)
93
81
  candidates = []
94
- while _beg_s2 = @s2.index(anchor, search_position)
95
- search_again_position = @cultivation_map.search_again_position(_beg_s2)
96
- unless search_again_position.nil?
97
- search_position = search_again_position
98
- next
99
- end
100
-
82
+ while _beg_s2 = @cultivation_map.index(anchor, @s2, search_position)
101
83
  candidates << _beg_s2
102
84
 
103
85
  # for speed, skip anchor of high frequency
@@ -4,16 +4,91 @@ class TextAlignment::CultivationMap
4
4
  attr_reader :map
5
5
 
6
6
  def initialize
7
- @map = {}
7
+ @map = []
8
8
  end
9
9
 
10
10
  def cultivate(regions)
11
- regions.each do |b, e|
12
- (b ... e).each{|p| @map[p] = e}
11
+ @map += regions
12
+ @map.sort!{|a, b| a[0] <=> b[0]}
13
+ new_map = []
14
+ @map.each do |region|
15
+ if new_map.empty?
16
+ new_map << region
17
+ elsif new_map.last[1] > region[0]
18
+ raise "Overlapping regions: #{new_map.last} : #{region}"
19
+ elsif new_map.last[1] == region[0]
20
+ new_map.last[1] == region[1]
21
+ else
22
+ new_map << region
23
+ end
13
24
  end
25
+ @map = new_map
14
26
  end
15
27
 
16
- def search_again_position(position)
17
- @map[position]
28
+ def search_again_position(position, end_position = nil)
29
+ end_position ||= position
30
+ region = @map.bsearch{|r| end_position < r[1]}
31
+ if region.nil? || region[0] > position
32
+ nil
33
+ else
34
+ region[1]
35
+ end
36
+ end
37
+
38
+ def last_cultivated_position(position)
39
+ ridx = @map.rindex{|r| r[1] <= position}
40
+ ridx.nil? ? nil : @map[ridx][1]
41
+ end
42
+
43
+ def next_cultivated_position(position)
44
+ region = @map.bsearch{|r| position < r[0]}
45
+ region.nil? ? nil : region[0]
46
+ end
47
+
48
+ def in_regions(region)
49
+ @map.select{|r| (r[1] > region[0] && r[1] <= region[1]) || (r[0] < region[1] && r[0] >= region[0])}
50
+ end
51
+
52
+ def region_state(region)
53
+ closed_parts = in_regions(region)
54
+ if closed_parts.empty?
55
+ [:open, region]
56
+ else
57
+ if front_open?(region, closed_parts)
58
+ if rear_open?(region, closed_parts)
59
+ [:middle_closed, [closed_parts.first[1], closed_parts.last[0]]]
60
+ else
61
+ [:front_open, [region[0], closed_parts.first[0]]]
62
+ end
63
+ else
64
+ if rear_open?(region, closed_parts)
65
+ [:rear_open, [closed_parts.last[1], region[1]]]
66
+ else
67
+ [:closed, nil]
68
+ end
69
+ end
70
+ end
71
+ end
72
+
73
+ def index(target, string, position)
74
+ length = target.length
75
+ loop do
76
+ _begin = string.index(target, position)
77
+ break if _begin.nil?
78
+ position = search_again_position(_begin)
79
+ next unless position.nil?
80
+ break _begin if region_state([_begin, _begin + length])[0] == :open
81
+ position = _begin + 1
82
+ end
83
+ end
84
+
85
+ private
86
+
87
+ def front_open?(region, closed_parts)
88
+ closed_parts.first[0] > region[0]
89
+ end
90
+
91
+ def rear_open?(region, closed_parts)
92
+ closed_parts.last[1] < region[1]
18
93
  end
19
94
  end
@@ -144,11 +144,17 @@ class TextAlignment::MixedAlignment
144
144
  def compute_similarity(s1, s2, sdiff)
145
145
  return 0 if sdiff.nil?
146
146
 
147
- # compute the lcs only with non-whitespace letters
148
- lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
149
- return 0 if lcs == 0
147
+ # recoverbility
148
+ count_nws = sdiff.count{|d| d.old_element =~ /\S/}
149
+ count_nws_match = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/}
150
150
 
151
- similarity = lcs.to_f / [s1.scan(/\S/).count, s2.scan(/\S/).count].min
152
- end
151
+ coverage = count_nws_match.to_f / count_nws
152
+
153
+ # fragmentation rate
154
+ count_ofrag = sdiff.count{|d| d.old_element =~ /\s/}
155
+ count_frag = sdiff.collect{|d| (d.action == '=') && (d.old_element =~/\s/) ? ' ' : d.action}.join.scan(/=+/).count
156
+ rate_frag = count_ofrag.to_f / count_frag
153
157
 
158
+ similarity = coverage * rate_frag
159
+ end
154
160
  end
@@ -15,8 +15,9 @@ class TextAlignment::TextAlignment
15
15
  def initialize(reference_text, to_prevent_overlap = false)
16
16
  raise ArgumentError, "nil text" if reference_text.nil?
17
17
 
18
- @original_rtext = reference_text
18
+ @original_reference_text = reference_text
19
19
  @rtext_mapping = TextAlignment::CharMapping.new(reference_text)
20
+ @mapped_reference_text = @rtext_mapping.mapped_text
20
21
  @to_prevent_overlap = to_prevent_overlap
21
22
 
22
23
  @original_text = nil
@@ -34,21 +35,19 @@ class TextAlignment::TextAlignment
34
35
  @text_mapping = TextAlignment::CharMapping.new(text)
35
36
  end
36
37
 
37
- text_mapped = @text_mapping.mapped_text
38
+ @mapped_text = @text_mapping.mapped_text
38
39
  denotations_mapped = @text_mapping.enmap_denotations(denotations)
39
40
 
40
- rtext_mapped = @rtext_mapping.mapped_text
41
-
42
41
  ## To generate the block_alignment of the input text against the reference text
43
-
44
42
  # Initialization
45
- @block_alignment = {text: @original_text, reference_text: @original_rtext, denotations: denotations}
43
+ # @block_alignment = {text: @original_text, reference_text: @original_reference_text, denotations: denotations}
44
+ @block_alignment = {text: @mapped_text, reference_text: @mapped_reference_text, denotations: denotations}
46
45
 
47
46
  # Generation
48
- @block_alignment[:blocks] = if r = whole_block_alignment(text_mapped, rtext_mapped, @cultivation_map)
47
+ @block_alignment[:blocks] = if r = whole_block_alignment(@mapped_text, @mapped_reference_text, @cultivation_map)
49
48
  r
50
49
  else
51
- find_block_alignment(text_mapped, rtext_mapped, denotations_mapped, @cultivation_map)
50
+ find_block_alignment(@mapped_text, @mapped_reference_text, denotations_mapped, @cultivation_map)
52
51
  end
53
52
  end
54
53
 
@@ -134,7 +133,7 @@ class TextAlignment::TextAlignment
134
133
  source = {begin:d.begin, end:d.end}
135
134
  d.begin = transform_begin_position(d.begin);
136
135
  d.end = transform_end_position(d.end);
137
- raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_rtext.length
136
+ raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_reference_text.length
138
137
  rescue
139
138
  @lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
140
139
  d.begin = nil
@@ -150,7 +149,7 @@ class TextAlignment::TextAlignment
150
149
 
151
150
  r = hdenotations.collect do |d|
152
151
  t = transform_a_span(d[:span])
153
- raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_rtext.length
152
+ raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_reference_text.length
154
153
  new_d = d.dup.merge({span:t})
155
154
  rescue
156
155
  @lost_annotations << {source: d[:span], target:t}
@@ -161,8 +160,8 @@ class TextAlignment::TextAlignment
161
160
  end
162
161
 
163
162
  def alignment_show
164
- stext = @block_alignment[:text]
165
- ttext = @block_alignment[:reference_text]
163
+ stext = @mapped_text
164
+ ttext = @mapped_reference_text
166
165
 
167
166
  show = ''
168
167
  @block_alignment[:blocks].each do |a|
@@ -218,7 +217,7 @@ class TextAlignment::TextAlignment
218
217
 
219
218
  "***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
220
219
  "[#{astr1}]\n" +
221
- "[#{astr2}]\n\n"
220
+ "[#{astr2.gsub("\n", " ")}]\n\n"
222
221
  end
223
222
  end
224
223
  show
@@ -257,142 +256,109 @@ class TextAlignment::TextAlignment
257
256
  # puts "-=-=-=-=-"
258
257
  # puts
259
258
 
260
- ## to fill the gaps
261
- last_block = nil
262
- blocks2 = blocks.inject([]) do |sum, block|
263
- b1 = last_block ? last_block[:source][:end] : 0
264
- e1 = block[:source][:begin]
259
+ ## To fill the gaps
260
+ ## lblock: last block, cblock: current block
261
+ lblock = nil
262
+ blocks2 = (blocks + [nil]).inject([]) do |sum, cblock|
263
+ b1 = lblock.nil? ? 0 : lblock[:source][:end]
264
+ e1 = cblock.nil? ? str1.length : cblock[:source][:begin]
265
265
 
266
- sum += if b1 == e1
267
- [block]
268
- else
269
- b2 = last_block ? last_block[:target][:end] : 0
270
- e2 = block[:target][:begin]
271
-
272
- if b2 == e2
273
- [
274
- {source:{begin:b1, end:e1}, alignment: :empty},
275
- block
276
- ]
266
+ if b1 < e1
267
+ b2 = lblock.nil? ? 0 : lblock[:target][:end]
268
+ e2 = cblock.nil? ? str2.length : cblock[:target][:begin]
269
+ _str1 = str1[b1 ... e1]
270
+ _str2 = str2[b2 ... e2]
271
+
272
+ sum += if _str1.strip.empty? || _str2.strip.empty?
273
+ [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
277
274
  else
278
275
  len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
279
-
280
- if b1 == 0 && b2 == 0
281
- b2 = e2 - len_buffer if e2 > len_buffer
282
- end
283
-
284
- _str1 = str1[b1 ... e1]
285
- _str2 = str2[b2 ... e2]
286
-
287
- if _str1.strip.empty? || _str2.strip.empty?
288
- [
289
- {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
290
- block
291
- ]
292
- elsif ((e2 - b2) - (e1 - b1)) > len_buffer
293
- la_block1 = local_alignment_blocks(str1, b1, e1, str2, b2, b2 + len_buffer, denotations)
294
- la_block2 = local_alignment_blocks(str1, b1, e1, str2, e2 - len_buffer, e2, denotations)
295
- [la_block2, la_block2].max{|a, b| a.first[:similarity] <=> b.first[:similarity]} << block
296
- else
297
- local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations) << block
276
+ region_state, state_region = cultivation_map.region_state([b2, e2])
277
+ case region_state
278
+ when :closed
279
+ []
280
+ when :front_open
281
+ oe2 = state_region[1]
282
+ me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
283
+ local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
284
+ when :rear_open
285
+ ob2 = state_region[0]
286
+ mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
287
+ local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
288
+ when :middle_closed
289
+ oe2 = state_region[0]
290
+ me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
291
+ attempt1 = local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
292
+ if attempt1.empty?
293
+ ob2 = state_region[1]
294
+ mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
295
+ local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
296
+ else
297
+ attempt1
298
+ end
299
+ else # :open
300
+ if (e2 - b2) > len_buffer
301
+ attempt1 = local_alignment(str1, b1, e1, str2, b2, b2 + len_buffer, denotations, cultivation_map)
302
+ if attempt1.empty?
303
+ local_alignment(str1, b1, e1, str2, e2 - len_buffer, e2, denotations, cultivation_map)
304
+ else
305
+ attempt1
306
+ end
307
+ else
308
+ local_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
309
+ end
298
310
  end
299
311
  end
300
312
  end
301
313
 
302
- last_block = block
303
- sum
314
+ lblock = cblock
315
+ cblock.nil? ? sum : sum << cblock
304
316
  end
305
317
 
306
- # the last step
307
- blocks2 += if last_block.nil?
308
- local_alignment_blocks(str1, 0, str1.length, str2, 0, str2.length, denotations)
309
- else
310
- b1 = last_block[:source][:end]
311
- if b1 < str1.length
312
- e1 = str1.length
313
- b2 = last_block[:target][:end]
314
-
315
- _str1 = str1[b1 ... e1]
316
- if _str1.strip.empty?
317
- [{source:{begin:b1, end:e1}, alignment: :empty}]
318
- else
319
- if b2 < str2.length
320
- len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
321
- e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
322
-
323
- local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
324
- else
325
- [{source:{begin:b1, end:e1}, alignment: :empty}]
326
- end
327
- end
328
- else
329
- []
330
- end
331
- end
332
318
  end
333
319
 
334
320
  def whole_block_alignment(str1, str2, cultivation_map)
335
- ## Block exact match
336
- search_position = 0
337
-
338
- block_begin = begin
339
- _block_begin = str2.index(str1, search_position)
340
- break if _block_begin.nil?
341
- search_position = cultivation_map.search_again_position(_block_begin)
342
- _block_begin
343
- end until search_position.nil?
344
-
345
- unless block_begin.nil?
346
- return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
347
- end
321
+ block_begin = cultivation_map.index(str1, str2, 0)
322
+ return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}] unless block_begin.nil?
348
323
 
349
- search_position = 0
324
+ block_begin = cultivation_map.index(str1.downcase, str2.downcase, 0)
325
+ return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}] unless block_begin.nil?
350
326
 
351
- dstr1 = str1.downcase
352
- dstr2 = str2.downcase
353
- block_begin = begin
354
- _block_begin = dstr2.index(dstr1, search_position)
355
- break if _block_begin.nil?
356
- search_position = cultivation_map.search_again_position(_block_begin)
357
- _block_begin
358
- end until search_position.nil?
327
+ nil
328
+ end
359
329
 
360
- unless block_begin.nil?
361
- return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
330
+ def local_alignment(str1, b1, e1, str2, b2, e2, denotations = nil, cultivation_map)
331
+ tblocks = term_based_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
332
+ if tblocks.empty?
333
+ lcs_alignment(str1, b1, e1, str2, b2, e2, cultivation_map)
334
+ else
335
+ tblocks
362
336
  end
363
-
364
- nil
365
337
  end
366
338
 
367
- def local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations = nil)
368
- block2 = str2[b2 ... e2]
339
+ def term_based_alignment(str1, b1, e1, str2, b2, e2, denotations = nil, cultivation_map)
340
+ str2_block = str2[0 ... e2]
369
341
 
370
342
  ## term-based alignment
371
343
  tblocks = if denotations
372
- ds_in_scope = denotations.select{|d| d[:span][:begin] >= b1 && d[:span][:end] <= e1}.
344
+ denotations_in_scope = denotations.select{|d| d[:span][:begin] >= b1 && d[:span][:end] <= e1}.
373
345
  sort{|d1, d2| d1[:span][:begin] <=> d2[:span][:begin] || d2[:span][:end] <=> d1[:span][:end] }.
374
346
  map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
375
347
 
376
- position = 0
377
- _tblocks = ds_in_scope.map do |term|
378
- lex = term[:lex]
379
- r = block2.index(lex, position)
380
- if r.nil?
381
- position = nil
382
- break
383
- end
384
- position = r + lex.length
385
- {source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, similarity: 0.9, delta: r + b2 - term[:span][:begin]}
348
+ search_position = b2
349
+ _tblocks = denotations_in_scope.map do |denotation|
350
+ lex = denotation[:lex]
351
+ term_begin = cultivation_map.index(lex, str2_block, search_position)
352
+ break [] if term_begin.nil? # break the loop if a missing term is found
353
+ search_position = term_begin + lex.length
354
+ {source:denotation[:span], target:{begin:term_begin, end:term_begin + lex.length}, alignment: :term, similarity: 0.9, delta: term_begin - denotation[:span][:begin]}
386
355
  end
387
356
 
388
- # missing term found
389
- _tblocks = [] if position.nil?
390
-
391
357
  # redundant matching found
392
- unless position.nil?
393
- ds_in_scope.each do |term|
394
- lex = term[:lex]
395
- look_forward = block2.index(lex, position)
358
+ unless _tblocks.empty?
359
+ search_position = _tblocks.last[:target][:end]
360
+ denotations_in_scope.each do |term|
361
+ look_forward = cultivation_map.index(term[:lex], str2_block, search_position)
396
362
  unless look_forward.nil?
397
363
  _tblocks = []
398
364
  break
@@ -405,72 +371,37 @@ class TextAlignment::TextAlignment
405
371
  []
406
372
  end
407
373
 
408
- if tblocks.empty?
409
- if b1 == 0 && e1 == str1.length
410
- if (e1 > 2000) || (e2 > 2000)
411
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
412
- else
413
- block1 = str1[b1 ... e1]
414
- block2 = str2[b2 ... e2]
415
-
416
- ## character-based alignment
417
- alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
418
- if alignment.sdiff.nil?
419
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
420
- else
421
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
422
- end
423
- end
424
- else
425
- block1 = str1[b1 ... e1]
426
- block2 = str2[b2 ... e2]
374
+ ltblock = nil
375
+ tblocks2 = (tblocks + [nil]).inject([]) do |sum, ctblock|
376
+ tb1 = ltblock.nil? ? b1 : ltblock[:source][:end]
377
+ te1 = ctblock.nil? ? e1 : ctblock[:source][:begin]
427
378
 
428
- ## character-based alignment
429
- alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
430
- if alignment.sdiff.nil?
431
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
432
- else
433
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
434
- end
379
+ if te1 > tb1
380
+ tb2 = ltblock.nil? ? b2 : ltblock[:target][:end]
381
+ te2 = ctblock.nil? ? e2 : ctblock[:target][:begin]
382
+ sum << {source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty}
435
383
  end
436
- else
437
- last_tblock = nil
438
- lblocks = tblocks.inject([]) do |sum, tblock|
439
- tb1 = last_tblock ? last_tblock[:source][:end] : b1
440
- te1 = tblock[:source][:begin]
441
384
 
442
- sum += if te1 == tb1
443
- [tblock]
444
- else
445
- tb2 = last_tblock ? last_tblock[:target][:end] : b2
446
- te2 = tblock[:target][:begin]
447
-
448
- if b2 == e2
449
- [
450
- {source:{begin:tb1, end:te1}, alignment: :empty},
451
- tblock
452
- ]
453
- else
454
- [
455
- {source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty},
456
- tblock
457
- ]
458
- end
459
- end
385
+ ltblock = ctblock
386
+ ctblock.nil? ? sum : sum << ctblock
387
+ end
460
388
 
461
- last_tblock = tblock
462
- sum
463
- end
389
+ tblocks2
390
+ end
464
391
 
465
- if last_tblock[:source][:end] < e1
466
- if last_tblock[:target][:end] < e2
467
- lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, target:{begin:last_tblock[:target][:end], end:e2}, alignment: :empty}
468
- else
469
- lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, alignment: :empty}
470
- end
471
- end
392
+ def lcs_alignment(str1, b1, e1, str2, b2, e2, cultivation_map)
393
+ source = {begin:b1, end:e1}
394
+ target = {begin:b2, end:e2}
472
395
 
473
- lblocks
396
+ if (e1 - b1) > 2000
397
+ [{source:source, target:target, alignment: :empty}]
398
+ else
399
+ alignment = TextAlignment::MixedAlignment.new(str1[b1 ... e1].downcase, str2[b2 ... e2].downcase)
400
+ if alignment.similarity < 0.5
401
+ [{source:source, target:target, alignment: :empty}]
402
+ else
403
+ [{source:source, target:target, alignment: alignment, similarity: alignment.similarity}]
404
+ end
474
405
  end
475
406
  end
476
407
 
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.11.0'
2
+ VERSION = '0.11.1'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.11.0
4
+ version: 0.11.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-03-04 00:00:00.000000000 Z
11
+ date: 2021-03-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary