text_alignment 0.11.0 → 0.11.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4d626b64acdca0630dc344e2f0f4c2152e481ca7c8209ac1aa8c025fb1ae7c0f
4
- data.tar.gz: 972222f1f3a575cdb30cd83f886d6d3f36109b79a35cc2c970aa89cbe8fd007d
3
+ metadata.gz: ef59c0cd578ed453a67edeb3b29059f4b76c4c541f777fa35a06e76b299e2564
4
+ data.tar.gz: 5594c0f6eb1d52cc331c210fdf936e7cb09e30277f21933027e55b0c8cfa0e24
5
5
  SHA512:
6
- metadata.gz: 288691e9ee650af3b9dfe5b47f1a3d05ff952292958ef733a5a5f47aeeaa73e6e1b7b0591afe3549b6cbabf04817bf406d3ca2ee55faf65b7e3816dfe52be644
7
- data.tar.gz: 7f45baecc36f00310f868a596ca60bce709078bdc51a5f9438a916cd42062bb93dacadafb64e90e42276d0633914c978045fed4b597ea7411c8762761be6bd36
6
+ metadata.gz: ef5bae492d7e7b840c00943ac24e571392c4c992a085c6a63c8cb6db126ed9137ba94f8cd7af42a91e17aa327cb9a5ce24b909944c157f21beb8a88c8ce7528b
7
+ data.tar.gz: b6d84dcdc35399f91a0f6a5a24d84b11169de9d861a249acca52f0fec86e6f1fbf1bb4dbb47a5d43654fa43735b181aa64034447fb71ca675090818dcf67133a
@@ -77,7 +77,9 @@ def align_mannotations(source_annotations, reference_text, alignment, debug = fa
77
77
  annotations[:relations].each do |r|
78
78
  reid = 'R' + (idnum_relations += 1).to_s
79
79
  ididx[r[:id]] = reid
80
- target_annotations[:relations] << r.dup.merge({id:reid, subj:ididx[r[:subj]], obj:ididx[r[:obj]]})
80
+ sid = ididx[r[:subj]]
81
+ oid = ididx[r[:obj]]
82
+ target_annotations[:relations] << r.dup.merge({id:reid, subj:sid, obj:oid}) unless sid.nil? || oid.nil?
81
83
  end
82
84
  end
83
85
 
@@ -86,7 +88,8 @@ def align_mannotations(source_annotations, reference_text, alignment, debug = fa
86
88
  annotations[:attributes].each do |a|
87
89
  reid = 'A' + (idnum_attributes += 1).to_s
88
90
  ididx[a[:id]] = reid
89
- target_annotations[:attributes] << a.dup.merge({id:reid, subj:ididx[a[:subj]]})
91
+ sid = ididx[a[:subj]]
92
+ target_annotations[:attributes] << a.dup.merge({id:reid, subj:sid}) unless sid.nil?
90
93
  end
91
94
  end
92
95
 
@@ -95,7 +98,8 @@ def align_mannotations(source_annotations, reference_text, alignment, debug = fa
95
98
  annotations[:modifications].each do |m|
96
99
  reid = 'M' + (idnum_modifications += 1).to_s
97
100
  ididx[m[:id]] = reid
98
- target_annotations[:modifications] << m.dup.merge({id:reid, obj:ididx[m[:obj]]})
101
+ oid = ididx[m[:obj]]
102
+ target_annotations[:modifications] << m.dup.merge({id:reid, obj:oid}) unless oid.nil?
99
103
  end
100
104
  end
101
105
  end
@@ -115,11 +119,13 @@ reference_text = read_text(ARGV[1])
115
119
  alignment = TextAlignment::TextAlignment.new(reference_text, true)
116
120
 
117
121
  target_annotations = if source_annotations.class == Array
122
+ # align_mannotations(source_annotations, reference_text, alignment, true)
118
123
  align_mannotations(source_annotations, reference_text, alignment, false)
119
124
  else
120
- denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment)
121
- # denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], reference_text, true)
125
+ # denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment)
126
+ denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment, false)
122
127
  source_annotations.merge({text:reference_text, denotations:denotations})
123
128
  end
124
129
 
130
+ # pp alignment.block_alignment
125
131
  puts target_annotations.to_json
@@ -7,11 +7,8 @@ module TextAlignment; end unless defined? TextAlignment
7
7
  class TextAlignment::AnchorFinder
8
8
 
9
9
  def initialize(source_str, target_str, cultivation_map)
10
- @s1, @s2 = if reverse?(source_str, target_str)
11
- [target_str.downcase, source_str.downcase]
12
- else
13
- [source_str.downcase, target_str.downcase]
14
- end
10
+ @s1 = source_str.downcase
11
+ @s2 = target_str.downcase
15
12
 
16
13
  @cultivation_map = cultivation_map
17
14
 
@@ -19,19 +16,13 @@ class TextAlignment::AnchorFinder
19
16
  @size_window = TextAlignment::SIZE_WINDOW
20
17
  @sim_threshold = TextAlignment::TEXT_SIMILARITY_THRESHOLD
21
18
  @pos_s1_final_possible_begin = @s1.length - @size_ngram - 1
19
+ @pos_s2_final_possible_end = @s2.length
22
20
 
23
21
  # positions of last match
24
22
  @pos_s1_last_match = 0
25
23
  @pos_s2_last_match = 0
26
24
  end
27
25
 
28
- def reverse?(source_str = nil, target_str = nil)
29
- unless source_str.nil?
30
- @reverse_p = target_str.length < source_str.length
31
- end
32
- @reverse_p
33
- end
34
-
35
26
  def get_next_anchor
36
27
  # To find the beginning positions of an anchor ngram in s1 and s2, beginning from the last positions matched
37
28
  beg_s2 = for beg_s1 in @pos_s1_last_match .. @pos_s1_final_possible_begin
@@ -49,26 +40,23 @@ class TextAlignment::AnchorFinder
49
40
  # To extend the block to the left
50
41
  b1 = beg_s1
51
42
  b2 = beg_s2
52
- while b1 >= @pos_s1_last_match && b2 >= @pos_s2_last_match && @s1[b1] == @s2[b2]
43
+ left_boundary_b2 = [@pos_s2_last_match, (@cultivation_map.last_cultivated_position(b2) || 0)].max
44
+ while b1 > @pos_s1_last_match && b2 > left_boundary_b2 && @s1[b1 - 1] == @s2[b2 - 1]
53
45
  b1 -= 1; b2 -= 1
54
46
  end
55
- b1 += 1; b2 += 1
56
47
 
57
48
  # To extend the block to the right
58
49
  e1 = beg_s1 + @size_ngram
59
50
  e2 = beg_s2 + @size_ngram
60
- while @s1[e1] && @s1[e1] == @s2[e2]
51
+ right_boundary_b2 = @cultivation_map.next_cultivated_position(e2) || @pos_s2_final_possible_end
52
+ while @s1[e1] && e2 < right_boundary_b2 && @s1[e1] == @s2[e2]
61
53
  e1 += 1; e2 += 1
62
54
  end
63
55
 
64
56
  @pos_s1_last_match = e1
65
57
  @pos_s2_last_match = e2
66
58
 
67
- if reverse?
68
- {source:{begin:b2 , end:e2}, target:{begin:b1, end:e1}}
69
- else
70
- {source:{begin:b1 , end:e1}, target:{begin:b2, end:e2}}
71
- end
59
+ {source:{begin:b1 , end:e1}, target:{begin:b2, end:e2}}
72
60
  end
73
61
 
74
62
  private
@@ -91,13 +79,7 @@ class TextAlignment::AnchorFinder
91
79
  # return nil if the anchor is too much frequent
92
80
  def find_beg_s2_candidates(anchor, search_position)
93
81
  candidates = []
94
- while _beg_s2 = @s2.index(anchor, search_position)
95
- search_again_position = @cultivation_map.search_again_position(_beg_s2)
96
- unless search_again_position.nil?
97
- search_position = search_again_position
98
- next
99
- end
100
-
82
+ while _beg_s2 = @cultivation_map.index(anchor, @s2, search_position)
101
83
  candidates << _beg_s2
102
84
 
103
85
  # for speed, skip anchor of high frequency
@@ -4,16 +4,91 @@ class TextAlignment::CultivationMap
4
4
  attr_reader :map
5
5
 
6
6
  def initialize
7
- @map = {}
7
+ @map = []
8
8
  end
9
9
 
10
10
  def cultivate(regions)
11
- regions.each do |b, e|
12
- (b ... e).each{|p| @map[p] = e}
11
+ @map += regions
12
+ @map.sort!{|a, b| a[0] <=> b[0]}
13
+ new_map = []
14
+ @map.each do |region|
15
+ if new_map.empty?
16
+ new_map << region
17
+ elsif new_map.last[1] > region[0]
18
+ raise "Overlapping regions: #{new_map.last} : #{region}"
19
+ elsif new_map.last[1] == region[0]
20
+ new_map.last[1] == region[1]
21
+ else
22
+ new_map << region
23
+ end
13
24
  end
25
+ @map = new_map
14
26
  end
15
27
 
16
- def search_again_position(position)
17
- @map[position]
28
+ def search_again_position(position, end_position = nil)
29
+ end_position ||= position
30
+ region = @map.bsearch{|r| end_position < r[1]}
31
+ if region.nil? || region[0] > position
32
+ nil
33
+ else
34
+ region[1]
35
+ end
36
+ end
37
+
38
+ def last_cultivated_position(position)
39
+ ridx = @map.rindex{|r| r[1] <= position}
40
+ ridx.nil? ? nil : @map[ridx][1]
41
+ end
42
+
43
+ def next_cultivated_position(position)
44
+ region = @map.bsearch{|r| position < r[0]}
45
+ region.nil? ? nil : region[0]
46
+ end
47
+
48
+ def in_regions(region)
49
+ @map.select{|r| (r[1] > region[0] && r[1] <= region[1]) || (r[0] < region[1] && r[0] >= region[0])}
50
+ end
51
+
52
+ def region_state(region)
53
+ closed_parts = in_regions(region)
54
+ if closed_parts.empty?
55
+ [:open, region]
56
+ else
57
+ if front_open?(region, closed_parts)
58
+ if rear_open?(region, closed_parts)
59
+ [:middle_closed, [closed_parts.first[1], closed_parts.last[0]]]
60
+ else
61
+ [:front_open, [region[0], closed_parts.first[0]]]
62
+ end
63
+ else
64
+ if rear_open?(region, closed_parts)
65
+ [:rear_open, [closed_parts.last[1], region[1]]]
66
+ else
67
+ [:closed, nil]
68
+ end
69
+ end
70
+ end
71
+ end
72
+
73
+ def index(target, string, position)
74
+ length = target.length
75
+ loop do
76
+ _begin = string.index(target, position)
77
+ break if _begin.nil?
78
+ position = search_again_position(_begin)
79
+ next unless position.nil?
80
+ break _begin if region_state([_begin, _begin + length])[0] == :open
81
+ position = _begin + 1
82
+ end
83
+ end
84
+
85
+ private
86
+
87
+ def front_open?(region, closed_parts)
88
+ closed_parts.first[0] > region[0]
89
+ end
90
+
91
+ def rear_open?(region, closed_parts)
92
+ closed_parts.last[1] < region[1]
18
93
  end
19
94
  end
@@ -144,11 +144,17 @@ class TextAlignment::MixedAlignment
144
144
  def compute_similarity(s1, s2, sdiff)
145
145
  return 0 if sdiff.nil?
146
146
 
147
- # compute the lcs only with non-whitespace letters
148
- lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
149
- return 0 if lcs == 0
147
+ # recoverbility
148
+ count_nws = sdiff.count{|d| d.old_element =~ /\S/}
149
+ count_nws_match = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/}
150
150
 
151
- similarity = lcs.to_f / [s1.scan(/\S/).count, s2.scan(/\S/).count].min
152
- end
151
+ coverage = count_nws_match.to_f / count_nws
152
+
153
+ # fragmentation rate
154
+ count_ofrag = sdiff.count{|d| d.old_element =~ /\s/}
155
+ count_frag = sdiff.collect{|d| (d.action == '=') && (d.old_element =~/\s/) ? ' ' : d.action}.join.scan(/=+/).count
156
+ rate_frag = count_ofrag.to_f / count_frag
153
157
 
158
+ similarity = coverage * rate_frag
159
+ end
154
160
  end
@@ -15,8 +15,9 @@ class TextAlignment::TextAlignment
15
15
  def initialize(reference_text, to_prevent_overlap = false)
16
16
  raise ArgumentError, "nil text" if reference_text.nil?
17
17
 
18
- @original_rtext = reference_text
18
+ @original_reference_text = reference_text
19
19
  @rtext_mapping = TextAlignment::CharMapping.new(reference_text)
20
+ @mapped_reference_text = @rtext_mapping.mapped_text
20
21
  @to_prevent_overlap = to_prevent_overlap
21
22
 
22
23
  @original_text = nil
@@ -34,21 +35,19 @@ class TextAlignment::TextAlignment
34
35
  @text_mapping = TextAlignment::CharMapping.new(text)
35
36
  end
36
37
 
37
- text_mapped = @text_mapping.mapped_text
38
+ @mapped_text = @text_mapping.mapped_text
38
39
  denotations_mapped = @text_mapping.enmap_denotations(denotations)
39
40
 
40
- rtext_mapped = @rtext_mapping.mapped_text
41
-
42
41
  ## To generate the block_alignment of the input text against the reference text
43
-
44
42
  # Initialization
45
- @block_alignment = {text: @original_text, reference_text: @original_rtext, denotations: denotations}
43
+ # @block_alignment = {text: @original_text, reference_text: @original_reference_text, denotations: denotations}
44
+ @block_alignment = {text: @mapped_text, reference_text: @mapped_reference_text, denotations: denotations}
46
45
 
47
46
  # Generation
48
- @block_alignment[:blocks] = if r = whole_block_alignment(text_mapped, rtext_mapped, @cultivation_map)
47
+ @block_alignment[:blocks] = if r = whole_block_alignment(@mapped_text, @mapped_reference_text, @cultivation_map)
49
48
  r
50
49
  else
51
- find_block_alignment(text_mapped, rtext_mapped, denotations_mapped, @cultivation_map)
50
+ find_block_alignment(@mapped_text, @mapped_reference_text, denotations_mapped, @cultivation_map)
52
51
  end
53
52
  end
54
53
 
@@ -134,7 +133,7 @@ class TextAlignment::TextAlignment
134
133
  source = {begin:d.begin, end:d.end}
135
134
  d.begin = transform_begin_position(d.begin);
136
135
  d.end = transform_end_position(d.end);
137
- raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_rtext.length
136
+ raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_reference_text.length
138
137
  rescue
139
138
  @lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
140
139
  d.begin = nil
@@ -150,7 +149,7 @@ class TextAlignment::TextAlignment
150
149
 
151
150
  r = hdenotations.collect do |d|
152
151
  t = transform_a_span(d[:span])
153
- raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_rtext.length
152
+ raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_reference_text.length
154
153
  new_d = d.dup.merge({span:t})
155
154
  rescue
156
155
  @lost_annotations << {source: d[:span], target:t}
@@ -161,8 +160,8 @@ class TextAlignment::TextAlignment
161
160
  end
162
161
 
163
162
  def alignment_show
164
- stext = @block_alignment[:text]
165
- ttext = @block_alignment[:reference_text]
163
+ stext = @mapped_text
164
+ ttext = @mapped_reference_text
166
165
 
167
166
  show = ''
168
167
  @block_alignment[:blocks].each do |a|
@@ -218,7 +217,7 @@ class TextAlignment::TextAlignment
218
217
 
219
218
  "***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
220
219
  "[#{astr1}]\n" +
221
- "[#{astr2}]\n\n"
220
+ "[#{astr2.gsub("\n", " ")}]\n\n"
222
221
  end
223
222
  end
224
223
  show
@@ -257,142 +256,109 @@ class TextAlignment::TextAlignment
257
256
  # puts "-=-=-=-=-"
258
257
  # puts
259
258
 
260
- ## to fill the gaps
261
- last_block = nil
262
- blocks2 = blocks.inject([]) do |sum, block|
263
- b1 = last_block ? last_block[:source][:end] : 0
264
- e1 = block[:source][:begin]
259
+ ## To fill the gaps
260
+ ## lblock: last block, cblock: current block
261
+ lblock = nil
262
+ blocks2 = (blocks + [nil]).inject([]) do |sum, cblock|
263
+ b1 = lblock.nil? ? 0 : lblock[:source][:end]
264
+ e1 = cblock.nil? ? str1.length : cblock[:source][:begin]
265
265
 
266
- sum += if b1 == e1
267
- [block]
268
- else
269
- b2 = last_block ? last_block[:target][:end] : 0
270
- e2 = block[:target][:begin]
271
-
272
- if b2 == e2
273
- [
274
- {source:{begin:b1, end:e1}, alignment: :empty},
275
- block
276
- ]
266
+ if b1 < e1
267
+ b2 = lblock.nil? ? 0 : lblock[:target][:end]
268
+ e2 = cblock.nil? ? str2.length : cblock[:target][:begin]
269
+ _str1 = str1[b1 ... e1]
270
+ _str2 = str2[b2 ... e2]
271
+
272
+ sum += if _str1.strip.empty? || _str2.strip.empty?
273
+ [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
277
274
  else
278
275
  len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
279
-
280
- if b1 == 0 && b2 == 0
281
- b2 = e2 - len_buffer if e2 > len_buffer
282
- end
283
-
284
- _str1 = str1[b1 ... e1]
285
- _str2 = str2[b2 ... e2]
286
-
287
- if _str1.strip.empty? || _str2.strip.empty?
288
- [
289
- {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
290
- block
291
- ]
292
- elsif ((e2 - b2) - (e1 - b1)) > len_buffer
293
- la_block1 = local_alignment_blocks(str1, b1, e1, str2, b2, b2 + len_buffer, denotations)
294
- la_block2 = local_alignment_blocks(str1, b1, e1, str2, e2 - len_buffer, e2, denotations)
295
- [la_block2, la_block2].max{|a, b| a.first[:similarity] <=> b.first[:similarity]} << block
296
- else
297
- local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations) << block
276
+ region_state, state_region = cultivation_map.region_state([b2, e2])
277
+ case region_state
278
+ when :closed
279
+ []
280
+ when :front_open
281
+ oe2 = state_region[1]
282
+ me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
283
+ local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
284
+ when :rear_open
285
+ ob2 = state_region[0]
286
+ mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
287
+ local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
288
+ when :middle_closed
289
+ oe2 = state_region[0]
290
+ me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
291
+ attempt1 = local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
292
+ if attempt1.empty?
293
+ ob2 = state_region[1]
294
+ mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
295
+ local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
296
+ else
297
+ attempt1
298
+ end
299
+ else # :open
300
+ if (e2 - b2) > len_buffer
301
+ attempt1 = local_alignment(str1, b1, e1, str2, b2, b2 + len_buffer, denotations, cultivation_map)
302
+ if attempt1.empty?
303
+ local_alignment(str1, b1, e1, str2, e2 - len_buffer, e2, denotations, cultivation_map)
304
+ else
305
+ attempt1
306
+ end
307
+ else
308
+ local_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
309
+ end
298
310
  end
299
311
  end
300
312
  end
301
313
 
302
- last_block = block
303
- sum
314
+ lblock = cblock
315
+ cblock.nil? ? sum : sum << cblock
304
316
  end
305
317
 
306
- # the last step
307
- blocks2 += if last_block.nil?
308
- local_alignment_blocks(str1, 0, str1.length, str2, 0, str2.length, denotations)
309
- else
310
- b1 = last_block[:source][:end]
311
- if b1 < str1.length
312
- e1 = str1.length
313
- b2 = last_block[:target][:end]
314
-
315
- _str1 = str1[b1 ... e1]
316
- if _str1.strip.empty?
317
- [{source:{begin:b1, end:e1}, alignment: :empty}]
318
- else
319
- if b2 < str2.length
320
- len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
321
- e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
322
-
323
- local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
324
- else
325
- [{source:{begin:b1, end:e1}, alignment: :empty}]
326
- end
327
- end
328
- else
329
- []
330
- end
331
- end
332
318
  end
333
319
 
334
320
  def whole_block_alignment(str1, str2, cultivation_map)
335
- ## Block exact match
336
- search_position = 0
337
-
338
- block_begin = begin
339
- _block_begin = str2.index(str1, search_position)
340
- break if _block_begin.nil?
341
- search_position = cultivation_map.search_again_position(_block_begin)
342
- _block_begin
343
- end until search_position.nil?
344
-
345
- unless block_begin.nil?
346
- return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
347
- end
321
+ block_begin = cultivation_map.index(str1, str2, 0)
322
+ return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}] unless block_begin.nil?
348
323
 
349
- search_position = 0
324
+ block_begin = cultivation_map.index(str1.downcase, str2.downcase, 0)
325
+ return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}] unless block_begin.nil?
350
326
 
351
- dstr1 = str1.downcase
352
- dstr2 = str2.downcase
353
- block_begin = begin
354
- _block_begin = dstr2.index(dstr1, search_position)
355
- break if _block_begin.nil?
356
- search_position = cultivation_map.search_again_position(_block_begin)
357
- _block_begin
358
- end until search_position.nil?
327
+ nil
328
+ end
359
329
 
360
- unless block_begin.nil?
361
- return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
330
+ def local_alignment(str1, b1, e1, str2, b2, e2, denotations = nil, cultivation_map)
331
+ tblocks = term_based_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
332
+ if tblocks.empty?
333
+ lcs_alignment(str1, b1, e1, str2, b2, e2, cultivation_map)
334
+ else
335
+ tblocks
362
336
  end
363
-
364
- nil
365
337
  end
366
338
 
367
- def local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations = nil)
368
- block2 = str2[b2 ... e2]
339
+ def term_based_alignment(str1, b1, e1, str2, b2, e2, denotations = nil, cultivation_map)
340
+ str2_block = str2[0 ... e2]
369
341
 
370
342
  ## term-based alignment
371
343
  tblocks = if denotations
372
- ds_in_scope = denotations.select{|d| d[:span][:begin] >= b1 && d[:span][:end] <= e1}.
344
+ denotations_in_scope = denotations.select{|d| d[:span][:begin] >= b1 && d[:span][:end] <= e1}.
373
345
  sort{|d1, d2| d1[:span][:begin] <=> d2[:span][:begin] || d2[:span][:end] <=> d1[:span][:end] }.
374
346
  map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
375
347
 
376
- position = 0
377
- _tblocks = ds_in_scope.map do |term|
378
- lex = term[:lex]
379
- r = block2.index(lex, position)
380
- if r.nil?
381
- position = nil
382
- break
383
- end
384
- position = r + lex.length
385
- {source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, similarity: 0.9, delta: r + b2 - term[:span][:begin]}
348
+ search_position = b2
349
+ _tblocks = denotations_in_scope.map do |denotation|
350
+ lex = denotation[:lex]
351
+ term_begin = cultivation_map.index(lex, str2_block, search_position)
352
+ break [] if term_begin.nil? # break the loop if a missing term is found
353
+ search_position = term_begin + lex.length
354
+ {source:denotation[:span], target:{begin:term_begin, end:term_begin + lex.length}, alignment: :term, similarity: 0.9, delta: term_begin - denotation[:span][:begin]}
386
355
  end
387
356
 
388
- # missing term found
389
- _tblocks = [] if position.nil?
390
-
391
357
  # redundant matching found
392
- unless position.nil?
393
- ds_in_scope.each do |term|
394
- lex = term[:lex]
395
- look_forward = block2.index(lex, position)
358
+ unless _tblocks.empty?
359
+ search_position = _tblocks.last[:target][:end]
360
+ denotations_in_scope.each do |term|
361
+ look_forward = cultivation_map.index(term[:lex], str2_block, search_position)
396
362
  unless look_forward.nil?
397
363
  _tblocks = []
398
364
  break
@@ -405,72 +371,37 @@ class TextAlignment::TextAlignment
405
371
  []
406
372
  end
407
373
 
408
- if tblocks.empty?
409
- if b1 == 0 && e1 == str1.length
410
- if (e1 > 2000) || (e2 > 2000)
411
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
412
- else
413
- block1 = str1[b1 ... e1]
414
- block2 = str2[b2 ... e2]
415
-
416
- ## character-based alignment
417
- alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
418
- if alignment.sdiff.nil?
419
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
420
- else
421
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
422
- end
423
- end
424
- else
425
- block1 = str1[b1 ... e1]
426
- block2 = str2[b2 ... e2]
374
+ ltblock = nil
375
+ tblocks2 = (tblocks + [nil]).inject([]) do |sum, ctblock|
376
+ tb1 = ltblock.nil? ? b1 : ltblock[:source][:end]
377
+ te1 = ctblock.nil? ? e1 : ctblock[:source][:begin]
427
378
 
428
- ## character-based alignment
429
- alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
430
- if alignment.sdiff.nil?
431
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
432
- else
433
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
434
- end
379
+ if te1 > tb1
380
+ tb2 = ltblock.nil? ? b2 : ltblock[:target][:end]
381
+ te2 = ctblock.nil? ? e2 : ctblock[:target][:begin]
382
+ sum << {source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty}
435
383
  end
436
- else
437
- last_tblock = nil
438
- lblocks = tblocks.inject([]) do |sum, tblock|
439
- tb1 = last_tblock ? last_tblock[:source][:end] : b1
440
- te1 = tblock[:source][:begin]
441
384
 
442
- sum += if te1 == tb1
443
- [tblock]
444
- else
445
- tb2 = last_tblock ? last_tblock[:target][:end] : b2
446
- te2 = tblock[:target][:begin]
447
-
448
- if b2 == e2
449
- [
450
- {source:{begin:tb1, end:te1}, alignment: :empty},
451
- tblock
452
- ]
453
- else
454
- [
455
- {source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty},
456
- tblock
457
- ]
458
- end
459
- end
385
+ ltblock = ctblock
386
+ ctblock.nil? ? sum : sum << ctblock
387
+ end
460
388
 
461
- last_tblock = tblock
462
- sum
463
- end
389
+ tblocks2
390
+ end
464
391
 
465
- if last_tblock[:source][:end] < e1
466
- if last_tblock[:target][:end] < e2
467
- lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, target:{begin:last_tblock[:target][:end], end:e2}, alignment: :empty}
468
- else
469
- lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, alignment: :empty}
470
- end
471
- end
392
+ def lcs_alignment(str1, b1, e1, str2, b2, e2, cultivation_map)
393
+ source = {begin:b1, end:e1}
394
+ target = {begin:b2, end:e2}
472
395
 
473
- lblocks
396
+ if (e1 - b1) > 2000
397
+ [{source:source, target:target, alignment: :empty}]
398
+ else
399
+ alignment = TextAlignment::MixedAlignment.new(str1[b1 ... e1].downcase, str2[b2 ... e2].downcase)
400
+ if alignment.similarity < 0.5
401
+ [{source:source, target:target, alignment: :empty}]
402
+ else
403
+ [{source:source, target:target, alignment: alignment, similarity: alignment.similarity}]
404
+ end
474
405
  end
475
406
  end
476
407
 
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.11.0'
2
+ VERSION = '0.11.1'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.11.0
4
+ version: 0.11.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-03-04 00:00:00.000000000 Z
11
+ date: 2021-03-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary