text_alignment 0.9 → 0.11.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,94 @@
1
module TextAlignment; end unless defined? TextAlignment

# Book-keeping of the "cultivated" regions of a text, i.e. the spans that
# have already been claimed by an earlier alignment and must not be matched
# again.
#
# The map is an array of two-element arrays [begin, end], kept sorted by
# begin position and free of overlaps. Regions that touch each other
# (end of one == begin of the next) are merged into one.
class TextAlignment::CultivationMap
  # @return [Array<Array(Integer, Integer)>] the sorted, merged regions
  attr_reader :map

  def initialize
    @map = []
  end

  # Adds regions to the map, merging regions that touch.
  #
  # The new map is built on copies and only stored once it is known to be
  # valid, so neither the caller's arrays nor the current map are modified
  # when an overlap is detected.
  #
  # @param regions [Array<Array(Integer, Integer)>] [begin, end] pairs
  # @return [Array<Array(Integer, Integer)>] the updated map
  # @raise [RuntimeError] if two regions overlap
  def cultivate(regions)
    candidates = (@map + regions).map(&:dup).sort_by(&:first)

    merged = candidates.each_with_object([]) do |region, acc|
      last = acc.last
      if last.nil?
        acc << region
      elsif last[1] > region[0]
        raise "Overlapping regions: #{last} : #{region}"
      elsif last[1] == region[0]
        # Touching regions: extend the previous one. (This used to be a
        # no-op comparison, which silently dropped the adjacent region.)
        last[1] = region[1]
      else
        acc << region
      end
    end

    @map = merged
  end

  # Tells where a search has to be resumed when +position+ falls inside a
  # cultivated region.
  #
  # @param position [Integer] the position to test
  # @param end_position [Integer, nil] upper bound used to pick the region;
  #   defaults to +position+
  # @return [Integer, nil] the end of the region that covers +position+, or
  #   nil when +position+ is not inside a cultivated region
  def search_again_position(position, end_position = nil)
    end_position ||= position
    # First region whose end lies beyond end_position (the map is sorted).
    region = @map.bsearch { |r| end_position < r[1] }
    return nil if region.nil? || region[0] > position

    region[1]
  end

  # @param position [Integer]
  # @return [Integer, nil] end of the last region that ends at or before
  #   +position+, or nil when there is none
  def last_cultivated_position(position)
    ridx = @map.rindex { |r| r[1] <= position }
    ridx.nil? ? nil : @map[ridx][1]
  end

  # @param position [Integer]
  # @return [Integer, nil] begin of the first region that starts after
  #   +position+, or nil when there is none
  def next_cultivated_position(position)
    region = @map.bsearch { |r| position < r[0] }
    region.nil? ? nil : region[0]
  end

  # Selects the cultivated regions that intersect +region+.
  #
  # @param region [Array(Integer, Integer)] [begin, end]
  # @return [Array<Array(Integer, Integer)>]
  def in_regions(region)
    from = region[0]
    to = region[1]
    @map.select do |r|
      (r[1] > from && r[1] <= to) ||   # ends inside the query
        (r[0] < to && r[0] >= from) || # begins inside the query
        (r[0] < from && r[1] > to)     # strictly encloses the query
    end
  end

  # Classifies +region+ with respect to the cultivated regions.
  #
  # @param region [Array(Integer, Integer)] [begin, end]
  # @return [Array] one of
  #   [:open, region]                 - nothing cultivated inside;
  #   [:middle_closed, [cb, ce]]      - open at both ends, cultivated from cb to ce;
  #   [:front_open, [region begin, cb]] - only the leading part is open;
  #   [:rear_open, [ce, region end]]    - only the trailing part is open;
  #   [:closed, nil]                  - neither end is open.
  def region_state(region)
    closed_parts = in_regions(region)
    return [:open, region] if closed_parts.empty?

    front = front_open?(region, closed_parts)
    rear = rear_open?(region, closed_parts)

    if front && rear
      [:middle_closed, [closed_parts.first[0], closed_parts.last[1]]]
    elsif front
      [:front_open, [region[0], closed_parts.first[0]]]
    elsif rear
      [:rear_open, [closed_parts.last[1], region[1]]]
    else
      [:closed, nil]
    end
  end

  # Like String#index, but skips occurrences that touch a cultivated region.
  #
  # @param target [String] the string to look for
  # @param string [String] the string to search in
  # @param position [Integer] where to start searching
  # @return [Integer, nil] begin position of the first occurrence lying
  #   entirely in open territory, or nil when there is none
  def index(target, string, position = 0)
    length = target.length
    loop do
      found = string.index(target, position)
      break if found.nil?

      # If the hit starts inside a cultivated region, resume after it.
      position = search_again_position(found)
      next unless position.nil?

      break found if region_state([found, found + length])[0] == :open

      position = found + 1
    end
  end

  private

  # True when +region+ starts before the first cultivated part.
  def front_open?(region, closed_parts)
    closed_parts.first[0] > region[0]
  end

  # True when +region+ ends after the last cultivated part.
  def rear_open?(region, closed_parts)
    closed_parts.last[1] < region[1]
  end
end
@@ -5,7 +5,7 @@ require 'text_alignment/find_divisions'
5
5
  require 'text_alignment/lcs_comparison'
6
6
  require 'text_alignment/lcs_alignment'
7
7
  require 'text_alignment/glcs_alignment'
8
- require 'text_alignment/mappings'
8
+ require 'text_alignment/char_mapping'
9
9
 
10
10
  module TextAlignment; end unless defined? TextAlignment
11
11
 
@@ -106,7 +106,7 @@ if __FILE__ == $0
106
106
 
107
107
  dictionary = [["β", "beta"]]
108
108
  # align = TextAlignment::TextAlignment.new(str1, str2)
109
- align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::MAPPINGS)
109
+ align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::CHAR_MAPPING)
110
110
  p align.common_elements
111
111
  p align.mapped_elements
112
112
  end
@@ -6,7 +6,7 @@ require 'text_alignment/lcs_comparison'
6
6
  require 'text_alignment/lcs_alignment'
7
7
  require 'text_alignment/lcs_cdiff'
8
8
  require 'text_alignment/glcs_alignment'
9
- require 'text_alignment/mappings'
9
+ require 'text_alignment/char_mapping'
10
10
 
11
11
  module TextAlignment; end unless defined? TextAlignment
12
12
 
@@ -20,7 +20,9 @@ class TextAlignment::MixedAlignment
20
20
  def initialize(_str1, _str2, _mappings = nil)
21
21
  raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
22
22
 
23
- str1, str2, mappings = TextAlignment::long_to_one_mapping_preprocessing(_str1, _str2, _mappings)
23
+ mappings ||= TextAlignment::CHAR_MAPPING
24
+ str1 = _str1.dup
25
+ str2 = _str2.dup
24
26
 
25
27
  _compute_mixed_alignment(str1, str2, mappings)
26
28
  end
@@ -63,7 +65,7 @@ class TextAlignment::MixedAlignment
63
65
  end
64
66
 
65
67
  cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
66
- @similarity = TextAlignment::compute_similarity(str1, str2, @sdiff)
68
+ @similarity = compute_similarity(str1, str2, @sdiff)
67
69
  @str1_match_initial = cmp.str1_match_initial
68
70
  @str1_match_final = cmp.str1_match_final
69
71
  @str2_match_initial = cmp.str2_match_initial
@@ -139,4 +141,20 @@ class TextAlignment::MixedAlignment
139
141
  @position_map_end = posmap_end.sort.to_h
140
142
  end
141
143
 
144
# Scores an alignment from its sdiff.
#
# The score is the product of
# * coverage: the share of non-whitespace characters of the old text that
#   are matched ('=' action), and
# * a fragmentation rate: the number of whitespace characters of the old
#   text divided by the number of contiguous runs of matched
#   non-whitespace characters.
#
# @param s1 [String] not used by the computation
# @param s2 [String] not used by the computation
# @param sdiff [Array, nil] entries responding to #action and #old_element
# @return [Numeric] 0 when +sdiff+ is nil, 0.0 when no non-whitespace
#   character is matched, the product described above otherwise
def compute_similarity(s1, s2, sdiff)
  return 0 if sdiff.nil?

  # recoverability
  count_nws = sdiff.count { |d| d.old_element =~ /\S/ }
  count_nws_match = sdiff.count { |d| d.action == '=' && d.old_element =~ /\S/ }

  # Without a single matched character there is no matched run either, so
  # both ratios below would divide by zero and the product would be NaN.
  # NaN compares false against any threshold, so report 0.0 instead.
  return 0.0 if count_nws_match.zero?

  coverage = count_nws_match.to_f / count_nws

  # fragmentation rate
  count_ofrag = sdiff.count { |d| d.old_element =~ /\s/ }
  # Matched whitespace is turned into a separator so that each word counts
  # as a run of its own; every other entry contributes its action character.
  actions = sdiff.collect { |d| (d.action == '=') && (d.old_element =~ /\s/) ? ' ' : d.action }
  count_frag = actions.join.scan(/=+/).count
  rate_frag = count_ofrag.to_f / count_frag

  coverage * rate_frag
end
142
160
  end
@@ -2,6 +2,7 @@
2
2
  require 'text_alignment/constants'
3
3
  require 'text_alignment/anchor_finder'
4
4
  require 'text_alignment/mixed_alignment'
5
+ require 'text_alignment/cultivation_map'
5
6
 
6
7
  module TextAlignment; end unless defined? TextAlignment
7
8
 
@@ -10,255 +11,48 @@ class TextAlignment::TextAlignment
10
11
  attr_reader :similarity
11
12
  attr_reader :lost_annotations
12
13
 
13
- def initialize(_str1, _str2, denotations = nil, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
14
- raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
14
+ # Initialize with a reference text, against which texts will be aligned
15
+ def initialize(reference_text, to_prevent_overlap = false)
16
+ raise ArgumentError, "nil text" if reference_text.nil?
15
17
 
16
- @block_alignment = {source_text: _str1, target_text: _str2, denotations: denotations}
17
- @original_str1 = _str1
18
- @original_str2 = _str2
18
+ @original_reference_text = reference_text
19
+ @rtext_mapping = TextAlignment::CharMapping.new(reference_text)
20
+ @mapped_reference_text = @rtext_mapping.mapped_text
21
+ @to_prevent_overlap = to_prevent_overlap
19
22
 
20
- str1, str2, @mappings = TextAlignment::single_character_mapping_preprocessing(_str1, _str2)
21
-
22
- if r = whole_block_alignment(str1, str2)
23
- @block_alignment[:blocks] = r
24
- return
25
- end
26
-
27
- ## to find block alignments
28
- anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
29
-
30
- blocks = []
31
- while block = anchor_finder.get_next_anchor
32
- last = blocks.last
33
- if last && (block[:source][:begin] == last[:source][:end] + 1) && (block[:target][:begin] == last[:target][:end] + 1)
34
- last[:source][:end] = block[:source][:end]
35
- last[:target][:end] = block[:target][:end]
36
- else
37
- blocks << block.merge(alignment: :block, delta: block[:target][:begin] - block[:source][:begin])
38
- end
39
- end
40
-
41
- # pp blocks
42
- # puts "-----"
43
- # puts
44
- # exit
45
- # blocks.each do |b|
46
- # p [b[:source], b[:target]]
47
- # puts "---"
48
- # puts str1[b[:source][:begin] ... b[:source][:end]]
49
- # puts "---"
50
- # puts str2[b[:target][:begin] ... b[:target][:end]]
51
- # puts "====="
52
- # puts
53
- # end
54
- # puts "-=-=-=-=-"
55
- # puts
56
-
57
- ## to fill the gaps
58
- last_block = nil
59
- blocks2 = blocks.inject([]) do |sum, block|
60
- b1 = last_block ? last_block[:source][:end] : 0
61
- e1 = block[:source][:begin]
62
-
63
- sum += if b1 == e1
64
- [block]
65
- else
66
- b2 = last_block ? last_block[:target][:end] : 0
67
- e2 = block[:target][:begin]
68
-
69
- if b2 == e2
70
- [
71
- {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
72
- block
73
- ]
74
- else
75
- if b1 == 0 && b2 == 0
76
- len_buffer = (e1 * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
77
- b2 = e2 - len_buffer if e2 > len_buffer
78
- end
79
-
80
- _str1 = str1[b1 ... e1]
81
- _str2 = str2[b2 ... e2]
82
-
83
- if _str1.strip.empty? || _str2.strip.empty?
84
- [
85
- {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
86
- block
87
- ]
88
- else
89
- local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations) << block
90
- end
91
- end
92
- end
93
-
94
- last_block = block
95
- sum
96
- end
97
-
98
- # the last step
99
- blocks2 += if last_block.nil?
100
- local_alignment_blocks(str1, 0, str1.length, str2, 0, str2.length, denotations)
101
- else
102
- b1 = last_block[:source][:end]
103
- if b1 < str1.length
104
- e1 = str1.length
105
-
106
- b2 = last_block[:target][:end]
107
- if b2 < str2.length
108
- len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
109
- e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
110
- local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
111
- else
112
- [{source:{begin:last_block[:source][:end], end:str1.length}, alignment: :empty}]
113
- end
114
- else
115
- []
116
- end
117
- end
118
-
119
- @block_alignment[:blocks] = blocks2
23
+ @original_text = nil
24
+ @blocks = nil
25
+ @cultivation_map = TextAlignment::CultivationMap.new
120
26
  end
121
27
 
122
- def whole_block_alignment(str1, str2)
123
- ## Block exact match
124
- block_begin = str2.index(str1)
125
- unless block_begin.nil?
126
- return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
127
- end
28
+ def align(text, denotations = nil)
29
+ # To maintain the cultivation map
30
+ update_cultivation_map if @to_prevent_overlap
128
31
 
129
- block_begin = str2.downcase.index(str1.downcase)
130
- unless block_begin.nil?
131
- return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
32
+ # In case the input text is the same as the previous one, reuse the previous text mapping
33
+ unless @original_text && @original_text == text
34
+ @original_text = text
35
+ @text_mapping = TextAlignment::CharMapping.new(text)
132
36
  end
133
37
 
134
- nil
135
- end
136
-
137
- def local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations = nil)
138
- block2 = str2[b2 ... e2]
139
-
140
- ## term-based alignment
141
- tblocks = if denotations
142
- ds_in_scope = denotations.select{|d| d[:span][:begin] >= b1 && d[:span][:end] <= e1}.
143
- sort{|d1, d2| d1[:span][:begin] <=> d2[:span][:begin] || d2[:span][:end] <=> d1[:span][:end] }.
144
- map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
145
-
146
- position = 0
147
- tblocks = ds_in_scope.map do |term|
148
- lex = term[:lex]
149
- r = block2.index(lex, position)
150
- if r.nil?
151
- position = nil
152
- break
153
- end
154
- position = r + lex.length
155
- {source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, delta: r - term[:span][:begin]}
156
- end
157
-
158
- # missing term found
159
- tblocks = [] if position.nil?
38
+ @mapped_text = @text_mapping.mapped_text
39
+ denotations_mapped = @text_mapping.enmap_denotations(denotations)
160
40
 
161
- # redundant matching found
162
- unless position.nil?
163
- ds_in_scope.each do |term|
164
- lex = term[:lex]
165
- look_forward = block2.index(lex, position)
166
- unless look_forward.nil?
167
- tblocks = []
168
- break
169
- end
170
- end
171
- end
172
-
173
- tblocks
41
+ ## To generate the block_alignment of the input text against the reference text
42
+ @blocks = if r = whole_block_alignment(@mapped_text, @mapped_reference_text, @cultivation_map)
43
+ r
174
44
  else
175
- []
45
+ find_block_alignment(@mapped_text, @mapped_reference_text, denotations_mapped, @cultivation_map)
176
46
  end
177
47
 
178
- if tblocks.empty?
179
- if b1 == 0 && e1 == str1.length
180
- if (e1 > 2000) || (e2 > 2000)
181
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
182
- else
183
- block1 = str1[b1 ... e1]
184
- block2 = str2[b2 ... e2]
185
-
186
- ## character-based alignment
187
- alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase, @mappings)
188
- if alignment.sdiff.nil?
189
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
190
- else
191
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
192
- end
193
- end
194
- else
195
- block1 = str1[b1 ... e1]
196
- block2 = str2[b2 ... e2]
197
-
198
- ## character-based alignment
199
- alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase, @mappings)
200
- if alignment.sdiff.nil?
201
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
202
- else
203
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
204
- end
205
- end
206
- else
207
- last_tblock = nil
208
- lblocks = tblocks.inject([]) do |sum, tblock|
209
- tb1 = last_tblock ? last_tblock[:source][:end] : b1
210
- te1 = tblock[:source][:begin]
211
-
212
- sum += if te1 == tb1
213
- [tblock]
214
- else
215
- tb2 = last_tblock ? last_tblock[:target][:end] : b2
216
- te2 = tblock[:target][:begin]
217
-
218
- if b2 == e2
219
- [
220
- {source:{begin:tb1, end:te1}, alignment: :empty},
221
- tblock
222
- ]
223
- else
224
- [
225
- {source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty},
226
- tblock
227
- ]
228
- end
229
- end
230
-
231
- last_tblock = tblock
232
- sum
233
- end
234
-
235
- if last_tblock[:source][:end] < e1
236
- if last_tblock[:target][:end] < e2
237
- lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, target:{begin:last_tblock[:target][:end], end:e2}, alignment: :empty}
238
- else
239
- lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, alignment: :empty}
240
- end
241
- end
242
-
243
- lblocks
244
- end
48
+ @block_alignment = {text: @original_text, reference_text: @original_reference_text, denotations: denotations, blocks: demap_blocks(@blocks)}
245
49
  end
246
50
 
51
+ def transform_begin_position(_begin_position)
52
+ begin_position = @text_mapping.enmap_position(_begin_position)
247
53
 
248
- def indices(str, target)
249
- position = 0
250
- len = target.len
251
- Enumerator.new do |yielder|
252
- while idx = str.index(target, position)
253
- yielder << idx
254
- position = idx + len
255
- end
256
- end
257
- end
258
-
259
- def transform_begin_position(begin_position)
260
- i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
261
- block = @block_alignment[:blocks][i]
54
+ i = @blocks.index{|b| b[:source][:end] > begin_position}
55
+ block = @blocks[i]
262
56
 
263
57
  b = if block[:alignment] == :block || block[:alignment] == :term
264
58
  begin_position + block[:delta]
@@ -272,11 +66,15 @@ class TextAlignment::TextAlignment
272
66
  r = block[:alignment].transform_begin_position(begin_position - block[:source][:begin])
273
67
  r.nil? ? nil : r + block[:target][:begin]
274
68
  end
69
+
70
+ @rtext_mapping.demap_position(b)
275
71
  end
276
72
 
277
- def transform_end_position(end_position)
278
- i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
279
- block = @block_alignment[:blocks][i]
73
+ def transform_end_position(_end_position)
74
+ end_position = @text_mapping.enmap_position(_end_position)
75
+
76
+ i = @blocks.index{|b| b[:source][:end] >= end_position}
77
+ block = @blocks[i]
280
78
 
281
79
  e = if block[:alignment] == :block || block[:alignment] == :term
282
80
  end_position + block[:delta]
@@ -290,6 +88,8 @@ class TextAlignment::TextAlignment
290
88
  r = block[:alignment].transform_end_position(end_position - block[:source][:begin])
291
89
  r.nil? ? nil : r + block[:target][:begin]
292
90
  end
91
+
92
+ @rtext_mapping.demap_position(e)
293
93
  end
294
94
 
295
95
  def transform_a_span(span)
@@ -308,7 +108,7 @@ class TextAlignment::TextAlignment
308
108
  source = {begin:d.begin, end:d.end}
309
109
  d.begin = transform_begin_position(d.begin);
310
110
  d.end = transform_end_position(d.end);
311
- raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_str2.length
111
+ raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_reference_text.length
312
112
  rescue
313
113
  @lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
314
114
  d.begin = nil
@@ -324,7 +124,7 @@ class TextAlignment::TextAlignment
324
124
 
325
125
  r = hdenotations.collect do |d|
326
126
  t = transform_a_span(d[:span])
327
- raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_str2.length
127
+ raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_reference_text.length
328
128
  new_d = d.dup.merge({span:t})
329
129
  rescue
330
130
  @lost_annotations << {source: d[:span], target:t}
@@ -335,8 +135,8 @@ class TextAlignment::TextAlignment
335
135
  end
336
136
 
337
137
  def alignment_show
338
- stext = @block_alignment[:source_text]
339
- ttext = @block_alignment[:target_text]
138
+ stext = @block_alignment[:text]
139
+ ttext = @block_alignment[:reference_text]
340
140
 
341
141
  show = ''
342
142
  @block_alignment[:blocks].each do |a|
@@ -392,9 +192,242 @@ class TextAlignment::TextAlignment
392
192
 
393
193
  "***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
394
194
  "[#{astr1}]\n" +
395
- "[#{astr2}]\n\n"
195
+ "[#{astr2.gsub("\n", " ")}]\n\n"
396
196
  end
397
197
  end
398
198
  show
399
199
  end
200
+
201
+ private
202
+
203
# Aligns str1 against str2 block by block.
#
# First the anchors reported by TextAlignment::AnchorFinder are collected
# (anchors that directly continue the previous one are fused with it).
# Then every stretch of str1 between two anchors, before the first anchor
# and after the last one is given a block of its own: either an :empty
# block or whatever local_alignment returns for a suitable window of str2.
#
# @param str1 [String] the text to align
# @param str2 [String] the reference text
# @param denotations [Array<Hash>, nil] passed through to local_alignment
# @param cultivation_map [TextAlignment::CultivationMap] regions of str2
#   that must not be used again
# @return [Array<Hash>] the blocks, anchors and gap blocks interleaved
def find_block_alignment(str1, str2, denotations, cultivation_map)
  anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map)

  ## Collect the anchors, fusing the ones that continue their predecessor
  anchors = []
  while (anchor = anchor_finder.get_next_anchor)
    previous = anchors.last
    adjoining = previous &&
                (anchor[:source][:begin] == previous[:source][:end] + 1) &&
                (anchor[:target][:begin] == previous[:target][:end] + 1)

    if adjoining
      previous[:source][:end] = anchor[:source][:end]
      previous[:target][:end] = anchor[:target][:end]
    else
      offset = anchor[:target][:begin] - anchor[:source][:begin]
      anchors << anchor.merge(alignment: :block, delta: offset)
    end
  end

  ## Fill the gaps
  ## preceding: the anchor before the gap, following: the anchor after it
  filled = []
  preceding = nil

  (anchors + [nil]).each do |following|
    b1 = preceding.nil? ? 0 : preceding[:source][:end]
    e1 = following.nil? ? str1.length : following[:source][:begin]

    if b1 < e1
      b2 = preceding.nil? ? 0 : preceding[:target][:end]
      e2 = following.nil? ? str2.length : following[:target][:begin]
      gap1 = str1[b1...e1]
      gap2 = str2[b2...e2]

      gap_blocks =
        if gap1.strip.empty? || gap2.strip.empty?
          [{source: {begin: b1, end: e1}, target: {begin: b2, end: e2}, alignment: :empty}]
        else
          len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
          state, state_region = cultivation_map.region_state([b2, e2])

          nothing_before = filled.empty? # no preceding matched block
          nothing_after = following.nil? # no following matched block

          # A gap of str1 that gets no counterpart in str2
          unaligned = -> { [{source: {begin: b1, end: e1}, alignment: :empty}] }
          # Align against the window that starts at b2 and reaches at most up to +limit+
          forward = lambda do |limit|
            stop = (limit - b2) > len_buffer ? b2 + len_buffer : limit
            local_alignment(str1, b1, e1, str2, b2, stop, denotations, cultivation_map)
          end
          # Align against the window that ends at e2 and reaches at most back to +limit+
          backward = lambda do |limit|
            start = (e2 - limit) > len_buffer ? e2 - len_buffer : limit
            local_alignment(str1, b1, e1, str2, start, e2, denotations, cultivation_map)
          end
          fruitless = ->(attempt) { attempt.empty? || attempt.first[:alignment] == :empty }

          case state
          when :closed
            unaligned.call
          when :front_open
            nothing_before ? unaligned.call : forward.call(state_region[1])
          when :rear_open
            nothing_after ? unaligned.call : backward.call(state_region[0])
          when :middle_closed
            first_try = nothing_before ? unaligned.call : forward.call(state_region[0])
            if fruitless.call(first_try) && !nothing_after
              backward.call(state_region[1])
            else
              first_try
            end
          else # :open
            if (e2 - b2) > len_buffer
              first_try = nothing_before ? unaligned.call : forward.call(e2)
              if fruitless.call(first_try) && !nothing_after
                backward.call(b2)
              else
                first_try
              end
            else
              local_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
            end
          end
        end

      filled += gap_blocks
    end

    preceding = following
    filled << following unless following.nil?
  end

  filled
end
310
+
311
# Tries to place str1 as a whole inside str2, first as is and then
# ignoring case, using cultivation_map.index for the search.
#
# @param str1 [String] the text to place
# @param str2 [String] the reference text
# @param cultivation_map [#index] searched with (target, string)
# @return [Array<Hash>, nil] a single :block entry, or nil when str1 is
#   not found either way
def whole_block_alignment(str1, str2, cultivation_map)
  as_block = lambda do |offset|
    [{source: {begin: 0, end: str1.length}, target: {begin: offset, end: offset + str1.length}, delta: offset, alignment: :block}]
  end

  exact = cultivation_map.index(str1, str2)
  return as_block.call(exact) unless exact.nil?

  caseless = cultivation_map.index(str1.downcase, str2.downcase)
  return as_block.call(caseless) unless caseless.nil?

  nil
end
320
+
321
# Aligns str1[b1...e1] against a window of str2: the result of
# term_based_alignment is used when it is not empty, otherwise the
# result of lcs_alignment.
#
# @return [Array<Hash>] the blocks produced by whichever method was used
def local_alignment(str1, b1, e1, str2, b2, e2, denotations = nil, cultivation_map)
  term_blocks = term_based_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
  return term_blocks unless term_blocks.empty?

  lcs_alignment(str1, b1, e1, str2, b2, e2, cultivation_map)
end
329
+
330
# Aligns str1[b1...e1] to str2 by locating the denoted terms, in order,
# in str2 (from b2 on, up to e2, skipping cultivated regions).
#
# The attempt is given up, and an empty array returned, when
# * there are no denotations in scope,
# * one of the terms cannot be found, or
# * one of the terms occurs again after the last located term
#   (the match would be ambiguous).
# Returning an empty array is what lets local_alignment fall back to
# lcs_alignment; previously a single :empty block was always returned
# in these cases, which made that fallback unreachable.
#
# Otherwise the located terms are returned as :term blocks, with the
# stretches of str1 between them filled by :empty blocks.
#
# @param denotations [Array<Hash>, nil] each with
#   span: {begin: Integer, end: Integer} in str1 coordinates
# @param cultivation_map [#index] searched with (target, string, position)
# @return [Array<Hash>]
def term_based_alignment(str1, b1, e1, str2, b2, e2, denotations = nil, cultivation_map)
  return [] unless denotations

  str2_block = str2[0...e2]

  # Order by begin position; among equal begins the longer span first.
  # (The former comparator `a <=> b || c <=> d` never reached the second
  # comparison, because 0 is truthy.)
  denotations_in_scope = denotations
    .select { |d| d[:span][:begin] >= b1 && d[:span][:end] <= e1 }
    .sort_by { |d| [d[:span][:begin], -d[:span][:end]] }
    .map { |d| d.merge(lex: str1[d[:span][:begin]...d[:span][:end]]) }

  search_position = b2
  tblocks = denotations_in_scope.map do |denotation|
    lex = denotation[:lex]
    term_begin = cultivation_map.index(lex, str2_block, search_position)
    break [] if term_begin.nil? # give up as soon as a term is missing

    search_position = term_begin + lex.length
    {source: denotation[:span], target: {begin: term_begin, end: term_begin + lex.length}, alignment: :term, similarity: 0.9, delta: term_begin - denotation[:span][:begin]}
  end

  # Nothing located: let the caller try another method.
  return [] if tblocks.empty?

  # Redundant matching: a term that shows up again further on
  search_position = tblocks.last[:target][:end]
  redundant = denotations_in_scope.any? do |term|
    !cultivation_map.index(term[:lex], str2_block, search_position).nil?
  end
  return [] if redundant

  # Fill the stretches before, between and after the located terms
  ltblock = nil
  (tblocks + [nil]).each_with_object([]) do |ctblock, filled|
    tb1 = ltblock.nil? ? b1 : ltblock[:source][:end]
    te1 = ctblock.nil? ? e1 : ctblock[:source][:begin]

    if te1 > tb1
      tb2 = ltblock.nil? ? b2 : ltblock[:target][:end]
      te2 = ctblock.nil? ? e2 : ctblock[:target][:begin]
      filled << {source: {begin: tb1, end: te1}, target: {begin: tb2, end: te2}, alignment: :empty}
    end

    ltblock = ctblock
    filled << ctblock unless ctblock.nil?
  end
end
382
+
383
# Character-level alignment of str1[b1...e1] against str2[b2...e2]
# (both downcased) by means of TextAlignment::MixedAlignment.
#
# No alignment is attempted when the source stretch is longer than 2000
# characters, and an alignment with a similarity below 0.5 is discarded;
# in both cases the stretch is reported as an :empty block.
#
# @param cultivation_map not used here
# @return [Array<Hash>] exactly one block
def lcs_alignment(str1, b1, e1, str2, b2, e2, cultivation_map)
  span = {source: {begin: b1, end: e1}, target: {begin: b2, end: e2}}
  return [span.merge(alignment: :empty)] if (e1 - b1) > 2000

  alignment = TextAlignment::MixedAlignment.new(str1[b1...e1].downcase, str2[b2...e2].downcase)
  return [span.merge(alignment: :empty)] if alignment.similarity < 0.5

  [span.merge(alignment: alignment, similarity: alignment.similarity)]
end
398
+
399
# Registers the target regions of the :block and :term blocks of the
# latest alignment (@blocks) in @cultivation_map. Does nothing when no
# alignment has been made yet.
#
# Consecutive regions that are at most one position apart are condensed
# into one region before they are handed over.
def update_cultivation_map
  return if @blocks.nil?

  matched = @blocks.select { |b| b[:alignment] == :block || b[:alignment] == :term }

  ## To condense the newly cultivated regions
  newly_cultivated_regions = []
  matched.each do |b|
    region = [b[:target][:begin], b[:target][:end]]
    latest = newly_cultivated_regions.last

    if latest.nil? || (latest.last + 1 < region.first)
      newly_cultivated_regions.push region
    else
      latest[1] = region.last
    end
  end

  @cultivation_map.cultivate(newly_cultivated_regions)
end
420
+
421
# Returns copies of the given blocks with their positions passed through
# demap_position: source positions through @text_mapping, target
# positions through @rtext_mapping. Blocks lacking :source or :target
# keep that part untouched. The given blocks are not modified.
#
# @param _blocks [Array<Hash>, nil]
# @return [Array<Hash>, nil] nil when nil is given
def demap_blocks(_blocks)
  return nil if _blocks.nil?

  _blocks.map do |block|
    demapped = block.dup

    if demapped[:source]
      span = demapped[:source]
      demapped[:source] = {begin: @text_mapping.demap_position(span[:begin]), end: @text_mapping.demap_position(span[:end])}
    end

    if demapped[:target]
      span = demapped[:target]
      demapped[:target] = {begin: @rtext_mapping.demap_position(span[:begin]), end: @rtext_mapping.demap_position(span[:end])}
    end

    demapped
  end
end
432
+
400
433
  end