text_alignment 0.9.1 → 0.11.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,94 @@
1
module TextAlignment; end unless defined? TextAlignment

# Keeps a sorted list of "cultivated" regions of a text and answers
# positional queries against it.
#
# A region is a two-element array +[begin, end]+. Regions stored in #map are
# kept sorted by their begin position and never overlap; a region that starts
# exactly where the previous one ends is merged into it.
class TextAlignment::CultivationMap
  # The current list of cultivated regions, sorted by begin position.
  attr_reader :map

  def initialize
    @map = []
  end

  # Adds +regions+ to the map.
  #
  # Regions that touch (previous end == next begin) are merged into one.
  # The given arrays are copied, so the caller's regions are never modified.
  #
  # @param regions [Array<Array(Integer, Integer)>] regions to add
  # @return [Array] the updated map
  # @raise [RuntimeError] if two regions overlap; the map is left unchanged
  def cultivate(regions)
    merged = []
    (@map + regions).sort_by { |r| r[0] }.each do |region|
      last = merged.last
      if last.nil? || last[1] < region[0]
        merged << region.dup
      elsif last[1] > region[0]
        raise "Overlapping regions: #{last} : #{region}"
      else
        # Adjacent regions: extend the previous one to cover this one.
        last[1] = region[1]
      end
    end
    # Assigned only after every region was accepted, so a raise above
    # does not leave a half-updated map behind.
    @map = merged
  end

  # Looks up the first region whose end lies beyond +end_position+
  # (+end_position+ defaults to +position+).
  #
  # @return [Integer, nil] the end of that region when it starts at or before
  #   +position+, otherwise nil
  def search_again_position(position, end_position = nil)
    end_position ||= position
    # Find-minimum binary search; relies on @map being sorted.
    region = @map.bsearch { |r| end_position < r[1] }
    return nil if region.nil? || region[0] > position

    region[1]
  end

  # @return [Integer, nil] the end of the last region that ends at or before
  #   +position+, or nil when there is none
  def last_cultivated_position(position)
    region = @map.reverse_each.find { |r| r[1] <= position }
    region && region[1]
  end

  # @return [Integer, nil] the begin of the first region that starts at or
  #   after +position+, or nil when there is none
  def next_cultivated_position(position)
    region = @map.bsearch { |r| position <= r[0] }
    region && region[0]
  end

  # Selects the cultivated regions whose end or begin falls inside +region+.
  #
  # NOTE(review): a cultivated region that extends beyond +region+ on both
  # sides matches neither condition and is therefore not selected - confirm
  # that callers never pass such a query.
  def in_regions(region)
    @map.select do |r|
      (r[1] > region[0] && r[1] <= region[1]) || (r[0] < region[1] && r[0] >= region[0])
    end
  end

  # Classifies +region+ against the cultivated regions found by #in_regions.
  #
  # @return [Array] a pair +[state, span]+:
  #   * +[:open, region]+ - no cultivated region found
  #   * +[:middle_closed, [first_begin, last_end]]+ - open at both ends
  #   * +[:front_open, [region_begin, first_begin]]+ - open at the front only
  #   * +[:rear_open, [last_end, region_end]]+ - open at the rear only
  #   * +[:closed, nil]+ - open at neither end
  def region_state(region)
    closed_parts = in_regions(region)
    return [:open, region] if closed_parts.empty?

    front = front_open?(region, closed_parts)
    rear = rear_open?(region, closed_parts)

    if front && rear
      [:middle_closed, [closed_parts.first[0], closed_parts.last[1]]]
    elsif front
      [:front_open, [region[0], closed_parts.first[0]]]
    elsif rear
      [:rear_open, [closed_parts.last[1], region[1]]]
    else
      [:closed, nil]
    end
  end

  # Finds the first occurrence of +target+ in +string+, at or after
  # +position+, whose span is reported as :open by #region_state.
  #
  # @return [Integer, nil] the begin position of the occurrence, or nil
  def index(target, string, position = 0)
    length = target.length
    loop do
      _begin = string.index(target, position)
      break if _begin.nil?

      # When the hit starts inside a cultivated region, resume after it.
      position = search_again_position(_begin)
      next unless position.nil?

      break _begin if region_state([_begin, _begin + length])[0] == :open

      position = _begin + 1
    end
  end

  private

  # True when the first cultivated part starts after the region begins.
  def front_open?(region, closed_parts)
    closed_parts.first[0] > region[0]
  end

  # True when the last cultivated part ends before the region ends.
  def rear_open?(region, closed_parts)
    closed_parts.last[1] < region[1]
  end
end
@@ -5,7 +5,7 @@ require 'text_alignment/find_divisions'
5
5
  require 'text_alignment/lcs_comparison'
6
6
  require 'text_alignment/lcs_alignment'
7
7
  require 'text_alignment/glcs_alignment'
8
- require 'text_alignment/mappings'
8
+ require 'text_alignment/char_mapping'
9
9
 
10
10
  module TextAlignment; end unless defined? TextAlignment
11
11
 
@@ -106,7 +106,7 @@ if __FILE__ == $0
106
106
 
107
107
  dictionary = [["β", "beta"]]
108
108
  # align = TextAlignment::TextAlignment.new(str1, str2)
109
- align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::MAPPINGS)
109
+ align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::CHAR_MAPPING)
110
110
  p align.common_elements
111
111
  p align.mapped_elements
112
112
  end
@@ -6,7 +6,7 @@ require 'text_alignment/lcs_comparison'
6
6
  require 'text_alignment/lcs_alignment'
7
7
  require 'text_alignment/lcs_cdiff'
8
8
  require 'text_alignment/glcs_alignment'
9
- require 'text_alignment/mappings'
9
+ require 'text_alignment/char_mapping'
10
10
 
11
11
  module TextAlignment; end unless defined? TextAlignment
12
12
 
@@ -20,7 +20,9 @@ class TextAlignment::MixedAlignment
20
20
  def initialize(_str1, _str2, _mappings = nil)
21
21
  raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
22
22
 
23
- str1, str2, mappings = TextAlignment::long_to_one_mapping_preprocessing(_str1, _str2, _mappings)
23
+ mappings ||= TextAlignment::CHAR_MAPPING
24
+ str1 = _str1.dup
25
+ str2 = _str2.dup
24
26
 
25
27
  _compute_mixed_alignment(str1, str2, mappings)
26
28
  end
@@ -63,7 +65,7 @@ class TextAlignment::MixedAlignment
63
65
  end
64
66
 
65
67
  cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
66
- @similarity = TextAlignment::compute_similarity(str1, str2, @sdiff)
68
+ @similarity = compute_similarity(str1, str2, @sdiff)
67
69
  @str1_match_initial = cmp.str1_match_initial
68
70
  @str1_match_final = cmp.str1_match_final
69
71
  @str2_match_initial = cmp.str2_match_initial
@@ -139,4 +141,20 @@ class TextAlignment::MixedAlignment
139
141
  @position_map_end = posmap_end.sort.to_h
140
142
  end
141
143
 
144
# Scores an alignment from its sdiff as coverage * fragmentation rate.
#
# Each sdiff entry is read through +action+ and +old_element+.
#
# @param s1 [String] unused here; kept for interface compatibility
# @param s2 [String] unused here; kept for interface compatibility
# @param sdiff [Array, nil] the diff entries
# @return [Numeric] 0 when +sdiff+ is nil; 0.0 when there is nothing to
#   cover or nothing matched; otherwise coverage * fragmentation rate
def compute_similarity(s1, s2, sdiff)
  return 0 if sdiff.nil?

  # recoverability: share of non-whitespace old elements that were matched
  count_nws = sdiff.count { |d| d.old_element =~ /\S/ }
  count_nws_match = sdiff.count { |d| d.action == '=' && d.old_element =~ /\S/ }

  # fragmentation rate: whitespace-delimited fragments of the old text
  # against the number of contiguous matched runs
  count_ofrag = sdiff.count { |d| d.old_element =~ /\s/ } + 1
  count_frag = sdiff.map { |d| (d.action == '=') && (d.old_element =~ /\s/) ? ' ' : d.action }.join.scan(/=+/).count

  # A zero denominator would yield NaN or Infinity, which defeats any
  # later threshold comparison; treat those cases as "no similarity".
  return 0.0 if count_nws.zero? || count_frag.zero?

  coverage = count_nws_match.to_f / count_nws
  rate_frag = count_ofrag.to_f / count_frag

  coverage * rate_frag
end
142
160
  end
@@ -2,6 +2,7 @@
2
2
  require 'text_alignment/constants'
3
3
  require 'text_alignment/anchor_finder'
4
4
  require 'text_alignment/mixed_alignment'
5
+ require 'text_alignment/cultivation_map'
5
6
 
6
7
  module TextAlignment; end unless defined? TextAlignment
7
8
 
@@ -10,255 +11,48 @@ class TextAlignment::TextAlignment
10
11
  attr_reader :similarity
11
12
  attr_reader :lost_annotations
12
13
 
13
- def initialize(_str1, _str2, denotations = nil, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
14
- raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
14
# Sets up an aligner around a reference text, against which other texts
# will be aligned.
#
# @param reference_text [String] the reference text; must not be nil
# @param to_prevent_overlap [Boolean] stored as-is; when truthy, #align
#   updates the cultivation map before each alignment
# @raise [ArgumentError] if +reference_text+ is nil
def initialize(reference_text, to_prevent_overlap = false)
  raise ArgumentError, "nil text" if reference_text.nil?

  # State that #align fills in later.
  @original_text = nil
  @blocks = nil
  @cultivation_map = TextAlignment::CultivationMap.new
  @to_prevent_overlap = to_prevent_overlap

  # The reference text and its character-mapped form.
  rtext_mapping = TextAlignment::CharMapping.new(reference_text)
  @original_reference_text = reference_text
  @rtext_mapping = rtext_mapping
  @mapped_reference_text = rtext_mapping.mapped_text
end
121
27
 
122
- def whole_block_alignment(str1, str2)
123
- ## Block exact match
124
- block_begin = str2.index(str1)
125
- unless block_begin.nil?
126
- return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
127
- end
28
# Aligns +text+ against the reference text and stores the result.
#
# @param text [String] the text to align
# @param denotations [Array, nil] passed through the text mapping and on to
#   the block alignment
# @return [Hash] the block alignment, with keys :text, :reference_text,
#   :denotations and :blocks
def align(text, denotations = nil)
  # Keep the cultivation map up to date with the previous alignment.
  update_cultivation_map if @to_prevent_overlap

  # The text mapping is rebuilt only when the input differs from the
  # previously aligned text.
  same_as_before = @original_text && @original_text == text
  unless same_as_before
    @original_text = text
    @text_mapping = TextAlignment::CharMapping.new(text)
  end

  @mapped_text = @text_mapping.mapped_text
  mapped_denotations = @text_mapping.enmap_denotations(denotations)

  # Try a whole-block match first; otherwise fall back to block finding.
  @blocks = whole_block_alignment(@mapped_text, @mapped_reference_text, @cultivation_map) ||
            find_block_alignment(@mapped_text, @mapped_reference_text, mapped_denotations, @cultivation_map)

  @block_alignment = {
    text: @original_text,
    reference_text: @original_reference_text,
    denotations: denotations,
    blocks: demap_blocks(@blocks)
  }
end
246
50
 
51
+ def transform_begin_position(_begin_position)
52
+ begin_position = @text_mapping.enmap_position(_begin_position)
247
53
 
248
- def indices(str, target)
249
- position = 0
250
- len = target.len
251
- Enumerator.new do |yielder|
252
- while idx = str.index(target, position)
253
- yielder << idx
254
- position = idx + len
255
- end
256
- end
257
- end
258
-
259
- def transform_begin_position(begin_position)
260
- i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
261
- block = @block_alignment[:blocks][i]
54
+ i = @blocks.index{|b| b[:source][:end] > begin_position}
55
+ block = @blocks[i]
262
56
 
263
57
  b = if block[:alignment] == :block || block[:alignment] == :term
264
58
  begin_position + block[:delta]
@@ -272,11 +66,15 @@ class TextAlignment::TextAlignment
272
66
  r = block[:alignment].transform_begin_position(begin_position - block[:source][:begin])
273
67
  r.nil? ? nil : r + block[:target][:begin]
274
68
  end
69
+
70
+ @rtext_mapping.demap_position(b)
275
71
  end
276
72
 
277
- def transform_end_position(end_position)
278
- i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
279
- block = @block_alignment[:blocks][i]
73
+ def transform_end_position(_end_position)
74
+ end_position = @text_mapping.enmap_position(_end_position)
75
+
76
+ i = @blocks.index{|b| b[:source][:end] >= end_position}
77
+ block = @blocks[i]
280
78
 
281
79
  e = if block[:alignment] == :block || block[:alignment] == :term
282
80
  end_position + block[:delta]
@@ -290,6 +88,8 @@ class TextAlignment::TextAlignment
290
88
  r = block[:alignment].transform_end_position(end_position - block[:source][:begin])
291
89
  r.nil? ? nil : r + block[:target][:begin]
292
90
  end
91
+
92
+ @rtext_mapping.demap_position(e)
293
93
  end
294
94
 
295
95
  def transform_a_span(span)
@@ -308,7 +108,7 @@ class TextAlignment::TextAlignment
308
108
  source = {begin:d.begin, end:d.end}
309
109
  d.begin = transform_begin_position(d.begin);
310
110
  d.end = transform_end_position(d.end);
311
- raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_str2.length
111
+ raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_reference_text.length
312
112
  rescue
313
113
  @lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
314
114
  d.begin = nil
@@ -324,7 +124,7 @@ class TextAlignment::TextAlignment
324
124
 
325
125
  r = hdenotations.collect do |d|
326
126
  t = transform_a_span(d[:span])
327
- raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_str2.length
127
+ raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_reference_text.length
328
128
  new_d = d.dup.merge({span:t})
329
129
  rescue
330
130
  @lost_annotations << {source: d[:span], target:t}
@@ -335,8 +135,8 @@ class TextAlignment::TextAlignment
335
135
  end
336
136
 
337
137
  def alignment_show
338
- stext = @block_alignment[:source_text]
339
- ttext = @block_alignment[:target_text]
138
+ stext = @block_alignment[:text]
139
+ ttext = @block_alignment[:reference_text]
340
140
 
341
141
  show = ''
342
142
  @block_alignment[:blocks].each do |a|
@@ -392,9 +192,242 @@ class TextAlignment::TextAlignment
392
192
 
393
193
  "***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
394
194
  "[#{astr1}]\n" +
395
- "[#{astr2}]\n\n"
195
+ "[#{astr2.gsub("\n", " ")}]\n\n"
396
196
  end
397
197
  end
398
198
  show
399
199
  end
200
+
201
+ private
202
+
203
# Builds the block alignment of +str1+ against +str2+: anchors reported by
# the anchor finder become :block entries, and the stretches of +str1+
# between them are filled by local alignment or marked :empty.
#
# @param str1 [String] the text being aligned
# @param str2 [String] the reference text
# @param denotations [Array, nil] handed on to the local alignment
# @param cultivation_map [#region_state] consulted for each target gap
# @return [Array<Hash>] the blocks in source order
def find_block_alignment(str1, str2, denotations, cultivation_map)
  finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map)

  # Collect anchors; one that begins exactly one position after the
  # previous anchor's end on both sides is folded into that anchor.
  anchors = []
  while (anchor = finder.get_next_anchor)
    prev = anchors.last
    contiguous = prev &&
                 (anchor[:source][:begin] == prev[:source][:end] + 1) &&
                 (anchor[:target][:begin] == prev[:target][:end] + 1)
    if contiguous
      prev[:source][:end] = anchor[:source][:end]
      prev[:target][:end] = anchor[:target][:end]
    else
      anchors << anchor.merge(alignment: :block, delta: anchor[:target][:begin] - anchor[:source][:begin])
    end
  end

  # Fill the gaps. The trailing nil stands for "after the last anchor".
  filled = []
  previous = nil
  (anchors + [nil]).each do |current|
    b1 = previous.nil? ? 0 : previous[:source][:end]
    e1 = current.nil? ? str1.length : current[:source][:begin]

    if b1 < e1
      b2 = previous.nil? ? 0 : previous[:target][:end]
      e2 = current.nil? ? str2.length : current[:target][:begin]
      source_gap = str1[b1...e1]
      target_gap = str2[b2...e2]

      gap_blocks =
        if source_gap.strip.empty? || target_gap.strip.empty?
          [{source: {begin: b1, end: e1}, target: {begin: b2, end: e2}, alignment: :empty}]
        else
          len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN

          unaligned = -> { [{source: {begin: b1, end: e1}, alignment: :empty}] }
          failed = ->(blocks) { blocks.empty? || blocks.first[:alignment] == :empty }
          # Align against the head of the target gap, up to +limit+ but no
          # further than len_buffer from its begin.
          align_head = lambda do |limit|
            head_end = (limit - b2) > len_buffer ? b2 + len_buffer : limit
            local_alignment(str1, b1, e1, str2, b2, head_end, denotations, cultivation_map)
          end
          # Align against the tail of the target gap, from +limit+ but no
          # further than len_buffer from its end.
          align_tail = lambda do |limit|
            tail_begin = (e2 - limit) > len_buffer ? e2 - len_buffer : limit
            local_alignment(str1, b1, e1, str2, tail_begin, e2, denotations, cultivation_map)
          end
          no_preceding_block = filled.empty?
          no_following_block = current.nil?

          state, bounds = cultivation_map.region_state([b2, e2])
          case state
          when :closed
            unaligned.call
          when :front_open
            no_preceding_block ? unaligned.call : align_head.call(bounds[1])
          when :rear_open
            no_following_block ? unaligned.call : align_tail.call(bounds[0])
          when :middle_closed
            first_try = no_preceding_block ? unaligned.call : align_head.call(bounds[0])
            if failed.call(first_try) && !no_following_block
              align_tail.call(bounds[1])
            else
              first_try
            end
          else # :open
            if (e2 - b2) > len_buffer
              first_try = no_preceding_block ? unaligned.call : align_head.call(e2)
              if failed.call(first_try) && !no_following_block
                align_tail.call(b2)
              else
                first_try
              end
            else
              local_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
            end
          end
        end

      filled.concat(gap_blocks)
    end

    previous = current
    filled << current unless current.nil?
  end

  filled
end
310
+
311
# Tries to place the whole of +str1+ inside +str2+ as a single block,
# first as-is and then with both strings downcased.
#
# @param cultivation_map [#index] performs the search
# @return [Array<Hash>, nil] a one-element list holding the :block entry,
#   or nil when neither search finds a position
def whole_block_alignment(str1, str2, cultivation_map)
  offset = cultivation_map.index(str1, str2)
  # The case-insensitive search runs only when the exact one failed.
  offset = cultivation_map.index(str1.downcase, str2.downcase) if offset.nil?
  return nil if offset.nil?

  span = str1.length
  [{
    source: {begin: 0, end: span},
    target: {begin: offset, end: offset + span},
    delta: offset,
    alignment: :block
  }]
end
320
+
321
# Aligns str1[b1...e1] against str2[b2...e2]: term-based alignment first,
# falling back to LCS alignment when no term could be placed.
#
# @return [Array<Hash>] the resulting blocks
def local_alignment(str1, b1, e1, str2, b2, e2, denotations = nil, cultivation_map)
  tblocks = term_based_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)

  # term_based_alignment pads its result with :empty gap blocks, so a
  # result without any matched term is usually a lone :empty block rather
  # than an empty list. Check for :term blocks explicitly; testing only
  # `empty?` would leave the LCS fallback unreachable.
  if tblocks.none? { |b| b[:alignment] == :term }
    lcs_alignment(str1, b1, e1, str2, b2, e2, cultivation_map)
  else
    tblocks
  end
end
329
+
330
# Aligns the denotated terms that lie within str1[b1...e1] by looking each
# one up, in order, in +str2+ between +b2+ and +e2+, then fills the source
# stretches between the matched terms with :empty blocks.
#
# All terms must be found in sequence, and none may occur again after the
# last match; otherwise no term is aligned at all.
#
# @param denotations [Array<Hash>, nil] each with a :span of :begin/:end
# @param cultivation_map [#index] performs the term search
# @return [Array<Hash>] :term blocks interleaved with :empty gap blocks
def term_based_alignment(str1, b1, e1, str2, b2, e2, denotations = nil, cultivation_map)
  # Searches run on the reference text cut off at e2; positions stay absolute.
  str2_block = str2[0 ... e2]

  ## term-based alignment
  tblocks = if denotations
    # Order by begin position, longer span first on ties. An array key is
    # used because `a <=> b || c <=> d` never reaches its second comparison
    # (0 is truthy in Ruby).
    denotations_in_scope = denotations
      .select { |d| d[:span][:begin] >= b1 && d[:span][:end] <= e1 }
      .sort_by { |d| [d[:span][:begin], -d[:span][:end]] }
      .map { |d| d.merge(lex: str1[d[:span][:begin] ... d[:span][:end]]) }

    search_position = b2
    _tblocks = denotations_in_scope.map do |denotation|
      lex = denotation[:lex]
      term_begin = cultivation_map.index(lex, str2_block, search_position)
      break [] if term_begin.nil? # break the loop if a missing term is found
      search_position = term_begin + lex.length
      {source: denotation[:span], target: {begin: term_begin, end: term_begin + lex.length}, alignment: :term, similarity: 0.9, delta: term_begin - denotation[:span][:begin]}
    end

    # redundant matching found: a term occurring again after the last
    # match makes the placement ambiguous, so everything is discarded
    unless _tblocks.empty?
      search_position = _tblocks.last[:target][:end]
      redundant = denotations_in_scope.any? do |term|
        !cultivation_map.index(term[:lex], str2_block, search_position).nil?
      end
      _tblocks = [] if redundant
    end

    _tblocks
  else
    []
  end

  # Fill the gaps; the trailing nil stands for "after the last term".
  ltblock = nil
  (tblocks + [nil]).each_with_object([]) do |ctblock, filled|
    tb1 = ltblock.nil? ? b1 : ltblock[:source][:end]
    te1 = ctblock.nil? ? e1 : ctblock[:source][:begin]

    if te1 > tb1
      tb2 = ltblock.nil? ? b2 : ltblock[:target][:end]
      te2 = ctblock.nil? ? e2 : ctblock[:target][:begin]
      filled << {source: {begin: tb1, end: te1}, target: {begin: tb2, end: te2}, alignment: :empty}
    end

    ltblock = ctblock
    filled << ctblock unless ctblock.nil?
  end
end
382
+
383
# Aligns str1[b1...e1] against str2[b2...e2] with a character-level
# TextAlignment::MixedAlignment on the downcased slices.
#
# A source slice longer than 2000 characters is not attempted, and an
# alignment whose similarity is below 0.5 is rejected; both cases yield a
# single :empty block.
#
# @param cultivation_map unused here; kept for interface compatibility
# @return [Array<Hash>] a one-element list with the resulting block
def lcs_alignment(str1, b1, e1, str2, b2, e2, cultivation_map)
  source = {begin: b1, end: e1}
  target = {begin: b2, end: e2}
  unaligned = [{source: source, target: target, alignment: :empty}]

  return unaligned if (e1 - b1) > 2000

  alignment = TextAlignment::MixedAlignment.new(str1[b1 ... e1].downcase, str2[b2 ... e2].downcase)
  similarity = alignment.similarity

  # Written as a positive test so that a nil or NaN similarity is rejected
  # (`NaN < 0.5` is false and would otherwise slip through as a success).
  if similarity && similarity >= 0.5
    [{source: source, target: target, alignment: alignment, similarity: similarity}]
  else
    unaligned
  end
end
398
+
399
# Registers the target spans of the :block and :term entries of the last
# alignment (@blocks) with the cultivation map. Does nothing when there is
# no previous alignment.
def update_cultivation_map
  return if @blocks.nil?

  matched = @blocks.select { |b| b[:alignment] == :block || b[:alignment] == :term }

  # Condense the spans: one that begins no more than one position after
  # the previous span's end extends that span instead of starting a new one.
  regions = matched.each_with_object([]) do |b, condensed|
    span_begin = b[:target][:begin]
    span_end = b[:target][:end]
    if condensed.empty? || (condensed.last.last + 1 < span_begin)
      condensed.push [span_begin, span_end]
    else
      condensed.last[1] = span_end
    end
  end

  @cultivation_map.cultivate(regions)
end
420
+
421
# Returns copies of +_blocks+ whose :source positions are demapped through
# the text mapping and whose :target positions are demapped through the
# reference-text mapping. The given blocks are left as they are.
#
# @param _blocks [Array<Hash>, nil]
# @return [Array<Hash>, nil] nil when +_blocks+ is nil
def demap_blocks(_blocks)
  return nil if _blocks.nil?

  _blocks.map do |original|
    block = original.dup

    if (src = block[:source])
      block[:source] = {
        begin: @text_mapping.demap_position(src[:begin]),
        end: @text_mapping.demap_position(src[:end])
      }
    end

    if (tgt = block[:target])
      block[:target] = {
        begin: @rtext_mapping.demap_position(tgt[:begin]),
        end: @rtext_mapping.demap_position(tgt[:end])
      }
    end

    block
  end
end
432
+
400
433
  end