text_alignment 0.9 → 0.11.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/align_annotations +24 -14
- data/lib/text_alignment/anchor_finder.rb +120 -70
- data/lib/text_alignment/{mappings.rb → char_mapping.rb} +93 -75
- data/lib/text_alignment/cultivation_map.rb +94 -0
- data/lib/text_alignment/glcs_alignment_fast.rb +2 -2
- data/lib/text_alignment/mixed_alignment.rb +21 -3
- data/lib/text_alignment/text_alignment.rb +276 -243
- data/lib/text_alignment/version.rb +1 -1
- metadata +4 -3
@@ -0,0 +1,94 @@
|
|
1
|
+
module TextAlignment; end unless defined? TextAlignment

# Keeps a sorted list of non-overlapping "cultivated" regions of a string and
# answers position queries against them. Every region is a two-element array
# [begin, end].
# NOTE(review): the queries below look like they treat regions as half-open
# [begin, end) -- confirm against the callers.
class TextAlignment::CultivationMap
  # The current list of regions, sorted by begin position.
  attr_reader :map

  def initialize
    @map = []
  end

  # Adds +regions+ to the map.
  #
  # A region that starts exactly where the previous one ends is merged into it.
  # A region that starts before the previous one ends raises a RuntimeError.
  # The merged list is built from copies and only installed once it is known
  # to be valid, so neither the caller's arrays nor the existing map are
  # changed by a failing call.
  #
  # Returns the new map.
  def cultivate(regions)
    merged = []
    (@map + regions).sort_by { |region| region[0] }.each do |region|
      last = merged.last
      if last.nil? || last[1] < region[0]
        merged << region.dup
      elsif last[1] > region[0]
        raise "Overlapping regions: #{last} : #{region}"
      else
        # Adjacent regions: extend the previous one (this used to be a no-op
        # comparison, which silently dropped the adjacent region).
        last[1] = region[1]
      end
    end
    @map = merged
  end

  # Looks up the first region whose end lies beyond +end_position+ (which
  # defaults to +position+). Returns that region's end when the region starts
  # at or before +position+, otherwise nil.
  def search_again_position(position, end_position = nil)
    end_position ||= position
    region = @map.bsearch { |r| end_position < r[1] }
    return nil if region.nil? || region[0] > position

    region[1]
  end

  # Returns the end of the last region that ends at or before +position+,
  # or nil when there is none.
  def last_cultivated_position(position)
    ridx = @map.rindex { |r| r[1] <= position }
    ridx.nil? ? nil : @map[ridx][1]
  end

  # Returns the begin of the first region that starts after +position+,
  # or nil when there is none.
  def next_cultivated_position(position)
    region = @map.bsearch { |r| position < r[0] }
    region.nil? ? nil : region[0]
  end

  # Selects the regions whose end falls in (region[0], region[1]] or whose
  # begin falls in [region[0], region[1]).
  # NOTE(review): a cultivated region that strictly contains +region+ on both
  # sides matches neither test and is not returned -- confirm this is intended.
  def in_regions(region)
    @map.select do |r|
      (r[1] > region[0] && r[1] <= region[1]) || (r[0] < region[1] && r[0] >= region[0])
    end
  end

  # Classifies +region+ against the regions reported by #in_regions.
  #
  # Returns a pair [state, span]:
  #   [:open, region]                 - no cultivated part was found
  #   [:middle_closed, [first_begin, last_end]] - open on both sides
  #   [:front_open, [region[0], first_begin]]   - open only at the front
  #   [:rear_open, [last_end, region[1]]]       - open only at the rear
  #   [:closed, nil]                  - open on neither side
  def region_state(region)
    closed_parts = in_regions(region)
    return [:open, region] if closed_parts.empty?

    front = front_open?(region, closed_parts)
    rear = rear_open?(region, closed_parts)

    if front && rear
      [:middle_closed, [closed_parts.first[0], closed_parts.last[1]]]
    elsif front
      [:front_open, [region[0], closed_parts.first[0]]]
    elsif rear
      [:rear_open, [closed_parts.last[1], region[1]]]
    else
      [:closed, nil]
    end
  end

  # Finds the first occurrence of +target+ in +string+ at or after +position+
  # whose span is reported as :open by #region_state.
  # Returns the begin index, or nil when no such occurrence exists.
  def index(target, string, position = 0)
    length = target.length
    loop do
      found = string.index(target, position)
      break if found.nil?

      # A hit that starts inside a cultivated region: resume after that region.
      position = search_again_position(found)
      next unless position.nil?

      break found if region_state([found, found + length])[0] == :open

      position = found + 1
    end
  end

  private

  # True when the first closed part starts after the begin of +region+.
  def front_open?(region, closed_parts)
    closed_parts.first[0] > region[0]
  end

  # True when the last closed part ends before the end of +region+.
  def rear_open?(region, closed_parts)
    closed_parts.last[1] < region[1]
  end
end
|
@@ -5,7 +5,7 @@ require 'text_alignment/find_divisions'
|
|
5
5
|
require 'text_alignment/lcs_comparison'
|
6
6
|
require 'text_alignment/lcs_alignment'
|
7
7
|
require 'text_alignment/glcs_alignment'
|
8
|
-
require 'text_alignment/
|
8
|
+
require 'text_alignment/char_mapping'
|
9
9
|
|
10
10
|
module TextAlignment; end unless defined? TextAlignment
|
11
11
|
|
@@ -106,7 +106,7 @@ if __FILE__ == $0
|
|
106
106
|
|
107
107
|
dictionary = [["β", "beta"]]
|
108
108
|
# align = TextAlignment::TextAlignment.new(str1, str2)
|
109
|
-
align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::
|
109
|
+
align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::CHAR_MAPPING)
|
110
110
|
p align.common_elements
|
111
111
|
p align.mapped_elements
|
112
112
|
end
|
@@ -6,7 +6,7 @@ require 'text_alignment/lcs_comparison'
|
|
6
6
|
require 'text_alignment/lcs_alignment'
|
7
7
|
require 'text_alignment/lcs_cdiff'
|
8
8
|
require 'text_alignment/glcs_alignment'
|
9
|
-
require 'text_alignment/
|
9
|
+
require 'text_alignment/char_mapping'
|
10
10
|
|
11
11
|
module TextAlignment; end unless defined? TextAlignment
|
12
12
|
|
@@ -20,7 +20,9 @@ class TextAlignment::MixedAlignment
|
|
20
20
|
# Builds a mixed alignment between two strings.
#
# _str1, _str2 - the strings to align; neither may be nil (ArgumentError).
# _mappings    - optional character mappings; TextAlignment::CHAR_MAPPING is
#                used when none is given.
#
# Both strings are duplicated before being handed to
# _compute_mixed_alignment, so that call works on copies.
def initialize(_str1, _str2, _mappings = nil)
  raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?

  # `mappings ||= ...` created a brand-new local and therefore always ignored
  # the caller's _mappings; honour the argument and only fall back when absent.
  mappings = _mappings || TextAlignment::CHAR_MAPPING
  str1 = _str1.dup
  str2 = _str2.dup

  _compute_mixed_alignment(str1, str2, mappings)
end
|
@@ -63,7 +65,7 @@ class TextAlignment::MixedAlignment
|
|
63
65
|
end
|
64
66
|
|
65
67
|
cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
|
66
|
-
@similarity =
|
68
|
+
@similarity = compute_similarity(str1, str2, @sdiff)
|
67
69
|
@str1_match_initial = cmp.str1_match_initial
|
68
70
|
@str1_match_final = cmp.str1_match_final
|
69
71
|
@str2_match_initial = cmp.str2_match_initial
|
@@ -139,4 +141,20 @@ class TextAlignment::MixedAlignment
|
|
139
141
|
@position_map_end = posmap_end.sort.to_h
|
140
142
|
end
|
141
143
|
|
144
|
+
# Scores an sdiff as coverage * fragmentation rate.
#
# s1, s2 - accepted for interface compatibility; not read here.
# sdiff  - a collection of change objects responding to #action and
#          #old_element, or nil.
#
# coverage is the share of non-whitespace old elements whose action is '='.
# The fragmentation rate is the number of whitespace-delimited fragments of
# the old text (whitespace elements + 1) divided by the number of contiguous
# runs of matched non-whitespace elements.
#
# Returns 0 when sdiff is nil and 0.0 when no non-whitespace element matched.
# The latter used to yield NaN (0.0 * x/0), which slips through
# `similarity < 0.5` style threshold checks.
def compute_similarity(s1, s2, sdiff)
  return 0 if sdiff.nil?

  # recoverability
  count_nws = sdiff.count { |d| d.old_element =~ /\S/ }
  count_nws_match = sdiff.count { |d| d.action == '=' && d.old_element =~ /\S/ }
  return 0.0 if count_nws_match.zero?

  coverage = count_nws_match.to_f / count_nws

  # fragmentation rate
  # n whitespace elements separate n + 1 fragments; without the + 1 a perfectly
  # matched single word scored 0 and "ab cd" scored only 0.5.
  count_ofrag = sdiff.count { |d| d.old_element =~ /\s/ } + 1
  # Matched whitespace is turned into a separator so that each run of '='
  # corresponds to one matched fragment.
  marks = sdiff.collect { |d| (d.action == '=') && (d.old_element =~ /\s/) ? ' ' : d.action }
  count_frag = marks.join.scan(/=+/).count
  rate_frag = count_ofrag.to_f / count_frag

  coverage * rate_frag
end
|
142
160
|
end
|
@@ -2,6 +2,7 @@
|
|
2
2
|
require 'text_alignment/constants'
|
3
3
|
require 'text_alignment/anchor_finder'
|
4
4
|
require 'text_alignment/mixed_alignment'
|
5
|
+
require 'text_alignment/cultivation_map'
|
5
6
|
|
6
7
|
module TextAlignment; end unless defined? TextAlignment
|
7
8
|
|
@@ -10,255 +11,48 @@ class TextAlignment::TextAlignment
|
|
10
11
|
attr_reader :similarity
|
11
12
|
attr_reader :lost_annotations
|
12
13
|
|
13
|
-
|
14
|
-
|
14
|
+
# Initialize with a reference text, against which texts will be aligned
|
15
|
+
def initialize(reference_text, to_prevent_overlap = false)
|
16
|
+
raise ArgumentError, "nil text" if reference_text.nil?
|
15
17
|
|
16
|
-
@
|
17
|
-
@
|
18
|
-
@
|
18
|
+
@original_reference_text = reference_text
|
19
|
+
@rtext_mapping = TextAlignment::CharMapping.new(reference_text)
|
20
|
+
@mapped_reference_text = @rtext_mapping.mapped_text
|
21
|
+
@to_prevent_overlap = to_prevent_overlap
|
19
22
|
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
@block_alignment[:blocks] = r
|
24
|
-
return
|
25
|
-
end
|
26
|
-
|
27
|
-
## to find block alignments
|
28
|
-
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
|
29
|
-
|
30
|
-
blocks = []
|
31
|
-
while block = anchor_finder.get_next_anchor
|
32
|
-
last = blocks.last
|
33
|
-
if last && (block[:source][:begin] == last[:source][:end] + 1) && (block[:target][:begin] == last[:target][:end] + 1)
|
34
|
-
last[:source][:end] = block[:source][:end]
|
35
|
-
last[:target][:end] = block[:target][:end]
|
36
|
-
else
|
37
|
-
blocks << block.merge(alignment: :block, delta: block[:target][:begin] - block[:source][:begin])
|
38
|
-
end
|
39
|
-
end
|
40
|
-
|
41
|
-
# pp blocks
|
42
|
-
# puts "-----"
|
43
|
-
# puts
|
44
|
-
# exit
|
45
|
-
# blocks.each do |b|
|
46
|
-
# p [b[:source], b[:target]]
|
47
|
-
# puts "---"
|
48
|
-
# puts str1[b[:source][:begin] ... b[:source][:end]]
|
49
|
-
# puts "---"
|
50
|
-
# puts str2[b[:target][:begin] ... b[:target][:end]]
|
51
|
-
# puts "====="
|
52
|
-
# puts
|
53
|
-
# end
|
54
|
-
# puts "-=-=-=-=-"
|
55
|
-
# puts
|
56
|
-
|
57
|
-
## to fill the gaps
|
58
|
-
last_block = nil
|
59
|
-
blocks2 = blocks.inject([]) do |sum, block|
|
60
|
-
b1 = last_block ? last_block[:source][:end] : 0
|
61
|
-
e1 = block[:source][:begin]
|
62
|
-
|
63
|
-
sum += if b1 == e1
|
64
|
-
[block]
|
65
|
-
else
|
66
|
-
b2 = last_block ? last_block[:target][:end] : 0
|
67
|
-
e2 = block[:target][:begin]
|
68
|
-
|
69
|
-
if b2 == e2
|
70
|
-
[
|
71
|
-
{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
|
72
|
-
block
|
73
|
-
]
|
74
|
-
else
|
75
|
-
if b1 == 0 && b2 == 0
|
76
|
-
len_buffer = (e1 * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
77
|
-
b2 = e2 - len_buffer if e2 > len_buffer
|
78
|
-
end
|
79
|
-
|
80
|
-
_str1 = str1[b1 ... e1]
|
81
|
-
_str2 = str2[b2 ... e2]
|
82
|
-
|
83
|
-
if _str1.strip.empty? || _str2.strip.empty?
|
84
|
-
[
|
85
|
-
{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
|
86
|
-
block
|
87
|
-
]
|
88
|
-
else
|
89
|
-
local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations) << block
|
90
|
-
end
|
91
|
-
end
|
92
|
-
end
|
93
|
-
|
94
|
-
last_block = block
|
95
|
-
sum
|
96
|
-
end
|
97
|
-
|
98
|
-
# the last step
|
99
|
-
blocks2 += if last_block.nil?
|
100
|
-
local_alignment_blocks(str1, 0, str1.length, str2, 0, str2.length, denotations)
|
101
|
-
else
|
102
|
-
b1 = last_block[:source][:end]
|
103
|
-
if b1 < str1.length
|
104
|
-
e1 = str1.length
|
105
|
-
|
106
|
-
b2 = last_block[:target][:end]
|
107
|
-
if b2 < str2.length
|
108
|
-
len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
109
|
-
e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
|
110
|
-
local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
|
111
|
-
else
|
112
|
-
[{source:{begin:last_block[:source][:end], end:str1.length}, alignment: :empty}]
|
113
|
-
end
|
114
|
-
else
|
115
|
-
[]
|
116
|
-
end
|
117
|
-
end
|
118
|
-
|
119
|
-
@block_alignment[:blocks] = blocks2
|
23
|
+
@original_text = nil
|
24
|
+
@blocks = nil
|
25
|
+
@cultivation_map = TextAlignment::CultivationMap.new
|
120
26
|
end
|
121
27
|
|
122
|
-
def
|
123
|
-
|
124
|
-
|
125
|
-
unless block_begin.nil?
|
126
|
-
return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
127
|
-
end
|
28
|
+
def align(text, denotations = nil)
|
29
|
+
# To maintain the cultivation map
|
30
|
+
update_cultivation_map if @to_prevent_overlap
|
128
31
|
|
129
|
-
|
130
|
-
unless
|
131
|
-
|
32
|
+
# In case the input text is the same as the previous one, reuse the previous text mapping
|
33
|
+
unless @original_text && @original_text == text
|
34
|
+
@original_text = text
|
35
|
+
@text_mapping = TextAlignment::CharMapping.new(text)
|
132
36
|
end
|
133
37
|
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
def local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations = nil)
|
138
|
-
block2 = str2[b2 ... e2]
|
139
|
-
|
140
|
-
## term-based alignment
|
141
|
-
tblocks = if denotations
|
142
|
-
ds_in_scope = denotations.select{|d| d[:span][:begin] >= b1 && d[:span][:end] <= e1}.
|
143
|
-
sort{|d1, d2| d1[:span][:begin] <=> d2[:span][:begin] || d2[:span][:end] <=> d1[:span][:end] }.
|
144
|
-
map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
|
145
|
-
|
146
|
-
position = 0
|
147
|
-
tblocks = ds_in_scope.map do |term|
|
148
|
-
lex = term[:lex]
|
149
|
-
r = block2.index(lex, position)
|
150
|
-
if r.nil?
|
151
|
-
position = nil
|
152
|
-
break
|
153
|
-
end
|
154
|
-
position = r + lex.length
|
155
|
-
{source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, delta: r - term[:span][:begin]}
|
156
|
-
end
|
157
|
-
|
158
|
-
# missing term found
|
159
|
-
tblocks = [] if position.nil?
|
38
|
+
@mapped_text = @text_mapping.mapped_text
|
39
|
+
denotations_mapped = @text_mapping.enmap_denotations(denotations)
|
160
40
|
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
lex = term[:lex]
|
165
|
-
look_forward = block2.index(lex, position)
|
166
|
-
unless look_forward.nil?
|
167
|
-
tblocks = []
|
168
|
-
break
|
169
|
-
end
|
170
|
-
end
|
171
|
-
end
|
172
|
-
|
173
|
-
tblocks
|
41
|
+
## To generate the block_alignment of the input text against the reference text
|
42
|
+
@blocks = if r = whole_block_alignment(@mapped_text, @mapped_reference_text, @cultivation_map)
|
43
|
+
r
|
174
44
|
else
|
175
|
-
|
45
|
+
find_block_alignment(@mapped_text, @mapped_reference_text, denotations_mapped, @cultivation_map)
|
176
46
|
end
|
177
47
|
|
178
|
-
|
179
|
-
if b1 == 0 && e1 == str1.length
|
180
|
-
if (e1 > 2000) || (e2 > 2000)
|
181
|
-
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
182
|
-
else
|
183
|
-
block1 = str1[b1 ... e1]
|
184
|
-
block2 = str2[b2 ... e2]
|
185
|
-
|
186
|
-
## character-based alignment
|
187
|
-
alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase, @mappings)
|
188
|
-
if alignment.sdiff.nil?
|
189
|
-
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
190
|
-
else
|
191
|
-
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
|
192
|
-
end
|
193
|
-
end
|
194
|
-
else
|
195
|
-
block1 = str1[b1 ... e1]
|
196
|
-
block2 = str2[b2 ... e2]
|
197
|
-
|
198
|
-
## character-based alignment
|
199
|
-
alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase, @mappings)
|
200
|
-
if alignment.sdiff.nil?
|
201
|
-
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
202
|
-
else
|
203
|
-
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
|
204
|
-
end
|
205
|
-
end
|
206
|
-
else
|
207
|
-
last_tblock = nil
|
208
|
-
lblocks = tblocks.inject([]) do |sum, tblock|
|
209
|
-
tb1 = last_tblock ? last_tblock[:source][:end] : b1
|
210
|
-
te1 = tblock[:source][:begin]
|
211
|
-
|
212
|
-
sum += if te1 == tb1
|
213
|
-
[tblock]
|
214
|
-
else
|
215
|
-
tb2 = last_tblock ? last_tblock[:target][:end] : b2
|
216
|
-
te2 = tblock[:target][:begin]
|
217
|
-
|
218
|
-
if b2 == e2
|
219
|
-
[
|
220
|
-
{source:{begin:tb1, end:te1}, alignment: :empty},
|
221
|
-
tblock
|
222
|
-
]
|
223
|
-
else
|
224
|
-
[
|
225
|
-
{source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty},
|
226
|
-
tblock
|
227
|
-
]
|
228
|
-
end
|
229
|
-
end
|
230
|
-
|
231
|
-
last_tblock = tblock
|
232
|
-
sum
|
233
|
-
end
|
234
|
-
|
235
|
-
if last_tblock[:source][:end] < e1
|
236
|
-
if last_tblock[:target][:end] < e2
|
237
|
-
lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, target:{begin:last_tblock[:target][:end], end:e2}, alignment: :empty}
|
238
|
-
else
|
239
|
-
lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, alignment: :empty}
|
240
|
-
end
|
241
|
-
end
|
242
|
-
|
243
|
-
lblocks
|
244
|
-
end
|
48
|
+
@block_alignment = {text: @original_text, reference_text: @original_reference_text, denotations: denotations, blocks: demap_blocks(@blocks)}
|
245
49
|
end
|
246
50
|
|
51
|
+
def transform_begin_position(_begin_position)
|
52
|
+
begin_position = @text_mapping.enmap_position(_begin_position)
|
247
53
|
|
248
|
-
|
249
|
-
|
250
|
-
len = target.len
|
251
|
-
Enumerator.new do |yielder|
|
252
|
-
while idx = str.index(target, position)
|
253
|
-
yielder << idx
|
254
|
-
position = idx + len
|
255
|
-
end
|
256
|
-
end
|
257
|
-
end
|
258
|
-
|
259
|
-
def transform_begin_position(begin_position)
|
260
|
-
i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
|
261
|
-
block = @block_alignment[:blocks][i]
|
54
|
+
i = @blocks.index{|b| b[:source][:end] > begin_position}
|
55
|
+
block = @blocks[i]
|
262
56
|
|
263
57
|
b = if block[:alignment] == :block || block[:alignment] == :term
|
264
58
|
begin_position + block[:delta]
|
@@ -272,11 +66,15 @@ class TextAlignment::TextAlignment
|
|
272
66
|
r = block[:alignment].transform_begin_position(begin_position - block[:source][:begin])
|
273
67
|
r.nil? ? nil : r + block[:target][:begin]
|
274
68
|
end
|
69
|
+
|
70
|
+
@rtext_mapping.demap_position(b)
|
275
71
|
end
|
276
72
|
|
277
|
-
def transform_end_position(
|
278
|
-
|
279
|
-
|
73
|
+
def transform_end_position(_end_position)
|
74
|
+
end_position = @text_mapping.enmap_position(_end_position)
|
75
|
+
|
76
|
+
i = @blocks.index{|b| b[:source][:end] >= end_position}
|
77
|
+
block = @blocks[i]
|
280
78
|
|
281
79
|
e = if block[:alignment] == :block || block[:alignment] == :term
|
282
80
|
end_position + block[:delta]
|
@@ -290,6 +88,8 @@ class TextAlignment::TextAlignment
|
|
290
88
|
r = block[:alignment].transform_end_position(end_position - block[:source][:begin])
|
291
89
|
r.nil? ? nil : r + block[:target][:begin]
|
292
90
|
end
|
91
|
+
|
92
|
+
@rtext_mapping.demap_position(e)
|
293
93
|
end
|
294
94
|
|
295
95
|
def transform_a_span(span)
|
@@ -308,7 +108,7 @@ class TextAlignment::TextAlignment
|
|
308
108
|
source = {begin:d.begin, end:d.end}
|
309
109
|
d.begin = transform_begin_position(d.begin);
|
310
110
|
d.end = transform_end_position(d.end);
|
311
|
-
raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @
|
111
|
+
raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_reference_text.length
|
312
112
|
rescue
|
313
113
|
@lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
|
314
114
|
d.begin = nil
|
@@ -324,7 +124,7 @@ class TextAlignment::TextAlignment
|
|
324
124
|
|
325
125
|
r = hdenotations.collect do |d|
|
326
126
|
t = transform_a_span(d[:span])
|
327
|
-
raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @
|
127
|
+
raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_reference_text.length
|
328
128
|
new_d = d.dup.merge({span:t})
|
329
129
|
rescue
|
330
130
|
@lost_annotations << {source: d[:span], target:t}
|
@@ -335,8 +135,8 @@ class TextAlignment::TextAlignment
|
|
335
135
|
end
|
336
136
|
|
337
137
|
def alignment_show
|
338
|
-
stext = @block_alignment[:
|
339
|
-
ttext = @block_alignment[:
|
138
|
+
stext = @block_alignment[:text]
|
139
|
+
ttext = @block_alignment[:reference_text]
|
340
140
|
|
341
141
|
show = ''
|
342
142
|
@block_alignment[:blocks].each do |a|
|
@@ -392,9 +192,242 @@ class TextAlignment::TextAlignment
|
|
392
192
|
|
393
193
|
"***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
|
394
194
|
"[#{astr1}]\n" +
|
395
|
-
"[#{astr2}]\n\n"
|
195
|
+
"[#{astr2.gsub("\n", " ")}]\n\n"
|
396
196
|
end
|
397
197
|
end
|
398
198
|
show
|
399
199
|
end
|
200
|
+
|
201
|
+
private
|
202
|
+
|
203
|
+
# Aligns str1 (source) against str2 (target) block by block.
#
# Step 1 collects anchor blocks from TextAlignment::AnchorFinder. An anchor
# that begins exactly one position after the previous anchor ends, on both the
# source and the target side, is folded into the previous anchor.
# Step 2 walks the anchors plus one trailing nil sentinel and fills every
# non-empty source gap between them, choosing the target window according to
# cultivation_map.region_state for that gap.
#
# Returns the array of gap blocks and anchor blocks in source order.
def find_block_alignment(str1, str2, denotations, cultivation_map)
  finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map)

  anchors = []
  while (anchor = finder.get_next_anchor)
    prev = anchors.last
    contiguous = prev &&
                 (anchor[:source][:begin] == prev[:source][:end] + 1) &&
                 (anchor[:target][:begin] == prev[:target][:end] + 1)
    if contiguous
      prev[:source][:end] = anchor[:source][:end]
      prev[:target][:end] = anchor[:target][:end]
    else
      anchors << anchor.merge(alignment: :block, delta: anchor[:target][:begin] - anchor[:source][:begin])
    end
  end

  filled = []
  previous = nil
  (anchors + [nil]).each do |current|
    src_begin = previous.nil? ? 0 : previous[:source][:end]
    src_end = current.nil? ? str1.length : current[:source][:begin]

    if src_begin < src_end
      tgt_begin = previous.nil? ? 0 : previous[:target][:end]
      tgt_end = current.nil? ? str2.length : current[:target][:begin]

      filled += if str1[src_begin...src_end].strip.empty? || str2[tgt_begin...tgt_end].strip.empty?
        # Nothing but whitespace on one side: record the gap without aligning it.
        [{ source: { begin: src_begin, end: src_end }, target: { begin: tgt_begin, end: tgt_end }, alignment: :empty }]
      else
        # Upper bound on how much target text is searched for this gap.
        window = ((src_end - src_begin) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
        state, span = cultivation_map.region_state([tgt_begin, tgt_end])

        unaligned = [{ source: { begin: src_begin, end: src_end }, alignment: :empty }]
        attempt = lambda do |from, to|
          local_alignment(str1, src_begin, src_end, str2, from, to, denotations, cultivation_map)
        end
        no_leading_block = filled.empty?   # nothing has been emitted before this gap
        no_trailing_block = current.nil?   # this is the gap after the last anchor
        failed = ->(blocks) { blocks.empty? || blocks.first[:alignment] == :empty }

        case state
        when :closed
          unaligned
        when :front_open
          if no_leading_block
            unaligned
          else
            open_end = span[1]
            attempt.call(tgt_begin, (open_end - tgt_begin) > window ? tgt_begin + window : open_end)
          end
        when :rear_open
          if no_trailing_block
            unaligned
          else
            open_begin = span[0]
            attempt.call((tgt_end - open_begin) > window ? tgt_end - window : open_begin, tgt_end)
          end
        when :middle_closed
          # Try the open stretch at the front first, then the one at the rear.
          first_try = if no_leading_block
            unaligned
          else
            open_end = span[0]
            attempt.call(tgt_begin, (open_end - tgt_begin) > window ? tgt_begin + window : open_end)
          end

          if failed.call(first_try) && !no_trailing_block
            open_begin = span[1]
            attempt.call((tgt_end - open_begin) > window ? tgt_end - window : open_begin, tgt_end)
          else
            first_try
          end
        else # :open
          if (tgt_end - tgt_begin) > window
            # The target gap is larger than the window: try its head, then its tail.
            first_try = no_leading_block ? unaligned : attempt.call(tgt_begin, tgt_begin + window)

            if failed.call(first_try) && !no_trailing_block
              attempt.call(tgt_end - window, tgt_end)
            else
              first_try
            end
          else
            attempt.call(tgt_begin, tgt_end)
          end
        end
      end
    end

    previous = current
    filled << current unless current.nil?
  end

  filled
end
|
310
|
+
|
311
|
+
# Tries to place the whole of str1 inside str2 as one block.
#
# The search goes through cultivation_map.index, first on the strings as
# given and then, if that finds nothing, on their downcased forms.
#
# Returns a one-element array holding a :block alignment that covers all of
# str1, or nil when neither search finds str1.
def whole_block_alignment(str1, str2, cultivation_map)
  offset = cultivation_map.index(str1, str2)
  offset = cultivation_map.index(str1.downcase, str2.downcase) if offset.nil?
  return nil if offset.nil?

  span = str1.length
  [{ source: { begin: 0, end: span }, target: { begin: offset, end: offset + span }, delta: offset, alignment: :block }]
end
|
320
|
+
|
321
|
+
# Aligns str1[b1...e1] against str2[b2...e2].
#
# Term-based alignment is tried first; when it aligns no term at all, the
# LCS-based alignment is used instead.
#
# term_based_alignment pads its result with :empty gap blocks, so its result
# is non-empty even when no term was aligned. Testing `tblocks.empty?` alone
# therefore never reached lcs_alignment for a non-empty source span; the
# fallback now triggers whenever the result has no :term block.
#
# Returns an array of alignment blocks.
def local_alignment(str1, b1, e1, str2, b2, e2, denotations = nil, cultivation_map)
  tblocks = term_based_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
  if tblocks.none? { |block| block[:alignment] == :term }
    lcs_alignment(str1, b1, e1, str2, b2, e2, cultivation_map)
  else
    tblocks
  end
end
|
329
|
+
|
330
|
+
# Aligns the denotations lying within str1[b1...e1] to str2 by looking up
# their text, one after another, through cultivation_map.index.
#
# The search runs over str2[0...e2], starting at b2 and continuing after each
# hit. If any term cannot be found, or if any term can still be found after
# the last aligned term (an ambiguous match), the term alignment is dropped.
# Finally every source gap before, between and after the term blocks is filled
# with an :empty block, so when no term is aligned and e1 > b1 the result is a
# single :empty block spanning [b1, e1] / [b2, e2].
#
# Returns the array of :term and :empty blocks in the order they were built.
def term_based_alignment(str1, b1, e1, str2, b2, e2, denotations = nil, cultivation_map)
  searchable = str2[0...e2]

  term_blocks = []
  if denotations
    scoped = denotations.select { |d| d[:span][:begin] >= b1 && d[:span][:end] <= e1 }
    # NOTE(review): `<=>` returns 0 for equal begins and 0 is truthy in Ruby,
    # so the end-position tiebreak after `||` only runs when the first
    # comparison is nil. Kept as is to preserve behaviour.
    scoped = scoped.sort { |d1, d2| d1[:span][:begin] <=> d2[:span][:begin] || d2[:span][:end] <=> d1[:span][:end] }
    scoped = scoped.map { |d| d.merge(lex: str1[d[:span][:begin]...d[:span][:end]]) }

    cursor = b2
    scoped.each do |denotation|
      lex = denotation[:lex]
      found = cultivation_map.index(lex, searchable, cursor)
      if found.nil?
        # A missing term invalidates the whole term alignment.
        term_blocks = []
        break
      end

      cursor = found + lex.length
      term_blocks << {
        source: denotation[:span],
        target: { begin: found, end: found + lex.length },
        alignment: :term,
        similarity: 0.9,
        delta: found - denotation[:span][:begin]
      }
    end

    # Redundant matching: a term that occurs again after the last aligned one.
    unless term_blocks.empty?
      tail = term_blocks.last[:target][:end]
      repeated = scoped.any? { |term| !cultivation_map.index(term[:lex], searchable, tail).nil? }
      term_blocks = [] if repeated
    end
  end

  # Fill the source gaps; the trailing nil stands for "end of the window".
  result = []
  previous = nil
  (term_blocks + [nil]).each do |current|
    gap_begin = previous.nil? ? b1 : previous[:source][:end]
    gap_end = current.nil? ? e1 : current[:source][:begin]

    if gap_end > gap_begin
      target_begin = previous.nil? ? b2 : previous[:target][:end]
      target_end = current.nil? ? e2 : current[:target][:begin]
      result << { source: { begin: gap_begin, end: gap_end }, target: { begin: target_begin, end: target_end }, alignment: :empty }
    end

    previous = current
    result << current unless current.nil?
  end

  result
end
|
382
|
+
|
383
|
+
# Character-level alignment of str1[b1...e1] against str2[b2...e2] using
# TextAlignment::MixedAlignment on the downcased substrings.
#
# cultivation_map is accepted for signature parity with the other alignment
# helpers; it is not read here.
#
# Returns a one-element array. The block carries the MixedAlignment object and
# its similarity when the similarity is at least 0.5. Otherwise, and whenever
# the source span is longer than 2000 characters (no alignment is attempted),
# the block is marked :empty.
def lcs_alignment(str1, b1, e1, str2, b2, e2, cultivation_map)
  source = { begin: b1, end: e1 }
  target = { begin: b2, end: e2 }
  unaligned = [{ source: source, target: target, alignment: :empty }]

  return unaligned if (e1 - b1) > 2000

  alignment = TextAlignment::MixedAlignment.new(str1[b1...e1].downcase, str2[b2...e2].downcase)
  similarity = alignment.similarity

  # Written as a positive test on purpose: a NaN similarity fails `>= 0.5`,
  # whereas the former `similarity < 0.5` rejection let NaN through as a match.
  if similarity >= 0.5
    [{ source: source, target: target, alignment: alignment, similarity: similarity }]
  else
    unaligned
  end
end
|
398
|
+
|
399
|
+
# Feeds the target spans of the :block and :term blocks in @blocks into
# @cultivation_map.
#
# Consecutive spans are condensed first: a span that begins no later than one
# position after the previous condensed span ends extends that span instead
# of starting a new one.
#
# Does nothing (returns nil) when @blocks is nil; otherwise returns whatever
# @cultivation_map.cultivate returns.
def update_cultivation_map
  return if @blocks.nil?

  matched = @blocks.select { |b| b[:alignment] == :block || b[:alignment] == :term }

  regions = []
  matched.each do |b|
    first = b[:target][:begin]
    last = b[:target][:end]

    if !regions.empty? && regions.last[1] + 1 >= first
      regions.last[1] = last
    else
      regions << [first, last]
    end
  end

  @cultivation_map.cultivate(regions)
end
|
420
|
+
|
421
|
+
# Returns shallow copies of the given blocks with their positions passed
# through demap_position: :source positions via @text_mapping and :target
# positions via @rtext_mapping. A block without :source or :target keeps that
# key absent. The input blocks themselves are left unchanged.
#
# Returns nil when _blocks is nil.
def demap_blocks(_blocks)
  return nil if _blocks.nil?

  _blocks.map do |block|
    copy = block.dup

    if copy[:source]
      copy[:source] = {
        begin: @text_mapping.demap_position(copy[:source][:begin]),
        end: @text_mapping.demap_position(copy[:source][:end])
      }
    end

    if copy[:target]
      copy[:target] = {
        begin: @rtext_mapping.demap_position(copy[:target][:begin]),
        end: @rtext_mapping.demap_position(copy[:target][:end])
      }
    end

    copy
  end
end
|
432
|
+
|
400
433
|
end
|