text_alignment 0.9 → 0.11.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/align_annotations +24 -14
- data/lib/text_alignment/anchor_finder.rb +120 -70
- data/lib/text_alignment/{mappings.rb → char_mapping.rb} +93 -75
- data/lib/text_alignment/cultivation_map.rb +94 -0
- data/lib/text_alignment/glcs_alignment_fast.rb +2 -2
- data/lib/text_alignment/mixed_alignment.rb +21 -3
- data/lib/text_alignment/text_alignment.rb +276 -243
- data/lib/text_alignment/version.rb +1 -1
- metadata +4 -3
@@ -0,0 +1,94 @@
|
|
1
|
+
module TextAlignment; end unless defined? TextAlignment

# Keeps track of the regions of a text that have already been claimed
# ("cultivated"), so that later searches can skip them.
#
# A region is a two-element array [begin, end]. The map is kept sorted by
# begin position; overlapping regions are rejected and touching regions
# (end of one == begin of the next) are merged into one.
class TextAlignment::CultivationMap
  attr_reader :map

  def initialize
    @map = []
  end

  # Adds +regions+ (an array of [begin, end] pairs) to the map.
  #
  # Returns the new map. Raises RuntimeError ("Overlapping regions: ...")
  # when a region overlaps another one; in that case the map is left as it
  # was before the call. The arrays passed in are never modified.
  def cultivate(regions)
    sorted = (@map + regions).sort_by { |r| [r[0], r[1]] }

    merged = []
    sorted.each do |region|
      last = merged.last
      if last.nil? || last[1] < region[0]
        # copy, because a later touching region extends this entry in place
        merged << region.dup
      elsif last[1] > region[0]
        raise "Overlapping regions: #{last} : #{region}"
      else
        # touching regions: extend the previous one
        # (this used to be a no-op comparison, `==`, which dropped the region)
        last[1] = region[1]
      end
    end

    @map = merged
  end

  # Looks up the first region whose end lies beyond +end_position+
  # (+position+ when omitted). When that region starts at or before
  # +position+, its end is returned, i.e. the position from which a search
  # has to be resumed; otherwise nil.
  def search_again_position(position, end_position = nil)
    end_position ||= position
    region = @map.bsearch { |r| end_position < r[1] }
    return nil if region.nil? || region[0] > position

    region[1]
  end

  # Returns the end of the last region that ends at or before +position+,
  # or nil when there is none.
  def last_cultivated_position(position)
    ridx = @map.rindex { |r| r[1] <= position }
    ridx.nil? ? nil : @map[ridx][1]
  end

  # Returns the begin of the first region that starts after +position+,
  # or nil when there is none.
  def next_cultivated_position(position)
    region = @map.bsearch { |r| position < r[0] }
    region.nil? ? nil : region[0]
  end

  # Returns the cultivated regions that touch the inside of +region+:
  # those ending within it, those starting within it, and those strictly
  # containing it. (The last case used to be missed, which made a fully
  # cultivated region look open.)
  def in_regions(region)
    @map.select do |r|
      (r[1] > region[0] && r[1] <= region[1]) ||
        (r[0] < region[1] && r[0] >= region[0]) ||
        (r[0] < region[0] && r[1] > region[1])
    end
  end

  # Classifies +region+ against the map. Returns a pair [state, sub_region]:
  #
  # [:open, region]::          no cultivated part inside the region
  # [:front_open, [b, e]]::    only the front is free; [b, e] is the free part
  # [:rear_open, [b, e]]::     only the rear is free; [b, e] is the free part
  # [:middle_closed, [b, e]]:: both ends are free; [b, e] spans the cultivated parts
  # [:closed, nil]::           neither end is free
  def region_state(region)
    closed_parts = in_regions(region)
    return [:open, region] if closed_parts.empty?

    front = front_open?(region, closed_parts)
    rear = rear_open?(region, closed_parts)

    if front && rear
      [:middle_closed, [closed_parts.first[0], closed_parts.last[1]]]
    elsif front
      [:front_open, [region[0], closed_parts.first[0]]]
    elsif rear
      [:rear_open, [closed_parts.last[1], region[1]]]
    else
      [:closed, nil]
    end
  end

  # Like String#index, but skips occurrences that fall on cultivated text.
  # Returns the begin position of the first occurrence of +target+ in
  # +string+, at or after +position+, whose span is :open; nil when there
  # is no such occurrence.
  def index(target, string, position = 0)
    length = target.length
    loop do
      _begin = string.index(target, position)
      break if _begin.nil?

      # the hit starts inside a cultivated region: resume after that region
      position = search_again_position(_begin)
      next unless position.nil?

      break _begin if region_state([_begin, _begin + length])[0] == :open

      position = _begin + 1
    end
  end

  private

  # True when +region+ starts before the first of +closed_parts+.
  def front_open?(region, closed_parts)
    closed_parts.first[0] > region[0]
  end

  # True when +region+ ends after the last of +closed_parts+.
  def rear_open?(region, closed_parts)
    closed_parts.last[1] < region[1]
  end
end
|
@@ -5,7 +5,7 @@ require 'text_alignment/find_divisions'
|
|
5
5
|
require 'text_alignment/lcs_comparison'
|
6
6
|
require 'text_alignment/lcs_alignment'
|
7
7
|
require 'text_alignment/glcs_alignment'
|
8
|
-
require 'text_alignment/
|
8
|
+
require 'text_alignment/char_mapping'
|
9
9
|
|
10
10
|
module TextAlignment; end unless defined? TextAlignment
|
11
11
|
|
@@ -106,7 +106,7 @@ if __FILE__ == $0
|
|
106
106
|
|
107
107
|
dictionary = [["β", "beta"]]
|
108
108
|
# align = TextAlignment::TextAlignment.new(str1, str2)
|
109
|
-
align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::
|
109
|
+
align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::CHAR_MAPPING)
|
110
110
|
p align.common_elements
|
111
111
|
p align.mapped_elements
|
112
112
|
end
|
@@ -6,7 +6,7 @@ require 'text_alignment/lcs_comparison'
|
|
6
6
|
require 'text_alignment/lcs_alignment'
|
7
7
|
require 'text_alignment/lcs_cdiff'
|
8
8
|
require 'text_alignment/glcs_alignment'
|
9
|
-
require 'text_alignment/
|
9
|
+
require 'text_alignment/char_mapping'
|
10
10
|
|
11
11
|
module TextAlignment; end unless defined? TextAlignment
|
12
12
|
|
@@ -20,7 +20,9 @@ class TextAlignment::MixedAlignment
|
|
20
20
|
# Computes the mixed alignment of two strings.
#
# _str1, _str2:: the strings to align; neither may be nil
# _mappings::    optional character mappings; TextAlignment::CHAR_MAPPING
#                is used when omitted
#
# Raises ArgumentError when either string is nil.
def initialize(_str1, _str2, _mappings = nil)
  raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?

  # Honor the caller's mappings. `mappings ||= ...` read a local that was
  # never set, so the `_mappings` argument was silently ignored.
  mappings = _mappings || TextAlignment::CHAR_MAPPING

  # the alignment works on copies of the given strings
  str1 = _str1.dup
  str2 = _str2.dup

  _compute_mixed_alignment(str1, str2, mappings)
end
|
@@ -63,7 +65,7 @@ class TextAlignment::MixedAlignment
|
|
63
65
|
end
|
64
66
|
|
65
67
|
cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
|
66
|
-
@similarity =
|
68
|
+
@similarity = compute_similarity(str1, str2, @sdiff)
|
67
69
|
@str1_match_initial = cmp.str1_match_initial
|
68
70
|
@str1_match_final = cmp.str1_match_final
|
69
71
|
@str2_match_initial = cmp.str2_match_initial
|
@@ -139,4 +141,20 @@ class TextAlignment::MixedAlignment
|
|
139
141
|
@position_map_end = posmap_end.sort.to_h
|
140
142
|
end
|
141
143
|
|
144
|
+
# Scores an alignment from its sdiff.
#
# s1, s2:: the aligned strings (not consulted; the score is derived from +sdiff+ only)
# sdiff::  elements responding to +action+ and +old_element+, or nil
#
# The score is coverage * fragmentation rate, where
# * coverage is the share of non-whitespace old characters with action '=';
# * the fragmentation rate is the number of whitespace old characters
#   divided by the number of runs of matched non-whitespace characters.
#
# Returns 0 when +sdiff+ is nil, and 0.0 when no non-whitespace character
# is matched. The latter case used to yield NaN (0.0 / 0, or 0.0 * Infinity),
# which a `similarity < 0.5` check does not reject.
def compute_similarity(s1, s2, sdiff)
  return 0 if sdiff.nil?

  # recoverability
  count_nws = sdiff.count { |d| d.old_element =~ /\S/ }
  count_nws_match = sdiff.count { |d| d.action == '=' && d.old_element =~ /\S/ }

  # fragmentation: matched whitespace is turned into a separator so that
  # each run of '=' corresponds to one matched non-whitespace fragment
  count_ofrag = sdiff.count { |d| d.old_element =~ /\s/ }
  actions = sdiff.collect { |d| ((d.action == '=') && (d.old_element =~ /\s/)) ? ' ' : d.action }
  count_frag = actions.join.scan(/=+/).count

  # nothing to recover, or nothing recovered
  return 0.0 if count_nws.zero? || count_frag.zero?

  coverage = count_nws_match.to_f / count_nws
  rate_frag = count_ofrag.to_f / count_frag

  coverage * rate_frag
end
|
142
160
|
end
|
@@ -2,6 +2,7 @@
|
|
2
2
|
require 'text_alignment/constants'
|
3
3
|
require 'text_alignment/anchor_finder'
|
4
4
|
require 'text_alignment/mixed_alignment'
|
5
|
+
require 'text_alignment/cultivation_map'
|
5
6
|
|
6
7
|
module TextAlignment; end unless defined? TextAlignment
|
7
8
|
|
@@ -10,255 +11,48 @@ class TextAlignment::TextAlignment
|
|
10
11
|
attr_reader :similarity
|
11
12
|
attr_reader :lost_annotations
|
12
13
|
|
13
|
-
|
14
|
-
|
14
|
+
# Initialize with a reference text, against which texts will be aligned
|
15
|
+
def initialize(reference_text, to_prevent_overlap = false)
|
16
|
+
raise ArgumentError, "nil text" if reference_text.nil?
|
15
17
|
|
16
|
-
@
|
17
|
-
@
|
18
|
-
@
|
18
|
+
@original_reference_text = reference_text
|
19
|
+
@rtext_mapping = TextAlignment::CharMapping.new(reference_text)
|
20
|
+
@mapped_reference_text = @rtext_mapping.mapped_text
|
21
|
+
@to_prevent_overlap = to_prevent_overlap
|
19
22
|
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
@block_alignment[:blocks] = r
|
24
|
-
return
|
25
|
-
end
|
26
|
-
|
27
|
-
## to find block alignments
|
28
|
-
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
|
29
|
-
|
30
|
-
blocks = []
|
31
|
-
while block = anchor_finder.get_next_anchor
|
32
|
-
last = blocks.last
|
33
|
-
if last && (block[:source][:begin] == last[:source][:end] + 1) && (block[:target][:begin] == last[:target][:end] + 1)
|
34
|
-
last[:source][:end] = block[:source][:end]
|
35
|
-
last[:target][:end] = block[:target][:end]
|
36
|
-
else
|
37
|
-
blocks << block.merge(alignment: :block, delta: block[:target][:begin] - block[:source][:begin])
|
38
|
-
end
|
39
|
-
end
|
40
|
-
|
41
|
-
# pp blocks
|
42
|
-
# puts "-----"
|
43
|
-
# puts
|
44
|
-
# exit
|
45
|
-
# blocks.each do |b|
|
46
|
-
# p [b[:source], b[:target]]
|
47
|
-
# puts "---"
|
48
|
-
# puts str1[b[:source][:begin] ... b[:source][:end]]
|
49
|
-
# puts "---"
|
50
|
-
# puts str2[b[:target][:begin] ... b[:target][:end]]
|
51
|
-
# puts "====="
|
52
|
-
# puts
|
53
|
-
# end
|
54
|
-
# puts "-=-=-=-=-"
|
55
|
-
# puts
|
56
|
-
|
57
|
-
## to fill the gaps
|
58
|
-
last_block = nil
|
59
|
-
blocks2 = blocks.inject([]) do |sum, block|
|
60
|
-
b1 = last_block ? last_block[:source][:end] : 0
|
61
|
-
e1 = block[:source][:begin]
|
62
|
-
|
63
|
-
sum += if b1 == e1
|
64
|
-
[block]
|
65
|
-
else
|
66
|
-
b2 = last_block ? last_block[:target][:end] : 0
|
67
|
-
e2 = block[:target][:begin]
|
68
|
-
|
69
|
-
if b2 == e2
|
70
|
-
[
|
71
|
-
{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
|
72
|
-
block
|
73
|
-
]
|
74
|
-
else
|
75
|
-
if b1 == 0 && b2 == 0
|
76
|
-
len_buffer = (e1 * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
77
|
-
b2 = e2 - len_buffer if e2 > len_buffer
|
78
|
-
end
|
79
|
-
|
80
|
-
_str1 = str1[b1 ... e1]
|
81
|
-
_str2 = str2[b2 ... e2]
|
82
|
-
|
83
|
-
if _str1.strip.empty? || _str2.strip.empty?
|
84
|
-
[
|
85
|
-
{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
|
86
|
-
block
|
87
|
-
]
|
88
|
-
else
|
89
|
-
local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations) << block
|
90
|
-
end
|
91
|
-
end
|
92
|
-
end
|
93
|
-
|
94
|
-
last_block = block
|
95
|
-
sum
|
96
|
-
end
|
97
|
-
|
98
|
-
# the last step
|
99
|
-
blocks2 += if last_block.nil?
|
100
|
-
local_alignment_blocks(str1, 0, str1.length, str2, 0, str2.length, denotations)
|
101
|
-
else
|
102
|
-
b1 = last_block[:source][:end]
|
103
|
-
if b1 < str1.length
|
104
|
-
e1 = str1.length
|
105
|
-
|
106
|
-
b2 = last_block[:target][:end]
|
107
|
-
if b2 < str2.length
|
108
|
-
len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
109
|
-
e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
|
110
|
-
local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
|
111
|
-
else
|
112
|
-
[{source:{begin:last_block[:source][:end], end:str1.length}, alignment: :empty}]
|
113
|
-
end
|
114
|
-
else
|
115
|
-
[]
|
116
|
-
end
|
117
|
-
end
|
118
|
-
|
119
|
-
@block_alignment[:blocks] = blocks2
|
23
|
+
@original_text = nil
|
24
|
+
@blocks = nil
|
25
|
+
@cultivation_map = TextAlignment::CultivationMap.new
|
120
26
|
end
|
121
27
|
|
122
|
-
def
|
123
|
-
|
124
|
-
|
125
|
-
unless block_begin.nil?
|
126
|
-
return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
127
|
-
end
|
28
|
+
def align(text, denotations = nil)
|
29
|
+
# To maintain the cultivation map
|
30
|
+
update_cultivation_map if @to_prevent_overlap
|
128
31
|
|
129
|
-
|
130
|
-
unless
|
131
|
-
|
32
|
+
# In case the input text is the same as the previous one, reuse the previous text mapping
|
33
|
+
unless @original_text && @original_text == text
|
34
|
+
@original_text = text
|
35
|
+
@text_mapping = TextAlignment::CharMapping.new(text)
|
132
36
|
end
|
133
37
|
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
def local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations = nil)
|
138
|
-
block2 = str2[b2 ... e2]
|
139
|
-
|
140
|
-
## term-based alignment
|
141
|
-
tblocks = if denotations
|
142
|
-
ds_in_scope = denotations.select{|d| d[:span][:begin] >= b1 && d[:span][:end] <= e1}.
|
143
|
-
sort{|d1, d2| d1[:span][:begin] <=> d2[:span][:begin] || d2[:span][:end] <=> d1[:span][:end] }.
|
144
|
-
map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
|
145
|
-
|
146
|
-
position = 0
|
147
|
-
tblocks = ds_in_scope.map do |term|
|
148
|
-
lex = term[:lex]
|
149
|
-
r = block2.index(lex, position)
|
150
|
-
if r.nil?
|
151
|
-
position = nil
|
152
|
-
break
|
153
|
-
end
|
154
|
-
position = r + lex.length
|
155
|
-
{source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, delta: r - term[:span][:begin]}
|
156
|
-
end
|
157
|
-
|
158
|
-
# missing term found
|
159
|
-
tblocks = [] if position.nil?
|
38
|
+
@mapped_text = @text_mapping.mapped_text
|
39
|
+
denotations_mapped = @text_mapping.enmap_denotations(denotations)
|
160
40
|
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
lex = term[:lex]
|
165
|
-
look_forward = block2.index(lex, position)
|
166
|
-
unless look_forward.nil?
|
167
|
-
tblocks = []
|
168
|
-
break
|
169
|
-
end
|
170
|
-
end
|
171
|
-
end
|
172
|
-
|
173
|
-
tblocks
|
41
|
+
## To generate the block_alignment of the input text against the reference text
|
42
|
+
@blocks = if r = whole_block_alignment(@mapped_text, @mapped_reference_text, @cultivation_map)
|
43
|
+
r
|
174
44
|
else
|
175
|
-
|
45
|
+
find_block_alignment(@mapped_text, @mapped_reference_text, denotations_mapped, @cultivation_map)
|
176
46
|
end
|
177
47
|
|
178
|
-
|
179
|
-
if b1 == 0 && e1 == str1.length
|
180
|
-
if (e1 > 2000) || (e2 > 2000)
|
181
|
-
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
182
|
-
else
|
183
|
-
block1 = str1[b1 ... e1]
|
184
|
-
block2 = str2[b2 ... e2]
|
185
|
-
|
186
|
-
## character-based alignment
|
187
|
-
alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase, @mappings)
|
188
|
-
if alignment.sdiff.nil?
|
189
|
-
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
190
|
-
else
|
191
|
-
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
|
192
|
-
end
|
193
|
-
end
|
194
|
-
else
|
195
|
-
block1 = str1[b1 ... e1]
|
196
|
-
block2 = str2[b2 ... e2]
|
197
|
-
|
198
|
-
## character-based alignment
|
199
|
-
alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase, @mappings)
|
200
|
-
if alignment.sdiff.nil?
|
201
|
-
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
202
|
-
else
|
203
|
-
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
|
204
|
-
end
|
205
|
-
end
|
206
|
-
else
|
207
|
-
last_tblock = nil
|
208
|
-
lblocks = tblocks.inject([]) do |sum, tblock|
|
209
|
-
tb1 = last_tblock ? last_tblock[:source][:end] : b1
|
210
|
-
te1 = tblock[:source][:begin]
|
211
|
-
|
212
|
-
sum += if te1 == tb1
|
213
|
-
[tblock]
|
214
|
-
else
|
215
|
-
tb2 = last_tblock ? last_tblock[:target][:end] : b2
|
216
|
-
te2 = tblock[:target][:begin]
|
217
|
-
|
218
|
-
if b2 == e2
|
219
|
-
[
|
220
|
-
{source:{begin:tb1, end:te1}, alignment: :empty},
|
221
|
-
tblock
|
222
|
-
]
|
223
|
-
else
|
224
|
-
[
|
225
|
-
{source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty},
|
226
|
-
tblock
|
227
|
-
]
|
228
|
-
end
|
229
|
-
end
|
230
|
-
|
231
|
-
last_tblock = tblock
|
232
|
-
sum
|
233
|
-
end
|
234
|
-
|
235
|
-
if last_tblock[:source][:end] < e1
|
236
|
-
if last_tblock[:target][:end] < e2
|
237
|
-
lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, target:{begin:last_tblock[:target][:end], end:e2}, alignment: :empty}
|
238
|
-
else
|
239
|
-
lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, alignment: :empty}
|
240
|
-
end
|
241
|
-
end
|
242
|
-
|
243
|
-
lblocks
|
244
|
-
end
|
48
|
+
@block_alignment = {text: @original_text, reference_text: @original_reference_text, denotations: denotations, blocks: demap_blocks(@blocks)}
|
245
49
|
end
|
246
50
|
|
51
|
+
def transform_begin_position(_begin_position)
|
52
|
+
begin_position = @text_mapping.enmap_position(_begin_position)
|
247
53
|
|
248
|
-
|
249
|
-
|
250
|
-
len = target.len
|
251
|
-
Enumerator.new do |yielder|
|
252
|
-
while idx = str.index(target, position)
|
253
|
-
yielder << idx
|
254
|
-
position = idx + len
|
255
|
-
end
|
256
|
-
end
|
257
|
-
end
|
258
|
-
|
259
|
-
def transform_begin_position(begin_position)
|
260
|
-
i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
|
261
|
-
block = @block_alignment[:blocks][i]
|
54
|
+
i = @blocks.index{|b| b[:source][:end] > begin_position}
|
55
|
+
block = @blocks[i]
|
262
56
|
|
263
57
|
b = if block[:alignment] == :block || block[:alignment] == :term
|
264
58
|
begin_position + block[:delta]
|
@@ -272,11 +66,15 @@ class TextAlignment::TextAlignment
|
|
272
66
|
r = block[:alignment].transform_begin_position(begin_position - block[:source][:begin])
|
273
67
|
r.nil? ? nil : r + block[:target][:begin]
|
274
68
|
end
|
69
|
+
|
70
|
+
@rtext_mapping.demap_position(b)
|
275
71
|
end
|
276
72
|
|
277
|
-
def transform_end_position(
|
278
|
-
|
279
|
-
|
73
|
+
def transform_end_position(_end_position)
|
74
|
+
end_position = @text_mapping.enmap_position(_end_position)
|
75
|
+
|
76
|
+
i = @blocks.index{|b| b[:source][:end] >= end_position}
|
77
|
+
block = @blocks[i]
|
280
78
|
|
281
79
|
e = if block[:alignment] == :block || block[:alignment] == :term
|
282
80
|
end_position + block[:delta]
|
@@ -290,6 +88,8 @@ class TextAlignment::TextAlignment
|
|
290
88
|
r = block[:alignment].transform_end_position(end_position - block[:source][:begin])
|
291
89
|
r.nil? ? nil : r + block[:target][:begin]
|
292
90
|
end
|
91
|
+
|
92
|
+
@rtext_mapping.demap_position(e)
|
293
93
|
end
|
294
94
|
|
295
95
|
def transform_a_span(span)
|
@@ -308,7 +108,7 @@ class TextAlignment::TextAlignment
|
|
308
108
|
source = {begin:d.begin, end:d.end}
|
309
109
|
d.begin = transform_begin_position(d.begin);
|
310
110
|
d.end = transform_end_position(d.end);
|
311
|
-
raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @
|
111
|
+
raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_reference_text.length
|
312
112
|
rescue
|
313
113
|
@lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
|
314
114
|
d.begin = nil
|
@@ -324,7 +124,7 @@ class TextAlignment::TextAlignment
|
|
324
124
|
|
325
125
|
r = hdenotations.collect do |d|
|
326
126
|
t = transform_a_span(d[:span])
|
327
|
-
raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @
|
127
|
+
raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_reference_text.length
|
328
128
|
new_d = d.dup.merge({span:t})
|
329
129
|
rescue
|
330
130
|
@lost_annotations << {source: d[:span], target:t}
|
@@ -335,8 +135,8 @@ class TextAlignment::TextAlignment
|
|
335
135
|
end
|
336
136
|
|
337
137
|
def alignment_show
|
338
|
-
stext = @block_alignment[:
|
339
|
-
ttext = @block_alignment[:
|
138
|
+
stext = @block_alignment[:text]
|
139
|
+
ttext = @block_alignment[:reference_text]
|
340
140
|
|
341
141
|
show = ''
|
342
142
|
@block_alignment[:blocks].each do |a|
|
@@ -392,9 +192,242 @@ class TextAlignment::TextAlignment
|
|
392
192
|
|
393
193
|
"***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
|
394
194
|
"[#{astr1}]\n" +
|
395
|
-
"[#{astr2}]\n\n"
|
195
|
+
"[#{astr2.gsub("\n", " ")}]\n\n"
|
396
196
|
end
|
397
197
|
end
|
398
198
|
show
|
399
199
|
end
|
200
|
+
|
201
|
+
private
|
202
|
+
|
203
|
+
# Aligns str1 against str2 block by block and returns the list of blocks.
#
# First the anchors reported by TextAlignment::AnchorFinder are collected
# as :block blocks. Then every stretch of str1 between two consecutive
# anchors (and before the first / after the last one) is filled in, either
# with an :empty block or with the outcome of local_alignment against a
# window of str2 chosen from the state of the cultivation map.
def find_block_alignment(str1, str2, denotations, cultivation_map)
  finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map)

  ## To collect the anchors
  ## An anchor starting one position past the end of the previous block,
  ## on both sides, extends that block instead of opening a new one.
  anchors = []
  while (anchor = finder.get_next_anchor)
    prev = anchors.last
    if prev &&
       (anchor[:source][:begin] == prev[:source][:end] + 1) &&
       (anchor[:target][:begin] == prev[:target][:end] + 1)
      prev[:source][:end] = anchor[:source][:end]
      prev[:target][:end] = anchor[:target][:end]
    else
      anchors << anchor.merge(alignment: :block, delta: anchor[:target][:begin] - anchor[:source][:begin])
    end
  end

  # an attempt failed when it produced nothing or starts with an :empty block
  failed = ->(attempt) { attempt.empty? || attempt.first[:alignment] == :empty }

  ## To fill the gaps
  ## The trailing nil stands for the end of the texts.
  filled = []
  previous = nil
  (anchors + [nil]).each do |current|
    b1 = previous.nil? ? 0 : previous[:source][:end]
    e1 = current.nil? ? str1.length : current[:source][:begin]

    if b1 < e1
      b2 = previous.nil? ? 0 : previous[:target][:end]
      e2 = current.nil? ? str2.length : current[:target][:begin]

      unaligned = [{source: {begin: b1, end: e1}, alignment: :empty}]
      has_preceding = !filled.empty? # is there a matched block before the gap?
      has_following = !current.nil?  # is there a matched block after the gap?

      gap_blocks =
        if str1[b1...e1].strip.empty? || str2[b2...e2].strip.empty?
          [{source: {begin: b1, end: e1}, target: {begin: b2, end: e2}, alignment: :empty}]
        else
          len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
          state, state_region = cultivation_map.region_state([b2, e2])

          # align against the window [b2, limit], cut down to len_buffer from the front
          align_front = lambda do |limit|
            window_end = (limit - b2) > len_buffer ? b2 + len_buffer : limit
            local_alignment(str1, b1, e1, str2, b2, window_end, denotations, cultivation_map)
          end

          # align against the window [limit, e2], cut down to len_buffer from the rear
          align_rear = lambda do |limit|
            window_begin = (e2 - limit) > len_buffer ? e2 - len_buffer : limit
            local_alignment(str1, b1, e1, str2, window_begin, e2, denotations, cultivation_map)
          end

          case state
          when :closed
            unaligned
          when :front_open
            has_preceding ? align_front.call(state_region[1]) : unaligned
          when :rear_open
            has_following ? align_rear.call(state_region[0]) : unaligned
          when :middle_closed
            attempt = has_preceding ? align_front.call(state_region[0]) : unaligned
            (failed.call(attempt) && has_following) ? align_rear.call(state_region[1]) : attempt
          else # :open
            if (e2 - b2) > len_buffer
              # too wide: try next to the preceding block, then next to the following one
              attempt = has_preceding ? align_front.call(e2) : unaligned
              (failed.call(attempt) && has_following) ? align_rear.call(b2) : attempt
            else
              align_front.call(e2)
            end
          end
        end

      filled.concat(gap_blocks)
    end

    previous = current
    filled << current unless current.nil?
  end

  filled
end
|
310
|
+
|
311
|
+
# Tries to locate str1 as a whole inside str2, first as it is and then
# case-insensitively (both strings downcased), using the cultivation map's
# +index+ for the search.
#
# Returns a one-element array holding a :block alignment of the whole of
# str1, or nil when str1 is not found either way.
def whole_block_alignment(str1, str2, cultivation_map)
  block_begin = cultivation_map.index(str1, str2)
  block_begin = cultivation_map.index(str1.downcase, str2.downcase) if block_begin.nil?
  return nil if block_begin.nil?

  [
    {
      source: {begin: 0, end: str1.length},
      target: {begin: block_begin, end: block_begin + str1.length},
      delta: block_begin,
      alignment: :block
    }
  ]
end
|
320
|
+
|
321
|
+
# Aligns str1[b1...e1] against str2[b2...e2].
#
# The term-based alignment is tried first; its result is returned when it
# matched at least one term (a block with alignment :term). Otherwise the
# LCS-based alignment is used.
#
# The fallback used to be taken only for an empty term-based result. The
# term-based alignment, however, pads its result with an :empty gap block
# whenever b1 < e1, so that a result without any matched term was returned
# as it was and the LCS-based alignment was never reached.
def local_alignment(str1, b1, e1, str2, b2, e2, denotations = nil, cultivation_map)
  tblocks = term_based_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
  return tblocks if tblocks.any? { |b| b[:alignment] == :term }

  lcs_alignment(str1, b1, e1, str2, b2, e2, cultivation_map)
end
|
329
|
+
|
330
|
+
# Aligns str1[b1...e1] against str2 by locating the text of each denotation.
#
# The denotations whose spans lie within [b1, e1] are looked up one after
# another in str2[0...e2], starting at b2 and each one after the previous
# hit, through the cultivation map's +index+. The term matching is given up
# (no :term blocks) when
# * a term cannot be found, or
# * any of the terms occurs again after the last hit (ambiguous matching).
#
# Returns the :term blocks in order, with an :empty block for every
# stretch of the source between them and between them and b1 / e1. Without
# any :term block, the result is a single :empty block covering the whole
# range (or [] when e1 <= b1).
def term_based_alignment(str1, b1, e1, str2, b2, e2, denotations = nil, cultivation_map)
  str2_block = str2[0...e2]

  ## term-based alignment
  tblocks = []
  if denotations
    # By begin position; among the same begin, the longer span first.
    # (The former comparator, `a <=> b || c <=> d`, never reached its
    # tie-break because 0 is truthy in Ruby.)
    denotations_in_scope = denotations
      .select { |d| d[:span][:begin] >= b1 && d[:span][:end] <= e1 }
      .sort_by { |d| [d[:span][:begin], -d[:span][:end]] }
      .map { |d| d.merge(lex: str1[d[:span][:begin]...d[:span][:end]]) }

    search_position = b2
    tblocks = denotations_in_scope.map do |denotation|
      lex = denotation[:lex]
      term_begin = cultivation_map.index(lex, str2_block, search_position)
      break [] if term_begin.nil? # give up when a term is missing

      search_position = term_begin + lex.length
      {
        source: denotation[:span],
        target: {begin: term_begin, end: term_begin + lex.length},
        alignment: :term,
        similarity: 0.9,
        delta: term_begin - denotation[:span][:begin]
      }
    end

    # redundant matching: a term occurring again after the last hit
    unless tblocks.empty?
      search_position = tblocks.last[:target][:end]
      redundant = denotations_in_scope.any? do |term|
        cultivation_map.index(term[:lex], str2_block, search_position)
      end
      tblocks = [] if redundant
    end
  end

  ## To fill the gaps; the trailing nil stands for the end of the range
  filled = []
  previous = nil
  (tblocks + [nil]).each do |current|
    tb1 = previous.nil? ? b1 : previous[:source][:end]
    te1 = current.nil? ? e1 : current[:source][:begin]

    if te1 > tb1
      tb2 = previous.nil? ? b2 : previous[:target][:end]
      te2 = current.nil? ? e2 : current[:target][:begin]
      filled << {source: {begin: tb1, end: te1}, target: {begin: tb2, end: te2}, alignment: :empty}
    end

    previous = current
    filled << current unless current.nil?
  end

  filled
end
|
382
|
+
|
383
|
+
# Aligns str1[b1...e1] against str2[b2...e2] with TextAlignment::MixedAlignment,
# after downcasing both.
#
# Returns a one-element array. The block carries the alignment object and
# its similarity when the similarity is at least 0.5. It is an :empty block
# when
# * the source range is longer than 2000 characters (no alignment is
#   attempted), or
# * the similarity is below 0.5, or is not a number: NaN fails `>= 0.5`,
#   whereas the former `< 0.5` test let it through as a good alignment.
#
# cultivation_map is accepted for signature parity with the other
# alignment methods and is not used here.
def lcs_alignment(str1, b1, e1, str2, b2, e2, cultivation_map)
  source = {begin: b1, end: e1}
  target = {begin: b2, end: e2}
  unaligned = [{source: source, target: target, alignment: :empty}]

  return unaligned if (e1 - b1) > 2000

  alignment = TextAlignment::MixedAlignment.new(str1[b1...e1].downcase, str2[b2...e2].downcase)
  similarity = alignment.similarity
  return unaligned unless similarity >= 0.5

  [{source: source, target: target, alignment: alignment, similarity: similarity}]
end
|
398
|
+
|
399
|
+
# Registers the target regions of the current blocks in the cultivation map.
#
# Only blocks aligned as :block or :term are taken. Consecutive regions are
# condensed into one unless the next one begins more than one position
# past the end of the previous one. Does nothing when there are no blocks
# yet (@blocks is nil).
def update_cultivation_map
  return if @blocks.nil?

  matched_blocks = @blocks.select { |b| b[:alignment] == :block || b[:alignment] == :term }
  target_regions = matched_blocks.map { |b| [b[:target][:begin], b[:target][:end]] }

  newly_cultivated_regions = target_regions.each_with_object([]) do |region, condensed|
    if condensed.empty? || (condensed.last.last + 1 < region.first)
      condensed.push region
    else
      # close enough to the previous region: extend it
      condensed.last[1] = region.last
    end
  end

  @cultivation_map.cultivate(newly_cultivated_regions)
end
|
420
|
+
|
421
|
+
# Returns copies of the given blocks with their positions demapped:
# the :source positions through @text_mapping, the :target positions
# through @rtext_mapping. A block without :source (or :target) is left
# without it. The given blocks are not modified. Returns nil for nil.
def demap_blocks(_blocks)
  return nil if _blocks.nil?

  _blocks.map do |block|
    demapped = block.dup

    if (source = block[:source])
      demapped[:source] = {
        begin: @text_mapping.demap_position(source[:begin]),
        end: @text_mapping.demap_position(source[:end])
      }
    end

    if (target = block[:target])
      demapped[:target] = {
        begin: @rtext_mapping.demap_position(target[:begin]),
        end: @rtext_mapping.demap_position(target[:end])
      }
    end

    demapped
  end
end
|
432
|
+
|
400
433
|
end
|