text_alignment 0.11.0 → 0.11.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/align_annotations +11 -5
- data/lib/text_alignment/anchor_finder.rb +9 -27
- data/lib/text_alignment/cultivation_map.rb +80 -5
- data/lib/text_alignment/mixed_alignment.rb +11 -5
- data/lib/text_alignment/text_alignment.rb +113 -182
- data/lib/text_alignment/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ef59c0cd578ed453a67edeb3b29059f4b76c4c541f777fa35a06e76b299e2564
|
4
|
+
data.tar.gz: 5594c0f6eb1d52cc331c210fdf936e7cb09e30277f21933027e55b0c8cfa0e24
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ef5bae492d7e7b840c00943ac24e571392c4c992a085c6a63c8cb6db126ed9137ba94f8cd7af42a91e17aa327cb9a5ce24b909944c157f21beb8a88c8ce7528b
|
7
|
+
data.tar.gz: b6d84dcdc35399f91a0f6a5a24d84b11169de9d861a249acca52f0fec86e6f1fbf1bb4dbb47a5d43654fa43735b181aa64034447fb71ca675090818dcf67133a
|
data/bin/align_annotations
CHANGED
@@ -77,7 +77,9 @@ def align_mannotations(source_annotations, reference_text, alignment, debug = fa
|
|
77
77
|
annotations[:relations].each do |r|
|
78
78
|
reid = 'R' + (idnum_relations += 1).to_s
|
79
79
|
ididx[r[:id]] = reid
|
80
|
-
|
80
|
+
sid = ididx[r[:subj]]
|
81
|
+
oid = ididx[r[:obj]]
|
82
|
+
target_annotations[:relations] << r.dup.merge({id:reid, subj:sid, obj:oid}) unless sid.nil? || oid.nil?
|
81
83
|
end
|
82
84
|
end
|
83
85
|
|
@@ -86,7 +88,8 @@ def align_mannotations(source_annotations, reference_text, alignment, debug = fa
|
|
86
88
|
annotations[:attributes].each do |a|
|
87
89
|
reid = 'A' + (idnum_attributes += 1).to_s
|
88
90
|
ididx[a[:id]] = reid
|
89
|
-
|
91
|
+
sid = ididx[a[:subj]]
|
92
|
+
target_annotations[:attributes] << a.dup.merge({id:reid, subj:sid}) unless sid.nil?
|
90
93
|
end
|
91
94
|
end
|
92
95
|
|
@@ -95,7 +98,8 @@ def align_mannotations(source_annotations, reference_text, alignment, debug = fa
|
|
95
98
|
annotations[:modifications].each do |m|
|
96
99
|
reid = 'M' + (idnum_modifications += 1).to_s
|
97
100
|
ididx[m[:id]] = reid
|
98
|
-
|
101
|
+
oid = ididx[m[:obj]]
|
102
|
+
target_annotations[:modifications] << m.dup.merge({id:reid, obj:oid}) unless oid.nil?
|
99
103
|
end
|
100
104
|
end
|
101
105
|
end
|
@@ -115,11 +119,13 @@ reference_text = read_text(ARGV[1])
|
|
115
119
|
alignment = TextAlignment::TextAlignment.new(reference_text, true)
|
116
120
|
|
117
121
|
target_annotations = if source_annotations.class == Array
|
122
|
+
# align_mannotations(source_annotations, reference_text, alignment, true)
|
118
123
|
align_mannotations(source_annotations, reference_text, alignment, false)
|
119
124
|
else
|
120
|
-
denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment)
|
121
|
-
|
125
|
+
# denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment)
|
126
|
+
denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment, false)
|
122
127
|
source_annotations.merge({text:reference_text, denotations:denotations})
|
123
128
|
end
|
124
129
|
|
130
|
+
# pp alignment.block_alignment
|
125
131
|
puts target_annotations.to_json
|
@@ -7,11 +7,8 @@ module TextAlignment; end unless defined? TextAlignment
|
|
7
7
|
class TextAlignment::AnchorFinder
|
8
8
|
|
9
9
|
def initialize(source_str, target_str, cultivation_map)
|
10
|
-
@s1
|
11
|
-
|
12
|
-
else
|
13
|
-
[source_str.downcase, target_str.downcase]
|
14
|
-
end
|
10
|
+
@s1 = source_str.downcase
|
11
|
+
@s2 = target_str.downcase
|
15
12
|
|
16
13
|
@cultivation_map = cultivation_map
|
17
14
|
|
@@ -19,19 +16,13 @@ class TextAlignment::AnchorFinder
|
|
19
16
|
@size_window = TextAlignment::SIZE_WINDOW
|
20
17
|
@sim_threshold = TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
21
18
|
@pos_s1_final_possible_begin = @s1.length - @size_ngram - 1
|
19
|
+
@pos_s2_final_possible_end = @s2.length
|
22
20
|
|
23
21
|
# positions of last match
|
24
22
|
@pos_s1_last_match = 0
|
25
23
|
@pos_s2_last_match = 0
|
26
24
|
end
|
27
25
|
|
28
|
-
def reverse?(source_str = nil, target_str = nil)
|
29
|
-
unless source_str.nil?
|
30
|
-
@reverse_p = target_str.length < source_str.length
|
31
|
-
end
|
32
|
-
@reverse_p
|
33
|
-
end
|
34
|
-
|
35
26
|
def get_next_anchor
|
36
27
|
# To find the beginning positions of an anchor ngram in s1 and s2, beginning from the last positions matched
|
37
28
|
beg_s2 = for beg_s1 in @pos_s1_last_match .. @pos_s1_final_possible_begin
|
@@ -49,26 +40,23 @@ class TextAlignment::AnchorFinder
|
|
49
40
|
# To extend the block to the left
|
50
41
|
b1 = beg_s1
|
51
42
|
b2 = beg_s2
|
52
|
-
|
43
|
+
left_boundary_b2 = [@pos_s2_last_match, (@cultivation_map.last_cultivated_position(b2) || 0)].max
|
44
|
+
while b1 > @pos_s1_last_match && b2 > left_boundary_b2 && @s1[b1 - 1] == @s2[b2 - 1]
|
53
45
|
b1 -= 1; b2 -= 1
|
54
46
|
end
|
55
|
-
b1 += 1; b2 += 1
|
56
47
|
|
57
48
|
# To extend the block to the right
|
58
49
|
e1 = beg_s1 + @size_ngram
|
59
50
|
e2 = beg_s2 + @size_ngram
|
60
|
-
|
51
|
+
right_boundary_b2 = @cultivation_map.next_cultivated_position(e2) || @pos_s2_final_possible_end
|
52
|
+
while @s1[e1] && e2 < right_boundary_b2 && @s1[e1] == @s2[e2]
|
61
53
|
e1 += 1; e2 += 1
|
62
54
|
end
|
63
55
|
|
64
56
|
@pos_s1_last_match = e1
|
65
57
|
@pos_s2_last_match = e2
|
66
58
|
|
67
|
-
|
68
|
-
{source:{begin:b2 , end:e2}, target:{begin:b1, end:e1}}
|
69
|
-
else
|
70
|
-
{source:{begin:b1 , end:e1}, target:{begin:b2, end:e2}}
|
71
|
-
end
|
59
|
+
{source:{begin:b1 , end:e1}, target:{begin:b2, end:e2}}
|
72
60
|
end
|
73
61
|
|
74
62
|
private
|
@@ -91,13 +79,7 @@ class TextAlignment::AnchorFinder
|
|
91
79
|
# return nil if the anchor is too frequent
|
92
80
|
def find_beg_s2_candidates(anchor, search_position)
|
93
81
|
candidates = []
|
94
|
-
while _beg_s2 = @
|
95
|
-
search_again_position = @cultivation_map.search_again_position(_beg_s2)
|
96
|
-
unless search_again_position.nil?
|
97
|
-
search_position = search_again_position
|
98
|
-
next
|
99
|
-
end
|
100
|
-
|
82
|
+
while _beg_s2 = @cultivation_map.index(anchor, @s2, search_position)
|
101
83
|
candidates << _beg_s2
|
102
84
|
|
103
85
|
# for speed, skip anchor of high frequency
|
@@ -4,16 +4,91 @@ class TextAlignment::CultivationMap
|
|
4
4
|
attr_reader :map
|
5
5
|
|
6
6
|
# Creates a cultivation map with no cultivated region yet.
#
# @map holds the cultivated regions as [begin, end] pairs and is filled
# by #cultivate.
def initialize
  @map = [] # nothing has been cultivated so far
end
|
9
9
|
|
10
10
|
# Adds cultivated regions to the map and normalizes it.
#
# The new regions are combined with the existing ones, ordered by their begin
# position, and regions that touch each other (one ends exactly where the next
# one begins) are merged into a single region.
#
# The regions passed in are copied, so the caller's arrays are never modified,
# and @map is only replaced when the whole normalization succeeds.
#
# @param regions [Array<Array(Integer, Integer)>] [begin, end] pairs to add
# @return [Array<Array(Integer, Integer)>] the normalized map
# @raise [RuntimeError] if a region begins before the preceding one has ended
def cultivate(regions)
  merged = []

  (@map + regions).sort { |a, b| a[0] <=> b[0] }.each do |region|
    last = merged.last

    if last.nil? || last[1] < region[0]
      # first region, or a gap before this one: keep it as a separate region
      merged << region.dup
    elsif last[1] > region[0]
      raise "Overlapping regions: #{last} : #{region}"
    else
      # adjacent regions: extend the previous one (assignment, not comparison)
      last[1] = region[1]
    end
  end

  @map = merged
end
|
15
27
|
|
16
|
-
def search_again_position(position)
|
17
|
-
|
28
|
+
# Tells where a search has to resume when a position falls into a cultivated region.
#
# Looks up the first region whose end lies beyond end_position (binary search,
# which relies on @map being sorted). If that region begins at or before
# position, its end is returned; otherwise nil.
#
# @param position [Integer] the position to check
# @param end_position [Integer, nil] upper position used for the lookup; defaults to position
# @return [Integer, nil] the end of the covering region, or nil when there is none
def search_again_position(position, end_position = nil)
  limit = end_position || position
  candidate = @map.bsearch { |r| limit < r[1] }

  return nil if candidate.nil?
  return nil if candidate[0] > position

  candidate[1]
end
|
37
|
+
|
38
|
+
# Finds the end of the last cultivated region that ends at or before the given position.
#
# The map is scanned from its tail, so the first hit is the last such region.
#
# @param position [Integer]
# @return [Integer, nil] the end position of that region, or nil if there is none
def last_cultivated_position(position)
  preceding = @map.reverse_each.find { |r| r[1] <= position }
  preceding && preceding[1]
end
|
42
|
+
|
43
|
+
# Finds the begin of the first cultivated region that starts after the given position.
#
# Uses a binary search, which relies on @map being sorted by begin position.
#
# @param position [Integer]
# @return [Integer, nil] the begin position of that region, or nil if there is none
def next_cultivated_position(position)
  following = @map.bsearch { |r| position < r[0] }
  return nil if following.nil?

  following[0]
end
|
47
|
+
|
48
|
+
# Selects the cultivated regions that share text with the given region.
#
# A cultivated region is selected when
# - its end falls inside the given region,
# - its begin falls inside the given region, or
# - it strictly encloses the given region (neither boundary falls inside,
#   yet the whole span is cultivated).
# Regions that merely touch the given region at a boundary are not selected.
#
# @param region [Array(Integer, Integer)] [begin, end] pair to examine
# @return [Array<Array(Integer, Integer)>] the matching entries of @map, in map order
def in_regions(region)
  span_begin, span_end = region

  @map.select do |r|
    ends_inside   = r[1] > span_begin && r[1] <= span_end
    begins_inside = r[0] < span_end && r[0] >= span_begin
    encloses      = r[0] < span_begin && r[1] > span_end

    ends_inside || begins_inside || encloses
  end
end
|
51
|
+
|
52
|
+
# Classifies a region by how it is covered by already cultivated regions.
#
# @param region [Array(Integer, Integer)] [begin, end] pair to examine
# @return [Array] a pair [state, span]:
#   - [:open, region]               nothing in the region is cultivated
#   - [:front_open, [b, e]]         only the front part b...e is open
#   - [:rear_open, [b, e]]          only the rear part b...e is open
#   - [:middle_closed, [cb, ce]]    both ends are open; cb...ce is the closed
#                                   middle, so the open front ends at cb and
#                                   the open rear begins at ce
#   - [:closed, nil]                neither end is open
def region_state(region)
  closed_parts = in_regions(region)
  return [:open, region] if closed_parts.empty?

  open_in_front = front_open?(region, closed_parts)
  open_in_rear  = rear_open?(region, closed_parts)

  if open_in_front && open_in_rear
    # Begin of the first closed part and end of the last one: callers take
    # element 0 as the end of the open front and element 1 as the begin of
    # the open rear, so neither of them may reach into cultivated text.
    [:middle_closed, [closed_parts.first[0], closed_parts.last[1]]]
  elsif open_in_front
    [:front_open, [region[0], closed_parts.first[0]]]
  elsif open_in_rear
    [:rear_open, [closed_parts.last[1], region[1]]]
  else
    [:closed, nil]
  end
end
|
72
|
+
|
73
|
+
# Searches string for target, starting at position, skipping cultivated text.
#
# An occurrence is accepted only when its begin is not covered by a cultivated
# region (see #search_again_position) and the whole occurrence is reported as
# :open by #region_state. When the begin is covered, the search resumes at the
# position returned by #search_again_position; when only some other part is
# closed, it resumes one character after the rejected occurrence.
#
# @param target [String] the text to look for
# @param string [String] the text to search in
# @param position [Integer] where to start searching
# @return [Integer, nil] the begin of the first acceptable occurrence, or nil
def index(target, string, position)
  span = target.length

  while (found = string.index(target, position))
    resume_at = search_again_position(found)

    if resume_at
      position = resume_at
    elsif region_state([found, found + span])[0] == :open
      return found
    else
      position = found + 1
    end
  end

  nil
end
|
84
|
+
|
85
|
+
private
|
86
|
+
|
87
|
+
# Tells whether the region starts before its first closed part does.
#
# @param region [Array(Integer, Integer)] [begin, end] pair
# @param closed_parts [Array<Array(Integer, Integer)>] non-empty list of closed parts
# @return [Boolean]
def front_open?(region, closed_parts)
  first_closed = closed_parts.first
  region[0] < first_closed[0]
end
|
90
|
+
|
91
|
+
# Tells whether the region continues after its last closed part has ended.
#
# @param region [Array(Integer, Integer)] [begin, end] pair
# @param closed_parts [Array<Array(Integer, Integer)>] non-empty list of closed parts
# @return [Boolean]
def rear_open?(region, closed_parts)
  last_closed = closed_parts.last
  region[1] > last_closed[1]
end
|
19
94
|
end
|
@@ -144,11 +144,17 @@ class TextAlignment::MixedAlignment
|
|
144
144
|
# Computes a similarity score from a character-level diff.
#
# The score is the product of
# - coverage: the share of non-whitespace source characters that are matched, and
# - fragmentation rate: the number of whitespace source characters divided by
#   the number of contiguous runs of matched non-whitespace characters.
#
# When there is nothing to divide by (no non-whitespace source character, or
# no matched run at all) the score is 0 rather than NaN, so that threshold
# checks such as `similarity < 0.5` behave as expected.
#
# @param s1 [String] not used in the body
# @param s2 [String] not used in the body
# @param sdiff [Array, nil] diff entries; only #action and #old_element are
#   read here (presumably Diff::LCS context changes -- confirm against caller)
# @return [Numeric] the similarity; 0 when sdiff is nil or nothing is comparable
def compute_similarity(s1, s2, sdiff)
  return 0 if sdiff.nil?

  # recoverability
  count_nws = sdiff.count { |d| d.old_element =~ /\S/ }
  return 0 if count_nws.zero?

  count_nws_match = sdiff.count { |d| d.action == '=' && d.old_element =~ /\S/ }
  coverage = count_nws_match.to_f / count_nws

  # fragmentation rate
  count_ofrag = sdiff.count { |d| d.old_element =~ /\s/ }
  # matched whitespace is turned into ' ' so that it separates runs of '='
  actions = sdiff.map { |d| d.action == '=' && d.old_element =~ /\s/ ? ' ' : d.action }
  count_frag = actions.join.scan(/=+/).count
  return 0 if count_frag.zero?

  rate_frag = count_ofrag.to_f / count_frag

  coverage * rate_frag
end
|
154
160
|
end
|
@@ -15,8 +15,9 @@ class TextAlignment::TextAlignment
|
|
15
15
|
def initialize(reference_text, to_prevent_overlap = false)
|
16
16
|
raise ArgumentError, "nil text" if reference_text.nil?
|
17
17
|
|
18
|
-
@
|
18
|
+
@original_reference_text = reference_text
|
19
19
|
@rtext_mapping = TextAlignment::CharMapping.new(reference_text)
|
20
|
+
@mapped_reference_text = @rtext_mapping.mapped_text
|
20
21
|
@to_prevent_overlap = to_prevent_overlap
|
21
22
|
|
22
23
|
@original_text = nil
|
@@ -34,21 +35,19 @@ class TextAlignment::TextAlignment
|
|
34
35
|
@text_mapping = TextAlignment::CharMapping.new(text)
|
35
36
|
end
|
36
37
|
|
37
|
-
|
38
|
+
@mapped_text = @text_mapping.mapped_text
|
38
39
|
denotations_mapped = @text_mapping.enmap_denotations(denotations)
|
39
40
|
|
40
|
-
rtext_mapped = @rtext_mapping.mapped_text
|
41
|
-
|
42
41
|
## To generate the block_alignment of the input text against the reference text
|
43
|
-
|
44
42
|
# Initialization
|
45
|
-
@block_alignment = {text: @original_text, reference_text: @
|
43
|
+
# @block_alignment = {text: @original_text, reference_text: @original_reference_text, denotations: denotations}
|
44
|
+
@block_alignment = {text: @mapped_text, reference_text: @mapped_reference_text, denotations: denotations}
|
46
45
|
|
47
46
|
# Generation
|
48
|
-
@block_alignment[:blocks] = if r = whole_block_alignment(
|
47
|
+
@block_alignment[:blocks] = if r = whole_block_alignment(@mapped_text, @mapped_reference_text, @cultivation_map)
|
49
48
|
r
|
50
49
|
else
|
51
|
-
find_block_alignment(
|
50
|
+
find_block_alignment(@mapped_text, @mapped_reference_text, denotations_mapped, @cultivation_map)
|
52
51
|
end
|
53
52
|
end
|
54
53
|
|
@@ -134,7 +133,7 @@ class TextAlignment::TextAlignment
|
|
134
133
|
source = {begin:d.begin, end:d.end}
|
135
134
|
d.begin = transform_begin_position(d.begin);
|
136
135
|
d.end = transform_end_position(d.end);
|
137
|
-
raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @
|
136
|
+
raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_reference_text.length
|
138
137
|
rescue
|
139
138
|
@lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
|
140
139
|
d.begin = nil
|
@@ -150,7 +149,7 @@ class TextAlignment::TextAlignment
|
|
150
149
|
|
151
150
|
r = hdenotations.collect do |d|
|
152
151
|
t = transform_a_span(d[:span])
|
153
|
-
raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @
|
152
|
+
raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_reference_text.length
|
154
153
|
new_d = d.dup.merge({span:t})
|
155
154
|
rescue
|
156
155
|
@lost_annotations << {source: d[:span], target:t}
|
@@ -161,8 +160,8 @@ class TextAlignment::TextAlignment
|
|
161
160
|
end
|
162
161
|
|
163
162
|
def alignment_show
|
164
|
-
stext = @
|
165
|
-
ttext = @
|
163
|
+
stext = @mapped_text
|
164
|
+
ttext = @mapped_reference_text
|
166
165
|
|
167
166
|
show = ''
|
168
167
|
@block_alignment[:blocks].each do |a|
|
@@ -218,7 +217,7 @@ class TextAlignment::TextAlignment
|
|
218
217
|
|
219
218
|
"***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
|
220
219
|
"[#{astr1}]\n" +
|
221
|
-
"[#{astr2}]\n\n"
|
220
|
+
"[#{astr2.gsub("\n", " ")}]\n\n"
|
222
221
|
end
|
223
222
|
end
|
224
223
|
show
|
@@ -257,142 +256,109 @@ class TextAlignment::TextAlignment
|
|
257
256
|
# puts "-=-=-=-=-"
|
258
257
|
# puts
|
259
258
|
|
260
|
-
##
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
259
|
+
## To fill the gaps
|
260
|
+
## lblock: last block, cblock: current block
|
261
|
+
lblock = nil
|
262
|
+
blocks2 = (blocks + [nil]).inject([]) do |sum, cblock|
|
263
|
+
b1 = lblock.nil? ? 0 : lblock[:source][:end]
|
264
|
+
e1 = cblock.nil? ? str1.length : cblock[:source][:begin]
|
265
265
|
|
266
|
-
|
267
|
-
[
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
if
|
273
|
-
[
|
274
|
-
{source:{begin:b1, end:e1}, alignment: :empty},
|
275
|
-
block
|
276
|
-
]
|
266
|
+
if b1 < e1
|
267
|
+
b2 = lblock.nil? ? 0 : lblock[:target][:end]
|
268
|
+
e2 = cblock.nil? ? str2.length : cblock[:target][:begin]
|
269
|
+
_str1 = str1[b1 ... e1]
|
270
|
+
_str2 = str2[b2 ... e2]
|
271
|
+
|
272
|
+
sum += if _str1.strip.empty? || _str2.strip.empty?
|
273
|
+
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
277
274
|
else
|
278
275
|
len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
[
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
276
|
+
region_state, state_region = cultivation_map.region_state([b2, e2])
|
277
|
+
case region_state
|
278
|
+
when :closed
|
279
|
+
[]
|
280
|
+
when :front_open
|
281
|
+
oe2 = state_region[1]
|
282
|
+
me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
|
283
|
+
local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
|
284
|
+
when :rear_open
|
285
|
+
ob2 = state_region[0]
|
286
|
+
mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
|
287
|
+
local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
|
288
|
+
when :middle_closed
|
289
|
+
oe2 = state_region[0]
|
290
|
+
me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
|
291
|
+
attempt1 = local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
|
292
|
+
if attempt1.empty?
|
293
|
+
ob2 = state_region[1]
|
294
|
+
mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
|
295
|
+
local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
|
296
|
+
else
|
297
|
+
attempt1
|
298
|
+
end
|
299
|
+
else # :open
|
300
|
+
if (e2 - b2) > len_buffer
|
301
|
+
attempt1 = local_alignment(str1, b1, e1, str2, b2, b2 + len_buffer, denotations, cultivation_map)
|
302
|
+
if attempt1.empty?
|
303
|
+
local_alignment(str1, b1, e1, str2, e2 - len_buffer, e2, denotations, cultivation_map)
|
304
|
+
else
|
305
|
+
attempt1
|
306
|
+
end
|
307
|
+
else
|
308
|
+
local_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
|
309
|
+
end
|
298
310
|
end
|
299
311
|
end
|
300
312
|
end
|
301
313
|
|
302
|
-
|
303
|
-
sum
|
314
|
+
lblock = cblock
|
315
|
+
cblock.nil? ? sum : sum << cblock
|
304
316
|
end
|
305
317
|
|
306
|
-
# the last step
|
307
|
-
blocks2 += if last_block.nil?
|
308
|
-
local_alignment_blocks(str1, 0, str1.length, str2, 0, str2.length, denotations)
|
309
|
-
else
|
310
|
-
b1 = last_block[:source][:end]
|
311
|
-
if b1 < str1.length
|
312
|
-
e1 = str1.length
|
313
|
-
b2 = last_block[:target][:end]
|
314
|
-
|
315
|
-
_str1 = str1[b1 ... e1]
|
316
|
-
if _str1.strip.empty?
|
317
|
-
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
318
|
-
else
|
319
|
-
if b2 < str2.length
|
320
|
-
len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
321
|
-
e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
|
322
|
-
|
323
|
-
local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
|
324
|
-
else
|
325
|
-
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
326
|
-
end
|
327
|
-
end
|
328
|
-
else
|
329
|
-
[]
|
330
|
-
end
|
331
|
-
end
|
332
318
|
end
|
333
319
|
|
334
320
|
# Tries to align str1 as one whole block inside str2.
#
# An exact occurrence is looked for first; if there is none, the search is
# repeated on the downcased strings. Both searches go through
# cultivation_map.index, starting at position 0.
#
# @param str1 [String] the text to be located
# @param str2 [String] the text to search in
# @param cultivation_map [#index] object answering index(target, string, position)
# @return [Array<Hash>, nil] a single :block entry, or nil when str1 is not found
def whole_block_alignment(str1, str2, cultivation_map)
  offset = cultivation_map.index(str1, str2, 0)
  offset = cultivation_map.index(str1.downcase, str2.downcase, 0) if offset.nil?
  return nil if offset.nil?

  size = str1.length
  [{source:{begin:0, end:size}, target:{begin:offset, end:offset + size}, delta:offset, alignment: :block}]
end
|
359
329
|
|
360
|
-
|
361
|
-
|
330
|
+
# Aligns str1[b1...e1] against str2[b2...e2].
#
# The term-based alignment is tried first; only when it yields no block is
# the LCS-based alignment used.
#
# @param denotations [Array, nil] passed on to #term_based_alignment
# @param cultivation_map passed on to both alignment methods
# @return [Array<Hash>] the blocks produced by whichever alignment was used
def local_alignment(str1, b1, e1, str2, b2, e2, denotations = nil, cultivation_map)
  term_blocks = term_based_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
  return term_blocks unless term_blocks.empty?

  lcs_alignment(str1, b1, e1, str2, b2, e2, cultivation_map)
end
|
366
338
|
|
367
|
-
def
|
368
|
-
|
339
|
+
def term_based_alignment(str1, b1, e1, str2, b2, e2, denotations = nil, cultivation_map)
|
340
|
+
str2_block = str2[0 ... e2]
|
369
341
|
|
370
342
|
## term-based alignment
|
371
343
|
tblocks = if denotations
|
372
|
-
|
344
|
+
denotations_in_scope = denotations.select{|d| d[:span][:begin] >= b1 && d[:span][:end] <= e1}.
|
373
345
|
sort{|d1, d2| d1[:span][:begin] <=> d2[:span][:begin] || d2[:span][:end] <=> d1[:span][:end] }.
|
374
346
|
map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
|
375
347
|
|
376
|
-
|
377
|
-
_tblocks =
|
378
|
-
lex =
|
379
|
-
|
380
|
-
if
|
381
|
-
|
382
|
-
|
383
|
-
end
|
384
|
-
position = r + lex.length
|
385
|
-
{source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, similarity: 0.9, delta: r + b2 - term[:span][:begin]}
|
348
|
+
search_position = b2
|
349
|
+
_tblocks = denotations_in_scope.map do |denotation|
|
350
|
+
lex = denotation[:lex]
|
351
|
+
term_begin = cultivation_map.index(lex, str2_block, search_position)
|
352
|
+
break [] if term_begin.nil? # break the loop if a missing term is found
|
353
|
+
search_position = term_begin + lex.length
|
354
|
+
{source:denotation[:span], target:{begin:term_begin, end:term_begin + lex.length}, alignment: :term, similarity: 0.9, delta: term_begin - denotation[:span][:begin]}
|
386
355
|
end
|
387
356
|
|
388
|
-
# missing term found
|
389
|
-
_tblocks = [] if position.nil?
|
390
|
-
|
391
357
|
# redundant matching found
|
392
|
-
unless
|
393
|
-
|
394
|
-
|
395
|
-
look_forward =
|
358
|
+
unless _tblocks.empty?
|
359
|
+
search_position = _tblocks.last[:target][:end]
|
360
|
+
denotations_in_scope.each do |term|
|
361
|
+
look_forward = cultivation_map.index(term[:lex], str2_block, search_position)
|
396
362
|
unless look_forward.nil?
|
397
363
|
_tblocks = []
|
398
364
|
break
|
@@ -405,72 +371,37 @@ class TextAlignment::TextAlignment
|
|
405
371
|
[]
|
406
372
|
end
|
407
373
|
|
408
|
-
|
409
|
-
|
410
|
-
|
411
|
-
|
412
|
-
else
|
413
|
-
block1 = str1[b1 ... e1]
|
414
|
-
block2 = str2[b2 ... e2]
|
415
|
-
|
416
|
-
## character-based alignment
|
417
|
-
alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
|
418
|
-
if alignment.sdiff.nil?
|
419
|
-
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
420
|
-
else
|
421
|
-
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
|
422
|
-
end
|
423
|
-
end
|
424
|
-
else
|
425
|
-
block1 = str1[b1 ... e1]
|
426
|
-
block2 = str2[b2 ... e2]
|
374
|
+
ltblock = nil
|
375
|
+
tblocks2 = (tblocks + [nil]).inject([]) do |sum, ctblock|
|
376
|
+
tb1 = ltblock.nil? ? b1 : ltblock[:source][:end]
|
377
|
+
te1 = ctblock.nil? ? e1 : ctblock[:source][:begin]
|
427
378
|
|
428
|
-
|
429
|
-
|
430
|
-
|
431
|
-
|
432
|
-
else
|
433
|
-
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
|
434
|
-
end
|
379
|
+
if te1 > tb1
|
380
|
+
tb2 = ltblock.nil? ? b2 : ltblock[:target][:end]
|
381
|
+
te2 = ctblock.nil? ? e2 : ctblock[:target][:begin]
|
382
|
+
sum << {source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty}
|
435
383
|
end
|
436
|
-
else
|
437
|
-
last_tblock = nil
|
438
|
-
lblocks = tblocks.inject([]) do |sum, tblock|
|
439
|
-
tb1 = last_tblock ? last_tblock[:source][:end] : b1
|
440
|
-
te1 = tblock[:source][:begin]
|
441
384
|
|
442
|
-
|
443
|
-
|
444
|
-
|
445
|
-
tb2 = last_tblock ? last_tblock[:target][:end] : b2
|
446
|
-
te2 = tblock[:target][:begin]
|
447
|
-
|
448
|
-
if b2 == e2
|
449
|
-
[
|
450
|
-
{source:{begin:tb1, end:te1}, alignment: :empty},
|
451
|
-
tblock
|
452
|
-
]
|
453
|
-
else
|
454
|
-
[
|
455
|
-
{source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty},
|
456
|
-
tblock
|
457
|
-
]
|
458
|
-
end
|
459
|
-
end
|
385
|
+
ltblock = ctblock
|
386
|
+
ctblock.nil? ? sum : sum << ctblock
|
387
|
+
end
|
460
388
|
|
461
|
-
|
462
|
-
|
463
|
-
end
|
389
|
+
tblocks2
|
390
|
+
end
|
464
391
|
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
else
|
469
|
-
lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, alignment: :empty}
|
470
|
-
end
|
471
|
-
end
|
392
|
+
# Aligns str1[b1...e1] against str2[b2...e2] with TextAlignment::MixedAlignment.
#
# Both slices are downcased before being aligned. A single block is returned:
# - an :empty block when the source span is longer than 2000 characters
#   (no alignment is attempted), or when the similarity is below 0.5;
# - otherwise a block carrying the alignment object and its similarity.
#
# @param cultivation_map not used in the body
# @return [Array<Hash>] an array holding exactly one block
def lcs_alignment(str1, b1, e1, str2, b2, e2, cultivation_map)
  block = {source: {begin:b1, end:e1}, target: {begin:b2, end:e2}}
  return [block.merge(alignment: :empty)] if (e1 - b1) > 2000

  alignment = TextAlignment::MixedAlignment.new(str1[b1 ... e1].downcase, str2[b2 ... e2].downcase)
  return [block.merge(alignment: :empty)] if alignment.similarity < 0.5

  [block.merge(alignment: alignment, similarity: alignment.similarity)]
end
|
476
407
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.11.
|
4
|
+
version: 0.11.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-03-
|
11
|
+
date: 2021-03-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|