text_alignment 0.11.0 → 0.11.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/align_annotations +11 -5
- data/lib/text_alignment/anchor_finder.rb +9 -27
- data/lib/text_alignment/cultivation_map.rb +80 -5
- data/lib/text_alignment/mixed_alignment.rb +11 -5
- data/lib/text_alignment/text_alignment.rb +113 -182
- data/lib/text_alignment/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ef59c0cd578ed453a67edeb3b29059f4b76c4c541f777fa35a06e76b299e2564
|
4
|
+
data.tar.gz: 5594c0f6eb1d52cc331c210fdf936e7cb09e30277f21933027e55b0c8cfa0e24
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ef5bae492d7e7b840c00943ac24e571392c4c992a085c6a63c8cb6db126ed9137ba94f8cd7af42a91e17aa327cb9a5ce24b909944c157f21beb8a88c8ce7528b
|
7
|
+
data.tar.gz: b6d84dcdc35399f91a0f6a5a24d84b11169de9d861a249acca52f0fec86e6f1fbf1bb4dbb47a5d43654fa43735b181aa64034447fb71ca675090818dcf67133a
|
data/bin/align_annotations
CHANGED
@@ -77,7 +77,9 @@ def align_mannotations(source_annotations, reference_text, alignment, debug = fa
|
|
77
77
|
annotations[:relations].each do |r|
|
78
78
|
reid = 'R' + (idnum_relations += 1).to_s
|
79
79
|
ididx[r[:id]] = reid
|
80
|
-
|
80
|
+
sid = ididx[r[:subj]]
|
81
|
+
oid = ididx[r[:obj]]
|
82
|
+
target_annotations[:relations] << r.dup.merge({id:reid, subj:sid, obj:oid}) unless sid.nil? || oid.nil?
|
81
83
|
end
|
82
84
|
end
|
83
85
|
|
@@ -86,7 +88,8 @@ def align_mannotations(source_annotations, reference_text, alignment, debug = fa
|
|
86
88
|
annotations[:attributes].each do |a|
|
87
89
|
reid = 'A' + (idnum_attributes += 1).to_s
|
88
90
|
ididx[a[:id]] = reid
|
89
|
-
|
91
|
+
sid = ididx[a[:subj]]
|
92
|
+
target_annotations[:attributes] << a.dup.merge({id:reid, subj:sid}) unless sid.nil?
|
90
93
|
end
|
91
94
|
end
|
92
95
|
|
@@ -95,7 +98,8 @@ def align_mannotations(source_annotations, reference_text, alignment, debug = fa
|
|
95
98
|
annotations[:modifications].each do |m|
|
96
99
|
reid = 'M' + (idnum_modifications += 1).to_s
|
97
100
|
ididx[m[:id]] = reid
|
98
|
-
|
101
|
+
oid = ididx[m[:obj]]
|
102
|
+
target_annotations[:modifications] << m.dup.merge({id:reid, obj:oid}) unless oid.nil?
|
99
103
|
end
|
100
104
|
end
|
101
105
|
end
|
@@ -115,11 +119,13 @@ reference_text = read_text(ARGV[1])
|
|
115
119
|
alignment = TextAlignment::TextAlignment.new(reference_text, true)
|
116
120
|
|
117
121
|
target_annotations = if source_annotations.class == Array
|
122
|
+
# align_mannotations(source_annotations, reference_text, alignment, true)
|
118
123
|
align_mannotations(source_annotations, reference_text, alignment, false)
|
119
124
|
else
|
120
|
-
denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment)
|
121
|
-
|
125
|
+
# denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment)
|
126
|
+
denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment, false)
|
122
127
|
source_annotations.merge({text:reference_text, denotations:denotations})
|
123
128
|
end
|
124
129
|
|
130
|
+
# pp alignment.block_alignment
|
125
131
|
puts target_annotations.to_json
|
@@ -7,11 +7,8 @@ module TextAlignment; end unless defined? TextAlignment
|
|
7
7
|
class TextAlignment::AnchorFinder
|
8
8
|
|
9
9
|
def initialize(source_str, target_str, cultivation_map)
|
10
|
-
@s1
|
11
|
-
|
12
|
-
else
|
13
|
-
[source_str.downcase, target_str.downcase]
|
14
|
-
end
|
10
|
+
@s1 = source_str.downcase
|
11
|
+
@s2 = target_str.downcase
|
15
12
|
|
16
13
|
@cultivation_map = cultivation_map
|
17
14
|
|
@@ -19,19 +16,13 @@ class TextAlignment::AnchorFinder
|
|
19
16
|
@size_window = TextAlignment::SIZE_WINDOW
|
20
17
|
@sim_threshold = TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
21
18
|
@pos_s1_final_possible_begin = @s1.length - @size_ngram - 1
|
19
|
+
@pos_s2_final_possible_end = @s2.length
|
22
20
|
|
23
21
|
# positions of last match
|
24
22
|
@pos_s1_last_match = 0
|
25
23
|
@pos_s2_last_match = 0
|
26
24
|
end
|
27
25
|
|
28
|
-
def reverse?(source_str = nil, target_str = nil)
|
29
|
-
unless source_str.nil?
|
30
|
-
@reverse_p = target_str.length < source_str.length
|
31
|
-
end
|
32
|
-
@reverse_p
|
33
|
-
end
|
34
|
-
|
35
26
|
def get_next_anchor
|
36
27
|
# To find the beginning positions of an anchor ngram in s1 and s2, beginning from the last positions matched
|
37
28
|
beg_s2 = for beg_s1 in @pos_s1_last_match .. @pos_s1_final_possible_begin
|
@@ -49,26 +40,23 @@ class TextAlignment::AnchorFinder
|
|
49
40
|
# To extend the block to the left
|
50
41
|
b1 = beg_s1
|
51
42
|
b2 = beg_s2
|
52
|
-
|
43
|
+
left_boundary_b2 = [@pos_s2_last_match, (@cultivation_map.last_cultivated_position(b2) || 0)].max
|
44
|
+
while b1 > @pos_s1_last_match && b2 > left_boundary_b2 && @s1[b1 - 1] == @s2[b2 - 1]
|
53
45
|
b1 -= 1; b2 -= 1
|
54
46
|
end
|
55
|
-
b1 += 1; b2 += 1
|
56
47
|
|
57
48
|
# To extend the block to the right
|
58
49
|
e1 = beg_s1 + @size_ngram
|
59
50
|
e2 = beg_s2 + @size_ngram
|
60
|
-
|
51
|
+
right_boundary_b2 = @cultivation_map.next_cultivated_position(e2) || @pos_s2_final_possible_end
|
52
|
+
while @s1[e1] && e2 < right_boundary_b2 && @s1[e1] == @s2[e2]
|
61
53
|
e1 += 1; e2 += 1
|
62
54
|
end
|
63
55
|
|
64
56
|
@pos_s1_last_match = e1
|
65
57
|
@pos_s2_last_match = e2
|
66
58
|
|
67
|
-
|
68
|
-
{source:{begin:b2 , end:e2}, target:{begin:b1, end:e1}}
|
69
|
-
else
|
70
|
-
{source:{begin:b1 , end:e1}, target:{begin:b2, end:e2}}
|
71
|
-
end
|
59
|
+
{source:{begin:b1 , end:e1}, target:{begin:b2, end:e2}}
|
72
60
|
end
|
73
61
|
|
74
62
|
private
|
@@ -91,13 +79,7 @@ class TextAlignment::AnchorFinder
|
|
91
79
|
# return nil if the anchor is too much frequent
|
92
80
|
def find_beg_s2_candidates(anchor, search_position)
|
93
81
|
candidates = []
|
94
|
-
while _beg_s2 = @
|
95
|
-
search_again_position = @cultivation_map.search_again_position(_beg_s2)
|
96
|
-
unless search_again_position.nil?
|
97
|
-
search_position = search_again_position
|
98
|
-
next
|
99
|
-
end
|
100
|
-
|
82
|
+
while _beg_s2 = @cultivation_map.index(anchor, @s2, search_position)
|
101
83
|
candidates << _beg_s2
|
102
84
|
|
103
85
|
# for speed, skip anchor of high frequency
|
@@ -4,16 +4,91 @@ class TextAlignment::CultivationMap
|
|
4
4
|
attr_reader :map
|
5
5
|
|
6
6
|
def initialize
|
7
|
-
@map =
|
7
|
+
@map = []
|
8
8
|
end
|
9
9
|
|
10
10
|
def cultivate(regions)
|
11
|
-
|
12
|
-
|
11
|
+
@map += regions
|
12
|
+
@map.sort!{|a, b| a[0] <=> b[0]}
|
13
|
+
new_map = []
|
14
|
+
@map.each do |region|
|
15
|
+
if new_map.empty?
|
16
|
+
new_map << region
|
17
|
+
elsif new_map.last[1] > region[0]
|
18
|
+
raise "Overlapping regions: #{new_map.last} : #{region}"
|
19
|
+
elsif new_map.last[1] == region[0]
|
20
|
+
new_map.last[1] == region[1]
|
21
|
+
else
|
22
|
+
new_map << region
|
23
|
+
end
|
13
24
|
end
|
25
|
+
@map = new_map
|
14
26
|
end
|
15
27
|
|
16
|
-
def search_again_position(position)
|
17
|
-
|
28
|
+
def search_again_position(position, end_position = nil)
|
29
|
+
end_position ||= position
|
30
|
+
region = @map.bsearch{|r| end_position < r[1]}
|
31
|
+
if region.nil? || region[0] > position
|
32
|
+
nil
|
33
|
+
else
|
34
|
+
region[1]
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
def last_cultivated_position(position)
|
39
|
+
ridx = @map.rindex{|r| r[1] <= position}
|
40
|
+
ridx.nil? ? nil : @map[ridx][1]
|
41
|
+
end
|
42
|
+
|
43
|
+
def next_cultivated_position(position)
|
44
|
+
region = @map.bsearch{|r| position < r[0]}
|
45
|
+
region.nil? ? nil : region[0]
|
46
|
+
end
|
47
|
+
|
48
|
+
def in_regions(region)
|
49
|
+
@map.select{|r| (r[1] > region[0] && r[1] <= region[1]) || (r[0] < region[1] && r[0] >= region[0])}
|
50
|
+
end
|
51
|
+
|
52
|
+
def region_state(region)
|
53
|
+
closed_parts = in_regions(region)
|
54
|
+
if closed_parts.empty?
|
55
|
+
[:open, region]
|
56
|
+
else
|
57
|
+
if front_open?(region, closed_parts)
|
58
|
+
if rear_open?(region, closed_parts)
|
59
|
+
[:middle_closed, [closed_parts.first[1], closed_parts.last[0]]]
|
60
|
+
else
|
61
|
+
[:front_open, [region[0], closed_parts.first[0]]]
|
62
|
+
end
|
63
|
+
else
|
64
|
+
if rear_open?(region, closed_parts)
|
65
|
+
[:rear_open, [closed_parts.last[1], region[1]]]
|
66
|
+
else
|
67
|
+
[:closed, nil]
|
68
|
+
end
|
69
|
+
end
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
73
|
+
def index(target, string, position)
|
74
|
+
length = target.length
|
75
|
+
loop do
|
76
|
+
_begin = string.index(target, position)
|
77
|
+
break if _begin.nil?
|
78
|
+
position = search_again_position(_begin)
|
79
|
+
next unless position.nil?
|
80
|
+
break _begin if region_state([_begin, _begin + length])[0] == :open
|
81
|
+
position = _begin + 1
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
private
|
86
|
+
|
87
|
+
def front_open?(region, closed_parts)
|
88
|
+
closed_parts.first[0] > region[0]
|
89
|
+
end
|
90
|
+
|
91
|
+
def rear_open?(region, closed_parts)
|
92
|
+
closed_parts.last[1] < region[1]
|
18
93
|
end
|
19
94
|
end
|
@@ -144,11 +144,17 @@ class TextAlignment::MixedAlignment
|
|
144
144
|
def compute_similarity(s1, s2, sdiff)
|
145
145
|
return 0 if sdiff.nil?
|
146
146
|
|
147
|
-
#
|
148
|
-
|
149
|
-
|
147
|
+
# recoverbility
|
148
|
+
count_nws = sdiff.count{|d| d.old_element =~ /\S/}
|
149
|
+
count_nws_match = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/}
|
150
150
|
|
151
|
-
|
152
|
-
|
151
|
+
coverage = count_nws_match.to_f / count_nws
|
152
|
+
|
153
|
+
# fragmentation rate
|
154
|
+
count_ofrag = sdiff.count{|d| d.old_element =~ /\s/}
|
155
|
+
count_frag = sdiff.collect{|d| (d.action == '=') && (d.old_element =~/\s/) ? ' ' : d.action}.join.scan(/=+/).count
|
156
|
+
rate_frag = count_ofrag.to_f / count_frag
|
153
157
|
|
158
|
+
similarity = coverage * rate_frag
|
159
|
+
end
|
154
160
|
end
|
@@ -15,8 +15,9 @@ class TextAlignment::TextAlignment
|
|
15
15
|
def initialize(reference_text, to_prevent_overlap = false)
|
16
16
|
raise ArgumentError, "nil text" if reference_text.nil?
|
17
17
|
|
18
|
-
@
|
18
|
+
@original_reference_text = reference_text
|
19
19
|
@rtext_mapping = TextAlignment::CharMapping.new(reference_text)
|
20
|
+
@mapped_reference_text = @rtext_mapping.mapped_text
|
20
21
|
@to_prevent_overlap = to_prevent_overlap
|
21
22
|
|
22
23
|
@original_text = nil
|
@@ -34,21 +35,19 @@ class TextAlignment::TextAlignment
|
|
34
35
|
@text_mapping = TextAlignment::CharMapping.new(text)
|
35
36
|
end
|
36
37
|
|
37
|
-
|
38
|
+
@mapped_text = @text_mapping.mapped_text
|
38
39
|
denotations_mapped = @text_mapping.enmap_denotations(denotations)
|
39
40
|
|
40
|
-
rtext_mapped = @rtext_mapping.mapped_text
|
41
|
-
|
42
41
|
## To generate the block_alignment of the input text against the reference text
|
43
|
-
|
44
42
|
# Initialization
|
45
|
-
@block_alignment = {text: @original_text, reference_text: @
|
43
|
+
# @block_alignment = {text: @original_text, reference_text: @original_reference_text, denotations: denotations}
|
44
|
+
@block_alignment = {text: @mapped_text, reference_text: @mapped_reference_text, denotations: denotations}
|
46
45
|
|
47
46
|
# Generation
|
48
|
-
@block_alignment[:blocks] = if r = whole_block_alignment(
|
47
|
+
@block_alignment[:blocks] = if r = whole_block_alignment(@mapped_text, @mapped_reference_text, @cultivation_map)
|
49
48
|
r
|
50
49
|
else
|
51
|
-
find_block_alignment(
|
50
|
+
find_block_alignment(@mapped_text, @mapped_reference_text, denotations_mapped, @cultivation_map)
|
52
51
|
end
|
53
52
|
end
|
54
53
|
|
@@ -134,7 +133,7 @@ class TextAlignment::TextAlignment
|
|
134
133
|
source = {begin:d.begin, end:d.end}
|
135
134
|
d.begin = transform_begin_position(d.begin);
|
136
135
|
d.end = transform_end_position(d.end);
|
137
|
-
raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @
|
136
|
+
raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_reference_text.length
|
138
137
|
rescue
|
139
138
|
@lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
|
140
139
|
d.begin = nil
|
@@ -150,7 +149,7 @@ class TextAlignment::TextAlignment
|
|
150
149
|
|
151
150
|
r = hdenotations.collect do |d|
|
152
151
|
t = transform_a_span(d[:span])
|
153
|
-
raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @
|
152
|
+
raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_reference_text.length
|
154
153
|
new_d = d.dup.merge({span:t})
|
155
154
|
rescue
|
156
155
|
@lost_annotations << {source: d[:span], target:t}
|
@@ -161,8 +160,8 @@ class TextAlignment::TextAlignment
|
|
161
160
|
end
|
162
161
|
|
163
162
|
def alignment_show
|
164
|
-
stext = @
|
165
|
-
ttext = @
|
163
|
+
stext = @mapped_text
|
164
|
+
ttext = @mapped_reference_text
|
166
165
|
|
167
166
|
show = ''
|
168
167
|
@block_alignment[:blocks].each do |a|
|
@@ -218,7 +217,7 @@ class TextAlignment::TextAlignment
|
|
218
217
|
|
219
218
|
"***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
|
220
219
|
"[#{astr1}]\n" +
|
221
|
-
"[#{astr2}]\n\n"
|
220
|
+
"[#{astr2.gsub("\n", " ")}]\n\n"
|
222
221
|
end
|
223
222
|
end
|
224
223
|
show
|
@@ -257,142 +256,109 @@ class TextAlignment::TextAlignment
|
|
257
256
|
# puts "-=-=-=-=-"
|
258
257
|
# puts
|
259
258
|
|
260
|
-
##
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
259
|
+
## To fill the gaps
|
260
|
+
## lblock: last block, cblock: current block
|
261
|
+
lblock = nil
|
262
|
+
blocks2 = (blocks + [nil]).inject([]) do |sum, cblock|
|
263
|
+
b1 = lblock.nil? ? 0 : lblock[:source][:end]
|
264
|
+
e1 = cblock.nil? ? str1.length : cblock[:source][:begin]
|
265
265
|
|
266
|
-
|
267
|
-
[
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
if
|
273
|
-
[
|
274
|
-
{source:{begin:b1, end:e1}, alignment: :empty},
|
275
|
-
block
|
276
|
-
]
|
266
|
+
if b1 < e1
|
267
|
+
b2 = lblock.nil? ? 0 : lblock[:target][:end]
|
268
|
+
e2 = cblock.nil? ? str2.length : cblock[:target][:begin]
|
269
|
+
_str1 = str1[b1 ... e1]
|
270
|
+
_str2 = str2[b2 ... e2]
|
271
|
+
|
272
|
+
sum += if _str1.strip.empty? || _str2.strip.empty?
|
273
|
+
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
277
274
|
else
|
278
275
|
len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
[
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
276
|
+
region_state, state_region = cultivation_map.region_state([b2, e2])
|
277
|
+
case region_state
|
278
|
+
when :closed
|
279
|
+
[]
|
280
|
+
when :front_open
|
281
|
+
oe2 = state_region[1]
|
282
|
+
me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
|
283
|
+
local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
|
284
|
+
when :rear_open
|
285
|
+
ob2 = state_region[0]
|
286
|
+
mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
|
287
|
+
local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
|
288
|
+
when :middle_closed
|
289
|
+
oe2 = state_region[0]
|
290
|
+
me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
|
291
|
+
attempt1 = local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
|
292
|
+
if attempt1.empty?
|
293
|
+
ob2 = state_region[1]
|
294
|
+
mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
|
295
|
+
local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
|
296
|
+
else
|
297
|
+
attempt1
|
298
|
+
end
|
299
|
+
else # :open
|
300
|
+
if (e2 - b2) > len_buffer
|
301
|
+
attempt1 = local_alignment(str1, b1, e1, str2, b2, b2 + len_buffer, denotations, cultivation_map)
|
302
|
+
if attempt1.empty?
|
303
|
+
local_alignment(str1, b1, e1, str2, e2 - len_buffer, e2, denotations, cultivation_map)
|
304
|
+
else
|
305
|
+
attempt1
|
306
|
+
end
|
307
|
+
else
|
308
|
+
local_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
|
309
|
+
end
|
298
310
|
end
|
299
311
|
end
|
300
312
|
end
|
301
313
|
|
302
|
-
|
303
|
-
sum
|
314
|
+
lblock = cblock
|
315
|
+
cblock.nil? ? sum : sum << cblock
|
304
316
|
end
|
305
317
|
|
306
|
-
# the last step
|
307
|
-
blocks2 += if last_block.nil?
|
308
|
-
local_alignment_blocks(str1, 0, str1.length, str2, 0, str2.length, denotations)
|
309
|
-
else
|
310
|
-
b1 = last_block[:source][:end]
|
311
|
-
if b1 < str1.length
|
312
|
-
e1 = str1.length
|
313
|
-
b2 = last_block[:target][:end]
|
314
|
-
|
315
|
-
_str1 = str1[b1 ... e1]
|
316
|
-
if _str1.strip.empty?
|
317
|
-
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
318
|
-
else
|
319
|
-
if b2 < str2.length
|
320
|
-
len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
321
|
-
e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
|
322
|
-
|
323
|
-
local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
|
324
|
-
else
|
325
|
-
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
326
|
-
end
|
327
|
-
end
|
328
|
-
else
|
329
|
-
[]
|
330
|
-
end
|
331
|
-
end
|
332
318
|
end
|
333
319
|
|
334
320
|
def whole_block_alignment(str1, str2, cultivation_map)
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
block_begin = begin
|
339
|
-
_block_begin = str2.index(str1, search_position)
|
340
|
-
break if _block_begin.nil?
|
341
|
-
search_position = cultivation_map.search_again_position(_block_begin)
|
342
|
-
_block_begin
|
343
|
-
end until search_position.nil?
|
344
|
-
|
345
|
-
unless block_begin.nil?
|
346
|
-
return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
347
|
-
end
|
321
|
+
block_begin = cultivation_map.index(str1, str2, 0)
|
322
|
+
return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}] unless block_begin.nil?
|
348
323
|
|
349
|
-
|
324
|
+
block_begin = cultivation_map.index(str1.downcase, str2.downcase, 0)
|
325
|
+
return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}] unless block_begin.nil?
|
350
326
|
|
351
|
-
|
352
|
-
|
353
|
-
block_begin = begin
|
354
|
-
_block_begin = dstr2.index(dstr1, search_position)
|
355
|
-
break if _block_begin.nil?
|
356
|
-
search_position = cultivation_map.search_again_position(_block_begin)
|
357
|
-
_block_begin
|
358
|
-
end until search_position.nil?
|
327
|
+
nil
|
328
|
+
end
|
359
329
|
|
360
|
-
|
361
|
-
|
330
|
+
def local_alignment(str1, b1, e1, str2, b2, e2, denotations = nil, cultivation_map)
|
331
|
+
tblocks = term_based_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
|
332
|
+
if tblocks.empty?
|
333
|
+
lcs_alignment(str1, b1, e1, str2, b2, e2, cultivation_map)
|
334
|
+
else
|
335
|
+
tblocks
|
362
336
|
end
|
363
|
-
|
364
|
-
nil
|
365
337
|
end
|
366
338
|
|
367
|
-
def
|
368
|
-
|
339
|
+
def term_based_alignment(str1, b1, e1, str2, b2, e2, denotations = nil, cultivation_map)
|
340
|
+
str2_block = str2[0 ... e2]
|
369
341
|
|
370
342
|
## term-based alignment
|
371
343
|
tblocks = if denotations
|
372
|
-
|
344
|
+
denotations_in_scope = denotations.select{|d| d[:span][:begin] >= b1 && d[:span][:end] <= e1}.
|
373
345
|
sort{|d1, d2| d1[:span][:begin] <=> d2[:span][:begin] || d2[:span][:end] <=> d1[:span][:end] }.
|
374
346
|
map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
|
375
347
|
|
376
|
-
|
377
|
-
_tblocks =
|
378
|
-
lex =
|
379
|
-
|
380
|
-
if
|
381
|
-
|
382
|
-
|
383
|
-
end
|
384
|
-
position = r + lex.length
|
385
|
-
{source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, similarity: 0.9, delta: r + b2 - term[:span][:begin]}
|
348
|
+
search_position = b2
|
349
|
+
_tblocks = denotations_in_scope.map do |denotation|
|
350
|
+
lex = denotation[:lex]
|
351
|
+
term_begin = cultivation_map.index(lex, str2_block, search_position)
|
352
|
+
break [] if term_begin.nil? # break the loop if a missing term is found
|
353
|
+
search_position = term_begin + lex.length
|
354
|
+
{source:denotation[:span], target:{begin:term_begin, end:term_begin + lex.length}, alignment: :term, similarity: 0.9, delta: term_begin - denotation[:span][:begin]}
|
386
355
|
end
|
387
356
|
|
388
|
-
# missing term found
|
389
|
-
_tblocks = [] if position.nil?
|
390
|
-
|
391
357
|
# redundant matching found
|
392
|
-
unless
|
393
|
-
|
394
|
-
|
395
|
-
look_forward =
|
358
|
+
unless _tblocks.empty?
|
359
|
+
search_position = _tblocks.last[:target][:end]
|
360
|
+
denotations_in_scope.each do |term|
|
361
|
+
look_forward = cultivation_map.index(term[:lex], str2_block, search_position)
|
396
362
|
unless look_forward.nil?
|
397
363
|
_tblocks = []
|
398
364
|
break
|
@@ -405,72 +371,37 @@ class TextAlignment::TextAlignment
|
|
405
371
|
[]
|
406
372
|
end
|
407
373
|
|
408
|
-
|
409
|
-
|
410
|
-
|
411
|
-
|
412
|
-
else
|
413
|
-
block1 = str1[b1 ... e1]
|
414
|
-
block2 = str2[b2 ... e2]
|
415
|
-
|
416
|
-
## character-based alignment
|
417
|
-
alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
|
418
|
-
if alignment.sdiff.nil?
|
419
|
-
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
420
|
-
else
|
421
|
-
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
|
422
|
-
end
|
423
|
-
end
|
424
|
-
else
|
425
|
-
block1 = str1[b1 ... e1]
|
426
|
-
block2 = str2[b2 ... e2]
|
374
|
+
ltblock = nil
|
375
|
+
tblocks2 = (tblocks + [nil]).inject([]) do |sum, ctblock|
|
376
|
+
tb1 = ltblock.nil? ? b1 : ltblock[:source][:end]
|
377
|
+
te1 = ctblock.nil? ? e1 : ctblock[:source][:begin]
|
427
378
|
|
428
|
-
|
429
|
-
|
430
|
-
|
431
|
-
|
432
|
-
else
|
433
|
-
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
|
434
|
-
end
|
379
|
+
if te1 > tb1
|
380
|
+
tb2 = ltblock.nil? ? b2 : ltblock[:target][:end]
|
381
|
+
te2 = ctblock.nil? ? e2 : ctblock[:target][:begin]
|
382
|
+
sum << {source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty}
|
435
383
|
end
|
436
|
-
else
|
437
|
-
last_tblock = nil
|
438
|
-
lblocks = tblocks.inject([]) do |sum, tblock|
|
439
|
-
tb1 = last_tblock ? last_tblock[:source][:end] : b1
|
440
|
-
te1 = tblock[:source][:begin]
|
441
384
|
|
442
|
-
|
443
|
-
|
444
|
-
|
445
|
-
tb2 = last_tblock ? last_tblock[:target][:end] : b2
|
446
|
-
te2 = tblock[:target][:begin]
|
447
|
-
|
448
|
-
if b2 == e2
|
449
|
-
[
|
450
|
-
{source:{begin:tb1, end:te1}, alignment: :empty},
|
451
|
-
tblock
|
452
|
-
]
|
453
|
-
else
|
454
|
-
[
|
455
|
-
{source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty},
|
456
|
-
tblock
|
457
|
-
]
|
458
|
-
end
|
459
|
-
end
|
385
|
+
ltblock = ctblock
|
386
|
+
ctblock.nil? ? sum : sum << ctblock
|
387
|
+
end
|
460
388
|
|
461
|
-
|
462
|
-
|
463
|
-
end
|
389
|
+
tblocks2
|
390
|
+
end
|
464
391
|
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
else
|
469
|
-
lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, alignment: :empty}
|
470
|
-
end
|
471
|
-
end
|
392
|
+
def lcs_alignment(str1, b1, e1, str2, b2, e2, cultivation_map)
|
393
|
+
source = {begin:b1, end:e1}
|
394
|
+
target = {begin:b2, end:e2}
|
472
395
|
|
473
|
-
|
396
|
+
if (e1 - b1) > 2000
|
397
|
+
[{source:source, target:target, alignment: :empty}]
|
398
|
+
else
|
399
|
+
alignment = TextAlignment::MixedAlignment.new(str1[b1 ... e1].downcase, str2[b2 ... e2].downcase)
|
400
|
+
if alignment.similarity < 0.5
|
401
|
+
[{source:source, target:target, alignment: :empty}]
|
402
|
+
else
|
403
|
+
[{source:source, target:target, alignment: alignment, similarity: alignment.similarity}]
|
404
|
+
end
|
474
405
|
end
|
475
406
|
end
|
476
407
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.11.
|
4
|
+
version: 0.11.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-03-
|
11
|
+
date: 2021-03-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|