text_alignment 0.11.0 → 0.11.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/align_annotations +39 -10
- data/lib/text_alignment/anchor_finder.rb +16 -30
- data/lib/text_alignment/char_mapping.rb +6 -5
- data/lib/text_alignment/cultivation_map.rb +80 -5
- data/lib/text_alignment/mixed_alignment.rb +22 -5
- data/lib/text_alignment/text_alignment.rb +163 -207
- data/lib/text_alignment/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5ed6071b0293e9b7fa86acf4a737c80c9d784f453d3fb77992e3d9f1acc02bbe
|
4
|
+
data.tar.gz: 7a57b7afbe21061d9aac2f96cd9f7a3c83ff2f01b6c3973905f7681a40463287
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 141158c2f6b80975bacf68babc713b06d809f02b96cfd7accdecb00c6b4c362b9d130e29820074be7b89ac53c7cc93f850ec3d7ea9796eaad5021d441528e82b
|
7
|
+
data.tar.gz: 8bc4cc9ca9becc1c5638b6501268d33dd7700e8369d7c20c1ec6fd9ef11e9fd73e6a1354cb37376af250c8462e945cccf08417ba6d258f08660c53eb418e4658
|
data/bin/align_annotations
CHANGED
@@ -2,6 +2,7 @@
|
|
2
2
|
require 'text_alignment'
|
3
3
|
require 'json'
|
4
4
|
require 'pp'
|
5
|
+
require 'optparse'
|
5
6
|
|
6
7
|
def read_annotations(filename)
|
7
8
|
case File.extname(filename)
|
@@ -37,7 +38,7 @@ def align_denotations(denotations, source_text, alignment, debug = false)
|
|
37
38
|
end
|
38
39
|
|
39
40
|
lost_annotations = alignment.lost_annotations
|
40
|
-
unless lost_annotations.empty?
|
41
|
+
unless lost_annotations.nil? || lost_annotations.empty?
|
41
42
|
warn "\n[lost annotations] #{lost_annotations.length}"
|
42
43
|
lost_annotations.each do |a|
|
43
44
|
warn "#{a}"
|
@@ -77,7 +78,9 @@ def align_mannotations(source_annotations, reference_text, alignment, debug = fa
|
|
77
78
|
annotations[:relations].each do |r|
|
78
79
|
reid = 'R' + (idnum_relations += 1).to_s
|
79
80
|
ididx[r[:id]] = reid
|
80
|
-
|
81
|
+
sid = ididx[r[:subj]]
|
82
|
+
oid = ididx[r[:obj]]
|
83
|
+
target_annotations[:relations] << r.dup.merge({id:reid, subj:sid, obj:oid}) unless sid.nil? || oid.nil?
|
81
84
|
end
|
82
85
|
end
|
83
86
|
|
@@ -86,7 +89,8 @@ def align_mannotations(source_annotations, reference_text, alignment, debug = fa
|
|
86
89
|
annotations[:attributes].each do |a|
|
87
90
|
reid = 'A' + (idnum_attributes += 1).to_s
|
88
91
|
ididx[a[:id]] = reid
|
89
|
-
|
92
|
+
sid = ididx[a[:subj]]
|
93
|
+
target_annotations[:attributes] << a.dup.merge({id:reid, subj:sid}) unless sid.nil?
|
90
94
|
end
|
91
95
|
end
|
92
96
|
|
@@ -95,7 +99,8 @@ def align_mannotations(source_annotations, reference_text, alignment, debug = fa
|
|
95
99
|
annotations[:modifications].each do |m|
|
96
100
|
reid = 'M' + (idnum_modifications += 1).to_s
|
97
101
|
ididx[m[:id]] = reid
|
98
|
-
|
102
|
+
oid = ididx[m[:obj]]
|
103
|
+
target_annotations[:modifications] << m.dup.merge({id:reid, obj:oid}) unless oid.nil?
|
99
104
|
end
|
100
105
|
end
|
101
106
|
end
|
@@ -104,21 +109,45 @@ def align_mannotations(source_annotations, reference_text, alignment, debug = fa
|
|
104
109
|
end
|
105
110
|
|
106
111
|
|
112
|
+
## Options
|
113
|
+
overlap_p = false
|
114
|
+
debug_p = false
|
115
|
+
|
116
|
+
## command line option processing
|
117
|
+
require 'optparse'
|
118
|
+
optparse = OptionParser.new do |opts|
|
119
|
+
opts.banner = "Usage: align_annotations [options] target_annotations(.json|.txt) reference_text(.json|.txt)"
|
120
|
+
|
121
|
+
opts.on('-o', '--overlap', 'tells it to assume there may be overlapping texts.') do
|
122
|
+
overlap_p = true
|
123
|
+
end
|
124
|
+
|
125
|
+
opts.on('-d', '--debug', 'tells it to show debugging information.') do
|
126
|
+
debug_p = true
|
127
|
+
end
|
128
|
+
|
129
|
+
opts.on('-h', '--help', 'displays this screen.') do
|
130
|
+
puts opts
|
131
|
+
exit
|
132
|
+
end
|
133
|
+
end
|
134
|
+
|
135
|
+
optparse.parse!
|
136
|
+
|
107
137
|
unless ARGV.length == 2
|
108
|
-
|
109
|
-
exit
|
138
|
+
puts optparse.help
|
139
|
+
exit 1
|
110
140
|
end
|
111
141
|
|
112
142
|
source_annotations = read_annotations(ARGV[0])
|
113
143
|
reference_text = read_text(ARGV[1])
|
114
144
|
|
115
|
-
alignment = TextAlignment::TextAlignment.new(reference_text,
|
145
|
+
alignment = TextAlignment::TextAlignment.new(reference_text, !overlap_p)
|
116
146
|
|
117
147
|
target_annotations = if source_annotations.class == Array
|
118
|
-
align_mannotations(source_annotations, reference_text, alignment,
|
148
|
+
align_mannotations(source_annotations, reference_text, alignment, debug_p)
|
119
149
|
else
|
120
|
-
denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment)
|
121
|
-
# denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], reference_text, true)
|
150
|
+
denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment, debug_p)
|
122
151
|
source_annotations.merge({text:reference_text, denotations:denotations})
|
123
152
|
end
|
124
153
|
|
@@ -7,11 +7,8 @@ module TextAlignment; end unless defined? TextAlignment
|
|
7
7
|
class TextAlignment::AnchorFinder
|
8
8
|
|
9
9
|
def initialize(source_str, target_str, cultivation_map)
|
10
|
-
@s1
|
11
|
-
|
12
|
-
else
|
13
|
-
[source_str.downcase, target_str.downcase]
|
14
|
-
end
|
10
|
+
@s1 = source_str.downcase
|
11
|
+
@s2 = target_str.downcase
|
15
12
|
|
16
13
|
@cultivation_map = cultivation_map
|
17
14
|
|
@@ -19,19 +16,13 @@ class TextAlignment::AnchorFinder
|
|
19
16
|
@size_window = TextAlignment::SIZE_WINDOW
|
20
17
|
@sim_threshold = TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
21
18
|
@pos_s1_final_possible_begin = @s1.length - @size_ngram - 1
|
19
|
+
@pos_s2_final_possible_end = @s2.length
|
22
20
|
|
23
21
|
# positions of last match
|
24
22
|
@pos_s1_last_match = 0
|
25
23
|
@pos_s2_last_match = 0
|
26
24
|
end
|
27
25
|
|
28
|
-
def reverse?(source_str = nil, target_str = nil)
|
29
|
-
unless source_str.nil?
|
30
|
-
@reverse_p = target_str.length < source_str.length
|
31
|
-
end
|
32
|
-
@reverse_p
|
33
|
-
end
|
34
|
-
|
35
26
|
def get_next_anchor
|
36
27
|
# To find the beginning positions of an anchor ngram in s1 and s2, beginning from the last positions matched
|
37
28
|
beg_s2 = for beg_s1 in @pos_s1_last_match .. @pos_s1_final_possible_begin
|
@@ -49,26 +40,23 @@ class TextAlignment::AnchorFinder
|
|
49
40
|
# To extend the block to the left
|
50
41
|
b1 = beg_s1
|
51
42
|
b2 = beg_s2
|
52
|
-
|
43
|
+
left_boundary_b2 = [@pos_s2_last_match, (@cultivation_map.last_cultivated_position(b2) || 0)].max
|
44
|
+
while b1 > @pos_s1_last_match && b2 > left_boundary_b2 && @s1[b1 - 1] == @s2[b2 - 1]
|
53
45
|
b1 -= 1; b2 -= 1
|
54
46
|
end
|
55
|
-
b1 += 1; b2 += 1
|
56
47
|
|
57
48
|
# To extend the block to the right
|
58
49
|
e1 = beg_s1 + @size_ngram
|
59
50
|
e2 = beg_s2 + @size_ngram
|
60
|
-
|
51
|
+
right_boundary_b2 = @cultivation_map.next_cultivated_position(e2) || @pos_s2_final_possible_end
|
52
|
+
while @s1[e1] && e2 < right_boundary_b2 && @s1[e1] == @s2[e2]
|
61
53
|
e1 += 1; e2 += 1
|
62
54
|
end
|
63
55
|
|
64
56
|
@pos_s1_last_match = e1
|
65
57
|
@pos_s2_last_match = e2
|
66
58
|
|
67
|
-
|
68
|
-
{source:{begin:b2 , end:e2}, target:{begin:b1, end:e1}}
|
69
|
-
else
|
70
|
-
{source:{begin:b1 , end:e1}, target:{begin:b2, end:e2}}
|
71
|
-
end
|
59
|
+
{source:{begin:b1 , end:e1}, target:{begin:b2, end:e2}}
|
72
60
|
end
|
73
61
|
|
74
62
|
private
|
@@ -91,13 +79,7 @@ class TextAlignment::AnchorFinder
|
|
91
79
|
# return nil if the anchor is too frequent
|
92
80
|
def find_beg_s2_candidates(anchor, search_position)
|
93
81
|
candidates = []
|
94
|
-
while _beg_s2 = @
|
95
|
-
search_again_position = @cultivation_map.search_again_position(_beg_s2)
|
96
|
-
unless search_again_position.nil?
|
97
|
-
search_position = search_again_position
|
98
|
-
next
|
99
|
-
end
|
100
|
-
|
82
|
+
while _beg_s2 = @cultivation_map.index(anchor, @s2, search_position)
|
101
83
|
candidates << _beg_s2
|
102
84
|
|
103
85
|
# for speed, skip anchor of high frequency
|
@@ -126,14 +108,14 @@ class TextAlignment::AnchorFinder
|
|
126
108
|
next
|
127
109
|
end
|
128
110
|
|
129
|
-
left_window_s1, left_window_s2 = get_left_windows(beg_s1, beg_s2)
|
111
|
+
left_window_s1, left_window_s2 = get_left_windows(beg_s1, beg_s2, size_window)
|
130
112
|
if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
|
131
113
|
break unless valid_beg_s2.nil?
|
132
114
|
valid_beg_s2 = beg_s2
|
133
115
|
next
|
134
116
|
end
|
135
117
|
|
136
|
-
right_window_s1, right_window_s2 = get_right_windows(beg_s1, beg_s2)
|
118
|
+
right_window_s1, right_window_s2 = get_right_windows(beg_s1, beg_s2, size_window)
|
137
119
|
if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
|
138
120
|
break unless valid_beg_s2.nil?
|
139
121
|
valid_beg_s2 = beg_s2
|
@@ -143,7 +125,11 @@ class TextAlignment::AnchorFinder
|
|
143
125
|
|
144
126
|
# r == nil means that the inner loop was broken (multiple candidates had passed the tests)
|
145
127
|
# r != nil means that the inner loop was completed (with or w/o a valid beg_s2 found)
|
146
|
-
|
128
|
+
if r.nil?
|
129
|
+
valid_beg_s2 = nil
|
130
|
+
else
|
131
|
+
break
|
132
|
+
end
|
147
133
|
end
|
148
134
|
|
149
135
|
valid_beg_s2
|
@@ -61,6 +61,7 @@ TextAlignment::CHAR_MAPPING = [
|
|
61
61
|
["•", "*"], #U+2022 (bullet)
|
62
62
|
[" ", " "], #U+2009 (thin space)
|
63
63
|
[" ", " "], #U+200A (hair space)
|
64
|
+
[" ", " "], #U+202F (narrow no-break space)
|
64
65
|
[" ", " "], #U+00A0 (Non-Breaking space)
|
65
66
|
[" ", " "], #U+3000 (ideographic space)
|
66
67
|
["‐", "-"], #U+2010 (Hyphen)
|
@@ -94,10 +95,10 @@ class TextAlignment::CharMapping
|
|
94
95
|
@index_demap[position]
|
95
96
|
end
|
96
97
|
|
97
|
-
def enmap_denotations(
|
98
|
-
return nil if
|
98
|
+
# Produces copies of the given denotations with both ends of each span
# passed through enmap_position.
#
# denotations - an Array of Hashes, each with a :span of {begin:, end:}, or nil.
#
# Returns a new Array of new Hashes (the inputs are duplicated, not modified),
# or nil when denotations is nil.
def enmap_denotations(denotations)
  if denotations.nil?
    nil
  else
    denotations.map do |denotation|
      span = denotation[:span]
      mapped_span = {begin: enmap_position(span[:begin]), end: enmap_position(span[:end])}
      denotation.dup.merge(span: mapped_span)
    end
  end
end
|
@@ -107,7 +108,7 @@ class TextAlignment::CharMapping
|
|
107
108
|
def enmap_text(_text, char_mapping)
|
108
109
|
text = _text.dup
|
109
110
|
|
110
|
-
# To execute the single letter mapping
|
111
|
+
# To execute the single letter mapping replacement
|
111
112
|
char_mapping.each do |one, long|
|
112
113
|
text.gsub!(one, long) if long.length == 1
|
113
114
|
end
|
@@ -175,7 +176,7 @@ if __FILE__ == $0
|
|
175
176
|
exit
|
176
177
|
end
|
177
178
|
annotations = JSON.parse File.read(ARGV[0]).strip, symbolize_names: true
|
178
|
-
denotations = annotations[:denotations]
|
179
|
+
denotations = annotations[:denotations] || []
|
179
180
|
if denotations.nil? && annotations[:tracks]
|
180
181
|
denotations = annotations[:tracks].first[:denotations]
|
181
182
|
end
|
@@ -4,16 +4,91 @@ class TextAlignment::CultivationMap
|
|
4
4
|
attr_reader :map
|
5
5
|
|
6
6
|
# Starts with an empty map; cultivate appends [begin, end] regions to it.
def initialize
  @map = []
end
|
9
9
|
|
10
10
|
# Adds regions to the map and normalizes it.
#
# regions - an Array of [begin, end] pairs.
#
# The existing map and the new regions are combined and ordered by begin
# position. A region that begins exactly where the previous one ends is merged
# into it; a region that begins before the previous one ends is an error.
#
# The stored regions are copies, so the caller's arrays are never modified,
# and @map is only replaced once the whole input has been accepted.
#
# Returns the new map.
# Raises RuntimeError ("Overlapping regions: ...") when two regions overlap.
def cultivate(regions)
  ordered = (@map + regions).sort_by { |region| region[0] }

  @map = ordered.each_with_object([]) do |region, merged|
    previous = merged.last

    if previous.nil? || previous[1] < region[0]
      merged << region.dup
    elsif previous[1] == region[0]
      # Adjacent regions: extend the previous one instead of dropping this one.
      previous[1] = region[1]
    else
      raise "Overlapping regions: #{previous} : #{region}"
    end
  end
end
|
15
27
|
|
16
|
-
def search_again_position(position)
|
17
|
-
|
28
|
+
# Looks up the first map region whose end is greater than end_position
# (end_position falls back to position when not given). If that region starts
# at or before position, its end is returned; otherwise nil.
#
# Relies on Array#bsearch, so @map is expected to be ordered.
def search_again_position(position, end_position = nil)
  upper = end_position || position
  candidate = @map.bsearch { |r| r[1] > upper }
  return nil if candidate.nil?

  candidate[0] <= position ? candidate[1] : nil
end
|
37
|
+
|
38
|
+
# Returns the end of the last region in the map whose end is at or before
# position, or nil when there is no such region.
def last_cultivated_position(position)
  preceding = @map.reverse_each.find { |r| r[1] <= position }
  preceding.nil? ? nil : preceding[1]
end
|
42
|
+
|
43
|
+
# Returns the begin of the first region (by binary search over @map) that
# starts at or after position, or nil when there is none.
def next_cultivated_position(position)
  following = @map.bsearch { |r| r[0] >= position }
  return nil if following.nil?

  following[0]
end
|
47
|
+
|
48
|
+
# Selects the map regions that have an endpoint inside the given region:
# either their end lies in (region[0], region[1]] or their begin lies in
# [region[0], region[1]).
#
# region - a [begin, end] pair.
#
# Returns an Array of the matching map entries, in map order.
def in_regions(region)
  lower = region[0]
  upper = region[1]

  @map.select do |r|
    ends_inside = r[1] > lower && r[1] <= upper
    begins_inside = r[0] >= lower && r[0] < upper
    ends_inside || begins_inside
  end
end
|
51
|
+
|
52
|
+
# Classifies a region against the map.
#
# region - a [begin, end] pair.
#
# Returns a two-element Array [state, span]:
#   [:open, region]                      - in_regions found nothing
#   [:middle_closed, [first_begin, last_end]] - front and rear are both open
#   [:front_open, [region begin, first_begin]] - only the front is open
#   [:rear_open, [last_end, region end]]  - only the rear is open
#   [:closed, nil]                       - neither side is open
# where first_begin / last_end come from the first / last entry returned by
# in_regions.
def region_state(region)
  closed_parts = in_regions(region)
  return [:open, region] if closed_parts.empty?

  front = front_open?(region, closed_parts)
  rear = rear_open?(region, closed_parts)
  closed_begin = closed_parts.first[0]
  closed_end = closed_parts.last[1]

  if front && rear
    [:middle_closed, [closed_begin, closed_end]]
  elsif front
    [:front_open, [region[0], closed_begin]]
  elsif rear
    [:rear_open, [closed_end, region[1]]]
  else
    [:closed, nil]
  end
end
|
72
|
+
|
73
|
+
# Finds the first occurrence of target in string, at or after position, that
# the map does not block.
#
# For each occurrence found with String#index:
#   - if search_again_position returns a position for it, the search resumes
#     from there;
#   - otherwise, if region_state reports the occurrence's span as :open, its
#     begin position is returned;
#   - otherwise the search resumes one character after the occurrence.
#
# Returns the begin position, or nil when no acceptable occurrence exists.
def index(target, string, position = 0)
  span = target.length
  cursor = position

  while (found = string.index(target, cursor))
    resume_at = search_again_position(found)

    if resume_at.nil?
      state, = region_state([found, found + span])
      return found if state == :open

      cursor = found + 1
    else
      cursor = resume_at
    end
  end

  nil
end
|
84
|
+
|
85
|
+
private
|
86
|
+
|
87
|
+
# True when the region begins before the first closed part begins.
def front_open?(region, closed_parts)
  first_closed_begin = closed_parts.first[0]
  first_closed_begin > region[0]
end
|
90
|
+
|
91
|
+
# True when the last closed part ends before the region ends.
def rear_open?(region, closed_parts)
  last_closed_end = closed_parts.last[1]
  last_closed_end < region[1]
end
|
19
94
|
end
|
@@ -144,11 +144,28 @@ class TextAlignment::MixedAlignment
|
|
144
144
|
# Scores an sdiff as coverage multiplied by fragmentation rate.
#
# s1, s2 - the compared strings; not read here, kept for the existing signature.
# sdiff  - an Array of change objects responding to action, old_element and
#          new_element, or nil.
#
# coverage           - share of non-whitespace old elements whose action is '='.
# fragmentation rate - 1 / number of runs of '=' that remain once deletions,
#                      whitespace-only insertions and other actions are dropped
#                      and leading/trailing insertions are trimmed.
#
# Returns 0 when sdiff is nil, and 0.0 when nothing can be scored (no
# non-whitespace old element, or no matched run). Those cases previously
# divided by zero and produced NaN, which compares false against any threshold.
def compute_similarity(s1, s2, sdiff)
  return 0 if sdiff.nil?

  # recoverability
  count_nws = sdiff.count { |d| d.old_element =~ /\S/ }
  return 0.0 if count_nws.zero?

  count_nws_match = sdiff.count { |d| d.action == '=' && d.old_element =~ /\S/ }
  coverage = count_nws_match.to_f / count_nws

  # fragmentation rate
  frag_str = sdiff.map do |d|
    case d.action
    when '='
      '='
    when '+'
      (d.new_element =~ /\S/) ? '+' : ''
    else
      # deletions and any other action do not split a run of matches
      ''
    end
  end.join.sub(/^[^=]++/, '').sub(/[^=]+$/, '')

  count_frag = frag_str.scan(/=+/).count
  return 0.0 if count_frag.zero?

  coverage * (1.0 / count_frag)
end
|
154
171
|
end
|
@@ -15,12 +15,13 @@ class TextAlignment::TextAlignment
|
|
15
15
|
# Prepares an aligner for one reference text.
#
# reference_text     - the text to align against; must not be nil.
# to_prevent_overlap - stored as @to_prevent_overlap (default: false).
#
# Raises ArgumentError ("nil text") when reference_text is nil.
def initialize(reference_text, to_prevent_overlap = false)
  raise ArgumentError, "nil text" if reference_text.nil?

  @to_prevent_overlap = to_prevent_overlap

  # The reference text is kept both as given and in its character-mapped form.
  @original_reference_text = reference_text
  @rtext_mapping = TextAlignment::CharMapping.new(reference_text)
  @mapped_reference_text = @rtext_mapping.mapped_text

  # No input text has been aligned yet.
  @cultivation_map = TextAlignment::CultivationMap.new
  @original_text = nil
  @blocks = nil
end
|
26
27
|
|
@@ -34,51 +35,24 @@ class TextAlignment::TextAlignment
|
|
34
35
|
@text_mapping = TextAlignment::CharMapping.new(text)
|
35
36
|
end
|
36
37
|
|
37
|
-
|
38
|
+
@mapped_text = @text_mapping.mapped_text
|
38
39
|
denotations_mapped = @text_mapping.enmap_denotations(denotations)
|
39
40
|
|
40
|
-
rtext_mapped = @rtext_mapping.mapped_text
|
41
|
-
|
42
41
|
## To generate the block_alignment of the input text against the reference text
|
43
|
-
|
44
|
-
# Initialization
|
45
|
-
@block_alignment = {text: @original_text, reference_text: @original_rtext, denotations: denotations}
|
46
|
-
|
47
|
-
# Generation
|
48
|
-
@block_alignment[:blocks] = if r = whole_block_alignment(text_mapped, rtext_mapped, @cultivation_map)
|
42
|
+
@blocks = if r = whole_block_alignment(@mapped_text, @mapped_reference_text, @cultivation_map)
|
49
43
|
r
|
50
44
|
else
|
51
|
-
find_block_alignment(
|
45
|
+
find_block_alignment(@mapped_text, @mapped_reference_text, denotations_mapped, @cultivation_map)
|
52
46
|
end
|
53
|
-
end
|
54
|
-
|
55
|
-
def update_cultivation_map
|
56
|
-
return if @block_alignment.nil? || @block_alignment[:blocks].nil?
|
57
47
|
|
58
|
-
|
59
|
-
newly_cultivated_regions = @block_alignment[:blocks].collect do |b|
|
60
|
-
if b[:alignment] == :block || b[:alignment] == :term
|
61
|
-
[b[:target][:begin], b[:target][:end]]
|
62
|
-
else
|
63
|
-
nil
|
64
|
-
end
|
65
|
-
end.compact.inject([]) do |condensed, region|
|
66
|
-
if condensed.empty? || (condensed.last.last + 1 < region.first)
|
67
|
-
condensed.push region
|
68
|
-
else
|
69
|
-
condensed.last[1] = region.last
|
70
|
-
end
|
71
|
-
condensed
|
72
|
-
end
|
73
|
-
|
74
|
-
@cultivation_map.cultivate(newly_cultivated_regions)
|
48
|
+
@block_alignment = {text: @original_text, reference_text: @original_reference_text, denotations: denotations, blocks: demap_blocks(@blocks)}
|
75
49
|
end
|
76
50
|
|
77
51
|
def transform_begin_position(_begin_position)
|
78
52
|
begin_position = @text_mapping.enmap_position(_begin_position)
|
79
53
|
|
80
|
-
i = @
|
81
|
-
block = @
|
54
|
+
i = @blocks.index{|b| b[:source][:end] > begin_position}
|
55
|
+
block = @blocks[i]
|
82
56
|
|
83
57
|
b = if block[:alignment] == :block || block[:alignment] == :term
|
84
58
|
begin_position + block[:delta]
|
@@ -99,8 +73,8 @@ class TextAlignment::TextAlignment
|
|
99
73
|
def transform_end_position(_end_position)
|
100
74
|
end_position = @text_mapping.enmap_position(_end_position)
|
101
75
|
|
102
|
-
i = @
|
103
|
-
block = @
|
76
|
+
i = @blocks.index{|b| b[:source][:end] >= end_position}
|
77
|
+
block = @blocks[i]
|
104
78
|
|
105
79
|
e = if block[:alignment] == :block || block[:alignment] == :term
|
106
80
|
end_position + block[:delta]
|
@@ -134,7 +108,7 @@ class TextAlignment::TextAlignment
|
|
134
108
|
source = {begin:d.begin, end:d.end}
|
135
109
|
d.begin = transform_begin_position(d.begin);
|
136
110
|
d.end = transform_end_position(d.end);
|
137
|
-
raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @
|
111
|
+
raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_reference_text.length
|
138
112
|
rescue
|
139
113
|
@lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
|
140
114
|
d.begin = nil
|
@@ -150,7 +124,7 @@ class TextAlignment::TextAlignment
|
|
150
124
|
|
151
125
|
r = hdenotations.collect do |d|
|
152
126
|
t = transform_a_span(d[:span])
|
153
|
-
raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @
|
127
|
+
raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_reference_text.length
|
154
128
|
new_d = d.dup.merge({span:t})
|
155
129
|
rescue
|
156
130
|
@lost_annotations << {source: d[:span], target:t}
|
@@ -218,7 +192,7 @@ class TextAlignment::TextAlignment
|
|
218
192
|
|
219
193
|
"***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
|
220
194
|
"[#{astr1}]\n" +
|
221
|
-
"[#{astr2}]\n\n"
|
195
|
+
"[#{astr2.gsub("\n", " ")}]\n\n"
|
222
196
|
end
|
223
197
|
end
|
224
198
|
show
|
@@ -257,142 +231,125 @@ class TextAlignment::TextAlignment
|
|
257
231
|
# puts "-=-=-=-=-"
|
258
232
|
# puts
|
259
233
|
|
260
|
-
##
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
234
|
+
## To fill the gaps
|
235
|
+
## lblock: last block, cblock: current block
|
236
|
+
lblock = nil
|
237
|
+
blocks2 = (blocks + [nil]).inject([]) do |sum, cblock|
|
238
|
+
b1 = lblock.nil? ? 0 : lblock[:source][:end]
|
239
|
+
e1 = cblock.nil? ? str1.length : cblock[:source][:begin]
|
265
240
|
|
266
|
-
|
267
|
-
[
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
if
|
273
|
-
[
|
274
|
-
{source:{begin:b1, end:e1}, alignment: :empty},
|
275
|
-
block
|
276
|
-
]
|
241
|
+
if b1 < e1
|
242
|
+
b2 = lblock.nil? ? 0 : lblock[:target][:end]
|
243
|
+
e2 = cblock.nil? ? str2.length : cblock[:target][:begin]
|
244
|
+
_str1 = str1[b1 ... e1]
|
245
|
+
_str2 = str2[b2 ... e2]
|
246
|
+
|
247
|
+
sum += if _str1.strip.empty? || _str2.strip.empty?
|
248
|
+
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
277
249
|
else
|
278
250
|
len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
251
|
+
region_state, state_region = cultivation_map.region_state([b2, e2])
|
252
|
+
case region_state
|
253
|
+
when :closed
|
254
|
+
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
255
|
+
when :front_open
|
256
|
+
if sum.empty? # when there is no preceding matched block
|
257
|
+
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
258
|
+
else
|
259
|
+
oe2 = state_region[1]
|
260
|
+
me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
|
261
|
+
local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
|
262
|
+
end
|
263
|
+
when :rear_open
|
264
|
+
if cblock.nil? # when there is no following matched block
|
265
|
+
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
266
|
+
else
|
267
|
+
ob2 = state_region[0]
|
268
|
+
mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
|
269
|
+
local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
|
270
|
+
end
|
271
|
+
when :middle_closed
|
272
|
+
attempt1 = if sum.empty?
|
273
|
+
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
274
|
+
else
|
275
|
+
oe2 = state_region[0]
|
276
|
+
me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
|
277
|
+
local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
|
278
|
+
end
|
279
|
+
if (attempt1.empty? || attempt1.first[:alignment] == :empty) && !cblock.nil?
|
280
|
+
ob2 = state_region[1]
|
281
|
+
mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
|
282
|
+
local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
|
283
|
+
else
|
284
|
+
attempt1
|
285
|
+
end
|
286
|
+
else # :open
|
287
|
+
if (e2 - b2) > len_buffer
|
288
|
+
attempt1 = if sum.empty?
|
289
|
+
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
290
|
+
else
|
291
|
+
local_alignment(str1, b1, e1, str2, b2, b2 + len_buffer, denotations, cultivation_map)
|
292
|
+
end
|
293
|
+
if (attempt1.empty? || attempt1.first[:alignment] == :empty) && !cblock.nil?
|
294
|
+
local_alignment(str1, b1, e1, str2, e2 - len_buffer, e2, denotations, cultivation_map)
|
295
|
+
else
|
296
|
+
attempt1
|
297
|
+
end
|
298
|
+
else
|
299
|
+
local_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
|
300
|
+
end
|
298
301
|
end
|
299
302
|
end
|
300
303
|
end
|
301
304
|
|
302
|
-
|
303
|
-
sum
|
305
|
+
lblock = cblock
|
306
|
+
cblock.nil? ? sum : sum << cblock
|
304
307
|
end
|
305
308
|
|
306
|
-
# the last step
|
307
|
-
blocks2 += if last_block.nil?
|
308
|
-
local_alignment_blocks(str1, 0, str1.length, str2, 0, str2.length, denotations)
|
309
|
-
else
|
310
|
-
b1 = last_block[:source][:end]
|
311
|
-
if b1 < str1.length
|
312
|
-
e1 = str1.length
|
313
|
-
b2 = last_block[:target][:end]
|
314
|
-
|
315
|
-
_str1 = str1[b1 ... e1]
|
316
|
-
if _str1.strip.empty?
|
317
|
-
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
318
|
-
else
|
319
|
-
if b2 < str2.length
|
320
|
-
len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
321
|
-
e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
|
322
|
-
|
323
|
-
local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
|
324
|
-
else
|
325
|
-
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
326
|
-
end
|
327
|
-
end
|
328
|
-
else
|
329
|
-
[]
|
330
|
-
end
|
331
|
-
end
|
332
309
|
end
|
333
310
|
|
334
311
|
# Tries to place all of str1 inside str2 as a single block.
#
# The lookup goes through cultivation_map.index, first with the strings as
# given and then, if that fails, with both strings downcased.
#
# Returns a one-element Array describing the block (source 0...str1.length,
# target at the found offset, delta equal to that offset, alignment :block),
# or nil when str1 is not found either way.
def whole_block_alignment(str1, str2, cultivation_map)
  offset = cultivation_map.index(str1, str2)
  offset = cultivation_map.index(str1.downcase, str2.downcase) if offset.nil?
  return nil if offset.nil?

  size = str1.length
  [{source: {begin: 0, end: size}, target: {begin: offset, end: offset + size}, delta: offset, alignment: :block}]
end
|
359
320
|
|
360
|
-
|
361
|
-
|
321
|
+
# Aligns str1[b1...e1] against str2[b2...e2], preferring the term-based result.
#
# The blocks from term_based_alignment are returned as they are unless they
# are empty or begin with an :empty block; in that case lcs_alignment is used.
def local_alignment(str1, b1, e1, str2, b2, e2, denotations = nil, cultivation_map)
  term_blocks = term_based_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
  usable = !term_blocks.empty? && term_blocks.first[:alignment] != :empty
  return term_blocks if usable

  lcs_alignment(str1, b1, e1, str2, b2, e2, cultivation_map)
end
|
366
329
|
|
367
|
-
def
|
368
|
-
|
330
|
+
def term_based_alignment(str1, b1, e1, str2, b2, e2, denotations = nil, cultivation_map)
|
331
|
+
str2_block = str2[0 ... e2]
|
369
332
|
|
370
333
|
## term-based alignment
|
371
334
|
tblocks = if denotations
|
372
|
-
|
335
|
+
denotations_in_scope = denotations.select{|d| d[:span][:begin] >= b1 && d[:span][:end] <= e1}.
|
373
336
|
sort{|d1, d2| d1[:span][:begin] <=> d2[:span][:begin] || d2[:span][:end] <=> d1[:span][:end] }.
|
374
337
|
map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
|
375
338
|
|
376
|
-
|
377
|
-
_tblocks =
|
378
|
-
lex =
|
379
|
-
|
380
|
-
if
|
381
|
-
|
382
|
-
|
383
|
-
end
|
384
|
-
position = r + lex.length
|
385
|
-
{source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, similarity: 0.9, delta: r + b2 - term[:span][:begin]}
|
339
|
+
search_position = b2
|
340
|
+
_tblocks = denotations_in_scope.map do |denotation|
|
341
|
+
lex = denotation[:lex]
|
342
|
+
term_begin = cultivation_map.index(lex, str2_block, search_position)
|
343
|
+
break [] if term_begin.nil? # break the loop if a missing term is found
|
344
|
+
search_position = term_begin + lex.length
|
345
|
+
{source:denotation[:span], target:{begin:term_begin, end:term_begin + lex.length}, alignment: :term, similarity: 0.9, delta: term_begin - denotation[:span][:begin]}
|
386
346
|
end
|
387
347
|
|
388
|
-
# missing term found
|
389
|
-
_tblocks = [] if position.nil?
|
390
|
-
|
391
348
|
# redundant matching found
|
392
|
-
unless
|
393
|
-
|
394
|
-
|
395
|
-
look_forward =
|
349
|
+
unless _tblocks.empty?
|
350
|
+
search_position = _tblocks.last[:target][:end]
|
351
|
+
denotations_in_scope.each do |term|
|
352
|
+
look_forward = cultivation_map.index(term[:lex], str2_block, search_position)
|
396
353
|
unless look_forward.nil?
|
397
354
|
_tblocks = []
|
398
355
|
break
|
@@ -405,73 +362,72 @@ class TextAlignment::TextAlignment
|
|
405
362
|
[]
|
406
363
|
end
|
407
364
|
|
408
|
-
|
409
|
-
|
410
|
-
|
411
|
-
|
412
|
-
else
|
413
|
-
block1 = str1[b1 ... e1]
|
414
|
-
block2 = str2[b2 ... e2]
|
415
|
-
|
416
|
-
## character-based alignment
|
417
|
-
alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
|
418
|
-
if alignment.sdiff.nil?
|
419
|
-
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
420
|
-
else
|
421
|
-
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
|
422
|
-
end
|
423
|
-
end
|
424
|
-
else
|
425
|
-
block1 = str1[b1 ... e1]
|
426
|
-
block2 = str2[b2 ... e2]
|
365
|
+
ltblock = nil
|
366
|
+
tblocks2 = (tblocks + [nil]).inject([]) do |sum, ctblock|
|
367
|
+
tb1 = ltblock.nil? ? b1 : ltblock[:source][:end]
|
368
|
+
te1 = ctblock.nil? ? e1 : ctblock[:source][:begin]
|
427
369
|
|
428
|
-
|
429
|
-
|
430
|
-
|
431
|
-
|
432
|
-
else
|
433
|
-
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
|
434
|
-
end
|
370
|
+
if te1 > tb1
|
371
|
+
tb2 = ltblock.nil? ? b2 : ltblock[:target][:end]
|
372
|
+
te2 = ctblock.nil? ? e2 : ctblock[:target][:begin]
|
373
|
+
sum << {source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty}
|
435
374
|
end
|
436
|
-
else
|
437
|
-
last_tblock = nil
|
438
|
-
lblocks = tblocks.inject([]) do |sum, tblock|
|
439
|
-
tb1 = last_tblock ? last_tblock[:source][:end] : b1
|
440
|
-
te1 = tblock[:source][:begin]
|
441
375
|
|
442
|
-
|
443
|
-
|
444
|
-
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
if b2 == e2
|
449
|
-
[
|
450
|
-
{source:{begin:tb1, end:te1}, alignment: :empty},
|
451
|
-
tblock
|
452
|
-
]
|
453
|
-
else
|
454
|
-
[
|
455
|
-
{source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty},
|
456
|
-
tblock
|
457
|
-
]
|
458
|
-
end
|
459
|
-
end
|
376
|
+
ltblock = ctblock
|
377
|
+
ctblock.nil? ? sum : sum << ctblock
|
378
|
+
end
|
379
|
+
|
380
|
+
tblocks2
|
381
|
+
end
|
460
382
|
|
461
|
-
|
462
|
-
|
383
|
+
# Character-based alignment of str1[b1...e1] against str2[b2...e2].
#
# Returns a one-element Array. The block is marked :empty when the source
# span is longer than 2000 characters (no alignment is attempted) or when the
# MixedAlignment of the two downcased slices has a similarity below 0.5.
# Otherwise the block carries the MixedAlignment object and its similarity.
#
# cultivation_map is accepted but not used here.
def lcs_alignment(str1, b1, e1, str2, b2, e2, cultivation_map)
  unaligned = {source: {begin: b1, end: e1}, target: {begin: b2, end: e2}, alignment: :empty}
  return [unaligned] if (e1 - b1) > 2000

  mixed = TextAlignment::MixedAlignment.new(str1[b1...e1].downcase, str2[b2...e2].downcase)
  return [unaligned] if mixed.similarity < 0.5

  [unaligned.merge(alignment: mixed, similarity: mixed.similarity)]
end
|
464
398
|
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
|
399
|
+
# Feeds the target spans of the current :block and :term blocks into the
# cultivation map. Does nothing when @blocks is nil.
#
# Before being passed to cultivate, consecutive spans are condensed: a span
# is folded into the previous one unless it starts more than one position
# after the previous one ends.
def update_cultivation_map
  return if @blocks.nil?

  anchored = @blocks.select { |b| b[:alignment] == :block || b[:alignment] == :term }
  spans = anchored.map { |b| [b[:target][:begin], b[:target][:end]] }

  condensed = spans.each_with_object([]) do |span, acc|
    if acc.empty? || (acc.last.last + 1 < span.first)
      acc.push span
    else
      acc.last[1] = span.last
    end
  end

  @cultivation_map.cultivate(condensed)
end
|
420
|
+
|
421
|
+
# Returns copies of the blocks with their positions passed through
# demap_position: :source spans via @text_mapping and :target spans via
# @rtext_mapping. A block without :source or :target keeps that key absent.
#
# The given blocks are not modified. Returns nil when _blocks is nil.
def demap_blocks(_blocks)
  return nil if _blocks.nil?

  _blocks.map do |block|
    copy = block.dup
    src = copy[:source]
    tgt = copy[:target]

    if src
      copy[:source] = {begin: @text_mapping.demap_position(src[:begin]), end: @text_mapping.demap_position(src[:end])}
    end
    if tgt
      copy[:target] = {begin: @rtext_mapping.demap_position(tgt[:begin]), end: @rtext_mapping.demap_position(tgt[:end])}
    end

    copy
  end
end
|
476
432
|
|
477
433
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.11.0
|
4
|
+
version: 0.11.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-03-
|
11
|
+
date: 2021-03-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|