text_alignment 0.8.1 → 0.11.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/align_annotations +23 -13
- data/lib/text_alignment/anchor_finder.rb +120 -70
- data/lib/text_alignment/{mappings.rb → char_mapping.rb} +92 -75
- data/lib/text_alignment/cultivation_map.rb +94 -0
- data/lib/text_alignment/glcs_alignment_fast.rb +2 -2
- data/lib/text_alignment/mixed_alignment.rb +21 -3
- data/lib/text_alignment/text_alignment.rb +241 -233
- data/lib/text_alignment/version.rb +1 -1
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ef59c0cd578ed453a67edeb3b29059f4b76c4c541f777fa35a06e76b299e2564
|
4
|
+
data.tar.gz: 5594c0f6eb1d52cc331c210fdf936e7cb09e30277f21933027e55b0c8cfa0e24
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ef5bae492d7e7b840c00943ac24e571392c4c992a085c6a63c8cb6db126ed9137ba94f8cd7af42a91e17aa327cb9a5ce24b909944c157f21beb8a88c8ce7528b
|
7
|
+
data.tar.gz: b6d84dcdc35399f91a0f6a5a24d84b11169de9d861a249acca52f0fec86e6f1fbf1bb4dbb47a5d43654fa43735b181aa64034447fb71ca675090818dcf67133a
|
data/bin/align_annotations
CHANGED
@@ -26,8 +26,8 @@ def read_text(filename)
|
|
26
26
|
end
|
27
27
|
end
|
28
28
|
|
29
|
-
def align_denotations(denotations, source_text,
|
30
|
-
alignment
|
29
|
+
def align_denotations(denotations, source_text, alignment, debug = false)
|
30
|
+
alignment.align(source_text, denotations)
|
31
31
|
new_denotations = alignment.transform_hdenotations(denotations)
|
32
32
|
|
33
33
|
if debug
|
@@ -50,8 +50,8 @@ def align_denotations(denotations, source_text, target_text, debug = false)
|
|
50
50
|
new_denotations
|
51
51
|
end
|
52
52
|
|
53
|
-
def align_mannotations(source_annotations,
|
54
|
-
target_annotations = {text:
|
53
|
+
def align_mannotations(source_annotations, reference_text, alignment, debug = false)
|
54
|
+
target_annotations = {text:reference_text}
|
55
55
|
|
56
56
|
idnum_denotations = 0
|
57
57
|
idnum_relations = 0
|
@@ -62,7 +62,8 @@ def align_mannotations(source_annotations, target_text, debug = false)
|
|
62
62
|
if annotations.has_key?(:denotations) && !annotations[:denotations].empty?
|
63
63
|
ididx = {}
|
64
64
|
warn "[#{i}]-=-=-=-=-"
|
65
|
-
denotations = align_denotations(annotations[:denotations], annotations[:text],
|
65
|
+
denotations = align_denotations(annotations[:denotations], annotations[:text], alignment, debug)
|
66
|
+
|
66
67
|
denotations.each do |d|
|
67
68
|
reid = 'T' + (idnum_denotations += 1).to_s
|
68
69
|
ididx[d[:id]] = reid
|
@@ -76,7 +77,9 @@ def align_mannotations(source_annotations, target_text, debug = false)
|
|
76
77
|
annotations[:relations].each do |r|
|
77
78
|
reid = 'R' + (idnum_relations += 1).to_s
|
78
79
|
ididx[r[:id]] = reid
|
79
|
-
|
80
|
+
sid = ididx[r[:subj]]
|
81
|
+
oid = ididx[r[:obj]]
|
82
|
+
target_annotations[:relations] << r.dup.merge({id:reid, subj:sid, obj:oid}) unless sid.nil? || oid.nil?
|
80
83
|
end
|
81
84
|
end
|
82
85
|
|
@@ -85,7 +88,8 @@ def align_mannotations(source_annotations, target_text, debug = false)
|
|
85
88
|
annotations[:attributes].each do |a|
|
86
89
|
reid = 'A' + (idnum_attributes += 1).to_s
|
87
90
|
ididx[a[:id]] = reid
|
88
|
-
|
91
|
+
sid = ididx[a[:subj]]
|
92
|
+
target_annotations[:attributes] << a.dup.merge({id:reid, subj:sid}) unless sid.nil?
|
89
93
|
end
|
90
94
|
end
|
91
95
|
|
@@ -94,7 +98,8 @@ def align_mannotations(source_annotations, target_text, debug = false)
|
|
94
98
|
annotations[:modifications].each do |m|
|
95
99
|
reid = 'M' + (idnum_modifications += 1).to_s
|
96
100
|
ididx[m[:id]] = reid
|
97
|
-
|
101
|
+
oid = ididx[m[:obj]]
|
102
|
+
target_annotations[:modifications] << m.dup.merge({id:reid, obj:oid}) unless oid.nil?
|
98
103
|
end
|
99
104
|
end
|
100
105
|
end
|
@@ -109,13 +114,18 @@ unless ARGV.length == 2
|
|
109
114
|
end
|
110
115
|
|
111
116
|
source_annotations = read_annotations(ARGV[0])
|
112
|
-
|
117
|
+
reference_text = read_text(ARGV[1])
|
118
|
+
|
119
|
+
alignment = TextAlignment::TextAlignment.new(reference_text, true)
|
113
120
|
|
114
121
|
target_annotations = if source_annotations.class == Array
|
115
|
-
align_mannotations(source_annotations,
|
122
|
+
# align_mannotations(source_annotations, reference_text, alignment, true)
|
123
|
+
align_mannotations(source_annotations, reference_text, alignment, false)
|
116
124
|
else
|
117
|
-
denotations = align_denotations(source_annotations[:denotations], source_annotations[:text],
|
118
|
-
source_annotations
|
125
|
+
# denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment)
|
126
|
+
denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment, false)
|
127
|
+
source_annotations.merge({text:reference_text, denotations:denotations})
|
119
128
|
end
|
120
129
|
|
121
|
-
#
|
130
|
+
# pp alignment.block_alignment
|
131
|
+
puts target_annotations.to_json
|
@@ -6,92 +6,141 @@ module TextAlignment; end unless defined? TextAlignment
|
|
6
6
|
|
7
7
|
class TextAlignment::AnchorFinder
|
8
8
|
|
9
|
-
def initialize(source_str, target_str,
|
10
|
-
@
|
11
|
-
@
|
12
|
-
@sim_threshold = _text_similiarity_threshold || TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
9
|
+
def initialize(source_str, target_str, cultivation_map)
|
10
|
+
@s1 = source_str.downcase
|
11
|
+
@s2 = target_str.downcase
|
13
12
|
|
14
|
-
@
|
13
|
+
@cultivation_map = cultivation_map
|
15
14
|
|
16
|
-
@
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
15
|
+
@size_ngram = TextAlignment::SIZE_NGRAM
|
16
|
+
@size_window = TextAlignment::SIZE_WINDOW
|
17
|
+
@sim_threshold = TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
18
|
+
@pos_s1_final_possible_begin = @s1.length - @size_ngram - 1
|
19
|
+
@pos_s2_final_possible_end = @s2.length
|
21
20
|
|
22
|
-
#
|
23
|
-
@
|
24
|
-
@
|
25
|
-
@end_s2_prev = 0
|
21
|
+
# positions of last match
|
22
|
+
@pos_s1_last_match = 0
|
23
|
+
@pos_s2_last_match = 0
|
26
24
|
end
|
27
25
|
|
28
26
|
def get_next_anchor
|
29
|
-
# find the
|
30
|
-
|
31
|
-
if [' ', "\n", "\t"].include? @s1[@beg_s1]
|
32
|
-
@beg_s1 += 1
|
33
|
-
next
|
34
|
-
end
|
35
|
-
anchor = @s1[@beg_s1, @size_ngram]
|
36
|
-
|
37
|
-
# search_position = 0
|
38
|
-
search_position = @end_s2_prev
|
39
|
-
while @beg_s2 = @s2.index(anchor, search_position)
|
40
|
-
# if both the begining points are sufficiantly close to the end points of the last match
|
41
|
-
break if @beg_s1 > 0 && @beg_s2 > 0 && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
|
27
|
+
# To find the beginning positions of an anchor ngram in s1 and s2, beginning from the last positions matched
|
28
|
+
beg_s2 = for beg_s1 in @pos_s1_last_match .. @pos_s1_final_possible_begin
|
42
29
|
|
43
|
-
|
44
|
-
|
30
|
+
# To skip whitespace letters
|
31
|
+
next if [' ', "\n", "\t"].include? @s1[beg_s1]
|
45
32
|
|
46
|
-
|
47
|
-
|
33
|
+
_beg_s2 = get_beg_s2(beg_s1)
|
34
|
+
break _beg_s2 unless _beg_s2.nil?
|
35
|
+
end
|
48
36
|
|
49
|
-
|
50
|
-
|
37
|
+
# To return nil when it fails to find an anchor
|
38
|
+
return nil if beg_s2.class == Range
|
51
39
|
|
52
|
-
|
40
|
+
# To extend the block to the left
|
41
|
+
b1 = beg_s1
|
42
|
+
b2 = beg_s2
|
43
|
+
left_boundary_b2 = [@pos_s2_last_match, (@cultivation_map.last_cultivated_position(b2) || 0)].max
|
44
|
+
while b1 > @pos_s1_last_match && b2 > left_boundary_b2 && @s1[b1 - 1] == @s2[b2 - 1]
|
45
|
+
b1 -= 1; b2 -= 1
|
46
|
+
end
|
53
47
|
|
54
|
-
|
48
|
+
# To extend the block to the right
|
49
|
+
e1 = beg_s1 + @size_ngram
|
50
|
+
e2 = beg_s2 + @size_ngram
|
51
|
+
right_boundary_b2 = @cultivation_map.next_cultivated_position(e2) || @pos_s2_final_possible_end
|
52
|
+
while @s1[e1] && e2 < right_boundary_b2 && @s1[e1] == @s2[e2]
|
53
|
+
e1 += 1; e2 += 1
|
55
54
|
end
|
56
55
|
|
57
|
-
|
56
|
+
@pos_s1_last_match = e1
|
57
|
+
@pos_s2_last_match = e2
|
58
58
|
|
59
|
-
|
60
|
-
|
61
|
-
b2 = @beg_s2
|
62
|
-
while b1 >= @end_s1_prev && b2 >= @end_s2_prev && @s1[b1] == @s2[b2]
|
63
|
-
b1 -= 1; b2 -= 1
|
64
|
-
end
|
59
|
+
{source:{begin:b1 , end:e1}, target:{begin:b2, end:e2}}
|
60
|
+
end
|
65
61
|
|
66
|
-
|
62
|
+
private
|
67
63
|
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
64
|
+
def get_beg_s2(beg_s1)
|
65
|
+
# to get the anchor to search for in s2
|
66
|
+
anchor = @s1[beg_s1, @size_ngram]
|
67
|
+
|
68
|
+
# comment out below with the assumption that texts are in the same order
|
69
|
+
# search_position = 0
|
70
|
+
search_position = @pos_s2_last_match
|
71
|
+
|
72
|
+
beg_s2_candidates = find_beg_s2_candidates(anchor, search_position)
|
73
|
+
return nil if beg_s2_candidates.empty?
|
74
|
+
|
75
|
+
find_valid_beg_s2(beg_s1, beg_s2_candidates)
|
76
|
+
end
|
77
|
+
|
78
|
+
# Collects the positions in @s2, at or after search_position, where the
# anchor is found by @cultivation_map.index.
#
# Returns an Array of begin positions in ascending order. The Array is empty
# when the anchor is not found, and also when more than 5 occurrences are
# found: an anchor that frequent is skipped for speed.
def find_beg_s2_candidates(anchor, search_position)
  found = []
  cursor = search_position

  loop do
    hit = @cultivation_map.index(anchor, @s2, cursor)
    break unless hit

    found << hit

    # for speed, skip anchor of high frequency
    return [] if found.length > 5

    # resume the search right after the beginning of this occurrence
    cursor = hit + 1
  end

  found
end
|
95
|
+
|
96
|
+
# Picks, among beg_s2_candidates, the position in s2 that corresponds to the
# anchor beginning at beg_s1 in s1.
#
# The candidates are tested with context windows of size 10, 20 and 30 in
# turn. A round is decisive when it runs through all candidates without a
# second one passing the test; its result (a position, or nil when no
# candidate passed) is returned. When two candidates pass, the round is
# abandoned and the next, larger window is tried. If even the largest window
# leaves two passing candidates, the first passing candidate of that last
# round is returned.
def find_valid_beg_s2(beg_s1, beg_s2_candidates)
  valid_beg_s2 = nil

  (10 .. 30).step(10).each do |size_window|
    valid_beg_s2 = nil

    # Array#each returns the array itself when it completes, and nil when
    # it is left through a bare break.
    completed = beg_s2_candidates.each do |beg_s2|
      next unless acceptable_beg_s2?(beg_s1, beg_s2, size_window)

      # a second passing candidate makes this window size ambiguous
      break unless valid_beg_s2.nil?

      valid_beg_s2 = beg_s2
    end

    break unless completed.nil?
  end

  valid_beg_s2
end

# Tells whether beg_s2 is a plausible counterpart of beg_s1: either both
# positions are close (less than 5 characters) to the end of the last match,
# or the text to the left, or else to the right, of the anchor is similar
# enough in both strings within a window of size_window characters.
def acceptable_beg_s2?(beg_s1, beg_s2, size_window)
  # if both the beginning points are sufficiently close to the end points of the last match
  return true if beg_s1 > 0 && beg_s2 > 0 && (beg_s1 - @pos_s1_last_match < 5) && (beg_s2 - @pos_s2_last_match < 5)

  left_window_s1, left_window_s2 = get_left_windows(beg_s1, beg_s2, size_window)
  return true if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)

  right_window_s1, right_window_s2 = get_right_windows(beg_s1, beg_s2, size_window)
  return true if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)

  false
end
|
84
133
|
|
85
|
-
|
134
|
+
def get_left_windows(beg_s1, beg_s2, size_window = nil)
|
135
|
+
size_window ||= @size_window
|
86
136
|
|
87
|
-
|
88
|
-
#
|
89
|
-
# return if @beg_s1 < @size_window || @beg_s2 < @size_window
|
137
|
+
# comment out below with the assumption that the beginning of a document gives a significant locational information
|
138
|
+
# return if @beg_s1 < size_window || @beg_s2 < size_window
|
90
139
|
|
91
140
|
window_s1 = ''
|
92
|
-
loc =
|
141
|
+
loc = beg_s1 - 1
|
93
142
|
count = 0
|
94
|
-
while count <
|
143
|
+
while count < size_window && loc >= 0
|
95
144
|
if @s1[loc] =~ /[0-9a-zA-Z]/
|
96
145
|
window_s1 += @s1[loc]
|
97
146
|
count += 1
|
@@ -100,9 +149,9 @@ class TextAlignment::AnchorFinder
|
|
100
149
|
end
|
101
150
|
|
102
151
|
window_s2 = ''
|
103
|
-
loc =
|
152
|
+
loc = beg_s2 - 1
|
104
153
|
count = 0
|
105
|
-
while count <
|
154
|
+
while count < size_window && loc >= 0
|
106
155
|
if @s2[loc] =~ /[0-9a-zA-Z]/
|
107
156
|
window_s2 += @s2[loc]
|
108
157
|
count += 1
|
@@ -113,15 +162,17 @@ class TextAlignment::AnchorFinder
|
|
113
162
|
[window_s1, window_s2]
|
114
163
|
end
|
115
164
|
|
116
|
-
def get_right_windows
|
165
|
+
def get_right_windows(beg_s1, beg_s2, size_window = nil)
|
166
|
+
size_window ||= @size_window
|
167
|
+
|
117
168
|
# comment out below with the assumption that the end of a document gives significant locational information
|
118
|
-
# return if (@beg_s1 + @size_ngram > (@s1.length -
|
169
|
+
# return if (@beg_s1 + @size_ngram > (@s1.length - size_window)) || (@beg_s2 + @size_ngram > (@s2.length - size_window))
|
119
170
|
|
120
171
|
window_s1 = ''
|
121
|
-
loc =
|
172
|
+
loc = beg_s1 + @size_ngram
|
122
173
|
len_s1 = @s1.length
|
123
174
|
count = 0
|
124
|
-
while count <
|
175
|
+
while count < size_window && loc < len_s1
|
125
176
|
if @s1[loc] =~ /[0-9a-zA-Z]/
|
126
177
|
window_s1 += @s1[loc]
|
127
178
|
count += 1
|
@@ -130,10 +181,10 @@ class TextAlignment::AnchorFinder
|
|
130
181
|
end
|
131
182
|
|
132
183
|
window_s2 = ''
|
133
|
-
loc =
|
184
|
+
loc = beg_s2 + @size_ngram
|
134
185
|
len_s2 = @s2.length
|
135
186
|
count = 0
|
136
|
-
while count <
|
187
|
+
while count < size_window && loc < len_s2
|
137
188
|
if @s2[loc] =~ /[0-9a-zA-Z]/
|
138
189
|
window_s2 += @s2[loc]
|
139
190
|
count += 1
|
@@ -148,5 +199,4 @@ class TextAlignment::AnchorFinder
|
|
148
199
|
return 0 if str1.nil? || str2.nil?
|
149
200
|
String::Similarity.cosine(str1, str2, ngram:ngram_order)
|
150
201
|
end
|
151
|
-
|
152
|
-
end
|
202
|
+
end
|
@@ -1,6 +1,6 @@
|
|
1
1
|
module TextAlignment; end unless defined? TextAlignment
|
2
2
|
|
3
|
-
TextAlignment::
|
3
|
+
TextAlignment::CHAR_MAPPING = [
|
4
4
|
["©", "(c)"], #U+00A9 (Copyright Sign)
|
5
5
|
|
6
6
|
["α", "alpha"], #U+03B1 (greek small letter alpha)
|
@@ -61,9 +61,10 @@ TextAlignment::MAPPINGS = [
|
|
61
61
|
["•", "*"], #U+2022 (bullet)
|
62
62
|
[" ", " "], #U+2009 (thin space)
|
63
63
|
[" ", " "], #U+200A (hair space)
|
64
|
-
[" ", " "], #U+00A0 (
|
64
|
+
[" ", " "], #U+00A0 (Non-Breaking space)
|
65
65
|
[" ", " "], #U+3000 (ideographic space)
|
66
|
-
["
|
66
|
+
["‐", "-"], #U+2010 (Hyphen)
|
67
|
+
["‑", "-"], #U+2011 (Non-Breaking Hyphen)
|
67
68
|
["−", "-"], #U+2212 (minus sign)
|
68
69
|
["–", "-"], #U+2013 (en dash)
|
69
70
|
["′", "'"], #U+2032 (prime)
|
@@ -75,98 +76,114 @@ TextAlignment::MAPPINGS = [
|
|
75
76
|
]
|
76
77
|
|
77
78
|
|
78
|
-
|
79
|
+
# Normalizes a text with a character mapping and keeps an index between
# character positions of the original text and of the normalized text.
#
# A mapping is an array of [one, long] string pairs:
# * when long is a single character, every occurrence of one is replaced by long;
# * otherwise, every occurrence of long is replaced by one.
# In addition, every run of two or more whitespace characters is replaced by
# a single space.
class TextAlignment::CharMapping
  # The normalized text.
  attr_reader :mapped_text

  # _text        - the text to normalize; it is not modified.
  # char_mapping - an array of [one, long] pairs; TextAlignment::CHAR_MAPPING
  #                is used when it is nil.
  def initialize(_text, char_mapping = nil)
    char_mapping ||= TextAlignment::CHAR_MAPPING
    @mapped_text, offset_mapping = enmap_text(_text, char_mapping)
    @index_enmap = offset_mapping.to_h
    @index_demap = offset_mapping.map(&:reverse).to_h
  end

  # Returns the position in the normalized text for a position in the
  # original text, or nil when the position has no entry in the index
  # (positions strictly inside a replaced span have none).
  def enmap_position(position)
    @index_enmap[position]
  end

  # Returns the position in the original text for a position in the
  # normalized text, or nil when the position has no entry in the index.
  def demap_position(position)
    @index_demap[position]
  end

  # Returns copies of the given denotations with the begin and end of their
  # :span translated by enmap_position. Returns nil when _denotations is nil.
  def enmap_denotations(_denotations)
    return nil if _denotations.nil?

    _denotations.map do |d|
      span = d[:span]
      d.merge(span: {begin: enmap_position(span[:begin]), end: enmap_position(span[:end])})
    end
  end

  private

  # Returns [normalized text, offset mapping], where the offset mapping is an
  # array of [position in the original text, position in the normalized text].
  def enmap_text(_text, char_mapping)
    text = _text.dup
    single_mappings, long_mappings = char_mapping.partition { |_one, long| long.length == 1 }

    # To execute the single letter mapping
    single_mappings.each { |one, long| text.gsub!(one, long) }

    # To get the (location, length) index for replacements
    loc_len = []
    long_mappings.each do |one, long|
      init_next = 0
      while (loc = text.index(long, init_next))
        loc_len << [loc, long.length]
        init_next = loc + long.length
      end

      # a workaround to avoid messing-up due to embedding:
      # the span keeps its length for now, so the locations found so far stay valid
      text.gsub!(long, one * long.length)
    end

    # To get the (location, length) index for consecutive whitespace sequences
    init_next = 0
    while (loc = text.index(/\s{2,}/, init_next))
      len = Regexp.last_match(0).length
      loc_len << [loc, len]
      init_next = loc + len
    end

    loc_len.sort! { |a, b| a[0] <=> b[0] }

    # To get the offset_mapping before and after replacement.
    # Every span shrinks to one character, so only its first position is
    # indexed; the next free position in the normalized text always equals
    # the number of entries collected so far.
    offset_mapping = []
    init_next = 0
    loc_len.each do |loc, len|
      (init_next .. loc).each { |i| offset_mapping << [i, offset_mapping.length] }
      init_next = loc + len
    end
    # text.length is included so that an end position at the end of the text is indexed
    (init_next .. text.length).each { |i| offset_mapping << [i, offset_mapping.length] }

    # To execute the long letter mapping
    # NOTE(review): this also shrinks a run of long.length consecutive `one`
    # characters that was already present in the input text, and such a run
    # is not recorded in loc_len -- confirm whether that can occur in practice.
    long_mappings.each { |one, long| text.gsub!(one * long.length, one) }

    # To replace multi whitespace sequences to a space
    text.gsub!(/\s{2,}/, ' ')

    [text, offset_mapping]
  end
end
|
169
|
+
|
170
|
+
if __FILE__ == $0
|
171
|
+
require 'json'
|
172
|
+
|
173
|
+
unless ARGV.length == 1
|
174
|
+
warn "#{$0} an_annotation_json_file.json"
|
175
|
+
exit
|
176
|
+
end
|
177
|
+
annotations = JSON.parse File.read(ARGV[0]).strip, symbolize_names: true
|
178
|
+
denotations = annotations[:denotations]
|
179
|
+
if denotations.nil? && annotations[:tracks]
|
180
|
+
denotations = annotations[:tracks].first[:denotations]
|
181
|
+
end
|
182
|
+
|
183
|
+
text_mapping = TextAlignment::CharMapping.new(annotations[:text])
|
184
|
+
text_mapped = text_mapping.mapped_text
|
185
|
+
denotations_mapped = text_mapping.enmap_denotations(denotations)
|
186
|
+
new_annotations = {text:text_mapped, denotations:denotations_mapped}
|
171
187
|
|
188
|
+
puts new_annotations.to_json
|
172
189
|
end
|
@@ -0,0 +1,94 @@
|
|
1
|
+
module TextAlignment; end unless defined? TextAlignment

# Keeps track of the regions of a text that are already taken ("cultivated").
# A region is a two-element array [begin, end]. The map is kept sorted by
# begin position, without overlaps, and with adjacent regions merged.
class TextAlignment::CultivationMap
  # The sorted array of cultivated regions.
  attr_reader :map

  def initialize
    @map = []
  end

  # Adds regions to the map. A region that begins exactly where the previous
  # one ends is merged into it.
  #
  # Raises RuntimeError when two regions overlap; the map is left unchanged
  # in that case.
  def cultivate(regions)
    merged = []
    (@map + regions).sort { |a, b| a[0] <=> b[0] }.each do |region|
      last = merged.last
      if last.nil? || last[1] < region[0]
        merged << region
      elsif last[1] > region[0]
        raise "Overlapping regions: #{last} : #{region}"
      else
        # adjacent regions: extend the previous one, in a fresh array so
        # that the arrays passed in by the caller are not modified
        merged[-1] = [last[0], region[1]]
      end
    end
    @map = merged
  end

  # When the span from position to end_position (position itself by default)
  # lies in a cultivated region, returns the end of that region, i.e. the
  # position from which a search may be resumed. Returns nil otherwise.
  def search_again_position(position, end_position = nil)
    end_position ||= position
    # the first region that ends after end_position
    region = @map.bsearch { |r| end_position < r[1] }
    return nil if region.nil? || region[0] > position

    region[1]
  end

  # Returns the end of the last cultivated region that ends at or before
  # position, or nil when there is none.
  def last_cultivated_position(position)
    ridx = @map.rindex { |r| r[1] <= position }
    ridx.nil? ? nil : @map[ridx][1]
  end

  # Returns the beginning of the first cultivated region that begins after
  # position, or nil when there is none.
  def next_cultivated_position(position)
    region = @map.bsearch { |r| position < r[0] }
    region.nil? ? nil : region[0]
  end

  # Returns the cultivated regions that end or begin within the given region.
  # NOTE(review): a cultivated region that strictly contains the given region
  # neither ends nor begins within it, so it is not returned -- confirm that
  # callers rule this case out beforehand (#index does, through
  # search_again_position).
  def in_regions(region)
    @map.select { |r| (r[1] > region[0] && r[1] <= region[1]) || (r[0] < region[1] && r[0] >= region[0]) }
  end

  # Classifies the given region against the map. Returns a pair [state, span]:
  #   [:open, region]        no cultivated region ends or begins within it
  #   [:middle_closed, span] it is open at both its front and its rear
  #   [:front_open, span]    only its front is open
  #   [:rear_open, span]     only its rear is open
  #   [:closed, nil]         neither end is open
  def region_state(region)
    closed_parts = in_regions(region)
    return [:open, region] if closed_parts.empty?

    front = front_open?(region, closed_parts)
    rear = rear_open?(region, closed_parts)

    if front && rear
      [:middle_closed, [closed_parts.first[1], closed_parts.last[0]]]
    elsif front
      [:front_open, [region[0], closed_parts.first[0]]]
    elsif rear
      [:rear_open, [closed_parts.last[1], region[1]]]
    else
      [:closed, nil]
    end
  end

  # Like String#index, but skips the occurrences of target that touch a
  # cultivated region. Returns the begin position of the first occurrence
  # found in an open region at or after position, or nil.
  def index(target, string, position)
    length = target.length
    loop do
      _begin = string.index(target, position)
      break if _begin.nil?

      # the occurrence begins inside a cultivated region: resume after that region
      position = search_again_position(_begin)
      next unless position.nil?

      break _begin if region_state([_begin, _begin + length])[0] == :open

      position = _begin + 1
    end
  end

  private

  # True when the first closed part begins after the beginning of the region.
  def front_open?(region, closed_parts)
    closed_parts.first[0] > region[0]
  end

  # True when the last closed part ends before the end of the region.
  def rear_open?(region, closed_parts)
    closed_parts.last[1] < region[1]
  end
end
|
@@ -5,7 +5,7 @@ require 'text_alignment/find_divisions'
|
|
5
5
|
require 'text_alignment/lcs_comparison'
|
6
6
|
require 'text_alignment/lcs_alignment'
|
7
7
|
require 'text_alignment/glcs_alignment'
|
8
|
-
require 'text_alignment/
|
8
|
+
require 'text_alignment/char_mapping'
|
9
9
|
|
10
10
|
module TextAlignment; end unless defined? TextAlignment
|
11
11
|
|
@@ -106,7 +106,7 @@ if __FILE__ == $0
|
|
106
106
|
|
107
107
|
dictionary = [["β", "beta"]]
|
108
108
|
# align = TextAlignment::TextAlignment.new(str1, str2)
|
109
|
-
align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::
|
109
|
+
align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::CHAR_MAPPING)
|
110
110
|
p align.common_elements
|
111
111
|
p align.mapped_elements
|
112
112
|
end
|
@@ -6,7 +6,7 @@ require 'text_alignment/lcs_comparison'
|
|
6
6
|
require 'text_alignment/lcs_alignment'
|
7
7
|
require 'text_alignment/lcs_cdiff'
|
8
8
|
require 'text_alignment/glcs_alignment'
|
9
|
-
require 'text_alignment/
|
9
|
+
require 'text_alignment/char_mapping'
|
10
10
|
|
11
11
|
module TextAlignment; end unless defined? TextAlignment
|
12
12
|
|
@@ -20,7 +20,9 @@ class TextAlignment::MixedAlignment
|
|
20
20
|
# Computes the mixed alignment of _str1 against _str2.
#
# _str1, _str2 - the strings to align; they are duplicated before being
#                handed to _compute_mixed_alignment.
# _mappings    - the character mappings to use; TextAlignment::CHAR_MAPPING
#                is used when it is nil.
#
# Raises ArgumentError when either string is nil.
def initialize(_str1, _str2, _mappings = nil)
  raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?

  # honor the mappings given by the caller; fall back to the default table otherwise
  mappings = _mappings || TextAlignment::CHAR_MAPPING
  str1 = _str1.dup
  str2 = _str2.dup

  _compute_mixed_alignment(str1, str2, mappings)
end
|
@@ -63,7 +65,7 @@ class TextAlignment::MixedAlignment
|
|
63
65
|
end
|
64
66
|
|
65
67
|
cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
|
66
|
-
@similarity =
|
68
|
+
@similarity = compute_similarity(str1, str2, @sdiff)
|
67
69
|
@str1_match_initial = cmp.str1_match_initial
|
68
70
|
@str1_match_final = cmp.str1_match_final
|
69
71
|
@str2_match_initial = cmp.str2_match_initial
|
@@ -139,4 +141,20 @@ class TextAlignment::MixedAlignment
|
|
139
141
|
@position_map_end = posmap_end.sort.to_h
|
140
142
|
end
|
141
143
|
|
144
|
+
# Scores an alignment from its sdiff, an enumerable of changes responding to
# #action and #old_element (one character of the first string, or nil).
#
# The score is coverage * fragmentation rate, where
# * coverage is the share of the non-whitespace characters of the first
#   string that are matched (action '='), and
# * the fragmentation rate is the number of whitespace characters of the
#   first string divided by the number of matched fragments, a fragment
#   being a maximal run of matched non-whitespace characters.
#
# s1 and s2 are accepted but not used.
#
# Returns 0 when sdiff is nil, and 0.0 when no non-whitespace character is
# matched (both ratios would otherwise have a zero denominator and the
# result would be NaN).
def compute_similarity(s1, s2, sdiff)
  return 0 if sdiff.nil?

  # recoverability
  count_nws = sdiff.count { |d| d.old_element =~ /\S/ }
  count_nws_match = sdiff.count { |d| d.action == '=' && d.old_element =~ /\S/ }

  # Without any matched non-whitespace character there is no fragment either,
  # so both denominators below could be zero.
  return 0.0 if count_nws_match.zero?

  coverage = count_nws_match.to_f / count_nws

  # fragmentation rate
  count_ofrag = sdiff.count { |d| d.old_element =~ /\s/ }
  # matched whitespace is turned into a separator so that it splits the runs of '='
  actions = sdiff.map { |d| d.action == '=' && d.old_element =~ /\s/ ? ' ' : d.action }
  count_frag = actions.join.scan(/=+/).count
  rate_frag = count_ofrag.to_f / count_frag

  coverage * rate_frag
end
|
142
160
|
end
|
@@ -2,6 +2,7 @@
|
|
2
2
|
require 'text_alignment/constants'
|
3
3
|
require 'text_alignment/anchor_finder'
|
4
4
|
require 'text_alignment/mixed_alignment'
|
5
|
+
require 'text_alignment/cultivation_map'
|
5
6
|
|
6
7
|
module TextAlignment; end unless defined? TextAlignment
|
7
8
|
|
@@ -10,253 +11,71 @@ class TextAlignment::TextAlignment
|
|
10
11
|
attr_reader :similarity
|
11
12
|
attr_reader :lost_annotations
|
12
13
|
|
13
|
-
|
14
|
-
|
14
|
+
# Initialize with a reference text, against which texts will be aligned
|
15
|
+
def initialize(reference_text, to_prevent_overlap = false)
|
16
|
+
raise ArgumentError, "nil text" if reference_text.nil?
|
15
17
|
|
16
|
-
@
|
17
|
-
@
|
18
|
-
@
|
18
|
+
@original_reference_text = reference_text
|
19
|
+
@rtext_mapping = TextAlignment::CharMapping.new(reference_text)
|
20
|
+
@mapped_reference_text = @rtext_mapping.mapped_text
|
21
|
+
@to_prevent_overlap = to_prevent_overlap
|
19
22
|
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
return
|
25
|
-
end
|
23
|
+
@original_text = nil
|
24
|
+
@block_alignment = nil
|
25
|
+
@cultivation_map = TextAlignment::CultivationMap.new
|
26
|
+
end
|
26
27
|
|
27
|
-
|
28
|
-
|
28
|
+
def align(text, denotations = nil)
|
29
|
+
# To maintain the cultivation map
|
30
|
+
update_cultivation_map if @to_prevent_overlap
|
29
31
|
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
last[:source][:end] = block[:source][:end]
|
35
|
-
last[:target][:end] = block[:target][:end]
|
36
|
-
else
|
37
|
-
blocks << block.merge(alignment: :block, delta: block[:target][:begin] - block[:source][:begin])
|
38
|
-
end
|
32
|
+
# In case the input text is the same as the previous one, reuse the previous text mapping
|
33
|
+
unless @original_text && @original_text == text
|
34
|
+
@original_text = text
|
35
|
+
@text_mapping = TextAlignment::CharMapping.new(text)
|
39
36
|
end
|
40
37
|
|
41
|
-
|
42
|
-
|
43
|
-
# puts
|
44
|
-
# exit
|
45
|
-
# blocks.each do |b|
|
46
|
-
# p [b[:source], b[:target]]
|
47
|
-
# puts "---"
|
48
|
-
# puts str1[b[:source][:begin] ... b[:source][:end]]
|
49
|
-
# puts "---"
|
50
|
-
# puts str2[b[:target][:begin] ... b[:target][:end]]
|
51
|
-
# puts "====="
|
52
|
-
# puts
|
53
|
-
# end
|
54
|
-
# puts "-=-=-=-=-"
|
55
|
-
# puts
|
56
|
-
|
57
|
-
## to fill the gaps
|
58
|
-
last_block = nil
|
59
|
-
blocks2 = blocks.inject([]) do |sum, block|
|
60
|
-
b1 = last_block ? last_block[:source][:end] : 0
|
61
|
-
e1 = block[:source][:begin]
|
62
|
-
|
63
|
-
sum += if b1 == e1
|
64
|
-
[block]
|
65
|
-
else
|
66
|
-
b2 = last_block ? last_block[:target][:end] : 0
|
67
|
-
e2 = block[:target][:begin]
|
68
|
-
|
69
|
-
if b2 == e2
|
70
|
-
[
|
71
|
-
{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
|
72
|
-
block
|
73
|
-
]
|
74
|
-
else
|
75
|
-
if b1 == 0 && b2 == 0
|
76
|
-
len_buffer = (e1 * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
77
|
-
b2 = e2 - len_buffer if e2 > len_buffer
|
78
|
-
end
|
79
|
-
|
80
|
-
_str1 = str1[b1 ... e1]
|
81
|
-
_str2 = str2[b2 ... e2]
|
82
|
-
|
83
|
-
if _str1.strip.empty? || _str2.strip.empty?
|
84
|
-
[
|
85
|
-
{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
|
86
|
-
block
|
87
|
-
]
|
88
|
-
else
|
89
|
-
local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations) << block
|
90
|
-
end
|
91
|
-
end
|
92
|
-
end
|
38
|
+
@mapped_text = @text_mapping.mapped_text
|
39
|
+
denotations_mapped = @text_mapping.enmap_denotations(denotations)
|
93
40
|
|
94
|
-
|
95
|
-
|
96
|
-
|
41
|
+
## To generate the block_alignment of the input text against the reference text
|
42
|
+
# Initialization
|
43
|
+
# @block_alignment = {text: @original_text, reference_text: @original_reference_text, denotations: denotations}
|
44
|
+
@block_alignment = {text: @mapped_text, reference_text: @mapped_reference_text, denotations: denotations}
|
97
45
|
|
98
|
-
#
|
99
|
-
|
100
|
-
|
46
|
+
# Generation
|
47
|
+
@block_alignment[:blocks] = if r = whole_block_alignment(@mapped_text, @mapped_reference_text, @cultivation_map)
|
48
|
+
r
|
101
49
|
else
|
102
|
-
|
103
|
-
if b1 < str1.length
|
104
|
-
e1 = str1.length
|
105
|
-
|
106
|
-
b2 = last_block[:target][:end]
|
107
|
-
if b2 < str2.length
|
108
|
-
len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
109
|
-
e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
|
110
|
-
local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
|
111
|
-
else
|
112
|
-
[{source:{begin:last_block[:source][:end], end:str1.length}, alignment: :empty}]
|
113
|
-
end
|
114
|
-
else
|
115
|
-
[]
|
116
|
-
end
|
50
|
+
find_block_alignment(@mapped_text, @mapped_reference_text, denotations_mapped, @cultivation_map)
|
117
51
|
end
|
118
|
-
|
119
|
-
@block_alignment[:blocks] = blocks2
|
120
52
|
end
|
121
53
|
|
122
|
-
def
|
123
|
-
|
124
|
-
block_begin = str2.index(str1)
|
125
|
-
unless block_begin.nil?
|
126
|
-
return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
127
|
-
end
|
54
|
+
def update_cultivation_map
|
55
|
+
return if @block_alignment.nil? || @block_alignment[:blocks].nil?
|
128
56
|
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
nil
|
135
|
-
end
|
136
|
-
|
137
|
-
def local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations = nil)
|
138
|
-
block2 = str2[b2 ... e2]
|
139
|
-
|
140
|
-
## term-based alignment
|
141
|
-
tblocks = if denotations
|
142
|
-
ds_in_scope = denotations.select{|d| d[:span][:begin] >= b1 && d[:span][:end] <= e1}.
|
143
|
-
sort{|d1, d2| d1[:span][:begin] <=> d2[:span][:begin] || d2[:span][:end] <=> d1[:span][:end] }.
|
144
|
-
map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
|
145
|
-
|
146
|
-
position = 0
|
147
|
-
tblocks = ds_in_scope.map do |term|
|
148
|
-
lex = term[:lex]
|
149
|
-
r = block2.index(lex, position)
|
150
|
-
if r.nil?
|
151
|
-
position = nil
|
152
|
-
break
|
153
|
-
end
|
154
|
-
position = r + lex.length
|
155
|
-
{source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, delta: r - term[:span][:begin]}
|
156
|
-
end
|
157
|
-
|
158
|
-
# missing term found
|
159
|
-
tblocks = [] if position.nil?
|
160
|
-
|
161
|
-
# redundant matching found
|
162
|
-
unless position.nil?
|
163
|
-
ds_in_scope.each do |term|
|
164
|
-
lex = term[:lex]
|
165
|
-
look_forward = block2.index(lex, position)
|
166
|
-
unless look_forward.nil?
|
167
|
-
tblocks = []
|
168
|
-
break
|
169
|
-
end
|
170
|
-
end
|
171
|
-
end
|
172
|
-
|
173
|
-
tblocks
|
174
|
-
else
|
175
|
-
[]
|
176
|
-
end
|
177
|
-
|
178
|
-
if tblocks.empty?
|
179
|
-
if b1 == 0 && e1 == str1.length
|
180
|
-
if (e1 > 2000) || (e2 > 2000)
|
181
|
-
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
182
|
-
else
|
183
|
-
block1 = str1[b1 ... e1]
|
184
|
-
block2 = str2[b2 ... e2]
|
185
|
-
|
186
|
-
## character-based alignment
|
187
|
-
alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase, @mappings)
|
188
|
-
if alignment.sdiff.nil?
|
189
|
-
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
190
|
-
else
|
191
|
-
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
|
192
|
-
end
|
193
|
-
end
|
57
|
+
## To update the cultivation map
|
58
|
+
newly_cultivated_regions = @block_alignment[:blocks].collect do |b|
|
59
|
+
if b[:alignment] == :block || b[:alignment] == :term
|
60
|
+
[b[:target][:begin], b[:target][:end]]
|
194
61
|
else
|
195
|
-
|
196
|
-
block2 = str2[b2 ... e2]
|
197
|
-
|
198
|
-
## character-based alignment
|
199
|
-
alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase, @mappings)
|
200
|
-
if alignment.sdiff.nil?
|
201
|
-
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
202
|
-
else
|
203
|
-
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
|
204
|
-
end
|
205
|
-
end
|
206
|
-
else
|
207
|
-
last_tblock = nil
|
208
|
-
lblocks = tblocks.inject([]) do |sum, tblock|
|
209
|
-
tb1 = last_tblock ? last_tblock[:source][:end] : b1
|
210
|
-
te1 = tblock[:source][:begin]
|
211
|
-
|
212
|
-
sum += if te1 == tb1
|
213
|
-
[tblock]
|
214
|
-
else
|
215
|
-
tb2 = last_tblock ? last_tblock[:target][:end] : b2
|
216
|
-
te2 = tblock[:target][:begin]
|
217
|
-
|
218
|
-
if b2 == e2
|
219
|
-
[
|
220
|
-
{source:{begin:tb1, end:te1}, alignment: :empty},
|
221
|
-
tblock
|
222
|
-
]
|
223
|
-
else
|
224
|
-
[
|
225
|
-
{source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty},
|
226
|
-
tblock
|
227
|
-
]
|
228
|
-
end
|
229
|
-
end
|
230
|
-
|
231
|
-
last_tblock = tblock
|
232
|
-
sum
|
62
|
+
nil
|
233
63
|
end
|
234
|
-
|
235
|
-
if
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, alignment: :empty}
|
240
|
-
end
|
64
|
+
end.compact.inject([]) do |condensed, region|
|
65
|
+
if condensed.empty? || (condensed.last.last + 1 < region.first)
|
66
|
+
condensed.push region
|
67
|
+
else
|
68
|
+
condensed.last[1] = region.last
|
241
69
|
end
|
242
|
-
|
243
|
-
lblocks
|
70
|
+
condensed
|
244
71
|
end
|
245
|
-
end
|
246
|
-
|
247
72
|
|
248
|
-
|
249
|
-
position = 0
|
250
|
-
len = target.len
|
251
|
-
Enumerator.new do |yielder|
|
252
|
-
while idx = str.index(target, position)
|
253
|
-
yielder << idx
|
254
|
-
position = idx + len
|
255
|
-
end
|
256
|
-
end
|
73
|
+
@cultivation_map.cultivate(newly_cultivated_regions)
|
257
74
|
end
|
258
75
|
|
259
|
-
def transform_begin_position(
|
76
|
+
def transform_begin_position(_begin_position)
|
77
|
+
begin_position = @text_mapping.enmap_position(_begin_position)
|
78
|
+
|
260
79
|
i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
|
261
80
|
block = @block_alignment[:blocks][i]
|
262
81
|
|
@@ -272,9 +91,13 @@ class TextAlignment::TextAlignment
|
|
272
91
|
r = block[:alignment].transform_begin_position(begin_position - block[:source][:begin])
|
273
92
|
r.nil? ? nil : r + block[:target][:begin]
|
274
93
|
end
|
94
|
+
|
95
|
+
@rtext_mapping.demap_position(b)
|
275
96
|
end
|
276
97
|
|
277
|
-
def transform_end_position(
|
98
|
+
def transform_end_position(_end_position)
|
99
|
+
end_position = @text_mapping.enmap_position(_end_position)
|
100
|
+
|
278
101
|
i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
|
279
102
|
block = @block_alignment[:blocks][i]
|
280
103
|
|
@@ -290,6 +113,8 @@ class TextAlignment::TextAlignment
|
|
290
113
|
r = block[:alignment].transform_end_position(end_position - block[:source][:begin])
|
291
114
|
r.nil? ? nil : r + block[:target][:begin]
|
292
115
|
end
|
116
|
+
|
117
|
+
@rtext_mapping.demap_position(e)
|
293
118
|
end
|
294
119
|
|
295
120
|
def transform_a_span(span)
|
@@ -308,7 +133,7 @@ class TextAlignment::TextAlignment
|
|
308
133
|
source = {begin:d.begin, end:d.end}
|
309
134
|
d.begin = transform_begin_position(d.begin);
|
310
135
|
d.end = transform_end_position(d.end);
|
311
|
-
raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @
|
136
|
+
raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_reference_text.length
|
312
137
|
rescue
|
313
138
|
@lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
|
314
139
|
d.begin = nil
|
@@ -324,7 +149,7 @@ class TextAlignment::TextAlignment
|
|
324
149
|
|
325
150
|
r = hdenotations.collect do |d|
|
326
151
|
t = transform_a_span(d[:span])
|
327
|
-
raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @
|
152
|
+
raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_reference_text.length
|
328
153
|
new_d = d.dup.merge({span:t})
|
329
154
|
rescue
|
330
155
|
@lost_annotations << {source: d[:span], target:t}
|
@@ -335,8 +160,8 @@ class TextAlignment::TextAlignment
|
|
335
160
|
end
|
336
161
|
|
337
162
|
def alignment_show
|
338
|
-
stext = @
|
339
|
-
ttext = @
|
163
|
+
stext = @mapped_text
|
164
|
+
ttext = @mapped_reference_text
|
340
165
|
|
341
166
|
show = ''
|
342
167
|
@block_alignment[:blocks].each do |a|
|
@@ -392,9 +217,192 @@ class TextAlignment::TextAlignment
|
|
392
217
|
|
393
218
|
"***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
|
394
219
|
"[#{astr1}]\n" +
|
395
|
-
"[#{astr2}]\n\n"
|
220
|
+
"[#{astr2.gsub("\n", " ")}]\n\n"
|
396
221
|
end
|
397
222
|
end
|
398
223
|
show
|
399
224
|
end
|
225
|
+
|
226
|
+
private
|
227
|
+
|
228
|
+
def find_block_alignment(str1, str2, denotations, cultivation_map)
|
229
|
+
## to find block alignments
|
230
|
+
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map)
|
231
|
+
|
232
|
+
blocks = []
|
233
|
+
while block = anchor_finder.get_next_anchor
|
234
|
+
last = blocks.last
|
235
|
+
if last && (block[:source][:begin] == last[:source][:end] + 1) && (block[:target][:begin] == last[:target][:end] + 1)
|
236
|
+
last[:source][:end] = block[:source][:end]
|
237
|
+
last[:target][:end] = block[:target][:end]
|
238
|
+
else
|
239
|
+
blocks << block.merge(alignment: :block, delta: block[:target][:begin] - block[:source][:begin])
|
240
|
+
end
|
241
|
+
end
|
242
|
+
|
243
|
+
# pp blocks
|
244
|
+
# puts "-----"
|
245
|
+
# puts
|
246
|
+
# exit
|
247
|
+
# blocks.each do |b|
|
248
|
+
# p [b[:source], b[:target]]
|
249
|
+
# puts "---"
|
250
|
+
# puts str1[b[:source][:begin] ... b[:source][:end]]
|
251
|
+
# puts "---"
|
252
|
+
# puts str2[b[:target][:begin] ... b[:target][:end]]
|
253
|
+
# puts "====="
|
254
|
+
# puts
|
255
|
+
# end
|
256
|
+
# puts "-=-=-=-=-"
|
257
|
+
# puts
|
258
|
+
|
259
|
+
## To fill the gaps
|
260
|
+
## lblock: last block, cblock: current block
|
261
|
+
lblock = nil
|
262
|
+
blocks2 = (blocks + [nil]).inject([]) do |sum, cblock|
|
263
|
+
b1 = lblock.nil? ? 0 : lblock[:source][:end]
|
264
|
+
e1 = cblock.nil? ? str1.length : cblock[:source][:begin]
|
265
|
+
|
266
|
+
if b1 < e1
|
267
|
+
b2 = lblock.nil? ? 0 : lblock[:target][:end]
|
268
|
+
e2 = cblock.nil? ? str2.length : cblock[:target][:begin]
|
269
|
+
_str1 = str1[b1 ... e1]
|
270
|
+
_str2 = str2[b2 ... e2]
|
271
|
+
|
272
|
+
sum += if _str1.strip.empty? || _str2.strip.empty?
|
273
|
+
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
274
|
+
else
|
275
|
+
len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
276
|
+
region_state, state_region = cultivation_map.region_state([b2, e2])
|
277
|
+
case region_state
|
278
|
+
when :closed
|
279
|
+
[]
|
280
|
+
when :front_open
|
281
|
+
oe2 = state_region[1]
|
282
|
+
me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
|
283
|
+
local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
|
284
|
+
when :rear_open
|
285
|
+
ob2 = state_region[0]
|
286
|
+
mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
|
287
|
+
local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
|
288
|
+
when :middle_closed
|
289
|
+
oe2 = state_region[0]
|
290
|
+
me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
|
291
|
+
attempt1 = local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
|
292
|
+
if attempt1.empty?
|
293
|
+
ob2 = state_region[1]
|
294
|
+
mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
|
295
|
+
local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
|
296
|
+
else
|
297
|
+
attempt1
|
298
|
+
end
|
299
|
+
else # :open
|
300
|
+
if (e2 - b2) > len_buffer
|
301
|
+
attempt1 = local_alignment(str1, b1, e1, str2, b2, b2 + len_buffer, denotations, cultivation_map)
|
302
|
+
if attempt1.empty?
|
303
|
+
local_alignment(str1, b1, e1, str2, e2 - len_buffer, e2, denotations, cultivation_map)
|
304
|
+
else
|
305
|
+
attempt1
|
306
|
+
end
|
307
|
+
else
|
308
|
+
local_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
|
309
|
+
end
|
310
|
+
end
|
311
|
+
end
|
312
|
+
end
|
313
|
+
|
314
|
+
lblock = cblock
|
315
|
+
cblock.nil? ? sum : sum << cblock
|
316
|
+
end
|
317
|
+
|
318
|
+
end
|
319
|
+
|
320
|
+
def whole_block_alignment(str1, str2, cultivation_map)
|
321
|
+
block_begin = cultivation_map.index(str1, str2, 0)
|
322
|
+
return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}] unless block_begin.nil?
|
323
|
+
|
324
|
+
block_begin = cultivation_map.index(str1.downcase, str2.downcase, 0)
|
325
|
+
return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}] unless block_begin.nil?
|
326
|
+
|
327
|
+
nil
|
328
|
+
end
|
329
|
+
|
330
|
+
def local_alignment(str1, b1, e1, str2, b2, e2, denotations = nil, cultivation_map)
|
331
|
+
tblocks = term_based_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
|
332
|
+
if tblocks.empty?
|
333
|
+
lcs_alignment(str1, b1, e1, str2, b2, e2, cultivation_map)
|
334
|
+
else
|
335
|
+
tblocks
|
336
|
+
end
|
337
|
+
end
|
338
|
+
|
339
|
+
def term_based_alignment(str1, b1, e1, str2, b2, e2, denotations = nil, cultivation_map)
|
340
|
+
str2_block = str2[0 ... e2]
|
341
|
+
|
342
|
+
## term-based alignment
|
343
|
+
tblocks = if denotations
|
344
|
+
denotations_in_scope = denotations.select{|d| d[:span][:begin] >= b1 && d[:span][:end] <= e1}.
|
345
|
+
sort{|d1, d2| d1[:span][:begin] <=> d2[:span][:begin] || d2[:span][:end] <=> d1[:span][:end] }.
|
346
|
+
map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
|
347
|
+
|
348
|
+
search_position = b2
|
349
|
+
_tblocks = denotations_in_scope.map do |denotation|
|
350
|
+
lex = denotation[:lex]
|
351
|
+
term_begin = cultivation_map.index(lex, str2_block, search_position)
|
352
|
+
break [] if term_begin.nil? # break the loop if a missing term is found
|
353
|
+
search_position = term_begin + lex.length
|
354
|
+
{source:denotation[:span], target:{begin:term_begin, end:term_begin + lex.length}, alignment: :term, similarity: 0.9, delta: term_begin - denotation[:span][:begin]}
|
355
|
+
end
|
356
|
+
|
357
|
+
# redundant matching found
|
358
|
+
unless _tblocks.empty?
|
359
|
+
search_position = _tblocks.last[:target][:end]
|
360
|
+
denotations_in_scope.each do |term|
|
361
|
+
look_forward = cultivation_map.index(term[:lex], str2_block, search_position)
|
362
|
+
unless look_forward.nil?
|
363
|
+
_tblocks = []
|
364
|
+
break
|
365
|
+
end
|
366
|
+
end
|
367
|
+
end
|
368
|
+
|
369
|
+
_tblocks
|
370
|
+
else
|
371
|
+
[]
|
372
|
+
end
|
373
|
+
|
374
|
+
ltblock = nil
|
375
|
+
tblocks2 = (tblocks + [nil]).inject([]) do |sum, ctblock|
|
376
|
+
tb1 = ltblock.nil? ? b1 : ltblock[:source][:end]
|
377
|
+
te1 = ctblock.nil? ? e1 : ctblock[:source][:begin]
|
378
|
+
|
379
|
+
if te1 > tb1
|
380
|
+
tb2 = ltblock.nil? ? b2 : ltblock[:target][:end]
|
381
|
+
te2 = ctblock.nil? ? e2 : ctblock[:target][:begin]
|
382
|
+
sum << {source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty}
|
383
|
+
end
|
384
|
+
|
385
|
+
ltblock = ctblock
|
386
|
+
ctblock.nil? ? sum : sum << ctblock
|
387
|
+
end
|
388
|
+
|
389
|
+
tblocks2
|
390
|
+
end
|
391
|
+
|
392
|
+
def lcs_alignment(str1, b1, e1, str2, b2, e2, cultivation_map)
|
393
|
+
source = {begin:b1, end:e1}
|
394
|
+
target = {begin:b2, end:e2}
|
395
|
+
|
396
|
+
if (e1 - b1) > 2000
|
397
|
+
[{source:source, target:target, alignment: :empty}]
|
398
|
+
else
|
399
|
+
alignment = TextAlignment::MixedAlignment.new(str1[b1 ... e1].downcase, str2[b2 ... e2].downcase)
|
400
|
+
if alignment.similarity < 0.5
|
401
|
+
[{source:source, target:target, alignment: :empty}]
|
402
|
+
else
|
403
|
+
[{source:source, target:target, alignment: alignment, similarity: alignment.similarity}]
|
404
|
+
end
|
405
|
+
end
|
406
|
+
end
|
407
|
+
|
400
408
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.11.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-03-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|
@@ -77,7 +77,9 @@ files:
|
|
77
77
|
- lib/text_alignment.rb
|
78
78
|
- lib/text_alignment/anchor_finder.rb
|
79
79
|
- lib/text_alignment/approximate_fit.rb
|
80
|
+
- lib/text_alignment/char_mapping.rb
|
80
81
|
- lib/text_alignment/constants.rb
|
82
|
+
- lib/text_alignment/cultivation_map.rb
|
81
83
|
- lib/text_alignment/find_divisions.rb
|
82
84
|
- lib/text_alignment/glcs_alignment.rb
|
83
85
|
- lib/text_alignment/glcs_alignment_fast.rb
|
@@ -86,7 +88,6 @@ files:
|
|
86
88
|
- lib/text_alignment/lcs_cdiff.rb
|
87
89
|
- lib/text_alignment/lcs_comparison.rb
|
88
90
|
- lib/text_alignment/lcs_min.rb
|
89
|
-
- lib/text_alignment/mappings.rb
|
90
91
|
- lib/text_alignment/mixed_alignment.rb
|
91
92
|
- lib/text_alignment/text_alignment.rb
|
92
93
|
- lib/text_alignment/version.rb
|