text_alignment 0.9.1 → 0.10.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/align_annotations +10 -6
- data/lib/text_alignment/anchor_finder.rb +130 -62
- data/lib/text_alignment/{mappings.rb → char_mapping.rb} +90 -75
- data/lib/text_alignment/cultivation_map.rb +19 -0
- data/lib/text_alignment/glcs_alignment_fast.rb +2 -2
- data/lib/text_alignment/mixed_alignment.rb +15 -3
- data/lib/text_alignment/text_alignment.rb +238 -180
- data/lib/text_alignment/version.rb +1 -1
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 710fa3dfe07e268c62de77c67abb52e1893a36a9c0fbdfbf7400328ab7f8358a
|
4
|
+
data.tar.gz: 31a086666978c4776a60b0ce19fde48beeb8a5bce599b7572f20089cfb5c7500
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d0a752d2203f65e48b4f10ff29eb29a77504e7cef4da1dd3602e2ea5107a99b7edff4c3609c79c7dcba0f9cc458780dc4788ff8123a781a4d77ae89236170bf5
|
7
|
+
data.tar.gz: '094ff5f662ccdc2fc14f81549126362be49fd64610f994c6ba5c4457fbbbececc1931257589ebde49211c2bce744ca033bfe3673c8af5c8a01af7679bfb626cc'
|
data/bin/align_annotations
CHANGED
@@ -26,8 +26,9 @@ def read_text(filename)
|
|
26
26
|
end
|
27
27
|
end
|
28
28
|
|
29
|
-
def align_denotations(denotations, source_text, target_text, debug = false)
|
30
|
-
alignment = TextAlignment::TextAlignment.new(source_text, target_text, denotations)
|
29
|
+
def align_denotations(denotations, source_text, target_text, debug = false, cm = nil)
|
30
|
+
alignment = TextAlignment::TextAlignment.new(source_text, target_text, denotations, cm)
|
31
|
+
cm = alignment.cultivation_map
|
31
32
|
new_denotations = alignment.transform_hdenotations(denotations)
|
32
33
|
|
33
34
|
if debug
|
@@ -47,7 +48,7 @@ def align_denotations(denotations, source_text, target_text, debug = false)
|
|
47
48
|
warn
|
48
49
|
|
49
50
|
# return target annotations
|
50
|
-
new_denotations
|
51
|
+
[new_denotations, cm]
|
51
52
|
end
|
52
53
|
|
53
54
|
def align_mannotations(source_annotations, target_text, debug = false)
|
@@ -58,11 +59,13 @@ def align_mannotations(source_annotations, target_text, debug = false)
|
|
58
59
|
idnum_attributes = 0
|
59
60
|
idnum_modifications = 0
|
60
61
|
|
62
|
+
cm = nil
|
61
63
|
source_annotations.each_with_index do |annotations, i|
|
62
64
|
if annotations.has_key?(:denotations) && !annotations[:denotations].empty?
|
63
65
|
ididx = {}
|
64
66
|
warn "[#{i}]-=-=-=-=-"
|
65
|
-
denotations = align_denotations(annotations[:denotations], annotations[:text], target_text, debug)
|
67
|
+
denotations, cm = align_denotations(annotations[:denotations], annotations[:text], target_text, debug, cm)
|
68
|
+
|
66
69
|
denotations.each do |d|
|
67
70
|
reid = 'T' + (idnum_denotations += 1).to_s
|
68
71
|
ididx[d[:id]] = reid
|
@@ -114,8 +117,9 @@ target_text = read_text(ARGV[1])
|
|
114
117
|
target_annotations = if source_annotations.class == Array
|
115
118
|
align_mannotations(source_annotations, target_text, false)
|
116
119
|
else
|
117
|
-
denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], target_text, false)
|
120
|
+
denotations, cm = align_denotations(source_annotations[:denotations], source_annotations[:text], target_text, false)
|
121
|
+
# denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], target_text, true)
|
118
122
|
source_annotations.merge({text:target_text, denotations:denotations})
|
119
123
|
end
|
120
124
|
|
121
|
-
|
125
|
+
puts target_annotations.to_json
|
@@ -6,76 +6,65 @@ module TextAlignment; end unless defined? TextAlignment
|
|
6
6
|
|
7
7
|
class TextAlignment::AnchorFinder
|
8
8
|
|
9
|
-
def initialize(source_str, target_str,
|
10
|
-
@
|
11
|
-
@size_window = _size_window || TextAlignment::SIZE_WINDOW
|
12
|
-
@sim_threshold = _text_similiarity_threshold || TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
13
|
-
|
14
|
-
@reverse = (target_str.length < source_str.length)
|
15
|
-
|
16
|
-
@s1, @s2 = if @reverse
|
9
|
+
def initialize(source_str, target_str, cultivation_map)
|
10
|
+
@s1, @s2 = if reverse?(source_str, target_str)
|
17
11
|
[target_str.downcase, source_str.downcase]
|
18
12
|
else
|
19
13
|
[source_str.downcase, target_str.downcase]
|
20
14
|
end
|
21
15
|
|
22
|
-
|
23
|
-
@beg_s1 = 0
|
24
|
-
@end_s1_prev = 0
|
25
|
-
@end_s2_prev = 0
|
26
|
-
end
|
27
|
-
|
28
|
-
def get_next_anchor
|
29
|
-
# find the position of an anchor ngram in s1 and s2
|
30
|
-
while @beg_s1 < (@s1.length - @size_ngram)
|
31
|
-
if [' ', "\n", "\t"].include? @s1[@beg_s1]
|
32
|
-
@beg_s1 += 1
|
33
|
-
next
|
34
|
-
end
|
35
|
-
anchor = @s1[@beg_s1, @size_ngram]
|
16
|
+
@cultivation_map = cultivation_map
|
36
17
|
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
break if @beg_s1 > 0 && @beg_s2 > 0 && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
|
18
|
+
@size_ngram = TextAlignment::SIZE_NGRAM
|
19
|
+
@size_window = TextAlignment::SIZE_WINDOW
|
20
|
+
@sim_threshold = TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
21
|
+
@pos_s1_final_possible_begin = @s1.length - @size_ngram - 1
|
42
22
|
|
43
|
-
|
44
|
-
|
23
|
+
# positions of last match
|
24
|
+
@pos_s1_last_match = 0
|
25
|
+
@pos_s2_last_match = 0
|
26
|
+
end
|
45
27
|
|
46
|
-
|
47
|
-
|
28
|
+
def reverse?(source_str = nil, target_str = nil)
|
29
|
+
unless source_str.nil?
|
30
|
+
@reverse_p = target_str.length < source_str.length
|
31
|
+
end
|
32
|
+
@reverse_p
|
33
|
+
end
|
48
34
|
|
49
|
-
|
50
|
-
|
35
|
+
def get_next_anchor
|
36
|
+
# To find the beginning positions of an anchor ngram in s1 and s2, beginning from the last positions matched
|
37
|
+
beg_s2 = for beg_s1 in @pos_s1_last_match .. @pos_s1_final_possible_begin
|
51
38
|
|
52
|
-
|
39
|
+
# To skip whitespace letters
|
40
|
+
next if [' ', "\n", "\t"].include? @s1[beg_s1]
|
53
41
|
|
54
|
-
|
42
|
+
_beg_s2 = get_beg_s2(beg_s1)
|
43
|
+
break _beg_s2 unless _beg_s2.nil?
|
55
44
|
end
|
56
45
|
|
57
|
-
return nil
|
46
|
+
# To return nil when it fails to find an anchor
|
47
|
+
return nil if beg_s2.class == Range
|
58
48
|
|
59
|
-
# extend the block
|
60
|
-
b1 =
|
61
|
-
b2 =
|
62
|
-
while b1 >= @
|
49
|
+
# To extend the block to the left
|
50
|
+
b1 = beg_s1
|
51
|
+
b2 = beg_s2
|
52
|
+
while b1 >= @pos_s1_last_match && b2 >= @pos_s2_last_match && @s1[b1] == @s2[b2]
|
63
53
|
b1 -= 1; b2 -= 1
|
64
54
|
end
|
65
|
-
|
66
55
|
b1 += 1; b2 += 1
|
67
56
|
|
68
|
-
|
69
|
-
|
57
|
+
# To extend the block to the right
|
58
|
+
e1 = beg_s1 + @size_ngram
|
59
|
+
e2 = beg_s2 + @size_ngram
|
70
60
|
while @s1[e1] && @s1[e1] == @s2[e2]
|
71
61
|
e1 += 1; e2 += 1
|
72
62
|
end
|
73
63
|
|
74
|
-
@
|
75
|
-
@
|
76
|
-
@beg_s1 = e1
|
64
|
+
@pos_s1_last_match = e1
|
65
|
+
@pos_s2_last_match = e2
|
77
66
|
|
78
|
-
if
|
67
|
+
if reverse?
|
79
68
|
{source:{begin:b2 , end:e2}, target:{begin:b1, end:e1}}
|
80
69
|
else
|
81
70
|
{source:{begin:b1 , end:e1}, target:{begin:b2, end:e2}}
|
@@ -84,14 +73,92 @@ class TextAlignment::AnchorFinder
|
|
84
73
|
|
85
74
|
private
|
86
75
|
|
87
|
-
def
|
88
|
-
#
|
89
|
-
|
76
|
+
def get_beg_s2(beg_s1)
|
77
|
+
# to get the anchor to search for in s2
|
78
|
+
anchor = @s1[beg_s1, @size_ngram]
|
79
|
+
|
80
|
+
# comment out below with the assumption that texts are in the same order
|
81
|
+
# search_position = 0
|
82
|
+
search_position = @pos_s2_last_match
|
83
|
+
|
84
|
+
beg_s2_candidates = find_beg_s2_candidates(anchor, search_position)
|
85
|
+
return nil if beg_s2_candidates.empty?
|
86
|
+
|
87
|
+
find_valid_beg_s2(beg_s1, beg_s2_candidates)
|
88
|
+
end
|
89
|
+
|
90
|
+
# To find beg_s2 which match to the anchor
|
91
|
+
# returns an empty array if the anchor is too frequent
|
92
|
+
def find_beg_s2_candidates(anchor, search_position)
|
93
|
+
candidates = []
|
94
|
+
while _beg_s2 = @s2.index(anchor, search_position)
|
95
|
+
search_again_position = @cultivation_map.search_again_position(_beg_s2)
|
96
|
+
unless search_again_position.nil?
|
97
|
+
search_position = search_again_position
|
98
|
+
next
|
99
|
+
end
|
100
|
+
|
101
|
+
candidates << _beg_s2
|
102
|
+
|
103
|
+
# for speed, skip anchor of high frequency
|
104
|
+
if candidates.length > 5
|
105
|
+
candidates.clear
|
106
|
+
break
|
107
|
+
end
|
108
|
+
|
109
|
+
search_position = _beg_s2 + 1
|
110
|
+
end
|
111
|
+
candidates
|
112
|
+
end
|
113
|
+
|
114
|
+
def find_valid_beg_s2(beg_s1, beg_s2_candidates)
|
115
|
+
valid_beg_s2 = nil
|
116
|
+
|
117
|
+
(10 .. 30).step(10).each do |size_window|
|
118
|
+
valid_beg_s2 = nil
|
119
|
+
|
120
|
+
r = beg_s2_candidates.each do |beg_s2|
|
121
|
+
# if both the beginning points are sufficiently close to the end points of the last match
|
122
|
+
# break if beg_s1 > 0 && beg_s2 > 0 && (beg_s1 - @pos_s1_last_match < 5) && (beg_s2 >= @pos_s2_last_match) && (beg_s2 - @pos_s2_last_match < 5)
|
123
|
+
if beg_s1 > 0 && beg_s2 > 0 && (beg_s1 - @pos_s1_last_match < 5) && (beg_s2 - @pos_s2_last_match < 5)
|
124
|
+
break unless valid_beg_s2.nil?
|
125
|
+
valid_beg_s2 = beg_s2
|
126
|
+
next
|
127
|
+
end
|
128
|
+
|
129
|
+
left_window_s1, left_window_s2 = get_left_windows(beg_s1, beg_s2)
|
130
|
+
if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
|
131
|
+
break unless valid_beg_s2.nil?
|
132
|
+
valid_beg_s2 = beg_s2
|
133
|
+
next
|
134
|
+
end
|
135
|
+
|
136
|
+
right_window_s1, right_window_s2 = get_right_windows(beg_s1, beg_s2)
|
137
|
+
if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
|
138
|
+
break unless valid_beg_s2.nil?
|
139
|
+
valid_beg_s2 = beg_s2
|
140
|
+
next
|
141
|
+
end
|
142
|
+
end
|
143
|
+
|
144
|
+
# r == nil means that the inner loop was broken (multiple candidates had passed the tests)
|
145
|
+
# r != nil means that the inner loop was completed (with or w/o a valid beg_s2 found)
|
146
|
+
break unless r.nil?
|
147
|
+
end
|
148
|
+
|
149
|
+
valid_beg_s2
|
150
|
+
end
|
151
|
+
|
152
|
+
def get_left_windows(beg_s1, beg_s2, size_window = nil)
|
153
|
+
size_window ||= @size_window
|
154
|
+
|
155
|
+
# comment out below with the assumption that the beginning of a document gives a significant locational information
|
156
|
+
# return if @beg_s1 < size_window || @beg_s2 < size_window
|
90
157
|
|
91
158
|
window_s1 = ''
|
92
|
-
loc =
|
159
|
+
loc = beg_s1 - 1
|
93
160
|
count = 0
|
94
|
-
while count <
|
161
|
+
while count < size_window && loc >= 0
|
95
162
|
if @s1[loc] =~ /[0-9a-zA-Z]/
|
96
163
|
window_s1 += @s1[loc]
|
97
164
|
count += 1
|
@@ -100,9 +167,9 @@ class TextAlignment::AnchorFinder
|
|
100
167
|
end
|
101
168
|
|
102
169
|
window_s2 = ''
|
103
|
-
loc =
|
170
|
+
loc = beg_s2 - 1
|
104
171
|
count = 0
|
105
|
-
while count <
|
172
|
+
while count < size_window && loc >= 0
|
106
173
|
if @s2[loc] =~ /[0-9a-zA-Z]/
|
107
174
|
window_s2 += @s2[loc]
|
108
175
|
count += 1
|
@@ -113,15 +180,17 @@ class TextAlignment::AnchorFinder
|
|
113
180
|
[window_s1, window_s2]
|
114
181
|
end
|
115
182
|
|
116
|
-
def get_right_windows
|
183
|
+
def get_right_windows(beg_s1, beg_s2, size_window = nil)
|
184
|
+
size_window ||= @size_window
|
185
|
+
|
117
186
|
# comment out below with the assumption that the end of a document gives significant locational information
|
118
|
-
# return if (@beg_s1 + @size_ngram > (@s1.length -
|
187
|
+
# return if (@beg_s1 + @size_ngram > (@s1.length - size_window)) || (@beg_s2 + @size_ngram > (@s2.length - size_window))
|
119
188
|
|
120
189
|
window_s1 = ''
|
121
|
-
loc =
|
190
|
+
loc = beg_s1 + @size_ngram
|
122
191
|
len_s1 = @s1.length
|
123
192
|
count = 0
|
124
|
-
while count <
|
193
|
+
while count < size_window && loc < len_s1
|
125
194
|
if @s1[loc] =~ /[0-9a-zA-Z]/
|
126
195
|
window_s1 += @s1[loc]
|
127
196
|
count += 1
|
@@ -130,10 +199,10 @@ class TextAlignment::AnchorFinder
|
|
130
199
|
end
|
131
200
|
|
132
201
|
window_s2 = ''
|
133
|
-
loc =
|
202
|
+
loc = beg_s2 + @size_ngram
|
134
203
|
len_s2 = @s2.length
|
135
204
|
count = 0
|
136
|
-
while count <
|
205
|
+
while count < size_window && loc < len_s2
|
137
206
|
if @s2[loc] =~ /[0-9a-zA-Z]/
|
138
207
|
window_s2 += @s2[loc]
|
139
208
|
count += 1
|
@@ -148,5 +217,4 @@ class TextAlignment::AnchorFinder
|
|
148
217
|
return 0 if str1.nil? || str2.nil?
|
149
218
|
String::Similarity.cosine(str1, str2, ngram:ngram_order)
|
150
219
|
end
|
151
|
-
|
152
|
-
end
|
220
|
+
end
|
@@ -1,6 +1,6 @@
|
|
1
1
|
module TextAlignment; end unless defined? TextAlignment
|
2
2
|
|
3
|
-
TextAlignment::
|
3
|
+
TextAlignment::CHAR_MAPPING = [
|
4
4
|
["©", "(c)"], #U+00A9 (Copyright Sign)
|
5
5
|
|
6
6
|
["α", "alpha"], #U+03B1 (greek small letter alpha)
|
@@ -61,9 +61,10 @@ TextAlignment::MAPPINGS = [
|
|
61
61
|
["•", "*"], #U+2022 (bullet)
|
62
62
|
[" ", " "], #U+2009 (thin space)
|
63
63
|
[" ", " "], #U+200A (hair space)
|
64
|
-
[" ", " "], #U+00A0 (
|
64
|
+
[" ", " "], #U+00A0 (Non-Breaking space)
|
65
65
|
[" ", " "], #U+3000 (ideographic space)
|
66
|
-
["
|
66
|
+
["‐", "-"], #U+2010 (Hyphen)
|
67
|
+
["‑", "-"], #U+2011 (Non-Breaking Hyphen)
|
67
68
|
["−", "-"], #U+2212 (minus sign)
|
68
69
|
["–", "-"], #U+2013 (en dash)
|
69
70
|
["′", "'"], #U+2032 (prime)
|
@@ -75,98 +76,112 @@ TextAlignment::MAPPINGS = [
|
|
75
76
|
]
|
76
77
|
|
77
78
|
|
78
|
-
|
79
|
+
class TextAlignment::CharMapping
|
80
|
+
attr_reader :str
|
79
81
|
|
82
|
+
def initialize(_str, char_mapping = nil)
|
83
|
+
char_mapping ||= TextAlignment::CHAR_MAPPING
|
84
|
+
@str, offset_mapping = enmap_str(_str, char_mapping)
|
85
|
+
@index_enmap = offset_mapping.to_h
|
86
|
+
@index_demap = offset_mapping.map{|m| m.reverse}.to_h
|
87
|
+
end
|
88
|
+
|
89
|
+
def enmap_position(position)
|
90
|
+
@index_enmap[position]
|
91
|
+
end
|
80
92
|
|
81
|
-
|
82
|
-
|
83
|
-
|
93
|
+
def demap_position(position)
|
94
|
+
@index_demap[position]
|
95
|
+
end
|
84
96
|
|
85
|
-
|
86
|
-
|
87
|
-
[
|
88
|
-
|
89
|
-
|
90
|
-
characters_to = character_mappings.collect{|m| m[1]}.join
|
91
|
-
characters_to.gsub!(/-/, '\-')
|
97
|
+
def enmap_denotations(_denotations)
|
98
|
+
denotations = _denotations.map do |d|
|
99
|
+
d.dup.merge(span:{begin:enmap_position(d[:span][:begin]), end:enmap_position(d[:span][:end])})
|
100
|
+
end
|
101
|
+
end
|
92
102
|
|
93
|
-
|
94
|
-
str2 = _str2.tr(characters_from, characters_to)
|
103
|
+
private
|
95
104
|
|
96
|
-
|
105
|
+
def enmap_str(_str, char_mapping)
|
106
|
+
str = _str.dup
|
97
107
|
|
98
|
-
|
108
|
+
# To execute the single letter mapping
|
109
|
+
char_mapping.each do |one, long|
|
110
|
+
str.gsub!(one, long) if long.length == 1
|
99
111
|
end
|
100
|
-
end
|
101
112
|
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
if long_to_one_mappings.empty?
|
107
|
-
[_str1, _str2, _mappings]
|
108
|
-
else
|
109
|
-
## long to one character mappings
|
110
|
-
pletters = TextAlignment::PADDING_LETTERS
|
111
|
-
|
112
|
-
# find the padding letter for str1
|
113
|
-
@padding_letter1 = begin
|
114
|
-
i = pletters.index{|l| _str2.index(l).nil?}
|
115
|
-
raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
|
116
|
-
TextAlignment::PADDING_LETTERS[i]
|
117
|
-
end
|
113
|
+
# To get the (location, length) index for replacements
|
114
|
+
loc_len = []
|
115
|
+
char_mapping.each do |one, long|
|
116
|
+
next if long.length == 1
|
118
117
|
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
TextAlignment::PADDING_LETTERS[i]
|
118
|
+
init_next = 0
|
119
|
+
while loc = str.index(long, init_next)
|
120
|
+
loc_len << [loc, long.length]
|
121
|
+
init_next = loc + long.length
|
124
122
|
end
|
125
123
|
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
str1 = if _str2.index(f[0])
|
131
|
-
to = f[0] + (@padding_letter1 * (f[1].length - 1))
|
132
|
-
_str1.gsub(from, to)
|
133
|
-
else
|
134
|
-
_str1
|
135
|
-
end
|
136
|
-
|
137
|
-
str2 = if _str1.index(f[0])
|
138
|
-
to = f[0] + (@padding_letter2 * (f[1].length - 1))
|
139
|
-
_str2.gsub(from, to)
|
140
|
-
else
|
141
|
-
_str2
|
142
|
-
end
|
143
|
-
end
|
144
|
-
mappings = _mappings.select{|m| m[0].length > 1 || m[1].length == 1}
|
124
|
+
# a workaround to avoid messing-up due to embedding
|
125
|
+
str.gsub!(long, one * long.length)
|
126
|
+
end
|
145
127
|
|
146
|
-
|
128
|
+
# To get the (location, length) index for consecutive whitespace sequences
|
129
|
+
init_next = 0
|
130
|
+
while loc = str.index(/\s{2,}/, init_next)
|
131
|
+
len = $~[0].length
|
132
|
+
loc_len << [loc, len]
|
133
|
+
init_next = loc + len
|
147
134
|
end
|
148
|
-
end
|
149
135
|
|
150
|
-
|
151
|
-
return 0 if sdiff.nil?
|
136
|
+
loc_len.sort!{|a, b| a[0] <=> b[0]}
|
152
137
|
|
153
|
-
#
|
154
|
-
|
155
|
-
|
138
|
+
# To get the offset_mapping before and after replacement
|
139
|
+
offset_mapping = []
|
140
|
+
init_next = 0
|
141
|
+
j = 0
|
156
142
|
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
143
|
+
loc_len.each do |loc, len|
|
144
|
+
offset_mapping += (init_next .. loc).map do |i|
|
145
|
+
j += 1
|
146
|
+
[i, j - 1]
|
147
|
+
end
|
148
|
+
init_next = loc + len
|
149
|
+
end
|
150
|
+
|
151
|
+
offset_mapping += (init_next .. str.length).map do |i|
|
152
|
+
j += 1
|
153
|
+
[i, j - 1]
|
161
154
|
end
|
162
155
|
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
_s2
|
156
|
+
# To execute the long letter mapping
|
157
|
+
char_mapping.each do |one, long|
|
158
|
+
str.gsub!(one * long.length, one) if long.length > 1
|
167
159
|
end
|
168
160
|
|
169
|
-
|
161
|
+
# To replace each multi-whitespace sequence with a single space
|
162
|
+
str.gsub!(/\s{2,}/, ' ')
|
163
|
+
|
164
|
+
[str, offset_mapping]
|
165
|
+
end
|
166
|
+
end
|
167
|
+
|
168
|
+
if __FILE__ == $0
|
169
|
+
require 'json'
|
170
|
+
|
171
|
+
unless ARGV.length == 1
|
172
|
+
warn "#{$0} an_annotation_json_file.json"
|
173
|
+
exit
|
170
174
|
end
|
175
|
+
annotations = JSON.parse File.read(ARGV[0]).strip, symbolize_names: true
|
176
|
+
denotations = annotations[:denotations]
|
177
|
+
if denotations.nil? && annotations[:tracks]
|
178
|
+
denotations = annotations[:tracks].first[:denotations]
|
179
|
+
end
|
180
|
+
|
181
|
+
str_mapping = TextAlignment::CharMapping.new(annotations[:text])
|
182
|
+
str_mapped = str_mapping.str
|
183
|
+
denotations_mapped = str_mapping.enmap_denotations(denotations)
|
184
|
+
new_annotations = {text:str_mapped, denotations:denotations_mapped}
|
171
185
|
|
186
|
+
puts new_annotations.to_json
|
172
187
|
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module TextAlignment; end unless defined? TextAlignment
|
2
|
+
|
3
|
+
class TextAlignment::CultivationMap
|
4
|
+
attr_reader :map
|
5
|
+
|
6
|
+
def initialize
|
7
|
+
@map = {}
|
8
|
+
end
|
9
|
+
|
10
|
+
def cultivate(regions)
|
11
|
+
regions.each do |b, e|
|
12
|
+
(b ... e).each{|p| @map[p] = e}
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
def search_again_position(position)
|
17
|
+
@map[position]
|
18
|
+
end
|
19
|
+
end
|
@@ -5,7 +5,7 @@ require 'text_alignment/find_divisions'
|
|
5
5
|
require 'text_alignment/lcs_comparison'
|
6
6
|
require 'text_alignment/lcs_alignment'
|
7
7
|
require 'text_alignment/glcs_alignment'
|
8
|
-
require 'text_alignment/
|
8
|
+
require 'text_alignment/char_mapping'
|
9
9
|
|
10
10
|
module TextAlignment; end unless defined? TextAlignment
|
11
11
|
|
@@ -106,7 +106,7 @@ if __FILE__ == $0
|
|
106
106
|
|
107
107
|
dictionary = [["β", "beta"]]
|
108
108
|
# align = TextAlignment::TextAlignment.new(str1, str2)
|
109
|
-
align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::
|
109
|
+
align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::CHAR_MAPPING)
|
110
110
|
p align.common_elements
|
111
111
|
p align.mapped_elements
|
112
112
|
end
|
@@ -6,7 +6,7 @@ require 'text_alignment/lcs_comparison'
|
|
6
6
|
require 'text_alignment/lcs_alignment'
|
7
7
|
require 'text_alignment/lcs_cdiff'
|
8
8
|
require 'text_alignment/glcs_alignment'
|
9
|
-
require 'text_alignment/
|
9
|
+
require 'text_alignment/char_mapping'
|
10
10
|
|
11
11
|
module TextAlignment; end unless defined? TextAlignment
|
12
12
|
|
@@ -20,7 +20,9 @@ class TextAlignment::MixedAlignment
|
|
20
20
|
def initialize(_str1, _str2, _mappings = nil)
|
21
21
|
raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
|
22
22
|
|
23
|
-
|
23
|
+
mappings ||= TextAlignment::CHAR_MAPPING
|
24
|
+
str1 = _str1.dup
|
25
|
+
str2 = _str2.dup
|
24
26
|
|
25
27
|
_compute_mixed_alignment(str1, str2, mappings)
|
26
28
|
end
|
@@ -63,7 +65,7 @@ class TextAlignment::MixedAlignment
|
|
63
65
|
end
|
64
66
|
|
65
67
|
cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
|
66
|
-
@similarity =
|
68
|
+
@similarity = compute_similarity(str1, str2, @sdiff)
|
67
69
|
@str1_match_initial = cmp.str1_match_initial
|
68
70
|
@str1_match_final = cmp.str1_match_final
|
69
71
|
@str2_match_initial = cmp.str2_match_initial
|
@@ -139,4 +141,14 @@ class TextAlignment::MixedAlignment
|
|
139
141
|
@position_map_end = posmap_end.sort.to_h
|
140
142
|
end
|
141
143
|
|
144
|
+
def compute_similarity(s1, s2, sdiff)
|
145
|
+
return 0 if sdiff.nil?
|
146
|
+
|
147
|
+
# compute the lcs only with non-whitespace letters
|
148
|
+
lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
|
149
|
+
return 0 if lcs == 0
|
150
|
+
|
151
|
+
similarity = lcs.to_f / [s1.scan(/\S/).count, s2.scan(/\S/).count].min
|
152
|
+
end
|
153
|
+
|
142
154
|
end
|
@@ -2,6 +2,7 @@
|
|
2
2
|
require 'text_alignment/constants'
|
3
3
|
require 'text_alignment/anchor_finder'
|
4
4
|
require 'text_alignment/mixed_alignment'
|
5
|
+
require 'text_alignment/cultivation_map'
|
5
6
|
|
6
7
|
module TextAlignment; end unless defined? TextAlignment
|
7
8
|
|
@@ -9,23 +10,206 @@ class TextAlignment::TextAlignment
|
|
9
10
|
attr_reader :block_alignment
|
10
11
|
attr_reader :similarity
|
11
12
|
attr_reader :lost_annotations
|
13
|
+
attr_reader :cultivation_map
|
12
14
|
|
13
|
-
def initialize(_str1, _str2,
|
15
|
+
def initialize(_str1, _str2, _denotations = nil, _cultivation_map = nil)
|
14
16
|
raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
|
15
17
|
|
16
|
-
@block_alignment = {source_text: _str1, target_text: _str2, denotations:
|
18
|
+
@block_alignment = {source_text: _str1, target_text: _str2, denotations: _denotations}
|
17
19
|
@original_str1 = _str1
|
18
20
|
@original_str2 = _str2
|
19
21
|
|
20
|
-
|
22
|
+
@str1_mapping = TextAlignment::CharMapping.new(_str1)
|
23
|
+
@str2_mapping = TextAlignment::CharMapping.new(_str2)
|
21
24
|
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
+
str1 = @str1_mapping.str
|
26
|
+
denotations = @str1_mapping.enmap_denotations(_denotations)
|
27
|
+
|
28
|
+
str2 = @str2_mapping.str
|
29
|
+
|
30
|
+
@cultivation_map = _cultivation_map || TextAlignment::CultivationMap.new
|
31
|
+
|
32
|
+
@block_alignment[:blocks] = if r = whole_block_alignment(str1, str2, @cultivation_map)
|
33
|
+
# whole block alignment
|
34
|
+
r
|
35
|
+
else
|
36
|
+
find_block_alignment(str1, str2, denotations, @cultivation_map)
|
37
|
+
end
|
38
|
+
|
39
|
+
newly_cultivated_regions = @block_alignment[:blocks].collect do |b|
|
40
|
+
if b[:alignment] == :block || b[:alignment] == :term
|
41
|
+
[b[:target][:begin], b[:target][:end]]
|
42
|
+
else
|
43
|
+
nil
|
44
|
+
end
|
45
|
+
end.compact
|
46
|
+
newly_cultivated_regions_condensed = newly_cultivated_regions.inject([]) do |condensed, region|
|
47
|
+
if condensed.empty? || (condensed.last.last + 1 < region.first)
|
48
|
+
condensed.push region
|
49
|
+
else
|
50
|
+
condensed.last[1] = region.last
|
51
|
+
end
|
52
|
+
condensed
|
53
|
+
end
|
54
|
+
|
55
|
+
@cultivation_map.cultivate(newly_cultivated_regions_condensed)
|
56
|
+
end
|
57
|
+
|
58
|
+
def transform_begin_position(_begin_position)
|
59
|
+
begin_position = @str1_mapping.enmap_position(_begin_position)
|
60
|
+
|
61
|
+
i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
|
62
|
+
block = @block_alignment[:blocks][i]
|
63
|
+
|
64
|
+
b = if block[:alignment] == :block || block[:alignment] == :term
|
65
|
+
begin_position + block[:delta]
|
66
|
+
elsif block[:alignment] == :empty
|
67
|
+
if begin_position == block[:source][:begin]
|
68
|
+
block[:target][:begin]
|
69
|
+
else
|
70
|
+
nil
|
71
|
+
end
|
72
|
+
else
|
73
|
+
r = block[:alignment].transform_begin_position(begin_position - block[:source][:begin])
|
74
|
+
r.nil? ? nil : r + block[:target][:begin]
|
25
75
|
end
|
26
76
|
|
77
|
+
@str2_mapping.demap_position(b)
|
78
|
+
end
|
79
|
+
|
80
|
+
def transform_end_position(_end_position)
|
81
|
+
end_position = @str1_mapping.enmap_position(_end_position)
|
82
|
+
|
83
|
+
i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
|
84
|
+
block = @block_alignment[:blocks][i]
|
85
|
+
|
86
|
+
e = if block[:alignment] == :block || block[:alignment] == :term
|
87
|
+
end_position + block[:delta]
|
88
|
+
elsif block[:alignment] == :empty
|
89
|
+
if end_position == block[:source][:end]
|
90
|
+
block[:target][:end]
|
91
|
+
else
|
92
|
+
nil
|
93
|
+
end
|
94
|
+
else
|
95
|
+
r = block[:alignment].transform_end_position(end_position - block[:source][:begin])
|
96
|
+
r.nil? ? nil : r + block[:target][:begin]
|
97
|
+
end
|
98
|
+
|
99
|
+
@str2_mapping.demap_position(e)
|
100
|
+
end
|
101
|
+
|
102
|
+
def transform_a_span(span)
|
103
|
+
{begin: transform_begin_position(span[:begin]), end: transform_end_position(span[:end])}
|
104
|
+
end
|
105
|
+
|
106
|
+
def transform_spans(spans)
|
107
|
+
spans.map{|span| transform_a_span(span)}
|
108
|
+
end
|
109
|
+
|
110
|
+
def transform_denotations!(denotations)
|
111
|
+
return nil if denotations.nil?
|
112
|
+
@lost_annotations = []
|
113
|
+
|
114
|
+
denotations.each do |d|
|
115
|
+
source = {begin:d.begin, end:d.end}
|
116
|
+
d.begin = transform_begin_position(d.begin);
|
117
|
+
d.end = transform_end_position(d.end);
|
118
|
+
raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_str2.length
|
119
|
+
rescue
|
120
|
+
@lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
|
121
|
+
d.begin = nil
|
122
|
+
d.end = nil
|
123
|
+
end
|
124
|
+
|
125
|
+
@lost_annotations
|
126
|
+
end
|
127
|
+
|
128
|
+
def transform_hdenotations(hdenotations)
|
129
|
+
return nil if hdenotations.nil?
|
130
|
+
@lost_annotations = []
|
131
|
+
|
132
|
+
r = hdenotations.collect do |d|
|
133
|
+
t = transform_a_span(d[:span])
|
134
|
+
raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_str2.length
|
135
|
+
new_d = d.dup.merge({span:t})
|
136
|
+
rescue
|
137
|
+
@lost_annotations << {source: d[:span], target:t}
|
138
|
+
nil
|
139
|
+
end.compact
|
140
|
+
|
141
|
+
r
|
142
|
+
end
|
143
|
+
|
144
|
+
def alignment_show
|
145
|
+
stext = @block_alignment[:source_text]
|
146
|
+
ttext = @block_alignment[:target_text]
|
147
|
+
|
148
|
+
show = ''
|
149
|
+
@block_alignment[:blocks].each do |a|
|
150
|
+
show += case a[:alignment]
|
151
|
+
when :block
|
152
|
+
"===== common (block) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
153
|
+
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
|
154
|
+
when :term
|
155
|
+
"===== common (term) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
156
|
+
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
|
157
|
+
when :empty
|
158
|
+
"xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
|
159
|
+
"<<<<< string 1 [#{a[:source][:begin]} - #{a[:source][:end]}]\n" +
|
160
|
+
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
|
161
|
+
">>>>> string 2 " +
|
162
|
+
if a[:target]
|
163
|
+
"[#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
164
|
+
ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
|
165
|
+
else
|
166
|
+
"[-]\n\n"
|
167
|
+
end
|
168
|
+
else
|
169
|
+
astr1 = ''
|
170
|
+
astr2 = ''
|
171
|
+
|
172
|
+
base = a[:source][:begin]
|
173
|
+
astr1 = a[:alignment].sdiff.map do |c|
|
174
|
+
case c.action
|
175
|
+
when '='
|
176
|
+
stext[c.old_position + base]
|
177
|
+
when '+'
|
178
|
+
'_'
|
179
|
+
when '-'
|
180
|
+
stext[c.old_position + base]
|
181
|
+
when '!'
|
182
|
+
stext[c.old_position + base] + '_'
|
183
|
+
end
|
184
|
+
end.join('')
|
185
|
+
|
186
|
+
base = a[:target][:begin]
|
187
|
+
astr2 = a[:alignment].sdiff.map do |c|
|
188
|
+
case c.action
|
189
|
+
when '='
|
190
|
+
ttext[c.new_position + base]
|
191
|
+
when '+'
|
192
|
+
ttext[c.new_position + base]
|
193
|
+
when '-'
|
194
|
+
'_'
|
195
|
+
when '!'
|
196
|
+
'_' + ttext[c.new_position + base]
|
197
|
+
end
|
198
|
+
end.join('')
|
199
|
+
|
200
|
+
"***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
|
201
|
+
"[#{astr1}]\n" +
|
202
|
+
"[#{astr2}]\n\n"
|
203
|
+
end
|
204
|
+
end
|
205
|
+
show
|
206
|
+
end
|
207
|
+
|
208
|
+
private
|
209
|
+
|
210
|
+
def find_block_alignment(str1, str2, denotations, cultivation_map)
|
27
211
|
## to find block alignments
|
28
|
-
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2,
|
212
|
+
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map)
|
29
213
|
|
30
214
|
blocks = []
|
31
215
|
while block = anchor_finder.get_next_anchor
|
@@ -68,12 +252,13 @@ class TextAlignment::TextAlignment
|
|
68
252
|
|
69
253
|
if b2 == e2
|
70
254
|
[
|
71
|
-
{source:{begin:b1, end:e1},
|
255
|
+
{source:{begin:b1, end:e1}, alignment: :empty},
|
72
256
|
block
|
73
257
|
]
|
74
258
|
else
|
259
|
+
len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
260
|
+
|
75
261
|
if b1 == 0 && b2 == 0
|
76
|
-
len_buffer = (e1 * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
77
262
|
b2 = e2 - len_buffer if e2 > len_buffer
|
78
263
|
end
|
79
264
|
|
@@ -85,6 +270,10 @@ class TextAlignment::TextAlignment
|
|
85
270
|
{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
|
86
271
|
block
|
87
272
|
]
|
273
|
+
elsif ((e2 - b2) - (e1 - b1)) > len_buffer
|
274
|
+
la_block1 = local_alignment_blocks(str1, b1, e1, str2, b2, b2 + len_buffer, denotations)
|
275
|
+
la_block2 = local_alignment_blocks(str1, b1, e1, str2, e2 - len_buffer, e2, denotations)
|
276
|
+
[la_block2, la_block2].max{|a, b| a.first[:similarity] <=> b.first[:similarity]} << block
|
88
277
|
else
|
89
278
|
local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations) << block
|
90
279
|
end
|
@@ -102,31 +291,53 @@ class TextAlignment::TextAlignment
|
|
102
291
|
b1 = last_block[:source][:end]
|
103
292
|
if b1 < str1.length
|
104
293
|
e1 = str1.length
|
105
|
-
|
106
294
|
b2 = last_block[:target][:end]
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
295
|
+
|
296
|
+
_str1 = str1[b1 ... e1]
|
297
|
+
if _str1.strip.empty?
|
298
|
+
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
111
299
|
else
|
112
|
-
|
300
|
+
if b2 < str2.length
|
301
|
+
len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
302
|
+
e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
|
303
|
+
|
304
|
+
local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
|
305
|
+
else
|
306
|
+
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
307
|
+
end
|
113
308
|
end
|
114
309
|
else
|
115
310
|
[]
|
116
311
|
end
|
117
312
|
end
|
118
|
-
|
119
|
-
@block_alignment[:blocks] = blocks2
|
120
313
|
end
|
121
314
|
|
122
|
-
def whole_block_alignment(str1, str2)
|
315
|
+
def whole_block_alignment(str1, str2, cultivation_map)
|
123
316
|
## Block exact match
|
124
|
-
|
317
|
+
search_position = 0
|
318
|
+
|
319
|
+
block_begin = begin
|
320
|
+
_block_begin = str2.index(str1, search_position)
|
321
|
+
break if _block_begin.nil?
|
322
|
+
search_position = cultivation_map.search_again_position(_block_begin)
|
323
|
+
_block_begin
|
324
|
+
end until search_position.nil?
|
325
|
+
|
125
326
|
unless block_begin.nil?
|
126
327
|
return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
127
328
|
end
|
128
329
|
|
129
|
-
|
330
|
+
search_position = 0
|
331
|
+
|
332
|
+
dstr1 = str1.downcase
|
333
|
+
dstr2 = str2.downcase
|
334
|
+
block_begin = begin
|
335
|
+
_block_begin = dstr2.index(dstr1, search_position)
|
336
|
+
break if _block_begin.nil?
|
337
|
+
search_position = cultivation_map.search_again_position(_block_begin)
|
338
|
+
_block_begin
|
339
|
+
end until search_position.nil?
|
340
|
+
|
130
341
|
unless block_begin.nil?
|
131
342
|
return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
132
343
|
end
|
@@ -144,7 +355,7 @@ class TextAlignment::TextAlignment
|
|
144
355
|
map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
|
145
356
|
|
146
357
|
position = 0
|
147
|
-
|
358
|
+
_tblocks = ds_in_scope.map do |term|
|
148
359
|
lex = term[:lex]
|
149
360
|
r = block2.index(lex, position)
|
150
361
|
if r.nil?
|
@@ -152,11 +363,11 @@ class TextAlignment::TextAlignment
|
|
152
363
|
break
|
153
364
|
end
|
154
365
|
position = r + lex.length
|
155
|
-
{source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, delta: r + b2 - term[:span][:begin]}
|
366
|
+
{source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, similarity: 0.9, delta: r + b2 - term[:span][:begin]}
|
156
367
|
end
|
157
368
|
|
158
369
|
# missing term found
|
159
|
-
|
370
|
+
_tblocks = [] if position.nil?
|
160
371
|
|
161
372
|
# redundant matching found
|
162
373
|
unless position.nil?
|
@@ -164,13 +375,13 @@ class TextAlignment::TextAlignment
|
|
164
375
|
lex = term[:lex]
|
165
376
|
look_forward = block2.index(lex, position)
|
166
377
|
unless look_forward.nil?
|
167
|
-
|
378
|
+
_tblocks = []
|
168
379
|
break
|
169
380
|
end
|
170
381
|
end
|
171
382
|
end
|
172
383
|
|
173
|
-
|
384
|
+
_tblocks
|
174
385
|
else
|
175
386
|
[]
|
176
387
|
end
|
@@ -184,7 +395,7 @@ class TextAlignment::TextAlignment
|
|
184
395
|
block2 = str2[b2 ... e2]
|
185
396
|
|
186
397
|
## character-based alignment
|
187
|
-
alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase
|
398
|
+
alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
|
188
399
|
if alignment.sdiff.nil?
|
189
400
|
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
190
401
|
else
|
@@ -196,7 +407,7 @@ class TextAlignment::TextAlignment
|
|
196
407
|
block2 = str2[b2 ... e2]
|
197
408
|
|
198
409
|
## character-based alignment
|
199
|
-
alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase
|
410
|
+
alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
|
200
411
|
if alignment.sdiff.nil?
|
201
412
|
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
202
413
|
else
|
@@ -244,157 +455,4 @@ class TextAlignment::TextAlignment
|
|
244
455
|
end
|
245
456
|
end
|
246
457
|
|
247
|
-
|
248
|
-
def indices(str, target)
|
249
|
-
position = 0
|
250
|
-
len = target.len
|
251
|
-
Enumerator.new do |yielder|
|
252
|
-
while idx = str.index(target, position)
|
253
|
-
yielder << idx
|
254
|
-
position = idx + len
|
255
|
-
end
|
256
|
-
end
|
257
|
-
end
|
258
|
-
|
259
|
-
def transform_begin_position(begin_position)
|
260
|
-
i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
|
261
|
-
block = @block_alignment[:blocks][i]
|
262
|
-
|
263
|
-
b = if block[:alignment] == :block || block[:alignment] == :term
|
264
|
-
begin_position + block[:delta]
|
265
|
-
elsif block[:alignment] == :empty
|
266
|
-
if begin_position == block[:source][:begin]
|
267
|
-
block[:target][:begin]
|
268
|
-
else
|
269
|
-
nil
|
270
|
-
end
|
271
|
-
else
|
272
|
-
r = block[:alignment].transform_begin_position(begin_position - block[:source][:begin])
|
273
|
-
r.nil? ? nil : r + block[:target][:begin]
|
274
|
-
end
|
275
|
-
end
|
276
|
-
|
277
|
-
def transform_end_position(end_position)
|
278
|
-
i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
|
279
|
-
block = @block_alignment[:blocks][i]
|
280
|
-
|
281
|
-
e = if block[:alignment] == :block || block[:alignment] == :term
|
282
|
-
end_position + block[:delta]
|
283
|
-
elsif block[:alignment] == :empty
|
284
|
-
if end_position == block[:source][:end]
|
285
|
-
block[:target][:end]
|
286
|
-
else
|
287
|
-
nil
|
288
|
-
end
|
289
|
-
else
|
290
|
-
r = block[:alignment].transform_end_position(end_position - block[:source][:begin])
|
291
|
-
r.nil? ? nil : r + block[:target][:begin]
|
292
|
-
end
|
293
|
-
end
|
294
|
-
|
295
|
-
def transform_a_span(span)
|
296
|
-
{begin: transform_begin_position(span[:begin]), end: transform_end_position(span[:end])}
|
297
|
-
end
|
298
|
-
|
299
|
-
def transform_spans(spans)
|
300
|
-
spans.map{|span| transform_a_span(span)}
|
301
|
-
end
|
302
|
-
|
303
|
-
def transform_denotations!(denotations)
|
304
|
-
return nil if denotations.nil?
|
305
|
-
@lost_annotations = []
|
306
|
-
|
307
|
-
denotations.each do |d|
|
308
|
-
source = {begin:d.begin, end:d.end}
|
309
|
-
d.begin = transform_begin_position(d.begin);
|
310
|
-
d.end = transform_end_position(d.end);
|
311
|
-
raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_str2.length
|
312
|
-
rescue
|
313
|
-
@lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
|
314
|
-
d.begin = nil
|
315
|
-
d.end = nil
|
316
|
-
end
|
317
|
-
|
318
|
-
@lost_annotations
|
319
|
-
end
|
320
|
-
|
321
|
-
def transform_hdenotations(hdenotations)
|
322
|
-
return nil if hdenotations.nil?
|
323
|
-
@lost_annotations = []
|
324
|
-
|
325
|
-
r = hdenotations.collect do |d|
|
326
|
-
t = transform_a_span(d[:span])
|
327
|
-
raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_str2.length
|
328
|
-
new_d = d.dup.merge({span:t})
|
329
|
-
rescue
|
330
|
-
@lost_annotations << {source: d[:span], target:t}
|
331
|
-
nil
|
332
|
-
end.compact
|
333
|
-
|
334
|
-
r
|
335
|
-
end
|
336
|
-
|
337
|
-
def alignment_show
|
338
|
-
stext = @block_alignment[:source_text]
|
339
|
-
ttext = @block_alignment[:target_text]
|
340
|
-
|
341
|
-
show = ''
|
342
|
-
@block_alignment[:blocks].each do |a|
|
343
|
-
show += case a[:alignment]
|
344
|
-
when :block
|
345
|
-
"===== common (block) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
346
|
-
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
|
347
|
-
when :term
|
348
|
-
"===== common (term) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
349
|
-
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
|
350
|
-
when :empty
|
351
|
-
"xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
|
352
|
-
"<<<<< string 1 [#{a[:source][:begin]} - #{a[:source][:end]}]\n" +
|
353
|
-
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
|
354
|
-
">>>>> string 2 " +
|
355
|
-
if a[:target]
|
356
|
-
"[#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
357
|
-
ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
|
358
|
-
else
|
359
|
-
"[-]\n\n"
|
360
|
-
end
|
361
|
-
else
|
362
|
-
astr1 = ''
|
363
|
-
astr2 = ''
|
364
|
-
|
365
|
-
base = a[:source][:begin]
|
366
|
-
astr1 = a[:alignment].sdiff.map do |c|
|
367
|
-
case c.action
|
368
|
-
when '='
|
369
|
-
stext[c.old_position + base]
|
370
|
-
when '+'
|
371
|
-
'_'
|
372
|
-
when '-'
|
373
|
-
stext[c.old_position + base]
|
374
|
-
when '!'
|
375
|
-
stext[c.old_position + base] + '_'
|
376
|
-
end
|
377
|
-
end.join('')
|
378
|
-
|
379
|
-
base = a[:target][:begin]
|
380
|
-
astr2 = a[:alignment].sdiff.map do |c|
|
381
|
-
case c.action
|
382
|
-
when '='
|
383
|
-
ttext[c.new_position + base]
|
384
|
-
when '+'
|
385
|
-
ttext[c.new_position + base]
|
386
|
-
when '-'
|
387
|
-
'_'
|
388
|
-
when '!'
|
389
|
-
'_' + ttext[c.new_position + base]
|
390
|
-
end
|
391
|
-
end.join('')
|
392
|
-
|
393
|
-
"***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
|
394
|
-
"[#{astr1}]\n" +
|
395
|
-
"[#{astr2}]\n\n"
|
396
|
-
end
|
397
|
-
end
|
398
|
-
show
|
399
|
-
end
|
400
458
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.9.1
|
4
|
+
version: 0.10.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-03-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|
@@ -77,7 +77,9 @@ files:
|
|
77
77
|
- lib/text_alignment.rb
|
78
78
|
- lib/text_alignment/anchor_finder.rb
|
79
79
|
- lib/text_alignment/approximate_fit.rb
|
80
|
+
- lib/text_alignment/char_mapping.rb
|
80
81
|
- lib/text_alignment/constants.rb
|
82
|
+
- lib/text_alignment/cultivation_map.rb
|
81
83
|
- lib/text_alignment/find_divisions.rb
|
82
84
|
- lib/text_alignment/glcs_alignment.rb
|
83
85
|
- lib/text_alignment/glcs_alignment_fast.rb
|
@@ -86,7 +88,6 @@ files:
|
|
86
88
|
- lib/text_alignment/lcs_cdiff.rb
|
87
89
|
- lib/text_alignment/lcs_comparison.rb
|
88
90
|
- lib/text_alignment/lcs_min.rb
|
89
|
-
- lib/text_alignment/mappings.rb
|
90
91
|
- lib/text_alignment/mixed_alignment.rb
|
91
92
|
- lib/text_alignment/text_alignment.rb
|
92
93
|
- lib/text_alignment/version.rb
|