text_alignment 0.9.1 → 0.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/align_annotations +10 -6
- data/lib/text_alignment/anchor_finder.rb +130 -62
- data/lib/text_alignment/{mappings.rb → char_mapping.rb} +90 -75
- data/lib/text_alignment/cultivation_map.rb +19 -0
- data/lib/text_alignment/glcs_alignment_fast.rb +2 -2
- data/lib/text_alignment/mixed_alignment.rb +15 -3
- data/lib/text_alignment/text_alignment.rb +238 -180
- data/lib/text_alignment/version.rb +1 -1
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 710fa3dfe07e268c62de77c67abb52e1893a36a9c0fbdfbf7400328ab7f8358a
|
4
|
+
data.tar.gz: 31a086666978c4776a60b0ce19fde48beeb8a5bce599b7572f20089cfb5c7500
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d0a752d2203f65e48b4f10ff29eb29a77504e7cef4da1dd3602e2ea5107a99b7edff4c3609c79c7dcba0f9cc458780dc4788ff8123a781a4d77ae89236170bf5
|
7
|
+
data.tar.gz: '094ff5f662ccdc2fc14f81549126362be49fd64610f994c6ba5c4457fbbbececc1931257589ebde49211c2bce744ca033bfe3673c8af5c8a01af7679bfb626cc'
|
data/bin/align_annotations
CHANGED
@@ -26,8 +26,9 @@ def read_text(filename)
|
|
26
26
|
end
|
27
27
|
end
|
28
28
|
|
29
|
-
def align_denotations(denotations, source_text, target_text, debug = false)
|
30
|
-
alignment = TextAlignment::TextAlignment.new(source_text, target_text, denotations)
|
29
|
+
def align_denotations(denotations, source_text, target_text, debug = false, cm = nil)
|
30
|
+
alignment = TextAlignment::TextAlignment.new(source_text, target_text, denotations, cm)
|
31
|
+
cm = alignment.cultivation_map
|
31
32
|
new_denotations = alignment.transform_hdenotations(denotations)
|
32
33
|
|
33
34
|
if debug
|
@@ -47,7 +48,7 @@ def align_denotations(denotations, source_text, target_text, debug = false)
|
|
47
48
|
warn
|
48
49
|
|
49
50
|
# return target annotations
|
50
|
-
new_denotations
|
51
|
+
[new_denotations, cm]
|
51
52
|
end
|
52
53
|
|
53
54
|
def align_mannotations(source_annotations, target_text, debug = false)
|
@@ -58,11 +59,13 @@ def align_mannotations(source_annotations, target_text, debug = false)
|
|
58
59
|
idnum_attributes = 0
|
59
60
|
idnum_modifications = 0
|
60
61
|
|
62
|
+
cm = nil
|
61
63
|
source_annotations.each_with_index do |annotations, i|
|
62
64
|
if annotations.has_key?(:denotations) && !annotations[:denotations].empty?
|
63
65
|
ididx = {}
|
64
66
|
warn "[#{i}]-=-=-=-=-"
|
65
|
-
denotations = align_denotations(annotations[:denotations], annotations[:text], target_text, debug)
|
67
|
+
denotations, cm = align_denotations(annotations[:denotations], annotations[:text], target_text, debug, cm)
|
68
|
+
|
66
69
|
denotations.each do |d|
|
67
70
|
reid = 'T' + (idnum_denotations += 1).to_s
|
68
71
|
ididx[d[:id]] = reid
|
@@ -114,8 +117,9 @@ target_text = read_text(ARGV[1])
|
|
114
117
|
target_annotations = if source_annotations.class == Array
|
115
118
|
align_mannotations(source_annotations, target_text, false)
|
116
119
|
else
|
117
|
-
denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], target_text, false)
|
120
|
+
denotations, cm = align_denotations(source_annotations[:denotations], source_annotations[:text], target_text, false)
|
121
|
+
# denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], target_text, true)
|
118
122
|
source_annotations.merge({text:target_text, denotations:denotations})
|
119
123
|
end
|
120
124
|
|
121
|
-
|
125
|
+
puts target_annotations.to_json
|
@@ -6,76 +6,65 @@ module TextAlignment; end unless defined? TextAlignment
|
|
6
6
|
|
7
7
|
class TextAlignment::AnchorFinder
|
8
8
|
|
9
|
-
def initialize(source_str, target_str,
|
10
|
-
@
|
11
|
-
@size_window = _size_window || TextAlignment::SIZE_WINDOW
|
12
|
-
@sim_threshold = _text_similiarity_threshold || TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
13
|
-
|
14
|
-
@reverse = (target_str.length < source_str.length)
|
15
|
-
|
16
|
-
@s1, @s2 = if @reverse
|
9
|
+
def initialize(source_str, target_str, cultivation_map)
|
10
|
+
@s1, @s2 = if reverse?(source_str, target_str)
|
17
11
|
[target_str.downcase, source_str.downcase]
|
18
12
|
else
|
19
13
|
[source_str.downcase, target_str.downcase]
|
20
14
|
end
|
21
15
|
|
22
|
-
|
23
|
-
@beg_s1 = 0
|
24
|
-
@end_s1_prev = 0
|
25
|
-
@end_s2_prev = 0
|
26
|
-
end
|
27
|
-
|
28
|
-
def get_next_anchor
|
29
|
-
# find the position of an anchor ngram in s1 and s2
|
30
|
-
while @beg_s1 < (@s1.length - @size_ngram)
|
31
|
-
if [' ', "\n", "\t"].include? @s1[@beg_s1]
|
32
|
-
@beg_s1 += 1
|
33
|
-
next
|
34
|
-
end
|
35
|
-
anchor = @s1[@beg_s1, @size_ngram]
|
16
|
+
@cultivation_map = cultivation_map
|
36
17
|
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
break if @beg_s1 > 0 && @beg_s2 > 0 && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
|
18
|
+
@size_ngram = TextAlignment::SIZE_NGRAM
|
19
|
+
@size_window = TextAlignment::SIZE_WINDOW
|
20
|
+
@sim_threshold = TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
21
|
+
@pos_s1_final_possible_begin = @s1.length - @size_ngram - 1
|
42
22
|
|
43
|
-
|
44
|
-
|
23
|
+
# positions of last match
|
24
|
+
@pos_s1_last_match = 0
|
25
|
+
@pos_s2_last_match = 0
|
26
|
+
end
|
45
27
|
|
46
|
-
|
47
|
-
|
28
|
+
def reverse?(source_str = nil, target_str = nil)
|
29
|
+
unless source_str.nil?
|
30
|
+
@reverse_p = target_str.length < source_str.length
|
31
|
+
end
|
32
|
+
@reverse_p
|
33
|
+
end
|
48
34
|
|
49
|
-
|
50
|
-
|
35
|
+
def get_next_anchor
|
36
|
+
# To find the beginning positions of an anchor ngram in s1 and s2, beginning from the last positions matched
|
37
|
+
beg_s2 = for beg_s1 in @pos_s1_last_match .. @pos_s1_final_possible_begin
|
51
38
|
|
52
|
-
|
39
|
+
# To skip whitespace letters
|
40
|
+
next if [' ', "\n", "\t"].include? @s1[beg_s1]
|
53
41
|
|
54
|
-
|
42
|
+
_beg_s2 = get_beg_s2(beg_s1)
|
43
|
+
break _beg_s2 unless _beg_s2.nil?
|
55
44
|
end
|
56
45
|
|
57
|
-
return nil
|
46
|
+
# To return nil when it fails to find an anchor
|
47
|
+
return nil if beg_s2.class == Range
|
58
48
|
|
59
|
-
# extend the block
|
60
|
-
b1 =
|
61
|
-
b2 =
|
62
|
-
while b1 >= @
|
49
|
+
# To extend the block to the left
|
50
|
+
b1 = beg_s1
|
51
|
+
b2 = beg_s2
|
52
|
+
while b1 >= @pos_s1_last_match && b2 >= @pos_s2_last_match && @s1[b1] == @s2[b2]
|
63
53
|
b1 -= 1; b2 -= 1
|
64
54
|
end
|
65
|
-
|
66
55
|
b1 += 1; b2 += 1
|
67
56
|
|
68
|
-
|
69
|
-
|
57
|
+
# To extend the block to the right
|
58
|
+
e1 = beg_s1 + @size_ngram
|
59
|
+
e2 = beg_s2 + @size_ngram
|
70
60
|
while @s1[e1] && @s1[e1] == @s2[e2]
|
71
61
|
e1 += 1; e2 += 1
|
72
62
|
end
|
73
63
|
|
74
|
-
@
|
75
|
-
@
|
76
|
-
@beg_s1 = e1
|
64
|
+
@pos_s1_last_match = e1
|
65
|
+
@pos_s2_last_match = e2
|
77
66
|
|
78
|
-
if
|
67
|
+
if reverse?
|
79
68
|
{source:{begin:b2 , end:e2}, target:{begin:b1, end:e1}}
|
80
69
|
else
|
81
70
|
{source:{begin:b1 , end:e1}, target:{begin:b2, end:e2}}
|
@@ -84,14 +73,92 @@ class TextAlignment::AnchorFinder
|
|
84
73
|
|
85
74
|
private
|
86
75
|
|
87
|
-
def
|
88
|
-
#
|
89
|
-
|
76
|
+
def get_beg_s2(beg_s1)
|
77
|
+
# to get the anchor to search for in s2
|
78
|
+
anchor = @s1[beg_s1, @size_ngram]
|
79
|
+
|
80
|
+
# comment out below with the assumption that texts are in the same order
|
81
|
+
# search_position = 0
|
82
|
+
search_position = @pos_s2_last_match
|
83
|
+
|
84
|
+
beg_s2_candidates = find_beg_s2_candidates(anchor, search_position)
|
85
|
+
return nil if beg_s2_candidates.empty?
|
86
|
+
|
87
|
+
find_valid_beg_s2(beg_s1, beg_s2_candidates)
|
88
|
+
end
|
89
|
+
|
90
|
+
# To find the beg_s2 positions in s2 that match the anchor
|
91
|
+
# returns an empty array if the anchor is too frequent
|
92
|
+
def find_beg_s2_candidates(anchor, search_position)
|
93
|
+
candidates = []
|
94
|
+
while _beg_s2 = @s2.index(anchor, search_position)
|
95
|
+
search_again_position = @cultivation_map.search_again_position(_beg_s2)
|
96
|
+
unless search_again_position.nil?
|
97
|
+
search_position = search_again_position
|
98
|
+
next
|
99
|
+
end
|
100
|
+
|
101
|
+
candidates << _beg_s2
|
102
|
+
|
103
|
+
# for speed, skip anchor of high frequency
|
104
|
+
if candidates.length > 5
|
105
|
+
candidates.clear
|
106
|
+
break
|
107
|
+
end
|
108
|
+
|
109
|
+
search_position = _beg_s2 + 1
|
110
|
+
end
|
111
|
+
candidates
|
112
|
+
end
|
113
|
+
|
114
|
+
def find_valid_beg_s2(beg_s1, beg_s2_candidates)
|
115
|
+
valid_beg_s2 = nil
|
116
|
+
|
117
|
+
(10 .. 30).step(10).each do |size_window|
|
118
|
+
valid_beg_s2 = nil
|
119
|
+
|
120
|
+
r = beg_s2_candidates.each do |beg_s2|
|
121
|
+
# if both the beginning points are sufficiently close to the end points of the last match
|
122
|
+
# break if beg_s1 > 0 && beg_s2 > 0 && (beg_s1 - @pos_s1_last_match < 5) && (beg_s2 >= @pos_s2_last_match) && (beg_s2 - @pos_s2_last_match < 5)
|
123
|
+
if beg_s1 > 0 && beg_s2 > 0 && (beg_s1 - @pos_s1_last_match < 5) && (beg_s2 - @pos_s2_last_match < 5)
|
124
|
+
break unless valid_beg_s2.nil?
|
125
|
+
valid_beg_s2 = beg_s2
|
126
|
+
next
|
127
|
+
end
|
128
|
+
|
129
|
+
left_window_s1, left_window_s2 = get_left_windows(beg_s1, beg_s2)
|
130
|
+
if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
|
131
|
+
break unless valid_beg_s2.nil?
|
132
|
+
valid_beg_s2 = beg_s2
|
133
|
+
next
|
134
|
+
end
|
135
|
+
|
136
|
+
right_window_s1, right_window_s2 = get_right_windows(beg_s1, beg_s2)
|
137
|
+
if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
|
138
|
+
break unless valid_beg_s2.nil?
|
139
|
+
valid_beg_s2 = beg_s2
|
140
|
+
next
|
141
|
+
end
|
142
|
+
end
|
143
|
+
|
144
|
+
# r == nil means that the inner loop was broken (multiple candidates had passed the tests)
|
145
|
+
# r != nil means that the inner loop was completed (with or w/o a valid beg_s2 found)
|
146
|
+
break unless r.nil?
|
147
|
+
end
|
148
|
+
|
149
|
+
valid_beg_s2
|
150
|
+
end
|
151
|
+
|
152
|
+
def get_left_windows(beg_s1, beg_s2, size_window = nil)
|
153
|
+
size_window ||= @size_window
|
154
|
+
|
155
|
+
# comment out below with the assumption that the beginning of a document gives a significant locational information
|
156
|
+
# return if @beg_s1 < size_window || @beg_s2 < size_window
|
90
157
|
|
91
158
|
window_s1 = ''
|
92
|
-
loc =
|
159
|
+
loc = beg_s1 - 1
|
93
160
|
count = 0
|
94
|
-
while count <
|
161
|
+
while count < size_window && loc >= 0
|
95
162
|
if @s1[loc] =~ /[0-9a-zA-Z]/
|
96
163
|
window_s1 += @s1[loc]
|
97
164
|
count += 1
|
@@ -100,9 +167,9 @@ class TextAlignment::AnchorFinder
|
|
100
167
|
end
|
101
168
|
|
102
169
|
window_s2 = ''
|
103
|
-
loc =
|
170
|
+
loc = beg_s2 - 1
|
104
171
|
count = 0
|
105
|
-
while count <
|
172
|
+
while count < size_window && loc >= 0
|
106
173
|
if @s2[loc] =~ /[0-9a-zA-Z]/
|
107
174
|
window_s2 += @s2[loc]
|
108
175
|
count += 1
|
@@ -113,15 +180,17 @@ class TextAlignment::AnchorFinder
|
|
113
180
|
[window_s1, window_s2]
|
114
181
|
end
|
115
182
|
|
116
|
-
def get_right_windows
|
183
|
+
def get_right_windows(beg_s1, beg_s2, size_window = nil)
|
184
|
+
size_window ||= @size_window
|
185
|
+
|
117
186
|
# comment out below with the assumption that the end of a document gives a significant locational information
|
118
|
-
# return if (@beg_s1 + @size_ngram > (@s1.length -
|
187
|
+
# return if (@beg_s1 + @size_ngram > (@s1.length - size_window)) || (@beg_s2 + @size_ngram > (@s2.length - size_window))
|
119
188
|
|
120
189
|
window_s1 = ''
|
121
|
-
loc =
|
190
|
+
loc = beg_s1 + @size_ngram
|
122
191
|
len_s1 = @s1.length
|
123
192
|
count = 0
|
124
|
-
while count <
|
193
|
+
while count < size_window && loc < len_s1
|
125
194
|
if @s1[loc] =~ /[0-9a-zA-Z]/
|
126
195
|
window_s1 += @s1[loc]
|
127
196
|
count += 1
|
@@ -130,10 +199,10 @@ class TextAlignment::AnchorFinder
|
|
130
199
|
end
|
131
200
|
|
132
201
|
window_s2 = ''
|
133
|
-
loc =
|
202
|
+
loc = beg_s2 + @size_ngram
|
134
203
|
len_s2 = @s2.length
|
135
204
|
count = 0
|
136
|
-
while count <
|
205
|
+
while count < size_window && loc < len_s2
|
137
206
|
if @s2[loc] =~ /[0-9a-zA-Z]/
|
138
207
|
window_s2 += @s2[loc]
|
139
208
|
count += 1
|
@@ -148,5 +217,4 @@ class TextAlignment::AnchorFinder
|
|
148
217
|
return 0 if str1.nil? || str2.nil?
|
149
218
|
String::Similarity.cosine(str1, str2, ngram:ngram_order)
|
150
219
|
end
|
151
|
-
|
152
|
-
end
|
220
|
+
end
|
@@ -1,6 +1,6 @@
|
|
1
1
|
module TextAlignment; end unless defined? TextAlignment
|
2
2
|
|
3
|
-
TextAlignment::
|
3
|
+
TextAlignment::CHAR_MAPPING = [
|
4
4
|
["©", "(c)"], #U+00A9 (Copyright Sign)
|
5
5
|
|
6
6
|
["α", "alpha"], #U+03B1 (greek small letter alpha)
|
@@ -61,9 +61,10 @@ TextAlignment::MAPPINGS = [
|
|
61
61
|
["•", "*"], #U+2022 (bullet)
|
62
62
|
[" ", " "], #U+2009 (thin space)
|
63
63
|
[" ", " "], #U+200A (hair space)
|
64
|
-
[" ", " "], #U+00A0 (
|
64
|
+
[" ", " "], #U+00A0 (Non-Breaking space)
|
65
65
|
[" ", " "], #U+3000 (ideographic space)
|
66
|
-
["
|
66
|
+
["‐", "-"], #U+2010 (Hyphen)
|
67
|
+
["‑", "-"], #U+2011 (Non-Breaking Hyphen)
|
67
68
|
["−", "-"], #U+2212 (minus sign)
|
68
69
|
["–", "-"], #U+2013 (en dash)
|
69
70
|
["′", "'"], #U+2032 (prime)
|
@@ -75,98 +76,112 @@ TextAlignment::MAPPINGS = [
|
|
75
76
|
]
|
76
77
|
|
77
78
|
|
78
|
-
|
79
|
+
class TextAlignment::CharMapping
|
80
|
+
attr_reader :str
|
79
81
|
|
82
|
+
def initialize(_str, char_mapping = nil)
|
83
|
+
char_mapping ||= TextAlignment::CHAR_MAPPING
|
84
|
+
@str, offset_mapping = enmap_str(_str, char_mapping)
|
85
|
+
@index_enmap = offset_mapping.to_h
|
86
|
+
@index_demap = offset_mapping.map{|m| m.reverse}.to_h
|
87
|
+
end
|
88
|
+
|
89
|
+
def enmap_position(position)
|
90
|
+
@index_enmap[position]
|
91
|
+
end
|
80
92
|
|
81
|
-
|
82
|
-
|
83
|
-
|
93
|
+
def demap_position(position)
|
94
|
+
@index_demap[position]
|
95
|
+
end
|
84
96
|
|
85
|
-
|
86
|
-
|
87
|
-
[
|
88
|
-
|
89
|
-
|
90
|
-
characters_to = character_mappings.collect{|m| m[1]}.join
|
91
|
-
characters_to.gsub!(/-/, '\-')
|
97
|
+
def enmap_denotations(_denotations)
|
98
|
+
denotations = _denotations.map do |d|
|
99
|
+
d.dup.merge(span:{begin:enmap_position(d[:span][:begin]), end:enmap_position(d[:span][:end])})
|
100
|
+
end
|
101
|
+
end
|
92
102
|
|
93
|
-
|
94
|
-
str2 = _str2.tr(characters_from, characters_to)
|
103
|
+
private
|
95
104
|
|
96
|
-
|
105
|
+
def enmap_str(_str, char_mapping)
|
106
|
+
str = _str.dup
|
97
107
|
|
98
|
-
|
108
|
+
# To execute the single letter mapping
|
109
|
+
char_mapping.each do |one, long|
|
110
|
+
str.gsub!(one, long) if long.length == 1
|
99
111
|
end
|
100
|
-
end
|
101
112
|
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
if long_to_one_mappings.empty?
|
107
|
-
[_str1, _str2, _mappings]
|
108
|
-
else
|
109
|
-
## long to one character mappings
|
110
|
-
pletters = TextAlignment::PADDING_LETTERS
|
111
|
-
|
112
|
-
# find the padding letter for str1
|
113
|
-
@padding_letter1 = begin
|
114
|
-
i = pletters.index{|l| _str2.index(l).nil?}
|
115
|
-
raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
|
116
|
-
TextAlignment::PADDING_LETTERS[i]
|
117
|
-
end
|
113
|
+
# To get the (location, length) index for replacements
|
114
|
+
loc_len = []
|
115
|
+
char_mapping.each do |one, long|
|
116
|
+
next if long.length == 1
|
118
117
|
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
TextAlignment::PADDING_LETTERS[i]
|
118
|
+
init_next = 0
|
119
|
+
while loc = str.index(long, init_next)
|
120
|
+
loc_len << [loc, long.length]
|
121
|
+
init_next = loc + long.length
|
124
122
|
end
|
125
123
|
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
str1 = if _str2.index(f[0])
|
131
|
-
to = f[0] + (@padding_letter1 * (f[1].length - 1))
|
132
|
-
_str1.gsub(from, to)
|
133
|
-
else
|
134
|
-
_str1
|
135
|
-
end
|
136
|
-
|
137
|
-
str2 = if _str1.index(f[0])
|
138
|
-
to = f[0] + (@padding_letter2 * (f[1].length - 1))
|
139
|
-
_str2.gsub(from, to)
|
140
|
-
else
|
141
|
-
_str2
|
142
|
-
end
|
143
|
-
end
|
144
|
-
mappings = _mappings.select{|m| m[0].length > 1 || m[1].length == 1}
|
124
|
+
# a workaround to avoid messing-up due to embedding
|
125
|
+
str.gsub!(long, one * long.length)
|
126
|
+
end
|
145
127
|
|
146
|
-
|
128
|
+
# To get the (location, length) index for consecutive whitespace sequences
|
129
|
+
init_next = 0
|
130
|
+
while loc = str.index(/\s{2,}/, init_next)
|
131
|
+
len = $~[0].length
|
132
|
+
loc_len << [loc, len]
|
133
|
+
init_next = loc + len
|
147
134
|
end
|
148
|
-
end
|
149
135
|
|
150
|
-
|
151
|
-
return 0 if sdiff.nil?
|
136
|
+
loc_len.sort!{|a, b| a[0] <=> b[0]}
|
152
137
|
|
153
|
-
#
|
154
|
-
|
155
|
-
|
138
|
+
# To get the offset_mapping before and after replacement
|
139
|
+
offset_mapping = []
|
140
|
+
init_next = 0
|
141
|
+
j = 0
|
156
142
|
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
143
|
+
loc_len.each do |loc, len|
|
144
|
+
offset_mapping += (init_next .. loc).map do |i|
|
145
|
+
j += 1
|
146
|
+
[i, j - 1]
|
147
|
+
end
|
148
|
+
init_next = loc + len
|
149
|
+
end
|
150
|
+
|
151
|
+
offset_mapping += (init_next .. str.length).map do |i|
|
152
|
+
j += 1
|
153
|
+
[i, j - 1]
|
161
154
|
end
|
162
155
|
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
_s2
|
156
|
+
# To execute the long letter mapping
|
157
|
+
char_mapping.each do |one, long|
|
158
|
+
str.gsub!(one * long.length, one) if long.length > 1
|
167
159
|
end
|
168
160
|
|
169
|
-
|
161
|
+
# To replace multi whitespace sequences to a space
|
162
|
+
str.gsub!(/\s{2,}/, ' ')
|
163
|
+
|
164
|
+
[str, offset_mapping]
|
165
|
+
end
|
166
|
+
end
|
167
|
+
|
168
|
+
if __FILE__ == $0
|
169
|
+
require 'json'
|
170
|
+
|
171
|
+
unless ARGV.length == 1
|
172
|
+
warn "#{$0} an_annotation_json_file.json"
|
173
|
+
exit
|
170
174
|
end
|
175
|
+
annotations = JSON.parse File.read(ARGV[0]).strip, symbolize_names: true
|
176
|
+
denotations = annotations[:denotations]
|
177
|
+
if denotations.nil? && annotations[:tracks]
|
178
|
+
denotations = annotations[:tracks].first[:denotations]
|
179
|
+
end
|
180
|
+
|
181
|
+
str_mapping = TextAlignment::CharMapping.new(annotations[:text])
|
182
|
+
str_mapped = str_mapping.str
|
183
|
+
denotations_mapped = str_mapping.enmap_denotations(denotations)
|
184
|
+
new_annotations = {text:str_mapped, denotations:denotations_mapped}
|
171
185
|
|
186
|
+
puts new_annotations.to_json
|
172
187
|
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module TextAlignment; end unless defined? TextAlignment
|
2
|
+
|
3
|
+
class TextAlignment::CultivationMap
|
4
|
+
attr_reader :map
|
5
|
+
|
6
|
+
def initialize
|
7
|
+
@map = {}
|
8
|
+
end
|
9
|
+
|
10
|
+
def cultivate(regions)
|
11
|
+
regions.each do |b, e|
|
12
|
+
(b ... e).each{|p| @map[p] = e}
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
def search_again_position(position)
|
17
|
+
@map[position]
|
18
|
+
end
|
19
|
+
end
|
@@ -5,7 +5,7 @@ require 'text_alignment/find_divisions'
|
|
5
5
|
require 'text_alignment/lcs_comparison'
|
6
6
|
require 'text_alignment/lcs_alignment'
|
7
7
|
require 'text_alignment/glcs_alignment'
|
8
|
-
require 'text_alignment/
|
8
|
+
require 'text_alignment/char_mapping'
|
9
9
|
|
10
10
|
module TextAlignment; end unless defined? TextAlignment
|
11
11
|
|
@@ -106,7 +106,7 @@ if __FILE__ == $0
|
|
106
106
|
|
107
107
|
dictionary = [["β", "beta"]]
|
108
108
|
# align = TextAlignment::TextAlignment.new(str1, str2)
|
109
|
-
align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::
|
109
|
+
align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::CHAR_MAPPING)
|
110
110
|
p align.common_elements
|
111
111
|
p align.mapped_elements
|
112
112
|
end
|
@@ -6,7 +6,7 @@ require 'text_alignment/lcs_comparison'
|
|
6
6
|
require 'text_alignment/lcs_alignment'
|
7
7
|
require 'text_alignment/lcs_cdiff'
|
8
8
|
require 'text_alignment/glcs_alignment'
|
9
|
-
require 'text_alignment/
|
9
|
+
require 'text_alignment/char_mapping'
|
10
10
|
|
11
11
|
module TextAlignment; end unless defined? TextAlignment
|
12
12
|
|
@@ -20,7 +20,9 @@ class TextAlignment::MixedAlignment
|
|
20
20
|
def initialize(_str1, _str2, _mappings = nil)
|
21
21
|
raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
|
22
22
|
|
23
|
-
|
23
|
+
mappings = _mappings || TextAlignment::CHAR_MAPPING # honor the caller-supplied mapping table; fall back to the default one
|
24
|
+
str1 = _str1.dup
|
25
|
+
str2 = _str2.dup
|
24
26
|
|
25
27
|
_compute_mixed_alignment(str1, str2, mappings)
|
26
28
|
end
|
@@ -63,7 +65,7 @@ class TextAlignment::MixedAlignment
|
|
63
65
|
end
|
64
66
|
|
65
67
|
cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
|
66
|
-
@similarity =
|
68
|
+
@similarity = compute_similarity(str1, str2, @sdiff)
|
67
69
|
@str1_match_initial = cmp.str1_match_initial
|
68
70
|
@str1_match_final = cmp.str1_match_final
|
69
71
|
@str2_match_initial = cmp.str2_match_initial
|
@@ -139,4 +141,14 @@ class TextAlignment::MixedAlignment
|
|
139
141
|
@position_map_end = posmap_end.sort.to_h
|
140
142
|
end
|
141
143
|
|
144
|
+
def compute_similarity(s1, s2, sdiff)
|
145
|
+
return 0 if sdiff.nil?
|
146
|
+
|
147
|
+
# compute the lcs only with non-whitespace letters
|
148
|
+
lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
|
149
|
+
return 0 if lcs == 0
|
150
|
+
|
151
|
+
similarity = lcs.to_f / [s1.scan(/\S/).count, s2.scan(/\S/).count].min
|
152
|
+
end
|
153
|
+
|
142
154
|
end
|
@@ -2,6 +2,7 @@
|
|
2
2
|
require 'text_alignment/constants'
|
3
3
|
require 'text_alignment/anchor_finder'
|
4
4
|
require 'text_alignment/mixed_alignment'
|
5
|
+
require 'text_alignment/cultivation_map'
|
5
6
|
|
6
7
|
module TextAlignment; end unless defined? TextAlignment
|
7
8
|
|
@@ -9,23 +10,206 @@ class TextAlignment::TextAlignment
|
|
9
10
|
attr_reader :block_alignment
|
10
11
|
attr_reader :similarity
|
11
12
|
attr_reader :lost_annotations
|
13
|
+
attr_reader :cultivation_map
|
12
14
|
|
13
|
-
def initialize(_str1, _str2,
|
15
|
+
def initialize(_str1, _str2, _denotations = nil, _cultivation_map = nil)
|
14
16
|
raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
|
15
17
|
|
16
|
-
@block_alignment = {source_text: _str1, target_text: _str2, denotations:
|
18
|
+
@block_alignment = {source_text: _str1, target_text: _str2, denotations: _denotations}
|
17
19
|
@original_str1 = _str1
|
18
20
|
@original_str2 = _str2
|
19
21
|
|
20
|
-
|
22
|
+
@str1_mapping = TextAlignment::CharMapping.new(_str1)
|
23
|
+
@str2_mapping = TextAlignment::CharMapping.new(_str2)
|
21
24
|
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
+
str1 = @str1_mapping.str
|
26
|
+
denotations = @str1_mapping.enmap_denotations(_denotations)
|
27
|
+
|
28
|
+
str2 = @str2_mapping.str
|
29
|
+
|
30
|
+
@cultivation_map = _cultivation_map || TextAlignment::CultivationMap.new
|
31
|
+
|
32
|
+
@block_alignment[:blocks] = if r = whole_block_alignment(str1, str2, @cultivation_map)
|
33
|
+
# whole block alignment
|
34
|
+
r
|
35
|
+
else
|
36
|
+
find_block_alignment(str1, str2, denotations, @cultivation_map)
|
37
|
+
end
|
38
|
+
|
39
|
+
newly_cultivated_regions = @block_alignment[:blocks].collect do |b|
|
40
|
+
if b[:alignment] == :block || b[:alignment] == :term
|
41
|
+
[b[:target][:begin], b[:target][:end]]
|
42
|
+
else
|
43
|
+
nil
|
44
|
+
end
|
45
|
+
end.compact
|
46
|
+
newly_cultivated_regions_condensed = newly_cultivated_regions.inject([]) do |condensed, region|
|
47
|
+
if condensed.empty? || (condensed.last.last + 1 < region.first)
|
48
|
+
condensed.push region
|
49
|
+
else
|
50
|
+
condensed.last[1] = region.last
|
51
|
+
end
|
52
|
+
condensed
|
53
|
+
end
|
54
|
+
|
55
|
+
@cultivation_map.cultivate(newly_cultivated_regions_condensed)
|
56
|
+
end
|
57
|
+
|
58
|
+
def transform_begin_position(_begin_position)
|
59
|
+
begin_position = @str1_mapping.enmap_position(_begin_position)
|
60
|
+
|
61
|
+
i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
|
62
|
+
block = @block_alignment[:blocks][i]
|
63
|
+
|
64
|
+
b = if block[:alignment] == :block || block[:alignment] == :term
|
65
|
+
begin_position + block[:delta]
|
66
|
+
elsif block[:alignment] == :empty
|
67
|
+
if begin_position == block[:source][:begin]
|
68
|
+
block[:target][:begin]
|
69
|
+
else
|
70
|
+
nil
|
71
|
+
end
|
72
|
+
else
|
73
|
+
r = block[:alignment].transform_begin_position(begin_position - block[:source][:begin])
|
74
|
+
r.nil? ? nil : r + block[:target][:begin]
|
25
75
|
end
|
26
76
|
|
77
|
+
@str2_mapping.demap_position(b)
|
78
|
+
end
|
79
|
+
|
80
|
+
def transform_end_position(_end_position)
|
81
|
+
end_position = @str1_mapping.enmap_position(_end_position)
|
82
|
+
|
83
|
+
i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
|
84
|
+
block = @block_alignment[:blocks][i]
|
85
|
+
|
86
|
+
e = if block[:alignment] == :block || block[:alignment] == :term
|
87
|
+
end_position + block[:delta]
|
88
|
+
elsif block[:alignment] == :empty
|
89
|
+
if end_position == block[:source][:end]
|
90
|
+
block[:target][:end]
|
91
|
+
else
|
92
|
+
nil
|
93
|
+
end
|
94
|
+
else
|
95
|
+
r = block[:alignment].transform_end_position(end_position - block[:source][:begin])
|
96
|
+
r.nil? ? nil : r + block[:target][:begin]
|
97
|
+
end
|
98
|
+
|
99
|
+
@str2_mapping.demap_position(e)
|
100
|
+
end
|
101
|
+
|
102
|
+
def transform_a_span(span)
|
103
|
+
{begin: transform_begin_position(span[:begin]), end: transform_end_position(span[:end])}
|
104
|
+
end
|
105
|
+
|
106
|
+
def transform_spans(spans)
|
107
|
+
spans.map{|span| transform_a_span(span)}
|
108
|
+
end
|
109
|
+
|
110
|
+
def transform_denotations!(denotations)
|
111
|
+
return nil if denotations.nil?
|
112
|
+
@lost_annotations = []
|
113
|
+
|
114
|
+
denotations.each do |d|
|
115
|
+
source = {begin:d.begin, end:d.end}
|
116
|
+
d.begin = transform_begin_position(d.begin);
|
117
|
+
d.end = transform_end_position(d.end);
|
118
|
+
raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_str2.length
|
119
|
+
rescue
|
120
|
+
@lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
|
121
|
+
d.begin = nil
|
122
|
+
d.end = nil
|
123
|
+
end
|
124
|
+
|
125
|
+
@lost_annotations
|
126
|
+
end
|
127
|
+
|
128
|
+
def transform_hdenotations(hdenotations)
|
129
|
+
return nil if hdenotations.nil?
|
130
|
+
@lost_annotations = []
|
131
|
+
|
132
|
+
r = hdenotations.collect do |d|
|
133
|
+
t = transform_a_span(d[:span])
|
134
|
+
raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_str2.length
|
135
|
+
new_d = d.dup.merge({span:t})
|
136
|
+
rescue
|
137
|
+
@lost_annotations << {source: d[:span], target:t}
|
138
|
+
nil
|
139
|
+
end.compact
|
140
|
+
|
141
|
+
r
|
142
|
+
end
|
143
|
+
|
144
|
+
def alignment_show
|
145
|
+
stext = @block_alignment[:source_text]
|
146
|
+
ttext = @block_alignment[:target_text]
|
147
|
+
|
148
|
+
show = ''
|
149
|
+
@block_alignment[:blocks].each do |a|
|
150
|
+
show += case a[:alignment]
|
151
|
+
when :block
|
152
|
+
"===== common (block) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
153
|
+
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
|
154
|
+
when :term
|
155
|
+
"===== common (term) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
156
|
+
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
|
157
|
+
when :empty
|
158
|
+
"xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
|
159
|
+
"<<<<< string 1 [#{a[:source][:begin]} - #{a[:source][:end]}]\n" +
|
160
|
+
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
|
161
|
+
">>>>> string 2 " +
|
162
|
+
if a[:target]
|
163
|
+
"[#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
164
|
+
ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
|
165
|
+
else
|
166
|
+
"[-]\n\n"
|
167
|
+
end
|
168
|
+
else
|
169
|
+
astr1 = ''
|
170
|
+
astr2 = ''
|
171
|
+
|
172
|
+
base = a[:source][:begin]
|
173
|
+
astr1 = a[:alignment].sdiff.map do |c|
|
174
|
+
case c.action
|
175
|
+
when '='
|
176
|
+
stext[c.old_position + base]
|
177
|
+
when '+'
|
178
|
+
'_'
|
179
|
+
when '-'
|
180
|
+
stext[c.old_position + base]
|
181
|
+
when '!'
|
182
|
+
stext[c.old_position + base] + '_'
|
183
|
+
end
|
184
|
+
end.join('')
|
185
|
+
|
186
|
+
base = a[:target][:begin]
|
187
|
+
astr2 = a[:alignment].sdiff.map do |c|
|
188
|
+
case c.action
|
189
|
+
when '='
|
190
|
+
ttext[c.new_position + base]
|
191
|
+
when '+'
|
192
|
+
ttext[c.new_position + base]
|
193
|
+
when '-'
|
194
|
+
'_'
|
195
|
+
when '!'
|
196
|
+
'_' + ttext[c.new_position + base]
|
197
|
+
end
|
198
|
+
end.join('')
|
199
|
+
|
200
|
+
"***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
|
201
|
+
"[#{astr1}]\n" +
|
202
|
+
"[#{astr2}]\n\n"
|
203
|
+
end
|
204
|
+
end
|
205
|
+
show
|
206
|
+
end
|
207
|
+
|
208
|
+
private
|
209
|
+
|
210
|
+
def find_block_alignment(str1, str2, denotations, cultivation_map)
|
27
211
|
## to find block alignments
|
28
|
-
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2,
|
212
|
+
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map)
|
29
213
|
|
30
214
|
blocks = []
|
31
215
|
while block = anchor_finder.get_next_anchor
|
@@ -68,12 +252,13 @@ class TextAlignment::TextAlignment
|
|
68
252
|
|
69
253
|
if b2 == e2
|
70
254
|
[
|
71
|
-
{source:{begin:b1, end:e1},
|
255
|
+
{source:{begin:b1, end:e1}, alignment: :empty},
|
72
256
|
block
|
73
257
|
]
|
74
258
|
else
|
259
|
+
len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
260
|
+
|
75
261
|
if b1 == 0 && b2 == 0
|
76
|
-
len_buffer = (e1 * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
77
262
|
b2 = e2 - len_buffer if e2 > len_buffer
|
78
263
|
end
|
79
264
|
|
@@ -85,6 +270,10 @@ class TextAlignment::TextAlignment
|
|
85
270
|
{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
|
86
271
|
block
|
87
272
|
]
|
273
|
+
elsif ((e2 - b2) - (e1 - b1)) > len_buffer
|
274
|
+
la_block1 = local_alignment_blocks(str1, b1, e1, str2, b2, b2 + len_buffer, denotations)
|
275
|
+
la_block2 = local_alignment_blocks(str1, b1, e1, str2, e2 - len_buffer, e2, denotations)
|
276
|
+
[la_block1, la_block2].max{|a, b| a.first[:similarity] <=> b.first[:similarity]} << block # keep the better of the head-window and tail-window alignments
|
88
277
|
else
|
89
278
|
local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations) << block
|
90
279
|
end
|
@@ -102,31 +291,53 @@ class TextAlignment::TextAlignment
|
|
102
291
|
b1 = last_block[:source][:end]
|
103
292
|
if b1 < str1.length
|
104
293
|
e1 = str1.length
|
105
|
-
|
106
294
|
b2 = last_block[:target][:end]
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
295
|
+
|
296
|
+
_str1 = str1[b1 ... e1]
|
297
|
+
if _str1.strip.empty?
|
298
|
+
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
111
299
|
else
|
112
|
-
|
300
|
+
if b2 < str2.length
|
301
|
+
len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
302
|
+
e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
|
303
|
+
|
304
|
+
local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
|
305
|
+
else
|
306
|
+
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
307
|
+
end
|
113
308
|
end
|
114
309
|
else
|
115
310
|
[]
|
116
311
|
end
|
117
312
|
end
|
118
|
-
|
119
|
-
@block_alignment[:blocks] = blocks2
|
120
313
|
end
|
121
314
|
|
122
|
-
def whole_block_alignment(str1, str2)
|
315
|
+
def whole_block_alignment(str1, str2, cultivation_map)
|
123
316
|
## Block exact match
|
124
|
-
|
317
|
+
search_position = 0
|
318
|
+
|
319
|
+
block_begin = begin
|
320
|
+
_block_begin = str2.index(str1, search_position)
|
321
|
+
break if _block_begin.nil?
|
322
|
+
search_position = cultivation_map.search_again_position(_block_begin)
|
323
|
+
_block_begin
|
324
|
+
end until search_position.nil?
|
325
|
+
|
125
326
|
unless block_begin.nil?
|
126
327
|
return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
127
328
|
end
|
128
329
|
|
129
|
-
|
330
|
+
search_position = 0
|
331
|
+
|
332
|
+
dstr1 = str1.downcase
|
333
|
+
dstr2 = str2.downcase
|
334
|
+
block_begin = begin
|
335
|
+
_block_begin = dstr2.index(dstr1, search_position)
|
336
|
+
break if _block_begin.nil?
|
337
|
+
search_position = cultivation_map.search_again_position(_block_begin)
|
338
|
+
_block_begin
|
339
|
+
end until search_position.nil?
|
340
|
+
|
130
341
|
unless block_begin.nil?
|
131
342
|
return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
|
132
343
|
end
|
@@ -144,7 +355,7 @@ class TextAlignment::TextAlignment
|
|
144
355
|
map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
|
145
356
|
|
146
357
|
position = 0
|
147
|
-
|
358
|
+
_tblocks = ds_in_scope.map do |term|
|
148
359
|
lex = term[:lex]
|
149
360
|
r = block2.index(lex, position)
|
150
361
|
if r.nil?
|
@@ -152,11 +363,11 @@ class TextAlignment::TextAlignment
|
|
152
363
|
break
|
153
364
|
end
|
154
365
|
position = r + lex.length
|
155
|
-
{source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, delta: r + b2 - term[:span][:begin]}
|
366
|
+
{source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, similarity: 0.9, delta: r + b2 - term[:span][:begin]}
|
156
367
|
end
|
157
368
|
|
158
369
|
# missing term found
|
159
|
-
|
370
|
+
_tblocks = [] if position.nil?
|
160
371
|
|
161
372
|
# redundant matching found
|
162
373
|
unless position.nil?
|
@@ -164,13 +375,13 @@ class TextAlignment::TextAlignment
|
|
164
375
|
lex = term[:lex]
|
165
376
|
look_forward = block2.index(lex, position)
|
166
377
|
unless look_forward.nil?
|
167
|
-
|
378
|
+
_tblocks = []
|
168
379
|
break
|
169
380
|
end
|
170
381
|
end
|
171
382
|
end
|
172
383
|
|
173
|
-
|
384
|
+
_tblocks
|
174
385
|
else
|
175
386
|
[]
|
176
387
|
end
|
@@ -184,7 +395,7 @@ class TextAlignment::TextAlignment
|
|
184
395
|
block2 = str2[b2 ... e2]
|
185
396
|
|
186
397
|
## character-based alignment
|
187
|
-
alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase
|
398
|
+
alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
|
188
399
|
if alignment.sdiff.nil?
|
189
400
|
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
190
401
|
else
|
@@ -196,7 +407,7 @@ class TextAlignment::TextAlignment
|
|
196
407
|
block2 = str2[b2 ... e2]
|
197
408
|
|
198
409
|
## character-based alignment
|
199
|
-
alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase
|
410
|
+
alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
|
200
411
|
if alignment.sdiff.nil?
|
201
412
|
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
202
413
|
else
|
@@ -244,157 +455,4 @@ class TextAlignment::TextAlignment
|
|
244
455
|
end
|
245
456
|
end
|
246
457
|
|
247
|
-
|
248
|
-
def indices(str, target)
|
249
|
-
position = 0
|
250
|
-
len = target.len
|
251
|
-
Enumerator.new do |yielder|
|
252
|
-
while idx = str.index(target, position)
|
253
|
-
yielder << idx
|
254
|
-
position = idx + len
|
255
|
-
end
|
256
|
-
end
|
257
|
-
end
|
258
|
-
|
259
|
-
def transform_begin_position(begin_position)
|
260
|
-
i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
|
261
|
-
block = @block_alignment[:blocks][i]
|
262
|
-
|
263
|
-
b = if block[:alignment] == :block || block[:alignment] == :term
|
264
|
-
begin_position + block[:delta]
|
265
|
-
elsif block[:alignment] == :empty
|
266
|
-
if begin_position == block[:source][:begin]
|
267
|
-
block[:target][:begin]
|
268
|
-
else
|
269
|
-
nil
|
270
|
-
end
|
271
|
-
else
|
272
|
-
r = block[:alignment].transform_begin_position(begin_position - block[:source][:begin])
|
273
|
-
r.nil? ? nil : r + block[:target][:begin]
|
274
|
-
end
|
275
|
-
end
|
276
|
-
|
277
|
-
def transform_end_position(end_position)
|
278
|
-
i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
|
279
|
-
block = @block_alignment[:blocks][i]
|
280
|
-
|
281
|
-
e = if block[:alignment] == :block || block[:alignment] == :term
|
282
|
-
end_position + block[:delta]
|
283
|
-
elsif block[:alignment] == :empty
|
284
|
-
if end_position == block[:source][:end]
|
285
|
-
block[:target][:end]
|
286
|
-
else
|
287
|
-
nil
|
288
|
-
end
|
289
|
-
else
|
290
|
-
r = block[:alignment].transform_end_position(end_position - block[:source][:begin])
|
291
|
-
r.nil? ? nil : r + block[:target][:begin]
|
292
|
-
end
|
293
|
-
end
|
294
|
-
|
295
|
-
def transform_a_span(span)
|
296
|
-
{begin: transform_begin_position(span[:begin]), end: transform_end_position(span[:end])}
|
297
|
-
end
|
298
|
-
|
299
|
-
def transform_spans(spans)
|
300
|
-
spans.map{|span| transform_a_span(span)}
|
301
|
-
end
|
302
|
-
|
303
|
-
def transform_denotations!(denotations)
|
304
|
-
return nil if denotations.nil?
|
305
|
-
@lost_annotations = []
|
306
|
-
|
307
|
-
denotations.each do |d|
|
308
|
-
source = {begin:d.begin, end:d.end}
|
309
|
-
d.begin = transform_begin_position(d.begin);
|
310
|
-
d.end = transform_end_position(d.end);
|
311
|
-
raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_str2.length
|
312
|
-
rescue
|
313
|
-
@lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
|
314
|
-
d.begin = nil
|
315
|
-
d.end = nil
|
316
|
-
end
|
317
|
-
|
318
|
-
@lost_annotations
|
319
|
-
end
|
320
|
-
|
321
|
-
def transform_hdenotations(hdenotations)
|
322
|
-
return nil if hdenotations.nil?
|
323
|
-
@lost_annotations = []
|
324
|
-
|
325
|
-
r = hdenotations.collect do |d|
|
326
|
-
t = transform_a_span(d[:span])
|
327
|
-
raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_str2.length
|
328
|
-
new_d = d.dup.merge({span:t})
|
329
|
-
rescue
|
330
|
-
@lost_annotations << {source: d[:span], target:t}
|
331
|
-
nil
|
332
|
-
end.compact
|
333
|
-
|
334
|
-
r
|
335
|
-
end
|
336
|
-
|
337
|
-
def alignment_show
|
338
|
-
stext = @block_alignment[:source_text]
|
339
|
-
ttext = @block_alignment[:target_text]
|
340
|
-
|
341
|
-
show = ''
|
342
|
-
@block_alignment[:blocks].each do |a|
|
343
|
-
show += case a[:alignment]
|
344
|
-
when :block
|
345
|
-
"===== common (block) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
346
|
-
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
|
347
|
-
when :term
|
348
|
-
"===== common (term) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
349
|
-
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
|
350
|
-
when :empty
|
351
|
-
"xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
|
352
|
-
"<<<<< string 1 [#{a[:source][:begin]} - #{a[:source][:end]}]\n" +
|
353
|
-
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
|
354
|
-
">>>>> string 2 " +
|
355
|
-
if a[:target]
|
356
|
-
"[#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
357
|
-
ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
|
358
|
-
else
|
359
|
-
"[-]\n\n"
|
360
|
-
end
|
361
|
-
else
|
362
|
-
astr1 = ''
|
363
|
-
astr2 = ''
|
364
|
-
|
365
|
-
base = a[:source][:begin]
|
366
|
-
astr1 = a[:alignment].sdiff.map do |c|
|
367
|
-
case c.action
|
368
|
-
when '='
|
369
|
-
stext[c.old_position + base]
|
370
|
-
when '+'
|
371
|
-
'_'
|
372
|
-
when '-'
|
373
|
-
stext[c.old_position + base]
|
374
|
-
when '!'
|
375
|
-
stext[c.old_position + base] + '_'
|
376
|
-
end
|
377
|
-
end.join('')
|
378
|
-
|
379
|
-
base = a[:target][:begin]
|
380
|
-
astr2 = a[:alignment].sdiff.map do |c|
|
381
|
-
case c.action
|
382
|
-
when '='
|
383
|
-
ttext[c.new_position + base]
|
384
|
-
when '+'
|
385
|
-
ttext[c.new_position + base]
|
386
|
-
when '-'
|
387
|
-
'_'
|
388
|
-
when '!'
|
389
|
-
'_' + ttext[c.new_position + base]
|
390
|
-
end
|
391
|
-
end.join('')
|
392
|
-
|
393
|
-
"***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
|
394
|
-
"[#{astr1}]\n" +
|
395
|
-
"[#{astr2}]\n\n"
|
396
|
-
end
|
397
|
-
end
|
398
|
-
show
|
399
|
-
end
|
400
458
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.10.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-03-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|
@@ -77,7 +77,9 @@ files:
|
|
77
77
|
- lib/text_alignment.rb
|
78
78
|
- lib/text_alignment/anchor_finder.rb
|
79
79
|
- lib/text_alignment/approximate_fit.rb
|
80
|
+
- lib/text_alignment/char_mapping.rb
|
80
81
|
- lib/text_alignment/constants.rb
|
82
|
+
- lib/text_alignment/cultivation_map.rb
|
81
83
|
- lib/text_alignment/find_divisions.rb
|
82
84
|
- lib/text_alignment/glcs_alignment.rb
|
83
85
|
- lib/text_alignment/glcs_alignment_fast.rb
|
@@ -86,7 +88,6 @@ files:
|
|
86
88
|
- lib/text_alignment/lcs_cdiff.rb
|
87
89
|
- lib/text_alignment/lcs_comparison.rb
|
88
90
|
- lib/text_alignment/lcs_min.rb
|
89
|
-
- lib/text_alignment/mappings.rb
|
90
91
|
- lib/text_alignment/mixed_alignment.rb
|
91
92
|
- lib/text_alignment/text_alignment.rb
|
92
93
|
- lib/text_alignment/version.rb
|