text_alignment 0.9.1 → 0.11.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/align_annotations +24 -14
- data/lib/text_alignment/anchor_finder.rb +124 -70
- data/lib/text_alignment/{mappings.rb → char_mapping.rb} +93 -75
- data/lib/text_alignment/cultivation_map.rb +94 -0
- data/lib/text_alignment/glcs_alignment_fast.rb +2 -2
- data/lib/text_alignment/mixed_alignment.rb +21 -3
- data/lib/text_alignment/text_alignment.rb +276 -243
- data/lib/text_alignment/version.rb +1 -1
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0f019e7fbd144890e96eda8f2cf9b27cf091930c96b81236452172a5142e2cf3
|
4
|
+
data.tar.gz: 47d32ec727511d53730bf56557992f972f7747832f9437f5e2f5798cd0764f41
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bf2720ce7af3612a8c0b1823bf6265e90f0d5e92f315d7eb697c4b13e1c9752e795adb5b1dbc840629379f3e96cf94115dd5d9400663f1d43a4caf428274f69a
|
7
|
+
data.tar.gz: d7bca56968c54fa68d83b5e9d89aa89107774cde761117e28d561cf9c63226b08a613e84b3484f2df58edaadfac7e1b286155bc3486559d6fd7c63d6ec082907
|
data/bin/align_annotations
CHANGED
@@ -26,8 +26,8 @@ def read_text(filename)
|
|
26
26
|
end
|
27
27
|
end
|
28
28
|
|
29
|
-
def align_denotations(denotations, source_text,
|
30
|
-
alignment
|
29
|
+
def align_denotations(denotations, source_text, alignment, debug = false)
|
30
|
+
alignment.align(source_text, denotations)
|
31
31
|
new_denotations = alignment.transform_hdenotations(denotations)
|
32
32
|
|
33
33
|
if debug
|
@@ -37,7 +37,7 @@ def align_denotations(denotations, source_text, target_text, debug = false)
|
|
37
37
|
end
|
38
38
|
|
39
39
|
lost_annotations = alignment.lost_annotations
|
40
|
-
unless lost_annotations.empty?
|
40
|
+
unless lost_annotations.nil? || lost_annotations.empty?
|
41
41
|
warn "\n[lost annotations] #{lost_annotations.length}"
|
42
42
|
lost_annotations.each do |a|
|
43
43
|
warn "#{a}"
|
@@ -50,8 +50,8 @@ def align_denotations(denotations, source_text, target_text, debug = false)
|
|
50
50
|
new_denotations
|
51
51
|
end
|
52
52
|
|
53
|
-
def align_mannotations(source_annotations,
|
54
|
-
target_annotations = {text:
|
53
|
+
def align_mannotations(source_annotations, reference_text, alignment, debug = false)
|
54
|
+
target_annotations = {text:reference_text}
|
55
55
|
|
56
56
|
idnum_denotations = 0
|
57
57
|
idnum_relations = 0
|
@@ -62,7 +62,8 @@ def align_mannotations(source_annotations, target_text, debug = false)
|
|
62
62
|
if annotations.has_key?(:denotations) && !annotations[:denotations].empty?
|
63
63
|
ididx = {}
|
64
64
|
warn "[#{i}]-=-=-=-=-"
|
65
|
-
denotations = align_denotations(annotations[:denotations], annotations[:text],
|
65
|
+
denotations = align_denotations(annotations[:denotations], annotations[:text], alignment, debug)
|
66
|
+
|
66
67
|
denotations.each do |d|
|
67
68
|
reid = 'T' + (idnum_denotations += 1).to_s
|
68
69
|
ididx[d[:id]] = reid
|
@@ -76,7 +77,9 @@ def align_mannotations(source_annotations, target_text, debug = false)
|
|
76
77
|
annotations[:relations].each do |r|
|
77
78
|
reid = 'R' + (idnum_relations += 1).to_s
|
78
79
|
ididx[r[:id]] = reid
|
79
|
-
|
80
|
+
sid = ididx[r[:subj]]
|
81
|
+
oid = ididx[r[:obj]]
|
82
|
+
target_annotations[:relations] << r.dup.merge({id:reid, subj:sid, obj:oid}) unless sid.nil? || oid.nil?
|
80
83
|
end
|
81
84
|
end
|
82
85
|
|
@@ -85,7 +88,8 @@ def align_mannotations(source_annotations, target_text, debug = false)
|
|
85
88
|
annotations[:attributes].each do |a|
|
86
89
|
reid = 'A' + (idnum_attributes += 1).to_s
|
87
90
|
ididx[a[:id]] = reid
|
88
|
-
|
91
|
+
sid = ididx[a[:subj]]
|
92
|
+
target_annotations[:attributes] << a.dup.merge({id:reid, subj:sid}) unless sid.nil?
|
89
93
|
end
|
90
94
|
end
|
91
95
|
|
@@ -94,7 +98,8 @@ def align_mannotations(source_annotations, target_text, debug = false)
|
|
94
98
|
annotations[:modifications].each do |m|
|
95
99
|
reid = 'M' + (idnum_modifications += 1).to_s
|
96
100
|
ididx[m[:id]] = reid
|
97
|
-
|
101
|
+
oid = ididx[m[:obj]]
|
102
|
+
target_annotations[:modifications] << m.dup.merge({id:reid, obj:oid}) unless oid.nil?
|
98
103
|
end
|
99
104
|
end
|
100
105
|
end
|
@@ -109,13 +114,18 @@ unless ARGV.length == 2
|
|
109
114
|
end
|
110
115
|
|
111
116
|
source_annotations = read_annotations(ARGV[0])
|
112
|
-
|
117
|
+
reference_text = read_text(ARGV[1])
|
118
|
+
|
119
|
+
alignment = TextAlignment::TextAlignment.new(reference_text, true)
|
113
120
|
|
114
121
|
target_annotations = if source_annotations.class == Array
|
115
|
-
align_mannotations(source_annotations,
|
122
|
+
# align_mannotations(source_annotations, reference_text, alignment, true)
|
123
|
+
align_mannotations(source_annotations, reference_text, alignment, false)
|
116
124
|
else
|
117
|
-
denotations = align_denotations(source_annotations[:denotations], source_annotations[:text],
|
118
|
-
source_annotations
|
125
|
+
# denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment)
|
126
|
+
denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment, false)
|
127
|
+
source_annotations.merge({text:reference_text, denotations:denotations})
|
119
128
|
end
|
120
129
|
|
121
|
-
#
|
130
|
+
# pp alignment.block_alignment
|
131
|
+
# puts target_annotations.to_json
|
@@ -6,92 +6,145 @@ module TextAlignment; end unless defined? TextAlignment
|
|
6
6
|
|
7
7
|
class TextAlignment::AnchorFinder
|
8
8
|
|
9
|
-
def initialize(source_str, target_str,
|
10
|
-
@
|
11
|
-
@
|
12
|
-
@sim_threshold = _text_similiarity_threshold || TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
9
|
+
def initialize(source_str, target_str, cultivation_map)
|
10
|
+
@s1 = source_str.downcase
|
11
|
+
@s2 = target_str.downcase
|
13
12
|
|
14
|
-
@
|
13
|
+
@cultivation_map = cultivation_map
|
15
14
|
|
16
|
-
@
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
15
|
+
@size_ngram = TextAlignment::SIZE_NGRAM
|
16
|
+
@size_window = TextAlignment::SIZE_WINDOW
|
17
|
+
@sim_threshold = TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
18
|
+
@pos_s1_final_possible_begin = @s1.length - @size_ngram - 1
|
19
|
+
@pos_s2_final_possible_end = @s2.length
|
21
20
|
|
22
|
-
#
|
23
|
-
@
|
24
|
-
@
|
25
|
-
@end_s2_prev = 0
|
21
|
+
# positions of last match
|
22
|
+
@pos_s1_last_match = 0
|
23
|
+
@pos_s2_last_match = 0
|
26
24
|
end
|
27
25
|
|
28
26
|
def get_next_anchor
|
29
|
-
# find the
|
30
|
-
|
31
|
-
if [' ', "\n", "\t"].include? @s1[@beg_s1]
|
32
|
-
@beg_s1 += 1
|
33
|
-
next
|
34
|
-
end
|
35
|
-
anchor = @s1[@beg_s1, @size_ngram]
|
36
|
-
|
37
|
-
# search_position = 0
|
38
|
-
search_position = @end_s2_prev
|
39
|
-
while @beg_s2 = @s2.index(anchor, search_position)
|
40
|
-
# if both the begining points are sufficiantly close to the end points of the last match
|
41
|
-
break if @beg_s1 > 0 && @beg_s2 > 0 && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
|
27
|
+
# To find the beginning positions of an anchor ngram in s1 and s2, beginning from the last positions matched
|
28
|
+
beg_s2 = for beg_s1 in @pos_s1_last_match .. @pos_s1_final_possible_begin
|
42
29
|
|
43
|
-
|
44
|
-
|
30
|
+
# To skip whitespace letters
|
31
|
+
next if [' ', "\n", "\t"].include? @s1[beg_s1]
|
45
32
|
|
46
|
-
|
47
|
-
|
33
|
+
_beg_s2 = get_beg_s2(beg_s1)
|
34
|
+
break _beg_s2 unless _beg_s2.nil?
|
35
|
+
end
|
48
36
|
|
49
|
-
|
50
|
-
|
37
|
+
# To return nil when it fails to find an anchor
|
38
|
+
return nil if beg_s2.class == Range
|
51
39
|
|
52
|
-
|
40
|
+
# To extend the block to the left
|
41
|
+
b1 = beg_s1
|
42
|
+
b2 = beg_s2
|
43
|
+
left_boundary_b2 = [@pos_s2_last_match, (@cultivation_map.last_cultivated_position(b2) || 0)].max
|
44
|
+
while b1 > @pos_s1_last_match && b2 > left_boundary_b2 && @s1[b1 - 1] == @s2[b2 - 1]
|
45
|
+
b1 -= 1; b2 -= 1
|
46
|
+
end
|
53
47
|
|
54
|
-
|
48
|
+
# To extend the block to the right
|
49
|
+
e1 = beg_s1 + @size_ngram
|
50
|
+
e2 = beg_s2 + @size_ngram
|
51
|
+
right_boundary_b2 = @cultivation_map.next_cultivated_position(e2) || @pos_s2_final_possible_end
|
52
|
+
while @s1[e1] && e2 < right_boundary_b2 && @s1[e1] == @s2[e2]
|
53
|
+
e1 += 1; e2 += 1
|
55
54
|
end
|
56
55
|
|
57
|
-
|
56
|
+
@pos_s1_last_match = e1
|
57
|
+
@pos_s2_last_match = e2
|
58
58
|
|
59
|
-
|
60
|
-
|
61
|
-
b2 = @beg_s2
|
62
|
-
while b1 >= @end_s1_prev && b2 >= @end_s2_prev && @s1[b1] == @s2[b2]
|
63
|
-
b1 -= 1; b2 -= 1
|
64
|
-
end
|
59
|
+
{source:{begin:b1 , end:e1}, target:{begin:b2, end:e2}}
|
60
|
+
end
|
65
61
|
|
66
|
-
|
62
|
+
private
|
67
63
|
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
64
|
+
def get_beg_s2(beg_s1)
|
65
|
+
# to get the anchor to search for in s2
|
66
|
+
anchor = @s1[beg_s1, @size_ngram]
|
67
|
+
|
68
|
+
# comment out below with the assumption that texts are in the same order
|
69
|
+
# search_position = 0
|
70
|
+
search_position = @pos_s2_last_match
|
71
|
+
|
72
|
+
beg_s2_candidates = find_beg_s2_candidates(anchor, search_position)
|
73
|
+
return nil if beg_s2_candidates.empty?
|
74
|
+
|
75
|
+
find_valid_beg_s2(beg_s1, beg_s2_candidates)
|
76
|
+
end
|
77
|
+
|
78
|
+
# To find beg_s2 which match to the anchor
|
79
|
+
# return an empty array if the anchor is too frequent
|
80
|
+
def find_beg_s2_candidates(anchor, search_position)
|
81
|
+
candidates = []
|
82
|
+
while _beg_s2 = @cultivation_map.index(anchor, @s2, search_position)
|
83
|
+
candidates << _beg_s2
|
84
|
+
|
85
|
+
# for speed, skip anchor of high frequency
|
86
|
+
if candidates.length > 5
|
87
|
+
candidates.clear
|
88
|
+
break
|
89
|
+
end
|
90
|
+
|
91
|
+
search_position = _beg_s2 + 1
|
72
92
|
end
|
93
|
+
candidates
|
94
|
+
end
|
73
95
|
|
74
|
-
|
75
|
-
|
76
|
-
@beg_s1 = e1
|
96
|
+
def find_valid_beg_s2(beg_s1, beg_s2_candidates)
|
97
|
+
valid_beg_s2 = nil
|
77
98
|
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
99
|
+
(10 .. 30).step(10).each do |size_window|
|
100
|
+
valid_beg_s2 = nil
|
101
|
+
|
102
|
+
r = beg_s2_candidates.each do |beg_s2|
|
103
|
+
# if both the beginning points are sufficiently close to the end points of the last match
|
104
|
+
# break if beg_s1 > 0 && beg_s2 > 0 && (beg_s1 - @pos_s1_last_match < 5) && (beg_s2 >= @pos_s2_last_match) && (beg_s2 - @pos_s2_last_match < 5)
|
105
|
+
if beg_s1 > 0 && beg_s2 > 0 && (beg_s1 - @pos_s1_last_match < 5) && (beg_s2 - @pos_s2_last_match < 5)
|
106
|
+
break unless valid_beg_s2.nil?
|
107
|
+
valid_beg_s2 = beg_s2
|
108
|
+
next
|
109
|
+
end
|
110
|
+
|
111
|
+
left_window_s1, left_window_s2 = get_left_windows(beg_s1, beg_s2, size_window)
|
112
|
+
if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
|
113
|
+
break unless valid_beg_s2.nil?
|
114
|
+
valid_beg_s2 = beg_s2
|
115
|
+
next
|
116
|
+
end
|
117
|
+
|
118
|
+
right_window_s1, right_window_s2 = get_right_windows(beg_s1, beg_s2, size_window)
|
119
|
+
if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
|
120
|
+
break unless valid_beg_s2.nil?
|
121
|
+
valid_beg_s2 = beg_s2
|
122
|
+
next
|
123
|
+
end
|
124
|
+
end
|
125
|
+
|
126
|
+
# r == nil means that the inner loop was broken (multiple candidates had passed the tests)
|
127
|
+
# r != nil means that the inner loop was completed (with or w/o a valid beg_s2 found)
|
128
|
+
if r.nil?
|
129
|
+
valid_beg_s2 = nil
|
130
|
+
else
|
131
|
+
break
|
132
|
+
end
|
82
133
|
end
|
134
|
+
|
135
|
+
valid_beg_s2
|
83
136
|
end
|
84
137
|
|
85
|
-
|
138
|
+
def get_left_windows(beg_s1, beg_s2, size_window = nil)
|
139
|
+
size_window ||= @size_window
|
86
140
|
|
87
|
-
|
88
|
-
#
|
89
|
-
# return if @beg_s1 < @size_window || @beg_s2 < @size_window
|
141
|
+
# comment out below with the assumption that the beginning of a document gives significant locational information
|
142
|
+
# return if @beg_s1 < size_window || @beg_s2 < size_window
|
90
143
|
|
91
144
|
window_s1 = ''
|
92
|
-
loc =
|
145
|
+
loc = beg_s1 - 1
|
93
146
|
count = 0
|
94
|
-
while count <
|
147
|
+
while count < size_window && loc >= 0
|
95
148
|
if @s1[loc] =~ /[0-9a-zA-Z]/
|
96
149
|
window_s1 += @s1[loc]
|
97
150
|
count += 1
|
@@ -100,9 +153,9 @@ class TextAlignment::AnchorFinder
|
|
100
153
|
end
|
101
154
|
|
102
155
|
window_s2 = ''
|
103
|
-
loc =
|
156
|
+
loc = beg_s2 - 1
|
104
157
|
count = 0
|
105
|
-
while count <
|
158
|
+
while count < size_window && loc >= 0
|
106
159
|
if @s2[loc] =~ /[0-9a-zA-Z]/
|
107
160
|
window_s2 += @s2[loc]
|
108
161
|
count += 1
|
@@ -113,15 +166,17 @@ class TextAlignment::AnchorFinder
|
|
113
166
|
[window_s1, window_s2]
|
114
167
|
end
|
115
168
|
|
116
|
-
def get_right_windows
|
169
|
+
def get_right_windows(beg_s1, beg_s2, size_window = nil)
|
170
|
+
size_window ||= @size_window
|
171
|
+
|
117
172
|
# comment out below with the assumption that the end of a document gives significant locational information
|
118
|
-
# return if (@beg_s1 + @size_ngram > (@s1.length -
|
173
|
+
# return if (@beg_s1 + @size_ngram > (@s1.length - size_window)) || (@beg_s2 + @size_ngram > (@s2.length - size_window))
|
119
174
|
|
120
175
|
window_s1 = ''
|
121
|
-
loc =
|
176
|
+
loc = beg_s1 + @size_ngram
|
122
177
|
len_s1 = @s1.length
|
123
178
|
count = 0
|
124
|
-
while count <
|
179
|
+
while count < size_window && loc < len_s1
|
125
180
|
if @s1[loc] =~ /[0-9a-zA-Z]/
|
126
181
|
window_s1 += @s1[loc]
|
127
182
|
count += 1
|
@@ -130,10 +185,10 @@ class TextAlignment::AnchorFinder
|
|
130
185
|
end
|
131
186
|
|
132
187
|
window_s2 = ''
|
133
|
-
loc =
|
188
|
+
loc = beg_s2 + @size_ngram
|
134
189
|
len_s2 = @s2.length
|
135
190
|
count = 0
|
136
|
-
while count <
|
191
|
+
while count < size_window && loc < len_s2
|
137
192
|
if @s2[loc] =~ /[0-9a-zA-Z]/
|
138
193
|
window_s2 += @s2[loc]
|
139
194
|
count += 1
|
@@ -148,5 +203,4 @@ class TextAlignment::AnchorFinder
|
|
148
203
|
return 0 if str1.nil? || str2.nil?
|
149
204
|
String::Similarity.cosine(str1, str2, ngram:ngram_order)
|
150
205
|
end
|
151
|
-
|
152
|
-
end
|
206
|
+
end
|
@@ -1,6 +1,6 @@
|
|
1
1
|
module TextAlignment; end unless defined? TextAlignment
|
2
2
|
|
3
|
-
TextAlignment::
|
3
|
+
TextAlignment::CHAR_MAPPING = [
|
4
4
|
["©", "(c)"], #U+00A9 (Copyright Sign)
|
5
5
|
|
6
6
|
["α", "alpha"], #U+03B1 (greek small letter alpha)
|
@@ -61,9 +61,11 @@ TextAlignment::MAPPINGS = [
|
|
61
61
|
["•", "*"], #U+2022 (bullet)
|
62
62
|
[" ", " "], #U+2009 (thin space)
|
63
63
|
[" ", " "], #U+200A (hair space)
|
64
|
-
["
|
64
|
+
[" ", " "], #U+202F (narrow no-break space)
|
65
|
+
[" ", " "], #U+00A0 (Non-Breaking space)
|
65
66
|
[" ", " "], #U+3000 (ideographic space)
|
66
|
-
["
|
67
|
+
["‐", "-"], #U+2010 (Hyphen)
|
68
|
+
["‑", "-"], #U+2011 (Non-Breaking Hyphen)
|
67
69
|
["−", "-"], #U+2212 (minus sign)
|
68
70
|
["–", "-"], #U+2013 (en dash)
|
69
71
|
["′", "'"], #U+2032 (prime)
|
@@ -75,98 +77,114 @@ TextAlignment::MAPPINGS = [
|
|
75
77
|
]
|
76
78
|
|
77
79
|
|
78
|
-
|
80
|
+
class TextAlignment::CharMapping
|
81
|
+
attr_reader :mapped_text
|
79
82
|
|
83
|
+
def initialize(_text, char_mapping = nil)
|
84
|
+
char_mapping ||= TextAlignment::CHAR_MAPPING
|
85
|
+
@mapped_text, offset_mapping = enmap_text(_text, char_mapping)
|
86
|
+
@index_enmap = offset_mapping.to_h
|
87
|
+
@index_demap = offset_mapping.map{|m| m.reverse}.to_h
|
88
|
+
end
|
80
89
|
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
character_mappings = _mappings.select{|m| m[0].length == 1 && m[1].length == 1}
|
86
|
-
if character_mappings.empty?
|
87
|
-
[_str1, _str2, _mappings]
|
88
|
-
else
|
89
|
-
characters_from = character_mappings.collect{|m| m[0]}.join
|
90
|
-
characters_to = character_mappings.collect{|m| m[1]}.join
|
91
|
-
characters_to.gsub!(/-/, '\-')
|
90
|
+
def enmap_position(position)
|
91
|
+
@index_enmap[position]
|
92
|
+
end
|
92
93
|
|
93
|
-
|
94
|
-
|
94
|
+
def demap_position(position)
|
95
|
+
@index_demap[position]
|
96
|
+
end
|
95
97
|
|
96
|
-
|
98
|
+
def enmap_denotations(denotations)
|
99
|
+
return nil if denotations.nil?
|
97
100
|
|
98
|
-
|
101
|
+
denotations.map do |d|
|
102
|
+
d.dup.merge(span:{begin:enmap_position(d[:span][:begin]), end:enmap_position(d[:span][:end])})
|
99
103
|
end
|
100
104
|
end
|
101
105
|
|
102
|
-
|
103
|
-
_mappings ||= TextAlignment::MAPPINGS
|
104
|
-
|
105
|
-
long_to_one_mappings = _mappings.select{|m| m[0].length == 1 && m[1].length > 1}
|
106
|
-
if long_to_one_mappings.empty?
|
107
|
-
[_str1, _str2, _mappings]
|
108
|
-
else
|
109
|
-
## long to one character mappings
|
110
|
-
pletters = TextAlignment::PADDING_LETTERS
|
111
|
-
|
112
|
-
# find the padding letter for str1
|
113
|
-
@padding_letter1 = begin
|
114
|
-
i = pletters.index{|l| _str2.index(l).nil?}
|
115
|
-
raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
|
116
|
-
TextAlignment::PADDING_LETTERS[i]
|
117
|
-
end
|
106
|
+
private
|
118
107
|
|
119
|
-
|
120
|
-
|
121
|
-
i = pletters.index{|l| l != @padding_letter1 && _str1.index(l).nil?}
|
122
|
-
raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
|
123
|
-
TextAlignment::PADDING_LETTERS[i]
|
124
|
-
end
|
108
|
+
def enmap_text(_text, char_mapping)
|
109
|
+
text = _text.dup
|
125
110
|
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
else
|
141
|
-
_str2
|
142
|
-
end
|
111
|
+
# To execute the single letter mapping
|
112
|
+
char_mapping.each do |one, long|
|
113
|
+
text.gsub!(one, long) if long.length == 1
|
114
|
+
end
|
115
|
+
|
116
|
+
# To get the (location, length) index for replacements
|
117
|
+
loc_len = []
|
118
|
+
char_mapping.each do |one, long|
|
119
|
+
next if long.length == 1
|
120
|
+
|
121
|
+
init_next = 0
|
122
|
+
while loc = text.index(long, init_next)
|
123
|
+
loc_len << [loc, long.length]
|
124
|
+
init_next = loc + long.length
|
143
125
|
end
|
144
|
-
mappings = _mappings.select{|m| m[0].length > 1 || m[1].length == 1}
|
145
126
|
|
146
|
-
|
127
|
+
# a workaround to avoid messing-up due to embedding
|
128
|
+
text.gsub!(long, one * long.length)
|
147
129
|
end
|
148
|
-
end
|
149
130
|
|
150
|
-
|
151
|
-
|
131
|
+
# To get the (location, length) index for consecutive whitespace sequences
|
132
|
+
init_next = 0
|
133
|
+
while loc = text.index(/\s{2,}/, init_next)
|
134
|
+
len = $~[0].length
|
135
|
+
loc_len << [loc, len]
|
136
|
+
init_next = loc + len
|
137
|
+
end
|
138
|
+
|
139
|
+
loc_len.sort!{|a, b| a[0] <=> b[0]}
|
140
|
+
|
141
|
+
# To get the offset_mapping before and after replacement
|
142
|
+
offset_mapping = []
|
143
|
+
init_next = 0
|
144
|
+
j = 0
|
152
145
|
|
153
|
-
|
154
|
-
|
155
|
-
|
146
|
+
loc_len.each do |loc, len|
|
147
|
+
offset_mapping += (init_next .. loc).map do |i|
|
148
|
+
j += 1
|
149
|
+
[i, j - 1]
|
150
|
+
end
|
151
|
+
init_next = loc + len
|
152
|
+
end
|
156
153
|
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
_s1
|
154
|
+
offset_mapping += (init_next .. text.length).map do |i|
|
155
|
+
j += 1
|
156
|
+
[i, j - 1]
|
161
157
|
end
|
162
158
|
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
_s2
|
159
|
+
# To execute the long letter mapping
|
160
|
+
char_mapping.each do |one, long|
|
161
|
+
text.gsub!(one * long.length, one) if long.length > 1
|
167
162
|
end
|
168
163
|
|
169
|
-
|
164
|
+
# To replace multi whitespace sequences to a space
|
165
|
+
text.gsub!(/\s{2,}/, ' ')
|
166
|
+
|
167
|
+
[text, offset_mapping]
|
170
168
|
end
|
169
|
+
end
|
170
|
+
|
171
|
+
if __FILE__ == $0
|
172
|
+
require 'json'
|
173
|
+
|
174
|
+
unless ARGV.length == 1
|
175
|
+
warn "#{$0} an_annotation_json_file.json"
|
176
|
+
exit
|
177
|
+
end
|
178
|
+
annotations = JSON.parse File.read(ARGV[0]).strip, symbolize_names: true
|
179
|
+
denotations = annotations[:denotations] || []
|
180
|
+
if denotations.nil? && annotations[:tracks]
|
181
|
+
denotations = annotations[:tracks].first[:denotations]
|
182
|
+
end
|
183
|
+
|
184
|
+
text_mapping = TextAlignment::CharMapping.new(annotations[:text])
|
185
|
+
text_mapped = text_mapping.mapped_text
|
186
|
+
denotations_mapped = text_mapping.enmap_denotations(denotations)
|
187
|
+
new_annotations = {text:text_mapped, denotations:denotations_mapped}
|
171
188
|
|
189
|
+
puts new_annotations.to_json
|
172
190
|
end
|