text_alignment 0.9 → 0.11.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/align_annotations +24 -14
- data/lib/text_alignment/anchor_finder.rb +120 -70
- data/lib/text_alignment/{mappings.rb → char_mapping.rb} +93 -75
- data/lib/text_alignment/cultivation_map.rb +94 -0
- data/lib/text_alignment/glcs_alignment_fast.rb +2 -2
- data/lib/text_alignment/mixed_alignment.rb +21 -3
- data/lib/text_alignment/text_alignment.rb +276 -243
- data/lib/text_alignment/version.rb +1 -1
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 81994ec9a8c7c08d2aad32b351b5942fef1748e4035ba762af546d4f3fe7cee8
|
4
|
+
data.tar.gz: bfa75451d33b9d21c2baa1a52280f03486bf10a41b5ac2a97469f0ca3a4f7379
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 73d0ff212a89d6ad33751f87f14a9b292b45ce177c61efd0ede5f852eb3834d1bd3940d202c4d87cfb0422c4dc566dea30c560f8500a220dcd3e1dd492e29eac
|
7
|
+
data.tar.gz: 6c763a564e339267624bec5c809b334b0adf9951d25409eac21ae4b4582beae4a08739119dfa369022c44c19d4faacfd0e06766c6223bd219a43c3961cfab08c
|
data/bin/align_annotations
CHANGED
@@ -26,8 +26,8 @@ def read_text(filename)
|
|
26
26
|
end
|
27
27
|
end
|
28
28
|
|
29
|
-
def align_denotations(denotations, source_text,
|
30
|
-
alignment
|
29
|
+
def align_denotations(denotations, source_text, alignment, debug = false)
|
30
|
+
alignment.align(source_text, denotations)
|
31
31
|
new_denotations = alignment.transform_hdenotations(denotations)
|
32
32
|
|
33
33
|
if debug
|
@@ -37,7 +37,7 @@ def align_denotations(denotations, source_text, target_text, debug = false)
|
|
37
37
|
end
|
38
38
|
|
39
39
|
lost_annotations = alignment.lost_annotations
|
40
|
-
unless lost_annotations.empty?
|
40
|
+
unless lost_annotations.nil? || lost_annotations.empty?
|
41
41
|
warn "\n[lost annotations] #{lost_annotations.length}"
|
42
42
|
lost_annotations.each do |a|
|
43
43
|
warn "#{a}"
|
@@ -50,8 +50,8 @@ def align_denotations(denotations, source_text, target_text, debug = false)
|
|
50
50
|
new_denotations
|
51
51
|
end
|
52
52
|
|
53
|
-
def align_mannotations(source_annotations,
|
54
|
-
target_annotations = {text:
|
53
|
+
def align_mannotations(source_annotations, reference_text, alignment, debug = false)
|
54
|
+
target_annotations = {text:reference_text}
|
55
55
|
|
56
56
|
idnum_denotations = 0
|
57
57
|
idnum_relations = 0
|
@@ -62,7 +62,8 @@ def align_mannotations(source_annotations, target_text, debug = false)
|
|
62
62
|
if annotations.has_key?(:denotations) && !annotations[:denotations].empty?
|
63
63
|
ididx = {}
|
64
64
|
warn "[#{i}]-=-=-=-=-"
|
65
|
-
denotations = align_denotations(annotations[:denotations], annotations[:text],
|
65
|
+
denotations = align_denotations(annotations[:denotations], annotations[:text], alignment, debug)
|
66
|
+
|
66
67
|
denotations.each do |d|
|
67
68
|
reid = 'T' + (idnum_denotations += 1).to_s
|
68
69
|
ididx[d[:id]] = reid
|
@@ -76,7 +77,9 @@ def align_mannotations(source_annotations, target_text, debug = false)
|
|
76
77
|
annotations[:relations].each do |r|
|
77
78
|
reid = 'R' + (idnum_relations += 1).to_s
|
78
79
|
ididx[r[:id]] = reid
|
79
|
-
|
80
|
+
sid = ididx[r[:subj]]
|
81
|
+
oid = ididx[r[:obj]]
|
82
|
+
target_annotations[:relations] << r.dup.merge({id:reid, subj:sid, obj:oid}) unless sid.nil? || oid.nil?
|
80
83
|
end
|
81
84
|
end
|
82
85
|
|
@@ -85,7 +88,8 @@ def align_mannotations(source_annotations, target_text, debug = false)
|
|
85
88
|
annotations[:attributes].each do |a|
|
86
89
|
reid = 'A' + (idnum_attributes += 1).to_s
|
87
90
|
ididx[a[:id]] = reid
|
88
|
-
|
91
|
+
sid = ididx[a[:subj]]
|
92
|
+
target_annotations[:attributes] << a.dup.merge({id:reid, subj:sid}) unless sid.nil?
|
89
93
|
end
|
90
94
|
end
|
91
95
|
|
@@ -94,7 +98,8 @@ def align_mannotations(source_annotations, target_text, debug = false)
|
|
94
98
|
annotations[:modifications].each do |m|
|
95
99
|
reid = 'M' + (idnum_modifications += 1).to_s
|
96
100
|
ididx[m[:id]] = reid
|
97
|
-
|
101
|
+
oid = ididx[m[:obj]]
|
102
|
+
target_annotations[:modifications] << m.dup.merge({id:reid, obj:oid}) unless oid.nil?
|
98
103
|
end
|
99
104
|
end
|
100
105
|
end
|
@@ -109,13 +114,18 @@ unless ARGV.length == 2
|
|
109
114
|
end
|
110
115
|
|
111
116
|
source_annotations = read_annotations(ARGV[0])
|
112
|
-
|
117
|
+
reference_text = read_text(ARGV[1])
|
118
|
+
|
119
|
+
alignment = TextAlignment::TextAlignment.new(reference_text, true)
|
113
120
|
|
114
121
|
target_annotations = if source_annotations.class == Array
|
115
|
-
align_mannotations(source_annotations,
|
122
|
+
# align_mannotations(source_annotations, reference_text, alignment, true)
|
123
|
+
align_mannotations(source_annotations, reference_text, alignment, false)
|
116
124
|
else
|
117
|
-
denotations = align_denotations(source_annotations[:denotations], source_annotations[:text],
|
118
|
-
source_annotations
|
125
|
+
# denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment)
|
126
|
+
denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment, false)
|
127
|
+
source_annotations.merge({text:reference_text, denotations:denotations})
|
119
128
|
end
|
120
129
|
|
121
|
-
#
|
130
|
+
# pp alignment.block_alignment
|
131
|
+
# puts target_annotations.to_json
|
@@ -6,92 +6,141 @@ module TextAlignment; end unless defined? TextAlignment
|
|
6
6
|
|
7
7
|
class TextAlignment::AnchorFinder
|
8
8
|
|
9
|
-
def initialize(source_str, target_str,
|
10
|
-
@
|
11
|
-
@
|
12
|
-
@sim_threshold = _text_similiarity_threshold || TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
9
|
+
def initialize(source_str, target_str, cultivation_map)
|
10
|
+
@s1 = source_str.downcase
|
11
|
+
@s2 = target_str.downcase
|
13
12
|
|
14
|
-
@
|
13
|
+
@cultivation_map = cultivation_map
|
15
14
|
|
16
|
-
@
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
15
|
+
@size_ngram = TextAlignment::SIZE_NGRAM
|
16
|
+
@size_window = TextAlignment::SIZE_WINDOW
|
17
|
+
@sim_threshold = TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
18
|
+
@pos_s1_final_possible_begin = @s1.length - @size_ngram - 1
|
19
|
+
@pos_s2_final_possible_end = @s2.length
|
21
20
|
|
22
|
-
#
|
23
|
-
@
|
24
|
-
@
|
25
|
-
@end_s2_prev = 0
|
21
|
+
# positions of last match
|
22
|
+
@pos_s1_last_match = 0
|
23
|
+
@pos_s2_last_match = 0
|
26
24
|
end
|
27
25
|
|
28
26
|
def get_next_anchor
|
29
|
-
# find the
|
30
|
-
|
31
|
-
if [' ', "\n", "\t"].include? @s1[@beg_s1]
|
32
|
-
@beg_s1 += 1
|
33
|
-
next
|
34
|
-
end
|
35
|
-
anchor = @s1[@beg_s1, @size_ngram]
|
36
|
-
|
37
|
-
# search_position = 0
|
38
|
-
search_position = @end_s2_prev
|
39
|
-
while @beg_s2 = @s2.index(anchor, search_position)
|
40
|
-
# if both the begining points are sufficiantly close to the end points of the last match
|
41
|
-
break if @beg_s1 > 0 && @beg_s2 > 0 && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
|
27
|
+
# To find the beginning positions of an anchor ngram in s1 and s2, beginning from the last positions matched
|
28
|
+
beg_s2 = for beg_s1 in @pos_s1_last_match .. @pos_s1_final_possible_begin
|
42
29
|
|
43
|
-
|
44
|
-
|
30
|
+
# To skip whitespace letters
|
31
|
+
next if [' ', "\n", "\t"].include? @s1[beg_s1]
|
45
32
|
|
46
|
-
|
47
|
-
|
33
|
+
_beg_s2 = get_beg_s2(beg_s1)
|
34
|
+
break _beg_s2 unless _beg_s2.nil?
|
35
|
+
end
|
48
36
|
|
49
|
-
|
50
|
-
|
37
|
+
# To return nil when it fails to find an anchor
|
38
|
+
return nil if beg_s2.class == Range
|
51
39
|
|
52
|
-
|
40
|
+
# To extend the block to the left
|
41
|
+
b1 = beg_s1
|
42
|
+
b2 = beg_s2
|
43
|
+
left_boundary_b2 = [@pos_s2_last_match, (@cultivation_map.last_cultivated_position(b2) || 0)].max
|
44
|
+
while b1 > @pos_s1_last_match && b2 > left_boundary_b2 && @s1[b1 - 1] == @s2[b2 - 1]
|
45
|
+
b1 -= 1; b2 -= 1
|
46
|
+
end
|
53
47
|
|
54
|
-
|
48
|
+
# To extend the block to the right
|
49
|
+
e1 = beg_s1 + @size_ngram
|
50
|
+
e2 = beg_s2 + @size_ngram
|
51
|
+
right_boundary_b2 = @cultivation_map.next_cultivated_position(e2) || @pos_s2_final_possible_end
|
52
|
+
while @s1[e1] && e2 < right_boundary_b2 && @s1[e1] == @s2[e2]
|
53
|
+
e1 += 1; e2 += 1
|
55
54
|
end
|
56
55
|
|
57
|
-
|
56
|
+
@pos_s1_last_match = e1
|
57
|
+
@pos_s2_last_match = e2
|
58
58
|
|
59
|
-
|
60
|
-
|
61
|
-
b2 = @beg_s2
|
62
|
-
while b1 >= @end_s1_prev && b2 >= @end_s2_prev && @s1[b1] == @s2[b2]
|
63
|
-
b1 -= 1; b2 -= 1
|
64
|
-
end
|
59
|
+
{source:{begin:b1 , end:e1}, target:{begin:b2, end:e2}}
|
60
|
+
end
|
65
61
|
|
66
|
-
|
62
|
+
private
|
67
63
|
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
64
|
+
def get_beg_s2(beg_s1)
|
65
|
+
# to get the anchor to search for in s2
|
66
|
+
anchor = @s1[beg_s1, @size_ngram]
|
67
|
+
|
68
|
+
# comment out below with the assumption that texts are in the same order
|
69
|
+
# search_position = 0
|
70
|
+
search_position = @pos_s2_last_match
|
71
|
+
|
72
|
+
beg_s2_candidates = find_beg_s2_candidates(anchor, search_position)
|
73
|
+
return nil if beg_s2_candidates.empty?
|
74
|
+
|
75
|
+
find_valid_beg_s2(beg_s1, beg_s2_candidates)
|
76
|
+
end
|
77
|
+
|
78
|
+
# To find the beg_s2 candidates that match the anchor
|
79
|
+
# returns an empty array if the anchor is too frequent (more than 5 matches)
|
80
|
+
def find_beg_s2_candidates(anchor, search_position)
|
81
|
+
candidates = []
|
82
|
+
while _beg_s2 = @cultivation_map.index(anchor, @s2, search_position)
|
83
|
+
candidates << _beg_s2
|
84
|
+
|
85
|
+
# for speed, skip anchor of high frequency
|
86
|
+
if candidates.length > 5
|
87
|
+
candidates.clear
|
88
|
+
break
|
89
|
+
end
|
90
|
+
|
91
|
+
search_position = _beg_s2 + 1
|
72
92
|
end
|
93
|
+
candidates
|
94
|
+
end
|
95
|
+
|
96
|
+
def find_valid_beg_s2(beg_s1, beg_s2_candidates)
|
97
|
+
valid_beg_s2 = nil
|
73
98
|
|
74
|
-
|
75
|
-
|
76
|
-
@beg_s1 = e1
|
99
|
+
(10 .. 30).step(10).each do |size_window|
|
100
|
+
valid_beg_s2 = nil
|
77
101
|
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
102
|
+
r = beg_s2_candidates.each do |beg_s2|
|
103
|
+
# if both the beginning points are sufficiently close to the end points of the last match
|
104
|
+
# break if beg_s1 > 0 && beg_s2 > 0 && (beg_s1 - @pos_s1_last_match < 5) && (beg_s2 >= @pos_s2_last_match) && (beg_s2 - @pos_s2_last_match < 5)
|
105
|
+
if beg_s1 > 0 && beg_s2 > 0 && (beg_s1 - @pos_s1_last_match < 5) && (beg_s2 - @pos_s2_last_match < 5)
|
106
|
+
break unless valid_beg_s2.nil?
|
107
|
+
valid_beg_s2 = beg_s2
|
108
|
+
next
|
109
|
+
end
|
110
|
+
|
111
|
+
left_window_s1, left_window_s2 = get_left_windows(beg_s1, beg_s2)
|
112
|
+
if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
|
113
|
+
break unless valid_beg_s2.nil?
|
114
|
+
valid_beg_s2 = beg_s2
|
115
|
+
next
|
116
|
+
end
|
117
|
+
|
118
|
+
right_window_s1, right_window_s2 = get_right_windows(beg_s1, beg_s2)
|
119
|
+
if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
|
120
|
+
break unless valid_beg_s2.nil?
|
121
|
+
valid_beg_s2 = beg_s2
|
122
|
+
next
|
123
|
+
end
|
124
|
+
end
|
125
|
+
|
126
|
+
# r == nil means that the inner loop was broken (multiple candidates had passed the tests)
|
127
|
+
# r != nil means that the inner loop was completed (with or w/o a valid beg_s2 found)
|
128
|
+
break unless r.nil?
|
82
129
|
end
|
130
|
+
|
131
|
+
valid_beg_s2
|
83
132
|
end
|
84
133
|
|
85
|
-
|
134
|
+
def get_left_windows(beg_s1, beg_s2, size_window = nil)
|
135
|
+
size_window ||= @size_window
|
86
136
|
|
87
|
-
|
88
|
-
#
|
89
|
-
# return if @beg_s1 < @size_window || @beg_s2 < @size_window
|
137
|
+
# comment out below with the assumption that the beginning of a document gives significant locational information
|
138
|
+
# return if @beg_s1 < size_window || @beg_s2 < size_window
|
90
139
|
|
91
140
|
window_s1 = ''
|
92
|
-
loc =
|
141
|
+
loc = beg_s1 - 1
|
93
142
|
count = 0
|
94
|
-
while count <
|
143
|
+
while count < size_window && loc >= 0
|
95
144
|
if @s1[loc] =~ /[0-9a-zA-Z]/
|
96
145
|
window_s1 += @s1[loc]
|
97
146
|
count += 1
|
@@ -100,9 +149,9 @@ class TextAlignment::AnchorFinder
|
|
100
149
|
end
|
101
150
|
|
102
151
|
window_s2 = ''
|
103
|
-
loc =
|
152
|
+
loc = beg_s2 - 1
|
104
153
|
count = 0
|
105
|
-
while count <
|
154
|
+
while count < size_window && loc >= 0
|
106
155
|
if @s2[loc] =~ /[0-9a-zA-Z]/
|
107
156
|
window_s2 += @s2[loc]
|
108
157
|
count += 1
|
@@ -113,15 +162,17 @@ class TextAlignment::AnchorFinder
|
|
113
162
|
[window_s1, window_s2]
|
114
163
|
end
|
115
164
|
|
116
|
-
def get_right_windows
|
165
|
+
def get_right_windows(beg_s1, beg_s2, size_window = nil)
|
166
|
+
size_window ||= @size_window
|
167
|
+
|
117
168
|
# comment out below with the assumption that the end of a document gives significant locational information
|
118
|
-
# return if (@beg_s1 + @size_ngram > (@s1.length -
|
169
|
+
# return if (@beg_s1 + @size_ngram > (@s1.length - size_window)) || (@beg_s2 + @size_ngram > (@s2.length - size_window))
|
119
170
|
|
120
171
|
window_s1 = ''
|
121
|
-
loc =
|
172
|
+
loc = beg_s1 + @size_ngram
|
122
173
|
len_s1 = @s1.length
|
123
174
|
count = 0
|
124
|
-
while count <
|
175
|
+
while count < size_window && loc < len_s1
|
125
176
|
if @s1[loc] =~ /[0-9a-zA-Z]/
|
126
177
|
window_s1 += @s1[loc]
|
127
178
|
count += 1
|
@@ -130,10 +181,10 @@ class TextAlignment::AnchorFinder
|
|
130
181
|
end
|
131
182
|
|
132
183
|
window_s2 = ''
|
133
|
-
loc =
|
184
|
+
loc = beg_s2 + @size_ngram
|
134
185
|
len_s2 = @s2.length
|
135
186
|
count = 0
|
136
|
-
while count <
|
187
|
+
while count < size_window && loc < len_s2
|
137
188
|
if @s2[loc] =~ /[0-9a-zA-Z]/
|
138
189
|
window_s2 += @s2[loc]
|
139
190
|
count += 1
|
@@ -148,5 +199,4 @@ class TextAlignment::AnchorFinder
|
|
148
199
|
return 0 if str1.nil? || str2.nil?
|
149
200
|
String::Similarity.cosine(str1, str2, ngram:ngram_order)
|
150
201
|
end
|
151
|
-
|
152
|
-
end
|
202
|
+
end
|
@@ -1,6 +1,6 @@
|
|
1
1
|
module TextAlignment; end unless defined? TextAlignment
|
2
2
|
|
3
|
-
TextAlignment::
|
3
|
+
TextAlignment::CHAR_MAPPING = [
|
4
4
|
["©", "(c)"], #U+00A9 (Copyright Sign)
|
5
5
|
|
6
6
|
["α", "alpha"], #U+03B1 (greek small letter alpha)
|
@@ -61,9 +61,11 @@ TextAlignment::MAPPINGS = [
|
|
61
61
|
["•", "*"], #U+2022 (bullet)
|
62
62
|
[" ", " "], #U+2009 (thin space)
|
63
63
|
[" ", " "], #U+200A (hair space)
|
64
|
-
["
|
64
|
+
[" ", " "], #U+202F (narrow no-break space)
|
65
|
+
[" ", " "], #U+00A0 (Non-Breaking space)
|
65
66
|
[" ", " "], #U+3000 (ideographic space)
|
66
|
-
["
|
67
|
+
["‐", "-"], #U+2010 (Hyphen)
|
68
|
+
["‑", "-"], #U+2011 (Non-Breaking Hyphen)
|
67
69
|
["−", "-"], #U+2212 (minus sign)
|
68
70
|
["–", "-"], #U+2013 (en dash)
|
69
71
|
["′", "'"], #U+2032 (prime)
|
@@ -75,98 +77,114 @@ TextAlignment::MAPPINGS = [
|
|
75
77
|
]
|
76
78
|
|
77
79
|
|
78
|
-
|
80
|
+
class TextAlignment::CharMapping
|
81
|
+
attr_reader :mapped_text
|
79
82
|
|
83
|
+
def initialize(_text, char_mapping = nil)
|
84
|
+
char_mapping ||= TextAlignment::CHAR_MAPPING
|
85
|
+
@mapped_text, offset_mapping = enmap_text(_text, char_mapping)
|
86
|
+
@index_enmap = offset_mapping.to_h
|
87
|
+
@index_demap = offset_mapping.map{|m| m.reverse}.to_h
|
88
|
+
end
|
80
89
|
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
character_mappings = _mappings.select{|m| m[0].length == 1 && m[1].length == 1}
|
86
|
-
if character_mappings.empty?
|
87
|
-
[_str1, _str2, _mappings]
|
88
|
-
else
|
89
|
-
characters_from = character_mappings.collect{|m| m[0]}.join
|
90
|
-
characters_to = character_mappings.collect{|m| m[1]}.join
|
91
|
-
characters_to.gsub!(/-/, '\-')
|
90
|
+
def enmap_position(position)
|
91
|
+
@index_enmap[position]
|
92
|
+
end
|
92
93
|
|
93
|
-
|
94
|
-
|
94
|
+
def demap_position(position)
|
95
|
+
@index_demap[position]
|
96
|
+
end
|
95
97
|
|
96
|
-
|
98
|
+
def enmap_denotations(denotations)
|
99
|
+
return nil if denotations.nil?
|
97
100
|
|
98
|
-
|
101
|
+
denotations.map do |d|
|
102
|
+
d.dup.merge(span:{begin:enmap_position(d[:span][:begin]), end:enmap_position(d[:span][:end])})
|
99
103
|
end
|
100
104
|
end
|
101
105
|
|
102
|
-
|
103
|
-
_mappings ||= TextAlignment::MAPPINGS
|
104
|
-
|
105
|
-
long_to_one_mappings = _mappings.select{|m| m[0].length == 1 && m[1].length > 1}
|
106
|
-
if long_to_one_mappings.empty?
|
107
|
-
[_str1, _str2, _mappings]
|
108
|
-
else
|
109
|
-
## long to one character mappings
|
110
|
-
pletters = TextAlignment::PADDING_LETTERS
|
111
|
-
|
112
|
-
# find the padding letter for str1
|
113
|
-
@padding_letter1 = begin
|
114
|
-
i = pletters.index{|l| _str2.index(l).nil?}
|
115
|
-
raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
|
116
|
-
TextAlignment::PADDING_LETTERS[i]
|
117
|
-
end
|
106
|
+
private
|
118
107
|
|
119
|
-
|
120
|
-
|
121
|
-
i = pletters.index{|l| l != @padding_letter1 && _str1.index(l).nil?}
|
122
|
-
raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
|
123
|
-
TextAlignment::PADDING_LETTERS[i]
|
124
|
-
end
|
108
|
+
def enmap_text(_text, char_mapping)
|
109
|
+
text = _text.dup
|
125
110
|
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
else
|
141
|
-
_str2
|
142
|
-
end
|
111
|
+
# To execute the single letter mapping
|
112
|
+
char_mapping.each do |one, long|
|
113
|
+
text.gsub!(one, long) if long.length == 1
|
114
|
+
end
|
115
|
+
|
116
|
+
# To get the (location, length) index for replacements
|
117
|
+
loc_len = []
|
118
|
+
char_mapping.each do |one, long|
|
119
|
+
next if long.length == 1
|
120
|
+
|
121
|
+
init_next = 0
|
122
|
+
while loc = text.index(long, init_next)
|
123
|
+
loc_len << [loc, long.length]
|
124
|
+
init_next = loc + long.length
|
143
125
|
end
|
144
|
-
mappings = _mappings.select{|m| m[0].length > 1 || m[1].length == 1}
|
145
126
|
|
146
|
-
|
127
|
+
# a workaround to avoid messing-up due to embedding
|
128
|
+
text.gsub!(long, one * long.length)
|
147
129
|
end
|
148
|
-
end
|
149
130
|
|
150
|
-
|
151
|
-
|
131
|
+
# To get the (location, length) index for consecutive whitespace sequences
|
132
|
+
init_next = 0
|
133
|
+
while loc = text.index(/\s{2,}/, init_next)
|
134
|
+
len = $~[0].length
|
135
|
+
loc_len << [loc, len]
|
136
|
+
init_next = loc + len
|
137
|
+
end
|
138
|
+
|
139
|
+
loc_len.sort!{|a, b| a[0] <=> b[0]}
|
140
|
+
|
141
|
+
# To get the offset_mapping before and after replacement
|
142
|
+
offset_mapping = []
|
143
|
+
init_next = 0
|
144
|
+
j = 0
|
152
145
|
|
153
|
-
|
154
|
-
|
155
|
-
|
146
|
+
loc_len.each do |loc, len|
|
147
|
+
offset_mapping += (init_next .. loc).map do |i|
|
148
|
+
j += 1
|
149
|
+
[i, j - 1]
|
150
|
+
end
|
151
|
+
init_next = loc + len
|
152
|
+
end
|
156
153
|
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
_s1
|
154
|
+
offset_mapping += (init_next .. text.length).map do |i|
|
155
|
+
j += 1
|
156
|
+
[i, j - 1]
|
161
157
|
end
|
162
158
|
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
_s2
|
159
|
+
# To execute the long letter mapping
|
160
|
+
char_mapping.each do |one, long|
|
161
|
+
text.gsub!(one * long.length, one) if long.length > 1
|
167
162
|
end
|
168
163
|
|
169
|
-
|
164
|
+
# To replace each multi-whitespace sequence with a single space
|
165
|
+
text.gsub!(/\s{2,}/, ' ')
|
166
|
+
|
167
|
+
[text, offset_mapping]
|
170
168
|
end
|
169
|
+
end
|
170
|
+
|
171
|
+
if __FILE__ == $0
|
172
|
+
require 'json'
|
173
|
+
|
174
|
+
unless ARGV.length == 1
|
175
|
+
warn "#{$0} an_annotation_json_file.json"
|
176
|
+
exit
|
177
|
+
end
|
178
|
+
annotations = JSON.parse File.read(ARGV[0]).strip, symbolize_names: true
|
179
|
+
denotations = annotations[:denotations] || []
|
180
|
+
if denotations.nil? && annotations[:tracks]
|
181
|
+
denotations = annotations[:tracks].first[:denotations]
|
182
|
+
end
|
183
|
+
|
184
|
+
text_mapping = TextAlignment::CharMapping.new(annotations[:text])
|
185
|
+
text_mapped = text_mapping.mapped_text
|
186
|
+
denotations_mapped = text_mapping.enmap_denotations(denotations)
|
187
|
+
new_annotations = {text:text_mapped, denotations:denotations_mapped}
|
171
188
|
|
189
|
+
puts new_annotations.to_json
|
172
190
|
end
|