text_alignment 0.9 → 0.11.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/align_annotations +24 -14
- data/lib/text_alignment/anchor_finder.rb +120 -70
- data/lib/text_alignment/{mappings.rb → char_mapping.rb} +93 -75
- data/lib/text_alignment/cultivation_map.rb +94 -0
- data/lib/text_alignment/glcs_alignment_fast.rb +2 -2
- data/lib/text_alignment/mixed_alignment.rb +21 -3
- data/lib/text_alignment/text_alignment.rb +276 -243
- data/lib/text_alignment/version.rb +1 -1
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 81994ec9a8c7c08d2aad32b351b5942fef1748e4035ba762af546d4f3fe7cee8
|
4
|
+
data.tar.gz: bfa75451d33b9d21c2baa1a52280f03486bf10a41b5ac2a97469f0ca3a4f7379
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 73d0ff212a89d6ad33751f87f14a9b292b45ce177c61efd0ede5f852eb3834d1bd3940d202c4d87cfb0422c4dc566dea30c560f8500a220dcd3e1dd492e29eac
|
7
|
+
data.tar.gz: 6c763a564e339267624bec5c809b334b0adf9951d25409eac21ae4b4582beae4a08739119dfa369022c44c19d4faacfd0e06766c6223bd219a43c3961cfab08c
|
data/bin/align_annotations
CHANGED
@@ -26,8 +26,8 @@ def read_text(filename)
|
|
26
26
|
end
|
27
27
|
end
|
28
28
|
|
29
|
-
def align_denotations(denotations, source_text,
|
30
|
-
alignment
|
29
|
+
def align_denotations(denotations, source_text, alignment, debug = false)
|
30
|
+
alignment.align(source_text, denotations)
|
31
31
|
new_denotations = alignment.transform_hdenotations(denotations)
|
32
32
|
|
33
33
|
if debug
|
@@ -37,7 +37,7 @@ def align_denotations(denotations, source_text, target_text, debug = false)
|
|
37
37
|
end
|
38
38
|
|
39
39
|
lost_annotations = alignment.lost_annotations
|
40
|
-
unless lost_annotations.empty?
|
40
|
+
unless lost_annotations.nil? || lost_annotations.empty?
|
41
41
|
warn "\n[lost annotations] #{lost_annotations.length}"
|
42
42
|
lost_annotations.each do |a|
|
43
43
|
warn "#{a}"
|
@@ -50,8 +50,8 @@ def align_denotations(denotations, source_text, target_text, debug = false)
|
|
50
50
|
new_denotations
|
51
51
|
end
|
52
52
|
|
53
|
-
def align_mannotations(source_annotations,
|
54
|
-
target_annotations = {text:
|
53
|
+
def align_mannotations(source_annotations, reference_text, alignment, debug = false)
|
54
|
+
target_annotations = {text:reference_text}
|
55
55
|
|
56
56
|
idnum_denotations = 0
|
57
57
|
idnum_relations = 0
|
@@ -62,7 +62,8 @@ def align_mannotations(source_annotations, target_text, debug = false)
|
|
62
62
|
if annotations.has_key?(:denotations) && !annotations[:denotations].empty?
|
63
63
|
ididx = {}
|
64
64
|
warn "[#{i}]-=-=-=-=-"
|
65
|
-
denotations = align_denotations(annotations[:denotations], annotations[:text],
|
65
|
+
denotations = align_denotations(annotations[:denotations], annotations[:text], alignment, debug)
|
66
|
+
|
66
67
|
denotations.each do |d|
|
67
68
|
reid = 'T' + (idnum_denotations += 1).to_s
|
68
69
|
ididx[d[:id]] = reid
|
@@ -76,7 +77,9 @@ def align_mannotations(source_annotations, target_text, debug = false)
|
|
76
77
|
annotations[:relations].each do |r|
|
77
78
|
reid = 'R' + (idnum_relations += 1).to_s
|
78
79
|
ididx[r[:id]] = reid
|
79
|
-
|
80
|
+
sid = ididx[r[:subj]]
|
81
|
+
oid = ididx[r[:obj]]
|
82
|
+
target_annotations[:relations] << r.dup.merge({id:reid, subj:sid, obj:oid}) unless sid.nil? || oid.nil?
|
80
83
|
end
|
81
84
|
end
|
82
85
|
|
@@ -85,7 +88,8 @@ def align_mannotations(source_annotations, target_text, debug = false)
|
|
85
88
|
annotations[:attributes].each do |a|
|
86
89
|
reid = 'A' + (idnum_attributes += 1).to_s
|
87
90
|
ididx[a[:id]] = reid
|
88
|
-
|
91
|
+
sid = ididx[a[:subj]]
|
92
|
+
target_annotations[:attributes] << a.dup.merge({id:reid, subj:sid}) unless sid.nil?
|
89
93
|
end
|
90
94
|
end
|
91
95
|
|
@@ -94,7 +98,8 @@ def align_mannotations(source_annotations, target_text, debug = false)
|
|
94
98
|
annotations[:modifications].each do |m|
|
95
99
|
reid = 'M' + (idnum_modifications += 1).to_s
|
96
100
|
ididx[m[:id]] = reid
|
97
|
-
|
101
|
+
oid = ididx[m[:obj]]
|
102
|
+
target_annotations[:modifications] << m.dup.merge({id:reid, obj:oid}) unless oid.nil?
|
98
103
|
end
|
99
104
|
end
|
100
105
|
end
|
@@ -109,13 +114,18 @@ unless ARGV.length == 2
|
|
109
114
|
end
|
110
115
|
|
111
116
|
source_annotations = read_annotations(ARGV[0])
|
112
|
-
|
117
|
+
reference_text = read_text(ARGV[1])
|
118
|
+
|
119
|
+
alignment = TextAlignment::TextAlignment.new(reference_text, true)
|
113
120
|
|
114
121
|
target_annotations = if source_annotations.class == Array
|
115
|
-
align_mannotations(source_annotations,
|
122
|
+
# align_mannotations(source_annotations, reference_text, alignment, true)
|
123
|
+
align_mannotations(source_annotations, reference_text, alignment, false)
|
116
124
|
else
|
117
|
-
denotations = align_denotations(source_annotations[:denotations], source_annotations[:text],
|
118
|
-
source_annotations
|
125
|
+
# denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment)
|
126
|
+
denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment, false)
|
127
|
+
source_annotations.merge({text:reference_text, denotations:denotations})
|
119
128
|
end
|
120
129
|
|
121
|
-
#
|
130
|
+
# pp alignment.block_alignment
|
131
|
+
# puts target_annotations.to_json
|
@@ -6,92 +6,141 @@ module TextAlignment; end unless defined? TextAlignment
|
|
6
6
|
|
7
7
|
class TextAlignment::AnchorFinder
|
8
8
|
|
9
|
-
def initialize(source_str, target_str,
|
10
|
-
@
|
11
|
-
@
|
12
|
-
@sim_threshold = _text_similiarity_threshold || TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
9
|
+
def initialize(source_str, target_str, cultivation_map)
|
10
|
+
@s1 = source_str.downcase
|
11
|
+
@s2 = target_str.downcase
|
13
12
|
|
14
|
-
@
|
13
|
+
@cultivation_map = cultivation_map
|
15
14
|
|
16
|
-
@
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
15
|
+
@size_ngram = TextAlignment::SIZE_NGRAM
|
16
|
+
@size_window = TextAlignment::SIZE_WINDOW
|
17
|
+
@sim_threshold = TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
18
|
+
@pos_s1_final_possible_begin = @s1.length - @size_ngram - 1
|
19
|
+
@pos_s2_final_possible_end = @s2.length
|
21
20
|
|
22
|
-
#
|
23
|
-
@
|
24
|
-
@
|
25
|
-
@end_s2_prev = 0
|
21
|
+
# positions of last match
|
22
|
+
@pos_s1_last_match = 0
|
23
|
+
@pos_s2_last_match = 0
|
26
24
|
end
|
27
25
|
|
28
26
|
def get_next_anchor
|
29
|
-
# find the
|
30
|
-
|
31
|
-
if [' ', "\n", "\t"].include? @s1[@beg_s1]
|
32
|
-
@beg_s1 += 1
|
33
|
-
next
|
34
|
-
end
|
35
|
-
anchor = @s1[@beg_s1, @size_ngram]
|
36
|
-
|
37
|
-
# search_position = 0
|
38
|
-
search_position = @end_s2_prev
|
39
|
-
while @beg_s2 = @s2.index(anchor, search_position)
|
40
|
-
# if both the begining points are sufficiantly close to the end points of the last match
|
41
|
-
break if @beg_s1 > 0 && @beg_s2 > 0 && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
|
27
|
+
# To find the beginning positions of an anchor ngram in s1 and s2, beginning from the last positions matched
|
28
|
+
beg_s2 = for beg_s1 in @pos_s1_last_match .. @pos_s1_final_possible_begin
|
42
29
|
|
43
|
-
|
44
|
-
|
30
|
+
# To skip whitespace letters
|
31
|
+
next if [' ', "\n", "\t"].include? @s1[beg_s1]
|
45
32
|
|
46
|
-
|
47
|
-
|
33
|
+
_beg_s2 = get_beg_s2(beg_s1)
|
34
|
+
break _beg_s2 unless _beg_s2.nil?
|
35
|
+
end
|
48
36
|
|
49
|
-
|
50
|
-
|
37
|
+
# To return nil when it fails to find an anchor
|
38
|
+
return nil if beg_s2.class == Range
|
51
39
|
|
52
|
-
|
40
|
+
# To extend the block to the left
|
41
|
+
b1 = beg_s1
|
42
|
+
b2 = beg_s2
|
43
|
+
left_boundary_b2 = [@pos_s2_last_match, (@cultivation_map.last_cultivated_position(b2) || 0)].max
|
44
|
+
while b1 > @pos_s1_last_match && b2 > left_boundary_b2 && @s1[b1 - 1] == @s2[b2 - 1]
|
45
|
+
b1 -= 1; b2 -= 1
|
46
|
+
end
|
53
47
|
|
54
|
-
|
48
|
+
# To extend the block to the right
|
49
|
+
e1 = beg_s1 + @size_ngram
|
50
|
+
e2 = beg_s2 + @size_ngram
|
51
|
+
right_boundary_b2 = @cultivation_map.next_cultivated_position(e2) || @pos_s2_final_possible_end
|
52
|
+
while @s1[e1] && e2 < right_boundary_b2 && @s1[e1] == @s2[e2]
|
53
|
+
e1 += 1; e2 += 1
|
55
54
|
end
|
56
55
|
|
57
|
-
|
56
|
+
@pos_s1_last_match = e1
|
57
|
+
@pos_s2_last_match = e2
|
58
58
|
|
59
|
-
|
60
|
-
|
61
|
-
b2 = @beg_s2
|
62
|
-
while b1 >= @end_s1_prev && b2 >= @end_s2_prev && @s1[b1] == @s2[b2]
|
63
|
-
b1 -= 1; b2 -= 1
|
64
|
-
end
|
59
|
+
{source:{begin:b1 , end:e1}, target:{begin:b2, end:e2}}
|
60
|
+
end
|
65
61
|
|
66
|
-
|
62
|
+
private
|
67
63
|
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
64
|
+
def get_beg_s2(beg_s1)
|
65
|
+
# to get the anchor to search for in s2
|
66
|
+
anchor = @s1[beg_s1, @size_ngram]
|
67
|
+
|
68
|
+
# comment out below with the assumption that texts are in the same order
|
69
|
+
# search_position = 0
|
70
|
+
search_position = @pos_s2_last_match
|
71
|
+
|
72
|
+
beg_s2_candidates = find_beg_s2_candidates(anchor, search_position)
|
73
|
+
return nil if beg_s2_candidates.empty?
|
74
|
+
|
75
|
+
find_valid_beg_s2(beg_s1, beg_s2_candidates)
|
76
|
+
end
|
77
|
+
|
78
|
+
# To find the beg_s2 candidates which match the anchor
|
79
|
+
# returns an empty array if the anchor is too frequent (more than 5 matches)
|
80
|
+
def find_beg_s2_candidates(anchor, search_position)
|
81
|
+
candidates = []
|
82
|
+
while _beg_s2 = @cultivation_map.index(anchor, @s2, search_position)
|
83
|
+
candidates << _beg_s2
|
84
|
+
|
85
|
+
# for speed, skip anchor of high frequency
|
86
|
+
if candidates.length > 5
|
87
|
+
candidates.clear
|
88
|
+
break
|
89
|
+
end
|
90
|
+
|
91
|
+
search_position = _beg_s2 + 1
|
72
92
|
end
|
93
|
+
candidates
|
94
|
+
end
|
95
|
+
|
96
|
+
def find_valid_beg_s2(beg_s1, beg_s2_candidates)
|
97
|
+
valid_beg_s2 = nil
|
73
98
|
|
74
|
-
|
75
|
-
|
76
|
-
@beg_s1 = e1
|
99
|
+
(10 .. 30).step(10).each do |size_window|
|
100
|
+
valid_beg_s2 = nil
|
77
101
|
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
102
|
+
r = beg_s2_candidates.each do |beg_s2|
|
103
|
+
# if both the beginning points are sufficiently close to the end points of the last match
|
104
|
+
# break if beg_s1 > 0 && beg_s2 > 0 && (beg_s1 - @pos_s1_last_match < 5) && (beg_s2 >= @pos_s2_last_match) && (beg_s2 - @pos_s2_last_match < 5)
|
105
|
+
if beg_s1 > 0 && beg_s2 > 0 && (beg_s1 - @pos_s1_last_match < 5) && (beg_s2 - @pos_s2_last_match < 5)
|
106
|
+
break unless valid_beg_s2.nil?
|
107
|
+
valid_beg_s2 = beg_s2
|
108
|
+
next
|
109
|
+
end
|
110
|
+
|
111
|
+
left_window_s1, left_window_s2 = get_left_windows(beg_s1, beg_s2)
|
112
|
+
if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
|
113
|
+
break unless valid_beg_s2.nil?
|
114
|
+
valid_beg_s2 = beg_s2
|
115
|
+
next
|
116
|
+
end
|
117
|
+
|
118
|
+
right_window_s1, right_window_s2 = get_right_windows(beg_s1, beg_s2)
|
119
|
+
if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
|
120
|
+
break unless valid_beg_s2.nil?
|
121
|
+
valid_beg_s2 = beg_s2
|
122
|
+
next
|
123
|
+
end
|
124
|
+
end
|
125
|
+
|
126
|
+
# r == nil means that the inner loop was broken (multiple candidates had passed the tests)
|
127
|
+
# r != nil means that the inner loop was completed (with or w/o a valid beg_s2 found)
|
128
|
+
break unless r.nil?
|
82
129
|
end
|
130
|
+
|
131
|
+
valid_beg_s2
|
83
132
|
end
|
84
133
|
|
85
|
-
|
134
|
+
def get_left_windows(beg_s1, beg_s2, size_window = nil)
|
135
|
+
size_window ||= @size_window
|
86
136
|
|
87
|
-
|
88
|
-
#
|
89
|
-
# return if @beg_s1 < @size_window || @beg_s2 < @size_window
|
137
|
+
# comment out below with the assumption that the beginning of a document gives significant locational information
|
138
|
+
# return if @beg_s1 < size_window || @beg_s2 < size_window
|
90
139
|
|
91
140
|
window_s1 = ''
|
92
|
-
loc =
|
141
|
+
loc = beg_s1 - 1
|
93
142
|
count = 0
|
94
|
-
while count <
|
143
|
+
while count < size_window && loc >= 0
|
95
144
|
if @s1[loc] =~ /[0-9a-zA-Z]/
|
96
145
|
window_s1 += @s1[loc]
|
97
146
|
count += 1
|
@@ -100,9 +149,9 @@ class TextAlignment::AnchorFinder
|
|
100
149
|
end
|
101
150
|
|
102
151
|
window_s2 = ''
|
103
|
-
loc =
|
152
|
+
loc = beg_s2 - 1
|
104
153
|
count = 0
|
105
|
-
while count <
|
154
|
+
while count < size_window && loc >= 0
|
106
155
|
if @s2[loc] =~ /[0-9a-zA-Z]/
|
107
156
|
window_s2 += @s2[loc]
|
108
157
|
count += 1
|
@@ -113,15 +162,17 @@ class TextAlignment::AnchorFinder
|
|
113
162
|
[window_s1, window_s2]
|
114
163
|
end
|
115
164
|
|
116
|
-
def get_right_windows
|
165
|
+
def get_right_windows(beg_s1, beg_s2, size_window = nil)
|
166
|
+
size_window ||= @size_window
|
167
|
+
|
117
168
|
# comment out below with the assumption that the end of a document gives significant locational information
|
118
|
-
# return if (@beg_s1 + @size_ngram > (@s1.length -
|
169
|
+
# return if (@beg_s1 + @size_ngram > (@s1.length - size_window)) || (@beg_s2 + @size_ngram > (@s2.length - size_window))
|
119
170
|
|
120
171
|
window_s1 = ''
|
121
|
-
loc =
|
172
|
+
loc = beg_s1 + @size_ngram
|
122
173
|
len_s1 = @s1.length
|
123
174
|
count = 0
|
124
|
-
while count <
|
175
|
+
while count < size_window && loc < len_s1
|
125
176
|
if @s1[loc] =~ /[0-9a-zA-Z]/
|
126
177
|
window_s1 += @s1[loc]
|
127
178
|
count += 1
|
@@ -130,10 +181,10 @@ class TextAlignment::AnchorFinder
|
|
130
181
|
end
|
131
182
|
|
132
183
|
window_s2 = ''
|
133
|
-
loc =
|
184
|
+
loc = beg_s2 + @size_ngram
|
134
185
|
len_s2 = @s2.length
|
135
186
|
count = 0
|
136
|
-
while count <
|
187
|
+
while count < size_window && loc < len_s2
|
137
188
|
if @s2[loc] =~ /[0-9a-zA-Z]/
|
138
189
|
window_s2 += @s2[loc]
|
139
190
|
count += 1
|
@@ -148,5 +199,4 @@ class TextAlignment::AnchorFinder
|
|
148
199
|
return 0 if str1.nil? || str2.nil?
|
149
200
|
String::Similarity.cosine(str1, str2, ngram:ngram_order)
|
150
201
|
end
|
151
|
-
|
152
|
-
end
|
202
|
+
end
|
@@ -1,6 +1,6 @@
|
|
1
1
|
module TextAlignment; end unless defined? TextAlignment
|
2
2
|
|
3
|
-
TextAlignment::
|
3
|
+
TextAlignment::CHAR_MAPPING = [
|
4
4
|
["©", "(c)"], #U+00A9 (Copyright Sign)
|
5
5
|
|
6
6
|
["α", "alpha"], #U+03B1 (greek small letter alpha)
|
@@ -61,9 +61,11 @@ TextAlignment::MAPPINGS = [
|
|
61
61
|
["•", "*"], #U+2022 (bullet)
|
62
62
|
[" ", " "], #U+2009 (thin space)
|
63
63
|
[" ", " "], #U+200A (hair space)
|
64
|
-
["
|
64
|
+
[" ", " "], #U+202F (narrow no-break space)
|
65
|
+
[" ", " "], #U+00A0 (Non-Breaking space)
|
65
66
|
[" ", " "], #U+3000 (ideographic space)
|
66
|
-
["
|
67
|
+
["‐", "-"], #U+2010 (Hyphen)
|
68
|
+
["‑", "-"], #U+2011 (Non-Breaking Hyphen)
|
67
69
|
["−", "-"], #U+2212 (minus sign)
|
68
70
|
["–", "-"], #U+2013 (en dash)
|
69
71
|
["′", "'"], #U+2032 (prime)
|
@@ -75,98 +77,114 @@ TextAlignment::MAPPINGS = [
|
|
75
77
|
]
|
76
78
|
|
77
79
|
|
78
|
-
|
80
|
+
class TextAlignment::CharMapping
|
81
|
+
attr_reader :mapped_text
|
79
82
|
|
83
|
+
def initialize(_text, char_mapping = nil)
|
84
|
+
char_mapping ||= TextAlignment::CHAR_MAPPING
|
85
|
+
@mapped_text, offset_mapping = enmap_text(_text, char_mapping)
|
86
|
+
@index_enmap = offset_mapping.to_h
|
87
|
+
@index_demap = offset_mapping.map{|m| m.reverse}.to_h
|
88
|
+
end
|
80
89
|
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
character_mappings = _mappings.select{|m| m[0].length == 1 && m[1].length == 1}
|
86
|
-
if character_mappings.empty?
|
87
|
-
[_str1, _str2, _mappings]
|
88
|
-
else
|
89
|
-
characters_from = character_mappings.collect{|m| m[0]}.join
|
90
|
-
characters_to = character_mappings.collect{|m| m[1]}.join
|
91
|
-
characters_to.gsub!(/-/, '\-')
|
90
|
+
def enmap_position(position)
|
91
|
+
@index_enmap[position]
|
92
|
+
end
|
92
93
|
|
93
|
-
|
94
|
-
|
94
|
+
def demap_position(position)
|
95
|
+
@index_demap[position]
|
96
|
+
end
|
95
97
|
|
96
|
-
|
98
|
+
def enmap_denotations(denotations)
|
99
|
+
return nil if denotations.nil?
|
97
100
|
|
98
|
-
|
101
|
+
denotations.map do |d|
|
102
|
+
d.dup.merge(span:{begin:enmap_position(d[:span][:begin]), end:enmap_position(d[:span][:end])})
|
99
103
|
end
|
100
104
|
end
|
101
105
|
|
102
|
-
|
103
|
-
_mappings ||= TextAlignment::MAPPINGS
|
104
|
-
|
105
|
-
long_to_one_mappings = _mappings.select{|m| m[0].length == 1 && m[1].length > 1}
|
106
|
-
if long_to_one_mappings.empty?
|
107
|
-
[_str1, _str2, _mappings]
|
108
|
-
else
|
109
|
-
## long to one character mappings
|
110
|
-
pletters = TextAlignment::PADDING_LETTERS
|
111
|
-
|
112
|
-
# find the padding letter for str1
|
113
|
-
@padding_letter1 = begin
|
114
|
-
i = pletters.index{|l| _str2.index(l).nil?}
|
115
|
-
raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
|
116
|
-
TextAlignment::PADDING_LETTERS[i]
|
117
|
-
end
|
106
|
+
private
|
118
107
|
|
119
|
-
|
120
|
-
|
121
|
-
i = pletters.index{|l| l != @padding_letter1 && _str1.index(l).nil?}
|
122
|
-
raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
|
123
|
-
TextAlignment::PADDING_LETTERS[i]
|
124
|
-
end
|
108
|
+
def enmap_text(_text, char_mapping)
|
109
|
+
text = _text.dup
|
125
110
|
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
else
|
141
|
-
_str2
|
142
|
-
end
|
111
|
+
# To execute the single letter mapping
|
112
|
+
char_mapping.each do |one, long|
|
113
|
+
text.gsub!(one, long) if long.length == 1
|
114
|
+
end
|
115
|
+
|
116
|
+
# To get the (location, length) index for replacements
|
117
|
+
loc_len = []
|
118
|
+
char_mapping.each do |one, long|
|
119
|
+
next if long.length == 1
|
120
|
+
|
121
|
+
init_next = 0
|
122
|
+
while loc = text.index(long, init_next)
|
123
|
+
loc_len << [loc, long.length]
|
124
|
+
init_next = loc + long.length
|
143
125
|
end
|
144
|
-
mappings = _mappings.select{|m| m[0].length > 1 || m[1].length == 1}
|
145
126
|
|
146
|
-
|
127
|
+
# a workaround to avoid messing-up due to embedding
|
128
|
+
text.gsub!(long, one * long.length)
|
147
129
|
end
|
148
|
-
end
|
149
130
|
|
150
|
-
|
151
|
-
|
131
|
+
# To get the (location, length) index for consecutive whitespace sequences
|
132
|
+
init_next = 0
|
133
|
+
while loc = text.index(/\s{2,}/, init_next)
|
134
|
+
len = $~[0].length
|
135
|
+
loc_len << [loc, len]
|
136
|
+
init_next = loc + len
|
137
|
+
end
|
138
|
+
|
139
|
+
loc_len.sort!{|a, b| a[0] <=> b[0]}
|
140
|
+
|
141
|
+
# To get the offset_mapping before and after replacement
|
142
|
+
offset_mapping = []
|
143
|
+
init_next = 0
|
144
|
+
j = 0
|
152
145
|
|
153
|
-
|
154
|
-
|
155
|
-
|
146
|
+
loc_len.each do |loc, len|
|
147
|
+
offset_mapping += (init_next .. loc).map do |i|
|
148
|
+
j += 1
|
149
|
+
[i, j - 1]
|
150
|
+
end
|
151
|
+
init_next = loc + len
|
152
|
+
end
|
156
153
|
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
_s1
|
154
|
+
offset_mapping += (init_next .. text.length).map do |i|
|
155
|
+
j += 1
|
156
|
+
[i, j - 1]
|
161
157
|
end
|
162
158
|
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
_s2
|
159
|
+
# To execute the long letter mapping
|
160
|
+
char_mapping.each do |one, long|
|
161
|
+
text.gsub!(one * long.length, one) if long.length > 1
|
167
162
|
end
|
168
163
|
|
169
|
-
|
164
|
+
# To replace multi-whitespace sequences with a single space
|
165
|
+
text.gsub!(/\s{2,}/, ' ')
|
166
|
+
|
167
|
+
[text, offset_mapping]
|
170
168
|
end
|
169
|
+
end
|
170
|
+
|
171
|
+
if __FILE__ == $0
|
172
|
+
require 'json'
|
173
|
+
|
174
|
+
unless ARGV.length == 1
|
175
|
+
warn "#{$0} an_annotation_json_file.json"
|
176
|
+
exit
|
177
|
+
end
|
178
|
+
annotations = JSON.parse File.read(ARGV[0]).strip, symbolize_names: true
|
179
|
+
denotations = annotations[:denotations] || []
|
180
|
+
if denotations.nil? && annotations[:tracks]
|
181
|
+
denotations = annotations[:tracks].first[:denotations]
|
182
|
+
end
|
183
|
+
|
184
|
+
text_mapping = TextAlignment::CharMapping.new(annotations[:text])
|
185
|
+
text_mapped = text_mapping.mapped_text
|
186
|
+
denotations_mapped = text_mapping.enmap_denotations(denotations)
|
187
|
+
new_annotations = {text:text_mapped, denotations:denotations_mapped}
|
171
188
|
|
189
|
+
puts new_annotations.to_json
|
172
190
|
end
|