text_alignment 0.11.8 → 0.11.10
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/align_annotations +17 -9
- data/lib/text_alignment/anchor_finder.rb +6 -8
- data/lib/text_alignment/char_mapping.rb +20 -28
- data/lib/text_alignment/text_alignment.rb +61 -53
- data/lib/text_alignment/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5f95c7fb8bfdeb768fbd1400f7e785a2ed18016322f2cdcba3ea7196aa4e86ac
|
4
|
+
data.tar.gz: cad0296a218108884703af07bcbf1b303a6e12c51ab14ad872586c50cfc8e82c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5d18a8d142974967fcfe358b561c83d76d1c5d8d2a277092ed4a8b42de14cd791b004830ff9f145735e6bc0a1265295f4fd6190c674ba62b57190d7e035a863b
|
7
|
+
data.tar.gz: c155ff9780d5f82893825787ee39ba319ddb1f16396d6a933334a83e485ff4d2eb2840d5fc1399f9873cd69a2458f029ced0d128d9099c4f08137d360e3e2007
|
data/bin/align_annotations
CHANGED
@@ -110,20 +110,28 @@ end
|
|
110
110
|
|
111
111
|
|
112
112
|
## Options
|
113
|
-
|
114
|
-
|
113
|
+
options = {}
|
114
|
+
verbose = false
|
115
115
|
|
116
116
|
## command line option processing
|
117
117
|
require 'optparse'
|
118
118
|
optparse = OptionParser.new do |opts|
|
119
119
|
opts.banner = "Usage: align_annotations [options] target_annotations(.json|.txt) reference_text(.json|.txt)"
|
120
120
|
|
121
|
-
opts.on('-
|
122
|
-
|
121
|
+
opts.on('-d', '--duplicate', 'tells it to assume there may be duplicate texts.') do
|
122
|
+
options[:duplicate_texts] = true
|
123
123
|
end
|
124
124
|
|
125
|
-
opts.on('-
|
126
|
-
|
125
|
+
opts.on('-w', '--no-whitespaces', 'tells it to ignore whitespaces.') do
|
126
|
+
options[:to_ignore_whitespaces] = true
|
127
|
+
end
|
128
|
+
|
129
|
+
opts.on('-o', '--no-order', 'tells it to ignore the order of the texts.') do
|
130
|
+
options[:to_ignore_text_order] = true
|
131
|
+
end
|
132
|
+
|
133
|
+
opts.on('-v', '--verbose', 'tells it to show the state verbosely for debugging.') do
|
134
|
+
verbose = true
|
127
135
|
end
|
128
136
|
|
129
137
|
opts.on('-h', '--help', 'displays this screen.') do
|
@@ -142,12 +150,12 @@ end
|
|
142
150
|
source_annotations = read_annotations(ARGV[0])
|
143
151
|
reference_text = read_text(ARGV[1])
|
144
152
|
|
145
|
-
alignment = TextAlignment::TextAlignment.new(reference_text,
|
153
|
+
alignment = TextAlignment::TextAlignment.new(reference_text, options)
|
146
154
|
|
147
155
|
target_annotations = if source_annotations.class == Array
|
148
|
-
align_mannotations(source_annotations, reference_text, alignment,
|
156
|
+
align_mannotations(source_annotations, reference_text, alignment, verbose)
|
149
157
|
else
|
150
|
-
denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment,
|
158
|
+
denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment, verbose)
|
151
159
|
source_annotations.merge({text:reference_text, denotations:denotations})
|
152
160
|
end
|
153
161
|
|
@@ -6,17 +6,18 @@ module TextAlignment; end unless defined? TextAlignment
|
|
6
6
|
|
7
7
|
class TextAlignment::AnchorFinder
|
8
8
|
|
9
|
-
def initialize(source_str, target_str, cultivation_map,
|
10
|
-
@method_get_left_windows, @method_get_right_windows = if
|
11
|
-
[method(:get_left_windows), method(:get_right_windows)]
|
12
|
-
else
|
9
|
+
def initialize(source_str, target_str, cultivation_map, to_ignore_whitespaces = false, to_ignore_text_order = false)
|
10
|
+
@method_get_left_windows, @method_get_right_windows = if to_ignore_whitespaces
|
13
11
|
[method(:get_left_windows_no_squeeze_ws), method(:get_right_windows_no_squeeze_ws)]
|
12
|
+
else
|
13
|
+
[method(:get_left_windows), method(:get_right_windows)]
|
14
14
|
end
|
15
15
|
|
16
16
|
@s1 = source_str.downcase
|
17
17
|
@s2 = target_str.downcase
|
18
18
|
|
19
19
|
@cultivation_map = cultivation_map
|
20
|
+
@to_ignore_text_order = to_ignore_text_order
|
20
21
|
|
21
22
|
@size_ngram = TextAlignment::SIZE_NGRAM
|
22
23
|
@size_window = TextAlignment::SIZE_WINDOW
|
@@ -71,10 +72,7 @@ class TextAlignment::AnchorFinder
|
|
71
72
|
# to get the anchor to search for in s2
|
72
73
|
anchor = @s1[beg_s1, @size_ngram]
|
73
74
|
|
74
|
-
|
75
|
-
# search_position = 0
|
76
|
-
search_position = @pos_s2_last_match
|
77
|
-
|
75
|
+
search_position = @to_ignore_text_order ? 0 : @pos_s2_last_match
|
78
76
|
beg_s2_candidates = find_beg_s2_candidates(anchor, search_position)
|
79
77
|
return nil if beg_s2_candidates.empty?
|
80
78
|
|
@@ -80,10 +80,10 @@ TextAlignment::CHAR_MAPPING = [
|
|
80
80
|
|
81
81
|
|
82
82
|
class TextAlignment::CharMapping
|
83
|
-
attr_reader :mapped_text
|
83
|
+
attr_reader :mapped_text, :index_enmap
|
84
84
|
|
85
|
-
def initialize(_text, char_mapping = nil,
|
86
|
-
if
|
85
|
+
def initialize(_text, char_mapping = nil, to_ignore_whitespaces = false)
|
86
|
+
if to_ignore_whitespaces
|
87
87
|
@method_get_positions_squeeze_ws = method(:get_positions_squeeze_ws_0)
|
88
88
|
@method_squeeze_ws = method(:squeeze_ws_0!)
|
89
89
|
else
|
@@ -140,29 +140,25 @@ class TextAlignment::CharMapping
|
|
140
140
|
|
141
141
|
# To get the replacement positions, (position, old_length, new_length), for consecutive whitespaces
|
142
142
|
rpositions += @method_get_positions_squeeze_ws.call(text)
|
143
|
-
|
144
143
|
rpositions.sort!{|a, b| a[0] <=> b[0]}
|
145
144
|
|
146
145
|
# To get the offset_mapping before and after replacement
|
147
|
-
offset_mapping =
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
j +=
|
146
|
+
offset_mapping = begin
|
147
|
+
i, j = 0, 0
|
148
|
+
|
149
|
+
offset_mappings = rpositions.map do |loc, old_len, new_len|
|
150
|
+
pre_len = loc - i
|
151
|
+
m = (0 .. pre_len).map{|c| [i + c, j + c]}
|
152
|
+
i = loc + old_len
|
153
|
+
j += pre_len + new_len
|
154
|
+
|
155
155
|
m
|
156
156
|
end
|
157
157
|
|
158
|
-
|
159
|
-
|
160
|
-
end
|
158
|
+
pre_len = text.length - i
|
159
|
+
offset_mappings << (0 .. pre_len).map{|c| [i + c, j + c]}
|
161
160
|
|
162
|
-
|
163
|
-
m = [i, j]
|
164
|
-
j += 1
|
165
|
-
m
|
161
|
+
offset_mappings.reduce(:+)
|
166
162
|
end
|
167
163
|
|
168
164
|
# To execute the long letter mapping
|
@@ -179,7 +175,7 @@ class TextAlignment::CharMapping
|
|
179
175
|
# To get squeeze positions of whitespaces to one
|
180
176
|
def get_positions_squeeze_ws_1(text)
|
181
177
|
rpositions = []
|
182
|
-
text.scan(
|
178
|
+
text.scan(/\s{2,}/) do |s|
|
183
179
|
loc = $~.begin(0)
|
184
180
|
len = $~.end(0) - loc
|
185
181
|
rpositions << [loc, len, 1]
|
@@ -189,13 +185,7 @@ class TextAlignment::CharMapping
|
|
189
185
|
|
190
186
|
# To get squeeze positions of whitespaces to zero
|
191
187
|
def get_positions_squeeze_ws_0(text)
|
192
|
-
|
193
|
-
text.scan(/\s+/) do |s|
|
194
|
-
loc = $~.begin(0)
|
195
|
-
len = $~.end(0) - loc
|
196
|
-
rpositions << [loc, len, 0]
|
197
|
-
end
|
198
|
-
rpositions
|
188
|
+
text.enum_for(:scan, /\s+/).map{[b = $~.begin(0), $~.end(0) - b, 0]}
|
199
189
|
end
|
200
190
|
|
201
191
|
def squeeze_ws_1!(text)
|
@@ -210,6 +200,7 @@ end
|
|
210
200
|
|
211
201
|
if __FILE__ == $0
|
212
202
|
require 'json'
|
203
|
+
# require 'profile'
|
213
204
|
|
214
205
|
unless ARGV.length == 1
|
215
206
|
warn "#{$0} an_annotation_json_file.json"
|
@@ -221,7 +212,8 @@ if __FILE__ == $0
|
|
221
212
|
denotations = annotations[:tracks].first[:denotations]
|
222
213
|
end
|
223
214
|
|
224
|
-
text_mapping = TextAlignment::CharMapping.new(annotations[:text])
|
215
|
+
text_mapping = TextAlignment::CharMapping.new(annotations[:text], nil, false)
|
216
|
+
# text_mapping = TextAlignment::CharMapping.new(annotations[:text], nil, true)
|
225
217
|
text_mapped = text_mapping.mapped_text
|
226
218
|
denotations_mapped = text_mapping.enmap_denotations(denotations)
|
227
219
|
new_annotations = {text:text_mapped, denotations:denotations_mapped}
|
@@ -16,11 +16,12 @@ class TextAlignment::TextAlignment
|
|
16
16
|
raise ArgumentError, "nil text" if reference_text.nil?
|
17
17
|
|
18
18
|
options ||= {}
|
19
|
-
@
|
20
|
-
@
|
19
|
+
@duplicate_texts = options[:duplicate_texts] || false
|
20
|
+
@to_ignore_whitespaces = options[:to_ignore_whitespaces] || false
|
21
|
+
@to_ignore_text_order = options[:to_ignore_text_order] || false
|
21
22
|
|
22
23
|
@original_reference_text = reference_text
|
23
|
-
@rtext_mapping = TextAlignment::CharMapping.new(reference_text, nil, @
|
24
|
+
@rtext_mapping = TextAlignment::CharMapping.new(reference_text, nil, @to_ignore_whitespaces)
|
24
25
|
@mapped_reference_text = @rtext_mapping.mapped_text
|
25
26
|
|
26
27
|
@original_text = nil
|
@@ -30,12 +31,12 @@ class TextAlignment::TextAlignment
|
|
30
31
|
|
31
32
|
def align(text, denotations = nil)
|
32
33
|
# To maintain the cultivation map
|
33
|
-
update_cultivation_map
|
34
|
+
update_cultivation_map unless @duplicate_texts
|
34
35
|
|
35
36
|
# In case the input text is the same as the previous one, reuse the previous text mapping
|
36
37
|
unless @original_text && @original_text == text
|
37
38
|
@original_text = text
|
38
|
-
@text_mapping = TextAlignment::CharMapping.new(text, nil, @
|
39
|
+
@text_mapping = TextAlignment::CharMapping.new(text, nil, @to_ignore_whitespaces)
|
39
40
|
end
|
40
41
|
|
41
42
|
@mapped_text = @text_mapping.mapped_text
|
@@ -205,7 +206,7 @@ class TextAlignment::TextAlignment
|
|
205
206
|
|
206
207
|
def find_block_alignment(str1, str2, denotations, cultivation_map)
|
207
208
|
## to find block alignments
|
208
|
-
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map, @
|
209
|
+
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map, @to_ignore_whitespaces, @to_ignore_text_order)
|
209
210
|
|
210
211
|
blocks = []
|
211
212
|
while block = anchor_finder.get_next_anchor
|
@@ -241,68 +242,75 @@ class TextAlignment::TextAlignment
|
|
241
242
|
b1 = lblock.nil? ? 0 : lblock[:source][:end]
|
242
243
|
e1 = cblock.nil? ? str1.length : cblock[:source][:begin]
|
243
244
|
|
244
|
-
if b1
|
245
|
+
if b1 <= e1
|
246
|
+
_str1 = str1[b1 ... e1]
|
247
|
+
|
245
248
|
b2 = lblock.nil? ? 0 : lblock[:target][:end]
|
246
249
|
e2 = cblock.nil? ? str2.length : cblock[:target][:begin]
|
247
|
-
_str1 = str1[b1 ... e1]
|
248
|
-
_str2 = str2[b2 ... e2]
|
249
250
|
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
261
|
-
else
|
262
|
-
oe2 = state_region[1]
|
263
|
-
me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
|
264
|
-
local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
|
265
|
-
end
|
266
|
-
when :rear_open
|
267
|
-
if cblock.nil? # when there is no following matched block
|
268
|
-
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
269
|
-
else
|
270
|
-
ob2 = state_region[0]
|
271
|
-
mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
|
272
|
-
local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
|
273
|
-
end
|
274
|
-
when :middle_closed
|
275
|
-
attempt1 = if sum.empty?
|
251
|
+
if b2 < e2
|
252
|
+
_str2 = str2[b2 ... e2]
|
253
|
+
|
254
|
+
sum += if _str1.strip.empty? || _str2.strip.empty?
|
255
|
+
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
256
|
+
else
|
257
|
+
len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
258
|
+
region_state, state_region = cultivation_map.region_state([b2, e2])
|
259
|
+
case region_state
|
260
|
+
when :closed
|
276
261
|
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
262
|
+
when :front_open
|
263
|
+
if sum.empty? # when there is no preceding matched block
|
264
|
+
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
265
|
+
else
|
266
|
+
oe2 = state_region[1]
|
267
|
+
me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
|
268
|
+
local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
|
269
|
+
end
|
270
|
+
when :rear_open
|
271
|
+
if cblock.nil? # when there is no following matched block
|
272
|
+
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
273
|
+
else
|
274
|
+
ob2 = state_region[0]
|
275
|
+
mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
|
276
|
+
local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
|
277
|
+
end
|
278
|
+
when :middle_closed
|
291
279
|
attempt1 = if sum.empty?
|
292
280
|
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
293
281
|
else
|
294
|
-
|
282
|
+
oe2 = state_region[0]
|
283
|
+
me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
|
284
|
+
local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
|
295
285
|
end
|
296
286
|
if (attempt1.empty? || attempt1.first[:alignment] == :empty) && !cblock.nil?
|
297
|
-
|
287
|
+
ob2 = state_region[1]
|
288
|
+
mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
|
289
|
+
local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
|
298
290
|
else
|
299
291
|
attempt1
|
300
292
|
end
|
301
|
-
else
|
302
|
-
|
293
|
+
else # :open
|
294
|
+
if (e2 - b2) > len_buffer
|
295
|
+
attempt1 = if sum.empty?
|
296
|
+
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
297
|
+
else
|
298
|
+
local_alignment(str1, b1, e1, str2, b2, b2 + len_buffer, denotations, cultivation_map)
|
299
|
+
end
|
300
|
+
if (attempt1.empty? || attempt1.first[:alignment] == :empty) && !cblock.nil?
|
301
|
+
local_alignment(str1, b1, e1, str2, e2 - len_buffer, e2, denotations, cultivation_map)
|
302
|
+
else
|
303
|
+
attempt1
|
304
|
+
end
|
305
|
+
else
|
306
|
+
local_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
|
307
|
+
end
|
303
308
|
end
|
304
309
|
end
|
310
|
+
elsif b2 > e2 # when out of order
|
311
|
+
# ToDo
|
305
312
|
end
|
313
|
+
|
306
314
|
end
|
307
315
|
|
308
316
|
lblock = cblock
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.11.
|
4
|
+
version: 0.11.10
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-04-
|
11
|
+
date: 2021-04-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|
@@ -111,7 +111,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
111
111
|
- !ruby/object:Gem::Version
|
112
112
|
version: '0'
|
113
113
|
requirements: []
|
114
|
-
rubygems_version: 3.0.
|
114
|
+
rubygems_version: 3.0.9
|
115
115
|
signing_key:
|
116
116
|
specification_version: 4
|
117
117
|
summary: Ruby class for aligning two character strings
|