text_alignment 0.11.8 → 0.11.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/align_annotations +17 -9
- data/lib/text_alignment/anchor_finder.rb +6 -8
- data/lib/text_alignment/char_mapping.rb +20 -28
- data/lib/text_alignment/text_alignment.rb +61 -53
- data/lib/text_alignment/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5f95c7fb8bfdeb768fbd1400f7e785a2ed18016322f2cdcba3ea7196aa4e86ac
|
4
|
+
data.tar.gz: cad0296a218108884703af07bcbf1b303a6e12c51ab14ad872586c50cfc8e82c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5d18a8d142974967fcfe358b561c83d76d1c5d8d2a277092ed4a8b42de14cd791b004830ff9f145735e6bc0a1265295f4fd6190c674ba62b57190d7e035a863b
|
7
|
+
data.tar.gz: c155ff9780d5f82893825787ee39ba319ddb1f16396d6a933334a83e485ff4d2eb2840d5fc1399f9873cd69a2458f029ced0d128d9099c4f08137d360e3e2007
|
data/bin/align_annotations
CHANGED
@@ -110,20 +110,28 @@ end
|
|
110
110
|
|
111
111
|
|
112
112
|
## Options
|
113
|
-
|
114
|
-
|
113
|
+
options = {}
|
114
|
+
verbose = false
|
115
115
|
|
116
116
|
## command line option processing
|
117
117
|
require 'optparse'
|
118
118
|
optparse = OptionParser.new do |opts|
|
119
119
|
opts.banner = "Usage: align_annotations [options] target_annotations(.json|.txt) reference_text(.json|.txt)"
|
120
120
|
|
121
|
-
opts.on('-
|
122
|
-
|
121
|
+
opts.on('-d', '--duplicate', 'tells it to assume there may be duplicate texts.') do
|
122
|
+
options[:duplicate_texts] = true
|
123
123
|
end
|
124
124
|
|
125
|
-
opts.on('-
|
126
|
-
|
125
|
+
opts.on('-w', '--no-whitespaces', 'tells it to ignore whitespaces.') do
|
126
|
+
options[:to_ignore_whitespaces] = true
|
127
|
+
end
|
128
|
+
|
129
|
+
opts.on('-o', '--no-order', 'tells it to ignore the order of the texts.') do
|
130
|
+
options[:to_ignore_text_order] = true
|
131
|
+
end
|
132
|
+
|
133
|
+
opts.on('-v', '--verbose', 'tells it to show the state verbosely for debugging.') do
|
134
|
+
verbose = true
|
127
135
|
end
|
128
136
|
|
129
137
|
opts.on('-h', '--help', 'displays this screen.') do
|
@@ -142,12 +150,12 @@ end
|
|
142
150
|
source_annotations = read_annotations(ARGV[0])
|
143
151
|
reference_text = read_text(ARGV[1])
|
144
152
|
|
145
|
-
alignment = TextAlignment::TextAlignment.new(reference_text,
|
153
|
+
alignment = TextAlignment::TextAlignment.new(reference_text, options)
|
146
154
|
|
147
155
|
target_annotations = if source_annotations.class == Array
|
148
|
-
align_mannotations(source_annotations, reference_text, alignment,
|
156
|
+
align_mannotations(source_annotations, reference_text, alignment, verbose)
|
149
157
|
else
|
150
|
-
denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment,
|
158
|
+
denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment, verbose)
|
151
159
|
source_annotations.merge({text:reference_text, denotations:denotations})
|
152
160
|
end
|
153
161
|
|
@@ -6,17 +6,18 @@ module TextAlignment; end unless defined? TextAlignment
|
|
6
6
|
|
7
7
|
class TextAlignment::AnchorFinder
|
8
8
|
|
9
|
-
def initialize(source_str, target_str, cultivation_map,
|
10
|
-
@method_get_left_windows, @method_get_right_windows = if
|
11
|
-
[method(:get_left_windows), method(:get_right_windows)]
|
12
|
-
else
|
9
|
+
def initialize(source_str, target_str, cultivation_map, to_ignore_whitespaces = false, to_ignore_text_order = false)
|
10
|
+
@method_get_left_windows, @method_get_right_windows = if to_ignore_whitespaces
|
13
11
|
[method(:get_left_windows_no_squeeze_ws), method(:get_right_windows_no_squeeze_ws)]
|
12
|
+
else
|
13
|
+
[method(:get_left_windows), method(:get_right_windows)]
|
14
14
|
end
|
15
15
|
|
16
16
|
@s1 = source_str.downcase
|
17
17
|
@s2 = target_str.downcase
|
18
18
|
|
19
19
|
@cultivation_map = cultivation_map
|
20
|
+
@to_ignore_text_order = to_ignore_text_order
|
20
21
|
|
21
22
|
@size_ngram = TextAlignment::SIZE_NGRAM
|
22
23
|
@size_window = TextAlignment::SIZE_WINDOW
|
@@ -71,10 +72,7 @@ class TextAlignment::AnchorFinder
|
|
71
72
|
# to get the anchor to search for in s2
|
72
73
|
anchor = @s1[beg_s1, @size_ngram]
|
73
74
|
|
74
|
-
|
75
|
-
# search_position = 0
|
76
|
-
search_position = @pos_s2_last_match
|
77
|
-
|
75
|
+
search_position = @to_ignore_text_order ? 0 : @pos_s2_last_match
|
78
76
|
beg_s2_candidates = find_beg_s2_candidates(anchor, search_position)
|
79
77
|
return nil if beg_s2_candidates.empty?
|
80
78
|
|
@@ -80,10 +80,10 @@ TextAlignment::CHAR_MAPPING = [
|
|
80
80
|
|
81
81
|
|
82
82
|
class TextAlignment::CharMapping
|
83
|
-
attr_reader :mapped_text
|
83
|
+
attr_reader :mapped_text, :index_enmap
|
84
84
|
|
85
|
-
def initialize(_text, char_mapping = nil,
|
86
|
-
if
|
85
|
+
def initialize(_text, char_mapping = nil, to_ignore_whitespaces = false)
|
86
|
+
if to_ignore_whitespaces
|
87
87
|
@method_get_positions_squeeze_ws = method(:get_positions_squeeze_ws_0)
|
88
88
|
@method_squeeze_ws = method(:squeeze_ws_0!)
|
89
89
|
else
|
@@ -140,29 +140,25 @@ class TextAlignment::CharMapping
|
|
140
140
|
|
141
141
|
# To get the replacement positions, (position, old_length, new_length), for consecutive whitespaces
|
142
142
|
rpositions += @method_get_positions_squeeze_ws.call(text)
|
143
|
-
|
144
143
|
rpositions.sort!{|a, b| a[0] <=> b[0]}
|
145
144
|
|
146
145
|
# To get the offset_mapping before and after replacement
|
147
|
-
offset_mapping =
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
j +=
|
146
|
+
offset_mapping = begin
|
147
|
+
i, j = 0, 0
|
148
|
+
|
149
|
+
offset_mappings = rpositions.map do |loc, old_len, new_len|
|
150
|
+
pre_len = loc - i
|
151
|
+
m = (0 .. pre_len).map{|c| [i + c, j + c]}
|
152
|
+
i = loc + old_len
|
153
|
+
j += pre_len + new_len
|
154
|
+
|
155
155
|
m
|
156
156
|
end
|
157
157
|
|
158
|
-
|
159
|
-
|
160
|
-
end
|
158
|
+
pre_len = text.length - i
|
159
|
+
offset_mappings << (0 .. pre_len).map{|c| [i + c, j + c]}
|
161
160
|
|
162
|
-
|
163
|
-
m = [i, j]
|
164
|
-
j += 1
|
165
|
-
m
|
161
|
+
offset_mappings.reduce(:+)
|
166
162
|
end
|
167
163
|
|
168
164
|
# To execute the long letter mapping
|
@@ -179,7 +175,7 @@ class TextAlignment::CharMapping
|
|
179
175
|
# To get squeeze positions of whitespaces to one
|
180
176
|
def get_positions_squeeze_ws_1(text)
|
181
177
|
rpositions = []
|
182
|
-
text.scan(
|
178
|
+
text.scan(/\s{2,}/) do |s|
|
183
179
|
loc = $~.begin(0)
|
184
180
|
len = $~.end(0) - loc
|
185
181
|
rpositions << [loc, len, 1]
|
@@ -189,13 +185,7 @@ class TextAlignment::CharMapping
|
|
189
185
|
|
190
186
|
# To get squeeze positions of whitespaces to zero
|
191
187
|
def get_positions_squeeze_ws_0(text)
|
192
|
-
|
193
|
-
text.scan(/\s+/) do |s|
|
194
|
-
loc = $~.begin(0)
|
195
|
-
len = $~.end(0) - loc
|
196
|
-
rpositions << [loc, len, 0]
|
197
|
-
end
|
198
|
-
rpositions
|
188
|
+
text.enum_for(:scan, /\s+/).map{[b = $~.begin(0), $~.end(0) - b, 0]}
|
199
189
|
end
|
200
190
|
|
201
191
|
def squeeze_ws_1!(text)
|
@@ -210,6 +200,7 @@ end
|
|
210
200
|
|
211
201
|
if __FILE__ == $0
|
212
202
|
require 'json'
|
203
|
+
# require 'profile'
|
213
204
|
|
214
205
|
unless ARGV.length == 1
|
215
206
|
warn "#{$0} an_annotation_json_file.json"
|
@@ -221,7 +212,8 @@ if __FILE__ == $0
|
|
221
212
|
denotations = annotations[:tracks].first[:denotations]
|
222
213
|
end
|
223
214
|
|
224
|
-
text_mapping = TextAlignment::CharMapping.new(annotations[:text])
|
215
|
+
text_mapping = TextAlignment::CharMapping.new(annotations[:text], nil, false)
|
216
|
+
# text_mapping = TextAlignment::CharMapping.new(annotations[:text], nil, true)
|
225
217
|
text_mapped = text_mapping.mapped_text
|
226
218
|
denotations_mapped = text_mapping.enmap_denotations(denotations)
|
227
219
|
new_annotations = {text:text_mapped, denotations:denotations_mapped}
|
@@ -16,11 +16,12 @@ class TextAlignment::TextAlignment
|
|
16
16
|
raise ArgumentError, "nil text" if reference_text.nil?
|
17
17
|
|
18
18
|
options ||= {}
|
19
|
-
@
|
20
|
-
@
|
19
|
+
@duplicate_texts = options[:duplicate_texts] || false
|
20
|
+
@to_ignore_whitespaces = options[:to_ignore_whitespaces] || false
|
21
|
+
@to_ignore_text_order = options[:to_ignore_text_order] || false
|
21
22
|
|
22
23
|
@original_reference_text = reference_text
|
23
|
-
@rtext_mapping = TextAlignment::CharMapping.new(reference_text, nil, @
|
24
|
+
@rtext_mapping = TextAlignment::CharMapping.new(reference_text, nil, @to_ignore_whitespaces)
|
24
25
|
@mapped_reference_text = @rtext_mapping.mapped_text
|
25
26
|
|
26
27
|
@original_text = nil
|
@@ -30,12 +31,12 @@ class TextAlignment::TextAlignment
|
|
30
31
|
|
31
32
|
def align(text, denotations = nil)
|
32
33
|
# To maintain the cultivation map
|
33
|
-
update_cultivation_map
|
34
|
+
update_cultivation_map unless @duplicate_texts
|
34
35
|
|
35
36
|
# In case the input text is the same as the previous one, reuse the previous text mapping
|
36
37
|
unless @original_text && @original_text == text
|
37
38
|
@original_text = text
|
38
|
-
@text_mapping = TextAlignment::CharMapping.new(text, nil, @
|
39
|
+
@text_mapping = TextAlignment::CharMapping.new(text, nil, @to_ignore_whitespaces)
|
39
40
|
end
|
40
41
|
|
41
42
|
@mapped_text = @text_mapping.mapped_text
|
@@ -205,7 +206,7 @@ class TextAlignment::TextAlignment
|
|
205
206
|
|
206
207
|
def find_block_alignment(str1, str2, denotations, cultivation_map)
|
207
208
|
## to find block alignments
|
208
|
-
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map, @
|
209
|
+
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map, @to_ignore_whitespaces, @to_ignore_text_order)
|
209
210
|
|
210
211
|
blocks = []
|
211
212
|
while block = anchor_finder.get_next_anchor
|
@@ -241,68 +242,75 @@ class TextAlignment::TextAlignment
|
|
241
242
|
b1 = lblock.nil? ? 0 : lblock[:source][:end]
|
242
243
|
e1 = cblock.nil? ? str1.length : cblock[:source][:begin]
|
243
244
|
|
244
|
-
if b1
|
245
|
+
if b1 <= e1
|
246
|
+
_str1 = str1[b1 ... e1]
|
247
|
+
|
245
248
|
b2 = lblock.nil? ? 0 : lblock[:target][:end]
|
246
249
|
e2 = cblock.nil? ? str2.length : cblock[:target][:begin]
|
247
|
-
_str1 = str1[b1 ... e1]
|
248
|
-
_str2 = str2[b2 ... e2]
|
249
250
|
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
261
|
-
else
|
262
|
-
oe2 = state_region[1]
|
263
|
-
me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
|
264
|
-
local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
|
265
|
-
end
|
266
|
-
when :rear_open
|
267
|
-
if cblock.nil? # when there is no following matched block
|
268
|
-
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
269
|
-
else
|
270
|
-
ob2 = state_region[0]
|
271
|
-
mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
|
272
|
-
local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
|
273
|
-
end
|
274
|
-
when :middle_closed
|
275
|
-
attempt1 = if sum.empty?
|
251
|
+
if b2 < e2
|
252
|
+
_str2 = str2[b2 ... e2]
|
253
|
+
|
254
|
+
sum += if _str1.strip.empty? || _str2.strip.empty?
|
255
|
+
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
256
|
+
else
|
257
|
+
len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
258
|
+
region_state, state_region = cultivation_map.region_state([b2, e2])
|
259
|
+
case region_state
|
260
|
+
when :closed
|
276
261
|
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
262
|
+
when :front_open
|
263
|
+
if sum.empty? # when there is no preceding matched block
|
264
|
+
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
265
|
+
else
|
266
|
+
oe2 = state_region[1]
|
267
|
+
me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
|
268
|
+
local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
|
269
|
+
end
|
270
|
+
when :rear_open
|
271
|
+
if cblock.nil? # when there is no following matched block
|
272
|
+
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
273
|
+
else
|
274
|
+
ob2 = state_region[0]
|
275
|
+
mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
|
276
|
+
local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
|
277
|
+
end
|
278
|
+
when :middle_closed
|
291
279
|
attempt1 = if sum.empty?
|
292
280
|
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
293
281
|
else
|
294
|
-
|
282
|
+
oe2 = state_region[0]
|
283
|
+
me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
|
284
|
+
local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
|
295
285
|
end
|
296
286
|
if (attempt1.empty? || attempt1.first[:alignment] == :empty) && !cblock.nil?
|
297
|
-
|
287
|
+
ob2 = state_region[1]
|
288
|
+
mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
|
289
|
+
local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
|
298
290
|
else
|
299
291
|
attempt1
|
300
292
|
end
|
301
|
-
else
|
302
|
-
|
293
|
+
else # :open
|
294
|
+
if (e2 - b2) > len_buffer
|
295
|
+
attempt1 = if sum.empty?
|
296
|
+
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
297
|
+
else
|
298
|
+
local_alignment(str1, b1, e1, str2, b2, b2 + len_buffer, denotations, cultivation_map)
|
299
|
+
end
|
300
|
+
if (attempt1.empty? || attempt1.first[:alignment] == :empty) && !cblock.nil?
|
301
|
+
local_alignment(str1, b1, e1, str2, e2 - len_buffer, e2, denotations, cultivation_map)
|
302
|
+
else
|
303
|
+
attempt1
|
304
|
+
end
|
305
|
+
else
|
306
|
+
local_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
|
307
|
+
end
|
303
308
|
end
|
304
309
|
end
|
310
|
+
elsif b2 > e2 # when out of order
|
311
|
+
# ToDo
|
305
312
|
end
|
313
|
+
|
306
314
|
end
|
307
315
|
|
308
316
|
lblock = cblock
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.11.
|
4
|
+
version: 0.11.10
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-04-
|
11
|
+
date: 2021-04-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|
@@ -111,7 +111,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
111
111
|
- !ruby/object:Gem::Version
|
112
112
|
version: '0'
|
113
113
|
requirements: []
|
114
|
-
rubygems_version: 3.0.
|
114
|
+
rubygems_version: 3.0.9
|
115
115
|
signing_key:
|
116
116
|
specification_version: 4
|
117
117
|
summary: Ruby class for aligning two character strings
|