text_alignment 0.11.3 → 0.11.10
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/align_annotations +39 -8
- data/lib/text_alignment/anchor_finder.rb +51 -9
- data/lib/text_alignment/char_mapping.rb +65 -33
- data/lib/text_alignment/mixed_alignment.rb +15 -4
- data/lib/text_alignment/text_alignment.rb +66 -55
- data/lib/text_alignment/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5f95c7fb8bfdeb768fbd1400f7e785a2ed18016322f2cdcba3ea7196aa4e86ac
|
4
|
+
data.tar.gz: cad0296a218108884703af07bcbf1b303a6e12c51ab14ad872586c50cfc8e82c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5d18a8d142974967fcfe358b561c83d76d1c5d8d2a277092ed4a8b42de14cd791b004830ff9f145735e6bc0a1265295f4fd6190c674ba62b57190d7e035a863b
|
7
|
+
data.tar.gz: c155ff9780d5f82893825787ee39ba319ddb1f16396d6a933334a83e485ff4d2eb2840d5fc1399f9873cd69a2458f029ced0d128d9099c4f08137d360e3e2007
|
data/bin/align_annotations
CHANGED
@@ -2,6 +2,7 @@
|
|
2
2
|
require 'text_alignment'
|
3
3
|
require 'json'
|
4
4
|
require 'pp'
|
5
|
+
require 'optparse'
|
5
6
|
|
6
7
|
def read_annotations(filename)
|
7
8
|
case File.extname(filename)
|
@@ -108,24 +109,54 @@ def align_mannotations(source_annotations, reference_text, alignment, debug = fa
|
|
108
109
|
end
|
109
110
|
|
110
111
|
|
112
|
+
## Options
|
113
|
+
options = {}
|
114
|
+
verbose = false
|
115
|
+
|
116
|
+
## command line option processing
|
117
|
+
require 'optparse'
|
118
|
+
optparse = OptionParser.new do |opts|
|
119
|
+
opts.banner = "Usage: align_annotations [options] target_annotations(.json|.txt) reference_text(.json|.txt)"
|
120
|
+
|
121
|
+
opts.on('-d', '--duplicate', 'tells it to assume there may be duplicate texts.') do
|
122
|
+
options[:duplicate_texts] = true
|
123
|
+
end
|
124
|
+
|
125
|
+
opts.on('-w', '--no-whitespaces', 'tells it to ignore whitespaces.') do
|
126
|
+
options[:to_ignore_whitespaces] = true
|
127
|
+
end
|
128
|
+
|
129
|
+
opts.on('-o', '--no-order', 'tells it to ignore the order of the texts.') do
|
130
|
+
options[:to_ignore_text_order] = true
|
131
|
+
end
|
132
|
+
|
133
|
+
opts.on('-v', '--verbose', 'tells it to show the state verbosely for debugging.') do
|
134
|
+
verbose = true
|
135
|
+
end
|
136
|
+
|
137
|
+
opts.on('-h', '--help', 'displays this screen.') do
|
138
|
+
puts opts
|
139
|
+
exit
|
140
|
+
end
|
141
|
+
end
|
142
|
+
|
143
|
+
optparse.parse!
|
144
|
+
|
111
145
|
unless ARGV.length == 2
|
112
|
-
|
113
|
-
exit
|
146
|
+
puts optparse.help
|
147
|
+
exit 1
|
114
148
|
end
|
115
149
|
|
116
150
|
source_annotations = read_annotations(ARGV[0])
|
117
151
|
reference_text = read_text(ARGV[1])
|
118
152
|
|
119
|
-
alignment = TextAlignment::TextAlignment.new(reference_text,
|
153
|
+
alignment = TextAlignment::TextAlignment.new(reference_text, options)
|
120
154
|
|
121
155
|
target_annotations = if source_annotations.class == Array
|
122
|
-
|
123
|
-
align_mannotations(source_annotations, reference_text, alignment, false)
|
156
|
+
align_mannotations(source_annotations, reference_text, alignment, verbose)
|
124
157
|
else
|
125
|
-
|
126
|
-
denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment, false)
|
158
|
+
denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment, verbose)
|
127
159
|
source_annotations.merge({text:reference_text, denotations:denotations})
|
128
160
|
end
|
129
161
|
|
130
|
-
# pp alignment.block_alignment
|
131
162
|
# puts target_annotations.to_json
|
@@ -6,11 +6,18 @@ module TextAlignment; end unless defined? TextAlignment
|
|
6
6
|
|
7
7
|
class TextAlignment::AnchorFinder
|
8
8
|
|
9
|
-
def initialize(source_str, target_str, cultivation_map)
|
9
|
+
def initialize(source_str, target_str, cultivation_map, to_ignore_whitespaces = false, to_ignore_text_order = false)
|
10
|
+
@method_get_left_windows, @method_get_right_windows = if to_ignore_whitespaces
|
11
|
+
[method(:get_left_windows_no_squeeze_ws), method(:get_right_windows_no_squeeze_ws)]
|
12
|
+
else
|
13
|
+
[method(:get_left_windows), method(:get_right_windows)]
|
14
|
+
end
|
15
|
+
|
10
16
|
@s1 = source_str.downcase
|
11
17
|
@s2 = target_str.downcase
|
12
18
|
|
13
19
|
@cultivation_map = cultivation_map
|
20
|
+
@to_ignore_text_order = to_ignore_text_order
|
14
21
|
|
15
22
|
@size_ngram = TextAlignment::SIZE_NGRAM
|
16
23
|
@size_window = TextAlignment::SIZE_WINDOW
|
@@ -65,10 +72,7 @@ class TextAlignment::AnchorFinder
|
|
65
72
|
# to get the anchor to search for in s2
|
66
73
|
anchor = @s1[beg_s1, @size_ngram]
|
67
74
|
|
68
|
-
|
69
|
-
# search_position = 0
|
70
|
-
search_position = @pos_s2_last_match
|
71
|
-
|
75
|
+
search_position = @to_ignore_text_order ? 0 : @pos_s2_last_match
|
72
76
|
beg_s2_candidates = find_beg_s2_candidates(anchor, search_position)
|
73
77
|
return nil if beg_s2_candidates.empty?
|
74
78
|
|
@@ -108,14 +112,14 @@ class TextAlignment::AnchorFinder
|
|
108
112
|
next
|
109
113
|
end
|
110
114
|
|
111
|
-
left_window_s1, left_window_s2 =
|
115
|
+
left_window_s1, left_window_s2 = @method_get_left_windows.call(beg_s1, beg_s2, size_window)
|
112
116
|
if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
|
113
117
|
break unless valid_beg_s2.nil?
|
114
118
|
valid_beg_s2 = beg_s2
|
115
119
|
next
|
116
120
|
end
|
117
121
|
|
118
|
-
right_window_s1, right_window_s2 =
|
122
|
+
right_window_s1, right_window_s2 = @method_get_right_windows.call(beg_s1, beg_s2, size_window)
|
119
123
|
if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
|
120
124
|
break unless valid_beg_s2.nil?
|
121
125
|
valid_beg_s2 = beg_s2
|
@@ -139,7 +143,7 @@ class TextAlignment::AnchorFinder
|
|
139
143
|
size_window ||= @size_window
|
140
144
|
|
141
145
|
# comment out below with the assumption that the beginning of a document gives a significant locational information
|
142
|
-
# return if
|
146
|
+
# return if beg_s1 < size_window || beg_s2 < size_window
|
143
147
|
|
144
148
|
window_s1 = ''
|
145
149
|
loc = beg_s1 - 1
|
@@ -170,7 +174,7 @@ class TextAlignment::AnchorFinder
|
|
170
174
|
size_window ||= @size_window
|
171
175
|
|
172
176
|
# commend below with the assumption that the end of a document gives a significant locational
|
173
|
-
# return if (
|
177
|
+
# return if (beg_s1 + @size_ngram > (@s1.length - size_window)) || (beg_s2 + @size_ngram > (@s2.length - size_window))
|
174
178
|
|
175
179
|
window_s1 = ''
|
176
180
|
loc = beg_s1 + @size_ngram
|
@@ -199,6 +203,44 @@ class TextAlignment::AnchorFinder
|
|
199
203
|
[window_s1, window_s2]
|
200
204
|
end
|
201
205
|
|
206
|
+
def get_left_windows_no_squeeze_ws(beg_s1, beg_s2, size_window = nil)
|
207
|
+
size_window ||= @size_window
|
208
|
+
|
209
|
+
# comment out below with the assumption that the beginning of a document gives a significant locational information
|
210
|
+
# return if beg_s1 < size_window || beg_s2 < size_window
|
211
|
+
|
212
|
+
wbeg = beg_s1 - size_window
|
213
|
+
wbeg = 0 if wbeg < 0
|
214
|
+
window_s1 = @s1[wbeg ... beg_s1]
|
215
|
+
|
216
|
+
wbeg = beg_s2 - size_window
|
217
|
+
wbeg = 0 if wbeg < 0
|
218
|
+
window_s2 = @s2[wbeg ... beg_s2]
|
219
|
+
|
220
|
+
[window_s1, window_s2]
|
221
|
+
end
|
222
|
+
|
223
|
+
def get_right_windows_no_squeeze_ws(beg_s1, beg_s2, size_window = nil)
|
224
|
+
size_window ||= @size_window
|
225
|
+
|
226
|
+
# commend below with the assumption that the end of a document gives a significant locational
|
227
|
+
# return if (@beg_s1 + @size_ngram > (@s1.length - size_window)) || (@beg_s2 + @size_ngram > (@s2.length - size_window))
|
228
|
+
|
229
|
+
slen = @s1.length
|
230
|
+
wbeg = beg_s1 + @size_ngram
|
231
|
+
wend = wbeg + size_window
|
232
|
+
wend = slen if wend > slen
|
233
|
+
window_s1 = @s1[wbeg ... wend]
|
234
|
+
|
235
|
+
slen = @s2.length
|
236
|
+
wbeg = beg_s2 + @size_ngram
|
237
|
+
wend = wbeg + size_window
|
238
|
+
wend = slen if wend > slen
|
239
|
+
window_s2 = @s2[wbeg ... wend]
|
240
|
+
|
241
|
+
[window_s1, window_s2]
|
242
|
+
end
|
243
|
+
|
202
244
|
def text_similarity(str1, str2, ngram_order = 2)
|
203
245
|
return 0 if str1.nil? || str2.nil?
|
204
246
|
String::Similarity.cosine(str1, str2, ngram:ngram_order)
|
@@ -1,3 +1,5 @@
|
|
1
|
+
require 'strscan'
|
2
|
+
|
1
3
|
module TextAlignment; end unless defined? TextAlignment
|
2
4
|
|
3
5
|
TextAlignment::CHAR_MAPPING = [
|
@@ -78,10 +80,18 @@ TextAlignment::CHAR_MAPPING = [
|
|
78
80
|
|
79
81
|
|
80
82
|
class TextAlignment::CharMapping
|
81
|
-
attr_reader :mapped_text
|
83
|
+
attr_reader :mapped_text, :index_enmap
|
84
|
+
|
85
|
+
def initialize(_text, char_mapping = nil, to_ignore_whitespaces = false)
|
86
|
+
if to_ignore_whitespaces
|
87
|
+
@method_get_positions_squeeze_ws = method(:get_positions_squeeze_ws_0)
|
88
|
+
@method_squeeze_ws = method(:squeeze_ws_0!)
|
89
|
+
else
|
90
|
+
@method_get_positions_squeeze_ws = method(:get_positions_squeeze_ws_1)
|
91
|
+
@method_squeeze_ws = method(:squeeze_ws_1!)
|
92
|
+
end
|
82
93
|
|
83
|
-
|
84
|
-
char_mapping ||= TextAlignment::CHAR_MAPPING
|
94
|
+
char_mapping ||= TextAlignment::CHAR_MAPPING.sort{|a, b| b[1].length <=> a[1].length}
|
85
95
|
@mapped_text, offset_mapping = enmap_text(_text, char_mapping)
|
86
96
|
@index_enmap = offset_mapping.to_h
|
87
97
|
@index_demap = offset_mapping.map{|m| m.reverse}.to_h
|
@@ -105,22 +115,22 @@ class TextAlignment::CharMapping
|
|
105
115
|
|
106
116
|
private
|
107
117
|
|
108
|
-
def enmap_text(_text, char_mapping)
|
118
|
+
def enmap_text(_text, char_mapping, no_ws = false)
|
109
119
|
text = _text.dup
|
110
120
|
|
111
|
-
# To execute the single letter mapping
|
121
|
+
# To execute the single letter mapping replacement
|
112
122
|
char_mapping.each do |one, long|
|
113
123
|
text.gsub!(one, long) if long.length == 1
|
114
124
|
end
|
115
125
|
|
116
|
-
# To get the (
|
117
|
-
|
126
|
+
# To get the replacement positions, (position, old_length, new_length), for char mappings
|
127
|
+
rpositions = []
|
118
128
|
char_mapping.each do |one, long|
|
119
129
|
next if long.length == 1
|
120
130
|
|
121
131
|
init_next = 0
|
122
132
|
while loc = text.index(long, init_next)
|
123
|
-
|
133
|
+
rpositions << [loc, long.length, 1]
|
124
134
|
init_next = loc + long.length
|
125
135
|
end
|
126
136
|
|
@@ -128,32 +138,27 @@ class TextAlignment::CharMapping
|
|
128
138
|
text.gsub!(long, one * long.length)
|
129
139
|
end
|
130
140
|
|
131
|
-
# To get the (
|
132
|
-
|
133
|
-
|
134
|
-
len = $~[0].length
|
135
|
-
loc_len << [loc, len]
|
136
|
-
init_next = loc + len
|
137
|
-
end
|
138
|
-
|
139
|
-
loc_len.sort!{|a, b| a[0] <=> b[0]}
|
141
|
+
# To get the replacement positions, (position, old_length, new_length), for consecutive whitespaces
|
142
|
+
rpositions += @method_get_positions_squeeze_ws.call(text)
|
143
|
+
rpositions.sort!{|a, b| a[0] <=> b[0]}
|
140
144
|
|
141
145
|
# To get the offset_mapping before and after replacement
|
142
|
-
offset_mapping =
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
146
|
+
offset_mapping = begin
|
147
|
+
i, j = 0, 0
|
148
|
+
|
149
|
+
offset_mappings = rpositions.map do |loc, old_len, new_len|
|
150
|
+
pre_len = loc - i
|
151
|
+
m = (0 .. pre_len).map{|c| [i + c, j + c]}
|
152
|
+
i = loc + old_len
|
153
|
+
j += pre_len + new_len
|
154
|
+
|
155
|
+
m
|
150
156
|
end
|
151
|
-
init_next = loc + len
|
152
|
-
end
|
153
157
|
|
154
|
-
|
155
|
-
j
|
156
|
-
|
158
|
+
pre_len = text.length - i
|
159
|
+
offset_mappings << (0 .. pre_len).map{|c| [i + c, j + c]}
|
160
|
+
|
161
|
+
offset_mappings.reduce(:+)
|
157
162
|
end
|
158
163
|
|
159
164
|
# To execute the long letter mapping
|
@@ -162,14 +167,40 @@ class TextAlignment::CharMapping
|
|
162
167
|
end
|
163
168
|
|
164
169
|
# To replace multi whitespace sequences to a space
|
165
|
-
|
170
|
+
@method_squeeze_ws.call(text)
|
166
171
|
|
167
172
|
[text, offset_mapping]
|
168
173
|
end
|
174
|
+
|
175
|
+
# To get squeeze positions of whitespaces to one
|
176
|
+
def get_positions_squeeze_ws_1(text)
|
177
|
+
rpositions = []
|
178
|
+
text.scan(/\s{2,}/) do |s|
|
179
|
+
loc = $~.begin(0)
|
180
|
+
len = $~.end(0) - loc
|
181
|
+
rpositions << [loc, len, 1]
|
182
|
+
end
|
183
|
+
rpositions
|
184
|
+
end
|
185
|
+
|
186
|
+
# To get squeeze positions of whitespaces to zero
|
187
|
+
def get_positions_squeeze_ws_0(text)
|
188
|
+
text.enum_for(:scan, /\s+/).map{[b = $~.begin(0), $~.end(0) - b, 0]}
|
189
|
+
end
|
190
|
+
|
191
|
+
def squeeze_ws_1!(text)
|
192
|
+
text.gsub!(/\s{2,}/, ' ')
|
193
|
+
end
|
194
|
+
|
195
|
+
def squeeze_ws_0!(text)
|
196
|
+
text.gsub!(/\s+/, '')
|
197
|
+
end
|
198
|
+
|
169
199
|
end
|
170
200
|
|
171
201
|
if __FILE__ == $0
|
172
202
|
require 'json'
|
203
|
+
# require 'profile'
|
173
204
|
|
174
205
|
unless ARGV.length == 1
|
175
206
|
warn "#{$0} an_annotation_json_file.json"
|
@@ -181,10 +212,11 @@ if __FILE__ == $0
|
|
181
212
|
denotations = annotations[:tracks].first[:denotations]
|
182
213
|
end
|
183
214
|
|
184
|
-
text_mapping = TextAlignment::CharMapping.new(annotations[:text])
|
215
|
+
text_mapping = TextAlignment::CharMapping.new(annotations[:text], nil, false)
|
216
|
+
# text_mapping = TextAlignment::CharMapping.new(annotations[:text], nil, true)
|
185
217
|
text_mapped = text_mapping.mapped_text
|
186
218
|
denotations_mapped = text_mapping.enmap_denotations(denotations)
|
187
219
|
new_annotations = {text:text_mapped, denotations:denotations_mapped}
|
188
220
|
|
189
|
-
puts new_annotations.to_json
|
221
|
+
# puts new_annotations.to_json
|
190
222
|
end
|
@@ -147,13 +147,24 @@ class TextAlignment::MixedAlignment
|
|
147
147
|
# recoverbility
|
148
148
|
count_nws = sdiff.count{|d| d.old_element =~ /\S/}
|
149
149
|
count_nws_match = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/}
|
150
|
-
|
151
150
|
coverage = count_nws_match.to_f / count_nws
|
152
151
|
|
153
152
|
# fragmentation rate
|
154
|
-
|
155
|
-
|
156
|
-
|
153
|
+
frag_str = sdiff.collect do |d|
|
154
|
+
case d.action
|
155
|
+
when '='
|
156
|
+
'='
|
157
|
+
when '-'
|
158
|
+
''
|
159
|
+
when '+'
|
160
|
+
(d.new_element =~ /\S/) ? '+' : ''
|
161
|
+
else
|
162
|
+
''
|
163
|
+
end
|
164
|
+
end.join.sub(/^[^=]++/, '').sub(/[^=]+$/, '')
|
165
|
+
|
166
|
+
count_frag = frag_str.scan(/=+/).count
|
167
|
+
rate_frag = 1.0 / count_frag
|
157
168
|
|
158
169
|
similarity = coverage * rate_frag
|
159
170
|
end
|
@@ -11,14 +11,18 @@ class TextAlignment::TextAlignment
|
|
11
11
|
attr_reader :similarity
|
12
12
|
attr_reader :lost_annotations
|
13
13
|
|
14
|
-
# Initialize with a reference text,
|
15
|
-
def initialize(reference_text,
|
14
|
+
# Initialize with a reference text, against which texts will be aligned
|
15
|
+
def initialize(reference_text, options = {})
|
16
16
|
raise ArgumentError, "nil text" if reference_text.nil?
|
17
17
|
|
18
|
+
options ||= {}
|
19
|
+
@duplicate_texts = options[:duplicate_texts] || false
|
20
|
+
@to_ignore_whitespaces = options[:to_ignore_whitespaces] || false
|
21
|
+
@to_ignore_text_order = options[:to_ignore_text_order] || false
|
22
|
+
|
18
23
|
@original_reference_text = reference_text
|
19
|
-
@rtext_mapping = TextAlignment::CharMapping.new(reference_text)
|
24
|
+
@rtext_mapping = TextAlignment::CharMapping.new(reference_text, nil, @to_ignore_whitespaces)
|
20
25
|
@mapped_reference_text = @rtext_mapping.mapped_text
|
21
|
-
@to_prevent_overlap = to_prevent_overlap
|
22
26
|
|
23
27
|
@original_text = nil
|
24
28
|
@blocks = nil
|
@@ -27,12 +31,12 @@ class TextAlignment::TextAlignment
|
|
27
31
|
|
28
32
|
def align(text, denotations = nil)
|
29
33
|
# To maintain the cultivation map
|
30
|
-
update_cultivation_map
|
34
|
+
update_cultivation_map unless @duplicate_texts
|
31
35
|
|
32
36
|
# In case the input text is the same as the previous one, reuse the previous text mapping
|
33
37
|
unless @original_text && @original_text == text
|
34
38
|
@original_text = text
|
35
|
-
@text_mapping = TextAlignment::CharMapping.new(text)
|
39
|
+
@text_mapping = TextAlignment::CharMapping.new(text, nil, @to_ignore_whitespaces)
|
36
40
|
end
|
37
41
|
|
38
42
|
@mapped_text = @text_mapping.mapped_text
|
@@ -202,7 +206,7 @@ class TextAlignment::TextAlignment
|
|
202
206
|
|
203
207
|
def find_block_alignment(str1, str2, denotations, cultivation_map)
|
204
208
|
## to find block alignments
|
205
|
-
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map)
|
209
|
+
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map, @to_ignore_whitespaces, @to_ignore_text_order)
|
206
210
|
|
207
211
|
blocks = []
|
208
212
|
while block = anchor_finder.get_next_anchor
|
@@ -238,68 +242,75 @@ class TextAlignment::TextAlignment
|
|
238
242
|
b1 = lblock.nil? ? 0 : lblock[:source][:end]
|
239
243
|
e1 = cblock.nil? ? str1.length : cblock[:source][:begin]
|
240
244
|
|
241
|
-
if b1
|
245
|
+
if b1 <= e1
|
246
|
+
_str1 = str1[b1 ... e1]
|
247
|
+
|
242
248
|
b2 = lblock.nil? ? 0 : lblock[:target][:end]
|
243
249
|
e2 = cblock.nil? ? str2.length : cblock[:target][:begin]
|
244
|
-
_str1 = str1[b1 ... e1]
|
245
|
-
_str2 = str2[b2 ... e2]
|
246
250
|
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
258
|
-
else
|
259
|
-
oe2 = state_region[1]
|
260
|
-
me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
|
261
|
-
local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
|
262
|
-
end
|
263
|
-
when :rear_open
|
264
|
-
if cblock.nil? # when there is no following matched block
|
265
|
-
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
266
|
-
else
|
267
|
-
ob2 = state_region[0]
|
268
|
-
mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
|
269
|
-
local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
|
270
|
-
end
|
271
|
-
when :middle_closed
|
272
|
-
attempt1 = if sum.empty?
|
251
|
+
if b2 < e2
|
252
|
+
_str2 = str2[b2 ... e2]
|
253
|
+
|
254
|
+
sum += if _str1.strip.empty? || _str2.strip.empty?
|
255
|
+
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
256
|
+
else
|
257
|
+
len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
258
|
+
region_state, state_region = cultivation_map.region_state([b2, e2])
|
259
|
+
case region_state
|
260
|
+
when :closed
|
273
261
|
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
262
|
+
when :front_open
|
263
|
+
if sum.empty? # when there is no preceding matched block
|
264
|
+
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
265
|
+
else
|
266
|
+
oe2 = state_region[1]
|
267
|
+
me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
|
268
|
+
local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
|
269
|
+
end
|
270
|
+
when :rear_open
|
271
|
+
if cblock.nil? # when there is no following matched block
|
272
|
+
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
273
|
+
else
|
274
|
+
ob2 = state_region[0]
|
275
|
+
mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
|
276
|
+
local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
|
277
|
+
end
|
278
|
+
when :middle_closed
|
288
279
|
attempt1 = if sum.empty?
|
289
280
|
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
290
281
|
else
|
291
|
-
|
282
|
+
oe2 = state_region[0]
|
283
|
+
me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
|
284
|
+
local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
|
292
285
|
end
|
293
286
|
if (attempt1.empty? || attempt1.first[:alignment] == :empty) && !cblock.nil?
|
294
|
-
|
287
|
+
ob2 = state_region[1]
|
288
|
+
mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
|
289
|
+
local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
|
295
290
|
else
|
296
291
|
attempt1
|
297
292
|
end
|
298
|
-
else
|
299
|
-
|
293
|
+
else # :open
|
294
|
+
if (e2 - b2) > len_buffer
|
295
|
+
attempt1 = if sum.empty?
|
296
|
+
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
297
|
+
else
|
298
|
+
local_alignment(str1, b1, e1, str2, b2, b2 + len_buffer, denotations, cultivation_map)
|
299
|
+
end
|
300
|
+
if (attempt1.empty? || attempt1.first[:alignment] == :empty) && !cblock.nil?
|
301
|
+
local_alignment(str1, b1, e1, str2, e2 - len_buffer, e2, denotations, cultivation_map)
|
302
|
+
else
|
303
|
+
attempt1
|
304
|
+
end
|
305
|
+
else
|
306
|
+
local_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
|
307
|
+
end
|
300
308
|
end
|
301
309
|
end
|
310
|
+
elsif b2 > e2 # when out of order
|
311
|
+
# ToDo
|
302
312
|
end
|
313
|
+
|
303
314
|
end
|
304
315
|
|
305
316
|
lblock = cblock
|
@@ -320,7 +331,7 @@ class TextAlignment::TextAlignment
|
|
320
331
|
|
321
332
|
def local_alignment(str1, b1, e1, str2, b2, e2, denotations = nil, cultivation_map)
|
322
333
|
tblocks = term_based_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
|
323
|
-
if tblocks.empty?
|
334
|
+
if tblocks.empty? || tblocks.first[:alignment] == :empty
|
324
335
|
lcs_alignment(str1, b1, e1, str2, b2, e2, cultivation_map)
|
325
336
|
else
|
326
337
|
tblocks
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.11.
|
4
|
+
version: 0.11.10
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-04-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|
@@ -111,7 +111,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
111
111
|
- !ruby/object:Gem::Version
|
112
112
|
version: '0'
|
113
113
|
requirements: []
|
114
|
-
rubygems_version: 3.0.
|
114
|
+
rubygems_version: 3.0.9
|
115
115
|
signing_key:
|
116
116
|
specification_version: 4
|
117
117
|
summary: Ruby class for aligning two character strings
|