text_alignment 0.11.3 → 0.11.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/align_annotations +39 -8
- data/lib/text_alignment/anchor_finder.rb +51 -9
- data/lib/text_alignment/char_mapping.rb +65 -33
- data/lib/text_alignment/mixed_alignment.rb +15 -4
- data/lib/text_alignment/text_alignment.rb +66 -55
- data/lib/text_alignment/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5f95c7fb8bfdeb768fbd1400f7e785a2ed18016322f2cdcba3ea7196aa4e86ac
|
4
|
+
data.tar.gz: cad0296a218108884703af07bcbf1b303a6e12c51ab14ad872586c50cfc8e82c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5d18a8d142974967fcfe358b561c83d76d1c5d8d2a277092ed4a8b42de14cd791b004830ff9f145735e6bc0a1265295f4fd6190c674ba62b57190d7e035a863b
|
7
|
+
data.tar.gz: c155ff9780d5f82893825787ee39ba319ddb1f16396d6a933334a83e485ff4d2eb2840d5fc1399f9873cd69a2458f029ced0d128d9099c4f08137d360e3e2007
|
data/bin/align_annotations
CHANGED
@@ -2,6 +2,7 @@
|
|
2
2
|
require 'text_alignment'
|
3
3
|
require 'json'
|
4
4
|
require 'pp'
|
5
|
+
require 'optparse'
|
5
6
|
|
6
7
|
def read_annotations(filename)
|
7
8
|
case File.extname(filename)
|
@@ -108,24 +109,54 @@ def align_mannotations(source_annotations, reference_text, alignment, debug = fa
|
|
108
109
|
end
|
109
110
|
|
110
111
|
|
112
|
+
## Options
|
113
|
+
options = {}
|
114
|
+
verbose = false
|
115
|
+
|
116
|
+
## command line option processing
|
117
|
+
require 'optparse'
|
118
|
+
optparse = OptionParser.new do |opts|
|
119
|
+
opts.banner = "Usage: align_annotations [options] target_annotations(.json|.txt) reference_text(.json|.txt)"
|
120
|
+
|
121
|
+
opts.on('-d', '--duplicate', 'tells it to assume there may be duplicate texts.') do
|
122
|
+
options[:duplicate_texts] = true
|
123
|
+
end
|
124
|
+
|
125
|
+
opts.on('-w', '--no-whitespaces', 'tells it to ignore whitespaces.') do
|
126
|
+
options[:to_ignore_whitespaces] = true
|
127
|
+
end
|
128
|
+
|
129
|
+
opts.on('-o', '--no-order', 'tells it to ignore the order of the texts.') do
|
130
|
+
options[:to_ignore_text_order] = true
|
131
|
+
end
|
132
|
+
|
133
|
+
opts.on('-v', '--verbose', 'tells it to show the state verbosely for debugging.') do
|
134
|
+
verbose = true
|
135
|
+
end
|
136
|
+
|
137
|
+
opts.on('-h', '--help', 'displays this screen.') do
|
138
|
+
puts opts
|
139
|
+
exit
|
140
|
+
end
|
141
|
+
end
|
142
|
+
|
143
|
+
optparse.parse!
|
144
|
+
|
111
145
|
unless ARGV.length == 2
|
112
|
-
|
113
|
-
exit
|
146
|
+
puts optparse.help
|
147
|
+
exit 1
|
114
148
|
end
|
115
149
|
|
116
150
|
source_annotations = read_annotations(ARGV[0])
|
117
151
|
reference_text = read_text(ARGV[1])
|
118
152
|
|
119
|
-
alignment = TextAlignment::TextAlignment.new(reference_text,
|
153
|
+
alignment = TextAlignment::TextAlignment.new(reference_text, options)
|
120
154
|
|
121
155
|
target_annotations = if source_annotations.class == Array
|
122
|
-
|
123
|
-
align_mannotations(source_annotations, reference_text, alignment, false)
|
156
|
+
align_mannotations(source_annotations, reference_text, alignment, verbose)
|
124
157
|
else
|
125
|
-
|
126
|
-
denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment, false)
|
158
|
+
denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment, verbose)
|
127
159
|
source_annotations.merge({text:reference_text, denotations:denotations})
|
128
160
|
end
|
129
161
|
|
130
|
-
# pp alignment.block_alignment
|
131
162
|
# puts target_annotations.to_json
|
@@ -6,11 +6,18 @@ module TextAlignment; end unless defined? TextAlignment
|
|
6
6
|
|
7
7
|
class TextAlignment::AnchorFinder
|
8
8
|
|
9
|
-
def initialize(source_str, target_str, cultivation_map)
|
9
|
+
def initialize(source_str, target_str, cultivation_map, to_ignore_whitespaces = false, to_ignore_text_order = false)
|
10
|
+
@method_get_left_windows, @method_get_right_windows = if to_ignore_whitespaces
|
11
|
+
[method(:get_left_windows_no_squeeze_ws), method(:get_right_windows_no_squeeze_ws)]
|
12
|
+
else
|
13
|
+
[method(:get_left_windows), method(:get_right_windows)]
|
14
|
+
end
|
15
|
+
|
10
16
|
@s1 = source_str.downcase
|
11
17
|
@s2 = target_str.downcase
|
12
18
|
|
13
19
|
@cultivation_map = cultivation_map
|
20
|
+
@to_ignore_text_order = to_ignore_text_order
|
14
21
|
|
15
22
|
@size_ngram = TextAlignment::SIZE_NGRAM
|
16
23
|
@size_window = TextAlignment::SIZE_WINDOW
|
@@ -65,10 +72,7 @@ class TextAlignment::AnchorFinder
|
|
65
72
|
# to get the anchor to search for in s2
|
66
73
|
anchor = @s1[beg_s1, @size_ngram]
|
67
74
|
|
68
|
-
|
69
|
-
# search_position = 0
|
70
|
-
search_position = @pos_s2_last_match
|
71
|
-
|
75
|
+
search_position = @to_ignore_text_order ? 0 : @pos_s2_last_match
|
72
76
|
beg_s2_candidates = find_beg_s2_candidates(anchor, search_position)
|
73
77
|
return nil if beg_s2_candidates.empty?
|
74
78
|
|
@@ -108,14 +112,14 @@ class TextAlignment::AnchorFinder
|
|
108
112
|
next
|
109
113
|
end
|
110
114
|
|
111
|
-
left_window_s1, left_window_s2 =
|
115
|
+
left_window_s1, left_window_s2 = @method_get_left_windows.call(beg_s1, beg_s2, size_window)
|
112
116
|
if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
|
113
117
|
break unless valid_beg_s2.nil?
|
114
118
|
valid_beg_s2 = beg_s2
|
115
119
|
next
|
116
120
|
end
|
117
121
|
|
118
|
-
right_window_s1, right_window_s2 =
|
122
|
+
right_window_s1, right_window_s2 = @method_get_right_windows.call(beg_s1, beg_s2, size_window)
|
119
123
|
if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
|
120
124
|
break unless valid_beg_s2.nil?
|
121
125
|
valid_beg_s2 = beg_s2
|
@@ -139,7 +143,7 @@ class TextAlignment::AnchorFinder
|
|
139
143
|
size_window ||= @size_window
|
140
144
|
|
141
145
|
# comment out below with the assumption that the beginning of a document gives a significant locational information
|
142
|
-
# return if
|
146
|
+
# return if beg_s1 < size_window || beg_s2 < size_window
|
143
147
|
|
144
148
|
window_s1 = ''
|
145
149
|
loc = beg_s1 - 1
|
@@ -170,7 +174,7 @@ class TextAlignment::AnchorFinder
|
|
170
174
|
size_window ||= @size_window
|
171
175
|
|
172
176
|
# commend below with the assumption that the end of a document gives a significant locational
|
173
|
-
# return if (
|
177
|
+
# return if (beg_s1 + @size_ngram > (@s1.length - size_window)) || (beg_s2 + @size_ngram > (@s2.length - size_window))
|
174
178
|
|
175
179
|
window_s1 = ''
|
176
180
|
loc = beg_s1 + @size_ngram
|
@@ -199,6 +203,44 @@ class TextAlignment::AnchorFinder
|
|
199
203
|
[window_s1, window_s2]
|
200
204
|
end
|
201
205
|
|
206
|
+
def get_left_windows_no_squeeze_ws(beg_s1, beg_s2, size_window = nil)
|
207
|
+
size_window ||= @size_window
|
208
|
+
|
209
|
+
# comment out below with the assumption that the beginning of a document gives a significant locational information
|
210
|
+
# return if beg_s1 < size_window || beg_s2 < size_window
|
211
|
+
|
212
|
+
wbeg = beg_s1 - size_window
|
213
|
+
wbeg = 0 if wbeg < 0
|
214
|
+
window_s1 = @s1[wbeg ... beg_s1]
|
215
|
+
|
216
|
+
wbeg = beg_s2 - size_window
|
217
|
+
wbeg = 0 if wbeg < 0
|
218
|
+
window_s2 = @s2[wbeg ... beg_s2]
|
219
|
+
|
220
|
+
[window_s1, window_s2]
|
221
|
+
end
|
222
|
+
|
223
|
+
def get_right_windows_no_squeeze_ws(beg_s1, beg_s2, size_window = nil)
|
224
|
+
size_window ||= @size_window
|
225
|
+
|
226
|
+
# commend below with the assumption that the end of a document gives a significant locational
|
227
|
+
# return if (@beg_s1 + @size_ngram > (@s1.length - size_window)) || (@beg_s2 + @size_ngram > (@s2.length - size_window))
|
228
|
+
|
229
|
+
slen = @s1.length
|
230
|
+
wbeg = beg_s1 + @size_ngram
|
231
|
+
wend = wbeg + size_window
|
232
|
+
wend = slen if wend > slen
|
233
|
+
window_s1 = @s1[wbeg ... wend]
|
234
|
+
|
235
|
+
slen = @s2.length
|
236
|
+
wbeg = beg_s2 + @size_ngram
|
237
|
+
wend = wbeg + size_window
|
238
|
+
wend = slen if wend > slen
|
239
|
+
window_s2 = @s2[wbeg ... wend]
|
240
|
+
|
241
|
+
[window_s1, window_s2]
|
242
|
+
end
|
243
|
+
|
202
244
|
def text_similarity(str1, str2, ngram_order = 2)
|
203
245
|
return 0 if str1.nil? || str2.nil?
|
204
246
|
String::Similarity.cosine(str1, str2, ngram:ngram_order)
|
@@ -1,3 +1,5 @@
|
|
1
|
+
require 'strscan'
|
2
|
+
|
1
3
|
module TextAlignment; end unless defined? TextAlignment
|
2
4
|
|
3
5
|
TextAlignment::CHAR_MAPPING = [
|
@@ -78,10 +80,18 @@ TextAlignment::CHAR_MAPPING = [
|
|
78
80
|
|
79
81
|
|
80
82
|
class TextAlignment::CharMapping
|
81
|
-
attr_reader :mapped_text
|
83
|
+
attr_reader :mapped_text, :index_enmap
|
84
|
+
|
85
|
+
def initialize(_text, char_mapping = nil, to_ignore_whitespaces = false)
|
86
|
+
if to_ignore_whitespaces
|
87
|
+
@method_get_positions_squeeze_ws = method(:get_positions_squeeze_ws_0)
|
88
|
+
@method_squeeze_ws = method(:squeeze_ws_0!)
|
89
|
+
else
|
90
|
+
@method_get_positions_squeeze_ws = method(:get_positions_squeeze_ws_1)
|
91
|
+
@method_squeeze_ws = method(:squeeze_ws_1!)
|
92
|
+
end
|
82
93
|
|
83
|
-
|
84
|
-
char_mapping ||= TextAlignment::CHAR_MAPPING
|
94
|
+
char_mapping ||= TextAlignment::CHAR_MAPPING.sort{|a, b| b[1].length <=> a[1].length}
|
85
95
|
@mapped_text, offset_mapping = enmap_text(_text, char_mapping)
|
86
96
|
@index_enmap = offset_mapping.to_h
|
87
97
|
@index_demap = offset_mapping.map{|m| m.reverse}.to_h
|
@@ -105,22 +115,22 @@ class TextAlignment::CharMapping
|
|
105
115
|
|
106
116
|
private
|
107
117
|
|
108
|
-
def enmap_text(_text, char_mapping)
|
118
|
+
def enmap_text(_text, char_mapping, no_ws = false)
|
109
119
|
text = _text.dup
|
110
120
|
|
111
|
-
# To execute the single letter mapping
|
121
|
+
# To execute the single letter mapping replacement
|
112
122
|
char_mapping.each do |one, long|
|
113
123
|
text.gsub!(one, long) if long.length == 1
|
114
124
|
end
|
115
125
|
|
116
|
-
# To get the (
|
117
|
-
|
126
|
+
# To get the replacement positions, (position, old_length, new_length), for char mappings
|
127
|
+
rpositions = []
|
118
128
|
char_mapping.each do |one, long|
|
119
129
|
next if long.length == 1
|
120
130
|
|
121
131
|
init_next = 0
|
122
132
|
while loc = text.index(long, init_next)
|
123
|
-
|
133
|
+
rpositions << [loc, long.length, 1]
|
124
134
|
init_next = loc + long.length
|
125
135
|
end
|
126
136
|
|
@@ -128,32 +138,27 @@ class TextAlignment::CharMapping
|
|
128
138
|
text.gsub!(long, one * long.length)
|
129
139
|
end
|
130
140
|
|
131
|
-
# To get the (
|
132
|
-
|
133
|
-
|
134
|
-
len = $~[0].length
|
135
|
-
loc_len << [loc, len]
|
136
|
-
init_next = loc + len
|
137
|
-
end
|
138
|
-
|
139
|
-
loc_len.sort!{|a, b| a[0] <=> b[0]}
|
141
|
+
# To get the replacement positions, (position, old_length, new_length), for consecutive whitespaces
|
142
|
+
rpositions += @method_get_positions_squeeze_ws.call(text)
|
143
|
+
rpositions.sort!{|a, b| a[0] <=> b[0]}
|
140
144
|
|
141
145
|
# To get the offset_mapping before and after replacement
|
142
|
-
offset_mapping =
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
146
|
+
offset_mapping = begin
|
147
|
+
i, j = 0, 0
|
148
|
+
|
149
|
+
offset_mappings = rpositions.map do |loc, old_len, new_len|
|
150
|
+
pre_len = loc - i
|
151
|
+
m = (0 .. pre_len).map{|c| [i + c, j + c]}
|
152
|
+
i = loc + old_len
|
153
|
+
j += pre_len + new_len
|
154
|
+
|
155
|
+
m
|
150
156
|
end
|
151
|
-
init_next = loc + len
|
152
|
-
end
|
153
157
|
|
154
|
-
|
155
|
-
j
|
156
|
-
|
158
|
+
pre_len = text.length - i
|
159
|
+
offset_mappings << (0 .. pre_len).map{|c| [i + c, j + c]}
|
160
|
+
|
161
|
+
offset_mappings.reduce(:+)
|
157
162
|
end
|
158
163
|
|
159
164
|
# To execute the long letter mapping
|
@@ -162,14 +167,40 @@ class TextAlignment::CharMapping
|
|
162
167
|
end
|
163
168
|
|
164
169
|
# To replace multi whitespace sequences to a space
|
165
|
-
|
170
|
+
@method_squeeze_ws.call(text)
|
166
171
|
|
167
172
|
[text, offset_mapping]
|
168
173
|
end
|
174
|
+
|
175
|
+
# To get squeeze positions of whitespaces to one
|
176
|
+
def get_positions_squeeze_ws_1(text)
|
177
|
+
rpositions = []
|
178
|
+
text.scan(/\s{2,}/) do |s|
|
179
|
+
loc = $~.begin(0)
|
180
|
+
len = $~.end(0) - loc
|
181
|
+
rpositions << [loc, len, 1]
|
182
|
+
end
|
183
|
+
rpositions
|
184
|
+
end
|
185
|
+
|
186
|
+
# To get squeeze positions of whitespaces to zero
|
187
|
+
def get_positions_squeeze_ws_0(text)
|
188
|
+
text.enum_for(:scan, /\s+/).map{[b = $~.begin(0), $~.end(0) - b, 0]}
|
189
|
+
end
|
190
|
+
|
191
|
+
def squeeze_ws_1!(text)
|
192
|
+
text.gsub!(/\s{2,}/, ' ')
|
193
|
+
end
|
194
|
+
|
195
|
+
def squeeze_ws_0!(text)
|
196
|
+
text.gsub!(/\s+/, '')
|
197
|
+
end
|
198
|
+
|
169
199
|
end
|
170
200
|
|
171
201
|
if __FILE__ == $0
|
172
202
|
require 'json'
|
203
|
+
# require 'profile'
|
173
204
|
|
174
205
|
unless ARGV.length == 1
|
175
206
|
warn "#{$0} an_annotation_json_file.json"
|
@@ -181,10 +212,11 @@ if __FILE__ == $0
|
|
181
212
|
denotations = annotations[:tracks].first[:denotations]
|
182
213
|
end
|
183
214
|
|
184
|
-
text_mapping = TextAlignment::CharMapping.new(annotations[:text])
|
215
|
+
text_mapping = TextAlignment::CharMapping.new(annotations[:text], nil, false)
|
216
|
+
# text_mapping = TextAlignment::CharMapping.new(annotations[:text], nil, true)
|
185
217
|
text_mapped = text_mapping.mapped_text
|
186
218
|
denotations_mapped = text_mapping.enmap_denotations(denotations)
|
187
219
|
new_annotations = {text:text_mapped, denotations:denotations_mapped}
|
188
220
|
|
189
|
-
puts new_annotations.to_json
|
221
|
+
# puts new_annotations.to_json
|
190
222
|
end
|
@@ -147,13 +147,24 @@ class TextAlignment::MixedAlignment
|
|
147
147
|
# recoverbility
|
148
148
|
count_nws = sdiff.count{|d| d.old_element =~ /\S/}
|
149
149
|
count_nws_match = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/}
|
150
|
-
|
151
150
|
coverage = count_nws_match.to_f / count_nws
|
152
151
|
|
153
152
|
# fragmentation rate
|
154
|
-
|
155
|
-
|
156
|
-
|
153
|
+
frag_str = sdiff.collect do |d|
|
154
|
+
case d.action
|
155
|
+
when '='
|
156
|
+
'='
|
157
|
+
when '-'
|
158
|
+
''
|
159
|
+
when '+'
|
160
|
+
(d.new_element =~ /\S/) ? '+' : ''
|
161
|
+
else
|
162
|
+
''
|
163
|
+
end
|
164
|
+
end.join.sub(/^[^=]++/, '').sub(/[^=]+$/, '')
|
165
|
+
|
166
|
+
count_frag = frag_str.scan(/=+/).count
|
167
|
+
rate_frag = 1.0 / count_frag
|
157
168
|
|
158
169
|
similarity = coverage * rate_frag
|
159
170
|
end
|
@@ -11,14 +11,18 @@ class TextAlignment::TextAlignment
|
|
11
11
|
attr_reader :similarity
|
12
12
|
attr_reader :lost_annotations
|
13
13
|
|
14
|
-
# Initialize with a reference text,
|
15
|
-
def initialize(reference_text,
|
14
|
+
# Initialize with a reference text, against which texts will be aligned
|
15
|
+
def initialize(reference_text, options = {})
|
16
16
|
raise ArgumentError, "nil text" if reference_text.nil?
|
17
17
|
|
18
|
+
options ||= {}
|
19
|
+
@duplicate_texts = options[:duplicate_texts] || false
|
20
|
+
@to_ignore_whitespaces = options[:to_ignore_whitespaces] || false
|
21
|
+
@to_ignore_text_order = options[:to_ignore_text_order] || false
|
22
|
+
|
18
23
|
@original_reference_text = reference_text
|
19
|
-
@rtext_mapping = TextAlignment::CharMapping.new(reference_text)
|
24
|
+
@rtext_mapping = TextAlignment::CharMapping.new(reference_text, nil, @to_ignore_whitespaces)
|
20
25
|
@mapped_reference_text = @rtext_mapping.mapped_text
|
21
|
-
@to_prevent_overlap = to_prevent_overlap
|
22
26
|
|
23
27
|
@original_text = nil
|
24
28
|
@blocks = nil
|
@@ -27,12 +31,12 @@ class TextAlignment::TextAlignment
|
|
27
31
|
|
28
32
|
def align(text, denotations = nil)
|
29
33
|
# To maintain the cultivation map
|
30
|
-
update_cultivation_map
|
34
|
+
update_cultivation_map unless @duplicate_texts
|
31
35
|
|
32
36
|
# In case the input text is the same as the previous one, reuse the previous text mapping
|
33
37
|
unless @original_text && @original_text == text
|
34
38
|
@original_text = text
|
35
|
-
@text_mapping = TextAlignment::CharMapping.new(text)
|
39
|
+
@text_mapping = TextAlignment::CharMapping.new(text, nil, @to_ignore_whitespaces)
|
36
40
|
end
|
37
41
|
|
38
42
|
@mapped_text = @text_mapping.mapped_text
|
@@ -202,7 +206,7 @@ class TextAlignment::TextAlignment
|
|
202
206
|
|
203
207
|
def find_block_alignment(str1, str2, denotations, cultivation_map)
|
204
208
|
## to find block alignments
|
205
|
-
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map)
|
209
|
+
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map, @to_ignore_whitespaces, @to_ignore_text_order)
|
206
210
|
|
207
211
|
blocks = []
|
208
212
|
while block = anchor_finder.get_next_anchor
|
@@ -238,68 +242,75 @@ class TextAlignment::TextAlignment
|
|
238
242
|
b1 = lblock.nil? ? 0 : lblock[:source][:end]
|
239
243
|
e1 = cblock.nil? ? str1.length : cblock[:source][:begin]
|
240
244
|
|
241
|
-
if b1
|
245
|
+
if b1 <= e1
|
246
|
+
_str1 = str1[b1 ... e1]
|
247
|
+
|
242
248
|
b2 = lblock.nil? ? 0 : lblock[:target][:end]
|
243
249
|
e2 = cblock.nil? ? str2.length : cblock[:target][:begin]
|
244
|
-
_str1 = str1[b1 ... e1]
|
245
|
-
_str2 = str2[b2 ... e2]
|
246
250
|
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
258
|
-
else
|
259
|
-
oe2 = state_region[1]
|
260
|
-
me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
|
261
|
-
local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
|
262
|
-
end
|
263
|
-
when :rear_open
|
264
|
-
if cblock.nil? # when there is no following matched block
|
265
|
-
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
266
|
-
else
|
267
|
-
ob2 = state_region[0]
|
268
|
-
mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
|
269
|
-
local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
|
270
|
-
end
|
271
|
-
when :middle_closed
|
272
|
-
attempt1 = if sum.empty?
|
251
|
+
if b2 < e2
|
252
|
+
_str2 = str2[b2 ... e2]
|
253
|
+
|
254
|
+
sum += if _str1.strip.empty? || _str2.strip.empty?
|
255
|
+
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
256
|
+
else
|
257
|
+
len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
258
|
+
region_state, state_region = cultivation_map.region_state([b2, e2])
|
259
|
+
case region_state
|
260
|
+
when :closed
|
273
261
|
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
262
|
+
when :front_open
|
263
|
+
if sum.empty? # when there is no preceding matched block
|
264
|
+
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
265
|
+
else
|
266
|
+
oe2 = state_region[1]
|
267
|
+
me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
|
268
|
+
local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
|
269
|
+
end
|
270
|
+
when :rear_open
|
271
|
+
if cblock.nil? # when there is no following matched block
|
272
|
+
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
273
|
+
else
|
274
|
+
ob2 = state_region[0]
|
275
|
+
mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
|
276
|
+
local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
|
277
|
+
end
|
278
|
+
when :middle_closed
|
288
279
|
attempt1 = if sum.empty?
|
289
280
|
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
290
281
|
else
|
291
|
-
|
282
|
+
oe2 = state_region[0]
|
283
|
+
me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
|
284
|
+
local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
|
292
285
|
end
|
293
286
|
if (attempt1.empty? || attempt1.first[:alignment] == :empty) && !cblock.nil?
|
294
|
-
|
287
|
+
ob2 = state_region[1]
|
288
|
+
mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
|
289
|
+
local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
|
295
290
|
else
|
296
291
|
attempt1
|
297
292
|
end
|
298
|
-
else
|
299
|
-
|
293
|
+
else # :open
|
294
|
+
if (e2 - b2) > len_buffer
|
295
|
+
attempt1 = if sum.empty?
|
296
|
+
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
297
|
+
else
|
298
|
+
local_alignment(str1, b1, e1, str2, b2, b2 + len_buffer, denotations, cultivation_map)
|
299
|
+
end
|
300
|
+
if (attempt1.empty? || attempt1.first[:alignment] == :empty) && !cblock.nil?
|
301
|
+
local_alignment(str1, b1, e1, str2, e2 - len_buffer, e2, denotations, cultivation_map)
|
302
|
+
else
|
303
|
+
attempt1
|
304
|
+
end
|
305
|
+
else
|
306
|
+
local_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
|
307
|
+
end
|
300
308
|
end
|
301
309
|
end
|
310
|
+
elsif b2 > e2 # when out of order
|
311
|
+
# ToDo
|
302
312
|
end
|
313
|
+
|
303
314
|
end
|
304
315
|
|
305
316
|
lblock = cblock
|
@@ -320,7 +331,7 @@ class TextAlignment::TextAlignment
|
|
320
331
|
|
321
332
|
def local_alignment(str1, b1, e1, str2, b2, e2, denotations = nil, cultivation_map)
|
322
333
|
tblocks = term_based_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
|
323
|
-
if tblocks.empty?
|
334
|
+
if tblocks.empty? || tblocks.first[:alignment] == :empty
|
324
335
|
lcs_alignment(str1, b1, e1, str2, b2, e2, cultivation_map)
|
325
336
|
else
|
326
337
|
tblocks
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.11.3
|
4
|
+
version: 0.11.10
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-04-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|
@@ -111,7 +111,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
111
111
|
- !ruby/object:Gem::Version
|
112
112
|
version: '0'
|
113
113
|
requirements: []
|
114
|
-
rubygems_version: 3.0.
|
114
|
+
rubygems_version: 3.0.9
|
115
115
|
signing_key:
|
116
116
|
specification_version: 4
|
117
117
|
summary: Ruby class for aligning two character strings
|