text_alignment 0.11.2 → 0.11.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/align_annotations +31 -8
- data/lib/text_alignment/anchor_finder.rb +54 -6
- data/lib/text_alignment/char_mapping.rb +61 -21
- data/lib/text_alignment/cultivation_map.rb +1 -1
- data/lib/text_alignment/mixed_alignment.rb +15 -4
- data/lib/text_alignment/text_alignment.rb +10 -7
- data/lib/text_alignment/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d07269e998620f3e8f1564a4b81d3710e1898058248377224be0c5398690872f
|
4
|
+
data.tar.gz: 7f457820b2d5a9a9dcbf00ceb89342ca7264e2ef85afd02bd2463256e71680dd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 299320641fb58973b89c64f1db0c79007f43103cb063bd171f386d9118cb49257765d0652024bbd7869daa88c4f18da7338b8d60285f3e0a95258049b9e62562
|
7
|
+
data.tar.gz: 3bdd474509910999521c52808a535227dbf5a6aa74345e39dc0e37eae7a000b51b1df3dbb50598abc865a036a4990cee0f3ccce35471d69cb2ef4bb9633fc83a
|
data/bin/align_annotations
CHANGED
@@ -2,6 +2,7 @@
|
|
2
2
|
require 'text_alignment'
|
3
3
|
require 'json'
|
4
4
|
require 'pp'
|
5
|
+
require 'optparse'
|
5
6
|
|
6
7
|
def read_annotations(filename)
|
7
8
|
case File.extname(filename)
|
@@ -108,24 +109,46 @@ def align_mannotations(source_annotations, reference_text, alignment, debug = fa
|
|
108
109
|
end
|
109
110
|
|
110
111
|
|
112
|
+
## Options
|
113
|
+
overlap_p = false
|
114
|
+
debug_p = false
|
115
|
+
|
116
|
+
## command line option processing
|
117
|
+
require 'optparse'
|
118
|
+
optparse = OptionParser.new do |opts|
|
119
|
+
opts.banner = "Usage: align_annotations [options] target_annotations(.json|.txt) reference_text(.json|.txt)"
|
120
|
+
|
121
|
+
opts.on('-o', '--overlap', 'tells it to assume there may be overlapping texts.') do
|
122
|
+
overlap_p = true
|
123
|
+
end
|
124
|
+
|
125
|
+
opts.on('-d', '--debug', 'tells it to show debugging information.') do
|
126
|
+
debug_p = true
|
127
|
+
end
|
128
|
+
|
129
|
+
opts.on('-h', '--help', 'displays this screen.') do
|
130
|
+
puts opts
|
131
|
+
exit
|
132
|
+
end
|
133
|
+
end
|
134
|
+
|
135
|
+
optparse.parse!
|
136
|
+
|
111
137
|
unless ARGV.length == 2
|
112
|
-
|
113
|
-
exit
|
138
|
+
puts optparse.help
|
139
|
+
exit 1
|
114
140
|
end
|
115
141
|
|
116
142
|
source_annotations = read_annotations(ARGV[0])
|
117
143
|
reference_text = read_text(ARGV[1])
|
118
144
|
|
119
|
-
alignment = TextAlignment::TextAlignment.new(reference_text,
|
145
|
+
alignment = TextAlignment::TextAlignment.new(reference_text, !overlap_p)
|
120
146
|
|
121
147
|
target_annotations = if source_annotations.class == Array
|
122
|
-
|
123
|
-
align_mannotations(source_annotations, reference_text, alignment, false)
|
148
|
+
align_mannotations(source_annotations, reference_text, alignment, debug_p)
|
124
149
|
else
|
125
|
-
|
126
|
-
denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment, false)
|
150
|
+
denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment, debug_p)
|
127
151
|
source_annotations.merge({text:reference_text, denotations:denotations})
|
128
152
|
end
|
129
153
|
|
130
|
-
# pp alignment.block_alignment
|
131
154
|
# puts target_annotations.to_json
|
data/lib/text_alignment/anchor_finder.rb
CHANGED
@@ -6,7 +6,13 @@ module TextAlignment; end unless defined? TextAlignment
|
|
6
6
|
|
7
7
|
class TextAlignment::AnchorFinder
|
8
8
|
|
9
|
-
def initialize(source_str, target_str, cultivation_map)
|
9
|
+
def initialize(source_str, target_str, cultivation_map, squeeze_ws = true)
|
10
|
+
@method_get_left_windows, @method_get_right_windows = if squeeze_ws
|
11
|
+
[method(:get_left_windows), method(:get_right_windows)]
|
12
|
+
else
|
13
|
+
[method(:get_left_windows_no_squeeze_ws), method(:get_right_windows_no_squeeze_ws)]
|
14
|
+
end
|
15
|
+
|
10
16
|
@s1 = source_str.downcase
|
11
17
|
@s2 = target_str.downcase
|
12
18
|
|
@@ -108,14 +114,14 @@ class TextAlignment::AnchorFinder
|
|
108
114
|
next
|
109
115
|
end
|
110
116
|
|
111
|
-
left_window_s1, left_window_s2 =
|
117
|
+
left_window_s1, left_window_s2 = @method_get_left_windows.call(beg_s1, beg_s2, size_window)
|
112
118
|
if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
|
113
119
|
break unless valid_beg_s2.nil?
|
114
120
|
valid_beg_s2 = beg_s2
|
115
121
|
next
|
116
122
|
end
|
117
123
|
|
118
|
-
right_window_s1, right_window_s2 =
|
124
|
+
right_window_s1, right_window_s2 = @method_get_right_windows.call(beg_s1, beg_s2, size_window)
|
119
125
|
if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
|
120
126
|
break unless valid_beg_s2.nil?
|
121
127
|
valid_beg_s2 = beg_s2
|
@@ -125,7 +131,11 @@ class TextAlignment::AnchorFinder
|
|
125
131
|
|
126
132
|
# r == nil means that the inner loop was broken (multiple candidates had passed the tests)
|
127
133
|
# r != nil means that the inner loop was completed (with or w/o a valid beg_s2 found)
|
128
|
-
|
134
|
+
if r.nil?
|
135
|
+
valid_beg_s2 = nil
|
136
|
+
else
|
137
|
+
break
|
138
|
+
end
|
129
139
|
end
|
130
140
|
|
131
141
|
valid_beg_s2
|
@@ -135,7 +145,7 @@ class TextAlignment::AnchorFinder
|
|
135
145
|
size_window ||= @size_window
|
136
146
|
|
137
147
|
# comment out below with the assumption that the beginning of a document gives a significant locational information
|
138
|
-
# return if
|
148
|
+
# return if beg_s1 < size_window || beg_s2 < size_window
|
139
149
|
|
140
150
|
window_s1 = ''
|
141
151
|
loc = beg_s1 - 1
|
@@ -166,7 +176,7 @@ class TextAlignment::AnchorFinder
|
|
166
176
|
size_window ||= @size_window
|
167
177
|
|
168
178
|
# commend below with the assumption that the end of a document gives a significant locational
|
169
|
-
# return if (
|
179
|
+
# return if (beg_s1 + @size_ngram > (@s1.length - size_window)) || (beg_s2 + @size_ngram > (@s2.length - size_window))
|
170
180
|
|
171
181
|
window_s1 = ''
|
172
182
|
loc = beg_s1 + @size_ngram
|
@@ -195,6 +205,44 @@ class TextAlignment::AnchorFinder
|
|
195
205
|
[window_s1, window_s2]
|
196
206
|
end
|
197
207
|
|
208
|
+
def get_left_windows_no_squeeze_ws(beg_s1, beg_s2, size_window = nil)
|
209
|
+
size_window ||= @size_window
|
210
|
+
|
211
|
+
# comment out below with the assumption that the beginning of a document gives a significant locational information
|
212
|
+
# return if beg_s1 < size_window || beg_s2 < size_window
|
213
|
+
|
214
|
+
wbeg = beg_s1 - size_window
|
215
|
+
wbeg = 0 if wbeg < 0
|
216
|
+
window_s1 = @s1[wbeg ... beg_s1]
|
217
|
+
|
218
|
+
wbeg = beg_s2 - size_window
|
219
|
+
wbeg = 0 if wbeg < 0
|
220
|
+
window_s2 = @s2[wbeg ... beg_s2]
|
221
|
+
|
222
|
+
[window_s1, window_s2]
|
223
|
+
end
|
224
|
+
|
225
|
+
def get_right_windows_no_squeeze_ws(beg_s1, beg_s2, size_window = nil)
|
226
|
+
size_window ||= @size_window
|
227
|
+
|
228
|
+
# commend below with the assumption that the end of a document gives a significant locational
|
229
|
+
# return if (@beg_s1 + @size_ngram > (@s1.length - size_window)) || (@beg_s2 + @size_ngram > (@s2.length - size_window))
|
230
|
+
|
231
|
+
slen = @s1.length
|
232
|
+
wbeg = beg_s1 + @size_ngram
|
233
|
+
wend = wbeg + size_window
|
234
|
+
wend = slen if wend > slen
|
235
|
+
window_s1 = @s1[wbeg ... wend]
|
236
|
+
|
237
|
+
slen = @s2.length
|
238
|
+
wbeg = beg_s2 + @size_ngram
|
239
|
+
wend = wbeg + size_window
|
240
|
+
wend = slen if wend > slen
|
241
|
+
window_s2 = @s2[wbeg ... wend]
|
242
|
+
|
243
|
+
[window_s1, window_s2]
|
244
|
+
end
|
245
|
+
|
198
246
|
def text_similarity(str1, str2, ngram_order = 2)
|
199
247
|
return 0 if str1.nil? || str2.nil?
|
200
248
|
String::Similarity.cosine(str1, str2, ngram:ngram_order)
|
data/lib/text_alignment/char_mapping.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
require 'strscan'
|
2
|
+
|
1
3
|
module TextAlignment; end unless defined? TextAlignment
|
2
4
|
|
3
5
|
TextAlignment::CHAR_MAPPING = [
|
@@ -80,8 +82,16 @@ TextAlignment::CHAR_MAPPING = [
|
|
80
82
|
class TextAlignment::CharMapping
|
81
83
|
attr_reader :mapped_text
|
82
84
|
|
83
|
-
def initialize(_text, char_mapping = nil)
|
84
|
-
|
85
|
+
def initialize(_text, char_mapping = nil, squeeze_ws_to = 1)
|
86
|
+
if squeeze_ws_to == 0
|
87
|
+
@method_get_positions_squeeze_ws = method(:get_positions_squeeze_ws_0)
|
88
|
+
@method_squeeze_ws = method(:squeeze_ws_0!)
|
89
|
+
else
|
90
|
+
@method_get_positions_squeeze_ws = method(:get_positions_squeeze_ws_1)
|
91
|
+
@method_squeeze_ws = method(:squeeze_ws_1!)
|
92
|
+
end
|
93
|
+
|
94
|
+
char_mapping ||= TextAlignment::CHAR_MAPPING.sort{|a, b| b[1].length <=> a[1].length}
|
85
95
|
@mapped_text, offset_mapping = enmap_text(_text, char_mapping)
|
86
96
|
@index_enmap = offset_mapping.to_h
|
87
97
|
@index_demap = offset_mapping.map{|m| m.reverse}.to_h
|
@@ -105,22 +115,22 @@ class TextAlignment::CharMapping
|
|
105
115
|
|
106
116
|
private
|
107
117
|
|
108
|
-
def enmap_text(_text, char_mapping)
|
118
|
+
def enmap_text(_text, char_mapping, no_ws = false)
|
109
119
|
text = _text.dup
|
110
120
|
|
111
|
-
# To execute the single letter mapping
|
121
|
+
# To execute the single letter mapping replacement
|
112
122
|
char_mapping.each do |one, long|
|
113
123
|
text.gsub!(one, long) if long.length == 1
|
114
124
|
end
|
115
125
|
|
116
|
-
# To get the (
|
117
|
-
|
126
|
+
# To get the replacement positions, (position, old_length, new_length), for char mappings
|
127
|
+
rpositions = []
|
118
128
|
char_mapping.each do |one, long|
|
119
129
|
next if long.length == 1
|
120
130
|
|
121
131
|
init_next = 0
|
122
132
|
while loc = text.index(long, init_next)
|
123
|
-
|
133
|
+
rpositions << [loc, long.length, 1]
|
124
134
|
init_next = loc + long.length
|
125
135
|
end
|
126
136
|
|
@@ -128,32 +138,31 @@ class TextAlignment::CharMapping
|
|
128
138
|
text.gsub!(long, one * long.length)
|
129
139
|
end
|
130
140
|
|
131
|
-
# To get the (
|
132
|
-
|
133
|
-
while loc = text.index(/\s{2,}/, init_next)
|
134
|
-
len = $~[0].length
|
135
|
-
loc_len << [loc, len]
|
136
|
-
init_next = loc + len
|
137
|
-
end
|
141
|
+
# To get the replacement positions, (position, old_length, new_length), for consecutive whitespaces
|
142
|
+
rpositions += @method_get_positions_squeeze_ws.call(text)
|
138
143
|
|
139
|
-
|
144
|
+
rpositions.sort!{|a, b| a[0] <=> b[0]}
|
140
145
|
|
141
146
|
# To get the offset_mapping before and after replacement
|
142
147
|
offset_mapping = []
|
143
148
|
init_next = 0
|
144
149
|
j = 0
|
145
150
|
|
146
|
-
|
151
|
+
rpositions.each do |loc, old_len, new_len|
|
147
152
|
offset_mapping += (init_next .. loc).map do |i|
|
153
|
+
m = [i, j]
|
148
154
|
j += 1
|
149
|
-
|
155
|
+
m
|
150
156
|
end
|
151
|
-
|
157
|
+
|
158
|
+
init_next = loc + old_len
|
159
|
+
j += (new_len - 1)
|
152
160
|
end
|
153
161
|
|
154
162
|
offset_mapping += (init_next .. text.length).map do |i|
|
163
|
+
m = [i, j]
|
155
164
|
j += 1
|
156
|
-
|
165
|
+
m
|
157
166
|
end
|
158
167
|
|
159
168
|
# To execute the long letter mapping
|
@@ -162,10 +171,41 @@ class TextAlignment::CharMapping
|
|
162
171
|
end
|
163
172
|
|
164
173
|
# To replace multi whitespace sequences to a space
|
165
|
-
|
174
|
+
@method_squeeze_ws.call(text)
|
166
175
|
|
167
176
|
[text, offset_mapping]
|
168
177
|
end
|
178
|
+
|
179
|
+
# To get squeeze positions of whitespaces to one
|
180
|
+
def get_positions_squeeze_ws_1(text)
|
181
|
+
rpositions = []
|
182
|
+
text.scan(/s{2,}/) do |s|
|
183
|
+
loc = $~.begin(0)
|
184
|
+
len = $~.end(0) - loc
|
185
|
+
rpositions << [loc, len, 1]
|
186
|
+
end
|
187
|
+
rpositions
|
188
|
+
end
|
189
|
+
|
190
|
+
# To get squeeze positions of whitespaces to zero
|
191
|
+
def get_positions_squeeze_ws_0(text)
|
192
|
+
rpositions = []
|
193
|
+
text.scan(/\s+/) do |s|
|
194
|
+
loc = $~.begin(0)
|
195
|
+
len = $~.end(0) - loc
|
196
|
+
rpositions << [loc, len, 0]
|
197
|
+
end
|
198
|
+
rpositions
|
199
|
+
end
|
200
|
+
|
201
|
+
def squeeze_ws_1!(text)
|
202
|
+
text.gsub!(/\s{2,}/, ' ')
|
203
|
+
end
|
204
|
+
|
205
|
+
def squeeze_ws_0!(text)
|
206
|
+
text.gsub!(/\s+/, '')
|
207
|
+
end
|
208
|
+
|
169
209
|
end
|
170
210
|
|
171
211
|
if __FILE__ == $0
|
@@ -186,5 +226,5 @@ if __FILE__ == $0
|
|
186
226
|
denotations_mapped = text_mapping.enmap_denotations(denotations)
|
187
227
|
new_annotations = {text:text_mapped, denotations:denotations_mapped}
|
188
228
|
|
189
|
-
puts new_annotations.to_json
|
229
|
+
# puts new_annotations.to_json
|
190
230
|
end
|
data/lib/text_alignment/mixed_alignment.rb
CHANGED
@@ -147,13 +147,24 @@ class TextAlignment::MixedAlignment
|
|
147
147
|
# recoverbility
|
148
148
|
count_nws = sdiff.count{|d| d.old_element =~ /\S/}
|
149
149
|
count_nws_match = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/}
|
150
|
-
|
151
150
|
coverage = count_nws_match.to_f / count_nws
|
152
151
|
|
153
152
|
# fragmentation rate
|
154
|
-
|
155
|
-
|
156
|
-
|
153
|
+
frag_str = sdiff.collect do |d|
|
154
|
+
case d.action
|
155
|
+
when '='
|
156
|
+
'='
|
157
|
+
when '-'
|
158
|
+
''
|
159
|
+
when '+'
|
160
|
+
(d.new_element =~ /\S/) ? '+' : ''
|
161
|
+
else
|
162
|
+
''
|
163
|
+
end
|
164
|
+
end.join.sub(/^[^=]++/, '').sub(/[^=]+$/, '')
|
165
|
+
|
166
|
+
count_frag = frag_str.scan(/=+/).count
|
167
|
+
rate_frag = 1.0 / count_frag
|
157
168
|
|
158
169
|
similarity = coverage * rate_frag
|
159
170
|
end
|
data/lib/text_alignment/text_alignment.rb
CHANGED
@@ -11,14 +11,17 @@ class TextAlignment::TextAlignment
|
|
11
11
|
attr_reader :similarity
|
12
12
|
attr_reader :lost_annotations
|
13
13
|
|
14
|
-
# Initialize with a reference text,
|
15
|
-
def initialize(reference_text,
|
14
|
+
# Initialize with a reference text, against which texts will be aligned
|
15
|
+
def initialize(reference_text, options = {})
|
16
16
|
raise ArgumentError, "nil text" if reference_text.nil?
|
17
17
|
|
18
|
+
options ||= {}
|
19
|
+
@to_prevent_overlap = options[:to_prevent_overlap] || false
|
20
|
+
@squeeze_ws_to = options[:squeeze_ws_to] || 0
|
21
|
+
|
18
22
|
@original_reference_text = reference_text
|
19
|
-
@rtext_mapping = TextAlignment::CharMapping.new(reference_text)
|
23
|
+
@rtext_mapping = TextAlignment::CharMapping.new(reference_text, nil, @squeeze_ws_to)
|
20
24
|
@mapped_reference_text = @rtext_mapping.mapped_text
|
21
|
-
@to_prevent_overlap = to_prevent_overlap
|
22
25
|
|
23
26
|
@original_text = nil
|
24
27
|
@blocks = nil
|
@@ -32,7 +35,7 @@ class TextAlignment::TextAlignment
|
|
32
35
|
# In case the input text is the same as the previous one, reuse the previous text mapping
|
33
36
|
unless @original_text && @original_text == text
|
34
37
|
@original_text = text
|
35
|
-
@text_mapping = TextAlignment::CharMapping.new(text)
|
38
|
+
@text_mapping = TextAlignment::CharMapping.new(text, nil, @squeeze_ws_to)
|
36
39
|
end
|
37
40
|
|
38
41
|
@mapped_text = @text_mapping.mapped_text
|
@@ -202,7 +205,7 @@ class TextAlignment::TextAlignment
|
|
202
205
|
|
203
206
|
def find_block_alignment(str1, str2, denotations, cultivation_map)
|
204
207
|
## to find block alignments
|
205
|
-
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map)
|
208
|
+
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map, @squeeze_ws_to == 1)
|
206
209
|
|
207
210
|
blocks = []
|
208
211
|
while block = anchor_finder.get_next_anchor
|
@@ -320,7 +323,7 @@ class TextAlignment::TextAlignment
|
|
320
323
|
|
321
324
|
def local_alignment(str1, b1, e1, str2, b2, e2, denotations = nil, cultivation_map)
|
322
325
|
tblocks = term_based_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
|
323
|
-
if tblocks.empty?
|
326
|
+
if tblocks.empty? || tblocks.first[:alignment] == :empty
|
324
327
|
lcs_alignment(str1, b1, e1, str2, b2, e2, cultivation_map)
|
325
328
|
else
|
326
329
|
tblocks
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.11.2
|
4
|
+
version: 0.11.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-04-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|