text_alignment 0.11.7 → 0.11.8
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d07269e998620f3e8f1564a4b81d3710e1898058248377224be0c5398690872f
|
4
|
+
data.tar.gz: 7f457820b2d5a9a9dcbf00ceb89342ca7264e2ef85afd02bd2463256e71680dd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 299320641fb58973b89c64f1db0c79007f43103cb063bd171f386d9118cb49257765d0652024bbd7869daa88c4f18da7338b8d60285f3e0a95258049b9e62562
|
7
|
+
data.tar.gz: 3bdd474509910999521c52808a535227dbf5a6aa74345e39dc0e37eae7a000b51b1df3dbb50598abc865a036a4990cee0f3ccce35471d69cb2ef4bb9633fc83a
|
@@ -6,7 +6,13 @@ module TextAlignment; end unless defined? TextAlignment
|
|
6
6
|
|
7
7
|
class TextAlignment::AnchorFinder
|
8
8
|
|
9
|
-
def initialize(source_str, target_str, cultivation_map)
|
9
|
+
def initialize(source_str, target_str, cultivation_map, squeeze_ws = true)
|
10
|
+
@method_get_left_windows, @method_get_right_windows = if squeeze_ws
|
11
|
+
[method(:get_left_windows), method(:get_right_windows)]
|
12
|
+
else
|
13
|
+
[method(:get_left_windows_no_squeeze_ws), method(:get_right_windows_no_squeeze_ws)]
|
14
|
+
end
|
15
|
+
|
10
16
|
@s1 = source_str.downcase
|
11
17
|
@s2 = target_str.downcase
|
12
18
|
|
@@ -108,14 +114,14 @@ class TextAlignment::AnchorFinder
|
|
108
114
|
next
|
109
115
|
end
|
110
116
|
|
111
|
-
left_window_s1, left_window_s2 =
|
117
|
+
left_window_s1, left_window_s2 = @method_get_left_windows.call(beg_s1, beg_s2, size_window)
|
112
118
|
if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
|
113
119
|
break unless valid_beg_s2.nil?
|
114
120
|
valid_beg_s2 = beg_s2
|
115
121
|
next
|
116
122
|
end
|
117
123
|
|
118
|
-
right_window_s1, right_window_s2 =
|
124
|
+
right_window_s1, right_window_s2 = @method_get_right_windows.call(beg_s1, beg_s2, size_window)
|
119
125
|
if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
|
120
126
|
break unless valid_beg_s2.nil?
|
121
127
|
valid_beg_s2 = beg_s2
|
@@ -139,7 +145,7 @@ class TextAlignment::AnchorFinder
|
|
139
145
|
size_window ||= @size_window
|
140
146
|
|
141
147
|
# comment out below with the assumption that the beginning of a document gives significant locational information
|
142
|
-
# return if
|
148
|
+
# return if beg_s1 < size_window || beg_s2 < size_window
|
143
149
|
|
144
150
|
window_s1 = ''
|
145
151
|
loc = beg_s1 - 1
|
@@ -170,7 +176,7 @@ class TextAlignment::AnchorFinder
|
|
170
176
|
size_window ||= @size_window
|
171
177
|
|
172
178
|
# comment out below with the assumption that the end of a document gives significant locational information
|
173
|
-
# return if (
|
179
|
+
# return if (beg_s1 + @size_ngram > (@s1.length - size_window)) || (beg_s2 + @size_ngram > (@s2.length - size_window))
|
174
180
|
|
175
181
|
window_s1 = ''
|
176
182
|
loc = beg_s1 + @size_ngram
|
@@ -199,6 +205,44 @@ class TextAlignment::AnchorFinder
|
|
199
205
|
[window_s1, window_s2]
|
200
206
|
end
|
201
207
|
|
208
|
+
# Returns the raw text windows immediately to the left of an anchor candidate,
# taken from @s1 and @s2 without any whitespace squeezing.
#
# beg_s1      - start offset of the candidate in @s1
# beg_s2      - start offset of the candidate in @s2
# size_window - window length in characters; falls back to @size_window when nil
#
# Returns a two-element array [window_s1, window_s2]. Each window ends right
# before its anchor offset and is shortened when it would start before the
# beginning of the string.
def get_left_windows_no_squeeze_ws(beg_s1, beg_s2, size_window = nil)
  size_window ||= @size_window

  # The guard below is intentionally disabled, on the assumption that the
  # beginning of a document gives significant locational information.
  # return if beg_s1 < size_window || beg_s2 < size_window

  [[@s1, beg_s1], [@s2, beg_s2]].map do |str, anchor|
    # Clamp the window start so it never falls before the string start.
    start = [anchor - size_window, 0].max
    str[start...anchor]
  end
end
|
224
|
+
|
225
|
+
# Returns the raw text windows immediately to the right of an anchor candidate,
# taken from @s1 and @s2 without any whitespace squeezing.
#
# beg_s1      - start offset of the candidate in @s1
# beg_s2      - start offset of the candidate in @s2
# size_window - window length in characters; falls back to @size_window when nil
#
# Returns a two-element array [window_s1, window_s2]. Each window starts
# @size_ngram characters after its anchor offset and is shortened when it
# would run past the end of the string. A window is nil when its start lies
# beyond the end of the string.
def get_right_windows_no_squeeze_ws(beg_s1, beg_s2, size_window = nil)
  size_window ||= @size_window

  # The guard below is intentionally disabled, on the assumption that the
  # end of a document gives significant locational information.
  # return if (beg_s1 + @size_ngram > (@s1.length - size_window)) || (beg_s2 + @size_ngram > (@s2.length - size_window))

  [[@s1, beg_s1], [@s2, beg_s2]].map do |str, anchor|
    limit = str.length
    start = anchor + @size_ngram
    # Clamp the window end so it never runs past the string end.
    stop = [start + size_window, limit].min
    str[start...stop]
  end
end
|
245
|
+
|
202
246
|
def text_similarity(str1, str2, ngram_order = 2)
|
203
247
|
return 0 if str1.nil? || str2.nil?
|
204
248
|
String::Similarity.cosine(str1, str2, ngram:ngram_order)
|
@@ -1,3 +1,5 @@
|
|
1
|
+
require 'strscan'
|
2
|
+
|
1
3
|
module TextAlignment; end unless defined? TextAlignment
|
2
4
|
|
3
5
|
TextAlignment::CHAR_MAPPING = [
|
@@ -80,8 +82,16 @@ TextAlignment::CHAR_MAPPING = [
|
|
80
82
|
class TextAlignment::CharMapping
|
81
83
|
attr_reader :mapped_text
|
82
84
|
|
83
|
-
def initialize(_text, char_mapping = nil)
|
84
|
-
|
85
|
+
def initialize(_text, char_mapping = nil, squeeze_ws_to = 1)
|
86
|
+
if squeeze_ws_to == 0
|
87
|
+
@method_get_positions_squeeze_ws = method(:get_positions_squeeze_ws_0)
|
88
|
+
@method_squeeze_ws = method(:squeeze_ws_0!)
|
89
|
+
else
|
90
|
+
@method_get_positions_squeeze_ws = method(:get_positions_squeeze_ws_1)
|
91
|
+
@method_squeeze_ws = method(:squeeze_ws_1!)
|
92
|
+
end
|
93
|
+
|
94
|
+
char_mapping ||= TextAlignment::CHAR_MAPPING.sort{|a, b| b[1].length <=> a[1].length}
|
85
95
|
@mapped_text, offset_mapping = enmap_text(_text, char_mapping)
|
86
96
|
@index_enmap = offset_mapping.to_h
|
87
97
|
@index_demap = offset_mapping.map{|m| m.reverse}.to_h
|
@@ -105,7 +115,7 @@ class TextAlignment::CharMapping
|
|
105
115
|
|
106
116
|
private
|
107
117
|
|
108
|
-
def enmap_text(_text, char_mapping)
|
118
|
+
def enmap_text(_text, char_mapping, no_ws = false)
|
109
119
|
text = _text.dup
|
110
120
|
|
111
121
|
# To execute the single letter mapping replacement
|
@@ -113,14 +123,14 @@ class TextAlignment::CharMapping
|
|
113
123
|
text.gsub!(one, long) if long.length == 1
|
114
124
|
end
|
115
125
|
|
116
|
-
# To get the (
|
117
|
-
|
126
|
+
# To get the replacement positions, (position, old_length, new_length), for char mappings
|
127
|
+
rpositions = []
|
118
128
|
char_mapping.each do |one, long|
|
119
129
|
next if long.length == 1
|
120
130
|
|
121
131
|
init_next = 0
|
122
132
|
while loc = text.index(long, init_next)
|
123
|
-
|
133
|
+
rpositions << [loc, long.length, 1]
|
124
134
|
init_next = loc + long.length
|
125
135
|
end
|
126
136
|
|
@@ -128,22 +138,17 @@ class TextAlignment::CharMapping
|
|
128
138
|
text.gsub!(long, one * long.length)
|
129
139
|
end
|
130
140
|
|
131
|
-
# To get the (
|
132
|
-
|
133
|
-
while loc = text.index(/\s{1,}/, init_next)
|
134
|
-
len = $~[0].length
|
135
|
-
loc_len << [loc, len, 0]
|
136
|
-
init_next = loc + len
|
137
|
-
end
|
141
|
+
# To get the replacement positions, (position, old_length, new_length), for consecutive whitespaces
|
142
|
+
rpositions += @method_get_positions_squeeze_ws.call(text)
|
138
143
|
|
139
|
-
|
144
|
+
rpositions.sort!{|a, b| a[0] <=> b[0]}
|
140
145
|
|
141
146
|
# To get the offset_mapping before and after replacement
|
142
147
|
offset_mapping = []
|
143
148
|
init_next = 0
|
144
149
|
j = 0
|
145
150
|
|
146
|
-
|
151
|
+
rpositions.each do |loc, old_len, new_len|
|
147
152
|
offset_mapping += (init_next .. loc).map do |i|
|
148
153
|
m = [i, j]
|
149
154
|
j += 1
|
@@ -166,10 +171,41 @@ class TextAlignment::CharMapping
|
|
166
171
|
end
|
167
172
|
|
168
173
|
# To squeeze whitespace sequences (to a single space or to nothing, depending on the selected squeeze method)
|
169
|
-
|
174
|
+
@method_squeeze_ws.call(text)
|
170
175
|
|
171
176
|
[text, offset_mapping]
|
172
177
|
end
|
178
|
+
|
179
|
+
# To get squeeze positions of whitespaces to one
|
180
|
+
# Collects the replacement positions for squeezing whitespace runs to one
# character.
#
# text - the string to inspect (not modified)
#
# Returns an array of [position, old_length, new_length] triples, one per run
# of two or more consecutive whitespace characters, in order of appearance.
# new_length is always 1.
def get_positions_squeeze_ws_1(text)
  rpositions = []

  # The pattern must be \s{2,} (whitespace), matching what squeeze_ws_1!
  # replaces; a bare s{2,} would match runs of the letter "s" instead.
  text.scan(/\s{2,}/) do
    loc, fin = Regexp.last_match.offset(0)
    rpositions << [loc, fin - loc, 1]
  end

  rpositions
end
|
189
|
+
|
190
|
+
# To get squeeze positions of whitespaces to zero
|
191
|
+
# Collects the replacement positions for removing whitespace entirely.
#
# text - the string to inspect (not modified)
#
# Returns an array of [position, old_length, new_length] triples, one per run
# of one or more whitespace characters, in order of appearance.
# new_length is always 0.
def get_positions_squeeze_ws_0(text)
  positions = []
  cursor = 0

  # Walk the string match by match, resuming right after each whitespace run.
  while (found = text.match(/\s+/, cursor))
    start, stop = found.offset(0)
    positions << [start, stop - start, 0]
    cursor = stop
  end

  positions
end
|
200
|
+
|
201
|
+
# Collapses, in place, every run of two or more whitespace characters in
# +text+ into a single space.
#
# Returns +text+ when at least one replacement was made, otherwise nil
# (the String#gsub! convention).
def squeeze_ws_1!(text)
  multi_ws = /\s{2,}/
  text.gsub!(multi_ws, ' ')
end
|
204
|
+
|
205
|
+
# Removes, in place, every run of whitespace characters from +text+.
#
# Returns +text+ when at least one replacement was made, otherwise nil
# (the String#gsub! convention).
def squeeze_ws_0!(text)
  any_ws = /\s+/
  text.gsub!(any_ws, '')
end
|
208
|
+
|
173
209
|
end
|
174
210
|
|
175
211
|
if __FILE__ == $0
|
@@ -190,5 +226,5 @@ if __FILE__ == $0
|
|
190
226
|
denotations_mapped = text_mapping.enmap_denotations(denotations)
|
191
227
|
new_annotations = {text:text_mapped, denotations:denotations_mapped}
|
192
228
|
|
193
|
-
puts new_annotations.to_json
|
229
|
+
# puts new_annotations.to_json
|
194
230
|
end
|
@@ -11,14 +11,17 @@ class TextAlignment::TextAlignment
|
|
11
11
|
attr_reader :similarity
|
12
12
|
attr_reader :lost_annotations
|
13
13
|
|
14
|
-
# Initialize with a reference text,
|
15
|
-
def initialize(reference_text,
|
14
|
+
# Initialize with a reference text, against which texts will be aligned
|
15
|
+
def initialize(reference_text, options = {})
|
16
16
|
raise ArgumentError, "nil text" if reference_text.nil?
|
17
17
|
|
18
|
+
options ||= {}
|
19
|
+
@to_prevent_overlap = options[:to_prevent_overlap] || false
|
20
|
+
@squeeze_ws_to = options[:squeeze_ws_to] || 0
|
21
|
+
|
18
22
|
@original_reference_text = reference_text
|
19
|
-
@rtext_mapping = TextAlignment::CharMapping.new(reference_text)
|
23
|
+
@rtext_mapping = TextAlignment::CharMapping.new(reference_text, nil, @squeeze_ws_to)
|
20
24
|
@mapped_reference_text = @rtext_mapping.mapped_text
|
21
|
-
@to_prevent_overlap = to_prevent_overlap
|
22
25
|
|
23
26
|
@original_text = nil
|
24
27
|
@blocks = nil
|
@@ -32,7 +35,7 @@ class TextAlignment::TextAlignment
|
|
32
35
|
# In case the input text is the same as the previous one, reuse the previous text mapping
|
33
36
|
unless @original_text && @original_text == text
|
34
37
|
@original_text = text
|
35
|
-
@text_mapping = TextAlignment::CharMapping.new(text)
|
38
|
+
@text_mapping = TextAlignment::CharMapping.new(text, nil, @squeeze_ws_to)
|
36
39
|
end
|
37
40
|
|
38
41
|
@mapped_text = @text_mapping.mapped_text
|
@@ -202,7 +205,7 @@ class TextAlignment::TextAlignment
|
|
202
205
|
|
203
206
|
def find_block_alignment(str1, str2, denotations, cultivation_map)
|
204
207
|
## to find block alignments
|
205
|
-
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map)
|
208
|
+
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map, @squeeze_ws_to == 1)
|
206
209
|
|
207
210
|
blocks = []
|
208
211
|
while block = anchor_finder.get_next_anchor
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.11.
|
4
|
+
version: 0.11.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-04-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|