text_alignment 0.11.7 → 0.11.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d07269e998620f3e8f1564a4b81d3710e1898058248377224be0c5398690872f
|
4
|
+
data.tar.gz: 7f457820b2d5a9a9dcbf00ceb89342ca7264e2ef85afd02bd2463256e71680dd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 299320641fb58973b89c64f1db0c79007f43103cb063bd171f386d9118cb49257765d0652024bbd7869daa88c4f18da7338b8d60285f3e0a95258049b9e62562
|
7
|
+
data.tar.gz: 3bdd474509910999521c52808a535227dbf5a6aa74345e39dc0e37eae7a000b51b1df3dbb50598abc865a036a4990cee0f3ccce35471d69cb2ef4bb9633fc83a
|
@@ -6,7 +6,13 @@ module TextAlignment; end unless defined? TextAlignment
|
|
6
6
|
|
7
7
|
class TextAlignment::AnchorFinder
|
8
8
|
|
9
|
-
def initialize(source_str, target_str, cultivation_map)
|
9
|
+
def initialize(source_str, target_str, cultivation_map, squeeze_ws = true)
|
10
|
+
@method_get_left_windows, @method_get_right_windows = if squeeze_ws
|
11
|
+
[method(:get_left_windows), method(:get_right_windows)]
|
12
|
+
else
|
13
|
+
[method(:get_left_windows_no_squeeze_ws), method(:get_right_windows_no_squeeze_ws)]
|
14
|
+
end
|
15
|
+
|
10
16
|
@s1 = source_str.downcase
|
11
17
|
@s2 = target_str.downcase
|
12
18
|
|
@@ -108,14 +114,14 @@ class TextAlignment::AnchorFinder
|
|
108
114
|
next
|
109
115
|
end
|
110
116
|
|
111
|
-
left_window_s1, left_window_s2 =
|
117
|
+
left_window_s1, left_window_s2 = @method_get_left_windows.call(beg_s1, beg_s2, size_window)
|
112
118
|
if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
|
113
119
|
break unless valid_beg_s2.nil?
|
114
120
|
valid_beg_s2 = beg_s2
|
115
121
|
next
|
116
122
|
end
|
117
123
|
|
118
|
-
right_window_s1, right_window_s2 =
|
124
|
+
right_window_s1, right_window_s2 = @method_get_right_windows.call(beg_s1, beg_s2, size_window)
|
119
125
|
if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
|
120
126
|
break unless valid_beg_s2.nil?
|
121
127
|
valid_beg_s2 = beg_s2
|
@@ -139,7 +145,7 @@ class TextAlignment::AnchorFinder
|
|
139
145
|
size_window ||= @size_window
|
140
146
|
|
141
147
|
# comment out below with the assumption that the beginning of a document gives a significant locational information
|
142
|
-
# return if
|
148
|
+
# return if beg_s1 < size_window || beg_s2 < size_window
|
143
149
|
|
144
150
|
window_s1 = ''
|
145
151
|
loc = beg_s1 - 1
|
@@ -170,7 +176,7 @@ class TextAlignment::AnchorFinder
|
|
170
176
|
size_window ||= @size_window
|
171
177
|
|
172
178
|
# comment out below with the assumption that the end of a document gives significant locational information
|
173
|
-
# return if (
|
179
|
+
# return if (beg_s1 + @size_ngram > (@s1.length - size_window)) || (beg_s2 + @size_ngram > (@s2.length - size_window))
|
174
180
|
|
175
181
|
window_s1 = ''
|
176
182
|
loc = beg_s1 + @size_ngram
|
@@ -199,6 +205,44 @@ class TextAlignment::AnchorFinder
|
|
199
205
|
[window_s1, window_s2]
|
200
206
|
end
|
201
207
|
|
208
|
+
# Returns the text immediately to the left of an anchor candidate in both
# strings, taken verbatim (whitespace is not skipped or squeezed).
#
# Each window covers up to +size_window+ characters ending right before the
# given start offset. Near the beginning of a string the window is simply
# shorter: there is deliberately no early return for that case, on the
# assumption that the beginning of a document carries significant
# locational information.
#
# @param beg_s1 [Integer] start offset of the candidate in @s1
# @param beg_s2 [Integer] start offset of the candidate in @s2
# @param size_window [Integer, nil] window length; falls back to @size_window
# @return [Array] two-element array: [left window of @s1, left window of @s2]
def get_left_windows_no_squeeze_ws(beg_s1, beg_s2, size_window = nil)
  size_window ||= @size_window

  # Clamp the window start at 0 so a negative index never wraps around.
  from_s1 = [beg_s1 - size_window, 0].max
  from_s2 = [beg_s2 - size_window, 0].max

  [@s1[from_s1...beg_s1], @s2[from_s2...beg_s2]]
end
|
224
|
+
|
225
|
+
# Returns the text immediately to the right of an anchor candidate in both
# strings, taken verbatim (whitespace is not skipped or squeezed).
#
# Each window starts @size_ngram characters after the given start offset
# and covers up to +size_window+ characters. Near the end of a string the
# window is simply shorter: there is deliberately no early return for that
# case, on the assumption that the end of a document carries significant
# locational information.
#
# @param beg_s1 [Integer] start offset of the candidate in @s1
# @param beg_s2 [Integer] start offset of the candidate in @s2
# @param size_window [Integer, nil] window length; falls back to @size_window
# @return [Array] two-element array: [right window of @s1, right window of @s2]
def get_right_windows_no_squeeze_ws(beg_s1, beg_s2, size_window = nil)
  size_window ||= @size_window

  # Window over @s1, with its end clamped to the end of the string.
  from_s1 = beg_s1 + @size_ngram
  upto_s1 = [from_s1 + size_window, @s1.length].min

  # Window over @s2, clamped the same way.
  from_s2 = beg_s2 + @size_ngram
  upto_s2 = [from_s2 + size_window, @s2.length].min

  [@s1[from_s1...upto_s1], @s2[from_s2...upto_s2]]
end
|
245
|
+
|
202
246
|
def text_similarity(str1, str2, ngram_order = 2)
|
203
247
|
return 0 if str1.nil? || str2.nil?
|
204
248
|
String::Similarity.cosine(str1, str2, ngram:ngram_order)
|
@@ -1,3 +1,5 @@
|
|
1
|
+
require 'strscan'
|
2
|
+
|
1
3
|
module TextAlignment; end unless defined? TextAlignment
|
2
4
|
|
3
5
|
TextAlignment::CHAR_MAPPING = [
|
@@ -80,8 +82,16 @@ TextAlignment::CHAR_MAPPING = [
|
|
80
82
|
class TextAlignment::CharMapping
|
81
83
|
attr_reader :mapped_text
|
82
84
|
|
83
|
-
def initialize(_text, char_mapping = nil)
|
84
|
-
|
85
|
+
def initialize(_text, char_mapping = nil, squeeze_ws_to = 1)
|
86
|
+
if squeeze_ws_to == 0
|
87
|
+
@method_get_positions_squeeze_ws = method(:get_positions_squeeze_ws_0)
|
88
|
+
@method_squeeze_ws = method(:squeeze_ws_0!)
|
89
|
+
else
|
90
|
+
@method_get_positions_squeeze_ws = method(:get_positions_squeeze_ws_1)
|
91
|
+
@method_squeeze_ws = method(:squeeze_ws_1!)
|
92
|
+
end
|
93
|
+
|
94
|
+
char_mapping ||= TextAlignment::CHAR_MAPPING.sort{|a, b| b[1].length <=> a[1].length}
|
85
95
|
@mapped_text, offset_mapping = enmap_text(_text, char_mapping)
|
86
96
|
@index_enmap = offset_mapping.to_h
|
87
97
|
@index_demap = offset_mapping.map{|m| m.reverse}.to_h
|
@@ -105,7 +115,7 @@ class TextAlignment::CharMapping
|
|
105
115
|
|
106
116
|
private
|
107
117
|
|
108
|
-
def enmap_text(_text, char_mapping)
|
118
|
+
def enmap_text(_text, char_mapping, no_ws = false)
|
109
119
|
text = _text.dup
|
110
120
|
|
111
121
|
# To execute the single letter mapping replacement
|
@@ -113,14 +123,14 @@ class TextAlignment::CharMapping
|
|
113
123
|
text.gsub!(one, long) if long.length == 1
|
114
124
|
end
|
115
125
|
|
116
|
-
# To get the (
|
117
|
-
|
126
|
+
# To get the replacement positions, (position, old_length, new_length), for char mappings
|
127
|
+
rpositions = []
|
118
128
|
char_mapping.each do |one, long|
|
119
129
|
next if long.length == 1
|
120
130
|
|
121
131
|
init_next = 0
|
122
132
|
while loc = text.index(long, init_next)
|
123
|
-
|
133
|
+
rpositions << [loc, long.length, 1]
|
124
134
|
init_next = loc + long.length
|
125
135
|
end
|
126
136
|
|
@@ -128,22 +138,17 @@ class TextAlignment::CharMapping
|
|
128
138
|
text.gsub!(long, one * long.length)
|
129
139
|
end
|
130
140
|
|
131
|
-
# To get the (
|
132
|
-
|
133
|
-
while loc = text.index(/\s{1,}/, init_next)
|
134
|
-
len = $~[0].length
|
135
|
-
loc_len << [loc, len, 0]
|
136
|
-
init_next = loc + len
|
137
|
-
end
|
141
|
+
# To get the replacement positions, (position, old_length, new_length), for consecutive whitespaces
|
142
|
+
rpositions += @method_get_positions_squeeze_ws.call(text)
|
138
143
|
|
139
|
-
|
144
|
+
rpositions.sort!{|a, b| a[0] <=> b[0]}
|
140
145
|
|
141
146
|
# To get the offset_mapping before and after replacement
|
142
147
|
offset_mapping = []
|
143
148
|
init_next = 0
|
144
149
|
j = 0
|
145
150
|
|
146
|
-
|
151
|
+
rpositions.each do |loc, old_len, new_len|
|
147
152
|
offset_mapping += (init_next .. loc).map do |i|
|
148
153
|
m = [i, j]
|
149
154
|
j += 1
|
@@ -166,10 +171,41 @@ class TextAlignment::CharMapping
|
|
166
171
|
end
|
167
172
|
|
168
173
|
# To squeeze whitespace sequences using the configured method (squeeze_ws_0! or squeeze_ws_1!)
|
169
|
-
|
174
|
+
@method_squeeze_ws.call(text)
|
170
175
|
|
171
176
|
[text, offset_mapping]
|
172
177
|
end
|
178
|
+
|
179
|
+
# To get the squeeze positions of whitespace runs that are squeezed to one
# character.
#
# Finds every run of two or more whitespace characters in +text+ and
# reports it as a replacement position.
#
# The pattern has to be the same one squeeze_ws_1! substitutes
# (/\s{2,}/). The previous pattern, /s{2,}/, lacked the backslash and
# matched runs of the letter "s" (e.g. in "class") instead of whitespace,
# so the recorded positions did not correspond to what squeeze_ws_1!
# actually replaces.
#
# @param text [String] the text to inspect (not modified)
# @return [Array<Array>] [position, old_length, new_length] triples in
#   order of occurrence; new_length is always 1
def get_positions_squeeze_ws_1(text)
  rpositions = []
  text.scan(/\s{2,}/) do
    # Inside the scan block the last match describes the current run.
    run = Regexp.last_match
    loc = run.begin(0)
    rpositions << [loc, run.end(0) - loc, 1]
  end
  rpositions
end
|
189
|
+
|
190
|
+
# To get the squeeze positions of whitespace runs that are squeezed to
# nothing.
#
# Walks through +text+ and reports every run of one or more whitespace
# characters as a replacement position.
#
# @param text [String] the text to inspect (not modified)
# @return [Array<Array>] [position, old_length, new_length] triples in
#   order of occurrence; new_length is always 0
def get_positions_squeeze_ws_0(text)
  spans = []
  cursor = 0
  # Search repeatedly, resuming right after the previous run.
  while (found = text.match(/\s+/, cursor))
    start, stop = found.offset(0)
    spans << [start, stop - start, 0]
    cursor = stop
  end
  spans
end
|
200
|
+
|
201
|
+
# Collapses, in place, every run of two or more whitespace characters in
# +text+ into a single space.
#
# @param text [String] the string to modify
# @return [String, nil] +text+ when at least one run was replaced,
#   nil when nothing changed (String#gsub! semantics)
def squeeze_ws_1!(text)
  multi_ws = /\s{2,}/
  text.gsub!(multi_ws) { ' ' }
end
|
204
|
+
|
205
|
+
# Removes, in place, every run of whitespace characters from +text+.
#
# @param text [String] the string to modify
# @return [String, nil] +text+ when at least one run was removed,
#   nil when nothing changed (String#gsub! semantics)
def squeeze_ws_0!(text)
  any_ws = /\s+/
  text.gsub!(any_ws) { '' }
end
|
208
|
+
|
173
209
|
end
|
174
210
|
|
175
211
|
if __FILE__ == $0
|
@@ -190,5 +226,5 @@ if __FILE__ == $0
|
|
190
226
|
denotations_mapped = text_mapping.enmap_denotations(denotations)
|
191
227
|
new_annotations = {text:text_mapped, denotations:denotations_mapped}
|
192
228
|
|
193
|
-
puts new_annotations.to_json
|
229
|
+
# puts new_annotations.to_json
|
194
230
|
end
|
@@ -11,14 +11,17 @@ class TextAlignment::TextAlignment
|
|
11
11
|
attr_reader :similarity
|
12
12
|
attr_reader :lost_annotations
|
13
13
|
|
14
|
-
# Initialize with a reference text,
|
15
|
-
def initialize(reference_text,
|
14
|
+
# Initialize with a reference text, against which texts will be aligned
|
15
|
+
def initialize(reference_text, options = {})
|
16
16
|
raise ArgumentError, "nil text" if reference_text.nil?
|
17
17
|
|
18
|
+
options ||= {}
|
19
|
+
@to_prevent_overlap = options[:to_prevent_overlap] || false
|
20
|
+
@squeeze_ws_to = options[:squeeze_ws_to] || 0
|
21
|
+
|
18
22
|
@original_reference_text = reference_text
|
19
|
-
@rtext_mapping = TextAlignment::CharMapping.new(reference_text)
|
23
|
+
@rtext_mapping = TextAlignment::CharMapping.new(reference_text, nil, @squeeze_ws_to)
|
20
24
|
@mapped_reference_text = @rtext_mapping.mapped_text
|
21
|
-
@to_prevent_overlap = to_prevent_overlap
|
22
25
|
|
23
26
|
@original_text = nil
|
24
27
|
@blocks = nil
|
@@ -32,7 +35,7 @@ class TextAlignment::TextAlignment
|
|
32
35
|
# In case the input text is the same as the previous one, reuse the previous text mapping
|
33
36
|
unless @original_text && @original_text == text
|
34
37
|
@original_text = text
|
35
|
-
@text_mapping = TextAlignment::CharMapping.new(text)
|
38
|
+
@text_mapping = TextAlignment::CharMapping.new(text, nil, @squeeze_ws_to)
|
36
39
|
end
|
37
40
|
|
38
41
|
@mapped_text = @text_mapping.mapped_text
|
@@ -202,7 +205,7 @@ class TextAlignment::TextAlignment
|
|
202
205
|
|
203
206
|
def find_block_alignment(str1, str2, denotations, cultivation_map)
|
204
207
|
## to find block alignments
|
205
|
-
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map)
|
208
|
+
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map, @squeeze_ws_to == 1)
|
206
209
|
|
207
210
|
blocks = []
|
208
211
|
while block = anchor_finder.get_next_anchor
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.11.
|
4
|
+
version: 0.11.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-04-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|