text_alignment 0.12.9 → 0.12.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/text_alignment/anchor_finder.rb +39 -44
- data/lib/text_alignment/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2036a650644cd3b814d8c74d9a7bc7c4cb647adf51b38a5965b5f0f121144276
|
4
|
+
data.tar.gz: 4619c1c428626857d1d189cb1a6d64c870cc3a427c1e31ea0fb8f4decb417b88
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9fa5d56dbd8bc4372022e3d58b82958bc3561155a8d33c9f9e05ec48f96ef431d1309e4634a11909b31f32609491f8691a3b8ad41f798e7bb584fd5aa3b5c7ac
|
7
|
+
data.tar.gz: fb58b8aea21cfd8a9ebf24edc693d63397ee7a8f72d175feb8da63a0d0160bb3a4740202e7ce5dd9ba7803d9ad18ebeb38f98e493741d46517c5007f40c8aba9
|
@@ -28,6 +28,9 @@ class TextAlignment::AnchorFinder
|
|
28
28
|
# positions of last match
|
29
29
|
@pos_s1_last_match = 0
|
30
30
|
@pos_s2_last_match = 0
|
31
|
+
|
32
|
+
# Performance: cache for character classification
|
33
|
+
@half_ngram = @size_ngram / 2
|
31
34
|
end
|
32
35
|
|
33
36
|
def get_next_anchor
|
@@ -36,11 +39,13 @@ class TextAlignment::AnchorFinder
|
|
36
39
|
beg_s2 = for beg_s1 in @pos_s1_last_match .. @pos_s1_final_possible_begin
|
37
40
|
iterations += 1
|
38
41
|
|
42
|
+
char = @s1[beg_s1]
|
43
|
+
|
39
44
|
# To skip whitespace letters
|
40
|
-
next if
|
45
|
+
next if char == ' ' || char == "\n" || char == "\t"
|
41
46
|
|
42
47
|
# Skip positions that start with punctuation or numbers (likely poor anchors)
|
43
|
-
next if
|
48
|
+
next if char < 'A' || (char > 'Z' && char < 'a') || char > 'z'
|
44
49
|
|
45
50
|
# Performance optimization: skip if we've had too many failed attempts recently
|
46
51
|
if iterations > 50 && @recent_failures && @recent_failures > 10
|
@@ -93,11 +98,11 @@ class TextAlignment::AnchorFinder
|
|
93
98
|
|
94
99
|
# Quick frequency check: skip very short or very common ngrams
|
95
100
|
return nil if anchor.length < @size_ngram
|
96
|
-
return nil if anchor
|
101
|
+
return nil if anchor.chars.uniq.length == 1 # Skip repeating character patterns like "aaaaaaaa"
|
97
102
|
|
98
103
|
# Skip ngrams that are mostly whitespace or punctuation
|
99
104
|
non_alnum_count = anchor.count("^a-zA-Z0-9")
|
100
|
-
return nil if non_alnum_count > @
|
105
|
+
return nil if non_alnum_count > @half_ngram
|
101
106
|
|
102
107
|
search_position = @to_ignore_text_order ? 0 : @pos_s2_last_match
|
103
108
|
|
@@ -126,45 +131,26 @@ class TextAlignment::AnchorFinder
|
|
126
131
|
end
|
127
132
|
|
128
133
|
def find_valid_beg_s2(beg_s1, beg_s2_candidates)
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
valid_beg_s2 = nil
|
133
|
-
|
134
|
-
r = beg_s2_candidates.each do |beg_s2|
|
135
|
-
# if both the begining points are sufficiantly close to the end points of the last match
|
136
|
-
# break if beg_s1 > 0 && beg_s2 > 0 && (beg_s1 - @pos_s1_last_match < 5) && (beg_s2 >= @pos_s2_last_match) && (beg_s2 - @pos_s2_last_match < 5)
|
134
|
+
[10, 20, 30].each do |size_window|
|
135
|
+
beg_s2_candidates.each do |beg_s2|
|
136
|
+
# if both the beginning points are sufficiently close to the end points of the last match
|
137
137
|
if beg_s1 > 0 && beg_s2 > 0 && (beg_s1 - @pos_s1_last_match < 5) && (beg_s2 - @pos_s2_last_match < 5)
|
138
|
-
|
139
|
-
valid_beg_s2 = beg_s2
|
140
|
-
next
|
138
|
+
return beg_s2
|
141
139
|
end
|
142
140
|
|
143
141
|
left_window_s1, left_window_s2 = @method_get_left_windows.call(beg_s1, beg_s2, size_window)
|
144
142
|
if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
|
145
|
-
|
146
|
-
valid_beg_s2 = beg_s2
|
147
|
-
next
|
143
|
+
return beg_s2
|
148
144
|
end
|
149
145
|
|
150
146
|
right_window_s1, right_window_s2 = @method_get_right_windows.call(beg_s1, beg_s2, size_window)
|
151
147
|
if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
|
152
|
-
|
153
|
-
valid_beg_s2 = beg_s2
|
154
|
-
next
|
148
|
+
return beg_s2
|
155
149
|
end
|
156
150
|
end
|
157
|
-
|
158
|
-
# r == nil means that the inner loop was broken (multiple candidates had passed the tests)
|
159
|
-
# r != nil means that the inner loop was completed (with or w/o a valid beg_s2 found)
|
160
|
-
if r.nil?
|
161
|
-
valid_beg_s2 = nil
|
162
|
-
else
|
163
|
-
break
|
164
|
-
end
|
165
151
|
end
|
166
152
|
|
167
|
-
|
153
|
+
nil
|
168
154
|
end
|
169
155
|
|
170
156
|
def get_left_windows(beg_s1, beg_s2, size_window = nil)
|
@@ -173,29 +159,31 @@ class TextAlignment::AnchorFinder
|
|
173
159
|
# comment out below with the assumption that the beginning of a document gives a significant locational information
|
174
160
|
# return if beg_s1 < size_window || beg_s2 < size_window
|
175
161
|
|
176
|
-
|
162
|
+
chars1 = []
|
177
163
|
loc = beg_s1 - 1
|
178
164
|
count = 0
|
179
165
|
while count < size_window && loc >= 0
|
180
|
-
|
181
|
-
|
166
|
+
char = @s1[loc]
|
167
|
+
if alnum_char?(char)
|
168
|
+
chars1 << char
|
182
169
|
count += 1
|
183
170
|
end
|
184
171
|
loc -= 1
|
185
172
|
end
|
186
173
|
|
187
|
-
|
174
|
+
chars2 = []
|
188
175
|
loc = beg_s2 - 1
|
189
176
|
count = 0
|
190
177
|
while count < size_window && loc >= 0
|
191
|
-
|
192
|
-
|
178
|
+
char = @s2[loc]
|
179
|
+
if alnum_char?(char)
|
180
|
+
chars2 << char
|
193
181
|
count += 1
|
194
182
|
end
|
195
183
|
loc -= 1
|
196
184
|
end
|
197
185
|
|
198
|
-
[
|
186
|
+
[chars1.join, chars2.join]
|
199
187
|
end
|
200
188
|
|
201
189
|
def get_right_windows(beg_s1, beg_s2, size_window = nil)
|
@@ -204,31 +192,33 @@ class TextAlignment::AnchorFinder
|
|
204
192
|
# commend below with the assumption that the end of a document gives a significant locational
|
205
193
|
# return if (beg_s1 + @size_ngram > (@s1.length - size_window)) || (beg_s2 + @size_ngram > (@s2.length - size_window))
|
206
194
|
|
207
|
-
|
195
|
+
chars1 = []
|
208
196
|
loc = beg_s1 + @size_ngram
|
209
197
|
len_s1 = @s1.length
|
210
198
|
count = 0
|
211
199
|
while count < size_window && loc < len_s1
|
212
|
-
|
213
|
-
|
200
|
+
char = @s1[loc]
|
201
|
+
if alnum_char?(char)
|
202
|
+
chars1 << char
|
214
203
|
count += 1
|
215
204
|
end
|
216
205
|
loc += 1
|
217
206
|
end
|
218
207
|
|
219
|
-
|
208
|
+
chars2 = []
|
220
209
|
loc = beg_s2 + @size_ngram
|
221
210
|
len_s2 = @s2.length
|
222
211
|
count = 0
|
223
212
|
while count < size_window && loc < len_s2
|
224
|
-
|
225
|
-
|
213
|
+
char = @s2[loc]
|
214
|
+
if alnum_char?(char)
|
215
|
+
chars2 << char
|
226
216
|
count += 1
|
227
217
|
end
|
228
218
|
loc += 1
|
229
219
|
end
|
230
220
|
|
231
|
-
[
|
221
|
+
[chars1.join, chars2.join]
|
232
222
|
end
|
233
223
|
|
234
224
|
def get_left_windows_no_squeeze_ws(beg_s1, beg_s2, size_window = nil)
|
@@ -273,4 +263,9 @@ class TextAlignment::AnchorFinder
|
|
273
263
|
return 0 if str1.nil? || str2.nil?
|
274
264
|
String::Similarity.cosine(str1, str2, ngram:ngram_order)
|
275
265
|
end
|
266
|
+
|
267
|
+
# Fast alphanumeric character check without regex
|
268
|
+
def alnum_char?(char)
|
269
|
+
(char >= 'a' && char <= 'z') || (char >= 'A' && char <= 'Z') || (char >= '0' && char <= '9')
|
270
|
+
end
|
276
271
|
end
|