text_alignment 0.12.8 → 0.12.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/text_alignment/anchor_finder.rb +65 -42
- data/lib/text_alignment/text_alignment.rb +4 -2
- data/lib/text_alignment/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2036a650644cd3b814d8c74d9a7bc7c4cb647adf51b38a5965b5f0f121144276
|
4
|
+
data.tar.gz: 4619c1c428626857d1d189cb1a6d64c870cc3a427c1e31ea0fb8f4decb417b88
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9fa5d56dbd8bc4372022e3d58b82958bc3561155a8d33c9f9e05ec48f96ef431d1309e4634a11909b31f32609491f8691a3b8ad41f798e7bb584fd5aa3b5c7ac
|
7
|
+
data.tar.gz: fb58b8aea21cfd8a9ebf24edc693d63397ee7a8f72d175feb8da63a0d0160bb3a4740202e7ce5dd9ba7803d9ad18ebeb38f98e493741d46517c5007f40c8aba9
|
@@ -28,16 +28,40 @@ class TextAlignment::AnchorFinder
|
|
28
28
|
# positions of last match
|
29
29
|
@pos_s1_last_match = 0
|
30
30
|
@pos_s2_last_match = 0
|
31
|
+
|
32
|
+
# Performance: precompute half of the ngram size (used as the non-alphanumeric threshold when filtering anchors)
|
33
|
+
@half_ngram = @size_ngram / 2
|
31
34
|
end
|
32
35
|
|
33
36
|
def get_next_anchor
|
34
37
|
# To find the beginning positions of an anchor ngram in s1 and s2, beginning from the last positions matched
|
38
|
+
iterations = 0
|
35
39
|
beg_s2 = for beg_s1 in @pos_s1_last_match .. @pos_s1_final_possible_begin
|
40
|
+
iterations += 1
|
41
|
+
|
42
|
+
char = @s1[beg_s1]
|
36
43
|
|
37
44
|
# To skip whitespace letters
|
38
|
-
next if
|
45
|
+
next if char == ' ' || char == "\n" || char == "\t"
|
46
|
+
|
47
|
+
# Skip positions that start with punctuation or numbers (likely poor anchors)
|
48
|
+
next if char < 'A' || (char > 'Z' && char < 'a') || char > 'z'
|
49
|
+
|
50
|
+
# Performance optimization: skip if we've had too many failed attempts recently
|
51
|
+
if iterations > 50 && @recent_failures && @recent_failures > 10
|
52
|
+
step_size = [@recent_failures / 5, 3].min
|
53
|
+
beg_s1 += step_size
|
54
|
+
next if beg_s1 > @pos_s1_final_possible_begin
|
55
|
+
end
|
39
56
|
|
40
57
|
_beg_s2 = get_beg_s2(beg_s1)
|
58
|
+
|
59
|
+
if _beg_s2.nil?
|
60
|
+
@recent_failures = (@recent_failures || 0) + 1
|
61
|
+
else
|
62
|
+
@recent_failures = 0 # Reset on success
|
63
|
+
end
|
64
|
+
|
41
65
|
break _beg_s2 unless _beg_s2.nil?
|
42
66
|
end
|
43
67
|
|
@@ -72,7 +96,16 @@ class TextAlignment::AnchorFinder
|
|
72
96
|
# to get the anchor to search for in s2
|
73
97
|
anchor = @s1[beg_s1, @size_ngram]
|
74
98
|
|
99
|
+
# Quick frequency check: skip very short or very common ngrams
|
100
|
+
return nil if anchor.length < @size_ngram
|
101
|
+
return nil if anchor.chars.uniq.length == 1 # Skip repeating character patterns like "aaaaaaaa"
|
102
|
+
|
103
|
+
# Skip ngrams that are mostly whitespace or punctuation
|
104
|
+
non_alnum_count = anchor.count("^a-zA-Z0-9")
|
105
|
+
return nil if non_alnum_count > @half_ngram
|
106
|
+
|
75
107
|
search_position = @to_ignore_text_order ? 0 : @pos_s2_last_match
|
108
|
+
|
76
109
|
beg_s2_candidates = find_beg_s2_candidates(anchor, search_position)
|
77
110
|
return nil if beg_s2_candidates.empty?
|
78
111
|
|
@@ -87,7 +120,7 @@ class TextAlignment::AnchorFinder
|
|
87
120
|
candidates << _beg_s2
|
88
121
|
|
89
122
|
# for speed, skip anchor of high frequency
|
90
|
-
if candidates.length >
|
123
|
+
if candidates.length > 3
|
91
124
|
candidates.clear
|
92
125
|
break
|
93
126
|
end
|
@@ -98,45 +131,26 @@ class TextAlignment::AnchorFinder
|
|
98
131
|
end
|
99
132
|
|
100
133
|
def find_valid_beg_s2(beg_s1, beg_s2_candidates)
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
valid_beg_s2 = nil
|
105
|
-
|
106
|
-
r = beg_s2_candidates.each do |beg_s2|
|
107
|
-
# if both the begining points are sufficiantly close to the end points of the last match
|
108
|
-
# break if beg_s1 > 0 && beg_s2 > 0 && (beg_s1 - @pos_s1_last_match < 5) && (beg_s2 >= @pos_s2_last_match) && (beg_s2 - @pos_s2_last_match < 5)
|
134
|
+
[10, 20, 30].each do |size_window|
|
135
|
+
beg_s2_candidates.each do |beg_s2|
|
136
|
+
# if both the beginning points are sufficiently close to the end points of the last match
|
109
137
|
if beg_s1 > 0 && beg_s2 > 0 && (beg_s1 - @pos_s1_last_match < 5) && (beg_s2 - @pos_s2_last_match < 5)
|
110
|
-
|
111
|
-
valid_beg_s2 = beg_s2
|
112
|
-
next
|
138
|
+
return beg_s2
|
113
139
|
end
|
114
140
|
|
115
141
|
left_window_s1, left_window_s2 = @method_get_left_windows.call(beg_s1, beg_s2, size_window)
|
116
142
|
if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
|
117
|
-
|
118
|
-
valid_beg_s2 = beg_s2
|
119
|
-
next
|
143
|
+
return beg_s2
|
120
144
|
end
|
121
145
|
|
122
146
|
right_window_s1, right_window_s2 = @method_get_right_windows.call(beg_s1, beg_s2, size_window)
|
123
147
|
if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
|
124
|
-
|
125
|
-
valid_beg_s2 = beg_s2
|
126
|
-
next
|
148
|
+
return beg_s2
|
127
149
|
end
|
128
150
|
end
|
129
|
-
|
130
|
-
# r == nil means that the inner loop was broken (multiple candidates had passed the tests)
|
131
|
-
# r != nil means that the inner loop was completed (with or w/o a valid beg_s2 found)
|
132
|
-
if r.nil?
|
133
|
-
valid_beg_s2 = nil
|
134
|
-
else
|
135
|
-
break
|
136
|
-
end
|
137
151
|
end
|
138
152
|
|
139
|
-
|
153
|
+
nil
|
140
154
|
end
|
141
155
|
|
142
156
|
def get_left_windows(beg_s1, beg_s2, size_window = nil)
|
@@ -145,29 +159,31 @@ class TextAlignment::AnchorFinder
|
|
145
159
|
# comment out below with the assumption that the beginning of a document gives a significant locational information
|
146
160
|
# return if beg_s1 < size_window || beg_s2 < size_window
|
147
161
|
|
148
|
-
|
162
|
+
chars1 = []
|
149
163
|
loc = beg_s1 - 1
|
150
164
|
count = 0
|
151
165
|
while count < size_window && loc >= 0
|
152
|
-
|
153
|
-
|
166
|
+
char = @s1[loc]
|
167
|
+
if alnum_char?(char)
|
168
|
+
chars1 << char
|
154
169
|
count += 1
|
155
170
|
end
|
156
171
|
loc -= 1
|
157
172
|
end
|
158
173
|
|
159
|
-
|
174
|
+
chars2 = []
|
160
175
|
loc = beg_s2 - 1
|
161
176
|
count = 0
|
162
177
|
while count < size_window && loc >= 0
|
163
|
-
|
164
|
-
|
178
|
+
char = @s2[loc]
|
179
|
+
if alnum_char?(char)
|
180
|
+
chars2 << char
|
165
181
|
count += 1
|
166
182
|
end
|
167
183
|
loc -= 1
|
168
184
|
end
|
169
185
|
|
170
|
-
[
|
186
|
+
[chars1.join, chars2.join]
|
171
187
|
end
|
172
188
|
|
173
189
|
def get_right_windows(beg_s1, beg_s2, size_window = nil)
|
@@ -176,31 +192,33 @@ class TextAlignment::AnchorFinder
|
|
176
192
|
# comment out below with the assumption that the end of a document gives significant locational information
|
177
193
|
# return if (beg_s1 + @size_ngram > (@s1.length - size_window)) || (beg_s2 + @size_ngram > (@s2.length - size_window))
|
178
194
|
|
179
|
-
|
195
|
+
chars1 = []
|
180
196
|
loc = beg_s1 + @size_ngram
|
181
197
|
len_s1 = @s1.length
|
182
198
|
count = 0
|
183
199
|
while count < size_window && loc < len_s1
|
184
|
-
|
185
|
-
|
200
|
+
char = @s1[loc]
|
201
|
+
if alnum_char?(char)
|
202
|
+
chars1 << char
|
186
203
|
count += 1
|
187
204
|
end
|
188
205
|
loc += 1
|
189
206
|
end
|
190
207
|
|
191
|
-
|
208
|
+
chars2 = []
|
192
209
|
loc = beg_s2 + @size_ngram
|
193
210
|
len_s2 = @s2.length
|
194
211
|
count = 0
|
195
212
|
while count < size_window && loc < len_s2
|
196
|
-
|
197
|
-
|
213
|
+
char = @s2[loc]
|
214
|
+
if alnum_char?(char)
|
215
|
+
chars2 << char
|
198
216
|
count += 1
|
199
217
|
end
|
200
218
|
loc += 1
|
201
219
|
end
|
202
220
|
|
203
|
-
[
|
221
|
+
[chars1.join, chars2.join]
|
204
222
|
end
|
205
223
|
|
206
224
|
def get_left_windows_no_squeeze_ws(beg_s1, beg_s2, size_window = nil)
|
@@ -245,4 +263,9 @@ class TextAlignment::AnchorFinder
|
|
245
263
|
return 0 if str1.nil? || str2.nil?
|
246
264
|
String::Similarity.cosine(str1, str2, ngram:ngram_order)
|
247
265
|
end
|
266
|
+
|
267
|
+
# Fast alphanumeric character check without regex
|
268
|
+
def alnum_char?(char)
|
269
|
+
(char >= 'a' && char <= 'z') || (char >= 'A' && char <= 'Z') || (char >= '0' && char <= '9')
|
270
|
+
end
|
248
271
|
end
|
@@ -38,7 +38,6 @@ class TextAlignment::TextAlignment
|
|
38
38
|
@original_text = text
|
39
39
|
@text_mapping = TextAlignment::CharMapping.new(text, nil, @to_ignore_whitespaces)
|
40
40
|
end
|
41
|
-
|
42
41
|
@mapped_text = @text_mapping.mapped_text
|
43
42
|
|
44
43
|
## To generate the block_alignment of the input text against the reference text
|
@@ -251,7 +250,7 @@ class TextAlignment::TextAlignment
|
|
251
250
|
if b2 < e2
|
252
251
|
_str2 = str2[b2 ... e2]
|
253
252
|
|
254
|
-
|
253
|
+
gap_result = if _str1.strip.empty? || _str2.strip.empty?
|
255
254
|
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
256
255
|
else
|
257
256
|
len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
@@ -307,6 +306,8 @@ class TextAlignment::TextAlignment
|
|
307
306
|
end
|
308
307
|
end
|
309
308
|
end
|
309
|
+
|
310
|
+
sum += gap_result
|
310
311
|
elsif b2 > e2 # when out of order
|
311
312
|
# ToDo
|
312
313
|
end
|
@@ -317,6 +318,7 @@ class TextAlignment::TextAlignment
|
|
317
318
|
cblock.nil? ? sum : sum << cblock
|
318
319
|
end
|
319
320
|
|
321
|
+
blocks2
|
320
322
|
end
|
321
323
|
|
322
324
|
def whole_block_alignment(str1, str2, cultivation_map)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.12.8
|
4
|
+
version: 0.12.10
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-
|
11
|
+
date: 2025-09-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|