text_alignment 0.12.9 → 0.12.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7da208b2cd252983fd0c7f8378130f7b13bba3df1c698f55e2133d54d9dab61d
4
- data.tar.gz: 998405eb7c03b065faae083368a285cf526a193d7b59d22aafce643887b4150e
3
+ metadata.gz: 77b16002a92b4aedc56351a568698fa4032f750b49cbdf1ee4927548a278fe2a
4
+ data.tar.gz: 40b69153c41b6af4182f15769b5dc0eef2b988d4c838d16c3f3a803a6d03cad3
5
5
  SHA512:
6
- metadata.gz: 246e0796040cedd989e12f7ef51f33a3209e0f82a396813ad9062323459e731c8d7b651a2461b732d90119c2e95586e1dbc296db91a99aeca574f4337aca6c8a
7
- data.tar.gz: 06dedaa086dd878e41d26d18e427cf1d7c564583b34d047758a57e4508506cba6714abe9d8e04fa63c474ec2cc3fb9a585fef00312efe409c22b73528bdbde97
6
+ metadata.gz: 71d13c5a54db05fba9fd32b6eaa1cd7a714a9bef0b69c83dc9fae240e302a7aa7c06591b26749a6d4206bf25ba83ddf1411d37ee74dffd26b93a782a99926983
7
+ data.tar.gz: fb67eb1ecc933228ede81ecc2690a577853a9a16d12e17e0b5be696dcc661f1aa39399ff877ad567179a0a1d3d3a07134388d87dfd7a72510b56943c87339fdb
@@ -28,6 +28,9 @@ class TextAlignment::AnchorFinder
28
28
  # positions of last match
29
29
  @pos_s1_last_match = 0
30
30
  @pos_s2_last_match = 0
31
+
32
+ # Performance: cache for character classification
33
+ @half_ngram = @size_ngram / 2
31
34
  end
32
35
 
33
36
  def get_next_anchor
@@ -36,11 +39,13 @@ class TextAlignment::AnchorFinder
36
39
  beg_s2 = for beg_s1 in @pos_s1_last_match .. @pos_s1_final_possible_begin
37
40
  iterations += 1
38
41
 
42
+ char = @s1[beg_s1]
43
+
39
44
  # To skip whitespace letters
40
- next if [' ', "\n", "\t"].include? @s1[beg_s1]
45
+ next if char == ' ' || char == "\n" || char == "\t"
41
46
 
42
47
  # Skip positions that start with punctuation or numbers (likely poor anchors)
43
- next if @s1[beg_s1] =~ /[^a-zA-Z]/
48
+ next if char < 'A' || (char > 'Z' && char < 'a') || char > 'z'
44
49
 
45
50
  # Performance optimization: skip if we've had too many failed attempts recently
46
51
  if iterations > 50 && @recent_failures && @recent_failures > 10
@@ -93,11 +98,11 @@ class TextAlignment::AnchorFinder
93
98
 
94
99
  # Quick frequency check: skip very short or very common ngrams
95
100
  return nil if anchor.length < @size_ngram
96
- return nil if anchor =~ /^(.)\1+$/ # Skip repeating character patterns like "aaaaaaaa"
101
+ return nil if anchor.chars.uniq.length == 1 # Skip repeating character patterns like "aaaaaaaa"
97
102
 
98
103
  # Skip ngrams that are mostly whitespace or punctuation
99
104
  non_alnum_count = anchor.count("^a-zA-Z0-9")
100
- return nil if non_alnum_count > @size_ngram / 2
105
+ return nil if non_alnum_count > @half_ngram
101
106
 
102
107
  search_position = @to_ignore_text_order ? 0 : @pos_s2_last_match
103
108
 
@@ -126,45 +131,26 @@ class TextAlignment::AnchorFinder
126
131
  end
127
132
 
128
133
  def find_valid_beg_s2(beg_s1, beg_s2_candidates)
129
- valid_beg_s2 = nil
130
-
131
- (10 .. 30).step(10).each do |size_window|
132
- valid_beg_s2 = nil
133
-
134
- r = beg_s2_candidates.each do |beg_s2|
135
- # if both the begining points are sufficiantly close to the end points of the last match
136
- # break if beg_s1 > 0 && beg_s2 > 0 && (beg_s1 - @pos_s1_last_match < 5) && (beg_s2 >= @pos_s2_last_match) && (beg_s2 - @pos_s2_last_match < 5)
134
+ [10, 20, 30].each do |size_window|
135
+ beg_s2_candidates.each do |beg_s2|
136
+ # if both the beginning points are sufficiently close to the end points of the last match
137
137
  if beg_s1 > 0 && beg_s2 > 0 && (beg_s1 - @pos_s1_last_match < 5) && (beg_s2 - @pos_s2_last_match < 5)
138
- break unless valid_beg_s2.nil?
139
- valid_beg_s2 = beg_s2
140
- next
138
+ return beg_s2
141
139
  end
142
140
 
143
141
  left_window_s1, left_window_s2 = @method_get_left_windows.call(beg_s1, beg_s2, size_window)
144
142
  if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
145
- break unless valid_beg_s2.nil?
146
- valid_beg_s2 = beg_s2
147
- next
143
+ return beg_s2
148
144
  end
149
145
 
150
146
  right_window_s1, right_window_s2 = @method_get_right_windows.call(beg_s1, beg_s2, size_window)
151
147
  if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
152
- break unless valid_beg_s2.nil?
153
- valid_beg_s2 = beg_s2
154
- next
148
+ return beg_s2
155
149
  end
156
150
  end
157
-
158
- # r == nil means that the inner loop was broken (multiple candidates had passed the tests)
159
- # r != nil means that the inner loop was completed (with or w/o a valid beg_s2 found)
160
- if r.nil?
161
- valid_beg_s2 = nil
162
- else
163
- break
164
- end
165
151
  end
166
152
 
167
- valid_beg_s2
153
+ nil
168
154
  end
169
155
 
170
156
  def get_left_windows(beg_s1, beg_s2, size_window = nil)
@@ -173,29 +159,31 @@ class TextAlignment::AnchorFinder
173
159
  # comment out below with the assumption that the beginning of a document gives a significant locational information
174
160
  # return if beg_s1 < size_window || beg_s2 < size_window
175
161
 
176
- window_s1 = ''
162
+ chars1 = []
177
163
  loc = beg_s1 - 1
178
164
  count = 0
179
165
  while count < size_window && loc >= 0
180
- if @s1[loc] =~ /[0-9a-zA-Z]/
181
- window_s1 += @s1[loc]
166
+ char = @s1[loc]
167
+ if alnum_char?(char)
168
+ chars1 << char
182
169
  count += 1
183
170
  end
184
171
  loc -= 1
185
172
  end
186
173
 
187
- window_s2 = ''
174
+ chars2 = []
188
175
  loc = beg_s2 - 1
189
176
  count = 0
190
177
  while count < size_window && loc >= 0
191
- if @s2[loc] =~ /[0-9a-zA-Z]/
192
- window_s2 += @s2[loc]
178
+ char = @s2[loc]
179
+ if alnum_char?(char)
180
+ chars2 << char
193
181
  count += 1
194
182
  end
195
183
  loc -= 1
196
184
  end
197
185
 
198
- [window_s1, window_s2]
186
+ [chars1.join, chars2.join]
199
187
  end
200
188
 
201
189
  def get_right_windows(beg_s1, beg_s2, size_window = nil)
@@ -204,31 +192,33 @@ class TextAlignment::AnchorFinder
204
192
  # commend below with the assumption that the end of a document gives a significant locational
205
193
  # return if (beg_s1 + @size_ngram > (@s1.length - size_window)) || (beg_s2 + @size_ngram > (@s2.length - size_window))
206
194
 
207
- window_s1 = ''
195
+ chars1 = []
208
196
  loc = beg_s1 + @size_ngram
209
197
  len_s1 = @s1.length
210
198
  count = 0
211
199
  while count < size_window && loc < len_s1
212
- if @s1[loc] =~ /[0-9a-zA-Z]/
213
- window_s1 += @s1[loc]
200
+ char = @s1[loc]
201
+ if alnum_char?(char)
202
+ chars1 << char
214
203
  count += 1
215
204
  end
216
205
  loc += 1
217
206
  end
218
207
 
219
- window_s2 = ''
208
+ chars2 = []
220
209
  loc = beg_s2 + @size_ngram
221
210
  len_s2 = @s2.length
222
211
  count = 0
223
212
  while count < size_window && loc < len_s2
224
- if @s2[loc] =~ /[0-9a-zA-Z]/
225
- window_s2 += @s2[loc]
213
+ char = @s2[loc]
214
+ if alnum_char?(char)
215
+ chars2 << char
226
216
  count += 1
227
217
  end
228
218
  loc += 1
229
219
  end
230
220
 
231
- [window_s1, window_s2]
221
+ [chars1.join, chars2.join]
232
222
  end
233
223
 
234
224
  def get_left_windows_no_squeeze_ws(beg_s1, beg_s2, size_window = nil)
@@ -273,4 +263,9 @@ class TextAlignment::AnchorFinder
273
263
  return 0 if str1.nil? || str2.nil?
274
264
  String::Similarity.cosine(str1, str2, ngram:ngram_order)
275
265
  end
266
+
267
+ # Fast alphanumeric character check without regex
268
+ def alnum_char?(char)
269
+ (char >= 'a' && char <= 'z') || (char >= 'A' && char <= 'Z') || (char >= '0' && char <= '9')
270
+ end
276
271
  end
@@ -151,29 +151,31 @@ class TextAlignment::CharMapping
151
151
 
152
152
  def enmap_text(_text, char_mapping, no_ws = false)
153
153
  text = _text.dup
154
-
155
- # To perform the single letter mapping replacement
156
- char_mapping.each do |one, long|
157
- # text.gsub!(one, long) if long.length == 1
158
- text.tr!(one, long) if long.length == 1
159
- end
160
-
161
- # To get the replacement positions, (position, old_length, new_length), for char mappings
162
154
  rpositions = []
163
- char_mapping.each do |one, long|
164
- next if long.length == 1
165
155
 
166
- init_next = 0
167
- while loc = text.index(long, init_next)
168
- # Huristics to check if the surrounding letters are sufficiently distinguished.
169
- if long.length > 3 || ((text[loc - 1, 2] !~ /[a-z][a-z]/) && (text[loc + long.length - 1, 2] !~ /[a-z][a-z]/))
170
- # if true
171
- rpositions << [loc, long.length, 1]
156
+ # Skip character mapping if text is pure ASCII (performance optimization)
157
+ unless text.ascii_only?
158
+ # To perform the single letter mapping replacement
159
+ char_mapping.each do |one, long|
160
+ text.gsub!(one, long) if long.length == 1
161
+ end
172
162
 
173
- # a workaround to avoid messing-up due to embedding
174
- text[loc, long.length] = one * long.length
163
+ # To get the replacement positions, (position, old_length, new_length), for char mappings
164
+ char_mapping.each do |one, long|
165
+ next if long.length == 1
166
+
167
+ init_next = 0
168
+ while loc = text.index(long, init_next)
169
+ # Huristics to check if the surrounding letters are sufficiently distinguished.
170
+ if long.length > 3 || ((text[loc - 1, 2] !~ /[a-z][a-z]/) && (text[loc + long.length - 1, 2] !~ /[a-z][a-z]/))
171
+ # if true
172
+ rpositions << [loc, long.length, 1]
173
+
174
+ # a workaround to avoid messing-up due to embedding
175
+ text[loc, long.length] = one * long.length
176
+ end
177
+ init_next = loc + long.length
175
178
  end
176
- init_next = loc + long.length
177
179
  end
178
180
  end
179
181
 
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.12.9'
2
+ VERSION = '0.12.11'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.12.9
4
+ version: 0.12.11
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2025-09-30 00:00:00.000000000 Z
11
+ date: 2025-10-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary