text_alignment 0.12.8 → 0.12.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 528dd8cf88da73d13e8933b69ce2cfb27a1cbd245392e072c53c2c02bd1b57c4
4
- data.tar.gz: b74d808fe3412f704e7770ac0ac50a645ef75e9c8667f715324f7769f3bf151f
3
+ metadata.gz: 2036a650644cd3b814d8c74d9a7bc7c4cb647adf51b38a5965b5f0f121144276
4
+ data.tar.gz: 4619c1c428626857d1d189cb1a6d64c870cc3a427c1e31ea0fb8f4decb417b88
5
5
  SHA512:
6
- metadata.gz: 461745ed09343a23ab2a5ca008cc2625ec7679573c57bbb1e70e1f76cd2d51478ed2bcbb47197aee76255de3404897138ec569cfb93149690b974f26d203594d
7
- data.tar.gz: 937f951a7e84bf065ab7ad4818d7d9731f365c16abadf165cb9daca27eefde83f0d695b33b60b5d54a3af97ea3b6c963b95183ff25e19794fa005474602ee6e9
6
+ metadata.gz: 9fa5d56dbd8bc4372022e3d58b82958bc3561155a8d33c9f9e05ec48f96ef431d1309e4634a11909b31f32609491f8691a3b8ad41f798e7bb584fd5aa3b5c7ac
7
+ data.tar.gz: fb58b8aea21cfd8a9ebf24edc693d63397ee7a8f72d175feb8da63a0d0160bb3a4740202e7ce5dd9ba7803d9ad18ebeb38f98e493741d46517c5007f40c8aba9
@@ -28,16 +28,40 @@ class TextAlignment::AnchorFinder
28
28
  # positions of last match
29
29
  @pos_s1_last_match = 0
30
30
  @pos_s2_last_match = 0
31
+
32
+ # Performance: cache for character classification
33
+ @half_ngram = @size_ngram / 2
31
34
  end
32
35
 
33
36
  def get_next_anchor
34
37
  # To find the beginning positions of an anchor ngram in s1 and s2, beginning from the last positions matched
38
+ iterations = 0
35
39
  beg_s2 = for beg_s1 in @pos_s1_last_match .. @pos_s1_final_possible_begin
40
+ iterations += 1
41
+
42
+ char = @s1[beg_s1]
36
43
 
37
44
  # To skip whitespace letters
38
- next if [' ', "\n", "\t"].include? @s1[beg_s1]
45
+ next if char == ' ' || char == "\n" || char == "\t"
46
+
47
+ # Skip positions that start with punctuation or numbers (likely poor anchors)
48
+ next if char < 'A' || (char > 'Z' && char < 'a') || char > 'z'
49
+
50
+ # Performance optimization: skip if we've had too many failed attempts recently
51
+ if iterations > 50 && @recent_failures && @recent_failures > 10
52
+ step_size = [@recent_failures / 5, 3].min
53
+ beg_s1 += step_size
54
+ next if beg_s1 > @pos_s1_final_possible_begin
55
+ end
39
56
 
40
57
  _beg_s2 = get_beg_s2(beg_s1)
58
+
59
+ if _beg_s2.nil?
60
+ @recent_failures = (@recent_failures || 0) + 1
61
+ else
62
+ @recent_failures = 0 # Reset on success
63
+ end
64
+
41
65
  break _beg_s2 unless _beg_s2.nil?
42
66
  end
43
67
 
@@ -72,7 +96,16 @@ class TextAlignment::AnchorFinder
72
96
  # to get the anchor to search for in s2
73
97
  anchor = @s1[beg_s1, @size_ngram]
74
98
 
99
+ # Quick frequency check: skip very short or very common ngrams
100
+ return nil if anchor.length < @size_ngram
101
+ return nil if anchor.chars.uniq.length == 1 # Skip repeating character patterns like "aaaaaaaa"
102
+
103
+ # Skip ngrams that are mostly whitespace or punctuation
104
+ non_alnum_count = anchor.count("^a-zA-Z0-9")
105
+ return nil if non_alnum_count > @half_ngram
106
+
75
107
  search_position = @to_ignore_text_order ? 0 : @pos_s2_last_match
108
+
76
109
  beg_s2_candidates = find_beg_s2_candidates(anchor, search_position)
77
110
  return nil if beg_s2_candidates.empty?
78
111
 
@@ -87,7 +120,7 @@ class TextAlignment::AnchorFinder
87
120
  candidates << _beg_s2
88
121
 
89
122
  # for speed, skip anchor of high frequency
90
- if candidates.length > 5
123
+ if candidates.length > 3
91
124
  candidates.clear
92
125
  break
93
126
  end
@@ -98,45 +131,26 @@ class TextAlignment::AnchorFinder
98
131
  end
99
132
 
100
133
  def find_valid_beg_s2(beg_s1, beg_s2_candidates)
101
- valid_beg_s2 = nil
102
-
103
- (10 .. 30).step(10).each do |size_window|
104
- valid_beg_s2 = nil
105
-
106
- r = beg_s2_candidates.each do |beg_s2|
107
- # if both the begining points are sufficiantly close to the end points of the last match
108
- # break if beg_s1 > 0 && beg_s2 > 0 && (beg_s1 - @pos_s1_last_match < 5) && (beg_s2 >= @pos_s2_last_match) && (beg_s2 - @pos_s2_last_match < 5)
134
+ [10, 20, 30].each do |size_window|
135
+ beg_s2_candidates.each do |beg_s2|
136
+ # if both the beginning points are sufficiently close to the end points of the last match
109
137
  if beg_s1 > 0 && beg_s2 > 0 && (beg_s1 - @pos_s1_last_match < 5) && (beg_s2 - @pos_s2_last_match < 5)
110
- break unless valid_beg_s2.nil?
111
- valid_beg_s2 = beg_s2
112
- next
138
+ return beg_s2
113
139
  end
114
140
 
115
141
  left_window_s1, left_window_s2 = @method_get_left_windows.call(beg_s1, beg_s2, size_window)
116
142
  if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
117
- break unless valid_beg_s2.nil?
118
- valid_beg_s2 = beg_s2
119
- next
143
+ return beg_s2
120
144
  end
121
145
 
122
146
  right_window_s1, right_window_s2 = @method_get_right_windows.call(beg_s1, beg_s2, size_window)
123
147
  if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
124
- break unless valid_beg_s2.nil?
125
- valid_beg_s2 = beg_s2
126
- next
148
+ return beg_s2
127
149
  end
128
150
  end
129
-
130
- # r == nil means that the inner loop was broken (multiple candidates had passed the tests)
131
- # r != nil means that the inner loop was completed (with or w/o a valid beg_s2 found)
132
- if r.nil?
133
- valid_beg_s2 = nil
134
- else
135
- break
136
- end
137
151
  end
138
152
 
139
- valid_beg_s2
153
+ nil
140
154
  end
141
155
 
142
156
  def get_left_windows(beg_s1, beg_s2, size_window = nil)
@@ -145,29 +159,31 @@ class TextAlignment::AnchorFinder
145
159
  # comment out below with the assumption that the beginning of a document gives a significant locational information
146
160
  # return if beg_s1 < size_window || beg_s2 < size_window
147
161
 
148
- window_s1 = ''
162
+ chars1 = []
149
163
  loc = beg_s1 - 1
150
164
  count = 0
151
165
  while count < size_window && loc >= 0
152
- if @s1[loc] =~ /[0-9a-zA-Z]/
153
- window_s1 += @s1[loc]
166
+ char = @s1[loc]
167
+ if alnum_char?(char)
168
+ chars1 << char
154
169
  count += 1
155
170
  end
156
171
  loc -= 1
157
172
  end
158
173
 
159
- window_s2 = ''
174
+ chars2 = []
160
175
  loc = beg_s2 - 1
161
176
  count = 0
162
177
  while count < size_window && loc >= 0
163
- if @s2[loc] =~ /[0-9a-zA-Z]/
164
- window_s2 += @s2[loc]
178
+ char = @s2[loc]
179
+ if alnum_char?(char)
180
+ chars2 << char
165
181
  count += 1
166
182
  end
167
183
  loc -= 1
168
184
  end
169
185
 
170
- [window_s1, window_s2]
186
+ [chars1.join, chars2.join]
171
187
  end
172
188
 
173
189
  def get_right_windows(beg_s1, beg_s2, size_window = nil)
@@ -176,31 +192,33 @@ class TextAlignment::AnchorFinder
176
192
  # commend below with the assumption that the end of a document gives a significant locational
177
193
  # return if (beg_s1 + @size_ngram > (@s1.length - size_window)) || (beg_s2 + @size_ngram > (@s2.length - size_window))
178
194
 
179
- window_s1 = ''
195
+ chars1 = []
180
196
  loc = beg_s1 + @size_ngram
181
197
  len_s1 = @s1.length
182
198
  count = 0
183
199
  while count < size_window && loc < len_s1
184
- if @s1[loc] =~ /[0-9a-zA-Z]/
185
- window_s1 += @s1[loc]
200
+ char = @s1[loc]
201
+ if alnum_char?(char)
202
+ chars1 << char
186
203
  count += 1
187
204
  end
188
205
  loc += 1
189
206
  end
190
207
 
191
- window_s2 = ''
208
+ chars2 = []
192
209
  loc = beg_s2 + @size_ngram
193
210
  len_s2 = @s2.length
194
211
  count = 0
195
212
  while count < size_window && loc < len_s2
196
- if @s2[loc] =~ /[0-9a-zA-Z]/
197
- window_s2 += @s2[loc]
213
+ char = @s2[loc]
214
+ if alnum_char?(char)
215
+ chars2 << char
198
216
  count += 1
199
217
  end
200
218
  loc += 1
201
219
  end
202
220
 
203
- [window_s1, window_s2]
221
+ [chars1.join, chars2.join]
204
222
  end
205
223
 
206
224
  def get_left_windows_no_squeeze_ws(beg_s1, beg_s2, size_window = nil)
@@ -245,4 +263,9 @@ class TextAlignment::AnchorFinder
245
263
  return 0 if str1.nil? || str2.nil?
246
264
  String::Similarity.cosine(str1, str2, ngram:ngram_order)
247
265
  end
266
+
267
+ # Fast alphanumeric character check without regex
268
+ def alnum_char?(char)
269
+ (char >= 'a' && char <= 'z') || (char >= 'A' && char <= 'Z') || (char >= '0' && char <= '9')
270
+ end
248
271
  end
@@ -38,7 +38,6 @@ class TextAlignment::TextAlignment
38
38
  @original_text = text
39
39
  @text_mapping = TextAlignment::CharMapping.new(text, nil, @to_ignore_whitespaces)
40
40
  end
41
-
42
41
  @mapped_text = @text_mapping.mapped_text
43
42
 
44
43
  ## To generate the block_alignment of the input text against the reference text
@@ -251,7 +250,7 @@ class TextAlignment::TextAlignment
251
250
  if b2 < e2
252
251
  _str2 = str2[b2 ... e2]
253
252
 
254
- sum += if _str1.strip.empty? || _str2.strip.empty?
253
+ gap_result = if _str1.strip.empty? || _str2.strip.empty?
255
254
  [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
256
255
  else
257
256
  len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
@@ -307,6 +306,8 @@ class TextAlignment::TextAlignment
307
306
  end
308
307
  end
309
308
  end
309
+
310
+ sum += gap_result
310
311
  elsif b2 > e2 # when out of order
311
312
  # ToDo
312
313
  end
@@ -317,6 +318,7 @@ class TextAlignment::TextAlignment
317
318
  cblock.nil? ? sum : sum << cblock
318
319
  end
319
320
 
321
+ blocks2
320
322
  end
321
323
 
322
324
  def whole_block_alignment(str1, str2, cultivation_map)
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.12.8'
2
+ VERSION = '0.12.10'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.12.8
4
+ version: 0.12.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2025-05-25 00:00:00.000000000 Z
11
+ date: 2025-09-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary