text_alignment 0.12.7 → 0.12.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 354d6971ab0c7d6e80af0633512d5f7257b5e6e1a4bef021bfc1e1eee6e546c1
4
- data.tar.gz: bae6c626e170de96ae32beb24c9d074b1e56283e497dffc53c2da270c20de058
3
+ metadata.gz: 7da208b2cd252983fd0c7f8378130f7b13bba3df1c698f55e2133d54d9dab61d
4
+ data.tar.gz: 998405eb7c03b065faae083368a285cf526a193d7b59d22aafce643887b4150e
5
5
  SHA512:
6
- metadata.gz: 3409a5c7419c43a311e76468a6783b6a6a808c12dd14343eace7c0242dc3cd79d01616069dce12d49f5db8cc77b293ab469efa07b6a5878e1bade341f24f1c1d
7
- data.tar.gz: fe7df5b352f14989701c0dbf0bc94169b415a5c155b7e04f670803d154576e4a39ea7dddf3f320a9df8e8e9c3fb3bc67ddc051d7e3d939ca9bc7a58de1b3b952
6
+ metadata.gz: 246e0796040cedd989e12f7ef51f33a3209e0f82a396813ad9062323459e731c8d7b651a2461b732d90119c2e95586e1dbc296db91a99aeca574f4337aca6c8a
7
+ data.tar.gz: 06dedaa086dd878e41d26d18e427cf1d7c564583b34d047758a57e4508506cba6714abe9d8e04fa63c474ec2cc3fb9a585fef00312efe409c22b73528bdbde97
@@ -32,12 +32,31 @@ class TextAlignment::AnchorFinder
32
32
 
33
33
  def get_next_anchor
34
34
  # To find the beginning positions of an anchor ngram in s1 and s2, beginning from the last positions matched
35
+ iterations = 0
35
36
  beg_s2 = for beg_s1 in @pos_s1_last_match .. @pos_s1_final_possible_begin
37
+ iterations += 1
36
38
 
37
39
  # To skip whitespace letters
38
40
  next if [' ', "\n", "\t"].include? @s1[beg_s1]
39
41
 
42
+ # Skip positions that start with punctuation or numbers (likely poor anchors)
43
+ next if @s1[beg_s1] =~ /[^a-zA-Z]/
44
+
45
+ # Performance optimization: skip if we've had too many failed attempts recently
46
+ if iterations > 50 && @recent_failures && @recent_failures > 10
47
+ step_size = [@recent_failures / 5, 3].min
48
+ beg_s1 += step_size
49
+ next if beg_s1 > @pos_s1_final_possible_begin
50
+ end
51
+
40
52
  _beg_s2 = get_beg_s2(beg_s1)
53
+
54
+ if _beg_s2.nil?
55
+ @recent_failures = (@recent_failures || 0) + 1
56
+ else
57
+ @recent_failures = 0 # Reset on success
58
+ end
59
+
41
60
  break _beg_s2 unless _beg_s2.nil?
42
61
  end
43
62
 
@@ -72,7 +91,16 @@ class TextAlignment::AnchorFinder
72
91
  # to get the anchor to search for in s2
73
92
  anchor = @s1[beg_s1, @size_ngram]
74
93
 
94
+ # Quick frequency check: skip very short or very common ngrams
95
+ return nil if anchor.length < @size_ngram
96
+ return nil if anchor =~ /^(.)\1+$/ # Skip repeating character patterns like "aaaaaaaa"
97
+
98
+ # Skip ngrams that are mostly whitespace or punctuation
99
+ non_alnum_count = anchor.count("^a-zA-Z0-9")
100
+ return nil if non_alnum_count > @size_ngram / 2
101
+
75
102
  search_position = @to_ignore_text_order ? 0 : @pos_s2_last_match
103
+
76
104
  beg_s2_candidates = find_beg_s2_candidates(anchor, search_position)
77
105
  return nil if beg_s2_candidates.empty?
78
106
 
@@ -87,7 +115,7 @@ class TextAlignment::AnchorFinder
87
115
  candidates << _beg_s2
88
116
 
89
117
  # for speed, skip anchor of high frequency
90
- if candidates.length > 5
118
+ if candidates.length > 3
91
119
  candidates.clear
92
120
  break
93
121
  end
@@ -202,7 +202,7 @@ class TextAlignment::CharMapping
202
202
 
203
203
  # To execute the long letter mapping
204
204
  char_mapping.each do |one, long|
205
- next unless text =~ /#{one}/
205
+ next unless text.include?(one)
206
206
  text.gsub!(one * long.length, one) if long.length > 1
207
207
  end
208
208
 
@@ -215,17 +215,29 @@ class TextAlignment::CharMapping
215
215
  # To get squeeze positions of whitespaces to one
216
216
  def get_positions_squeeze_ws_1(text)
217
217
  rpositions = []
218
- text.scan(/\s{2,}/) do |s|
219
- loc = $~.begin(0)
220
- len = $~.end(0) - loc
218
+ scanner = StringScanner.new(text)
219
+
220
+ while scanner.scan_until(/\s{2,}/)
221
+ len = scanner.matched_size
222
+ loc = scanner.pos - len
221
223
  rpositions << [loc, len, 1]
222
224
  end
225
+
223
226
  rpositions
224
227
  end
225
228
 
226
229
  # To get squeeze positions of whitespaces to zero
227
230
  def get_positions_squeeze_ws_0(text)
228
- text.enum_for(:scan, /\s+/).map{[b = $~.begin(0), $~.end(0) - b, 0]}
231
+ rpositions = []
232
+ scanner = StringScanner.new(text)
233
+
234
+ while scanner.scan(/\s+/)
235
+ len = scanner.matched_size
236
+ start = scanner.pos - len
237
+ rpositions << [start, len, 0]
238
+ end
239
+
240
+ rpositions
229
241
  end
230
242
 
231
243
  def squeeze_ws_1!(text)
@@ -38,7 +38,6 @@ class TextAlignment::TextAlignment
38
38
  @original_text = text
39
39
  @text_mapping = TextAlignment::CharMapping.new(text, nil, @to_ignore_whitespaces)
40
40
  end
41
-
42
41
  @mapped_text = @text_mapping.mapped_text
43
42
 
44
43
  ## To generate the block_alignment of the input text against the reference text
@@ -251,7 +250,7 @@ class TextAlignment::TextAlignment
251
250
  if b2 < e2
252
251
  _str2 = str2[b2 ... e2]
253
252
 
254
- sum += if _str1.strip.empty? || _str2.strip.empty?
253
+ gap_result = if _str1.strip.empty? || _str2.strip.empty?
255
254
  [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
256
255
  else
257
256
  len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
@@ -307,6 +306,8 @@ class TextAlignment::TextAlignment
307
306
  end
308
307
  end
309
308
  end
309
+
310
+ sum += gap_result
310
311
  elsif b2 > e2 # when out of order
311
312
  # ToDo
312
313
  end
@@ -317,6 +318,7 @@ class TextAlignment::TextAlignment
317
318
  cblock.nil? ? sum : sum << cblock
318
319
  end
319
320
 
321
+ blocks2
320
322
  end
321
323
 
322
324
  def whole_block_alignment(str1, str2, cultivation_map)
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.12.7'
2
+ VERSION = '0.12.9'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.12.7
4
+ version: 0.12.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2025-05-22 00:00:00.000000000 Z
11
+ date: 2025-09-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary