text_alignment 0.12.7 → 0.12.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7da208b2cd252983fd0c7f8378130f7b13bba3df1c698f55e2133d54d9dab61d
|
4
|
+
data.tar.gz: 998405eb7c03b065faae083368a285cf526a193d7b59d22aafce643887b4150e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 246e0796040cedd989e12f7ef51f33a3209e0f82a396813ad9062323459e731c8d7b651a2461b732d90119c2e95586e1dbc296db91a99aeca574f4337aca6c8a
|
7
|
+
data.tar.gz: 06dedaa086dd878e41d26d18e427cf1d7c564583b34d047758a57e4508506cba6714abe9d8e04fa63c474ec2cc3fb9a585fef00312efe409c22b73528bdbde97
|
@@ -32,12 +32,31 @@ class TextAlignment::AnchorFinder
|
|
32
32
|
|
33
33
|
def get_next_anchor
|
34
34
|
# To find the beginning positions of an anchor ngram in s1 and s2, beginning from the last positions matched
|
35
|
+
iterations = 0
|
35
36
|
beg_s2 = for beg_s1 in @pos_s1_last_match .. @pos_s1_final_possible_begin
|
37
|
+
iterations += 1
|
36
38
|
|
37
39
|
# To skip whitespace letters
|
38
40
|
next if [' ', "\n", "\t"].include? @s1[beg_s1]
|
39
41
|
|
42
|
+
# Skip positions that start with punctuation or numbers (likely poor anchors)
|
43
|
+
next if @s1[beg_s1] =~ /[^a-zA-Z]/
|
44
|
+
|
45
|
+
# Performance optimization: skip if we've had too many failed attempts recently
|
46
|
+
if iterations > 50 && @recent_failures && @recent_failures > 10
|
47
|
+
step_size = [@recent_failures / 5, 3].min
|
48
|
+
beg_s1 += step_size
|
49
|
+
next if beg_s1 > @pos_s1_final_possible_begin
|
50
|
+
end
|
51
|
+
|
40
52
|
_beg_s2 = get_beg_s2(beg_s1)
|
53
|
+
|
54
|
+
if _beg_s2.nil?
|
55
|
+
@recent_failures = (@recent_failures || 0) + 1
|
56
|
+
else
|
57
|
+
@recent_failures = 0 # Reset on success
|
58
|
+
end
|
59
|
+
|
41
60
|
break _beg_s2 unless _beg_s2.nil?
|
42
61
|
end
|
43
62
|
|
@@ -72,7 +91,16 @@ class TextAlignment::AnchorFinder
|
|
72
91
|
# to get the anchor to search for in s2
|
73
92
|
anchor = @s1[beg_s1, @size_ngram]
|
74
93
|
|
94
|
+
# Quick frequency check: skip very short or very common ngrams
|
95
|
+
return nil if anchor.length < @size_ngram
|
96
|
+
return nil if anchor =~ /^(.)\1+$/ # Skip repeating character patterns like "aaaaaaaa"
|
97
|
+
|
98
|
+
# Skip ngrams that are mostly whitespace or punctuation
|
99
|
+
non_alnum_count = anchor.count("^a-zA-Z0-9")
|
100
|
+
return nil if non_alnum_count > @size_ngram / 2
|
101
|
+
|
75
102
|
search_position = @to_ignore_text_order ? 0 : @pos_s2_last_match
|
103
|
+
|
76
104
|
beg_s2_candidates = find_beg_s2_candidates(anchor, search_position)
|
77
105
|
return nil if beg_s2_candidates.empty?
|
78
106
|
|
@@ -87,7 +115,7 @@ class TextAlignment::AnchorFinder
|
|
87
115
|
candidates << _beg_s2
|
88
116
|
|
89
117
|
# for speed, skip anchor of high frequency
|
90
|
-
if candidates.length >
|
118
|
+
if candidates.length > 3
|
91
119
|
candidates.clear
|
92
120
|
break
|
93
121
|
end
|
@@ -202,7 +202,7 @@ class TextAlignment::CharMapping
|
|
202
202
|
|
203
203
|
# To execute the long letter mapping
|
204
204
|
char_mapping.each do |one, long|
|
205
|
-
next unless text
|
205
|
+
next unless text.include?(one)
|
206
206
|
text.gsub!(one * long.length, one) if long.length > 1
|
207
207
|
end
|
208
208
|
|
@@ -215,17 +215,29 @@ class TextAlignment::CharMapping
|
|
215
215
|
# To get squeeze positions of whitespaces to one
|
216
216
|
def get_positions_squeeze_ws_1(text)
|
217
217
|
rpositions = []
|
218
|
-
|
219
|
-
|
220
|
-
|
218
|
+
scanner = StringScanner.new(text)
|
219
|
+
|
220
|
+
while scanner.scan_until(/\s{2,}/)
|
221
|
+
len = scanner.matched_size
|
222
|
+
loc = scanner.pos - len
|
221
223
|
rpositions << [loc, len, 1]
|
222
224
|
end
|
225
|
+
|
223
226
|
rpositions
|
224
227
|
end
|
225
228
|
|
226
229
|
# To get squeeze positions of whitespaces to zero
|
227
230
|
def get_positions_squeeze_ws_0(text)
|
228
|
-
|
231
|
+
rpositions = []
|
232
|
+
scanner = StringScanner.new(text)
|
233
|
+
|
234
|
+
while scanner.scan(/\s+/)
|
235
|
+
len = scanner.matched_size
|
236
|
+
start = scanner.pos - len
|
237
|
+
rpositions << [start, len, 0]
|
238
|
+
end
|
239
|
+
|
240
|
+
rpositions
|
229
241
|
end
|
230
242
|
|
231
243
|
def squeeze_ws_1!(text)
|
@@ -38,7 +38,6 @@ class TextAlignment::TextAlignment
|
|
38
38
|
@original_text = text
|
39
39
|
@text_mapping = TextAlignment::CharMapping.new(text, nil, @to_ignore_whitespaces)
|
40
40
|
end
|
41
|
-
|
42
41
|
@mapped_text = @text_mapping.mapped_text
|
43
42
|
|
44
43
|
## To generate the block_alignment of the input text against the reference text
|
@@ -251,7 +250,7 @@ class TextAlignment::TextAlignment
|
|
251
250
|
if b2 < e2
|
252
251
|
_str2 = str2[b2 ... e2]
|
253
252
|
|
254
|
-
|
253
|
+
gap_result = if _str1.strip.empty? || _str2.strip.empty?
|
255
254
|
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
256
255
|
else
|
257
256
|
len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
@@ -307,6 +306,8 @@ class TextAlignment::TextAlignment
|
|
307
306
|
end
|
308
307
|
end
|
309
308
|
end
|
309
|
+
|
310
|
+
sum += gap_result
|
310
311
|
elsif b2 > e2 # when out of order
|
311
312
|
# ToDo
|
312
313
|
end
|
@@ -317,6 +318,7 @@ class TextAlignment::TextAlignment
|
|
317
318
|
cblock.nil? ? sum : sum << cblock
|
318
319
|
end
|
319
320
|
|
321
|
+
blocks2
|
320
322
|
end
|
321
323
|
|
322
324
|
def whole_block_alignment(str1, str2, cultivation_map)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.12.7
|
4
|
+
version: 0.12.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-
|
11
|
+
date: 2025-09-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|