text_alignment 0.12.8 → 0.12.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/text_alignment/anchor_finder.rb +29 -1
- data/lib/text_alignment/text_alignment.rb +4 -2
- data/lib/text_alignment/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7da208b2cd252983fd0c7f8378130f7b13bba3df1c698f55e2133d54d9dab61d
|
4
|
+
data.tar.gz: 998405eb7c03b065faae083368a285cf526a193d7b59d22aafce643887b4150e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 246e0796040cedd989e12f7ef51f33a3209e0f82a396813ad9062323459e731c8d7b651a2461b732d90119c2e95586e1dbc296db91a99aeca574f4337aca6c8a
|
7
|
+
data.tar.gz: 06dedaa086dd878e41d26d18e427cf1d7c564583b34d047758a57e4508506cba6714abe9d8e04fa63c474ec2cc3fb9a585fef00312efe409c22b73528bdbde97
|
data/lib/text_alignment/anchor_finder.rb
CHANGED
@@ -32,12 +32,31 @@ class TextAlignment::AnchorFinder
|
|
32
32
|
|
33
33
|
def get_next_anchor
|
34
34
|
# To find the beginning positions of an anchor ngram in s1 and s2, beginning from the last positions matched
|
35
|
+
iterations = 0
|
35
36
|
beg_s2 = for beg_s1 in @pos_s1_last_match .. @pos_s1_final_possible_begin
|
37
|
+
iterations += 1
|
36
38
|
|
37
39
|
# To skip whitespace letters
|
38
40
|
next if [' ', "\n", "\t"].include? @s1[beg_s1]
|
39
41
|
|
42
|
+
# Skip positions that start with punctuation or numbers (likely poor anchors)
|
43
|
+
next if @s1[beg_s1] =~ /[^a-zA-Z]/
|
44
|
+
|
45
|
+
# Performance optimization: skip if we've had too many failed attempts recently
|
46
|
+
if iterations > 50 && @recent_failures && @recent_failures > 10
|
47
|
+
step_size = [@recent_failures / 5, 3].min
|
48
|
+
beg_s1 += step_size
|
49
|
+
next if beg_s1 > @pos_s1_final_possible_begin
|
50
|
+
end
|
51
|
+
|
40
52
|
_beg_s2 = get_beg_s2(beg_s1)
|
53
|
+
|
54
|
+
if _beg_s2.nil?
|
55
|
+
@recent_failures = (@recent_failures || 0) + 1
|
56
|
+
else
|
57
|
+
@recent_failures = 0 # Reset on success
|
58
|
+
end
|
59
|
+
|
41
60
|
break _beg_s2 unless _beg_s2.nil?
|
42
61
|
end
|
43
62
|
|
@@ -72,7 +91,16 @@ class TextAlignment::AnchorFinder
|
|
72
91
|
# to get the anchor to search for in s2
|
73
92
|
anchor = @s1[beg_s1, @size_ngram]
|
74
93
|
|
94
|
+
# Quick frequency check: skip very short or very common ngrams
|
95
|
+
return nil if anchor.length < @size_ngram
|
96
|
+
return nil if anchor =~ /^(.)\1+$/ # Skip repeating character patterns like "aaaaaaaa"
|
97
|
+
|
98
|
+
# Skip ngrams that are mostly whitespace or punctuation
|
99
|
+
non_alnum_count = anchor.count("^a-zA-Z0-9")
|
100
|
+
return nil if non_alnum_count > @size_ngram / 2
|
101
|
+
|
75
102
|
search_position = @to_ignore_text_order ? 0 : @pos_s2_last_match
|
103
|
+
|
76
104
|
beg_s2_candidates = find_beg_s2_candidates(anchor, search_position)
|
77
105
|
return nil if beg_s2_candidates.empty?
|
78
106
|
|
@@ -87,7 +115,7 @@ class TextAlignment::AnchorFinder
|
|
87
115
|
candidates << _beg_s2
|
88
116
|
|
89
117
|
# for speed, skip anchor of high frequency
|
90
|
-
if candidates.length >
|
118
|
+
if candidates.length > 3
|
91
119
|
candidates.clear
|
92
120
|
break
|
93
121
|
end
|
data/lib/text_alignment/text_alignment.rb
CHANGED
@@ -38,7 +38,6 @@ class TextAlignment::TextAlignment
|
|
38
38
|
@original_text = text
|
39
39
|
@text_mapping = TextAlignment::CharMapping.new(text, nil, @to_ignore_whitespaces)
|
40
40
|
end
|
41
|
-
|
42
41
|
@mapped_text = @text_mapping.mapped_text
|
43
42
|
|
44
43
|
## To generate the block_alignment of the input text against the reference text
|
@@ -251,7 +250,7 @@ class TextAlignment::TextAlignment
|
|
251
250
|
if b2 < e2
|
252
251
|
_str2 = str2[b2 ... e2]
|
253
252
|
|
254
|
-
|
253
|
+
gap_result = if _str1.strip.empty? || _str2.strip.empty?
|
255
254
|
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
256
255
|
else
|
257
256
|
len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
@@ -307,6 +306,8 @@ class TextAlignment::TextAlignment
|
|
307
306
|
end
|
308
307
|
end
|
309
308
|
end
|
309
|
+
|
310
|
+
sum += gap_result
|
310
311
|
elsif b2 > e2 # when out of order
|
311
312
|
# ToDo
|
312
313
|
end
|
@@ -317,6 +318,7 @@ class TextAlignment::TextAlignment
|
|
317
318
|
cblock.nil? ? sum : sum << cblock
|
318
319
|
end
|
319
320
|
|
321
|
+
blocks2
|
320
322
|
end
|
321
323
|
|
322
324
|
def whole_block_alignment(str1, str2, cultivation_map)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.12.8
|
4
|
+
version: 0.12.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-
|
11
|
+
date: 2025-09-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|