RubyGems - text_alignment - Versions diffs - 0.12.7 → 0.12.9 - Mend

text_alignment 0.12.7 → 0.12.9

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Files changed (6)

checksums.yaml +4 -4
data/lib/text_alignment/anchor_finder.rb +29 -1
data/lib/text_alignment/char_mapping.rb +17 -5
data/lib/text_alignment/text_alignment.rb +4 -2
data/lib/text_alignment/version.rb +1 -1
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 354d6971ab0c7d6e80af0633512d5f7257b5e6e1a4bef021bfc1e1eee6e546c1
-  data.tar.gz: bae6c626e170de96ae32beb24c9d074b1e56283e497dffc53c2da270c20de058
+  metadata.gz: 7da208b2cd252983fd0c7f8378130f7b13bba3df1c698f55e2133d54d9dab61d
+  data.tar.gz: 998405eb7c03b065faae083368a285cf526a193d7b59d22aafce643887b4150e
 SHA512:
-  metadata.gz: 3409a5c7419c43a311e76468a6783b6a6a808c12dd14343eace7c0242dc3cd79d01616069dce12d49f5db8cc77b293ab469efa07b6a5878e1bade341f24f1c1d
-  data.tar.gz: fe7df5b352f14989701c0dbf0bc94169b415a5c155b7e04f670803d154576e4a39ea7dddf3f320a9df8e8e9c3fb3bc67ddc051d7e3d939ca9bc7a58de1b3b952
+  metadata.gz: 246e0796040cedd989e12f7ef51f33a3209e0f82a396813ad9062323459e731c8d7b651a2461b732d90119c2e95586e1dbc296db91a99aeca574f4337aca6c8a
+  data.tar.gz: 06dedaa086dd878e41d26d18e427cf1d7c564583b34d047758a57e4508506cba6714abe9d8e04fa63c474ec2cc3fb9a585fef00312efe409c22b73528bdbde97

data/lib/text_alignment/anchor_finder.rb CHANGED Viewed

@@ -32,12 +32,31 @@ class TextAlignment::AnchorFinder
 	def get_next_anchor
 		# To find the beginning positions of an anchor ngram in s1 and s2, beginning from the last positions matched
+		iterations = 0
 		beg_s2 = for beg_s1 in @pos_s1_last_match .. @pos_s1_final_possible_begin
+			iterations += 1
 			# To skip whitespace letters
 			next if [' ', "\n", "\t"].include? @s1[beg_s1]
+			# Skip positions that start with punctuation or numbers (likely poor anchors)
+			next if @s1[beg_s1] =~ /[^a-zA-Z]/
+			# Performance optimization: skip if we've had too many failed attempts recently
+			if iterations > 50 && @recent_failures && @recent_failures > 10
+				step_size = [@recent_failures / 5, 3].min
+				beg_s1 += step_size
+				next if beg_s1 > @pos_s1_final_possible_begin
+			end
 			_beg_s2 = get_beg_s2(beg_s1)
+			if _beg_s2.nil?
+				@recent_failures = (@recent_failures || 0) + 1
+			else
+				@recent_failures = 0  # Reset on success
+			end
 			break _beg_s2 unless _beg_s2.nil?
 		end
@@ -72,7 +91,16 @@ class TextAlignment::AnchorFinder
 		# to get the anchor to search for in s2
 		anchor = @s1[beg_s1, @size_ngram]
+		# Quick frequency check: skip very short or very common ngrams
+		return nil if anchor.length < @size_ngram
+		return nil if anchor =~ /^(.)\1+$/ # Skip repeating character patterns like "aaaaaaaa"
+		# Skip ngrams that are mostly whitespace or punctuation
+		non_alnum_count = anchor.count("^a-zA-Z0-9")
+		return nil if non_alnum_count > @size_ngram / 2
 		search_position = @to_ignore_text_order ? 0 : @pos_s2_last_match
 		beg_s2_candidates = find_beg_s2_candidates(anchor, search_position)
 		return nil if beg_s2_candidates.empty?
@@ -87,7 +115,7 @@ class TextAlignment::AnchorFinder
 			candidates << _beg_s2
 			# for speed, skip anchor of high frequency
-			if candidates.length > 5
+			if candidates.length > 3
 				candidates.clear
 				break
 			end

data/lib/text_alignment/char_mapping.rb CHANGED Viewed

@@ -202,7 +202,7 @@ class TextAlignment::CharMapping
 		# To execute the long letter mapping
 		char_mapping.each do |one, long|
-			next unless text =~ /#{one}/
+			next unless text.include?(one)
 			text.gsub!(one * long.length, one) if long.length > 1
 		end
@@ -215,17 +215,29 @@ class TextAlignment::CharMapping
 	# To get squeeze positions of whitespaces to one
 	def get_positions_squeeze_ws_1(text)
 		rpositions = []
-		text.scan(/\s{2,}/) do |s|
-			loc = $~.begin(0)
-			len = $~.end(0) - loc
+		scanner    = StringScanner.new(text)
+		while scanner.scan_until(/\s{2,}/)
+			len = scanner.matched_size
+			loc = scanner.pos - len
 			rpositions << [loc, len, 1]
 		end
 		rpositions
 	end
 	# To get squeeze positions of whitespaces to zero
 	def get_positions_squeeze_ws_0(text)
-		text.enum_for(:scan, /\s+/).map{[b = $~.begin(0), $~.end(0) - b, 0]}
+		rpositions = []
+		scanner    = StringScanner.new(text)
+		while scanner.scan(/\s+/)
+			len   = scanner.matched_size
+			start = scanner.pos - len
+			rpositions << [start, len, 0]
+		end
+		rpositions
 	end
 	def squeeze_ws_1!(text)

data/lib/text_alignment/text_alignment.rb CHANGED Viewed

@@ -38,7 +38,6 @@ class TextAlignment::TextAlignment
 			@original_text = text
 			@text_mapping = TextAlignment::CharMapping.new(text, nil, @to_ignore_whitespaces)
 		end
 		@mapped_text = @text_mapping.mapped_text
 		## To generate the block_alignment of the input text against the reference text
@@ -251,7 +250,7 @@ class TextAlignment::TextAlignment
 				if b2 < e2
 					_str2 = str2[b2 ... e2]
-					sum += if _str1.strip.empty? || _str2.strip.empty?
+					gap_result = if _str1.strip.empty? || _str2.strip.empty?
 						[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
 					else
 						len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
@@ -307,6 +306,8 @@ class TextAlignment::TextAlignment
 							end
 						end
 					end
+					sum += gap_result
 				elsif b2 > e2 # when out of order
 					# ToDo
 				end
@@ -317,6 +318,7 @@ class TextAlignment::TextAlignment
 			cblock.nil? ? sum : sum << cblock
 		end
+		blocks2
 	end
 	def whole_block_alignment(str1, str2, cultivation_map)

data/lib/text_alignment/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 class TextAlignment
-	VERSION = '0.12.7'
+	VERSION = '0.12.9'
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: text_alignment
 version: !ruby/object:Gem::Version
-  version: 0.12.7
+  version: 0.12.9
 platform: ruby
 authors:
 - Jin-Dong Kim
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-05-22 00:00:00.000000000 Z
+date: 2025-09-30 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: ruby-dictionary