RubyGems - text_alignment - Versions diffs - 0.12.8 → 0.12.10 - Mend

text_alignment 0.12.8 → 0.12.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

checksums.yaml +4 -4
data/lib/text_alignment/anchor_finder.rb +65 -42
data/lib/text_alignment/text_alignment.rb +4 -2
data/lib/text_alignment/version.rb +1 -1
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 528dd8cf88da73d13e8933b69ce2cfb27a1cbd245392e072c53c2c02bd1b57c4
-  data.tar.gz: b74d808fe3412f704e7770ac0ac50a645ef75e9c8667f715324f7769f3bf151f
+  metadata.gz: 2036a650644cd3b814d8c74d9a7bc7c4cb647adf51b38a5965b5f0f121144276
+  data.tar.gz: 4619c1c428626857d1d189cb1a6d64c870cc3a427c1e31ea0fb8f4decb417b88
 SHA512:
-  metadata.gz: 461745ed09343a23ab2a5ca008cc2625ec7679573c57bbb1e70e1f76cd2d51478ed2bcbb47197aee76255de3404897138ec569cfb93149690b974f26d203594d
-  data.tar.gz: 937f951a7e84bf065ab7ad4818d7d9731f365c16abadf165cb9daca27eefde83f0d695b33b60b5d54a3af97ea3b6c963b95183ff25e19794fa005474602ee6e9
+  metadata.gz: 9fa5d56dbd8bc4372022e3d58b82958bc3561155a8d33c9f9e05ec48f96ef431d1309e4634a11909b31f32609491f8691a3b8ad41f798e7bb584fd5aa3b5c7ac
+  data.tar.gz: fb58b8aea21cfd8a9ebf24edc693d63397ee7a8f72d175feb8da63a0d0160bb3a4740202e7ce5dd9ba7803d9ad18ebeb38f98e493741d46517c5007f40c8aba9

data/lib/text_alignment/anchor_finder.rb CHANGED Viewed

@@ -28,16 +28,40 @@ class TextAlignment::AnchorFinder
 		# positions of last match
 		@pos_s1_last_match = 0
 		@pos_s2_last_match = 0
+		# Performance: cache for character classification
+		@half_ngram = @size_ngram / 2
 	end
 	def get_next_anchor
 		# To find the beginning positions of an anchor ngram in s1 and s2, beginning from the last positions matched
+		iterations = 0
 		beg_s2 = for beg_s1 in @pos_s1_last_match .. @pos_s1_final_possible_begin
+			iterations += 1
+			char = @s1[beg_s1]
 			# To skip whitespace letters
-			next if [' ', "\n", "\t"].include? @s1[beg_s1]
+			next if char == ' ' || char == "\n" || char == "\t"
+			# Skip positions that start with punctuation or numbers (likely poor anchors)
+			next if char < 'A' || (char > 'Z' && char < 'a') || char > 'z'
+			# Performance optimization: skip if we've had too many failed attempts recently
+			if iterations > 50 && @recent_failures && @recent_failures > 10
+				step_size = [@recent_failures / 5, 3].min
+				beg_s1 += step_size
+				next if beg_s1 > @pos_s1_final_possible_begin
+			end
 			_beg_s2 = get_beg_s2(beg_s1)
+			if _beg_s2.nil?
+				@recent_failures = (@recent_failures || 0) + 1
+			else
+				@recent_failures = 0  # Reset on success
+			end
 			break _beg_s2 unless _beg_s2.nil?
 		end
@@ -72,7 +96,16 @@ class TextAlignment::AnchorFinder
 		# to get the anchor to search for in s2
 		anchor = @s1[beg_s1, @size_ngram]
+		# Quick frequency check: skip very short or very common ngrams
+		return nil if anchor.length < @size_ngram
+		return nil if anchor.chars.uniq.length == 1 # Skip repeating character patterns like "aaaaaaaa"
+		# Skip ngrams that are mostly whitespace or punctuation
+		non_alnum_count = anchor.count("^a-zA-Z0-9")
+		return nil if non_alnum_count > @half_ngram
 		search_position = @to_ignore_text_order ? 0 : @pos_s2_last_match
 		beg_s2_candidates = find_beg_s2_candidates(anchor, search_position)
 		return nil if beg_s2_candidates.empty?
@@ -87,7 +120,7 @@ class TextAlignment::AnchorFinder
 			candidates << _beg_s2
 			# for speed, skip anchor of high frequency
-			if candidates.length > 5
+			if candidates.length > 3
 				candidates.clear
 				break
 			end
@@ -98,45 +131,26 @@ class TextAlignment::AnchorFinder
 	end
 	def find_valid_beg_s2(beg_s1, beg_s2_candidates)
-		valid_beg_s2 = nil
-		(10 .. 30).step(10).each do |size_window|
-			valid_beg_s2 = nil
-			r = beg_s2_candidates.each do |beg_s2|
-				# if both the begining points are sufficiantly close to the end points of the last match
-				# break if beg_s1 > 0 && beg_s2 > 0 && (beg_s1 - @pos_s1_last_match < 5) && (beg_s2 >= @pos_s2_last_match) && (beg_s2 - @pos_s2_last_match < 5)
+		[10, 20, 30].each do |size_window|
+			beg_s2_candidates.each do |beg_s2|
+				# if both the beginning points are sufficiently close to the end points of the last match
 				if beg_s1 > 0 && beg_s2 > 0 && (beg_s1 - @pos_s1_last_match < 5) && (beg_s2 - @pos_s2_last_match < 5)
-					break unless valid_beg_s2.nil?
-					valid_beg_s2 = beg_s2
-					next
+					return beg_s2
 				end
 				left_window_s1, left_window_s2 = @method_get_left_windows.call(beg_s1, beg_s2, size_window)
 				if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
-					break unless valid_beg_s2.nil?
-					valid_beg_s2 = beg_s2
-					next
+					return beg_s2
 				end
 				right_window_s1, right_window_s2 = @method_get_right_windows.call(beg_s1, beg_s2, size_window)
 				if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
-					break unless valid_beg_s2.nil?
-					valid_beg_s2 = beg_s2
-					next
+					return beg_s2
 				end
 			end
-			# r == nil means that the inner loop was broken (multiple candidates had passed the tests)
-			# r != nil means that the inner loop was completed (with or w/o a valid beg_s2 found)
-			if r.nil?
-				valid_beg_s2 = nil
-			else
-				break
-			end
 		end
-		valid_beg_s2
+		nil
 	end
 	def get_left_windows(beg_s1, beg_s2, size_window = nil)
@@ -145,29 +159,31 @@ class TextAlignment::AnchorFinder
 		# comment out below with the assumption that the beginning of a document gives a significant locational information
 		# return if beg_s1 < size_window || beg_s2 < size_window
-		window_s1 = ''
+		chars1 = []
 		loc = beg_s1 - 1
 		count = 0
 		while count < size_window && loc >= 0
-			if @s1[loc] =~ /[0-9a-zA-Z]/
-				window_s1 += @s1[loc]
+			char = @s1[loc]
+			if alnum_char?(char)
+				chars1 << char
 				count += 1
 			end
 			loc -= 1
 		end
-		window_s2 = ''
+		chars2 = []
 		loc = beg_s2 - 1
 		count = 0
 		while count < size_window && loc >= 0
-			if @s2[loc] =~ /[0-9a-zA-Z]/
-				window_s2 += @s2[loc]
+			char = @s2[loc]
+			if alnum_char?(char)
+				chars2 << char
 				count += 1
 			end
 			loc -= 1
 		end
-		[window_s1, window_s2]
+		[chars1.join, chars2.join]
 	end
 	def get_right_windows(beg_s1, beg_s2, size_window = nil)
@@ -176,31 +192,33 @@ class TextAlignment::AnchorFinder
 		# commend below with the assumption that the end of a document gives a significant locational
 		# return if (beg_s1 + @size_ngram > (@s1.length - size_window)) || (beg_s2 + @size_ngram > (@s2.length - size_window))
-		window_s1 = ''
+		chars1 = []
 		loc = beg_s1 + @size_ngram
 		len_s1 = @s1.length
 		count = 0
 		while count < size_window && loc < len_s1
-			if @s1[loc] =~ /[0-9a-zA-Z]/
-				window_s1 += @s1[loc]
+			char = @s1[loc]
+			if alnum_char?(char)
+				chars1 << char
 				count += 1
 			end
 			loc += 1
 		end
-		window_s2 = ''
+		chars2 = []
 		loc = beg_s2 + @size_ngram
 		len_s2 = @s2.length
 		count = 0
 		while count < size_window && loc < len_s2
-			if @s2[loc] =~ /[0-9a-zA-Z]/
-				window_s2 += @s2[loc]
+			char = @s2[loc]
+			if alnum_char?(char)
+				chars2 << char
 				count += 1
 			end
 			loc += 1
 		end
-		[window_s1, window_s2]
+		[chars1.join, chars2.join]
 	end
 	def get_left_windows_no_squeeze_ws(beg_s1, beg_s2, size_window = nil)
@@ -245,4 +263,9 @@ class TextAlignment::AnchorFinder
 		return 0 if str1.nil? || str2.nil?
 		String::Similarity.cosine(str1, str2, ngram:ngram_order)
 	end
+	# Fast alphanumeric character check without regex
+	def alnum_char?(char)
+		(char >= 'a' && char <= 'z') || (char >= 'A' && char <= 'Z') || (char >= '0' && char <= '9')
+	end
 end

data/lib/text_alignment/text_alignment.rb CHANGED Viewed

@@ -38,7 +38,6 @@ class TextAlignment::TextAlignment
 			@original_text = text
 			@text_mapping = TextAlignment::CharMapping.new(text, nil, @to_ignore_whitespaces)
 		end
 		@mapped_text = @text_mapping.mapped_text
 		## To generate the block_alignment of the input text against the reference text
@@ -251,7 +250,7 @@ class TextAlignment::TextAlignment
 				if b2 < e2
 					_str2 = str2[b2 ... e2]
-					sum += if _str1.strip.empty? || _str2.strip.empty?
+					gap_result = if _str1.strip.empty? || _str2.strip.empty?
 						[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
 					else
 						len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
@@ -307,6 +306,8 @@ class TextAlignment::TextAlignment
 							end
 						end
 					end
+					sum += gap_result
 				elsif b2 > e2 # when out of order
 					# ToDo
 				end
@@ -317,6 +318,7 @@ class TextAlignment::TextAlignment
 			cblock.nil? ? sum : sum << cblock
 		end
+		blocks2
 	end
 	def whole_block_alignment(str1, str2, cultivation_map)

data/lib/text_alignment/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 class TextAlignment
-	VERSION = '0.12.8'
+	VERSION = '0.12.10'
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: text_alignment
 version: !ruby/object:Gem::Version
-  version: 0.12.8
+  version: 0.12.10
 platform: ruby
 authors:
 - Jin-Dong Kim
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-05-25 00:00:00.000000000 Z
+date: 2025-09-30 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: ruby-dictionary