RubyGems - text_alignment - Versions diffs - 0.9.1 → 0.10.1 - Mend

text_alignment 0.9.1 → 0.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

checksums.yaml +4 -4
data/bin/align_annotations +10 -6
data/lib/text_alignment/anchor_finder.rb +130 -62
data/lib/text_alignment/{mappings.rb → char_mapping.rb} +90 -75
data/lib/text_alignment/cultivation_map.rb +19 -0
data/lib/text_alignment/glcs_alignment_fast.rb +2 -2
data/lib/text_alignment/mixed_alignment.rb +15 -3
data/lib/text_alignment/text_alignment.rb +238 -180
data/lib/text_alignment/version.rb +1 -1
metadata +4 -3

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: bc60f2422e09058c8abc037a5f4c7e28a2c26c4b0defa3e157a478f6c691e85e
-  data.tar.gz: 3732d51c46d0597cec005396c13e5aa7c84c766232f5de0c5b90e789a2fa77f1
+  metadata.gz: 710fa3dfe07e268c62de77c67abb52e1893a36a9c0fbdfbf7400328ab7f8358a
+  data.tar.gz: 31a086666978c4776a60b0ce19fde48beeb8a5bce599b7572f20089cfb5c7500
 SHA512:
-  metadata.gz: '0095e5682996e5ccb6d6cc7529c40901656f169670e49d26331acad139964b528a6b3ae9c48f32844fbe2a8737f0ab66fdc4f4da51dc37808bed65e7a7447f37'
-  data.tar.gz: b8e00566dbcba94fbfd1d84bd7d10ac6ba7677124aa8a0676797223d4969e76917ea21013cb509762a46d14324eb28e38b1d6ad7dc26cd0fcb2a30af573e6612
+  metadata.gz: d0a752d2203f65e48b4f10ff29eb29a77504e7cef4da1dd3602e2ea5107a99b7edff4c3609c79c7dcba0f9cc458780dc4788ff8123a781a4d77ae89236170bf5
+  data.tar.gz: '094ff5f662ccdc2fc14f81549126362be49fd64610f994c6ba5c4457fbbbececc1931257589ebde49211c2bce744ca033bfe3673c8af5c8a01af7679bfb626cc'

data/bin/align_annotations CHANGED Viewed

@@ -26,8 +26,9 @@ def read_text(filename)
 	end
 end
-def align_denotations(denotations, source_text, target_text, debug = false)
-	alignment = TextAlignment::TextAlignment.new(source_text, target_text, denotations)
+def align_denotations(denotations, source_text, target_text, debug = false, cm = nil)
+	alignment = TextAlignment::TextAlignment.new(source_text, target_text, denotations, cm)
+	cm = alignment.cultivation_map
 	new_denotations = alignment.transform_hdenotations(denotations)
 	if debug
@@ -47,7 +48,7 @@ def align_denotations(denotations, source_text, target_text, debug = false)
 	warn
 	# return target annotations
-	new_denotations
+	[new_denotations, cm]
 end
 def align_mannotations(source_annotations, target_text, debug = false)
@@ -58,11 +59,13 @@ def align_mannotations(source_annotations, target_text, debug = false)
 	idnum_attributes = 0
 	idnum_modifications = 0
+	cm = nil
 	source_annotations.each_with_index do |annotations, i|
 		if annotations.has_key?(:denotations) && !annotations[:denotations].empty?
 			ididx = {}
 			warn "[#{i}]-=-=-=-=-"
-			denotations = align_denotations(annotations[:denotations], annotations[:text], target_text, debug)
+			denotations, cm = align_denotations(annotations[:denotations], annotations[:text], target_text, debug, cm)
 			denotations.each do |d|
 				reid = 'T' + (idnum_denotations += 1).to_s
 				ididx[d[:id]] = reid
@@ -114,8 +117,9 @@ target_text = read_text(ARGV[1])
 target_annotations = if source_annotations.class == Array
 	align_mannotations(source_annotations, target_text, false)
 else
-	denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], target_text, false)
+	denotations, cm = align_denotations(source_annotations[:denotations], source_annotations[:text], target_text, false)
+	# denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], target_text, true)
 	source_annotations.merge({text:target_text, denotations:denotations})
 end
-# puts target_annotations.to_json
+puts target_annotations.to_json

data/lib/text_alignment/anchor_finder.rb CHANGED Viewed

@@ -6,76 +6,65 @@ module TextAlignment; end unless defined? TextAlignment
 class TextAlignment::AnchorFinder
-	def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
-		@size_ngram  = _size_ngram  || TextAlignment::SIZE_NGRAM
-		@size_window = _size_window || TextAlignment::SIZE_WINDOW
-		@sim_threshold = _text_similiarity_threshold || TextAlignment::TEXT_SIMILARITY_THRESHOLD
-		@reverse = (target_str.length < source_str.length)
-		@s1, @s2 = if @reverse
+	def initialize(source_str, target_str, cultivation_map)
+		@s1, @s2 = if reverse?(source_str, target_str)
 			[target_str.downcase, source_str.downcase]
 		else
 			[source_str.downcase, target_str.downcase]
 		end
-		# current position in s1
-		@beg_s1 = 0
-		@end_s1_prev = 0
-		@end_s2_prev = 0
-	end
-	def get_next_anchor
-		# find the position of an anchor ngram in s1 and s2
-		while @beg_s1 < (@s1.length - @size_ngram)
-			if [' ', "\n", "\t"].include? @s1[@beg_s1]
-				@beg_s1 += 1
-				next
-			end
-			anchor = @s1[@beg_s1, @size_ngram]
+		@cultivation_map = cultivation_map
-			# search_position = 0
-			search_position = @end_s2_prev
-			while @beg_s2 = @s2.index(anchor, search_position)
-				# if both the begining points are sufficiantly close to the end points of the last match
-				break if @beg_s1 > 0 && @beg_s2 > 0 && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
+		@size_ngram  = TextAlignment::SIZE_NGRAM
+		@size_window = TextAlignment::SIZE_WINDOW
+		@sim_threshold = TextAlignment::TEXT_SIMILARITY_THRESHOLD
+		@pos_s1_final_possible_begin = @s1.length - @size_ngram - 1
-				left_window_s1, left_window_s2 = get_left_windows
-				break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
+		# positions of last match
+		@pos_s1_last_match = 0
+		@pos_s2_last_match = 0
+	end
-				right_window_s1, right_window_s2 = get_right_windows
-				break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
+	def reverse?(source_str = nil, target_str = nil)
+		unless source_str.nil?
+			@reverse_p = target_str.length < source_str.length
+		end
+		@reverse_p
+	end
-				search_position = @beg_s2 + 1
-			end
+	def get_next_anchor
+		# To find the beginning positions of an anchor ngram in s1 and s2, beginning from the last positions matched
+		beg_s2 = for beg_s1 in @pos_s1_last_match .. @pos_s1_final_possible_begin
-			break unless @beg_s2.nil?
+			# To skip whitespace letters
+			next if [' ', "\n", "\t"].include? @s1[beg_s1]
-			@beg_s1 += 1
+			_beg_s2 = get_beg_s2(beg_s1)
+			break _beg_s2 unless _beg_s2.nil?
 		end
-		return nil if @beg_s1 >= (@s1.length - @size_ngram)
+		# To return nil when it fails to find an anchor
+		return nil if beg_s2.class == Range
-		# extend the block
-		b1 = @beg_s1
-		b2 = @beg_s2
-		while b1 >= @end_s1_prev && b2 >= @end_s2_prev && @s1[b1] == @s2[b2]
+		# To extend the block to the left
+		b1 = beg_s1
+		b2 = beg_s2
+		while b1 >= @pos_s1_last_match && b2 >= @pos_s2_last_match && @s1[b1] == @s2[b2]
 			b1 -= 1; b2 -= 1
 		end
 		b1 += 1; b2 += 1
-		e1 = @beg_s1 + @size_ngram
-		e2 = @beg_s2 + @size_ngram
+		# To extend the block to the right
+		e1 = beg_s1 + @size_ngram
+		e2 = beg_s2 + @size_ngram
 		while @s1[e1] && @s1[e1] == @s2[e2]
 			e1 += 1; e2 += 1
 		end
-		@end_s1_prev = e1
-		@end_s2_prev = e2
-		@beg_s1 = e1
+		@pos_s1_last_match = e1
+		@pos_s2_last_match = e2
-		if @reverse
+		if reverse?
 			{source:{begin:b2 , end:e2}, target:{begin:b1, end:e1}}
 		else
 			{source:{begin:b1 , end:e1}, target:{begin:b2, end:e2}}
@@ -84,14 +73,92 @@ class TextAlignment::AnchorFinder
 	private
-	def get_left_windows
-		# commend below with the assumption that the beginning of a document gives a significant locational information
-		# return if @beg_s1 < @size_window || @beg_s2 < @size_window
+	def get_beg_s2(beg_s1)
+		# to get the anchor to search for in s2
+		anchor = @s1[beg_s1, @size_ngram]
+		# comment out below with the assumption that texts are in the same order
+		# search_position = 0
+		search_position = @pos_s2_last_match
+		beg_s2_candidates = find_beg_s2_candidates(anchor, search_position)
+		return nil if beg_s2_candidates.empty?
+		find_valid_beg_s2(beg_s1, beg_s2_candidates)
+	end
+	# To find beg_s2 which match to the anchor
+	# return nil if the anchor is too much frequent
+	def find_beg_s2_candidates(anchor, search_position)
+		candidates = []
+		while _beg_s2 = @s2.index(anchor, search_position)
+			search_again_position = @cultivation_map.search_again_position(_beg_s2)
+			unless search_again_position.nil?
+				search_position = search_again_position
+				next
+			end
+			candidates << _beg_s2
+			# for speed, skip anchor of high frequency
+			if candidates.length > 5
+				candidates.clear
+				break
+			end
+			search_position = _beg_s2 + 1
+		end
+		candidates
+	end
+	def find_valid_beg_s2(beg_s1, beg_s2_candidates)
+		valid_beg_s2 = nil
+		(10 .. 30).step(10).each do |size_window|
+			valid_beg_s2 = nil
+			r = beg_s2_candidates.each do |beg_s2|
+				# if both the begining points are sufficiantly close to the end points of the last match
+				# break if beg_s1 > 0 && beg_s2 > 0 && (beg_s1 - @pos_s1_last_match < 5) && (beg_s2 >= @pos_s2_last_match) && (beg_s2 - @pos_s2_last_match < 5)
+				if beg_s1 > 0 && beg_s2 > 0 && (beg_s1 - @pos_s1_last_match < 5) && (beg_s2 - @pos_s2_last_match < 5)
+					break unless valid_beg_s2.nil?
+					valid_beg_s2 = beg_s2
+					next
+				end
+				left_window_s1, left_window_s2 = get_left_windows(beg_s1, beg_s2)
+				if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
+					break unless valid_beg_s2.nil?
+					valid_beg_s2 = beg_s2
+					next
+				end
+				right_window_s1, right_window_s2 = get_right_windows(beg_s1, beg_s2)
+				if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
+					break unless valid_beg_s2.nil?
+					valid_beg_s2 = beg_s2
+					next
+				end
+			end
+			# r == nil means that the inner loop was broken (multiple candidates had passed the tests)
+			# r != nil means that the inner loop was completed (with or w/o a valid beg_s2 found)
+			break unless r.nil?
+		end
+		valid_beg_s2
+	end
+	def get_left_windows(beg_s1, beg_s2, size_window = nil)
+		size_window ||= @size_window
+		# comment out below with the assumption that the beginning of a document gives a significant locational information
+		# return if @beg_s1 < size_window || @beg_s2 < size_window
 		window_s1 = ''
-		loc = @beg_s1 - 1
+		loc = beg_s1 - 1
 		count = 0
-		while count < @size_window && loc >= 0
+		while count < size_window && loc >= 0
 			if @s1[loc] =~ /[0-9a-zA-Z]/
 				window_s1 += @s1[loc]
 				count += 1
@@ -100,9 +167,9 @@ class TextAlignment::AnchorFinder
 		end
 		window_s2 = ''
-		loc = @beg_s2 - 1
+		loc = beg_s2 - 1
 		count = 0
-		while count < @size_window && loc >= 0
+		while count < size_window && loc >= 0
 			if @s2[loc] =~ /[0-9a-zA-Z]/
 				window_s2 += @s2[loc]
 				count += 1
@@ -113,15 +180,17 @@ class TextAlignment::AnchorFinder
 		[window_s1, window_s2]
 	end
-	def get_right_windows
+	def get_right_windows(beg_s1, beg_s2, size_window = nil)
+		size_window ||= @size_window
 		# commend below with the assumption that the end of a document gives a significant locational
-		# return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
+		# return if (@beg_s1 + @size_ngram > (@s1.length - size_window)) || (@beg_s2 + @size_ngram > (@s2.length - size_window))
 		window_s1 = ''
-		loc = @beg_s1 + @size_ngram
+		loc = beg_s1 + @size_ngram
 		len_s1 = @s1.length
 		count = 0
-		while count < @size_window && loc < len_s1
+		while count < size_window && loc < len_s1
 			if @s1[loc] =~ /[0-9a-zA-Z]/
 				window_s1 += @s1[loc]
 				count += 1
@@ -130,10 +199,10 @@ class TextAlignment::AnchorFinder
 		end
 		window_s2 = ''
-		loc = @beg_s2 + @size_ngram
+		loc = beg_s2 + @size_ngram
 		len_s2 = @s2.length
 		count = 0
-		while count < @size_window && loc < len_s2
+		while count < size_window && loc < len_s2
 			if @s2[loc] =~ /[0-9a-zA-Z]/
 				window_s2 += @s2[loc]
 				count += 1
@@ -148,5 +217,4 @@ class TextAlignment::AnchorFinder
 		return 0 if str1.nil? || str2.nil?
 		String::Similarity.cosine(str1, str2, ngram:ngram_order)
 	end
-end
+end

data/lib/text_alignment/{mappings.rb → char_mapping.rb} RENAMED Viewed

@@ -1,6 +1,6 @@
 module TextAlignment; end unless defined? TextAlignment
-TextAlignment::MAPPINGS = [
+TextAlignment::CHAR_MAPPING = [
 	["©", "(c)"],			#U+00A9 (Copyright Sign)
 	["α", "alpha"],		#U+03B1 (greek small letter alpha)
@@ -61,9 +61,10 @@ TextAlignment::MAPPINGS = [
 	["•", "*"],				#U+2022 (bullet)
 	[" ", " "],				#U+2009 (thin space)
 	[" ", " "],				#U+200A (hair space)
-	[" ", " "],				#U+00A0 (no-break space)
+	[" ", " "],				#U+00A0 (Non-Breaking space)
 	["　", " "],				#U+3000 (ideographic space)
-	["‑", "-"],				#U+2211 (Non-Breaking Hyphen)
+	["‐", "-"],				#U+2010 (Hyphen)
+	["‑", "-"],				#U+2011 (Non-Breaking Hyphen)
 	["−", "-"],				#U+2212 (minus sign)
 	["–", "-"],				#U+2013 (en dash)
 	["′", "'"],				#U+2032 (prime)
@@ -75,98 +76,112 @@ TextAlignment::MAPPINGS = [
 ]
-TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
+class TextAlignment::CharMapping
+	attr_reader :str
+	def initialize(_str, char_mapping = nil)
+		char_mapping ||= TextAlignment::CHAR_MAPPING
+		@str, offset_mapping = enmap_str(_str, char_mapping)
+		@index_enmap = offset_mapping.to_h
+		@index_demap = offset_mapping.map{|m| m.reverse}.to_h
+	end
+	def enmap_position(position)
+		@index_enmap[position]
+	end
-class << TextAlignment
-	def single_character_mapping_preprocessing(_str1, _str2, _mappings = nil)
-		_mappings ||= TextAlignment::MAPPINGS
+	def demap_position(position)
+		@index_demap[position]
+	end
-		character_mappings = _mappings.select{|m| m[0].length == 1 && m[1].length == 1}
-		if character_mappings.empty?
-			[_str1, _str2, _mappings]
-		else
-			characters_from = character_mappings.collect{|m| m[0]}.join
-			characters_to   = character_mappings.collect{|m| m[1]}.join
-			characters_to.gsub!(/-/, '\-')
+	def enmap_denotations(_denotations)
+		denotations = _denotations.map do |d|
+			d.dup.merge(span:{begin:enmap_position(d[:span][:begin]), end:enmap_position(d[:span][:end])})
+		end
+	end
-			str1 = _str1.tr(characters_from, characters_to)
-			str2 = _str2.tr(characters_from, characters_to)
+	private
-			mappings = _mappings.select{|m| m[0].length > 1 || m[1].length > 1}
+	def enmap_str(_str, char_mapping)
+		str = _str.dup
-			[str1, str2, mappings]
+		# To execute the single letter mapping
+		char_mapping.each do |one, long|
+			str.gsub!(one, long) if long.length == 1
 		end
-	end
-	def long_to_one_mapping_preprocessing(_str1, _str2, _mappings = nil)
-		_mappings ||= TextAlignment::MAPPINGS
-		long_to_one_mappings = _mappings.select{|m| m[0].length == 1 && m[1].length > 1}
-		if long_to_one_mappings.empty?
-			[_str1, _str2, _mappings]
-		else
-			## long to one character mappings
-			pletters = TextAlignment::PADDING_LETTERS
-			# find the padding letter for str1
-			@padding_letter1 = begin
-				i = pletters.index{|l| _str2.index(l).nil?}
-				raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
-				TextAlignment::PADDING_LETTERS[i]
-			end
+		# To get the (location, length) index for replacements
+		loc_len = []
+		char_mapping.each do |one, long|
+			next if long.length == 1
-			# find the padding letter for str2
-			@padding_letter2 = begin
-				i = pletters.index{|l| l != @padding_letter1 && _str1.index(l).nil?}
-				raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
-				TextAlignment::PADDING_LETTERS[i]
+			init_next = 0
+			while loc = str.index(long, init_next)
+				loc_len << [loc, long.length]
+				init_next = loc + long.length
 			end
-			str1 = str2 = nil
-			long_to_one_mappings.each do |f|
-				from = f[1]
-				str1 = if _str2.index(f[0])
-					to = f[0] + (@padding_letter1 * (f[1].length - 1))
-					_str1.gsub(from, to)
-				else
-					_str1
-				end
-				str2 = if _str1.index(f[0])
-					to = f[0] + (@padding_letter2 * (f[1].length - 1))
-					_str2.gsub(from, to)
-				else
-					_str2
-				end
-			end
-			mappings = _mappings.select{|m| m[0].length > 1 || m[1].length == 1}
+			# a workaround to avoid messing-up due to embedding
+			str.gsub!(long, one * long.length)
+		end
-			[str1, str2, mappings]
+		# To get the (location, length) index for consecutive whitespace sequences
+		init_next = 0
+		while loc = str.index(/\s{2,}/, init_next)
+			len = $~[0].length
+			loc_len << [loc, len]
+			init_next = loc + len
 		end
-	end
-	def compute_similarity(_s1, _s2, sdiff)
-		return 0 if sdiff.nil?
+		loc_len.sort!{|a, b| a[0] <=> b[0]}
-		# compute the lcs only with non-whitespace letters
-		lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
-		return 0 if lcs == 0
+		# To get the offset_mapping before and after replacement
+		offset_mapping = []
+		init_next = 0
+		j = 0
-		s1 = if @padding_letter1
-			_s1.tr(@padding_letter1, ' ')
-		else
-			_s1
+		loc_len.each do |loc, len|
+			offset_mapping += (init_next .. loc).map do |i|
+				j += 1
+				[i, j - 1]
+			end
+			init_next = loc + len
+		end
+		offset_mapping += (init_next .. str.length).map do |i|
+			j += 1
+			[i, j - 1]
 		end
-		s2 = if @padding_letter2
-			_s2.tr(@padding_letter2, ' ')
-		else
-			_s2
+		# To execute the long letter mapping
+		char_mapping.each do |one, long|
+			str.gsub!(one * long.length, one) if long.length > 1
 		end
-		similarity = lcs.to_f / [s1.scan(/\S/).count, s2.scan(/\S/).count].min
+		# To replace multi whitespace sequences to a space
+		str.gsub!(/\s{2,}/, ' ')
+		[str, offset_mapping]
+	end
+end
+if __FILE__ == $0
+	require 'json'
+	unless ARGV.length == 1
+		warn "#{$0} an_annotation_json_file.json"
+		exit
 	end
+	annotations = JSON.parse File.read(ARGV[0]).strip, symbolize_names: true
+	denotations = annotations[:denotations]
+	if denotations.nil? && annotations[:tracks]
+		denotations = annotations[:tracks].first[:denotations]
+	end
+	str_mapping = TextAlignment::CharMapping.new(annotations[:text])
+	str_mapped = str_mapping.str
+	denotations_mapped = str_mapping.enmap_denotations(denotations)
+	new_annotations = {text:str_mapped, denotations:denotations_mapped}
+	puts new_annotations.to_json
 end

data/lib/text_alignment/cultivation_map.rb ADDED Viewed

@@ -0,0 +1,19 @@
+module TextAlignment; end unless defined? TextAlignment
+class TextAlignment::CultivationMap
+	attr_reader :map
+	def initialize
+		@map = {}
+	end
+	def cultivate(regions)
+		regions.each do |b, e|
+			(b ... e).each{|p| @map[p] = e}
+		end
+	end
+	def search_again_position(position)
+		@map[position]
+	end
+end

data/lib/text_alignment/glcs_alignment_fast.rb CHANGED Viewed

@@ -5,7 +5,7 @@ require 'text_alignment/find_divisions'
 require 'text_alignment/lcs_comparison'
 require 'text_alignment/lcs_alignment'
 require 'text_alignment/glcs_alignment'
-require 'text_alignment/mappings'
+require 'text_alignment/char_mapping'
 module TextAlignment; end unless defined? TextAlignment
@@ -106,7 +106,7 @@ if __FILE__ == $0
 	dictionary = [["β", "beta"]]
 	# align = TextAlignment::TextAlignment.new(str1, str2)
-	align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::MAPPINGS)
+	align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::CHAR_MAPPING)
 	p align.common_elements
 	p align.mapped_elements
 end

data/lib/text_alignment/mixed_alignment.rb CHANGED Viewed

@@ -6,7 +6,7 @@ require 'text_alignment/lcs_comparison'
 require 'text_alignment/lcs_alignment'
 require 'text_alignment/lcs_cdiff'
 require 'text_alignment/glcs_alignment'
-require 'text_alignment/mappings'
+require 'text_alignment/char_mapping'
 module TextAlignment; end unless defined? TextAlignment
@@ -20,7 +20,9 @@ class TextAlignment::MixedAlignment
 	def initialize(_str1, _str2, _mappings = nil)
 		raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
-		str1, str2, mappings = TextAlignment::long_to_one_mapping_preprocessing(_str1, _str2, _mappings)
+		mappings ||= TextAlignment::CHAR_MAPPING
+		str1 = _str1.dup
+		str2 = _str2.dup
 		_compute_mixed_alignment(str1, str2, mappings)
 	end
@@ -63,7 +65,7 @@ class TextAlignment::MixedAlignment
 		end
 		cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
-		@similarity         = TextAlignment::compute_similarity(str1, str2, @sdiff)
+		@similarity         = compute_similarity(str1, str2, @sdiff)
 		@str1_match_initial = cmp.str1_match_initial
 		@str1_match_final   = cmp.str1_match_final
 		@str2_match_initial = cmp.str2_match_initial
@@ -139,4 +141,14 @@ class TextAlignment::MixedAlignment
 		@position_map_end = posmap_end.sort.to_h
 	end
+	def compute_similarity(s1, s2, sdiff)
+		return 0 if sdiff.nil?
+		# compute the lcs only with non-whitespace letters
+		lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
+		return 0 if lcs == 0
+		similarity = lcs.to_f / [s1.scan(/\S/).count, s2.scan(/\S/).count].min
+	end
 end

data/lib/text_alignment/text_alignment.rb CHANGED Viewed

@@ -2,6 +2,7 @@
 require 'text_alignment/constants'
 require 'text_alignment/anchor_finder'
 require 'text_alignment/mixed_alignment'
+require 'text_alignment/cultivation_map'
 module TextAlignment; end unless defined? TextAlignment
@@ -9,23 +10,206 @@ class TextAlignment::TextAlignment
 	attr_reader :block_alignment
 	attr_reader :similarity
 	attr_reader :lost_annotations
+	attr_reader :cultivation_map
-	def initialize(_str1, _str2, denotations = nil, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
+	def initialize(_str1, _str2, _denotations = nil, _cultivation_map = nil)
 		raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
-		@block_alignment = {source_text: _str1, target_text: _str2, denotations: denotations}
+		@block_alignment = {source_text: _str1, target_text: _str2, denotations: _denotations}
 		@original_str1 = _str1
 		@original_str2 = _str2
-		str1, str2, @mappings = TextAlignment::single_character_mapping_preprocessing(_str1, _str2)
+		@str1_mapping = TextAlignment::CharMapping.new(_str1)
+		@str2_mapping = TextAlignment::CharMapping.new(_str2)
-		if r = whole_block_alignment(str1, str2)
-			@block_alignment[:blocks] = r
-			return
+		str1 = @str1_mapping.str
+		denotations = @str1_mapping.enmap_denotations(_denotations)
+		str2 = @str2_mapping.str
+		@cultivation_map = _cultivation_map || TextAlignment::CultivationMap.new
+		@block_alignment[:blocks] = if r = whole_block_alignment(str1, str2, @cultivation_map)
+			# whole block alignment
+			r
+		else
+			find_block_alignment(str1, str2, denotations, @cultivation_map)
+		end
+		newly_cultivated_regions = @block_alignment[:blocks].collect do |b|
+			if b[:alignment] == :block || b[:alignment] == :term
+				[b[:target][:begin], b[:target][:end]]
+			else
+				nil
+			end
+		end.compact
+		newly_cultivated_regions_condensed = newly_cultivated_regions.inject([]) do |condensed, region|
+			if condensed.empty? || (condensed.last.last + 1 < region.first)
+				condensed.push region
+			else
+				condensed.last[1] = region.last
+			end
+			condensed
+		end
+		@cultivation_map.cultivate(newly_cultivated_regions_condensed)
+	end
+	def transform_begin_position(_begin_position)
+		begin_position = @str1_mapping.enmap_position(_begin_position)
+		i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
+		block = @block_alignment[:blocks][i]
+		b = if block[:alignment] == :block || block[:alignment] == :term
+			begin_position + block[:delta]
+		elsif block[:alignment] == :empty
+			if begin_position == block[:source][:begin]
+				block[:target][:begin]
+			else
+				nil
+			end
+		else
+			r = block[:alignment].transform_begin_position(begin_position - block[:source][:begin])
+			r.nil? ? nil : r + block[:target][:begin]
 		end
+		@str2_mapping.demap_position(b)
+	end
+	def transform_end_position(_end_position)
+		end_position = @str1_mapping.enmap_position(_end_position)
+		i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
+		block = @block_alignment[:blocks][i]
+		e = if block[:alignment] == :block || block[:alignment] == :term
+			end_position + block[:delta]
+		elsif block[:alignment] == :empty
+			if end_position == block[:source][:end]
+				block[:target][:end]
+			else
+				nil
+			end
+		else
+			r = block[:alignment].transform_end_position(end_position - block[:source][:begin])
+			r.nil? ? nil : r + block[:target][:begin]
+		end
+		@str2_mapping.demap_position(e)
+	end
+	def transform_a_span(span)
+		{begin: transform_begin_position(span[:begin]), end: transform_end_position(span[:end])}
+	end
+	def transform_spans(spans)
+		spans.map{|span| transform_a_span(span)}
+	end
+	def transform_denotations!(denotations)
+		return nil if denotations.nil?
+		@lost_annotations = []
+		denotations.each do |d|
+			source = {begin:d.begin, end:d.end}
+			d.begin = transform_begin_position(d.begin);
+			d.end = transform_end_position(d.end);
+			raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_str2.length
+		rescue
+			@lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
+			d.begin = nil
+			d.end = nil
+		end
+		@lost_annotations
+	end
+	def transform_hdenotations(hdenotations)
+		return nil if hdenotations.nil?
+		@lost_annotations = []
+		r = hdenotations.collect do |d|
+			t = transform_a_span(d[:span])
+			raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_str2.length
+			new_d = d.dup.merge({span:t})
+		rescue
+			@lost_annotations << {source: d[:span], target:t}
+			nil
+		end.compact
+		r
+	end
+	def alignment_show
+		stext = @block_alignment[:source_text]
+		ttext = @block_alignment[:target_text]
+		show = ''
+		@block_alignment[:blocks].each do |a|
+			show += case a[:alignment]
+			when :block
+				"===== common (block) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
+				stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
+			when :term
+				"===== common (term) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
+				stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
+			when :empty
+				"xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
+				"<<<<< string 1 [#{a[:source][:begin]} - #{a[:source][:end]}]\n" +
+				stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
+				">>>>> string 2 " +
+				if a[:target]
+					"[#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
+					ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
+				else
+					"[-]\n\n"
+				end
+			else
+				astr1 = ''
+				astr2 = ''
+				base = a[:source][:begin]
+				astr1 = a[:alignment].sdiff.map do |c|
+					case c.action
+					when '='
+						stext[c.old_position + base]
+					when '+'
+						'_'
+					when '-'
+						stext[c.old_position + base]
+					when '!'
+						stext[c.old_position + base] + '_'
+					end
+				end.join('')
+				base = a[:target][:begin]
+				astr2 = a[:alignment].sdiff.map do |c|
+					case c.action
+					when '='
+						ttext[c.new_position + base]
+					when '+'
+						ttext[c.new_position + base]
+					when '-'
+						'_'
+					when '!'
+						'_' + ttext[c.new_position + base]
+					end
+				end.join('')
+				"***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
+				"[#{astr1}]\n" +
+				"[#{astr2}]\n\n"
+			end
+		end
+		show
+	end
+	private
+	def find_block_alignment(str1, str2, denotations, cultivation_map)
 		## to find block alignments
-		anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
+		anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map)
 		blocks = []
 		while block = anchor_finder.get_next_anchor
@@ -68,12 +252,13 @@ class TextAlignment::TextAlignment
 				if b2 == e2
 					[
-						{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
+						{source:{begin:b1, end:e1}, alignment: :empty},
 						block
 					]
 				else
+					len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
 					if b1 == 0 && b2 == 0
-						len_buffer = (e1 * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
 						b2 = e2 - len_buffer if e2 > len_buffer
 					end
@@ -85,6 +270,10 @@ class TextAlignment::TextAlignment
 							{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
 							block
 						]
+					elsif ((e2 - b2) - (e1 - b1)) > len_buffer
+						la_block1 = local_alignment_blocks(str1, b1, e1, str2, b2, b2 + len_buffer, denotations)
+						la_block2 = local_alignment_blocks(str1, b1, e1, str2, e2 - len_buffer, e2, denotations)
+						[la_block2, la_block2].max{|a, b| a.first[:similarity] <=> b.first[:similarity]} << block
 					else
 						local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations) << block
 					end
@@ -102,31 +291,53 @@ class TextAlignment::TextAlignment
 			b1 = last_block[:source][:end]
 			if b1 < str1.length
 				e1 = str1.length
 				b2 = last_block[:target][:end]
-				if b2 < str2.length
-					len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
-					e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
-					local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
+				_str1 = str1[b1 ... e1]
+				if _str1.strip.empty?
+					[{source:{begin:b1, end:e1}, alignment: :empty}]
 				else
-					[{source:{begin:last_block[:source][:end], end:str1.length}, alignment: :empty}]
+					if b2 < str2.length
+						len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
+						e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
+						local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
+					else
+						[{source:{begin:b1, end:e1}, alignment: :empty}]
+					end
 				end
 			else
 				[]
 			end
 		end
-		@block_alignment[:blocks] = blocks2
 	end
-	def whole_block_alignment(str1, str2)
+	def whole_block_alignment(str1, str2, cultivation_map)
 		## Block exact match
-		block_begin = str2.index(str1)
+		search_position = 0
+		block_begin = begin
+			_block_begin = str2.index(str1, search_position)
+			break if _block_begin.nil?
+			search_position = cultivation_map.search_again_position(_block_begin)
+			_block_begin
+		end until search_position.nil?
 		unless block_begin.nil?
 			return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
 		end
-		block_begin = str2.downcase.index(str1.downcase)
+		search_position = 0
+		dstr1 = str1.downcase
+		dstr2 = str2.downcase
+		block_begin = begin
+			_block_begin = dstr2.index(dstr1, search_position)
+			break if _block_begin.nil?
+			search_position = cultivation_map.search_again_position(_block_begin)
+			_block_begin
+		end until search_position.nil?
 		unless block_begin.nil?
 			return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
 		end
@@ -144,7 +355,7 @@ class TextAlignment::TextAlignment
 							map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
 			position = 0
-			tblocks = ds_in_scope.map do |term|
+			_tblocks = ds_in_scope.map do |term|
 				lex = term[:lex]
 				r = block2.index(lex, position)
 				if r.nil?
@@ -152,11 +363,11 @@ class TextAlignment::TextAlignment
 					break
 				end
 				position = r + lex.length
-				{source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, delta: r + b2 - term[:span][:begin]}
+				{source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, similarity: 0.9, delta: r + b2 - term[:span][:begin]}
 			end
 			# missing term found
-			tblocks = [] if position.nil?
+			_tblocks = [] if position.nil?
 			# redundant matching found
 			unless position.nil?
@@ -164,13 +375,13 @@ class TextAlignment::TextAlignment
 					lex = term[:lex]
 					look_forward = block2.index(lex, position)
 					unless look_forward.nil?
-						tblocks = []
+						_tblocks = []
 						break
 					end
 				end
 			end
-			tblocks
+			_tblocks
 		else
 			[]
 		end
@@ -184,7 +395,7 @@ class TextAlignment::TextAlignment
 					block2 = str2[b2 ... e2]
 					## character-based alignment
-					alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase, @mappings)
+					alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
 					if alignment.sdiff.nil?
 						[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
 					else
@@ -196,7 +407,7 @@ class TextAlignment::TextAlignment
 				block2 = str2[b2 ... e2]
 				## character-based alignment
-				alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase, @mappings)
+				alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
 				if alignment.sdiff.nil?
 					[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
 				else
@@ -244,157 +455,4 @@ class TextAlignment::TextAlignment
 		end
 	end
-	def indices(str, target)
-	  position = 0
-	  len = target.len
-	  Enumerator.new do |yielder|
-	    while idx = str.index(target, position)
-	      yielder << idx
-	      position = idx + len
-	    end
-	  end
-	end
-	def transform_begin_position(begin_position)
-		i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
-		block = @block_alignment[:blocks][i]
-		b = if block[:alignment] == :block || block[:alignment] == :term
-			begin_position + block[:delta]
-		elsif block[:alignment] == :empty
-			if begin_position == block[:source][:begin]
-				block[:target][:begin]
-			else
-				nil
-			end
-		else
-			r = block[:alignment].transform_begin_position(begin_position - block[:source][:begin])
-			r.nil? ? nil : r + block[:target][:begin]
-		end
-	end
-	def transform_end_position(end_position)
-		i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
-		block = @block_alignment[:blocks][i]
-		e = if block[:alignment] == :block || block[:alignment] == :term
-			end_position + block[:delta]
-		elsif block[:alignment] == :empty
-			if end_position == block[:source][:end]
-				block[:target][:end]
-			else
-				nil
-			end
-		else
-			r = block[:alignment].transform_end_position(end_position - block[:source][:begin])
-			r.nil? ? nil : r + block[:target][:begin]
-		end
-	end
-	def transform_a_span(span)
-		{begin: transform_begin_position(span[:begin]), end: transform_end_position(span[:end])}
-	end
-	def transform_spans(spans)
-		spans.map{|span| transform_a_span(span)}
-	end
-	def transform_denotations!(denotations)
-		return nil if denotations.nil?
-		@lost_annotations = []
-		denotations.each do |d|
-			source = {begin:d.begin, end:d.end}
-			d.begin = transform_begin_position(d.begin);
-			d.end = transform_end_position(d.end);
-			raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_str2.length
-		rescue
-			@lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
-			d.begin = nil
-			d.end = nil
-		end
-		@lost_annotations
-	end
-	def transform_hdenotations(hdenotations)
-		return nil if hdenotations.nil?
-		@lost_annotations = []
-		r = hdenotations.collect do |d|
-			t = transform_a_span(d[:span])
-			raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_str2.length
-			new_d = d.dup.merge({span:t})
-		rescue
-			@lost_annotations << {source: d[:span], target:t}
-			nil
-		end.compact
-		r
-	end
-	def alignment_show
-		stext = @block_alignment[:source_text]
-		ttext = @block_alignment[:target_text]
-		show = ''
-		@block_alignment[:blocks].each do |a|
-			show += case a[:alignment]
-			when :block
-				"===== common (block) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
-				stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
-			when :term
-				"===== common (term) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
-				stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
-			when :empty
-				"xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
-				"<<<<< string 1 [#{a[:source][:begin]} - #{a[:source][:end]}]\n" +
-				stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
-				">>>>> string 2 " +
-				if a[:target]
-					"[#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
-					ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
-				else
-					"[-]\n\n"
-				end
-			else
-				astr1 = ''
-				astr2 = ''
-				base = a[:source][:begin]
-				astr1 = a[:alignment].sdiff.map do |c|
-					case c.action
-					when '='
-						stext[c.old_position + base]
-					when '+'
-						'_'
-					when '-'
-						stext[c.old_position + base]
-					when '!'
-						stext[c.old_position + base] + '_'
-					end
-				end.join('')
-				base = a[:target][:begin]
-				astr2 = a[:alignment].sdiff.map do |c|
-					case c.action
-					when '='
-						ttext[c.new_position + base]
-					when '+'
-						ttext[c.new_position + base]
-					when '-'
-						'_'
-					when '!'
-						'_' + ttext[c.new_position + base]
-					end
-				end.join('')
-				"***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
-				"[#{astr1}]\n" +
-				"[#{astr2}]\n\n"
-			end
-		end
-		show
-	end
 end

data/lib/text_alignment/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 class TextAlignment
-	VERSION = '0.9.1'
+	VERSION = '0.10.1'
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: text_alignment
 version: !ruby/object:Gem::Version
-  version: 0.9.1
+  version: 0.10.1
 platform: ruby
 authors:
 - Jin-Dong Kim
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2021-01-20 00:00:00.000000000 Z
+date: 2021-03-03 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: ruby-dictionary
@@ -77,7 +77,9 @@ files:
 - lib/text_alignment.rb
 - lib/text_alignment/anchor_finder.rb
 - lib/text_alignment/approximate_fit.rb
+- lib/text_alignment/char_mapping.rb
 - lib/text_alignment/constants.rb
+- lib/text_alignment/cultivation_map.rb
 - lib/text_alignment/find_divisions.rb
 - lib/text_alignment/glcs_alignment.rb
 - lib/text_alignment/glcs_alignment_fast.rb
@@ -86,7 +88,6 @@ files:
 - lib/text_alignment/lcs_cdiff.rb
 - lib/text_alignment/lcs_comparison.rb
 - lib/text_alignment/lcs_min.rb
-- lib/text_alignment/mappings.rb
 - lib/text_alignment/mixed_alignment.rb
 - lib/text_alignment/text_alignment.rb
 - lib/text_alignment/version.rb