RubyGems - text_alignment - Versions diffs - 0.8.1 → 0.11.1 - Mend

text_alignment 0.8.1 → 0.11.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

checksums.yaml +4 -4
data/bin/align_annotations +23 -13
data/lib/text_alignment/anchor_finder.rb +120 -70
data/lib/text_alignment/{mappings.rb → char_mapping.rb} +92 -75
data/lib/text_alignment/cultivation_map.rb +94 -0
data/lib/text_alignment/glcs_alignment_fast.rb +2 -2
data/lib/text_alignment/mixed_alignment.rb +21 -3
data/lib/text_alignment/text_alignment.rb +241 -233
data/lib/text_alignment/version.rb +1 -1
metadata +4 -3

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 0c49793b84e9ca5606a8fe9151530f6732337c2b92cfd1af1549a56ea3c2f39e
-  data.tar.gz: 4a0b8328d4c6de43af50bd8c278f83facef311c74516dc3a9a0c9dd5f91fbfc0
+  metadata.gz: ef59c0cd578ed453a67edeb3b29059f4b76c4c541f777fa35a06e76b299e2564
+  data.tar.gz: 5594c0f6eb1d52cc331c210fdf936e7cb09e30277f21933027e55b0c8cfa0e24
 SHA512:
-  metadata.gz: ad70de67b4a7b38290a59d89bed7cd9fa343ce3db62bb67b779e51b499929b9d7c4697871cf25017fa59a34c0525741d5da679559fc2f33d902fcda692f9f1ac
-  data.tar.gz: 6242b35cbb8f53effc477b508d428d9060cba2e93895496134963755cff956f3931411a87a7bc0d33b7b91459f1a27eb67c422897693320e1b2ceb06eff6e22b
+  metadata.gz: ef5bae492d7e7b840c00943ac24e571392c4c992a085c6a63c8cb6db126ed9137ba94f8cd7af42a91e17aa327cb9a5ce24b909944c157f21beb8a88c8ce7528b
+  data.tar.gz: b6d84dcdc35399f91a0f6a5a24d84b11169de9d861a249acca52f0fec86e6f1fbf1bb4dbb47a5d43654fa43735b181aa64034447fb71ca675090818dcf67133a

data/bin/align_annotations CHANGED Viewed

@@ -26,8 +26,8 @@ def read_text(filename)
 	end
 end
-def align_denotations(denotations, source_text, target_text, debug = false)
-	alignment = TextAlignment::TextAlignment.new(source_text, target_text, denotations)
+def align_denotations(denotations, source_text, alignment, debug = false)
+	alignment.align(source_text, denotations)
 	new_denotations = alignment.transform_hdenotations(denotations)
 	if debug
@@ -50,8 +50,8 @@ def align_denotations(denotations, source_text, target_text, debug = false)
 	new_denotations
 end
-def align_mannotations(source_annotations, target_text, debug = false)
-	target_annotations = {text:target_text}
+def align_mannotations(source_annotations, reference_text, alignment, debug = false)
+	target_annotations = {text:reference_text}
 	idnum_denotations = 0
 	idnum_relations = 0
@@ -62,7 +62,8 @@ def align_mannotations(source_annotations, target_text, debug = false)
 		if annotations.has_key?(:denotations) && !annotations[:denotations].empty?
 			ididx = {}
 			warn "[#{i}]-=-=-=-=-"
-			denotations = align_denotations(annotations[:denotations], annotations[:text], target_text, debug)
+			denotations = align_denotations(annotations[:denotations], annotations[:text], alignment, debug)
 			denotations.each do |d|
 				reid = 'T' + (idnum_denotations += 1).to_s
 				ididx[d[:id]] = reid
@@ -76,7 +77,9 @@ def align_mannotations(source_annotations, target_text, debug = false)
 				annotations[:relations].each do |r|
 					reid = 'R' + (idnum_relations += 1).to_s
 					ididx[r[:id]] = reid
-					target_annotations[:relations] << r.dup.merge({id:reid, subj:ididx[r[:subj]], obj:ididx[r[:obj]]})
+					sid = ididx[r[:subj]]
+					oid = ididx[r[:obj]]
+					target_annotations[:relations] << r.dup.merge({id:reid, subj:sid, obj:oid}) unless sid.nil? || oid.nil?
 				end
 			end
@@ -85,7 +88,8 @@ def align_mannotations(source_annotations, target_text, debug = false)
 				annotations[:attributes].each do |a|
 					reid = 'A' + (idnum_attributes += 1).to_s
 					ididx[a[:id]] = reid
-					target_annotations[:attributes] << a.dup.merge({id:reid, subj:ididx[a[:subj]]})
+					sid = ididx[a[:subj]]
+					target_annotations[:attributes] << a.dup.merge({id:reid, subj:sid}) unless sid.nil?
 				end
 			end
@@ -94,7 +98,8 @@ def align_mannotations(source_annotations, target_text, debug = false)
 				annotations[:modifications].each do |m|
 					reid = 'M' + (idnum_modifications += 1).to_s
 					ididx[m[:id]] = reid
-					target_annotations[:modifications] << m.dup.merge({id:reid, obj:ididx[m[:obj]]})
+					oid = ididx[m[:obj]]
+					target_annotations[:modifications] << m.dup.merge({id:reid, obj:oid}) unless oid.nil?
 				end
 			end
 		end
@@ -109,13 +114,18 @@ unless ARGV.length == 2
 end
 source_annotations = read_annotations(ARGV[0])
-target_text = read_text(ARGV[1])
+reference_text = read_text(ARGV[1])
+alignment = TextAlignment::TextAlignment.new(reference_text, true)
 target_annotations = if source_annotations.class == Array
-	align_mannotations(source_annotations, target_text, false)
+	# align_mannotations(source_annotations, reference_text, alignment, true)
+	align_mannotations(source_annotations, reference_text, alignment, false)
 else
-	denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], target_text, false)
-	source_annotations.merge({text:target_text, denotations:denotations})
+	# denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment)
+	denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment, false)
+	source_annotations.merge({text:reference_text, denotations:denotations})
 end
-# puts target_annotations.to_json
+# pp alignment.block_alignment
+puts target_annotations.to_json

data/lib/text_alignment/anchor_finder.rb CHANGED Viewed

@@ -6,92 +6,141 @@ module TextAlignment; end unless defined? TextAlignment
 class TextAlignment::AnchorFinder
-	def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
-		@size_ngram  = _size_ngram  || TextAlignment::SIZE_NGRAM
-		@size_window = _size_window || TextAlignment::SIZE_WINDOW
-		@sim_threshold = _text_similiarity_threshold || TextAlignment::TEXT_SIMILARITY_THRESHOLD
+	def initialize(source_str, target_str, cultivation_map)
+		@s1 = source_str.downcase
+		@s2 = target_str.downcase
-		@reverse = (target_str.length < source_str.length)
+		@cultivation_map = cultivation_map
-		@s1, @s2 = if @reverse
-			[target_str.downcase, source_str.downcase]
-		else
-			[source_str.downcase, target_str.downcase]
-		end
+		@size_ngram  = TextAlignment::SIZE_NGRAM
+		@size_window = TextAlignment::SIZE_WINDOW
+		@sim_threshold = TextAlignment::TEXT_SIMILARITY_THRESHOLD
+		@pos_s1_final_possible_begin = @s1.length - @size_ngram - 1
+		@pos_s2_final_possible_end = @s2.length
-		# current position in s1
-		@beg_s1 = 0
-		@end_s1_prev = 0
-		@end_s2_prev = 0
+		# positions of last match
+		@pos_s1_last_match = 0
+		@pos_s2_last_match = 0
 	end
 	def get_next_anchor
-		# find the position of an anchor ngram in s1 and s2
-		while @beg_s1 < (@s1.length - @size_ngram)
-			if [' ', "\n", "\t"].include? @s1[@beg_s1]
-				@beg_s1 += 1
-				next
-			end
-			anchor = @s1[@beg_s1, @size_ngram]
-			# search_position = 0
-			search_position = @end_s2_prev
-			while @beg_s2 = @s2.index(anchor, search_position)
-				# if both the begining points are sufficiantly close to the end points of the last match
-				break if @beg_s1 > 0 && @beg_s2 > 0 && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
+		# To find the beginning positions of an anchor ngram in s1 and s2, beginning from the last positions matched
+		beg_s2 = for beg_s1 in @pos_s1_last_match .. @pos_s1_final_possible_begin
-				left_window_s1, left_window_s2 = get_left_windows
-				break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
+			# To skip whitespace letters
+			next if [' ', "\n", "\t"].include? @s1[beg_s1]
-				right_window_s1, right_window_s2 = get_right_windows
-				break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
+			_beg_s2 = get_beg_s2(beg_s1)
+			break _beg_s2 unless _beg_s2.nil?
+		end
-				search_position = @beg_s2 + 1
-			end
+		# To return nil when it fails to find an anchor
+		return nil if beg_s2.class == Range
-			break unless @beg_s2.nil?
+		# To extend the block to the left
+		b1 = beg_s1
+		b2 = beg_s2
+		left_boundary_b2 = [@pos_s2_last_match, (@cultivation_map.last_cultivated_position(b2) || 0)].max
+		while b1 > @pos_s1_last_match && b2 > left_boundary_b2 && @s1[b1 - 1] == @s2[b2 - 1]
+			b1 -= 1; b2 -= 1
+		end
-			@beg_s1 += 1
+		# To extend the block to the right
+		e1 = beg_s1 + @size_ngram
+		e2 = beg_s2 + @size_ngram
+		right_boundary_b2 = @cultivation_map.next_cultivated_position(e2) || @pos_s2_final_possible_end
+		while @s1[e1] && e2 < right_boundary_b2 && @s1[e1] == @s2[e2]
+			e1 += 1; e2 += 1
 		end
-		return nil if @beg_s1 >= (@s1.length - @size_ngram)
+		@pos_s1_last_match = e1
+		@pos_s2_last_match = e2
-		# extend the block
-		b1 = @beg_s1
-		b2 = @beg_s2
-		while b1 >= @end_s1_prev && b2 >= @end_s2_prev && @s1[b1] == @s2[b2]
-			b1 -= 1; b2 -= 1
-		end
+		{source:{begin:b1 , end:e1}, target:{begin:b2, end:e2}}
+	end
-		b1 += 1; b2 += 1
+	private
-		e1 = @beg_s1 + @size_ngram
-		e2 = @beg_s2 + @size_ngram
-		while @s1[e1] && @s1[e1] == @s2[e2]
-			e1 += 1; e2 += 1
+	def get_beg_s2(beg_s1)
+		# to get the anchor to search for in s2
+		anchor = @s1[beg_s1, @size_ngram]
+		# comment out below with the assumption that texts are in the same order
+		# search_position = 0
+		search_position = @pos_s2_last_match
+		beg_s2_candidates = find_beg_s2_candidates(anchor, search_position)
+		return nil if beg_s2_candidates.empty?
+		find_valid_beg_s2(beg_s1, beg_s2_candidates)
+	end
+	# To find the positions in s2 (beg_s2 candidates) which match the anchor
+	# returns an empty list if the anchor is too frequent (more than 5 matches)
+	def find_beg_s2_candidates(anchor, search_position)
+		candidates = []
+		while _beg_s2 = @cultivation_map.index(anchor, @s2, search_position)
+			candidates << _beg_s2
+			# for speed, skip anchor of high frequency
+			if candidates.length > 5
+				candidates.clear
+				break
+			end
+			search_position = _beg_s2 + 1
 		end
+		candidates
+	end
+	def find_valid_beg_s2(beg_s1, beg_s2_candidates)
+		valid_beg_s2 = nil
-		@end_s1_prev = e1
-		@end_s2_prev = e2
-		@beg_s1 = e1
+		(10 .. 30).step(10).each do |size_window|
+			valid_beg_s2 = nil
-		if @reverse
-			{source:{begin:b2 , end:e2}, target:{begin:b1, end:e1}}
-		else
-			{source:{begin:b1 , end:e1}, target:{begin:b2, end:e2}}
+			r = beg_s2_candidates.each do |beg_s2|
+				# if both the begining points are sufficiantly close to the end points of the last match
+				# break if beg_s1 > 0 && beg_s2 > 0 && (beg_s1 - @pos_s1_last_match < 5) && (beg_s2 >= @pos_s2_last_match) && (beg_s2 - @pos_s2_last_match < 5)
+				if beg_s1 > 0 && beg_s2 > 0 && (beg_s1 - @pos_s1_last_match < 5) && (beg_s2 - @pos_s2_last_match < 5)
+					break unless valid_beg_s2.nil?
+					valid_beg_s2 = beg_s2
+					next
+				end
+				left_window_s1, left_window_s2 = get_left_windows(beg_s1, beg_s2)
+				if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
+					break unless valid_beg_s2.nil?
+					valid_beg_s2 = beg_s2
+					next
+				end
+				right_window_s1, right_window_s2 = get_right_windows(beg_s1, beg_s2)
+				if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
+					break unless valid_beg_s2.nil?
+					valid_beg_s2 = beg_s2
+					next
+				end
+			end
+			# r == nil means that the inner loop was broken (multiple candidates had passed the tests)
+			# r != nil means that the inner loop was completed (with or w/o a valid beg_s2 found)
+			break unless r.nil?
 		end
+		valid_beg_s2
 	end
-	private
+	def get_left_windows(beg_s1, beg_s2, size_window = nil)
+		size_window ||= @size_window
-	def get_left_windows
-		# commend below with the assumption that the beginning of a document gives a significant locational information
-		# return if @beg_s1 < @size_window || @beg_s2 < @size_window
+		# comment out below with the assumption that the beginning of a document gives a significant locational information
+		# return if @beg_s1 < size_window || @beg_s2 < size_window
 		window_s1 = ''
-		loc = @beg_s1 - 1
+		loc = beg_s1 - 1
 		count = 0
-		while count < @size_window && loc >= 0
+		while count < size_window && loc >= 0
 			if @s1[loc] =~ /[0-9a-zA-Z]/
 				window_s1 += @s1[loc]
 				count += 1
@@ -100,9 +149,9 @@ class TextAlignment::AnchorFinder
 		end
 		window_s2 = ''
-		loc = @beg_s2 - 1
+		loc = beg_s2 - 1
 		count = 0
-		while count < @size_window && loc >= 0
+		while count < size_window && loc >= 0
 			if @s2[loc] =~ /[0-9a-zA-Z]/
 				window_s2 += @s2[loc]
 				count += 1
@@ -113,15 +162,17 @@ class TextAlignment::AnchorFinder
 		[window_s1, window_s2]
 	end
-	def get_right_windows
+	def get_right_windows(beg_s1, beg_s2, size_window = nil)
+		size_window ||= @size_window
 		# commend below with the assumption that the end of a document gives a significant locational
-		# return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
+		# return if (@beg_s1 + @size_ngram > (@s1.length - size_window)) || (@beg_s2 + @size_ngram > (@s2.length - size_window))
 		window_s1 = ''
-		loc = @beg_s1 + @size_ngram
+		loc = beg_s1 + @size_ngram
 		len_s1 = @s1.length
 		count = 0
-		while count < @size_window && loc < len_s1
+		while count < size_window && loc < len_s1
 			if @s1[loc] =~ /[0-9a-zA-Z]/
 				window_s1 += @s1[loc]
 				count += 1
@@ -130,10 +181,10 @@ class TextAlignment::AnchorFinder
 		end
 		window_s2 = ''
-		loc = @beg_s2 + @size_ngram
+		loc = beg_s2 + @size_ngram
 		len_s2 = @s2.length
 		count = 0
-		while count < @size_window && loc < len_s2
+		while count < size_window && loc < len_s2
 			if @s2[loc] =~ /[0-9a-zA-Z]/
 				window_s2 += @s2[loc]
 				count += 1
@@ -148,5 +199,4 @@ class TextAlignment::AnchorFinder
 		return 0 if str1.nil? || str2.nil?
 		String::Similarity.cosine(str1, str2, ngram:ngram_order)
 	end
-end
+end

data/lib/text_alignment/{mappings.rb → char_mapping.rb} RENAMED Viewed

@@ -1,6 +1,6 @@
 module TextAlignment; end unless defined? TextAlignment
-TextAlignment::MAPPINGS = [
+TextAlignment::CHAR_MAPPING = [
 	["©", "(c)"],			#U+00A9 (Copyright Sign)
 	["α", "alpha"],		#U+03B1 (greek small letter alpha)
@@ -61,9 +61,10 @@ TextAlignment::MAPPINGS = [
 	["•", "*"],				#U+2022 (bullet)
 	[" ", " "],				#U+2009 (thin space)
 	[" ", " "],				#U+200A (hair space)
-	[" ", " "],				#U+00A0 (no-break space)
+	[" ", " "],				#U+00A0 (Non-Breaking space)
 	["　", " "],				#U+3000 (ideographic space)
-	["‑", "-"],				#U+2211 (Non-Breaking Hyphen)
+	["‐", "-"],				#U+2010 (Hyphen)
+	["‑", "-"],				#U+2011 (Non-Breaking Hyphen)
 	["−", "-"],				#U+2212 (minus sign)
 	["–", "-"],				#U+2013 (en dash)
 	["′", "'"],				#U+2032 (prime)
@@ -75,98 +76,114 @@ TextAlignment::MAPPINGS = [
 ]
-TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
+class TextAlignment::CharMapping
+	attr_reader :mapped_text
+	def initialize(_text, char_mapping = nil)
+		char_mapping ||= TextAlignment::CHAR_MAPPING
+		@mapped_text, offset_mapping = enmap_text(_text, char_mapping)
+		@index_enmap = offset_mapping.to_h
+		@index_demap = offset_mapping.map{|m| m.reverse}.to_h
+	end
-class << TextAlignment
-	def single_character_mapping_preprocessing(_str1, _str2, _mappings = nil)
-		_mappings ||= TextAlignment::MAPPINGS
-		character_mappings = _mappings.select{|m| m[0].length == 1 && m[1].length == 1}
-		if character_mappings.empty?
-			[_str1, _str2, _mappings]
-		else
-			characters_from = character_mappings.collect{|m| m[0]}.join
-			characters_to   = character_mappings.collect{|m| m[1]}.join
-			characters_to.gsub!(/-/, '\-')
+	def enmap_position(position)
+		@index_enmap[position]
+	end
-			str1 = _str1.tr(characters_from, characters_to)
-			str2 = _str2.tr(characters_from, characters_to)
+	def demap_position(position)
+		@index_demap[position]
+	end
-			mappings = _mappings.select{|m| m[0].length > 1 || m[1].length > 1}
+	def enmap_denotations(_denotations)
+		return nil if _denotations.nil?
-			[str1, str2, mappings]
+		denotations = _denotations.map do |d|
+			d.dup.merge(span:{begin:enmap_position(d[:span][:begin]), end:enmap_position(d[:span][:end])})
 		end
 	end
-	def long_to_one_mapping_preprocessing(_str1, _str2, _mappings = nil)
-		_mappings ||= TextAlignment::MAPPINGS
-		long_to_one_mappings = _mappings.select{|m| m[0].length == 1 && m[1].length > 1}
-		if long_to_one_mappings.empty?
-			[_str1, _str2, _mappings]
-		else
-			## long to one character mappings
-			pletters = TextAlignment::PADDING_LETTERS
-			# find the padding letter for str1
-			@padding_letter1 = begin
-				i = pletters.index{|l| _str2.index(l).nil?}
-				raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
-				TextAlignment::PADDING_LETTERS[i]
-			end
+	private
-			# find the padding letter for str2
-			@padding_letter2 = begin
-				i = pletters.index{|l| l != @padding_letter1 && _str1.index(l).nil?}
-				raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
-				TextAlignment::PADDING_LETTERS[i]
-			end
+	def enmap_text(_text, char_mapping)
+		text = _text.dup
-			str1 = str2 = nil
-			long_to_one_mappings.each do |f|
-				from = f[1]
-				str1 = if _str2.index(f[0])
-					to = f[0] + (@padding_letter1 * (f[1].length - 1))
-					_str1.gsub(from, to)
-				else
-					_str1
-				end
-				str2 = if _str1.index(f[0])
-					to = f[0] + (@padding_letter2 * (f[1].length - 1))
-					_str2.gsub(from, to)
-				else
-					_str2
-				end
+		# To execute the single letter mapping
+		char_mapping.each do |one, long|
+			text.gsub!(one, long) if long.length == 1
+		end
+		# To get the (location, length) index for replacements
+		loc_len = []
+		char_mapping.each do |one, long|
+			next if long.length == 1
+			init_next = 0
+			while loc = text.index(long, init_next)
+				loc_len << [loc, long.length]
+				init_next = loc + long.length
 			end
-			mappings = _mappings.select{|m| m[0].length > 1 || m[1].length == 1}
-			[str1, str2, mappings]
+			# a workaround to avoid messing-up due to embedding
+			text.gsub!(long, one * long.length)
 		end
-	end
-	def compute_similarity(_s1, _s2, sdiff)
-		return 0 if sdiff.nil?
+		# To get the (location, length) index for consecutive whitespace sequences
+		init_next = 0
+		while loc = text.index(/\s{2,}/, init_next)
+			len = $~[0].length
+			loc_len << [loc, len]
+			init_next = loc + len
+		end
+		loc_len.sort!{|a, b| a[0] <=> b[0]}
+		# To get the offset_mapping before and after replacement
+		offset_mapping = []
+		init_next = 0
+		j = 0
-		# compute the lcs only with non-whitespace letters
-		lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
-		return 0 if lcs == 0
+		loc_len.each do |loc, len|
+			offset_mapping += (init_next .. loc).map do |i|
+				j += 1
+				[i, j - 1]
+			end
+			init_next = loc + len
+		end
-		s1 = if @padding_letter1
-			_s1.tr(@padding_letter1, ' ')
-		else
-			_s1
+		offset_mapping += (init_next .. text.length).map do |i|
+			j += 1
+			[i, j - 1]
 		end
-		s2 = if @padding_letter2
-			_s2.tr(@padding_letter2, ' ')
-		else
-			_s2
+		# To execute the long letter mapping
+		char_mapping.each do |one, long|
+			text.gsub!(one * long.length, one) if long.length > 1
 		end
-		similarity = lcs.to_f / [s1.scan(/\S/).count, s2.scan(/\S/).count].min
+		# To replace multi whitespace sequences to a space
+		text.gsub!(/\s{2,}/, ' ')
+		[text, offset_mapping]
 	end
+end
+if __FILE__ == $0
+	require 'json'
+	unless ARGV.length == 1
+		warn "#{$0} an_annotation_json_file.json"
+		exit
+	end
+	annotations = JSON.parse File.read(ARGV[0]).strip, symbolize_names: true
+	denotations = annotations[:denotations]
+	if denotations.nil? && annotations[:tracks]
+		denotations = annotations[:tracks].first[:denotations]
+	end
+	text_mapping = TextAlignment::CharMapping.new(annotations[:text])
+	text_mapped = text_mapping.mapped_text
+	denotations_mapped = text_mapping.enmap_denotations(denotations)
+	new_annotations = {text:text_mapped, denotations:denotations_mapped}
+	puts new_annotations.to_json
 end

data/lib/text_alignment/cultivation_map.rb ADDED Viewed

@@ -0,0 +1,94 @@
+module TextAlignment; end unless defined? TextAlignment
+# Keeps a sorted list of "cultivated" regions of a text, each a [begin, end]
+# pair, and answers position queries against them.
+# The queries below treat the end position of a region as exclusive.
+class TextAlignment::CultivationMap
+	# The list of [begin, end] regions, sorted by begin position after #cultivate.
+	attr_reader :map
+	def initialize
+		@map = []
+	end
+	# Adds +regions+ (a list of [begin, end] pairs) to the map.
+	# A region beginning exactly where the previous one ends is merged into it.
+	# Raises RuntimeError if a region begins before the previous one ends.
+	def cultivate(regions)
+		@map += regions
+		@map.sort!{|a, b| a[0] <=> b[0]}
+		new_map = []
+		@map.each do |region|
+			if new_map.empty?
+				new_map << region
+			elsif new_map.last[1] > region[0]
+				raise "Overlapping regions: #{new_map.last} : #{region}"
+			elsif new_map.last[1] == region[0]
+				# Adjacent regions: extend the previous region to cover this one.
+				# (This has to be an assignment; a comparison here would drop the region.)
+				new_map.last[1] = region[1]
+			else
+				new_map << region
+			end
+		end
+		@map = new_map
+	end
+	# Returns the end of the first region whose end lies beyond +end_position+
+	# (+position+ by default) if that region begins at or before +position+,
+	# i.e. the place from which a search has to be resumed; nil otherwise.
+	# Relies on the map being sorted (binary search).
+	def search_again_position(position, end_position = nil)
+		end_position ||= position
+		region = @map.bsearch{|r| end_position < r[1]}
+		if region.nil? || region[0] > position
+			nil
+		else
+			region[1]
+		end
+	end
+	# Returns the end of the last region that ends at or before +position+, or nil.
+	def last_cultivated_position(position)
+		ridx = @map.rindex{|r| r[1] <= position}
+		ridx.nil? ? nil : @map[ridx][1]
+	end
+	# Returns the begin of the first region that begins after +position+, or nil.
+	def next_cultivated_position(position)
+		region = @map.bsearch{|r| position < r[0]}
+		region.nil? ? nil : region[0]
+	end
+	# Returns the regions of the map that have their begin or their end inside +region+.
+	# NOTE: a map region that strictly contains +region+ has neither endpoint inside it
+	# and is therefore not returned.
+	def in_regions(region)
+		@map.select{|r| (r[1] > region[0] && r[1] <= region[1]) || (r[0] < region[1] && r[0] >= region[0])}
+	end
+	# Classifies +region+ against the map and returns a [state, span] pair:
+	#   [:open, region]         - no map region found by #in_regions
+	#   [:middle_closed, span]  - open at both the front and the rear
+	#   [:front_open, span]     - open only at the front
+	#   [:rear_open, span]      - open only at the rear
+	#   [:closed, nil]          - open at neither side
+	def region_state(region)
+		closed_parts = in_regions(region)
+		if closed_parts.empty?
+			[:open, region]
+		else
+			if front_open?(region, closed_parts)
+				if rear_open?(region, closed_parts)
+					[:middle_closed, [closed_parts.first[1], closed_parts.last[0]]]
+				else
+					[:front_open, [region[0], closed_parts.first[0]]]
+				end
+			else
+				if rear_open?(region, closed_parts)
+					[:rear_open, [closed_parts.last[1], region[1]]]
+				else
+					[:closed, nil]
+				end
+			end
+		end
+	end
+	# Returns the begin position of the first occurrence of +target+ in +string+,
+	# at or after +position+, whose span is :open in the map; nil if there is none.
+	def index(target, string, position)
+		length = target.length
+		loop do
+			_begin = string.index(target, position)
+			break if _begin.nil?
+			# If the match begins inside a cultivated region, resume after that region
+			position = search_again_position(_begin)
+			next unless position.nil?
+			break _begin if region_state([_begin, _begin + length])[0] == :open
+			position = _begin + 1
+		end
+	end
+	private
+	# True if the first closed part begins after the begin of +region+.
+	def front_open?(region, closed_parts)
+		closed_parts.first[0] > region[0]
+	end
+	# True if the last closed part ends before the end of +region+.
+	def rear_open?(region, closed_parts)
+		closed_parts.last[1] < region[1]
+	end
+end

data/lib/text_alignment/glcs_alignment_fast.rb CHANGED Viewed

@@ -5,7 +5,7 @@ require 'text_alignment/find_divisions'
 require 'text_alignment/lcs_comparison'
 require 'text_alignment/lcs_alignment'
 require 'text_alignment/glcs_alignment'
-require 'text_alignment/mappings'
+require 'text_alignment/char_mapping'
 module TextAlignment; end unless defined? TextAlignment
@@ -106,7 +106,7 @@ if __FILE__ == $0
 	dictionary = [["β", "beta"]]
 	# align = TextAlignment::TextAlignment.new(str1, str2)
-	align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::MAPPINGS)
+	align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::CHAR_MAPPING)
 	p align.common_elements
 	p align.mapped_elements
 end

data/lib/text_alignment/mixed_alignment.rb CHANGED Viewed

@@ -6,7 +6,7 @@ require 'text_alignment/lcs_comparison'
 require 'text_alignment/lcs_alignment'
 require 'text_alignment/lcs_cdiff'
 require 'text_alignment/glcs_alignment'
-require 'text_alignment/mappings'
+require 'text_alignment/char_mapping'
 module TextAlignment; end unless defined? TextAlignment
@@ -20,7 +20,9 @@ class TextAlignment::MixedAlignment
 	def initialize(_str1, _str2, _mappings = nil)
 		raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
-		str1, str2, mappings = TextAlignment::long_to_one_mapping_preprocessing(_str1, _str2, _mappings)
+		mappings ||= TextAlignment::CHAR_MAPPING
+		str1 = _str1.dup
+		str2 = _str2.dup
 		_compute_mixed_alignment(str1, str2, mappings)
 	end
@@ -63,7 +65,7 @@ class TextAlignment::MixedAlignment
 		end
 		cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
-		@similarity         = TextAlignment::compute_similarity(str1, str2, @sdiff)
+		@similarity         = compute_similarity(str1, str2, @sdiff)
 		@str1_match_initial = cmp.str1_match_initial
 		@str1_match_final   = cmp.str1_match_final
 		@str2_match_initial = cmp.str2_match_initial
@@ -139,4 +141,20 @@ class TextAlignment::MixedAlignment
 		@position_map_end = posmap_end.sort.to_h
 	end
+	# Scores an alignment as coverage * fragmentation rate.
+	# sdiff:: a list of elements responding to +action+ and +old_element+
+	#         (presumably the output of Diff::LCS.sdiff - confirm against the caller).
+	# s1 and s2 are kept in the signature but are not used.
+	# Returns 0 if +sdiff+ is nil or if nothing could be matched.
+	def compute_similarity(s1, s2, sdiff)
+		return 0 if sdiff.nil?
+		# recoverability: the share of non-whitespace letters of the old text that are matched
+		count_nws =	sdiff.count{|d| d.old_element =~ /\S/}
+		count_nws_match =	sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/}
+		# fragmentation rate: whitespace letters of the old text per matched run
+		count_ofrag = sdiff.count{|d| d.old_element =~ /\s/}
+		count_frag = sdiff.collect{|d| (d.action == '=') && (d.old_element =~/\s/) ? ' ' : d.action}.join.scan(/=+/).count
+		# Without this guard a zero denominator turns the score into NaN
+		return 0 if count_nws.zero? || count_frag.zero?
+		coverage = count_nws_match.to_f / count_nws
+		rate_frag = count_ofrag.to_f / count_frag
+		coverage * rate_frag
+	end
 end

data/lib/text_alignment/text_alignment.rb CHANGED Viewed

@@ -2,6 +2,7 @@
 require 'text_alignment/constants'
 require 'text_alignment/anchor_finder'
 require 'text_alignment/mixed_alignment'
+require 'text_alignment/cultivation_map'
 module TextAlignment; end unless defined? TextAlignment
@@ -10,253 +11,71 @@ class TextAlignment::TextAlignment
 	attr_reader :similarity
 	attr_reader :lost_annotations
-	def initialize(_str1, _str2, denotations = nil, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
-		raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
+	# Initialize with a reference text, against which texts will be aligned
+	def initialize(reference_text, to_prevent_overlap = false)
+		raise ArgumentError, "nil text" if reference_text.nil?
-		@block_alignment = {source_text:_str1, target_text:_str2}
-		@original_str1 = _str1
-		@original_str2 = _str2
+		@original_reference_text = reference_text
+		@rtext_mapping = TextAlignment::CharMapping.new(reference_text)
+		@mapped_reference_text = @rtext_mapping.mapped_text
+		@to_prevent_overlap = to_prevent_overlap
-		str1, str2, @mappings = TextAlignment::single_character_mapping_preprocessing(_str1, _str2)
-		if r = whole_block_alignment(str1, str2)
-			@block_alignment[:blocks] = r
-			return
-		end
+		@original_text = nil
+		@block_alignment = nil
+		@cultivation_map = TextAlignment::CultivationMap.new
+	end
-		## to find block alignments
-		anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
+	def align(text, denotations = nil)
+		# To maintain the cultivation map
+		update_cultivation_map if @to_prevent_overlap
-		blocks = []
-		while block = anchor_finder.get_next_anchor
-			last = blocks.last
-			if last && (block[:source][:begin] == last[:source][:end] + 1) && (block[:target][:begin] == last[:target][:end] + 1)
-				last[:source][:end] = block[:source][:end]
-				last[:target][:end] = block[:target][:end]
-			else
-				blocks << block.merge(alignment: :block, delta: block[:target][:begin] - block[:source][:begin])
-			end
+		# In case the input text is the same as the previous one, reuse the previous text mapping
+		unless @original_text && @original_text == text
+			@original_text = text
+			@text_mapping = TextAlignment::CharMapping.new(text)
 		end
-		# pp blocks
-		# puts "-----"
-		# puts
-		# exit
-		# blocks.each do |b|
-		# 	p [b[:source], b[:target]]
-		# 	puts "---"
-		# 	puts str1[b[:source][:begin] ... b[:source][:end]]
-		# 	puts "---"
-		# 	puts str2[b[:target][:begin] ... b[:target][:end]]
-		# 	puts "====="
-		# 	puts
-		# end
-		# puts "-=-=-=-=-"
-		# puts
-		## to fill the gaps
-		last_block = nil
-		blocks2 = blocks.inject([]) do |sum, block|
-			b1 = last_block ? last_block[:source][:end] : 0
-			e1 = block[:source][:begin]
-			sum += if b1 == e1
-				[block]
-			else
-				b2 = last_block ? last_block[:target][:end] : 0
-				e2 = block[:target][:begin]
-				if b2 == e2
-					[
-						{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
-						block
-					]
-				else
-					if b1 == 0 && b2 == 0
-						len_buffer = (e1 * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
-						b2 = e2 - len_buffer if e2 > len_buffer
-					end
-					_str1 = str1[b1 ... e1]
-					_str2 = str2[b2 ... e2]
-					if _str1.strip.empty? || _str2.strip.empty?
-						[
-							{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
-							block
-						]
-					else
-						local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations) << block
-					end
-				end
-			end
+		@mapped_text = @text_mapping.mapped_text
+		denotations_mapped = @text_mapping.enmap_denotations(denotations)
-			last_block = block
-			sum
-		end
+		## To generate the block_alignment of the input text against the reference text
+		# Initialization
+		# @block_alignment = {text: @original_text, reference_text: @original_reference_text, denotations: denotations}
+		@block_alignment = {text: @mapped_text, reference_text: @mapped_reference_text, denotations: denotations}
-		# the last step
-		blocks2 += if last_block.nil?
-			local_alignment_blocks(str1, 0, str1.length, str2, 0, str2.length, denotations)
+		# Generation
+		@block_alignment[:blocks] = if r = whole_block_alignment(@mapped_text, @mapped_reference_text, @cultivation_map)
+			r
 		else
-			b1 = last_block[:source][:end]
-			if b1 < str1.length
-				e1 = str1.length
-				b2 = last_block[:target][:end]
-				if b2 < str2.length
-					len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
-					e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
-					local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
-				else
-					[{source:{begin:last_block[:source][:end], end:str1.length}, alignment: :empty}]
-				end
-			else
-				[]
-			end
+			find_block_alignment(@mapped_text, @mapped_reference_text, denotations_mapped, @cultivation_map)
 		end
-		@block_alignment[:blocks] = blocks2
 	end
-	def whole_block_alignment(str1, str2)
-		## Block exact match
-		block_begin = str2.index(str1)
-		unless block_begin.nil?
-			return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
-		end
+	def update_cultivation_map
+		return if @block_alignment.nil? || @block_alignment[:blocks].nil?
-		block_begin = str2.downcase.index(str1.downcase)
-		unless block_begin.nil?
-			return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
-		end
-		nil
-	end
-	def local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations = nil)
-		block2 = str2[b2 ... e2]
-		## term-based alignment
-		tblocks = if denotations
-			ds_in_scope = denotations.select{|d| d[:span][:begin] >= b1 && d[:span][:end] <= e1}.
-							sort{|d1, d2| d1[:span][:begin] <=> d2[:span][:begin] || d2[:span][:end] <=> d1[:span][:end] }.
-							map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
-			position = 0
-			tblocks = ds_in_scope.map do |term|
-				lex = term[:lex]
-				r = block2.index(lex, position)
-				if r.nil?
-					position = nil
-					break
-				end
-				position = r + lex.length
-				{source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, delta: r - term[:span][:begin]}
-			end
-			# missing term found
-			tblocks = [] if position.nil?
-			# redundant matching found
-			unless position.nil?
-				ds_in_scope.each do |term|
-					lex = term[:lex]
-					look_forward = block2.index(lex, position)
-					unless look_forward.nil?
-						tblocks = []
-						break
-					end
-				end
-			end
-			tblocks
-		else
-			[]
-		end
-		if tblocks.empty?
-			if b1 == 0 && e1 == str1.length
-				if (e1 > 2000) || (e2 > 2000)
-					[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
-				else
-					block1 = str1[b1 ... e1]
-					block2 = str2[b2 ... e2]
-					## character-based alignment
-					alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase, @mappings)
-					if alignment.sdiff.nil?
-						[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
-					else
-						[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
-					end
-				end
+		## To update the cultivation map
+		newly_cultivated_regions = @block_alignment[:blocks].collect do |b|
+			if b[:alignment] == :block || b[:alignment] == :term
+				[b[:target][:begin], b[:target][:end]]
 			else
-				block1 = str1[b1 ... e1]
-				block2 = str2[b2 ... e2]
-				## character-based alignment
-				alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase, @mappings)
-				if alignment.sdiff.nil?
-					[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
-				else
-					[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
-				end
-			end
-		else
-			last_tblock = nil
-			lblocks = tblocks.inject([]) do |sum, tblock|
-				tb1 = last_tblock ? last_tblock[:source][:end] : b1
-				te1 = tblock[:source][:begin]
-				sum += if te1 == tb1
-					[tblock]
-				else
-					tb2 = last_tblock ? last_tblock[:target][:end] : b2
-					te2 = tblock[:target][:begin]
-					if b2 == e2
-						[
-							{source:{begin:tb1, end:te1}, alignment: :empty},
-							tblock
-						]
-					else
-						[
-							{source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty},
-							tblock
-						]
-					end
-				end
-				last_tblock = tblock
-				sum
+				nil
 			end
-			if last_tblock[:source][:end] < e1
-				if last_tblock[:target][:end] < e2
-					lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, target:{begin:last_tblock[:target][:end], end:e2}, alignment: :empty}
-				else
-					lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, alignment: :empty}
-				end
+		end.compact.inject([]) do |condensed, region|
+			if condensed.empty? || (condensed.last.last + 1 < region.first)
+				condensed.push region
+			else
+				condensed.last[1] = region.last
 			end
-			lblocks
+			condensed
 		end
-	end
-	def indices(str, target)
-	  position = 0
-	  len = target.len
-	  Enumerator.new do |yielder|
-	    while idx = str.index(target, position)
-	      yielder << idx
-	      position = idx + len
-	    end
-	  end
+		@cultivation_map.cultivate(newly_cultivated_regions)
 	end
-	def transform_begin_position(begin_position)
+	def transform_begin_position(_begin_position)
+		begin_position = @text_mapping.enmap_position(_begin_position)
 		i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
 		block = @block_alignment[:blocks][i]
@@ -272,9 +91,13 @@ class TextAlignment::TextAlignment
 			r = block[:alignment].transform_begin_position(begin_position - block[:source][:begin])
 			r.nil? ? nil : r + block[:target][:begin]
 		end
+		@rtext_mapping.demap_position(b)
 	end
-	def transform_end_position(end_position)
+	def transform_end_position(_end_position)
+		end_position = @text_mapping.enmap_position(_end_position)
 		i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
 		block = @block_alignment[:blocks][i]
@@ -290,6 +113,8 @@ class TextAlignment::TextAlignment
 			r = block[:alignment].transform_end_position(end_position - block[:source][:begin])
 			r.nil? ? nil : r + block[:target][:begin]
 		end
+		@rtext_mapping.demap_position(e)
 	end
 	def transform_a_span(span)
@@ -308,7 +133,7 @@ class TextAlignment::TextAlignment
 			source = {begin:d.begin, end:d.end}
 			d.begin = transform_begin_position(d.begin);
 			d.end = transform_end_position(d.end);
-			raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_str2.length
+			raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_reference_text.length
 		rescue
 			@lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
 			d.begin = nil
@@ -324,7 +149,7 @@ class TextAlignment::TextAlignment
 		r = hdenotations.collect do |d|
 			t = transform_a_span(d[:span])
-			raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_str2.length
+			raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_reference_text.length
 			new_d = d.dup.merge({span:t})
 		rescue
 			@lost_annotations << {source: d[:span], target:t}
@@ -335,8 +160,8 @@ class TextAlignment::TextAlignment
 	end
 	def alignment_show
-		stext = @block_alignment[:source_text]
-		ttext = @block_alignment[:target_text]
+		stext = @mapped_text
+		ttext = @mapped_reference_text
 		show = ''
 		@block_alignment[:blocks].each do |a|
@@ -392,9 +217,192 @@ class TextAlignment::TextAlignment
 				"***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
 				"[#{astr1}]\n" +
-				"[#{astr2}]\n\n"
+				"[#{astr2.gsub("\n", " ")}]\n\n"
 			end
 		end
 		show
 	end
+	private
+	def find_block_alignment(str1, str2, denotations, cultivation_map)
+		## to find block alignments
+		anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map)
+		blocks = []
+		while block = anchor_finder.get_next_anchor
+			last = blocks.last
+			if last && (block[:source][:begin] == last[:source][:end] + 1) && (block[:target][:begin] == last[:target][:end] + 1)
+				last[:source][:end] = block[:source][:end]
+				last[:target][:end] = block[:target][:end]
+			else
+				blocks << block.merge(alignment: :block, delta: block[:target][:begin] - block[:source][:begin])
+			end
+		end
+		# pp blocks
+		# puts "-----"
+		# puts
+		# exit
+		# blocks.each do |b|
+		# 	p [b[:source], b[:target]]
+		# 	puts "---"
+		# 	puts str1[b[:source][:begin] ... b[:source][:end]]
+		# 	puts "---"
+		# 	puts str2[b[:target][:begin] ... b[:target][:end]]
+		# 	puts "====="
+		# 	puts
+		# end
+		# puts "-=-=-=-=-"
+		# puts
+		## To fill the gaps
+		## lblock: last block, cblock: current block
+		lblock = nil
+		blocks2 = (blocks + [nil]).inject([]) do |sum, cblock|
+			b1 = lblock.nil? ? 0 : lblock[:source][:end]
+			e1 = cblock.nil? ? str1.length : cblock[:source][:begin]
+			if b1 < e1
+				b2 = lblock.nil? ? 0 : lblock[:target][:end]
+				e2 = cblock.nil? ? str2.length : cblock[:target][:begin]
+				_str1 = str1[b1 ... e1]
+				_str2 = str2[b2 ... e2]
+				sum += if _str1.strip.empty? || _str2.strip.empty?
+					[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
+				else
+					len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
+					region_state, state_region = cultivation_map.region_state([b2, e2])
+					case region_state
+					when :closed
+						[]
+					when :front_open
+						oe2 = state_region[1]
+						me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
+						local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
+					when :rear_open
+						ob2 = state_region[0]
+						mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
+						local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
+					when :middle_closed
+						oe2 = state_region[0]
+						me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
+						attempt1 = local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
+						if attempt1.empty?
+							ob2 = state_region[1]
+							mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
+							local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
+						else
+							attempt1
+						end
+					else # :open
+						if (e2 - b2) > len_buffer
+							attempt1 = local_alignment(str1, b1, e1, str2, b2, b2 + len_buffer, denotations, cultivation_map)
+							if attempt1.empty?
+								local_alignment(str1, b1, e1, str2, e2 - len_buffer, e2, denotations, cultivation_map)
+							else
+								attempt1
+							end
+						else
+							local_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
+						end
+					end
+				end
+			end
+			lblock = cblock
+			cblock.nil? ? sum : sum << cblock
+		end
+	end
+	def whole_block_alignment(str1, str2, cultivation_map)
+		block_begin = cultivation_map.index(str1, str2, 0)
+		return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}] unless block_begin.nil?
+		block_begin = cultivation_map.index(str1.downcase, str2.downcase, 0)
+		return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}] unless block_begin.nil?
+		nil
+	end
+	def local_alignment(str1, b1, e1, str2, b2, e2, denotations = nil, cultivation_map)
+		tblocks = term_based_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
+		if tblocks.empty?
+			lcs_alignment(str1, b1, e1, str2, b2, e2, cultivation_map)
+		else
+			tblocks
+		end
+	end
+	def term_based_alignment(str1, b1, e1, str2, b2, e2, denotations = nil, cultivation_map)
+		str2_block = str2[0 ... e2]
+		## term-based alignment
+		tblocks = if denotations
+			denotations_in_scope = denotations.select{|d| d[:span][:begin] >= b1 && d[:span][:end] <= e1}.
+							sort{|d1, d2| d1[:span][:begin] <=> d2[:span][:begin] || d2[:span][:end] <=> d1[:span][:end] }.
+							map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
+			search_position = b2
+			_tblocks = denotations_in_scope.map do |denotation|
+				lex = denotation[:lex]
+				term_begin = cultivation_map.index(lex, str2_block, search_position)
+				break [] if term_begin.nil? # break the loop if a missing term is found
+				search_position = term_begin + lex.length
+				{source:denotation[:span], target:{begin:term_begin, end:term_begin + lex.length}, alignment: :term, similarity: 0.9, delta: term_begin - denotation[:span][:begin]}
+			end
+			# redundant matching found
+			unless _tblocks.empty?
+				search_position = _tblocks.last[:target][:end]
+				denotations_in_scope.each do |term|
+					look_forward = cultivation_map.index(term[:lex], str2_block, search_position)
+					unless look_forward.nil?
+						_tblocks = []
+						break
+					end
+				end
+			end
+			_tblocks
+		else
+			[]
+		end
+		ltblock = nil
+		tblocks2 = (tblocks + [nil]).inject([]) do |sum, ctblock|
+			tb1 = ltblock.nil? ? b1 : ltblock[:source][:end]
+			te1 = ctblock.nil? ? e1 : ctblock[:source][:begin]
+			if te1 > tb1
+				tb2 = ltblock.nil? ? b2 : ltblock[:target][:end]
+				te2 = ctblock.nil? ? e2 : ctblock[:target][:begin]
+				sum << {source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty}
+			end
+			ltblock = ctblock
+			ctblock.nil? ? sum : sum << ctblock
+		end
+		tblocks2
+	end
+	def lcs_alignment(str1, b1, e1, str2, b2, e2, cultivation_map)
+		source = {begin:b1, end:e1}
+		target = {begin:b2, end:e2}
+		if (e1 - b1) > 2000
+			[{source:source, target:target, alignment: :empty}]
+		else
+			alignment = TextAlignment::MixedAlignment.new(str1[b1 ... e1].downcase, str2[b2 ... e2].downcase)
+			if alignment.similarity < 0.5
+				[{source:source, target:target, alignment: :empty}]
+			else
+				[{source:source, target:target, alignment: alignment, similarity: alignment.similarity}]
+			end
+		end
+	end
 end

data/lib/text_alignment/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 class TextAlignment
-	VERSION = '0.8.1'
+	VERSION = '0.11.1'
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: text_alignment
 version: !ruby/object:Gem::Version
-  version: 0.8.1
+  version: 0.11.1
 platform: ruby
 authors:
 - Jin-Dong Kim
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2020-10-26 00:00:00.000000000 Z
+date: 2021-03-08 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: ruby-dictionary
@@ -77,7 +77,9 @@ files:
 - lib/text_alignment.rb
 - lib/text_alignment/anchor_finder.rb
 - lib/text_alignment/approximate_fit.rb
+- lib/text_alignment/char_mapping.rb
 - lib/text_alignment/constants.rb
+- lib/text_alignment/cultivation_map.rb
 - lib/text_alignment/find_divisions.rb
 - lib/text_alignment/glcs_alignment.rb
 - lib/text_alignment/glcs_alignment_fast.rb
@@ -86,7 +88,6 @@ files:
 - lib/text_alignment/lcs_cdiff.rb
 - lib/text_alignment/lcs_comparison.rb
 - lib/text_alignment/lcs_min.rb
-- lib/text_alignment/mappings.rb
 - lib/text_alignment/mixed_alignment.rb
 - lib/text_alignment/text_alignment.rb
 - lib/text_alignment/version.rb