RubyGems - text_alignment - Versions diffs - 0.6.2 → 0.7.2 - Mend

text_alignment 0.6.2 → 0.7.2

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (6)

checksums.yaml +4 -4
data/bin/align_annotations +4 -19
data/lib/text_alignment/mixed_alignment.rb +74 -4
data/lib/text_alignment/text_alignment.rb +203 -182
data/lib/text_alignment/version.rb +1 -1
metadata +2 -2

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: c4b2cdf0c257b74c6bec90b93d1907787f3c102108046731c2755684a1b156e9
-  data.tar.gz: 85334dad09a046432503183e3d3ad83841612299038f2f2dac1f9d5d208e1939
+  metadata.gz: 972c5735de6aa85f5f9cd289e965f3ec3b8c38c492085e203686bc0ea897a293
+  data.tar.gz: fc0abe3043562c82af5a3c0cf1178586ffcee7921d7f11dbd5cdb93311cbd52a
 SHA512:
-  metadata.gz: 9272bdd6c56717b53d39b3f2009259accb608ea86b99758b6a7ee9cee1e7b275330db55af4e0eba1eba80ee69275a21a3179243394d24139b3018996f659abe1
-  data.tar.gz: a6a9d97d2bf81ac0c2972fd6e9d5202116156d8ff2e5e81a9bf0306e313dbc601522f887bcbcebff8b9d888cc06826a8ce69ba908dce29fa8decad85d53008af
+  metadata.gz: cfb1e21285616819cea937dce0f8422cddcd2ddb6ccf70d19bf2fd5851a33eede0760b4ed956049dfb3fb1cdfb7758d5bfbf19cff14ffedc2e1ffd80928200e0
+  data.tar.gz: 3fb72b7abe05c1a67db6c18448a0f601260b7d3f733e9b5e9fbe3ba5d9ec791e940bbdf70e00193658139480d320c2f9675426faff5e7e90d80eb9d8b07b074a

data/bin/align_annotations CHANGED

@@ -105,9 +105,7 @@ lost_annotations = []
 target_annotations = if source_annotations.class == Array
 	align_mdoc(source_annotations, {text: target_text})
 else
-	alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text)
-	# pp alignment
+	alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text, source_annotations[:denotations])
 	# verification
 	# source_text = source_annotations[:text]
@@ -142,22 +140,7 @@ else
 	puts "====="
 	# exit
-	# verification of source denotations
-	puts "[Invalid source denotations]"
-	source_annotations[:denotations] do |d|
-		p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < source_text.length
-	end
-	puts "====="
-	puts
 	denotations = alignment.transform_hdenotations(source_annotations[:denotations])
-	puts "[Invalid transformation]"
-	denotations.each do |d|
-		p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < target_text.length
-	end
-	puts "====="
-	puts
 	lost_annotations += alignment.lost_annotations if alignment.lost_annotations
 	source_annotations.merge({text:target_text, denotations:denotations})
@@ -194,7 +177,9 @@ warn "denotations:\t#{target_annotations[:denotations].nil? ? 0 : target_annotat
 if lost_annotations
 	warn "\n[lost annotations]"
-	warn "#{lost_annotations.length}"
+	lost_annotations.each do |a|
+		p a
+	end
 end
 #puts target_annotations.to_json

data/lib/text_alignment/mixed_alignment.rb CHANGED

@@ -17,9 +17,10 @@ class TextAlignment::MixedAlignment
 	attr_reader :similarity
 	attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final
-	def initialize(str1, str2, mappings = [])
-		raise ArgumentError, "nil string" if str1.nil? || str2.nil?
-		mappings ||= []
+	def initialize(_str1, _str2)
+		raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
+		str1, str2, mappings = string_preprocessing(_str1, _str2)
 		_compute_mixed_alignment(str1, str2, mappings)
 	end
@@ -62,7 +63,7 @@ class TextAlignment::MixedAlignment
 		end
 		cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
-		@similarity         = cmp.similarity
+		@similarity         = compute_similarity(str1, str2, @sdiff)
 		@str1_match_initial = cmp.str1_match_initial
 		@str1_match_final   = cmp.str1_match_final
 		@str2_match_initial = cmp.str2_match_initial
@@ -137,4 +138,73 @@ class TextAlignment::MixedAlignment
 		@position_map_begin = posmap_begin.sort.to_h
 		@position_map_end = posmap_end.sort.to_h
 	end
+	private
+	def string_preprocessing(_str1, _str2)
+		str1 = _str1.dup
+		str2 = _str2.dup
+		mappings = TextAlignment::MAPPINGS.dup
+		## single character mappings
+		character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
+		characters_from = character_mappings.collect{|m| m[0]}.join
+		characters_to   = character_mappings.collect{|m| m[1]}.join
+		characters_to.gsub!(/-/, '\-')
+		str1.tr!(characters_from, characters_to)
+		str2.tr!(characters_from, characters_to)
+		mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
+		## long to one character mappings
+		pletters = TextAlignment::PADDING_LETTERS
+		# find the padding letter for str1
+		@padding_letter1 = begin
+			i = pletters.index{|l| str2.index(l).nil?}
+			raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
+			TextAlignment::PADDING_LETTERS[i]
+		end
+		# find the padding letter for str2
+		@padding_letter2 = begin
+			i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
+			raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
+			TextAlignment::PADDING_LETTERS[i]
+		end
+		# ASCII foldings
+		ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
+		ascii_foldings.each do |f|
+			from = f[1]
+			if str2.index(f[0])
+				to   = f[0] + (@padding_letter1 * (f[1].length - 1))
+				str1.gsub!(from, to)
+			end
+			if str1.index(f[0])
+				to   = f[0] + (@padding_letter2 * (f[1].length - 1))
+				str2.gsub!(from, to)
+			end
+		end
+		mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
+		[str1, str2, mappings]
+	end
+	def compute_similarity(_s1, _s2, sdiff)
+		return 0 if sdiff.nil?
+		# compute the lcs only with non-whitespace letters
+		lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
+		return 0 if lcs == 0
+		s1 = _s1.tr(@padding_letter1, ' ')
+		s2 = _s2.tr(@padding_letter2, ' ')
+		similarity = lcs / [s1.scan(/\S/).count, s2.scan(/\S/).count].min.to_f
+	end
 end

data/lib/text_alignment/text_alignment.rb CHANGED

@@ -12,45 +12,46 @@ class TextAlignment::TextAlignment
 	attr_reader :similarity
 	attr_reader :lost_annotations
-	def initialize(_str1, _str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
-		raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
+	def initialize(str1, str2, denotations = nil, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
+		raise ArgumentError, "nil string" if str1.nil? || str2.nil?
-		@block_alignment = {source_text:_str1, target_text:_str2}
+		@block_alignment = {source_text:str1, target_text:str2}
+		@str1 = str1
+		@str2 = str2
-		str1, str2, mappings = string_preprocessing(_str1, _str2)
-		# try exact match
+		## Block exact match
 		block_begin = str2.index(str1)
 		unless block_begin.nil?
 			@block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
-			return @block_alignment
+			return
 		end
-		# try exact match
 		block_begin = str2.downcase.index(str1.downcase)
 		unless block_begin.nil?
 			@block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
-			return @block_alignment
+			return
 		end
+		## to find block alignments
 		anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
-		# To collect matched blocks
-		mblocks = []
-		while anchor = anchor_finder.get_next_anchor
-			last = mblocks.last
-			if last && (anchor[:source][:begin] == last[:source][:end] + 1) && (anchor[:target][:begin] == last[:target][:end] + 1)
-				last[:source][:end] = anchor[:source][:end]
-				last[:target][:end] = anchor[:target][:end]
+		blocks = []
+		while block = anchor_finder.get_next_anchor
+			last = blocks.last
+			if last && (block[:source][:begin] == last[:source][:end] + 1) && (block[:target][:begin] == last[:target][:end] + 1)
+				last[:source][:end] = block[:source][:end]
+				last[:target][:end] = block[:target][:end]
 			else
-				mblocks << anchor
+				blocks << block.merge(alignment: :block, delta: block[:target][:begin] - block[:source][:begin])
 			end
 		end
-		# pp mblocks
+		# pp blocks
 		# puts "-----"
 		# puts
-		# mblocks.each do |b|
+		# exit
+		# blocks.each do |b|
 		# 	p [b[:source], b[:target]]
 		# 	puts "---"
 		# 	puts str1[b[:source][:begin] ... b[:source][:end]]
@@ -62,117 +63,202 @@ class TextAlignment::TextAlignment
 		# puts "-=-=-=-=-"
 		# puts
-		## To find block alignments
-		@block_alignment[:blocks] = []
-		return if mblocks.empty?
-		# Initial step
-		if mblocks[0][:source][:begin] > 0
-			e1 = mblocks[0][:source][:begin]
-			e2 = mblocks[0][:target][:begin]
+		## to fill the gaps
+		last_block = nil
+		blocks2 = blocks.inject([]) do |sum, block|
+			b1 = last_block ? last_block[:source][:end] : 0
+			e1 = block[:source][:begin]
-			if mblocks[0][:target][:begin] == 0
-				@block_alignment[:blocks] << {source:{begin:0, end:e1}, target:{begin:0, end:0}, alignment: :empty}
+			sum += if b1 == e1
+				[block]
 			else
-				_str1 = str1[0 ... e1]
-				_str2 = str2[0 ... e2]
+				b2 = last_block ? last_block[:target][:end] : 0
+				e2 = block[:target][:begin]
+				if b2 == e2
+					[
+						{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
+						block
+					]
+				else
+					if b1 == 0 && b2 == 0
+						len_buffer = (e1 * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
+						b2 = e2 - len_buffer if e2 > len_buffer
+					end
-				unless _str1.strip.empty?
-					if _str2.strip.empty?
-						@block_alignment[:blocks] << {source:{begin:0, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
+					_str1 = str1[b1 ... e1]
+					_str2 = str2[b2 ... e2]
+					if _str1.strip.empty? || _str2.strip.empty?
+						[
+							{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
+							block
+						]
 					else
-						len_min = [_str1.length, _str2.length].min
-						len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
-						b1 = _str1.length < len_buffer ? 0 : e1 - len_buffer
-						b2 = _str2.length < len_buffer ? 0 : e2 - len_buffer
-						@block_alignment[:blocks] << {source:{begin:0, end:b1}, target:{begin:0, end:b2}, alignment: :empty} if b1 > 0
-						_str1 = str1[b1 ... e1]
-						_str2 = str2[b2 ... e2]
-						alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
-						similarity = alignment_similarity(_str1, _str2, alignment)
-						if similarity < 0.6
-							@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment: :empty, similarity: similarity}
-						else
-							@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment:alignment}
-						end
+						local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations) << block
 					end
 				end
 			end
+			last_block = block
+			sum
 		end
-		@block_alignment[:blocks] << mblocks[0].merge(alignment: :block)
-		(1 ... mblocks.length).each do |i|
-			b1 = mblocks[i - 1][:source][:end]
-			b2 = mblocks[i - 1][:target][:end]
-			e1 = mblocks[i][:source][:begin]
-			e2 = mblocks[i][:target][:begin]
-			_str1 = str1[b1 ... e1]
-			_str2 = str2[b2 ... e2]
-			unless _str1.strip.empty?
-				if _str2.strip.empty?
-					@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
+		# the last step
+		blocks2 += if last_block.nil?
+			local_alignment_blocks(str1, 0, str1.length, str2, 0, str2.length, denotations)
+		else
+			b1 = last_block[:source][:end]
+			if b1 < str1.length
+				e1 = str1.length
+				b2 = last_block[:target][:end]
+				if b2 < str2.length
+					len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
+					e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
+					local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
 				else
-					alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
-					similarity = alignment_similarity(_str1, _str2, alignment)
-					if similarity < 0.6
-						@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: similarity}
-					else
-						@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
+					[{source:{begin:last_block[:source][:end], end:str1.length}, alignment: :empty}]
+				end
+			else
+				[]
+			end
+		end
+		@block_alignment[:blocks] = blocks2
+	end
+	def local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations = nil)
+		block2 = str2[b2 ... e2]
+		## term-based alignment
+		tblocks = if denotations
+			ds_in_scope = denotations.select{|d| d[:span][:begin] >= b1 && d[:span][:end] <= e1}.
+							sort{|d1, d2| d1[:span][:begin] <=> d2[:span][:begin] || d2[:span][:end] <=> d1[:span][:end] }.
+							map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
+			position = 0
+			tblocks = ds_in_scope.map do |term|
+				lex = term[:lex]
+				r = block2.index(lex, position)
+				if r.nil?
+					position = nil
+					break
+				end
+				position = r + lex.length
+				{source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, delta: r - term[:span][:begin]}
+			end
+			# missing term found
+			tblocks = [] if position.nil?
+			# redundant matching found
+			unless position.nil?
+				ds_in_scope.each do |term|
+					lex = term[:lex]
+					look_forward = block2.index(lex, position)
+					unless look_forward.nil?
+						puts lex
+						tblocks = []
+						break
 					end
 				end
 			end
-			@block_alignment[:blocks] << mblocks[i].merge(alignment: :block)
+			tblocks
 		end
-		# Final step
-		if  mblocks[-1][:source][:end] < str1.length && mblocks[-1][:target][:end] < str2.length
-			b1 = mblocks[-1][:source][:end]
-			b2 = mblocks[-1][:target][:end]
-			_str1 = str1[b1 ... str1.length]
-			_str2 = str2[b2 ... str2.length]
+		if tblocks.empty?
+			if b1 == 0 && e1 == str1.length
+				if (e1 > 1000) || (e2 > 1000)
+					[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
+				else
+					block1 = str1[b1 ... e1]
+					block2 = str2[b2 ... e2]
+					## character-based alignment
+					alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
+					if alignment.sdiff.nil?
+						[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
+					else
+						[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
+					end
+				end
+			else
+				block1 = str1[b1 ... e1]
+				block2 = str2[b2 ... e2]
-			unless _str1.strip.empty?
-				if _str2.strip.empty?
-					@block_alignment[:blocks] << {source:{begin:b1, end:str1.length}, target:{begin:b2, end:str2.length}, alignment: :empty}
+				## character-based alignment
+				alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
+				if alignment.sdiff.nil?
+					[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
 				else
-					len_min = [_str1.length, _str2.length].min
-					len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
-					e1 = _str1.length < len_buffer ? str1.length : b1 + len_buffer
-					e2 = _str2.length < len_buffer ? str2.length : b2 + len_buffer
-					_str1 = str1[b1 ... e1]
-					_str2 = str2[b2 ... e2]
+					[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
+				end
+			end
+		else
+			last_tblock = nil
+			lblocks = tblocks.inject([]) do |sum, tblock|
+				tb1 = last_tblock ? last_tblock[:source][:end] : b1
+				te1 = tblock[:source][:begin]
-					alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
-					similarity = alignment_similarity(_str1, _str2, alignment)
-					if similarity < 0.6
-						@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: similarity}
+				sum += if te1 == tb1
+					[tblock]
+				else
+					tb2 = last_tblock ? last_tblock[:target][:end] : b2
+					te2 = tblock[:target][:begin]
+					if b2 == e2
+						[
+							{source:{begin:tb1, end:te1}, alignment: :empty},
+							tblock
+						]
 					else
-						@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment}
+						[
+							{source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty},
+							tblock
+						]
 					end
+				end
-					@block_alignment[:blocks] << {source:{begin:e1, end:-1}, target:{begin:e2, end:-1}, alignment: :empty} if e1 < str1.length
+				last_tblock = tblock
+				sum
+			end
+			if last_tblock[:source][:end] < e1
+				if last_tblock[:target][:end] < e2
+					lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, target:{begin:last_tblock[:target][:end], end:e2}, alignment: :empty}
+				else
+					lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, alignment: :empty}
 				end
 			end
-		end
-		@block_alignment[:blocks].each do |a|
-			a[:delta] = a[:target][:begin] - a[:source][:begin]
+			lblocks
 		end
 	end
+	def indices(str, target)
+	  position = 0
+	  len = target.len
+	  Enumerator.new do |yielder|
+	    while idx = str.index(target, position)
+	      yielder << idx
+	      position = idx + len
+	    end
+	  end
+	end
 	def transform_begin_position(begin_position)
 		i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
 		block = @block_alignment[:blocks][i]
-		b = if block[:alignment] == :block
+		b = if block[:alignment] == :block || block[:alignment] == :term
 			begin_position + block[:delta]
 		elsif block[:alignment] == :empty
 			if begin_position == block[:source][:begin]
 				block[:target][:begin]
 			else
-				# raise "lost annotation"
 				nil
 			end
 		else
@@ -185,13 +271,12 @@ class TextAlignment::TextAlignment
 		i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
 		block = @block_alignment[:blocks][i]
-		e = if block[:alignment] == :block
+		e = if block[:alignment] == :block || block[:alignment] == :term
 			end_position + block[:delta]
 		elsif block[:alignment] == :empty
 			if end_position == block[:source][:end]
 				block[:target][:end]
 			else
-				# raise "lost annotation"
 				nil
 			end
 		else
@@ -213,14 +298,14 @@ class TextAlignment::TextAlignment
 		@lost_annotations = []
 		denotations.each do |d|
-			begin
-				d.begin = transform_begin_position(d.begin);
-				d.end = transform_end_position(d.end);
-			rescue
-				@lost_annotations << d
-				d.begin = nil
-				d.end = nil
-			end
+			source = {begin:d.begin, end:d.end}
+			d.begin = transform_begin_position(d.begin);
+			d.end = transform_end_position(d.end);
+			raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @str2.length
+		rescue
+			@lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
+			d.begin = nil
+			d.end = nil
 		end
 		@lost_annotations
@@ -231,12 +316,12 @@ class TextAlignment::TextAlignment
 		@lost_annotations = []
 		r = hdenotations.collect do |d|
-			new_d = begin
-				d.dup.merge({span:transform_a_span(d[:span])})
-			rescue
-				@lost_annotations << d
-				nil
-			end
+			t = transform_a_span(d[:span])
+			raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @str2.length
+			new_d = d.dup.merge({span:t})
+		rescue
+			@lost_annotations << {source: d[:span], target:t}
+			nil
 		end.compact
 		r
@@ -250,13 +335,16 @@ class TextAlignment::TextAlignment
 		@block_alignment[:blocks].each do |a|
 			show += case a[:alignment]
 			when :block
-				"===== common =====\n" +
+				"===== common (block) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
+				stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
+			when :term
+				"===== common (term) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
 				stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
 			when :empty
 				"xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
-				"<<<<< string 1\n" +
+				"<<<<< string 1 [#{a[:source][:begin]} - #{a[:source][:end]}]\n" +
 				stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
-				">>>>> string 2\n" +
+				">>>>> string 2 [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
 				ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
 			else
 				astr1 = ''
@@ -290,7 +378,7 @@ class TextAlignment::TextAlignment
 					end
 				end.join('')
-				"***** local mismatch\n" +
+				"***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
 				"[#{astr1}]\n" +
 				"[#{astr2}]\n\n"
 			end
@@ -298,71 +386,4 @@ class TextAlignment::TextAlignment
 		show
 	end
-	private
-	def string_preprocessing(_str1, _str2)
-		str1 = _str1.dup
-		str2 = _str2.dup
-		mappings = TextAlignment::MAPPINGS.dup
-		## single character mappings
-		character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
-		characters_from = character_mappings.collect{|m| m[0]}.join
-		characters_to   = character_mappings.collect{|m| m[1]}.join
-		characters_to.gsub!(/-/, '\-')
-		str1.tr!(characters_from, characters_to)
-		str2.tr!(characters_from, characters_to)
-		mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
-		## long to one character mappings
-		pletters = TextAlignment::PADDING_LETTERS
-		# find the padding letter for str1
-		@padding_letter1 = begin
-			i = pletters.index{|l| str2.index(l).nil?}
-			raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
-			TextAlignment::PADDING_LETTERS[i]
-		end
-		# find the padding letter for str2
-		@padding_letter2 = begin
-			i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
-			raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
-			TextAlignment::PADDING_LETTERS[i]
-		end
-		# ASCII foldings
-		ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
-		ascii_foldings.each do |f|
-			from = f[1]
-			if str2.index(f[0])
-				to   = f[0] + (@padding_letter1 * (f[1].length - 1))
-				str1.gsub!(from, to)
-			end
-			if str1.index(f[0])
-				to   = f[0] + (@padding_letter2 * (f[1].length - 1))
-				str2.gsub!(from, to)
-			end
-		end
-		mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
-		[str1, str2, mappings]
-	end
-	def alignment_similarity(_s1, _s2, alignment)
-		return 0 if alignment.sdiff.nil?
-		# compute the lcs only with non-whitespace letters
-		lcs = alignment.sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
-		s1 = _s1.tr(@padding_letter1, ' ')
-		s2 = _s2.tr(@padding_letter2, ' ')
-		similarity = 2 * lcs / (s1.scan(/\S/).count + s2.scan(/\S/).count).to_f
-	end
 end

data/lib/text_alignment/version.rb CHANGED

@@ -1,3 +1,3 @@
 class TextAlignment
-	VERSION = '0.6.2'
+	VERSION = '0.7.2'
 end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: text_alignment
 version: !ruby/object:Gem::Version
-  version: 0.6.2
+  version: 0.7.2
 platform: ruby
 authors:
 - Jin-Dong Kim
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2020-10-06 00:00:00.000000000 Z
+date: 2020-10-12 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: ruby-dictionary