RubyGems - text_alignment - Versions diffs - 0.6.4 → 0.8.1 - Mend

text_alignment 0.6.4 → 0.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

checksums.yaml +4 -4
data/bin/align_annotations +33 -153
data/lib/text_alignment/constants.rb +1 -1
data/lib/text_alignment/mappings.rb +168 -70
data/lib/text_alignment/mixed_alignment.rb +3 -71
data/lib/text_alignment/text_alignment.rb +223 -119
data/lib/text_alignment/version.rb +1 -1
metadata +2 -2

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 87f945e356349ed709996d88ed39c8ba5b83622bde1c7fd7b9e5ff63504615c2
-  data.tar.gz: acb6e716113238c39b59a8358928de1bd936382308961a57e2c60e7bc462726f
+  metadata.gz: 0c49793b84e9ca5606a8fe9151530f6732337c2b92cfd1af1549a56ea3c2f39e
+  data.tar.gz: 4a0b8328d4c6de43af50bd8c278f83facef311c74516dc3a9a0c9dd5f91fbfc0
 SHA512:
-  metadata.gz: 4d5b862bb50b4111c6bd390e458d6761303dc394f2fa7dc9d6b821ee7461541705aecac925f700e5124eb282112567e52a51a9f15b84fa8349da25baaf68fdd9
-  data.tar.gz: a044608a58181e98664a26f410a7d59927dc4d39db8d49a147666f64254e23728ceccaa781a590712b7a74b57222cc449c37eb43a709d3f16da60aa3a55c2e6f
+  metadata.gz: ad70de67b4a7b38290a59d89bed7cd9fa343ce3db62bb67b779e51b499929b9d7c4697871cf25017fa59a34c0525741d5da679559fc2f33d902fcda692f9f1ac
+  data.tar.gz: 6242b35cbb8f53effc477b508d428d9060cba2e93895496134963755cff956f3931411a87a7bc0d33b7b91459f1a27eb67c422897693320e1b2ceb06eff6e22b

data/bin/align_annotations CHANGED

@@ -26,33 +26,43 @@ def read_text(filename)
 	end
 end
-def align_mdoc(source_annotations, target_annotations)
+def align_denotations(denotations, source_text, target_text, debug = false)
+	alignment = TextAlignment::TextAlignment.new(source_text, target_text, denotations)
+	new_denotations = alignment.transform_hdenotations(denotations)
+	if debug
+		warn "[block alignment]"
+		warn alignment.alignment_show
+		warn "-----"
+	end
+	lost_annotations = alignment.lost_annotations
+	unless lost_annotations.empty?
+		warn "\n[lost annotations] #{lost_annotations.length}"
+		lost_annotations.each do |a|
+			warn "#{a}"
+		end
+		warn "====="
+	end
+	warn
+	# return target annotations
+	new_denotations
+end
+def align_mannotations(source_annotations, target_text, debug = false)
+	target_annotations = {text:target_text}
 	idnum_denotations = 0
 	idnum_relations = 0
 	idnum_attributes = 0
 	idnum_modifications = 0
-	source_annotations.each do |annotations|
-		alignment = TextAlignment::TextAlignment.new(annotations[:text], target_annotations[:text])
-		puts alignment.alignment_show
-		puts "-----"
-		puts
-		# alignment.block_alignments.each do |a|
-			# p {source:a[:source], target:a[:target]}
-			# puts "--"
-			# p a[:alignment] if a[:alignment].nil? || a[:alignment] == :empty
-			# puts "--"
-			# puts annotations[:text][a[:source][:begin] ... a[:source][:end]]
-			# puts "--"
-			# puts target_text[a[:target][:begin] ... a[:target][:end]]
-			# puts "======"
-		# end
+	source_annotations.each_with_index do |annotations, i|
 		if annotations.has_key?(:denotations) && !annotations[:denotations].empty?
 			ididx = {}
-			denotations = alignment.transform_hdenotations(annotations[:denotations])
+			warn "[#{i}]-=-=-=-=-"
+			denotations = align_denotations(annotations[:denotations], annotations[:text], target_text, debug)
 			denotations.each do |d|
 				reid = 'T' + (idnum_denotations += 1).to_s
 				ididx[d[:id]] = reid
@@ -101,141 +111,11 @@ end
 source_annotations = read_annotations(ARGV[0])
 target_text = read_text(ARGV[1])
-lost_annotations = []
 target_annotations = if source_annotations.class == Array
-	align_mdoc(source_annotations, {text: target_text})
+	align_mannotations(source_annotations, target_text, false)
 else
-	alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text)
-	# pp alignment
-	# verification
-	# source_text = source_annotations[:text]
-	# puts "=====BEGIN"
-	# (0 ... source_text.rstrip.length).each do |p|
-	# 	t = alignment.transform_begin_position(p)
-	# 	if t.nil?
-	# 		print source_text[p]
-	# 	else
-	# 		print '.'
-	# 	end
-	# end
-	# puts
-	# puts "=====END"
-	# puts "=====BEGIN"
-	# (0 .. source_text.rstrip.length).each do |p|
-	# 	t = alignment.transform_end_position(p)
-	# 	if t.nil?
-	# 		print source_text[p]
-	# 	else
-	# 		print '.'
-	# 	end
-	# end
-	# puts
-	# puts "=====END"
-	source_text = source_annotations[:text]
-	puts "[block alignment]"
-	puts alignment.alignment_show
-	puts "====="
-	# exit
-	# verification of source denotations
-	puts "[Invalid source denotations]"
-	source_annotations[:denotations] do |d|
-		p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < source_text.length
-	end
-	puts "====="
-	puts
-	denotations = alignment.transform_hdenotations(source_annotations[:denotations])
-	puts "[Invalid transformation]"
-	denotations.each do |d|
-		p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < target_text.length
-	end
-	puts "====="
-	puts
-	lost_annotations += alignment.lost_annotations if alignment.lost_annotations
+	denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], target_text, false)
 	source_annotations.merge({text:target_text, denotations:denotations})
 end
-num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source = if source_annotations.class == Array
-	num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source = 0, 0, 0, 0
-	source_annotations.each do |annotations|
-		num_denotations_source += annotations[:denotations].nil? ? 0 : annotations[:denotations].length
-		num_relations_source += annotations[:relations].nil? ? 0 : annotations[:relations].length
-		num_attributes_source += annotations[:attributes].nil? ? 0 : annotations[:attributes].length
-		num_modifications_source += annotations[:modifications].nil? ? 0 : annotations[:modifications].length
-	end
-	[num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source]
-else
-	num_denotations_source = source_annotations[:denotations].nil? ? 0 : source_annotations[:denotations].length
-	num_relations_source = source_annotations[:relations].nil? ? 0 : source_annotations[:relations].length
-	num_attributes_source = source_annotations[:attributes].nil? ? 0 : source_annotations[:attributes].length
-	num_modifications_source = source_annotations[:modifications].nil? ? 0 : source_annotations[:modifications].length
-	[num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source]
-end
-warn "[source]"
-warn "denotations:\t#{num_denotations_source}"
-# warn "relations:\t#{num_relations_source}"
-# warn "attributes:\t#{num_attributes_source}"
-# warn "modifications:\t#{num_modifications_source}"
-warn "\n[target]"
-warn "denotations:\t#{target_annotations[:denotations].nil? ? 0 : target_annotations[:denotations].length}"
-# warn "relations:\t#{target_annotations[:relations].nil? ? 0 : target_annotations[:relations].length}"
-# warn "attributes:\t#{target_annotations[:attributes].nil? ? 0 : target_annotations[:attributes].length}"
-# warn "modifications:\t#{target_annotations[:modifications].nil? ? 0 : target_annotations[:modifications].length}"
-if lost_annotations
-	warn "\n[lost annotations]"
-	warn "#{lost_annotations.length}"
-end
-#puts target_annotations.to_json
-# denotations = anns1[:denotations]
-# puts "[Alignment1]====="
-# align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::MAPPINGS)
-# align.alignment.each do |a|
-# 	p [a[:target][:begin], a[:target][:end], a[:source][:begin], a[:source][:end]]
-# end
-# puts TextAlignment::sdiff2cdiff(align.sdiff)
-# puts
-# puts "[Similarity]\n#{align.similarity}"
-# puts
-# puts '[Denotations original]'
-# pp denotations
-# puts
-# puts '[Denotations transformed]'
-# new_denotations = align.transform_hdenotations(denotations)
-# pp new_denotations
-# puts
-# puts "[Alignment2 (downcased)]====="
-# align = TextAlignment::TextAlignment.new(str1.downcase, str2.downcase, TextAlignment::MAPPINGS)
-# puts TextAlignment::sdiff2cdiff(align.sdiff)
-# puts
-# puts "[Similarity]\n#{align.similarity}"
-# puts
-# puts '[Denotations original]'
-# pp denotations
-# puts
-# puts '[Denotations transformed]'
-# new_denotations = align.transform_hdenotations(denotations)
-# pp new_denotations
-# puts
-# puts '[Annotations transformed]'
-# anns2[:denotations] = new_denotations
-# puts anns2.to_json
-# p align.common_elements
-# puts "---------------"
-# p align.mapped_elements
+# puts target_annotations.to_json

data/lib/text_alignment/constants.rb CHANGED

@@ -1,7 +1,7 @@
 module TextAlignment; end unless defined? TextAlignment
 TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
-TextAlignment::SIZE_WINDOW = 60 unless defined? TextAlignment::SIZE_WINDOW
+TextAlignment::SIZE_WINDOW = 10 unless defined? TextAlignment::SIZE_WINDOW
 TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
 TextAlignment::BUFFER_MIN = 20 unless defined? TextAlignment::BUFFER_MIN
 TextAlignment::TEXT_SIMILARITY_THRESHOLD = 0.9 unless defined? TextAlignment::TEXT_SIMILARITY_THRESHOLD

data/lib/text_alignment/mappings.rb CHANGED

@@ -1,74 +1,172 @@
 module TextAlignment; end unless defined? TextAlignment
 TextAlignment::MAPPINGS = [
-	["©", "(c)"],   #U+00A9 (Copyright Sign)
-	["α", "alpha"],   #U+03B1 (greek small letter alpha)
-	["β", "beta"],    #U+03B2 (greek small letter beta)
-	["γ", "gamma"],   #U+03B3 (greek small letter gamma)
-	["δ", "delta"],   #U+03B4 (greek small letter delta)
-	["ε", "epsilon"], #U+03B5 (greek small letter epsilon)
-	["ζ", "zeta"],    #U+03B6 (greek small letter zeta)
-	["η", "eta"],     #U+03B7 (greek small letter eta)
-	["θ", "theta"],   #U+03B7 (greek small letter eta)
-	["ι", "iota"],    #U+03B7 (greek small letter eta)
-	["κ", "kappa"],   #U+03BA (greek small letter kappa)
-	["λ", "lambda"],  #U+03BB (greek small letter lambda)
-	["λ", "lamda"],  #U+03BB (greek small letter lambda)
-	["μ", "mu"],      #U+03BC (greek small letter mu)
-	["ν", "nu"],      #U+03BD (greek small letter nu)
-	["ξ", "xi"],      #U+03BE (greek small letter xi)
-	["ο", "omicron"], #U+03BF (greek small letter omicron)
-	["π", "pi"],      #U+03C0 (greek small letter pi)
-	["ρ", "rho"],     #U+03C1 (greek small letter rho)
-	["σ", "sigma"],   #U+03C3 (greek small letter sigma)
-	["τ", "tau"],     #U+03C4 (greek small letter tau)
-	["υ", "upsilon"], #U+03C5 (greek small letter upsilon)
-	["φ", "phi"],     #U+03C6 (greek small letter phi)
-	["χ", "chi"],     #U+03C7 (greek small letter chi)
-	["ψ", "psi"],     #U+03C8 (greek small letter psi)
-	["ω", "omega"],   #U+03C9 (greek small letter omega)
-	["Α", "Alpha"],   #U+0391 (greek capital letter alpha)
-	["Β", "Beta"],    #U+0392 (greek capital letter beta)
-	["Γ", "Gamma"],   #U+0393 (greek capital letter gamma)
-	["Δ", "Delta"],   #U+0394 (greek capital letter delta)
-	["Ε", "Epsilon"], #U+0395 (greek capital letter epsilon)
-	["Ζ", "Zeta"],    #U+0396 (greek capital letter zeta)
-	["Η", "Eta"],     #U+0397 (greek capital letter eta)
-	["Θ", "Theta"],   #U+0398 (greek capital letter theta)
-	["Ι", "Iota"],    #U+0399 (greek capital letter iota)
-	["Κ", "Kappa"],   #U+039A (greek capital letter kappa)
-	["Λ", "Lambda"],  #U+039B (greek capital letter lambda)
-	["Λ", "Lamda"],  #U+039B (greek capital letter lambda)
-	["Μ", "Mu"],      #U+039C (greek capital letter mu)
-	["Ν", "Nu"],      #U+039D (greek capital letter nu)
-	["Ξ", "Xi"],      #U+039E (greek capital letter xi)
-	["Ο", "Omicron"], #U+039F (greek capital letter omicron)
-	["Π", "Pi"],      #U+03A0 (greek capital letter pi)
-	["Ρ", "Rho"],     #U+03A1 (greek capital letter rho)
-	["Σ", "Sigma"],   #U+03A3 (greek capital letter sigma)
-	["Τ", "Tau"],     #U+03A4 (greek capital letter tau)
-	["Υ", "Upsilon"], #U+03A5 (greek capital letter upsilon)
-	["Φ", "Phi"],     #U+03A6 (greek capital letter phi)
-	["Χ", "Chi"],     #U+03A7 (greek capital letter chi)
-	["Ψ", "Psi"],     #U+03A8 (greek capital letter Psi)
-	["Ω", "Omega"],   #U+03A9 (greek capital letter omega)
-	["ϕ", "phi"],     #U+03D5 (greek phi symbol)
-	["×", "x"],       #U+00D7 (multiplication sign)
-	["•", "*"],       #U+2022 (bullet)
-	[" ", " "],       #U+2009 (thin space)
-	[" ", " "],       #U+200A (hair space)
-	[" ", " "],       #U+00A0 (no-break space)
-	["　", " "],       #U+3000 (ideographic space)
-	["−", "-"],       #U+2212 (minus sign)
-	["–", "-"],       #U+2013 (en dash)
-	["′", "'"],       #U+2032 (prime)
-	["‘", "'"],       #U+2018 (left single quotation mark)
-	["’", "'"],       #U+2019 (right single quotation mark)
-	["“", '"'],       #U+201C (left double quotation mark)
-	["”", '"'],        #U+201D (right double quotation mark)
+	["©", "(c)"],			#U+00A9 (Copyright Sign)
+	["α", "alpha"],		#U+03B1 (greek small letter alpha)
+	["β", "beta"],		#U+03B2 (greek small letter beta)
+	["γ", "gamma"],		#U+03B3 (greek small letter gamma)
+	["δ", "delta"],		#U+03B4 (greek small letter delta)
+	["ε", "epsilon"],	#U+03B5 (greek small letter epsilon)
+	["ζ", "zeta"],		#U+03B6 (greek small letter zeta)
+	["η", "eta"],			#U+03B7 (greek small letter eta)
+	["θ", "theta"],		#U+03B8 (greek small letter theta)
+	["ι", "iota"],		#U+03B9 (greek small letter iota)
+	["κ", "kappa"],		#U+03BA (greek small letter kappa)
+	["λ", "lambda"],	#U+03BB (greek small letter lambda)
+	["λ", "lamda"],		#U+03BB (greek small letter lambda)
+	["μ", "mu"],			#U+03BC (greek small letter mu)
+	["ν", "nu"],			#U+03BD (greek small letter nu)
+	["ξ", "xi"],			#U+03BE (greek small letter xi)
+	["ο", "omicron"],	#U+03BF (greek small letter omicron)
+	["π", "pi"],			#U+03C0 (greek small letter pi)
+	["ρ", "rho"],			#U+03C1 (greek small letter rho)
+	["σ", "sigma"],		#U+03C3 (greek small letter sigma)
+	["τ", "tau"],			#U+03C4 (greek small letter tau)
+	["υ", "upsilon"],	#U+03C5 (greek small letter upsilon)
+	["φ", "phi"],			#U+03C6 (greek small letter phi)
+	["χ", "chi"],			#U+03C7 (greek small letter chi)
+	["ψ", "psi"],			#U+03C8 (greek small letter psi)
+	["ω", "omega"],		#U+03C9 (greek small letter omega)
+	["Α", "Alpha"],		#U+0391 (greek capital letter alpha)
+	["Β", "Beta"],		#U+0392 (greek capital letter beta)
+	["Γ", "Gamma"],		#U+0393 (greek capital letter gamma)
+	["Δ", "Delta"],		#U+0394 (greek capital letter delta)
+	["Ε", "Epsilon"],	#U+0395 (greek capital letter epsilon)
+	["Ζ", "Zeta"],		#U+0396 (greek capital letter zeta)
+	["Η", "Eta"],			#U+0397 (greek capital letter eta)
+	["Θ", "Theta"],		#U+0398 (greek capital letter theta)
+	["Ι", "Iota"],		#U+0399 (greek capital letter iota)
+	["Κ", "Kappa"],		#U+039A (greek capital letter kappa)
+	["Λ", "Lambda"],	#U+039B (greek capital letter lambda)
+	["Λ", "Lamda"],		#U+039B (greek capital letter lambda)
+	["Μ", "Mu"],			#U+039C (greek capital letter mu)
+	["Ν", "Nu"],			#U+039D (greek capital letter nu)
+	["Ξ", "Xi"],			#U+039E (greek capital letter xi)
+	["Ο", "Omicron"],	#U+039F (greek capital letter omicron)
+	["Π", "Pi"],			#U+03A0 (greek capital letter pi)
+	["Ρ", "Rho"],			#U+03A1 (greek capital letter rho)
+	["Σ", "Sigma"],		#U+03A3 (greek capital letter sigma)
+	["Τ", "Tau"],			#U+03A4 (greek capital letter tau)
+	["Υ", "Upsilon"],	#U+03A5 (greek capital letter upsilon)
+	["Φ", "Phi"],			#U+03A6 (greek capital letter phi)
+	["Χ", "Chi"],			#U+03A7 (greek capital letter chi)
+	["Ψ", "Psi"],			#U+03A8 (greek capital letter Psi)
+	["Ω", "Omega"],		#U+03A9 (greek capital letter omega)
+	["ϕ", "phi"],			#U+03D5 (greek phi symbol)
+	["×", "x"],				#U+00D7 (multiplication sign)
+	["•", "*"],				#U+2022 (bullet)
+	[" ", " "],				#U+2009 (thin space)
+	[" ", " "],				#U+200A (hair space)
+	[" ", " "],				#U+00A0 (no-break space)
+	["　", " "],				#U+3000 (ideographic space)
+	["‑", "-"],				#U+2011 (Non-Breaking Hyphen)
+	["−", "-"],				#U+2212 (minus sign)
+	["–", "-"],				#U+2013 (en dash)
+	["′", "'"],				#U+2032 (prime)
+	["‘", "'"],				#U+2018 (left single quotation mark)
+	["’", "'"],				#U+2019 (right single quotation mark)
+	["“", '"'],				#U+201C (left double quotation mark)
+	["”", '"'],				#U+201D (right double quotation mark)
 	['"', "''"]
-  ]
+]
+TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
+class << TextAlignment
+	def single_character_mapping_preprocessing(_str1, _str2, _mappings = nil)
+		_mappings ||= TextAlignment::MAPPINGS
+		character_mappings = _mappings.select{|m| m[0].length == 1 && m[1].length == 1}
+		if character_mappings.empty?
+			[_str1, _str2, _mappings]
+		else
+			characters_from = character_mappings.collect{|m| m[0]}.join
+			characters_to   = character_mappings.collect{|m| m[1]}.join
+			characters_to.gsub!(/-/, '\-')
+			str1 = _str1.tr(characters_from, characters_to)
+			str2 = _str2.tr(characters_from, characters_to)
+			mappings = _mappings.select{|m| m[0].length > 1 || m[1].length > 1}
+			[str1, str2, mappings]
+		end
+	end
+	def long_to_one_mapping_preprocessing(_str1, _str2, _mappings = nil)
+		_mappings ||= TextAlignment::MAPPINGS
+		long_to_one_mappings = _mappings.select{|m| m[0].length == 1 && m[1].length > 1}
+		if long_to_one_mappings.empty?
+			[_str1, _str2, _mappings]
+		else
+			## long to one character mappings
+			pletters = TextAlignment::PADDING_LETTERS
+			# find the padding letter for str1
+			@padding_letter1 = begin
+				i = pletters.index{|l| _str2.index(l).nil?}
+				raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
+				TextAlignment::PADDING_LETTERS[i]
+			end
+			# find the padding letter for str2
+			@padding_letter2 = begin
+				i = pletters.index{|l| l != @padding_letter1 && _str1.index(l).nil?}
+				raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
+				TextAlignment::PADDING_LETTERS[i]
+			end
+			str1 = str2 = nil
+			long_to_one_mappings.each do |f|
+				from = f[1]
+				str1 = if _str2.index(f[0])
+					to = f[0] + (@padding_letter1 * (f[1].length - 1))
+					_str1.gsub(from, to)
+				else
+					_str1
+				end
+				str2 = if _str1.index(f[0])
+					to = f[0] + (@padding_letter2 * (f[1].length - 1))
+					_str2.gsub(from, to)
+				else
+					_str2
+				end
+			end
+			mappings = _mappings.select{|m| m[0].length > 1 || m[1].length == 1}
+			[str1, str2, mappings]
+		end
+	end
+	def compute_similarity(_s1, _s2, sdiff)
+		return 0 if sdiff.nil?
+		# compute the lcs only with non-whitespace letters
+		lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
+		return 0 if lcs == 0
+		s1 = if @padding_letter1
+			_s1.tr(@padding_letter1, ' ')
+		else
+			_s1
+		end
+		s2 = if @padding_letter2
+			_s2.tr(@padding_letter2, ' ')
+		else
+			_s2
+		end
+		similarity = lcs.to_f / [s1.scan(/\S/).count, s2.scan(/\S/).count].min
+	end
+end

data/lib/text_alignment/mixed_alignment.rb CHANGED

@@ -17,10 +17,10 @@ class TextAlignment::MixedAlignment
 	attr_reader :similarity
 	attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final
-	def initialize(_str1, _str2)
+	def initialize(_str1, _str2, _mappings = nil)
 		raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
-		str1, str2, mappings = string_preprocessing(_str1, _str2)
+		str1, str2, mappings = TextAlignment::long_to_one_mapping_preprocessing(_str1, _str2, _mappings)
 		_compute_mixed_alignment(str1, str2, mappings)
 	end
@@ -63,7 +63,7 @@ class TextAlignment::MixedAlignment
 		end
 		cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
-		@similarity         = compute_similarity(str1, str2, @sdiff)
+		@similarity         = TextAlignment::compute_similarity(str1, str2, @sdiff)
 		@str1_match_initial = cmp.str1_match_initial
 		@str1_match_final   = cmp.str1_match_final
 		@str2_match_initial = cmp.str2_match_initial
@@ -139,72 +139,4 @@ class TextAlignment::MixedAlignment
 		@position_map_end = posmap_end.sort.to_h
 	end
-	private
-	def string_preprocessing(_str1, _str2)
-		str1 = _str1.dup
-		str2 = _str2.dup
-		mappings = TextAlignment::MAPPINGS.dup
-		## single character mappings
-		character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
-		characters_from = character_mappings.collect{|m| m[0]}.join
-		characters_to   = character_mappings.collect{|m| m[1]}.join
-		characters_to.gsub!(/-/, '\-')
-		str1.tr!(characters_from, characters_to)
-		str2.tr!(characters_from, characters_to)
-		mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
-		## long to one character mappings
-		pletters = TextAlignment::PADDING_LETTERS
-		# find the padding letter for str1
-		@padding_letter1 = begin
-			i = pletters.index{|l| str2.index(l).nil?}
-			raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
-			TextAlignment::PADDING_LETTERS[i]
-		end
-		# find the padding letter for str2
-		@padding_letter2 = begin
-			i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
-			raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
-			TextAlignment::PADDING_LETTERS[i]
-		end
-		# ASCII foldings
-		ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
-		ascii_foldings.each do |f|
-			from = f[1]
-			if str2.index(f[0])
-				to   = f[0] + (@padding_letter1 * (f[1].length - 1))
-				str1.gsub!(from, to)
-			end
-			if str1.index(f[0])
-				to   = f[0] + (@padding_letter2 * (f[1].length - 1))
-				str2.gsub!(from, to)
-			end
-		end
-		mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
-		[str1, str2, mappings]
-	end
-	def compute_similarity(_s1, _s2, sdiff)
-		return 0 if sdiff.nil?
-		# compute the lcs only with non-whitespace letters
-		lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
-		return 0 if lcs == 0
-		s1 = _s1.tr(@padding_letter1, ' ')
-		s2 = _s2.tr(@padding_letter2, ' ')
-		similarity = lcs / [s1.scan(/\S/).count, s2.scan(/\S/).count].min.to_f
-	end
 end

data/lib/text_alignment/text_alignment.rb CHANGED

@@ -5,50 +5,44 @@ require 'text_alignment/mixed_alignment'
 module TextAlignment; end unless defined? TextAlignment
-TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
 class TextAlignment::TextAlignment
 	attr_reader :block_alignment
 	attr_reader :similarity
 	attr_reader :lost_annotations
-	def initialize(str1, str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
-		raise ArgumentError, "nil string" if str1.nil? || str2.nil?
+	def initialize(_str1, _str2, denotations = nil, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
+		raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
-		@block_alignment = {source_text:str1, target_text:str2}
+		@block_alignment = {source_text:_str1, target_text:_str2}
+		@original_str1 = _str1
+		@original_str2 = _str2
-		# try exact match
-		block_begin = str2.index(str1)
-		unless block_begin.nil?
-			@block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
-			return @block_alignment
-		end
+		str1, str2, @mappings = TextAlignment::single_character_mapping_preprocessing(_str1, _str2)
-		# try exact match
-		block_begin = str2.downcase.index(str1.downcase)
-		unless block_begin.nil?
-			@block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
-			return @block_alignment
+		if r = whole_block_alignment(str1, str2)
+			@block_alignment[:blocks] = r
+			return
 		end
+		## to find block alignments
 		anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
-		# To collect matched blocks
-		mblocks = []
-		while anchor = anchor_finder.get_next_anchor
-			last = mblocks.last
-			if last && (anchor[:source][:begin] == last[:source][:end] + 1) && (anchor[:target][:begin] == last[:target][:end] + 1)
-				last[:source][:end] = anchor[:source][:end]
-				last[:target][:end] = anchor[:target][:end]
+		blocks = []
+		while block = anchor_finder.get_next_anchor
+			last = blocks.last
+			if last && (block[:source][:begin] == last[:source][:end] + 1) && (block[:target][:begin] == last[:target][:end] + 1)
+				last[:source][:end] = block[:source][:end]
+				last[:target][:end] = block[:target][:end]
 			else
-				mblocks << anchor
+				blocks << block.merge(alignment: :block, delta: block[:target][:begin] - block[:source][:begin])
 			end
 		end
-		# pp mblocks
+		# pp blocks
 		# puts "-----"
 		# puts
-		# mblocks.each do |b|
+		# exit
+		# blocks.each do |b|
 		# 	p [b[:source], b[:target]]
 		# 	puts "---"
 		# 	puts str1[b[:source][:begin] ... b[:source][:end]]
@@ -60,114 +54,218 @@ class TextAlignment::TextAlignment
 		# puts "-=-=-=-=-"
 		# puts
-		## To find block alignments
-		@block_alignment[:blocks] = []
-		return if mblocks.empty?
-		# Initial step
-		if mblocks[0][:source][:begin] > 0
-			e1 = mblocks[0][:source][:begin]
-			e2 = mblocks[0][:target][:begin]
+		## to fill the gaps
+		last_block = nil
+		blocks2 = blocks.inject([]) do |sum, block|
+			b1 = last_block ? last_block[:source][:end] : 0
+			e1 = block[:source][:begin]
-			if mblocks[0][:target][:begin] == 0
-				@block_alignment[:blocks] << {source:{begin:0, end:e1}, target:{begin:0, end:0}, alignment: :empty}
+			sum += if b1 == e1
+				[block]
 			else
-				_str1 = str1[0 ... e1]
-				_str2 = str2[0 ... e2]
+				b2 = last_block ? last_block[:target][:end] : 0
+				e2 = block[:target][:begin]
+				if b2 == e2
+					[
+						{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
+						block
+					]
+				else
+					if b1 == 0 && b2 == 0
+						len_buffer = (e1 * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
+						b2 = e2 - len_buffer if e2 > len_buffer
+					end
-				unless _str1.strip.empty?
-					if _str2.strip.empty?
-						@block_alignment[:blocks] << {source:{begin:0, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
+					_str1 = str1[b1 ... e1]
+					_str2 = str2[b2 ... e2]
+					if _str1.strip.empty? || _str2.strip.empty?
+						[
+							{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
+							block
+						]
 					else
-						len_min = [_str1.length, _str2.length].min
-						len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
-						b1 = _str1.length < len_buffer ? 0 : e1 - len_buffer
-						b2 = _str2.length < len_buffer ? 0 : e2 - len_buffer
-						@block_alignment[:blocks] << {source:{begin:0, end:b1}, target:{begin:0, end:b2}, alignment: :empty} if b1 > 0
-						_str1 = str1[b1 ... e1]
-						_str2 = str2[b2 ... e2]
-						alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase)
-						if alignment.similarity < 0.5
-							@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: alignment.similarity}
-						else
-							@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}
-						end
+						local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations) << block
 					end
 				end
 			end
+			last_block = block
+			sum
 		end
-		@block_alignment[:blocks] << mblocks[0].merge(alignment: :block)
-		(1 ... mblocks.length).each do |i|
-			b1 = mblocks[i - 1][:source][:end]
-			b2 = mblocks[i - 1][:target][:end]
-			e1 = mblocks[i][:source][:begin]
-			e2 = mblocks[i][:target][:begin]
-			_str1 = str1[b1 ... e1]
-			_str2 = str2[b2 ... e2]
-			unless _str1.strip.empty?
-				if _str2.strip.empty?
-					@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
+		# the last step
+		blocks2 += if last_block.nil?
+			local_alignment_blocks(str1, 0, str1.length, str2, 0, str2.length, denotations)
+		else
+			b1 = last_block[:source][:end]
+			if b1 < str1.length
+				e1 = str1.length
+				b2 = last_block[:target][:end]
+				if b2 < str2.length
+					len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
+					e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
+					local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
 				else
-					alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase)
-					if alignment.similarity < 0.5
-						@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: alignment.similarity}
-					else
-						@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}
+					[{source:{begin:last_block[:source][:end], end:str1.length}, alignment: :empty}]
+				end
+			else
+				[]
+			end
+		end
+		@block_alignment[:blocks] = blocks2
+	end
+	def whole_block_alignment(str1, str2)
+		## Block exact match
+		block_begin = str2.index(str1)
+		unless block_begin.nil?
+			return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
+		end
+		block_begin = str2.downcase.index(str1.downcase)
+		unless block_begin.nil?
+			return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
+		end
+		nil
+	end
+	def local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations = nil)
+		block2 = str2[b2 ... e2]
+		## term-based alignment
+		tblocks = if denotations
+			ds_in_scope = denotations.select{|d| d[:span][:begin] >= b1 && d[:span][:end] <= e1}.
+							sort{|d1, d2| d1[:span][:begin] <=> d2[:span][:begin] || d2[:span][:end] <=> d1[:span][:end] }.
+							map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
+			position = 0
+			tblocks = ds_in_scope.map do |term|
+				lex = term[:lex]
+				r = block2.index(lex, position)
+				if r.nil?
+					position = nil
+					break
+				end
+				position = r + lex.length
+				{source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, delta: r - term[:span][:begin]}
+			end
+			# missing term found
+			tblocks = [] if position.nil?
+			# redundant matching found
+			unless position.nil?
+				ds_in_scope.each do |term|
+					lex = term[:lex]
+					look_forward = block2.index(lex, position)
+					unless look_forward.nil?
+						tblocks = []
+						break
 					end
 				end
 			end
-			@block_alignment[:blocks] << mblocks[i].merge(alignment: :block)
+			tblocks
+		else
+			[]
 		end
-		# Final step
-		if  mblocks[-1][:source][:end] < str1.length && mblocks[-1][:target][:end] < str2.length
-			b1 = mblocks[-1][:source][:end]
-			b2 = mblocks[-1][:target][:end]
-			_str1 = str1[b1 ... str1.length]
-			_str2 = str2[b2 ... str2.length]
+		if tblocks.empty?
+			if b1 == 0 && e1 == str1.length
+				if (e1 > 2000) || (e2 > 2000)
+					[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
+				else
+					block1 = str1[b1 ... e1]
+					block2 = str2[b2 ... e2]
-			unless _str1.strip.empty?
-				if _str2.strip.empty?
-					@block_alignment[:blocks] << {source:{begin:b1, end:str1.length}, target:{begin:b2, end:str2.length}, alignment: :empty}
+					## character-based alignment
+					alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase, @mappings)
+					if alignment.sdiff.nil?
+						[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
+					else
+						[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
+					end
+				end
+			else
+				block1 = str1[b1 ... e1]
+				block2 = str2[b2 ... e2]
+				## character-based alignment
+				alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase, @mappings)
+				if alignment.sdiff.nil?
+					[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
 				else
-					len_min = [_str1.length, _str2.length].min
-					len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
-					e1 = _str1.length < len_buffer ? str1.length : b1 + len_buffer
-					e2 = _str2.length < len_buffer ? str2.length : b2 + len_buffer
-					_str1 = str1[b1 ... e1]
-					_str2 = str2[b2 ... e2]
+					[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
+				end
+			end
+		else
+			last_tblock = nil
+			lblocks = tblocks.inject([]) do |sum, tblock|
+				tb1 = last_tblock ? last_tblock[:source][:end] : b1
+				te1 = tblock[:source][:begin]
-					alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase)
-					if alignment.similarity < 0.5
-						@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty, similarity: alignment.similarity}
+				sum += if te1 == tb1
+					[tblock]
+				else
+					tb2 = last_tblock ? last_tblock[:target][:end] : b2
+					te2 = tblock[:target][:begin]
+					if b2 == e2
+						[
+							{source:{begin:tb1, end:te1}, alignment: :empty},
+							tblock
+						]
 					else
-						@block_alignment[:blocks] << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}
+						[
+							{source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty},
+							tblock
+						]
 					end
+				end
+				last_tblock = tblock
+				sum
+			end
-					@block_alignment[:blocks] << {source:{begin:e1, end:-1}, target:{begin:e2, end:-1}, alignment: :empty} if e1 < str1.length
+			if last_tblock[:source][:end] < e1
+				if last_tblock[:target][:end] < e2
+					lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, target:{begin:last_tblock[:target][:end], end:e2}, alignment: :empty}
+				else
+					lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, alignment: :empty}
 				end
 			end
-		end
-		@block_alignment[:blocks].each do |a|
-			a[:delta] = a[:target][:begin] - a[:source][:begin]
+			lblocks
 		end
 	end
+	def indices(str, target)
+	  position = 0
+	  len = target.len
+	  Enumerator.new do |yielder|
+	    while idx = str.index(target, position)
+	      yielder << idx
+	      position = idx + len
+	    end
+	  end
+	end
 	def transform_begin_position(begin_position)
 		i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
 		block = @block_alignment[:blocks][i]
-		b = if block[:alignment] == :block
+		b = if block[:alignment] == :block || block[:alignment] == :term
 			begin_position + block[:delta]
 		elsif block[:alignment] == :empty
 			if begin_position == block[:source][:begin]
 				block[:target][:begin]
 			else
-				# raise "lost annotation"
 				nil
 			end
 		else
@@ -180,13 +278,12 @@ class TextAlignment::TextAlignment
 		i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
 		block = @block_alignment[:blocks][i]
-		e = if block[:alignment] == :block
+		e = if block[:alignment] == :block || block[:alignment] == :term
 			end_position + block[:delta]
 		elsif block[:alignment] == :empty
 			if end_position == block[:source][:end]
 				block[:target][:end]
 			else
-				# raise "lost annotation"
 				nil
 			end
 		else
@@ -208,14 +305,14 @@ class TextAlignment::TextAlignment
 		@lost_annotations = []
 		denotations.each do |d|
-			begin
-				d.begin = transform_begin_position(d.begin);
-				d.end = transform_end_position(d.end);
-			rescue
-				@lost_annotations << d
-				d.begin = nil
-				d.end = nil
-			end
+			source = {begin:d.begin, end:d.end}
+			d.begin = transform_begin_position(d.begin);
+			d.end = transform_end_position(d.end);
+			raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_str2.length
+		rescue
+			@lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
+			d.begin = nil
+			d.end = nil
 		end
 		@lost_annotations
@@ -226,12 +323,12 @@ class TextAlignment::TextAlignment
 		@lost_annotations = []
 		r = hdenotations.collect do |d|
-			new_d = begin
-				d.dup.merge({span:transform_a_span(d[:span])})
-			rescue
-				@lost_annotations << d
-				nil
-			end
+			t = transform_a_span(d[:span])
+			raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_str2.length
+			new_d = d.dup.merge({span:t})
+		rescue
+			@lost_annotations << {source: d[:span], target:t}
+			nil
 		end.compact
 		r
@@ -245,14 +342,22 @@ class TextAlignment::TextAlignment
 		@block_alignment[:blocks].each do |a|
 			show += case a[:alignment]
 			when :block
-				"===== common ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
+				"===== common (block) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
+				stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
+			when :term
+				"===== common (term) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
 				stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
 			when :empty
 				"xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
 				"<<<<< string 1 [#{a[:source][:begin]} - #{a[:source][:end]}]\n" +
 				stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
-				">>>>> string 2 [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
-				ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
+				">>>>> string 2 " +
+				if a[:target]
+					"[#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
+					ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
+				else
+					"[-]\n\n"
+				end
 			else
 				astr1 = ''
 				astr2 = ''
@@ -292,5 +397,4 @@ class TextAlignment::TextAlignment
 		end
 		show
 	end
 end

data/lib/text_alignment/version.rb CHANGED

@@ -1,3 +1,3 @@
 class TextAlignment
-	VERSION = '0.6.4'
+	VERSION = '0.8.1'
 end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: text_alignment
 version: !ruby/object:Gem::Version
-  version: 0.6.4
+  version: 0.8.1
 platform: ruby
 authors:
 - Jin-Dong Kim
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2020-10-06 00:00:00.000000000 Z
+date: 2020-10-26 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: ruby-dictionary