RubyGems - text_alignment - Versions diffs - 0.7 → 0.9 - Mend

text_alignment 0.7 → 0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

checksums.yaml +4 -4
data/bin/align_annotations +33 -138
data/lib/text_alignment/constants.rb +1 -1
data/lib/text_alignment/mappings.rb +168 -70
data/lib/text_alignment/mixed_alignment.rb +3 -71
data/lib/text_alignment/text_alignment.rb +50 -33
data/lib/text_alignment/version.rb +1 -1
metadata +2 -2

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 40796a906cbc366312741d3b410aaee15d28b778980674bb79d19ea1b6364d02
-  data.tar.gz: eb0121440bc005232b3d35eb1867b46004eb14c894b630f2ef49d311c4d99a26
+  metadata.gz: 1c44cc3036273c8c34800d8f78a79316c53efb80fe45ad81092a6172da3b03c6
+  data.tar.gz: 50ab44cc66b50bf732e99f900c10584025c6ed498603ccf3afd75de90cac4b79
 SHA512:
-  metadata.gz: e29037021763c9d4f581b278b72d319d364431e8abad48b45f21d68df102928eec993bbba582383a8a01b6ba7855f3542d2cc620ffa055a90aea08c2f5371117
-  data.tar.gz: 63c0e5b712b89012082197d3c4400cd705283e6d908f40003a399cb2efdc2b3aa4f6e445eb90e1ab4d7d39a9d14421881f945e0483c14ec52c1af7176ca07e81
+  metadata.gz: 98645c1ba4566c822d1e6ba6488e4ecdfe100c30923cc7effe7d2a4390ebb6901707e8c9f6a12145e2f98515bc6792afef4f9bfa5fcd683c77d3a5cf599094c7
+  data.tar.gz: 11657abdb8acb64c8edfd5271bbf78d2a75024753180988030c5ce6722b4da2781760e583ca6e33ed469cca85e4a2f8e28af6ef4dc62029ada5bd8a184200dfb

data/bin/align_annotations CHANGED

@@ -26,33 +26,43 @@ def read_text(filename)
 	end
 end
-def align_mdoc(source_annotations, target_annotations)
+def align_denotations(denotations, source_text, target_text, debug = false)
+	alignment = TextAlignment::TextAlignment.new(source_text, target_text, denotations)
+	new_denotations = alignment.transform_hdenotations(denotations)
+	if debug
+		warn "[block alignment]"
+		warn alignment.alignment_show
+		warn "-----"
+	end
+	lost_annotations = alignment.lost_annotations
+	unless lost_annotations.empty?
+		warn "\n[lost annotations] #{lost_annotations.length}"
+		lost_annotations.each do |a|
+			warn "#{a}"
+		end
+		warn "====="
+	end
+	warn
+	# return target annotations
+	new_denotations
+end
+def align_mannotations(source_annotations, target_text, debug = false)
+	target_annotations = {text:target_text}
 	idnum_denotations = 0
 	idnum_relations = 0
 	idnum_attributes = 0
 	idnum_modifications = 0
-	source_annotations.each do |annotations|
-		alignment = TextAlignment::TextAlignment.new(annotations[:text], target_annotations[:text])
-		puts alignment.alignment_show
-		puts "-----"
-		puts
-		# alignment.block_alignments.each do |a|
-			# p {source:a[:source], target:a[:target]}
-			# puts "--"
-			# p a[:alignment] if a[:alignment].nil? || a[:alignment] == :empty
-			# puts "--"
-			# puts annotations[:text][a[:source][:begin] ... a[:source][:end]]
-			# puts "--"
-			# puts target_text[a[:target][:begin] ... a[:target][:end]]
-			# puts "======"
-		# end
+	source_annotations.each_with_index do |annotations, i|
 		if annotations.has_key?(:denotations) && !annotations[:denotations].empty?
 			ididx = {}
-			denotations = alignment.transform_hdenotations(annotations[:denotations])
+			warn "[#{i}]-=-=-=-=-"
+			denotations = align_denotations(annotations[:denotations], annotations[:text], target_text, debug)
 			denotations.each do |d|
 				reid = 'T' + (idnum_denotations += 1).to_s
 				ididx[d[:id]] = reid
@@ -101,126 +111,11 @@ end
 source_annotations = read_annotations(ARGV[0])
 target_text = read_text(ARGV[1])
-lost_annotations = []
 target_annotations = if source_annotations.class == Array
-	align_mdoc(source_annotations, {text: target_text})
+	align_mannotations(source_annotations, target_text, false)
 else
-	alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text, source_annotations[:denotations])
-	# verification
-	# source_text = source_annotations[:text]
-	# puts "=====BEGIN"
-	# (0 ... source_text.rstrip.length).each do |p|
-	# 	t = alignment.transform_begin_position(p)
-	# 	if t.nil?
-	# 		print source_text[p]
-	# 	else
-	# 		print '.'
-	# 	end
-	# end
-	# puts
-	# puts "=====END"
-	# puts "=====BEGIN"
-	# (0 .. source_text.rstrip.length).each do |p|
-	# 	t = alignment.transform_end_position(p)
-	# 	if t.nil?
-	# 		print source_text[p]
-	# 	else
-	# 		print '.'
-	# 	end
-	# end
-	# puts
-	# puts "=====END"
-	source_text = source_annotations[:text]
-	puts "[block alignment]"
-	puts alignment.alignment_show
-	puts "====="
-	# exit
-	denotations = alignment.transform_hdenotations(source_annotations[:denotations])
-	lost_annotations += alignment.lost_annotations if alignment.lost_annotations
+	denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], target_text, false)
 	source_annotations.merge({text:target_text, denotations:denotations})
 end
-num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source = if source_annotations.class == Array
-	num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source = 0, 0, 0, 0
-	source_annotations.each do |annotations|
-		num_denotations_source += annotations[:denotations].nil? ? 0 : annotations[:denotations].length
-		num_relations_source += annotations[:relations].nil? ? 0 : annotations[:relations].length
-		num_attributes_source += annotations[:attributes].nil? ? 0 : annotations[:attributes].length
-		num_modifications_source += annotations[:modifications].nil? ? 0 : annotations[:modifications].length
-	end
-	[num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source]
-else
-	num_denotations_source = source_annotations[:denotations].nil? ? 0 : source_annotations[:denotations].length
-	num_relations_source = source_annotations[:relations].nil? ? 0 : source_annotations[:relations].length
-	num_attributes_source = source_annotations[:attributes].nil? ? 0 : source_annotations[:attributes].length
-	num_modifications_source = source_annotations[:modifications].nil? ? 0 : source_annotations[:modifications].length
-	[num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source]
-end
-warn "[source]"
-warn "denotations:\t#{num_denotations_source}"
-# warn "relations:\t#{num_relations_source}"
-# warn "attributes:\t#{num_attributes_source}"
-# warn "modifications:\t#{num_modifications_source}"
-warn "\n[target]"
-warn "denotations:\t#{target_annotations[:denotations].nil? ? 0 : target_annotations[:denotations].length}"
-# warn "relations:\t#{target_annotations[:relations].nil? ? 0 : target_annotations[:relations].length}"
-# warn "attributes:\t#{target_annotations[:attributes].nil? ? 0 : target_annotations[:attributes].length}"
-# warn "modifications:\t#{target_annotations[:modifications].nil? ? 0 : target_annotations[:modifications].length}"
-if lost_annotations
-	warn "\n[lost annotations]"
-	lost_annotations.each do |a|
-		p a
-	end
-end
-#puts target_annotations.to_json
-# denotations = anns1[:denotations]
-# puts "[Alignment1]====="
-# align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::MAPPINGS)
-# align.alignment.each do |a|
-# 	p [a[:target][:begin], a[:target][:end], a[:source][:begin], a[:source][:end]]
-# end
-# puts TextAlignment::sdiff2cdiff(align.sdiff)
-# puts
-# puts "[Similarity]\n#{align.similarity}"
-# puts
-# puts '[Denotations original]'
-# pp denotations
-# puts
-# puts '[Denotations transformed]'
-# new_denotations = align.transform_hdenotations(denotations)
-# pp new_denotations
-# puts
-# puts "[Alignment2 (downcased)]====="
-# align = TextAlignment::TextAlignment.new(str1.downcase, str2.downcase, TextAlignment::MAPPINGS)
-# puts TextAlignment::sdiff2cdiff(align.sdiff)
-# puts
-# puts "[Similarity]\n#{align.similarity}"
-# puts
-# puts '[Denotations original]'
-# pp denotations
-# puts
-# puts '[Denotations transformed]'
-# new_denotations = align.transform_hdenotations(denotations)
-# pp new_denotations
-# puts
-# puts '[Annotations transformed]'
-# anns2[:denotations] = new_denotations
-# puts anns2.to_json
-# p align.common_elements
-# puts "---------------"
-# p align.mapped_elements
+# puts target_annotations.to_json

data/lib/text_alignment/constants.rb CHANGED

@@ -1,7 +1,7 @@
 module TextAlignment; end unless defined? TextAlignment
 TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
-TextAlignment::SIZE_WINDOW = 60 unless defined? TextAlignment::SIZE_WINDOW
+TextAlignment::SIZE_WINDOW = 10 unless defined? TextAlignment::SIZE_WINDOW
 TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
 TextAlignment::BUFFER_MIN = 20 unless defined? TextAlignment::BUFFER_MIN
 TextAlignment::TEXT_SIMILARITY_THRESHOLD = 0.9 unless defined? TextAlignment::TEXT_SIMILARITY_THRESHOLD

data/lib/text_alignment/mappings.rb CHANGED

@@ -1,74 +1,172 @@
 module TextAlignment; end unless defined? TextAlignment
 TextAlignment::MAPPINGS = [
-	["©", "(c)"],   #U+00A9 (Copyright Sign)
-	["α", "alpha"],   #U+03B1 (greek small letter alpha)
-	["β", "beta"],    #U+03B2 (greek small letter beta)
-	["γ", "gamma"],   #U+03B3 (greek small letter gamma)
-	["δ", "delta"],   #U+03B4 (greek small letter delta)
-	["ε", "epsilon"], #U+03B5 (greek small letter epsilon)
-	["ζ", "zeta"],    #U+03B6 (greek small letter zeta)
-	["η", "eta"],     #U+03B7 (greek small letter eta)
-	["θ", "theta"],   #U+03B7 (greek small letter eta)
-	["ι", "iota"],    #U+03B7 (greek small letter eta)
-	["κ", "kappa"],   #U+03BA (greek small letter kappa)
-	["λ", "lambda"],  #U+03BB (greek small letter lambda)
-	["λ", "lamda"],  #U+03BB (greek small letter lambda)
-	["μ", "mu"],      #U+03BC (greek small letter mu)
-	["ν", "nu"],      #U+03BD (greek small letter nu)
-	["ξ", "xi"],      #U+03BE (greek small letter xi)
-	["ο", "omicron"], #U+03BF (greek small letter omicron)
-	["π", "pi"],      #U+03C0 (greek small letter pi)
-	["ρ", "rho"],     #U+03C1 (greek small letter rho)
-	["σ", "sigma"],   #U+03C3 (greek small letter sigma)
-	["τ", "tau"],     #U+03C4 (greek small letter tau)
-	["υ", "upsilon"], #U+03C5 (greek small letter upsilon)
-	["φ", "phi"],     #U+03C6 (greek small letter phi)
-	["χ", "chi"],     #U+03C7 (greek small letter chi)
-	["ψ", "psi"],     #U+03C8 (greek small letter psi)
-	["ω", "omega"],   #U+03C9 (greek small letter omega)
-	["Α", "Alpha"],   #U+0391 (greek capital letter alpha)
-	["Β", "Beta"],    #U+0392 (greek capital letter beta)
-	["Γ", "Gamma"],   #U+0393 (greek capital letter gamma)
-	["Δ", "Delta"],   #U+0394 (greek capital letter delta)
-	["Ε", "Epsilon"], #U+0395 (greek capital letter epsilon)
-	["Ζ", "Zeta"],    #U+0396 (greek capital letter zeta)
-	["Η", "Eta"],     #U+0397 (greek capital letter eta)
-	["Θ", "Theta"],   #U+0398 (greek capital letter theta)
-	["Ι", "Iota"],    #U+0399 (greek capital letter iota)
-	["Κ", "Kappa"],   #U+039A (greek capital letter kappa)
-	["Λ", "Lambda"],  #U+039B (greek capital letter lambda)
-	["Λ", "Lamda"],  #U+039B (greek capital letter lambda)
-	["Μ", "Mu"],      #U+039C (greek capital letter mu)
-	["Ν", "Nu"],      #U+039D (greek capital letter nu)
-	["Ξ", "Xi"],      #U+039E (greek capital letter xi)
-	["Ο", "Omicron"], #U+039F (greek capital letter omicron)
-	["Π", "Pi"],      #U+03A0 (greek capital letter pi)
-	["Ρ", "Rho"],     #U+03A1 (greek capital letter rho)
-	["Σ", "Sigma"],   #U+03A3 (greek capital letter sigma)
-	["Τ", "Tau"],     #U+03A4 (greek capital letter tau)
-	["Υ", "Upsilon"], #U+03A5 (greek capital letter upsilon)
-	["Φ", "Phi"],     #U+03A6 (greek capital letter phi)
-	["Χ", "Chi"],     #U+03A7 (greek capital letter chi)
-	["Ψ", "Psi"],     #U+03A8 (greek capital letter Psi)
-	["Ω", "Omega"],   #U+03A9 (greek capital letter omega)
-	["ϕ", "phi"],     #U+03D5 (greek phi symbol)
-	["×", "x"],       #U+00D7 (multiplication sign)
-	["•", "*"],       #U+2022 (bullet)
-	[" ", " "],       #U+2009 (thin space)
-	[" ", " "],       #U+200A (hair space)
-	[" ", " "],       #U+00A0 (no-break space)
-	["　", " "],       #U+3000 (ideographic space)
-	["−", "-"],       #U+2212 (minus sign)
-	["–", "-"],       #U+2013 (en dash)
-	["′", "'"],       #U+2032 (prime)
-	["‘", "'"],       #U+2018 (left single quotation mark)
-	["’", "'"],       #U+2019 (right single quotation mark)
-	["“", '"'],       #U+201C (left double quotation mark)
-	["”", '"'],        #U+201D (right double quotation mark)
+	["©", "(c)"],			#U+00A9 (Copyright Sign)
+	["α", "alpha"],		#U+03B1 (greek small letter alpha)
+	["β", "beta"],		#U+03B2 (greek small letter beta)
+	["γ", "gamma"],		#U+03B3 (greek small letter gamma)
+	["δ", "delta"],		#U+03B4 (greek small letter delta)
+	["ε", "epsilon"],	#U+03B5 (greek small letter epsilon)
+	["ζ", "zeta"],		#U+03B6 (greek small letter zeta)
+	["η", "eta"],			#U+03B7 (greek small letter eta)
+	["θ", "theta"],		#U+03B8 (greek small letter theta)
+	["ι", "iota"],		#U+03B9 (greek small letter iota)
+	["κ", "kappa"],		#U+03BA (greek small letter kappa)
+	["λ", "lambda"],	#U+03BB (greek small letter lambda)
+	["λ", "lamda"],		#U+03BB (greek small letter lambda)
+	["μ", "mu"],			#U+03BC (greek small letter mu)
+	["ν", "nu"],			#U+03BD (greek small letter nu)
+	["ξ", "xi"],			#U+03BE (greek small letter xi)
+	["ο", "omicron"],	#U+03BF (greek small letter omicron)
+	["π", "pi"],			#U+03C0 (greek small letter pi)
+	["ρ", "rho"],			#U+03C1 (greek small letter rho)
+	["σ", "sigma"],		#U+03C3 (greek small letter sigma)
+	["τ", "tau"],			#U+03C4 (greek small letter tau)
+	["υ", "upsilon"],	#U+03C5 (greek small letter upsilon)
+	["φ", "phi"],			#U+03C6 (greek small letter phi)
+	["χ", "chi"],			#U+03C7 (greek small letter chi)
+	["ψ", "psi"],			#U+03C8 (greek small letter psi)
+	["ω", "omega"],		#U+03C9 (greek small letter omega)
+	["Α", "Alpha"],		#U+0391 (greek capital letter alpha)
+	["Β", "Beta"],		#U+0392 (greek capital letter beta)
+	["Γ", "Gamma"],		#U+0393 (greek capital letter gamma)
+	["Δ", "Delta"],		#U+0394 (greek capital letter delta)
+	["Ε", "Epsilon"],	#U+0395 (greek capital letter epsilon)
+	["Ζ", "Zeta"],		#U+0396 (greek capital letter zeta)
+	["Η", "Eta"],			#U+0397 (greek capital letter eta)
+	["Θ", "Theta"],		#U+0398 (greek capital letter theta)
+	["Ι", "Iota"],		#U+0399 (greek capital letter iota)
+	["Κ", "Kappa"],		#U+039A (greek capital letter kappa)
+	["Λ", "Lambda"],	#U+039B (greek capital letter lambda)
+	["Λ", "Lamda"],		#U+039B (greek capital letter lambda)
+	["Μ", "Mu"],			#U+039C (greek capital letter mu)
+	["Ν", "Nu"],			#U+039D (greek capital letter nu)
+	["Ξ", "Xi"],			#U+039E (greek capital letter xi)
+	["Ο", "Omicron"],	#U+039F (greek capital letter omicron)
+	["Π", "Pi"],			#U+03A0 (greek capital letter pi)
+	["Ρ", "Rho"],			#U+03A1 (greek capital letter rho)
+	["Σ", "Sigma"],		#U+03A3 (greek capital letter sigma)
+	["Τ", "Tau"],			#U+03A4 (greek capital letter tau)
+	["Υ", "Upsilon"],	#U+03A5 (greek capital letter upsilon)
+	["Φ", "Phi"],			#U+03A6 (greek capital letter phi)
+	["Χ", "Chi"],			#U+03A7 (greek capital letter chi)
+	["Ψ", "Psi"],			#U+03A8 (greek capital letter Psi)
+	["Ω", "Omega"],		#U+03A9 (greek capital letter omega)
+	["ϕ", "phi"],			#U+03D5 (greek phi symbol)
+	["×", "x"],				#U+00D7 (multiplication sign)
+	["•", "*"],				#U+2022 (bullet)
+	[" ", " "],				#U+2009 (thin space)
+	[" ", " "],				#U+200A (hair space)
+	[" ", " "],				#U+00A0 (no-break space)
+	["　", " "],				#U+3000 (ideographic space)
+	["‑", "-"],				#U+2011 (Non-Breaking Hyphen)
+	["−", "-"],				#U+2212 (minus sign)
+	["–", "-"],				#U+2013 (en dash)
+	["′", "'"],				#U+2032 (prime)
+	["‘", "'"],				#U+2018 (left single quotation mark)
+	["’", "'"],				#U+2019 (right single quotation mark)
+	["“", '"'],				#U+201C (left double quotation mark)
+	["”", '"'],				#U+201D (right double quotation mark)
 	['"', "''"]
-  ]
+]
+TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
+class << TextAlignment
+	def single_character_mapping_preprocessing(_str1, _str2, _mappings = nil)
+		_mappings ||= TextAlignment::MAPPINGS
+		character_mappings = _mappings.select{|m| m[0].length == 1 && m[1].length == 1}
+		if character_mappings.empty?
+			[_str1, _str2, _mappings]
+		else
+			characters_from = character_mappings.collect{|m| m[0]}.join
+			characters_to   = character_mappings.collect{|m| m[1]}.join
+			characters_to.gsub!(/-/, '\-')
+			str1 = _str1.tr(characters_from, characters_to)
+			str2 = _str2.tr(characters_from, characters_to)
+			mappings = _mappings.select{|m| m[0].length > 1 || m[1].length > 1}
+			[str1, str2, mappings]
+		end
+	end
+	def long_to_one_mapping_preprocessing(_str1, _str2, _mappings = nil)
+		_mappings ||= TextAlignment::MAPPINGS
+		long_to_one_mappings = _mappings.select{|m| m[0].length == 1 && m[1].length > 1}
+		if long_to_one_mappings.empty?
+			[_str1, _str2, _mappings]
+		else
+			## long to one character mappings
+			pletters = TextAlignment::PADDING_LETTERS
+			# find the padding letter for str1
+			@padding_letter1 = begin
+				i = pletters.index{|l| _str2.index(l).nil?}
+				raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
+				TextAlignment::PADDING_LETTERS[i]
+			end
+			# find the padding letter for str2
+			@padding_letter2 = begin
+				i = pletters.index{|l| l != @padding_letter1 && _str1.index(l).nil?}
+				raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
+				TextAlignment::PADDING_LETTERS[i]
+			end
+			str1 = str2 = nil
+			long_to_one_mappings.each do |f|
+				from = f[1]
+				str1 = if _str2.index(f[0])
+					to = f[0] + (@padding_letter1 * (f[1].length - 1))
+					_str1.gsub(from, to)
+				else
+					_str1
+				end
+				str2 = if _str1.index(f[0])
+					to = f[0] + (@padding_letter2 * (f[1].length - 1))
+					_str2.gsub(from, to)
+				else
+					_str2
+				end
+			end
+			mappings = _mappings.select{|m| m[0].length > 1 || m[1].length == 1}
+			[str1, str2, mappings]
+		end
+	end
+	def compute_similarity(_s1, _s2, sdiff)
+		return 0 if sdiff.nil?
+		# compute the lcs only with non-whitespace letters
+		lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
+		return 0 if lcs == 0
+		s1 = if @padding_letter1
+			_s1.tr(@padding_letter1, ' ')
+		else
+			_s1
+		end
+		s2 = if @padding_letter2
+			_s2.tr(@padding_letter2, ' ')
+		else
+			_s2
+		end
+		similarity = lcs.to_f / [s1.scan(/\S/).count, s2.scan(/\S/).count].min
+	end
+end

data/lib/text_alignment/mixed_alignment.rb CHANGED

@@ -17,10 +17,10 @@ class TextAlignment::MixedAlignment
 	attr_reader :similarity
 	attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final
-	def initialize(_str1, _str2)
+	def initialize(_str1, _str2, _mappings = nil)
 		raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
-		str1, str2, mappings = string_preprocessing(_str1, _str2)
+		str1, str2, mappings = TextAlignment::long_to_one_mapping_preprocessing(_str1, _str2, _mappings)
 		_compute_mixed_alignment(str1, str2, mappings)
 	end
@@ -63,7 +63,7 @@ class TextAlignment::MixedAlignment
 		end
 		cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
-		@similarity         = compute_similarity(str1, str2, @sdiff)
+		@similarity         = TextAlignment::compute_similarity(str1, str2, @sdiff)
 		@str1_match_initial = cmp.str1_match_initial
 		@str1_match_final   = cmp.str1_match_final
 		@str2_match_initial = cmp.str2_match_initial
@@ -139,72 +139,4 @@ class TextAlignment::MixedAlignment
 		@position_map_end = posmap_end.sort.to_h
 	end
-	private
-	def string_preprocessing(_str1, _str2)
-		str1 = _str1.dup
-		str2 = _str2.dup
-		mappings = TextAlignment::MAPPINGS.dup
-		## single character mappings
-		character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
-		characters_from = character_mappings.collect{|m| m[0]}.join
-		characters_to   = character_mappings.collect{|m| m[1]}.join
-		characters_to.gsub!(/-/, '\-')
-		str1.tr!(characters_from, characters_to)
-		str2.tr!(characters_from, characters_to)
-		mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
-		## long to one character mappings
-		pletters = TextAlignment::PADDING_LETTERS
-		# find the padding letter for str1
-		@padding_letter1 = begin
-			i = pletters.index{|l| str2.index(l).nil?}
-			raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
-			TextAlignment::PADDING_LETTERS[i]
-		end
-		# find the padding letter for str2
-		@padding_letter2 = begin
-			i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
-			raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
-			TextAlignment::PADDING_LETTERS[i]
-		end
-		# ASCII foldings
-		ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
-		ascii_foldings.each do |f|
-			from = f[1]
-			if str2.index(f[0])
-				to   = f[0] + (@padding_letter1 * (f[1].length - 1))
-				str1.gsub!(from, to)
-			end
-			if str1.index(f[0])
-				to   = f[0] + (@padding_letter2 * (f[1].length - 1))
-				str2.gsub!(from, to)
-			end
-		end
-		mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
-		[str1, str2, mappings]
-	end
-	def compute_similarity(_s1, _s2, sdiff)
-		return 0 if sdiff.nil?
-		# compute the lcs only with non-whitespace letters
-		lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
-		return 0 if lcs == 0
-		s1 = _s1.tr(@padding_letter1, ' ')
-		s2 = _s2.tr(@padding_letter2, ' ')
-		similarity = lcs / [s1.scan(/\S/).count, s2.scan(/\S/).count].min.to_f
-	end
 end

data/lib/text_alignment/text_alignment.rb CHANGED

@@ -5,34 +5,25 @@ require 'text_alignment/mixed_alignment'
 module TextAlignment; end unless defined? TextAlignment
-TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
 class TextAlignment::TextAlignment
 	attr_reader :block_alignment
 	attr_reader :similarity
 	attr_reader :lost_annotations
-	def initialize(str1, str2, denotations = nil, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
-		raise ArgumentError, "nil string" if str1.nil? || str2.nil?
+	def initialize(_str1, _str2, denotations = nil, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
+		raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
-		@block_alignment = {source_text:str1, target_text:str2}
-		@str1 = str1
-		@str2 = str2
+		@block_alignment = {source_text: _str1, target_text: _str2, denotations: denotations}
+		@original_str1 = _str1
+		@original_str2 = _str2
-		## Block exact match
-		block_begin = str2.index(str1)
-		unless block_begin.nil?
-			@block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
-			return
-		end
+		str1, str2, @mappings = TextAlignment::single_character_mapping_preprocessing(_str1, _str2)
-		block_begin = str2.downcase.index(str1.downcase)
-		unless block_begin.nil?
-			@block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
+		if r = whole_block_alignment(str1, str2)
+			@block_alignment[:blocks] = r
 			return
 		end
 		## to find block alignments
 		anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
@@ -120,12 +111,29 @@ class TextAlignment::TextAlignment
 				else
 					[{source:{begin:last_block[:source][:end], end:str1.length}, alignment: :empty}]
 				end
+			else
+				[]
 			end
 		end
 		@block_alignment[:blocks] = blocks2
 	end
+	def whole_block_alignment(str1, str2)
+		## Block exact match
+		block_begin = str2.index(str1)
+		unless block_begin.nil?
+			return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
+		end
+		block_begin = str2.downcase.index(str1.downcase)
+		unless block_begin.nil?
+			return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
+		end
+		nil
+	end
 	def local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations = nil)
 		block2 = str2[b2 ... e2]
@@ -156,7 +164,6 @@ class TextAlignment::TextAlignment
 					lex = term[:lex]
 					look_forward = block2.index(lex, position)
 					unless look_forward.nil?
-						puts lex
 						tblocks = []
 						break
 					end
@@ -164,31 +171,37 @@ class TextAlignment::TextAlignment
 			end
 			tblocks
+		else
+			[]
 		end
 		if tblocks.empty?
 			if b1 == 0 && e1 == str1.length
-				if str2.length > 2000
+				if (e1 > 2000) || (e2 > 2000)
 					[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
 				else
 					block1 = str1[b1 ... e1]
 					block2 = str2[b2 ... e2]
 					## character-based alignment
-					alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
-					[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}]
-					# alignment = :alignment
-					# [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :alignment}]
+					alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase, @mappings)
+					if alignment.sdiff.nil?
+						[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
+					else
+						[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
+					end
 				end
 			else
 				block1 = str1[b1 ... e1]
 				block2 = str2[b2 ... e2]
 				## character-based alignment
-				alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
-				[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment:alignment, similarity: alignment.similarity}]
-				# alignmnet = :alignment
-				# [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :alignment}]
+				alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase, @mappings)
+				if alignment.sdiff.nil?
+					[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
+				else
+					[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
+				end
 			end
 		else
 			last_tblock = nil
@@ -199,7 +212,7 @@ class TextAlignment::TextAlignment
 				sum += if te1 == tb1
 					[tblock]
 				else
-					tb2 = last_tblock ? tlast_block[:target][:end] : b2
+					tb2 = last_tblock ? last_tblock[:target][:end] : b2
 					te2 = tblock[:target][:begin]
 					if b2 == e2
@@ -295,7 +308,7 @@ class TextAlignment::TextAlignment
 			source = {begin:d.begin, end:d.end}
 			d.begin = transform_begin_position(d.begin);
 			d.end = transform_end_position(d.end);
-			raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @str2.length
+			raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_str2.length
 		rescue
 			@lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
 			d.begin = nil
@@ -311,7 +324,7 @@ class TextAlignment::TextAlignment
 		r = hdenotations.collect do |d|
 			t = transform_a_span(d[:span])
-			raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @str2.length
+			raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_str2.length
 			new_d = d.dup.merge({span:t})
 		rescue
 			@lost_annotations << {source: d[:span], target:t}
@@ -338,8 +351,13 @@ class TextAlignment::TextAlignment
 				"xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
 				"<<<<< string 1 [#{a[:source][:begin]} - #{a[:source][:end]}]\n" +
 				stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
-				">>>>> string 2 [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
-				ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
+				">>>>> string 2 " +
+				if a[:target]
+					"[#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
+					ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
+				else
+					"[-]\n\n"
+				end
 			else
 				astr1 = ''
 				astr2 = ''
@@ -379,5 +397,4 @@ class TextAlignment::TextAlignment
 		end
 		show
 	end
 end

data/lib/text_alignment/version.rb CHANGED

@@ -1,3 +1,3 @@
 class TextAlignment
-	VERSION = '0.7'
+	VERSION = '0.9'
 end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: text_alignment
 version: !ruby/object:Gem::Version
-  version: '0.7'
+  version: '0.9'
 platform: ruby
 authors:
 - Jin-Dong Kim
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2020-10-11 00:00:00.000000000 Z
+date: 2020-11-18 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: ruby-dictionary