RubyGems - text_alignment - Versions diffs - 0.2.9 → 0.3.9 - Mend

text_alignment 0.2.9 → 0.3.9

Files changed (23) hide show

checksums.yaml +4 -4
data/bin/align_annotations +190 -39
data/lib/text_alignment/anchor_finder.rb +149 -0
data/lib/text_alignment/approximate_fit.rb +50 -52
data/lib/text_alignment/find_divisions.rb +198 -200
data/lib/text_alignment/glcs_alignment.rb +297 -297
data/lib/text_alignment/glcs_alignment_fast.rb +94 -94
data/lib/text_alignment/glcs_required.rb +50 -50
data/lib/text_alignment/lcs_alignment.rb +115 -115
data/lib/text_alignment/lcs_cdiff.rb +46 -48
data/lib/text_alignment/lcs_comparison.rb +53 -53
data/lib/text_alignment/lcs_min.rb +144 -138
data/lib/text_alignment/mappings.rb +68 -69
data/lib/text_alignment/mixed_alignment.rb +193 -0
data/lib/text_alignment/text_alignment.rb +232 -174
data/lib/text_alignment/version.rb +1 -1
data/text_alignment.gemspec +1 -1
metadata +5 -13
data/spec/spec_helper.rb +0 -1
data/spec/text_alignment/glcs_alignment_spec.rb +0 -302
data/spec/text_alignment/lcs_alignment_spec.rb +0 -98
data/spec/text_alignment/lcs_comparision_spec.rb +0 -322
data/spec/text_alignment/text_alignment_spec.rb +0 -302

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 8c1c45ed630cdd60291606b59e1944f0b854a689cfa0d281ae8b8879bf01e806
-  data.tar.gz: 9b33688f3a08f9110f556b4357fbe598d42d1147bc06933323cae4df7187341f
+  metadata.gz: b4a8ed8b4cff6f5b04b10c91704939936530ad0dc820a126a514a58cce7a4df6
+  data.tar.gz: '087412fa8b9779073c67fa1d9a0afc05e32d7f1baad4518c375fcf804a45ecd4'
 SHA512:
-  metadata.gz: 856a8fca63f80be4cea7f6beff85dcf475d9237d68bc96728bcfcc030397f414637d2a8e32b139a6fdbccfd8603d327738ff7f3f59d80dc9e61e55a11a04bf20
-  data.tar.gz: 6055f50827354461f194a50da74259e8d97473be9b085634df270964bb815d96d9f916bbad2ef3961ddd845291a80e13b1680f1c42bafeab156b0d59fa3ba952
+  metadata.gz: e5f56b58d35a614c6b9a72ccb8282b775d19c2fb576d68420153b96703d954e47471cfbcd9b384bd244f19110cc436d28e409d5014f1ade66c23390a928111fc
+  data.tar.gz: 4dbdd214b0e2aab9b32751305517160016cfd16b60f5380282b2b1ba2e6946e3097789f4fd234b577cf8ab00e06ea8d3497f59843d600eac2b12b8d60c32441c

data/bin/align_annotations CHANGED

@@ -1,51 +1,202 @@
 #!/usr/bin/env ruby
 require 'text_alignment'
 require 'json'
 require 'pp'
+# Loads the source annotations from the file +filename+.
+# A '.json' file is parsed as JSON with symbolized keys; a '.txt' file is
+# wrapped into a hash holding the whole file content under :text.
+# Raises a RuntimeError for any other file extension.
+def read_annotations(filename)
+	case File.extname(filename)
+	when '.json'
+		JSON.parse File.read(filename), :symbolize_names => true
+	when '.txt'
+		{text: File.read(filename)}
+	else
+		raise "unknown file type: #{filename}"
+	end
+end
+# Returns the text stored in the file +filename+.
+# For a '.json' file it is the value of the top-level :text key of the parsed
+# JSON; for a '.txt' file it is the whole file content.
+# Raises a RuntimeError for any other file extension.
+def read_text(filename)
+	case File.extname(filename)
+	when '.json'
+		json = JSON.parse File.read(filename), :symbolize_names => true
+		json[:text]
+	when '.txt'
+		File.read(filename)
+	else
+		raise "unknown file type: #{filename}"
+	end
+end
+# Aligns the annotations of multiple source documents to one target text.
+#
+# source_annotations:: an Array of annotation hashes, each with :text and optionally
+#                      :denotations, :relations, :attributes and :modifications
+# target_annotations:: a hash with the target :text; it is modified in place
+#
+# For each source document, the denotations are transformed onto the target text and
+# appended to target_annotations. Ids are renumbered with counters shared by all the
+# documents (T1.., R1.., A1.., M1..), and the references (:subj, :obj) are rewritten
+# through a per-document map from the original ids to the new ones.
+# Relations, attributes and modifications of a document are copied only when the
+# document has a non-empty :denotations.
+# Returns target_annotations.
+def align_mdoc(source_annotations, target_annotations)
+	idnum_denotations = 0
+	idnum_relations = 0
+	idnum_attributes = 0
+	idnum_modifications = 0
+	source_annotations.each do |annotations|
+		alignment = TextAlignment::TextAlignment.new(annotations[:text], target_annotations[:text])
+		# alignment.block_alignments.each do |a|
+			# p {source:a[:source], target:a[:target]}
+			# puts "--"
+			# p a[:alignment] if a[:alignment].nil? || a[:alignment] == :empty
+			# puts "--"
+			# puts annotations[:text][a[:source][:begin] ... a[:source][:end]]
+			# puts "--"
+			# puts target_text[a[:target][:begin] ... a[:target][:end]]
+			# puts "======"
+		# end
+		if annotations.has_key?(:denotations) && !annotations[:denotations].empty?
+			# map from the original ids of this document to the renumbered ones
+			ididx = {}
+			denotations = alignment.transform_hdenotations(annotations[:denotations])
+			denotations.each do |d|
+				reid = 'T' + (idnum_denotations += 1).to_s
+				ididx[d[:id]] = reid
+				d[:id] = reid
+			end
+			target_annotations[:denotations] = [] unless target_annotations.has_key? :denotations
+			target_annotations[:denotations] += denotations
+			if annotations.has_key?(:relations) && !annotations[:relations].empty?
+				target_annotations[:relations] = [] unless target_annotations.has_key? :relations
+				annotations[:relations].each do |r|
+					reid = 'R' + (idnum_relations += 1).to_s
+					ididx[r[:id]] = reid
+					# a copy is added to the target; the source relation itself is left untouched
+					target_annotations[:relations] << r.dup.merge({id:reid, subj:ididx[r[:subj]], obj:ididx[r[:obj]]})
+				end
+			end
+			if annotations.has_key?(:attributes) && !annotations[:attributes].empty?
+				target_annotations[:attributes] = [] unless target_annotations.has_key? :attributes
+				annotations[:attributes].each do |a|
+					reid = 'A' + (idnum_attributes += 1).to_s
+					ididx[a[:id]] = reid
+					target_annotations[:attributes] << a.dup.merge({id:reid, subj:ididx[a[:subj]]})
+				end
+			end
+			if annotations.has_key?(:modifications) && !annotations[:modifications].empty?
+				target_annotations[:modifications] = [] unless target_annotations.has_key? :modifications
+				annotations[:modifications].each do |m|
+					reid = 'M' + (idnum_modifications += 1).to_s
+					ididx[m[:id]] = reid
+					target_annotations[:modifications] << m.dup.merge({id:reid, obj:ididx[m[:obj]]})
+				end
+			end
+		end
+	end
+	target_annotations
+end
+# Exactly two arguments are required. In the code below, the first one is read as the
+# source annotations, and the second one provides the text they are aligned to.
 unless ARGV.length == 2
-	warn "align_annotations target_annotations(.json) reference_annotations(.json)"
+	warn "align_annotations target_annotations(.json|.txt) reference_annotations(.json|.txt)"
 	exit
 end
-anns1 = JSON.parse File.read(ARGV[0].strip), :symbolize_names => true
-anns2 = JSON.parse File.read(ARGV[1].strip), :symbolize_names => true
-str1 = anns1[:text]
-str2 = anns2[:text]
-denotations = anns1[:denotations]
-puts "[Alignment1]====="
-align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::MAPPINGS)
-puts TextAlignment::sdiff2cdiff(align.sdiff)
-puts
-puts "[Similarity]\n#{align.similarity}"
-puts
-puts '[Denotations original]'
-pp denotations
-puts
-puts '[Denotations transformed]'
-new_denotations = align.transform_hdenotations(denotations)
-pp new_denotations
-puts
-puts "[Alignment2 (downcased)]====="
-align = TextAlignment::TextAlignment.new(str1.downcase, str2.downcase, TextAlignment::MAPPINGS)
-puts TextAlignment::sdiff2cdiff(align.sdiff)
-puts
-puts "[Similarity]\n#{align.similarity}"
-puts
-puts '[Denotations original]'
-pp denotations
-puts
-puts '[Denotations transformed]'
-new_denotations = align.transform_hdenotations(denotations)
-pp new_denotations
-puts
-puts '[Annotations transformed]'
-anns2[:denotations] = new_denotations
-puts anns2.to_json
+# Read the source annotations (first argument) and the target text (second argument).
+source_annotations = read_annotations(ARGV[0])
+target_text = read_text(ARGV[1])
+lost_annotations = []
+# An Array source is handled as multiple documents whose annotations are all aligned to the one target text.
+target_annotations = if source_annotations.class == Array
+	align_mdoc(source_annotations, {text: target_text})
+else
+	alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text)
+	# alignment.block_alignments.each do |a|
+	# 	if a[:alignment].nil? || a[:alignment] == :empty
+	# 		# p [a[:source], a[:target]]
+	# 		# p a[:alignment]
+	# 	else
+	# 		p [a[:source], a[:target]]
+	# 		p a[:alignment].similarity
+	# 		puts "--"
+	# 		puts source_annotations[:text][a[:source][:begin] ... a[:source][:end]]
+	# 		puts "--"
+	# 		puts target_text[a[:target][:begin] ... a[:target][:end]]
+	# 		puts "======"
+	# 	end
+	# end
+	# exit
+	denotations = alignment.transform_hdenotations(source_annotations[:denotations])
+	# NOTE(review): lost annotations are collected only in this single-document branch
+	lost_annotations += alignment.lost_annotations if alignment.lost_annotations
+	# the other fields of the source are kept; only the text and the denotations are replaced
+	source_annotations.merge({text:target_text, denotations:denotations})
+end
+# Count the annotations of the source (summed over all the documents when it is an Array) for the report below.
+num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source = if source_annotations.class == Array
+	num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source = 0, 0, 0, 0
+	source_annotations.each do |annotations|
+		num_denotations_source += annotations[:denotations].nil? ? 0 : annotations[:denotations].length
+		num_relations_source += annotations[:relations].nil? ? 0 : annotations[:relations].length
+		num_attributes_source += annotations[:attributes].nil? ? 0 : annotations[:attributes].length
+		num_modifications_source += annotations[:modifications].nil? ? 0 : annotations[:modifications].length
+	end
+	[num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source]
+else
+	num_denotations_source = source_annotations[:denotations].nil? ? 0 : source_annotations[:denotations].length
+	num_relations_source = source_annotations[:relations].nil? ? 0 : source_annotations[:relations].length
+	num_attributes_source = source_annotations[:attributes].nil? ? 0 : source_annotations[:attributes].length
+	num_modifications_source = source_annotations[:modifications].nil? ? 0 : source_annotations[:modifications].length
+	[num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source]
+end
+# The report is written with warn (standard error); only the resulting JSON is written with puts (standard output).
+# Only the numbers of denotations are reported; the other counts are computed above but their output is commented out.
+warn "[source]"
+warn "denotations:\t#{num_denotations_source}"
+# warn "relations:\t#{num_relations_source}"
+# warn "attributes:\t#{num_attributes_source}"
+# warn "modifications:\t#{num_modifications_source}"
+warn "\n[target]"
+warn "denotations:\t#{target_annotations[:denotations].nil? ? 0 : target_annotations[:denotations].length}"
+# warn "relations:\t#{target_annotations[:relations].nil? ? 0 : target_annotations[:relations].length}"
+# warn "attributes:\t#{target_annotations[:attributes].nil? ? 0 : target_annotations[:attributes].length}"
+# warn "modifications:\t#{target_annotations[:modifications].nil? ? 0 : target_annotations[:modifications].length}"
+# NOTE(review): lost_annotations is always an Array here (it is initialized to [] above),
+# so this condition is always true and the section is printed even when nothing was lost.
+if lost_annotations
+	warn "\n[lost annotations]"
+	warn "#{lost_annotations.length}"
+end
+puts target_annotations.to_json
+# denotations = anns1[:denotations]
+# puts "[Alignment1]====="
+# align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::MAPPINGS)
+# align.alignment.each do |a|
+# 	p [a[:target][:begin], a[:target][:end], a[:source][:begin], a[:source][:end]]
+# end
+# puts TextAlignment::sdiff2cdiff(align.sdiff)
+# puts
+# puts "[Similarity]\n#{align.similarity}"
+# puts
+# puts '[Denotations original]'
+# pp denotations
+# puts
+# puts '[Denotations transformed]'
+# new_denotations = align.transform_hdenotations(denotations)
+# pp new_denotations
+# puts
+# puts "[Alignment2 (downcased)]====="
+# align = TextAlignment::TextAlignment.new(str1.downcase, str2.downcase, TextAlignment::MAPPINGS)
+# puts TextAlignment::sdiff2cdiff(align.sdiff)
+# puts
+# puts "[Similarity]\n#{align.similarity}"
+# puts
+# puts '[Denotations original]'
+# pp denotations
+# puts
+# puts '[Denotations transformed]'
+# new_denotations = align.transform_hdenotations(denotations)
+# pp new_denotations
+# puts
+# puts '[Annotations transformed]'
+# anns2[:denotations] = new_denotations
+# puts anns2.to_json
 # p align.common_elements
 # puts "---------------"

data/lib/text_alignment/anchor_finder.rb ADDED

@@ -0,0 +1,149 @@
+#!/usr/bin/env ruby
+require 'string-similarity'
+module TextAlignment; end unless defined? TextAlignment
+TextAlignment::SIZE_NGRAM = 5 unless defined? TextAlignment::SIZE_NGRAM
+TextAlignment::SIZE_WINDOW = 10 unless defined? TextAlignment::SIZE_WINDOW
+TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.7 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
+class TextAlignment::AnchorFinder
+	def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil)
+		@size_ngram  = _size_ngram  || TextAlignment::SIZE_NGRAM
+		@size_window = _size_window || TextAlignment::SIZE_WINDOW
+		@reverse = (target_str.length < source_str.length)
+		@s1, @s2 = if @reverse
+			[target_str.downcase, source_str.downcase]
+		else
+			[source_str.downcase, target_str.downcase]
+		end
+		# current position in s1
+		@beg_s1 = 0
+	end
+	def get_next_anchor
+		# find the position of an anchor ngram in s1 and s2
+		@beg_s2 = nil
+		while @beg_s1 < (@s1.length - @size_ngram)
+			while @beg_s1 < (@s1.length - @size_ngram)
+				anchor = @s1[@beg_s1, @size_ngram]
+				@beg_s2 = if defined? @end_s2_prev
+					@s2.index(anchor, @end_s2_prev)
+				else
+					@s2.index(anchor)
+				end
+				break unless @beg_s2.nil?
+				@beg_s1 += 1
+			end
+			# The loop above is terminated with beg_s2 == nil, which means no more anchor
+			break if @beg_s2.nil?
+			# if both the begining points are sufficiantly close to the end points of the last match
+			break if @end_s1_prev && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 - @end_s2_prev < 5)
+			left_window_s1, left_window_s2 = get_left_windows
+			break if left_window_s1  && text_similarity(left_window_s1, left_window_s2)  > TextAlignment::TEXT_SIMILARITY_TRESHOLD
+			right_window_s1, right_window_s2 = get_right_windows
+			break if right_window_s2 && text_similarity(right_window_s1, left_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD
+			@beg_s1 += 1
+		end
+		return nil if @beg_s2.nil?
+		# extend the block
+		b1 = @beg_s1
+		b2 = @beg_s2
+		while b1 > -1 && b2 > -1 && @s1[b1] == @s2[b2]
+			b1 -= 1; b2 -= 1
+		end
+		b1 += 1; b2 += 1
+		e1 = @beg_s1 + @size_ngram
+		e2 = @beg_s2 + @size_ngram
+		while @s1[e1] == @s2[e2]
+			e1 += 1; e2 += 1
+		end
+		@end_s1_prev = e1
+		@end_s2_prev = e2
+		@beg_s1 = e1
+		if @reverse
+			{source:{begin:b2 , end:e2}, target:{begin:b1, end:e1}}
+		else
+			{source:{begin:b1 , end:e1}, target:{begin:b2, end:e2}}
+		end
+	end
+	private
+	def get_left_windows
+		return if @beg_s1 < @size_window || @beg_s2 < @size_window
+		window_s1 = ''
+		loc = @beg_s1 - 1
+		count = 0
+		while count < @size_window && loc >= 0
+			if @s1[loc] =~ /[0-9a-zA-Z]/
+				window_s1 += @s1[loc]
+				count += 1
+			end
+			loc -= 1
+		end
+		window_s2 = ''
+		loc = @beg_s2 - 1
+		count = 0
+		while count < @size_window && loc >= 0
+			if @s2[loc] =~ /[0-9a-zA-Z]/
+				window_s2 += @s2[loc]
+				count += 1
+			end
+			loc -= 1
+		end
+		[window_s1, window_s2]
+	end
+	def get_right_windows
+		return if (@beg_s1 + @size_ngram < (@s1.length - @size_window)) || (@beg_s2 + @size_ngram < (@s2.length - @size_window))
+		window_s1 = ''
+		loc = @beg_s1 + @size_ngram
+		len_s1 = @s1.length
+		count = 0
+		while count < @size_window && loc < len_s1
+			if @s1[loc] =~ /[0-9a-zA-Z]/
+				window_s1 += @s1[loc]
+				count += 1
+			end
+			loc += 1
+		end
+		window_s2 = ''
+		loc = @beg_s2 + @size_ngram
+		len_s2 = @s2.length
+		count = 0
+		while count < @size_window && loc < len_s2
+			if @s2[loc] =~ /[0-9a-zA-Z]/
+				window_s2 += @s2[loc]
+				count += 1
+			end
+			loc += 1
+		end
+		[window_s1, window_s2]
+	end
+	def text_similarity(str1, str2, ngram_order = 2)
+		String::Similarity.cosine(str1, str2, ngram:ngram_order)
+	end
+end

data/lib/text_alignment/approximate_fit.rb CHANGED

@@ -4,75 +4,73 @@ require 'string-similarity'
 module TextAlignment; end unless defined? TextAlignment
 # approximate the location of str1 in str2
-module TextAlignment
-  SIGNATURE_NGRAM = 5
-  MIN_LENGTH_FOR_APPROXIMATION = 50
-  BUFFER_RATE = 0.1
-  TEXT_SIMILARITY_TRESHOLD = 0.7
-end
+TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
+TextAlignment::MIN_LENGTH_FOR_APPROXIMATION = 50 unless defined? TextAlignment::MIN_LENGTH_FOR_APPROXIMATION
+TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
+TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.7 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
 class << TextAlignment
-  # If finds an approximate region of str2 that contains str1
-  def approximate_fit(str1, str2)
-    raise ArgumentError, 'nil string' if str1.nil? || str2.nil?
-    return 0, str2.length if str2.length < TextAlignment::MIN_LENGTH_FOR_APPROXIMATION
+	# Finds an approximate region of str2 that contains str1.
+	#
+	# Returns the pair [fit_begin, fit_end] of offsets into str2.
+	# It is [0, str2.length] when str2 is shorter than MIN_LENGTH_FOR_APPROXIMATION, and
+	# [nil, nil] when the two strings share no n-gram, when no shared n-gram is unique
+	# in str2, or when no candidate region is similar enough to str1.
+	# Raises ArgumentError if either string is nil.
+	def approximate_fit(str1, str2)
+		raise ArgumentError, 'nil string' if str1.nil? || str2.nil?
+		return 0, str2.length if str2.length < TextAlignment::MIN_LENGTH_FOR_APPROXIMATION
-    ngram1 = (0 .. str1.length - TextAlignment::SIGNATURE_NGRAM).collect{|i| str1[i, TextAlignment::SIGNATURE_NGRAM]}
-    ngram2 = (0 .. str2.length - TextAlignment::SIGNATURE_NGRAM).collect{|i| str2[i, TextAlignment::SIGNATURE_NGRAM]}
-    ngram_shared = ngram1 & ngram2
+		ngram1 = (0 .. str1.length - TextAlignment::SIGNATURE_NGRAM).collect{|i| str1[i, TextAlignment::SIGNATURE_NGRAM]}
+		ngram2 = (0 .. str2.length - TextAlignment::SIGNATURE_NGRAM).collect{|i| str2[i, TextAlignment::SIGNATURE_NGRAM]}
+		ngram_shared = ngram1 & ngram2
-    # If there is no shared n-gram found, it may mean there is no serious overlap between the two strings
-    return nil, nil if ngram_shared.empty?
+		# If there is no shared n-gram found, it may mean there is no serious overlap between the two strings
+		return nil, nil if ngram_shared.empty?
-    signature_ngrams = ngram_shared.select{|g| ngram2.count(g) == 1}
-    return nil, nil if signature_ngrams.empty? #raise "no signature ngram"
+		# a signature n-gram is a shared n-gram that occurs exactly once in str2
+		signature_ngrams = ngram_shared.select{|g| ngram2.count(g) == 1}
+		return nil, nil if signature_ngrams.empty? #raise "no signature ngram"
-    cache = {}
-    fit_begin, fit_end = nil, nil
-    signature_ngrams.each do |signature_ngram|
-      loc_signature_ngram_in_str1 = str1.index(signature_ngram)
-      loc_signature_ngram_in_str2 = str2.index(signature_ngram)
+		# regions already scored, keyed by "begin-end", so that each region is scored only once
+		cache = {}
+		fit_begin, fit_end = nil, nil
+		signature_ngrams.each do |signature_ngram|
+			loc_signature_ngram_in_str1 = str1.index(signature_ngram)
+			loc_signature_ngram_in_str2 = str2.index(signature_ngram)
-      # approximate the beginning of the fit
-      fit_begin = loc_signature_ngram_in_str2 - loc_signature_ngram_in_str1 - (loc_signature_ngram_in_str1 * TextAlignment::BUFFER_RATE).to_i
-      fit_begin = 0 if fit_begin < 0
+			# approximate the beginning of the fit
+			fit_begin = loc_signature_ngram_in_str2 - loc_signature_ngram_in_str1 - (loc_signature_ngram_in_str1 * TextAlignment::BUFFER_RATE).to_i
+			fit_begin = 0 if fit_begin < 0
-      # approximate the end of the fit
-      offset_end = str1.length - loc_signature_ngram_in_str1
-      fit_end = loc_signature_ngram_in_str2 + offset_end + (offset_end * TextAlignment::BUFFER_RATE).to_i
-      fit_end = str2.length if fit_end > str2.length
+			# approximate the end of the fit
+			offset_end = str1.length - loc_signature_ngram_in_str1
+			fit_end = loc_signature_ngram_in_str2 + offset_end + (offset_end * TextAlignment::BUFFER_RATE).to_i
+			fit_end = str2.length if fit_end > str2.length
-      next if cache.has_key?("#{fit_begin}-#{fit_end}")
-      text_similarity = text_similarity(str1, str2[fit_begin ... fit_end])
-      cache["#{fit_begin}-#{fit_end}"] = text_similarity
+			next if cache.has_key?("#{fit_begin}-#{fit_end}")
+			text_similarity = text_similarity(str1, str2[fit_begin ... fit_end])
+			cache["#{fit_begin}-#{fit_end}"] = text_similarity
-      break if text_similarity > TextAlignment::TEXT_SIMILARITY_TRESHOLD
-      fit_begin, fit_end = nil, nil
-    end
-    return fit_begin, fit_end if fit_begin && fit_end && fit_begin < fit_end
-    return nil, nil
-  end
+			# the first region that is similar enough is taken; otherwise the candidate is discarded
+			break if text_similarity > TextAlignment::TEXT_SIMILARITY_TRESHOLD
+			fit_begin, fit_end = nil, nil
+		end
+		return fit_begin, fit_end if fit_begin && fit_end && fit_begin < fit_end
+		return nil, nil
+	end
-  private
+	private
-  def text_similarity(str1, str2, ngram_order = 3)
-    _str1 = str1.delete(" \t\r\n")
-    _str2 = str2.delete(" \t\r\n")
-    String::Similarity.cosine(_str1, _str2, ngram:2)
-  end
+	# Cosine similarity of the two strings, computed after removing spaces, tabs and line breaks.
+	# NOTE(review): ngram_order is not used; the similarity is always computed with ngram:2.
+	def text_similarity(str1, str2, ngram_order = 3)
+		_str1 = str1.delete(" \t\r\n")
+		_str2 = str2.delete(" \t\r\n")
+		String::Similarity.cosine(_str1, _str2, ngram:2)
+	end
 end
 if __FILE__ == $0
-  require 'json'
+	# When this file is run directly with two JSON files, print the approximate location
+	# of the "text" of the first file within the "text" of the second one, and that region.
+	require 'json'
-  if ARGV.length == 2
-    str1 = JSON.parse(File.read(ARGV[0]).strip)["text"]
-    str2 = JSON.parse(File.read(ARGV[1]).strip)["text"]
+	if ARGV.length == 2
+		str1 = JSON.parse(File.read(ARGV[0]).strip)["text"]
+		str2 = JSON.parse(File.read(ARGV[1]).strip)["text"]
-    loc = TextAlignment::approximate_fit(str1, str2)
-    p loc
-    puts str2[loc[0]...loc[1]]
-  end
+		# NOTE(review): approximate_fit may return [nil, nil]; the slice below is then
+		# taken with a nil...nil range - confirm that this is the intended output.
+		loc = TextAlignment::approximate_fit(str1, str2)
+		p loc
+		puts str2[loc[0]...loc[1]]
+	end
 end