RubyGems - text_alignment - Versions diffs - 0.2.7 → 0.3.11 - Mend

text_alignment 0.2.7 → 0.3.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23)

checksums.yaml +4 -4
data/bin/align_annotations +190 -39
data/lib/text_alignment/anchor_finder.rb +143 -0
data/lib/text_alignment/approximate_fit.rb +50 -49
data/lib/text_alignment/find_divisions.rb +199 -101
data/lib/text_alignment/glcs_alignment.rb +297 -297
data/lib/text_alignment/glcs_alignment_fast.rb +94 -94
data/lib/text_alignment/glcs_required.rb +50 -50
data/lib/text_alignment/lcs_alignment.rb +115 -115
data/lib/text_alignment/lcs_cdiff.rb +46 -48
data/lib/text_alignment/lcs_comparison.rb +53 -53
data/lib/text_alignment/lcs_min.rb +144 -138
data/lib/text_alignment/mappings.rb +68 -69
data/lib/text_alignment/mixed_alignment.rb +193 -0
data/lib/text_alignment/text_alignment.rb +228 -174
data/lib/text_alignment/version.rb +1 -1
data/text_alignment.gemspec +3 -3
metadata +7 -15
data/spec/spec_helper.rb +0 -1
data/spec/text_alignment/glcs_alignment_spec.rb +0 -302
data/spec/text_alignment/lcs_alignment_spec.rb +0 -98
data/spec/text_alignment/lcs_comparision_spec.rb +0 -322
data/spec/text_alignment/text_alignment_spec.rb +0 -302

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 45bf8e55a66daff4d79765aec0e4d7482151aa7b86749e054e5d93030ffffabe
-  data.tar.gz: 6251835e35077e2ae828666026a663ae63be7f8d7b7d785b4898763021f546e2
+  metadata.gz: 01525b6ca5f7e0ae1ebb9dfce2083006e095a66b3ab468ccef0a584bc3005556
+  data.tar.gz: bc1137052a12b8db97635b183f299518b0158f3ccecf36d42fa45d746b4f3792
 SHA512:
-  metadata.gz: 4750b2507b4eb172123ca9a82320c2456e66d3f06ca3e3bdd52cf31055e9cb46af53013a484dbb87b1bb3236701cc8a69e1c4c64b576ee41a3a4957c3431be65
-  data.tar.gz: 1819b94e47bdf064d53712bb667f15d74f8e37bd49df7976075c3cbdc0a992d25bb318f76bfc0522cea0b594d002bcab0ebcef6112bf9e26128c245a7fb67bcf
+  metadata.gz: 9596bea2616c3b4d939c8314d026941a6e627f4380183409051df62722a0ee5e3b35302da3066b0d32e8322582c999877b05a09c54749d878a284a062247342e
+  data.tar.gz: 361e3e7a23697167b41e037e7d272bbd286ac397416defb125821ebbcb17bfa386341c75fecbb94fe7f9976cfbc2a8b5f7f9be7c150d23daf6d8c16410509b5d

data/bin/align_annotations CHANGED

@@ -1,51 +1,202 @@
 #!/usr/bin/env ruby
 require 'text_alignment'
 require 'json'
 require 'pp'
+def read_annotations(filename)
+	case File.extname(filename)
+	when '.json'
+		JSON.parse File.read(filename), :symbolize_names => true
+	when '.txt'
+		{text: File.read(filename)}
+	else
+		raise "unknown file type: #{filename}"
+	end
+end
+def read_text(filename)
+	case File.extname(filename)
+	when '.json'
+		json = JSON.parse File.read(filename), :symbolize_names => true
+		json[:text]
+	when '.txt'
+		File.read(filename)
+	else
+		raise "unknown file type: #{filename}"
+	end
+end
+def align_mdoc(source_annotations, target_annotations)
+	idnum_denotations = 0
+	idnum_relations = 0
+	idnum_attributes = 0
+	idnum_modifications = 0
+	source_annotations.each do |annotations|
+		alignment = TextAlignment::TextAlignment.new(annotations[:text], target_annotations[:text])
+		# alignment.block_alignments.each do |a|
+			# p {source:a[:source], target:a[:target]}
+			# puts "--"
+			# p a[:alignment] if a[:alignment].nil? || a[:alignment] == :empty
+			# puts "--"
+			# puts annotations[:text][a[:source][:begin] ... a[:source][:end]]
+			# puts "--"
+			# puts target_text[a[:target][:begin] ... a[:target][:end]]
+			# puts "======"
+		# end
+		if annotations.has_key?(:denotations) && !annotations[:denotations].empty?
+			ididx = {}
+			denotations = alignment.transform_hdenotations(annotations[:denotations])
+			denotations.each do |d|
+				reid = 'T' + (idnum_denotations += 1).to_s
+				ididx[d[:id]] = reid
+				d[:id] = reid
+			end
+			target_annotations[:denotations] = [] unless target_annotations.has_key? :denotations
+			target_annotations[:denotations] += denotations
+			if annotations.has_key?(:relations) && !annotations[:relations].empty?
+				target_annotations[:relations] = [] unless target_annotations.has_key? :relations
+				annotations[:relations].each do |r|
+					reid = 'R' + (idnum_relations += 1).to_s
+					ididx[r[:id]] = reid
+					target_annotations[:relations] << r.dup.merge({id:reid, subj:ididx[r[:subj]], obj:ididx[r[:obj]]})
+				end
+			end
+			if annotations.has_key?(:attributes) && !annotations[:attributes].empty?
+				target_annotations[:attributes] = [] unless target_annotations.has_key? :attributes
+				annotations[:attributes].each do |a|
+					reid = 'A' + (idnum_attributes += 1).to_s
+					ididx[a[:id]] = reid
+					target_annotations[:attributes] << a.dup.merge({id:reid, subj:ididx[a[:subj]]})
+				end
+			end
+			if annotations.has_key?(:modifications) && !annotations[:modifications].empty?
+				target_annotations[:modifications] = [] unless target_annotations.has_key? :modifications
+				annotations[:modifications].each do |m|
+					reid = 'M' + (idnum_modifications += 1).to_s
+					ididx[m[:id]] = reid
+					target_annotations[:modifications] << m.dup.merge({id:reid, obj:ididx[m[:obj]]})
+				end
+			end
+		end
+	end
+	target_annotations
+end
 unless ARGV.length == 2
-	warn "align_annotations target_annotations(.json) reference_annotations(.json)"
+	warn "align_annotations target_annotations(.json|.txt) reference_annotations(.json|.txt)"
 	exit
 end
-anns1 = JSON.parse File.read(ARGV[0].strip), :symbolize_names => true
-anns2 = JSON.parse File.read(ARGV[1].strip), :symbolize_names => true
-str1 = anns1[:text]
-str2 = anns2[:text]
-denotations = anns1[:denotations]
-puts "[Alignment1]====="
-align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::MAPPINGS)
-puts TextAlignment::sdiff2cdiff(align.sdiff)
-puts
-puts "[Similarity]\n#{align.similarity}"
-puts
-puts '[Denotations original]'
-pp denotations
-puts
-puts '[Denotations transformed]'
-new_denotations = align.transform_hdenotations(denotations)
-pp new_denotations
-puts
-puts "[Alignment2 (downcased)]====="
-align = TextAlignment::TextAlignment.new(str1.downcase, str2.downcase, TextAlignment::MAPPINGS)
-puts TextAlignment::sdiff2cdiff(align.sdiff)
-puts
-puts "[Similarity]\n#{align.similarity}"
-puts
-puts '[Denotations original]'
-pp denotations
-puts
-puts '[Denotations transformed]'
-new_denotations = align.transform_hdenotations(denotations)
-pp new_denotations
-puts
-puts '[Annotations transformed]'
-anns2[:denotations] = new_denotations
-puts anns2.to_json
+source_annotations = read_annotations(ARGV[0])
+target_text = read_text(ARGV[1])
+lost_annotations = []
+target_annotations = if source_annotations.class == Array
+	align_mdoc(source_annotations, {text: target_text})
+else
+	alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text)
+	# alignment.block_alignments.each do |a|
+	# 	if a[:alignment].nil? || a[:alignment] == :empty
+	# 		# p [a[:source], a[:target]]
+	# 		# p a[:alignment]
+	# 	else
+	# 		p [a[:source], a[:target]]
+	# 		p a[:alignment].similarity
+	# 		puts "--"
+	# 		puts source_annotations[:text][a[:source][:begin] ... a[:source][:end]]
+	# 		puts "--"
+	# 		puts target_text[a[:target][:begin] ... a[:target][:end]]
+	# 		puts "======"
+	# 	end
+	# end
+	# exit
+	denotations = alignment.transform_hdenotations(source_annotations[:denotations])
+	lost_annotations += alignment.lost_annotations if alignment.lost_annotations
+	source_annotations.merge({text:target_text, denotations:denotations})
+end
+num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source = if source_annotations.class == Array
+	num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source = 0, 0, 0, 0
+	source_annotations.each do |annotations|
+		num_denotations_source += annotations[:denotations].nil? ? 0 : annotations[:denotations].length
+		num_relations_source += annotations[:relations].nil? ? 0 : annotations[:relations].length
+		num_attributes_source += annotations[:attributes].nil? ? 0 : annotations[:attributes].length
+		num_modifications_source += annotations[:modifications].nil? ? 0 : annotations[:modifications].length
+	end
+	[num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source]
+else
+	num_denotations_source = source_annotations[:denotations].nil? ? 0 : source_annotations[:denotations].length
+	num_relations_source = source_annotations[:relations].nil? ? 0 : source_annotations[:relations].length
+	num_attributes_source = source_annotations[:attributes].nil? ? 0 : source_annotations[:attributes].length
+	num_modifications_source = source_annotations[:modifications].nil? ? 0 : source_annotations[:modifications].length
+	[num_denotations_source, num_relations_source, num_attributes_source, num_modifications_source]
+end
+warn "[source]"
+warn "denotations:\t#{num_denotations_source}"
+# warn "relations:\t#{num_relations_source}"
+# warn "attributes:\t#{num_attributes_source}"
+# warn "modifications:\t#{num_modifications_source}"
+warn "\n[target]"
+warn "denotations:\t#{target_annotations[:denotations].nil? ? 0 : target_annotations[:denotations].length}"
+# warn "relations:\t#{target_annotations[:relations].nil? ? 0 : target_annotations[:relations].length}"
+# warn "attributes:\t#{target_annotations[:attributes].nil? ? 0 : target_annotations[:attributes].length}"
+# warn "modifications:\t#{target_annotations[:modifications].nil? ? 0 : target_annotations[:modifications].length}"
+if lost_annotations
+	warn "\n[lost annotations]"
+	warn "#{lost_annotations.length}"
+end
+puts target_annotations.to_json
+# denotations = anns1[:denotations]
+# puts "[Alignment1]====="
+# align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::MAPPINGS)
+# align.alignment.each do |a|
+# 	p [a[:target][:begin], a[:target][:end], a[:source][:begin], a[:source][:end]]
+# end
+# puts TextAlignment::sdiff2cdiff(align.sdiff)
+# puts
+# puts "[Similarity]\n#{align.similarity}"
+# puts
+# puts '[Denotations original]'
+# pp denotations
+# puts
+# puts '[Denotations transformed]'
+# new_denotations = align.transform_hdenotations(denotations)
+# pp new_denotations
+# puts
+# puts "[Alignment2 (downcased)]====="
+# align = TextAlignment::TextAlignment.new(str1.downcase, str2.downcase, TextAlignment::MAPPINGS)
+# puts TextAlignment::sdiff2cdiff(align.sdiff)
+# puts
+# puts "[Similarity]\n#{align.similarity}"
+# puts
+# puts '[Denotations original]'
+# pp denotations
+# puts
+# puts '[Denotations transformed]'
+# new_denotations = align.transform_hdenotations(denotations)
+# pp new_denotations
+# puts
+# puts '[Annotations transformed]'
+# anns2[:denotations] = new_denotations
+# puts anns2.to_json
 # p align.common_elements
 # puts "---------------"

data/lib/text_alignment/anchor_finder.rb ADDED

@@ -0,0 +1,143 @@
+#!/usr/bin/env ruby
+require 'string-similarity'
+module TextAlignment; end unless defined? TextAlignment
+TextAlignment::SIZE_NGRAM = 5 unless defined? TextAlignment::SIZE_NGRAM
+TextAlignment::SIZE_WINDOW = 10 unless defined? TextAlignment::SIZE_WINDOW
+TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.7 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
+class TextAlignment::AnchorFinder
+	def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil)
+		@size_ngram  = _size_ngram  || TextAlignment::SIZE_NGRAM
+		@size_window = _size_window || TextAlignment::SIZE_WINDOW
+		@reverse = (target_str.length < source_str.length)
+		@s1, @s2 = if @reverse
+			[target_str.downcase, source_str.downcase]
+		else
+			[source_str.downcase, target_str.downcase]
+		end
+		# current position in s1
+		@beg_s1 = 0
+	end
+	def get_next_anchor
+		# find the position of an anchor ngram in s1 and s2
+		while @beg_s1 < (@s1.length - @size_ngram)
+			anchor = @s1[@beg_s1, @size_ngram]
+			search_position = 0
+			while @beg_s2 = @s2.index(anchor, search_position)
+				# if both the beginning points are sufficiently close to the end points of the last match
+				break if @end_s1_prev && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 - @end_s2_prev < 5)
+				left_window_s1, left_window_s2 = get_left_windows
+				break if left_window_s1  && text_similarity(left_window_s1, left_window_s2)  > TextAlignment::TEXT_SIMILARITY_TRESHOLD
+				right_window_s1, right_window_s2 = get_right_windows
+				break if right_window_s2 && text_similarity(right_window_s1, left_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD
+				search_position = @beg_s2 + 1
+			end
+			break unless @beg_s2.nil?
+			@beg_s1 += 1
+		end
+		return nil if @beg_s1 >= (@s1.length - @size_ngram)
+		# extend the block
+		b1 = @beg_s1
+		b2 = @beg_s2
+		while b1 > -1 && b2 > -1 && @s1[b1] == @s2[b2]
+			b1 -= 1; b2 -= 1
+		end
+		b1 += 1; b2 += 1
+		e1 = @beg_s1 + @size_ngram
+		e2 = @beg_s2 + @size_ngram
+		while @s1[e1] && @s1[e1] == @s2[e2]
+			e1 += 1; e2 += 1
+		end
+		@end_s1_prev = e1
+		@end_s2_prev = e2
+		@beg_s1 = e1
+		if @reverse
+			{source:{begin:b2 , end:e2}, target:{begin:b1, end:e1}}
+		else
+			{source:{begin:b1 , end:e1}, target:{begin:b2, end:e2}}
+		end
+	end
+	private
+	def get_left_windows
+		return if @beg_s1 < @size_window || @beg_s2 < @size_window
+		window_s1 = ''
+		loc = @beg_s1 - 1
+		count = 0
+		while count < @size_window && loc >= 0
+			if @s1[loc] =~ /[0-9a-zA-Z]/
+				window_s1 += @s1[loc]
+				count += 1
+			end
+			loc -= 1
+		end
+		window_s2 = ''
+		loc = @beg_s2 - 1
+		count = 0
+		while count < @size_window && loc >= 0
+			if @s2[loc] =~ /[0-9a-zA-Z]/
+				window_s2 += @s2[loc]
+				count += 1
+			end
+			loc -= 1
+		end
+		[window_s1, window_s2]
+	end
+	def get_right_windows
+		return if (@beg_s1 + @size_ngram < (@s1.length - @size_window)) || (@beg_s2 + @size_ngram < (@s2.length - @size_window))
+		window_s1 = ''
+		loc = @beg_s1 + @size_ngram
+		len_s1 = @s1.length
+		count = 0
+		while count < @size_window && loc < len_s1
+			if @s1[loc] =~ /[0-9a-zA-Z]/
+				window_s1 += @s1[loc]
+				count += 1
+			end
+			loc += 1
+		end
+		window_s2 = ''
+		loc = @beg_s2 + @size_ngram
+		len_s2 = @s2.length
+		count = 0
+		while count < @size_window && loc < len_s2
+			if @s2[loc] =~ /[0-9a-zA-Z]/
+				window_s2 += @s2[loc]
+				count += 1
+			end
+			loc += 1
+		end
+		[window_s1, window_s2]
+	end
+	def text_similarity(str1, str2, ngram_order = 2)
+		String::Similarity.cosine(str1, str2, ngram:ngram_order)
+	end
+end

data/lib/text_alignment/approximate_fit.rb CHANGED

@@ -4,72 +4,73 @@ require 'string-similarity'
 module TextAlignment; end unless defined? TextAlignment
 # approximate the location of str1 in str2
-module TextAlignment
-  SIGNATURE_NGRAM = 5
-  MIN_LENGTH_FOR_APPROXIMATION = 50
-  BUFFER_RATE = 0.1
-  TEXT_SIMILARITY_TRESHOLD = 0.8
-end
+TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
+TextAlignment::MIN_LENGTH_FOR_APPROXIMATION = 50 unless defined? TextAlignment::MIN_LENGTH_FOR_APPROXIMATION
+TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
+TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.7 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
 class << TextAlignment
-  # If finds an approximate region of str2 that contains str1
-  def approximate_fit(str1, str2)
-    raise ArgumentError, 'nil string' if str1.nil? || str2.nil?
-    return 0, str2.length if str2.length < TextAlignment::MIN_LENGTH_FOR_APPROXIMATION
+	# If finds an approximate region of str2 that contains str1
+	def approximate_fit(str1, str2)
+		raise ArgumentError, 'nil string' if str1.nil? || str2.nil?
+		return 0, str2.length if str2.length < TextAlignment::MIN_LENGTH_FOR_APPROXIMATION
-    ngram1 = (0 .. str1.length - TextAlignment::SIGNATURE_NGRAM).collect{|i| str1[i, TextAlignment::SIGNATURE_NGRAM]}
-    ngram2 = (0 .. str2.length - TextAlignment::SIGNATURE_NGRAM).collect{|i| str2[i, TextAlignment::SIGNATURE_NGRAM]}
-    ngram_shared = ngram1 & ngram2
+		ngram1 = (0 .. str1.length - TextAlignment::SIGNATURE_NGRAM).collect{|i| str1[i, TextAlignment::SIGNATURE_NGRAM]}
+		ngram2 = (0 .. str2.length - TextAlignment::SIGNATURE_NGRAM).collect{|i| str2[i, TextAlignment::SIGNATURE_NGRAM]}
+		ngram_shared = ngram1 & ngram2
-    # If there is no shared n-gram found, it may mean there is no serious overlap between the two strings
-    return nil, nil if ngram_shared.empty?
+		# If there is no shared n-gram found, it may mean there is no serious overlap between the two strings
+		return nil, nil if ngram_shared.empty?
-    signature_ngrams = ngram_shared.select{|g| ngram2.count(g) == 1}
-    return nil, nil if signature_ngrams.empty? #raise "no signature ngram"
+		signature_ngrams = ngram_shared.select{|g| ngram2.count(g) == 1}
+		return nil, nil if signature_ngrams.empty? #raise "no signature ngram"
-    fit_begin, fit_end = nil, nil
-    signature_ngrams.each do |signature_ngram|
-      loc_signature_ngram_in_str1 = str1.index(signature_ngram)
-      loc_signature_ngram_in_str2 = str2.index(signature_ngram)
+		cache = {}
+		fit_begin, fit_end = nil, nil
+		signature_ngrams.each do |signature_ngram|
+			loc_signature_ngram_in_str1 = str1.index(signature_ngram)
+			loc_signature_ngram_in_str2 = str2.index(signature_ngram)
-      # approximate the beginning of the fit
-      fit_begin = loc_signature_ngram_in_str2 - loc_signature_ngram_in_str1 - (loc_signature_ngram_in_str1 * TextAlignment::BUFFER_RATE).to_i
-      fit_begin = 0 if fit_begin < 0
+			# approximate the beginning of the fit
+			fit_begin = loc_signature_ngram_in_str2 - loc_signature_ngram_in_str1 - (loc_signature_ngram_in_str1 * TextAlignment::BUFFER_RATE).to_i
+			fit_begin = 0 if fit_begin < 0
-      # approximate the end of the fit
-      offset_end = str1.length - loc_signature_ngram_in_str1
-      fit_end = loc_signature_ngram_in_str2 + offset_end + (offset_end * TextAlignment::BUFFER_RATE).to_i
-      fit_end = str2.length if fit_end > str2.length
+			# approximate the end of the fit
+			offset_end = str1.length - loc_signature_ngram_in_str1
+			fit_end = loc_signature_ngram_in_str2 + offset_end + (offset_end * TextAlignment::BUFFER_RATE).to_i
+			fit_end = str2.length if fit_end > str2.length
-      text_similarity = text_similarity(str1, str2[fit_begin ... fit_end])
-      break if text_similarity > TextAlignment::TEXT_SIMILARITY_TRESHOLD
-      fit_begin, fit_end = nil, nil
-    end
+			next if cache.has_key?("#{fit_begin}-#{fit_end}")
+			text_similarity = text_similarity(str1, str2[fit_begin ... fit_end])
+			cache["#{fit_begin}-#{fit_end}"] = text_similarity
-    return nil, nil if fit_begin >= fit_end
-    return fit_begin, fit_end
-  end
+			break if text_similarity > TextAlignment::TEXT_SIMILARITY_TRESHOLD
+			fit_begin, fit_end = nil, nil
+		end
+		return fit_begin, fit_end if fit_begin && fit_end && fit_begin < fit_end
+		return nil, nil
+	end
-  private
+	private
-  def text_similarity(str1, str2, ngram_order = 3)
-    _str1 = str1.delete(" \t\r\n")
-    _str2 = str2.delete(" \t\r\n")
-    String::Similarity.cosine(_str1, _str2, ngram:2)
-  end
+	def text_similarity(str1, str2, ngram_order = 3)
+		_str1 = str1.delete(" \t\r\n")
+		_str2 = str2.delete(" \t\r\n")
+		String::Similarity.cosine(_str1, _str2, ngram:2)
+	end
 end
 if __FILE__ == $0
-  require 'json'
+	require 'json'
-  if ARGV.length == 2
-    str1 = JSON.parse(File.read(ARGV[0]).strip)["text"]
-    str2 = JSON.parse(File.read(ARGV[1]).strip)["text"]
+	if ARGV.length == 2
+		str1 = JSON.parse(File.read(ARGV[0]).strip)["text"]
+		str2 = JSON.parse(File.read(ARGV[1]).strip)["text"]
-    loc = TextAlignment::approximate_fit(str1, str2)
-    p loc
-    puts str2[loc[0]...loc[1]]
-  end
+		loc = TextAlignment::approximate_fit(str1, str2)
+		p loc
+		puts str2[loc[0]...loc[1]]
+	end
 end