RubyGems - text_alignment - Versions diffs - 0.11.2 → 0.11.8 - Mend

text_alignment 0.11.2 → 0.11.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

checksums.yaml +4 -4
data/bin/align_annotations +31 -8
data/lib/text_alignment/anchor_finder.rb +54 -6
data/lib/text_alignment/char_mapping.rb +61 -21
data/lib/text_alignment/cultivation_map.rb +1 -1
data/lib/text_alignment/mixed_alignment.rb +15 -4
data/lib/text_alignment/text_alignment.rb +10 -7
data/lib/text_alignment/version.rb +1 -1
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 81994ec9a8c7c08d2aad32b351b5942fef1748e4035ba762af546d4f3fe7cee8
-  data.tar.gz: bfa75451d33b9d21c2baa1a52280f03486bf10a41b5ac2a97469f0ca3a4f7379
+  metadata.gz: d07269e998620f3e8f1564a4b81d3710e1898058248377224be0c5398690872f
+  data.tar.gz: 7f457820b2d5a9a9dcbf00ceb89342ca7264e2ef85afd02bd2463256e71680dd
 SHA512:
-  metadata.gz: 73d0ff212a89d6ad33751f87f14a9b292b45ce177c61efd0ede5f852eb3834d1bd3940d202c4d87cfb0422c4dc566dea30c560f8500a220dcd3e1dd492e29eac
-  data.tar.gz: 6c763a564e339267624bec5c809b334b0adf9951d25409eac21ae4b4582beae4a08739119dfa369022c44c19d4faacfd0e06766c6223bd219a43c3961cfab08c
+  metadata.gz: 299320641fb58973b89c64f1db0c79007f43103cb063bd171f386d9118cb49257765d0652024bbd7869daa88c4f18da7338b8d60285f3e0a95258049b9e62562
+  data.tar.gz: 3bdd474509910999521c52808a535227dbf5a6aa74345e39dc0e37eae7a000b51b1df3dbb50598abc865a036a4990cee0f3ccce35471d69cb2ef4bb9633fc83a

data/bin/align_annotations CHANGED Viewed

@@ -2,6 +2,7 @@
 require 'text_alignment'
 require 'json'
 require 'pp'
+require 'optparse'
 def read_annotations(filename)
 	case File.extname(filename)
@@ -108,24 +109,46 @@ def align_mannotations(source_annotations, reference_text, alignment, debug = fa
 end
+## Options
+overlap_p = false
+debug_p = false
+## command line option processing
+require 'optparse'
+optparse = OptionParser.new do |opts|
+	opts.banner = "Usage: align_annotations [options] target_annotations(.json|.txt) reference_text(.json|.txt)"
+	opts.on('-o', '--overlap', 'tells it to assume there may be overlapping texts.') do
+		overlap_p = true
+	end
+	opts.on('-d', '--debug', 'tells it to show debugging information.') do
+		debug_p = true
+	end
+	opts.on('-h', '--help', 'displays this screen.') do
+		puts opts
+		exit
+	end
+end
+optparse.parse!
 unless ARGV.length == 2
-	warn "align_annotations target_annotations(.json|.txt) reference_annotations(.json|.txt)"
-	exit
+	puts optparse.help
+	exit 1
 end
 source_annotations = read_annotations(ARGV[0])
 reference_text = read_text(ARGV[1])
-alignment = TextAlignment::TextAlignment.new(reference_text, true)
+alignment = TextAlignment::TextAlignment.new(reference_text, !overlap_p)
 target_annotations = if source_annotations.class == Array
-	# align_mannotations(source_annotations, reference_text, alignment, true)
-	align_mannotations(source_annotations, reference_text, alignment, false)
+	align_mannotations(source_annotations, reference_text, alignment, debug_p)
 else
-	# denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment)
-	denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment, false)
+	denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment, debug_p)
 	source_annotations.merge({text:reference_text, denotations:denotations})
 end
-# pp alignment.block_alignment
 # puts target_annotations.to_json

data/lib/text_alignment/anchor_finder.rb CHANGED Viewed

@@ -6,7 +6,13 @@ module TextAlignment; end unless defined? TextAlignment
 class TextAlignment::AnchorFinder
-	def initialize(source_str, target_str, cultivation_map)
+	def initialize(source_str, target_str, cultivation_map, squeeze_ws = true)
+		@method_get_left_windows, @method_get_right_windows = if squeeze_ws
+			[method(:get_left_windows), method(:get_right_windows)]
+		else
+			[method(:get_left_windows_no_squeeze_ws), method(:get_right_windows_no_squeeze_ws)]
+		end
 		@s1 = source_str.downcase
 		@s2 = target_str.downcase
@@ -108,14 +114,14 @@ class TextAlignment::AnchorFinder
 					next
 				end
-				left_window_s1, left_window_s2 = get_left_windows(beg_s1, beg_s2)
+				left_window_s1, left_window_s2 = @method_get_left_windows.call(beg_s1, beg_s2, size_window)
 				if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
 					break unless valid_beg_s2.nil?
 					valid_beg_s2 = beg_s2
 					next
 				end
-				right_window_s1, right_window_s2 = get_right_windows(beg_s1, beg_s2)
+				right_window_s1, right_window_s2 = @method_get_right_windows.call(beg_s1, beg_s2, size_window)
 				if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
 					break unless valid_beg_s2.nil?
 					valid_beg_s2 = beg_s2
@@ -125,7 +131,11 @@ class TextAlignment::AnchorFinder
 			# r == nil means that the inner loop was broken (multiple candidates had passed the tests)
 			# r != nil means that the inner loop was completed (with or w/o a valid beg_s2 found)
-			break unless r.nil?
+			if r.nil?
+				valid_beg_s2 = nil
+			else
+				break
+			end
 		end
 		valid_beg_s2
@@ -135,7 +145,7 @@ class TextAlignment::AnchorFinder
 		size_window ||= @size_window
 		# comment out below with the assumption that the beginning of a document gives a significant locational information
-		# return if @beg_s1 < size_window || @beg_s2 < size_window
+		# return if beg_s1 < size_window || beg_s2 < size_window
 		window_s1 = ''
 		loc = beg_s1 - 1
@@ -166,7 +176,7 @@ class TextAlignment::AnchorFinder
 		size_window ||= @size_window
 		# comment out below with the assumption that the end of a document gives significant locational information
-		# return if (@beg_s1 + @size_ngram > (@s1.length - size_window)) || (@beg_s2 + @size_ngram > (@s2.length - size_window))
+		# return if (beg_s1 + @size_ngram > (@s1.length - size_window)) || (beg_s2 + @size_ngram > (@s2.length - size_window))
 		window_s1 = ''
 		loc = beg_s1 + @size_ngram
@@ -195,6 +205,44 @@ class TextAlignment::AnchorFinder
 		[window_s1, window_s2]
 	end
+	def get_left_windows_no_squeeze_ws(beg_s1, beg_s2, size_window = nil)
+		size_window ||= @size_window
+		# comment out below with the assumption that the beginning of a document gives a significant locational information
+		# return if beg_s1 < size_window || beg_s2 < size_window
+		wbeg = beg_s1 - size_window
+		wbeg = 0 if wbeg < 0
+		window_s1 = @s1[wbeg ... beg_s1]
+		wbeg = beg_s2 - size_window
+		wbeg = 0 if wbeg < 0
+		window_s2 = @s2[wbeg ... beg_s2]
+		[window_s1, window_s2]
+	end
+	def get_right_windows_no_squeeze_ws(beg_s1, beg_s2, size_window = nil)
+		size_window ||= @size_window
+		# comment out below with the assumption that the end of a document gives significant locational information
+		# return if (beg_s1 + @size_ngram > (@s1.length - size_window)) || (beg_s2 + @size_ngram > (@s2.length - size_window))
+		slen = @s1.length
+		wbeg = beg_s1 + @size_ngram
+		wend = wbeg + size_window
+		wend = slen if wend > slen
+		window_s1 = @s1[wbeg ... wend]
+		slen = @s2.length
+		wbeg = beg_s2 + @size_ngram
+		wend = wbeg + size_window
+		wend = slen if wend > slen
+		window_s2 = @s2[wbeg ... wend]
+		[window_s1, window_s2]
+	end
 	def text_similarity(str1, str2, ngram_order = 2)
 		return 0 if str1.nil? || str2.nil?
 		String::Similarity.cosine(str1, str2, ngram:ngram_order)

data/lib/text_alignment/char_mapping.rb CHANGED Viewed

@@ -1,3 +1,5 @@
+require 'strscan'
 module TextAlignment; end unless defined? TextAlignment
 TextAlignment::CHAR_MAPPING = [
@@ -80,8 +82,16 @@ TextAlignment::CHAR_MAPPING = [
 class TextAlignment::CharMapping
 	attr_reader :mapped_text
-	def initialize(_text, char_mapping = nil)
-		char_mapping ||= TextAlignment::CHAR_MAPPING
+	def initialize(_text, char_mapping = nil, squeeze_ws_to = 1)
+		if squeeze_ws_to == 0
+			@method_get_positions_squeeze_ws = method(:get_positions_squeeze_ws_0)
+			@method_squeeze_ws = method(:squeeze_ws_0!)
+		else
+			@method_get_positions_squeeze_ws = method(:get_positions_squeeze_ws_1)
+			@method_squeeze_ws = method(:squeeze_ws_1!)
+		end
+		char_mapping ||= TextAlignment::CHAR_MAPPING.sort{|a, b| b[1].length <=> a[1].length}
 		@mapped_text, offset_mapping = enmap_text(_text, char_mapping)
 		@index_enmap = offset_mapping.to_h
 		@index_demap = offset_mapping.map{|m| m.reverse}.to_h
@@ -105,22 +115,22 @@ class TextAlignment::CharMapping
 	private
-	def enmap_text(_text, char_mapping)
+	def enmap_text(_text, char_mapping, no_ws = false)
 		text = _text.dup
-		# To execute the single letter mapping
+		# To execute the single letter mapping replacement
 		char_mapping.each do |one, long|
 			text.gsub!(one, long) if long.length == 1
 		end
-		# To get the (location, length) index for replacements
-		loc_len = []
+		# To get the replacement positions, (position, old_length, new_length), for char mappings
+		rpositions = []
 		char_mapping.each do |one, long|
 			next if long.length == 1
 			init_next = 0
 			while loc = text.index(long, init_next)
-				loc_len << [loc, long.length]
+				rpositions << [loc, long.length, 1]
 				init_next = loc + long.length
 			end
@@ -128,32 +138,31 @@ class TextAlignment::CharMapping
 			text.gsub!(long, one * long.length)
 		end
-		# To get the (location, length) index for consecutive whitespace sequences
-		init_next = 0
-		while loc = text.index(/\s{2,}/, init_next)
-			len = $~[0].length
-			loc_len << [loc, len]
-			init_next = loc + len
-		end
+		# To get the replacement positions, (position, old_length, new_length), for consecutive whitespaces
+		rpositions += @method_get_positions_squeeze_ws.call(text)
-		loc_len.sort!{|a, b| a[0] <=> b[0]}
+		rpositions.sort!{|a, b| a[0] <=> b[0]}
 		# To get the offset_mapping before and after replacement
 		offset_mapping = []
 		init_next = 0
 		j = 0
-		loc_len.each do |loc, len|
+		rpositions.each do |loc, old_len, new_len|
 			offset_mapping += (init_next .. loc).map do |i|
+				m = [i, j]
 				j += 1
-				[i, j - 1]
+				m
 			end
-			init_next = loc + len
+			init_next = loc + old_len
+			j += (new_len - 1)
 		end
 		offset_mapping += (init_next .. text.length).map do |i|
+			m = [i, j]
 			j += 1
-			[i, j - 1]
+			m
 		end
 		# To execute the long letter mapping
@@ -162,10 +171,41 @@ class TextAlignment::CharMapping
 		end
 		# To replace multi whitespace sequences to a space
-		text.gsub!(/\s{2,}/, ' ')
+		@method_squeeze_ws.call(text)
 		[text, offset_mapping]
 	end
+	# To get squeeze positions of whitespaces to one
+	def get_positions_squeeze_ws_1(text)
+		rpositions = []
+		text.scan(/\s{2,}/) do |s|
+			loc = $~.begin(0)
+			len = $~.end(0) - loc
+			rpositions << [loc, len, 1]
+		end
+		rpositions
+	end
+	# To get squeeze positions of whitespaces to zero
+	def get_positions_squeeze_ws_0(text)
+		rpositions = []
+		text.scan(/\s+/) do |s|
+			loc = $~.begin(0)
+			len = $~.end(0) - loc
+			rpositions << [loc, len, 0]
+		end
+		rpositions
+	end
+	def squeeze_ws_1!(text)
+		text.gsub!(/\s{2,}/, ' ')
+	end
+	def squeeze_ws_0!(text)
+		text.gsub!(/\s+/, '')
+	end
 end
 if __FILE__ == $0
@@ -186,5 +226,5 @@ if __FILE__ == $0
 	denotations_mapped = text_mapping.enmap_denotations(denotations)
 	new_annotations = {text:text_mapped, denotations:denotations_mapped}
-	puts new_annotations.to_json
+	# puts new_annotations.to_json
 end

data/lib/text_alignment/cultivation_map.rb CHANGED Viewed

@@ -41,7 +41,7 @@ class TextAlignment::CultivationMap
 	end
 	def next_cultivated_position(position)
-		region = @map.bsearch{|r| position < r[0]}
+		region = @map.bsearch{|r| position <= r[0]}
 		region.nil? ? nil : region[0]
 	end

data/lib/text_alignment/mixed_alignment.rb CHANGED Viewed

@@ -147,13 +147,24 @@ class TextAlignment::MixedAlignment
 		# recoverability
 		count_nws =	sdiff.count{|d| d.old_element =~ /\S/}
 		count_nws_match =	sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/}
 		coverage = count_nws_match.to_f / count_nws
 		# fragmentation rate
-		count_ofrag = sdiff.count{|d| d.old_element =~ /\s/}
-		count_frag = sdiff.collect{|d| (d.action == '=') && (d.old_element =~/\s/) ? ' ' : d.action}.join.scan(/=+/).count
-		rate_frag = count_ofrag.to_f / count_frag
+		frag_str = sdiff.collect do |d|
+			case d.action
+			when '='
+				'='
+			when '-'
+				''
+			when '+'
+				(d.new_element =~ /\S/) ? '+' : ''
+			else
+				''
+			end
+		end.join.sub(/^[^=]++/, '').sub(/[^=]+$/, '')
+		count_frag = frag_str.scan(/=+/).count
+		rate_frag = 1.0 / count_frag
 		similarity = coverage * rate_frag
 	end

data/lib/text_alignment/text_alignment.rb CHANGED Viewed

@@ -11,14 +11,17 @@ class TextAlignment::TextAlignment
 	attr_reader :similarity
 	attr_reader :lost_annotations
-	# Initialize with a reference text, again which texts will be aligned
-	def initialize(reference_text, to_prevent_overlap = false)
+	# Initialize with a reference text, against which texts will be aligned
+	def initialize(reference_text, options = {})
 		raise ArgumentError, "nil text" if reference_text.nil?
+		options ||= {}
+		@to_prevent_overlap = options[:to_prevent_overlap] || false
+		@squeeze_ws_to = options[:squeeze_ws_to] || 0
 		@original_reference_text = reference_text
-		@rtext_mapping = TextAlignment::CharMapping.new(reference_text)
+		@rtext_mapping = TextAlignment::CharMapping.new(reference_text, nil, @squeeze_ws_to)
 		@mapped_reference_text = @rtext_mapping.mapped_text
-		@to_prevent_overlap = to_prevent_overlap
 		@original_text = nil
 		@blocks = nil
@@ -32,7 +35,7 @@ class TextAlignment::TextAlignment
 		# In case the input text is the same as the previous one, reuse the previous text mapping
 		unless @original_text && @original_text == text
 			@original_text = text
-			@text_mapping = TextAlignment::CharMapping.new(text)
+			@text_mapping = TextAlignment::CharMapping.new(text, nil, @squeeze_ws_to)
 		end
 		@mapped_text = @text_mapping.mapped_text
@@ -202,7 +205,7 @@ class TextAlignment::TextAlignment
 	def find_block_alignment(str1, str2, denotations, cultivation_map)
 		## to find block alignments
-		anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map)
+		anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map, @squeeze_ws_to == 1)
 		blocks = []
 		while block = anchor_finder.get_next_anchor
@@ -320,7 +323,7 @@ class TextAlignment::TextAlignment
 	def local_alignment(str1, b1, e1, str2, b2, e2, denotations = nil, cultivation_map)
 		tblocks = term_based_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
-		if tblocks.empty?
+		if tblocks.empty? || tblocks.first[:alignment] == :empty
 			lcs_alignment(str1, b1, e1, str2, b2, e2, cultivation_map)
 		else
 			tblocks

data/lib/text_alignment/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 class TextAlignment
-	VERSION = '0.11.2'
+	VERSION = '0.11.8'
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: text_alignment
 version: !ruby/object:Gem::Version
-  version: 0.11.2
+  version: 0.11.8
 platform: ruby
 authors:
 - Jin-Dong Kim
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2021-03-14 00:00:00.000000000 Z
+date: 2021-04-06 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: ruby-dictionary