text_alignment 0.3.21 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/align_annotations +2 -2
- data/lib/text_alignment/anchor_finder.rb +15 -10
- data/lib/text_alignment/approximate_fit.rb +4 -6
- data/lib/text_alignment/constants.rb +7 -0
- data/lib/text_alignment/glcs_alignment_fast.rb +0 -2
- data/lib/text_alignment/text_alignment.rb +7 -7
- data/lib/text_alignment/version.rb +1 -1
- metadata +3 -2
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 2 | 
             
            SHA256:
         | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 3 | 
            +
              metadata.gz: d56b44ee3cc3451b78ddb87134fefd6c043b4730f1170b07069a417bb162bad1
         | 
| 4 | 
            +
              data.tar.gz: 67318d70bee5bb82127c98e79e0b45e5e337832a2b8d23469474215168579f20
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: 60ae2292556d15d860e0c7377be9c1fd5aec6741b26f562fee8c0c2a9a123d0b8c656b2f7f34b8f74f83ea9ac4878b53f17454e3b25d27d00cd98ecf8175a387
         | 
| 7 | 
            +
              data.tar.gz: fe8fe545101cddb638c00ffb6f6785f8b2f5ddb109b4010739e399a2c50fe41164d791fe4a5599a76fca5553c494f78fd49630368d48e406a8211fc0516916f5
         | 
    
        data/bin/align_annotations
    CHANGED
    
    | @@ -108,7 +108,7 @@ else | |
| 108 108 | 
             
            	# verification
         | 
| 109 109 | 
             
            	source_text = source_annotations[:text]
         | 
| 110 110 | 
             
            	puts "=====BEGIN"
         | 
| 111 | 
            -
            	(0 ... source_text.length).each do |p|
         | 
| 111 | 
            +
            	(0 ... source_text.rstrip.length).each do |p|
         | 
| 112 112 | 
             
            		t = alignment.transform_begin_position(p)
         | 
| 113 113 | 
             
            		if t.nil?
         | 
| 114 114 | 
             
            			print source_text[p]
         | 
| @@ -120,7 +120,7 @@ else | |
| 120 120 | 
             
            	puts "=====END"
         | 
| 121 121 |  | 
| 122 122 | 
             
            	puts "=====BEGIN"
         | 
| 123 | 
            -
            	(0 .. source_text.length).each do |p|
         | 
| 123 | 
            +
            	(0 .. source_text.rstrip.length).each do |p|
         | 
| 124 124 | 
             
            		t = alignment.transform_end_position(p)
         | 
| 125 125 | 
             
            		if t.nil?
         | 
| 126 126 | 
             
            			print source_text[p]
         | 
| @@ -1,17 +1,15 @@ | |
| 1 1 | 
             
            #!/usr/bin/env ruby
         | 
| 2 | 
            +
            require 'text_alignment/constants'
         | 
| 2 3 | 
             
            require 'string-similarity'
         | 
| 3 4 |  | 
| 4 5 | 
             
            module TextAlignment; end unless defined? TextAlignment
         | 
| 5 6 |  | 
| 6 | 
            -
            TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
         | 
| 7 | 
            -
            TextAlignment::SIZE_WINDOW = 40 unless defined? TextAlignment::SIZE_WINDOW
         | 
| 8 | 
            -
            TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.8 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
         | 
| 9 | 
            -
             | 
| 10 7 | 
             
            class TextAlignment::AnchorFinder
         | 
| 11 8 |  | 
| 12 | 
            -
            	def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil)
         | 
| 9 | 
            +
            	def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
         | 
| 13 10 | 
             
            		@size_ngram  = _size_ngram  || TextAlignment::SIZE_NGRAM
         | 
| 14 11 | 
             
            		@size_window = _size_window || TextAlignment::SIZE_WINDOW
         | 
| 12 | 
            +
            		@sim_threshold = _text_similiarity_threshold || TextAlignment::TEXT_SIMILARITY_THRESHOLD
         | 
| 15 13 |  | 
| 16 14 | 
             
            		@reverse = (target_str.length < source_str.length)
         | 
| 17 15 |  | 
| @@ -30,6 +28,10 @@ class TextAlignment::AnchorFinder | |
| 30 28 | 
             
            	def get_next_anchor
         | 
| 31 29 | 
             
            		# find the position of an anchor ngram in s1 and s2
         | 
| 32 30 | 
             
            		while @beg_s1 < (@s1.length - @size_ngram)
         | 
| 31 | 
            +
            			if [' ', "\n", "\t"].include? @s1[@beg_s1]
         | 
| 32 | 
            +
            				@beg_s1 += 1
         | 
| 33 | 
            +
            				next
         | 
| 34 | 
            +
            			end
         | 
| 33 35 | 
             
            			anchor = @s1[@beg_s1, @size_ngram]
         | 
| 34 36 |  | 
| 35 37 | 
             
            			# search_position = 0
         | 
| @@ -39,10 +41,10 @@ class TextAlignment::AnchorFinder | |
| 39 41 | 
             
            				break if @beg_s1 > 0 && @beg_s2 > 0 && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
         | 
| 40 42 |  | 
| 41 43 | 
             
            				left_window_s1, left_window_s2 = get_left_windows
         | 
| 42 | 
            -
            				break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) >  | 
| 44 | 
            +
            				break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
         | 
| 43 45 |  | 
| 44 46 | 
             
            				right_window_s1, right_window_s2 = get_right_windows
         | 
| 45 | 
            -
            				break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) >  | 
| 47 | 
            +
            				break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
         | 
| 46 48 |  | 
| 47 49 | 
             
            				search_position = @beg_s2 + 1
         | 
| 48 50 | 
             
            			end
         | 
| @@ -57,9 +59,10 @@ class TextAlignment::AnchorFinder | |
| 57 59 | 
             
            		# extend the block
         | 
| 58 60 | 
             
            		b1 = @beg_s1
         | 
| 59 61 | 
             
            		b2 = @beg_s2
         | 
| 60 | 
            -
            		while b1 >= @end_s1_prev && b2  | 
| 62 | 
            +
            		while b1 >= @end_s1_prev && b2 >= @end_s2_prev && @s1[b1] == @s2[b2]
         | 
| 61 63 | 
             
            			b1 -= 1; b2 -= 1
         | 
| 62 64 | 
             
            		end
         | 
| 65 | 
            +
             | 
| 63 66 | 
             
            		b1 += 1; b2 += 1
         | 
| 64 67 |  | 
| 65 68 | 
             
            		e1 = @beg_s1 + @size_ngram
         | 
| @@ -82,7 +85,8 @@ class TextAlignment::AnchorFinder | |
| 82 85 | 
             
            	private
         | 
| 83 86 |  | 
| 84 87 | 
             
            	def get_left_windows
         | 
| 85 | 
            -
            		 | 
| 88 | 
            +
            		# commented out below, on the assumption that the beginning of a document gives significant locational information
         | 
| 89 | 
            +
            		# return if @beg_s1 < @size_window || @beg_s2 < @size_window
         | 
| 86 90 |  | 
| 87 91 | 
             
            		window_s1 = ''
         | 
| 88 92 | 
             
            		loc = @beg_s1 - 1
         | 
| @@ -110,7 +114,8 @@ class TextAlignment::AnchorFinder | |
| 110 114 | 
             
            	end
         | 
| 111 115 |  | 
| 112 116 | 
             
            	def get_right_windows
         | 
| 113 | 
            -
            		 | 
| 117 | 
            +
            		# commented out below, on the assumption that the end of a document gives significant locational information
         | 
| 118 | 
            +
            		# return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
         | 
| 114 119 |  | 
| 115 120 | 
             
            		window_s1 = ''
         | 
| 116 121 | 
             
            		loc = @beg_s1 + @size_ngram
         | 
| @@ -1,13 +1,11 @@ | |
| 1 1 | 
             
            #!/usr/bin/env ruby
         | 
| 2 | 
            +
            require 'text_alignment/constants'
         | 
| 2 3 | 
             
            require 'string-similarity'
         | 
| 3 4 |  | 
| 4 5 | 
             
            module TextAlignment; end unless defined? TextAlignment
         | 
| 5 6 |  | 
| 6 7 | 
             
            # approximate the location of str1 in str2
         | 
| 7 | 
            -
            TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
         | 
| 8 8 | 
             
            TextAlignment::MIN_LENGTH_FOR_APPROXIMATION = 50 unless defined? TextAlignment::MIN_LENGTH_FOR_APPROXIMATION
         | 
| 9 | 
            -
            TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
         | 
| 10 | 
            -
            TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.7 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
         | 
| 11 9 |  | 
| 12 10 | 
             
            class << TextAlignment
         | 
| 13 11 |  | 
| @@ -16,8 +14,8 @@ class << TextAlignment | |
| 16 14 | 
             
            		raise ArgumentError, 'nil string' if str1.nil? || str2.nil?
         | 
| 17 15 | 
             
            		return 0, str2.length if str2.length < TextAlignment::MIN_LENGTH_FOR_APPROXIMATION
         | 
| 18 16 |  | 
| 19 | 
            -
            		ngram1 = (0 .. str1.length - TextAlignment:: | 
| 20 | 
            -
            		ngram2 = (0 .. str2.length - TextAlignment:: | 
| 17 | 
            +
            		ngram1 = (0 .. str1.length - TextAlignment::SIZE_NGRAM).collect{|i| str1[i, TextAlignment::SIZE_NGRAM]}
         | 
| 18 | 
            +
            		ngram2 = (0 .. str2.length - TextAlignment::SIZE_NGRAM).collect{|i| str2[i, TextAlignment::SIZE_NGRAM]}
         | 
| 21 19 | 
             
            		ngram_shared = ngram1 & ngram2
         | 
| 22 20 |  | 
| 23 21 | 
             
            		# If there is no shared n-gram found, it may mean there is no serious overlap between the two strings
         | 
| @@ -45,7 +43,7 @@ class << TextAlignment | |
| 45 43 | 
             
            			text_similarity = text_similarity(str1, str2[fit_begin ... fit_end])
         | 
| 46 44 | 
             
            			cache["#{fit_begin}-#{fit_end}"] = text_similarity
         | 
| 47 45 |  | 
| 48 | 
            -
            			break if text_similarity > TextAlignment:: | 
| 46 | 
            +
            			break if text_similarity > TextAlignment::TEXT_SIMILARITY_THRESHOLD
         | 
| 49 47 | 
             
            			fit_begin, fit_end = nil, nil
         | 
| 50 48 | 
             
            		end
         | 
| 51 49 | 
             
            		return fit_begin, fit_end if fit_begin && fit_end && fit_begin < fit_end
         | 
| @@ -0,0 +1,7 @@ | |
| 1 | 
            +
            module TextAlignment; end unless defined? TextAlignment
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
         | 
| 4 | 
            +
            TextAlignment::SIZE_WINDOW = 60 unless defined? TextAlignment::SIZE_WINDOW
         | 
| 5 | 
            +
            TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
         | 
| 6 | 
            +
            TextAlignment::BUFFER_MIN = 20 unless defined? TextAlignment::BUFFER_MIN
         | 
| 7 | 
            +
            TextAlignment::TEXT_SIMILARITY_THRESHOLD = 0.87 unless defined? TextAlignment::TEXT_SIMILARITY_THRESHOLD
         | 
| @@ -9,8 +9,6 @@ require 'text_alignment/mappings' | |
| 9 9 |  | 
| 10 10 | 
             
            module TextAlignment; end unless defined? TextAlignment
         | 
| 11 11 |  | 
| 12 | 
            -
            TextAlignment::SIGNATURE_NGRAM = 5 unless defined? TextAlignment::SIGNATURE_NGRAM
         | 
| 13 | 
            -
             | 
| 14 12 | 
             
            class TextAlignment::GLCSTextAlignment
         | 
| 15 13 | 
             
            	attr_reader :position_map_begin, :position_map_end
         | 
| 16 14 | 
             
            	attr_reader :common_elements, :mapped_elements
         | 
| @@ -1,22 +1,22 @@ | |
| 1 1 | 
             
            #!/usr/bin/env ruby
         | 
| 2 | 
            +
            require 'text_alignment/constants'
         | 
| 2 3 | 
             
            require 'text_alignment/anchor_finder'
         | 
| 3 4 | 
             
            require 'text_alignment/mixed_alignment'
         | 
| 4 5 |  | 
| 5 6 | 
             
            module TextAlignment; end unless defined? TextAlignment
         | 
| 6 7 |  | 
| 7 | 
            -
            TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
         | 
| 8 | 
            -
            TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
         | 
| 9 | 
            -
            TextAlignment::BUFFER_MIN = 10 unless defined? TextAlignment::BUFFER_MIN
         | 
| 10 | 
            -
             | 
| 11 | 
            -
             | 
| 12 8 | 
             
            class TextAlignment::TextAlignment
         | 
| 13 9 | 
             
            	attr_reader :block_alignments
         | 
| 14 10 | 
             
            	attr_reader :similarity
         | 
| 15 11 | 
             
            	attr_reader :lost_annotations
         | 
| 16 12 |  | 
| 17 | 
            -
            	def initialize(str1, str2,  | 
| 13 | 
            +
            	def initialize(str1, str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
         | 
| 18 14 | 
             
            		raise ArgumentError, "nil string" if str1.nil? || str2.nil?
         | 
| 19 15 |  | 
| 16 | 
            +
            		size_ngram  = _size_ngram  || TextAlignment::SIZE_NGRAM
         | 
| 17 | 
            +
            		size_window = _size_window || TextAlignment::SIZE_WINDOW
         | 
| 18 | 
            +
            		sim_threshold = _text_similiarity_threshold || TextAlignment::TEXT_SIMILARITY_THRESHOLD
         | 
| 19 | 
            +
             | 
| 20 20 | 
             
            		mappings ||= TextAlignment::MAPPINGS
         | 
| 21 21 |  | 
| 22 22 | 
             
            		# try exact match
         | 
| @@ -26,7 +26,7 @@ class TextAlignment::TextAlignment | |
| 26 26 | 
             
            			return @block_alignments
         | 
| 27 27 | 
             
            		end
         | 
| 28 28 |  | 
| 29 | 
            -
            		anchor_finder = TextAlignment::AnchorFinder.new(str1, str2)
         | 
| 29 | 
            +
            		anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, size_ngram, size_window, sim_threshold)
         | 
| 30 30 |  | 
| 31 31 | 
             
            		# To collect matched blocks
         | 
| 32 32 | 
             
            		mblocks = []
         | 
    
        metadata
    CHANGED
    
    | @@ -1,14 +1,14 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: text_alignment
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 0. | 
| 4 | 
            +
              version: 0.4.2
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors:
         | 
| 7 7 | 
             
            - Jin-Dong Kim
         | 
| 8 8 | 
             
            autorequire: 
         | 
| 9 9 | 
             
            bindir: bin
         | 
| 10 10 | 
             
            cert_chain: []
         | 
| 11 | 
            -
            date: 2020- | 
| 11 | 
            +
            date: 2020-10-02 00:00:00.000000000 Z
         | 
| 12 12 | 
             
            dependencies:
         | 
| 13 13 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 14 14 | 
             
              name: ruby-dictionary
         | 
| @@ -77,6 +77,7 @@ files: | |
| 77 77 | 
             
            - lib/text_alignment.rb
         | 
| 78 78 | 
             
            - lib/text_alignment/anchor_finder.rb
         | 
| 79 79 | 
             
            - lib/text_alignment/approximate_fit.rb
         | 
| 80 | 
            +
            - lib/text_alignment/constants.rb
         | 
| 80 81 | 
             
            - lib/text_alignment/find_divisions.rb
         | 
| 81 82 | 
             
            - lib/text_alignment/glcs_alignment.rb
         | 
| 82 83 | 
             
            - lib/text_alignment/glcs_alignment_fast.rb
         |