text_alignment 0.3.11 → 0.3.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/align_annotations +3 -1
- data/lib/text_alignment/anchor_finder.rb +10 -7
- data/lib/text_alignment/version.rb +1 -1
- metadata +2 -2
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 2 | 
             
            SHA256:
         | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 3 | 
            +
              metadata.gz: 19a2dfcf8dfffa752dfc0c3363d2d3e1cb3ef7498f79023cdd16e38aa8c46afd
         | 
| 4 | 
            +
              data.tar.gz: 94d925dfc71d24b05fd6861a4f7f7344428b68785db84eeae8f430563b4e3318
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: 72e61cf30c98df2c3d5ac19717c813c936b55daad22cb8c6e8b44bdb45321dab98c69d5f90820e9993d86263b04bdad2e96e8010afc8a57eee916126b673c8cc
         | 
| 7 | 
            +
              data.tar.gz: d92c04294d58845f4a88cb8d9e3db42e9a18e0dd02d0398e3a95bf94662f64a33752754dd637163ef9bc4af77dc602c12fe683d49e9ac0ebb61a2469e5e08216
         | 
    
        data/bin/align_annotations
    CHANGED
    
    | @@ -103,6 +103,8 @@ target_annotations = if source_annotations.class == Array | |
| 103 103 | 
             
            else
         | 
| 104 104 | 
             
            	alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text)
         | 
| 105 105 |  | 
| 106 | 
            +
            	pp alignment
         | 
| 107 | 
            +
             | 
| 106 108 | 
             
            	# alignment.block_alignments.each do |a|
         | 
| 107 109 | 
             
            	# 	if a[:alignment].nil? || a[:alignment] == :empty
         | 
| 108 110 | 
             
            	# 		# p [a[:source], a[:target]]
         | 
| @@ -159,7 +161,7 @@ if lost_annotations | |
| 159 161 | 
             
            	warn "#{lost_annotations.length}"
         | 
| 160 162 | 
             
            end
         | 
| 161 163 |  | 
| 162 | 
            -
            puts target_annotations.to_json
         | 
| 164 | 
            +
            #puts target_annotations.to_json
         | 
| 163 165 |  | 
| 164 166 | 
             
            # denotations = anns1[:denotations]
         | 
| 165 167 |  | 
    
        data/lib/text_alignment/anchor_finder.rb
    CHANGED
    
    | @@ -3,9 +3,9 @@ require 'string-similarity' | |
| 3 3 |  | 
| 4 4 | 
             
            module TextAlignment; end unless defined? TextAlignment
         | 
| 5 5 |  | 
| 6 | 
            -
            TextAlignment::SIZE_NGRAM =  | 
| 7 | 
            -
            TextAlignment::SIZE_WINDOW =  | 
| 8 | 
            -
            TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0. | 
| 6 | 
            +
            TextAlignment::SIZE_NGRAM = 10 unless defined? TextAlignment::SIZE_NGRAM
         | 
| 7 | 
            +
            TextAlignment::SIZE_WINDOW = 20 unless defined? TextAlignment::SIZE_WINDOW
         | 
| 8 | 
            +
            TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.8 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
         | 
| 9 9 |  | 
| 10 10 | 
             
            class TextAlignment::AnchorFinder
         | 
| 11 11 |  | 
| @@ -23,6 +23,7 @@ class TextAlignment::AnchorFinder | |
| 23 23 |  | 
| 24 24 | 
             
            		# current position in s1
         | 
| 25 25 | 
             
            		@beg_s1 = 0
         | 
| 26 | 
            +
            		@end_s2_prev = 0
         | 
| 26 27 | 
             
            	end
         | 
| 27 28 |  | 
| 28 29 | 
             
            	def get_next_anchor
         | 
| @@ -31,15 +32,16 @@ class TextAlignment::AnchorFinder | |
| 31 32 | 
             
            			anchor = @s1[@beg_s1, @size_ngram]
         | 
| 32 33 |  | 
| 33 34 | 
             
            			search_position = 0
         | 
| 35 | 
            +
            			# search_position = @end_s2_prev
         | 
| 34 36 | 
             
            			while @beg_s2 = @s2.index(anchor, search_position)
         | 
| 35 37 | 
             
            				# if both the begining points are sufficiantly close to the end points of the last match
         | 
| 36 | 
            -
            				break if @end_s1_prev && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 - @end_s2_prev < 5)
         | 
| 38 | 
            +
            				break if @end_s1_prev && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
         | 
| 37 39 |  | 
| 38 40 | 
             
            				left_window_s1, left_window_s2 = get_left_windows
         | 
| 39 | 
            -
            				break if left_window_s1 | 
| 41 | 
            +
            				break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
         | 
| 40 42 |  | 
| 41 43 | 
             
            				right_window_s1, right_window_s2 = get_right_windows
         | 
| 42 | 
            -
            				break if right_window_s2 && text_similarity(right_window_s1,  | 
| 44 | 
            +
            				break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
         | 
| 43 45 |  | 
| 44 46 | 
             
            				search_position = @beg_s2 + 1
         | 
| 45 47 | 
             
            			end
         | 
| @@ -107,7 +109,7 @@ class TextAlignment::AnchorFinder | |
| 107 109 | 
             
            	end
         | 
| 108 110 |  | 
| 109 111 | 
             
            	def get_right_windows
         | 
| 110 | 
            -
            		return if (@beg_s1 + @size_ngram  | 
| 112 | 
            +
            		return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
         | 
| 111 113 |  | 
| 112 114 | 
             
            		window_s1 = ''
         | 
| 113 115 | 
             
            		loc = @beg_s1 + @size_ngram
         | 
| @@ -137,6 +139,7 @@ class TextAlignment::AnchorFinder | |
| 137 139 | 
             
            	end
         | 
| 138 140 |  | 
| 139 141 | 
             
            	def text_similarity(str1, str2, ngram_order = 2)
         | 
| 142 | 
            +
            		return 0 if str1.nil? || str2.nil?
         | 
| 140 143 | 
             
            		String::Similarity.cosine(str1, str2, ngram:ngram_order)
         | 
| 141 144 | 
             
            	end
         | 
| 142 145 |  | 
    
        metadata
    CHANGED
    
    | @@ -1,14 +1,14 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: text_alignment
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 0.3. | 
| 4 | 
            +
              version: 0.3.13
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors:
         | 
| 7 7 | 
             
            - Jin-Dong Kim
         | 
| 8 8 | 
             
            autorequire: 
         | 
| 9 9 | 
             
            bindir: bin
         | 
| 10 10 | 
             
            cert_chain: []
         | 
| 11 | 
            -
            date: 2020-09- | 
| 11 | 
            +
            date: 2020-09-12 00:00:00.000000000 Z
         | 
| 12 12 | 
             
            dependencies:
         | 
| 13 13 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 14 14 | 
             
              name: ruby-dictionary
         |