text_alignment 0.3.19 → 0.3.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
 - data/bin/align_annotations +2 -2
 - data/lib/text_alignment/anchor_finder.rb +11 -4
 - data/lib/text_alignment/text_alignment.rb +8 -20
 - data/lib/text_alignment/version.rb +1 -1
 - metadata +2 -2
 
    
        checksums.yaml
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            ---
         
     | 
| 
       2 
2 
     | 
    
         
             
            SHA256:
         
     | 
| 
       3 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       4 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 3 
     | 
    
         
            +
              metadata.gz: fd59fb1ad977d3286f358d8c08315824e86d99cc6f9d72814adb760f2e680107
         
     | 
| 
      
 4 
     | 
    
         
            +
              data.tar.gz: baafd1b76f6c6447a5763ff731b77ad07e449fc87ef94abed74a978376f6334e
         
     | 
| 
       5 
5 
     | 
    
         
             
            SHA512:
         
     | 
| 
       6 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       7 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 6 
     | 
    
         
            +
              metadata.gz: d417878396803e1169a24fae67a9a4b0d4e84948d0c6bc678626641d6f1b6ac1fe16c94e9f07ee747b98f83e4305fa553220a607475363378f32fac4d43a65c7
         
     | 
| 
      
 7 
     | 
    
         
            +
              data.tar.gz: efaf3640a67be46dddcf7dda9d819cbcb47828a3966d305887b6024eee6ba517a597e9303790f584264b8995f69671d90bdc0c7883fc9244f3b1746695960bf8
         
     | 
    
        data/bin/align_annotations
    CHANGED
    
    | 
         @@ -108,7 +108,7 @@ else 
     | 
|
| 
       108 
108 
     | 
    
         
             
            	# verification
         
     | 
| 
       109 
109 
     | 
    
         
             
            	source_text = source_annotations[:text]
         
     | 
| 
       110 
110 
     | 
    
         
             
            	puts "=====BEGIN"
         
     | 
| 
       111 
     | 
    
         
            -
            	(0 ... source_text.length).each do |p|
         
     | 
| 
      
 111 
     | 
    
         
            +
            	(0 ... source_text.rstrip.length).each do |p|
         
     | 
| 
       112 
112 
     | 
    
         
             
            		t = alignment.transform_begin_position(p)
         
     | 
| 
       113 
113 
     | 
    
         
             
            		if t.nil?
         
     | 
| 
       114 
114 
     | 
    
         
             
            			print source_text[p]
         
     | 
| 
         @@ -120,7 +120,7 @@ else 
     | 
|
| 
       120 
120 
     | 
    
         
             
            	puts "=====END"
         
     | 
| 
       121 
121 
     | 
    
         | 
| 
       122 
122 
     | 
    
         
             
            	puts "=====BEGIN"
         
     | 
| 
       123 
     | 
    
         
            -
            	(0 .. source_text.length).each do |p|
         
     | 
| 
      
 123 
     | 
    
         
            +
            	(0 .. source_text.rstrip.length).each do |p|
         
     | 
| 
       124 
124 
     | 
    
         
             
            		t = alignment.transform_end_position(p)
         
     | 
| 
       125 
125 
     | 
    
         
             
            		if t.nil?
         
     | 
| 
       126 
126 
     | 
    
         
             
            			print source_text[p]
         
     | 
| 
         @@ -30,13 +30,17 @@ class TextAlignment::AnchorFinder 
     | 
|
| 
       30 
30 
     | 
    
         
             
            	def get_next_anchor
         
     | 
| 
       31 
31 
     | 
    
         
             
            		# find the position of an anchor ngram in s1 and s2
         
     | 
| 
       32 
32 
     | 
    
         
             
            		while @beg_s1 < (@s1.length - @size_ngram)
         
     | 
| 
      
 33 
     | 
    
         
            +
            			if [' ', "\n", "\t"].include? @s1[@beg_s1]
         
     | 
| 
      
 34 
     | 
    
         
            +
            				@beg_s1 += 1
         
     | 
| 
      
 35 
     | 
    
         
            +
            				next
         
     | 
| 
      
 36 
     | 
    
         
            +
            			end
         
     | 
| 
       33 
37 
     | 
    
         
             
            			anchor = @s1[@beg_s1, @size_ngram]
         
     | 
| 
       34 
38 
     | 
    
         | 
| 
       35 
39 
     | 
    
         
             
            			# search_position = 0
         
     | 
| 
       36 
40 
     | 
    
         
             
            			search_position = @end_s2_prev
         
     | 
| 
       37 
41 
     | 
    
         
             
            			while @beg_s2 = @s2.index(anchor, search_position)
         
     | 
| 
       38 
42 
     | 
    
         
             
            				# if both the begining points are sufficiantly close to the end points of the last match
         
     | 
| 
       39 
     | 
    
         
            -
            				break if @ 
     | 
| 
      
 43 
     | 
    
         
            +
            				break if @beg_s1 > 0 && @beg_s2 > 0 && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
         
     | 
| 
       40 
44 
     | 
    
         | 
| 
       41 
45 
     | 
    
         
             
            				left_window_s1, left_window_s2 = get_left_windows
         
     | 
| 
       42 
46 
     | 
    
         
             
            				break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
         
     | 
| 
         @@ -57,9 +61,10 @@ class TextAlignment::AnchorFinder 
     | 
|
| 
       57 
61 
     | 
    
         
             
            		# extend the block
         
     | 
| 
       58 
62 
     | 
    
         
             
            		b1 = @beg_s1
         
     | 
| 
       59 
63 
     | 
    
         
             
            		b2 = @beg_s2
         
     | 
| 
       60 
     | 
    
         
            -
            		while b1 >= @end_s1_prev && b2  
     | 
| 
      
 64 
     | 
    
         
            +
            		while b1 >= @end_s1_prev && b2 >= @end_s2_prev && @s1[b1] == @s2[b2]
         
     | 
| 
       61 
65 
     | 
    
         
             
            			b1 -= 1; b2 -= 1
         
     | 
| 
       62 
66 
     | 
    
         
             
            		end
         
     | 
| 
      
 67 
     | 
    
         
            +
             
     | 
| 
       63 
68 
     | 
    
         
             
            		b1 += 1; b2 += 1
         
     | 
| 
       64 
69 
     | 
    
         | 
| 
       65 
70 
     | 
    
         
             
            		e1 = @beg_s1 + @size_ngram
         
     | 
| 
         @@ -82,7 +87,8 @@ class TextAlignment::AnchorFinder 
     | 
|
| 
       82 
87 
     | 
    
         
             
            	private
         
     | 
| 
       83 
88 
     | 
    
         | 
| 
       84 
89 
     | 
    
         
             
            	def get_left_windows
         
     | 
| 
       85 
     | 
    
         
            -
            		 
     | 
| 
      
 90 
     | 
    
         
            +
            		# commend below with the assumption that the beginning of a document gives a significant locational information
         
     | 
| 
      
 91 
     | 
    
         
            +
            		# return if @beg_s1 < @size_window || @beg_s2 < @size_window
         
     | 
| 
       86 
92 
     | 
    
         | 
| 
       87 
93 
     | 
    
         
             
            		window_s1 = ''
         
     | 
| 
       88 
94 
     | 
    
         
             
            		loc = @beg_s1 - 1
         
     | 
| 
         @@ -110,7 +116,8 @@ class TextAlignment::AnchorFinder 
     | 
|
| 
       110 
116 
     | 
    
         
             
            	end
         
     | 
| 
       111 
117 
     | 
    
         | 
| 
       112 
118 
     | 
    
         
             
            	def get_right_windows
         
     | 
| 
       113 
     | 
    
         
            -
            		 
     | 
| 
      
 119 
     | 
    
         
            +
            		# commend below with the assumption that the end of a document gives a significant locational
         
     | 
| 
      
 120 
     | 
    
         
            +
            		# return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
         
     | 
| 
       114 
121 
     | 
    
         | 
| 
       115 
122 
     | 
    
         
             
            		window_s1 = ''
         
     | 
| 
       116 
123 
     | 
    
         
             
            		loc = @beg_s1 + @size_ngram
         
     | 
| 
         @@ -6,7 +6,7 @@ module TextAlignment; end unless defined? TextAlignment 
     | 
|
| 
       6 
6 
     | 
    
         | 
| 
       7 
7 
     | 
    
         
             
            TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
         
     | 
| 
       8 
8 
     | 
    
         
             
            TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
         
     | 
| 
       9 
     | 
    
         
            -
            TextAlignment::BUFFER_MIN =  
     | 
| 
      
 9 
     | 
    
         
            +
            TextAlignment::BUFFER_MIN = 20 unless defined? TextAlignment::BUFFER_MIN
         
     | 
| 
       10 
10 
     | 
    
         | 
| 
       11 
11 
     | 
    
         | 
| 
       12 
12 
     | 
    
         
             
            class TextAlignment::TextAlignment
         
     | 
| 
         @@ -40,10 +40,9 @@ class TextAlignment::TextAlignment 
     | 
|
| 
       40 
40 
     | 
    
         
             
            			end
         
     | 
| 
       41 
41 
     | 
    
         
             
            		end
         
     | 
| 
       42 
42 
     | 
    
         | 
| 
       43 
     | 
    
         
            -
            		pp mblocks
         
     | 
| 
       44 
     | 
    
         
            -
            		puts "-----"
         
     | 
| 
       45 
     | 
    
         
            -
            		puts
         
     | 
| 
       46 
     | 
    
         
            -
             
     | 
| 
      
 43 
     | 
    
         
            +
            		# pp mblocks
         
     | 
| 
      
 44 
     | 
    
         
            +
            		# puts "-----"
         
     | 
| 
      
 45 
     | 
    
         
            +
            		# puts
         
     | 
| 
       47 
46 
     | 
    
         
             
            		# mblocks.each do |b|
         
     | 
| 
       48 
47 
     | 
    
         
             
            		# 	p [b[:source], b[:target]]
         
     | 
| 
       49 
48 
     | 
    
         
             
            		# 	puts "---"
         
     | 
| 
         @@ -82,6 +81,8 @@ class TextAlignment::TextAlignment 
     | 
|
| 
       82 
81 
     | 
    
         | 
| 
       83 
82 
     | 
    
         
             
            						@block_alignments << {source:{begin:0, end:b1}, target:{begin:0, end:b2}, alignment: :empty} if b1 > 0
         
     | 
| 
       84 
83 
     | 
    
         | 
| 
      
 84 
     | 
    
         
            +
            						_str1 = str1[b1 ... e1]
         
     | 
| 
      
 85 
     | 
    
         
            +
            						_str2 = str2[b2 ... e2]
         
     | 
| 
       85 
86 
     | 
    
         
             
            						alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
         
     | 
| 
       86 
87 
     | 
    
         
             
            						if alignment.similarity < 0.6
         
     | 
| 
       87 
88 
     | 
    
         
             
            							@block_alignments << {source:{begin:b1, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
         
     | 
| 
         @@ -117,24 +118,11 @@ class TextAlignment::TextAlignment 
     | 
|
| 
       117 
118 
     | 
    
         
             
            		end
         
     | 
| 
       118 
119 
     | 
    
         | 
| 
       119 
120 
     | 
    
         
             
            		# Final step
         
     | 
| 
       120 
     | 
    
         
            -
            		if mblocks[-1][:source][:end] < str1.length
         
     | 
| 
       121 
     | 
    
         
            -
            			b1 = mblocks[-1][:source][:end]
         
     | 
| 
       122 
     | 
    
         
            -
            			b2 = mblocks[-1][:target][:end]
         
     | 
| 
       123 
     | 
    
         
            -
             
     | 
| 
       124 
     | 
    
         
            -
            			if mblocks[-1][:target][:end] < str2.length
         
     | 
| 
       125 
     | 
    
         
            -
             
     | 
| 
       126 
     | 
    
         
            -
            			else
         
     | 
| 
       127 
     | 
    
         
            -
            				e1 = str1.length
         
     | 
| 
       128 
     | 
    
         
            -
            				e2 = str2.length
         
     | 
| 
       129 
     | 
    
         
            -
            				@block_alignments << {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}
         
     | 
| 
       130 
     | 
    
         
            -
            			end
         
     | 
| 
       131 
     | 
    
         
            -
            		end
         
     | 
| 
       132 
     | 
    
         
            -
             
     | 
| 
       133 
121 
     | 
    
         
             
            		if  mblocks[-1][:source][:end] < str1.length && mblocks[-1][:target][:end] < str2.length
         
     | 
| 
       134 
122 
     | 
    
         
             
            			b1 = mblocks[-1][:source][:end]
         
     | 
| 
       135 
123 
     | 
    
         
             
            			b2 = mblocks[-1][:target][:end]
         
     | 
| 
       136 
     | 
    
         
            -
            			_str1 = str1[b1 ...  
     | 
| 
       137 
     | 
    
         
            -
            			_str2 = str2[b2 ...  
     | 
| 
      
 124 
     | 
    
         
            +
            			_str1 = str1[b1 ... str1.length]
         
     | 
| 
      
 125 
     | 
    
         
            +
            			_str2 = str2[b2 ... str2.length]
         
     | 
| 
       138 
126 
     | 
    
         | 
| 
       139 
127 
     | 
    
         
             
            			unless _str1.strip.empty?
         
     | 
| 
       140 
128 
     | 
    
         
             
            				if _str2.strip.empty?
         
     | 
    
        metadata
    CHANGED
    
    | 
         @@ -1,14 +1,14 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            --- !ruby/object:Gem::Specification
         
     | 
| 
       2 
2 
     | 
    
         
             
            name: text_alignment
         
     | 
| 
       3 
3 
     | 
    
         
             
            version: !ruby/object:Gem::Version
         
     | 
| 
       4 
     | 
    
         
            -
              version: 0.3. 
     | 
| 
      
 4 
     | 
    
         
            +
              version: 0.3.24
         
     | 
| 
       5 
5 
     | 
    
         
             
            platform: ruby
         
     | 
| 
       6 
6 
     | 
    
         
             
            authors:
         
     | 
| 
       7 
7 
     | 
    
         
             
            - Jin-Dong Kim
         
     | 
| 
       8 
8 
     | 
    
         
             
            autorequire: 
         
     | 
| 
       9 
9 
     | 
    
         
             
            bindir: bin
         
     | 
| 
       10 
10 
     | 
    
         
             
            cert_chain: []
         
     | 
| 
       11 
     | 
    
         
            -
            date: 2020-09- 
     | 
| 
      
 11 
     | 
    
         
            +
            date: 2020-09-29 00:00:00.000000000 Z
         
     | 
| 
       12 
12 
     | 
    
         
             
            dependencies:
         
     | 
| 
       13 
13 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       14 
14 
     | 
    
         
             
              name: ruby-dictionary
         
     |