text_alignment 0.3.13 → 0.3.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/align_annotations +41 -0
- data/lib/text_alignment/anchor_finder.rb +5 -4
- data/lib/text_alignment/text_alignment.rb +15 -7
- data/lib/text_alignment/version.rb +1 -1
- metadata +2 -2
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 2 | 
             
            SHA256:
         | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 3 | 
            +
              metadata.gz: 5b6821076dd780721d97403f3ca49568a66100f34f7fffe32acbf28572c36429
         | 
| 4 | 
            +
              data.tar.gz: 0d29d5b64897b4030de7479eaa8fc3990e13f2135298315a18b42b06ee3aa300
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: efb442cee5dfa76e7428d516b5e8ba768cbf80229580f396d02fcd5aa99f5573f88b2af662d82159669fc2c58f84ad77f6bde72696162f3c0a7fb74a75d4f7e0
         | 
| 7 | 
            +
              data.tar.gz: d6fe000e32862fea2c511fb7937c85ca65e75dd53fe33214f0a637cd7b081a947f268919dd050e425d4fa2fca4268125dcffcb1a54fb933c6eb0bd3c4971b058
         | 
    
        data/bin/align_annotations
    CHANGED
    
    | @@ -105,6 +105,32 @@ else | |
| 105 105 |  | 
| 106 106 | 
             
            	pp alignment
         | 
| 107 107 |  | 
| 108 | 
            +
            	# verification
         | 
| 109 | 
            +
            	source_text = source_annotations[:text]
         | 
| 110 | 
            +
            	puts "=====BEGIN"
         | 
| 111 | 
            +
            	(0 ... source_text.length).each do |p|
         | 
| 112 | 
            +
            		t = alignment.transform_begin_position(p)
         | 
| 113 | 
            +
            		if t.nil?
         | 
| 114 | 
            +
            			print source_text[p]
         | 
| 115 | 
            +
            		else
         | 
| 116 | 
            +
            			print '.'
         | 
| 117 | 
            +
            		end
         | 
| 118 | 
            +
            	end
         | 
| 119 | 
            +
            	puts
         | 
| 120 | 
            +
            	puts "=====END"
         | 
| 121 | 
            +
             | 
| 122 | 
            +
            	puts "=====BEGIN"
         | 
| 123 | 
            +
            	(0 .. source_text.length).each do |p|
         | 
| 124 | 
            +
            		t = alignment.transform_end_position(p)
         | 
| 125 | 
            +
            		if t.nil?
         | 
| 126 | 
            +
            			print source_text[p]
         | 
| 127 | 
            +
            		else
         | 
| 128 | 
            +
            			print '.'
         | 
| 129 | 
            +
            		end
         | 
| 130 | 
            +
            	end
         | 
| 131 | 
            +
            	puts
         | 
| 132 | 
            +
            	puts "=====END"
         | 
| 133 | 
            +
             | 
| 108 134 | 
             
            	# alignment.block_alignments.each do |a|
         | 
| 109 135 | 
             
            	# 	if a[:alignment].nil? || a[:alignment] == :empty
         | 
| 110 136 | 
             
            	# 		# p [a[:source], a[:target]]
         | 
| @@ -121,7 +147,22 @@ else | |
| 121 147 | 
             
            	# end
         | 
| 122 148 | 
             
            	# exit
         | 
| 123 149 |  | 
| 150 | 
            +
            	# verification of source denotations
         | 
| 151 | 
            +
            	puts "[Invalid source denotations]"
         | 
| 152 | 
            +
            	source_annotations[:denotations] do |d|
         | 
| 153 | 
            +
            		p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < source_text.length
         | 
| 154 | 
            +
            	end
         | 
| 155 | 
            +
            	puts "====="
         | 
| 156 | 
            +
            	puts
         | 
| 157 | 
            +
             | 
| 124 158 | 
             
            	denotations = alignment.transform_hdenotations(source_annotations[:denotations])
         | 
| 159 | 
            +
            	puts "[Invalid transformation]"
         | 
| 160 | 
            +
            	denotations.each do |d|
         | 
| 161 | 
            +
            		p d unless d[:span][:begin] && d[:span][:end] && d[:span][:begin] < d[:span][:end] && d[:span][:begin] >= 0 && d[:span][:end] < target_text.length
         | 
| 162 | 
            +
            	end
         | 
| 163 | 
            +
            	puts "====="
         | 
| 164 | 
            +
            	puts
         | 
| 165 | 
            +
             | 
| 125 166 | 
             
            	lost_annotations += alignment.lost_annotations if alignment.lost_annotations
         | 
| 126 167 |  | 
| 127 168 | 
             
            	source_annotations.merge({text:target_text, denotations:denotations})
         | 
| @@ -4,7 +4,7 @@ require 'string-similarity' | |
| 4 4 | 
             
            module TextAlignment; end unless defined? TextAlignment
         | 
| 5 5 |  | 
| 6 6 | 
             
            TextAlignment::SIZE_NGRAM = 10 unless defined? TextAlignment::SIZE_NGRAM
         | 
| 7 | 
            -
            TextAlignment::SIZE_WINDOW =  | 
| 7 | 
            +
            TextAlignment::SIZE_WINDOW = 40 unless defined? TextAlignment::SIZE_WINDOW
         | 
| 8 8 | 
             
            TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.8 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
         | 
| 9 9 |  | 
| 10 10 | 
             
            class TextAlignment::AnchorFinder
         | 
| @@ -23,6 +23,7 @@ class TextAlignment::AnchorFinder | |
| 23 23 |  | 
| 24 24 | 
             
            		# current position in s1
         | 
| 25 25 | 
             
            		@beg_s1 = 0
         | 
| 26 | 
            +
            		@end_s1_prev = 0
         | 
| 26 27 | 
             
            		@end_s2_prev = 0
         | 
| 27 28 | 
             
            	end
         | 
| 28 29 |  | 
| @@ -31,8 +32,8 @@ class TextAlignment::AnchorFinder | |
| 31 32 | 
             
            		while @beg_s1 < (@s1.length - @size_ngram)
         | 
| 32 33 | 
             
            			anchor = @s1[@beg_s1, @size_ngram]
         | 
| 33 34 |  | 
| 34 | 
            -
            			search_position = 0
         | 
| 35 | 
            -
            			 | 
| 35 | 
            +
            			# search_position = 0
         | 
| 36 | 
            +
            			search_position = @end_s2_prev
         | 
| 36 37 | 
             
            			while @beg_s2 = @s2.index(anchor, search_position)
         | 
| 37 38 | 
             
            				# if both the begining points are sufficiantly close to the end points of the last match
         | 
| 38 39 | 
             
            				break if @end_s1_prev && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
         | 
| @@ -56,7 +57,7 @@ class TextAlignment::AnchorFinder | |
| 56 57 | 
             
            		# extend the block
         | 
| 57 58 | 
             
            		b1 = @beg_s1
         | 
| 58 59 | 
             
            		b2 = @beg_s2
         | 
| 59 | 
            -
            		while b1  | 
| 60 | 
            +
            		while b1 >= @end_s1_prev && b2 > -1 && @s1[b1] == @s2[b2]
         | 
| 60 61 | 
             
            			b1 -= 1; b2 -= 1
         | 
| 61 62 | 
             
            		end
         | 
| 62 63 | 
             
            		b1 += 1; b2 += 1
         | 
| @@ -6,6 +6,8 @@ module TextAlignment; end unless defined? TextAlignment | |
| 6 6 |  | 
| 7 7 | 
             
            TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
         | 
| 8 8 | 
             
            TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
         | 
| 9 | 
            +
            TextAlignment::BUFFER_MIN = 10 unless defined? TextAlignment::BUFFER_MIN
         | 
| 10 | 
            +
             | 
| 9 11 |  | 
| 10 12 | 
             
            class TextAlignment::TextAlignment
         | 
| 11 13 | 
             
            	attr_reader :block_alignments
         | 
| @@ -70,7 +72,7 @@ class TextAlignment::TextAlignment | |
| 70 72 | 
             
            						@block_alignments << {source:{begin:0, end:e1}, target:{begin:0, end:e2}, alignment: :empty}
         | 
| 71 73 | 
             
            					else
         | 
| 72 74 | 
             
            						len_min = [_str1.length, _str2.length].min
         | 
| 73 | 
            -
            						len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i
         | 
| 75 | 
            +
            						len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
         | 
| 74 76 | 
             
            						b1 = _str1.length < len_buffer ? 0 : e1 - len_buffer
         | 
| 75 77 | 
             
            						b2 = _str2.length < len_buffer ? 0 : e2 - len_buffer
         | 
| 76 78 |  | 
| @@ -135,9 +137,11 @@ class TextAlignment::TextAlignment | |
| 135 137 | 
             
            					@block_alignments << {source:{begin:b1, end:str1.length}, target:{begin:b2, end:str2.length}, alignment: :empty}
         | 
| 136 138 | 
             
            				else
         | 
| 137 139 | 
             
            					len_min = [_str1.length, _str2.length].min
         | 
| 138 | 
            -
            					len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i
         | 
| 140 | 
            +
            					len_buffer = (len_min * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
         | 
| 139 141 | 
             
            					e1 = _str1.length < len_buffer ? str1.length : b1 + len_buffer
         | 
| 140 | 
            -
            					e2 = _str2.length < len_buffer ?  | 
| 142 | 
            +
            					e2 = _str2.length < len_buffer ? str2.length : b2 + len_buffer
         | 
| 143 | 
            +
            					_str1 = str1[b1 ... e1]
         | 
| 144 | 
            +
            					_str2 = str2[b2 ... e2]
         | 
| 141 145 |  | 
| 142 146 | 
             
            					alignment = TextAlignment::MixedAlignment.new(_str1.downcase, _str2.downcase, mappings)
         | 
| 143 147 | 
             
            					if alignment.similarity < 0.6
         | 
| @@ -166,10 +170,12 @@ class TextAlignment::TextAlignment | |
| 166 170 | 
             
            			if begin_position == block_alignment[:source][:begin]
         | 
| 167 171 | 
             
            				block_alignment[:target][:begin]
         | 
| 168 172 | 
             
            			else
         | 
| 169 | 
            -
            				raise "lost annotation"
         | 
| 173 | 
            +
            				# raise "lost annotation"
         | 
| 174 | 
            +
            				nil
         | 
| 170 175 | 
             
            			end
         | 
| 171 176 | 
             
            		else
         | 
| 172 | 
            -
            			block_alignment[:alignment].transform_begin_position(begin_position - block_alignment[:source][:begin]) | 
| 177 | 
            +
            			r = block_alignment[:alignment].transform_begin_position(begin_position - block_alignment[:source][:begin])
         | 
| 178 | 
            +
            			r.nil? ? nil : r + block_alignment[:target][:begin]
         | 
| 173 179 | 
             
            		end
         | 
| 174 180 | 
             
            	end
         | 
| 175 181 |  | 
| @@ -183,10 +189,12 @@ class TextAlignment::TextAlignment | |
| 183 189 | 
             
            			if end_position == block_alignment[:source][:end]
         | 
| 184 190 | 
             
            				block_alignment[:target][:end]
         | 
| 185 191 | 
             
            			else
         | 
| 186 | 
            -
            				raise "lost annotation"
         | 
| 192 | 
            +
            				# raise "lost annotation"
         | 
| 193 | 
            +
            				nil
         | 
| 187 194 | 
             
            			end
         | 
| 188 195 | 
             
            		else
         | 
| 189 | 
            -
            			block_alignment[:alignment].transform_end_position(end_position - block_alignment[:source][:begin]) | 
| 196 | 
            +
            			r = block_alignment[:alignment].transform_end_position(end_position - block_alignment[:source][:begin])
         | 
| 197 | 
            +
            			r.nil? ? nil : r + block_alignment[:target][:begin]
         | 
| 190 198 | 
             
            		end
         | 
| 191 199 | 
             
            	end
         | 
| 192 200 |  | 
    
        metadata
    CHANGED
    
    | @@ -1,14 +1,14 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: text_alignment
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 0.3. | 
| 4 | 
            +
              version: 0.3.18
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors:
         | 
| 7 7 | 
             
            - Jin-Dong Kim
         | 
| 8 8 | 
             
            autorequire: 
         | 
| 9 9 | 
             
            bindir: bin
         | 
| 10 10 | 
             
            cert_chain: []
         | 
| 11 | 
            -
            date: 2020-09- | 
| 11 | 
            +
            date: 2020-09-18 00:00:00.000000000 Z
         | 
| 12 12 | 
             
            dependencies:
         | 
| 13 13 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 14 14 | 
             
              name: ruby-dictionary
         |