rbbt-text 1.1.9 → 1.2.0
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rbbt/ner/g_norm_plus.rb +19 -12
- data/lib/rbbt/text/corpus/document.rb +63 -41
- data/lib/rbbt/text/segment.rb +10 -2
- data/lib/rbbt/text/segment/docid.rb +44 -44
- data/lib/rbbt/text/segment/named_entity.rb +1 -0
- data/lib/rbbt/text/segment/transformed.rb +2 -2
- data/test/rbbt/ner/test_g_norm_plus.rb +0 -1
- data/test/rbbt/text/corpus/test_document.rb +39 -9
- metadata +2 -2
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 2 | 
             
            SHA256:
         | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 3 | 
            +
              metadata.gz: '009cfce2ce954c03db5c09d0bd6f5d25bf59d508776d7370bb6bd0fb3a135f36'
         | 
| 4 | 
            +
              data.tar.gz: 3d11d2a5934512958d10dbdfad5e22a9a2481b332c985ab1e2c8e92427d6f375
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: e9338d4b54d2b66efda11dee3d37366c4f4ae78bde80f0abc1016b34c928e1db857ad73f33ba1da611ad232513498430736c46134a902b3930a8f832afed3e09
         | 
| 7 | 
            +
              data.tar.gz: 0cdeeee67636d4e0b0714334b3c187cb0f5ea5c7363fe27fc84d438643a0d6f204413a4dd5d99c8c43d847539320c484fde2b5300b298cf9cc782148d98802ee
         | 
    
        data/lib/rbbt/ner/g_norm_plus.rb
    CHANGED
    
    | @@ -10,35 +10,39 @@ module GNormPlus | |
| 10 10 | 
             
              end
         | 
| 11 11 |  | 
| 12 12 | 
             
              CONFIG =<<-EOF
         | 
| 13 | 
            -
             | 
| 14 13 | 
             
            #===Annotation
         | 
| 15 14 | 
             
            #Attribution setting:
         | 
| 16 15 | 
             
            #FocusSpecies = Taxonomy ID
         | 
| 17 | 
            -
            # | 
| 18 | 
            -
            # | 
| 19 | 
            -
            # | 
| 20 | 
            -
            # | 
| 21 | 
            -
            # | 
| 22 | 
            -
            # | 
| 23 | 
            -
            # | 
| 24 | 
            -
            # | 
| 16 | 
            +
            #       All: All species
         | 
| 17 | 
            +
            #       9606: Human
         | 
| 18 | 
            +
            #       4932: yeast
         | 
| 19 | 
            +
            #       7227: Fly
         | 
| 20 | 
            +
            #       10090: Mouse
         | 
| 21 | 
            +
            #       10116: Rat
         | 
| 22 | 
            +
            #       7955: Zebrafish
         | 
| 23 | 
            +
            #       3702: Arabidopsis thaliana
         | 
| 25 24 | 
             
            #open: True
         | 
| 26 25 | 
             
            #close: False
         | 
| 27 26 |  | 
| 28 27 | 
             
            [Focus Species]
         | 
| 29 | 
            -
            	FocusSpecies =  | 
| 28 | 
            +
            	FocusSpecies = 9606
         | 
| 29 | 
            +
            	FilterAntibody = False
         | 
| 30 30 | 
             
            [Dictionary & Model]
         | 
| 31 31 | 
             
            	DictionaryFolder = ./Dictionary
         | 
| 32 32 | 
             
            	GNRModel = ./Dictionary/GNR.Model
         | 
| 33 33 | 
             
            	SCModel = ./Dictionary/SimConcept.Model
         | 
| 34 34 | 
             
            	GeneIDMatch = True
         | 
| 35 | 
            +
            	HomologeneID = False
         | 
| 35 36 | 
             
            	Normalization2Protein = False
         | 
| 37 | 
            +
            	ShowUnNormalizedMention = False
         | 
| 36 38 | 
             
            	DeleteTmp = True
         | 
| 39 | 
            +
            	IgnoreNER = True
         | 
| 37 40 | 
             
            EOF
         | 
| 38 41 |  | 
| 39 42 | 
             
              def self.process(texts)
         | 
| 40 43 | 
             
                TmpFile.with_file do |tmpdir|
         | 
| 41 44 | 
             
                  Open.mkdir tmpdir
         | 
| 45 | 
            +
             | 
| 42 46 | 
             
                  Misc.in_dir tmpdir do
         | 
| 43 47 | 
             
                    Open.ln_s Rbbt.software.opt.GNormPlus.Dictionary.find, '.'
         | 
| 44 48 | 
             
                    Open.ln_s Rbbt.software.opt.GNormPlus["BioC.dtd"].find, '.'
         | 
| @@ -50,12 +54,12 @@ EOF | |
| 50 54 |  | 
| 51 55 | 
             
                    texts.each do |name,text|
         | 
| 52 56 | 
             
                      Open.write("input/#{name}.txt") do |f|
         | 
| 53 | 
            -
                        f.puts "#{name}|a|" << text
         | 
| 57 | 
            +
                        f.puts "#{name}|a|" << text.gsub("\n\n", "\n·")
         | 
| 54 58 | 
             
                        f.puts
         | 
| 55 59 | 
             
                      end
         | 
| 56 60 | 
             
                    end
         | 
| 57 61 | 
             
                    Open.write('config', CONFIG)
         | 
| 58 | 
            -
                    CMD.cmd_log("java -Xmx20G -Xms20G  -jar '#{Rbbt.software.opt.GNormPlus.find}/GNormPlus.jar' 'input' 'output' 'config'")
         | 
| 62 | 
            +
                    CMD.cmd_log("java -Xmx20G -Xms20G  -jar '#{Rbbt.software.opt.GNormPlus.produce.find}/GNormPlus.jar' 'input' 'output' 'config'")
         | 
| 59 63 |  | 
| 60 64 | 
             
                    if texts.respond_to? :key_field
         | 
| 61 65 | 
             
                      key_field = texts.key_field
         | 
| @@ -68,6 +72,9 @@ EOF | |
| 68 72 | 
             
                      entities = Open.read(file).split("\n")[1..-1].collect{|l| l.gsub(':', '.').split("\t")[1..-1] * ":"}
         | 
| 69 73 | 
             
                      tsv[name] = entities
         | 
| 70 74 | 
             
                    end
         | 
| 75 | 
            +
             | 
| 76 | 
            +
                    raise "GNormPlus failed: no results found" if tsv.size == 0 && texts.size > 0
         | 
| 77 | 
            +
             | 
| 71 78 | 
             
                    tsv
         | 
| 72 79 | 
             
                  end
         | 
| 73 80 | 
             
                end
         | 
    
        data/lib/rbbt/text/corpus/document.rb
    CHANGED
    
    | @@ -1,5 +1,6 @@ | |
| 1 1 | 
             
            require 'rbbt/text/segment'
         | 
| 2 2 | 
             
            require 'rbbt/text/segment/segmented'
         | 
| 3 | 
            +
            require 'rbbt/text/segment/docid'
         | 
| 3 4 | 
             
            require 'rbbt/tsv'
         | 
| 4 5 | 
             
            require 'rbbt/resource/path'
         | 
| 5 6 | 
             
            require 'rbbt/persist/tsv'
         | 
| @@ -15,6 +16,7 @@ class Corpus | |
| 15 16 | 
             
                attr_accessor :text, :docid, :namespace, :id, :type, :hash, :segments, :segment_indices, :persist_dir, :global_persistence, :corpus
         | 
| 16 17 |  | 
| 17 18 | 
             
                attr_accessor :multiple_result
         | 
| 19 | 
            +
             | 
| 18 20 | 
             
                def initialize(persist_dir = nil, docid = nil, text = nil, global_persistence = nil, corpus = nil)
         | 
| 19 21 | 
             
                  @segments = {}
         | 
| 20 22 | 
             
                  @segment_indices = {}
         | 
| @@ -44,16 +46,22 @@ class Corpus | |
| 44 46 | 
             
                end
         | 
| 45 47 |  | 
| 46 48 | 
             
                def self.define(entity, &block)
         | 
| 47 | 
            -
                  send :define_method, "produce_#{entity}" | 
| 49 | 
            +
                  send :define_method, "produce_#{entity}" do 
         | 
| 50 | 
            +
                    segments = self.instance_exec &block
         | 
| 48 51 |  | 
| 49 | 
            -
             | 
| 52 | 
            +
                    segments.each{|s| s.docid = docid }
         | 
| 53 | 
            +
                  end
         | 
| 54 | 
            +
             | 
| 55 | 
            +
                  self.class_eval <<-EOC, __FILE__, __LINE__ + 1
         | 
| 50 56 | 
             
                    def load_#{entity}(raw = false)
         | 
| 51 57 | 
             
                      return if segments.include? "#{ entity }"
         | 
| 52 58 | 
             
                      if self.respond_to?("load_with_persistence_#{entity}") and not @persist_dir.nil?
         | 
| 53 | 
            -
                         | 
| 59 | 
            +
                        entities = load_with_persistence_#{entity}(raw)
         | 
| 54 60 | 
             
                      else
         | 
| 55 | 
            -
                         | 
| 61 | 
            +
                        entities = produce_#{entity}
         | 
| 56 62 | 
             
                      end
         | 
| 63 | 
            +
             | 
| 64 | 
            +
                      segments["#{ entity }"] = entities
         | 
| 57 65 | 
             
                    end
         | 
| 58 66 |  | 
| 59 67 | 
             
                    def #{entity}(raw = false)
         | 
| @@ -77,7 +85,10 @@ class Corpus | |
| 77 85 |  | 
| 78 86 | 
             
                def self.define_multiple(entity, &block)
         | 
| 79 87 | 
             
                  send :define_method, "produce_#{entity}" do
         | 
| 80 | 
            -
                     | 
| 88 | 
            +
                    if self.multiple_result && self.multiple_result[entity]
         | 
| 89 | 
            +
                      segments = self.multiple_result[entity]
         | 
| 90 | 
            +
                      return segments.each{|s| s.docid = docid }
         | 
| 91 | 
            +
                    end
         | 
| 81 92 | 
             
                    raise MultipleEntity, "Entity #{entity} runs with multiple documents, please prepare beforehand with prepare_multiple: #{self.docid}"
         | 
| 82 93 | 
             
                  end
         | 
| 83 94 |  | 
| @@ -86,14 +97,16 @@ class Corpus | |
| 86 97 | 
             
                    self
         | 
| 87 98 | 
             
                  end.send :define_method, name, &block
         | 
| 88 99 |  | 
| 89 | 
            -
                  self.class_eval <<-EOC, __FILE__, __LINE__
         | 
| 100 | 
            +
                  self.class_eval <<-EOC, __FILE__, __LINE__ + 1
         | 
| 90 101 | 
             
                    def load_#{entity}(raw = false)
         | 
| 91 102 | 
             
                      return if segments.include? "#{ entity }"
         | 
| 92 103 | 
             
                      if self.respond_to?("load_with_persistence_#{entity}") and not @persist_dir.nil?
         | 
| 93 | 
            -
                         | 
| 104 | 
            +
                        entities = load_with_persistence_#{entity}(raw)
         | 
| 94 105 | 
             
                      else
         | 
| 95 | 
            -
                         | 
| 106 | 
            +
                        entities = produce_#{entity}
         | 
| 96 107 | 
             
                      end
         | 
| 108 | 
            +
             | 
| 109 | 
            +
                      segments["#{ entity }"] = entities
         | 
| 97 110 | 
             
                    end
         | 
| 98 111 |  | 
| 99 112 | 
             
                    def #{entity}(raw = false)
         | 
| @@ -124,7 +137,7 @@ class Corpus | |
| 124 137 | 
             
                      missing << doc
         | 
| 125 138 | 
             
                    end
         | 
| 126 139 | 
             
                  end
         | 
| 127 | 
            -
                  res = self.send("multiple_produce_#{entity.to_s}", missing)
         | 
| 140 | 
            +
                  res = self.send("multiple_produce_#{entity.to_s}", missing) if missing.any?
         | 
| 128 141 | 
             
                  case res
         | 
| 129 142 | 
             
                  when Array
         | 
| 130 143 | 
             
                    res.each_with_index do |res,i|
         | 
| @@ -142,7 +155,9 @@ class Corpus | |
| 142 155 | 
             
                      end
         | 
| 143 156 | 
             
                    end
         | 
| 144 157 | 
             
                  end
         | 
| 145 | 
            -
                  missing.each{|doc| | 
| 158 | 
            +
                  missing.each{|doc|
         | 
| 159 | 
            +
                    doc.send entity 
         | 
| 160 | 
            +
                  }
         | 
| 146 161 | 
             
                end
         | 
| 147 162 |  | 
| 148 163 |  | 
| @@ -197,7 +212,7 @@ class Corpus | |
| 197 212 | 
             
                    FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields unless fields.nil?
         | 
| 198 213 | 
             
                  end
         | 
| 199 214 |  | 
| 200 | 
            -
                  self.class_eval <<-EOC, __FILE__, __LINE__
         | 
| 215 | 
            +
                  self.class_eval <<-EOC, __FILE__, __LINE__ + 1
         | 
| 201 216 | 
             
                    def load_with_persistence_#{entity}(raw = false)
         | 
| 202 217 | 
             
                      repo = TSV_REPOS["#{ entity }"]
         | 
| 203 218 | 
             
                      if repo.nil?
         | 
| @@ -253,7 +268,7 @@ class Corpus | |
| 253 268 |  | 
| 254 269 | 
             
                  FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields 
         | 
| 255 270 |  | 
| 256 | 
            -
                  self.class_eval <<-EOC, __FILE__, __LINE__
         | 
| 271 | 
            +
                  self.class_eval <<-EOC, __FILE__, __LINE__ + 1
         | 
| 257 272 | 
             
                    def load_with_persistence_#{entity}(raw = false)
         | 
| 258 273 | 
             
                      fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]
         | 
| 259 274 |  | 
| @@ -261,20 +276,23 @@ class Corpus | |
| 261 276 |  | 
| 262 277 | 
             
                      begin
         | 
| 263 278 |  | 
| 264 | 
            -
                        data. | 
| 279 | 
            +
                        if data.respond_to? :persistence_path and String === data.persistence_path
         | 
| 280 | 
            +
                          data.filter(data.persistence_path + '.filters')
         | 
| 281 | 
            +
                        end
         | 
| 282 | 
            +
             | 
| 283 | 
            +
                        keys = data.read_and_close do
         | 
| 265 284 |  | 
| 266 | 
            -
             | 
| 285 | 
            +
                          fields = data.fields if fields.nil? and data.respond_to? :fields
         | 
| 267 286 |  | 
| 287 | 
            +
                          data.add_filter("field:#{ doc_field }", @docid) if fields.include?("#{doc_field}")
         | 
| 288 | 
            +
                          data.add_filter("field:#{ entity_field }", "#{ entity }") if fields.include?("#{entity_field}")
         | 
| 289 | 
            +
                          keys = data.keys
         | 
| 290 | 
            +
                          data.pop_filter if fields.include?("#{entity_field}")
         | 
| 291 | 
            +
                          data.pop_filter if fields.include?("#{doc_field}")
         | 
| 268 292 |  | 
| 269 | 
            -
             | 
| 270 | 
            -
                          data.filter(data.persistence_path + '.filters')
         | 
| 293 | 
            +
                          keys
         | 
| 271 294 | 
             
                        end
         | 
| 272 295 |  | 
| 273 | 
            -
                        data.add_filter("field:#{ doc_field }", @docid) if data.fields.include?("#{doc_field}")
         | 
| 274 | 
            -
                        data.add_filter("field:#{ entity_field }", "#{ entity }") if data.fields.include?("#{entity_field}")
         | 
| 275 | 
            -
                        keys = data.keys
         | 
| 276 | 
            -
                        data.pop_filter if data.fields.include?("#{entity_field}")
         | 
| 277 | 
            -
                        data.pop_filter if data.fields.include?("#{doc_field}")
         | 
| 278 296 |  | 
| 279 297 | 
             
                        if keys.empty?
         | 
| 280 298 | 
             
                          segments = produce_#{entity}
         | 
| @@ -289,34 +307,38 @@ class Corpus | |
| 289 307 | 
             
                            "#{ entity }"
         | 
| 290 308 | 
             
                          end
         | 
| 291 309 |  | 
| 292 | 
            -
                           | 
| 293 | 
            -
             | 
| 294 | 
            -
             | 
| 295 | 
            -
             | 
| 296 | 
            -
             | 
| 297 | 
            -
             | 
| 310 | 
            +
                          keys = data.write_and_close do
         | 
| 311 | 
            +
                            data.add_filter("field:#{ doc_field }", @docid) if fields.include?("#{doc_field}")
         | 
| 312 | 
            +
                            data.add_filter("field:#{ entity_field }", "#{ entity }") if fields.include?("#{entity_field}")
         | 
| 313 | 
            +
                            keys = tsv.collect do |key, value|
         | 
| 314 | 
            +
                              data[key] = value
         | 
| 315 | 
            +
                              key
         | 
| 316 | 
            +
                            end
         | 
| 317 | 
            +
                            data.pop_filter if fields.include?("#{entity_field}")
         | 
| 318 | 
            +
                            data.pop_filter if fields.include?("#{doc_field}")
         | 
| 319 | 
            +
                            keys
         | 
| 298 320 | 
             
                          end
         | 
| 299 | 
            -
                          data.pop_filter if data.fields.include?("#{entity_field}")
         | 
| 300 | 
            -
                          data.pop_filter if data.fields.include?("#{doc_field}")
         | 
| 301 | 
            -
                          data.read
         | 
| 302 321 |  | 
| 303 322 | 
             
                        else
         | 
| 304 | 
            -
                          if raw == :check
         | 
| 305 | 
            -
                            data.close
         | 
| 306 | 
            -
                            return nil
         | 
| 307 | 
            -
                          end
         | 
| 323 | 
            +
                          return nil if raw == :check
         | 
| 308 324 | 
             
                        end
         | 
| 309 325 |  | 
| 310 326 | 
             
                        return data.values if raw
         | 
| 311 327 |  | 
| 312 328 | 
             
                        start_pos = data.identify_field "Start"
         | 
| 313 | 
            -
                         | 
| 329 | 
            +
                        data.read_and_close do 
         | 
| 330 | 
            +
                          data.chunked_values_at(keys).collect{|annotation| 
         | 
| 331 | 
            +
                              begin
         | 
| 314 332 | 
             
                            pos = annotation[start_pos]
         | 
| 315 | 
            -
                            Segment.load_tsv_values(text, annotation,  | 
| 316 | 
            -
             | 
| 317 | 
            -
             | 
| 318 | 
            -
             | 
| 319 | 
            -
             | 
| 333 | 
            +
                            Segment.load_tsv_values(text, annotation, fields) unless [-1, "-1", [-1], ["-1"]].include?(pos)
         | 
| 334 | 
            +
                              rescue
         | 
| 335 | 
            +
                                Log.exception $!
         | 
| 336 | 
            +
                                iif keys
         | 
| 337 | 
            +
                                iif [text, annotation]
         | 
| 338 | 
            +
                              end
         | 
| 339 | 
            +
                              
         | 
| 340 | 
            +
                          }.compact
         | 
| 341 | 
            +
                        end
         | 
| 320 342 | 
             
                      ensure
         | 
| 321 343 | 
             
                        data.close
         | 
| 322 344 | 
             
                      end
         | 
| @@ -348,7 +370,7 @@ class Corpus | |
| 348 370 | 
             
                    segment.segments[name] = annotations
         | 
| 349 371 | 
             
                    class << segment
         | 
| 350 372 | 
             
                      self
         | 
| 351 | 
            -
                    end.class_eval "def #{ name }; @segments['#{ name }']; end", __FILE__, __LINE__
         | 
| 373 | 
            +
                    end.class_eval "def #{ name }; @segments['#{ name }']; end", __FILE__, __LINE__ + 1
         | 
| 352 374 | 
             
                  end
         | 
| 353 375 |  | 
| 354 376 | 
             
                  segment
         | 
    
        data/lib/rbbt/text/segment.rb
    CHANGED
    
    | @@ -3,7 +3,7 @@ require 'rbbt/fix_width_table' | |
| 3 3 |  | 
| 4 4 | 
             
            module Segment 
         | 
| 5 5 | 
             
              extend Annotation
         | 
| 6 | 
            -
              self.annotation :offset
         | 
| 6 | 
            +
              self.annotation :offset, :docid
         | 
| 7 7 |  | 
| 8 8 | 
             
              def segment_length
         | 
| 9 9 | 
             
                begin
         | 
| @@ -325,7 +325,7 @@ module Segment | |
| 325 325 | 
             
                tsv = TSV.setup({}, :key_field => "ID", :fields => fields, :type => :double)
         | 
| 326 326 |  | 
| 327 327 | 
             
                segments.each do |segment|
         | 
| 328 | 
            -
                  tsv[segment. | 
| 328 | 
            +
                  tsv[segment.segment_id] = self.tsv_values_for_segment(segment, fields)
         | 
| 329 329 | 
             
                end
         | 
| 330 330 |  | 
| 331 331 | 
             
                tsv
         | 
| @@ -348,6 +348,14 @@ module Segment | |
| 348 348 | 
             
                [offset, self.end] * ".."
         | 
| 349 349 | 
             
              end
         | 
| 350 350 |  | 
| 351 | 
            +
              def segment_id
         | 
| 352 | 
            +
                if self.respond_to?(:docid)
         | 
| 353 | 
            +
                  [docid, locus, Misc.obj2digest(info)] * ":"
         | 
| 354 | 
            +
                else
         | 
| 355 | 
            +
                  Misc.obj2digest(info)
         | 
| 356 | 
            +
                end
         | 
| 357 | 
            +
              end
         | 
| 358 | 
            +
             | 
| 351 359 | 
             
              #def ==(other)
         | 
| 352 360 | 
             
              #  self.text == other.text
         | 
| 353 361 | 
             
              #end
         | 
    
        data/lib/rbbt/text/segment/docid.rb
    CHANGED
    
    | @@ -1,46 +1,46 @@ | |
| 1 1 | 
             
            require 'rbbt/text/segment'
         | 
| 2 2 |  | 
| 3 | 
            -
            module SegmentWithDocid 
         | 
| 4 | 
            -
              extend Annotation
         | 
| 5 | 
            -
             | 
| 6 | 
            -
              self.annotation :docid
         | 
| 7 | 
            -
             | 
| 8 | 
            -
              def masked?
         | 
| 9 | 
            -
                self[0..5] == "MASKED"
         | 
| 10 | 
            -
              end
         | 
| 11 | 
            -
             | 
| 12 | 
            -
              def mask
         | 
| 13 | 
            -
                return self if masked?
         | 
| 14 | 
            -
                raise "Cannot mask an array of elements, they must be masked individually" if Array === self
         | 
| 15 | 
            -
                raise "Cannot mask a segment with no docid" if not self.respond_to? :docid or docid.nil?
         | 
| 16 | 
            -
                raise "Cannot mask a segment with no offset" if offset.nil?
         | 
| 17 | 
            -
                textual_position = ["MASKED", length] * ":"
         | 
| 18 | 
            -
                self.replace(textual_position)
         | 
| 19 | 
            -
                self
         | 
| 20 | 
            -
              end
         | 
| 21 | 
            -
             | 
| 22 | 
            -
              def unmasked_text
         | 
| 23 | 
            -
                return self unless masked?
         | 
| 24 | 
            -
                tag, length = self.split(":")
         | 
| 25 | 
            -
                Document.setup(docid).text[offset.to_i..(offset.to_i+length.to_i-1)]
         | 
| 26 | 
            -
              end
         | 
| 27 | 
            -
             | 
| 28 | 
            -
              def unmask
         | 
| 29 | 
            -
                return self unless masked?
         | 
| 30 | 
            -
                self.replace(unmasked_text)
         | 
| 31 | 
            -
                self
         | 
| 32 | 
            -
              end
         | 
| 33 | 
            -
             | 
| 34 | 
            -
              def str_length
         | 
| 35 | 
            -
                self.length
         | 
| 36 | 
            -
              end
         | 
| 37 | 
            -
             | 
| 38 | 
            -
              def masked_length
         | 
| 39 | 
            -
                self.split(":").last.to_i
         | 
| 40 | 
            -
              end
         | 
| 41 | 
            -
             | 
| 42 | 
            -
              def segment_length
         | 
| 43 | 
            -
                masked? ? masked_length : str_length
         | 
| 44 | 
            -
              end
         | 
| 45 | 
            -
            end
         | 
| 46 | 
            -
             | 
| 3 | 
            +
            #module SegmentWithDocid 
         | 
| 4 | 
            +
            #  extend Annotation
         | 
| 5 | 
            +
            #
         | 
| 6 | 
            +
            #  self.annotation :docid
         | 
| 7 | 
            +
            #
         | 
| 8 | 
            +
            #  def masked?
         | 
| 9 | 
            +
            #    self[0..5] == "MASKED"
         | 
| 10 | 
            +
            #  end
         | 
| 11 | 
            +
            #
         | 
| 12 | 
            +
            #  def mask
         | 
| 13 | 
            +
            #    return self if masked?
         | 
| 14 | 
            +
            #    raise "Cannot mask an array of elements, they must be masked individually" if Array === self
         | 
| 15 | 
            +
            #    raise "Cannot mask a segment with no docid" if not self.respond_to? :docid or docid.nil?
         | 
| 16 | 
            +
            #    raise "Cannot mask a segment with no offset" if offset.nil?
         | 
| 17 | 
            +
            #    textual_position = ["MASKED", length] * ":"
         | 
| 18 | 
            +
            #    self.replace(textual_position)
         | 
| 19 | 
            +
            #    self
         | 
| 20 | 
            +
            #  end
         | 
| 21 | 
            +
            #
         | 
| 22 | 
            +
            #  def unmasked_text
         | 
| 23 | 
            +
            #    return self unless masked?
         | 
| 24 | 
            +
            #    tag, length = self.split(":")
         | 
| 25 | 
            +
            #    Document.setup(docid).text[offset.to_i..(offset.to_i+length.to_i-1)]
         | 
| 26 | 
            +
            #  end
         | 
| 27 | 
            +
            #
         | 
| 28 | 
            +
            #  def unmask
         | 
| 29 | 
            +
            #    return self unless masked?
         | 
| 30 | 
            +
            #    self.replace(unmasked_text)
         | 
| 31 | 
            +
            #    self
         | 
| 32 | 
            +
            #  end
         | 
| 33 | 
            +
            #
         | 
| 34 | 
            +
            #  def str_length
         | 
| 35 | 
            +
            #    self.length
         | 
| 36 | 
            +
            #  end
         | 
| 37 | 
            +
            #
         | 
| 38 | 
            +
            #  def masked_length
         | 
| 39 | 
            +
            #    self.split(":").last.to_i
         | 
| 40 | 
            +
            #  end
         | 
| 41 | 
            +
            #
         | 
| 42 | 
            +
            #  def segment_length
         | 
| 43 | 
            +
            #    masked? ? masked_length : str_length
         | 
| 44 | 
            +
            #  end
         | 
| 45 | 
            +
            #end
         | 
| 46 | 
            +
            #
         | 
    
        data/lib/rbbt/text/segment/transformed.rb
    CHANGED
    
    | @@ -111,10 +111,10 @@ module Transformed | |
| 111 111 |  | 
| 112 112 | 
             
                  self[updated_begin..updated_end] = new
         | 
| 113 113 |  | 
| 114 | 
            -
                  @transformed_segments[segment. | 
| 114 | 
            +
                  @transformed_segments[segment.segment_id] = [segment.range, diff, updated_text, updated_range, @transformed_segments.size]
         | 
| 115 115 |  | 
| 116 116 | 
             
                  segment.replace original_text
         | 
| 117 | 
            -
                  stack << segment. | 
| 117 | 
            +
                  stack << segment.segment_id
         | 
| 118 118 | 
             
                end
         | 
| 119 119 | 
             
                @transformation_stack << stack
         | 
| 120 120 | 
             
              end
         | 
    
        data/test/rbbt/text/corpus/test_document.rb
    CHANGED
    
    | @@ -10,10 +10,6 @@ class TestCorpusDocument < Test::Unit::TestCase | |
| 10 10 | 
             
                  Segment.align(self.text, words)
         | 
| 11 11 | 
             
                end
         | 
| 12 12 |  | 
| 13 | 
            -
                Open.mkdir Rbbt.tmp.test.annotations.find
         | 
| 14 | 
            -
                Corpus::Document.persist_in_global_tsv(:words, Rbbt.tmp.test.anotations.words.find)
         | 
| 15 | 
            -
             | 
| 16 | 
            -
             | 
| 17 13 | 
             
                Corpus::Document.define_multiple :words2 do |documents|
         | 
| 18 14 | 
             
                  documents.collect do |doc|
         | 
| 19 15 | 
             
                    words = doc.text.split(" ")
         | 
| @@ -21,32 +17,66 @@ class TestCorpusDocument < Test::Unit::TestCase | |
| 21 17 | 
             
                  end
         | 
| 22 18 | 
             
                end
         | 
| 23 19 |  | 
| 20 | 
            +
                Open.mkdir Rbbt.tmp.test.annotations.find
         | 
| 21 | 
            +
             | 
| 22 | 
            +
                Corpus::Document.persist_in_global_tsv(:words, Rbbt.tmp.test.anotations.words.find)
         | 
| 24 23 | 
             
                Corpus::Document.persist_in_global_tsv(:words2, Rbbt.tmp.test.anotations.counts.find)
         | 
| 25 24 | 
             
              end
         | 
| 26 25 |  | 
| 27 26 | 
             
              def test_words
         | 
| 28 27 | 
             
                text = "This is a test document"
         | 
| 29 | 
            -
                document = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc", text)
         | 
| 28 | 
            +
                document = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:1", text)
         | 
| 30 29 | 
             
                assert_equal Segment.sort(document.words), text.split(" ")
         | 
| 30 | 
            +
                assert document.words.first.docid
         | 
| 31 | 
            +
                assert document.words.first.segment_id.include?("TEST")
         | 
| 31 32 | 
             
              end
         | 
| 32 33 |  | 
| 33 34 | 
             
              def test_words_multiple
         | 
| 34 35 | 
             
                document1 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:1", "This is a test document")
         | 
| 35 | 
            -
                document2 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc2:2", "This is  | 
| 36 | 
            +
                document2 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc2:2", "This is another test document")
         | 
| 37 | 
            +
                document3 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc3:3", "This is yet another test document")
         | 
| 36 38 |  | 
| 37 | 
            -
                docs = [document1, document2]
         | 
| 39 | 
            +
                docs = [document1, document2, document3]
         | 
| 38 40 |  | 
| 39 41 | 
             
                Corpus::Document.prepare_multiple(docs, :words2)
         | 
| 40 | 
            -
             | 
| 42 | 
            +
             | 
| 43 | 
            +
                assert document1.words.first.docid
         | 
| 44 | 
            +
                assert document1.words.first.segment_id.include?("TEST")
         | 
| 45 | 
            +
             | 
| 41 46 | 
             
                assert_equal document1.words2, document1.text.split(" ")
         | 
| 42 47 | 
             
                assert_equal document2.words2, document2.text.split(" ")
         | 
| 48 | 
            +
                assert_equal document3.words2, document3.text.split(" ")
         | 
| 43 49 |  | 
| 44 50 | 
             
                document1 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:1", "This is a test document")
         | 
| 45 | 
            -
                document2 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc2:2", "This is  | 
| 51 | 
            +
                document2 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc2:2", "This is another test document")
         | 
| 46 52 |  | 
| 47 53 | 
             
                docs = [document1, document2]
         | 
| 48 54 |  | 
| 49 55 | 
             
                Corpus::Document.prepare_multiple(docs, :words2)
         | 
| 50 56 | 
             
              end
         | 
| 57 | 
            +
             | 
| 58 | 
            +
              def test_parallel
         | 
| 59 | 
            +
                text =<<-EOF
         | 
| 60 | 
            +
            This is a test document number
         | 
| 61 | 
            +
                EOF
         | 
| 62 | 
            +
             | 
| 63 | 
            +
                docs = []
         | 
| 64 | 
            +
                100.times do |i|
         | 
| 65 | 
            +
                  docs << text.chomp + " " + i.to_s
         | 
| 66 | 
            +
                end
         | 
| 67 | 
            +
             | 
| 68 | 
            +
                Log.with_severity 0 do
         | 
| 69 | 
            +
                  TSV.traverse docs, :cpus => 10, :bar => true do |doc|
         | 
| 70 | 
            +
                    hash = Misc.digest(doc)
         | 
| 71 | 
            +
                    document = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:test:#{hash}", doc)
         | 
| 72 | 
            +
                    assert_equal Segment.sort(document.words), document.text.split(" ")
         | 
| 73 | 
            +
                  end
         | 
| 74 | 
            +
                  TSV.traverse docs, :cpus => 10, :bar => true do |doc|
         | 
| 75 | 
            +
                    hash = Misc.digest(doc)
         | 
| 76 | 
            +
                    document = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:test:#{hash}", doc)
         | 
| 77 | 
            +
                    assert_equal Segment.sort(document.words), document.text.split(" ")
         | 
| 78 | 
            +
                  end
         | 
| 79 | 
            +
                end
         | 
| 80 | 
            +
              end
         | 
| 51 81 | 
             
            end
         | 
| 52 82 |  | 
    
        metadata
    CHANGED
    
    | @@ -1,14 +1,14 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: rbbt-text
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 1.1.9
         | 
| 4 | 
            +
              version: 1.2.0
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors:
         | 
| 7 7 | 
             
            - Miguel Vazquez
         | 
| 8 8 | 
             
            autorequire: 
         | 
| 9 9 | 
             
            bindir: bin
         | 
| 10 10 | 
             
            cert_chain: []
         | 
| 11 | 
            -
            date: 2020-04- | 
| 11 | 
            +
            date: 2020-04-16 00:00:00.000000000 Z
         | 
| 12 12 | 
             
            dependencies:
         | 
| 13 13 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 14 14 | 
             
              name: rbbt-util
         |