RubyGems - rbbt-text - Versions diffs - 1.3.4 → 1.3.7 - Mend

rbbt-text 1.3.4 → 1.3.7

Files changed (35) hide show

checksums.yaml +4 -4
data/LICENSE +20 -0
data/lib/rbbt/document/annotation.rb +2 -2
data/lib/rbbt/document/corpus/pubmed.rb +14 -5
data/lib/rbbt/document/corpus.rb +10 -7
data/lib/rbbt/document.rb +7 -3
data/lib/rbbt/ner/abner.rb +3 -2
data/lib/rbbt/ner/banner.rb +3 -1
data/lib/rbbt/ner/brat.rb +1 -1
data/lib/rbbt/ner/linnaeus.rb +2 -1
data/lib/rbbt/ner/oscar3.rb +0 -1
data/lib/rbbt/ner/oscar4.rb +0 -1
data/lib/rbbt/ner/rner.rb +229 -0
data/lib/rbbt/ner/rnorm/tokens.rb +3 -1
data/lib/rbbt/ner/rnorm.rb +5 -1
data/lib/rbbt/ner/token_trieNER.rb +2 -1
data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +1 -1
data/lib/rbbt/nlp/spaCy.rb +158 -15
data/lib/rbbt/relationship.rb +24 -0
data/lib/rbbt/segment/named_entity.rb +4 -0
data/lib/rbbt/segment/range_index.rb +1 -1
data/lib/rbbt/segment/transformed.rb +9 -1
data/lib/rbbt/segment.rb +3 -0
data/share/install/software/OpenNLP +3 -8
data/share/rner/config.rb +51 -0
data/test/rbbt/document/corpus/test_pubmed.rb +1 -1
data/test/rbbt/document/test_annotation.rb +10 -1
data/test/rbbt/document/test_corpus.rb +14 -0
data/test/rbbt/ner/rnorm/test_tokens.rb +11 -0
data/test/rbbt/ner/test_rner.rb +132 -0
data/test/rbbt/ner/test_rnorm.rb +5 -0
data/test/rbbt/segment/test_named_entity.rb +2 -1
data/test/rbbt/segment/test_transformed.rb +13 -30
data/test/test_spaCy.rb +113 -1
metadata +13 -18

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 496288d7d3ff1215ded1fd210192d5887a6a071eea5f322295a669a5d648d77b
-  data.tar.gz: 47996496009cbcdaab38a9dc9bf6efbbe7fc0145f315b0a48bfab0f543742f94
+  metadata.gz: 8dfc374254fcbe88c8be6bfffd9a3cfabf6e23c953c11ecd2f61cf41027ff3d6
+  data.tar.gz: 3d3211f41cfecea05862505d1508a4b7b76eecb3c90b3b0000194eb08033715e
 SHA512:
-  metadata.gz: 36e7415ad06207066844a30001c8541865f066d1e83a4a2ddc5182c54b704cd3d442cbccce219bd2114717a83656d07558c42725eca75597fea239b6e13244ab
-  data.tar.gz: 988eff4d242d0425910b96fac4188df079c8c53c3abea2825cc97d5af5118841680705fa33461a5b4cfa7b8d6b32a486465e44b75f20fad324e4623c6c8083d8
+  metadata.gz: 7ed870e46bae2c113d0885697bfbade6064732a89477833c640eaf4ee8bdb2c0fbf52f69f456af5eb30a82e56a7f0aeb37e71127f884430c3d315202a07fa3cb
+  data.tar.gz: e31853e816321a5ead788036b5f67eecaca179c75168c0bb2804be1f18ae844031ab808a4e3c9d67e1f9a52f94ca478949798b8101e164eba32481c0182a1f58

data/LICENSE ADDED Viewed

@@ -0,0 +1,20 @@
+Copyright (c) 2010-2022 Miguel Vázquez García
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/lib/rbbt/document/annotation.rb CHANGED Viewed

@@ -13,7 +13,7 @@ module Document
       end
       docid = self.docid
-      segments.each{|s| s.docid = docid if s.docid.nil? }
+      segments.each{|s| s.docid = docid }
       segments
     end
@@ -36,7 +36,7 @@ module Document
         docid = document.docid
-        segments.each{|s| s.docid = docid if s.docid.nil? }
+        segments.each{|s| s.docid = docid }
         segments
       end

data/lib/rbbt/document/corpus/pubmed.rb CHANGED Viewed

@@ -1,21 +1,30 @@
 require 'rbbt/sources/pubmed'
 module Document::Corpus
-  def add_pmid(pmid, type = nil)
+  PUBMED_NAMESPACE="PMID"
+  def add_pmid(pmid, type = nil, update = false)
+    type = :abstract if type.nil?
+    if update == false
+      id = [PUBMED_NAMESPACE, pmid, type].collect{|e| e.to_s}*":"
+      documents = self.documents(id)
+      return documents if documents.any?
+    end
     pmids = Array === pmid ? pmid : [pmid]
     type = nil if String === type and type.empty?
     res = PubMed.get_article(pmids).collect do |pmid, article|
-      document = if type.nil? || type.to_sym == :abstract
-                   Document.setup(article.abstract || "", "PMID", pmid, :abstract, self, :corpus => self)
+      document = if type.to_sym == :abstract
+                   Document.setup(article.abstract || "", PUBMED_NAMESPACE, pmid, :abstract, self, :corpus => self)
                  elsif type.to_sym == :title
-                   Document.setup(article.title, :PMID, pmid, :title, self)
+                   Document.setup(article.title, PUBMED_NAMESPACE, pmid, :title, self)
                  else
                    raise "No FullText available for #{ pmid }" if article.full_text.nil?
-                   Document.setup(article.full_text, :PMID, pmid, :fulltext, self, :corpus => self)
+                   Document.setup(article.full_text, PUBMED_NAMESPACE, pmid, :fulltext, self, :corpus => self)
                  end
       Log.debug "Loading pmid #{pmid}"
       add_document(document)
+      document
     end
     Document.setup(res)

data/lib/rbbt/document/corpus.rb CHANGED Viewed

@@ -3,8 +3,10 @@ require 'rbbt-util'
 module Document::Corpus
   def self.setup(corpus)
+    corpus = Persist.open_tokyocabinet(corpus, true, :single, "BDB") if String === corpus
     corpus.extend Document::Corpus unless Document::Corpus === corpus
     corpus.extend Persist::TSVAdapter unless Persist::TSVAdapter === corpus
+    corpus.close
     corpus
   end
@@ -16,22 +18,23 @@ module Document::Corpus
     end
   end
-  def docids(prefix)
-    prefix += ":" unless prefix == :all || prefix[-1] == ":"
+  def docids(*prefix)
+    prefix = prefix * ":"
+    prefix += ":" unless prefix == :all || prefix == "all" || prefix[-1] == ":"
     docids = self.read_and_close do
-      prefix == :all ? self.keys : self.prefix(prefix)
+      prefix == "all" ? self.keys : self.prefix(prefix)
     end
     DocID.setup(docids, :corpus => self)
   end
-  def documents(prefix)
-    self.docids(prefix).document
+  def documents(*prefix)
+    self.docids(*prefix).document
   end
   def [](*args)
     docid, *rest = args
-    res = self.read_and_close do
+    res = self.with_read do
       super(*args)
     end
@@ -41,7 +44,7 @@ module Document::Corpus
     namespace, id, type  = docid.split(":")
     if res.nil?
-      if Document::Corpus.claims.include?(namespace.to_s)
+      if Document::Corpus.claims && Document::Corpus.claims.include?(namespace.to_s)
         res = self.instance_exec(id, type, &Document::Corpus.claims[namespace.to_s])
       end
     end

data/lib/rbbt/document.rb CHANGED Viewed

@@ -22,9 +22,7 @@ module DocID
     if Array === self
       namespace, id, type = nil, nil, nil
       docs = self.collect do |docid|
-        text = self.corpus[docid]
-        namespace, id, type = docid.split(":")
-        text
+        self.corpus[docid]
       end
       Document.setup(docs, :corpus => corpus)
     else
@@ -53,3 +51,9 @@ module Document
   alias id docid
 end
+#class String
+#  def docid
+#    digest = Misc.digest(self)
+#    ["STRING", digest, nil, nil] * ":"
+#  end
+#end

data/lib/rbbt/ner/abner.rb CHANGED Viewed

@@ -39,14 +39,15 @@ class Abner < NER
     types = res[1]
     strings = res[0]
+    docid = Misc.digest(text)
     global_offset = 0
     strings.zip(types).collect do |mention, type|
       mention = mention.to_s;
       offset = text.index(mention)
       if offset.nil?
-        NamedEntity.setup(mention, nil, type.to_s)
+        NamedEntity.setup(mention, :docid => docid, :entity_type => type)
       else
-        NamedEntity.setup(mention, offset + global_offset, type.to_s)
+        NamedEntity.setup(mention, :offset => offset + global_offset, :docid => docid, :entity_type => type.to_s)
         text = text[offset + mention.length..-1]
         global_offset += offset + mention.length
       end

data/lib/rbbt/ner/banner.rb CHANGED Viewed

@@ -55,6 +55,7 @@ class Banner < NER
   # text.
   def match(text)
     return [] if text.nil?
+    text = text.dup if text.frozen?
     text.gsub!(/\n/,' ')
     text.gsub!(/\|/,'/') # Character | gives an error
     return [] if text.strip.empty?
@@ -66,6 +67,7 @@ class Banner < NER
     @parenPP.postProcess(sentence)
     tagged = sentence.getSGML
+    docid = Misc.digest text
     res = tagged.scan(/<GENE>.*?<\/GENE>/).
       collect{|r|
       r.match(/<GENE>(.*?)<\/GENE>/)
@@ -73,7 +75,7 @@ class Banner < NER
       mention.sub!(/^\s*/,'')
       mention.sub!(/\s*$/,'')
       offset = text.index(mention)
-      NamedEntity.setup(mention, offset, 'GENE')
+      NamedEntity.setup(mention, :offset => offset, :docid => docid, :entity_type => 'GENE')
       mention
     }
     res

data/lib/rbbt/ner/brat.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 require 'rbbt/segment/named_entity'
-require 'rbbt/text/segment/relationship'
+require 'rbbt/relationship'
 module Brat
   Rbbt.claim Rbbt.software.opt.Brat, :install, "https://github.com/nlplab/brat.git"

data/lib/rbbt/ner/linnaeus.rb CHANGED Viewed

@@ -31,7 +31,8 @@ module Linnaeus
     init unless defined? @@Matcher
     @@Matcher.match(text).toArray().collect do |mention|
-      NamedEntity.setup(mention.text(), :offset => mention.start(), :entity_type => "Organism", :code => mention.ids(), :score => mention.probabilities())
+      best_id, best_prob = mention.ids().zip(mention.probabilities()).sort_by{|i,p| p.to_f }.last
+      NamedEntity.setup(mention.text(), :offset => mention.start(), :entity_type => "Organism", :code => best_id, :score => best_prob)
     end
   end
 end

data/lib/rbbt/ner/oscar3.rb CHANGED Viewed

@@ -1,6 +1,5 @@
 require 'rbbt'
 require 'rjb'
-require 'libxml'
 require 'rbbt/ner/NER'
 require 'rbbt/util/log'

data/lib/rbbt/ner/oscar4.rb CHANGED Viewed

@@ -1,6 +1,5 @@
 require 'rbbt'
 require 'rjb'
-require 'libxml'
 require 'rbbt/segment'
 require 'rbbt/ner/NER'
 require 'rbbt/util/log'

data/lib/rbbt/ner/rner.rb ADDED Viewed

@@ -0,0 +1,229 @@
+require 'rbbt'
+require 'rbbt/util/open'
+require 'rbbt/util/misc'
+require 'rbbt/util/simpleDSL'
+class NERFeatures
+  include SimpleDSL
+  def self.tokens(text)
+    text.scan(/
+              \w*-?(?:\d*\d[.,]\d\d*|\d+)\w*|
+              \w-\w*|
+              \w+-[A-Z](?!\w)|
+              \w+|
+              [.,()\/\[\]{}'"+-]
+              /x)
+  end
+  def self.reverse(text)
+    tokens(text).reverse.join(" ")
+  end
+  def define(name, *args, &block)
+    action = args[0] || block ||  /#{name.to_s}s?/i
+    raise "Wrong format" unless (action.is_a?(Proc) || action.is_a?(Regexp))
+    @types[name.to_s] = action
+    @order.push name.to_s
+    name.to_s
+  end
+  attr_accessor :reverse
+  def initialize(file = nil, reverse = false, &block)
+    @types   = {}
+    @order   = []
+    @context = []
+    @reverse = reverse
+    file ||= Rbbt.share.ner['config.rb'].find if !file && !block
+    parse(:define, file, &block)
+  end
+  def config
+    @config[:define]
+  end
+  def window(positions)
+    @window = positions
+  end
+  def context(name, &block)
+    if name.is_a? Array
+      @context += name
+    else
+      @context.push name
+      # The block might be wrongly assigned to this function
+      # instead of the actual definition, fix that.
+      if block
+        @types[name] = block
+      end
+    end
+  end
+  def direction(dir)
+    if dir.to_sym == :reverse
+      @reverse = true
+    end
+  end
+  def features(word)
+    values = [word]
+    @order.each{|features|
+      action = @types[features]
+      if action.is_a?(Proc)
+        values.push(action.call(word))
+      else
+        m = action.match(word)
+        if m
+          if m[1]
+            values.push(m[1])
+          else
+            values.push(m != nil)
+          end
+        else
+          values.push(false)
+        end
+      end
+    }
+    values
+  end
+  def template(window=nil)
+    window ||= @window || [1,-1]
+    template = ""
+    i = 1
+    @order.each{|feat|
+      template += "U#{ feat }: %x[0,#{ i }]\n"
+      if @context.include?(feat)
+        window.each{|p|
+          template += "U#{ feat }##{ p}: %x[#{ p },#{ i }]\n"
+        }
+      end
+      i += 1
+    }
+    template += "B\n"
+    template
+  end
+  def text_features(text, positive = nil)
+    text = self.class.reverse(text) if @reverse
+    initial = true
+    self.class.tokens(text).collect{|token|
+      features = features(token)
+      if !positive.nil?
+        features << (positive ? (initial ? 1 : 2) : 0)
+        initial = false
+      end
+      features
+    }
+  end
+  def tagged_features(text, mentions)
+    mentions ||= []
+    mentions = ['IMPOSSIBLE_MATCH'] if mentions.empty?
+    re = mentions.collect{|mention|
+      Regexp.quote(mention.gsub(/\s+/,' ')).sub(/\\s/,'\s+')
+    }.join("|")
+    positive = false
+    features = []
+    chunks = text.split(/(#{re})/)
+    chunks.each{|t|
+      chunk_features = text_features(t, positive)
+      positive = !positive
+      if @reverse
+        features = chunk_features + features
+      else
+        features = features + chunk_features
+      end
+    }
+    features
+  end
+  def train(features, model)
+    tmp_template = TmpFile.tmp_file("template-")
+    Open.write(tmp_template,template)
+    cmd = "#{File.join(Rbbt.datadir, 'third_party/crf++/bin/crf_learn')} '#{tmp_template}'  '#{features}' '#{model}'"
+    system cmd
+    Open.write(model + '.config',config)
+    FileUtils.rm tmp_template
+  end
+end
+class NER
+  def initialize(model = nil)
+    begin
+      require 'CRFPP'
+    rescue Exception
+      require File.join(Rbbt.datadir, 'third_party/crf++/ruby/CRFPP')
+    end
+    model ||= File.join(Rbbt.datadir, + 'ner/model/BC2')
+    @parser = NERFeatures.new(model + '.config')
+    @reverse = @parser.reverse
+    @tagger = CRFPP::Tagger.new("-m #{ model } -v 3 -n2")
+  end
+  def extract(text)
+    features = @parser.text_features(text)
+    @tagger.clear
+    features.each{|feats|
+      @tagger.add(feats.join(" "))
+    }
+    @tagger.parse
+    found = []
+    mention = []
+    @tagger.size.times{|i|
+      label = @tagger.y(i)
+      word  = @tagger.x(i,0)
+      if word == ')'
+        mention.push(')') if mention.join =~ /\(/
+        next
+      end
+      case label
+      when 1
+        if mention.any? && ( mention.join(" ").is_special? || mention.select{|m| m.is_special?}.any?)
+          found.push(mention)
+          mention = []
+        end
+        mention.push(word)
+      when 2
+        mention.push(word)
+      when 0
+        found.push(mention) if mention.any?
+        mention = []
+      end
+    }
+    found << mention if mention.any?
+    found.collect{|list|
+      list = list.reverse if @reverse
+      list.join(" ")
+    }
+  end
+end

data/lib/rbbt/ner/rnorm/tokens.rb CHANGED Viewed

@@ -172,6 +172,7 @@ class Tokenizer
   #{{{ Token Types
   GREEK_RE = "(?:" + $greek.keys.select{|w| w.length > 3}.collect{|w| w.downcase}.join("|") + ")"
+  GREEK_LETTER_RE = "(?:" + $inverse_greek.keys.select{|w| w.length == 1}.collect{|w| w.upcase}.join("|") + ")"
   def tokenize(word)
     return word.
       gsub(/([^IVX])I$/,'\1|I|').     # Separate last roman number
@@ -180,6 +181,7 @@ class Tokenizer
       gsub(/([A-Z]{2,})([a-z])/,'\1-\2').
       gsub(/^(#{GREEK_RE})/,'\1-').
       gsub(/(#{GREEK_RE})$/,'-\1').
+      gsub(/(#{GREEK_LETTER_RE})$/,'-\1').
       split( /[^\w.]+/).  # Split by separator char
       select{|t|  !t.empty? }
   end
@@ -204,7 +206,7 @@ class Tokenizer
   end
   #{{{ Comparisons
   def evaluate_tokens(list1, list2)
     @operations.inject(0){|acc, o|
       acc + o.eval(list1, list2)

data/lib/rbbt/ner/rnorm.rb CHANGED Viewed

@@ -18,6 +18,10 @@ class Normalizer
     values.select{|p| p[1] == best}
   end
+  def token_evaluate(mention, name)
+    @tokens.evaluate(mention, name)
+  end
   # Compares the tokens and gives each candidate a score based on the
   # commonalities and differences amongst the tokens.
   def token_score(code, mention)
@@ -31,7 +35,7 @@ class Normalizer
               when mention.downcase.gsub(/\s/,'') == name.downcase.gsub(/\s/,'')
                 80
               else
-                @tokens.evaluate(mention, name)
+                token_evaluate(mention, name)
               end
       [value, name]
     }.sort_by{|value, name| value }.last

data/lib/rbbt/ner/token_trieNER.rb CHANGED Viewed

@@ -249,7 +249,8 @@ class TokenTrieNER < NER
       match << ((t.respond_to?(:original) and not t.original.nil?) ? t.original : t)
     }
-    NamedEntity.setup(match, :offset => match_tokens.first.offset, :entity_type => type, :code => codes)
+    type = type.first
+    NamedEntity.setup(match, :offset => match_tokens.first.offset, :entity_type => type, :code => codes, :type => type)
   end
   attr_accessor :index, :longest_match, :type, :slack, :split_at, :no_clean, :stem

data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb CHANGED Viewed

@@ -7,7 +7,7 @@ module OpenNLP
   Rbbt.claim Rbbt.software.opt.OpenNLP, :install, Rbbt.share.install.software.OpenNLP.find
-  Rbbt.claim Rbbt.software.opt.OpenNLP.models["da-sent.bin"], :url, "http://opennlp.sourceforge.net/models-1.5/de-sent.bin"
+  Rbbt.claim Rbbt.software.opt.OpenNLP.models["da-sent.bin"], :url, "https://www.apache.org/dyn/closer.cgi/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin"
   MAX = 5