RubyGems - rbbt-text - Versions diffs - 1.1.9 → 1.3.3 - Mend

rbbt-text 1.1.9 → 1.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (76)

checksums.yaml +4 -4
data/lib/rbbt/bow/bow.rb +5 -2
data/lib/rbbt/bow/dictionary.rb +27 -23
data/lib/rbbt/document.rb +56 -0
data/lib/rbbt/document/annotation.rb +45 -0
data/lib/rbbt/document/corpus.rb +61 -0
data/lib/rbbt/document/corpus/pubmed.rb +33 -0
data/lib/rbbt/ner/NER.rb +3 -3
data/lib/rbbt/ner/abner.rb +1 -1
data/lib/rbbt/ner/banner.rb +1 -1
data/lib/rbbt/ner/brat.rb +1 -1
data/lib/rbbt/ner/chemical_tagger.rb +1 -2
data/lib/rbbt/ner/g_norm_plus.rb +42 -12
data/lib/rbbt/ner/linnaeus.rb +3 -3
data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
data/lib/rbbt/ner/oscar3.rb +1 -2
data/lib/rbbt/ner/oscar4.rb +3 -3
data/lib/rbbt/ner/patterns.rb +5 -5
data/lib/rbbt/ner/regexpNER.rb +1 -2
data/lib/rbbt/ner/token_trieNER.rb +35 -22
data/lib/rbbt/nlp/genia/sentence_splitter.rb +3 -2
data/lib/rbbt/nlp/nlp.rb +5 -5
data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
data/lib/rbbt/nlp/spaCy.rb +52 -0
data/lib/rbbt/segment.rb +179 -0
data/lib/rbbt/segment/annotation.rb +58 -0
data/lib/rbbt/segment/encoding.rb +18 -0
data/lib/rbbt/{text/segment → segment}/named_entity.rb +11 -10
data/lib/rbbt/segment/overlaps.rb +63 -0
data/lib/rbbt/segment/range_index.rb +35 -0
data/lib/rbbt/segment/relationship.rb +7 -0
data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
data/lib/rbbt/segment/token.rb +23 -0
data/lib/rbbt/{text/segment → segment}/transformed.rb +10 -8
data/lib/rbbt/segment/tsv.rb +41 -0
data/share/install/software/Linnaeus +1 -1
data/share/install/software/OpenNLP +1 -1
data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
data/test/rbbt/document/test_annotation.rb +140 -0
data/test/rbbt/document/test_corpus.rb +33 -0
data/test/rbbt/ner/test_finder.rb +3 -3
data/test/rbbt/ner/test_g_norm_plus.rb +20 -3
data/test/rbbt/ner/test_patterns.rb +9 -9
data/test/rbbt/ner/test_regexpNER.rb +14 -14
data/test/rbbt/ner/test_rnorm.rb +3 -4
data/test/rbbt/ner/test_token_trieNER.rb +1 -0
data/test/rbbt/nlp/genia/test_sentence_splitter.rb +37 -3
data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
data/test/rbbt/segment/test_annotation.rb +39 -0
data/test/rbbt/segment/test_corpus.rb +36 -0
data/test/rbbt/segment/test_encoding.rb +24 -0
data/test/rbbt/{text/segment → segment}/test_named_entity.rb +15 -11
data/test/rbbt/segment/test_overlaps.rb +69 -0
data/test/rbbt/segment/test_range_index.rb +42 -0
data/test/rbbt/{text/segment → segment}/test_transformed.rb +105 -51
data/test/rbbt/test_document.rb +14 -0
data/test/rbbt/test_segment.rb +182 -0
data/test/test_helper.rb +5 -3
data/test/test_spaCy.rb +32 -0
metadata +44 -32
data/lib/rbbt/text/corpus.rb +0 -106
data/lib/rbbt/text/corpus/document.rb +0 -361
data/lib/rbbt/text/corpus/document_repo.rb +0 -68
data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
data/lib/rbbt/text/document.rb +0 -39
data/lib/rbbt/text/segment.rb +0 -355
data/lib/rbbt/text/segment/docid.rb +0 -46
data/lib/rbbt/text/segment/relationship.rb +0 -24
data/lib/rbbt/text/segment/token.rb +0 -49
data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
data/test/rbbt/text/corpus/test_document.rb +0 -52
data/test/rbbt/text/segment/test_relationship.rb +0 -0
data/test/rbbt/text/segment/test_segmented.rb +0 -23
data/test/rbbt/text/test_corpus.rb +0 -34
data/test/rbbt/text/test_document.rb +0 -58
data/test/rbbt/text/test_segment.rb +0 -100

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 77391b4691e4ea2a6e5da918bc40820bae8175ff1d82f9c96a1685986605dfd7
-  data.tar.gz: a83dd9236502d1787f1040fb4c60a6160086515713282283e434b589c1425743
+  metadata.gz: d25c6d473e1ee0a8ba79af357571181539b6e18e6b8d11e85fcca037069be3bf
+  data.tar.gz: dbc3621f7fbc0ab5569b9f98a527c20cbc4192c6db211504a904364452518caf
 SHA512:
-  metadata.gz: f69d7eb10741d2b3c7735e8e29f29625567775647d16d0261b42cce108d2f8309a2e938dad3360842a964a9c5d4fd5a2197c72618ab40971f7a65306e9c6936a
-  data.tar.gz: dec802a15cfc7c8c9a90ee8ec0c83af88c881ee16e071776a995554aa0661603bdd6cb7bf30162c43beccf1a423a2e8d26afc15f92544ccc08284a87a038a1b2
+  metadata.gz: 7a6568d91518fa0c4aedd748fe2b7c2db745a2997efb03c00993ebc24f6682422d209aa266912bcaca32c2033b6babbf9b14db2bf39973b4a33a69fa9ed07eca
+  data.tar.gz: 2c87eeccbb22e90c87611024429918e4a3fbdcf8212d6b6538e063d6e0116a457a6c855bafad4ac4621bfb7ae91ff18d2cea0cbcf56dfdff79c2ec88666cbf18

data/lib/rbbt/bow/bow.rb CHANGED

@@ -69,6 +69,11 @@ module BagOfWords
     count = bigrams ? count(bigrams(text)) : count(words(text))
     count.values_at(*terms)
   end
+  def self.weighted_features(text, weights)
+    features = features(text, weights.keys)
+    features.zip(weights.values).collect{|f,w| f * w }
+  end
 end
 class String
@@ -82,5 +87,3 @@ class String
     BagOfWords.bigrams(self)
   end
 end

data/lib/rbbt/bow/dictionary.rb CHANGED

@@ -74,28 +74,32 @@ class Dictionary::TF_IDF
   end
   def best(options = {})
-    high, low, limit = {
-      :low   => 0,
-      :high    => 1,
-    }.merge(options).
-    values_at(:high, :low, :limit)
-    num_docs = @num_docs.to_f
-    best = df.select{|term, value|
-      value >= low && value <= high
-    }.collect{|p|
-      term     = p.first
-      df_value = p.last
-      [term,
-       @terms[term].to_f / num_docs * Math::log(1.0/df_value)
-      ]
-    }
-    if limit
-      Hash[*best.sort{|a,b| b[1] <=>  a[1]}.slice(0, limit).flatten]
-    else
-      Hash[*best.flatten]
-    end
+    key = Misc.obj2digest(options)
+    @best ||= {}
+    @best[key] ||= begin
+                     high, low, limit = {
+                       :low   => 0,
+                       :high    => 1,
+                     }.merge(options).
+                     values_at(:high, :low, :limit)
+                     num_docs = @num_docs.to_f
+                     best = df.select{|term, value|
+                       value >= low && value <= high
+                     }.collect{|p|
+                       term     = p.first
+                       df_value = p.last
+                       [term,
+                        @terms[term].to_f / num_docs * Math::log(1.0/df_value)
+                       ]
+                     }
+                     if limit
+                       Hash[*best.sort{|a,b| b[1] <=>  a[1]}.slice(0, limit-1).flatten]
+                     else
+                       Hash[*best.flatten]
+                     end
+                   end
   end
   def weights(options = {})
@@ -173,7 +177,7 @@ class Dictionary::KL
       best[term] = pos * Math::log(pos / neg) + neg * Math::log(neg / pos)
     }
     if limit
-      Hash[*best.sort{|a,b| b[1] <=>  a[1]}.slice(0, limit).flatten]
+      Hash[*best.sort{|a,b| b[1] <=>  a[1]}.slice(0, limit-1).flatten]
     else
       best
     end

data/lib/rbbt/document.rb ADDED

@@ -0,0 +1,56 @@
+require 'rbbt-util'
+require 'rbbt/entity'
+module DocID
+  extend Entity
+  self.annotation :corpus
+  class << self
+    attr_accessor :default_corpus
+  end
+  def corpus
+    annotation_values[:corpus] || DocID.default_corpus
+  end
+  property :to do |type|
+    namespace, code = self.split(":")
+    DocID.setup([namespace, code, "title"] * ":", :corpus => corpus)
+  end
+  property :document => :both do
+    if Array === self
+      namespace, id, type = nil, nil, nil
+      docs = self.collect do |docid|
+        text = self.corpus[docid]
+        namespace, id, type = docid.split(":")
+        #Document.setup(text, namespace, id, type, :corpus => corpus)
+        text
+      end
+      Document.setup(docs, :corpus => corpus)
+    else
+      text = self.corpus[self]
+      namespace, id, type = self.split(":")
+      Document.setup(text, :namespace => namespace, :code => id, :type => type, :corpus => corpus)
+    end
+  end
+end
+module Document
+  extend Entity
+  self.annotation :namespace, :code, :type, :corpus
+  property :docid do |corpus=nil|
+    digest = Misc.digest(self)
+    corpus = self.corpus if corpus.nil?
+    DocID.setup([namespace, code, type, digest] * ":", :corpus => corpus)
+  end
+  property :to do |type|
+    docid.to(type).document
+  end
+  alias id docid
+end

data/lib/rbbt/document/annotation.rb ADDED

@@ -0,0 +1,45 @@
+require 'rbbt/segment'
+require 'rbbt/segment/annotation'
+module Document
+  def self.define(type, &block)
+    send :property, type do
+      segments = self.instance_exec &block
+      Segment.align(self, segments) unless segments.empty? || (Segment === segments && segments.offset) || (Segment === segments.first && segments.first.offset)
+      segments.each do |segment|
+        SegmentAnnotation.setup(segment, :type => type.to_s) unless SegmentAnnotation === segment && segment.type
+      end
+      docid = self.docid
+      segments.each{|s| s.docid = docid if s.docid.nil? }
+      segments
+    end
+  end
+  def self.define_multiple(type, &block)
+    send :property, type => :multiple do |list|
+      doc_segments = self.instance_exec list, &block
+      doc_segments = doc_segments.chunked_values_at(list) if Hash === doc_segments
+      doc_segments.each_with_index do |segments,i|
+        next if segments.nil?
+        document = list[i]
+        Segment.align(document, segments) unless segments.nil? || segments.empty? || (Segment === segments && segments.offset) || (Segment === segments.first && segments.first.offset)
+        segments.each do |segment|
+          SegmentAnnotation.setup(segment, :type => type.to_s) unless SegmentAnnotation === segment && segment.type
+        end
+        docid = document.docid
+        segments.each{|s| s.docid = docid if s.docid.nil? }
+        segments
+      end
+    end
+  end
+end

data/lib/rbbt/document/corpus.rb ADDED

@@ -0,0 +1,61 @@
+require 'rbbt-util'
+module Document::Corpus
+  def self.setup(corpus)
+    corpus.extend Document::Corpus unless Document::Corpus === corpus
+    corpus.extend Persist::TSVAdapter unless Persist::TSVAdapter === corpus
+    corpus
+  end
+  def add_document(document)
+    docid = document.docid
+    return document if self.include?(docid)
+    self.write_and_close do
+      self[docid] = document
+    end
+  end
+  def docids(prefix)
+    prefix += ":" unless prefix[-1] == ":"
+    docids = self.read_and_close do
+      self.prefix(prefix)
+    end
+    DocID.setup(docids, :corpus => self)
+  end
+  def documents(prefix)
+    self.docids(prefix).document
+  end
+  def [](*args)
+    docid, *rest = args
+    res = self.read_and_close do
+      super(*args)
+    end
+    return res if args.length > 1
+    namespace, id, type  = docid.split(":")
+    if res.nil?
+      if Document::Corpus.claims.include?(namespace.to_s)
+        res = self.instance_exec(id, type, &Document::Corpus.claims[namespace.to_s])
+      end
+    end
+    Document.setup(res, namespace, id, type, self) unless res.nil?
+    res
+  end
+  class << self
+    attr_accessor :claims
+    def claim(namespace, &block)
+      @claims = {}
+      @claims[namespace.to_s] = block
+    end
+  end
+end

data/lib/rbbt/document/corpus/pubmed.rb ADDED

@@ -0,0 +1,33 @@
+require 'rbbt/sources/pubmed'
+module Document::Corpus
+  def add_pmid(pmid, type = nil)
+    pmids = Array === pmid ? pmid : [pmid]
+    type = nil if String === type and type.empty?
+    res = PubMed.get_article(pmids).collect do |pmid, article|
+      document = if type.nil? || type.to_sym == :abstract
+                   Document.setup(article.abstract || "", "PMID", pmid, :abstract, self, :corpus => self)
+                 elsif type.to_sym == :title
+                   Document.setup(article.title, :PMID, pmid, :title, self)
+                 else
+                   raise "No FullText available for #{ pmid }" if article.full_text.nil?
+                   Document.setup(article.full_text, :PMID, pmid, :fulltext, self, :corpus => self)
+                 end
+      Log.debug "Loading pmid #{pmid}"
+      add_document(document)
+    end
+    Document.setup(res)
+  end
+  def add_pubmed_query(query, max = 3000, type = nil)
+    pmids = PubMed.query(query, max)
+    add_pmid(pmids, type)
+  end
+  self.claim "PMID" do |id, type|
+    Log.debug "Claiming #{id}"
+    self.add_pmid(id, type).first
+  end
+end

data/lib/rbbt/ner/NER.rb CHANGED

@@ -1,6 +1,6 @@
-require 'rbbt/text/segment'
-require 'rbbt/text/segment/named_entity'
-require 'rbbt/text/segment/segmented'
+require 'rbbt/segment'
+require 'rbbt/segment/named_entity'
+require 'rbbt/segment/segmented'
 class NER
   def entities(text, protect = false, *args)

data/lib/rbbt/ner/abner.rb CHANGED

@@ -1,7 +1,7 @@
 require 'rbbt'
 require 'rjb'
 require 'rbbt/resource'
-require 'rbbt/text/segment'
+require 'rbbt/segment'
 require 'rbbt/ner/NER'
 # Offers a Ruby interface to the Abner Named Entity Recognition Package

data/lib/rbbt/ner/banner.rb CHANGED

@@ -1,7 +1,7 @@
 require 'rbbt'
 require 'rjb'
 require 'rbbt/ner/NER'
-require 'rbbt/text/segment'
+require 'rbbt/segment'
 # Offers a Ruby interface to the Banner Named Entity Recognition Package
 # in Java. Banner[http://banner.sourceforge.net/].

data/lib/rbbt/ner/brat.rb CHANGED

@@ -1,4 +1,4 @@
-require 'rbbt/text/segment/named_entity'
+require 'rbbt/segment/named_entity'
 require 'rbbt/text/segment/relationship'
 module Brat
   Rbbt.claim Rbbt.software.opt.Brat, :install, "https://github.com/nlplab/brat.git"

data/lib/rbbt/ner/chemical_tagger.rb CHANGED

@@ -1,6 +1,5 @@
 require 'rbbt'
 require 'rjb'
-require 'rbbt/text/segment'
 require 'rbbt/ner/NER'
 require 'rbbt/util/log'
@@ -8,7 +7,7 @@ class ChemicalTagger < NER
   Rbbt.claim Rbbt.software.opt.ChemicalTagger, :install, Rbbt.share.install.software.ChemicalTagger.find
   def self.init
-    ENV["CLASSPATH"] = ENV["CLASSPATH"].split(":").reverse * ":"
+    ENV["CLASSPATH"] = [ENV["CLASSPATH"].split(":"), Rbbt.software.opt.ChemicalTagger.produce.glob("*.jar").first].reverse * ":"
     Rjb::load(nil, jvmargs = ['-Xms1G','-Xmx2G']) unless Rjb.loaded?
     @@RbbtChemicalTagger ||= Rjb::import('RbbtChemicalTagger')
   end

data/lib/rbbt/ner/g_norm_plus.rb CHANGED

@@ -1,4 +1,6 @@
 require 'rbbt-util'
+require 'rbbt/segment'
+require 'rbbt/segment/named_entity'
 module GNormPlus
   Rbbt.claim Rbbt.software.opt.GNormPlus, :install do
@@ -10,35 +12,39 @@ module GNormPlus
   end
   CONFIG =<<-EOF
 #===Annotation
 #Attribution setting:
 #FocusSpecies = Taxonomy ID
-#	All: All species
-#	9606: Human
-#	4932: yeast
-#	7227: Fly
-#	10090: Mouse
-#	10116: Rat
-#	7955: Zebrafish
-#	3702: Arabidopsis thaliana
+#       All: All species
+#       9606: Human
+#       4932: yeast
+#       7227: Fly
+#       10090: Mouse
+#       10116: Rat
+#       7955: Zebrafish
+#       3702: Arabidopsis thaliana
 #open: True
 #close: False
 [Focus Species]
-	FocusSpecies = All
+	FocusSpecies = 9606
+	FilterAntibody = False
 [Dictionary & Model]
 	DictionaryFolder = ./Dictionary
 	GNRModel = ./Dictionary/GNR.Model
 	SCModel = ./Dictionary/SimConcept.Model
 	GeneIDMatch = True
+	HomologeneID = False
 	Normalization2Protein = False
+	ShowUnNormalizedMention = False
+	IgnoreNER = False
 	DeleteTmp = True
 EOF
   def self.process(texts)
     TmpFile.with_file do |tmpdir|
       Open.mkdir tmpdir
       Misc.in_dir tmpdir do
         Open.ln_s Rbbt.software.opt.GNormPlus.Dictionary.find, '.'
         Open.ln_s Rbbt.software.opt.GNormPlus["BioC.dtd"].find, '.'
@@ -49,13 +55,18 @@ EOF
         Open.mkdir 'tmp'
         texts.each do |name,text|
+          text = Misc.fixutf8(text)
+          text = text.gsub('|', '#').gsub("\n", " ").gsub(/\t/,' ')
           Open.write("input/#{name}.txt") do |f|
             f.puts "#{name}|a|" << text
             f.puts
           end
         end
         Open.write('config', CONFIG)
-        CMD.cmd_log("java -Xmx20G -Xms20G  -jar '#{Rbbt.software.opt.GNormPlus.find}/GNormPlus.jar' 'input' 'output' 'config'")
+        CMD.cmd_log("java -Xmx20G -Xms20G  -jar '#{Rbbt.software.opt.GNormPlus.produce.find}/GNormPlus.jar' 'input' 'output' 'config'")
         if texts.respond_to? :key_field
           key_field = texts.key_field
@@ -65,13 +76,32 @@ EOF
         tsv = TSV.setup({}, :key_field => key_field, :fields => ["Entities"], :type => :flat)
         Dir.glob("output/*.txt").each do |file|
           name = File.basename(file).sub(".txt",'')
-          entities = Open.read(file).split("\n")[1..-1].collect{|l| l.gsub(':', '.').split("\t")[1..-1] * ":"}
+          entities = Open.read(file).split("\n")[1..-1].collect{|l| l.gsub(':', '·').split("\t")[1..-1] * ":"}
           tsv[name] = entities
         end
+        raise "GNormPlus failed: no results found" if tsv.size == 0 && texts.size > 0
         tsv
       end
     end
   end
+  def self.entities(texts)
+    res = {}
+    process(texts).each do |name, entities|
+      segments = entities.collect do |entity|
+        start, eend, literal, type, code = entity.split(":")
+        literal.gsub!('·',':')
+        NamedEntity.setup(literal, :offset => start.to_i, :entity_type => type, :code => code)
+      end
+      res[name] = segments
+    end
+    res
+  end
 end
 if __FILE__ == $0