rbbt-text 0.6.3 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/rbbt/corpus/document.rb +1 -0
- data/lib/rbbt/entity/document.rb +62 -18
- data/lib/rbbt/ner/abner.rb +6 -3
- data/lib/rbbt/ner/banner.rb +10 -7
- data/lib/rbbt/ner/chemical_tagger.rb +5 -3
- data/lib/rbbt/ner/finder.rb +60 -0
- data/lib/rbbt/ner/linnaeus.rb +38 -0
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +42 -48
- data/lib/rbbt/ner/oscar3.rb +9 -6
- data/lib/rbbt/ner/oscar4.rb +21 -7
- data/lib/rbbt/ner/rnorm.rb +57 -33
- data/lib/rbbt/ner/rnorm/cue_index.rb +4 -3
- data/lib/rbbt/ner/rnorm/tokens.rb +10 -4
- data/lib/rbbt/ner/segment.rb +19 -8
- data/lib/rbbt/ner/segment/docid.rb +46 -0
- data/lib/rbbt/ner/segment/named_entity.rb +1 -1
- data/lib/rbbt/ner/segment/transformed.rb +5 -3
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +22 -1
- data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +74 -0
- data/share/install/software/Linnaeus +21 -0
- data/share/install/software/OpenNLP +12 -0
- data/share/rnorm/tokens_default +1 -2
- data/test/rbbt/entity/test_document.rb +66 -0
- data/test/rbbt/ner/segment/test_transformed.rb +10 -0
- data/test/rbbt/ner/test_finder.rb +34 -0
- data/test/rbbt/ner/test_linnaeus.rb +16 -0
- data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +22 -0
- data/test/rbbt/ner/test_oscar4.rb +3 -3
- data/test/rbbt/ner/test_rnorm.rb +3 -3
- data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +45 -0
- data/test/test_helper.rb +1 -1
- metadata +101 -99
- data/test/rbbt/corpus/test_corpus.rb +0 -99
- data/test/rbbt/corpus/test_document.rb +0 -236
data/lib/rbbt/corpus/document.rb
CHANGED
data/lib/rbbt/entity/document.rb
CHANGED
@@ -1,4 +1,5 @@
 require 'rbbt/entity'
+require 'rbbt/ner/segment/docid'
 
 module Document
   extend Entity
@@ -7,34 +8,77 @@ module Document
     attr_accessor :corpus
   end
 
-
+  attr_accessor :docid
+
+  property :docid => :single2array do |*args|
+    @docid ||= if self =~ /^text:/
+                 self
+               else
+                 ["text", Misc.digest(self.inspect)] * ":"
+               end
+    @docid
+  end
+
+  #property :annotation_id => :single2array do |*args|
+  #  docid(*args)
+  #end
+
+  property :annotation_id => :both do |*args|
+    if Array === self
+      Misc.hash2md5(info.merge(:self => self))
+    else
+      docid(*args)
+    end
+  end
+
+  property :_get_text => :single do
+    self
+  end
+
+  property :text => :array2single do |*args|
     article_text = {}
     missing = []
 
-
+    if Document.corpus.nil?
+      self._get_text(*args)
+    else
+
    Document.corpus.read if Document.corpus.respond_to? :read
-
-
-
-
+      self.each do |doc|
+
+        case
+        when Document.corpus.include?(doc)
+          article_text[doc] = Document.corpus[doc]
+        when Document.corpus.include?(doc.docid(*args))
+          article_text[doc] = Document.corpus[doc.docid(*args)]
+        else
+          missing << doc
+        end
+
    end
-
+      Document.corpus.close if Document.corpus.respond_to? :close
+
+      if missing.any?
+        missing.first.annotate missing
+        missing_text = Misc.process_to_hash(missing){|list| list._get_text(*args)}
 
-
-
-    missing_text = Misc.process_to_hash(missing){|list| list._get_text}
+        Misc.lock(Document.corpus.respond_to?(:persistence_path) ? Document.corpus.persistence_path : nil) do
+          Document.corpus.write if Document.corpus.respond_to? :write and not Document.corpus.write?
 
-
-
-
-
-
+          missing_text.each do |doc, doc_text|
+            doc = missing.first.annotate doc.dup
+            Document.corpus[doc.docid(*args)] = doc_text
+            article_text[doc] = doc_text
+          end
+
+          Document.corpus.close if Document.corpus.respond_to? :close
    end
-
+      end
 
-
+      article_text.values_at *self
+    end
  end
 
 end
+
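The reworked text property falls back to _get_text when no corpus is registered and otherwise reads through Document.corpus, caching any missing documents under their digest-based docid. A minimal usage sketch, assuming the usual Entity setup helper and a plain Hash standing in for the corpus (real corpora are typically persisted stores that also respond to read/write/close):

    require 'rbbt/entity/document'

    doc = "TP53 is a tumor suppressor."
    Document.setup(doc)

    Document.corpus = {}     # Hash stand-in: supports include?, [] and []=
    doc.docid                # => "text:<digest>" for strings without an id
    doc.text                 # caches the missing document, then returns its text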
data/lib/rbbt/ner/abner.rb
CHANGED
@@ -10,13 +10,16 @@ class Abner < NER
 
   Rbbt.claim Rbbt.software.opt.ABNER, :install, Rbbt.share.install.software.ABNER.find
 
-
-
-
+  def self.init
+    @@JFile ||= Rjb::import('java.io.File')
+    @@Tagger ||= Rjb::import('abner.Tagger')
+    @@Trainer ||= Rjb::import('abner.Trainer')
+  end
 
   # If modelfile is present a custom trained model can be used,
   # otherwise, the default BioCreative model is used.
   def initialize(modelfile=nil)
+    Abner.init
     if modelfile == nil
       @tagger = @@Tagger.new(@@Tagger.BIOCREATIVE)
     else
data/lib/rbbt/ner/banner.rb
CHANGED
@@ -9,13 +9,15 @@ class Banner < NER
 
   Rbbt.claim Rbbt.software.opt.BANNER, :install, Rbbt.share.install.software.BANNER.find
 
-
-
-
-
-
-
-
+  def self.init
+    @@JFile ||= Rjb::import('java.io.File')
+    @@SimpleTokenizer ||= Rjb::import('banner.tokenization.SimpleTokenizer')
+    @@CRFTagger ||= Rjb::import('banner.tagging.CRFTagger')
+    @@ParenthesisPostProcessor ||= Rjb::import('banner.processing.ParenthesisPostProcessor')
+    @@HeppleTagger ||= Rjb::import('dragon.nlp.tool.HeppleTagger')
+    @@Sentence ||= Rjb::import('banner.Sentence')
+    @@EngLemmatiser ||= Rjb::import('dragon.nlp.tool.lemmatiser.EngLemmatiser')
+  end
 
 
 
@@ -26,6 +28,7 @@ class Banner < NER
     lemmadir = Rbbt.software.opt.BANNER.nlpdata.lemmatiser.find,
     taggerdir = Rbbt.software.opt.BANNER.nlpdata.tagger.find
   )
+    Banner.init
 
     @tokenizer = @@SimpleTokenizer.new
 
data/lib/rbbt/ner/chemical_tagger.rb CHANGED
@@ -7,11 +7,13 @@ require 'rbbt/util/log'
 class ChemicalTagger < NER
   Rbbt.claim Rbbt.software.opt.ChemicalTagger, :install, Rbbt.share.install.software.ChemicalTagger.find
 
-
-
-
+  def self.init
+    Rjb::load(nil, jvmargs = ['-Xms1G','-Xmx2G']) unless Rjb.loaded?
+    @@RbbtChemicalTagger ||= Rjb::import('RbbtChemicalTagger')
+  end
 
   def self.match(text, type = nil, memm = false)
+    self.init
 
     return [] if text.nil? or text.strip.empty?
 
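Abner, Banner and ChemicalTagger all move their Rjb imports out of class-load time and into a self.init method invoked on first use, so requiring the wrappers no longer boots the JVM. A generic sketch of the pattern with a hypothetical wrapper class (SomeWrapper and some.java.Class are placeholders, not part of rbbt-text):

    require 'rjb'

    class SomeWrapper
      def self.init
        # Boot the JVM lazily and only once, with explicit heap limits
        Rjb::load(nil, ['-Xms1G', '-Xmx2G']) unless Rjb.loaded?
        # Cache the imported Java class so repeated calls are cheap
        @@SomeClass ||= Rjb::import('some.java.Class')
      end

      def initialize
        SomeWrapper.init
        @instance = @@SomeClass.new
      end
    end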
data/lib/rbbt/ner/finder.rb ADDED
@@ -0,0 +1,60 @@
+require 'rbbt/ner/rnorm'
+
+class Finder
+
+  if defined? Entity
+    module Match
+      extend Entity
+
+      self.annotation :format
+      self.annotation :namespace
+      self.annotation :score
+    end
+  end
+
+  class Instance
+    attr_accessor :namespace, :format, :normalizer
+    def initialize(path, open_options = {})
+      if TSV === path
+        @namespace = path.namespace
+        @format = path.key_field
+        @normalizer = Normalizer.new(path)
+      else
+        open_options = Misc.add_defaults open_options, :type => :flat
+        parser = TSV::Parser.new(Open.open(Path === path ? path.find : path), open_options)
+        @namespace = parser.namespace
+        @format = parser.key_field
+        @normalizer = Normalizer.new(Path === path ? path.tsv(open_options) : TSV.open(path, open_options))
+      end
+    end
+
+    def find(name)
+      candidates = @normalizer.match(name)
+      if defined? Finder::Match
+        candidates.collect{|c|
+          Finder::Match.setup(c.dup, @format, @namespace, @normalizer.token_score(c, name))
+        }
+      else
+        candidates
+      end
+    end
+  end
+
+  attr_accessor :instances
+  def initialize(path = nil, open_options = {})
+    @instances ||= []
+    @instances << Finder::Instance.new(path, open_options) unless path.nil?
+  end
+
+  def add_instance(path, open_options = {})
+    @instances << Finder::Instance.new(path, open_options)
+  end
+
+  def find(name)
+    @instances.inject([]) do |acc,instance|
+      acc += instance.find(name)
+    end
+  end
+
+end
+
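Finder aggregates one or more Normalizer-backed lexicons and queries them in sequence. A hedged usage sketch, where lexicon.tsv and second_lexicon.tsv are hypothetical flat TSV files of identifier followed by synonyms:

    require 'rbbt/ner/finder'

    finder = Finder.new('lexicon.tsv', :type => :flat)
    finder.add_instance('second_lexicon.tsv')

    matches = finder.find("caffeine")
    # With Entity loaded, each match is a Finder::Match annotated with
    # :format, :namespace and :score; otherwise plain candidate codes.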
data/lib/rbbt/ner/linnaeus.rb ADDED
@@ -0,0 +1,38 @@
+require 'rjb'
+require 'rbbt'
+require 'rbbt/ner/segment/named_entity'
+module Linnaeus
+
+  Rbbt.claim Rbbt.software.opt.Linnaeus, :install, Rbbt.share.install.software.Linnaeus.find
+
+
+
+  ARGS = ["--properties", Rbbt.software.opt.Linnaeus["species-proxy/properties.conf"].find]
+
+  def self.init
+    begin
+      Rjb::load(nil, jvmargs = ['-Xms2G','-Xmx4G']) unless Rjb.loaded?
+      @@ArgParser = Rjb::import('martin.common.ArgParser')
+      @@Args = @@ArgParser.new(ARGS)
+      @@Loggers = Rjb::import('martin.common.Loggers')
+      @@Logger = @@Loggers.getDefaultLogger(@@Args)
+      @@EntityTagger = Rjb::import('uk.ac.man.entitytagger.EntityTagger')
+      @@Matcher = @@EntityTagger.getMatcher(@@Args, @@Logger)
+    rescue
+      if $!.message =~ /heap space/i
+        Log.warn "Heap Space seems too low. Make sure Linnaeus is loaded before other Java wrappers so that it has the chance to init the Java Bridge with sufficient heap space"
+      end
+      raise $!
+    end
+  end
+
+  def self.match(text)
+
+    init unless defined? @@Matcher
+
+    @@Matcher.match(text).toArray().collect do |mention|
+      NamedEntity.setup(mention.text(), mention.start(), "Organism", mention.ids(), mention.probabilities())
+    end
+  end
+end
+
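The first call to Linnaeus.match triggers self.init, which boots the JVM with a 2-4G heap; hence the warning to load Linnaeus before other Rjb-based wrappers. A minimal sketch (the exact mentions returned depend on the installed species dictionary):

    require 'rbbt/ner/linnaeus'

    mentions = Linnaeus.match("Human and mouse p53 differ.")
    mentions.each do |mention|
      # Each mention is a NamedEntity of type "Organism" carrying the
      # species ids and probabilities reported by the tagger
      puts [mention, mention.offset, mention.type].inspect
    end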
data/lib/rbbt/ner/ngram_prefix_dictionary.rb CHANGED
@@ -6,24 +6,26 @@ require 'rbbt/ner/segment/token'
 require 'rbbt/ner/NER'
 require 'inline'
 
+
 # This code was adapted from Ashish Tendulkar (ASK MARTIN)
 class NGramPrefixDictionary < NER
   STOP_LETTERS = %w(\' " ( ) { } [ ] - ? ! < ; : > . ,)
-  STOP_LETTER_CHAR_VALUES = STOP_LETTERS.collect{|l| l[0]}
-
-
+  STOP_LETTER_CHAR_VALUES = STOP_LETTERS.collect{|l| l[0]} + ["\n", "\r", " "].collect{|l| l[0]}
+  LETTER_REGEXP = Regexp.compile(/[#{Regexp.quote((STOP_LETTERS + ["\n", "\r", " "]) * "")}]/)
+
+  inline do |builder|
 
-
+    builder.c_raw_singleton <<-EOC
 int is_stop_letter(char letter)
 {
 
-  if( letter == ' ' || #{STOP_LETTERS.collect{|l| "letter == '#{l}' "} * "||"} ){ return 1;}
+  if( letter == ' ' || letter == '\\n' || letter == '\\r' || #{STOP_LETTERS.collect{|l| "letter == '#{l}' "} * "||"} ){ return 1;}
 
   return 0;
 }
-
+    EOC
 
-
+    builder.c_singleton <<-EOC
 VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
 {
   int length_cmp = RSTRING_LEN(cmp);
@@ -38,48 +40,59 @@ VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
 
   return Qfalse;
 }
-
-end
+    EOC
   end
 
-  def self.process_stream(stream)
+  def self.process_stream(stream, case_insensitive = false)
     index = {}
+
     while line = stream.gets
       names = line.split(/\t|\|/).select{|n| not n.empty?}.compact
       code = names.shift
 
       names.each do |name|
+        name = name.downcase if case_insensitive
        ngram = name[0..2].strip
        index[ngram] ||= []
        index[ngram] << [name, code]
      end
    end
+
    index
-
  end
 
-  def self.process_hash(hash)
+  def self.process_hash(hash, case_insensitive = false)
    index = {}
+
    hash.monitor = true if hash.respond_to? :monitor
    hash.unnamed = true if hash.respond_to? :unnamed
    method = hash.respond_to?(:through)? :through : :each
+
    hash.send(method) do |code, names|
      names.each do |name|
+        name = name.downcase if case_insensitive
        ngram = name[0..2].strip
        index[ngram] ||= []
        index[ngram] << [name, code]
      end
    end
+
    index
  end
 
+
  def self.match(index, text)
+    return [] if text.nil? or text.empty?
+
    matches = []
 
    text_offset = 0
    text_length = text.length
    while (not text_offset.nil?) and text_offset < text_length
-
+      if STOP_LETTER_CHAR_VALUES.include? text[text_offset]
+        text_offset += 1
+        next
+      end
      ngram = text[text_offset..text_offset + 2].strip
 
      found = nil
@@ -88,12 +101,12 @@ VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
        diff = text_length - text_offset
        # Match with entries
        index[ngram].each do |name, code|
-          if name.length
+          if name.length <= diff
            #if piece.start_with? name and
            #  (text_offset + name.length == text_length or piece[name.length] == " "[0])
 
            if fast_start_with(text, name, text_offset)
-              found = [name, code, text_offset]
+              found = [name.dup, code, text_offset]
              break
            end
          end
@@ -101,7 +114,7 @@ VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
      end
 
      if found.nil?
-        text_offset = text.index(
+        text_offset = text.index(LETTER_REGEXP, text_offset)
        text_offset += 1 unless text_offset.nil?
      else
        matches << found
@@ -112,22 +125,24 @@ VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
    matches
  end
 
-
-
+
+  attr_accessor :index, :type, :case_insensitive
+  def initialize(file, type = nil, case_insensitive = false)
    @type = type
+    @case_insensitive = case_insensitive
    case
    when (TSV === file or Hash === file)
      Log.debug("Ngram Prefix Dictionary. Loading of lexicon hash started.")
-      @index = NGramPrefixDictionary.process_hash(file)
+      @index = NGramPrefixDictionary.process_hash(file, case_insensitive)
    when Path === file
      Log.debug("Ngram Prefix Dictionary. Loading of lexicon file started: #{ file }.")
-      @index = NGramPrefixDictionary.process_stream(file.open)
+      @index = NGramPrefixDictionary.process_stream(file.open, case_insensitive)
    when Misc.is_filename?(file)
      Log.debug("Ngram Prefix Dictionary. Loading of lexicon file started: #{ file }.")
      @index = NGramPrefixDictionary.process_stream(Open.open(file))
    when StreamIO === file
      Log.debug("Ngram Prefix Dictionary. Loading of lexicon stream started.")
-      @index = NGramPrefixDictionary.process_stream(file)
+      @index = NGramPrefixDictionary.process_stream(file, case_insensitive)
    else
      raise "Format of lexicon not understood: #{file.inspect}"
    end
@@ -136,36 +151,15 @@ VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
  end
 
  def match(text)
-    NGramPrefixDictionary.match(index, text).collect{|name, code, offset|
+    matches = NGramPrefixDictionary.match(index, (case_insensitive ? text.downcase : text)).collect{|name, code, offset|
      NamedEntity.setup(name, offset, type, code)
    }
-  end
-end
-
-if __FILE__ == $0
-  require 'rbbt/sources/jochem'
-  require 'rbbt/sources/pubmed'
-
-  texts = []
-  index = {}
-
-  texts = PubMed.get_article(PubMed.query("GB-1a", 100)).collect do |pmid, article|
-    article.text
-  end
 
-
-
-
-
-  tsv.unnamed = true
-  ner = NGramPrefixDictionary.new(tsv)
-
-  Misc.benchmark do
-    texts.each do |text|
-      ner.match(text)
+    if case_insensitive
+      matches.each{|m| m.replace(text[m.range])}
+      matches
+    else
+      matches
    end
  end
-
-
 end
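The new case_insensitive flag downcases both the lexicon names and the matched text, then restores each match's original casing through its range. A hedged sketch with a hypothetical in-memory lexicon (code => names):

    require 'rbbt/ner/ngram_prefix_dictionary'

    lexicon = { "CHEM:1" => ["Caffeine", "1,3,7-trimethylxanthine"] }

    ner = NGramPrefixDictionary.new(lexicon, "Chemical", true)
    ner.match("Effects of caffeine on sleep").each do |match|
      # match is a NamedEntity; offset and code come from the lexicon hit
      puts [match, match.offset, match.code].inspect
    end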