rbbt-text 0.6.3 → 1.0.0

@@ -254,6 +254,7 @@ class Document
     name = name.to_s
     index = segment_index(name, persist_dir)
     annotations = index[segment.range]
+    segment.segments[name] ||= {}
     segment.segments[name] = annotations
     class << segment
       self
@@ -1,4 +1,5 @@
 require 'rbbt/entity'
+require 'rbbt/ner/segment/docid'
 
 module Document
   extend Entity
@@ -7,34 +8,77 @@ module Document
     attr_accessor :corpus
   end
 
-  property :text => :array2single do
+  attr_accessor :docid
+
+  property :docid => :single2array do |*args|
+    @docid ||= if self =~ /^text:/
+                 self
+               else
+                 ["text", Misc.digest(self.inspect)] * ":"
+               end
+    @docid
+  end
+
+  #property :annotation_id => :single2array do |*args|
+  #  docid(*args)
+  #end
+
+  property :annotation_id => :both do |*args|
+    if Array === self
+      Misc.hash2md5(info.merge(:self => self))
+    else
+      docid(*args)
+    end
+  end
+
+  property :_get_text => :single do
+    self
+  end
+
+  property :text => :array2single do |*args|
     article_text = {}
     missing = []
 
-    self.each do |doc|
+    if Document.corpus.nil?
+      self._get_text(*args)
+    else
+
       Document.corpus.read if Document.corpus.respond_to? :read
-      if Document.corpus.include?(doc)
-        article_text[doc] = Document.corpus[doc]
-      else
-        missing << doc
+      self.each do |doc|
+
+        case
+        when Document.corpus.include?(doc)
+          article_text[doc] = Document.corpus[doc]
+        when Document.corpus.include?(doc.docid(*args))
+          article_text[doc] = Document.corpus[doc.docid(*args)]
+        else
+          missing << doc
+        end
+
       end
-    end
+      Document.corpus.close if Document.corpus.respond_to? :close
+
+      if missing.any?
+        missing.first.annotate missing
+        missing_text = Misc.process_to_hash(missing){|list| list._get_text(*args)}
 
-    if missing.any?
-      missing.first.annotate missing
-      missing_text = Misc.process_to_hash(missing){|list| list._get_text}
+        Misc.lock(Document.corpus.respond_to?(:persistence_path) ? Document.corpus.persistence_path : nil) do
+          Document.corpus.write if Document.corpus.respond_to? :write and not Document.corpus.write?
 
-      Misc.lock Document.corpus.persistence_path do
-        Document.corpus.write if Document.corpus.respond_to? :write
-        missing_text.each do |doc, text|
-          article_text[doc] = text
-          Document.corpus[doc] = text
+          missing_text.each do |doc, doc_text|
+            doc = missing.first.annotate doc.dup
+            Document.corpus[doc.docid(*args)] = doc_text
+            article_text[doc] = doc_text
+          end
+
+          Document.corpus.close if Document.corpus.respond_to? :close
         end
-        Document.corpus.read if Document.corpus.respond_to? :read
+
       end
-    end
 
-    article_text.values_at *self
+      article_text.values_at *self
+    end
   end
 
 end
+
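Note: the net effect of this hunk is that corpus entries are now keyed by a stable document id ("text:" plus a digest of the string) instead of by the raw text, and the corpus is only consulted when one is configured. A minimal round-trip sketch, assuming a Hash-like corpus and the Entity setup API from rbbt-util (the require path is illustrative):

    require 'rbbt/text/document'   # illustrative load path

    Document.corpus = {}           # any Hash-like store; persisted stores also work

    docs = ["TP53 is a tumor suppressor."]
    Document.setup docs

    docs.first.docid               # => "text:<digest>", memoized in @docid
    docs.text                      # stores missing texts under their docid, then returns them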
@@ -10,13 +10,16 @@ class Abner < NER
 
   Rbbt.claim Rbbt.software.opt.ABNER, :install, Rbbt.share.install.software.ABNER.find
 
-  @@JFile = Rjb::import('java.io.File')
-  @@Tagger = Rjb::import('abner.Tagger')
-  @@Trainer = Rjb::import('abner.Trainer')
+  def self.init
+    @@JFile ||= Rjb::import('java.io.File')
+    @@Tagger ||= Rjb::import('abner.Tagger')
+    @@Trainer ||= Rjb::import('abner.Trainer')
+  end
 
   # If modelfile is present a custom trained model can be used,
   # otherwise, the default BioCreative model is used.
   def initialize(modelfile=nil)
+    Abner.init
     if modelfile == nil
       @tagger = @@Tagger.new(@@Tagger.BIOCREATIVE)
     else
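Note: moving the Rjb imports out of the class body and into a memoized self.init means simply requiring the file no longer boots the JVM; the bridge starts on first instantiation. The same pattern is applied to Banner and ChemicalTagger below. A generic sketch of the idiom (LazyWrapper is a placeholder class):

    require 'rjb'

    class LazyWrapper
      def self.init
        # ||= makes init idempotent, so repeated calls are cheap
        @@JFile ||= Rjb::import('java.io.File')
      end

      def initialize
        LazyWrapper.init   # JVM starts here, on first use
      end
    end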
@@ -9,13 +9,15 @@ class Banner < NER
 
   Rbbt.claim Rbbt.software.opt.BANNER, :install, Rbbt.share.install.software.BANNER.find
 
-  @@JFile = Rjb::import('java.io.File')
-  @@SimpleTokenizer = Rjb::import('banner.tokenization.SimpleTokenizer')
-  @@CRFTagger = Rjb::import('banner.tagging.CRFTagger')
-  @@ParenthesisPostProcessor = Rjb::import('banner.processing.ParenthesisPostProcessor')
-  @@HeppleTagger = Rjb::import('dragon.nlp.tool.HeppleTagger')
-  @@Sentence = Rjb::import('banner.Sentence')
-  @@EngLemmatiser = Rjb::import('dragon.nlp.tool.lemmatiser.EngLemmatiser')
+  def self.init
+    @@JFile ||= Rjb::import('java.io.File')
+    @@SimpleTokenizer ||= Rjb::import('banner.tokenization.SimpleTokenizer')
+    @@CRFTagger ||= Rjb::import('banner.tagging.CRFTagger')
+    @@ParenthesisPostProcessor ||= Rjb::import('banner.processing.ParenthesisPostProcessor')
+    @@HeppleTagger ||= Rjb::import('dragon.nlp.tool.HeppleTagger')
+    @@Sentence ||= Rjb::import('banner.Sentence')
+    @@EngLemmatiser ||= Rjb::import('dragon.nlp.tool.lemmatiser.EngLemmatiser')
+  end
 
 
 
@@ -26,6 +28,7 @@ class Banner < NER
               lemmadir = Rbbt.software.opt.BANNER.nlpdata.lemmatiser.find,
               taggerdir = Rbbt.software.opt.BANNER.nlpdata.tagger.find
              )
+    Banner.init
 
     @tokenizer = @@SimpleTokenizer.new
 
@@ -7,11 +7,13 @@ require 'rbbt/util/log'
 class ChemicalTagger < NER
   Rbbt.claim Rbbt.software.opt.ChemicalTagger, :install, Rbbt.share.install.software.ChemicalTagger.find
 
-  Rjb::load(nil, jvmargs = ['-Xms128m','-Xmx2048m'])
-
-  @@RbbtChemicalTagger = Rjb::import('RbbtChemicalTagger')
+  def self.init
+    Rjb::load(nil, jvmargs = ['-Xms1G','-Xmx2G']) unless Rjb.loaded?
+    @@RbbtChemicalTagger ||= Rjb::import('RbbtChemicalTagger')
+  end
 
   def self.match(text, type = nil, memm = false)
+    self.init
 
     return [] if text.nil? or text.strip.empty?
 
@@ -0,0 +1,60 @@
+require 'rbbt/ner/rnorm'
+
+class Finder
+
+  if defined? Entity
+    module Match
+      extend Entity
+
+      self.annotation :format
+      self.annotation :namespace
+      self.annotation :score
+    end
+  end
+
+  class Instance
+    attr_accessor :namespace, :format, :normalizer
+    def initialize(path, open_options = {})
+      if TSV === path
+        @namespace = path.namespace
+        @format = path.key_field
+        @normalizer = Normalizer.new(path)
+      else
+        open_options = Misc.add_defaults open_options, :type => :flat
+        parser = TSV::Parser.new(Open.open(Path === path ? path.find : path), open_options)
+        @namespace = parser.namespace
+        @format = parser.key_field
+        @normalizer = Normalizer.new(Path === path ? path.tsv(open_options) : TSV.open(path, open_options))
+      end
+    end
+
+    def find(name)
+      candidates = @normalizer.match(name)
+      if defined? Finder::Match
+        candidates.collect{|c|
+          Finder::Match.setup(c.dup, @format, @namespace, @normalizer.token_score(c, name))
+        }
+      else
+        candidates
+      end
+    end
+  end
+
+  attr_accessor :instances
+  def initialize(path = nil, open_options = {})
+    @instances ||= []
+    @instances << Finder::Instance.new(path, open_options) unless path.nil?
+  end
+
+  def add_instance(path, open_options = {})
+    @instances << Finder::Instance.new(path, open_options)
+  end
+
+  def find(name)
+    @instances.inject([]) do |acc,instance|
+      acc += instance.find(name)
+    end
+  end
+
+end
+
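Note: this new Finder class wraps one rnorm Normalizer per lexicon and concatenates candidate matches across all registered instances. A minimal usage sketch, assuming flat TSV lexicons whose key field is the entity code (load path and file names are illustrative):

    require 'rbbt/ner/finder'   # illustrative load path

    finder = Finder.new('lexicon.tsv')
    finder.add_instance('synonyms.tsv')

    finder.find("p53").each do |match|
      # when Entity is loaded, each hit is a Finder::Match annotated
      # with :format, :namespace and :score
      puts [match, match.format, match.score] * "\t"
    end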
@@ -0,0 +1,38 @@
+require 'rjb'
+require 'rbbt'
+require 'rbbt/ner/segment/named_entity'
+module Linnaeus
+
+  Rbbt.claim Rbbt.software.opt.Linnaeus, :install, Rbbt.share.install.software.Linnaeus.find
+
+
+
+  ARGS = ["--properties", Rbbt.software.opt.Linnaeus["species-proxy/properties.conf"].find]
+
+  def self.init
+    begin
+      Rjb::load(nil, jvmargs = ['-Xms2G','-Xmx4G']) unless Rjb.loaded?
+      @@ArgParser = Rjb::import('martin.common.ArgParser')
+      @@Args = @@ArgParser.new(ARGS)
+      @@Loggers = Rjb::import('martin.common.Loggers')
+      @@Logger = @@Loggers.getDefaultLogger(@@Args)
+      @@EntityTagger = Rjb::import('uk.ac.man.entitytagger.EntityTagger')
+      @@Matcher = @@EntityTagger.getMatcher(@@Args, @@Logger)
+    rescue
+      if $!.message =~ /heap space/i
+        Log.warn "Heap Space seems too low. Make sure Linnaeus is loaded before other Java wrappers so that it has the chance to init the Java Bridge with sufficient heap space"
+      end
+      raise $!
+    end
+  end
+
+  def self.match(text)
+
+    init unless defined? @@Matcher
+
+    @@Matcher.match(text).toArray().collect do |mention|
+      NamedEntity.setup(mention.text(), mention.start(), "Organism", mention.ids(), mention.probabilities())
+    end
+  end
+end
+
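Note: Linnaeus lazily boots the JVM with a 2-4 GB heap on first match. Since Rjb can only start one JVM per process and later Rjb::load calls are skipped once the bridge is up, whichever wrapper loads first fixes the heap size for all of them; that is what the Log.warn above is about. A usage sketch, assuming the software claim has been installed (load path is illustrative):

    require 'rbbt/ner/linnaeus'   # illustrative load path

    mentions = Linnaeus.match("Homo sapiens and Mus musculus share many genes.")
    mentions.each do |mention|
      # each mention is a NamedEntity carrying text, offset, type and code
      puts [mention, mention.offset, mention.type, mention.code] * "\t"
    end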
@@ -6,24 +6,26 @@ require 'rbbt/ner/segment/token'
 require 'rbbt/ner/NER'
 require 'inline'
 
+
 # This code was adapted from Ashish Tendulkar (ASK MARTIN)
 class NGramPrefixDictionary < NER
   STOP_LETTERS = %w(\' " ( ) { } [ ] - ? ! < ; : > . ,)
-  STOP_LETTER_CHAR_VALUES = STOP_LETTERS.collect{|l| l[0]}
-  class << self
-    inline do |builder|
+  STOP_LETTER_CHAR_VALUES = STOP_LETTERS.collect{|l| l[0]} + ["\n", "\r", " "].collect{|l| l[0]}
+  LETTER_REGEXP = Regexp.compile(/[#{Regexp.quote((STOP_LETTERS + ["\n", "\r", " "]) * "")}]/)
+
+  inline do |builder|
 
-    builder.c_raw <<-EOC
+    builder.c_raw_singleton <<-EOC
 int is_stop_letter(char letter)
 {
 
-  if( letter == ' ' || #{STOP_LETTERS.collect{|l| "letter == '#{l}' "} * "||"} ){ return 1;}
+  if( letter == ' ' || letter == '\\n' || letter == '\\r' || #{STOP_LETTERS.collect{|l| "letter == '#{l}' "} * "||"} ){ return 1;}
 
   return 0;
 }
-EOC
+    EOC
 
-    builder.c <<-EOC
+    builder.c_singleton <<-EOC
 VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
 {
   int length_cmp = RSTRING_LEN(cmp);
@@ -38,48 +40,59 @@ VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
 
   return Qfalse;
 }
-EOC
-    end
+    EOC
   end
 
-  def self.process_stream(stream)
+  def self.process_stream(stream, case_insensitive = false)
    index = {}
+
     while line = stream.gets
       names = line.split(/\t|\|/).select{|n| not n.empty?}.compact
       code = names.shift
 
       names.each do |name|
+        name = name.downcase if case_insensitive
         ngram = name[0..2].strip
         index[ngram] ||= []
         index[ngram] << [name, code]
       end
     end
+
     index
-
   end
 
-  def self.process_hash(hash)
+  def self.process_hash(hash, case_insensitive = false)
     index = {}
+
     hash.monitor = true if hash.respond_to? :monitor
     hash.unnamed = true if hash.respond_to? :unnamed
     method = hash.respond_to?(:through)? :through : :each
+
     hash.send(method) do |code, names|
       names.each do |name|
+        name = name.downcase if case_insensitive
         ngram = name[0..2].strip
         index[ngram] ||= []
        index[ngram] << [name, code]
       end
     end
+
     index
   end
 
+
   def self.match(index, text)
+    return [] if text.nil? or text.empty?
+
     matches = []
 
     text_offset = 0
     text_length = text.length
     while (not text_offset.nil?) and text_offset < text_length
-      text_offset += 1 if STOP_LETTER_CHAR_VALUES.include? text[text_offset]
+      if STOP_LETTER_CHAR_VALUES.include? text[text_offset]
+        text_offset += 1
+        next
+      end
       ngram = text[text_offset..text_offset + 2].strip
 
       found = nil
@@ -88,12 +101,12 @@ VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
       diff = text_length - text_offset
       # Match with entries
       index[ngram].each do |name, code|
-        if name.length < diff
+        if name.length <= diff
           #if piece.start_with? name and
           #  (text_offset + name.length == text_length or piece[name.length] == " "[0])
 
           if fast_start_with(text, name, text_offset)
-            found = [name, code, text_offset]
+            found = [name.dup, code, text_offset]
             break
           end
         end
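Note: the change from < to <= fixes an off-by-one that skipped mentions flush with the end of the text: when a lexicon name exactly fills the remaining characters, name.length equals diff and the old comparison rejected it. For example:

    text = "induced by ADP"
    name = "ADP"
    diff = text.length - text.index(name)   # => 3 == name.length, now accepted

The name.dup lets the match be annotated (or case-restored) later without mutating the shared lexicon string.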
@@ -101,7 +114,7 @@ VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
       end
 
       if found.nil?
-        text_offset = text.index(" ", text_offset)
+        text_offset = text.index(LETTER_REGEXP, text_offset)
         text_offset += 1 unless text_offset.nil?
       else
         matches << found
@@ -112,22 +125,24 @@ VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
     matches
   end
 
-  attr_accessor :index, :type
-  def initialize(file, type = nil)
+
+  attr_accessor :index, :type, :case_insensitive
+  def initialize(file, type = nil, case_insensitive = false)
     @type = type
+    @case_insensitive = case_insensitive
     case
     when (TSV === file or Hash === file)
       Log.debug("Ngram Prefix Dictionary. Loading of lexicon hash started.")
-      @index = NGramPrefixDictionary.process_hash(file)
+      @index = NGramPrefixDictionary.process_hash(file, case_insensitive)
     when Path === file
       Log.debug("Ngram Prefix Dictionary. Loading of lexicon file started: #{ file }.")
-      @index = NGramPrefixDictionary.process_stream(file.open)
+      @index = NGramPrefixDictionary.process_stream(file.open, case_insensitive)
     when Misc.is_filename?(file)
       Log.debug("Ngram Prefix Dictionary. Loading of lexicon file started: #{ file }.")
       @index = NGramPrefixDictionary.process_stream(Open.open(file))
     when StreamIO === file
       Log.debug("Ngram Prefix Dictionary. Loading of lexicon stream started.")
-      @index = NGramPrefixDictionary.process_stream(file)
+      @index = NGramPrefixDictionary.process_stream(file, case_insensitive)
     else
       raise "Format of lexicon not understood: #{file.inspect}"
     end
@@ -136,36 +151,15 @@ VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
   end
 
   def match(text)
-    NGramPrefixDictionary.match(index, text).collect{|name, code, offset|
+    matches = NGramPrefixDictionary.match(index, (case_insensitive ? text.downcase : text)).collect{|name, code, offset|
       NamedEntity.setup(name, offset, type, code)
     }
-  end
-end
-
-if __FILE__ == $0
-  require 'rbbt/sources/jochem'
-  require 'rbbt/sources/pubmed'
-
-  texts = []
-  index = {}
-
-  texts = PubMed.get_article(PubMed.query("GB-1a", 100)).collect do |pmid, article|
-    article.text
-  end
 
-  texts *= 150/texts.length
-
-  tsv = Rbbt.share.databases.JoChem.lexicon.tsv :flat, :persistence => false, :grep => "GB"
-  #tsv = Rbbt.share.databases.JoChem.lexicon.tsv :flat, :persistence => true
-
-  tsv.unnamed = true
-  ner = NGramPrefixDictionary.new(tsv)
-
-  Misc.benchmark do
-    texts.each do |text|
-      ner.match(text)
+    if case_insensitive
+      matches.each{|m| m.replace(text[m.range])}
+      matches
+    else
+      matches
     end
   end
-
-
 end
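Note: with case_insensitive set, both the lexicon and the query text are downcased for matching, and each hit is then replace-d with the original-cased slice of the input via its range, so reported mentions keep the source capitalization. A sketch using a plain Hash lexicon (the code and names are illustrative):

    lexicon = { "CHEBI:16761" => ["ADP", "adenosine diphosphate"] }
    ner = NGramPrefixDictionary.new(lexicon, "Chemical", true)

    ner.match("Levels of adp rose sharply.").each do |m|
      puts [m, m.offset, m.type, m.code] * "\t"   # mention keeps its original case
    end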