RubyGems - rbbt-text - Versions diffs - 0.2.1 → 0.5.0 - Mend

rbbt-text 0.2.1 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (57) hide show

data/bin/get_ppis.rb +52 -0
data/lib/rbbt/bow/dictionary.rb +9 -9
data/lib/rbbt/bow/misc.rb +86 -2
data/lib/rbbt/corpus/corpus.rb +55 -0
data/lib/rbbt/corpus/document.rb +289 -0
data/lib/rbbt/corpus/document_repo.rb +115 -0
data/lib/rbbt/corpus/sources/pubmed.rb +26 -0
data/lib/rbbt/ner/NER.rb +7 -5
data/lib/rbbt/ner/abner.rb +13 -2
data/lib/rbbt/ner/annotations.rb +182 -51
data/lib/rbbt/ner/annotations/annotated.rb +15 -0
data/lib/rbbt/ner/annotations/named_entity.rb +37 -0
data/lib/rbbt/ner/annotations/relations.rb +25 -0
data/lib/rbbt/ner/annotations/token.rb +28 -0
data/lib/rbbt/ner/annotations/transformed.rb +170 -0
data/lib/rbbt/ner/banner.rb +8 -5
data/lib/rbbt/ner/chemical_tagger.rb +34 -0
data/lib/rbbt/ner/ngram_prefix_dictionary.rb +136 -0
data/lib/rbbt/ner/oscar3.rb +1 -1
data/lib/rbbt/ner/oscar4.rb +41 -0
data/lib/rbbt/ner/patterns.rb +132 -0
data/lib/rbbt/ner/rnorm.rb +141 -0
data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
data/lib/rbbt/ner/rnorm/tokens.rb +218 -0
data/lib/rbbt/ner/token_trieNER.rb +185 -51
data/lib/rbbt/nlp/genia/sentence_splitter.rb +214 -0
data/lib/rbbt/nlp/nlp.rb +235 -0
data/share/install/software/ABNER +0 -4
data/share/install/software/ChemicalTagger +81 -0
data/share/install/software/Gdep +115 -0
data/share/install/software/Geniass +118 -0
data/share/install/software/OSCAR4 +16 -0
data/share/install/software/StanfordParser +15 -0
data/share/patterns/drug_induce_disease +22 -0
data/share/rnorm/cue_default +10 -0
data/share/rnorm/tokens_default +86 -0
data/share/{stopwords → wordlists/stopwords} +0 -0
data/test/rbbt/bow/test_bow.rb +1 -1
data/test/rbbt/bow/test_dictionary.rb +1 -1
data/test/rbbt/bow/test_misc.rb +1 -1
data/test/rbbt/corpus/test_corpus.rb +99 -0
data/test/rbbt/corpus/test_document.rb +222 -0
data/test/rbbt/ner/annotations/test_named_entity.rb +14 -0
data/test/rbbt/ner/annotations/test_transformed.rb +175 -0
data/test/rbbt/ner/test_abner.rb +1 -1
data/test/rbbt/ner/test_annotations.rb +64 -2
data/test/rbbt/ner/test_banner.rb +1 -1
data/test/rbbt/ner/test_chemical_tagger.rb +56 -0
data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +20 -0
data/test/rbbt/ner/{test_oscar3.rb → test_oscar4.rb} +12 -13
data/test/rbbt/ner/test_patterns.rb +66 -0
data/test/rbbt/ner/test_regexpNER.rb +1 -1
data/test/rbbt/ner/test_rnorm.rb +47 -0
data/test/rbbt/ner/test_token_trieNER.rb +60 -35
data/test/rbbt/nlp/test_nlp.rb +88 -0
data/test/test_helper.rb +20 -0
metadata +93 -20

data/bin/get_ppis.rb ADDED

@@ -0,0 +1,52 @@
+#!/usr/bin/env ruby
+require 'rbbt-util'
+require 'rbbt/annotations/corpus'
+require 'rbbt/annotations/corpus/pubmed'
+require 'rbbt/annotations/relationships/ppi'
+require 'rbbt/sources/pubmed'
+require 'rbbt/ner/annotations'
+require 'rbbt/ner/token_trieNER'
+require 'rbbt/ner/annotations/transformed'
+require 'rbbt/ner/chemical_tagger'
+Corpus.define_entity_ner "Compounds", false do |doc|
+  @@chemical_tagger ||= ChemicalTagger.new
+  @@chemical_tagger.entities(doc.text)
+end
+Corpus.define_entity_ner "Diseases", false do |doc|
+  if ! defined? @@tokenizer
+    @@tokenizer = TokenTrieNER.new [], :longest_match => true
+    @@tokenizer.merge TSV.new(Rbbt.share.databases.COSTART.COSTART, :native => 0, :extra => 0, :flatten => true), :COSTART
+    @@tokenizer.merge TSV.new(Rbbt.share.databases.CTCAE.CTCAE, :native => 0, :extra => 1, :flatten => true), :CTCAE
+    @@tokenizer.merge Rbbt.share.databases.Polysearch.disease, :disease
+  end
+  @@tokenizer.entities(doc.text)
+end
+corpus = Corpus.new Rbbt.tmp.corpus["PPIS2"].find
+docids = corpus.add_pubmed_query("Cancer", 5000, :abstract)
+Misc.profile do
+  docids[0..100].each do |docid|
+    puts "ARTICLE: #{ docid }"
+    doc = corpus.docid(docid)
+    diseases = doc.produce_diseases
+    #puts "Diseases: #{diseases.collect{|g| [g,g.id,g.offset] * ":"} * ", "}"
+  #sentences = doc.sentences
+  #diseases_index = Segment.index(diseases)
+  #sentences.each do |sentence|
+  #  diseases_in_sentence = diseases_index[sentence.range]
+  #  next if diseases_in_sentence.empty?
+  #  Transformed.transform(sentence, sentence.make_relative(diseases_in_sentence.dup)) do |entity|
+  #    entity.html
+  #  end
+  #  puts "---#{[sentence.id, sentence.offset] * ":"}"
+  #  puts sentence
+  #  puts "Diseases: #{diseases_in_sentence.collect{|g| [g,g.id,g.offset] * ":"} * ", "}"
+  #  sentence.restore
+  #end
+  end
+end

data/lib/rbbt/bow/dictionary.rb CHANGED

@@ -74,15 +74,15 @@ class Dictionary::TF_IDF
   end
   def best(options = {})
-    hi, low, limit = {
+    high, low, limit = {
       :low   => 0,
-      :hi    => 1,
+      :high    => 1,
     }.merge(options).
-    values_at(:hi, :low, :limit)
+    values_at(:high, :low, :limit)
     num_docs = @num_docs.to_f
     best = df.select{|term, value|
-      value >= low && value <= hi
+      value >= low && value <= high
     }.collect{|p|
       term     = p.first
       df_value = p.last
@@ -147,19 +147,19 @@ class Dictionary::KL
   end
   def best(options = {})
-    hi, low, limit = {
+    high, low, limit = {
       :low   => 0,
-      :hi    => 1,
+      :high    => 1,
     }.merge(options).
-    values_at(:hi, :low, :limit)
+    values_at(:high, :low, :limit)
     pos_df = @pos_dict.df
     neg_df = @neg_dict.df
     best = {}
     terms.select{|term|
-      pos_df[term] >= low && pos_df[term] <= hi ||
-      neg_df[term] >= low && neg_df[term] <= hi
+      pos_df[term] >= low && pos_df[term] <= high ||
+      neg_df[term] >= low && neg_df[term] <= high
     }.each{|term|
       pos = pos_df[term]
       neg = neg_df[term]

data/lib/rbbt/bow/misc.rb CHANGED

@@ -1,7 +1,91 @@
 require 'rbbt'
 require 'rbbt/util/open'
-Rbbt.claim 'stopwords', 'stopwords', 'wordlist'
+Rbbt.share.wordlists.trigger_terms.define_as_url "http://zope.bioinfo.cnio.es/hpylori/pubmedxml2dir_files/ppi_trigger_term_table.txt"
-$stopwords = Rbbt.files.wordlists.stopwords.read.scan(/\w+/)
+$stopwords     = Rbbt.share.wordlists.stopwords.read.scan(/\w+/)
+$greek = {
+  "alpha"   => "a",
+  "beta"    => "b",
+  "gamma"   => "g",
+  "delta"   => "d",
+  "epsilon" => "e",
+  "zeta"    => "z",
+  "eta"     => "e",
+  "theta"   => "th",
+  "iota"    => "i",
+  "kappa"   => "k",
+  "lambda"  => "l",
+  "mu"      => "m",
+  "nu"      => "n",
+  "xi"      => "x",
+  "omicron" => "o",
+  "pi"      => "p",
+  "rho"     => "r",
+  "sigma"   => "s",
+  "tau"     => "t",
+  "upsilon" => "u",
+  "phi"     => "ph",
+  "chi"     => "ch",
+  "psi"     => "ps",
+  "omega"   => "o"
+}
+$inverse_greek = Hash.new
+$greek.each{|l,s| $inverse_greek[s] = l }
+class String
+  CONSONANTS = []
+  if File.exists? File.join(Rbbt.datadir, 'wordlists/consonants')
+    Object::Open.read(File.join(Rbbt.datadir, 'wordlists/consonants')).each_line{|l| CONSONANTS << l.chomp}
+  end
+  # Uses heuristics to check whether a string looks like a special word, such as a gene name.
+  def is_special?
+    # Only consonants
+    return true if self =~ /^[bcdfghjklmnpqrstvwxz]+$/i
+    # Not a word
+    return false if self =~ /[^\s]\s[^\s]/;
+    return false if self.length < 3;
+    # Alphanumeric
+    return true if self =~ /[0-9]/ &&  self =~ /[a-z]/i
+    # All Caps
+    return true if self =~ /[A-Z]{2,}/;
+    # Caps Mix
+    return true if self =~ /[a-z][A-Z]/;
+    # All consonants
+    return true if self =~ /^[a-z]$/i && self !~ /[aeiou]/i
+    # Dashed word
+    return true if self =~ /(^\w-|-\w$)/
+    # Too many consonants (very heuristic)
+    if self =~ /([^aeiouy]{3,})/i && !CONSONANTS.include?($1.downcase)
+      return true
+    end
+    return false
+  end
+  # Turns the first letter to lowercase
+  def downcase_first
+    return "" if self == ""
+    letters = self.scan(/./)
+    letters[0].downcase!
+    letters.join("")
+  end
+  # Turns a Roman numeral into its Arabic form if possible. Only simple
+  # Roman numerals are handled...
+  def arabic
+    return 1 if self =~ /^I$/;
+    return 2 if self =~ /^II$/;
+    return 3 if self =~ /^III$/;
+    return 4 if self =~ /^IV$/;
+    return 5 if self =~ /^V$/;
+    return 10 if self =~ /^X$/;
+    return nil
+  end
+end

data/lib/rbbt/corpus/corpus.rb ADDED

@@ -0,0 +1,55 @@
+require 'rbbt/corpus/document'
+require 'rbbt/corpus/document_repo'
+class Corpus
+  attr_accessor :corpora_path, :document_repo, :persistence_dir, :global_annotations
+  def initialize(corpora_path = nil)
+    @corpora_path = case
+                   when corpora_path.nil?
+                     Rbbt.corpora
+                   when (not Resource::Path === corpora_path)
+                     Resource::Path.path(corpora_path)
+                   else
+                     corpora_path
+                   end
+    @document_repo   = DocumentRepo.get @corpora_path.document_repo, false
+    @persistence_dir = File.join(@corpora_path, "annotations")
+    @global_annotations = TSV.new(TCHash.get(File.join(@persistence_dir, "global_annotations"), :list), :list, :key => "ID", :fields => [ "Start", "End", "Info","Document ID", "Entity Type"])
+    @global_annotations.unnamed = true
+  end
+  def persistence_for(docid)
+    File.join(persistence_dir, docid)
+  end
+  def document(namespace, id, type, hash)
+    docid = [namespace, id, type, hash] * ":"
+    Document.new(persistence_for(docid), docid, @document_repo[docid], @global_annotations)
+  end
+  def docid(docid)
+    Document.new(persistence_for(docid), docid, @document_repo[docid], @global_annotations)
+  end
+  def add_document(text, namespace, id, type = nil)
+    hash = Digest::MD5.hexdigest(text)
+    @document_repo.add(text, namespace, id, type, hash)
+  end
+  def find(namespace=nil, id = nil, type = nil, hash = nil)
+    @document_repo.find(namespace, id, type, hash).collect{|docid|
+      Document.new(persistence_for(docid), docid, @document_repo[docid], @global_annotations)
+    }
+  end
+  def find_docid(docid)
+    @document_repo.find_docid(docid).collect{|docid|
+      Document.new(persistence_for(docid), docid, @document_repo[docid], @global_annotations)
+    }
+  end
+  def exists?(namespace=nil, id = nil, type = nil, hash = nil)
+    find(namespace, id, type, hash).any?
+  end
+end

data/lib/rbbt/corpus/document.rb ADDED

@@ -0,0 +1,289 @@
+require 'rbbt/ner/annotations'
+require 'rbbt/util/tsv'
+require 'rbbt/util/resource'
+require 'rbbt/util/misc'
+require 'json'
+class Document
+  attr_accessor :text, :docid, :namespace, :id, :type, :hash, :annotations, :segment_indeces, :persistence_dir, :global_persistence
+  def initialize(persistence_dir = nil, docid = nil, text = nil, global_persistence = nil)
+    @annotations = {}
+    @segment_indeces = {}
+    if not persistence_dir.nil?
+      @persistence_dir = persistence_dir
+      @persistence_dir = Resource::Path.path(@persistence_dir) if not Resource::Path == @persistence_dir
+    end
+    @global_persistence = global_persistence
+    if not docid.nil?
+      @docid = docid
+      update_docid
+    end
+    @text = text unless text.nil?
+  end
+  def update_docid
+    @namespace, @id, @type, @hash = docid.split(":", -1)
+  end
+  def docid=(docid)
+    @docid = docid
+    update_docid
+  end
+  def self.save_segment(segment, fields = nil)
+    if fields.nil?
+      eend = case segment.offset; when nil; nil; when -1; -1; else segment.end; end
+      [segment.offset, eend, segment.info.to_json]
+    else
+      eend = case segment.offset; when nil; nil; when -1; -1; else segment.end; end
+      info = segment.info
+      info["literal"] = segment.to_s.gsub(/\s/,' ')
+      info.extend IndiferentHash
+      [segment.offset, eend].concat info.values_at(*fields.collect{|f| f.downcase}).collect{|v| Array === v ? v * "|" : v}
+    end
+  end
+  def self.load_segment(text, annotation, fields = nil)
+    if fields.nil?
+      start, eend, info = annotation.values_at 0,1,2
+      info = JSON.parse(info)
+    else
+      start, eend = annotation.values_at 0,1
+      info = Misc.process_to_hash(fields) do |fields| annotation.values_at(*fields.collect{|f| f.downcase}).collect{|v| v.index("|").nil? ? v : v.split("|")} end
+    end
+    Segment.load(text, start, eend, info, @docid)
+  end
+  def self.tsv(segments, fields = nil)
+    tsv = TSV.new({}, :list, :key => "ID", :fields => %w(Start End))
+    if fields.nil?
+      tsv.fields += ["Info"]
+    else
+      tsv.fields += fields
+    end
+    segments.each{|segment| tsv[segment.id] = Document.save_segment(segment, fields) unless segment.offset.nil?}
+    tsv
+  end
+  #{{{ PERSISTENCE
+  TSV_REPOS = {}
+  FIELDS_FOR_ENTITY_PERSISTENCE = {}
+  def self.persist(entity, fields = nil)
+    if not fields.nil?
+      fields = [fields] if not Array === fields
+      fields = fields.collect{|f| f.to_s}
+      FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields unless fields.nil?
+    end
+    self.class_eval <<-EOC
+      def load_with_persistence_#{entity}
+        fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]
+        annotations = Persistence.persist("#{ entity }", :Entity, :tsv_string,
+                        :persistence_file => File.join(@persistence_dir, "#{ entity }")) do
+          tsv = TSV.new({}, :list, :key => "ID", :fields => %w(Start End))
+          if fields.nil?
+            tsv.fields += ["Info"]
+          else
+            tsv.fields += fields
+          end
+          segments = produce_#{entity}
+          segments.each{|segment| tsv[segment.id] = Document.save_segment(segment, fields) unless segment.offset.nil?}
+          tsv
+        end
+        annotations.collect{|id, annotation| Document.load_segment(text, annotation, fields)}
+      end
+          EOC
+  end
+  def self.persist_in_tsv(entity, tsv = nil, fields = nil)
+    if not tsv.nil? and not tsv.respond_to?(:keys)
+      fields = tsv
+      tsv = nil
+    end
+    TSV_REPOS[entity.to_s] = tsv
+    if not fields.nil?
+      fields = [fields] if not Array === fields
+      fields = fields.collect{|f| f.to_s}
+      FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields unless fields.nil?
+    end
+    self.class_eval <<-EOC
+      def load_with_persistence_#{entity}
+        repo = TSV_REPOS["#{ entity }"]
+        if repo.nil?
+          raise "No persistence file or persistencr dir for persist_in_tsv" if persistence_dir.nil?
+          repo = TCHash.get(persistence_dir.annotations_by_type.find, TCHash::TSVSerializer)
+        end
+        fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]
+        if not repo.include? "#{ entity }"
+          tsv = TSV.new({}, :list, :key => "ID", :fields => %w(Start End))
+          if fields.nil?
+            tsv.fields += ["Info"]
+          else
+            tsv.fields += fields
+          end
+          produce_#{entity}.each{|segment| tsv[segment.id] = Document.save_segment(segment, fields) unless segment.offset.nil?}
+          repo.write
+          repo["#{entity}"] = tsv
+          repo.read
+        end
+        annotations = repo["#{entity}"]
+        repo.close
+        annotations.collect{|id, annotation| Document.load_segment(text, annotation, fields)}
+      end
+      EOC
+  end
+  def self.persist_in_global_tsv(entity, tsv = nil, fields = nil, doc_field = nil, entity_field = nil)
+    if not tsv.nil? and not tsv.respond_to?(:keys)
+      entity_field = doc_field if doc_field
+      doc_field = fields if fields
+      fields = tsv if tsv
+      tsv = nil
+    end
+    doc_field ||= "Document ID"
+    entity_field ||= "Entity Type"
+    TSV_REPOS[entity.to_s] = tsv
+    if not fields.nil?
+      fields = [fields] if not Array === fields
+      fields = fields.collect{|f| f.to_s}
+      FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields unless fields.nil?
+    end
+    self.class_eval <<-EOC
+      def load_with_persistence_#{entity}
+        fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]
+        data = TSV_REPOS["#{ entity }"]
+        if data.nil?
+          data = global_persistence
+        end
+        data.filter
+        data.add_filter("field:#{ doc_field }", @docid)
+        data.add_filter("field:#{ entity_field }", "#{ entity }")
+        if data.keys.empty?
+          tsv = TSV.new({}, :list, :key => "ID", :fields => %w(Start End))
+          if fields.nil?
+            tsv.fields += ["Info"]
+          else
+            tsv.fields += fields
+          end
+          segments = produce_#{entity}
+          segments << Segment.annotate("No #{entity} found in document #{ @docid }", -1) if segments.empty?
+          segments.each{|segment| tsv[segment.id] = Document.save_segment(segment, fields) unless segment.offset.nil?}
+          tsv.add_field "#{ doc_field }" do
+            @docid
+          end
+          tsv.add_field "#{ entity_field }" do
+            "#{ entity }"
+          end
+          data.write
+          data.merge!(tsv)
+          data.read
+        end
+        segments = []
+        data.each{|id, annotation| segments << Document.load_segment(text, annotation, fields) unless annotation[1].to_i == -1}
+        data.pop_filter
+        data.pop_filter
+        segments
+      end
+      EOC
+  end
+  def self.define(entity, &block)
+    send :define_method, "produce_#{entity}", &block
+    self.class_eval <<-EOC
+      def load_#{entity}
+        return if annotations.include? "#{ entity }"
+        if self.respond_to?("load_with_persistence_#{entity}") and not @persistence_dir.nil?
+          annotations["#{entity}"] = load_with_persistence_#{entity}
+        else
+          annotations["#{ entity }"] = produce_#{entity}
+        end
+      end
+      def #{entity}
+        begin
+          entities = annotations["#{ entity }"]
+          if entities.nil?
+            load_#{entity}
+            entities = annotations["#{ entity }"]
+          end
+        end
+        entities
+      end
+      def #{entity}_at(pos, persist = false)
+        segment_index("#{ entity }", persist ? File.join(@persistence_dir, 'ranges') : nil)[pos]
+      end
+    EOC
+  end
+  def segment_index(name, persistence_dir = nil)
+    @segment_indeces[name] ||= Segment.index(self.send(name), persistence_dir.nil? ? :memory : File.join(persistence_dir, name + '.range'))
+  end
+  def load_into(segment, *annotations)
+    options = annotations.pop if Hash === annotations.last
+    options ||= {}
+    if options[:persist] and not @persistence_dir.nil?
+      persistence_dir = File.join(@persistence_dir, 'ranges')
+    else
+      persistence_dir = nil
+    end
+    segment.extend Annotated
+    segment.annotations ||= {}
+    annotations.collect do |name|
+      name = name.to_s
+      annotations = segment_index(name, persistence_dir)[segment.range]
+      segment.annotations[name] = annotations
+      class << segment
+        self
+      end.class_eval "def #{ name }; @annotations['#{ name }']; end"
+    end
+    segment
+  end
+end