RubyGems - rbbt-text - Versions diffs - 0.2.1 → 0.5.0 - Mend

rbbt-text 0.2.1 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (57)

data/bin/get_ppis.rb +52 -0
data/lib/rbbt/bow/dictionary.rb +9 -9
data/lib/rbbt/bow/misc.rb +86 -2
data/lib/rbbt/corpus/corpus.rb +55 -0
data/lib/rbbt/corpus/document.rb +289 -0
data/lib/rbbt/corpus/document_repo.rb +115 -0
data/lib/rbbt/corpus/sources/pubmed.rb +26 -0
data/lib/rbbt/ner/NER.rb +7 -5
data/lib/rbbt/ner/abner.rb +13 -2
data/lib/rbbt/ner/annotations.rb +182 -51
data/lib/rbbt/ner/annotations/annotated.rb +15 -0
data/lib/rbbt/ner/annotations/named_entity.rb +37 -0
data/lib/rbbt/ner/annotations/relations.rb +25 -0
data/lib/rbbt/ner/annotations/token.rb +28 -0
data/lib/rbbt/ner/annotations/transformed.rb +170 -0
data/lib/rbbt/ner/banner.rb +8 -5
data/lib/rbbt/ner/chemical_tagger.rb +34 -0
data/lib/rbbt/ner/ngram_prefix_dictionary.rb +136 -0
data/lib/rbbt/ner/oscar3.rb +1 -1
data/lib/rbbt/ner/oscar4.rb +41 -0
data/lib/rbbt/ner/patterns.rb +132 -0
data/lib/rbbt/ner/rnorm.rb +141 -0
data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
data/lib/rbbt/ner/rnorm/tokens.rb +218 -0
data/lib/rbbt/ner/token_trieNER.rb +185 -51
data/lib/rbbt/nlp/genia/sentence_splitter.rb +214 -0
data/lib/rbbt/nlp/nlp.rb +235 -0
data/share/install/software/ABNER +0 -4
data/share/install/software/ChemicalTagger +81 -0
data/share/install/software/Gdep +115 -0
data/share/install/software/Geniass +118 -0
data/share/install/software/OSCAR4 +16 -0
data/share/install/software/StanfordParser +15 -0
data/share/patterns/drug_induce_disease +22 -0
data/share/rnorm/cue_default +10 -0
data/share/rnorm/tokens_default +86 -0
data/share/{stopwords → wordlists/stopwords} +0 -0
data/test/rbbt/bow/test_bow.rb +1 -1
data/test/rbbt/bow/test_dictionary.rb +1 -1
data/test/rbbt/bow/test_misc.rb +1 -1
data/test/rbbt/corpus/test_corpus.rb +99 -0
data/test/rbbt/corpus/test_document.rb +222 -0
data/test/rbbt/ner/annotations/test_named_entity.rb +14 -0
data/test/rbbt/ner/annotations/test_transformed.rb +175 -0
data/test/rbbt/ner/test_abner.rb +1 -1
data/test/rbbt/ner/test_annotations.rb +64 -2
data/test/rbbt/ner/test_banner.rb +1 -1
data/test/rbbt/ner/test_chemical_tagger.rb +56 -0
data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +20 -0
data/test/rbbt/ner/{test_oscar3.rb → test_oscar4.rb} +12 -13
data/test/rbbt/ner/test_patterns.rb +66 -0
data/test/rbbt/ner/test_regexpNER.rb +1 -1
data/test/rbbt/ner/test_rnorm.rb +47 -0
data/test/rbbt/ner/test_token_trieNER.rb +60 -35
data/test/rbbt/nlp/test_nlp.rb +88 -0
data/test/test_helper.rb +20 -0
metadata +93 -20

data/share/install/software/OSCAR4 ADDED

@@ -0,0 +1,16 @@
+#!/bin/bash
+INSTALL_HELPER_FILE="$1"
+RBBT_SOFTWARE_DIR="$2"
+source "$INSTALL_HELPER_FILE"
+name='OSCAR4'
+url="http://maven.ch.cam.ac.uk/m2repo/uk/ac/cam/ch/wwmm/oscar/oscar4-all/4.0.1/oscar4-all-4.0.1-with-dependencies.jar"
+PKG_DIR=`opt_dir $name`
+[ -d $PKG_DIR ] || mkdir -p $PKG_DIR
+wget "$url" -O "$PKG_DIR/OSCAR4.jar"
+ln -sf  "$PKG_DIR/OSCAR4.jar"  "$OPT_JAR_DIR/OSCAR4.jar"

data/share/install/software/StanfordParser ADDED

@@ -0,0 +1,15 @@
+#!/bin/bash
+INSTALL_HELPER_FILE="$1"
+RBBT_SOFTWARE_DIR="$2"
+source "$INSTALL_HELPER_FILE"
+name='StanfordParser'
+url="http://nlp.stanford.edu/downloads/stanford-parser-2011-04-20.tgz"
+get_src "$name" "$url"
+mkdir "$OPT_DIR/$name"
+cp "$OPT_BUILD_DIR/stanford-parser.jar" "$OPT_DIR/$name"
+ln -s "$OPT_DIR/$name/stanford-parser.jar" "$OPT_JAR_DIR/stanford-parser.jar"

data/share/patterns/drug_induce_disease ADDED

@@ -0,0 +1,22 @@
+NP[disease_NN] VP[induce_HVB by_IN] NP[drug_NN]
+NP[drug_NN] VP[induce_HVB] NP[disease_NN]
+NP[drug_NN] VP[cause_HVB] NP[disease_NN]
+NP[disease_NN] VP[cause_HVB by_IN] NP[drug_NN]
+NP[disease_NN] VP[produce_HVB by_IN] NP[drug_NN]
+NP[disease_NN] VP[induce_HVB by_IN] NP[injection_HNN of_IN] NP[drug_NN]
+NP[drug_NN] VP[associate_HVB with_IN] NP[risk_HNN of_IN] NP[disease_NN]
+NP[disease_NN] VP[induce_HVB by_IN] NP[administration_HNN of_IN] NP[drug_NN]
+NP[disease_NN] VP[be_HVB] NP[effect_HNN of_IN] NP[drug_NN]
+NP[drug_NN] VP[increase_HVB] NP[risk_HNN of_IN] NP[disease_NN]
+NP[disease_NN] NTG[follow_VBG] NP[treatment_HNN with_IN] NP[drug_NN]
+NP[disease_NN] VP[associate_HVB with_IN] NP[drug_NN therapy_HNN]
+NP[disease_NN] VP[associate_HVB with_IN] NP[use_HNN of_IN] NP[drug_NN]
+NP[disease_NN] VP[associate_HVB with_IN] NP[drug_NN use_HNN]
+NP[disease_NN] VP[associate_HVB with_IN] NP[drug_NN treatment_HNN]
+NP[disease_NN while_IN] VP[receive_HVB] NP[drug_NN]
+NP[disease_NN] NTG[follow_VBG] NP[drug_NN therapy_HNN]
+NP[disease_NN after_IN] VP[receive_HVB] NP[drug_NN]
+NP[disease_NN] NTG[follow_VBG] NP[drug_NN administration_HNN]
+NP[disease_NN due_(?:IN|JJ) to_TO] NP[drug_NN therapy_HNN]
+NP[disease_NN] VP[follow_HVB] NP[treatment_HNN with_IN] NP[drug_NN]
+NP[disease_NN] VP[follow_HVB] NP[drug_NN administration_HNN]

data/share/rnorm/cue_default ADDED

@@ -0,0 +1,10 @@
+equal    do |w| [w] end
+standard do |w| [w.downcase.split(/\s+/).sort.join("")] end
+cleaned  do |w| [w.downcase.sub(/,.*/,'').sub(/\(.*\)/,'').gsub(/s(?:=\W)/,'')] end
+special  do |w| s = w.split.select{|w| w.is_special?}.collect{|w| w.downcase.sub(/p$/,'')} end
+words    do |w|
+  w.sub(/(.*)I$/,'\1I \1').
+    scan(/[a-z][a-z]+/i).
+    sort{|a,b| b.length <=> a.length}.
+    collect{|n| n.downcase}
+end

data/share/rnorm/tokens_default ADDED

@@ -0,0 +1,86 @@
+require 'rbbt/util/misc'
+plural = Proc.new do |t| t.sub(/s$/,'') end
+tokens do
+  # Some (possible) single letters first
+  receptor     /^(?:receptor|r)s?$/i
+  protein      /^(?:protein|p)s?$/i
+  roman        /^[IV]+$/
+  greek_letter do |w| $inverse_greek[w.downcase] != nil end
+  # Some words for removal
+  stopword     do |w|  $stopwords.include?( w.downcase_first)  end
+  gene         /genes?/i
+  dna
+  cdna
+  rna
+  mrna
+  trna
+  cdna
+  component
+  exon
+  intron
+  domain
+  family
+  # Important words
+  number       /^(?:\d+[.,]?\d+|\d)$/
+  greek        do |w| $greek[w.downcase] != nil end
+  special      do |w| w.is_special? end
+  promoter
+  similar      /^(homolog.*|like|related|associated)$/
+  ase          /ase$/
+  in_end       /in$/
+end
+comparisons do
+  compare.number do |l1,l2|
+      v = 0
+      case
+      when l1.empty? && l2.empty?
+          v = 0
+      when l1.sort.uniq == l2.sort.uniq
+          v = 3
+      when l1.any? && l1[0] == l2[0]
+          v = -3
+      when l1.empty? && l2 == ['1']
+          v = -5
+      else
+          v = -10
+      end
+      v
+  end
+  diff.promoter   -10
+  diff.receptor   -10
+  diff.similar    -10
+  diff.capital    -10
+  same.unknown      1
+  miss.unknown      -2
+  extr.unknown      -2
+  same.greek      1
+  miss.greek      -2
+  extr.greek      -2
+  same.special    4
+  miss.special    -3
+  extr.special    -3
+  transform.receptor plural
+  transform.protein plural
+  transform.roman do |t| [t.arabic, :number] end
+  transform.greek_letter do |t| [$inverse_greek[t.downcase], :greek] end
+  transform.ase do |t| [t, :special] end
+  transform.in_end do |t| [t, :special] end
+  transform.unknown do |t| [t, (t.length < 4 ? :special : :unknown)] end
+end

data/share/{stopwords → wordlists/stopwords} RENAMED

File without changes

data/test/rbbt/bow/test_bow.rb CHANGED

@@ -1,4 +1,4 @@
-require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')
+require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.rb')
 require 'rbbt/bow/bow'
 require 'test/unit'

data/test/rbbt/bow/test_dictionary.rb CHANGED

@@ -1,4 +1,4 @@
-require File.dirname(__FILE__) + '/../../test_helper'
+require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.rb')
 require 'rbbt/bow/dictionary'
 require 'rbbt/bow/bow'
 require 'test/unit'

data/test/rbbt/bow/test_misc.rb CHANGED

@@ -1,4 +1,4 @@
-require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')
+require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.rb')
 require 'rbbt/bow/misc'
 require 'test/unit'

data/test/rbbt/corpus/test_corpus.rb ADDED

@@ -0,0 +1,99 @@
+require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.rb')
+require 'rbbt/corpus/corpus'
+require 'rbbt/corpus/sources/pubmed'
+class Document
+  define :sentences do
+    require 'rbbt/nlp/nlp'
+    NLP.geniass_sentence_splitter(text)
+  end
+  define :genes do
+    require 'rbbt/ner/abner'
+    Abner.new.entities(text)
+  end
+end
+class TestCorpus < Test::Unit::TestCase
+  def test_add_document
+    pmid = "19458159"
+    text = PubMed.get_article(pmid).text
+    corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
+    assert corpus.find(:pubmed, pmid).empty?
+    corpus.add_document(text, :pubmed, pmid, :abstract)
+    assert corpus.find(:pubmed, pmid).any?
+    assert corpus.find(:pubmed, pmid, :fulltext).empty?
+    assert corpus.find(:pubmed, pmid, :abstract).any?
+    assert corpus.find(:pubmed, pmid).first.text =~ /SENT/
+  end
+  def test_add_pmid
+    pmid = "19465387"
+    corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
+    corpus.add_pmid(pmid, :abstract)
+    assert corpus.exists? :pubmed, pmid
+    assert corpus.exists? :pubmed, pmid, :abstract
+    assert_equal false, corpus.exists?(:pubmed, pmid, :fulltext)
+  end
+  def test_find_all
+    corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
+    corpus.add_pmid("19458159", :abstract)
+    corpus.add_pmid("19465387", :abstract)
+    all = corpus.find
+    assert_equal 2, all.length
+    assert all.select{|document| document.id == "19458159"}.any?
+    assert all.select{|document| document.id == "19465387"}.any?
+  end
+  def test_doc_sentences
+    corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
+    corpus.add_pmid("19458159", :abstract)
+    sentences = corpus.find.collect{|doc| doc.sentences}.flatten
+    assert corpus.find.first.sentences.length > 0
+    assert corpus.find.first.sentences.sort_by{|s| s.offset}.first =~ /Semantic features in Text/i
+    corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
+    assert corpus.find.first.sentences.sort_by{|s| s.offset}.first =~ /Semantic features in Text/i
+  end
+  def test_doc_genes
+    corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
+    corpus.add_pmid("21611789", :abstract)
+    assert corpus.find(:pubmed, "21611789").first.genes.include? "CDKAL1"
+  end
+  def test_genes
+    corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
+    corpus.add_pmid("21611789", :abstract)
+    assert corpus.find.collect{|d| d.genes}.flatten.include? "CDKAL1"
+  end
+  def test_index
+    corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
+    corpus.add_pmid("21611789", :abstract)
+    document = corpus.find(:pubmed, "21611789").first
+    genes = corpus.find.collect{|d| d.genes}.flatten.select{|gene| gene == "CDKAL1"}
+    assert genes.collect{|gene|
+      document.sentences_at(gene.offset)
+    }.flatten.length >  1
+  end
+end

data/test/rbbt/corpus/test_document.rb ADDED

@@ -0,0 +1,222 @@
+require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.rb')
+require 'rbbt/corpus/document'
+require 'test/unit'
+$persistence = TSV.new({})
+$tchash_persistence = TCHash.get(Rbbt.tmp.test.document.persistence.find(:user), true, Persistence::TSV::TSVSerializer)
+$global_persistence = TSV.new({}, :key => "ID", :fields => [ "Start", "End", "Info","Document ID", "Entity Type"])
+$tchash_global_persistence = TSV.new(TCHash.get(Rbbt.tmp.test.global.persistence.find(:user), true, Persistence::TSV::StringArraySerializer), :key => "ID", :fields => [ "Start", "End", "Info","Document ID", "Entity Type"])
+class Document
+  define :sentences do
+    require 'rbbt/nlp/nlp'
+    NLP.geniass_sentence_splitter(text)
+  end
+  define :tokens do
+    require 'rbbt/ner/annotations/token'
+    Token.tokenize(text)
+  end
+  define :long_words do
+    require 'rbbt/ner/annotations/token'
+    Token.tokenize(text).select{|tok| tok.length > 5}
+  end
+  define :short_words do
+    require 'rbbt/ner/annotations/token'
+    Token.tokenize(text).select{|tok| tok.length < 5}
+  end
+  define :even_words do
+    require 'rbbt/ner/annotations/token'
+    Token.tokenize(text).select{|tok| tok.length % 2 == 0}
+  end
+  define :missing do
+    []
+  end
+  define :tokens_again do
+    raise "This should be here already"
+  end
+  persist :sentences
+  persist_in_tsv :tokens
+  persist_in_tsv :long_words, $tchash_persistence, :Literal
+  persist_in_global_tsv :short_words, $global_persistence
+  persist_in_global_tsv :even_words, $tchash_global_persistence
+  persist_in_global_tsv :missing, $tchash_global_persistence
+end
+class TestDocument < Test::Unit::TestCase
+  def test_annotations
+    text =<<-EOF
+This is a
+sentence. This is
+another sentence.
+    EOF
+    doc = Document.new
+    doc.text = text
+    assert_equal 2, doc.sentences.length
+    assert_equal 10, doc.tokens.length
+  end
+  def test_annotation_load
+    text =<<-EOF
+This is a
+sentence. This is
+another sentence.
+    EOF
+    doc = Document.new
+    doc.text = text * 10
+    sentence = doc.sentences.last
+    doc.load_into sentence, :tokens
+    assert_equal 5, sentence.tokens.length
+    assert_equal "another", sentence.tokens[2]
+    assert_equal sentence.offset + 0, sentence.tokens[0].offset
+  end
+  def test_annotation_persistence
+    text =<<-EOF
+This is a
+sentence. This is
+another sentence.
+    EOF
+    text *= 10
+    TmpFile.with_file do |dir|
+      FileUtils.mkdir_p dir
+      doc = Document.new(dir)
+      doc.text = text
+      doc.sentences
+      doc = Document.new(dir)
+      doc.text = text
+      sentence = doc.sentences.last
+      doc.load_into sentence, :tokens
+      assert_equal 5, sentence.tokens.length
+      assert_equal "another", sentence.tokens[2]
+      assert_equal sentence.offset + 0, sentence.tokens[0].offset
+    end
+  end
+  def test_range_persistence
+    text =<<-EOF
+This is a
+sentence. This is
+another sentence.
+    EOF
+    text *= 10
+    TmpFile.with_file do |dir|
+      FileUtils.mkdir_p dir
+      doc = Document.new(dir)
+      doc.text = text
+      sentence = doc.sentences.last
+      Misc.benchmark(10) do
+        doc = Document.new(dir)
+        doc.text = text
+        doc.load_into sentence, :tokens, :persist => true
+        assert_equal 5, sentence.tokens.length
+        assert_equal "another", sentence.tokens[2]
+        assert_equal sentence.offset + 0, sentence.tokens[0].offset
+        assert_equal sentence.offset + 5, sentence.tokens[1].offset
+      end
+    end
+  end
+  def test_annotation_persistence_in_tsv
+    text =<<-EOF
+This is a
+sentence. This is
+another sentence.
+    EOF
+    TmpFile.with_file do |dir|
+      FileUtils.mkdir_p dir
+      doc = Document.new(dir)
+      doc.text = text * 10
+      doc.sentences
+      doc = Document.new(dir)
+      doc.text = text * 10
+      sentence = doc.sentences.last
+      doc.load_into sentence, :tokens, :long_words
+      assert_equal 5, sentence.tokens.length
+      assert_equal "another", sentence.tokens[2]
+      assert_equal sentence.offset + 0, sentence.tokens[0].offset
+      assert_equal 2, sentence.long_words.length
+      assert_equal %w(another sentence), sentence.long_words
+      assert_equal sentence.offset + "This is ".length, sentence.long_words[0].offset
+    end
+  end
+  def test_annotation_persistence_in_global
+    text =<<-EOF
+This is a
+sentence. This is
+another sentence.
+    EOF
+    TmpFile.with_file do |dir|
+      FileUtils.mkdir_p dir
+      doc = Document.new(dir)
+      doc.text = text * 10
+      doc.docid = "FOOF"
+      doc.short_words
+      doc.sentences
+      doc = Document.new(dir)
+      doc.text = text * 10
+      doc.docid = "FOOF"
+      sentence = doc.sentences.last
+      doc.load_into sentence, :tokens, :long_words, :short_words, :even_words
+      assert_equal 3, sentence.short_words.length
+      assert_equal 3, sentence.even_words.length
+    end
+  end
+  def test_dump
+    text =<<-EOF
+This is a
+sentence. This is
+another sentence.
+    EOF
+    TmpFile.with_file do |dir|
+      FileUtils.mkdir_p dir
+      doc = Document.new(dir)
+      doc.text = text * 10
+      tsv = Document.tsv(doc.sentences, ["Literal"])
+   end
+  end
+end