RubyGems - rbbt - Versions diffs - 1.1.8 → 1.2.1 - Mend

rbbt 1.1.8 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47)

data/README.rdoc +12 -12
data/bin/rbbt_config +2 -3
data/install_scripts/norm/Rakefile +4 -4
data/install_scripts/organisms/{tair.Rakefile → Ath.Rakefile} +4 -3
data/install_scripts/organisms/{cgd.Rakefile → Cal.Rakefile} +0 -0
data/install_scripts/organisms/{worm.Rakefile → Cel.Rakefile} +0 -0
data/install_scripts/organisms/{human.Rakefile → Hsa.Rakefile} +4 -8
data/install_scripts/organisms/{mgi.Rakefile → Mmu.Rakefile} +0 -0
data/install_scripts/organisms/{rgd.Rakefile → Rno.Rakefile} +0 -0
data/install_scripts/organisms/{sgd.Rakefile → Sce.Rakefile} +0 -0
data/install_scripts/organisms/{pombe.Rakefile → Spo.Rakefile} +0 -0
data/install_scripts/organisms/rake-include.rb +15 -19
data/lib/rbbt.rb +0 -3
data/lib/rbbt/ner/rnorm.rb +2 -2
data/lib/rbbt/sources/go.rb +48 -3
data/lib/rbbt/sources/organism.rb +12 -17
data/lib/rbbt/util/open.rb +27 -27
data/lib/rbbt/util/tmpfile.rb +16 -0
data/tasks/install.rake +1 -1
data/test/rbbt/bow/test_bow.rb +33 -0
data/test/rbbt/bow/test_classifier.rb +72 -0
data/test/rbbt/bow/test_dictionary.rb +91 -0
data/test/rbbt/ner/rnorm/test_cue_index.rb +57 -0
data/test/rbbt/ner/rnorm/test_tokens.rb +70 -0
data/test/rbbt/ner/test_abner.rb +17 -0
data/test/rbbt/ner/test_banner.rb +17 -0
data/test/rbbt/ner/test_dictionaryNER.rb +122 -0
data/test/rbbt/ner/test_regexpNER.rb +33 -0
data/test/rbbt/ner/test_rner.rb +126 -0
data/test/rbbt/ner/test_rnorm.rb +47 -0
data/test/rbbt/sources/test_biocreative.rb +38 -0
data/test/rbbt/sources/test_biomart.rb +31 -0
data/test/rbbt/sources/test_entrez.rb +49 -0
data/test/rbbt/sources/test_go.rb +24 -0
data/test/rbbt/sources/test_organism.rb +59 -0
data/test/rbbt/sources/test_polysearch.rb +27 -0
data/test/rbbt/sources/test_pubmed.rb +29 -0
data/test/rbbt/util/test_arrayHash.rb +257 -0
data/test/rbbt/util/test_filecache.rb +37 -0
data/test/rbbt/util/test_index.rb +31 -0
data/test/rbbt/util/test_misc.rb +20 -0
data/test/rbbt/util/test_open.rb +97 -0
data/test/rbbt/util/test_simpleDSL.rb +57 -0
data/test/rbbt/util/test_tmpfile.rb +21 -0
data/test/test_helper.rb +4 -0
data/test/test_rbbt.rb +11 -0
metadata +39 -12

data/lib/rbbt/util/tmpfile.rb CHANGED Viewed

@@ -16,4 +16,20 @@ module TmpFile
   def self.tmp_file(s = "",max=10000000)
     File.join(Rbbt.tmpdir,random_name(s,max))
   end
+  def self.with_file(content = nil)
+    tmpfile = tmp_file
+    File.open(tmpfile, 'w') do |f| f.write content end if content != nil
+    result = yield(tmpfile)
+    FileUtils.rm tmpfile if File.exists? tmpfile
+    result
+  end
+  class << self
+    alias :new :tmp_file
+  end
 end

data/tasks/install.rake CHANGED Viewed

@@ -1,7 +1,7 @@
 require 'rbbt'
 $datadir = Rbbt.datadir
-$scriptdir = File.join(Rbbt.rootdir, '/install_scripts')
+$scriptdir = File.join(File.expand_path(Rbbt.rootdir), '/install_scripts')
 task 'abner' do

data/test/rbbt/bow/test_bow.rb ADDED Viewed

@@ -0,0 +1,33 @@
+require File.dirname(__FILE__) + '/../../test_helper'
+require 'rbbt/bow/bow'
+require 'test/unit'
+class TestBow < Test::Unit::TestCase
+  def test_words
+    assert_equal(["hello", "world"], "Hello World".words)
+  end
+  def test_terms
+    text = "Hello World"
+    assert_equal(["hello", "world"], BagOfWords.terms(text,false).keys.sort)
+    assert_equal(["hello", "hello world", "world"], BagOfWords.terms(text,true).keys.sort)
+  end
+  def test_features
+    text = "Hello world!"
+    text += "Hello World Again!"
+    assert_equal([2, 2], BagOfWords.features(text, "Hello World".words.uniq.sort))
+  end
+  def test_stem
+    assert_equal(["protein"], "Proteins".words)
+  end
+end

data/test/rbbt/bow/test_classifier.rb ADDED Viewed

@@ -0,0 +1,72 @@
+require File.dirname(__FILE__) + '/../../test_helper'
+require 'rbbt/bow/classifier'
+require 'rbbt/util/tmpfile'
+require 'rbbt/util/open'
+require 'test/unit'
+class TestClassifier < Test::Unit::TestCase
+  def test_build_model
+    features =<<-EOT
+Name	Class	hello	world
+row1	-	2	0
+row2	+	0	2
+    EOT
+    featuresfile = TmpFile.tmp_file("test_classifier")
+    modelfile = TmpFile.tmp_file("test_classifier")
+    Open.write(featuresfile, features)
+    Classifier.create_model(featuresfile, modelfile)
+    assert(File.exist? modelfile)
+    FileUtils.rm featuresfile
+    FileUtils.rm modelfile
+  end
+  def test_classifier
+    features =<<-EOT
+Name	Class	hello	world
+row1	-	2	0
+row2	+	0	2
+    EOT
+    featuresfile = TmpFile.tmp_file("test_classifier")
+    modelfile = TmpFile.tmp_file("test_classifier")
+    Open.write(featuresfile, features)
+    Classifier.create_model(featuresfile, modelfile)
+    FileUtils.rm featuresfile
+    classifier = Classifier.new(modelfile)
+    assert_equal(["hello", "world"], classifier.terms)
+    assert_equal(["-", "+"], classifier.classify_feature_array([[1,0],[0,1]]))
+    assert_equal({"negative"=>"-", "positive"=>"+"}, classifier.classify_feature_hash({:positive => [0,1], :negative => [1,0]}))
+    assert_equal({"negative"=>"-", "positive"=>"+"}, classifier.classify_feature_hash({:positive => [0,1], :negative => [1,0]}))
+    assert_equal(["-", "+"], classifier.classify_text_array(["Hello","World"]))
+    assert_equal({"negative"=>"-", "positive"=>"+"}, classifier.classify_text_hash({:negative => "Hello", :positive =>"World"}))
+    assert_equal('-', classifier.classify("Hello"))
+    assert_equal(["-", "+"],classifier.classify([[1,0],[0,1]]))
+    assert_equal({"negative"=>"-", "positive"=>"+"},classifier.classify({:positive => [0,1], :negative => [1,0]}))
+    assert_equal(["-", "+"],classifier.classify(["Hello","World"]))
+    #assert_equal({"negative"=>"-", "positive"=>"+"},classifier.classify({:negative => "Hello", :positive => "World"}))
+    #assert_nothing_raised do classifier.classify("Unknown terms") end
+    #assert_nothing_raised do classifier.classify([]) end
+    FileUtils.rm modelfile
+  end
+end

data/test/rbbt/bow/test_dictionary.rb ADDED Viewed

@@ -0,0 +1,91 @@
+require File.dirname(__FILE__) + '/../../test_helper'
+require 'rbbt/bow/dictionary'
+require 'rbbt/bow/bow'
+require 'test/unit'
+class TestDictionary < Test::Unit::TestCase
+  def test_standard
+    docs = []
+    docs << BagOfWords.terms("Hello World", false)
+    docs << BagOfWords.terms("Hello Yin Yin", false)
+    dict = Dictionary.new
+    docs.each{|doc| dict.add doc}
+    assert_equal(2, dict.terms["hello"])
+    assert_equal(2, dict.terms["yin"])
+    assert_equal(0, dict.terms["bye"])
+    assert_equal(1, dict.terms["world"])
+  end
+  def test_tf_idf
+    docs = []
+    docs << BagOfWords.terms("Hello World", false)
+    docs << BagOfWords.terms("Hello Yin Yin", false)
+    dict = Dictionary::TF_IDF.new
+    docs.each{|doc| dict.add doc}
+    assert_equal(2, dict.terms["hello"])
+    assert_equal(2, dict.terms["yin"])
+    assert_equal(0, dict.terms["bye"])
+    assert_equal(1, dict.terms["world"])
+    assert_equal(1,   dict.df["hello"])
+    assert_equal(0.5, dict.df["yin"])
+    assert_equal(0,   dict.df["bye"])
+    assert_equal(0.5, dict.df["world"])
+    assert_equal(2.0/5, dict.tf["hello"])
+    assert_equal(2.0/5, dict.tf["yin"])
+    assert_equal(0,     dict.tf["bye"])
+    assert_equal(1.0/5,   dict.tf["world"])
+    assert_equal(Math::log(1), dict.idf["hello"])
+    assert_equal(Math::log(2), dict.idf["yin"])
+    assert_equal(0,            dict.idf["bye"])
+    assert_equal(Math::log(2), dict.idf["world"])
+    assert_equal(2.0/5 * Math::log(1),   dict.tf_idf["hello"])
+    assert_equal(2.0/5 * Math::log(2), dict.tf_idf["yin"])
+    assert_equal(0,                      dict.tf_idf["bye"])
+    assert_equal(1.0/5 * Math::log(2), dict.tf_idf["world"])
+  end
+  def test_best
+    docs = []
+    docs << BagOfWords.terms("Hello World", false)
+    docs << BagOfWords.terms("Hello Yin Yin", false)
+    dict = Dictionary::TF_IDF.new
+    docs.each{|doc| dict.add doc}
+    assert_equal(1, dict.best(:limit => 1).length)
+    assert(dict.best(:limit => 1).include? "yin")
+  end
+  def test_kl
+    docs = []
+    docs << [BagOfWords.terms("Hello World", false), :+]
+    docs << [BagOfWords.terms("Hello Cruel World", false), :+]
+    docs << [BagOfWords.terms("Hello Yan Yan", false), :-]
+    docs << [BagOfWords.terms("Hello Yin Yin", false), :-]
+    dict = Dictionary::KL.new
+    docs.each{|doc| dict.add *doc}
+    assert_equal(0, dict.kl["hello"])
+    assert_equal(dict.kl['yan'], dict.kl['yin'])
+    assert_in_delta(1 * Math::log(1 / 0.000001), dict.kl["world"],0.01)
+    assert_in_delta(0.5 * Math::log(0.5 / 0.000001), dict.kl["cruel"],0.01)
+  end
+end

data/test/rbbt/ner/rnorm/test_cue_index.rb ADDED Viewed

@@ -0,0 +1,57 @@
+require File.dirname(__FILE__) + '/../../../test_helper'
+require 'rbbt/ner/rnorm/cue_index'
+require 'rbbt/util/misc'
+require 'rbbt/util/tmpfile'
+require 'rbbt/util/open'
+require 'test/unit'
+class TestCUE < Test::Unit::TestCase
+  def setup
+    @index = CueIndex.new do
+      equal    do |w| [w] end
+      standard do |w| [w.downcase.split(/\s+/).sort.join("")] end
+      special  do |w| s = w.split.select{|w| w.is_special?}.collect{|w| w.downcase.sub(/p$/,'')} end
+      words    do |w|
+        w.scan(/[a-z]+/i).
+          select{|w| w.length > 2}.
+          sort{|a,b| b.length <=> a.length}.
+          collect{|n| n.downcase}
+      end
+    end
+  end
+  def test_cue
+    assert_equal([["Hsp70 gene"], ["genehsp70"], ["hsp70"], ["gene", "hsp"]], @index.cues("Hsp70 gene"))
+  end
+  def test_load
+    tmp = TmpFile.tmp_file("test_cue")
+    lexicon =<<-EOT
+code1\tNAME1\tname 1
+code2\tNAME2\tname 2
+    EOT
+    Open.write(tmp,lexicon)
+    assert_raise(CueIndex::LexiconMissingError){@index.match("NAME2")}
+    @index.load(tmp)
+    assert_equal(["code2"], @index.match("NAME2"))
+    FileUtils.rm tmp
+  end
+  #def test_yeast
+  #  index  = CueIndex.new
+  #  index.load(File.join(Rbbt.datadir,'biocreative','BC1GN','yeast','synonyms.list'))
+  #  assert(index.match("Met - 31").include? 'S0005959')
+  #end
+  #def test_mouse
+  #  index  = CueIndex.new
+  #  index.load(File.join(Rbbt.datadir,'biocreative','BC1GN','mouse','synonyms.list'))
+  #  puts index.match("kreisler gene").length
+  #end
+end

data/test/rbbt/ner/rnorm/test_tokens.rb ADDED Viewed

@@ -0,0 +1,70 @@
+require File.dirname(__FILE__) + '/../../../test_helper'
+require 'rbbt/ner/rnorm/tokens'
+require 'rbbt/util/misc'
+require 'rbbt/util/tmpfile'
+require 'rbbt/util/open'
+require 'test/unit'
+class TestCompare < Test::Unit::TestCase
+  def setup
+    @index = Tokenizer.new
+  end
+  def test_type
+    assert_equal(:gene, @index.type("gene"))
+    assert_equal(:dna, @index.type("dna"))
+    assert_equal(:number, @index.type("121"))
+  end
+  def test_token_types
+    assert_equal([["dna", :dna], ["12", :number]], @index.token_types("dna12"))
+    assert_equal([["REX", :special], ["12", :number]], @index.token_types("REX12"))
+    assert_equal([["SSH", :special], ["3", :number], ["BP", :special]], @index.token_types("SSH3BP"))
+    assert_equal([["HP", :special], ["1", :number], ["gamma", :greek]], @index.token_types("HP1gamma"))
+    assert_equal([["HP", :special], ["1", :number], ["GAMMA", :greek]], @index.token_types("HP1-GAMMA"))
+  end
+  def test_eval
+    assert_equal(3, @index.evaluate_tokens(@index.token_types("1"), @index.token_types("1")))
+  end
+  def test_transforms
+    t = Tokenizer::Transform.new.unknown do |t| [t, if t.length < 4 then :special else :unknown end] end
+    assert_equal(["BP", :special], t.transform(["BP",:unknown]))
+  end
+  def test_comparisons
+    assert_equal(0, Tokenizer::Operation.new(:same).number(3).eval(@index.token_types("SSH1"),@index.token_types("SSH2")))
+    assert_equal(3, Tokenizer::Operation.new(:same).number(3).eval(@index.token_types("SSH1"),@index.token_types("SSH1")))
+    assert_equal(0, Tokenizer::Operation.new(:same).special(1).eval([["SSH", :special],["1", :number]],[["SSH", :special],["3", :number],["BP",:special]]))
+    assert_equal(-1, Tokenizer::Operation.new(:diff).special(-1).eval([["SSH", :special],["1", :number]],[["SSH", :special],["3", :number],["BP",:special]]))
+    assert_equal(-1, Tokenizer::Operation.new(:extr).special(-1).eval([["SSH", :special],["1", :number]],[["SSH", :special],["3", :number],["BP",:special]]))
+    assert_equal(-1, Tokenizer::Operation.new(:miss).special(-1).eval([["SSH", :special],["3", :number],["BP",:special]],[["SSH", :special],["1", :number]]))
+  end
+  def test_ignore_case
+    assert_equal(-1, Tokenizer::Operation.new(:diff).ignore_case(false).special(-1).eval([["ssh", :special]],[["SSH", :special]]))
+    assert_equal(0, Tokenizer::Operation.new(:diff).ignore_case(true).special(-1).eval([["ssh", :special]],[["SSH", :special]]))
+  end
+  def test_compare
+     assert_equal(-10, @index.evaluate("DNA1", "GENE2"))
+     assert_equal(3, @index.evaluate("DNA1", "GENE1"))
+     assert_equal(3, @index.evaluate("DNA1", "RNA1"))
+     assert_equal(-1, @index.evaluate("SSH", "SSH1"))
+     assert_equal(7, @index.evaluate("pol III", "POL3"))
+  end
+  def test_default
+    index = Tokenizer.new
+    assert(index.evaluate("SSH", "SSH1") > index.evaluate("SSH", "SSH3BP"))
+    assert(index.evaluate("HP1gamma", "HP1-GAMMA") > 1)
+    assert(index.evaluate("HP1alpha", "HP1 alpha") > 1)
+    assert(index.evaluate("IL-1beta", "IL-1 beta") > 1)
+    assert(index.evaluate("IL-1RI", "IL-1R-1") > 1)
+    assert(index.evaluate("MODI", "MOD 1") > 1)
+    assert(index.evaluate("MOD 1", "MODI") > 1)
+    assert(index.evaluate("Ubc3", "Ubc3b") > 1)
+  end
+end

data/test/rbbt/ner/test_abner.rb ADDED Viewed

@@ -0,0 +1,17 @@
+require File.dirname(__FILE__) + '/../../test_helper'
+require 'rbbt/ner/abner'
+require 'test/unit'
+class TestAbner < Test::Unit::TestCase
+  def test_extract
+      ner = Abner.new
+      mentions = ner.extract(" The P-ITIM-compelled multi-phosphoprotein complex binds to and activates SHP-2, which in turn dephosphorylates SHIP and Shc and probably other substrates.")
+      ["SHP-2", "SHIP", "Shc"].each{|mention|
+        assert(mentions.include? mention)
+      }
+  end
+end

data/test/rbbt/ner/test_banner.rb ADDED Viewed

@@ -0,0 +1,17 @@
+require File.dirname(__FILE__) + '/../../test_helper'
+require 'rbbt/ner/banner'
+require 'test/unit'
+class TestBanner < Test::Unit::TestCase
+  def test_extract
+      ner = Banner.new
+      mentions = ner.extract(" The P-ITIM-compelled multi-phosphoprotein complex binds to and activates SHP-2, which in turn dephosphorylates SHIP and Shc and probably other substrates.")
+      ["SHP - 2", "SHIP", "Shc"].each{|mention|
+        assert(mentions.include? mention)
+      }
+  end
+end

data/test/rbbt/ner/test_dictionaryNER.rb ADDED Viewed

@@ -0,0 +1,122 @@
+require 'rbbt'
+require 'rbbt/util/tmpfile'
+require 'rbbt/util/open'
+require 'rbbt/ner/dictionaryNER'
+require 'test/unit'
+class TestDictionaryNER < Test::Unit::TestCase
+  def setup
+    @dictionary  =<<-EOT
+DICT1\tWord1 Word2\tWord1
+DICT2\tWord3-Word4\tWord4
+    EOT
+    @dict = {
+      "word1" => [{'word2' => ['DICT1'] }, 'DICT1'],
+      "word3" => [{'word4' => ['DICT2'] }],
+      "word4" => ['DICT2'],
+    }
+  end
+  def test_simplify
+    assert_equal('word1', DictionaryNER.simplify( "Word1"))
+    assert_equal('ACL', DictionaryNER.simplify("ACL"))
+  end
+  def test_chunk
+    assert_equal(["Word1","Word2"], DictionaryNER.chunk('Word1-Word2'))
+    assert_equal(["Word1-1"], DictionaryNER.chunk('Word1-1'))
+  end
+  def test_match
+    [
+      ["Word1", {"word1" => ["D1"]}, {"Word1" => ["D1"]}],
+      ["Word1 Word1", {"word1" => ["D1"]}, {"Word1" => ["D1"]}],
+      ["Word2 Word1 Word3", {"word1" => ["D1"]}, {"Word1" => ["D1"]} ],
+      ["Word2 Word1 Word4", {"word1" => ["D1","D2"]}, {"Word1" => ["D1","D2"]} ],
+      ["Word2 Word1 Word4",
+        {"word1" => [{'word2' => ['D1']}]},
+        {} ],
+      [
+        "Word2 Word1 Word4",
+        {"word1" => [ {'word4' => ['D1']} ] },
+        {"Word1 Word4" => ["D1"]},
+      ],
+      [
+        "Word2 Word1 Word4",
+        {"word1" => [ {'word4' => ['D1']} ], "word4" => ['D2'] },
+        {"Word1 Word4" => ["D1"], "Word4" => ['D2']},
+      ],
+    ].each{|match_info|
+      text   = match_info[0]
+      dict   = match_info[1]
+      result = match_info[2]
+      assert_equal(result, DictionaryNER.match(dict, text))
+    }
+  end
+  def test_add_name
+    [
+      ["Word1", {"word1" => ['code']}],
+      ["Word1 Word2", {"word1" => [{"word2" => ['code']}]}],
+      ["Cerebellar stroke syndrome", {"cerebellar" => [{'stroke' => [{'syndrome' => ['code']}]}]}]
+    ].each{|info|
+      name = info[0]
+      result = info[1]
+      dict = {}
+      DictionaryNER.add_name(dict, name, 'code')
+      assert_equal(result, dict)
+    }
+  end
+  def test_load
+    assert_equal(@dict, DictionaryNER.load(@dictionary))
+  end
+  def test_class
+    ner = DictionaryNER.new(@dictionary)
+    [
+      [ "Word1 Word2", ["Word1 Word2", "Word1"] ],
+      [ "foo Word1 Word2 foo", ["Word1 Word2", "Word1"] ],
+      [ "Word1-Word2", ["Word1 Word2", "Word1"] ],
+      [ "Word1\nWord2", ["Word1 Word2", "Word1"] ],
+    ].each{|info|
+      text = info[0]
+      keys = info[1]
+      assert_equal(keys.sort, ner.match(text).keys.sort)
+    }
+  end
+  def test_load_from_file
+    tmpfile = TmpFile.tmp_file
+    Open.write(tmpfile, @dictionary)
+    ner = DictionaryNER.new(tmpfile)
+    assert(ner.match("Word1").any?)
+  end
+end