RubyGems - rbbt - Versions diffs - 1.2.5 → 2.0.0 - Mend

rbbt 1.2.5 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (88)

checksums.yaml +7 -0
data/README.rdoc +2 -138
metadata +69 -214
data/LICENSE +0 -20
data/bin/rbbt_config +0 -245
data/install_scripts/classifier/R/classify.R +0 -36
data/install_scripts/classifier/Rakefile +0 -140
data/install_scripts/get_abner.sh +0 -2
data/install_scripts/get_banner.sh +0 -25
data/install_scripts/get_biocreative.sh +0 -72
data/install_scripts/get_crf++.sh +0 -26
data/install_scripts/get_entrez.sh +0 -4
data/install_scripts/get_go.sh +0 -4
data/install_scripts/get_polysearch.sh +0 -8
data/install_scripts/ner/Rakefile +0 -206
data/install_scripts/ner/config/default.rb +0 -52
data/install_scripts/norm/Rakefile +0 -219
data/install_scripts/norm/config/cue_default.rb +0 -10
data/install_scripts/norm/config/tokens_default.rb +0 -86
data/install_scripts/norm/functions.sh +0 -23
data/install_scripts/organisms/Ath.Rakefile +0 -55
data/install_scripts/organisms/Cal.Rakefile +0 -84
data/install_scripts/organisms/Cel.Rakefile +0 -109
data/install_scripts/organisms/Hsa.Rakefile +0 -140
data/install_scripts/organisms/Mmu.Rakefile +0 -77
data/install_scripts/organisms/Rakefile +0 -43
data/install_scripts/organisms/Rno.Rakefile +0 -88
data/install_scripts/organisms/Sce.Rakefile +0 -66
data/install_scripts/organisms/Spo.Rakefile +0 -40
data/install_scripts/organisms/rake-include.rb +0 -252
data/install_scripts/wordlists/consonants +0 -897
data/install_scripts/wordlists/stopwords +0 -1
data/lib/rbbt.rb +0 -83
data/lib/rbbt/bow/bow.rb +0 -88
data/lib/rbbt/bow/classifier.rb +0 -116
data/lib/rbbt/bow/dictionary.rb +0 -187
data/lib/rbbt/ner/abner.rb +0 -34
data/lib/rbbt/ner/banner.rb +0 -73
data/lib/rbbt/ner/dictionaryNER.rb +0 -98
data/lib/rbbt/ner/regexpNER.rb +0 -70
data/lib/rbbt/ner/rner.rb +0 -227
data/lib/rbbt/ner/rnorm.rb +0 -143
data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
data/lib/rbbt/ner/rnorm/tokens.rb +0 -217
data/lib/rbbt/sources/biocreative.rb +0 -75
data/lib/rbbt/sources/biomart.rb +0 -105
data/lib/rbbt/sources/entrez.rb +0 -211
data/lib/rbbt/sources/go.rb +0 -85
data/lib/rbbt/sources/gscholar.rb +0 -74
data/lib/rbbt/sources/organism.rb +0 -241
data/lib/rbbt/sources/polysearch.rb +0 -117
data/lib/rbbt/sources/pubmed.rb +0 -248
data/lib/rbbt/util/arrayHash.rb +0 -266
data/lib/rbbt/util/filecache.rb +0 -72
data/lib/rbbt/util/index.rb +0 -47
data/lib/rbbt/util/misc.rb +0 -106
data/lib/rbbt/util/open.rb +0 -251
data/lib/rbbt/util/rake.rb +0 -183
data/lib/rbbt/util/simpleDSL.rb +0 -87
data/lib/rbbt/util/tmpfile.rb +0 -35
data/tasks/install.rake +0 -124
data/test/rbbt/bow/test_bow.rb +0 -33
data/test/rbbt/bow/test_classifier.rb +0 -72
data/test/rbbt/bow/test_dictionary.rb +0 -91
data/test/rbbt/ner/rnorm/test_cue_index.rb +0 -57
data/test/rbbt/ner/rnorm/test_tokens.rb +0 -70
data/test/rbbt/ner/test_abner.rb +0 -17
data/test/rbbt/ner/test_banner.rb +0 -17
data/test/rbbt/ner/test_dictionaryNER.rb +0 -122
data/test/rbbt/ner/test_regexpNER.rb +0 -33
data/test/rbbt/ner/test_rner.rb +0 -126
data/test/rbbt/ner/test_rnorm.rb +0 -47
data/test/rbbt/sources/test_biocreative.rb +0 -38
data/test/rbbt/sources/test_biomart.rb +0 -31
data/test/rbbt/sources/test_entrez.rb +0 -49
data/test/rbbt/sources/test_go.rb +0 -24
data/test/rbbt/sources/test_organism.rb +0 -59
data/test/rbbt/sources/test_polysearch.rb +0 -27
data/test/rbbt/sources/test_pubmed.rb +0 -39
data/test/rbbt/util/test_arrayHash.rb +0 -257
data/test/rbbt/util/test_filecache.rb +0 -37
data/test/rbbt/util/test_index.rb +0 -31
data/test/rbbt/util/test_misc.rb +0 -20
data/test/rbbt/util/test_open.rb +0 -110
data/test/rbbt/util/test_simpleDSL.rb +0 -57
data/test/rbbt/util/test_tmpfile.rb +0 -21
data/test/test_helper.rb +0 -4
data/test/test_rbbt.rb +0 -11

data/lib/rbbt/util/tmpfile.rb DELETED

@@ -1,35 +0,0 @@
-require 'fileutils'
-require 'rbbt'
-module TmpFile
-  # Creates a random file name, with the given suffix and a random number
-  # up to +max+
-  def self.random_name( s="",max=10000000)
-    n = rand(max)
-    s << n.to_s
-    s
-  end
-  # Creates a random filename in the temporary directory
-  def self.tmp_file(s = "",max=10000000)
-    File.join(Rbbt.tmpdir,random_name(s,max))
-  end
-  def self.with_file(content = nil)
-    tmpfile = tmp_file
-    File.open(tmpfile, 'w') do |f| f.write content end if content != nil
-    result = yield(tmpfile)
-    FileUtils.rm tmpfile if File.exists? tmpfile
-    result
-  end
-  class << self
-    alias :new :tmp_file
-  end
-end

data/tasks/install.rake DELETED

@@ -1,124 +0,0 @@
-require 'rbbt'
-$datadir = Rbbt.datadir
-$scriptdir = File.join(File.expand_path(Rbbt.rootdir), '/install_scripts')
-task 'abner' do
-  directory = "#{$datadir}/third_party/abner/"
-  if !File.exists?(File.join(directory, 'abner.jar')) || $force
-    FileUtils.mkdir_p directory
-    `cd #{directory};rm -Rf *; #{$scriptdir}/get_abner.sh;cd -`
-  end
-end
-task 'banner' do
-  directory = "#{$datadir}/third_party/banner/"
-  if !File.exists?(File.join(directory, 'banner.jar')) || $force
-    FileUtils.mkdir_p directory
-    `cd #{directory};rm -Rf *; #{$scriptdir}/get_banner.sh;cd -`
-  end
-end
-task 'crf++' do
-  directory = "#{$datadir}/third_party/crf++/"
-  if !File.exists?(File.join(directory, 'ruby/CRFPP.so')) || $force
-    FileUtils.mkdir_p directory
-    `cd #{directory};rm -Rf *; #{$scriptdir}/get_crf++.sh;cd -`
-  end
-end
-task 'wordlists' do
-  FileUtils.cp_r File.join($scriptdir, 'wordlists/'), $datadir
-end
-task 'polysearch' do
-  directory = "#{$datadir}/dbs/polysearch/"
-  if !File.exists?(File.join(directory,'disease.txt')) || $force
-    FileUtils.mkdir_p directory
-    `cd #{directory}/; rm * -Rf; #{$scriptdir}/get_polysearch.sh;cd -`
-  end
-end
-task '3party' => %w(abner banner crf++)
-task 'entrez' do
-  directory = "#{$datadir}/dbs/entrez/"
-  if !File.exists?(File.join(directory,'gene_info')) || $force
-    FileUtils.mkdir_p directory
-    `cd #{directory}/; rm * -Rf; #{$scriptdir}/get_entrez.sh;cd -`
-  end
-end
-task 'go' do
-  directory = "#{$datadir}/dbs/go/"
-  if !File.exists?(File.join(directory,'gene_ontology.obo')) || $force
-    FileUtils.mkdir_p directory
-    `cd #{directory}/; rm * -Rf; #{$scriptdir}/get_go.sh;cd -`
-  end
-end
-task 'biocreative' do
-  directory = "#{$datadir}/biocreative/"
-  if !File.exists?(File.join(directory, 'BC2GN')) || $force
-    FileUtils.mkdir_p directory
-    `cd #{directory};rm -Rf *; #{$scriptdir}/get_biocreative.sh;cd -`
-  end
-end
-task 'datasets' => %w(entrez biocreative)
-task 'organisms' do
-  directory = "#{$datadir}/organisms"
-  FileUtils.mkdir_p directory
-  %w(Rakefile rake-include.rb).each{|f|
-    FileUtils.cp_r File.join($scriptdir, "organisms/#{ f }"), directory
-  }
-  Dir.glob(File.join($scriptdir, "organisms/*.Rakefile")).each{|f|
-    org = File.basename(f).sub(/.Rakefile/,'')
-    if !File.exists?(File.join(directory, org))
-      FileUtils.mkdir_p File.join(directory, org)
-    end
-    FileUtils.cp f , File.join(directory, "#{ org }/Rakefile")
-  }
-  `cd #{directory}; rake names`
-end
-task 'ner' do
-  directory = "#{$datadir}/ner"
-  FileUtils.mkdir_p directory
-  %w(Rakefile config).each{|f|
-    FileUtils.cp_r File.join($scriptdir, "ner/#{ f }"), directory
-  }
-  %w(data model results).each{|d|
-    FileUtils.mkdir_p File.join(directory, d)
-  }
-end
-task 'norm' do
-  directory = "#{$datadir}/norm"
-  FileUtils.mkdir_p directory
-  %w(Rakefile config functions.sh).each{|f|
-    FileUtils.cp_r File.join($scriptdir, "norm/#{ f }"), directory
-  }
- %w(results models).each{|d|
-  FileUtils.mkdir_p File.join(directory, d)
-  }
-end
-task 'classifier' do
-  directory = "#{$datadir}/classifier"
-  FileUtils.mkdir_p directory
-  %w(Rakefile R).each{|f|
-    FileUtils.cp_r File.join($scriptdir, "classifier/#{ f }"), directory
-  }
-  %w(data model results).each{|d|
-    FileUtils.mkdir_p File.join(directory, d)
-  }
-end

data/test/rbbt/bow/test_bow.rb DELETED

@@ -1,33 +0,0 @@
-require File.dirname(__FILE__) + '/../../test_helper'
-require 'rbbt/bow/bow'
-require 'test/unit'
-class TestBow < Test::Unit::TestCase
-  def test_words
-    assert_equal(["hello", "world"], "Hello World".words)
-  end
-  def test_terms
-    text = "Hello World"
-    assert_equal(["hello", "world"], BagOfWords.terms(text,false).keys.sort)
-    assert_equal(["hello", "hello world", "world"], BagOfWords.terms(text,true).keys.sort)
-  end
-  def test_features
-    text = "Hello world!"
-    text += "Hello World Again!"
-    assert_equal([2, 2], BagOfWords.features(text, "Hello World".words.uniq.sort))
-  end
-  def test_stem
-    assert_equal(["protein"], "Proteins".words)
-  end
-end

data/test/rbbt/bow/test_classifier.rb DELETED

@@ -1,72 +0,0 @@
-require File.dirname(__FILE__) + '/../../test_helper'
-require 'rbbt/bow/classifier'
-require 'rbbt/util/tmpfile'
-require 'rbbt/util/open'
-require 'test/unit'
-class TestClassifier < Test::Unit::TestCase
-  def test_build_model
-    features =<<-EOT
-Name	Class	hello	world
-row1	-	2	0
-row2	+	0	2
-    EOT
-    featuresfile = TmpFile.tmp_file("test_classifier")
-    modelfile = TmpFile.tmp_file("test_classifier")
-    Open.write(featuresfile, features)
-    Classifier.create_model(featuresfile, modelfile)
-    assert(File.exist? modelfile)
-    FileUtils.rm featuresfile
-    FileUtils.rm modelfile
-  end
-  def test_classifier
-    features =<<-EOT
-Name	Class	hello	world
-row1	-	2	0
-row2	+	0	2
-    EOT
-    featuresfile = TmpFile.tmp_file("test_classifier")
-    modelfile = TmpFile.tmp_file("test_classifier")
-    Open.write(featuresfile, features)
-    Classifier.create_model(featuresfile, modelfile)
-    FileUtils.rm featuresfile
-    classifier = Classifier.new(modelfile)
-    assert_equal(["hello", "world"], classifier.terms)
-    assert_equal(["-", "+"], classifier.classify_feature_array([[1,0],[0,1]]))
-    assert_equal({"negative"=>"-", "positive"=>"+"}, classifier.classify_feature_hash({:positive => [0,1], :negative => [1,0]}))
-    assert_equal({"negative"=>"-", "positive"=>"+"}, classifier.classify_feature_hash({:positive => [0,1], :negative => [1,0]}))
-    assert_equal(["-", "+"], classifier.classify_text_array(["Hello","World"]))
-    assert_equal({"negative"=>"-", "positive"=>"+"}, classifier.classify_text_hash({:negative => "Hello", :positive =>"World"}))
-    assert_equal('-', classifier.classify("Hello"))
-    assert_equal(["-", "+"],classifier.classify([[1,0],[0,1]]))
-    assert_equal({"negative"=>"-", "positive"=>"+"},classifier.classify({:positive => [0,1], :negative => [1,0]}))
-    assert_equal(["-", "+"],classifier.classify(["Hello","World"]))
-    #assert_equal({"negative"=>"-", "positive"=>"+"},classifier.classify({:negative => "Hello", :positive => "World"}))
-    #assert_nothing_raised do classifier.classify("Unknown terms") end
-    #assert_nothing_raised do classifier.classify([]) end
-    FileUtils.rm modelfile
-  end
-end

data/test/rbbt/bow/test_dictionary.rb DELETED

@@ -1,91 +0,0 @@
-require File.dirname(__FILE__) + '/../../test_helper'
-require 'rbbt/bow/dictionary'
-require 'rbbt/bow/bow'
-require 'test/unit'
-class TestDictionary < Test::Unit::TestCase
-  def test_standard
-    docs = []
-    docs << BagOfWords.terms("Hello World", false)
-    docs << BagOfWords.terms("Hello Yin Yin", false)
-    dict = Dictionary.new
-    docs.each{|doc| dict.add doc}
-    assert_equal(2, dict.terms["hello"])
-    assert_equal(2, dict.terms["yin"])
-    assert_equal(0, dict.terms["bye"])
-    assert_equal(1, dict.terms["world"])
-  end
-  def test_tf_idf
-    docs = []
-    docs << BagOfWords.terms("Hello World", false)
-    docs << BagOfWords.terms("Hello Yin Yin", false)
-    dict = Dictionary::TF_IDF.new
-    docs.each{|doc| dict.add doc}
-    assert_equal(2, dict.terms["hello"])
-    assert_equal(2, dict.terms["yin"])
-    assert_equal(0, dict.terms["bye"])
-    assert_equal(1, dict.terms["world"])
-    assert_equal(1,   dict.df["hello"])
-    assert_equal(0.5, dict.df["yin"])
-    assert_equal(0,   dict.df["bye"])
-    assert_equal(0.5, dict.df["world"])
-    assert_equal(2.0/5, dict.tf["hello"])
-    assert_equal(2.0/5, dict.tf["yin"])
-    assert_equal(0,     dict.tf["bye"])
-    assert_equal(1.0/5,   dict.tf["world"])
-    assert_equal(Math::log(1), dict.idf["hello"])
-    assert_equal(Math::log(2), dict.idf["yin"])
-    assert_equal(0,            dict.idf["bye"])
-    assert_equal(Math::log(2), dict.idf["world"])
-    assert_equal(2.0/5 * Math::log(1),   dict.tf_idf["hello"])
-    assert_equal(2.0/5 * Math::log(2), dict.tf_idf["yin"])
-    assert_equal(0,                      dict.tf_idf["bye"])
-    assert_equal(1.0/5 * Math::log(2), dict.tf_idf["world"])
-  end
-  def test_best
-    docs = []
-    docs << BagOfWords.terms("Hello World", false)
-    docs << BagOfWords.terms("Hello Yin Yin", false)
-    dict = Dictionary::TF_IDF.new
-    docs.each{|doc| dict.add doc}
-    assert_equal(1, dict.best(:limit => 1).length)
-    assert(dict.best(:limit => 1).include? "yin")
-  end
-  def test_kl
-    docs = []
-    docs << [BagOfWords.terms("Hello World", false), :+]
-    docs << [BagOfWords.terms("Hello Cruel World", false), :+]
-    docs << [BagOfWords.terms("Hello Yan Yan", false), :-]
-    docs << [BagOfWords.terms("Hello Yin Yin", false), :-]
-    dict = Dictionary::KL.new
-    docs.each{|doc| dict.add *doc}
-    assert_equal(0, dict.kl["hello"])
-    assert_equal(dict.kl['yan'], dict.kl['yin'])
-    assert_in_delta(1 * Math::log(1 / 0.000001), dict.kl["world"],0.01)
-    assert_in_delta(0.5 * Math::log(0.5 / 0.000001), dict.kl["cruel"],0.01)
-  end
-end

data/test/rbbt/ner/rnorm/test_cue_index.rb DELETED

@@ -1,57 +0,0 @@
-require File.dirname(__FILE__) + '/../../../test_helper'
-require 'rbbt/ner/rnorm/cue_index'
-require 'rbbt/util/misc'
-require 'rbbt/util/tmpfile'
-require 'rbbt/util/open'
-require 'test/unit'
-class TestCUE < Test::Unit::TestCase
-  def setup
-    @index = CueIndex.new do
-      equal    do |w| [w] end
-      standard do |w| [w.downcase.split(/\s+/).sort.join("")] end
-      special  do |w| s = w.split.select{|w| w.is_special?}.collect{|w| w.downcase.sub(/p$/,'')} end
-      words    do |w|
-        w.scan(/[a-z]+/i).
-          select{|w| w.length > 2}.
-          sort{|a,b| b.length <=> a.length}.
-          collect{|n| n.downcase}
-      end
-    end
-  end
-  def test_cue
-    assert_equal([["Hsp70 gene"], ["genehsp70"], ["hsp70"], ["gene", "hsp"]], @index.cues("Hsp70 gene"))
-  end
-  def test_load
-    tmp = TmpFile.tmp_file("test_cue")
-    lexicon =<<-EOT
-code1\tNAME1\tname 1
-code2\tNAME2\tname 2
-    EOT
-    Open.write(tmp,lexicon)
-    assert_raise(CueIndex::LexiconMissingError){@index.match("NAME2")}
-    @index.load(tmp)
-    assert_equal(["code2"], @index.match("NAME2"))
-    FileUtils.rm tmp
-  end
-  #def test_yeast
-  #  index  = CueIndex.new
-  #  index.load(File.join(Rbbt.datadir,'biocreative','BC1GN','yeast','synonyms.list'))
-  #  assert(index.match("Met - 31").include? 'S0005959')
-  #end
-  #def test_mouse
-  #  index  = CueIndex.new
-  #  index.load(File.join(Rbbt.datadir,'biocreative','BC1GN','mouse','synonyms.list'))
-  #  puts index.match("kreisler gene").length
-  #end
-end

data/test/rbbt/ner/rnorm/test_tokens.rb DELETED

@@ -1,70 +0,0 @@
-require File.dirname(__FILE__) + '/../../../test_helper'
-require 'rbbt/ner/rnorm/tokens'
-require 'rbbt/util/misc'
-require 'rbbt/util/tmpfile'
-require 'rbbt/util/open'
-require 'test/unit'
-class TestCompare < Test::Unit::TestCase
-  def setup
-    @index = Tokenizer.new
-  end
-  def test_type
-    assert_equal(:gene, @index.type("gene"))
-    assert_equal(:dna, @index.type("dna"))
-    assert_equal(:number, @index.type("121"))
-  end
-  def test_token_types
-    assert_equal([["dna", :dna], ["12", :number]], @index.token_types("dna12"))
-    assert_equal([["REX", :special], ["12", :number]], @index.token_types("REX12"))
-    assert_equal([["SSH", :special], ["3", :number], ["BP", :special]], @index.token_types("SSH3BP"))
-    assert_equal([["HP", :special], ["1", :number], ["gamma", :greek]], @index.token_types("HP1gamma"))
-    assert_equal([["HP", :special], ["1", :number], ["GAMMA", :greek]], @index.token_types("HP1-GAMMA"))
-  end
-  def test_eval
-    assert_equal(3, @index.evaluate_tokens(@index.token_types("1"), @index.token_types("1")))
-  end
-  def test_transforms
-    t = Tokenizer::Transform.new.unknown do |t| [t, if t.length < 4 then :special else :unknown end] end
-    assert_equal(["BP", :special], t.transform(["BP",:unknown]))
-  end
-  def test_comparisons
-    assert_equal(0, Tokenizer::Operation.new(:same).number(3).eval(@index.token_types("SSH1"),@index.token_types("SSH2")))
-    assert_equal(3, Tokenizer::Operation.new(:same).number(3).eval(@index.token_types("SSH1"),@index.token_types("SSH1")))
-    assert_equal(0, Tokenizer::Operation.new(:same).special(1).eval([["SSH", :special],["1", :number]],[["SSH", :special],["3", :number],["BP",:special]]))
-    assert_equal(-1, Tokenizer::Operation.new(:diff).special(-1).eval([["SSH", :special],["1", :number]],[["SSH", :special],["3", :number],["BP",:special]]))
-    assert_equal(-1, Tokenizer::Operation.new(:extr).special(-1).eval([["SSH", :special],["1", :number]],[["SSH", :special],["3", :number],["BP",:special]]))
-    assert_equal(-1, Tokenizer::Operation.new(:miss).special(-1).eval([["SSH", :special],["3", :number],["BP",:special]],[["SSH", :special],["1", :number]]))
-  end
-  def test_ignore_case
-    assert_equal(-1, Tokenizer::Operation.new(:diff).ignore_case(false).special(-1).eval([["ssh", :special]],[["SSH", :special]]))
-    assert_equal(0, Tokenizer::Operation.new(:diff).ignore_case(true).special(-1).eval([["ssh", :special]],[["SSH", :special]]))
-  end
-  def test_compare
-     assert_equal(-10, @index.evaluate("DNA1", "GENE2"))
-     assert_equal(3, @index.evaluate("DNA1", "GENE1"))
-     assert_equal(3, @index.evaluate("DNA1", "RNA1"))
-     assert_equal(-1, @index.evaluate("SSH", "SSH1"))
-     assert_equal(7, @index.evaluate("pol III", "POL3"))
-  end
-  def test_default
-    index = Tokenizer.new
-    assert(index.evaluate("SSH", "SSH1") > index.evaluate("SSH", "SSH3BP"))
-    assert(index.evaluate("HP1gamma", "HP1-GAMMA") > 1)
-    assert(index.evaluate("HP1alpha", "HP1 alpha") > 1)
-    assert(index.evaluate("IL-1beta", "IL-1 beta") > 1)
-    assert(index.evaluate("IL-1RI", "IL-1R-1") > 1)
-    assert(index.evaluate("MODI", "MOD 1") > 1)
-    assert(index.evaluate("MOD 1", "MODI") > 1)
-    assert(index.evaluate("Ubc3", "Ubc3b") > 1)
-  end
-end