rbbt 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +20 -0
- data/README.rdoc +17 -0
- data/bin/rbbt_config +180 -0
- data/install_scripts/classifier/R/classify.R +36 -0
- data/install_scripts/classifier/Rakefile +140 -0
- data/install_scripts/get_abner.sh +2 -0
- data/install_scripts/get_banner.sh +25 -0
- data/install_scripts/get_biocreative.sh +72 -0
- data/install_scripts/get_crf++.sh +26 -0
- data/install_scripts/get_entrez.sh +4 -0
- data/install_scripts/get_go.sh +4 -0
- data/install_scripts/get_polysearch.sh +8 -0
- data/install_scripts/ner/Rakefile +206 -0
- data/install_scripts/ner/config/default.rb +52 -0
- data/install_scripts/norm/Rakefile +218 -0
- data/install_scripts/norm/config/cue_default.rb +10 -0
- data/install_scripts/norm/config/tokens_default.rb +79 -0
- data/install_scripts/norm/functions.sh +21 -0
- data/install_scripts/organisms/Rakefile +25 -0
- data/install_scripts/organisms/cgd.Rakefile +84 -0
- data/install_scripts/organisms/human.Rakefile +145 -0
- data/install_scripts/organisms/mgi.Rakefile +77 -0
- data/install_scripts/organisms/pombe.Rakefile +40 -0
- data/install_scripts/organisms/rake-include.rb +258 -0
- data/install_scripts/organisms/rgd.Rakefile +88 -0
- data/install_scripts/organisms/sgd.Rakefile +66 -0
- data/install_scripts/organisms/tair.Rakefile +54 -0
- data/install_scripts/organisms/worm.Rakefile +109 -0
- data/install_scripts/stopwords +1 -0
- data/install_scripts/wordlists/consonants +897 -0
- data/install_scripts/wordlists/stopwords +1 -0
- data/lib/rbbt/bow/bow.rb +87 -0
- data/lib/rbbt/bow/classifier.rb +118 -0
- data/lib/rbbt/bow/dictionary.rb +218 -0
- data/lib/rbbt/ner/abner.rb +34 -0
- data/lib/rbbt/ner/banner.rb +73 -0
- data/lib/rbbt/ner/regexpNER.rb +62 -0
- data/lib/rbbt/ner/rner.rb +227 -0
- data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
- data/lib/rbbt/ner/rnorm/tokens.rb +213 -0
- data/lib/rbbt/ner/rnorm.rb +142 -0
- data/lib/rbbt/sources/biocreative.rb +75 -0
- data/lib/rbbt/sources/biomart.rb +106 -0
- data/lib/rbbt/sources/entrez.rb +211 -0
- data/lib/rbbt/sources/go.rb +40 -0
- data/lib/rbbt/sources/organism.rb +197 -0
- data/lib/rbbt/sources/polysearch.rb +88 -0
- data/lib/rbbt/sources/pubmed.rb +111 -0
- data/lib/rbbt/util/arrayHash.rb +255 -0
- data/lib/rbbt/util/filecache.rb +72 -0
- data/lib/rbbt/util/index.rb +69 -0
- data/lib/rbbt/util/misc.rb +101 -0
- data/lib/rbbt/util/open.rb +207 -0
- data/lib/rbbt/util/simpleDSL.rb +87 -0
- data/lib/rbbt/util/tmpfile.rb +19 -0
- data/lib/rbbt/version.rb +10 -0
- data/lib/rbbt.rb +86 -0
- data/tasks/install.rake +123 -0
- metadata +114 -0
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
Copyright (c) 2009 Miguel Vazquez

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc
ADDED
@@ -0,0 +1,17 @@
= rbbt

Description goes here.

== Note on Patches/Pull Requests

* Fork the project.
* Make your feature addition or bug fix.
* Add tests for it. This is important so I don't break it in a
  future version unintentionally.
* Commit, do not mess with rakefile, version, or history.
  (if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
* Send me a pull request. Bonus points for topic branches.

== Copyright

Copyright (c) 2009 Miguel Vazquez. See LICENSE for details.
data/bin/rbbt_config
ADDED
@@ -0,0 +1,180 @@
#!/usr/bin/ruby

require 'rubygems'
require 'rake'

require 'simpleconsole'

begin
  require 'rbbt'
rescue Rbbt::NoConfig
  $noconfig = true
end

$USAGE =<<EOT
#{__FILE__} <action> [<subaction>] [--force] [--organism <org>]

actions:
 * configure: Set paths for data, cache, and tmp directories

 * install:
    * basic: Third party software
    * databases: Entrez and Biocreative
    * models: Gene Mention and Classification
    * organisms: Rules to gather data for organisms
    * all: 3party wordlists entrez biocreative go ner norm classifier organisms polysearch

 * update:
    * organisms: Gather data for organisms
    * ner: Build Named Entity Recognition Models for Gene Mention
    * classification:
        Build Function/Process Classifiers

 * purge_cache: Clean the non-persistent cache, which holds general things
   downloaded using Open.read, like organism identifiers downloaded from
   BioMart. The persistent cache, which hold pubmed articles or entrez gene
   descriptions, is not cleaned, as these are not likely to change

EOT

class Controller < SimpleConsole::Controller

  params :bool   => {:f => :force},
         :string => {:o => :organism}

  def default
    render :action => :usage
  end

  def help
    render :action => :usage
  end

  def update
    raise "Run #{__FILE__} configure first to configure rbbt" if $noconfig

    case params[:id]
    when "organisms"
      @location = File.join(Rbbt.datadir,'organisms')
    when "ner"
      @location = File.join(Rbbt.datadir,'ner')
    when "classifier"
      @location = File.join(Rbbt.datadir,'classifier')
    else
      redirect_to :action => :help, :id => :update
    end

    $force = true if params[:force]
    $org = params[:organism] if params[:organism]
  end

  def install
    raise "Run #{__FILE__} configure first to configure rbbt" if $noconfig

    case params[:id]
    when "basic"
      @tasks = %w(3party wordlists polysearch)
    when "databases"
      @tasks = %w(entrez biocreative go)
    when "models"
      @tasks = %w(ner norm classifier)
    when "organisms"
      @tasks = %w(organisms)
    when "all"
      @tasks = %w(3party wordlists entrez biocreative go ner norm classifier organisms polysearch)
    when nil
      redirect_to :action => :help, :id => :install
    else
      @tasks = [params[:id]]
    end

    $force = true if params[:force]
    $org = params[:organism] if params[:organism]
  end

  def configure
  end

  def purge_cache
  end

end

class View < SimpleConsole::View
  def usage
    puts $USAGE
  end

  def install
    load File.join(Rbbt.rootdir, 'tasks/install.rake')

    @tasks.each{|t|
      puts "Invoking #{ t }"
      Rake::Task[t].invoke
    }
  end

  def update
    puts "Changing directory to #{@location}"
    chdir @location

    load "./Rakefile"

    Rake::Task['default'].invoke
  end

  def configure
    defaultdir = File.join(ENV['HOME'],'rbbt')

    cachedir = File.join(defaultdir, 'cache')
    tmpdir   = File.join(defaultdir, 'tmp')
    datadir  = File.join(defaultdir, 'data')

    puts "Please indicate where you wish to place the data directories"
    puts

    puts
    puts "* Cache Directory: This directory will hold downloads, from PubMed,
Entrez and other, for local store. It might grow considerably."
    print "[#{ cachedir }]? "
    input = STDIN.gets
    cachedir = input if input =~ /\w/

    puts
    puts "* Tmp Directory: Temporary files."
    print "[#{ tmpdir }]? "
    input = STDIN.gets
    tmpdir = input if input =~ /\w/

    puts
    puts "* Data Directory: Holds data from organisms, databases, third party software, etc."
    print "[#{ datadir }]? "
    input = STDIN.gets
    datadir = input if input =~ /\w/

    fout = File.open(File.join(ENV['HOME'], '.rbbt'),'w')
    fout.puts "cachedir: #{cachedir}"
    fout.puts "tmpdir: #{tmpdir}"
    fout.puts "datadir: #{datadir}"
    fout.close
  end

  def purge_cache
    FileUtils.rm Dir.glob(File.join(Rbbt.cachedir,'open-remote','*'))
  end

end

SimpleConsole::Application.run(ARGV, Controller, View)
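For orientation, here is a hedged sketch of how this script might be invoked, pieced together from the USAGE string and the params mapping above; argument parsing is delegated to SimpleConsole, and "human" is only an illustrative organism name, so these are assumptions rather than documented commands.

  rbbt_config configure                    # interactively writes ~/.rbbt with cachedir/tmpdir/datadir
  rbbt_config install basic                # runs the 3party, wordlists and polysearch tasks from tasks/install.rake
  rbbt_config install all -o human         # all install tasks; -o/--organism sets $org for the organism tasks (assumed organism name)
  rbbt_config update organisms --force     # reruns the organisms Rakefile; --force sets $force for it
  rbbt_config purge_cache                  # clears the non-persistent open-remote cache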
data/install_scripts/classifier/R/classify.R
ADDED
@@ -0,0 +1,36 @@
library('e1071')

BOW.norm <- function(x, weights = NULL){
    x = 1 + log(x);
    x[x==-Inf] = 0;
    x.sum = as.matrix(x) %*% matrix(1,nrow=dim(x)[2],ncol=1);
    x.sum = matrix(100/x.sum,nrow=length(x.sum),ncol=dim(x)[2]);
    x.norm = x * x.sum;
    rm(x.sum);
    x.norm[is.na(x.norm)] = 0

    if (!is.null(weights)){
        x.norm = x.norm * matrix(abs(weights),ncol=length(weights),nrow=dim(x.norm)[1],byrow=T)
    }

    x.norm;
}


BOW.classification.model <- function(features, modelfile, dictfile = NULL){
    feats = read.table(features, sep="\t", header=T, row.names=1);

    if (!is.null(dictfile)){
        svm.weights = read.table(file=dictfile, sep="\t")[2];
    }else {
        svm.weights = NULL;
    }
    feats[-1] = BOW.norm(feats[-1], svm.weights);
    svm.model = svm(Class ~ ., data=feats, svm.weights);
    save(svm.model,svm.weights, file=modelfile);
}

BOW.classification.classify <- function(modelfile, x, weights = NULL){
    x = BOW.norm(x, weights);
    predict(modelfile, x);
}
data/install_scripts/classifier/Rakefile
ADDED
@@ -0,0 +1,140 @@
require 'rbbt'
require 'rbbt/sources/organism'
require 'rbbt/sources/pubmed'
require 'rbbt/bow/bow'
require 'rbbt/bow/dictionary'
require 'rbbt/bow/classifier'
require 'rbbt/util/misc'

require 'progress-monitor'
require 'rand'

$hi      = ENV['hi']  || 0.8
$low     = ENV['low'] || 0.01
$max     = ENV['max'] || 3000
$bigrams = ENV['bigrams'] == 'true' || false

$ndocs = ENV['ndocs'] || 5000

desc "Builds Dictionary and Features for an organism"
rule(/data\/(.*)/) do |t|
  org = File.basename(t.name)

  go  = Organism.gene_literature_go(org).collect{|gene, pmids| pmids}.flatten.uniq
  all = Organism.literature(org).flatten.uniq - go

  ndocs = [go.length, all.length, $ndocs.to_i].min
  puts "Using #{ ndocs } from each class\n\n"

  go  = go.shuffle[0..ndocs - 1]
  all = all.shuffle[0..ndocs - 1]

  dict = Dictionary::KL.new

  chunks = all.chunk(50)
  Progress.monitor("Building Dictionary for #{ org }: -",1000)
  chunks.each{|chunk|
    PubMed.get_article(chunk).each{|pmid, article|
      words = BagOfWords.terms(article.text,$bigrams)
      dict.add(words, :-)
    }
  }

  chunks = go.chunk(50)
  Progress.monitor("Building Dictionary for #{ org }: +",1000)
  chunks.each{|chunk|
    PubMed.get_article(chunk).each{|pmid, article|
      words = BagOfWords.terms(article.text,$bigrams)
      dict.add(words, :+)
    }
  }

  term_weigths = dict.weights(:low => $low.to_f, :hi => $hi.to_f, :limit => $max.to_i)
  Open.write(t.name + '.dict', term_weigths.sort.collect{|p| p.join("\t")}.join("\n"))

  terms = term_weigths.keys.sort

  fout = File.open(t.name, 'w')
  fout.puts((['Name','Class'] + terms).join("\t"))

  Progress.monitor("Building Features for #{ org }", 1000)
  all.each{|pmid|
    text = PubMed.get_article(pmid).text
    fout.puts(([pmid, :-] + BagOfWords.features(text, terms)).join("\t"))
  }
  go.each{|pmid|
    text = PubMed.get_article(pmid).text
    fout.puts(([pmid, :+] + BagOfWords.features(text, terms)).join("\t"))
  }

  fout.close
end

rule (/model\/(.*)/) => lambda{|n| n.sub(/model/,'data')} do |t|
  features = t.name.sub(/model/,'data')
  Classifier.create_model(features, t.name, features + '.dict')
end

rule (/results\/(.*)/) => lambda{|n| n.sub(/results/,'model')} do |t|
  model    = t.name.sub(/results/,'model')
  features = t.name.sub(/results/,'data')
  org      = File.basename(t.name)

  ndocs = 100

  used = Open.read(features).collect{|l| l.chomp.split(/\t/).first}[1..-1]

  classifier = Classifier.new(model)
  go  = Organism.gene_literature_go(org).collect{|gene, pmids| pmids}.flatten.uniq - used
  all = Organism.literature(org).flatten.uniq - go - used

  go  = go.shuffle[0..ndocs - 1]
  all = all.shuffle[0..ndocs - 1]

  ndocs = go.length + all.length

  raise "Not enough unused articles to evaluate" if go.empty? || all.empty?

  features_go = PubMed.get_article(go).collect{|pmid, article|
    article = article.text
  }
  pos = classifier.classify(features_go).select{|v| v == '+'}.length

  features_all = PubMed.get_article(all).collect{|pmid, article|
    article = article.text
  }
  neg = classifier.classify(features_all).select{|v| v == '-'}.length

  puts "#{ pos } #{ neg }"

  precision = (pos + neg) / (ndocs).to_f
  recall    = pos / go.length.to_f
  f1        = ( 2 * precision * recall) / (precision + recall ).to_f

  puts "Precision: #{ precision}, Recall: #{ recall }, F1: #{f1}"
end

task 'clean' do
  FileUtils.rm Dir.glob("data/*")
  FileUtils.rm Dir.glob("model/*")
  FileUtils.rm Dir.glob("results/*")
end

task 'all' do
  Organism.all.each{|org|
    Rake::Task["model/#{ org }"].invoke
  }
end

task 'update' do
  if $org
    FileUtils.rm Dir.glob("**/#{$org}.*") if $force
    Rake::Task["model/#{$org}"].invoke
  else
    Rake::Task['clean'].invoke if $force
    Rake::Task['all'].invoke
  end
end
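The rules above can in principle be driven with rake directly; the following is a minimal sketch, assuming the classifier data directory is the current directory (rbbt_config update classifier changes there before loading this Rakefile) and that "sgd" is a valid organism code; hi/low/max/bigrams/ndocs are the environment variables read at the top of the file.

  hi=0.8 low=0.01 max=1000 bigrams=true ndocs=2000 rake model/sgd   # builds data/sgd (dictionary + features), then the SVM model
  rake results/sgd                                                  # classifies held-out abstracts and prints Precision/Recall/F1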
data/install_scripts/get_banner.sh
ADDED
@@ -0,0 +1,25 @@
#!/bin/bash

wget "http://downloads.sourceforge.net/banner/BANNER_v02.zip?modtime=1196955449&big_mirror=0"
wget "http://downloads.sourceforge.net/banner/gene_model_v02.bin?modtime=1196955509&big_mirror=0"
mv BANNER_v02.zip BANNER.zip
mv gene_model_v02.bin gene_model.bin
unzip BANNER.zip
cd BANNER
libs=`find libs/ -name "*.jar"`
mkdir classes
javac -classpath `echo $libs|sed s/\ /:/g` -d classes `find src/ -name "*.java"`
cd classes
for f in ../libs/*.jar; do jar xf "$f";done
jar cf banner.jar *
mv banner.jar ../..
cd ..
cp -R nlpdata/ ../
cd ..
rm BANNER.zip
rm -Rf BANNER
data/install_scripts/get_biocreative.sh
ADDED
@@ -0,0 +1,72 @@
#!/bin/bash

mkdir src
cd src
wget "http://garr.dl.sourceforge.net/sourceforge/biocreative/bc2GNandGMgold_Subs.tar.gz"
wget "http://switch.dl.sourceforge.net/sourceforge/biocreative/biocreative1task1a.tar.gz"
wget "http://kent.dl.sourceforge.net/sourceforge/biocreative/biocreative1task1b.tar.gz"
wget "http://mesh.dl.sourceforge.net/sourceforge/biocreative/biocreative1task2.tar.gz"
wget "http://garr.dl.sourceforge.net/sourceforge/biocreative/bc2geneMention.tar.gz"
wget "http://switch.dl.sourceforge.net/sourceforge/biocreative/bc2normal.1.4.tar.gz"
wget "http://kent.dl.sourceforge.net/sourceforge/biocreative/bc2GNtest.zip"

for f in *.gz; do tar xfz $f; done
unzip bc2GNtest.zip

cd ..

mkdir BC2GM
cp -R src/bc2geneMention/train/ BC2GM/
cp -R src/sourceforgeDistrib-22-Sept-07/genemention/BC2GM/test/ BC2GM/
mv BC2GM/train/alt_eval.perl BC2GM/

mkdir BC2GN
cp -R src/biocreative2normalization/* BC2GN/
mv BC2GN/noisyTrainingData/ BC2GN/NoisyTrain
mv BC2GN/trainingData/ BC2GN/Train
cp -R src/bc2GNtest/bc2GNtestdocs/ BC2GN/Test
mv BC2GN/NoisyTrain/noisytrain.genelist BC2GN/NoisyTrain/genelist
mv BC2GN/Train/training.genelist BC2GN/Train/genelist
cp src/sourceforgeDistrib-22-Sept-07/genenormalization/bc2test.genelist BC2GN/Test/genelist

mkdir BC1GN
cp -R src/biocreative1/bc1task1b/* BC1GN/
mv BC1GN/fly/FlyDevTest/ BC1GN/fly/devtest
mv BC1GN/fly/FlyEvaluation/ BC1GN/fly/test
mv BC1GN/fly/FlyNoisyTraining/ BC1GN/fly/train
mv BC1GN/fly/*.list BC1GN/fly/synonyms.list
mv BC1GN/fly/test/*gene_list BC1GN/fly/test/genelist
for f in BC1GN/fly/train/gene_list/*; do cat "$f" >> BC1GN/fly/train/genelist;done
for f in BC1GN/fly/devtest/gene_lists/*; do cat "$f" >> BC1GN/fly/devtest/genelist;done
mv BC1GN/mouse/MouseDevTest/ BC1GN/mouse/devtest
mv BC1GN/mouse/MouseEvaluation/ BC1GN/mouse/test
mv BC1GN/mouse/MouseNoisyTraining/ BC1GN/mouse/train
mv BC1GN/mouse/*.list BC1GN/mouse/synonyms.list
mv BC1GN/mouse/test/*gene_list BC1GN/mouse/test/genelist
for f in BC1GN/mouse/train/gene_list/*; do cat "$f" >> BC1GN/mouse/train/genelist;done
for f in BC1GN/mouse/devtest/gene_lists/*; do cat "$f" >> BC1GN/mouse/devtest/genelist;done
mv BC1GN/yeast/YeastDevTest/ BC1GN/yeast/devtest
mv BC1GN/yeast/YeastEvaluation/ BC1GN/yeast/test
mv BC1GN/yeast/YeastNoisyTraining/ BC1GN/yeast/train
mv BC1GN/yeast/*.list BC1GN/yeast/synonyms.list
mv BC1GN/yeast/test/*gene_list BC1GN/yeast/test/genelist
for f in BC1GN/yeast/train/gene_list/*; do cat "$f" >> BC1GN/yeast/train/genelist;done
for f in BC1GN/yeast/devtest/gene_lists/*; do cat "$f" >> BC1GN/yeast/devtest/genelist;done
# Fix a bug in the perl script! :-|
cat BC1GN/task1Bscorer.pl |grep -v 'else {EVALFILE = STDIN;}' >foo; mv foo BC1GN/task1Bscorer.pl

rm -Rf src
data/install_scripts/get_crf++.sh
ADDED
@@ -0,0 +1,26 @@
wget "http://downloads.sourceforge.net/crfpp/CRF%2B%2B-0.51.tar.gz?modtime=1215793886&big_mirror=0" -O crf++.tar.gz
tar xvfz crf++.tar.gz
rm crf++.tar.gz
cd CRF*
PREFIX=$(dirname $PWD)

if [ `uname -m` == 'x86_64' ]; then
    WITH_PIC='--with-pic';
else
    WITH_PIC=''
fi

./configure --prefix=$PREFIX --exec-prefix=$PREFIX $WITH_PIC;
make install
cd ruby

ruby extconf.rb --with-opt-lib=$PREFIX/lib/ --with-opt-include=$PREFIX/include/
make
cc -shared -o CRFPP.so CRFPP_wrap.o ../../lib/libcrfpp.a -L. -L/usr/lib -L. -rdynamic -Wl,-export-dynamic -lruby -lpthread -lpthread -ldl -lcrypt -lm -lc -lstdc++

mkdir ../../ruby/
cp CRFPP.so ../../ruby/
cd ../../
rm -Rf CRF* include
data/install_scripts/get_polysearch.sh
ADDED
@@ -0,0 +1,8 @@
#!/bin/bash

wget http://wishart.biology.ualberta.ca/polysearch/include/disease_IDlist.txt -O disease.txt
wget http://wishart.biology.ualberta.ca/polysearch/include/organ_ID.txt -O organ.txt
wget http://wishart.biology.ualberta.ca/polysearch/include/tissue_ID.txt -O tissue.txt
wget http://wishart.biology.ualberta.ca/polysearch/include/subcellular_localization_ID.txt -O subcellular.txt
wget http://wishart.biology.ualberta.ca/polysearch/include/drugnames.txt -O drug.txt
wget http://wishart.biology.ualberta.ca/polysearch/include/HMDBnames.txt -O metabolite.txt