RubyGems - rbbt - Versions diffs - 1.0.0 - Mend

rbbt 1.0.0

Files changed (59) hide show

data/LICENSE +20 -0
data/README.rdoc +17 -0
data/bin/rbbt_config +180 -0
data/install_scripts/classifier/R/classify.R +36 -0
data/install_scripts/classifier/Rakefile +140 -0
data/install_scripts/get_abner.sh +2 -0
data/install_scripts/get_banner.sh +25 -0
data/install_scripts/get_biocreative.sh +72 -0
data/install_scripts/get_crf++.sh +26 -0
data/install_scripts/get_entrez.sh +4 -0
data/install_scripts/get_go.sh +4 -0
data/install_scripts/get_polysearch.sh +8 -0
data/install_scripts/ner/Rakefile +206 -0
data/install_scripts/ner/config/default.rb +52 -0
data/install_scripts/norm/Rakefile +218 -0
data/install_scripts/norm/config/cue_default.rb +10 -0
data/install_scripts/norm/config/tokens_default.rb +79 -0
data/install_scripts/norm/functions.sh +21 -0
data/install_scripts/organisms/Rakefile +25 -0
data/install_scripts/organisms/cgd.Rakefile +84 -0
data/install_scripts/organisms/human.Rakefile +145 -0
data/install_scripts/organisms/mgi.Rakefile +77 -0
data/install_scripts/organisms/pombe.Rakefile +40 -0
data/install_scripts/organisms/rake-include.rb +258 -0
data/install_scripts/organisms/rgd.Rakefile +88 -0
data/install_scripts/organisms/sgd.Rakefile +66 -0
data/install_scripts/organisms/tair.Rakefile +54 -0
data/install_scripts/organisms/worm.Rakefile +109 -0
data/install_scripts/stopwords +1 -0
data/install_scripts/wordlists/consonants +897 -0
data/install_scripts/wordlists/stopwords +1 -0
data/lib/rbbt/bow/bow.rb +87 -0
data/lib/rbbt/bow/classifier.rb +118 -0
data/lib/rbbt/bow/dictionary.rb +218 -0
data/lib/rbbt/ner/abner.rb +34 -0
data/lib/rbbt/ner/banner.rb +73 -0
data/lib/rbbt/ner/regexpNER.rb +62 -0
data/lib/rbbt/ner/rner.rb +227 -0
data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
data/lib/rbbt/ner/rnorm/tokens.rb +213 -0
data/lib/rbbt/ner/rnorm.rb +142 -0
data/lib/rbbt/sources/biocreative.rb +75 -0
data/lib/rbbt/sources/biomart.rb +106 -0
data/lib/rbbt/sources/entrez.rb +211 -0
data/lib/rbbt/sources/go.rb +40 -0
data/lib/rbbt/sources/organism.rb +197 -0
data/lib/rbbt/sources/polysearch.rb +88 -0
data/lib/rbbt/sources/pubmed.rb +111 -0
data/lib/rbbt/util/arrayHash.rb +255 -0
data/lib/rbbt/util/filecache.rb +72 -0
data/lib/rbbt/util/index.rb +69 -0
data/lib/rbbt/util/misc.rb +101 -0
data/lib/rbbt/util/open.rb +207 -0
data/lib/rbbt/util/simpleDSL.rb +87 -0
data/lib/rbbt/util/tmpfile.rb +19 -0
data/lib/rbbt/version.rb +10 -0
data/lib/rbbt.rb +86 -0
data/tasks/install.rake +123 -0
metadata +114 -0

data/LICENSE ADDED Viewed

@@ -0,0 +1,20 @@
+Copyright (c) 2009 Miguel Vazquez
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.rdoc ADDED Viewed

@@ -0,0 +1,17 @@
+= rbbt
+Description goes here.
+== Note on Patches/Pull Requests
+* Fork the project.
+* Make your feature addition or bug fix.
+* Add tests for it. This is important so I don't break it in a
+  future version unintentionally.
+* Commit, do not mess with rakefile, version, or history.
+  (If you want to have your own version, that is fine, but bump the version in a separate commit that I can ignore when I pull.)
+* Send me a pull request. Bonus points for topic branches.
+== Copyright
+Copyright (c) 2009 Miguel Vazquez. See LICENSE for details.

data/bin/rbbt_config ADDED Viewed

@@ -0,0 +1,180 @@
+#!/usr/bin/ruby
+# Command-line tool to configure rbbt and to install/update its data.
+require 'rubygems'
+require 'rake'
+require 'simpleconsole'
+# Loading rbbt may raise Rbbt::NoConfig (presumably when no configuration
+# exists yet -- confirm in lib/rbbt.rb). The failure is only remembered here so
+# that the install and update actions can refuse to run with a helpful message.
+begin
+  require 'rbbt'
+rescue Rbbt::NoConfig
+  $noconfig = true
+end
+# Help text printed by the usage view.
+$USAGE =<<EOT
+#{__FILE__} <action> [<subaction>] [--force] [--organism <org>]
+  actions:
+    * configure:   Set paths for data, cache, and tmp directories
+    * install:
+      * basic:     Third party software
+      * databases: Entrez and Biocreative
+      * models:    Gene Mention and Classification
+      * organisms: Rules to gather data for organisms
+      * all:       3party wordlists entrez biocreative go ner norm classifier organisms polysearch
+    * update:
+      * organisms: Gather data for organisms
+      * ner:       Build Named Entity Recognition Models for Gene Mention
+      * classification:
+                   Build Function/Process Classifiers
+    * purge_cache: Clean the non-persistent cache, which holds general things
+        downloaded using Open.read, like organism identifiers downloaded from
+        BioMart. The persistent cache, which hold pubmed articles or entrez gene
+        descriptions, is not cleaned, as these are not likely to change
+EOT
+class Controller < SimpleConsole::Controller
+  params :bool => {:f => :force},
+         :string => {:o => :organism}
+  def default
+    render :action => :usage
+  end
+  def help
+    render :action => :usage
+  end
+  def update
+    raise "Run #{__FILE__} configure first to configure rbbt" if $noconfig
+    case params[:id]
+    when "organisms"
+      @location = File.join(Rbbt.datadir,'organisms')
+    when "ner"
+      @location = File.join(Rbbt.datadir,'ner')
+    when "classifier"
+      @location = File.join(Rbbt.datadir,'classifier')
+    else
+      redirect_to :action => :help, :id => :update
+    end
+    $force = true if params[:force]
+    $org = params[:organism] if params[:organism]
+  end
+  def install
+    raise "Run #{__FILE__} configure first to configure rbbt" if $noconfig
+    case params[:id]
+    when "basic"
+      @tasks = %w(3party wordlists polysearch)
+    when "databases"
+      @tasks = %w(entrez biocreative go)
+    when "models"
+      @tasks = %w(ner norm classifier)
+    when "organisms"
+      @tasks = %w(organisms)
+    when "all"
+      @tasks = %w(3party wordlists entrez biocreative go ner norm classifier organisms polysearch)
+    when nil
+      redirect_to :action => :help, :id => :install
+    else
+      @tasks = [params[:id]]
+    end
+    $force = true if params[:force]
+    $org = params[:organism] if params[:organism]
+  end
+  def configure
+  end
+  def purge_cache
+  end
+end
+class View < SimpleConsole::View
+  def usage
+    puts $USAGE
+  end
+  def install
+    load File.join(Rbbt.rootdir, 'tasks/install.rake')
+    @tasks.each{|t|
+      puts "Invoking #{ t }"
+      Rake::Task[t].invoke
+    }
+  end
+  def update
+    puts "Changing directory to #{@location}"
+    chdir @location
+    load "./Rakefile"
+    Rake::Task['default'].invoke
+  end
+  def configure
+    defaultdir = File.join(ENV['HOME'],'rbbt')
+    cachedir   = File.join(defaultdir, 'cache')
+    tmpdir   = File.join(defaultdir, 'tmp')
+    datadir   = File.join(defaultdir, 'data')
+    puts "Please indicate where you wish to place the data directories"
+    puts
+    puts
+    puts "* Cache Directory: This directory will hold downloads, from PubMed,
+  Entrez and other, for local store. It might grow considerably."
+    print "[#{ cachedir }]? "
+    input = STDIN.gets
+    cachedir = input if input =~ /\w/
+    puts
+    puts "* Tmp Directory: Temporary files."
+    print "[#{ tmpdir }]? "
+    input = STDIN.gets
+    tmpdir = input if input =~ /\w/
+    puts
+    puts "* Data Directory: Holds data from organisms, databases, third party software, etc."
+    print "[#{ datadir }]? "
+    input = STDIN.gets
+    datadir = input if input =~ /\w/
+    fout = File.open(File.join(ENV['HOME'], '.rbbt'),'w')
+    fout.puts "cachedir: #{cachedir}"
+    fout.puts "tmpdir: #{tmpdir}"
+    fout.puts "datadir: #{datadir}"
+    fout.close
+  end
+  def purge_cache
+    FileUtils.rm Dir.glob(File.join(Rbbt.cachedir,'open-remote','*'))
+  end
+end
+SimpleConsole::Application.run(ARGV, Controller, View)

data/install_scripts/classifier/R/classify.R ADDED Viewed

@@ -0,0 +1,36 @@
+library('e1071')
+# Normalizes a bag-of-words count table.
+#   x:       table of term counts, one row per document, one column per term
+#            (assumed from the row-wise sums below -- TODO confirm with callers).
+#   weights: optional per-term weights; their absolute values scale the columns.
+# Returns the normalized table.
+BOW.norm <- function(x, weights = NULL){
+    # Log-scale the counts; a count of 0 gives log(0) = -Inf, which is reset to 0.
+    x = 1 + log(x);
+    x[x==-Inf] = 0;
+    # Row sums, computed as a product with a column of ones.
+    x.sum = as.matrix(x) %*% matrix(1,nrow=dim(x)[2],ncol=1);
+    # Replicate 100/row-sum across all columns so that each row can be rescaled
+    # to add up to 100.
+    x.sum = matrix(100/x.sum,nrow=length(x.sum),ncol=dim(x)[2]);
+    x.norm = x * x.sum;
+    rm(x.sum);
+    # Rows whose sum was 0 produce NaN above; treat them as all zeros.
+    x.norm[is.na(x.norm)] = 0
+    if (!is.null(weights)){
+      # byrow=T repeats the weight vector on every row, i.e. column j is
+      # multiplied by abs(weights[j]).
+      x.norm =  x.norm  * matrix(abs(weights),ncol=length(weights),nrow=dim(x.norm)[1],byrow=T)
+    }
+    x.norm;
+}
+# Trains an SVM on a feature table and saves it.
+#   features:  path to a tab-separated file with a header; the first column is
+#              used as row names and a column named Class is the response.
+#   modelfile: path where svm.model and svm.weights are saved.
+#   dictfile:  optional tab-separated file; its second column is used as term weights.
+BOW.classification.model <- function(features, modelfile, dictfile = NULL){
+    feats = read.table(features, sep="\t", header=T, row.names=1);
+    if (!is.null(dictfile)){
+        # NOTE(review): `[2]` yields a one-column data frame rather than a plain
+        # vector, so length(weights) inside BOW.norm would be 1 -- confirm this
+        # is what is intended (`[[2]]` would give the vector).
+        svm.weights = read.table(file=dictfile, sep="\t")[2];
+    }else {
+        svm.weights = NULL;
+    }
+    # Normalize every column except the first one (Class, presumably).
+    feats[-1] = BOW.norm(feats[-1], svm.weights);
+    # NOTE(review): svm.weights is passed as an unnamed third argument; check
+    # against the e1071 svm() signature which parameter it ends up bound to.
+    svm.model = svm(Class ~ ., data=feats, svm.weights);
+    save(svm.model,svm.weights, file=modelfile);
+}
+# Normalizes x with BOW.norm (using the optional weights) and returns the
+# predictions for it.
+# NOTE(review): despite its name, `modelfile` is handed directly to predict(),
+# so it presumably has to be an already loaded model object, not a path -- confirm.
+BOW.classification.classify <- function(modelfile, x, weights = NULL){
+    x = BOW.norm(x, weights);
+    predict(modelfile, x);
+}

data/install_scripts/classifier/Rakefile ADDED Viewed

@@ -0,0 +1,140 @@
+require 'rbbt'
+require 'rbbt/sources/organism'
+require 'rbbt/sources/pubmed'
+require 'rbbt/bow/bow'
+require 'rbbt/bow/dictionary'
+require 'rbbt/bow/classifier'
+require 'rbbt/util/misc'
+require 'progress-monitor'
+require 'rand'
+$hi      = ENV['hi']  || 0.8
+$low     = ENV['low'] || 0.01
+$max     = ENV['max'] || 3000
+$bigrams = ENV['bigrams'] == 'true' || false
+$ndocs   = ENV['ndocs'] || 5000
+desc "Bilds Dictionary and Features for an organism"
+rule(/data\/(.*)/) do |t|
+  org = File.basename(t.name)
+  go  = Organism.gene_literature_go(org).collect{|gene, pmids| pmids}.flatten.uniq
+  all = Organism.literature(org).flatten.uniq - go
+  ndocs = [go.length, all.length, $ndocs.to_i].min
+  puts "Using #{ ndocs } from each class\n\n"
+  go    = go.shuffle[0..ndocs - 1]
+  all   = all.shuffle[0..ndocs - 1]
+  dict = Dictionary::KL.new
+  chunks = all.chunk(50)
+  Progress.monitor("Building Dictionary for #{ org }: -",1000)
+  chunks.each{|chunk|
+    PubMed.get_article(chunk).each{|pmid, article|
+      words = BagOfWords.terms(article.text,$bigrams)
+      dict.add(words, :-)
+    }
+  }
+  chunks = go.chunk(50)
+  Progress.monitor("Building Dictionary for #{ org }: +",1000)
+  chunks.each{|chunk|
+    PubMed.get_article(chunk).each{|pmid, article|
+      words = BagOfWords.terms(article.text,$bigrams)
+      dict.add(words, :+)
+    }
+  }
+  term_weigths = dict.weights(:low => $low.to_f, :hi => $hi.to_f, :limit => $max.to_i)
+  Open.write(t.name + '.dict', term_weigths.sort.collect{|p| p.join("\t")}.join("\n"))
+  terms = term_weigths.keys.sort
+  fout = File.open(t.name, 'w')
+  fout.puts((['Name','Class'] + terms).join("\t"))
+  Progress.monitor("Building Features for #{ org }", 1000)
+  all.each{|pmid|
+    text = PubMed.get_article(pmid).text
+    fout.puts(([pmid, :-] + BagOfWords.features(text, terms)).join("\t"))
+  }
+  go.each{|pmid|
+    text = PubMed.get_article(pmid).text
+    fout.puts(([pmid, :+] + BagOfWords.features(text, terms)).join("\t"))
+  }
+  fout.close
+end
+rule (/model\/(.*)/) => lambda{|n| n.sub(/model/,'data')} do |t|
+  features = t.name.sub(/model/,'data')
+  Classifier.create_model(features, t.name, features + '.dict')
+end
+rule (/results\/(.*)/) => lambda{|n| n.sub(/results/,'model')} do |t|
+  model       = t.name.sub(/results/,'model')
+  features    = t.name.sub(/results/,'data')
+  org = File.basename(t.name)
+  ndocs    = 100
+  used = Open.read(features).collect{|l| l.chomp.split(/\t/).first}[1..-1]
+  classifier = Classifier.new(model)
+  go  = Organism.gene_literature_go(org).collect{|gene, pmids| pmids}.flatten.uniq - used
+  all = Organism.literature(org).flatten.uniq - go - used
+  go    = go.shuffle[0..ndocs - 1]
+  all   = all.shuffle[0..ndocs - 1]
+  ndocs = go.length + all.length
+  raise "Not enogh unused articles to evaluate" if  go.empty? || all.empty?
+  features_go = PubMed.get_article(go).collect{|pmid, article|
+    article = article.text
+  }
+  pos = classifier.classify(features_go).select{|v| v == '+'}.length
+  features_all = PubMed.get_article(all).collect{|pmid, article|
+    article = article.text
+  }
+  neg = classifier.classify(features_all).select{|v| v == '-'}.length
+  puts "#{ pos } #{ neg }"
+  precision = (pos + neg) / (ndocs).to_f
+  recall    = pos / go.length.to_f
+  f1        = ( 2 * precision * recall) / (precision + recall ).to_f
+  puts "Precision: #{ precision}, Recall: #{ recall }, F1: #{f1}"
+end
+task 'clean' do
+  FileUtils.rm Dir.glob("data/*")
+  FileUtils.rm Dir.glob("model/*")
+  FileUtils.rm Dir.glob("results/*")
+end
+task 'all' do
+  Organism.all.each{|org|
+    Rake::Task["model/#{ org }"].invoke
+  }
+end
+task 'update' do
+  if $org
+    FileUtils.rm Dir.glob("**/#{$org}.*") if $force
+    Rake::Task["model/#{$org}"].invoke
+  else
+    Rake::Task['clean'].invoke if $force
+    Rake::Task['all'].invoke
+  end
+end

data/install_scripts/get_abner.sh ADDED Viewed

@@ -0,0 +1,2 @@
+#!/bin/bash
+wget http://pages.cs.wisc.edu/~bsettles/abner/abner.jar

data/install_scripts/get_banner.sh ADDED Viewed

@@ -0,0 +1,25 @@
+#!/bin/bash
+# Downloads BANNER and its gene model, compiles the sources together with the
+# bundled libraries into a single banner.jar, and leaves banner.jar,
+# gene_model.bin and nlpdata/ in the current directory.
+#
+# -O fixes the local file names: the URLs carry a query string, so the name
+# wget would pick on its own is not guaranteed to be the bare archive name.
+wget "http://downloads.sourceforge.net/banner/BANNER_v02.zip?modtime=1196955449&big_mirror=0" -O BANNER.zip
+wget "http://downloads.sourceforge.net/banner/gene_model_v02.bin?modtime=1196955509&big_mirror=0" -O gene_model.bin
+unzip BANNER.zip
+# Stop if the archive did not unpack; otherwise the cd/rm steps below would
+# operate on the wrong directories.
+cd BANNER || exit 1
+libs=`find libs/ -name "*.jar"`
+mkdir classes
+javac -classpath `echo $libs|sed s/\ /:/g` -d classes `find src/ -name "*.java"`
+cd classes || exit 1
+# Unpack every bundled library next to the compiled classes so that the
+# resulting jar is self-contained.
+for f in ../libs/*.jar; do jar xf "$f";done
+jar cf banner.jar *
+mv banner.jar ../..
+cd ..
+cp -R nlpdata/ ../
+cd ..
+rm BANNER.zip
+rm -Rf BANNER

data/install_scripts/get_biocreative.sh ADDED Viewed

@@ -0,0 +1,72 @@
+#!/bin/bash
+# Downloads the BioCreative archives and rearranges their contents into the
+# BC2GM, BC2GN and BC1GN directories; the downloaded sources are removed at
+# the end.
+mkdir src
+cd src
+wget "http://garr.dl.sourceforge.net/sourceforge/biocreative/bc2GNandGMgold_Subs.tar.gz"
+wget "http://switch.dl.sourceforge.net/sourceforge/biocreative/biocreative1task1a.tar.gz"
+wget "http://kent.dl.sourceforge.net/sourceforge/biocreative/biocreative1task1b.tar.gz"
+wget "http://mesh.dl.sourceforge.net/sourceforge/biocreative/biocreative1task2.tar.gz"
+wget "http://garr.dl.sourceforge.net/sourceforge/biocreative/bc2geneMention.tar.gz"
+wget "http://switch.dl.sourceforge.net/sourceforge/biocreative/bc2normal.1.4.tar.gz"
+wget "http://kent.dl.sourceforge.net/sourceforge/biocreative/bc2GNtest.zip"
+for f in *.gz; do tar xfz $f; done
+unzip bc2GNtest.zip
+cd ..
+# BC2GM: train and test directories, plus the evaluation script.
+mkdir BC2GM
+cp -R src/bc2geneMention/train/ BC2GM/
+cp -R src/sourceforgeDistrib-22-Sept-07/genemention/BC2GM/test/ BC2GM/
+mv BC2GM/train/alt_eval.perl BC2GM/
+# BC2GN: NoisyTrain, Train and Test, each with its gene list renamed to "genelist".
+mkdir BC2GN
+cp -R src/biocreative2normalization/* BC2GN/
+mv BC2GN/noisyTrainingData/ BC2GN/NoisyTrain
+mv BC2GN/trainingData/ BC2GN/Train
+cp -R src/bc2GNtest/bc2GNtestdocs/ BC2GN/Test
+mv BC2GN/NoisyTrain/noisytrain.genelist BC2GN/NoisyTrain/genelist
+mv BC2GN/Train/training.genelist BC2GN/Train/genelist
+cp src/sourceforgeDistrib-22-Sept-07/genenormalization/bc2test.genelist BC2GN/Test/genelist
+# BC1GN: for fly, mouse and yeast the same layout is produced: devtest, test
+# and train directories, a synonyms.list, and a single "genelist" per set
+# (concatenated from the per-file lists for train and devtest).
+mkdir BC1GN
+cp -R src/biocreative1/bc1task1b/* BC1GN/
+mv BC1GN/fly/FlyDevTest/ BC1GN/fly/devtest
+mv BC1GN/fly/FlyEvaluation/ BC1GN/fly/test
+mv BC1GN/fly/FlyNoisyTraining/ BC1GN/fly/train
+mv BC1GN/fly/*.list  BC1GN/fly/synonyms.list
+mv BC1GN/fly/test/*gene_list  BC1GN/fly/test/genelist
+for f in BC1GN/fly/train/gene_list/*; do cat "$f" >> BC1GN/fly/train/genelist;done
+for f in BC1GN/fly/devtest/gene_lists/*; do cat "$f" >> BC1GN/fly/devtest/genelist;done
+mv BC1GN/mouse/MouseDevTest/ BC1GN/mouse/devtest
+mv BC1GN/mouse/MouseEvaluation/ BC1GN/mouse/test
+mv BC1GN/mouse/MouseNoisyTraining/ BC1GN/mouse/train
+mv BC1GN/mouse/*.list  BC1GN/mouse/synonyms.list
+mv BC1GN/mouse/test/*gene_list  BC1GN/mouse/test/genelist
+for f in BC1GN/mouse/train/gene_list/*; do cat "$f" >> BC1GN/mouse/train/genelist;done
+for f in BC1GN/mouse/devtest/gene_lists/*; do cat "$f" >> BC1GN/mouse/devtest/genelist;done
+mv BC1GN/yeast/YeastDevTest/ BC1GN/yeast/devtest
+mv BC1GN/yeast/YeastEvaluation/ BC1GN/yeast/test
+mv BC1GN/yeast/YeastNoisyTraining/ BC1GN/yeast/train
+mv BC1GN/yeast/*.list  BC1GN/yeast/synonyms.list
+mv BC1GN/yeast/test/*gene_list  BC1GN/yeast/test/genelist
+for f in BC1GN/yeast/train/gene_list/*; do cat "$f" >> BC1GN/yeast/train/genelist;done
+for f in BC1GN/yeast/devtest/gene_lists/*; do cat "$f" >> BC1GN/yeast/devtest/genelist;done
+# Fix a bug in the perl script! :-|
+# (drops the line containing 'else {EVALFILE = STDIN;}' from the scorer)
+cat BC1GN/task1Bscorer.pl |grep -v 'else {EVALFILE = STDIN;}' >foo; mv foo BC1GN/task1Bscorer.pl
+rm -Rf src

data/install_scripts/get_crf++.sh ADDED Viewed

@@ -0,0 +1,26 @@
+#!/bin/bash
+# Downloads CRF++ 0.51, installs it with the parent of the unpacked source
+# directory as prefix, builds its Ruby binding, and leaves CRFPP.so in ruby/
+# under that prefix. The sources and the installed headers are removed at the end.
+wget "http://downloads.sourceforge.net/crfpp/CRF%2B%2B-0.51.tar.gz?modtime=1215793886&big_mirror=0" -O crf++.tar.gz
+tar xvfz crf++.tar.gz
+rm crf++.tar.gz
+# Stop if the sources did not unpack: the final "cd ../../; rm -Rf" would
+# otherwise run two directories above the intended one.
+cd CRF* || exit 1
+PREFIX=$(dirname "$PWD")
+# On x86_64 the static library has to be built as position-independent code
+# to be linked into the shared CRFPP.so below.
+if [ "$(uname -m)" = 'x86_64' ]; then
+  WITH_PIC='--with-pic';
+else
+  WITH_PIC=''
+fi
+./configure  --prefix="$PREFIX" --exec-prefix="$PREFIX" $WITH_PIC;
+make install
+cd ruby || exit 1
+ruby extconf.rb  --with-opt-lib="$PREFIX/lib/" --with-opt-include="$PREFIX/include/"
+make
+# Relink the extension against the static libcrfpp.a installed above.
+cc -shared -o CRFPP.so CRFPP_wrap.o ../../lib/libcrfpp.a  -L. -L/usr/lib  -L.  -rdynamic -Wl,-export-dynamic    -lruby -lpthread  -lpthread -ldl -lcrypt -lm   -lc -lstdc++
+# -p: do not fail when the directory is left over from a previous run.
+mkdir -p ../../ruby/
+cp CRFPP.so ../../ruby/
+cd ../../
+rm -Rf CRF* include

data/install_scripts/get_entrez.sh ADDED Viewed

@@ -0,0 +1,4 @@
+#!/bin/bash
+wget ftp://ftp.ncbi.nih.gov/gene/DATA/gene_info.gz; gunzip gene_info.gz
+wget ftp://ftp.ncbi.nih.gov/gene/DATA/gene2pubmed.gz; gunzip gene2pubmed.gz

data/install_scripts/get_go.sh ADDED Viewed

@@ -0,0 +1,4 @@
+#!/bin/bash
+wget ftp://ftp.geneontology.org/pub/go/ontology/gene_ontology.obo
+wget http://www.geneontology.org/GO_slims/goslim_generic.obo

data/install_scripts/get_polysearch.sh ADDED Viewed

@@ -0,0 +1,8 @@
+#!/bin/bash
+# Downloads the PolySearch term lists, saving each one under a short local
+# name. Every entry below is "<remote file>:<local file>".
+base=http://wishart.biology.ualberta.ca/polysearch/include
+for pair in \
+  disease_IDlist.txt:disease.txt \
+  organ_ID.txt:organ.txt \
+  tissue_ID.txt:tissue.txt \
+  subcellular_localization_ID.txt:subcellular.txt \
+  drugnames.txt:drug.txt \
+  HMDBnames.txt:metabolite.txt
+do
+  wget $base/${pair%%:*} -O ${pair##*:}
+done