RubyGems - rbbt - Versions diffs - 1.2.5 → 2.0.0 - Mend

rbbt 1.2.5 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (88)

checksums.yaml +7 -0
data/README.rdoc +2 -138
metadata +69 -214
data/LICENSE +0 -20
data/bin/rbbt_config +0 -245
data/install_scripts/classifier/R/classify.R +0 -36
data/install_scripts/classifier/Rakefile +0 -140
data/install_scripts/get_abner.sh +0 -2
data/install_scripts/get_banner.sh +0 -25
data/install_scripts/get_biocreative.sh +0 -72
data/install_scripts/get_crf++.sh +0 -26
data/install_scripts/get_entrez.sh +0 -4
data/install_scripts/get_go.sh +0 -4
data/install_scripts/get_polysearch.sh +0 -8
data/install_scripts/ner/Rakefile +0 -206
data/install_scripts/ner/config/default.rb +0 -52
data/install_scripts/norm/Rakefile +0 -219
data/install_scripts/norm/config/cue_default.rb +0 -10
data/install_scripts/norm/config/tokens_default.rb +0 -86
data/install_scripts/norm/functions.sh +0 -23
data/install_scripts/organisms/Ath.Rakefile +0 -55
data/install_scripts/organisms/Cal.Rakefile +0 -84
data/install_scripts/organisms/Cel.Rakefile +0 -109
data/install_scripts/organisms/Hsa.Rakefile +0 -140
data/install_scripts/organisms/Mmu.Rakefile +0 -77
data/install_scripts/organisms/Rakefile +0 -43
data/install_scripts/organisms/Rno.Rakefile +0 -88
data/install_scripts/organisms/Sce.Rakefile +0 -66
data/install_scripts/organisms/Spo.Rakefile +0 -40
data/install_scripts/organisms/rake-include.rb +0 -252
data/install_scripts/wordlists/consonants +0 -897
data/install_scripts/wordlists/stopwords +0 -1
data/lib/rbbt.rb +0 -83
data/lib/rbbt/bow/bow.rb +0 -88
data/lib/rbbt/bow/classifier.rb +0 -116
data/lib/rbbt/bow/dictionary.rb +0 -187
data/lib/rbbt/ner/abner.rb +0 -34
data/lib/rbbt/ner/banner.rb +0 -73
data/lib/rbbt/ner/dictionaryNER.rb +0 -98
data/lib/rbbt/ner/regexpNER.rb +0 -70
data/lib/rbbt/ner/rner.rb +0 -227
data/lib/rbbt/ner/rnorm.rb +0 -143
data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
data/lib/rbbt/ner/rnorm/tokens.rb +0 -217
data/lib/rbbt/sources/biocreative.rb +0 -75
data/lib/rbbt/sources/biomart.rb +0 -105
data/lib/rbbt/sources/entrez.rb +0 -211
data/lib/rbbt/sources/go.rb +0 -85
data/lib/rbbt/sources/gscholar.rb +0 -74
data/lib/rbbt/sources/organism.rb +0 -241
data/lib/rbbt/sources/polysearch.rb +0 -117
data/lib/rbbt/sources/pubmed.rb +0 -248
data/lib/rbbt/util/arrayHash.rb +0 -266
data/lib/rbbt/util/filecache.rb +0 -72
data/lib/rbbt/util/index.rb +0 -47
data/lib/rbbt/util/misc.rb +0 -106
data/lib/rbbt/util/open.rb +0 -251
data/lib/rbbt/util/rake.rb +0 -183
data/lib/rbbt/util/simpleDSL.rb +0 -87
data/lib/rbbt/util/tmpfile.rb +0 -35
data/tasks/install.rake +0 -124
data/test/rbbt/bow/test_bow.rb +0 -33
data/test/rbbt/bow/test_classifier.rb +0 -72
data/test/rbbt/bow/test_dictionary.rb +0 -91
data/test/rbbt/ner/rnorm/test_cue_index.rb +0 -57
data/test/rbbt/ner/rnorm/test_tokens.rb +0 -70
data/test/rbbt/ner/test_abner.rb +0 -17
data/test/rbbt/ner/test_banner.rb +0 -17
data/test/rbbt/ner/test_dictionaryNER.rb +0 -122
data/test/rbbt/ner/test_regexpNER.rb +0 -33
data/test/rbbt/ner/test_rner.rb +0 -126
data/test/rbbt/ner/test_rnorm.rb +0 -47
data/test/rbbt/sources/test_biocreative.rb +0 -38
data/test/rbbt/sources/test_biomart.rb +0 -31
data/test/rbbt/sources/test_entrez.rb +0 -49
data/test/rbbt/sources/test_go.rb +0 -24
data/test/rbbt/sources/test_organism.rb +0 -59
data/test/rbbt/sources/test_polysearch.rb +0 -27
data/test/rbbt/sources/test_pubmed.rb +0 -39
data/test/rbbt/util/test_arrayHash.rb +0 -257
data/test/rbbt/util/test_filecache.rb +0 -37
data/test/rbbt/util/test_index.rb +0 -31
data/test/rbbt/util/test_misc.rb +0 -20
data/test/rbbt/util/test_open.rb +0 -110
data/test/rbbt/util/test_simpleDSL.rb +0 -57
data/test/rbbt/util/test_tmpfile.rb +0 -21
data/test/test_helper.rb +0 -4
data/test/test_rbbt.rb +0 -11

data/LICENSE DELETED

@@ -1,20 +0,0 @@
-Copyright (c) 2009 Miguel Vazquez
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-The above copyright notice and this permission notice shall be
-included in all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/bin/rbbt_config DELETED

@@ -1,245 +0,0 @@
-#!/usr/bin/ruby
-$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
-require 'rubygems'
-require 'rake'
-require 'simpleconsole'
-begin
-  require 'rbbt'
-rescue Rbbt::NoConfig
-  $noconfig = true
-end
-TASKS= %w(organisms ner norm classifier biocreative entrez go wordlists polysearch abner banner crf++)
-$USAGE =<<EOT
-#{__FILE__} <action> [<subaction>] [--update] [--organism <org>]
-  actions:
-    * configure:   Set paths for data, cache, and tmp directories
-    * prepare:
-      Basic subactions:
-      * organisms:     Install processing scripts to process organisms
-      * ner:           Install processing scripts for Named Entity Recognition
-      * norm:          Install processing scripts for Gene Mention Normalization
-      * classifier:    Install processing scripts for Classification
-      * biocreative:   Download and train and test data from BioCreative
-      * entrez:        Download and install data from Entrez
-      * go:            Download and install data from The Gene Ontology
-      * wordlists:     Install word lists
-      * polysearch:    Download and install Polysearch dictionaries
-      * abner:         Download and install Abner NER system:      http://pages.cs.wisc.edu/~bsettles/abner/
-      * banner:        Download and install Banner NER system:     http://sourceforge.net/projects/banner/
-      * crf++:         Download and install CRF++ a CRF framework: http://crfpp.sourceforge.net/
-      Subactions grouped by task:
-      * identifiers:  entrez, organisms
-      * rner:         entrez, organisms, biocreative, ner, crf++
-      * java_ner:     entrez, organisms, abner, banner
-      * norm: entrez  organisms, biocreative, crf++, norm, polysearch
-      * bow:          organisms, wordlists
-      * classifier:   organisms, wordlists, classifier, go
-      * all:          #{TASKS.join(", ")}
-    * install:
-      * organisms:      Gather organisms data
-      * ner:            Build Named Entity Recognition Models. Mention Normalization needs no training.
-      * classification: Build Function/Process Classifiers
-      --update:         Rebuild models or reprocess organism data even if present. You may want to purge the cache
-                        to be up to date with the data in the internet.
-      --organism:       Gather data only for that particular organism. The organism must be specified by the
-                        keyword. Use '#{__FILE__} organisms' to see find the keywords.
-    * purge_cache: Clean the non-persistent cache, which holds general things
-        downloaded using Open.read, like organism identifiers downloaded from
-        BioMart. The persistent cache, which hold pubmed articles or entrez gene
-        descriptions, is not cleaned, as these are not likely to change
-    * organisms: Show a list of all organisms along with their identifier in the system
-EOT
-class Controller < SimpleConsole::Controller
-  params :bool => {:u => :update},
-         :string => {:o => :organism}
-  def organisms
-  end
-  def default
-    render :action => :usage
-  end
-  def help
-    render :action => :usage
-  end
-  def install
-    raise "Run #{__FILE__} configure first to configure rbbt" if $noconfig
-    case params[:id]
-    when "organisms"
-      @location = File.join(Rbbt.datadir,'organisms')
-    when "ner"
-      @location = File.join(Rbbt.datadir,'ner')
-    when "classifier"
-      @location = File.join(Rbbt.datadir,'classifier')
-    else
-      redirect_to :action => :help, :id => :update
-    end
-    $force = true if params[:update]
-    $org = params[:organism] if params[:organism]
-  end
-  def prepare
-    raise "Run #{__FILE__} configure first to configure rbbt" if $noconfig
-    case params[:id]
-    when "identifiers"
-      require 'rbbt/sources/organism'
-      require 'rbbt/sources/entrez'
-      @tasks = %w(entrez organisms)
-    when "rner"
-      require 'rbbt/ner/rner'
-      require 'rbbt/sources/entrez'
-      @tasks = %w(entrez organisms biocreative ner crf++)
-    when "java_ner"
-      require 'rjb'
-      @tasks = %w(entrez organisms abner banner)
-    when "norm"
-      require 'rbbt/ner/rner'
-      require 'rbbt/ner/rnorm'
-      require 'rbbt/ner/regexpNER'
-      require 'rbbt/sources/entrez'
-      @tasks = %w(entrez organisms biocreative crf++ norm polysearch)
-    when "bow"
-      require 'rbbt/bow/bow'
-      require 'rbbt/bow/dictionary'
-      @tasks = %w(organisms wordlists)
-    when "classifier"
-      require 'rbbt/bow/bow'
-      require 'rbbt/bow/dictionary'
-      require 'rbbt/bow/classifier'
-      @tasks = %w(organisms wordlists classifier go)
-    when "all"
-      @tasks = TASKS
-    when nil
-      redirect_to :action => :help, :id => :install
-    else
-      redirect_to :action => :help, :id => :install if ! TASKS.include? params[:id]
-      @tasks = [params[:id]]
-    end
-    $force = true if params[:update]
-    $org = params[:organism] if params[:organism]
-  end
-  def configure
-  end
-  def purge_cache
-  end
-end
-class View < SimpleConsole::View
-  def usage
-    puts $USAGE
-  end
-  def organisms
-      require 'rbbt/sources/organism'
-      all = Organism.all(false)
-      installed = Organism.all
-      all.each{|org|
-          puts "#{Organism.name(org)}: #{org} #{installed.include?(org) ? "(installed)" : ""}"
-      }
-  end
-  def prepare
-    load File.join(Rbbt.rootdir, 'tasks/install.rake')
-    @tasks.each{|t|
-      puts "Invoking #{ t }"
-      Rake::Task[t].invoke
-    }
-  end
-  def install
-    puts "Changing directory to #{@location}"
-    chdir @location
-    load "./Rakefile"
-    Rake::Task['default'].invoke
-  end
-  def configure
-    defaultdir = File.join(ENV['HOME'],'rbbt')
-    cachedir   = File.join(defaultdir, 'cache')
-    tmpdir   = File.join(defaultdir, 'tmp')
-    datadir   = File.join(defaultdir, 'data')
-    puts "Please indicate where you wish to place the data directories"
-    puts
-    puts
-    puts "* Cache Directory: This directory will hold downloads, from PubMed,
-  Entrez and other, for local store. It might grow considerably."
-    print "[#{ cachedir }]? "
-    input = STDIN.gets
-    cachedir = input if input =~ /\w/
-    puts
-    puts "* Tmp Directory: Temporary files."
-    print "[#{ tmpdir }]? "
-    input = STDIN.gets
-    tmpdir = input if input =~ /\w/
-    puts
-    puts "* Data Directory: Holds data from organisms, databases, third party software, etc."
-    print "[#{ datadir }]? "
-    input = STDIN.gets
-    datadir = input if input =~ /\w/
-    fout = File.open(File.join(ENV['HOME'], '.rbbt'),'w')
-    fout.puts "cachedir: #{cachedir}"
-    fout.puts "tmpdir: #{tmpdir}"
-    fout.puts "datadir: #{datadir}"
-    fout.close
-  end
-  def purge_cache
-    FileUtils.rm Dir.glob(File.join(Rbbt.cachedir,'open-remote','*'))
-  end
-end
-SimpleConsole::Application.run(ARGV, Controller, View)

data/install_scripts/classifier/R/classify.R DELETED

@@ -1,36 +0,0 @@
-library('e1071')
-BOW.norm <- function(x, weights = NULL){
-    x = 1 + log(x);
-    x[x==-Inf] = 0;
-    x.sum = as.matrix(x) %*% matrix(1,nrow=dim(x)[2],ncol=1);
-    x.sum = matrix(100/x.sum,nrow=length(x.sum),ncol=dim(x)[2]);
-    x.norm = x * x.sum;
-    rm(x.sum);
-    x.norm[is.na(x.norm)] = 0
-    if (!is.null(weights)){
-      x.norm =  x.norm  * matrix(abs(weights),ncol=length(weights),nrow=dim(x.norm)[1],byrow=T)
-    }
-    x.norm;
-}
-BOW.classification.model <- function(features, modelfile, dictfile = NULL){
-    feats = read.table(features, sep="\t", header=T, row.names=1);
-    if (!is.null(dictfile)){
-        svm.weights = read.table(file=dictfile, sep="\t")[2];
-    }else {
-        svm.weights = NULL;
-    }
-    feats[-1] = BOW.norm(feats[-1], svm.weights);
-    svm.model = svm(Class ~ ., data=feats, svm.weights);
-    save(svm.model,svm.weights, file=modelfile);
-}
-BOW.classification.classify <- function(modelfile, x, weights = NULL){
-    x = BOW.norm(x, weights);
-    predict(modelfile, x);
-}

data/install_scripts/classifier/Rakefile DELETED

@@ -1,140 +0,0 @@
-require 'rbbt'
-require 'rbbt/sources/organism'
-require 'rbbt/sources/pubmed'
-require 'rbbt/bow/bow'
-require 'rbbt/bow/dictionary'
-require 'rbbt/bow/classifier'
-require 'rbbt/util/misc'
-require 'progress-monitor'
-require 'rand'
-$hi      ||= ENV['hi']  || 0.8
-$low     ||= ENV['low'] || 0.01
-$max     ||= ENV['max'] || 3000
-$bigrams ||= ENV['bigrams'] == 'true'
-$ndocs   ||= ENV['ndocs'] || 5000
-desc "Bilds Dictionary and Features for an organism"
-rule(/data\/(.*)/) do |t|
-  org = File.basename(t.name)
-  go  = Organism.gene_literature_go(org).collect{|gene, pmids| pmids}.flatten.uniq
-  all = Organism.literature(org).flatten.uniq - go
-  ndocs = [go.length, all.length, $ndocs.to_i].min
-  puts "Using #{ ndocs } from each class\n\n"
-  go    = go.shuffle[0..ndocs - 1]
-  all   = all.shuffle[0..ndocs - 1]
-  dict = Dictionary::KL.new
-  chunks = all.chunk(50)
-  Progress.monitor("Building Dictionary for #{ org }: -")
-  chunks.each{|chunk|
-    PubMed.get_article(chunk).each{|pmid, article|
-      words = BagOfWords.terms(article.text,$bigrams)
-      dict.add(words, :-)
-    }
-  }
-  chunks = go.chunk(50)
-  Progress.monitor("Building Dictionary for #{ org }: +")
-  chunks.each{|chunk|
-    PubMed.get_article(chunk).each{|pmid, article|
-      words = BagOfWords.terms(article.text,$bigrams)
-      dict.add(words, :+)
-    }
-  }
-  term_weigths = dict.weights(:low => $low.to_f, :hi => $hi.to_f, :limit => $max.to_i)
-  Open.write(t.name + '.dict', term_weigths.sort.collect{|p| p.join("\t")}.join("\n"))
-  terms = term_weigths.keys.sort
-  fout = File.open(t.name, 'w')
-  fout.puts((['Name','Class'] + terms).join("\t"))
-  Progress.monitor("Building Features for #{ org }")
-  all.each{|pmid|
-    text = PubMed.get_article(pmid).text
-    fout.puts(([pmid, :-] + BagOfWords.features(text, terms)).join("\t"))
-  }
-  go.each{|pmid|
-    text = PubMed.get_article(pmid).text
-    fout.puts(([pmid, :+] + BagOfWords.features(text, terms)).join("\t"))
-  }
-  fout.close
-end
-rule (/model\/(.*)/) => lambda{|n| n.sub(/model/,'data')} do |t|
-  features = t.name.sub(/model/,'data')
-  Classifier.create_model(features, t.name, features + '.dict')
-end
-rule (/results\/(.*)/) => lambda{|n| n.sub(/results/,'model')} do |t|
-  model       = t.name.sub(/results/,'model')
-  features    = t.name.sub(/results/,'data')
-  org = File.basename(t.name)
-  ndocs    = 1000
-  used = Open.read(features).read.split(/\n/).collect{|l| l.chomp.split(/\t/).first}[1..-1]
-  classifier = Classifier.new(model)
-  go  = Organism.gene_literature_go(org).collect{|gene, pmids| pmids}.flatten.uniq - used
-  all = Organism.literature(org).flatten.uniq - go - used
-  go    = go.shuffle[0..ndocs - 1]
-  all   = all.shuffle[0..ndocs - 1]
-  ndocs = go.length + all.length
-  raise "Not enogh unused articles to evaluate" if  go.empty? || all.empty?
-  features_go = PubMed.get_article(go).collect{|pmid, article|
-    article.text
-  }
-  pos = classifier.classify(features_go).select{|v| v == '+'}.length
-  features_all = PubMed.get_article(all).collect{|pmid, article|
-    article.text
-  }
-  neg = classifier.classify(features_all).select{|v| v == '-'}.length
-  puts "#{ pos } #{ neg }"
-  precision = (pos + neg) / (ndocs).to_f
-  recall    = pos / go.length.to_f
-  f1        = ( 2 * precision * recall) / (precision + recall ).to_f
-  puts "Precision: #{ precision}, Recall: #{ recall }, F1: #{f1}"
-end
-task 'clean' do
-  FileUtils.rm Dir.glob("data/*")
-  FileUtils.rm Dir.glob("model/*")
-  FileUtils.rm Dir.glob("results/*")
-end
-task 'all' do
-  Organism.all.each{|org|
-    Rake::Task["model/#{ org }"].invoke
-  }
-end
-task 'update' do
-  if $org
-    FileUtils.rm Dir.glob("**/#{$org}.*") if $force
-    Rake::Task["model/#{$org}"].invoke
-  else
-    Rake::Task['clean'].invoke if $force
-    Rake::Task['all'].invoke
-  end
-end

data/install_scripts/get_abner.sh DELETED

@@ -1,2 +0,0 @@
-#!/bin/bash
-wget http://pages.cs.wisc.edu/~bsettles/abner/abner.jar

data/install_scripts/get_banner.sh DELETED

@@ -1,25 +0,0 @@
-#!/bin/bash
-wget "http://downloads.sourceforge.net/banner/BANNER_v02.zip?modtime=1196955449&big_mirror=0"
-wget "http://downloads.sourceforge.net/banner/gene_model_v02.bin?modtime=1196955509&big_mirror=0"
-mv BANNER_v02.zip BANNER.zip
-mv gene_model_v02.bin gene_model.bin
-unzip BANNER.zip
-cd BANNER
-libs=`find libs/ -name "*.jar"`
-mkdir classes
-javac -classpath `echo $libs|sed s/\ /:/g` -d classes `find src/ -name "*.java"`
-cd classes
-for f in ../libs/*.jar; do jar xf "$f";done
-jar cf banner.jar *
-mv banner.jar ../..
-cd ..
-cp -R nlpdata/ ../
-cd ..
-rm BANNER.zip
-rm -Rf BANNER