RubyGems - rbbt - Versions diffs - 1.1.7 → 2.0.0 - Mend

rbbt 1.1.7 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (60) hide show

checksums.yaml +7 -0
data/README.rdoc +2 -138
metadata +72 -136
data/LICENSE +0 -20
data/bin/rbbt_config +0 -246
data/install_scripts/classifier/R/classify.R +0 -36
data/install_scripts/classifier/Rakefile +0 -145
data/install_scripts/get_abner.sh +0 -2
data/install_scripts/get_banner.sh +0 -25
data/install_scripts/get_biocreative.sh +0 -72
data/install_scripts/get_crf++.sh +0 -26
data/install_scripts/get_entrez.sh +0 -4
data/install_scripts/get_go.sh +0 -4
data/install_scripts/get_polysearch.sh +0 -8
data/install_scripts/ner/Rakefile +0 -206
data/install_scripts/ner/config/default.rb +0 -52
data/install_scripts/norm/Rakefile +0 -219
data/install_scripts/norm/config/cue_default.rb +0 -10
data/install_scripts/norm/config/tokens_default.rb +0 -79
data/install_scripts/norm/functions.sh +0 -23
data/install_scripts/organisms/Rakefile +0 -43
data/install_scripts/organisms/cgd.Rakefile +0 -84
data/install_scripts/organisms/human.Rakefile +0 -145
data/install_scripts/organisms/mgi.Rakefile +0 -77
data/install_scripts/organisms/pombe.Rakefile +0 -40
data/install_scripts/organisms/rake-include.rb +0 -258
data/install_scripts/organisms/rgd.Rakefile +0 -88
data/install_scripts/organisms/sgd.Rakefile +0 -66
data/install_scripts/organisms/tair.Rakefile +0 -54
data/install_scripts/organisms/worm.Rakefile +0 -109
data/install_scripts/wordlists/consonants +0 -897
data/install_scripts/wordlists/stopwords +0 -1
data/lib/rbbt.rb +0 -86
data/lib/rbbt/bow/bow.rb +0 -88
data/lib/rbbt/bow/classifier.rb +0 -116
data/lib/rbbt/bow/dictionary.rb +0 -187
data/lib/rbbt/ner/abner.rb +0 -34
data/lib/rbbt/ner/banner.rb +0 -73
data/lib/rbbt/ner/dictionaryNER.rb +0 -98
data/lib/rbbt/ner/regexpNER.rb +0 -70
data/lib/rbbt/ner/rner.rb +0 -227
data/lib/rbbt/ner/rnorm.rb +0 -143
data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
data/lib/rbbt/ner/rnorm/tokens.rb +0 -213
data/lib/rbbt/sources/biocreative.rb +0 -75
data/lib/rbbt/sources/biomart.rb +0 -105
data/lib/rbbt/sources/entrez.rb +0 -211
data/lib/rbbt/sources/go.rb +0 -40
data/lib/rbbt/sources/organism.rb +0 -245
data/lib/rbbt/sources/polysearch.rb +0 -117
data/lib/rbbt/sources/pubmed.rb +0 -111
data/lib/rbbt/util/arrayHash.rb +0 -255
data/lib/rbbt/util/filecache.rb +0 -72
data/lib/rbbt/util/index.rb +0 -47
data/lib/rbbt/util/misc.rb +0 -106
data/lib/rbbt/util/open.rb +0 -235
data/lib/rbbt/util/rake.rb +0 -183
data/lib/rbbt/util/simpleDSL.rb +0 -87
data/lib/rbbt/util/tmpfile.rb +0 -19
data/tasks/install.rake +0 -124

data/install_scripts/ner/config/default.rb DELETED

@@ -1,52 +0,0 @@
-isLetters     /^[A-Z]+$/i
-isUpper       /^[A-Z]+$/
-isLower       /^[a-z]+$/
-isDigits      /^[0-9]+$/i
-isRoman       /^[IVX]+$/
-isGreek       /^(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)$/i
-isPunctuation /^[,.;]$/
-isDelim       /^[\/()\[\]{}\-]$/
-isNonWord     /^[^\w]+$/
-isConjunction /^and|or|&|,$/
-hasLetters    /[A-Z]/i
-hasUpper      /.[A-Z]/
-hasLower      /[a-z]/
-hasDigits     /[0-9]/i
-hasGreek      /(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)/i
-hasPunctuation /[,.;]/
-hasDelim      /[\/()\[\]{}\-]/
-hasNonWord    /[^\w]/
-caspMix       /[a-z].[A-Z]/
-keywords      /(?:protein|gene|domain|ase)s?$/
-hasSuffix     /[a-z][A-Z0-9]$/
-numLetters    do |w| w.scan(/[A-Z]/i).length end
-numDigits     do |w| w.scan(/[0-9]/).length end
-#
-prefix_3      /^(...)/
-prefix_4      /^(....)/
-suffix_3      /(...)$/
-suffix_4      /(....)$/
-token1        do |w|
-                 w.sub(/[A-Z]/,'A').
-                   sub(/[a-z]/,'a').
-                   sub(/[0-9]/,'0').
-                   sub(/[^0-9a-z]/i,'x')
-              end
-token2        do  |w|
-                 w.sub(/[A-Z]+/,'A').
-                   sub(/[a-z]+/,'a').
-                   sub(/[0-9]+/,'0').
-                   sub(/[^0-9a-z]+/i,'x')
-               end
-token3         do |w| w.downcase end
-special        do |w| w.is_special? end
-context   %w(special token2 isPunctuation isDelim)
-window     %w(1 2 3 -1 -2 -3)
-#direction :reverse

data/install_scripts/norm/Rakefile DELETED

@@ -1,219 +0,0 @@
-require 'rbbt'
-require 'rbbt/sources/organism'
-require 'rbbt/util/open'
-require 'rbbt/ner/rner'
-require 'rbbt/ner/rnorm'
-require 'progress-monitor'
-$type = ENV['ner'] || :rner
-$debug = !ENV['debug'].nil?
-$perfect = !ENV['perfect'].nil?
-$docs  = ENV['docs']
-$org2rbbt = {
-  'yeast' => 'sgd',
-  'mouse' => 'mgi',
-  'fly' => 'sgd',
-  'bc2gn' => 'human',
-}
-def match(org, filedir, goldstandard,outfile)
-  t = Time.now
-  if org == 'bc2gn'
-    custom_file = File.join('config', org + '.config')
-    norm = Normalizer.new(File.join(Rbbt.datadir,"biocreative/BC2GN/entrezGeneLexicon.list"),
-                        :to_entrez => false,
-                        :file => (File.exist?(custom_file) ? custom_file : nil),
-                        :max_candidates => 200)
-  else
-    custom_file = File.join('config', org + '.config')
-    norm = Normalizer.new(File.join(Rbbt.datadir,"biocreative/BC1GN/#{org}/synonyms.list"),
-                        :to_entrez => Open.to_hash(File.join(Rbbt.datadir,"organisms/#{$org2rbbt[org]}/identifiers"),
-                                     :native => 0, :extra => 1,:single => true, :sep => "\t|\\|",
-                                     :fix => proc{|l| l.sub(/S000/,'S0')}),
-                        :file => (File.exist?(custom_file) ? custom_file : nil),
-                        :max_candidates => 200)
-  end
-  STDERR.puts "Loaded Normalizer #{Time.now - t}\n\n"
-  if $type.to_s == 'rner'
-    ner = NER.new('models/' + org)
-  else
-    ner = Organism.ner($org2rbbt[org], $type)
-  end
-  fout=File.open(outfile,'w')
-  gs  = Open.to_hash(goldstandard,:native => 0,:extra => 1)
-  gs_mentions  = Open.to_hash(goldstandard,:native => 0,:extra => 2)
-  if org == 'bc2gn'
-    lex = Open.to_hash( File.join(Rbbt.datadir,"biocreative/BC2GN/entrezGeneLexicon.list"), :sep => "\t|\\|")
-  else
-    lex = Open.to_hash( File.join(Rbbt.datadir,"biocreative/BC1GN/#{org}/synonyms.list"), :sep => "\t|\\|")
-  end
-  if $docs
-    files = $docs.split(',').collect{|doc| File.join(filedir, doc + '.txt')}
-  else
-    files = Dir.glob(filedir + '*.txt').sort
-  end
-  Progress.monitor("Processing Files")
-  files.each{|f|
-    fid = File.basename(f).sub(/.txt/,'')
-    text = Open.read(f)
-    if $perfect
-      mentions = (gs_mentions[fid] || []).flatten
-    else
-      mentions = ner.extract(text).uniq
-    end
-    if $debug
-      puts "------------------------------------"
-      puts "FILE #{fid}"
-      puts
-      puts text
-      puts "CODES: #{(gs[fid] || []).flatten.join(", ")}"
-      puts "MENTIONS: #{mentions.join(", ")}"
-    end
-    found = []
-    mentions.each{|mention|
-      codes = norm.select(norm.match(mention),mention,text)
-      found += codes
-      codes.each{|code|
-        #code = code.sub(/S000/,'S0')
-        fout.puts "#{ fid }\t#{ code}\t#{mention}"
-      }
-      puts "Mention: #{ mention } => #{ codes.join(", ") }"  if $debug
-    }
-    if $debug
-      found.uniq!
-      fn = (gs[fid] || []).flatten.uniq - found
-      fp = found - (gs[fid] || []).flatten.uniq
-      fn.each{|code|
-        if lex[code]
-          puts "FN: #{ code } => #{lex[code].flatten.join(", ")}"
-        else
-           puts "FN: #{ code }"
-        end
-     }
-      fp.each{|code|
-        if lex[code]
-          puts "FP: #{ code } => #{lex[code].flatten.join(", ")}"
-        else
-           puts "FN: #{ code }"
-        end
-      }
-    end
-  }
-  fout.close
-end
-rule (/models\/(yeast|mouse|fly|bc2gn).features/) do |t|
-  org = File.basename(t.name).sub(/\.features/,'')
-  if org == 'bc2gn'
-    lexicon = File.join(Rbbt.datadir, "biocreative/BC2GN/entrezGeneLexicon.list")
-  else
-    lexicon = File.join(Rbbt.datadir, "biocreative/BC1GN/#{ org }/synonyms.list")
-  end
-  names = File.open(lexicon).collect{|l|
-    names = l.split(/\t/)
-    names.shift
-    names.compact.select{|n| !n.empty?}
-  }.flatten
-  fout = File.open(t.name,'w')
-  parser   = NERFeatures.new
-  Progress.monitor("CRFPP Features #{ org }")
-  names.each{|name|
-    features = parser.text_features(name, true)
-    features.each{|feat|
-      fout.puts feat.join(" ")
-    }
-    fout.puts
-  }
-  fout.close
-  if org != 'bc2gn'
-    Open.append(t.name, Open.read('../ner/data/BC2.features'))
-  else
-    Open.append(t.name, Open.read('../ner/data/BC2GM.features'))
-    Open.append(t.name, Open.read('../ner/data/BC2GN_Train.features'))
-  end
-end
-rule (/models\/(yeast|mouse|fly|bc2gn)$/) => lambda{|t| t + '.features' } do |t|
-  org = File.basename(t.name)
-  parser = NERFeatures.new
-  parser.train( t.name + '.features', t.name)
-end
-rule (/results\/(yeast|mouse|fly)_(devtest|train|test)$/) do |t|
-  org, dataset = File.basename(t.name).split(/_/)
-  if $type.to_sym == :rner
-    Rake::Task['models/' + org].invoke
-  end
-  filedir      = File.join(Rbbt.datadir, "biocreative/BC1GN/#{ org }/#{ dataset }/text/")
-  goldstandard = File.join(Rbbt.datadir, "biocreative/BC1GN/#{ org }/#{ dataset }/genelist")
-  match(org,filedir, goldstandard,t.name)
-end
-rule (/results\/(.+)_(.+).eval/) => lambda{|t| t.sub(/.eval/,'')} do |t|
-  org, dataset = File.basename(t.name.sub(/.eval/,'')).split(/_/)
-  cmd = "perl #{File.join(Rbbt.datadir, "biocreative/BC1GN/task1Bscorer.pl")} #{File.join(Rbbt.datadir, "biocreative/BC1GN/#{ org }/#{ dataset }/genelist")} #{t.name.sub(/.eval/,'')} > #{t.name}"
-  puts cmd
-  system cmd
-end
-rule (/results\/bc2gn$/) do |t|
-  org = 'bc2gn'
-  if $type.to_sym == :rner
-    Rake::Task['models/' + org].invoke
-  end
-  filedir      = File.join(Rbbt.datadir, "biocreative/BC2GN/Test/")
-  goldstandard = File.join(Rbbt.datadir, "biocreative/BC2GN/Test/genelist")
-  match(org,filedir, goldstandard,t.name)
-end
-rule (/results\/bc2gn.eval/) => lambda{|t| t.sub(/.eval/,'')} do |t|
-  cmd = "python #{Rbbt.datadir + '/biocreative/BC2GN/bc2scoring.py'} #{Rbbt.datadir + '/biocreative/BC2GN/Test/genelist'} results/bc2gn > #{t.name}"
-  system cmd
-end

data/install_scripts/norm/config/cue_default.rb DELETED

@@ -1,10 +0,0 @@
-equal    do |w| [w] end
-standard do |w| [w.downcase.split(/\s+/).sort.join("")] end
-cleaned  do |w| [w.downcase.sub(/,.*/,'').sub(/\(.*\)/,'')] end
-special  do |w| s = w.split.select{|w| w.is_special?}.collect{|w| w.downcase.sub(/p$/,'')} end
-words    do |w|
-  w.sub(/(.*)I$/,'\1I \1').
-    scan(/[a-z][a-z]+/i).
-    sort{|a,b| b.length <=> a.length}.
-    collect{|n| n.downcase}
-end

data/install_scripts/norm/config/tokens_default.rb DELETED

@@ -1,79 +0,0 @@
-require 'rbbt/util/misc'
-tokens do
-  # Some (possible) single letters first
-  receptor     /^(?:receptor|r)s?$/i
-  protein      /^(?:protein|p)s?$/i
-  roman        /^[IV]+$/
-  greek_letter do |w| $inverse_greek[w.downcase] != nil end
-  # Some words for removal
-  stopword     do |w|  $stopwords.include?( w.downcase_first)  end
-  gene         /genes?/i
-  dna
-  cdna
-  rna
-  mrna
-  trna
-  cdna
-  component
-  exon
-  intron
-  domain
-  family
-  # Important words
-  number       /^(?:\d+[.,]?\d+|\d)$/
-  greek        do |w| $greek[w.downcase] != nil end
-  special      do |w| w.is_special? end
-  promoter
-  similar      /^(homolog.*|like|related|associated)$/
-  ase          /ase$/
-  in_end       /in$/
-end
-comparisons do
-  compare.number do |l1,l2|
-      v = 0
-      case
-      when l1.empty? && l2.empty?
-          v = 0
-      when l1.sort.uniq == l2.sort.uniq
-          v = 3
-      when l1.any? && l1[0] == l2[0]
-          v = -3
-      when l1.empty? && l2 == ['1']
-          v = -5
-      else
-          v = -10
-      end
-      v
-  end
-  diff.promoter   -10
-  diff.receptor   -10
-  diff.similar    -10
-  diff.capital    -10
-  same.unknown      1
-  miss.unknown      -2
-  extr.unknown      -2
-  same.greek      1
-  miss.greek      -2
-  extr.greek      -2
-  same.special    4
-  miss.special    -3
-  extr.special    -3
-  transform.roman do |t| [t.arabic, :number] end
-  transform.greek_letter do |t| [$inverse_greek[t.downcase], :greek] end
-  transform.ase do |t| [t, :special] end
-  transform.in_end do |t| [t, :special] end
-  transform.unknown do |t| [t, (t.length < 4 ? :special : :unknown)] end
-end

data/install_scripts/norm/functions.sh DELETED

@@ -1,23 +0,0 @@
-#!/bin/bash
-function norm(){
-    organism=$1
-    shift
-    dataset=$1
-    shift
-    ner=$1
-    shift
-    CMD="rm results/${organism}_$dataset; rake results/${organism}_$dataset.eval ner=$ner $@ > ${organism}_$dataset.log_$ner; tail results/${organism}_$dataset.eval"
-    echo $CMD
-    $CMD
-}
-function norm_2(){
-    ner=$1
-    shift
-    CMD="rm results/bc2gn; rake results/bc2gn.eval ner=$ner $@ > bc2gn.log_$ner; tail results/bc2gn.eval"
-    echo $CMD
-    $CMD
-}

data/install_scripts/organisms/Rakefile DELETED

@@ -1,43 +0,0 @@
-$org = [$org, ENV['organism'],nil].reject{|e| e.nil? }.first
-task 'names' do
-  orgs = Dir.glob('*').
-    select{|t|
-    File.directory?(t ) &&
-      File.exist?(t + '/Rakefile')
-  }
-  orgs.each{|org|
-    pid = Process.fork{
-      Dir.chdir(org)
-      load 'Rakefile'
-      Rake::Task['name'].invoke
-    }
-    Process.waitpid pid
-  }
-end
-task 'default' do
-  if $org
-    orgs = [$org]
-  else
-    orgs = Dir.glob('*').
-      select{|t|
-      File.directory?(t ) &&
-        File.exist?(t + '/Rakefile')
-    }
-  end
-  orgs.each{|org|
-    puts "Updating #{ org }"
-    pid = Process.fork{
-      Dir.chdir(org)
-      load 'Rakefile'
-      Rake::Task['update'].invoke
-    }
-    Process.waitpid pid
-  }
-end

data/install_scripts/organisms/cgd.Rakefile DELETED

@@ -1,84 +0,0 @@
-require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
-$name = "Candida albicans"
-$native_id = "Systematic Name"
-$entrez2native = {
-  :tax => 237561,
-  :fix => proc{|code| code.sub(/^CaO/,'orf') },
-  :check => proc{|code| code.match(/^orf/)},
-  :native => 3
-}
-$lexicon = {
-  :file => {
-    :url => 'http://hypha.stanford.edu/download/chromosomal_feature_files/chromosomal_feature.tab',
-    :native => 0,
-    :extra => [8,1,2],
-    :exclude => proc{|l| l.match(/^!/) && !l.match(/^orf/)}
-  },
-}
-$identifiers = {
-  :file => {
-    :url => 'http://hypha.stanford.edu/download/chromosomal_feature_files/chromosomal_feature.tab',
-    :native => 0,
-    :extra => [8,1,2],
-    :exclude => proc{|l| l.match(/^!/)},
-    :fields => ["GCD ID", "Gene Name", "Gene Alias"]
-  },
-}
-$go = {
-  :url => "http://www.candidagenome.org/go/gene_association.cgd.gz",
-  :code => 10,
-  :go   => 4,
-  :pmid => 5,
-  :fix => proc{|l| v = l.split(/\t/); v[10] = (v[10] || "").split('|').first; v.join("\t")}
-}
-$query = '"candida albicans"[All Fields] AND ((("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word]) OR (("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word])) AND hasabstract[text] AND English[lang]'
-####
-#Rake::Task['identifiers'].clear
-#file 'identifiers' => ['lexicon'] do |t|
-#  identifiers = {}
-#  if $identifiers[:file]
-#    identifiers = Open.to_hash($identifiers[:file][:url], $identifiers[:file])
-#  end
-#
-#  orf2native = Open.to_hash('lexicon', :native => 1, :extra => 0, :single => true)
-#
-#  translations = {}
-#
-#  Entrez.entrez2native(*$entrez2native.values_at(:tax,:native,:fix,:check)).each{|entrez, orfs|
-#    orfs.each{|orf|
-#      translations[orf] ||= []
-#      translations[orf] << entrez
-#    }
-#  }
-#
-#  orf2native.each{|orf, native|
-#    next unless identifiers[native]
-#    identifiers[native] << [orf]
-#    if translations[orf]
-#      identifiers[native] << translations[orf]
-#    else
-#      identifiers[native] << []
-#    end
-#
-#  }
-#
-#  header = "#" + [$native_id, 'Gene Name', 'Orf',  "Entrez Gene ID"].uniq.join("\t") + "\n"
-#  Open.write('identifiers',
-#             header +
-#             identifiers.collect{|code, name_lists|
-#               "#{ code }\t" + name_lists.collect{ |names| names.join("|") }.join("\t")
-#             }.join("\n")
-#            )
-#end
-#
-#