RubyGems - rbbt - Versions diffs - 1.0.0 - Mend

rbbt 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (59) hide show

data/LICENSE +20 -0
data/README.rdoc +17 -0
data/bin/rbbt_config +180 -0
data/install_scripts/classifier/R/classify.R +36 -0
data/install_scripts/classifier/Rakefile +140 -0
data/install_scripts/get_abner.sh +2 -0
data/install_scripts/get_banner.sh +25 -0
data/install_scripts/get_biocreative.sh +72 -0
data/install_scripts/get_crf++.sh +26 -0
data/install_scripts/get_entrez.sh +4 -0
data/install_scripts/get_go.sh +4 -0
data/install_scripts/get_polysearch.sh +8 -0
data/install_scripts/ner/Rakefile +206 -0
data/install_scripts/ner/config/default.rb +52 -0
data/install_scripts/norm/Rakefile +218 -0
data/install_scripts/norm/config/cue_default.rb +10 -0
data/install_scripts/norm/config/tokens_default.rb +79 -0
data/install_scripts/norm/functions.sh +21 -0
data/install_scripts/organisms/Rakefile +25 -0
data/install_scripts/organisms/cgd.Rakefile +84 -0
data/install_scripts/organisms/human.Rakefile +145 -0
data/install_scripts/organisms/mgi.Rakefile +77 -0
data/install_scripts/organisms/pombe.Rakefile +40 -0
data/install_scripts/organisms/rake-include.rb +258 -0
data/install_scripts/organisms/rgd.Rakefile +88 -0
data/install_scripts/organisms/sgd.Rakefile +66 -0
data/install_scripts/organisms/tair.Rakefile +54 -0
data/install_scripts/organisms/worm.Rakefile +109 -0
data/install_scripts/stopwords +1 -0
data/install_scripts/wordlists/consonants +897 -0
data/install_scripts/wordlists/stopwords +1 -0
data/lib/rbbt/bow/bow.rb +87 -0
data/lib/rbbt/bow/classifier.rb +118 -0
data/lib/rbbt/bow/dictionary.rb +218 -0
data/lib/rbbt/ner/abner.rb +34 -0
data/lib/rbbt/ner/banner.rb +73 -0
data/lib/rbbt/ner/regexpNER.rb +62 -0
data/lib/rbbt/ner/rner.rb +227 -0
data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
data/lib/rbbt/ner/rnorm/tokens.rb +213 -0
data/lib/rbbt/ner/rnorm.rb +142 -0
data/lib/rbbt/sources/biocreative.rb +75 -0
data/lib/rbbt/sources/biomart.rb +106 -0
data/lib/rbbt/sources/entrez.rb +211 -0
data/lib/rbbt/sources/go.rb +40 -0
data/lib/rbbt/sources/organism.rb +197 -0
data/lib/rbbt/sources/polysearch.rb +88 -0
data/lib/rbbt/sources/pubmed.rb +111 -0
data/lib/rbbt/util/arrayHash.rb +255 -0
data/lib/rbbt/util/filecache.rb +72 -0
data/lib/rbbt/util/index.rb +69 -0
data/lib/rbbt/util/misc.rb +101 -0
data/lib/rbbt/util/open.rb +207 -0
data/lib/rbbt/util/simpleDSL.rb +87 -0
data/lib/rbbt/util/tmpfile.rb +19 -0
data/lib/rbbt/version.rb +10 -0
data/lib/rbbt.rb +86 -0
data/tasks/install.rake +123 -0
metadata +114 -0

data/install_scripts/ner/Rakefile ADDED Viewed

@@ -0,0 +1,206 @@
+require 'rbbt/sources/organism'
+require 'rbbt/sources/biocreative'
+require 'rbbt/ner/rner'
+require 'progress-monitor'
+$type = ENV['type'] || 'rner'
+#{{{ FEATURES
+def BC2GM_features(dataset, outfile)
+  data = Biocreative.BC2GM(dataset)
+  fout = File.open(outfile,'w')
+  parser   = NERFeatures.new
+  Progress.monitor("CRFPP Features BC2GM #{ dataset }")
+  data.each{|code, info|
+    text = info[:text]
+    mentions = info[:mentions]
+    features = parser.tagged_features(text,mentions)
+    features.each{|feat|
+      fout.puts feat.join(" ")
+    }
+    fout.puts
+  }
+  fout.close
+end
+def BC2GN_features(dataset, outfile)
+  data = {}
+  Dir.glob(File.join(Rbbt.datadir,'biocreative','BC2GN',dataset,'*.txt')).each{|f|
+    code = File.basename(f).sub(/.txt/,'')
+    data[code] = {}
+    data[code][:text] = Open.read(f)
+  }
+  Open.read(File.join(Rbbt.datadir,'biocreative','BC2GN',dataset,'genelist')).each{|l|
+   code, gene, mention = l.chomp.split(/\t/)
+   data[code][:mentions] ||= []
+   data[code][:mentions] << mention
+  }
+  fout = File.open(outfile,'w')
+  parser   = NERFeatures.new
+  Progress.monitor("CRFPP Features BC2GN #{ dataset }")
+  data.each{|code, info|
+    text = info[:text]
+    mentions = info[:mentions]
+    next if mentions.nil?
+    features = parser.tagged_features(text,mentions)
+    features.each{|feat|
+      fout.puts feat.join(" ")
+    }
+    fout.puts
+  }
+  fout.close
+end
+def org_features(org, outfile)
+  names = Organism.lexicon(org).collect{|code, names|
+    names
+  }.flatten
+  fout = File.open(outfile,'w')
+  parser   = NERFeatures.new
+  Progress.monitor("CRFPP Features #{ org }")
+  names.each{|name|
+    features = parser.text_features(name, true)
+    features.each{|feat|
+      fout.puts feat.join(" ")
+    }
+    fout.puts
+  }
+  fout.close
+end
+file "data/BC2GM_train.features" do |t|
+  BC2GM_features(:train, 'data/BC2GM_train.features')
+end
+file "data/BC2GM_test.features" do |t|
+  BC2GM_features(:test, 'data/BC2GM_test.features')
+end
+file "data/BC2GN_Train.features" do |t|
+  BC2GN_features('Train', 'data/BC2GN_Train.features')
+end
+file "data/BC2GN_Test.features" do |t|
+  BC2GN_features('Test', 'data/BC2GN_Test.features')
+end
+file "data/BC2GM.features" => ['data/BC2GM_train.features','data/BC2GM_test.features'] do |t|
+  Open.write('data/BC2GM.features',Open.read('data/BC2GM_train.features'))
+  Open.append('data/BC2GM.features',Open.read('data/BC2GM_test.features'))
+end
+file "data/BC2GN.features" => ['data/BC2GN_Train.features','data/BC2GN_Test.features'] do |t|
+  Open.write('data/BC2GN.features',Open.read('data/BC2GN_Train.features'))
+  Open.append('data/BC2GN.features',Open.read('data/BC2GN_Test.features'))
+end
+file "data/BC2.features" => ['data/BC2GN.features','data/BC2GM.features'] do |t|
+  Open.write('data/BC2.features',Open.read('data/BC2GM.features'))
+  Open.append('data/BC2.features',Open.read('data/BC2GN.features'))
+end
+file "data/train.features" => [
+  #'data/BC2GN.features',
+  'data/BC2GM_train.features'
+  ] do |t|
+  t.prerequisites.each_with_index{|f,i|
+    if i == 0
+      Open.write('data/train.features',Open.read(f))
+    else
+      Open.append('data/train.features',Open.read(f))
+    end
+  }
+end
+rule (/data\/(.*).features/) =>  ['data/BC2.features'] do |t|
+  org = File.basename(t.name).sub(/.features$/,'')
+  org_features(org, t.name)
+  Open.append(t.name, Open.read('data/BC2.features'))
+end
+#{{{ MODEL
+rule (/model\/(.*)/) => lambda {|t| t.sub(/model/,'data') + '.features'} do |t|
+  parser = NERFeatures.new
+  parser.train( t.name.sub(/model/,'data') + '.features', t.name)
+end
+task 'clean' do
+  FileUtils.rm Dir.glob("data/*")
+  FileUtils.rm Dir.glob("model/*")
+  FileUtils.rm Dir.glob("results/*")
+end
+task 'all' do
+  Organism.all.each{|org|
+    Rake::Task["model/#{ org }"].invoke
+  }
+end
+task 'default' do
+  if $org
+    FileUtils.rm Dir.glob("**/#{$org}.*") if $force
+    Rake::Task["model/#{$org}"].invoke
+  else
+    Rake::Task['clean'].invoke if $force
+    Rake::Task['all'].invoke
+  end
+end
+#{{{ EVALUATE
+def find(model, type, outfile)
+  ner = Organism.ner(:human,type,:model => model)
+  data = Biocreative.BC2GM(:test)
+  fout = File.open(outfile,'w')
+  Progress.monitor("Test")
+  data.each{|code,info|
+    text = info[:text]
+    mentions = ner.extract(text)
+    mentions.each{|mention|
+      positions = Biocreative.position(text,mention)
+      positions.each{|pos|
+        fout.puts "#{code}|#{pos[0]} #{pos[1]}|#{mention}"
+      }
+    }
+  }
+end
+rule (/results\/test$/)  do |t|
+  org = File.basename(t.name)
+  if $type == 'rner'
+    Rake::Task['model/train'].invoke
+  end
+  find('model/train',$type,t.name)
+end
+rule (/results\/test.eval$/) => ['results/test'] do |t|
+  Biocreative.BC2GM_eval('results/test',:test, 'results/test.eval')
+end

data/install_scripts/ner/config/default.rb ADDED Viewed

@@ -0,0 +1,52 @@
+isLetters     /^[A-Z]+$/i
+isUpper       /^[A-Z]+$/
+isLower       /^[a-z]+$/
+isDigits      /^[0-9]+$/i
+isRoman       /^[IVX]+$/
+isGreek       /^(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)$/i
+isPunctuation /^[,.;]$/
+isDelim       /^[\/()\[\]{}\-]$/
+isNonWord     /^[^\w]+$/
+isConjunction /^and|or|&|,$/
+hasLetters    /[A-Z]/i
+hasUpper      /.[A-Z]/
+hasLower      /[a-z]/
+hasDigits     /[0-9]/i
+hasGreek      /(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)/i
+hasPunctuation /[,.;]/
+hasDelim      /[\/()\[\]{}\-]/
+hasNonWord    /[^\w]/
+caspMix       /[a-z].[A-Z]/
+keywords      /(?:protein|gene|domain|ase)s?$/
+hasSuffix     /[a-z][A-Z0-9]$/
+numLetters    do |w| w.scan(/[A-Z]/i).length end
+numDigits     do |w| w.scan(/[0-9]/).length end
+#
+prefix_3      /^(...)/
+prefix_4      /^(....)/
+suffix_3      /(...)$/
+suffix_4      /(....)$/
+token1        do |w|
+                 w.sub(/[A-Z]/,'A').
+                   sub(/[a-z]/,'a').
+                   sub(/[0-9]/,'0').
+                   sub(/[^0-9a-z]/i,'x')
+              end
+token2        do  |w|
+                 w.sub(/[A-Z]+/,'A').
+                   sub(/[a-z]+/,'a').
+                   sub(/[0-9]+/,'0').
+                   sub(/[^0-9a-z]+/i,'x')
+               end
+token3         do |w| w.downcase end
+special        do |w| w.is_special? end
+context   %w(special token2 isPunctuation isDelim)
+window     %w(1 2 3 -1 -2 -3)
+#direction :reverse

data/install_scripts/norm/Rakefile ADDED Viewed

@@ -0,0 +1,218 @@
+require 'rbbt'
+require 'rbbt/sources/organism'
+require 'rbbt/util/open'
+require 'rbbt/ner/rner'
+require 'progress-meter'  # NOTE(review): ner/Rakefile requires 'progress-monitor' before calling Progress.monitor -- confirm which name is right
+$type = ENV['ner'] || :rner  # NER implementation; a String when taken from the environment, the Symbol :rner otherwise
+$debug = !ENV['debug'].nil?  # any value of 'debug' makes match print per-document diagnostics
+$perfect = !ENV['perfect'].nil?  # any value of 'perfect' makes match use the gold-standard mentions instead of running the NER
+$docs  = ENV['docs']  # optional comma-separated list of document ids (without .txt) to process
+$org2rbbt = {  # maps the organism names used by the tasks to the names used for organisms/<name>/identifiers and Organism.ner
+  'yeast' => 'sgd',
+  'mouse' => 'mgi',
+  'fly' => 'sgd',  # NOTE(review): same value as 'yeast' -- confirm this is not a copy-paste slip
+  'bc2gn' => 'human',
+}
+def match(org, filedir, goldstandard,outfile)
+  t = Time.now
+  if org == 'bc2gn'
+    custom_file = File.join('config', org + '.config')
+    norm = Normalizer.new(File.join(Rbbt.datadir,"biocreative/BC2GN/entrezGeneLexicon.list"),
+                        :to_entrez => false,
+                        :file => (File.exist?(custom_file) ? custom_file : nil),
+                        :max_candidates => 200)
+  else
+    custom_file = File.join('config', org + '.config')
+    norm = Normalizer.new(File.join(Rbbt.datadir,"biocreative/BC1GN/#{org}/synonyms.list"),
+                        :to_entrez => Open.to_hash(File.join(Rbbt.datadir,"organisms/#{$org2rbbt[org]}/identifiers"),
+                                     :native => 0, :extra => 1,:single => true, :sep => "\t|\\|",
+                                     :fix => proc{|l| l.sub(/S000/,'S0')}),
+                        :file => (File.exist?(custom_file) ? custom_file : nil),
+                        :max_candidates => 200)
+  end
+  STDERR.puts "Loaded Normalizer #{Time.now - t}\n\n"
+  if $type.to_s == 'rner'
+    ner = NER.new('models/' + org)
+  else
+    ner = Organism.ner($org2rbbt[org], $type)
+  end
+  fout=File.open(outfile,'w')
+  gs  = Open.to_hash(goldstandard,:native => 0,:extra => 1)
+  gs_mentions  = Open.to_hash(goldstandard,:native => 0,:extra => 2)
+  if org == 'bc2gn'
+    lex = Open.to_hash( File.join(Rbbt.datadir,"biocreative/BC2GN/entrezGeneLexicon.list"), :sep => "\t|\\|")
+  else
+    lex = Open.to_hash( File.join(Rbbt.datadir,"biocreative/BC1GN/#{org}/synonyms.list"), :sep => "\t|\\|")
+  end
+  if $docs
+    files = $docs.split(',').collect{|doc| File.join(filedir, doc + '.txt')}
+  else
+    files = Dir.glob(filedir + '*.txt').sort
+  end
+  Progress.monitor("Processing Files")
+  files.each{|f|
+    fid = File.basename(f).sub(/.txt/,'')
+    text = Open.read(f)
+    if $perfect
+      mentions = (gs_mentions[fid] || []).flatten
+    else
+      mentions = ner.extract(text).uniq
+    end
+    if $debug
+      puts "------------------------------------"
+      puts "FILE #{fid}"
+      puts
+      puts text
+      puts "CODES: #{(gs[fid] || []).flatten.join(", ")}"
+      puts "MENTIONS: #{mentions.join(", ")}"
+    end
+    found = []
+    mentions.each{|mention|
+      codes = norm.select(norm.match(mention),mention,text)
+      found += codes
+      codes.each{|code|
+        #code = code.sub(/S000/,'S0')
+        fout.puts "#{ fid }\t#{ code}\t#{mention}"
+      }
+      puts "Mention: #{ mention } => #{ codes.join(", ") }"  if $debug
+    }
+    if $debug
+      found.uniq!
+      fn = (gs[fid] || []).flatten.uniq - found
+      fp = found - (gs[fid] || []).flatten.uniq
+      fn.each{|code|
+        if lex[code]
+          puts "FN: #{ code } => #{lex[code].flatten.join(", ")}"
+        else
+           puts "FN: #{ code }"
+        end
+     }
+      fp.each{|code|
+        if lex[code]
+          puts "FP: #{ code } => #{lex[code].flatten.join(", ")}"
+        else
+           puts "FN: #{ code }"
+        end
+      }
+    end
+  }
+  fout.close
+end
+rule (/models\/(yeast|mouse|fly|bc2gn).features/) do |t|
+  org = File.basename(t.name).sub(/\.features/,'')
+  if org == 'bc2gn'
+    lexicon = File.join(Rbbt.datadir, "biocreative/BC2GN/entrezGeneLexicon.list")
+  else
+    lexicon = File.join(Rbbt.datadir, "biocreative/BC1GN/#{ org }/synonyms.list")
+  end
+  names = File.open(lexicon).collect{|l|
+    names = l.split(/\t/)
+    names.shift
+    names.compact.select{|n| !n.empty?}
+  }.flatten
+  fout = File.open(t.name,'w')
+  parser   = NERFeatures.new
+  Progress.monitor("CRFPP Features #{ org }")
+  names.each{|name|
+    features = parser.text_features(name, true)
+    features.each{|feat|
+      fout.puts feat.join(" ")
+    }
+    fout.puts
+  }
+  fout.close
+  if org != 'bc2gn'
+    Open.append(t.name, Open.read('../ner/data/BC2.features'))
+  else
+    Open.append(t.name, Open.read('../ner/data/BC2GM.features'))
+    Open.append(t.name, Open.read('../ner/data/BC2GN_Train.features'))
+  end
+end
+rule (/models\/(yeast|mouse|fly|bc2gn)$/) => lambda{|t| t + '.features' } do |t|
+  org = File.basename(t.name)
+  parser = NERFeatures.new
+  parser.train( t.name + '.features', t.name)
+end
+rule (/results\/(yeast|mouse|fly)_(devtest|train|test)$/) do |t|
+  org, dataset = File.basename(t.name).split(/_/)
+  if $type.to_sym == :rner
+    Rake::Task['models/' + org].invoke
+  end
+  filedir      = File.join(Rbbt.datadir, "biocreative/BC1GN/#{ org }/#{ dataset }/text/")
+  goldstandard = File.join(Rbbt.datadir, "biocreative/BC1GN/#{ org }/#{ dataset }/genelist")
+  match(org,filedir, goldstandard,t.name)
+end
+rule (/results\/(.+)_(.+).eval/) => lambda{|t| t.sub(/.eval/,'')} do |t|
+  org, dataset = File.basename(t.name.sub(/.eval/,'')).split(/_/)
+  cmd = "perl #{File.join(Rbbt.datadir, "biocreative/BC1GN/task1Bscorer.pl")} #{File.join(Rbbt.datadir, "biocreative/BC1GN/#{ org }/#{ dataset }/genelist")} #{t.name.sub(/.eval/,'')} > #{t.name}"
+  puts cmd
+  system cmd
+end
+rule (/results\/bc2gn$/) do |t|
+  org = 'bc2gn'
+  if $type.to_sym == :rner
+    Rake::Task['models/' + org].invoke
+  end
+  filedir      = File.join(Rbbt.datadir, "biocreative/BC2GN/Test/")
+  goldstandard = File.join(Rbbt.datadir, "biocreative/BC2GN/Test/genelist")
+  match(org,filedir, goldstandard,t.name)
+end
+rule (/results\/bc2gn.eval/) => lambda{|t| t.sub(/.eval/,'')} do |t|
+  cmd = "python #{Rbbt.datadir + '/biocreative/BC2GN/bc2scoring.py'} #{Rbbt.datadir + '/biocreative/BC2GN/Test/genelist'} results/bc2gn > #{t.name}"
+  system cmd
+end

data/install_scripts/norm/config/cue_default.rb ADDED Viewed

@@ -0,0 +1,10 @@
+equal    do |w| [w] end  # the name itself, unchanged
+standard do |w| [w.downcase.split(/\s+/).sort.join("")] end  # lower-cased, whitespace-separated words sorted and concatenated
+cleaned  do |w| [w.downcase.sub(/,.*/,'').sub(/\(.*\)/,'')] end  # lower-cased, with everything from the first comma on and one parenthesised part removed
+special  do |w| s = w.split.select{|w| w.is_special?}.collect{|w| w.downcase.sub(/p$/,'')} end  # lower-cased words for which is_special? holds, minus a trailing "p"; NOTE(review): the local s is never read
+words    do |w|  # runs of two or more letters, longest first, lower-cased
+  w.sub(/(.*)I$/,'\1I \1').
+    scan(/[a-z][a-z]+/i).
+    sort{|a,b| b.length <=> a.length}.
+    collect{|n| n.downcase}
+end

data/install_scripts/norm/config/tokens_default.rb ADDED Viewed

@@ -0,0 +1,79 @@
+require 'rbbt/util/misc'
+tokens do  # token types, each given as a regexp, a block, or a bare name; NOTE(review): how bare names are matched and whether order matters is decided by the loader, not shown here
+  # Some (possible) single letters first
+  receptor     /^(?:receptor|r)s?$/i
+  protein      /^(?:protein|p)s?$/i
+  roman        /^[IV]+$/
+  greek_letter do |w| $inverse_greek[w.downcase] != nil end
+  # Some words for removal
+  stopword     do |w|  $stopwords.include?( w.downcase_first)  end
+  gene         /genes?/i  # NOTE(review): unanchored, unlike receptor and protein above -- confirm
+  dna
+  cdna
+  rna
+  mrna
+  trna
+  cdna  # NOTE(review): second occurrence of cdna in this list
+  component
+  exon
+  intron
+  domain
+  family
+  # Important words
+  number       /^(?:\d+[.,]?\d+|\d)$/  # a single digit, or two or more digits with at most one "." or "," between them
+  greek        do |w| $greek[w.downcase] != nil end
+  special      do |w| w.is_special? end
+  promoter
+  similar      /^(homolog.*|like|related|associated)$/
+  ase          /ase$/
+  in_end       /in$/
+end
+comparisons do  # scoring and transformation rules keyed by the token types above
+  compare.number do |l1,l2|  # the first matching case below decides the value
+      v = 0
+      case
+      when l1.empty? && l2.empty?
+          v = 0
+      when l1.sort.uniq == l2.sort.uniq
+          v = 3
+      when l1.any? && l1[0] == l2[0]
+          v = -3
+      when l1.empty? && l2 == ['1']
+          v = -5
+      else
+          v = -10
+      end
+      v
+  end
+  diff.promoter   -10
+  diff.receptor   -10
+  diff.similar    -10
+  diff.capital    -10
+  same.unknown      1
+  miss.unknown      -2
+  extr.unknown      -2
+  same.greek      1
+  miss.greek      -2
+  extr.greek      -2
+  same.special    4
+  miss.special    -3
+  extr.special    -3
+  transform.roman do |t| [t.arabic, :number] end  # each transform block returns [value, token type]
+  transform.greek_letter do |t| [$inverse_greek[t.downcase], :greek] end
+  transform.ase do |t| [t, :special] end
+  transform.in_end do |t| [t, :special] end
+  transform.unknown do |t| [t, (t.length < 4 ? :special : :unknown)] end  # unknown tokens shorter than four characters are retyped as :special
+end

data/install_scripts/norm/functions.sh ADDED Viewed

@@ -0,0 +1,21 @@
+#!/bin/bash
+# norm ORG DATASET NER [RAKE_ARGS...]
+# Removes results/ORG_DATASET, rebuilds results/ORG_DATASET.eval through rake
+# with ner=NER (rake output goes to ORG_DATASET.log_NER) and shows the tail of
+# the evaluation. The command line is echoed before it is run.
+function norm(){
+    o=$1
+    shift
+    s=$1
+    shift
+    n=$1
+    shift
+    echo "rm results/${o}_$s; rake results/${o}_$s.eval ner=$n $* > ${o}_$s.log_$n; tail results/${o}_$s.eval"
+    # Quoted so that names and extra arguments containing spaces or glob
+    # characters are passed through unchanged.
+    rm "results/${o}_$s"; rake "results/${o}_$s.eval" "ner=$n" "$@" > "${o}_$s.log_$n"; tail "results/${o}_$s.eval"
+}
+# norm_2 NER [RAKE_ARGS...]
+# Removes results/bc2gn, rebuilds results/bc2gn.eval through rake with
+# ner=NER (rake output goes to bc2gn.log_NER) and shows the tail of the
+# evaluation. The command line is echoed before it is run.
+function norm_2(){
+    n=$1
+    shift
+    echo "rm results/bc2gn; rake results/bc2gn.eval ner=$n $* > bc2gn.log_$n; tail results/bc2gn.eval"
+    # Quoted so that extra arguments containing spaces or glob characters are
+    # passed through unchanged.
+    rm results/bc2gn; rake results/bc2gn.eval "ner=$n" "$@" > "bc2gn.log_$n"; tail results/bc2gn.eval
+}

data/install_scripts/organisms/Rakefile ADDED Viewed

@@ -0,0 +1,25 @@
+$org = [$org, ENV['organism'],nil].reject{|e| e.nil? }.first
+task 'default' do
+  if $org
+    orgs = [$org]
+  else
+    orgs = Dir.glob('*').
+      select{|t|
+      File.directory?(t ) &&
+        File.exist?(t + '/Rakefile')
+    }
+  end
+  orgs.each{|org|
+    puts "Updating #{ org }"
+    pid = Process.fork{
+      Dir.chdir(org)
+      load 'Rakefile'
+      Rake::Task['update'].invoke
+    }
+    Process.waitpid pid
+  }
+end

data/install_scripts/organisms/cgd.Rakefile ADDED Viewed

@@ -0,0 +1,84 @@
+require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
+$name = "Candida albicans"  # settings below are presumably consumed by the required rake-include file, not shown here -- confirm
+$native_id = "Systematic Name"
+$entrez2native = {
+  :tax => 237561,
+  :fix => proc{|code| code.sub(/^CaO/,'orf') },  # rewrites a leading "CaO" to "orf"
+  :check => proc{|code| code.match(/^orf/)},  # truthy only for codes starting with "orf"
+  :native => 3
+}
+$lexicon = {
+  :file => {
+    :url => 'http://hypha.stanford.edu/download/chromosomal_feature_files/chromosomal_feature.tab',
+    :native => 0,
+    :extra => [8,1,2],
+    :exclude => proc{|l| l.match(/^!/) && !l.match(/^orf/)}  # NOTE(review): a line starting with "!" can never start with "orf", so the second test has no effect -- confirm whether || was intended
+  },
+}
+$identifiers = {
+  :file => {
+    :url => 'http://hypha.stanford.edu/download/chromosomal_feature_files/chromosomal_feature.tab',
+    :native => 0,
+    :extra => [8,1,2],
+    :exclude => proc{|l| l.match(/^!/)},  # truthy for lines starting with "!"
+    :fields => ["GCD ID", "Gene Name", "Gene Alias"]  # NOTE(review): "GCD ID" may be a typo for "CGD ID" -- confirm before changing, the string may be used as a key elsewhere
+  },
+}
+$go = {
+  :url => "http://www.candidagenome.org/go/gene_association.cgd.gz",
+  :code => 10,
+  :go   => 4,
+  :pmid => 5,
+  :fix => proc{|l| v = l.split(/\t/); v[10] = (v[10] || "").split('|').first; v.join("\t")}  # keeps only the first "|"-separated value of tab field 10
+}
+$query = '"candida albicans"[All Fields] AND ((("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word]) OR (("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word])) AND hasabstract[text] AND English[lang]'
+####
+#Rake::Task['identifiers'].clear
+#file 'identifiers' => ['lexicon'] do |t|
+#  identifiers = {}
+#  if $identifiers[:file]
+#    identifiers = Open.to_hash($identifiers[:file][:url], $identifiers[:file])
+#  end
+#
+#  orf2native = Open.to_hash('lexicon', :native => 1, :extra => 0, :single => true)
+#
+#  translations = {}
+#
+#  Entrez.entrez2native(*$entrez2native.values_at(:tax,:native,:fix,:check)).each{|entrez, orfs|
+#    orfs.each{|orf|
+#      translations[orf] ||= []
+#      translations[orf] << entrez
+#    }
+#  }
+#
+#  orf2native.each{|orf, native|
+#    next unless identifiers[native]
+#    identifiers[native] << [orf]
+#    if translations[orf]
+#      identifiers[native] << translations[orf]
+#    else
+#      identifiers[native] << []
+#    end
+#
+#  }
+#
+#  header = "#" + [$native_id, 'Gene Name', 'Orf',  "Entrez Gene ID"].uniq.join("\t") + "\n"
+#  Open.write('identifiers',
+#             header +
+#             identifiers.collect{|code, name_lists|
+#               "#{ code }\t" + name_lists.collect{ |names| names.join("|") }.join("\t")
+#             }.join("\n")
+#            )
+#end
+#
+#