RubyGems - rbbt-sources - Versions diffs - 0.4.0 → 1.0.0 - Mend

rbbt-sources 0.4.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25)

data/etc/biomart/missing_in_archive +15 -0
data/lib/rbbt/sources/COSMIC.rb +14 -0
data/lib/rbbt/sources/COSTART.rb +1 -1
data/lib/rbbt/sources/CTCAE.rb +1 -1
data/lib/rbbt/sources/InterPro.rb +17 -0
data/lib/rbbt/sources/NCI.rb +7 -0
data/lib/rbbt/sources/biomart.rb +9 -9
data/lib/rbbt/sources/entrez.rb +44 -17
data/lib/rbbt/sources/go.rb +10 -7
data/lib/rbbt/sources/jochem.rb +4 -0
data/lib/rbbt/sources/organism.rb +24 -25
data/lib/rbbt/sources/organism/sequence.rb +253 -19
data/lib/rbbt/sources/polysearch.rb +5 -5
data/lib/rbbt/sources/pubmed.rb +10 -5
data/lib/rbbt/sources/wgEncodeBroadHmm.rb +37 -0
data/share/install/InterPro/Rakefile +29 -0
data/share/install/JoChem/Rakefile +67 -0
data/share/install/NCI/Rakefile +79 -0
data/share/install/Organism/Hsa/Rakefile +20 -1
data/share/install/Organism/Rno/Rakefile +2 -0
data/share/install/Organism/organism_helpers.rb +134 -77
data/share/install/lib/helpers.rb +6 -5
data/test/rbbt/sources/test_biomart.rb +8 -5
data/test/rbbt/sources/test_organism.rb +23 -19
metadata +39 -14

data/lib/rbbt/sources/polysearch.rb CHANGED Viewed

@@ -1,10 +1,10 @@
 require 'rbbt'
 module Polysearch
-  Rbbt.share.Polysearch.organ.define_as_url 'http://wishart.biology.ualberta.ca/polysearch/include/organ_ID.txt'
-  Rbbt.share.Polysearch.tissue.define_as_url 'http://wishart.biology.ualberta.ca/polysearch/include/tissue_ID.txt'
-  Rbbt.share.Polysearch.location.define_as_url 'http://wishart.biology.ualberta.ca/polysearch/include/subcellular_localization_ID.txt'
-  Rbbt.share.Polysearch.disease.define_as_url 'http://wishart.biology.ualberta.ca/polysearch/include/disease_IDlist.txt'
-  Rbbt.share.Polysearch.drug.define_as_url 'http://wishart.biology.ualberta.ca/polysearch/include/drugnames.txt'
+  Rbbt.claim Rbbt.share.databases.Polysearch.organ, :url, 'http://wishart.biology.ualberta.ca/polysearch/include/organ_ID.txt'
+  Rbbt.claim Rbbt.share.databases.Polysearch.tissue, :url, 'http://wishart.biology.ualberta.ca/polysearch/include/tissue_ID.txt'
+  Rbbt.claim Rbbt.share.databases.Polysearch.location, :url, 'http://wishart.biology.ualberta.ca/polysearch/include/subcellular_localization_ID.txt'
+  Rbbt.claim Rbbt.share.databases.Polysearch.disease, :url, 'http://wishart.biology.ualberta.ca/polysearch/include/disease_IDlist.txt'
+  Rbbt.claim Rbbt.share.databases.Polysearch.drug, :url, 'http://wishart.biology.ualberta.ca/polysearch/include/drugnames.txt'
 end

data/lib/rbbt/sources/pubmed.rb CHANGED Viewed

@@ -1,5 +1,6 @@
-require 'rbbt-util'
+require 'rbbt'
 require 'libxml'
+require 'rbbt/sources/gscholar'
 # This module offers an interface with PubMed, to perform queries, and
 # retrieve simple information from articles. It uses the caching
@@ -10,12 +11,16 @@ module PubMed
   @@pubmed_lag = 1
   def self.get_online(pmids)
-    pmid_list = ( pmids.is_a?(Array) ? pmids.join(',') : pmids.to_s )
-    url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=#{pmid_list}"
+    pmids_complete =  pmids.is_a?(Array) ? pmids : [pmids]
-    xml = Open.read(url, :quiet => true, :nocache => true, :nice => @@pubmed_lag, :nice_key => "PubMed")
+    articles = []
+    Misc.divide(pmids_complete, (pmids_complete.length / 500) + 1).each do |pmid_list|
+      url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=#{pmid_list * ","}"
-    articles = xml.scan(/(<PubmedArticle>.*?<\/PubmedArticle>)/smu).flatten
+      xml = Open.read(url, :quiet => true, :nocache => true, :nice => @@pubmed_lag, :nice_key => "PubMed")
+      articles += xml.scan(/(<PubmedArticle>.*?<\/PubmedArticle>)/smu).flatten
+    end
     if pmids.is_a? Array
       list = {}

data/lib/rbbt/sources/wgEncodeBroadHmm.rb ADDED Viewed

@@ -0,0 +1,37 @@
+require 'rbbt'
+module EBChromatin
+  BASE_URL='http://hgdownload-test.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeBroadHmm/'
+  TISSUES= %w(Gm12878 H1hesc Hmec Hsmm Huvec Hepg2 K562 Nhek Nhlf)
+  TISSUES.each do |tissue|
+    file = "wgEncodeBroadHmm#{tissue}HMM.bed.gz"
+    Rbbt.claim Rbbt.share.databases.EBChromatin[file.match(/wgEncodeBroadHmm(.*)HMM.bed.gz/)[1]], :proc do
+      url = File.join(BASE_URL, file)
+      CMD.cmd('sed \'s/^chr\([[:alnum:]]\+\)\t\([[:digit:]]\+\)\t\([[:digit:]]\+\)/\1:\2:\3\t\1\t\2\t\3/\' | cut -f 1,2,3,4,5 | awk \'BEGIN {print "#Region ID\tChromosome Name\tStart\tEnd\tType"} /./ {print $0}\' ', :in => Open.read(url), :pipe => true).read
+    end
+  end
+  def self.chromosome(tissue, chr, positions)
+    list = Array === positions ? positions : [positions]
+    file = Rbbt.share.databases.EBChromatin[tissue]
+    chromosome_bed = Persistence.persist(file, "EBChromatin[#{tissue}][#{chr}]", :fwt, :chromosome => chr, :range => true) do |file, options|
+      chromosome = options[:chromosome]
+      tsv = file.tsv(:persist => false, :type => :list, :grep => "^#{chromosome}:\\|^#")
+      if tsv.size > 0
+        tsv.collect do |gene, values|
+          [gene, values.values_at("Start", "End").collect{|p| p.to_i}]
+        end
+      else
+        raise "No chromatin information for chromosome #{ chr } in tissue #{ tissue }"
+      end
+    end
+    list.collect do |pos| chromosome_bed[pos] end
+  end
+end

data/share/install/InterPro/Rakefile ADDED Viewed

@@ -0,0 +1,29 @@
+$LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', '..', 'lib'))
+require 'rbbt/sources/biomart'
+require 'rbbt/sources/entrez'
+$interpro_db = 'entry'
+$interpro_id = ['InterPro Entry Accession','entry_id']
+$interpro_pos = [
+  ["UniProt/SwissProt Accession", "protein_ac"],
+  ["Match Start Position", "pos_from"],
+  ["Match Stop Position ", "pos_to"]
+]
+file 'interpro_positions' do |t|
+  Open.write(t.name, InterPro.tsv($interpro_db, $interpro_id, $interpro_pos, [], nil, :type => :double, :nocache => true).to_s)
+end
+file 'interpro_names' do |t|
+  Open.write(t.name, "#: :type=:list\n#InterPro Entry Accession\tName\n" +  Open.read("ftp://ftp.ebi.ac.uk/pub/databases/interpro/names.dat"))
+end
+file 'interpro_short_names' do |t|
+  Open.write(t.name,  "#: :type=:list\n#InterPro Entry Accession\tShort Name\n" +  Open.read("ftp://ftp.ebi.ac.uk/pub/databases/interpro/short_names.dat"))
+end

data/share/install/JoChem/Rakefile ADDED Viewed

@@ -0,0 +1,67 @@
+require 'rbbt/util/open'
+require 'rbbt/util/misc'
+def read_chunk(jochem)
+  chunk = ""
+  while (not jochem.eof? and not (line = jochem.gets).match(/^--/))
+    chunk << line
+  end
+  return nil if chunk.empty?
+  chunk
+end
+def first(list)
+  return nil if list.nil? or list.empty?
+  list.first
+end
+def process_jochem
+  jochem = Open.open("http://www.biosemantics.org/uploads/file/Jochem/JochemV1_2.zip")
+  identifiers = File.open('identifiers', 'w')
+  identifiers.puts("#: :namespace=JoChem")
+  identifiers.puts("#ID\tCompound Name\tPubChem:ID\tDrugBank:ID")
+  lexicon = File.open('lexicon', 'w')
+  lexicon.puts("#: :namespace=JoChem")
+  lexicon.puts("#ID\tSynonyms")
+  inchi = File.open('inchi', 'w')
+  inchi.puts("#: :namespace=JoChem")
+  inchi.puts("#ID\tInChi")
+  definitions = File.open('definitions', 'w')
+  definitions.puts("#: :namespace=JoChem#:type=:list")
+  definitions.puts("#ID\tDefinition")
+  while chunk = read_chunk(jochem) do
+    next if chunk.empty? or chunk =~ /^#/ or chunk =~ /^NS /
+      info = {}
+      chunk.split(/\n/).each do |line|
+      line.sub!(/\t@match.*/,'')
+      code, value = line.match(/([A-Z]*) (.*)/).values_at 1, 2
+      info[code] ||= []
+      info[code] << value
+    end
+    id = first(info["ID"])
+    na = first(info["NA"])
+    df = first(info["DF"])
+    tm = info["TM"] || []
+    db = info["DB"] || []
+    pubc = db.collect{|code| code.match(/PUBC_(.*)/) ? $1 : nil}.compact
+    drug = db.collect{|code| code.match(/DRUG_(.*)/) ? $1 : nil}.compact
+    inch = db.collect{|code| code.match(/INCH_InChI=(.*)/) ? $1 : nil}.compact
+    lexicon.puts [id, tm.unshift(na) * "|"] * "\t"
+    identifiers.puts [id, na, pubc * "|", drug * "|"] * "\t"
+    inchi.puts [id, inch * "|"] * "\t" if inch.any?
+    definitions.puts [id, df] * "\t" unless df.nil?
+  end
+end
+rule /identifiers|lexicon|inchi|definitions/ do |t|
+  Misc.in_dir(File.dirname(t.name)) do
+    process_jochem
+  end
+end

data/share/install/NCI/Rakefile ADDED Viewed

@@ -0,0 +1,79 @@
+require 'nokogiri'
+module NCI
+  def self.get_pathways(xml, format = "UP")
+    doc = Nokogiri::XML(xml)
+    pathways = {}
+    molecules = {}
+    doc.xpath("//Molecule").each do |molecule|
+      id = molecule.attribute('id').value
+      type = molecule.attribute('molecule_type').value
+      next unless type == "protein"
+      names = molecule.xpath("Name[@name_type='#{format}']").collect{|name| name.attribute("value").value}
+      next if names.empty?
+      molecules[id] = {:xml => molecule, :uniprot => names.first}
+    end
+    interactions = {}
+    doc.xpath("//Interaction").each do |interaction|
+      id = interaction.attribute('id').value
+      molecule_ids = interaction.xpath('*/InteractionComponent').collect{|c| c.attribute('molecule_idref').value}
+      interactions[id] = {:xml => interaction, :molecule_ids => molecule_ids}
+    end
+    doc.xpath("//Pathway").each do |pathway|
+      id = pathway.attribute('id').value
+      subnet = pathway.attribute('subnet').value
+      name = pathway.xpath('LongName').first.content
+      interaction_ids = pathway.xpath("*/PathwayComponent").collect{|component| component.attribute("interaction_idref").value}
+      pathway_interactions = interaction_ids.collect{|id| interactions[id]}
+      pathway_molecule_ids = pathway_interactions.collect{|info| info[:molecule_ids]}.flatten
+      pathway_uniprot_ids = pathway_molecule_ids.collect do |id|
+        next unless molecules.include? id
+        molecules[id][:uniprot]
+      end
+      pathways[id] = [[name], [pathway_uniprot_ids.compact.uniq]]
+    end
+    pathways
+  end
+end
+file 'nature_pathways' do |t|
+  url = "ftp://ftp1.nci.nih.gov/pub/PID/XML/NCI-Nature_Curated.xml.gz"
+  xml = Open.read(url)
+  pathways = NCI.get_pathways(xml)
+  Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI Pathway ID", :fields => ["Pathway Name","UniProt/SwissProt Accession"]).to_s)
+end
+file 'biocarta_pathways' do |t|
+  url = "ftp://ftp1.nci.nih.gov/pub/PID/XML/BioCarta.xml.gz"
+  xml = Open.read(url)
+  pathways = NCI.get_pathways(xml, "LL")
+  Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI Pathway ID", :fields => ["Pathway Name","Entrez Gene ID"]).to_s)
+end
+file 'reactome_pathways' do |t|
+  url = "ftp://ftp1.nci.nih.gov/pub/PID/XML/Reactome.xml.gz"
+  xml = Open.read(url)
+  pathways = NCI.get_pathways(xml, "UP")
+  Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI Pathway ID", :fields => ["Pathway Name","UniProt/SwissProt Accession"]).to_s)
+end

data/share/install/Organism/Hsa/Rakefile CHANGED Viewed

@@ -7,6 +7,8 @@ $taxs = [9606]
 $scientific_name = "Homo sapiens"
 $biomart_db = 'hsapiens_gene_ensembl'
+$biomart_db_germline_variation = 'hsapiens_snp'
+$biomart_db_somatic_variation = 'hsapiens_snp_som'
 $biomart_lexicon = [
   [ 'Associated Gene Name' , "external_gene_id"],
@@ -15,6 +17,14 @@ $biomart_lexicon = [
   [ 'HGNC curated gene name ', "hgnc_curated_gene_name"  ],
 ]
+$biomart_protein_identifiers = [
+  [ 'Protein ID', "protein_id"  ],
+  [ 'RefSeq Protein ID', "refseq_peptide"  ],
+  [ 'Unigene ID', "unigene"  ],
+  [ 'UniProt/SwissProt ID', "uniprot_swissprot"  ],
+  [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession"  ],
+]
 $biomart_identifiers = [
   [ 'Entrez Gene ID', "entrezgene"],
   [ 'Ensembl Protein ID', "ensembl_peptide_id"  ],
@@ -42,7 +52,7 @@ $biomart_identifiers = [
   [ 'AFFY HG U95E', 'affy_hg_u95e' ],
   [ 'AFFY HG U95A', 'affy_hg_u95a' ],
   [ 'AFFY HUGENEFL', 'affy_hugenefl' ],
-  [ 'AFFY HuEx', 'affy_huex_1_0_st_v2' ],
+  [ 'AFFY HuEx', 'affy_huex_1_0_st_v2', "HuEx" ],
   [ 'AFFY HuGene', 'affy_hugene_1_0_st_v1' ],
   [ 'AFFY U133 X3P', 'affy_u133_x3p' ],
   [ 'Agilent WholeGenome',"agilent_wholegenome" ],
@@ -52,5 +62,14 @@ $biomart_identifiers = [
   [ 'Illumina HumanWG 6 v3', 'illumina_humanwg_6_v3' ],
 ]
+$biomart_go= [
+  ["GO ID", 'go_id'],
+  ["GO Namespace", 'namespace_1003'],
+]
+$biomart_pfam= [
+  ["Pfam Domain", 'pfam'],
+]
 $namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
 load File.join(File.dirname(__FILE__), '../organism_helpers.rb')

data/share/install/Organism/Rno/Rakefile CHANGED Viewed

@@ -7,6 +7,8 @@ $taxs = [10116]
 $scientific_name = "Rattus norvegicus"
 $biomart_db = 'rnorvegicus_gene_ensembl'
+$biomart_db_germline_variation = 'rnorvegicus_snp'
+$biomart_db_somatic_variation = 'rnorvegicus_snp_som'
 $biomart_lexicon = [
   [ 'Associated Gene Name' , "external_gene_id"],

data/share/install/Organism/organism_helpers.rb CHANGED Viewed

@@ -2,8 +2,6 @@ $biomart_ensembl_gene = ['Ensembl Gene ID', 'ensembl_gene_id']
 $biomart_ensembl_protein = ['Ensembl Protein ID', 'ensembl_peptide_id']
 $biomart_ensembl_exon = ['Ensembl Exon ID', 'ensembl_exon_id']
 $biomart_ensembl_transcript = ['Ensembl Transcript ID', 'ensembl_transcript_id']
-$biomart_somatic_variation_id = ['Variation ID', "somatic_reference_id" ]
-$biomart_germline_variation_id = ['Variation ID', "external_id" ]
 $biomart_gene_positions = [
   ['Chromosome Name','chromosome_name'],
@@ -60,42 +58,6 @@ $biomart_exons = [
   ['Exon Chr End','exon_chrom_end'],
 ]
-#{{{ Variations
-$biomart_germline_variation_positions = [
-  ['Chromosome Location (bp)', "chromosome_location" ],
-  ['SNP Chromosome Strand', "snp_chromosome_strand" ],
-  ['Transcript location (bp)', "transcript_location" ],
-  ['Allele', "allele" ],
-  ['Protein Allele', "peptide_shift" ],
-  ['CDS Start', "cds_start_2076" ],
-  ['CDS End', "cds_end_2076" ],
-]
-$biomart_germline_variations = [
-    $biomart_ensembl_gene,
-    ['Source', "source_name" ],
-    ['Validated', "validated" ],
-    ['Consequence Type', "synonymous_status" ],
-]
-$biomart_somatic_variation_positions = [
-    ['Chromosome Location (bp)' , "somatic_chromosome_location" ] ,
-    ['SNP Chromosome Strand' , "somatic_snp_chromosome_strand" ] ,
-    ['Transcript location (bp)' , "somatic_transcript_location" ] ,
-    ['Allele' , "somatic_allele" ] ,
-    ['Protein Allele' , "somatic_peptide_shift" ] ,
-    ['CDS Start' , "somatic_cds_start_2076" ] ,
-    ['CDS End' , "somatic_cds_end_2076" ] ,
-]
-$biomart_somatic_variations = [
-    $biomart_ensembl_gene,
-    ['Source' , "somatic_source_name" ] ,
-    ['Validated' , "somatic_validated" ] ,
-    ['Consequence Type' , "somatic_synonymous_status" ] ,
-]
 #{{{ Rules
 file 'scientific_name' do |t|
@@ -104,15 +66,69 @@ end
 file 'identifiers' do |t|
   identifiers = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_identifiers, [], nil, :namespace => $namespace)
   $biomart_identifiers.each do |name, key, prefix|
     if prefix
       identifiers.process name do |field, key, values| field.each{|v| v.replace "#{prefix}:#{v}"} end
     end
   end
+  name_pos = identifiers.identify_field "Associated Gene Name"
+  entrez2name = Entrez.entrez2name($taxs)
+  identifiers.process "Entrez Gene ID" do |entrez, ensembl, values|
+    names = values[name_pos]
+    matches = entrez.select do |e|
+      entrez2name.include? e and (names & entrez2name[e]).any?
+    end
+    if matches.any?
+      matches
+    else
+      entrez
+    end
+  end
+  entrez_synonyms = Rbbt.share.databases.entrez.gene_info.tsv :grep => $taxs.collect{|tax| "^#{tax}"}, :key_field => 1, :fields => 4
+  entrez_synonyms.key_field = "Entrez Gene ID"
+  entrez_synonyms.fields = ["Entrez Gene Name Synonyms"]
+  identifiers.attach entrez_synonyms
+  identifiers.each do |key, values|
+    values.each do |list|
+      list.reject!{|v| v.nil? or v.empty?}
+      list.uniq!
+    end
+  end
+  File.open(t.name, 'w') do |f| f.puts identifiers end
+end
+file 'lexicon' => 'identifiers' do |t|
+  tsv = TSV.open(t.prerequisites.first).slice(["Associated Gene Name", "Entrez Gene Name Synonyms"])
+  entrez_description = Rbbt.share.databases.entrez.gene_info.tsv :grep => $taxs.collect{|tax| "^#{tax}"}, :key_field => 1, :fields => 8
+  entrez_description.key_field = "Entrez Gene ID"
+  entrez_description.fields = ["Entrez Gene Description"]
+  tsv.attach entrez_description
+  Open.write(t.name, tsv.to_s)
+end
+file 'protein_identifiers' do |t|
+  identifiers = BioMart.tsv($biomart_db, $biomart_ensembl_protein, $biomart_protein_identifiers, [], nil, :namespace => $namespace)
+  $biomart_protein_identifiers.each do |name, key, prefix|
+    if prefix
+      identifiers.process name do |field, key, values| field.each{|v| v.replace "#{prefix}:#{v}"} end
+    end
+  end
   File.open(t.name, 'w') do |f| f.puts identifiers end
 end
 file 'gene_transcripts' do |t|
   transcripts = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_gene_transcript, [], nil, :type => :flat, :namespace => $namespace)
@@ -121,7 +137,7 @@ end
 file 'transcripts' => 'gene_positions' do |t|
   transcripts = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript, [], nil, :type => :list, :namespace => $namespace)
-  transcripts.attach TSV.new('gene_positions'), "Chromosome Name"
+  transcripts.attach TSV.open('gene_positions'), "Chromosome Name"
   File.open(t.name, 'w') do |f| f.puts transcripts end
 end
@@ -198,7 +214,7 @@ end
 file 'exons' => 'gene_positions' do |t|
   exons = BioMart.tsv($biomart_db, $biomart_ensembl_exon, $biomart_exons, [], nil, :merge => false, :type => :list, :namespace => $namespace)
-  exons.attach TSV.new('gene_positions'), "Chromosome Name"
+  exons.attach TSV.open('gene_positions'), "Chromosome Name"
   File.open(t.name, 'w') do |f| f.puts exons end
 end
@@ -227,28 +243,21 @@ file 'transcript_sequence' do |t|
 end
-$biomart_variation_filter = ["snptype_filters", "COMPLEX_INDEL,COMPLEX_INDEL&NMD_TRANSCRIPT,COMPLEX_INDEL&SPLICE_SITE,ESSENTIAL_SPLICE_SITE&INTRONIC,ESSENTIAL_SPLICE_SITE&INTRONIC&NMD_TRANSCRIPT,FRAMESHIFT_CODING,FRAMESHIFT_CODING&NMD_TRANSCRIPT,FRAMESHIFT_CODING&SPLICE_SITE,FRAMESHIFT_CODING&SPLICE_SITE&NMD_TRANSCRIPT,NON_SYNONYMOUS_CODING,NON_SYNONYMOUS_CODING&NMD_TRANSCRIPT,NON_SYNONYMOUS_CODING&SPLICE_SITE,NON_SYNONYMOUS_CODING&SPLICE_SITE&NMD_TRANSCRIPT,REGULATORY_REGION,SPLICE_SITE&3PRIME_UTR,SPLICE_SITE&3PRIME_UTR&NMD_TRANSCRIPT,SPLICE_SITE&5PRIME_UTR,SPLICE_SITE&5PRIME_UTR&NMD_TRANSCRIPT,SPLICE_SITE&INTRONIC,SPLICE_SITE&INTRONIC&NMD_TRANSCRIPT,SPLICE_SITE&SYNONYMOUS_CODING,SPLICE_SITE&SYNONYMOUS_CODING&NMD_TRANSCRIPT,STOP_GAINED,STOP_GAINED&FRAMESHIFT_CODING,STOP_GAINED&FRAMESHIFT_CODING&NMD_TRANSCRIPT,STOP_GAINED&NMD_TRANSCRIPT,STOP_GAINED&SPLICE_SITE,STOP_GAINED&SPLICE_SITE&NMD_TRANSCRIPT,STOP_LOST,STOP_LOST&NMD_TRANSCRIPT,STOP_LOST&SPLICE_SITE,STOP_LOST&SPLICE_SITE&NMD_TRANSCRIPT,SYNONYMOUS_CODING,SYNONYMOUS_CODING&NMD_TRANSCRIPT"]
-#$biomart_variation_filter = ["snptype_filters", "COMPLEX_INDEL,SYNONYMOUS_CODING"]
-$biomart_variation_filter = ["snptype_filters", 'COMPLEX_INDEL&NMD_TRANSCRIPT']
+#{{{ Variations
-file 'germline_variations' do |t|
-  variations = BioMart.tsv($biomart_db, $biomart_germline_variation_id, $biomart_germline_variations, [], nil, :keep_empty => true, :type => :list, :filename => t.name, :namespace => $namespace)
-end
+$biomart_variation_id = ["SNP ID", "refsnp_id"]
+$biomart_variation_position = [["Chromosome Name", "chr_name"], ["Chromosome Start", "chrom_start"]]
-file 'germline_variation_positions' do |t|
-  variations = BioMart.tsv($biomart_db, $biomart_germline_variation_id, $biomart_germline_variation_positions, [], nil, :keep_empty => true, :type => :list, :filename => t.name, :namespace => $namespace)
-  File.open(t.name, 'w') do |f| f.puts variations.to_s end
+file 'germline_variations' do |t|
+  BioMart.tsv($biomart_db_germline_variation, $biomart_variation_id, $biomart_variation_position, [], nil, :keep_empty => true, :type => :list, :filename => t.name, :namespace => $namespace)
 end
 file 'somatic_variations' do |t|
-  variations = BioMart.tsv($biomart_db, $biomart_somatic_variation_id, $biomart_somatic_variations, [], nil, :keep_empty => true, :type => :list, :filename => t.name, :namespace => $namespace)
-  File.open(t.name, 'w') do |f| f.puts variations.to_s end
+  BioMart.tsv($biomart_db_somatic_variation, $biomart_variation_id, $biomart_variation_position, [], nil, :keep_empty => true, :type => :list, :filename => t.name, :namespace => $namespace)
 end
-file 'somatic_variation_positions' do |t|
-  variations = BioMart.tsv($biomart_db, $biomart_somatic_variation_id, $biomart_somatic_variation_positions, [], nil, :keep_empty => true, :type => :list, :filename => t.name, :namespace => $namespace)
-  File.open(t.name, 'w') do |f| f.puts variations.to_s end
-end
+# {{{ Other info
 file 'gene_pmids' do |t|
   tsv =  Entrez.entrez2pubmed($taxs)
@@ -260,47 +269,95 @@ file 'gene_pmids' do |t|
   Open.write(t.name, text)
 end
-file 'exon_offsets' => %w(exons transcript_exons gene_transcripts transcripts transcript_exons) do |t|
-  require 'rbbt/sources/organism/sequence'
+def coding_transcripts_for_exon(exon, exon_transcripts, transcript_info)
+  transcripts = begin
+                  exon_transcripts[exon].first
+                rescue
+                  []
+                end
-  exons = TSV.new('exons', :persistence => true)
-  exon_transcripts = TSV.new('transcript_exons', :double, :key => "Ensembl Exon ID", :fields => ["Ensembl Transcript ID"], :merge => true, :persistence => true )
-  gene_transcripts = TSV.new('gene_transcripts', :flat, :persistence => true )
-  transcript_info = TSV.new('transcripts', :list, :persistence => true )
-  transcript_exons = TSV.new('transcript_exons', :double, :fields => ["Ensembl Exon ID","Exon Rank in Transcript"], :persistence => true )
+  transcripts.select{|transcript| transcript_info[transcript].first.any?}
+end
+def exon_offset_in_transcript(exon, transcript, exons, transcript_exons)
+  sizes = [0]
+  rank = nil
+  start_pos = exons.identify_field "Exon Chr Start"
+  end_pos = exons.identify_field "Exon Chr End"
+  Misc.zip_fields(transcript_exons[transcript]).each do |_exon, _rank|
+    _rank = _rank.to_i
+    s, e = exons[_exon].values_at(start_pos, end_pos)
+    size = e.to_i - s.to_i + 1
+    sizes[_rank] =  size
+    rank = _rank if _exon == exon
+  end
+  if not rank.nil?
+    sizes[0..rank - 1].inject(0){|e,acc| acc += e}
+  else
+    nil
+  end
+end
+file 'exon_offsets' => %w(exons transcript_exons gene_transcripts transcripts transcript_exons) do |t|
+  exons = TSV.open('exons')
+  exon_transcripts = nil
+  exon_transcripts = TSV.open('transcript_exons', :double, :key_field => "Ensembl Exon ID", :fields => ["Ensembl Transcript ID"], :merge => true)
+  gene_transcripts = TSV.open('gene_transcripts', :flat)
+  transcript_info  = TSV.open('transcripts', :list, :fields => ["Ensembl Protein ID"])
+  transcript_exons = TSV.open('transcript_exons', :double, :fields => ["Ensembl Exon ID","Exon Rank in Transcript"])
   string = "#: :namespace=#{$namespace}"
   string += "#Ensembl Exon ID\tEnsembl Transcript ID\tOffset\n"
-  exons.each do |exon, info|
-    gene, start, finish, strand, chr = info
-    transcripts = Organism::Hsa.coding_transcripts_for_exon(exon, exon_transcripts, transcript_info)
+  exons.unnamed = true
+  exon_transcripts.unnamed = true
+  gene_transcripts.unnamed = true
+  transcript_info.unnamed = true
+  transcript_exons.unnamed = true
+  exons.monitor = true
+  Misc.profile do
+    exons.through do |exon, info|
+      gene, start, finish, strand, chr = info
-    transcript_offsets = {}
-    transcripts.each do |transcript|
-      offset = Organism::Hsa.exon_offset_in_transcript(exon, transcript, exons, transcript_exons)
-      transcript_offsets[transcript] = offset unless offset.nil?
+      transcripts = coding_transcripts_for_exon(exon, exon_transcripts, transcript_info)
+      transcript_offsets = {}
+      transcripts.each do |transcript|
+        offset = exon_offset_in_transcript( exon, transcript, exons, transcript_exons)
+        transcript_offsets[transcript] = offset unless offset.nil?
+      end
+      string << exon << "\t" << transcript_offsets.keys * "|" << "\t" << transcript_offsets.values * "|" << "\n"
     end
-    string << exon << "\t" << transcript_offsets.keys * "|" << "\t" << transcript_offsets.values * "|" << "\n"
   end
   Open.write(t.name, string)
 end
+file 'gene_go' do |t|
+  goterms = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_go, [], nil, :type => :double, :namespace => $namespace)
+  File.open(t.name, 'w') do |f| f.puts goterms end
+end
+file 'gene_pfam' do |t|
+  goterms = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_pfam, [], nil, :type => :double, :namespace => $namespace)
+  File.open(t.name, 'w') do |f| f.puts goterms end
+end
 rule /[a-z]{3}[0-9]{4}\/.*/i do |t|
   t.name =~ /([a-z]{3}[0-9]{4})\/(.*)/i
   archive = $1
   task    = $2
-  old_pwd = FileUtils.pwd
-  begin
-    FileUtils.mkdir archive unless File.exists? archive
-    FileUtils.cd File.join(archive)
+  Misc.in_dir(archive) do
     BioMart.set_archive archive
     Rake::Task[task].invoke
     BioMart.unset_archive
-  ensure
-    FileUtils.cd old_pwd
   end
 end