RubyGems - rbbt-sources - Versions diffs - 1.2.0 → 2.0.0 - Mend

rbbt-sources 1.2.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

data/etc/biomart/missing_in_archive +11 -0
data/lib/rbbt/sources/COSMIC.rb +47 -4
data/lib/rbbt/sources/HPRD.rb +23 -0
data/lib/rbbt/sources/InterPro.rb +98 -8
data/lib/rbbt/sources/NCI.rb +7 -5
data/lib/rbbt/sources/PSI_MI.rb +41 -0
data/lib/rbbt/sources/STITCH.rb +92 -0
data/lib/rbbt/sources/barcode.rb +0 -3
data/lib/rbbt/sources/biomart.rb +3 -3
data/lib/rbbt/sources/dbSNP.rb +100 -0
data/lib/rbbt/sources/ensembl_ftp.rb +79 -0
data/lib/rbbt/sources/entrez.rb +2 -2
data/lib/rbbt/sources/genomes1000.rb +45 -0
data/lib/rbbt/sources/go.rb +16 -4
data/lib/rbbt/sources/organism.rb +80 -12
data/lib/rbbt/sources/pfam.rb +63 -3
data/lib/rbbt/sources/pubmed.rb +10 -3
data/lib/rbbt/sources/reactome.rb +82 -0
data/lib/rbbt/sources/tfacts.rb +37 -36
data/lib/rbbt/sources/uniprot.rb +25 -23
data/share/Ensembl/release_dates +18 -0
data/share/install/Genomes1000/Rakefile +15 -0
data/share/install/JoChem/Rakefile +11 -3
data/share/install/NCI/Rakefile +54 -16
data/share/install/Organism/Hsa/Rakefile +3 -2
data/share/install/Organism/Rno/Rakefile +1 -2
data/share/install/Organism/Sce/Rakefile +43 -45
data/share/install/Organism/organism_helpers.rb +360 -96
data/share/install/STITCH/Rakefile +0 -0
data/test/rbbt/sources/test_organism.rb +26 -7
data/test/rbbt/sources/test_pubmed.rb +5 -0
metadata +94 -97
data/share/install/InterPro/Rakefile +0 -29

data/share/install/Organism/organism_helpers.rb CHANGED Viewed

@@ -1,4 +1,6 @@
 require 'net/ftp'
+require 'rbbt/sources/ensembl_ftp'
 $biomart_ensembl_gene = ['Ensembl Gene ID', 'ensembl_gene_id']
 $biomart_ensembl_protein = ['Ensembl Protein ID', 'ensembl_peptide_id']
@@ -62,6 +64,10 @@ $biomart_pfam= [
   ["Pfam Domain", 'pfam'],
 ]
+$biomart_gene_biotype= [
+  ["Biotype", 'gene_biotype'],
+]
 $biomart_exons = [
   $biomart_ensembl_gene,
   ['Exon Strand','strand'],
@@ -71,6 +77,10 @@ $biomart_exons = [
 #{{{ Rules
+file 'entrez_taxids' do |t|
+  File.open(t.name, 'w') do |f| f.write $taxs * "\n" end
+end
 file 'scientific_name' do |t|
   File.open(t.name, 'w') do |f| f.write $scientific_name end
 end
@@ -108,7 +118,7 @@ file 'identifiers' do |t|
     end
   end
-  entrez_synonyms = Rbbt.share.databases.entrez.gene_info.tsv :grep => $taxs.collect{|tax| "^#{tax}"}, :key_field => 1, :fields => 4
+  entrez_synonyms = Rbbt.share.databases.entrez.gene_info.find.tsv :grep => $taxs.collect{|tax| "^#{tax}"}, :key_field => 1, :fields => [4]
   entrez_synonyms.key_field = "Entrez Gene ID"
   entrez_synonyms.fields = ["Entrez Gene Name Synonyms"]
@@ -174,34 +184,6 @@ file 'transcripts' => 'gene_positions' do |t|
   File.open(t.name, 'w') do |f| f.puts transcripts end
 end
-file 'transcript_3utr' do |t|
-  utrs = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_3utr, [], nil, :type => :flat, :namespace => $namespace)
-  File.open(t.name, 'w') do |f|
-    f.puts "#: :type=:single#cast=to_i"
-    f.puts "#Ensembl Transcript ID\t3' UTR Length"
-    utrs.each do |seq,trans|
-      trans.each do |tran|
-        f.puts [tran, seq.length] * "\t" if seq =~ /^[ACTG]+$/
-      end
-    end
-  end
-end
-file 'transcript_5utr' do |t|
-  utrs = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_5utr, [], nil, :type => :flat, :namespace => $namespace)
-  File.open(t.name, 'w') do |f|
-    f.puts "#: :type=:single#cast=to_i"
-    f.puts "#Ensembl Transcript ID\t5' UTR Length"
-    utrs.each do |seq,trans|
-      trans.each do |tran|
-        f.puts [tran, seq.length] * "\t" if seq =~ /^[ACTG]+$/
-      end
-    end
-  end
-end
 file 'gene_positions' do |t|
   sequences = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_gene_positions, [])
@@ -225,25 +207,6 @@ file 'gene_sequence' do |t|
   end
 end
-file 'protein_sequence' => 'chromosomes' do |t|
-  #chromosomes = TSV.open(t.prerequisites.first).keys
-  #sequences = BioMart.tsv($biomart_db, $biomart_ensembl_protein, $biomart_protein_sequence, [], nil, :type => :flat, :namespace => $namespace, :chunk_filter => ['chromosome_name', chromosomes])
-  sequences = BioMart.tsv($biomart_db, $biomart_ensembl_protein, $biomart_protein_sequence, [], nil, :type => :flat, :namespace => $namespace)
-  File.open(t.name, 'w') do |f|
-    f.puts "#: :type=:single"
-    f.puts "#Ensembl Protein ID\tProtein Sequence"
-    sequences.each do |seq, genes|
-      genes.each do |gene|
-        f.write gene
-        f.write "\t"
-        f.write seq
-        f.write "\n"
-      end
-    end
-  end
-end
 file 'exons' => 'gene_positions' do |t|
   exons = BioMart.tsv($biomart_db, $biomart_ensembl_exon, $biomart_exons, [], nil, :merge => false, :type => :list, :namespace => $namespace)
   exons.attach TSV.open('gene_positions'), :fields => ["Chromosome Name"]
@@ -264,18 +227,6 @@ file 'exon_phase' do |t|
 end
-#file 'transcript_phase' do |t|
-#  tsv = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["Phase"], :type => :single, :cast => :to_i)
-#
-#  transcript_cds_start = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, [['CDNA Start','cds_start']], [], nil, :type => :flat, :namespace => $namespace)
-#  transcript_cds_start.through do |transcript, values|
-#    phase = values.compact.reject{|p| p.empty?}.select{|p| p == "1" or p == "2"}.first
-#    tsv[transcript] = phase.to_i unless phase.nil?
-#  end
-#
-#  File.open(t.name, 'w') do |f| f.puts tsv end
-#end
 file 'transcript_phase' => ['exon_phase', 'transcript_exons'] do |t|
   tsv = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["phase"], :type => :single, :cast => :to_i)
@@ -306,28 +257,10 @@ file 'transcript_phase' => ['exon_phase', 'transcript_exons'] do |t|
 end
-file 'transcript_sequence' do |t|
-  sequences = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_sequence, [], nil, :type => :flat, :namespace => $namespace)
-  File.open(t.name, 'w') do |f|
-    f.puts "#: :type=:single"
-    f.puts "#Ensembl Transcript ID\tTranscript Sequence"
-    sequences.each do |seq, genes|
-      genes.each do |gene|
-        f.write gene
-        f.write "\t"
-        f.write seq
-        f.write "\n"
-      end
-    end
-  end
-end
 #{{{ Variations
 $biomart_variation_id = ["SNP ID", "refsnp_id"]
-$biomart_variation_position = [["Chromosome Name", "chr_name"], ["Chromosome Start", "chrom_start"]]
+$biomart_variation_position = [["Chromosome Name", "chr_name"], ["Chromosome Start", "chrom_start"], ["Variant Alleles", "allele"]]
 file 'germline_variations' do |t|
   BioMart.tsv($biomart_db_germline_variation, $biomart_variation_id, $biomart_variation_position, [], nil, :keep_empty => true, :type => :list, :filename => t.name, :namespace => $namespace)
@@ -357,7 +290,7 @@ def coding_transcripts_for_exon(exon, exon_transcripts, transcript_info)
                   []
                 end
-  transcripts.select{|transcript| transcript_info[transcript].first.any?}
+  transcripts.reject{|transcript| transcript_info[transcript].first.empty?}
 end
 def exon_offset_in_transcript(exon, transcript, exons, transcript_exons)
@@ -420,6 +353,10 @@ file 'gene_go' do |t|
   if File.basename(FileUtils.pwd) =~ /^[a-z]{3}([0-9]{4})$/i and $1.to_i <= 2009
     goterms = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_go_2009, [], nil, :type => :double, :namespace => $namespace)
+    goterms.each do |key, values|
+      values.each do |list| list.uniq! end
+    end
     goterms.add_field "GO ID" do |key, values|
       values.flatten.compact.reject{|go| go.empty?}
     end
@@ -453,11 +390,48 @@ file 'gene_go_bp' => 'gene_go' do |t|
   File.open(t.name, 'w') do |f| f.puts gene_go.slice "GO ID" end
 end
+file 'gene_go_cc' => 'gene_go' do |t|
+  gene_go = TSV.open(t.prerequisites.first)
+  gene_go.monitor = true
+  gene_go.process "GO ID" do |key, go_id, values|
+    clean = values.zip_fields.select do |id, type|
+      type == "cellular_component"
+    end
+    clean.collect{|id, type| id}
+  end
+  File.open(t.name, 'w') do |f| f.puts gene_go.slice "GO ID" end
+end
+file 'gene_go_mf' => 'gene_go' do |t|
+  gene_go = TSV.open(t.prerequisites.first)
+  gene_go.monitor = true
+  gene_go.process "GO ID" do |key, go_id, values|
+    clean = values.zip_fields.select do |id, type|
+      type == "molecular_function"
+    end
+    clean.collect{|id, type| id}
+  end
+  File.open(t.name, 'w') do |f| f.puts gene_go.slice "GO ID" end
+end
+file 'gene_biotype' do |t|
+  biotype = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_gene_biotype, [], nil, :type => :single, :namespace => $namespace)
+  File.open(t.name, 'w') do |f| f.puts biotype end
+end
 file 'gene_pfam' do |t|
-  goterms = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_pfam, [], nil, :type => :double, :namespace => $namespace)
+  pfam = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_pfam, [], nil, :type => :double, :namespace => $namespace)
-  File.open(t.name, 'w') do |f| f.puts goterms end
+  File.open(t.name, 'w') do |f| f.puts pfam end
 end
 file 'chromosomes' do |t|
@@ -471,15 +445,7 @@ rule /^chromosome_.*/ do |t|
   archive = File.basename(FileUtils.pwd) =~ /^([a-z]{3}[0-9]{4})$/i ? $1 : nil
-  release = case archive
-            when "may2009"
-              "release-54"
-            when "jun2011"
-              "release-64"
-            when nil
-              Open.read("http://www.ensembl.org/info/data/ftp/index.html", :nocache => true).match(/pub\/(\w+-\d+)\/fasta/)[1]
-            end
+  release = Ensembl.releases[archive]
   ftp = Net::FTP.new("ftp.ensembl.org")
   ftp.login
@@ -488,13 +454,16 @@ rule /^chromosome_.*/ do |t|
   ftp.chdir('dna')
   file = ftp.nlst.select{|file| file =~ /chromosome\.#{ chr }\.fa/}.first
-  raise "Fasta file for chromosome not found: #{ chr } - #{ archive }, #{ release }" if file.nil?
+  raise "Fasta file for chromosome not found: '#{ chr }' - #{ archive }, #{ release }" if file.nil?
   Log.debug("Downloading chromosome sequence: #{ file }")
-  TmpFile.with_file do |tmpfile|
-    ftp.getbinaryfile(file, tmpfile)
-    Open.write(t.name, Open.read(tmpfile, :gzip => true).sub(/^>.*\n/,'').gsub(/\s/,''))
-    ftp.close
+  Misc.lock t.name + '.rake' do
+    TmpFile.with_file do |tmpfile|
+      ftp.getbinaryfile(file, tmpfile)
+      Open.write(t.name, Open.read(tmpfile, :gzip => true).sub(/^>.*\n/,'').gsub(/\s/,''))
+      ftp.close
+    end
   end
 end
@@ -520,3 +489,298 @@ rule /[a-z]{3}[0-9]{4}\/.*/i do |t|
     BioMart.unset_archive
   end
 end
+#{{{ Special rules
+require 'bio'
+file 'transcript_sequence' => ["exons", "transcript_exons"] do |t|
+  exon_info = TSV.open('exons', :type => :list, :fields => ["Exon Strand", "Exon Chr Start", "Exon Chr End", "Chromosome Name"], :unnamed => true)
+  chr_transcript_ranges ||= {}
+  transcript_strand = {}
+  TSV.open('transcript_exons', :unnamed => true).through do |transcript, values|
+    transcript_ranges = []
+    exons = Misc.zip_fields(values).sort_by{|exon,rank| rank.to_i}.collect{|exon,rank| exon}
+    chr = nil
+    strand = nil
+    exons.each do |exon|
+      strand, start, eend, chr = exon_info[exon]
+      start = start.to_i
+      eend = eend.to_i
+      transcript_ranges << [start, eend]
+    end
+    transcript_strand[transcript] = strand
+    chr_transcript_ranges[chr] ||= {}
+    chr_transcript_ranges[chr][transcript] ||= transcript_ranges
+  end
+  transcript_sequence = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["Sequence"], :type => :single)
+  chr_transcript_ranges.each do |chr, transcript_ranges|
+    begin
+      p = Organism.root
+      p.replace File.expand_path("./chromosome_#{chr}")
+      p.sub!(/.*\/.rbbt\//,'')
+      p = Path.setup(p, 'rbbt', Organism)
+      chr_str = p.produce.read
+    rescue Exception
+      Log.debug("Chr #{ chr } failed (#{transcript_ranges.length} transcripts not covered): #{$!.message}")
+      next
+    end
+    transcript_ranges.each do |transcript, ranges|
+      strand = transcript_strand[transcript]
+      ranges = ranges.reverse if strand == "-1"
+      sequence = ranges.inject(""){|acc, range|
+        start, eend = range
+        raise "Chromosome #{ chr } is too short (#{eend - chr_str.length } bases) for transcript #{ transcript } ([#{ start }, #{ eend }])." if chr_str.length < eend
+        acc << chr_str[start-1..eend-1]
+      }
+      sequence = Bio::Sequence::NA.new(sequence).complement.upcase if strand == "-1"
+      transcript_sequence[transcript] = sequence
+    end
+  end
+  Misc.lock t.name + '.rake' do
+    Open.write(t.name, transcript_sequence.to_s)
+  end
+end
+file 'transcript_5utr' => ["exons", "transcript_exons", "transcripts"] do |t|
+  path = File.expand_path(t.name)
+  dirname = File.dirname(path)
+  organism = File.basename(dirname)
+  if organism =~ /[a-z]{3}20[0-9]{2}/
+    build = organism
+    organism = File.basename(File.dirname(dirname))
+    organism = File.join(organism, build)
+  end
+  translation        = Ensembl::FTP.ensembl_tsv(organism, 'translation', 'transcript_id', %w(seq_start start_exon_id seq_end end_exon_id), :type => :list, :unmamed => true)
+  if Ensembl::FTP.has_table?(organism, 'exon_stable_id')
+    exon2ensembl       = Ensembl::FTP.ensembl_tsv(organism, 'exon_stable_id', 'exon_id', ['stable_id'], :type => :single, :unnamed => true)
+  else
+    exon2ensembl       = Ensembl::FTP.ensembl_tsv(organism, 'exon', 'exon_id', ['stable_id'], :type => :single, :unnamed => true)
+  end
+  if Ensembl::FTP.has_table?(organism, 'exon_stable_id')
+    transcript2ensembl = Ensembl::FTP.ensembl_tsv(organism, 'transcript_stable_id', 'transcript_id', ['stable_id'], :type => :single, :unnamed => true)
+  else
+    transcript2ensembl = Ensembl::FTP.ensembl_tsv(organism, 'transcript', 'transcript_id', ['stable_id'], :type => :single, :unnamed => true)
+  end
+  transcript_protein = TSV.open("./transcripts", :key_field => "Ensembl Transcript ID", :fields => ["Ensembl Protein ID"], :type => :single,  :unmamed => true)
+  transcript_exons   = TSV.open("./transcript_exons", :unmamed => true)
+  exon_ranges        = TSV.open("./exons",:fields => ["Exon Chr Start", "Exon Chr End"], :cast => :to_i, :unmamed => true)
+  transcript_utr5 = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["5' UTR Length"], :cast => :to_i, :type => :single)
+  transcript_utr3 = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["3' UTR Length"], :cast => :to_i, :type => :single)
+  translation.through do |transcript_id, values|
+    start, start_exon, eend, eend_exon = values
+    transcript = transcript2ensembl[transcript_id]
+    protein    = transcript_protein[transcript]
+    start_exon = exon2ensembl[start_exon]
+    eend_exon = exon2ensembl[eend_exon]
+    exon_and_rank = Hash[*Misc.zip_fields(transcript_exons[transcript]).flatten]
+    start_exon_rank = exon_and_rank[start_exon].to_i
+    skipped_exons = exon_and_rank.select{|exon,rank| rank.to_i < start_exon_rank}.collect{|exon,rank| exon }
+    skipped_exon_bases = skipped_exons.inject(0){|acc,exon| exon_start, exon_eend = exon_ranges[exon]; acc += exon_eend - exon_start + 1}
+    utr5 = skipped_exon_bases + start.to_i - 1
+    transcript_utr5[transcript] = utr5
+    eend_exon_rank = exon_and_rank[eend_exon].to_i
+    extra_exons = exon_and_rank.select{|exon,rank| rank.to_i >= eend_exon_rank}.collect{|exon,rank| exon }
+    extra_exon_bases = extra_exons.inject(0){|acc,exon| exon_start, exon_eend = exon_ranges[exon]; acc += exon_eend - exon_start + 1}
+    utr3 = extra_exon_bases - eend.to_i
+    transcript_utr3[transcript] = utr3
+  end
+  Misc.lock t.name + '.rake' do
+    Open.write(t.name, transcript_utr5.to_s)
+    Open.write(t.name.sub('transcript_5utr', 'transcript_3utr'), transcript_utr3.to_s)
+  end
+end
+file 'transcript_3utr' => ["transcript_5utr"] do |t|
+end
+file 'protein_sequence' => ["transcripts", "transcript_5utr", "transcript_3utr", "transcript_sequence"] do |t|
+  transcript_5utr     = TSV.open(File.expand_path('./transcript_5utr'), :unnamed => true)
+  transcript_3utr     = TSV.open(File.expand_path('./transcript_3utr'), :unnamed => true)
+  transcript_sequence = TSV.open(File.expand_path('./transcript_sequence'), :unnamed => true)
+  transcript_protein  = TSV.open(File.expand_path('./transcripts'), :fields => ["Ensembl Protein ID"], :type => :single, :unnamed => true)
+  protein_sequence = TSV.setup({}, :key_field => "Ensembl Protein ID", :fields => ["Sequence"], :type => :single)
+  transcript_sequence.through do |transcript, sequence|
+    protein = transcript_protein[transcript]
+    next if protein.nil? or protein.empty?
+    utr5 = transcript_5utr[transcript]
+    utr3 = transcript_3utr[transcript]
+    psequence = Bio::Sequence::NA.new(sequence[utr5..sequence.length-utr3-1]).translate
+    protein_sequence[protein]=psequence
+  end
+  Misc.lock t.name + '.rake' do
+    Open.write(t.name, protein_sequence.to_s)
+  end
+end
+#{{{ OLD
+#file 'transcript_phase' do |t|
+#  tsv = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["Phase"], :type => :single, :cast => :to_i)
+#
+#  transcript_cds_start = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, [['CDNA Start','cds_start']], [], nil, :type => :flat, :namespace => $namespace)
+#  transcript_cds_start.through do |transcript, values|
+#    phase = values.compact.reject{|p| p.empty?}.select{|p| p == "1" or p == "2"}.first
+#    tsv[transcript] = phase.to_i unless phase.nil?
+#  end
+#
+#  File.open(t.name, 'w') do |f| f.puts tsv end
+#end
+#
+#file 'transcript_3utr' do |t|
+#  utrs = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_3utr, [], nil, :type => :flat, :namespace => $namespace)
+#
+#  File.open(t.name, 'w') do |f|
+#    f.puts "#: :type=:single#cast=to_i"
+#    f.puts "#Ensembl Transcript ID\t3' UTR Length"
+#    utrs.each do |seq,trans|
+#      trans.each do |tran|
+#        f.puts [tran, seq.length] * "\t" if seq =~ /^[ACTG]+$/
+#      end
+#    end
+#  end
+#end
+#
+#file 'transcript_5utr' do |t|
+#  utrs = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_5utr, [], nil, :type => :flat, :namespace => $namespace)
+#
+#  File.open(t.name, 'w') do |f|
+#    f.puts "#: :type=:single#cast=to_i"
+#    f.puts "#Ensembl Transcript ID\t5' UTR Length"
+#    utrs.each do |seq,trans|
+#      trans.each do |tran|
+#        f.puts [tran, seq.length] * "\t" if seq =~ /^[ACTG]+$/
+#      end
+#    end
+#  end
+#end
+#file 'transcript_sequence' do |t|
+#  sequences = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_sequence, [], nil, :type => :flat, :namespace => $namespace)
+#
+#  File.open(t.name, 'w') do |f|
+#    f.puts "#: :type=:single"
+#    f.puts "#Ensembl Transcript ID\tTranscript Sequence"
+#    sequences.each do |seq, genes|
+#      genes.each do |gene|
+#        f.write gene
+#        f.write "\t"
+#        f.write seq
+#        f.write "\n"
+#      end
+#    end
+#  end
+#end
+#file 'transcript_phase' do |t|
+#  tsv = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["Phase"], :type => :single, :cast => :to_i)
+#
+#  transcript_cds_start = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, [['CDNA Start','cds_start']], [], nil, :type => :flat, :namespace => $namespace)
+#  transcript_cds_start.through do |transcript, values|
+#    phase = values.compact.reject{|p| p.empty?}.select{|p| p == "1" or p == "2"}.first
+#    tsv[transcript] = phase.to_i unless phase.nil?
+#  end
+#
+#  File.open(t.name, 'w') do |f| f.puts tsv end
+#end
+#
+#file 'transcript_3utr' do |t|
+#  utrs = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_3utr, [], nil, :type => :flat, :namespace => $namespace)
+#
+#  File.open(t.name, 'w') do |f|
+#    f.puts "#: :type=:single#cast=to_i"
+#    f.puts "#Ensembl Transcript ID\t3' UTR Length"
+#    utrs.each do |seq,trans|
+#      trans.each do |tran|
+#        f.puts [tran, seq.length] * "\t" if seq =~ /^[ACTG]+$/
+#      end
+#    end
+#  end
+#end
+#
+#file 'transcript_5utr' do |t|
+#  utrs = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_5utr, [], nil, :type => :flat, :namespace => $namespace)
+#
+#  File.open(t.name, 'w') do |f|
+#    f.puts "#: :type=:single#cast=to_i"
+#    f.puts "#Ensembl Transcript ID\t5' UTR Length"
+#    utrs.each do |seq,trans|
+#      trans.each do |tran|
+#        f.puts [tran, seq.length] * "\t" if seq =~ /^[ACTG]+$/
+#      end
+#    end
+#  end
+#end
+#file 'transcript_sequence' do |t|
+#  sequences = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_sequence, [], nil, :type => :flat, :namespace => $namespace)
+#
+#  File.open(t.name, 'w') do |f|
+#    f.puts "#: :type=:single"
+#    f.puts "#Ensembl Transcript ID\tTranscript Sequence"
+#    sequences.each do |seq, genes|
+#      genes.each do |gene|
+#        f.write gene
+#        f.write "\t"
+#        f.write seq
+#        f.write "\n"
+#      end
+#    end
+#  end
+#end
+#file 'protein_sequence' => 'chromosomes' do |t|
+#  #chromosomes = TSV.open(t.prerequisites.first).keys
+#  #sequences = BioMart.tsv($biomart_db, $biomart_ensembl_protein, $biomart_protein_sequence, [], nil, :type => :flat, :namespace => $namespace, :chunk_filter => ['chromosome_name', chromosomes])
+#  sequences = BioMart.tsv($biomart_db, $biomart_ensembl_protein, $biomart_protein_sequence, [], nil, :type => :flat, :namespace => $namespace)
+#
+#  File.open(t.name, 'w') do |f|
+#    f.puts "#: :type=:single"
+#    f.puts "#Ensembl Protein ID\tProtein Sequence"
+#    sequences.each do |seq, genes|
+#      genes.each do |gene|
+#        f.write gene
+#        f.write "\t"
+#        f.write seq
+#        f.write "\n"
+#      end
+#    end
+#  end
+#end