rbbt-sources 1.2.0 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,4 +1,6 @@
1
1
  require 'net/ftp'
2
+ require 'rbbt/sources/ensembl_ftp'
3
+
2
4
 
3
5
  $biomart_ensembl_gene = ['Ensembl Gene ID', 'ensembl_gene_id']
4
6
  $biomart_ensembl_protein = ['Ensembl Protein ID', 'ensembl_peptide_id']
@@ -62,6 +64,10 @@ $biomart_pfam= [
62
64
  ["Pfam Domain", 'pfam'],
63
65
  ]
64
66
 
67
+ $biomart_gene_biotype= [
68
+ ["Biotype", 'gene_biotype'],
69
+ ]
70
+
65
71
  $biomart_exons = [
66
72
  $biomart_ensembl_gene,
67
73
  ['Exon Strand','strand'],
@@ -71,6 +77,10 @@ $biomart_exons = [
71
77
 
72
78
  #{{{ Rules
73
79
 
80
+ file 'entrez_taxids' do |t|
81
+ File.open(t.name, 'w') do |f| f.write $taxs * "\n" end
82
+ end
83
+
74
84
  file 'scientific_name' do |t|
75
85
  File.open(t.name, 'w') do |f| f.write $scientific_name end
76
86
  end
@@ -108,7 +118,7 @@ file 'identifiers' do |t|
108
118
  end
109
119
  end
110
120
 
111
- entrez_synonyms = Rbbt.share.databases.entrez.gene_info.tsv :grep => $taxs.collect{|tax| "^#{tax}"}, :key_field => 1, :fields => 4
121
+ entrez_synonyms = Rbbt.share.databases.entrez.gene_info.find.tsv :grep => $taxs.collect{|tax| "^#{tax}"}, :key_field => 1, :fields => [4]
112
122
  entrez_synonyms.key_field = "Entrez Gene ID"
113
123
  entrez_synonyms.fields = ["Entrez Gene Name Synonyms"]
114
124
 
@@ -174,34 +184,6 @@ file 'transcripts' => 'gene_positions' do |t|
174
184
  File.open(t.name, 'w') do |f| f.puts transcripts end
175
185
  end
176
186
 
177
- file 'transcript_3utr' do |t|
178
- utrs = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_3utr, [], nil, :type => :flat, :namespace => $namespace)
179
-
180
- File.open(t.name, 'w') do |f|
181
- f.puts "#: :type=:single#cast=to_i"
182
- f.puts "#Ensembl Transcript ID\t3' UTR Length"
183
- utrs.each do |seq,trans|
184
- trans.each do |tran|
185
- f.puts [tran, seq.length] * "\t" if seq =~ /^[ACTG]+$/
186
- end
187
- end
188
- end
189
- end
190
-
191
- file 'transcript_5utr' do |t|
192
- utrs = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_5utr, [], nil, :type => :flat, :namespace => $namespace)
193
-
194
- File.open(t.name, 'w') do |f|
195
- f.puts "#: :type=:single#cast=to_i"
196
- f.puts "#Ensembl Transcript ID\t5' UTR Length"
197
- utrs.each do |seq,trans|
198
- trans.each do |tran|
199
- f.puts [tran, seq.length] * "\t" if seq =~ /^[ACTG]+$/
200
- end
201
- end
202
- end
203
- end
204
-
205
187
  file 'gene_positions' do |t|
206
188
  sequences = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_gene_positions, [])
207
189
 
@@ -225,25 +207,6 @@ file 'gene_sequence' do |t|
225
207
  end
226
208
  end
227
209
 
228
- file 'protein_sequence' => 'chromosomes' do |t|
229
- #chromosomes = TSV.open(t.prerequisites.first).keys
230
- #sequences = BioMart.tsv($biomart_db, $biomart_ensembl_protein, $biomart_protein_sequence, [], nil, :type => :flat, :namespace => $namespace, :chunk_filter => ['chromosome_name', chromosomes])
231
- sequences = BioMart.tsv($biomart_db, $biomart_ensembl_protein, $biomart_protein_sequence, [], nil, :type => :flat, :namespace => $namespace)
232
-
233
- File.open(t.name, 'w') do |f|
234
- f.puts "#: :type=:single"
235
- f.puts "#Ensembl Protein ID\tProtein Sequence"
236
- sequences.each do |seq, genes|
237
- genes.each do |gene|
238
- f.write gene
239
- f.write "\t"
240
- f.write seq
241
- f.write "\n"
242
- end
243
- end
244
- end
245
- end
246
-
247
210
  file 'exons' => 'gene_positions' do |t|
248
211
  exons = BioMart.tsv($biomart_db, $biomart_ensembl_exon, $biomart_exons, [], nil, :merge => false, :type => :list, :namespace => $namespace)
249
212
  exons.attach TSV.open('gene_positions'), :fields => ["Chromosome Name"]
@@ -264,18 +227,6 @@ file 'exon_phase' do |t|
264
227
  end
265
228
 
266
229
 
267
- #file 'transcript_phase' do |t|
268
- # tsv = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["Phase"], :type => :single, :cast => :to_i)
269
- #
270
- # transcript_cds_start = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, [['CDNA Start','cds_start']], [], nil, :type => :flat, :namespace => $namespace)
271
- # transcript_cds_start.through do |transcript, values|
272
- # phase = values.compact.reject{|p| p.empty?}.select{|p| p == "1" or p == "2"}.first
273
- # tsv[transcript] = phase.to_i unless phase.nil?
274
- # end
275
- #
276
- # File.open(t.name, 'w') do |f| f.puts tsv end
277
- #end
278
-
279
230
  file 'transcript_phase' => ['exon_phase', 'transcript_exons'] do |t|
280
231
  tsv = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["phase"], :type => :single, :cast => :to_i)
281
232
 
@@ -306,28 +257,10 @@ file 'transcript_phase' => ['exon_phase', 'transcript_exons'] do |t|
306
257
  end
307
258
 
308
259
 
309
- file 'transcript_sequence' do |t|
310
- sequences = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_sequence, [], nil, :type => :flat, :namespace => $namespace)
311
-
312
- File.open(t.name, 'w') do |f|
313
- f.puts "#: :type=:single"
314
- f.puts "#Ensembl Transcript ID\tTranscript Sequence"
315
- sequences.each do |seq, genes|
316
- genes.each do |gene|
317
- f.write gene
318
- f.write "\t"
319
- f.write seq
320
- f.write "\n"
321
- end
322
- end
323
- end
324
- end
325
-
326
-
327
260
  #{{{ Variations
328
261
 
329
262
  $biomart_variation_id = ["SNP ID", "refsnp_id"]
330
- $biomart_variation_position = [["Chromosome Name", "chr_name"], ["Chromosome Start", "chrom_start"]]
263
+ $biomart_variation_position = [["Chromosome Name", "chr_name"], ["Chromosome Start", "chrom_start"], ["Variant Alleles", "allele"]]
331
264
 
332
265
  file 'germline_variations' do |t|
333
266
  BioMart.tsv($biomart_db_germline_variation, $biomart_variation_id, $biomart_variation_position, [], nil, :keep_empty => true, :type => :list, :filename => t.name, :namespace => $namespace)
@@ -357,7 +290,7 @@ def coding_transcripts_for_exon(exon, exon_transcripts, transcript_info)
357
290
  []
358
291
  end
359
292
 
360
- transcripts.select{|transcript| transcript_info[transcript].first.any?}
293
+ transcripts.reject{|transcript| transcript_info[transcript].first.empty?}
361
294
  end
362
295
 
363
296
  def exon_offset_in_transcript(exon, transcript, exons, transcript_exons)
@@ -420,6 +353,10 @@ file 'gene_go' do |t|
420
353
  if File.basename(FileUtils.pwd) =~ /^[a-z]{3}([0-9]{4})$/i and $1.to_i <= 2009
421
354
  goterms = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_go_2009, [], nil, :type => :double, :namespace => $namespace)
422
355
 
356
+ goterms.each do |key, values|
357
+ values.each do |list| list.uniq! end
358
+ end
359
+
423
360
  goterms.add_field "GO ID" do |key, values|
424
361
  values.flatten.compact.reject{|go| go.empty?}
425
362
  end
@@ -453,11 +390,48 @@ file 'gene_go_bp' => 'gene_go' do |t|
453
390
  File.open(t.name, 'w') do |f| f.puts gene_go.slice "GO ID" end
454
391
  end
455
392
 
393
+ file 'gene_go_cc' => 'gene_go' do |t|
394
+ gene_go = TSV.open(t.prerequisites.first)
395
+
396
+ gene_go.monitor = true
397
+ gene_go.process "GO ID" do |key, go_id, values|
398
+ clean = values.zip_fields.select do |id, type|
399
+ type == "cellular_component"
400
+ end
401
+ clean.collect{|id, type| id}
402
+ end
403
+
404
+
405
+ File.open(t.name, 'w') do |f| f.puts gene_go.slice "GO ID" end
406
+ end
407
+
408
+ file 'gene_go_mf' => 'gene_go' do |t|
409
+ gene_go = TSV.open(t.prerequisites.first)
410
+
411
+ gene_go.monitor = true
412
+ gene_go.process "GO ID" do |key, go_id, values|
413
+ clean = values.zip_fields.select do |id, type|
414
+ type == "molecular_function"
415
+ end
416
+ clean.collect{|id, type| id}
417
+ end
418
+
419
+
420
+ File.open(t.name, 'w') do |f| f.puts gene_go.slice "GO ID" end
421
+ end
422
+
423
+
424
+
425
+ file 'gene_biotype' do |t|
426
+ biotype = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_gene_biotype, [], nil, :type => :single, :namespace => $namespace)
427
+
428
+ File.open(t.name, 'w') do |f| f.puts biotype end
429
+ end
456
430
 
457
431
  file 'gene_pfam' do |t|
458
- goterms = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_pfam, [], nil, :type => :double, :namespace => $namespace)
432
+ pfam = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_pfam, [], nil, :type => :double, :namespace => $namespace)
459
433
 
460
- File.open(t.name, 'w') do |f| f.puts goterms end
434
+ File.open(t.name, 'w') do |f| f.puts pfam end
461
435
  end
462
436
 
463
437
  file 'chromosomes' do |t|
@@ -471,15 +445,7 @@ rule /^chromosome_.*/ do |t|
471
445
 
472
446
  archive = File.basename(FileUtils.pwd) =~ /^([a-z]{3}[0-9]{4})$/i ? $1 : nil
473
447
 
474
- release = case archive
475
- when "may2009"
476
- "release-54"
477
- when "jun2011"
478
- "release-64"
479
- when nil
480
- Open.read("http://www.ensembl.org/info/data/ftp/index.html", :nocache => true).match(/pub\/(\w+-\d+)\/fasta/)[1]
481
- end
482
-
448
+ release = Ensembl.releases[archive]
483
449
 
484
450
  ftp = Net::FTP.new("ftp.ensembl.org")
485
451
  ftp.login
@@ -488,13 +454,16 @@ rule /^chromosome_.*/ do |t|
488
454
  ftp.chdir('dna')
489
455
  file = ftp.nlst.select{|file| file =~ /chromosome\.#{ chr }\.fa/}.first
490
456
 
491
- raise "Fasta file for chromosome not found: #{ chr } - #{ archive }, #{ release }" if file.nil?
457
+ raise "Fasta file for chromosome not found: '#{ chr }' - #{ archive }, #{ release }" if file.nil?
492
458
 
493
459
  Log.debug("Downloading chromosome sequence: #{ file }")
494
- TmpFile.with_file do |tmpfile|
495
- ftp.getbinaryfile(file, tmpfile)
496
- Open.write(t.name, Open.read(tmpfile, :gzip => true).sub(/^>.*\n/,'').gsub(/\s/,''))
497
- ftp.close
460
+
461
+ Misc.lock t.name + '.rake' do
462
+ TmpFile.with_file do |tmpfile|
463
+ ftp.getbinaryfile(file, tmpfile)
464
+ Open.write(t.name, Open.read(tmpfile, :gzip => true).sub(/^>.*\n/,'').gsub(/\s/,''))
465
+ ftp.close
466
+ end
498
467
  end
499
468
  end
500
469
 
@@ -520,3 +489,298 @@ rule /[a-z]{3}[0-9]{4}\/.*/i do |t|
520
489
  BioMart.unset_archive
521
490
  end
522
491
  end
492
+
493
+
494
+
495
+
496
+
497
+ #{{{ Special rules
498
+ require 'bio'
499
+
500
+ file 'transcript_sequence' => ["exons", "transcript_exons"] do |t|
501
+ exon_info = TSV.open('exons', :type => :list, :fields => ["Exon Strand", "Exon Chr Start", "Exon Chr End", "Chromosome Name"], :unnamed => true)
502
+
503
+ chr_transcript_ranges ||= {}
504
+ transcript_strand = {}
505
+
506
+ TSV.open('transcript_exons', :unnamed => true).through do |transcript, values|
507
+ transcript_ranges = []
508
+
509
+ exons = Misc.zip_fields(values).sort_by{|exon,rank| rank.to_i}.collect{|exon,rank| exon}
510
+
511
+ chr = nil
512
+ strand = nil
513
+ exons.each do |exon|
514
+ strand, start, eend, chr = exon_info[exon]
515
+ start = start.to_i
516
+ eend = eend.to_i
517
+ transcript_ranges << [start, eend]
518
+ end
519
+
520
+ transcript_strand[transcript] = strand
521
+
522
+ chr_transcript_ranges[chr] ||= {}
523
+ chr_transcript_ranges[chr][transcript] ||= transcript_ranges
524
+ end
525
+
526
+ transcript_sequence = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["Sequence"], :type => :single)
527
+ chr_transcript_ranges.each do |chr, transcript_ranges|
528
+
529
+ begin
530
+ p = Organism.root
531
+ p.replace File.expand_path("./chromosome_#{chr}")
532
+ p.sub!(/.*\/.rbbt\//,'')
533
+ p = Path.setup(p, 'rbbt', Organism)
534
+ chr_str = p.produce.read
535
+ rescue Exception
536
+ Log.debug("Chr #{ chr } failed (#{transcript_ranges.length} transcripts not covered): #{$!.message}")
537
+ next
538
+ end
539
+
540
+ transcript_ranges.each do |transcript, ranges|
541
+ strand = transcript_strand[transcript]
542
+ ranges = ranges.reverse if strand == "-1"
543
+
544
+ sequence = ranges.inject(""){|acc, range|
545
+ start, eend = range
546
+ raise "Chromosome #{ chr } is too short (#{eend - chr_str.length } bases) for transcript #{ transcript } ([#{ start }, #{ eend }])." if chr_str.length < eend
547
+ acc << chr_str[start-1..eend-1]
548
+ }
549
+
550
+ sequence = Bio::Sequence::NA.new(sequence).complement.upcase if strand == "-1"
551
+ transcript_sequence[transcript] = sequence
552
+ end
553
+ end
554
+
555
+ Misc.lock t.name + '.rake' do
556
+ Open.write(t.name, transcript_sequence.to_s)
557
+ end
558
+ end
559
+
560
+ file 'transcript_5utr' => ["exons", "transcript_exons", "transcripts"] do |t|
561
+ path = File.expand_path(t.name)
562
+ dirname = File.dirname(path)
563
+ organism = File.basename(dirname)
564
+
565
+ if organism =~ /[a-z]{3}20[0-9]{2}/
566
+ build = organism
567
+ organism = File.basename(File.dirname(dirname))
568
+ organism = File.join(organism, build)
569
+ end
570
+
571
+ translation = Ensembl::FTP.ensembl_tsv(organism, 'translation', 'transcript_id', %w(seq_start start_exon_id seq_end end_exon_id), :type => :list, :unmamed => true)
572
+
573
+ if Ensembl::FTP.has_table?(organism, 'exon_stable_id')
574
+ exon2ensembl = Ensembl::FTP.ensembl_tsv(organism, 'exon_stable_id', 'exon_id', ['stable_id'], :type => :single, :unnamed => true)
575
+ else
576
+ exon2ensembl = Ensembl::FTP.ensembl_tsv(organism, 'exon', 'exon_id', ['stable_id'], :type => :single, :unnamed => true)
577
+ end
578
+
579
+ if Ensembl::FTP.has_table?(organism, 'exon_stable_id')
580
+ transcript2ensembl = Ensembl::FTP.ensembl_tsv(organism, 'transcript_stable_id', 'transcript_id', ['stable_id'], :type => :single, :unnamed => true)
581
+ else
582
+ transcript2ensembl = Ensembl::FTP.ensembl_tsv(organism, 'transcript', 'transcript_id', ['stable_id'], :type => :single, :unnamed => true)
583
+ end
584
+
585
+ transcript_protein = TSV.open("./transcripts", :key_field => "Ensembl Transcript ID", :fields => ["Ensembl Protein ID"], :type => :single, :unmamed => true)
586
+ transcript_exons = TSV.open("./transcript_exons", :unmamed => true)
587
+ exon_ranges = TSV.open("./exons",:fields => ["Exon Chr Start", "Exon Chr End"], :cast => :to_i, :unmamed => true)
588
+
589
+ transcript_utr5 = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["5' UTR Length"], :cast => :to_i, :type => :single)
590
+ transcript_utr3 = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["3' UTR Length"], :cast => :to_i, :type => :single)
591
+
592
+ translation.through do |transcript_id, values|
593
+ start, start_exon, eend, eend_exon = values
594
+
595
+ transcript = transcript2ensembl[transcript_id]
596
+ protein = transcript_protein[transcript]
597
+
598
+ start_exon = exon2ensembl[start_exon]
599
+ eend_exon = exon2ensembl[eend_exon]
600
+
601
+ exon_and_rank = Hash[*Misc.zip_fields(transcript_exons[transcript]).flatten]
602
+
603
+ start_exon_rank = exon_and_rank[start_exon].to_i
604
+ skipped_exons = exon_and_rank.select{|exon,rank| rank.to_i < start_exon_rank}.collect{|exon,rank| exon }
605
+ skipped_exon_bases = skipped_exons.inject(0){|acc,exon| exon_start, exon_eend = exon_ranges[exon]; acc += exon_eend - exon_start + 1}
606
+
607
+ utr5 = skipped_exon_bases + start.to_i - 1
608
+ transcript_utr5[transcript] = utr5
609
+
610
+ eend_exon_rank = exon_and_rank[eend_exon].to_i
611
+ extra_exons = exon_and_rank.select{|exon,rank| rank.to_i >= eend_exon_rank}.collect{|exon,rank| exon }
612
+ extra_exon_bases = extra_exons.inject(0){|acc,exon| exon_start, exon_eend = exon_ranges[exon]; acc += exon_eend - exon_start + 1}
613
+
614
+ utr3 = extra_exon_bases - eend.to_i
615
+ transcript_utr3[transcript] = utr3
616
+ end
617
+
618
+ Misc.lock t.name + '.rake' do
619
+ Open.write(t.name, transcript_utr5.to_s)
620
+ Open.write(t.name.sub('transcript_5utr', 'transcript_3utr'), transcript_utr3.to_s)
621
+ end
622
+ end
623
+
624
+ file 'transcript_3utr' => ["transcript_5utr"] do |t|
625
+ end
626
+
627
+ file 'protein_sequence' => ["transcripts", "transcript_5utr", "transcript_3utr", "transcript_sequence"] do |t|
628
+ transcript_5utr = TSV.open(File.expand_path('./transcript_5utr'), :unnamed => true)
629
+ transcript_3utr = TSV.open(File.expand_path('./transcript_3utr'), :unnamed => true)
630
+ transcript_sequence = TSV.open(File.expand_path('./transcript_sequence'), :unnamed => true)
631
+ transcript_protein = TSV.open(File.expand_path('./transcripts'), :fields => ["Ensembl Protein ID"], :type => :single, :unnamed => true)
632
+
633
+
634
+ protein_sequence = TSV.setup({}, :key_field => "Ensembl Protein ID", :fields => ["Sequence"], :type => :single)
635
+ transcript_sequence.through do |transcript, sequence|
636
+ protein = transcript_protein[transcript]
637
+ next if protein.nil? or protein.empty?
638
+ utr5 = transcript_5utr[transcript]
639
+ utr3 = transcript_3utr[transcript]
640
+ psequence = Bio::Sequence::NA.new(sequence[utr5..sequence.length-utr3-1]).translate
641
+ protein_sequence[protein]=psequence
642
+ end
643
+
644
+ Misc.lock t.name + '.rake' do
645
+ Open.write(t.name, protein_sequence.to_s)
646
+ end
647
+ end
648
+
649
+ #{{{ OLD
650
+
651
+ #file 'transcript_phase' do |t|
652
+ # tsv = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["Phase"], :type => :single, :cast => :to_i)
653
+ #
654
+ # transcript_cds_start = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, [['CDNA Start','cds_start']], [], nil, :type => :flat, :namespace => $namespace)
655
+ # transcript_cds_start.through do |transcript, values|
656
+ # phase = values.compact.reject{|p| p.empty?}.select{|p| p == "1" or p == "2"}.first
657
+ # tsv[transcript] = phase.to_i unless phase.nil?
658
+ # end
659
+ #
660
+ # File.open(t.name, 'w') do |f| f.puts tsv end
661
+ #end
662
+ #
663
+ #file 'transcript_3utr' do |t|
664
+ # utrs = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_3utr, [], nil, :type => :flat, :namespace => $namespace)
665
+ #
666
+ # File.open(t.name, 'w') do |f|
667
+ # f.puts "#: :type=:single#cast=to_i"
668
+ # f.puts "#Ensembl Transcript ID\t3' UTR Length"
669
+ # utrs.each do |seq,trans|
670
+ # trans.each do |tran|
671
+ # f.puts [tran, seq.length] * "\t" if seq =~ /^[ACTG]+$/
672
+ # end
673
+ # end
674
+ # end
675
+ #end
676
+ #
677
+ #file 'transcript_5utr' do |t|
678
+ # utrs = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_5utr, [], nil, :type => :flat, :namespace => $namespace)
679
+ #
680
+ # File.open(t.name, 'w') do |f|
681
+ # f.puts "#: :type=:single#cast=to_i"
682
+ # f.puts "#Ensembl Transcript ID\t5' UTR Length"
683
+ # utrs.each do |seq,trans|
684
+ # trans.each do |tran|
685
+ # f.puts [tran, seq.length] * "\t" if seq =~ /^[ACTG]+$/
686
+ # end
687
+ # end
688
+ # end
689
+ #end
690
+
691
+
692
+
693
+ #file 'transcript_sequence' do |t|
694
+ # sequences = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_sequence, [], nil, :type => :flat, :namespace => $namespace)
695
+ #
696
+ # File.open(t.name, 'w') do |f|
697
+ # f.puts "#: :type=:single"
698
+ # f.puts "#Ensembl Transcript ID\tTranscript Sequence"
699
+ # sequences.each do |seq, genes|
700
+ # genes.each do |gene|
701
+ # f.write gene
702
+ # f.write "\t"
703
+ # f.write seq
704
+ # f.write "\n"
705
+ # end
706
+ # end
707
+ # end
708
+ #end
709
+
710
+ #file 'transcript_phase' do |t|
711
+ # tsv = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["Phase"], :type => :single, :cast => :to_i)
712
+ #
713
+ # transcript_cds_start = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, [['CDNA Start','cds_start']], [], nil, :type => :flat, :namespace => $namespace)
714
+ # transcript_cds_start.through do |transcript, values|
715
+ # phase = values.compact.reject{|p| p.empty?}.select{|p| p == "1" or p == "2"}.first
716
+ # tsv[transcript] = phase.to_i unless phase.nil?
717
+ # end
718
+ #
719
+ # File.open(t.name, 'w') do |f| f.puts tsv end
720
+ #end
721
+ #
722
+ #file 'transcript_3utr' do |t|
723
+ # utrs = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_3utr, [], nil, :type => :flat, :namespace => $namespace)
724
+ #
725
+ # File.open(t.name, 'w') do |f|
726
+ # f.puts "#: :type=:single#cast=to_i"
727
+ # f.puts "#Ensembl Transcript ID\t3' UTR Length"
728
+ # utrs.each do |seq,trans|
729
+ # trans.each do |tran|
730
+ # f.puts [tran, seq.length] * "\t" if seq =~ /^[ACTG]+$/
731
+ # end
732
+ # end
733
+ # end
734
+ #end
735
+ #
736
+ #file 'transcript_5utr' do |t|
737
+ # utrs = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_5utr, [], nil, :type => :flat, :namespace => $namespace)
738
+ #
739
+ # File.open(t.name, 'w') do |f|
740
+ # f.puts "#: :type=:single#cast=to_i"
741
+ # f.puts "#Ensembl Transcript ID\t5' UTR Length"
742
+ # utrs.each do |seq,trans|
743
+ # trans.each do |tran|
744
+ # f.puts [tran, seq.length] * "\t" if seq =~ /^[ACTG]+$/
745
+ # end
746
+ # end
747
+ # end
748
+ #end
749
+
750
+ #file 'transcript_sequence' do |t|
751
+ # sequences = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_sequence, [], nil, :type => :flat, :namespace => $namespace)
752
+ #
753
+ # File.open(t.name, 'w') do |f|
754
+ # f.puts "#: :type=:single"
755
+ # f.puts "#Ensembl Transcript ID\tTranscript Sequence"
756
+ # sequences.each do |seq, genes|
757
+ # genes.each do |gene|
758
+ # f.write gene
759
+ # f.write "\t"
760
+ # f.write seq
761
+ # f.write "\n"
762
+ # end
763
+ # end
764
+ # end
765
+ #end
766
+ #file 'protein_sequence' => 'chromosomes' do |t|
767
+ # #chromosomes = TSV.open(t.prerequisites.first).keys
768
+ # #sequences = BioMart.tsv($biomart_db, $biomart_ensembl_protein, $biomart_protein_sequence, [], nil, :type => :flat, :namespace => $namespace, :chunk_filter => ['chromosome_name', chromosomes])
769
+ # sequences = BioMart.tsv($biomart_db, $biomart_ensembl_protein, $biomart_protein_sequence, [], nil, :type => :flat, :namespace => $namespace)
770
+ #
771
+ # File.open(t.name, 'w') do |f|
772
+ # f.puts "#: :type=:single"
773
+ # f.puts "#Ensembl Protein ID\tProtein Sequence"
774
+ # sequences.each do |seq, genes|
775
+ # genes.each do |gene|
776
+ # f.write gene
777
+ # f.write "\t"
778
+ # f.write seq
779
+ # f.write "\n"
780
+ # end
781
+ # end
782
+ # end
783
+ #end
784
+
785
+
786
+