rbbt-sources 1.2.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,6 @@
1
1
  require 'net/ftp'
2
+ require 'rbbt/sources/ensembl_ftp'
3
+
2
4
 
3
5
  $biomart_ensembl_gene = ['Ensembl Gene ID', 'ensembl_gene_id']
4
6
  $biomart_ensembl_protein = ['Ensembl Protein ID', 'ensembl_peptide_id']
@@ -62,6 +64,10 @@ $biomart_pfam= [
62
64
  ["Pfam Domain", 'pfam'],
63
65
  ]
64
66
 
67
+ $biomart_gene_biotype= [
68
+ ["Biotype", 'gene_biotype'],
69
+ ]
70
+
65
71
  $biomart_exons = [
66
72
  $biomart_ensembl_gene,
67
73
  ['Exon Strand','strand'],
@@ -71,6 +77,10 @@ $biomart_exons = [
71
77
 
72
78
  #{{{ Rules
73
79
 
80
+ file 'entrez_taxids' do |t| # Writes the contents of $taxs to the task file
81
+ File.open(t.name, 'w') do |f| f.write $taxs * "\n" end # presumably $taxs is an Array, so this joins its entries one per line — confirm
82
+ end
83
+
74
84
  file 'scientific_name' do |t|
75
85
  File.open(t.name, 'w') do |f| f.write $scientific_name end
76
86
  end
@@ -108,7 +118,7 @@ file 'identifiers' do |t|
108
118
  end
109
119
  end
110
120
 
111
- entrez_synonyms = Rbbt.share.databases.entrez.gene_info.tsv :grep => $taxs.collect{|tax| "^#{tax}"}, :key_field => 1, :fields => 4
121
+ entrez_synonyms = Rbbt.share.databases.entrez.gene_info.find.tsv :grep => $taxs.collect{|tax| "^#{tax}"}, :key_field => 1, :fields => [4]
112
122
  entrez_synonyms.key_field = "Entrez Gene ID"
113
123
  entrez_synonyms.fields = ["Entrez Gene Name Synonyms"]
114
124
 
@@ -174,34 +184,6 @@ file 'transcripts' => 'gene_positions' do |t|
174
184
  File.open(t.name, 'w') do |f| f.puts transcripts end
175
185
  end
176
186
 
177
- file 'transcript_3utr' do |t|
178
- utrs = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_3utr, [], nil, :type => :flat, :namespace => $namespace)
179
-
180
- File.open(t.name, 'w') do |f|
181
- f.puts "#: :type=:single#cast=to_i"
182
- f.puts "#Ensembl Transcript ID\t3' UTR Length"
183
- utrs.each do |seq,trans|
184
- trans.each do |tran|
185
- f.puts [tran, seq.length] * "\t" if seq =~ /^[ACTG]+$/
186
- end
187
- end
188
- end
189
- end
190
-
191
- file 'transcript_5utr' do |t|
192
- utrs = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_5utr, [], nil, :type => :flat, :namespace => $namespace)
193
-
194
- File.open(t.name, 'w') do |f|
195
- f.puts "#: :type=:single#cast=to_i"
196
- f.puts "#Ensembl Transcript ID\t5' UTR Length"
197
- utrs.each do |seq,trans|
198
- trans.each do |tran|
199
- f.puts [tran, seq.length] * "\t" if seq =~ /^[ACTG]+$/
200
- end
201
- end
202
- end
203
- end
204
-
205
187
  file 'gene_positions' do |t|
206
188
  sequences = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_gene_positions, [])
207
189
 
@@ -225,25 +207,6 @@ file 'gene_sequence' do |t|
225
207
  end
226
208
  end
227
209
 
228
- file 'protein_sequence' => 'chromosomes' do |t|
229
- #chromosomes = TSV.open(t.prerequisites.first).keys
230
- #sequences = BioMart.tsv($biomart_db, $biomart_ensembl_protein, $biomart_protein_sequence, [], nil, :type => :flat, :namespace => $namespace, :chunk_filter => ['chromosome_name', chromosomes])
231
- sequences = BioMart.tsv($biomart_db, $biomart_ensembl_protein, $biomart_protein_sequence, [], nil, :type => :flat, :namespace => $namespace)
232
-
233
- File.open(t.name, 'w') do |f|
234
- f.puts "#: :type=:single"
235
- f.puts "#Ensembl Protein ID\tProtein Sequence"
236
- sequences.each do |seq, genes|
237
- genes.each do |gene|
238
- f.write gene
239
- f.write "\t"
240
- f.write seq
241
- f.write "\n"
242
- end
243
- end
244
- end
245
- end
246
-
247
210
  file 'exons' => 'gene_positions' do |t|
248
211
  exons = BioMart.tsv($biomart_db, $biomart_ensembl_exon, $biomart_exons, [], nil, :merge => false, :type => :list, :namespace => $namespace)
249
212
  exons.attach TSV.open('gene_positions'), :fields => ["Chromosome Name"]
@@ -264,18 +227,6 @@ file 'exon_phase' do |t|
264
227
  end
265
228
 
266
229
 
267
- #file 'transcript_phase' do |t|
268
- # tsv = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["Phase"], :type => :single, :cast => :to_i)
269
- #
270
- # transcript_cds_start = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, [['CDNA Start','cds_start']], [], nil, :type => :flat, :namespace => $namespace)
271
- # transcript_cds_start.through do |transcript, values|
272
- # phase = values.compact.reject{|p| p.empty?}.select{|p| p == "1" or p == "2"}.first
273
- # tsv[transcript] = phase.to_i unless phase.nil?
274
- # end
275
- #
276
- # File.open(t.name, 'w') do |f| f.puts tsv end
277
- #end
278
-
279
230
  file 'transcript_phase' => ['exon_phase', 'transcript_exons'] do |t|
280
231
  tsv = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["phase"], :type => :single, :cast => :to_i)
281
232
 
@@ -306,28 +257,10 @@ file 'transcript_phase' => ['exon_phase', 'transcript_exons'] do |t|
306
257
  end
307
258
 
308
259
 
309
- file 'transcript_sequence' do |t|
310
- sequences = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_sequence, [], nil, :type => :flat, :namespace => $namespace)
311
-
312
- File.open(t.name, 'w') do |f|
313
- f.puts "#: :type=:single"
314
- f.puts "#Ensembl Transcript ID\tTranscript Sequence"
315
- sequences.each do |seq, genes|
316
- genes.each do |gene|
317
- f.write gene
318
- f.write "\t"
319
- f.write seq
320
- f.write "\n"
321
- end
322
- end
323
- end
324
- end
325
-
326
-
327
260
  #{{{ Variations
328
261
 
329
262
  $biomart_variation_id = ["SNP ID", "refsnp_id"]
330
- $biomart_variation_position = [["Chromosome Name", "chr_name"], ["Chromosome Start", "chrom_start"]]
263
+ $biomart_variation_position = [["Chromosome Name", "chr_name"], ["Chromosome Start", "chrom_start"], ["Variant Alleles", "allele"]]
331
264
 
332
265
  file 'germline_variations' do |t|
333
266
  BioMart.tsv($biomart_db_germline_variation, $biomart_variation_id, $biomart_variation_position, [], nil, :keep_empty => true, :type => :list, :filename => t.name, :namespace => $namespace)
@@ -357,7 +290,7 @@ def coding_transcripts_for_exon(exon, exon_transcripts, transcript_info)
357
290
  []
358
291
  end
359
292
 
360
- transcripts.select{|transcript| transcript_info[transcript].first.any?}
293
+ transcripts.reject{|transcript| transcript_info[transcript].first.empty?}
361
294
  end
362
295
 
363
296
  def exon_offset_in_transcript(exon, transcript, exons, transcript_exons)
@@ -420,6 +353,10 @@ file 'gene_go' do |t|
420
353
  if File.basename(FileUtils.pwd) =~ /^[a-z]{3}([0-9]{4})$/i and $1.to_i <= 2009
421
354
  goterms = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_go_2009, [], nil, :type => :double, :namespace => $namespace)
422
355
 
356
+ goterms.each do |key, values|
357
+ values.each do |list| list.uniq! end
358
+ end
359
+
423
360
  goterms.add_field "GO ID" do |key, values|
424
361
  values.flatten.compact.reject{|go| go.empty?}
425
362
  end
@@ -453,11 +390,48 @@ file 'gene_go_bp' => 'gene_go' do |t|
453
390
  File.open(t.name, 'w') do |f| f.puts gene_go.slice "GO ID" end
454
391
  end
455
392
 
393
+ file 'gene_go_cc' => 'gene_go' do |t| # Restricts the 'gene_go' table to cellular_component terms
394
+ gene_go = TSV.open(t.prerequisites.first) # the only prerequisite is 'gene_go'
395
+
396
+ gene_go.monitor = true
397
+ gene_go.process "GO ID" do |key, go_id, values|
398
+ clean = values.zip_fields.select do |id, type| # keep only the (id, type) pairs of the wanted namespace
399
+ type == "cellular_component"
400
+ end
401
+ clean.collect{|id, type| id} # the block result is the list of surviving ids
402
+ end
403
+
404
+
405
+ File.open(t.name, 'w') do |f| f.puts gene_go.slice "GO ID" end # only the "GO ID" field is written out
406
+ end
407
+
408
+ file 'gene_go_mf' => 'gene_go' do |t| # Restricts the 'gene_go' table to molecular_function terms
409
+ gene_go = TSV.open(t.prerequisites.first) # the only prerequisite is 'gene_go'
410
+
411
+ gene_go.monitor = true
412
+ gene_go.process "GO ID" do |key, go_id, values|
413
+ clean = values.zip_fields.select do |id, type| # keep only the (id, type) pairs of the wanted namespace
414
+ type == "molecular_function"
415
+ end
416
+ clean.collect{|id, type| id} # the block result is the list of surviving ids
417
+ end
418
+
419
+
420
+ File.open(t.name, 'w') do |f| f.puts gene_go.slice "GO ID" end # only the "GO ID" field is written out
421
+ end
422
+
423
+
424
+
425
+ file 'gene_biotype' do |t| # Queries BioMart for the 'gene_biotype' attribute per Ensembl gene and writes the result to the task file
426
+ biotype = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_gene_biotype, [], nil, :type => :single, :namespace => $namespace)
427
+
428
+ File.open(t.name, 'w') do |f| f.puts biotype end
429
+ end
456
430
 
457
431
  file 'gene_pfam' do |t|
458
- goterms = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_pfam, [], nil, :type => :double, :namespace => $namespace)
432
+ pfam = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_pfam, [], nil, :type => :double, :namespace => $namespace)
459
433
 
460
- File.open(t.name, 'w') do |f| f.puts goterms end
434
+ File.open(t.name, 'w') do |f| f.puts pfam end
461
435
  end
462
436
 
463
437
  file 'chromosomes' do |t|
@@ -471,15 +445,7 @@ rule /^chromosome_.*/ do |t|
471
445
 
472
446
  archive = File.basename(FileUtils.pwd) =~ /^([a-z]{3}[0-9]{4})$/i ? $1 : nil
473
447
 
474
- release = case archive
475
- when "may2009"
476
- "release-54"
477
- when "jun2011"
478
- "release-64"
479
- when nil
480
- Open.read("http://www.ensembl.org/info/data/ftp/index.html", :nocache => true).match(/pub\/(\w+-\d+)\/fasta/)[1]
481
- end
482
-
448
+ release = Ensembl.releases[archive]
483
449
 
484
450
  ftp = Net::FTP.new("ftp.ensembl.org")
485
451
  ftp.login
@@ -488,13 +454,16 @@ rule /^chromosome_.*/ do |t|
488
454
  ftp.chdir('dna')
489
455
  file = ftp.nlst.select{|file| file =~ /chromosome\.#{ chr }\.fa/}.first
490
456
 
491
- raise "Fasta file for chromosome not found: #{ chr } - #{ archive }, #{ release }" if file.nil?
457
+ raise "Fasta file for chromosome not found: '#{ chr }' - #{ archive }, #{ release }" if file.nil?
492
458
 
493
459
  Log.debug("Downloading chromosome sequence: #{ file }")
494
- TmpFile.with_file do |tmpfile|
495
- ftp.getbinaryfile(file, tmpfile)
496
- Open.write(t.name, Open.read(tmpfile, :gzip => true).sub(/^>.*\n/,'').gsub(/\s/,''))
497
- ftp.close
460
+
461
+ Misc.lock t.name + '.rake' do
462
+ TmpFile.with_file do |tmpfile|
463
+ ftp.getbinaryfile(file, tmpfile)
464
+ Open.write(t.name, Open.read(tmpfile, :gzip => true).sub(/^>.*\n/,'').gsub(/\s/,''))
465
+ ftp.close
466
+ end
498
467
  end
499
468
  end
500
469
 
@@ -520,3 +489,298 @@ rule /[a-z]{3}[0-9]{4}\/.*/i do |t|
520
489
  BioMart.unset_archive
521
490
  end
522
491
  end
492
+
493
+
494
+
495
+
496
+
497
+ #{{{ Special rules
498
+ require 'bio'
499
+
500
+ file 'transcript_sequence' => ["exons", "transcript_exons"] do |t| # Builds each transcript sequence by concatenating its exon ranges cut out of the chromosome sequence
501
+ exon_info = TSV.open('exons', :type => :list, :fields => ["Exon Strand", "Exon Chr Start", "Exon Chr End", "Chromosome Name"], :unnamed => true)
502
+
503
+ chr_transcript_ranges ||= {} # chromosome => { transcript => [[start, end], ...] }
504
+ transcript_strand = {} # transcript => strand string
505
+
506
+ TSV.open('transcript_exons', :unnamed => true).through do |transcript, values|
507
+ transcript_ranges = []
508
+
509
+ exons = Misc.zip_fields(values).sort_by{|exon,rank| rank.to_i}.collect{|exon,rank| exon} # exon ids ordered by rank
510
+
511
+ chr = nil
512
+ strand = nil
513
+ exons.each do |exon|
514
+ strand, start, eend, chr = exon_info[exon]
515
+ start = start.to_i
516
+ eend = eend.to_i
517
+ transcript_ranges << [start, eend]
518
+ end
519
+
520
+ transcript_strand[transcript] = strand # strand and chr hold the values of the last exon visited
521
+
522
+ chr_transcript_ranges[chr] ||= {}
523
+ chr_transcript_ranges[chr][transcript] ||= transcript_ranges
524
+ end
525
+
526
+ transcript_sequence = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["Sequence"], :type => :single)
527
+ chr_transcript_ranges.each do |chr, transcript_ranges|
528
+
529
+ begin
530
+ p = Organism.root
531
+ p.replace File.expand_path("./chromosome_#{chr}")
532
+ p.sub!(/.*\/.rbbt\//,'')
533
+ p = Path.setup(p, 'rbbt', Organism)
534
+ chr_str = p.produce.read # whole chromosome sequence as one string
535
+ rescue Exception # NOTE(review): this also traps interrupts and exit requests; StandardError may be what is intended — confirm what produce can raise
536
+ Log.debug("Chr #{ chr } failed (#{transcript_ranges.length} transcripts not covered): #{$!.message}")
537
+ next # transcripts on a chromosome that could not be read are left out of the output
538
+ end
539
+
540
+ transcript_ranges.each do |transcript, ranges|
541
+ strand = transcript_strand[transcript]
542
+ ranges = ranges.reverse if strand == "-1" # reverse the rank-ordered exon list for reverse-strand transcripts
543
+
544
+ sequence = ranges.inject(""){|acc, range|
545
+ start, eend = range
546
+ raise "Chromosome #{ chr } is too short (#{eend - chr_str.length } bases) for transcript #{ transcript } ([#{ start }, #{ eend }])." if chr_str.length < eend
547
+ acc << chr_str[start-1..eend-1] # presumably converts 1-based inclusive coordinates to 0-based string indices — confirm
548
+ }
549
+
550
+ sequence = Bio::Sequence::NA.new(sequence).complement.upcase if strand == "-1" # NOTE(review): relies on BioRuby's complement also reversing the sequence — confirm
551
+ transcript_sequence[transcript] = sequence
552
+ end
553
+ end
554
+
555
+ Misc.lock t.name + '.rake' do
556
+ Open.write(t.name, transcript_sequence.to_s)
557
+ end
558
+ end
559
+
560
+ file 'transcript_5utr' => ["exons", "transcript_exons", "transcripts"] do |t|
561
+ path = File.expand_path(t.name)
562
+ dirname = File.dirname(path)
563
+ organism = File.basename(dirname)
564
+
565
+ if organism =~ /[a-z]{3}20[0-9]{2}/
566
+ build = organism
567
+ organism = File.basename(File.dirname(dirname))
568
+ organism = File.join(organism, build)
569
+ end
570
+
571
+ translation = Ensembl::FTP.ensembl_tsv(organism, 'translation', 'transcript_id', %w(seq_start start_exon_id seq_end end_exon_id), :type => :list, :unmamed => true)
572
+
573
+ if Ensembl::FTP.has_table?(organism, 'exon_stable_id')
574
+ exon2ensembl = Ensembl::FTP.ensembl_tsv(organism, 'exon_stable_id', 'exon_id', ['stable_id'], :type => :single, :unnamed => true)
575
+ else
576
+ exon2ensembl = Ensembl::FTP.ensembl_tsv(organism, 'exon', 'exon_id', ['stable_id'], :type => :single, :unnamed => true)
577
+ end
578
+
579
+ if Ensembl::FTP.has_table?(organism, 'exon_stable_id')
580
+ transcript2ensembl = Ensembl::FTP.ensembl_tsv(organism, 'transcript_stable_id', 'transcript_id', ['stable_id'], :type => :single, :unnamed => true)
581
+ else
582
+ transcript2ensembl = Ensembl::FTP.ensembl_tsv(organism, 'transcript', 'transcript_id', ['stable_id'], :type => :single, :unnamed => true)
583
+ end
584
+
585
+ transcript_protein = TSV.open("./transcripts", :key_field => "Ensembl Transcript ID", :fields => ["Ensembl Protein ID"], :type => :single, :unmamed => true)
586
+ transcript_exons = TSV.open("./transcript_exons", :unmamed => true)
587
+ exon_ranges = TSV.open("./exons",:fields => ["Exon Chr Start", "Exon Chr End"], :cast => :to_i, :unmamed => true)
588
+
589
+ transcript_utr5 = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["5' UTR Length"], :cast => :to_i, :type => :single)
590
+ transcript_utr3 = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["3' UTR Length"], :cast => :to_i, :type => :single)
591
+
592
+ translation.through do |transcript_id, values|
593
+ start, start_exon, eend, eend_exon = values
594
+
595
+ transcript = transcript2ensembl[transcript_id]
596
+ protein = transcript_protein[transcript]
597
+
598
+ start_exon = exon2ensembl[start_exon]
599
+ eend_exon = exon2ensembl[eend_exon]
600
+
601
+ exon_and_rank = Hash[*Misc.zip_fields(transcript_exons[transcript]).flatten]
602
+
603
+ start_exon_rank = exon_and_rank[start_exon].to_i
604
+ skipped_exons = exon_and_rank.select{|exon,rank| rank.to_i < start_exon_rank}.collect{|exon,rank| exon }
605
+ skipped_exon_bases = skipped_exons.inject(0){|acc,exon| exon_start, exon_eend = exon_ranges[exon]; acc += exon_eend - exon_start + 1}
606
+
607
+ utr5 = skipped_exon_bases + start.to_i - 1
608
+ transcript_utr5[transcript] = utr5
609
+
610
+ eend_exon_rank = exon_and_rank[eend_exon].to_i
611
+ extra_exons = exon_and_rank.select{|exon,rank| rank.to_i >= eend_exon_rank}.collect{|exon,rank| exon }
612
+ extra_exon_bases = extra_exons.inject(0){|acc,exon| exon_start, exon_eend = exon_ranges[exon]; acc += exon_eend - exon_start + 1}
613
+
614
+ utr3 = extra_exon_bases - eend.to_i
615
+ transcript_utr3[transcript] = utr3
616
+ end
617
+
618
+ Misc.lock t.name + '.rake' do
619
+ Open.write(t.name, transcript_utr5.to_s)
620
+ Open.write(t.name.sub('transcript_5utr', 'transcript_3utr'), transcript_utr3.to_s)
621
+ end
622
+ end
623
+
624
+ file 'transcript_3utr' => ["transcript_5utr"] do |t| # Intentionally empty: the 'transcript_5utr' task writes the transcript_3utr file alongside its own output
625
+ end
626
+
627
+ file 'protein_sequence' => ["transcripts", "transcript_5utr", "transcript_3utr", "transcript_sequence"] do |t| # Translates each transcript's sequence, with both UTRs removed, into its protein sequence
628
+ transcript_5utr = TSV.open(File.expand_path('./transcript_5utr'), :unnamed => true)
629
+ transcript_3utr = TSV.open(File.expand_path('./transcript_3utr'), :unnamed => true)
630
+ transcript_sequence = TSV.open(File.expand_path('./transcript_sequence'), :unnamed => true)
631
+ transcript_protein = TSV.open(File.expand_path('./transcripts'), :fields => ["Ensembl Protein ID"], :type => :single, :unnamed => true)
632
+
633
+
634
+ protein_sequence = TSV.setup({}, :key_field => "Ensembl Protein ID", :fields => ["Sequence"], :type => :single)
635
+ transcript_sequence.through do |transcript, sequence|
636
+ protein = transcript_protein[transcript]
637
+ next if protein.nil? or protein.empty? # transcripts without a protein id are skipped
638
+ utr5 = transcript_5utr[transcript]
639
+ utr3 = transcript_3utr[transcript]
640
+ psequence = Bio::Sequence::NA.new(sequence[utr5..sequence.length-utr3-1]).translate # drop utr5 leading and utr3 trailing bases, then translate; assumes both lengths are present as integers — TODO confirm
641
+ protein_sequence[protein]=psequence
642
+ end
643
+
644
+ Misc.lock t.name + '.rake' do
645
+ Open.write(t.name, protein_sequence.to_s)
646
+ end
647
+ end
648
+
649
+ #{{{ OLD
650
+
651
+ #file 'transcript_phase' do |t|
652
+ # tsv = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["Phase"], :type => :single, :cast => :to_i)
653
+ #
654
+ # transcript_cds_start = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, [['CDNA Start','cds_start']], [], nil, :type => :flat, :namespace => $namespace)
655
+ # transcript_cds_start.through do |transcript, values|
656
+ # phase = values.compact.reject{|p| p.empty?}.select{|p| p == "1" or p == "2"}.first
657
+ # tsv[transcript] = phase.to_i unless phase.nil?
658
+ # end
659
+ #
660
+ # File.open(t.name, 'w') do |f| f.puts tsv end
661
+ #end
662
+ #
663
+ #file 'transcript_3utr' do |t|
664
+ # utrs = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_3utr, [], nil, :type => :flat, :namespace => $namespace)
665
+ #
666
+ # File.open(t.name, 'w') do |f|
667
+ # f.puts "#: :type=:single#cast=to_i"
668
+ # f.puts "#Ensembl Transcript ID\t3' UTR Length"
669
+ # utrs.each do |seq,trans|
670
+ # trans.each do |tran|
671
+ # f.puts [tran, seq.length] * "\t" if seq =~ /^[ACTG]+$/
672
+ # end
673
+ # end
674
+ # end
675
+ #end
676
+ #
677
+ #file 'transcript_5utr' do |t|
678
+ # utrs = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_5utr, [], nil, :type => :flat, :namespace => $namespace)
679
+ #
680
+ # File.open(t.name, 'w') do |f|
681
+ # f.puts "#: :type=:single#cast=to_i"
682
+ # f.puts "#Ensembl Transcript ID\t5' UTR Length"
683
+ # utrs.each do |seq,trans|
684
+ # trans.each do |tran|
685
+ # f.puts [tran, seq.length] * "\t" if seq =~ /^[ACTG]+$/
686
+ # end
687
+ # end
688
+ # end
689
+ #end
690
+
691
+
692
+
693
+ #file 'transcript_sequence' do |t|
694
+ # sequences = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_sequence, [], nil, :type => :flat, :namespace => $namespace)
695
+ #
696
+ # File.open(t.name, 'w') do |f|
697
+ # f.puts "#: :type=:single"
698
+ # f.puts "#Ensembl Transcript ID\tTranscript Sequence"
699
+ # sequences.each do |seq, genes|
700
+ # genes.each do |gene|
701
+ # f.write gene
702
+ # f.write "\t"
703
+ # f.write seq
704
+ # f.write "\n"
705
+ # end
706
+ # end
707
+ # end
708
+ #end
709
+
710
+ #file 'transcript_phase' do |t|
711
+ # tsv = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["Phase"], :type => :single, :cast => :to_i)
712
+ #
713
+ # transcript_cds_start = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, [['CDNA Start','cds_start']], [], nil, :type => :flat, :namespace => $namespace)
714
+ # transcript_cds_start.through do |transcript, values|
715
+ # phase = values.compact.reject{|p| p.empty?}.select{|p| p == "1" or p == "2"}.first
716
+ # tsv[transcript] = phase.to_i unless phase.nil?
717
+ # end
718
+ #
719
+ # File.open(t.name, 'w') do |f| f.puts tsv end
720
+ #end
721
+ #
722
+ #file 'transcript_3utr' do |t|
723
+ # utrs = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_3utr, [], nil, :type => :flat, :namespace => $namespace)
724
+ #
725
+ # File.open(t.name, 'w') do |f|
726
+ # f.puts "#: :type=:single#cast=to_i"
727
+ # f.puts "#Ensembl Transcript ID\t3' UTR Length"
728
+ # utrs.each do |seq,trans|
729
+ # trans.each do |tran|
730
+ # f.puts [tran, seq.length] * "\t" if seq =~ /^[ACTG]+$/
731
+ # end
732
+ # end
733
+ # end
734
+ #end
735
+ #
736
+ #file 'transcript_5utr' do |t|
737
+ # utrs = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_5utr, [], nil, :type => :flat, :namespace => $namespace)
738
+ #
739
+ # File.open(t.name, 'w') do |f|
740
+ # f.puts "#: :type=:single#cast=to_i"
741
+ # f.puts "#Ensembl Transcript ID\t5' UTR Length"
742
+ # utrs.each do |seq,trans|
743
+ # trans.each do |tran|
744
+ # f.puts [tran, seq.length] * "\t" if seq =~ /^[ACTG]+$/
745
+ # end
746
+ # end
747
+ # end
748
+ #end
749
+
750
+ #file 'transcript_sequence' do |t|
751
+ # sequences = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_sequence, [], nil, :type => :flat, :namespace => $namespace)
752
+ #
753
+ # File.open(t.name, 'w') do |f|
754
+ # f.puts "#: :type=:single"
755
+ # f.puts "#Ensembl Transcript ID\tTranscript Sequence"
756
+ # sequences.each do |seq, genes|
757
+ # genes.each do |gene|
758
+ # f.write gene
759
+ # f.write "\t"
760
+ # f.write seq
761
+ # f.write "\n"
762
+ # end
763
+ # end
764
+ # end
765
+ #end
766
+ #file 'protein_sequence' => 'chromosomes' do |t|
767
+ # #chromosomes = TSV.open(t.prerequisites.first).keys
768
+ # #sequences = BioMart.tsv($biomart_db, $biomart_ensembl_protein, $biomart_protein_sequence, [], nil, :type => :flat, :namespace => $namespace, :chunk_filter => ['chromosome_name', chromosomes])
769
+ # sequences = BioMart.tsv($biomart_db, $biomart_ensembl_protein, $biomart_protein_sequence, [], nil, :type => :flat, :namespace => $namespace)
770
+ #
771
+ # File.open(t.name, 'w') do |f|
772
+ # f.puts "#: :type=:single"
773
+ # f.puts "#Ensembl Protein ID\tProtein Sequence"
774
+ # sequences.each do |seq, genes|
775
+ # genes.each do |gene|
776
+ # f.write gene
777
+ # f.write "\t"
778
+ # f.write seq
779
+ # f.write "\n"
780
+ # end
781
+ # end
782
+ # end
783
+ #end
784
+
785
+
786
+