rbbt-sources 1.2.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/etc/biomart/missing_in_archive +11 -0
- data/lib/rbbt/sources/COSMIC.rb +47 -4
- data/lib/rbbt/sources/HPRD.rb +23 -0
- data/lib/rbbt/sources/InterPro.rb +98 -8
- data/lib/rbbt/sources/NCI.rb +7 -5
- data/lib/rbbt/sources/PSI_MI.rb +41 -0
- data/lib/rbbt/sources/STITCH.rb +92 -0
- data/lib/rbbt/sources/barcode.rb +0 -3
- data/lib/rbbt/sources/biomart.rb +3 -3
- data/lib/rbbt/sources/dbSNP.rb +100 -0
- data/lib/rbbt/sources/ensembl_ftp.rb +79 -0
- data/lib/rbbt/sources/entrez.rb +2 -2
- data/lib/rbbt/sources/genomes1000.rb +45 -0
- data/lib/rbbt/sources/go.rb +16 -4
- data/lib/rbbt/sources/organism.rb +80 -12
- data/lib/rbbt/sources/pfam.rb +63 -3
- data/lib/rbbt/sources/pubmed.rb +10 -3
- data/lib/rbbt/sources/reactome.rb +82 -0
- data/lib/rbbt/sources/tfacts.rb +37 -36
- data/lib/rbbt/sources/uniprot.rb +25 -23
- data/share/Ensembl/release_dates +18 -0
- data/share/install/Genomes1000/Rakefile +15 -0
- data/share/install/JoChem/Rakefile +11 -3
- data/share/install/NCI/Rakefile +54 -16
- data/share/install/Organism/Hsa/Rakefile +3 -2
- data/share/install/Organism/Rno/Rakefile +1 -2
- data/share/install/Organism/Sce/Rakefile +43 -45
- data/share/install/Organism/organism_helpers.rb +360 -96
- data/share/install/STITCH/Rakefile +0 -0
- data/test/rbbt/sources/test_organism.rb +26 -7
- data/test/rbbt/sources/test_pubmed.rb +5 -0
- metadata +94 -97
- data/share/install/InterPro/Rakefile +0 -29
@@ -1,4 +1,6 @@
|
|
1
1
|
require 'net/ftp'
|
2
|
+
require 'rbbt/sources/ensembl_ftp'
|
3
|
+
|
2
4
|
|
3
5
|
$biomart_ensembl_gene = ['Ensembl Gene ID', 'ensembl_gene_id']
|
4
6
|
$biomart_ensembl_protein = ['Ensembl Protein ID', 'ensembl_peptide_id']
|
@@ -62,6 +64,10 @@ $biomart_pfam= [
|
|
62
64
|
["Pfam Domain", 'pfam'],
|
63
65
|
]
|
64
66
|
|
67
|
+
$biomart_gene_biotype= [
|
68
|
+
["Biotype", 'gene_biotype'],
|
69
|
+
]
|
70
|
+
|
65
71
|
$biomart_exons = [
|
66
72
|
$biomart_ensembl_gene,
|
67
73
|
['Exon Strand','strand'],
|
@@ -71,6 +77,10 @@ $biomart_exons = [
|
|
71
77
|
|
72
78
|
#{{{ Rules
|
73
79
|
|
80
|
+
file 'entrez_taxids' do |t|
|
81
|
+
File.open(t.name, 'w') do |f| f.write $taxs * "\n" end
|
82
|
+
end
|
83
|
+
|
74
84
|
file 'scientific_name' do |t|
|
75
85
|
File.open(t.name, 'w') do |f| f.write $scientific_name end
|
76
86
|
end
|
@@ -108,7 +118,7 @@ file 'identifiers' do |t|
|
|
108
118
|
end
|
109
119
|
end
|
110
120
|
|
111
|
-
entrez_synonyms = Rbbt.share.databases.entrez.gene_info.tsv :grep => $taxs.collect{|tax| "^#{tax}"}, :key_field => 1, :fields => 4
|
121
|
+
entrez_synonyms = Rbbt.share.databases.entrez.gene_info.find.tsv :grep => $taxs.collect{|tax| "^#{tax}"}, :key_field => 1, :fields => [4]
|
112
122
|
entrez_synonyms.key_field = "Entrez Gene ID"
|
113
123
|
entrez_synonyms.fields = ["Entrez Gene Name Synonyms"]
|
114
124
|
|
@@ -174,34 +184,6 @@ file 'transcripts' => 'gene_positions' do |t|
|
|
174
184
|
File.open(t.name, 'w') do |f| f.puts transcripts end
|
175
185
|
end
|
176
186
|
|
177
|
-
file 'transcript_3utr' do |t|
|
178
|
-
utrs = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_3utr, [], nil, :type => :flat, :namespace => $namespace)
|
179
|
-
|
180
|
-
File.open(t.name, 'w') do |f|
|
181
|
-
f.puts "#: :type=:single#cast=to_i"
|
182
|
-
f.puts "#Ensembl Transcript ID\t3' UTR Length"
|
183
|
-
utrs.each do |seq,trans|
|
184
|
-
trans.each do |tran|
|
185
|
-
f.puts [tran, seq.length] * "\t" if seq =~ /^[ACTG]+$/
|
186
|
-
end
|
187
|
-
end
|
188
|
-
end
|
189
|
-
end
|
190
|
-
|
191
|
-
file 'transcript_5utr' do |t|
|
192
|
-
utrs = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_5utr, [], nil, :type => :flat, :namespace => $namespace)
|
193
|
-
|
194
|
-
File.open(t.name, 'w') do |f|
|
195
|
-
f.puts "#: :type=:single#cast=to_i"
|
196
|
-
f.puts "#Ensembl Transcript ID\t5' UTR Length"
|
197
|
-
utrs.each do |seq,trans|
|
198
|
-
trans.each do |tran|
|
199
|
-
f.puts [tran, seq.length] * "\t" if seq =~ /^[ACTG]+$/
|
200
|
-
end
|
201
|
-
end
|
202
|
-
end
|
203
|
-
end
|
204
|
-
|
205
187
|
file 'gene_positions' do |t|
|
206
188
|
sequences = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_gene_positions, [])
|
207
189
|
|
@@ -225,25 +207,6 @@ file 'gene_sequence' do |t|
|
|
225
207
|
end
|
226
208
|
end
|
227
209
|
|
228
|
-
file 'protein_sequence' => 'chromosomes' do |t|
|
229
|
-
#chromosomes = TSV.open(t.prerequisites.first).keys
|
230
|
-
#sequences = BioMart.tsv($biomart_db, $biomart_ensembl_protein, $biomart_protein_sequence, [], nil, :type => :flat, :namespace => $namespace, :chunk_filter => ['chromosome_name', chromosomes])
|
231
|
-
sequences = BioMart.tsv($biomart_db, $biomart_ensembl_protein, $biomart_protein_sequence, [], nil, :type => :flat, :namespace => $namespace)
|
232
|
-
|
233
|
-
File.open(t.name, 'w') do |f|
|
234
|
-
f.puts "#: :type=:single"
|
235
|
-
f.puts "#Ensembl Protein ID\tProtein Sequence"
|
236
|
-
sequences.each do |seq, genes|
|
237
|
-
genes.each do |gene|
|
238
|
-
f.write gene
|
239
|
-
f.write "\t"
|
240
|
-
f.write seq
|
241
|
-
f.write "\n"
|
242
|
-
end
|
243
|
-
end
|
244
|
-
end
|
245
|
-
end
|
246
|
-
|
247
210
|
file 'exons' => 'gene_positions' do |t|
|
248
211
|
exons = BioMart.tsv($biomart_db, $biomart_ensembl_exon, $biomart_exons, [], nil, :merge => false, :type => :list, :namespace => $namespace)
|
249
212
|
exons.attach TSV.open('gene_positions'), :fields => ["Chromosome Name"]
|
@@ -264,18 +227,6 @@ file 'exon_phase' do |t|
|
|
264
227
|
end
|
265
228
|
|
266
229
|
|
267
|
-
#file 'transcript_phase' do |t|
|
268
|
-
# tsv = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["Phase"], :type => :single, :cast => :to_i)
|
269
|
-
#
|
270
|
-
# transcript_cds_start = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, [['CDNA Start','cds_start']], [], nil, :type => :flat, :namespace => $namespace)
|
271
|
-
# transcript_cds_start.through do |transcript, values|
|
272
|
-
# phase = values.compact.reject{|p| p.empty?}.select{|p| p == "1" or p == "2"}.first
|
273
|
-
# tsv[transcript] = phase.to_i unless phase.nil?
|
274
|
-
# end
|
275
|
-
#
|
276
|
-
# File.open(t.name, 'w') do |f| f.puts tsv end
|
277
|
-
#end
|
278
|
-
|
279
230
|
file 'transcript_phase' => ['exon_phase', 'transcript_exons'] do |t|
|
280
231
|
tsv = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["phase"], :type => :single, :cast => :to_i)
|
281
232
|
|
@@ -306,28 +257,10 @@ file 'transcript_phase' => ['exon_phase', 'transcript_exons'] do |t|
|
|
306
257
|
end
|
307
258
|
|
308
259
|
|
309
|
-
file 'transcript_sequence' do |t|
|
310
|
-
sequences = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_sequence, [], nil, :type => :flat, :namespace => $namespace)
|
311
|
-
|
312
|
-
File.open(t.name, 'w') do |f|
|
313
|
-
f.puts "#: :type=:single"
|
314
|
-
f.puts "#Ensembl Transcript ID\tTranscript Sequence"
|
315
|
-
sequences.each do |seq, genes|
|
316
|
-
genes.each do |gene|
|
317
|
-
f.write gene
|
318
|
-
f.write "\t"
|
319
|
-
f.write seq
|
320
|
-
f.write "\n"
|
321
|
-
end
|
322
|
-
end
|
323
|
-
end
|
324
|
-
end
|
325
|
-
|
326
|
-
|
327
260
|
#{{{ Variations
|
328
261
|
|
329
262
|
$biomart_variation_id = ["SNP ID", "refsnp_id"]
|
330
|
-
$biomart_variation_position = [["Chromosome Name", "chr_name"], ["Chromosome Start", "chrom_start"]]
|
263
|
+
$biomart_variation_position = [["Chromosome Name", "chr_name"], ["Chromosome Start", "chrom_start"], ["Variant Alleles", "allele"]]
|
331
264
|
|
332
265
|
file 'germline_variations' do |t|
|
333
266
|
BioMart.tsv($biomart_db_germline_variation, $biomart_variation_id, $biomart_variation_position, [], nil, :keep_empty => true, :type => :list, :filename => t.name, :namespace => $namespace)
|
@@ -357,7 +290,7 @@ def coding_transcripts_for_exon(exon, exon_transcripts, transcript_info)
|
|
357
290
|
[]
|
358
291
|
end
|
359
292
|
|
360
|
-
transcripts.
|
293
|
+
transcripts.reject{|transcript| transcript_info[transcript].first.empty?}
|
361
294
|
end
|
362
295
|
|
363
296
|
def exon_offset_in_transcript(exon, transcript, exons, transcript_exons)
|
@@ -420,6 +353,10 @@ file 'gene_go' do |t|
|
|
420
353
|
if File.basename(FileUtils.pwd) =~ /^[a-z]{3}([0-9]{4})$/i and $1.to_i <= 2009
|
421
354
|
goterms = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_go_2009, [], nil, :type => :double, :namespace => $namespace)
|
422
355
|
|
356
|
+
goterms.each do |key, values|
|
357
|
+
values.each do |list| list.uniq! end
|
358
|
+
end
|
359
|
+
|
423
360
|
goterms.add_field "GO ID" do |key, values|
|
424
361
|
values.flatten.compact.reject{|go| go.empty?}
|
425
362
|
end
|
@@ -453,11 +390,48 @@ file 'gene_go_bp' => 'gene_go' do |t|
|
|
453
390
|
File.open(t.name, 'w') do |f| f.puts gene_go.slice "GO ID" end
|
454
391
|
end
|
455
392
|
|
393
|
+
file 'gene_go_cc' => 'gene_go' do |t|
|
394
|
+
gene_go = TSV.open(t.prerequisites.first)
|
395
|
+
|
396
|
+
gene_go.monitor = true
|
397
|
+
gene_go.process "GO ID" do |key, go_id, values|
|
398
|
+
clean = values.zip_fields.select do |id, type|
|
399
|
+
type == "cellular_component"
|
400
|
+
end
|
401
|
+
clean.collect{|id, type| id}
|
402
|
+
end
|
403
|
+
|
404
|
+
|
405
|
+
File.open(t.name, 'w') do |f| f.puts gene_go.slice "GO ID" end
|
406
|
+
end
|
407
|
+
|
408
|
+
file 'gene_go_mf' => 'gene_go' do |t|
|
409
|
+
gene_go = TSV.open(t.prerequisites.first)
|
410
|
+
|
411
|
+
gene_go.monitor = true
|
412
|
+
gene_go.process "GO ID" do |key, go_id, values|
|
413
|
+
clean = values.zip_fields.select do |id, type|
|
414
|
+
type == "molecular_function"
|
415
|
+
end
|
416
|
+
clean.collect{|id, type| id}
|
417
|
+
end
|
418
|
+
|
419
|
+
|
420
|
+
File.open(t.name, 'w') do |f| f.puts gene_go.slice "GO ID" end
|
421
|
+
end
|
422
|
+
|
423
|
+
|
424
|
+
|
425
|
+
file 'gene_biotype' do |t|
|
426
|
+
biotype = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_gene_biotype, [], nil, :type => :single, :namespace => $namespace)
|
427
|
+
|
428
|
+
File.open(t.name, 'w') do |f| f.puts biotype end
|
429
|
+
end
|
456
430
|
|
457
431
|
file 'gene_pfam' do |t|
|
458
|
-
|
432
|
+
pfam = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_pfam, [], nil, :type => :double, :namespace => $namespace)
|
459
433
|
|
460
|
-
File.open(t.name, 'w') do |f| f.puts
|
434
|
+
File.open(t.name, 'w') do |f| f.puts pfam end
|
461
435
|
end
|
462
436
|
|
463
437
|
file 'chromosomes' do |t|
|
@@ -471,15 +445,7 @@ rule /^chromosome_.*/ do |t|
|
|
471
445
|
|
472
446
|
archive = File.basename(FileUtils.pwd) =~ /^([a-z]{3}[0-9]{4})$/i ? $1 : nil
|
473
447
|
|
474
|
-
release =
|
475
|
-
when "may2009"
|
476
|
-
"release-54"
|
477
|
-
when "jun2011"
|
478
|
-
"release-64"
|
479
|
-
when nil
|
480
|
-
Open.read("http://www.ensembl.org/info/data/ftp/index.html", :nocache => true).match(/pub\/(\w+-\d+)\/fasta/)[1]
|
481
|
-
end
|
482
|
-
|
448
|
+
release = Ensembl.releases[archive]
|
483
449
|
|
484
450
|
ftp = Net::FTP.new("ftp.ensembl.org")
|
485
451
|
ftp.login
|
@@ -488,13 +454,16 @@ rule /^chromosome_.*/ do |t|
|
|
488
454
|
ftp.chdir('dna')
|
489
455
|
file = ftp.nlst.select{|file| file =~ /chromosome\.#{ chr }\.fa/}.first
|
490
456
|
|
491
|
-
raise "Fasta file for chromosome not found: #{ chr } - #{ archive }, #{ release }" if file.nil?
|
457
|
+
raise "Fasta file for chromosome not found: '#{ chr }' - #{ archive }, #{ release }" if file.nil?
|
492
458
|
|
493
459
|
Log.debug("Downloading chromosome sequence: #{ file }")
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
460
|
+
|
461
|
+
Misc.lock t.name + '.rake' do
|
462
|
+
TmpFile.with_file do |tmpfile|
|
463
|
+
ftp.getbinaryfile(file, tmpfile)
|
464
|
+
Open.write(t.name, Open.read(tmpfile, :gzip => true).sub(/^>.*\n/,'').gsub(/\s/,''))
|
465
|
+
ftp.close
|
466
|
+
end
|
498
467
|
end
|
499
468
|
end
|
500
469
|
|
@@ -520,3 +489,298 @@ rule /[a-z]{3}[0-9]{4}\/.*/i do |t|
|
|
520
489
|
BioMart.unset_archive
|
521
490
|
end
|
522
491
|
end
|
492
|
+
|
493
|
+
|
494
|
+
|
495
|
+
|
496
|
+
|
497
|
+
#{{{ Special rules
|
498
|
+
require 'bio'
|
499
|
+
|
500
|
+
file 'transcript_sequence' => ["exons", "transcript_exons"] do |t|
|
501
|
+
exon_info = TSV.open('exons', :type => :list, :fields => ["Exon Strand", "Exon Chr Start", "Exon Chr End", "Chromosome Name"], :unnamed => true)
|
502
|
+
|
503
|
+
chr_transcript_ranges ||= {}
|
504
|
+
transcript_strand = {}
|
505
|
+
|
506
|
+
TSV.open('transcript_exons', :unnamed => true).through do |transcript, values|
|
507
|
+
transcript_ranges = []
|
508
|
+
|
509
|
+
exons = Misc.zip_fields(values).sort_by{|exon,rank| rank.to_i}.collect{|exon,rank| exon}
|
510
|
+
|
511
|
+
chr = nil
|
512
|
+
strand = nil
|
513
|
+
exons.each do |exon|
|
514
|
+
strand, start, eend, chr = exon_info[exon]
|
515
|
+
start = start.to_i
|
516
|
+
eend = eend.to_i
|
517
|
+
transcript_ranges << [start, eend]
|
518
|
+
end
|
519
|
+
|
520
|
+
transcript_strand[transcript] = strand
|
521
|
+
|
522
|
+
chr_transcript_ranges[chr] ||= {}
|
523
|
+
chr_transcript_ranges[chr][transcript] ||= transcript_ranges
|
524
|
+
end
|
525
|
+
|
526
|
+
transcript_sequence = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["Sequence"], :type => :single)
|
527
|
+
chr_transcript_ranges.each do |chr, transcript_ranges|
|
528
|
+
|
529
|
+
begin
|
530
|
+
p = Organism.root
|
531
|
+
p.replace File.expand_path("./chromosome_#{chr}")
|
532
|
+
p.sub!(/.*\/.rbbt\//,'')
|
533
|
+
p = Path.setup(p, 'rbbt', Organism)
|
534
|
+
chr_str = p.produce.read
|
535
|
+
rescue Exception
|
536
|
+
Log.debug("Chr #{ chr } failed (#{transcript_ranges.length} transcripts not covered): #{$!.message}")
|
537
|
+
next
|
538
|
+
end
|
539
|
+
|
540
|
+
transcript_ranges.each do |transcript, ranges|
|
541
|
+
strand = transcript_strand[transcript]
|
542
|
+
ranges = ranges.reverse if strand == "-1"
|
543
|
+
|
544
|
+
sequence = ranges.inject(""){|acc, range|
|
545
|
+
start, eend = range
|
546
|
+
raise "Chromosome #{ chr } is too short (#{eend - chr_str.length } bases) for transcript #{ transcript } ([#{ start }, #{ eend }])." if chr_str.length < eend
|
547
|
+
acc << chr_str[start-1..eend-1]
|
548
|
+
}
|
549
|
+
|
550
|
+
sequence = Bio::Sequence::NA.new(sequence).complement.upcase if strand == "-1"
|
551
|
+
transcript_sequence[transcript] = sequence
|
552
|
+
end
|
553
|
+
end
|
554
|
+
|
555
|
+
Misc.lock t.name + '.rake' do
|
556
|
+
Open.write(t.name, transcript_sequence.to_s)
|
557
|
+
end
|
558
|
+
end
|
559
|
+
|
560
|
+
file 'transcript_5utr' => ["exons", "transcript_exons", "transcripts"] do |t|
|
561
|
+
path = File.expand_path(t.name)
|
562
|
+
dirname = File.dirname(path)
|
563
|
+
organism = File.basename(dirname)
|
564
|
+
|
565
|
+
if organism =~ /[a-z]{3}20[0-9]{2}/
|
566
|
+
build = organism
|
567
|
+
organism = File.basename(File.dirname(dirname))
|
568
|
+
organism = File.join(organism, build)
|
569
|
+
end
|
570
|
+
|
571
|
+
translation = Ensembl::FTP.ensembl_tsv(organism, 'translation', 'transcript_id', %w(seq_start start_exon_id seq_end end_exon_id), :type => :list, :unnamed => true) # option key is :unnamed, as in the other Ensembl::FTP.ensembl_tsv loads of this task
|
572
|
+
|
573
|
+
if Ensembl::FTP.has_table?(organism, 'exon_stable_id')
|
574
|
+
exon2ensembl = Ensembl::FTP.ensembl_tsv(organism, 'exon_stable_id', 'exon_id', ['stable_id'], :type => :single, :unnamed => true)
|
575
|
+
else
|
576
|
+
exon2ensembl = Ensembl::FTP.ensembl_tsv(organism, 'exon', 'exon_id', ['stable_id'], :type => :single, :unnamed => true)
|
577
|
+
end
|
578
|
+
|
579
|
+
if Ensembl::FTP.has_table?(organism, 'exon_stable_id')
|
580
|
+
transcript2ensembl = Ensembl::FTP.ensembl_tsv(organism, 'transcript_stable_id', 'transcript_id', ['stable_id'], :type => :single, :unnamed => true)
|
581
|
+
else
|
582
|
+
transcript2ensembl = Ensembl::FTP.ensembl_tsv(organism, 'transcript', 'transcript_id', ['stable_id'], :type => :single, :unnamed => true)
|
583
|
+
end
|
584
|
+
|
585
|
+
transcript_protein = TSV.open("./transcripts", :key_field => "Ensembl Transcript ID", :fields => ["Ensembl Protein ID"], :type => :single, :unnamed => true) # option key is :unnamed, matching the other TSV.open calls in this file
|
586
|
+
transcript_exons = TSV.open("./transcript_exons", :unnamed => true) # option key is :unnamed, matching the other TSV.open calls in this file
|
587
|
+
exon_ranges = TSV.open("./exons",:fields => ["Exon Chr Start", "Exon Chr End"], :cast => :to_i, :unnamed => true) # option key is :unnamed, matching the other TSV.open calls in this file
|
588
|
+
|
589
|
+
transcript_utr5 = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["5' UTR Length"], :cast => :to_i, :type => :single)
|
590
|
+
transcript_utr3 = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["3' UTR Length"], :cast => :to_i, :type => :single)
|
591
|
+
|
592
|
+
translation.through do |transcript_id, values|
|
593
|
+
start, start_exon, eend, eend_exon = values
|
594
|
+
|
595
|
+
transcript = transcript2ensembl[transcript_id]
|
596
|
+
protein = transcript_protein[transcript]
|
597
|
+
|
598
|
+
start_exon = exon2ensembl[start_exon]
|
599
|
+
eend_exon = exon2ensembl[eend_exon]
|
600
|
+
|
601
|
+
exon_and_rank = Hash[*Misc.zip_fields(transcript_exons[transcript]).flatten]
|
602
|
+
|
603
|
+
start_exon_rank = exon_and_rank[start_exon].to_i
|
604
|
+
skipped_exons = exon_and_rank.select{|exon,rank| rank.to_i < start_exon_rank}.collect{|exon,rank| exon }
|
605
|
+
skipped_exon_bases = skipped_exons.inject(0){|acc,exon| exon_start, exon_eend = exon_ranges[exon]; acc += exon_eend - exon_start + 1}
|
606
|
+
|
607
|
+
utr5 = skipped_exon_bases + start.to_i - 1
|
608
|
+
transcript_utr5[transcript] = utr5
|
609
|
+
|
610
|
+
eend_exon_rank = exon_and_rank[eend_exon].to_i
|
611
|
+
extra_exons = exon_and_rank.select{|exon,rank| rank.to_i >= eend_exon_rank}.collect{|exon,rank| exon }
|
612
|
+
extra_exon_bases = extra_exons.inject(0){|acc,exon| exon_start, exon_eend = exon_ranges[exon]; acc += exon_eend - exon_start + 1}
|
613
|
+
|
614
|
+
utr3 = extra_exon_bases - eend.to_i
|
615
|
+
transcript_utr3[transcript] = utr3
|
616
|
+
end
|
617
|
+
|
618
|
+
Misc.lock t.name + '.rake' do
|
619
|
+
Open.write(t.name, transcript_utr5.to_s)
|
620
|
+
Open.write(t.name.sub('transcript_5utr', 'transcript_3utr'), transcript_utr3.to_s)
|
621
|
+
end
|
622
|
+
end
|
623
|
+
|
624
|
+
file 'transcript_3utr' => ["transcript_5utr"] do |t|
|
625
|
+
end
|
626
|
+
|
627
|
+
file 'protein_sequence' => ["transcripts", "transcript_5utr", "transcript_3utr", "transcript_sequence"] do |t|
|
628
|
+
transcript_5utr = TSV.open(File.expand_path('./transcript_5utr'), :unnamed => true)
|
629
|
+
transcript_3utr = TSV.open(File.expand_path('./transcript_3utr'), :unnamed => true)
|
630
|
+
transcript_sequence = TSV.open(File.expand_path('./transcript_sequence'), :unnamed => true)
|
631
|
+
transcript_protein = TSV.open(File.expand_path('./transcripts'), :fields => ["Ensembl Protein ID"], :type => :single, :unnamed => true)
|
632
|
+
|
633
|
+
|
634
|
+
protein_sequence = TSV.setup({}, :key_field => "Ensembl Protein ID", :fields => ["Sequence"], :type => :single)
|
635
|
+
transcript_sequence.through do |transcript, sequence|
|
636
|
+
protein = transcript_protein[transcript]
|
637
|
+
next if protein.nil? or protein.empty?
|
638
|
+
utr5 = transcript_5utr[transcript]
|
639
|
+
utr3 = transcript_3utr[transcript]
|
640
|
+
psequence = Bio::Sequence::NA.new(sequence[utr5..sequence.length-utr3-1]).translate
|
641
|
+
protein_sequence[protein]=psequence
|
642
|
+
end
|
643
|
+
|
644
|
+
Misc.lock t.name + '.rake' do
|
645
|
+
Open.write(t.name, protein_sequence.to_s)
|
646
|
+
end
|
647
|
+
end
|
648
|
+
|
649
|
+
#{{{ OLD
|
650
|
+
|
651
|
+
#file 'transcript_phase' do |t|
|
652
|
+
# tsv = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["Phase"], :type => :single, :cast => :to_i)
|
653
|
+
#
|
654
|
+
# transcript_cds_start = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, [['CDNA Start','cds_start']], [], nil, :type => :flat, :namespace => $namespace)
|
655
|
+
# transcript_cds_start.through do |transcript, values|
|
656
|
+
# phase = values.compact.reject{|p| p.empty?}.select{|p| p == "1" or p == "2"}.first
|
657
|
+
# tsv[transcript] = phase.to_i unless phase.nil?
|
658
|
+
# end
|
659
|
+
#
|
660
|
+
# File.open(t.name, 'w') do |f| f.puts tsv end
|
661
|
+
#end
|
662
|
+
#
|
663
|
+
#file 'transcript_3utr' do |t|
|
664
|
+
# utrs = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_3utr, [], nil, :type => :flat, :namespace => $namespace)
|
665
|
+
#
|
666
|
+
# File.open(t.name, 'w') do |f|
|
667
|
+
# f.puts "#: :type=:single#cast=to_i"
|
668
|
+
# f.puts "#Ensembl Transcript ID\t3' UTR Length"
|
669
|
+
# utrs.each do |seq,trans|
|
670
|
+
# trans.each do |tran|
|
671
|
+
# f.puts [tran, seq.length] * "\t" if seq =~ /^[ACTG]+$/
|
672
|
+
# end
|
673
|
+
# end
|
674
|
+
# end
|
675
|
+
#end
|
676
|
+
#
|
677
|
+
#file 'transcript_5utr' do |t|
|
678
|
+
# utrs = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_5utr, [], nil, :type => :flat, :namespace => $namespace)
|
679
|
+
#
|
680
|
+
# File.open(t.name, 'w') do |f|
|
681
|
+
# f.puts "#: :type=:single#cast=to_i"
|
682
|
+
# f.puts "#Ensembl Transcript ID\t5' UTR Length"
|
683
|
+
# utrs.each do |seq,trans|
|
684
|
+
# trans.each do |tran|
|
685
|
+
# f.puts [tran, seq.length] * "\t" if seq =~ /^[ACTG]+$/
|
686
|
+
# end
|
687
|
+
# end
|
688
|
+
# end
|
689
|
+
#end
|
690
|
+
|
691
|
+
|
692
|
+
|
693
|
+
#file 'transcript_sequence' do |t|
|
694
|
+
# sequences = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_sequence, [], nil, :type => :flat, :namespace => $namespace)
|
695
|
+
#
|
696
|
+
# File.open(t.name, 'w') do |f|
|
697
|
+
# f.puts "#: :type=:single"
|
698
|
+
# f.puts "#Ensembl Transcript ID\tTranscript Sequence"
|
699
|
+
# sequences.each do |seq, genes|
|
700
|
+
# genes.each do |gene|
|
701
|
+
# f.write gene
|
702
|
+
# f.write "\t"
|
703
|
+
# f.write seq
|
704
|
+
# f.write "\n"
|
705
|
+
# end
|
706
|
+
# end
|
707
|
+
# end
|
708
|
+
#end
|
709
|
+
|
710
|
+
#file 'transcript_phase' do |t|
|
711
|
+
# tsv = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["Phase"], :type => :single, :cast => :to_i)
|
712
|
+
#
|
713
|
+
# transcript_cds_start = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, [['CDNA Start','cds_start']], [], nil, :type => :flat, :namespace => $namespace)
|
714
|
+
# transcript_cds_start.through do |transcript, values|
|
715
|
+
# phase = values.compact.reject{|p| p.empty?}.select{|p| p == "1" or p == "2"}.first
|
716
|
+
# tsv[transcript] = phase.to_i unless phase.nil?
|
717
|
+
# end
|
718
|
+
#
|
719
|
+
# File.open(t.name, 'w') do |f| f.puts tsv end
|
720
|
+
#end
|
721
|
+
#
|
722
|
+
#file 'transcript_3utr' do |t|
|
723
|
+
# utrs = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_3utr, [], nil, :type => :flat, :namespace => $namespace)
|
724
|
+
#
|
725
|
+
# File.open(t.name, 'w') do |f|
|
726
|
+
# f.puts "#: :type=:single#cast=to_i"
|
727
|
+
# f.puts "#Ensembl Transcript ID\t3' UTR Length"
|
728
|
+
# utrs.each do |seq,trans|
|
729
|
+
# trans.each do |tran|
|
730
|
+
# f.puts [tran, seq.length] * "\t" if seq =~ /^[ACTG]+$/
|
731
|
+
# end
|
732
|
+
# end
|
733
|
+
# end
|
734
|
+
#end
|
735
|
+
#
|
736
|
+
#file 'transcript_5utr' do |t|
|
737
|
+
# utrs = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_5utr, [], nil, :type => :flat, :namespace => $namespace)
|
738
|
+
#
|
739
|
+
# File.open(t.name, 'w') do |f|
|
740
|
+
# f.puts "#: :type=:single#cast=to_i"
|
741
|
+
# f.puts "#Ensembl Transcript ID\t5' UTR Length"
|
742
|
+
# utrs.each do |seq,trans|
|
743
|
+
# trans.each do |tran|
|
744
|
+
# f.puts [tran, seq.length] * "\t" if seq =~ /^[ACTG]+$/
|
745
|
+
# end
|
746
|
+
# end
|
747
|
+
# end
|
748
|
+
#end
|
749
|
+
|
750
|
+
#file 'transcript_sequence' do |t|
|
751
|
+
# sequences = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_sequence, [], nil, :type => :flat, :namespace => $namespace)
|
752
|
+
#
|
753
|
+
# File.open(t.name, 'w') do |f|
|
754
|
+
# f.puts "#: :type=:single"
|
755
|
+
# f.puts "#Ensembl Transcript ID\tTranscript Sequence"
|
756
|
+
# sequences.each do |seq, genes|
|
757
|
+
# genes.each do |gene|
|
758
|
+
# f.write gene
|
759
|
+
# f.write "\t"
|
760
|
+
# f.write seq
|
761
|
+
# f.write "\n"
|
762
|
+
# end
|
763
|
+
# end
|
764
|
+
# end
|
765
|
+
#end
|
766
|
+
#file 'protein_sequence' => 'chromosomes' do |t|
|
767
|
+
# #chromosomes = TSV.open(t.prerequisites.first).keys
|
768
|
+
# #sequences = BioMart.tsv($biomart_db, $biomart_ensembl_protein, $biomart_protein_sequence, [], nil, :type => :flat, :namespace => $namespace, :chunk_filter => ['chromosome_name', chromosomes])
|
769
|
+
# sequences = BioMart.tsv($biomart_db, $biomart_ensembl_protein, $biomart_protein_sequence, [], nil, :type => :flat, :namespace => $namespace)
|
770
|
+
#
|
771
|
+
# File.open(t.name, 'w') do |f|
|
772
|
+
# f.puts "#: :type=:single"
|
773
|
+
# f.puts "#Ensembl Protein ID\tProtein Sequence"
|
774
|
+
# sequences.each do |seq, genes|
|
775
|
+
# genes.each do |gene|
|
776
|
+
# f.write gene
|
777
|
+
# f.write "\t"
|
778
|
+
# f.write seq
|
779
|
+
# f.write "\n"
|
780
|
+
# end
|
781
|
+
# end
|
782
|
+
# end
|
783
|
+
#end
|
784
|
+
|
785
|
+
|
786
|
+
|