rbbt-sources 1.2.0 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/etc/biomart/missing_in_archive +11 -0
- data/lib/rbbt/sources/COSMIC.rb +47 -4
- data/lib/rbbt/sources/HPRD.rb +23 -0
- data/lib/rbbt/sources/InterPro.rb +98 -8
- data/lib/rbbt/sources/NCI.rb +7 -5
- data/lib/rbbt/sources/PSI_MI.rb +41 -0
- data/lib/rbbt/sources/STITCH.rb +92 -0
- data/lib/rbbt/sources/barcode.rb +0 -3
- data/lib/rbbt/sources/biomart.rb +3 -3
- data/lib/rbbt/sources/dbSNP.rb +100 -0
- data/lib/rbbt/sources/ensembl_ftp.rb +79 -0
- data/lib/rbbt/sources/entrez.rb +2 -2
- data/lib/rbbt/sources/genomes1000.rb +45 -0
- data/lib/rbbt/sources/go.rb +16 -4
- data/lib/rbbt/sources/organism.rb +80 -12
- data/lib/rbbt/sources/pfam.rb +63 -3
- data/lib/rbbt/sources/pubmed.rb +10 -3
- data/lib/rbbt/sources/reactome.rb +82 -0
- data/lib/rbbt/sources/tfacts.rb +37 -36
- data/lib/rbbt/sources/uniprot.rb +25 -23
- data/share/Ensembl/release_dates +18 -0
- data/share/install/Genomes1000/Rakefile +15 -0
- data/share/install/JoChem/Rakefile +11 -3
- data/share/install/NCI/Rakefile +54 -16
- data/share/install/Organism/Hsa/Rakefile +3 -2
- data/share/install/Organism/Rno/Rakefile +1 -2
- data/share/install/Organism/Sce/Rakefile +43 -45
- data/share/install/Organism/organism_helpers.rb +360 -96
- data/share/install/STITCH/Rakefile +0 -0
- data/test/rbbt/sources/test_organism.rb +26 -7
- data/test/rbbt/sources/test_pubmed.rb +5 -0
- metadata +94 -97
- data/share/install/InterPro/Rakefile +0 -29
@@ -1,4 +1,6 @@
|
|
1
1
|
require 'net/ftp'
|
2
|
+
require 'rbbt/sources/ensembl_ftp'
|
3
|
+
|
2
4
|
|
3
5
|
$biomart_ensembl_gene = ['Ensembl Gene ID', 'ensembl_gene_id']
|
4
6
|
$biomart_ensembl_protein = ['Ensembl Protein ID', 'ensembl_peptide_id']
|
@@ -62,6 +64,10 @@ $biomart_pfam= [
|
|
62
64
|
["Pfam Domain", 'pfam'],
|
63
65
|
]
|
64
66
|
|
67
|
+
$biomart_gene_biotype= [
|
68
|
+
["Biotype", 'gene_biotype'],
|
69
|
+
]
|
70
|
+
|
65
71
|
$biomart_exons = [
|
66
72
|
$biomart_ensembl_gene,
|
67
73
|
['Exon Strand','strand'],
|
@@ -71,6 +77,10 @@ $biomart_exons = [
|
|
71
77
|
|
72
78
|
#{{{ Rules
|
73
79
|
|
80
|
+
file 'entrez_taxids' do |t|
|
81
|
+
File.open(t.name, 'w') do |f| f.write $taxs * "\n" end
|
82
|
+
end
|
83
|
+
|
74
84
|
file 'scientific_name' do |t|
|
75
85
|
File.open(t.name, 'w') do |f| f.write $scientific_name end
|
76
86
|
end
|
@@ -108,7 +118,7 @@ file 'identifiers' do |t|
|
|
108
118
|
end
|
109
119
|
end
|
110
120
|
|
111
|
-
entrez_synonyms = Rbbt.share.databases.entrez.gene_info.tsv :grep => $taxs.collect{|tax| "^#{tax}"}, :key_field => 1, :fields => 4
|
121
|
+
entrez_synonyms = Rbbt.share.databases.entrez.gene_info.find.tsv :grep => $taxs.collect{|tax| "^#{tax}"}, :key_field => 1, :fields => [4]
|
112
122
|
entrez_synonyms.key_field = "Entrez Gene ID"
|
113
123
|
entrez_synonyms.fields = ["Entrez Gene Name Synonyms"]
|
114
124
|
|
@@ -174,34 +184,6 @@ file 'transcripts' => 'gene_positions' do |t|
|
|
174
184
|
File.open(t.name, 'w') do |f| f.puts transcripts end
|
175
185
|
end
|
176
186
|
|
177
|
-
file 'transcript_3utr' do |t|
|
178
|
-
utrs = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_3utr, [], nil, :type => :flat, :namespace => $namespace)
|
179
|
-
|
180
|
-
File.open(t.name, 'w') do |f|
|
181
|
-
f.puts "#: :type=:single#cast=to_i"
|
182
|
-
f.puts "#Ensembl Transcript ID\t3' UTR Length"
|
183
|
-
utrs.each do |seq,trans|
|
184
|
-
trans.each do |tran|
|
185
|
-
f.puts [tran, seq.length] * "\t" if seq =~ /^[ACTG]+$/
|
186
|
-
end
|
187
|
-
end
|
188
|
-
end
|
189
|
-
end
|
190
|
-
|
191
|
-
file 'transcript_5utr' do |t|
|
192
|
-
utrs = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_5utr, [], nil, :type => :flat, :namespace => $namespace)
|
193
|
-
|
194
|
-
File.open(t.name, 'w') do |f|
|
195
|
-
f.puts "#: :type=:single#cast=to_i"
|
196
|
-
f.puts "#Ensembl Transcript ID\t5' UTR Length"
|
197
|
-
utrs.each do |seq,trans|
|
198
|
-
trans.each do |tran|
|
199
|
-
f.puts [tran, seq.length] * "\t" if seq =~ /^[ACTG]+$/
|
200
|
-
end
|
201
|
-
end
|
202
|
-
end
|
203
|
-
end
|
204
|
-
|
205
187
|
file 'gene_positions' do |t|
|
206
188
|
sequences = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_gene_positions, [])
|
207
189
|
|
@@ -225,25 +207,6 @@ file 'gene_sequence' do |t|
|
|
225
207
|
end
|
226
208
|
end
|
227
209
|
|
228
|
-
file 'protein_sequence' => 'chromosomes' do |t|
|
229
|
-
#chromosomes = TSV.open(t.prerequisites.first).keys
|
230
|
-
#sequences = BioMart.tsv($biomart_db, $biomart_ensembl_protein, $biomart_protein_sequence, [], nil, :type => :flat, :namespace => $namespace, :chunk_filter => ['chromosome_name', chromosomes])
|
231
|
-
sequences = BioMart.tsv($biomart_db, $biomart_ensembl_protein, $biomart_protein_sequence, [], nil, :type => :flat, :namespace => $namespace)
|
232
|
-
|
233
|
-
File.open(t.name, 'w') do |f|
|
234
|
-
f.puts "#: :type=:single"
|
235
|
-
f.puts "#Ensembl Protein ID\tProtein Sequence"
|
236
|
-
sequences.each do |seq, genes|
|
237
|
-
genes.each do |gene|
|
238
|
-
f.write gene
|
239
|
-
f.write "\t"
|
240
|
-
f.write seq
|
241
|
-
f.write "\n"
|
242
|
-
end
|
243
|
-
end
|
244
|
-
end
|
245
|
-
end
|
246
|
-
|
247
210
|
file 'exons' => 'gene_positions' do |t|
|
248
211
|
exons = BioMart.tsv($biomart_db, $biomart_ensembl_exon, $biomart_exons, [], nil, :merge => false, :type => :list, :namespace => $namespace)
|
249
212
|
exons.attach TSV.open('gene_positions'), :fields => ["Chromosome Name"]
|
@@ -264,18 +227,6 @@ file 'exon_phase' do |t|
|
|
264
227
|
end
|
265
228
|
|
266
229
|
|
267
|
-
#file 'transcript_phase' do |t|
|
268
|
-
# tsv = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["Phase"], :type => :single, :cast => :to_i)
|
269
|
-
#
|
270
|
-
# transcript_cds_start = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, [['CDNA Start','cds_start']], [], nil, :type => :flat, :namespace => $namespace)
|
271
|
-
# transcript_cds_start.through do |transcript, values|
|
272
|
-
# phase = values.compact.reject{|p| p.empty?}.select{|p| p == "1" or p == "2"}.first
|
273
|
-
# tsv[transcript] = phase.to_i unless phase.nil?
|
274
|
-
# end
|
275
|
-
#
|
276
|
-
# File.open(t.name, 'w') do |f| f.puts tsv end
|
277
|
-
#end
|
278
|
-
|
279
230
|
file 'transcript_phase' => ['exon_phase', 'transcript_exons'] do |t|
|
280
231
|
tsv = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["phase"], :type => :single, :cast => :to_i)
|
281
232
|
|
@@ -306,28 +257,10 @@ file 'transcript_phase' => ['exon_phase', 'transcript_exons'] do |t|
|
|
306
257
|
end
|
307
258
|
|
308
259
|
|
309
|
-
file 'transcript_sequence' do |t|
|
310
|
-
sequences = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_sequence, [], nil, :type => :flat, :namespace => $namespace)
|
311
|
-
|
312
|
-
File.open(t.name, 'w') do |f|
|
313
|
-
f.puts "#: :type=:single"
|
314
|
-
f.puts "#Ensembl Transcript ID\tTranscript Sequence"
|
315
|
-
sequences.each do |seq, genes|
|
316
|
-
genes.each do |gene|
|
317
|
-
f.write gene
|
318
|
-
f.write "\t"
|
319
|
-
f.write seq
|
320
|
-
f.write "\n"
|
321
|
-
end
|
322
|
-
end
|
323
|
-
end
|
324
|
-
end
|
325
|
-
|
326
|
-
|
327
260
|
#{{{ Variations
|
328
261
|
|
329
262
|
$biomart_variation_id = ["SNP ID", "refsnp_id"]
|
330
|
-
$biomart_variation_position = [["Chromosome Name", "chr_name"], ["Chromosome Start", "chrom_start"]]
|
263
|
+
$biomart_variation_position = [["Chromosome Name", "chr_name"], ["Chromosome Start", "chrom_start"], ["Variant Alleles", "allele"]]
|
331
264
|
|
332
265
|
file 'germline_variations' do |t|
|
333
266
|
BioMart.tsv($biomart_db_germline_variation, $biomart_variation_id, $biomart_variation_position, [], nil, :keep_empty => true, :type => :list, :filename => t.name, :namespace => $namespace)
|
@@ -357,7 +290,7 @@ def coding_transcripts_for_exon(exon, exon_transcripts, transcript_info)
|
|
357
290
|
[]
|
358
291
|
end
|
359
292
|
|
360
|
-
transcripts.
|
293
|
+
transcripts.reject{|transcript| transcript_info[transcript].first.empty?}
|
361
294
|
end
|
362
295
|
|
363
296
|
def exon_offset_in_transcript(exon, transcript, exons, transcript_exons)
|
@@ -420,6 +353,10 @@ file 'gene_go' do |t|
|
|
420
353
|
if File.basename(FileUtils.pwd) =~ /^[a-z]{3}([0-9]{4})$/i and $1.to_i <= 2009
|
421
354
|
goterms = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_go_2009, [], nil, :type => :double, :namespace => $namespace)
|
422
355
|
|
356
|
+
goterms.each do |key, values|
|
357
|
+
values.each do |list| list.uniq! end
|
358
|
+
end
|
359
|
+
|
423
360
|
goterms.add_field "GO ID" do |key, values|
|
424
361
|
values.flatten.compact.reject{|go| go.empty?}
|
425
362
|
end
|
@@ -453,11 +390,48 @@ file 'gene_go_bp' => 'gene_go' do |t|
|
|
453
390
|
File.open(t.name, 'w') do |f| f.puts gene_go.slice "GO ID" end
|
454
391
|
end
|
455
392
|
|
393
|
+
file 'gene_go_cc' => 'gene_go' do |t|
|
394
|
+
gene_go = TSV.open(t.prerequisites.first)
|
395
|
+
|
396
|
+
gene_go.monitor = true
|
397
|
+
gene_go.process "GO ID" do |key, go_id, values|
|
398
|
+
clean = values.zip_fields.select do |id, type|
|
399
|
+
type == "cellular_component"
|
400
|
+
end
|
401
|
+
clean.collect{|id, type| id}
|
402
|
+
end
|
403
|
+
|
404
|
+
|
405
|
+
File.open(t.name, 'w') do |f| f.puts gene_go.slice "GO ID" end
|
406
|
+
end
|
407
|
+
|
408
|
+
file 'gene_go_mf' => 'gene_go' do |t|
|
409
|
+
gene_go = TSV.open(t.prerequisites.first)
|
410
|
+
|
411
|
+
gene_go.monitor = true
|
412
|
+
gene_go.process "GO ID" do |key, go_id, values|
|
413
|
+
clean = values.zip_fields.select do |id, type|
|
414
|
+
type == "molecular_function"
|
415
|
+
end
|
416
|
+
clean.collect{|id, type| id}
|
417
|
+
end
|
418
|
+
|
419
|
+
|
420
|
+
File.open(t.name, 'w') do |f| f.puts gene_go.slice "GO ID" end
|
421
|
+
end
|
422
|
+
|
423
|
+
|
424
|
+
|
425
|
+
file 'gene_biotype' do |t|
|
426
|
+
biotype = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_gene_biotype, [], nil, :type => :single, :namespace => $namespace)
|
427
|
+
|
428
|
+
File.open(t.name, 'w') do |f| f.puts biotype end
|
429
|
+
end
|
456
430
|
|
457
431
|
file 'gene_pfam' do |t|
|
458
|
-
|
432
|
+
pfam = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_pfam, [], nil, :type => :double, :namespace => $namespace)
|
459
433
|
|
460
|
-
File.open(t.name, 'w') do |f| f.puts
|
434
|
+
File.open(t.name, 'w') do |f| f.puts pfam end
|
461
435
|
end
|
462
436
|
|
463
437
|
file 'chromosomes' do |t|
|
@@ -471,15 +445,7 @@ rule /^chromosome_.*/ do |t|
|
|
471
445
|
|
472
446
|
archive = File.basename(FileUtils.pwd) =~ /^([a-z]{3}[0-9]{4})$/i ? $1 : nil
|
473
447
|
|
474
|
-
release =
|
475
|
-
when "may2009"
|
476
|
-
"release-54"
|
477
|
-
when "jun2011"
|
478
|
-
"release-64"
|
479
|
-
when nil
|
480
|
-
Open.read("http://www.ensembl.org/info/data/ftp/index.html", :nocache => true).match(/pub\/(\w+-\d+)\/fasta/)[1]
|
481
|
-
end
|
482
|
-
|
448
|
+
release = Ensembl.releases[archive]
|
483
449
|
|
484
450
|
ftp = Net::FTP.new("ftp.ensembl.org")
|
485
451
|
ftp.login
|
@@ -488,13 +454,16 @@ rule /^chromosome_.*/ do |t|
|
|
488
454
|
ftp.chdir('dna')
|
489
455
|
file = ftp.nlst.select{|file| file =~ /chromosome\.#{ chr }\.fa/}.first
|
490
456
|
|
491
|
-
raise "Fasta file for chromosome not found: #{ chr } - #{ archive }, #{ release }" if file.nil?
|
457
|
+
raise "Fasta file for chromosome not found: '#{ chr }' - #{ archive }, #{ release }" if file.nil?
|
492
458
|
|
493
459
|
Log.debug("Downloading chromosome sequence: #{ file }")
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
460
|
+
|
461
|
+
Misc.lock t.name + '.rake' do
|
462
|
+
TmpFile.with_file do |tmpfile|
|
463
|
+
ftp.getbinaryfile(file, tmpfile)
|
464
|
+
Open.write(t.name, Open.read(tmpfile, :gzip => true).sub(/^>.*\n/,'').gsub(/\s/,''))
|
465
|
+
ftp.close
|
466
|
+
end
|
498
467
|
end
|
499
468
|
end
|
500
469
|
|
@@ -520,3 +489,298 @@ rule /[a-z]{3}[0-9]{4}\/.*/i do |t|
|
|
520
489
|
BioMart.unset_archive
|
521
490
|
end
|
522
491
|
end
|
492
|
+
|
493
|
+
|
494
|
+
|
495
|
+
|
496
|
+
|
497
|
+
#{{{ Special rules
|
498
|
+
require 'bio'
|
499
|
+
|
500
|
+
file 'transcript_sequence' => ["exons", "transcript_exons"] do |t|
|
501
|
+
exon_info = TSV.open('exons', :type => :list, :fields => ["Exon Strand", "Exon Chr Start", "Exon Chr End", "Chromosome Name"], :unnamed => true)
|
502
|
+
|
503
|
+
chr_transcript_ranges ||= {}
|
504
|
+
transcript_strand = {}
|
505
|
+
|
506
|
+
TSV.open('transcript_exons', :unnamed => true).through do |transcript, values|
|
507
|
+
transcript_ranges = []
|
508
|
+
|
509
|
+
exons = Misc.zip_fields(values).sort_by{|exon,rank| rank.to_i}.collect{|exon,rank| exon}
|
510
|
+
|
511
|
+
chr = nil
|
512
|
+
strand = nil
|
513
|
+
exons.each do |exon|
|
514
|
+
strand, start, eend, chr = exon_info[exon]
|
515
|
+
start = start.to_i
|
516
|
+
eend = eend.to_i
|
517
|
+
transcript_ranges << [start, eend]
|
518
|
+
end
|
519
|
+
|
520
|
+
transcript_strand[transcript] = strand
|
521
|
+
|
522
|
+
chr_transcript_ranges[chr] ||= {}
|
523
|
+
chr_transcript_ranges[chr][transcript] ||= transcript_ranges
|
524
|
+
end
|
525
|
+
|
526
|
+
transcript_sequence = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["Sequence"], :type => :single)
|
527
|
+
chr_transcript_ranges.each do |chr, transcript_ranges|
|
528
|
+
|
529
|
+
begin
|
530
|
+
p = Organism.root
|
531
|
+
p.replace File.expand_path("./chromosome_#{chr}")
|
532
|
+
p.sub!(/.*\/.rbbt\//,'')
|
533
|
+
p = Path.setup(p, 'rbbt', Organism)
|
534
|
+
chr_str = p.produce.read
|
535
|
+
rescue Exception
|
536
|
+
Log.debug("Chr #{ chr } failed (#{transcript_ranges.length} transcripts not covered): #{$!.message}")
|
537
|
+
next
|
538
|
+
end
|
539
|
+
|
540
|
+
transcript_ranges.each do |transcript, ranges|
|
541
|
+
strand = transcript_strand[transcript]
|
542
|
+
ranges = ranges.reverse if strand == "-1"
|
543
|
+
|
544
|
+
sequence = ranges.inject(""){|acc, range|
|
545
|
+
start, eend = range
|
546
|
+
raise "Chromosome #{ chr } is too short (#{eend - chr_str.length } bases) for transcript #{ transcript } ([#{ start }, #{ eend }])." if chr_str.length < eend
|
547
|
+
acc << chr_str[start-1..eend-1]
|
548
|
+
}
|
549
|
+
|
550
|
+
sequence = Bio::Sequence::NA.new(sequence).complement.upcase if strand == "-1"
|
551
|
+
transcript_sequence[transcript] = sequence
|
552
|
+
end
|
553
|
+
end
|
554
|
+
|
555
|
+
Misc.lock t.name + '.rake' do
|
556
|
+
Open.write(t.name, transcript_sequence.to_s)
|
557
|
+
end
|
558
|
+
end
|
559
|
+
|
560
|
+
file 'transcript_5utr' => ["exons", "transcript_exons", "transcripts"] do |t|
|
561
|
+
path = File.expand_path(t.name)
|
562
|
+
dirname = File.dirname(path)
|
563
|
+
organism = File.basename(dirname)
|
564
|
+
|
565
|
+
if organism =~ /[a-z]{3}20[0-9]{2}/
|
566
|
+
build = organism
|
567
|
+
organism = File.basename(File.dirname(dirname))
|
568
|
+
organism = File.join(organism, build)
|
569
|
+
end
|
570
|
+
|
571
|
+
translation = Ensembl::FTP.ensembl_tsv(organism, 'translation', 'transcript_id', %w(seq_start start_exon_id seq_end end_exon_id), :type => :list, :unmamed => true)
|
572
|
+
|
573
|
+
if Ensembl::FTP.has_table?(organism, 'exon_stable_id')
|
574
|
+
exon2ensembl = Ensembl::FTP.ensembl_tsv(organism, 'exon_stable_id', 'exon_id', ['stable_id'], :type => :single, :unnamed => true)
|
575
|
+
else
|
576
|
+
exon2ensembl = Ensembl::FTP.ensembl_tsv(organism, 'exon', 'exon_id', ['stable_id'], :type => :single, :unnamed => true)
|
577
|
+
end
|
578
|
+
|
579
|
+
if Ensembl::FTP.has_table?(organism, 'exon_stable_id')
|
580
|
+
transcript2ensembl = Ensembl::FTP.ensembl_tsv(organism, 'transcript_stable_id', 'transcript_id', ['stable_id'], :type => :single, :unnamed => true)
|
581
|
+
else
|
582
|
+
transcript2ensembl = Ensembl::FTP.ensembl_tsv(organism, 'transcript', 'transcript_id', ['stable_id'], :type => :single, :unnamed => true)
|
583
|
+
end
|
584
|
+
|
585
|
+
transcript_protein = TSV.open("./transcripts", :key_field => "Ensembl Transcript ID", :fields => ["Ensembl Protein ID"], :type => :single, :unmamed => true)
|
586
|
+
transcript_exons = TSV.open("./transcript_exons", :unmamed => true)
|
587
|
+
exon_ranges = TSV.open("./exons",:fields => ["Exon Chr Start", "Exon Chr End"], :cast => :to_i, :unmamed => true)
|
588
|
+
|
589
|
+
transcript_utr5 = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["5' UTR Length"], :cast => :to_i, :type => :single)
|
590
|
+
transcript_utr3 = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["3' UTR Length"], :cast => :to_i, :type => :single)
|
591
|
+
|
592
|
+
translation.through do |transcript_id, values|
|
593
|
+
start, start_exon, eend, eend_exon = values
|
594
|
+
|
595
|
+
transcript = transcript2ensembl[transcript_id]
|
596
|
+
protein = transcript_protein[transcript]
|
597
|
+
|
598
|
+
start_exon = exon2ensembl[start_exon]
|
599
|
+
eend_exon = exon2ensembl[eend_exon]
|
600
|
+
|
601
|
+
exon_and_rank = Hash[*Misc.zip_fields(transcript_exons[transcript]).flatten]
|
602
|
+
|
603
|
+
start_exon_rank = exon_and_rank[start_exon].to_i
|
604
|
+
skipped_exons = exon_and_rank.select{|exon,rank| rank.to_i < start_exon_rank}.collect{|exon,rank| exon }
|
605
|
+
skipped_exon_bases = skipped_exons.inject(0){|acc,exon| exon_start, exon_eend = exon_ranges[exon]; acc += exon_eend - exon_start + 1}
|
606
|
+
|
607
|
+
utr5 = skipped_exon_bases + start.to_i - 1
|
608
|
+
transcript_utr5[transcript] = utr5
|
609
|
+
|
610
|
+
eend_exon_rank = exon_and_rank[eend_exon].to_i
|
611
|
+
extra_exons = exon_and_rank.select{|exon,rank| rank.to_i >= eend_exon_rank}.collect{|exon,rank| exon }
|
612
|
+
extra_exon_bases = extra_exons.inject(0){|acc,exon| exon_start, exon_eend = exon_ranges[exon]; acc += exon_eend - exon_start + 1}
|
613
|
+
|
614
|
+
utr3 = extra_exon_bases - eend.to_i
|
615
|
+
transcript_utr3[transcript] = utr3
|
616
|
+
end
|
617
|
+
|
618
|
+
Misc.lock t.name + '.rake' do
|
619
|
+
Open.write(t.name, transcript_utr5.to_s)
|
620
|
+
Open.write(t.name.sub('transcript_5utr', 'transcript_3utr'), transcript_utr3.to_s)
|
621
|
+
end
|
622
|
+
end
|
623
|
+
|
624
|
+
file 'transcript_3utr' => ["transcript_5utr"] do |t|
|
625
|
+
end
|
626
|
+
|
627
|
+
file 'protein_sequence' => ["transcripts", "transcript_5utr", "transcript_3utr", "transcript_sequence"] do |t|
|
628
|
+
transcript_5utr = TSV.open(File.expand_path('./transcript_5utr'), :unnamed => true)
|
629
|
+
transcript_3utr = TSV.open(File.expand_path('./transcript_3utr'), :unnamed => true)
|
630
|
+
transcript_sequence = TSV.open(File.expand_path('./transcript_sequence'), :unnamed => true)
|
631
|
+
transcript_protein = TSV.open(File.expand_path('./transcripts'), :fields => ["Ensembl Protein ID"], :type => :single, :unnamed => true)
|
632
|
+
|
633
|
+
|
634
|
+
protein_sequence = TSV.setup({}, :key_field => "Ensembl Protein ID", :fields => ["Sequence"], :type => :single)
|
635
|
+
transcript_sequence.through do |transcript, sequence|
|
636
|
+
protein = transcript_protein[transcript]
|
637
|
+
next if protein.nil? or protein.empty?
|
638
|
+
utr5 = transcript_5utr[transcript]
|
639
|
+
utr3 = transcript_3utr[transcript]
|
640
|
+
psequence = Bio::Sequence::NA.new(sequence[utr5..sequence.length-utr3-1]).translate
|
641
|
+
protein_sequence[protein]=psequence
|
642
|
+
end
|
643
|
+
|
644
|
+
Misc.lock t.name + '.rake' do
|
645
|
+
Open.write(t.name, protein_sequence.to_s)
|
646
|
+
end
|
647
|
+
end
|
648
|
+
|
649
|
+
#{{{ OLD
|
650
|
+
|
651
|
+
#file 'transcript_phase' do |t|
|
652
|
+
# tsv = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["Phase"], :type => :single, :cast => :to_i)
|
653
|
+
#
|
654
|
+
# transcript_cds_start = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, [['CDNA Start','cds_start']], [], nil, :type => :flat, :namespace => $namespace)
|
655
|
+
# transcript_cds_start.through do |transcript, values|
|
656
|
+
# phase = values.compact.reject{|p| p.empty?}.select{|p| p == "1" or p == "2"}.first
|
657
|
+
# tsv[transcript] = phase.to_i unless phase.nil?
|
658
|
+
# end
|
659
|
+
#
|
660
|
+
# File.open(t.name, 'w') do |f| f.puts tsv end
|
661
|
+
#end
|
662
|
+
#
|
663
|
+
#file 'transcript_3utr' do |t|
|
664
|
+
# utrs = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_3utr, [], nil, :type => :flat, :namespace => $namespace)
|
665
|
+
#
|
666
|
+
# File.open(t.name, 'w') do |f|
|
667
|
+
# f.puts "#: :type=:single#cast=to_i"
|
668
|
+
# f.puts "#Ensembl Transcript ID\t3' UTR Length"
|
669
|
+
# utrs.each do |seq,trans|
|
670
|
+
# trans.each do |tran|
|
671
|
+
# f.puts [tran, seq.length] * "\t" if seq =~ /^[ACTG]+$/
|
672
|
+
# end
|
673
|
+
# end
|
674
|
+
# end
|
675
|
+
#end
|
676
|
+
#
|
677
|
+
#file 'transcript_5utr' do |t|
|
678
|
+
# utrs = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_5utr, [], nil, :type => :flat, :namespace => $namespace)
|
679
|
+
#
|
680
|
+
# File.open(t.name, 'w') do |f|
|
681
|
+
# f.puts "#: :type=:single#cast=to_i"
|
682
|
+
# f.puts "#Ensembl Transcript ID\t5' UTR Length"
|
683
|
+
# utrs.each do |seq,trans|
|
684
|
+
# trans.each do |tran|
|
685
|
+
# f.puts [tran, seq.length] * "\t" if seq =~ /^[ACTG]+$/
|
686
|
+
# end
|
687
|
+
# end
|
688
|
+
# end
|
689
|
+
#end
|
690
|
+
|
691
|
+
|
692
|
+
|
693
|
+
#file 'transcript_sequence' do |t|
|
694
|
+
# sequences = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_sequence, [], nil, :type => :flat, :namespace => $namespace)
|
695
|
+
#
|
696
|
+
# File.open(t.name, 'w') do |f|
|
697
|
+
# f.puts "#: :type=:single"
|
698
|
+
# f.puts "#Ensembl Transcript ID\tTranscript Sequence"
|
699
|
+
# sequences.each do |seq, genes|
|
700
|
+
# genes.each do |gene|
|
701
|
+
# f.write gene
|
702
|
+
# f.write "\t"
|
703
|
+
# f.write seq
|
704
|
+
# f.write "\n"
|
705
|
+
# end
|
706
|
+
# end
|
707
|
+
# end
|
708
|
+
#end
|
709
|
+
|
710
|
+
#file 'transcript_phase' do |t|
|
711
|
+
# tsv = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["Phase"], :type => :single, :cast => :to_i)
|
712
|
+
#
|
713
|
+
# transcript_cds_start = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, [['CDNA Start','cds_start']], [], nil, :type => :flat, :namespace => $namespace)
|
714
|
+
# transcript_cds_start.through do |transcript, values|
|
715
|
+
# phase = values.compact.reject{|p| p.empty?}.select{|p| p == "1" or p == "2"}.first
|
716
|
+
# tsv[transcript] = phase.to_i unless phase.nil?
|
717
|
+
# end
|
718
|
+
#
|
719
|
+
# File.open(t.name, 'w') do |f| f.puts tsv end
|
720
|
+
#end
|
721
|
+
#
|
722
|
+
#file 'transcript_3utr' do |t|
|
723
|
+
# utrs = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_3utr, [], nil, :type => :flat, :namespace => $namespace)
|
724
|
+
#
|
725
|
+
# File.open(t.name, 'w') do |f|
|
726
|
+
# f.puts "#: :type=:single#cast=to_i"
|
727
|
+
# f.puts "#Ensembl Transcript ID\t3' UTR Length"
|
728
|
+
# utrs.each do |seq,trans|
|
729
|
+
# trans.each do |tran|
|
730
|
+
# f.puts [tran, seq.length] * "\t" if seq =~ /^[ACTG]+$/
|
731
|
+
# end
|
732
|
+
# end
|
733
|
+
# end
|
734
|
+
#end
|
735
|
+
#
|
736
|
+
#file 'transcript_5utr' do |t|
|
737
|
+
# utrs = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_5utr, [], nil, :type => :flat, :namespace => $namespace)
|
738
|
+
#
|
739
|
+
# File.open(t.name, 'w') do |f|
|
740
|
+
# f.puts "#: :type=:single#cast=to_i"
|
741
|
+
# f.puts "#Ensembl Transcript ID\t5' UTR Length"
|
742
|
+
# utrs.each do |seq,trans|
|
743
|
+
# trans.each do |tran|
|
744
|
+
# f.puts [tran, seq.length] * "\t" if seq =~ /^[ACTG]+$/
|
745
|
+
# end
|
746
|
+
# end
|
747
|
+
# end
|
748
|
+
#end
|
749
|
+
|
750
|
+
#file 'transcript_sequence' do |t|
|
751
|
+
# sequences = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_sequence, [], nil, :type => :flat, :namespace => $namespace)
|
752
|
+
#
|
753
|
+
# File.open(t.name, 'w') do |f|
|
754
|
+
# f.puts "#: :type=:single"
|
755
|
+
# f.puts "#Ensembl Transcript ID\tTranscript Sequence"
|
756
|
+
# sequences.each do |seq, genes|
|
757
|
+
# genes.each do |gene|
|
758
|
+
# f.write gene
|
759
|
+
# f.write "\t"
|
760
|
+
# f.write seq
|
761
|
+
# f.write "\n"
|
762
|
+
# end
|
763
|
+
# end
|
764
|
+
# end
|
765
|
+
#end
|
766
|
+
#file 'protein_sequence' => 'chromosomes' do |t|
|
767
|
+
# #chromosomes = TSV.open(t.prerequisites.first).keys
|
768
|
+
# #sequences = BioMart.tsv($biomart_db, $biomart_ensembl_protein, $biomart_protein_sequence, [], nil, :type => :flat, :namespace => $namespace, :chunk_filter => ['chromosome_name', chromosomes])
|
769
|
+
# sequences = BioMart.tsv($biomart_db, $biomart_ensembl_protein, $biomart_protein_sequence, [], nil, :type => :flat, :namespace => $namespace)
|
770
|
+
#
|
771
|
+
# File.open(t.name, 'w') do |f|
|
772
|
+
# f.puts "#: :type=:single"
|
773
|
+
# f.puts "#Ensembl Protein ID\tProtein Sequence"
|
774
|
+
# sequences.each do |seq, genes|
|
775
|
+
# genes.each do |gene|
|
776
|
+
# f.write gene
|
777
|
+
# f.write "\t"
|
778
|
+
# f.write seq
|
779
|
+
# f.write "\n"
|
780
|
+
# end
|
781
|
+
# end
|
782
|
+
# end
|
783
|
+
#end
|
784
|
+
|
785
|
+
|
786
|
+
|