rbbt-sources 0.2.2 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/rbbt/sources/COSTART.rb +2 -3
- data/lib/rbbt/sources/CTCAE.rb +1 -1
- data/lib/rbbt/sources/biomart.rb +32 -32
- data/lib/rbbt/sources/entrez.rb +14 -10
- data/lib/rbbt/sources/go.rb +9 -8
- data/lib/rbbt/sources/organism.rb +36 -10
- data/lib/rbbt/sources/organism/sequence.rb +337 -0
- data/lib/rbbt/sources/polysearch.rb +5 -5
- data/share/install/Organism/Hsa/Rakefile +7 -68
- data/share/install/Organism/Sce/Rakefile +4 -70
- data/share/install/Organism/organism_helpers.rb +305 -0
- data/share/install/lib/helpers.rb +5 -5
- data/test/rbbt/sources/test_biomart.rb +7 -6
- data/test/rbbt/sources/test_entrez.rb +3 -3
- data/test/rbbt/sources/test_organism.rb +32 -3
- data/test/rbbt/sources/test_pubmed.rb +1 -1
- metadata +7 -6
- data/lib/rbbt/sources/Reactome.rb +0 -16
@@ -1,10 +1,10 @@
|
|
1
1
|
require 'rbbt'
|
2
2
|
|
3
3
|
module Polysearch
|
4
|
-
Rbbt.
|
5
|
-
Rbbt.
|
6
|
-
Rbbt.
|
7
|
-
Rbbt.
|
8
|
-
Rbbt.
|
4
|
+
Rbbt.share.Polysearch.organ.define_as_url 'http://wishart.biology.ualberta.ca/polysearch/include/organ_ID.txt'
|
5
|
+
Rbbt.share.Polysearch.tissue.define_as_url 'http://wishart.biology.ualberta.ca/polysearch/include/tissue_ID.txt'
|
6
|
+
Rbbt.share.Polysearch.location.define_as_url 'http://wishart.biology.ualberta.ca/polysearch/include/subcellular_localization_ID.txt'
|
7
|
+
Rbbt.share.Polysearch.disease.define_as_url 'http://wishart.biology.ualberta.ca/polysearch/include/disease_IDlist.txt'
|
8
|
+
Rbbt.share.Polysearch.drug.define_as_url 'http://wishart.biology.ualberta.ca/polysearch/include/drugnames.txt'
|
9
9
|
end
|
10
10
|
|
@@ -3,11 +3,11 @@ require 'rbbt/sources/biomart'
|
|
3
3
|
require 'rbbt/sources/entrez'
|
4
4
|
require File.join(File.dirname(__FILE__), '../../lib/helpers')
|
5
5
|
|
6
|
-
$taxs = [
|
7
|
-
$
|
8
|
-
|
6
|
+
$taxs = [9606]
|
7
|
+
$scientific_name = "Homo sapiens"
|
8
|
+
|
9
9
|
$biomart_db = 'hsapiens_gene_ensembl'
|
10
|
-
|
10
|
+
|
11
11
|
$biomart_lexicon = [
|
12
12
|
[ 'Associated Gene Name' , "external_gene_id"],
|
13
13
|
[ 'HGNC symbol', "hgnc_symbol" ],
|
@@ -16,7 +16,7 @@ $biomart_lexicon = [
|
|
16
16
|
]
|
17
17
|
|
18
18
|
$biomart_identifiers = [
|
19
|
-
[ '
|
19
|
+
[ 'Entrez Gene ID', "entrezgene"],
|
20
20
|
[ 'Ensembl Protein ID', "ensembl_peptide_id" ],
|
21
21
|
[ 'Associated Gene Name', "external_gene_id" ],
|
22
22
|
[ 'CCDS ID', "ccds" ],
|
@@ -52,66 +52,5 @@ $biomart_identifiers = [
|
|
52
52
|
[ 'Illumina HumanWG 6 v3', 'illumina_humanwg_6_v3' ],
|
53
53
|
]
|
54
54
|
|
55
|
-
$
|
56
|
-
|
57
|
-
['Strand','strand'],
|
58
|
-
['Gene Start','start_position'],
|
59
|
-
['Gene End','end_position'],
|
60
|
-
['Transcript Start','transcript_start'],
|
61
|
-
['Transcript End','transcript_end'],
|
62
|
-
]
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
file 'scientific_name' do |t|
|
67
|
-
File.open(t.name, 'w') do |f| f.puts "Homo sapiens" end
|
68
|
-
end
|
69
|
-
|
70
|
-
file 'lexicon' do |t|
|
71
|
-
lexicon = tsv_file('http://www.genenames.org/cgi-bin/hgnc_downloads.cgi?title=HGNC+output+data&hgnc_dbtag=on&col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_prev_name&col=gd_aliases&col=gd_name_aliases&col=gd_pub_acc_ids&status=Approved&status_opt=2&level=pri&=on&where=&order_by=gd_app_sym_sort&limit=&format=text&submit=submit&.cgifields=&.cgifields=level&.cgifields=chr&.cgifields=status&.cgifields=hgnc_dbtag',
|
72
|
-
"HGNC ID", nil, :flatten => true, :header_hash => '')
|
73
|
-
merge_biomart lexicon, $biomart_db, $biomart_main, $biomart_lexicon, "HGNC ID"
|
74
|
-
|
75
|
-
File.open(t.name, 'w') do |f| f.puts lexicon end
|
76
|
-
end
|
77
|
-
|
78
|
-
file 'identifiers' do |t|
|
79
|
-
identifiers = BioMart.tsv($biomart_db, $biomart_main, $biomart_identifiers)
|
80
|
-
$biomart_identifiers.each do |name, key, prefix|
|
81
|
-
if prefix
|
82
|
-
identifiers.process name do |field, key, values| field.each{|v| v.replace "#{prefix}:#{v}"} end
|
83
|
-
end
|
84
|
-
end
|
85
|
-
|
86
|
-
File.open(t.name, 'w') do |f| f.puts identifiers end
|
87
|
-
end
|
88
|
-
|
89
|
-
file 'gene_go' do |t|
|
90
|
-
url = "http://cvsweb.geneontology.org/cgi-bin/cvsweb.cgi/go/gene-associations/gene_association.goa_human.gz?rev=HEAD"
|
91
|
-
tsv = TSV.new(Open.open(url, :gzip => true), :native => 2, :extra => 4)
|
92
|
-
|
93
|
-
index = TSV.index(Organism::Hsa.identifiers, :persistence => true)
|
94
|
-
new = TSV.new({})
|
95
|
-
tsv.through do |key, values|
|
96
|
-
next if index[key].nil?
|
97
|
-
new_key = index[key].first
|
98
|
-
new[new_key] = values
|
99
|
-
end
|
100
|
-
|
101
|
-
|
102
|
-
new.key_field = "Associated Gene Name"
|
103
|
-
new.fields = ["GO Term"]
|
104
|
-
Open.write(t.name, new.to_s)
|
105
|
-
end
|
106
|
-
|
107
|
-
file 'gene_positions' do |t|
|
108
|
-
BioMart.set_archive('may2009')
|
109
|
-
positions = BioMart.tsv($biomart_db, $biomart_main, $biomart_positions)
|
110
|
-
BioMart.unset_archive
|
111
|
-
|
112
|
-
Open.write(t.name, positions.to_s)
|
113
|
-
end
|
114
|
-
|
115
|
-
task :default => ['name', 'lexicon', 'identifiers', 'gene_positions']
|
116
|
-
|
117
|
-
|
55
|
+
$namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
|
56
|
+
load File.join(File.dirname(__FILE__), '../organism_helpers.rb')
|
@@ -17,9 +17,9 @@ end
|
|
17
17
|
file 'lexicon' do |t|
|
18
18
|
lexicon = tsv_file($url, [$native, 0], [3, 4, 5], :keep_empty => true)
|
19
19
|
|
20
|
-
merge_entrez(lexicon, $taxs, $native, proc{|code| code.sub(/SGD:S0/,'S0') }, proc{|code| code.match(/\
|
20
|
+
lexicon = merge_entrez(lexicon, $taxs, $native, proc{|code| code.sub(/SGD:S0/,'S0') }, proc{|code| code.match(/\tSGD:S0/)})
|
21
21
|
|
22
|
-
merge_biomart(lexicon, $biomart_db, $biomart_main, [['Interpro Description' , "interpro_description"]])
|
22
|
+
lexicon = merge_biomart(lexicon, $biomart_db, $biomart_main, [['Interpro Description' , "interpro_description"]])
|
23
23
|
|
24
24
|
lexicon = lexicon.slice(lexicon.fields - ["Entrez Gene ID"])
|
25
25
|
|
@@ -29,9 +29,9 @@ end
|
|
29
29
|
file 'identifiers' do |t|
|
30
30
|
identifiers = tsv_file($url, [$native, 0], [["Ensembl Gene ID", 3], ["Associated Gene Name",4], ["Associated Gene Name Alias", 5]], :keep_empty => true)
|
31
31
|
|
32
|
-
merge_entrez(identifiers, $taxs, $native, proc{|code| code.sub(/SGD:S0/,'S0') }, proc{|code| code.match(/\
|
32
|
+
identifiers = merge_entrez(identifiers, $taxs, $native, proc{|code| code.sub(/SGD:S0/,'S0') }, proc{|code| code.match(/\tSGD:S0/)})
|
33
33
|
|
34
|
-
merge_biomart(identifiers, $biomart_db, $biomart_main,
|
34
|
+
identifiers = merge_biomart(identifiers, $biomart_db, $biomart_main,
|
35
35
|
[['Associated Gene Name' , "external_gene_id"],
|
36
36
|
['Ensembl Gene ID', "ensembl_gene_id" ],
|
37
37
|
['Ensembl Protein ID', "ensembl_peptide_id" ],
|
@@ -50,69 +50,3 @@ end
|
|
50
50
|
|
51
51
|
task :default => ['name', 'lexicon', 'identifiers']
|
52
52
|
|
53
|
-
#require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
|
54
|
-
#
|
55
|
-
#$name = "Saccharomyces cerevisiae"
|
56
|
-
#
|
57
|
-
#
|
58
|
-
#$native_id = "SGD DB Id"
|
59
|
-
#
|
60
|
-
#$entrez2native = {
|
61
|
-
# :tax => 559292,
|
62
|
-
# :fix => proc{|code| code.sub(/SGD:S0/,'S0') },
|
63
|
-
# :check => proc{|code| code.match(/^S0/)},
|
64
|
-
#}
|
65
|
-
#
|
66
|
-
#$lexicon = {
|
67
|
-
# :file => {
|
68
|
-
# :url => "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature/SGD_features.tab",
|
69
|
-
# :native => 0,
|
70
|
-
# :extra => [4,3,5]
|
71
|
-
# },
|
72
|
-
# :biomart => {
|
73
|
-
# :database => 'scerevisiae_gene_ensembl',
|
74
|
-
# :main => ['Entrez Gene ID', 'entrezgene'],
|
75
|
-
# :extra => [
|
76
|
-
# ['Interpro Description' , "interpro_description"],
|
77
|
-
# ],
|
78
|
-
# :filter => [],
|
79
|
-
# }
|
80
|
-
#
|
81
|
-
#}
|
82
|
-
#
|
83
|
-
#$identifiers = {
|
84
|
-
# :file => {
|
85
|
-
# :url => "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature/SGD_features.tab",
|
86
|
-
# :native => 0,
|
87
|
-
# :extra => [],
|
88
|
-
# },
|
89
|
-
# :biomart => {
|
90
|
-
# :database => 'scerevisiae_gene_ensembl',
|
91
|
-
# :main => ['Entrez Gene ID', 'entrezgene'],
|
92
|
-
# :extra => [
|
93
|
-
# ['Associated Gene Name' , "external_gene_id"],
|
94
|
-
# ['Ensembl Gene ID', "ensembl_gene_id" ],
|
95
|
-
# ['Ensembl Protein ID', "ensembl_peptide_id" ],
|
96
|
-
# ['RefSeq Protein ID' , "refseq_peptide"] ,
|
97
|
-
# ['UniProt/SwissProt ID' , "uniprot_swissprot"] ,
|
98
|
-
# ['UniProt/SwissProt Accession' , "uniprot_swissprot_accession"] ,
|
99
|
-
# ['Protein ID' , "protein_id"] ,
|
100
|
-
# ['EMBL (Genbank) ID' , "embl"] ,
|
101
|
-
# # Affymetrix
|
102
|
-
# ['Affy yeast 2',"affy_yeast_2"],
|
103
|
-
# ['Affy yg s98', "affy_yg_s98"],
|
104
|
-
# ],
|
105
|
-
# :filter => [],
|
106
|
-
# }
|
107
|
-
#}
|
108
|
-
#
|
109
|
-
#$go = {
|
110
|
-
# :url => "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/literature_curation/gene_association.sgd.gz",
|
111
|
-
# :code => 1,
|
112
|
-
# :go => 4,
|
113
|
-
# :pmid => 5,
|
114
|
-
#}
|
115
|
-
#
|
116
|
-
#$query = '"saccharomyces cerevisiae"[All Fields] AND ((("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word]) OR (("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word])) AND hasabstract[text] AND English[lang]'
|
117
|
-
#
|
118
|
-
#
|
@@ -0,0 +1,305 @@
|
|
1
|
+
# BioMart attribute descriptors used by the organism rake tasks below.
# Every descriptor is a two-element array: [column label, BioMart attribute name].

# Identifier descriptors; several of the lists further down embed these same objects.
$biomart_ensembl_gene          = ['Ensembl Gene ID', 'ensembl_gene_id']
$biomart_ensembl_protein       = ['Ensembl Protein ID', 'ensembl_peptide_id']
$biomart_ensembl_exon          = ['Ensembl Exon ID', 'ensembl_exon_id']
$biomart_ensembl_transcript    = ['Ensembl Transcript ID', 'ensembl_transcript_id']
$biomart_somatic_variation_id  = ['Variation ID', 'somatic_reference_id']
$biomart_germline_variation_id = ['Variation ID', 'external_id']

#{{{ Genes

$biomart_gene_positions = [
  ['Chromosome Name', 'chromosome_name'],
  ['Strand', 'strand'],
  ['Gene Start', 'start_position'],
  ['Gene End', 'end_position']
]

$biomart_gene_sequence = [
  ['Gene Sequence', 'gene_exon_intron']
]

#{{{ Transcript

$biomart_gene_transcript = [
  $biomart_ensembl_transcript
]

$biomart_transcript = [
  ['Transcript Start (bp)', 'transcript_start'],
  ['Transcript End (bp)', 'transcript_end'],
  $biomart_ensembl_protein,
  $biomart_ensembl_gene
]

$biomart_transcript_sequence = [
  ['cDNA', 'cdna']
]

$biomart_transcript_3utr = [
  ["3' UTR", '3utr']
]

$biomart_transcript_5utr = [
  ["5' UTR", '5utr']
]

$biomart_protein_sequence = [
  ['Protein Sequence', 'peptide']
]

#{{{ Exons

$biomart_transcript_exons = [
  $biomart_ensembl_exon,
  ['Exon Rank in Transcript', 'rank']
]

$biomart_exons = [
  $biomart_ensembl_gene,
  ['Exon Strand', 'strand'],
  ['Exon Chr Start', 'exon_chrom_start'],
  ['Exon Chr End', 'exon_chrom_end']
]

#{{{ Variations

$biomart_germline_variation_positions = [
  ['Chromosome Location (bp)', 'chromosome_location'],
  ['SNP Chromosome Strand', 'snp_chromosome_strand'],
  ['Transcript location (bp)', 'transcript_location'],
  ['Allele', 'allele'],
  ['Protein Allele', 'peptide_shift'],
  ['CDS Start', 'cds_start_2076'],
  ['CDS End', 'cds_end_2076']
]

$biomart_germline_variations = [
  $biomart_ensembl_gene,
  ['Source', 'source_name'],
  ['Validated', 'validated'],
  ['Consequence Type', 'synonymous_status']
]

# The somatic lists mirror the germline ones with "somatic_"-prefixed attribute names.
$biomart_somatic_variation_positions = [
  ['Chromosome Location (bp)', 'somatic_chromosome_location'],
  ['SNP Chromosome Strand', 'somatic_snp_chromosome_strand'],
  ['Transcript location (bp)', 'somatic_transcript_location'],
  ['Allele', 'somatic_allele'],
  ['Protein Allele', 'somatic_peptide_shift'],
  ['CDS Start', 'somatic_cds_start_2076'],
  ['CDS End', 'somatic_cds_end_2076']
]

$biomart_somatic_variations = [
  $biomart_ensembl_gene,
  ['Source', 'somatic_source_name'],
  ['Validated', 'somatic_validated'],
  ['Consequence Type', 'somatic_synonymous_status']
]
|
98
|
+
|
99
|
+
#{{{ Rules
|
100
|
+
|
101
|
+
# Writes the organism's scientific name (taken from the $scientific_name global
# set by the including Rakefile) into the 'scientific_name' file, without a trailing newline.
file 'scientific_name' do |t|
  File.open(t.name, 'w') { |out| out.write $scientific_name }
end
|
104
|
+
|
105
|
+
# Builds the 'identifiers' file from the BioMart attributes listed in
# $biomart_identifiers, keyed on the Ensembl gene ID descriptor.
file 'identifiers' do |t|
  table = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_identifiers, [])

  # A descriptor may carry an optional third element; when present, every value
  # of that column is rewritten in place as "<prefix>:<value>".
  $biomart_identifiers.each do |name, _attribute, prefix|
    next unless prefix

    table.process(name) do |field, _key, _values|
      field.each { |value| value.replace "#{prefix}:#{value}" }
    end
  end

  File.open(t.name, 'w') { |out| out.puts table }
end
|
115
|
+
|
116
|
+
# Dumps the gene -> transcript ID table (requested as a :flat TSV) to 'gene_transcripts'.
file 'gene_transcripts' do |t|
  table = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_gene_transcript, [], nil, :type => :flat)

  File.open(t.name, 'w') { |out| out.puts table }
end
|
121
|
+
|
122
|
+
# Builds the per-transcript table and attaches the "Chromosome Name" column
# from the previously generated 'gene_positions' file (hence the prerequisite).
file 'transcripts' => 'gene_positions' do |t|
  table = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript, [], nil, :type => :list)
  table.attach(TSV.new('gene_positions'), "Chromosome Name")

  File.open(t.name, 'w') { |out| out.puts table }
end
|
128
|
+
|
129
|
+
# Writes a single-valued "transcript ID -> UTR length" table to +path+.
#
# path       - file to create (overwritten if present).
# field_name - label of the length column, e.g. "3' UTR Length".
# utrs       - enumerable yielding (sequence, transcript IDs) pairs; one output
#              row is written per transcript, carrying the sequence's length.
#
# Shared by the 3' and 5' UTR tasks, which previously duplicated this body.
def write_utr_lengths(path, field_name, utrs)
  File.open(path, 'w') do |out|
    out.puts "#: :type=:single#cast=to_i"
    out.puts "#Ensembl Transcript ID\t#{field_name}"
    utrs.each do |seq, transcripts|
      transcripts.each do |transcript|
        out.puts [transcript, seq.length] * "\t"
      end
    end
  end
end

# Length of the 3' UTR of every transcript.
file 'transcript_3utr' do |t|
  utrs = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_3utr, [], nil, :type => :flat, :merge => true)

  write_utr_lengths(t.name, "3' UTR Length", utrs)
end


# Length of the 5' UTR of every transcript.
file 'transcript_5utr' do |t|
  utrs = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_5utr, [], nil, :type => :flat, :merge => true)

  write_utr_lengths(t.name, "5' UTR Length", utrs)
end
|
157
|
+
|
158
|
+
# Dumps the $biomart_gene_positions attributes (chromosome, strand, start, end)
# for every gene to 'gene_positions'.
file 'gene_positions' do |t|
  positions = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_gene_positions, [])

  File.open(t.name, 'w') { |out| out.puts positions }
end
|
163
|
+
|
164
|
+
# Writes one "gene ID <TAB> sequence" row per gene to 'gene_sequence'.
# The BioMart result is iterated as (sequence, gene IDs) pairs.
file 'gene_sequence' do |t|
  by_sequence = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_gene_sequence, [], nil, :type => :flat, :merge => true)

  File.open(t.name, 'w') do |out|
    out.puts "#: :type=:single"
    # NOTE(review): the column is labelled "Protein Sequence" although the attribute
    # requested is 'Gene Sequence' -- looks copy-pasted; kept because readers of the
    # generated file may rely on the label. Confirm before renaming.
    out.puts "#Ensembl Gene ID\tProtein Sequence"
    by_sequence.each do |seq, ids|
      # Pieces are streamed one by one instead of building a joined row string.
      ids.each { |id| out << id << "\t" << seq << "\n" }
    end
  end
end
|
180
|
+
|
181
|
+
# Writes one "protein ID <TAB> peptide sequence" row per protein to 'protein_sequence'.
# The BioMart result is iterated as (sequence, protein IDs) pairs.
file 'protein_sequence' do |t|
  by_sequence = BioMart.tsv($biomart_db, $biomart_ensembl_protein, $biomart_protein_sequence, [], nil, :type => :flat, :merge => true)

  File.open(t.name, 'w') do |out|
    out.puts "#: :type=:single"
    out.puts "#Ensembl Protein ID\tProtein Sequence"
    by_sequence.each do |seq, ids|
      # Pieces are streamed one by one instead of building a joined row string.
      ids.each { |id| out << id << "\t" << seq << "\n" }
    end
  end
end
|
198
|
+
|
199
|
+
# Builds the per-exon table and attaches the "Chromosome Name" column from the
# previously generated 'gene_positions' file (hence the prerequisite).
file 'exons' => 'gene_positions' do |t|
  table = BioMart.tsv($biomart_db, $biomart_ensembl_exon, $biomart_exons, [], nil, :merge => false, :type => :list)
  table.attach(TSV.new('gene_positions'), "Chromosome Name")

  File.open(t.name, 'w') { |out| out.puts table }
end
|
205
|
+
|
206
|
+
# Dumps, per transcript, its exon IDs and each exon's rank in the transcript
# (the $biomart_transcript_exons attributes) to 'transcript_exons'.
file 'transcript_exons' do |t|
  table = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_exons, [], nil, :keep_empty => true)

  File.open(t.name, 'w') { |out| out.puts table }
end
|
211
|
+
|
212
|
+
# Writes one "transcript ID <TAB> cDNA sequence" row per transcript to 'transcript_sequence'.
# The BioMart result is iterated as (sequence, transcript IDs) pairs.
file 'transcript_sequence' do |t|
  by_sequence = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_sequence, [], nil, :type => :flat, :merge => true)

  File.open(t.name, 'w') do |out|
    out.puts "#: :type=:single"
    # NOTE(review): the column is labelled "Protein Sequence" although the attribute
    # requested is 'cDNA' -- looks copy-pasted; kept because readers of the generated
    # file may rely on the label. Confirm before renaming.
    out.puts "#Ensembl Transcript ID\tProtein Sequence"
    by_sequence.each do |seq, ids|
      # Pieces are streamed one by one instead of building a joined row string.
      ids.each { |id| out << id << "\t" << seq << "\n" }
    end
  end
end
|
228
|
+
|
229
|
+
|
230
|
+
# BioMart filter restricting variations by consequence type: [filter name, comma-separated values].
#
# NOTE(review): the full list assigned first is overwritten by the single-value
# assignment right after it, so only 'COMPLEX_INDEL&NMD_TRANSCRIPT' is ever in
# effect. No task in this file passes the filter (they all pass []), so this
# looks like a debugging leftover -- confirm which value is intended.
$biomart_variation_filter = [
  "snptype_filters",
  "COMPLEX_INDEL,COMPLEX_INDEL&NMD_TRANSCRIPT,COMPLEX_INDEL&SPLICE_SITE,ESSENTIAL_SPLICE_SITE&INTRONIC,ESSENTIAL_SPLICE_SITE&INTRONIC&NMD_TRANSCRIPT,FRAMESHIFT_CODING,FRAMESHIFT_CODING&NMD_TRANSCRIPT,FRAMESHIFT_CODING&SPLICE_SITE,FRAMESHIFT_CODING&SPLICE_SITE&NMD_TRANSCRIPT,NON_SYNONYMOUS_CODING,NON_SYNONYMOUS_CODING&NMD_TRANSCRIPT,NON_SYNONYMOUS_CODING&SPLICE_SITE,NON_SYNONYMOUS_CODING&SPLICE_SITE&NMD_TRANSCRIPT,REGULATORY_REGION,SPLICE_SITE&3PRIME_UTR,SPLICE_SITE&3PRIME_UTR&NMD_TRANSCRIPT,SPLICE_SITE&5PRIME_UTR,SPLICE_SITE&5PRIME_UTR&NMD_TRANSCRIPT,SPLICE_SITE&INTRONIC,SPLICE_SITE&INTRONIC&NMD_TRANSCRIPT,SPLICE_SITE&SYNONYMOUS_CODING,SPLICE_SITE&SYNONYMOUS_CODING&NMD_TRANSCRIPT,STOP_GAINED,STOP_GAINED&FRAMESHIFT_CODING,STOP_GAINED&FRAMESHIFT_CODING&NMD_TRANSCRIPT,STOP_GAINED&NMD_TRANSCRIPT,STOP_GAINED&SPLICE_SITE,STOP_GAINED&SPLICE_SITE&NMD_TRANSCRIPT,STOP_LOST,STOP_LOST&NMD_TRANSCRIPT,STOP_LOST&SPLICE_SITE,STOP_LOST&SPLICE_SITE&NMD_TRANSCRIPT,SYNONYMOUS_CODING,SYNONYMOUS_CODING&NMD_TRANSCRIPT"
]
#$biomart_variation_filter = ["snptype_filters", "COMPLEX_INDEL,SYNONYMOUS_CODING"]
$biomart_variation_filter = ["snptype_filters", 'COMPLEX_INDEL&NMD_TRANSCRIPT']
|
233
|
+
|
234
|
+
# Dumps the $biomart_germline_variations attributes per germline variation ID
# to 'germline_variations' (no filters applied).
file 'germline_variations' do |t|
  table = BioMart.tsv($biomart_db, $biomart_germline_variation_id, $biomart_germline_variations, [], nil, :keep_empty => true, :type => :list, :merge => false)
  File.open(t.name, 'w') { |out| out.puts table.to_s }
end
|
238
|
+
|
239
|
+
# Dumps the $biomart_germline_variation_positions attributes per germline
# variation ID to 'germline_variation_positions' (no filters applied).
file 'germline_variation_positions' do |t|
  table = BioMart.tsv($biomart_db, $biomart_germline_variation_id, $biomart_germline_variation_positions, [], nil, :keep_empty => true, :type => :list, :merge => false)
  File.open(t.name, 'w') { |out| out.puts table.to_s }
end
|
243
|
+
|
244
|
+
# Dumps the $biomart_somatic_variations attributes per somatic variation ID
# to 'somatic_variations' (no filters applied).
file 'somatic_variations' do |t|
  table = BioMart.tsv($biomart_db, $biomart_somatic_variation_id, $biomart_somatic_variations, [], nil, :keep_empty => true, :type => :list, :merge => false)
  File.open(t.name, 'w') { |out| out.puts table.to_s }
end
|
248
|
+
|
249
|
+
# Dumps the $biomart_somatic_variation_positions attributes per somatic
# variation ID to 'somatic_variation_positions' (no filters applied).
file 'somatic_variation_positions' do |t|
  table = BioMart.tsv($biomart_db, $biomart_somatic_variation_id, $biomart_somatic_variation_positions, [], nil, :keep_empty => true, :type => :list, :merge => false)
  File.open(t.name, 'w') { |out| out.puts table.to_s }
end
|
253
|
+
|
254
|
+
# Writes the Entrez gene -> PubMed ID associations for the taxa in $taxs to
# 'gene_pmids': a header row, then one row per gene with its PMIDs joined by "|".
# Rows are separated by newlines; the file has no trailing newline.
file 'gene_pmids' do |t|
  associations = Entrez.entrez2pubmed($taxs)

  rows = ["#Entrez Gene ID\tPMID"]
  associations.each do |gene, pmids|
    rows << (gene + "\t" + pmids * "|")
  end

  Open.write(t.name, rows * "\n")
end
|
262
|
+
|
263
|
+
# Builds 'exon_offsets': for every exon, the coding transcripts that contain it
# and the exon's offset inside each of them.
#
# Output columns: Ensembl Exon ID, Ensembl Transcript ID, Offset; the last two
# are "|"-joined lists kept in matching order. Transcripts for which no offset
# can be computed (nil) are left out.
#
# Changes from the previous version: 'transcript_exons' was listed twice among
# the prerequisites, and two values that were computed but never read have been
# dropped (the destructured exon info and the loaded 'gene_transcripts' table).
# The 'gene_transcripts' prerequisite itself is kept so the file is still built.
#
# NOTE(review): the helpers are called on Organism::Hsa even though this file is
# shared between organisms -- confirm whether $namespace should select the module.
file 'exon_offsets' => %w(exons transcript_exons gene_transcripts transcripts) do |t|
  require 'rbbt/sources/organism/sequence'

  exons            = TSV.new('exons', :persistence => true)
  exon_transcripts = TSV.new('transcript_exons', :double, :key => "Ensembl Exon ID", :fields => ["Ensembl Transcript ID"], :merge => true, :persistence => true )
  transcript_info  = TSV.new('transcripts', :list, :persistence => true )
  transcript_exons = TSV.new('transcript_exons', :double, :fields => ["Ensembl Exon ID","Exon Rank in Transcript"], :persistence => true )

  string = "#Ensembl Exon ID\tEnsembl Transcript ID\tOffset\n"
  exons.each do |exon, _info|
    transcripts = Organism::Hsa.coding_transcripts_for_exon(exon, exon_transcripts, transcript_info)

    transcript_offsets = {}
    transcripts.each do |transcript|
      offset = Organism::Hsa.exon_offset_in_transcript(exon, transcript, exons, transcript_exons)
      transcript_offsets[transcript] = offset unless offset.nil?
    end

    string << exon << "\t" << transcript_offsets.keys * "|" << "\t" << transcript_offsets.values * "|" << "\n"
  end

  Open.write(t.name, string)
end
|
290
|
+
|
291
|
+
# Archive rule: a target of the form "<archive>/<task>", where <archive> is
# three letters followed by four digits (e.g. "may2009/gene_positions"), runs
# <task> inside a directory named after the archive, with that BioMart archive
# selected.
#
# The working directory is always restored, and the archive selection is always
# reset once it has been set -- including when the delegated task raises.
# Previously a failing task left the archive selected.
#
# File.exist? replaces File.exists?, which was removed in Ruby 3.2.
rule(/[a-z]{3}[0-9]{4}\/.*/i) do |t|
  archive, task = t.name.match(/([a-z]{3}[0-9]{4})\/(.*)/i).captures
  old_pwd = FileUtils.pwd
  begin
    FileUtils.mkdir archive unless File.exist? archive
    FileUtils.cd archive
    BioMart.set_archive archive
    begin
      Rake::Task[task].invoke
    ensure
      BioMart.unset_archive
    end
  ensure
    FileUtils.cd old_pwd
  end
end
|