rbbt-sources 0.2.2 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/rbbt/sources/COSTART.rb +2 -3
- data/lib/rbbt/sources/CTCAE.rb +1 -1
- data/lib/rbbt/sources/biomart.rb +32 -32
- data/lib/rbbt/sources/entrez.rb +14 -10
- data/lib/rbbt/sources/go.rb +9 -8
- data/lib/rbbt/sources/organism.rb +36 -10
- data/lib/rbbt/sources/organism/sequence.rb +337 -0
- data/lib/rbbt/sources/polysearch.rb +5 -5
- data/share/install/Organism/Hsa/Rakefile +7 -68
- data/share/install/Organism/Sce/Rakefile +4 -70
- data/share/install/Organism/organism_helpers.rb +305 -0
- data/share/install/lib/helpers.rb +5 -5
- data/test/rbbt/sources/test_biomart.rb +7 -6
- data/test/rbbt/sources/test_entrez.rb +3 -3
- data/test/rbbt/sources/test_organism.rb +32 -3
- data/test/rbbt/sources/test_pubmed.rb +1 -1
- metadata +7 -6
- data/lib/rbbt/sources/Reactome.rb +0 -16
@@ -1,10 +1,10 @@
|
|
1
1
|
require 'rbbt'
|
2
2
|
|
3
3
|
module Polysearch
|
4
|
-
Rbbt.
|
5
|
-
Rbbt.
|
6
|
-
Rbbt.
|
7
|
-
Rbbt.
|
8
|
-
Rbbt.
|
4
|
+
Rbbt.share.Polysearch.organ.define_as_url 'http://wishart.biology.ualberta.ca/polysearch/include/organ_ID.txt'
|
5
|
+
Rbbt.share.Polysearch.tissue.define_as_url 'http://wishart.biology.ualberta.ca/polysearch/include/tissue_ID.txt'
|
6
|
+
Rbbt.share.Polysearch.location.define_as_url 'http://wishart.biology.ualberta.ca/polysearch/include/subcellular_localization_ID.txt'
|
7
|
+
Rbbt.share.Polysearch.disease.define_as_url 'http://wishart.biology.ualberta.ca/polysearch/include/disease_IDlist.txt'
|
8
|
+
Rbbt.share.Polysearch.drug.define_as_url 'http://wishart.biology.ualberta.ca/polysearch/include/drugnames.txt'
|
9
9
|
end
|
10
10
|
|
@@ -3,11 +3,11 @@ require 'rbbt/sources/biomart'
|
|
3
3
|
require 'rbbt/sources/entrez'
|
4
4
|
require File.join(File.dirname(__FILE__), '../../lib/helpers')
|
5
5
|
|
6
|
-
$taxs = [
|
7
|
-
$
|
8
|
-
|
6
|
+
$taxs = [9606]
|
7
|
+
$scientific_name = "Homo sapiens"
|
8
|
+
|
9
9
|
$biomart_db = 'hsapiens_gene_ensembl'
|
10
|
-
|
10
|
+
|
11
11
|
$biomart_lexicon = [
|
12
12
|
[ 'Associated Gene Name' , "external_gene_id"],
|
13
13
|
[ 'HGNC symbol', "hgnc_symbol" ],
|
@@ -16,7 +16,7 @@ $biomart_lexicon = [
|
|
16
16
|
]
|
17
17
|
|
18
18
|
$biomart_identifiers = [
|
19
|
-
[ '
|
19
|
+
[ 'Entrez Gene ID', "entrezgene"],
|
20
20
|
[ 'Ensembl Protein ID', "ensembl_peptide_id" ],
|
21
21
|
[ 'Associated Gene Name', "external_gene_id" ],
|
22
22
|
[ 'CCDS ID', "ccds" ],
|
@@ -52,66 +52,5 @@ $biomart_identifiers = [
|
|
52
52
|
[ 'Illumina HumanWG 6 v3', 'illumina_humanwg_6_v3' ],
|
53
53
|
]
|
54
54
|
|
55
|
-
$
|
56
|
-
|
57
|
-
['Strand','strand'],
|
58
|
-
['Gene Start','start_position'],
|
59
|
-
['Gene End','end_position'],
|
60
|
-
['Transcript Start','transcript_start'],
|
61
|
-
['Transcript End','transcript_end'],
|
62
|
-
]
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
file 'scientific_name' do |t|
|
67
|
-
File.open(t.name, 'w') do |f| f.puts "Homo sapiens" end
|
68
|
-
end
|
69
|
-
|
70
|
-
file 'lexicon' do |t|
|
71
|
-
lexicon = tsv_file('http://www.genenames.org/cgi-bin/hgnc_downloads.cgi?title=HGNC+output+data&hgnc_dbtag=on&col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_prev_name&col=gd_aliases&col=gd_name_aliases&col=gd_pub_acc_ids&status=Approved&status_opt=2&level=pri&=on&where=&order_by=gd_app_sym_sort&limit=&format=text&submit=submit&.cgifields=&.cgifields=level&.cgifields=chr&.cgifields=status&.cgifields=hgnc_dbtag',
|
72
|
-
"HGNC ID", nil, :flatten => true, :header_hash => '')
|
73
|
-
merge_biomart lexicon, $biomart_db, $biomart_main, $biomart_lexicon, "HGNC ID"
|
74
|
-
|
75
|
-
File.open(t.name, 'w') do |f| f.puts lexicon end
|
76
|
-
end
|
77
|
-
|
78
|
-
file 'identifiers' do |t|
|
79
|
-
identifiers = BioMart.tsv($biomart_db, $biomart_main, $biomart_identifiers)
|
80
|
-
$biomart_identifiers.each do |name, key, prefix|
|
81
|
-
if prefix
|
82
|
-
identifiers.process name do |field, key, values| field.each{|v| v.replace "#{prefix}:#{v}"} end
|
83
|
-
end
|
84
|
-
end
|
85
|
-
|
86
|
-
File.open(t.name, 'w') do |f| f.puts identifiers end
|
87
|
-
end
|
88
|
-
|
89
|
-
file 'gene_go' do |t|
|
90
|
-
url = "http://cvsweb.geneontology.org/cgi-bin/cvsweb.cgi/go/gene-associations/gene_association.goa_human.gz?rev=HEAD"
|
91
|
-
tsv = TSV.new(Open.open(url, :gzip => true), :native => 2, :extra => 4)
|
92
|
-
|
93
|
-
index = TSV.index(Organism::Hsa.identifiers, :persistence => true)
|
94
|
-
new = TSV.new({})
|
95
|
-
tsv.through do |key, values|
|
96
|
-
next if index[key].nil?
|
97
|
-
new_key = index[key].first
|
98
|
-
new[new_key] = values
|
99
|
-
end
|
100
|
-
|
101
|
-
|
102
|
-
new.key_field = "Associated Gene Name"
|
103
|
-
new.fields = ["GO Term"]
|
104
|
-
Open.write(t.name, new.to_s)
|
105
|
-
end
|
106
|
-
|
107
|
-
file 'gene_positions' do |t|
|
108
|
-
BioMart.set_archive('may2009')
|
109
|
-
positions = BioMart.tsv($biomart_db, $biomart_main, $biomart_positions)
|
110
|
-
BioMart.unset_archive
|
111
|
-
|
112
|
-
Open.write(t.name, positions.to_s)
|
113
|
-
end
|
114
|
-
|
115
|
-
task :default => ['name', 'lexicon', 'identifiers', 'gene_positions']
|
116
|
-
|
117
|
-
|
55
|
+
$namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
|
56
|
+
load File.join(File.dirname(__FILE__), '../organism_helpers.rb')
|
@@ -17,9 +17,9 @@ end
|
|
17
17
|
file 'lexicon' do |t|
|
18
18
|
lexicon = tsv_file($url, [$native, 0], [3, 4, 5], :keep_empty => true)
|
19
19
|
|
20
|
-
merge_entrez(lexicon, $taxs, $native, proc{|code| code.sub(/SGD:S0/,'S0') }, proc{|code| code.match(/\
|
20
|
+
lexicon = merge_entrez(lexicon, $taxs, $native, proc{|code| code.sub(/SGD:S0/,'S0') }, proc{|code| code.match(/\tSGD:S0/)})
|
21
21
|
|
22
|
-
merge_biomart(lexicon, $biomart_db, $biomart_main, [['Interpro Description' , "interpro_description"]])
|
22
|
+
lexicon = merge_biomart(lexicon, $biomart_db, $biomart_main, [['Interpro Description' , "interpro_description"]])
|
23
23
|
|
24
24
|
lexicon = lexicon.slice(lexicon.fields - ["Entrez Gene ID"])
|
25
25
|
|
@@ -29,9 +29,9 @@ end
|
|
29
29
|
file 'identifiers' do |t|
|
30
30
|
identifiers = tsv_file($url, [$native, 0], [["Ensembl Gene ID", 3], ["Associated Gene Name",4], ["Associated Gene Name Alias", 5]], :keep_empty => true)
|
31
31
|
|
32
|
-
merge_entrez(identifiers, $taxs, $native, proc{|code| code.sub(/SGD:S0/,'S0') }, proc{|code| code.match(/\
|
32
|
+
identifiers = merge_entrez(identifiers, $taxs, $native, proc{|code| code.sub(/SGD:S0/,'S0') }, proc{|code| code.match(/\tSGD:S0/)})
|
33
33
|
|
34
|
-
merge_biomart(identifiers, $biomart_db, $biomart_main,
|
34
|
+
identifiers = merge_biomart(identifiers, $biomart_db, $biomart_main,
|
35
35
|
[['Associated Gene Name' , "external_gene_id"],
|
36
36
|
['Ensembl Gene ID', "ensembl_gene_id" ],
|
37
37
|
['Ensembl Protein ID', "ensembl_peptide_id" ],
|
@@ -50,69 +50,3 @@ end
|
|
50
50
|
|
51
51
|
task :default => ['name', 'lexicon', 'identifiers']
|
52
52
|
|
53
|
-
#require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
|
54
|
-
#
|
55
|
-
#$name = "Saccharomyces cerevisiae"
|
56
|
-
#
|
57
|
-
#
|
58
|
-
#$native_id = "SGD DB Id"
|
59
|
-
#
|
60
|
-
#$entrez2native = {
|
61
|
-
# :tax => 559292,
|
62
|
-
# :fix => proc{|code| code.sub(/SGD:S0/,'S0') },
|
63
|
-
# :check => proc{|code| code.match(/^S0/)},
|
64
|
-
#}
|
65
|
-
#
|
66
|
-
#$lexicon = {
|
67
|
-
# :file => {
|
68
|
-
# :url => "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature/SGD_features.tab",
|
69
|
-
# :native => 0,
|
70
|
-
# :extra => [4,3,5]
|
71
|
-
# },
|
72
|
-
# :biomart => {
|
73
|
-
# :database => 'scerevisiae_gene_ensembl',
|
74
|
-
# :main => ['Entrez Gene ID', 'entrezgene'],
|
75
|
-
# :extra => [
|
76
|
-
# ['Interpro Description' , "interpro_description"],
|
77
|
-
# ],
|
78
|
-
# :filter => [],
|
79
|
-
# }
|
80
|
-
#
|
81
|
-
#}
|
82
|
-
#
|
83
|
-
#$identifiers = {
|
84
|
-
# :file => {
|
85
|
-
# :url => "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature/SGD_features.tab",
|
86
|
-
# :native => 0,
|
87
|
-
# :extra => [],
|
88
|
-
# },
|
89
|
-
# :biomart => {
|
90
|
-
# :database => 'scerevisiae_gene_ensembl',
|
91
|
-
# :main => ['Entrez Gene ID', 'entrezgene'],
|
92
|
-
# :extra => [
|
93
|
-
# ['Associated Gene Name' , "external_gene_id"],
|
94
|
-
# ['Ensembl Gene ID', "ensembl_gene_id" ],
|
95
|
-
# ['Ensembl Protein ID', "ensembl_peptide_id" ],
|
96
|
-
# ['RefSeq Protein ID' , "refseq_peptide"] ,
|
97
|
-
# ['UniProt/SwissProt ID' , "uniprot_swissprot"] ,
|
98
|
-
# ['UniProt/SwissProt Accession' , "uniprot_swissprot_accession"] ,
|
99
|
-
# ['Protein ID' , "protein_id"] ,
|
100
|
-
# ['EMBL (Genbank) ID' , "embl"] ,
|
101
|
-
# # Affymetrix
|
102
|
-
# ['Affy yeast 2',"affy_yeast_2"],
|
103
|
-
# ['Affy yg s98', "affy_yg_s98"],
|
104
|
-
# ],
|
105
|
-
# :filter => [],
|
106
|
-
# }
|
107
|
-
#}
|
108
|
-
#
|
109
|
-
#$go = {
|
110
|
-
# :url => "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/literature_curation/gene_association.sgd.gz",
|
111
|
-
# :code => 1,
|
112
|
-
# :go => 4,
|
113
|
-
# :pmid => 5,
|
114
|
-
#}
|
115
|
-
#
|
116
|
-
#$query = '"saccharomyces cerevisiae"[All Fields] AND ((("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word]) OR (("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word])) AND hasabstract[text] AND English[lang]'
|
117
|
-
#
|
118
|
-
#
|
@@ -0,0 +1,305 @@
|
|
1
|
+
# Two-element [label, attribute] descriptors. The tasks below hand them to
# BioMart.tsv, mostly as the "main" (key) column of the requested table.
$biomart_ensembl_gene          = ['Ensembl Gene ID',       'ensembl_gene_id']
$biomart_ensembl_protein       = ['Ensembl Protein ID',    'ensembl_peptide_id']
$biomart_ensembl_exon          = ['Ensembl Exon ID',       'ensembl_exon_id']
$biomart_ensembl_transcript    = ['Ensembl Transcript ID', 'ensembl_transcript_id']

# Both variation descriptors share the same label but use different attributes.
$biomart_somatic_variation_id  = ['Variation ID', 'somatic_reference_id']
$biomart_germline_variation_id = ['Variation ID', 'external_id']
|
7
|
+
|
8
|
+
# Columns requested by the 'gene_positions' task.
$biomart_gene_positions = [
  ['Chromosome Name', 'chromosome_name'],
  ['Strand',          'strand'],
  ['Gene Start',      'start_position'],
  ['Gene End',        'end_position'],
]

# Single column requested by the 'gene_sequence' task.
$biomart_gene_sequence = [['Gene Sequence', 'gene_exon_intron']]
|
18
|
+
|
19
|
+
#{{{ Transcript

# Column requested by the 'gene_transcripts' task.
$biomart_gene_transcript = [$biomart_ensembl_transcript]

# Columns requested by the 'transcripts' task.
$biomart_transcript = [
  ['Transcript Start (bp)', 'transcript_start'],
  ['Transcript End (bp)',   'transcript_end'],
  $biomart_ensembl_protein,
  $biomart_ensembl_gene,
]

# Single-column requests used by the sequence and UTR tasks.
$biomart_transcript_sequence = [['cDNA', 'cdna']]
$biomart_transcript_3utr     = [["3' UTR", '3utr']]
$biomart_transcript_5utr     = [["5' UTR", '5utr']]
$biomart_protein_sequence    = [['Protein Sequence', 'peptide']]
|
48
|
+
|
49
|
+
#{{{ Exons

# Columns requested by the 'transcript_exons' task.
$biomart_transcript_exons = [
  $biomart_ensembl_exon,
  ['Exon Rank in Transcript', 'rank'],
]

# Columns requested by the 'exons' task, in this order: gene, strand, start, end.
$biomart_exons = [
  $biomart_ensembl_gene,
  ['Exon Strand',    'strand'],
  ['Exon Chr Start', 'exon_chrom_start'],
  ['Exon Chr End',   'exon_chrom_end'],
]
|
62
|
+
|
63
|
+
#{{{ Variations

# Columns requested by the 'germline_variation_positions' task.
$biomart_germline_variation_positions = [
  ['Chromosome Location (bp)', 'chromosome_location'],
  ['SNP Chromosome Strand',    'snp_chromosome_strand'],
  ['Transcript location (bp)', 'transcript_location'],
  ['Allele',                   'allele'],
  ['Protein Allele',           'peptide_shift'],
  ['CDS Start',                'cds_start_2076'],
  ['CDS End',                  'cds_end_2076'],
]

# Columns requested by the 'germline_variations' task.
$biomart_germline_variations = [
  $biomart_ensembl_gene,
  ['Source',           'source_name'],
  ['Validated',        'validated'],
  ['Consequence Type', 'synonymous_status'],
]

# Somatic counterparts: same labels, attributes carry a "somatic_" prefix.
$biomart_somatic_variation_positions = [
  ['Chromosome Location (bp)', 'somatic_chromosome_location'],
  ['SNP Chromosome Strand',    'somatic_snp_chromosome_strand'],
  ['Transcript location (bp)', 'somatic_transcript_location'],
  ['Allele',                   'somatic_allele'],
  ['Protein Allele',           'somatic_peptide_shift'],
  ['CDS Start',                'somatic_cds_start_2076'],
  ['CDS End',                  'somatic_cds_end_2076'],
]

$biomart_somatic_variations = [
  $biomart_ensembl_gene,
  ['Source',           'somatic_source_name'],
  ['Validated',        'somatic_validated'],
  ['Consequence Type', 'somatic_synonymous_status'],
]
|
98
|
+
|
99
|
+
#{{{ Rules
|
100
|
+
|
101
|
+
# Writes the value of $scientific_name to the 'scientific_name' file.
# IO#write is used, so no trailing newline is appended.
file 'scientific_name' do |t|
  File.open(t.name, 'w') do |out|
    out.write $scientific_name
  end
end
|
104
|
+
|
105
|
+
# Builds the 'identifiers' file from a BioMart query keyed on the Ensembl gene
# descriptor and covering every entry of $biomart_identifiers.
#
# Entries of $biomart_identifiers may carry an optional third element; when
# present, every value of that column is rewritten in place as "<prefix>:<value>".
file 'identifiers' do |t|
  identifiers = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_identifiers, [])

  $biomart_identifiers.each do |name, _attribute, prefix|
    next unless prefix

    identifiers.process(name) do |field, _key, _values|
      field.each { |value| value.replace "#{prefix}:#{value}" }
    end
  end

  File.open(t.name, 'w') { |out| out.puts identifiers }
end
|
115
|
+
|
116
|
+
# Builds the 'gene_transcripts' file: a :flat BioMart table keyed on the
# Ensembl gene descriptor with the transcript descriptor as its only column.
file 'gene_transcripts' do |t|
  gene_transcripts = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_gene_transcript, [], nil, :type => :flat)

  File.open(t.name, 'w') { |out| out.puts gene_transcripts }
end
|
121
|
+
|
122
|
+
# Builds the 'transcripts' file: a :list BioMart table keyed on the Ensembl
# transcript descriptor. Requires 'gene_positions' to exist first, because
# that table is attached with the "Chromosome Name" field before writing.
file 'transcripts' => 'gene_positions' do |t|
  transcript_table = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript, [], nil, :type => :list)
  gene_positions   = TSV.new('gene_positions')
  transcript_table.attach gene_positions, "Chromosome Name"

  File.open(t.name, 'w') { |out| out.puts transcript_table }
end
|
128
|
+
|
129
|
+
# Builds the 'transcript_3utr' file: one "<transcript id>\t<length>" row per
# transcript, preceded by a ":type=:single#cast=to_i" header.
#
# The BioMart result is iterated as (sequence, transcript ids) pairs; only the
# length of each sequence is written, never the sequence itself.
file 'transcript_3utr' do |t|
  utr_sequences = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_3utr, [], nil, :type => :flat, :merge => true)

  File.open(t.name, 'w') do |out|
    out.puts "#: :type=:single#cast=to_i"
    out.puts "#Ensembl Transcript ID\t3' UTR Length"

    utr_sequences.each do |sequence, transcript_ids|
      transcript_ids.each do |transcript_id|
        out.puts [transcript_id, sequence.length].join("\t")
      end
    end
  end
end
|
142
|
+
|
143
|
+
|
144
|
+
# Builds the 'transcript_5utr' file: one "<transcript id>\t<length>" row per
# transcript, preceded by a ":type=:single#cast=to_i" header.
#
# The BioMart result is iterated as (sequence, transcript ids) pairs; only the
# length of each sequence is written, never the sequence itself.
file 'transcript_5utr' do |t|
  utr_sequences = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_5utr, [], nil, :type => :flat, :merge => true)

  File.open(t.name, 'w') do |out|
    out.puts "#: :type=:single#cast=to_i"
    out.puts "#Ensembl Transcript ID\t5' UTR Length"

    utr_sequences.each do |sequence, transcript_ids|
      transcript_ids.each do |transcript_id|
        out.puts [transcript_id, sequence.length].join("\t")
      end
    end
  end
end
|
157
|
+
|
158
|
+
# Builds the 'gene_positions' file: a BioMart table keyed on the Ensembl gene
# descriptor with the columns listed in $biomart_gene_positions.
file 'gene_positions' do |t|
  positions = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_gene_positions, [])

  File.open(t.name, 'w') { |out| out.puts positions }
end
|
163
|
+
|
164
|
+
# Builds the 'gene_sequence' file: a ":type=:single" table with one
# "<gene id>\t<sequence>" row per gene.
#
# The BioMart result is iterated as (sequence, gene ids) pairs, so a sequence
# shared by several ids is written once per id.
#
# The value column is labelled "Gene Sequence", matching the field requested
# in $biomart_gene_sequence.
# NOTE(review): confirm no consumer looks this column up under another name.
file 'gene_sequence' do |t|
  sequences = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_gene_sequence, [], nil, :type => :flat, :merge => true)

  File.open(t.name, 'w') do |f|
    f.puts "#: :type=:single"
    f.puts "#Ensembl Gene ID\tGene Sequence"
    sequences.each do |seq, genes|
      genes.each do |gene|
        # Written piecewise so the row is never concatenated into a second
        # string alongside the sequence.
        f.write gene
        f.write "\t"
        f.write seq
        f.write "\n"
      end
    end
  end
end
|
180
|
+
|
181
|
+
# Builds the 'protein_sequence' file: a ":type=:single" table with one
# "<protein id>\t<sequence>" row per protein.
#
# The BioMart result is iterated as (sequence, protein ids) pairs, so a
# sequence shared by several ids is written once per id.
file 'protein_sequence' do |t|
  sequences = BioMart.tsv($biomart_db, $biomart_ensembl_protein, $biomart_protein_sequence, [], nil, :type => :flat, :merge => true)

  File.open(t.name, 'w') do |out|
    out.puts "#: :type=:single"
    out.puts "#Ensembl Protein ID\tProtein Sequence"

    sequences.each do |sequence, protein_ids|
      protein_ids.each do |protein_id|
        out << protein_id << "\t" << sequence << "\n"
      end
    end
  end
end
|
198
|
+
|
199
|
+
# Builds the 'exons' file: a :list BioMart table keyed on the Ensembl exon
# descriptor. Requires 'gene_positions' to exist first, because that table is
# attached with the "Chromosome Name" field before writing.
file 'exons' => 'gene_positions' do |t|
  exon_table     = BioMart.tsv($biomart_db, $biomart_ensembl_exon, $biomart_exons, [], nil, :merge => false, :type => :list)
  gene_positions = TSV.new('gene_positions')
  exon_table.attach gene_positions, "Chromosome Name"

  File.open(t.name, 'w') { |out| out.puts exon_table }
end
|
205
|
+
|
206
|
+
# Builds the 'transcript_exons' file: a BioMart table keyed on the Ensembl
# transcript descriptor with the exon id and exon rank columns, requested
# with :keep_empty => true.
file 'transcript_exons' do |t|
  transcript_exons = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_exons, [], nil, :keep_empty => true)

  File.open(t.name, 'w') { |out| out.puts transcript_exons }
end
|
211
|
+
|
212
|
+
# Builds the 'transcript_sequence' file: a ":type=:single" table with one
# "<transcript id>\t<sequence>" row per transcript.
#
# The BioMart result is iterated as (sequence, transcript ids) pairs, so a
# sequence shared by several ids is written once per id.
#
# The value column is labelled "cDNA", matching the field requested in
# $biomart_transcript_sequence.
# NOTE(review): confirm no consumer looks this column up under another name.
file 'transcript_sequence' do |t|
  sequences = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_sequence, [], nil, :type => :flat, :merge => true)

  File.open(t.name, 'w') do |f|
    f.puts "#: :type=:single"
    f.puts "#Ensembl Transcript ID\tcDNA"
    sequences.each do |seq, transcripts|
      transcripts.each do |transcript|
        # Written piecewise so the row is never concatenated into a second
        # string alongside the sequence.
        f.write transcript
        f.write "\t"
        f.write seq
        f.write "\n"
      end
    end
  end
end
|
228
|
+
|
229
|
+
|
230
|
+
# Full comma-separated consequence list for the "snptype_filters" filter.
# NOTE(review): this assignment is immediately overridden below, and none of
# the tasks in this file pass $biomart_variation_filter to BioMart.tsv —
# confirm which value is actually intended.
$biomart_variation_filter = ["snptype_filters", %w(
  COMPLEX_INDEL
  COMPLEX_INDEL&NMD_TRANSCRIPT
  COMPLEX_INDEL&SPLICE_SITE
  ESSENTIAL_SPLICE_SITE&INTRONIC
  ESSENTIAL_SPLICE_SITE&INTRONIC&NMD_TRANSCRIPT
  FRAMESHIFT_CODING
  FRAMESHIFT_CODING&NMD_TRANSCRIPT
  FRAMESHIFT_CODING&SPLICE_SITE
  FRAMESHIFT_CODING&SPLICE_SITE&NMD_TRANSCRIPT
  NON_SYNONYMOUS_CODING
  NON_SYNONYMOUS_CODING&NMD_TRANSCRIPT
  NON_SYNONYMOUS_CODING&SPLICE_SITE
  NON_SYNONYMOUS_CODING&SPLICE_SITE&NMD_TRANSCRIPT
  REGULATORY_REGION
  SPLICE_SITE&3PRIME_UTR
  SPLICE_SITE&3PRIME_UTR&NMD_TRANSCRIPT
  SPLICE_SITE&5PRIME_UTR
  SPLICE_SITE&5PRIME_UTR&NMD_TRANSCRIPT
  SPLICE_SITE&INTRONIC
  SPLICE_SITE&INTRONIC&NMD_TRANSCRIPT
  SPLICE_SITE&SYNONYMOUS_CODING
  SPLICE_SITE&SYNONYMOUS_CODING&NMD_TRANSCRIPT
  STOP_GAINED
  STOP_GAINED&FRAMESHIFT_CODING
  STOP_GAINED&FRAMESHIFT_CODING&NMD_TRANSCRIPT
  STOP_GAINED&NMD_TRANSCRIPT
  STOP_GAINED&SPLICE_SITE
  STOP_GAINED&SPLICE_SITE&NMD_TRANSCRIPT
  STOP_LOST
  STOP_LOST&NMD_TRANSCRIPT
  STOP_LOST&SPLICE_SITE
  STOP_LOST&SPLICE_SITE&NMD_TRANSCRIPT
  SYNONYMOUS_CODING
  SYNONYMOUS_CODING&NMD_TRANSCRIPT
).join(",")]
#$biomart_variation_filter = ["snptype_filters", "COMPLEX_INDEL,SYNONYMOUS_CODING"]

# Effective value: the last assignment wins.
$biomart_variation_filter = ["snptype_filters", 'COMPLEX_INDEL&NMD_TRANSCRIPT']
|
233
|
+
|
234
|
+
# Builds the 'germline_variations' file: a :list BioMart table keyed on the
# germline variation id, requested with :keep_empty => true, :merge => false.
file 'germline_variations' do |t|
  variations = BioMart.tsv($biomart_db, $biomart_germline_variation_id, $biomart_germline_variations, [], nil, :keep_empty => true, :type => :list, :merge => false)

  File.open(t.name, 'w') { |out| out.puts variations.to_s }
end
|
238
|
+
|
239
|
+
# Builds the 'germline_variation_positions' file: a :list BioMart table keyed
# on the germline variation id, requested with :keep_empty => true,
# :merge => false.
file 'germline_variation_positions' do |t|
  positions = BioMart.tsv($biomart_db, $biomart_germline_variation_id, $biomart_germline_variation_positions, [], nil, :keep_empty => true, :type => :list, :merge => false)

  File.open(t.name, 'w') { |out| out.puts positions.to_s }
end
|
243
|
+
|
244
|
+
# Builds the 'somatic_variations' file: a :list BioMart table keyed on the
# somatic variation id, requested with :keep_empty => true, :merge => false.
file 'somatic_variations' do |t|
  variations = BioMart.tsv($biomart_db, $biomart_somatic_variation_id, $biomart_somatic_variations, [], nil, :keep_empty => true, :type => :list, :merge => false)

  File.open(t.name, 'w') { |out| out.puts variations.to_s }
end
|
248
|
+
|
249
|
+
# Builds the 'somatic_variation_positions' file: a :list BioMart table keyed
# on the somatic variation id, requested with :keep_empty => true,
# :merge => false.
file 'somatic_variation_positions' do |t|
  positions = BioMart.tsv($biomart_db, $biomart_somatic_variation_id, $biomart_somatic_variation_positions, [], nil, :keep_empty => true, :type => :list, :merge => false)

  File.open(t.name, 'w') { |out| out.puts positions.to_s }
end
|
253
|
+
|
254
|
+
# Builds the 'gene_pmids' file from Entrez.entrez2pubmed($taxs): a header line
# followed by one "<gene>\t<pmid>|<pmid>|..." row per gene. Rows are joined
# with newlines, so the file does not end with one.
file 'gene_pmids' do |t|
  content = "#Entrez Gene ID\tPMID"

  Entrez.entrez2pubmed($taxs).each do |entrez_id, pubmed_ids|
    content << "\n" << entrez_id << "\t" << pubmed_ids * "|"
  end

  Open.write(t.name, content)
end
|
262
|
+
|
263
|
+
# Builds the 'exon_offsets' file: for every exon, the transcripts returned by
# Organism::Hsa.coding_transcripts_for_exon and the exon's offset in each of
# them, as "<exon>\t<transcript>|...\t<offset>|...".
#
# Transcripts whose offset comes back nil are left out of both lists. An exon
# with no remaining transcripts still gets a row with two empty columns.
#
# 'gene_transcripts' stays a prerequisite, although its table is not read
# here, so the task's observable effects are unchanged.
#
# NOTE(review): Organism::Hsa is hard-coded; confirm this is intended for
# every organism whose Rakefile loads this helper.
file 'exon_offsets' => %w(exons transcript_exons gene_transcripts transcripts) do |t|
  # Loaded lazily: only this task needs the sequence helpers.
  require 'rbbt/sources/organism/sequence'

  exons            = TSV.new('exons', :persistence => true)
  exon_transcripts = TSV.new('transcript_exons', :double, :key => "Ensembl Exon ID", :fields => ["Ensembl Transcript ID"], :merge => true, :persistence => true )
  transcript_info  = TSV.new('transcripts', :list, :persistence => true )
  transcript_exons = TSV.new('transcript_exons', :double, :fields => ["Ensembl Exon ID","Exon Rank in Transcript"], :persistence => true )

  string = "#Ensembl Exon ID\tEnsembl Transcript ID\tOffset\n"
  exons.each do |exon, _info|
    transcripts = Organism::Hsa.coding_transcripts_for_exon(exon, exon_transcripts, transcript_info)

    transcript_offsets = {}
    transcripts.each do |transcript|
      offset = Organism::Hsa.exon_offset_in_transcript(exon, transcript, exons, transcript_exons)
      transcript_offsets[transcript] = offset unless offset.nil?
    end

    string << exon << "\t" << transcript_offsets.keys * "|" << "\t" << transcript_offsets.values * "|" << "\n"
  end

  Open.write(t.name, string)
end
|
290
|
+
|
291
|
+
# Rule for targets of the form "<archive>/<task>", where <archive> is three
# letters followed by four digits (e.g. "may2009/gene_positions").
#
# It creates the archive directory if needed, changes into it, selects that
# archive through BioMart.set_archive and invokes <task> there.
#
# The archive selection is reset and the working directory restored even when
# the invoked task raises.
rule(/[a-z]{3}[0-9]{4}\/.*/i) do |t|
  archive, task = t.name.match(/([a-z]{3}[0-9]{4})\/(.*)/i).captures
  old_pwd = FileUtils.pwd

  begin
    FileUtils.mkdir archive unless File.exist? archive
    FileUtils.cd archive

    BioMart.set_archive archive
    begin
      Rake::Task[task].invoke
    ensure
      # Reset even on failure so later queries do not hit the wrong archive.
      BioMart.unset_archive
    end
  ensure
    FileUtils.cd old_pwd
  end
end
|