rbbt-sources 0.2.2 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,10 +1,10 @@
1
1
  require 'rbbt'
2
2
 
3
3
  module Polysearch
4
- Rbbt.claim "organ" ,'http://wishart.biology.ualberta.ca/polysearch/include/organ_ID.txt', 'Polysearch'
5
- Rbbt.claim "tissue" ,'http://wishart.biology.ualberta.ca/polysearch/include/tissue_ID.txt', 'Polysearch'
6
- Rbbt.claim "location" ,'http://wishart.biology.ualberta.ca/polysearch/include/subcellular_localization_ID.txt', 'Polysearch'
7
- Rbbt.claim "disease" ,'http://wishart.biology.ualberta.ca/polysearch/include/disease_IDlist.txt', 'Polysearch'
8
- Rbbt.claim "drug" ,'http://wishart.biology.ualberta.ca/polysearch/include/drugnames.txt', 'Polysearch'
4
+ Rbbt.share.Polysearch.organ.define_as_url 'http://wishart.biology.ualberta.ca/polysearch/include/organ_ID.txt'
5
+ Rbbt.share.Polysearch.tissue.define_as_url 'http://wishart.biology.ualberta.ca/polysearch/include/tissue_ID.txt'
6
+ Rbbt.share.Polysearch.location.define_as_url 'http://wishart.biology.ualberta.ca/polysearch/include/subcellular_localization_ID.txt'
7
+ Rbbt.share.Polysearch.disease.define_as_url 'http://wishart.biology.ualberta.ca/polysearch/include/disease_IDlist.txt'
8
+ Rbbt.share.Polysearch.drug.define_as_url 'http://wishart.biology.ualberta.ca/polysearch/include/drugnames.txt'
9
9
  end
10
10
 
@@ -3,11 +3,11 @@ require 'rbbt/sources/biomart'
3
3
  require 'rbbt/sources/entrez'
4
4
  require File.join(File.dirname(__FILE__), '../../lib/helpers')
5
5
 
6
- $taxs = [559292,4932]
7
- $native = "SGD ID"
8
- $url = "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature/SGD_features.tab"
6
+ $taxs = [9606]
7
+ $scientific_name = "Homo sapiens"
8
+
9
9
  $biomart_db = 'hsapiens_gene_ensembl'
10
- $biomart_main = ['Entrez Gene ID', 'entrezgene']
10
+
11
11
  $biomart_lexicon = [
12
12
  [ 'Associated Gene Name' , "external_gene_id"],
13
13
  [ 'HGNC symbol', "hgnc_symbol" ],
@@ -16,7 +16,7 @@ $biomart_lexicon = [
16
16
  ]
17
17
 
18
18
  $biomart_identifiers = [
19
- [ 'Ensembl Gene ID', "ensembl_gene_id" ],
19
+ [ 'Entrez Gene ID', "entrezgene"],
20
20
  [ 'Ensembl Protein ID', "ensembl_peptide_id" ],
21
21
  [ 'Associated Gene Name', "external_gene_id" ],
22
22
  [ 'CCDS ID', "ccds" ],
@@ -52,66 +52,5 @@ $biomart_identifiers = [
52
52
  [ 'Illumina HumanWG 6 v3', 'illumina_humanwg_6_v3' ],
53
53
  ]
54
54
 
55
- $biomart_positions = [
56
- ['Chromosome Name','chromosome_name'],
57
- ['Strand','strand'],
58
- ['Gene Start','start_position'],
59
- ['Gene End','end_position'],
60
- ['Transcript Start','transcript_start'],
61
- ['Transcript End','transcript_end'],
62
- ]
63
-
64
-
65
-
66
- file 'scientific_name' do |t|
67
- File.open(t.name, 'w') do |f| f.puts "Homo sapiens" end
68
- end
69
-
70
- file 'lexicon' do |t|
71
- lexicon = tsv_file('http://www.genenames.org/cgi-bin/hgnc_downloads.cgi?title=HGNC+output+data&hgnc_dbtag=on&col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_prev_name&col=gd_aliases&col=gd_name_aliases&col=gd_pub_acc_ids&status=Approved&status_opt=2&level=pri&=on&where=&order_by=gd_app_sym_sort&limit=&format=text&submit=submit&.cgifields=&.cgifields=level&.cgifields=chr&.cgifields=status&.cgifields=hgnc_dbtag',
72
- "HGNC ID", nil, :flatten => true, :header_hash => '')
73
- merge_biomart lexicon, $biomart_db, $biomart_main, $biomart_lexicon, "HGNC ID"
74
-
75
- File.open(t.name, 'w') do |f| f.puts lexicon end
76
- end
77
-
78
- file 'identifiers' do |t|
79
- identifiers = BioMart.tsv($biomart_db, $biomart_main, $biomart_identifiers)
80
- $biomart_identifiers.each do |name, key, prefix|
81
- if prefix
82
- identifiers.process name do |field, key, values| field.each{|v| v.replace "#{prefix}:#{v}"} end
83
- end
84
- end
85
-
86
- File.open(t.name, 'w') do |f| f.puts identifiers end
87
- end
88
-
89
- file 'gene_go' do |t|
90
- url = "http://cvsweb.geneontology.org/cgi-bin/cvsweb.cgi/go/gene-associations/gene_association.goa_human.gz?rev=HEAD"
91
- tsv = TSV.new(Open.open(url, :gzip => true), :native => 2, :extra => 4)
92
-
93
- index = TSV.index(Organism::Hsa.identifiers, :persistence => true)
94
- new = TSV.new({})
95
- tsv.through do |key, values|
96
- next if index[key].nil?
97
- new_key = index[key].first
98
- new[new_key] = values
99
- end
100
-
101
-
102
- new.key_field = "Associated Gene Name"
103
- new.fields = ["GO Term"]
104
- Open.write(t.name, new.to_s)
105
- end
106
-
107
- file 'gene_positions' do |t|
108
- BioMart.set_archive('may2009')
109
- positions = BioMart.tsv($biomart_db, $biomart_main, $biomart_positions)
110
- BioMart.unset_archive
111
-
112
- Open.write(t.name, positions.to_s)
113
- end
114
-
115
- task :default => ['name', 'lexicon', 'identifiers', 'gene_positions']
116
-
117
-
55
+ $namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
56
+ load File.join(File.dirname(__FILE__), '../organism_helpers.rb')
@@ -17,9 +17,9 @@ end
17
17
  file 'lexicon' do |t|
18
18
  lexicon = tsv_file($url, [$native, 0], [3, 4, 5], :keep_empty => true)
19
19
 
20
- merge_entrez(lexicon, $taxs, $native, proc{|code| code.sub(/SGD:S0/,'S0') }, proc{|code| code.match(/\tS0/)})
20
+ lexicon = merge_entrez(lexicon, $taxs, $native, proc{|code| code.sub(/SGD:S0/,'S0') }, proc{|code| code.match(/\tSGD:S0/)})
21
21
 
22
- merge_biomart(lexicon, $biomart_db, $biomart_main, [['Interpro Description' , "interpro_description"]])
22
+ lexicon = merge_biomart(lexicon, $biomart_db, $biomart_main, [['Interpro Description' , "interpro_description"]])
23
23
 
24
24
  lexicon = lexicon.slice(lexicon.fields - ["Entrez Gene ID"])
25
25
 
@@ -29,9 +29,9 @@ end
29
29
  file 'identifiers' do |t|
30
30
  identifiers = tsv_file($url, [$native, 0], [["Ensembl Gene ID", 3], ["Associated Gene Name",4], ["Associated Gene Name Alias", 5]], :keep_empty => true)
31
31
 
32
- merge_entrez(identifiers, $taxs, $native, proc{|code| code.sub(/SGD:S0/,'S0') }, proc{|code| code.match(/\tS0/)})
32
+ identifiers = merge_entrez(identifiers, $taxs, $native, proc{|code| code.sub(/SGD:S0/,'S0') }, proc{|code| code.match(/\tSGD:S0/)})
33
33
 
34
- merge_biomart(identifiers, $biomart_db, $biomart_main,
34
+ identifiers = merge_biomart(identifiers, $biomart_db, $biomart_main,
35
35
  [['Associated Gene Name' , "external_gene_id"],
36
36
  ['Ensembl Gene ID', "ensembl_gene_id" ],
37
37
  ['Ensembl Protein ID', "ensembl_peptide_id" ],
@@ -50,69 +50,3 @@ end
50
50
 
51
51
  task :default => ['name', 'lexicon', 'identifiers']
52
52
 
53
- #require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
54
- #
55
- #$name = "Saccharomyces cerevisiae"
56
- #
57
- #
58
- #$native_id = "SGD DB Id"
59
- #
60
- #$entrez2native = {
61
- # :tax => 559292,
62
- # :fix => proc{|code| code.sub(/SGD:S0/,'S0') },
63
- # :check => proc{|code| code.match(/^S0/)},
64
- #}
65
- #
66
- #$lexicon = {
67
- # :file => {
68
- # :url => "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature/SGD_features.tab",
69
- # :native => 0,
70
- # :extra => [4,3,5]
71
- # },
72
- # :biomart => {
73
- # :database => 'scerevisiae_gene_ensembl',
74
- # :main => ['Entrez Gene ID', 'entrezgene'],
75
- # :extra => [
76
- # ['Interpro Description' , "interpro_description"],
77
- # ],
78
- # :filter => [],
79
- # }
80
- #
81
- #}
82
- #
83
- #$identifiers = {
84
- # :file => {
85
- # :url => "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature/SGD_features.tab",
86
- # :native => 0,
87
- # :extra => [],
88
- # },
89
- # :biomart => {
90
- # :database => 'scerevisiae_gene_ensembl',
91
- # :main => ['Entrez Gene ID', 'entrezgene'],
92
- # :extra => [
93
- # ['Associated Gene Name' , "external_gene_id"],
94
- # ['Ensembl Gene ID', "ensembl_gene_id" ],
95
- # ['Ensembl Protein ID', "ensembl_peptide_id" ],
96
- # ['RefSeq Protein ID' , "refseq_peptide"] ,
97
- # ['UniProt/SwissProt ID' , "uniprot_swissprot"] ,
98
- # ['UniProt/SwissProt Accession' , "uniprot_swissprot_accession"] ,
99
- # ['Protein ID' , "protein_id"] ,
100
- # ['EMBL (Genbank) ID' , "embl"] ,
101
- # # Affymetrix
102
- # ['Affy yeast 2',"affy_yeast_2"],
103
- # ['Affy yg s98', "affy_yg_s98"],
104
- # ],
105
- # :filter => [],
106
- # }
107
- #}
108
- #
109
- #$go = {
110
- # :url => "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/literature_curation/gene_association.sgd.gz",
111
- # :code => 1,
112
- # :go => 4,
113
- # :pmid => 5,
114
- #}
115
- #
116
- #$query = '"saccharomyces cerevisiae"[All Fields] AND ((("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word]) OR (("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word])) AND hasabstract[text] AND English[lang]'
117
- #
118
- #
@@ -0,0 +1,305 @@
1
+ $biomart_ensembl_gene = ['Ensembl Gene ID', 'ensembl_gene_id'] # presumably a [field name, BioMart key] pair; second argument to BioMart.tsv in the gene-keyed tasks below
2
+ $biomart_ensembl_protein = ['Ensembl Protein ID', 'ensembl_peptide_id'] # second argument to BioMart.tsv in 'protein_sequence'
3
+ $biomart_ensembl_exon = ['Ensembl Exon ID', 'ensembl_exon_id'] # second argument to BioMart.tsv in 'exons'
4
+ $biomart_ensembl_transcript = ['Ensembl Transcript ID', 'ensembl_transcript_id'] # second argument to BioMart.tsv in the transcript-keyed tasks
5
+ $biomart_somatic_variation_id = ['Variation ID', "somatic_reference_id" ] # second argument to BioMart.tsv in the somatic variation tasks
6
+ $biomart_germline_variation_id = ['Variation ID', "external_id" ] # second argument to BioMart.tsv in the germline variation tasks
7
+
8
+ $biomart_gene_positions = [ # third argument to BioMart.tsv in the 'gene_positions' task
9
+ ['Chromosome Name','chromosome_name'],
10
+ ['Strand','strand'],
11
+ ['Gene Start','start_position'],
12
+ ['Gene End','end_position'],
13
+ ]
14
+
15
+ $biomart_gene_sequence = [ # third argument to BioMart.tsv in the 'gene_sequence' task
16
+ ['Gene Sequence','gene_exon_intron'],
17
+ ]
18
+
19
+ #{{{ Transcript
20
+
21
+ $biomart_gene_transcript = [ # third argument to BioMart.tsv in the 'gene_transcripts' task
22
+ $biomart_ensembl_transcript
23
+ ]
24
+
25
+ $biomart_transcript = [ # third argument to BioMart.tsv in the 'transcripts' task
26
+ ['Transcript Start (bp)','transcript_start'],
27
+ ['Transcript End (bp)','transcript_end'],
28
+ $biomart_ensembl_protein,
29
+ $biomart_ensembl_gene
30
+ ]
31
+
32
+ $biomart_transcript_sequence = [ # third argument to BioMart.tsv in the 'transcript_sequence' task
33
+ ['cDNA','cdna'],
34
+ ]
35
+
36
+ $biomart_transcript_3utr = [ # third argument to BioMart.tsv in the 'transcript_3utr' task
37
+ ["3' UTR", '3utr'],
38
+ ]
39
+
40
+ $biomart_transcript_5utr = [ # third argument to BioMart.tsv in the 'transcript_5utr' task
41
+ ["5' UTR", '5utr'],
42
+ ]
43
+
44
+
45
+ $biomart_protein_sequence = [ # third argument to BioMart.tsv in the 'protein_sequence' task
46
+ ['Protein Sequence','peptide'],
47
+ ]
48
+
49
+ #{{{ Exons
50
+
51
+ $biomart_transcript_exons = [ # third argument to BioMart.tsv in the 'transcript_exons' task
52
+ $biomart_ensembl_exon,
53
+ ['Exon Rank in Transcript','rank'],
54
+ ]
55
+
56
+ $biomart_exons = [ # third argument to BioMart.tsv in the 'exons' task
57
+ $biomart_ensembl_gene,
58
+ ['Exon Strand','strand'],
59
+ ['Exon Chr Start','exon_chrom_start'],
60
+ ['Exon Chr End','exon_chrom_end'],
61
+ ]
62
+
63
+ #{{{ Variations
64
+
65
+ $biomart_germline_variation_positions = [ # third argument to BioMart.tsv in the 'germline_variation_positions' task
66
+ ['Chromosome Location (bp)', "chromosome_location" ],
67
+ ['SNP Chromosome Strand', "snp_chromosome_strand" ],
68
+ ['Transcript location (bp)', "transcript_location" ],
69
+ ['Allele', "allele" ],
70
+ ['Protein Allele', "peptide_shift" ],
71
+ ['CDS Start', "cds_start_2076" ],
72
+ ['CDS End', "cds_end_2076" ],
73
+ ]
74
+
75
+ $biomart_germline_variations = [ # third argument to BioMart.tsv in the 'germline_variations' task
76
+ $biomart_ensembl_gene,
77
+ ['Source', "source_name" ],
78
+ ['Validated', "validated" ],
79
+ ['Consequence Type', "synonymous_status" ],
80
+ ]
81
+
82
+ $biomart_somatic_variation_positions = [ # third argument to BioMart.tsv in the 'somatic_variation_positions' task; same field names as the germline list, keys carry a "somatic_" prefix
83
+ ['Chromosome Location (bp)' , "somatic_chromosome_location" ] ,
84
+ ['SNP Chromosome Strand' , "somatic_snp_chromosome_strand" ] ,
85
+ ['Transcript location (bp)' , "somatic_transcript_location" ] ,
86
+ ['Allele' , "somatic_allele" ] ,
87
+ ['Protein Allele' , "somatic_peptide_shift" ] ,
88
+ ['CDS Start' , "somatic_cds_start_2076" ] ,
89
+ ['CDS End' , "somatic_cds_end_2076" ] ,
90
+ ]
91
+
92
+ $biomart_somatic_variations = [ # third argument to BioMart.tsv in the 'somatic_variations' task
93
+ $biomart_ensembl_gene,
94
+ ['Source' , "somatic_source_name" ] ,
95
+ ['Validated' , "somatic_validated" ] ,
96
+ ['Consequence Type' , "somatic_synonymous_status" ] ,
97
+ ]
98
+
99
+ #{{{ Rules
100
+
101
+ file 'scientific_name' do |t|
102
+ File.open(t.name, 'w') do |f| f.write $scientific_name end
103
+ end
104
+
105
+ file 'identifiers' do |t|
106
+ identifiers = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_identifiers, [])
107
+ $biomart_identifiers.each do |name, key, prefix|
108
+ if prefix
109
+ identifiers.process name do |field, key, values| field.each{|v| v.replace "#{prefix}:#{v}"} end
110
+ end
111
+ end
112
+
113
+ File.open(t.name, 'w') do |f| f.puts identifiers end
114
+ end
115
+
116
+ file 'gene_transcripts' do |t|
117
+ transcripts = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_gene_transcript, [], nil, :type => :flat)
118
+
119
+ File.open(t.name, 'w') do |f| f.puts transcripts end
120
+ end
121
+
122
+ file 'transcripts' => 'gene_positions' do |t|
123
+ transcripts = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript, [], nil, :type => :list)
124
+ transcripts.attach TSV.new('gene_positions'), "Chromosome Name"
125
+
126
+ File.open(t.name, 'w') do |f| f.puts transcripts end
127
+ end
128
+
129
+ file 'transcript_3utr' do |t|
130
+ utrs = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_3utr, [], nil, :type => :flat, :merge => true)
131
+
132
+ File.open(t.name, 'w') do |f|
133
+ f.puts "#: :type=:single#cast=to_i"
134
+ f.puts "#Ensembl Transcript ID\t3' UTR Length"
135
+ utrs.each do |seq,trans|
136
+ trans.each do |tran|
137
+ f.puts [tran, seq.length] * "\t"
138
+ end
139
+ end
140
+ end
141
+ end
142
+
143
+
144
+ file 'transcript_5utr' do |t|
145
+ utrs = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_5utr, [], nil, :type => :flat, :merge => true)
146
+
147
+ File.open(t.name, 'w') do |f|
148
+ f.puts "#: :type=:single#cast=to_i"
149
+ f.puts "#Ensembl Transcript ID\t5' UTR Length"
150
+ utrs.each do |seq,trans|
151
+ trans.each do |tran|
152
+ f.puts [tran, seq.length] * "\t"
153
+ end
154
+ end
155
+ end
156
+ end
157
+
158
+ file 'gene_positions' do |t|
159
+ sequences = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_gene_positions, [])
160
+
161
+ File.open(t.name, 'w') do |f| f.puts sequences end
162
+ end
163
+
164
+ file 'gene_sequence' do |t|
165
+ sequences = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_gene_sequence, [], nil, :type => :flat, :merge => true)
166
+
167
+ File.open(t.name, 'w') do |f|
168
+ f.puts "#: :type=:single"
169
+ f.puts "#Ensembl Gene ID\tProtein Sequence"
170
+ sequences.each do |seq, genes|
171
+ genes.each do |gene|
172
+ f.write gene
173
+ f.write "\t"
174
+ f.write seq
175
+ f.write "\n"
176
+ end
177
+ end
178
+ end
179
+ end
180
+
181
+ file 'protein_sequence' do |t|
182
+ sequences = BioMart.tsv($biomart_db, $biomart_ensembl_protein, $biomart_protein_sequence, [], nil, :type => :flat, :merge => true)
183
+
184
+ File.open(t.name, 'w') do |f|
185
+ f.puts "#: :type=:single"
186
+ f.puts "#Ensembl Protein ID\tProtein Sequence"
187
+ sequences.each do |seq, genes|
188
+ genes.each do |gene|
189
+ f.write gene
190
+ f.write "\t"
191
+ f.write seq
192
+ f.write "\n"
193
+ end
194
+ end
195
+ end
196
+
197
+ end
198
+
199
+ file 'exons' => 'gene_positions' do |t|
200
+ exons = BioMart.tsv($biomart_db, $biomart_ensembl_exon, $biomart_exons, [], nil, :merge => false, :type => :list)
201
+ exons.attach TSV.new('gene_positions'), "Chromosome Name"
202
+
203
+ File.open(t.name, 'w') do |f| f.puts exons end
204
+ end
205
+
206
+ file 'transcript_exons' do |t|
207
+ exons = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_exons, [], nil, :keep_empty => true)
208
+
209
+ File.open(t.name, 'w') do |f| f.puts exons end
210
+ end
211
+
212
+ file 'transcript_sequence' do |t|
213
+ sequences = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_sequence, [], nil, :type => :flat, :merge => true)
214
+
215
+ File.open(t.name, 'w') do |f|
216
+ f.puts "#: :type=:single"
217
+ f.puts "#Ensembl Transcript ID\tProtein Sequence"
218
+ sequences.each do |seq, genes|
219
+ genes.each do |gene|
220
+ f.write gene
221
+ f.write "\t"
222
+ f.write seq
223
+ f.write "\n"
224
+ end
225
+ end
226
+ end
227
+ end
228
+
229
+
230
+ $biomart_variation_filter = ["snptype_filters", "COMPLEX_INDEL,COMPLEX_INDEL&NMD_TRANSCRIPT,COMPLEX_INDEL&SPLICE_SITE,ESSENTIAL_SPLICE_SITE&INTRONIC,ESSENTIAL_SPLICE_SITE&INTRONIC&NMD_TRANSCRIPT,FRAMESHIFT_CODING,FRAMESHIFT_CODING&NMD_TRANSCRIPT,FRAMESHIFT_CODING&SPLICE_SITE,FRAMESHIFT_CODING&SPLICE_SITE&NMD_TRANSCRIPT,NON_SYNONYMOUS_CODING,NON_SYNONYMOUS_CODING&NMD_TRANSCRIPT,NON_SYNONYMOUS_CODING&SPLICE_SITE,NON_SYNONYMOUS_CODING&SPLICE_SITE&NMD_TRANSCRIPT,REGULATORY_REGION,SPLICE_SITE&3PRIME_UTR,SPLICE_SITE&3PRIME_UTR&NMD_TRANSCRIPT,SPLICE_SITE&5PRIME_UTR,SPLICE_SITE&5PRIME_UTR&NMD_TRANSCRIPT,SPLICE_SITE&INTRONIC,SPLICE_SITE&INTRONIC&NMD_TRANSCRIPT,SPLICE_SITE&SYNONYMOUS_CODING,SPLICE_SITE&SYNONYMOUS_CODING&NMD_TRANSCRIPT,STOP_GAINED,STOP_GAINED&FRAMESHIFT_CODING,STOP_GAINED&FRAMESHIFT_CODING&NMD_TRANSCRIPT,STOP_GAINED&NMD_TRANSCRIPT,STOP_GAINED&SPLICE_SITE,STOP_GAINED&SPLICE_SITE&NMD_TRANSCRIPT,STOP_LOST,STOP_LOST&NMD_TRANSCRIPT,STOP_LOST&SPLICE_SITE,STOP_LOST&SPLICE_SITE&NMD_TRANSCRIPT,SYNONYMOUS_CODING,SYNONYMOUS_CODING&NMD_TRANSCRIPT"]
231
+ #$biomart_variation_filter = ["snptype_filters", "COMPLEX_INDEL,SYNONYMOUS_CODING"] # shorter alternative, left disabled
232
+ $biomart_variation_filter = ["snptype_filters", 'COMPLEX_INDEL&NMD_TRANSCRIPT'] # NOTE(review): replaces the full list assigned just above; none of the tasks visible in this file pass this filter to BioMart.tsv (they all pass []) - confirm which value is intended
233
+
234
+ file 'germline_variations' do |t|
235
+ variations = BioMart.tsv($biomart_db, $biomart_germline_variation_id, $biomart_germline_variations, [], nil, :keep_empty => true, :type => :list, :merge => false)
236
+ File.open(t.name, 'w') do |f| f.puts variations.to_s end
237
+ end
238
+
239
+ file 'germline_variation_positions' do |t|
240
+ variations = BioMart.tsv($biomart_db, $biomart_germline_variation_id, $biomart_germline_variation_positions, [], nil, :keep_empty => true, :type => :list, :merge => false)
241
+ File.open(t.name, 'w') do |f| f.puts variations.to_s end
242
+ end
243
+
244
+ file 'somatic_variations' do |t|
245
+ variations = BioMart.tsv($biomart_db, $biomart_somatic_variation_id, $biomart_somatic_variations, [], nil, :keep_empty => true, :type => :list, :merge => false)
246
+ File.open(t.name, 'w') do |f| f.puts variations.to_s end
247
+ end
248
+
249
+ file 'somatic_variation_positions' do |t|
250
+ variations = BioMart.tsv($biomart_db, $biomart_somatic_variation_id, $biomart_somatic_variation_positions, [], nil, :keep_empty => true, :type => :list, :merge => false)
251
+ File.open(t.name, 'w') do |f| f.puts variations.to_s end
252
+ end
253
+
254
+ file 'gene_pmids' do |t|
255
+ tsv = Entrez.entrez2pubmed($taxs)
256
+ text = "#Entrez Gene ID\tPMID"
257
+ tsv.each do |gene, pmids|
258
+ text << "\n" << gene << "\t" << pmids * "|"
259
+ end
260
+ Open.write(t.name, text)
261
+ end
262
+
263
+ file 'exon_offsets' => %w(exons transcript_exons gene_transcripts transcripts transcript_exons) do |t| # writes, per exon, its transcripts and the exon's offset in each; NOTE(review): 'transcript_exons' is listed twice in the prerequisites
264
+ require 'rbbt/sources/organism/sequence'
265
+
266
+ exons = TSV.new('exons', :persistence => true)
267
+ exon_transcripts = TSV.new('transcript_exons', :double, :key => "Ensembl Exon ID", :fields => ["Ensembl Transcript ID"], :merge => true, :persistence => true ) # reads 'transcript_exons' with the exon ID as key
268
+ gene_transcripts = TSV.new('gene_transcripts', :flat, :persistence => true ) # NOTE(review): not referenced again in this block
269
+ transcript_info = TSV.new('transcripts', :list, :persistence => true )
270
+ transcript_exons = TSV.new('transcript_exons', :double, :fields => ["Ensembl Exon ID","Exon Rank in Transcript"], :persistence => true )
271
+
272
+
273
+ string = "#Ensembl Exon ID\tEnsembl Transcript ID\tOffset\n"
274
+ exons.each do |exon, info|
275
+ gene, start, finish, strand, chr = info # NOTE(review): none of these locals are used below; their order also differs from $biomart_exons (gene, strand, start, end) - confirm before relying on them
276
+
277
+ transcripts = Organism::Hsa.coding_transcripts_for_exon(exon, exon_transcripts, transcript_info) # NOTE(review): Organism::Hsa is hard-coded; confirm whether other organisms load this helper file
278
+
279
+ transcript_offsets = {}
280
+ transcripts.each do |transcript|
281
+ offset = Organism::Hsa.exon_offset_in_transcript(exon, transcript, exons, transcript_exons)
282
+ transcript_offsets[transcript] = offset unless offset.nil? # transcripts for which no offset is returned are left out
283
+ end
284
+
285
+ string << exon << "\t" << transcript_offsets.keys * "|" << "\t" << transcript_offsets.values * "|" << "\n" # transcript IDs and offsets are "|"-joined in matching order
286
+ end
287
+
288
+ Open.write(t.name, string)
289
+ end
290
+
291
+ rule /[a-z]{3}[0-9]{4}\/.*/i do |t|
292
+ t.name =~ /([a-z]{3}[0-9]{4})\/(.*)/i
293
+ archive = $1
294
+ task = $2
295
+ old_pwd = FileUtils.pwd
296
+ begin
297
+ FileUtils.mkdir archive unless File.exists? archive
298
+ FileUtils.cd File.join(archive)
299
+ BioMart.set_archive archive
300
+ Rake::Task[task].invoke
301
+ BioMart.unset_archive
302
+ ensure
303
+ FileUtils.cd old_pwd
304
+ end
305
+ end