rbbt-sources 0.2.2 → 0.3.1

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions exactly as they appear in their respective public registries.
@@ -1,10 +1,10 @@
1
1
  require 'rbbt'
2
2
 
3
3
  module Polysearch
4
- Rbbt.claim "organ" ,'http://wishart.biology.ualberta.ca/polysearch/include/organ_ID.txt', 'Polysearch'
5
- Rbbt.claim "tissue" ,'http://wishart.biology.ualberta.ca/polysearch/include/tissue_ID.txt', 'Polysearch'
6
- Rbbt.claim "location" ,'http://wishart.biology.ualberta.ca/polysearch/include/subcellular_localization_ID.txt', 'Polysearch'
7
- Rbbt.claim "disease" ,'http://wishart.biology.ualberta.ca/polysearch/include/disease_IDlist.txt', 'Polysearch'
8
- Rbbt.claim "drug" ,'http://wishart.biology.ualberta.ca/polysearch/include/drugnames.txt', 'Polysearch'
4
+ Rbbt.share.Polysearch.organ.define_as_url 'http://wishart.biology.ualberta.ca/polysearch/include/organ_ID.txt'
5
+ Rbbt.share.Polysearch.tissue.define_as_url 'http://wishart.biology.ualberta.ca/polysearch/include/tissue_ID.txt'
6
+ Rbbt.share.Polysearch.location.define_as_url 'http://wishart.biology.ualberta.ca/polysearch/include/subcellular_localization_ID.txt'
7
+ Rbbt.share.Polysearch.disease.define_as_url 'http://wishart.biology.ualberta.ca/polysearch/include/disease_IDlist.txt'
8
+ Rbbt.share.Polysearch.drug.define_as_url 'http://wishart.biology.ualberta.ca/polysearch/include/drugnames.txt'
9
9
  end
10
10
 
@@ -3,11 +3,11 @@ require 'rbbt/sources/biomart'
3
3
  require 'rbbt/sources/entrez'
4
4
  require File.join(File.dirname(__FILE__), '../../lib/helpers')
5
5
 
6
- $taxs = [559292,4932]
7
- $native = "SGD ID"
8
- $url = "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature/SGD_features.tab"
6
+ $taxs = [9606]
7
+ $scientific_name = "Homo sapiens"
8
+
9
9
  $biomart_db = 'hsapiens_gene_ensembl'
10
- $biomart_main = ['Entrez Gene ID', 'entrezgene']
10
+
11
11
  $biomart_lexicon = [
12
12
  [ 'Associated Gene Name' , "external_gene_id"],
13
13
  [ 'HGNC symbol', "hgnc_symbol" ],
@@ -16,7 +16,7 @@ $biomart_lexicon = [
16
16
  ]
17
17
 
18
18
  $biomart_identifiers = [
19
- [ 'Ensembl Gene ID', "ensembl_gene_id" ],
19
+ [ 'Entrez Gene ID', "entrezgene"],
20
20
  [ 'Ensembl Protein ID', "ensembl_peptide_id" ],
21
21
  [ 'Associated Gene Name', "external_gene_id" ],
22
22
  [ 'CCDS ID', "ccds" ],
@@ -52,66 +52,5 @@ $biomart_identifiers = [
52
52
  [ 'Illumina HumanWG 6 v3', 'illumina_humanwg_6_v3' ],
53
53
  ]
54
54
 
55
- $biomart_positions = [
56
- ['Chromosome Name','chromosome_name'],
57
- ['Strand','strand'],
58
- ['Gene Start','start_position'],
59
- ['Gene End','end_position'],
60
- ['Transcript Start','transcript_start'],
61
- ['Transcript End','transcript_end'],
62
- ]
63
-
64
-
65
-
66
- file 'scientific_name' do |t|
67
- File.open(t.name, 'w') do |f| f.puts "Homo sapiens" end
68
- end
69
-
70
- file 'lexicon' do |t|
71
- lexicon = tsv_file('http://www.genenames.org/cgi-bin/hgnc_downloads.cgi?title=HGNC+output+data&hgnc_dbtag=on&col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_prev_name&col=gd_aliases&col=gd_name_aliases&col=gd_pub_acc_ids&status=Approved&status_opt=2&level=pri&=on&where=&order_by=gd_app_sym_sort&limit=&format=text&submit=submit&.cgifields=&.cgifields=level&.cgifields=chr&.cgifields=status&.cgifields=hgnc_dbtag',
72
- "HGNC ID", nil, :flatten => true, :header_hash => '')
73
- merge_biomart lexicon, $biomart_db, $biomart_main, $biomart_lexicon, "HGNC ID"
74
-
75
- File.open(t.name, 'w') do |f| f.puts lexicon end
76
- end
77
-
78
- file 'identifiers' do |t|
79
- identifiers = BioMart.tsv($biomart_db, $biomart_main, $biomart_identifiers)
80
- $biomart_identifiers.each do |name, key, prefix|
81
- if prefix
82
- identifiers.process name do |field, key, values| field.each{|v| v.replace "#{prefix}:#{v}"} end
83
- end
84
- end
85
-
86
- File.open(t.name, 'w') do |f| f.puts identifiers end
87
- end
88
-
89
- file 'gene_go' do |t|
90
- url = "http://cvsweb.geneontology.org/cgi-bin/cvsweb.cgi/go/gene-associations/gene_association.goa_human.gz?rev=HEAD"
91
- tsv = TSV.new(Open.open(url, :gzip => true), :native => 2, :extra => 4)
92
-
93
- index = TSV.index(Organism::Hsa.identifiers, :persistence => true)
94
- new = TSV.new({})
95
- tsv.through do |key, values|
96
- next if index[key].nil?
97
- new_key = index[key].first
98
- new[new_key] = values
99
- end
100
-
101
-
102
- new.key_field = "Associated Gene Name"
103
- new.fields = ["GO Term"]
104
- Open.write(t.name, new.to_s)
105
- end
106
-
107
- file 'gene_positions' do |t|
108
- BioMart.set_archive('may2009')
109
- positions = BioMart.tsv($biomart_db, $biomart_main, $biomart_positions)
110
- BioMart.unset_archive
111
-
112
- Open.write(t.name, positions.to_s)
113
- end
114
-
115
- task :default => ['name', 'lexicon', 'identifiers', 'gene_positions']
116
-
117
-
55
+ $namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
56
+ load File.join(File.dirname(__FILE__), '../organism_helpers.rb')
@@ -17,9 +17,9 @@ end
17
17
  file 'lexicon' do |t|
18
18
  lexicon = tsv_file($url, [$native, 0], [3, 4, 5], :keep_empty => true)
19
19
 
20
- merge_entrez(lexicon, $taxs, $native, proc{|code| code.sub(/SGD:S0/,'S0') }, proc{|code| code.match(/\tS0/)})
20
+ lexicon = merge_entrez(lexicon, $taxs, $native, proc{|code| code.sub(/SGD:S0/,'S0') }, proc{|code| code.match(/\tSGD:S0/)})
21
21
 
22
- merge_biomart(lexicon, $biomart_db, $biomart_main, [['Interpro Description' , "interpro_description"]])
22
+ lexicon = merge_biomart(lexicon, $biomart_db, $biomart_main, [['Interpro Description' , "interpro_description"]])
23
23
 
24
24
  lexicon = lexicon.slice(lexicon.fields - ["Entrez Gene ID"])
25
25
 
@@ -29,9 +29,9 @@ end
29
29
  file 'identifiers' do |t|
30
30
  identifiers = tsv_file($url, [$native, 0], [["Ensembl Gene ID", 3], ["Associated Gene Name",4], ["Associated Gene Name Alias", 5]], :keep_empty => true)
31
31
 
32
- merge_entrez(identifiers, $taxs, $native, proc{|code| code.sub(/SGD:S0/,'S0') }, proc{|code| code.match(/\tS0/)})
32
+ identifiers = merge_entrez(identifiers, $taxs, $native, proc{|code| code.sub(/SGD:S0/,'S0') }, proc{|code| code.match(/\tSGD:S0/)})
33
33
 
34
- merge_biomart(identifiers, $biomart_db, $biomart_main,
34
+ identifiers = merge_biomart(identifiers, $biomart_db, $biomart_main,
35
35
  [['Associated Gene Name' , "external_gene_id"],
36
36
  ['Ensembl Gene ID', "ensembl_gene_id" ],
37
37
  ['Ensembl Protein ID', "ensembl_peptide_id" ],
@@ -50,69 +50,3 @@ end
50
50
 
51
51
  task :default => ['name', 'lexicon', 'identifiers']
52
52
 
53
- #require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
54
- #
55
- #$name = "Saccharomyces cerevisiae"
56
- #
57
- #
58
- #$native_id = "SGD DB Id"
59
- #
60
- #$entrez2native = {
61
- # :tax => 559292,
62
- # :fix => proc{|code| code.sub(/SGD:S0/,'S0') },
63
- # :check => proc{|code| code.match(/^S0/)},
64
- #}
65
- #
66
- #$lexicon = {
67
- # :file => {
68
- # :url => "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature/SGD_features.tab",
69
- # :native => 0,
70
- # :extra => [4,3,5]
71
- # },
72
- # :biomart => {
73
- # :database => 'scerevisiae_gene_ensembl',
74
- # :main => ['Entrez Gene ID', 'entrezgene'],
75
- # :extra => [
76
- # ['Interpro Description' , "interpro_description"],
77
- # ],
78
- # :filter => [],
79
- # }
80
- #
81
- #}
82
- #
83
- #$identifiers = {
84
- # :file => {
85
- # :url => "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature/SGD_features.tab",
86
- # :native => 0,
87
- # :extra => [],
88
- # },
89
- # :biomart => {
90
- # :database => 'scerevisiae_gene_ensembl',
91
- # :main => ['Entrez Gene ID', 'entrezgene'],
92
- # :extra => [
93
- # ['Associated Gene Name' , "external_gene_id"],
94
- # ['Ensembl Gene ID', "ensembl_gene_id" ],
95
- # ['Ensembl Protein ID', "ensembl_peptide_id" ],
96
- # ['RefSeq Protein ID' , "refseq_peptide"] ,
97
- # ['UniProt/SwissProt ID' , "uniprot_swissprot"] ,
98
- # ['UniProt/SwissProt Accession' , "uniprot_swissprot_accession"] ,
99
- # ['Protein ID' , "protein_id"] ,
100
- # ['EMBL (Genbank) ID' , "embl"] ,
101
- # # Affymetrix
102
- # ['Affy yeast 2',"affy_yeast_2"],
103
- # ['Affy yg s98', "affy_yg_s98"],
104
- # ],
105
- # :filter => [],
106
- # }
107
- #}
108
- #
109
- #$go = {
110
- # :url => "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/literature_curation/gene_association.sgd.gz",
111
- # :code => 1,
112
- # :go => 4,
113
- # :pmid => 5,
114
- #}
115
- #
116
- #$query = '"saccharomyces cerevisiae"[All Fields] AND ((("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word]) OR (("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word])) AND hasabstract[text] AND English[lang]'
117
- #
118
- #
@@ -0,0 +1,305 @@
1
+ $biomart_ensembl_gene = ['Ensembl Gene ID', 'ensembl_gene_id']
2
+ $biomart_ensembl_protein = ['Ensembl Protein ID', 'ensembl_peptide_id']
3
+ $biomart_ensembl_exon = ['Ensembl Exon ID', 'ensembl_exon_id']
4
+ $biomart_ensembl_transcript = ['Ensembl Transcript ID', 'ensembl_transcript_id']
5
+ $biomart_somatic_variation_id = ['Variation ID', "somatic_reference_id" ]
6
+ $biomart_germline_variation_id = ['Variation ID', "external_id" ]
7
+
8
+ $biomart_gene_positions = [
9
+ ['Chromosome Name','chromosome_name'],
10
+ ['Strand','strand'],
11
+ ['Gene Start','start_position'],
12
+ ['Gene End','end_position'],
13
+ ]
14
+
15
+ $biomart_gene_sequence = [
16
+ ['Gene Sequence','gene_exon_intron'],
17
+ ]
18
+
19
+ #{{{ Transcript
20
+
21
+ $biomart_gene_transcript = [
22
+ $biomart_ensembl_transcript
23
+ ]
24
+
25
+ $biomart_transcript = [
26
+ ['Transcript Start (bp)','transcript_start'],
27
+ ['Transcript End (bp)','transcript_end'],
28
+ $biomart_ensembl_protein,
29
+ $biomart_ensembl_gene
30
+ ]
31
+
32
+ $biomart_transcript_sequence = [
33
+ ['cDNA','cdna'],
34
+ ]
35
+
36
+ $biomart_transcript_3utr = [
37
+ ["3' UTR", '3utr'],
38
+ ]
39
+
40
+ $biomart_transcript_5utr = [
41
+ ["5' UTR", '5utr'],
42
+ ]
43
+
44
+
45
+ $biomart_protein_sequence = [
46
+ ['Protein Sequence','peptide'],
47
+ ]
48
+
49
+ #{{{ Exons
50
+
51
+ $biomart_transcript_exons = [
52
+ $biomart_ensembl_exon,
53
+ ['Exon Rank in Transcript','rank'],
54
+ ]
55
+
56
+ $biomart_exons = [
57
+ $biomart_ensembl_gene,
58
+ ['Exon Strand','strand'],
59
+ ['Exon Chr Start','exon_chrom_start'],
60
+ ['Exon Chr End','exon_chrom_end'],
61
+ ]
62
+
63
+ #{{{ Variations
64
+
65
+ $biomart_germline_variation_positions = [
66
+ ['Chromosome Location (bp)', "chromosome_location" ],
67
+ ['SNP Chromosome Strand', "snp_chromosome_strand" ],
68
+ ['Transcript location (bp)', "transcript_location" ],
69
+ ['Allele', "allele" ],
70
+ ['Protein Allele', "peptide_shift" ],
71
+ ['CDS Start', "cds_start_2076" ],
72
+ ['CDS End', "cds_end_2076" ],
73
+ ]
74
+
75
+ $biomart_germline_variations = [
76
+ $biomart_ensembl_gene,
77
+ ['Source', "source_name" ],
78
+ ['Validated', "validated" ],
79
+ ['Consequence Type', "synonymous_status" ],
80
+ ]
81
+
82
+ $biomart_somatic_variation_positions = [
83
+ ['Chromosome Location (bp)' , "somatic_chromosome_location" ] ,
84
+ ['SNP Chromosome Strand' , "somatic_snp_chromosome_strand" ] ,
85
+ ['Transcript location (bp)' , "somatic_transcript_location" ] ,
86
+ ['Allele' , "somatic_allele" ] ,
87
+ ['Protein Allele' , "somatic_peptide_shift" ] ,
88
+ ['CDS Start' , "somatic_cds_start_2076" ] ,
89
+ ['CDS End' , "somatic_cds_end_2076" ] ,
90
+ ]
91
+
92
+ $biomart_somatic_variations = [
93
+ $biomart_ensembl_gene,
94
+ ['Source' , "somatic_source_name" ] ,
95
+ ['Validated' , "somatic_validated" ] ,
96
+ ['Consequence Type' , "somatic_synonymous_status" ] ,
97
+ ]
98
+
99
+ #{{{ Rules
100
+
101
+ file 'scientific_name' do |t|
102
+ File.open(t.name, 'w') do |f| f.write $scientific_name end
103
+ end
104
+
105
+ file 'identifiers' do |t|
106
+ identifiers = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_identifiers, [])
107
+ $biomart_identifiers.each do |name, key, prefix|
108
+ if prefix
109
+ identifiers.process name do |field, key, values| field.each{|v| v.replace "#{prefix}:#{v}"} end
110
+ end
111
+ end
112
+
113
+ File.open(t.name, 'w') do |f| f.puts identifiers end
114
+ end
115
+
116
+ file 'gene_transcripts' do |t|
117
+ transcripts = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_gene_transcript, [], nil, :type => :flat)
118
+
119
+ File.open(t.name, 'w') do |f| f.puts transcripts end
120
+ end
121
+
122
+ file 'transcripts' => 'gene_positions' do |t|
123
+ transcripts = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript, [], nil, :type => :list)
124
+ transcripts.attach TSV.new('gene_positions'), "Chromosome Name"
125
+
126
+ File.open(t.name, 'w') do |f| f.puts transcripts end
127
+ end
128
+
129
+ file 'transcript_3utr' do |t|
130
+ utrs = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_3utr, [], nil, :type => :flat, :merge => true)
131
+
132
+ File.open(t.name, 'w') do |f|
133
+ f.puts "#: :type=:single#cast=to_i"
134
+ f.puts "#Ensembl Transcript ID\t3' UTR Length"
135
+ utrs.each do |seq,trans|
136
+ trans.each do |tran|
137
+ f.puts [tran, seq.length] * "\t"
138
+ end
139
+ end
140
+ end
141
+ end
142
+
143
+
144
+ file 'transcript_5utr' do |t|
145
+ utrs = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_5utr, [], nil, :type => :flat, :merge => true)
146
+
147
+ File.open(t.name, 'w') do |f|
148
+ f.puts "#: :type=:single#cast=to_i"
149
+ f.puts "#Ensembl Transcript ID\t5' UTR Length"
150
+ utrs.each do |seq,trans|
151
+ trans.each do |tran|
152
+ f.puts [tran, seq.length] * "\t"
153
+ end
154
+ end
155
+ end
156
+ end
157
+
158
+ file 'gene_positions' do |t|
159
+ sequences = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_gene_positions, [])
160
+
161
+ File.open(t.name, 'w') do |f| f.puts sequences end
162
+ end
163
+
164
+ file 'gene_sequence' do |t|
165
+ sequences = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_gene_sequence, [], nil, :type => :flat, :merge => true)
166
+
167
+ File.open(t.name, 'w') do |f|
168
+ f.puts "#: :type=:single"
169
+ f.puts "#Ensembl Gene ID\tProtein Sequence"
170
+ sequences.each do |seq, genes|
171
+ genes.each do |gene|
172
+ f.write gene
173
+ f.write "\t"
174
+ f.write seq
175
+ f.write "\n"
176
+ end
177
+ end
178
+ end
179
+ end
180
+
181
+ file 'protein_sequence' do |t|
182
+ sequences = BioMart.tsv($biomart_db, $biomart_ensembl_protein, $biomart_protein_sequence, [], nil, :type => :flat, :merge => true)
183
+
184
+ File.open(t.name, 'w') do |f|
185
+ f.puts "#: :type=:single"
186
+ f.puts "#Ensembl Protein ID\tProtein Sequence"
187
+ sequences.each do |seq, genes|
188
+ genes.each do |gene|
189
+ f.write gene
190
+ f.write "\t"
191
+ f.write seq
192
+ f.write "\n"
193
+ end
194
+ end
195
+ end
196
+
197
+ end
198
+
199
+ file 'exons' => 'gene_positions' do |t|
200
+ exons = BioMart.tsv($biomart_db, $biomart_ensembl_exon, $biomart_exons, [], nil, :merge => false, :type => :list)
201
+ exons.attach TSV.new('gene_positions'), "Chromosome Name"
202
+
203
+ File.open(t.name, 'w') do |f| f.puts exons end
204
+ end
205
+
206
+ file 'transcript_exons' do |t|
207
+ exons = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_exons, [], nil, :keep_empty => true)
208
+
209
+ File.open(t.name, 'w') do |f| f.puts exons end
210
+ end
211
+
212
+ file 'transcript_sequence' do |t|
213
+ sequences = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_sequence, [], nil, :type => :flat, :merge => true)
214
+
215
+ File.open(t.name, 'w') do |f|
216
+ f.puts "#: :type=:single"
217
+ f.puts "#Ensembl Transcript ID\tProtein Sequence"
218
+ sequences.each do |seq, genes|
219
+ genes.each do |gene|
220
+ f.write gene
221
+ f.write "\t"
222
+ f.write seq
223
+ f.write "\n"
224
+ end
225
+ end
226
+ end
227
+ end
228
+
229
+
230
+ $biomart_variation_filter = ["snptype_filters", "COMPLEX_INDEL,COMPLEX_INDEL&NMD_TRANSCRIPT,COMPLEX_INDEL&SPLICE_SITE,ESSENTIAL_SPLICE_SITE&INTRONIC,ESSENTIAL_SPLICE_SITE&INTRONIC&NMD_TRANSCRIPT,FRAMESHIFT_CODING,FRAMESHIFT_CODING&NMD_TRANSCRIPT,FRAMESHIFT_CODING&SPLICE_SITE,FRAMESHIFT_CODING&SPLICE_SITE&NMD_TRANSCRIPT,NON_SYNONYMOUS_CODING,NON_SYNONYMOUS_CODING&NMD_TRANSCRIPT,NON_SYNONYMOUS_CODING&SPLICE_SITE,NON_SYNONYMOUS_CODING&SPLICE_SITE&NMD_TRANSCRIPT,REGULATORY_REGION,SPLICE_SITE&3PRIME_UTR,SPLICE_SITE&3PRIME_UTR&NMD_TRANSCRIPT,SPLICE_SITE&5PRIME_UTR,SPLICE_SITE&5PRIME_UTR&NMD_TRANSCRIPT,SPLICE_SITE&INTRONIC,SPLICE_SITE&INTRONIC&NMD_TRANSCRIPT,SPLICE_SITE&SYNONYMOUS_CODING,SPLICE_SITE&SYNONYMOUS_CODING&NMD_TRANSCRIPT,STOP_GAINED,STOP_GAINED&FRAMESHIFT_CODING,STOP_GAINED&FRAMESHIFT_CODING&NMD_TRANSCRIPT,STOP_GAINED&NMD_TRANSCRIPT,STOP_GAINED&SPLICE_SITE,STOP_GAINED&SPLICE_SITE&NMD_TRANSCRIPT,STOP_LOST,STOP_LOST&NMD_TRANSCRIPT,STOP_LOST&SPLICE_SITE,STOP_LOST&SPLICE_SITE&NMD_TRANSCRIPT,SYNONYMOUS_CODING,SYNONYMOUS_CODING&NMD_TRANSCRIPT"]
231
+ #$biomart_variation_filter = ["snptype_filters", "COMPLEX_INDEL,SYNONYMOUS_CODING"]
232
+ $biomart_variation_filter = ["snptype_filters", 'COMPLEX_INDEL&NMD_TRANSCRIPT']
233
+
234
+ file 'germline_variations' do |t|
235
+ variations = BioMart.tsv($biomart_db, $biomart_germline_variation_id, $biomart_germline_variations, [], nil, :keep_empty => true, :type => :list, :merge => false)
236
+ File.open(t.name, 'w') do |f| f.puts variations.to_s end
237
+ end
238
+
239
+ file 'germline_variation_positions' do |t|
240
+ variations = BioMart.tsv($biomart_db, $biomart_germline_variation_id, $biomart_germline_variation_positions, [], nil, :keep_empty => true, :type => :list, :merge => false)
241
+ File.open(t.name, 'w') do |f| f.puts variations.to_s end
242
+ end
243
+
244
+ file 'somatic_variations' do |t|
245
+ variations = BioMart.tsv($biomart_db, $biomart_somatic_variation_id, $biomart_somatic_variations, [], nil, :keep_empty => true, :type => :list, :merge => false)
246
+ File.open(t.name, 'w') do |f| f.puts variations.to_s end
247
+ end
248
+
249
+ file 'somatic_variation_positions' do |t|
250
+ variations = BioMart.tsv($biomart_db, $biomart_somatic_variation_id, $biomart_somatic_variation_positions, [], nil, :keep_empty => true, :type => :list, :merge => false)
251
+ File.open(t.name, 'w') do |f| f.puts variations.to_s end
252
+ end
253
+
254
+ file 'gene_pmids' do |t|
255
+ tsv = Entrez.entrez2pubmed($taxs)
256
+ text = "#Entrez Gene ID\tPMID"
257
+ tsv.each do |gene, pmids|
258
+ text << "\n" << gene << "\t" << pmids * "|"
259
+ end
260
+ Open.write(t.name, text)
261
+ end
262
+
263
+ file 'exon_offsets' => %w(exons transcript_exons gene_transcripts transcripts transcript_exons) do |t|
264
+ require 'rbbt/sources/organism/sequence'
265
+
266
+ exons = TSV.new('exons', :persistence => true)
267
+ exon_transcripts = TSV.new('transcript_exons', :double, :key => "Ensembl Exon ID", :fields => ["Ensembl Transcript ID"], :merge => true, :persistence => true )
268
+ gene_transcripts = TSV.new('gene_transcripts', :flat, :persistence => true )
269
+ transcript_info = TSV.new('transcripts', :list, :persistence => true )
270
+ transcript_exons = TSV.new('transcript_exons', :double, :fields => ["Ensembl Exon ID","Exon Rank in Transcript"], :persistence => true )
271
+
272
+
273
+ string = "#Ensembl Exon ID\tEnsembl Transcript ID\tOffset\n"
274
+ exons.each do |exon, info|
275
+ gene, start, finish, strand, chr = info
276
+
277
+ transcripts = Organism::Hsa.coding_transcripts_for_exon(exon, exon_transcripts, transcript_info)
278
+
279
+ transcript_offsets = {}
280
+ transcripts.each do |transcript|
281
+ offset = Organism::Hsa.exon_offset_in_transcript(exon, transcript, exons, transcript_exons)
282
+ transcript_offsets[transcript] = offset unless offset.nil?
283
+ end
284
+
285
+ string << exon << "\t" << transcript_offsets.keys * "|" << "\t" << transcript_offsets.values * "|" << "\n"
286
+ end
287
+
288
+ Open.write(t.name, string)
289
+ end
290
+
291
+ rule /[a-z]{3}[0-9]{4}\/.*/i do |t|
292
+ t.name =~ /([a-z]{3}[0-9]{4})\/(.*)/i
293
+ archive = $1
294
+ task = $2
295
+ old_pwd = FileUtils.pwd
296
+ begin
297
+ FileUtils.mkdir archive unless File.exists? archive
298
+ FileUtils.cd File.join(archive)
299
+ BioMart.set_archive archive
300
+ Rake::Task[task].invoke
301
+ BioMart.unset_archive
302
+ ensure
303
+ FileUtils.cd old_pwd
304
+ end
305
+ end