rbbt-sources 1.0.1 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -5,3 +5,61 @@ module NCI
5
5
 
6
6
  NCI.claim NCI.root.find, :rake, Rbbt.share.install.NCI.Rakefile.find(:lib)
7
7
  end
8
+
9
+ if defined? Entity
10
+
11
+ module NCINaturePathways
12
+ extend Entity
13
+ self.format = "NCI Nature Pathway ID"
14
+
15
+ property :name => :array2single do
16
+ @name ||= NCI.nature_pathways.tsv(:persist => true, :key_field => "NCI Nature Pathway ID", :fields => ["Pathway Name"], :type => :flat, :merge => true).values_at *self
17
+ end
18
+
19
+ property :genes => :array2single do
20
+ @genes ||= NCI.nature_pathways.tsv(:persist => true, :key_field => "NCI Nature Pathway ID", :fields => ["UniProt/SwissProt Accession"], :type => :flat, :merge => true).values_at *self
21
+ end
22
+ end
23
+
24
+ module NCIReactomePathways
25
+ extend Entity
26
+ self.format = "NCI Reactome Pathway ID"
27
+
28
+ property :name => :array2single do
29
+ @name ||= NCI.reactome_pathways.tsv(:persist => true, :key_field => "NCI Reactome Pathway ID", :fields => ["Pathway Name"], :type => :flat, :merge => true).values_at *self
30
+ end
31
+
32
+ property :genes => :array2single do
33
+ @genes ||= NCI.reactome_pathways.tsv(:persist => true, :key_field => "NCI Reactome Pathway ID", :fields => ["UniProt/SwissProt Accession"], :type => :flat, :merge => true).values_at *self
34
+ end
35
+ end
36
+
37
+ module NCIBioCartaPathways
38
+ extend Entity
39
+ self.format = "NCI BioCarta Pathway ID"
40
+
41
+ property :name => :array2single do
42
+ @name ||= NCI.biocarta_pathways.tsv(:persist => true, :key_field => "NCI BioCarta Pathway ID", :fields => ["Pathway Name"], :type => :flat, :merge => true).values_at *self
43
+ end
44
+
45
+ property :genes => :array2single do
46
+ @genes ||= NCI.biocarta_pathways.tsv(:persist => true, :key_field => "NCI BioCarta Pathway ID", :fields => ["Entrez Gene ID"], :type => :flat, :merge => true).values_at *self
47
+ end
48
+ end
49
+
50
+ if defined? Gene and Entity === Gene
51
+ module Gene
52
+ property :nature_pathways => :array2single do
53
+ @nature_pathways ||= NCI.nature_pathways.tsv(:persist => true, :key_field => "UniProt/SwissProt Accession", :fields => ["NCI Nature Pathway ID"], :type => :flat, :merge => true).values_at *self.to("UniProt/SwissProt Accession")
54
+ end
55
+
56
+ property :reactome_pathways => :array2single do
57
+ @reactome_pathways ||= NCI.reactome_pathways.tsv(:persist => true, :key_field => "UniProt/SwissProt Accession", :fields => ["NCI Reactome Pathway ID"], :type => :flat, :merge => true).values_at *self.to("UniProt/SwissProt Accession")
58
+ end
59
+
60
+ property :biocarta_pathways => :array2single do
61
+ @biocarta_pathways ||= NCI.biocarta_pathways.tsv(:persist => true, :key_field => "Entrez Gene ID", :fields => ["NCI BioCarta Pathway ID"], :type => :flat, :merge => true).values_at *self.entrez
62
+ end
63
+ end
64
+ end
65
+ end
@@ -0,0 +1,23 @@
1
+ require 'rbbt-util'
2
+
3
+ module Barcode
4
+ extend Resource
5
+ self.subdir = "share/databases/Barcode"
6
+
7
+ Barcode.claim Barcode.transcriptome, :proc do
8
+ tsv = TSV.open(Open.open("http://rafalab.jhsph.edu/barcode/abc.ntc.GPL570.csv"),
9
+ :fix => Proc.new{|l| l.gsub('"', '').gsub(',', "\t")}, :header_hash => "",
10
+ :type => :list, :cast => :to_f)
11
+ io = Open.open("http://rafalab.jhsph.edu/barcode/abc.ntc.GPL570.csv")
12
+ fields = io.gets.chomp.gsub('"','').split(',')
13
+ io.close
14
+ fields.shift
15
+ tsv.fields = fields
16
+ tsv.key_field = "AFFY HG U133-PLUS-2"
17
+ tsv.to_s
18
+ end
19
+
20
+ end
21
+
22
+
23
+ Barcode.transcriptome.produce
@@ -22,7 +22,7 @@ module BioMart
22
22
  @@biomart_query_xml = <<-EOT
23
23
  <?xml version="1.0" encoding="UTF-8"?>
24
24
  <!DOCTYPE Query>
25
- <Query virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >
25
+ <Query completionStamp="1" virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >
26
26
  <Dataset name = "<!--DATABASE-->" interface = "default" >
27
27
  <!--FILTERS-->
28
28
  <!--MAIN-->
@@ -47,26 +47,50 @@ module BioMart
47
47
  end
48
48
 
49
49
  def self.get(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
50
+ open_options = Misc.add_defaults :wget_options => {"--read-timeout=" => 9000, "--tries=" => 1}
50
51
  repeats = true
51
52
  attrs ||= []
52
53
  filters ||= ["with_#{main}"]
53
54
 
55
+ if chunk_filter = open_options.delete(:chunk_filter)
56
+ filter, values = chunk_filter
57
+ merged_file = TmpFile.tmp_file
58
+ f = File.open(merged_file, 'w')
59
+ values.each do |value|
60
+ data = get(database, main, attrs, filters + [[filter, value]], data, open_options)
61
+ f.write Open.read(data)
62
+ end
63
+ f.close
64
+ return merged_file
65
+ end
66
+
54
67
  query = @@biomart_query_xml.dup
55
68
  query.sub!(/<!--DATABASE-->/,database)
56
69
  query.sub!(/<!--FILTERS-->/, filters.collect{|name, v| v.nil? ? "<Filter name = \"#{ name }\" excluded = \"0\"/>" : "<Filter name = \"#{ name }\" value = \"#{Array === v ? v * "," : v}\"/>" }.join("\n") )
57
70
  query.sub!(/<!--MAIN-->/,"<Attribute name = \"#{main}\" />")
58
71
  query.sub!(/<!--ATTRIBUTES-->/, attrs.collect{|name| "<Attribute name = \"#{ name }\"/>"}.join("\n") )
59
72
 
60
- if @archive_url
61
- response = Open.read(@archive_url + query.gsub(/\n/,' '), open_options)
62
- else
63
- response = Open.read(BIOMART_URL + query.gsub(/\n/,' '), open_options)
73
+ url = @archive_url ? @archive_url + query.gsub(/\n/,' ') : BIOMART_URL + query.gsub(/\n/,' ')
74
+
75
+ begin
76
+ response = Open.read(url, open_options.dup)
77
+ rescue
78
+ Open.remove_from_cache url, open_options
79
+ raise $!
64
80
  end
65
81
 
66
- if response.empty? or response =~ /Query ERROR:/
82
+ if response.empty? or response =~ /Query ERROR:/
83
+ Open.remove_from_cache url, open_options
67
84
  raise BioMart::QueryError, response
68
85
  end
69
86
 
87
+ if not response =~ /\[success\]$/sm
88
+ Open.remove_from_cache url, open_options
89
+ raise BioMart::QueryError, "Uncomplete result"
90
+ end
91
+
92
+ response.sub!(/\n\[success\]$/sm,'')
93
+
70
94
  result_file = TmpFile.tmp_file
71
95
  Open.write(result_file, response)
72
96
 
@@ -101,10 +125,10 @@ module BioMart
101
125
  # cause an error if the BioMart WS does not allow filtering with that
102
126
  # attribute.
103
127
  def self.query(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
104
- open_options = Misc.add_defaults open_options, :nocache => false, :filename => nil, :field_names => nil
105
- filename, field_names = Misc.process_options open_options, :filename, :field_names
128
+ open_options = Misc.add_defaults open_options, :nocache => false, :filename => nil, :field_names => nil, :by_chr => false
129
+ filename, field_names, by_chr = Misc.process_options open_options, :filename, :field_names, :by_chr
106
130
  attrs ||= []
107
-
131
+
108
132
  open_options = Misc.add_defaults open_options, :keep_empty => false, :merge => true
109
133
 
110
134
  Log.low "BioMart query: '#{main}' [#{(attrs || []) * ', '}] [#{(filters || []) * ', '}] #{open_options.inspect}"
@@ -122,11 +146,17 @@ module BioMart
122
146
 
123
147
  chunks << chunk if chunk.any?
124
148
 
149
+ chunks << [] if chunks.empty?
150
+
125
151
  Log.low "Chunks: #{chunks.length}"
126
- chunks.each_with_index{|chunk,i|
127
- Log.low "Chunk #{ i + 1 } / #{chunks.length}: [#{chunk * ", "}]"
128
- data = get(database, main, chunk, filters, data, open_options)
129
- }
152
+ if chunks.any?
153
+ chunks.each_with_index{|chunk,i|
154
+ Log.low "Chunk #{ i + 1 } / #{chunks.length}: [#{chunk * ", "}]"
155
+ data = get(database, main, chunk, filters, data, open_options)
156
+ }
157
+ else
158
+ data = get(database, main, [], filters, data, open_options)
159
+ end
130
160
 
131
161
  open_options[:filename] ||= "BioMart[#{main}+#{attrs.length}]"
132
162
  if filename.nil?
@@ -150,10 +180,13 @@ module BioMart
150
180
  end
151
181
 
152
182
  def self.tsv(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
183
+ attrs ||= []
184
+
153
185
  if @archive_url
154
186
  attrs = attrs.reject{|attr| (MISSING_IN_ARCHIVE[@archive] || []).include? attr[1]}
155
187
  end
156
188
 
189
+
157
190
  codes = attrs.collect{|attr| attr[1]}
158
191
  if open_options[:filename].nil?
159
192
  tsv = query(database, main.last, codes, filters, data, open_options)
@@ -35,7 +35,7 @@ module Entrez
35
35
 
36
36
 
37
37
  def self.entrez2pubmed(taxs)
38
- options = {:key_field => 1, :fields => 2, :persist => true, :merge => true}
38
+ options = {:key_field => 1, :fields => [2], :persist => true, :merge => true}
39
39
 
40
40
  taxs = [taxs] unless taxs.is_a?(Array)
41
41
  options.merge! :grep => taxs.collect{|t| "^" + t.to_s}
@@ -84,3 +84,33 @@ module GO
84
84
  end
85
85
  end
86
86
  end
87
+
88
+ if defined? Entity
89
+
90
+ module GOTerm
91
+ extend Entity
92
+ self.format = "GO ID"
93
+
94
+ property :name => :array2single do
95
+ @name ||= GO.id2name(self)
96
+ end
97
+
98
+ property :genes => :array2single do |organism|
99
+ @genes ||= Organism.gene_go(organism).tsv(:persist => true, :key_field => "GO ID", :fields => ["Ensembl Gene ID"], :type => :flat, :merge => true).values_at *self
100
+ end
101
+ end
102
+
103
+ if defined? Gene and Entity === Gene
104
+ module Gene
105
+ property :go_terms => :array2single do |organism|
106
+ @go_terms ||= Organism.gene_go(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true).values_at *self.ensembl
107
+ end
108
+
109
+ property :go_bp_terms => :array2single do |organism|
110
+ @go_bp_terms ||= Organism.gene_go_bp(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true).values_at *self.ensembl
111
+ end
112
+ end
113
+ end
114
+ end
115
+
116
+
@@ -0,0 +1,65 @@
1
+ require 'rbbt'
2
+ require 'rbbt/resource'
3
+ require 'nokogiri'
4
+
5
+ module TFacts
6
+ extend Resource
7
+ self.subdir = "share/databases/TF"
8
+
9
+ def self.targets_for_gene_unsigned(gene_name)
10
+ doc = Nokogiri::HTML(Open.read("http://www.tfacts.org/source/tfsResultsns.php", :post => "TFS_ID=#{ gene_name }"))
11
+
12
+ doc.css("td a").collect{|link| link.content.strip}
13
+ end
14
+
15
+ def self.targets_for_gene_signed(gene_name)
16
+ doc = Nokogiri::HTML(Open.read("http://www.tfacts.org/source/tfsResults.php", :post => "TFS_ID=#{ gene_name }"))
17
+
18
+ rows = doc.css("tr")
19
+ rows.shift
20
+ targets = {}
21
+ rows.each{|row| gene, sign = row.css("td"); targets[gene.css("a").first.content.strip] = sign.content.strip}
22
+ targets
23
+ end
24
+
25
+ def self.known_transcription_factors_signed
26
+ Open.read("http://www.tfacts.org/source/tfs.php").scan(/OPTION VALUE=([^\s]+)/).flatten
27
+ end
28
+
29
+ def self.known_transcription_factors_unsigned
30
+ Open.read("http://www.tfacts.org/source/tfsns.php").scan(/OPTION VALUE=([^\s]+)/).flatten
31
+ end
32
+
33
+ TFacts.claim TFacts.targets, :proc do
34
+ tsv = Misc.process_to_hash(TFacts.known_transcription_factors_unsigned){|list| list.collect{|tf| TFacts.targets_for_gene_unsigned(tf)}}
35
+ TSV.setup tsv, :key_field => "Associated Gene Name", :fields => ["Target Associated Gene Name"], :type => :flat
36
+ tsv.to_s
37
+ end
38
+
39
+ TFacts.claim TFacts.targets_signed, :proc do
40
+ tsv = TSV.setup({}, :key_field => "Associated Gene Name", :fields => ["Target Associated Gene Name", "Target Sign"], :type => :double)
41
+ Misc.process_to_hash(TFacts.known_transcription_factors_signed){|list| list.collect{|tf| TFacts.targets_for_gene_signed(tf)}}.each do |tf, targets|
42
+ tsv[tf] = [targets.keys, targets.values]
43
+ end
44
+ tsv.to_s
45
+ end
46
+ end
47
+
48
+
49
+ if defined? Entity and defined? Gene and Entity === Gene
50
+
51
+ module Gene
52
+ property :is_transcription_factor? => :array2single do
53
+ @is_trasncription_factor ||= begin
54
+ tfs = TFacts.targets.keys
55
+ self.name.collect{|gene| tfs.include? gene}
56
+ end
57
+ end
58
+
59
+ property :transcription_targets => :array2single do
60
+ @transcription_targets ||= begin
61
+ Gene.setup(TFacts.targets.tsv(:persist => true).values_at(*self.name), "Associated Gene Name", self.organism)
62
+ end
63
+ end
64
+ end
65
+ end
@@ -51,7 +51,7 @@ file 'nature_pathways' do |t|
51
51
 
52
52
  pathways = NCI.get_pathways(xml)
53
53
 
54
- Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI Pathway ID", :fields => ["Pathway Name","UniProt/SwissProt Accession"]).to_s)
54
+ Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI Nature Pathway ID", :fields => ["Pathway Name","UniProt/SwissProt Accession"]).to_s)
55
55
  end
56
56
 
57
57
  file 'biocarta_pathways' do |t|
@@ -62,7 +62,7 @@ file 'biocarta_pathways' do |t|
62
62
 
63
63
  pathways = NCI.get_pathways(xml, "LL")
64
64
 
65
- Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI Pathway ID", :fields => ["Pathway Name","Entrez Gene ID"]).to_s)
65
+ Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI BioCarta Pathway ID", :fields => ["Pathway Name","Entrez Gene ID"]).to_s)
66
66
  end
67
67
 
68
68
  file 'reactome_pathways' do |t|
@@ -73,7 +73,7 @@ file 'reactome_pathways' do |t|
73
73
 
74
74
  pathways = NCI.get_pathways(xml, "UP")
75
75
 
76
- Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI Pathway ID", :fields => ["Pathway Name","UniProt/SwissProt Accession"]).to_s)
76
+ Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI Reactome Pathway ID", :fields => ["Pathway Name","UniProt/SwissProt Accession"]).to_s)
77
77
  end
78
78
 
79
79
 
@@ -91,6 +91,12 @@ $biomart_go= [
91
91
  ["GO Namespace", 'namespace_1003'],
92
92
  ]
93
93
 
94
+ $biomart_go_2009= [
95
+ ["GO BP ID", 'go_biological_process_id'],
96
+ ["GO MF ID", 'go_molecular_function_id'],
97
+ ["GO CC ID", 'go_cellular_component_id'],
98
+ ]
99
+
94
100
  $biomart_pfam= [
95
101
  ["Pfam Domain", 'pfam'],
96
102
  ]
@@ -51,6 +51,13 @@ $biomart_transcript_exons = [
51
51
  ['Exon Rank in Transcript','rank'],
52
52
  ]
53
53
 
54
+ $biomart_exon_phase = [
55
+ $biomart_ensembl_transcript,
56
+ ['Phase','phase'],
57
+ ]
58
+
59
+
60
+
54
61
  $biomart_exons = [
55
62
  $biomart_ensembl_gene,
56
63
  ['Exon Strand','strand'],
@@ -66,6 +73,7 @@ end
66
73
 
67
74
  file 'identifiers' do |t|
68
75
  identifiers = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_identifiers, [], nil, :namespace => $namespace)
76
+ identifiers.unnamed = true
69
77
 
70
78
  $biomart_identifiers.each do |name, key, prefix|
71
79
  next unless identifiers.all_fields.include? name
@@ -95,11 +103,14 @@ file 'identifiers' do |t|
95
103
  entrez_synonyms.fields = ["Entrez Gene Name Synonyms"]
96
104
 
97
105
  identifiers.attach entrez_synonyms
106
+
98
107
 
99
- identifiers.each do |key, values|
100
- values.each do |list|
101
- list.reject!{|v| v.nil? or v.empty?}
102
- list.uniq!
108
+ identifiers.with_unnamed do
109
+ identifiers.each do |key, values|
110
+ values.each do |list|
111
+ list.reject!{|v| v.nil? or v.empty?}
112
+ list.uniq!
113
+ end
103
114
  end
104
115
  end
105
116
 
@@ -129,7 +140,7 @@ file 'protein_identifiers' do |t|
129
140
  File.open(t.name, 'w') do |f| f.puts identifiers end
130
141
  end
131
142
 
132
- file 'probe_transcripts' do |t|
143
+ file 'transcript_probes' do |t|
133
144
  identifiers = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_probe_identifiers, [], nil, :namespace => $namespace)
134
145
  $biomart_probe_identifiers.each do |name, key, prefix|
135
146
  if prefix
@@ -148,7 +159,7 @@ end
148
159
 
149
160
  file 'transcripts' => 'gene_positions' do |t|
150
161
  transcripts = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript, [], nil, :type => :list, :namespace => $namespace)
151
- transcripts.attach TSV.open('gene_positions'), "Chromosome Name"
162
+ transcripts.attach TSV.open('gene_positions'), :fields => ["Chromosome Name"]
152
163
 
153
164
  File.open(t.name, 'w') do |f| f.puts transcripts end
154
165
  end
@@ -161,13 +172,12 @@ file 'transcript_3utr' do |t|
161
172
  f.puts "#Ensembl Transcript ID\t3' UTR Length"
162
173
  utrs.each do |seq,trans|
163
174
  trans.each do |tran|
164
- f.puts [tran, seq.length] * "\t"
175
+ f.puts [tran, seq.length] * "\t" if seq =~ /^[ACTG]+$/
165
176
  end
166
177
  end
167
178
  end
168
179
  end
169
180
 
170
-
171
181
  file 'transcript_5utr' do |t|
172
182
  utrs = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_5utr, [], nil, :type => :flat, :namespace => $namespace)
173
183
 
@@ -176,7 +186,7 @@ file 'transcript_5utr' do |t|
176
186
  f.puts "#Ensembl Transcript ID\t5' UTR Length"
177
187
  utrs.each do |seq,trans|
178
188
  trans.each do |tran|
179
- f.puts [tran, seq.length] * "\t"
189
+ f.puts [tran, seq.length] * "\t" if seq =~ /^[ACTG]+$/
180
190
  end
181
191
  end
182
192
  end
@@ -193,7 +203,7 @@ file 'gene_sequence' do |t|
193
203
 
194
204
  File.open(t.name, 'w') do |f|
195
205
  f.puts "#: :type=:single"
196
- f.puts "#Ensembl Gene ID\tProtein Sequence"
206
+ f.puts "#Ensembl Gene ID\tGene Sequence"
197
207
  sequences.each do |seq, genes|
198
208
  genes.each do |gene|
199
209
  f.write gene
@@ -205,7 +215,9 @@ file 'gene_sequence' do |t|
205
215
  end
206
216
  end
207
217
 
208
- file 'protein_sequence' do |t|
218
+ file 'protein_sequence' => 'chromosomes' do |t|
219
+ #chromosomes = TSV.open(t.prerequisites.first).keys
220
+ #sequences = BioMart.tsv($biomart_db, $biomart_ensembl_protein, $biomart_protein_sequence, [], nil, :type => :flat, :namespace => $namespace, :chunk_filter => ['chromosome_name', chromosomes])
209
221
  sequences = BioMart.tsv($biomart_db, $biomart_ensembl_protein, $biomart_protein_sequence, [], nil, :type => :flat, :namespace => $namespace)
210
222
 
211
223
  File.open(t.name, 'w') do |f|
@@ -220,12 +232,11 @@ file 'protein_sequence' do |t|
220
232
  end
221
233
  end
222
234
  end
223
-
224
235
  end
225
236
 
226
237
  file 'exons' => 'gene_positions' do |t|
227
238
  exons = BioMart.tsv($biomart_db, $biomart_ensembl_exon, $biomart_exons, [], nil, :merge => false, :type => :list, :namespace => $namespace)
228
- exons.attach TSV.open('gene_positions'), "Chromosome Name"
239
+ exons.attach TSV.open('gene_positions'), :fields => ["Chromosome Name"]
229
240
 
230
241
  File.open(t.name, 'w') do |f| f.puts exons end
231
242
  end
@@ -236,12 +247,61 @@ file 'transcript_exons' do |t|
236
247
  File.open(t.name, 'w') do |f| f.puts exons end
237
248
  end
238
249
 
250
+ file 'exon_phase' do |t|
251
+ exons = BioMart.tsv($biomart_db, $biomart_ensembl_exon, $biomart_exon_phase, [], nil, :keep_empty => true, :namespace => $namespace)
252
+
253
+ File.open(t.name, 'w') do |f| f.puts exons end
254
+ end
255
+
256
+
257
+ #file 'transcript_phase' do |t|
258
+ # tsv = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["Phase"], :type => :single, :cast => :to_i)
259
+ #
260
+ # transcript_cds_start = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, [['CDNA Start','cds_start']], [], nil, :type => :flat, :namespace => $namespace)
261
+ # transcript_cds_start.through do |transcript, values|
262
+ # phase = values.compact.reject{|p| p.empty?}.select{|p| p == "1" or p == "2"}.first
263
+ # tsv[transcript] = phase.to_i unless phase.nil?
264
+ # end
265
+ #
266
+ # File.open(t.name, 'w') do |f| f.puts tsv end
267
+ #end
268
+
269
+ file 'transcript_phase' => ['exon_phase', 'transcript_exons'] do |t|
270
+ tsv = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["phase"], :type => :single, :cast => :to_i)
271
+
272
+ transcript_exons = TSV.open(t.prerequisites.last)
273
+ transcript_exons.unnamed = true
274
+
275
+ exon_is_first_for_transcripts = {}
276
+
277
+ transcript_exons.through do |transcript, value|
278
+ exon = Misc.zip_fields(value).select{|exon, rank| rank == "1" }.first[0]
279
+ exon_is_first_for_transcripts[exon] ||= []
280
+ exon_is_first_for_transcripts[exon] << transcript
281
+ end
282
+
283
+ exon_phase = TSV.open(t.prerequisites.first)
284
+ exon_phase.unnamed = true
285
+ exon_phase.monitor = true
286
+
287
+ exon_phase.through do |exon, value|
288
+ Misc.zip_fields(value).each{|transcript, phase|
289
+ next unless exon_is_first_for_transcripts.include? exon
290
+ next unless exon_is_first_for_transcripts[exon].include? transcript
291
+ tsv[transcript] = phase
292
+ }
293
+ end
294
+
295
+ File.open(t.name, 'w') do |f| f.puts tsv end
296
+ end
297
+
298
+
239
299
  file 'transcript_sequence' do |t|
240
300
  sequences = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_sequence, [], nil, :type => :flat, :namespace => $namespace)
241
301
 
242
302
  File.open(t.name, 'w') do |f|
243
303
  f.puts "#: :type=:single"
244
- f.puts "#Ensembl Transcript ID\tProtein Sequence"
304
+ f.puts "#Ensembl Transcript ID\tTranscript Sequence"
245
305
  sequences.each do |seq, genes|
246
306
  genes.each do |gene|
247
307
  f.write gene
@@ -272,7 +332,7 @@ end
272
332
 
273
333
  file 'gene_pmids' do |t|
274
334
  tsv = Entrez.entrez2pubmed($taxs)
275
- text = "#: :namespace=#{$namespace}"
335
+ text = "#: :namespace=#{$namespace}\n"
276
336
  text += "#Entrez Gene ID\tPMID"
277
337
  tsv.each do |gene, pmids|
278
338
  text << "\n" << gene << "\t" << pmids * "|"
@@ -319,7 +379,7 @@ file 'exon_offsets' => %w(exons transcript_exons gene_transcripts transcripts tr
319
379
  transcript_info = TSV.open('transcripts', :list, :fields => ["Ensembl Protein ID"])
320
380
  transcript_exons = TSV.open('transcript_exons', :double, :fields => ["Ensembl Exon ID","Exon Rank in Transcript"])
321
381
 
322
- string = "#: :namespace=#{$namespace}"
382
+ string = "#: :namespace=#{$namespace}\n"
323
383
  string += "#Ensembl Exon ID\tEnsembl Transcript ID\tOffset\n"
324
384
 
325
385
  exons.unnamed = true
@@ -329,29 +389,43 @@ file 'exon_offsets' => %w(exons transcript_exons gene_transcripts transcripts tr
329
389
  transcript_exons.unnamed = true
330
390
 
331
391
  exons.monitor = true
332
- Misc.profile do
333
- exons.through do |exon, info|
334
- gene, start, finish, strand, chr = info
335
-
336
- transcripts = coding_transcripts_for_exon(exon, exon_transcripts, transcript_info)
392
+ exons.through do |exon, info|
393
+ gene, start, finish, strand, chr = info
337
394
 
338
- transcript_offsets = {}
339
- transcripts.each do |transcript|
340
- offset = exon_offset_in_transcript( exon, transcript, exons, transcript_exons)
341
- transcript_offsets[transcript] = offset unless offset.nil?
342
- end
395
+ transcripts = coding_transcripts_for_exon(exon, exon_transcripts, transcript_info)
343
396
 
344
- string << exon << "\t" << transcript_offsets.keys * "|" << "\t" << transcript_offsets.values * "|" << "\n"
397
+ transcript_offsets = {}
398
+ transcripts.each do |transcript|
399
+ offset = exon_offset_in_transcript( exon, transcript, exons, transcript_exons)
400
+ transcript_offsets[transcript] = offset unless offset.nil?
345
401
  end
402
+
403
+ string << exon << "\t" << transcript_offsets.keys * "|" << "\t" << transcript_offsets.values * "|" << "\n"
346
404
  end
347
405
 
348
406
  Open.write(t.name, string)
349
407
  end
350
408
 
351
409
  file 'gene_go' do |t|
352
- goterms = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_go, [], nil, :type => :double, :namespace => $namespace)
410
+ if File.basename(FileUtils.pwd) =~ /^[a-z]{3}([0-9]{4})$/i and $1.to_i <= 2009
411
+ goterms = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_go_2009, [], nil, :type => :double, :namespace => $namespace)
353
412
 
354
- File.open(t.name, 'w') do |f| f.puts goterms end
413
+ goterms.add_field "GO ID" do |key, values|
414
+ values.flatten.compact.reject{|go| go.empty?}
415
+ end
416
+
417
+ goterms.add_field "GO Namespace" do |key, values|
418
+ ["biological_process"] * values["GO BP ID"].reject{|go| go.empty?}.length +
419
+ ["cellular_component"] * values["GO CC ID"].reject{|go| go.empty?}.length +
420
+ ["molecular_function"] * values["GO MF ID"].reject{|go| go.empty?}.length
421
+ end
422
+
423
+ File.open(t.name, 'w') do |f| f.puts goterms.slice(["GO ID", "GO Namespace"]) end
424
+ else
425
+ goterms = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_go, [], nil, :type => :double, :namespace => $namespace)
426
+
427
+ File.open(t.name, 'w') do |f| f.puts goterms end
428
+ end
355
429
  end
356
430
 
357
431
  file 'gene_go_bp' => 'gene_go' do |t|
@@ -370,13 +444,18 @@ file 'gene_go_bp' => 'gene_go' do |t|
370
444
  end
371
445
 
372
446
 
373
-
374
447
  file 'gene_pfam' do |t|
375
448
  goterms = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_pfam, [], nil, :type => :double, :namespace => $namespace)
376
449
 
377
450
  File.open(t.name, 'w') do |f| f.puts goterms end
378
451
  end
379
452
 
453
+ file 'chromosomes' do |t|
454
+ goterms = BioMart.tsv($biomart_db, ['Chromosome Name', "chromosome_name"] , [] , [], nil, :type => :double, :namespace => $namespace)
455
+
456
+ File.open(t.name, 'w') do |f| f.puts goterms end
457
+ end
458
+
380
459
 
381
460
  rule /[a-z]{3}[0-9]{4}\/.*/i do |t|
382
461
  t.name =~ /([a-z]{3}[0-9]{4})\/(.*)/i
@@ -47,6 +47,18 @@ class TestBioMart < Test::Unit::TestCase
47
47
  assert_equal ['Protein ID', 'RefSeq Peptide'], data.fields
48
48
  end
49
49
  end
50
+
51
+ def __test_chunk
52
+ chrs = %w(I II III IV V VI VII VIII IX X XI XII XIII XIV XV XVI MT 2-micron)
53
+ data = BioMart.query('scerevisiae_gene_ensembl','entrezgene', ['protein_id','refseq_peptide','external_gene_id','ensembl_gene_id'], [], nil, :chunk_filter => ['chromosome_name', chrs], :nocache => false, :wget_options => { :quiet => false})
54
+ assert(data['852236']['external_gene_id'].include? 'YBL044W')
55
+
56
+ TmpFile.with_file do |f|
57
+ filename = BioMart.query('scerevisiae_gene_ensembl','entrezgene', ['protein_id','refseq_peptide','external_gene_id','ensembl_gene_id'], [], nil, :nocache => false, :wget_options => { :quiet => false}, :filename => f)
58
+ data = TSV.open Open.open(filename)
59
+ assert(data['852236']['external_gene_id'].include? 'YBL044W')
60
+ end
61
+ end
50
62
  end
51
63
 
52
64
 
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-sources
3
3
  version: !ruby/object:Gem::Version
4
- hash: 21
4
+ hash: 19
5
5
  prerelease:
6
6
  segments:
7
7
  - 1
8
- - 0
9
8
  - 1
10
- version: 1.0.1
9
+ - 0
10
+ version: 1.1.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - Miguel Vazquez
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2011-10-03 00:00:00 +02:00
18
+ date: 2011-11-17 00:00:00 +01:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -105,6 +105,7 @@ files:
105
105
  - lib/rbbt/sources/CTCAE.rb
106
106
  - lib/rbbt/sources/InterPro.rb
107
107
  - lib/rbbt/sources/NCI.rb
108
+ - lib/rbbt/sources/barcode.rb
108
109
  - lib/rbbt/sources/bibtex.rb
109
110
  - lib/rbbt/sources/biomart.rb
110
111
  - lib/rbbt/sources/entrez.rb
@@ -115,6 +116,7 @@ files:
115
116
  - lib/rbbt/sources/organism/sequence.rb
116
117
  - lib/rbbt/sources/polysearch.rb
117
118
  - lib/rbbt/sources/pubmed.rb
119
+ - lib/rbbt/sources/tfacts.rb
118
120
  - lib/rbbt/sources/wgEncodeBroadHmm.rb
119
121
  - share/install/InterPro/Rakefile
120
122
  - share/install/JoChem/Rakefile