rbbt-sources 1.0.1 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,3 +5,61 @@ module NCI
5
5
 
6
6
  NCI.claim NCI.root.find, :rake, Rbbt.share.install.NCI.Rakefile.find(:lib)
7
7
  end
8
+
9
+ if defined? Entity
10
+
11
+ module NCINaturePathways
12
+ extend Entity
13
+ self.format = "NCI Nature Pathway ID"
14
+
15
+ property :name => :array2single do
16
+ @name ||= NCI.nature_pathways.tsv(:persist => true, :key_field => "NCI Nature Pathway ID", :fields => ["Pathway Name"], :type => :flat, :merge => true).values_at *self
17
+ end
18
+
19
+ property :genes => :array2single do
20
+ @genes ||= NCI.nature_pathways.tsv(:persist => true, :key_field => "NCI Nature Pathway ID", :fields => ["UniProt/SwissProt Accession"], :type => :flat, :merge => true).values_at *self
21
+ end
22
+ end
23
+
24
+ module NCIReactomePathways
25
+ extend Entity
26
+ self.format = "NCI Reactome Pathway ID"
27
+
28
+ property :name => :array2single do
29
+ @name ||= NCI.reactome_pathways.tsv(:persist => true, :key_field => "NCI Reactome Pathway ID", :fields => ["Pathway Name"], :type => :flat, :merge => true).values_at *self
30
+ end
31
+
32
+ property :genes => :array2single do
33
+ @genes ||= NCI.reactome_pathways.tsv(:persist => true, :key_field => "NCI Reactome Pathway ID", :fields => ["UniProt/SwissProt Accession"], :type => :flat, :merge => true).values_at *self
34
+ end
35
+ end
36
+
37
+ module NCIBioCartaPathways
38
+ extend Entity
39
+ self.format = "NCI BioCarta Pathway ID"
40
+
41
+ property :name => :array2single do
42
+ @name ||= NCI.biocarta_pathways.tsv(:persist => true, :key_field => "NCI BioCarta Pathway ID", :fields => ["Pathway Name"], :type => :flat, :merge => true).values_at *self
43
+ end
44
+
45
+ property :genes => :array2single do
46
+ @genes ||= NCI.biocarta_pathways.tsv(:persist => true, :key_field => "NCI BioCarta Pathway ID", :fields => ["Entrez Gene ID"], :type => :flat, :merge => true).values_at *self
47
+ end
48
+ end
49
+
50
+ if defined? Gene and Entity === Gene
51
+ module Gene
52
+ property :nature_pathways => :array2single do
53
+ @nature_pathways ||= NCI.nature_pathways.tsv(:persist => true, :key_field => "UniProt/SwissProt Accession", :fields => ["NCI Nature Pathway ID"], :type => :flat, :merge => true).values_at *self.to("UniProt/SwissProt Accession")
54
+ end
55
+
56
+ property :reactome_pathways => :array2single do
57
+ @reactome_pathways ||= NCI.reactome_pathways.tsv(:persist => true, :key_field => "UniProt/SwissProt Accession", :fields => ["NCI Reactome Pathway ID"], :type => :flat, :merge => true).values_at *self.to("UniProt/SwissProt Accession")
58
+ end
59
+
60
+ property :biocarta_pathways => :array2single do
61
+ @biocarta_pathways ||= NCI.biocarta_pathways.tsv(:persist => true, :key_field => "Entrez Gene ID", :fields => ["NCI BioCarta Pathway ID"], :type => :flat, :merge => true).values_at *self.entrez
62
+ end
63
+ end
64
+ end
65
+ end
@@ -0,0 +1,23 @@
1
+ require 'rbbt-util'
2
+
3
+ module Barcode
4
+ extend Resource
5
+ self.subdir = "share/databases/Barcode"
6
+
7
+ Barcode.claim Barcode.transcriptome, :proc do
8
+ tsv = TSV.open(Open.open("http://rafalab.jhsph.edu/barcode/abc.ntc.GPL570.csv"),
9
+ :fix => Proc.new{|l| l.gsub('"', '').gsub(',', "\t")}, :header_hash => "",
10
+ :type => :list, :cast => :to_f)
11
+ io = Open.open("http://rafalab.jhsph.edu/barcode/abc.ntc.GPL570.csv")
12
+ fields = io.gets.chomp.gsub('"','').split(',')
13
+ io.close
14
+ fields.shift
15
+ tsv.fields = fields
16
+ tsv.key_field = "AFFY HG U133-PLUS-2"
17
+ tsv.to_s
18
+ end
19
+
20
+ end
21
+
22
+
23
+ Barcode.transcriptome.produce
@@ -22,7 +22,7 @@ module BioMart
22
22
  @@biomart_query_xml = <<-EOT
23
23
  <?xml version="1.0" encoding="UTF-8"?>
24
24
  <!DOCTYPE Query>
25
- <Query virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >
25
+ <Query completionStamp="1" virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >
26
26
  <Dataset name = "<!--DATABASE-->" interface = "default" >
27
27
  <!--FILTERS-->
28
28
  <!--MAIN-->
@@ -47,26 +47,50 @@ module BioMart
47
47
  end
48
48
 
49
49
  def self.get(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
50
+ open_options = Misc.add_defaults :wget_options => {"--read-timeout=" => 9000, "--tries=" => 1}
50
51
  repeats = true
51
52
  attrs ||= []
52
53
  filters ||= ["with_#{main}"]
53
54
 
55
+ if chunk_filter = open_options.delete(:chunk_filter)
56
+ filter, values = chunk_filter
57
+ merged_file = TmpFile.tmp_file
58
+ f = File.open(merged_file, 'w')
59
+ values.each do |value|
60
+ data = get(database, main, attrs, filters + [[filter, value]], data, open_options)
61
+ f.write Open.read(data)
62
+ end
63
+ f.close
64
+ return merged_file
65
+ end
66
+
54
67
  query = @@biomart_query_xml.dup
55
68
  query.sub!(/<!--DATABASE-->/,database)
56
69
  query.sub!(/<!--FILTERS-->/, filters.collect{|name, v| v.nil? ? "<Filter name = \"#{ name }\" excluded = \"0\"/>" : "<Filter name = \"#{ name }\" value = \"#{Array === v ? v * "," : v}\"/>" }.join("\n") )
57
70
  query.sub!(/<!--MAIN-->/,"<Attribute name = \"#{main}\" />")
58
71
  query.sub!(/<!--ATTRIBUTES-->/, attrs.collect{|name| "<Attribute name = \"#{ name }\"/>"}.join("\n") )
59
72
 
60
- if @archive_url
61
- response = Open.read(@archive_url + query.gsub(/\n/,' '), open_options)
62
- else
63
- response = Open.read(BIOMART_URL + query.gsub(/\n/,' '), open_options)
73
+ url = @archive_url ? @archive_url + query.gsub(/\n/,' ') : BIOMART_URL + query.gsub(/\n/,' ')
74
+
75
+ begin
76
+ response = Open.read(url, open_options.dup)
77
+ rescue
78
+ Open.remove_from_cache url, open_options
79
+ raise $!
64
80
  end
65
81
 
66
- if response.empty? or response =~ /Query ERROR:/
82
+ if response.empty? or response =~ /Query ERROR:/
83
+ Open.remove_from_cache url, open_options
67
84
  raise BioMart::QueryError, response
68
85
  end
69
86
 
87
+ if not response =~ /\[success\]$/sm
88
+ Open.remove_from_cache url, open_options
89
+ raise BioMart::QueryError, "Uncomplete result"
90
+ end
91
+
92
+ response.sub!(/\n\[success\]$/sm,'')
93
+
70
94
  result_file = TmpFile.tmp_file
71
95
  Open.write(result_file, response)
72
96
 
@@ -101,10 +125,10 @@ module BioMart
101
125
  # cause an error if the BioMart WS does not allow filtering with that
102
126
  # attribute.
103
127
  def self.query(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
104
- open_options = Misc.add_defaults open_options, :nocache => false, :filename => nil, :field_names => nil
105
- filename, field_names = Misc.process_options open_options, :filename, :field_names
128
+ open_options = Misc.add_defaults open_options, :nocache => false, :filename => nil, :field_names => nil, :by_chr => false
129
+ filename, field_names, by_chr = Misc.process_options open_options, :filename, :field_names, :by_chr
106
130
  attrs ||= []
107
-
131
+
108
132
  open_options = Misc.add_defaults open_options, :keep_empty => false, :merge => true
109
133
 
110
134
  Log.low "BioMart query: '#{main}' [#{(attrs || []) * ', '}] [#{(filters || []) * ', '}] #{open_options.inspect}"
@@ -122,11 +146,17 @@ module BioMart
122
146
 
123
147
  chunks << chunk if chunk.any?
124
148
 
149
+ chunks << [] if chunks.empty?
150
+
125
151
  Log.low "Chunks: #{chunks.length}"
126
- chunks.each_with_index{|chunk,i|
127
- Log.low "Chunk #{ i + 1 } / #{chunks.length}: [#{chunk * ", "}]"
128
- data = get(database, main, chunk, filters, data, open_options)
129
- }
152
+ if chunks.any?
153
+ chunks.each_with_index{|chunk,i|
154
+ Log.low "Chunk #{ i + 1 } / #{chunks.length}: [#{chunk * ", "}]"
155
+ data = get(database, main, chunk, filters, data, open_options)
156
+ }
157
+ else
158
+ data = get(database, main, [], filters, data, open_options)
159
+ end
130
160
 
131
161
  open_options[:filename] ||= "BioMart[#{main}+#{attrs.length}]"
132
162
  if filename.nil?
@@ -150,10 +180,13 @@ module BioMart
150
180
  end
151
181
 
152
182
  def self.tsv(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
183
+ attrs ||= []
184
+
153
185
  if @archive_url
154
186
  attrs = attrs.reject{|attr| (MISSING_IN_ARCHIVE[@archive] || []).include? attr[1]}
155
187
  end
156
188
 
189
+
157
190
  codes = attrs.collect{|attr| attr[1]}
158
191
  if open_options[:filename].nil?
159
192
  tsv = query(database, main.last, codes, filters, data, open_options)
@@ -35,7 +35,7 @@ module Entrez
35
35
 
36
36
 
37
37
  def self.entrez2pubmed(taxs)
38
- options = {:key_field => 1, :fields => 2, :persist => true, :merge => true}
38
+ options = {:key_field => 1, :fields => [2], :persist => true, :merge => true}
39
39
 
40
40
  taxs = [taxs] unless taxs.is_a?(Array)
41
41
  options.merge! :grep => taxs.collect{|t| "^" + t.to_s}
@@ -84,3 +84,33 @@ module GO
84
84
  end
85
85
  end
86
86
  end
87
+
88
+ if defined? Entity
89
+
90
+ module GOTerm
91
+ extend Entity
92
+ self.format = "GO ID"
93
+
94
+ property :name => :array2single do
95
+ @name ||= GO.id2name(self)
96
+ end
97
+
98
+ property :genes => :array2single do |organism|
99
+ @genes ||= Organism.gene_go(organism).tsv(:persist => true, :key_field => "GO ID", :fields => ["Ensembl Gene ID"], :type => :flat, :merge => true).values_at *self
100
+ end
101
+ end
102
+
103
+ if defined? Gene and Entity === Gene
104
+ module Gene
105
+ property :go_terms => :array2single do |organism|
106
+ @go_terms ||= Organism.gene_go(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true).values_at *self.ensembl
107
+ end
108
+
109
+ property :go_bp_terms => :array2single do |organism|
110
+ @go_bp_terms ||= Organism.gene_go_bp(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true).values_at *self.ensembl
111
+ end
112
+ end
113
+ end
114
+ end
115
+
116
+
@@ -0,0 +1,65 @@
1
+ require 'rbbt'
2
+ require 'rbbt/resource'
3
+ require 'nokogiri'
4
+
5
+ module TFacts
6
+ extend Resource
7
+ self.subdir = "share/databases/TF"
8
+
9
+ def self.targets_for_gene_unsigned(gene_name)
10
+ doc = Nokogiri::HTML(Open.read("http://www.tfacts.org/source/tfsResultsns.php", :post => "TFS_ID=#{ gene_name }"))
11
+
12
+ doc.css("td a").collect{|link| link.content.strip}
13
+ end
14
+
15
+ def self.targets_for_gene_signed(gene_name)
16
+ doc = Nokogiri::HTML(Open.read("http://www.tfacts.org/source/tfsResults.php", :post => "TFS_ID=#{ gene_name }"))
17
+
18
+ rows = doc.css("tr")
19
+ rows.shift
20
+ targets = {}
21
+ rows.each{|row| gene, sign = row.css("td"); targets[gene.css("a").first.content.strip] = sign.content.strip}
22
+ targets
23
+ end
24
+
25
+ def self.known_transcription_factors_signed
26
+ Open.read("http://www.tfacts.org/source/tfs.php").scan(/OPTION VALUE=([^\s]+)/).flatten
27
+ end
28
+
29
+ def self.known_transcription_factors_unsigned
30
+ Open.read("http://www.tfacts.org/source/tfsns.php").scan(/OPTION VALUE=([^\s]+)/).flatten
31
+ end
32
+
33
+ TFacts.claim TFacts.targets, :proc do
34
+ tsv = Misc.process_to_hash(TFacts.known_transcription_factors_unsigned){|list| list.collect{|tf| TFacts.targets_for_gene_unsigned(tf)}}
35
+ TSV.setup tsv, :key_field => "Associated Gene Name", :fields => ["Target Associated Gene Name"], :type => :flat
36
+ tsv.to_s
37
+ end
38
+
39
+ TFacts.claim TFacts.targets_signed, :proc do
40
+ tsv = TSV.setup({}, :key_field => "Associated Gene Name", :fields => ["Target Associated Gene Name", "Target Sign"], :type => :double)
41
+ Misc.process_to_hash(TFacts.known_transcription_factors_signed){|list| list.collect{|tf| TFacts.targets_for_gene_signed(tf)}}.each do |tf, targets|
42
+ tsv[tf] = [targets.keys, targets.values]
43
+ end
44
+ tsv.to_s
45
+ end
46
+ end
47
+
48
+
49
+ if defined? Entity and defined? Gene and Entity === Gene
50
+
51
+ module Gene
52
+ property :is_transcription_factor? => :array2single do
53
+ @is_trasncription_factor ||= begin
54
+ tfs = TFacts.targets.keys
55
+ self.name.collect{|gene| tfs.include? gene}
56
+ end
57
+ end
58
+
59
+ property :transcription_targets => :array2single do
60
+ @transcription_targets ||= begin
61
+ Gene.setup(TFacts.targets.tsv(:persist => true).values_at(*self.name), "Associated Gene Name", self.organism)
62
+ end
63
+ end
64
+ end
65
+ end
@@ -51,7 +51,7 @@ file 'nature_pathways' do |t|
51
51
 
52
52
  pathways = NCI.get_pathways(xml)
53
53
 
54
- Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI Pathway ID", :fields => ["Pathway Name","UniProt/SwissProt Accession"]).to_s)
54
+ Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI Nature Pathway ID", :fields => ["Pathway Name","UniProt/SwissProt Accession"]).to_s)
55
55
  end
56
56
 
57
57
  file 'biocarta_pathways' do |t|
@@ -62,7 +62,7 @@ file 'biocarta_pathways' do |t|
62
62
 
63
63
  pathways = NCI.get_pathways(xml, "LL")
64
64
 
65
- Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI Pathway ID", :fields => ["Pathway Name","Entrez Gene ID"]).to_s)
65
+ Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI BioCarta Pathway ID", :fields => ["Pathway Name","Entrez Gene ID"]).to_s)
66
66
  end
67
67
 
68
68
  file 'reactome_pathways' do |t|
@@ -73,7 +73,7 @@ file 'reactome_pathways' do |t|
73
73
 
74
74
  pathways = NCI.get_pathways(xml, "UP")
75
75
 
76
- Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI Pathway ID", :fields => ["Pathway Name","UniProt/SwissProt Accession"]).to_s)
76
+ Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI Reactome Pathway ID", :fields => ["Pathway Name","UniProt/SwissProt Accession"]).to_s)
77
77
  end
78
78
 
79
79
 
@@ -91,6 +91,12 @@ $biomart_go= [
91
91
  ["GO Namespace", 'namespace_1003'],
92
92
  ]
93
93
 
94
+ $biomart_go_2009= [
95
+ ["GO BP ID", 'go_biological_process_id'],
96
+ ["GO MF ID", 'go_molecular_function_id'],
97
+ ["GO CC ID", 'go_cellular_component_id'],
98
+ ]
99
+
94
100
  $biomart_pfam= [
95
101
  ["Pfam Domain", 'pfam'],
96
102
  ]
@@ -51,6 +51,13 @@ $biomart_transcript_exons = [
51
51
  ['Exon Rank in Transcript','rank'],
52
52
  ]
53
53
 
54
+ $biomart_exon_phase = [
55
+ $biomart_ensembl_transcript,
56
+ ['Phase','phase'],
57
+ ]
58
+
59
+
60
+
54
61
  $biomart_exons = [
55
62
  $biomart_ensembl_gene,
56
63
  ['Exon Strand','strand'],
@@ -66,6 +73,7 @@ end
66
73
 
67
74
  file 'identifiers' do |t|
68
75
  identifiers = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_identifiers, [], nil, :namespace => $namespace)
76
+ identifiers.unnamed = true
69
77
 
70
78
  $biomart_identifiers.each do |name, key, prefix|
71
79
  next unless identifiers.all_fields.include? name
@@ -95,11 +103,14 @@ file 'identifiers' do |t|
95
103
  entrez_synonyms.fields = ["Entrez Gene Name Synonyms"]
96
104
 
97
105
  identifiers.attach entrez_synonyms
106
+
98
107
 
99
- identifiers.each do |key, values|
100
- values.each do |list|
101
- list.reject!{|v| v.nil? or v.empty?}
102
- list.uniq!
108
+ identifiers.with_unnamed do
109
+ identifiers.each do |key, values|
110
+ values.each do |list|
111
+ list.reject!{|v| v.nil? or v.empty?}
112
+ list.uniq!
113
+ end
103
114
  end
104
115
  end
105
116
 
@@ -129,7 +140,7 @@ file 'protein_identifiers' do |t|
129
140
  File.open(t.name, 'w') do |f| f.puts identifiers end
130
141
  end
131
142
 
132
- file 'probe_transcripts' do |t|
143
+ file 'transcript_probes' do |t|
133
144
  identifiers = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_probe_identifiers, [], nil, :namespace => $namespace)
134
145
  $biomart_probe_identifiers.each do |name, key, prefix|
135
146
  if prefix
@@ -148,7 +159,7 @@ end
148
159
 
149
160
  file 'transcripts' => 'gene_positions' do |t|
150
161
  transcripts = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript, [], nil, :type => :list, :namespace => $namespace)
151
- transcripts.attach TSV.open('gene_positions'), "Chromosome Name"
162
+ transcripts.attach TSV.open('gene_positions'), :fields => ["Chromosome Name"]
152
163
 
153
164
  File.open(t.name, 'w') do |f| f.puts transcripts end
154
165
  end
@@ -161,13 +172,12 @@ file 'transcript_3utr' do |t|
161
172
  f.puts "#Ensembl Transcript ID\t3' UTR Length"
162
173
  utrs.each do |seq,trans|
163
174
  trans.each do |tran|
164
- f.puts [tran, seq.length] * "\t"
175
+ f.puts [tran, seq.length] * "\t" if seq =~ /^[ACTG]+$/
165
176
  end
166
177
  end
167
178
  end
168
179
  end
169
180
 
170
-
171
181
  file 'transcript_5utr' do |t|
172
182
  utrs = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_5utr, [], nil, :type => :flat, :namespace => $namespace)
173
183
 
@@ -176,7 +186,7 @@ file 'transcript_5utr' do |t|
176
186
  f.puts "#Ensembl Transcript ID\t5' UTR Length"
177
187
  utrs.each do |seq,trans|
178
188
  trans.each do |tran|
179
- f.puts [tran, seq.length] * "\t"
189
+ f.puts [tran, seq.length] * "\t" if seq =~ /^[ACTG]+$/
180
190
  end
181
191
  end
182
192
  end
@@ -193,7 +203,7 @@ file 'gene_sequence' do |t|
193
203
 
194
204
  File.open(t.name, 'w') do |f|
195
205
  f.puts "#: :type=:single"
196
- f.puts "#Ensembl Gene ID\tProtein Sequence"
206
+ f.puts "#Ensembl Gene ID\tGene Sequence"
197
207
  sequences.each do |seq, genes|
198
208
  genes.each do |gene|
199
209
  f.write gene
@@ -205,7 +215,9 @@ file 'gene_sequence' do |t|
205
215
  end
206
216
  end
207
217
 
208
- file 'protein_sequence' do |t|
218
+ file 'protein_sequence' => 'chromosomes' do |t|
219
+ #chromosomes = TSV.open(t.prerequisites.first).keys
220
+ #sequences = BioMart.tsv($biomart_db, $biomart_ensembl_protein, $biomart_protein_sequence, [], nil, :type => :flat, :namespace => $namespace, :chunk_filter => ['chromosome_name', chromosomes])
209
221
  sequences = BioMart.tsv($biomart_db, $biomart_ensembl_protein, $biomart_protein_sequence, [], nil, :type => :flat, :namespace => $namespace)
210
222
 
211
223
  File.open(t.name, 'w') do |f|
@@ -220,12 +232,11 @@ file 'protein_sequence' do |t|
220
232
  end
221
233
  end
222
234
  end
223
-
224
235
  end
225
236
 
226
237
  file 'exons' => 'gene_positions' do |t|
227
238
  exons = BioMart.tsv($biomart_db, $biomart_ensembl_exon, $biomart_exons, [], nil, :merge => false, :type => :list, :namespace => $namespace)
228
- exons.attach TSV.open('gene_positions'), "Chromosome Name"
239
+ exons.attach TSV.open('gene_positions'), :fields => ["Chromosome Name"]
229
240
 
230
241
  File.open(t.name, 'w') do |f| f.puts exons end
231
242
  end
@@ -236,12 +247,61 @@ file 'transcript_exons' do |t|
236
247
  File.open(t.name, 'w') do |f| f.puts exons end
237
248
  end
238
249
 
250
+ file 'exon_phase' do |t|
251
+ exons = BioMart.tsv($biomart_db, $biomart_ensembl_exon, $biomart_exon_phase, [], nil, :keep_empty => true, :namespace => $namespace)
252
+
253
+ File.open(t.name, 'w') do |f| f.puts exons end
254
+ end
255
+
256
+
257
+ #file 'transcript_phase' do |t|
258
+ # tsv = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["Phase"], :type => :single, :cast => :to_i)
259
+ #
260
+ # transcript_cds_start = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, [['CDNA Start','cds_start']], [], nil, :type => :flat, :namespace => $namespace)
261
+ # transcript_cds_start.through do |transcript, values|
262
+ # phase = values.compact.reject{|p| p.empty?}.select{|p| p == "1" or p == "2"}.first
263
+ # tsv[transcript] = phase.to_i unless phase.nil?
264
+ # end
265
+ #
266
+ # File.open(t.name, 'w') do |f| f.puts tsv end
267
+ #end
268
+
269
+ file 'transcript_phase' => ['exon_phase', 'transcript_exons'] do |t|
270
+ tsv = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["phase"], :type => :single, :cast => :to_i)
271
+
272
+ transcript_exons = TSV.open(t.prerequisites.last)
273
+ transcript_exons.unnamed = true
274
+
275
+ exon_is_first_for_transcripts = {}
276
+
277
+ transcript_exons.through do |transcript, value|
278
+ exon = Misc.zip_fields(value).select{|exon, rank| rank == "1" }.first[0]
279
+ exon_is_first_for_transcripts[exon] ||= []
280
+ exon_is_first_for_transcripts[exon] << transcript
281
+ end
282
+
283
+ exon_phase = TSV.open(t.prerequisites.first)
284
+ exon_phase.unnamed = true
285
+ exon_phase.monitor = true
286
+
287
+ exon_phase.through do |exon, value|
288
+ Misc.zip_fields(value).each{|transcript, phase|
289
+ next unless exon_is_first_for_transcripts.include? exon
290
+ next unless exon_is_first_for_transcripts[exon].include? transcript
291
+ tsv[transcript] = phase
292
+ }
293
+ end
294
+
295
+ File.open(t.name, 'w') do |f| f.puts tsv end
296
+ end
297
+
298
+
239
299
  file 'transcript_sequence' do |t|
240
300
  sequences = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_sequence, [], nil, :type => :flat, :namespace => $namespace)
241
301
 
242
302
  File.open(t.name, 'w') do |f|
243
303
  f.puts "#: :type=:single"
244
- f.puts "#Ensembl Transcript ID\tProtein Sequence"
304
+ f.puts "#Ensembl Transcript ID\tTranscript Sequence"
245
305
  sequences.each do |seq, genes|
246
306
  genes.each do |gene|
247
307
  f.write gene
@@ -272,7 +332,7 @@ end
272
332
 
273
333
  file 'gene_pmids' do |t|
274
334
  tsv = Entrez.entrez2pubmed($taxs)
275
- text = "#: :namespace=#{$namespace}"
335
+ text = "#: :namespace=#{$namespace}\n"
276
336
  text += "#Entrez Gene ID\tPMID"
277
337
  tsv.each do |gene, pmids|
278
338
  text << "\n" << gene << "\t" << pmids * "|"
@@ -319,7 +379,7 @@ file 'exon_offsets' => %w(exons transcript_exons gene_transcripts transcripts tr
319
379
  transcript_info = TSV.open('transcripts', :list, :fields => ["Ensembl Protein ID"])
320
380
  transcript_exons = TSV.open('transcript_exons', :double, :fields => ["Ensembl Exon ID","Exon Rank in Transcript"])
321
381
 
322
- string = "#: :namespace=#{$namespace}"
382
+ string = "#: :namespace=#{$namespace}\n"
323
383
  string += "#Ensembl Exon ID\tEnsembl Transcript ID\tOffset\n"
324
384
 
325
385
  exons.unnamed = true
@@ -329,29 +389,43 @@ file 'exon_offsets' => %w(exons transcript_exons gene_transcripts transcripts tr
329
389
  transcript_exons.unnamed = true
330
390
 
331
391
  exons.monitor = true
332
- Misc.profile do
333
- exons.through do |exon, info|
334
- gene, start, finish, strand, chr = info
335
-
336
- transcripts = coding_transcripts_for_exon(exon, exon_transcripts, transcript_info)
392
+ exons.through do |exon, info|
393
+ gene, start, finish, strand, chr = info
337
394
 
338
- transcript_offsets = {}
339
- transcripts.each do |transcript|
340
- offset = exon_offset_in_transcript( exon, transcript, exons, transcript_exons)
341
- transcript_offsets[transcript] = offset unless offset.nil?
342
- end
395
+ transcripts = coding_transcripts_for_exon(exon, exon_transcripts, transcript_info)
343
396
 
344
- string << exon << "\t" << transcript_offsets.keys * "|" << "\t" << transcript_offsets.values * "|" << "\n"
397
+ transcript_offsets = {}
398
+ transcripts.each do |transcript|
399
+ offset = exon_offset_in_transcript( exon, transcript, exons, transcript_exons)
400
+ transcript_offsets[transcript] = offset unless offset.nil?
345
401
  end
402
+
403
+ string << exon << "\t" << transcript_offsets.keys * "|" << "\t" << transcript_offsets.values * "|" << "\n"
346
404
  end
347
405
 
348
406
  Open.write(t.name, string)
349
407
  end
350
408
 
351
409
  file 'gene_go' do |t|
352
- goterms = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_go, [], nil, :type => :double, :namespace => $namespace)
410
+ if File.basename(FileUtils.pwd) =~ /^[a-z]{3}([0-9]{4})$/i and $1.to_i <= 2009
411
+ goterms = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_go_2009, [], nil, :type => :double, :namespace => $namespace)
353
412
 
354
- File.open(t.name, 'w') do |f| f.puts goterms end
413
+ goterms.add_field "GO ID" do |key, values|
414
+ values.flatten.compact.reject{|go| go.empty?}
415
+ end
416
+
417
+ goterms.add_field "GO Namespace" do |key, values|
418
+ ["biological_process"] * values["GO BP ID"].reject{|go| go.empty?}.length +
419
+ ["cellular_component"] * values["GO CC ID"].reject{|go| go.empty?}.length +
420
+ ["molecular_function"] * values["GO MF ID"].reject{|go| go.empty?}.length
421
+ end
422
+
423
+ File.open(t.name, 'w') do |f| f.puts goterms.slice(["GO ID", "GO Namespace"]) end
424
+ else
425
+ goterms = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_go, [], nil, :type => :double, :namespace => $namespace)
426
+
427
+ File.open(t.name, 'w') do |f| f.puts goterms end
428
+ end
355
429
  end
356
430
 
357
431
  file 'gene_go_bp' => 'gene_go' do |t|
@@ -370,13 +444,18 @@ file 'gene_go_bp' => 'gene_go' do |t|
370
444
  end
371
445
 
372
446
 
373
-
374
447
  file 'gene_pfam' do |t|
375
448
  goterms = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_pfam, [], nil, :type => :double, :namespace => $namespace)
376
449
 
377
450
  File.open(t.name, 'w') do |f| f.puts goterms end
378
451
  end
379
452
 
453
+ file 'chromosomes' do |t|
454
+ goterms = BioMart.tsv($biomart_db, ['Chromosome Name', "chromosome_name"] , [] , [], nil, :type => :double, :namespace => $namespace)
455
+
456
+ File.open(t.name, 'w') do |f| f.puts goterms end
457
+ end
458
+
380
459
 
381
460
  rule /[a-z]{3}[0-9]{4}\/.*/i do |t|
382
461
  t.name =~ /([a-z]{3}[0-9]{4})\/(.*)/i
@@ -47,6 +47,18 @@ class TestBioMart < Test::Unit::TestCase
47
47
  assert_equal ['Protein ID', 'RefSeq Peptide'], data.fields
48
48
  end
49
49
  end
50
+
51
+ def __test_chunk
52
+ chrs = %w(I II III IV V VI VII VIII IX X XI XII XIII XIV XV XVI MT 2-micron)
53
+ data = BioMart.query('scerevisiae_gene_ensembl','entrezgene', ['protein_id','refseq_peptide','external_gene_id','ensembl_gene_id'], [], nil, :chunk_filter => ['chromosome_name', chrs], :nocache => false, :wget_options => { :quiet => false})
54
+ assert(data['852236']['external_gene_id'].include? 'YBL044W')
55
+
56
+ TmpFile.with_file do |f|
57
+ filename = BioMart.query('scerevisiae_gene_ensembl','entrezgene', ['protein_id','refseq_peptide','external_gene_id','ensembl_gene_id'], [], nil, :nocache => false, :wget_options => { :quiet => false}, :filename => f)
58
+ data = TSV.open Open.open(filename)
59
+ assert(data['852236']['external_gene_id'].include? 'YBL044W')
60
+ end
61
+ end
50
62
  end
51
63
 
52
64
 
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-sources
3
3
  version: !ruby/object:Gem::Version
4
- hash: 21
4
+ hash: 19
5
5
  prerelease:
6
6
  segments:
7
7
  - 1
8
- - 0
9
8
  - 1
10
- version: 1.0.1
9
+ - 0
10
+ version: 1.1.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - Miguel Vazquez
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2011-10-03 00:00:00 +02:00
18
+ date: 2011-11-17 00:00:00 +01:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -105,6 +105,7 @@ files:
105
105
  - lib/rbbt/sources/CTCAE.rb
106
106
  - lib/rbbt/sources/InterPro.rb
107
107
  - lib/rbbt/sources/NCI.rb
108
+ - lib/rbbt/sources/barcode.rb
108
109
  - lib/rbbt/sources/bibtex.rb
109
110
  - lib/rbbt/sources/biomart.rb
110
111
  - lib/rbbt/sources/entrez.rb
@@ -115,6 +116,7 @@ files:
115
116
  - lib/rbbt/sources/organism/sequence.rb
116
117
  - lib/rbbt/sources/polysearch.rb
117
118
  - lib/rbbt/sources/pubmed.rb
119
+ - lib/rbbt/sources/tfacts.rb
118
120
  - lib/rbbt/sources/wgEncodeBroadHmm.rb
119
121
  - share/install/InterPro/Rakefile
120
122
  - share/install/JoChem/Rakefile