rbbt-sources 1.0.1 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/rbbt/sources/NCI.rb +58 -0
- data/lib/rbbt/sources/barcode.rb +23 -0
- data/lib/rbbt/sources/biomart.rb +46 -13
- data/lib/rbbt/sources/entrez.rb +1 -1
- data/lib/rbbt/sources/go.rb +30 -0
- data/lib/rbbt/sources/tfacts.rb +65 -0
- data/share/install/NCI/Rakefile +3 -3
- data/share/install/Organism/Hsa/Rakefile +6 -0
- data/share/install/Organism/organism_helpers.rb +109 -30
- data/test/rbbt/sources/test_biomart.rb +12 -0
- metadata +6 -4
data/lib/rbbt/sources/NCI.rb
CHANGED
@@ -5,3 +5,61 @@ module NCI
|
|
5
5
|
|
6
6
|
NCI.claim NCI.root.find, :rake, Rbbt.share.install.NCI.Rakefile.find(:lib)
|
7
7
|
end
|
8
|
+
|
9
|
+
if defined? Entity
|
10
|
+
|
11
|
+
module NCINaturePathways
|
12
|
+
extend Entity
|
13
|
+
self.format = "NCI Nature Pathway ID"
|
14
|
+
|
15
|
+
property :name => :array2single do
|
16
|
+
@name ||= NCI.nature_pathways.tsv(:persist => true, :key_field => "NCI Nature Pathway ID", :fields => ["Pathway Name"], :type => :flat, :merge => true).values_at *self
|
17
|
+
end
|
18
|
+
|
19
|
+
property :genes => :array2single do
|
20
|
+
@genes ||= NCI.nature_pathways.tsv(:persist => true, :key_field => "NCI Nature Pathway ID", :fields => ["UniProt/SwissProt Accession"], :type => :flat, :merge => true).values_at *self
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
module NCIReactomePathways
|
25
|
+
extend Entity
|
26
|
+
self.format = "NCI Reactome Pathway ID"
|
27
|
+
|
28
|
+
property :name => :array2single do
|
29
|
+
@name ||= NCI.reactome_pathways.tsv(:persist => true, :key_field => "NCI Reactome Pathway ID", :fields => ["Pathway Name"], :type => :flat, :merge => true).values_at *self
|
30
|
+
end
|
31
|
+
|
32
|
+
property :genes => :array2single do
|
33
|
+
@genes ||= NCI.reactome_pathways.tsv(:persist => true, :key_field => "NCI Reactome Pathway ID", :fields => ["UniProt/SwissProt Accession"], :type => :flat, :merge => true).values_at *self
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
module NCIBioCartaPathways
|
38
|
+
extend Entity
|
39
|
+
self.format = "NCI BioCarta Pathway ID"
|
40
|
+
|
41
|
+
property :name => :array2single do
|
42
|
+
@name ||= NCI.biocarta_pathways.tsv(:persist => true, :key_field => "NCI BioCarta Pathway ID", :fields => ["Pathway Name"], :type => :flat, :merge => true).values_at *self
|
43
|
+
end
|
44
|
+
|
45
|
+
property :genes => :array2single do
|
46
|
+
@genes ||= NCI.biocarta_pathways.tsv(:persist => true, :key_field => "NCI BioCarta Pathway ID", :fields => ["Entrez Gene ID"], :type => :flat, :merge => true).values_at *self
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
if defined? Gene and Entity === Gene
|
51
|
+
module Gene
|
52
|
+
property :nature_pathways => :array2single do
|
53
|
+
@nature_pathways ||= NCI.nature_pathways.tsv(:persist => true, :key_field => "UniProt/SwissProt Accession", :fields => ["NCI Nature Pathway ID"], :type => :flat, :merge => true).values_at *self.to("UniProt/SwissProt Accession")
|
54
|
+
end
|
55
|
+
|
56
|
+
property :reactome_pathways => :array2single do
|
57
|
+
@reactome_pathways ||= NCI.reactome_pathways.tsv(:persist => true, :key_field => "UniProt/SwissProt Accession", :fields => ["NCI Reactome Pathway ID"], :type => :flat, :merge => true).values_at *self.to("UniProt/SwissProt Accession")
|
58
|
+
end
|
59
|
+
|
60
|
+
property :biocarta_pathways => :array2single do
|
61
|
+
@biocarta_pathways ||= NCI.biocarta_pathways.tsv(:persist => true, :key_field => "Entrez Gene ID", :fields => ["NCI BioCarta Pathway ID"], :type => :flat, :merge => true).values_at *self.entrez
|
62
|
+
end
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'rbbt-util'
|
2
|
+
|
3
|
+
module Barcode
|
4
|
+
extend Resource
|
5
|
+
self.subdir = "share/databases/Barcode"
|
6
|
+
|
7
|
+
Barcode.claim Barcode.transcriptome, :proc do
|
8
|
+
tsv = TSV.open(Open.open("http://rafalab.jhsph.edu/barcode/abc.ntc.GPL570.csv"),
|
9
|
+
:fix => Proc.new{|l| l.gsub('"', '').gsub(',', "\t")}, :header_hash => "",
|
10
|
+
:type => :list, :cast => :to_f)
|
11
|
+
io = Open.open("http://rafalab.jhsph.edu/barcode/abc.ntc.GPL570.csv")
|
12
|
+
fields = io.gets.chomp.gsub('"','').split(',')
|
13
|
+
io.close
|
14
|
+
fields.shift
|
15
|
+
tsv.fields = fields
|
16
|
+
tsv.key_field = "AFFY HG U133-PLUS-2"
|
17
|
+
tsv.to_s
|
18
|
+
end
|
19
|
+
|
20
|
+
end
|
21
|
+
|
22
|
+
|
23
|
+
Barcode.transcriptome.produce
|
data/lib/rbbt/sources/biomart.rb
CHANGED
@@ -22,7 +22,7 @@ module BioMart
|
|
22
22
|
@@biomart_query_xml = <<-EOT
|
23
23
|
<?xml version="1.0" encoding="UTF-8"?>
|
24
24
|
<!DOCTYPE Query>
|
25
|
-
<Query
|
25
|
+
<Query completionStamp="1" virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >
|
26
26
|
<Dataset name = "<!--DATABASE-->" interface = "default" >
|
27
27
|
<!--FILTERS-->
|
28
28
|
<!--MAIN-->
|
@@ -47,26 +47,50 @@ module BioMart
|
|
47
47
|
end
|
48
48
|
|
49
49
|
def self.get(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
|
50
|
+
open_options = Misc.add_defaults :wget_options => {"--read-timeout=" => 9000, "--tries=" => 1}
|
50
51
|
repeats = true
|
51
52
|
attrs ||= []
|
52
53
|
filters ||= ["with_#{main}"]
|
53
54
|
|
55
|
+
if chunk_filter = open_options.delete(:chunk_filter)
|
56
|
+
filter, values = chunk_filter
|
57
|
+
merged_file = TmpFile.tmp_file
|
58
|
+
f = File.open(merged_file, 'w')
|
59
|
+
values.each do |value|
|
60
|
+
data = get(database, main, attrs, filters + [[filter, value]], data, open_options)
|
61
|
+
f.write Open.read(data)
|
62
|
+
end
|
63
|
+
f.close
|
64
|
+
return merged_file
|
65
|
+
end
|
66
|
+
|
54
67
|
query = @@biomart_query_xml.dup
|
55
68
|
query.sub!(/<!--DATABASE-->/,database)
|
56
69
|
query.sub!(/<!--FILTERS-->/, filters.collect{|name, v| v.nil? ? "<Filter name = \"#{ name }\" excluded = \"0\"/>" : "<Filter name = \"#{ name }\" value = \"#{Array === v ? v * "," : v}\"/>" }.join("\n") )
|
57
70
|
query.sub!(/<!--MAIN-->/,"<Attribute name = \"#{main}\" />")
|
58
71
|
query.sub!(/<!--ATTRIBUTES-->/, attrs.collect{|name| "<Attribute name = \"#{ name }\"/>"}.join("\n") )
|
59
72
|
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
response = Open.read(
|
73
|
+
url = @archive_url ? @archive_url + query.gsub(/\n/,' ') : BIOMART_URL + query.gsub(/\n/,' ')
|
74
|
+
|
75
|
+
begin
|
76
|
+
response = Open.read(url, open_options.dup)
|
77
|
+
rescue
|
78
|
+
Open.remove_from_cache url, open_options
|
79
|
+
raise $!
|
64
80
|
end
|
65
81
|
|
66
|
-
if response.empty? or response =~ /Query ERROR:/
|
82
|
+
if response.empty? or response =~ /Query ERROR:/
|
83
|
+
Open.remove_from_cache url, open_options
|
67
84
|
raise BioMart::QueryError, response
|
68
85
|
end
|
69
86
|
|
87
|
+
if not response =~ /\[success\]$/sm
|
88
|
+
Open.remove_from_cache url, open_options
|
89
|
+
raise BioMart::QueryError, "Uncomplete result"
|
90
|
+
end
|
91
|
+
|
92
|
+
response.sub!(/\n\[success\]$/sm,'')
|
93
|
+
|
70
94
|
result_file = TmpFile.tmp_file
|
71
95
|
Open.write(result_file, response)
|
72
96
|
|
@@ -101,10 +125,10 @@ module BioMart
|
|
101
125
|
# cause an error if the BioMart WS does not allow filtering with that
|
102
126
|
# attribute.
|
103
127
|
def self.query(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
|
104
|
-
open_options = Misc.add_defaults open_options, :nocache => false, :filename => nil, :field_names => nil
|
105
|
-
filename, field_names = Misc.process_options open_options, :filename, :field_names
|
128
|
+
open_options = Misc.add_defaults open_options, :nocache => false, :filename => nil, :field_names => nil, :by_chr => false
|
129
|
+
filename, field_names, by_chr = Misc.process_options open_options, :filename, :field_names, :by_chr
|
106
130
|
attrs ||= []
|
107
|
-
|
131
|
+
|
108
132
|
open_options = Misc.add_defaults open_options, :keep_empty => false, :merge => true
|
109
133
|
|
110
134
|
Log.low "BioMart query: '#{main}' [#{(attrs || []) * ', '}] [#{(filters || []) * ', '}] #{open_options.inspect}"
|
@@ -122,11 +146,17 @@ module BioMart
|
|
122
146
|
|
123
147
|
chunks << chunk if chunk.any?
|
124
148
|
|
149
|
+
chunks << [] if chunks.empty?
|
150
|
+
|
125
151
|
Log.low "Chunks: #{chunks.length}"
|
126
|
-
chunks.
|
127
|
-
|
128
|
-
|
129
|
-
|
152
|
+
if chunks.any?
|
153
|
+
chunks.each_with_index{|chunk,i|
|
154
|
+
Log.low "Chunk #{ i + 1 } / #{chunks.length}: [#{chunk * ", "}]"
|
155
|
+
data = get(database, main, chunk, filters, data, open_options)
|
156
|
+
}
|
157
|
+
else
|
158
|
+
data = get(database, main, [], filters, data, open_options)
|
159
|
+
end
|
130
160
|
|
131
161
|
open_options[:filename] ||= "BioMart[#{main}+#{attrs.length}]"
|
132
162
|
if filename.nil?
|
@@ -150,10 +180,13 @@ module BioMart
|
|
150
180
|
end
|
151
181
|
|
152
182
|
def self.tsv(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
|
183
|
+
attrs ||= []
|
184
|
+
|
153
185
|
if @archive_url
|
154
186
|
attrs = attrs.reject{|attr| (MISSING_IN_ARCHIVE[@archive] || []).include? attr[1]}
|
155
187
|
end
|
156
188
|
|
189
|
+
|
157
190
|
codes = attrs.collect{|attr| attr[1]}
|
158
191
|
if open_options[:filename].nil?
|
159
192
|
tsv = query(database, main.last, codes, filters, data, open_options)
|
data/lib/rbbt/sources/entrez.rb
CHANGED
@@ -35,7 +35,7 @@ module Entrez
|
|
35
35
|
|
36
36
|
|
37
37
|
def self.entrez2pubmed(taxs)
|
38
|
-
options = {:key_field => 1, :fields => 2, :persist => true, :merge => true}
|
38
|
+
options = {:key_field => 1, :fields => [2], :persist => true, :merge => true}
|
39
39
|
|
40
40
|
taxs = [taxs] unless taxs.is_a?(Array)
|
41
41
|
options.merge! :grep => taxs.collect{|t| "^" + t.to_s}
|
data/lib/rbbt/sources/go.rb
CHANGED
@@ -84,3 +84,33 @@ module GO
|
|
84
84
|
end
|
85
85
|
end
|
86
86
|
end
|
87
|
+
|
88
|
+
if defined? Entity
|
89
|
+
|
90
|
+
module GOTerm
|
91
|
+
extend Entity
|
92
|
+
self.format = "GO ID"
|
93
|
+
|
94
|
+
property :name => :array2single do
|
95
|
+
@name ||= GO.id2name(self)
|
96
|
+
end
|
97
|
+
|
98
|
+
property :genes => :array2single do |organism|
|
99
|
+
@genes ||= Organism.gene_go(organism).tsv(:persist => true, :key_field => "GO ID", :fields => ["Ensembl Gene ID"], :type => :flat, :merge => true).values_at *self
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
103
|
+
if defined? Gene and Entity === Gene
|
104
|
+
module Gene
|
105
|
+
property :go_terms => :array2single do |organism|
|
106
|
+
@go_terms ||= Organism.gene_go(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true).values_at *self.ensembl
|
107
|
+
end
|
108
|
+
|
109
|
+
property :go_bp_terms => :array2single do |organism|
|
110
|
+
@go_bp_terms ||= Organism.gene_go_bp(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true).values_at *self.ensembl
|
111
|
+
end
|
112
|
+
end
|
113
|
+
end
|
114
|
+
end
|
115
|
+
|
116
|
+
|
@@ -0,0 +1,65 @@
|
|
1
|
+
require 'rbbt'
|
2
|
+
require 'rbbt/resource'
|
3
|
+
require 'nokogiri'
|
4
|
+
|
5
|
+
module TFacts
|
6
|
+
extend Resource
|
7
|
+
self.subdir = "share/databases/TF"
|
8
|
+
|
9
|
+
def self.targets_for_gene_unsigned(gene_name)
|
10
|
+
doc = Nokogiri::HTML(Open.read("http://www.tfacts.org/source/tfsResultsns.php", :post => "TFS_ID=#{ gene_name }"))
|
11
|
+
|
12
|
+
doc.css("td a").collect{|link| link.content.strip}
|
13
|
+
end
|
14
|
+
|
15
|
+
def self.targets_for_gene_signed(gene_name)
|
16
|
+
doc = Nokogiri::HTML(Open.read("http://www.tfacts.org/source/tfsResults.php", :post => "TFS_ID=#{ gene_name }"))
|
17
|
+
|
18
|
+
rows = doc.css("tr")
|
19
|
+
rows.shift
|
20
|
+
targets = {}
|
21
|
+
rows.each{|row| gene, sign = row.css("td"); targets[gene.css("a").first.content.strip] = sign.content.strip}
|
22
|
+
targets
|
23
|
+
end
|
24
|
+
|
25
|
+
def self.known_transcription_factors_signed
|
26
|
+
Open.read("http://www.tfacts.org/source/tfs.php").scan(/OPTION VALUE=([^\s]+)/).flatten
|
27
|
+
end
|
28
|
+
|
29
|
+
def self.known_transcription_factors_unsigned
|
30
|
+
Open.read("http://www.tfacts.org/source/tfsns.php").scan(/OPTION VALUE=([^\s]+)/).flatten
|
31
|
+
end
|
32
|
+
|
33
|
+
TFacts.claim TFacts.targets, :proc do
|
34
|
+
tsv = Misc.process_to_hash(TFacts.known_transcription_factors_unsigned){|list| list.collect{|tf| TFacts.targets_for_gene_unsigned(tf)}}
|
35
|
+
TSV.setup tsv, :key_field => "Associated Gene Name", :fields => ["Target Associated Gene Name"], :type => :flat
|
36
|
+
tsv.to_s
|
37
|
+
end
|
38
|
+
|
39
|
+
TFacts.claim TFacts.targets_signed, :proc do
|
40
|
+
tsv = TSV.setup({}, :key_field => "Associated Gene Name", :fields => ["Target Associated Gene Name", "Target Sign"], :type => :double)
|
41
|
+
Misc.process_to_hash(TFacts.known_transcription_factors_signed){|list| list.collect{|tf| TFacts.targets_for_gene_signed(tf)}}.each do |tf, targets|
|
42
|
+
tsv[tf] = [targets.keys, targets.values]
|
43
|
+
end
|
44
|
+
tsv.to_s
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
|
49
|
+
if defined? Entity and defined? Gene and Entity === Gene
|
50
|
+
|
51
|
+
module Gene
|
52
|
+
property :is_transcription_factor? => :array2single do
|
53
|
+
@is_trasncription_factor ||= begin
|
54
|
+
tfs = TFacts.targets.keys
|
55
|
+
self.name.collect{|gene| tfs.include? gene}
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
property :transcription_targets => :array2single do
|
60
|
+
@transcription_targets ||= begin
|
61
|
+
Gene.setup(TFacts.targets.tsv(:persist => true).values_at(*self.name), "Associated Gene Name", self.organism)
|
62
|
+
end
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
data/share/install/NCI/Rakefile
CHANGED
@@ -51,7 +51,7 @@ file 'nature_pathways' do |t|
|
|
51
51
|
|
52
52
|
pathways = NCI.get_pathways(xml)
|
53
53
|
|
54
|
-
Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI Pathway ID", :fields => ["Pathway Name","UniProt/SwissProt Accession"]).to_s)
|
54
|
+
Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI Nature Pathway ID", :fields => ["Pathway Name","UniProt/SwissProt Accession"]).to_s)
|
55
55
|
end
|
56
56
|
|
57
57
|
file 'biocarta_pathways' do |t|
|
@@ -62,7 +62,7 @@ file 'biocarta_pathways' do |t|
|
|
62
62
|
|
63
63
|
pathways = NCI.get_pathways(xml, "LL")
|
64
64
|
|
65
|
-
Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI Pathway ID", :fields => ["Pathway Name","Entrez Gene ID"]).to_s)
|
65
|
+
Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI BioCarta Pathway ID", :fields => ["Pathway Name","Entrez Gene ID"]).to_s)
|
66
66
|
end
|
67
67
|
|
68
68
|
file 'reactome_pathways' do |t|
|
@@ -73,7 +73,7 @@ file 'reactome_pathways' do |t|
|
|
73
73
|
|
74
74
|
pathways = NCI.get_pathways(xml, "UP")
|
75
75
|
|
76
|
-
Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI Pathway ID", :fields => ["Pathway Name","UniProt/SwissProt Accession"]).to_s)
|
76
|
+
Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI Reactome Pathway ID", :fields => ["Pathway Name","UniProt/SwissProt Accession"]).to_s)
|
77
77
|
end
|
78
78
|
|
79
79
|
|
@@ -91,6 +91,12 @@ $biomart_go= [
|
|
91
91
|
["GO Namespace", 'namespace_1003'],
|
92
92
|
]
|
93
93
|
|
94
|
+
$biomart_go_2009= [
|
95
|
+
["GO BP ID", 'go_biological_process_id'],
|
96
|
+
["GO MF ID", 'go_molecular_function_id'],
|
97
|
+
["GO CC ID", 'go_cellular_component_id'],
|
98
|
+
]
|
99
|
+
|
94
100
|
$biomart_pfam= [
|
95
101
|
["Pfam Domain", 'pfam'],
|
96
102
|
]
|
@@ -51,6 +51,13 @@ $biomart_transcript_exons = [
|
|
51
51
|
['Exon Rank in Transcript','rank'],
|
52
52
|
]
|
53
53
|
|
54
|
+
$biomart_exon_phase = [
|
55
|
+
$biomart_ensembl_transcript,
|
56
|
+
['Phase','phase'],
|
57
|
+
]
|
58
|
+
|
59
|
+
|
60
|
+
|
54
61
|
$biomart_exons = [
|
55
62
|
$biomart_ensembl_gene,
|
56
63
|
['Exon Strand','strand'],
|
@@ -66,6 +73,7 @@ end
|
|
66
73
|
|
67
74
|
file 'identifiers' do |t|
|
68
75
|
identifiers = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_identifiers, [], nil, :namespace => $namespace)
|
76
|
+
identifiers.unnamed = true
|
69
77
|
|
70
78
|
$biomart_identifiers.each do |name, key, prefix|
|
71
79
|
next unless identifiers.all_fields.include? name
|
@@ -95,11 +103,14 @@ file 'identifiers' do |t|
|
|
95
103
|
entrez_synonyms.fields = ["Entrez Gene Name Synonyms"]
|
96
104
|
|
97
105
|
identifiers.attach entrez_synonyms
|
106
|
+
|
98
107
|
|
99
|
-
identifiers.
|
100
|
-
|
101
|
-
|
102
|
-
|
108
|
+
identifiers.with_unnamed do
|
109
|
+
identifiers.each do |key, values|
|
110
|
+
values.each do |list|
|
111
|
+
list.reject!{|v| v.nil? or v.empty?}
|
112
|
+
list.uniq!
|
113
|
+
end
|
103
114
|
end
|
104
115
|
end
|
105
116
|
|
@@ -129,7 +140,7 @@ file 'protein_identifiers' do |t|
|
|
129
140
|
File.open(t.name, 'w') do |f| f.puts identifiers end
|
130
141
|
end
|
131
142
|
|
132
|
-
file '
|
143
|
+
file 'transcript_probes' do |t|
|
133
144
|
identifiers = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_probe_identifiers, [], nil, :namespace => $namespace)
|
134
145
|
$biomart_probe_identifiers.each do |name, key, prefix|
|
135
146
|
if prefix
|
@@ -148,7 +159,7 @@ end
|
|
148
159
|
|
149
160
|
file 'transcripts' => 'gene_positions' do |t|
|
150
161
|
transcripts = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript, [], nil, :type => :list, :namespace => $namespace)
|
151
|
-
transcripts.attach TSV.open('gene_positions'), "Chromosome Name"
|
162
|
+
transcripts.attach TSV.open('gene_positions'), :fields => ["Chromosome Name"]
|
152
163
|
|
153
164
|
File.open(t.name, 'w') do |f| f.puts transcripts end
|
154
165
|
end
|
@@ -161,13 +172,12 @@ file 'transcript_3utr' do |t|
|
|
161
172
|
f.puts "#Ensembl Transcript ID\t3' UTR Length"
|
162
173
|
utrs.each do |seq,trans|
|
163
174
|
trans.each do |tran|
|
164
|
-
f.puts [tran, seq.length] * "\t"
|
175
|
+
f.puts [tran, seq.length] * "\t" if seq =~ /^[ACTG]+$/
|
165
176
|
end
|
166
177
|
end
|
167
178
|
end
|
168
179
|
end
|
169
180
|
|
170
|
-
|
171
181
|
file 'transcript_5utr' do |t|
|
172
182
|
utrs = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_5utr, [], nil, :type => :flat, :namespace => $namespace)
|
173
183
|
|
@@ -176,7 +186,7 @@ file 'transcript_5utr' do |t|
|
|
176
186
|
f.puts "#Ensembl Transcript ID\t5' UTR Length"
|
177
187
|
utrs.each do |seq,trans|
|
178
188
|
trans.each do |tran|
|
179
|
-
f.puts [tran, seq.length] * "\t"
|
189
|
+
f.puts [tran, seq.length] * "\t" if seq =~ /^[ACTG]+$/
|
180
190
|
end
|
181
191
|
end
|
182
192
|
end
|
@@ -193,7 +203,7 @@ file 'gene_sequence' do |t|
|
|
193
203
|
|
194
204
|
File.open(t.name, 'w') do |f|
|
195
205
|
f.puts "#: :type=:single"
|
196
|
-
f.puts "#Ensembl Gene ID\
|
206
|
+
f.puts "#Ensembl Gene ID\tGene Sequence"
|
197
207
|
sequences.each do |seq, genes|
|
198
208
|
genes.each do |gene|
|
199
209
|
f.write gene
|
@@ -205,7 +215,9 @@ file 'gene_sequence' do |t|
|
|
205
215
|
end
|
206
216
|
end
|
207
217
|
|
208
|
-
file 'protein_sequence' do |t|
|
218
|
+
file 'protein_sequence' => 'chromosomes' do |t|
|
219
|
+
#chromosomes = TSV.open(t.prerequisites.first).keys
|
220
|
+
#sequences = BioMart.tsv($biomart_db, $biomart_ensembl_protein, $biomart_protein_sequence, [], nil, :type => :flat, :namespace => $namespace, :chunk_filter => ['chromosome_name', chromosomes])
|
209
221
|
sequences = BioMart.tsv($biomart_db, $biomart_ensembl_protein, $biomart_protein_sequence, [], nil, :type => :flat, :namespace => $namespace)
|
210
222
|
|
211
223
|
File.open(t.name, 'w') do |f|
|
@@ -220,12 +232,11 @@ file 'protein_sequence' do |t|
|
|
220
232
|
end
|
221
233
|
end
|
222
234
|
end
|
223
|
-
|
224
235
|
end
|
225
236
|
|
226
237
|
file 'exons' => 'gene_positions' do |t|
|
227
238
|
exons = BioMart.tsv($biomart_db, $biomart_ensembl_exon, $biomart_exons, [], nil, :merge => false, :type => :list, :namespace => $namespace)
|
228
|
-
exons.attach TSV.open('gene_positions'), "Chromosome Name"
|
239
|
+
exons.attach TSV.open('gene_positions'), :fields => ["Chromosome Name"]
|
229
240
|
|
230
241
|
File.open(t.name, 'w') do |f| f.puts exons end
|
231
242
|
end
|
@@ -236,12 +247,61 @@ file 'transcript_exons' do |t|
|
|
236
247
|
File.open(t.name, 'w') do |f| f.puts exons end
|
237
248
|
end
|
238
249
|
|
250
|
+
file 'exon_phase' do |t|
|
251
|
+
exons = BioMart.tsv($biomart_db, $biomart_ensembl_exon, $biomart_exon_phase, [], nil, :keep_empty => true, :namespace => $namespace)
|
252
|
+
|
253
|
+
File.open(t.name, 'w') do |f| f.puts exons end
|
254
|
+
end
|
255
|
+
|
256
|
+
|
257
|
+
#file 'transcript_phase' do |t|
|
258
|
+
# tsv = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["Phase"], :type => :single, :cast => :to_i)
|
259
|
+
#
|
260
|
+
# transcript_cds_start = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, [['CDNA Start','cds_start']], [], nil, :type => :flat, :namespace => $namespace)
|
261
|
+
# transcript_cds_start.through do |transcript, values|
|
262
|
+
# phase = values.compact.reject{|p| p.empty?}.select{|p| p == "1" or p == "2"}.first
|
263
|
+
# tsv[transcript] = phase.to_i unless phase.nil?
|
264
|
+
# end
|
265
|
+
#
|
266
|
+
# File.open(t.name, 'w') do |f| f.puts tsv end
|
267
|
+
#end
|
268
|
+
|
269
|
+
file 'transcript_phase' => ['exon_phase', 'transcript_exons'] do |t|
|
270
|
+
tsv = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["phase"], :type => :single, :cast => :to_i)
|
271
|
+
|
272
|
+
transcript_exons = TSV.open(t.prerequisites.last)
|
273
|
+
transcript_exons.unnamed = true
|
274
|
+
|
275
|
+
exon_is_first_for_transcripts = {}
|
276
|
+
|
277
|
+
transcript_exons.through do |transcript, value|
|
278
|
+
exon = Misc.zip_fields(value).select{|exon, rank| rank == "1" }.first[0]
|
279
|
+
exon_is_first_for_transcripts[exon] ||= []
|
280
|
+
exon_is_first_for_transcripts[exon] << transcript
|
281
|
+
end
|
282
|
+
|
283
|
+
exon_phase = TSV.open(t.prerequisites.first)
|
284
|
+
exon_phase.unnamed = true
|
285
|
+
exon_phase.monitor = true
|
286
|
+
|
287
|
+
exon_phase.through do |exon, value|
|
288
|
+
Misc.zip_fields(value).each{|transcript, phase|
|
289
|
+
next unless exon_is_first_for_transcripts.include? exon
|
290
|
+
next unless exon_is_first_for_transcripts[exon].include? transcript
|
291
|
+
tsv[transcript] = phase
|
292
|
+
}
|
293
|
+
end
|
294
|
+
|
295
|
+
File.open(t.name, 'w') do |f| f.puts tsv end
|
296
|
+
end
|
297
|
+
|
298
|
+
|
239
299
|
file 'transcript_sequence' do |t|
|
240
300
|
sequences = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_sequence, [], nil, :type => :flat, :namespace => $namespace)
|
241
301
|
|
242
302
|
File.open(t.name, 'w') do |f|
|
243
303
|
f.puts "#: :type=:single"
|
244
|
-
f.puts "#Ensembl Transcript ID\
|
304
|
+
f.puts "#Ensembl Transcript ID\tTranscript Sequence"
|
245
305
|
sequences.each do |seq, genes|
|
246
306
|
genes.each do |gene|
|
247
307
|
f.write gene
|
@@ -272,7 +332,7 @@ end
|
|
272
332
|
|
273
333
|
file 'gene_pmids' do |t|
|
274
334
|
tsv = Entrez.entrez2pubmed($taxs)
|
275
|
-
text = "#: :namespace=#{$namespace}"
|
335
|
+
text = "#: :namespace=#{$namespace}\n"
|
276
336
|
text += "#Entrez Gene ID\tPMID"
|
277
337
|
tsv.each do |gene, pmids|
|
278
338
|
text << "\n" << gene << "\t" << pmids * "|"
|
@@ -319,7 +379,7 @@ file 'exon_offsets' => %w(exons transcript_exons gene_transcripts transcripts tr
|
|
319
379
|
transcript_info = TSV.open('transcripts', :list, :fields => ["Ensembl Protein ID"])
|
320
380
|
transcript_exons = TSV.open('transcript_exons', :double, :fields => ["Ensembl Exon ID","Exon Rank in Transcript"])
|
321
381
|
|
322
|
-
string = "#: :namespace=#{$namespace}"
|
382
|
+
string = "#: :namespace=#{$namespace}\n"
|
323
383
|
string += "#Ensembl Exon ID\tEnsembl Transcript ID\tOffset\n"
|
324
384
|
|
325
385
|
exons.unnamed = true
|
@@ -329,29 +389,43 @@ file 'exon_offsets' => %w(exons transcript_exons gene_transcripts transcripts tr
|
|
329
389
|
transcript_exons.unnamed = true
|
330
390
|
|
331
391
|
exons.monitor = true
|
332
|
-
|
333
|
-
|
334
|
-
gene, start, finish, strand, chr = info
|
335
|
-
|
336
|
-
transcripts = coding_transcripts_for_exon(exon, exon_transcripts, transcript_info)
|
392
|
+
exons.through do |exon, info|
|
393
|
+
gene, start, finish, strand, chr = info
|
337
394
|
|
338
|
-
|
339
|
-
transcripts.each do |transcript|
|
340
|
-
offset = exon_offset_in_transcript( exon, transcript, exons, transcript_exons)
|
341
|
-
transcript_offsets[transcript] = offset unless offset.nil?
|
342
|
-
end
|
395
|
+
transcripts = coding_transcripts_for_exon(exon, exon_transcripts, transcript_info)
|
343
396
|
|
344
|
-
|
397
|
+
transcript_offsets = {}
|
398
|
+
transcripts.each do |transcript|
|
399
|
+
offset = exon_offset_in_transcript( exon, transcript, exons, transcript_exons)
|
400
|
+
transcript_offsets[transcript] = offset unless offset.nil?
|
345
401
|
end
|
402
|
+
|
403
|
+
string << exon << "\t" << transcript_offsets.keys * "|" << "\t" << transcript_offsets.values * "|" << "\n"
|
346
404
|
end
|
347
405
|
|
348
406
|
Open.write(t.name, string)
|
349
407
|
end
|
350
408
|
|
351
409
|
file 'gene_go' do |t|
|
352
|
-
|
410
|
+
if File.basename(FileUtils.pwd) =~ /^[a-z]{3}([0-9]{4})$/i and $1.to_i <= 2009
|
411
|
+
goterms = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_go_2009, [], nil, :type => :double, :namespace => $namespace)
|
353
412
|
|
354
|
-
|
413
|
+
goterms.add_field "GO ID" do |key, values|
|
414
|
+
values.flatten.compact.reject{|go| go.empty?}
|
415
|
+
end
|
416
|
+
|
417
|
+
goterms.add_field "GO Namespace" do |key, values|
|
418
|
+
["biological_process"] * values["GO BP ID"].reject{|go| go.empty?}.length +
|
419
|
+
["cellular_component"] * values["GO CC ID"].reject{|go| go.empty?}.length +
|
420
|
+
["molecular_function"] * values["GO MF ID"].reject{|go| go.empty?}.length
|
421
|
+
end
|
422
|
+
|
423
|
+
File.open(t.name, 'w') do |f| f.puts goterms.slice(["GO ID", "GO Namespace"]) end
|
424
|
+
else
|
425
|
+
goterms = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_go, [], nil, :type => :double, :namespace => $namespace)
|
426
|
+
|
427
|
+
File.open(t.name, 'w') do |f| f.puts goterms end
|
428
|
+
end
|
355
429
|
end
|
356
430
|
|
357
431
|
file 'gene_go_bp' => 'gene_go' do |t|
|
@@ -370,13 +444,18 @@ file 'gene_go_bp' => 'gene_go' do |t|
|
|
370
444
|
end
|
371
445
|
|
372
446
|
|
373
|
-
|
374
447
|
file 'gene_pfam' do |t|
|
375
448
|
goterms = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_pfam, [], nil, :type => :double, :namespace => $namespace)
|
376
449
|
|
377
450
|
File.open(t.name, 'w') do |f| f.puts goterms end
|
378
451
|
end
|
379
452
|
|
453
|
+
file 'chromosomes' do |t|
|
454
|
+
goterms = BioMart.tsv($biomart_db, ['Chromosome Name', "chromosome_name"] , [] , [], nil, :type => :double, :namespace => $namespace)
|
455
|
+
|
456
|
+
File.open(t.name, 'w') do |f| f.puts goterms end
|
457
|
+
end
|
458
|
+
|
380
459
|
|
381
460
|
rule /[a-z]{3}[0-9]{4}\/.*/i do |t|
|
382
461
|
t.name =~ /([a-z]{3}[0-9]{4})\/(.*)/i
|
@@ -47,6 +47,18 @@ class TestBioMart < Test::Unit::TestCase
|
|
47
47
|
assert_equal ['Protein ID', 'RefSeq Peptide'], data.fields
|
48
48
|
end
|
49
49
|
end
|
50
|
+
|
51
|
+
def __test_chunk
|
52
|
+
chrs = %w(I II III IV V VI VII VIII IX X XI XII XIII XIV XV XVI MT 2-micron)
|
53
|
+
data = BioMart.query('scerevisiae_gene_ensembl','entrezgene', ['protein_id','refseq_peptide','external_gene_id','ensembl_gene_id'], [], nil, :chunk_filter => ['chromosome_name', chrs], :nocache => false, :wget_options => { :quiet => false})
|
54
|
+
assert(data['852236']['external_gene_id'].include? 'YBL044W')
|
55
|
+
|
56
|
+
TmpFile.with_file do |f|
|
57
|
+
filename = BioMart.query('scerevisiae_gene_ensembl','entrezgene', ['protein_id','refseq_peptide','external_gene_id','ensembl_gene_id'], [], nil, :nocache => false, :wget_options => { :quiet => false}, :filename => f)
|
58
|
+
data = TSV.open Open.open(filename)
|
59
|
+
assert(data['852236']['external_gene_id'].include? 'YBL044W')
|
60
|
+
end
|
61
|
+
end
|
50
62
|
end
|
51
63
|
|
52
64
|
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-sources
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 19
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 1
|
8
|
-
- 0
|
9
8
|
- 1
|
10
|
-
|
9
|
+
- 0
|
10
|
+
version: 1.1.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Miguel Vazquez
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-
|
18
|
+
date: 2011-11-17 00:00:00 +01:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -105,6 +105,7 @@ files:
|
|
105
105
|
- lib/rbbt/sources/CTCAE.rb
|
106
106
|
- lib/rbbt/sources/InterPro.rb
|
107
107
|
- lib/rbbt/sources/NCI.rb
|
108
|
+
- lib/rbbt/sources/barcode.rb
|
108
109
|
- lib/rbbt/sources/bibtex.rb
|
109
110
|
- lib/rbbt/sources/biomart.rb
|
110
111
|
- lib/rbbt/sources/entrez.rb
|
@@ -115,6 +116,7 @@ files:
|
|
115
116
|
- lib/rbbt/sources/organism/sequence.rb
|
116
117
|
- lib/rbbt/sources/polysearch.rb
|
117
118
|
- lib/rbbt/sources/pubmed.rb
|
119
|
+
- lib/rbbt/sources/tfacts.rb
|
118
120
|
- lib/rbbt/sources/wgEncodeBroadHmm.rb
|
119
121
|
- share/install/InterPro/Rakefile
|
120
122
|
- share/install/JoChem/Rakefile
|