rbbt-sources 1.0.1 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/rbbt/sources/NCI.rb +58 -0
- data/lib/rbbt/sources/barcode.rb +23 -0
- data/lib/rbbt/sources/biomart.rb +46 -13
- data/lib/rbbt/sources/entrez.rb +1 -1
- data/lib/rbbt/sources/go.rb +30 -0
- data/lib/rbbt/sources/tfacts.rb +65 -0
- data/share/install/NCI/Rakefile +3 -3
- data/share/install/Organism/Hsa/Rakefile +6 -0
- data/share/install/Organism/organism_helpers.rb +109 -30
- data/test/rbbt/sources/test_biomart.rb +12 -0
- metadata +6 -4
data/lib/rbbt/sources/NCI.rb
CHANGED
@@ -5,3 +5,61 @@ module NCI
|
|
5
5
|
|
6
6
|
NCI.claim NCI.root.find, :rake, Rbbt.share.install.NCI.Rakefile.find(:lib)
|
7
7
|
end
|
8
|
+
|
9
|
+
if defined? Entity
|
10
|
+
|
11
|
+
module NCINaturePathways
|
12
|
+
extend Entity
|
13
|
+
self.format = "NCI Nature Pathway ID"
|
14
|
+
|
15
|
+
property :name => :array2single do
|
16
|
+
@name ||= NCI.nature_pathways.tsv(:persist => true, :key_field => "NCI Nature Pathway ID", :fields => ["Pathway Name"], :type => :flat, :merge => true).values_at *self
|
17
|
+
end
|
18
|
+
|
19
|
+
property :genes => :array2single do
|
20
|
+
@genes ||= NCI.nature_pathways.tsv(:persist => true, :key_field => "NCI Nature Pathway ID", :fields => ["UniProt/SwissProt Accession"], :type => :flat, :merge => true).values_at *self
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
module NCIReactomePathways
|
25
|
+
extend Entity
|
26
|
+
self.format = "NCI Reactome Pathway ID"
|
27
|
+
|
28
|
+
property :name => :array2single do
|
29
|
+
@name ||= NCI.reactome_pathways.tsv(:persist => true, :key_field => "NCI Reactome Pathway ID", :fields => ["Pathway Name"], :type => :flat, :merge => true).values_at *self
|
30
|
+
end
|
31
|
+
|
32
|
+
property :genes => :array2single do
|
33
|
+
@genes ||= NCI.reactome_pathways.tsv(:persist => true, :key_field => "NCI Reactome Pathway ID", :fields => ["UniProt/SwissProt Accession"], :type => :flat, :merge => true).values_at *self
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
module NCIBioCartaPathways
|
38
|
+
extend Entity
|
39
|
+
self.format = "NCI BioCarta Pathway ID"
|
40
|
+
|
41
|
+
property :name => :array2single do
|
42
|
+
@name ||= NCI.biocarta_pathways.tsv(:persist => true, :key_field => "NCI BioCarta Pathway ID", :fields => ["Pathway Name"], :type => :flat, :merge => true).values_at *self
|
43
|
+
end
|
44
|
+
|
45
|
+
property :genes => :array2single do
|
46
|
+
@genes ||= NCI.biocarta_pathways.tsv(:persist => true, :key_field => "NCI BioCarta Pathway ID", :fields => ["Entrez Gene ID"], :type => :flat, :merge => true).values_at *self
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
if defined? Gene and Entity === Gene
|
51
|
+
module Gene
|
52
|
+
property :nature_pathways => :array2single do
|
53
|
+
@nature_pathways ||= NCI.nature_pathways.tsv(:persist => true, :key_field => "UniProt/SwissProt Accession", :fields => ["NCI Nature Pathway ID"], :type => :flat, :merge => true).values_at *self.to("UniProt/SwissProt Accession")
|
54
|
+
end
|
55
|
+
|
56
|
+
property :reactome_pathways => :array2single do
|
57
|
+
@reactome_pathways ||= NCI.reactome_pathways.tsv(:persist => true, :key_field => "UniProt/SwissProt Accession", :fields => ["NCI Reactome Pathway ID"], :type => :flat, :merge => true).values_at *self.to("UniProt/SwissProt Accession")
|
58
|
+
end
|
59
|
+
|
60
|
+
property :biocarta_pathways => :array2single do
|
61
|
+
@biocarta_pathways ||= NCI.biocarta_pathways.tsv(:persist => true, :key_field => "Entrez Gene ID", :fields => ["NCI BioCarta Pathway ID"], :type => :flat, :merge => true).values_at *self.entrez
|
62
|
+
end
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'rbbt-util'
|
2
|
+
|
3
|
+
module Barcode
|
4
|
+
extend Resource
|
5
|
+
self.subdir = "share/databases/Barcode"
|
6
|
+
|
7
|
+
Barcode.claim Barcode.transcriptome, :proc do
|
8
|
+
tsv = TSV.open(Open.open("http://rafalab.jhsph.edu/barcode/abc.ntc.GPL570.csv"),
|
9
|
+
:fix => Proc.new{|l| l.gsub('"', '').gsub(',', "\t")}, :header_hash => "",
|
10
|
+
:type => :list, :cast => :to_f)
|
11
|
+
io = Open.open("http://rafalab.jhsph.edu/barcode/abc.ntc.GPL570.csv")
|
12
|
+
fields = io.gets.chomp.gsub('"','').split(',')
|
13
|
+
io.close
|
14
|
+
fields.shift
|
15
|
+
tsv.fields = fields
|
16
|
+
tsv.key_field = "AFFY HG U133-PLUS-2"
|
17
|
+
tsv.to_s
|
18
|
+
end
|
19
|
+
|
20
|
+
end
|
21
|
+
|
22
|
+
|
23
|
+
Barcode.transcriptome.produce
|
data/lib/rbbt/sources/biomart.rb
CHANGED
@@ -22,7 +22,7 @@ module BioMart
|
|
22
22
|
@@biomart_query_xml = <<-EOT
|
23
23
|
<?xml version="1.0" encoding="UTF-8"?>
|
24
24
|
<!DOCTYPE Query>
|
25
|
-
<Query
|
25
|
+
<Query completionStamp="1" virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >
|
26
26
|
<Dataset name = "<!--DATABASE-->" interface = "default" >
|
27
27
|
<!--FILTERS-->
|
28
28
|
<!--MAIN-->
|
@@ -47,26 +47,50 @@ module BioMart
|
|
47
47
|
end
|
48
48
|
|
49
49
|
def self.get(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
|
50
|
+
# Merge the wget defaults into the caller-supplied options rather than replacing them, so keys such as :chunk_filter (read just below) and :nocache are not silently dropped.
open_options = Misc.add_defaults open_options, :wget_options => {"--read-timeout=" => 9000, "--tries=" => 1}
|
50
51
|
repeats = true
|
51
52
|
attrs ||= []
|
52
53
|
filters ||= ["with_#{main}"]
|
53
54
|
|
55
|
+
if chunk_filter = open_options.delete(:chunk_filter)
|
56
|
+
filter, values = chunk_filter
|
57
|
+
merged_file = TmpFile.tmp_file
|
58
|
+
f = File.open(merged_file, 'w')
|
59
|
+
values.each do |value|
|
60
|
+
data = get(database, main, attrs, filters + [[filter, value]], data, open_options)
|
61
|
+
f.write Open.read(data)
|
62
|
+
end
|
63
|
+
f.close
|
64
|
+
return merged_file
|
65
|
+
end
|
66
|
+
|
54
67
|
query = @@biomart_query_xml.dup
|
55
68
|
query.sub!(/<!--DATABASE-->/,database)
|
56
69
|
query.sub!(/<!--FILTERS-->/, filters.collect{|name, v| v.nil? ? "<Filter name = \"#{ name }\" excluded = \"0\"/>" : "<Filter name = \"#{ name }\" value = \"#{Array === v ? v * "," : v}\"/>" }.join("\n") )
|
57
70
|
query.sub!(/<!--MAIN-->/,"<Attribute name = \"#{main}\" />")
|
58
71
|
query.sub!(/<!--ATTRIBUTES-->/, attrs.collect{|name| "<Attribute name = \"#{ name }\"/>"}.join("\n") )
|
59
72
|
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
response = Open.read(
|
73
|
+
url = @archive_url ? @archive_url + query.gsub(/\n/,' ') : BIOMART_URL + query.gsub(/\n/,' ')
|
74
|
+
|
75
|
+
begin
|
76
|
+
response = Open.read(url, open_options.dup)
|
77
|
+
rescue
|
78
|
+
Open.remove_from_cache url, open_options
|
79
|
+
raise $!
|
64
80
|
end
|
65
81
|
|
66
|
-
if response.empty? or response =~ /Query ERROR:/
|
82
|
+
if response.empty? or response =~ /Query ERROR:/
|
83
|
+
Open.remove_from_cache url, open_options
|
67
84
|
raise BioMart::QueryError, response
|
68
85
|
end
|
69
86
|
|
87
|
+
if not response =~ /\[success\]$/sm
|
88
|
+
Open.remove_from_cache url, open_options
|
89
|
+
raise BioMart::QueryError, "Uncomplete result"
|
90
|
+
end
|
91
|
+
|
92
|
+
# Strip the completion stamp. The /s flag is omitted because in Ruby it selects Windows-31J encoding and would raise on non-ASCII UTF-8 responses.
response.sub!(/\n\[success\]$/,'')
|
93
|
+
|
70
94
|
result_file = TmpFile.tmp_file
|
71
95
|
Open.write(result_file, response)
|
72
96
|
|
@@ -101,10 +125,10 @@ module BioMart
|
|
101
125
|
# cause an error if the BioMart WS does not allow filtering with that
|
102
126
|
# attribute.
|
103
127
|
def self.query(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
|
104
|
-
open_options = Misc.add_defaults open_options, :nocache => false, :filename => nil, :field_names => nil
|
105
|
-
filename, field_names = Misc.process_options open_options, :filename, :field_names
|
128
|
+
open_options = Misc.add_defaults open_options, :nocache => false, :filename => nil, :field_names => nil, :by_chr => false
|
129
|
+
filename, field_names, by_chr = Misc.process_options open_options, :filename, :field_names, :by_chr
|
106
130
|
attrs ||= []
|
107
|
-
|
131
|
+
|
108
132
|
open_options = Misc.add_defaults open_options, :keep_empty => false, :merge => true
|
109
133
|
|
110
134
|
Log.low "BioMart query: '#{main}' [#{(attrs || []) * ', '}] [#{(filters || []) * ', '}] #{open_options.inspect}"
|
@@ -122,11 +146,17 @@ module BioMart
|
|
122
146
|
|
123
147
|
chunks << chunk if chunk.any?
|
124
148
|
|
149
|
+
chunks << [] if chunks.empty?
|
150
|
+
|
125
151
|
Log.low "Chunks: #{chunks.length}"
|
126
|
-
chunks.
|
127
|
-
|
128
|
-
|
129
|
-
|
152
|
+
if chunks.any?
|
153
|
+
chunks.each_with_index{|chunk,i|
|
154
|
+
Log.low "Chunk #{ i + 1 } / #{chunks.length}: [#{chunk * ", "}]"
|
155
|
+
data = get(database, main, chunk, filters, data, open_options)
|
156
|
+
}
|
157
|
+
else
|
158
|
+
data = get(database, main, [], filters, data, open_options)
|
159
|
+
end
|
130
160
|
|
131
161
|
open_options[:filename] ||= "BioMart[#{main}+#{attrs.length}]"
|
132
162
|
if filename.nil?
|
@@ -150,10 +180,13 @@ module BioMart
|
|
150
180
|
end
|
151
181
|
|
152
182
|
def self.tsv(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
|
183
|
+
attrs ||= []
|
184
|
+
|
153
185
|
if @archive_url
|
154
186
|
attrs = attrs.reject{|attr| (MISSING_IN_ARCHIVE[@archive] || []).include? attr[1]}
|
155
187
|
end
|
156
188
|
|
189
|
+
|
157
190
|
codes = attrs.collect{|attr| attr[1]}
|
158
191
|
if open_options[:filename].nil?
|
159
192
|
tsv = query(database, main.last, codes, filters, data, open_options)
|
data/lib/rbbt/sources/entrez.rb
CHANGED
@@ -35,7 +35,7 @@ module Entrez
|
|
35
35
|
|
36
36
|
|
37
37
|
def self.entrez2pubmed(taxs)
|
38
|
-
options = {:key_field => 1, :fields => 2, :persist => true, :merge => true}
|
38
|
+
options = {:key_field => 1, :fields => [2], :persist => true, :merge => true}
|
39
39
|
|
40
40
|
taxs = [taxs] unless taxs.is_a?(Array)
|
41
41
|
options.merge! :grep => taxs.collect{|t| "^" + t.to_s}
|
data/lib/rbbt/sources/go.rb
CHANGED
@@ -84,3 +84,33 @@ module GO
|
|
84
84
|
end
|
85
85
|
end
|
86
86
|
end
|
87
|
+
|
88
|
+
if defined? Entity
|
89
|
+
|
90
|
+
module GOTerm
|
91
|
+
extend Entity
|
92
|
+
self.format = "GO ID"
|
93
|
+
|
94
|
+
property :name => :array2single do
|
95
|
+
@name ||= GO.id2name(self)
|
96
|
+
end
|
97
|
+
|
98
|
+
property :genes => :array2single do |organism|
|
99
|
+
@genes ||= Organism.gene_go(organism).tsv(:persist => true, :key_field => "GO ID", :fields => ["Ensembl Gene ID"], :type => :flat, :merge => true).values_at *self
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
103
|
+
if defined? Gene and Entity === Gene
|
104
|
+
module Gene
|
105
|
+
property :go_terms => :array2single do |organism|
|
106
|
+
@go_terms ||= Organism.gene_go(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true).values_at *self.ensembl
|
107
|
+
end
|
108
|
+
|
109
|
+
property :go_bp_terms => :array2single do |organism|
|
110
|
+
@go_bp_terms ||= Organism.gene_go_bp(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["GO ID"], :type => :flat, :merge => true).values_at *self.ensembl
|
111
|
+
end
|
112
|
+
end
|
113
|
+
end
|
114
|
+
end
|
115
|
+
|
116
|
+
|
@@ -0,0 +1,65 @@
|
|
1
|
+
require 'rbbt'
|
2
|
+
require 'rbbt/resource'
|
3
|
+
require 'nokogiri'
|
4
|
+
|
5
|
+
module TFacts
|
6
|
+
extend Resource
|
7
|
+
self.subdir = "share/databases/TF"
|
8
|
+
|
9
|
+
def self.targets_for_gene_unsigned(gene_name)
|
10
|
+
doc = Nokogiri::HTML(Open.read("http://www.tfacts.org/source/tfsResultsns.php", :post => "TFS_ID=#{ gene_name }"))
|
11
|
+
|
12
|
+
doc.css("td a").collect{|link| link.content.strip}
|
13
|
+
end
|
14
|
+
|
15
|
+
def self.targets_for_gene_signed(gene_name)
|
16
|
+
doc = Nokogiri::HTML(Open.read("http://www.tfacts.org/source/tfsResults.php", :post => "TFS_ID=#{ gene_name }"))
|
17
|
+
|
18
|
+
rows = doc.css("tr")
|
19
|
+
rows.shift
|
20
|
+
targets = {}
|
21
|
+
rows.each{|row| gene, sign = row.css("td"); targets[gene.css("a").first.content.strip] = sign.content.strip}
|
22
|
+
targets
|
23
|
+
end
|
24
|
+
|
25
|
+
def self.known_transcription_factors_signed
|
26
|
+
Open.read("http://www.tfacts.org/source/tfs.php").scan(/OPTION VALUE=([^\s]+)/).flatten
|
27
|
+
end
|
28
|
+
|
29
|
+
def self.known_transcription_factors_unsigned
|
30
|
+
Open.read("http://www.tfacts.org/source/tfsns.php").scan(/OPTION VALUE=([^\s]+)/).flatten
|
31
|
+
end
|
32
|
+
|
33
|
+
TFacts.claim TFacts.targets, :proc do
|
34
|
+
tsv = Misc.process_to_hash(TFacts.known_transcription_factors_unsigned){|list| list.collect{|tf| TFacts.targets_for_gene_unsigned(tf)}}
|
35
|
+
TSV.setup tsv, :key_field => "Associated Gene Name", :fields => ["Target Associated Gene Name"], :type => :flat
|
36
|
+
tsv.to_s
|
37
|
+
end
|
38
|
+
|
39
|
+
TFacts.claim TFacts.targets_signed, :proc do
|
40
|
+
tsv = TSV.setup({}, :key_field => "Associated Gene Name", :fields => ["Target Associated Gene Name", "Target Sign"], :type => :double)
|
41
|
+
Misc.process_to_hash(TFacts.known_transcription_factors_signed){|list| list.collect{|tf| TFacts.targets_for_gene_signed(tf)}}.each do |tf, targets|
|
42
|
+
tsv[tf] = [targets.keys, targets.values]
|
43
|
+
end
|
44
|
+
tsv.to_s
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
|
49
|
+
if defined? Entity and defined? Gene and Entity === Gene
|
50
|
+
|
51
|
+
module Gene
|
52
|
+
property :is_transcription_factor? => :array2single do
|
53
|
+
@is_trasncription_factor ||= begin
|
54
|
+
tfs = TFacts.targets.keys
|
55
|
+
self.name.collect{|gene| tfs.include? gene}
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
property :transcription_targets => :array2single do
|
60
|
+
@transcription_targets ||= begin
|
61
|
+
Gene.setup(TFacts.targets.tsv(:persist => true).values_at(*self.name), "Associated Gene Name", self.organism)
|
62
|
+
end
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
data/share/install/NCI/Rakefile
CHANGED
@@ -51,7 +51,7 @@ file 'nature_pathways' do |t|
|
|
51
51
|
|
52
52
|
pathways = NCI.get_pathways(xml)
|
53
53
|
|
54
|
-
Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI Pathway ID", :fields => ["Pathway Name","UniProt/SwissProt Accession"]).to_s)
|
54
|
+
Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI Nature Pathway ID", :fields => ["Pathway Name","UniProt/SwissProt Accession"]).to_s)
|
55
55
|
end
|
56
56
|
|
57
57
|
file 'biocarta_pathways' do |t|
|
@@ -62,7 +62,7 @@ file 'biocarta_pathways' do |t|
|
|
62
62
|
|
63
63
|
pathways = NCI.get_pathways(xml, "LL")
|
64
64
|
|
65
|
-
Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI Pathway ID", :fields => ["Pathway Name","Entrez Gene ID"]).to_s)
|
65
|
+
Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI BioCarta Pathway ID", :fields => ["Pathway Name","Entrez Gene ID"]).to_s)
|
66
66
|
end
|
67
67
|
|
68
68
|
file 'reactome_pathways' do |t|
|
@@ -73,7 +73,7 @@ file 'reactome_pathways' do |t|
|
|
73
73
|
|
74
74
|
pathways = NCI.get_pathways(xml, "UP")
|
75
75
|
|
76
|
-
Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI Pathway ID", :fields => ["Pathway Name","UniProt/SwissProt Accession"]).to_s)
|
76
|
+
Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI Reactome Pathway ID", :fields => ["Pathway Name","UniProt/SwissProt Accession"]).to_s)
|
77
77
|
end
|
78
78
|
|
79
79
|
|
@@ -91,6 +91,12 @@ $biomart_go= [
|
|
91
91
|
["GO Namespace", 'namespace_1003'],
|
92
92
|
]
|
93
93
|
|
94
|
+
$biomart_go_2009= [
|
95
|
+
["GO BP ID", 'go_biological_process_id'],
|
96
|
+
["GO MF ID", 'go_molecular_function_id'],
|
97
|
+
["GO CC ID", 'go_cellular_component_id'],
|
98
|
+
]
|
99
|
+
|
94
100
|
$biomart_pfam= [
|
95
101
|
["Pfam Domain", 'pfam'],
|
96
102
|
]
|
@@ -51,6 +51,13 @@ $biomart_transcript_exons = [
|
|
51
51
|
['Exon Rank in Transcript','rank'],
|
52
52
|
]
|
53
53
|
|
54
|
+
$biomart_exon_phase = [
|
55
|
+
$biomart_ensembl_transcript,
|
56
|
+
['Phase','phase'],
|
57
|
+
]
|
58
|
+
|
59
|
+
|
60
|
+
|
54
61
|
$biomart_exons = [
|
55
62
|
$biomart_ensembl_gene,
|
56
63
|
['Exon Strand','strand'],
|
@@ -66,6 +73,7 @@ end
|
|
66
73
|
|
67
74
|
file 'identifiers' do |t|
|
68
75
|
identifiers = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_identifiers, [], nil, :namespace => $namespace)
|
76
|
+
identifiers.unnamed = true
|
69
77
|
|
70
78
|
$biomart_identifiers.each do |name, key, prefix|
|
71
79
|
next unless identifiers.all_fields.include? name
|
@@ -95,11 +103,14 @@ file 'identifiers' do |t|
|
|
95
103
|
entrez_synonyms.fields = ["Entrez Gene Name Synonyms"]
|
96
104
|
|
97
105
|
identifiers.attach entrez_synonyms
|
106
|
+
|
98
107
|
|
99
|
-
identifiers.
|
100
|
-
|
101
|
-
|
102
|
-
|
108
|
+
identifiers.with_unnamed do
|
109
|
+
identifiers.each do |key, values|
|
110
|
+
values.each do |list|
|
111
|
+
list.reject!{|v| v.nil? or v.empty?}
|
112
|
+
list.uniq!
|
113
|
+
end
|
103
114
|
end
|
104
115
|
end
|
105
116
|
|
@@ -129,7 +140,7 @@ file 'protein_identifiers' do |t|
|
|
129
140
|
File.open(t.name, 'w') do |f| f.puts identifiers end
|
130
141
|
end
|
131
142
|
|
132
|
-
file '
|
143
|
+
file 'transcript_probes' do |t|
|
133
144
|
identifiers = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_probe_identifiers, [], nil, :namespace => $namespace)
|
134
145
|
$biomart_probe_identifiers.each do |name, key, prefix|
|
135
146
|
if prefix
|
@@ -148,7 +159,7 @@ end
|
|
148
159
|
|
149
160
|
file 'transcripts' => 'gene_positions' do |t|
|
150
161
|
transcripts = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript, [], nil, :type => :list, :namespace => $namespace)
|
151
|
-
transcripts.attach TSV.open('gene_positions'), "Chromosome Name"
|
162
|
+
transcripts.attach TSV.open('gene_positions'), :fields => ["Chromosome Name"]
|
152
163
|
|
153
164
|
File.open(t.name, 'w') do |f| f.puts transcripts end
|
154
165
|
end
|
@@ -161,13 +172,12 @@ file 'transcript_3utr' do |t|
|
|
161
172
|
f.puts "#Ensembl Transcript ID\t3' UTR Length"
|
162
173
|
utrs.each do |seq,trans|
|
163
174
|
trans.each do |tran|
|
164
|
-
f.puts [tran, seq.length] * "\t"
|
175
|
+
f.puts [tran, seq.length] * "\t" if seq =~ /^[ACTG]+$/
|
165
176
|
end
|
166
177
|
end
|
167
178
|
end
|
168
179
|
end
|
169
180
|
|
170
|
-
|
171
181
|
file 'transcript_5utr' do |t|
|
172
182
|
utrs = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_5utr, [], nil, :type => :flat, :namespace => $namespace)
|
173
183
|
|
@@ -176,7 +186,7 @@ file 'transcript_5utr' do |t|
|
|
176
186
|
f.puts "#Ensembl Transcript ID\t5' UTR Length"
|
177
187
|
utrs.each do |seq,trans|
|
178
188
|
trans.each do |tran|
|
179
|
-
f.puts [tran, seq.length] * "\t"
|
189
|
+
f.puts [tran, seq.length] * "\t" if seq =~ /^[ACTG]+$/
|
180
190
|
end
|
181
191
|
end
|
182
192
|
end
|
@@ -193,7 +203,7 @@ file 'gene_sequence' do |t|
|
|
193
203
|
|
194
204
|
File.open(t.name, 'w') do |f|
|
195
205
|
f.puts "#: :type=:single"
|
196
|
-
f.puts "#Ensembl Gene ID\
|
206
|
+
f.puts "#Ensembl Gene ID\tGene Sequence"
|
197
207
|
sequences.each do |seq, genes|
|
198
208
|
genes.each do |gene|
|
199
209
|
f.write gene
|
@@ -205,7 +215,9 @@ file 'gene_sequence' do |t|
|
|
205
215
|
end
|
206
216
|
end
|
207
217
|
|
208
|
-
file 'protein_sequence' do |t|
|
218
|
+
file 'protein_sequence' => 'chromosomes' do |t|
|
219
|
+
#chromosomes = TSV.open(t.prerequisites.first).keys
|
220
|
+
#sequences = BioMart.tsv($biomart_db, $biomart_ensembl_protein, $biomart_protein_sequence, [], nil, :type => :flat, :namespace => $namespace, :chunk_filter => ['chromosome_name', chromosomes])
|
209
221
|
sequences = BioMart.tsv($biomart_db, $biomart_ensembl_protein, $biomart_protein_sequence, [], nil, :type => :flat, :namespace => $namespace)
|
210
222
|
|
211
223
|
File.open(t.name, 'w') do |f|
|
@@ -220,12 +232,11 @@ file 'protein_sequence' do |t|
|
|
220
232
|
end
|
221
233
|
end
|
222
234
|
end
|
223
|
-
|
224
235
|
end
|
225
236
|
|
226
237
|
file 'exons' => 'gene_positions' do |t|
|
227
238
|
exons = BioMart.tsv($biomart_db, $biomart_ensembl_exon, $biomart_exons, [], nil, :merge => false, :type => :list, :namespace => $namespace)
|
228
|
-
exons.attach TSV.open('gene_positions'), "Chromosome Name"
|
239
|
+
exons.attach TSV.open('gene_positions'), :fields => ["Chromosome Name"]
|
229
240
|
|
230
241
|
File.open(t.name, 'w') do |f| f.puts exons end
|
231
242
|
end
|
@@ -236,12 +247,61 @@ file 'transcript_exons' do |t|
|
|
236
247
|
File.open(t.name, 'w') do |f| f.puts exons end
|
237
248
|
end
|
238
249
|
|
250
|
+
file 'exon_phase' do |t|
|
251
|
+
exons = BioMart.tsv($biomart_db, $biomart_ensembl_exon, $biomart_exon_phase, [], nil, :keep_empty => true, :namespace => $namespace)
|
252
|
+
|
253
|
+
File.open(t.name, 'w') do |f| f.puts exons end
|
254
|
+
end
|
255
|
+
|
256
|
+
|
257
|
+
#file 'transcript_phase' do |t|
|
258
|
+
# tsv = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["Phase"], :type => :single, :cast => :to_i)
|
259
|
+
#
|
260
|
+
# transcript_cds_start = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, [['CDNA Start','cds_start']], [], nil, :type => :flat, :namespace => $namespace)
|
261
|
+
# transcript_cds_start.through do |transcript, values|
|
262
|
+
# phase = values.compact.reject{|p| p.empty?}.select{|p| p == "1" or p == "2"}.first
|
263
|
+
# tsv[transcript] = phase.to_i unless phase.nil?
|
264
|
+
# end
|
265
|
+
#
|
266
|
+
# File.open(t.name, 'w') do |f| f.puts tsv end
|
267
|
+
#end
|
268
|
+
|
269
|
+
file 'transcript_phase' => ['exon_phase', 'transcript_exons'] do |t|
|
270
|
+
tsv = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["phase"], :type => :single, :cast => :to_i)
|
271
|
+
|
272
|
+
transcript_exons = TSV.open(t.prerequisites.last)
|
273
|
+
transcript_exons.unnamed = true
|
274
|
+
|
275
|
+
exon_is_first_for_transcripts = {}
|
276
|
+
|
277
|
+
transcript_exons.through do |transcript, value|
|
278
|
+
exon = Misc.zip_fields(value).select{|exon, rank| rank == "1" }.first[0]
|
279
|
+
exon_is_first_for_transcripts[exon] ||= []
|
280
|
+
exon_is_first_for_transcripts[exon] << transcript
|
281
|
+
end
|
282
|
+
|
283
|
+
exon_phase = TSV.open(t.prerequisites.first)
|
284
|
+
exon_phase.unnamed = true
|
285
|
+
exon_phase.monitor = true
|
286
|
+
|
287
|
+
exon_phase.through do |exon, value|
|
288
|
+
Misc.zip_fields(value).each{|transcript, phase|
|
289
|
+
next unless exon_is_first_for_transcripts.include? exon
|
290
|
+
next unless exon_is_first_for_transcripts[exon].include? transcript
|
291
|
+
tsv[transcript] = phase
|
292
|
+
}
|
293
|
+
end
|
294
|
+
|
295
|
+
File.open(t.name, 'w') do |f| f.puts tsv end
|
296
|
+
end
|
297
|
+
|
298
|
+
|
239
299
|
file 'transcript_sequence' do |t|
|
240
300
|
sequences = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_sequence, [], nil, :type => :flat, :namespace => $namespace)
|
241
301
|
|
242
302
|
File.open(t.name, 'w') do |f|
|
243
303
|
f.puts "#: :type=:single"
|
244
|
-
f.puts "#Ensembl Transcript ID\
|
304
|
+
f.puts "#Ensembl Transcript ID\tTranscript Sequence"
|
245
305
|
sequences.each do |seq, genes|
|
246
306
|
genes.each do |gene|
|
247
307
|
f.write gene
|
@@ -272,7 +332,7 @@ end
|
|
272
332
|
|
273
333
|
file 'gene_pmids' do |t|
|
274
334
|
tsv = Entrez.entrez2pubmed($taxs)
|
275
|
-
text = "#: :namespace=#{$namespace}"
|
335
|
+
text = "#: :namespace=#{$namespace}\n"
|
276
336
|
text += "#Entrez Gene ID\tPMID"
|
277
337
|
tsv.each do |gene, pmids|
|
278
338
|
text << "\n" << gene << "\t" << pmids * "|"
|
@@ -319,7 +379,7 @@ file 'exon_offsets' => %w(exons transcript_exons gene_transcripts transcripts tr
|
|
319
379
|
transcript_info = TSV.open('transcripts', :list, :fields => ["Ensembl Protein ID"])
|
320
380
|
transcript_exons = TSV.open('transcript_exons', :double, :fields => ["Ensembl Exon ID","Exon Rank in Transcript"])
|
321
381
|
|
322
|
-
string = "#: :namespace=#{$namespace}"
|
382
|
+
string = "#: :namespace=#{$namespace}\n"
|
323
383
|
string += "#Ensembl Exon ID\tEnsembl Transcript ID\tOffset\n"
|
324
384
|
|
325
385
|
exons.unnamed = true
|
@@ -329,29 +389,43 @@ file 'exon_offsets' => %w(exons transcript_exons gene_transcripts transcripts tr
|
|
329
389
|
transcript_exons.unnamed = true
|
330
390
|
|
331
391
|
exons.monitor = true
|
332
|
-
|
333
|
-
|
334
|
-
gene, start, finish, strand, chr = info
|
335
|
-
|
336
|
-
transcripts = coding_transcripts_for_exon(exon, exon_transcripts, transcript_info)
|
392
|
+
exons.through do |exon, info|
|
393
|
+
gene, start, finish, strand, chr = info
|
337
394
|
|
338
|
-
|
339
|
-
transcripts.each do |transcript|
|
340
|
-
offset = exon_offset_in_transcript( exon, transcript, exons, transcript_exons)
|
341
|
-
transcript_offsets[transcript] = offset unless offset.nil?
|
342
|
-
end
|
395
|
+
transcripts = coding_transcripts_for_exon(exon, exon_transcripts, transcript_info)
|
343
396
|
|
344
|
-
|
397
|
+
transcript_offsets = {}
|
398
|
+
transcripts.each do |transcript|
|
399
|
+
offset = exon_offset_in_transcript( exon, transcript, exons, transcript_exons)
|
400
|
+
transcript_offsets[transcript] = offset unless offset.nil?
|
345
401
|
end
|
402
|
+
|
403
|
+
string << exon << "\t" << transcript_offsets.keys * "|" << "\t" << transcript_offsets.values * "|" << "\n"
|
346
404
|
end
|
347
405
|
|
348
406
|
Open.write(t.name, string)
|
349
407
|
end
|
350
408
|
|
351
409
|
file 'gene_go' do |t|
|
352
|
-
|
410
|
+
if File.basename(FileUtils.pwd) =~ /^[a-z]{3}([0-9]{4})$/i and $1.to_i <= 2009
|
411
|
+
goterms = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_go_2009, [], nil, :type => :double, :namespace => $namespace)
|
353
412
|
|
354
|
-
|
413
|
+
goterms.add_field "GO ID" do |key, values|
|
414
|
+
# Concatenate in BP, CC, MF order so every ID lines up positionally with the "GO Namespace" labels built below (a plain flatten yields BP, MF, CC and mislabels MF/CC terms).
[values["GO BP ID"], values["GO CC ID"], values["GO MF ID"]].flatten.compact.reject{|go| go.empty?}
|
415
|
+
end
|
416
|
+
|
417
|
+
goterms.add_field "GO Namespace" do |key, values|
|
418
|
+
["biological_process"] * values["GO BP ID"].reject{|go| go.empty?}.length +
|
419
|
+
["cellular_component"] * values["GO CC ID"].reject{|go| go.empty?}.length +
|
420
|
+
["molecular_function"] * values["GO MF ID"].reject{|go| go.empty?}.length
|
421
|
+
end
|
422
|
+
|
423
|
+
File.open(t.name, 'w') do |f| f.puts goterms.slice(["GO ID", "GO Namespace"]) end
|
424
|
+
else
|
425
|
+
goterms = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_go, [], nil, :type => :double, :namespace => $namespace)
|
426
|
+
|
427
|
+
File.open(t.name, 'w') do |f| f.puts goterms end
|
428
|
+
end
|
355
429
|
end
|
356
430
|
|
357
431
|
file 'gene_go_bp' => 'gene_go' do |t|
|
@@ -370,13 +444,18 @@ file 'gene_go_bp' => 'gene_go' do |t|
|
|
370
444
|
end
|
371
445
|
|
372
446
|
|
373
|
-
|
374
447
|
file 'gene_pfam' do |t|
|
375
448
|
goterms = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_pfam, [], nil, :type => :double, :namespace => $namespace)
|
376
449
|
|
377
450
|
File.open(t.name, 'w') do |f| f.puts goterms end
|
378
451
|
end
|
379
452
|
|
453
|
+
file 'chromosomes' do |t|
|
454
|
+
goterms = BioMart.tsv($biomart_db, ['Chromosome Name', "chromosome_name"] , [] , [], nil, :type => :double, :namespace => $namespace)
|
455
|
+
|
456
|
+
File.open(t.name, 'w') do |f| f.puts goterms end
|
457
|
+
end
|
458
|
+
|
380
459
|
|
381
460
|
rule /[a-z]{3}[0-9]{4}\/.*/i do |t|
|
382
461
|
t.name =~ /([a-z]{3}[0-9]{4})\/(.*)/i
|
@@ -47,6 +47,18 @@ class TestBioMart < Test::Unit::TestCase
|
|
47
47
|
assert_equal ['Protein ID', 'RefSeq Peptide'], data.fields
|
48
48
|
end
|
49
49
|
end
|
50
|
+
|
51
|
+
def __test_chunk
|
52
|
+
chrs = %w(I II III IV V VI VII VIII IX X XI XII XIII XIV XV XVI MT 2-micron)
|
53
|
+
data = BioMart.query('scerevisiae_gene_ensembl','entrezgene', ['protein_id','refseq_peptide','external_gene_id','ensembl_gene_id'], [], nil, :chunk_filter => ['chromosome_name', chrs], :nocache => false, :wget_options => { :quiet => false})
|
54
|
+
assert(data['852236']['external_gene_id'].include? 'YBL044W')
|
55
|
+
|
56
|
+
TmpFile.with_file do |f|
|
57
|
+
filename = BioMart.query('scerevisiae_gene_ensembl','entrezgene', ['protein_id','refseq_peptide','external_gene_id','ensembl_gene_id'], [], nil, :nocache => false, :wget_options => { :quiet => false}, :filename => f)
|
58
|
+
data = TSV.open Open.open(filename)
|
59
|
+
assert(data['852236']['external_gene_id'].include? 'YBL044W')
|
60
|
+
end
|
61
|
+
end
|
50
62
|
end
|
51
63
|
|
52
64
|
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-sources
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 19
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 1
|
8
|
-
- 0
|
9
8
|
- 1
|
10
|
-
|
9
|
+
- 0
|
10
|
+
version: 1.1.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Miguel Vazquez
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-
|
18
|
+
date: 2011-11-17 00:00:00 +01:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -105,6 +105,7 @@ files:
|
|
105
105
|
- lib/rbbt/sources/CTCAE.rb
|
106
106
|
- lib/rbbt/sources/InterPro.rb
|
107
107
|
- lib/rbbt/sources/NCI.rb
|
108
|
+
- lib/rbbt/sources/barcode.rb
|
108
109
|
- lib/rbbt/sources/bibtex.rb
|
109
110
|
- lib/rbbt/sources/biomart.rb
|
110
111
|
- lib/rbbt/sources/entrez.rb
|
@@ -115,6 +116,7 @@ files:
|
|
115
116
|
- lib/rbbt/sources/organism/sequence.rb
|
116
117
|
- lib/rbbt/sources/polysearch.rb
|
117
118
|
- lib/rbbt/sources/pubmed.rb
|
119
|
+
- lib/rbbt/sources/tfacts.rb
|
118
120
|
- lib/rbbt/sources/wgEncodeBroadHmm.rb
|
119
121
|
- share/install/InterPro/Rakefile
|
120
122
|
- share/install/JoChem/Rakefile
|