rbbt-phgx 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/phgx.rb +2 -11
- data/lib/rbbt/mutation/fireDB.rb +11 -0
- data/lib/rbbt/mutation/polyphen.rb +172 -0
- data/lib/rbbt/mutation/sift.rb +112 -0
- data/lib/rbbt/mutation/snps_and_go.rb +61 -0
- data/lib/rbbt/sources/biogrid.rb +11 -0
- data/lib/rbbt/sources/cancer.rb +3 -8
- data/lib/rbbt/sources/dbsnp.rb +6 -0
- data/lib/rbbt/sources/hprd.rb +6 -0
- data/lib/rbbt/sources/kegg.rb +2 -5
- data/lib/rbbt/sources/matador.rb +2 -5
- data/lib/rbbt/sources/nci.rb +2 -5
- data/lib/rbbt/sources/pharmagkb.rb +2 -5
- data/lib/rbbt/sources/pina.rb +6 -0
- data/lib/rbbt/sources/reactome.rb +6 -0
- data/lib/rbbt/sources/sift.rb +5 -0
- data/lib/rbbt/sources/stitch.rb +2 -5
- data/lib/rbbt/sources/string.rb +2 -5
- data/share/Cancer/anais_annotations +7949 -0
- data/share/Cancer/anais_interactions +3402 -0
- data/share/Cancer/cancer_genes.tsv +428 -0
- data/share/install/Biogrid/Rakefile +20 -0
- data/share/install/DBSNP/Rakefile +50 -0
- data/share/install/HPRD/Rakefile +15 -0
- data/share/install/KEGG/Rakefile +3 -3
- data/share/install/Matador/Rakefile +2 -2
- data/share/install/NCI/Rakefile +2 -2
- data/share/install/PharmaGKB/Rakefile +17 -14
- data/share/install/Pina/Rakefile +16 -0
- data/share/install/Reactome/Rakefile +36 -0
- data/share/install/STITCH/Rakefile +5 -5
- data/share/install/STRING/Rakefile +2 -2
- data/test/rbbt/mutation/test_fireDB.rb +14 -0
- data/test/rbbt/mutation/test_polyphen.rb +20 -0
- data/test/rbbt/mutation/test_sift.rb +23 -0
- data/test/rbbt/mutation/test_snps_and_go.rb +32 -0
- data/test/rbbt/sources/test_cancer.rb +1 -1
- data/test/rbbt/sources/test_matador.rb +2 -3
- data/test/rbbt/sources/test_pharmagkb.rb +1 -1
- data/test/rbbt/sources/test_stitch.rb +3 -1
- metadata +47 -8
data/lib/phgx.rb
CHANGED
@@ -1,16 +1,7 @@
|
|
1
1
|
require 'rbbt-util'
|
2
|
-
require 'rbbt/util/
|
3
|
-
require 'rbbt/util/pkg_data'
|
4
|
-
require 'rbbt/util/pkg_software'
|
5
|
-
require 'rbbt/util/open'
|
6
|
-
require 'rbbt/util/tmpfile'
|
7
|
-
require 'rbbt/util/filecache'
|
2
|
+
require 'rbbt/util/resource'
|
8
3
|
|
9
4
|
module PhGx
|
10
|
-
extend
|
11
|
-
extend PKGData
|
12
|
-
extend PKGSoftware
|
13
|
-
|
14
|
-
self.load_cfg(%w(datadir), "datadir: #{File.join(ENV['HOME'], 'phgx', 'data')}\n")
|
5
|
+
extend Resource
|
15
6
|
end
|
16
7
|
|
@@ -0,0 +1,11 @@
|
|
1
|
+
require 'phgx'
|
2
|
+
require 'rbbt/util/cmd'
|
3
|
+
|
4
|
+
# Wrapper around the FireDB prediction script, which is registered as a
# PhGx software package.
module FireDB
  PhGx.add_software "FireDB" => ['', :directory]

  # Runs firePredText.pl through perl and returns whatever the command
  # output stream yields when read.
  #
  # accession - passed twice as the first two script arguments.
  # sequence  - passed as the third script argument.
  # mutation  - accepted for interface compatibility; not used by this
  #             method.
  #
  # The fourth script argument is the literal 10.
  # NOTE(review): its meaning is not visible here -- confirm against the
  # script.
  def self.predict(accession, sequence, mutation)
    script    = File.join(PhGx.find_software("FireDB"), "firePredText.pl")
    arguments = [accession, accession, sequence, 10].join(" ")

    CMD.cmd("perl #{script} #{arguments}").read
  end

end
|
@@ -0,0 +1,172 @@
|
|
1
|
+
require 'rbbt/util/open'
|
2
|
+
require 'rbbt/util/tsv'
|
3
|
+
require 'nokogiri'
|
4
|
+
require 'digest/md5'
|
5
|
+
|
6
|
+
# Client for the PolyPhen-2 web interface (project "PPHWeb2"): submits
# queries to the CGI endpoint with wget POST data, polls the job list until
# the job is ready, and then retrieves the result.
module Polyphen2

  URL="http://genetics.bwh.harvard.edu/cgi-bin/ggi/ggi2.cgi"
  URL_BASE="http://genetics.bwh.harvard.edu/"

  # Batch submission of several queries at once.
  module Batch
    # Default form fields for a batch submission; "_ggi_batch" and
    # "description" are filled in per request.
    OPTIONS = {
      "_ggi_project" => "PPHWeb2",
      "_ggi_origin" => "query",
      "_ggi_batch_file" => "",
      "description" => "",
      "NOTIFYME" => "",
      "uploaded_sequences_1" => "",
      "description_of_uploaded_sequences" => "",
      "MODELNAME" => "HumDiv",
      "UCSCDB" => "hg19",
      "SNPFILTER" => "1",
      "SNPFUNC" => "m",
      "_ggi_target_pipeline" => "Submit Batch",
    }

    # Form fields used to refresh the job list; "sid" is filled in with the
    # session id returned by the submission.
    REFRESH_OPTIONS = {
      "sid" => "",
      "_ggi_project" => "PPHWeb2",
      "_ggi_origin" => "manage",
      "_ggi_target_manage" => "Refresh",
    }

    # Submits +query+ as a batch job and waits for it to finish.
    #
    # query - text sent as the "_ggi_batch" form field.
    #
    # Returns a TSV (type :double, keyed by 'acc', merged) built from the
    # result file, or nil when the job list reports an error for the job.
    def self.predict(query)
      options = OPTIONS.merge "_ggi_batch" => query

      # The MD5 of the request doubles as the job description, which is how
      # the job is recognised later in the job list.
      desc = Digest::MD5.hexdigest(options.inspect)
      options["description"] = desc

      # NOTE(review): /Short/ presumably matches the label of the result
      # link for finished batch jobs -- confirm against the live page.
      view_link = Polyphen2.submit_and_wait(options, REFRESH_OPTIONS, desc, /Short/)
      return nil if view_link.nil?

      TSV.new Open.open(Polyphen2::URL_BASE + view_link, :nocache => true), :double, :key => 'acc', :merge => true
    end

  end


  # Default form fields for a single-substitution query; "accid", "seqpos",
  # "seqvar1", "seqvar2" and "description" are overridden per request.
  OPTIONS = {
    "ContAllHits" => 0,
    "ContThresh" => 6,
    "Map2Mismatch" => 0,
    "MaxHitGaps" => 20,
    "MinHitIde" => 0.5,
    "MinHitLen" => 100,
    "SortByIde" => 1,
    "StructAllHits" => 0,
    "_ggi_jpover" => 1,
    "_ggi_origin" => "query",
    "_ggi_project" => "PPHWeb2",
    "_ggi_target_submit" => "submit",
    "accid" => "A6NFZ4",
    "description" => "",
    "seqpos" => "34",
    "seqres" => "",
    "seqvar1" => "Y",
    "seqvar2" => "D",
    "Submit" => "Submit+Query",
  }

  # Form fields used to refresh the job list; "sid" is filled in with the
  # session id returned by the submission.
  REFRESH_OPTIONS = {
    "sid" => "",
    "_ggi_project" => "PPHWeb2",
    "_ggi_origin" => "manage",
    "_ggi_target_manage" => "Refresh",
  }

  # Splits a mutation such as "Y34D" into its parts.
  #
  # mutation - text containing <letter><digits><letter> (case-insensitive).
  #
  # Returns [reference, position, substitution] as Strings.
  # Raises ArgumentError when +mutation+ does not contain that pattern.
  def self.parse_mutation(mutation)
    match = mutation.to_s.match(/([A-Z])(\d+)([A-Z])/i)
    raise ArgumentError, "Invalid mutation: #{mutation.inspect}" if match.nil?

    match.values_at 1,2,3
  end

  # Submits a single substitution and scrapes the two confidence sections
  # of the result page.
  #
  # accession - sent as the "accid" form field.
  # mutation  - substitution in the format accepted by parse_mutation.
  #
  # Returns [div_prediction, div_score, var_prediction, var_score], read
  # from the 'HumDivConf' and 'HumVarConf' sections of the result page, or
  # nil when the job list reports an error for the job.
  # Raises ArgumentError when +mutation+ cannot be parsed.
  def self.predict(accession, mutation)
    reference, pos, substitution = parse_mutation(mutation)

    options = OPTIONS.merge "accid" => accession, "seqpos" => pos, "seqvar1" => reference, "seqvar2" => substitution

    # The MD5 of the request doubles as the job description, which is how
    # the job is recognised later in the job list.
    desc = Digest::MD5.hexdigest(options.inspect)
    options["description"] = desc

    view_link = submit_and_wait(options, REFRESH_OPTIONS, desc, /View/)
    return nil if view_link.nil?

    doc = Nokogiri::HTML(Open.read(URL_BASE + view_link, :nocache => true))

    para = doc.css('div#HumDivConf > p').first
    div_prediction = para.css('span').first.content
    div_score = para.css('b').first.content

    para = doc.css('div#HumVarConf > p').first
    var_prediction = para.css('span').first.content
    var_score = para.css('b').first.content

    [div_prediction, div_score, var_prediction, var_score]
  end

  # Internal helper: POSTs +options+ to URL as "key=value" pairs joined by
  # "&" and returns the parsed HTML response.
  # NOTE(review): keys and values are sent as-is, without URL encoding.
  def self.post_form(options)
    post_data = options.collect{|k,v| [k,v] * "="} * "&"
    Nokogiri::HTML(Open.read(URL, :wget_options => {"--post-data" => "'#{post_data}'"}, :nocache => true))
  end

  # Internal helper: submits +options+, then refreshes the job list once per
  # second until the job described by +desc+ either reports an error or
  # shows a link whose cell matches +ready_pattern+. There is no timeout.
  #
  # options         - form fields of the submission.
  # refresh_options - form fields of the refresh request (without "sid").
  # desc            - job description used to locate the job's row.
  # ready_pattern   - Regexp matched against the second cell of the row.
  #
  # Returns the href of the result link, or nil when the third cell of the
  # row matches /Error/.
  def self.submit_and_wait(options, refresh_options, desc, ready_pattern)
    doc = post_form(options)
    sid = doc.css('input[name=sid]').attr('value')
    refresh = refresh_options.merge "sid" => sid

    loop do
      doc = post_form(refresh)

      result_table = doc.css('body > table')[1].css('table')[2]

      # The job's row is the six-cell row whose last cell holds the
      # description.
      row = result_table.css('tr').find do |candidate|
        tds = candidate.css('td')
        tds.length == 6 && tds.last.content.strip == desc
      end

      cells = row.css('td')
      return nil if cells[2].content =~ /Error/
      return cells[1].css('a').attr('href') if cells[1].content =~ ready_pattern

      sleep 1
    end
  end
end
|
@@ -0,0 +1,112 @@
|
|
1
|
+
require 'rbbt/util/open'
|
2
|
+
require 'nokogiri'
|
3
|
+
# Client for the SIFT web service: posts protein substitutions to the
# amino-acid endpoint with wget POST data and scrapes the HTML table it
# returns.
module SIFT
  URL_AMINOACID="http://sift.jcvi.org/sift-bin/SIFT_pid_subst_all_submit.pl"
  URL_GENOMIC="http://sift.jcvi.org/sift-bin/SIFT_feed_to_chr_coords.pl"

  # Queries SIFT for one accession and one or more substitutions.
  #
  # accession - protein identifier, sent first in the "GI" field.
  # mutations - a single substitution or an Array of them; they are joined
  #             to the accession with commas.
  #
  # Returns an Array of rows (each an Array of cell texts) when +mutations+
  # is an Array, otherwise only the first row.
  def self.predict_aminoacid_mutation(accession, mutations)
    rows = query_rows([accession, mutations].flatten * ",")

    Array === mutations ? rows : rows.first
  end

  # Queries SIFT for several accessions at once.
  #
  # mutations - either a ready-made String for the "GI" field, or an Array
  #             whose entries are Arrays; each entry is joined with ", "
  #             and the entries are joined with newlines.
  #
  # Returns an Array of rows (each an Array of cell texts) when +mutations+
  # is an Array, otherwise only the first row.
  def self.predict_aminoacid_mutation_batch(mutations)
    data = case mutations
           when String
             mutations
           when Array
             mutations.collect{|p| p * ", "} * "\n"
           end

    rows = query_rows(data)

    Array === mutations ? rows : rows.first
  end

  # Splits a genomic mutation such as "12:12345:-1:A/T" into its parts.
  #
  # Returns the five captured groups as Strings: two numeric fields, the
  # strand (1 or -1) and the two alleles.
  # Raises ArgumentError when +mutation+ does not contain that pattern.
  def self.parse_genomic_mutation(mutation)
    match = mutation.to_s.match(/(\d+):(\d+):(1|-1):([A-Z])\/([A-Z])/)
    raise ArgumentError, "Invalid genomic mutation: #{mutation.inspect}" if match.nil?

    match.values_at 1,2,3,4,5
  end

  # Adds a "SIFT:Prediction" field to +tsv+.
  #
  # tsv - a TSV with the fields "Refseq Protein ID" and "Protein Mutation".
  #
  # All substitutions are sent in a single batch query. Substitutions whose
  # first and last characters are equal are not sent and are labelled
  # "TOLERATED"; entries with no usable data or no returned prediction are
  # labelled "No Prediction".
  #
  # Returns +tsv+.
  # Raises RuntimeError when the input is not a TSV or lacks a field.
  def self.add_predictions(tsv)
    raise "Input not TSV" unless TSV === tsv

    raise "Field 'Refseq Protein ID' Not in TSV" unless tsv.fields.include? "Refseq Protein ID"

    raise "Field 'Protein Mutation' Not in TSV" unless tsv.fields.include? "Protein Mutation"

    data = []
    tsv.through :key, ["Refseq Protein ID", "Protein Mutation"] do |key,values|
      refseqs, mutations = values
      next if refseqs.nil? || mutations.nil?

      mutations = mutations.reject{|mutation| mutation[0] == mutation[-1]}
      next if refseqs.compact.reject{|v| v.empty?}.empty? || mutations.empty?

      refseqs.compact.uniq.each do |refseq|
        data << [refseq, mutations]
      end
    end

    data.sort!

    # Index the returned rows by "<cell 0>:<cell 1>", keeping cells 3 to 6.
    predictions = {}
    predict_aminoacid_mutation_batch(data).each do |values|
      # Rows without at least two cells (e.g. rows holding no <td>) cannot
      # be indexed.
      next if values.length < 2
      predictions[values[0] + ":" << values[1]] = values.values_at 3,4,5,6
    end

    refseq_field = tsv.identify_field "Refseq Protein ID"
    protein_field = tsv.identify_field "Protein Mutation"

    tsv.add_field "SIFT:Prediction" do |key,values|
      refseqs = if refseq_field == :key
                  [key]
                else
                  values[refseq_field] || []
                end

      next if refseqs.compact.reject{|v| v.empty?}.empty?

      mutations = values[protein_field] || []

      # NOTE(review): refseqs and mutations are paired positionally here,
      # while the query above sends every mutation for every refseq --
      # confirm this is intended.
      refseqs.zip(mutations).collect do |refseq,mutation|
        case
        when (mutation.nil? or mutation.empty?)
          "No Prediction"
        when mutation[0] == mutation[-1]
          "TOLERATED"
        when (refseq.nil? or refseq.empty?)
          "No Prediction"
        else
          list = predictions[refseq + ":" << mutation]
          list.nil? ? "No Prediction" : list.first
        end
      end
    end

    tsv
  end

  # Internal helper: posts +gi+ as the "GI" field to URL_AMINOACID and
  # returns the text of every <td> of every <tr> in the response, one Array
  # per row, with the first row dropped.
  def self.query_rows(gi)
    doc = Nokogiri::HTML(Open.read(URL_AMINOACID, :wget_options => {"--post-data" => "'GI=#{gi}&sequences_to_select=BEST&seq_identity_filter=90'"}, :nocache => false))

    rows = doc.css('tr').collect do |row|
      row.css('td').collect{|cell| cell.content}
    end

    rows.shift
    rows
  end

end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
require 'rbbt/util/log'
|
2
|
+
require 'rbbt/util/open'
|
3
|
+
# Client for the SNPs&GO web service: builds a GET URL for a single
# substitution and extracts the prediction from the text it returns.
module SNPSandGO
  URL="http://snps-and-go.biocomp.unibo.it/cgi-bin/snps-and-go/runpred.cgi?uniprot=#ACCESSION#&position=#POSITION#&wild-type=#REFERENCE#&substituting=#SUBSTITUTION#"

  # Splits a mutation such as "Y34D" into its parts.
  #
  # mutation - text containing <letter><digits><letter> (case-insensitive).
  #
  # Returns [reference, position, substitution] as Strings.
  # Raises ArgumentError when +mutation+ does not contain that pattern.
  def self.parse_mutation(mutation)
    match = mutation.to_s.match(/([A-Z])(\d+)([A-Z])/i)
    raise ArgumentError, "Invalid mutation: #{mutation.inspect}" if match.nil?

    match.values_at 1,2,3
  end

  # Requests a prediction for one substitution.
  #
  # accession - substituted for #ACCESSION# in URL (the "uniprot" parameter).
  # mutation  - substitution in the format accepted by parse_mutation.
  #
  # Returns the two values captured from the result line that follows the
  # "Position WT NEW Effect RI" header: [effect, ri], both Strings.
  # Raises ArgumentError when +mutation+ cannot be parsed, and RuntimeError
  # when the response has no RESULTS marker or no parsable result line.
  def self.predict(accession, mutation)
    reference, pos, substitution = parse_mutation(mutation)

    url = URL.sub(/#ACCESSION#/,accession).sub(/#POSITION#/, pos).sub(/#REFERENCE#/,reference).sub(/#SUBSTITUTION#/,substitution)

    res = Open.read(url)

    raise "Error in prediction" unless res =~ /RESULTS/

    result = res.match(/Position\s+WT\s+NEW\s+Effect\s+RI\n\s+\d+\s+[A-Z]\s+[A-Z]\s+(\w+)\s+(\d+)/)
    raise "Error in prediction: could not parse results" if result.nil?

    result.values_at 1,2
  end

  # Adds a "SNPs&GO:Prediction" field to +tsv+.
  #
  # tsv - a TSV with the fields "UniProt/SwissProt Accession" and
  #       "Protein Mutation".
  #
  # One request is made per accession/mutation pair. Substitutions whose
  # first and last characters are equal are labelled "Neutral" without a
  # request. Missing or empty values, and any failed request, are labelled
  # "No Prediction".
  #
  # Returns +tsv+.
  # Raises RuntimeError when the input is not a TSV or lacks a field.
  def self.add_predictions(tsv)
    raise "Input not TSV" unless TSV === tsv

    uniprot_field = tsv.identify_field "UniProt/SwissProt Accession"
    raise "Field 'UniProt/SwissProt Accession' Not in TSV" if uniprot_field.nil?

    protein_field = tsv.identify_field "Protein Mutation"
    raise "Field 'Protein Mutation' Not in TSV" if protein_field.nil?

    tsv.add_field "SNPs&GO:Prediction" do |key,values|
      uniprots = if uniprot_field == :key
                   [key]
                 else
                   values[uniprot_field] || []
                 end

      mutations = values[protein_field] || []

      uniprots.zip(mutations).collect{|uniprot,mutation|
        case
        # An empty mutation must be caught here: "" would otherwise satisfy
        # the first-equals-last test below and be reported as "Neutral".
        when (mutation.nil? or mutation.empty?)
          "No Prediction"
        when mutation[0] == mutation[-1]
          "Neutral"
        when (uniprot.nil? or uniprot.empty?)
          "No Prediction"
        else
          begin
            SNPSandGO.predict(uniprot, mutation).first
          rescue
            # Best effort: a failed lookup must not abort the whole table.
            "No Prediction"
          end
        end
      }
    end

    tsv
  end
end
|
data/lib/rbbt/sources/cancer.rb
CHANGED
@@ -1,12 +1,7 @@
|
|
1
1
|
require 'phgx'
|
2
|
-
require 'rbbt/util/data_module'
|
3
2
|
|
4
3
|
module Cancer
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
PKG = PhGx
|
9
|
-
extend DataModule
|
4
|
+
extend Resource
|
5
|
+
relative_to Rbbt, "share/Cancer"
|
6
|
+
@namespace = "Cancer"
|
10
7
|
end
|
11
|
-
|
12
|
-
if __FILE__ == $0 then NCI.all end
|
data/lib/rbbt/sources/kegg.rb
CHANGED