protk 1.1.0.pre → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +19 -17
- data/bin/annotate_ids.rb +1 -1
- data/bin/asapratio.rb +27 -0
- data/bin/file_convert.rb +3 -3
- data/bin/libra.rb +70 -0
- data/bin/msgfplus_search.rb +41 -35
- data/bin/omssa_search.rb +33 -1
- data/bin/peptide_prophet.rb +17 -4
- data/bin/pepxml_to_table.rb +17 -6
- data/bin/protein_prophet.rb +1 -1
- data/bin/tandem_search.rb +49 -5
- data/bin/uniprot_mapper.rb +77 -0
- data/bin/xpress.rb +27 -0
- data/lib/protk/constants.rb +47 -1
- data/lib/protk/convert_util.rb +27 -0
- data/lib/protk/data/apt-get_packages.yaml +4 -1
- data/lib/protk/data/default_config.yml +1 -0
- data/lib/protk/data/make_uniprot_table.rb +29 -0
- data/lib/protk/data/predefined_db.sphuman.yaml +1 -1
- data/lib/protk/data/tandem_params.xml +17 -3
- data/lib/protk/data/uniprot_accessions.loc +96 -0
- data/lib/protk/data/uniprot_accessions_table.txt +97 -0
- data/lib/protk/data/uniprot_input_accessions.loc +95 -0
- data/lib/protk/data/yum_packages.yaml +65 -0
- data/lib/protk/galaxy_stager.rb +18 -5
- data/lib/protk/galaxy_util.rb +39 -2
- data/lib/protk/manage_db_rakefile.rake +43 -30
- data/lib/protk/pepxml.rb +22 -0
- data/lib/protk/protxml.rb +5 -1
- data/lib/protk/setup_rakefile.rake +55 -8
- data/lib/protk/swissprot_database.rb +1 -1
- data/lib/protk/uniprot_mapper.rb +47 -0
- data/lib/protk.rb +1 -0
- metadata +20 -4
data/README.md
CHANGED
@@ -4,11 +4,7 @@
 ***
 ## What is it?
 
-Protk is a wrapper for various proteomics tools.
-
-## Why do we need a wrapper around these tools
-
-The aim of protk is present a consistent interface to numerous proteomics tools that is as uniform as possible. Protk also provides built-in support for managing protein databases.
+Protk is a wrapper for various proteomics tools. It aims to present a consistent interface to a wide variety of tools and provides support for managing protein databases.
 
 ***
 
@@ -16,27 +12,33 @@ The aim of protk is present a consistent interface to numerous proteomics tools
 
 ## Basic Installation
 
-1. Install rvm
-
+Protk depends on ruby 1.9. The recommended way to install ruby and manage ruby gems is with rvm. Install rvm using this command.
+
+curl -L https://get.rvm.io | bash -s stable
+
+Next install ruby and protk's dependencies
 
 On OSX
-
-
-
-
+
+rvm install 1.9.3 --with-gcc=clang
+rvm use 1.9.3
+gem install protk
+protk_setup.rb all
 
 On Linux
-
-
-
-
-
+
+rvm install 1.9.3
+rvm use 1.9.3
+gem install protk
+sudo protk_setup.rb system_dependencies
+protk_setup all
 
 
 ## Sequence databases
 
-After running the setup.sh script you should run manage_db.rb to install specific sequence databases for use by the search engines. Protk comes with several predefined database configurations. For example, to install a database consisting of human entries from Swissprot plus known contaminants use the following
+After running the setup.sh script you should run manage_db.rb to install specific sequence databases for use by the search engines. Protk comes with several predefined database configurations. For example, to install a database consisting of human entries from Swissprot plus known contaminants use the following commands;
 
+manage_db.rb add crap
 manage_db.rb add sphuman
 
 You should now be able to run database searches, specifying this database by using the -d sphuman flag. Every month or so swissprot will release a new database version. You can keep your database up to date using;
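The updated README points users at the -d flag for selecting an installed database. As a hedged illustration only (the spectrum file name below is hypothetical, and each search wrapper documents its full option set via --help), a search against the sphuman database installed above would look roughly like:

    tandem_search.rb -d sphuman sample.mzML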
data/bin/annotate_ids.rb
CHANGED
@@ -1,6 +1,6 @@
 #!/usr/bin/env ruby
 #
-# This file is part of
+# This file is part of Protk
 # Created by Ira Cooke 21/7/2011
 #
 # Takes an input file with a list of identified proteins and creates a table with swissprot/uniprot database details in various columns for each protein in the input file.
data/bin/asapratio.rb
ADDED
@@ -0,0 +1,27 @@
+#!/usr/bin/env ruby
+#
+# Created by John Chilton
+#
+# Run ASAPRatio against protein prophet results.
+#
+#
+
+require 'protk/constants'
+require 'protk/protxml'
+require 'protk/galaxy_util'
+
+for_galaxy = GalaxyUtil.for_galaxy?
+
+protxml_path = ARGV.shift
+
+if for_galaxy
+  protxml_path = GalaxyUtil.stage_protxml(protxml_path)
+end
+
+protxml = ProtXML.new(protxml_path)
+pepxml_path = protxml.find_pep_xml()
+
+genv=Constants.new
+
+command="#{genv.asapratiopeptideparser} '#{pepxml_path}' #{ARGV.join(" ")} ; #{genv.asapratioproteinparser} '#{protxml_path}'; #{genv.asaprationpvalueparser} '#{protxml_path}' "
+%x[#{command}]
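For orientation, the new asapratio.rb wrapper takes a protXML file as its first argument, locates the matching pepXML via ProtXML#find_pep_xml, and shells out to the three TPP ASAPRatio parsers. A minimal hedged usage sketch (the file name is hypothetical; any extra arguments are passed straight through to ASAPRatioPeptideParser):

    asapratio.rb interact.prot.xml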
data/bin/file_convert.rb
CHANGED
@@ -126,14 +126,14 @@ basedir=Pathname.new(filename).dirname.to_s #Where we run the tool
 
 if ( convert_tool.maldi )
   #For MALDI we know the charge is 1 so set it explicitly. Sometimes it is missing from the data
-  runner.run_local("cd #{basedir}; #{genv.
+  runner.run_local("cd #{basedir}; #{genv.msconvert} #{input_relative_filename} --filter \"titleMaker <RunId>.<ScanNumber>.<ScanNumber>.1\" --#{convert_tool.output_format} -o #{output_dir}")
 else
   if ( has_charge_information(filename) )
-    runner.run_local("cd #{basedir}; #{genv.
+    runner.run_local("cd #{basedir}; #{genv.msconvert} #{input_relative_filename} --filter \"titleMaker <RunId>.<ScanNumber>.<ScanNumber>.<ChargeState>\" --#{convert_tool.output_format} -o #{output_dir}")
   else
     # If input file is missing charges the best we can do is just assign charge=1. Search engines can choose to ignore this value anyway.
     #
-    runner.run_local("cd #{basedir}; #{genv.
+    runner.run_local("cd #{basedir}; #{genv.msconvert} #{input_relative_filename} --filter \"titleMaker <RunId>.<ScanNumber>.<ScanNumber>.1\" --#{convert_tool.output_format} -o #{output_dir}")
   end
 end
 
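The rewritten calls above delegate to msconvert with a titleMaker filter so that every spectrum title carries run id, scan number and charge. After interpolation the command resolves to an ordinary msconvert invocation of roughly this shape; this is a hedged sketch only, and the input file, output directory and mgf output format shown here are illustrative rather than taken from the diff:

    msconvert sample.raw --filter "titleMaker <RunId>.<ScanNumber>.<ScanNumber>.<ChargeState>" --mgf -o ./converted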
data/bin/libra.rb
ADDED
@@ -0,0 +1,70 @@
+#!/usr/bin/env ruby
+#
+# Created by John Chilton
+#
+# Run libra quantification against protein prophet results.
+#
+#
+
+require 'protk/constants'
+require 'protk/protxml'
+require 'protk/galaxy_util'
+require 'optparse'
+
+for_galaxy = GalaxyUtil.for_galaxy?
+
+protxml_path = ARGV.shift
+
+if for_galaxy
+  protxml_path = GalaxyUtil.stage_protxml(protxml_path)
+end
+
+protxml = ProtXML.new(protxml_path)
+pepxml_path = protxml.find_pep_xml()
+
+genv=Constants.new
+
+option_parser=OptionParser.new()
+
+reagents = []
+mass_tolerance = "0.2"
+option_parser.on( '--mass-tolerance TOL',"Specifies the mass tolerance (window libra will search for the most intense m/z value in)." ) do |tol|
+  mass_tolerance = tol
+end
+
+option_parser.on( '--reagent MZ', "Specify a reagent (via m/z values).") do |reagent|
+  reagents << reagent
+end
+
+minimum_threshold_string = ""
+option_parser.on( '--minimum-threshold THRESH', "Minimum threshhold intensity (not required).") do |thresh|
+  minimum_threshold_string = "<minimumThreshhold value=\"#{thresh}\"/>"
+end
+
+option_parser.parse!
+
+
+reagent_strings = reagents.map do |reagent|
+  "<reagent mz=\"#{reagent}\" />"
+end
+reagents_string = reagent_strings.join(" ")
+
+isotopic_contributions = ""
+
+condition_contents = "<SUMmOnCondition description=\"libra_galaxy_run\">
+<fragmentMasses>
+#{reagents_string}
+</fragmentMasses>
+#{isotopic_contributions}
+<massTolerance value=\"#{mass_tolerance}\"/>
+<centroiding type=\"2\" iterations=\"1\"/>
+<normalization type=\"4\"/>
+<targetMs level=\"2\"/>
+<output type=\"1\"/>
+<quantitationFile name=\"quantitation.tsv\"/>
+#{minimum_threshold_string}
+</SUMmOnCondition>"
+File.open("condition.xml", "w") { |f| f.write(condition_contents) }
+print condition_contents
+command="#{genv.librapeptideparser} '#{pepxml_path}' -ccondition.xml; #{genv.libraproteinratioparser} '#{protxml_path}' -c#{condition_file}"
+%x[#{command}]
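A hedged sketch of how the new libra.rb wrapper might be invoked for a 4-plex run; the protXML name and reagent masses below are illustrative. Each --reagent flag contributes one <reagent> element to the generated condition.xml, and --mass-tolerance fills the <massTolerance> value:

    libra.rb interact.prot.xml --mass-tolerance 0.2 --reagent 114.1 --reagent 115.1 --reagent 116.1 --reagent 117.1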
data/bin/msgfplus_search.rb
CHANGED
@@ -9,11 +9,15 @@ $VERBOSE=nil
 require 'protk/constants'
 require 'protk/command_runner'
 require 'protk/search_tool'
+require 'protk/galaxy_stager'
+require 'protk/galaxy_util'
 
+for_galaxy = GalaxyUtil.for_galaxy
+input_stager = nil
 
 # Setup specific command-line options for this tool. Other options are inherited from SearchTool
 #
-search_tool=SearchTool.new({:msms_search=>true,:background=>false,:glyco=>
+search_tool=SearchTool.new({:msms_search=>true,:background=>false,:glyco=>false,:database=>true,:explicit_output=>true,:over_write=>true,:msms_search_detailed_options=>true})
 search_tool.option_parser.banner = "Run an MSGFPlus msms search on a set of msms spectrum input files.\n\nUsage: msgfplus_search.rb [options] file1.mzML file2.mzML ..."
 search_tool.options.output_suffix="_msgfplus"
 
@@ -92,16 +96,26 @@ ARGV.each do |filename|
 if ( search_tool.explicit_output!=nil)
   output_path=search_tool.explicit_output
 else
-  output_path="#{search_tool.output_base_path(filename.chomp)}.
+  output_path="#{search_tool.output_base_path(filename.chomp)}.pepXML"
 end
-
+
+
 # (*.mzML, *.mzXML, *.mgf, *.ms2, *.pkl or *_dta.txt)
 # Get the input file extension
 ext = Pathname.new(filename).extname
+input_path="#{search_tool.input_base_path(filename.chomp)}#{ext}"
 
+mzid_output_path="#{search_tool.input_base_path(filename.chomp)}.mzid"
+
+
+if for_galaxy
+  original_input_file = input_path
+  original_input_path = Pathname.new("#{original_input_file}")
+  input_stager = GalaxyStager.new("#{original_input_file}", :extension => '.mzML')
+  input_path = input_stager.staged_path
+end
 
 
-input_path="#{search_tool.input_base_path(filename.chomp)}#{ext}"
 
 # Only proceed if the output file is not present or we have opted to over-write it
 #
@@ -109,11 +123,10 @@ ARGV.each do |filename|
 
 # The basic command
 #
-cmd= "java -Xmx#{search_tool.java_mem} -jar #{msgf_bin} -d #{current_db} -s #{input_path} -o #{
-
+cmd= "java -Xmx#{search_tool.java_mem} -jar #{msgf_bin} -d #{current_db} -s #{input_path} -o #{mzid_output_path} "
 #Missed cleavages
 #
-throw "Maximum value for missed cleavages is 2" if ( search_tool.missed_cleavages > 2)
+throw "Maximum value for missed cleavages is 2" if ( search_tool.missed_cleavages.to_i > 2)
 cmd << " -ntt #{search_tool.missed_cleavages}"
 
 # Precursor tolerance
@@ -121,8 +134,7 @@ ARGV.each do |filename|
 cmd << " -t #{search_tool.precursor_tol}#{search_tool.precursor_tolu}"
 
 # Instrument type
-#
-cmd << " -inst 2"
+cmd << " -inst #{search_tool.instrument}"
 
 # cmd << " -m 4"
 
@@ -134,55 +146,49 @@ ARGV.each do |filename|
 # cmd << " -e #{search_tool.enzyme}"
 # end
 
-
-mods_file=File.open(mods_path,'w+')
+mods_file_content = ""
 
 # Variable Modifications
 #
 if ( search_tool.var_mods !="" && !search_tool.var_mods =~/None/) # Checking for none is to cope with galaxy input
-  var_mods = search_tool.var_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject {|e| e.empty? }.join("
+  var_mods = search_tool.var_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject {|e| e.empty? }.join("\n")
   if ( var_mods !="" )
-
-  end
-else
-  # Add options related to peptide modifications
-  #
-  if ( search_tool.glyco )
-    cmd << " -mv 119 "
+    mods_file_content << "#{var_mods}\n"
   end
 end
 
 # Fixed modifications
 #
 if ( search_tool.fix_mods !="" && !search_tool.fix_mods=~/None/)
-  fix_mods = search_tool.fix_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject { |e| e.empty? }.join("
+  fix_mods = search_tool.fix_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject { |e| e.empty? }.join("\n")
   if ( fix_mods !="")
-
+    mods_file_content << "#{fix_mods}"
   end
-
-  if ( search_tool.has_modifications )
-    cmd << " -mf "
-    if ( search_tool.carbamidomethyl )
-      cmd<<"3 "
-    end
-
-    if ( search_tool.methionine_oxidation )
-      cmd<<"1 "
-    end
+end
 
-
+if ( mods_file_content != "")
+  mods_path="#{search_tool.input_base_path(filename.chomp)}.msgfplus_mods.txt"
+  mods_file=File.open(mods_path,'w+')
+  mods_file.write "NumMods=2\n#{mods_file_content}"
+  mods_file.close
+  cmd << " -mod #{mods_path}"
 end
 
-#
+# As a final part of the command we convert to pepxml
+cmd << "; #{genv.idconvert} #{mzid_output_path} --pepXML -o #{Pathname.new(mzid_output_path).dirname}"
+
+#Then copy the pepxml to the final output path
+cmd << "; cp #{mzid_output_path.chomp('.mzid')}.pepXML #{output_path}"
+
+# Up to here we've formulated the command. The rest is cleanup
 p "Running:#{cmd}"
 
 # Run the search
 #
 job_params= {:jobid => search_tool.jobid_from_filename(filename) }
-job_params[:queue]="lowmem"
-job_params[:vmem]="900mb"
 search_tool.run(cmd,genv,job_params)
 
+input_stager.restore_references(output_path)
 
 else
 genv.log("Skipping search on existing file #{output_path}",:warn)
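Taken together, the msgfplus_search.rb changes make the wrapper emit one shell pipeline per input file: an MSGF+ search writing mzIdentML, an idconvert step to pepXML, and a copy to the requested output path. A hedged sketch of the expanded pipeline; the jar location, memory size, database and file names are illustrative, and additional flags such as the precursor tolerance are omitted:

    java -Xmx4G -jar MSGFPlus.jar -d sphuman.fasta -s sample.mzML -o sample.mzid -ntt 2 ; idconvert sample.mzid --pepXML -o . ; cp sample.pepXML sample_msgfplus.pepXML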
data/bin/omssa_search.rb
CHANGED
@@ -10,7 +10,9 @@ $VERBOSE=nil
 require 'protk/constants'
 require 'protk/command_runner'
 require 'protk/search_tool'
+require 'protk/galaxy_util'
 
+for_galaxy = GalaxyUtil.for_galaxy?
 
 # Setup specific command-line options for this tool. Other options are inherited from SearchTool
 #
@@ -33,6 +35,10 @@ search_tool.option_parser.on( '--intensity-cut-off co', 'Peak intensity cut-off
   search_tool.options.intensity_cut_off=co
 end
 
+search_tool.options.galaxy_index_dir=nil
+search_tool.option_parser.on( '--galaxy-index-dir dir', 'Specify galaxy index directory, will search for mods file there.' ) do |dir|
+  search_tool.options.galaxy_index_dir=dir
+end
 
 search_tool.option_parser.parse!
 
@@ -45,9 +51,14 @@ genv=Constants.new
 rt_correct_bin="#{File.dirname(__FILE__)}/correct_omssa_retention_times.rb"
 repair_script_bin="#{File.dirname(__FILE__)}/repair_run_summary.rb"
 
+make_blastdb_cmd=""
+
 case
 when Pathname.new(search_tool.database).exist? # It's an explicitly named db
   current_db=Pathname.new(search_tool.database).realpath.to_s
+  if(not FileTest.exists?("#{current_db}.phr"))
+    make_blastdb_cmd << "#{@genv.makeblastdb} -dbtype prot -parse_seqids -in #{current_db}; "
+  end
 else
   current_db=search_tool.current_database :fasta
 end
@@ -85,12 +96,29 @@ ARGV.each do |filename|
 
 # The basic command
 #
-cmd= "#{genv.omssacl} -d #{current_db} -fm #{input_path} -op #{output_path} -w"
+cmd = "#{make_blastdb_cmd} #{genv.omssacl} -d #{current_db} -fm #{input_path} -op #{output_path} -w"
 
 #Missed cleavages
 #
 cmd << " -v #{search_tool.missed_cleavages}"
 
+# If this is for Galaxy and a data directory has been specified
+# look for a common unimod.xml file.
+if for_galaxy
+  galaxy_index_dir = search_tool.galaxy_index_dir
+  if galaxy_index_dir
+    galaxy_mods = File.join(galaxy_index_dir, "mods.xml")
+    if( FileTest.exists?(galaxy_mods) )
+      cmd << " -mx #{galaxy_mods}"
+    end
+    galaxy_usermods = File.join(galaxy_index_dir, "usermods.xml")
+    if( FileTest.exists?(galaxy_usermods) )
+      cmd << " -mux #{galaxy_usermods}"
+    end
+  end
+end
+
+
 # Precursor tolerance
 #
 if ( search_tool.precursor_tolu=="ppm")
@@ -202,4 +230,8 @@ ARGV.each do |filename|
 genv.log("Skipping search on existing file #{output_path}",:warn)
 end
 
+# Reset this. We only want to index the database at most once
+#
+make_blastdb_cmd=""
+
 end
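When an explicitly named FASTA database has no .phr index, the updated omssa_search.rb now prepends a one-off indexing step to the first search command. Expanded, that prefix is roughly the following; the database path is illustrative, while the flags are exactly those built in the diff above:

    makeblastdb -dbtype prot -parse_seqids -in /path/to/mydb.fasta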
data/bin/peptide_prophet.rb
CHANGED
@@ -82,6 +82,15 @@ prophet_tool.option_parser.on( '-F', '--one-ata-time', 'Create a separate pproph
   prophet_tool.options.one_ata_time = true
 end
 
+prophet_tool.options.decoy_prefix="decoy"
+prophet_tool.option_parser.on( '--decoy-prefix prefix', 'Prefix for decoy sequences') do |prefix|
+  prophet_tool.options.decoy_prefix = prefix
+end
+
+prophet_tool.options.override_database=nil
+prophet_tool.option_parser.on( '--override-database database', 'Manually specify database') do |database|
+  prophet_tool.options.override_database = database
+end
 
 prophet_tool.option_parser.parse!
 
@@ -99,7 +108,11 @@ ARGV.each {|file_name|
 name=file_name.chomp
 
 engine=prophet_tool.extract_engine(name)
-
+if prophet_tool.override_database
+  db_path = prophet_tool.override_database
+else
+  db_path=prophet_tool.extract_db(name)
+end
 
 
 file_info[name]={:engine=>engine , :database=>db_path }
@@ -130,7 +143,7 @@ end
 
 def generate_command(genv,prophet_tool,inputs,output,database,engine)
 
-  cmd="#{genv.xinteract} -N#{output} -l7 -eT -D#{database} "
+  cmd="#{genv.xinteract} -N#{output} -l7 -eT -D'#{database}' "
 
   if prophet_tool.glyco
     cmd << " -Og "
@@ -189,9 +202,9 @@ def generate_command(genv,prophet_tool,inputs,output,database,engine)
   end
 
   if engine=="omssa" || engine=="phenyx"
-    cmd << "-Op -P -
+    cmd << " -Op -P -d#{prophet_tool.decoy_prefix} "
   else
-    cmd << "-
+    cmd << " -d#{prophet_tool.decoy_prefix} "
   end
 
 
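A hedged example of the two new peptide_prophet.rb options in use (file and database names here are hypothetical): --decoy-prefix is forwarded to xinteract as its -d argument, and --override-database bypasses the database path extracted from the input pepXML:

    peptide_prophet.rb --decoy-prefix decoy_ --override-database /path/to/sphuman.fasta sample_tandem.pep.xml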
data/bin/pepxml_to_table.rb
CHANGED
@@ -34,10 +34,20 @@ output_fh=File.new("#{output_file}",'w')
 
 output_fh.write "protein\tpeptide\tassumed_charge\tcalc_neutral_pep_mass\tneutral_mass\tretention_time\tstart_scan\tend_scan\tsearch_engine\tpeptideprophet_prob\tinterprophet_prob\n"
 
+XML::Error.set_handler(&XML::Error::QUIET_HANDLER)
+
 pepxml_parser=XML::Parser.file("#{input_file}")
-pepxml_doc=pepxml_parser.parse
 
-
+pepxml_ns_prefix="xmlns:"
+pepxml_ns="xmlns:http://regis-web.systemsbiology.net/pepXML"
+pepxml_doc=pepxml_parser.parse
+if not pepxml_doc.root.namespaces.default
+  pepxml_ns_prefix=""
+  pepxml_ns=nil
+end
+
+
+spectrum_queries=pepxml_doc.find("//#{pepxml_ns_prefix}spectrum_query", pepxml_ns)
 
 spectrum_queries.each do |query|
 
@@ -45,7 +55,7 @@ spectrum_queries.each do |query|
 neutral_mass=query.attributes['precursor_neutral_mass']
 assumed_charge=query.attributes['assumed_charge']
 
-top_search_hit=query.find(
+top_search_hit=query.find("./#{pepxml_ns_prefix}search_result/#{pepxml_ns_prefix}search_hit",pepxml_ns)[0]
 peptide=top_search_hit.attributes['peptide']
 protein=top_search_hit.attributes['protein']
 calc_neutral_pep_mass=top_search_hit.attributes['calc_neutral_pep_mass']
@@ -53,7 +63,7 @@ spectrum_queries.each do |query|
 end_scan=query.attributes['end_scan']
 
 search_engine=""
-search_score_names=top_search_hit.find(
+search_score_names=top_search_hit.find("./#{pepxml_ns_prefix}search_score/@name",pepxml_ns).collect {|s| s.to_s}
 
 if ( search_score_names.length==2 && search_score_names.grep(/^name.*=.*pvalue/))
   search_engine="omssa"
@@ -63,9 +73,10 @@ spectrum_queries.each do |query|
   search_engine="x!tandem"
 end
 
-pp_result=top_search_hit.find('./xmlns:analysis_result/xmlns:peptideprophet_result/@probability','xmlns:http://regis-web.systemsbiology.net/pepXML')
-ip_result=top_search_hit.find('./xmlns:analysis_result/xmlns:interprophet_result/@probability','xmlns:http://regis-web.systemsbiology.net/pepXML')
 
+pp_result=top_search_hit.find("./#{pepxml_ns_prefix}analysis_result/#{pepxml_ns_prefix}peptideprophet_result/@probability",pepxml_ns)
+ip_result=top_search_hit.find("./#{pepxml_ns_prefix}analysis_result/#{pepxml_ns_prefix}interprophet_result/@probability",pepxml_ns)
+
 peptide_prophet_prob=""
 interprophet_prob=""
 peptide_prophet_prob=pp_result[0].value if ( pp_result.length>0 )
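The namespace handling added above follows one pattern: prefix the XPath and pass the pepXML namespace only when the document actually declares a default namespace. A minimal, self-contained sketch of the same idea; it reuses the libxml-ruby calls that appear in the diff, while the require line and the input file name are assumptions of this sketch:

    require 'xml'

    doc = XML::Parser.file("sample.pep.xml").parse
    prefix = "xmlns:"
    ns = "xmlns:http://regis-web.systemsbiology.net/pepXML"
    if not doc.root.namespaces.default
      prefix = ""
      ns = nil
    end
    # Count spectrum_query elements whether or not the file declares its namespace
    puts doc.find("//#{prefix}spectrum_query", ns).length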
data/bin/protein_prophet.rb
CHANGED
data/bin/tandem_search.rb
CHANGED
@@ -41,6 +41,47 @@ search_tool.option_parser.on( '-K', '--keep-params-files', 'Keep X!Tandem parame
   search_tool.options.keep_params_files = true
 end
 
+# In case want pepXML, but still want tandem output also.
+search_tool.options.tandem_output=nil
+search_tool.option_parser.on( '--tandem-output tandem_output', 'Keep X! Tandem Output') do |tandem_output|
+  search_tool.options.tandem_output=tandem_output
+end
+
+search_tool.options.thresholds_type = 'isb_kscore'
+search_tool.option_parser.on( '--thresholds-type thresholds_type', 'Threshold Type (tandem_default, isb_native, isb_kscore, scaffold)' ) do |thresholds_type|
+  search_tool.options.thresholds_type = thresholds_type
+end
+
+search_tool.options.algorithm = "kscore"
+search_tool.option_parser.on( '--algorithm algorithm', "Scoring algorithm (kscore or native)" ) do |algorithm|
+  search_tool.options.algorithm = algorithm
+end
+
+search_tool.options.cleavage_semi = false
+search_tool.option_parser.on( '--cleavage-semi' ) do
+  search_tool.options.cleavage_semi = true
+end
+
+search_tool.options.n_terminal_mod_mass=nil
+search_tool.option_parser.on('--n-terminal-mod-mass mass') do |mass|
+  search_tool.options.n_terminal_mod_mass = mass
+end
+
+search_tool.options.c_terminal_mod_mass=nil
+search_tool.option_parser.on('--c-terminal-mod-mass mass') do |mass|
+  search_tool.options.c_terminal_mod_mass = mass
+end
+
+search_tool.options.cleavage_n_terminal_mod_mass=nil
+search_tool.option_parser.on('--cleavage-n-terminal-mod-mass mass') do |mass|
+  search_tool.options.cleavage_n_terminal_mod_mass = mass
+end
+
+search_tool.options.cleavage_c_terminal_mod_mass=nil
+search_tool.option_parser.on('--cleavage-c-terminal-mod-mass mass') do |mass|
+  search_tool.options.cleavage_c_terminal_mod_mass = mass
+end
+
 search_tool.option_parser.parse!
 
 
@@ -60,8 +101,6 @@ else
 end
 
 
-
-
 # Parse options from a parameter file (if provided), or from the default parameter file
 #
 params_parser=XML::Parser.file(tandem_params)
@@ -100,7 +139,7 @@ def generate_parameter_doc(std_params,output_path,input_path,taxo_path,current_d
 #
 scoring_notes=std_params.find('/bioml/note[@type="input" and @label="list path, default parameters"]')
 throw "Exactly one list path, default parameters note is required in the parameter file" unless scoring_notes.length==1
-scoring_notes[0].content="#{genv.tpp_root}/bin/
+scoring_notes[0].content="#{genv.tpp_root}/bin/isb_default_input_#{search_tool.algorithm}.xml"
 
 # Taxonomy and Database
 #
@@ -264,8 +303,13 @@ ARGV.each do |filename|
 # pepXML conversion and repair
 #
 unless search_tool.no_pepxml
-  repair_script="#{File.dirname(__FILE__)}/repair_run_summary.rb"
-  cmd << "; #{genv.tandem2xml} #{output_path} #{pepxml_path}; #{repair_script} #{pepxml_path}
+  repair_script="#{File.dirname(__FILE__)}/repair_run_summary.rb"
+  cmd << "; #{genv.tandem2xml} #{output_path} #{pepxml_path}; #{repair_script} #{pepxml_path}"
+  if search_tool.tandem_output
+    cmd << "; cp #{output_path} #{search_tool.tandem_output}"
+  else
+    cmd << "; rm #{output_path}"
+  end
 end
 
 # Add a cleanup command unless the user wants to keep params files