protk 1.1.0.pre
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +85 -0
- data/bin/annotate_ids.rb +59 -0
- data/bin/big_search.rb +41 -0
- data/bin/correct_omssa_retention_times.rb +27 -0
- data/bin/feature_finder.rb +76 -0
- data/bin/file_convert.rb +157 -0
- data/bin/generate_omssa_loc.rb +42 -0
- data/bin/interprophet.rb +91 -0
- data/bin/make_decoy.rb +64 -0
- data/bin/manage_db.rb +123 -0
- data/bin/mascot_search.rb +187 -0
- data/bin/mascot_to_pepxml.rb +44 -0
- data/bin/msgfplus_search.rb +191 -0
- data/bin/omssa_search.rb +205 -0
- data/bin/peptide_prophet.rb +245 -0
- data/bin/pepxml_to_table.rb +78 -0
- data/bin/protein_prophet.rb +140 -0
- data/bin/protk_setup.rb +31 -0
- data/bin/repair_run_summary.rb +113 -0
- data/bin/tandem_search.rb +292 -0
- data/bin/template_search.rb +144 -0
- data/bin/unimod_to_loc.rb +118 -0
- data/bin/xls_to_table.rb +46 -0
- data/ext/protk/extconf.rb +3 -0
- data/ext/protk/protk.c +235 -0
- data/lib/protk/big_search_rakefile.rake +16 -0
- data/lib/protk/big_search_tool.rb +23 -0
- data/lib/protk/bio_sptr_extensions.rb +210 -0
- data/lib/protk/biotools_excel_converter.rb +60 -0
- data/lib/protk/command_runner.rb +84 -0
- data/lib/protk/constants.rb +296 -0
- data/lib/protk/data/FeatureFinderCentroided.ini +63 -0
- data/lib/protk/data/apt-get_packages.yaml +47 -0
- data/lib/protk/data/brew_packages.yaml +10 -0
- data/lib/protk/data/default_config.yml +20 -0
- data/lib/protk/data/predefined_db.crap.yaml +19 -0
- data/lib/protk/data/predefined_db.sphuman.yaml +25 -0
- data/lib/protk/data/predefined_db.swissprot_annotation.yaml +20 -0
- data/lib/protk/data/predefined_db.swissprot_fasta_annotation.yaml +20 -0
- data/lib/protk/data/tandem_params.xml +56 -0
- data/lib/protk/data/taxonomy_template.xml +9 -0
- data/lib/protk/data/unimod.xml +16780 -0
- data/lib/protk/eupathdb_gene_information_table.rb +158 -0
- data/lib/protk/galaxy_stager.rb +24 -0
- data/lib/protk/galaxy_util.rb +9 -0
- data/lib/protk/manage_db_rakefile.rake +484 -0
- data/lib/protk/manage_db_tool.rb +181 -0
- data/lib/protk/mascot_util.rb +63 -0
- data/lib/protk/omssa_util.rb +57 -0
- data/lib/protk/plasmodb.rb +50 -0
- data/lib/protk/prophet_tool.rb +85 -0
- data/lib/protk/protein_annotator.rb +646 -0
- data/lib/protk/protxml.rb +137 -0
- data/lib/protk/randomize.rb +7 -0
- data/lib/protk/search_tool.rb +182 -0
- data/lib/protk/setup_rakefile.rake +245 -0
- data/lib/protk/setup_tool.rb +19 -0
- data/lib/protk/spreadsheet_extensions.rb +78 -0
- data/lib/protk/swissprot_database.rb +38 -0
- data/lib/protk/tool.rb +182 -0
- data/lib/protk/xtandem_defaults.rb +11 -0
- data/lib/protk.rb +18 -0
- metadata +256 -0
data/bin/omssa_search.rb
ADDED
@@ -0,0 +1,205 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file is part of protk
|
4
|
+
# Created by Ira Cooke 14/12/2010
|
5
|
+
#
|
6
|
+
# Runs an MS/MS search using the OMSSA search engine
|
7
|
+
#
|
8
|
+
$VERBOSE=nil
|
9
|
+
|
10
|
+
require 'protk/constants'
|
11
|
+
require 'protk/command_runner'
|
12
|
+
require 'protk/search_tool'
|
13
|
+
|
14
|
+
|
15
|
+
# Setup specific command-line options for this tool. Other options are inherited from SearchTool
|
16
|
+
#
|
17
|
+
# Build the tool object. Generic search options are inherited from SearchTool;
# the OMSSA specific defaults and switches are registered below.
tool_features = {
  :msms_search => true,
  :background => false,
  :glyco => true,
  :database => true,
  :explicit_output => true,
  :over_write => true,
  :msms_search_detailed_options => true
}
search_tool = SearchTool.new(tool_features)
search_tool.option_parser.banner = "Run an OMSSA msms search on a set of mgf input files.\n\nUsage: omssa_search.rb [options] file1.mgf file2.mgf ..."

# Defaults for the OMSSA specific options
search_tool.options.output_suffix = "_omssa"
search_tool.options.add_retention_times = true
search_tool.options.max_hit_expect = 1
search_tool.options.intensity_cut_off = 0.0005

# Switches that override those defaults
search_tool.option_parser.on('-R', '--no-add-retention-times', "Don't post process the output to add retention times") {
  search_tool.options.add_retention_times = false
}
search_tool.option_parser.on('--max-hit-expect exp', 'Expect values less than this are considered to be hits') { |exp|
  search_tool.options.max_hit_expect = exp
}
search_tool.option_parser.on('--intensity-cut-off co', 'Peak intensity cut-off as a fraction of maximum peak intensity') { |co|
  search_tool.options.intensity_cut_off = co
}

search_tool.option_parser.parse!
|
38
|
+
|
39
|
+
# Environment with global constants
|
40
|
+
#
|
41
|
+
# Environment with global constants
genv = Constants.new

# Post-processing scripts that are chained after the search; they are expected
# to sit in the same directory as this script.
script_dir = File.dirname(__FILE__)
rt_correct_bin = "#{script_dir}/correct_omssa_retention_times.rb"
repair_script_bin = "#{script_dir}/repair_run_summary.rb"

# Resolve the database. If the given value is an existing path it is an
# explicitly named db and its real path is used; otherwise SearchTool is asked
# for the current database in fasta form.
named_db = Pathname.new(search_tool.database)
if named_db.exist?
  current_db = named_db.realpath.to_s
else
  current_db = search_tool.current_database :fasta
end

fragment_tol = search_tool.fragment_tol
precursor_tol = search_tool.precursor_tol

# A single explicit output path cannot receive the results of several inputs.
# This is an error condition, so raise (throw is for catch/throw control flow).
if ARGV.length > 1 && !search_tool.explicit_output.nil?
  raise "When --output is set only one file at a time can be run"
end
|
60
|
+
|
61
|
+
# Run the search engine on each input file
|
62
|
+
#
|
63
|
+
# Normalises a comma separated option value into a clean comma separated list.
#
# Returns nil when nothing was given: the value is nil, empty, or contains
# "None" (checking for None is to cope with galaxy input). Otherwise returns
# the items stripped of surrounding whitespace, with empty items removed,
# joined by commas. The result may be "" if every item was blank.
def omssa_option_list(raw)
  text = raw.to_s
  return nil if text.empty? || text =~ /None/
  text.split(",").collect { |item| item.strip }.reject { |item| item.empty? }.join(",")
end

# Run the search engine on each input file
#
ARGV.each do |filename|

  input_name = filename.chomp

  output_path = search_tool.explicit_output
  output_path = "#{search_tool.output_base_path(input_name)}.pep.xml" if output_path.nil?

  # We always perform searches on mgf files, except that a file provided by
  # galaxy keeps its .dat extension.
  input_ext = Pathname.new(filename).extname == ".dat" ? ".dat" : ".mgf"
  input_path = "#{search_tool.input_base_path(input_name)}#{input_ext}"

  # Only proceed if the output file is not present or we have opted to over-write it
  #
  if !search_tool.over_write && Pathname.new(output_path).exist?
    genv.log("Skipping search on existing file #{output_path}",:warn)
    next
  end

  # The basic command
  #
  cmd = "#{genv.omssacl} -d #{current_db} -fm #{input_path} -op #{output_path} -w"

  # Missed cleavages
  cmd << " -v #{search_tool.missed_cleavages}"

  # Precursor tolerance
  cmd << " -teppm" if search_tool.precursor_tolu == "ppm"
  cmd << " -te #{search_tool.precursor_tol}"

  # Fragment ion tolerance
  cmd << " -to #{fragment_tol}" #Always in Da

  # Set the search type (monoisotopic vs average masses) and whether to use strict monoisotopic masses
  #
  if search_tool.precursor_search_type == "monoisotopic"
    if search_tool.strict_monoisotopic_mass
      cmd << " -tem 0"
    else
      cmd << " -tem 4 -ti #{search_tool.num_peaks_for_multi_isotope_search}"
    end
  else
    cmd << " -tem 1"
  end

  # Enzyme
  cmd << " -e #{search_tool.enzyme}" if search_tool.enzyme != "Trypsin"

  # Variable modifications. An explicit list wins; without one the glyco
  # option selects modification 119.
  # The previous test `!x =~ /None/` parsed as `(!x) =~ /None/` and was never
  # true, so an explicit list was silently ignored.
  var_mods = omssa_option_list(search_tool.var_mods)
  if var_mods.nil?
    cmd << " -mv 119 " if search_tool.glyco
  elsif !var_mods.empty?
    cmd << " -mv #{var_mods}"
  end

  # Fixed modifications. An explicit list wins; without one the individual
  # modification options are translated to their ids and passed as a single
  # comma delimited argument (previously "-mf 3 1 ", or a bare "-mf" when
  # neither was set, which would swallow the following switch).
  fix_mods = omssa_option_list(search_tool.fix_mods)
  if fix_mods.nil?
    if search_tool.has_modifications
      default_fixed = []
      default_fixed << "3" if search_tool.carbamidomethyl
      default_fixed << "1" if search_tool.methionine_oxidation
      cmd << " -mf #{default_fixed.join(",")}" unless default_fixed.empty?
    end
  elsif !fix_mods.empty?
    cmd << " -mf #{fix_mods}"
  end

  # Ion series to search
  searched_ions = omssa_option_list(search_tool.searched_ions)
  cmd << " -i #{searched_ions}" unless searched_ions.nil? || searched_ions.empty?

  # Infer precursor charges or respect charges in input file
  #
  cmd << " -zcc 1" if search_tool.respect_precursor_charges

  # Max expect
  cmd << " -he #{search_tool.max_hit_expect}"

  # Intensity cut-off
  cmd << " -ci #{search_tool.intensity_cut_off}"

  # Up to here we've formulated the omssa command. The rest is cleanup
  p "Running:#{cmd}"

  # Add retention time corrections
  #
  if search_tool.options.add_retention_times
    cmd << "; #{rt_correct_bin} #{output_path} #{input_path} "
  end

  # Correct the pepXML file
  #
  cmd << "; #{repair_script_bin} -N #{input_path} -R mgf #{output_path} --omssa-itol #{search_tool.fragment_tol}"
  genv.log("Running repair script command #{cmd}",:info)

  # Run the search
  #
  job_params = {
    :jobid => search_tool.jobid_from_filename(filename),
    :queue => "lowmem",
    :vmem => "900mb"
  }
  search_tool.run(cmd,genv,job_params)

end
|
@@ -0,0 +1,245 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file is part of protk
|
4
|
+
# Created by Ira Cooke 18/1/2011
|
5
|
+
#
|
6
|
+
# A wrapper for PeptideProphet
|
7
|
+
#
|
8
|
+
#
|
9
|
+
|
10
|
+
require 'protk/constants'
|
11
|
+
require 'protk/command_runner'
|
12
|
+
require 'protk/prophet_tool'
|
13
|
+
|
14
|
+
# Setup specific command-line options for this tool. Other options are inherited from ProphetTool
|
15
|
+
#
|
16
|
+
# Setup specific command-line options for this tool. Other options are inherited from ProphetTool
#
prophet_tool = ProphetTool.new({:glyco=>true,:explicit_output=>true,:maldi=>true})
prophet_tool.option_parser.banner = "Run PeptideProphet on a set of pep.xml input files.\n\nUsage: peptide_prophet.rb [options] file1.pep.xml file2.pep.xml ..."
prophet_tool.options.output_suffix = "_pproph"

# Boolean switches as [option name, OptionParser arguments]. Each option
# defaults to false and becomes true when its switch is given. The order is
# the order in which the switches are registered with the parser.
boolean_switches = [
  [:useicat, ['--useicat', "Use icat information"]],
  [:nouseicat, ['--no-useicat', "Do not use icat information"]],
  [:phospho, ['--phospho', "Use phospho information"]],
  [:usepi, ['--usepi', "Use pI information"]],
  [:usert, ['--usert', "Use hydrophobicity / RT information"]],
  [:accurate_mass, ['--accurate-mass', "Use accurate mass binning"]],
  [:no_ntt, ['--no-ntt', "Don't use NTT model"]],
  [:no_nmc, ['--no-nmc', "Don't use NMC model"]],
  [:usegamma, ['--usegamma', "Use Gamma distribution to model the negatives"]],
  [:use_only_expect, ['--use-only-expect', "Only use Expect Score as the discriminant"]],
  [:force_fit, ['--force-fit', "Force fitting of mixture model and bypass checks"]],
  [:allow_alt_instruments, ['--allow-alt-instruments', "Warning instead of exit with error if instrument types between runs is different"]],
  [:one_ata_time, ['-F', '--one-ata-time', 'Create a separate pproph output file for each analysis']]
]

boolean_switches.each do |option_name, parser_args|
  prophet_tool.options.send("#{option_name}=", false)
  prophet_tool.option_parser.on(*parser_args) do
    prophet_tool.options.send("#{option_name}=", true)
  end
end

prophet_tool.option_parser.parse!

# Reject several inputs only when both --output and -F are in effect.
# one_ata_time defaults to false, so the previous `!= nil` test was always true
# and rejected --output with several inputs even in combined mode.
if ARGV.length > 1 && !prophet_tool.explicit_output.nil? && prophet_tool.one_ata_time
  raise "When --output and -F options are set only one file at a time can be run"
end
|
89
|
+
|
90
|
+
# Obtain a global environment object
|
91
|
+
# Obtain a global environment object
genv = Constants.new

# Interrogate all the input files to obtain the database and search engine from them
#
genv.log("Determining search engine and database used to create input files ...",:info)
file_info = {}
ARGV.each do |file_name|
  name = file_name.chomp
  file_info[name] = {
    :engine => prophet_tool.extract_engine(name),
    :database => prophet_tool.extract_db(name)
  }
end

# Check that all searches were performed with the same engine and database.
# The first values seen become the reference that every file must match.
# (The database was previously only tested for being non-nil, so files
# searched against different databases were accepted.)
#
engine = nil
database = nil
inputs = file_info.collect do |name, info|
  engine ||= info[:engine]
  database ||= info[:database]

  unless info[:engine] == engine && info[:database] == database
    raise "All files to be analyzed must have been searched with the same database and search engine"
  end

  # Inputs ending in .dat are passed on unchanged; everything else is
  # normalised to <input base path>.pep.xml
  retname = "#{prophet_tool.input_base_path(name,".pep.xml")}.pep.xml"
  retname = name if name =~ /\.dat$/
  retname
end
|
130
|
+
|
131
|
+
# Builds the xinteract command line used to run PeptideProphet.
#
# genv         - environment object; only its xinteract value is read
# prophet_tool - tool object answering the boolean options listed below
# inputs       - one input path, or an Array of input paths
# output       - output path, passed with -N
# database     - database path, passed with -D
# engine       - search engine name; "omssa" and "phenyx" also get "-Op -P"
#
# Returns the command as a String.
def generate_command(genv,prophet_tool,inputs,output,database,engine)

  cmd = "#{genv.xinteract} -N#{output} -l7 -eT -D#{database} "

  # Boolean option => xinteract switch, in the order they are emitted.
  # Every switch is padded with a space on both sides. "-Of" used to be
  # appended without a trailing space and fused with the switch that followed
  # it (giving "-Of-Op" or "-Of-ddecoy").
  switches = [
    [:glyco, "-Og"],
    [:phospho, "-OH"],
    [:usepi, "-OI"],
    [:usert, "-OR"],
    [:accurate_mass, "-OA"],
    [:no_ntt, "-ON"],
    [:no_nmc, "-OM"],
    [:usegamma, "-OG"],
    [:use_only_expect, "-OE"],
    [:force_fit, "-OF"],
    [:allow_alt_instruments, "-Ow"],
    [:useicat, "-Oi"],
    [:nouseicat, "-Of"]
  ]
  switches.each do |option, switch|
    cmd << " #{switch} " if prophet_tool.send(option)
  end

  # For MALDI data every charge from 2 to 7 is passed with -I.
  # The list used to read "-I2 -T3 -I4 ..."; -T3 is treated as a typo for -I3.
  # NOTE(review): confirm against the xinteract usage text.
  cmd << " -I2 -I3 -I4 -I5 -I6 -I7 " if prophet_tool.maldi

  cmd << "-Op -P " if engine == "omssa" || engine == "phenyx"
  cmd << "-ddecoy "

  input_list = inputs.is_a?(Array) ? inputs.join(" ") : inputs
  cmd << " #{input_list}"

  cmd
end
|
206
|
+
|
207
|
+
# Runs a PeptideProphet command unless its output is already present.
#
# genv         - environment object, used for logging and handed on to the tool
# prophet_tool - tool object providing over_write and run
# cmd          - the command line to execute
# output_path  - output file of the command; also the stem of the job script path
# engine       - search engine name, used as the job id
#
# If output_path exists and over_write is off the run is skipped with a
# warning. Raises RuntimeError when the command returns a non-zero exit code
# (raise is used because throw is meant for catch/throw control flow).
def run_peptide_prophet(genv,prophet_tool,cmd,output_path,engine)
  if !prophet_tool.over_write && Pathname.new(output_path).exist?
    genv.log("Skipping analysis on existing file #{output_path}",:warn)
    return
  end

  job_params = {:jobid=>engine, :vmem=>"900mb", :queue => "lowmem"}
  code = prophet_tool.run(cmd,genv,job_params,"#{output_path}.pbs.sh")
  raise "Command failed with exit code #{code}" unless code==0
end
|
217
|
+
|
218
|
+
|
219
|
+
if prophet_tool.one_ata_time
  # One PeptideProphet run, with its own output file, per input
  inputs.each do |input|
    output_file_name = "#{prophet_tool.output_prefix}#{input}_#{engine}_interact#{prophet_tool.output_suffix}.pep.xml"
    cmd = generate_command(genv,prophet_tool,input,output_file_name,database,engine)

    # The output path passed here used to be the undefined name
    # output_file_base_name, which raised NameError as soon as -F was used.
    run_peptide_prophet(genv,prophet_tool,cmd,output_file_name,engine)
  end
else
  # All inputs are analysed together into one output file: the explicit
  # output when given, otherwise a name derived from prefix, engine and suffix.
  output_file_name = prophet_tool.explicit_output
  if output_file_name.nil?
    output_file_name = "#{prophet_tool.output_prefix}#{engine}_interact#{prophet_tool.output_suffix}.pep.xml"
  end

  cmd = generate_command(genv,prophet_tool,inputs,output_file_name,database,engine)
  run_peptide_prophet(genv,prophet_tool,cmd,output_file_name,engine)
end
|
244
|
+
|
245
|
+
|
@@ -0,0 +1,78 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file is part of protk
|
4
|
+
# Created by Ira Cooke 18/1/2011
|
5
|
+
#
|
6
|
+
# Convert a pepXML file to a tab delimited table
|
7
|
+
#
|
8
|
+
#
|
9
|
+
|
10
|
+
require 'libxml'
|
11
|
+
require 'protk/constants'
|
12
|
+
require 'protk/command_runner'
|
13
|
+
require 'protk/tool'
|
14
|
+
|
15
|
+
include LibXML
|
16
|
+
|
17
|
+
# Setup specific command-line options for this tool. Other options are inherited from ProphetTool
|
18
|
+
#
|
19
|
+
# Setup specific command-line options for this tool. Other options are inherited from Tool
#
tool = Tool.new({:explicit_output=>true})
tool.option_parser.banner = "Convert a pepXML file to a tab delimited table.\n\nUsage: pepxml_to_table.rb [options] file1.pep.xml"

tool.option_parser.parse!

input_file = ARGV[0]

# The table is written next to the input unless an explicit output was given
output_file = tool.explicit_output.nil? ? "#{input_file}.txt" : tool.explicit_output

# Namespace declaration handed to every XPath query
pepxml_ns = 'xmlns:http://regis-web.systemsbiology.net/pepXML'

column_names = %w[protein peptide assumed_charge calc_neutral_pep_mass neutral_mass retention_time
                  start_scan end_scan search_engine peptideprophet_prob interprophet_prob]

# The block form closes the output file even if parsing raises part way through
File.open("#{output_file}", 'w') do |output_fh|

  output_fh.write "#{column_names.join("\t")}\n"

  pepxml_doc = XML::Parser.file("#{input_file}").parse

  pepxml_doc.find('//xmlns:spectrum_query', pepxml_ns).each do |query|

    # Only the first search hit of each query is reported
    top_search_hit = query.find('./xmlns:search_result/xmlns:search_hit', pepxml_ns)[0]

    # A query without any search hit has nothing to report
    next if top_search_hit.nil?

    # Guess the search engine from the names of the scores attached to the hit.
    # Array#grep returns an array, which is truthy even when empty, so the
    # matches must be tested with any? for the later branches to be reachable.
    search_score_names = top_search_hit.find('./xmlns:search_score/@name', pepxml_ns).collect { |s| s.to_s }

    search_engine = ""
    if search_score_names.length == 2 && search_score_names.any? { |n| n =~ /^name.*=.*pvalue/ }
      search_engine = "omssa"
    elsif search_score_names.any? { |n| n =~ /^name.*=.*ionscore/ }
      search_engine = "mascot"
    elsif search_score_names.any? { |n| n =~ /^name.*=.*hyperscore/ }
      search_engine = "x!tandem"
    end

    # Probabilities are optional; the column is left empty when absent
    pp_result = top_search_hit.find('./xmlns:analysis_result/xmlns:peptideprophet_result/@probability', pepxml_ns)
    ip_result = top_search_hit.find('./xmlns:analysis_result/xmlns:interprophet_result/@probability', pepxml_ns)

    peptide_prophet_prob = pp_result.length > 0 ? pp_result[0].value : ""
    interprophet_prob = ip_result.length > 0 ? ip_result[0].value : ""

    # Same column order as column_names
    row = [
      top_search_hit.attributes['protein'],
      top_search_hit.attributes['peptide'],
      query.attributes['assumed_charge'],
      top_search_hit.attributes['calc_neutral_pep_mass'],
      query.attributes['precursor_neutral_mass'],
      query.attributes['retention_time_sec'],
      query.attributes['start_scan'],
      query.attributes['end_scan'],
      search_engine,
      peptide_prophet_prob,
      interprophet_prob
    ]

    output_fh.write "#{row.join("\t")}\n"

  end

end
|
@@ -0,0 +1,140 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file is part of protk
|
4
|
+
# Created by Ira Cooke 17/1/2011
|
5
|
+
#
|
6
|
+
# Runs the Protein Prophet tool on a set of pep.xml files. Accepts input from peptide_prophet or interprophet.
|
7
|
+
#
|
8
|
+
#
|
9
|
+
|
10
|
+
require 'protk/constants'
|
11
|
+
require 'protk/command_runner'
|
12
|
+
require 'protk/prophet_tool'
|
13
|
+
require 'protk/galaxy_stager'
|
14
|
+
require 'protk/galaxy_util'
|
15
|
+
|
16
|
+
# Under galaxy the first argument is handed to a GalaxyStager (with a .pep.xml
# extension) and the output is forced to a fixed file name by appending -o.
for_galaxy = GalaxyUtil.for_galaxy

if for_galaxy
  original_input_file = ARGV.first
  original_input_path = Pathname.new("#{original_input_file}")
  input_stager = GalaxyStager.new("#{original_input_file}", :extension => '.pep.xml')
  ARGV.push("-o", "protein_prophet_results.prot.xml")
end
|
26
|
+
|
27
|
+
# Setup specific command-line options for this tool. Other options are inherited from ProphetTool
|
28
|
+
#
|
29
|
+
# Tool object; generic options are inherited from ProphetTool and the
# ProteinProphet specific ones are registered below.
prophet_tool = ProphetTool.new({:glyco=>true,:explicit_output=>true})
prophet_tool.option_parser.banner = "Run ProteinProphet on a set of pep.xml input files.\n\nUsage: protein_prophet.rb [options] file1.pep.xml file2.pep.xml ..."
prophet_tool.options.output_suffix = "_protproph"

# Boolean switches as [option name, switch, description]: each option starts
# out false and is set to true when its switch is present.
[
  [:iproph, '--iprophet-input', "Inputs are from iProphet"],
  [:nooccam, '--no-occam', "Do not attempt to derive the simplest protein list explaining observed peptides"],
  [:groupwts, '--group-wts', "Check peptide's total weight (rather than actual weight) in the Protein Group against the threshold"],
  [:normprotlen, '--norm-protlen', "Normalize NSP using Protein Length"],
  [:logprobs, '--log-prob', "Use the log of probability in the confidence calculations"],
  [:confem, '--confem', "Use the EM to compute probability given the confidence"],
  [:allpeps, '--allpeps', "Consider all possible peptides in the database in the confidence model"],
  [:unmapped, '--unmapped', "Report results for unmapped proteins"],
  [:instances, '--instances', "Use Expected Number of Ion Instances to adjust the peptide probabilities prior to NSP adjustment"],
  [:delude, '--delude', "Do NOT use peptide degeneracy information when assessing proteins"]
].each do |option_name, switch, description|
  prophet_tool.options.send("#{option_name}=", false)
  prophet_tool.option_parser.on(switch, description) do
    prophet_tool.options.send("#{option_name}=", true)
  end
end

# Options that carry a value, as [option name, default, switch, description]
[
  [:minprob, 0.05, '--minprob mp', "Minimum peptide prophet probability for peptides to be considered"],
  [:minindep, 0, '--minindep mp', "Minimum percentage of independent peptides required for a protein"]
].each do |option_name, default, switch, description|
  prophet_tool.options.send("#{option_name}=", default)
  prophet_tool.option_parser.on(switch, description) do |mp|
    prophet_tool.options.send("#{option_name}=", mp)
  end
end

prophet_tool.option_parser.parse!
|
94
|
+
|
95
|
+
|
96
|
+
# Obtain a global environment object
|
97
|
+
# Obtain a global environment object
genv = Constants.new

# Use the explicit output when given, otherwise derive a name from prefix and suffix
output_file = prophet_tool.explicit_output
output_file = "#{prophet_tool.output_prefix}interact#{prophet_tool.output_suffix}.prot.xml" if output_file.nil?

p output_file

if !Pathname.new(output_file).exist? || prophet_tool.over_write

  inputs = ARGV.collect { |file_name| file_name.chomp }

  # ProteinProphet takes the inputs, then the output, then option keywords
  cmd = "#{genv.proteinprophet} "
  cmd << " #{inputs.join(" ")} #{output_file}"

  cmd << " GLYC " if prophet_tool.glyco

  # Forward the boolean switches registered by this script. They used to be
  # parsed and then silently ignored.
  keyword_for_option = [
    [:iproph, "IPROPHET"],
    [:nooccam, "NOOCCAM"],
    [:groupwts, "GROUPWTS"],
    [:normprotlen, "NORMPROTLEN"],
    [:logprobs, "LOGPROBS"],
    [:confem, "CONFEM"],
    [:allpeps, "ALLPEPS"],
    [:unmapped, "UNMAPPED"],
    [:instances, "INSTANCES"],
    [:delude, "DELUDE"]
  ]
  keyword_for_option.each do |option, keyword|
    cmd << " #{keyword} " if prophet_tool.options.send(option)
  end

  # NOTE(review): --minprob and --minindep are parsed but still not forwarded.
  # Confirm the exact MINPROB / MININDEP argument syntax against the
  # ProteinProphet usage text before adding them here.

  # Run the analysis
  #
  jobscript_path = "#{output_file}.pbs.sh"
  job_params = {:jobid=>"protproph", :vmem=>"900mb", :queue => "lowmem"}
  genv.log("Running #{cmd}",:info)
  code = prophet_tool.run(cmd,genv,job_params,jobscript_path)

  # A failed command is an error, so raise (throw is for catch/throw control flow)
  raise "Command failed with exit code #{code}" unless code==0
else
  genv.log("Protein Prophet output file #{output_file} already exists. Run with -r option to replace",:warn)
end

if for_galaxy
  # Restore references to peptide prophet xml so downstream tools like
  # libra can find it.
  input_stager.restore_references("protein_prophet_results.prot.xml")
end
|
137
|
+
|
138
|
+
|
139
|
+
|
140
|
+
|
data/bin/protk_setup.rb
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file is part of protk
|
4
|
+
# Created by Ira Cooke 20/10/2012
|
5
|
+
#
|
6
|
+
# Post-install setup for protk.
|
7
|
+
# Installs third party tools
|
8
|
+
#
|
9
|
+
|
10
|
+
require 'protk/constants'
|
11
|
+
require 'protk/setup_tool'
|
12
|
+
require 'yaml'
|
13
|
+
require 'pp'
|
14
|
+
|
15
|
+
|
16
|
+
# Setup specific command-line options for this tool. Other options are inherited from Tool
|
17
|
+
#
|
18
|
+
# Command-line handling comes from SetupTool; a banner is only supplied here
# when the parser does not already have one.
tool = SetupTool.new
setup_parser = tool.option_parser
setup_parser.banner = "Post install tasks for protk.\nUsage: protk_setup.rb [options] toolname" if setup_parser.banner == ""

setup_parser.parse!

# Create install directory if it doesn't already exist
# (presumably a side effect of constructing Constants; the object itself is not used below)
env = Constants.new

# Every remaining argument names a tool to install
ARGV.each { |toolname| tool.install toolname }
|