protk 1.1.0.pre
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +85 -0
- data/bin/annotate_ids.rb +59 -0
- data/bin/big_search.rb +41 -0
- data/bin/correct_omssa_retention_times.rb +27 -0
- data/bin/feature_finder.rb +76 -0
- data/bin/file_convert.rb +157 -0
- data/bin/generate_omssa_loc.rb +42 -0
- data/bin/interprophet.rb +91 -0
- data/bin/make_decoy.rb +64 -0
- data/bin/manage_db.rb +123 -0
- data/bin/mascot_search.rb +187 -0
- data/bin/mascot_to_pepxml.rb +44 -0
- data/bin/msgfplus_search.rb +191 -0
- data/bin/omssa_search.rb +205 -0
- data/bin/peptide_prophet.rb +245 -0
- data/bin/pepxml_to_table.rb +78 -0
- data/bin/protein_prophet.rb +140 -0
- data/bin/protk_setup.rb +31 -0
- data/bin/repair_run_summary.rb +113 -0
- data/bin/tandem_search.rb +292 -0
- data/bin/template_search.rb +144 -0
- data/bin/unimod_to_loc.rb +118 -0
- data/bin/xls_to_table.rb +46 -0
- data/ext/protk/extconf.rb +3 -0
- data/ext/protk/protk.c +235 -0
- data/lib/protk/big_search_rakefile.rake +16 -0
- data/lib/protk/big_search_tool.rb +23 -0
- data/lib/protk/bio_sptr_extensions.rb +210 -0
- data/lib/protk/biotools_excel_converter.rb +60 -0
- data/lib/protk/command_runner.rb +84 -0
- data/lib/protk/constants.rb +296 -0
- data/lib/protk/data/FeatureFinderCentroided.ini +63 -0
- data/lib/protk/data/apt-get_packages.yaml +47 -0
- data/lib/protk/data/brew_packages.yaml +10 -0
- data/lib/protk/data/default_config.yml +20 -0
- data/lib/protk/data/predefined_db.crap.yaml +19 -0
- data/lib/protk/data/predefined_db.sphuman.yaml +25 -0
- data/lib/protk/data/predefined_db.swissprot_annotation.yaml +20 -0
- data/lib/protk/data/predefined_db.swissprot_fasta_annotation.yaml +20 -0
- data/lib/protk/data/tandem_params.xml +56 -0
- data/lib/protk/data/taxonomy_template.xml +9 -0
- data/lib/protk/data/unimod.xml +16780 -0
- data/lib/protk/eupathdb_gene_information_table.rb +158 -0
- data/lib/protk/galaxy_stager.rb +24 -0
- data/lib/protk/galaxy_util.rb +9 -0
- data/lib/protk/manage_db_rakefile.rake +484 -0
- data/lib/protk/manage_db_tool.rb +181 -0
- data/lib/protk/mascot_util.rb +63 -0
- data/lib/protk/omssa_util.rb +57 -0
- data/lib/protk/plasmodb.rb +50 -0
- data/lib/protk/prophet_tool.rb +85 -0
- data/lib/protk/protein_annotator.rb +646 -0
- data/lib/protk/protxml.rb +137 -0
- data/lib/protk/randomize.rb +7 -0
- data/lib/protk/search_tool.rb +182 -0
- data/lib/protk/setup_rakefile.rake +245 -0
- data/lib/protk/setup_tool.rb +19 -0
- data/lib/protk/spreadsheet_extensions.rb +78 -0
- data/lib/protk/swissprot_database.rb +38 -0
- data/lib/protk/tool.rb +182 -0
- data/lib/protk/xtandem_defaults.rb +11 -0
- data/lib/protk.rb +18 -0
- metadata +256 -0
@@ -0,0 +1,113 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file is part of protk
|
4
|
+
# Created by Ira Cooke 2/12/2011
|
5
|
+
#
|
6
|
+
# Repairs the msms_run_summary tag in a pepXML document to contain a specified file and datatype
|
7
|
+
# This tool should only be used on pepXML files that contain a single msms_run_summary (eg not interprophet results)
|
8
|
+
#
|
9
|
+
|
10
|
+
|
11
|
+
require 'protk/constants'
|
12
|
+
require 'protk/command_runner'
|
13
|
+
require 'protk/tool'
|
14
|
+
require 'libxml'
|
15
|
+
|
16
|
+
include LibXML
|
17
|
+
|
18
|
+
# Environment with global constants
#
genv=Constants.new

# Setup specific command-line options for this tool. Other options are inherited from Tool
#
tool=Tool.new()
tool.option_parser.banner = "Repair msms_run_summary tag in a pepXML file.\n\nUsage: repair_run_summary.rb [options] file1.pepXML"

tool.options.new_base_name=nil
tool.option_parser.on( '-N', '--base-name mzmlfile', 'Original MSMS spectrum file used for search' ) do |file|
  tool.options.new_base_name = file
end

tool.options.raw_data_type=nil
tool.option_parser.on( '-R', '--raw-type type', 'Raw data type used for search' ) do |type|
  tool.options.raw_data_type = type
end

tool.options.omssa_ion_tolerance=nil
tool.option_parser.on('--omssa-itol fitol','Add a fragment ion tolerance parameter to the omssa search summary') do |fitol|
  tool.options.omssa_ion_tolerance=fitol
end

tool.option_parser.parse!

# The first remaining argument is the pepXML file. It is read, modified and
# saved back to the same path.
#
pepxml_file=ARGV[0]
throw "A pepXML file must be provided" if pepxml_file.nil?

# Namespace binding passed to every namespaced XPath query below
#
pepxml_ns='xmlns:http://regis-web.systemsbiology.net/pepXML'

# Read the input file
#
parser=XML::Parser.file(pepxml_file)
doc=parser.parse

# Read the values from the same options object they were stored on above
#
new_base_name=tool.options.new_base_name
raw_data_type=tool.options.raw_data_type

genv.log("Repairing #{pepxml_file} to #{new_base_name} format #{raw_data_type}",:info)

if new_base_name.nil?
  # Try X!Tandem first
  # It would be parameter spectrum,path
  #
  spectrum_path = doc.find('//xmlns:msms_run_summary/xmlns:search_summary/xmlns:parameter[@name="spectrum, path"]',pepxml_ns)[0]
  unless spectrum_path.nil?
    new_base_name=spectrum_path.attributes['value']
    raw_data_type="mzML" # Always is for X!Tandem
  end
end

if new_base_name.nil?
  # Try Mascot
  # It would be parameter File path
  #
  #<parameter name="FILE" value="dataset_2.dat"/>
  file_path = doc.find('//xmlns:msms_run_summary/xmlns:search_summary/xmlns:parameter[@name="FILE"]',pepxml_ns)[0]
  unless file_path.nil?
    # Keep the directory of the existing base_name and swap in the file
    # name recorded in the FILE parameter
    mascot_summary=doc.find('//xmlns:msms_run_summary',pepxml_ns)[0]
    old_base_name=mascot_summary.attributes['base_name']
    base_dir_path=Pathname.new(old_base_name).dirname.to_s

    new_base_name="#{base_dir_path}/#{file_path.attributes['value']}"
    raw_data_type="mgf" # Always is for Mascot
  end
end

throw "Could not find original spectrum filename in pepXML and none provided" if new_base_name.nil?

run_summary=doc.find('//xmlns:msms_run_summary',pepxml_ns)
if run_summary[0].nil?
  # Try without namespace (OMSSA)
  run_summary=doc.find('//msms_run_summary')

  unless tool.options.omssa_ion_tolerance.nil?
    search_summary=doc.find('//search_summary')[0]
    throw "No search summary found" if search_summary.nil?

    # Append <parameter name="to" value="..."/> holding the requested tolerance
    pmnode=XML::Node.new('parameter')
    pmnode["name"]="to"
    pmnode["value"]=tool.options.omssa_ion_tolerance.to_s
    search_summary << pmnode
  end

  # On this path the raw data type is always set to mgf, replacing any -R value
  raw_data_type="mgf"
end

throw "No run summary found" if run_summary[0].nil?

run_summary[0].attributes['base_name']=new_base_name
run_summary[0].attributes['raw_data']=raw_data_type

doc.save(pepxml_file)
|
@@ -0,0 +1,292 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file is part of protk
|
4
|
+
# Created by Ira Cooke 17/12/2010
|
5
|
+
#
|
6
|
+
# Runs an MS/MS search using the X!Tandem search engine
|
7
|
+
#
|
8
|
+
|
9
|
+
require 'protk/constants'
|
10
|
+
require 'protk/command_runner'
|
11
|
+
require 'protk/search_tool'
|
12
|
+
require 'protk/xtandem_defaults'
|
13
|
+
require 'libxml'
|
14
|
+
|
15
|
+
include LibXML
|
16
|
+
|
17
|
+
# Environment with global constants
#
genv=Constants.new

# Setup specific command-line options for this tool. Other options are inherited from SearchTool
#
search_tool=SearchTool.new({:msms_search=>true,:background=>true,:glyco=>true,:database=>true,:explicit_output=>true,:over_write=>true,:msms_search_detailed_options=>true})
search_tool.jobid_prefix="x"
search_tool.option_parser.banner = "Run an X!Tandem msms search on a set of mzML input files.\n\nUsage: tandem_search.rb [options] file1.mzML file2.mzML ..."
search_tool.options.output_suffix="_tandem"

# The parameter template defaults to the one located by XTandemDefaults
# unless -T names another file
#
tandem_defaults=XTandemDefaults.new.path
search_tool.options.tandem_params=tandem_defaults
search_tool.option_parser.on( '-T', '--tandem-params tandem', 'XTandem parameters to use' ) do |parms|
  search_tool.options.tandem_params = parms
end

search_tool.options.no_pepxml=false
search_tool.option_parser.on( '-P', '--no-pepxml', 'Dont convert to pepXML after running the search') do
  search_tool.options.no_pepxml=true
end

search_tool.options.keep_params_files=false
search_tool.option_parser.on( '-K', '--keep-params-files', 'Keep X!Tandem parameter files' ) do
  search_tool.options.keep_params_files = true
end

search_tool.option_parser.parse!


# Set search engine specific parameters on the SearchTool object
#
tandem_bin="#{genv.xtandem}"

# FileTest.exist? is used because the exists? alias is not available on
# current Ruby versions
throw "Could not find X!Tandem executable" unless FileTest.exist?(tandem_bin)

tandem_params=search_tool.tandem_params

# A database argument that names an existing file is used directly.
# Anything else is resolved through SearchTool.
#
if Pathname.new(search_tool.database).exist? # It's an explicitly named db
  current_db=Pathname.new(search_tool.database).realpath.to_s
else
  current_db=search_tool.current_database :fasta
end

# Parse options from a parameter file (if provided), or from the default parameter file
#
params_parser=XML::Parser.file(tandem_params)
std_params=params_parser.parse

# Parse taxonomy template file
#
taxo_parser=XML::Parser.file(XTandemDefaults.new.taxonomy_path)
taxo_doc=taxo_parser.parse
|
74
|
+
|
75
|
+
# Galaxy changes things like @ to __at__ we need to change it back
#
# Replaces the tokens __at__, __oc__, __cc__, __ob__ and __cb__ with
# @, {, }, [ and ] respectively.
#
# mstring - the modification string to decode. It is not modified, so frozen
#           strings are accepted.
#
# Returns a new String with every token replaced.
#
def decode_modification_string(mstring)
  mstring.gsub("__at__","@").
    gsub("__oc__","{").
    gsub("__cc__","}").
    gsub("__ob__","[").
    gsub("__cb__","]")
end
|
85
|
+
|
86
|
+
# Fill in an X!Tandem parameter document with the settings for one search.
#
# std_params  - parsed parameter document; it is modified in place and returned
# output_path - written to the "output, path" note
# input_path  - written to the "spectrum, path" note
# taxo_path   - written to the "list path, taxonomy information" note
# current_db  - accepted for interface compatibility; not read here
# search_tool - supplies the database name, tolerances and modification settings
# genv        - supplies tpp_root, used to build the scoring defaults path
#
# Throws when a note that must appear exactly once is missing or duplicated.
# Custom modification notes are appended on every call, so a document that is
# passed in twice will carry them twice.
#
# Returns std_params.
#
def generate_parameter_doc(std_params,output_path,input_path,taxo_path,current_db,search_tool,genv)

  # Set the input and output paths
  #
  input_notes=std_params.find('/bioml/note[@type="input" and @label="spectrum, path"]')
  throw "Exactly one spectrum, path note is required in the parameter file" unless input_notes.length==1
  input_notes[0].content=input_path

  output_notes=std_params.find('/bioml/note[@type="input" and @label="output, path"]')
  throw "Exactly one output, path note is required in the parameter file" unless output_notes.length==1
  output_notes[0].content=output_path

  # Set the path to the scoring algorithm default params. We use one from ISB
  #
  scoring_notes=std_params.find('/bioml/note[@type="input" and @label="list path, default parameters"]')
  throw "Exactly one list path, default parameters note is required in the parameter file" unless scoring_notes.length==1
  scoring_notes[0].content="#{genv.tpp_root}/bin/isb_default_input_kscore.xml"

  # Taxonomy and Database
  #
  db_notes=std_params.find('/bioml/note[@type="input" and @label="protein, taxon"]')
  throw "Exactly one protein, taxon note is required in the parameter file" unless db_notes.length==1
  db_notes[0].content=search_tool.database.downcase

  taxo_notes=std_params.find('/bioml/note[@type="input" and @label="list path, taxonomy information"]')
  throw "Exactly one list path, taxonomy information note is required in the parameter file" unless taxo_notes.length==1
  taxo_notes[0].content=taxo_path

  # Fragment mass matching
  #
  fmass=std_params.find('/bioml/note[@type="input" and @label="spectrum, fragment monoisotopic mass error"]')
  throw "Exactly one spectrum, fragment monoisotopic mass error note is required in the parameter file" unless fmass.length==1
  fmass[0].content=search_tool.fragment_tol.to_s

  # Precursor mass matching. Half of the requested tolerance is written to
  # each of the plus and minus notes.
  #
  precursor_tol = search_tool.precursor_tol
  ptol_plus=precursor_tol*0.5
  ptol_minus=precursor_tol*0.5

  pmass_minus=std_params.find('/bioml/note[@type="input" and @label="spectrum, parent monoisotopic mass error minus"]')
  throw "Exactly one spectrum, parent monoisotopic mass error minus note is required in the parameter file" unless pmass_minus.length==1
  pmass_minus[0].content=ptol_minus.to_s

  pmass_plus=std_params.find('/bioml/note[@type="input" and @label="spectrum, parent monoisotopic mass error plus"]')
  throw "Exactly one spectrum, parent monoisotopic mass error plus note is required in the parameter file" unless pmass_plus.length==1
  pmass_plus[0].content=ptol_plus.to_s

  pmass_err_units=std_params.find('/bioml/note[@type="input" and @label="spectrum, parent monoisotopic mass error units"]')
  throw "Exactly one spectrum, parent monoisotopic mass error units note is required in the parameter file. Got #{pmass_err_units.length}" unless pmass_err_units.length==1
  pmass_err_units[0].content=search_tool.precursor_tolu

  if search_tool.strict_monoisotopic_mass
    isotopic_error=std_params.find('/bioml/note[@type="input" and @label="spectrum, parent monoisotopic mass isotope error"]')
    throw "Exactly one spectrum, parent monoisotopic mass isotope error is required in the parameter file" unless isotopic_error.length==1
    isotopic_error[0].content="no"
  end

  # Fixed and Variable Modifications
  #
  # The template carries predefined modification notes identified by id.
  # Each one is removed when the matching option is off.
  #
  unless search_tool.carbamidomethyl
    mods=std_params.find('/bioml/note[@type="input" and @id="carbamidomethyl-fixed"]')
    mods.each { |node| node.remove! }
  end

  unless search_tool.glyco
    mods=std_params.find('/bioml/note[@type="input" and @id="glyco-variable"]')
    mods.each { |node| node.remove! }
  end

  unless search_tool.methionine_oxidation
    mods=std_params.find('/bioml/note[@type="input" and @id="methionine-oxidation-variable"]')
    mods.each { |node| node.remove! }
  end

  # User supplied modifications arrive as comma separated lists. Blank
  # entries are dropped and Galaxy escape tokens are decoded.
  #
  var_mods = search_tool.var_mods.split(",").map(&:strip).reject(&:empty?)
  var_mods = var_mods.map { |mod| decode_modification_string(mod) }
  fix_mods = search_tool.fix_mods.split(",").map(&:strip).reject(&:empty?)
  fix_mods = fix_mods.map { |mod| decode_modification_string(mod) }

  root_bioml_node=std_params.find('/bioml')[0]

  # Custom modifications are added as <note> elements, the same element
  # every other parameter in this document uses. An entry containing a
  # bracket, brace, parenthesis or ! is labelled as a motif, otherwise as a
  # mass.
  #
  var_mods.each_with_index do |vm,index|
    mod_type = ( vm=~/[\[\]\(\)\{\}\!]/ ) ? "potential modification motif" : "potential modification mass"

    mnode=XML::Node.new('note')
    mnode["id"]="custom-variable-mod-#{index+1}"
    mnode["type"]="input"
    mnode["label"]="residue, #{mod_type}"
    mnode.content=vm

    root_bioml_node << mnode
  end

  fix_mods.each_with_index do |fm,index|
    mod_type = ( fm=~/[\[\]\(\)\{\}\!]/ ) ? "modification motif" : "modification mass"

    mnode=XML::Node.new('note')
    mnode["id"]="custom-fixed-mod-#{index+1}"
    mnode["type"]="input"
    mnode["label"]="residue, #{mod_type}"
    mnode.content=fm

    root_bioml_node << mnode
  end

  std_params

end
|
208
|
+
|
209
|
+
# Point the taxonomy template at the database used for this search.
#
# taxo_doc    - parsed taxonomy template; it is modified in place and returned
# current_db  - value stored in the URL attribute of the single file element
# search_tool - its database name, lower-cased, becomes the taxon label
#
# Throws unless the template has exactly one /bioml/taxon element and
# exactly one /bioml/taxon/file element.
#
def generate_taxonomy_doc(taxo_doc,current_db,search_tool)

  taxa=taxo_doc.find('/bioml/taxon')
  throw "Exactly one taxon label is required in the taxonomy_template file" if taxa.length!=1
  taxa[0].attributes['label']=search_tool.database.downcase

  files=taxo_doc.find('/bioml/taxon/file')
  throw "Exactly one database file is required in the taxonomy_template file" if files.length!=1
  files[0].attributes['URL']=current_db

  taxo_doc
end
|
221
|
+
|
222
|
+
# Run the search engine on each input file
#
# Shellwords is used to quote file paths placed on the shell command line
require 'shellwords'

ARGV.each do |filename|

  input_file=filename.chomp
  input_path=Pathname.new(input_file).realpath.to_s
  output_path="#{search_tool.output_base_path(input_file)}.tandem"

  if search_tool.explicit_output.nil?
    pepxml_path="#{output_path.match(/(.*)\.tandem$/)[1]}.pep.xml"
  else
    pepxml_path=search_tool.explicit_output
  end

  # The file that marks a finished search is the pepXML file, or the raw
  # X!Tandem output when pepXML conversion is switched off
  #
  final_output = search_tool.no_pepxml ? output_path : pepxml_path
  output_exists=Pathname.new(final_output).exist?

  # Only proceed if the output file is not present or we have opted to over-write it
  #
  if ( search_tool.over_write || !output_exists )

    # Create the taxonomy file in the same directory as the params file
    #
    taxo_path="#{search_tool.input_base_path(input_file)}.taxonomy.xml"
    mod_taxo_doc=generate_taxonomy_doc(taxo_doc,current_db,search_tool)
    mod_taxo_doc.save(taxo_path)

    # Modify the default XML document to contain search specific details and save it so it can be used in the search.
    # The template is parsed afresh for every input file because
    # generate_parameter_doc appends modification notes to the document it
    # is given; reusing one document would repeat them for each later file.
    #
    file_params=XML::Parser.file(tandem_params).parse
    mod_params=generate_parameter_doc(file_params,output_path,input_path,taxo_path,current_db,search_tool,genv)
    params_path="#{search_tool.input_base_path(input_file)}.tandem.params"
    mod_params.save(params_path)

    # File paths are shell-escaped so that spaces or shell metacharacters
    # in them cannot split or alter the command
    #
    quoted_output=Shellwords.escape(output_path)
    quoted_pepxml=Shellwords.escape(pepxml_path)
    quoted_params=Shellwords.escape(params_path)
    quoted_taxo=Shellwords.escape(taxo_path)

    # The basic command
    #
    cmd= "#{tandem_bin} #{quoted_params}"

    # pepXML conversion and repair
    #
    unless search_tool.no_pepxml
      repair_script="#{File.dirname(__FILE__)}/repair_run_summary.rb"
      cmd << "; #{genv.tandem2xml} #{quoted_output} #{quoted_pepxml}; #{Shellwords.escape(repair_script)} #{quoted_pepxml}; rm #{quoted_output}"
    end

    # Add a cleanup command unless the user wants to keep params files
    #
    unless search_tool.keep_params_files
      cmd << "; rm #{quoted_params}; rm #{quoted_taxo}"
    end

    # In case the user specified background running we need to create a jobscript path
    #
    jobscript_path="#{output_path}.pbs.sh"

    # Run the search
    #
    job_params= {:jobid => search_tool.jobid_from_filename(filename)}
    job_params[:queue]="lowmem"
    job_params[:vmem]="900mb"
    code = search_tool.run(cmd,genv,job_params,jobscript_path)
    throw "Command failed with exit code #{code}" unless code==0
  else
    genv.log("Skipping search on existing file #{output_path}",:warn)
  end

end
|
@@ -0,0 +1,144 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file is part of protk
|
4
|
+
# Created by Ira Cooke 14/12/2010
|
5
|
+
#
|
6
|
+
# Runs an MS/MS search using the MSGFPlus search engine
|
7
|
+
#
|
8
|
+
require 'protk/search_tool'
|
9
|
+
|
10
|
+
|
11
|
+
# Environment with global constants. The rest of this script reads genv, so
# it has to be created here.
#
require 'protk/constants'
genv=Constants.new

# Setup specific command-line options for this tool. Other options are inherited from SearchTool
#
search_tool=SearchTool.new({:msms_search=>true,:background=>false,:glyco=>true,:database=>true,:explicit_output=>true,:over_write=>true,:msms_search_detailed_options=>true})
search_tool.option_parser.banner = "Run an msms search on a set of msms spectrum input files.\n\nUsage: template_search.rb [options] file1.mzML file2.mzML ..."
search_tool.options.output_suffix="_template"

search_tool.options.custom_option="default"
search_tool.option_parser.on('--custom-opt value','Custom option relevant to this tool only (Default default)') do |val|
  search_tool.options.custom_option=val
end

search_tool.option_parser.parse!

# Set search engine specific parameters on the SearchTool object
#
msgf_bin="#{genv.msgf_bin}/MSGFPlus.jar"

# A database argument that names an existing file is used directly.
# Anything else is resolved through SearchTool.
#
if Pathname.new(search_tool.database).exist? # It's an explicitly named db
  current_db=Pathname.new(search_tool.database).realpath.to_s
else
  current_db=search_tool.current_database :fasta
end

throw "When --output is set only one file at a time can be run" if ( ARGV.length> 1 ) && ( search_tool.explicit_output!=nil )

# Run the search engine on each input file
#
ARGV.each do |filename|

  input_file=filename.chomp

  if search_tool.explicit_output.nil?
    output_path="#{search_tool.output_base_path(input_file)}.mzid"
  else
    output_path=search_tool.explicit_output
  end

  # (*.mzML, *.mzXML, *.mgf, *.ms2, *.pkl or *_dta.txt)
  # Get the input file extension
  ext = Pathname.new(filename).extname

  input_path="#{search_tool.input_base_path(input_file)}#{ext}"

  # Only proceed if the output file is not present or we have opted to over-write it
  #
  if ( search_tool.over_write || !Pathname.new(output_path).exist? )

    # The basic command
    #
    cmd= "java -Xmx#{search_tool.java_mem} -jar #{msgf_bin} -d #{current_db} -s #{input_path} -o #{output_path} "

    #Missed cleavages
    #
    throw "Maximum value for missed cleavages is 2" if ( search_tool.missed_cleavages > 2)
    cmd << " -ntt #{search_tool.missed_cleavages}"

    # Precursor tolerance
    #
    cmd << " -t #{search_tool.precursor_tol}#{search_tool.precursor_tolu}"

    # Instrument type
    #
    cmd << " -inst 2"

    # cmd << " -m 4"

    cmd << " -addFeatures 1"

    # Enzyme
    #
    # if ( search_tool.enzyme!="Trypsin")
    #   cmd << " -e #{search_tool.enzyme}"
    # end

    # The modifications file is created empty; nothing is written to it
    # here. The handle is closed straight away so it is not left open.
    mods_path="#{search_tool.input_base_path(input_file)}.msgfplus_mods.txt"
    File.open(mods_path,'w+').close

    # Variable Modifications
    #
    # "!~" is required here: "!x =~ /None/" negates x first and then matches
    # the boolean, which can never succeed.
    if ( search_tool.var_mods !="" && search_tool.var_mods !~ /None/ ) # Checking for none is to cope with galaxy input
      var_mods = search_tool.var_mods.split(",").map(&:strip).reject(&:empty?).join(",")
      if ( var_mods !="" )
        cmd << " -mv #{var_mods}"
      end
    else
      # Add options related to peptide modifications
      #
      if ( search_tool.glyco )
        cmd << " -mv 119 "
      end
    end

    # Fixed modifications
    #
    if ( search_tool.fix_mods !="" && search_tool.fix_mods !~ /None/ )
      fix_mods = search_tool.fix_mods.split(",").map(&:strip).reject(&:empty?).join(",")
      if ( fix_mods !="" )
        cmd << " -mf #{fix_mods}"
      end
    else
      if ( search_tool.has_modifications )
        cmd << " -mf "
        if ( search_tool.carbamidomethyl )
          cmd<<"3 "
        end

        if ( search_tool.methionine_oxidation )
          cmd<<"1 "
        end
      end
    end

    # Up to here we've formulated the search command. The rest is running it
    p "Running:#{cmd}"

    # Run the search
    #
    job_params= {:jobid => search_tool.jobid_from_filename(filename) }
    job_params[:queue]="lowmem"
    job_params[:vmem]="900mb"
    search_tool.run(cmd,genv,job_params)

  else
    genv.log("Skipping search on existing file #{output_path}",:warn)
  end

end
|
@@ -0,0 +1,118 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file is part of MSLIMS
|
4
|
+
# Created by Ira Cooke 12/4/2010
|
5
|
+
#
|
6
|
+
# Reads a unimod xml file (eg from a Mascot installation) and produces a loc file with names of allowable chemical modifications
|
7
|
+
#
|
8
|
+
|
9
|
+
require 'libxml'
|
10
|
+
|
11
|
+
include LibXML
|
12
|
+
|
13
|
+
# The unimod xml file to read is the first command-line argument
#
unimod_path=ARGV[0]
abort "Usage: unimod_to_loc.rb unimod.xml" if unimod_path.nil?

unimod_doc=XML::Parser.file(unimod_path).parse

# Titles that are not read from the file's specificity entries. Each one
# contributes this fixed list of names instead.
#
special_cases = {
  "Oxidation" => ["Oxidation (HW)","Oxidation (M)"],
  "Phospho" => ["Phospho (ST)","Phospho (Y)"],
  "Sulfo" => ["Sulfo (S)","Sulfo (T)","Sulfo (Y)"]
}

all_mods=[]

unimod_doc.find('//umod:unimod/umod:modifications/umod:mod').each do |mod|

  title=mod.attributes['title']

  # Special Cases
  #
  if special_cases.key?(title)
    special_cases[title].each { |name| all_mods.push(name) }
    next
  end

  anywhere_sites = mod.find('./umod:specificity[@hidden="0" and @position="Anywhere"]')

  if ( title !~ /^iTRAQ/ && title !~ /^mTRAQ/ )
    # Deal with the anywhere sites which can be concatenated
    #
    if ( anywhere_sites.length>0 )
      sites=[]
      anywhere_sites.each { |s| sites.push("#{s.attributes['site']}") }

      all_mods.push("#{title} (#{sites.sort.join})")
    end
  else
    # iTRAQ and mTRAQ titles get one entry per anywhere site
    anywhere_sites.each { |s| all_mods.push("#{title} (#{s.attributes['site']})") }
  end

  # Entries tied to a position other than "Anywhere" are listed one by one
  #
  specifics=mod.find('./umod:specificity[@hidden="0" and @position!="Anywhere"]')
  specifics.each do |specific_mod|

    position=specific_mod.attributes['position']
    site=specific_mod.attributes['site']

    # The later matches below deliberately override the earlier ones
    specificity=site
    specificity=position if ( position =~ /^Protein/ )
    specificity="N-term #{site}" if ( (position =~ /Any N-term/) && (site =~ /^[CQEM]$/) )
    specificity="C-term #{site}" if ( (position =~ /Any C-term/) && (site =~ /^[M]$/) )

    all_mods.push("#{title} (#{specificity})")
  end

end

all_mods=all_mods.sort {|a,b| a.downcase <=> b.downcase}

# The block form closes the output file even if writing fails part way
#
File.open("mascot_mods.loc",'w') do |loc_output|

  loc_output << "#This file lists the names of chemical modifications acceptable for proteomics search engines\n"
  loc_output << "#\n"
  loc_output << "#So, unimod_names.loc could look something like this:\n"
  loc_output << "#\n"

  all_mods.each do |am|
    # The key is the name lower-cased with spaces removed and each of
    # ( ) : -> replaced by an underscore
    key = am.downcase.gsub(" ","").gsub("\(","\_").gsub("\)","\_").gsub("\:","\_").gsub("\-\>","\_")
    loc_output << "#{am}\t#{key}\t#{am}\t#{key}\n"
  end

end
|
117
|
+
|
118
|
+
|