protk 1.1.0.pre
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +85 -0
- data/bin/annotate_ids.rb +59 -0
- data/bin/big_search.rb +41 -0
- data/bin/correct_omssa_retention_times.rb +27 -0
- data/bin/feature_finder.rb +76 -0
- data/bin/file_convert.rb +157 -0
- data/bin/generate_omssa_loc.rb +42 -0
- data/bin/interprophet.rb +91 -0
- data/bin/make_decoy.rb +64 -0
- data/bin/manage_db.rb +123 -0
- data/bin/mascot_search.rb +187 -0
- data/bin/mascot_to_pepxml.rb +44 -0
- data/bin/msgfplus_search.rb +191 -0
- data/bin/omssa_search.rb +205 -0
- data/bin/peptide_prophet.rb +245 -0
- data/bin/pepxml_to_table.rb +78 -0
- data/bin/protein_prophet.rb +140 -0
- data/bin/protk_setup.rb +31 -0
- data/bin/repair_run_summary.rb +113 -0
- data/bin/tandem_search.rb +292 -0
- data/bin/template_search.rb +144 -0
- data/bin/unimod_to_loc.rb +118 -0
- data/bin/xls_to_table.rb +46 -0
- data/ext/protk/extconf.rb +3 -0
- data/ext/protk/protk.c +235 -0
- data/lib/protk/big_search_rakefile.rake +16 -0
- data/lib/protk/big_search_tool.rb +23 -0
- data/lib/protk/bio_sptr_extensions.rb +210 -0
- data/lib/protk/biotools_excel_converter.rb +60 -0
- data/lib/protk/command_runner.rb +84 -0
- data/lib/protk/constants.rb +296 -0
- data/lib/protk/data/FeatureFinderCentroided.ini +63 -0
- data/lib/protk/data/apt-get_packages.yaml +47 -0
- data/lib/protk/data/brew_packages.yaml +10 -0
- data/lib/protk/data/default_config.yml +20 -0
- data/lib/protk/data/predefined_db.crap.yaml +19 -0
- data/lib/protk/data/predefined_db.sphuman.yaml +25 -0
- data/lib/protk/data/predefined_db.swissprot_annotation.yaml +20 -0
- data/lib/protk/data/predefined_db.swissprot_fasta_annotation.yaml +20 -0
- data/lib/protk/data/tandem_params.xml +56 -0
- data/lib/protk/data/taxonomy_template.xml +9 -0
- data/lib/protk/data/unimod.xml +16780 -0
- data/lib/protk/eupathdb_gene_information_table.rb +158 -0
- data/lib/protk/galaxy_stager.rb +24 -0
- data/lib/protk/galaxy_util.rb +9 -0
- data/lib/protk/manage_db_rakefile.rake +484 -0
- data/lib/protk/manage_db_tool.rb +181 -0
- data/lib/protk/mascot_util.rb +63 -0
- data/lib/protk/omssa_util.rb +57 -0
- data/lib/protk/plasmodb.rb +50 -0
- data/lib/protk/prophet_tool.rb +85 -0
- data/lib/protk/protein_annotator.rb +646 -0
- data/lib/protk/protxml.rb +137 -0
- data/lib/protk/randomize.rb +7 -0
- data/lib/protk/search_tool.rb +182 -0
- data/lib/protk/setup_rakefile.rake +245 -0
- data/lib/protk/setup_tool.rb +19 -0
- data/lib/protk/spreadsheet_extensions.rb +78 -0
- data/lib/protk/swissprot_database.rb +38 -0
- data/lib/protk/tool.rb +182 -0
- data/lib/protk/xtandem_defaults.rb +11 -0
- data/lib/protk.rb +18 -0
- metadata +256 -0
@@ -0,0 +1,113 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file is part of protk
|
4
|
+
# Created by Ira Cooke 2/12/2011
|
5
|
+
#
|
6
|
+
# Repairs the msms_run_summary tag in a pepXML document to contain a specified file and datatype
|
7
|
+
# This tool should only be used on pepXML files that contain a single msms_run_summary (eg not interprophet results)
|
8
|
+
#
|
9
|
+
|
10
|
+
|
11
|
+
require 'protk/constants'
|
12
|
+
require 'protk/command_runner'
|
13
|
+
require 'protk/tool'
|
14
|
+
require 'libxml'
|
15
|
+
|
16
|
+
include LibXML
|
17
|
+
|
18
|
+
# Pathname is used below to derive the directory of the old base_name.
require 'pathname'

# Environment with global constants
#
genv=Constants.new

# Setup specific command-line options for this tool. Other options are inherited from Tool
#
tool=Tool.new()
tool.option_parser.banner = "Repair msms_run_summary tag in a pepXML file.\n\nUsage: repair_run_summary.rb [options] file1.pepXML"

tool.options.new_base_name=nil
tool.option_parser.on( '-N', '--base-name mzmlfile', 'Original MSMS spectrum file used for search' ) do |file|
  tool.options.new_base_name = file
end

tool.options.raw_data_type=nil
tool.option_parser.on( '-R', '--raw-type type', 'Raw data type used for search' ) do |type|
  tool.options.raw_data_type = type
end

tool.options.omssa_ion_tolerance=nil
tool.option_parser.on('--omssa-itol fitol','Add a fragment ion tolerance parameter to the omssa search summary') do |fitol|
  tool.options.omssa_ion_tolerance=fitol
end

tool.option_parser.parse!

pepxml_file=ARGV[0]
raise "No pepXML file was given" if pepxml_file.nil?

# Namespace declaration passed to every namespaced XPath query below
pepxml_ns='xmlns:http://regis-web.systemsbiology.net/pepXML'

# Read the input file
#
doc=XML::Parser.file(pepxml_file).parse

# Read the options through tool.options, the same object they were stored on above
new_base_name=tool.options.new_base_name
raw_data_type=tool.options.raw_data_type

genv.log("Repairing #{pepxml_file} to #{new_base_name} format #{raw_data_type}",:info)

if new_base_name.nil?
  # Try X!Tandem first
  # It would be parameter spectrum,path
  #
  spectrum_path = doc.find('//xmlns:msms_run_summary/xmlns:search_summary/xmlns:parameter[@name="spectrum, path"]',pepxml_ns)[0]
  unless spectrum_path.nil?
    new_base_name=spectrum_path.attributes['value']
    raw_data_type="mzML" # Always is for X!Tandem
  end
end

if new_base_name.nil?
  # Try Mascot
  # It would be parameter File path
  #
  #<parameter name="FILE" value="dataset_2.dat"/>
  file_path = doc.find('//xmlns:msms_run_summary/xmlns:search_summary/xmlns:parameter[@name="FILE"]',pepxml_ns)[0]
  unless file_path.nil?
    # Keep the directory of the existing base_name and swap in the file named by the FILE parameter
    old_base_name=doc.find('//xmlns:msms_run_summary',pepxml_ns)[0].attributes['base_name']
    base_dir_path=Pathname.new(old_base_name).dirname.to_s

    new_base_name="#{base_dir_path}/#{file_path.attributes['value']}"
    raw_data_type="mgf" # Always is for Mascot
  end
end

raise "Could not find original spectrum filename in pepXML and none provided" if new_base_name.nil?

run_summary=doc.find('//xmlns:msms_run_summary',pepxml_ns)
if run_summary[0].nil?
  # Try without namespace (OMSSA)
  run_summary=doc.find('//msms_run_summary')

  unless tool.options.omssa_ion_tolerance.nil?
    search_summary=doc.find('//search_summary')[0]
    raise "No search summary found" if search_summary.nil?

    # Record the fragment ion tolerance as an extra <parameter name="to"> element
    pmnode=XML::Node.new('parameter')
    pmnode["name"]="to"
    pmnode["value"]=tool.options.omssa_ion_tolerance.to_s
    search_summary << pmnode
  end

  # NOTE(review): this overrides any --raw-type given on the command line
  # whenever the document is not namespaced - confirm that is intended.
  raw_data_type="mgf"
end

raise "No run summary found" if run_summary[0].nil?

run_summary[0].attributes['base_name']=new_base_name
run_summary[0].attributes['raw_data']=raw_data_type

# The repaired document is written back over the input file
doc.save(pepxml_file)
|
@@ -0,0 +1,292 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file is part of protk
|
4
|
+
# Created by Ira Cooke 17/12/2010
|
5
|
+
#
|
6
|
+
# Runs an MS/MS search using the X!Tandem search engine
|
7
|
+
#
|
8
|
+
|
9
|
+
require 'protk/constants'
|
10
|
+
require 'protk/command_runner'
|
11
|
+
require 'protk/search_tool'
|
12
|
+
require 'protk/xtandem_defaults'
|
13
|
+
require 'libxml'
|
14
|
+
|
15
|
+
include LibXML
|
16
|
+
|
17
|
+
# Environment with global constants
#
genv=Constants.new

# Setup specific command-line options for this tool. Other options are inherited from SearchTool
#
search_tool=SearchTool.new({:msms_search=>true,:background=>true,:glyco=>true,:database=>true,:explicit_output=>true,:over_write=>true,:msms_search_detailed_options=>true})
search_tool.jobid_prefix="x"
search_tool.option_parser.banner = "Run an X!Tandem msms search on a set of mzML input files.\n\nUsage: tandem_search.rb [options] file1.mzML file2.mzML ..."
search_tool.options.output_suffix="_tandem"

# The bundled default parameter file is used unless -T names another one
tandem_defaults=XTandemDefaults.new.path
search_tool.options.tandem_params=tandem_defaults
search_tool.option_parser.on( '-T', '--tandem-params tandem', 'XTandem parameters to use' ) do |parms|
  search_tool.options.tandem_params = parms
end

search_tool.options.no_pepxml=false
search_tool.option_parser.on( '-P', '--no-pepxml', 'Dont convert to pepXML after running the search') do
  search_tool.options.no_pepxml=true
end

search_tool.options.keep_params_files=false
search_tool.option_parser.on( '-K', '--keep-params-files', 'Keep X!Tandem parameter files' ) do
  search_tool.options.keep_params_files = true
end

search_tool.option_parser.parse!


# Set search engine specific parameters on the SearchTool object
#
tandem_bin=genv.xtandem.to_s

# File.exist? replaces FileTest.exists?, which newer Ruby versions no longer provide
raise "Could not find X!Tandem executable" unless File.exist?(tandem_bin)

tandem_params=search_tool.tandem_params

# A database argument that names an existing file is used directly,
# otherwise it is resolved through the search tool.
if Pathname.new(search_tool.database).exist? # It's an explicitly named db
  current_db=Pathname.new(search_tool.database).realpath.to_s
else
  current_db=search_tool.current_database :fasta
end

# Parse options from a parameter file (if provided), or from the default parameter file
#
std_params=XML::Parser.file(tandem_params).parse

# Parse taxonomy template file
#
taxo_doc=XML::Parser.file(XTandemDefaults.new.taxonomy_path).parse
|
74
|
+
|
75
|
+
# Galaxy changes things like @ to __at__ we need to change it back
#
# Returns a new String with the tokens __at__, __oc__, __cc__, __ob__ and
# __cb__ replaced by @, {, }, [ and ] respectively. The replacements are
# applied one after another in that order. The argument itself is left
# untouched, so frozen strings are accepted.
#
def decode_modification_string(mstring)
  galaxy_tokens=[
    ["__at__","@"],
    ["__oc__","{"],
    ["__cc__","}"],
    ["__ob__","["],
    ["__cb__","]"]
  ]
  galaxy_tokens.inject(mstring) { |decoded,(token,char)| decoded.gsub(token,char) }
end
|
85
|
+
|
86
|
+
# Return the single <note type="input"> element carrying the given label.
#
# std_params  - parsed parameter document (must respond to find)
# label       - value of the note's label attribute
# description - wording used in the error message
#
# Raises RuntimeError unless exactly one matching note exists.
#
def required_tandem_note(std_params,label,description="#{label} note")
  notes=std_params.find("/bioml/note[@type=\"input\" and @label=\"#{label}\"]")
  raise "Exactly one #{description} is required in the parameter file" unless notes.length==1
  notes[0]
end

# Append one <note type="input"> element per custom modification to the root node.
#
# Modifications containing any of []() {} ! are labelled as motifs, all
# others as masses.
#
def append_custom_modification_notes(root_bioml_node,mods,id_kind,label_kind)
  mods.each_with_index do |mod,index|
    mod_type = ( mod=~/[\[\]\(\)\{\}\!]/ ) ? "#{label_kind} motif" : "#{label_kind} mass"

    # Every other parameter in the document is a <note> element (see the
    # lookups in generate_parameter_doc), so custom modifications are too.
    mnode=XML::Node.new('note')
    mnode["id"]="custom-#{id_kind}-mod-#{index+1}"
    mnode["type"]="input"
    mnode["label"]="residue, #{mod_type}"
    mnode.content=mod

    root_bioml_node << mnode
  end
end

# Fill in an X!Tandem parameter document with the settings for one search.
#
# std_params  - parsed parameter document; it is modified in place and returned
# output_path - written to the "output, path" note
# input_path  - written to the "spectrum, path" note
# taxo_path   - written to the "list path, taxonomy information" note
# current_db  - not used here; kept so existing callers keep working
# search_tool - supplies database, tolerances and modification settings
# genv        - supplies tpp_root, used to locate the scoring defaults file
#
# Raises RuntimeError when a required note is missing or duplicated.
#
# NOTE(review): each custom modification gets its own note, so several notes
# can share one label - confirm X!Tandem honours repeated labels.
#
def generate_parameter_doc(std_params,output_path,input_path,taxo_path,current_db,search_tool,genv)

  # Set the input and output paths
  #
  required_tandem_note(std_params,"spectrum, path").content=input_path
  required_tandem_note(std_params,"output, path").content=output_path

  # Set the path to the scoring algorithm default params. We use one from ISB
  #
  required_tandem_note(std_params,"list path, default parameters").content="#{genv.tpp_root}/bin/isb_default_input_kscore.xml"

  # Taxonomy and Database
  #
  required_tandem_note(std_params,"protein, taxon").content=search_tool.database.downcase
  required_tandem_note(std_params,"list path, taxonomy information").content=taxo_path

  # Fragment mass matching
  #
  required_tandem_note(std_params,"spectrum, fragment monoisotopic mass error").content=search_tool.fragment_tol.to_s

  # Precursor mass matching. Half of the requested tolerance is written to
  # each of the minus and plus notes.
  #
  precursor_tol=search_tool.precursor_tol
  required_tandem_note(std_params,"spectrum, parent monoisotopic mass error minus").content=(precursor_tol*0.5).to_s
  required_tandem_note(std_params,"spectrum, parent monoisotopic mass error plus").content=(precursor_tol*0.5).to_s

  pmass_err_units=std_params.find('/bioml/note[@type="input" and @label="spectrum, parent monoisotopic mass error units"]')
  raise "Exactly one spectrum, parent monoisotopic mass error units note is required in the parameter file. Got #{pmass_err_units.length}" unless pmass_err_units.length==1
  pmass_err_units[0].content=search_tool.precursor_tolu

  if search_tool.strict_monoisotopic_mass
    isotope_label="spectrum, parent monoisotopic mass isotope error"
    required_tandem_note(std_params,isotope_label,isotope_label).content="no"
  end

  # Fixed and Variable Modifications
  #
  # Built-in modification notes are dropped when the matching option is off.
  builtin_mods=[
    ["carbamidomethyl-fixed",search_tool.carbamidomethyl],
    ["glyco-variable",search_tool.glyco],
    ["methionine-oxidation-variable",search_tool.methionine_oxidation]
  ]
  builtin_mods.each do |note_id,enabled|
    next if enabled
    std_params.find("/bioml/note[@type=\"input\" and @id=\"#{note_id}\"]").each { |node| node.remove! }
  end

  # Custom modifications arrive as comma separated lists, possibly Galaxy encoded
  var_mods=search_tool.var_mods.split(",").collect { |mod| mod.strip }.reject { |mod| mod.empty? }
  var_mods=var_mods.collect { |mod| decode_modification_string(mod) }
  fix_mods=search_tool.fix_mods.split(",").collect { |mod| mod.strip }.reject { |mod| mod.empty? }
  fix_mods=fix_mods.collect { |mod| decode_modification_string(mod) }

  root_bioml_node=std_params.find('/bioml')[0]

  append_custom_modification_notes(root_bioml_node,var_mods,"variable","potential modification")
  append_custom_modification_notes(root_bioml_node,fix_mods,"fixed","modification")

  std_params

end
|
208
|
+
|
209
|
+
# Point the taxonomy template at the database used for this search.
#
# taxo_doc    - parsed taxonomy template; it is modified in place and returned
# current_db  - written to the URL attribute of the template's file element
# search_tool - its database name, lower-cased, becomes the taxon label
#
# Raises RuntimeError unless the template holds exactly one taxon element
# and exactly one file element inside it.
#
def generate_taxonomy_doc(taxo_doc,current_db,search_tool)

  taxon_nodes=taxo_doc.find('/bioml/taxon')
  raise "Exactly one taxon label is required in the taxonomy_template file" unless taxon_nodes.length==1
  taxon_nodes[0].attributes['label']=search_tool.database.downcase

  file_nodes=taxo_doc.find('/bioml/taxon/file')
  raise "Exactly one database file is required in the taxonomy_template file" unless file_nodes.length==1
  file_nodes[0].attributes['URL']=current_db

  taxo_doc
end
|
221
|
+
|
222
|
+
# Shellwords quotes the file paths that are interpolated into the shell command
require 'shellwords'

# Run the search engine on each input file
#
ARGV.each do |filename|

  input_name=filename.chomp
  input_path=Pathname.new(input_name).realpath.to_s
  output_base=search_tool.output_base_path(input_name)
  output_path="#{output_base}.tandem"

  if search_tool.explicit_output.nil?
    pepxml_path="#{output_base}.pep.xml"
  else
    pepxml_path=search_tool.explicit_output
  end

  # The file left behind by a finished run is the pepXML, or the raw
  # X!Tandem output when pepXML conversion is switched off.
  final_output = search_tool.no_pepxml ? output_path : pepxml_path
  output_exists=Pathname.new(final_output).exist?

  # Only proceed if the output file is not present or we have opted to over-write it
  #
  if ( search_tool.over_write || !output_exists )

    input_base=search_tool.input_base_path(input_name)

    # Create the taxonomy file in the same directory as the params file
    #
    taxo_path="#{input_base}.taxonomy.xml"
    mod_taxo_doc=generate_taxonomy_doc(taxo_doc,current_db,search_tool)
    mod_taxo_doc.save(taxo_path)

    # Modify the default XML document to contain search specific details and save it so it can be used in the search.
    # generate_parameter_doc edits the document it is given, so every input
    # file starts from a freshly parsed copy of the parameter file.
    #
    file_params=XML::Parser.file(tandem_params).parse
    mod_params=generate_parameter_doc(file_params,output_path,input_path,taxo_path,current_db,search_tool,genv)
    params_path="#{input_base}.tandem.params"
    mod_params.save(params_path)

    # The basic command
    #
    cmd= "#{tandem_bin} #{Shellwords.escape(params_path)}"

    # pepXML conversion and repair
    #
    # NOTE(review): the steps are joined with ';', so the exit status checked
    # below is that of the last step only - confirm whether '&&' is wanted.
    #
    unless search_tool.no_pepxml
      repair_script="#{File.dirname(__FILE__)}/repair_run_summary.rb"
      cmd << "; #{genv.tandem2xml} #{Shellwords.escape(output_path)} #{Shellwords.escape(pepxml_path)}"
      cmd << "; #{repair_script} #{Shellwords.escape(pepxml_path)}"
      cmd << "; rm #{Shellwords.escape(output_path)}"
    end

    # Add a cleanup command unless the user wants to keep params files
    #
    unless search_tool.keep_params_files
      cmd << "; rm #{Shellwords.escape(params_path)}; rm #{Shellwords.escape(taxo_path)}"
    end

    # In case the user specified background running we need to create a jobscript path
    #
    jobscript_path="#{output_path}.pbs.sh"

    # Run the search
    #
    job_params={
      :jobid => search_tool.jobid_from_filename(filename),
      :queue => "lowmem",
      :vmem => "900mb"
    }
    code = search_tool.run(cmd,genv,job_params,jobscript_path)
    raise "Command failed with exit code #{code}" unless code==0
  else
    genv.log("Skipping search on existing file #{final_output}",:warn)
  end

end
|
@@ -0,0 +1,144 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file is part of protk
|
4
|
+
# Created by Ira Cooke 14/12/2010
|
5
|
+
#
|
6
|
+
# Runs an MS/MS search using the MSGFPlus search engine
|
7
|
+
#
|
8
|
+
require 'pathname'
require 'protk/constants'
require 'protk/search_tool'

# Environment with global constants. It is needed below for the location of
# the search engine, for logging and for running the command.
#
genv=Constants.new

# Setup specific command-line options for this tool. Other options are inherited from SearchTool
#
search_tool=SearchTool.new({:msms_search=>true,:background=>false,:glyco=>true,:database=>true,:explicit_output=>true,:over_write=>true,:msms_search_detailed_options=>true})
search_tool.option_parser.banner = "Run an msms search on a set of msms spectrum input files.\n\nUsage: template_search.rb [options] file1.mzML file2.mzML ..."
search_tool.options.output_suffix="_template"

search_tool.options.custom_option="default"
search_tool.option_parser.on('--custom-opt value','Custom option relevant to this tool only (Default default)') do |val|
  search_tool.options.custom_option=val
end

search_tool.option_parser.parse!

# Set search engine specific parameters on the SearchTool object
#
msgf_bin="#{genv.msgf_bin}/MSGFPlus.jar"

# A database argument that names an existing file is used directly,
# otherwise it is resolved through the search tool.
if Pathname.new(search_tool.database).exist? # It's an explicitly named db
  current_db=Pathname.new(search_tool.database).realpath.to_s
else
  current_db=search_tool.current_database :fasta
end

raise "When --output is set only one file at a time can be run" if ( ARGV.length> 1 ) && ( search_tool.explicit_output!=nil )

# True when a modification option holds a real list. Galaxy sends "None"
# when nothing was selected, so that value counts as empty too.
custom_mods_given = lambda { |mods| mods !="" && mods !~ /None/ }

# Turn a comma separated option into a clean comma separated list
clean_mod_list = lambda { |mods| mods.split(",").collect { |mod| mod.strip }.reject { |mod| mod.empty? }.join(",") }

# Run the search engine on each input file
#
ARGV.each do |filename|

  input_name=filename.chomp

  if search_tool.explicit_output.nil?
    output_path="#{search_tool.output_base_path(input_name)}.mzid"
  else
    output_path=search_tool.explicit_output
  end

  # (*.mzML, *.mzXML, *.mgf, *.ms2, *.pkl or *_dta.txt)
  # Get the input file extension
  ext = Pathname.new(filename).extname

  input_path="#{search_tool.input_base_path(input_name)}#{ext}"

  # Only proceed if the output file is not present or we have opted to over-write it
  #
  if ( search_tool.over_write || !Pathname.new(output_path).exist? )

    # The basic command
    #
    cmd= "java -Xmx#{search_tool.java_mem} -jar #{msgf_bin} -d #{current_db} -s #{input_path} -o #{output_path} "

    #Missed cleavages
    #
    raise "Maximum value for missed cleavages is 2" if ( search_tool.missed_cleavages > 2)
    cmd << " -ntt #{search_tool.missed_cleavages}"

    # Precursor tolerance
    #
    cmd << " -t #{search_tool.precursor_tol}#{search_tool.precursor_tolu}"

    # Instrument type
    #
    cmd << " -inst 2"

    # cmd << " -m 4"

    cmd << " -addFeatures 1"

    # Enzyme
    #
    # if ( search_tool.enzyme!="Trypsin")
    #   cmd << " -e #{search_tool.enzyme}"
    # end

    # NOTE(review): this file is created empty and is not referenced by the
    # command below; it is kept for compatibility and closed straight away.
    mods_path="#{search_tool.input_base_path(input_name)}.msgfplus_mods.txt"
    File.open(mods_path,'w+') { }

    # Variable Modifications
    #
    if custom_mods_given.call(search_tool.var_mods) # Checking for none is to cope with galaxy input
      var_mods = clean_mod_list.call(search_tool.var_mods)
      cmd << " -mv #{var_mods}" if var_mods !=""
    else
      # Add options related to peptide modifications
      #
      cmd << " -mv 119 " if search_tool.glyco
    end

    # Fixed modifications
    #
    if custom_mods_given.call(search_tool.fix_mods)
      fix_mods = clean_mod_list.call(search_tool.fix_mods)
      cmd << " -mf #{fix_mods}" if fix_mods !=""
    elsif search_tool.has_modifications
      cmd << " -mf "
      cmd << "3 " if search_tool.carbamidomethyl
      cmd << "1 " if search_tool.methionine_oxidation
    end

    # Up to here we've formulated the search command. The rest is running it
    p "Running:#{cmd}"

    # Run the search
    #
    job_params={
      :jobid => search_tool.jobid_from_filename(filename),
      :queue => "lowmem",
      :vmem => "900mb"
    }
    search_tool.run(cmd,genv,job_params)

  else
    genv.log("Skipping search on existing file #{output_path}",:warn)
  end

end
|
@@ -0,0 +1,118 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file is part of MSLIMS
|
4
|
+
# Created by Ira Cooke 12/4/2010
|
5
|
+
#
|
6
|
+
# Reads a unimod xml file (eg from a Mascot installation) and produces a loc file with names of allowable chemical modifications
|
7
|
+
#
|
8
|
+
|
9
|
+
require 'libxml'
|
10
|
+
|
11
|
+
include LibXML
|
12
|
+
|
13
|
+
# The unimod xml file to read is the first command-line argument
unimod_doc=XML::Parser.file(ARGV[0]).parse

# Display names of every modification found, e.g. "Oxidation (M)"
all_mods=[]

unimod_doc.find('//umod:unimod/umod:modifications/umod:mod').each do |mod|

  title=mod.attributes['title']

  case title
  # Special Cases: these titles always produce the same fixed entries
  #
  when "Oxidation"
    all_mods.push("Oxidation (HW)","Oxidation (M)")
  when "Phospho"
    all_mods.push("Phospho (ST)","Phospho (Y)")
  when "Sulfo"
    all_mods.push("Sulfo (S)","Sulfo (T)","Sulfo (Y)")
  else

    anywhere_sites = mod.find('./umod:specificity[@hidden="0" and @position="Anywhere"]')

    if ( title =~ /^iTRAQ/ || title =~ /^mTRAQ/ )
      # iTRAQ and mTRAQ get one entry per site
      anywhere_sites.each do |site_node|
        all_mods.push("#{title} (#{site_node.attributes['site']})")
      end
    elsif ( anywhere_sites.length>0 )
      # Deal with the anywhere sites which can be concatenated
      #
      site_names=[]
      anywhere_sites.each { |site_node| site_names.push("#{site_node.attributes['site']}") }
      all_mods.push("#{title} (#{site_names.sort.join})")
    end

    # Every visible specificity tied to a particular position gets its own entry
    mod.find('./umod:specificity[@hidden="0" and @position!="Anywhere"]').each do |specific_mod|

      position=specific_mod.attributes['position']
      site=specific_mod.attributes['site']

      # The later rules take priority over the earlier ones
      specificity=site
      specificity=position if position =~ /^Protein/
      specificity="N-term #{site}" if (position =~ /Any N-term/) && (site =~ /^[CQEM]$/)
      specificity="C-term #{site}" if (position =~ /Any C-term/) && (site =~ /^[M]$/)

      all_mods.push("#{title} (#{specificity})")
    end

  end

end

# Case-insensitive ordering of the display names
all_mods=all_mods.sort { |left,right| left.downcase <=> right.downcase }

File.open("mascot_mods.loc",'w') do |loc_output|

  loc_output << "#This file lists the names of chemical modifications acceptable for proteomics search engines\n"
  loc_output << "#\n"
  loc_output << "#So, unimod_names.loc could look something like this:\n"
  loc_output << "#\n"

  all_mods.each do |display_name|
    # The key is the lower-cased name without spaces and with ( ) : -> each turned into _
    key=["(",")",":","->"].inject(display_name.downcase.gsub(" ","")) { |partial,token| partial.gsub(token,"_") }
    loc_output << "#{display_name}\t#{key}\t#{display_name}\t#{key}\n"
  end

end
|
117
|
+
|
118
|
+
|