protk 1.1.0.pre

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. data/README.md +85 -0
  2. data/bin/annotate_ids.rb +59 -0
  3. data/bin/big_search.rb +41 -0
  4. data/bin/correct_omssa_retention_times.rb +27 -0
  5. data/bin/feature_finder.rb +76 -0
  6. data/bin/file_convert.rb +157 -0
  7. data/bin/generate_omssa_loc.rb +42 -0
  8. data/bin/interprophet.rb +91 -0
  9. data/bin/make_decoy.rb +64 -0
  10. data/bin/manage_db.rb +123 -0
  11. data/bin/mascot_search.rb +187 -0
  12. data/bin/mascot_to_pepxml.rb +44 -0
  13. data/bin/msgfplus_search.rb +191 -0
  14. data/bin/omssa_search.rb +205 -0
  15. data/bin/peptide_prophet.rb +245 -0
  16. data/bin/pepxml_to_table.rb +78 -0
  17. data/bin/protein_prophet.rb +140 -0
  18. data/bin/protk_setup.rb +31 -0
  19. data/bin/repair_run_summary.rb +113 -0
  20. data/bin/tandem_search.rb +292 -0
  21. data/bin/template_search.rb +144 -0
  22. data/bin/unimod_to_loc.rb +118 -0
  23. data/bin/xls_to_table.rb +46 -0
  24. data/ext/protk/extconf.rb +3 -0
  25. data/ext/protk/protk.c +235 -0
  26. data/lib/protk/big_search_rakefile.rake +16 -0
  27. data/lib/protk/big_search_tool.rb +23 -0
  28. data/lib/protk/bio_sptr_extensions.rb +210 -0
  29. data/lib/protk/biotools_excel_converter.rb +60 -0
  30. data/lib/protk/command_runner.rb +84 -0
  31. data/lib/protk/constants.rb +296 -0
  32. data/lib/protk/data/FeatureFinderCentroided.ini +63 -0
  33. data/lib/protk/data/apt-get_packages.yaml +47 -0
  34. data/lib/protk/data/brew_packages.yaml +10 -0
  35. data/lib/protk/data/default_config.yml +20 -0
  36. data/lib/protk/data/predefined_db.crap.yaml +19 -0
  37. data/lib/protk/data/predefined_db.sphuman.yaml +25 -0
  38. data/lib/protk/data/predefined_db.swissprot_annotation.yaml +20 -0
  39. data/lib/protk/data/predefined_db.swissprot_fasta_annotation.yaml +20 -0
  40. data/lib/protk/data/tandem_params.xml +56 -0
  41. data/lib/protk/data/taxonomy_template.xml +9 -0
  42. data/lib/protk/data/unimod.xml +16780 -0
  43. data/lib/protk/eupathdb_gene_information_table.rb +158 -0
  44. data/lib/protk/galaxy_stager.rb +24 -0
  45. data/lib/protk/galaxy_util.rb +9 -0
  46. data/lib/protk/manage_db_rakefile.rake +484 -0
  47. data/lib/protk/manage_db_tool.rb +181 -0
  48. data/lib/protk/mascot_util.rb +63 -0
  49. data/lib/protk/omssa_util.rb +57 -0
  50. data/lib/protk/plasmodb.rb +50 -0
  51. data/lib/protk/prophet_tool.rb +85 -0
  52. data/lib/protk/protein_annotator.rb +646 -0
  53. data/lib/protk/protxml.rb +137 -0
  54. data/lib/protk/randomize.rb +7 -0
  55. data/lib/protk/search_tool.rb +182 -0
  56. data/lib/protk/setup_rakefile.rake +245 -0
  57. data/lib/protk/setup_tool.rb +19 -0
  58. data/lib/protk/spreadsheet_extensions.rb +78 -0
  59. data/lib/protk/swissprot_database.rb +38 -0
  60. data/lib/protk/tool.rb +182 -0
  61. data/lib/protk/xtandem_defaults.rb +11 -0
  62. data/lib/protk.rb +18 -0
  63. metadata +256 -0
@@ -0,0 +1,113 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file is part of protk
4
+ # Created by Ira Cooke 2/12/2011
5
+ #
6
+ # Repairs the msms_run_summary tag in a pepXML document to contain a specified file and datatype
7
+ # This tool should only be used on pepXML files that contain a single msms_run_summary (eg not interprophet results)
8
+ #
9
+
10
+
11
+ require 'protk/constants'
12
+ require 'protk/command_runner'
13
+ require 'protk/tool'
14
+ require 'libxml'
15
+
16
+ include LibXML
17
+
18
+ # Environment with global constants
19
+ #
20
require 'pathname'

# Environment with global constants
#
genv = Constants.new

# Setup specific command-line options for this tool. Other options are inherited from Tool
#
tool = Tool.new()
tool.option_parser.banner = "Repair msms_run_summary tag in a pepXML file.\n\nUsage: repair_run_summary.rb [options] file1.pepXML"

tool.options.new_base_name = nil
tool.option_parser.on('-N', '--base-name mzmlfile', 'Original MSMS spectrum file used for search') do |file|
  tool.options.new_base_name = file
end

tool.options.raw_data_type = nil
tool.option_parser.on('-R', '--raw-type type', 'Raw data type used for search') do |type|
  tool.options.raw_data_type = type
end

tool.options.omssa_ion_tolerance = nil
tool.option_parser.on('--omssa-itol fitol', 'Add a fragment ion tolerance parameter to the omssa search summary') do |fitol|
  tool.options.omssa_ion_tolerance = fitol
end

tool.option_parser.parse!

# The first remaining argument is the pepXML file, which is rewritten in place
#
pepxml_file = ARGV[0]
raise "No pepXML file provided" if pepxml_file.nil?

# Read the input file
#
doc = XML::Parser.file(pepxml_file).parse

new_base_name = tool.new_base_name
raw_data_type = tool.raw_data_type

genv.log("Repairing #{pepxml_file} to #{new_base_name} format #{raw_data_type}",:info)

# Namespace binding used for every namespaced XPath query below
pepxml_ns = 'xmlns:http://regis-web.systemsbiology.net/pepXML'

if new_base_name.nil?
  # Try X!Tandem first
  # It would be parameter spectrum,path
  #
  spectrum_path = doc.find('//xmlns:msms_run_summary/xmlns:search_summary/xmlns:parameter[@name="spectrum, path"]', pepxml_ns)[0]
  if spectrum_path
    new_base_name = spectrum_path.attributes['value']
    raw_data_type = "mzML" # Always is for X!Tandem
  end
end

if new_base_name.nil?
  # Try Mascot
  # It would be parameter File path, eg
  #
  #<parameter name="FILE" value="dataset_2.dat"/>
  file_path = doc.find('//xmlns:msms_run_summary/xmlns:search_summary/xmlns:parameter[@name="FILE"]', pepxml_ns)[0]
  if file_path
    # Keep the directory of the existing base_name and swap in the file named by the FILE parameter
    mascot_run_summary = doc.find('//xmlns:msms_run_summary', pepxml_ns)[0]
    old_base_name = mascot_run_summary.attributes['base_name']
    base_dir_path = Pathname.new(old_base_name).dirname.to_s

    new_base_name = "#{base_dir_path}/#{file_path.attributes['value']}"
    raw_data_type = "mgf" # Always is for Mascot
  end
end

# raise (not throw) so the failure surfaces as an ordinary error carrying this message
raise "Could not find original spectrum filename in pepXML and none provided" if new_base_name.nil?

run_summary = doc.find('//xmlns:msms_run_summary', pepxml_ns)
if run_summary[0].nil?
  # Try without namespace (OMSSA)
  run_summary = doc.find('//msms_run_summary')

  unless tool.options.omssa_ion_tolerance.nil?
    search_summary = doc.find('//search_summary')[0]
    raise "No search summary found" if search_summary.nil?

    # Record the fragment ion tolerance as an extra <parameter name="to"> element
    pmnode = XML::Node.new('parameter')
    pmnode["name"] = "to"
    pmnode["value"] = tool.options.omssa_ion_tolerance.to_s
    search_summary << pmnode
  end

  # NOTE(review): this overrides any --raw-type given on the command line
  # whenever the namespaced lookup fails - confirm that is intended.
  raw_data_type = "mgf"
end

raise "No run summary found" if run_summary[0].nil?

run_summary[0].attributes['base_name'] = new_base_name
run_summary[0].attributes['raw_data'] = raw_data_type

# Overwrite the input file with the repaired document
doc.save(pepxml_file)
@@ -0,0 +1,292 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file is part of protk
4
+ # Created by Ira Cooke 17/12/2010
5
+ #
6
+ # Runs an MS/MS search using the X!Tandem search engine
7
+ #
8
+
9
+ require 'protk/constants'
10
+ require 'protk/command_runner'
11
+ require 'protk/search_tool'
12
+ require 'protk/xtandem_defaults'
13
+ require 'libxml'
14
+
15
+ include LibXML
16
+
17
require 'pathname'

# Environment with global constants
#
genv = Constants.new

# Setup specific command-line options for this tool. Other options are inherited from SearchTool
#
search_tool = SearchTool.new({:msms_search=>true,:background=>true,:glyco=>true,:database=>true,:explicit_output=>true,:over_write=>true,:msms_search_detailed_options=>true})
search_tool.jobid_prefix = "x"
search_tool.option_parser.banner = "Run an X!Tandem msms search on a set of mzML input files.\n\nUsage: tandem_search.rb [options] file1.mzML file2.mzML ..."
search_tool.options.output_suffix = "_tandem"

# Default to the parameter file shipped with protk; -T overrides it
xtandem_defaults = XTandemDefaults.new
search_tool.options.tandem_params = xtandem_defaults.path
search_tool.option_parser.on('-T', '--tandem-params tandem', 'XTandem parameters to use') do |parms|
  search_tool.options.tandem_params = parms
end

search_tool.options.no_pepxml = false
search_tool.option_parser.on('-P', '--no-pepxml', 'Dont convert to pepXML after running the search') do
  search_tool.options.no_pepxml = true
end

search_tool.options.keep_params_files = false
search_tool.option_parser.on('-K', '--keep-params-files', 'Keep X!Tandem parameter files') do
  search_tool.options.keep_params_files = true
end

search_tool.option_parser.parse!

# Set search engine specific parameters on the SearchTool object
#
tandem_bin = "#{genv.xtandem}"

# File.exist? replaces FileTest.exists?, which is no longer available in current Ruby releases
raise "Could not find X!Tandem executable" unless File.exist?(tandem_bin)

tandem_params = search_tool.tandem_params

# A database argument that names an existing file is used directly;
# anything else is resolved through the SearchTool as a fasta database
database_path = Pathname.new(search_tool.database)
current_db = if database_path.exist?
  database_path.realpath.to_s
else
  search_tool.current_database :fasta
end

# Parse options from a parameter file (if provided), or from the default parameter file
#
std_params = XML::Parser.file(tandem_params).parse

# Parse taxonomy template file
#
taxo_doc = XML::Parser.file(xtandem_defaults.taxonomy_path).parse
74
+
75
+ # Galaxy changes things like @ to __at__ we need to change it back
76
+ #
77
# Returns a copy of +mstring+ with the escape tokens __at__, __oc__, __cc__,
# __ob__ and __cb__ replaced by @, {, }, [ and ] respectively.
#
# The argument itself is left untouched, so frozen strings are accepted and
# the caller's value is never altered. Tokens are substituted one after the
# other in the order listed below.
def decode_modification_string(mstring)
  [
    ["__at__", "@"],
    ["__oc__", "{"],
    ["__cc__", "}"],
    ["__ob__", "["],
    ["__cb__", "]"]
  ].inject(mstring) { |decoded, (token, char)| decoded.gsub(token, char) }
end
85
+
86
# Returns the single <note type="input"> element of +std_params+ whose label
# attribute equals +label+. Raises a RuntimeError carrying +message+ when the
# document holds zero or more than one such note.
def required_tandem_note(std_params, label, message = "Exactly one #{label} note is required in the parameter file")
  notes = std_params.find("/bioml/note[@type=\"input\" and @label=\"#{label}\"]")
  raise message unless notes.length == 1
  notes[0]
end

# Appends one <note type="input"> element to +root+ for each entry of +mods+.
# Ids are "<id_prefix>-1", "<id_prefix>-2", ... A modification containing any
# of [ ] ( ) { } ! is labelled with +motif_label+, all others with +mass_label+.
def append_custom_mod_notes(root, mods, id_prefix, mass_label, motif_label)
  mods.each_with_index do |mod, index|
    mod_type = mod =~ /[\[\]\(\)\{\}\!]/ ? motif_label : mass_label

    # The element must be named "note": every lookup in this file queries
    # /bioml/note, and that is the element the parameter file is built from.
    mnode = XML::Node.new('note')
    mnode["id"] = "#{id_prefix}-#{index + 1}"
    mnode["type"] = "input"
    mnode["label"] = "residue, #{mod_type}"
    mnode.content = mod

    root << mnode
  end
end

# Fills in the search specific values of an X!Tandem parameter document.
#
# std_params  - parsed parameter document; it is modified in place and returned
# output_path - value written to the "output, path" note
# input_path  - value written to the "spectrum, path" note
# taxo_path   - value written to the "list path, taxonomy information" note
# current_db  - accepted for interface compatibility; not read here
# search_tool - supplies database, tolerances and modification settings
# genv        - supplies tpp_root, used to locate isb_default_input_kscore.xml
#
# Raises a RuntimeError when a required note is missing or duplicated.
#
# NOTE(review): because std_params is modified in place, calling this more
# than once on the same document appends the custom modification notes again
# and cannot restore notes removed by an earlier call - confirm with callers.
def generate_parameter_doc(std_params,output_path,input_path,taxo_path,current_db,search_tool,genv)

  # Set the input and output paths
  #
  required_tandem_note(std_params, "spectrum, path").content = input_path
  required_tandem_note(std_params, "output, path").content = output_path

  # Set the path to the scoring algorithm default params. We use one from ISB
  #
  required_tandem_note(std_params, "list path, default parameters").content = "#{genv.tpp_root}/bin/isb_default_input_kscore.xml"

  # Taxonomy and Database
  #
  required_tandem_note(std_params, "protein, taxon").content = search_tool.database.downcase
  required_tandem_note(std_params, "list path, taxonomy information").content = taxo_path

  # Fragment mass matching
  #
  required_tandem_note(std_params, "spectrum, fragment monoisotopic mass error").content = search_tool.fragment_tol.to_s

  # Precursor mass matching. Half of the requested tolerance is written to
  # each of the minus and plus notes
  #
  half_precursor_tol = search_tool.precursor_tol * 0.5
  required_tandem_note(std_params, "spectrum, parent monoisotopic mass error minus").content = half_precursor_tol.to_s
  required_tandem_note(std_params, "spectrum, parent monoisotopic mass error plus").content = half_precursor_tol.to_s

  pmass_err_units = std_params.find('/bioml/note[@type="input" and @label="spectrum, parent monoisotopic mass error units"]')
  raise "Exactly one spectrum, parent monoisotopic mass error units note is required in the parameter file. Got #{pmass_err_units.length}" unless pmass_err_units.length == 1
  pmass_err_units[0].content = search_tool.precursor_tolu

  if search_tool.strict_monoisotopic_mass
    isotopic_error = required_tandem_note(
      std_params,
      "spectrum, parent monoisotopic mass isotope error",
      "Exactly one spectrum, parent monoisotopic mass isotope error is required in the parameter file"
    )
    isotopic_error.content = "no"
  end

  # Fixed and Variable Modifications
  #
  # Each predefined modification note is removed unless its option is switched on
  [
    [search_tool.carbamidomethyl, "carbamidomethyl-fixed"],
    [search_tool.glyco, "glyco-variable"],
    [search_tool.methionine_oxidation, "methionine-oxidation-variable"]
  ].each do |enabled, note_id|
    next if enabled
    std_params.find("/bioml/note[@type=\"input\" and @id=\"#{note_id}\"]").each { |node| node.remove! }
  end

  # Custom modifications arrive as comma separated lists; blank entries are dropped
  var_mods = search_tool.var_mods.split(",").map { |mod| mod.strip }.reject { |mod| mod.empty? }
  var_mods = var_mods.map { |mod| decode_modification_string(mod) }
  fix_mods = search_tool.fix_mods.split(",").map { |mod| mod.strip }.reject { |mod| mod.empty? }
  fix_mods = fix_mods.map { |mod| decode_modification_string(mod) }

  root_bioml_node = std_params.find('/bioml')[0]

  append_custom_mod_notes(root_bioml_node, var_mods, "custom-variable-mod",
                          "potential modification mass", "potential modification motif")
  append_custom_mod_notes(root_bioml_node, fix_mods, "custom-fixed-mod",
                          "modification mass", "modification motif")

  std_params
end
208
+
209
# Points the taxonomy template at the database being searched.
#
# taxo_doc    - parsed taxonomy template; it is modified in place and returned
# current_db  - written to the URL attribute of the single /bioml/taxon/file element
# search_tool - its database name, downcased, becomes the label attribute of
#               the single /bioml/taxon element
#
# Raises a RuntimeError unless the template holds exactly one taxon element
# and exactly one file element beneath it.
def generate_taxonomy_doc(taxo_doc,current_db,search_tool)
  taxon_label = taxo_doc.find('/bioml/taxon')
  raise "Exactly one taxon label is required in the taxonomy_template file" unless taxon_label.length == 1
  taxon_label[0].attributes['label'] = search_tool.database.downcase

  db_file = taxo_doc.find('/bioml/taxon/file')
  raise "Exactly one database file is required in the taxonomy_template file" unless db_file.length == 1
  db_file[0].attributes['URL'] = current_db

  taxo_doc
end
221
+
222
+ # Run the search engine on each input file
223
+ #
224
# An explicit output names a single file, so several inputs would all be
# written to (or skipped because of) the same pepXML path
raise "When --output is set only one file at a time can be run" if ARGV.length > 1 && !search_tool.explicit_output.nil?

ARGV.each do |filename|
  spectrum_file = filename.chomp

  input_path = Pathname.new(spectrum_file).realpath.to_s
  output_base = search_tool.output_base_path(spectrum_file)
  output_path = "#{output_base}.tandem"

  # Without an explicit output the pepXML file sits next to the raw tandem output
  pepxml_path = search_tool.explicit_output.nil? ? "#{output_base}.pep.xml" : search_tool.explicit_output

  # The file that counts as "the output" depends on whether pepXML conversion is requested
  final_output = search_tool.no_pepxml ? output_path : pepxml_path
  output_exists = Pathname.new(final_output).exist?

  # Only proceed if the output file is not present or we have opted to over-write it
  #
  if search_tool.over_write || !output_exists
    input_base = search_tool.input_base_path(spectrum_file)

    # Create the taxonomy file in the same directory as the params file
    #
    taxo_path = "#{input_base}.taxonomy.xml"
    mod_taxo_doc = generate_taxonomy_doc(taxo_doc,current_db,search_tool)
    mod_taxo_doc.save(taxo_path)

    # Modify the default XML document to contain search specific details and save it so it can be used in the search
    #
    mod_params = generate_parameter_doc(std_params,output_path,input_path,taxo_path,current_db,search_tool,genv)
    params_path = "#{input_base}.tandem.params"
    mod_params.save(params_path)

    # The basic command
    #
    # NOTE(review): paths are interpolated into the command unescaped, so a
    # path containing spaces or shell metacharacters will break it.
    cmd = "#{tandem_bin} #{params_path}"

    # pepXML conversion and repair
    #
    unless search_tool.no_pepxml
      repair_script = "#{File.dirname(__FILE__)}/repair_run_summary.rb"
      cmd << "; #{genv.tandem2xml} #{output_path} #{pepxml_path}; #{repair_script} #{pepxml_path}; rm #{output_path}"
    end

    # Add a cleanup command unless the user wants to keep params files
    #
    unless search_tool.keep_params_files
      cmd << "; rm #{params_path}; rm #{taxo_path}"
    end

    # In case the user specified background running we need to create a jobscript path
    #
    jobscript_path = "#{output_path}.pbs.sh"

    # Run the search
    #
    job_params = {:jobid => search_tool.jobid_from_filename(filename)}
    job_params[:queue] = "lowmem"
    job_params[:vmem] = "900mb"
    code = search_tool.run(cmd,genv,job_params,jobscript_path)
    raise "Command failed with exit code #{code}" unless code == 0
  else
    genv.log("Skipping search on existing file #{output_path}",:warn)
  end
end
@@ -0,0 +1,144 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file is part of protk
4
+ # Created by Ira Cooke 14/12/2010
5
+ #
6
+ # Runs an MS/MS search using the MSGFPlus search engine
7
+ #
8
require 'pathname'
require 'protk/constants'
require 'protk/search_tool'

# Environment with global constants. It is needed below for the location of
# the MSGFPlus jar, for logging and for running the command
#
genv = Constants.new

# Setup specific command-line options for this tool. Other options are inherited from SearchTool
#
search_tool = SearchTool.new({:msms_search=>true,:background=>false,:glyco=>true,:database=>true,:explicit_output=>true,:over_write=>true,:msms_search_detailed_options=>true})
search_tool.option_parser.banner = "Run an msms search on a set of msms spectrum input files.\n\nUsage: template_search.rb [options] file1.mzML file2.mzML ..."
search_tool.options.output_suffix = "_template"

search_tool.options.custom_option = "default"
search_tool.option_parser.on('--custom-opt value','Custom option relevant to this tool only (Default default)') do |val|
  search_tool.options.custom_option = val
end

search_tool.option_parser.parse!

# Set search engine specific parameters on the SearchTool object
#
msgf_bin = "#{genv.msgf_bin}/MSGFPlus.jar"

# A database argument that names an existing file is used directly;
# anything else is resolved through the SearchTool as a fasta database
database_path = Pathname.new(search_tool.database)
current_db = if database_path.exist?
  database_path.realpath.to_s
else
  search_tool.current_database :fasta
end

raise "When --output is set only one file at a time can be run" if ARGV.length > 1 && !search_tool.explicit_output.nil?

# Run the search engine on each input file
#
ARGV.each do |filename|
  spectrum_file = filename.chomp

  output_path = if search_tool.explicit_output.nil?
    "#{search_tool.output_base_path(spectrum_file)}.mzid"
  else
    search_tool.explicit_output
  end

  # (*.mzML, *.mzXML, *.mgf, *.ms2, *.pkl or *_dta.txt)
  # Get the input file extension
  ext = Pathname.new(filename).extname
  input_path = "#{search_tool.input_base_path(spectrum_file)}#{ext}"

  # Only proceed if the output file is not present or we have opted to over-write it
  #
  if search_tool.over_write || !Pathname.new(output_path).exist?

    # The basic command
    #
    cmd = "java -Xmx#{search_tool.java_mem} -jar #{msgf_bin} -d #{current_db} -s #{input_path} -o #{output_path} "

    # Missed cleavages
    #
    raise "Maximum value for missed cleavages is 2" if search_tool.missed_cleavages > 2
    cmd << " -ntt #{search_tool.missed_cleavages}"

    # Precursor tolerance
    #
    cmd << " -t #{search_tool.precursor_tol}#{search_tool.precursor_tolu}"

    # Instrument type
    #
    cmd << " -inst 2"

    # cmd << " -m 4"

    cmd << " -addFeatures 1"

    # Enzyme
    #
    # if ( search_tool.enzyme!="Trypsin")
    #   cmd << " -e #{search_tool.enzyme}"
    # end

    # Variable Modifications
    #
    # Checking for None is to cope with galaxy input. `!~` is required here:
    # `!value =~ /None/` negates the string first and never matches.
    if search_tool.var_mods != "" && search_tool.var_mods !~ /None/
      var_mods = search_tool.var_mods.split(",").map { |mod| mod.strip }.reject { |mod| mod.empty? }.join(",")
      cmd << " -mv #{var_mods}" unless var_mods.empty?
    elsif search_tool.glyco
      # Add options related to peptide modifications
      #
      cmd << " -mv 119 "
    end

    # Fixed modifications
    #
    if search_tool.fix_mods != "" && search_tool.fix_mods !~ /None/
      fix_mods = search_tool.fix_mods.split(",").map { |mod| mod.strip }.reject { |mod| mod.empty? }.join(",")
      cmd << " -mf #{fix_mods}" unless fix_mods.empty?
    elsif search_tool.has_modifications
      default_fix_mods = []
      default_fix_mods << "3" if search_tool.carbamidomethyl
      default_fix_mods << "1" if search_tool.methionine_oxidation

      # Only emit -mf when there is a value to follow it
      cmd << " -mf #{default_fix_mods.join(' ')} " unless default_fix_mods.empty?
    end

    # Up to here we've formulated the search command
    p "Running:#{cmd}"

    # Run the search
    #
    job_params = {:jobid => search_tool.jobid_from_filename(filename)}
    job_params[:queue] = "lowmem"
    job_params[:vmem] = "900mb"
    code = search_tool.run(cmd,genv,job_params)
    raise "Command failed with exit code #{code}" unless code == 0
  else
    genv.log("Skipping search on existing file #{output_path}",:warn)
  end
end
@@ -0,0 +1,118 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file is part of MSLIMS
4
+ # Created by Ira Cooke 12/4/2010
5
+ #
6
+ # Reads a unimod xml file (eg from a Mascot installation) and produces a loc file with names of allowable chemical modifications
7
+ #
8
+
9
+ require 'libxml'
10
+
11
+ include LibXML
12
+
13
# The single argument is the path of the unimod xml file to read
unimod_path = ARGV[0]
abort "Usage: unimod_to_loc.rb unimod.xml" if unimod_path.nil?

unimod_doc = XML::Parser.file(unimod_path).parse

# Modifications whose entries are listed explicitly instead of being derived
# from the specificity elements of the unimod file
special_cases = {
  "Oxidation" => ["Oxidation (HW)", "Oxidation (M)"],
  "Phospho" => ["Phospho (ST)", "Phospho (Y)"],
  "Sulfo" => ["Sulfo (S)", "Sulfo (T)", "Sulfo (Y)"]
}

all_mods = []

unimod_doc.find('//umod:unimod/umod:modifications/umod:mod').each do |mod|
  title = mod.attributes['title']

  if special_cases.key?(title)
    all_mods.concat(special_cases[title])
    next
  end

  # Sites of the visible "Anywhere" specificities
  sites = []
  mod.find('./umod:specificity[@hidden="0" and @position="Anywhere"]').each do |site_node|
    sites.push("#{site_node.attributes['site']}")
  end

  if title !~ /^iTRAQ/ && title !~ /^mTRAQ/
    # Anywhere sites are concatenated into one entry, eg "Title (KR)"
    all_mods.push("#{title} (#{sites.sort.join})") unless sites.empty?
  else
    # iTRAQ and mTRAQ get one entry per site
    sites.each { |site| all_mods.push("#{title} (#{site})") }
  end

  # Every visible position-specific specificity gets its own entry
  mod.find('./umod:specificity[@hidden="0" and @position!="Anywhere"]').each do |specific_mod|
    position = specific_mod.attributes['position']
    site = specific_mod.attributes['site']

    specificity = site
    specificity = position if position =~ /^Protein/
    specificity = "N-term #{site}" if position =~ /Any N-term/ && site =~ /^[CQEM]$/
    specificity = "C-term #{site}" if position =~ /Any C-term/ && site =~ /^[M]$/

    all_mods.push("#{title} (#{specificity})")
  end
end

# Case-insensitive ordering
all_mods = all_mods.sort { |a, b| a.downcase <=> b.downcase }

# The block form closes the file even if writing fails part way through
File.open("mascot_mods.loc", 'w') do |loc_output|
  loc_output << "#This file lists the names of chemical modifications acceptable for proteomics search engines\n"
  loc_output << "#\n"
  loc_output << "#So, unimod_names.loc could look something like this:\n"
  loc_output << "#\n"

  all_mods.each do |am|
    # Key: lower case, spaces removed, and ( ) : -> each replaced by an underscore
    key = am.downcase.delete(" ").tr("():", "_").gsub("->", "_")
    loc_output << "#{am}\t#{key}\t#{am}\t#{key}\n"
  end
end
117
+
118
+