protk 1.2.6.pre5 → 1.3.0.pre1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. checksums.yaml +4 -4
  2. data/README.md +84 -45
  3. data/bin/add_retention_times.rb +9 -5
  4. data/bin/augustus_to_proteindb.rb +7 -11
  5. data/bin/interprophet.rb +28 -46
  6. data/bin/make_decoy.rb +16 -48
  7. data/bin/mascot_search.rb +57 -71
  8. data/bin/mascot_to_pepxml.rb +13 -26
  9. data/bin/msgfplus_search.rb +70 -107
  10. data/bin/omssa_search.rb +52 -109
  11. data/bin/peptide_prophet.rb +44 -119
  12. data/bin/pepxml_to_table.rb +24 -27
  13. data/bin/protein_prophet.rb +22 -82
  14. data/bin/protxml_to_gff.rb +22 -519
  15. data/bin/protxml_to_table.rb +2 -16
  16. data/bin/sixframe.rb +10 -32
  17. data/bin/tandem_search.rb +30 -403
  18. data/bin/tandem_to_pepxml.rb +43 -0
  19. data/bin/unimod_to_loc.rb +1 -1
  20. data/ext/{protk/decoymaker → decoymaker}/decoymaker.c +74 -21
  21. data/ext/decoymaker/extconf.rb +3 -0
  22. data/lib/protk/constants.rb +16 -2
  23. data/lib/protk/data/default_config.yml +2 -1
  24. data/lib/protk/data/tandem_gpm_defaults.xml +175 -0
  25. data/lib/protk/data/tandem_isb_kscore_defaults.xml +123 -0
  26. data/lib/protk/data/tandem_isb_native_defaults.xml +123 -0
  27. data/lib/protk/data/tandem_params.xml +17 -54
  28. data/lib/protk/fastadb.rb +2 -2
  29. data/lib/protk/prophet_tool.rb +1 -1
  30. data/lib/protk/protxml_to_gff_tool.rb +474 -0
  31. data/lib/protk/search_tool.rb +58 -103
  32. data/lib/protk/setup_rakefile.rake +9 -5
  33. data/lib/protk/tandem_search_tool.rb +256 -0
  34. data/lib/protk/tool.rb +85 -104
  35. data/lib/protk.rb +1 -6
  36. metadata +24 -103
  37. data/bin/annotate_ids.rb +0 -59
  38. data/bin/asapratio.rb +0 -27
  39. data/bin/blastxml_to_table.rb +0 -119
  40. data/bin/correct_omssa_retention_times.rb +0 -27
  41. data/bin/feature_finder.rb +0 -95
  42. data/bin/file_convert.rb +0 -164
  43. data/bin/generate_omssa_loc.rb +0 -42
  44. data/bin/gffmerge.rb +0 -208
  45. data/bin/libra.rb +0 -70
  46. data/bin/toppas_pipeline.rb +0 -84
  47. data/bin/uniprot_annotation.rb +0 -141
  48. data/bin/xls_to_table.rb +0 -52
  49. data/bin/xpress.rb +0 -27
  50. data/ext/protk/decoymaker/extconf.rb +0 -3
  51. data/ext/protk/simplealign/extconf.rb +0 -3
  52. data/lib/protk/biotools_excel_converter.rb +0 -60
  53. data/lib/protk/eupathdb_gene_information_table.rb +0 -158
  54. data/lib/protk/gapped_aligner.rb +0 -264
  55. data/lib/protk/protein_annotator.rb +0 -646
  56. data/lib/protk/spreadsheet_extensions.rb +0 -79
  57. data/lib/protk/xtandem_defaults.rb +0 -11
data/bin/sixframe.rb CHANGED
@@ -16,55 +16,33 @@ def check_coords(naseq,aaseq,frame,pstart,pend)
16
16
  orf_from_coords=naseq[pstart-1..pend-1].translate(1)
17
17
  else
18
18
  orf_from_coords=naseq[pstart-1..pend-1].reverse_complement.translate(1)
19
- # current coords give
20
- # naseq.reverse_complement[pstart-1..pend-1].translate(1)
21
- # naseq[350368-pend..(350367-pstart+1)].reverse_complement.translate(1)
22
- # orf_from_coords=naseq[naseq.length-pend..naseq.length-pstart].reverse_complement.translate(1)
23
19
  end
24
20
  if ( orf_from_coords!=aaseq)
25
21
  require 'debugger'; debugger
26
22
  end
27
- # p "#{aaseq} #{frame}"
28
23
  end
29
24
 
30
25
 
31
26
  tool=Tool.new([:explicit_output])
32
27
  tool.option_parser.banner = "Create a sixframe translation of a genome.\n\nUsage: sixframe.rb [options] genome.fasta"
33
28
 
34
- tool.options.print_coords=false
35
- tool.option_parser.on( '--coords', 'Write genomic coordinates in the fasta header' ) do
36
- tool.options.print_coords=true
37
- end
38
-
39
- tool.options.keep_header=true
40
- tool.option_parser.on( '--strip-header', 'Dont write sequence definition' ) do
41
- tool.options.keep_header=false
42
- end
29
+ tool.add_boolean_option(:print_coords,false,['--coords', 'Write genomic coordinates in the fasta header'])
30
+ tool.add_boolean_option(:keep_header,true,['--strip-header', 'Dont write sequence definition'])
31
+ tool.add_value_option(:min_len,20,['--min-len','Minimum ORF length to keep'])
43
32
 
44
- exit unless tool.check_options
33
+ exit unless tool.check_options(true)
45
34
 
46
- if ( ARGV[0].nil? )
47
- puts "You must supply an input file"
48
- puts tool.option_parser
49
- exit
50
- end
35
+ input_file=ARGV[0]
51
36
 
52
- inname=ARGV.shift
37
+ output_file = tool.explicit_output!=nil ? tool.explicit_output : nil
53
38
 
54
- outfile=nil
55
- if ( tool.explicit_output != nil)
56
- outfile=File.open(tool.explicit_output,'w')
57
- else
58
- outfile=File.open("#{inname}.translated.fasta",'w')
59
- end
39
+ output_fh = output_file!=nil ? File.new("#{output_file}",'w') : $stdout
60
40
 
61
41
 
62
- file = Bio::FastaFormat.open(inname)
42
+ file = Bio::FastaFormat.open(input_file)
63
43
 
64
44
  file.each do |entry|
65
45
 
66
- puts entry.entry_id
67
-
68
46
  length = entry.naseq.length
69
47
 
70
48
  (1...7).each do |frame|
@@ -76,7 +54,7 @@ file.each do |entry|
76
54
  oi=0
77
55
  orfs.each do |orf|
78
56
  oi+=1
79
- if ( orf.length > 20 )
57
+ if ( orf.length > tool.min_len )
80
58
 
81
59
  position_start = position
82
60
  position_end = position_start + orf.length*3 -1
@@ -109,7 +87,7 @@ file.each do |entry|
109
87
  # Output in fasta format
110
88
  # start and end positions are always relative to the forward strand
111
89
 
112
- outfile.write("#{defline}\n#{orf}\n")
90
+ output_fh.write("#{defline}\n#{orf}\n")
113
91
 
114
92
  end
115
93
  position += orf.length*3+3
data/bin/tandem_search.rb CHANGED
@@ -8,8 +8,7 @@
8
8
 
9
9
  require 'protk/constants'
10
10
  require 'protk/command_runner'
11
- require 'protk/search_tool'
12
- require 'protk/xtandem_defaults'
11
+ require 'protk/tandem_search_tool'
13
12
  require 'libxml'
14
13
 
15
14
  include LibXML
@@ -17,443 +16,71 @@ include LibXML
17
16
  # Environment with global constants
18
17
  #
19
18
  genv=Constants.new
19
+ search_tool=TandemSearchTool.new()
20
20
 
21
- # Setup specific command-line options for this tool. Other options are inherited from SearchTool
22
- #
23
- search_tool=SearchTool.new([:background,:glyco,:database,:explicit_output,:over_write,
24
- :enzyme,:modifications,:mass_tolerance_units,:mass_tolerance,:strict_monoisotopic_mass,
25
- :missed_cleavages,:cleavage_semi,:carbamidomethyl,:methionine_oxidation
26
- ])
27
- search_tool.jobid_prefix="x"
28
- search_tool.option_parser.banner = "Run an X!Tandem msms search on a set of mzML input files.\n\nUsage: tandem_search.rb [options] file1.mzML file2.mzML ..."
29
- search_tool.options.output_suffix="_tandem"
30
-
31
- tandem_defaults=XTandemDefaults.new.path
32
- search_tool.options.tandem_params=tandem_defaults
33
- search_tool.option_parser.on( '-T', '--tandem-params tandem', 'XTandem parameters to use' ) do |parms|
34
- search_tool.options.tandem_params = parms
35
- end
36
-
37
- search_tool.options.no_pepxml=false
38
- search_tool.option_parser.on( '-P', '--no-pepxml', 'Dont convert to pepXML after running the search') do
39
- search_tool.options.no_pepxml=true
40
- end
41
-
42
- search_tool.options.keep_params_files=false
43
- search_tool.option_parser.on( '-K', '--keep-params-files', 'Keep X!Tandem parameter files' ) do
44
- search_tool.options.keep_params_files = true
45
- end
46
-
47
- # In case want pepXML, but still want tandem output also.
48
- search_tool.options.tandem_output=nil
49
- search_tool.option_parser.on( '--tandem-output tandem_output', 'Keep X! Tandem Output') do |tandem_output|
50
- search_tool.options.tandem_output=tandem_output
51
- end
52
-
53
- search_tool.options.thresholds_type = 'isb_kscore'
54
- search_tool.option_parser.on( '--thresholds-type thresholds_type', 'Threshold Type (tandem_default, isb_native, isb_kscore, scaffold, system_default)' ) do |thresholds_type|
55
- # This options sets up various X! Tandem thresholds.
56
- # - system_default: Don't change any defaults just use
57
- # the defaults for this TPP install as is.
58
- # - tandem_default: These thresholds are found on the
59
- # tandem api page. http://www.thegpm.org/tandem/api/index.html
60
- # - isb_native: These are the defaults found in
61
- # isb_default_input_native.xml distributed with TPP 4.6.
62
- # - isb_kscore: These are the defaults found in
63
- # isb_default_input_kscore.xml distributed with TPP 4.6.
64
- # - scaffold: These are the defaults recommend by Proteome Software
65
- # for use with Scaffold.
66
- search_tool.options.thresholds_type = thresholds_type
67
- end
68
-
69
- search_tool.options.algorithm = "kscore"
70
- search_tool.option_parser.on( '--algorithm algorithm', "Scoring algorithm (kscore or native)" ) do |algorithm|
71
- search_tool.options.algorithm = algorithm
72
- end
73
-
74
- search_tool.options.cleavage_semi = true
75
- search_tool.option_parser.on( '--no-cleavage-semi' ) do
76
- search_tool.options.cleavage_semi = false
77
- end
21
+ exit unless search_tool.check_options(true)
78
22
 
79
-
80
- search_tool.options.n_terminal_mod_mass=nil
81
- search_tool.option_parser.on('--n-terminal-mod-mass mass') do |mass|
82
- search_tool.options.n_terminal_mod_mass = mass
83
- end
84
-
85
- search_tool.options.c_terminal_mod_mass=nil
86
- search_tool.option_parser.on('--c-terminal-mod-mass mass') do |mass|
87
- search_tool.options.c_terminal_mod_mass = mass
88
- end
89
-
90
- search_tool.options.cleavage_n_terminal_mod_mass=nil
91
- search_tool.option_parser.on('--cleavage-n-terminal-mod-mass mass') do |mass|
92
- search_tool.options.cleavage_n_terminal_mod_mass = mass
93
- end
94
-
95
- search_tool.options.cleavage_c_terminal_mod_mass=nil
96
- search_tool.option_parser.on('--cleavage-c-terminal-mod-mass mass') do |mass|
97
- search_tool.options.cleavage_c_terminal_mod_mass = mass
98
- end
99
-
100
- exit unless search_tool.check_options
101
-
102
- if ( ARGV[0].nil? )
103
- puts "You must supply an input file"
104
- puts search_tool.option_parser
105
- exit
106
- end
107
-
108
-
109
- # Set search engine specific parameters on the SearchTool object
23
+ # Our environment should be setup so that tandem or tandem.exe is on the path
110
24
  #
111
- tandem_bin="tandem"
25
+ tandem_bin=%x[which tandem].chomp
26
+ tandem_bin=%x[which tandem.exe].chomp unless tandem_bin && tandem_bin.length>0
112
27
 
113
- tandem_params=search_tool.tandem_params
28
+ @output_suffix="_tandem"
114
29
 
115
- case
116
- when Pathname.new(search_tool.database).exist? # It's an explicitly named db
117
- current_db=Pathname.new(search_tool.database).realpath.to_s
118
- else
119
- current_db=search_tool.current_database :fasta
120
- end
121
-
122
-
123
- # Parse options from a parameter file (if provided), or from the default parameter file
124
- #
125
- params_parser=XML::Parser.file(tandem_params)
126
- std_params=params_parser.parse
127
-
128
- # Parse taxonomy template file
129
- #
130
- taxo_parser=XML::Parser.file(XTandemDefaults.new.taxonomy_path)
131
- taxo_doc=taxo_parser.parse
132
-
133
- # Galaxy changes things like @ to __at__ we need to change it back
30
+ # Run the search engine on each input file
134
31
  #
135
- def decode_modification_string(mstring)
136
- mstring.gsub!("__at__","@")
137
- mstring.gsub!("__oc__","{")
138
- mstring.gsub!("__cc__","}")
139
- mstring.gsub!("__ob__","[")
140
- mstring.gsub!("__cb__","]")
141
- mstring
142
- end
32
+ ARGV.each do |filename|
143
33
 
144
- def set_option(std_params, tandem_key, value)
145
- notes = std_params.find("/bioml/note[@type=\"input\" and @label=\"#{tandem_key}\"]")
146
- throw "Exactly one parameter named (#{tandem_key}) is required in parameter file" unless notes.length==1
147
- notes[0].content=value
148
- end
34
+ throw "Input file #{filename} does not exist" unless File.exist?(filename)
149
35
 
150
- def append_option(std_params, tandem_key, value)
151
- notes = std_params.find("/bioml/note[@type=\"input\" and @label=\"#{tandem_key}\"]")
152
- if notes.length == 0
153
- node = XML::Node.new('note')
154
- node["type"] = "input"
155
- node["label"] = tandem_key
156
- node.content = value
157
- std_params.find('/bioml')[0] << node
158
- else
159
- throw "Exactly one parameter named (#{tandem_key}) is required in parameter file" unless notes.length==1
160
- notes[0].content = append_string(notes[0].content, value)
161
- end
162
- end
36
+ input_path=Pathname.new(filename.chomp).expand_path.to_s
37
+ output_path=Tool.default_output_path(input_path,".tandem",search_tool.output_prefix,@output_suffix)
163
38
 
164
- def collapse_keys(std_params, tandem_key)
165
- mods=std_params.find('/bioml/note[@type="input" and @label="#{tandem_key}"]')
166
- if not mods
167
- first_mod = mods[0]
168
- rest_mods = mods[1..-1]
169
- rest_mods.each{ |node| first_mod.content = append_string(first_mod.content, node.content); node.remove!}
170
- end
171
- end
172
39
 
173
- def append_string(first, second)
174
- if first.empty?
175
- second
40
+ if ( search_tool.explicit_output )
41
+ final_output_path=search_tool.explicit_output
176
42
  else
177
- "#{first},#{second}"
178
- end
179
- end
180
-
181
- def generate_parameter_doc(std_params,output_path,input_path,taxo_path,current_db,search_tool,genv)
182
- set_option(std_params, "protein, cleavage semi", search_tool.cleavage_semi ? "yes" : "no")
183
- set_option(std_params, "scoring, maximum missed cleavage sites", search_tool.missed_cleavages)
184
-
185
- # Set the input and output paths
186
- #
187
- input_notes=std_params.find('/bioml/note[@type="input" and @label="spectrum, path"]')
188
- throw "Exactly one spectrum, path note is required in the parameter file" unless input_notes.length==1
189
- input_notes[0].content=input_path
190
-
191
- output_notes=std_params.find('/bioml/note[@type="input" and @label="output, path"]')
192
- throw "Exactly one output, path note is required in the parameter file" unless output_notes.length==1
193
- output_notes[0].content=output_path
194
-
195
- # Set the path to the scoring algorithm default params. We use one from ISB
196
- #
197
- scoring_notes=std_params.find('/bioml/note[@type="input" and @label="list path, default parameters"]')
198
- throw "Exactly one list path, default parameters note is required in the parameter file" unless scoring_notes.length==1
199
- scoring_notes[0].content="#{genv.tpp_root}/bin/isb_default_input_#{search_tool.algorithm}.xml"
200
-
201
- # Taxonomy and Database
202
- #
203
- db_notes=std_params.find('/bioml/note[@type="input" and @label="protein, taxon"]')
204
- throw "Exactly one protein, taxon note is required in the parameter file" unless db_notes.length==1
205
- db_notes[0].content=search_tool.database.downcase
206
-
207
- taxo_notes=std_params.find('/bioml/note[@type="input" and @label="list path, taxonomy information"]')
208
- throw "Exactly one list path, taxonomy information note is required in the parameter file" unless taxo_notes.length==1
209
- taxo_notes[0].content=taxo_path
210
-
211
- fragment_tol = search_tool.fragment_tol
212
-
213
- fmass=std_params.find('/bioml/note[@type="input" and @label="spectrum, fragment monoisotopic mass error"]')
214
- p fmass
215
- throw "Exactly one spectrum, fragment monoisotopic mass error note is required in the parameter file" unless fmass.length==1
216
- fmass[0].content=fragment_tol.to_s
217
-
218
- precursor_tol = search_tool.precursor_tol
219
- ptol_plus=precursor_tol*0.5
220
- ptol_minus=precursor_tol*0.5
221
-
222
- # Precursor mass matching
223
- #
224
- pmass_minus=std_params.find('/bioml/note[@type="input" and @label="spectrum, parent monoisotopic mass error minus"]')
225
- throw "Exactly one spectrum, parent monoisotopic mass error minus note is required in the parameter file" unless pmass_minus.length==1
226
- pmass_minus[0].content=ptol_minus.to_s
227
-
228
- pmass_plus=std_params.find('/bioml/note[@type="input" and @label="spectrum, parent monoisotopic mass error plus"]')
229
- throw "Exactly one spectrum, parent monoisotopic mass error plus note is required in the parameter file" unless pmass_plus.length==1
230
- pmass_plus[0].content=ptol_plus.to_s
231
-
232
- pmass_err_units=std_params.find('/bioml/note[@type="input" and @label="spectrum, parent monoisotopic mass error units"]')
233
- throw "Exactly one spectrum, parent monoisotopic mass error units note is required in the parameter file. Got #{pmass_err_units.length}" unless pmass_err_units.length==1
234
-
235
-
236
- pmass_err_units[0].content=search_tool.precursor_tolu
237
-
238
- if search_tool.strict_monoisotopic_mass
239
- isotopic_error=std_params.find('/bioml/note[@type="input" and @label="spectrum, parent monoisotopic mass isotope error"]')
240
- throw "Exactly one spectrum, parent monoisotopic mass isotope error is required in the parameter file" unless isotopic_error.length==1
241
- isotopic_error[0].content="no"
43
+ final_output_path=output_path
242
44
  end
243
45
 
244
- if search_tool.tandem_output
245
- # If one is interested in the tandem output (e.g. for consumption by Scaffold)
246
- # want to store additional information.
247
- set_option(std_params, "output, spectra", "yes")
248
- end
249
46
 
250
- thresholds_type = search_tool.thresholds_type
47
+ output_exists=Pathname.new(final_output_path).exist?
251
48
 
252
- if thresholds_type != "system_default"
253
-
254
- maximum_valid_expectation_value = "0.1"
255
- if thresholds_type == "scaffold"
256
- maximum_valid_expectation_value = "1000"
257
- end
258
-
259
- minimum_ion_count = "4"
260
- case thresholds_type
261
- when "isb_kscore", "isb_native"
262
- minimum_ion_count = "1"
263
- when "scaffold"
264
- minimum_ion_count = "0"
265
- end
266
-
267
- minimum_peaks = "15"
268
- case thresholds_type
269
- when "isb_native"
270
- minimum_peaks = "6"
271
- when "isb_kscore"
272
- minimum_peaks = "10"
273
- when "scaffold"
274
- minimum_peaks = "0"
275
- end
276
-
277
- minimum_fragement_mz = "150"
278
- case thresholds_type
279
- when "isb_native"
280
- minimum_fragement_mz = "50"
281
- when "isb_kscore"
282
- minimum_fragement_mz = "125"
283
- when "scaffold"
284
- minimum_fragement_mz = "0"
285
- end
286
-
287
- minimum_parent_mh = "500" # tandem and isb_native defaults
288
- case thresholds_type
289
- when "isb_kscore"
290
- minimum_parent_mh = "600"
291
- when "scaffold"
292
- minimum_parent_mh = "0"
293
- end
294
-
295
- use_noise_suppression = "yes"
296
- if thresholds_type == "isb_kscore" or thresholds_type == "scaffold"
297
- use_noise_suppression = "no"
298
- end
299
-
300
- dynamic_range = "100.0"
301
- case thresholds_type
302
- when "isb_kscore"
303
- dynamic_range = "10000.0"
304
- when "scaffold"
305
- dynamic_range = "1000.0"
306
- end
307
-
308
- set_option(std_params, "spectrum, dynamic range", dynamic_range)
309
- set_option(std_params, "spectrum, use noise suppression", use_noise_suppression)
310
- set_option(std_params, "spectrum, minimum parent m+h", minimum_parent_mh)
311
- set_option(std_params, "spectrum, minimum fragment mz", minimum_fragement_mz)
312
- set_option(std_params, "spectrum, minimum peaks", minimum_peaks)
313
- set_option(std_params, "scoring, minimum ion count", minimum_ion_count)
314
- set_option(std_params, "output, maximum valid expectation value", maximum_valid_expectation_value)
315
- end
316
-
317
- # Fixed and Variable Modifications
318
- #
319
- unless search_tool.carbamidomethyl
320
- mods=std_params.find('/bioml/note[@type="input" and @id="carbamidomethyl-fixed"]')
321
- mods.each{ |node| node.remove!}
322
- end
323
-
324
- unless search_tool.glyco
325
- mods=std_params.find('/bioml/note[@type="input" and @id="glyco-variable"]')
326
- mods.each{ |node| node.remove!}
327
- end
328
-
329
- unless search_tool.methionine_oxidation
330
- mods=std_params.find('/bioml/note[@type="input" and @id="methionine-oxidation-variable"]')
331
- mods.each{ |node| node.remove!}
332
- end
333
-
334
- # Merge all remaining id based modification into single modification.
335
- collapse_keys(std_params, "residue, potential modification mass")
336
- collapse_keys(std_params, "residue, modification mass")
337
-
338
- var_mods = search_tool.var_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject {|e| e.empty? }
339
- var_mods=var_mods.collect {|mod| decode_modification_string(mod) }
340
- fix_mods = search_tool.fix_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject { |e| e.empty? }
341
- fix_mods=fix_mods.collect {|mod| decode_modification_string(mod)}
342
-
343
- root_bioml_node=std_params.find('/bioml')[0]
344
-
345
- mod_id=1
346
- var_mods.each do |vm|
347
-
348
- mod_type="potential modification mass"
349
- mod_type = "potential modification motif" if motif?(vm)
350
- label="residue, #{mod_type}"
351
- append_option(std_params, label, vm)
352
- end
353
-
354
- mod_id=1
355
- fix_mods.each do |fm|
356
- mod_type="modification mass"
357
- mod_type = "modification motif" if motif?(fm)
358
- label="residue, #{mod_type}"
359
- append_option(std_params, label, fm)
49
+ puts final_output_path
50
+ if Pathname.new(final_output_path).absolute?
51
+ output_base_path=Pathname.new(final_output_path).dirname.to_s
52
+ else
53
+ output_base_path="#{Dir.pwd}/#{Pathname.new(final_output_path).dirname.to_s}"
360
54
  end
55
+ puts output_base_path
361
56
 
362
- #p root_bioml_node
363
- std_params
364
-
365
- end
366
-
367
- def motif?(mod_string)
368
- # 124@[ is not a modification motif, it is a residue (N-term) modification,
369
- # so when checking if modification is a motif look for paired square brackets.
370
- mod_string =~ /[\(\)\{\}\!]/ or mod_string =~ /\[.*\]/
371
- end
372
-
57
+ protein_db_info=search_tool.database_info
373
58
 
374
- def generate_taxonomy_doc(taxo_doc,current_db,search_tool)
375
-
376
- taxon_label=taxo_doc.find('/bioml/taxon')
377
- throw "Exactly one taxon label is required in the taxonomy_template file" unless taxon_label.length==1
378
- taxon_label[0].attributes['label']=search_tool.database.downcase
379
-
380
- db_file=taxo_doc.find('/bioml/taxon/file')
381
- throw "Exactly one database file is required in the taxonomy_template file" unless db_file.length==1
382
- db_file[0].attributes['URL']=current_db
383
-
384
- taxo_doc
385
- end
386
-
387
- # Run the search engine on each input file
388
- #
389
- ARGV.each do |filename|
59
+ taxo_path="#{final_output_path}.taxonomy.xml"
60
+ taxo_doc = search_tool.taxonomy_doc(protein_db_info)
61
+ taxo_doc.save(taxo_path)
390
62
 
391
- input_path=Pathname.new(filename.chomp).realpath.to_s
392
- output_path="#{search_tool.output_base_path(filename.chomp)}.tandem"
63
+ params_path="#{final_output_path}.params"
64
+ params_doc = search_tool.params_doc(protein_db_info,taxo_path,input_path,final_output_path)
65
+ params_doc.save(params_path)
393
66
 
394
- if ( search_tool.explicit_output==nil )
395
- pepxml_path="#{output_path.match(/(.*)\.tandem$/)[1]}.pep.xml"
396
- else
397
- pepxml_path=search_tool.explicit_output
398
- end
399
-
400
- output_exists=false
401
- if ( !search_tool.no_pepxml && Pathname.new(pepxml_path).exist?)
402
- output_exists=true
403
- end
404
-
405
- if ( search_tool.no_pepxml && Pathname.new(output_path).exist? )
406
- output_exists=true
407
- end
408
-
409
67
  # Only proceed if the output file is not present or we have opted to over-write it
410
68
  #
411
69
  if ( search_tool.over_write || !output_exists )
412
70
 
413
- # Create the taxonomy file in the same directory as the params file
414
- #
415
- taxo_path="#{search_tool.input_base_path(filename.chomp)}.taxonomy.xml"
416
- mod_taxo_doc=generate_taxonomy_doc(taxo_doc,current_db,search_tool)
417
- mod_taxo_doc.save(taxo_path)
418
-
419
- # Modify the default XML document to contain search specific details and save it so it can be used in the search
420
- #
421
- mod_params=generate_parameter_doc(std_params,output_path,input_path,taxo_path,current_db,search_tool,genv)
422
- params_path="#{search_tool.input_base_path(filename.chomp)}.tandem.params"
423
- mod_params.save(params_path)
424
-
425
71
  # The basic command
426
72
  #
427
73
  cmd= "#{tandem_bin} #{params_path}"
428
74
 
429
- # pepXML conversion and repair
430
- #
431
- unless search_tool.no_pepxml
432
- repair_script="#{File.dirname(__FILE__)}/repair_run_summary.rb"
433
- cmd << "; Tandem2XML #{output_path} #{pepxml_path}; #{repair_script} #{pepxml_path}"
434
- if search_tool.tandem_output
435
- cmd << "; cp #{output_path} #{search_tool.tandem_output}"
436
- else
437
- cmd << "; rm #{output_path}"
438
- end
439
- end
440
-
441
75
  # Add a cleanup command unless the user wants to keep params files
442
76
  #
443
77
  unless search_tool.keep_params_files
444
78
  cmd << "; rm #{params_path}; rm #{taxo_path}"
445
79
  end
446
-
447
- # In case the user specified background running we need to create a jobscript path
448
- #
449
- jobscript_path="#{output_path}.pbs.sh"
450
80
 
451
81
  # Run the search
452
82
  #
453
- job_params= {:jobid => search_tool.jobid_from_filename(filename)}
454
- job_params[:queue]="sixteen"
455
- job_params[:vmem]="12gb"
456
- code = search_tool.run(cmd,genv,job_params,jobscript_path)
83
+ code = search_tool.run(cmd,genv)
457
84
  throw "Command failed with exit code #{code}" unless code==0
458
85
  else
459
86
  genv.log("Skipping search on existing file #{output_path}",:warn)
data/bin/tandem_to_pepxml.rb ADDED
@@ -0,0 +1,43 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file is part of Protk
4
+ # Created by Ira Cooke 12/4/2010
5
+ #
6
+ # Convert tandem output files to pepxml. A wrapper for Tandem2XML
7
+ #
8
+
9
+
10
+ require 'protk/constants'
11
+ require 'protk/search_tool'
12
+
13
+ # Environment with global constants
14
+ #
15
+ genv=Constants.new
16
+
17
+ tool=SearchTool.new([:explicit_output,:over_write,:prefix])
18
+ tool.option_parser.banner = "Convert tandem files to pep.xml files.\n\nUsage: tandem_to_pepxml.rb [options] file1.dat file2.dat ... "
19
+
20
+ @output_suffix=""
21
+
22
+ exit unless tool.check_options(true)
23
+
24
+ binpath=%x[which Tandem2XML]
25
+ binpath.chomp!
26
+
27
+
28
+ ARGV.each do |filename|
29
+
30
+ throw "Input file #{filename} does not exist" unless File.exist?(filename)
31
+
32
+ if ( tool.explicit_output )
33
+ output_path=tool.explicit_output
34
+ else
35
+ output_path=Tool.default_output_path(filename.chomp,".pep.xml",tool.output_prefix,@output_suffix)
36
+ end
37
+
38
+ throw "Unable to find Tandem2XML" unless binpath=~/Tandem2XML/
39
+ cmd = "#{binpath} #{filename.chomp} #{output_path}"
40
+
41
+ code = tool.run(cmd,genv)
42
+ throw "Command #{cmd} failed with exit code #{code}" unless code==0
43
+ end
data/bin/unimod_to_loc.rb CHANGED
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
  #
3
- # This file is part of MSLIMS
3
+ # This file is part of Protk
4
4
  # Created by Ira Cooke 12/4/2010
5
5
  #
6
6
  # Reads a unimod xml file (eg from a Mascot installation) and produces a loc file with names of allowable chemical modifications