protk 1.2.6.pre5 → 1.3.0.pre1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (57) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +84 -45
  3. data/bin/add_retention_times.rb +9 -5
  4. data/bin/augustus_to_proteindb.rb +7 -11
  5. data/bin/interprophet.rb +28 -46
  6. data/bin/make_decoy.rb +16 -48
  7. data/bin/mascot_search.rb +57 -71
  8. data/bin/mascot_to_pepxml.rb +13 -26
  9. data/bin/msgfplus_search.rb +70 -107
  10. data/bin/omssa_search.rb +52 -109
  11. data/bin/peptide_prophet.rb +44 -119
  12. data/bin/pepxml_to_table.rb +24 -27
  13. data/bin/protein_prophet.rb +22 -82
  14. data/bin/protxml_to_gff.rb +22 -519
  15. data/bin/protxml_to_table.rb +2 -16
  16. data/bin/sixframe.rb +10 -32
  17. data/bin/tandem_search.rb +30 -403
  18. data/bin/tandem_to_pepxml.rb +43 -0
  19. data/bin/unimod_to_loc.rb +1 -1
  20. data/ext/{protk/decoymaker → decoymaker}/decoymaker.c +74 -21
  21. data/ext/decoymaker/extconf.rb +3 -0
  22. data/lib/protk/constants.rb +16 -2
  23. data/lib/protk/data/default_config.yml +2 -1
  24. data/lib/protk/data/tandem_gpm_defaults.xml +175 -0
  25. data/lib/protk/data/tandem_isb_kscore_defaults.xml +123 -0
  26. data/lib/protk/data/tandem_isb_native_defaults.xml +123 -0
  27. data/lib/protk/data/tandem_params.xml +17 -54
  28. data/lib/protk/fastadb.rb +2 -2
  29. data/lib/protk/prophet_tool.rb +1 -1
  30. data/lib/protk/protxml_to_gff_tool.rb +474 -0
  31. data/lib/protk/search_tool.rb +58 -103
  32. data/lib/protk/setup_rakefile.rake +9 -5
  33. data/lib/protk/tandem_search_tool.rb +256 -0
  34. data/lib/protk/tool.rb +85 -104
  35. data/lib/protk.rb +1 -6
  36. metadata +24 -103
  37. data/bin/annotate_ids.rb +0 -59
  38. data/bin/asapratio.rb +0 -27
  39. data/bin/blastxml_to_table.rb +0 -119
  40. data/bin/correct_omssa_retention_times.rb +0 -27
  41. data/bin/feature_finder.rb +0 -95
  42. data/bin/file_convert.rb +0 -164
  43. data/bin/generate_omssa_loc.rb +0 -42
  44. data/bin/gffmerge.rb +0 -208
  45. data/bin/libra.rb +0 -70
  46. data/bin/toppas_pipeline.rb +0 -84
  47. data/bin/uniprot_annotation.rb +0 -141
  48. data/bin/xls_to_table.rb +0 -52
  49. data/bin/xpress.rb +0 -27
  50. data/ext/protk/decoymaker/extconf.rb +0 -3
  51. data/ext/protk/simplealign/extconf.rb +0 -3
  52. data/lib/protk/biotools_excel_converter.rb +0 -60
  53. data/lib/protk/eupathdb_gene_information_table.rb +0 -158
  54. data/lib/protk/gapped_aligner.rb +0 -264
  55. data/lib/protk/protein_annotator.rb +0 -646
  56. data/lib/protk/spreadsheet_extensions.rb +0 -79
  57. data/lib/protk/xtandem_defaults.rb +0 -11
data/bin/sixframe.rb CHANGED
@@ -16,55 +16,33 @@ def check_coords(naseq,aaseq,frame,pstart,pend)
16
16
  orf_from_coords=naseq[pstart-1..pend-1].translate(1)
17
17
  else
18
18
  orf_from_coords=naseq[pstart-1..pend-1].reverse_complement.translate(1)
19
- # current coords give
20
- # naseq.reverse_complement[pstart-1..pend-1].translate(1)
21
- # naseq[350368-pend..(350367-pstart+1)].reverse_complement.translate(1)
22
- # orf_from_coords=naseq[naseq.length-pend..naseq.length-pstart].reverse_complement.translate(1)
23
19
  end
24
20
  if ( orf_from_coords!=aaseq)
25
21
  require 'debugger'; debugger
26
22
  end
27
- # p "#{aaseq} #{frame}"
28
23
  end
29
24
 
30
25
 
31
26
  tool=Tool.new([:explicit_output])
32
27
  tool.option_parser.banner = "Create a sixframe translation of a genome.\n\nUsage: sixframe.rb [options] genome.fasta"
33
28
 
34
- tool.options.print_coords=false
35
- tool.option_parser.on( '--coords', 'Write genomic coordinates in the fasta header' ) do
36
- tool.options.print_coords=true
37
- end
38
-
39
- tool.options.keep_header=true
40
- tool.option_parser.on( '--strip-header', 'Dont write sequence definition' ) do
41
- tool.options.keep_header=false
42
- end
29
+ tool.add_boolean_option(:print_coords,false,['--coords', 'Write genomic coordinates in the fasta header'])
30
+ tool.add_boolean_option(:keep_header,true,['--strip-header', 'Dont write sequence definition'])
31
+ tool.add_value_option(:min_len,20,['--min-len','Minimum ORF length to keep'])
43
32
 
44
- exit unless tool.check_options
33
+ exit unless tool.check_options(true)
45
34
 
46
- if ( ARGV[0].nil? )
47
- puts "You must supply an input file"
48
- puts tool.option_parser
49
- exit
50
- end
35
+ input_file=ARGV[0]
51
36
 
52
- inname=ARGV.shift
37
+ output_file = tool.explicit_output!=nil ? tool.explicit_output : nil
53
38
 
54
- outfile=nil
55
- if ( tool.explicit_output != nil)
56
- outfile=File.open(tool.explicit_output,'w')
57
- else
58
- outfile=File.open("#{inname}.translated.fasta",'w')
59
- end
39
+ output_fh = output_file!=nil ? File.new("#{output_file}",'w') : $stdout
60
40
 
61
41
 
62
- file = Bio::FastaFormat.open(inname)
42
+ file = Bio::FastaFormat.open(input_file)
63
43
 
64
44
  file.each do |entry|
65
45
 
66
- puts entry.entry_id
67
-
68
46
  length = entry.naseq.length
69
47
 
70
48
  (1...7).each do |frame|
@@ -76,7 +54,7 @@ file.each do |entry|
76
54
  oi=0
77
55
  orfs.each do |orf|
78
56
  oi+=1
79
- if ( orf.length > 20 )
57
+ if ( orf.length > tool.min_len )
80
58
 
81
59
  position_start = position
82
60
  position_end = position_start + orf.length*3 -1
@@ -109,7 +87,7 @@ file.each do |entry|
109
87
  # Output in fasta format
110
88
  # start and end positions are always relative to the forward strand
111
89
 
112
- outfile.write("#{defline}\n#{orf}\n")
90
+ output_fh.write("#{defline}\n#{orf}\n")
113
91
 
114
92
  end
115
93
  position += orf.length*3+3
data/bin/tandem_search.rb CHANGED
@@ -8,8 +8,7 @@
8
8
 
9
9
  require 'protk/constants'
10
10
  require 'protk/command_runner'
11
- require 'protk/search_tool'
12
- require 'protk/xtandem_defaults'
11
+ require 'protk/tandem_search_tool'
13
12
  require 'libxml'
14
13
 
15
14
  include LibXML
@@ -17,443 +16,71 @@ include LibXML
17
16
  # Environment with global constants
18
17
  #
19
18
  genv=Constants.new
19
+ search_tool=TandemSearchTool.new()
20
20
 
21
- # Setup specific command-line options for this tool. Other options are inherited from SearchTool
22
- #
23
- search_tool=SearchTool.new([:background,:glyco,:database,:explicit_output,:over_write,
24
- :enzyme,:modifications,:mass_tolerance_units,:mass_tolerance,:strict_monoisotopic_mass,
25
- :missed_cleavages,:cleavage_semi,:carbamidomethyl,:methionine_oxidation
26
- ])
27
- search_tool.jobid_prefix="x"
28
- search_tool.option_parser.banner = "Run an X!Tandem msms search on a set of mzML input files.\n\nUsage: tandem_search.rb [options] file1.mzML file2.mzML ..."
29
- search_tool.options.output_suffix="_tandem"
30
-
31
- tandem_defaults=XTandemDefaults.new.path
32
- search_tool.options.tandem_params=tandem_defaults
33
- search_tool.option_parser.on( '-T', '--tandem-params tandem', 'XTandem parameters to use' ) do |parms|
34
- search_tool.options.tandem_params = parms
35
- end
36
-
37
- search_tool.options.no_pepxml=false
38
- search_tool.option_parser.on( '-P', '--no-pepxml', 'Dont convert to pepXML after running the search') do
39
- search_tool.options.no_pepxml=true
40
- end
41
-
42
- search_tool.options.keep_params_files=false
43
- search_tool.option_parser.on( '-K', '--keep-params-files', 'Keep X!Tandem parameter files' ) do
44
- search_tool.options.keep_params_files = true
45
- end
46
-
47
- # In case want pepXML, but still want tandem output also.
48
- search_tool.options.tandem_output=nil
49
- search_tool.option_parser.on( '--tandem-output tandem_output', 'Keep X! Tandem Output') do |tandem_output|
50
- search_tool.options.tandem_output=tandem_output
51
- end
52
-
53
- search_tool.options.thresholds_type = 'isb_kscore'
54
- search_tool.option_parser.on( '--thresholds-type thresholds_type', 'Threshold Type (tandem_default, isb_native, isb_kscore, scaffold, system_default)' ) do |thresholds_type|
55
- # This options sets up various X! Tandem thresholds.
56
- # - system_default: Don't change any defaults just use
57
- # the defaults for this TPP install as is.
58
- # - tandem_default: These thresholds are found on the
59
- # tandem api page. http://www.thegpm.org/tandem/api/index.html
60
- # - isb_native: These are the defaults found in
61
- # isb_default_input_native.xml distributed with TPP 4.6.
62
- # - isb_kscore: These are the defaults found in
63
- # isb_default_input_kscore.xml distributed with TPP 4.6.
64
- # - scaffold: These are the defaults recommend by Proteome Software
65
- # for use with Scaffold.
66
- search_tool.options.thresholds_type = thresholds_type
67
- end
68
-
69
- search_tool.options.algorithm = "kscore"
70
- search_tool.option_parser.on( '--algorithm algorithm', "Scoring algorithm (kscore or native)" ) do |algorithm|
71
- search_tool.options.algorithm = algorithm
72
- end
73
-
74
- search_tool.options.cleavage_semi = true
75
- search_tool.option_parser.on( '--no-cleavage-semi' ) do
76
- search_tool.options.cleavage_semi = false
77
- end
21
+ exit unless search_tool.check_options(true)
78
22
 
79
-
80
- search_tool.options.n_terminal_mod_mass=nil
81
- search_tool.option_parser.on('--n-terminal-mod-mass mass') do |mass|
82
- search_tool.options.n_terminal_mod_mass = mass
83
- end
84
-
85
- search_tool.options.c_terminal_mod_mass=nil
86
- search_tool.option_parser.on('--c-terminal-mod-mass mass') do |mass|
87
- search_tool.options.c_terminal_mod_mass = mass
88
- end
89
-
90
- search_tool.options.cleavage_n_terminal_mod_mass=nil
91
- search_tool.option_parser.on('--cleavage-n-terminal-mod-mass mass') do |mass|
92
- search_tool.options.cleavage_n_terminal_mod_mass = mass
93
- end
94
-
95
- search_tool.options.cleavage_c_terminal_mod_mass=nil
96
- search_tool.option_parser.on('--cleavage-c-terminal-mod-mass mass') do |mass|
97
- search_tool.options.cleavage_c_terminal_mod_mass = mass
98
- end
99
-
100
- exit unless search_tool.check_options
101
-
102
- if ( ARGV[0].nil? )
103
- puts "You must supply an input file"
104
- puts search_tool.option_parser
105
- exit
106
- end
107
-
108
-
109
- # Set search engine specific parameters on the SearchTool object
23
+ # Our environment should be setup so that tandem or tandem.exe is on the path
110
24
  #
111
- tandem_bin="tandem"
25
+ tandem_bin=%x[which tandem].chomp
26
+ tandem_bin=%x[which tandem.exe].chomp unless tandem_bin && tandem_bin.length>0
112
27
 
113
- tandem_params=search_tool.tandem_params
28
+ @output_suffix="_tandem"
114
29
 
115
- case
116
- when Pathname.new(search_tool.database).exist? # It's an explicitly named db
117
- current_db=Pathname.new(search_tool.database).realpath.to_s
118
- else
119
- current_db=search_tool.current_database :fasta
120
- end
121
-
122
-
123
- # Parse options from a parameter file (if provided), or from the default parameter file
124
- #
125
- params_parser=XML::Parser.file(tandem_params)
126
- std_params=params_parser.parse
127
-
128
- # Parse taxonomy template file
129
- #
130
- taxo_parser=XML::Parser.file(XTandemDefaults.new.taxonomy_path)
131
- taxo_doc=taxo_parser.parse
132
-
133
- # Galaxy changes things like @ to __at__ we need to change it back
30
+ # Run the search engine on each input file
134
31
  #
135
- def decode_modification_string(mstring)
136
- mstring.gsub!("__at__","@")
137
- mstring.gsub!("__oc__","{")
138
- mstring.gsub!("__cc__","}")
139
- mstring.gsub!("__ob__","[")
140
- mstring.gsub!("__cb__","]")
141
- mstring
142
- end
32
+ ARGV.each do |filename|
143
33
 
144
- def set_option(std_params, tandem_key, value)
145
- notes = std_params.find("/bioml/note[@type=\"input\" and @label=\"#{tandem_key}\"]")
146
- throw "Exactly one parameter named (#{tandem_key}) is required in parameter file" unless notes.length==1
147
- notes[0].content=value
148
- end
34
+ throw "Input file #{filename} does not exist" unless File.exist?(filename)
149
35
 
150
- def append_option(std_params, tandem_key, value)
151
- notes = std_params.find("/bioml/note[@type=\"input\" and @label=\"#{tandem_key}\"]")
152
- if notes.length == 0
153
- node = XML::Node.new('note')
154
- node["type"] = "input"
155
- node["label"] = tandem_key
156
- node.content = value
157
- std_params.find('/bioml')[0] << node
158
- else
159
- throw "Exactly one parameter named (#{tandem_key}) is required in parameter file" unless notes.length==1
160
- notes[0].content = append_string(notes[0].content, value)
161
- end
162
- end
36
+ input_path=Pathname.new(filename.chomp).expand_path.to_s
37
+ output_path=Tool.default_output_path(input_path,".tandem",search_tool.output_prefix,@output_suffix)
163
38
 
164
- def collapse_keys(std_params, tandem_key)
165
- mods=std_params.find('/bioml/note[@type="input" and @label="#{tandem_key}"]')
166
- if not mods
167
- first_mod = mods[0]
168
- rest_mods = mods[1..-1]
169
- rest_mods.each{ |node| first_mod.content = append_string(first_mod.content, node.content); node.remove!}
170
- end
171
- end
172
39
 
173
- def append_string(first, second)
174
- if first.empty?
175
- second
40
+ if ( search_tool.explicit_output )
41
+ final_output_path=search_tool.explicit_output
176
42
  else
177
- "#{first},#{second}"
178
- end
179
- end
180
-
181
- def generate_parameter_doc(std_params,output_path,input_path,taxo_path,current_db,search_tool,genv)
182
- set_option(std_params, "protein, cleavage semi", search_tool.cleavage_semi ? "yes" : "no")
183
- set_option(std_params, "scoring, maximum missed cleavage sites", search_tool.missed_cleavages)
184
-
185
- # Set the input and output paths
186
- #
187
- input_notes=std_params.find('/bioml/note[@type="input" and @label="spectrum, path"]')
188
- throw "Exactly one spectrum, path note is required in the parameter file" unless input_notes.length==1
189
- input_notes[0].content=input_path
190
-
191
- output_notes=std_params.find('/bioml/note[@type="input" and @label="output, path"]')
192
- throw "Exactly one output, path note is required in the parameter file" unless output_notes.length==1
193
- output_notes[0].content=output_path
194
-
195
- # Set the path to the scoring algorithm default params. We use one from ISB
196
- #
197
- scoring_notes=std_params.find('/bioml/note[@type="input" and @label="list path, default parameters"]')
198
- throw "Exactly one list path, default parameters note is required in the parameter file" unless scoring_notes.length==1
199
- scoring_notes[0].content="#{genv.tpp_root}/bin/isb_default_input_#{search_tool.algorithm}.xml"
200
-
201
- # Taxonomy and Database
202
- #
203
- db_notes=std_params.find('/bioml/note[@type="input" and @label="protein, taxon"]')
204
- throw "Exactly one protein, taxon note is required in the parameter file" unless db_notes.length==1
205
- db_notes[0].content=search_tool.database.downcase
206
-
207
- taxo_notes=std_params.find('/bioml/note[@type="input" and @label="list path, taxonomy information"]')
208
- throw "Exactly one list path, taxonomy information note is required in the parameter file" unless taxo_notes.length==1
209
- taxo_notes[0].content=taxo_path
210
-
211
- fragment_tol = search_tool.fragment_tol
212
-
213
- fmass=std_params.find('/bioml/note[@type="input" and @label="spectrum, fragment monoisotopic mass error"]')
214
- p fmass
215
- throw "Exactly one spectrum, fragment monoisotopic mass error note is required in the parameter file" unless fmass.length==1
216
- fmass[0].content=fragment_tol.to_s
217
-
218
- precursor_tol = search_tool.precursor_tol
219
- ptol_plus=precursor_tol*0.5
220
- ptol_minus=precursor_tol*0.5
221
-
222
- # Precursor mass matching
223
- #
224
- pmass_minus=std_params.find('/bioml/note[@type="input" and @label="spectrum, parent monoisotopic mass error minus"]')
225
- throw "Exactly one spectrum, parent monoisotopic mass error minus note is required in the parameter file" unless pmass_minus.length==1
226
- pmass_minus[0].content=ptol_minus.to_s
227
-
228
- pmass_plus=std_params.find('/bioml/note[@type="input" and @label="spectrum, parent monoisotopic mass error plus"]')
229
- throw "Exactly one spectrum, parent monoisotopic mass error plus note is required in the parameter file" unless pmass_plus.length==1
230
- pmass_plus[0].content=ptol_plus.to_s
231
-
232
- pmass_err_units=std_params.find('/bioml/note[@type="input" and @label="spectrum, parent monoisotopic mass error units"]')
233
- throw "Exactly one spectrum, parent monoisotopic mass error units note is required in the parameter file. Got #{pmass_err_units.length}" unless pmass_err_units.length==1
234
-
235
-
236
- pmass_err_units[0].content=search_tool.precursor_tolu
237
-
238
- if search_tool.strict_monoisotopic_mass
239
- isotopic_error=std_params.find('/bioml/note[@type="input" and @label="spectrum, parent monoisotopic mass isotope error"]')
240
- throw "Exactly one spectrum, parent monoisotopic mass isotope error is required in the parameter file" unless isotopic_error.length==1
241
- isotopic_error[0].content="no"
43
+ final_output_path=output_path
242
44
  end
243
45
 
244
- if search_tool.tandem_output
245
- # If one is interested in the tandem output (e.g. for consumption by Scaffold)
246
- # want to store additional information.
247
- set_option(std_params, "output, spectra", "yes")
248
- end
249
46
 
250
- thresholds_type = search_tool.thresholds_type
47
+ output_exists=Pathname.new(final_output_path).exist?
251
48
 
252
- if thresholds_type != "system_default"
253
-
254
- maximum_valid_expectation_value = "0.1"
255
- if thresholds_type == "scaffold"
256
- maximum_valid_expectation_value = "1000"
257
- end
258
-
259
- minimum_ion_count = "4"
260
- case thresholds_type
261
- when "isb_kscore", "isb_native"
262
- minimum_ion_count = "1"
263
- when "scaffold"
264
- minimum_ion_count = "0"
265
- end
266
-
267
- minimum_peaks = "15"
268
- case thresholds_type
269
- when "isb_native"
270
- minimum_peaks = "6"
271
- when "isb_kscore"
272
- minimum_peaks = "10"
273
- when "scaffold"
274
- minimum_peaks = "0"
275
- end
276
-
277
- minimum_fragement_mz = "150"
278
- case thresholds_type
279
- when "isb_native"
280
- minimum_fragement_mz = "50"
281
- when "isb_kscore"
282
- minimum_fragement_mz = "125"
283
- when "scaffold"
284
- minimum_fragement_mz = "0"
285
- end
286
-
287
- minimum_parent_mh = "500" # tandem and isb_native defaults
288
- case thresholds_type
289
- when "isb_kscore"
290
- minimum_parent_mh = "600"
291
- when "scaffold"
292
- minimum_parent_mh = "0"
293
- end
294
-
295
- use_noise_suppression = "yes"
296
- if thresholds_type == "isb_kscore" or thresholds_type == "scaffold"
297
- use_noise_suppression = "no"
298
- end
299
-
300
- dynamic_range = "100.0"
301
- case thresholds_type
302
- when "isb_kscore"
303
- dynamic_range = "10000.0"
304
- when "scaffold"
305
- dynamic_range = "1000.0"
306
- end
307
-
308
- set_option(std_params, "spectrum, dynamic range", dynamic_range)
309
- set_option(std_params, "spectrum, use noise suppression", use_noise_suppression)
310
- set_option(std_params, "spectrum, minimum parent m+h", minimum_parent_mh)
311
- set_option(std_params, "spectrum, minimum fragment mz", minimum_fragement_mz)
312
- set_option(std_params, "spectrum, minimum peaks", minimum_peaks)
313
- set_option(std_params, "scoring, minimum ion count", minimum_ion_count)
314
- set_option(std_params, "output, maximum valid expectation value", maximum_valid_expectation_value)
315
- end
316
-
317
- # Fixed and Variable Modifications
318
- #
319
- unless search_tool.carbamidomethyl
320
- mods=std_params.find('/bioml/note[@type="input" and @id="carbamidomethyl-fixed"]')
321
- mods.each{ |node| node.remove!}
322
- end
323
-
324
- unless search_tool.glyco
325
- mods=std_params.find('/bioml/note[@type="input" and @id="glyco-variable"]')
326
- mods.each{ |node| node.remove!}
327
- end
328
-
329
- unless search_tool.methionine_oxidation
330
- mods=std_params.find('/bioml/note[@type="input" and @id="methionine-oxidation-variable"]')
331
- mods.each{ |node| node.remove!}
332
- end
333
-
334
- # Merge all remaining id based modification into single modification.
335
- collapse_keys(std_params, "residue, potential modification mass")
336
- collapse_keys(std_params, "residue, modification mass")
337
-
338
- var_mods = search_tool.var_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject {|e| e.empty? }
339
- var_mods=var_mods.collect {|mod| decode_modification_string(mod) }
340
- fix_mods = search_tool.fix_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject { |e| e.empty? }
341
- fix_mods=fix_mods.collect {|mod| decode_modification_string(mod)}
342
-
343
- root_bioml_node=std_params.find('/bioml')[0]
344
-
345
- mod_id=1
346
- var_mods.each do |vm|
347
-
348
- mod_type="potential modification mass"
349
- mod_type = "potential modification motif" if motif?(vm)
350
- label="residue, #{mod_type}"
351
- append_option(std_params, label, vm)
352
- end
353
-
354
- mod_id=1
355
- fix_mods.each do |fm|
356
- mod_type="modification mass"
357
- mod_type = "modification motif" if motif?(fm)
358
- label="residue, #{mod_type}"
359
- append_option(std_params, label, fm)
49
+ puts final_output_path
50
+ if Pathname.new(final_output_path).absolute?
51
+ output_base_path=Pathname.new(final_output_path).dirname.to_s
52
+ else
53
+ output_base_path="#{Dir.pwd}/#{Pathname.new(final_output_path).dirname.to_s}"
360
54
  end
55
+ puts output_base_path
361
56
 
362
- #p root_bioml_node
363
- std_params
364
-
365
- end
366
-
367
- def motif?(mod_string)
368
- # 124@[ is not a modification motif, it is a residue (N-term) modification,
369
- # so when checking if modification is a motif look for paired square brackets.
370
- mod_string =~ /[\(\)\{\}\!]/ or mod_string =~ /\[.*\]/
371
- end
372
-
57
+ protein_db_info=search_tool.database_info
373
58
 
374
- def generate_taxonomy_doc(taxo_doc,current_db,search_tool)
375
-
376
- taxon_label=taxo_doc.find('/bioml/taxon')
377
- throw "Exactly one taxon label is required in the taxonomy_template file" unless taxon_label.length==1
378
- taxon_label[0].attributes['label']=search_tool.database.downcase
379
-
380
- db_file=taxo_doc.find('/bioml/taxon/file')
381
- throw "Exactly one database file is required in the taxonomy_template file" unless db_file.length==1
382
- db_file[0].attributes['URL']=current_db
383
-
384
- taxo_doc
385
- end
386
-
387
- # Run the search engine on each input file
388
- #
389
- ARGV.each do |filename|
59
+ taxo_path="#{final_output_path}.taxonomy.xml"
60
+ taxo_doc = search_tool.taxonomy_doc(protein_db_info)
61
+ taxo_doc.save(taxo_path)
390
62
 
391
- input_path=Pathname.new(filename.chomp).realpath.to_s
392
- output_path="#{search_tool.output_base_path(filename.chomp)}.tandem"
63
+ params_path="#{final_output_path}.params"
64
+ params_doc = search_tool.params_doc(protein_db_info,taxo_path,input_path,final_output_path)
65
+ params_doc.save(params_path)
393
66
 
394
- if ( search_tool.explicit_output==nil )
395
- pepxml_path="#{output_path.match(/(.*)\.tandem$/)[1]}.pep.xml"
396
- else
397
- pepxml_path=search_tool.explicit_output
398
- end
399
-
400
- output_exists=false
401
- if ( !search_tool.no_pepxml && Pathname.new(pepxml_path).exist?)
402
- output_exists=true
403
- end
404
-
405
- if ( search_tool.no_pepxml && Pathname.new(output_path).exist? )
406
- output_exists=true
407
- end
408
-
409
67
  # Only proceed if the output file is not present or we have opted to over-write it
410
68
  #
411
69
  if ( search_tool.over_write || !output_exists )
412
70
 
413
- # Create the taxonomy file in the same directory as the params file
414
- #
415
- taxo_path="#{search_tool.input_base_path(filename.chomp)}.taxonomy.xml"
416
- mod_taxo_doc=generate_taxonomy_doc(taxo_doc,current_db,search_tool)
417
- mod_taxo_doc.save(taxo_path)
418
-
419
- # Modify the default XML document to contain search specific details and save it so it can be used in the search
420
- #
421
- mod_params=generate_parameter_doc(std_params,output_path,input_path,taxo_path,current_db,search_tool,genv)
422
- params_path="#{search_tool.input_base_path(filename.chomp)}.tandem.params"
423
- mod_params.save(params_path)
424
-
425
71
  # The basic command
426
72
  #
427
73
  cmd= "#{tandem_bin} #{params_path}"
428
74
 
429
- # pepXML conversion and repair
430
- #
431
- unless search_tool.no_pepxml
432
- repair_script="#{File.dirname(__FILE__)}/repair_run_summary.rb"
433
- cmd << "; Tandem2XML #{output_path} #{pepxml_path}; #{repair_script} #{pepxml_path}"
434
- if search_tool.tandem_output
435
- cmd << "; cp #{output_path} #{search_tool.tandem_output}"
436
- else
437
- cmd << "; rm #{output_path}"
438
- end
439
- end
440
-
441
75
  # Add a cleanup command unless the user wants to keep params files
442
76
  #
443
77
  unless search_tool.keep_params_files
444
78
  cmd << "; rm #{params_path}; rm #{taxo_path}"
445
79
  end
446
-
447
- # In case the user specified background running we need to create a jobscript path
448
- #
449
- jobscript_path="#{output_path}.pbs.sh"
450
80
 
451
81
  # Run the search
452
82
  #
453
- job_params= {:jobid => search_tool.jobid_from_filename(filename)}
454
- job_params[:queue]="sixteen"
455
- job_params[:vmem]="12gb"
456
- code = search_tool.run(cmd,genv,job_params,jobscript_path)
83
+ code = search_tool.run(cmd,genv)
457
84
  throw "Command failed with exit code #{code}" unless code==0
458
85
  else
459
86
  genv.log("Skipping search on existing file #{output_path}",:warn)
@@ -0,0 +1,43 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file is part of Protk
4
+ # Created by Ira Cooke 12/4/2010
5
+ #
6
+ # Convert tandem output files to pepxml. A wrapper for Tandem2XML
7
+ #
8
+
9
+
10
+ require 'protk/constants'
11
+ require 'protk/search_tool'
12
+
13
+ # Environment with global constants
14
+ #
15
+ genv=Constants.new
16
+
17
+ tool=SearchTool.new([:explicit_output,:over_write,:prefix])
18
+ tool.option_parser.banner = "Convert tandem files to pep.xml files.\n\nUsage: tandem_to_pepxml.rb [options] file1.dat file2.dat ... "
19
+
20
+ @output_suffix=""
21
+
22
+ exit unless tool.check_options(true)
23
+
24
+ binpath=%x[which Tandem2XML]
25
+ binpath.chomp!
26
+
27
+
28
+ ARGV.each do |filename|
29
+
30
+ throw "Input file #{filename} does not exist" unless File.exist?(filename)
31
+
32
+ if ( tool.explicit_output )
33
+ output_path=tool.explicit_output
34
+ else
35
+ output_path=Tool.default_output_path(filename.chomp,".pep.xml",tool.output_prefix,@output_suffix)
36
+ end
37
+
38
+ throw "Unable to find Tandem2XML" unless binpath=~/Tandem2XML/
39
+ cmd = "#{binpath} #{filename.chomp} #{output_path}"
40
+
41
+ code = tool.run(cmd,genv)
42
+ throw "Command #{cmd} failed with exit code #{code}" unless code==0
43
+ end
data/bin/unimod_to_loc.rb CHANGED
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
  #
3
- # This file is part of MSLIMS
3
+ # This file is part of Protk
4
4
  # Created by Ira Cooke 12/4/2010
5
5
  #
6
6
  # Reads a unimod xml file (eg from a Mascot installation) and produces a loc file with names of allowable chemical modifications