protk 1.2.6.pre5 → 1.3.0.pre1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +84 -45
- data/bin/add_retention_times.rb +9 -5
- data/bin/augustus_to_proteindb.rb +7 -11
- data/bin/interprophet.rb +28 -46
- data/bin/make_decoy.rb +16 -48
- data/bin/mascot_search.rb +57 -71
- data/bin/mascot_to_pepxml.rb +13 -26
- data/bin/msgfplus_search.rb +70 -107
- data/bin/omssa_search.rb +52 -109
- data/bin/peptide_prophet.rb +44 -119
- data/bin/pepxml_to_table.rb +24 -27
- data/bin/protein_prophet.rb +22 -82
- data/bin/protxml_to_gff.rb +22 -519
- data/bin/protxml_to_table.rb +2 -16
- data/bin/sixframe.rb +10 -32
- data/bin/tandem_search.rb +30 -403
- data/bin/tandem_to_pepxml.rb +43 -0
- data/bin/unimod_to_loc.rb +1 -1
- data/ext/{protk/decoymaker → decoymaker}/decoymaker.c +74 -21
- data/ext/decoymaker/extconf.rb +3 -0
- data/lib/protk/constants.rb +16 -2
- data/lib/protk/data/default_config.yml +2 -1
- data/lib/protk/data/tandem_gpm_defaults.xml +175 -0
- data/lib/protk/data/tandem_isb_kscore_defaults.xml +123 -0
- data/lib/protk/data/tandem_isb_native_defaults.xml +123 -0
- data/lib/protk/data/tandem_params.xml +17 -54
- data/lib/protk/fastadb.rb +2 -2
- data/lib/protk/prophet_tool.rb +1 -1
- data/lib/protk/protxml_to_gff_tool.rb +474 -0
- data/lib/protk/search_tool.rb +58 -103
- data/lib/protk/setup_rakefile.rake +9 -5
- data/lib/protk/tandem_search_tool.rb +256 -0
- data/lib/protk/tool.rb +85 -104
- data/lib/protk.rb +1 -6
- metadata +24 -103
- data/bin/annotate_ids.rb +0 -59
- data/bin/asapratio.rb +0 -27
- data/bin/blastxml_to_table.rb +0 -119
- data/bin/correct_omssa_retention_times.rb +0 -27
- data/bin/feature_finder.rb +0 -95
- data/bin/file_convert.rb +0 -164
- data/bin/generate_omssa_loc.rb +0 -42
- data/bin/gffmerge.rb +0 -208
- data/bin/libra.rb +0 -70
- data/bin/toppas_pipeline.rb +0 -84
- data/bin/uniprot_annotation.rb +0 -141
- data/bin/xls_to_table.rb +0 -52
- data/bin/xpress.rb +0 -27
- data/ext/protk/decoymaker/extconf.rb +0 -3
- data/ext/protk/simplealign/extconf.rb +0 -3
- data/lib/protk/biotools_excel_converter.rb +0 -60
- data/lib/protk/eupathdb_gene_information_table.rb +0 -158
- data/lib/protk/gapped_aligner.rb +0 -264
- data/lib/protk/protein_annotator.rb +0 -646
- data/lib/protk/spreadsheet_extensions.rb +0 -79
- data/lib/protk/xtandem_defaults.rb +0 -11
data/bin/mascot_to_pepxml.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
#
|
3
|
-
# This file is part of
|
3
|
+
# This file is part of Protk
|
4
4
|
# Created by Ira Cooke 12/4/2010
|
5
5
|
#
|
6
6
|
# Convert mascot dat files to pepxml. A wrapper for Mascot2XML
|
@@ -15,34 +15,21 @@ require 'protk/mascot_util'
|
|
15
15
|
#
|
16
16
|
genv=Constants.new
|
17
17
|
|
18
|
-
tool=SearchTool.new([
|
18
|
+
tool=SearchTool.new([
|
19
|
+
:database,
|
20
|
+
:explicit_output,
|
21
|
+
:over_write,
|
22
|
+
:enzyme])
|
23
|
+
|
19
24
|
tool.option_parser.banner = "Convert mascot dat files to pep.xml files.\n\nUsage: mascot_to_pepxml.rb [options] file1.dat file2.dat ... "
|
20
25
|
|
21
26
|
tool.options.enzyme="trypsin"
|
22
27
|
|
23
|
-
tool.
|
24
|
-
tool.option_parser.on( '--shortid', 'Use short protein id as per Mascot result (default uses full protein ids in fasta file)' ) do
|
25
|
-
tool.options.shortid=true
|
26
|
-
end
|
27
|
-
|
28
|
-
tool.option_parser.parse!
|
29
|
-
|
30
|
-
exit unless tool.check_options
|
31
|
-
|
32
|
-
if ( ARGV[0].nil? )
|
33
|
-
puts "You must supply an input file"
|
34
|
-
puts tool.option_parser
|
35
|
-
exit
|
36
|
-
end
|
28
|
+
tool.add_boolean_option(:shortid,false,['--shortid', 'Use short protein id as per Mascot result (default uses full protein ids in fasta file)' ])
|
37
29
|
|
38
|
-
|
30
|
+
exit unless tool.check_options(true,[:database])
|
39
31
|
|
40
|
-
|
41
|
-
when Pathname.new(tool.database).exist? # It's an explicitly named db
|
42
|
-
current_db=Pathname.new(tool.database).realpath.to_s
|
43
|
-
else
|
44
|
-
current_db=tool.current_database :fasta
|
45
|
-
end
|
32
|
+
database_path=tool.database_info.path
|
46
33
|
|
47
34
|
|
48
35
|
|
@@ -54,20 +41,20 @@ ARGV.each do |file_name|
|
|
54
41
|
if ( tool.explicit_output==nil )
|
55
42
|
new_basename="#{this_dir}/#{MascotUtil.input_basename(name)}_mascot2xml"
|
56
43
|
cmd="cp #{name} #{new_basename}.dat"
|
57
|
-
cmd << "; #{genv.mascot2xml} #{new_basename}.dat -D#{
|
44
|
+
cmd << "; #{genv.mascot2xml} #{new_basename}.dat -D#{database_path} -E#{tool.enzyme}"
|
58
45
|
|
59
46
|
cmd << " -shortid" if tool.shortid
|
60
47
|
|
61
48
|
else #Mascot2XML doesn't support explicitly named output files so we move the file to an appropriate output filename after finishing
|
62
49
|
new_basename="#{this_dir}/#{MascotUtil.input_basename(name)}_mascot2xml"
|
63
50
|
cmd="cp #{name} #{new_basename}.dat"
|
64
|
-
cmd << "; Mascot2XML #{new_basename}.dat -D#{
|
51
|
+
cmd << "; Mascot2XML #{new_basename}.dat -D#{database_path} -E#{tool.enzyme}"
|
65
52
|
cmd << " -shortid" if tool.shortid
|
66
53
|
cmd << "; mv #{new_basename}.pep.xml #{tool.explicit_output}; rm #{new_basename}.dat"
|
67
54
|
repair_script="#{File.dirname(__FILE__)}/repair_run_summary.rb"
|
68
55
|
cmd << "; #{repair_script} #{tool.explicit_output}"
|
69
56
|
end
|
70
57
|
|
71
|
-
code = tool.run(cmd,genv
|
58
|
+
code = tool.run(cmd,genv)
|
72
59
|
throw "Command #{cmd} failed with exit code #{code}" unless code==0
|
73
60
|
end
|
data/bin/msgfplus_search.rb
CHANGED
@@ -18,8 +18,19 @@ input_stager = nil
|
|
18
18
|
|
19
19
|
# Setup specific command-line options for this tool. Other options are inherited from SearchTool
|
20
20
|
#
|
21
|
-
search_tool=SearchTool.new([
|
22
|
-
:
|
21
|
+
search_tool=SearchTool.new([
|
22
|
+
:database,
|
23
|
+
:explicit_output,
|
24
|
+
:over_write,
|
25
|
+
:enzyme,
|
26
|
+
:modifications,
|
27
|
+
:methionine_oxidation,
|
28
|
+
:carbamidomethyl,
|
29
|
+
:glyco,
|
30
|
+
:acetyl_nterm,
|
31
|
+
:instrument,
|
32
|
+
:cleavage_semi,
|
33
|
+
:threads])
|
23
34
|
|
24
35
|
search_tool.jobid_prefix="p"
|
25
36
|
search_tool.option_parser.banner = "Run an MSGFPlus msms search on a set of msms spectrum input files.\n\nUsage: msgfplus_search.rb [options] file1.mzML file2.mzML ..."
|
@@ -28,73 +39,24 @@ search_tool.options.output_suffix="_msgfplus"
|
|
28
39
|
search_tool.options.enzyme=1
|
29
40
|
search_tool.options.instrument=0
|
30
41
|
|
31
|
-
|
32
|
-
search_tool.
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
search_tool.
|
37
|
-
search_tool.
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
search_tool.
|
42
|
-
search_tool.
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
search_tool.
|
47
|
-
search_tool.option_parser.on( '--protocol p', '0: NoProtocol (Default), 1: Phosphorylation, 2: iTRAQ, 3: iTRAQPhospho' ) do |p|
|
48
|
-
search_tool.options.protocol=p
|
49
|
-
end
|
50
|
-
|
51
|
-
search_tool.options.min_pep_length=6
|
52
|
-
search_tool.option_parser.on( '--min-pep-length p', 'Minimum peptide length to consider, Default: 6' ) do |p|
|
53
|
-
search_tool.options.min_pep_length=p
|
54
|
-
end
|
55
|
-
|
56
|
-
search_tool.options.max_pep_length=40
|
57
|
-
search_tool.option_parser.on( '--max-pep-length p', 'Maximum peptide length to consider, Default: 40' ) do |p|
|
58
|
-
search_tool.options.max_pep_length=p
|
59
|
-
end
|
60
|
-
|
61
|
-
search_tool.options.min_pep_charge=2
|
62
|
-
search_tool.option_parser.on( '--min-pep-charge c', 'Minimum precursor charge to consider if charges are not specified in the spectrum file, Default: 2' ) do |c|
|
63
|
-
search_tool.options.min_pep_charge=c
|
64
|
-
end
|
65
|
-
|
66
|
-
search_tool.options.max_pep_charge=3
|
67
|
-
search_tool.option_parser.on( '--max-pep-charge c', 'Maximum precursor charge to consider if charges are not specified in the spectrum file, Default: 3' ) do |c|
|
68
|
-
search_tool.options.max_pep_charge=c
|
69
|
-
end
|
70
|
-
|
71
|
-
search_tool.options.num_reported_matches=1
|
72
|
-
search_tool.option_parser.on( '--num-reported-matches n', 'Number of matches per spectrum to be reported, Default: 1' ) do |n|
|
73
|
-
search_tool.options.num_reported_matches=n
|
74
|
-
end
|
75
|
-
|
76
|
-
search_tool.options.add_features=false
|
77
|
-
search_tool.option_parser.on( '--add-features', 'output additional features' ) do
|
78
|
-
search_tool.options.add_features=true
|
79
|
-
end
|
80
|
-
|
81
|
-
search_tool.options.num_threads=nil
|
82
|
-
search_tool.option_parser.on('--threads NumThreads','Number of processing threads to use') do |nt|
|
83
|
-
search_tool.options.num_threads=nt
|
84
|
-
end
|
85
|
-
|
86
|
-
search_tool.options.java_mem="3500M"
|
87
|
-
search_tool.option_parser.on('--java-mem mem','Java memory limit when running the search (Default 3.5Gb)') do |mem|
|
88
|
-
search_tool.options.java_mem=mem
|
89
|
-
end
|
42
|
+
# MS-GF+ doesnt support fragment tol so add this manually rather than via the SearchTool defaults
|
43
|
+
search_tool.add_value_option(:precursor_tol,"20",['-p','--precursor-ion-tol tol', 'Precursor ion mass tolerance.'])
|
44
|
+
search_tool.add_value_option(:precursor_tolu,"ppm",['--precursor-ion-tol-units tolu', 'Precursor ion mass tolerance units (ppm or Da). Default=ppm'])
|
45
|
+
|
46
|
+
search_tool.add_boolean_option(:pepxml,false,['--pepxml', 'Convert results to pepxml.'])
|
47
|
+
search_tool.add_value_option(:isotope_error_range,"0,1",['--isotope-error-range range', 'Takes into account of the error introduced by chooosing a non-monoisotopic peak for fragmentation.(Default 0,1)'])
|
48
|
+
search_tool.add_value_option(:fragment_method,0,['--fragment-method method', 'Fragment method 0: As written in the spectrum or CID if no info (Default), 1: CID, 2: ETD, 3: HCD, 4: Merge spectra from the same precursor'])
|
49
|
+
search_tool.add_boolean_option(:decoy_search,false,['--decoy-search', 'Build and search a decoy database on the fly. Input db should not contain decoys if this option is used'])
|
50
|
+
search_tool.add_value_option(:protocol,0,['--protocol p', '0: NoProtocol (Default), 1: Phosphorylation, 2: iTRAQ, 3: iTRAQPhospho'])
|
51
|
+
search_tool.add_value_option(:min_pep_length,6,['--min-pep-length p', 'Minimum peptide length to consider, Default: 6'])
|
52
|
+
search_tool.add_value_option(:max_pep_length,40,['--max-pep-length p', 'Maximum peptide length to consider, Default: 40'])
|
53
|
+
search_tool.add_value_option(:min_pep_charge,2,['--min-pep-charge c', 'Minimum precursor charge to consider if charges are not specified in the spectrum file, Default: 2'])
|
54
|
+
search_tool.add_value_option(:max_pep_charge,3,['--max-pep-charge c', 'Maximum precursor charge to consider if charges are not specified in the spectrum file, Default: 3'])
|
55
|
+
search_tool.add_value_option(:num_reported_matches,1,['--num-reported-matches n', 'Number of matches per spectrum to be reported, Default: 1'])
|
56
|
+
search_tool.add_boolean_option(:add_features,false,['--add-features', 'output additional features'])
|
57
|
+
search_tool.add_value_option(:java_mem,"3500M",['--java-mem mem','Java memory limit when running the search (Default 3.5Gb)'])
|
90
58
|
|
91
|
-
exit unless search_tool.check_options
|
92
|
-
|
93
|
-
if ( ARGV[0].nil? )
|
94
|
-
puts "You must supply an input file"
|
95
|
-
puts search_tool.option_parser
|
96
|
-
exit
|
97
|
-
end
|
59
|
+
exit unless search_tool.check_options(true)
|
98
60
|
|
99
61
|
# Environment with global constants
|
100
62
|
#
|
@@ -104,30 +66,34 @@ genv=Constants.new
|
|
104
66
|
#
|
105
67
|
msgf_bin="#{genv.msgfplusjar}"
|
106
68
|
|
107
|
-
|
69
|
+
# We need to cope with the fact that MSGFPlus.jar might not be executable so fall back to the protk predefined path
|
70
|
+
|
71
|
+
msgf_bin = "#{genv.msgfplus_root}/MSGFPlus.jar " if !msgf_bin
|
72
|
+
|
73
|
+
throw "Could not find MSGFPlus.jar" if !msgf_bin || (msgf_bin.length==0) || !File.exist?(msgf_bin)
|
108
74
|
|
109
75
|
make_msgfdb_cmd=""
|
110
76
|
|
111
|
-
|
112
|
-
|
113
|
-
current_db=Pathname.new(search_tool.database).realpath.to_s
|
77
|
+
@output_suffix="_msgfplus"
|
78
|
+
@output_extension= search_tool.pepxml ? ".pep.xml" : ".mzid"
|
114
79
|
|
115
|
-
|
116
|
-
if ( Pathname.new(current_db).extname.to_s.downcase != ".fasta" )
|
117
|
-
make_msgfdb_cmd << "ln -s #{current_db} #{current_db}.fasta;"
|
118
|
-
current_db="#{current_db}.fasta"
|
119
|
-
end
|
80
|
+
db_info=search_tool.database_info
|
120
81
|
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
82
|
+
database_path=db_info.path
|
83
|
+
|
84
|
+
# Database must have fasta extension
|
85
|
+
if Pathname.new(database_path).extname.to_s.downcase != ".fasta"
|
86
|
+
make_msgfdb_cmd << "ln -s #{database_path} #{database_path}.fasta;"
|
87
|
+
database_path="#{database_path}.fasta"
|
88
|
+
db_info.path=database_path
|
127
89
|
end
|
128
90
|
|
129
|
-
|
130
|
-
|
91
|
+
# Database must be indexed
|
92
|
+
unless FileTest.exists?("#{database_path}.canno")
|
93
|
+
dbdir = Pathname.new(database_path).dirname.realpath.to_s
|
94
|
+
tdavalue=search_tool.decoy_search ? 1 : 0;
|
95
|
+
make_msgfdb_cmd << "cd #{dbdir}; java -Xmx3500M -cp #{genv.msgfplusjar} edu.ucsd.msjava.msdbsearch.BuildSA -d #{database_path} -tda #{tdavalue}; "
|
96
|
+
end
|
131
97
|
|
132
98
|
|
133
99
|
throw "When --output is set only one file at a time can be run" if ( ARGV.length> 1 ) && ( search_tool.explicit_output!=nil )
|
@@ -139,17 +105,12 @@ ARGV.each do |filename|
|
|
139
105
|
if ( search_tool.explicit_output!=nil)
|
140
106
|
output_path=search_tool.explicit_output
|
141
107
|
else
|
142
|
-
output_path=
|
108
|
+
output_path=Tool.default_output_path(filename,@output_extension,search_tool.output_prefix,@output_suffix)
|
143
109
|
end
|
144
110
|
|
145
111
|
|
146
|
-
|
147
|
-
#
|
148
|
-
ext = Pathname.new(filename).extname
|
149
|
-
input_path="#{search_tool.input_base_path(filename.chomp)}#{ext}"
|
150
|
-
|
151
|
-
mzid_output_path="#{search_tool.input_base_path(filename.chomp)}.mzid"
|
152
|
-
|
112
|
+
input_path=filename.chomp
|
113
|
+
mzid_output_path="#{output_path}.mzid"
|
153
114
|
|
154
115
|
if for_galaxy
|
155
116
|
original_input_file = input_path
|
@@ -166,12 +127,17 @@ ARGV.each do |filename|
|
|
166
127
|
|
167
128
|
# The basic command
|
168
129
|
#
|
169
|
-
cmd= "#{make_msgfdb_cmd} java -Xmx#{search_tool.java_mem} -jar #{msgf_bin} -d #{
|
130
|
+
cmd= "#{make_msgfdb_cmd} java -Xmx#{search_tool.java_mem} -jar #{msgf_bin} -d #{database_path} -s #{input_path} -o #{mzid_output_path} "
|
170
131
|
|
171
132
|
#Semi tryptic peptides
|
172
133
|
#
|
173
134
|
cmd << " -ntt 1" if ( search_tool.cleavage_semi )
|
174
135
|
|
136
|
+
#Decoy searches
|
137
|
+
#
|
138
|
+
tdavalue=search_tool.decoy_search ? 1 : 0;
|
139
|
+
cmd << " -tda #{tdavalue}"
|
140
|
+
|
175
141
|
# Precursor tolerance
|
176
142
|
#
|
177
143
|
cmd << " -t #{search_tool.precursor_tol}#{search_tool.precursor_tolu}"
|
@@ -203,7 +169,7 @@ ARGV.each do |filename|
|
|
203
169
|
|
204
170
|
# Num Threads
|
205
171
|
#
|
206
|
-
cmd << " -thread #{search_tool.
|
172
|
+
cmd << " -thread #{search_tool.threads}" if search_tool.threads > 0
|
207
173
|
|
208
174
|
mods_file_content = ""
|
209
175
|
|
@@ -211,6 +177,9 @@ ARGV.each do |filename|
|
|
211
177
|
#
|
212
178
|
if ( search_tool.var_mods !="" && !search_tool.var_mods =~/None/) # Checking for none is to cope with galaxy input
|
213
179
|
var_mods = search_tool.var_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject {|e| e.empty? }.join("\n")
|
180
|
+
var_mods << "O1,M,opt,any,Oxidation\n" if search_tool.methionine_oxidation
|
181
|
+
var_mods << "C2H2O,*,opt,Prot-N-term,Acetyl\n" if search_tool.acetyl_nterm
|
182
|
+
var_mods << "H-1N-1O1,N,opt,any,Deamidated\n" if search_tool.glyco
|
214
183
|
if ( var_mods !="" )
|
215
184
|
mods_file_content << "#{var_mods}\n"
|
216
185
|
end
|
@@ -220,13 +189,14 @@ ARGV.each do |filename|
|
|
220
189
|
#
|
221
190
|
if ( search_tool.fix_mods !="" && !search_tool.fix_mods=~/None/)
|
222
191
|
fix_mods = search_tool.fix_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject { |e| e.empty? }.join("\n")
|
192
|
+
fix_mods << "C2H3N1O1,C,opt,any,Carbamidomethyl\n" if search_tool.carbamidomethyl
|
223
193
|
if ( fix_mods !="")
|
224
194
|
mods_file_content << "#{fix_mods}"
|
225
195
|
end
|
226
196
|
end
|
227
197
|
|
228
198
|
if ( mods_file_content != "")
|
229
|
-
mods_path="#{
|
199
|
+
mods_path="#{output_path}.msgfplus_mods.txt"
|
230
200
|
mods_file=File.open(mods_path,'w+')
|
231
201
|
mods_file.write "NumMods=2\n#{mods_file_content}"
|
232
202
|
mods_file.close
|
@@ -234,15 +204,15 @@ ARGV.each do |filename|
|
|
234
204
|
end
|
235
205
|
|
236
206
|
# As a final part of the command we convert to pepxml
|
237
|
-
if search_tool.
|
238
|
-
cmd << "; cp #{mzid_output_path} #{output_path}"
|
239
|
-
else
|
207
|
+
if search_tool.pepxml
|
240
208
|
#if search_tool.explicit_output
|
241
209
|
cmd << ";ruby -pi.bak -e \"gsub('post=\\\"?','post=\\\"X')\" #{mzid_output_path}"
|
242
210
|
cmd << ";ruby -pi.bak -e \"gsub('pre=\\\"?','pre=\\\"X')\" #{mzid_output_path}"
|
243
211
|
cmd << ";idconvert #{mzid_output_path} --pepXML -o #{Pathname.new(mzid_output_path).dirname}"
|
244
212
|
#Then copy the pepxml to the final output path
|
245
213
|
cmd << "; mv #{mzid_output_path.chomp('.mzid')}.pepXML #{output_path}"
|
214
|
+
else
|
215
|
+
cmd << "; mv #{mzid_output_path} #{output_path}"
|
246
216
|
end
|
247
217
|
|
248
218
|
|
@@ -251,14 +221,7 @@ ARGV.each do |filename|
|
|
251
221
|
|
252
222
|
# In case the user specified background running we need to create a jobscript path
|
253
223
|
#
|
254
|
-
|
255
|
-
|
256
|
-
# Run the search
|
257
|
-
#
|
258
|
-
job_params= {:jobid => search_tool.jobid_from_filename(filename) }
|
259
|
-
job_params[:queue]="seventytwo"
|
260
|
-
job_params[:vmem]="70gb"
|
261
|
-
code = search_tool.run(cmd,genv,job_params,jobscript_path)
|
224
|
+
code = search_tool.run(cmd,genv)
|
262
225
|
throw "Command failed with exit code #{code}" unless code==0
|
263
226
|
|
264
227
|
if for_galaxy
|
data/bin/omssa_search.rb
CHANGED
@@ -16,56 +16,37 @@ for_galaxy = GalaxyUtil.for_galaxy?
|
|
16
16
|
|
17
17
|
# Setup specific command-line options for this tool. Other options are inherited from SearchTool
|
18
18
|
#
|
19
|
-
search_tool=SearchTool.new([
|
20
|
-
:
|
21
|
-
:
|
19
|
+
search_tool=SearchTool.new([
|
20
|
+
:database,
|
21
|
+
:explicit_output,
|
22
|
+
:over_write,
|
23
|
+
:enzyme,
|
24
|
+
:modifications,
|
25
|
+
:methionine_oxidation,
|
26
|
+
:carbamidomethyl,
|
27
|
+
:glyco,
|
28
|
+
:instrument,
|
29
|
+
:mass_tolerance_units,
|
30
|
+
:mass_tolerance,
|
31
|
+
:missed_cleavages,
|
32
|
+
:precursor_search_type,
|
33
|
+
:respect_precursor_charges,
|
34
|
+
:num_peaks_for_multi_isotope_search,
|
35
|
+
:searched_ions,
|
36
|
+
:threads
|
22
37
|
])
|
23
38
|
|
24
39
|
|
25
40
|
search_tool.option_parser.banner = "Run an OMSSA msms search on a set of mgf input files.\n\nUsage: omssa_search.rb [options] file1.mgf file2.mgf ..."
|
26
|
-
search_tool.options.output_suffix="_omssa"
|
27
41
|
|
28
|
-
search_tool.
|
29
|
-
search_tool.
|
30
|
-
|
31
|
-
|
42
|
+
search_tool.add_boolean_option(:add_retention_times,true,['-R', '--no-add-retention-times', 'Don\'t post process the output to add retention times'])
|
43
|
+
search_tool.add_value_option(:max_hit_expect,1,['--max-hit-expect exp', 'Expect values less than this are considered to be hits'])
|
44
|
+
search_tool.add_value_option(:intensity_cut_off,0.0005,['--intensity-cut-off co', 'Peak intensity cut-off as a fraction of maximum peak intensity'])
|
45
|
+
search_tool.add_value_option(:galaxy_index_dir,nil,['--galaxy-index-dir dir', 'Specify galaxy index directory, will search for mods file there.'])
|
46
|
+
search_tool.add_value_option(:omx_output,nil,['--omx-output path', 'Specify path for additional OMX output (optional).'])
|
47
|
+
search_tool.add_value_option(:logfile,nil,['--logfile path','Send OMSSA stdout to a logfile'])
|
32
48
|
|
33
|
-
search_tool.
|
34
|
-
search_tool.option_parser.on( '--max-hit-expect exp', 'Expect values less than this are considered to be hits' ) do |exp|
|
35
|
-
search_tool.options.max_hit_expect=exp
|
36
|
-
end
|
37
|
-
|
38
|
-
search_tool.options.intensity_cut_off=0.0005
|
39
|
-
search_tool.option_parser.on( '--intensity-cut-off co', 'Peak intensity cut-off as a fraction of maximum peak intensity' ) do |co|
|
40
|
-
search_tool.options.intensity_cut_off=co
|
41
|
-
end
|
42
|
-
|
43
|
-
search_tool.options.galaxy_index_dir=nil
|
44
|
-
search_tool.option_parser.on( '--galaxy-index-dir dir', 'Specify galaxy index directory, will search for mods file there.' ) do |dir|
|
45
|
-
search_tool.options.galaxy_index_dir=dir
|
46
|
-
end
|
47
|
-
|
48
|
-
search_tool.options.omx_output=nil
|
49
|
-
search_tool.option_parser.on( '--omx-output path', 'Specify path for additional OMX output (optional).' ) do |path|
|
50
|
-
search_tool.options.omx_output=path
|
51
|
-
end
|
52
|
-
|
53
|
-
if ( ENV['PROTK_OMSSA_NTHREADS'] )
|
54
|
-
search_tool.options.nthreads=ENV['PROTK_OMSSA_NTHREADS']
|
55
|
-
else
|
56
|
-
search_tool.options.nthreads=0
|
57
|
-
end
|
58
|
-
search_tool.option_parser.on( '--nthreads num', 'Number of search threads to use. Default is to use the value in environment variable PROTK_OMSSA_NTHREADS or else to autodetect' ) do |num|
|
59
|
-
search_tool.options.nthreads=num
|
60
|
-
end
|
61
|
-
|
62
|
-
exit unless search_tool.check_options
|
63
|
-
|
64
|
-
if ( ARGV[0].nil? )
|
65
|
-
puts "You must supply an input file"
|
66
|
-
puts search_tool.option_parser
|
67
|
-
exit
|
68
|
-
end
|
49
|
+
exit unless search_tool.check_options(true)
|
69
50
|
|
70
51
|
# Environment with global constants
|
71
52
|
#
|
@@ -73,24 +54,18 @@ genv=Constants.new
|
|
73
54
|
|
74
55
|
# Set search engine specific parameters on the SearchTool object
|
75
56
|
#
|
76
|
-
rt_correct_bin="#{File.dirname(__FILE__)}/correct_omssa_retention_times.rb"
|
77
57
|
repair_script_bin="#{File.dirname(__FILE__)}/repair_run_summary.rb"
|
78
58
|
|
79
59
|
make_blastdb_cmd=""
|
60
|
+
@output_suffix="_omssa"
|
80
61
|
|
81
|
-
|
82
|
-
when Pathname.new(search_tool.database).exist? # It's an explicitly named db
|
83
|
-
current_db=Pathname.new(search_tool.database).realpath.to_s
|
84
|
-
if(not FileTest.exists?("#{current_db}.phr"))
|
85
|
-
make_blastdb_cmd << "makeblastdb -dbtype prot -parse_seqids -in #{current_db}; "
|
86
|
-
end
|
87
|
-
else
|
88
|
-
current_db=search_tool.current_database :fasta
|
89
|
-
end
|
90
|
-
|
91
|
-
fragment_tol = search_tool.fragment_tol
|
92
|
-
precursor_tol = search_tool.precursor_tol
|
62
|
+
db_info = search_tool.database_info
|
93
63
|
|
64
|
+
# Index the DB if needed
|
65
|
+
#
|
66
|
+
unless File.exists?("#{db_info.path}.phr")
|
67
|
+
make_blastdb_cmd << "makeblastdb -dbtype prot -parse_seqids -in #{db_info.path}; "
|
68
|
+
end
|
94
69
|
|
95
70
|
throw "When --output is set only one file at a time can be run" if ( ARGV.length> 1 ) && ( search_tool.explicit_output!=nil )
|
96
71
|
|
@@ -101,18 +76,10 @@ ARGV.each do |filename|
|
|
101
76
|
if ( search_tool.explicit_output!=nil)
|
102
77
|
output_path=search_tool.explicit_output
|
103
78
|
else
|
104
|
-
output_path=
|
79
|
+
output_path=Tool.default_output_path(filename,".pep.xml",search_tool.output_prefix,@output_suffix)
|
105
80
|
end
|
106
81
|
|
107
|
-
|
108
|
-
#
|
109
|
-
input_path="#{search_tool.input_base_path(filename.chomp)}.mgf"
|
110
|
-
input_ext=Pathname.new(filename).extname
|
111
|
-
|
112
|
-
if ( input_ext==".dat" )
|
113
|
-
# This is a file provided by galaxy so we need to leave the .dat extension
|
114
|
-
input_path="#{search_tool.input_base_path(filename.chomp)}.dat"
|
115
|
-
end
|
82
|
+
input_path=filename.chomp
|
116
83
|
|
117
84
|
|
118
85
|
# Only proceed if the output file is not present or we have opted to over-write it
|
@@ -121,7 +88,7 @@ ARGV.each do |filename|
|
|
121
88
|
|
122
89
|
# The basic command
|
123
90
|
#
|
124
|
-
cmd = "#{make_blastdb_cmd} omssacl -nt #{search_tool.
|
91
|
+
cmd = "#{make_blastdb_cmd} omssacl -nt #{search_tool.threads} -d #{db_info.path} -fm #{input_path} -op #{output_path} -w"
|
125
92
|
|
126
93
|
#Missed cleavages
|
127
94
|
#
|
@@ -157,7 +124,7 @@ ARGV.each do |filename|
|
|
157
124
|
|
158
125
|
# Fragment ion tolerance
|
159
126
|
#
|
160
|
-
cmd << " -to #{fragment_tol}" #Always in Da
|
127
|
+
cmd << " -to #{search_tool.fragment_tol}" #Always in Da
|
161
128
|
|
162
129
|
# Set the search type (monoisotopic vs average masses) and whether to use strict monoisotopic masses
|
163
130
|
#
|
@@ -179,38 +146,24 @@ ARGV.each do |filename|
|
|
179
146
|
|
180
147
|
# Variable Modifications
|
181
148
|
#
|
182
|
-
if ( search_tool.var_mods
|
183
|
-
var_mods = search_tool.var_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject {|e| e.empty? }
|
184
|
-
|
185
|
-
if ( var_mods !="" )
|
186
|
-
cmd << " -mv #{var_mods}"
|
187
|
-
end
|
188
|
-
else
|
189
|
-
# Add options related to peptide modifications
|
190
|
-
#
|
191
|
-
if ( search_tool.glyco )
|
192
|
-
cmd << " -mv 119 "
|
193
|
-
end
|
149
|
+
if ( search_tool.var_mods && !(search_tool.var_mods =~/None/)) # Checking for none is to cope with galaxy input
|
150
|
+
var_mods = search_tool.var_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject {|e| e.empty? }
|
194
151
|
end
|
195
152
|
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
cmd << " -mf #{fix_mods}"
|
200
|
-
end
|
201
|
-
else
|
202
|
-
if ( search_tool.has_modifications )
|
203
|
-
cmd << " -mf "
|
204
|
-
if ( search_tool.carbamidomethyl )
|
205
|
-
cmd<<"3 "
|
206
|
-
end
|
153
|
+
var_mods=[] unless var_mods
|
154
|
+
var_mods << "119" if search_tool.glyco
|
155
|
+
var_mods << "1" if search_tool.methionine_oxidation
|
207
156
|
|
208
|
-
|
209
|
-
cmd<<"1 "
|
210
|
-
end
|
157
|
+
cmd << " -mv #{var_mods.join(",")}" if var_mods.length > 0
|
211
158
|
|
212
|
-
|
159
|
+
|
160
|
+
if ( search_tool.fix_mods && !(search_tool.fix_mods=~/None/))
|
161
|
+
fix_mods = search_tool.fix_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject { |e| e.empty? }
|
213
162
|
end
|
163
|
+
fix_mods=[] unless fix_mods
|
164
|
+
fix_mods << ["3"] if search_tool.carbamidomethyl
|
165
|
+
|
166
|
+
cmd << " -mf #{fix_mods.join(",")}" if fix_mods.length > 0
|
214
167
|
|
215
168
|
if ( search_tool.searched_ions !="" && !(search_tool.searched_ions=~/None/))
|
216
169
|
searched_ions=search_tool.searched_ions.split(",").collect{ |mod| mod.lstrip.rstrip }.reject { |e| e.empty? }.join(",")
|
@@ -235,29 +188,19 @@ ARGV.each do |filename|
|
|
235
188
|
cmd << " -ci #{search_tool.intensity_cut_off}"
|
236
189
|
|
237
190
|
# Send output to logfile. OMSSA Logging does not play well with Ruby Open4
|
238
|
-
cmd << " -logfile
|
191
|
+
cmd << " -logfile #{search_tool.logfile}" if search_tool.logfile
|
239
192
|
|
240
193
|
# Up to here we've formulated the omssa command. The rest is cleanup
|
241
194
|
p "Running:#{cmd}"
|
242
195
|
|
243
|
-
# Add retention time corrections
|
244
|
-
#
|
245
|
-
if (search_tool.options.add_retention_times)
|
246
|
-
# TODO: Really correct rts
|
247
|
-
# cmd << "; #{rt_correct_bin} #{output_path} #{input_path} "
|
248
|
-
end
|
249
196
|
|
250
197
|
# Correct the pepXML file
|
251
198
|
#
|
252
|
-
|
253
|
-
# genv.log("Running repair script command #{cmd}",:info)
|
199
|
+
cmd << "; #{repair_script_bin} -N #{input_path} -R mgf #{output_path} --omssa-itol #{search_tool.fragment_tol}"
|
254
200
|
|
255
201
|
# Run the search
|
256
202
|
#
|
257
|
-
|
258
|
-
job_params[:queue]="lowmem"
|
259
|
-
job_params[:vmem]="900mb"
|
260
|
-
search_tool.run(cmd,genv,job_params)
|
203
|
+
search_tool.run(cmd,genv)
|
261
204
|
|
262
205
|
|
263
206
|
else
|