protk 1.2.6.pre5 → 1.3.0.pre1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +84 -45
- data/bin/add_retention_times.rb +9 -5
- data/bin/augustus_to_proteindb.rb +7 -11
- data/bin/interprophet.rb +28 -46
- data/bin/make_decoy.rb +16 -48
- data/bin/mascot_search.rb +57 -71
- data/bin/mascot_to_pepxml.rb +13 -26
- data/bin/msgfplus_search.rb +70 -107
- data/bin/omssa_search.rb +52 -109
- data/bin/peptide_prophet.rb +44 -119
- data/bin/pepxml_to_table.rb +24 -27
- data/bin/protein_prophet.rb +22 -82
- data/bin/protxml_to_gff.rb +22 -519
- data/bin/protxml_to_table.rb +2 -16
- data/bin/sixframe.rb +10 -32
- data/bin/tandem_search.rb +30 -403
- data/bin/tandem_to_pepxml.rb +43 -0
- data/bin/unimod_to_loc.rb +1 -1
- data/ext/{protk/decoymaker → decoymaker}/decoymaker.c +74 -21
- data/ext/decoymaker/extconf.rb +3 -0
- data/lib/protk/constants.rb +16 -2
- data/lib/protk/data/default_config.yml +2 -1
- data/lib/protk/data/tandem_gpm_defaults.xml +175 -0
- data/lib/protk/data/tandem_isb_kscore_defaults.xml +123 -0
- data/lib/protk/data/tandem_isb_native_defaults.xml +123 -0
- data/lib/protk/data/tandem_params.xml +17 -54
- data/lib/protk/fastadb.rb +2 -2
- data/lib/protk/prophet_tool.rb +1 -1
- data/lib/protk/protxml_to_gff_tool.rb +474 -0
- data/lib/protk/search_tool.rb +58 -103
- data/lib/protk/setup_rakefile.rake +9 -5
- data/lib/protk/tandem_search_tool.rb +256 -0
- data/lib/protk/tool.rb +85 -104
- data/lib/protk.rb +1 -6
- metadata +24 -103
- data/bin/annotate_ids.rb +0 -59
- data/bin/asapratio.rb +0 -27
- data/bin/blastxml_to_table.rb +0 -119
- data/bin/correct_omssa_retention_times.rb +0 -27
- data/bin/feature_finder.rb +0 -95
- data/bin/file_convert.rb +0 -164
- data/bin/generate_omssa_loc.rb +0 -42
- data/bin/gffmerge.rb +0 -208
- data/bin/libra.rb +0 -70
- data/bin/toppas_pipeline.rb +0 -84
- data/bin/uniprot_annotation.rb +0 -141
- data/bin/xls_to_table.rb +0 -52
- data/bin/xpress.rb +0 -27
- data/ext/protk/decoymaker/extconf.rb +0 -3
- data/ext/protk/simplealign/extconf.rb +0 -3
- data/lib/protk/biotools_excel_converter.rb +0 -60
- data/lib/protk/eupathdb_gene_information_table.rb +0 -158
- data/lib/protk/gapped_aligner.rb +0 -264
- data/lib/protk/protein_annotator.rb +0 -646
- data/lib/protk/spreadsheet_extensions.rb +0 -79
- data/lib/protk/xtandem_defaults.rb +0 -11
data/bin/mascot_to_pepxml.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
#
|
3
|
-
# This file is part of
|
3
|
+
# This file is part of Protk
|
4
4
|
# Created by Ira Cooke 12/4/2010
|
5
5
|
#
|
6
6
|
# Convert mascot dat files to pepxml. A wrapper for Mascot2XML
|
@@ -15,34 +15,21 @@ require 'protk/mascot_util'
|
|
15
15
|
#
|
16
16
|
genv=Constants.new
|
17
17
|
|
18
|
-
tool=SearchTool.new([
|
18
|
+
tool=SearchTool.new([
|
19
|
+
:database,
|
20
|
+
:explicit_output,
|
21
|
+
:over_write,
|
22
|
+
:enzyme])
|
23
|
+
|
19
24
|
tool.option_parser.banner = "Convert mascot dat files to pep.xml files.\n\nUsage: mascot_to_pepxml.rb [options] file1.dat file2.dat ... "
|
20
25
|
|
21
26
|
tool.options.enzyme="trypsin"
|
22
27
|
|
23
|
-
tool.
|
24
|
-
tool.option_parser.on( '--shortid', 'Use short protein id as per Mascot result (default uses full protein ids in fasta file)' ) do
|
25
|
-
tool.options.shortid=true
|
26
|
-
end
|
27
|
-
|
28
|
-
tool.option_parser.parse!
|
29
|
-
|
30
|
-
exit unless tool.check_options
|
31
|
-
|
32
|
-
if ( ARGV[0].nil? )
|
33
|
-
puts "You must supply an input file"
|
34
|
-
puts tool.option_parser
|
35
|
-
exit
|
36
|
-
end
|
28
|
+
tool.add_boolean_option(:shortid,false,['--shortid', 'Use short protein id as per Mascot result (default uses full protein ids in fasta file)' ])
|
37
29
|
|
38
|
-
|
30
|
+
exit unless tool.check_options(true,[:database])
|
39
31
|
|
40
|
-
|
41
|
-
when Pathname.new(tool.database).exist? # It's an explicitly named db
|
42
|
-
current_db=Pathname.new(tool.database).realpath.to_s
|
43
|
-
else
|
44
|
-
current_db=tool.current_database :fasta
|
45
|
-
end
|
32
|
+
database_path=tool.database_info.path
|
46
33
|
|
47
34
|
|
48
35
|
|
@@ -54,20 +41,20 @@ ARGV.each do |file_name|
|
|
54
41
|
if ( tool.explicit_output==nil )
|
55
42
|
new_basename="#{this_dir}/#{MascotUtil.input_basename(name)}_mascot2xml"
|
56
43
|
cmd="cp #{name} #{new_basename}.dat"
|
57
|
-
cmd << "; #{genv.mascot2xml} #{new_basename}.dat -D#{
|
44
|
+
cmd << "; #{genv.mascot2xml} #{new_basename}.dat -D#{database_path} -E#{tool.enzyme}"
|
58
45
|
|
59
46
|
cmd << " -shortid" if tool.shortid
|
60
47
|
|
61
48
|
else #Mascot2XML doesn't support explicitly named output files so we move the file to an appropriate output filename after finishing
|
62
49
|
new_basename="#{this_dir}/#{MascotUtil.input_basename(name)}_mascot2xml"
|
63
50
|
cmd="cp #{name} #{new_basename}.dat"
|
64
|
-
cmd << "; Mascot2XML #{new_basename}.dat -D#{
|
51
|
+
cmd << "; Mascot2XML #{new_basename}.dat -D#{database_path} -E#{tool.enzyme}"
|
65
52
|
cmd << " -shortid" if tool.shortid
|
66
53
|
cmd << "; mv #{new_basename}.pep.xml #{tool.explicit_output}; rm #{new_basename}.dat"
|
67
54
|
repair_script="#{File.dirname(__FILE__)}/repair_run_summary.rb"
|
68
55
|
cmd << "; #{repair_script} #{tool.explicit_output}"
|
69
56
|
end
|
70
57
|
|
71
|
-
code = tool.run(cmd,genv
|
58
|
+
code = tool.run(cmd,genv)
|
72
59
|
throw "Command #{cmd} failed with exit code #{code}" unless code==0
|
73
60
|
end
|
data/bin/msgfplus_search.rb
CHANGED
@@ -18,8 +18,19 @@ input_stager = nil
|
|
18
18
|
|
19
19
|
# Setup specific command-line options for this tool. Other options are inherited from SearchTool
|
20
20
|
#
|
21
|
-
search_tool=SearchTool.new([
|
22
|
-
:
|
21
|
+
search_tool=SearchTool.new([
|
22
|
+
:database,
|
23
|
+
:explicit_output,
|
24
|
+
:over_write,
|
25
|
+
:enzyme,
|
26
|
+
:modifications,
|
27
|
+
:methionine_oxidation,
|
28
|
+
:carbamidomethyl,
|
29
|
+
:glyco,
|
30
|
+
:acetyl_nterm,
|
31
|
+
:instrument,
|
32
|
+
:cleavage_semi,
|
33
|
+
:threads])
|
23
34
|
|
24
35
|
search_tool.jobid_prefix="p"
|
25
36
|
search_tool.option_parser.banner = "Run an MSGFPlus msms search on a set of msms spectrum input files.\n\nUsage: msgfplus_search.rb [options] file1.mzML file2.mzML ..."
|
@@ -28,73 +39,24 @@ search_tool.options.output_suffix="_msgfplus"
|
|
28
39
|
search_tool.options.enzyme=1
|
29
40
|
search_tool.options.instrument=0
|
30
41
|
|
31
|
-
|
32
|
-
search_tool.
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
search_tool.
|
37
|
-
search_tool.
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
search_tool.
|
42
|
-
search_tool.
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
search_tool.
|
47
|
-
search_tool.option_parser.on( '--protocol p', '0: NoProtocol (Default), 1: Phosphorylation, 2: iTRAQ, 3: iTRAQPhospho' ) do |p|
|
48
|
-
search_tool.options.protocol=p
|
49
|
-
end
|
50
|
-
|
51
|
-
search_tool.options.min_pep_length=6
|
52
|
-
search_tool.option_parser.on( '--min-pep-length p', 'Minimum peptide length to consider, Default: 6' ) do |p|
|
53
|
-
search_tool.options.min_pep_length=p
|
54
|
-
end
|
55
|
-
|
56
|
-
search_tool.options.max_pep_length=40
|
57
|
-
search_tool.option_parser.on( '--max-pep-length p', 'Maximum peptide length to consider, Default: 40' ) do |p|
|
58
|
-
search_tool.options.max_pep_length=p
|
59
|
-
end
|
60
|
-
|
61
|
-
search_tool.options.min_pep_charge=2
|
62
|
-
search_tool.option_parser.on( '--min-pep-charge c', 'Minimum precursor charge to consider if charges are not specified in the spectrum file, Default: 2' ) do |c|
|
63
|
-
search_tool.options.min_pep_charge=c
|
64
|
-
end
|
65
|
-
|
66
|
-
search_tool.options.max_pep_charge=3
|
67
|
-
search_tool.option_parser.on( '--max-pep-charge c', 'Maximum precursor charge to consider if charges are not specified in the spectrum file, Default: 3' ) do |c|
|
68
|
-
search_tool.options.max_pep_charge=c
|
69
|
-
end
|
70
|
-
|
71
|
-
search_tool.options.num_reported_matches=1
|
72
|
-
search_tool.option_parser.on( '--num-reported-matches n', 'Number of matches per spectrum to be reported, Default: 1' ) do |n|
|
73
|
-
search_tool.options.num_reported_matches=n
|
74
|
-
end
|
75
|
-
|
76
|
-
search_tool.options.add_features=false
|
77
|
-
search_tool.option_parser.on( '--add-features', 'output additional features' ) do
|
78
|
-
search_tool.options.add_features=true
|
79
|
-
end
|
80
|
-
|
81
|
-
search_tool.options.num_threads=nil
|
82
|
-
search_tool.option_parser.on('--threads NumThreads','Number of processing threads to use') do |nt|
|
83
|
-
search_tool.options.num_threads=nt
|
84
|
-
end
|
85
|
-
|
86
|
-
search_tool.options.java_mem="3500M"
|
87
|
-
search_tool.option_parser.on('--java-mem mem','Java memory limit when running the search (Default 3.5Gb)') do |mem|
|
88
|
-
search_tool.options.java_mem=mem
|
89
|
-
end
|
42
|
+
# MS-GF+ doesnt support fragment tol so add this manually rather than via the SearchTool defaults
|
43
|
+
search_tool.add_value_option(:precursor_tol,"20",['-p','--precursor-ion-tol tol', 'Precursor ion mass tolerance.'])
|
44
|
+
search_tool.add_value_option(:precursor_tolu,"ppm",['--precursor-ion-tol-units tolu', 'Precursor ion mass tolerance units (ppm or Da). Default=ppm'])
|
45
|
+
|
46
|
+
search_tool.add_boolean_option(:pepxml,false,['--pepxml', 'Convert results to pepxml.'])
|
47
|
+
search_tool.add_value_option(:isotope_error_range,"0,1",['--isotope-error-range range', 'Takes into account of the error introduced by chooosing a non-monoisotopic peak for fragmentation.(Default 0,1)'])
|
48
|
+
search_tool.add_value_option(:fragment_method,0,['--fragment-method method', 'Fragment method 0: As written in the spectrum or CID if no info (Default), 1: CID, 2: ETD, 3: HCD, 4: Merge spectra from the same precursor'])
|
49
|
+
search_tool.add_boolean_option(:decoy_search,false,['--decoy-search', 'Build and search a decoy database on the fly. Input db should not contain decoys if this option is used'])
|
50
|
+
search_tool.add_value_option(:protocol,0,['--protocol p', '0: NoProtocol (Default), 1: Phosphorylation, 2: iTRAQ, 3: iTRAQPhospho'])
|
51
|
+
search_tool.add_value_option(:min_pep_length,6,['--min-pep-length p', 'Minimum peptide length to consider, Default: 6'])
|
52
|
+
search_tool.add_value_option(:max_pep_length,40,['--max-pep-length p', 'Maximum peptide length to consider, Default: 40'])
|
53
|
+
search_tool.add_value_option(:min_pep_charge,2,['--min-pep-charge c', 'Minimum precursor charge to consider if charges are not specified in the spectrum file, Default: 2'])
|
54
|
+
search_tool.add_value_option(:max_pep_charge,3,['--max-pep-charge c', 'Maximum precursor charge to consider if charges are not specified in the spectrum file, Default: 3'])
|
55
|
+
search_tool.add_value_option(:num_reported_matches,1,['--num-reported-matches n', 'Number of matches per spectrum to be reported, Default: 1'])
|
56
|
+
search_tool.add_boolean_option(:add_features,false,['--add-features', 'output additional features'])
|
57
|
+
search_tool.add_value_option(:java_mem,"3500M",['--java-mem mem','Java memory limit when running the search (Default 3.5Gb)'])
|
90
58
|
|
91
|
-
exit unless search_tool.check_options
|
92
|
-
|
93
|
-
if ( ARGV[0].nil? )
|
94
|
-
puts "You must supply an input file"
|
95
|
-
puts search_tool.option_parser
|
96
|
-
exit
|
97
|
-
end
|
59
|
+
exit unless search_tool.check_options(true)
|
98
60
|
|
99
61
|
# Environment with global constants
|
100
62
|
#
|
@@ -104,30 +66,34 @@ genv=Constants.new
|
|
104
66
|
#
|
105
67
|
msgf_bin="#{genv.msgfplusjar}"
|
106
68
|
|
107
|
-
|
69
|
+
# We need to cope with the fact that MSGFPlus.jar might not be executable so fall back to the protk predefined path
|
70
|
+
|
71
|
+
msgf_bin = "#{genv.msgfplus_root}/MSGFPlus.jar " if !msgf_bin
|
72
|
+
|
73
|
+
throw "Could not find MSGFPlus.jar" if !msgf_bin || (msgf_bin.length==0) || !File.exist?(msgf_bin)
|
108
74
|
|
109
75
|
make_msgfdb_cmd=""
|
110
76
|
|
111
|
-
|
112
|
-
|
113
|
-
current_db=Pathname.new(search_tool.database).realpath.to_s
|
77
|
+
@output_suffix="_msgfplus"
|
78
|
+
@output_extension= search_tool.pepxml ? ".pep.xml" : ".mzid"
|
114
79
|
|
115
|
-
|
116
|
-
if ( Pathname.new(current_db).extname.to_s.downcase != ".fasta" )
|
117
|
-
make_msgfdb_cmd << "ln -s #{current_db} #{current_db}.fasta;"
|
118
|
-
current_db="#{current_db}.fasta"
|
119
|
-
end
|
80
|
+
db_info=search_tool.database_info
|
120
81
|
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
82
|
+
database_path=db_info.path
|
83
|
+
|
84
|
+
# Database must have fasta extension
|
85
|
+
if Pathname.new(database_path).extname.to_s.downcase != ".fasta"
|
86
|
+
make_msgfdb_cmd << "ln -s #{database_path} #{database_path}.fasta;"
|
87
|
+
database_path="#{database_path}.fasta"
|
88
|
+
db_info.path=database_path
|
127
89
|
end
|
128
90
|
|
129
|
-
|
130
|
-
|
91
|
+
# Database must be indexed
|
92
|
+
unless FileTest.exists?("#{database_path}.canno")
|
93
|
+
dbdir = Pathname.new(database_path).dirname.realpath.to_s
|
94
|
+
tdavalue=search_tool.decoy_search ? 1 : 0;
|
95
|
+
make_msgfdb_cmd << "cd #{dbdir}; java -Xmx3500M -cp #{genv.msgfplusjar} edu.ucsd.msjava.msdbsearch.BuildSA -d #{database_path} -tda #{tdavalue}; "
|
96
|
+
end
|
131
97
|
|
132
98
|
|
133
99
|
throw "When --output is set only one file at a time can be run" if ( ARGV.length> 1 ) && ( search_tool.explicit_output!=nil )
|
@@ -139,17 +105,12 @@ ARGV.each do |filename|
|
|
139
105
|
if ( search_tool.explicit_output!=nil)
|
140
106
|
output_path=search_tool.explicit_output
|
141
107
|
else
|
142
|
-
output_path=
|
108
|
+
output_path=Tool.default_output_path(filename,@output_extension,search_tool.output_prefix,@output_suffix)
|
143
109
|
end
|
144
110
|
|
145
111
|
|
146
|
-
|
147
|
-
#
|
148
|
-
ext = Pathname.new(filename).extname
|
149
|
-
input_path="#{search_tool.input_base_path(filename.chomp)}#{ext}"
|
150
|
-
|
151
|
-
mzid_output_path="#{search_tool.input_base_path(filename.chomp)}.mzid"
|
152
|
-
|
112
|
+
input_path=filename.chomp
|
113
|
+
mzid_output_path="#{output_path}.mzid"
|
153
114
|
|
154
115
|
if for_galaxy
|
155
116
|
original_input_file = input_path
|
@@ -166,12 +127,17 @@ ARGV.each do |filename|
|
|
166
127
|
|
167
128
|
# The basic command
|
168
129
|
#
|
169
|
-
cmd= "#{make_msgfdb_cmd} java -Xmx#{search_tool.java_mem} -jar #{msgf_bin} -d #{
|
130
|
+
cmd= "#{make_msgfdb_cmd} java -Xmx#{search_tool.java_mem} -jar #{msgf_bin} -d #{database_path} -s #{input_path} -o #{mzid_output_path} "
|
170
131
|
|
171
132
|
#Semi tryptic peptides
|
172
133
|
#
|
173
134
|
cmd << " -ntt 1" if ( search_tool.cleavage_semi )
|
174
135
|
|
136
|
+
#Decoy searches
|
137
|
+
#
|
138
|
+
tdavalue=search_tool.decoy_search ? 1 : 0;
|
139
|
+
cmd << " -tda #{tdavalue}"
|
140
|
+
|
175
141
|
# Precursor tolerance
|
176
142
|
#
|
177
143
|
cmd << " -t #{search_tool.precursor_tol}#{search_tool.precursor_tolu}"
|
@@ -203,7 +169,7 @@ ARGV.each do |filename|
|
|
203
169
|
|
204
170
|
# Num Threads
|
205
171
|
#
|
206
|
-
cmd << " -thread #{search_tool.
|
172
|
+
cmd << " -thread #{search_tool.threads}" if search_tool.threads > 0
|
207
173
|
|
208
174
|
mods_file_content = ""
|
209
175
|
|
@@ -211,6 +177,9 @@ ARGV.each do |filename|
|
|
211
177
|
#
|
212
178
|
if ( search_tool.var_mods !="" && !search_tool.var_mods =~/None/) # Checking for none is to cope with galaxy input
|
213
179
|
var_mods = search_tool.var_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject {|e| e.empty? }.join("\n")
|
180
|
+
var_mods << "O1,M,opt,any,Oxidation\n" if search_tool.methionine_oxidation
|
181
|
+
var_mods << "C2H2O,*,opt,Prot-N-term,Acetyl\n" if search_tool.acetyl_nterm
|
182
|
+
var_mods << "H-1N-1O1,N,opt,any,Deamidated\n" if search_tool.glyco
|
214
183
|
if ( var_mods !="" )
|
215
184
|
mods_file_content << "#{var_mods}\n"
|
216
185
|
end
|
@@ -220,13 +189,14 @@ ARGV.each do |filename|
|
|
220
189
|
#
|
221
190
|
if ( search_tool.fix_mods !="" && !search_tool.fix_mods=~/None/)
|
222
191
|
fix_mods = search_tool.fix_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject { |e| e.empty? }.join("\n")
|
192
|
+
fix_mods << "C2H3N1O1,C,opt,any,Carbamidomethyl\n" if search_tool.carbamidomethyl
|
223
193
|
if ( fix_mods !="")
|
224
194
|
mods_file_content << "#{fix_mods}"
|
225
195
|
end
|
226
196
|
end
|
227
197
|
|
228
198
|
if ( mods_file_content != "")
|
229
|
-
mods_path="#{
|
199
|
+
mods_path="#{output_path}.msgfplus_mods.txt"
|
230
200
|
mods_file=File.open(mods_path,'w+')
|
231
201
|
mods_file.write "NumMods=2\n#{mods_file_content}"
|
232
202
|
mods_file.close
|
@@ -234,15 +204,15 @@ ARGV.each do |filename|
|
|
234
204
|
end
|
235
205
|
|
236
206
|
# As a final part of the command we convert to pepxml
|
237
|
-
if search_tool.
|
238
|
-
cmd << "; cp #{mzid_output_path} #{output_path}"
|
239
|
-
else
|
207
|
+
if search_tool.pepxml
|
240
208
|
#if search_tool.explicit_output
|
241
209
|
cmd << ";ruby -pi.bak -e \"gsub('post=\\\"?','post=\\\"X')\" #{mzid_output_path}"
|
242
210
|
cmd << ";ruby -pi.bak -e \"gsub('pre=\\\"?','pre=\\\"X')\" #{mzid_output_path}"
|
243
211
|
cmd << ";idconvert #{mzid_output_path} --pepXML -o #{Pathname.new(mzid_output_path).dirname}"
|
244
212
|
#Then copy the pepxml to the final output path
|
245
213
|
cmd << "; mv #{mzid_output_path.chomp('.mzid')}.pepXML #{output_path}"
|
214
|
+
else
|
215
|
+
cmd << "; mv #{mzid_output_path} #{output_path}"
|
246
216
|
end
|
247
217
|
|
248
218
|
|
@@ -251,14 +221,7 @@ ARGV.each do |filename|
|
|
251
221
|
|
252
222
|
# In case the user specified background running we need to create a jobscript path
|
253
223
|
#
|
254
|
-
|
255
|
-
|
256
|
-
# Run the search
|
257
|
-
#
|
258
|
-
job_params= {:jobid => search_tool.jobid_from_filename(filename) }
|
259
|
-
job_params[:queue]="seventytwo"
|
260
|
-
job_params[:vmem]="70gb"
|
261
|
-
code = search_tool.run(cmd,genv,job_params,jobscript_path)
|
224
|
+
code = search_tool.run(cmd,genv)
|
262
225
|
throw "Command failed with exit code #{code}" unless code==0
|
263
226
|
|
264
227
|
if for_galaxy
|
data/bin/omssa_search.rb
CHANGED
@@ -16,56 +16,37 @@ for_galaxy = GalaxyUtil.for_galaxy?
|
|
16
16
|
|
17
17
|
# Setup specific command-line options for this tool. Other options are inherited from SearchTool
|
18
18
|
#
|
19
|
-
search_tool=SearchTool.new([
|
20
|
-
:
|
21
|
-
:
|
19
|
+
search_tool=SearchTool.new([
|
20
|
+
:database,
|
21
|
+
:explicit_output,
|
22
|
+
:over_write,
|
23
|
+
:enzyme,
|
24
|
+
:modifications,
|
25
|
+
:methionine_oxidation,
|
26
|
+
:carbamidomethyl,
|
27
|
+
:glyco,
|
28
|
+
:instrument,
|
29
|
+
:mass_tolerance_units,
|
30
|
+
:mass_tolerance,
|
31
|
+
:missed_cleavages,
|
32
|
+
:precursor_search_type,
|
33
|
+
:respect_precursor_charges,
|
34
|
+
:num_peaks_for_multi_isotope_search,
|
35
|
+
:searched_ions,
|
36
|
+
:threads
|
22
37
|
])
|
23
38
|
|
24
39
|
|
25
40
|
search_tool.option_parser.banner = "Run an OMSSA msms search on a set of mgf input files.\n\nUsage: omssa_search.rb [options] file1.mgf file2.mgf ..."
|
26
|
-
search_tool.options.output_suffix="_omssa"
|
27
41
|
|
28
|
-
search_tool.
|
29
|
-
search_tool.
|
30
|
-
|
31
|
-
|
42
|
+
search_tool.add_boolean_option(:add_retention_times,true,['-R', '--no-add-retention-times', 'Don\'t post process the output to add retention times'])
|
43
|
+
search_tool.add_value_option(:max_hit_expect,1,['--max-hit-expect exp', 'Expect values less than this are considered to be hits'])
|
44
|
+
search_tool.add_value_option(:intensity_cut_off,0.0005,['--intensity-cut-off co', 'Peak intensity cut-off as a fraction of maximum peak intensity'])
|
45
|
+
search_tool.add_value_option(:galaxy_index_dir,nil,['--galaxy-index-dir dir', 'Specify galaxy index directory, will search for mods file there.'])
|
46
|
+
search_tool.add_value_option(:omx_output,nil,['--omx-output path', 'Specify path for additional OMX output (optional).'])
|
47
|
+
search_tool.add_value_option(:logfile,nil,['--logfile path','Send OMSSA stdout to a logfile'])
|
32
48
|
|
33
|
-
search_tool.
|
34
|
-
search_tool.option_parser.on( '--max-hit-expect exp', 'Expect values less than this are considered to be hits' ) do |exp|
|
35
|
-
search_tool.options.max_hit_expect=exp
|
36
|
-
end
|
37
|
-
|
38
|
-
search_tool.options.intensity_cut_off=0.0005
|
39
|
-
search_tool.option_parser.on( '--intensity-cut-off co', 'Peak intensity cut-off as a fraction of maximum peak intensity' ) do |co|
|
40
|
-
search_tool.options.intensity_cut_off=co
|
41
|
-
end
|
42
|
-
|
43
|
-
search_tool.options.galaxy_index_dir=nil
|
44
|
-
search_tool.option_parser.on( '--galaxy-index-dir dir', 'Specify galaxy index directory, will search for mods file there.' ) do |dir|
|
45
|
-
search_tool.options.galaxy_index_dir=dir
|
46
|
-
end
|
47
|
-
|
48
|
-
search_tool.options.omx_output=nil
|
49
|
-
search_tool.option_parser.on( '--omx-output path', 'Specify path for additional OMX output (optional).' ) do |path|
|
50
|
-
search_tool.options.omx_output=path
|
51
|
-
end
|
52
|
-
|
53
|
-
if ( ENV['PROTK_OMSSA_NTHREADS'] )
|
54
|
-
search_tool.options.nthreads=ENV['PROTK_OMSSA_NTHREADS']
|
55
|
-
else
|
56
|
-
search_tool.options.nthreads=0
|
57
|
-
end
|
58
|
-
search_tool.option_parser.on( '--nthreads num', 'Number of search threads to use. Default is to use the value in environment variable PROTK_OMSSA_NTHREADS or else to autodetect' ) do |num|
|
59
|
-
search_tool.options.nthreads=num
|
60
|
-
end
|
61
|
-
|
62
|
-
exit unless search_tool.check_options
|
63
|
-
|
64
|
-
if ( ARGV[0].nil? )
|
65
|
-
puts "You must supply an input file"
|
66
|
-
puts search_tool.option_parser
|
67
|
-
exit
|
68
|
-
end
|
49
|
+
exit unless search_tool.check_options(true)
|
69
50
|
|
70
51
|
# Environment with global constants
|
71
52
|
#
|
@@ -73,24 +54,18 @@ genv=Constants.new
|
|
73
54
|
|
74
55
|
# Set search engine specific parameters on the SearchTool object
|
75
56
|
#
|
76
|
-
rt_correct_bin="#{File.dirname(__FILE__)}/correct_omssa_retention_times.rb"
|
77
57
|
repair_script_bin="#{File.dirname(__FILE__)}/repair_run_summary.rb"
|
78
58
|
|
79
59
|
make_blastdb_cmd=""
|
60
|
+
@output_suffix="_omssa"
|
80
61
|
|
81
|
-
|
82
|
-
when Pathname.new(search_tool.database).exist? # It's an explicitly named db
|
83
|
-
current_db=Pathname.new(search_tool.database).realpath.to_s
|
84
|
-
if(not FileTest.exists?("#{current_db}.phr"))
|
85
|
-
make_blastdb_cmd << "makeblastdb -dbtype prot -parse_seqids -in #{current_db}; "
|
86
|
-
end
|
87
|
-
else
|
88
|
-
current_db=search_tool.current_database :fasta
|
89
|
-
end
|
90
|
-
|
91
|
-
fragment_tol = search_tool.fragment_tol
|
92
|
-
precursor_tol = search_tool.precursor_tol
|
62
|
+
db_info = search_tool.database_info
|
93
63
|
|
64
|
+
# Index the DB if needed
|
65
|
+
#
|
66
|
+
unless File.exists?("#{db_info.path}.phr")
|
67
|
+
make_blastdb_cmd << "makeblastdb -dbtype prot -parse_seqids -in #{db_info.path}; "
|
68
|
+
end
|
94
69
|
|
95
70
|
throw "When --output is set only one file at a time can be run" if ( ARGV.length> 1 ) && ( search_tool.explicit_output!=nil )
|
96
71
|
|
@@ -101,18 +76,10 @@ ARGV.each do |filename|
|
|
101
76
|
if ( search_tool.explicit_output!=nil)
|
102
77
|
output_path=search_tool.explicit_output
|
103
78
|
else
|
104
|
-
output_path=
|
79
|
+
output_path=Tool.default_output_path(filename,".pep.xml",search_tool.output_prefix,@output_suffix)
|
105
80
|
end
|
106
81
|
|
107
|
-
|
108
|
-
#
|
109
|
-
input_path="#{search_tool.input_base_path(filename.chomp)}.mgf"
|
110
|
-
input_ext=Pathname.new(filename).extname
|
111
|
-
|
112
|
-
if ( input_ext==".dat" )
|
113
|
-
# This is a file provided by galaxy so we need to leave the .dat extension
|
114
|
-
input_path="#{search_tool.input_base_path(filename.chomp)}.dat"
|
115
|
-
end
|
82
|
+
input_path=filename.chomp
|
116
83
|
|
117
84
|
|
118
85
|
# Only proceed if the output file is not present or we have opted to over-write it
|
@@ -121,7 +88,7 @@ ARGV.each do |filename|
|
|
121
88
|
|
122
89
|
# The basic command
|
123
90
|
#
|
124
|
-
cmd = "#{make_blastdb_cmd} omssacl -nt #{search_tool.
|
91
|
+
cmd = "#{make_blastdb_cmd} omssacl -nt #{search_tool.threads} -d #{db_info.path} -fm #{input_path} -op #{output_path} -w"
|
125
92
|
|
126
93
|
#Missed cleavages
|
127
94
|
#
|
@@ -157,7 +124,7 @@ ARGV.each do |filename|
|
|
157
124
|
|
158
125
|
# Fragment ion tolerance
|
159
126
|
#
|
160
|
-
cmd << " -to #{fragment_tol}" #Always in Da
|
127
|
+
cmd << " -to #{search_tool.fragment_tol}" #Always in Da
|
161
128
|
|
162
129
|
# Set the search type (monoisotopic vs average masses) and whether to use strict monoisotopic masses
|
163
130
|
#
|
@@ -179,38 +146,24 @@ ARGV.each do |filename|
|
|
179
146
|
|
180
147
|
# Variable Modifications
|
181
148
|
#
|
182
|
-
if ( search_tool.var_mods
|
183
|
-
var_mods = search_tool.var_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject {|e| e.empty? }
|
184
|
-
|
185
|
-
if ( var_mods !="" )
|
186
|
-
cmd << " -mv #{var_mods}"
|
187
|
-
end
|
188
|
-
else
|
189
|
-
# Add options related to peptide modifications
|
190
|
-
#
|
191
|
-
if ( search_tool.glyco )
|
192
|
-
cmd << " -mv 119 "
|
193
|
-
end
|
149
|
+
if ( search_tool.var_mods && !(search_tool.var_mods =~/None/)) # Checking for none is to cope with galaxy input
|
150
|
+
var_mods = search_tool.var_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject {|e| e.empty? }
|
194
151
|
end
|
195
152
|
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
cmd << " -mf #{fix_mods}"
|
200
|
-
end
|
201
|
-
else
|
202
|
-
if ( search_tool.has_modifications )
|
203
|
-
cmd << " -mf "
|
204
|
-
if ( search_tool.carbamidomethyl )
|
205
|
-
cmd<<"3 "
|
206
|
-
end
|
153
|
+
var_mods=[] unless var_mods
|
154
|
+
var_mods << "119" if search_tool.glyco
|
155
|
+
var_mods << "1" if search_tool.methionine_oxidation
|
207
156
|
|
208
|
-
|
209
|
-
cmd<<"1 "
|
210
|
-
end
|
157
|
+
cmd << " -mv #{var_mods.join(",")}" if var_mods.length > 0
|
211
158
|
|
212
|
-
|
159
|
+
|
160
|
+
if ( search_tool.fix_mods && !(search_tool.fix_mods=~/None/))
|
161
|
+
fix_mods = search_tool.fix_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject { |e| e.empty? }
|
213
162
|
end
|
163
|
+
fix_mods=[] unless fix_mods
|
164
|
+
fix_mods << ["3"] if search_tool.carbamidomethyl
|
165
|
+
|
166
|
+
cmd << " -mf #{fix_mods.join(",")}" if fix_mods.length > 0
|
214
167
|
|
215
168
|
if ( search_tool.searched_ions !="" && !(search_tool.searched_ions=~/None/))
|
216
169
|
searched_ions=search_tool.searched_ions.split(",").collect{ |mod| mod.lstrip.rstrip }.reject { |e| e.empty? }.join(",")
|
@@ -235,29 +188,19 @@ ARGV.each do |filename|
|
|
235
188
|
cmd << " -ci #{search_tool.intensity_cut_off}"
|
236
189
|
|
237
190
|
# Send output to logfile. OMSSA Logging does not play well with Ruby Open4
|
238
|
-
cmd << " -logfile
|
191
|
+
cmd << " -logfile #{search_tool.logfile}" if search_tool.logfile
|
239
192
|
|
240
193
|
# Up to here we've formulated the omssa command. The rest is cleanup
|
241
194
|
p "Running:#{cmd}"
|
242
195
|
|
243
|
-
# Add retention time corrections
|
244
|
-
#
|
245
|
-
if (search_tool.options.add_retention_times)
|
246
|
-
# TODO: Really correct rts
|
247
|
-
# cmd << "; #{rt_correct_bin} #{output_path} #{input_path} "
|
248
|
-
end
|
249
196
|
|
250
197
|
# Correct the pepXML file
|
251
198
|
#
|
252
|
-
|
253
|
-
# genv.log("Running repair script command #{cmd}",:info)
|
199
|
+
cmd << "; #{repair_script_bin} -N #{input_path} -R mgf #{output_path} --omssa-itol #{search_tool.fragment_tol}"
|
254
200
|
|
255
201
|
# Run the search
|
256
202
|
#
|
257
|
-
|
258
|
-
job_params[:queue]="lowmem"
|
259
|
-
job_params[:vmem]="900mb"
|
260
|
-
search_tool.run(cmd,genv,job_params)
|
203
|
+
search_tool.run(cmd,genv)
|
261
204
|
|
262
205
|
|
263
206
|
else
|