protk 1.2.6.pre5 → 1.3.0.pre1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +84 -45
- data/bin/add_retention_times.rb +9 -5
- data/bin/augustus_to_proteindb.rb +7 -11
- data/bin/interprophet.rb +28 -46
- data/bin/make_decoy.rb +16 -48
- data/bin/mascot_search.rb +57 -71
- data/bin/mascot_to_pepxml.rb +13 -26
- data/bin/msgfplus_search.rb +70 -107
- data/bin/omssa_search.rb +52 -109
- data/bin/peptide_prophet.rb +44 -119
- data/bin/pepxml_to_table.rb +24 -27
- data/bin/protein_prophet.rb +22 -82
- data/bin/protxml_to_gff.rb +22 -519
- data/bin/protxml_to_table.rb +2 -16
- data/bin/sixframe.rb +10 -32
- data/bin/tandem_search.rb +30 -403
- data/bin/tandem_to_pepxml.rb +43 -0
- data/bin/unimod_to_loc.rb +1 -1
- data/ext/{protk/decoymaker → decoymaker}/decoymaker.c +74 -21
- data/ext/decoymaker/extconf.rb +3 -0
- data/lib/protk/constants.rb +16 -2
- data/lib/protk/data/default_config.yml +2 -1
- data/lib/protk/data/tandem_gpm_defaults.xml +175 -0
- data/lib/protk/data/tandem_isb_kscore_defaults.xml +123 -0
- data/lib/protk/data/tandem_isb_native_defaults.xml +123 -0
- data/lib/protk/data/tandem_params.xml +17 -54
- data/lib/protk/fastadb.rb +2 -2
- data/lib/protk/prophet_tool.rb +1 -1
- data/lib/protk/protxml_to_gff_tool.rb +474 -0
- data/lib/protk/search_tool.rb +58 -103
- data/lib/protk/setup_rakefile.rake +9 -5
- data/lib/protk/tandem_search_tool.rb +256 -0
- data/lib/protk/tool.rb +85 -104
- data/lib/protk.rb +1 -6
- metadata +24 -103
- data/bin/annotate_ids.rb +0 -59
- data/bin/asapratio.rb +0 -27
- data/bin/blastxml_to_table.rb +0 -119
- data/bin/correct_omssa_retention_times.rb +0 -27
- data/bin/feature_finder.rb +0 -95
- data/bin/file_convert.rb +0 -164
- data/bin/generate_omssa_loc.rb +0 -42
- data/bin/gffmerge.rb +0 -208
- data/bin/libra.rb +0 -70
- data/bin/toppas_pipeline.rb +0 -84
- data/bin/uniprot_annotation.rb +0 -141
- data/bin/xls_to_table.rb +0 -52
- data/bin/xpress.rb +0 -27
- data/ext/protk/decoymaker/extconf.rb +0 -3
- data/ext/protk/simplealign/extconf.rb +0 -3
- data/lib/protk/biotools_excel_converter.rb +0 -60
- data/lib/protk/eupathdb_gene_information_table.rb +0 -158
- data/lib/protk/gapped_aligner.rb +0 -264
- data/lib/protk/protein_annotator.rb +0 -646
- data/lib/protk/spreadsheet_extensions.rb +0 -79
- data/lib/protk/xtandem_defaults.rb +0 -11
data/bin/sixframe.rb
CHANGED
@@ -16,55 +16,33 @@ def check_coords(naseq,aaseq,frame,pstart,pend)
|
|
16
16
|
orf_from_coords=naseq[pstart-1..pend-1].translate(1)
|
17
17
|
else
|
18
18
|
orf_from_coords=naseq[pstart-1..pend-1].reverse_complement.translate(1)
|
19
|
-
# current coords give
|
20
|
-
# naseq.reverse_complement[pstart-1..pend-1].translate(1)
|
21
|
-
# naseq[350368-pend..(350367-pstart+1)].reverse_complement.translate(1)
|
22
|
-
# orf_from_coords=naseq[naseq.length-pend..naseq.length-pstart].reverse_complement.translate(1)
|
23
19
|
end
|
24
20
|
if ( orf_from_coords!=aaseq)
|
25
21
|
require 'debugger'; debugger
|
26
22
|
end
|
27
|
-
# p "#{aaseq} #{frame}"
|
28
23
|
end
|
29
24
|
|
30
25
|
|
31
26
|
tool=Tool.new([:explicit_output])
|
32
27
|
tool.option_parser.banner = "Create a sixframe translation of a genome.\n\nUsage: sixframe.rb [options] genome.fasta"
|
33
28
|
|
34
|
-
tool.
|
35
|
-
tool.
|
36
|
-
|
37
|
-
end
|
38
|
-
|
39
|
-
tool.options.keep_header=true
|
40
|
-
tool.option_parser.on( '--strip-header', 'Dont write sequence definition' ) do
|
41
|
-
tool.options.keep_header=false
|
42
|
-
end
|
29
|
+
tool.add_boolean_option(:print_coords,false,['--coords', 'Write genomic coordinates in the fasta header'])
|
30
|
+
tool.add_boolean_option(:keep_header,true,['--strip-header', 'Dont write sequence definition'])
|
31
|
+
tool.add_value_option(:min_len,20,['--min-len','Minimum ORF length to keep'])
|
43
32
|
|
44
|
-
exit unless tool.check_options
|
33
|
+
exit unless tool.check_options(true)
|
45
34
|
|
46
|
-
|
47
|
-
puts "You must supply an input file"
|
48
|
-
puts tool.option_parser
|
49
|
-
exit
|
50
|
-
end
|
35
|
+
input_file=ARGV[0]
|
51
36
|
|
52
|
-
|
37
|
+
output_file = tool.explicit_output!=nil ? tool.explicit_output : nil
|
53
38
|
|
54
|
-
|
55
|
-
if ( tool.explicit_output != nil)
|
56
|
-
outfile=File.open(tool.explicit_output,'w')
|
57
|
-
else
|
58
|
-
outfile=File.open("#{inname}.translated.fasta",'w')
|
59
|
-
end
|
39
|
+
output_fh = output_file!=nil ? File.new("#{output_file}",'w') : $stdout
|
60
40
|
|
61
41
|
|
62
|
-
file = Bio::FastaFormat.open(
|
42
|
+
file = Bio::FastaFormat.open(input_file)
|
63
43
|
|
64
44
|
file.each do |entry|
|
65
45
|
|
66
|
-
puts entry.entry_id
|
67
|
-
|
68
46
|
length = entry.naseq.length
|
69
47
|
|
70
48
|
(1...7).each do |frame|
|
@@ -76,7 +54,7 @@ file.each do |entry|
|
|
76
54
|
oi=0
|
77
55
|
orfs.each do |orf|
|
78
56
|
oi+=1
|
79
|
-
if ( orf.length >
|
57
|
+
if ( orf.length > tool.min_len )
|
80
58
|
|
81
59
|
position_start = position
|
82
60
|
position_end = position_start + orf.length*3 -1
|
@@ -109,7 +87,7 @@ file.each do |entry|
|
|
109
87
|
# Output in fasta format
|
110
88
|
# start and end positions are always relative to the forward strand
|
111
89
|
|
112
|
-
|
90
|
+
output_fh.write("#{defline}\n#{orf}\n")
|
113
91
|
|
114
92
|
end
|
115
93
|
position += orf.length*3+3
|
data/bin/tandem_search.rb
CHANGED
@@ -8,8 +8,7 @@
|
|
8
8
|
|
9
9
|
require 'protk/constants'
|
10
10
|
require 'protk/command_runner'
|
11
|
-
require 'protk/
|
12
|
-
require 'protk/xtandem_defaults'
|
11
|
+
require 'protk/tandem_search_tool'
|
13
12
|
require 'libxml'
|
14
13
|
|
15
14
|
include LibXML
|
@@ -17,443 +16,71 @@ include LibXML
|
|
17
16
|
# Environment with global constants
|
18
17
|
#
|
19
18
|
genv=Constants.new
|
19
|
+
search_tool=TandemSearchTool.new()
|
20
20
|
|
21
|
-
|
22
|
-
#
|
23
|
-
search_tool=SearchTool.new([:background,:glyco,:database,:explicit_output,:over_write,
|
24
|
-
:enzyme,:modifications,:mass_tolerance_units,:mass_tolerance,:strict_monoisotopic_mass,
|
25
|
-
:missed_cleavages,:cleavage_semi,:carbamidomethyl,:methionine_oxidation
|
26
|
-
])
|
27
|
-
search_tool.jobid_prefix="x"
|
28
|
-
search_tool.option_parser.banner = "Run an X!Tandem msms search on a set of mzML input files.\n\nUsage: tandem_search.rb [options] file1.mzML file2.mzML ..."
|
29
|
-
search_tool.options.output_suffix="_tandem"
|
30
|
-
|
31
|
-
tandem_defaults=XTandemDefaults.new.path
|
32
|
-
search_tool.options.tandem_params=tandem_defaults
|
33
|
-
search_tool.option_parser.on( '-T', '--tandem-params tandem', 'XTandem parameters to use' ) do |parms|
|
34
|
-
search_tool.options.tandem_params = parms
|
35
|
-
end
|
36
|
-
|
37
|
-
search_tool.options.no_pepxml=false
|
38
|
-
search_tool.option_parser.on( '-P', '--no-pepxml', 'Dont convert to pepXML after running the search') do
|
39
|
-
search_tool.options.no_pepxml=true
|
40
|
-
end
|
41
|
-
|
42
|
-
search_tool.options.keep_params_files=false
|
43
|
-
search_tool.option_parser.on( '-K', '--keep-params-files', 'Keep X!Tandem parameter files' ) do
|
44
|
-
search_tool.options.keep_params_files = true
|
45
|
-
end
|
46
|
-
|
47
|
-
# In case want pepXML, but still want tandem output also.
|
48
|
-
search_tool.options.tandem_output=nil
|
49
|
-
search_tool.option_parser.on( '--tandem-output tandem_output', 'Keep X! Tandem Output') do |tandem_output|
|
50
|
-
search_tool.options.tandem_output=tandem_output
|
51
|
-
end
|
52
|
-
|
53
|
-
search_tool.options.thresholds_type = 'isb_kscore'
|
54
|
-
search_tool.option_parser.on( '--thresholds-type thresholds_type', 'Threshold Type (tandem_default, isb_native, isb_kscore, scaffold, system_default)' ) do |thresholds_type|
|
55
|
-
# This options sets up various X! Tandem thresholds.
|
56
|
-
# - system_default: Don't change any defaults just use
|
57
|
-
# the defaults for this TPP install as is.
|
58
|
-
# - tandem_default: These thresholds are found on the
|
59
|
-
# tandem api page. http://www.thegpm.org/tandem/api/index.html
|
60
|
-
# - isb_native: These are the defaults found in
|
61
|
-
# isb_default_input_native.xml distributed with TPP 4.6.
|
62
|
-
# - isb_kscore: These are the defaults found in
|
63
|
-
# isb_default_input_kscore.xml distributed with TPP 4.6.
|
64
|
-
# - scaffold: These are the defaults recommend by Proteome Software
|
65
|
-
# for use with Scaffold.
|
66
|
-
search_tool.options.thresholds_type = thresholds_type
|
67
|
-
end
|
68
|
-
|
69
|
-
search_tool.options.algorithm = "kscore"
|
70
|
-
search_tool.option_parser.on( '--algorithm algorithm', "Scoring algorithm (kscore or native)" ) do |algorithm|
|
71
|
-
search_tool.options.algorithm = algorithm
|
72
|
-
end
|
73
|
-
|
74
|
-
search_tool.options.cleavage_semi = true
|
75
|
-
search_tool.option_parser.on( '--no-cleavage-semi' ) do
|
76
|
-
search_tool.options.cleavage_semi = false
|
77
|
-
end
|
21
|
+
exit unless search_tool.check_options(true)
|
78
22
|
|
79
|
-
|
80
|
-
search_tool.options.n_terminal_mod_mass=nil
|
81
|
-
search_tool.option_parser.on('--n-terminal-mod-mass mass') do |mass|
|
82
|
-
search_tool.options.n_terminal_mod_mass = mass
|
83
|
-
end
|
84
|
-
|
85
|
-
search_tool.options.c_terminal_mod_mass=nil
|
86
|
-
search_tool.option_parser.on('--c-terminal-mod-mass mass') do |mass|
|
87
|
-
search_tool.options.c_terminal_mod_mass = mass
|
88
|
-
end
|
89
|
-
|
90
|
-
search_tool.options.cleavage_n_terminal_mod_mass=nil
|
91
|
-
search_tool.option_parser.on('--cleavage-n-terminal-mod-mass mass') do |mass|
|
92
|
-
search_tool.options.cleavage_n_terminal_mod_mass = mass
|
93
|
-
end
|
94
|
-
|
95
|
-
search_tool.options.cleavage_c_terminal_mod_mass=nil
|
96
|
-
search_tool.option_parser.on('--cleavage-c-terminal-mod-mass mass') do |mass|
|
97
|
-
search_tool.options.cleavage_c_terminal_mod_mass = mass
|
98
|
-
end
|
99
|
-
|
100
|
-
exit unless search_tool.check_options
|
101
|
-
|
102
|
-
if ( ARGV[0].nil? )
|
103
|
-
puts "You must supply an input file"
|
104
|
-
puts search_tool.option_parser
|
105
|
-
exit
|
106
|
-
end
|
107
|
-
|
108
|
-
|
109
|
-
# Set search engine specific parameters on the SearchTool object
|
23
|
+
# Our environment should be set up so that tandem or tandem.exe is on the path
|
110
24
|
#
|
111
|
-
tandem_bin
|
25
|
+
tandem_bin=%x[which tandem].chomp
|
26
|
+
tandem_bin=%x[which tandem.exe].chomp unless tandem_bin && tandem_bin.length>0
|
112
27
|
|
113
|
-
|
28
|
+
@output_suffix="_tandem"
|
114
29
|
|
115
|
-
|
116
|
-
when Pathname.new(search_tool.database).exist? # It's an explicitly named db
|
117
|
-
current_db=Pathname.new(search_tool.database).realpath.to_s
|
118
|
-
else
|
119
|
-
current_db=search_tool.current_database :fasta
|
120
|
-
end
|
121
|
-
|
122
|
-
|
123
|
-
# Parse options from a parameter file (if provided), or from the default parameter file
|
124
|
-
#
|
125
|
-
params_parser=XML::Parser.file(tandem_params)
|
126
|
-
std_params=params_parser.parse
|
127
|
-
|
128
|
-
# Parse taxonomy template file
|
129
|
-
#
|
130
|
-
taxo_parser=XML::Parser.file(XTandemDefaults.new.taxonomy_path)
|
131
|
-
taxo_doc=taxo_parser.parse
|
132
|
-
|
133
|
-
# Galaxy changes things like @ to __at__ we need to change it back
|
30
|
+
# Run the search engine on each input file
|
134
31
|
#
|
135
|
-
|
136
|
-
mstring.gsub!("__at__","@")
|
137
|
-
mstring.gsub!("__oc__","{")
|
138
|
-
mstring.gsub!("__cc__","}")
|
139
|
-
mstring.gsub!("__ob__","[")
|
140
|
-
mstring.gsub!("__cb__","]")
|
141
|
-
mstring
|
142
|
-
end
|
32
|
+
ARGV.each do |filename|
|
143
33
|
|
144
|
-
|
145
|
-
notes = std_params.find("/bioml/note[@type=\"input\" and @label=\"#{tandem_key}\"]")
|
146
|
-
throw "Exactly one parameter named (#{tandem_key}) is required in parameter file" unless notes.length==1
|
147
|
-
notes[0].content=value
|
148
|
-
end
|
34
|
+
throw "Input file #{filename} does not exist" unless File.exist?(filename)
|
149
35
|
|
150
|
-
|
151
|
-
|
152
|
-
if notes.length == 0
|
153
|
-
node = XML::Node.new('note')
|
154
|
-
node["type"] = "input"
|
155
|
-
node["label"] = tandem_key
|
156
|
-
node.content = value
|
157
|
-
std_params.find('/bioml')[0] << node
|
158
|
-
else
|
159
|
-
throw "Exactly one parameter named (#{tandem_key}) is required in parameter file" unless notes.length==1
|
160
|
-
notes[0].content = append_string(notes[0].content, value)
|
161
|
-
end
|
162
|
-
end
|
36
|
+
input_path=Pathname.new(filename.chomp).expand_path.to_s
|
37
|
+
output_path=Tool.default_output_path(input_path,".tandem",search_tool.output_prefix,@output_suffix)
|
163
38
|
|
164
|
-
def collapse_keys(std_params, tandem_key)
|
165
|
-
mods=std_params.find('/bioml/note[@type="input" and @label="#{tandem_key}"]')
|
166
|
-
if not mods
|
167
|
-
first_mod = mods[0]
|
168
|
-
rest_mods = mods[1..-1]
|
169
|
-
rest_mods.each{ |node| first_mod.content = append_string(first_mod.content, node.content); node.remove!}
|
170
|
-
end
|
171
|
-
end
|
172
39
|
|
173
|
-
|
174
|
-
|
175
|
-
second
|
40
|
+
if ( search_tool.explicit_output )
|
41
|
+
final_output_path=search_tool.explicit_output
|
176
42
|
else
|
177
|
-
|
178
|
-
end
|
179
|
-
end
|
180
|
-
|
181
|
-
def generate_parameter_doc(std_params,output_path,input_path,taxo_path,current_db,search_tool,genv)
|
182
|
-
set_option(std_params, "protein, cleavage semi", search_tool.cleavage_semi ? "yes" : "no")
|
183
|
-
set_option(std_params, "scoring, maximum missed cleavage sites", search_tool.missed_cleavages)
|
184
|
-
|
185
|
-
# Set the input and output paths
|
186
|
-
#
|
187
|
-
input_notes=std_params.find('/bioml/note[@type="input" and @label="spectrum, path"]')
|
188
|
-
throw "Exactly one spectrum, path note is required in the parameter file" unless input_notes.length==1
|
189
|
-
input_notes[0].content=input_path
|
190
|
-
|
191
|
-
output_notes=std_params.find('/bioml/note[@type="input" and @label="output, path"]')
|
192
|
-
throw "Exactly one output, path note is required in the parameter file" unless output_notes.length==1
|
193
|
-
output_notes[0].content=output_path
|
194
|
-
|
195
|
-
# Set the path to the scoring algorithm default params. We use one from ISB
|
196
|
-
#
|
197
|
-
scoring_notes=std_params.find('/bioml/note[@type="input" and @label="list path, default parameters"]')
|
198
|
-
throw "Exactly one list path, default parameters note is required in the parameter file" unless scoring_notes.length==1
|
199
|
-
scoring_notes[0].content="#{genv.tpp_root}/bin/isb_default_input_#{search_tool.algorithm}.xml"
|
200
|
-
|
201
|
-
# Taxonomy and Database
|
202
|
-
#
|
203
|
-
db_notes=std_params.find('/bioml/note[@type="input" and @label="protein, taxon"]')
|
204
|
-
throw "Exactly one protein, taxon note is required in the parameter file" unless db_notes.length==1
|
205
|
-
db_notes[0].content=search_tool.database.downcase
|
206
|
-
|
207
|
-
taxo_notes=std_params.find('/bioml/note[@type="input" and @label="list path, taxonomy information"]')
|
208
|
-
throw "Exactly one list path, taxonomy information note is required in the parameter file" unless taxo_notes.length==1
|
209
|
-
taxo_notes[0].content=taxo_path
|
210
|
-
|
211
|
-
fragment_tol = search_tool.fragment_tol
|
212
|
-
|
213
|
-
fmass=std_params.find('/bioml/note[@type="input" and @label="spectrum, fragment monoisotopic mass error"]')
|
214
|
-
p fmass
|
215
|
-
throw "Exactly one spectrum, fragment monoisotopic mass error note is required in the parameter file" unless fmass.length==1
|
216
|
-
fmass[0].content=fragment_tol.to_s
|
217
|
-
|
218
|
-
precursor_tol = search_tool.precursor_tol
|
219
|
-
ptol_plus=precursor_tol*0.5
|
220
|
-
ptol_minus=precursor_tol*0.5
|
221
|
-
|
222
|
-
# Precursor mass matching
|
223
|
-
#
|
224
|
-
pmass_minus=std_params.find('/bioml/note[@type="input" and @label="spectrum, parent monoisotopic mass error minus"]')
|
225
|
-
throw "Exactly one spectrum, parent monoisotopic mass error minus note is required in the parameter file" unless pmass_minus.length==1
|
226
|
-
pmass_minus[0].content=ptol_minus.to_s
|
227
|
-
|
228
|
-
pmass_plus=std_params.find('/bioml/note[@type="input" and @label="spectrum, parent monoisotopic mass error plus"]')
|
229
|
-
throw "Exactly one spectrum, parent monoisotopic mass error plus note is required in the parameter file" unless pmass_plus.length==1
|
230
|
-
pmass_plus[0].content=ptol_plus.to_s
|
231
|
-
|
232
|
-
pmass_err_units=std_params.find('/bioml/note[@type="input" and @label="spectrum, parent monoisotopic mass error units"]')
|
233
|
-
throw "Exactly one spectrum, parent monoisotopic mass error units note is required in the parameter file. Got #{pmass_err_units.length}" unless pmass_err_units.length==1
|
234
|
-
|
235
|
-
|
236
|
-
pmass_err_units[0].content=search_tool.precursor_tolu
|
237
|
-
|
238
|
-
if search_tool.strict_monoisotopic_mass
|
239
|
-
isotopic_error=std_params.find('/bioml/note[@type="input" and @label="spectrum, parent monoisotopic mass isotope error"]')
|
240
|
-
throw "Exactly one spectrum, parent monoisotopic mass isotope error is required in the parameter file" unless isotopic_error.length==1
|
241
|
-
isotopic_error[0].content="no"
|
43
|
+
final_output_path=output_path
|
242
44
|
end
|
243
45
|
|
244
|
-
if search_tool.tandem_output
|
245
|
-
# If one is interested in the tandem output (e.g. for consumption by Scaffold)
|
246
|
-
# want to store additional information.
|
247
|
-
set_option(std_params, "output, spectra", "yes")
|
248
|
-
end
|
249
46
|
|
250
|
-
|
47
|
+
output_exists=Pathname.new(final_output_path).exist?
|
251
48
|
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
end
|
258
|
-
|
259
|
-
minimum_ion_count = "4"
|
260
|
-
case thresholds_type
|
261
|
-
when "isb_kscore", "isb_native"
|
262
|
-
minimum_ion_count = "1"
|
263
|
-
when "scaffold"
|
264
|
-
minimum_ion_count = "0"
|
265
|
-
end
|
266
|
-
|
267
|
-
minimum_peaks = "15"
|
268
|
-
case thresholds_type
|
269
|
-
when "isb_native"
|
270
|
-
minimum_peaks = "6"
|
271
|
-
when "isb_kscore"
|
272
|
-
minimum_peaks = "10"
|
273
|
-
when "scaffold"
|
274
|
-
minimum_peaks = "0"
|
275
|
-
end
|
276
|
-
|
277
|
-
minimum_fragement_mz = "150"
|
278
|
-
case thresholds_type
|
279
|
-
when "isb_native"
|
280
|
-
minimum_fragement_mz = "50"
|
281
|
-
when "isb_kscore"
|
282
|
-
minimum_fragement_mz = "125"
|
283
|
-
when "scaffold"
|
284
|
-
minimum_fragement_mz = "0"
|
285
|
-
end
|
286
|
-
|
287
|
-
minimum_parent_mh = "500" # tandem and isb_native defaults
|
288
|
-
case thresholds_type
|
289
|
-
when "isb_kscore"
|
290
|
-
minimum_parent_mh = "600"
|
291
|
-
when "scaffold"
|
292
|
-
minimum_parent_mh = "0"
|
293
|
-
end
|
294
|
-
|
295
|
-
use_noise_suppression = "yes"
|
296
|
-
if thresholds_type == "isb_kscore" or thresholds_type == "scaffold"
|
297
|
-
use_noise_suppression = "no"
|
298
|
-
end
|
299
|
-
|
300
|
-
dynamic_range = "100.0"
|
301
|
-
case thresholds_type
|
302
|
-
when "isb_kscore"
|
303
|
-
dynamic_range = "10000.0"
|
304
|
-
when "scaffold"
|
305
|
-
dynamic_range = "1000.0"
|
306
|
-
end
|
307
|
-
|
308
|
-
set_option(std_params, "spectrum, dynamic range", dynamic_range)
|
309
|
-
set_option(std_params, "spectrum, use noise suppression", use_noise_suppression)
|
310
|
-
set_option(std_params, "spectrum, minimum parent m+h", minimum_parent_mh)
|
311
|
-
set_option(std_params, "spectrum, minimum fragment mz", minimum_fragement_mz)
|
312
|
-
set_option(std_params, "spectrum, minimum peaks", minimum_peaks)
|
313
|
-
set_option(std_params, "scoring, minimum ion count", minimum_ion_count)
|
314
|
-
set_option(std_params, "output, maximum valid expectation value", maximum_valid_expectation_value)
|
315
|
-
end
|
316
|
-
|
317
|
-
# Fixed and Variable Modifications
|
318
|
-
#
|
319
|
-
unless search_tool.carbamidomethyl
|
320
|
-
mods=std_params.find('/bioml/note[@type="input" and @id="carbamidomethyl-fixed"]')
|
321
|
-
mods.each{ |node| node.remove!}
|
322
|
-
end
|
323
|
-
|
324
|
-
unless search_tool.glyco
|
325
|
-
mods=std_params.find('/bioml/note[@type="input" and @id="glyco-variable"]')
|
326
|
-
mods.each{ |node| node.remove!}
|
327
|
-
end
|
328
|
-
|
329
|
-
unless search_tool.methionine_oxidation
|
330
|
-
mods=std_params.find('/bioml/note[@type="input" and @id="methionine-oxidation-variable"]')
|
331
|
-
mods.each{ |node| node.remove!}
|
332
|
-
end
|
333
|
-
|
334
|
-
# Merge all remaining id based modification into single modification.
|
335
|
-
collapse_keys(std_params, "residue, potential modification mass")
|
336
|
-
collapse_keys(std_params, "residue, modification mass")
|
337
|
-
|
338
|
-
var_mods = search_tool.var_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject {|e| e.empty? }
|
339
|
-
var_mods=var_mods.collect {|mod| decode_modification_string(mod) }
|
340
|
-
fix_mods = search_tool.fix_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject { |e| e.empty? }
|
341
|
-
fix_mods=fix_mods.collect {|mod| decode_modification_string(mod)}
|
342
|
-
|
343
|
-
root_bioml_node=std_params.find('/bioml')[0]
|
344
|
-
|
345
|
-
mod_id=1
|
346
|
-
var_mods.each do |vm|
|
347
|
-
|
348
|
-
mod_type="potential modification mass"
|
349
|
-
mod_type = "potential modification motif" if motif?(vm)
|
350
|
-
label="residue, #{mod_type}"
|
351
|
-
append_option(std_params, label, vm)
|
352
|
-
end
|
353
|
-
|
354
|
-
mod_id=1
|
355
|
-
fix_mods.each do |fm|
|
356
|
-
mod_type="modification mass"
|
357
|
-
mod_type = "modification motif" if motif?(fm)
|
358
|
-
label="residue, #{mod_type}"
|
359
|
-
append_option(std_params, label, fm)
|
49
|
+
puts final_output_path
|
50
|
+
if Pathname.new(final_output_path).absolute?
|
51
|
+
output_base_path=Pathname.new(final_output_path).dirname.to_s
|
52
|
+
else
|
53
|
+
output_base_path="#{Dir.pwd}/#{Pathname.new(final_output_path).dirname.to_s}"
|
360
54
|
end
|
55
|
+
puts output_base_path
|
361
56
|
|
362
|
-
|
363
|
-
std_params
|
364
|
-
|
365
|
-
end
|
366
|
-
|
367
|
-
def motif?(mod_string)
|
368
|
-
# 124@[ is not a modification motif, it is a residue (N-term) modification,
|
369
|
-
# so when checking if modification is a motif look for paired square brackets.
|
370
|
-
mod_string =~ /[\(\)\{\}\!]/ or mod_string =~ /\[.*\]/
|
371
|
-
end
|
372
|
-
|
57
|
+
protein_db_info=search_tool.database_info
|
373
58
|
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
throw "Exactly one taxon label is required in the taxonomy_template file" unless taxon_label.length==1
|
378
|
-
taxon_label[0].attributes['label']=search_tool.database.downcase
|
379
|
-
|
380
|
-
db_file=taxo_doc.find('/bioml/taxon/file')
|
381
|
-
throw "Exactly one database file is required in the taxonomy_template file" unless db_file.length==1
|
382
|
-
db_file[0].attributes['URL']=current_db
|
383
|
-
|
384
|
-
taxo_doc
|
385
|
-
end
|
386
|
-
|
387
|
-
# Run the search engine on each input file
|
388
|
-
#
|
389
|
-
ARGV.each do |filename|
|
59
|
+
taxo_path="#{final_output_path}.taxonomy.xml"
|
60
|
+
taxo_doc = search_tool.taxonomy_doc(protein_db_info)
|
61
|
+
taxo_doc.save(taxo_path)
|
390
62
|
|
391
|
-
|
392
|
-
|
63
|
+
params_path="#{final_output_path}.params"
|
64
|
+
params_doc = search_tool.params_doc(protein_db_info,taxo_path,input_path,final_output_path)
|
65
|
+
params_doc.save(params_path)
|
393
66
|
|
394
|
-
if ( search_tool.explicit_output==nil )
|
395
|
-
pepxml_path="#{output_path.match(/(.*)\.tandem$/)[1]}.pep.xml"
|
396
|
-
else
|
397
|
-
pepxml_path=search_tool.explicit_output
|
398
|
-
end
|
399
|
-
|
400
|
-
output_exists=false
|
401
|
-
if ( !search_tool.no_pepxml && Pathname.new(pepxml_path).exist?)
|
402
|
-
output_exists=true
|
403
|
-
end
|
404
|
-
|
405
|
-
if ( search_tool.no_pepxml && Pathname.new(output_path).exist? )
|
406
|
-
output_exists=true
|
407
|
-
end
|
408
|
-
|
409
67
|
# Only proceed if the output file is not present or we have opted to over-write it
|
410
68
|
#
|
411
69
|
if ( search_tool.over_write || !output_exists )
|
412
70
|
|
413
|
-
# Create the taxonomy file in the same directory as the params file
|
414
|
-
#
|
415
|
-
taxo_path="#{search_tool.input_base_path(filename.chomp)}.taxonomy.xml"
|
416
|
-
mod_taxo_doc=generate_taxonomy_doc(taxo_doc,current_db,search_tool)
|
417
|
-
mod_taxo_doc.save(taxo_path)
|
418
|
-
|
419
|
-
# Modify the default XML document to contain search specific details and save it so it can be used in the search
|
420
|
-
#
|
421
|
-
mod_params=generate_parameter_doc(std_params,output_path,input_path,taxo_path,current_db,search_tool,genv)
|
422
|
-
params_path="#{search_tool.input_base_path(filename.chomp)}.tandem.params"
|
423
|
-
mod_params.save(params_path)
|
424
|
-
|
425
71
|
# The basic command
|
426
72
|
#
|
427
73
|
cmd= "#{tandem_bin} #{params_path}"
|
428
74
|
|
429
|
-
# pepXML conversion and repair
|
430
|
-
#
|
431
|
-
unless search_tool.no_pepxml
|
432
|
-
repair_script="#{File.dirname(__FILE__)}/repair_run_summary.rb"
|
433
|
-
cmd << "; Tandem2XML #{output_path} #{pepxml_path}; #{repair_script} #{pepxml_path}"
|
434
|
-
if search_tool.tandem_output
|
435
|
-
cmd << "; cp #{output_path} #{search_tool.tandem_output}"
|
436
|
-
else
|
437
|
-
cmd << "; rm #{output_path}"
|
438
|
-
end
|
439
|
-
end
|
440
|
-
|
441
75
|
# Add a cleanup command unless the user wants to keep params files
|
442
76
|
#
|
443
77
|
unless search_tool.keep_params_files
|
444
78
|
cmd << "; rm #{params_path}; rm #{taxo_path}"
|
445
79
|
end
|
446
|
-
|
447
|
-
# In case the user specified background running we need to create a jobscript path
|
448
|
-
#
|
449
|
-
jobscript_path="#{output_path}.pbs.sh"
|
450
80
|
|
451
81
|
# Run the search
|
452
82
|
#
|
453
|
-
|
454
|
-
job_params[:queue]="sixteen"
|
455
|
-
job_params[:vmem]="12gb"
|
456
|
-
code = search_tool.run(cmd,genv,job_params,jobscript_path)
|
83
|
+
code = search_tool.run(cmd,genv)
|
457
84
|
throw "Command failed with exit code #{code}" unless code==0
|
458
85
|
else
|
459
86
|
genv.log("Skipping search on existing file #{output_path}",:warn)
|
@@ -0,0 +1,43 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file is part of Protk
|
4
|
+
# Created by Ira Cooke 12/4/2010
|
5
|
+
#
|
6
|
+
# Convert tandem output files to pepxml. A wrapper for Tandem2XML
|
7
|
+
#
|
8
|
+
|
9
|
+
|
10
|
+
require 'protk/constants'
|
11
|
+
require 'protk/search_tool'
|
12
|
+
|
13
|
+
# Environment with global constants
|
14
|
+
#
|
15
|
+
genv=Constants.new
|
16
|
+
|
17
|
+
tool=SearchTool.new([:explicit_output,:over_write,:prefix])
|
18
|
+
tool.option_parser.banner = "Convert tandem files to pep.xml files.\n\nUsage: tandem_to_pepxml.rb [options] file1.dat file2.dat ... "
|
19
|
+
|
20
|
+
@output_suffix=""
|
21
|
+
|
22
|
+
exit unless tool.check_options(true)
|
23
|
+
|
24
|
+
binpath=%x[which Tandem2XML]
|
25
|
+
binpath.chomp!
|
26
|
+
|
27
|
+
|
28
|
+
ARGV.each do |filename|
|
29
|
+
|
30
|
+
throw "Input file #{filename} does not exist" unless File.exist?(filename)
|
31
|
+
|
32
|
+
if ( tool.explicit_output )
|
33
|
+
output_path=tool.explicit_output
|
34
|
+
else
|
35
|
+
output_path=Tool.default_output_path(filename.chomp,".pep.xml",tool.output_prefix,@output_suffix)
|
36
|
+
end
|
37
|
+
|
38
|
+
throw "Unable to find Tandem2XML" unless binpath=~/Tandem2XML/
|
39
|
+
cmd = "#{binpath} #{filename.chomp} #{output_path}"
|
40
|
+
|
41
|
+
code = tool.run(cmd,genv)
|
42
|
+
throw "Command #{cmd} failed with exit code #{code}" unless code==0
|
43
|
+
end
|
data/bin/unimod_to_loc.rb
CHANGED