protk 1.2.6.pre5 → 1.3.0.pre1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +84 -45
- data/bin/add_retention_times.rb +9 -5
- data/bin/augustus_to_proteindb.rb +7 -11
- data/bin/interprophet.rb +28 -46
- data/bin/make_decoy.rb +16 -48
- data/bin/mascot_search.rb +57 -71
- data/bin/mascot_to_pepxml.rb +13 -26
- data/bin/msgfplus_search.rb +70 -107
- data/bin/omssa_search.rb +52 -109
- data/bin/peptide_prophet.rb +44 -119
- data/bin/pepxml_to_table.rb +24 -27
- data/bin/protein_prophet.rb +22 -82
- data/bin/protxml_to_gff.rb +22 -519
- data/bin/protxml_to_table.rb +2 -16
- data/bin/sixframe.rb +10 -32
- data/bin/tandem_search.rb +30 -403
- data/bin/tandem_to_pepxml.rb +43 -0
- data/bin/unimod_to_loc.rb +1 -1
- data/ext/{protk/decoymaker → decoymaker}/decoymaker.c +74 -21
- data/ext/decoymaker/extconf.rb +3 -0
- data/lib/protk/constants.rb +16 -2
- data/lib/protk/data/default_config.yml +2 -1
- data/lib/protk/data/tandem_gpm_defaults.xml +175 -0
- data/lib/protk/data/tandem_isb_kscore_defaults.xml +123 -0
- data/lib/protk/data/tandem_isb_native_defaults.xml +123 -0
- data/lib/protk/data/tandem_params.xml +17 -54
- data/lib/protk/fastadb.rb +2 -2
- data/lib/protk/prophet_tool.rb +1 -1
- data/lib/protk/protxml_to_gff_tool.rb +474 -0
- data/lib/protk/search_tool.rb +58 -103
- data/lib/protk/setup_rakefile.rake +9 -5
- data/lib/protk/tandem_search_tool.rb +256 -0
- data/lib/protk/tool.rb +85 -104
- data/lib/protk.rb +1 -6
- metadata +24 -103
- data/bin/annotate_ids.rb +0 -59
- data/bin/asapratio.rb +0 -27
- data/bin/blastxml_to_table.rb +0 -119
- data/bin/correct_omssa_retention_times.rb +0 -27
- data/bin/feature_finder.rb +0 -95
- data/bin/file_convert.rb +0 -164
- data/bin/generate_omssa_loc.rb +0 -42
- data/bin/gffmerge.rb +0 -208
- data/bin/libra.rb +0 -70
- data/bin/toppas_pipeline.rb +0 -84
- data/bin/uniprot_annotation.rb +0 -141
- data/bin/xls_to_table.rb +0 -52
- data/bin/xpress.rb +0 -27
- data/ext/protk/decoymaker/extconf.rb +0 -3
- data/ext/protk/simplealign/extconf.rb +0 -3
- data/lib/protk/biotools_excel_converter.rb +0 -60
- data/lib/protk/eupathdb_gene_information_table.rb +0 -158
- data/lib/protk/gapped_aligner.rb +0 -264
- data/lib/protk/protein_annotator.rb +0 -646
- data/lib/protk/spreadsheet_extensions.rb +0 -79
- data/lib/protk/xtandem_defaults.rb +0 -11
data/bin/sixframe.rb
CHANGED
@@ -16,55 +16,33 @@ def check_coords(naseq,aaseq,frame,pstart,pend)
|
|
16
16
|
orf_from_coords=naseq[pstart-1..pend-1].translate(1)
|
17
17
|
else
|
18
18
|
orf_from_coords=naseq[pstart-1..pend-1].reverse_complement.translate(1)
|
19
|
-
# current coords give
|
20
|
-
# naseq.reverse_complement[pstart-1..pend-1].translate(1)
|
21
|
-
# naseq[350368-pend..(350367-pstart+1)].reverse_complement.translate(1)
|
22
|
-
# orf_from_coords=naseq[naseq.length-pend..naseq.length-pstart].reverse_complement.translate(1)
|
23
19
|
end
|
24
20
|
if ( orf_from_coords!=aaseq)
|
25
21
|
require 'debugger'; debugger
|
26
22
|
end
|
27
|
-
# p "#{aaseq} #{frame}"
|
28
23
|
end
|
29
24
|
|
30
25
|
|
31
26
|
tool=Tool.new([:explicit_output])
|
32
27
|
tool.option_parser.banner = "Create a sixframe translation of a genome.\n\nUsage: sixframe.rb [options] genome.fasta"
|
33
28
|
|
34
|
-
tool.
|
35
|
-
tool.
|
36
|
-
|
37
|
-
end
|
38
|
-
|
39
|
-
tool.options.keep_header=true
|
40
|
-
tool.option_parser.on( '--strip-header', 'Dont write sequence definition' ) do
|
41
|
-
tool.options.keep_header=false
|
42
|
-
end
|
29
|
+
tool.add_boolean_option(:print_coords,false,['--coords', 'Write genomic coordinates in the fasta header'])
|
30
|
+
tool.add_boolean_option(:keep_header,true,['--strip-header', 'Dont write sequence definition'])
|
31
|
+
tool.add_value_option(:min_len,20,['--min-len','Minimum ORF length to keep'])
|
43
32
|
|
44
|
-
exit unless tool.check_options
|
33
|
+
exit unless tool.check_options(true)
|
45
34
|
|
46
|
-
|
47
|
-
puts "You must supply an input file"
|
48
|
-
puts tool.option_parser
|
49
|
-
exit
|
50
|
-
end
|
35
|
+
input_file=ARGV[0]
|
51
36
|
|
52
|
-
|
37
|
+
output_file = tool.explicit_output!=nil ? tool.explicit_output : nil
|
53
38
|
|
54
|
-
|
55
|
-
if ( tool.explicit_output != nil)
|
56
|
-
outfile=File.open(tool.explicit_output,'w')
|
57
|
-
else
|
58
|
-
outfile=File.open("#{inname}.translated.fasta",'w')
|
59
|
-
end
|
39
|
+
output_fh = output_file!=nil ? File.new("#{output_file}",'w') : $stdout
|
60
40
|
|
61
41
|
|
62
|
-
file = Bio::FastaFormat.open(
|
42
|
+
file = Bio::FastaFormat.open(input_file)
|
63
43
|
|
64
44
|
file.each do |entry|
|
65
45
|
|
66
|
-
puts entry.entry_id
|
67
|
-
|
68
46
|
length = entry.naseq.length
|
69
47
|
|
70
48
|
(1...7).each do |frame|
|
@@ -76,7 +54,7 @@ file.each do |entry|
|
|
76
54
|
oi=0
|
77
55
|
orfs.each do |orf|
|
78
56
|
oi+=1
|
79
|
-
if ( orf.length >
|
57
|
+
if ( orf.length > tool.min_len )
|
80
58
|
|
81
59
|
position_start = position
|
82
60
|
position_end = position_start + orf.length*3 -1
|
@@ -109,7 +87,7 @@ file.each do |entry|
|
|
109
87
|
# Output in fasta format
|
110
88
|
# start and end positions are always relative to the forward strand
|
111
89
|
|
112
|
-
|
90
|
+
output_fh.write("#{defline}\n#{orf}\n")
|
113
91
|
|
114
92
|
end
|
115
93
|
position += orf.length*3+3
|
data/bin/tandem_search.rb
CHANGED
@@ -8,8 +8,7 @@
|
|
8
8
|
|
9
9
|
require 'protk/constants'
|
10
10
|
require 'protk/command_runner'
|
11
|
-
require 'protk/
|
12
|
-
require 'protk/xtandem_defaults'
|
11
|
+
require 'protk/tandem_search_tool'
|
13
12
|
require 'libxml'
|
14
13
|
|
15
14
|
include LibXML
|
@@ -17,443 +16,71 @@ include LibXML
|
|
17
16
|
# Environment with global constants
|
18
17
|
#
|
19
18
|
genv=Constants.new
|
19
|
+
search_tool=TandemSearchTool.new()
|
20
20
|
|
21
|
-
|
22
|
-
#
|
23
|
-
search_tool=SearchTool.new([:background,:glyco,:database,:explicit_output,:over_write,
|
24
|
-
:enzyme,:modifications,:mass_tolerance_units,:mass_tolerance,:strict_monoisotopic_mass,
|
25
|
-
:missed_cleavages,:cleavage_semi,:carbamidomethyl,:methionine_oxidation
|
26
|
-
])
|
27
|
-
search_tool.jobid_prefix="x"
|
28
|
-
search_tool.option_parser.banner = "Run an X!Tandem msms search on a set of mzML input files.\n\nUsage: tandem_search.rb [options] file1.mzML file2.mzML ..."
|
29
|
-
search_tool.options.output_suffix="_tandem"
|
30
|
-
|
31
|
-
tandem_defaults=XTandemDefaults.new.path
|
32
|
-
search_tool.options.tandem_params=tandem_defaults
|
33
|
-
search_tool.option_parser.on( '-T', '--tandem-params tandem', 'XTandem parameters to use' ) do |parms|
|
34
|
-
search_tool.options.tandem_params = parms
|
35
|
-
end
|
36
|
-
|
37
|
-
search_tool.options.no_pepxml=false
|
38
|
-
search_tool.option_parser.on( '-P', '--no-pepxml', 'Dont convert to pepXML after running the search') do
|
39
|
-
search_tool.options.no_pepxml=true
|
40
|
-
end
|
41
|
-
|
42
|
-
search_tool.options.keep_params_files=false
|
43
|
-
search_tool.option_parser.on( '-K', '--keep-params-files', 'Keep X!Tandem parameter files' ) do
|
44
|
-
search_tool.options.keep_params_files = true
|
45
|
-
end
|
46
|
-
|
47
|
-
# In case want pepXML, but still want tandem output also.
|
48
|
-
search_tool.options.tandem_output=nil
|
49
|
-
search_tool.option_parser.on( '--tandem-output tandem_output', 'Keep X! Tandem Output') do |tandem_output|
|
50
|
-
search_tool.options.tandem_output=tandem_output
|
51
|
-
end
|
52
|
-
|
53
|
-
search_tool.options.thresholds_type = 'isb_kscore'
|
54
|
-
search_tool.option_parser.on( '--thresholds-type thresholds_type', 'Threshold Type (tandem_default, isb_native, isb_kscore, scaffold, system_default)' ) do |thresholds_type|
|
55
|
-
# This options sets up various X! Tandem thresholds.
|
56
|
-
# - system_default: Don't change any defaults just use
|
57
|
-
# the defaults for this TPP install as is.
|
58
|
-
# - tandem_default: These thresholds are found on the
|
59
|
-
# tandem api page. http://www.thegpm.org/tandem/api/index.html
|
60
|
-
# - isb_native: These are the defaults found in
|
61
|
-
# isb_default_input_native.xml distributed with TPP 4.6.
|
62
|
-
# - isb_kscore: These are the defaults found in
|
63
|
-
# isb_default_input_kscore.xml distributed with TPP 4.6.
|
64
|
-
# - scaffold: These are the defaults recommend by Proteome Software
|
65
|
-
# for use with Scaffold.
|
66
|
-
search_tool.options.thresholds_type = thresholds_type
|
67
|
-
end
|
68
|
-
|
69
|
-
search_tool.options.algorithm = "kscore"
|
70
|
-
search_tool.option_parser.on( '--algorithm algorithm', "Scoring algorithm (kscore or native)" ) do |algorithm|
|
71
|
-
search_tool.options.algorithm = algorithm
|
72
|
-
end
|
73
|
-
|
74
|
-
search_tool.options.cleavage_semi = true
|
75
|
-
search_tool.option_parser.on( '--no-cleavage-semi' ) do
|
76
|
-
search_tool.options.cleavage_semi = false
|
77
|
-
end
|
21
|
+
exit unless search_tool.check_options(true)
|
78
22
|
|
79
|
-
|
80
|
-
search_tool.options.n_terminal_mod_mass=nil
|
81
|
-
search_tool.option_parser.on('--n-terminal-mod-mass mass') do |mass|
|
82
|
-
search_tool.options.n_terminal_mod_mass = mass
|
83
|
-
end
|
84
|
-
|
85
|
-
search_tool.options.c_terminal_mod_mass=nil
|
86
|
-
search_tool.option_parser.on('--c-terminal-mod-mass mass') do |mass|
|
87
|
-
search_tool.options.c_terminal_mod_mass = mass
|
88
|
-
end
|
89
|
-
|
90
|
-
search_tool.options.cleavage_n_terminal_mod_mass=nil
|
91
|
-
search_tool.option_parser.on('--cleavage-n-terminal-mod-mass mass') do |mass|
|
92
|
-
search_tool.options.cleavage_n_terminal_mod_mass = mass
|
93
|
-
end
|
94
|
-
|
95
|
-
search_tool.options.cleavage_c_terminal_mod_mass=nil
|
96
|
-
search_tool.option_parser.on('--cleavage-c-terminal-mod-mass mass') do |mass|
|
97
|
-
search_tool.options.cleavage_c_terminal_mod_mass = mass
|
98
|
-
end
|
99
|
-
|
100
|
-
exit unless search_tool.check_options
|
101
|
-
|
102
|
-
if ( ARGV[0].nil? )
|
103
|
-
puts "You must supply an input file"
|
104
|
-
puts search_tool.option_parser
|
105
|
-
exit
|
106
|
-
end
|
107
|
-
|
108
|
-
|
109
|
-
# Set search engine specific parameters on the SearchTool object
|
23
|
+
# Our environment should be setup so that tandem or tandem.exe is on the path
|
110
24
|
#
|
111
|
-
tandem_bin
|
25
|
+
tandem_bin=%x[which tandem].chomp
|
26
|
+
tandem_bin=%x[which tandem.exe].chomp unless tandem_bin && tandem_bin.length>0
|
112
27
|
|
113
|
-
|
28
|
+
@output_suffix="_tandem"
|
114
29
|
|
115
|
-
|
116
|
-
when Pathname.new(search_tool.database).exist? # It's an explicitly named db
|
117
|
-
current_db=Pathname.new(search_tool.database).realpath.to_s
|
118
|
-
else
|
119
|
-
current_db=search_tool.current_database :fasta
|
120
|
-
end
|
121
|
-
|
122
|
-
|
123
|
-
# Parse options from a parameter file (if provided), or from the default parameter file
|
124
|
-
#
|
125
|
-
params_parser=XML::Parser.file(tandem_params)
|
126
|
-
std_params=params_parser.parse
|
127
|
-
|
128
|
-
# Parse taxonomy template file
|
129
|
-
#
|
130
|
-
taxo_parser=XML::Parser.file(XTandemDefaults.new.taxonomy_path)
|
131
|
-
taxo_doc=taxo_parser.parse
|
132
|
-
|
133
|
-
# Galaxy changes things like @ to __at__ we need to change it back
|
30
|
+
# Run the search engine on each input file
|
134
31
|
#
|
135
|
-
|
136
|
-
mstring.gsub!("__at__","@")
|
137
|
-
mstring.gsub!("__oc__","{")
|
138
|
-
mstring.gsub!("__cc__","}")
|
139
|
-
mstring.gsub!("__ob__","[")
|
140
|
-
mstring.gsub!("__cb__","]")
|
141
|
-
mstring
|
142
|
-
end
|
32
|
+
ARGV.each do |filename|
|
143
33
|
|
144
|
-
|
145
|
-
notes = std_params.find("/bioml/note[@type=\"input\" and @label=\"#{tandem_key}\"]")
|
146
|
-
throw "Exactly one parameter named (#{tandem_key}) is required in parameter file" unless notes.length==1
|
147
|
-
notes[0].content=value
|
148
|
-
end
|
34
|
+
throw "Input file #{filename} does not exist" unless File.exist?(filename)
|
149
35
|
|
150
|
-
|
151
|
-
|
152
|
-
if notes.length == 0
|
153
|
-
node = XML::Node.new('note')
|
154
|
-
node["type"] = "input"
|
155
|
-
node["label"] = tandem_key
|
156
|
-
node.content = value
|
157
|
-
std_params.find('/bioml')[0] << node
|
158
|
-
else
|
159
|
-
throw "Exactly one parameter named (#{tandem_key}) is required in parameter file" unless notes.length==1
|
160
|
-
notes[0].content = append_string(notes[0].content, value)
|
161
|
-
end
|
162
|
-
end
|
36
|
+
input_path=Pathname.new(filename.chomp).expand_path.to_s
|
37
|
+
output_path=Tool.default_output_path(input_path,".tandem",search_tool.output_prefix,@output_suffix)
|
163
38
|
|
164
|
-
def collapse_keys(std_params, tandem_key)
|
165
|
-
mods=std_params.find('/bioml/note[@type="input" and @label="#{tandem_key}"]')
|
166
|
-
if not mods
|
167
|
-
first_mod = mods[0]
|
168
|
-
rest_mods = mods[1..-1]
|
169
|
-
rest_mods.each{ |node| first_mod.content = append_string(first_mod.content, node.content); node.remove!}
|
170
|
-
end
|
171
|
-
end
|
172
39
|
|
173
|
-
|
174
|
-
|
175
|
-
second
|
40
|
+
if ( search_tool.explicit_output )
|
41
|
+
final_output_path=search_tool.explicit_output
|
176
42
|
else
|
177
|
-
|
178
|
-
end
|
179
|
-
end
|
180
|
-
|
181
|
-
def generate_parameter_doc(std_params,output_path,input_path,taxo_path,current_db,search_tool,genv)
|
182
|
-
set_option(std_params, "protein, cleavage semi", search_tool.cleavage_semi ? "yes" : "no")
|
183
|
-
set_option(std_params, "scoring, maximum missed cleavage sites", search_tool.missed_cleavages)
|
184
|
-
|
185
|
-
# Set the input and output paths
|
186
|
-
#
|
187
|
-
input_notes=std_params.find('/bioml/note[@type="input" and @label="spectrum, path"]')
|
188
|
-
throw "Exactly one spectrum, path note is required in the parameter file" unless input_notes.length==1
|
189
|
-
input_notes[0].content=input_path
|
190
|
-
|
191
|
-
output_notes=std_params.find('/bioml/note[@type="input" and @label="output, path"]')
|
192
|
-
throw "Exactly one output, path note is required in the parameter file" unless output_notes.length==1
|
193
|
-
output_notes[0].content=output_path
|
194
|
-
|
195
|
-
# Set the path to the scoring algorithm default params. We use one from ISB
|
196
|
-
#
|
197
|
-
scoring_notes=std_params.find('/bioml/note[@type="input" and @label="list path, default parameters"]')
|
198
|
-
throw "Exactly one list path, default parameters note is required in the parameter file" unless scoring_notes.length==1
|
199
|
-
scoring_notes[0].content="#{genv.tpp_root}/bin/isb_default_input_#{search_tool.algorithm}.xml"
|
200
|
-
|
201
|
-
# Taxonomy and Database
|
202
|
-
#
|
203
|
-
db_notes=std_params.find('/bioml/note[@type="input" and @label="protein, taxon"]')
|
204
|
-
throw "Exactly one protein, taxon note is required in the parameter file" unless db_notes.length==1
|
205
|
-
db_notes[0].content=search_tool.database.downcase
|
206
|
-
|
207
|
-
taxo_notes=std_params.find('/bioml/note[@type="input" and @label="list path, taxonomy information"]')
|
208
|
-
throw "Exactly one list path, taxonomy information note is required in the parameter file" unless taxo_notes.length==1
|
209
|
-
taxo_notes[0].content=taxo_path
|
210
|
-
|
211
|
-
fragment_tol = search_tool.fragment_tol
|
212
|
-
|
213
|
-
fmass=std_params.find('/bioml/note[@type="input" and @label="spectrum, fragment monoisotopic mass error"]')
|
214
|
-
p fmass
|
215
|
-
throw "Exactly one spectrum, fragment monoisotopic mass error note is required in the parameter file" unless fmass.length==1
|
216
|
-
fmass[0].content=fragment_tol.to_s
|
217
|
-
|
218
|
-
precursor_tol = search_tool.precursor_tol
|
219
|
-
ptol_plus=precursor_tol*0.5
|
220
|
-
ptol_minus=precursor_tol*0.5
|
221
|
-
|
222
|
-
# Precursor mass matching
|
223
|
-
#
|
224
|
-
pmass_minus=std_params.find('/bioml/note[@type="input" and @label="spectrum, parent monoisotopic mass error minus"]')
|
225
|
-
throw "Exactly one spectrum, parent monoisotopic mass error minus note is required in the parameter file" unless pmass_minus.length==1
|
226
|
-
pmass_minus[0].content=ptol_minus.to_s
|
227
|
-
|
228
|
-
pmass_plus=std_params.find('/bioml/note[@type="input" and @label="spectrum, parent monoisotopic mass error plus"]')
|
229
|
-
throw "Exactly one spectrum, parent monoisotopic mass error plus note is required in the parameter file" unless pmass_plus.length==1
|
230
|
-
pmass_plus[0].content=ptol_plus.to_s
|
231
|
-
|
232
|
-
pmass_err_units=std_params.find('/bioml/note[@type="input" and @label="spectrum, parent monoisotopic mass error units"]')
|
233
|
-
throw "Exactly one spectrum, parent monoisotopic mass error units note is required in the parameter file. Got #{pmass_err_units.length}" unless pmass_err_units.length==1
|
234
|
-
|
235
|
-
|
236
|
-
pmass_err_units[0].content=search_tool.precursor_tolu
|
237
|
-
|
238
|
-
if search_tool.strict_monoisotopic_mass
|
239
|
-
isotopic_error=std_params.find('/bioml/note[@type="input" and @label="spectrum, parent monoisotopic mass isotope error"]')
|
240
|
-
throw "Exactly one spectrum, parent monoisotopic mass isotope error is required in the parameter file" unless isotopic_error.length==1
|
241
|
-
isotopic_error[0].content="no"
|
43
|
+
final_output_path=output_path
|
242
44
|
end
|
243
45
|
|
244
|
-
if search_tool.tandem_output
|
245
|
-
# If one is interested in the tandem output (e.g. for consumption by Scaffold)
|
246
|
-
# want to store additional information.
|
247
|
-
set_option(std_params, "output, spectra", "yes")
|
248
|
-
end
|
249
46
|
|
250
|
-
|
47
|
+
output_exists=Pathname.new(final_output_path).exist?
|
251
48
|
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
end
|
258
|
-
|
259
|
-
minimum_ion_count = "4"
|
260
|
-
case thresholds_type
|
261
|
-
when "isb_kscore", "isb_native"
|
262
|
-
minimum_ion_count = "1"
|
263
|
-
when "scaffold"
|
264
|
-
minimum_ion_count = "0"
|
265
|
-
end
|
266
|
-
|
267
|
-
minimum_peaks = "15"
|
268
|
-
case thresholds_type
|
269
|
-
when "isb_native"
|
270
|
-
minimum_peaks = "6"
|
271
|
-
when "isb_kscore"
|
272
|
-
minimum_peaks = "10"
|
273
|
-
when "scaffold"
|
274
|
-
minimum_peaks = "0"
|
275
|
-
end
|
276
|
-
|
277
|
-
minimum_fragement_mz = "150"
|
278
|
-
case thresholds_type
|
279
|
-
when "isb_native"
|
280
|
-
minimum_fragement_mz = "50"
|
281
|
-
when "isb_kscore"
|
282
|
-
minimum_fragement_mz = "125"
|
283
|
-
when "scaffold"
|
284
|
-
minimum_fragement_mz = "0"
|
285
|
-
end
|
286
|
-
|
287
|
-
minimum_parent_mh = "500" # tandem and isb_native defaults
|
288
|
-
case thresholds_type
|
289
|
-
when "isb_kscore"
|
290
|
-
minimum_parent_mh = "600"
|
291
|
-
when "scaffold"
|
292
|
-
minimum_parent_mh = "0"
|
293
|
-
end
|
294
|
-
|
295
|
-
use_noise_suppression = "yes"
|
296
|
-
if thresholds_type == "isb_kscore" or thresholds_type == "scaffold"
|
297
|
-
use_noise_suppression = "no"
|
298
|
-
end
|
299
|
-
|
300
|
-
dynamic_range = "100.0"
|
301
|
-
case thresholds_type
|
302
|
-
when "isb_kscore"
|
303
|
-
dynamic_range = "10000.0"
|
304
|
-
when "scaffold"
|
305
|
-
dynamic_range = "1000.0"
|
306
|
-
end
|
307
|
-
|
308
|
-
set_option(std_params, "spectrum, dynamic range", dynamic_range)
|
309
|
-
set_option(std_params, "spectrum, use noise suppression", use_noise_suppression)
|
310
|
-
set_option(std_params, "spectrum, minimum parent m+h", minimum_parent_mh)
|
311
|
-
set_option(std_params, "spectrum, minimum fragment mz", minimum_fragement_mz)
|
312
|
-
set_option(std_params, "spectrum, minimum peaks", minimum_peaks)
|
313
|
-
set_option(std_params, "scoring, minimum ion count", minimum_ion_count)
|
314
|
-
set_option(std_params, "output, maximum valid expectation value", maximum_valid_expectation_value)
|
315
|
-
end
|
316
|
-
|
317
|
-
# Fixed and Variable Modifications
|
318
|
-
#
|
319
|
-
unless search_tool.carbamidomethyl
|
320
|
-
mods=std_params.find('/bioml/note[@type="input" and @id="carbamidomethyl-fixed"]')
|
321
|
-
mods.each{ |node| node.remove!}
|
322
|
-
end
|
323
|
-
|
324
|
-
unless search_tool.glyco
|
325
|
-
mods=std_params.find('/bioml/note[@type="input" and @id="glyco-variable"]')
|
326
|
-
mods.each{ |node| node.remove!}
|
327
|
-
end
|
328
|
-
|
329
|
-
unless search_tool.methionine_oxidation
|
330
|
-
mods=std_params.find('/bioml/note[@type="input" and @id="methionine-oxidation-variable"]')
|
331
|
-
mods.each{ |node| node.remove!}
|
332
|
-
end
|
333
|
-
|
334
|
-
# Merge all remaining id based modification into single modification.
|
335
|
-
collapse_keys(std_params, "residue, potential modification mass")
|
336
|
-
collapse_keys(std_params, "residue, modification mass")
|
337
|
-
|
338
|
-
var_mods = search_tool.var_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject {|e| e.empty? }
|
339
|
-
var_mods=var_mods.collect {|mod| decode_modification_string(mod) }
|
340
|
-
fix_mods = search_tool.fix_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject { |e| e.empty? }
|
341
|
-
fix_mods=fix_mods.collect {|mod| decode_modification_string(mod)}
|
342
|
-
|
343
|
-
root_bioml_node=std_params.find('/bioml')[0]
|
344
|
-
|
345
|
-
mod_id=1
|
346
|
-
var_mods.each do |vm|
|
347
|
-
|
348
|
-
mod_type="potential modification mass"
|
349
|
-
mod_type = "potential modification motif" if motif?(vm)
|
350
|
-
label="residue, #{mod_type}"
|
351
|
-
append_option(std_params, label, vm)
|
352
|
-
end
|
353
|
-
|
354
|
-
mod_id=1
|
355
|
-
fix_mods.each do |fm|
|
356
|
-
mod_type="modification mass"
|
357
|
-
mod_type = "modification motif" if motif?(fm)
|
358
|
-
label="residue, #{mod_type}"
|
359
|
-
append_option(std_params, label, fm)
|
49
|
+
puts final_output_path
|
50
|
+
if Pathname.new(final_output_path).absolute?
|
51
|
+
output_base_path=Pathname.new(final_output_path).dirname.to_s
|
52
|
+
else
|
53
|
+
output_base_path="#{Dir.pwd}/#{Pathname.new(final_output_path).dirname.to_s}"
|
360
54
|
end
|
55
|
+
puts output_base_path
|
361
56
|
|
362
|
-
|
363
|
-
std_params
|
364
|
-
|
365
|
-
end
|
366
|
-
|
367
|
-
def motif?(mod_string)
|
368
|
-
# 124@[ is not a modification motif, it is a residue (N-term) modification,
|
369
|
-
# so when checking if modification is a motif look for paired square brackets.
|
370
|
-
mod_string =~ /[\(\)\{\}\!]/ or mod_string =~ /\[.*\]/
|
371
|
-
end
|
372
|
-
|
57
|
+
protein_db_info=search_tool.database_info
|
373
58
|
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
throw "Exactly one taxon label is required in the taxonomy_template file" unless taxon_label.length==1
|
378
|
-
taxon_label[0].attributes['label']=search_tool.database.downcase
|
379
|
-
|
380
|
-
db_file=taxo_doc.find('/bioml/taxon/file')
|
381
|
-
throw "Exactly one database file is required in the taxonomy_template file" unless db_file.length==1
|
382
|
-
db_file[0].attributes['URL']=current_db
|
383
|
-
|
384
|
-
taxo_doc
|
385
|
-
end
|
386
|
-
|
387
|
-
# Run the search engine on each input file
|
388
|
-
#
|
389
|
-
ARGV.each do |filename|
|
59
|
+
taxo_path="#{final_output_path}.taxonomy.xml"
|
60
|
+
taxo_doc = search_tool.taxonomy_doc(protein_db_info)
|
61
|
+
taxo_doc.save(taxo_path)
|
390
62
|
|
391
|
-
|
392
|
-
|
63
|
+
params_path="#{final_output_path}.params"
|
64
|
+
params_doc = search_tool.params_doc(protein_db_info,taxo_path,input_path,final_output_path)
|
65
|
+
params_doc.save(params_path)
|
393
66
|
|
394
|
-
if ( search_tool.explicit_output==nil )
|
395
|
-
pepxml_path="#{output_path.match(/(.*)\.tandem$/)[1]}.pep.xml"
|
396
|
-
else
|
397
|
-
pepxml_path=search_tool.explicit_output
|
398
|
-
end
|
399
|
-
|
400
|
-
output_exists=false
|
401
|
-
if ( !search_tool.no_pepxml && Pathname.new(pepxml_path).exist?)
|
402
|
-
output_exists=true
|
403
|
-
end
|
404
|
-
|
405
|
-
if ( search_tool.no_pepxml && Pathname.new(output_path).exist? )
|
406
|
-
output_exists=true
|
407
|
-
end
|
408
|
-
|
409
67
|
# Only proceed if the output file is not present or we have opted to over-write it
|
410
68
|
#
|
411
69
|
if ( search_tool.over_write || !output_exists )
|
412
70
|
|
413
|
-
# Create the taxonomy file in the same directory as the params file
|
414
|
-
#
|
415
|
-
taxo_path="#{search_tool.input_base_path(filename.chomp)}.taxonomy.xml"
|
416
|
-
mod_taxo_doc=generate_taxonomy_doc(taxo_doc,current_db,search_tool)
|
417
|
-
mod_taxo_doc.save(taxo_path)
|
418
|
-
|
419
|
-
# Modify the default XML document to contain search specific details and save it so it can be used in the search
|
420
|
-
#
|
421
|
-
mod_params=generate_parameter_doc(std_params,output_path,input_path,taxo_path,current_db,search_tool,genv)
|
422
|
-
params_path="#{search_tool.input_base_path(filename.chomp)}.tandem.params"
|
423
|
-
mod_params.save(params_path)
|
424
|
-
|
425
71
|
# The basic command
|
426
72
|
#
|
427
73
|
cmd= "#{tandem_bin} #{params_path}"
|
428
74
|
|
429
|
-
# pepXML conversion and repair
|
430
|
-
#
|
431
|
-
unless search_tool.no_pepxml
|
432
|
-
repair_script="#{File.dirname(__FILE__)}/repair_run_summary.rb"
|
433
|
-
cmd << "; Tandem2XML #{output_path} #{pepxml_path}; #{repair_script} #{pepxml_path}"
|
434
|
-
if search_tool.tandem_output
|
435
|
-
cmd << "; cp #{output_path} #{search_tool.tandem_output}"
|
436
|
-
else
|
437
|
-
cmd << "; rm #{output_path}"
|
438
|
-
end
|
439
|
-
end
|
440
|
-
|
441
75
|
# Add a cleanup command unless the user wants to keep params files
|
442
76
|
#
|
443
77
|
unless search_tool.keep_params_files
|
444
78
|
cmd << "; rm #{params_path}; rm #{taxo_path}"
|
445
79
|
end
|
446
|
-
|
447
|
-
# In case the user specified background running we need to create a jobscript path
|
448
|
-
#
|
449
|
-
jobscript_path="#{output_path}.pbs.sh"
|
450
80
|
|
451
81
|
# Run the search
|
452
82
|
#
|
453
|
-
|
454
|
-
job_params[:queue]="sixteen"
|
455
|
-
job_params[:vmem]="12gb"
|
456
|
-
code = search_tool.run(cmd,genv,job_params,jobscript_path)
|
83
|
+
code = search_tool.run(cmd,genv)
|
457
84
|
throw "Command failed with exit code #{code}" unless code==0
|
458
85
|
else
|
459
86
|
genv.log("Skipping search on existing file #{output_path}",:warn)
|
@@ -0,0 +1,43 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file is part of Protk
|
4
|
+
# Created by Ira Cooke 12/4/2010
|
5
|
+
#
|
6
|
+
# Convert tandem output files to pepxml. A wrapper for Tandem2XML
|
7
|
+
#
|
8
|
+
|
9
|
+
|
10
|
+
require 'protk/constants'
|
11
|
+
require 'protk/search_tool'
|
12
|
+
|
13
|
+
# Environment with global constants
|
14
|
+
#
|
15
|
+
genv=Constants.new
|
16
|
+
|
17
|
+
tool=SearchTool.new([:explicit_output,:over_write,:prefix])
|
18
|
+
tool.option_parser.banner = "Convert tandem files to pep.xml files.\n\nUsage: tandem_to_pepxml.rb [options] file1.dat file2.dat ... "
|
19
|
+
|
20
|
+
@output_suffix=""
|
21
|
+
|
22
|
+
exit unless tool.check_options(true)
|
23
|
+
|
24
|
+
binpath=%x[which Tandem2XML]
|
25
|
+
binpath.chomp!
|
26
|
+
|
27
|
+
|
28
|
+
ARGV.each do |filename|
|
29
|
+
|
30
|
+
throw "Input file #{filename} does not exist" unless File.exist?(filename)
|
31
|
+
|
32
|
+
if ( tool.explicit_output )
|
33
|
+
output_path=tool.explicit_output
|
34
|
+
else
|
35
|
+
output_path=Tool.default_output_path(filename.chomp,".pep.xml",tool.output_prefix,@output_suffix)
|
36
|
+
end
|
37
|
+
|
38
|
+
throw "Unable to find Tandem2XML" unless binpath=~/Tandem2XML/
|
39
|
+
cmd = "#{binpath} #{filename.chomp} #{output_path}"
|
40
|
+
|
41
|
+
code = tool.run(cmd,genv)
|
42
|
+
throw "Command #{cmd} failed with exit code #{code}" unless code==0
|
43
|
+
end
|
data/bin/unimod_to_loc.rb
CHANGED