mspire 0.1.7 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (57) hide show
  1. data/Rakefile +41 -14
  2. data/bin/bioworks2excel.rb +1 -1
  3. data/bin/bioworks_to_pepxml.rb +46 -59
  4. data/bin/fasta_shaker.rb +1 -1
  5. data/bin/filter.rb +6 -0
  6. data/bin/find_aa_freq.rb +23 -0
  7. data/bin/id_precision.rb +3 -2
  8. data/bin/mzxml_to_lmat.rb +2 -1
  9. data/bin/pepproph_filter.rb +1 -1
  10. data/bin/precision.rb +1 -1
  11. data/bin/protein_summary.rb +2 -451
  12. data/bin/raw_to_mzXML.rb +55 -0
  13. data/bin/srf_group.rb +26 -0
  14. data/changelog.txt +7 -0
  15. data/lib/align.rb +3 -3
  16. data/lib/fasta.rb +6 -1
  17. data/lib/gi.rb +9 -4
  18. data/lib/roc.rb +2 -0
  19. data/lib/sample_enzyme.rb +2 -1
  20. data/lib/spec/mzxml/parser.rb +2 -43
  21. data/lib/spec/mzxml.rb +65 -2
  22. data/lib/spec_id/aa_freqs.rb +10 -7
  23. data/lib/spec_id/bioworks.rb +67 -87
  24. data/lib/spec_id/filter.rb +794 -0
  25. data/lib/spec_id/precision.rb +29 -36
  26. data/lib/spec_id/proph.rb +5 -3
  27. data/lib/spec_id/protein_summary.rb +459 -0
  28. data/lib/spec_id/sequest.rb +323 -271
  29. data/lib/spec_id/srf.rb +189 -135
  30. data/lib/spec_id.rb +276 -227
  31. data/lib/spec_id_xml.rb +101 -0
  32. data/lib/toppred.rb +18 -0
  33. data/script/degenerate_peptides.rb +47 -0
  34. data/script/filter-peps.rb +5 -1
  35. data/test/tc_align.rb +1 -1
  36. data/test/tc_bioworks.rb +25 -22
  37. data/test/tc_bioworks_to_pepxml.rb +37 -4
  38. data/test/tc_fasta.rb +3 -1
  39. data/test/tc_fasta_shaker.rb +8 -6
  40. data/test/tc_filter.rb +203 -0
  41. data/test/tc_gi.rb +6 -9
  42. data/test/tc_id_precision.rb +31 -0
  43. data/test/tc_mzxml.rb +8 -6
  44. data/test/tc_peptide_parent_times.rb +2 -1
  45. data/test/tc_precision.rb +1 -1
  46. data/test/tc_proph.rb +5 -5
  47. data/test/tc_protein_summary.rb +36 -13
  48. data/test/tc_sequest.rb +78 -33
  49. data/test/tc_spec_id.rb +128 -6
  50. data/test/tc_srf.rb +84 -38
  51. metadata +67 -62
  52. data/bin/fasta_cat.rb +0 -39
  53. data/bin/fasta_cat_mod.rb +0 -59
  54. data/bin/fasta_mod.rb +0 -57
  55. data/bin/filter_spec_id.rb +0 -365
  56. data/bin/raw2mzXML.rb +0 -21
  57. data/script/gen_database_searching.rb +0 -258
data/bin/fasta_mod.rb DELETED
@@ -1,57 +0,0 @@
1
- #!/usr/bin/ruby
2
-
3
- require 'fasta'
4
- require 'optparse'
5
-
6
- hash = {
7
- 'shuffle' => {
8
- 'method' => :aaseq_shuffle!,
9
- 'file_postfix' => Fasta::SHUFF_FILE_POSTFIX,
10
- },
11
- 'invert' => {
12
- 'method' => :aaseq_invert!,
13
- 'file_postfix' => Fasta::INV_FILE_POSTFIX,
14
- },
15
- }
16
-
17
- opt = {}
18
- OptionParser.new do |opts|
19
- opts.on("-p", "--prefix PREFIX", "add prefix to protein header") {|v| opt['p'] = v }
20
- end.parse!
21
- #opts = GetoptLong.new(["-p", "--prefix", GetoptLong::REQUIRED_ARGUMENT])
22
-
23
-
24
- if ARGV.size < 2
25
- puts "
26
- usage: #{File.basename(__FILE__)} [-p <prefix>] <method> <file>.fasta ...
27
-
28
- AA seq's will be modified according to <method>. Each file takes on a
29
- postfix (before the extension).
30
-
31
- -p prefix each protein's header will take on a header prefix after the '>'.
32
-
33
- Method FILE_POSTFIX (added before the file extension)
34
- shuffle #{hash['shuffle']['file_postfix']}
35
- invert #{hash['invert']['file_postfix']}
36
- "
37
- exit
38
- end
39
-
40
- prefix = nil; if opt.key?('p') then prefix = opt['p'] end
41
- method = ARGV.shift
42
-
43
- cmd_hash = nil
44
- if hash.key?(method)
45
- cmd_hash = hash[method]
46
- else
47
- puts "Not a defined method: #{method}"
48
- puts "type #{File.basename(__FILE__)} for usage"
49
- exit
50
- end
51
-
52
- ARGV.each do |fn|
53
- outfile = Fasta.modify_file(fn, cmd_hash['method'], cmd_hash['file_postfix'], prefix)
54
- puts "OUTPUT: #{outfile}"
55
- end
56
-
57
-
data/bin/filter_spec_id.rb DELETED
@@ -1,365 +0,0 @@
1
- #!/usr/bin/ruby -w
2
-
3
- require 'spec_id'
4
- require 'optparse'
5
- require 'ostruct'
6
- require 'spec_id/aa_freqs'
7
-
8
- ########################################################
9
- WRITE_MARSHAL = true
10
- TABULATE_DATA = true
11
- WRITE_CYS_FIND = false
12
- ########################################################
13
-
14
- opt = OpenStruct.new
15
- opt.x1 = 1.0
16
- opt.x2 = 1.5
17
- opt.x3 = 2.0
18
- opt.c = 0.5
19
- opt.rppm = 1000.0
20
- opt.false = false
21
-
22
- # prints shortened number for display
23
- def short(num)
24
- sprintf( "%.3f",num)
25
- end
26
-
27
- opts = OptionParser.new do |op|
28
- op.banner = "usage: #{File.basename(__FILE__)} [OPTS] bioworks.xml"
29
- op.separator("prints number of proteins (and FPR if -f option)")
30
- op.separator ""
31
-
32
- op.separator("** only takes the top hit per scan+charge")
33
- op.separator("** 'dcn*' is the number of peptides with deltacn == 1.1")
34
- op.separator(" (these are peptides who are the only hit with xcorr > 0)")
35
- op.separator ""
36
- op.on("-1", "--xcorr1 <f>", "xcorr for +1 charge d: #{opt.x1}") {|v| opt.x1 = v.to_f}
37
- op.on("-2", "--xcorr2 <f>", "xcorr for +2 charge d: #{opt.x2}") {|v| opt.x2 = v.to_f}
38
- op.on("-3", "--xcorr3 <f>", "xcorr for +3 charge d: #{opt.x3}") {|v| opt.x3 = v.to_f}
39
- op.on("-c", "--deltacn <f>", ">= deltacn d: #{opt.c}") {|v| opt.c = v.to_f}
40
- op.on("--rppm <f>", "<= rough ppm (10^6*deltamass/mass) d: #{opt.rppm}") {|v| opt.rppm = v.to_f}
41
- op.on("-i", "--interactive", "interactive filtering") {|v| opt.i = v}
42
- op.on("-f", "--false <s>", "protein prefix or file name of decoys") {|v| opt.false = v}
43
- op.on("-y", "--cysteines <fasta_file|freq>", "report fpr by expected cysteine freq") {|v| opt.cysteines = v}
44
- op.on("--cback <mean,stdev>", "the cysteine background") {|v| opt.cback = v}
45
- op.on("--from_file <file>", "(no -i) file with list of interactive input") {|v| opt.from_file = v}
46
- op.on("-t", "--true_pos <fasta>", "fasta file containing true hits") {|v| opt.true_pos = v }
47
- end
48
-
49
- $cys_mean = nil
50
- $cys_stdev = nil
51
-
52
-
53
- # fpr is a SpecID obj that is the false positives
54
- # cysteines holds an aafreqs object or nil
55
- def filter_round(files, spec_ids, kind, args, fpr=nil, cysteines=nil, interactive=false)
56
- (x1, x2, x3, deltacn, rppm) = args
57
- combined_score = x1 + x2 + x3 + 20.0*deltacn + 4000.0*(1.0/rppm)
58
- puts "=========================================================================="
59
- puts "[[ xcorr(1,2,3) >= #{x1},#{x2},#{x3} ; deltacn >= #{deltacn} ; rough_ppm <= #{rppm} ]]"
60
- # push fpr on the end for the calculations
61
- if fpr ; spec_ids.push(fpr) ; end
62
- arr_of_prots_and_peps_and_deltacnstars_and_cfpr = spec_ids.map do |spec_id|
63
- (prots, peps, deltacnstar_cnt) = spec_id.filter(kind, *args)
64
- if cysteines
65
-
66
- if cysteines.is_a? Float
67
- freq = cysteines
68
- else
69
- freq = cysteines.aafreqs[:C]
70
- end
71
- (ac, exp) = SpecID::AAFreqs.new.actual_and_expected_number_containing_cysteines(peps, freq)
72
- [prots, peps, deltacnstar_cnt, [ac,exp]]
73
- else
74
- [prots, peps, deltacnstar_cnt]
75
- end
76
- end
77
- arr_of_num_of_prots = arr_of_prots_and_peps_and_deltacnstars_and_cfpr.map {|ar| ar[0].size }
78
- arr_of_num_of_peps = arr_of_prots_and_peps_and_deltacnstars_and_cfpr.map {|ar| ar[1].size }
79
- deltacnstars = arr_of_prots_and_peps_and_deltacnstars_and_cfpr.map {|ar| ar[2] }
80
- cys_reports = arr_of_prots_and_peps_and_deltacnstars_and_cfpr.map {|ar| ar[3] } if cysteines
81
- prot_nums = arr_of_num_of_prots
82
- pep_nums = arr_of_num_of_peps
83
- ## files = [file1, file2, file3]
84
- ## prot_nums = [nums1, nums2, nums3, nums_for_false_positives]
85
- ## pep_nums = [nums1, nums2, nums3, nums_for_false_positives]
86
- files.each_with_index do |file,i|
87
-
88
- if !interactive
89
- puts "#{file} [prots]:\t#{prot_nums[i]}"
90
- puts "#{file} [peps]:\t#{pep_nums[i]} (dcn*=#{deltacnstars[i]})"
91
- else
92
- puts "file#{i+1} [prots]: #{prot_nums[i]}"
93
- puts "file#{i+1} [peps]: #{pep_nums[i]} (dcn*=#{deltacnstars[i]})"
94
- end
95
- if fpr
96
- #puts "FPR [prots] : " + short( 100.0*(prot_nums[-1].to_f/prot_nums[i].to_f) ) + " % (#{prot_nums[-1]})"
97
- #puts "FPR [peps] : " + short( 100.0*(pep_nums[-1].to_f/pep_nums[i].to_f) ) + " % (#{pep_nums[-1]}) (dcn*=#{deltacnstars[-1]})"
98
-
99
- ## For separate searches: every false positive = one less TP
100
- ## For concatenated searches: every false positive is one less TP
101
- ## THAT's what I've been doing already !
102
-
103
- prot_tps = prot_nums[i] - prot_nums[-1]
104
- pep_tps = pep_nums[i] - pep_nums[-1]
105
- prot_fps = prot_nums[i] - prot_tps
106
- pep_fps = pep_nums[i] - pep_tps
107
- prot_fpr = prot_fps.to_f/prot_nums[i].to_f
108
- pep_fpr = pep_fps.to_f/pep_nums[i].to_f
109
- # those are the same!
110
- puts "FPR [prots] : " + short( 100.0*prot_fpr ) + " % (#{prot_fps})"
111
- puts "FPR [peps] : " + short( 100.0*pep_fpr ) + " % (#{pep_fps}) (dcn*=#{deltacnstars[-1]})"
112
- end
113
- if cysteines
114
- (ac, exp) = cys_reports[i]
115
-
116
- (cys_fprate, total_num_false) = fpr_by_cysteines(ac, exp, pep_nums[i], $cys_mean, $cys_stdev)
117
- fraction_of_expected = ac.to_f/exp
118
- cys_tps = pep_nums[i] - total_num_false
119
-
120
- puts "CYSTEINE FPR: "
121
- puts " (# peps containing >= 1 cysteines)"
122
- puts " actual: #{ac}"
123
- puts "fraction of expected: #{short(fraction_of_expected)}"
124
- puts " expected # FP's: " + short(total_num_false)
125
- puts " estimated FPR: " + short( 100.0*cys_fprate ) + " % "
126
-
127
- puts "combined_score = x1 + x2 + x3 + 20.0*deltacn + 4000.0*(1.0/rppm)"
128
- puts "Combined Score & FPR"
129
- puts "#{combined_score}\t#{cys_fprate}"
130
- puts "Combined Score & fraction of expected"
131
- #puts "#{combined_score} #{fraction_of_expected}"
132
- to_write_cys_find = ["WRITE_CYS_FIND:", combined_score, fraction_of_expected]
133
- puts to_write_cys_find.join("\t") if WRITE_CYS_FIND
134
- to_tab = ['TABULATE:', combined_score, pep_tps, pep_fpr, cys_tps, cys_fprate, '', x1, x2, x3, deltacn, rppm]
135
- puts to_tab.join("\t") if TABULATE_DATA
136
- end
137
- if $true_pos_aaseqs
138
- peps = arr_of_prots_and_peps_and_deltacnstars_and_cfpr[i][1]
139
- real_tps = 0
140
- real_fps = 0
141
- # could also do with partition
142
- peps.each do |pep|
143
- if pep.sequence =~ /\.([\w\*]+)\.?/
144
- if $true_pos_aaseqs.any? {|aaseq| aaseq.include? $1}
145
- real_tps += 1
146
- else
147
- real_fps += 1
148
- end
149
- else
150
- abort "Couldn't Match: #{pep.sequence}"
151
- end
152
- end
153
- if peps.size > 0
154
- real_fpr = real_fps.to_f/peps.size
155
- else
156
- real_fpr = 0.0
157
- end
158
- puts "REAL FPR: #{real_fpr}"
159
- puts "REAL #TP: #{real_tps}"
160
- to_tab = ['TABULATE:', combined_score, pep_tps, pep_fpr, real_tps, real_fpr, '', x1, x2, x3, deltacn, rppm]
161
- puts to_tab.join("\t") if TABULATE_DATA
162
-
163
- end
164
- end
165
- #puts files.join(' | ')
166
- #puts nums.join(' | ')
167
- end
168
-
169
-
170
- # (actual # with cys, expected # with cys, total#peptides,
171
- # mean_fraction_of_cysteines_true, std)
172
- # PepHit(C) = Peptide containing cysteine
173
- # # Total PepHit(C) # Observed Bad Pep (C)
174
- # ------------------ proportional_to ----------------------
175
- # # Total PepHit # Total Bad PepHit (X)
176
- def fpr_by_cysteines(ac_num_with_cys, exp_num_with_cys, total_peptides, mean_fraction_true_cys=nil, std_fraction_true_cys=nil)
177
-
178
- # the number of bona fide BAD cysteine hits
179
- # (some of the cysteine hits (~5%) are true positives)
180
-
181
- ac_num_with_cys -= exp_num_with_cys * mean_fraction_true_cys if mean_fraction_true_cys
182
- if ac_num_with_cys < 0.0 ; ac_num_with_cys = 0.0 end
183
- total_number_false = (ac_num_with_cys * total_peptides).to_f/exp_num_with_cys
184
- fpr = total_number_false / total_peptides
185
- [fpr, total_number_false]
186
- end
187
-
188
- # assumes its already chomped
189
- # updates the 5 globals
190
- def prep_reply(reply, base)
191
- if reply == 'q' ; exit ; end
192
- if reply =~ /^\s*$/
193
- base
194
- elsif reply
195
- arr = reply.split(/\s+/)
196
- to_change = []
197
- to_change_hash = {}
198
- arr.each do |it|
199
- if it.include? ':'
200
- (k,v) = it.split(':')
201
- to_change_hash[k] = v
202
- else
203
- to_change << it
204
- end
205
- end
206
- to_change.each_with_index do |tc,i|
207
- begin
208
- base[i] = tc.to_f
209
- rescue NoMethodError
210
- puts "BAD ARG: #{tc}"
211
- return false
212
- end
213
- end
214
- to_change_hash.each do |k,v|
215
- case k
216
- when 'x1' ; base[0] = v
217
- when 'x2' ; base[1] = v
218
- when 'x3' ; base[2] = v
219
- when 'dcn' ; base[3] = v
220
- when 'rppm' ; base[4] = v
221
- else
222
- puts "BAD ARG: #{k}:#{v}"
223
- end
224
- end
225
- base.map {|v| v.to_f }
226
- else
227
- false
228
- end
229
- end
230
-
231
- def file_to_prefiltered_spec_id(file)
232
- spec_id = nil
233
- marshal_file = file + ".prefiltered.msh"
234
- if File.exist?(marshal_file)
235
- File.open(marshal_file) do |fh|
236
- spec_id = Marshal.load(fh)
237
- end
238
- else
239
- spec_id = SpecID.new(file)
240
- spec_id.top_peps_prefilter!
241
- ## marshal it!
242
- if WRITE_MARSHAL
243
- File.open(marshal_file, "w") do |fh|
244
- Marshal.dump(spec_id,fh)
245
- end
246
- end
247
- end
248
- spec_id
249
- end
250
-
251
- def interactive_help
252
- puts "enter: <x1> <x2> <x3> <dcn> <rppm>"
253
- puts "or : x1:<x1> x2:<x2> x3:<x3> dcn:<dcn> rppm:<rppm>"
254
- puts "or : dcn:<dcn>"
255
- puts "or : <x1> <x2> rppm:<rppm>"
256
- puts "etc..."
257
- puts "<enter> to (re)run current values"
258
- puts "'q' to quit"
259
- end
260
-
261
- opts.parse!
262
-
263
- if ARGV.size < 1
264
- puts opts
265
- exit
266
- end
267
-
268
- $stderr.puts "reading files (can take a minute or two for large files)..."
269
- files = ARGV.map {|file| file }
270
- ARGV.clear
271
-
272
- arr_of_spec_ids = files.map do |file|
273
- file_to_prefiltered_spec_id(file)
274
- end
275
-
276
- fpr = nil
277
- cysteines = nil
278
- if opt.cysteines
279
- puts %w(TABULATE combined_score pep_tps pep_fprate cys_pep_tps cys_pep_fprate [nil] x1 x2 x3 dcn rppm).join("\t")
280
- if File.exist? opt.cysteines
281
- cysteines = SpecID::AAFreqs.new(opt.cysteines)
282
- else
283
- cysteines = opt.cysteines.to_f
284
- end
285
- if opt.cback
286
- ($cys_mean, $cys_stdev) = opt.cback.split(',').map{|v| v.to_f }
287
- end
288
- end
289
-
290
- $true_pos_aaseqs = nil
291
- if opt.true_pos
292
- puts %w(TABULATE combined_score pep_tps pep_fprate real_tps real_fpr [nil] x1 x2 x3 dcn rppm).join("\t")
293
- fasta = Fasta.new.read_file(opt.true_pos)
294
- $true_pos_aaseqs = fasta.prots.map do |prot|
295
- prot.aaseq.chomp
296
- end
297
- end
298
-
299
- if opt.false
300
- # its a file if it exists
301
- if File.exist? opt.false
302
- fpr = file_to_prefiltered_spec_id(opt.false)
303
- else # its a prefix (Assume one file for now!)
304
- spec_obj = arr_of_spec_ids[0]
305
- (tps, fps) = spec_obj.classify_by_prefix(:peps, opt.false)
306
- fps_specid_minor = spec_obj.obj.class.new
307
- tps_specid_minor = spec_obj.obj.class.new
308
- fps_specid = SpecID.new
309
- tps_specid = SpecID.new
310
- fps_specid.obj = fps_specid_minor
311
- tps_specid.obj = tps_specid_minor
312
- fps_specid.peps = fps
313
- tps_specid.peps = tps
314
- arr_of_spec_ids[0] = tps_specid
315
- fpr = fps_specid
316
- end
317
- end
318
-
319
-
320
-
321
-
322
- base_args = [opt.x1, opt.x2, opt.x3, opt.c, opt.rppm]
323
-
324
- if opt.from_file
325
- lines = IO.readlines(opt.from_file)
326
- lines.each do |line|
327
- line.chomp!
328
- answer = prep_reply(line, base_args)
329
- next if answer == false
330
- base_args = answer
331
- filter_round(files, arr_of_spec_ids, :common, base_args, fpr, cysteines, true)
332
- end
333
- elsif opt.i
334
- interactive_help
335
- puts "*******************************************************"
336
- puts "Number of proteins in files (this order):"
337
- files.each do |file|
338
- puts file
339
- end
340
- puts "*******************************************************"
341
- reply = "nil"
342
- loop do
343
- b = base_args
344
- puts "#{b[0]} #{b[1]} #{b[2]} dcn:#{b[3]} rppm:#{b[4]}"
345
- loop do
346
- reply = gets.chomp
347
- answer = prep_reply(reply, base_args)
348
- if answer == false
349
- interactive_help
350
- else
351
- base_args = answer
352
- filter_round(files, arr_of_spec_ids, :common, base_args, fpr, cysteines, true)
353
- break
354
- end
355
- end
356
- end
357
- else
358
- filter_round(files, arr_of_spec_ids, :common, base_args, fpr, cysteines, false)
359
- end
360
-
361
-
362
-
363
-
364
-
365
-
data/bin/raw2mzXML.rb DELETED
@@ -1,21 +0,0 @@
1
- #!/usr/bin/ruby -w
2
-
3
- # John Prince
4
- # 2006-08-24
5
- # takes multiple files and converts them into mzXML
6
-
7
- EXECUTE_MZXML_CONVERTER = "t2x";
8
-
9
- if ARGV.size == 0
10
- puts "usage: raw2mzXML.pl file1.RAW file2.RAW ..."
11
- puts " [or raw2mzXML.pl *.RAW]"
12
- exit
13
- end
14
-
15
- files = ARGV.to_a
16
- files.each do |file|
17
- puts "******************************************"
18
- puts "About to convert #{file}"
19
- system "#{EXECUTE_MZXML_CONVERTER} #{file}"
20
- puts "******************************************"
21
- end
data/script/gen_database_searching.rb DELETED
@@ -1,258 +0,0 @@
1
- require 'fileutils'
2
-
3
- ####################################################
4
- # GLOBAL:
5
- make_path = File.dirname(__FILE__) + "/../doc/src/tutorial/database_searching"
6
- destination = File.dirname(__FILE__) + "/../doc/src/tutorial/database_searching/"
7
- intropage = "index"
8
- option_a_file = "two_db_search"
9
- option_b_file = "cat_db_search"
10
- extension = ".page"
11
- index_header = "---
12
- inMenu: false
13
- directoryName: DB searching
14
- ---\n"
15
- header = "---
16
- inMenu: true
17
- directoryName: DB searching
18
- ---\n"
19
- ####################################################
20
-
21
- ## Make the destination path
22
- FileUtils.mkpath( make_path )
23
-
24
- def print_file(destination, base_name, extension, header, string)
25
- File.open(destination + base_name + extension,'w') do |out| out.print(header + string) end
26
- end
27
-
28
- class StepByStep
29
-
30
- def links
31
- "How to: [Two-DB Search](two_db_search.html) | [Cat-DB Search](cat_db_search.html)"
32
- end
33
-
34
- def intro
35
- "Step by step
36
- ------------
37
- Guide for basic processing of mass spectrometry data derived from Bioworks 3.2.
38
- "
39
- end
40
-
41
- def explanation
42
- "### Database Search Options
43
-
44
- There are two flavors for estimating a false positive identification rate for
45
- protein identification search engines. The first uses two databases: one
46
- normal, the other a decoy (['Two-DB'](two_db_search.html)). The second method (['Cat-DB'](cat_db_search.html)) uses a
47
- single databases where decoy proteins have been inserted at the bottom of the
48
- file. Each method has its advantages and limitations:
49
-
50
- Two-DB: Pros: Database size unchanged.
51
- Prophet prefers a normal database.
52
- Cons: No independent estimation of FPR for Bioworks and Prophet probs.
53
- Multiple parameters make it difficult to select a specfic FPR -
54
- you take the FPR you get or keep re-filtering.
55
-
56
- Cat-DB: Pros: Independent estimation of Bioworks and Prophet probs.
57
- Can select desired FPR on single parameter (probability).
58
- Cons: Probabilities (and some other score functions)
59
- are influenced by presence of decoys and database size.
60
- "
61
-
62
- end
63
-
64
- def to_sequest ; "Create file: `to_sequest.sld`
65
-
66
- To run sequest, first create a `to_sequest.sld` file that points
67
- sequest to your raw data files (you can use it to run sequest and in the
68
- multi-consensus view).
69
- "
70
- end
71
-
72
- def convert_to_pepxml ; "Convert to pepXML
73
-
74
- bioworks_to_pepxml.rb bioworks.xml -p /cygdrive/c/Xcalibur/params/myparams.params -m /cygdrive/c/Xcalibur/data/mydatafolder
75
-
76
- By default, the pepxml files will be written to a subdirectory called
77
- 'pepxml'. Type `bioworks_to_pepxml.rb` for more details.
78
- "
79
- end
80
-
81
-
82
- def run_prophet ; "Run Protein Prophet
83
-
84
- ProteinProphet must be run in a particular directory. If one does not exist, create an alias (in ~/.bashrc file) to simplify getting there: `alias isb=\"cd /cygdrive/c/Inetpub/wwwroot/ISB/data\"`. Then, to get to the isb folder, just type:
85
-
86
- isb # -> takes you to /cygdrive/c/Inetpub/wwwroot/ISB/data
87
-
88
- Then, run protein prophet:
89
-
90
- xinteract -N<my_run_name>.xml -Op sequest/myfolder/pepxml/*.xml
91
-
92
- Type `xinteract` for more details. *NOTE:* it is very important
93
- that the path to the pepxml files be given starting with the sequest soft link
94
- so the server thinks the data is mounted under the webserver.
95
-
96
- The full protein results are written to '<my_run_name>-prot.xml'."
97
- end
98
-
99
- end
100
-
101
- # Option for using 2 databases and normal prophet run...
102
- class TwoDB < StepByStep
103
-
104
- def option_title
105
- "Two-DB: Real and Decoy Database Search
106
- ======================================" end
107
-
108
- def run_sequest ; "Run Sequest with a Normal and an Inverse Database
109
-
110
- If you don't already have one, here's how to make an inverse database:
111
-
112
- fasta_shaker.rb reverse <yourfile.fasta>
113
-
114
- This will create a file with the trailing tag '_reverse.fasta'. Just type
115
- `fasta_shaker.rb` for more details.
116
-
117
- Run sequest with 'report duplicate references' set to >= 40
118
- "
119
- end
120
-
121
- def bioworks_xml ; "Export a Bioworks XML File for each Database
122
-
123
- 1. Load your sequest results in MultiConsensus results (even if you only have one run)
124
- File -> 'Load MultiConsensus Results'
125
- 2. Click 'yes' to calculate peptide probabilities [optional]
126
- 3. Click 'yes' to view results without filtering.
127
- 4. Right click on the data and 'Export' to XML (name the file `bioworks.xml`). This file is fed into ProteinProphet.
128
- 5. Filter your data on the parameters you prefer and export.
129
- 6. Do the same thing (only need to do steps 1,2,5) for the inverted database. Make sure to filter on these same parameters and export these results, too. (To expirement with different parameters, open two Bioworks windows and filter the normal and inverse databases until satisfied).
130
- "
131
- end
132
-
133
- def classification_analysis ; "Classification Analysis
134
-
135
- ProteinProphet run with a normal database gives an estimate of false positive rates. We can view a protein summary with a desired cutoff:
136
-
137
- protein_summary.rb -c 5.0 <my_run_name>-prot.xml
138
-
139
- Proteins above the red cutoff line have a false positive rate of less than or equal to 5%.
140
-
141
- We can verify Bioworks probability scores by counting the number of true hits
142
- (from normal database) compared with false hits (from inverted db) using the same score filters for both. This command will give a protein summary and include precision and false positive rates (two different kinds):
143
-
144
- protein_summary.rb bioworks_filtered.xml -f bioworks_filtered_INV.xml -p -g --fpr
145
-
146
- Type `protein_summary.rb` for more details.
147
-
148
- The false positive rate information can also be calculated without the protein summary:
149
-
150
- false_positive_rate.rb bioworks_filtered.xml -f bioworks_filtered_INV.xml -p -g
151
-
152
- Type `false_positive_rate.rb` for more details.
153
- "
154
- end
155
-
156
- end # class
157
-
158
-
159
- # Option for using 2 databases and normal prophet run...
160
- class CatDB < StepByStep
161
-
162
- def option_title
163
- "Cat-DB: Concatenated Database Search
164
- ===================================="
165
- end
166
-
167
- def run_sequest ; "Run Sequest with a Concatenated Inverse Database
168
-
169
- If you don't already have one, here's how to make one:
170
-
171
- fasta_shaker.rb reverse -c -p INV_ <yourfile.fasta>
172
-
173
- This will create a file '<yourfile>_cat_reverse_prefix_INV_.fasta'. Each
174
- inverted protein name will be prefixed with 'INV_'. Just type
175
- `fasta_shaker.rb` for more details.
176
-
177
- Run sequest with 'report duplicate references' set to >= 40
178
- "
179
- end
180
-
181
- def bioworks_xml ; "Export a Bioworks XML File
182
-
183
- 1. Load your sequest results in MultiConsensus results (even if you only have one run)
184
- File -> 'Load MultiConsensus Results'
185
- 2. Click 'yes' to calculate peptide probabilities.
186
- 3. Click 'yes' to view results without filtering.
187
- 4. Right click on the data and 'Export' to XML (name the file `bioworks.xml`)
188
- 5. If you care to, you can filter your data how you like for viewing in Bioworks Browser.
189
- "
190
- end
191
-
192
- def classification_analysis ; "Classification Analysis
193
-
194
- We can verify the Bioworks or ProteinProphet probability scores by counting
195
- the number of true hits (proteins from normal database hits) compared with
196
- false hits (proteins from inverted db hits). Since the probabilities are more
197
- or less continuous, we can select any desired false positive rate.
198
-
199
- protein_summary.rb bioworks_cat_inv.xml -f INV_ --fpr ## -> for Bioworks
200
- protein_summary.rb run_cat_inv-prot.xml -f INV_ --fpr ## -> for Prophet
201
-
202
- Type `protein_summary.rb` for more details.
203
-
204
- We can also view the false positive rate without the protein summary (and compare false positive rates between files):
205
-
206
- false_positive_rate.rb -f INV_ bioworks.xml proph-prot.xml
207
-
208
- Type `false_positive_rate.rb` for more details.
209
- "
210
- end
211
-
212
- end # class
213
-
214
-
215
- def number(num, string)
216
- "### #{num}. #{string}"
217
- end
218
-
219
- def number_array(start, array)
220
- num = start
221
- new_array = array.map do |v|
222
- if v[0,2] == "=="
223
- new_string = v
224
- else
225
- new_string = number(num, v)
226
- num += 1
227
- end
228
- new_string
229
- end
230
- new_array
231
- end
232
-
233
- st = StepByStep.new
234
- a = TwoDB.new
235
- b = CatDB.new
236
-
237
- intro_page = [st.intro, st.explanation, st.links]
238
- intro_page_string = intro_page.join("\n")
239
-
240
- option_a = [a.to_sequest, a.run_sequest, a.bioworks_xml, a.convert_to_pepxml, a.run_prophet, a.classification_analysis]
241
- option_b = [a.to_sequest, b.run_sequest, b.bioworks_xml, b.convert_to_pepxml, b.run_prophet, b.classification_analysis]
242
-
243
- option_a = number_array(1, option_a)
244
- option_b = number_array(1, option_b)
245
-
246
- option_a.unshift a.option_title
247
- option_b.unshift b.option_title
248
-
249
- option_a_string = option_a.join("\n")
250
- option_b_string = option_b.join("\n")
251
-
252
- print_file( destination, intropage, extension, index_header, intro_page_string )
253
- print_file( destination, option_a_file, extension, header, option_a_string )
254
- print_file( destination, option_b_file, extension, header, option_b_string )
255
-
256
-
257
-
258
-