mspire 0.1.7 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +41 -14
- data/bin/bioworks2excel.rb +1 -1
- data/bin/bioworks_to_pepxml.rb +46 -59
- data/bin/fasta_shaker.rb +1 -1
- data/bin/filter.rb +6 -0
- data/bin/find_aa_freq.rb +23 -0
- data/bin/id_precision.rb +3 -2
- data/bin/mzxml_to_lmat.rb +2 -1
- data/bin/pepproph_filter.rb +1 -1
- data/bin/precision.rb +1 -1
- data/bin/protein_summary.rb +2 -451
- data/bin/raw_to_mzXML.rb +55 -0
- data/bin/srf_group.rb +26 -0
- data/changelog.txt +7 -0
- data/lib/align.rb +3 -3
- data/lib/fasta.rb +6 -1
- data/lib/gi.rb +9 -4
- data/lib/roc.rb +2 -0
- data/lib/sample_enzyme.rb +2 -1
- data/lib/spec/mzxml/parser.rb +2 -43
- data/lib/spec/mzxml.rb +65 -2
- data/lib/spec_id/aa_freqs.rb +10 -7
- data/lib/spec_id/bioworks.rb +67 -87
- data/lib/spec_id/filter.rb +794 -0
- data/lib/spec_id/precision.rb +29 -36
- data/lib/spec_id/proph.rb +5 -3
- data/lib/spec_id/protein_summary.rb +459 -0
- data/lib/spec_id/sequest.rb +323 -271
- data/lib/spec_id/srf.rb +189 -135
- data/lib/spec_id.rb +276 -227
- data/lib/spec_id_xml.rb +101 -0
- data/lib/toppred.rb +18 -0
- data/script/degenerate_peptides.rb +47 -0
- data/script/filter-peps.rb +5 -1
- data/test/tc_align.rb +1 -1
- data/test/tc_bioworks.rb +25 -22
- data/test/tc_bioworks_to_pepxml.rb +37 -4
- data/test/tc_fasta.rb +3 -1
- data/test/tc_fasta_shaker.rb +8 -6
- data/test/tc_filter.rb +203 -0
- data/test/tc_gi.rb +6 -9
- data/test/tc_id_precision.rb +31 -0
- data/test/tc_mzxml.rb +8 -6
- data/test/tc_peptide_parent_times.rb +2 -1
- data/test/tc_precision.rb +1 -1
- data/test/tc_proph.rb +5 -5
- data/test/tc_protein_summary.rb +36 -13
- data/test/tc_sequest.rb +78 -33
- data/test/tc_spec_id.rb +128 -6
- data/test/tc_srf.rb +84 -38
- metadata +67 -62
- data/bin/fasta_cat.rb +0 -39
- data/bin/fasta_cat_mod.rb +0 -59
- data/bin/fasta_mod.rb +0 -57
- data/bin/filter_spec_id.rb +0 -365
- data/bin/raw2mzXML.rb +0 -21
- data/script/gen_database_searching.rb +0 -258
data/bin/fasta_mod.rb
DELETED
@@ -1,57 +0,0 @@
|
|
1
|
-
#!/usr/bin/ruby
|
2
|
-
|
3
|
-
require 'fasta'
|
4
|
-
require 'optparse'
|
5
|
-
|
6
|
-
hash = {
|
7
|
-
'shuffle' => {
|
8
|
-
'method' => :aaseq_shuffle!,
|
9
|
-
'file_postfix' => Fasta::SHUFF_FILE_POSTFIX,
|
10
|
-
},
|
11
|
-
'invert' => {
|
12
|
-
'method' => :aaseq_invert!,
|
13
|
-
'file_postfix' => Fasta::INV_FILE_POSTFIX,
|
14
|
-
},
|
15
|
-
}
|
16
|
-
|
17
|
-
opt = {}
|
18
|
-
OptionParser.new do |opts|
|
19
|
-
opts.on("-p", "--prefix PREFIX", "add prefix to protein header") {|v| opt['p'] = v }
|
20
|
-
end.parse!
|
21
|
-
#opts = GetoptLong.new(["-p", "--prefix", GetoptLong::REQUIRED_ARGUMENT])
|
22
|
-
|
23
|
-
|
24
|
-
if ARGV.size < 2
|
25
|
-
puts "
|
26
|
-
usage: #{File.basename(__FILE__)} [-p <prefix>] <method> <file>.fasta ...
|
27
|
-
|
28
|
-
AA seq's will be modified according to <method>. Each file takes on a
|
29
|
-
postfix (before the extension).
|
30
|
-
|
31
|
-
-p prefix each protein's header will take on a header prefix after the '>'.
|
32
|
-
|
33
|
-
Method FILE_POSTFIX (added before the file extension)
|
34
|
-
shuffle #{hash['shuffle']['file_postfix']}
|
35
|
-
invert #{hash['invert']['file_postfix']}
|
36
|
-
"
|
37
|
-
exit
|
38
|
-
end
|
39
|
-
|
40
|
-
prefix = nil; if opt.key?('p') then prefix = opt['p'] end
|
41
|
-
method = ARGV.shift
|
42
|
-
|
43
|
-
cmd_hash = nil
|
44
|
-
if hash.key?(method)
|
45
|
-
cmd_hash = hash[method]
|
46
|
-
else
|
47
|
-
puts "Not a defined method: #{method}"
|
48
|
-
puts "type #{File.basename(__FILE__)} for usage"
|
49
|
-
exit
|
50
|
-
end
|
51
|
-
|
52
|
-
ARGV.each do |fn|
|
53
|
-
outfile = Fasta.modify_file(fn, cmd_hash['method'], cmd_hash['file_postfix'], prefix)
|
54
|
-
puts "OUTPUT: #{outfile}"
|
55
|
-
end
|
56
|
-
|
57
|
-
|
data/bin/filter_spec_id.rb
DELETED
@@ -1,365 +0,0 @@
|
|
1
|
-
#!/usr/bin/ruby -w
|
2
|
-
|
3
|
-
require 'spec_id'
|
4
|
-
require 'optparse'
|
5
|
-
require 'ostruct'
|
6
|
-
require 'spec_id/aa_freqs'
|
7
|
-
|
8
|
-
########################################################
|
9
|
-
WRITE_MARSHAL = true
|
10
|
-
TABULATE_DATA = true
|
11
|
-
WRITE_CYS_FIND = false
|
12
|
-
########################################################
|
13
|
-
|
14
|
-
opt = OpenStruct.new
|
15
|
-
opt.x1 = 1.0
|
16
|
-
opt.x2 = 1.5
|
17
|
-
opt.x3 = 2.0
|
18
|
-
opt.c = 0.5
|
19
|
-
opt.rppm = 1000.0
|
20
|
-
opt.false = false
|
21
|
-
|
22
|
-
# Formats +num+ with exactly three decimal places for display.
# Returns the formatted String; nothing is printed here.
def short(num)
  format("%.3f", num)
end
|
26
|
-
|
27
|
-
opts = OptionParser.new do |op|
|
28
|
-
op.banner = "usage: #{File.basename(__FILE__)} [OPTS] bioworks.xml"
|
29
|
-
op.separator("prints number of proteins (and FPR if -f option)")
|
30
|
-
op.separator ""
|
31
|
-
|
32
|
-
op.separator("** only takes the top hit per scan+charge")
|
33
|
-
op.separator("** 'dcn*' is the number of peptides with deltacn == 1.1")
|
34
|
-
op.separator(" (these are peptides who are the only hit with xcorr > 0)")
|
35
|
-
op.separator ""
|
36
|
-
op.on("-1", "--xcorr1 <f>", "xcorr for +1 charge d: #{opt.x1}") {|v| opt.x1 = v.to_f}
|
37
|
-
op.on("-2", "--xcorr2 <f>", "xcorr for +2 charge d: #{opt.x2}") {|v| opt.x2 = v.to_f}
|
38
|
-
op.on("-3", "--xcorr3 <f>", "xcorr for +3 charge d: #{opt.x3}") {|v| opt.x3 = v.to_f}
|
39
|
-
op.on("-c", "--deltacn <f>", ">= deltacn d: #{opt.c}") {|v| opt.c = v.to_f}
|
40
|
-
op.on("--rppm <f>", "<= rough ppm (10^6*deltamass/mass) d: #{opt.rppm}") {|v| opt.rppm = v.to_f}
|
41
|
-
op.on("-i", "--interactive", "interactive filtering") {|v| opt.i = v}
|
42
|
-
op.on("-f", "--false <s>", "protein prefix or file name of decoys") {|v| opt.false = v}
|
43
|
-
op.on("-y", "--cysteines <fasta_file|freq>", "report fpr by expected cysteine freq") {|v| opt.cysteines = v}
|
44
|
-
op.on("--cback <mean,stdev>", "the cysteine background") {|v| opt.cback = v}
|
45
|
-
op.on("--from_file <file>", "(no -i) file with list of interactive input") {|v| opt.from_file = v}
|
46
|
-
op.on("-t", "--true_pos <fasta>", "fasta file containing true hits") {|v| opt.true_pos = v }
|
47
|
-
end
|
48
|
-
|
49
|
-
$cys_mean = nil
|
50
|
-
$cys_stdev = nil
|
51
|
-
|
52
|
-
|
53
|
-
# Runs one round of filtering over every search result and prints a report
# to stdout.
#
# files::       names of the input files; files[i] is reported against
#               spec_ids[i]
# spec_ids::    objects responding to filter(kind, *args), which must return
#               [proteins, peptides, deltacn_star_count]
# kind::        passed straight through to each object's filter method
# args::        the cutoffs as [x1, x2, x3, deltacn, rppm]
# fpr::         optional object (same interface as a spec_id) holding the
#               false positives; when given, FPR lines are printed
# cysteines::   nil, a Float cysteine frequency, or an object whose
#               aafreqs[:C] supplies that frequency
# interactive:: when true, files are labelled "file1", "file2", ... instead of
#               by name
#
# Also reads the globals $cys_mean, $cys_stdev and $true_pos_aaseqs and the
# constants WRITE_CYS_FIND and TABULATE_DATA. Returns +files+.
#
# The caller's spec_ids array is never modified: the false-positive set is
# appended to a private copy. This method is called once per interactive round
# with the same array, so pushing onto it would add one more decoy copy (and
# one more redundant filter pass) every round.
def filter_round(files, spec_ids, kind, args, fpr=nil, cysteines=nil, interactive=false)
  (x1, x2, x3, deltacn, rppm) = args
  combined_score = x1 + x2 + x3 + 20.0*deltacn + 4000.0*(1.0/rppm)
  puts "=========================================================================="
  puts "[[ xcorr(1,2,3) >= #{x1},#{x2},#{x3} ; deltacn >= #{deltacn} ; rough_ppm <= #{rppm} ]]"

  # the false positives go last so that index -1 always refers to them
  all_ids = fpr ? spec_ids + [fpr] : spec_ids

  # one entry per filtered object: [prots, peps, deltacn_star_cnt(, [ac, exp])]
  results = all_ids.map do |spec_id|
    (prots, peps, deltacnstar_cnt) = spec_id.filter(kind, *args)
    if cysteines
      if cysteines.is_a? Float
        freq = cysteines
      else
        freq = cysteines.aafreqs[:C]
      end
      (ac, exp) = SpecID::AAFreqs.new.actual_and_expected_number_containing_cysteines(peps, freq)
      [prots, peps, deltacnstar_cnt, [ac, exp]]
    else
      [prots, peps, deltacnstar_cnt]
    end
  end

  ## files     = [file1, file2, file3]
  ## prot_nums = [nums1, nums2, nums3, nums_for_false_positives]
  ## pep_nums  = [nums1, nums2, nums3, nums_for_false_positives]
  prot_nums = results.map {|ar| ar[0].size }
  pep_nums = results.map {|ar| ar[1].size }
  deltacnstars = results.map {|ar| ar[2] }
  cys_reports = results.map {|ar| ar[3] } if cysteines

  files.each_with_index do |file, i|
    # These stay nil when no false-positive set was given; the TABULATE lines
    # below then print empty fields for them.
    pep_tps = nil
    pep_fpr = nil

    if !interactive
      puts "#{file} [prots]:\t#{prot_nums[i]}"
      puts "#{file} [peps]:\t#{pep_nums[i]} (dcn*=#{deltacnstars[i]})"
    else
      puts "file#{i+1} [prots]: #{prot_nums[i]}"
      puts "file#{i+1} [peps]: #{pep_nums[i]} (dcn*=#{deltacnstars[i]})"
    end

    if fpr
      ## For separate searches: every false positive = one less TP
      ## For concatenated searches: every false positive is one less TP
      prot_tps = prot_nums[i] - prot_nums[-1]
      pep_tps = pep_nums[i] - pep_nums[-1]
      prot_fps = prot_nums[i] - prot_tps
      pep_fps = pep_nums[i] - pep_tps
      prot_fpr = prot_fps.to_f/prot_nums[i].to_f
      pep_fpr = pep_fps.to_f/pep_nums[i].to_f
      puts "FPR [prots] : " + short( 100.0*prot_fpr ) + " % (#{prot_fps})"
      puts "FPR [peps] : " + short( 100.0*pep_fpr ) + " % (#{pep_fps}) (dcn*=#{deltacnstars[-1]})"
    end

    if cysteines
      (ac, exp) = cys_reports[i]

      (cys_fprate, total_num_false) = fpr_by_cysteines(ac, exp, pep_nums[i], $cys_mean, $cys_stdev)
      fraction_of_expected = ac.to_f/exp
      cys_tps = pep_nums[i] - total_num_false

      puts "CYSTEINE FPR: "
      puts " (# peps containing >= 1 cysteines)"
      puts " actual: #{ac}"
      puts "fraction of expected: #{short(fraction_of_expected)}"
      puts " expected # FP's: " + short(total_num_false)
      puts " estimated FPR: " + short( 100.0*cys_fprate ) + " % "

      puts "combined_score = x1 + x2 + x3 + 20.0*deltacn + 4000.0*(1.0/rppm)"
      puts "Combined Score & FPR"
      puts "#{combined_score}\t#{cys_fprate}"
      puts "Combined Score & fraction of expected"
      to_write_cys_find = ["WRITE_CYS_FIND:", combined_score, fraction_of_expected]
      puts to_write_cys_find.join("\t") if WRITE_CYS_FIND
      to_tab = ['TABULATE:', combined_score, pep_tps, pep_fpr, cys_tps, cys_fprate, '', x1, x2, x3, deltacn, rppm]
      puts to_tab.join("\t") if TABULATE_DATA
    end

    if $true_pos_aaseqs
      peps = results[i][1]
      real_tps = 0
      real_fps = 0
      peps.each do |pep|
        # the residues between the flanking dots of the peptide sequence
        if pep.sequence =~ /\.([\w\*]+)\.?/
          core = $1
          if $true_pos_aaseqs.any? {|aaseq| aaseq.include? core }
            real_tps += 1
          else
            real_fps += 1
          end
        else
          abort "Couldn't Match: #{pep.sequence}"
        end
      end
      if peps.size > 0
        real_fpr = real_fps.to_f/peps.size
      else
        real_fpr = 0.0
      end
      puts "REAL FPR: #{real_fpr}"
      puts "REAL #TP: #{real_tps}"
      to_tab = ['TABULATE:', combined_score, pep_tps, pep_fpr, real_tps, real_fpr, '', x1, x2, x3, deltacn, rppm]
      puts to_tab.join("\t") if TABULATE_DATA
    end
  end
end
|
168
|
-
|
169
|
-
|
170
|
-
# Estimates a peptide false positive rate from how many peptide hits contain
# cysteine. (PepHit(C) = peptide hit containing cysteine.)
#
#   # observed bad PepHit(C)                      # total bad PepHit (X)
#   ------------------------  proportional_to  ----------------------
#   # expected PepHit(C)                          # total PepHit
#
# ac_num_with_cys::        actual number of peptides containing cysteine
# exp_num_with_cys::       expected number of peptides containing cysteine
# total_peptides::         total number of peptides
# mean_fraction_true_cys:: optional fraction of the expected cysteine hits
#                          that are true positives; that share is subtracted
#                          from the actual count before estimating
# std_fraction_true_cys::  accepted for interface compatibility; not used
#
# Returns [fpr, total_number_false]. When there are no peptides, or no
# cysteine hits are expected, no estimate is possible and [0.0, 0.0] is
# returned rather than NaN/Infinity from a division by zero.
def fpr_by_cysteines(ac_num_with_cys, exp_num_with_cys, total_peptides, mean_fraction_true_cys=nil, std_fraction_true_cys=nil)
  return [0.0, 0.0] if total_peptides == 0 || exp_num_with_cys == 0

  # the number of bona fide BAD cysteine hits
  # (some of the cysteine hits are true positives)
  bad_with_cys = ac_num_with_cys
  bad_with_cys -= exp_num_with_cys * mean_fraction_true_cys if mean_fraction_true_cys
  bad_with_cys = 0.0 if bad_with_cys < 0.0

  total_number_false = (bad_with_cys * total_peptides).to_f/exp_num_with_cys
  fpr = total_number_false / total_peptides
  [fpr, total_number_false]
end
|
187
|
-
|
188
|
-
# Turns one line of interactive (or --from_file) input into a new cutoff list.
#
# reply:: the input line, assumed to be already chomped (may be nil)
# base::  the current cutoffs as [x1, x2, x3, dcn, rppm]; never modified
#
# Accepted tokens, separated by whitespace:
#   * bare numbers, assigned by position (x1 x2 x3 dcn rppm)
#   * key:value pairs with key one of x1, x2, x3, dcn, rppm; these are applied
#     after the positional values and so take precedence over them
#
# Returns
#   * +base+ itself when the reply is blank (re-run with current values)
#   * a new Array of Floats with the requested values replaced
#   * false when reply is nil, when a value is not a number, or when more
#     positional values are given than there are cutoffs ("BAD ARG" is printed)
#
# An unrecognised key prints "BAD ARG" and is otherwise ignored.
# A reply of exactly 'q' exits the program.
def prep_reply(reply, base)
  exit if reply == 'q'
  return false unless reply
  return base if reply =~ /^\s*$/

  key_index = { 'x1' => 0, 'x2' => 1, 'x3' => 2, 'dcn' => 3, 'rppm' => 4 }

  # String#to_f never raises (it quietly yields 0.0 for junk), so numbers are
  # validated with Float(), which does raise on malformed or missing input.
  to_number = lambda do |text|
    begin
      Float(text)
    rescue ArgumentError, TypeError
      nil
    end
  end

  # split with no pattern ignores leading whitespace, so " 2 3" does not
  # produce an empty first token
  keyed, positional = reply.split.partition {|token| token.include? ':' }

  # work on a copy so a rejected reply leaves the caller's values untouched
  updated = base.dup

  positional.each_with_index do |token, i|
    value = i < base.size ? to_number.call(token) : nil
    if value.nil?
      puts "BAD ARG: #{token}"
      return false
    end
    updated[i] = value
  end

  keyed.each do |token|
    (key, raw) = token.split(':', 2)
    index = key_index[key]
    if index.nil?
      puts "BAD ARG: #{key}:#{raw}"
      next
    end
    value = to_number.call(raw)
    if value.nil?
      puts "BAD ARG: #{key}:#{raw}"
      return false
    end
    updated[index] = value
  end

  updated.map {|v| v.to_f }
end
|
230
|
-
|
231
|
-
# Returns the prefiltered SpecID object for +file+, using a Marshal cache
# stored next to it as "<file>.prefiltered.msh".
#
# If the cache file exists it is loaded and returned. Otherwise +file+ is read
# into a SpecID, top_peps_prefilter! is applied, and (when WRITE_MARSHAL is
# set) the result is written to the cache file before being returned.
#
# The cache is opened in binary mode for both reading and writing: Marshal
# data is binary, and text mode corrupts it on platforms that translate line
# endings (Windows).
#
# NOTE(review): the cache is used whenever it exists, even if +file+ has
# changed since it was written. Marshal.load also executes whatever the cache
# file encodes, so it should only ever be pointed at caches this script wrote.
def file_to_prefiltered_spec_id(file)
  marshal_file = file + ".prefiltered.msh"
  if File.exist?(marshal_file)
    File.open(marshal_file, "rb") {|fh| Marshal.load(fh) }
  else
    spec_id = SpecID.new(file)
    spec_id.top_peps_prefilter!
    ## marshal it!
    if WRITE_MARSHAL
      File.open(marshal_file, "wb") {|fh| Marshal.dump(spec_id, fh) }
    end
    spec_id
  end
end
|
250
|
-
|
251
|
-
# Prints the input formats accepted at the interactive prompt to stdout.
def interactive_help
  usage_lines = [
    "enter: <x1> <x2> <x3> <dcn> <rppm>",
    "or : x1:<x1> x2:<x2> x3:<x3> dcn:<dcn> rppm:<rppm>",
    "or : dcn:<dcn>",
    "or : <x1> <x2> rppm:<rppm>",
    "etc...",
    "<enter> to (re)run current values",
    "'q' to quit",
  ]
  # puts writes each array element on its own line
  puts usage_lines
end
|
260
|
-
|
261
|
-
opts.parse!
|
262
|
-
|
263
|
-
if ARGV.size < 1
|
264
|
-
puts opts
|
265
|
-
exit
|
266
|
-
end
|
267
|
-
|
268
|
-
$stderr.puts "reading files (can take a minute or two for large files)..."
|
269
|
-
files = ARGV.map {|file| file }
|
270
|
-
ARGV.clear
|
271
|
-
|
272
|
-
arr_of_spec_ids = files.map do |file|
|
273
|
-
file_to_prefiltered_spec_id(file)
|
274
|
-
end
|
275
|
-
|
276
|
-
fpr = nil
|
277
|
-
cysteines = nil
|
278
|
-
if opt.cysteines
|
279
|
-
puts %w(TABULATE combined_score pep_tps pep_fprate cys_pep_tps cys_pep_fprate [nil] x1 x2 x3 dcn rppm).join("\t")
|
280
|
-
if File.exist? opt.cysteines
|
281
|
-
cysteines = SpecID::AAFreqs.new(opt.cysteines)
|
282
|
-
else
|
283
|
-
cysteines = opt.cysteines.to_f
|
284
|
-
end
|
285
|
-
if opt.cback
|
286
|
-
($cys_mean, $cys_stdev) = opt.cback.split(',').map{|v| v.to_f }
|
287
|
-
end
|
288
|
-
end
|
289
|
-
|
290
|
-
$true_pos_aaseqs = nil
|
291
|
-
if opt.true_pos
|
292
|
-
puts %w(TABULATE combined_score pep_tps pep_fprate real_tps real_fpr [nil] x1 x2 x3 dcn rppm).join("\t")
|
293
|
-
fasta = Fasta.new.read_file(opt.true_pos)
|
294
|
-
$true_pos_aaseqs = fasta.prots.map do |prot|
|
295
|
-
prot.aaseq.chomp
|
296
|
-
end
|
297
|
-
end
|
298
|
-
|
299
|
-
if opt.false
|
300
|
-
# it's a file if it exists
|
301
|
-
if File.exist? opt.false
|
302
|
-
fpr = file_to_prefiltered_spec_id(opt.false)
|
303
|
-
else # its a prefix (Assume one file for now!)
|
304
|
-
spec_obj = arr_of_spec_ids[0]
|
305
|
-
(tps, fps) = spec_obj.classify_by_prefix(:peps, opt.false)
|
306
|
-
fps_specid_minor = spec_obj.obj.class.new
|
307
|
-
tps_specid_minor = spec_obj.obj.class.new
|
308
|
-
fps_specid = SpecID.new
|
309
|
-
tps_specid = SpecID.new
|
310
|
-
fps_specid.obj = fps_specid_minor
|
311
|
-
tps_specid.obj = tps_specid_minor
|
312
|
-
fps_specid.peps = fps
|
313
|
-
tps_specid.peps = tps
|
314
|
-
arr_of_spec_ids[0] = tps_specid
|
315
|
-
fpr = fps_specid
|
316
|
-
end
|
317
|
-
end
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
base_args = [opt.x1, opt.x2, opt.x3, opt.c, opt.rppm]
|
323
|
-
|
324
|
-
if opt.from_file
|
325
|
-
lines = IO.readlines(opt.from_file)
|
326
|
-
lines.each do |line|
|
327
|
-
line.chomp!
|
328
|
-
answer = prep_reply(line, base_args)
|
329
|
-
next if answer == false
|
330
|
-
base_args = answer
|
331
|
-
filter_round(files, arr_of_spec_ids, :common, base_args, fpr, cysteines, true)
|
332
|
-
end
|
333
|
-
elsif opt.i
|
334
|
-
interactive_help
|
335
|
-
puts "*******************************************************"
|
336
|
-
puts "Number of proteins in files (this order):"
|
337
|
-
files.each do |file|
|
338
|
-
puts file
|
339
|
-
end
|
340
|
-
puts "*******************************************************"
|
341
|
-
reply = "nil"
|
342
|
-
loop do
|
343
|
-
b = base_args
|
344
|
-
puts "#{b[0]} #{b[1]} #{b[2]} dcn:#{b[3]} rppm:#{b[4]}"
|
345
|
-
loop do
|
346
|
-
reply = gets.chomp
|
347
|
-
answer = prep_reply(reply, base_args)
|
348
|
-
if answer == false
|
349
|
-
interactive_help
|
350
|
-
else
|
351
|
-
base_args = answer
|
352
|
-
filter_round(files, arr_of_spec_ids, :common, base_args, fpr, cysteines, true)
|
353
|
-
break
|
354
|
-
end
|
355
|
-
end
|
356
|
-
end
|
357
|
-
else
|
358
|
-
filter_round(files, arr_of_spec_ids, :common, base_args, fpr, cysteines, false)
|
359
|
-
end
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
data/bin/raw2mzXML.rb
DELETED
@@ -1,21 +0,0 @@
|
|
1
|
-
#!/usr/bin/ruby -w

# John Prince
# 2006-08-24
# Converts every file named on the command line to mzXML by running the
# external converter named in EXECUTE_MZXML_CONVERTER on each one in turn.

EXECUTE_MZXML_CONVERTER = "t2x"

if ARGV.size == 0
  script = File.basename(__FILE__)
  puts "usage: #{script} file1.RAW file2.RAW ..."
  puts " [or #{script} *.RAW]"
  exit
end

files = ARGV.to_a
files.each do |file|
  puts "******************************************"
  puts "About to convert #{file}"
  # The program and the file are handed to system as separate arguments, so
  # no shell is involved and file names containing spaces or shell
  # metacharacters reach the converter intact.
  converted = system(EXECUTE_MZXML_CONVERTER, file)
  # system returns false on a non-zero exit and nil if the command could not
  # be run at all; report either instead of carrying on silently.
  warn "conversion failed (is '#{EXECUTE_MZXML_CONVERTER}' installed?): #{file}" unless converted
  puts "******************************************"
end
|
@@ -1,258 +0,0 @@
|
|
1
|
-
require 'fileutils'
|
2
|
-
|
3
|
-
####################################################
|
4
|
-
# GLOBAL:
|
5
|
-
make_path = File.dirname(__FILE__) + "/../doc/src/tutorial/database_searching"
|
6
|
-
destination = File.dirname(__FILE__) + "/../doc/src/tutorial/database_searching/"
|
7
|
-
intropage = "index"
|
8
|
-
option_a_file = "two_db_search"
|
9
|
-
option_b_file = "cat_db_search"
|
10
|
-
extension = ".page"
|
11
|
-
index_header = "---
|
12
|
-
inMenu: false
|
13
|
-
directoryName: DB searching
|
14
|
-
---\n"
|
15
|
-
header = "---
|
16
|
-
inMenu: true
|
17
|
-
directoryName: DB searching
|
18
|
-
---\n"
|
19
|
-
####################################################
|
20
|
-
|
21
|
-
## Make the destination path
|
22
|
-
FileUtils.mkpath( make_path )
|
23
|
-
|
24
|
-
# Writes +header+ followed by +string+ to the file whose path is
# destination + base_name + extension (plain concatenation, so +destination+
# must already end with a path separator). An existing file is overwritten.
def print_file(destination, base_name, extension, header, string)
  target = destination + base_name + extension
  File.open(target, 'w') {|out| out.print(header + string) }
end
|
27
|
-
|
28
|
-
class StepByStep
|
29
|
-
|
30
|
-
def links
|
31
|
-
"How to: [Two-DB Search](two_db_search.html) | [Cat-DB Search](cat_db_search.html)"
|
32
|
-
end
|
33
|
-
|
34
|
-
def intro
|
35
|
-
"Step by step
|
36
|
-
------------
|
37
|
-
Guide for basic processing of mass spectrometry data derived from Bioworks 3.2.
|
38
|
-
"
|
39
|
-
end
|
40
|
-
|
41
|
-
def explanation
|
42
|
-
"### Database Search Options
|
43
|
-
|
44
|
-
There are two flavors for estimating a false positive identification rate for
|
45
|
-
protein identification search engines. The first uses two databases: one
|
46
|
-
normal, the other a decoy (['Two-DB'](two_db_search.html)). The second method (['Cat-DB'](cat_db_search.html)) uses a
|
47
|
-
single databases where decoy proteins have been inserted at the bottom of the
|
48
|
-
file. Each method has its advantages and limitations:
|
49
|
-
|
50
|
-
Two-DB: Pros: Database size unchanged.
|
51
|
-
Prophet prefers a normal database.
|
52
|
-
Cons: No independent estimation of FPR for Bioworks and Prophet probs.
|
53
|
-
Multiple parameters make it difficult to select a specfic FPR -
|
54
|
-
you take the FPR you get or keep re-filtering.
|
55
|
-
|
56
|
-
Cat-DB: Pros: Independent estimation of Bioworks and Prophet probs.
|
57
|
-
Can select desired FPR on single parameter (probability).
|
58
|
-
Cons: Probabilities (and some other score functions)
|
59
|
-
are influenced by presence of decoys and database size.
|
60
|
-
"
|
61
|
-
|
62
|
-
end
|
63
|
-
|
64
|
-
def to_sequest ; "Create file: `to_sequest.sld`
|
65
|
-
|
66
|
-
To run sequest, first create a `to_sequest.sld` file that points
|
67
|
-
sequest to your raw data files (you can use it to run sequest and in the
|
68
|
-
multi-consensus view).
|
69
|
-
"
|
70
|
-
end
|
71
|
-
|
72
|
-
def convert_to_pepxml ; "Convert to pepXML
|
73
|
-
|
74
|
-
bioworks_to_pepxml.rb bioworks.xml -p /cygdrive/c/Xcalibur/params/myparams.params -m /cygdrive/c/Xcalibur/data/mydatafolder
|
75
|
-
|
76
|
-
By default, the pepxml files will be written to a subdirectory called
|
77
|
-
'pepxml'. Type `bioworks_to_pepxml.rb` for more details.
|
78
|
-
"
|
79
|
-
end
|
80
|
-
|
81
|
-
|
82
|
-
def run_prophet ; "Run Protein Prophet
|
83
|
-
|
84
|
-
ProteinProphet must be run in a particular directory. If one does not exist, create an alias (in ~/.bashrc file) to simplify getting there: `alias isb=\"cd /cygdrive/c/Inetpub/wwwroot/ISB/data\"`. Then, to get to the isb folder, just type:
|
85
|
-
|
86
|
-
isb # -> takes you to /cygdrive/c/Inetpub/wwwroot/ISB/data
|
87
|
-
|
88
|
-
Then, run protein prophet:
|
89
|
-
|
90
|
-
xinteract -N<my_run_name>.xml -Op sequest/myfolder/pepxml/*.xml
|
91
|
-
|
92
|
-
Type `xinteract` for more details. *NOTE:* it is very important
|
93
|
-
that the path to the pepxml files be given starting with the sequest soft link
|
94
|
-
so the server thinks the data is mounted under the webserver.
|
95
|
-
|
96
|
-
The full protein results are written to '<my_run_name>-prot.xml'."
|
97
|
-
end
|
98
|
-
|
99
|
-
end
|
100
|
-
|
101
|
-
# Option for using 2 databases and normal prophet run...
|
102
|
-
class TwoDB < StepByStep
|
103
|
-
|
104
|
-
def option_title
|
105
|
-
"Two-DB: Real and Decoy Database Search
|
106
|
-
======================================" end
|
107
|
-
|
108
|
-
def run_sequest ; "Run Sequest with a Normal and an Inverse Database
|
109
|
-
|
110
|
-
If you don't already have one, here's how to make an inverse database:
|
111
|
-
|
112
|
-
fasta_shaker.rb reverse <yourfile.fasta>
|
113
|
-
|
114
|
-
This will create a file with the trailing tag '_reverse.fasta'. Just type
|
115
|
-
`fasta_shaker.rb` for more details.
|
116
|
-
|
117
|
-
Run sequest with 'report duplicate references' set to >= 40
|
118
|
-
"
|
119
|
-
end
|
120
|
-
|
121
|
-
def bioworks_xml ; "Export a Bioworks XML File for each Database
|
122
|
-
|
123
|
-
1. Load your sequest results in MultiConsensus results (even if you only have one run)
|
124
|
-
File -> 'Load MultiConsensus Results'
|
125
|
-
2. Click 'yes' to calculate peptide probabilities [optional]
|
126
|
-
3. Click 'yes' to view results without filtering.
|
127
|
-
4. Right click on the data and 'Export' to XML (name the file `bioworks.xml`). This file is fed into ProteinProphet.
|
128
|
-
5. Filter your data on the parameters you prefer and export.
|
129
|
-
6. Do the same thing (only need to do steps 1,2,5) for the inverted database. Make sure to filter on these same parameters and export these results, too. (To expirement with different parameters, open two Bioworks windows and filter the normal and inverse databases until satisfied).
|
130
|
-
"
|
131
|
-
end
|
132
|
-
|
133
|
-
def classification_analysis ; "Classification Analysis
|
134
|
-
|
135
|
-
ProteinProphet run with a normal database gives an estimate of false positive rates. We can view a protein summary with a desired cutoff:
|
136
|
-
|
137
|
-
protein_summary.rb -c 5.0 <my_run_name>-prot.xml
|
138
|
-
|
139
|
-
Proteins above the red cutoff line have a false positive rate of less than or equal to 5%.
|
140
|
-
|
141
|
-
We can verify Bioworks probability scores by counting the number of true hits
|
142
|
-
(from normal database) compared with false hits (from inverted db) using the same score filters for both. This command will give a protein summary and include precision and false positive rates (two different kinds):
|
143
|
-
|
144
|
-
protein_summary.rb bioworks_filtered.xml -f bioworks_filtered_INV.xml -p -g --fpr
|
145
|
-
|
146
|
-
Type `protein_summary.rb` for more details.
|
147
|
-
|
148
|
-
The false positive rate information can also be calculated without the protein summary:
|
149
|
-
|
150
|
-
false_positive_rate.rb bioworks_filtered.xml -f bioworks_filtered_INV.xml -p -g
|
151
|
-
|
152
|
-
Type `false_positive_rate.rb` for more details.
|
153
|
-
"
|
154
|
-
end
|
155
|
-
|
156
|
-
end # class
|
157
|
-
|
158
|
-
|
159
|
-
# Option for using a single concatenated (normal + decoy) database...
|
160
|
-
class CatDB < StepByStep
|
161
|
-
|
162
|
-
def option_title
|
163
|
-
"Cat-DB: Concatenated Database Search
|
164
|
-
===================================="
|
165
|
-
end
|
166
|
-
|
167
|
-
def run_sequest ; "Run Sequest with a Concatenated Inverse Database
|
168
|
-
|
169
|
-
If you don't already have one, here's how to make one:
|
170
|
-
|
171
|
-
fasta_shaker.rb reverse -c -p INV_ <yourfile.fasta>
|
172
|
-
|
173
|
-
This will create a file '<yourfile>_cat_reverse_prefix_INV_.fasta'. Each
|
174
|
-
inverted protein name will be prefixed with 'INV_'. Just type
|
175
|
-
`fasta_shaker.rb` for more details.
|
176
|
-
|
177
|
-
Run sequest with 'report duplicate references' set to >= 40
|
178
|
-
"
|
179
|
-
end
|
180
|
-
|
181
|
-
def bioworks_xml ; "Export a Bioworks XML File
|
182
|
-
|
183
|
-
1. Load your sequest results in MultiConsensus results (even if you only have one run)
|
184
|
-
File -> 'Load MultiConsensus Results'
|
185
|
-
2. Click 'yes' to calculate peptide probabilities.
|
186
|
-
3. Click 'yes' to view results without filtering.
|
187
|
-
4. Right click on the data and 'Export' to XML (name the file `bioworks.xml`)
|
188
|
-
5. If you care to, you can filter your data how you like for viewing in Bioworks Browser.
|
189
|
-
"
|
190
|
-
end
|
191
|
-
|
192
|
-
def classification_analysis ; "Classification Analysis
|
193
|
-
|
194
|
-
We can verify the Bioworks or ProteinProphet probability scores by counting
|
195
|
-
the number of true hits (proteins from normal database hits) compared with
|
196
|
-
false hits (proteins from inverted db hits). Since the probabilities are more
|
197
|
-
or less continuous, we can select any desired false positive rate.
|
198
|
-
|
199
|
-
protein_summary.rb bioworks_cat_inv.xml -f INV_ --fpr ## -> for Bioworks
|
200
|
-
protein_summary.rb run_cat_inv-prot.xml -f INV_ --fpr ## -> for Prophet
|
201
|
-
|
202
|
-
Type `protein_summary.rb` for more details.
|
203
|
-
|
204
|
-
We can also view the false positive rate without the protein summary (and compare false positive rates between files):
|
205
|
-
|
206
|
-
false_positive_rate.rb -f INV_ bioworks.xml proph-prot.xml
|
207
|
-
|
208
|
-
Type `false_positive_rate.rb` for more details.
|
209
|
-
"
|
210
|
-
end
|
211
|
-
|
212
|
-
end # class
|
213
|
-
|
214
|
-
|
215
|
-
# Renders +string+ as a level-3 markdown heading carrying the number +num+.
def number(num, string)
  "### #{num}. #{string}"
end

# Numbers the entries of +array+ consecutively, beginning at +start+.
# Entries whose first two characters are "==" are passed through unchanged
# and do not use up a number. Returns a new array; +array+ is not modified.
def number_array(start, array)
  counter = start
  array.map do |entry|
    next entry if entry[0, 2] == "=="

    heading = number(counter, entry)
    counter += 1
    heading
  end
end
|
232
|
-
|
233
|
-
st = StepByStep.new
|
234
|
-
a = TwoDB.new
|
235
|
-
b = CatDB.new
|
236
|
-
|
237
|
-
intro_page = [st.intro, st.explanation, st.links]
|
238
|
-
intro_page_string = intro_page.join("\n")
|
239
|
-
|
240
|
-
option_a = [a.to_sequest, a.run_sequest, a.bioworks_xml, a.convert_to_pepxml, a.run_prophet, a.classification_analysis]
|
241
|
-
option_b = [a.to_sequest, b.run_sequest, b.bioworks_xml, b.convert_to_pepxml, b.run_prophet, b.classification_analysis]
|
242
|
-
|
243
|
-
option_a = number_array(1, option_a)
|
244
|
-
option_b = number_array(1, option_b)
|
245
|
-
|
246
|
-
option_a.unshift a.option_title
|
247
|
-
option_b.unshift b.option_title
|
248
|
-
|
249
|
-
option_a_string = option_a.join("\n")
|
250
|
-
option_b_string = option_b.join("\n")
|
251
|
-
|
252
|
-
print_file( destination, intropage, extension, index_header, intro_page_string )
|
253
|
-
print_file( destination, option_a_file, extension, header, option_a_string )
|
254
|
-
print_file( destination, option_b_file, extension, header, option_b_string )
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|