mspire 0.1.7 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +41 -14
- data/bin/bioworks2excel.rb +1 -1
- data/bin/bioworks_to_pepxml.rb +46 -59
- data/bin/fasta_shaker.rb +1 -1
- data/bin/filter.rb +6 -0
- data/bin/find_aa_freq.rb +23 -0
- data/bin/id_precision.rb +3 -2
- data/bin/mzxml_to_lmat.rb +2 -1
- data/bin/pepproph_filter.rb +1 -1
- data/bin/precision.rb +1 -1
- data/bin/protein_summary.rb +2 -451
- data/bin/raw_to_mzXML.rb +55 -0
- data/bin/srf_group.rb +26 -0
- data/changelog.txt +7 -0
- data/lib/align.rb +3 -3
- data/lib/fasta.rb +6 -1
- data/lib/gi.rb +9 -4
- data/lib/roc.rb +2 -0
- data/lib/sample_enzyme.rb +2 -1
- data/lib/spec/mzxml/parser.rb +2 -43
- data/lib/spec/mzxml.rb +65 -2
- data/lib/spec_id/aa_freqs.rb +10 -7
- data/lib/spec_id/bioworks.rb +67 -87
- data/lib/spec_id/filter.rb +794 -0
- data/lib/spec_id/precision.rb +29 -36
- data/lib/spec_id/proph.rb +5 -3
- data/lib/spec_id/protein_summary.rb +459 -0
- data/lib/spec_id/sequest.rb +323 -271
- data/lib/spec_id/srf.rb +189 -135
- data/lib/spec_id.rb +276 -227
- data/lib/spec_id_xml.rb +101 -0
- data/lib/toppred.rb +18 -0
- data/script/degenerate_peptides.rb +47 -0
- data/script/filter-peps.rb +5 -1
- data/test/tc_align.rb +1 -1
- data/test/tc_bioworks.rb +25 -22
- data/test/tc_bioworks_to_pepxml.rb +37 -4
- data/test/tc_fasta.rb +3 -1
- data/test/tc_fasta_shaker.rb +8 -6
- data/test/tc_filter.rb +203 -0
- data/test/tc_gi.rb +6 -9
- data/test/tc_id_precision.rb +31 -0
- data/test/tc_mzxml.rb +8 -6
- data/test/tc_peptide_parent_times.rb +2 -1
- data/test/tc_precision.rb +1 -1
- data/test/tc_proph.rb +5 -5
- data/test/tc_protein_summary.rb +36 -13
- data/test/tc_sequest.rb +78 -33
- data/test/tc_spec_id.rb +128 -6
- data/test/tc_srf.rb +84 -38
- metadata +67 -62
- data/bin/fasta_cat.rb +0 -39
- data/bin/fasta_cat_mod.rb +0 -59
- data/bin/fasta_mod.rb +0 -57
- data/bin/filter_spec_id.rb +0 -365
- data/bin/raw2mzXML.rb +0 -21
- data/script/gen_database_searching.rb +0 -258
@@ -0,0 +1,794 @@
|
|
1
|
+
|
2
|
+
require 'spec_id'
|
3
|
+
require 'optparse'
|
4
|
+
require 'ostruct'
|
5
|
+
require 'spec_id/aa_freqs'
|
6
|
+
require 'shuffle'
|
7
|
+
require 'vec'
|
8
|
+
require 'table'
|
9
|
+
|
10
|
+
|
11
|
+
########################################################
|
12
|
+
WRITE_CYS_FIND = false
|
13
|
+
########################################################
|
14
|
+
|
15
|
+
|
16
|
+
module SpecID
|
17
|
+
attr_accessor :orig_peps, :passed_peps, :passed_prots
|
18
|
+
# The filename passed in for filtering
|
19
|
+
attr_accessor :passed_in_filename
|
20
|
+
|
21
|
+
# returns the top peptide hits per file dta (first_scan + charge)
|
22
|
+
# all hits with same score as top score are returned
|
23
|
+
# assumes that all fields are strings...
|
24
|
+
# converts xcorr, deltacn, deltamass, mass, and charge into numerical types
|
25
|
+
# deletes the protein array (but not relevant proteins)
|
26
|
+
# hashes on [pep.base_name, pep.first_scan, pep.charge]
|
27
|
+
# sets the @orig_peps attribute to those passing
|
28
|
+
# Keeps only the best-scoring peptide hit(s) for each spectrum.
#
# Hits are grouped by [base_name, first_scan, charge] (the equivalent of one
# .out file).  Within a group every hit whose xcorr equals the group's best
# xcorr is kept, so ties are all retained.
#
# Bioworks::Pep hits carry their fields as strings, so xcorr, deltacn,
# deltamass and mass are converted to Float and charge/first_scan to Integer
# (in place) before grouping.  Other peptide classes are used untouched.
#
# Stores the surviving hits in @orig_peps and returns them.
# NOTE(review): self.peps itself is not replaced here -- confirm that callers
# filtering on self.peps are meant to see the unfiltered list.
def top_peps_prefilter!
  ## Bioworks peps are text based and need to be transformed first
  if peps.first.is_a? Bioworks::Pep
    peps.each do |pep|
      pep.xcorr = pep.xcorr.to_f
      pep.deltacn = pep.deltacn.to_f
      pep.deltamass = pep.deltamass.to_f
      pep.mass = pep.mass.to_f
      pep.charge = pep.charge.to_i
      pep.first_scan = pep.first_scan.to_i
    end
  end
  ## Srf Peps need no transformation!

  # get the top peptide(s) by basename/firstscan/charge
  top_peps = []
  groups = self.peps.hash_by {|pep| [pep.base_name, pep.first_scan, pep.charge] }
  groups.values.each do |group|
    top_score = group.map {|pep| pep.xcorr }.max
    group.each {|pep| top_peps << pep if pep.xcorr == top_score }
  end
  @orig_peps = top_peps
end
|
57
|
+
|
58
|
+
# (xcorr1, xcorr2, xcorr3, deltacn, ppm)
|
59
|
+
# interface very unstable. For now, keeping it very loose...
|
60
|
+
# assumed that peptide xcorr, deltacn, deltamass, mass, ppm are Floats
|
61
|
+
# assumed that peptide charge is Integer
|
62
|
+
# returns peps_passed
|
63
|
+
# must respond to 'peps'
|
64
|
+
# DOES NOT UPDATE the prot.peps attribute!!
|
65
|
+
# Selects the peptide hits in self.peps that pass the SEQUEST thresholds.
#
# args:: [xcorr1, xcorr2, xcorr3, deltacn, ppm] -- minimum xcorr for charge
#        1, 2 and 3, minimum deltacn and maximum ppm.
# include_deltacnstar:: when false (default), hits with deltacn > 1.0 are
#        rejected even if they pass the thresholds (the source notes that
#        Bioworks often assigns 1.10 to the lowest-scoring hit).
#
# A hit passes when deltacn >= the threshold, xcorr >= the threshold for its
# charge and ppm <= the threshold.  Hits whose charge is not 1, 2 or 3 never
# pass.  pep.xcorr, pep.deltacn and pep.ppm are assumed to be numeric and
# pep.charge an Integer.
#
# Returns the passing hits; does NOT update any prot.peps attribute.
def filter_sequest(args, include_deltacnstar=false)
  (x1, x2, x3, deltacn, ppm) = args
  self.peps.select do |pep|
    min_xcorr =
      case pep.charge
      when 1 then x1
      when 2 then x2
      when 3 then x3
      end

    if min_xcorr.nil?
      # unsupported charge state (or no threshold supplied for it)
      false
    elsif !include_deltacnstar && pep.deltacn > 1.0
      false
    else
      pep.deltacn >= deltacn && pep.xcorr >= min_xcorr && pep.ppm <= ppm
    end
  end
end
|
87
|
+
|
88
|
+
|
89
|
+
# given some list of SpecID::Pep based objects, finds the list of proteins
|
90
|
+
# associated with those peptides
|
91
|
+
# update_prot_peps => when true, updates prot.peps attribute given the list
|
92
|
+
# of pephits
|
93
|
+
# kind =
|
94
|
+
# :no_update (current proteins are returned, but their peps attribute
|
95
|
+
# is not updated)
|
96
|
+
# :update (current proteins returned with peps attribute updated)
|
97
|
+
# :new (new proteins are created complete with peps attribute)
|
98
|
+
# Returns the proteins (unique by prot.reference) referenced by +pephits+.
#
# kind selects what happens to the proteins' peps attribute:
#   :no_update  the current protein objects are returned untouched (default)
#   :update     the current protein objects are returned with peps reset to
#               exactly those of +pephits+ that reference them
#   :new        duplicates (prot.dup) are returned, one per reference, with
#               peps holding only hits from +pephits+; the original proteins
#               and each pep.prots array are left untouched
# Any other value behaves like :no_update.
#
# When several protein objects share a reference, the last one seen is the
# one returned for that reference.
def self.passing_proteins(pephits, kind=:no_update)
  attach_peps = (kind == :update || kind == :new)

  if kind == :update
    pephits.each do |pep|
      pep.prots.each {|prt| prt.peps = [] }
    end
  end

  new_prots = {}   # reference => duplicated protein (only used for :new)
  prot_set = {}
  pephits.each do |pep|
    prts = pep.prots
    if kind == :new
      # map each protein onto its (lazily created) duplicate instead of
      # temporarily rewriting pep.prots
      prts = prts.map do |prt|
        new_prots[prt.reference] ||= begin
          np = prt.dup
          np.peps = []
          np
        end
      end
    end

    prts.each {|prt| prot_set[prt.reference] = prt }
    prts.each {|prt| prt.peps << pep } if attach_peps
  end

  prot_set.values
end
|
149
|
+
end
|
150
|
+
|
151
|
+
|
152
|
+
class SpecID::Filter
|
153
|
+
|
154
|
+
NUM_PROT_FPPR_ITERATIONS = 10
|
155
|
+
|
156
|
+
# Class-level convenience: builds a fresh filter object and hands +argv+ to
# its instance-level run_from_argv, returning whatever that returns.
def self.run_from_argv(argv)
  new.run_from_argv(argv)
end
|
160
|
+
|
161
|
+
# Runs a complete filtering session from command-line style arguments.
#
# Steps:
# 1. parse +argv+ (get_options); returns nil right away if that fails
# 2. read and prefilter each input file into a spec_id object
# 3. assemble the FPPR inputs: :cys (cysteine frequencies), :tps (true
#    positive protein sequences from a fasta file) and :dcy (decoy spec_id
#    objects, either read from files or split off by protein prefix)
# 4. run filter_round once, once per line of --filters_file, or repeatedly
#    in interactive mode
#
# Sets @fppr_methods, @groups_reporting, @cat_labels and @logfh.  The log
# file (if any) is closed when the method finishes, including on error or
# when the user quits.
def run_from_argv(argv)
  reply = get_options(argv)
  return unless reply
  files, opt = reply

  $stderr.puts "reading files (can take a minute or two for large files)..." if $VERBOSE
  spec_ids = files.map {|file| file_to_prefiltered_spec_id(file, opt) }

  ## the options hash
  hash = {}
  if opt.cys
    # [cys_background_freq, cys_containing_freq]; the second defaults to 0.0
    opt.cys[1] = opt.cys[1] ? opt.cys[1].to_f : 0.0
    hash[:cys] = opt.cys
  end

  if opt.tps
    hash[:tps] = Fasta.new.read_file(opt.tps).prots.map {|prot| prot.aaseq.chomp }
  end

  if opt.false
    true_spec_ids = []
    prefixes_or_files = SpecID.extend_args(opt.false, files.size)
    hash[:dcy] = spec_ids.zip(prefixes_or_files).map do |spec_id, prefix_or_file|
      if File.exist? prefix_or_file
        # decoys live in their own file
        true_spec_ids << spec_id
        file_to_prefiltered_spec_id(prefix_or_file, opt)
      else
        # decoys are mixed in and marked by a protein prefix: split them out
        (tps, fps) = spec_id.classify_by_prefix(:peps, prefix_or_file)
        fps_specid = spec_id.class.new
        tps_specid = spec_id.class.new
        fps_specid.peps = fps
        tps_specid.peps = tps
        # keep the filename so the results table still gets its title
        tps_specid.passed_in_filename = spec_id.passed_in_filename
        true_spec_ids << tps_specid
        fps_specid
      end
    end
    spec_ids = true_spec_ids
  end

  defaults = {
    :dcy => nil, # decoy spec_id objects, parallel to spec_ids
    :cys => nil, # [cys_background_freq, cys_containing_freq]
    :tps => nil,
    :tmm => nil,
    :occams_razor => opt.occams_razor,
  }
  args = defaults.merge hash

  base_args = [opt.x1, opt.x2, opt.x3, opt.c, opt.ppm]

  @fppr_methods = [:tmm, :tps, :cys, :dcy].select {|mth| args[mth] }
  @groups_reporting = [:pephits, :aaseq, :prothits]
  @groups_reporting.push( :occams_razor ) if args[:occams_razor]

  @cat_labels = {
    :pephits => 'pep_hits',
    :prothits => 'prot_hits',
    :aaseq => 'uniq_aa_hits',
    :occams_razor => 'occams_prot_hits',
  }

  @logfh = opt.log ? File.open(opt.log, 'w') : nil
  begin
    out filter_legend(@fppr_methods)

    if opt.filters_file
      IO.readlines(opt.filters_file).each do |line|
        answer = prep_reply(line.chomp, base_args)
        next if answer == false
        base_args = answer
        filter_round(spec_ids, base_args, args)
      end
    elsif opt.i
      ## CLEAR ARGV (since otherwise, gets reads it!)
      ARGV.clear
      out interactive_help
      loop do
        b = base_args
        out "#{b[0]} #{b[1]} #{b[2]} dcn:#{b[3]} ppm:#{b[4]}"
        loop do
          line = gets
          return if line.nil?   # end of input: leave quietly
          answer = prep_reply(line.chomp, base_args)
          if answer == false
            out interactive_help
          else
            base_args = answer
            filter_round(spec_ids, base_args, args)
            break
          end
        end
      end
    else
      filter_round(spec_ids, base_args, args)
    end
  ensure
    # also runs when prep_reply calls exit on 'q'
    if @logfh
      @logfh.close
      @logfh = nil
    end
  end
end
|
292
|
+
|
293
|
+
# Prints +string+ to stdout and, when a log file handle (@logfh) is set,
# writes the same text to the log as well.
def out(string)
  puts string
  @logfh.puts string if @logfh
end
|
299
|
+
|
300
|
+
# takes a fasta file or a string ( to be cast as a float )
|
301
|
+
# Resolves the cysteine frequency from +arg+.
# If +arg+ names an existing file, it is handed to SpecID::AAFreqs and the
# :C entry of its aafreqs is returned; otherwise +arg+ is converted with to_f.
def get_cys_freq(arg)
  return arg.to_f unless File.exist? arg
  SpecID::AAFreqs.new(arg).aafreqs[:C]
end
|
308
|
+
|
309
|
+
# prints shortened number for display
|
310
|
+
# Formats +num+ with exactly three digits after the decimal point (for display).
def short(num)
  format("%.3f", num)
end
|
313
|
+
|
314
|
+
# if good arguments, returns [files_array, options]
|
315
|
+
# otherwise prints the usage message and returns nil
|
316
|
+
# Parses command-line arguments without modifying +argv+.
#
# Returns [files, opt] where files are the non-option arguments and opt is an
# OpenStruct.  Defaults: x1=1.0, x2=1.5, x3=2.0, c (deltacn)=0.1, ppm=1000.0,
# false (decoys)=false; every other option is unset unless given.
#
# Returns nil after printing the usage text when no file is given.  An
# unknown or malformed option is reported on stderr, followed by the usage
# text, and also yields nil.
def get_options(argv)
  remaining = argv.dup

  opt = OpenStruct.new
  opt.x1 = 1.0
  opt.x2 = 1.5
  opt.x3 = 2.0
  opt.c = 0.1
  opt.ppm = 1000.0
  opt.false = false

  opts = OptionParser.new do |op|
    op.banner = "usage: #{File.basename(__FILE__)} [OPTS] <bioworks.xml | bioworks.srg>"
    op.separator("prints number of peptides/proteins ID'd at given thresholds")
    op.separator "only top hit (by xcorr) per scan+charge is considered"

    #op.separator("** 'dcn*' is the number of peptides with deltacn == 1.1")
    #op.separator(" (these are peptides who are the only hit with xcorr > 0)")
    op.separator ""
    op.on("-1", "--xcorr1 N", Float, "xcorr for +1 charge d: #{opt.x1}") {|v| opt.x1 = v}
    op.on("-2", "--xcorr2 N", Float, "xcorr for +2 charge d: #{opt.x2}") {|v| opt.x2 = v}
    op.on("-3", "--xcorr3 N", Float, "xcorr for +3 charge d: #{opt.x3}") {|v| opt.x3 = v}
    op.on("-c", "--deltacn N", Float, ">= deltacn d: #{opt.c}") {|v| opt.c = v}
    op.on("-p", "--ppm N", Float, "<= ppm d: #{opt.ppm}") {|v| opt.ppm = v}
    op.separator " if bioworks.xml, = 10^6deltamass/mass"
    op.on("-i", "--interactive", "interactive filtering") {|v| opt.i = v}
    op.on("-f", "--false a,b,c", Array, "prot prefixes or filenames of decoys") {|v| opt.false = v}
    op.separator(" last given will apply to remaining files")
    op.on("-y", "--cys <fasta_file|freq,[bkg]>", Array, "report fpr by expected cysteine freq") do |v|
      # first element: fasta file or literal frequency; second (optional): background
      v[0] = get_cys_freq(v[0])
      opt.cys = v
    end
    op.separator(" freq = freq of cysteine as amino acid")
    op.separator(" [bkg] = freq of cys containing peps d: 0.0")
    op.on("--filters_file <file>", "(no -i) file with list of interactive input") {|v| opt.filters_file = v}
    op.on("-t", "--tps <fasta>", "fasta file containing true hits") {|v| opt.tps = v }
    #op.on("--tmm <toppred.out>", "toppred.out file with transmembr. topology") {|v| opt.tps = v }
    op.on("--yaml", "spits out yaml-ized data") {|v| opt.tabulate = v }
    op.on("--combined_score", "shows the combined score") {|v| opt.combined_score = v }
    op.on("--marshal", "will write marshaled data or read existing") {|v| opt.marshal = v }
    op.on("--log <file>", "also writes all output to file") {|v| opt.log = v }
    op.on("--protein_summary", "writes passing proteins to .summary.html files") {|v| opt.protein_summary = v }
    op.on("-z", "--occams_razor", "will show minimal set of proteins") {|v| opt.occams_razor = v }
  end

  begin
    opts.parse!(remaining)
  rescue OptionParser::ParseError => e
    # honour the documented contract: report, show usage, return nil
    $stderr.puts e.message
    puts opts
    return nil
  end

  if remaining.size < 1
    puts opts
    return nil
  end

  [remaining, opt]
end
|
370
|
+
|
371
|
+
# (actual # with cys, expected # with cys, total#peptides,
|
372
|
+
# mean_fraction_of_cysteines_true, std)
|
373
|
+
# PepHit(C) = Peptide containing cysteine
|
374
|
+
# # Total PepHit(C) # Observed Bad Pep (C)
|
375
|
+
# ------------------ proportional_to ----------------------
|
376
|
+
# # Total PepHit # Total Bad PepHit (X)
|
377
|
+
# returns the fppr and the total number false
|
378
|
+
# Estimates the peptide false-positive rate from cysteine-containing hits.
#
#   # observed bad PepHit(C)                     # total bad PepHit (X)
#   ------------------------  proportional to   ----------------------
#   # expected PepHit(C)                         # total PepHit
#
# ac_num_with_cys::  observed number of hits containing cysteine
# exp_num_with_cys:: number of such hits expected from the background
# total_peptides::   total number of peptide hits
# mean_fraction_true_cys:: optional; exp_num_with_cys * this value is
#                    subtracted from the observed count (floored at 0) to
#                    discount cysteine hits that are true positives
# std_fraction_true_cys:: accepted but not used
#
# Returns [fppr, total_number_false].
# Degenerate inputs that used to yield NaN/Infinity: with no peptides, or
# with nothing expected and nothing observed, the result is [0.0, 0.0];
# with nothing expected but some observed, every peptide is counted false
# ([1.0, total_peptides]).
def fppr_by_cysteines(ac_num_with_cys, exp_num_with_cys, total_peptides, mean_fraction_true_cys=nil, std_fraction_true_cys=nil)
  return [0.0, 0.0] if total_peptides == 0

  # the number of bona fide BAD cysteine hits
  # (some of the cysteine hits are true positives)
  bad_cys_hits = ac_num_with_cys
  bad_cys_hits -= exp_num_with_cys * mean_fraction_true_cys if mean_fraction_true_cys
  bad_cys_hits = 0.0 if bad_cys_hits < 0.0

  if exp_num_with_cys == 0
    return [0.0, 0.0] if bad_cys_hits == 0
    return [1.0, total_peptides.to_f]
  end

  total_number_false = (bad_cys_hits * total_peptides).to_f / exp_num_with_cys
  fppr = total_number_false / total_peptides
  [fppr, total_number_false]
end
|
389
|
+
|
390
|
+
# num_peps_per_protein is an array of the number of peptides per protein hit
|
391
|
+
# (these are the true hits)
|
392
|
+
# assumes that the number follows a gaussian distribution (binomial
|
393
|
+
# distributions tend toward gaussians, I believe, at large N)
|
394
|
+
# returns [mean_num_wrong, mean_fppr, stdev_num_wrong, stdev_fppr]
|
395
|
+
# Estimates how many protein hits are false given a number of false peptides,
# by random simulation.
#
# num_peps_per_protein:: array with the peptide count of each protein hit
# number_false_peptides:: Integer number of peptide hits assumed false
# num_iterations:: number of random trials (default 10)
#
# Each trial removes number_false_peptides peptides chosen uniformly at
# random (every peptide slot equally likely, whatever protein it belongs to)
# and counts the proteins left with zero peptides.  Proteins that start with
# zero peptides therefore count as false in every trial.
#
# Returns [mean_num_wrong, mean_fppr, stdev_num_wrong, stdev_fppr], where the
# mean and stdev come from VecD#sample_stats and the fppr values are those
# divided by the number of proteins.  If there are at least as many false
# peptides as peptides in total, every protein is wrong and
# [num_proteins, 1.0, 0.0, 0.0] is returned without simulating.
def protein_fppr( num_peps_per_protein, number_false_peptides, num_iterations=10)
  num_prots = num_peps_per_protein.size
  total_protein_peps = num_peps_per_protein.inject(0) {|sum, num| sum + num }

  ## All peptides will be wrong every time!
  ## which means all proteins will be wrong every time!
  if number_false_peptides >= total_protein_peps
    return [num_prots, 1.0, 0.0, 0.0]
  end

  # one entry per peptide slot, holding the index of the protein that owns it
  owner_of_pep = []
  num_peps_per_protein.each_with_index do |num, prot_index|
    num.times { owner_of_pep << prot_index }
  end

  sample = VecD.new(num_iterations)
  num_iterations.times do |i|
    remaining = num_peps_per_protein.dup
    owner_of_pep.shuffle!
    # the first number_false_peptides slots of the shuffled list are "false"
    number_false_peptides.times do |slot|
      remaining[owner_of_pep[slot]] -= 1
    end
    sample[i] = remaining.select {|cnt| cnt == 0 }.size
  end

  (mean_num_wrong, stdev) = sample.sample_stats
  mean_fppr = mean_num_wrong / num_prots
  stdev_fppr = stdev / num_prots
  [mean_num_wrong, mean_fppr, stdev, stdev_fppr]
end
|
461
|
+
|
462
|
+
# returns [total_number_false, fppr, fraction_expected]
|
463
|
+
# also takes a hash of pephits keyed on :aaseq
|
464
|
+
# Cysteine-based false-positive estimate for a set of peptide hits.
#
# pephits may be an array of hits or a hash of hits keyed on aaseq (only its
# size is used here; the rest is delegated to SpecID::AAFreqs).
# Asks SpecID::AAFreqs for the actual and expected numbers of
# cysteine-containing hits given cys_bg_freq, then feeds them, together with
# cys_containing_freq, to fppr_by_cysteines.
#
# Returns [total_number_false, fppr, actual/expected].
def fraction_false_by_cysteines(pephits, cys_bg_freq, cys_containing_freq)
  actual, expected = SpecID::AAFreqs.new.actual_and_expected_number_containing_cysteines(pephits, cys_bg_freq)
  fraction_of_expected = actual.to_f / expected
  cys_fprate, total_num_false = fppr_by_cysteines(actual, expected, pephits.size, cys_containing_freq)
  [total_num_false, cys_fprate, fraction_of_expected]
end
|
471
|
+
|
472
|
+
# UNFINISHED ("underway" in the original source).
#
# Intended to print a cysteine-based FPR report, but none of the values it
# needs are passed in or available as instance state, so it cannot run.
# It now fails with an explicit NotImplementedError instead of an accidental
# NameError on the first undefined local.
#
# Sketch of the intended output, kept for whoever finishes it (the names are
# the undefined locals the original body referred to):
#
#   cys_tps = pep_nums[i] - total_num_false
#   CYSTEINE FPR:
#     (# peps containing >= 1 cysteines)
#     actual: ac
#     fraction of expected: short(fraction_of_expected)
#     expected # FP's: short(total_num_false)
#     estimated FPR: short(100.0*cys_fprate) %
#   combined_score = x1 + x2 + x3 + 20.0*deltacn + 4000.0*(1.0/ppm)
#   "Combined Score & FPR"                  -> combined_score, cys_fprate
#   "Combined Score & fraction of expected"
#   "WRITE_CYS_FIND:" line (combined_score, fraction_of_expected) if WRITE_CYS_FIND
#   "TABULATE:" line (combined_score, pep_tps, pep_fpr, cys_tps, cys_fprate,
#                     '', x1, x2, x3, deltacn, ppm) if opt.tabulate
def report_cysteines
  raise NotImplementedError, "report_cysteines is unfinished: its inputs are not yet passed in"
end
|
493
|
+
|
494
|
+
# Builds the legend printed once before any filter results.
# Always contains the caveat about protein FPPR values and the explanation
# of the deltacn_star marker; the key to the FPPR method abbreviations is
# appended only when +fppr_methods+ is non-empty.
# Returns the legend as one newline-joined string.
def filter_legend(fppr_methods)
  legend = [
    "Note: protein FPPR values are probably optimistic",
    "[this implementation assumes an equal likelihood that a false peptide",
    " comes from a protein with more hits as one with less (which is probably",
    " not the case)]",
    "* = deltacn_star = peptides with deltacn > 1.0 (no sibling hits)",
  ]
  unless fppr_methods.size == 0
    legend << "Following are methods for determining false identification rate:"
    # tmm (transmembrane) is not listed because it is not implemented yet
    legend << ['dcy=decoy', 'cys=cysteine', 'tps=known_true_positives'].join(" ")
  end
  legend.join("\n")
end
|
509
|
+
|
510
|
+
# does this give aafreq from a fasta file?
|
511
|
+
# freq = cysteines.aafreqs[:C]
|
512
|
+
|
513
|
+
# returns [total_number_false, fppr]
|
514
|
+
# pephits can be an array or a hash of peptides keyed on :aaseq
|
515
|
+
# Counts peptide hits that do not occur in any known true-positive protein.
#
# pephits:: array of hits responding to aaseq, or a hash keyed on aaseq
# true_pos_aaseqs_ar:: array of protein sequences (strings)
#
# A peptide is a false positive when its sequence is not a substring of any
# of the protein sequences.
#
# Returns [number_false, fppr] where fppr = number_false / pephits.size.
# With no hits at all the result is [0, 0.0] (rather than a NaN rate).
def fraction_false_by_true_pos(pephits, true_pos_aaseqs_ar)
  seqs = pephits.is_a?(Hash) ? pephits.keys : pephits.map {|hit| hit.aaseq }
  return [0, 0.0] if seqs.empty?

  unmatched = seqs.reject do |pep_aaseq|
    true_pos_aaseqs_ar.any? {|prot_aaseq| prot_aaseq.include? pep_aaseq }
  end
  real_fps = unmatched.size
  [real_fps, real_fps.to_f / pephits.size]
end
|
536
|
+
|
537
|
+
# Applies the SEQUEST filter to one spec_id object and collects the results.
#
# Returns a hash with:
#   :pephits   the hits passing spec_id.filter_sequest(filter_args)
#   :prothits  the proteins of those hits (peps attributes NOT updated)
#   :dcn_cnt   how many passing hits have deltacn > 1.0
#   :aaseq     the passing hits grouped by aaseq (hash: aaseq => array of hits)
# +args+ is accepted for interface symmetry but not used.
# NOTE(review): filter_sequest is called with its default, which rejects
# deltacn > 1.0 hits, so :dcn_cnt looks like it will always be 0 -- confirm.
def filter_spec_id(spec_id, filter_args, args)
  pephits = spec_id.filter_sequest(filter_args)
  {
    :prothits => SpecID.passing_proteins(pephits, :no_update),
    :pephits => pephits,
    :dcn_cnt => pephits.select {|hit| hit.deltacn > 1.0 }.size,
    :aaseq => pephits.hash_by(:aaseq),
  }
end
|
550
|
+
|
551
|
+
# returns [#FP, FPPR]
|
552
|
+
# Decoy-based estimate: every passing decoy hit counts as one false positive.
# Returns [number_false, fppr] with fppr = number_false / pephits.size.
def dcy_fppr(pephits, false_pephits)
  num_false = false_pephits.size
  [num_false, Float(num_false) / pephits.size]
end
|
556
|
+
|
557
|
+
# Placeholder for the transmembrane (tmm) FPPR method, which is not
# implemented: calling it aborts the program with "NEED TO IMPLEMENT".
def tmm_fppr(pephits)
  Kernel.abort("NEED TO IMPLEMENT")
end
|
560
|
+
|
561
|
+
# returns [#FP, FPPR]
|
562
|
+
# Cysteine-based FPPR: the first two values of fraction_false_by_cysteines
# (its third value, the fraction of expected, is dropped).
# Returns [number_false, fppr].
def cys_fppr(pephits, cys_bg_freq, cys_containing_freq)
  fraction_false_by_cysteines(pephits, cys_bg_freq, cys_containing_freq)[0, 2]
end
|
566
|
+
|
567
|
+
# True-positive-list FPPR: thin wrapper that gives fraction_false_by_true_pos
# the <method>_fppr name expected by calculate_pep_fppr.
# Returns whatever fraction_false_by_true_pos returns ([number_false, fppr]).
def tps_fppr(pephits, true_pos_aaseqs_ar)
  fraction_false_by_true_pos(pephits, true_pos_aaseqs_ar)
end
|
570
|
+
|
571
|
+
## methods should be passed in like this 'cysteine' for cysteine_fppr
|
572
|
+
## all methods should return [number_false, fppr]
|
573
|
+
## returns a hash (by method) for each set of pephits
|
574
|
+
## if :dcy is given as a method, then expects the false pephits array
|
575
|
+
# Runs every requested FPPR method on every set of peptide hits.
#
# pephits_ar:: array of hit collections (arrays, or hashes keyed on aaseq)
# methods::    FPPR method names such as :dcy, :cys, :tps; each is dispatched
#              to the method "<name>_fppr", which must return
#              [number_false, fppr]
# args::       option hash; args[:cys] is splatted into cys_fppr and
#              args[:tps] is passed to tps_fppr
# false_pephits_ar:: needed when :dcy is requested; its entries are the decoy
#              hits matching pephits_ar position by position
#
# Returns one hash per entry of pephits_ar: { method => [number_false, fppr] }.
def calculate_pep_fppr(pephits_ar, methods, args, false_pephits_ar=nil)
  pephits_ar.each_with_index.map do |ph, idx|
    methods.inject({}) do |by_method, mth|
      extra =
        case mth
        when :dcy then [false_pephits_ar[idx]]
        when :cys then [*args[:cys]]
        when :tps then [args[:tps]]
        else []
        end
      by_method[mth.to_sym] = send("#{mth}_fppr".to_sym, ph, *extra)
      by_method
    end
  end
end
|
595
|
+
|
596
|
+
# fpr is a SpecID obj that is the false positives
|
597
|
+
# cysteines holds an aafreqs object or nil
|
598
|
+
# Runs one round of filtering on every spec_id and prints the results.
#
# spec_ids::    the (true-hit) spec_id objects
# filter_args:: [xcorr1, xcorr2, xcorr3, deltacn, ppm]
# args::        option hash; args[:dcy] (if set) holds the decoy spec_id
#               objects parallel to spec_ids, args[:occams_razor] switches on
#               the minimal-protein-set report
#
# For each spec_id the FPPR is calculated like this:
#   pephits      per-method peptide FPPR on all passing hits
#   aaseq        per-method peptide FPPR on the unique sequences
#   prothits     protein_fppr(#false pephits, peptides per protein)
#   occams_razor protein_fppr(#false unique seqs, peptides per occams protein)
# All rates are converted to percentages before being tabulated.
#
# NOTE: the prot.peps arrays of the passing proteins are overwritten
# (SpecID.passing_proteins with :update).
def filter_round(spec_ids, filter_args, args)
  use_occams = args[:occams_razor]

  little_tables = []
  spec_ids.each_with_index do |spec_id, i|
    ## FILTER the NORMAL spec_id object
    normal_results = filter_spec_id(spec_id, filter_args, args)

    ## FILTER the FALSE object (if given) and set up the false results array
    fr_ar = nil
    if args[:dcy]
      false_results = filter_spec_id(args[:dcy][i], filter_args, args.dup)
      fr_ar = [false_results[:pephits], false_results[:aaseq]]
    end

    (pephits_fppr_results, aaseq_fppr_results) = calculate_pep_fppr([normal_results[:pephits], normal_results[:aaseq]], @fppr_methods, args, fr_ar)

    ## NORMAL prothits: update prothits peptides
    updated_proteins = SpecID.passing_proteins(normal_results[:pephits], :update)
    pep_cnt_arr = updated_proteins.map {|prot| prot.peps.size }

    ## occams prothits
    occams_pep_cnt_arr = nil
    if use_occams
      triplets = SpecID::occams_razor(updated_proteins, true)
      occams_pep_cnt_arr = triplets.map {|triplet| triplet[1].size }
      normal_results[:occams_razor] = triplets.map {|triplet| triplet[0] }
    end

    prothits_fppr_results = {}
    occams_results = {}
    @fppr_methods.each do |mth|
      # protein_fppr needs a whole number of false peptides, hence ceil
      prothits_fppr_results[mth] = protein_fppr(pep_cnt_arr, pephits_fppr_results[mth].first.ceil.to_i, NUM_PROT_FPPR_ITERATIONS)
      if use_occams
        occams_results[mth] = protein_fppr(occams_pep_cnt_arr, aaseq_fppr_results[mth].first.ceil.to_i, NUM_PROT_FPPR_ITERATIONS)
      end
    end

    fppr_results = {
      :pephits => pephits_fppr_results,
      :aaseq => aaseq_fppr_results,
      :prothits => prothits_fppr_results,
    }
    fppr_results[:occams_razor] = occams_results if use_occams

    ## CHANGE ALL RESULTS INTO PERCENTAGES (slot 1 of each result array)
    fppr_results.each_value do |by_method|
      by_method.each_value {|stats| stats[1] = 100.0 * stats[1] }
    end

    little_tables[i] = to_table( spec_id, args, normal_results, fppr_results, @groups_reporting, @fppr_methods, @cat_labels)
  end

  out filter_params_string(filter_args, @fppr_methods)
  little_tables.each do |tbl|
    out tbl.to_formatted_string(nil, ' ')
    out "-----------------------------------------------\n"
  end
end
|
675
|
+
|
676
|
+
|
677
|
+
|
678
|
+
# Builds the header printed above each round of results: a rule line, the
# thresholds in force and a trailing empty line, joined with newlines.
# filter_args is [xcorr1, xcorr2, xcorr3, deltacn, ppm]; fppr_methods is
# accepted but not used.
def filter_params_string(filter_args, fppr_methods)
  x1, x2, x3, deltacn, ppm = filter_args
  rule = "=========================================================================="
  thresholds = " xcorr(1,2,3) >= #{x1},#{x2},#{x3} || deltacn >= #{deltacn} || ppm <= #{ppm} "
  [rule, thresholds, ''].join("\n")
end
|
689
|
+
|
690
|
+
# Lays out the results of one spec_id as a Table.
#
# Rows follow groups_reporting (labelled through cat_labels); the columns are
# 'num' (size of that group's results) followed by one "<method>%" column per
# FPPR method, filled from slot 1 of fppr_results[group][method].
# The table title is spec_id.passed_in_filename.  +args+ is not used.
def to_table(spec_id, args, normal_results, fppr_results, groups_reporting, fppr_methods, cat_labels)
  col_labels = ['num'] + fppr_methods.map {|mth| "#{mth}%" }
  row_labels = groups_reporting.map {|grp| cat_labels[grp] }
  rows = groups_reporting.map do |grp|
    [normal_results[grp].size] + fppr_methods.map {|mth| fppr_results[grp][mth][1] }
  end
  Table.new(rows, row_labels, col_labels, spec_id.passed_in_filename)
end
|
708
|
+
|
709
|
+
# Collapses the five filter thresholds into a single stringency number:
#   x1 + x2 + x3 + 20.0*deltacn + 4000.0*(1.0/ppm)
# filter_args is [xcorr1, xcorr2, xcorr3, deltacn, ppm].
def combined_score(filter_args)
  x1, x2, x3, deltacn, ppm = filter_args
  [x1, x2, x3, 20.0 * deltacn, 4000.0 * (1.0 / ppm)].inject {|sum, term| sum + term }
end
|
713
|
+
|
714
|
+
# assumes its already chomped
|
715
|
+
# updates and returns the 5 filter values [x1, x2, x3, deltacn, ppm]
|
716
|
+
# Interprets one (already chomped) line of interactive/filters-file input.
#
# reply:: the input line
# base::  the current [x1, x2, x3, deltacn, ppm] values (not modified)
#
# Accepted forms, freely mixed on one line and separated by whitespace:
#   bare numbers      assigned in order to x1, x2, x3, dcn, ppm
#   key:value pairs   with key one of x1 x2 x3 dcn ppm
# key:value pairs are applied after the bare numbers, wherever they appear.
#
# Returns
#   +base+ itself   for an empty/blank line (re-run with current values)
#   a new array of five Floats for valid input
#   false           for nil, a non-numeric value, an unknown key or too many
#                   bare numbers (a "BAD ARG" line is printed via +out+)
# 'q' exits the program.
def prep_reply(reply, base)
  exit if reply == 'q'
  return false if reply.nil?
  return base if reply =~ /^\s*$/

  slot_for_key = { 'x1' => 0, 'x2' => 1, 'x3' => 2, 'dcn' => 3, 'ppm' => 4 }

  # split with no pattern so that leading whitespace yields no empty token
  keyed, positional = reply.split.partition {|token| token.include? ':' }

  changes = []   # [slot, text value, token as typed]
  positional.each_with_index {|token, slot| changes << [slot, token, token] }
  keyed.each do |token|
    key, value = token.split(':', 2)
    changes << [slot_for_key[key], value, token]
  end

  updated = base.dup
  changes.each do |slot, value, token|
    begin
      raise ArgumentError, "no such filter value" if slot.nil? || slot >= slot_for_key.size
      updated[slot] = Float(value)
    rescue ArgumentError, TypeError
      out "BAD ARG: #{token}"
      return false
    end
  end
  updated.map {|v| v.to_f }
end
|
756
|
+
|
757
|
+
# Returns the prefiltered spec_id object for +file+.
#
# If "<file>.prefiltered.msh" exists, the object is loaded from that marshal
# cache (regardless of opt.marshal).  Otherwise the file is read with
# SpecID.new, its passed_in_filename is set, top_peps_prefilter! is applied
# and, when opt.marshal is set, the result is written to the cache.
#
# The cache is opened in binary mode: Marshal data is binary, and text mode
# corrupts it on platforms that translate line endings.
#
# SECURITY NOTE: Marshal.load can instantiate arbitrary objects, so a
# .prefiltered.msh file from an untrusted source must never be placed next
# to the input.
# NOTE(review): a stale cache is used silently if the input changed after
# the cache was written -- confirm that is acceptable.
def file_to_prefiltered_spec_id(file, opt)
  marshal_file = file + ".prefiltered.msh"
  if File.exist?(marshal_file)
    return File.open(marshal_file, 'rb') {|fh| Marshal.load(fh) }
  end

  spec_id = SpecID.new(file)
  spec_id.passed_in_filename = file
  spec_id.top_peps_prefilter!
  if opt.marshal
    File.open(marshal_file, 'wb') {|fh| Marshal.dump(spec_id, fh) }
  end
  spec_id
end
|
777
|
+
|
778
|
+
# Returns the help text for interactive filtering (accepted input forms, how
# to re-run and how to quit) as one newline-joined string.
def interactive_help
  banner = "********************************************************"
  [
    banner,
    "INTERACTIVE FILTERING HELP:",
    "enter: <x1> <x2> <x3> <dcn> <ppm>",
    "or : x1:<x1> x2:<x2> x3:<x3> dcn:<dcn> ppm:<ppm>",
    "or : dcn:<dcn>",
    "or : <x1> <x2> ppm:<ppm>",
    "etc...",
    "<enter> to (re)run current values",
    "'q' to quit",
    banner,
  ].join("\n")
end
|
792
|
+
|
793
|
+
|
794
|
+
end
|