mspire 0.1.5 → 0.1.7
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +5 -2
- data/bin/bioworks_to_pepxml.rb +84 -40
- data/bin/fasta_shaker.rb +100 -0
- data/bin/filter_spec_id.rb +185 -23
- data/bin/gi2annot.rb +2 -110
- data/bin/id_class_anal.rb +31 -21
- data/bin/id_precision.rb +12 -8
- data/bin/{false_positive_rate.rb → precision.rb} +1 -1
- data/bin/protein_summary.rb +55 -62
- data/changelog.txt +34 -0
- data/lib/align.rb +0 -1
- data/lib/fasta.rb +88 -24
- data/lib/gi.rb +114 -0
- data/lib/roc.rb +64 -58
- data/lib/spec_id/aa_freqs.rb +166 -0
- data/lib/spec_id/bioworks.rb +5 -1
- data/lib/spec_id/precision.rb +427 -0
- data/lib/spec_id/proph.rb +2 -2
- data/lib/spec_id/sequest.rb +810 -113
- data/lib/spec_id/srf.rb +486 -0
- data/lib/spec_id.rb +107 -23
- data/release_notes.txt +11 -0
- data/script/estimate_fpr_by_cysteine.rb +226 -0
- data/script/filter-peps.rb +3 -3
- data/script/find_cysteine_background.rb +137 -0
- data/script/gen_database_searching.rb +11 -7
- data/script/genuine_tps_and_probs.rb +136 -0
- data/script/top_hit_per_scan.rb +5 -2
- data/test/tc_aa_freqs.rb +59 -0
- data/test/tc_bioworks.rb +6 -1
- data/test/tc_bioworks_to_pepxml.rb +25 -18
- data/test/tc_fasta.rb +81 -3
- data/test/tc_fasta_shaker.rb +147 -0
- data/test/tc_gi.rb +20 -0
- data/test/tc_id_class_anal.rb +9 -12
- data/test/tc_id_precision.rb +12 -11
- data/test/{tc_false_positive_rate.rb → tc_precision.rb} +13 -22
- data/test/tc_protein_summary.rb +31 -22
- data/test/tc_roc.rb +95 -50
- data/test/tc_sequest.rb +212 -145
- data/test/tc_spec.rb +10 -5
- data/test/tc_spec_id.rb +0 -2
- data/test/tc_spec_id_xml.rb +36 -0
- data/test/tc_srf.rb +216 -0
- metadata +35 -21
- data/lib/spec_id/false_positive_rate.rb +0 -476
- data/test/tc_gi2annot.rb +0 -12
data/Rakefile
CHANGED
@@ -16,7 +16,7 @@ NAME = "mspire"
|
|
16
16
|
lib_files = FL["lib/**/*"]
|
17
17
|
test_dir_too = FL["test/**/*"]
|
18
18
|
|
19
|
-
little_dist_files = lib_files + FL["INSTALL", "README", "Rakefile", "LICENSE", "{bin,script,tutorial}/**/*"]
|
19
|
+
little_dist_files = lib_files + FL["INSTALL", "README", "Rakefile", "LICENSE", "changelog.txt", "release_notes.txt", "{bin,script,tutorial}/**/*"]
|
20
20
|
dist_files = lib_files + FL["INSTALL", "README", "Rakefile", "LICENSE", "{bin,script,tutorial}/**/*", test_dir_too]
|
21
21
|
|
22
22
|
dist_files = little_dist_files # comment out to include test files
|
@@ -107,12 +107,15 @@ end
|
|
107
107
|
# PACKAGE / INSTALL / UNINSTALL
|
108
108
|
###############################################
|
109
109
|
|
110
|
+
## To release a package on rubyforge:
|
111
|
+
## Log in to rubyforge and go to the 'Files' tab
|
112
|
+
## then "To create a new release click here"
|
110
113
|
|
111
114
|
tm = Time.now
|
112
115
|
spec = Gem::Specification.new do |s|
|
113
116
|
s.platform = Gem::Platform::RUBY
|
114
117
|
s.name = NAME
|
115
|
-
s.version = "0.1.
|
118
|
+
s.version = "0.1.7"
|
116
119
|
s.summary = "Mass Spectrometry Proteomics Objects, Scripts, and Executables"
|
117
120
|
s.date = "#{tm.year}-#{tm.month}-#{tm.day}"
|
118
121
|
s.email = "jprince@icmb.utexas.edu"
|
data/bin/bioworks_to_pepxml.rb
CHANGED
@@ -7,8 +7,10 @@ DEFAULT_DATABASE_PATH = "/project/marcotte/marcotte/ms/database"
|
|
7
7
|
DEFAULT_MZXML_PATH = "."
|
8
8
|
DEFAULT_OUTDIR = "pepxml"
|
9
9
|
DEFAULT_PARAMS_GLOB = "*.params"
|
10
|
+
DEFAULT_PARAMS_FILE = Dir[DEFAULT_PARAMS_GLOB].first
|
10
11
|
DEFAULT_PEPXML_VERSION = 18
|
11
12
|
DEFAULT_MS_MODEL = 'LCQ'
|
13
|
+
DEFAULT_MASS_ANALYZER = 'Ion Trap'
|
12
14
|
##############################################################
|
13
15
|
|
14
16
|
require 'spec_id'
|
@@ -26,78 +28,120 @@ else
|
|
26
28
|
end
|
27
29
|
|
28
30
|
opt = OpenStruct.new
|
29
|
-
opt.mspath = DEFAULT_MZXML_PATH
|
30
|
-
opt.outdir = DEFAULT_OUTDIR
|
31
|
-
opt.params = Dir[DEFAULT_PARAMS_GLOB].first
|
32
|
-
opt.pepxml_version = DEFAULT_PEPXML_VERSION
|
33
|
-
opt.model = DEFAULT_MS_MODEL
|
34
31
|
|
35
32
|
opt_obj = OptionParser.new do |op|
|
36
|
-
op.banner = "\nusage: #{File.basename(__FILE__)} [options]
|
33
|
+
op.banner = "\nusage: #{File.basename(__FILE__)} [options] <file>.srf ...
|
34
|
+
usage: #{File.basename(__FILE__)} [options] bioworks.xml"
|
37
35
|
op.on_head "
|
38
|
-
Takes the xml exported output of Bioworks multi-consensus view
|
39
|
-
and outputs pepXML files (
|
36
|
+
Takes .srf files or the xml exported output of Bioworks multi-consensus view
|
37
|
+
(no filtering) and outputs pepXML files (to feed the trans-proteomic pipeline).
|
40
38
|
|
41
39
|
Options:"
|
42
|
-
op.on('-
|
43
|
-
op.on('-
|
44
|
-
|
45
|
-
op.
|
46
|
-
op.
|
47
|
-
op.
|
48
|
-
|
49
|
-
|
50
|
-
op.
|
40
|
+
op.on('-h', '--help', "display this and more notes and exit") {|v| opt.help = v }
|
41
|
+
op.on('-o', '--outdir path', "output directory d: '#{DEFAULT_OUTDIR}'") {|v| opt.outdir = v }
|
42
|
+
|
43
|
+
op.separator ""
|
44
|
+
op.separator "bioworks.xml files may require additional options:"
|
45
|
+
op.separator ""
|
46
|
+
op.on('-p', '--params file', "sequest params file d: '#{DEFAULT_PARAMS_FILE}'") {|v| opt.params = v }
|
47
|
+
op.on('-d', '--dbpath path', "path to databases d: '#{DEFAULT_DATABASE_PATH}'") {|v| opt.dbpath = v }
|
48
|
+
op.on('-m', '--mspath path', "path to MS files d: '#{DEFAULT_MZXML_PATH}'") {|v| opt.mspath = v }
|
49
|
+
op.on('--model <LCQ|Orbi|string>', "MS model d: '#{DEFAULT_MS_MODEL}'") {|v| opt.model = v }
|
50
|
+
op.on('--mass_analyzer <string>', "Mass Analyzer d: '#{DEFAULT_MASS_ANALYZER}'") {|v| opt.mass_analyzer = v }
|
51
|
+
op.on('-v', '--version pepxml_version', "pepxml version d: '#{DEFAULT_PEPXML_VERSION}'") {|v| opt.pepxml_version = v.to_i }
|
52
|
+
|
53
|
+
end
|
54
|
+
|
55
|
+
more_notes = "
|
51
56
|
Notes:
|
57
|
+
|
52
58
|
mspath: Directory to RAW or mzXML (version 1) files.
|
53
59
|
This option is not used with Bioworks 3.3 files.
|
54
60
|
outdir: Path will be created if it does not already exist.
|
55
61
|
model : LCQ -> 'LCQ Deca XP Plus'
|
56
62
|
: Orbi -> 'LTQ Orbitrap'
|
63
|
+
: other string -> That's the string that will be used.
|
57
64
|
|
65
|
+
options with spaces should be quoted: e.g., \"Time of Flight\"
|
58
66
|
|
59
67
|
Database Path:
|
60
|
-
|
68
|
+
|
61
69
|
If the database path in the sequest.params file is valid, that will be used.
|
62
|
-
|
70
|
+
Otherwise, will try (in order):
|
71
|
+
1. --dbpath or -d option
|
63
72
|
1. environmental variable BIOWORKS_DBPATH (currently: '#{db_env_var}')
|
64
73
|
2. constant at top of this script (currently: '#{DEFAULT_DATABASE_PATH}')
|
65
74
|
"
|
66
|
-
|
75
|
+
|
76
|
+
|
77
|
+
|
67
78
|
opt_obj.parse!
|
68
79
|
|
80
|
+
# intercept before argv count
|
81
|
+
if opt.help
|
82
|
+
puts opt_obj
|
83
|
+
puts more_notes
|
84
|
+
exit
|
85
|
+
end
|
86
|
+
|
69
87
|
if ARGV.size < 1
|
70
88
|
puts opt_obj
|
71
89
|
exit
|
72
90
|
end
|
73
91
|
|
74
92
|
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
abort "Bad MS model argument: #{opt.model}"
|
93
|
+
|
94
|
+
opt.outdir ||= DEFAULT_OUTDIR
|
95
|
+
|
96
|
+
## Create outdir if it does not exist
|
97
|
+
if opt.outdir
|
98
|
+
FileUtils.mkpath(opt.outdir) unless File.exist? opt.outdir
|
82
99
|
end
|
83
100
|
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
101
|
+
files = ARGV.to_a
|
102
|
+
|
103
|
+
if files[0] =~ /\.srf/i
|
104
|
+
opt.dbpath ||= def_dbpath
|
105
|
+
files.each do |file|
|
106
|
+
hash = {
|
107
|
+
:backup_db_path => opt.dbpath || def_dbpath,
|
108
|
+
:out_path => opt.outdir,
|
109
|
+
}
|
110
|
+
xml_obj = SpecID::Sequest::PepXML.new_from_srf(file, hash)
|
111
|
+
xml_obj.to_pepxml(xml_obj.base_name + ".xml")
|
112
|
+
end
|
89
113
|
else
|
114
|
+
## Ensure params file exists (unless opt given)
|
115
|
+
opt.params ||= DEFAULT_PARAMS_FILE
|
116
|
+
params_obj = SpecID::Sequest::Params.new(opt.params)
|
117
|
+
# Ensure the database exists!
|
90
118
|
unless File.exist?( params_obj.database )
|
91
|
-
|
119
|
+
if opt.dbpath
|
120
|
+
params_obj.database_path = opt.dbpath
|
121
|
+
else
|
122
|
+
params_obj.database_path = def_dbpath
|
123
|
+
end
|
92
124
|
end
|
93
|
-
end
|
94
125
|
|
95
|
-
|
96
|
-
|
126
|
+
opt.mspath ||= DEFAULT_MZXML_PATH
|
127
|
+
opt.pepxml_version ||= DEFAULT_PEPXML_VERSION
|
128
|
+
opt.model ||= DEFAULT_MS_MODEL
|
129
|
+
opt.mass_analyzer ||= DEFAULT_MASS_ANALYZER
|
130
|
+
|
131
|
+
case opt.model
|
132
|
+
when "LCQ"
|
133
|
+
model = 'LCQ Deca XP Plus'
|
134
|
+
when "Orbi"
|
135
|
+
model = 'LTQ Orbitrap'
|
136
|
+
else
|
137
|
+
model = opt.model
|
138
|
+
end
|
97
139
|
|
98
|
-
bioworks = ARGV[0]
|
99
|
-
xml_objs = SpecID::Sequest::PepXML.set_from_bioworks(params_obj, bioworks, opt.mspath, opt.outdir, opt.pepxml_version, 'trypsin', 'ThermoFinnigan', model)
|
100
140
|
|
101
|
-
|
102
|
-
|
141
|
+
bioworks = files[0]
|
142
|
+
xml_objs = SpecID::Sequest::PepXML.set_from_bioworks(params_obj, bioworks, opt.mspath, opt.outdir, opt.pepxml_version, 'trypsin', 'ThermoFinnigan', model)
|
143
|
+
|
144
|
+
xml_objs.each do |obj|
|
145
|
+
obj.to_pepxml(obj.base_name + ".xml")
|
146
|
+
end
|
103
147
|
end
|
data/bin/fasta_shaker.rb
ADDED
@@ -0,0 +1,100 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
|
3
|
+
# This is my second attempt at writing a simple interface for messing with
|
4
|
+
# fasta files. Achieving simplicity (and power) is challenging. It usually
|
5
|
+
# only happens on the second (or sometimes more) try. Of course, in
|
6
|
+
# retrospect the simple solution seems sooo obvious. But it's deceptive.
|
7
|
+
# It takes work to achieve simplicity for complex tasks. That's my thought
|
8
|
+
# for the day.
|
9
|
+
|
10
|
+
# fasta_shaker as in a salt shaker. Shake up your fasta proteins and let them
|
11
|
+
# season your dinner (hopefully a protein dinner). Mmmm. Don't they taste
|
12
|
+
# good all mixed up? If you want, you can think of it as a pepper shaker.
|
13
|
+
# I don't usually comment on my scripts (in my script, anyway), but this one
|
14
|
+
# came out so nice and clean that I feel like I have room to spare.
|
15
|
+
|
16
|
+
require 'fasta'
|
17
|
+
require 'cmdparse'
|
18
|
+
|
19
|
+
opt = {}
|
20
|
+
|
21
|
+
opts = OptionParser.new do |op|
|
22
|
+
prog = File.basename(__FILE__)
|
23
|
+
op.banner = "usage: #{prog} <method> [OPTIONS] <file>.fasta"
|
24
|
+
op.separator " <method> = reverse | shuffle"
|
25
|
+
op.on("-c", "--cat", "catenates the output to copy of original") {|v| opt[:cat] = v }
|
26
|
+
op.on("-o", "--out <string>", "name of output file (default is descriptive)") {|v| opt[:out] = v }
|
27
|
+
op.on("-p", "--prefix <string>", "give a header prefix to modified prots") {|v| opt[:prefix] = v }
|
28
|
+
op.on("-f", "--fraction <float>", "creates some fraction of proteins") {|v| opt[:fraction] = v }
|
29
|
+
op.separator " [if fraction > 1 then the tag 'f<frac#>_' prefixed to proteins"
|
30
|
+
op.separator " (after any given prefix) so that proteins are unique]"
|
31
|
+
op.on("--tryptic_peptides", "applies method to [KR][^P] peptides") {|v| opt[:tryptic_peptides] = v }
|
32
|
+
|
33
|
+
op.separator "EXAMPLES: "
|
34
|
+
op.separator " #{prog} reverse file.fasta -o protein_aa_sequence_reversed.fasta"
|
35
|
+
op.separator " #{prog} shuffle file.fasta -o protein_aa_sequence_shuffled.fasta"
|
36
|
+
op.separator " #{prog} shuffle file.fasta -c -p SH_ -o normal_cat_shuffled_with_prefix.fasta"
|
37
|
+
op.separator " #{prog} reverse file.fasta --tryptic_peptides tryptic_peptides_reversed.fasta"
|
38
|
+
end
|
39
|
+
|
40
|
+
opts.parse!
|
41
|
+
|
42
|
+
if ARGV.size < 2
|
43
|
+
puts opts
|
44
|
+
exit
|
45
|
+
end
|
46
|
+
|
47
|
+
(method, file) = ARGV
|
48
|
+
|
49
|
+
if opt[:cat] && !opt[:prefix]
|
50
|
+
puts "WARNING: concatenated proteins don't have unique headers"
|
51
|
+
puts "[you probably wanted to use the '--prefix' option!]"
|
52
|
+
end
|
53
|
+
|
54
|
+
# OUT filename:
|
55
|
+
unless opt[:out]
|
56
|
+
filebase = file.sub(/\..*$/,'')
|
57
|
+
parts = [filebase]
|
58
|
+
parts << 'cat' if opt[:cat]
|
59
|
+
parts << method
|
60
|
+
parts << 'prefix' << opt[:prefix] if opt[:prefix]
|
61
|
+
parts << 'fraction' << opt[:fraction] if opt[:fraction]
|
62
|
+
parts << 'tryptic_peptides' if opt[:tryptic_peptides]
|
63
|
+
opt[:out] = parts.join("_") << ".fasta"
|
64
|
+
end
|
65
|
+
|
66
|
+
## READ the file
|
67
|
+
fasta = Fasta.new.read_file(file)
|
68
|
+
|
69
|
+
## CAT (save an original copy)
|
70
|
+
fasta_orig = fasta.dup if opt[:cat]
|
71
|
+
|
72
|
+
## FRACTION the proteins
|
73
|
+
if f = opt[:fraction]
|
74
|
+
prefix = nil
|
75
|
+
f = f.to_f
|
76
|
+
if f > 1.0
|
77
|
+
prefix = proc {|cnt| "f#{cnt}_" }
|
78
|
+
end
|
79
|
+
fasta = fasta.fraction_of_prots(f, prefix)
|
80
|
+
end
|
81
|
+
|
82
|
+
## PREFIX the proteins
|
83
|
+
if pre = opt[:prefix]
|
84
|
+
fasta.header_prefix!(pre)
|
85
|
+
end
|
86
|
+
|
87
|
+
## MODIFY the proteins
|
88
|
+
fasta.aaseq!((method + '!').to_sym, opt[:tryptic_peptides])
|
89
|
+
|
90
|
+
## CAT (finish it up)
|
91
|
+
if opt[:cat]
|
92
|
+
fasta_orig << fasta
|
93
|
+
fasta = fasta_orig
|
94
|
+
end
|
95
|
+
|
96
|
+
## WRITE out the file
|
97
|
+
fasta.write_file(opt[:out])
|
98
|
+
|
99
|
+
|
100
|
+
|
data/bin/filter_spec_id.rb
CHANGED
@@ -1,10 +1,15 @@
|
|
1
1
|
#!/usr/bin/ruby -w
|
2
2
|
|
3
3
|
require 'spec_id'
|
4
|
-
require 'hash_by'
|
5
4
|
require 'optparse'
|
6
5
|
require 'ostruct'
|
6
|
+
require 'spec_id/aa_freqs'
|
7
7
|
|
8
|
+
########################################################
|
9
|
+
WRITE_MARSHAL = true
|
10
|
+
TABULATE_DATA = true
|
11
|
+
WRITE_CYS_FIND = false
|
12
|
+
########################################################
|
8
13
|
|
9
14
|
opt = OpenStruct.new
|
10
15
|
opt.x1 = 1.0
|
@@ -14,14 +19,19 @@ opt.c = 0.5
|
|
14
19
|
opt.rppm = 1000.0
|
15
20
|
opt.false = false
|
16
21
|
|
22
|
+
# prints shortened number for display
|
23
|
+
def short(num)
|
24
|
+
sprintf( "%.3f",num)
|
25
|
+
end
|
26
|
+
|
17
27
|
opts = OptionParser.new do |op|
|
18
|
-
op.banner = "usage: #{File.basename(__FILE__)} [OPTS] bioworks.xml
|
19
|
-
op.separator("prints number of proteins (and FPR if
|
28
|
+
op.banner = "usage: #{File.basename(__FILE__)} [OPTS] bioworks.xml"
|
29
|
+
op.separator("prints number of proteins (and FPR if -f option)")
|
20
30
|
op.separator ""
|
21
31
|
|
22
32
|
op.separator("** only takes the top hit per scan+charge")
|
23
|
-
op.separator("**
|
24
|
-
op.separator(" (
|
33
|
+
op.separator("** 'dcn*' is the number of peptides with deltacn == 1.1")
|
34
|
+
op.separator(" (these are peptides who are the only hit with xcorr > 0)")
|
25
35
|
op.separator ""
|
26
36
|
op.on("-1", "--xcorr1 <f>", "xcorr for +1 charge d: #{opt.x1}") {|v| opt.x1 = v.to_f}
|
27
37
|
op.on("-2", "--xcorr2 <f>", "xcorr for +2 charge d: #{opt.x2}") {|v| opt.x2 = v.to_f}
|
@@ -30,40 +40,151 @@ opts = OptionParser.new do |op|
|
|
30
40
|
op.on("--rppm <f>", "<= rough ppm (10^6*deltamass/mass) d: #{opt.rppm}") {|v| opt.rppm = v.to_f}
|
31
41
|
op.on("-i", "--interactive", "interactive filtering") {|v| opt.i = v}
|
32
42
|
op.on("-f", "--false <s>", "protein prefix or file name of decoys") {|v| opt.false = v}
|
43
|
+
op.on("-y", "--cysteines <fasta_file|freq>", "report fpr by expected cysteine freq") {|v| opt.cysteines = v}
|
44
|
+
op.on("--cback <mean,stdev>", "the cysteine background") {|v| opt.cback = v}
|
45
|
+
op.on("--from_file <file>", "(no -i) file with list of interactive input") {|v| opt.from_file = v}
|
46
|
+
op.on("-t", "--true_pos <fasta>", "fasta file containing true hits") {|v| opt.true_pos = v }
|
33
47
|
end
|
34
48
|
|
49
|
+
$cys_mean = nil
|
50
|
+
$cys_stdev = nil
|
51
|
+
|
35
52
|
|
36
53
|
# fpr is a SpecID obj that is the false positives
|
37
|
-
|
54
|
+
# cysteines holds an aafreqs object or nil
|
55
|
+
def filter_round(files, spec_ids, kind, args, fpr=nil, cysteines=nil, interactive=false)
|
38
56
|
(x1, x2, x3, deltacn, rppm) = args
|
57
|
+
combined_score = x1 + x2 + x3 + 20.0*deltacn + 4000.0*(1.0/rppm)
|
39
58
|
puts "=========================================================================="
|
40
59
|
puts "[[ xcorr(1,2,3) >= #{x1},#{x2},#{x3} ; deltacn >= #{deltacn} ; rough_ppm <= #{rppm} ]]"
|
41
60
|
# push fpr on the end for the calculations
|
42
61
|
if fpr ; spec_ids.push(fpr) ; end
|
43
|
-
|
44
|
-
|
62
|
+
arr_of_prots_and_peps_and_deltacnstars_and_cfpr = spec_ids.map do |spec_id|
|
63
|
+
(prots, peps, deltacnstar_cnt) = spec_id.filter(kind, *args)
|
64
|
+
if cysteines
|
65
|
+
|
66
|
+
if cysteines.is_a? Float
|
67
|
+
freq = cysteines
|
68
|
+
else
|
69
|
+
freq = cysteines.aafreqs[:C]
|
70
|
+
end
|
71
|
+
(ac, exp) = SpecID::AAFreqs.new.actual_and_expected_number_containing_cysteines(peps, freq)
|
72
|
+
[prots, peps, deltacnstar_cnt, [ac,exp]]
|
73
|
+
else
|
74
|
+
[prots, peps, deltacnstar_cnt]
|
75
|
+
end
|
45
76
|
end
|
46
|
-
arr_of_num_of_prots =
|
47
|
-
arr_of_num_of_peps =
|
77
|
+
arr_of_num_of_prots = arr_of_prots_and_peps_and_deltacnstars_and_cfpr.map {|ar| ar[0].size }
|
78
|
+
arr_of_num_of_peps = arr_of_prots_and_peps_and_deltacnstars_and_cfpr.map {|ar| ar[1].size }
|
79
|
+
deltacnstars = arr_of_prots_and_peps_and_deltacnstars_and_cfpr.map {|ar| ar[2] }
|
80
|
+
cys_reports = arr_of_prots_and_peps_and_deltacnstars_and_cfpr.map {|ar| ar[3] } if cysteines
|
48
81
|
prot_nums = arr_of_num_of_prots
|
49
82
|
pep_nums = arr_of_num_of_peps
|
83
|
+
## files = [file1, file2, file3]
|
84
|
+
## prot_nums = [nums1, nums2, nums3, nums_for_false_positives]
|
85
|
+
## pep_nums = [nums1, nums2, nums3, nums_for_false_positives]
|
50
86
|
files.each_with_index do |file,i|
|
87
|
+
|
51
88
|
if !interactive
|
52
89
|
puts "#{file} [prots]:\t#{prot_nums[i]}"
|
53
|
-
puts "#{file} [peps]:\t#{pep_nums[i]}"
|
90
|
+
puts "#{file} [peps]:\t#{pep_nums[i]} (dcn*=#{deltacnstars[i]})"
|
54
91
|
else
|
55
92
|
puts "file#{i+1} [prots]: #{prot_nums[i]}"
|
56
|
-
puts "file#{i+1} [peps]: #{pep_nums[i]}"
|
93
|
+
puts "file#{i+1} [peps]: #{pep_nums[i]} (dcn*=#{deltacnstars[i]})"
|
57
94
|
end
|
58
95
|
if fpr
|
59
|
-
puts "FPR [prots] : " +
|
60
|
-
puts "FPR [peps] : " +
|
96
|
+
#puts "FPR [prots] : " + short( 100.0*(prot_nums[-1].to_f/prot_nums[i].to_f) ) + " % (#{prot_nums[-1]})"
|
97
|
+
#puts "FPR [peps] : " + short( 100.0*(pep_nums[-1].to_f/pep_nums[i].to_f) ) + " % (#{pep_nums[-1]}) (dcn*=#{deltacnstars[-1]})"
|
98
|
+
|
99
|
+
## For separate searches: every false positive = one less TP
|
100
|
+
## For concatenated searches: every false positive is one less TP
|
101
|
+
## THAT's what I've been doing already !
|
102
|
+
|
103
|
+
prot_tps = prot_nums[i] - prot_nums[-1]
|
104
|
+
pep_tps = pep_nums[i] - pep_nums[-1]
|
105
|
+
prot_fps = prot_nums[i] - prot_tps
|
106
|
+
pep_fps = pep_nums[i] - pep_tps
|
107
|
+
prot_fpr = prot_fps.to_f/prot_nums[i].to_f
|
108
|
+
pep_fpr = pep_fps.to_f/pep_nums[i].to_f
|
109
|
+
# those are the same!
|
110
|
+
puts "FPR [prots] : " + short( 100.0*prot_fpr ) + " % (#{prot_fps})"
|
111
|
+
puts "FPR [peps] : " + short( 100.0*pep_fpr ) + " % (#{pep_fps}) (dcn*=#{deltacnstars[-1]})"
|
112
|
+
end
|
113
|
+
if cysteines
|
114
|
+
(ac, exp) = cys_reports[i]
|
115
|
+
|
116
|
+
(cys_fprate, total_num_false) = fpr_by_cysteines(ac, exp, pep_nums[i], $cys_mean, $cys_stdev)
|
117
|
+
fraction_of_expected = ac.to_f/exp
|
118
|
+
cys_tps = pep_nums[i] - total_num_false
|
119
|
+
|
120
|
+
puts "CYSTEINE FPR: "
|
121
|
+
puts " (# peps containing >= 1 cysteines)"
|
122
|
+
puts " actual: #{ac}"
|
123
|
+
puts "fraction of expected: #{short(fraction_of_expected)}"
|
124
|
+
puts " expected # FP's: " + short(total_num_false)
|
125
|
+
puts " estimated FPR: " + short( 100.0*cys_fprate ) + " % "
|
126
|
+
|
127
|
+
puts "combined_score = x1 + x2 + x3 + 20.0*deltacn + 4000.0*(1.0/rppm)"
|
128
|
+
puts "Combined Score & FPR"
|
129
|
+
puts "#{combined_score}\t#{cys_fprate}"
|
130
|
+
puts "Combined Score & fraction of expected"
|
131
|
+
#puts "#{combined_score} #{fraction_of_expected}"
|
132
|
+
to_write_cys_find = ["WRITE_CYS_FIND:", combined_score, fraction_of_expected]
|
133
|
+
puts to_write_cys_find.join("\t") if WRITE_CYS_FIND
|
134
|
+
to_tab = ['TABULATE:', combined_score, pep_tps, pep_fpr, cys_tps, cys_fprate, '', x1, x2, x3, deltacn, rppm]
|
135
|
+
puts to_tab.join("\t") if TABULATE_DATA
|
136
|
+
end
|
137
|
+
if $true_pos_aaseqs
|
138
|
+
peps = arr_of_prots_and_peps_and_deltacnstars_and_cfpr[i][1]
|
139
|
+
real_tps = 0
|
140
|
+
real_fps = 0
|
141
|
+
# could also do with partition
|
142
|
+
peps.each do |pep|
|
143
|
+
if pep.sequence =~ /\.([\w\*]+)\.?/
|
144
|
+
if $true_pos_aaseqs.any? {|aaseq| aaseq.include? $1}
|
145
|
+
real_tps += 1
|
146
|
+
else
|
147
|
+
real_fps += 1
|
148
|
+
end
|
149
|
+
else
|
150
|
+
abort "Couldn't Match: #{pep.sequence}"
|
151
|
+
end
|
152
|
+
end
|
153
|
+
if peps.size > 0
|
154
|
+
real_fpr = real_fps.to_f/peps.size
|
155
|
+
else
|
156
|
+
real_fpr = 0.0
|
157
|
+
end
|
158
|
+
puts "REAL FPR: #{real_fpr}"
|
159
|
+
puts "REAL #TP: #{real_tps}"
|
160
|
+
to_tab = ['TABULATE:', combined_score, pep_tps, pep_fpr, real_tps, real_fpr, '', x1, x2, x3, deltacn, rppm]
|
161
|
+
puts to_tab.join("\t") if TABULATE_DATA
|
162
|
+
|
61
163
|
end
|
62
164
|
end
|
63
165
|
#puts files.join(' | ')
|
64
166
|
#puts nums.join(' | ')
|
65
167
|
end
|
66
168
|
|
169
|
+
|
170
|
+
# (actual # with cys, expected # with cys, total#peptides,
|
171
|
+
# mean_fraction_of_cysteines_true, std)
|
172
|
+
# PepHit(C) = Peptide containing cysteine
|
173
|
+
# # Total PepHit(C) # Observed Bad Pep (C)
|
174
|
+
# ------------------ proportional_to ----------------------
|
175
|
+
# # Total PepHit # Total Bad PepHit (X)
|
176
|
+
def fpr_by_cysteines(ac_num_with_cys, exp_num_with_cys, total_peptides, mean_fraction_true_cys=nil, std_fraction_true_cys=nil)
|
177
|
+
|
178
|
+
# the number of bona fide BAD cysteine hits
|
179
|
+
# (some of the cysteine hits (~5%) are true positives)
|
180
|
+
|
181
|
+
ac_num_with_cys -= exp_num_with_cys * mean_fraction_true_cys if mean_fraction_true_cys
|
182
|
+
if ac_num_with_cys < 0.0 ; ac_num_with_cys = 0.0 end
|
183
|
+
total_number_false = (ac_num_with_cys * total_peptides).to_f/exp_num_with_cys
|
184
|
+
fpr = total_number_false / total_peptides
|
185
|
+
[fpr, total_number_false]
|
186
|
+
end
|
187
|
+
|
67
188
|
# assumes it's already chomped
|
68
189
|
# updates the 5 globals
|
69
190
|
def prep_reply(reply, base)
|
@@ -108,8 +229,22 @@ def prep_reply(reply, base)
|
|
108
229
|
end
|
109
230
|
|
110
231
|
def file_to_prefiltered_spec_id(file)
|
111
|
-
spec_id =
|
112
|
-
|
232
|
+
spec_id = nil
|
233
|
+
marshal_file = file + ".prefiltered.msh"
|
234
|
+
if File.exist?(marshal_file)
|
235
|
+
File.open(marshal_file) do |fh|
|
236
|
+
spec_id = Marshal.load(fh)
|
237
|
+
end
|
238
|
+
else
|
239
|
+
spec_id = SpecID.new(file)
|
240
|
+
spec_id.top_peps_prefilter!
|
241
|
+
## marshal it!
|
242
|
+
if WRITE_MARSHAL
|
243
|
+
File.open(marshal_file, "w") do |fh|
|
244
|
+
Marshal.dump(spec_id,fh)
|
245
|
+
end
|
246
|
+
end
|
247
|
+
end
|
113
248
|
spec_id
|
114
249
|
end
|
115
250
|
|
@@ -123,7 +258,6 @@ def interactive_help
|
|
123
258
|
puts "'q' to quit"
|
124
259
|
end
|
125
260
|
|
126
|
-
|
127
261
|
opts.parse!
|
128
262
|
|
129
263
|
if ARGV.size < 1
|
@@ -140,6 +274,28 @@ arr_of_spec_ids = files.map do |file|
|
|
140
274
|
end
|
141
275
|
|
142
276
|
fpr = nil
|
277
|
+
cysteines = nil
|
278
|
+
if opt.cysteines
|
279
|
+
puts %w(TABULATE combined_score pep_tps pep_fprate cys_pep_tps cys_pep_fprate [nil] x1 x2 x3 dcn rppm).join("\t")
|
280
|
+
if File.exist? opt.cysteines
|
281
|
+
cysteines = SpecID::AAFreqs.new(opt.cysteines)
|
282
|
+
else
|
283
|
+
cysteines = opt.cysteines.to_f
|
284
|
+
end
|
285
|
+
if opt.cback
|
286
|
+
($cys_mean, $cys_stdev) = opt.cback.split(',').map{|v| v.to_f }
|
287
|
+
end
|
288
|
+
end
|
289
|
+
|
290
|
+
$true_pos_aaseqs = nil
|
291
|
+
if opt.true_pos
|
292
|
+
puts %w(TABULATE combined_score pep_tps pep_fprate real_tps real_fpr [nil] x1 x2 x3 dcn rppm).join("\t")
|
293
|
+
fasta = Fasta.new.read_file(opt.true_pos)
|
294
|
+
$true_pos_aaseqs = fasta.prots.map do |prot|
|
295
|
+
prot.aaseq.chomp
|
296
|
+
end
|
297
|
+
end
|
298
|
+
|
143
299
|
if opt.false
|
144
300
|
# it's a file if it exists
|
145
301
|
if File.exist? opt.false
|
@@ -163,12 +319,18 @@ end
|
|
163
319
|
|
164
320
|
|
165
321
|
|
166
|
-
|
167
|
-
|
168
322
|
base_args = [opt.x1, opt.x2, opt.x3, opt.c, opt.rppm]
|
169
323
|
|
170
|
-
|
171
|
-
|
324
|
+
if opt.from_file
|
325
|
+
lines = IO.readlines(opt.from_file)
|
326
|
+
lines.each do |line|
|
327
|
+
line.chomp!
|
328
|
+
answer = prep_reply(line, base_args)
|
329
|
+
next if answer == false
|
330
|
+
base_args = answer
|
331
|
+
filter_round(files, arr_of_spec_ids, :common, base_args, fpr, cysteines, true)
|
332
|
+
end
|
333
|
+
elsif opt.i
|
172
334
|
interactive_help
|
173
335
|
puts "*******************************************************"
|
174
336
|
puts "Number of proteins in files (this order):"
|
@@ -187,13 +349,13 @@ if opt.i
|
|
187
349
|
interactive_help
|
188
350
|
else
|
189
351
|
base_args = answer
|
190
|
-
filter_round(files, arr_of_spec_ids, :common, base_args, fpr, true)
|
352
|
+
filter_round(files, arr_of_spec_ids, :common, base_args, fpr, cysteines, true)
|
191
353
|
break
|
192
354
|
end
|
193
355
|
end
|
194
356
|
end
|
195
357
|
else
|
196
|
-
filter_round(files, arr_of_spec_ids, :common, base_args, fpr, false)
|
358
|
+
filter_round(files, arr_of_spec_ids, :common, base_args, fpr, cysteines, false)
|
197
359
|
end
|
198
360
|
|
199
361
|
|