mspire 0.1.5 → 0.1.7

Sign up to get free protection for your applications and to get access to all the features.
Files changed (47) hide show
  1. data/Rakefile +5 -2
  2. data/bin/bioworks_to_pepxml.rb +84 -40
  3. data/bin/fasta_shaker.rb +100 -0
  4. data/bin/filter_spec_id.rb +185 -23
  5. data/bin/gi2annot.rb +2 -110
  6. data/bin/id_class_anal.rb +31 -21
  7. data/bin/id_precision.rb +12 -8
  8. data/bin/{false_positive_rate.rb → precision.rb} +1 -1
  9. data/bin/protein_summary.rb +55 -62
  10. data/changelog.txt +34 -0
  11. data/lib/align.rb +0 -1
  12. data/lib/fasta.rb +88 -24
  13. data/lib/gi.rb +114 -0
  14. data/lib/roc.rb +64 -58
  15. data/lib/spec_id/aa_freqs.rb +166 -0
  16. data/lib/spec_id/bioworks.rb +5 -1
  17. data/lib/spec_id/precision.rb +427 -0
  18. data/lib/spec_id/proph.rb +2 -2
  19. data/lib/spec_id/sequest.rb +810 -113
  20. data/lib/spec_id/srf.rb +486 -0
  21. data/lib/spec_id.rb +107 -23
  22. data/release_notes.txt +11 -0
  23. data/script/estimate_fpr_by_cysteine.rb +226 -0
  24. data/script/filter-peps.rb +3 -3
  25. data/script/find_cysteine_background.rb +137 -0
  26. data/script/gen_database_searching.rb +11 -7
  27. data/script/genuine_tps_and_probs.rb +136 -0
  28. data/script/top_hit_per_scan.rb +5 -2
  29. data/test/tc_aa_freqs.rb +59 -0
  30. data/test/tc_bioworks.rb +6 -1
  31. data/test/tc_bioworks_to_pepxml.rb +25 -18
  32. data/test/tc_fasta.rb +81 -3
  33. data/test/tc_fasta_shaker.rb +147 -0
  34. data/test/tc_gi.rb +20 -0
  35. data/test/tc_id_class_anal.rb +9 -12
  36. data/test/tc_id_precision.rb +12 -11
  37. data/test/{tc_false_positive_rate.rb → tc_precision.rb} +13 -22
  38. data/test/tc_protein_summary.rb +31 -22
  39. data/test/tc_roc.rb +95 -50
  40. data/test/tc_sequest.rb +212 -145
  41. data/test/tc_spec.rb +10 -5
  42. data/test/tc_spec_id.rb +0 -2
  43. data/test/tc_spec_id_xml.rb +36 -0
  44. data/test/tc_srf.rb +216 -0
  45. metadata +35 -21
  46. data/lib/spec_id/false_positive_rate.rb +0 -476
  47. data/test/tc_gi2annot.rb +0 -12
data/Rakefile CHANGED
@@ -16,7 +16,7 @@ NAME = "mspire"
16
16
  lib_files = FL["lib/**/*"]
17
17
  test_dir_too = FL["test/**/*"]
18
18
 
19
- little_dist_files = lib_files + FL["INSTALL", "README", "Rakefile", "LICENSE", "{bin,script,tutorial}/**/*"]
19
+ little_dist_files = lib_files + FL["INSTALL", "README", "Rakefile", "LICENSE", "changelog.txt", "release_notes.txt", "{bin,script,tutorial}/**/*"]
20
20
  dist_files = lib_files + FL["INSTALL", "README", "Rakefile", "LICENSE", "{bin,script,tutorial}/**/*", test_dir_too]
21
21
 
22
22
  dist_files = little_dist_files # comment out to include test files
@@ -107,12 +107,15 @@ end
107
107
  # PACKAGE / INSTALL / UNINSTALL
108
108
  ###############################################
109
109
 
110
+ ## To release a package on rubyforge:
111
+ ## Log in to rubyforge and go to the 'Files' tab
112
+ ## then "To create a new release click here"
110
113
 
111
114
  tm = Time.now
112
115
  spec = Gem::Specification.new do |s|
113
116
  s.platform = Gem::Platform::RUBY
114
117
  s.name = NAME
115
- s.version = "0.1.5"
118
+ s.version = "0.1.7"
116
119
  s.summary = "Mass Spectrometry Proteomics Objects, Scripts, and Executables"
117
120
  s.date = "#{tm.year}-#{tm.month}-#{tm.day}"
118
121
  s.email = "jprince@icmb.utexas.edu"
@@ -7,8 +7,10 @@ DEFAULT_DATABASE_PATH = "/project/marcotte/marcotte/ms/database"
7
7
  DEFAULT_MZXML_PATH = "."
8
8
  DEFAULT_OUTDIR = "pepxml"
9
9
  DEFAULT_PARAMS_GLOB = "*.params"
10
+ DEFAULT_PARAMS_FILE = Dir[DEFAULT_PARAMS_GLOB].first
10
11
  DEFAULT_PEPXML_VERSION = 18
11
12
  DEFAULT_MS_MODEL = 'LCQ'
13
+ DEFAULT_MASS_ANALYZER = 'Ion Trap'
12
14
  ##############################################################
13
15
 
14
16
  require 'spec_id'
@@ -26,78 +28,120 @@ else
26
28
  end
27
29
 
28
30
  opt = OpenStruct.new
29
- opt.mspath = DEFAULT_MZXML_PATH
30
- opt.outdir = DEFAULT_OUTDIR
31
- opt.params = Dir[DEFAULT_PARAMS_GLOB].first
32
- opt.pepxml_version = DEFAULT_PEPXML_VERSION
33
- opt.model = DEFAULT_MS_MODEL
34
31
 
35
32
  opt_obj = OptionParser.new do |op|
36
- op.banner = "\nusage: #{File.basename(__FILE__)} [options] bioworks.xml"
33
+ op.banner = "\nusage: #{File.basename(__FILE__)} [options] <file>.srf ...
34
+ usage: #{File.basename(__FILE__)} [options] bioworks.xml"
37
35
  op.on_head "
38
- Takes the xml exported output of Bioworks multi-consensus view (no filtering)
39
- and outputs pepXML files (which can be fed into the trans-proteomic pipeline).
36
+ Takes .srf files or the xml exported output of Bioworks multi-consensus view
37
+ (no filtering) and outputs pepXML files (to feed the trans-proteomic pipeline).
40
38
 
41
39
  Options:"
42
- op.on('-p', '--params file', "sequest params file d: '#{opt.params}'") {|v| opt.params = v }
43
- op.on('-d', '--dbpath path', "path to databases d: '#{def_dbpath}'") {|v| opt.dbpath = v }
44
- op.on('-m', '--mspath path', "path to MS files d: '#{opt.mspath}'") {|v| opt.mspath = v }
45
- op.on('-o', '--outdir path', "output directory d: '#{opt.outdir}'") {|v| opt.outdir = v }
46
- op.on('--model <LCQ|Orbi>', "MS model d: '#{opt.model}'") {|v| opt.model = v }
47
- op.on('-v', '--version pepxml_version', "pepxml version d: '#{opt.pepxml_version}'") {|v| opt.pepxml_version = v.to_i }
48
-
49
-
50
- op.on_tail "
40
+ op.on('-h', '--help', "display this and more notes and exit") {|v| opt.help = v }
41
+ op.on('-o', '--outdir path', "output directory d: '#{DEFAULT_OUTDIR}'") {|v| opt.outdir = v }
42
+
43
+ op.separator ""
44
+ op.separator "bioworks.xml files may require additional options:"
45
+ op.separator ""
46
+ op.on('-p', '--params file', "sequest params file d: '#{DEFAULT_PARAMS_FILE}'") {|v| opt.params = v }
47
+ op.on('-d', '--dbpath path', "path to databases d: '#{DEFAULT_DATABASE_PATH}'") {|v| opt.dbpath = v }
48
+ op.on('-m', '--mspath path', "path to MS files d: '#{DEFAULT_MZXML_PATH}'") {|v| opt.mspath = v }
49
+ op.on('--model <LCQ|Orbi|string>', "MS model d: '#{DEFAULT_MS_MODEL}'") {|v| opt.model = v }
50
+ op.on('--mass_analyzer <string>', "Mass Analyzer d: '#{DEFAULT_MASS_ANALYZER}'") {|v| opt.mass_analyzer = v }
51
+ op.on('-v', '--version pepxml_version', "pepxml version d: '#{DEFAULT_PEPXML_VERSION}'") {|v| opt.pepxml_version = v.to_i }
52
+
53
+ end
54
+
55
+ more_notes = "
51
56
  Notes:
57
+
52
58
  mspath: Directory to RAW or mzXML (version 1) files.
53
59
  This option is not used with Bioworks 3.3 files.
54
60
  outdir: Path will be created if it does not already exist.
55
61
  model : LCQ -> 'LCQ Deca XP Plus'
56
62
  : Orbi -> 'LTQ Orbitrap'
63
+ : other string -> That's the string that will be used.
57
64
 
65
+ options with spaces should be quoted: e.g., \"Time of Flight\"
58
66
 
59
67
  Database Path:
60
- If dbpath opt is given it will be used as the database path (overriding all).
68
+
61
69
  If the database path in the sequest.params file is valid, that will be used.
62
- If no database_path is given, will try (in order):
70
+ Otherwise, will try (in order):
71
+ 1. --dbpath or -d option
63
72
  1. environmental variable BIOWORKS_DBPATH (currently: '#{db_env_var}')
64
73
  2. constant at top of this script (currently: '#{DEFAULT_DATABASE_PATH}')
65
74
  "
66
- end
75
+
76
+
77
+
67
78
  opt_obj.parse!
68
79
 
80
+ # intercept before argv count
81
+ if opt.help
82
+ puts opt_obj
83
+ puts more_notes
84
+ exit
85
+ end
86
+
69
87
  if ARGV.size < 1
70
88
  puts opt_obj
71
89
  exit
72
90
  end
73
91
 
74
92
 
75
- case opt.model
76
- when "LCQ"
77
- model = 'LCQ Deca XP Plus'
78
- when "Orbi"
79
- model = 'LTQ Orbitrap'
80
- else
81
- abort "Bad MS model argument: #{opt.model}"
93
+
94
+ opt.outdir ||= DEFAULT_OUTDIR
95
+
96
+ ## Create outdir if it does not exist
97
+ if opt.outdir
98
+ FileUtils.mkpath(opt.outdir) unless File.exist? opt.outdir
82
99
  end
83
100
 
84
- ## Ensure params file exists (unless opt given)
85
- params_obj = SpecID::Sequest::Params.new(opt.params)
86
- # Ensure the database exists!
87
- if opt.dbpath
88
- params_obj.database_path = opt.dbpath
101
+ files = ARGV.to_a
102
+
103
+ if files[0] =~ /\.srf/i
104
+ opt.dbpath ||= def_dbpath
105
+ files.each do |file|
106
+ hash = {
107
+ :backup_db_path => opt.dbpath || def_dbpath,
108
+ :out_path => opt.outdir,
109
+ }
110
+ xml_obj = SpecID::Sequest::PepXML.new_from_srf(file, hash)
111
+ xml_obj.to_pepxml(xml_obj.base_name + ".xml")
112
+ end
89
113
  else
114
+ ## Ensure params file exists (unless opt given)
115
+ opt.params ||= DEFAULT_PARAMS_FILE
116
+ params_obj = SpecID::Sequest::Params.new(opt.params)
117
+ # Ensure the database exists!
90
118
  unless File.exist?( params_obj.database )
91
- params_obj.database_path = def_dbpath
119
+ if opt.dbpath
120
+ params_obj.database_path = opt.dbpath
121
+ else
122
+ params_obj.database_path = def_dbpath
123
+ end
92
124
  end
93
- end
94
125
 
95
- ## Create dbpath if does not exist
96
- FileUtils.mkpath(opt.outdir) unless File.exist? opt.outdir
126
+ opt.mspath ||= DEFAULT_MZXML_PATH
127
+ opt.pepxml_version ||= DEFAULT_PEPXML_VERSION
128
+ opt.model ||= DEFAULT_MS_MODEL
129
+ opt.mass_analyzer ||= DEFAULT_MASS_ANALYZER
130
+
131
+ case opt.model
132
+ when "LCQ"
133
+ model = 'LCQ Deca XP Plus'
134
+ when "Orbi"
135
+ model = 'LTQ Orbitrap'
136
+ else
137
+ model = opt.model
138
+ end
97
139
 
98
- bioworks = ARGV[0]
99
- xml_objs = SpecID::Sequest::PepXML.set_from_bioworks(params_obj, bioworks, opt.mspath, opt.outdir, opt.pepxml_version, 'trypsin', 'ThermoFinnigan', model)
100
140
 
101
- xml_objs.each do |obj|
102
- obj.to_pepxml(obj.base_name + ".xml")
141
+ bioworks = files[0]
142
+ xml_objs = SpecID::Sequest::PepXML.set_from_bioworks(params_obj, bioworks, opt.mspath, opt.outdir, opt.pepxml_version, 'trypsin', 'ThermoFinnigan', model)
143
+
144
+ xml_objs.each do |obj|
145
+ obj.to_pepxml(obj.base_name + ".xml")
146
+ end
103
147
  end
@@ -0,0 +1,100 @@
1
+ #!/usr/bin/ruby
2
+
3
+ # This is my second attempt at writing a simple interface for messing with
4
+ # fasta files. Achieving simplicity (and power) is challenging. It usually
5
+ # only happens on the second (or sometimes more) try. Of course, in
6
+ # retrospect the simple solution seems sooo obvious. But it's deceptive.
7
+ # It takes work to achieve simplicity for complex tasks. That's my thought
8
+ # for the day.
9
+
10
+ # fasta_shaker as in a salt shaker. Shake up your fasta proteins and let them
11
+ # season your dinner (hopefully a protein dinner). Mmmm. Don't they taste
12
+ # good all mixed up? If you want, you can think of it as a pepper shaker.
13
+ # I don't usually comment on my scripts (in my script, anyway), but this one
14
+ # came out so nice and clean that I feel like I have room to spare.
15
+
16
+ require 'fasta'
17
+ require 'cmdparse'
18
+
19
+ opt = {}
20
+
21
+ opts = OptionParser.new do |op|
22
+ prog = File.basename(__FILE__)
23
+ op.banner = "usage: #{prog} <method> [OPTIONS] <file>.fasta"
24
+ op.separator " <method> = reverse | shuffle"
25
+ op.on("-c", "--cat", "catenates the output to copy of original") {|v| opt[:cat] = v }
26
+ op.on("-o", "--out <string>", "name of output file (default is descriptive)") {|v| opt[:out] = v }
27
+ op.on("-p", "--prefix <string>", "give a header prefix to modified prots") {|v| opt[:prefix] = v }
28
+ op.on("-f", "--fraction <float>", "creates some fraction of proteins") {|v| opt[:fraction] = v }
29
+ op.separator " [if fraction > 1 then the tag 'f<frac#>_' prefixed to proteins"
30
+ op.separator " (after any given prefix) so that proteins are unique]"
31
+ op.on("--tryptic_peptides", "applies method to [KR][^P] peptides") {|v| opt[:tryptic_peptides] = v }
32
+
33
+ op.separator "EXAMPLES: "
34
+ op.separator " #{prog} reverse file.fasta -o protein_aa_sequence_reversed.fasta"
35
+ op.separator " #{prog} shuffle file.fasta -o protein_aa_sequence_shuffled.fasta"
36
+ op.separator " #{prog} shuffle file.fasta -c -p SH_ -o normal_cat_shuffled_with_prefix.fasta"
37
+ op.separator " #{prog} reverse file.fasta --tryptic_peptides tryptic_peptides_reversed.fasta"
38
+ end
39
+
40
+ opts.parse!
41
+
42
+ if ARGV.size < 2
43
+ puts opts
44
+ exit
45
+ end
46
+
47
+ (method, file) = ARGV
48
+
49
+ if opt[:cat] && !opt[:prefix]
50
+ puts "WARNING: concatenated proteins don't have unique headers"
51
+ puts "[you probably wanted to use the '--prefix' option!]"
52
+ end
53
+
54
+ # OUT filename:
55
+ unless opt[:out]
56
+ filebase = file.sub(/\..*$/,'')
57
+ parts = [filebase]
58
+ parts << 'cat' if opt[:cat]
59
+ parts << method
60
+ parts << 'prefix' << opt[:prefix] if opt[:prefix]
61
+ parts << 'fraction' << opt[:fraction] if opt[:fraction]
62
+ parts << 'tryptic_peptides' if opt[:tryptic_peptides]
63
+ opt[:out] = parts.join("_") << ".fasta"
64
+ end
65
+
66
+ ## READ the file
67
+ fasta = Fasta.new.read_file(file)
68
+
69
+ ## CAT (save an original copy)
70
+ fasta_orig = fasta.dup if opt[:cat]
71
+
72
+ ## FRACTION the proteins
73
+ if f = opt[:fraction]
74
+ prefix = nil
75
+ f = f.to_f
76
+ if f > 1.0
77
+ prefix = proc {|cnt| "f#{cnt}_" }
78
+ end
79
+ fasta = fasta.fraction_of_prots(f, prefix)
80
+ end
81
+
82
+ ## PREFIX the proteins
83
+ if pre = opt[:prefix]
84
+ fasta.header_prefix!(pre)
85
+ end
86
+
87
+ ## MODIFY the proteins
88
+ fasta.aaseq!((method + '!').to_sym, opt[:tryptic_peptides])
89
+
90
+ ## CAT (finish it up)
91
+ if opt[:cat]
92
+ fasta_orig << fasta
93
+ fasta = fasta_orig
94
+ end
95
+
96
+ ## WRITE out the file
97
+ fasta.write_file(opt[:out])
98
+
99
+
100
+
@@ -1,10 +1,15 @@
1
1
  #!/usr/bin/ruby -w
2
2
 
3
3
  require 'spec_id'
4
- require 'hash_by'
5
4
  require 'optparse'
6
5
  require 'ostruct'
6
+ require 'spec_id/aa_freqs'
7
7
 
8
+ ########################################################
9
+ WRITE_MARSHAL = true
10
+ TABULATE_DATA = true
11
+ WRITE_CYS_FIND = false
12
+ ########################################################
8
13
 
9
14
  opt = OpenStruct.new
10
15
  opt.x1 = 1.0
@@ -14,14 +19,19 @@ opt.c = 0.5
14
19
  opt.rppm = 1000.0
15
20
  opt.false = false
16
21
 
22
+ # prints shortened number for display
23
+ def short(num)
24
+ sprintf( "%.3f",num)
25
+ end
26
+
17
27
  opts = OptionParser.new do |op|
18
- op.banner = "usage: #{File.basename(__FILE__)} [OPTS] bioworks.xml [decoy.xml]"
19
- op.separator("prints number of proteins (and FPR if decoy.xml)")
28
+ op.banner = "usage: #{File.basename(__FILE__)} [OPTS] bioworks.xml"
29
+ op.separator("prints number of proteins (and FPR if -f option)")
20
30
  op.separator ""
21
31
 
22
32
  op.separator("** only takes the top hit per scan+charge")
23
- op.separator("** Excludes all deltacn's over 1.0")
24
- op.separator(" (in BioworksBrowser worst hits often given deltacn of 1.1)")
33
+ op.separator("** 'dcn*' is the number of peptides with deltacn == 1.1")
34
+ op.separator(" (these are peptides who are the only hit with xcorr > 0)")
25
35
  op.separator ""
26
36
  op.on("-1", "--xcorr1 <f>", "xcorr for +1 charge d: #{opt.x1}") {|v| opt.x1 = v.to_f}
27
37
  op.on("-2", "--xcorr2 <f>", "xcorr for +2 charge d: #{opt.x2}") {|v| opt.x2 = v.to_f}
@@ -30,40 +40,151 @@ opts = OptionParser.new do |op|
30
40
  op.on("--rppm <f>", "<= rough ppm (10^6*deltamass/mass) d: #{opt.rppm}") {|v| opt.rppm = v.to_f}
31
41
  op.on("-i", "--interactive", "interactive filtering") {|v| opt.i = v}
32
42
  op.on("-f", "--false <s>", "protein prefix or file name of decoys") {|v| opt.false = v}
43
+ op.on("-y", "--cysteines <fasta_file|freq>", "report fpr by expected cysteine freq") {|v| opt.cysteines = v}
44
+ op.on("--cback <mean,stdev>", "the cysteine background") {|v| opt.cback = v}
45
+ op.on("--from_file <file>", "(no -i) file with list of interactive input") {|v| opt.from_file = v}
46
+ op.on("-t", "--true_pos <fasta>", "fasta file containing true hits") {|v| opt.true_pos = v }
33
47
  end
34
48
 
49
+ $cys_mean = nil
50
+ $cys_stdev = nil
51
+
35
52
 
36
53
  # fpr is a SpecID obj that is the false positives
37
- def filter_round(files, spec_ids, kind, args, fpr=nil, interactive=false)
54
+ # cysteines holds an AAFreqs object, a Float cysteine frequency, or nil
55
+ def filter_round(files, spec_ids, kind, args, fpr=nil, cysteines=nil, interactive=false)
38
56
  (x1, x2, x3, deltacn, rppm) = args
57
+ combined_score = x1 + x2 + x3 + 20.0*deltacn + 4000.0*(1.0/rppm)
39
58
  puts "=========================================================================="
40
59
  puts "[[ xcorr(1,2,3) >= #{x1},#{x2},#{x3} ; deltacn >= #{deltacn} ; rough_ppm <= #{rppm} ]]"
41
60
  # push fpr on the end for the calculations
42
61
  if fpr ; spec_ids.push(fpr) ; end
43
- arr_of_prots_and_peps = spec_ids.map do |spec_id|
44
- prots_and_peps = spec_id.filter(kind, *args)
62
+ arr_of_prots_and_peps_and_deltacnstars_and_cfpr = spec_ids.map do |spec_id|
63
+ (prots, peps, deltacnstar_cnt) = spec_id.filter(kind, *args)
64
+ if cysteines
65
+
66
+ if cysteines.is_a? Float
67
+ freq = cysteines
68
+ else
69
+ freq = cysteines.aafreqs[:C]
70
+ end
71
+ (ac, exp) = SpecID::AAFreqs.new.actual_and_expected_number_containing_cysteines(peps, freq)
72
+ [prots, peps, deltacnstar_cnt, [ac,exp]]
73
+ else
74
+ [prots, peps, deltacnstar_cnt]
75
+ end
45
76
  end
46
- arr_of_num_of_prots = arr_of_prots_and_peps.map {|ar| ar[0].size }
47
- arr_of_num_of_peps = arr_of_prots_and_peps.map {|ar| ar[1].size }
77
+ arr_of_num_of_prots = arr_of_prots_and_peps_and_deltacnstars_and_cfpr.map {|ar| ar[0].size }
78
+ arr_of_num_of_peps = arr_of_prots_and_peps_and_deltacnstars_and_cfpr.map {|ar| ar[1].size }
79
+ deltacnstars = arr_of_prots_and_peps_and_deltacnstars_and_cfpr.map {|ar| ar[2] }
80
+ cys_reports = arr_of_prots_and_peps_and_deltacnstars_and_cfpr.map {|ar| ar[3] } if cysteines
48
81
  prot_nums = arr_of_num_of_prots
49
82
  pep_nums = arr_of_num_of_peps
83
+ ## files = [file1, file2, file3]
84
+ ## prot_nums = [nums1, nums2, nums3, nums_for_false_positives]
85
+ ## pep_nums = [nums1, nums2, nums3, nums_for_false_positives]
50
86
  files.each_with_index do |file,i|
87
+
51
88
  if !interactive
52
89
  puts "#{file} [prots]:\t#{prot_nums[i]}"
53
- puts "#{file} [peps]:\t#{pep_nums[i]}"
90
+ puts "#{file} [peps]:\t#{pep_nums[i]} (dcn*=#{deltacnstars[i]})"
54
91
  else
55
92
  puts "file#{i+1} [prots]: #{prot_nums[i]}"
56
- puts "file#{i+1} [peps]: #{pep_nums[i]}"
93
+ puts "file#{i+1} [peps]: #{pep_nums[i]} (dcn*=#{deltacnstars[i]})"
57
94
  end
58
95
  if fpr
59
- puts "FPR [prots] : " + sprintf( "%.3f", 100.0*(prot_nums[-1].to_f/prot_nums[0].to_f) ) + " % (#{prot_nums[-1]})"
60
- puts "FPR [peps] : " + sprintf( "%.3f", 100.0*(pep_nums[-1].to_f/pep_nums[0].to_f) ) + " % (#{pep_nums[-1]})"
96
+ #puts "FPR [prots] : " + short( 100.0*(prot_nums[-1].to_f/prot_nums[i].to_f) ) + " % (#{prot_nums[-1]})"
97
+ #puts "FPR [peps] : " + short( 100.0*(pep_nums[-1].to_f/pep_nums[i].to_f) ) + " % (#{pep_nums[-1]}) (dcn*=#{deltacnstars[-1]})"
98
+
99
+ ## For separate searches: every false positive = one less TP
100
+ ## For concatenated searches: every false positive is one less TP
101
+ ## THAT's what I've been doing already !
102
+
103
+ prot_tps = prot_nums[i] - prot_nums[-1]
104
+ pep_tps = pep_nums[i] - pep_nums[-1]
105
+ prot_fps = prot_nums[i] - prot_tps
106
+ pep_fps = pep_nums[i] - pep_tps
107
+ prot_fpr = prot_fps.to_f/prot_nums[i].to_f
108
+ pep_fpr = pep_fps.to_f/pep_nums[i].to_f
109
+ # those are the same!
110
+ puts "FPR [prots] : " + short( 100.0*prot_fpr ) + " % (#{prot_fps})"
111
+ puts "FPR [peps] : " + short( 100.0*pep_fpr ) + " % (#{pep_fps}) (dcn*=#{deltacnstars[-1]})"
112
+ end
113
+ if cysteines
114
+ (ac, exp) = cys_reports[i]
115
+
116
+ (cys_fprate, total_num_false) = fpr_by_cysteines(ac, exp, pep_nums[i], $cys_mean, $cys_stdev)
117
+ fraction_of_expected = ac.to_f/exp
118
+ cys_tps = pep_nums[i] - total_num_false
119
+
120
+ puts "CYSTEINE FPR: "
121
+ puts " (# peps containing >= 1 cysteines)"
122
+ puts " actual: #{ac}"
123
+ puts "fraction of expected: #{short(fraction_of_expected)}"
124
+ puts " expected # FP's: " + short(total_num_false)
125
+ puts " estimated FPR: " + short( 100.0*cys_fprate ) + " % "
126
+
127
+ puts "combined_score = x1 + x2 + x3 + 20.0*deltacn + 4000.0*(1.0/rppm)"
128
+ puts "Combined Score & FPR"
129
+ puts "#{combined_score}\t#{cys_fprate}"
130
+ puts "Combined Score & fraction of expected"
131
+ #puts "#{combined_score} #{fraction_of_expected}"
132
+ to_write_cys_find = ["WRITE_CYS_FIND:", combined_score, fraction_of_expected]
133
+ puts to_write_cys_find.join("\t") if WRITE_CYS_FIND
134
+ to_tab = ['TABULATE:', combined_score, pep_tps, pep_fpr, cys_tps, cys_fprate, '', x1, x2, x3, deltacn, rppm]
135
+ puts to_tab.join("\t") if TABULATE_DATA
136
+ end
137
+ if $true_pos_aaseqs
138
+ peps = arr_of_prots_and_peps_and_deltacnstars_and_cfpr[i][1]
139
+ real_tps = 0
140
+ real_fps = 0
141
+ # could also do with partition
142
+ peps.each do |pep|
143
+ if pep.sequence =~ /\.([\w\*]+)\.?/
144
+ if $true_pos_aaseqs.any? {|aaseq| aaseq.include? $1}
145
+ real_tps += 1
146
+ else
147
+ real_fps += 1
148
+ end
149
+ else
150
+ abort "Couldn't Match: #{pep.sequence}"
151
+ end
152
+ end
153
+ if peps.size > 0
154
+ real_fpr = real_fps.to_f/peps.size
155
+ else
156
+ real_fpr = 0.0
157
+ end
158
+ puts "REAL FPR: #{real_fpr}"
159
+ puts "REAL #TP: #{real_tps}"
160
+ to_tab = ['TABULATE:', combined_score, pep_tps, pep_fpr, real_tps, real_fpr, '', x1, x2, x3, deltacn, rppm]
161
+ puts to_tab.join("\t") if TABULATE_DATA
162
+
61
163
  end
62
164
  end
63
165
  #puts files.join(' | ')
64
166
  #puts nums.join(' | ')
65
167
  end
66
168
 
169
+
170
+ # (actual # with cys, expected # with cys, total#peptides,
171
+ # mean_fraction_of_cysteines_true, std)
172
+ # PepHit(C) = Peptide containing cysteine
173
+ # # Total PepHit(C) # Observed Bad Pep (C)
174
+ # ------------------ proportional_to ----------------------
175
+ # # Total PepHit # Total Bad PepHit (X)
176
+ def fpr_by_cysteines(ac_num_with_cys, exp_num_with_cys, total_peptides, mean_fraction_true_cys=nil, std_fraction_true_cys=nil)
177
+
178
+ # the number of bona fide BAD cysteine hits
179
+ # (some of the cysteine hits (~5%) are true positives)
180
+
181
+ ac_num_with_cys -= exp_num_with_cys * mean_fraction_true_cys if mean_fraction_true_cys
182
+ if ac_num_with_cys < 0.0 ; ac_num_with_cys = 0.0 end
183
+ total_number_false = (ac_num_with_cys * total_peptides).to_f/exp_num_with_cys
184
+ fpr = total_number_false / total_peptides
185
+ [fpr, total_number_false]
186
+ end
187
+
67
188
  # assumes it's already chomped
68
189
  # updates the 5 globals
69
190
  def prep_reply(reply, base)
@@ -108,8 +229,22 @@ def prep_reply(reply, base)
108
229
  end
109
230
 
110
231
  def file_to_prefiltered_spec_id(file)
111
- spec_id = SpecID.new(file)
112
- spec_id.top_peps_prefilter!
232
+ spec_id = nil
233
+ marshal_file = file + ".prefiltered.msh"
234
+ if File.exist?(marshal_file)
235
+ File.open(marshal_file) do |fh|
236
+ spec_id = Marshal.load(fh)
237
+ end
238
+ else
239
+ spec_id = SpecID.new(file)
240
+ spec_id.top_peps_prefilter!
241
+ ## marshal it!
242
+ if WRITE_MARSHAL
243
+ File.open(marshal_file, "w") do |fh|
244
+ Marshal.dump(spec_id,fh)
245
+ end
246
+ end
247
+ end
113
248
  spec_id
114
249
  end
115
250
 
@@ -123,7 +258,6 @@ def interactive_help
123
258
  puts "'q' to quit"
124
259
  end
125
260
 
126
-
127
261
  opts.parse!
128
262
 
129
263
  if ARGV.size < 1
@@ -140,6 +274,28 @@ arr_of_spec_ids = files.map do |file|
140
274
  end
141
275
 
142
276
  fpr = nil
277
+ cysteines = nil
278
+ if opt.cysteines
279
+ puts %w(TABULATE combined_score pep_tps pep_fprate cys_pep_tps cys_pep_fprate [nil] x1 x2 x3 dcn rppm).join("\t")
280
+ if File.exist? opt.cysteines
281
+ cysteines = SpecID::AAFreqs.new(opt.cysteines)
282
+ else
283
+ cysteines = opt.cysteines.to_f
284
+ end
285
+ if opt.cback
286
+ ($cys_mean, $cys_stdev) = opt.cback.split(',').map{|v| v.to_f }
287
+ end
288
+ end
289
+
290
+ $true_pos_aaseqs = nil
291
+ if opt.true_pos
292
+ puts %w(TABULATE combined_score pep_tps pep_fprate real_tps real_fpr [nil] x1 x2 x3 dcn rppm).join("\t")
293
+ fasta = Fasta.new.read_file(opt.true_pos)
294
+ $true_pos_aaseqs = fasta.prots.map do |prot|
295
+ prot.aaseq.chomp
296
+ end
297
+ end
298
+
143
299
  if opt.false
144
300
  # it's a file if it exists
145
301
  if File.exist? opt.false
@@ -163,12 +319,18 @@ end
163
319
 
164
320
 
165
321
 
166
-
167
-
168
322
  base_args = [opt.x1, opt.x2, opt.x3, opt.c, opt.rppm]
169
323
 
170
-
171
- if opt.i
324
+ if opt.from_file
325
+ lines = IO.readlines(opt.from_file)
326
+ lines.each do |line|
327
+ line.chomp!
328
+ answer = prep_reply(line, base_args)
329
+ next if answer == false
330
+ base_args = answer
331
+ filter_round(files, arr_of_spec_ids, :common, base_args, fpr, cysteines, true)
332
+ end
333
+ elsif opt.i
172
334
  interactive_help
173
335
  puts "*******************************************************"
174
336
  puts "Number of proteins in files (this order):"
@@ -187,13 +349,13 @@ if opt.i
187
349
  interactive_help
188
350
  else
189
351
  base_args = answer
190
- filter_round(files, arr_of_spec_ids, :common, base_args, fpr, true)
352
+ filter_round(files, arr_of_spec_ids, :common, base_args, fpr, cysteines, true)
191
353
  break
192
354
  end
193
355
  end
194
356
  end
195
357
  else
196
- filter_round(files, arr_of_spec_ids, :common, base_args, fpr, false)
358
+ filter_round(files, arr_of_spec_ids, :common, base_args, fpr, cysteines, false)
197
359
  end
198
360
 
199
361