mspire 0.1.5 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. data/Rakefile +5 -2
  2. data/bin/bioworks_to_pepxml.rb +84 -40
  3. data/bin/fasta_shaker.rb +100 -0
  4. data/bin/filter_spec_id.rb +185 -23
  5. data/bin/gi2annot.rb +2 -110
  6. data/bin/id_class_anal.rb +31 -21
  7. data/bin/id_precision.rb +12 -8
  8. data/bin/{false_positive_rate.rb → precision.rb} +1 -1
  9. data/bin/protein_summary.rb +55 -62
  10. data/changelog.txt +34 -0
  11. data/lib/align.rb +0 -1
  12. data/lib/fasta.rb +88 -24
  13. data/lib/gi.rb +114 -0
  14. data/lib/roc.rb +64 -58
  15. data/lib/spec_id/aa_freqs.rb +166 -0
  16. data/lib/spec_id/bioworks.rb +5 -1
  17. data/lib/spec_id/precision.rb +427 -0
  18. data/lib/spec_id/proph.rb +2 -2
  19. data/lib/spec_id/sequest.rb +810 -113
  20. data/lib/spec_id/srf.rb +486 -0
  21. data/lib/spec_id.rb +107 -23
  22. data/release_notes.txt +11 -0
  23. data/script/estimate_fpr_by_cysteine.rb +226 -0
  24. data/script/filter-peps.rb +3 -3
  25. data/script/find_cysteine_background.rb +137 -0
  26. data/script/gen_database_searching.rb +11 -7
  27. data/script/genuine_tps_and_probs.rb +136 -0
  28. data/script/top_hit_per_scan.rb +5 -2
  29. data/test/tc_aa_freqs.rb +59 -0
  30. data/test/tc_bioworks.rb +6 -1
  31. data/test/tc_bioworks_to_pepxml.rb +25 -18
  32. data/test/tc_fasta.rb +81 -3
  33. data/test/tc_fasta_shaker.rb +147 -0
  34. data/test/tc_gi.rb +20 -0
  35. data/test/tc_id_class_anal.rb +9 -12
  36. data/test/tc_id_precision.rb +12 -11
  37. data/test/{tc_false_positive_rate.rb → tc_precision.rb} +13 -22
  38. data/test/tc_protein_summary.rb +31 -22
  39. data/test/tc_roc.rb +95 -50
  40. data/test/tc_sequest.rb +212 -145
  41. data/test/tc_spec.rb +10 -5
  42. data/test/tc_spec_id.rb +0 -2
  43. data/test/tc_spec_id_xml.rb +36 -0
  44. data/test/tc_srf.rb +216 -0
  45. metadata +35 -21
  46. data/lib/spec_id/false_positive_rate.rb +0 -476
  47. data/test/tc_gi2annot.rb +0 -12
data/Rakefile CHANGED
@@ -16,7 +16,7 @@ NAME = "mspire"
16
16
  lib_files = FL["lib/**/*"]
17
17
  test_dir_too = FL["test/**/*"]
18
18
 
19
- little_dist_files = lib_files + FL["INSTALL", "README", "Rakefile", "LICENSE", "{bin,script,tutorial}/**/*"]
19
+ little_dist_files = lib_files + FL["INSTALL", "README", "Rakefile", "LICENSE", "changelog.txt", "release_notes.txt", "{bin,script,tutorial}/**/*"]
20
20
  dist_files = lib_files + FL["INSTALL", "README", "Rakefile", "LICENSE", "{bin,script,tutorial}/**/*", test_dir_too]
21
21
 
22
22
  dist_files = little_dist_files # comment out to include test files
@@ -107,12 +107,15 @@ end
107
107
  # PACKAGE / INSTALL / UNINSTALL
108
108
  ###############################################
109
109
 
110
+ ## To release a package on rubyforge:
111
+ ## Log in to rubyforge and go to the 'Files' tab
112
+ ## then "To create a new release click here"
110
113
 
111
114
  tm = Time.now
112
115
  spec = Gem::Specification.new do |s|
113
116
  s.platform = Gem::Platform::RUBY
114
117
  s.name = NAME
115
- s.version = "0.1.5"
118
+ s.version = "0.1.7"
116
119
  s.summary = "Mass Spectrometry Proteomics Objects, Scripts, and Executables"
117
120
  s.date = "#{tm.year}-#{tm.month}-#{tm.day}"
118
121
  s.email = "jprince@icmb.utexas.edu"
@@ -7,8 +7,10 @@ DEFAULT_DATABASE_PATH = "/project/marcotte/marcotte/ms/database"
7
7
  DEFAULT_MZXML_PATH = "."
8
8
  DEFAULT_OUTDIR = "pepxml"
9
9
  DEFAULT_PARAMS_GLOB = "*.params"
10
+ DEFAULT_PARAMS_FILE = Dir[DEFAULT_PARAMS_GLOB].first
10
11
  DEFAULT_PEPXML_VERSION = 18
11
12
  DEFAULT_MS_MODEL = 'LCQ'
13
+ DEFAULT_MASS_ANALYZER = 'Ion Trap'
12
14
  ##############################################################
13
15
 
14
16
  require 'spec_id'
@@ -26,78 +28,120 @@ else
26
28
  end
27
29
 
28
30
  opt = OpenStruct.new
29
- opt.mspath = DEFAULT_MZXML_PATH
30
- opt.outdir = DEFAULT_OUTDIR
31
- opt.params = Dir[DEFAULT_PARAMS_GLOB].first
32
- opt.pepxml_version = DEFAULT_PEPXML_VERSION
33
- opt.model = DEFAULT_MS_MODEL
34
31
 
35
32
  opt_obj = OptionParser.new do |op|
36
- op.banner = "\nusage: #{File.basename(__FILE__)} [options] bioworks.xml"
33
+ op.banner = "\nusage: #{File.basename(__FILE__)} [options] <file>.srf ...
34
+ usage: #{File.basename(__FILE__)} [options] bioworks.xml"
37
35
  op.on_head "
38
- Takes the xml exported output of Bioworks multi-consensus view (no filtering)
39
- and outputs pepXML files (which can be fed into the trans-proteomic pipeline).
36
+ Takes .srf files or the xml exported output of Bioworks multi-consensus view
37
+ (no filtering) and outputs pepXML files (to feed the trans-proteomic pipeline).
40
38
 
41
39
  Options:"
42
- op.on('-p', '--params file', "sequest params file d: '#{opt.params}'") {|v| opt.params = v }
43
- op.on('-d', '--dbpath path', "path to databases d: '#{def_dbpath}'") {|v| opt.dbpath = v }
44
- op.on('-m', '--mspath path', "path to MS files d: '#{opt.mspath}'") {|v| opt.mspath = v }
45
- op.on('-o', '--outdir path', "output directory d: '#{opt.outdir}'") {|v| opt.outdir = v }
46
- op.on('--model <LCQ|Orbi>', "MS model d: '#{opt.model}'") {|v| opt.model = v }
47
- op.on('-v', '--version pepxml_version', "pepxml version d: '#{opt.pepxml_version}'") {|v| opt.pepxml_version = v.to_i }
48
-
49
-
50
- op.on_tail "
40
+ op.on('-h', '--help', "display this and more notes and exit") {|v| opt.help = v }
41
+ op.on('-o', '--outdir path', "output directory d: '#{DEFAULT_OUTDIR}'") {|v| opt.outdir = v }
42
+
43
+ op.separator ""
44
+ op.separator "bioworks.xml files may require additional options:"
45
+ op.separator ""
46
+ op.on('-p', '--params file', "sequest params file d: '#{DEFAULT_PARAMS_FILE}'") {|v| opt.params = v }
47
+ op.on('-d', '--dbpath path', "path to databases d: '#{DEFAULT_DATABASE_PATH}'") {|v| opt.dbpath = v }
48
+ op.on('-m', '--mspath path', "path to MS files d: '#{DEFAULT_MZXML_PATH}'") {|v| opt.mspath = v }
49
+ op.on('--model <LCQ|Orbi|string>', "MS model d: '#{DEFAULT_MS_MODEL}'") {|v| opt.model = v }
50
+ op.on('--mass_analyzer <string>', "Mass Analyzer d: '#{DEFAULT_MASS_ANALYZER}'") {|v| opt.mass_analyzer = v }
51
+ op.on('-v', '--version pepxml_version', "pepxml version d: '#{DEFAULT_PEPXML_VERSION}'") {|v| opt.pepxml_version = v.to_i }
52
+
53
+ end
54
+
55
+ more_notes = "
51
56
  Notes:
57
+
52
58
  mspath: Directory to RAW or mzXML (version 1) files.
53
59
  This option is not used with Bioworks 3.3 files.
54
60
  outdir: Path will be created if it does not already exist.
55
61
  model : LCQ -> 'LCQ Deca XP Plus'
56
62
  : Orbi -> 'LTQ Orbitrap'
63
+ : other string -> That's the string that will be used.
57
64
 
65
+ options with spaces should be quoted: e.g., \"Time of Flight\"
58
66
 
59
67
  Database Path:
60
- If dbpath opt is given it will be used as the database path (overriding all).
68
+
61
69
  If the database path in the sequest.params file is valid, that will be used.
62
- If no database_path is given, will try (in order):
70
+ Otherwise, will try (in order):
71
+ 1. --dbpath or -d option
63
72
  1. environmental variable BIOWORKS_DBPATH (currently: '#{db_env_var}')
64
73
  2. constant at top of this script (currently: '#{DEFAULT_DATABASE_PATH}')
65
74
  "
66
- end
75
+
76
+
77
+
67
78
  opt_obj.parse!
68
79
 
80
+ # intercept before argv count
81
+ if opt.help
82
+ puts opt_obj
83
+ puts more_notes
84
+ exit
85
+ end
86
+
69
87
  if ARGV.size < 1
70
88
  puts opt_obj
71
89
  exit
72
90
  end
73
91
 
74
92
 
75
- case opt.model
76
- when "LCQ"
77
- model = 'LCQ Deca XP Plus'
78
- when "Orbi"
79
- model = 'LTQ Orbitrap'
80
- else
81
- abort "Bad MS model argument: #{opt.model}"
93
+
94
+ opt.outdir ||= DEFAULT_OUTDIR
95
+
96
+ ## Create outdir if it does not exist
97
+ if opt.outdir
98
+ FileUtils.mkpath(opt.outdir) unless File.exist? opt.outdir
82
99
  end
83
100
 
84
- ## Ensure params file exists (unless opt given)
85
- params_obj = SpecID::Sequest::Params.new(opt.params)
86
- # Ensure the database exists!
87
- if opt.dbpath
88
- params_obj.database_path = opt.dbpath
101
+ files = ARGV.to_a
102
+
103
+ if files[0] =~ /\.srf/i
104
+ opt.dbpath ||= def_dbpath
105
+ files.each do |file|
106
+ hash = {
107
+ :backup_db_path => opt.dbpath || def_dbpath,
108
+ :out_path => opt.outdir,
109
+ }
110
+ xml_obj = SpecID::Sequest::PepXML.new_from_srf(file, hash)
111
+ xml_obj.to_pepxml(xml_obj.base_name + ".xml")
112
+ end
89
113
  else
114
+ ## Ensure params file exists (unless opt given)
115
+ opt.params ||= DEFAULT_PARAMS_FILE
116
+ params_obj = SpecID::Sequest::Params.new(opt.params)
117
+ # Ensure the database exists!
90
118
  unless File.exist?( params_obj.database )
91
- params_obj.database_path = def_dbpath
119
+ if opt.dbpath
120
+ params_obj.database_path = opt.dbpath
121
+ else
122
+ params_obj.database_path = def_dbpath
123
+ end
92
124
  end
93
- end
94
125
 
95
- ## Create dbpath if does not exist
96
- FileUtils.mkpath(opt.outdir) unless File.exist? opt.outdir
126
+ opt.mspath ||= DEFAULT_MZXML_PATH
127
+ opt.pepxml_version ||= DEFAULT_PEPXML_VERSION
128
+ opt.model ||= DEFAULT_MS_MODEL
129
+ opt.mass_analyzer ||= DEFAULT_MASS_ANALYZER
130
+
131
+ case opt.model
132
+ when "LCQ"
133
+ model = 'LCQ Deca XP Plus'
134
+ when "Orbi"
135
+ model = 'LTQ Orbitrap'
136
+ else
137
+ model = opt.model
138
+ end
97
139
 
98
- bioworks = ARGV[0]
99
- xml_objs = SpecID::Sequest::PepXML.set_from_bioworks(params_obj, bioworks, opt.mspath, opt.outdir, opt.pepxml_version, 'trypsin', 'ThermoFinnigan', model)
100
140
 
101
- xml_objs.each do |obj|
102
- obj.to_pepxml(obj.base_name + ".xml")
141
+ bioworks = files[0]
142
+ xml_objs = SpecID::Sequest::PepXML.set_from_bioworks(params_obj, bioworks, opt.mspath, opt.outdir, opt.pepxml_version, 'trypsin', 'ThermoFinnigan', model)
143
+
144
+ xml_objs.each do |obj|
145
+ obj.to_pepxml(obj.base_name + ".xml")
146
+ end
103
147
  end
@@ -0,0 +1,100 @@
1
+ #!/usr/bin/ruby
2
+
3
+ # This is my second attempt at writing a simple interface for messing with
4
+ # fasta files. Achieving simplicity (and power) is challenging. It usually
5
+ # only happens on the second (or sometimes more) try. Of course, in
6
+ # retrospect the simple solution seems sooo obvious. But it's deceptive.
7
+ # It takes work to achieve simplicity for complex tasks. That's my thought
8
+ # for the day.
9
+
10
+ # fasta_shaker as in a salt shaker. Shake up your fasta proteins and let them
11
+ # season your dinner (hopefully a protein dinner). Mmmm. Don't they taste
12
+ # good all mixed up? If you want, you can think of it as a pepper shaker.
13
+ # I don't usually comment on my scripts (in my script, anyway), but this one
14
+ # came out so nice and clean that I feel like I have room to spare.
15
+
16
+ require 'fasta'
17
+ require 'cmdparse'
18
+
19
+ opt = {}
20
+
21
+ opts = OptionParser.new do |op|
22
+ prog = File.basename(__FILE__)
23
+ op.banner = "usage: #{prog} <method> [OPTIONS] <file>.fasta"
24
+ op.separator " <method> = reverse | shuffle"
25
+ op.on("-c", "--cat", "catenates the output to copy of original") {|v| opt[:cat] = v }
26
+ op.on("-o", "--out <string>", "name of output file (default is descriptive)") {|v| opt[:out] = v }
27
+ op.on("-p", "--prefix <string>", "give a header prefix to modified prots") {|v| opt[:prefix] = v }
28
+ op.on("-f", "--fraction <float>", "creates some fraction of proteins") {|v| opt[:fraction] = v }
29
+ op.separator " [if fraction > 1 then the tag 'f<frac#>_' prefixed to proteins"
30
+ op.separator " (after any given prefix) so that proteins are unique]"
31
+ op.on("--tryptic_peptides", "applies method to [KR][^P] peptides") {|v| opt[:tryptic_peptides] = v }
32
+
33
+ op.separator "EXAMPLES: "
34
+ op.separator " #{prog} reverse file.fasta -o protein_aa_sequence_reversed.fasta"
35
+ op.separator " #{prog} shuffle file.fasta -o protein_aa_sequence_shuffled.fasta"
36
+ op.separator " #{prog} shuffle file.fasta -c -p SH_ -o normal_cat_shuffled_with_prefix.fasta"
37
+ op.separator " #{prog} reverse file.fasta --tryptic_peptides tryptic_peptides_reversed.fasta"
38
+ end
39
+
40
+ opts.parse!
41
+
42
+ if ARGV.size < 2
43
+ puts opts
44
+ exit
45
+ end
46
+
47
+ (method, file) = ARGV
48
+
49
+ if opt[:cat] && !opt[:prefix]
50
+ puts "WARNING: concatenated proteins don't have unique headers"
51
+ puts "[you probably wanted to use the '--prefix' option!]"
52
+ end
53
+
54
+ # OUT filename:
55
+ unless opt[:out]
56
+ filebase = file.sub(/\..*$/,'')
57
+ parts = [filebase]
58
+ parts << 'cat' if opt[:cat]
59
+ parts << method
60
+ parts << 'prefix' << opt[:prefix] if opt[:prefix]
61
+ parts << 'fraction' << opt[:fraction] if opt[:fraction]
62
+ parts << 'tryptic_peptides' if opt[:tryptic_peptides]
63
+ opt[:out] = parts.join("_") << ".fasta"
64
+ end
65
+
66
+ ## READ the file
67
+ fasta = Fasta.new.read_file(file)
68
+
69
+ ## CAT (save an original copy)
70
+ fasta_orig = fasta.dup if opt[:cat]
71
+
72
+ ## FRACTION the proteins
73
+ if f = opt[:fraction]
74
+ prefix = nil
75
+ f = f.to_f
76
+ if f > 1.0
77
+ prefix = proc {|cnt| "f#{cnt}_" }
78
+ end
79
+ fasta = fasta.fraction_of_prots(f, prefix)
80
+ end
81
+
82
+ ## PREFIX the proteins
83
+ if pre = opt[:prefix]
84
+ fasta.header_prefix!(pre)
85
+ end
86
+
87
+ ## MODIFY the proteins
88
+ fasta.aaseq!((method + '!').to_sym, opt[:tryptic_peptides])
89
+
90
+ ## CAT (finish it up)
91
+ if opt[:cat]
92
+ fasta_orig << fasta
93
+ fasta = fasta_orig
94
+ end
95
+
96
+ ## WRITE out the file
97
+ fasta.write_file(opt[:out])
98
+
99
+
100
+
@@ -1,10 +1,15 @@
1
1
  #!/usr/bin/ruby -w
2
2
 
3
3
  require 'spec_id'
4
- require 'hash_by'
5
4
  require 'optparse'
6
5
  require 'ostruct'
6
+ require 'spec_id/aa_freqs'
7
7
 
8
+ ########################################################
9
+ WRITE_MARSHAL = true
10
+ TABULATE_DATA = true
11
+ WRITE_CYS_FIND = false
12
+ ########################################################
8
13
 
9
14
  opt = OpenStruct.new
10
15
  opt.x1 = 1.0
@@ -14,14 +19,19 @@ opt.c = 0.5
14
19
  opt.rppm = 1000.0
15
20
  opt.false = false
16
21
 
22
+ # prints shortened number for display
23
+ def short(num)
24
+ sprintf( "%.3f",num)
25
+ end
26
+
17
27
  opts = OptionParser.new do |op|
18
- op.banner = "usage: #{File.basename(__FILE__)} [OPTS] bioworks.xml [decoy.xml]"
19
- op.separator("prints number of proteins (and FPR if decoy.xml)")
28
+ op.banner = "usage: #{File.basename(__FILE__)} [OPTS] bioworks.xml"
29
+ op.separator("prints number of proteins (and FPR if -f option)")
20
30
  op.separator ""
21
31
 
22
32
  op.separator("** only takes the top hit per scan+charge")
23
- op.separator("** Excludes all deltacn's over 1.0")
24
- op.separator(" (in BioworksBrowser worst hits often given deltacn of 1.1)")
33
+ op.separator("** 'dcn*' is the number of peptides with deltacn == 1.1")
34
+ op.separator(" (these are peptides who are the only hit with xcorr > 0)")
25
35
  op.separator ""
26
36
  op.on("-1", "--xcorr1 <f>", "xcorr for +1 charge d: #{opt.x1}") {|v| opt.x1 = v.to_f}
27
37
  op.on("-2", "--xcorr2 <f>", "xcorr for +2 charge d: #{opt.x2}") {|v| opt.x2 = v.to_f}
@@ -30,40 +40,151 @@ opts = OptionParser.new do |op|
30
40
  op.on("--rppm <f>", "<= rough ppm (10^6*deltamass/mass) d: #{opt.rppm}") {|v| opt.rppm = v.to_f}
31
41
  op.on("-i", "--interactive", "interactive filtering") {|v| opt.i = v}
32
42
  op.on("-f", "--false <s>", "protein prefix or file name of decoys") {|v| opt.false = v}
43
+ op.on("-y", "--cysteines <fasta_file|freq>", "report fpr by expected cysteine freq") {|v| opt.cysteines = v}
44
+ op.on("--cback <mean,stdev>", "the cysteine background") {|v| opt.cback = v}
45
+ op.on("--from_file <file>", "(no -i) file with list of interactive input") {|v| opt.from_file = v}
46
+ op.on("-t", "--true_pos <fasta>", "fasta file containing true hits") {|v| opt.true_pos = v }
33
47
  end
34
48
 
49
+ $cys_mean = nil
50
+ $cys_stdev = nil
51
+
35
52
 
36
53
  # fpr is a SpecID obj that is the false positives
37
- def filter_round(files, spec_ids, kind, args, fpr=nil, interactive=false)
54
+ # cysteines holds an aafreqs object or nil
55
+ def filter_round(files, spec_ids, kind, args, fpr=nil, cysteines=nil, interactive=false)
38
56
  (x1, x2, x3, deltacn, rppm) = args
57
+ combined_score = x1 + x2 + x3 + 20.0*deltacn + 4000.0*(1.0/rppm)
39
58
  puts "=========================================================================="
40
59
  puts "[[ xcorr(1,2,3) >= #{x1},#{x2},#{x3} ; deltacn >= #{deltacn} ; rough_ppm <= #{rppm} ]]"
41
60
  # push fpr on the end for the calculations
42
61
  if fpr ; spec_ids.push(fpr) ; end
43
- arr_of_prots_and_peps = spec_ids.map do |spec_id|
44
- prots_and_peps = spec_id.filter(kind, *args)
62
+ arr_of_prots_and_peps_and_deltacnstars_and_cfpr = spec_ids.map do |spec_id|
63
+ (prots, peps, deltacnstar_cnt) = spec_id.filter(kind, *args)
64
+ if cysteines
65
+
66
+ if cysteines.is_a? Float
67
+ freq = cysteines
68
+ else
69
+ freq = cysteines.aafreqs[:C]
70
+ end
71
+ (ac, exp) = SpecID::AAFreqs.new.actual_and_expected_number_containing_cysteines(peps, freq)
72
+ [prots, peps, deltacnstar_cnt, [ac,exp]]
73
+ else
74
+ [prots, peps, deltacnstar_cnt]
75
+ end
45
76
  end
46
- arr_of_num_of_prots = arr_of_prots_and_peps.map {|ar| ar[0].size }
47
- arr_of_num_of_peps = arr_of_prots_and_peps.map {|ar| ar[1].size }
77
+ arr_of_num_of_prots = arr_of_prots_and_peps_and_deltacnstars_and_cfpr.map {|ar| ar[0].size }
78
+ arr_of_num_of_peps = arr_of_prots_and_peps_and_deltacnstars_and_cfpr.map {|ar| ar[1].size }
79
+ deltacnstars = arr_of_prots_and_peps_and_deltacnstars_and_cfpr.map {|ar| ar[2] }
80
+ cys_reports = arr_of_prots_and_peps_and_deltacnstars_and_cfpr.map {|ar| ar[3] } if cysteines
48
81
  prot_nums = arr_of_num_of_prots
49
82
  pep_nums = arr_of_num_of_peps
83
+ ## files = [file1, file2, file3]
84
+ ## prot_nums = [nums1, nums2, nums3, nums_for_false_positives]
85
+ ## pep_nums = [nums1, nums2, nums3, nums_for_false_positives]
50
86
  files.each_with_index do |file,i|
87
+
51
88
  if !interactive
52
89
  puts "#{file} [prots]:\t#{prot_nums[i]}"
53
- puts "#{file} [peps]:\t#{pep_nums[i]}"
90
+ puts "#{file} [peps]:\t#{pep_nums[i]} (dcn*=#{deltacnstars[i]})"
54
91
  else
55
92
  puts "file#{i+1} [prots]: #{prot_nums[i]}"
56
- puts "file#{i+1} [peps]: #{pep_nums[i]}"
93
+ puts "file#{i+1} [peps]: #{pep_nums[i]} (dcn*=#{deltacnstars[i]})"
57
94
  end
58
95
  if fpr
59
- puts "FPR [prots] : " + sprintf( "%.3f", 100.0*(prot_nums[-1].to_f/prot_nums[0].to_f) ) + " % (#{prot_nums[-1]})"
60
- puts "FPR [peps] : " + sprintf( "%.3f", 100.0*(pep_nums[-1].to_f/pep_nums[0].to_f) ) + " % (#{pep_nums[-1]})"
96
+ #puts "FPR [prots] : " + short( 100.0*(prot_nums[-1].to_f/prot_nums[i].to_f) ) + " % (#{prot_nums[-1]})"
97
+ #puts "FPR [peps] : " + short( 100.0*(pep_nums[-1].to_f/pep_nums[i].to_f) ) + " % (#{pep_nums[-1]}) (dcn*=#{deltacnstars[-1]})"
98
+
99
+ ## For separate searches: every false positive = one less TP
100
+ ## For concatenated searches: every false positive is one less TP
101
+ ## THAT's what I've been doing already !
102
+
103
+ prot_tps = prot_nums[i] - prot_nums[-1]
104
+ pep_tps = pep_nums[i] - pep_nums[-1]
105
+ prot_fps = prot_nums[i] - prot_tps
106
+ pep_fps = pep_nums[i] - pep_tps
107
+ prot_fpr = prot_fps.to_f/prot_nums[i].to_f
108
+ pep_fpr = pep_fps.to_f/pep_nums[i].to_f
109
+ # those are the same!
110
+ puts "FPR [prots] : " + short( 100.0*prot_fpr ) + " % (#{prot_fps})"
111
+ puts "FPR [peps] : " + short( 100.0*pep_fpr ) + " % (#{pep_fps}) (dcn*=#{deltacnstars[-1]})"
112
+ end
113
+ if cysteines
114
+ (ac, exp) = cys_reports[i]
115
+
116
+ (cys_fprate, total_num_false) = fpr_by_cysteines(ac, exp, pep_nums[i], $cys_mean, $cys_stdev)
117
+ fraction_of_expected = ac.to_f/exp
118
+ cys_tps = pep_nums[i] - total_num_false
119
+
120
+ puts "CYSTEINE FPR: "
121
+ puts " (# peps containing >= 1 cysteines)"
122
+ puts " actual: #{ac}"
123
+ puts "fraction of expected: #{short(fraction_of_expected)}"
124
+ puts " expected # FP's: " + short(total_num_false)
125
+ puts " estimated FPR: " + short( 100.0*cys_fprate ) + " % "
126
+
127
+ puts "combined_score = x1 + x2 + x3 + 20.0*deltacn + 4000.0*(1.0/rppm)"
128
+ puts "Combined Score & FPR"
129
+ puts "#{combined_score}\t#{cys_fprate}"
130
+ puts "Combined Score & fraction of expected"
131
+ #puts "#{combined_score} #{fraction_of_expected}"
132
+ to_write_cys_find = ["WRITE_CYS_FIND:", combined_score, fraction_of_expected]
133
+ puts to_write_cys_find.join("\t") if WRITE_CYS_FIND
134
+ to_tab = ['TABULATE:', combined_score, pep_tps, pep_fpr, cys_tps, cys_fprate, '', x1, x2, x3, deltacn, rppm]
135
+ puts to_tab.join("\t") if TABULATE_DATA
136
+ end
137
+ if $true_pos_aaseqs
138
+ peps = arr_of_prots_and_peps_and_deltacnstars_and_cfpr[i][1]
139
+ real_tps = 0
140
+ real_fps = 0
141
+ # could also do with partition
142
+ peps.each do |pep|
143
+ if pep.sequence =~ /\.([\w\*]+)\.?/
144
+ if $true_pos_aaseqs.any? {|aaseq| aaseq.include? $1}
145
+ real_tps += 1
146
+ else
147
+ real_fps += 1
148
+ end
149
+ else
150
+ abort "Couldn't Match: #{pep.sequence}"
151
+ end
152
+ end
153
+ if peps.size > 0
154
+ real_fpr = real_fps.to_f/peps.size
155
+ else
156
+ real_fpr = 0.0
157
+ end
158
+ puts "REAL FPR: #{real_fpr}"
159
+ puts "REAL #TP: #{real_tps}"
160
+ to_tab = ['TABULATE:', combined_score, pep_tps, pep_fpr, real_tps, real_fpr, '', x1, x2, x3, deltacn, rppm]
161
+ puts to_tab.join("\t") if TABULATE_DATA
162
+
61
163
  end
62
164
  end
63
165
  #puts files.join(' | ')
64
166
  #puts nums.join(' | ')
65
167
  end
66
168
 
169
+
170
+ # (actual # with cys, expected # with cys, total#peptides,
171
+ # mean_fraction_of_cysteines_true, std)
172
+ # PepHit(C) = Peptide containing cysteine
173
+ # # Total PepHit(C) # Observed Bad Pep (C)
174
+ # ------------------ proportional_to ----------------------
175
+ # # Total PepHit # Total Bad PepHit (X)
176
+ def fpr_by_cysteines(ac_num_with_cys, exp_num_with_cys, total_peptides, mean_fraction_true_cys=nil, std_fraction_true_cys=nil)
177
+
178
+ # the number of bona fide BAD cysteine hits
179
+ # (some of the cysteine hits (~5%) are true positives)
180
+
181
+ ac_num_with_cys -= exp_num_with_cys * mean_fraction_true_cys if mean_fraction_true_cys
182
+ if ac_num_with_cys < 0.0 ; ac_num_with_cys = 0.0 end
183
+ total_number_false = (ac_num_with_cys * total_peptides).to_f/exp_num_with_cys
184
+ fpr = total_number_false / total_peptides
185
+ [fpr, total_number_false]
186
+ end
187
+
67
188
  # assumes it's already chomped
68
189
  # updates the 5 globals
69
190
  def prep_reply(reply, base)
@@ -108,8 +229,22 @@ def prep_reply(reply, base)
108
229
  end
109
230
 
110
231
  def file_to_prefiltered_spec_id(file)
111
- spec_id = SpecID.new(file)
112
- spec_id.top_peps_prefilter!
232
+ spec_id = nil
233
+ marshal_file = file + ".prefiltered.msh"
234
+ if File.exist?(marshal_file)
235
+ File.open(marshal_file) do |fh|
236
+ spec_id = Marshal.load(fh)
237
+ end
238
+ else
239
+ spec_id = SpecID.new(file)
240
+ spec_id.top_peps_prefilter!
241
+ ## marshal it!
242
+ if WRITE_MARSHAL
243
+ File.open(marshal_file, "w") do |fh|
244
+ Marshal.dump(spec_id,fh)
245
+ end
246
+ end
247
+ end
113
248
  spec_id
114
249
  end
115
250
 
@@ -123,7 +258,6 @@ def interactive_help
123
258
  puts "'q' to quit"
124
259
  end
125
260
 
126
-
127
261
  opts.parse!
128
262
 
129
263
  if ARGV.size < 1
@@ -140,6 +274,28 @@ arr_of_spec_ids = files.map do |file|
140
274
  end
141
275
 
142
276
  fpr = nil
277
+ cysteines = nil
278
+ if opt.cysteines
279
+ puts %w(TABULATE combined_score pep_tps pep_fprate cys_pep_tps cys_pep_fprate [nil] x1 x2 x3 dcn rppm).join("\t")
280
+ if File.exist? opt.cysteines
281
+ cysteines = SpecID::AAFreqs.new(opt.cysteines)
282
+ else
283
+ cysteines = opt.cysteines.to_f
284
+ end
285
+ if opt.cback
286
+ ($cys_mean, $cys_stdev) = opt.cback.split(',').map{|v| v.to_f }
287
+ end
288
+ end
289
+
290
+ $true_pos_aaseqs = nil
291
+ if opt.true_pos
292
+ puts %w(TABULATE combined_score pep_tps pep_fprate real_tps real_fpr [nil] x1 x2 x3 dcn rppm).join("\t")
293
+ fasta = Fasta.new.read_file(opt.true_pos)
294
+ $true_pos_aaseqs = fasta.prots.map do |prot|
295
+ prot.aaseq.chomp
296
+ end
297
+ end
298
+
143
299
  if opt.false
144
300
  # it's a file if it exists
145
301
  if File.exist? opt.false
@@ -163,12 +319,18 @@ end
163
319
 
164
320
 
165
321
 
166
-
167
-
168
322
  base_args = [opt.x1, opt.x2, opt.x3, opt.c, opt.rppm]
169
323
 
170
-
171
- if opt.i
324
+ if opt.from_file
325
+ lines = IO.readlines(opt.from_file)
326
+ lines.each do |line|
327
+ line.chomp!
328
+ answer = prep_reply(line, base_args)
329
+ next if answer == false
330
+ base_args = answer
331
+ filter_round(files, arr_of_spec_ids, :common, base_args, fpr, cysteines, true)
332
+ end
333
+ elsif opt.i
172
334
  interactive_help
173
335
  puts "*******************************************************"
174
336
  puts "Number of proteins in files (this order):"
@@ -187,13 +349,13 @@ if opt.i
187
349
  interactive_help
188
350
  else
189
351
  base_args = answer
190
- filter_round(files, arr_of_spec_ids, :common, base_args, fpr, true)
352
+ filter_round(files, arr_of_spec_ids, :common, base_args, fpr, cysteines, true)
191
353
  break
192
354
  end
193
355
  end
194
356
  end
195
357
  else
196
- filter_round(files, arr_of_spec_ids, :common, base_args, fpr, false)
358
+ filter_round(files, arr_of_spec_ids, :common, base_args, fpr, cysteines, false)
197
359
  end
198
360
 
199
361