mspire 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233) hide show
  1. data/INSTALL +1 -0
  2. data/README +25 -0
  3. data/Rakefile +129 -40
  4. data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
  5. data/bin/bioworks_to_pepxml.rb +1 -0
  6. data/bin/fasta_shaker.rb +1 -96
  7. data/bin/filter_and_validate.rb +5 -0
  8. data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
  9. data/bin/prob_validate.rb +6 -0
  10. data/bin/raw_to_mzXML.rb +2 -2
  11. data/bin/srf_group.rb +1 -0
  12. data/bin/srf_to_sqt.rb +40 -0
  13. data/changelog.txt +68 -0
  14. data/lib/align/chams.rb +6 -6
  15. data/lib/align.rb +4 -3
  16. data/lib/bsearch.rb +120 -0
  17. data/lib/fasta.rb +318 -86
  18. data/lib/group_by.rb +10 -0
  19. data/lib/index_by.rb +11 -0
  20. data/lib/merge_deep.rb +21 -0
  21. data/lib/{spec → ms/converter}/mzxml.rb +77 -109
  22. data/lib/ms/gradient_program.rb +171 -0
  23. data/lib/ms/msrun.rb +209 -0
  24. data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
  25. data/lib/ms/parser/mzdata/axml.rb +12 -0
  26. data/lib/ms/parser/mzdata/dom.rb +160 -0
  27. data/lib/ms/parser/mzdata/libxml.rb +7 -0
  28. data/lib/ms/parser/mzdata.rb +25 -0
  29. data/lib/ms/parser/mzxml/axml.rb +11 -0
  30. data/lib/ms/parser/mzxml/dom.rb +159 -0
  31. data/lib/ms/parser/mzxml/hpricot.rb +253 -0
  32. data/lib/ms/parser/mzxml/libxml.rb +15 -0
  33. data/lib/ms/parser/mzxml/regexp.rb +122 -0
  34. data/lib/ms/parser/mzxml/rexml.rb +72 -0
  35. data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
  36. data/lib/ms/parser/mzxml.rb +175 -0
  37. data/lib/ms/parser.rb +108 -0
  38. data/lib/ms/precursor.rb +10 -0
  39. data/lib/ms/scan.rb +81 -0
  40. data/lib/ms/spectrum.rb +193 -0
  41. data/lib/ms.rb +10 -0
  42. data/lib/mspire.rb +4 -0
  43. data/lib/roc.rb +61 -1
  44. data/lib/sample_enzyme.rb +31 -8
  45. data/lib/scan_i.rb +21 -0
  46. data/lib/spec_id/aa_freqs.rb +7 -3
  47. data/lib/spec_id/bioworks.rb +20 -14
  48. data/lib/spec_id/digestor.rb +139 -0
  49. data/lib/spec_id/mass.rb +116 -0
  50. data/lib/spec_id/parser/proph.rb +236 -0
  51. data/lib/spec_id/precision/filter/cmdline.rb +209 -0
  52. data/lib/spec_id/precision/filter/interactive.rb +134 -0
  53. data/lib/spec_id/precision/filter/output.rb +147 -0
  54. data/lib/spec_id/precision/filter.rb +623 -0
  55. data/lib/spec_id/precision/output.rb +60 -0
  56. data/lib/spec_id/precision/prob/cmdline.rb +139 -0
  57. data/lib/spec_id/precision/prob/output.rb +88 -0
  58. data/lib/spec_id/precision/prob.rb +171 -0
  59. data/lib/spec_id/proph/pep_summary.rb +92 -0
  60. data/lib/spec_id/proph/prot_summary.rb +484 -0
  61. data/lib/spec_id/proph.rb +2 -466
  62. data/lib/spec_id/protein_summary.rb +2 -2
  63. data/lib/spec_id/sequest/params.rb +316 -0
  64. data/lib/spec_id/sequest/pepxml.rb +1513 -0
  65. data/lib/spec_id/sequest.rb +2 -1672
  66. data/lib/spec_id/srf.rb +445 -177
  67. data/lib/spec_id.rb +183 -95
  68. data/lib/spec_id_xml.rb +8 -10
  69. data/lib/transmem/phobius.rb +147 -0
  70. data/lib/transmem/toppred.rb +368 -0
  71. data/lib/transmem.rb +157 -0
  72. data/lib/validator/aa.rb +135 -0
  73. data/lib/validator/background.rb +73 -0
  74. data/lib/validator/bias.rb +95 -0
  75. data/lib/validator/cmdline.rb +260 -0
  76. data/lib/validator/decoy.rb +94 -0
  77. data/lib/validator/digestion_based.rb +69 -0
  78. data/lib/validator/probability.rb +48 -0
  79. data/lib/validator/prot_from_pep.rb +234 -0
  80. data/lib/validator/transmem.rb +272 -0
  81. data/lib/validator/true_pos.rb +46 -0
  82. data/lib/validator.rb +214 -0
  83. data/lib/xml.rb +38 -0
  84. data/lib/xml_style_parser.rb +105 -0
  85. data/lib/xmlparser_wrapper.rb +19 -0
  86. data/script/compile_and_plot_smriti_final.rb +97 -0
  87. data/script/extract_gradient_programs.rb +56 -0
  88. data/script/get_apex_values_rexml.rb +44 -0
  89. data/script/mzXML2timeIndex.rb +1 -1
  90. data/script/smriti_final_analysis.rb +103 -0
  91. data/script/toppred_to_yaml.rb +47 -0
  92. data/script/tpp_installer.rb +1 -1
  93. data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
  94. data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
  95. data/specs/bin/fasta_shaker_spec.rb +259 -0
  96. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
  97. data/specs/bin/filter_and_validate_spec.rb +124 -0
  98. data/specs/bin/ms_to_lmat_spec.rb +34 -0
  99. data/specs/bin/prob_validate_spec.rb +62 -0
  100. data/specs/bin/protein_summary_spec.rb +10 -0
  101. data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
  102. data/specs/gi_spec.rb +22 -0
  103. data/specs/load_bin_path.rb +7 -0
  104. data/specs/merge_deep_spec.rb +13 -0
  105. data/specs/ms/gradient_program_spec.rb +77 -0
  106. data/specs/ms/msrun_spec.rb +455 -0
  107. data/specs/ms/parser_spec.rb +92 -0
  108. data/specs/ms/spectrum_spec.rb +89 -0
  109. data/specs/roc_spec.rb +251 -0
  110. data/specs/rspec_autotest.rb +149 -0
  111. data/specs/sample_enzyme_spec.rb +41 -0
  112. data/specs/spec_helper.rb +133 -0
  113. data/specs/spec_id/aa_freqs_spec.rb +52 -0
  114. data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
  115. data/specs/spec_id/digestor_spec.rb +75 -0
  116. data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
  117. data/specs/spec_id/precision/filter/output_spec.rb +31 -0
  118. data/specs/spec_id/precision/filter_spec.rb +243 -0
  119. data/specs/spec_id/precision/prob_spec.rb +111 -0
  120. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  121. data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
  122. data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
  123. data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
  124. data/specs/spec_id/sequest/params_spec.rb +68 -0
  125. data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
  126. data/specs/spec_id/sqt_spec.rb +138 -0
  127. data/specs/spec_id/srf_spec.rb +209 -0
  128. data/specs/spec_id/srf_spec_helper.rb +302 -0
  129. data/specs/spec_id_helper.rb +33 -0
  130. data/specs/spec_id_spec.rb +361 -0
  131. data/specs/spec_id_xml_spec.rb +33 -0
  132. data/specs/transmem/phobius_spec.rb +423 -0
  133. data/specs/transmem/toppred_spec.rb +297 -0
  134. data/specs/transmem_spec.rb +60 -0
  135. data/specs/transmem_spec_shared.rb +64 -0
  136. data/specs/validator/aa_spec.rb +107 -0
  137. data/specs/validator/background_spec.rb +51 -0
  138. data/specs/validator/bias_spec.rb +146 -0
  139. data/specs/validator/decoy_spec.rb +51 -0
  140. data/specs/validator/fasta_helper.rb +26 -0
  141. data/specs/validator/prot_from_pep_spec.rb +141 -0
  142. data/specs/validator/transmem_spec.rb +145 -0
  143. data/specs/validator/true_pos_spec.rb +58 -0
  144. data/specs/validator_helper.rb +33 -0
  145. data/specs/xml_spec.rb +12 -0
  146. data/test_files/000_pepxml18_small.xml +206 -0
  147. data/test_files/020a.mzXML.timeIndex +4710 -0
  148. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
  149. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
  150. data/test_files/4-03-03_small-prot.xml +321 -0
  151. data/test_files/4-03-03_small.xml +3876 -0
  152. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  153. data/test_files/bioworks-3.3_10prots.xml +5999 -0
  154. data/test_files/bioworks31.params +77 -0
  155. data/test_files/bioworks32.params +62 -0
  156. data/test_files/bioworks33.params +63 -0
  157. data/test_files/bioworks_single_run_small.xml +7237 -0
  158. data/test_files/bioworks_small.fasta +212 -0
  159. data/test_files/bioworks_small.params +63 -0
  160. data/test_files/bioworks_small.phobius +109 -0
  161. data/test_files/bioworks_small.toppred.out +2847 -0
  162. data/test_files/bioworks_small.xml +5610 -0
  163. data/test_files/bioworks_with_INV_small.xml +3753 -0
  164. data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
  165. data/test_files/corrupted_900.srf +0 -0
  166. data/test_files/head_of_7MIX.srf +0 -0
  167. data/test_files/interact-opd1_mods_small-prot.xml +304 -0
  168. data/test_files/messups.fasta +297 -0
  169. data/test_files/opd1/000.my_answer.100lines.xml +101 -0
  170. data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
  171. data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
  172. data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
  173. data/test_files/opd1/000_020-prot.png +0 -0
  174. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
  175. data/test_files/opd1/000_020_3prots-prot.xml +62 -0
  176. data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
  177. data/test_files/opd1/sequest.3.1.params +77 -0
  178. data/test_files/opd1/sequest.3.2.params +62 -0
  179. data/test_files/opd1/twenty_scans.mzXML +418 -0
  180. data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
  181. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  182. data/test_files/opd1/twenty_scans_answ.lmata +9 -0
  183. data/test_files/opd1_020_beginning.RAW +0 -0
  184. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
  185. data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
  186. data/test_files/pepproph_small.xml +4691 -0
  187. data/test_files/phobius.small.noheader.txt +50 -0
  188. data/test_files/phobius.small.small.txt +53 -0
  189. data/test_files/s01_anC1_ld020mM.key.txt +25 -0
  190. data/test_files/s01_anC1_ld020mM.meth +0 -0
  191. data/test_files/small.fasta +297 -0
  192. data/test_files/smallraw.RAW +0 -0
  193. data/test_files/tf_bioworks2excel.bioXML +14340 -0
  194. data/test_files/tf_bioworks2excel.txt.actual +1035 -0
  195. data/test_files/toppred.small.out +416 -0
  196. data/test_files/toppred.xml.out +318 -0
  197. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
  198. data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
  199. data/test_files/yeast_gly_small-prot.xml +265 -0
  200. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
  201. data/test_files/yeast_gly_small.xml +3807 -0
  202. data/test_files/yeast_gly_small2.parentTimes +6 -0
  203. metadata +273 -57
  204. data/bin/filter.rb +0 -6
  205. data/bin/precision.rb +0 -5
  206. data/lib/spec/mzdata/parser.rb +0 -108
  207. data/lib/spec/mzdata.rb +0 -48
  208. data/lib/spec/mzxml/parser.rb +0 -449
  209. data/lib/spec/scan.rb +0 -55
  210. data/lib/spec_id/filter.rb +0 -797
  211. data/lib/spec_id/precision.rb +0 -421
  212. data/lib/toppred.rb +0 -18
  213. data/script/filter-peps.rb +0 -164
  214. data/test/tc_aa_freqs.rb +0 -59
  215. data/test/tc_fasta_shaker.rb +0 -149
  216. data/test/tc_filter.rb +0 -203
  217. data/test/tc_filter_peps.rb +0 -46
  218. data/test/tc_gi.rb +0 -17
  219. data/test/tc_id_class_anal.rb +0 -70
  220. data/test/tc_id_precision.rb +0 -89
  221. data/test/tc_msrun.rb +0 -88
  222. data/test/tc_mzxml.rb +0 -88
  223. data/test/tc_mzxml_to_lmat.rb +0 -36
  224. data/test/tc_peptide_parent_times.rb +0 -27
  225. data/test/tc_precision.rb +0 -60
  226. data/test/tc_roc.rb +0 -166
  227. data/test/tc_sample_enzyme.rb +0 -32
  228. data/test/tc_scan.rb +0 -26
  229. data/test/tc_sequest.rb +0 -336
  230. data/test/tc_spec.rb +0 -78
  231. data/test/tc_spec_id.rb +0 -201
  232. data/test/tc_spec_id_xml.rb +0 -36
  233. data/test/tc_srf.rb +0 -262
@@ -0,0 +1,139 @@
1
+
2
+ require 'validator/cmdline'
3
+ require 'spec_id'
4
+
5
module SpecID
  module Precision
    class Prob
      # Command-line front end for the probability-based precision tool.
      # Builds the OptionParser, gathers the chosen options into a hash and
      # loads the input file(s) into a SpecID object.
      class CmdlineParser

        # Library defaults (PN_DEFAULTS) plus the default output request:
        # csv format with no filename.
        DEFAULTS = SpecID::Precision::Prob::PN_DEFAULTS.merge( { :output => [[:csv, nil]], } )


        # Switch definitions (arguments for OptionParser#on) keyed by option
        # name.  Entries from Validator::Cmdline::COMMAND_LINE are merged in
        # last, so they win over a local entry with the same key.
        COMMAND_LINE = {
          :sort_by_init => ['--sort_by_init', "sort the proteins based on init probability"],
          :prob => ['--prob [TYPE]', "use prophet probabilites to calculate precision",
            "TYPE = *nsp|init"],
          # OUTPUT
          :proteins => ["--proteins", "includes proteins (and validation)"],
          :output => ["-o", "--output format[:FILENAME]", "format to output filtering results.",
            "can be used multiple times",
            ":FILENAME is the filename to use (defaults to STDOUT)",
            "valid formats are:",
            " csv (default)",
            " to_plot",
            " calc_bkg_to_plot",
            " yaml",
            #" protein_summary (need to implement)",
            #" html_table (need to implement)"
          ],

          # VALIDATION MODIFIERS:
          :hits_separate => ["--hits_separate", "target/decoy hits are normally together when choosing",
            "the top hit per peptide (in prefilter and postfilter)",
            "in BOTH catenated and separate searches. This flag",
            "separates them when finding the top hit per scan.",
            "[This option modifies behavior of --decoy options]"],

        }.merge( Validator::Cmdline::COMMAND_LINE )


        # Parses +args+ (switches are removed from the array in place by
        # OptionParser#parse!).
        #
        # returns (spec_id_obj, options, option_parser_obj)
        #
        # spec_id_obj is nil when no file argument remains after the
        # switches; in that case validators and the default output are not
        # prepared.  When the first remaining argument ends in .srf the whole
        # argument list is handed to SpecID.new, otherwise only the first
        # argument is.
        def parse(args)
          opts = {}
          opts[:output] = []
          @out_used = false
          opts[:sequest] = {}
          opts[:validators] = []

          option_parser = OptionParser.new do |op|
            # helper: register the switch described by COMMAND_LINE[arg]
            # with a custom handler block
            def op.opt(arg, &block)
              on(*COMMAND_LINE[arg], &block)
            end

            # helper: register a switch whose value is processed by the
            # matching Validator::Cmdline::PrepArgs callable
            def op.val_opt(arg, opts)
              on(*COMMAND_LINE[arg]) {|ar| Validator::Cmdline::PrepArgs[arg].call(ar, opts) }
            end

            # helper: register a switch whose value is stored verbatim
            def op.exact_opt(opts, arg)
              on(*COMMAND_LINE[arg]) {|v| opts[arg] = v}
            end

            op.banner = "USAGE: #{File.basename($0)} [OPTS] <file>-prot.xml"
            op.separator ""
            op.separator " RETURNS: precision across the number of hits (based on probability)"
            op.separator " (optional) other validation of the results."
            op.separator ""

            op.separator "OUTPUT OPTIONS: "
            op.opt(:proteins) {|v| opts[:proteins] = true }
            op.opt(:output) do |output|
              # copied from rspec:
              # This funky regexp checks whether we have a FILE_NAME or not
              where = nil
              if (output =~ /([a-zA-Z_]+(?:::[a-zA-Z_]+)*):?(.*)/) && ($2 != '')
                output = $1
                where = $2
              else
                # only a single --output may go without a filename
                raise "When using several --output options only one of them can be without a file" if @out_used
                @out_used = true
              end
              opts[:output] << [output, where]
            end

            op.separator "GENERAL OPTIONS:"
            op.separator ""
            op.opt(:sort_by_init) {|v| opts[:sort_by_init] = true }
            op.separator "VALIDATION OPTIONS: "
            op.separator " each option will calculate the precision"
            op.separator ""

            op.val_opt(:prob, opts)
            op.val_opt(:decoy, opts)
            op.val_opt(:digestion, opts)
            op.val_opt(:bias, opts)
            op.val_opt(:bad_aa, opts)

            op.val_opt(:tmm, opts)
            op.val_opt(:tps, opts)

            op.separator ""
            op.separator "VALIDATION MODIFIERS: "
            op.val_opt(:false_on_tie, opts) # sets opts[:ties] = false

          end
          option_parser.parse!(args)

          spec_id_obj = nil
          if args.size > 0
            spec_id_obj =
              if args[0] =~ /\.srf$/i
                ::SpecID.new(args)
              else
                ::SpecID.new(args[0])
              end

            # will be nil or false; only nil takes the validator default
            if opts[:ties].nil?
              opts[:ties] = Validator::Cmdline::DEFAULTS[:ties]
            end
            opts[:validators] = Validator::Cmdline.prepare_validators(opts, !opts[:ties], opts[:interactive], spec_id_obj)

            if opts[:output].empty?
              # hand out copies of the [format, filename] pairs: callers are
              # free to fill in the filename slot and that must never write
              # through to the DEFAULTS constant
              opts[:output] = DEFAULTS[:output].map {|pair| pair.dup }
            end
          end

          [spec_id_obj, opts, option_parser]
        end # parse
      end # CmdlineParser
    end # Prob
  end # Precision
end # SpecID
135
+
136
+
137
+
138
+
139
+
@@ -0,0 +1,88 @@
1
+
2
+ require 'spec_id/precision/output'
3
+ require 'table'
4
+ require 'matrix'
5
+
6
# forward declarations so the nested class name below resolves
module SpecID ; end
module SpecID::Precision ; end
class SpecID::Precision::Prob ; end

# Writers for the answer hash produced by the probability-based precision
# calculation.  Each public method takes an IO-like handle and the answer
# hash and writes one output format to the handle.
class SpecID::Precision::Prob::Output
  include SpecID::Precision::Output

  # returns array of data arrays and parallel labels
  #
  # The 'modified_peptide' column is only present when the answer hash
  # carries a :modified_peptides key; one "<validator> (prob)" column is
  # appended per entry in :pephits_precision.
  def to_cols_and_labels(answer_hash)
    labels = ['count', 'probability', 'peptide']
    columns = [answer_hash[:count], answer_hash[:probabilities], answer_hash[:aaseqs]]

    if answer_hash.key?(:modified_peptides)
      columns.push( answer_hash[:modified_peptides] )
      labels << 'modified_peptide'
    end

    labels << 'charge'
    columns.push( answer_hash[:charges] )

    answer_hash[:pephits_precision].each do |precision|
      labels << "#{precision[:validator]} (prob)"
      columns.push( precision[:values] )
    end

    [columns, labels]
  end

  # writes the columns as a tab-delimited table, one row per hit
  def csv(handle, answer_hash)
    columns, labels = to_cols_and_labels(answer_hash)
    rows = Matrix[*columns].transpose
    handle.puts( Table.new(rows, nil, labels).to_s("\t") )
  end

  # writes precision vs. number of hits as an 'XYData' listing: five header
  # lines, then label / x values / y values for every validator
  def to_plot(handle, answer_hash)
    # name the plot after the output file (extension stripped) when the
    # handle has a path
    plot_name = handle.respond_to?(:path) ? File.basename(handle.path).sub(/\.(\w)+$/,'') : 'plot'
    handle.puts 'XYData'
    handle.puts plot_name
    handle.puts 'precision vs. num (aaseq+charge)'
    handle.puts 'num hits'
    handle.puts 'precision'
    answer_hash[:pephits_precision].each do |precision|
      handle.puts precision[:validator] # label
      handle.puts answer_hash[:count] # x vals
      handle.puts precision[:values] # y vals
    end
  end

  # same layout as to_plot, but the y values are the calculated backgrounds
  # stored with each validator under answer_hash[:params][:validators]
  def calc_bkg_to_plot(handle, answer_hash)
    plot_name = handle.respond_to?(:path) ? File.basename(handle.path).sub(/\.(\w)+$/,'') : 'calc_bkg_plot'
    handle.puts 'XYData'
    handle.puts plot_name
    handle.puts 'background vs. num (aaseq+charge)'
    handle.puts 'num hits'
    handle.puts 'background (false/total)'
    answer_hash[:params][:validators].each do |validator|
      handle.puts validator[:name] # label
      handle.puts answer_hash[:count] # x vals
      handle.puts validator[:calculated_backgrounds] # y vals
    end
  end

  # dumps the complete answer hash as YAML
  def yaml(handle, answer_hash)
    handle.puts( answer_hash.to_yaml )
  end

end
87
+
88
+
@@ -0,0 +1,171 @@
1
+ # note that we require 'spec_id/precision/prob/cmdline' below!
2
+
3
+ require 'spec_id/precision/prob/output'
4
+
5
module SpecID ; end
module SpecID::Precision ; end


# for probability based spec identifications (true probabilities, not the
# bioworks p-value (which they call probability)).
class SpecID::Precision::Prob

  PN_DEFAULTS = {
    :proteins => false,
    :validators => [],
    :sort_by_init => false,
  }

  # required here rather than at the top of the file: the command-line
  # parser reads PN_DEFAULTS while it is being loaded
  require 'spec_id/precision/prob/cmdline'

  # Command-line entry point: parses +args+, runs precision_vs_num_hits and
  # writes every requested output.  Prints the usage text and returns nil
  # when no input file was given.
  def precision_vs_num_hits_cmdline(args)
    (spec_id_obj, options, option_parser) = CmdlineParser.new.parse(args)
    if spec_id_obj.nil?
      puts option_parser
      return
    end
    final_answer = SpecID::Precision::Prob.new.precision_vs_num_hits(spec_id_obj, options)
    options[:output].each do |output|
      # an output without a filename goes to standard out
      output[1] = $stdout unless output[1]
      SpecID::Precision::Prob::Output.new(*output).print(final_answer).close
    end
  end

  # opts may include:
  #   :proteins => true|*false
  #   :validators => array of Validator objects
  #   :sort_by_init => true|*false  (order peptides by initial probability
  #                    instead of nsp adjusted probability)
  # NOTE: if you have decoy data, you MUST pass in a decoy validator for the
  # decoy pephits to be removed from other validator analyses!
  # (precision based on peptide probabilities are adjusted to account for
  # the decoy peptides being present: Precision(no_decoy) = (2*Prec)/(Prec+1)
  # which is derived from the 50/50 rule for decoy vs. embedded false hits)
  #
  # raises ArgumentError if more than one decoy validator is given
  #
  # returns a hash of data
  #   :pephits_precision => [{validator => <name>, values => [<precision>,...]},... ]
  #   :params => :validators => [array of validators] (includes
  #                              :calculated_backgrounds)
  #   :count => array of hit numbers (position in the sorted peptide list)
  #   :probabilities => array of probabilities
  #   :aaseqs => array of aaseqs
  #   :charges => array of charge
  #   :modified_peptides => array of modified sequence (only included if
  #                         applicable)
  #
  #
  # TODO: implement this guy:
  #   prothits_precision => {validator => <name>, values => {worst => ,
  #                          normal, normal_stdev } }
  def precision_vs_num_hits(spec_id, opts={})

    opt = PN_DEFAULTS.merge(opts)

    out = {}
    num_pephits = [] # NOTE!: these are aaseq/aaseq_mod + charge (not really a pephit, but BEST)
    val_hash = Hash.new {|hash,key| hash[key] = [] }
    val_calc_bkg_hash = Hash.new {|hash,key| hash[key] = [] }
    pepstrings = []
    modified_peptides = []
    pepcharges = []
    probabilities = []
    found_modified_peptide = false

    # Work on a real Array copy of the validator list.  (A block-less #map
    # returns an Enumerator on current rubies, which has no #delete.)
    validators = opt[:validators].dup

    # do we need to deal with decoy peptides?
    decoy_vals = validators.select {|val| val.class == Validator::Decoy }
    raise(ArgumentError, "only one decoy validator allowed!") if decoy_vals.size > 1
    decoy_val = decoy_vals.first
    validators.delete(decoy_val)

    (probability_validators, other_validators) = validators.partition {|val| val.class == Validator::Probability }
    if opt[:initial_probability]
      probability_validators.each do |pv|
        pv.prob_method = :initial_probability
      end
    end

    # best peptides first; the leading sort key is the nsp adjusted
    # probability unless :sort_by_init was requested
    ordered_peps = spec_id.peps.sort_by do |v|
      key = [v.initial_probability, v.n_instances, ( v.is_nondegenerate_evidence ? 1 : 0 ), v.n_enzymatic_termini, ( v.is_contributing_evidence ? 1 : 0 ), v.n_sibling_peptides]
      key.unshift(v.nsp_adjusted_probability) unless opt[:sort_by_init]
      key
    end.reverse

    ordered_peps.each_with_index do |pep,i|
      # probability validators must work on the entire set of normal and decoy
      last_prob_values = probability_validators.map do |val|
        val.increment_pephits_precision(pep)
      end

      decoy_precision = nil
      it_is_a_normal_pep = true
      if decoy_val
        # get the decoy precision
        decoy_precision = decoy_val.increment_pephits_precision(pep)
        it_is_a_normal_pep = (decoy_val.normal_peps_just_submitted.size > 0)
      end

      # continue with ONLY normal peptides
      next unless it_is_a_normal_pep

      # UPDATE validators:
      val_hash[decoy_val] << decoy_precision if decoy_val
      # NOTE(review): the (2*Prec)/(Prec+1) adjustment is applied whether or
      # not a decoy validator was supplied -- confirm that is intended
      probability_validators.zip(last_prob_values) do |val,prec|
        val_hash[val] << ( (prec * 2.0) / (prec + 1.0) )
      end
      other_validators.each do |val|
        val_hash[val] << val.increment_pephits_precision(pep)
        if val.is_a? Validator::DigestionBased
          val_calc_bkg_hash[val] << val.calculated_background
        end
      end

      # UPDATE other basic useful information:
      modified_pep_string = nil
      if pep.mod_info
        found_modified_peptide = true
        modified_pep_string = pep.mod_info.modified_peptide
      end
      modified_peptides << modified_pep_string
      pepcharges << pep.charge
      pepstrings << pep.aaseq
      probabilities << pep.probability
      num_pephits << (i+1)
    end

    if found_modified_peptide
      out[:modified_peptides] = modified_peptides
    end
    out[:probabilities] = probabilities
    out[:count] = num_pephits
    out[:aaseqs] = pepstrings
    out[:charges] = pepcharges
    out[:pephits_precision] = opt[:validators].map do |val|
      hsh = {}
      hsh[:validator] = Validator::Validator_to_string[val.class.to_s]
      hsh[:values] = val_hash[val]
      hsh
    end
    out[:params] = {}
    out[:params][:validators] = Validator.sensible_validator_hashes(opt[:validators]).zip(opt[:validators]).map do |hash,val|
      # replace the single background value with the running list
      hash.delete(:calculated_background)
      hash[:calculated_backgrounds] = val_calc_bkg_hash[val]
      hash
    end
    out
  end
end
170
+
171
+
@@ -0,0 +1,92 @@
1
+
2
# Library dependencies.  (Loading this file no longer writes debug output
# to standard out.)
require 'array_class'
require 'spec_id/sequest/pepxml'
require 'spec_id/parser/proph'
6
+
7
# forward declarations so the superclass names below resolve
module Sequest ; end
class Sequest::PepXML ; end
class Sequest::PepXML::MSMSRunSummary ; end
class Sequest::PepXML::SearchHit ; end

module SpecID ; end
module SpecID::Prot ; end
module SpecID::Pep ; end

module Proph

  # A PeptideProphet pepXML summary.
  class PepSummary < Sequest::PepXML::MSMSRunSummary
    # MSMSRunSummary is a SpecID object!

    # captures the PeptideProphet version number from the file header
    Filetype_and_version_re_new = /version="PeptideProphet v([\d\.]+) /

    # inherits prots and peps

    # NOTE(review): the original comment says these are currently just the
    # xml nodes returned -- confirm against the parser
    attr_accessor :peptideprophet_summary
    attr_accessor :spectrum_queries
    attr_accessor :version

    # With a file: detects the PeptideProphet version and parses the file
    # into this object.  Without one: leaves the object empty.
    def initialize(file=nil)
      @prots = nil
      return unless file
      @version = get_version(file)
      #@prot_groups = ProtSummary::Parser.new.parse_file(file)
      SpecID::Parser::PepProph.new(:spec_id).parse(file, :spec_id => self)
    end

    # always true
    def hi_prob_best
      true
    end

    # Looks for the PeptideProphet version string in the first 8 lines of
    # +file+ and returns it; raises ArgumentError when it is not found.
    def get_version(file)
      version = nil
      File.open(file) do |io|
        8.times do
          if io.gets =~ Filetype_and_version_re_new
            version = $1.dup
            break
          end
        end
      end
      raise(ArgumentError, "couldn't detect version in #{file}") unless version
      version
    end

    # the class used for the search hits of this summary
    def search_hit_class
      PepSummary::Pep
    end
  end

  # A search hit carrying the PeptideProphet result values.
  class PepSummary::Pep < Sequest::PepXML::SearchHit
    ['probability', 'fval', 'ntt', 'nmc', 'massd'].each {|member| add_member(member) }

    # Fills in the inherited attributes via super, then the probability and
    # the fval/ntt/nmc/massd parameters found under the hit's
    # analysis_result/peptideprophet_result node.
    #
    # returns self
    def from_pepxml_node(node, spec_query)
      super(node, spec_query)
      #pp_n = node.find_first('descendant::peptideprophet_result')
      result_node = node.find_first('child::analysis_result').find_first('child::peptideprophet_result')
      self.probability = result_node['probability'].to_f

      # parameter name => conversion applied to its 'value' attribute
      conversions = { 'fval' => :to_f, 'ntt' => :to_i, 'nmc' => :to_i, 'massd' => :to_f }
      result_node.find('descendant::parameter').each do |param|
        name = param['name']
        conversion = conversions[name]
        next unless conversion # parameters we do not track are ignored
        send("#{name}=", param['value'].send(conversion))
      end
      self
    end
  end
end
91
+
92
+