mspire 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233) hide show
  1. data/INSTALL +1 -0
  2. data/README +25 -0
  3. data/Rakefile +129 -40
  4. data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
  5. data/bin/bioworks_to_pepxml.rb +1 -0
  6. data/bin/fasta_shaker.rb +1 -96
  7. data/bin/filter_and_validate.rb +5 -0
  8. data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
  9. data/bin/prob_validate.rb +6 -0
  10. data/bin/raw_to_mzXML.rb +2 -2
  11. data/bin/srf_group.rb +1 -0
  12. data/bin/srf_to_sqt.rb +40 -0
  13. data/changelog.txt +68 -0
  14. data/lib/align/chams.rb +6 -6
  15. data/lib/align.rb +4 -3
  16. data/lib/bsearch.rb +120 -0
  17. data/lib/fasta.rb +318 -86
  18. data/lib/group_by.rb +10 -0
  19. data/lib/index_by.rb +11 -0
  20. data/lib/merge_deep.rb +21 -0
  21. data/lib/{spec → ms/converter}/mzxml.rb +77 -109
  22. data/lib/ms/gradient_program.rb +171 -0
  23. data/lib/ms/msrun.rb +209 -0
  24. data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
  25. data/lib/ms/parser/mzdata/axml.rb +12 -0
  26. data/lib/ms/parser/mzdata/dom.rb +160 -0
  27. data/lib/ms/parser/mzdata/libxml.rb +7 -0
  28. data/lib/ms/parser/mzdata.rb +25 -0
  29. data/lib/ms/parser/mzxml/axml.rb +11 -0
  30. data/lib/ms/parser/mzxml/dom.rb +159 -0
  31. data/lib/ms/parser/mzxml/hpricot.rb +253 -0
  32. data/lib/ms/parser/mzxml/libxml.rb +15 -0
  33. data/lib/ms/parser/mzxml/regexp.rb +122 -0
  34. data/lib/ms/parser/mzxml/rexml.rb +72 -0
  35. data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
  36. data/lib/ms/parser/mzxml.rb +175 -0
  37. data/lib/ms/parser.rb +108 -0
  38. data/lib/ms/precursor.rb +10 -0
  39. data/lib/ms/scan.rb +81 -0
  40. data/lib/ms/spectrum.rb +193 -0
  41. data/lib/ms.rb +10 -0
  42. data/lib/mspire.rb +4 -0
  43. data/lib/roc.rb +61 -1
  44. data/lib/sample_enzyme.rb +31 -8
  45. data/lib/scan_i.rb +21 -0
  46. data/lib/spec_id/aa_freqs.rb +7 -3
  47. data/lib/spec_id/bioworks.rb +20 -14
  48. data/lib/spec_id/digestor.rb +139 -0
  49. data/lib/spec_id/mass.rb +116 -0
  50. data/lib/spec_id/parser/proph.rb +236 -0
  51. data/lib/spec_id/precision/filter/cmdline.rb +209 -0
  52. data/lib/spec_id/precision/filter/interactive.rb +134 -0
  53. data/lib/spec_id/precision/filter/output.rb +147 -0
  54. data/lib/spec_id/precision/filter.rb +623 -0
  55. data/lib/spec_id/precision/output.rb +60 -0
  56. data/lib/spec_id/precision/prob/cmdline.rb +139 -0
  57. data/lib/spec_id/precision/prob/output.rb +88 -0
  58. data/lib/spec_id/precision/prob.rb +171 -0
  59. data/lib/spec_id/proph/pep_summary.rb +92 -0
  60. data/lib/spec_id/proph/prot_summary.rb +484 -0
  61. data/lib/spec_id/proph.rb +2 -466
  62. data/lib/spec_id/protein_summary.rb +2 -2
  63. data/lib/spec_id/sequest/params.rb +316 -0
  64. data/lib/spec_id/sequest/pepxml.rb +1513 -0
  65. data/lib/spec_id/sequest.rb +2 -1672
  66. data/lib/spec_id/srf.rb +445 -177
  67. data/lib/spec_id.rb +183 -95
  68. data/lib/spec_id_xml.rb +8 -10
  69. data/lib/transmem/phobius.rb +147 -0
  70. data/lib/transmem/toppred.rb +368 -0
  71. data/lib/transmem.rb +157 -0
  72. data/lib/validator/aa.rb +135 -0
  73. data/lib/validator/background.rb +73 -0
  74. data/lib/validator/bias.rb +95 -0
  75. data/lib/validator/cmdline.rb +260 -0
  76. data/lib/validator/decoy.rb +94 -0
  77. data/lib/validator/digestion_based.rb +69 -0
  78. data/lib/validator/probability.rb +48 -0
  79. data/lib/validator/prot_from_pep.rb +234 -0
  80. data/lib/validator/transmem.rb +272 -0
  81. data/lib/validator/true_pos.rb +46 -0
  82. data/lib/validator.rb +214 -0
  83. data/lib/xml.rb +38 -0
  84. data/lib/xml_style_parser.rb +105 -0
  85. data/lib/xmlparser_wrapper.rb +19 -0
  86. data/script/compile_and_plot_smriti_final.rb +97 -0
  87. data/script/extract_gradient_programs.rb +56 -0
  88. data/script/get_apex_values_rexml.rb +44 -0
  89. data/script/mzXML2timeIndex.rb +1 -1
  90. data/script/smriti_final_analysis.rb +103 -0
  91. data/script/toppred_to_yaml.rb +47 -0
  92. data/script/tpp_installer.rb +1 -1
  93. data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
  94. data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
  95. data/specs/bin/fasta_shaker_spec.rb +259 -0
  96. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
  97. data/specs/bin/filter_and_validate_spec.rb +124 -0
  98. data/specs/bin/ms_to_lmat_spec.rb +34 -0
  99. data/specs/bin/prob_validate_spec.rb +62 -0
  100. data/specs/bin/protein_summary_spec.rb +10 -0
  101. data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
  102. data/specs/gi_spec.rb +22 -0
  103. data/specs/load_bin_path.rb +7 -0
  104. data/specs/merge_deep_spec.rb +13 -0
  105. data/specs/ms/gradient_program_spec.rb +77 -0
  106. data/specs/ms/msrun_spec.rb +455 -0
  107. data/specs/ms/parser_spec.rb +92 -0
  108. data/specs/ms/spectrum_spec.rb +89 -0
  109. data/specs/roc_spec.rb +251 -0
  110. data/specs/rspec_autotest.rb +149 -0
  111. data/specs/sample_enzyme_spec.rb +41 -0
  112. data/specs/spec_helper.rb +133 -0
  113. data/specs/spec_id/aa_freqs_spec.rb +52 -0
  114. data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
  115. data/specs/spec_id/digestor_spec.rb +75 -0
  116. data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
  117. data/specs/spec_id/precision/filter/output_spec.rb +31 -0
  118. data/specs/spec_id/precision/filter_spec.rb +243 -0
  119. data/specs/spec_id/precision/prob_spec.rb +111 -0
  120. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  121. data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
  122. data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
  123. data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
  124. data/specs/spec_id/sequest/params_spec.rb +68 -0
  125. data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
  126. data/specs/spec_id/sqt_spec.rb +138 -0
  127. data/specs/spec_id/srf_spec.rb +209 -0
  128. data/specs/spec_id/srf_spec_helper.rb +302 -0
  129. data/specs/spec_id_helper.rb +33 -0
  130. data/specs/spec_id_spec.rb +361 -0
  131. data/specs/spec_id_xml_spec.rb +33 -0
  132. data/specs/transmem/phobius_spec.rb +423 -0
  133. data/specs/transmem/toppred_spec.rb +297 -0
  134. data/specs/transmem_spec.rb +60 -0
  135. data/specs/transmem_spec_shared.rb +64 -0
  136. data/specs/validator/aa_spec.rb +107 -0
  137. data/specs/validator/background_spec.rb +51 -0
  138. data/specs/validator/bias_spec.rb +146 -0
  139. data/specs/validator/decoy_spec.rb +51 -0
  140. data/specs/validator/fasta_helper.rb +26 -0
  141. data/specs/validator/prot_from_pep_spec.rb +141 -0
  142. data/specs/validator/transmem_spec.rb +145 -0
  143. data/specs/validator/true_pos_spec.rb +58 -0
  144. data/specs/validator_helper.rb +33 -0
  145. data/specs/xml_spec.rb +12 -0
  146. data/test_files/000_pepxml18_small.xml +206 -0
  147. data/test_files/020a.mzXML.timeIndex +4710 -0
  148. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
  149. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
  150. data/test_files/4-03-03_small-prot.xml +321 -0
  151. data/test_files/4-03-03_small.xml +3876 -0
  152. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  153. data/test_files/bioworks-3.3_10prots.xml +5999 -0
  154. data/test_files/bioworks31.params +77 -0
  155. data/test_files/bioworks32.params +62 -0
  156. data/test_files/bioworks33.params +63 -0
  157. data/test_files/bioworks_single_run_small.xml +7237 -0
  158. data/test_files/bioworks_small.fasta +212 -0
  159. data/test_files/bioworks_small.params +63 -0
  160. data/test_files/bioworks_small.phobius +109 -0
  161. data/test_files/bioworks_small.toppred.out +2847 -0
  162. data/test_files/bioworks_small.xml +5610 -0
  163. data/test_files/bioworks_with_INV_small.xml +3753 -0
  164. data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
  165. data/test_files/corrupted_900.srf +0 -0
  166. data/test_files/head_of_7MIX.srf +0 -0
  167. data/test_files/interact-opd1_mods_small-prot.xml +304 -0
  168. data/test_files/messups.fasta +297 -0
  169. data/test_files/opd1/000.my_answer.100lines.xml +101 -0
  170. data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
  171. data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
  172. data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
  173. data/test_files/opd1/000_020-prot.png +0 -0
  174. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
  175. data/test_files/opd1/000_020_3prots-prot.xml +62 -0
  176. data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
  177. data/test_files/opd1/sequest.3.1.params +77 -0
  178. data/test_files/opd1/sequest.3.2.params +62 -0
  179. data/test_files/opd1/twenty_scans.mzXML +418 -0
  180. data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
  181. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  182. data/test_files/opd1/twenty_scans_answ.lmata +9 -0
  183. data/test_files/opd1_020_beginning.RAW +0 -0
  184. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
  185. data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
  186. data/test_files/pepproph_small.xml +4691 -0
  187. data/test_files/phobius.small.noheader.txt +50 -0
  188. data/test_files/phobius.small.small.txt +53 -0
  189. data/test_files/s01_anC1_ld020mM.key.txt +25 -0
  190. data/test_files/s01_anC1_ld020mM.meth +0 -0
  191. data/test_files/small.fasta +297 -0
  192. data/test_files/smallraw.RAW +0 -0
  193. data/test_files/tf_bioworks2excel.bioXML +14340 -0
  194. data/test_files/tf_bioworks2excel.txt.actual +1035 -0
  195. data/test_files/toppred.small.out +416 -0
  196. data/test_files/toppred.xml.out +318 -0
  197. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
  198. data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
  199. data/test_files/yeast_gly_small-prot.xml +265 -0
  200. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
  201. data/test_files/yeast_gly_small.xml +3807 -0
  202. data/test_files/yeast_gly_small2.parentTimes +6 -0
  203. metadata +273 -57
  204. data/bin/filter.rb +0 -6
  205. data/bin/precision.rb +0 -5
  206. data/lib/spec/mzdata/parser.rb +0 -108
  207. data/lib/spec/mzdata.rb +0 -48
  208. data/lib/spec/mzxml/parser.rb +0 -449
  209. data/lib/spec/scan.rb +0 -55
  210. data/lib/spec_id/filter.rb +0 -797
  211. data/lib/spec_id/precision.rb +0 -421
  212. data/lib/toppred.rb +0 -18
  213. data/script/filter-peps.rb +0 -164
  214. data/test/tc_aa_freqs.rb +0 -59
  215. data/test/tc_fasta_shaker.rb +0 -149
  216. data/test/tc_filter.rb +0 -203
  217. data/test/tc_filter_peps.rb +0 -46
  218. data/test/tc_gi.rb +0 -17
  219. data/test/tc_id_class_anal.rb +0 -70
  220. data/test/tc_id_precision.rb +0 -89
  221. data/test/tc_msrun.rb +0 -88
  222. data/test/tc_mzxml.rb +0 -88
  223. data/test/tc_mzxml_to_lmat.rb +0 -36
  224. data/test/tc_peptide_parent_times.rb +0 -27
  225. data/test/tc_precision.rb +0 -60
  226. data/test/tc_roc.rb +0 -166
  227. data/test/tc_sample_enzyme.rb +0 -32
  228. data/test/tc_scan.rb +0 -26
  229. data/test/tc_sequest.rb +0 -336
  230. data/test/tc_spec.rb +0 -78
  231. data/test/tc_spec_id.rb +0 -201
  232. data/test/tc_spec_id_xml.rb +0 -36
  233. data/test/tc_srf.rb +0 -262
@@ -0,0 +1,73 @@
1
+ require 'validator'
2
+ require 'vec'
3
+ require 'enumerator'
4
+
5
class Validator ; end

# Estimates a background value from the series of numbers held in +data+.
# Every estimator copies +data+ into a VecD and replaces NaN entries with 0
# before doing any work, so +data+ itself is never modified.
class Validator::Background

  # the raw series of values (splatted into VecD[] by the estimators)
  attr_accessor :data

  def initialize(data=nil)
    @data = data
  end

  # Replaces every NaN entry of +vec+ with 0, in place.  (Despite the name,
  # nothing is removed: the length of +vec+ is unchanged.)
  # Returns whatever vec.each_with_index returns.
  def delete_nan!(vec)
    vec.each_with_index do |v,i|
      vec[i] = 0 if v.nan?
    end
  end

  # Returns the minimum data value in the window
  # [i - min_window_pre, i + min_window_post], where i is the last index at
  # which the derivative of the smoothed (stdev_factor * stdev + spread)
  # curve is exactly 0.
  #
  # NOTE(review): stdev_points is accepted but never used; both transform
  # windows are hard-coded to 9.  Left as is because the default (15)
  # differs from 9 and switching would change results -- confirm intent.
  # NOTE(review): presumably VecD#transform yields a sliding window,
  # sample_stats[1] is the standard deviation and chim returns derivatives
  # -- confirm against the 'vec' library.
  def stdev_plus_spread(stdev_factor=2.0, stdev_points=15, min_window_pre=5, min_window_post=5)
    data_vec = VecD[*@data]
    delete_nan!(data_vec)
    stdev_transform = data_vec.transform(9) {|vec| (stdev_factor * vec.sample_stats[1]) + vec.spread }
    smoothed_stdev = stdev_transform.transform(9) {|vec| vec.avg }
    smoothed_stdev_derivs = smoothed_stdev.chim
    last_0_index = index_of_last_0(smoothed_stdev_derivs)
    min_in_window(data_vec, last_0_index, min_window_pre, min_window_post)
  end

  # not really working right currently
  #
  # Same idea as stdev_plus_spread, but the curve inspected is the moving
  # average (window of avg_points) of the absolute values of data.chim.
  def derivs(avg_points=15, min_window_pre=5, min_window_post=5)
    data_vec = VecD[*@data]
    delete_nan!(data_vec)
    drvs = data_vec.chim
    # absolute value
    drvs.each_with_index {|x,i| drvs[i] = x.abs }
    mv_avg = drvs.transform(avg_points) {|v| v.avg }
    last_0_index = index_of_last_0(mv_avg.chim)
    min_in_window(data_vec, last_0_index, min_window_pre, min_window_post)
  end

  # Returns the index of the last element of +vec+ that == 0, or nil when
  # no element is 0.
  def index_of_last_0(vec)
    last_0_index = nil
    vec.each_with_index do |v,i|
      last_0_index = i if v == 0
    end
    last_0_index
  end

  # Returns the minimum value of vec[(index - pre)..(index + post)], with
  # both ends clamped to the valid index range of +vec+.  +index+ must be
  # an Integer (a nil from index_of_last_0 raises NoMethodError here).
  def min_in_window(vec, index, pre, post)
    last_index = vec.size - 1
    start = index - pre
    stop = index + post
    start = 0 if start < 0
    stop = last_index if stop > last_index
    vec[start..stop].min
  end

  # very simple, should work
  #
  # Smooths the data with a moving average over +points+ values and returns
  # the minimum of the smoothed values between indices +start+ and +stop+
  # (inclusive).  +points+ used to be ignored (the window was always 3,
  # which is still the default).
  def min_mesa(start, stop, points=3)
    data_vec = VecD[*@data]
    delete_nan!(data_vec)
    smoothed = data_vec.transform(points) {|v| v.avg }
    smoothed[start..stop].min
  end

end
73
+
@@ -0,0 +1,95 @@
1
+ require 'validator'
2
+ require 'validator/digestion_based'
3
+
4
# class for any generic kind of bias.  For instance, a list of high abundance
# proteins we would expect to see, or a list of low abundance proteins we
# would not expect to see, or proteins that have been filtered out in some
# way, etc.
class Validator::Bias < Validator::DigestionBased
  include Precision::Calculator

  # a fasta object (by default containing proteins expected to be in the
  # sample [see proteins_expected to modify that behavior])
  attr_reader :fasta

  # correct_wins means that only a single protein from a pep.aaseq must match
  # the fasta object for the pep hit to be considered valid.  Otherwise, all
  # must be a match (logic negated by proteins_expected)
  attr_accessor :correct_wins

  # proteins_expected==true means we expect to see the proteins in the sample
  # proteins_expected==false means we do not expect to see these proteins in
  # the sample
  attr_accessor :proteins_expected

  # a hash whose keys are the first_entry of each protein in the fasta
  # object and whose values are all true.  It is queried with the
  # first_entry of a peptide's protein.
  attr_accessor :short_reference_hash

  DEFAULTS = Validator::DigestionBased::DEFAULTS.merge( {
    :proteins_expected => true,
    :correct_wins => true,
  } )

  # options:
  #     (t = true, f = false, '*'= default)
  #     :proteins_expected => *t/f  we expect to see the fasta proteins in our hit list
  #     :correct_wins => *t/f  a single peptide hit from one of these proteins
  #                            constitutes a true positive
  #     :background => Float (*0.0-1.0)
  #     :false_to_total_ratio => Float (default comes from
  #                              Validator::DigestionBased::DEFAULTS)
  def initialize(fasta_object, options={})
    opts = DEFAULTS.merge(options)
    (@proteins_expected, @correct_wins, @background, @false_to_total_ratio) = opts.values_at(:proteins_expected, :correct_wins, :background, :false_to_total_ratio)
    @fasta = fasta_object
    # NOTE(review): despite its name this is an Array of references, and
    # nothing in this class reads it -- confirm whether it is still needed.
    @header_split_hash = @fasta.prots.map {|prot| prot.reference }
    @short_reference_hash = self.class.make_short_reference_hash(fasta_object)
  end

  # Returns a hash mapping prot.first_entry => true for every protein
  # yielded by fasta_object.each
  def self.make_short_reference_hash(fasta_object)
    hash = {}
    fasta_object.each do |prot|
      hash[prot.first_entry] = true
    end
    hash
  end

  # Splits peps into [true_positives, false_positives].
  #
  # A protein is "listed" when its first_entry is a key of
  # short_reference_hash.  The four option combinations give:
  #
  #   expected, correct_wins       tp = peps with ANY listed protein
  #   expected, !correct_wins      tp = peps with ONLY listed proteins
  #   !expected, correct_wins      tp = peps with ANY unlisted protein
  #   !expected, !correct_wins     tp = peps with ONLY unlisted proteins
  def partition(peps)
    look_for_listed = @proteins_expected ? @correct_wins : !@correct_wins

    (tp, fp) = peps.partition do |pep|
      pep.prots.any? do |pepprot|
        listed = @short_reference_hash.key?( pepprot.first_entry )
        look_for_listed ? listed : !listed
      end
    end

    # when a single bad protein loses the hit, the group selected by any?
    # above is the false group, so the two halves trade places
    tp, fp = fp, tp unless @correct_wins

    [tp, fp]
  end

  # pephit_precision is done through inheritance

  # Returns a one line summary of the settings of this validator
  def to_param_string
    "abundance=" + ["{fasta=#{@fasta.filename}", "proteins_expected=#{@proteins_expected}", "correct_wins=#{@correct_wins}", "background=#{@background}}"].join(", ")
  end

end
@@ -0,0 +1,260 @@
1
+ require 'validator'
2
+
3
# Command line support for validators: option defaults, help text for each
# option, lambdas that turn parsed option arguments into validator
# argument lists (stored in opts[:validators]), and a factory that turns
# those lists into validator objects.
class Validator::Cmdline

  # maps the symbol used in opts[:validators] to the class instantiated
  Validator_symbols_to_classes = {
    :tmm => Validator::Transmem::Protein,
    :decoy => Validator::Decoy,
    :bad_aa => Validator::AA,
    :tps => Validator::TruePos,
    :bias => Validator::Bias,
    :prob => Validator::Probability,
  }
  # was VAL_DEFAULTS
  DEFAULTS = {
    :tmm =>
    {
      # file
      :min_num_tmm_seqs => 1,
      :expect_soluble => true,
      :no_include_tm_peps => 0.8,
      :bkg => 0.0,
    },
    :decoy =>
    {
      :hits_together => true,
      :decoy_on_match => true,
    },
    :bad_aa =>
    {
      :false_if_found => true,
      :estimate => true,
      :bkg => 0.0,
    },
    :bias =>
    {
      :bkg => 0.0,
      :proteins_expected => true,
    },
    :ties => true,
  }
  # option name => arguments describing the option (switch, optional type,
  # then the help lines)
  COMMAND_LINE = {
    :decoy => ["--decoy /REGEXP/|FILENAME[DOM]", Array, "REGEXP for decoy proteins (catenated searches) or a",
      "FILENAME of separate search on decoys.",
      "All regular expressions must be surrounded by '/'",
      "(no extended options [trailing modifiers]).",
      "e.g., a run using concatenated reversed proteins that",
      "includes 'REVERSE' in the fasta heading:",
      " --decoy /REVERSE/",
      "Anything fancier should be quoted:",
      " --decoy '/^\\s*REVERSE/'",
      "If decoys proteins were searched in a separate file,",
      "then give the FILENAME (e.g., --decoy decoy.srg)",
      "DOM = *true/false, decoy on match",],
    :tps => ["--tps <fasta>", "for a completely defined sample, this is the",
      "fasta file containing the true protein hits"],
    # may require digestion:
    :digestion => ["--digestion ORIG_FASTA,PARAMS", Array, "The following validators require additional",
      "information (that is shared between them).",
      "ORIG_FASTA = the fasta file used to do the run",
      "PARAMS = the params file used to do the run",],
    :bias => ["--bias FASTA[,PE,BKG]", Array, "FASTA contains proteins expected to be in the sample",
      "PE = *true|false proteins in fasta file expected in sample",
      "BKG = Background frequency of fps (d: #{DEFAULTS[:bias][:bkg]})",],
    :bad_aa => ["--bad_aa AA,[EST,BKG]", Array, "An amino acid expected (or not expected) in legitimate hits",
      "AA = The amino acid (e.g., 'C')",
      "EST = true|false (def: #{DEFAULTS[:bad_aa][:estimate]})",
      "BKG = Background frequency of genuine pephits (d: #{DEFAULTS[:bad_aa][:bkg]}):",],

    :tmm => ["--tmm <TM[,MIN,SOL,PEPS,BKG]>", Array, "TM = phobius.small or toppred.out file",
      "phobius.small:",
      "http://phobius.cgb.ki.se/",
      "(select 'Short' output, and save output as file)",
      "toppred.out:",
      "http://bioweb.pasteur.fr/seqanal/interfaces/toppred.html",
      "(output 'toppred.out' in 'New' or 'Xml' format)",
      "MIN = Int, minimum number transmembrane seqs (def: #{DEFAULTS[:tmm][:min_num_tmm_seqs]})",
      "SOL = true|false, this is a soluble fraction( def: #{DEFAULTS[:tmm][:expect_soluble]})",
      "PEPS = Float | false, don't consider tm peps (>= fraction",
      " tm content) (false skips) (def: #{DEFAULTS[:tmm][:no_include_tm_peps]})",
      "BKG = Float , background contaminating insoluble (def: #{DEFAULTS[:tmm][:bkg]})"],


    # VALIDATION MODIFIERS
    # (the flag turns correct_wins off, so a tie counts as incorrect)
    :false_on_tie => ["--false_on_tie", "if peptide belongs to correct AND incorrect proteins",
      "it will be counted as incorrect (default: correct)"],

  }

  # Converts the strings 'true' and 'false' into booleans.  Anything else
  # (including nil for an argument that was not given) yields +default+.
  def self.boolean(arg, default)
    case arg
    when 'true' ; true
    when 'false' ; false
    else ; default
    end
  end

  # option name => lambda {|parsed_argument, opts| ... }
  # Each lambda records its result in opts: validators push
  # [symbol, *constructor_args] onto opts[:validators]; :digestion and
  # :false_on_tie set keys of their own.
  PrepArgs = {
    :prob => lambda {|ar, opts|
      mthd =
        case ar
        when nil, false ; :probability
        when 'nsp' ; :probability
        when 'init' ; :initial_probability
        else
          raise ArgumentError, "--prob [arg], optional arg can only be 'nsp' or 'init'!"
        end
      opts[:validators].push([:prob, mthd])
    },
    :decoy => lambda {|ar, opts|
      myargs = [:decoy]
      first_arg = ar[0]
      # /.../ means a regexp on the protein reference; anything else is
      # passed through as a string (a filename, per the help text)
      myargs[1] =
        if first_arg[0,1] == '/' && first_arg[-1,1] == '/'
          Regexp.new(first_arg[1...-1])
        else
          first_arg
        end
      myargs[2] = boolean(ar[1], DEFAULTS[:decoy][:decoy_on_match])
      opts[:validators].push(myargs)
    },
    :digestion => lambda {|ar, opts|
      raise(ArgumentError, "need fasta and sequest params!") if ar.size != 2
      opts[:digestion] = ar.dup
      opts[:digestion_objects] = [Fasta.new(ar[0]), Sequest::Params.new(ar[1])]
    },
    :bias => lambda {|ar, opts|
      myargs = [:bias]
      myargs.push( Fasta.new(ar[0]) )
      val_opts = {}
      val_opts[:proteins_expected] = boolean(ar[1], DEFAULTS[:bias][:proteins_expected])
      val_opts[:background] = ar[2] ? ar[2].to_f : DEFAULTS[:bias][:bkg]
      myargs.push(val_opts)
      opts[:validators].push(myargs)
    },
    :bad_aa => lambda {|ar, opts|
      ## GET the FREQUENCY
      myargs = [:bad_aa]
      myargs.push( ar[0] )
      val_opts = {}
      # the key is :estimate (this used to read the non-existent :est key,
      # which silently turned the default into nil)
      val_opts[:estimate] = boolean(ar[1], DEFAULTS[:bad_aa][:estimate])
      val_opts[:background] = ar[2] ? ar[2].to_f : DEFAULTS[:bad_aa][:bkg]
      myargs.push(val_opts)
      opts[:validators].push(myargs)
    },
    :tmm => lambda {|ar, opts|
      myargs = [:tmm]
      myargs.push( ar[0] )
      val_opts = {}
      val_opts[:min_num_tms] = ar[1] ? ar[1].to_i : DEFAULTS[:tmm][:min_num_tmm_seqs]
      val_opts[:soluble_fraction] = boolean(ar[2], DEFAULTS[:tmm][:expect_soluble])
      val_opts[:no_include_tm_peps] =
        if ar[3]
          # the literal string 'false' switches the feature off
          ar[3] == 'false' ? false : ar[3].to_f
        else
          DEFAULTS[:tmm][:no_include_tm_peps]
        end
      val_opts[:background] = ar[4] ? ar[4].to_f : DEFAULTS[:tmm][:bkg]
      myargs.push(val_opts)
      opts[:validators].push( myargs )
    },
    :tps => lambda {|v,opts| opts[:validators].push([:tps, Fasta.new(v)]) },
    :false_on_tie => lambda {|v,opts| opts[:ties] = false },
  }

  # Builds one validator object for each entry of opts[:validators] and
  # returns them as an array.
  #
  #     opts          = the hash filled in by the PrepArgs lambdas
  #     false_on_tie  = true if a peptide belonging to both correct and
  #                     incorrect proteins should NOT count as correct
  #     interactive   = currently unused
  #     spec_id       = object whose peps are used to precompute the
  #                     transmembrane status hash of any :tmm validator
  #
  # Side effects: the leading symbol is shifted off each entry of
  # opts[:validators], the option hashes inside those entries receive
  # extra keys, and opts[:digestion_objects] is deleted.
  #
  # Raises ArgumentError if a validator needs the --digestion objects and
  # they were not supplied.
  def self.prepare_validators(opts, false_on_tie, interactive, spec_id)
    validator_args = opts[:validators]
    correct_wins = !false_on_tie
    need_false_to_total_ratio = []
    need_frequency = []
    transmem_vals = []
    # class names of validators that may need digestion information
    potential_digestion_classes = /Transmem|AA|Bias/
    validators = validator_args.map do |args|
      tp = args.shift
      val_args = args.dup  # protect the original keys
      val_args =
        case tp
        when :tmm
          # fail with the same message as below instead of a NoMethodError
          # on nil when --digestion was not given
          raise ArgumentError, "requires --digestion fasta,params argument!" if !opts.key?(:digestion_objects)
          val_args[1][:correct_wins] = correct_wins
          val_args[1][:fasta] = opts[:digestion_objects][0]
          val_args
        when :bias
          val_args[1][:correct_wins] = correct_wins
          val_args
        when :tps
          [val_args[0], correct_wins]
        when :decoy
          # don't delete the key here since we need the decoy = regexp key
          [val_args[0], val_args[1], correct_wins]
        else  ## bad_aa and prob are represented here:
          val_args
        end
      val = Validator_symbols_to_classes[tp].new( *val_args )
      # make some lists of validators based on pre-processing needs:
      transmem_vals << val if tp == :tmm
      if val.class.to_s =~ potential_digestion_classes
        if val_args[1][:estimate] == true
          need_frequency << val
        else
          need_false_to_total_ratio << val
        end
      end
      val
    end

    if need_false_to_total_ratio.size > 0
      raise ArgumentError, "requires --digestion fasta,params argument!" if !opts.key?(:digestion_objects)
      peps = Digestor.digest( *(opts[:digestion_objects]) )
      need_false_to_total_ratio.each do |val|
        val.set_false_to_total_ratio( peps )
      end
    end
    if need_frequency.size > 0
      raise ArgumentError, "requires --digestion fasta,params argument!" if !opts.key?(:digestion_objects)
      need_frequency.each do |val|
        val.set_frequency( opts[:digestion_objects][0] )
      end
    end
    opts.delete(:digestion_objects)

    if (transmem_vals.size > 0)  # and interactive  ## we'd like to just run this for interactive
      # This is overkill if we are doing a single filtering job, but it
      # ensures that it works in all the ways I'm doing it.  Should
      # refactor eventually !!
      transmem_vals.each do |val|  ## but, prob uses it too!
        val.transmem_status_hash = val.create_transmem_status_hash(spec_id.peps)
      end
    end
    validators

  end

end
@@ -0,0 +1,94 @@
1
+ require 'validator'
2
+
3
# Validator that estimates precision from decoy hits.  Proteins are
# classified by matching their reference against +constraint+.
class Validator::Decoy < Validator
  include Precision::Calculator::Decoy

  # what each prot.reference is matched against (see partition)
  attr_accessor :constraint

  # true: a protein matching the constraint is a decoy
  # false: a protein matching the constraint is a normal protein
  attr_accessor :decoy_on_match
  # true: a peptide with at least one normal protein counts as normal
  # false: a peptide with at least one decoy protein counts as decoy
  attr_accessor :correct_wins

  attr_accessor :last_pep_was_decoy

  # running totals kept by increment_pephits_precision
  attr_accessor :increment_normal
  attr_accessor :increment_decoy
  attr_accessor :increment_total_submitted

  # the normal peptides from the last call to increment_pephits_precision
  attr_reader :normal_peps_just_submitted

  def initialize(constraint=nil, decoy_on_match = true, correct_wins = true)
    @decoy_on_match = decoy_on_match
    @correct_wins = correct_wins
    @constraint = constraint
  end

  # returns [normal, decoy]
  #
  #   decoy_on_match, correct_wins      normal = NOT all prots match
  #   decoy_on_match, !correct_wins     normal = no prot matches
  #   !decoy_on_match, correct_wins     normal = any prot matches
  #   !decoy_on_match, !correct_wins    normal = all prots match
  def partition(peps)
    if @decoy_on_match
      if @correct_wins
        peps.partition do |pep|
          !(pep.prots.all? {|prot| prot.reference.match(@constraint) })
        end
      else # fp wins
        peps.partition do |pep|
          !(pep.prots.any? {|prot| prot.reference.match(@constraint) })
        end
      end
    else
      if @correct_wins
        peps.partition do |pep|
          pep.prots.any? {|prot| prot.reference.match(@constraint) }
        end
      else
        peps.partition do |pep|
          pep.prots.all? {|prot| prot.reference.match(@constraint) }
        end
      end
    end
  end

  # resets the running totals used by increment_pephits_precision
  def initialize_increment
    @increment_normal = 0
    @increment_decoy = 0
    @increment_total_submitted = 0
    @increment_initialized = true
  end


  # does not deal in separate_peps right now!!
  # will take an array or single peptide
  #
  # Adds the peptide(s) to the running totals and returns
  # calc_precision(total normal so far, total decoy so far).
  def increment_pephits_precision(peps)
    # defined? avoids the uninitialized-ivar warning without having to
    # switch the global $VERBOSE off and back on around the check
    initialize_increment unless defined?(@increment_initialized) && @increment_initialized

    to_submit = peps.is_a?(SpecID::Pep) ? [peps] : peps

    @increment_total_submitted += to_submit.size
    (normal, decoy) = partition(to_submit)
    @normal_peps_just_submitted = normal
    @increment_normal += normal.size
    @increment_decoy += decoy.size
    calc_precision(@increment_normal, @increment_decoy)
  end

  # With separate_peps given, peps are taken to be the normal hits and
  # separate_peps the decoy hits (no partitioning is done).  Otherwise
  # peps is partitioned into normal and decoy first.
  def pephit_precision(peps, separate_peps=nil)
    if separate_peps
      calc_precision(peps.size, separate_peps.size)
    else
      (norm, decoy) = partition(peps)
      calc_precision(norm.size, decoy.size)
    end
  end

  # Returns a one line summary of the settings of this validator
  def to_param_string
    "decoy="+ ["{constraint=#{(constraint ? constraint.inspect : '')}", "decoy_on_match=#{@decoy_on_match}", "correct_wins=#{@correct_wins}}"].join(", ")
  end
end
93
+
94
+
@@ -0,0 +1,69 @@
1
+ require 'validator'
2
+ require 'fasta'
3
+ require 'spec_id/sequest/params'
4
+
5
# Base class for validators whose precision estimate relies on a
# false_to_total_ratio and a background.  Given an array of SpecID::Pep
# objects, pephit_precision returns the precision.  Subclasses supply
# partition(peps), which must return [tps, fps].
class Validator::DigestionBased < Validator
  DEFAULTS = {
    :false_to_total_ratio => 1.0,
    :background => 0.0,
  }

  # the number of tps
  attr_accessor :increment_tps
  # the number of fps
  attr_accessor :increment_fps

  # the total peptides submitted to the validator (regardless of tp, fp, or
  # nil)
  attr_accessor :increment_total_submitted

  # the ratio of false hits to total peptides in the fasta file
  attr_accessor :false_to_total_ratio

  # the false_to_total_ratio calculated (but not applied)
  attr_reader :calculated_background

  # For a sample with no false hits in it, (under defaults) this is the
  # fraction of peptides with the constraint over the total number of peptides
  # from which these hits are derived.
  attr_accessor :background


  # Partitions the peptides, corrects the two counts (see
  # calc_precision_prep) and hands them to calc_precision.
  # expects that classes define a partition method, and a @background
  def pephit_precision(peps)
    true_hits, false_hits = partition(peps)
    counts = calc_precision_prep(true_hits.size, false_hits.size)
    calc_precision(counts[0], counts[1])
  end

  # returns [num_tps, num_fps], both adjusted:
  #
  #   1. the fps are corrected for @background (adjust_fps_for_background)
  #   2. the total number of false hits is estimated from the fps found:
  #        FALSE/TOTAL  = FALSE(found)/TOTAL(found)
  #        TOTAL(found) = FALSE(found) / (FALSE/TOTAL)
  #   3. the tps are whatever remains of the peptides that passed the
  #      partition.  (The partition may drop peptides under certain
  #      options, so the count passing the partition is the total used.)
  #
  # NOTE(review): a @false_to_total_ratio of 0 is not guarded against.
  def calc_precision_prep(num_tps, num_fps)
    passing_total = num_tps + num_fps
    background_adjusted_fps = adjust_fps_for_background(num_tps, num_fps, @background)
    estimated_false = background_adjusted_fps / @false_to_total_ratio
    [passing_total.to_f - estimated_false, estimated_false]
  end

  # Sets false_to_total_ratio to fps / (tps + fps) for the given peptides.
  # returns self
  # assumes partition returns (tps, fps)
  # NOTE(review): an empty partition gives 0.0/0 (NaN).
  def set_false_to_total_ratio(peps)
    true_hits, false_hits = partition(peps)
    denominator = true_hits.size + false_hits.size
    @false_to_total_ratio = false_hits.size.to_f / denominator
    self
  end

end
68
+
69
+
@@ -0,0 +1,48 @@
1
+
2
# Validator that derives precision from the probabilities carried by the
# hits themselves: precision = 1 - SUM(1 - prob)/#hits
class Validator::Probability

  # the method sent to each object to get its probability
  attr_accessor :prob_method

  def initialize(prob_method=:probability)
    @prob_method = prob_method
  end

  # objs should respond_to prob_method (:probability by default)
  # Returns 1.0 for an empty collection.
  def precision(objs)
    return 1.0 if objs.size == 0

    # SUM(1-probX)/#objs
    sum_one_minus_prob = objs.inject(0.0) {|sum,obj| sum + (1.0 - obj.send(@prob_method)) }
    1.0 - (sum_one_minus_prob / objs.size)
  end


  # objs should respond_to prob_method (:probability by default)
  # These should be added from high probability(1.0) to low (0.0)
  #
  # Takes a single SpecID::Pep / SpecID::Prot or a collection, adds it to
  # the running totals and returns the precision of everything submitted
  # so far.  Returns 1.0 (like precision) while nothing has been
  # submitted, instead of the NaN that 0.0/0 would give.
  def increment_precision(objs)
    objs = [objs] if objs.is_a?(SpecID::Pep) || objs.is_a?(SpecID::Prot)

    @total_objs ||= 0
    @current_sum_one_minus_prob ||= 0.0

    @total_objs += objs.size
    objs.each do |obj|
      @current_sum_one_minus_prob += 1.0 - obj.send(@prob_method)
    end
    return 1.0 if @total_objs == 0

    1.0 - (@current_sum_one_minus_prob / @total_objs)
  end


  alias_method :pephit_precision, :precision
  alias_method :prothit_precision, :precision
  alias_method :increment_pephits_precision, :increment_precision
end