mspire 0.2.4 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (233) hide show
  1. data/INSTALL +1 -0
  2. data/README +25 -0
  3. data/Rakefile +129 -40
  4. data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
  5. data/bin/bioworks_to_pepxml.rb +1 -0
  6. data/bin/fasta_shaker.rb +1 -96
  7. data/bin/filter_and_validate.rb +5 -0
  8. data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
  9. data/bin/prob_validate.rb +6 -0
  10. data/bin/raw_to_mzXML.rb +2 -2
  11. data/bin/srf_group.rb +1 -0
  12. data/bin/srf_to_sqt.rb +40 -0
  13. data/changelog.txt +68 -0
  14. data/lib/align/chams.rb +6 -6
  15. data/lib/align.rb +4 -3
  16. data/lib/bsearch.rb +120 -0
  17. data/lib/fasta.rb +318 -86
  18. data/lib/group_by.rb +10 -0
  19. data/lib/index_by.rb +11 -0
  20. data/lib/merge_deep.rb +21 -0
  21. data/lib/{spec → ms/converter}/mzxml.rb +77 -109
  22. data/lib/ms/gradient_program.rb +171 -0
  23. data/lib/ms/msrun.rb +209 -0
  24. data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
  25. data/lib/ms/parser/mzdata/axml.rb +12 -0
  26. data/lib/ms/parser/mzdata/dom.rb +160 -0
  27. data/lib/ms/parser/mzdata/libxml.rb +7 -0
  28. data/lib/ms/parser/mzdata.rb +25 -0
  29. data/lib/ms/parser/mzxml/axml.rb +11 -0
  30. data/lib/ms/parser/mzxml/dom.rb +159 -0
  31. data/lib/ms/parser/mzxml/hpricot.rb +253 -0
  32. data/lib/ms/parser/mzxml/libxml.rb +15 -0
  33. data/lib/ms/parser/mzxml/regexp.rb +122 -0
  34. data/lib/ms/parser/mzxml/rexml.rb +72 -0
  35. data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
  36. data/lib/ms/parser/mzxml.rb +175 -0
  37. data/lib/ms/parser.rb +108 -0
  38. data/lib/ms/precursor.rb +10 -0
  39. data/lib/ms/scan.rb +81 -0
  40. data/lib/ms/spectrum.rb +193 -0
  41. data/lib/ms.rb +10 -0
  42. data/lib/mspire.rb +4 -0
  43. data/lib/roc.rb +61 -1
  44. data/lib/sample_enzyme.rb +31 -8
  45. data/lib/scan_i.rb +21 -0
  46. data/lib/spec_id/aa_freqs.rb +7 -3
  47. data/lib/spec_id/bioworks.rb +20 -14
  48. data/lib/spec_id/digestor.rb +139 -0
  49. data/lib/spec_id/mass.rb +116 -0
  50. data/lib/spec_id/parser/proph.rb +236 -0
  51. data/lib/spec_id/precision/filter/cmdline.rb +209 -0
  52. data/lib/spec_id/precision/filter/interactive.rb +134 -0
  53. data/lib/spec_id/precision/filter/output.rb +147 -0
  54. data/lib/spec_id/precision/filter.rb +623 -0
  55. data/lib/spec_id/precision/output.rb +60 -0
  56. data/lib/spec_id/precision/prob/cmdline.rb +139 -0
  57. data/lib/spec_id/precision/prob/output.rb +88 -0
  58. data/lib/spec_id/precision/prob.rb +171 -0
  59. data/lib/spec_id/proph/pep_summary.rb +92 -0
  60. data/lib/spec_id/proph/prot_summary.rb +484 -0
  61. data/lib/spec_id/proph.rb +2 -466
  62. data/lib/spec_id/protein_summary.rb +2 -2
  63. data/lib/spec_id/sequest/params.rb +316 -0
  64. data/lib/spec_id/sequest/pepxml.rb +1513 -0
  65. data/lib/spec_id/sequest.rb +2 -1672
  66. data/lib/spec_id/srf.rb +445 -177
  67. data/lib/spec_id.rb +183 -95
  68. data/lib/spec_id_xml.rb +8 -10
  69. data/lib/transmem/phobius.rb +147 -0
  70. data/lib/transmem/toppred.rb +368 -0
  71. data/lib/transmem.rb +157 -0
  72. data/lib/validator/aa.rb +135 -0
  73. data/lib/validator/background.rb +73 -0
  74. data/lib/validator/bias.rb +95 -0
  75. data/lib/validator/cmdline.rb +260 -0
  76. data/lib/validator/decoy.rb +94 -0
  77. data/lib/validator/digestion_based.rb +69 -0
  78. data/lib/validator/probability.rb +48 -0
  79. data/lib/validator/prot_from_pep.rb +234 -0
  80. data/lib/validator/transmem.rb +272 -0
  81. data/lib/validator/true_pos.rb +46 -0
  82. data/lib/validator.rb +214 -0
  83. data/lib/xml.rb +38 -0
  84. data/lib/xml_style_parser.rb +105 -0
  85. data/lib/xmlparser_wrapper.rb +19 -0
  86. data/script/compile_and_plot_smriti_final.rb +97 -0
  87. data/script/extract_gradient_programs.rb +56 -0
  88. data/script/get_apex_values_rexml.rb +44 -0
  89. data/script/mzXML2timeIndex.rb +1 -1
  90. data/script/smriti_final_analysis.rb +103 -0
  91. data/script/toppred_to_yaml.rb +47 -0
  92. data/script/tpp_installer.rb +1 -1
  93. data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
  94. data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
  95. data/specs/bin/fasta_shaker_spec.rb +259 -0
  96. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
  97. data/specs/bin/filter_and_validate_spec.rb +124 -0
  98. data/specs/bin/ms_to_lmat_spec.rb +34 -0
  99. data/specs/bin/prob_validate_spec.rb +62 -0
  100. data/specs/bin/protein_summary_spec.rb +10 -0
  101. data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
  102. data/specs/gi_spec.rb +22 -0
  103. data/specs/load_bin_path.rb +7 -0
  104. data/specs/merge_deep_spec.rb +13 -0
  105. data/specs/ms/gradient_program_spec.rb +77 -0
  106. data/specs/ms/msrun_spec.rb +455 -0
  107. data/specs/ms/parser_spec.rb +92 -0
  108. data/specs/ms/spectrum_spec.rb +89 -0
  109. data/specs/roc_spec.rb +251 -0
  110. data/specs/rspec_autotest.rb +149 -0
  111. data/specs/sample_enzyme_spec.rb +41 -0
  112. data/specs/spec_helper.rb +133 -0
  113. data/specs/spec_id/aa_freqs_spec.rb +52 -0
  114. data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
  115. data/specs/spec_id/digestor_spec.rb +75 -0
  116. data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
  117. data/specs/spec_id/precision/filter/output_spec.rb +31 -0
  118. data/specs/spec_id/precision/filter_spec.rb +243 -0
  119. data/specs/spec_id/precision/prob_spec.rb +111 -0
  120. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  121. data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
  122. data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
  123. data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
  124. data/specs/spec_id/sequest/params_spec.rb +68 -0
  125. data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
  126. data/specs/spec_id/sqt_spec.rb +138 -0
  127. data/specs/spec_id/srf_spec.rb +209 -0
  128. data/specs/spec_id/srf_spec_helper.rb +302 -0
  129. data/specs/spec_id_helper.rb +33 -0
  130. data/specs/spec_id_spec.rb +361 -0
  131. data/specs/spec_id_xml_spec.rb +33 -0
  132. data/specs/transmem/phobius_spec.rb +423 -0
  133. data/specs/transmem/toppred_spec.rb +297 -0
  134. data/specs/transmem_spec.rb +60 -0
  135. data/specs/transmem_spec_shared.rb +64 -0
  136. data/specs/validator/aa_spec.rb +107 -0
  137. data/specs/validator/background_spec.rb +51 -0
  138. data/specs/validator/bias_spec.rb +146 -0
  139. data/specs/validator/decoy_spec.rb +51 -0
  140. data/specs/validator/fasta_helper.rb +26 -0
  141. data/specs/validator/prot_from_pep_spec.rb +141 -0
  142. data/specs/validator/transmem_spec.rb +145 -0
  143. data/specs/validator/true_pos_spec.rb +58 -0
  144. data/specs/validator_helper.rb +33 -0
  145. data/specs/xml_spec.rb +12 -0
  146. data/test_files/000_pepxml18_small.xml +206 -0
  147. data/test_files/020a.mzXML.timeIndex +4710 -0
  148. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
  149. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
  150. data/test_files/4-03-03_small-prot.xml +321 -0
  151. data/test_files/4-03-03_small.xml +3876 -0
  152. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  153. data/test_files/bioworks-3.3_10prots.xml +5999 -0
  154. data/test_files/bioworks31.params +77 -0
  155. data/test_files/bioworks32.params +62 -0
  156. data/test_files/bioworks33.params +63 -0
  157. data/test_files/bioworks_single_run_small.xml +7237 -0
  158. data/test_files/bioworks_small.fasta +212 -0
  159. data/test_files/bioworks_small.params +63 -0
  160. data/test_files/bioworks_small.phobius +109 -0
  161. data/test_files/bioworks_small.toppred.out +2847 -0
  162. data/test_files/bioworks_small.xml +5610 -0
  163. data/test_files/bioworks_with_INV_small.xml +3753 -0
  164. data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
  165. data/test_files/corrupted_900.srf +0 -0
  166. data/test_files/head_of_7MIX.srf +0 -0
  167. data/test_files/interact-opd1_mods_small-prot.xml +304 -0
  168. data/test_files/messups.fasta +297 -0
  169. data/test_files/opd1/000.my_answer.100lines.xml +101 -0
  170. data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
  171. data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
  172. data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
  173. data/test_files/opd1/000_020-prot.png +0 -0
  174. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
  175. data/test_files/opd1/000_020_3prots-prot.xml +62 -0
  176. data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
  177. data/test_files/opd1/sequest.3.1.params +77 -0
  178. data/test_files/opd1/sequest.3.2.params +62 -0
  179. data/test_files/opd1/twenty_scans.mzXML +418 -0
  180. data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
  181. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  182. data/test_files/opd1/twenty_scans_answ.lmata +9 -0
  183. data/test_files/opd1_020_beginning.RAW +0 -0
  184. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
  185. data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
  186. data/test_files/pepproph_small.xml +4691 -0
  187. data/test_files/phobius.small.noheader.txt +50 -0
  188. data/test_files/phobius.small.small.txt +53 -0
  189. data/test_files/s01_anC1_ld020mM.key.txt +25 -0
  190. data/test_files/s01_anC1_ld020mM.meth +0 -0
  191. data/test_files/small.fasta +297 -0
  192. data/test_files/smallraw.RAW +0 -0
  193. data/test_files/tf_bioworks2excel.bioXML +14340 -0
  194. data/test_files/tf_bioworks2excel.txt.actual +1035 -0
  195. data/test_files/toppred.small.out +416 -0
  196. data/test_files/toppred.xml.out +318 -0
  197. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
  198. data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
  199. data/test_files/yeast_gly_small-prot.xml +265 -0
  200. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
  201. data/test_files/yeast_gly_small.xml +3807 -0
  202. data/test_files/yeast_gly_small2.parentTimes +6 -0
  203. metadata +273 -57
  204. data/bin/filter.rb +0 -6
  205. data/bin/precision.rb +0 -5
  206. data/lib/spec/mzdata/parser.rb +0 -108
  207. data/lib/spec/mzdata.rb +0 -48
  208. data/lib/spec/mzxml/parser.rb +0 -449
  209. data/lib/spec/scan.rb +0 -55
  210. data/lib/spec_id/filter.rb +0 -797
  211. data/lib/spec_id/precision.rb +0 -421
  212. data/lib/toppred.rb +0 -18
  213. data/script/filter-peps.rb +0 -164
  214. data/test/tc_aa_freqs.rb +0 -59
  215. data/test/tc_fasta_shaker.rb +0 -149
  216. data/test/tc_filter.rb +0 -203
  217. data/test/tc_filter_peps.rb +0 -46
  218. data/test/tc_gi.rb +0 -17
  219. data/test/tc_id_class_anal.rb +0 -70
  220. data/test/tc_id_precision.rb +0 -89
  221. data/test/tc_msrun.rb +0 -88
  222. data/test/tc_mzxml.rb +0 -88
  223. data/test/tc_mzxml_to_lmat.rb +0 -36
  224. data/test/tc_peptide_parent_times.rb +0 -27
  225. data/test/tc_precision.rb +0 -60
  226. data/test/tc_roc.rb +0 -166
  227. data/test/tc_sample_enzyme.rb +0 -32
  228. data/test/tc_scan.rb +0 -26
  229. data/test/tc_sequest.rb +0 -336
  230. data/test/tc_spec.rb +0 -78
  231. data/test/tc_spec_id.rb +0 -201
  232. data/test/tc_spec_id_xml.rb +0 -36
  233. data/test/tc_srf.rb +0 -262
@@ -0,0 +1,73 @@
1
+ require 'validator'
2
+ require 'vec'
3
+ require 'enumerator'
4
+
5
class Validator ; end

# Estimates a background (baseline) value from the series of numbers held in
# +data+.  Every estimator copies the data into a VecD, replaces NaN entries
# with 0, and returns a minimum taken from a region of the raw or smoothed
# series.
class Validator::Background

  # the series of values the background is estimated from
  attr_accessor :data

  def initialize(data=nil)
    @data = data
  end

  # replaces every NaN entry of vec with 0 (vec is modified in place)
  def delete_nan!(vec)
    vec.each_with_index do |v,i|
      vec[i] = 0 if v.nan?
    end
  end

  # Over a 9 point window computes (stdev_factor * sample_stats[1]) + spread,
  # smooths that with a 9 point average, takes its 'chim' (presumably the
  # derivative -- confirm against VecD) and locates the last index where that
  # is exactly 0.  Returns the minimum of the (NaN-cleaned) data in the window
  # [index - min_window_pre, index + min_window_post].
  #
  # NOTE: stdev_points is accepted but not used; both windows are fixed at 9
  # points.  It is left that way because using it would change the default
  # window size (15 vs 9).
  def stdev_plus_spread(stdev_factor=2.0, stdev_points=15, min_window_pre=5, min_window_post=5)
    data_vec = VecD[*@data]
    delete_nan!(data_vec)
    stdev_transform = data_vec.transform(9) {|vec| (stdev_factor * vec.sample_stats[1]) + vec.spread }
    smoothed_stdev = stdev_transform.transform(9) {|vec| vec.avg }
    smoothed_stdev_derivs = smoothed_stdev.chim
    last_0_index = index_of_last_0(smoothed_stdev_derivs)
    min_in_window(data_vec, last_0_index, min_window_pre, min_window_post)
  end

  # not really working right currently
  #
  # Takes the absolute value of the 'chim' of the data, averages it over
  # avg_points, and uses the last index where the 'chim' of that average is
  # exactly 0 as the center of the window handed to min_in_window.
  def derivs(avg_points=15, min_window_pre=5, min_window_post=5)
    data_vec = VecD[*@data]
    delete_nan!(data_vec)
    drvs = data_vec.chim
    # absolute value
    drvs.each_with_index {|x,i| drvs[i] = x.abs }
    mv_avg = drvs.transform(avg_points) {|v| v.avg }
    last_0_index = index_of_last_0(mv_avg.chim)
    min_in_window(data_vec, last_0_index, min_window_pre, min_window_post)
  end

  # returns the index of the last entry of vec that == 0, or nil if no entry
  # does
  def index_of_last_0(vec)
    last_0_index = nil
    vec.each_with_index do |v,i|
      last_0_index = i if v == 0
    end
    last_0_index
  end

  # returns the minimum value of vec between (index - pre) and (index + post),
  # inclusive; the window is clipped to the bounds of vec
  def min_in_window(vec, index, pre, post)
    last_index = vec.size - 1
    start = index - pre
    stop = index + post
    start = 0 if start < 0
    stop = last_index if stop > last_index
    vec[start..stop].min
  end

  # very simple, should work
  #
  # Smooths the (NaN-cleaned) data with a moving average of +points+ points
  # and returns the minimum of the smoothed values in start..stop.
  def min_mesa(start, stop, points=3)
    data_vec = VecD[*@data]
    delete_nan!(data_vec)
    # the window size used to be hard-coded to 3, silently ignoring +points+
    smoothed = data_vec.transform(points) {|v| v.avg }
    smoothed[start..stop].min
  end

end
73
+
@@ -0,0 +1,95 @@
1
+ require 'validator'
2
+ require 'validator/digestion_based'
3
+
4
# Validator for any generic kind of bias: e.g., a list of high abundance
# proteins we would expect to see, a list of low abundance proteins we would
# not expect to see, or proteins that have been filtered out in some way.
class Validator::Bias < Validator::DigestionBased
  include Precision::Calculator

  # the fasta object handed to the constructor (by default holding proteins
  # expected to be in the sample; see proteins_expected)
  attr_reader :fasta

  # when true, a pep hit is valid if just one of its proteins matches the
  # fasta object; otherwise every protein must match (the logic is negated
  # when proteins_expected is false)
  attr_accessor :correct_wins

  # true  => the fasta proteins are expected to be in the sample
  # false => the fasta proteins are NOT expected to be in the sample
  attr_accessor :proteins_expected

  # hash keyed on the first_entry of each protein in the fasta object, every
  # value being true
  attr_accessor :short_reference_hash

  DEFAULTS = Validator::DigestionBased::DEFAULTS.merge( {
    :proteins_expected => true,
    :correct_wins => true,
  } )

  # options:
  #     (t = true, f = false, '*'= default)
  #     :proteins_expected => *t/f  we expect to see the fasta proteins in our hit list
  #     :correct_wins => *t/f  a single peptide hit from one of these proteins
  #                            constitutes a true positive
  #     :background => Float (*0.0-1.0)
  #     :false_to_total_ratio => Float (default inherited from DigestionBased::DEFAULTS)
  def initialize(fasta_object, options={})
    settings = DEFAULTS.merge(options)
    @proteins_expected = settings[:proteins_expected]
    @correct_wins = settings[:correct_wins]
    @background = settings[:background]
    @false_to_total_ratio = settings[:false_to_total_ratio]
    @fasta = fasta_object
    # NOTE: despite the name this is an array of references; nothing in this
    # class reads it
    @header_split_hash = @fasta.prots.map {|prot| prot.reference }
    @short_reference_hash = self.class.make_short_reference_hash(fasta_object)
  end

  # returns a hash mapping the first_entry of every protein yielded by
  # fasta_object.each to true
  def self.make_short_reference_hash(fasta_object)
    reference_flags = {}
    fasta_object.each {|prot| reference_flags[prot.first_entry] = true }
    reference_flags
  end

  # returns [tps, fps]
  def partition(peps)
    # correct_wins is logically negated when the proteins are not expected
    match_on_known = @proteins_expected ? @correct_wins : !@correct_wins

    picked, rest = peps.partition do |pep|
      pep.prots.any? do |pepprot|
        known = @short_reference_hash.key?( pepprot.first_entry )
        match_on_known ? known : !known
      end
    end

    # without correct_wins the selected peps are the false ones
    @correct_wins ? [picked, rest] : [rest, picked]
  end

  # pephit_precision is done through inheritance

  def to_param_string
    fields = [
      "fasta=#{@fasta.filename}",
      "proteins_expected=#{@proteins_expected}",
      "correct_wins=#{@correct_wins}",
      "background=#{@background}",
    ]
    "abundance={" + fields.join(", ") + "}"
  end

end
@@ -0,0 +1,260 @@
1
+ require 'validator'
2
+
3
# Command line support for validators: defaults, option descriptions, the
# lambdas that turn parsed option arguments into validator specs (stored in
# opts[:validators]), and prepare_validators, which turns those specs into
# validator objects.
class Validator::Cmdline

  # maps the symbol used in a validator spec to the class instantiated for it
  Validator_symbols_to_classes = {
    :tmm => Validator::Transmem::Protein,
    :decoy => Validator::Decoy,
    :bad_aa => Validator::AA,
    :tps => Validator::TruePos,
    :bias => Validator::Bias,
    :prob => Validator::Probability,
  }
  # was VAL_DEFAULTS
  DEFAULTS = {
    :tmm =>
    {
      # file
      :min_num_tmm_seqs => 1,
      :expect_soluble => true,
      :no_include_tm_peps => 0.8,
      :bkg => 0.0,
    },
    :decoy =>
    {
      :hits_together => true,
      :decoy_on_match => true,
    },
    :bad_aa =>
    {
      :false_if_found => true,
      :estimate => true,
      :bkg => 0.0,
    },
    :bias =>
    {
      :bkg => 0.0,
      :proteins_expected => true,
    },
    :ties => true,
  }
  # option switch, (optional) argument class and description lines for each
  # validator related command line option
  COMMAND_LINE = {
    :decoy => ["--decoy /REGEXP/|FILENAME[DOM]", Array, "REGEXP for decoy proteins (catenated searches) or a",
      "FILENAME of separate search on decoys.",
      "All regular expressions must be surrounded by '/'",
      "(no extended options [trailing modifiers]).",
      "e.g., a run using concatenated reversed proteins that",
      "includes 'REVERSE' in the fasta heading:",
      " --decoy /REVERSE/",
      "Anything fancier should be quoted:",
      " --decoy '/^\\s*REVERSE/'",
      "If decoys proteins were searched in a separate file,",
      "then give the FILENAME (e.g., --decoy decoy.srg)",
      "DOM = *true/false, decoy on match",],
    :tps => ["--tps <fasta>", "for a completely defined sample, this is the",
      "fasta file containing the true protein hits"],
    # may require digestion:
    :digestion => ["--digestion ORIG_FASTA,PARAMS", Array, "The following validators require additional",
      "information (that is shared between them).",
      "ORIG_FASTA = the fasta file used to do the run",
      "PARAMS = the params file used to do the run",],
    :bias => ["--bias FASTA[,PE,BKG]", Array, "FASTA contains proteins expected to be in the sample",
      "PE = *true|false proteins in fasta file expected in sample",
      "BKG = Background frequency of fps (d: #{DEFAULTS[:bias][:bkg]})",],
    :bad_aa => ["--bad_aa AA,[EST,BKG]", Array, "An amino acid expected (or not expected) in legitimate hits",
      "AA = The amino acid (e.g., 'C')",
      "EST = true|false (def: #{DEFAULTS[:bad_aa][:estimate]})",
      "BKG = Background frequency of genuine pephits (d: #{DEFAULTS[:bad_aa][:bkg]}):",],

    :tmm => ["--tmm <TM[,MIN,SOL,PEPS,BKG]>", Array, "TM = phobius.small or toppred.out file",
      "phobius.small:",
      "http://phobius.cgb.ki.se/",
      "(select 'Short' output, and save output as file)",
      "toppred.out:",
      "http://bioweb.pasteur.fr/seqanal/interfaces/toppred.html",
      "(output 'toppred.out' in 'New' or 'Xml' format)",
      "MIN = Int, minimum number transmembrane seqs (def: #{DEFAULTS[:tmm][:min_num_tmm_seqs]})",
      "SOL = true|false, this is a soluble fraction( def: #{DEFAULTS[:tmm][:expect_soluble]})",
      "PEPS = Float | false, don't consider tm peps (>= fraction",
      " tm content) (false skips) (def: #{DEFAULTS[:tmm][:no_include_tm_peps]})",
      "BKG = Float , background contaminating insoluble (def: #{DEFAULTS[:tmm][:bkg]})"],


    # VALIDATION MODIFIERS
    # (the flag sets opts[:ties] = false, i.e. a tie is a FALSE hit; the
    # description used to claim the opposite)
    :false_on_tie => ["--false_on_tie", "if peptide belongs to correct AND incorrect proteins",
      "it will be counted as incorrect"],

  }

  # returns true for 'true', false for 'false' and default for anything else
  # (including nil)
  def self.boolean(arg, default)
    case arg
    when 'true' ; true
    when 'false' ; false
    else ; default
    end
  end

  # One lambda per command line option.  Each takes the parsed option value
  # and the opts hash; the validator lambdas push a spec of the form
  # [symbol, *constructor_args] onto opts[:validators].
  PrepArgs = {
    :prob => lambda {|ar, opts|
      mthd =
        if ar
          if ar == 'nsp'
            :probability
          elsif ar == 'init'
            :initial_probability
          else
            raise ArgumentError, "--prob [arg], optional arg can only be 'nsp' or 'init'!"
          end
        else
          :probability
        end
      opts[:validators].push([:prob, mthd])
    },
    :decoy => lambda {|ar, opts|
      myargs = [:decoy]
      first_arg = ar[0]
      myargs[1] =
        # an argument wrapped in '/' is a regular expression, else a filename
        if first_arg[0,1] == '/' and first_arg[-1,1] == '/'
          Regexp.new(first_arg[1...-1])
        else
          first_arg
        end
      myargs[2] = self.boolean(ar[1], DEFAULTS[:decoy][:decoy_on_match])
      opts[:validators].push(myargs)
    },
    :digestion => lambda {|ar, opts|
      raise(ArgumentError, "need fasta and sequest params!") if ar.size != 2
      opts[:digestion] = ar.dup
      opts[:digestion_objects] = [Fasta.new(ar[0]), Sequest::Params.new(ar[1])]
    },
    :bias => lambda {|ar, opts|
      myargs = [:bias]
      myargs.push( Fasta.new(ar[0]) )
      val_opts = {}
      val_opts[:proteins_expected] = self.boolean(ar[1], DEFAULTS[:bias][:proteins_expected])
      val_opts[:background] =
        if ar[2]
          ar[2].to_f
        else
          DEFAULTS[:bias][:bkg]
        end
      myargs.push(val_opts)
      opts[:validators].push(myargs)
    },
    :bad_aa => lambda {|ar, opts|
      ## GET the FREQUENCY
      myargs = [:bad_aa]
      myargs.push( ar[0] )
      val_opts = {}
      # the default lives under :estimate (there is no :est key, which made
      # the default silently nil)
      val_opts[:estimate] = self.boolean(ar[1], DEFAULTS[:bad_aa][:estimate])
      val_opts[:background] =
        if ar[2]
          ar[2].to_f
        else
          DEFAULTS[:bad_aa][:bkg]
        end
      myargs.push(val_opts)
      opts[:validators].push(myargs)
    },
    :tmm => lambda {|ar, opts|
      myargs = [:tmm]
      myargs.push( ar[0] )
      val_opts = {}
      val_opts[:min_num_tms] =
        if ar[1] ; ar[1].to_i
        else ; DEFAULTS[:tmm][:min_num_tmm_seqs]
        end
      val_opts[:soluble_fraction] = self.boolean(ar[2], DEFAULTS[:tmm][:expect_soluble])
      val_opts[:no_include_tm_peps] =
        if ar[3]
          case ar[3]
          when 'false' ; false
          else ; ar[3].to_f
          end
        else ; DEFAULTS[:tmm][:no_include_tm_peps]
        end
      val_opts[:background] =
        if ar[4] ; ar[4].to_f
        else ; DEFAULTS[:tmm][:bkg]
        end
      myargs.push(val_opts)
      opts[:validators].push( myargs )
    },
    :tps => lambda {|v,opts| opts[:validators].push([:tps, Fasta.new(v)]) },
    :false_on_tie => lambda {|v,opts| opts[:ties] = false },
  }

  # Builds a validator object from each spec in opts[:validators] and returns
  # them as an array.
  #
  #   opts         : hash holding :validators (specs pushed by PrepArgs) and,
  #                  when --digestion was given, :digestion_objects
  #   false_on_tie : when true the validators are built with correct_wins false
  #   interactive  : currently unused
  #   spec_id      : object whose peps are used to build the transmem status
  #                  hash of any :tmm validator
  #
  # Side effects: the leading symbol is shifted off each spec in
  # opts[:validators], option hashes inside the specs gain :correct_wins (and
  # :fasta for :tmm), and opts[:digestion_objects] is deleted.
  #
  # Raises ArgumentError if a validator needing digestion information was
  # requested without --digestion.
  def self.prepare_validators(opts, false_on_tie, interactive, spec_id)
    validator_args = opts[:validators]
    correct_wins = !false_on_tie
    need_false_to_total_ratio = []
    need_frequency = []
    transmem_vals = []
    potential_digestion_classes = /Transmem|AA|Bias/
    validators = validator_args.map do |args|
      tp = args.shift
      val_args = args.dup # protect the original keys
      val_args =
        case tp
        when :tmm
          # fail with a helpful message rather than NoMethodError on nil
          raise ArgumentError, "requires --digestion fasta,params argument!" if !opts.key?(:digestion_objects)
          val_args[1][:correct_wins] = correct_wins
          val_args[1][:fasta] = opts[:digestion_objects][0]
          val_args
        when :bias
          val_args[1][:correct_wins] = correct_wins
          val_args
        when :tps
          [val_args[0], correct_wins]
        when :decoy
          # don't delete the key here since we need the decoy = regexp key
          [val_args[0], val_args[1], correct_wins]
        else ## bad_aa and prob are represented here:
          val_args
        end
      val = Validator_symbols_to_classes[tp].new( *val_args )
      # make some lists of validators based on pre-processing needs:
      if tp == :tmm
        transmem_vals << val
      end
      if val.class.to_s =~ potential_digestion_classes
        if val_args[1][:estimate] == true
          need_frequency << val
        else
          need_false_to_total_ratio << val
        end
      end
      val
    end

    if need_false_to_total_ratio.size > 0
      raise ArgumentError, "requires --digestion fasta,params argument!" if !opts.key?(:digestion_objects)
      peps = Digestor.digest( *(opts[:digestion_objects]) )
      need_false_to_total_ratio.each do |val|
        val.set_false_to_total_ratio( peps )
      end
    end
    if need_frequency.size > 0
      raise ArgumentError, "requires --digestion fasta,params argument!" if !opts.key?(:digestion_objects)
      need_frequency.each do |val|
        val.set_frequency( opts[:digestion_objects][0] )
      end
    end
    opts.delete(:digestion_objects)

    if (transmem_vals.size > 0) # and interactive ## we'd like to just run this for interactive
      # This is overkill if we are doing a single filtering job, but it
      # ensures that it works in all the ways I'm doing it. Should
      # refactor eventually !!
      transmem_vals.each do |val| ## but, prob uses it too!
        val.transmem_status_hash = val.create_transmem_status_hash(spec_id.peps)
      end
    end
    validators

  end

end
@@ -0,0 +1,94 @@
1
+ require 'validator'
2
+
3
# Validator that estimates precision from decoy hits.  A protein is matched
# against +constraint+ through its reference; depending on decoy_on_match a
# match marks the protein as a decoy or as a normal protein.
class Validator::Decoy < Validator
  include Precision::Calculator::Decoy

  # what each protein reference is matched against (see partition)
  attr_accessor :constraint

  # true: proteins matching the constraint are decoys
  # false: proteins matching the constraint are normal
  attr_accessor :decoy_on_match
  # how a pep with both matching and non-matching proteins is classified
  # (true: it is counted as normal; false: as decoy)
  attr_accessor :correct_wins

  attr_accessor :last_pep_was_decoy

  # running totals kept by increment_pephits_precision
  attr_accessor :increment_normal
  attr_accessor :increment_decoy
  attr_accessor :increment_total_submitted

  # the normal peps found in the last call to increment_pephits_precision
  attr_reader :normal_peps_just_submitted

  def initialize(constraint=nil, decoy_on_match = true, correct_wins = true)
    @decoy_on_match = decoy_on_match
    @correct_wins = correct_wins
    @constraint = constraint
  end

  # returns [normal, decoy]
  #
  #   decoy_on_match, correct_wins   : decoy only if ALL prots match
  #   decoy_on_match, !correct_wins  : decoy if ANY prot matches
  #   !decoy_on_match, correct_wins  : normal if ANY prot matches
  #   !decoy_on_match, !correct_wins : normal only if ALL prots match
  def partition(peps)
    if @decoy_on_match
      if @correct_wins
        peps.partition do |pep|
          !(pep.prots.all? {|prot| prot.reference.match(@constraint) })
        end
      else # fp wins
        peps.partition do |pep|
          !(pep.prots.any? {|prot| prot.reference.match(@constraint) })
        end
      end
    else
      if @correct_wins
        peps.partition do |pep|
          pep.prots.any? {|prot| prot.reference.match(@constraint) }
        end
      else
        peps.partition do |pep|
          pep.prots.all? {|prot| prot.reference.match(@constraint) }
        end
      end
    end
  end

  # zeroes the running totals used by increment_pephits_precision
  def initialize_increment
    @increment_normal = 0
    @increment_decoy = 0
    @increment_total_submitted = 0
    @increment_initialized = true
  end


  # does not deal in separate_peps right now!!
  # will take an array or single peptide
  #
  # Adds the given pep(s) to the running totals and returns
  # calc_precision(total normal, total decoy).
  def increment_pephits_precision(peps)
    # defined? avoids the "instance variable not initialized" warning without
    # switching the global $VERBOSE off and on (which was not restored if
    # initialize_increment raised)
    initialize_increment unless defined?(@increment_initialized) && @increment_initialized

    to_submit =
      if peps.is_a? SpecID::Pep
        [peps]
      else
        peps
      end

    @increment_total_submitted += to_submit.size
    (normal, decoy) = partition(to_submit)
    @normal_peps_just_submitted = normal
    @increment_normal += normal.size
    @increment_decoy += decoy.size
    calc_precision(@increment_normal, @increment_decoy)
  end

  # With separate_peps, peps are taken as the normal hits and separate_peps
  # as the decoy hits; otherwise peps is partitioned into the two.
  def pephit_precision(peps, separate_peps=nil)
    if separate_peps
      calc_precision(peps.size, separate_peps.size)
    else
      (norm, decoy) = partition(peps)
      calc_precision(norm.size, decoy.size)
    end
  end

  def to_param_string
    "decoy="+ ["{constraint=#{(constraint ? constraint.inspect : '')}", "decoy_on_match=#{@decoy_on_match}", "correct_wins=#{@correct_wins}}"].join(", ")
  end
end
93
+
94
+
@@ -0,0 +1,69 @@
1
+ require 'validator'
2
+ require 'fasta'
3
+ require 'spec_id/sequest/params'
4
+
5
# Base class for validators whose precision estimate is scaled by a
# false-to-total ratio and corrected for a background.  Given an array of
# SpecID::Pep objects, pephit_precision returns the precision.
# Subclasses are expected to define a partition method returning [tps, fps].
class Validator::DigestionBased < Validator
  DEFAULTS = {
    :false_to_total_ratio => 1.0,
    :background => 0.0,
  }

  # the number of tps
  attr_accessor :increment_tps
  # the number of fps
  attr_accessor :increment_fps

  # the total peptides submitted to the validator (regardless of tp, fp, or
  # nil)
  attr_accessor :increment_total_submitted

  # the ratio of false hits to total peptides in the fasta file
  attr_accessor :false_to_total_ratio

  # the false_to_total_ratio calculated (but not applied)
  attr_reader :calculated_background

  # For a sample with no false hits in it, (under defaults) this is the
  # fraction of peptides with the constraint over the total number of peptides
  # from which these hits are derived.
  attr_accessor :background


  # Partitions peps into [tps, fps], corrects the two counts with
  # calc_precision_prep and hands them to calc_precision.
  # (expects that classes define a partition method, and a @background)
  def pephit_precision(peps)
    tps, fps = partition(peps)
    calc_precision( *calc_precision_prep(tps.size, fps.size) )
  end

  # returns [num_tps, num_fps] after adjusting for the background and the
  # false_to_total_ratio
  def calc_precision_prep(num_tps, num_fps)
    passing_total = num_tps + num_fps
    fps_less_background = adjust_fps_for_background(num_tps, num_fps, @background)
    # The false hits found are only a fraction (false_to_total_ratio) of the
    # hits that are really incorrect:
    #   FALSE/TOTAL  = FALSE(found)/TOTAL(found)
    #   TOTAL(found) = FALSE(found) / (FALSE/TOTAL)
    estimated_false = fps_less_background / @false_to_total_ratio
    # NOTE: partition may drop peptides under certain options, so the
    # estimate is taken against the number that actually passed partition.
    [passing_total.to_f - estimated_false, estimated_false]
  end

  # sets false_to_total_ratio to the fraction of peps that partition reports
  # as fps (assumes partition returns (tps, fps))
  # returns self
  def set_false_to_total_ratio(peps)
    tps, fps = partition(peps)
    num_false = fps.size
    @false_to_total_ratio = num_false.to_f / (tps.size + num_false)
    self
  end

end
68
+
69
+
@@ -0,0 +1,48 @@
1
+
2
# forward declaration so this file can be loaded on its own
class Validator ; end

# Estimates precision from the probabilities attached to hits: the precision
# of a set of hits is 1 - (SUM(1 - probability) / number of hits).
class Validator::Probability

  # the method sent to each object to get its probability
  attr_accessor :prob_method

  def initialize(prob_method=:probability)
    @prob_method = prob_method
  end

  # objs should respond_to the prob_method (:probability by default)
  # returns 1.0 for an empty collection
  def precision(objs)
    return 1.0 if objs.size == 0

    # SUM(1-probX)/#objs
    sum_one_minus_prob = objs.inject(0.0) {|sum,obj| sum + (1.0 - obj.send(@prob_method)) }
    1.0 - (sum_one_minus_prob / objs.size)
  end


  # objs should respond_to the prob_method (:probability by default)
  # These should be added from high probability(1.0) to low (0.0)
  #
  # Takes an array or a single SpecID::Pep / SpecID::Prot, adds it to the
  # running totals kept on this validator and returns the precision of
  # everything submitted so far (1.0 while nothing has been submitted, as
  # in precision, rather than NaN).
  def increment_precision(objs)
    if objs.is_a?(SpecID::Pep) or objs.is_a?(SpecID::Prot)
      objs = [objs]
    end

    @total_objs ||= 0
    @current_sum_one_minus_prob ||= 0.0

    @total_objs += objs.size
    objs.each do |obj|
      @current_sum_one_minus_prob += 1.0 - obj.send(@prob_method)
    end
    return 1.0 if @total_objs == 0
    1.0 - (@current_sum_one_minus_prob / @total_objs)
  end


  alias_method :pephit_precision, :precision
  alias_method :prothit_precision, :precision
  alias_method :increment_pephits_precision, :increment_precision
end