mspire 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233) hide show
  1. data/INSTALL +1 -0
  2. data/README +25 -0
  3. data/Rakefile +129 -40
  4. data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
  5. data/bin/bioworks_to_pepxml.rb +1 -0
  6. data/bin/fasta_shaker.rb +1 -96
  7. data/bin/filter_and_validate.rb +5 -0
  8. data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
  9. data/bin/prob_validate.rb +6 -0
  10. data/bin/raw_to_mzXML.rb +2 -2
  11. data/bin/srf_group.rb +1 -0
  12. data/bin/srf_to_sqt.rb +40 -0
  13. data/changelog.txt +68 -0
  14. data/lib/align/chams.rb +6 -6
  15. data/lib/align.rb +4 -3
  16. data/lib/bsearch.rb +120 -0
  17. data/lib/fasta.rb +318 -86
  18. data/lib/group_by.rb +10 -0
  19. data/lib/index_by.rb +11 -0
  20. data/lib/merge_deep.rb +21 -0
  21. data/lib/{spec → ms/converter}/mzxml.rb +77 -109
  22. data/lib/ms/gradient_program.rb +171 -0
  23. data/lib/ms/msrun.rb +209 -0
  24. data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
  25. data/lib/ms/parser/mzdata/axml.rb +12 -0
  26. data/lib/ms/parser/mzdata/dom.rb +160 -0
  27. data/lib/ms/parser/mzdata/libxml.rb +7 -0
  28. data/lib/ms/parser/mzdata.rb +25 -0
  29. data/lib/ms/parser/mzxml/axml.rb +11 -0
  30. data/lib/ms/parser/mzxml/dom.rb +159 -0
  31. data/lib/ms/parser/mzxml/hpricot.rb +253 -0
  32. data/lib/ms/parser/mzxml/libxml.rb +15 -0
  33. data/lib/ms/parser/mzxml/regexp.rb +122 -0
  34. data/lib/ms/parser/mzxml/rexml.rb +72 -0
  35. data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
  36. data/lib/ms/parser/mzxml.rb +175 -0
  37. data/lib/ms/parser.rb +108 -0
  38. data/lib/ms/precursor.rb +10 -0
  39. data/lib/ms/scan.rb +81 -0
  40. data/lib/ms/spectrum.rb +193 -0
  41. data/lib/ms.rb +10 -0
  42. data/lib/mspire.rb +4 -0
  43. data/lib/roc.rb +61 -1
  44. data/lib/sample_enzyme.rb +31 -8
  45. data/lib/scan_i.rb +21 -0
  46. data/lib/spec_id/aa_freqs.rb +7 -3
  47. data/lib/spec_id/bioworks.rb +20 -14
  48. data/lib/spec_id/digestor.rb +139 -0
  49. data/lib/spec_id/mass.rb +116 -0
  50. data/lib/spec_id/parser/proph.rb +236 -0
  51. data/lib/spec_id/precision/filter/cmdline.rb +209 -0
  52. data/lib/spec_id/precision/filter/interactive.rb +134 -0
  53. data/lib/spec_id/precision/filter/output.rb +147 -0
  54. data/lib/spec_id/precision/filter.rb +623 -0
  55. data/lib/spec_id/precision/output.rb +60 -0
  56. data/lib/spec_id/precision/prob/cmdline.rb +139 -0
  57. data/lib/spec_id/precision/prob/output.rb +88 -0
  58. data/lib/spec_id/precision/prob.rb +171 -0
  59. data/lib/spec_id/proph/pep_summary.rb +92 -0
  60. data/lib/spec_id/proph/prot_summary.rb +484 -0
  61. data/lib/spec_id/proph.rb +2 -466
  62. data/lib/spec_id/protein_summary.rb +2 -2
  63. data/lib/spec_id/sequest/params.rb +316 -0
  64. data/lib/spec_id/sequest/pepxml.rb +1513 -0
  65. data/lib/spec_id/sequest.rb +2 -1672
  66. data/lib/spec_id/srf.rb +445 -177
  67. data/lib/spec_id.rb +183 -95
  68. data/lib/spec_id_xml.rb +8 -10
  69. data/lib/transmem/phobius.rb +147 -0
  70. data/lib/transmem/toppred.rb +368 -0
  71. data/lib/transmem.rb +157 -0
  72. data/lib/validator/aa.rb +135 -0
  73. data/lib/validator/background.rb +73 -0
  74. data/lib/validator/bias.rb +95 -0
  75. data/lib/validator/cmdline.rb +260 -0
  76. data/lib/validator/decoy.rb +94 -0
  77. data/lib/validator/digestion_based.rb +69 -0
  78. data/lib/validator/probability.rb +48 -0
  79. data/lib/validator/prot_from_pep.rb +234 -0
  80. data/lib/validator/transmem.rb +272 -0
  81. data/lib/validator/true_pos.rb +46 -0
  82. data/lib/validator.rb +214 -0
  83. data/lib/xml.rb +38 -0
  84. data/lib/xml_style_parser.rb +105 -0
  85. data/lib/xmlparser_wrapper.rb +19 -0
  86. data/script/compile_and_plot_smriti_final.rb +97 -0
  87. data/script/extract_gradient_programs.rb +56 -0
  88. data/script/get_apex_values_rexml.rb +44 -0
  89. data/script/mzXML2timeIndex.rb +1 -1
  90. data/script/smriti_final_analysis.rb +103 -0
  91. data/script/toppred_to_yaml.rb +47 -0
  92. data/script/tpp_installer.rb +1 -1
  93. data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
  94. data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
  95. data/specs/bin/fasta_shaker_spec.rb +259 -0
  96. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
  97. data/specs/bin/filter_and_validate_spec.rb +124 -0
  98. data/specs/bin/ms_to_lmat_spec.rb +34 -0
  99. data/specs/bin/prob_validate_spec.rb +62 -0
  100. data/specs/bin/protein_summary_spec.rb +10 -0
  101. data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
  102. data/specs/gi_spec.rb +22 -0
  103. data/specs/load_bin_path.rb +7 -0
  104. data/specs/merge_deep_spec.rb +13 -0
  105. data/specs/ms/gradient_program_spec.rb +77 -0
  106. data/specs/ms/msrun_spec.rb +455 -0
  107. data/specs/ms/parser_spec.rb +92 -0
  108. data/specs/ms/spectrum_spec.rb +89 -0
  109. data/specs/roc_spec.rb +251 -0
  110. data/specs/rspec_autotest.rb +149 -0
  111. data/specs/sample_enzyme_spec.rb +41 -0
  112. data/specs/spec_helper.rb +133 -0
  113. data/specs/spec_id/aa_freqs_spec.rb +52 -0
  114. data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
  115. data/specs/spec_id/digestor_spec.rb +75 -0
  116. data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
  117. data/specs/spec_id/precision/filter/output_spec.rb +31 -0
  118. data/specs/spec_id/precision/filter_spec.rb +243 -0
  119. data/specs/spec_id/precision/prob_spec.rb +111 -0
  120. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  121. data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
  122. data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
  123. data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
  124. data/specs/spec_id/sequest/params_spec.rb +68 -0
  125. data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
  126. data/specs/spec_id/sqt_spec.rb +138 -0
  127. data/specs/spec_id/srf_spec.rb +209 -0
  128. data/specs/spec_id/srf_spec_helper.rb +302 -0
  129. data/specs/spec_id_helper.rb +33 -0
  130. data/specs/spec_id_spec.rb +361 -0
  131. data/specs/spec_id_xml_spec.rb +33 -0
  132. data/specs/transmem/phobius_spec.rb +423 -0
  133. data/specs/transmem/toppred_spec.rb +297 -0
  134. data/specs/transmem_spec.rb +60 -0
  135. data/specs/transmem_spec_shared.rb +64 -0
  136. data/specs/validator/aa_spec.rb +107 -0
  137. data/specs/validator/background_spec.rb +51 -0
  138. data/specs/validator/bias_spec.rb +146 -0
  139. data/specs/validator/decoy_spec.rb +51 -0
  140. data/specs/validator/fasta_helper.rb +26 -0
  141. data/specs/validator/prot_from_pep_spec.rb +141 -0
  142. data/specs/validator/transmem_spec.rb +145 -0
  143. data/specs/validator/true_pos_spec.rb +58 -0
  144. data/specs/validator_helper.rb +33 -0
  145. data/specs/xml_spec.rb +12 -0
  146. data/test_files/000_pepxml18_small.xml +206 -0
  147. data/test_files/020a.mzXML.timeIndex +4710 -0
  148. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
  149. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
  150. data/test_files/4-03-03_small-prot.xml +321 -0
  151. data/test_files/4-03-03_small.xml +3876 -0
  152. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  153. data/test_files/bioworks-3.3_10prots.xml +5999 -0
  154. data/test_files/bioworks31.params +77 -0
  155. data/test_files/bioworks32.params +62 -0
  156. data/test_files/bioworks33.params +63 -0
  157. data/test_files/bioworks_single_run_small.xml +7237 -0
  158. data/test_files/bioworks_small.fasta +212 -0
  159. data/test_files/bioworks_small.params +63 -0
  160. data/test_files/bioworks_small.phobius +109 -0
  161. data/test_files/bioworks_small.toppred.out +2847 -0
  162. data/test_files/bioworks_small.xml +5610 -0
  163. data/test_files/bioworks_with_INV_small.xml +3753 -0
  164. data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
  165. data/test_files/corrupted_900.srf +0 -0
  166. data/test_files/head_of_7MIX.srf +0 -0
  167. data/test_files/interact-opd1_mods_small-prot.xml +304 -0
  168. data/test_files/messups.fasta +297 -0
  169. data/test_files/opd1/000.my_answer.100lines.xml +101 -0
  170. data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
  171. data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
  172. data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
  173. data/test_files/opd1/000_020-prot.png +0 -0
  174. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
  175. data/test_files/opd1/000_020_3prots-prot.xml +62 -0
  176. data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
  177. data/test_files/opd1/sequest.3.1.params +77 -0
  178. data/test_files/opd1/sequest.3.2.params +62 -0
  179. data/test_files/opd1/twenty_scans.mzXML +418 -0
  180. data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
  181. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  182. data/test_files/opd1/twenty_scans_answ.lmata +9 -0
  183. data/test_files/opd1_020_beginning.RAW +0 -0
  184. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
  185. data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
  186. data/test_files/pepproph_small.xml +4691 -0
  187. data/test_files/phobius.small.noheader.txt +50 -0
  188. data/test_files/phobius.small.small.txt +53 -0
  189. data/test_files/s01_anC1_ld020mM.key.txt +25 -0
  190. data/test_files/s01_anC1_ld020mM.meth +0 -0
  191. data/test_files/small.fasta +297 -0
  192. data/test_files/smallraw.RAW +0 -0
  193. data/test_files/tf_bioworks2excel.bioXML +14340 -0
  194. data/test_files/tf_bioworks2excel.txt.actual +1035 -0
  195. data/test_files/toppred.small.out +416 -0
  196. data/test_files/toppred.xml.out +318 -0
  197. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
  198. data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
  199. data/test_files/yeast_gly_small-prot.xml +265 -0
  200. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
  201. data/test_files/yeast_gly_small.xml +3807 -0
  202. data/test_files/yeast_gly_small2.parentTimes +6 -0
  203. metadata +273 -57
  204. data/bin/filter.rb +0 -6
  205. data/bin/precision.rb +0 -5
  206. data/lib/spec/mzdata/parser.rb +0 -108
  207. data/lib/spec/mzdata.rb +0 -48
  208. data/lib/spec/mzxml/parser.rb +0 -449
  209. data/lib/spec/scan.rb +0 -55
  210. data/lib/spec_id/filter.rb +0 -797
  211. data/lib/spec_id/precision.rb +0 -421
  212. data/lib/toppred.rb +0 -18
  213. data/script/filter-peps.rb +0 -164
  214. data/test/tc_aa_freqs.rb +0 -59
  215. data/test/tc_fasta_shaker.rb +0 -149
  216. data/test/tc_filter.rb +0 -203
  217. data/test/tc_filter_peps.rb +0 -46
  218. data/test/tc_gi.rb +0 -17
  219. data/test/tc_id_class_anal.rb +0 -70
  220. data/test/tc_id_precision.rb +0 -89
  221. data/test/tc_msrun.rb +0 -88
  222. data/test/tc_mzxml.rb +0 -88
  223. data/test/tc_mzxml_to_lmat.rb +0 -36
  224. data/test/tc_peptide_parent_times.rb +0 -27
  225. data/test/tc_precision.rb +0 -60
  226. data/test/tc_roc.rb +0 -166
  227. data/test/tc_sample_enzyme.rb +0 -32
  228. data/test/tc_scan.rb +0 -26
  229. data/test/tc_sequest.rb +0 -336
  230. data/test/tc_spec.rb +0 -78
  231. data/test/tc_spec_id.rb +0 -201
  232. data/test/tc_spec_id_xml.rb +0 -36
  233. data/test/tc_srf.rb +0 -262
@@ -0,0 +1,234 @@
1
+ require 'validator'
2
+
3
+ require 'set'
4
+ require 'group_by'
5
+ require 'shuffle'
6
+
7
+ # calculates protein hit precision based on peptide precision
8
+ class Validator::ProtFromPep < Validator
9
+
10
# Estimates protein hit precision from the number of false peptide hits.
#
# Returns [worst, normal_mean, normal_stdev] where
#   worst                      comes from worstcase_prothit_precision
#   normal_mean, normal_stdev  come from normal_prothit_precision
#
# The worst figure assumes that the proteins with the fewest peptides are
# all false (before prots with more pephits).  Note that this approaches
# the worst, but is not guaranteed to be worst unless each pephit maps to a
# single protein hit.
#
# options
#   :num_its_normal    => Integer, # num iterations for normal (d: 10)
#   :num_its_worstcase => Integer, # num iterations for worstcase (d: 10)
#
# The opts hash passed in is only read, never modified.
# NOTE: with :num_its_normal => 1, normal_prothit_precision returns a single
# Float, so normal_mean holds that value and normal_stdev is nil.
def prothit_precision(peps, num_false_pephits, opts={})
  # read the defaults into locals rather than writing them back into the
  # caller's hash
  num_its_normal = opts[:num_its_normal] || 10
  num_its_worstcase = opts[:num_its_worstcase] || 10

  worst = worstcase_prothit_precision(peps, num_false_pephits, :num_its => num_its_worstcase)
  (normal_mean, normal_stdev) = normal_prothit_precision(peps, num_false_pephits, :num_its => num_its_normal)
  [worst, normal_mean, normal_stdev]
end
29
+
30
# Returns an array holding the number of peptide hits for each protein.
# Proteins are told apart by prot.reference; each element of the returned
# array is the count for one distinct reference (no particular order should
# be relied on).
def num_peps_per_protein(peps)
  num_pephits_by_prot = Hash.new(0)
  peps.each do |pep|
    pep.prots.each {|prot| num_pephits_by_prot[prot.reference] += 1 }
  end
  num_pephits_by_prot.values
end
40
+
41
# Returns the worstcase protein hit precision (a Float).  This assumes that
# every small protein with the fewest peptide hits is completely 'filled'
# with incorrect hits in preference to any higher hit protein.
# Where each peptide hit maps to a single protein, this is guaranteed to be
# worst-case.  If this doesn't hold, there are some extreme cases where a
# poorer precision could be generated, but this is still probably fairly
# close.
# Variation between iterations is produced by shuffling the order of the
# proteins from which peptides are removed within groups of proteins having
# the same number of peptides; the minimum over all iterations is returned.
# The global random number generator is re-seeded with a fixed seed
# (srand(777)) on every call, so repeated calls give the same answer.
# This method does NOT require that the prothits be updated to reflect only
# those pephits being passed in.
#
#     validator.worstcase_prothit_precision(peps, 14, :num_its => 1) # => 0.232111
#
# options:
#   :num_its => Integer (default: 10) number of times to run (finds minimum)
#   :one_prot_per_pep => true | *false  assumes each peptide maps to a
#                        single protein (uses the faster
#                        worstcase_prothit_precision_by_numbers)
def worstcase_prothit_precision(peps, num_false_pephits, opts = {})
  num_its = opts[:num_its] || 10

  ##############################################
  # The END Cases (can be dealt with quickly)
  ##############################################
  return 1.0 if num_false_pephits == 0
  return 0.0 if num_false_pephits >= peps.size

  if opts[:one_prot_per_pep]
    return worstcase_prothit_precision_by_numbers(num_peps_per_protein(peps), num_false_pephits)
  end

  #####################################
  # The basic plan:
  #   order the proteins by num peptides
  #   create a set of peptides
  #   delete peptides from the proteins off the set of peptides (a deleted
  #   one cannot be deleted twice)
  #####################################
  prots_to_peps = Hash.new {|h,k| h[k] = [] }
  peps_to_prots = Hash.new {|h,k| h[k] = [] }
  peps.each do |pep|
    pep.prots.each do |prot|
      prots_to_peps[prot] << pep
      peps_to_prots[pep] << prot
    end
  end
  tot_num_prots = prots_to_peps.size

  srand( 777 )
  precision_sample = (0...num_its).map do
    surviving_peps = peps_to_prots.keys.to_set
    false_pephits_left = num_false_pephits

    catch(:all_false_pephits_placed) do
      # smallest proteins first; groups are shuffled lazily (only when
      # reached) so no random numbers are drawn for untouched groups
      prots_to_peps.keys.group_by {|prot| prots_to_peps[prot].size }.sort.each do |_num_peps, same_size_prots|
        same_size_prots.shuffle!
        same_size_prots.each do |prot|
          prots_to_peps[prot].each do |pep|
            # Set#delete? is nil when the pep was already removed
            next unless surviving_peps.delete?(pep)
            false_pephits_left -= 1
            throw :all_false_pephits_placed if false_pephits_left == 0
          end
        end
      end
    end

    # a protein is still around if at least one of its peptides survived
    surviving_prots = surviving_peps.inject(Set.new) do |protset, pep|
      protset.merge(peps_to_prots[pep])
    end
    surviving_prots.size.to_f / tot_num_prots
  end
  precision_sample.min
end
142
+
143
# Returns the precision (a Float) of the worst possible outcome given only
# the number of peptide hits in each protein.
#
# num_peps_per_prot is an array of peptide-hit counts, one per protein.
# The false peptide hits are spent on the smallest proteins first; a
# protein counts as completely false once all of its peptides have been
# paid for.  The result is
#   (number of proteins - completely false proteins) / number of proteins
#
# With no proteins at all the ratio is undefined (it used to come out as
# NaN); this follows the end cases of worstcase_prothit_precision instead:
# 1.0 when there are no false hits, 0.0 otherwise.
def worstcase_prothit_precision_by_numbers(num_peps_per_prot, num_false_pephits)
  num_prots = num_peps_per_prot.size
  if num_prots == 0
    return num_false_pephits > 0 ? 0.0 : 1.0
  end

  false_pephits_left = num_false_pephits
  completely_false_proteins = 0
  num_peps_per_prot.sort.each do |num_peps|
    false_pephits_left -= num_peps
    completely_false_proteins += 1 if false_pephits_left >= 0
    break if false_pephits_left <= 0
  end
  (num_prots - completely_false_proteins).to_f / num_prots
end
158
+
159
# Estimates protein hit precision by choosing the false peptide hits at
# random.  "normal" as in a normal distribution: the false hits are
# distributed randomly over the peptides and the resulting precision is
# assumed to follow a gaussian distribution (binomial distributions tend
# toward gaussians at large N).
#
# For every iteration num_false_pephits randomly chosen peptides are thrown
# away; a protein survives if at least one of the remaining peptides points
# to it (a single hit validates the protein).  The precision of an
# iteration is surviving proteins / all proteins referenced by peps.
#
# peps must be an Array (values_at is used) of objects responding to prots.
#
# returns [mean_precision, stdev_precision] (whatever sample_stats returns
# for the per-iteration precisions)
# options:
#   :num_its => Integer (default: 10)
#
# if :num_its is 1, then only the precision (a Float) is returned.
# though random, the same seed is always used to start this process (the
# global generator is re-seeded with srand(38272) on every call), meaning
# that the same results will be produced on consecutive attempts.
#
#     validator.normal_prothit_precision(peps, 13, :num_its => 1)  # -> 0.95433
#     validator.normal_prothit_precision(peps, 13, :num_its => 2)  # -> [0.92002, 1.2223]
def normal_prothit_precision( peps, num_false_pephits, opts={})
  num_iterations = opts[:num_its] || 10
  srand( 38272 )

  ##############################################
  # The END Cases (can be dealt with quickly)
  ##############################################
  if num_false_pephits == 0
    return num_iterations == 1 ? 1.0 : [1.0, 0.0]
  elsif num_false_pephits >= peps.size
    return num_iterations == 1 ? 0.0 : [0.0, 0.0]
  end

  ##############################################
  # Everything else:
  ##############################################
  tot_num_prots = peps.inject(Set.new) {|protset, pep| protset.merge(pep.prots) }.size
  base_indices = (0...(peps.size)).to_a
  # TODO: we should optimize based on how many false pephits given...

  precision_sample = (0...num_iterations).map do
    # Array#shuffle gives a shuffled copy; calling map without a block and
    # then shuffle! fails wherever map returns an Enumerator
    good_indices = base_indices.shuffle[num_false_pephits..-1]
    still_remaining = Set.new
    peps.values_at(*good_indices).each {|pep| still_remaining.merge(pep.prots) }
    still_remaining.size.to_f / tot_num_prots
  end

  if num_iterations == 1
    precision_sample.first
  else
    sample_stats(precision_sample)
  end
end
233
+ end
234
+
@@ -0,0 +1,272 @@
1
+ require 'validator'
2
+ require 'validator/digestion_based'
3
+ require 'transmem'
4
+ require 'fasta'
5
+ require 'spec_id/digestor'
6
+ require 'spec_id/sequest/params'
7
+ require 'spec_id/sequest/pepxml'
8
+
9
+
10
+ module Validator::Transmem ; end
11
+
12
+ # objects of this class can calculate pephit_precision given an array of
13
+ # SpecID::Pep objects using the pephit_precision method.
14
+ class Validator::Transmem::Protein < Validator::DigestionBased
15
+ include Precision::Calculator
16
+
17
+ # a hash keyed by index reference which is true if >= min_num_tms
18
+ attr_accessor :transmem_by_ti_key
19
+ attr_accessor :transmem_index
20
+
21
+ # min_num_tms: Integer (1...), the min # certain transmembrane segments to
22
+ # consider the protein a transmembrane protein
23
+ attr_reader :min_num_tms
24
+
25
+ # soluble_fraction: *true/false
26
+ attr_accessor :soluble_fraction
27
+
28
+ # correct_wins: *true/false,
29
+ # if the peptide is found in some proteins that are transmembrane and some
30
+ # that are not, then if soluble_fraction==true, this peptide will be
31
+ # considered non-transmembrane. If soluble_fraction==false, then this
32
+ # will be considered transmembrane.
33
+ attr_accessor :correct_wins
34
+
35
+ # no_include_tm_peps: false or Float (0.0-1.0), peptides that have a
36
+ # fraction of amino acids that fall inside transmembrane sequences greater
37
+ # than or equal to the value of the argument will not be considered in the final
38
+ # calculation of peptide hit precision. (A transmembrane segment is
39
+ # likely to have very different properties than the rest of the peptides,
40
+ # so the assumption of equally flyable peptides is broken unless these are
41
+ # removed) nil or false will skip this filter. A reasonable value is
42
+ # probably 0.7.
43
+ attr_accessor :no_include_tm_peps
44
+
45
+ # if nil, then this will be calculated when pephit_precision is called.
46
+ attr_accessor :transmem_status_hash
47
+
48
+ # the file used (toppred or phobius file)
49
+ attr_accessor :transmem_file
50
+
51
+ DEFAULTS = Validator::DigestionBased::DEFAULTS.merge( { :min_num_tms => 1, :soluble_fraction => true, :correct_wins => true, :no_include_tm_peps => false, :transmem_status_hash => nil} )
52
+
53
# a_transmem_file is the transmembrane prediction file (toppred or phobius
# file; see transmem/toppred).  It is kept in @transmem_file and handed to
# TransmemIndex.new.
#
# OPTIONS (merged over self.class::DEFAULTS, see
# Validator::Transmem::Protein::DEFAULTS for defaults):
#
#     :min_num_tms, :soluble_fraction, :correct_wins,
#     :no_include_tm_peps (*false), :transmem_status_hash
#         see the attributes of the same name
#     :background, :false_to_total_ratio
#         stored in the instance variables of the same name
#     :fasta
#         fasta object, passed on to TransmemIndex.new (used to update the
#         phobius index if given)
#
# NOTE(review): when :false_to_total_ratio is not supplied it presumably
# has to be set later (e.g. with set_false_to_total_ratio) -- confirm
# against Validator::DigestionBased.
def initialize(a_transmem_file, options={})
  @transmem_file = a_transmem_file
  opts = self.class::DEFAULTS.merge(options)

  @min_num_tms          = opts[:min_num_tms]
  @soluble_fraction     = opts[:soluble_fraction]
  @correct_wins         = opts[:correct_wins]
  @no_include_tm_peps   = opts[:no_include_tm_peps]
  @background           = opts[:background]
  @transmem_status_hash = opts[:transmem_status_hash]
  @false_to_total_ratio = opts[:false_to_total_ratio]
  fasta                 = opts[:fasta]

  @transmem_index = TransmemIndex.new(@transmem_file, fasta)
  # a hash by TransmemIndex key => true/false (depending on min_num_tms)
  @transmem_by_ti_key = create_transmem_by_ti_key_hash(@transmem_index, @min_num_tms)
end
75
+
76
# Designates each protein as transmembrane or not depending on min_num_tms.
# transmem_index must respond to num_certain_index with a hash of
# key => number of certain transmembrane segments.
# Returns a hash with the same keys (the TransmemIndex keys) whose value is
# true when the number is >= min_num_tms and false otherwise.
def create_transmem_by_ti_key_hash(transmem_index, min_num_tms)
  transmem_by_key = {}
  transmem_index.num_certain_index.each do |id, num_certain|
    transmem_by_key[id] = (num_certain >= min_num_tms)
  end
  transmem_by_key
end
90
+
91
# Returns a hash where each protein of the given peptides is keyed by
# itself and maps to true/false/nil depending on its transmembrane status
# (the @transmem_by_ti_key entry for the protein's reference; nil when there
# is none).  If @no_include_tm_peps is set, each peptide is also stored in
# the hash, keyed by itself, with the result of pep_is_transmem?.
# NOTE: a protein that already has an entry is not looked up again.  This is
# so that a lookup does not have to be performed if the value is already
# defined, as the assumption is that many peptides will point to the same
# protein.
def create_transmem_status_hash(peps)
  thash = {}
  peps.each do |pep|
    pep.prots.each do |prot|
      next if thash.key?(prot)
      thash[prot] = @transmem_by_ti_key[@transmem_index.reference_to_key(prot.reference)]
    end
    thash[pep] = pep_is_transmem?(pep) if @no_include_tm_peps
  end
  thash
end
115
+
116
# Computes and stores @false_to_total_ratio, then returns self so calls can
# be chained.  The ratio is the number of false positives divided by the
# number of true plus false positives, as given by partition using a freshly
# built transmem status hash for peps.
# peps will usually be the peptides created by calling:
#     peps = Digestor.digest( fasta_obj, sequest_params_obj )
def set_false_to_total_ratio(peps)
  true_hits, false_hits = partition(peps, create_transmem_status_hash(peps))
  num_false = false_hits.size
  @false_to_total_ratio = num_false.to_f / (true_hits.size + num_false)
  self
end
125
+
126
# Builds @transmem_status_hash from peps if it is not set yet, then
# delegates to the superclass implementation.
def pephit_precision(peps)
  @transmem_status_hash ||= create_transmem_status_hash(peps)
  super(peps)
end
132
+
133
# Regardless of the transmembrane status of the proteins the peptide belongs
# to, asks what the average overlap of the peptide is with transmembrane
# sequences.  For every protein of the peptide the :fraction overlap is
# requested from @transmem_index; proteins for which the index gives no
# answer (nil/false) are left out of the average.
# Returns true when the average is >= @no_include_tm_peps, false when it is
# lower, and nil when no protein gave an answer.
def pep_is_transmem?(pep)
  answers = pep.prots.map do |prot|
    key = @transmem_index.reference_to_key(prot.reference)
    @transmem_index.avg_overlap(key, pep.aaseq, :fraction)
  end
  fractions = answers.select {|fraction| fraction }
  return nil if fractions.empty?

  sum_of_fractions = fractions.inject(0.0) {|sum, fraction| sum + fraction }
  (sum_of_fractions / fractions.size) >= @no_include_tm_peps
end
154
+
155
# Splits peps into [true_positives, false_positives] by transmembrane
# status.
#
# Each peptide must respond to prots.  The status of a protein (true = is
# transmembrane, false = is not, nil = unknown) is read from the status
# hash; if given a hash, it will override the @transmem_status_hash.  When
# neither is available the status is looked up through @transmem_index and
# @transmem_by_ti_key.
#
# If @no_include_tm_peps is set, only peptides whose own status is exactly
# false are considered (the status hash entry for the peptide, or
# pep_is_transmem? when there is no hash); true and nil are removed.
#
# A peptide is called non-transmembrane or transmembrane from its proteins:
#   * @soluble_fraction and @correct_wins agree (both set or both unset):
#     one non-transmembrane protein makes the peptide non-transmembrane;
#     otherwise it is transmembrane if any protein is.
#   * they disagree: one transmembrane protein makes the peptide
#     transmembrane; otherwise it is non-transmembrane if any protein is.
#   * peptides whose proteins are all of unknown status are dropped.
#
# For the soluble fraction the non-transmembrane peptides are the true
# positives; for the insoluble fraction the two lists are swapped.
def partition(peps, transmem_status_hash=nil)
  tm_hash = transmem_status_hash || @transmem_status_hash

  my_peps =
    if @no_include_tm_peps
      peps.reject do |pep|
        (tm_hash ? tm_hash[pep] : pep_is_transmem?(pep)) != false
      end
    else
      peps
    end

  # For the insoluble fraction we calculate as if incorrect wins and swap
  # the tp's and fp's at the end.
  non_transmem_wins = @soluble_fraction ? @correct_wins : !@correct_wins
  # the protein status that settles a peptide as soon as it is seen
  decisive_status = non_transmem_wins ? false : true

  non_transmem = []
  transmem = []
  my_peps.each do |pep|
    case transmem_call_for_pep(pep, tm_hash, decisive_status)
    when false then non_transmem << pep
    when true then transmem << pep
    end
  end

  @soluble_fraction ? [non_transmem, transmem] : [transmem, non_transmem]
end

# Helper for partition: returns decisive_status as soon as one protein of
# pep has that status, else the opposite status if any protein had it, else
# nil (every protein unknown).
def transmem_call_for_pep(pep, tm_hash, decisive_status)
  saw_opposite = false
  pep.prots.each do |prot|
    status =
      if tm_hash
        tm_hash[prot]
      else
        @transmem_by_ti_key[@transmem_index.reference_to_key(prot.reference)]
      end
    return decisive_status if status == decisive_status
    saw_opposite = true if status == !decisive_status
  end
  saw_opposite ? !decisive_status : nil
end
private :transmem_call_for_pep
270
+
271
+ end
272
+
@@ -0,0 +1,46 @@
1
+ require 'validator'
2
+
3
+ class Validator::TruePos < Validator
4
+ include Precision::Calculator
5
+ attr_reader :fasta
6
+ attr_accessor :correct_wins
7
+
8
# fasta_obj must respond to prots, each of which responds to header; the
# headers are collected once into @fasta_headers.
# correct_wins means that only a single protein of a peptide hit must match
# the fasta object for the pep hit to be considered valid.  Otherwise, all
# of its proteins must match.
def initialize(fasta_obj, correct_wins = true)
  @correct_wins = correct_wins
  @fasta = fasta_obj
  @fasta_headers = fasta_obj.prots.map {|prot| prot.header }
end
16
+
17
# Splits peps into [matching, non_matching].  A protein of a peptide
# matches when some fasta header includes its reference.  With
# @correct_wins a peptide matches if any of its proteins matches; otherwise
# all of its proteins must match.
def partition(peps)
  in_fasta = lambda do |pepprot|
    @fasta_headers.any? {|header| header.include?(pepprot.reference) }
  end

  peps.partition do |pep|
    if @correct_wins
      pep.prots.any?(&in_fasta)
    else
      pep.prots.all?(&in_fasta)
    end
  end
end
36
+
37
# Partitions peps and passes the number of true and false positives to
# calc_precision, returning its result.
def pephit_precision(peps)
  true_pos, false_pos = partition(peps)
  calc_precision(true_pos.size, false_pos.size)
end
41
+
42
# Returns a one line description of this validator's parameters: the fasta
# filename and the correct_wins setting.
def to_param_string
  "true_positives(tps)={fasta=#{@fasta.filename}, correct_wins=#{@correct_wins}}"
end
45
+
46
+ end