mspire 0.2.4 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (233) hide show
  1. data/INSTALL +1 -0
  2. data/README +25 -0
  3. data/Rakefile +129 -40
  4. data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
  5. data/bin/bioworks_to_pepxml.rb +1 -0
  6. data/bin/fasta_shaker.rb +1 -96
  7. data/bin/filter_and_validate.rb +5 -0
  8. data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
  9. data/bin/prob_validate.rb +6 -0
  10. data/bin/raw_to_mzXML.rb +2 -2
  11. data/bin/srf_group.rb +1 -0
  12. data/bin/srf_to_sqt.rb +40 -0
  13. data/changelog.txt +68 -0
  14. data/lib/align/chams.rb +6 -6
  15. data/lib/align.rb +4 -3
  16. data/lib/bsearch.rb +120 -0
  17. data/lib/fasta.rb +318 -86
  18. data/lib/group_by.rb +10 -0
  19. data/lib/index_by.rb +11 -0
  20. data/lib/merge_deep.rb +21 -0
  21. data/lib/{spec → ms/converter}/mzxml.rb +77 -109
  22. data/lib/ms/gradient_program.rb +171 -0
  23. data/lib/ms/msrun.rb +209 -0
  24. data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
  25. data/lib/ms/parser/mzdata/axml.rb +12 -0
  26. data/lib/ms/parser/mzdata/dom.rb +160 -0
  27. data/lib/ms/parser/mzdata/libxml.rb +7 -0
  28. data/lib/ms/parser/mzdata.rb +25 -0
  29. data/lib/ms/parser/mzxml/axml.rb +11 -0
  30. data/lib/ms/parser/mzxml/dom.rb +159 -0
  31. data/lib/ms/parser/mzxml/hpricot.rb +253 -0
  32. data/lib/ms/parser/mzxml/libxml.rb +15 -0
  33. data/lib/ms/parser/mzxml/regexp.rb +122 -0
  34. data/lib/ms/parser/mzxml/rexml.rb +72 -0
  35. data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
  36. data/lib/ms/parser/mzxml.rb +175 -0
  37. data/lib/ms/parser.rb +108 -0
  38. data/lib/ms/precursor.rb +10 -0
  39. data/lib/ms/scan.rb +81 -0
  40. data/lib/ms/spectrum.rb +193 -0
  41. data/lib/ms.rb +10 -0
  42. data/lib/mspire.rb +4 -0
  43. data/lib/roc.rb +61 -1
  44. data/lib/sample_enzyme.rb +31 -8
  45. data/lib/scan_i.rb +21 -0
  46. data/lib/spec_id/aa_freqs.rb +7 -3
  47. data/lib/spec_id/bioworks.rb +20 -14
  48. data/lib/spec_id/digestor.rb +139 -0
  49. data/lib/spec_id/mass.rb +116 -0
  50. data/lib/spec_id/parser/proph.rb +236 -0
  51. data/lib/spec_id/precision/filter/cmdline.rb +209 -0
  52. data/lib/spec_id/precision/filter/interactive.rb +134 -0
  53. data/lib/spec_id/precision/filter/output.rb +147 -0
  54. data/lib/spec_id/precision/filter.rb +623 -0
  55. data/lib/spec_id/precision/output.rb +60 -0
  56. data/lib/spec_id/precision/prob/cmdline.rb +139 -0
  57. data/lib/spec_id/precision/prob/output.rb +88 -0
  58. data/lib/spec_id/precision/prob.rb +171 -0
  59. data/lib/spec_id/proph/pep_summary.rb +92 -0
  60. data/lib/spec_id/proph/prot_summary.rb +484 -0
  61. data/lib/spec_id/proph.rb +2 -466
  62. data/lib/spec_id/protein_summary.rb +2 -2
  63. data/lib/spec_id/sequest/params.rb +316 -0
  64. data/lib/spec_id/sequest/pepxml.rb +1513 -0
  65. data/lib/spec_id/sequest.rb +2 -1672
  66. data/lib/spec_id/srf.rb +445 -177
  67. data/lib/spec_id.rb +183 -95
  68. data/lib/spec_id_xml.rb +8 -10
  69. data/lib/transmem/phobius.rb +147 -0
  70. data/lib/transmem/toppred.rb +368 -0
  71. data/lib/transmem.rb +157 -0
  72. data/lib/validator/aa.rb +135 -0
  73. data/lib/validator/background.rb +73 -0
  74. data/lib/validator/bias.rb +95 -0
  75. data/lib/validator/cmdline.rb +260 -0
  76. data/lib/validator/decoy.rb +94 -0
  77. data/lib/validator/digestion_based.rb +69 -0
  78. data/lib/validator/probability.rb +48 -0
  79. data/lib/validator/prot_from_pep.rb +234 -0
  80. data/lib/validator/transmem.rb +272 -0
  81. data/lib/validator/true_pos.rb +46 -0
  82. data/lib/validator.rb +214 -0
  83. data/lib/xml.rb +38 -0
  84. data/lib/xml_style_parser.rb +105 -0
  85. data/lib/xmlparser_wrapper.rb +19 -0
  86. data/script/compile_and_plot_smriti_final.rb +97 -0
  87. data/script/extract_gradient_programs.rb +56 -0
  88. data/script/get_apex_values_rexml.rb +44 -0
  89. data/script/mzXML2timeIndex.rb +1 -1
  90. data/script/smriti_final_analysis.rb +103 -0
  91. data/script/toppred_to_yaml.rb +47 -0
  92. data/script/tpp_installer.rb +1 -1
  93. data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
  94. data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
  95. data/specs/bin/fasta_shaker_spec.rb +259 -0
  96. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
  97. data/specs/bin/filter_and_validate_spec.rb +124 -0
  98. data/specs/bin/ms_to_lmat_spec.rb +34 -0
  99. data/specs/bin/prob_validate_spec.rb +62 -0
  100. data/specs/bin/protein_summary_spec.rb +10 -0
  101. data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
  102. data/specs/gi_spec.rb +22 -0
  103. data/specs/load_bin_path.rb +7 -0
  104. data/specs/merge_deep_spec.rb +13 -0
  105. data/specs/ms/gradient_program_spec.rb +77 -0
  106. data/specs/ms/msrun_spec.rb +455 -0
  107. data/specs/ms/parser_spec.rb +92 -0
  108. data/specs/ms/spectrum_spec.rb +89 -0
  109. data/specs/roc_spec.rb +251 -0
  110. data/specs/rspec_autotest.rb +149 -0
  111. data/specs/sample_enzyme_spec.rb +41 -0
  112. data/specs/spec_helper.rb +133 -0
  113. data/specs/spec_id/aa_freqs_spec.rb +52 -0
  114. data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
  115. data/specs/spec_id/digestor_spec.rb +75 -0
  116. data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
  117. data/specs/spec_id/precision/filter/output_spec.rb +31 -0
  118. data/specs/spec_id/precision/filter_spec.rb +243 -0
  119. data/specs/spec_id/precision/prob_spec.rb +111 -0
  120. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  121. data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
  122. data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
  123. data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
  124. data/specs/spec_id/sequest/params_spec.rb +68 -0
  125. data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
  126. data/specs/spec_id/sqt_spec.rb +138 -0
  127. data/specs/spec_id/srf_spec.rb +209 -0
  128. data/specs/spec_id/srf_spec_helper.rb +302 -0
  129. data/specs/spec_id_helper.rb +33 -0
  130. data/specs/spec_id_spec.rb +361 -0
  131. data/specs/spec_id_xml_spec.rb +33 -0
  132. data/specs/transmem/phobius_spec.rb +423 -0
  133. data/specs/transmem/toppred_spec.rb +297 -0
  134. data/specs/transmem_spec.rb +60 -0
  135. data/specs/transmem_spec_shared.rb +64 -0
  136. data/specs/validator/aa_spec.rb +107 -0
  137. data/specs/validator/background_spec.rb +51 -0
  138. data/specs/validator/bias_spec.rb +146 -0
  139. data/specs/validator/decoy_spec.rb +51 -0
  140. data/specs/validator/fasta_helper.rb +26 -0
  141. data/specs/validator/prot_from_pep_spec.rb +141 -0
  142. data/specs/validator/transmem_spec.rb +145 -0
  143. data/specs/validator/true_pos_spec.rb +58 -0
  144. data/specs/validator_helper.rb +33 -0
  145. data/specs/xml_spec.rb +12 -0
  146. data/test_files/000_pepxml18_small.xml +206 -0
  147. data/test_files/020a.mzXML.timeIndex +4710 -0
  148. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
  149. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
  150. data/test_files/4-03-03_small-prot.xml +321 -0
  151. data/test_files/4-03-03_small.xml +3876 -0
  152. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  153. data/test_files/bioworks-3.3_10prots.xml +5999 -0
  154. data/test_files/bioworks31.params +77 -0
  155. data/test_files/bioworks32.params +62 -0
  156. data/test_files/bioworks33.params +63 -0
  157. data/test_files/bioworks_single_run_small.xml +7237 -0
  158. data/test_files/bioworks_small.fasta +212 -0
  159. data/test_files/bioworks_small.params +63 -0
  160. data/test_files/bioworks_small.phobius +109 -0
  161. data/test_files/bioworks_small.toppred.out +2847 -0
  162. data/test_files/bioworks_small.xml +5610 -0
  163. data/test_files/bioworks_with_INV_small.xml +3753 -0
  164. data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
  165. data/test_files/corrupted_900.srf +0 -0
  166. data/test_files/head_of_7MIX.srf +0 -0
  167. data/test_files/interact-opd1_mods_small-prot.xml +304 -0
  168. data/test_files/messups.fasta +297 -0
  169. data/test_files/opd1/000.my_answer.100lines.xml +101 -0
  170. data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
  171. data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
  172. data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
  173. data/test_files/opd1/000_020-prot.png +0 -0
  174. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
  175. data/test_files/opd1/000_020_3prots-prot.xml +62 -0
  176. data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
  177. data/test_files/opd1/sequest.3.1.params +77 -0
  178. data/test_files/opd1/sequest.3.2.params +62 -0
  179. data/test_files/opd1/twenty_scans.mzXML +418 -0
  180. data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
  181. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  182. data/test_files/opd1/twenty_scans_answ.lmata +9 -0
  183. data/test_files/opd1_020_beginning.RAW +0 -0
  184. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
  185. data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
  186. data/test_files/pepproph_small.xml +4691 -0
  187. data/test_files/phobius.small.noheader.txt +50 -0
  188. data/test_files/phobius.small.small.txt +53 -0
  189. data/test_files/s01_anC1_ld020mM.key.txt +25 -0
  190. data/test_files/s01_anC1_ld020mM.meth +0 -0
  191. data/test_files/small.fasta +297 -0
  192. data/test_files/smallraw.RAW +0 -0
  193. data/test_files/tf_bioworks2excel.bioXML +14340 -0
  194. data/test_files/tf_bioworks2excel.txt.actual +1035 -0
  195. data/test_files/toppred.small.out +416 -0
  196. data/test_files/toppred.xml.out +318 -0
  197. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
  198. data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
  199. data/test_files/yeast_gly_small-prot.xml +265 -0
  200. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
  201. data/test_files/yeast_gly_small.xml +3807 -0
  202. data/test_files/yeast_gly_small2.parentTimes +6 -0
  203. metadata +273 -57
  204. data/bin/filter.rb +0 -6
  205. data/bin/precision.rb +0 -5
  206. data/lib/spec/mzdata/parser.rb +0 -108
  207. data/lib/spec/mzdata.rb +0 -48
  208. data/lib/spec/mzxml/parser.rb +0 -449
  209. data/lib/spec/scan.rb +0 -55
  210. data/lib/spec_id/filter.rb +0 -797
  211. data/lib/spec_id/precision.rb +0 -421
  212. data/lib/toppred.rb +0 -18
  213. data/script/filter-peps.rb +0 -164
  214. data/test/tc_aa_freqs.rb +0 -59
  215. data/test/tc_fasta_shaker.rb +0 -149
  216. data/test/tc_filter.rb +0 -203
  217. data/test/tc_filter_peps.rb +0 -46
  218. data/test/tc_gi.rb +0 -17
  219. data/test/tc_id_class_anal.rb +0 -70
  220. data/test/tc_id_precision.rb +0 -89
  221. data/test/tc_msrun.rb +0 -88
  222. data/test/tc_mzxml.rb +0 -88
  223. data/test/tc_mzxml_to_lmat.rb +0 -36
  224. data/test/tc_peptide_parent_times.rb +0 -27
  225. data/test/tc_precision.rb +0 -60
  226. data/test/tc_roc.rb +0 -166
  227. data/test/tc_sample_enzyme.rb +0 -32
  228. data/test/tc_scan.rb +0 -26
  229. data/test/tc_sequest.rb +0 -336
  230. data/test/tc_spec.rb +0 -78
  231. data/test/tc_spec_id.rb +0 -201
  232. data/test/tc_spec_id_xml.rb +0 -36
  233. data/test/tc_srf.rb +0 -262
@@ -0,0 +1,234 @@
1
+ require 'validator'
2
+
3
+ require 'set'
4
+ require 'group_by'
5
+ require 'shuffle'
6
+
7
+ # calculates protein hit precision based on peptide precision
8
+ class Validator::ProtFromPep < Validator
9
+
10
+ # calculate protein precision based on the number of false peptides
11
+ # returns the precision based on the number of proteins *completely false*
12
+ # calculates the worst precision by assuming that proteins with the fewest
13
+ # peptides are all false (before prots with more pephits)
14
+ # note that this approaches the worst, but is not guaranteed to be worst
15
+ # unless each pephit maps to a single protein hit.
16
+ # [worst, normal_mean, normal_stddev]
17
+ # options
18
+ # :num_its_normal => Integer, # num iterations for normal (d: 10)
19
+ # :num_its_worstcase => Integer, # num iterations for worstcase (d: 10)
20
+ #
21
def prothit_precision(peps, num_false_pephits, opts={})
  # Estimates protein-hit precision from a count of false peptide hits.
  #
  # peps::              peptide hits (each responds to #prots)
  # num_false_pephits:: number of peptide hits assumed to be false
  # opts::              :num_its_normal    => Integer (default 10)
  #                     :num_its_worstcase => Integer (default 10)
  #
  # Returns [worst, normal_mean, normal_stdev].
  #
  # Defaults are read into locals instead of being written back with ||=,
  # so a caller-supplied (possibly frozen or shared) options hash is never
  # modified.
  num_its_normal = opts[:num_its_normal] || 10
  num_its_worstcase = opts[:num_its_worstcase] || 10

  worst = worstcase_prothit_precision(peps, num_false_pephits, :num_its => num_its_worstcase)
  (normal_mean, normal_stdev) = normal_prothit_precision(peps, num_false_pephits, :num_its => num_its_normal)
  [worst, normal_mean, normal_stdev]
end
29
+
30
+ # returns an array of the number of peptide hits in each protein
31
def num_peps_per_protein(peps)
  # Counts one hit per (peptide, protein) pairing, keyed by the protein's
  # reference, and returns just the counts (in first-seen order).
  tallies = peps.inject(Hash.new(0)) do |counts, pep|
    pep.prots.each { |prot| counts[prot.reference] += 1 }
    counts
  end
  tallies.values
end
40
+
41
+ # returns the worstcase precision. This assumes that every small protein
42
+ # with the fewest peptide hits is completely 'filled' with incorrect hits in
43
+ # preference to any higher hit protein.
44
+ # Where each peptide hit maps to a single protein, this is guaranteed to be
45
+ # worst-case. If this doesn't hold, there are some extreme cases where a
46
+ # poorer precision could be generated, but this is still probably fairly
47
+ # close. Thus, a slightly different answer may be generated each time.
48
+ # ...variation is produced by shuffling the order of the proteins from which
49
+ # peptides are removed within groups of proteins having the same number of
50
+ # peptides.
51
+ # This method does NOT require that the prothits be updated to reflect only
52
+ # those pephits being passed in.
53
+ #
54
+ # validator.worstcase_prothit_precision(peps, 14, :num_its => 1) # => 0.232111
55
+ #
56
+ # options:
57
+ # :num_its => Integer (default: 10) number of times to run (finds minimum)
58
+ # :one_prot_per_pep => true | *false assumes each peptide maps to a
59
+ # single protein
60
def worstcase_prothit_precision(peps, num_false_pephits, opts = {})
  # Approximates the worst-case protein-hit precision: false peptide hits are
  # "spent" on the proteins with the fewest peptide hits first, and the
  # fraction of proteins that still have at least one surviving peptide is
  # reported.
  #
  # peps::              peptide hits (each responds to #prots)
  # num_false_pephits:: number of peptide hits assumed to be false
  # opts::              :num_its => Integer (default 10), number of shuffled
  #                                 runs; the minimum precision is returned
  #                     :one_prot_per_pep => true/false (default false),
  #                                 use the closed-form count-based answer
  #
  # Returns a Float.
  num_its = opts[:num_its] || 10

  # End cases that need no simulation.
  return 1.0 if num_false_pephits == 0
  return 0.0 if num_false_pephits >= peps.size

  if opts[:one_prot_per_pep]
    return worstcase_prothit_precision_by_numbers(num_peps_per_protein(peps), num_false_pephits)
  end

  # Index the hits both ways, restricted to the peptides passed in.
  peps_by_prot = Hash.new { |h, k| h[k] = [] }
  prots_by_pep = Hash.new { |h, k| h[k] = [] }
  peps.each do |pep|
    pep.prots.each do |prot|
      peps_by_prot[prot] << pep
      prots_by_pep[pep] << prot
    end
  end
  all_prots = peps_by_prot.keys
  all_peps = prots_by_pep.keys.to_set
  tot_num_prots = all_prots.size

  # Fixed seed: repeated calls give the same answer.
  srand( 777 )
  precision_sample = (0...num_its).map do
    surviving = all_peps.dup
    budget = num_false_pephits

    # Walk proteins from fewest to most peptide hits, shuffling only within
    # groups of equal size, deleting each peptide at most once until the
    # budget of false hits is spent.
    catch(:budget_spent) do
      all_prots.group_by { |prot| peps_by_prot[prot].size }.sort.each do |_size, same_size_prots|
        same_size_prots.shuffle!
        same_size_prots.each do |prot|
          peps_by_prot[prot].each do |pep|
            next unless surviving.delete?(pep)
            budget -= 1
            throw :budget_spent if budget == 0
          end
        end
      end
    end

    # A protein survives if any of its peptides survived.
    proteins_still_around = surviving.inject(Set.new) do |protset, pep|
      protset.merge(prots_by_pep[pep])
    end
    proteins_still_around.size.to_f / tot_num_prots
  end
  precision_sample.min
end
142
+
143
+ # returns the precision of the worst possible outcome
144
def worstcase_prothit_precision_by_numbers(num_peps_per_prot, num_false_pephits)
  # Spends the false peptide hits on the smallest proteins first and counts
  # how many proteins can be filled entirely with false hits; returns the
  # fraction of proteins that are NOT completely false.
  remaining_false = num_false_pephits
  wiped_out = 0
  num_peps_per_prot.sort.each do |size|
    remaining_false -= size
    # the protein is completely false only if the budget covered all of it
    wiped_out += 1 unless remaining_false < 0
    break unless remaining_false > 0
  end
  total = num_peps_per_prot.size
  (total - wiped_out).to_f / total
end
158
+
159
+ # normal as in a standard normal distribution of peptide hits per protein
160
+ # they are distributed randomly and the precision is assumed to take on a
161
+ # standard normal distribution.
162
+ # num_peps_per_protein is an array of the number of peptides per protein hit
163
+ # (these are the true hits)
164
+ # assumes that the number follows a gaussian distribution (binomial
165
+ # distributions tend toward gaussians, I believe, at large N)
166
+ # returns [mean_precision, stdev_precision]
167
+ # options:
168
+ # :num_its => Integer (default: 10)
169
+ #
170
+ # if num_iterations is set at 1, then only the precision will be returned
171
+ # though random, the same seed is always used to start this process, meaning
172
+ # that the same results will be produced on consecutive attempts.
173
+ #
174
+ # validator.normal_prothit_precision(peps, 13, :num_its => 1) # -> 0.95433
175
+ # validator.normal_prothit_precision(peps, 13, :num_its => 2) # -> [0.92002, 1.2223]
176
def normal_prothit_precision( peps, num_false_pephits, opts={})
  # Estimates protein-hit precision when the false peptide hits are spread
  # at random over the peptides.
  #
  # peps::              peptide hits (each responds to #prots)
  # num_false_pephits:: number of peptide hits assumed to be false
  # opts::              :num_its => Integer (default 10)
  #
  # Returns a single precision (Float) when :num_its is 1; otherwise returns
  # whatever sample_stats gives for the sampled precisions (documented by
  # the caller as [mean, stdev]).
  num_iterations = opts[:num_its] || 10
  # Fixed seed: consecutive calls produce the same result.
  srand( 38272 )

  # End cases that need no sampling.
  if num_false_pephits == 0
    return(num_iterations == 1 ? 1.0 : [1.0, 0.0])
  elsif num_false_pephits >= peps.size
    return(num_iterations == 1 ? 0.0 : [0.0, 0.0])
  end

  base_indices = (0...(peps.size)).to_a
  tot_num_prots = peps.inject(Set.new) { |prtset, pep| prtset.merge(pep.prots) }.size

  precision_sample = (0...num_iterations).map do
    # Work on a real Array copy: calling map without a block yields an
    # Enumerator on Ruby >= 1.8.7, which has no shuffle!.
    shuffled_indices = base_indices.dup
    shuffled_indices.shuffle!
    # The first num_false_pephits shuffled peptides are treated as false.
    good_indices = shuffled_indices[num_false_pephits..-1]

    # A protein is still validated if any good peptide points at it.
    still_remaining = Set.new
    peps.values_at(*good_indices).each do |pep|
      still_remaining.merge(pep.prots)
    end
    still_remaining.size.to_f / tot_num_prots
  end

  if num_iterations == 1
    precision_sample.first
  else
    sample_stats(precision_sample)
  end
end
233
+ end
234
+
@@ -0,0 +1,272 @@
1
+ require 'validator'
2
+ require 'validator/digestion_based'
3
+ require 'transmem'
4
+ require 'fasta'
5
+ require 'spec_id/digestor'
6
+ require 'spec_id/sequest/params'
7
+ require 'spec_id/sequest/pepxml'
8
+
9
+
10
+ module Validator::Transmem ; end
11
+
12
+ # objects of this class can calculate pephit_precision given an array of
13
+ # SpecID::Pep objects using the pephit_precision method.
14
+ class Validator::Transmem::Protein < Validator::DigestionBased
15
+ include Precision::Calculator
16
+
17
+ # a hash keyed by index reference which is true if >= min_num_tms
18
+ attr_accessor :transmem_by_ti_key
19
+ attr_accessor :transmem_index
20
+
21
+ # min_num_tms: Integer (1...), the min # certain transmembrane segments to
22
+ # consider the protein a transmembrane protein
23
+ attr_reader :min_num_tms
24
+
25
+ # soluble_fraction: *true/false
26
+ attr_accessor :soluble_fraction
27
+
28
+ # correct_wins: *true/false,
29
+ # if the peptide is found in some proteins that are transmembrane and some
30
+ # that are not, then if soluble_fraction==true, this peptide will be
31
+ # considered non-transmembrane. If soluble_fraction==false, then this
32
+ # will be considered transmembrane.
33
+ attr_accessor :correct_wins
34
+
35
+ # no_include_tm_peps: false or Float (0.0-1.0), peptides that have a
36
+ # fraction of amino acids that fall inside transmembrane sequences greater
37
+ # than or equal to the value of the argument will not be considered in the final
38
+ # calculation of peptide hit precision. (A transmembrane segment is
39
+ # likely to have very different properties than the rest of the peptides,
40
+ # so the assumption of equally flyable peptides is broken unless these are
41
+ # removed) nil or false will skip this filter. A reasonable value is
42
+ # probably 0.7.
43
+ attr_accessor :no_include_tm_peps
44
+
45
+ # if nil, then this will be calculated when pephit_precision is called.
46
+ attr_accessor :transmem_status_hash
47
+
48
+ # the file used (toppred or phobius file)
49
+ attr_accessor :transmem_file
50
+
51
+ DEFAULTS = Validator::DigestionBased::DEFAULTS.merge( { :min_num_tms => 1, :soluble_fraction => true, :correct_wins => true, :no_include_tm_peps => false, :transmem_status_hash => nil} )
52
+
53
+ # expects a transmembrane prediction file, e.g. a toppred.out file (see
54
+ # transmem/toppred; see transmem/phobius for phobius predictions).
55
+ # fasta_obj is a Fasta object.
56
+ # sequest_params_obj is a Sequest::Params object.
57
+ # OPTIONS:
58
+ # (see Validator::Transmem::Protein::DEFAULTS for defaults)
59
+ #
60
+ # no_include_tm_peps: *false
61
+ #
62
+ # NOTE: if fasta_obj and sequest_params_obj are not passed in then
63
+ # 'false_to_total_ratio' must be set later.
64
def initialize(a_transmem_file, options={})
  # Stores the prediction file, applies DEFAULTS under the given options,
  # and builds the transmembrane index plus the per-key true/false table.
  @transmem_file = a_transmem_file
  settings = self.class::DEFAULTS.merge(options)

  @min_num_tms          = settings[:min_num_tms]
  @soluble_fraction     = settings[:soluble_fraction]
  @correct_wins         = settings[:correct_wins]
  @no_include_tm_peps   = settings[:no_include_tm_peps]
  @background           = settings[:background]
  @transmem_status_hash = settings[:transmem_status_hash]
  @false_to_total_ratio = settings[:false_to_total_ratio]
  fasta_obj             = settings[:fasta]

  # the fasta object (may be nil) is handed to the index together with the file
  @transmem_index = TransmemIndex.new(@transmem_file, fasta_obj)
  # a hash of index key => true/false (depending on min_num_tms)
  @transmem_by_ti_key = create_transmem_by_ti_key_hash(@transmem_index, @min_num_tms)
end
75
+
76
+ # Designates each protein as transmembrane or not depending on :min_num_tms
77
+ # The hash is keyed by the TransmemIndex key.
78
def create_transmem_by_ti_key_hash(transmem_index, min_num_tms)
  # Maps every key of transmem_index.num_certain_index to true when its
  # count of certain transmembrane segments reaches min_num_tms, else false.
  transmem_index.num_certain_index.inject({}) do |status_by_key, (key, num_certain)|
    status_by_key[key] = (num_certain >= min_num_tms)
    status_by_key
  end
end
90
+
91
+ # returns a hash where each protein (and peptide if given peps) is indexed
92
+ # with itself with true/false/nil depending on transmembrane status. If
93
+ # given peptides, and :no_include_tm_peps is not false, will also set the
94
+ # attribute for peptides.
95
+ # the attribute (:no_include_tm_peps)
96
+ # NOTE: if given a list of peptides, this implementation will not overwrite a
97
+ # protein if it already has a true/false for transmem. This is so that a
98
+ # lookup does not have to be performed if the value is already defined as
99
+ # the assumption is that many peptides will point to the same protein.
100
def create_transmem_status_hash(peps)
  # Builds a lookup of protein => true/false/nil (and, when
  # @no_include_tm_peps is set, peptide => result of pep_is_transmem?).
  peps.inject({}) do |status, pep|
    pep.prots.each do |prot|
      # a protein seen through an earlier peptide is not looked up again
      next if status.key?(prot)
      status[prot] = @transmem_by_ti_key[@transmem_index.reference_to_key(prot.reference)]
    end
    status[pep] = pep_is_transmem?(pep) if @no_include_tm_peps
    status
  end
end
115
+
116
+ # sets the false_to_total_ratio and returns self for chaining.
117
+ # peps will usually be the peptides created by calling:
118
+ # peps = Digestor.digest( fasta_obj, sequest_params_obj )
119
def set_false_to_total_ratio(peps)
  # Partitions peps with a freshly built status hash and stores
  # false / (true + false) in @false_to_total_ratio; returns self.
  true_hits, false_hits = partition(peps, create_transmem_status_hash(peps))
  num_false = false_hits.size
  @false_to_total_ratio = num_false.to_f / (true_hits.size + num_false)
  self
end
125
+
126
def pephit_precision(peps)
  # Lazily builds the transmembrane status hash from the first set of
  # peptides seen, then defers to the parent implementation.
  @transmem_status_hash ||= create_transmem_status_hash(peps)
  super(peps)
end
132
+
133
+ # regardless of transmembrane status of proteins peptide belongs to, asks
134
+ # what the avg overlap is with transmembrane sequences.
135
def pep_is_transmem?(pep)
  # Averages the peptide's fractional overlap with transmembrane segments
  # over every protein for which the index returns an answer.
  # Returns true/false (average >= @no_include_tm_peps), or nil when no
  # protein gave an answer.
  fractions = []
  pep.prots.each do |prot|
    key = @transmem_index.reference_to_key(prot.reference)
    fraction = @transmem_index.avg_overlap(key, pep.aaseq, :fraction)
    fractions << fraction if fraction
  end
  return nil if fractions.empty?

  mean_fraction = fractions.inject(0.0) { |sum, fraction| sum + fraction } / fractions.size
  mean_fraction >= @no_include_tm_peps
end
154
+
155
+ # each peptide must have prots and the prots must respond true/false to
156
+ # the 'transmem' method
157
+ # if given a hash, it will override the @transmem_status_hash
158
def partition(peps, transmem_status_hash=nil)
  # Splits peps into [true_positives, false_positives] according to the
  # transmembrane status (true / false / nil) of each peptide's proteins.
  #
  # peps::                 peptide hits (each responds to #prots)
  # transmem_status_hash:: optional lookup of prot (and pep) => status; when
  #                        given it overrides @transmem_status_hash. With no
  #                        hash at all, statuses are looked up through
  #                        @transmem_index / @transmem_by_ti_key.
  #
  # Peptides whose proteins all have nil status land in neither list.
  tm_hash = transmem_status_hash || @transmem_status_hash

  # One status lookup shared by every branch below.
  status_of =
    if tm_hash
      lambda { |prot| tm_hash[prot] }
    else
      lambda { |prot| @transmem_by_ti_key[@transmem_index.reference_to_key(prot.reference)] }
    end

  # Optionally drop peptides that overlap transmembrane segments too much
  # (status true) or whose overlap is unknown (status nil).
  my_peps =
    if @no_include_tm_peps
      if tm_hash
        peps.reject { |pep| tm_hash[pep] != false }
      else
        peps.reject { |pep| pep_is_transmem?(pep) != false }
      end
    else
      peps
    end

  # For the insoluble fraction the calculation runs with the opposite
  # "wins" policy and the two lists are swapped on return.
  soluble = @soluble_fraction
  correct_wins = soluble ? @correct_wins : !@correct_wins

  tp = []
  fp = []
  # When "correct wins", one non-transmembrane protein (status false) makes
  # the peptide a true positive; otherwise one transmembrane protein
  # (status true) makes it a false positive.
  deciding_status = correct_wins ? false : true
  decided, fallback = correct_wins ? [tp, fp] : [fp, tp]

  my_peps.each do |pep|
    found_deciding = false
    found_opposite = false
    pep.prots.each do |prot|
      status = status_of.call(prot)
      if status == deciding_status
        found_deciding = true
        break
      elsif status == !deciding_status
        found_opposite = true
      end
    end
    if found_deciding
      decided << pep
    elsif found_opposite
      fallback << pep
    end
  end

  soluble ? [tp, fp] : [fp, tp]
end
270
+
271
+ end
272
+
@@ -0,0 +1,46 @@
1
+ require 'validator'
2
+
3
+ class Validator::TruePos < Validator
4
+ include Precision::Calculator
5
+ attr_reader :fasta
6
+ attr_accessor :correct_wins
7
+
8
+ # correct_wins means that only a single protein from a pep.aaseq must match
9
+ # the fasta object for the pep hit to be considered valid. Otherwise, all
10
+ # must be a match
11
def initialize(fasta_obj, correct_wins = true)
  # Keeps the fasta object and the matching policy, and caches the header
  # of every fasta entry for the lookups done in partition.
  @correct_wins = correct_wins
  @fasta = fasta_obj
  @fasta_headers = fasta_obj.prots.collect { |entry| entry.header }
end
16
+
17
def partition(peps)
  # Splits peps into [matching, non_matching]. A protein "matches" when its
  # reference is a substring of some cached fasta header. With
  # @correct_wins one matching protein is enough; otherwise every protein
  # of the peptide must match.
  listed = lambda { |header, pepprot| header.include?(pepprot.reference) }

  matches =
    if @correct_wins
      lambda do |pep|
        @fasta_headers.any? do |header|
          pep.prots.any? { |pepprot| listed.call(header, pepprot) }
        end
      end
    else
      lambda do |pep|
        pep.prots.all? do |pepprot|
          @fasta_headers.any? { |header| listed.call(header, pepprot) }
        end
      end
    end

  peps.partition { |pep| matches.call(pep) }
end
36
+
37
def pephit_precision(peps)
  # Precision computed from the sizes of the two groups partition returns.
  groups = partition(peps)
  calc_precision(groups[0].size, groups[1].size)
end
41
+
42
def to_param_string
  # One-line summary of this validator's settings.
  settings = "fasta=#{@fasta.filename}, correct_wins=#{@correct_wins}"
  "true_positives(tps)={#{settings}}"
end
45
+
46
+ end