mspire 0.4.9 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (255) hide show
  1. data/README +27 -17
  2. data/changelog.txt +31 -62
  3. data/lib/ms/calc.rb +32 -0
  4. data/lib/ms/data/interleaved.rb +60 -0
  5. data/lib/ms/data/lazy_io.rb +73 -0
  6. data/lib/ms/data/lazy_string.rb +15 -0
  7. data/lib/ms/data/simple.rb +59 -0
  8. data/lib/ms/data/transposed.rb +41 -0
  9. data/lib/ms/data.rb +57 -0
  10. data/lib/ms/format/format_error.rb +12 -0
  11. data/lib/ms/spectrum.rb +25 -384
  12. data/lib/ms/support/binary_search.rb +126 -0
  13. data/lib/ms.rb +10 -10
  14. metadata +38 -350
  15. data/INSTALL +0 -58
  16. data/README.rdoc +0 -18
  17. data/Rakefile +0 -330
  18. data/bin/aafreqs.rb +0 -23
  19. data/bin/bioworks2excel.rb +0 -14
  20. data/bin/bioworks_to_pepxml.rb +0 -148
  21. data/bin/bioworks_to_pepxml_gui.rb +0 -225
  22. data/bin/fasta_shaker.rb +0 -5
  23. data/bin/filter_and_validate.rb +0 -5
  24. data/bin/gi2annot.rb +0 -14
  25. data/bin/id_class_anal.rb +0 -112
  26. data/bin/id_precision.rb +0 -172
  27. data/bin/ms_to_lmat.rb +0 -67
  28. data/bin/pepproph_filter.rb +0 -16
  29. data/bin/prob_validate.rb +0 -6
  30. data/bin/protein_summary.rb +0 -6
  31. data/bin/protxml2prots_peps.rb +0 -32
  32. data/bin/raw_to_mzXML.rb +0 -55
  33. data/bin/run_percolator.rb +0 -122
  34. data/bin/sqt_group.rb +0 -26
  35. data/bin/srf_group.rb +0 -27
  36. data/bin/srf_to_sqt.rb +0 -40
  37. data/lib/align/chams.rb +0 -78
  38. data/lib/align.rb +0 -154
  39. data/lib/archive/targz.rb +0 -94
  40. data/lib/bsearch.rb +0 -120
  41. data/lib/core_extensions.rb +0 -16
  42. data/lib/fasta.rb +0 -626
  43. data/lib/gi.rb +0 -124
  44. data/lib/group_by.rb +0 -10
  45. data/lib/index_by.rb +0 -11
  46. data/lib/merge_deep.rb +0 -21
  47. data/lib/ms/converter/mzxml.rb +0 -77
  48. data/lib/ms/gradient_program.rb +0 -170
  49. data/lib/ms/msrun.rb +0 -244
  50. data/lib/ms/msrun_index.rb +0 -108
  51. data/lib/ms/parser/mzdata/axml.rb +0 -67
  52. data/lib/ms/parser/mzdata/dom.rb +0 -175
  53. data/lib/ms/parser/mzdata/libxml.rb +0 -7
  54. data/lib/ms/parser/mzdata.rb +0 -31
  55. data/lib/ms/parser/mzxml/axml.rb +0 -70
  56. data/lib/ms/parser/mzxml/dom.rb +0 -182
  57. data/lib/ms/parser/mzxml/hpricot.rb +0 -253
  58. data/lib/ms/parser/mzxml/libxml.rb +0 -19
  59. data/lib/ms/parser/mzxml/regexp.rb +0 -122
  60. data/lib/ms/parser/mzxml/rexml.rb +0 -72
  61. data/lib/ms/parser/mzxml/xmlparser.rb +0 -248
  62. data/lib/ms/parser/mzxml.rb +0 -282
  63. data/lib/ms/parser.rb +0 -108
  64. data/lib/ms/precursor.rb +0 -25
  65. data/lib/ms/scan.rb +0 -81
  66. data/lib/mspire.rb +0 -4
  67. data/lib/pi_zero.rb +0 -244
  68. data/lib/qvalue.rb +0 -161
  69. data/lib/roc.rb +0 -187
  70. data/lib/sample_enzyme.rb +0 -160
  71. data/lib/scan_i.rb +0 -21
  72. data/lib/spec_id/aa_freqs.rb +0 -170
  73. data/lib/spec_id/bioworks.rb +0 -497
  74. data/lib/spec_id/digestor.rb +0 -138
  75. data/lib/spec_id/mass.rb +0 -179
  76. data/lib/spec_id/parser/proph.rb +0 -335
  77. data/lib/spec_id/precision/filter/cmdline.rb +0 -218
  78. data/lib/spec_id/precision/filter/interactive.rb +0 -134
  79. data/lib/spec_id/precision/filter/output.rb +0 -148
  80. data/lib/spec_id/precision/filter.rb +0 -637
  81. data/lib/spec_id/precision/output.rb +0 -60
  82. data/lib/spec_id/precision/prob/cmdline.rb +0 -160
  83. data/lib/spec_id/precision/prob/output.rb +0 -94
  84. data/lib/spec_id/precision/prob.rb +0 -249
  85. data/lib/spec_id/proph/pep_summary.rb +0 -104
  86. data/lib/spec_id/proph/prot_summary.rb +0 -484
  87. data/lib/spec_id/proph.rb +0 -4
  88. data/lib/spec_id/protein_summary.rb +0 -489
  89. data/lib/spec_id/sequest/params.rb +0 -316
  90. data/lib/spec_id/sequest/pepxml.rb +0 -1458
  91. data/lib/spec_id/sequest.rb +0 -33
  92. data/lib/spec_id/sqt.rb +0 -349
  93. data/lib/spec_id/srf.rb +0 -973
  94. data/lib/spec_id.rb +0 -778
  95. data/lib/spec_id_xml.rb +0 -99
  96. data/lib/transmem/phobius.rb +0 -147
  97. data/lib/transmem/toppred.rb +0 -368
  98. data/lib/transmem.rb +0 -157
  99. data/lib/validator/aa.rb +0 -48
  100. data/lib/validator/aa_est.rb +0 -112
  101. data/lib/validator/background.rb +0 -77
  102. data/lib/validator/bias.rb +0 -95
  103. data/lib/validator/cmdline.rb +0 -431
  104. data/lib/validator/decoy.rb +0 -107
  105. data/lib/validator/digestion_based.rb +0 -70
  106. data/lib/validator/probability.rb +0 -51
  107. data/lib/validator/prot_from_pep.rb +0 -234
  108. data/lib/validator/q_value.rb +0 -32
  109. data/lib/validator/transmem.rb +0 -272
  110. data/lib/validator/true_pos.rb +0 -46
  111. data/lib/validator.rb +0 -197
  112. data/lib/xml.rb +0 -38
  113. data/lib/xml_style_parser.rb +0 -119
  114. data/lib/xmlparser_wrapper.rb +0 -19
  115. data/release_notes.txt +0 -2
  116. data/script/compile_and_plot_smriti_final.rb +0 -97
  117. data/script/create_little_pepxml.rb +0 -61
  118. data/script/degenerate_peptides.rb +0 -47
  119. data/script/estimate_fpr_by_cysteine.rb +0 -226
  120. data/script/extract_gradient_programs.rb +0 -56
  121. data/script/find_cysteine_background.rb +0 -137
  122. data/script/genuine_tps_and_probs.rb +0 -136
  123. data/script/get_apex_values_rexml.rb +0 -44
  124. data/script/histogram_probs.rb +0 -61
  125. data/script/mascot_fix_pepxml.rb +0 -123
  126. data/script/msvis.rb +0 -42
  127. data/script/mzXML2timeIndex.rb +0 -25
  128. data/script/peps_per_bin.rb +0 -67
  129. data/script/prep_dir.rb +0 -121
  130. data/script/simple_protein_digestion.rb +0 -27
  131. data/script/smriti_final_analysis.rb +0 -103
  132. data/script/sqt_to_meta.rb +0 -24
  133. data/script/top_hit_per_scan.rb +0 -67
  134. data/script/toppred_to_yaml.rb +0 -47
  135. data/script/tpp_installer.rb +0 -249
  136. data/specs/align_spec.rb +0 -79
  137. data/specs/bin/bioworks_to_pepxml_spec.rb +0 -79
  138. data/specs/bin/fasta_shaker_spec.rb +0 -259
  139. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +0 -199
  140. data/specs/bin/filter_and_validate_spec.rb +0 -180
  141. data/specs/bin/ms_to_lmat_spec.rb +0 -34
  142. data/specs/bin/prob_validate_spec.rb +0 -86
  143. data/specs/bin/protein_summary_spec.rb +0 -14
  144. data/specs/fasta_spec.rb +0 -354
  145. data/specs/gi_spec.rb +0 -22
  146. data/specs/load_bin_path.rb +0 -7
  147. data/specs/merge_deep_spec.rb +0 -13
  148. data/specs/ms/gradient_program_spec.rb +0 -77
  149. data/specs/ms/msrun_spec.rb +0 -498
  150. data/specs/ms/parser_spec.rb +0 -92
  151. data/specs/ms/spectrum_spec.rb +0 -87
  152. data/specs/pi_zero_spec.rb +0 -115
  153. data/specs/qvalue_spec.rb +0 -39
  154. data/specs/roc_spec.rb +0 -251
  155. data/specs/rspec_autotest.rb +0 -149
  156. data/specs/sample_enzyme_spec.rb +0 -126
  157. data/specs/spec_helper.rb +0 -135
  158. data/specs/spec_id/aa_freqs_spec.rb +0 -52
  159. data/specs/spec_id/bioworks_spec.rb +0 -148
  160. data/specs/spec_id/digestor_spec.rb +0 -75
  161. data/specs/spec_id/precision/filter/cmdline_spec.rb +0 -20
  162. data/specs/spec_id/precision/filter/output_spec.rb +0 -31
  163. data/specs/spec_id/precision/filter_spec.rb +0 -246
  164. data/specs/spec_id/precision/prob_spec.rb +0 -44
  165. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  166. data/specs/spec_id/proph/pep_summary_spec.rb +0 -98
  167. data/specs/spec_id/proph/prot_summary_spec.rb +0 -128
  168. data/specs/spec_id/protein_summary_spec.rb +0 -189
  169. data/specs/spec_id/sequest/params_spec.rb +0 -68
  170. data/specs/spec_id/sequest/pepxml_spec.rb +0 -374
  171. data/specs/spec_id/sequest_spec.rb +0 -38
  172. data/specs/spec_id/sqt_spec.rb +0 -246
  173. data/specs/spec_id/srf_spec.rb +0 -172
  174. data/specs/spec_id/srf_spec_helper.rb +0 -139
  175. data/specs/spec_id_helper.rb +0 -33
  176. data/specs/spec_id_spec.rb +0 -366
  177. data/specs/spec_id_xml_spec.rb +0 -33
  178. data/specs/transmem/phobius_spec.rb +0 -425
  179. data/specs/transmem/toppred_spec.rb +0 -298
  180. data/specs/transmem_spec.rb +0 -60
  181. data/specs/transmem_spec_shared.rb +0 -64
  182. data/specs/validator/aa_est_spec.rb +0 -66
  183. data/specs/validator/aa_spec.rb +0 -40
  184. data/specs/validator/background_spec.rb +0 -67
  185. data/specs/validator/bias_spec.rb +0 -122
  186. data/specs/validator/decoy_spec.rb +0 -51
  187. data/specs/validator/fasta_helper.rb +0 -26
  188. data/specs/validator/prot_from_pep_spec.rb +0 -141
  189. data/specs/validator/transmem_spec.rb +0 -146
  190. data/specs/validator/true_pos_spec.rb +0 -58
  191. data/specs/validator_helper.rb +0 -33
  192. data/specs/xml_spec.rb +0 -12
  193. data/test_files/000_pepxml18_small.xml +0 -206
  194. data/test_files/020a.mzXML.timeIndex +0 -4710
  195. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +0 -3973
  196. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +0 -3872
  197. data/test_files/4-03-03_small-prot.xml +0 -321
  198. data/test_files/4-03-03_small.xml +0 -3876
  199. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  200. data/test_files/bioworks-3.3_10prots.xml +0 -5999
  201. data/test_files/bioworks31.params +0 -77
  202. data/test_files/bioworks32.params +0 -62
  203. data/test_files/bioworks33.params +0 -63
  204. data/test_files/bioworks_single_run_small.xml +0 -7237
  205. data/test_files/bioworks_small.fasta +0 -212
  206. data/test_files/bioworks_small.params +0 -63
  207. data/test_files/bioworks_small.phobius +0 -109
  208. data/test_files/bioworks_small.toppred.out +0 -2847
  209. data/test_files/bioworks_small.xml +0 -5610
  210. data/test_files/bioworks_with_INV_small.xml +0 -3753
  211. data/test_files/bioworks_with_SHUFF_small.xml +0 -2503
  212. data/test_files/corrupted_900.srf +0 -0
  213. data/test_files/head_of_7MIX.srf +0 -0
  214. data/test_files/interact-opd1_mods_small-prot.xml +0 -304
  215. data/test_files/messups.fasta +0 -297
  216. data/test_files/opd1/000.my_answer.100lines.xml +0 -101
  217. data/test_files/opd1/000.tpp_1.2.3.first10.xml +0 -115
  218. data/test_files/opd1/000.tpp_2.9.2.first10.xml +0 -126
  219. data/test_files/opd1/000.v2.1.mzXML.timeIndex +0 -3748
  220. data/test_files/opd1/000_020-prot.png +0 -0
  221. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +0 -62
  222. data/test_files/opd1/000_020_3prots-prot.xml +0 -62
  223. data/test_files/opd1/opd1_cat_inv_small-prot.xml +0 -139
  224. data/test_files/opd1/sequest.3.1.params +0 -77
  225. data/test_files/opd1/sequest.3.2.params +0 -62
  226. data/test_files/opd1/twenty_scans.mzXML +0 -418
  227. data/test_files/opd1/twenty_scans.v2.1.mzXML +0 -382
  228. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  229. data/test_files/opd1/twenty_scans_answ.lmata +0 -9
  230. data/test_files/opd1_020_beginning.RAW +0 -0
  231. data/test_files/opd1_2runs_2mods/data/020.mzData.xml +0 -683
  232. data/test_files/opd1_2runs_2mods/data/020.readw.mzXML +0 -382
  233. data/test_files/opd1_2runs_2mods/data/040.mzData.xml +0 -683
  234. data/test_files/opd1_2runs_2mods/data/040.readw.mzXML +0 -382
  235. data/test_files/opd1_2runs_2mods/data/README.txt +0 -6
  236. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +0 -753
  237. data/test_files/orbitrap_mzData/000_cut.xml +0 -1920
  238. data/test_files/pepproph_small.xml +0 -4691
  239. data/test_files/phobius.small.noheader.txt +0 -50
  240. data/test_files/phobius.small.small.txt +0 -53
  241. data/test_files/s01_anC1_ld020mM.key.txt +0 -25
  242. data/test_files/s01_anC1_ld020mM.meth +0 -0
  243. data/test_files/small.fasta +0 -297
  244. data/test_files/small.sqt +0 -87
  245. data/test_files/smallraw.RAW +0 -0
  246. data/test_files/tf_bioworks2excel.bioXML +0 -14340
  247. data/test_files/tf_bioworks2excel.txt.actual +0 -1035
  248. data/test_files/toppred.small.out +0 -416
  249. data/test_files/toppred.xml.out +0 -318
  250. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +0 -7
  251. data/test_files/validator_hits_separate/bioworks_small_HS.xml +0 -5651
  252. data/test_files/yeast_gly_small-prot.xml +0 -265
  253. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +0 -6
  254. data/test_files/yeast_gly_small.xml +0 -3807
  255. data/test_files/yeast_gly_small2.parentTimes +0 -6
@@ -1,234 +0,0 @@
1
- require 'validator'
2
-
3
- require 'set'
4
- require 'group_by'
5
- require 'shuffle'
6
-
7
- # calculates protein hit precision based on peptide precision
8
- class Validator::ProtFromPep < Validator
9
-
10
# Estimates protein hit precision from the number of false peptide hits.
#
# Runs both estimators on the same peptides and returns
#   [worst, normal_mean, normal_stdev]
# where +worst+ is the result of worstcase_prothit_precision (proteins with
# the fewest peptide hits are assumed to be completely false first) and the
# remaining two values are destructured from normal_prothit_precision.
#
# Note that the worst-case value approaches the worst, but is not guaranteed
# to be the worst unless each pephit maps to a single protein hit.
#
# options
#   :num_its_normal    => Integer, # num iterations for normal (d: 10)
#   :num_its_worstcase => Integer, # num iterations for worstcase (d: 10)
#
# The opts hash passed in is only read; defaults are never written back into
# it, so a shared or frozen hash may be passed safely.
def prothit_precision(peps, num_false_pephits, opts={})
  num_its_normal = opts[:num_its_normal] || 10
  num_its_worstcase = opts[:num_its_worstcase] || 10
  worst = worstcase_prothit_precision(peps, num_false_pephits, :num_its => num_its_worstcase)
  (normal_mean, normal_stdev) = normal_prothit_precision(peps, num_false_pephits, :num_its => num_its_normal)
  [worst, normal_mean, normal_stdev]
end
29
-
30
# Tallies peptide hits per protein reference and returns only the tallies:
# an array holding one count for each distinct prot.reference reached through
# the given peptides' prots.
def num_peps_per_protein(peps)
  tally_by_reference = Hash.new(0)
  peps.each do |pep|
    pep.prots.each { |prot| tally_by_reference[prot.reference] += 1 }
  end
  tally_by_reference.values
end
40
-
41
# returns the worstcase precision. This assumes that every small protein
# with the fewest peptide hits is completely 'filled' with incorrect hits in
# preference to any higher hit protein.
# Where each peptide hit maps to a single protein, this is guaranteed to be
# worst-case. If this doesn't hold, there are some extreme cases where a
# poorer precision could be generated, but this is still probably fairly
# close. Variation between iterations is produced by shuffling the order of
# the proteins from which peptides are removed within groups of proteins
# having the same number of peptides; the minimum over all iterations is
# returned.
# This method does NOT require that the prothits be updated to reflect only
# those pephits being passed in.
#
#   validator.worstcase_prothit_precision(peps, 14, :num_its => 1) # => 0.232111
#
# options:
#   :num_its => Integer (default: 10) number of times to run (finds minimum)
#   :one_prot_per_pep => true | *false  assumes each peptide maps to a
#                        single protein (delegates to
#                        worstcase_prothit_precision_by_numbers)
#
# Returns 1.0 when num_false_pephits is 0 and 0.0 when it is at least
# peps.size, without doing any sampling.
#
# NOTE: reseeds the global random number generator with srand(777) before
# sampling, so repeated calls give the same answer.
def worstcase_prothit_precision(peps, num_false_pephits, opts = {})
  num_its = opts[:num_its] || 10

  # The END cases (can be dealt with quickly)
  return 1.0 if num_false_pephits == 0
  return 0.0 if num_false_pephits >= peps.size

  if opts[:one_prot_per_pep]
    return worstcase_prothit_precision_by_numbers(num_peps_per_protein(peps), num_false_pephits)
  end

  # Index the given peptides by protein and the proteins by peptide.
  peps_by_prot = Hash.new {|h,k| h[k] = [] }
  prots_by_pep = Hash.new {|h,k| h[k] = [] }
  peps.each do |pep|
    pep.prots.each do |prot|
      peps_by_prot[prot] << pep
      prots_by_pep[pep] << prot
    end
  end
  tot_num_prots = peps_by_prot.size

  # Proteins grouped by how many of the given peptides they hold, fewest
  # first.  This does not change between iterations, so it is built once.
  size_groups = peps_by_prot.keys.group_by {|prot| peps_by_prot[prot].size }.sort.map {|_size, group| group }

  srand( 777 )
  precision_sample = (0...num_its).map do
    surviving_peps = worstcase_surviving_peps(size_groups, peps_by_prot, prots_by_pep.keys.to_set, num_false_pephits)
    # a protein survives if at least one of its peptides survives
    proteins_still_around = surviving_peps.inject(Set.new) {|protset, pep| protset.merge(prots_by_pep[pep]) }
    proteins_still_around.size.to_f / tot_num_prots
  end
  precision_sample.min
end

# Deletes num_to_remove peptides from pep_set, emptying the smallest proteins
# first (shuffling the proteins inside each same-size group), and returns
# pep_set.  A peptide already deleted through another protein is not counted
# twice.
def worstcase_surviving_peps(size_groups, peps_by_prot, pep_set, num_to_remove)
  size_groups.each do |group|
    # shuffle a copy so every iteration starts from the same protein order
    shuffled = group.dup
    shuffled.shuffle!
    shuffled.each do |prot|
      peps_by_prot[prot].each do |pep|
        next unless pep_set.include?(pep)
        pep_set.delete(pep)
        num_to_remove -= 1
        return pep_set if num_to_remove == 0
      end
    end
  end
  pep_set
end
private :worstcase_surviving_peps
142
-
143
# Worst possible protein precision given only the peptide count of each
# protein.  False peptide hits are spent on the proteins with the fewest
# peptides first; a protein counts as completely false once all of its
# peptides have been paid for.  Returns the fraction of proteins that are
# not completely false.
def worstcase_prothit_precision_by_numbers(num_peps_per_prot, num_false_pephits)
  false_hits_left = num_false_pephits
  wiped_out = 0
  num_peps_per_prot.sort.each do |pephit_count|
    false_hits_left -= pephit_count
    wiped_out += 1 unless false_hits_left < 0
    break unless false_hits_left > 0
  end
  total = num_peps_per_prot.size
  (total - wiped_out).to_f / total
end
158
-
159
# "normal" as in: false peptide hits are distributed randomly over the
# peptides and the resulting protein precision is assumed to be normally
# distributed (binomial distributions tend toward gaussians at large N).
#
# For each iteration num_false_pephits randomly chosen peptides are treated
# as false; the precision of that iteration is the fraction of proteins that
# are still reached by at least one remaining peptide (a single good hit
# keeps the protein).
#
# options:
#   :num_its => Integer (default: 10)
#
# returns [mean_precision, stdev_precision] as computed by sample_stats.
# If :num_its is 1, then only the precision (a single number) is returned.
# End cases: 0 false hits gives 1.0 (or [1.0, 0.0]); num_false_pephits >=
# peps.size gives 0.0 (or [0.0, 0.0]).
#
# Though random, the same seed is always used to start this process
# (srand(38272) reseeds the global generator), meaning that the same results
# will be produced on consecutive attempts.
#
#   validator.normal_prothit_precision(peps, 13, :num_its => 1)  # -> 0.95433
#   validator.normal_prothit_precision(peps, 13, :num_its => 2)  # -> [0.92002, 1.2223]
def normal_prothit_precision( peps, num_false_pephits, opts={})
  num_iterations = opts[:num_its] || 10
  srand( 38272 )

  # The END cases (can be dealt with quickly)
  if num_false_pephits == 0
    return (num_iterations == 1) ? 1.0 : [1.0, 0.0]
  elsif num_false_pephits >= peps.size
    return (num_iterations == 1) ? 0.0 : [0.0, 0.0]
  end

  base_indices = (0...(peps.size)).to_a
  # every protein reachable from the given peptides
  tot_num_prots = peps.inject(Set.new) {|prtset, pep| prtset.merge(pep.prots) }.size
  # TODO: we should optimize based on how many false pephits given...

  precision_sample = (0...num_iterations).map do
    # dup (not a block-less #map, which yields an Enumerator on ruby >= 1.9
    # and has no #shuffle!) so each iteration shuffles a fresh copy
    shuffled_indices = base_indices.dup
    shuffled_indices.shuffle!
    # the first num_false_pephits shuffled peptides are the false ones
    good_indices = shuffled_indices[num_false_pephits..-1]
    still_remaining = Set.new
    peps.values_at(*good_indices).each do |pep|
      still_remaining.merge(pep.prots)
    end
    still_remaining.size.to_f / tot_num_prots
  end

  if num_iterations == 1
    precision_sample.first
  else
    sample_stats(precision_sample)
  end
end
233
- end
234
-
@@ -1,32 +0,0 @@
1
-
2
-
3
- # from percolator
4
- # This is a trivial class (since q-values are so straightforward with regards
5
- # to precision), but it allows us to work with q-values using the same
6
- # interface as all other validators
7
- class Validator::QValue
8
-
9
# objs: a collection whose members respond to :q_value
# A q-value of 0.0 means no false discoveries, 0.5 means 50% false
# discoveries, so the precision is one minus the largest q-value present.
# An empty collection is reported as 1.0.
def precision(objs)
  return 1.0 if objs.size.zero?
  worst_q_value = objs.map {|obj| obj.q_value }.max
  1.0 - worst_q_value
end
17
-
18
-
19
# Accepts either a collection of hits or one bare SpecID::Pep / SpecID::Prot
# (which is wrapped in an array) and hands it to #precision.
# Hits should be added from low q-value to high q-value; the last q-value
# added determines the precision.
def increment_precision(objs)
  batch =
    case objs
    when SpecID::Pep, SpecID::Prot then [objs]
    else objs
    end
  precision(batch)
end
28
-
29
- alias_method :pephit_precision, :precision
30
- alias_method :prothit_precision, :precision
31
- alias_method :increment_pephits_precision, :increment_precision
32
- end
@@ -1,272 +0,0 @@
1
- require 'validator'
2
- require 'validator/digestion_based'
3
- require 'transmem'
4
- require 'fasta'
5
- require 'spec_id/digestor'
6
- require 'spec_id/sequest/params'
7
- require 'spec_id/sequest/pepxml'
8
-
9
-
10
- module Validator::Transmem ; end
11
-
12
- # objects of this class can calculate pephit_precision given an array of
13
- # SpecID::Pep objects using the pephit_precision method.
14
- class Validator::Transmem::Protein < Validator::DigestionBased
15
- include Precision::Calculator
16
-
17
- # a hash keyed by index reference which is true if >= min_num_tms
18
- attr_accessor :transmem_by_ti_key
19
- attr_accessor :transmem_index
20
-
21
- # min_num_tms: Integer (1...), the min # certain transmembrane segments to
22
- # consider the protein a transmembrane protein
23
- attr_reader :min_num_tms
24
-
25
- # soluble_fraction: *true/false
26
- attr_accessor :soluble_fraction
27
-
28
- # correct_wins: *true/false,
29
- # if the peptide is found in some proteins that are transmembrane and some
30
- # that are not, then if soluble_fraction==true, this peptide will be
31
- # considered non-transmembrane. If soluble_fraction==false, then this
32
- # will be considered transmembrane.
33
- attr_accessor :correct_wins
34
-
35
- # no_include_tm_peps: false or Float (0.0-1.0), peptides that have a
36
- # fraction of amino acids that fall inside transmembrane sequences greater
37
- # than or equal to the value of the argument will not be considered in the final
38
- # calculation of peptide hit precision. (A transmembrane segment is
39
- # likely to have very different properties than the rest of the peptides,
40
- # so the assumption of equally flyable peptides is broken unless these are
41
- # removed) nil or false will skip this filter. A reasonable value is
42
- # probably 0.7.
43
- attr_accessor :no_include_tm_peps
44
-
45
- # if nil, then this will be calculated when pephit_precision is called.
46
- attr_accessor :transmem_status_hash
47
-
48
- # the file used (toppred or phobius file)
49
- attr_accessor :transmem_file
50
-
51
- DEFAULTS = Validator::DigestionBased::DEFAULTS.merge( { :min_num_tms => 1, :soluble_fraction => true, :correct_wins => true, :no_include_tm_peps => false, :transmem_status_hash => nil} )
52
-
53
# a_transmem_file: the transmembrane prediction file (see transmem_file);
#                  it is handed to TransmemIndex.new
# OPTIONS:
#   (see Validator::Transmem::Protein::DEFAULTS for defaults)
#   :min_num_tms, :soluble_fraction, :correct_wins, :no_include_tm_peps,
#   :background, :transmem_status_hash, :false_to_total_ratio, :fasta
#
#   no_include_tm_peps: *false
#
# NOTE: if :false_to_total_ratio is not passed in then it must be set later
# (e.g. with set_false_to_total_ratio).
def initialize(a_transmem_file, options={})
  @transmem_file = a_transmem_file
  settings = self.class::DEFAULTS.merge(options)

  @min_num_tms = settings[:min_num_tms]
  @soluble_fraction = settings[:soluble_fraction]
  @correct_wins = settings[:correct_wins]
  @no_include_tm_peps = settings[:no_include_tm_peps]
  @background = settings[:background]
  @transmem_status_hash = settings[:transmem_status_hash]
  @false_to_total_ratio = settings[:false_to_total_ratio]
  fasta = settings[:fasta]

  # the fasta object (if given) is passed along to the index
  @transmem_index = TransmemIndex.new(@transmem_file, fasta)
  # index key => true/false (depending on min_num_tms)
  @transmem_by_ti_key = create_transmem_by_ti_key_hash(@transmem_index, @min_num_tms)
end
75
-
76
# Builds a hash, keyed exactly like transmem_index.num_certain_index, whose
# value is true when that entry's count is >= min_num_tms and false
# otherwise.
def create_transmem_by_ti_key_hash(transmem_index, min_num_tms)
  transmem_index.num_certain_index.inject({}) do |status_by_key, (ti_key, num_certain)|
    status_by_key[ti_key] = (num_certain >= min_num_tms) ? true : false
    status_by_key
  end
end
90
-
91
# Returns a hash keyed by the protein objects reached through peps, each
# mapped to whatever @transmem_by_ti_key holds for the protein's index key
# (looked up through @transmem_index.reference_to_key).
# When @no_include_tm_peps is set, every peptide is also stored as a key,
# mapped to the answer of pep_is_transmem?.
# NOTE: a protein that is already in the hash is not looked up again; many
# peptides are expected to point to the same protein.
def create_transmem_status_hash(peps)
  status = {}
  peps.each do |pep|
    pep.prots.each do |prot|
      next if status.key?(prot)
      ti_key = @transmem_index.reference_to_key(prot.reference)
      status[prot] = @transmem_by_ti_key[ti_key]
    end
    status[pep] = pep_is_transmem?(pep) if @no_include_tm_peps
  end
  status
end
115
-
116
# Partitions peps with a freshly built transmem status hash and stores
# false / (true + false) in @false_to_total_ratio.  Returns self so calls
# can be chained.
# peps will usually be the peptides created by calling:
#   peps = Digestor.digest( fasta_obj, sequest_params_obj )
def set_false_to_total_ratio(peps)
  true_hits, false_hits = partition(peps, create_transmem_status_hash(peps))
  @false_to_total_ratio = false_hits.size.to_f / (true_hits.size + false_hits.size)
  self
end
125
-
126
# Makes sure @transmem_status_hash exists (building it from peps the first
# time it is needed) and then defers to the superclass implementation.
def pephit_precision(peps)
  @transmem_status_hash ||= create_transmem_status_hash(peps)
  super(peps)
end
132
-
133
# Regardless of the transmembrane status of the proteins the peptide belongs
# to, asks what the average overlap with transmembrane sequences is.
# For every protein of the peptide, @transmem_index.avg_overlap is asked for
# the :fraction overlap of pep.aaseq; proteins for which it gives no answer
# are left out.  Returns whether the mean of the answers is >=
# @no_include_tm_peps, or nil when no protein gave an answer.
def pep_is_transmem?(pep)
  fractions = pep.prots.map do |prot|
    ti_key = @transmem_index.reference_to_key(prot.reference)
    @transmem_index.avg_overlap(ti_key, pep.aaseq, :fraction)
  end
  fractions = fractions.select {|fraction| fraction }
  return nil if fractions.empty?
  mean_fraction = fractions.inject(0.0) {|sum, fraction| sum + fraction } / fractions.size
  mean_fraction >= @no_include_tm_peps
end
154
-
155
# Splits peps into [true_positives, false_positives] by the transmembrane
# status (true / false / nil) of the proteins each peptide belongs to.
#
# transmem_status_hash, if given, overrides @transmem_status_hash.  When
# neither is available each protein's status is looked up through
# @transmem_index and @transmem_by_ti_key instead.
#
# If @no_include_tm_peps is set, only peptides whose own status is exactly
# false are kept (true and nil are both dropped); the status comes from the
# hash when there is one, otherwise from pep_is_transmem?.
#
# With @soluble_fraction and @correct_wins both set, a peptide is a true
# positive as soon as one of its proteins is non-transmembrane (false); it
# is a false positive if it has no such protein but at least one
# transmembrane (true) one.  Without @correct_wins a single transmembrane
# protein makes it a false positive.  Peptides whose proteins all have
# unknown (nil) status land in neither list.
# For the insoluble fraction (@soluble_fraction not set) the calculation is
# done as if correct_wins were inverted and the two lists are swapped.
def partition(peps, transmem_status_hash=nil)
  tm_hash = transmem_status_hash || @transmem_status_hash

  # choose the protein lookup once, outside of the loops
  status_of =
    if tm_hash
      lambda {|prot| tm_hash[prot] }
    else
      lambda {|prot| @transmem_by_ti_key[@transmem_index.reference_to_key(prot.reference)] }
    end

  candidates =
    if @no_include_tm_peps
      # remove pep status == true and pep status == nil
      if tm_hash
        peps.reject {|pep| tm_hash[pep] != false }
      else
        peps.reject {|pep| pep_is_transmem?(pep) != false }
      end
    else
      peps
    end

  soluble = @soluble_fraction
  correct_wins = soluble ? @correct_wins : !@correct_wins

  # the protein status that settles a peptide as soon as it is seen
  deciding_status = correct_wins ? false : true

  decided = []    # peptides with at least one protein of the deciding status
  leftover = []   # peptides with only the opposite (and possibly nil) status
  candidates.each do |pep|
    saw_deciding = false
    saw_opposite = false
    pep.prots.each do |prot|
      status = status_of.call(prot)
      if status == deciding_status
        saw_deciding = true
        break
      elsif status == !deciding_status
        saw_opposite = true
      end
    end
    if saw_deciding
      decided << pep
    elsif saw_opposite
      leftover << pep
    end
  end

  tp, fp = correct_wins ? [decided, leftover] : [leftover, decided]
  tp, fp = fp, tp unless soluble   # insoluble fraction: swap
  [tp, fp]
end
270
-
271
- end
272
-
@@ -1,46 +0,0 @@
1
- require 'validator'
2
-
3
- class Validator::TruePos < Validator
4
- include Precision::Calculator
5
- attr_reader :fasta
6
- attr_accessor :correct_wins
7
-
8
# fasta_obj: object whose prots respond to :header; the headers are cached
#            for matching in #partition.
# correct_wins: when true, a pep hit is valid as soon as a single one of its
#               proteins matches the fasta object.  Otherwise all of its
#               proteins must match.
def initialize(fasta_obj, correct_wins = true)
  @correct_wins = correct_wins
  @fasta = fasta_obj
  @fasta_headers = @fasta.prots.map {|fasta_prot| fasta_prot.header }
end
16
-
17
# Splits peps into [matching, non_matching].  A protein matches when its
# reference is a substring of one of the cached fasta headers.  With
# @correct_wins a peptide matches if any of its proteins does; otherwise
# every one of its proteins has to match.
def partition(peps)
  matches_fasta =
    if @correct_wins
      lambda do |pep|
        @fasta_headers.any? {|header| pep.prots.any? {|pepprot| header.include?(pepprot.reference) } }
      end
    else
      lambda do |pep|
        pep.prots.all? {|pepprot| @fasta_headers.any? {|header| header.include?(pepprot.reference) } }
      end
    end
  peps.partition {|pep| matches_fasta.call(pep) }
end
36
-
37
# Partitions the peptides and passes the sizes of the two resulting lists
# (true positives, then false positives) to calc_precision.
def pephit_precision(peps)
  true_pos, false_pos = partition(peps)
  calc_precision(true_pos.size, false_pos.size)
end
41
-
42
# One-line description of this validator's settings: the fasta file name
# and the correct_wins flag.
def to_param_string
  "true_positives(tps)={fasta=#{@fasta.filename}, correct_wins=#{@correct_wins}}"
end
45
-
46
- end