mspire 0.4.9 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (255) hide show
  1. data/README +27 -17
  2. data/changelog.txt +31 -62
  3. data/lib/ms/calc.rb +32 -0
  4. data/lib/ms/data/interleaved.rb +60 -0
  5. data/lib/ms/data/lazy_io.rb +73 -0
  6. data/lib/ms/data/lazy_string.rb +15 -0
  7. data/lib/ms/data/simple.rb +59 -0
  8. data/lib/ms/data/transposed.rb +41 -0
  9. data/lib/ms/data.rb +57 -0
  10. data/lib/ms/format/format_error.rb +12 -0
  11. data/lib/ms/spectrum.rb +25 -384
  12. data/lib/ms/support/binary_search.rb +126 -0
  13. data/lib/ms.rb +10 -10
  14. metadata +38 -350
  15. data/INSTALL +0 -58
  16. data/README.rdoc +0 -18
  17. data/Rakefile +0 -330
  18. data/bin/aafreqs.rb +0 -23
  19. data/bin/bioworks2excel.rb +0 -14
  20. data/bin/bioworks_to_pepxml.rb +0 -148
  21. data/bin/bioworks_to_pepxml_gui.rb +0 -225
  22. data/bin/fasta_shaker.rb +0 -5
  23. data/bin/filter_and_validate.rb +0 -5
  24. data/bin/gi2annot.rb +0 -14
  25. data/bin/id_class_anal.rb +0 -112
  26. data/bin/id_precision.rb +0 -172
  27. data/bin/ms_to_lmat.rb +0 -67
  28. data/bin/pepproph_filter.rb +0 -16
  29. data/bin/prob_validate.rb +0 -6
  30. data/bin/protein_summary.rb +0 -6
  31. data/bin/protxml2prots_peps.rb +0 -32
  32. data/bin/raw_to_mzXML.rb +0 -55
  33. data/bin/run_percolator.rb +0 -122
  34. data/bin/sqt_group.rb +0 -26
  35. data/bin/srf_group.rb +0 -27
  36. data/bin/srf_to_sqt.rb +0 -40
  37. data/lib/align/chams.rb +0 -78
  38. data/lib/align.rb +0 -154
  39. data/lib/archive/targz.rb +0 -94
  40. data/lib/bsearch.rb +0 -120
  41. data/lib/core_extensions.rb +0 -16
  42. data/lib/fasta.rb +0 -626
  43. data/lib/gi.rb +0 -124
  44. data/lib/group_by.rb +0 -10
  45. data/lib/index_by.rb +0 -11
  46. data/lib/merge_deep.rb +0 -21
  47. data/lib/ms/converter/mzxml.rb +0 -77
  48. data/lib/ms/gradient_program.rb +0 -170
  49. data/lib/ms/msrun.rb +0 -244
  50. data/lib/ms/msrun_index.rb +0 -108
  51. data/lib/ms/parser/mzdata/axml.rb +0 -67
  52. data/lib/ms/parser/mzdata/dom.rb +0 -175
  53. data/lib/ms/parser/mzdata/libxml.rb +0 -7
  54. data/lib/ms/parser/mzdata.rb +0 -31
  55. data/lib/ms/parser/mzxml/axml.rb +0 -70
  56. data/lib/ms/parser/mzxml/dom.rb +0 -182
  57. data/lib/ms/parser/mzxml/hpricot.rb +0 -253
  58. data/lib/ms/parser/mzxml/libxml.rb +0 -19
  59. data/lib/ms/parser/mzxml/regexp.rb +0 -122
  60. data/lib/ms/parser/mzxml/rexml.rb +0 -72
  61. data/lib/ms/parser/mzxml/xmlparser.rb +0 -248
  62. data/lib/ms/parser/mzxml.rb +0 -282
  63. data/lib/ms/parser.rb +0 -108
  64. data/lib/ms/precursor.rb +0 -25
  65. data/lib/ms/scan.rb +0 -81
  66. data/lib/mspire.rb +0 -4
  67. data/lib/pi_zero.rb +0 -244
  68. data/lib/qvalue.rb +0 -161
  69. data/lib/roc.rb +0 -187
  70. data/lib/sample_enzyme.rb +0 -160
  71. data/lib/scan_i.rb +0 -21
  72. data/lib/spec_id/aa_freqs.rb +0 -170
  73. data/lib/spec_id/bioworks.rb +0 -497
  74. data/lib/spec_id/digestor.rb +0 -138
  75. data/lib/spec_id/mass.rb +0 -179
  76. data/lib/spec_id/parser/proph.rb +0 -335
  77. data/lib/spec_id/precision/filter/cmdline.rb +0 -218
  78. data/lib/spec_id/precision/filter/interactive.rb +0 -134
  79. data/lib/spec_id/precision/filter/output.rb +0 -148
  80. data/lib/spec_id/precision/filter.rb +0 -637
  81. data/lib/spec_id/precision/output.rb +0 -60
  82. data/lib/spec_id/precision/prob/cmdline.rb +0 -160
  83. data/lib/spec_id/precision/prob/output.rb +0 -94
  84. data/lib/spec_id/precision/prob.rb +0 -249
  85. data/lib/spec_id/proph/pep_summary.rb +0 -104
  86. data/lib/spec_id/proph/prot_summary.rb +0 -484
  87. data/lib/spec_id/proph.rb +0 -4
  88. data/lib/spec_id/protein_summary.rb +0 -489
  89. data/lib/spec_id/sequest/params.rb +0 -316
  90. data/lib/spec_id/sequest/pepxml.rb +0 -1458
  91. data/lib/spec_id/sequest.rb +0 -33
  92. data/lib/spec_id/sqt.rb +0 -349
  93. data/lib/spec_id/srf.rb +0 -973
  94. data/lib/spec_id.rb +0 -778
  95. data/lib/spec_id_xml.rb +0 -99
  96. data/lib/transmem/phobius.rb +0 -147
  97. data/lib/transmem/toppred.rb +0 -368
  98. data/lib/transmem.rb +0 -157
  99. data/lib/validator/aa.rb +0 -48
  100. data/lib/validator/aa_est.rb +0 -112
  101. data/lib/validator/background.rb +0 -77
  102. data/lib/validator/bias.rb +0 -95
  103. data/lib/validator/cmdline.rb +0 -431
  104. data/lib/validator/decoy.rb +0 -107
  105. data/lib/validator/digestion_based.rb +0 -70
  106. data/lib/validator/probability.rb +0 -51
  107. data/lib/validator/prot_from_pep.rb +0 -234
  108. data/lib/validator/q_value.rb +0 -32
  109. data/lib/validator/transmem.rb +0 -272
  110. data/lib/validator/true_pos.rb +0 -46
  111. data/lib/validator.rb +0 -197
  112. data/lib/xml.rb +0 -38
  113. data/lib/xml_style_parser.rb +0 -119
  114. data/lib/xmlparser_wrapper.rb +0 -19
  115. data/release_notes.txt +0 -2
  116. data/script/compile_and_plot_smriti_final.rb +0 -97
  117. data/script/create_little_pepxml.rb +0 -61
  118. data/script/degenerate_peptides.rb +0 -47
  119. data/script/estimate_fpr_by_cysteine.rb +0 -226
  120. data/script/extract_gradient_programs.rb +0 -56
  121. data/script/find_cysteine_background.rb +0 -137
  122. data/script/genuine_tps_and_probs.rb +0 -136
  123. data/script/get_apex_values_rexml.rb +0 -44
  124. data/script/histogram_probs.rb +0 -61
  125. data/script/mascot_fix_pepxml.rb +0 -123
  126. data/script/msvis.rb +0 -42
  127. data/script/mzXML2timeIndex.rb +0 -25
  128. data/script/peps_per_bin.rb +0 -67
  129. data/script/prep_dir.rb +0 -121
  130. data/script/simple_protein_digestion.rb +0 -27
  131. data/script/smriti_final_analysis.rb +0 -103
  132. data/script/sqt_to_meta.rb +0 -24
  133. data/script/top_hit_per_scan.rb +0 -67
  134. data/script/toppred_to_yaml.rb +0 -47
  135. data/script/tpp_installer.rb +0 -249
  136. data/specs/align_spec.rb +0 -79
  137. data/specs/bin/bioworks_to_pepxml_spec.rb +0 -79
  138. data/specs/bin/fasta_shaker_spec.rb +0 -259
  139. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +0 -199
  140. data/specs/bin/filter_and_validate_spec.rb +0 -180
  141. data/specs/bin/ms_to_lmat_spec.rb +0 -34
  142. data/specs/bin/prob_validate_spec.rb +0 -86
  143. data/specs/bin/protein_summary_spec.rb +0 -14
  144. data/specs/fasta_spec.rb +0 -354
  145. data/specs/gi_spec.rb +0 -22
  146. data/specs/load_bin_path.rb +0 -7
  147. data/specs/merge_deep_spec.rb +0 -13
  148. data/specs/ms/gradient_program_spec.rb +0 -77
  149. data/specs/ms/msrun_spec.rb +0 -498
  150. data/specs/ms/parser_spec.rb +0 -92
  151. data/specs/ms/spectrum_spec.rb +0 -87
  152. data/specs/pi_zero_spec.rb +0 -115
  153. data/specs/qvalue_spec.rb +0 -39
  154. data/specs/roc_spec.rb +0 -251
  155. data/specs/rspec_autotest.rb +0 -149
  156. data/specs/sample_enzyme_spec.rb +0 -126
  157. data/specs/spec_helper.rb +0 -135
  158. data/specs/spec_id/aa_freqs_spec.rb +0 -52
  159. data/specs/spec_id/bioworks_spec.rb +0 -148
  160. data/specs/spec_id/digestor_spec.rb +0 -75
  161. data/specs/spec_id/precision/filter/cmdline_spec.rb +0 -20
  162. data/specs/spec_id/precision/filter/output_spec.rb +0 -31
  163. data/specs/spec_id/precision/filter_spec.rb +0 -246
  164. data/specs/spec_id/precision/prob_spec.rb +0 -44
  165. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  166. data/specs/spec_id/proph/pep_summary_spec.rb +0 -98
  167. data/specs/spec_id/proph/prot_summary_spec.rb +0 -128
  168. data/specs/spec_id/protein_summary_spec.rb +0 -189
  169. data/specs/spec_id/sequest/params_spec.rb +0 -68
  170. data/specs/spec_id/sequest/pepxml_spec.rb +0 -374
  171. data/specs/spec_id/sequest_spec.rb +0 -38
  172. data/specs/spec_id/sqt_spec.rb +0 -246
  173. data/specs/spec_id/srf_spec.rb +0 -172
  174. data/specs/spec_id/srf_spec_helper.rb +0 -139
  175. data/specs/spec_id_helper.rb +0 -33
  176. data/specs/spec_id_spec.rb +0 -366
  177. data/specs/spec_id_xml_spec.rb +0 -33
  178. data/specs/transmem/phobius_spec.rb +0 -425
  179. data/specs/transmem/toppred_spec.rb +0 -298
  180. data/specs/transmem_spec.rb +0 -60
  181. data/specs/transmem_spec_shared.rb +0 -64
  182. data/specs/validator/aa_est_spec.rb +0 -66
  183. data/specs/validator/aa_spec.rb +0 -40
  184. data/specs/validator/background_spec.rb +0 -67
  185. data/specs/validator/bias_spec.rb +0 -122
  186. data/specs/validator/decoy_spec.rb +0 -51
  187. data/specs/validator/fasta_helper.rb +0 -26
  188. data/specs/validator/prot_from_pep_spec.rb +0 -141
  189. data/specs/validator/transmem_spec.rb +0 -146
  190. data/specs/validator/true_pos_spec.rb +0 -58
  191. data/specs/validator_helper.rb +0 -33
  192. data/specs/xml_spec.rb +0 -12
  193. data/test_files/000_pepxml18_small.xml +0 -206
  194. data/test_files/020a.mzXML.timeIndex +0 -4710
  195. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +0 -3973
  196. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +0 -3872
  197. data/test_files/4-03-03_small-prot.xml +0 -321
  198. data/test_files/4-03-03_small.xml +0 -3876
  199. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  200. data/test_files/bioworks-3.3_10prots.xml +0 -5999
  201. data/test_files/bioworks31.params +0 -77
  202. data/test_files/bioworks32.params +0 -62
  203. data/test_files/bioworks33.params +0 -63
  204. data/test_files/bioworks_single_run_small.xml +0 -7237
  205. data/test_files/bioworks_small.fasta +0 -212
  206. data/test_files/bioworks_small.params +0 -63
  207. data/test_files/bioworks_small.phobius +0 -109
  208. data/test_files/bioworks_small.toppred.out +0 -2847
  209. data/test_files/bioworks_small.xml +0 -5610
  210. data/test_files/bioworks_with_INV_small.xml +0 -3753
  211. data/test_files/bioworks_with_SHUFF_small.xml +0 -2503
  212. data/test_files/corrupted_900.srf +0 -0
  213. data/test_files/head_of_7MIX.srf +0 -0
  214. data/test_files/interact-opd1_mods_small-prot.xml +0 -304
  215. data/test_files/messups.fasta +0 -297
  216. data/test_files/opd1/000.my_answer.100lines.xml +0 -101
  217. data/test_files/opd1/000.tpp_1.2.3.first10.xml +0 -115
  218. data/test_files/opd1/000.tpp_2.9.2.first10.xml +0 -126
  219. data/test_files/opd1/000.v2.1.mzXML.timeIndex +0 -3748
  220. data/test_files/opd1/000_020-prot.png +0 -0
  221. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +0 -62
  222. data/test_files/opd1/000_020_3prots-prot.xml +0 -62
  223. data/test_files/opd1/opd1_cat_inv_small-prot.xml +0 -139
  224. data/test_files/opd1/sequest.3.1.params +0 -77
  225. data/test_files/opd1/sequest.3.2.params +0 -62
  226. data/test_files/opd1/twenty_scans.mzXML +0 -418
  227. data/test_files/opd1/twenty_scans.v2.1.mzXML +0 -382
  228. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  229. data/test_files/opd1/twenty_scans_answ.lmata +0 -9
  230. data/test_files/opd1_020_beginning.RAW +0 -0
  231. data/test_files/opd1_2runs_2mods/data/020.mzData.xml +0 -683
  232. data/test_files/opd1_2runs_2mods/data/020.readw.mzXML +0 -382
  233. data/test_files/opd1_2runs_2mods/data/040.mzData.xml +0 -683
  234. data/test_files/opd1_2runs_2mods/data/040.readw.mzXML +0 -382
  235. data/test_files/opd1_2runs_2mods/data/README.txt +0 -6
  236. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +0 -753
  237. data/test_files/orbitrap_mzData/000_cut.xml +0 -1920
  238. data/test_files/pepproph_small.xml +0 -4691
  239. data/test_files/phobius.small.noheader.txt +0 -50
  240. data/test_files/phobius.small.small.txt +0 -53
  241. data/test_files/s01_anC1_ld020mM.key.txt +0 -25
  242. data/test_files/s01_anC1_ld020mM.meth +0 -0
  243. data/test_files/small.fasta +0 -297
  244. data/test_files/small.sqt +0 -87
  245. data/test_files/smallraw.RAW +0 -0
  246. data/test_files/tf_bioworks2excel.bioXML +0 -14340
  247. data/test_files/tf_bioworks2excel.txt.actual +0 -1035
  248. data/test_files/toppred.small.out +0 -416
  249. data/test_files/toppred.xml.out +0 -318
  250. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +0 -7
  251. data/test_files/validator_hits_separate/bioworks_small_HS.xml +0 -5651
  252. data/test_files/yeast_gly_small-prot.xml +0 -265
  253. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +0 -6
  254. data/test_files/yeast_gly_small.xml +0 -3807
  255. data/test_files/yeast_gly_small2.parentTimes +0 -6
data/lib/spec_id.rb DELETED
@@ -1,778 +0,0 @@
1
- require 'ostruct'
2
- require 'set'
3
- require 'hash_by'
4
- require 'roc'
5
- require 'sample_enzyme' # for others
6
- require 'spec_id/bioworks'
7
- require 'spec_id/sequest'
8
-
9
- require 'spec_id/proph/prot_summary'
10
- require 'spec_id/proph/pep_summary'
11
-
12
- require 'spec_id_xml'
13
- require 'spec_id/sqt'
14
- require 'spec_id/mass'
15
- require 'fasta'
16
-
17
-
18
-
19
- module ProteinReferenceable ; end
20
-
21
- class SampleEnzyme ; end
22
-
23
-
24
- module SpecID ; end
25
-
26
- class GenericSpecID ; include SpecID ; end
27
-
28
- module SpecID
29
- MONO = Mass::MONO
30
- AVG = Mass::AVG
31
-
32
- attr_accessor :peps, :prots
33
- # True if a high protein/peptide score is better than low, false otherwise
34
- # This is set automatically for known file types
35
- attr_accessor :hi_prob_best
36
-
37
- # A relative pathname of the file the specid object is derived from
38
- attr_accessor :filename
39
-
40
- # tp = file_type
41
- # Will return a SpecID object (really, the object corresponding to the
42
- # file type which mixes in SpecID [is_a?(SpecID) == true])
43
- # If no file is given, will return a GenericSpecID object.
44
- # If file is an array, this is assumed to be a group of srf files which is
45
- # converted into an SRFGroup Object and run.
46
- def self.new(file=nil, tp=nil)
47
- # this will need to be specialized for other groups later
48
- if file.is_a?(Array)
49
- # takes an array of srf filenames
50
- SRFGroup.new(file)
51
- elsif file
52
- from_file(file, tp)
53
- else
54
- GenericSpecID.new
55
- end
56
- end
57
-
58
- # tp = file_type
59
- # a single srf file will be packaged into an SRFGroup object
60
- def self.from_file(file, tp=nil)
61
- obj = nil
62
- unless tp
63
- tp = file_type(file)
64
- end
65
- obj = case tp
66
- when 'srf'
67
- #@hi_prob_best = false
68
- SRFGroup.new([file])
69
- when 'srg'
70
- #@hi_prob_best = false
71
- SRFGroup.new(file)
72
- when 'bioworks'
73
- #@hi_prob_best = false
74
- Bioworks.new(file)
75
- when 'protproph'
76
- #@hi_prob_best = true
77
- Proph::ProtSummary.new(file)
78
- when 'pepproph'
79
- Proph::PepSummary.new(file)
80
- when 'sqg'
81
- SQTGroup.new(file)
82
- when 'sqt'
83
- SQTGroup.new([file])
84
- else
85
- abort "UNRECOGNIZED file type for #{file}"
86
- end
87
- obj
88
- end
89
-
90
- def inspect
91
- peps_string =
92
- if peps
93
- "peps(#)=#{peps.size}"
94
- else
95
- "peps=(nil)"
96
- end
97
- "<#{self.class} #{peps_string}>"
98
- end
99
-
100
- # given some list of SpecID::Pep based objects, returns the list of proteins
101
- # associated with those peptides
102
- # kind must be a symbol:
103
- # :no_update (current proteins are returned, but their peps attribute
104
- # is not updated)
105
- # :update (current proteins returned with peps attribute updated)
106
- # :new (new proteins are created complete with peps attribute)
107
- def self.protein_list(pephits, kind=:no_update)
108
-
109
- orig_pephits_prts = []
110
- if kind == :new
111
- new_prots = {}
112
- pephits.each_with_index do |pep,i|
113
- orig_pephits_prts[i] = pep.prots
114
- peps_new_prts = pep.prots.map do |prt|
115
- if new_prots.key? prt.reference
116
- already_exists = new_prots[prt.reference]
117
- else
118
- np = prt.dup
119
- np.peps = []
120
- new_prots[np.reference] = np
121
- np
122
- end
123
- end
124
- pep.prots = peps_new_prts
125
- end
126
- end
127
-
128
- if kind == :update
129
- pephits.each do |pep|
130
- pep.prots.each do |prt|
131
- prt.peps = []
132
- end
133
- end
134
- end
135
-
136
- prot_set = {}
137
- pephits.each do |pep|
138
- prts = pep.prots
139
- prts.each do |prt|
140
- prot_set[ prt.reference ] = prt
141
- end
142
- if (kind == :update || kind == :new)
143
- prts.each do |prt|
144
- prt.peps << pep
145
- end
146
- end
147
- end
148
-
149
- ## Reset the original protein hits
150
- if kind == :new
151
- pephits.each_with_index do |pep,i|
152
- pep.prots = orig_pephits_prts[i]
153
- end
154
- end
155
-
156
- prot_set.values
157
- end
158
-
159
-
160
-
161
- # takes a comma separated list or array and extends the last to create an
162
- # array of desired size
163
- def self.extend_args(arg, desired_size)
164
- arg_arr = arg
165
- if arg.is_a? String
166
- arg_arr = arg.split(',')
167
- end
168
- new_arr = []
169
- last_arg = arg_arr[0]
170
- desired_size.times do |i|
171
- if arg_arr[i]
172
- new_arr[i] = arg_arr[i]
173
- last_arg = new_arr[i]
174
- else
175
- new_arr[i] = last_arg
176
- end
177
- end
178
- new_arr
179
- end
180
-
181
- # takes an array of proteins, each having peps
182
- # peptide grouping is done
183
- # by-
184
- # the protein with the most unique peptides ends up taking any
185
- # degenerate peptides, tie goes to one with most hits total, then the one
186
- # that had the top xcorr(s) (before removing any peptides). All other
187
- # proteins with identical peptides will lose those peptides. So, the rich
188
- # stay rich, and the poor get poorer.
189
- # returns an array of triplets where each is [prot, pep_hits,
190
- # uniq_aaseqs] (uniq_aaseqs is an array) where the protein contains >= 1
191
- # peptide. The internal links (prot.peps and pep.prots) are NOT modified!!
192
- # update_prots == true will set each protein with the peptides found
193
- def self.occams_razor(array_of_prots, update_prots=false)
194
- peps_found = Set.new
195
-
196
- to_sort = array_of_prots.map do |prot|
197
- pps = prot.peps
198
-
199
- peps_by_uniq_aaseq = pps.hash_by(:aaseq)
200
- uniq_aaseqs = Set.new( pps.map {|pep| pep.aaseq } )
201
- xcorrs = pps.map {|pep| pep.xcorr }
202
-
203
- silly = OpenStruct.new
204
- # 0 1 2 3 4 5
205
- [uniq_aaseqs.size, pps.size, xcorrs.sort, prot, uniq_aaseqs, peps_by_uniq_aaseq]
206
- end
207
- prot_triplets = []
208
- to_sort.sort.reverse.each do |ar|
209
- prot = ar[3]
210
- ## overlapping set:
211
- common = peps_found & ar[4]
212
- ## find the uniq ones in our little set of peptides:
213
- uniq = ar[4] - common
214
- pep_hits = []
215
- if uniq.size != 0
216
- ## add to the found list:
217
- peps_found.merge(uniq)
218
- uniq.each do |seq|
219
- pep_hits.push( *(ar[5][seq]) )
220
- end
221
- prot_triplets << [prot, pep_hits, uniq.to_a]
222
- prot.peps = pep_hits if update_prots
223
- end
224
- end
225
- prot_triplets
226
- end
227
-
228
- # returns number of true positives (array) and the specified output (as
229
- # parallel array). Requires the classification method and a sorted array of
230
- # tp values and an array of fp values.
231
- # (This is simply a wrapper around ROC#by_tps method!)
232
- def by_tps(classification_method, tp, fp)
233
- ROC.new.by_tps(classification_method, tp, fp)
234
- end
235
-
236
- # from the unique set of peptide hits, create a separate peptide hit for
237
- # each protein reference where that peptide only references that protein
238
- # e.g. pep.prots = [(a single protein)]
239
- def pep_prots
240
- pps = []
241
- peps.each do |pep|
242
- pep.prots.map do |prt|
243
- pep.dup
244
- pep.prots = [prt]
245
- pps << pep
246
- end
247
- end
248
- pps
249
- end
250
-
251
- def self.prots?(ar)
252
- ar.first.is_a? SpecID::Prot
253
- end
254
-
255
- def self.peps?(ar)
256
- ar.first.is_a? SpecID::Pep
257
- end
258
-
259
- # for older stuff
260
- def classify_by_regex(items, regex, decoy_on_match=true, ties=:both)
261
- objects =
262
- case items
263
- when :prots
264
- prots
265
- when :peps
266
- peps
267
- end
268
- SpecID.classify_by_prot(objects, regex, decoy_on_match, ties)
269
- end
270
-
271
- # includes the peptide hit in both
272
- # returns (target, decoy)
273
- # (for peps) ties can be :both, true (target wins), false (decoy wins)
274
- # regardless of ties behavior, will partition out the proteins to be
275
- # appropriate for the peptide
276
- def self.classify_by_prot(items, regex, decoy_on_match=true, ties=:both)
277
- if items.size == 0
278
- return [[],[]]
279
- elsif prots?(items)
280
- myproc = proc { |prt|
281
- if prt.reference =~ regex ; !decoy_on_match
282
- else ; decoy_on_match end
283
- }
284
- return classify(items, myproc)
285
- elsif peps?(items)
286
- match = [] ; nomatch = []
287
- items.each do |pep|
288
- (match_prots, nomatch_prots) = pep.prots.partition do |prot|
289
- prot.reference =~ regex
290
- end
291
- if match_prots.size == 0
292
- nomatch << pep
293
- elsif nomatch_prots.size == 0
294
- match << pep
295
- else ## both have hits
296
- pep.prots = match_prots
297
- nomatch_pep = pep.dup
298
- nomatch_pep.prots = nomatch_prots
299
-
300
- # resolve ties
301
- case ties
302
- when true
303
- if decoy_on_match
304
- nomatch << pep
305
- else
306
- match << pep
307
- end
308
- when false
309
- if decoy_on_match
310
- match << pep
311
- else
312
- nomatch << pep
313
- end
314
- when :both
315
- match << pep
316
- nomatch << pep
317
- else ; raise ArgumentError
318
- end
319
- end
320
- end
321
- if decoy_on_match
322
- return [nomatch , match]
323
- else
324
- return [match, nomatch]
325
- end
326
- else
327
- raise ArgumentError, "arg1 is ar of objects descended from SpecID::Prot/Pep"
328
- end
329
- end
330
-
331
-
332
-
333
- # returns [tp, fp] based on the protein prefix for items where items =
334
- # (:prots|:peps)
335
- # this may result in a duplication of some peptides if they match both
336
- # normal and decoy proteins. In this case, the protein arrays are split,
337
- # too, so that each points only to its breed of protein.
338
- def classify_by_decoy_flag(items, flag, decoy_on_match=true, prefix=false)
339
- if prefix
340
- regex = /^#{Regexp.escape(flag)}/
341
- else
342
- regex = /#{Regexp.escape(flag)}/
343
- end
344
- classify_by_regex(items, regex, decoy_on_match)
345
- end
346
-
347
- # Returns (match, nomatch)
348
- # items = symbol (:prots, :peps)
349
- # Returns two arrays, those returning true from classify_item_by and those
350
- # returning false
351
- def classify(items, classify_item_by)
352
- its = send(items)
353
- f = []; t = []
354
- its.each do |it|
355
- if classify_item_by.call(it)
356
- t << it
357
- else
358
- f << it
359
- end
360
- end
361
- [t,f]
362
- end
363
-
364
- # returns two arrays, true positives and false positives (determined by proc
365
- # classify_item_by) sorted by proc rank_item_by. Items will be ranked from
366
- # lowest to highest based on the return value of rank_item_by. items is a
367
- # symbol (:prots or :peps)
368
- def rank_and_classify(items, rank_item_by, classify_item_by)
369
- its = send(items)
370
- #its.each do |it| puts it.probability.to_s ; puts it.reference end
371
- doublets = its.collect do |item|
372
- [ rank_item_by.call(item),
373
- classify_item_by.call(item) ]
374
- end
375
- roc = ROC.new
376
- tp, fp = roc.doublets_to_separate(doublets)
377
- return tp, fp
378
- end
379
-
380
-
381
- # returns a proc for getting all probabilities so that an ascending sort
382
- # will put the best scores first
383
- def probability_proc
384
- if hi_prob_best
385
- get_prob_proc = proc {|prt| prt.probability * -1 }
386
- else
387
- get_prob_proc = proc {|prt| prt.probability }
388
- end
389
- get_prob_proc
390
- end
391
-
392
- def separate_by_prefix(items, fp_prefix)
393
- its = send(items)
394
-
395
- if items == :prots
396
- elsif items == :peps
397
- abort "not implemented yet"
398
- else
399
- abort "no other items recognized yet"
400
- end
401
- end
402
-
403
- # sorts the probabilities and then
404
- # calcs predicted number hits and precision for protein probabilities
405
- # (summing probabilities)
406
- # one_minus_ppv = SUM(1-probX)/#prots = what is commonly and mistakenly
407
- # called false positive rate
408
- # SUM(1-probX)/#prots
409
- def num_hits_and_ppv_for_protein_prophet_probabilities
410
- current_sum_one_minus_prob = 0.0
411
- num_prots = []
412
- ppv = []
413
- prot_cnt = 0
414
- probs = prots.map {|v| v.probability}
415
- sorted = probs.sort.reverse
416
- sorted.each do |prob|
417
- prot_cnt += 1
418
- num_prots << prot_cnt
419
- current_sum_one_minus_prob += 1.0 - prob
420
- ppv << 1.0 - ( current_sum_one_minus_prob / prot_cnt )
421
- # current_fpr_ratio = current_sum_one_minus_prob / prot_cnt
422
- end
423
- [num_prots, ppv]
424
- end
425
-
426
- # convenience method for the common task of determining precision for
427
- # proteins (with decoy proteins found by false_flag)
428
- # returns (num_hits, precision)
429
- def num_hits_and_ppv_for_prob(false_flag, prefix=false)
430
- if prefix
431
- regex = /^#{Regexp.escape(false_flag)}/
432
- else
433
- regex = /#{Regexp.escape(false_flag)}/
434
- end
435
- prob_proc = probability_proc
436
- myproc = proc { |prt|
437
- if prt.reference =~ regex ; false
438
- else ; true end
439
- }
440
-
441
- real_hits, decoy_hits = rank_and_classify(:prots, prob_proc, myproc)
442
-
443
- (num_hits, num_tps, precision) = DecoyROC.new.pred_and_tps_and_ppv(real_hits, decoy_hits)
444
- [num_hits, precision]
445
- end
446
-
447
- # # takes the existing spec_id object and marshals it into "file.msh"
448
- # # a new file will always look for a file.msh to load
449
- # def marshal(force=false)
450
- # if !(File.exist? @marshal_file)| force
451
- # File.open(@marshal_file, 'w') {|out| Marshal.dump(@obj, out) }
452
- # end
453
- # end
454
-
455
- # Returns 'bioworks' if bioworks xml, 'protproph' if Protein prophet,
456
- # 'srf' if SRF file, 'srg' if search results group file.
457
- def self.file_type(file)
458
- if file =~ /\.srg$/
459
- return 'srg'
460
- elsif file =~ /\.sqg$/
461
- return 'sqg'
462
- end
463
- if IO.read(file, 7,438) == 'Enzyme:'
464
- return 'srf'
465
- end
466
- File.open(file) do |fh|
467
- lines = ""
468
- 8.times { lines << fh.readline }
469
- if lines =~ /<bioworksinfo>/
470
- return 'bioworks'
471
- elsif ((lines =~ /<protein_summary/) and ((lines =~ Proph::ProtSummary::Filetype_and_version_re_old) or (lines =~ Proph::ProtSummary::Filetype_and_version_re_new)))
472
- return 'protproph'
473
- elsif lines =~ /<msms_pipeline_analysis.*<peptideprophet_summary/m
474
- return 'pepproph'
475
- end
476
- # assumes the header of a sqt file is less than 200 lines ...
477
- 200.times do
478
- line = fh.gets
479
- if line
480
- lines << line
481
- else ; break
482
- end
483
- end
484
- if lines =~ /^H\tDatabase/ and lines =~ /^H\tSQTGenerator/
485
- return 'sqt'
486
- end
487
- end
488
- end
489
-
490
-
491
- ##############################################
492
- # These are pretty specific to Smriti's needs:
493
-
494
- # Given a hash of peptide arrays by some attribute key
495
- # Return two sorted arrays of sorted probabilities
496
- # The first holds the min of each peptide array, the second the best 10 of each
497
- def min_and_best10(hash)
498
- ## choose the min probability and sort by prob
499
- min_peptides = hash.collect do |k,v|
500
- v.min {|a,b| a.peptide_probability <=> b.peptide_probability }
501
- end
502
- #puts min_peptides[0] # -> Bioworks::Pep
503
- min_sorted_peps = sorted_probabilities(min_peptides)
504
- #puts min_sorted_peps[0] # -> probability (Float)
505
-
506
- peptides_by_tens = []
507
- hash.each do |k,v|
508
- arr = v.sort_by {|pep| pep.peptide_probability }.slice(0,10)
509
- peptides_by_tens.push(*arr)
510
- end
511
-
512
- top_10_sorted_peps = sorted_probabilities(peptides_by_tens)
513
- #puts top_10_sorted_peps[0] # -> float
514
- #puts "size: top_10_sorted_peps.size : #{top_10_sorted_peps.size}"
515
- #puts "size: min_sorted_peps.size : #{min_sorted_peps.size}"
516
- #p top_10_sorted_peps
517
- #p min_sorted_peps
518
- return min_sorted_peps, top_10_sorted_peps
519
- end
520
-
521
- # Returns a list of sorted probabilities given the array of peptides
522
- def sorted_probabilities(peptides)
523
- #puts peptides.first.peptide_probability.class
524
- #peptides.each do |pep| print pep.class.to_s + " " end
525
- #puts peptides.first.is_a? Array
526
- #abort "DFHDFD"
527
- peptides.collect{|pep| pep.probability }.sort
528
- end
529
-
530
- # returns a sorted list of probabilities based on all pepprots (a peptide
531
- # associated with a protein)
532
- def pep_probs_by_pep_prots
533
- sorted_probabilities(peps)
534
- end
535
-
536
- ##########################################################################
537
- # WARNING! These might be dangerous to your health if there are multiple
538
- # files collected in your bioworks file
539
- ##########################################################################
540
-
541
- # (prob_list_by_min, prob_list_by_best10)
542
- # returns 2 sorted lists of probabilities based on:
543
- # 1. best peptide hit
544
- # 2. top 10 peptide hits
545
- # on a per scan basis
546
- # NOTE: you may want to hash on base_name first!
547
- def pep_probs_by_scan
548
- hash = peps.hash_by(:first_scan, :last_scan)
549
- return min_and_best10(hash)
550
- end
551
-
552
-
553
- #(prob_list_by_min, prob_list_by_best10)
554
- # same as pep_probs_by_scan but per charge state
555
- # NOTE: you may want to hash on base_name first!
556
- def pep_probs_by_scan_charge
557
- hash = peps.hash_by(:first_scan, :last_scan, :charge)
558
- return min_and_best10(hash)
559
- end
560
-
561
- # (prob_list_by_min)
562
- # hashes on seq-charge and returns the sorted list of probabilities of top
563
- # hit per seq-charge
564
- # NOTE: you may want to hash on base_name first!
565
- def pep_probs_by_seq_charge
566
- hash = peps.hash_by(:sequence, :charge)
567
- min_peptides = hash.collect do |k,v|
568
- v.min {|a,b| a.peptide_probability <=> b.peptide_probability }
569
- end
570
- sorted_probabilities(min_peptides)
571
- end
572
-
573
- ##########################################################################
574
- # USE these if you have multiple files in your bioworks.xml file
575
- ##########################################################################
576
- # (prob_list_by_min, prob_list_by_best10)
577
- # returns 2 sorted lists of probabilities based on:
578
- # 1. best peptide hit
579
- # 2. top 10 peptide hits
580
- # on a per scan basis
581
- # NOTE: you may want to hash on base_name first!
582
- def pep_probs_by_bn_scan
583
- hash = peps.hash_by(:base_name, :first_scan, :last_scan)
584
- return min_and_best10(hash)
585
- end
586
-
587
-
588
- #(prob_list_by_min, prob_list_by_best10)
589
- # same as pep_probs_by_bn_scan but per charge state
590
- # NOTE: you may want to hash on base_name first!
591
- def pep_probs_by_bn_scan_charge
592
- hash = peps.hash_by(:base_name, :first_scan, :last_scan, :charge)
593
- return min_and_best10(hash)
594
- end
595
-
596
- # (prob_list_by_min)
597
- # hashes on seq-charge and returns the sorted list of probabilities of top
598
- # hit per seq-charge
599
- # NOTE: you may want to hash on base_name first!
600
- def pep_probs_by_bn_seq_charge
601
- hash = peps.hash_by(:base_name, :sequence, :charge)
602
- min_peptides = hash.collect do |k,v|
603
- v.min {|a,b| a.peptide_probability <=> b.peptide_probability }
604
- end
605
- sorted_probabilities(min_peptides)
606
- end
607
- end
608
-
609
- # A Generic spectraID protein
610
- module SpecID::Prot
611
- include ProteinReferenceable
612
-
613
- # probability is always a float!
614
- attr_accessor :probability, :reference, :peps
615
-
616
- def <=> (other)
617
- self.reference <=> other.reference
618
- end
619
-
620
- def inspect
621
- pep_string =
622
- if peps
623
- ", @peps(#)=#{peps.size}"
624
- end
625
- "<#{self.class} @probability=#{probability}, @reference=#{reference}#{pep_string}>"
626
- end
627
-
628
- end
629
-
630
- module SpecID::Pep
631
-
632
- Non_standard_amino_acid_char_re = /[^A-Z\.\-]/
633
-
634
- attr_accessor :prots
635
- attr_accessor :probability
636
- # full sequence: (<firstAA>.<sequence>.<last>) with '-' for no first
637
- # or last.
638
- attr_accessor :sequence
639
-
640
- # the basic amino acid sequence (no leading or trailing '.' or amino acids)
641
- # should not contain any special symbols, etc.
642
- attr_accessor :aaseq
643
- attr_accessor :charge
644
-
645
- # removes nonstandard chars with Non_standard_amino_acid_char_re
646
- # preserves A-Z and '.' and '-'
647
- def self.remove_non_amino_acids(sequence)
648
- sequence.gsub(Non_standard_amino_acid_char_re, '')
649
- end
650
-
651
- # remove_non_amino_acids && split_sequence
652
- def self.prepare_sequence(val)
653
- nv = remove_non_amino_acids(val)
654
- split_sequence(nv)
655
- end
656
-
657
- def <=>(other)
658
- aaseq <=> other.aaseq
659
- end
660
-
661
- # Returns prev, peptide, next from sequence. Parse errors return
662
- # nil,nil,nil
663
- # R.PEPTIDE.A # -> R, PEPTIDE, A
664
- # R.PEPTIDE.- # -> R, PEPTIDE, -
665
- # PEPTIDE.A # -> -, PEPTIDE, A
666
- # A.PEPTIDE # -> A, PEPTIDE, -
667
- # PEPTIDE # -> nil,nil,nil
668
- def self.split_sequence(val)
669
- peptide_prev_aa = ""; peptide = ""; peptide_next_aa = ""
670
- pieces = val.split('.')
671
- case pieces.size
672
- when 3
673
- peptide_prev_aa, peptide, peptide_next_aa = *pieces
674
- when 2
675
- if pieces[0].size > 1 ## N termini
676
- peptide_prev_aa, peptide, peptide_next_aa = '-', pieces[0], pieces[1]
677
- else ## C termini
678
- peptide_prev_aa, peptide, peptide_next_aa = pieces[0], pieces[1], '-'
679
- end
680
- when 1 ## this must be a parse error!
681
- peptide_prev_aa, peptide, peptide_next_aa = nil,nil,nil
682
- when 0
683
- peptide_prev_aa, peptide, peptide_next_aa = nil,nil,nil
684
- end
685
- return peptide_prev_aa, peptide, peptide_next_aa
686
- end
687
-
688
- ##
689
- def self.sequence_to_aaseq(sequence)
690
- after_removed = remove_non_amino_acids(sequence)
691
- pieces = after_removed.split('.')
692
- case pieces.size
693
- when 3
694
- pieces[1]
695
- when 2
696
- if pieces[0].size > 1 ## N termini
697
- pieces[0]
698
- else ## C termini
699
- pieces[1]
700
- end
701
- when 1 ## this must be a parse error!
702
- pieces[0] ## which is the peptide itself
703
- else
704
- abort "bad peptide sequence: #{sequence}"
705
- end
706
- end
707
-
708
- # This will rapidly determine the list of proteins to which the given
709
- # peptides belong. It is meant to be low level and fast (eventually),
710
- # so it asks for the data in a format amenable to this.
711
- # returns a mirror array where each entry is an array of Fasta::Prot
712
- # objects where each protein contains the sequence
713
- def self.protein_groups_by_sequence(peptide_strings_list, fasta_obj)
714
- prots = fasta_obj.prots
715
- prot_seqs = prots.map do |prot|
716
- prot.aaseq
717
- end
718
-
719
- groups = peptide_strings_list.map do |pep_seq|
720
- prot_index = 0
721
- protein_group = []
722
- prot_seqs.each do |prot_seq|
723
- if prot_seq.include? pep_seq
724
- protein_group << prots[prot_index]
725
- end
726
- prot_index += 1
727
- end
728
- protein_group
729
- end
730
-
731
- groups
732
- end
733
-
734
- # units can be :mmu, :amu, :ppm
735
- def mass_accuracy(pep, unit=:ppm, mono=true)
736
- # 10^6 * deltam accuracy/ m[measured]
737
- # i.e., theoretical mass 1000, measured 999.9: 100ppm
738
- # http://www.waters.com/WatersDivision/ContentD.asp?watersit=EGOO-66LRQD
739
- # pep.mass is the theoretical M+H of the peptide
740
- # this assumes that the deltacn value we're being told is correct, but I
741
- # have my suspicions (since the <mass> value is not accurate...)
742
-
743
- ######## TO COMPLETE (and add to spec_id..?)
744
- case unit
745
- when :ppm
746
- when :amu
747
- when :mmu
748
- end
749
- end
750
-
751
- # calls the method associated with each key and returns the value
752
- def values_at(*args)
753
- args.map do |arg|
754
- send(arg)
755
- end
756
- end
757
-
758
- def inspect
759
-
760
- prot_string =
761
- if prots
762
- ", @prots(#)=#{prots.size}"
763
- end
764
- "<#{self.class} @probability=#{probability}, @sequence=#{sequence}, @aaseq=#{aaseq}, @charge=#{charge}#{prot_string}>"
765
- end
766
-
767
- end
768
-
769
- class SpecID::GenericProt
770
- include SpecID::Prot
771
- end
772
-
773
- class SpecID::GenericPep
774
- include SpecID::Pep
775
- end
776
-
777
-
778
-