mspire 0.4.9 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (255) hide show
  1. data/README +27 -17
  2. data/changelog.txt +31 -62
  3. data/lib/ms/calc.rb +32 -0
  4. data/lib/ms/data/interleaved.rb +60 -0
  5. data/lib/ms/data/lazy_io.rb +73 -0
  6. data/lib/ms/data/lazy_string.rb +15 -0
  7. data/lib/ms/data/simple.rb +59 -0
  8. data/lib/ms/data/transposed.rb +41 -0
  9. data/lib/ms/data.rb +57 -0
  10. data/lib/ms/format/format_error.rb +12 -0
  11. data/lib/ms/spectrum.rb +25 -384
  12. data/lib/ms/support/binary_search.rb +126 -0
  13. data/lib/ms.rb +10 -10
  14. metadata +38 -350
  15. data/INSTALL +0 -58
  16. data/README.rdoc +0 -18
  17. data/Rakefile +0 -330
  18. data/bin/aafreqs.rb +0 -23
  19. data/bin/bioworks2excel.rb +0 -14
  20. data/bin/bioworks_to_pepxml.rb +0 -148
  21. data/bin/bioworks_to_pepxml_gui.rb +0 -225
  22. data/bin/fasta_shaker.rb +0 -5
  23. data/bin/filter_and_validate.rb +0 -5
  24. data/bin/gi2annot.rb +0 -14
  25. data/bin/id_class_anal.rb +0 -112
  26. data/bin/id_precision.rb +0 -172
  27. data/bin/ms_to_lmat.rb +0 -67
  28. data/bin/pepproph_filter.rb +0 -16
  29. data/bin/prob_validate.rb +0 -6
  30. data/bin/protein_summary.rb +0 -6
  31. data/bin/protxml2prots_peps.rb +0 -32
  32. data/bin/raw_to_mzXML.rb +0 -55
  33. data/bin/run_percolator.rb +0 -122
  34. data/bin/sqt_group.rb +0 -26
  35. data/bin/srf_group.rb +0 -27
  36. data/bin/srf_to_sqt.rb +0 -40
  37. data/lib/align/chams.rb +0 -78
  38. data/lib/align.rb +0 -154
  39. data/lib/archive/targz.rb +0 -94
  40. data/lib/bsearch.rb +0 -120
  41. data/lib/core_extensions.rb +0 -16
  42. data/lib/fasta.rb +0 -626
  43. data/lib/gi.rb +0 -124
  44. data/lib/group_by.rb +0 -10
  45. data/lib/index_by.rb +0 -11
  46. data/lib/merge_deep.rb +0 -21
  47. data/lib/ms/converter/mzxml.rb +0 -77
  48. data/lib/ms/gradient_program.rb +0 -170
  49. data/lib/ms/msrun.rb +0 -244
  50. data/lib/ms/msrun_index.rb +0 -108
  51. data/lib/ms/parser/mzdata/axml.rb +0 -67
  52. data/lib/ms/parser/mzdata/dom.rb +0 -175
  53. data/lib/ms/parser/mzdata/libxml.rb +0 -7
  54. data/lib/ms/parser/mzdata.rb +0 -31
  55. data/lib/ms/parser/mzxml/axml.rb +0 -70
  56. data/lib/ms/parser/mzxml/dom.rb +0 -182
  57. data/lib/ms/parser/mzxml/hpricot.rb +0 -253
  58. data/lib/ms/parser/mzxml/libxml.rb +0 -19
  59. data/lib/ms/parser/mzxml/regexp.rb +0 -122
  60. data/lib/ms/parser/mzxml/rexml.rb +0 -72
  61. data/lib/ms/parser/mzxml/xmlparser.rb +0 -248
  62. data/lib/ms/parser/mzxml.rb +0 -282
  63. data/lib/ms/parser.rb +0 -108
  64. data/lib/ms/precursor.rb +0 -25
  65. data/lib/ms/scan.rb +0 -81
  66. data/lib/mspire.rb +0 -4
  67. data/lib/pi_zero.rb +0 -244
  68. data/lib/qvalue.rb +0 -161
  69. data/lib/roc.rb +0 -187
  70. data/lib/sample_enzyme.rb +0 -160
  71. data/lib/scan_i.rb +0 -21
  72. data/lib/spec_id/aa_freqs.rb +0 -170
  73. data/lib/spec_id/bioworks.rb +0 -497
  74. data/lib/spec_id/digestor.rb +0 -138
  75. data/lib/spec_id/mass.rb +0 -179
  76. data/lib/spec_id/parser/proph.rb +0 -335
  77. data/lib/spec_id/precision/filter/cmdline.rb +0 -218
  78. data/lib/spec_id/precision/filter/interactive.rb +0 -134
  79. data/lib/spec_id/precision/filter/output.rb +0 -148
  80. data/lib/spec_id/precision/filter.rb +0 -637
  81. data/lib/spec_id/precision/output.rb +0 -60
  82. data/lib/spec_id/precision/prob/cmdline.rb +0 -160
  83. data/lib/spec_id/precision/prob/output.rb +0 -94
  84. data/lib/spec_id/precision/prob.rb +0 -249
  85. data/lib/spec_id/proph/pep_summary.rb +0 -104
  86. data/lib/spec_id/proph/prot_summary.rb +0 -484
  87. data/lib/spec_id/proph.rb +0 -4
  88. data/lib/spec_id/protein_summary.rb +0 -489
  89. data/lib/spec_id/sequest/params.rb +0 -316
  90. data/lib/spec_id/sequest/pepxml.rb +0 -1458
  91. data/lib/spec_id/sequest.rb +0 -33
  92. data/lib/spec_id/sqt.rb +0 -349
  93. data/lib/spec_id/srf.rb +0 -973
  94. data/lib/spec_id.rb +0 -778
  95. data/lib/spec_id_xml.rb +0 -99
  96. data/lib/transmem/phobius.rb +0 -147
  97. data/lib/transmem/toppred.rb +0 -368
  98. data/lib/transmem.rb +0 -157
  99. data/lib/validator/aa.rb +0 -48
  100. data/lib/validator/aa_est.rb +0 -112
  101. data/lib/validator/background.rb +0 -77
  102. data/lib/validator/bias.rb +0 -95
  103. data/lib/validator/cmdline.rb +0 -431
  104. data/lib/validator/decoy.rb +0 -107
  105. data/lib/validator/digestion_based.rb +0 -70
  106. data/lib/validator/probability.rb +0 -51
  107. data/lib/validator/prot_from_pep.rb +0 -234
  108. data/lib/validator/q_value.rb +0 -32
  109. data/lib/validator/transmem.rb +0 -272
  110. data/lib/validator/true_pos.rb +0 -46
  111. data/lib/validator.rb +0 -197
  112. data/lib/xml.rb +0 -38
  113. data/lib/xml_style_parser.rb +0 -119
  114. data/lib/xmlparser_wrapper.rb +0 -19
  115. data/release_notes.txt +0 -2
  116. data/script/compile_and_plot_smriti_final.rb +0 -97
  117. data/script/create_little_pepxml.rb +0 -61
  118. data/script/degenerate_peptides.rb +0 -47
  119. data/script/estimate_fpr_by_cysteine.rb +0 -226
  120. data/script/extract_gradient_programs.rb +0 -56
  121. data/script/find_cysteine_background.rb +0 -137
  122. data/script/genuine_tps_and_probs.rb +0 -136
  123. data/script/get_apex_values_rexml.rb +0 -44
  124. data/script/histogram_probs.rb +0 -61
  125. data/script/mascot_fix_pepxml.rb +0 -123
  126. data/script/msvis.rb +0 -42
  127. data/script/mzXML2timeIndex.rb +0 -25
  128. data/script/peps_per_bin.rb +0 -67
  129. data/script/prep_dir.rb +0 -121
  130. data/script/simple_protein_digestion.rb +0 -27
  131. data/script/smriti_final_analysis.rb +0 -103
  132. data/script/sqt_to_meta.rb +0 -24
  133. data/script/top_hit_per_scan.rb +0 -67
  134. data/script/toppred_to_yaml.rb +0 -47
  135. data/script/tpp_installer.rb +0 -249
  136. data/specs/align_spec.rb +0 -79
  137. data/specs/bin/bioworks_to_pepxml_spec.rb +0 -79
  138. data/specs/bin/fasta_shaker_spec.rb +0 -259
  139. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +0 -199
  140. data/specs/bin/filter_and_validate_spec.rb +0 -180
  141. data/specs/bin/ms_to_lmat_spec.rb +0 -34
  142. data/specs/bin/prob_validate_spec.rb +0 -86
  143. data/specs/bin/protein_summary_spec.rb +0 -14
  144. data/specs/fasta_spec.rb +0 -354
  145. data/specs/gi_spec.rb +0 -22
  146. data/specs/load_bin_path.rb +0 -7
  147. data/specs/merge_deep_spec.rb +0 -13
  148. data/specs/ms/gradient_program_spec.rb +0 -77
  149. data/specs/ms/msrun_spec.rb +0 -498
  150. data/specs/ms/parser_spec.rb +0 -92
  151. data/specs/ms/spectrum_spec.rb +0 -87
  152. data/specs/pi_zero_spec.rb +0 -115
  153. data/specs/qvalue_spec.rb +0 -39
  154. data/specs/roc_spec.rb +0 -251
  155. data/specs/rspec_autotest.rb +0 -149
  156. data/specs/sample_enzyme_spec.rb +0 -126
  157. data/specs/spec_helper.rb +0 -135
  158. data/specs/spec_id/aa_freqs_spec.rb +0 -52
  159. data/specs/spec_id/bioworks_spec.rb +0 -148
  160. data/specs/spec_id/digestor_spec.rb +0 -75
  161. data/specs/spec_id/precision/filter/cmdline_spec.rb +0 -20
  162. data/specs/spec_id/precision/filter/output_spec.rb +0 -31
  163. data/specs/spec_id/precision/filter_spec.rb +0 -246
  164. data/specs/spec_id/precision/prob_spec.rb +0 -44
  165. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  166. data/specs/spec_id/proph/pep_summary_spec.rb +0 -98
  167. data/specs/spec_id/proph/prot_summary_spec.rb +0 -128
  168. data/specs/spec_id/protein_summary_spec.rb +0 -189
  169. data/specs/spec_id/sequest/params_spec.rb +0 -68
  170. data/specs/spec_id/sequest/pepxml_spec.rb +0 -374
  171. data/specs/spec_id/sequest_spec.rb +0 -38
  172. data/specs/spec_id/sqt_spec.rb +0 -246
  173. data/specs/spec_id/srf_spec.rb +0 -172
  174. data/specs/spec_id/srf_spec_helper.rb +0 -139
  175. data/specs/spec_id_helper.rb +0 -33
  176. data/specs/spec_id_spec.rb +0 -366
  177. data/specs/spec_id_xml_spec.rb +0 -33
  178. data/specs/transmem/phobius_spec.rb +0 -425
  179. data/specs/transmem/toppred_spec.rb +0 -298
  180. data/specs/transmem_spec.rb +0 -60
  181. data/specs/transmem_spec_shared.rb +0 -64
  182. data/specs/validator/aa_est_spec.rb +0 -66
  183. data/specs/validator/aa_spec.rb +0 -40
  184. data/specs/validator/background_spec.rb +0 -67
  185. data/specs/validator/bias_spec.rb +0 -122
  186. data/specs/validator/decoy_spec.rb +0 -51
  187. data/specs/validator/fasta_helper.rb +0 -26
  188. data/specs/validator/prot_from_pep_spec.rb +0 -141
  189. data/specs/validator/transmem_spec.rb +0 -146
  190. data/specs/validator/true_pos_spec.rb +0 -58
  191. data/specs/validator_helper.rb +0 -33
  192. data/specs/xml_spec.rb +0 -12
  193. data/test_files/000_pepxml18_small.xml +0 -206
  194. data/test_files/020a.mzXML.timeIndex +0 -4710
  195. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +0 -3973
  196. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +0 -3872
  197. data/test_files/4-03-03_small-prot.xml +0 -321
  198. data/test_files/4-03-03_small.xml +0 -3876
  199. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  200. data/test_files/bioworks-3.3_10prots.xml +0 -5999
  201. data/test_files/bioworks31.params +0 -77
  202. data/test_files/bioworks32.params +0 -62
  203. data/test_files/bioworks33.params +0 -63
  204. data/test_files/bioworks_single_run_small.xml +0 -7237
  205. data/test_files/bioworks_small.fasta +0 -212
  206. data/test_files/bioworks_small.params +0 -63
  207. data/test_files/bioworks_small.phobius +0 -109
  208. data/test_files/bioworks_small.toppred.out +0 -2847
  209. data/test_files/bioworks_small.xml +0 -5610
  210. data/test_files/bioworks_with_INV_small.xml +0 -3753
  211. data/test_files/bioworks_with_SHUFF_small.xml +0 -2503
  212. data/test_files/corrupted_900.srf +0 -0
  213. data/test_files/head_of_7MIX.srf +0 -0
  214. data/test_files/interact-opd1_mods_small-prot.xml +0 -304
  215. data/test_files/messups.fasta +0 -297
  216. data/test_files/opd1/000.my_answer.100lines.xml +0 -101
  217. data/test_files/opd1/000.tpp_1.2.3.first10.xml +0 -115
  218. data/test_files/opd1/000.tpp_2.9.2.first10.xml +0 -126
  219. data/test_files/opd1/000.v2.1.mzXML.timeIndex +0 -3748
  220. data/test_files/opd1/000_020-prot.png +0 -0
  221. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +0 -62
  222. data/test_files/opd1/000_020_3prots-prot.xml +0 -62
  223. data/test_files/opd1/opd1_cat_inv_small-prot.xml +0 -139
  224. data/test_files/opd1/sequest.3.1.params +0 -77
  225. data/test_files/opd1/sequest.3.2.params +0 -62
  226. data/test_files/opd1/twenty_scans.mzXML +0 -418
  227. data/test_files/opd1/twenty_scans.v2.1.mzXML +0 -382
  228. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  229. data/test_files/opd1/twenty_scans_answ.lmata +0 -9
  230. data/test_files/opd1_020_beginning.RAW +0 -0
  231. data/test_files/opd1_2runs_2mods/data/020.mzData.xml +0 -683
  232. data/test_files/opd1_2runs_2mods/data/020.readw.mzXML +0 -382
  233. data/test_files/opd1_2runs_2mods/data/040.mzData.xml +0 -683
  234. data/test_files/opd1_2runs_2mods/data/040.readw.mzXML +0 -382
  235. data/test_files/opd1_2runs_2mods/data/README.txt +0 -6
  236. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +0 -753
  237. data/test_files/orbitrap_mzData/000_cut.xml +0 -1920
  238. data/test_files/pepproph_small.xml +0 -4691
  239. data/test_files/phobius.small.noheader.txt +0 -50
  240. data/test_files/phobius.small.small.txt +0 -53
  241. data/test_files/s01_anC1_ld020mM.key.txt +0 -25
  242. data/test_files/s01_anC1_ld020mM.meth +0 -0
  243. data/test_files/small.fasta +0 -297
  244. data/test_files/small.sqt +0 -87
  245. data/test_files/smallraw.RAW +0 -0
  246. data/test_files/tf_bioworks2excel.bioXML +0 -14340
  247. data/test_files/tf_bioworks2excel.txt.actual +0 -1035
  248. data/test_files/toppred.small.out +0 -416
  249. data/test_files/toppred.xml.out +0 -318
  250. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +0 -7
  251. data/test_files/validator_hits_separate/bioworks_small_HS.xml +0 -5651
  252. data/test_files/yeast_gly_small-prot.xml +0 -265
  253. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +0 -6
  254. data/test_files/yeast_gly_small.xml +0 -3807
  255. data/test_files/yeast_gly_small2.parentTimes +0 -6
data/lib/spec_id.rb DELETED
@@ -1,778 +0,0 @@
1
- require 'ostruct'
2
- require 'set'
3
- require 'hash_by'
4
- require 'roc'
5
- require 'sample_enzyme' # for others
6
- require 'spec_id/bioworks'
7
- require 'spec_id/sequest'
8
-
9
- require 'spec_id/proph/prot_summary'
10
- require 'spec_id/proph/pep_summary'
11
-
12
- require 'spec_id_xml'
13
- require 'spec_id/sqt'
14
- require 'spec_id/mass'
15
- require 'fasta'
16
-
17
-
18
-
19
- module ProteinReferenceable ; end
20
-
21
- class SampleEnzyme ; end
22
-
23
-
24
- module SpecID ; end
25
-
26
- class GenericSpecID ; include SpecID ; end
27
-
28
- module SpecID
29
- MONO = Mass::MONO
30
- AVG = Mass::AVG
31
-
32
- attr_accessor :peps, :prots
33
- # True if a high protein/peptide score is better than low, false otherwise
34
- # This is set automatically for known file types
35
- attr_accessor :hi_prob_best
36
-
37
- # A relative pathname of the file the specid object is derived from
38
- attr_accessor :filename
39
-
40
# Factory for spectral-identification objects.
#
# file:: nil, a single filename, or an Array (assumed to be a group of srf
#        filenames)
# tp::   optional file type, passed through to from_file
#
# Returns SRFGroup.new(file) when file is an Array, the result of
# from_file(file, tp) when any other truthy file is given, and a
# GenericSpecID when no file is given.
def self.new(file=nil, tp=nil)
  # this will need to be specialized for other groups later
  return SRFGroup.new(file) if file.is_a?(Array)
  return from_file(file, tp) if file

  GenericSpecID.new
end
57
-
58
# Builds the object that corresponds to the type of +file+.
#
# file:: path handed to the constructor for the detected type
# tp::   file type string; when nil/false it is determined with
#        file_type(file)
#
# A single 'srf' or 'sqt' file is wrapped in a one-element array before being
# given to SRFGroup / SQTGroup. Aborts the process when the type is not one
# of the recognized strings.
def self.from_file(file, tp=nil)
  tp ||= file_type(file)
  case tp
  when 'srf'       then SRFGroup.new([file])
  when 'srg'       then SRFGroup.new(file)
  when 'bioworks'  then Bioworks.new(file)
  when 'protproph' then Proph::ProtSummary.new(file)
  when 'pepproph'  then Proph::PepSummary.new(file)
  when 'sqg'       then SQTGroup.new(file)
  when 'sqt'       then SQTGroup.new([file])
  else
    abort "UNRECOGNIZED file type for #{file}"
  end
end
89
-
90
# Short description: the class name plus the number of peptide hits, or
# "peps=(nil)" when no peptides are set.
def inspect
  summary = peps ? "peps(#)=#{peps.size}" : "peps=(nil)"
  "<#{self.class} #{summary}>"
end
99
-
100
# Given a list of peptide hits (objects responding to #prots), returns the
# proteins associated with those peptides, one per distinct reference, in
# order of first appearance.
#
# kind must be a symbol:
#   :no_update  current proteins are returned; their peps attribute is left
#               untouched (any other unrecognized value behaves the same)
#   :update     current proteins are returned with peps reset and refilled
#               from pephits
#   :new        duplicates of the proteins are returned, each with a fresh
#               peps array filled from pephits; neither the given peptides
#               nor the original proteins are modified
#
# When several distinct protein objects share a reference, :no_update and
# :update return the last one encountered.
def self.protein_list(pephits, kind=:no_update)
  if kind == :new
    # Build the copies directly instead of temporarily re-pointing
    # pep.prots at them, so the caller's peptides are never mutated (the
    # old swap-and-restore left them corrupted if anything raised midway).
    new_prots = {}
    pephits.each do |pep|
      pep.prots.each do |prt|
        copy = new_prots[prt.reference]
        unless copy
          copy = prt.dup
          copy.peps = []
          new_prots[copy.reference] = copy
        end
        copy.peps << pep
      end
    end
    return new_prots.values
  end

  updating = (kind == :update)
  if updating
    # clear everything first so a protein shared by several peptides
    # accumulates all of them below
    pephits.each do |pep|
      pep.prots.each { |prt| prt.peps = [] }
    end
  end

  prot_set = {}
  pephits.each do |pep|
    pep.prots.each do |prt|
      prot_set[prt.reference] = prt
      prt.peps << pep if updating
    end
  end
  prot_set.values
end
158
-
159
-
160
-
161
# Takes a comma separated String or an Array and returns a new array of
# exactly desired_size elements. Missing (nil) positions are filled with the
# most recent preceding value; extra elements beyond desired_size are
# dropped.
#
#   extend_args("1,2", 4)   # => ["1", "2", "2", "2"]
#
# An explicit +false+ element is a real value and is preserved (a plain
# truthiness test would overwrite it with the previous element).
def self.extend_args(arg, desired_size)
  values = arg.is_a?(String) ? arg.split(',') : arg
  last = values[0]
  Array.new(desired_size) do |i|
    last = values[i] unless values[i].nil?
    last
  end
end
180
-
181
# Takes an array of proteins (each responding to #peps, whose peptides
# respond to #aaseq and #xcorr) and assigns every distinct peptide sequence
# to exactly one protein.
#
# Proteins are visited from "richest" to "poorest": most unique aaseqs
# first, then most peptide hits, then by their ascending-sorted xcorr lists,
# then by the proteins' own <=>. A protein keeps only the sequences that no
# earlier protein has claimed; proteins left with nothing are omitted. So
# the rich stay rich, and the poor get poorer.
#
# Returns an array of triplets [prot, pep_hits, uniq_aaseqs] (uniq_aaseqs is
# an Array) for each protein that ends up with >= 1 peptide. prot.peps is
# only modified when update_prots is true; pep.prots is never modified.
#
# NOTE(review): the xcorr lists are compared lowest-value first, although
# the historical comment spoke of "top xcorr(s)" -- confirm which is wanted.
def self.occams_razor(array_of_prots, update_prots=false)
  claimed = Set.new

  ranked = array_of_prots.map do |prot|
    hits = prot.peps
    by_aaseq = hits.group_by { |pep| pep.aaseq }
    uniq_aaseqs = Set.new(by_aaseq.keys)
    xcorrs = hits.map { |pep| pep.xcorr }.sort
    [uniq_aaseqs.size, hits.size, xcorrs, prot, uniq_aaseqs, by_aaseq]
  end

  # Only the first four fields are orderable; the Set and Hash are payload
  # and must not take part in the comparison.
  ordered = ranked.sort_by { |entry| entry[0, 4] }.reverse

  prot_triplets = []
  ordered.each do |_nuniq, _nhits, _xcorrs, prot, uniq_aaseqs, by_aaseq|
    fresh = uniq_aaseqs - claimed
    next if fresh.empty?

    claimed.merge(fresh)
    pep_hits = fresh.flat_map { |seq| by_aaseq[seq] }
    prot_triplets << [prot, pep_hits, fresh.to_a]
    prot.peps = pep_hits if update_prots
  end
  prot_triplets
end
227
-
228
# Thin wrapper around ROC#by_tps: forwards the classification method, the
# tp values and the fp values to a new ROC object and returns whatever it
# returns (described historically as the number of true positives and the
# requested output as parallel arrays, given sorted tp values).
def by_tps(classification_method, tp, fp)
  roc = ROC.new
  roc.by_tps(classification_method, tp, fp)
end
235
-
236
# From the set of peptide hits (#peps), creates a separate peptide hit for
# each protein it references, where each new hit references only that one
# protein, e.g. new_pep.prots = [(a single protein)].
#
# Returns the new hits in a flat array. The peptides in #peps are left
# untouched: each returned hit is a shallow copy (dup) of its source.
def pep_prots
  peps.flat_map do |pep|
    pep.prots.map do |prt|
      single = pep.dup
      single.prots = [prt]
      single
    end
  end
end
250
-
251
# True when the first element of ar is a SpecID::Prot (only the first
# element is inspected).
def self.prots?(ar)
  head = ar.first
  head.is_a?(SpecID::Prot)
end
254
-
255
# True when the first element of ar is a SpecID::Pep (only the first
# element is inspected).
def self.peps?(ar)
  head = ar.first
  head.is_a?(SpecID::Pep)
end
258
-
259
# (for older stuff) Splits this object's proteins or peptides into target
# and decoy lists by matching protein references against regex.
#
# items::          :prots or :peps, selecting which collection to classify
# regex::          pattern tested against protein references
# decoy_on_match:: forwarded to SpecID.classify_by_prot
# ties::           forwarded to SpecID.classify_by_prot
#
# Returns whatever SpecID.classify_by_prot returns.
# Raises ArgumentError when items is neither :prots nor :peps (previously
# this surfaced later as a NoMethodError on nil).
def classify_by_regex(items, regex, decoy_on_match=true, ties=:both)
  objects =
    case items
    when :prots then prots
    when :peps then peps
    else
      raise ArgumentError, "items must be :prots or :peps (got #{items.inspect})"
    end
  SpecID.classify_by_prot(objects, regex, decoy_on_match, ties)
end
270
-
271
# Splits proteins or peptide hits into targets and decoys according to
# whether protein references match regex.
#
# items::          array of proteins or of peptide hits (decided by
#                  prots?/peps? on the array)
# regex::          pattern tested against each protein's reference
# decoy_on_match:: when truthy a matching reference marks a decoy, otherwise
#                  a matching reference marks a target
# ties::           (peptides only) what to do with a peptide that has both
#                  matching and non-matching proteins:
#                  :both (it appears in both lists), true (target wins),
#                  false (decoy wins)
#
# Returns (target, decoy). An empty items array gives [[], []].
#
# For a tied peptide the protein list is always partitioned: the given
# peptide keeps only its matching proteins (it is modified in place) and a
# dup of it carries the non-matching proteins; each list receives the
# object whose proteins are appropriate for that list.
#
# Raises ArgumentError for an unrecognized ties value (only when a tie is
# actually encountered) or when items holds neither proteins nor peptides.
def self.classify_by_prot(items, regex, decoy_on_match=true, ties=:both)
  return [[], []] if items.size == 0

  if prots?(items)
    # true from the block == target, so the pair is already (target, decoy)
    return items.partition do |prt|
      (prt.reference =~ regex) ? !decoy_on_match : decoy_on_match
    end
  end

  unless peps?(items)
    raise ArgumentError, "arg1 is ar of objects descended from SpecID::Prot/Pep"
  end

  match = []
  nomatch = []
  items.each do |pep|
    match_prots, nomatch_prots = pep.prots.partition { |prot| prot.reference =~ regex }
    if match_prots.size == 0
      nomatch << pep
    elsif nomatch_prots.size == 0
      match << pep
    else ## both have hits
      pep.prots = match_prots
      nomatch_pep = pep.dup
      nomatch_pep.prots = nomatch_prots

      # resolve ties
      case ties
      when :both
        match << pep
        nomatch << nomatch_pep
      when true, false
        # ties == true  => the target side keeps the peptide
        # ties == false => the decoy side keeps the peptide
        # and the target side is the match side exactly when !decoy_on_match
        if ties == !decoy_on_match
          match << pep
        else
          nomatch << nomatch_pep
        end
      else
        raise ArgumentError
      end
    end
  end

  decoy_on_match ? [nomatch, match] : [match, nomatch]
end
330
-
331
-
332
-
333
# Classifies items (:prots or :peps) by a literal decoy flag found in the
# protein reference and returns the result of classify_by_regex.
#
# flag::           literal text (regexp-escaped before use)
# decoy_on_match:: forwarded to classify_by_regex
# prefix::         when truthy the flag must occur at the start of a line of
#                  the reference; otherwise it may occur anywhere
#
# This may result in a duplication of some peptides if they match both
# normal and decoy proteins. In this case, the protein arrays are split,
# too, so that each points only to its breed of protein.
def classify_by_decoy_flag(items, flag, decoy_on_match=true, prefix=false)
  escaped = Regexp.escape(flag)
  regex = prefix ? /^#{escaped}/ : /#{escaped}/
  classify_by_regex(items, regex, decoy_on_match)
end
346
-
347
# Returns (match, nomatch): two arrays, the items for which
# classify_item_by returns a truthy value and those for which it does not,
# each in original order.
#
# items::            symbol naming the collection to fetch via send
#                    (:prots or :peps)
# classify_item_by:: callable taking one item
def classify(items, classify_item_by)
  send(items).partition { |item| classify_item_by.call(item) }
end
363
-
364
# Builds a [rank, classification] doublet for every item and hands the list
# to ROC#doublets_to_separate, returning its first two results as (tp, fp).
# Historically described as: true positives and false positives (determined
# by classify_item_by), ranked from lowest to highest by rank_item_by.
#
# items::            symbol naming the collection to fetch via send
#                    (:prots or :peps)
# rank_item_by::     callable giving the ranking value of an item
# classify_item_by:: callable giving the classification of an item
def rank_and_classify(items, rank_item_by, classify_item_by)
  doublets = send(items).map do |item|
    [rank_item_by.call(item), classify_item_by.call(item)]
  end
  tp, fp = ROC.new.doublets_to_separate(doublets)
  [tp, fp]
end
379
-
380
-
381
# Returns a proc that maps an item to a sort key such that an ascending
# sort puts the best scores first: the negated probability when
# hi_prob_best is set, the plain probability otherwise.
def probability_proc
  return proc { |prt| prt.probability * -1 } if hi_prob_best

  proc { |prt| prt.probability }
end
391
-
392
# Unfinished: fetches the collection named by items and then does nothing
# for :prots (returns nil); aborts the process for :peps and for any other
# value. fp_prefix is currently unused.
def separate_by_prefix(items, fp_prefix)
  send(items) # fetched as before; the result is not used yet

  return if items == :prots

  abort "not implemented yet" if items == :peps
  abort "no other items recognized yet"
end
402
-
403
# Sorts the protein probabilities from highest to lowest and, for each
# cumulative cutoff, calculates the number of proteins and the predicted
# precision (by summing probabilities):
#
#   one_minus_ppv = SUM(1-probX)/#prots   (what is commonly and mistakenly
#                                          called false positive rate)
#   ppv           = 1 - one_minus_ppv
#
# Returns [num_prots, ppv], two parallel arrays where num_prots is
# 1..prots.size.
def num_hits_and_ppv_for_protein_prophet_probabilities
  ranked = prots.map { |prot| prot.probability }.sort.reverse
  sum_one_minus_prob = 0.0
  ppv = ranked.each_with_index.map do |prob, i|
    sum_one_minus_prob += 1.0 - prob
    1.0 - (sum_one_minus_prob / (i + 1))
  end
  [(1..ranked.size).to_a, ppv]
end
425
-
426
# Convenience method for the common task of determining precision for
# proteins, where decoy proteins are those whose reference contains
# false_flag (at the start of a line of the reference when prefix is
# truthy).
#
# Proteins are ranked with probability_proc and separated with
# rank_and_classify; the two lists are passed to
# DecoyROC#pred_and_tps_and_ppv.
#
# Returns (num_hits, precision).
def num_hits_and_ppv_for_prob(false_flag, prefix=false)
  escaped = Regexp.escape(false_flag)
  regex = prefix ? /^#{escaped}/ : /#{escaped}/
  prob_proc = probability_proc
  # a protein is "real" when its reference does NOT carry the flag
  is_real = proc { |prt| !(prt.reference =~ regex) }

  real_hits, decoy_hits = rank_and_classify(:prots, prob_proc, is_real)

  num_hits, _num_tps, precision = DecoyROC.new.pred_and_tps_and_ppv(real_hits, decoy_hits)
  [num_hits, precision]
end
446
-
447
- # # takes the existing spec_id object and marshals it into "file.msh"
448
- # # a new file will always look for a file.msh to load
449
- # def marshal(force=false)
450
- # if !(File.exist? @marshal_file)| force
451
- # File.open(@marshal_file, 'w') {|out| Marshal.dump(@obj, out) }
452
- # end
453
- # end
454
-
455
# Determines the type of a search-results file. Returns one of:
#
#   'srg'       filename ends in .srg (search results group file)
#   'sqg'       filename ends in .sqg
#   'srf'       the 7 bytes at offset 438 read 'Enzyme:'
#   'bioworks'  '<bioworksinfo>' occurs in the first 8 lines
#   'protproph' '<protein_summary' plus one of the
#               Proph::ProtSummary filetype/version patterns occurs in the
#               first 8 lines
#   'pepproph'  '<msms_pipeline_analysis' followed by
#               '<peptideprophet_summary' occurs in the first 8 lines
#   'sqt'       both 'H\tDatabase' and 'H\tSQTGenerator' header lines occur
#               in the first 208 lines
#
# Returns nil when nothing matches. Files shorter than 8 lines are handled
# (reading simply stops at end of file).
def self.file_type(file)
  return 'srg' if file =~ /\.srg$/
  return 'sqg' if file =~ /\.sqg$/

  # File.read (rather than IO.read) so the path is never treated as a
  # command; yields nil when the file is shorter than the offset.
  return 'srf' if File.read(file, 7, 438) == 'Enzyme:'

  File.open(file) do |fh|
    lines = ""
    8.times do
      line = fh.gets
      break unless line
      lines << line
    end

    if lines =~ /<bioworksinfo>/
      return 'bioworks'
    elsif (lines =~ /<protein_summary/) &&
          ((lines =~ Proph::ProtSummary::Filetype_and_version_re_old) ||
           (lines =~ Proph::ProtSummary::Filetype_and_version_re_new))
      return 'protproph'
    elsif lines =~ /<msms_pipeline_analysis.*<peptideprophet_summary/m
      return 'pepproph'
    end

    # assumes the header of a sqt file is less than 200 lines ...
    200.times do
      line = fh.gets
      break unless line
      lines << line
    end
    return 'sqt' if lines =~ /^H\tDatabase/ && lines =~ /^H\tSQTGenerator/
  end
  nil
end
489
-
490
-
491
- ##############################################
492
- # These are pretty specific to Smriti's needs:
493
-
494
# Given a hash of peptide arrays keyed by some attribute, returns two sorted
# probability lists (each produced by sorted_probabilities):
#
# 1. from the single peptide with the minimum peptide_probability in each
#    array
# 2. from the 10 peptides with the lowest peptide_probability in each array
def min_and_best10(hash)
  ## choose the min probability per group and sort by prob
  best_per_group = hash.map do |_key, group|
    group.min { |a, b| a.peptide_probability <=> b.peptide_probability }
  end
  min_sorted_peps = sorted_probabilities(best_per_group)

  ## up to ten lowest per group, pooled
  pooled_tens = hash.values.flat_map do |group|
    group.sort_by { |pep| pep.peptide_probability }.slice(0, 10)
  end
  top_10_sorted_peps = sorted_probabilities(pooled_tens)

  [min_sorted_peps, top_10_sorted_peps]
end
520
-
521
# Returns the probabilities (#probability) of the given peptides as a new
# array sorted in ascending order.
def sorted_probabilities(peptides)
  peptides.map(&:probability).sort
end
529
-
530
# Returns the sorted list of probabilities for all peptide hits in #peps
# (historically described as "all pepprots", i.e. a peptide associated with
# a protein).
def pep_probs_by_pep_prots
  hits = peps
  sorted_probabilities(hits)
end
535
-
536
- ##########################################################################
537
- # WARNING! These might be dangerous to your health if there are multiple
538
- # files collected in your bioworks file
539
- ##########################################################################
540
-
541
# (prob_list_by_min, prob_list_by_best10)
# Groups the peptide hits with hash_by(:first_scan, :last_scan) --
# presumably one group per scan range -- and returns the two sorted
# probability lists from min_and_best10:
#   1. best peptide hit
#   2. top 10 peptide hits
# NOTE: base_name is not part of the key, so hits from different runs with
# the same scan numbers share a group; see pep_probs_by_bn_scan.
def pep_probs_by_scan
  by_scan = peps.hash_by(:first_scan, :last_scan)
  min_and_best10(by_scan)
end
551
-
552
-
553
# (prob_list_by_min, prob_list_by_best10)
# Same as pep_probs_by_scan but the grouping key also includes :charge.
# NOTE: base_name is not part of the key; see pep_probs_by_bn_scan_charge.
def pep_probs_by_scan_charge
  by_scan_charge = peps.hash_by(:first_scan, :last_scan, :charge)
  min_and_best10(by_scan_charge)
end
560
-
561
# (prob_list_by_min)
# Groups the peptide hits with hash_by(:sequence, :charge), takes the hit
# with the minimum peptide_probability from each group and returns the
# sorted probabilities of those hits.
# NOTE: base_name is not part of the key; see pep_probs_by_bn_seq_charge.
def pep_probs_by_seq_charge
  groups = peps.hash_by(:sequence, :charge)
  top_hits = groups.map do |_key, group|
    group.min { |a, b| a.peptide_probability <=> b.peptide_probability }
  end
  sorted_probabilities(top_hits)
end
572
-
573
- ##########################################################################
574
- # USE these if you have multiple files in your bioworks.xml file
575
- ##########################################################################
576
# (prob_list_by_min, prob_list_by_best10)
# Groups the peptide hits with hash_by(:base_name, :first_scan, :last_scan)
# and returns the two sorted probability lists from min_and_best10:
#   1. best peptide hit
#   2. top 10 peptide hits
# base_name is part of the grouping key here.
def pep_probs_by_bn_scan
  by_bn_scan = peps.hash_by(:base_name, :first_scan, :last_scan)
  min_and_best10(by_bn_scan)
end
586
-
587
-
588
# (prob_list_by_min, prob_list_by_best10)
# Same as pep_probs_by_bn_scan but the grouping key also includes :charge.
def pep_probs_by_bn_scan_charge
  by_bn_scan_charge = peps.hash_by(:base_name, :first_scan, :last_scan, :charge)
  min_and_best10(by_bn_scan_charge)
end
595
-
596
# (prob_list_by_min)
# Groups the peptide hits with hash_by(:base_name, :sequence, :charge),
# takes the hit with the minimum peptide_probability from each group and
# returns the sorted probabilities of those hits.
def pep_probs_by_bn_seq_charge
  groups = peps.hash_by(:base_name, :sequence, :charge)
  top_hits = groups.map do |_key, group|
    group.min { |a, b| a.peptide_probability <=> b.peptide_probability }
  end
  sorted_probabilities(top_hits)
end
607
- end
608
-
609
# A generic spectraID protein: a mixin providing probability, reference and
# peps accessors, ordering by reference, and a compact #inspect.
module SpecID::Prot
  include ProteinReferenceable

  # probability is always a float!
  attr_accessor :probability, :reference, :peps

  # Proteins compare by their reference.
  def <=>(other)
    reference <=> other.reference
  end

  # Shows class, probability and reference, plus the number of peptides
  # when peps is set.
  def inspect
    pep_string = ", @peps(#)=#{peps.size}" if peps
    "<#{self.class} @probability=#{probability}, @reference=#{reference}#{pep_string}>"
  end
end
629
-
630
# Behaviour shared by peptide identifications: accessors, helpers for parsing
# "<prev>.<PEPTIDE>.<next>" style sequences, ordering by bare amino-acid
# sequence, and a compact #inspect.
module SpecID::Pep

  # Matches every character that is not an uppercase letter, '.' or '-'.
  Non_standard_amino_acid_char_re = /[^A-Z\.\-]/

  attr_accessor :prots
  attr_accessor :probability
  # full sequence: (<firstAA>.<sequence>.<last>) with '-' for no first
  # or last.
  attr_accessor :sequence

  # the basic amino acid sequence (no leading or trailing '.' or amino acids)
  # should not contain any special symbols, etc.
  attr_accessor :aaseq
  attr_accessor :charge

  # Returns a copy of +sequence+ with every character matched by
  # Non_standard_amino_acid_char_re removed (A-Z, '.' and '-' are preserved).
  def self.remove_non_amino_acids(sequence)
    sequence.gsub(Non_standard_amino_acid_char_re, '')
  end

  # remove_non_amino_acids followed by split_sequence.
  # Returns the [prev, peptide, next] triple produced by split_sequence.
  def self.prepare_sequence(val)
    split_sequence(remove_non_amino_acids(val))
  end

  # Peptides compare by their bare amino-acid sequence.
  def <=>(other)
    aaseq <=> other.aaseq
  end

  # Returns [prev, peptide, next] from a dot-delimited sequence string.
  # Anything that cannot be split into two or three pieces is a parse error
  # and yields [nil, nil, nil]:
  #   R.PEPTIDE.A  # -> R, PEPTIDE, A
  #   R.PEPTIDE.-  # -> R, PEPTIDE, -
  #   PEPTIDE.A    # -> -, PEPTIDE, A
  #   A.PEPTIDE    # -> A, PEPTIDE, -
  #   PEPTIDE      # -> nil, nil, nil
  #   A.PEP.TIDE.R # -> nil, nil, nil
  def self.split_sequence(val)
    pieces = val.split('.')
    case pieces.size
    when 3
      pieces
    when 2
      # With only two pieces, a first piece longer than one character is
      # taken to be the peptide itself (no previous residue was given);
      # otherwise the first piece is the previous residue.
      if pieces[0].size > 1
        ['-', pieces[0], pieces[1]]
      else
        [pieces[0], pieces[1], '-']
      end
    else
      # 0, 1 or more than 3 pieces: parse error. Previously the >3 case fell
      # through every branch and leaked three empty strings instead of the
      # documented nil triple.
      [nil, nil, nil]
    end
  end

  # Strips non-standard characters from +sequence+ and returns only the
  # peptide portion (no flanking residues). A string without any '.' is
  # returned as the peptide itself.
  #
  # NOTE(review): on an unparseable sequence this calls Kernel#abort, which
  # terminates the process. Kept as-is because callers may rely on it, but a
  # raised exception would be friendlier for library use.
  def self.sequence_to_aaseq(sequence)
    pieces = remove_non_amino_acids(sequence).split('.')
    case pieces.size
    when 3
      pieces[1]
    when 2
      # Same two-piece heuristic as split_sequence.
      pieces[0].size > 1 ? pieces[0] : pieces[1]
    when 1
      pieces[0]
    else
      abort "bad peptide sequence: #{sequence}"
    end
  end

  # Determines, for each peptide string, the proteins that contain it.
  # peptide_strings_list:: list of peptide sequence strings
  # fasta_obj:: object responding to #prots, each prot responding to #aaseq
  # Returns a mirror array: entry i is the array of proteins (from
  # fasta_obj.prots) whose aaseq includes peptide_strings_list[i].
  def self.protein_groups_by_sequence(peptide_strings_list, fasta_obj)
    prots = fasta_obj.prots
    # Read each protein sequence once, up front, instead of once per peptide.
    prot_seqs = prots.map { |prot| prot.aaseq }

    peptide_strings_list.map do |pep_seq|
      protein_group = []
      prot_seqs.each_with_index do |prot_seq, prot_index|
        protein_group << prots[prot_index] if prot_seq.include?(pep_seq)
      end
      protein_group
    end
  end

  # units can be :mmu, :amu, :ppm
  #
  # TODO: unfinished. Every branch is empty, so this currently returns nil
  # for any unit. Notes from the original author:
  #   10^6 * deltam accuracy / m[measured]
  #   i.e., theoretical mass 1000, measured 999.9: 100ppm
  #   http://www.waters.com/WatersDivision/ContentD.asp?watersit=EGOO-66LRQD
  #   pep.mass is the theoretical M+H of the peptide
  #   this assumes that the deltacn value we're being told is correct, but I
  #   have my suspicions (since the <mass> value is not accurate...)
  def mass_accuracy(pep, unit=:ppm, mono=true)
    case unit
    when :ppm
    when :amu
    when :mmu
    end
  end

  # Calls the method named by each argument on self and returns the results
  # as an array, in argument order.
  def values_at(*args)
    args.map { |arg| send(arg) }
  end

  # Short description; the protein count is appended only when prots is set.
  def inspect
    prot_string = prots ? ", @prots(#)=#{prots.size}" : nil
    "<#{self.class} @probability=#{probability}, @sequence=#{sequence}, @aaseq=#{aaseq}, @charge=#{charge}#{prot_string}>"
  end

end
768
-
769
# Concrete protein class that adds nothing beyond the SpecID::Prot mixin.
class SpecID::GenericProt
  include SpecID::Prot
end
772
-
773
# Concrete peptide class that adds nothing beyond the SpecID::Pep mixin.
class SpecID::GenericPep
  include SpecID::Pep
end
776
-
777
-
778
-