mspire 0.4.9 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (255) hide show
  1. data/README +27 -17
  2. data/changelog.txt +31 -62
  3. data/lib/ms/calc.rb +32 -0
  4. data/lib/ms/data/interleaved.rb +60 -0
  5. data/lib/ms/data/lazy_io.rb +73 -0
  6. data/lib/ms/data/lazy_string.rb +15 -0
  7. data/lib/ms/data/simple.rb +59 -0
  8. data/lib/ms/data/transposed.rb +41 -0
  9. data/lib/ms/data.rb +57 -0
  10. data/lib/ms/format/format_error.rb +12 -0
  11. data/lib/ms/spectrum.rb +25 -384
  12. data/lib/ms/support/binary_search.rb +126 -0
  13. data/lib/ms.rb +10 -10
  14. metadata +38 -350
  15. data/INSTALL +0 -58
  16. data/README.rdoc +0 -18
  17. data/Rakefile +0 -330
  18. data/bin/aafreqs.rb +0 -23
  19. data/bin/bioworks2excel.rb +0 -14
  20. data/bin/bioworks_to_pepxml.rb +0 -148
  21. data/bin/bioworks_to_pepxml_gui.rb +0 -225
  22. data/bin/fasta_shaker.rb +0 -5
  23. data/bin/filter_and_validate.rb +0 -5
  24. data/bin/gi2annot.rb +0 -14
  25. data/bin/id_class_anal.rb +0 -112
  26. data/bin/id_precision.rb +0 -172
  27. data/bin/ms_to_lmat.rb +0 -67
  28. data/bin/pepproph_filter.rb +0 -16
  29. data/bin/prob_validate.rb +0 -6
  30. data/bin/protein_summary.rb +0 -6
  31. data/bin/protxml2prots_peps.rb +0 -32
  32. data/bin/raw_to_mzXML.rb +0 -55
  33. data/bin/run_percolator.rb +0 -122
  34. data/bin/sqt_group.rb +0 -26
  35. data/bin/srf_group.rb +0 -27
  36. data/bin/srf_to_sqt.rb +0 -40
  37. data/lib/align/chams.rb +0 -78
  38. data/lib/align.rb +0 -154
  39. data/lib/archive/targz.rb +0 -94
  40. data/lib/bsearch.rb +0 -120
  41. data/lib/core_extensions.rb +0 -16
  42. data/lib/fasta.rb +0 -626
  43. data/lib/gi.rb +0 -124
  44. data/lib/group_by.rb +0 -10
  45. data/lib/index_by.rb +0 -11
  46. data/lib/merge_deep.rb +0 -21
  47. data/lib/ms/converter/mzxml.rb +0 -77
  48. data/lib/ms/gradient_program.rb +0 -170
  49. data/lib/ms/msrun.rb +0 -244
  50. data/lib/ms/msrun_index.rb +0 -108
  51. data/lib/ms/parser/mzdata/axml.rb +0 -67
  52. data/lib/ms/parser/mzdata/dom.rb +0 -175
  53. data/lib/ms/parser/mzdata/libxml.rb +0 -7
  54. data/lib/ms/parser/mzdata.rb +0 -31
  55. data/lib/ms/parser/mzxml/axml.rb +0 -70
  56. data/lib/ms/parser/mzxml/dom.rb +0 -182
  57. data/lib/ms/parser/mzxml/hpricot.rb +0 -253
  58. data/lib/ms/parser/mzxml/libxml.rb +0 -19
  59. data/lib/ms/parser/mzxml/regexp.rb +0 -122
  60. data/lib/ms/parser/mzxml/rexml.rb +0 -72
  61. data/lib/ms/parser/mzxml/xmlparser.rb +0 -248
  62. data/lib/ms/parser/mzxml.rb +0 -282
  63. data/lib/ms/parser.rb +0 -108
  64. data/lib/ms/precursor.rb +0 -25
  65. data/lib/ms/scan.rb +0 -81
  66. data/lib/mspire.rb +0 -4
  67. data/lib/pi_zero.rb +0 -244
  68. data/lib/qvalue.rb +0 -161
  69. data/lib/roc.rb +0 -187
  70. data/lib/sample_enzyme.rb +0 -160
  71. data/lib/scan_i.rb +0 -21
  72. data/lib/spec_id/aa_freqs.rb +0 -170
  73. data/lib/spec_id/bioworks.rb +0 -497
  74. data/lib/spec_id/digestor.rb +0 -138
  75. data/lib/spec_id/mass.rb +0 -179
  76. data/lib/spec_id/parser/proph.rb +0 -335
  77. data/lib/spec_id/precision/filter/cmdline.rb +0 -218
  78. data/lib/spec_id/precision/filter/interactive.rb +0 -134
  79. data/lib/spec_id/precision/filter/output.rb +0 -148
  80. data/lib/spec_id/precision/filter.rb +0 -637
  81. data/lib/spec_id/precision/output.rb +0 -60
  82. data/lib/spec_id/precision/prob/cmdline.rb +0 -160
  83. data/lib/spec_id/precision/prob/output.rb +0 -94
  84. data/lib/spec_id/precision/prob.rb +0 -249
  85. data/lib/spec_id/proph/pep_summary.rb +0 -104
  86. data/lib/spec_id/proph/prot_summary.rb +0 -484
  87. data/lib/spec_id/proph.rb +0 -4
  88. data/lib/spec_id/protein_summary.rb +0 -489
  89. data/lib/spec_id/sequest/params.rb +0 -316
  90. data/lib/spec_id/sequest/pepxml.rb +0 -1458
  91. data/lib/spec_id/sequest.rb +0 -33
  92. data/lib/spec_id/sqt.rb +0 -349
  93. data/lib/spec_id/srf.rb +0 -973
  94. data/lib/spec_id.rb +0 -778
  95. data/lib/spec_id_xml.rb +0 -99
  96. data/lib/transmem/phobius.rb +0 -147
  97. data/lib/transmem/toppred.rb +0 -368
  98. data/lib/transmem.rb +0 -157
  99. data/lib/validator/aa.rb +0 -48
  100. data/lib/validator/aa_est.rb +0 -112
  101. data/lib/validator/background.rb +0 -77
  102. data/lib/validator/bias.rb +0 -95
  103. data/lib/validator/cmdline.rb +0 -431
  104. data/lib/validator/decoy.rb +0 -107
  105. data/lib/validator/digestion_based.rb +0 -70
  106. data/lib/validator/probability.rb +0 -51
  107. data/lib/validator/prot_from_pep.rb +0 -234
  108. data/lib/validator/q_value.rb +0 -32
  109. data/lib/validator/transmem.rb +0 -272
  110. data/lib/validator/true_pos.rb +0 -46
  111. data/lib/validator.rb +0 -197
  112. data/lib/xml.rb +0 -38
  113. data/lib/xml_style_parser.rb +0 -119
  114. data/lib/xmlparser_wrapper.rb +0 -19
  115. data/release_notes.txt +0 -2
  116. data/script/compile_and_plot_smriti_final.rb +0 -97
  117. data/script/create_little_pepxml.rb +0 -61
  118. data/script/degenerate_peptides.rb +0 -47
  119. data/script/estimate_fpr_by_cysteine.rb +0 -226
  120. data/script/extract_gradient_programs.rb +0 -56
  121. data/script/find_cysteine_background.rb +0 -137
  122. data/script/genuine_tps_and_probs.rb +0 -136
  123. data/script/get_apex_values_rexml.rb +0 -44
  124. data/script/histogram_probs.rb +0 -61
  125. data/script/mascot_fix_pepxml.rb +0 -123
  126. data/script/msvis.rb +0 -42
  127. data/script/mzXML2timeIndex.rb +0 -25
  128. data/script/peps_per_bin.rb +0 -67
  129. data/script/prep_dir.rb +0 -121
  130. data/script/simple_protein_digestion.rb +0 -27
  131. data/script/smriti_final_analysis.rb +0 -103
  132. data/script/sqt_to_meta.rb +0 -24
  133. data/script/top_hit_per_scan.rb +0 -67
  134. data/script/toppred_to_yaml.rb +0 -47
  135. data/script/tpp_installer.rb +0 -249
  136. data/specs/align_spec.rb +0 -79
  137. data/specs/bin/bioworks_to_pepxml_spec.rb +0 -79
  138. data/specs/bin/fasta_shaker_spec.rb +0 -259
  139. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +0 -199
  140. data/specs/bin/filter_and_validate_spec.rb +0 -180
  141. data/specs/bin/ms_to_lmat_spec.rb +0 -34
  142. data/specs/bin/prob_validate_spec.rb +0 -86
  143. data/specs/bin/protein_summary_spec.rb +0 -14
  144. data/specs/fasta_spec.rb +0 -354
  145. data/specs/gi_spec.rb +0 -22
  146. data/specs/load_bin_path.rb +0 -7
  147. data/specs/merge_deep_spec.rb +0 -13
  148. data/specs/ms/gradient_program_spec.rb +0 -77
  149. data/specs/ms/msrun_spec.rb +0 -498
  150. data/specs/ms/parser_spec.rb +0 -92
  151. data/specs/ms/spectrum_spec.rb +0 -87
  152. data/specs/pi_zero_spec.rb +0 -115
  153. data/specs/qvalue_spec.rb +0 -39
  154. data/specs/roc_spec.rb +0 -251
  155. data/specs/rspec_autotest.rb +0 -149
  156. data/specs/sample_enzyme_spec.rb +0 -126
  157. data/specs/spec_helper.rb +0 -135
  158. data/specs/spec_id/aa_freqs_spec.rb +0 -52
  159. data/specs/spec_id/bioworks_spec.rb +0 -148
  160. data/specs/spec_id/digestor_spec.rb +0 -75
  161. data/specs/spec_id/precision/filter/cmdline_spec.rb +0 -20
  162. data/specs/spec_id/precision/filter/output_spec.rb +0 -31
  163. data/specs/spec_id/precision/filter_spec.rb +0 -246
  164. data/specs/spec_id/precision/prob_spec.rb +0 -44
  165. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  166. data/specs/spec_id/proph/pep_summary_spec.rb +0 -98
  167. data/specs/spec_id/proph/prot_summary_spec.rb +0 -128
  168. data/specs/spec_id/protein_summary_spec.rb +0 -189
  169. data/specs/spec_id/sequest/params_spec.rb +0 -68
  170. data/specs/spec_id/sequest/pepxml_spec.rb +0 -374
  171. data/specs/spec_id/sequest_spec.rb +0 -38
  172. data/specs/spec_id/sqt_spec.rb +0 -246
  173. data/specs/spec_id/srf_spec.rb +0 -172
  174. data/specs/spec_id/srf_spec_helper.rb +0 -139
  175. data/specs/spec_id_helper.rb +0 -33
  176. data/specs/spec_id_spec.rb +0 -366
  177. data/specs/spec_id_xml_spec.rb +0 -33
  178. data/specs/transmem/phobius_spec.rb +0 -425
  179. data/specs/transmem/toppred_spec.rb +0 -298
  180. data/specs/transmem_spec.rb +0 -60
  181. data/specs/transmem_spec_shared.rb +0 -64
  182. data/specs/validator/aa_est_spec.rb +0 -66
  183. data/specs/validator/aa_spec.rb +0 -40
  184. data/specs/validator/background_spec.rb +0 -67
  185. data/specs/validator/bias_spec.rb +0 -122
  186. data/specs/validator/decoy_spec.rb +0 -51
  187. data/specs/validator/fasta_helper.rb +0 -26
  188. data/specs/validator/prot_from_pep_spec.rb +0 -141
  189. data/specs/validator/transmem_spec.rb +0 -146
  190. data/specs/validator/true_pos_spec.rb +0 -58
  191. data/specs/validator_helper.rb +0 -33
  192. data/specs/xml_spec.rb +0 -12
  193. data/test_files/000_pepxml18_small.xml +0 -206
  194. data/test_files/020a.mzXML.timeIndex +0 -4710
  195. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +0 -3973
  196. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +0 -3872
  197. data/test_files/4-03-03_small-prot.xml +0 -321
  198. data/test_files/4-03-03_small.xml +0 -3876
  199. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  200. data/test_files/bioworks-3.3_10prots.xml +0 -5999
  201. data/test_files/bioworks31.params +0 -77
  202. data/test_files/bioworks32.params +0 -62
  203. data/test_files/bioworks33.params +0 -63
  204. data/test_files/bioworks_single_run_small.xml +0 -7237
  205. data/test_files/bioworks_small.fasta +0 -212
  206. data/test_files/bioworks_small.params +0 -63
  207. data/test_files/bioworks_small.phobius +0 -109
  208. data/test_files/bioworks_small.toppred.out +0 -2847
  209. data/test_files/bioworks_small.xml +0 -5610
  210. data/test_files/bioworks_with_INV_small.xml +0 -3753
  211. data/test_files/bioworks_with_SHUFF_small.xml +0 -2503
  212. data/test_files/corrupted_900.srf +0 -0
  213. data/test_files/head_of_7MIX.srf +0 -0
  214. data/test_files/interact-opd1_mods_small-prot.xml +0 -304
  215. data/test_files/messups.fasta +0 -297
  216. data/test_files/opd1/000.my_answer.100lines.xml +0 -101
  217. data/test_files/opd1/000.tpp_1.2.3.first10.xml +0 -115
  218. data/test_files/opd1/000.tpp_2.9.2.first10.xml +0 -126
  219. data/test_files/opd1/000.v2.1.mzXML.timeIndex +0 -3748
  220. data/test_files/opd1/000_020-prot.png +0 -0
  221. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +0 -62
  222. data/test_files/opd1/000_020_3prots-prot.xml +0 -62
  223. data/test_files/opd1/opd1_cat_inv_small-prot.xml +0 -139
  224. data/test_files/opd1/sequest.3.1.params +0 -77
  225. data/test_files/opd1/sequest.3.2.params +0 -62
  226. data/test_files/opd1/twenty_scans.mzXML +0 -418
  227. data/test_files/opd1/twenty_scans.v2.1.mzXML +0 -382
  228. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  229. data/test_files/opd1/twenty_scans_answ.lmata +0 -9
  230. data/test_files/opd1_020_beginning.RAW +0 -0
  231. data/test_files/opd1_2runs_2mods/data/020.mzData.xml +0 -683
  232. data/test_files/opd1_2runs_2mods/data/020.readw.mzXML +0 -382
  233. data/test_files/opd1_2runs_2mods/data/040.mzData.xml +0 -683
  234. data/test_files/opd1_2runs_2mods/data/040.readw.mzXML +0 -382
  235. data/test_files/opd1_2runs_2mods/data/README.txt +0 -6
  236. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +0 -753
  237. data/test_files/orbitrap_mzData/000_cut.xml +0 -1920
  238. data/test_files/pepproph_small.xml +0 -4691
  239. data/test_files/phobius.small.noheader.txt +0 -50
  240. data/test_files/phobius.small.small.txt +0 -53
  241. data/test_files/s01_anC1_ld020mM.key.txt +0 -25
  242. data/test_files/s01_anC1_ld020mM.meth +0 -0
  243. data/test_files/small.fasta +0 -297
  244. data/test_files/small.sqt +0 -87
  245. data/test_files/smallraw.RAW +0 -0
  246. data/test_files/tf_bioworks2excel.bioXML +0 -14340
  247. data/test_files/tf_bioworks2excel.txt.actual +0 -1035
  248. data/test_files/toppred.small.out +0 -416
  249. data/test_files/toppred.xml.out +0 -318
  250. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +0 -7
  251. data/test_files/validator_hits_separate/bioworks_small_HS.xml +0 -5651
  252. data/test_files/yeast_gly_small-prot.xml +0 -265
  253. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +0 -6
  254. data/test_files/yeast_gly_small.xml +0 -3807
  255. data/test_files/yeast_gly_small2.parentTimes +0 -6
data/lib/spec_id/mass.rb DELETED
@@ -1,179 +0,0 @@
1
-
2
- class Mass
3
- # http://expasy.org/tools/findmod/findmod_masses.html
4
- # still need to add the modifications
5
- MONO = {
6
- :A => 71.03711,
7
- :R => 156.10111,
8
- :N => 114.04293,
9
- :D => 115.02694,
10
- :C => 103.00919,
11
- :E => 129.04259,
12
- :Q => 128.05858,
13
- :G => 57.02146,
14
- :H => 137.05891,
15
- :I => 113.08406,
16
- :L => 113.08406,
17
- :K => 128.09496,
18
- :M => 131.04049,
19
- :F => 147.06841,
20
- :P => 97.05276,
21
- :S => 87.03203,
22
- :T => 101.04768,
23
- :W => 186.07931,
24
- :Y => 163.06333,
25
- :V => 99.06841,
26
-
27
- # uncommon
28
- :B => 172.048405, # average of aspartic acid and asparagine
29
- :U => 150.95364, # (selenocysteine) http://www.matrix-science.com/help/aa_help.html
30
- :X => 118.805716, # the average of the mono masses of the 20 amino acids
31
- :* => 118.805716, # same as X
32
- :Z => (129.04259 + 128.05858) / 2, # average glutamic acid and glutamine
33
-
34
- # elements etc.
35
- :h => 1.00783,
36
- #:h_plus => 1.00728, # this is the mass I had
37
- :h_plus => 1.007276, # this is the mass used by mascot merge.pl
38
- :o => 15.9949146,
39
- :h2o => 18.01056,
40
- }
41
- AVG = {
42
- :A => 71.0788,
43
- :R => 156.1875,
44
- :N => 114.1038,
45
- :D => 115.0886,
46
- :C => 103.1388,
47
- :E => 129.1155,
48
- :Q => 128.1307,
49
- :G => 57.0519,
50
- :H => 137.1411,
51
- :I => 113.1594,
52
- :L => 113.1594,
53
- :K => 128.1741,
54
- :M => 131.1926,
55
- :F => 147.1766,
56
- :P => 97.1167,
57
- :S => 87.0782,
58
- :T => 101.1051,
59
- :W => 186.2132,
60
- :Y => 163.1760,
61
- :V => 99.1326,
62
-
63
- # uncommon
64
- :B => 172.1405, # average of aspartic acid and asparagine
65
- :U => 150.03, # (selenocysteine) http://www.matrix-science.com/help/aa_help.html
66
- :X => 118.88603, # the average of the masses of the 20 amino acids
67
- :* => 118.88603, # same as X
68
- :Z => (129.1155+ 128.1307) / 2, # average glutamic acid and glutamine
69
-
70
- # elements etc.
71
- :h => 1.00794,
72
- :h_plus => 1.00739,
73
- :o => 15.9994,
74
- :h2o => 18.01524,
75
- }
76
-
77
- # returns a fresh hash where it has been added to each amino acid the amount
78
- # specified in the array of a PepXML::Modifications object
79
- # if static_terminal_mods given than will create the following keys as
80
- # symbols as necessary:
81
- # add_C_term_protein
82
- # add_C_term_peptide
83
- # add_N_term_protein
84
- # add_N_term_peptide
85
- def self.add_static_masses(monoisotopic, static_mods, static_terminal_mods=nil)
86
- hash_to_use =
87
- if monoisotopic
88
- Mass::MONO
89
- else
90
- Mass::AVG
91
- end
92
- copy_hash = hash_to_use.dup
93
- static_mods.each do |mod|
94
- copy_hash[mod.aminoacid.to_sym] += mod.massdiff
95
- end
96
- static_terminal_mods.each do |mod|
97
- if x = mod.protein_terminus
98
- # its a protein terminus modification
99
- case x
100
- when 'n'
101
- copy_hash[:add_N_term_protein] = mod.massdiff
102
- when 'c'
103
- copy_hash[:add_C_term_protein] = mod.massdiff
104
- end
105
- else
106
- # its a peptide terminus modification
107
- case mod.terminus
108
- when 'n'
109
- copy_hash[:add_N_term_peptide] = mod.massdiff
110
- when 'c'
111
- copy_hash[:add_C_term_peptide] = mod.massdiff
112
- end
113
- end
114
- end
115
- copy_hash
116
- end
117
-
118
- # returns an array of masses parallel to array passed in
119
- # If you want the mass with H+, then pass in the mass as h_plus
120
- # The mass hash must repond to
121
- # :h2o (water)
122
- # and at least the twenty amino acids (by string or symbol)
123
- # The mass hash may respond to :add_N_term_peptide or :add_C_term_peptide
124
- # in which case these will be added to the final mass
125
- def self.masses(aaseqs, mass_hash=Mass::MONO, h_plus=0.0)
126
- final_add = mass_hash[:h2o] + h_plus
127
- [:add_N_term_peptide, :add_C_term_peptide].each do |sym|
128
- if mass_hash.key?(sym)
129
- final_add += mass_hash[sym]
130
- end
131
- end
132
- hash_by_aa_string = {}
133
- mass_hash.each {|k,v| hash_by_aa_string[k.to_s] = mass_hash[k] }
134
-
135
- aaseqs.map do |pep_aaseqs|
136
- sum = 0.0
137
- aaseq.split('').each do |let|
138
- sum += hash_by_aa_string[let]
139
- end
140
- mh_plus = sum + final_add
141
- end
142
- end
143
-
144
-
145
- end
146
-
147
- class Mass::Calculator
148
-
149
- # mass_hash must respond to :h2o or 'h2o'. This is added to represent the
150
- # tails of the peptide. add_extra is outside of that (e.g., an H+)
151
- def initialize(mass_hash, add_extra=0.0)
152
- @mass_hash = mass_hash_to_s(mass_hash)
153
- @final_add = @mass_hash['h2o'] + add_extra
154
- end
155
-
156
- def mass_hash_to_s(mass_hash)
157
- new_hash = {}
158
- mass_hash.each do |k,v|
159
- new_hash[k.to_s] = v
160
- end
161
- new_hash
162
- end
163
-
164
- def masses(aaseqs)
165
- aaseqs.map do |aaseq|
166
- sum = @final_add # <- add in the initialization
167
- aaseq.split('').each do |let|
168
- if @mass_hash.key? let
169
- sum += @mass_hash[let]
170
- else
171
- abort "LETTER not found in mass_hash: #{let}"
172
- end
173
- end
174
- sum
175
- end
176
- end
177
-
178
- end
179
-
@@ -1,335 +0,0 @@
1
- require 'xml_style_parser'
2
- require 'spec_id/sequest/pepxml'
3
-
4
-
5
- module SpecID ; end
6
- module SpecID::Parser ; end
7
-
8
-
9
- class SpecID::Parser::PepProph
10
- include XMLStyleParser
11
-
12
- # gets the protein (and adds the pephit to the protein)
13
- def get_protein(search_hit, name, description, global_prot_hash)
14
- prot =
15
- if global_prot_hash.key?(name)
16
- global_prot_hash[name]
17
- else
18
- prt = Proph::PepSummary::Prot.new([name, description, []])
19
- global_prot_hash[name] = prt
20
- end
21
- prot.peps << search_hit
22
- prot
23
- end
24
-
25
- def initialize(parse_type=:spec_id, version='3.0')
26
- @method = parse_type
27
- @version = version
28
- implemented = %w(AXML LibXML)
29
- klass_s = XMLStyleParser.available_xml_parsers.select {|v| implemented.include?(v) }.first
30
- case klass_s
31
- when 'AXML'
32
- @get_root_node_from_file = Proc.new do |file|
33
- AXML.parse_file(file)
34
- end
35
- when 'LibXML' # LibXML is buggy on some machines...
36
- @get_root_node_from_file = Proc.new do |file|
37
- doc = XML::Document.file(file)
38
- doc.root
39
- end
40
- else
41
- raise NotImplementedError, "Can only parse with #{implemented.join(', ')} right now"
42
- end
43
- end
44
-
45
- # returns the spec_id object
46
- # :global_prot_hash is a hash if you have multiple of these files to be
47
- # combined
48
- def spec_id(file, opts={})
49
-
50
- raise NotImplementedError, "cannot do #{@version} yet" if @version.nil? or @version < '3.0'
51
- spec_id_obj =
52
- if x = opts[:spec_id]
53
- x
54
- else
55
- Proph::PepSummary.new
56
- end
57
- global_prot_hash =
58
- if y = opts[:global_prot_hash]
59
- y
60
- else
61
- {}
62
- end
63
- msms_pipeline_analysis_n = @get_root_node_from_file.call(file)
64
- spec_id_obj.peptideprophet_summary = msms_pipeline_analysis_n.find_first("descendant::peptideprophet_summary")
65
-
66
-
67
- spec_id_obj.msms_run_summaries = msms_pipeline_analysis_n.find('child::msms_run_summary').map do |msms_run_summary_n|
68
- parse_msms_run_summary(msms_run_summary_n, global_prot_hash)
69
- end
70
-
71
- peps = []
72
- spec_id_obj.msms_run_summaries.each do |mrs|
73
- mrs.spectrum_queries.each do |sq|
74
- sq.search_results.each do |sr|
75
- peps.push( *(sr.search_hits) )
76
- end
77
- end
78
- end
79
- spec_id_obj.peps = peps
80
- spec_id_obj.prots = global_prot_hash.values
81
- spec_id_obj
82
- end
83
-
84
- # returns an msms_run_summary object
85
- def parse_msms_run_summary(msms_run_summary_n, global_prot_hash)
86
- msms_run_summary_obj = Sequest::PepXML::MSMSRunSummary.new
87
-
88
- msms_run_summary_obj.from_pepxml_node(msms_run_summary_n)
89
- sample_enzyme_n = msms_run_summary_n.find_first("child::sample_enzyme")
90
- msms_run_summary_obj.sample_enzyme = SampleEnzyme.from_pepxml_node( sample_enzyme_n )
91
-
92
- search_summary_n = sample_enzyme_n.find_first("following-sibling::search_summary")
93
- spectrum_queries_nds = search_summary_n.find("following-sibling::spectrum_query")
94
-
95
- msms_run_summary_obj.spectrum_queries = spectrum_queries_nds.map do |sq_n|
96
-
97
- sq = Sequest::PepXML::SpectrumQuery.from_pepxml_node(sq_n)
98
- sq.search_results = sq_n.children.map do |sr_n|
99
- sr = Sequest::PepXML::SearchResult.new
100
- sr.search_hits = sr_n.children.map do |sh_n|
101
- sh = Proph::PepSummary::Pep.new # descended from SearchHit
102
- sh.from_pepxml_node(sh_n)
103
- sh.spectrum_query = sq
104
- prots = [ get_protein(sh, sh_n['protein'], sh_n['protein_descr'], global_prot_hash) ]
105
- ## alternative proteins:
106
- if sh.num_tot_proteins > 1
107
- sh_n.find('child::alternative_protein').each do |alt_prot_n|
108
- prots << get_protein(sh, alt_prot_n['protein'], alt_prot_n['protein_descr'], global_prot_hash)
109
- end
110
- end
111
- sh.prots = prots
112
-
113
- if modinfo_node = sh_n.find_first("child::modification_info")
114
- sh.modification_info = Sequest::PepXML::SearchHit::ModificationInfo.from_pepxml_node(modinfo_node)
115
- end
116
-
117
-
118
- ## search scores:
119
- sh_n.find("child::search_score").each do |ss_n|
120
- case ss_n['name']
121
- when 'deltacnstar'
122
- sh.deltacnstar = ss_n['value'].to_i
123
- when 'xcorr'
124
- sh.xcorr = ss_n['value'].to_f
125
- when 'deltacn'
126
- sh.deltacn = ss_n['value'].to_f
127
- when 'spscore'
128
- sh.spscore = ss_n['value'].to_f
129
- when 'sprank'
130
- sh.sprank = ss_n['value'].to_i
131
- end
132
- end
133
- sh
134
- end
135
- sr
136
- end
137
- sq
138
- end
139
-
140
- ## NOTE: this is currently just the xml node!!!! TODO: wrap everything up
141
- #into a better search summary object (to eventually depracate the params object)
142
- msms_run_summary_obj.search_summary = msms_run_summary_n
143
- msms_run_summary_obj
144
- end
145
-
146
- end
147
-
148
- class SpecID::Parser::ProtProph
149
- include XMLStyleParser
150
- Split_unique_stripped_peptides_re = /\+/
151
-
152
- def initialize(parse_type=:spec_id, version='4')
153
- @method = parse_type
154
- @version = version
155
-
156
- implemented = %w(AXML LibXML)
157
- klass_s = XMLStyleParser.available_xml_parsers.select {|v| implemented.include?(v) }.first
158
- case klass_s
159
- when 'AXML'
160
- #puts "parsing with AXML (XMLParser based)" if $VERBOSE
161
- @get_root_node_from_file = Proc.new do |file|
162
- AXML.parse_file(file)
163
- end
164
- when 'LibXML' # LibXML is buggy on some machines...
165
- #puts "parsing with LibXML" if $VERBOSE
166
- @get_root_node_from_file = Proc.new do |file|
167
- doc = XML::Document.file(file)
168
- doc.root
169
- end
170
- else
171
- raise NotImplementedError, "Can only parse with #{implemented.join(', ')} right now"
172
- end
173
- end
174
-
175
- # returns the spec_id object
176
- def spec_id(file, opts={})
177
- raise NotImplementedError, "cannot do #{@version} yet" if @version != '4'
178
- spec_id_obj =
179
- if x = opts[:spec_id]
180
- x
181
- else
182
- Proph::ProtSummary.new
183
- end
184
- protein_summary_n = @get_root_node_from_file.call(file)
185
-
186
- #protein_summary_n = scan_for_first(doc, 'protein_summary')
187
-
188
- # protein_summary_header_n = protein_summary_n.child
189
- # could grab some of this info if we wanted...
190
-
191
- pep_hash = {}
192
- prot_hash = {}
193
- protein_groups = []
194
-
195
- # get all the proteins from inside protein groups
196
- protein_group_name = 'protein_group'
197
- get_protein_summary_header = true
198
- protein_summary_n.each do |protein_group_n|
199
- if get_protein_summary_header
200
- protein_summary_header_n = protein_group_n
201
- get_protein_summary_header = false
202
- elsif protein_group_n.name == protein_group_name
203
- protein_groups << get_proteins(protein_group_n, pep_hash, prot_hash)
204
- end
205
- end
206
-
207
- # need to finalize hash stuff
208
- pep_hash.each do |k,pep|
209
- new_prots = []
210
- pep.prots.each do |prot_or_string|
211
- if prot_or_string.is_a?(Proph::Prot)
212
- new_prots << prot_or_string
213
- else
214
- prt = prot_hash[prot_or_string]
215
- if prt.nil?
216
- # this is an indistinguishable protein!
217
- else
218
- new_prots << prt
219
- end
220
- end
221
- end
222
- pep.prots = new_prots
223
- end
224
-
225
- spec_id_obj.peps = pep_hash.values
226
- spec_id_obj.prots = prot_hash.values
227
- spec_id_obj.prot_groups = protein_groups
228
- spec_id_obj
229
- end
230
-
231
- # takes a Y or N and gives true/false
232
- def booleanize(string)
233
- case string
234
- when 'Y'
235
- true
236
- when 'N'
237
- false
238
- else
239
- nil
240
- end
241
- end
242
-
243
- # assumes that all the rest of the nodes are protein_groups
244
- # pep_hash is hashed on aaseq OR modified peptide amino acid sequence (if
245
- # modified) + charge
246
- # (as far as I can tell, all protein entries are unique!)
247
- # returns a ProtGroup object
248
- def get_proteins(protein_group_node, pep_hash, prot_hash)
249
-
250
- protein_group_proteins = []
251
-
252
- protein_group_node.each do |protein_n|
253
- raise(Exception, "not expecting anything but protein's, got: #{protein_n.name}") if protein_n.name != 'protein'
254
- # probability peps protein_name n_indistinguishable_proteins percent_coverage unique_stripped_peptides group_sibling_id total_number_peptides pct_spectrum_ids description
255
-
256
- # get the description
257
- # INITIALIZE the protein and set key
258
- n = protein_n
259
- protein_name = n['protein_name']
260
- peps = []
261
- protein = Proph::Prot.new( [protein_name, n['probability'].to_f,
262
- n['n_indistinguishable_proteins'].to_i,
263
- n['percent_coverage'].to_f,
264
- n['unique_stripped_peptides'].split(Split_unique_stripped_peptides_re),
265
- n['group_sibling_id'], n['total_number_peptides'].to_i,
266
- n['pct_spectrum_ids'].to_f, nil,
267
- peps ])
268
- protein_group_proteins << protein
269
- prot_hash[protein_name] = protein
270
-
271
- # traverse through the peptides (and annotation)
272
- protein_n.each do |protein_sub_n|
273
- # create a proteins array for each peptide
274
- proteins = [protein]
275
-
276
- if protein_sub_n.name == 'annotation'
277
- protein.description = protein_sub_n['protein_description']
278
- end
279
- if protein_sub_n.name == 'peptide'
280
- peptide_n = protein_sub_n
281
- # peptide_sequence charge initial_probability nsp_adjusted_probability weight is_nondegenerate_evidence n_enzymatic_termini n_sibling_peptides n_sibling_peptides_bin n_instances is_contributing_evidence calc_neutral_pep_mass modification_info prots
282
- # get modifications, if any
283
-
284
- n = peptide_n
285
- peptide_sequence = n['peptide_sequence']
286
- charge = n['charge'].to_i
287
-
288
- # GET list of all proteins and modifications
289
-
290
- mod_info = nil
291
- peptide_hash_string = peptide_sequence
292
- if peptide_n.child?
293
- peptide_n.each do |pep_sub_n|
294
- case pep_sub_n.name
295
- when 'peptide_parent_protein'
296
- # NOTE! the proteins list will have strings until the assoc.
297
- # prot is found!
298
- proteins << pep_sub_n['protein_name']
299
- when 'modification_info'
300
- masses = pep_sub_n.map do |mod_aa_mass_n|
301
- Sequest::PepXML::SearchHit::ModificationInfo::ModAminoacidMass.new([mod_aa_mass_n['position'].to_i, mod_aa_mass_n['mass'].to_f])
302
- end
303
- peptide_hash_string = pep_sub_n['modified_peptide']
304
- mod_info = Sequest::PepXML::SearchHit::ModificationInfo.new([peptide_hash_string, masses])
305
- end
306
- end
307
- end
308
-
309
- key = [peptide_hash_string, charge]
310
- peptide =
311
- if pep_hash.key? key
312
- pep_hash[key]
313
- else
314
- pep = Proph::Prot::Pep.new([peptide_sequence, charge,
315
- n['initial_probability'].to_f, n['nsp_adjusted_probability'].to_f,
316
- n['weight'].to_f, booleanize(n['is_nondegenerate_evidence']),
317
- n['n_enzymatic_termini'].to_i, n['n_sibling_peptides'].to_f,
318
- n['n_sibling_peptides'].to_i, n['n_instances'].to_i,
319
- booleanize(n['is_contributing_evidence']),
320
- n['calc_neutral_pep_mass'].to_f, mod_info, proteins] )
321
- pep_hash[key] = pep
322
- pep
323
- end
324
- peps << peptide
325
- end
326
- end # end protein children
327
- end
328
- Proph::ProtGroup.new(:prots => protein_group_proteins, :group_number => protein_group_node['group_number'].to_i, :probability => protein_group_node['probability'].to_f)
329
- end
330
-
331
- def parse(file, opts)
332
- send(@method, file, opts)
333
- end
334
-
335
- end