mspire 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233) hide show
  1. data/INSTALL +1 -0
  2. data/README +25 -0
  3. data/Rakefile +129 -40
  4. data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
  5. data/bin/bioworks_to_pepxml.rb +1 -0
  6. data/bin/fasta_shaker.rb +1 -96
  7. data/bin/filter_and_validate.rb +5 -0
  8. data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
  9. data/bin/prob_validate.rb +6 -0
  10. data/bin/raw_to_mzXML.rb +2 -2
  11. data/bin/srf_group.rb +1 -0
  12. data/bin/srf_to_sqt.rb +40 -0
  13. data/changelog.txt +68 -0
  14. data/lib/align/chams.rb +6 -6
  15. data/lib/align.rb +4 -3
  16. data/lib/bsearch.rb +120 -0
  17. data/lib/fasta.rb +318 -86
  18. data/lib/group_by.rb +10 -0
  19. data/lib/index_by.rb +11 -0
  20. data/lib/merge_deep.rb +21 -0
  21. data/lib/{spec → ms/converter}/mzxml.rb +77 -109
  22. data/lib/ms/gradient_program.rb +171 -0
  23. data/lib/ms/msrun.rb +209 -0
  24. data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
  25. data/lib/ms/parser/mzdata/axml.rb +12 -0
  26. data/lib/ms/parser/mzdata/dom.rb +160 -0
  27. data/lib/ms/parser/mzdata/libxml.rb +7 -0
  28. data/lib/ms/parser/mzdata.rb +25 -0
  29. data/lib/ms/parser/mzxml/axml.rb +11 -0
  30. data/lib/ms/parser/mzxml/dom.rb +159 -0
  31. data/lib/ms/parser/mzxml/hpricot.rb +253 -0
  32. data/lib/ms/parser/mzxml/libxml.rb +15 -0
  33. data/lib/ms/parser/mzxml/regexp.rb +122 -0
  34. data/lib/ms/parser/mzxml/rexml.rb +72 -0
  35. data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
  36. data/lib/ms/parser/mzxml.rb +175 -0
  37. data/lib/ms/parser.rb +108 -0
  38. data/lib/ms/precursor.rb +10 -0
  39. data/lib/ms/scan.rb +81 -0
  40. data/lib/ms/spectrum.rb +193 -0
  41. data/lib/ms.rb +10 -0
  42. data/lib/mspire.rb +4 -0
  43. data/lib/roc.rb +61 -1
  44. data/lib/sample_enzyme.rb +31 -8
  45. data/lib/scan_i.rb +21 -0
  46. data/lib/spec_id/aa_freqs.rb +7 -3
  47. data/lib/spec_id/bioworks.rb +20 -14
  48. data/lib/spec_id/digestor.rb +139 -0
  49. data/lib/spec_id/mass.rb +116 -0
  50. data/lib/spec_id/parser/proph.rb +236 -0
  51. data/lib/spec_id/precision/filter/cmdline.rb +209 -0
  52. data/lib/spec_id/precision/filter/interactive.rb +134 -0
  53. data/lib/spec_id/precision/filter/output.rb +147 -0
  54. data/lib/spec_id/precision/filter.rb +623 -0
  55. data/lib/spec_id/precision/output.rb +60 -0
  56. data/lib/spec_id/precision/prob/cmdline.rb +139 -0
  57. data/lib/spec_id/precision/prob/output.rb +88 -0
  58. data/lib/spec_id/precision/prob.rb +171 -0
  59. data/lib/spec_id/proph/pep_summary.rb +92 -0
  60. data/lib/spec_id/proph/prot_summary.rb +484 -0
  61. data/lib/spec_id/proph.rb +2 -466
  62. data/lib/spec_id/protein_summary.rb +2 -2
  63. data/lib/spec_id/sequest/params.rb +316 -0
  64. data/lib/spec_id/sequest/pepxml.rb +1513 -0
  65. data/lib/spec_id/sequest.rb +2 -1672
  66. data/lib/spec_id/srf.rb +445 -177
  67. data/lib/spec_id.rb +183 -95
  68. data/lib/spec_id_xml.rb +8 -10
  69. data/lib/transmem/phobius.rb +147 -0
  70. data/lib/transmem/toppred.rb +368 -0
  71. data/lib/transmem.rb +157 -0
  72. data/lib/validator/aa.rb +135 -0
  73. data/lib/validator/background.rb +73 -0
  74. data/lib/validator/bias.rb +95 -0
  75. data/lib/validator/cmdline.rb +260 -0
  76. data/lib/validator/decoy.rb +94 -0
  77. data/lib/validator/digestion_based.rb +69 -0
  78. data/lib/validator/probability.rb +48 -0
  79. data/lib/validator/prot_from_pep.rb +234 -0
  80. data/lib/validator/transmem.rb +272 -0
  81. data/lib/validator/true_pos.rb +46 -0
  82. data/lib/validator.rb +214 -0
  83. data/lib/xml.rb +38 -0
  84. data/lib/xml_style_parser.rb +105 -0
  85. data/lib/xmlparser_wrapper.rb +19 -0
  86. data/script/compile_and_plot_smriti_final.rb +97 -0
  87. data/script/extract_gradient_programs.rb +56 -0
  88. data/script/get_apex_values_rexml.rb +44 -0
  89. data/script/mzXML2timeIndex.rb +1 -1
  90. data/script/smriti_final_analysis.rb +103 -0
  91. data/script/toppred_to_yaml.rb +47 -0
  92. data/script/tpp_installer.rb +1 -1
  93. data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
  94. data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
  95. data/specs/bin/fasta_shaker_spec.rb +259 -0
  96. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
  97. data/specs/bin/filter_and_validate_spec.rb +124 -0
  98. data/specs/bin/ms_to_lmat_spec.rb +34 -0
  99. data/specs/bin/prob_validate_spec.rb +62 -0
  100. data/specs/bin/protein_summary_spec.rb +10 -0
  101. data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
  102. data/specs/gi_spec.rb +22 -0
  103. data/specs/load_bin_path.rb +7 -0
  104. data/specs/merge_deep_spec.rb +13 -0
  105. data/specs/ms/gradient_program_spec.rb +77 -0
  106. data/specs/ms/msrun_spec.rb +455 -0
  107. data/specs/ms/parser_spec.rb +92 -0
  108. data/specs/ms/spectrum_spec.rb +89 -0
  109. data/specs/roc_spec.rb +251 -0
  110. data/specs/rspec_autotest.rb +149 -0
  111. data/specs/sample_enzyme_spec.rb +41 -0
  112. data/specs/spec_helper.rb +133 -0
  113. data/specs/spec_id/aa_freqs_spec.rb +52 -0
  114. data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
  115. data/specs/spec_id/digestor_spec.rb +75 -0
  116. data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
  117. data/specs/spec_id/precision/filter/output_spec.rb +31 -0
  118. data/specs/spec_id/precision/filter_spec.rb +243 -0
  119. data/specs/spec_id/precision/prob_spec.rb +111 -0
  120. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  121. data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
  122. data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
  123. data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
  124. data/specs/spec_id/sequest/params_spec.rb +68 -0
  125. data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
  126. data/specs/spec_id/sqt_spec.rb +138 -0
  127. data/specs/spec_id/srf_spec.rb +209 -0
  128. data/specs/spec_id/srf_spec_helper.rb +302 -0
  129. data/specs/spec_id_helper.rb +33 -0
  130. data/specs/spec_id_spec.rb +361 -0
  131. data/specs/spec_id_xml_spec.rb +33 -0
  132. data/specs/transmem/phobius_spec.rb +423 -0
  133. data/specs/transmem/toppred_spec.rb +297 -0
  134. data/specs/transmem_spec.rb +60 -0
  135. data/specs/transmem_spec_shared.rb +64 -0
  136. data/specs/validator/aa_spec.rb +107 -0
  137. data/specs/validator/background_spec.rb +51 -0
  138. data/specs/validator/bias_spec.rb +146 -0
  139. data/specs/validator/decoy_spec.rb +51 -0
  140. data/specs/validator/fasta_helper.rb +26 -0
  141. data/specs/validator/prot_from_pep_spec.rb +141 -0
  142. data/specs/validator/transmem_spec.rb +145 -0
  143. data/specs/validator/true_pos_spec.rb +58 -0
  144. data/specs/validator_helper.rb +33 -0
  145. data/specs/xml_spec.rb +12 -0
  146. data/test_files/000_pepxml18_small.xml +206 -0
  147. data/test_files/020a.mzXML.timeIndex +4710 -0
  148. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
  149. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
  150. data/test_files/4-03-03_small-prot.xml +321 -0
  151. data/test_files/4-03-03_small.xml +3876 -0
  152. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  153. data/test_files/bioworks-3.3_10prots.xml +5999 -0
  154. data/test_files/bioworks31.params +77 -0
  155. data/test_files/bioworks32.params +62 -0
  156. data/test_files/bioworks33.params +63 -0
  157. data/test_files/bioworks_single_run_small.xml +7237 -0
  158. data/test_files/bioworks_small.fasta +212 -0
  159. data/test_files/bioworks_small.params +63 -0
  160. data/test_files/bioworks_small.phobius +109 -0
  161. data/test_files/bioworks_small.toppred.out +2847 -0
  162. data/test_files/bioworks_small.xml +5610 -0
  163. data/test_files/bioworks_with_INV_small.xml +3753 -0
  164. data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
  165. data/test_files/corrupted_900.srf +0 -0
  166. data/test_files/head_of_7MIX.srf +0 -0
  167. data/test_files/interact-opd1_mods_small-prot.xml +304 -0
  168. data/test_files/messups.fasta +297 -0
  169. data/test_files/opd1/000.my_answer.100lines.xml +101 -0
  170. data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
  171. data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
  172. data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
  173. data/test_files/opd1/000_020-prot.png +0 -0
  174. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
  175. data/test_files/opd1/000_020_3prots-prot.xml +62 -0
  176. data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
  177. data/test_files/opd1/sequest.3.1.params +77 -0
  178. data/test_files/opd1/sequest.3.2.params +62 -0
  179. data/test_files/opd1/twenty_scans.mzXML +418 -0
  180. data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
  181. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  182. data/test_files/opd1/twenty_scans_answ.lmata +9 -0
  183. data/test_files/opd1_020_beginning.RAW +0 -0
  184. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
  185. data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
  186. data/test_files/pepproph_small.xml +4691 -0
  187. data/test_files/phobius.small.noheader.txt +50 -0
  188. data/test_files/phobius.small.small.txt +53 -0
  189. data/test_files/s01_anC1_ld020mM.key.txt +25 -0
  190. data/test_files/s01_anC1_ld020mM.meth +0 -0
  191. data/test_files/small.fasta +297 -0
  192. data/test_files/smallraw.RAW +0 -0
  193. data/test_files/tf_bioworks2excel.bioXML +14340 -0
  194. data/test_files/tf_bioworks2excel.txt.actual +1035 -0
  195. data/test_files/toppred.small.out +416 -0
  196. data/test_files/toppred.xml.out +318 -0
  197. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
  198. data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
  199. data/test_files/yeast_gly_small-prot.xml +265 -0
  200. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
  201. data/test_files/yeast_gly_small.xml +3807 -0
  202. data/test_files/yeast_gly_small2.parentTimes +6 -0
  203. metadata +273 -57
  204. data/bin/filter.rb +0 -6
  205. data/bin/precision.rb +0 -5
  206. data/lib/spec/mzdata/parser.rb +0 -108
  207. data/lib/spec/mzdata.rb +0 -48
  208. data/lib/spec/mzxml/parser.rb +0 -449
  209. data/lib/spec/scan.rb +0 -55
  210. data/lib/spec_id/filter.rb +0 -797
  211. data/lib/spec_id/precision.rb +0 -421
  212. data/lib/toppred.rb +0 -18
  213. data/script/filter-peps.rb +0 -164
  214. data/test/tc_aa_freqs.rb +0 -59
  215. data/test/tc_fasta_shaker.rb +0 -149
  216. data/test/tc_filter.rb +0 -203
  217. data/test/tc_filter_peps.rb +0 -46
  218. data/test/tc_gi.rb +0 -17
  219. data/test/tc_id_class_anal.rb +0 -70
  220. data/test/tc_id_precision.rb +0 -89
  221. data/test/tc_msrun.rb +0 -88
  222. data/test/tc_mzxml.rb +0 -88
  223. data/test/tc_mzxml_to_lmat.rb +0 -36
  224. data/test/tc_peptide_parent_times.rb +0 -27
  225. data/test/tc_precision.rb +0 -60
  226. data/test/tc_roc.rb +0 -166
  227. data/test/tc_sample_enzyme.rb +0 -32
  228. data/test/tc_scan.rb +0 -26
  229. data/test/tc_sequest.rb +0 -336
  230. data/test/tc_spec.rb +0 -78
  231. data/test/tc_spec_id.rb +0 -201
  232. data/test/tc_spec_id_xml.rb +0 -36
  233. data/test/tc_srf.rb +0 -262
@@ -0,0 +1,160 @@
1
+ require 'xml_style_parser'
2
+ require 'ms/spectrum'
3
+ require 'ms/scan'
4
+
5
+ module MS::Parser::MzData ; end
6
+
7
# DOM-style parser for mzData files.
#
# Concrete subclasses must supply get_root_node_from_file(file), returning the
# root node of the parsed document.  Nodes are expected to respond to
# find_first, find, child, child?, next, name, content and [] (attribute
# lookup), as used below.
class MS::Parser::MzData::DOM
  include XMLStyleParser
  include MS::Parser::MzData

  # parse_type:: what kind of object to parse out (stored in @method)
  # version::    format version string (stored in @version)
  def initialize(parse_type=:msrun, version='1.0')
    @method = parse_type
    @version = version
  end

  # Returns true if description_node holds
  # <dataProcessing><software> with <name>Bioworks Browser</name> and
  # <version>3.3</version>; otherwise false.  Any StandardError raised while
  # walking the nodes (e.g. one of them is missing) also yields false.
  def is_bioworks33?(description_node)
    software_node = description_node.find_first('child::dataProcessing').find_first('child::software')
    name = software_node.find_first('child::name').content
    version = software_node.find_first('child::version').content
    (name == 'Bioworks Browser') && (version == '3.3')
  rescue
    false
  end

  # Parses +file+ and fills an MSRun object with scans, scan_count,
  # start_time and end_time (times are taken from the first and last scan).
  #
  # OPTIONS:
  #     :msrun => MSRun          # use this object instead of creating one
  #     :spectra => *true|false  # if false don't get spectra
  #
  # The caller's +opts+ hash is not modified.
  #
  # Scan slots written here:
  #     0 num, 1 msLevel, 2 retentionTime (seconds), 3 startMz, 4 endMz,
  #     5 precursors, 6 spectrum
  #
  # NOTE(review): as before, the method's value is that of its last
  # assignment (the end time); read results from the MSRun object instead.
  def msrun(file, opts={})
    get_spectra = opts.key?(:spectra) ? opts[:spectra] : true
    msrun_obj = opts[:msrun] || MS::MSRun.new

    id_to_scan_hash = {}

    root = get_root_node_from_file(file)
    description = root.find_first('child::description')
    bioworks33 = is_bioworks33?(description)
    # assumes the spectrum list is the sibling right after <description>
    # and that the parser is not producing whitespace text nodes -- TODO confirm
    spectrum_list = description.next

    # Scans are always appended in document order.  The declared 'count'
    # attribute is not used for sizing: bioworks 3.3 writes an incorrect
    # one, and Array(count) would build [count] rather than an array of
    # that length.
    scans = []

    spectrum_n = spectrum_list.child? ? spectrum_list.child : nil
    while spectrum_n
      scan = MS::Scan.new(9)
      id = spectrum_n["id"].to_i
      id_to_scan_hash[id] = scan
      spec_desc_n = spectrum_n.child          # required in sequence
      spec_settings_n = spec_desc_n.child     # required in sequence

      acq_n = spec_settings_n.find_first('descendant::acquisition')
      scan[0] = acq_n ? acq_n['acqNumber'].to_i : id

      spec_inst_n = spec_settings_n.find_first('child::spectrumInstrument')
      scan[1] = spec_inst_n['msLevel'].to_i
      scans << scan

      scan[3] = spec_inst_n['mzRangeStart'].to_f
      scan[4] = spec_inst_n['mzRangeStop'].to_f
      spec_inst_n.find('child::cvParam').each do |cv_param|
        if cv_param['name'] == 'TimeInMinutes'
          scan[2] = cv_param['value'].to_f * 60  # convert to seconds
        end
      end

      scan[5] =
        if scan[1] > 1  # precursor m/z info follows the spectrum settings
          precursors_from(spec_settings_n.next, id_to_scan_hash, bioworks33)
        else
          []
        end

      scan[6] = spectrum_from(spec_desc_n) if get_spectra

      spectrum_n = spectrum_n.next
    end

    if bioworks33
      # bioworks 3.3 precursor references are not trusted, so parents are
      # assigned afterwards
      MS::MSRun.add_parent_scan(scans, get_spectra)
    end
    msrun_obj.scans = scans
    msrun_obj.scan_count = scans.size
    first_scan = msrun_obj.scans.first
    last_scan = msrun_obj.scans.last
    # a file without spectra leaves both times nil instead of failing on nil
    msrun_obj.start_time = first_scan && first_scan.time
    msrun_obj.end_time = last_scan && last_scan.time
  end

  private

  # Builds the MS::Precursor objects for one precursor list node.
  # Precursor slots written: 0 mz, 1 intensity, 2 parent scan, 3 ms_level,
  # 5 charge states.  Aborts when the list does not declare exactly one
  # precursor.
  def precursors_from(prec_list_n, id_to_scan_hash, bioworks33)
    abort('can only process one precursor m/z right now!') if prec_list_n['count'] != '1'
    prec_list_n.find('child::precursor').map do |prec_n|
      prec = MS::Precursor.new
      unless bioworks33  # bioworks33 points to the wrong scan!!!
        prec[2] = id_to_scan_hash[prec_n['spectrumRef'].to_i]
      end
      prec[3] = prec_n['msLevel'].to_i
      charges = []
      prec_n.find('descendant::cvParam').each do |cv_param_n|
        case cv_param_n['name']
        when 'MassToChargeRatio'
          prec[0] = cv_param_n['value'].to_f
          # The intensity can only be looked up when the parent scan is
          # known and its spectrum was read (not for bioworks33, not when
          # spectra are skipped, not for an unknown spectrumRef).
          parent_scan = prec[2]
          if parent_scan && parent_scan.spectrum
            prec[1] = parent_scan.spectrum.intensity_at_mz(prec[0])
          end
        when 'ChargeState'
          charges << cv_param_n['value'].to_i
        end
      end
      prec[5] = charges
      prec
    end
  end

  # Reads the m/z and intensity arrays that follow spec_desc_n and returns
  # an MS::Spectrum.  Siblings are walked one at a time until the
  # 'mzArrayBinary' node is reached; the intensity array is taken from the
  # node right after it.
  def spectrum_from(spec_desc_n)
    mz_array_bin_n = spec_desc_n.next
    until mz_array_bin_n.nil? || mz_array_bin_n.name == 'mzArrayBinary'
      mz_array_bin_n = mz_array_bin_n.next
    end
    if mz_array_bin_n.nil?
      raise RuntimeError, "no mzArrayBinary node found for spectrum"
    end
    mz = decode_binary_array(mz_array_bin_n.child)
    inten = decode_binary_array(mz_array_bin_n.next.child)
    MS::Spectrum.new(mz, inten)
  end

  # Decodes one base64 <data> node; the third argument to base64_to_array is
  # false for endian == 'little' and true for anything else.
  def decode_binary_array(data_n)
    MS::Spectrum.base64_to_array(data_n.content, data_n['precision'].to_i, data_n['endian'] != 'little')
  end

end
158
+
159
+
160
+
@@ -0,0 +1,7 @@
1
+
2
# mzData parser whose document root comes from XML::Document; everything
# else is inherited from MS::Parser::MzData::DOM.
class MS::Parser::MzData::LibXML < MS::Parser::MzData::DOM
  # Loads +file+ with XML::Document.file and returns the document's root node.
  def get_root_node_from_file(file)
    document = XML::Document.file(file)
    document.root
  end
end
7
+
@@ -0,0 +1,25 @@
1
+ require 'ms/msrun'
2
+
3
+ module MS; end
4
+
5
# Namespace and factory for the mzData parsers.
module MS::Parser::MzData
  # directory (relative to the load path) holding the concrete parser files
  Base_dir_for_parsers = 'ms/parser/mzdata'

  # mixes in MS::Parser and XMLStyleParser
  include MS::Parser
  include XMLStyleParser

  # Returns an instance of a specific parser class
  # (MS::Parser::MzData::#{ParserType}), selected by
  # XMLStyleParser.choose_parser after the parser files under
  # Base_dir_for_parsers have been required.
  #
  # parse_type:: passed to choose_parser and to the parser's constructor
  # version::    passed to the parser's constructor
  #
  # The arguments are also remembered in @version and @method on this module.
  def self.new(parse_type=:msrun, version='1.05')
    @version, @method = version, parse_type
    XMLStyleParser.require_parse_files(Base_dir_for_parsers)
    chosen_class = XMLStyleParser.choose_parser(self, parse_type)
    chosen_class.new(parse_type, version)
  end

end
24
+
25
+
@@ -0,0 +1,11 @@
1
+ require 'ms/parser/mzxml/dom'
2
+
3
# mzXML parser whose document root comes from ::AXML; the parsing logic
# itself is inherited from MS::Parser::MzXML::DOM.
class MS::Parser::MzXML::AXML < MS::Parser::MzXML::DOM
  # Returns what ::AXML.parse_file gives for +file+.
  def get_root_node_from_file(file)
    root_node = ::AXML.parse_file(file)
    root_node
  end

  # Returns what ::AXML.parse gives for the XML text in +string+.
  def get_root_node_from_string(string)
    root_node = ::AXML.parse(string)
    root_node
  end
end
11
+
@@ -0,0 +1,159 @@
1
+ require 'xml_style_parser'
2
+ require 'ms/spectrum'
3
+ require 'ms/scan'
4
+
5
+
6
# DOM-style parser for mzXML files.
#
# Concrete subclasses must supply get_root_node_from_file(file) and, for
# version '2.0' documents, get_root_node_from_string(string).
class MS::Parser::MzXML::DOM
  include XMLStyleParser
  include MS::Parser::MzXML

  # scan slots: 0 num, 1 msLevel, 2 retentionTime, 3 startMz, 4 endMz,
  #             5 precursors, 6 spectrum

  # parse_type:: what kind of object to parse out (stored in @method)
  # version::    mzXML version string (stored in @version)
  def initialize(parse_type=:msrun, version='1.0')
    @method = parse_type
    @version = version
  end

  # Creates an MS::Scan from the attributes of a scan node (anything
  # answering [] with attribute names).  Slots 3 and 4 are only filled when
  # 'startMz' is present; slot 2 is left nil when 'retentionTime' is absent.
  def new_scan_from_hash(node)
    scan = MS::Scan.new # array class creates one with 9 positions
    scan[0] = node['num'].to_i
    scan[1] = node['msLevel'].to_i
    if rt = node['retentionTime']
      # drops the first two characters and the last one before converting
      # (presumably a duration such as "PT12.3S" -- TODO confirm)
      scan[2] = rt[2...-1].to_f
    end
    if start_mz = node['startMz']
      scan[3] = start_mz.to_f
      scan[4] = node['endMz'].to_f
    end
    scan
  end

  # Takes a scan node and creates a scan object with its precursors (slot 5)
  # and, when get_spectra is true, its spectrum (slot 6).
  # scans_by_num maps scan numbers to already-created scans and is used to
  # resolve 'precursorScanNum'.
  # Aborts for versions that do not compare below '3.0'.
  def create_scan(scan_n, scans_by_num, get_spectra=true)
    unless @version < '3.0'
      # note that mzXML version 3.0 *can* have more than one peak...
      # there is one spectrum per scan here, so it is not handled
      abort 'not supporting version 3.0 just yet'
    end
    scan = new_scan_from_hash(scan_n)
    precs = []
    scan_n.each do |node|
      case node.name
      when 'precursorMz'
        prec = MS::Precursor.new
        prec[1] = node['precursorIntensity'].to_f
        prec[0] = node.content.to_f
        if parent_num = node['precursorScanNum']
          prec[2] = scans_by_num[parent_num.to_i]
        end
        precs << prec
      when 'peaks'
        next unless get_spectra
        scan[6] = MS::Spectrum.from_base64_peaks(node.content, node['precision'].to_i)
      end
    end
    scan[5] = precs
    scan
  end


  # returns an array of msrun objects (not implemented)
  def msruns(file)
    raise NotImplementedError
  end

  # Returns the contents of +file+ as a string with doubled
  # </scan> </scan> closing tags collapsed into one and a </scan> inserted
  # between a </peaks> and the following <scan.
  # Windows line endings are used for the inserted text since these files
  # are generated off a windows machine only.
  def fix_bad_scan_tags(file)
    text = IO.read(file)
    text = text.gsub(/<\/scan>\s+<\/scan>/m, '</scan>')
    text.gsub(/<\/peaks>\s+<scan/m, "</peaks>\r\n </scan>\r\n <scan")
  end

  # Parses +file+ and fills an MSRun object with scan_count, scans,
  # start_time and end_time.  Returns the array of scans.
  #
  # right now cannot parse multiple runs out of an mzXML version 2 file since
  # this is built around a single run per file
  # OPTIONS:
  #     :msrun => MSRun          # use this object instead of creating one
  #     :spectra => *true|false  # if false don't get spectra
  #
  # The caller's +opts+ hash is not modified.
  # Raises NotImplementedError when more than one msRun node is found and
  # RuntimeError when no msRun node can be located.
  def msrun(file, opts={})
    get_spectra = opts.key?(:spectra) ? opts[:spectra] : true
    msrun_obj = opts[:msrun] || MS::MSRun.new

    root =
      if @version == '2.0'
        get_root_node_from_string(fix_bad_scan_tags(file))
      else
        get_root_node_from_file(file)
      end

    # right now we are only finding the first msRun (probably a rare case of
    # multiple runs in an mzXML file...)
    msrun_n =
      if @version >= '2.0'
        kids = root.children.select {|v| v.name == 'msRun' }
        raise(NotImplementedError, "one msrun per doc right now" ) if kids.size > 1
        kids.first
      else
        root
      end
    if msrun_n.nil?
      raise RuntimeError, "no msRun node found"
    end
    if msrun_n.name != 'msRun'
      raise RuntimeError, "extra node slipped in somehow"
    end

    ## HEADER
    scan_count = msrun_n['scanCount'].to_i
    msrun_obj.scan_count = scan_count
    scans_by_num = Array.new(scan_count + 1)

    ## SPECTRUM
    # NOTE(review): sized from the declared scanCount; if fewer top-level
    # scan nodes are present the tail stays nil -- confirm that is intended
    scans = Array.new( scan_count )
    scn_index = 0

    msrun_n.each do |scan_n|
      next unless scan_n.name == 'scan'
      scan = create_scan(scan_n, scans_by_num, get_spectra)
      scans[scn_index] = scan
      scans_by_num[scan[0]] = scan
      scn_index += 1
    end

    ## update the scan's parents
    MS::MSRun.add_parent_scan(scans)

    # note that startTime and endTime are optional AND in >2.2 are dateTime
    # instead of duration types!, so we will just use scan times...
    # Also, note that startTime and endTime are BROKEN on readw -> mzXML 2.0
    # export.  They give the start and end time in seconds, but they are
    # really minutes.  All the more reason to use the first and last scans!
    first_scan = scans.first
    last_scan = scans.last
    # left nil (rather than failing on nil) when there is no first/last scan
    msrun_obj.start_time = first_scan && first_scan.time
    msrun_obj.end_time = last_scan && last_scan.time

    msrun_obj.scans = scans
  end

end
157
+
158
+
159
+
@@ -0,0 +1,253 @@
1
+
2
+ require 'xml_style_parser'
3
+ require 'ms/spectrum'
4
+ require 'ms/scan'
5
+
6
+
7
# mzXML parser that reads the document with ::Hpricot.XML.
class MS::Parser::MzXML::Hpricot
  include XMLStyleParser
  include MS::Parser::MzXML

  # names of the scan slots, in index order (0..6)
  @@scan_atts = %w(num msLevel retentionTime startMz endMz precursors spectrum)

  # parse_type:: what kind of object to parse out (stored in @method)
  # version::    mzXML version string (stored in @version)
  def initialize(parse_type=:msrun, version='1.0')
    @method = parse_type
    @version = version
  end

  # Creates an MS::Scan from the attributes of a scan node (anything
  # answering [] with attribute names).  Slots 3 and 4 are only filled when
  # 'startMz' is present; slot 2 is left nil when 'retentionTime' is absent.
  def new_scan_from_hash(node)
    scan = MS::Scan.new # array class creates one with 9 positions
    scan[0] = node['num'].to_i
    scan[1] = node['msLevel'].to_i
    if rt = node['retentionTime']
      # drops the first two characters and the last one before converting
      # (presumably a duration such as "PT12.3S" -- TODO confirm)
      scan[2] = rt[2...-1].to_f
    end
    if start_mz = node['startMz']
      scan[3] = start_mz.to_f
      scan[4] = node['endMz'].to_f
    end
    scan
  end

  # Takes a scan node and creates a scan object with its precursors (slot 5)
  # and, when get_spectra is true, its spectrum (slot 6).
  # scans_by_num maps scan numbers to already-created scans and is used to
  # resolve 'precursorScanNum'.
  # Aborts for versions that do not compare below '3.0'.
  #
  # NOTE(review): each_child may also yield non-element children; this
  # assumes every yielded child answers #name -- confirm against Hpricot.
  def create_scan(scan_n, scans_by_num, get_spectra=true)
    unless @version < '3.0'
      # note that mzXML version 3.0 *can* have more than one peak...
      # there is one spectrum per scan here, so it is not handled
      abort 'not supporting version 3.0 just yet'
    end
    scan = new_scan_from_hash(scan_n)
    precs = []
    scan_n.each_child do |node|
      case node.name
      when 'precursorMz'
        prec = MS::Precursor.new
        prec[1] = node['precursorIntensity'].to_f
        prec[0] = node.content.to_f
        if parent_num = node['precursorScanNum']
          prec[2] = scans_by_num[parent_num.to_i]
        end
        precs << prec
      when 'peaks'
        next unless get_spectra
        scan[6] = MS::Spectrum.from_base64_peaks(node.content, node['precision'].to_i)
      end
    end
    scan[5] = precs
    scan
  end


  # returns an array of msrun objects (not implemented)
  def msruns(file)
    raise NotImplementedError
  end

  # Returns the contents of +file+ as a string with doubled
  # </scan> </scan> closing tags collapsed into one and a </scan> inserted
  # between a </peaks> and the following <scan.
  # Windows line endings are used for the inserted text since these files
  # are generated off a windows machine only.
  # (msrun below does not call this.)
  def fix_bad_scan_tags(file)
    text = IO.read(file)
    text = text.gsub(/<\/scan>\s+<\/scan>/m, '</scan>')
    text.gsub(/<\/peaks>\s+<scan/m, "</peaks>\r\n </scan>\r\n <scan")
  end

  # Parses +file+ and fills an MSRun object with scan_count, scans,
  # start_time and end_time.  Returns the array of scans.
  #
  # right now cannot parse multiple runs out of an mzXML version 2 file since
  # this is built around a single run per file
  # OPTIONS:
  #     :msrun => MSRun          # use this object instead of creating one
  #     :spectra => *true|false  # if false don't get spectra
  #
  # The caller's +opts+ hash is not modified.
  # Raises RuntimeError when the document has no msRun element.
  def msrun(file, opts={})
    get_spectra = opts.key?(:spectra) ? opts[:spectra] : true
    msrun_obj = opts[:msrun] || MS::MSRun.new

    doc = File.open(file) {|fh| ::Hpricot.XML(fh) }
    msrun_n = doc.at('msRun')
    if msrun_n.nil?
      raise RuntimeError, "no msRun node found"
    end

    ## HEADER
    scan_count = msrun_n['scanCount'].to_i
    msrun_obj.scan_count = scan_count
    scans_by_num = Array.new(scan_count + 1)

    ## SPECTRUM
    # NOTE(review): sized from the declared scanCount; if fewer top-level
    # scan nodes are present the tail stays nil -- confirm that is intended
    scans = Array.new( scan_count )
    scn_index = 0

    # (the leftover debugging "p scan_n; abort" that used to sit at the top
    # of this loop terminated the process on the first child and is gone)
    msrun_n.each_child do |scan_n|
      next unless scan_n.name == 'scan'
      scan = create_scan(scan_n, scans_by_num, get_spectra)
      scans[scn_index] = scan
      scans_by_num[scan[0]] = scan
      scn_index += 1
    end

    ## update the scan's parents
    MS::MSRun.add_parent_scan(scans)

    # note that startTime and endTime are optional AND in >2.2 are dateTime
    # instead of duration types!, so we will just use scan times...
    # Also, note that startTime and endTime are BROKEN on readw -> mzXML 2.0
    # export.  They give the start and end time in seconds, but they are
    # really minutes.  All the more reason to use the first and last scans!
    first_scan = scans.first
    last_scan = scans.last
    # left nil (rather than failing on nil) when there is no first/last scan
    msrun_obj.start_time = first_scan && first_scan.time
    msrun_obj.end_time = last_scan && last_scan.time

    msrun_obj.scans = scans
  end

end
149
+
150
+
151
+
152
+ =begin
153
+ ## THIS IS THE SAX PARSER VERSION. IT NEEDS A BIT OF BRUSH UP AND IT WOULD
154
+ ## WORK. I THINK THE default guy is probably faster
155
+
156
+ def msrun(file, msrun_obj)
157
+ # Figure out where the first scan is at in the file:
158
+ pos_after_first_scan = nil
159
+ File.open(file) do |fh|
160
+ fh.each do |line|
161
+ if line =~ /<scan/
162
+ pos_after_first_scan = fh.pos
163
+ end
164
+ end
165
+ end
166
+
167
+ # Get only the header:
168
+ header_string = IO.read(file, pos_after_first_scan)
169
+
170
+ @msrun_obj = msrun_obj
171
+ # Parse out the header info:
172
+ parser = XML::SaxParser.new
173
+ parser.string = header_string
174
+ parser.on_start_element do |name, attrs|
175
+ if name == 'msRun'
176
+ @msrun_obj.scan_count = attrs['scanCount'].to_i
177
+ @msrun_obj.start_time = attrs['startTime'][2...-1].to_f
178
+ @msrun_obj.end_time = attrs['endTime'][2...-1].to_f
179
+ end
180
+ end
181
+ parser.parse
182
+
183
+
184
+ # Parse the scans out:
185
+ scan_st = 'scan'
186
+ prec_st = 'precursorMz'
187
+ peaks_st = 'peaks'
188
+ prec_inten_st = 'precursorIntensity'
189
+ precision_st = 'precision'
190
+
191
+ #parser = MS::Parser::MzXML::Hpricot::SaxParser::MSRun.new
192
+ parser = XML::SaxParser.new
193
+ parser.filename = file
194
+ parser.on_start_document do
195
+ @scans = []
196
+ @current_scan = nil
197
+ @get_peaks = false
198
+ @get_prec_mz = false
199
+ end
200
+
201
+ parser.on_characters do |chars|
202
+ if @get_peaks
203
+ @get_peaks << chars
204
+ elsif @get_prec_mz
205
+ @get_prec_mz << chars
206
+ end
207
+ end
208
+
209
+ parser.on_end_element do |el|
210
+ case el
211
+ when 'peaks'
212
+ @current_scan.spectrum = Spectrum.from_base64_peaks(@get_peaks, @precision, true)
213
+ @get_peaks = false
214
+ when 'precursorMz'
215
+ @current_scan[5] = [Precursor.new([@get_prec_mz.to_f])]
216
+ @get_prec_mz = false
217
+ end
218
+ end
219
+
220
+ parser.on_start_element do |name, attr_hash|
221
+ case name
222
+ when scan_st
223
+ @current_scan = new_scan_from_hash(attr_hash)
224
+ sz = @scans.size
225
+ @scans << @current_scan
226
+ when prec_st
227
+ @current_scan[5].first[1] = attr_hash[prec_inten_st].to_f
228
+ @get_prec_mz = ''
229
+ when peaks_st
230
+ @precision = attr_hash[precision_st].to_i
231
+ case @version[0,1].to_ip
232
+ when 3
233
+ if ch['pairOrder'] != 'm/z-int' # only version 3.0 has others
234
+ abort "cannot yet read anything but 'm/z-int' pair order"
235
+ end
236
+ end
237
+ @get_peaks = ''
238
+ end
239
+ end
240
+ parser.parse
241
+
242
+ @msrun_obj.scans = @scans
243
+ @msrun_obj.scans.each_with_index do |sc,i|
244
+ if sc.spectrum.mz == nil
245
+ abort "INDEX: #{i}"
246
+ end
247
+ end
248
+ @msrun_obj
249
+ end
250
+ =end
251
+
252
+
253
+
@@ -0,0 +1,15 @@
1
+
2
+ require 'ms/parser/mzxml/dom'
3
+
4
# mzXML parser whose document root comes from the XML (libxml) classes; the
# parsing logic itself is inherited from MS::Parser::MzXML::DOM.
class MS::Parser::MzXML::LibXML < MS::Parser::MzXML::DOM
  # Parses the XML text in +string+ and returns the document's root node.
  # DOM#msrun calls this (under this exact name) for version '2.0' files.
  def get_root_node_from_string(string)
    XML::Parser.string(string).parse.root
  end

  # The method was originally defined under this misspelled name; it is kept
  # as an alias so existing callers keep working.
  alias_method :goot_root_node_from_string, :get_root_node_from_string

  # Parses +file+ and returns the document's root node.
  def get_root_node_from_file(file)
    XML::Document.file(file).root
  end
end
12
+
13
+
14
+
15
+