mspire 0.2.4 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (233) hide show
  1. data/INSTALL +1 -0
  2. data/README +25 -0
  3. data/Rakefile +129 -40
  4. data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
  5. data/bin/bioworks_to_pepxml.rb +1 -0
  6. data/bin/fasta_shaker.rb +1 -96
  7. data/bin/filter_and_validate.rb +5 -0
  8. data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
  9. data/bin/prob_validate.rb +6 -0
  10. data/bin/raw_to_mzXML.rb +2 -2
  11. data/bin/srf_group.rb +1 -0
  12. data/bin/srf_to_sqt.rb +40 -0
  13. data/changelog.txt +68 -0
  14. data/lib/align/chams.rb +6 -6
  15. data/lib/align.rb +4 -3
  16. data/lib/bsearch.rb +120 -0
  17. data/lib/fasta.rb +318 -86
  18. data/lib/group_by.rb +10 -0
  19. data/lib/index_by.rb +11 -0
  20. data/lib/merge_deep.rb +21 -0
  21. data/lib/{spec → ms/converter}/mzxml.rb +77 -109
  22. data/lib/ms/gradient_program.rb +171 -0
  23. data/lib/ms/msrun.rb +209 -0
  24. data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
  25. data/lib/ms/parser/mzdata/axml.rb +12 -0
  26. data/lib/ms/parser/mzdata/dom.rb +160 -0
  27. data/lib/ms/parser/mzdata/libxml.rb +7 -0
  28. data/lib/ms/parser/mzdata.rb +25 -0
  29. data/lib/ms/parser/mzxml/axml.rb +11 -0
  30. data/lib/ms/parser/mzxml/dom.rb +159 -0
  31. data/lib/ms/parser/mzxml/hpricot.rb +253 -0
  32. data/lib/ms/parser/mzxml/libxml.rb +15 -0
  33. data/lib/ms/parser/mzxml/regexp.rb +122 -0
  34. data/lib/ms/parser/mzxml/rexml.rb +72 -0
  35. data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
  36. data/lib/ms/parser/mzxml.rb +175 -0
  37. data/lib/ms/parser.rb +108 -0
  38. data/lib/ms/precursor.rb +10 -0
  39. data/lib/ms/scan.rb +81 -0
  40. data/lib/ms/spectrum.rb +193 -0
  41. data/lib/ms.rb +10 -0
  42. data/lib/mspire.rb +4 -0
  43. data/lib/roc.rb +61 -1
  44. data/lib/sample_enzyme.rb +31 -8
  45. data/lib/scan_i.rb +21 -0
  46. data/lib/spec_id/aa_freqs.rb +7 -3
  47. data/lib/spec_id/bioworks.rb +20 -14
  48. data/lib/spec_id/digestor.rb +139 -0
  49. data/lib/spec_id/mass.rb +116 -0
  50. data/lib/spec_id/parser/proph.rb +236 -0
  51. data/lib/spec_id/precision/filter/cmdline.rb +209 -0
  52. data/lib/spec_id/precision/filter/interactive.rb +134 -0
  53. data/lib/spec_id/precision/filter/output.rb +147 -0
  54. data/lib/spec_id/precision/filter.rb +623 -0
  55. data/lib/spec_id/precision/output.rb +60 -0
  56. data/lib/spec_id/precision/prob/cmdline.rb +139 -0
  57. data/lib/spec_id/precision/prob/output.rb +88 -0
  58. data/lib/spec_id/precision/prob.rb +171 -0
  59. data/lib/spec_id/proph/pep_summary.rb +92 -0
  60. data/lib/spec_id/proph/prot_summary.rb +484 -0
  61. data/lib/spec_id/proph.rb +2 -466
  62. data/lib/spec_id/protein_summary.rb +2 -2
  63. data/lib/spec_id/sequest/params.rb +316 -0
  64. data/lib/spec_id/sequest/pepxml.rb +1513 -0
  65. data/lib/spec_id/sequest.rb +2 -1672
  66. data/lib/spec_id/srf.rb +445 -177
  67. data/lib/spec_id.rb +183 -95
  68. data/lib/spec_id_xml.rb +8 -10
  69. data/lib/transmem/phobius.rb +147 -0
  70. data/lib/transmem/toppred.rb +368 -0
  71. data/lib/transmem.rb +157 -0
  72. data/lib/validator/aa.rb +135 -0
  73. data/lib/validator/background.rb +73 -0
  74. data/lib/validator/bias.rb +95 -0
  75. data/lib/validator/cmdline.rb +260 -0
  76. data/lib/validator/decoy.rb +94 -0
  77. data/lib/validator/digestion_based.rb +69 -0
  78. data/lib/validator/probability.rb +48 -0
  79. data/lib/validator/prot_from_pep.rb +234 -0
  80. data/lib/validator/transmem.rb +272 -0
  81. data/lib/validator/true_pos.rb +46 -0
  82. data/lib/validator.rb +214 -0
  83. data/lib/xml.rb +38 -0
  84. data/lib/xml_style_parser.rb +105 -0
  85. data/lib/xmlparser_wrapper.rb +19 -0
  86. data/script/compile_and_plot_smriti_final.rb +97 -0
  87. data/script/extract_gradient_programs.rb +56 -0
  88. data/script/get_apex_values_rexml.rb +44 -0
  89. data/script/mzXML2timeIndex.rb +1 -1
  90. data/script/smriti_final_analysis.rb +103 -0
  91. data/script/toppred_to_yaml.rb +47 -0
  92. data/script/tpp_installer.rb +1 -1
  93. data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
  94. data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
  95. data/specs/bin/fasta_shaker_spec.rb +259 -0
  96. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
  97. data/specs/bin/filter_and_validate_spec.rb +124 -0
  98. data/specs/bin/ms_to_lmat_spec.rb +34 -0
  99. data/specs/bin/prob_validate_spec.rb +62 -0
  100. data/specs/bin/protein_summary_spec.rb +10 -0
  101. data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
  102. data/specs/gi_spec.rb +22 -0
  103. data/specs/load_bin_path.rb +7 -0
  104. data/specs/merge_deep_spec.rb +13 -0
  105. data/specs/ms/gradient_program_spec.rb +77 -0
  106. data/specs/ms/msrun_spec.rb +455 -0
  107. data/specs/ms/parser_spec.rb +92 -0
  108. data/specs/ms/spectrum_spec.rb +89 -0
  109. data/specs/roc_spec.rb +251 -0
  110. data/specs/rspec_autotest.rb +149 -0
  111. data/specs/sample_enzyme_spec.rb +41 -0
  112. data/specs/spec_helper.rb +133 -0
  113. data/specs/spec_id/aa_freqs_spec.rb +52 -0
  114. data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
  115. data/specs/spec_id/digestor_spec.rb +75 -0
  116. data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
  117. data/specs/spec_id/precision/filter/output_spec.rb +31 -0
  118. data/specs/spec_id/precision/filter_spec.rb +243 -0
  119. data/specs/spec_id/precision/prob_spec.rb +111 -0
  120. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  121. data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
  122. data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
  123. data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
  124. data/specs/spec_id/sequest/params_spec.rb +68 -0
  125. data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
  126. data/specs/spec_id/sqt_spec.rb +138 -0
  127. data/specs/spec_id/srf_spec.rb +209 -0
  128. data/specs/spec_id/srf_spec_helper.rb +302 -0
  129. data/specs/spec_id_helper.rb +33 -0
  130. data/specs/spec_id_spec.rb +361 -0
  131. data/specs/spec_id_xml_spec.rb +33 -0
  132. data/specs/transmem/phobius_spec.rb +423 -0
  133. data/specs/transmem/toppred_spec.rb +297 -0
  134. data/specs/transmem_spec.rb +60 -0
  135. data/specs/transmem_spec_shared.rb +64 -0
  136. data/specs/validator/aa_spec.rb +107 -0
  137. data/specs/validator/background_spec.rb +51 -0
  138. data/specs/validator/bias_spec.rb +146 -0
  139. data/specs/validator/decoy_spec.rb +51 -0
  140. data/specs/validator/fasta_helper.rb +26 -0
  141. data/specs/validator/prot_from_pep_spec.rb +141 -0
  142. data/specs/validator/transmem_spec.rb +145 -0
  143. data/specs/validator/true_pos_spec.rb +58 -0
  144. data/specs/validator_helper.rb +33 -0
  145. data/specs/xml_spec.rb +12 -0
  146. data/test_files/000_pepxml18_small.xml +206 -0
  147. data/test_files/020a.mzXML.timeIndex +4710 -0
  148. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
  149. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
  150. data/test_files/4-03-03_small-prot.xml +321 -0
  151. data/test_files/4-03-03_small.xml +3876 -0
  152. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  153. data/test_files/bioworks-3.3_10prots.xml +5999 -0
  154. data/test_files/bioworks31.params +77 -0
  155. data/test_files/bioworks32.params +62 -0
  156. data/test_files/bioworks33.params +63 -0
  157. data/test_files/bioworks_single_run_small.xml +7237 -0
  158. data/test_files/bioworks_small.fasta +212 -0
  159. data/test_files/bioworks_small.params +63 -0
  160. data/test_files/bioworks_small.phobius +109 -0
  161. data/test_files/bioworks_small.toppred.out +2847 -0
  162. data/test_files/bioworks_small.xml +5610 -0
  163. data/test_files/bioworks_with_INV_small.xml +3753 -0
  164. data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
  165. data/test_files/corrupted_900.srf +0 -0
  166. data/test_files/head_of_7MIX.srf +0 -0
  167. data/test_files/interact-opd1_mods_small-prot.xml +304 -0
  168. data/test_files/messups.fasta +297 -0
  169. data/test_files/opd1/000.my_answer.100lines.xml +101 -0
  170. data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
  171. data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
  172. data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
  173. data/test_files/opd1/000_020-prot.png +0 -0
  174. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
  175. data/test_files/opd1/000_020_3prots-prot.xml +62 -0
  176. data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
  177. data/test_files/opd1/sequest.3.1.params +77 -0
  178. data/test_files/opd1/sequest.3.2.params +62 -0
  179. data/test_files/opd1/twenty_scans.mzXML +418 -0
  180. data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
  181. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  182. data/test_files/opd1/twenty_scans_answ.lmata +9 -0
  183. data/test_files/opd1_020_beginning.RAW +0 -0
  184. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
  185. data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
  186. data/test_files/pepproph_small.xml +4691 -0
  187. data/test_files/phobius.small.noheader.txt +50 -0
  188. data/test_files/phobius.small.small.txt +53 -0
  189. data/test_files/s01_anC1_ld020mM.key.txt +25 -0
  190. data/test_files/s01_anC1_ld020mM.meth +0 -0
  191. data/test_files/small.fasta +297 -0
  192. data/test_files/smallraw.RAW +0 -0
  193. data/test_files/tf_bioworks2excel.bioXML +14340 -0
  194. data/test_files/tf_bioworks2excel.txt.actual +1035 -0
  195. data/test_files/toppred.small.out +416 -0
  196. data/test_files/toppred.xml.out +318 -0
  197. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
  198. data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
  199. data/test_files/yeast_gly_small-prot.xml +265 -0
  200. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
  201. data/test_files/yeast_gly_small.xml +3807 -0
  202. data/test_files/yeast_gly_small2.parentTimes +6 -0
  203. metadata +273 -57
  204. data/bin/filter.rb +0 -6
  205. data/bin/precision.rb +0 -5
  206. data/lib/spec/mzdata/parser.rb +0 -108
  207. data/lib/spec/mzdata.rb +0 -48
  208. data/lib/spec/mzxml/parser.rb +0 -449
  209. data/lib/spec/scan.rb +0 -55
  210. data/lib/spec_id/filter.rb +0 -797
  211. data/lib/spec_id/precision.rb +0 -421
  212. data/lib/toppred.rb +0 -18
  213. data/script/filter-peps.rb +0 -164
  214. data/test/tc_aa_freqs.rb +0 -59
  215. data/test/tc_fasta_shaker.rb +0 -149
  216. data/test/tc_filter.rb +0 -203
  217. data/test/tc_filter_peps.rb +0 -46
  218. data/test/tc_gi.rb +0 -17
  219. data/test/tc_id_class_anal.rb +0 -70
  220. data/test/tc_id_precision.rb +0 -89
  221. data/test/tc_msrun.rb +0 -88
  222. data/test/tc_mzxml.rb +0 -88
  223. data/test/tc_mzxml_to_lmat.rb +0 -36
  224. data/test/tc_peptide_parent_times.rb +0 -27
  225. data/test/tc_precision.rb +0 -60
  226. data/test/tc_roc.rb +0 -166
  227. data/test/tc_sample_enzyme.rb +0 -32
  228. data/test/tc_scan.rb +0 -26
  229. data/test/tc_sequest.rb +0 -336
  230. data/test/tc_spec.rb +0 -78
  231. data/test/tc_spec_id.rb +0 -201
  232. data/test/tc_spec_id_xml.rb +0 -36
  233. data/test/tc_srf.rb +0 -262
@@ -0,0 +1,160 @@
1
+ require 'xml_style_parser'
2
+ require 'ms/spectrum'
3
+ require 'ms/scan'
4
+
5
+ module MS::Parser::MzData ; end
6
+
7
# DOM-style parser for mzData files.  Subclasses supply
# get_root_node_from_file(file); this class walks the returned node tree and
# fills an MS::MSRun with MS::Scan objects.
#
# Scan slots filled here:
#   0   1       2             3       4     5          6
#   num msLevel retentionTime startMz endMz precursors spectrum
class MS::Parser::MzData::DOM
  include XMLStyleParser
  include MS::Parser::MzData

  # parse_type is stored in @method, version in @version
  def initialize(parse_type=:msrun, version='1.0')
    @method = parse_type
    @version = version
  end

  # true if there is a node <dataProcessing><software><name>Bioworks Browser</...>
  # whose <version> is 3.3, otherwise false
  def is_bioworks33?(description_node)
    begin
      software_node = description_node.find_first('child::dataProcessing').find_first('child::software')
      name = software_node.find_first('child::name').content
      version = software_node.find_first('child::version').content
      (name == 'Bioworks Browser') && (version == '3.3')
    rescue
      # a missing node anywhere in the chain means "not Bioworks 3.3"
      false
    end
  end

  # Parses +file+ and fills in scans, scan_count, start_time and end_time on
  # the msrun object.
  #
  # OPTIONS:
  #     :msrun => MSRun          # use this object instead of creating one
  #     :spectra => *true|false  # if false don't get spectra
  #
  # NOTE: sets opts[:spectra] on the hash that was passed in when the key is
  # absent (as the original code did).
  def msrun(file, opts={})
    opts[:spectra] = true unless opts.key?(:spectra)
    msrun_obj = opts[:msrun] || MS::MSRun.new
    # should ensure that parsing is not counting spaces...

    id_to_scan_hash = {}

    root = get_root_node_from_file(file)
    scan_count = 0
    description = root.find_first('child::description')
    bioworks33 = is_bioworks33?(description)
    spectrum_list = description.next

    # Scans are collected in document order.  The declared 'count' attribute
    # is not used for sizing: bioworks33 gives incorrect scan numbers, and
    # Array(count) would only have produced the one-element array [count].
    scans = []

    # if I move from node to node, it means I've checked that it's a sequence
    # and that the elements are req'd
    if spectrum_list.child?
      spectrum_n = spectrum_list.child
      loop do
        scan_count += 1
        scan = MS::Scan.new(9)
        id = spectrum_n["id"].to_i
        id_to_scan_hash[id] = scan
        spec_desc_n = spectrum_n.child       # required in sequence
        spec_settings_n = spec_desc_n.child  # required in sequence
        if acq_n = spec_settings_n.find_first('descendant::acquisition')
          scan[0] = acq_n['acqNumber'].to_i
        else
          scan[0] = id
        end
        spec_inst_n = spec_settings_n.find_first('child::spectrumInstrument')
        scan[1] = spec_inst_n['msLevel'].to_i

        scans << scan

        scan[3] = spec_inst_n['mzRangeStart'].to_f
        scan[4] = spec_inst_n['mzRangeStop'].to_f
        spec_inst_n.find('child::cvParam').each do |cv_param|
          if cv_param['name'] == 'TimeInMinutes'
            scan[2] = cv_param['value'].to_f * 60 #convert to seconds
          end
        end

        scan[5] =
          if scan[1] > 1 # precursormz info
            mzdata_precursors(spec_settings_n.next, id_to_scan_hash, bioworks33)
          else # no precursors
            []
          end

        scan[6] = mzdata_spectrum(spec_desc_n) if opts[:spectra]

        # set up the next loop
        break unless spectrum_n = spectrum_n.next
      end
    end
    if bioworks33
      MS::MSRun.add_parent_scan(scans, opts[:spectra])
    end
    msrun_obj.scans = scans
    msrun_obj.scan_count = scan_count
    msrun_obj.start_time = msrun_obj.scans.first.time
    msrun_obj.end_time = msrun_obj.scans.last.time
  end

  private

  # Builds the MS::Precursor list for one scan from its precursor-list node.
  # Precursor slots: %w(mz inten parent ms_level parent charge_states)
  def mzdata_precursors(prec_list_n, id_to_scan_hash, bioworks33)
    abort('can only process one precursor m/z right now!') if prec_list_n['count'] != '1'
    prec_list_n.find('child::precursor').map do |prec_n|
      prec = MS::Precursor.new
      unless bioworks33 # bioworks33 points to the wrong scan!!!
        prec[2] = id_to_scan_hash[prec_n['spectrumRef'].to_i]
      end
      prec[3] = prec_n['msLevel'].to_i
      charges = []
      prec_n.find('descendant::cvParam').each do |cv_param_n|
        case cv_param_n['name']
        when 'MassToChargeRatio'
          prec[0] = cv_param_n['value'].to_f
          # find the prec intensity; only possible when the parent scan is
          # known and its spectrum was actually read (:spectra => true)
          parent = prec[2]
          if !bioworks33 && parent && parent.spectrum
            prec[1] = parent.spectrum.intensity_at_mz(prec[0])
          end
        when 'ChargeState'
          charges << cv_param_n['value'].to_i
        end
      end
      prec[5] = charges
      prec
    end
  end

  # Finds the mzArrayBinary sibling that follows +spec_desc_n+ and decodes it
  # together with the node right after it (the intensity array) into an
  # MS::Spectrum.
  def mzdata_spectrum(spec_desc_n)
    mz_array_bin_n = spec_desc_n.next
    # advance sibling by sibling; the original re-read the same node forever
    until mz_array_bin_n.nil? || mz_array_bin_n.name == 'mzArrayBinary'
      mz_array_bin_n = mz_array_bin_n.next
    end
    raise(RuntimeError, "no mzArrayBinary node found in spectrum") if mz_array_bin_n.nil?
    mz = mzdata_binary_array(mz_array_bin_n.child)
    inten = mzdata_binary_array(mz_array_bin_n.next.child)
    MS::Spectrum.new(mz, inten)
  end

  # Decodes one base64 <data> node; the third argument is false when the
  # node's endian attribute is 'little' and true otherwise.
  def mzdata_binary_array(data_n)
    MS::Spectrum.base64_to_array(data_n.content, data_n['precision'].to_i, data_n['endian'] != 'little')
  end

end
158
+
159
+
160
+
@@ -0,0 +1,7 @@
1
+
2
# mzData parser that obtains its node tree through XML::Document; all of the
# actual parsing is inherited from MS::Parser::MzData::DOM.
class MS::Parser::MzData::LibXML < MS::Parser::MzData::DOM
  # Loads +file+ as an XML::Document and returns that document's root node.
  def get_root_node_from_file(file)
    document = XML::Document.file(file)
    document.root
  end
end
7
+
@@ -0,0 +1,25 @@
1
+ require 'ms/msrun'
2
+
3
+ module MS; end
4
+
5
module MS::Parser::MzData
  # directory (relative to the load path) handed to
  # XMLStyleParser.require_parse_files
  Base_dir_for_parsers = 'ms/parser/mzdata'

  # inherits XMLStyleParser and version
  include MS::Parser
  include XMLStyleParser

  # Returns an instance of the parser class that XMLStyleParser.choose_parser
  # picks for this module (presumably one of the MS::Parser::MzData::* classes
  # such as MS::Parser::MzData::LibXML -- confirm against xml_style_parser).
  # The chosen class is instantiated with the same parse_type and version.
  def self.new(parse_type=:msrun, version='1.05')
    @version = version
    @method = parse_type
    XMLStyleParser.require_parse_files(Base_dir_for_parsers)
    parser_class = XMLStyleParser.choose_parser(self, parse_type)
    parser_class.new(parse_type, version)
  end

end
24
+
25
+
@@ -0,0 +1,11 @@
1
+ require 'ms/parser/mzxml/dom'
2
+
3
# mzXML parser that gets its node tree from ::AXML; the parsing logic itself
# comes from MS::Parser::MzXML::DOM.
class MS::Parser::MzXML::AXML < MS::Parser::MzXML::DOM
  # Returns whatever ::AXML.parse_file yields for the document at +file+.
  def get_root_node_from_file(file)
    ::AXML.parse_file(file)
  end

  # Returns whatever ::AXML.parse yields for the XML held in +string+.
  def get_root_node_from_string(string)
    ::AXML.parse(string)
  end
end
11
+
@@ -0,0 +1,159 @@
1
+ require 'xml_style_parser'
2
+ require 'ms/spectrum'
3
+ require 'ms/scan'
4
+
5
+
6
# DOM-style parser for mzXML files.  Subclasses provide
# get_root_node_from_file and get_root_node_from_string; this class turns the
# resulting node tree into MS::Scan objects on an MS::MSRun.
class MS::Parser::MzXML::DOM
  include XMLStyleParser
  include MS::Parser::MzXML

  #@@scan_atts = %w(num msLevel retentionTime startMz endMz precursors spectrum)

  # parse_type is kept in @method, version in @version
  def initialize(parse_type=:msrun, version='1.0')
    @version = version
    @method = parse_type
  end

  # Builds an MS::Scan from the attributes of a scan node: slots 0-2 always,
  # slots 3-4 (startMz/endMz) only when a startMz attribute is present.
  def new_scan_from_hash(node)
    scan = MS::Scan.new   # array class creates one with 9 positions
    scan[0] = node['num'].to_i
    scan[1] = node['msLevel'].to_i
    # drop the first two characters and the last one before converting
    scan[2] = node['retentionTime'][2...-1].to_f
    start_mz = node['startMz']
    if start_mz
      scan[3] = start_mz.to_f
      scan[4] = node['endMz'].to_f
    end
    scan
  end

  # Turns one scan node into a scan object.  Precursors (slot 5) come from
  # the precursorMz children, and the spectrum (slot 6) from the peaks child
  # unless get_spectra is false.  scans_by_num is used to look up the scan a
  # precursor points at via precursorScanNum.
  def create_scan(scan_n, scans_by_num, get_spectra=true)
    # note that mzXML version 3.0 *can* have more than one peak...
    # I'm not sure how to deal with that since I have one spectrum/scan
    abort 'not supporting version 3.0 just yet' unless @version < '3.0'

    scan = new_scan_from_hash(scan_n)
    precursors = []
    scan_n.each do |child|
      tag = child.name
      if tag == 'precursorMz'
        prec = MS::Precursor.new
        prec[1] = child['precursorIntensity'].to_f
        prec[0] = child.content.to_f
        parent_num = child['precursorScanNum']
        prec[2] = scans_by_num[parent_num.to_i] if parent_num
        precursors << prec
      elsif tag == 'peaks' && get_spectra
        scan[6] = MS::Spectrum.from_base64_peaks(child.content, child['precision'].to_i)
      end
    end
    scan[5] = precursors
    scan
  end


  # returns an array of msrun objects (not implemented)
  def msruns(file)
    raise NotImplementedError
  end

  # Reads +file+ and returns its text with doubled </scan> </scan> tags
  # collapsed into one and a missing </scan> inserted between </peaks> and
  # the following <scan.  Windows line endings are used since these files are
  # generated off a windows machine only.
  def fix_bad_scan_tags(file)
    text = IO.read(file)
    deduped = text.gsub(/<\/scan>\s+<\/scan>/m, '</scan>')
    deduped.gsub(/<\/peaks>\s+<scan/m, "</peaks>\r\n </scan>\r\n <scan")
  end

  # right now cannot parse multiple runs out of an mzXML version 2 file since
  # this is built around a single run per file
  # OPTIONS:
  #     :msrun => MSRun          # use this object instead of creating one
  #     :spectra => *true|false  # if false don't get spectra
  def msrun(file, opts={})
    opts[:spectra] = true unless opts.key?(:spectra)
    msrun_obj = opts[:msrun] || MS::MSRun.new

    root =
      if @version == '2.0'
        get_root_node_from_string(fix_bad_scan_tags(file))
      else
        get_root_node_from_file(file)
      end

    # right now we are only finding the first msRun (probably a rare case of
    # multiple runs in an mzXML file...)
    msrun_n = root
    if @version >= '2.0'
      run_nodes = root.children.select {|v| v.name == 'msRun' }
      raise(NotImplementedError, "one msrun per doc right now" ) if run_nodes.size > 1
      msrun_n = run_nodes.first
    end
    raise RuntimeError, "extra node slipped in somehow" unless msrun_n.name == 'msRun'

    ## HEADER
    scan_count = msrun_n['scanCount'].to_i
    msrun_obj.scan_count = scan_count

    ## SPECTRUM
    scans_by_num = Array.new(scan_count + 1)
    scans = Array.new(scan_count)
    get_spectra = opts[:spectra]

    scn_index = 0
    msrun_n.each do |scan_n|
      next unless scan_n.name == 'scan'
      scan = create_scan(scan_n, scans_by_num, get_spectra)
      scans[scn_index] = scan
      scans_by_num[scan[0]] = scan
      scn_index += 1
    end

    ## update the scan's parents
    MS::MSRun.add_parent_scan(scans)

    # note that startTime and endTime are optional AND in >2.2 are dateTime
    # instead of duration types!, so we will just use scan times...
    # Also, note that startTime and endTime are BROKEN on readw -> mzXML 2.0
    # export.  They give the start and end time in seconds, but they are
    # really minutes.  All the more reason to use the first and last scans!
    msrun_obj.start_time = scans.first.time
    msrun_obj.end_time = scans.last.time

    msrun_obj.scans = scans
  end

end
157
+
158
+
159
+
@@ -0,0 +1,253 @@
1
+
2
+ require 'xml_style_parser'
3
+ require 'ms/spectrum'
4
+ require 'ms/scan'
5
+
6
+
7
# mzXML parser built on ::Hpricot.XML.
#
# NOTE(review): this parser looks unfinished.  It calls #content on element
# nodes and #name on every child yielded by each_child; confirm both against
# the Hpricot API before relying on it.
class MS::Parser::MzXML::Hpricot
  include XMLStyleParser
  include MS::Parser::MzXML

  @@scan_atts = %w(num msLevel retentionTime startMz endMz precursors spectrum)

  # parse_type is stored in @method, version in @version
  def initialize(parse_type=:msrun, version='1.0')
    @method = parse_type
    @version = version
  end

  # Builds an MS::Scan from the attributes of a scan node: slots 0-2 always,
  # slots 3-4 (startMz/endMz) only when a startMz attribute is present.
  def new_scan_from_hash(node)
    scan = MS::Scan.new   # array class creates one with 9 positions
    scan[0] = node['num'].to_i
    scan[1] = node['msLevel'].to_i
    # drop the first two characters and the last one before converting
    scan[2] = node['retentionTime'][2...-1].to_f
    if x = node['startMz']
      scan[3] = x.to_f
      scan[4] = node['endMz'].to_f
    end
    scan
  end

  # takes a scan node and creates a scan object
  # precursors (slot 5) come from precursorMz children; the spectrum (slot 6)
  # comes from the peaks child unless get_spectra is false
  def create_scan(scan_n, scans_by_num, get_spectra=true)
    if @version < '3.0'
      scan = new_scan_from_hash(scan_n)
      precs = []
      scan_n.each_child do |node|
        # skip children that are not named nodes
        next unless node.respond_to?(:name)
        case node.name
        when 'precursorMz'
          prec = MS::Precursor.new
          prec[1] = node['precursorIntensity'].to_f
          prec[0] = node.content.to_f
          if x = node['precursorScanNum']
            prec[2] = scans_by_num[x.to_i]
          end
          precs << prec
        when 'peaks'
          next unless get_spectra
          scan[6] = MS::Spectrum.from_base64_peaks(node.content, node['precision'].to_i)
        end
      end
      scan[5] = precs
      scan
    else # for version > 3.0
      abort 'not supporting version 3.0 just yet'
      # note that mzXML version 3.0 *can* have more than one peak...
      # I'm not sure how to deal with that since I have one spectrum/scan
    end
  end


  # returns an array of msrun objects (not implemented)
  def msruns(file)
    raise NotImplementedError
  end

  # returns a string with double </scan></scan> tags into single and missing
  # </scan> tags after peaks added in
  # we do this in windows style since these are generated off a windows
  # machine only
  def fix_bad_scan_tags(file)
    IO.read(file).gsub(/<\/scan>\s+<\/scan>/m, '</scan>').gsub(/<\/peaks>\s+<scan/m, "</peaks>\r\n </scan>\r\n <scan")
  end

  # right now cannot parse multiple runs out of an mzXML version 2 file since
  # this is built around a single run per file
  # OPTIONS:
  #     :msrun => MSRun          # use this object instead of creating one
  #     :spectra => *true|false  # if false don't get spectra
  def msrun(file, opts={})
    unless opts.key?(:spectra)
      opts[:spectra] = true
    end

    msrun_obj = opts[:msrun] || MS::MSRun.new

    # fix_bad_scan_tags is not applied for version 2.0 here (the original
    # author noted it "may not be necessary in hpricot")
    doc = File.open(file) {|fh| ::Hpricot.XML(fh) }
    msrun_n = doc.at('msRun')

    ## HEADER
    scan_count = msrun_n['scanCount'].to_i
    msrun_obj.scan_count = scan_count
    scans_by_num = Array.new(scan_count + 1)

    ## SPECTRUM
    scans = Array.new( scan_count )
    scn_index = 0
    get_spectra = opts[:spectra]

    # The leftover debugging pair `p scan_n` / `abort` that used to open this
    # block terminated the process on the first child; it has been removed.
    msrun_n.each_child do |scan_n|
      next unless scan_n.respond_to?(:name) && scan_n.name == 'scan'
      scan = create_scan(scan_n, scans_by_num, get_spectra)
      scans[scn_index] = scan
      scans_by_num[scan[0]] = scan
      scn_index += 1
    end


    ## update the scan's parents
    MS::MSRun.add_parent_scan(scans)

    # note that startTime and endTime are optional AND in >2.2 are dateTime
    # instead of duration types!, so we will just use scan times...
    # Also, note that startTime and endTime are BROKEN on readw -> mzXML 2.0
    # export.  They give the start and end time in seconds, but they are
    # really minutes.  All the more reason to use the first and last scans!
    msrun_obj.start_time = scans.first.time
    msrun_obj.end_time = scans.last.time

    msrun_obj.scans = scans
  end

end
149
+
150
+
151
+
152
+ =begin
153
+ ## THIS IS THE SAX PARSER VERSION. IT NEEDS A BIT OF BRUSH UP AND IT WOULD
154
+ ## WORK. I THINK THE default guy is probably faster
155
+
156
+ def msrun(file, msrun_obj)
157
+ # Figure out where the first scan is at in the file:
158
+ pos_after_first_scan = nil
159
+ File.open(file) do |fh|
160
+ fh.each do |line|
161
+ if line =~ /<scan/
162
+ pos_after_first_scan = fh.pos
163
+ end
164
+ end
165
+ end
166
+
167
+ # Get only the header:
168
+ header_string = IO.read(file, pos_after_first_scan)
169
+
170
+ @msrun_obj = msrun_obj
171
+ # Parse out the header info:
172
+ parser = XML::SaxParser.new
173
+ parser.string = header_string
174
+ parser.on_start_element do |name, attrs|
175
+ if name == 'msRun'
176
+ @msrun_obj.scan_count = attrs['scanCount'].to_i
177
+ @msrun_obj.start_time = attrs['startTime'][2...-1].to_f
178
+ @msrun_obj.end_time = attrs['endTime'][2...-1].to_f
179
+ end
180
+ end
181
+ parser.parse
182
+
183
+
184
+ # Parse the scans out:
185
+ scan_st = 'scan'
186
+ prec_st = 'precursorMz'
187
+ peaks_st = 'peaks'
188
+ prec_inten_st = 'precursorIntensity'
189
+ precision_st = 'precision'
190
+
191
+ #parser = MS::Parser::MzXML::Hpricot::SaxParser::MSRun.new
192
+ parser = XML::SaxParser.new
193
+ parser.filename = file
194
+ parser.on_start_document do
195
+ @scans = []
196
+ @current_scan = nil
197
+ @get_peaks = false
198
+ @get_prec_mz = false
199
+ end
200
+
201
+ parser.on_characters do |chars|
202
+ if @get_peaks
203
+ @get_peaks << chars
204
+ elsif @get_prec_mz
205
+ @get_prec_mz << chars
206
+ end
207
+ end
208
+
209
+ parser.on_end_element do |el|
210
+ case el
211
+ when 'peaks'
212
+ @current_scan.spectrum = Spectrum.from_base64_peaks(@get_peaks, @precision, true)
213
+ @get_peaks = false
214
+ when 'precursorMz'
215
+ @current_scan[5] = [Precursor.new([@get_prec_mz.to_f])]
216
+ @get_prec_mz = false
217
+ end
218
+ end
219
+
220
+ parser.on_start_element do |name, attr_hash|
221
+ case name
222
+ when scan_st
223
+ @current_scan = new_scan_from_hash(attr_hash)
224
+ sz = @scans.size
225
+ @scans << @current_scan
226
+ when prec_st
227
+ @current_scan[5].first[1] = attr_hash[prec_inten_st].to_f
228
+ @get_prec_mz = ''
229
+ when peaks_st
230
+ @precision = attr_hash[precision_st].to_i
231
+ case @version[0,1].to_ip
232
+ when 3
233
+ if ch['pairOrder'] != 'm/z-int' # only version 3.0 has others
234
+ abort "cannot yet read anything but 'm/z-int' pair order"
235
+ end
236
+ end
237
+ @get_peaks = ''
238
+ end
239
+ end
240
+ parser.parse
241
+
242
+ @msrun_obj.scans = @scans
243
+ @msrun_obj.scans.each_with_index do |sc,i|
244
+ if sc.spectrum.mz == nil
245
+ abort "INDEX: #{i}"
246
+ end
247
+ end
248
+ @msrun_obj
249
+ end
250
+ =end
251
+
252
+
253
+
@@ -0,0 +1,15 @@
1
+
2
+ require 'ms/parser/mzxml/dom'
3
+
4
# mzXML parser that reads documents through XML::Parser / XML::Document and
# supplies the two root-node readers that MS::Parser::MzXML::DOM#msrun calls.
class MS::Parser::MzXML::LibXML < MS::Parser::MzXML::DOM
  # Parses the XML text in +string+ and returns the document's root node.
  # (DOM#msrun takes this path for version '2.0' files, after repairing the
  # scan tags.)
  def get_root_node_from_string(string)
    XML::Parser.string(string).parse.root
  end
  # The method was originally misspelled; keep the old name working.
  alias_method :goot_root_node_from_string, :get_root_node_from_string

  # Loads +file+ as an XML::Document and returns its root node.
  def get_root_node_from_file(file)
    XML::Document.file(file).root
  end
end
12
+
13
+
14
+
15
+