mspire 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233) hide show
  1. data/INSTALL +1 -0
  2. data/README +25 -0
  3. data/Rakefile +129 -40
  4. data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
  5. data/bin/bioworks_to_pepxml.rb +1 -0
  6. data/bin/fasta_shaker.rb +1 -96
  7. data/bin/filter_and_validate.rb +5 -0
  8. data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
  9. data/bin/prob_validate.rb +6 -0
  10. data/bin/raw_to_mzXML.rb +2 -2
  11. data/bin/srf_group.rb +1 -0
  12. data/bin/srf_to_sqt.rb +40 -0
  13. data/changelog.txt +68 -0
  14. data/lib/align/chams.rb +6 -6
  15. data/lib/align.rb +4 -3
  16. data/lib/bsearch.rb +120 -0
  17. data/lib/fasta.rb +318 -86
  18. data/lib/group_by.rb +10 -0
  19. data/lib/index_by.rb +11 -0
  20. data/lib/merge_deep.rb +21 -0
  21. data/lib/{spec → ms/converter}/mzxml.rb +77 -109
  22. data/lib/ms/gradient_program.rb +171 -0
  23. data/lib/ms/msrun.rb +209 -0
  24. data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
  25. data/lib/ms/parser/mzdata/axml.rb +12 -0
  26. data/lib/ms/parser/mzdata/dom.rb +160 -0
  27. data/lib/ms/parser/mzdata/libxml.rb +7 -0
  28. data/lib/ms/parser/mzdata.rb +25 -0
  29. data/lib/ms/parser/mzxml/axml.rb +11 -0
  30. data/lib/ms/parser/mzxml/dom.rb +159 -0
  31. data/lib/ms/parser/mzxml/hpricot.rb +253 -0
  32. data/lib/ms/parser/mzxml/libxml.rb +15 -0
  33. data/lib/ms/parser/mzxml/regexp.rb +122 -0
  34. data/lib/ms/parser/mzxml/rexml.rb +72 -0
  35. data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
  36. data/lib/ms/parser/mzxml.rb +175 -0
  37. data/lib/ms/parser.rb +108 -0
  38. data/lib/ms/precursor.rb +10 -0
  39. data/lib/ms/scan.rb +81 -0
  40. data/lib/ms/spectrum.rb +193 -0
  41. data/lib/ms.rb +10 -0
  42. data/lib/mspire.rb +4 -0
  43. data/lib/roc.rb +61 -1
  44. data/lib/sample_enzyme.rb +31 -8
  45. data/lib/scan_i.rb +21 -0
  46. data/lib/spec_id/aa_freqs.rb +7 -3
  47. data/lib/spec_id/bioworks.rb +20 -14
  48. data/lib/spec_id/digestor.rb +139 -0
  49. data/lib/spec_id/mass.rb +116 -0
  50. data/lib/spec_id/parser/proph.rb +236 -0
  51. data/lib/spec_id/precision/filter/cmdline.rb +209 -0
  52. data/lib/spec_id/precision/filter/interactive.rb +134 -0
  53. data/lib/spec_id/precision/filter/output.rb +147 -0
  54. data/lib/spec_id/precision/filter.rb +623 -0
  55. data/lib/spec_id/precision/output.rb +60 -0
  56. data/lib/spec_id/precision/prob/cmdline.rb +139 -0
  57. data/lib/spec_id/precision/prob/output.rb +88 -0
  58. data/lib/spec_id/precision/prob.rb +171 -0
  59. data/lib/spec_id/proph/pep_summary.rb +92 -0
  60. data/lib/spec_id/proph/prot_summary.rb +484 -0
  61. data/lib/spec_id/proph.rb +2 -466
  62. data/lib/spec_id/protein_summary.rb +2 -2
  63. data/lib/spec_id/sequest/params.rb +316 -0
  64. data/lib/spec_id/sequest/pepxml.rb +1513 -0
  65. data/lib/spec_id/sequest.rb +2 -1672
  66. data/lib/spec_id/srf.rb +445 -177
  67. data/lib/spec_id.rb +183 -95
  68. data/lib/spec_id_xml.rb +8 -10
  69. data/lib/transmem/phobius.rb +147 -0
  70. data/lib/transmem/toppred.rb +368 -0
  71. data/lib/transmem.rb +157 -0
  72. data/lib/validator/aa.rb +135 -0
  73. data/lib/validator/background.rb +73 -0
  74. data/lib/validator/bias.rb +95 -0
  75. data/lib/validator/cmdline.rb +260 -0
  76. data/lib/validator/decoy.rb +94 -0
  77. data/lib/validator/digestion_based.rb +69 -0
  78. data/lib/validator/probability.rb +48 -0
  79. data/lib/validator/prot_from_pep.rb +234 -0
  80. data/lib/validator/transmem.rb +272 -0
  81. data/lib/validator/true_pos.rb +46 -0
  82. data/lib/validator.rb +214 -0
  83. data/lib/xml.rb +38 -0
  84. data/lib/xml_style_parser.rb +105 -0
  85. data/lib/xmlparser_wrapper.rb +19 -0
  86. data/script/compile_and_plot_smriti_final.rb +97 -0
  87. data/script/extract_gradient_programs.rb +56 -0
  88. data/script/get_apex_values_rexml.rb +44 -0
  89. data/script/mzXML2timeIndex.rb +1 -1
  90. data/script/smriti_final_analysis.rb +103 -0
  91. data/script/toppred_to_yaml.rb +47 -0
  92. data/script/tpp_installer.rb +1 -1
  93. data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
  94. data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
  95. data/specs/bin/fasta_shaker_spec.rb +259 -0
  96. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
  97. data/specs/bin/filter_and_validate_spec.rb +124 -0
  98. data/specs/bin/ms_to_lmat_spec.rb +34 -0
  99. data/specs/bin/prob_validate_spec.rb +62 -0
  100. data/specs/bin/protein_summary_spec.rb +10 -0
  101. data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
  102. data/specs/gi_spec.rb +22 -0
  103. data/specs/load_bin_path.rb +7 -0
  104. data/specs/merge_deep_spec.rb +13 -0
  105. data/specs/ms/gradient_program_spec.rb +77 -0
  106. data/specs/ms/msrun_spec.rb +455 -0
  107. data/specs/ms/parser_spec.rb +92 -0
  108. data/specs/ms/spectrum_spec.rb +89 -0
  109. data/specs/roc_spec.rb +251 -0
  110. data/specs/rspec_autotest.rb +149 -0
  111. data/specs/sample_enzyme_spec.rb +41 -0
  112. data/specs/spec_helper.rb +133 -0
  113. data/specs/spec_id/aa_freqs_spec.rb +52 -0
  114. data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
  115. data/specs/spec_id/digestor_spec.rb +75 -0
  116. data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
  117. data/specs/spec_id/precision/filter/output_spec.rb +31 -0
  118. data/specs/spec_id/precision/filter_spec.rb +243 -0
  119. data/specs/spec_id/precision/prob_spec.rb +111 -0
  120. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  121. data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
  122. data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
  123. data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
  124. data/specs/spec_id/sequest/params_spec.rb +68 -0
  125. data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
  126. data/specs/spec_id/sqt_spec.rb +138 -0
  127. data/specs/spec_id/srf_spec.rb +209 -0
  128. data/specs/spec_id/srf_spec_helper.rb +302 -0
  129. data/specs/spec_id_helper.rb +33 -0
  130. data/specs/spec_id_spec.rb +361 -0
  131. data/specs/spec_id_xml_spec.rb +33 -0
  132. data/specs/transmem/phobius_spec.rb +423 -0
  133. data/specs/transmem/toppred_spec.rb +297 -0
  134. data/specs/transmem_spec.rb +60 -0
  135. data/specs/transmem_spec_shared.rb +64 -0
  136. data/specs/validator/aa_spec.rb +107 -0
  137. data/specs/validator/background_spec.rb +51 -0
  138. data/specs/validator/bias_spec.rb +146 -0
  139. data/specs/validator/decoy_spec.rb +51 -0
  140. data/specs/validator/fasta_helper.rb +26 -0
  141. data/specs/validator/prot_from_pep_spec.rb +141 -0
  142. data/specs/validator/transmem_spec.rb +145 -0
  143. data/specs/validator/true_pos_spec.rb +58 -0
  144. data/specs/validator_helper.rb +33 -0
  145. data/specs/xml_spec.rb +12 -0
  146. data/test_files/000_pepxml18_small.xml +206 -0
  147. data/test_files/020a.mzXML.timeIndex +4710 -0
  148. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
  149. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
  150. data/test_files/4-03-03_small-prot.xml +321 -0
  151. data/test_files/4-03-03_small.xml +3876 -0
  152. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  153. data/test_files/bioworks-3.3_10prots.xml +5999 -0
  154. data/test_files/bioworks31.params +77 -0
  155. data/test_files/bioworks32.params +62 -0
  156. data/test_files/bioworks33.params +63 -0
  157. data/test_files/bioworks_single_run_small.xml +7237 -0
  158. data/test_files/bioworks_small.fasta +212 -0
  159. data/test_files/bioworks_small.params +63 -0
  160. data/test_files/bioworks_small.phobius +109 -0
  161. data/test_files/bioworks_small.toppred.out +2847 -0
  162. data/test_files/bioworks_small.xml +5610 -0
  163. data/test_files/bioworks_with_INV_small.xml +3753 -0
  164. data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
  165. data/test_files/corrupted_900.srf +0 -0
  166. data/test_files/head_of_7MIX.srf +0 -0
  167. data/test_files/interact-opd1_mods_small-prot.xml +304 -0
  168. data/test_files/messups.fasta +297 -0
  169. data/test_files/opd1/000.my_answer.100lines.xml +101 -0
  170. data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
  171. data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
  172. data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
  173. data/test_files/opd1/000_020-prot.png +0 -0
  174. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
  175. data/test_files/opd1/000_020_3prots-prot.xml +62 -0
  176. data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
  177. data/test_files/opd1/sequest.3.1.params +77 -0
  178. data/test_files/opd1/sequest.3.2.params +62 -0
  179. data/test_files/opd1/twenty_scans.mzXML +418 -0
  180. data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
  181. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  182. data/test_files/opd1/twenty_scans_answ.lmata +9 -0
  183. data/test_files/opd1_020_beginning.RAW +0 -0
  184. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
  185. data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
  186. data/test_files/pepproph_small.xml +4691 -0
  187. data/test_files/phobius.small.noheader.txt +50 -0
  188. data/test_files/phobius.small.small.txt +53 -0
  189. data/test_files/s01_anC1_ld020mM.key.txt +25 -0
  190. data/test_files/s01_anC1_ld020mM.meth +0 -0
  191. data/test_files/small.fasta +297 -0
  192. data/test_files/smallraw.RAW +0 -0
  193. data/test_files/tf_bioworks2excel.bioXML +14340 -0
  194. data/test_files/tf_bioworks2excel.txt.actual +1035 -0
  195. data/test_files/toppred.small.out +416 -0
  196. data/test_files/toppred.xml.out +318 -0
  197. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
  198. data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
  199. data/test_files/yeast_gly_small-prot.xml +265 -0
  200. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
  201. data/test_files/yeast_gly_small.xml +3807 -0
  202. data/test_files/yeast_gly_small2.parentTimes +6 -0
  203. metadata +273 -57
  204. data/bin/filter.rb +0 -6
  205. data/bin/precision.rb +0 -5
  206. data/lib/spec/mzdata/parser.rb +0 -108
  207. data/lib/spec/mzdata.rb +0 -48
  208. data/lib/spec/mzxml/parser.rb +0 -449
  209. data/lib/spec/scan.rb +0 -55
  210. data/lib/spec_id/filter.rb +0 -797
  211. data/lib/spec_id/precision.rb +0 -421
  212. data/lib/toppred.rb +0 -18
  213. data/script/filter-peps.rb +0 -164
  214. data/test/tc_aa_freqs.rb +0 -59
  215. data/test/tc_fasta_shaker.rb +0 -149
  216. data/test/tc_filter.rb +0 -203
  217. data/test/tc_filter_peps.rb +0 -46
  218. data/test/tc_gi.rb +0 -17
  219. data/test/tc_id_class_anal.rb +0 -70
  220. data/test/tc_id_precision.rb +0 -89
  221. data/test/tc_msrun.rb +0 -88
  222. data/test/tc_mzxml.rb +0 -88
  223. data/test/tc_mzxml_to_lmat.rb +0 -36
  224. data/test/tc_peptide_parent_times.rb +0 -27
  225. data/test/tc_precision.rb +0 -60
  226. data/test/tc_roc.rb +0 -166
  227. data/test/tc_sample_enzyme.rb +0 -32
  228. data/test/tc_scan.rb +0 -26
  229. data/test/tc_sequest.rb +0 -336
  230. data/test/tc_spec.rb +0 -78
  231. data/test/tc_spec_id.rb +0 -201
  232. data/test/tc_spec_id_xml.rb +0 -36
  233. data/test/tc_srf.rb +0 -262
@@ -0,0 +1,122 @@
1
+ require 'strscan'
2
+
3
+ module MS::Parser::MzXML ; end
4
+
5
# Regular-expression based mzXML reader.
#
# No XML tree is built: the class-level helpers read the whole file into
# memory and pull values out of each <scan>...</scan> element with regexps.
class MS::Parser::MzXML::Regexp
  @@first_scan_regexp = /<scan /o
  include MS::Parser::MzXML

  # method::  name of the instance method that #parse dispatches to
  # version:: version string; stored but not consulted inside this class
  def initialize(method=:msrun, version='1.0')
    @method = method
    @version = version
  end

  # Calls the method chosen at construction time, passing it +file+.
  def parse(file)
    send(@method, file)
  end

  # Meant to return a MS::MsRun object; for now it only runs the header step
  # and returns nil.
  # NOTE(review): get_header is not defined anywhere in this class -- confirm
  # that an included module supplies it before relying on this method.
  def msrun(file)
    # Block form closes the handle even when get_header raises.
    File.open(file) do |fh|
      get_header(fh)
    end
    nil
  end

  #def msrun(file, opts={})
  #end

  # Matches one whole <scan> element: capture 1 is the scan number, capture 2
  # is everything between the num attribute and the closing </scan>.
  @@scan_re = /<scan.*?num="(\d+)"(.*?)<\/scan>/mo

  # Returns an array where array[scan_num] = [precursorMz, precursorIntensity]
  # for scans carrying msLevel="2". Values are strings.
  # (array will likely start at 1!)
  def self.precursor_mz_and_intensity_by_scan(file)
    prec_re = /msLevel="2".*?<precursorMz precursorIntensity="([\d\.]+)".*?>([\d\.]+)<\/precursorMz>/mo
    # captures are [intensity, mz]; reverse to get [mz, intensity]
    self.by_scan_num(file, prec_re) {|match_obj| match_obj.captures.reverse}
  end

  # Reads +file+, and for every <scan> element whose body matches +regex+
  # stores the block's result (the block receives the MatchData) at the index
  # given by the scan's num attribute. Scans that do not match leave a nil.
  # (array will likely start at 1!)
  def self.by_scan_num(file, regex)
    arr = []
    File.read(file).scan(@@scan_re).each do |scan_num, scan_body|
      inner_match = regex.match(scan_body)
      next unless inner_match
      arr[scan_num.to_i] = yield(inner_match)
    end
    arr
  end

  # Returns array where array[scan_num] = precursorMz
  # Parent scans are not arrayed
  # Values are strings.  Array index likely starts at 1!
  # @TODO: replace the use of a yield block
  def self.precursor_mz_by_scan(file)
    prec_re = /msLevel="2".*?<precursorMz.*?>([\d\.]+)<\/precursorMz>/mo
    self.by_scan_num(file, prec_re) {|match_obj| match_obj.captures[0]}
  end

end
61
+
62
+
63
# Line-oriented, regexp-based extractor for the msRun header of an mzXML
# stream. Scan extraction is unfinished: #parse aborts right after reading the
# header.
class MS::Parser::MzXML::Regexp::MsRun
  @@scan_count_regexp = /scanCount="(\d+)"/o
  @@start_time_regexp = /startTime="PT([\d\.]+)S"/o
  @@end_time_regexp = /endTime="PT([\d\.]+)S"/o
  @@first_scan_regexp = /<scan /

  # version:: version string; stored but not consulted inside this class
  def initialize(version='1.0')
    @version = version
  end

  # Reads the header values from +io+ and is meant to copy them onto
  # +msrun_object+.  Currently aborts the process before touching the scans.
  def parse(io, msrun_object)
    atts = {}
    # zip yields [name, value] pairs, so the block must take (key, value)
    [:scan_count, :start_time, :end_time].zip(get_header_info(io)) {|k,v| atts[k] = v }
    ###
    # HERE <------------------------------------
    abort "NEED TO FINISH WRITING SCANS EXTRACTOR!"
    get_scans(io)
    # HERE <------------------------------------

    # set the attributes
    # NOTE(review): assumes msrun_object has scan_count= / start_time= /
    # end_time= writers -- confirm against MS::MsRun.
    atts.each do |k,v|
      msrun_object.send("#{k}=", v)
    end
    # need to fill in the scan_counts array
  end

  # assumes the attributes are each on a line
  # (placeholder: walks every remaining line of +io+ and does nothing with it)
  def get_scans(io)
    io.each do |line|
    end
  end

  # returns [total_num_scans, start_time, end_time] and positions the handle
  # so that the next 'gets' will call a scan
  #
  # All three values are strings (nil when the attribute was not seen before
  # the first line containing "<scan ").
  def get_header_info(io)
    scan_count = nil
    start_time = nil
    end_time = nil

    # Start from the current offset so that rewinding also works when the
    # very first line read already contains "<scan ".
    previous_position = io.pos
    io.each do |line|
      scan_count = $1.dup if line =~ @@scan_count_regexp
      start_time = $1.dup if line =~ @@start_time_regexp
      end_time = $1.dup if line =~ @@end_time_regexp
      if line =~ @@first_scan_regexp
        # rewind to the start of this line so the caller re-reads the scan
        io.pos = previous_position
        break
      end
      previous_position = io.pos
    end
    [scan_count, start_time, end_time]
  end

end
@@ -0,0 +1,72 @@
1
+ require 'rexml/document'
2
+ require 'rexml/streamlistener'
3
+
4
+ module MS::Parser::MzXML::REXMLStreamListener; end
5
+ class MS::Parser::MzXML::REXMLStreamListener::PrecMzByNum; end
6
+
7
# Helper for running a REXML stream listener over a file.
#
# #parse_and_report looks the listener class up with const_get on the
# receiver, so the receiver must be a Module/Class that holds the listener
# constant.
module REXMLStreamListenerHelper
  # file::          path of the XML file to stream
  # const::         name of the listener class, resolved via const_get on self
  # report_method:: method called on the listener once parsing is finished;
  #                 its return value is returned from here
  def parse_and_report(file, const, report_method=:report)
    handler = self.const_get(const).new
    File.open(file) { |io| REXML::Document.parse_stream(io, handler) }
    handler.send(report_method)
  end
end
16
+
17
# REXML-backed mzXML parser front end.
class MS::Parser::MzXML::REXML
  include MS::Parser::MzXML

  # version:: version string (stored only)
  # method::  name of the parse method to use (stored only)
  #
  # NOTE(review): the argument order here is (version, method), while
  # MS::Parser::MzXML.new calls parser_class.new(parse_type, version) and the
  # XMLParser/Regexp front ends take (parse_type, version) -- confirm which
  # order is intended before routing this class through MzXML.new.
  def initialize(version='1.0', method=:msrun)
    @version = version
    # was `parse_type`, which is not a parameter or local of this method
    @method = method
  end

  # returns an array indexed by scan_num that gives the precursor_mz
  # NOTE(review): parse_and_report is not defined in this class, and
  # REXMLStreamListenerHelper is not mixed in here -- confirm where it is
  # expected to come from.
  def precursor_mz_by_scan(file, opts={})
    parse_and_report(file, PrecMzByNum)
  end

end
31
+
32
+
33
+
34
+
35
+ # for REXML
36
# REXML stream listener that records the text of each <precursorMz> element
# at the index given by the num attribute of the most recently opened <scan>.
class MS::Parser::MzXML::REXML::PrecMzByNum
  include REXML::StreamListener

  # prec_mz[scan_num] = precursorMz text (a String); also exposed as #report
  attr_accessor :prec_mz
  alias_method :report, :prec_mz

  def initialize
    @get_data = false   # true only while inside a <precursorMz> element
    @scan_num = nil     # num of the most recently opened <scan>
    @prec_mz = []
  end

  # Remembers the scan number on <scan>; arms text capture on <precursorMz>.
  def tag_start(name,attrs)
    case name
    when "scan"
      @scan_num = attrs["num"].to_i
    when "precursorMz"
      @get_data = true
    end
  end

  # Disarms text capture when </precursorMz> is reached.
  def tag_end(name)
    @get_data = false if name == "precursorMz"
  end

  # Stores the text for the current scan while capture is armed.
  def text(txt)
    @prec_mz[@scan_num] = txt if @get_data
  end

end
69
+
70
+
71
+
72
+
@@ -0,0 +1,248 @@
1
+ require 'xmlparser_wrapper'
2
+
3
+ # this is the wrapper class
4
# Front end for the xmlparser-based mzXML handlers. The public methods hand
# a file (or an open IO) plus a handler name to parse_and_report /
# parse_and_report_io, neither of which is defined in this class.
class MS::Parser::MzXML::XMLParser
  include XMLStyleParser
  include MS::Parser::MzXML
  include XMLParserWrapper

  # parse_type:: name of the parse method to use (stored in @method)
  # version::    version string (stored only)
  def initialize(parse_type=:msrun, version='1.0')
    @version = version
    @method = parse_type
  end

  # returns: [times_arr, [m/z,inten,m/z,inten...]]
  # where times are time strings (in seconds)
  def times_and_spectra(file, opts={})
    parse_and_report(file, 'TimesAndSpectra')
  end


  ## IN PROGRESS ...
  # opts is actually the msrun object that will be fleshed out in the parsing
  #
  # Debug scaffolding: prints opts and the header reply, then aborts the
  # process, so the close below is never reached.
  def msrun(file, opts={})
    p opts
    io = File.open(file)
    header_reply = parse_and_report_io(io, 'MsRunHeader')
    p header_reply
    abort
    io.close
  end

  # Not implemented yet; returns nil.
  def prec_mz_by_scan_num(file, opts={})
  end

  # could easily do this for all these guys
  #def method_missing(*args)
  #  method = args.shift
  #  parse_and_report(
  #end

end
42
+
43
# Handler that captures the scanCount, startTime and endTime attributes of
# the <msRun> element and stops once </dataProcessing> is reached.
class MS::Parser::MzXML::XMLParser::MsRunHeader < XMLParser
  # version:: version string (stored only)
  def initialize(version='1.0')
    @version = version
    @atts = []
  end

  # On <msRun>, stores [scanCount, startTime, endTime] (in that order) in @atts.
  def startElement(name,attrs)
    case name
    when 'msRun'
      # values_at takes the keys as separate arguments; passing the array
      # itself would look up the array as a single key and yield [nil]
      @atts = attrs.values_at(*%w(scanCount startTime endTime))
    end
  end

  # Calls done and reset (both come from outside this class) once the
  # dataProcessing element closes.
  def endElement(name)
    if name == 'dataProcessing'
      done
      reset
    end
  end
end
63
+
64
# Handler that builds a scan record for every <scan> element and collects the
# raw text of each <peaks> element.
class MS::Parser::MzXML::XMLParser::Spectrum < XMLParser
  @@scan_atts = %w(num msLevel retentionTime startMz endMz)
  @@precursor_mz_atts = %w(precursorIntensity)


  # version:: version string (stored only)
  def initialize(version='1.0')
    @version = version
    @spectrum = []
    @current_scan = nil
  end

  # Returns the array of collected spectra.
  def report
    @spectrum
  end

  def startElement(name,attrs)
    if name == 'scan'
      # values_at needs the keys splatted; handing it the array itself would
      # return [nil] and the retentionTime slice below would fail on nil
      vals = attrs.values_at(*@@scan_atts)
      vals[2] = vals[2][2...-1].to_f   #remove PT and trailing S
      [0, 1].each do |i| vals[i] = vals[i].to_i end  # num and ms_level
      [3, 4].each do |i| vals[i] = vals[i].to_f end  # start_mz and end_mz
      @current_scan = MS::Scan.new(vals)
    elsif name == 'precursorMz'
      # 5, 6, 7 are the scans indices for prec_mz prec_inten and parent
      @current_scan[6] = attrs['precursorIntensity'].to_f
      # start with an empty buffer; #character appends the element text to it
      @current_scan[5] = ''
      @get_precursor_mz = true
    elsif name == 'peaks'
      @precision = attrs['precision'].to_i
      @get_peaks = true
      @current_peaks_string = ''
    end
  end

  def endElement(name)
    if name == 'peaks'
      @get_peaks = false
      # NOTE(review): inside this class `Spectrum` does not obviously resolve
      # to MS::Spectrum -- confirm which class is meant here.
      @spectrum << Spectrum.new(@current_peaks_string, @precision)
      # NOTE(review): @spectrum is the Array created in initialize, and Array
      # has no context= method; presumably the context was meant for the
      # spectrum just appended -- confirm and fix once the target API is known.
      @spectrum.context = @current_scan
    elsif name == 'precursorMz'
      # the accumulated text becomes the numeric precursor m/z
      @current_scan[5] = @current_scan[5].to_f
      @get_precursor_mz = false
    end
  end

  # Appends character data to whichever buffer is currently being collected.
  def character(data)
    if @get_peaks
      @current_peaks_string << data
    elsif @get_precursor_mz
      @current_scan[5] << data
    end
  end

end
118
+
119
+
120
+
121
+
122
# Handler that records, for every <precursorMz> element, its text converted
# to a Float at the index given by the num attribute of the most recently
# opened <scan>.
class MS::Parser::MzXML::XMLParser::PrecMzByNum < XMLParser
  # prec_mz[scan_num] = precursor m/z (Float); also exposed as #report
  attr_accessor :prec_mz
  alias_method :report, :prec_mz

  def initialize
    @prec_mz = []
    # These were previously assigned in the class body, which sets instance
    # variables on the class object rather than on each handler instance.
    @scan_num = nil
    @get_data = false
  end

  # Remembers the scan number on <scan>; starts a fresh text buffer on
  # <precursorMz>.
  def startElement(name,attrs)
    if name == "scan"
      @scan_num = attrs["num"].to_i
    elsif name == "precursorMz"
      @current_prec_mz = ""
      @get_data = true
    end
  end

  # On </precursorMz>, converts the buffered text and stores it for the scan.
  def endElement(name)
    if name == "precursorMz"
      @get_data = false
      @prec_mz[@scan_num] = @current_prec_mz.to_f
    end
  end

  # Character data may arrive in several pieces, so it is appended.
  def character(data)
    if @get_data
      @current_prec_mz << data
    end
  end

end
156
+
157
+
158
+ =begin
159
+
160
+
161
+ # Returns parallel arrays (times, spectra) where each spectra is an array
162
+ # containing alternating mz and intensity (MS1 scans only)
163
+ # and times are strings with the time in seconds
164
+ class MS::Parser::MzXML::XMLParser::TimesAndSpectra < XMLParser
165
+ include MS::Parser::MzXML
166
+ @@get_data = false
167
+ @@get_peaks = false
168
+ @@precision = 32 # @TODO: set dynamic
169
+
170
+ attr_accessor :times, :spectra
171
+ def times_and_spectra
172
+ [@times, @spectra]
173
+ end
174
+
175
+ alias_method :report, :times_and_spectra
176
+
177
+ def initialize(ms_level=1)
178
+ @ms_level = "#{ms_level}"
179
+ @times = []
180
+ @spectra = []
181
+ end
182
+
183
+ def startElement(name,attrs)
184
+ if name == "scan" && attrs["msLevel"] == @ms_level
185
+ @times << attrs["retentionTime"][2...-1] # strip PT and S: "PTx.xxxxS"
186
+ @@get_peaks = true
187
+ elsif name == "peaks" && @@get_peaks
188
+ @@get_data = true
189
+ @data = ""
190
+ end
191
+ end
192
+
193
+ def character(data)
194
+ if @@get_data
195
+ @data << data
196
+ end
197
+ end
198
+
199
+ def endElement(name)
200
+ if name == "peaks" && @@get_peaks
201
+ @spectra << base64_peaks_to_array(@data, @@precision)
202
+ @@get_data = false
203
+ @@get_peaks = false
204
+ end
205
+ end
206
+
207
+ end
208
+
209
+
210
+ class MS::Parser::MzXML::XMLParser::TimeMzIntenIndexer < XMLParser
211
+
212
+ @@scan_num = nil
213
+ @@get_data = false
214
+
215
+ attr_accessor :scans_by_num
216
+ alias_method :report, :scans_by_num
217
+
218
+ def initialize
219
+ @current_scan = nil
220
+ @scans_by_num = []
221
+ end
222
+
223
+ def startElement(name,attrs)
224
+ if name == "scan"
225
+ num = attrs["num"].to_i
226
+ @current_scan = MS::Scan.new(num, attrs["msLevel"].to_i, attrs["retentionTime"].gsub(/^PT/,'').gsub(/S$/,'').to_f)
227
+ scans_by_num[num] = @current_scan
228
+ elsif name == "precursorMz"
229
+ @current_scan.prec_inten = attrs["precursorIntensity"].to_f
230
+ @@get_data = true
231
+ end
232
+ end
233
+
234
+ def endElement(name)
235
+ if name == "precursorMz"
236
+ @@get_data = false
237
+ end
238
+ end
239
+
240
+ def character(data)
241
+ if @@get_data
242
+ @current_scan.prec_mz = data
243
+ end
244
+ end
245
+
246
+ end
247
+
248
+ =end
@@ -0,0 +1,175 @@
1
+ require 'ms/msrun'
2
+
3
+ module MS; end
4
+
5
# mzXML entry point: MzXML.new picks a concrete parser class, and the instance
# methods below are line/regexp or REXML based helpers for reading basic
# information out of an mzXML file.
module MS::Parser::MzXML
  Base_dir_for_parsers = 'ms/parser/mzxml'
  # inherits XMLStyleParser and version
  include MS::Parser
  include XMLStyleParser

  # returns a specific parser MS::Parser::MzXML::#{ParserType}
  # based on choose_parser from xml_style_parser
  def self.new(parse_type=:msrun, version='1.0')
    @version = version
    @method = parse_type
    XMLStyleParser.require_parse_files(Base_dir_for_parsers)
    parser_class = XMLStyleParser.choose_parser(self, parse_type)
    parser_class.new(parse_type, version)
  end

  # Returns an array of scans indexed by scan number
  # NOTE that the first scan (zero indexed) will likely be nil!
  # accepts an optional parse_type = 'xmlparser' | 'rexml'
  # Raises ArgumentError for any other parse_type.
  def scans_by_num(mzXML_file, parse_type=nil)
    parse_type ||= default_parser
    scans = []
    case parse_type
    when 'xmlparser'
      # NOTE(review): this constant path starts at MS::MzXML rather than
      # MS::Parser::MzXML, and the only TimeMzIntenIndexer in the xmlparser
      # file sits inside a =begin/=end block -- confirm this branch can work.
      parser = MS::MzXML::XMLParser::TimeMzIntenIndexer.new
      parser.parse(IO.read(mzXML_file))
      scans = parser.scans_by_num
    when 'rexml'  # use REXML
      # This is really too slow for files of this size
      # Leading :: is required: inside this module a bare REXML resolves to
      # the MS::Parser::MzXML::REXML parser class once that file is loaded.
      # Block form closes the file as soon as the document has been built.
      doc = File.open(mzXML_file) { |fh| ::REXML::Document.new(fh) }
      doc.elements.each('msRun/scan') do |scan|
        rt = scan.attributes['retentionTime']  ## like PT0.154000S"
        level = scan.attributes['msLevel']
        prec_mz = nil
        prec_int = nil
        if level.to_i != 1
          scan.elements.each("precursorMz") do |prec|
            prec_mz = prec.text.to_f
            prec_int = prec.attributes["precursorIntensity"].to_f
          end
        end
        # remove the leading PT and trailing S on the retention time!
        rt = rt[2...-1]

        num = scan.attributes['num'].to_i
        scans[num] = MS::Scan.new(num, level.to_i, rt.to_f, prec_mz, prec_int)
      end
    else
      # raise, not throw: throw is for catch/throw tags and would surface as
      # an UncaughtThrowError instead of the ArgumentError intended here
      raise ArgumentError, "invalid parse type: #{parse_type}"
    end
    ## update the scans for parents
    MS::Scan.add_parent_scan(scans)
    scans
  end

  # Returns a Hash indexed by filename (with no extension) for a given path
  # extension = glob (string) or regex
  # The basename is given as: file.split('.').first
  #
  # NOTE(review): precursor_mz_by_scan(file, parse_type) is not defined in this
  # module -- confirm that the including class provides a two-argument version.
  def precursor_mz_by_scan_for_path(path, extension, parse_type=nil)
    hash = {}
    Dir.chdir path do
      files =
        if extension.is_a?(String)
          Dir[extension]
        elsif extension.is_a?(::Regexp)
          # ::Regexp, because a bare Regexp inside this module resolves to the
          # MS::Parser::MzXML::Regexp parser class once that file is loaded
          Dir.entries(".").select { |entry| entry =~ extension }
        else
          puts "extension: #{extension} not a String or Regexp!"
          []
        end
      files.each do |file|
        base = file.split('.').first
        hash[base] = precursor_mz_by_scan(file, parse_type)
      end
    end
    hash
  end

  # Returns hash where hash[scan_num] = [precursorMz, precursorIntensity]
  # Parent scans are not hashed
  # Keys and values are both strings
  def precursor_mz_and_inten_by_scan(file)
    # in progress
  end

  # Returns array where array[scan_num] = precursorMz
  # precursorMz are Floats
  # Array index likely starts at 1!
  def precursor_mz_by_scan_num(file)
    ## THIS SHOULD BE CREATED IN specific XML LIBS
  end

  # Returns a hash of basic info on an mzXML run:
  #   *mzXML_elemt*   *hash keys (symbols)*
  #   scanCount       scan_count
  #   startTime       start_time
  #   endTime         end_time
  #   startMz         start_mz
  #   endMz           end_mz
  #
  # hash[:ms_level] is the msLevel of the first scan; start_mz/end_mz are only
  # set when that level is 1.  hash[:scan_count] is an array: index 0 is the
  # scanCount attribute, index n the number of msLevel=n scans seen, cut off
  # at the first zero entry.
  def basic_info(mzxml_file)
    puts "parsing: #{mzxml_file} #{File.exist?(mzxml_file)}" if $VERBOSE
    hash = {}
    scan_count_tmp = Array.new(6, 0)
    # block form so the handle is closed even if an attribute is missing and
    # one of the conversions below raises
    File.open(mzxml_file) do |fh|
      @fh = fh
      @line = ""
      scan_count_tmp[0] = _el("scanCount").to_i
      hash[:start_time] = _el("startTime").sub(/^PT/, "").sub(/S$/,"").to_f
      hash[:end_time] = _el("endTime").sub(/^PT/, "").sub(/S$/,"").to_f
      hash[:ms_level] = _el("msLevel").to_i
      scan_count_tmp[1] = 1
      if hash[:ms_level] == 1
        hash[:start_mz] = _el("startMz").to_f
        hash[:end_mz] = _el("endMz").to_f
      end

      until @fh.eof?
        # step past the line that matched last time, then find the next msLevel
        @line = @fh.readline
        ms_level = _el("msLevel")
        break unless ms_level
        scan_count_tmp[ms_level.to_i] += 1
      end
    end
    hash[:scan_count] = scan_count_tmp.take_while { |cnt| cnt != 0 }
    hash
  end

  # returns [start_mz, end_mz] of the first full scan (ms_level == 1)
  # Returns nil when the file contains no msLevel="1" scan.
  def start_and_end_mz(mzxml_file)
    File.open(mzxml_file) do |fh|
      @fh = fh
      @line = ""
      loop do
        ms_level = _el("msLevel")
        return nil unless ms_level        # reached EOF without an MS1 scan
        break if ms_level.to_i == 1
        # _el leaves @line on the line that matched; without stepping past it
        # the same msLevel would be returned forever
        return nil if @fh.eof?
        @line = @fh.readline
      end
      [_el("startMz").to_f, _el("endMz").to_f]
    end
  end

  # Scans forward from the current @line (inclusive) for name="value" and
  # returns the value as a new String, or nil if EOF is reached first.
  # @line is left on the matching line, so several attributes that share one
  # line can be fetched by consecutive calls.
  def _el(name)
    # [^"]* stops at the closing quote, so other attributes on the same line
    # are not swallowed into the capture
    re = /#{name}="([^"]*)"/
    @line = @fh.readline while @line !~ re && !@fh.eof?
    found = re.match(@line)
    found && found[1]
  end

end
174
+
175
+