mspire 0.2.4 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (233) hide show
  1. data/INSTALL +1 -0
  2. data/README +25 -0
  3. data/Rakefile +129 -40
  4. data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
  5. data/bin/bioworks_to_pepxml.rb +1 -0
  6. data/bin/fasta_shaker.rb +1 -96
  7. data/bin/filter_and_validate.rb +5 -0
  8. data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
  9. data/bin/prob_validate.rb +6 -0
  10. data/bin/raw_to_mzXML.rb +2 -2
  11. data/bin/srf_group.rb +1 -0
  12. data/bin/srf_to_sqt.rb +40 -0
  13. data/changelog.txt +68 -0
  14. data/lib/align/chams.rb +6 -6
  15. data/lib/align.rb +4 -3
  16. data/lib/bsearch.rb +120 -0
  17. data/lib/fasta.rb +318 -86
  18. data/lib/group_by.rb +10 -0
  19. data/lib/index_by.rb +11 -0
  20. data/lib/merge_deep.rb +21 -0
  21. data/lib/{spec → ms/converter}/mzxml.rb +77 -109
  22. data/lib/ms/gradient_program.rb +171 -0
  23. data/lib/ms/msrun.rb +209 -0
  24. data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
  25. data/lib/ms/parser/mzdata/axml.rb +12 -0
  26. data/lib/ms/parser/mzdata/dom.rb +160 -0
  27. data/lib/ms/parser/mzdata/libxml.rb +7 -0
  28. data/lib/ms/parser/mzdata.rb +25 -0
  29. data/lib/ms/parser/mzxml/axml.rb +11 -0
  30. data/lib/ms/parser/mzxml/dom.rb +159 -0
  31. data/lib/ms/parser/mzxml/hpricot.rb +253 -0
  32. data/lib/ms/parser/mzxml/libxml.rb +15 -0
  33. data/lib/ms/parser/mzxml/regexp.rb +122 -0
  34. data/lib/ms/parser/mzxml/rexml.rb +72 -0
  35. data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
  36. data/lib/ms/parser/mzxml.rb +175 -0
  37. data/lib/ms/parser.rb +108 -0
  38. data/lib/ms/precursor.rb +10 -0
  39. data/lib/ms/scan.rb +81 -0
  40. data/lib/ms/spectrum.rb +193 -0
  41. data/lib/ms.rb +10 -0
  42. data/lib/mspire.rb +4 -0
  43. data/lib/roc.rb +61 -1
  44. data/lib/sample_enzyme.rb +31 -8
  45. data/lib/scan_i.rb +21 -0
  46. data/lib/spec_id/aa_freqs.rb +7 -3
  47. data/lib/spec_id/bioworks.rb +20 -14
  48. data/lib/spec_id/digestor.rb +139 -0
  49. data/lib/spec_id/mass.rb +116 -0
  50. data/lib/spec_id/parser/proph.rb +236 -0
  51. data/lib/spec_id/precision/filter/cmdline.rb +209 -0
  52. data/lib/spec_id/precision/filter/interactive.rb +134 -0
  53. data/lib/spec_id/precision/filter/output.rb +147 -0
  54. data/lib/spec_id/precision/filter.rb +623 -0
  55. data/lib/spec_id/precision/output.rb +60 -0
  56. data/lib/spec_id/precision/prob/cmdline.rb +139 -0
  57. data/lib/spec_id/precision/prob/output.rb +88 -0
  58. data/lib/spec_id/precision/prob.rb +171 -0
  59. data/lib/spec_id/proph/pep_summary.rb +92 -0
  60. data/lib/spec_id/proph/prot_summary.rb +484 -0
  61. data/lib/spec_id/proph.rb +2 -466
  62. data/lib/spec_id/protein_summary.rb +2 -2
  63. data/lib/spec_id/sequest/params.rb +316 -0
  64. data/lib/spec_id/sequest/pepxml.rb +1513 -0
  65. data/lib/spec_id/sequest.rb +2 -1672
  66. data/lib/spec_id/srf.rb +445 -177
  67. data/lib/spec_id.rb +183 -95
  68. data/lib/spec_id_xml.rb +8 -10
  69. data/lib/transmem/phobius.rb +147 -0
  70. data/lib/transmem/toppred.rb +368 -0
  71. data/lib/transmem.rb +157 -0
  72. data/lib/validator/aa.rb +135 -0
  73. data/lib/validator/background.rb +73 -0
  74. data/lib/validator/bias.rb +95 -0
  75. data/lib/validator/cmdline.rb +260 -0
  76. data/lib/validator/decoy.rb +94 -0
  77. data/lib/validator/digestion_based.rb +69 -0
  78. data/lib/validator/probability.rb +48 -0
  79. data/lib/validator/prot_from_pep.rb +234 -0
  80. data/lib/validator/transmem.rb +272 -0
  81. data/lib/validator/true_pos.rb +46 -0
  82. data/lib/validator.rb +214 -0
  83. data/lib/xml.rb +38 -0
  84. data/lib/xml_style_parser.rb +105 -0
  85. data/lib/xmlparser_wrapper.rb +19 -0
  86. data/script/compile_and_plot_smriti_final.rb +97 -0
  87. data/script/extract_gradient_programs.rb +56 -0
  88. data/script/get_apex_values_rexml.rb +44 -0
  89. data/script/mzXML2timeIndex.rb +1 -1
  90. data/script/smriti_final_analysis.rb +103 -0
  91. data/script/toppred_to_yaml.rb +47 -0
  92. data/script/tpp_installer.rb +1 -1
  93. data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
  94. data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
  95. data/specs/bin/fasta_shaker_spec.rb +259 -0
  96. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
  97. data/specs/bin/filter_and_validate_spec.rb +124 -0
  98. data/specs/bin/ms_to_lmat_spec.rb +34 -0
  99. data/specs/bin/prob_validate_spec.rb +62 -0
  100. data/specs/bin/protein_summary_spec.rb +10 -0
  101. data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
  102. data/specs/gi_spec.rb +22 -0
  103. data/specs/load_bin_path.rb +7 -0
  104. data/specs/merge_deep_spec.rb +13 -0
  105. data/specs/ms/gradient_program_spec.rb +77 -0
  106. data/specs/ms/msrun_spec.rb +455 -0
  107. data/specs/ms/parser_spec.rb +92 -0
  108. data/specs/ms/spectrum_spec.rb +89 -0
  109. data/specs/roc_spec.rb +251 -0
  110. data/specs/rspec_autotest.rb +149 -0
  111. data/specs/sample_enzyme_spec.rb +41 -0
  112. data/specs/spec_helper.rb +133 -0
  113. data/specs/spec_id/aa_freqs_spec.rb +52 -0
  114. data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
  115. data/specs/spec_id/digestor_spec.rb +75 -0
  116. data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
  117. data/specs/spec_id/precision/filter/output_spec.rb +31 -0
  118. data/specs/spec_id/precision/filter_spec.rb +243 -0
  119. data/specs/spec_id/precision/prob_spec.rb +111 -0
  120. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  121. data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
  122. data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
  123. data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
  124. data/specs/spec_id/sequest/params_spec.rb +68 -0
  125. data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
  126. data/specs/spec_id/sqt_spec.rb +138 -0
  127. data/specs/spec_id/srf_spec.rb +209 -0
  128. data/specs/spec_id/srf_spec_helper.rb +302 -0
  129. data/specs/spec_id_helper.rb +33 -0
  130. data/specs/spec_id_spec.rb +361 -0
  131. data/specs/spec_id_xml_spec.rb +33 -0
  132. data/specs/transmem/phobius_spec.rb +423 -0
  133. data/specs/transmem/toppred_spec.rb +297 -0
  134. data/specs/transmem_spec.rb +60 -0
  135. data/specs/transmem_spec_shared.rb +64 -0
  136. data/specs/validator/aa_spec.rb +107 -0
  137. data/specs/validator/background_spec.rb +51 -0
  138. data/specs/validator/bias_spec.rb +146 -0
  139. data/specs/validator/decoy_spec.rb +51 -0
  140. data/specs/validator/fasta_helper.rb +26 -0
  141. data/specs/validator/prot_from_pep_spec.rb +141 -0
  142. data/specs/validator/transmem_spec.rb +145 -0
  143. data/specs/validator/true_pos_spec.rb +58 -0
  144. data/specs/validator_helper.rb +33 -0
  145. data/specs/xml_spec.rb +12 -0
  146. data/test_files/000_pepxml18_small.xml +206 -0
  147. data/test_files/020a.mzXML.timeIndex +4710 -0
  148. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
  149. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
  150. data/test_files/4-03-03_small-prot.xml +321 -0
  151. data/test_files/4-03-03_small.xml +3876 -0
  152. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  153. data/test_files/bioworks-3.3_10prots.xml +5999 -0
  154. data/test_files/bioworks31.params +77 -0
  155. data/test_files/bioworks32.params +62 -0
  156. data/test_files/bioworks33.params +63 -0
  157. data/test_files/bioworks_single_run_small.xml +7237 -0
  158. data/test_files/bioworks_small.fasta +212 -0
  159. data/test_files/bioworks_small.params +63 -0
  160. data/test_files/bioworks_small.phobius +109 -0
  161. data/test_files/bioworks_small.toppred.out +2847 -0
  162. data/test_files/bioworks_small.xml +5610 -0
  163. data/test_files/bioworks_with_INV_small.xml +3753 -0
  164. data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
  165. data/test_files/corrupted_900.srf +0 -0
  166. data/test_files/head_of_7MIX.srf +0 -0
  167. data/test_files/interact-opd1_mods_small-prot.xml +304 -0
  168. data/test_files/messups.fasta +297 -0
  169. data/test_files/opd1/000.my_answer.100lines.xml +101 -0
  170. data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
  171. data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
  172. data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
  173. data/test_files/opd1/000_020-prot.png +0 -0
  174. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
  175. data/test_files/opd1/000_020_3prots-prot.xml +62 -0
  176. data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
  177. data/test_files/opd1/sequest.3.1.params +77 -0
  178. data/test_files/opd1/sequest.3.2.params +62 -0
  179. data/test_files/opd1/twenty_scans.mzXML +418 -0
  180. data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
  181. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  182. data/test_files/opd1/twenty_scans_answ.lmata +9 -0
  183. data/test_files/opd1_020_beginning.RAW +0 -0
  184. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
  185. data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
  186. data/test_files/pepproph_small.xml +4691 -0
  187. data/test_files/phobius.small.noheader.txt +50 -0
  188. data/test_files/phobius.small.small.txt +53 -0
  189. data/test_files/s01_anC1_ld020mM.key.txt +25 -0
  190. data/test_files/s01_anC1_ld020mM.meth +0 -0
  191. data/test_files/small.fasta +297 -0
  192. data/test_files/smallraw.RAW +0 -0
  193. data/test_files/tf_bioworks2excel.bioXML +14340 -0
  194. data/test_files/tf_bioworks2excel.txt.actual +1035 -0
  195. data/test_files/toppred.small.out +416 -0
  196. data/test_files/toppred.xml.out +318 -0
  197. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
  198. data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
  199. data/test_files/yeast_gly_small-prot.xml +265 -0
  200. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
  201. data/test_files/yeast_gly_small.xml +3807 -0
  202. data/test_files/yeast_gly_small2.parentTimes +6 -0
  203. metadata +273 -57
  204. data/bin/filter.rb +0 -6
  205. data/bin/precision.rb +0 -5
  206. data/lib/spec/mzdata/parser.rb +0 -108
  207. data/lib/spec/mzdata.rb +0 -48
  208. data/lib/spec/mzxml/parser.rb +0 -449
  209. data/lib/spec/scan.rb +0 -55
  210. data/lib/spec_id/filter.rb +0 -797
  211. data/lib/spec_id/precision.rb +0 -421
  212. data/lib/toppred.rb +0 -18
  213. data/script/filter-peps.rb +0 -164
  214. data/test/tc_aa_freqs.rb +0 -59
  215. data/test/tc_fasta_shaker.rb +0 -149
  216. data/test/tc_filter.rb +0 -203
  217. data/test/tc_filter_peps.rb +0 -46
  218. data/test/tc_gi.rb +0 -17
  219. data/test/tc_id_class_anal.rb +0 -70
  220. data/test/tc_id_precision.rb +0 -89
  221. data/test/tc_msrun.rb +0 -88
  222. data/test/tc_mzxml.rb +0 -88
  223. data/test/tc_mzxml_to_lmat.rb +0 -36
  224. data/test/tc_peptide_parent_times.rb +0 -27
  225. data/test/tc_precision.rb +0 -60
  226. data/test/tc_roc.rb +0 -166
  227. data/test/tc_sample_enzyme.rb +0 -32
  228. data/test/tc_scan.rb +0 -26
  229. data/test/tc_sequest.rb +0 -336
  230. data/test/tc_spec.rb +0 -78
  231. data/test/tc_spec_id.rb +0 -201
  232. data/test/tc_spec_id_xml.rb +0 -36
  233. data/test/tc_srf.rb +0 -262
@@ -0,0 +1,122 @@
1
+ require 'strscan'
2
+
3
module MS
  module Parser
    module MzXML
      # Regular-expression based mzXML reader.
      #
      # The enclosing modules are opened one level at a time (instead of the
      # compact MS::Parser::MzXML::Regexp form) so this file also loads when
      # the namespace has not been defined yet.
      class Regexp
        @@first_scan_regexp = /<scan /o

        # Splits a document into one chunk per scan.  Capture 1 is the scan
        # number, capture 2 is the rest of that scan up to (but excluding)
        # the next "<scan " start tag.  Chunking on start tags rather than on
        # the first "</scan>" keeps a nested child scan's <precursorMz> from
        # being attributed to its parent scan.
        @@scan_re = /<scan\s.*?num="(\d+)"(.*?)(?=<scan\s|\z)/mo

        include MS::Parser::MzXML

        # method::  name of the instance method that #parse dispatches to
        # version:: mzXML version string (only stored here)
        def initialize(method=:msrun, version='1.0')
          @method = method
          @version = version
        end

        # Calls the method chosen at construction time with +file+.
        def parse(file)
          send(@method, file)
        end

        # Intended to return a MS::MsRun object (unfinished).  Returns nil.
        #
        # NOTE(review): get_header is not defined anywhere in the visible
        # source; MsRun#get_header_info looks like the intended target --
        # confirm before relying on this method.
        def msrun(file)
          # block form closes the handle even when the header call raises
          File.open(file) do |fh|
            get_header(fh)
          end
          nil
        end

        #def msrun(file, opts={})
        #end

        # Returns an array where array[scan_num] = [precursorMz, precursorIntensity]
        # for every MS2 scan.  Values are strings.
        # (array will likely start at 1!)
        def self.precursor_mz_and_intensity_by_scan(file)
          prec_re = /msLevel="2".*?<precursorMz precursorIntensity="([\d\.]+)".*?>([\d\.]+)<\/precursorMz>/mo
          # captures arrive as [intensity, mz]; callers get [mz, intensity]
          self.by_scan_num(file, prec_re) {|match_obj| match_obj.captures.reverse}
        end

        # Reads the whole file, applies +regex+ to the text of every scan and
        # stores the block's result (or the MatchData itself when no block is
        # given) at the index of the scan number.  Scans that do not match
        # +regex+ leave a nil slot.
        # (array will likely start at 1!)
        def self.by_scan_num(file, regex)
          arr = []
          File.read(file).scan(@@scan_re) do |num, body|
            inner_match = regex.match(body)
            next unless inner_match
            arr[num.to_i] = block_given? ? yield(inner_match) : inner_match
          end
          arr
        end

        # Returns array where array[scan_num] = precursorMz
        # Parent scans are not arrayed
        # Values are strings.  Array index likely starts at 1!
        # @TODO: replace the use of a yield block
        def self.precursor_mz_by_scan(file)
          prec_re = /msLevel="2".*?<precursorMz.*?>([\d\.]+)<\/precursorMz>/mo
          self.by_scan_num(file, prec_re) {|match_obj| match_obj.captures[0]}
        end
      end
    end
  end
end
61
+
62
+
63
module MS
  module Parser
    module MzXML
      class Regexp
        # Line-oriented extractor for the <msRun> header of an mzXML file
        # (work in progress: #parse aborts before scans are read).
        class MsRun
          @@scan_count_regexp = /scanCount="(\d+)"/o
          @@start_time_regexp = /startTime="PT([\d\.]+)S"/o
          @@end_time_regexp = /endTime="PT([\d\.]+)S"/o
          @@first_scan_regexp = /<scan /

          # version:: mzXML version string (only stored here)
          def initialize(version='1.0')
            @version = version
          end

          # Reads the header attributes from +io+ and is meant to copy them
          # onto +msrun_object+.  Currently aborts the process right after
          # the header has been read because scan extraction is unfinished.
          def parse(io, msrun_object)
            atts = {}
            # keys come from the symbol list, values from the header
            [:scan_count, :start_time, :end_time].zip(get_header_info(io)) do |key, value|
              atts[key] = value
            end
            ###
            # HERE <------------------------------------
            abort "NEED TO FINISH WRITING SCANS EXTRACTOR!"
            get_scans(io)
            # HERE <------------------------------------

            # set the attributes
            # assumes msrun_object exposes scan_count=/start_time=/end_time=
            # writers -- TODO confirm against MS::MsRun
            atts.each do |key, value|
              msrun_object.send("#{key}=", value)
            end
            # need to fill in the scan_counts array
          end

          # assumes the attributes are each on a line
          # (placeholder: consumes +io+ line by line without extracting anything)
          def get_scans(io)
            io.each do |line|
            end
          end

          # returns [total_num_scans, start_time, end_time] (strings, or nil
          # when an attribute was not seen) and positions the handle so that
          # the next 'gets' will return the first line holding a "<scan " tag
          def get_header_info(io)
            scan_count = nil
            start_time = nil
            end_time = nil

            # start of the line about to be read; initialised from the
            # current position so a "<scan " on the very first line rewinds
            # to a valid offset instead of assigning nil to io.pos
            line_start = io.pos
            io.each do |line|
              if (m = @@scan_count_regexp.match(line))
                scan_count = m[1]
              end
              if (m = @@start_time_regexp.match(line))
                start_time = m[1]
              end
              if (m = @@end_time_regexp.match(line))
                end_time = m[1]
              end
              if line =~ @@first_scan_regexp
                io.pos = line_start
                break
              end
              line_start = io.pos
            end
            [scan_count, start_time, end_time]
          end
        end
      end
    end
  end
end
@@ -0,0 +1,72 @@
1
+ require 'rexml/document'
2
+ require 'rexml/streamlistener'
3
+
4
+ module MS::Parser::MzXML::REXMLStreamListener; end
5
+ class MS::Parser::MzXML::REXMLStreamListener::PrecMzByNum; end
6
+
7
# Shared driver for REXML stream listeners.
module REXMLStreamListenerHelper
  # Instantiates a listener, streams +file+ through it and returns the
  # listener's report.
  #
  # file::          path of the XML file to parse
  # const::         listener class, or the name (String/Symbol) of a listener
  #                 class nested inside the receiver (when the helper is
  #                 extended onto a class/module) or inside the receiver's
  #                 class (when it is included and called on an instance)
  # report_method:: zero-argument method called on the listener afterwards
  def parse_and_report(file, const, report_method=:report)
    # instances have no const_get, so look names up on their class
    scope = respond_to?(:const_get) ? self : self.class
    listener_class = const.is_a?(Module) ? const : scope.const_get(const)
    listener = listener_class.new
    File.open(file) do |fh|
      ::REXML::Document.parse_stream(fh, listener)
    end
    listener.send(report_method)
  end
end
16
+
17
module MS
  module Parser
    module MzXML
      # mzXML parser backed by the REXML stream API.
      class REXML
        include MS::Parser::MzXML

        # version:: mzXML version string (only stored here)
        # method::  name of the parse method to use (only stored here)
        #
        # NOTE(review): the sibling parsers take (method, version) in the
        # opposite order; the order is left unchanged here for existing
        # callers -- confirm which one the factory expects.
        def initialize(version='1.0', method=:msrun)
          @version = version
          # the original read an undefined `parse_type` here
          @method = method
        end

        # returns an array indexed by scan_num that gives the precursor_mz
        # (+opts+ is accepted but not used)
        def precursor_mz_by_scan(file, opts={})
          listener = PrecMzByNum.new
          # ::REXML is the library; a bare REXML in this nesting is this class
          File.open(file) do |fh|
            ::REXML::Document.parse_stream(fh, listener)
          end
          listener.report
        end
      end
    end
  end
end
31
+
32
+
33
+
34
+
35
module MS
  module Parser
    module MzXML
      class REXML
        # REXML stream listener that records the text of each <precursorMz>
        # element at the index of the enclosing scan's "num" attribute.
        # Values are kept as the strings handed over by the parser.
        class PrecMzByNum
          include ::REXML::StreamListener

          attr_accessor :prec_mz
          alias_method :report, :prec_mz

          def initialize
            @get_data = false
            @scan_num = nil
            @prec_mz = []
          end

          # Remembers the current scan number and switches text capture on
          # when a <precursorMz> element opens.
          def tag_start(name, attrs)
            case name
            when "scan"
              @scan_num = attrs["num"].to_i
            when "precursorMz"
              @get_data = true
            end
          end

          # Switches text capture off again when </precursorMz> is reached.
          def tag_end(name)
            @get_data = false if name == "precursorMz"
          end

          # Stores text seen while inside <precursorMz> for the current scan.
          def text(txt)
            return unless @get_data
            @prec_mz[@scan_num] = txt
          end
        end
      end
    end
  end
end
69
+
70
+
71
+
72
+
@@ -0,0 +1,248 @@
1
+ require 'xmlparser_wrapper'
2
+
3
# Wrapper class: chooses one of the nested XMLParser listener classes by
# name and runs it through the helpers mixed in from XMLParserWrapper.
class MS::Parser::MzXML::XMLParser
  include XMLStyleParser
  include MS::Parser::MzXML
  include XMLParserWrapper

  # parse_type:: name of the parse method to use (only stored here)
  # version::    mzXML version string (only stored here)
  def initialize(parse_type=:msrun, version='1.0')
    @method = parse_type
    @version = version
  end

  # returns: [times_arr, [m/z,inten,m/z,inten...]]
  # where times are time strings (in seconds)
  # (+opts+ is accepted but not used)
  def times_and_spectra(file, opts={})
    parse_and_report(file, 'TimesAndSpectra')
  end


  ## IN PROGRESS ...
  # opts is actually the msrun object that will be fleshed out in the parsing
  #
  # Still a debugging stub: prints +opts+ and the header reply, then aborts
  # the process.  The block form of File.open makes sure the handle is
  # closed on the way out (the original never reached its fh.close).
  def msrun(file, opts={})
    p opts
    File.open(file) do |fh|
      reply = parse_and_report_io(fh, 'MsRunHeader')
      p reply
      abort
    end
  end

  # Not implemented yet; returns nil.
  def prec_mz_by_scan_num(file, opts={})
  end

  # could easily do this for all these guys
  #def method_missing(*args)
  #  method = args.shift
  #  parse_and_report(
  #end

end
42
+
43
# Listener that captures the scanCount, startTime and endTime attributes of
# the <msRun> element and stops once </dataProcessing> has been seen.
class MS::Parser::MzXML::XMLParser::MsRunHeader < XMLParser
  def initialize(version='1.0')
    @version = version
    @atts = []
  end

  # Returns [scanCount, startTime, endTime] as captured from <msRun>
  # (empty until that element has been seen).
  # presumably what parse_and_report_io invokes, as the sibling listeners
  # all expose #report -- confirm against XMLParserWrapper
  def report
    @atts
  end

  def startElement(name,attrs)
    case name
    when 'msRun'
      # splat: values_at takes one argument per key; passing the array
      # itself looks up a single (non-existent) array key
      @atts = attrs.values_at(*%w(scanCount startTime endTime))
    end
  end

  def endElement(name)
    if name == 'dataProcessing'
      # done/reset are not defined in this file -- presumably inherited
      # from XMLParser; confirm
      done
      reset
    end
  end
end
63
+
64
# Listener that builds one spectrum per <peaks> element and a scan record
# per <scan> element.
class MS::Parser::MzXML::XMLParser::Spectrum < XMLParser
  @@scan_atts = %w(num msLevel retentionTime startMz endMz)
  @@precursor_mz_atts = %w(precursorIntensity)


  def initialize(version='1.0')
    @version = version
    @spectrum = []
    @current_scan = nil
    # capture flags start out explicitly off
    @get_peaks = false
    @get_precursor_mz = false
  end

  # Returns the array of spectra collected so far.
  def report
    @spectrum
  end

  def startElement(name,attrs)
    if name == 'scan'
      # splat: values_at takes one argument per key
      vals = attrs.values_at(*@@scan_atts)
      vals[2] = vals[2][2...-1].to_f #remove PT and trailing S
      [0, 1].each do |i| vals[i] = vals[i].to_i end  # num and ms_level
      [3, 4].each do |i| vals[i] = vals[i].to_f end  # start_mz and end_mz
      @current_scan = MS::Scan.new(vals)
    elsif name == 'precursorMz'
      # 5, 6, 7 are the scans indices for prec_mz prec_inten and parent
      @current_scan[6] = attrs['precursorIntensity'].to_f
      @current_scan[5] = ''
      @get_precursor_mz = true
    elsif name == 'peaks'
      @precision = attrs['precision'].to_i
      @get_peaks = true
      @current_peaks_string = ''
    end
  end

  def endElement(name)
    if name == 'peaks'
      @get_peaks = false
      # NOTE(review): a bare Spectrum constant is used as in the original;
      # which class that resolves to at runtime is not visible here -- confirm.
      spectrum = Spectrum.new(@current_peaks_string, @precision)
      # the original called context= on the @spectrum Array itself, which
      # has no such writer; attaching the scan to the new spectrum is the
      # presumed intent -- TODO confirm the spectrum class has context=
      spectrum.context = @current_scan
      @spectrum << spectrum
    elsif name == 'precursorMz'
      # text was accumulated as a string; store it as a float
      @current_scan[5] = @current_scan[5].to_f
      @get_precursor_mz = false
    end
  end

  # Appends character data to whichever element is currently being captured.
  def character(data)
    if @get_peaks
      @current_peaks_string << data
    elsif @get_precursor_mz
      @current_scan[5] << data
    end
  end

end
118
+
119
+
120
+
121
+
122
# Listener that records the precursor m/z (as a Float) of each scan at the
# index of the scan's "num" attribute.
class MS::Parser::MzXML::XMLParser::PrecMzByNum < XMLParser
  attr_accessor :prec_mz
  alias_method :report, :prec_mz

  def initialize
    @prec_mz = []
    # per-instance parse state; the original assigned these in the class
    # body, which creates class-level instance variables that instances
    # never see
    @scan_num = nil
    @get_data = false
    @current_prec_mz = ""
  end

  def startElement(name,attrs)
    if name == "scan"
      @scan_num = attrs["num"].to_i
    elsif name == "precursorMz"
      @current_prec_mz = ""
      @get_data = true
    end
  end

  def endElement(name)
    if name == "precursorMz"
      @get_data = false
      @prec_mz[@scan_num] = @current_prec_mz.to_f
    end
  end

  # Accumulates character data while inside <precursorMz> (it may arrive
  # in more than one piece).
  def character(data)
    if @get_data
      @current_prec_mz << data
    end
  end

end
156
+
157
+
158
+ =begin
159
+
160
+
161
+ # Returns parallel arrays (times, spectra) where each spectra is an array
162
+ # containing alternating mz and intensity (MS1 scans only)
163
+ # and times are strings with the time in seconds
164
+ class MS::Parser::MzXML::XMLParser::TimesAndSpectra < XMLParser
165
+ include MS::Parser::MzXML
166
+ @@get_data = false
167
+ @@get_peaks = false
168
+ @@precision = 32 # @TODO: set dynamic
169
+
170
+ attr_accessor :times, :spectra
171
+ def times_and_spectra
172
+ [@times, @spectra]
173
+ end
174
+
175
+ alias_method :report, :times_and_spectra
176
+
177
+ def initialize(ms_level=1)
178
+ @ms_level = "#{ms_level}"
179
+ @times = []
180
+ @spectra = []
181
+ end
182
+
183
+ def startElement(name,attrs)
184
+ if name == "scan" && attrs["msLevel"] == @ms_level
185
+ @times << attrs["retentionTime"][2...-1] # strip PT and S: "PTx.xxxxS"
186
+ @@get_peaks = true
187
+ elsif name == "peaks" && @@get_peaks
188
+ @@get_data = true
189
+ @data = ""
190
+ end
191
+ end
192
+
193
+ def character(data)
194
+ if @@get_data
195
+ @data << data
196
+ end
197
+ end
198
+
199
+ def endElement(name)
200
+ if name == "peaks" && @@get_peaks
201
+ @spectra << base64_peaks_to_array(@data, @@precision)
202
+ @@get_data = false
203
+ @@get_peaks = false
204
+ end
205
+ end
206
+
207
+ end
208
+
209
+
210
+ class MS::Parser::MzXML::XMLParser::TimeMzIntenIndexer < XMLParser
211
+
212
+ @@scan_num = nil
213
+ @@get_data = false
214
+
215
+ attr_accessor :scans_by_num
216
+ alias_method :report, :scans_by_num
217
+
218
+ def initialize
219
+ @current_scan = nil
220
+ @scans_by_num = []
221
+ end
222
+
223
+ def startElement(name,attrs)
224
+ if name == "scan"
225
+ num = attrs["num"].to_i
226
+ @current_scan = MS::Scan.new(num, attrs["msLevel"].to_i, attrs["retentionTime"].gsub(/^PT/,'').gsub(/S$/,'').to_f)
227
+ scans_by_num[num] = @current_scan
228
+ elsif name == "precursorMz"
229
+ @current_scan.prec_inten = attrs["precursorIntensity"].to_f
230
+ @@get_data = true
231
+ end
232
+ end
233
+
234
+ def endElement(name)
235
+ if name == "precursorMz"
236
+ @@get_data = false
237
+ end
238
+ end
239
+
240
+ def character(data)
241
+ if @@get_data
242
+ @current_scan.prec_mz = data
243
+ end
244
+ end
245
+
246
+ end
247
+
248
+ =end
@@ -0,0 +1,175 @@
1
+ require 'ms/msrun'
2
+
3
+ module MS; end
4
+
5
+ module MS::Parser::MzXML
6
+ Base_dir_for_parsers = 'ms/parser/mzxml'
7
+ # inherits XMLStyleParser and version
8
+ include MS::Parser
9
+ include XMLStyleParser
10
+
11
# Factory: loads the parser implementations found under
# Base_dir_for_parsers, lets XMLStyleParser.choose_parser pick a class for
# +parse_type+ and returns a new instance of it, built with
# (parse_type, version).
def self.new(parse_type=:msrun, version='1.0')
  @version = version
  @method = parse_type
  XMLStyleParser.require_parse_files(Base_dir_for_parsers)
  XMLStyleParser.choose_parser(self, parse_type).new(parse_type, version)
end
20
+
21
# Returns an array of scans indexed by scan number
# NOTE that the first scan (zero indexed) will likely be nil!
# accepts an optional parse_type = 'xmlparser' | 'rexml'
# (falls back to default_parser, defined outside this method, when nil)
# Raises ArgumentError for any other parse_type.
def scans_by_num(mzXML_file, parse_type=nil)
  parse_type ||= default_parser
  scans = []
  case parse_type
  when 'xmlparser'
    # NOTE(review): MS::MzXML::XMLParser::TimeMzIntenIndexer is not defined
    # in the visible source (a class of that name exists only inside a
    # =begin/=end block under MS::Parser::MzXML::XMLParser) -- confirm.
    parser = MS::MzXML::XMLParser::TimeMzIntenIndexer.new
    parser.parse(IO.read(mzXML_file))
    scans = parser.scans_by_num
  when 'rexml' # use REXML
    # This is really too slow for files of this size
    # ::REXML: inside this module a bare REXML can resolve to the parser
    # class MS::Parser::MzXML::REXML instead of the library.
    # The block form closes the handle once the document has been built.
    doc = File.open(mzXML_file) { |fh| ::REXML::Document.new(fh) }
    doc.elements.each('msRun/scan') do |scan|
      level = scan.attributes['msLevel'].to_i
      prec_mz = nil
      prec_int = nil
      if level != 1
        scan.elements.each("precursorMz") do |prec|
          prec_mz = prec.text.to_f
          prec_int = prec.attributes["precursorIntensity"].to_f
        end
      end
      # remove the leading PT and trailing S on the retention time!
      # (the attribute looks like "PT0.154000S")
      rt = scan.attributes['retentionTime'][2...-1]

      num = scan.attributes['num'].to_i
      scans[num] = MS::Scan.new(num, level, rt.to_f, prec_mz, prec_int)
    end
  else
    # raise, not throw: throw is for catch/throw and would surface as an
    # UncaughtThrowError with an unrelated message
    raise ArgumentError, "invalid parse type: #{parse_type}"
  end
  ## update the scans for parents
  MS::Scan.add_parent_scan(scans)
  scans
end
62
+
63
# Returns a Hash indexed by filename (with no extension) for a given path
# extension = glob (string) or regex
# The basename is given as: file.split('.').first
# Each value is whatever precursor_mz_by_scan(file, parse_type) returns.
# Any other kind of +extension+ prints a message and yields an empty Hash.
def precursor_mz_by_scan_for_path(path, extension, parse_type=nil)
  hash = {}
  Dir.chdir path do
    files =
      case extension
      when ::String
        Dir[extension]
      # ::Regexp is spelled out because this module also contains a parser
      # class named Regexp (MS::Parser::MzXML::Regexp), which a bare
      # `Regexp` would resolve to once that class is loaded
      when ::Regexp
        Dir.entries(".").select { |entry| entry =~ extension }
      else
        puts "extension: #{extension} not a String or Regexp!"
        []
      end
    files.each do |file|
      base = file.split('.').first
      hash[base] = precursor_mz_by_scan(file, parse_type)
    end
  end
  hash
end
86
+
87
# Returns hash where hash[scan_num] = [precursorMz, precursorIntensity]
# Parent scans are not hashed
# Keys and values are both strings
#
# NOTE: the description above is the intended contract only; the body is
# still empty, so the method currently ignores +file+ and returns nil.
def precursor_mz_and_inten_by_scan(file)
  # in progress
end
93
+
94
# Returns array where array[scan_num] = precursorMz
# precursorMz are Floats
# Array index likely starts at 1!
#
# NOTE: the description above is the intended contract only; the body here
# is empty, so this version ignores +file+ and returns nil.
def precursor_mz_by_scan_num(file)
  ## THIS SHOULD BE CREATED IN specific XML LIBS
end
100
+
101
# Returns a hash of basic info on an mzXML run:
#   *mzXML_elemt*   *hash keys (symbols)*
#   scanCount       scan_count
#   startTime       start_time
#   endTime         end_time
#   startMz         start_mz
#   endMz           end_mz
#
# :ms_level is the msLevel of the first scan; :start_mz/:end_mz are only
# set when that level is 1.  :scan_count is an array whose first entry is
# the scanCount attribute, followed by the number of scans counted per ms
# level (1, 2, ...), cut off at the first zero entry.
#
# Uses @fh and @line as the cursor shared with _el.  The file is closed
# even when a lookup raises.
def basic_info(mzxml_file)
  puts "parsing: #{mzxml_file} #{File.exist?(mzxml_file)}" if $VERBOSE
  hash = {}
  # slot 0 holds scanCount, slots 1..5 the per-level counters
  scan_count_tmp = Array.new(6, 0)
  @fh = File.open(mzxml_file)
  begin
    @line = ""
    scan_count_tmp[0] = _el("scanCount").to_i
    hash[:start_time] = _el("startTime").sub(/^PT/, "").sub(/S$/,"").to_f
    hash[:end_time] = _el("endTime").sub(/^PT/, "").sub(/S$/,"").to_f
    hash[:ms_level] = _el("msLevel").to_i
    # the scan just inspected is counted as the first level-1 scan
    scan_count_tmp[1] = 1
    if hash[:ms_level] == 1
      hash[:start_mz] = _el("startMz").to_f
      hash[:end_mz] = _el("endMz").to_f
    end

    until @fh.eof?
      @line = @fh.readline
      ms_level = _el("msLevel")
      break unless ms_level
      level = ms_level.to_i
      # to_i turns a missing slot (levels above 5) into 0 instead of
      # failing on nil + 1
      scan_count_tmp[level] = scan_count_tmp[level].to_i + 1
    end
    hash[:scan_count] = scan_count_tmp.take_while { |cnt| !cnt.to_i.zero? }
    hash
  ensure
    @fh.close
  end
end
146
+
147
# returns [start_mz, end_mz] of the first full scan (ms_level == 1)
# Returns nil when the file holds no scan with msLevel 1 (the original
# looped forever in that case).
#
# Uses @fh and @line as the cursor shared with _el; the file is always
# closed before returning.
def start_and_end_mz(mzxml_file)
  @fh = File.open(mzxml_file)
  begin
    @line = ""
    loop do
      level = _el("msLevel")
      return nil if level.nil?
      break if level.to_i == 1
      return nil if @fh.eof?
      # _el keeps answering from the current line while it still matches,
      # so step past it before looking for the next msLevel
      @line = @fh.readline
    end
    start_mz = _el("startMz").to_f
    end_mz = _el("endMz").to_f
    [start_mz, end_mz]
  ensure
    @fh.close
  end
end
160
+
161
# Returns the value of the first name="value" attribute found at or after
# the current line (@line), reading further lines from @fh as needed, or
# nil when the end of the file is reached without a match.  @line is left
# on the matching line, so several attributes of one line can be fetched
# by consecutive calls.
def _el(name)
  # [^"]* stops at the closing quote; a greedy .* would run on to the last
  # quote of the line and swallow the attributes that follow
  re = /#{name}="([^"]*)"/
  loop do
    found = re.match(@line)
    return found[1].dup if found
    return nil if @fh.eof?
    @line = @fh.readline
  end
end
172
+
173
+ end
174
+
175
+