mzml 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,22 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## PROJECT::GENERAL
17
+ coverage
18
+ rdoc
19
+ doc
20
+ pkg
21
+
22
+ ## PROJECT::SPECIFIC
data/.yardoc ADDED
Binary file
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Angel Pizarro
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,20 @@
1
+ = mzml
2
+
3
+ MzML is a standard data format for encoding mass spectrometry data. For more information, see the mzML specification at http://psidev.info/index.php?q=node/257
4
+
5
+ This library is a non-validating mzML version 1.1 parser/reader.
6
+
7
+
8
+ == Note on Patches/Pull Requests
9
+
10
+ * Fork the project. It is hosted @ http://github.com/delagoya/mzml
11
+ * Make your feature addition or bug fix.
12
+ * Add tests for it. This is important so I don't break it in a
13
+ future version unintentionally.
14
+ * Commit, do not mess with rakefile, version, or history.
15
+ (if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
16
+ * Send me a pull request. Bonus points for topic branches.
17
+
18
+ == Copyright
19
+
20
+ Copyright (c) 2009 Angel Pizarro. See LICENSE for details.
data/Rakefile ADDED
@@ -0,0 +1,44 @@
1
require 'rubygems'
require 'rake'

# Gem packaging tasks. They are optional: when Jeweler cannot be loaded a
# hint is printed and the remaining tasks are still defined.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "mzml"
    gem.summary = %Q{A non-validating mzML parser}
    gem.description = %Q{A non-validating mzML parser. MzML is a standard data format for representing mass spectrometry data.}
    gem.email = "angel@delagoya.com"
    gem.homepage = "http://github.com/delagoya/mzml"
    gem.authors = ["Angel Pizarro"]
    gem.add_development_dependency "rspec", ">= 1.2.9"
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
    gem.add_dependency "nokogiri", ">= 1.3.3"
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

# Runs every spec under spec/.
require 'spec/rake/spectask'
Spec::Rake::SpecTask.new(:spec) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.spec_files = FileList['spec/**/*_spec.rb']
end

# Same specs, executed with rcov enabled.
Spec::Rake::SpecTask.new(:rcov) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.pattern = 'spec/**/*_spec.rb'
  spec.rcov = true
end

# :check_dependencies is not defined in this file (presumably it comes from
# the Jeweler tasks above - confirm). Only make :spec depend on it when it
# exists, so the specs can still run after the LoadError branch was taken.
if Rake::Task.task_defined?(:check_dependencies)
  task :spec => :check_dependencies
end

task :default => :spec

# API documentation via YARD.
require 'yard'
YARD::Rake::YardocTask.new do |yardoc|
  # strip: keep a trailing newline in VERSION out of the documentation title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""
  yardoc.options = ["--title", "mzml #{version}", "-r", "README.rdoc"]
  yardoc.files = ['README*','lib/**/*.rb']
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
data/lib/mzml.rb ADDED
@@ -0,0 +1,228 @@
1
+ require 'nokogiri'
2
+ require 'base64'
3
+ require 'zlib'
4
+
5
+ #--
6
+ # This program is free software; you can redistribute it and/or modify
7
+ # it under the terms of the GNU Library or "Lesser" General Public
8
+ # License (LGPL) as published by the Free Software Foundation;
9
+ # either version 2 of the License, or (at your option) any later
10
+ # version.
11
+ # Author: Angel Pizarro
12
+ # Date: 12/05/2009
13
+ # Copyright: Angel Pizarro, Copyright (c) University of Pennsylvania. All rights reserved.
14
+ #
15
+
16
+ # == MzML
17
+ #
18
+ # A non-validating mzML v 1.1.0 parser. Most annotation is left as XML DOM
19
+ # objects. See the Nokogiri::XML::Node and Nokogiri::XML::NodeSet
20
+ # documentation on how to work with these.
21
+ #
22
+ # ===USAGE:
23
+ #
24
+ # require 'mzml'
25
+ # mzml = MzML::Doc.new("test.mzML")
26
+
27
+ module MzML
28
+
29
# An internal module containing the regular expressions used to locate
# landmarks in raw mzML text.
#
# The tag patterns use a lazy [^>]*? instead of a greedy, multi-line .* so
# that every capture is taken from the tag that was actually matched rather
# than from the last matching attribute anywhere in the searched text.
module RGX
  # The file byte offset of the start of the file index (capture 1).
  INDEX_OFFSET = /<indexListOffset>(\d+)<\/indexListOffset>/
  # The start of either a spectrumList or a chromatogramList.
  # Capture 1 is the list type, capture 2 the value of its count attribute.
  DATA_LIST_START = /<(spectrum|chromatogram)List\s[^>]*?\bcount=["'](\d+)/
  # The start of a spectrum or chromatogram element.
  # Capture 1 is the element type, capture 2 the value of its id attribute.
  DATA_START = /<(spectrum|chromatogram)\s[^>]*?\bid=["']([^'"]+)["']/
  # The end of a spectrum or chromatogram element.
  # Capture 1 is the whole closing tag, capture 2 the element type.
  DATA_END = /(<\/(spectrum|chromatogram)>)/
end
40
+
41
# Parses +xml+ with Nokogiri and returns the root element of the resulting
# document.
def parse(xml)
  document = Nokogiri::XML.parse(xml)
  document.root
end
44
+
45
# Raised by MzML::Doc.new when the file name does not end in ".mzML".
#
# Derives from StandardError (not Exception) so that a plain `rescue`
# catches it.
class UnsupportedFileFormat < StandardError
end

# Raised when a spectrum or chromatogram ID is not present in the index.
#
# Derives from StandardError (not Exception) so that a plain `rescue`
# catches it.
class BadIdentifier < StandardError
end
49
+
50
# An mzML file opened for random access to its spectra and chromatograms.
#
# Doc is a File: every instance holds a read-only handle on the underlying
# file, so call #close when you are done with it. On construction the
# offset index is loaded, either from the index list stored in the file or,
# when no <indexListOffset> is found, by scanning the whole file.
class Doc < File
  # Number of bytes read from the end of the file when looking for the
  # <indexListOffset> element.
  INDEX_TAIL_SIZE = 200
  # Number of bytes requested per read while scanning.
  CHUNK_SIZE = 1024
  # Opening tag of a spectrum or chromatogram. Capture 1 is the element
  # type, capture 2 its id. The lazy [^>]*? keeps the match inside one tag.
  ENTRY_START = /<(spectrum|chromatogram)\s[^>]*?\bid=["']([^"']+)["']/
  # Length of the longest closing tag ("</chromatogram>") minus one: how far
  # back a new search must start to catch a tag split across two reads.
  END_TAG_OVERLAP = 14

  # index is a Hash of element type (:spectrum, :chromatogram) to a Hash of
  # id => byte offset. fname is the name passed to the constructor.
  # spectrum_count, chromatogram_count and node are declared for callers but
  # are not assigned anywhere in this class.
  attr_reader :index, :fname, :spectrum_count, :chromatogram_count, :node

  # Opens +mz_fname+ read-only and loads its index.
  #
  # Raises MzML::UnsupportedFileFormat unless the name ends in ".mzML".
  def initialize(mz_fname)
    # \z (not $) so that a name with an embedded newline cannot pass.
    unless mz_fname.to_s =~ /\.mzML\z/
      raise MzML::UnsupportedFileFormat, "File extension must be .\"mzML\""
    end
    super(mz_fname, "r")
    @fname = mz_fname
    @index = parse_index_list
  end

  # Returns the parsed XML of the chromatogram with the given ID (whatever
  # Nokogiri::XML.parse returns for that fragment).
  #
  # Raises MzML::BadIdentifier when the ID is not in the index.
  def chromatogram(chromatogram_id)
    seek offset_for(:chromatogram, chromatogram_id)
    parse_next
  end

  # Returns a Spectrum built from the spectrum element with the given ID.
  #
  # Raises MzML::BadIdentifier when the ID is not in the index.
  def spectrum(spectrum_id)
    seek offset_for(:spectrum, spectrum_id)
    Spectrum.new(parse_next)
  end

  # Loads the index from the index list stored in the file and returns it.
  # Falls back to #compute_index_list when the end of the file holds no
  # <indexListOffset> element.
  def parse_index_list
    # Clamp at 0: files shorter than the tail window are read whole.
    seek([stat.size - INDEX_TAIL_SIZE, 0].max)
    offset_match = MzML::RGX::INDEX_OFFSET.match(read)
    return compute_index_list unless offset_match

    @index = {}
    seek(offset_match[1].to_i)
    index_root = Nokogiri::XML.parse(read).root
    index_root.css("index").each do |idx|
      index_type = idx[:name].to_sym
      @index[index_type] = {}
      idx.css("offset").each do |o|
        @index[index_type][o[:idRef]] = o.text.to_i
      end
    end
    @index
  end

  # Builds the index by scanning the file from the start for the opening
  # tags of spectrum and chromatogram elements, and returns it.
  def compute_index_list
    @index = {}
    rewind
    buffer = String.new # binary, like the chunks appended to it, so string offsets are byte offsets
    buffer_offset = 0   # file position of buffer[0]
    until eof?
      buffer << read(CHUNK_SIZE)
      scanned_to = 0
      buffer.scan(ENTRY_START) do
        found = Regexp.last_match
        (@index[found[1].to_sym] ||= {})[found[2]] = buffer_offset + found.begin(0)
        scanned_to = found.end(0)
      end
      # Carry over only a trailing tag that is still incomplete (a "<" with
      # no ">" after it); everything else has been fully examined.
      tail_start = buffer.rindex("<")
      keep_from = if tail_start && tail_start >= scanned_to && !buffer.index(">", tail_start)
                    tail_start
                  else
                    buffer.length
                  end
      buffer_offset += keep_from
      buffer = buffer[keep_from..-1]
    end
    @index
  end

  # Reads from the current position up to and including the next closing
  # spectrum or chromatogram tag (or to the end of the file when there is
  # none) and returns that text parsed by Nokogiri::XML.parse.
  def parse_next
    buffer = String.new
    until eof?
      # Only the new chunk needs searching, plus enough overlap to catch a
      # closing tag that straddles two reads.
      search_from = [buffer.length - END_TAG_OVERLAP, 0].max
      buffer << read(CHUNK_SIZE)
      next unless buffer.index(MzML::RGX::DATA_END, search_from)
      buffer = buffer[0, Regexp.last_match.end(0)]
      break
    end
    Nokogiri::XML.parse(buffer)
  end

  private

  # Looks up the byte offset recorded for +id+ in the +type+ table of the
  # index; raises MzML::BadIdentifier when the table or the ID is missing.
  def offset_for(type, id)
    entries = @index[type]
    unless entries && entries.key?(id)
      raise MzML::BadIdentifier, "Invalid ID '#{id}'"
    end
    entries[id]
  end
end
145
+
146
# A single mass spectrum read from an mzML document.
#
# The constructor accepts either the <spectrum> element itself or a parsed
# document whose root is that element (MzML::Doc#parse_next returns the
# latter). Annotation is copied into plain attributes and the m/z and
# intensity arrays are decoded into Arrays of Floats.
class Spectrum
  # PSI-MS controlled vocabulary accessions looked up while parsing.
  MZ_ARRAY_ACCESSION = 'MS:1000514'         # m/z array
  INTENSITY_ARRAY_ACCESSION = 'MS:1000515'  # intensity array
  ZLIB_ACCESSION = 'MS:1000574'             # zlib compression
  FLOAT64_ACCESSION = 'MS:1000523'          # 64-bit float
  SCAN_START_TIME_ACCESSION = 'MS:1000016'  # scan start time

  # type, charge, precursor, title, polarity and representation are exposed
  # for callers but are not populated by this class.
  attr_accessor :id, :default_array_length, :spot_id, :type,
                :charge, :precursor, :base_peak_mz, :base_peak_intensity, :ms_level,
                :high_mz, :low_mz, :title, :tic, :polarity, :representation, :mz_node, :intensity_node,
                :mz, :intensity, :precursor_list, :scan_list, :retention_time
  # node is the object handed to the constructor; params maps cvParam names
  # to their value attribute for the spectrum's direct cvParam children.
  attr_reader :node, :params

  # Builds the spectrum and parses all of its annotation and binary data.
  #
  # spectrum_node:: the <spectrum> element, or a Nokogiri::XML::Document
  #                 whose root is that element.
  def initialize(spectrum_node)
    @node = spectrum_node
    # All lookups below are relative to the <spectrum> element, so unwrap a
    # document to its root first.
    @element = spectrum_node.is_a?(Nokogiri::XML::Document) ? spectrum_node.root : spectrum_node
    @params = {}
    parse_element
  end

  protected

  # Pulls all of the annotation out of the spectrum element.
  def parse_element
    return if @element.nil?

    @id = @element[:id]
    @default_array_length = @element[:defaultArrayLength]
    @spot_id = @element[:spotID]
    @params = @element.xpath("cvParam").each_with_object({}) do |prm, memo|
      memo[prm[:name]] = prm[:value]
    end
    @ms_level = @params["ms level"].to_i
    @low_mz = float_param("lowest observed m/z")
    @high_mz = float_param("highest observed m/z")
    # Floats, not Integers: to_i would truncate fractional values and would
    # read only the leading digits of exponent notation such as "1.6e07".
    @tic = float_param("total ion current")
    @base_peak_mz = float_param("base peak m/z")
    @base_peak_intensity = float_param("base peak intensity")

    @precursor_list = nil
    parse_precursor_list if @element.xpath("precursorList").first

    @scan_list = @element.xpath("scanList").first ? parse_scan_list : nil

    parse_binary_data
  end

  # Returns the named cvParam value as a Float, or nil when the spectrum
  # has no cvParam with that name.
  def float_param(name)
    @params[name].to_f if @params.key?(name)
  end

  # Stores the precursors as an Array of [spectrumRef, precursor node] pairs.
  def parse_precursor_list
    @precursor_list = @element.xpath("precursorList/precursor").map do |pre|
      [pre[:spectrumRef], pre]
    end
  end

  # Stores the scan nodes and the first scan start time cvParam node (nil
  # when there is none). Returns the scan nodes.
  def parse_scan_list
    @retention_time = @element.xpath("scanList/scan/cvParam[@accession='#{SCAN_START_TIME_ACCESSION}']").first
    @scan_list = @element.xpath("scanList/scan")
  end

  # Decodes the m/z and intensity arrays. Each node/array pair is left nil
  # when the spectrum carries no such binaryDataArray.
  def parse_binary_data
    @mz_node, @mz = decode_binary_array(MZ_ARRAY_ACCESSION)
    @intensity_node, @intensity = decode_binary_array(INTENSITY_ARRAY_ACCESSION)
  end

  # Finds the binaryDataArray tagged with +accession+ and returns
  # [array node, decoded Floats], or [nil, nil] when there is none.
  def decode_binary_array(accession)
    marker = @element.xpath("binaryDataArrayList/binaryDataArray/cvParam[@accession='#{accession}']").first
    return [nil, nil] unless marker

    array_node = marker.parent
    binary = array_node.xpath("binary").first
    data = Base64.decode64(binary ? binary.text : "")
    if array_node.xpath("cvParam[@accession='#{ZLIB_ACCESSION}']").first
      # need to uncompress the data
      data = Zlib::Inflate.inflate(data)
    end
    # 64-bit floats? default is 32-bit (both little-endian)
    dtype = array_node.xpath("cvParam[@accession='#{FLOAT64_ACCESSION}']").first ? "E*" : "e*"
    [array_node, data.unpack(dtype)]
  end
end
228
+ end
data/mzml.gemspec ADDED
@@ -0,0 +1,62 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{mzml}
8
+ s.version = "0.1.0"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Angel Pizarro"]
12
+ s.date = %q{2009-12-06}
13
+ s.description = %q{A non-validating mzML parser. MzML is a standard data format for representing mass spectrometry data.}
14
+ s.email = %q{angel@delagoya.com}
15
+ s.extra_rdoc_files = [
16
+ "LICENSE",
17
+ "README.rdoc"
18
+ ]
19
+ s.files = [
20
+ ".document",
21
+ ".gitignore",
22
+ ".yardoc",
23
+ "LICENSE",
24
+ "README.rdoc",
25
+ "Rakefile",
26
+ "VERSION",
27
+ "lib/mzml.rb",
28
+ "mzml.gemspec",
29
+ "spec/mzml_spec.rb",
30
+ "spec/small.compressed.mzML",
31
+ "spec/small.mgf",
32
+ "spec/small.mzML",
33
+ "spec/spec.opts",
34
+ "spec/spec_helper.rb"
35
+ ]
36
+ s.homepage = %q{http://github.com/delagoya/mzml}
37
+ s.rdoc_options = ["--charset=UTF-8"]
38
+ s.require_paths = ["lib"]
39
+ s.rubygems_version = %q{1.3.5}
40
+ s.summary = %q{A non-validating mzML parser}
41
+ s.test_files = [
42
+ "spec/mzml_spec.rb",
43
+ "spec/spec_helper.rb"
44
+ ]
45
+
46
+ if s.respond_to? :specification_version then
47
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
48
+ s.specification_version = 3
49
+
50
+ if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
51
+ s.add_development_dependency(%q<rspec>, [">= 1.2.9"])
52
+ s.add_runtime_dependency(%q<nokogiri>, [">= 1.3.3"])
53
+ else
54
+ s.add_dependency(%q<rspec>, [">= 1.2.9"])
55
+ s.add_dependency(%q<nokogiri>, [">= 1.3.3"])
56
+ end
57
+ else
58
+ s.add_dependency(%q<rspec>, [">= 1.2.9"])
59
+ s.add_dependency(%q<nokogiri>, [">= 1.3.3"])
60
+ end
61
+ end
62
+
data/spec/mzml_spec.rb ADDED
@@ -0,0 +1,60 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
+
3
# Specs for MzML::Doc and MzML::Spectrum, run against the fixture files
# that live next to this spec.
describe MzML do
  before(:all) do
    # set the input file names
    @file = File.join(File.dirname(__FILE__), "small.mzML")
    @compressed = File.join(File.dirname(__FILE__), "small.compressed.mzML")
    @mgf = File.join(File.dirname(__FILE__), "small.mgf")
  end

  context "Given a valid mzML file" do
    it "I should be able to open the mzML file" do
      file = MzML::Doc.new(@file)
      file.should(be_a_kind_of(MzML::Doc))
    end
    it "should read the index" do
      file = MzML::Doc.new(@file)
      file.index.should_not be_nil
    end
    it "should get the first spectrum" do
      file = MzML::Doc.new(@file)
      # Actually fetch a spectrum instead of only re-checking the index.
      spectrum = file.spectrum(file.index[:spectrum].keys.first)
      spectrum.should(be_a_kind_of(MzML::Spectrum))
    end
    it "should unmarshall a 64 bit mz array" do
      mz = MzML::Doc.new(@file)
      s = mz.spectrum(mz.index[:spectrum].keys.first)
      s.mz.should_not be_nil
    end
    it "should unmarshall a 32 bit intensity array" do
      mz = MzML::Doc.new(@file)
      s = mz.spectrum(mz.index[:spectrum].keys.first)
      s.intensity.should_not be_nil
    end

    it "should be the same mz array as the MGF file" do
      # parse_mgf is not defined in this file (presumably in spec_helper).
      mgf = parse_mgf(@mgf)
      mz = MzML::Doc.new(@file)
      # grab this same spectrum from the mzML file
      s = mz.spectrum(mgf.title)
      # truncate to three decimals before comparing with the MGF values
      i = s.intensity.map {|e| (e * 1000).to_i() / 1000.0}
      m = s.mz.map {|e| (e * 1000).to_i() / 1000.0}
      i.join(", ").should be == mgf.intensity.join(", ")
      m.join(", ").should be == mgf.mz.join(", ")
    end
  end

  context "Given a valid mzML file that uses compression" do
    it "should unmarshall and uncompress the 64 bit mz array" do
      mz = MzML::Doc.new(@compressed)
      s = mz.spectrum(mz.index[:spectrum].keys.first)
      s.mz.should_not be_nil
    end

    it "should unmarshall and uncompress the 32 bit intensity array" do
      mz = MzML::Doc.new(@compressed)
      s = mz.spectrum(mz.index[:spectrum].keys.first)
      s.intensity.should_not be_nil
    end
  end
end