mzml 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,22 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## PROJECT::GENERAL
17
+ coverage
18
+ rdoc
19
+ doc
20
+ pkg
21
+
22
+ ## PROJECT::SPECIFIC
data/.yardoc ADDED
Binary file
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Angel Pizarro
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,20 @@
1
+ = mzml
2
+
3
+ MzML is a standard data format for encoding mass spectrometry data. For more information, see the mzML specification at http://psidev.info/index.php?q=node/257
4
+
5
+ This library is a non-validating mzML version 1.1 parser/reader.
6
+
7
+
8
+ == Note on Patches/Pull Requests
9
+
10
+ * Fork the project. It is hosted @ http://github.com/delagoya/mzml
11
+ * Make your feature addition or bug fix.
12
+ * Add tests for it. This is important so I don't break it in a
13
+ future version unintentionally.
14
+ * Commit, do not mess with rakefile, version, or history.
15
+ (if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
16
+ * Send me a pull request. Bonus points for topic branches.
17
+
18
+ == Copyright
19
+
20
+ Copyright (c) 2009 Angel Pizarro. See LICENSE for details.
data/Rakefile ADDED
@@ -0,0 +1,44 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+
4
# Gem packaging and release tasks. Jeweler is optional: when it cannot be
# loaded, a hint is printed and the remaining tasks are still defined.
begin
  require 'jeweler'

  Jeweler::Tasks.new do |spec|
    # Identity
    spec.name     = "mzml"
    spec.authors  = ["Angel Pizarro"]
    spec.email    = "angel@delagoya.com"
    spec.homepage = "http://github.com/delagoya/mzml"

    # Descriptions
    spec.summary     = "A non-validating mzML parser"
    spec.description = "A non-validating mzML parser. MzML is a standard data format for representing mass spectrometry data."

    # Dependencies (spec is a Gem::Specification; see
    # http://www.rubygems.org/read/chapter/20 for additional settings)
    spec.add_development_dependency "rspec", ">= 1.2.9"
    spec.add_dependency "nokogiri", ">= 1.3.3"
  end

  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end
22
+
23
require 'spec/rake/spectask'

# `rake spec` - runs every *_spec.rb file under spec/.
Spec::Rake::SpecTask.new(:spec) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.spec_files = FileList['spec/**/*_spec.rb']
end

# `rake rcov` - runs the same specs with rcov coverage enabled.
Spec::Rake::SpecTask.new(:rcov) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.pattern = 'spec/**/*_spec.rb'
  spec.rcov = true
end

# check_dependencies is only defined when Jeweler could be loaded. Adding the
# prerequisite unconditionally would make `rake spec` abort with an unknown
# task error on exactly the path where the Jeweler LoadError was tolerated.
task :spec => :check_dependencies if Rake::Task.task_defined?('check_dependencies')

task :default => :spec
38
+
39
# `rake yard` - generates the API documentation. YARD is optional, like
# Jeweler: when it is missing a hint is printed instead of aborting the
# whole Rakefile.
begin
  require 'yard'

  YARD::Rake::YardocTask.new do |yardoc|
    # Strip the VERSION contents so a trailing newline does not end up
    # inside the documentation title.
    version = File.exist?('VERSION') ? File.read('VERSION').strip : ""
    yardoc.options = ["--title", "mzml #{version}".strip, "-r", "README.rdoc"]
    yardoc.files = ['README*', 'lib/**/*.rb']
  end
rescue LoadError
  puts "YARD (or a dependency) not available. Install it with: gem install yard"
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
data/lib/mzml.rb ADDED
@@ -0,0 +1,228 @@
1
+ require 'nokogiri'
2
+ require 'base64'
3
+ require 'zlib'
4
+
5
+ #--
6
+ # This program is free software; you can redistribute it and/or modify
7
+ # it under the terms of the GNU Library or "Lesser" General Public
8
+ # License (LGPL) as published by the Free Software Foundation;
9
+ # either version 2 of the License, or (at your option) any later
10
+ # version.
11
+ # Author: Angel Pizarro
12
+ # Date: 12/05/2009
13
+ # Copyright: Angel Pizarro, Copyright (c) University of Pennsylvania. All rights reserved.
14
+ #
15
+
16
+ # == MzML
17
+ #
18
+ # A non-validating mzML v 1.1.0 parser. Most annotation is left as XML DOM
19
+ # objects. See the Nokogiri::XML::Node and Nokogiri::XML::NodeSet
20
+ # documentation on how to work with these.
21
+ #
22
+ # ===USAGE:
23
+ #
24
+ # require 'mzml'
25
+ # mzml = MzML::Doc.new("test.mzML")
26
+
27
+ module MzML
28
+
29
+ # An internal module containing useful regular expressions
30
# An internal module containing useful regular expressions.
#
# The attribute-matching patterns use a lazy [^>]* so that a match can never
# run past the end of the opening tag it started in; the captured value is
# therefore always an attribute of that tag, even when the searched text
# contains many elements.
module RGX
  # The file byte offset of the start of the file index.
  # Capture 1: the offset digits.
  INDEX_OFFSET = /<indexListOffset>(\d+)<\/indexListOffset>/
  # The start of either a spectrumList or a chromatogramList.
  # Capture 1: "spectrum" or "chromatogram"; capture 2: the count attribute.
  DATA_LIST_START = /<(spectrum|chromatogram)List\s[^>]*?\bcount=["'](\d+)/
  # The start of a spectrum or chromatogram element.
  # Capture 1: "spectrum" or "chromatogram"; capture 2: the id attribute.
  DATA_START = /<(spectrum|chromatogram)\s[^>]*?\bid=["']([^'"]+)["']/
  # The end of a spectrum or chromatogram element.
  # Capture 1: the whole closing tag; capture 2: the element name.
  DATA_END = /(<\/(spectrum|chromatogram)>)/
end
40
+
41
# Parses +xml+ with Nokogiri and hands back the root node of the resulting
# document rather than the document itself.
def parse(xml)
  document = Nokogiri::XML.parse(xml)
  document.root
end
44
+
45
# Raised by Doc when asked to open a file whose name does not end in ".mzML".
#
# Derives from StandardError (not Exception) so that an ordinary
# `rescue => e` in calling code catches it; it is still an Exception, so
# existing `rescue Exception` / `rescue MzML::UnsupportedFileFormat` handlers
# keep working.
class UnsupportedFileFormat < StandardError
end
47
# Raised by Doc#spectrum and Doc#chromatogram when the requested id is not
# present in the file index.
#
# Derives from StandardError (not Exception) so that an ordinary
# `rescue => e` in calling code catches it; it is still an Exception, so
# existing `rescue Exception` / `rescue MzML::BadIdentifier` handlers keep
# working.
class BadIdentifier < StandardError
end
49
+
50
# Random-access reader for one mzML file.
#
# A Doc is a File opened read-only. While it is being constructed it builds
# +index+: a Hash keyed by entry kind (e.g. :spectrum, :chromatogram) whose
# values are Hashes mapping an entry id to the byte offset of that entry's
# opening tag. The index is taken from the file's own <indexList> when the
# file ends with an <indexListOffset>, and is computed by scanning the file
# otherwise.
#
# +spectrum_count+, +chromatogram_count+ and +node+ are declared readers that
# this class never assigns; they return nil.
#
# The caller owns the handle and should #close it when done.
class Doc < File
  attr_reader :index, :fname, :spectrum_count, :chromatogram_count, :node

  # Number of bytes at the end of the file searched for <indexListOffset>.
  INDEX_TAIL_BYTES = 200
  # Number of bytes pulled from the file per read while scanning.
  READ_CHUNK_BYTES = 1024
  # Opening tag of a spectrum/chromatogram. Capture 1 is the kind, capture 2
  # the id. The lazy [^>]* keeps a match inside a single tag.
  ENTRY_START = /<(spectrum|chromatogram)\s[^>]*?\bid=["']([^"']+)["']/
  # Length of the longest closing tag RGX::DATA_END can match; this many
  # bytes are re-examined after each read in case a closing tag was split
  # across two reads.
  END_TAG_OVERLAP = "</chromatogram>".length

  # Opens +mz_fname+ for reading and builds the index.
  #
  # Raises MzML::UnsupportedFileFormat unless the name ends in ".mzML".
  def initialize(mz_fname)
    unless mz_fname.to_s =~ /\.mzML\z/
      raise MzML::UnsupportedFileFormat, "File extension must be .\"mzML\""
    end
    super(mz_fname, "r")
    @fname = mz_fname
    @index = parse_index_list
  end

  # Returns the parsed XML (as returned by Nokogiri::XML.parse) of the
  # chromatogram with the given id.
  #
  # Raises MzML::BadIdentifier when the id is not in the index.
  def chromatogram(chromatogram_id)
    seek_to_entry(:chromatogram, chromatogram_id)
    parse_next
  end

  # Returns a Spectrum built from the spectrum element with the given id.
  #
  # Raises MzML::BadIdentifier when the id is not in the index.
  def spectrum(spectrum_id)
    seek_to_entry(:spectrum, spectrum_id)
    Spectrum.new(parse_next)
  end

  # Parses the IndexList.
  #
  # Looks for <indexListOffset> in the last INDEX_TAIL_BYTES bytes of the
  # file. When present, the XML starting at that offset is parsed and every
  # <offset> of every <index> is recorded; otherwise the index is computed
  # with #compute_index_list. Returns the index Hash.
  def parse_index_list
    # Files shorter than the tail window must not produce a negative seek.
    seek([stat.size - INDEX_TAIL_BYTES, 0].max)
    located = MzML::RGX::INDEX_OFFSET.match(read)
    # if I didn't match anything, compute the index and return
    return compute_index_list unless located

    @index = {}
    seek(located[1].to_i)
    index_root = Nokogiri::XML.parse(read).root
    index_root.css("index").each do |idx|
      offsets = (@index[idx[:name].to_sym] = {})
      idx.css("offset").each do |o|
        offsets[o[:idRef]] = o.text.to_i
      end
    end
    @index
  end

  # Builds the index by scanning the whole file for spectrum and
  # chromatogram opening tags. Returns the index Hash; kinds that never
  # occur in the file get no key.
  def compute_index_list
    @index = {}
    rewind
    # String.new is binary, as are sized reads, so string positions are
    # byte offsets.
    window = String.new
    window_offset = 0 # file offset of window[0]
    until eof?
      window << read(READ_CHUNK_BYTES)
      cursor = 0
      # Record every entry in the window, not just the first one.
      while (hit = ENTRY_START.match(window, cursor))
        (@index[hit[1].to_sym] ||= {})[hit[2]] = window_offset + hit.begin(0)
        cursor = hit.end(0)
      end
      # Carry over a trailing tag that has been opened but not yet closed,
      # so a tag split across two reads is still seen in one piece.
      open_tag = window.rindex('<')
      keep_from =
        if open_tag && open_tag >= cursor && !window.index('>', open_tag)
          open_tag
        else
          window.length
        end
      window_offset += keep_from
      window = window[keep_from..-1]
    end
    @index
  end

  # Reads from the current position up to and including the next closing
  # </spectrum> or </chromatogram> tag (or to the end of the file when there
  # is none) and returns that text parsed by Nokogiri::XML.parse.
  def parse_next
    buffer = String.new
    until eof?
      # Only the newly read bytes (plus a possible split closing tag) need
      # to be searched.
      resume_at = [buffer.length - END_TAG_OVERLAP, 0].max
      buffer << read(READ_CHUNK_BYTES)
      closing = MzML::RGX::DATA_END.match(buffer, resume_at)
      next unless closing

      # Cut exactly after the closing tag.
      buffer = buffer[0, closing.end(0)]
      break
    end
    Nokogiri::XML.parse(buffer)
  end

  private

  # Positions the file at the indexed offset of entry +entry_id+ of the
  # given +kind+, raising MzML::BadIdentifier when the index has no such
  # kind or id.
  def seek_to_entry(kind, entry_id)
    offsets = @index[kind]
    unless offsets && offsets.has_key?(entry_id)
      raise MzML::BadIdentifier, "Invalid ID '#{entry_id}'"
    end
    seek offsets[entry_id]
  end
end
145
+
146
# One spectrum together with the annotation pulled out of its XML.
#
# Construction accepts either the <spectrum> element itself or a document
# whose root is that element (which is what Doc#spectrum passes in). +node+
# returns whatever was passed; all queries are made against the element.
#
# Attributes filled in by parsing:
# * +id+, +default_array_length+, +spot_id+ - the element's id,
#   defaultArrayLength and spotID attributes, as Strings (nil when absent).
# * +params+ - Hash of the element's direct cvParam children, name => value.
# * +ms_level+ - Integer (0 when the "ms level" param is absent).
# * +low_mz+, +high_mz+, +tic+, +base_peak_mz+, +base_peak_intensity+ -
#   Floats, nil when the corresponding param is absent.
# * +precursor_list+ - the precursorList/precursor nodes, nil when there is
#   no precursorList.
# * +scan_list+ - the scanList/scan nodes, nil when there is no scanList.
# * +retention_time+ - the scan's cvParam node with accession MS:1000016,
#   or nil.
# * +mz_node+ / +intensity_node+ - the binaryDataArray nodes tagged with
#   MS:1000514 / MS:1000515; +mz+ / +intensity+ - their decoded Float
#   arrays. All four are nil when the array is absent.
#
# The remaining accessors (+type+, +charge+, +precursor+, +title+,
# +polarity+, +representation+) are not populated by this class.
class Spectrum
  attr_accessor :id, :default_array_length, :spot_id, :type,
                :charge, :precursor, :base_peak_mz, :base_peak_intensity, :ms_level,
                :high_mz, :low_mz, :title, :tic, :polarity, :representation, :mz_node, :intensity_node,
                :mz, :intensity, :precursor_list, :scan_list, :retention_time
  attr_reader :node, :params

  # Parses +spectrum_node+ immediately; see the class documentation for what
  # is extracted.
  def initialize(spectrum_node)
    @node = spectrum_node
    # A document is unwrapped to its root so that element-relative queries
    # (attributes, cvParam, scanList, ...) actually hit the spectrum element.
    @element = spectrum_node.respond_to?(:root) ? (spectrum_node.root || spectrum_node) : spectrum_node
    @params = {}
    parse_element
  end

  protected

  # This method pulls out all of the annotation from the XML node.
  def parse_element
    @id = @element[:id]
    @default_array_length = @element[:defaultArrayLength]
    @spot_id = @element[:spotID]
    # now reaching into params
    @params = @element.xpath("cvParam").inject({}) do |memo, prm|
      memo[prm[:name]] = prm[:value]
      memo
    end
    @ms_level = @params["ms level"].to_i
    @low_mz = float_param("lowest observed m/z")
    @high_mz = float_param("highest observed m/z")
    # These values are real numbers and may be written in scientific
    # notation, which to_i would truncate ("1.5e07".to_i == 1).
    @tic = float_param("total ion current")
    @base_peak_mz = float_param("base peak m/z")
    @base_peak_intensity = float_param("base peak intensity")

    if @element.xpath("precursorList")[0]
      parse_precursor_list
    else
      @precursor_list = nil
    end

    # parse_scan_list assigns @scan_list itself; its return value must not
    # be stored there.
    if @element.xpath("scanList")[0]
      parse_scan_list
    else
      @scan_list = nil
    end

    parse_binary_data
  end

  # Returns the named cvParam value as a Float, or nil when the spectrum has
  # no such param.
  def float_param(name)
    @params.has_key?(name) ? @params[name].to_f : nil
  end

  # Stores the precursor nodes of the precursorList in @precursor_list.
  def parse_precursor_list
    @precursor_list = @element.xpath("precursorList/precursor")
  end

  # Stores the scan nodes in @scan_list and the first MS:1000016 cvParam
  # node of a scan (or nil) in @retention_time.
  def parse_scan_list
    @scan_list = @element.xpath("scanList/scan")
    @retention_time = @element.xpath("scanList/scan/cvParam[@accession='MS:1000016']")[0]
  end

  # Decodes the m/z (MS:1000514) and intensity (MS:1000515) arrays.
  def parse_binary_data
    @mz_node, @mz = decode_binary_array('MS:1000514')
    @intensity_node, @intensity = decode_binary_array('MS:1000515')
  end

  # Finds the binaryDataArray carrying a cvParam with +array_accession+ and
  # returns [array_node, values]; returns [nil, nil] when there is none.
  #
  # The payload is base64-decoded, inflated when the array has cvParam
  # MS:1000574, and unpacked as little-endian 64-bit floats when it has
  # cvParam MS:1000523 (32-bit floats otherwise).
  def decode_binary_array(array_accession)
    marker = @element.xpath("binaryDataArrayList/binaryDataArray/cvParam[@accession='#{array_accession}']").first
    return [nil, nil] unless marker

    array_node = marker.parent
    raw = Base64.decode64(array_node.xpath("binary").text)
    # need to uncompress the data
    raw = Zlib::Inflate.inflate(raw) if array_node.xpath("cvParam[@accession='MS:1000574']")[0]
    # 64-bit floats? default is 32-bit
    float_format = array_node.xpath("cvParam[@accession='MS:1000523']")[0] ? "E*" : "e*"
    [array_node, raw.unpack(float_format)]
  end
end
228
+ end
data/mzml.gemspec ADDED
@@ -0,0 +1,62 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4
+ # -*- encoding: utf-8 -*-
5
+
6
# Gem specification for mzml 0.1.0.
#
# Dependencies are declared through capability checks (respond_to?) instead
# of comparing against Gem::RubyGemsVersion, a constant current RubyGems
# releases no longer define; RubyGems too old to distinguish development
# from runtime dependencies still falls back to plain add_dependency.
Gem::Specification.new do |s|
  s.name = %q{mzml}
  s.version = "0.1.0"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Angel Pizarro"]
  s.date = %q{2009-12-06}
  s.description = %q{A non-validating mzML parser. MzML is a standard data format for representing mass spectrometry data.}
  s.email = %q{angel@delagoya.com}
  s.extra_rdoc_files = [
    "LICENSE",
    "README.rdoc"
  ]
  s.files = [
    ".document",
    ".gitignore",
    ".yardoc",
    "LICENSE",
    "README.rdoc",
    "Rakefile",
    "VERSION",
    "lib/mzml.rb",
    "mzml.gemspec",
    "spec/mzml_spec.rb",
    "spec/small.compressed.mzML",
    "spec/small.mgf",
    "spec/small.mzML",
    "spec/spec.opts",
    "spec/spec_helper.rb"
  ]
  s.homepage = %q{http://github.com/delagoya/mzml}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.3.5}
  s.summary = %q{A non-validating mzML parser}
  s.test_files = [
    "spec/mzml_spec.rb",
    "spec/spec_helper.rb"
  ]

  s.specification_version = 3 if s.respond_to? :specification_version

  if s.respond_to? :add_development_dependency
    s.add_development_dependency(%q<rspec>, [">= 1.2.9"])
    s.add_runtime_dependency(%q<nokogiri>, [">= 1.3.3"])
  else
    s.add_dependency(%q<rspec>, [">= 1.2.9"])
    s.add_dependency(%q<nokogiri>, [">= 1.3.3"])
  end
end
62
+
data/spec/mzml_spec.rb ADDED
@@ -0,0 +1,60 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
+
3
# Specs for MzML::Doc and MzML::Spectrum, run against the fixture files that
# live next to this spec. parse_mgf is expected to come from spec_helper.
describe MzML do
  before(:all) do
    # set the input file names
    @file = File.join(File.dirname(__FILE__), "small.mzML")
    @compressed = File.join(File.dirname(__FILE__), "small.compressed.mzML")
    @mgf = File.join(File.dirname(__FILE__), "small.mgf")
  end

  context "Given a valid mzML file" do
    it "I should be able to open the mzML file" do
      file = MzML::Doc.new(@file)
      file.should(be_a_kind_of(MzML::Doc))
    end

    it "should read the index" do
      file = MzML::Doc.new(@file)
      file.index.should_not be_nil
    end

    # Actually fetches a spectrum instead of re-checking the index.
    it "should get the first spectrum" do
      mz = MzML::Doc.new(@file)
      s = mz.spectrum(mz.index[:spectrum].keys.first)
      s.should be_a_kind_of(MzML::Spectrum)
    end

    it "should unmarshall a 64 bit mz array" do
      mz = MzML::Doc.new(@file)
      s = mz.spectrum(mz.index[:spectrum].keys.first)
      s.mz.should_not be_nil
    end

    it "should unmarshall a 32 bit intensity array" do
      mz = MzML::Doc.new(@file)
      s = mz.spectrum(mz.index[:spectrum].keys.first)
      s.intensity.should_not be_nil
    end

    it "should be the same mz array as the MGF file" do
      mgf = parse_mgf(@mgf)
      mz = MzML::Doc.new(@file)
      # grab this same spectrum from the mzML file
      s = mz.spectrum(mgf.title)
      # truncate to three decimals before comparing with the MGF values
      i = s.intensity.map { |e| (e * 1000).to_i / 1000.0 }
      m = s.mz.map { |e| (e * 1000).to_i / 1000.0 }
      i.join(", ").should be == mgf.intensity.join(", ")
      m.join(", ").should be == mgf.mz.join(", ")
    end
  end

  context "Given a valid mzML file that uses compression" do
    it "should unmarshall and uncompress the 64 bit mz array" do
      mz = MzML::Doc.new(@compressed)
      s = mz.spectrum(mz.index[:spectrum].keys.first)
      s.mz.should_not be_nil
    end

    it "should unmarshall and uncompress the 32 bit intensity array" do
      mz = MzML::Doc.new(@compressed)
      s = mz.spectrum(mz.index[:spectrum].keys.first)
      s.intensity.should_not be_nil
    end
  end
end