mzml 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.gitignore +22 -0
- data/.yardoc +0 -0
- data/LICENSE +20 -0
- data/README.rdoc +20 -0
- data/Rakefile +44 -0
- data/VERSION +1 -0
- data/lib/mzml.rb +228 -0
- data/mzml.gemspec +62 -0
- data/spec/mzml_spec.rb +60 -0
- data/spec/small.compressed.mzML +11416 -0
- data/spec/small.mgf +350 -0
- data/spec/small.mzML +11405 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +27 -0
- metadata +90 -0
data/.document
ADDED
data/.gitignore
ADDED
data/.yardoc
ADDED
Binary file
|
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2009 Angel Pizarro
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
= mzml
|
2
|
+
|
3
|
+
MzML is a standard data format for encoding mass spectrometry data. For more information, see the mzML specification at http://psidev.info/index.php?q=node/257
|
4
|
+
|
5
|
+
This library is a non-validating mzML version 1.1 parser/reader.
|
6
|
+
|
7
|
+
|
8
|
+
== Note on Patches/Pull Requests
|
9
|
+
|
10
|
+
* Fork the project. It is hosted @ http://github.com/delagoya/mzml
|
11
|
+
* Make your feature addition or bug fix.
|
12
|
+
* Add tests for it. This is important so I don't break it in a
|
13
|
+
future version unintentionally.
|
14
|
+
* Commit, do not mess with rakefile, version, or history.
|
15
|
+
(if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
|
16
|
+
* Send me a pull request. Bonus points for topic branches.
|
17
|
+
|
18
|
+
== Copyright
|
19
|
+
|
20
|
+
Copyright (c) 2009 Angel Pizarro. See LICENSE for details.
|
data/Rakefile
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rake'
|
3
|
+
|
4
|
+
# Gem packaging tasks. Jeweler is optional: without it only the packaging
# tasks (and the :check_dependencies prerequisite) are unavailable.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "mzml"
    gem.summary = %Q{A non-validating mzML parser}
    gem.description = %Q{A non-validating mzML parser. MzML is a standard data format for representing mass spectrometry data.}
    gem.email = "angel@delagoya.com"
    gem.homepage = "http://github.com/delagoya/mzml"
    gem.authors = ["Angel Pizarro"]
    gem.add_development_dependency "rspec", ">= 1.2.9"
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
    gem.add_dependency "nokogiri", ">= 1.3.3"
  end
  Jeweler::GemcutterTasks.new
  jeweler_available = true
rescue LoadError
  jeweler_available = false
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

# Spec tasks (RSpec 1.x rake integration).
require 'spec/rake/spectask'
Spec::Rake::SpecTask.new(:spec) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.spec_files = FileList['spec/**/*_spec.rb']
end

Spec::Rake::SpecTask.new(:rcov) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.pattern = 'spec/**/*_spec.rb'
  spec.rcov = true
end

# :check_dependencies is only referenced when Jeweler loaded; otherwise
# `rake spec` would abort on an unknown prerequisite.
task :spec => :check_dependencies if jeweler_available

task :default => :spec

# Documentation task. YARD is optional, mirroring the Jeweler handling above.
begin
  require 'yard'
  YARD::Rake::YardocTask.new do |yardoc|
    # strip the trailing newline of the VERSION file so the title stays on one line
    version = File.exist?('VERSION') ? File.read('VERSION').strip : ""
    yardoc.options = ["--title", "mzml #{version}", "-r", "README.rdoc"]
    yardoc.files = ['README*','lib/**/*.rb']
  end
rescue LoadError
  puts "YARD (or a dependency) not available. Install it with: gem install yard"
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.0
|
data/lib/mzml.rb
ADDED
@@ -0,0 +1,228 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'base64'
|
3
|
+
require 'zlib'
|
4
|
+
|
5
|
+
#--
|
6
|
+
# This program is free software; you can redistribute it and/or modify
|
7
|
+
# it under the terms of the GNU Library or "Lesser" General Public
|
8
|
+
# License (LGPL) as published by the Free Software Foundation;
|
9
|
+
# either version 2 of the License, or (at your option) any later
|
10
|
+
# version.
|
11
|
+
# Author: Angel Pizarro
|
12
|
+
# Date: 12/05/2009
|
13
|
+
# Copyright: Angel Pizarro, Copyright (c) University of Pennsylvania. All rights reserved.
|
14
|
+
#
|
15
|
+
|
16
|
+
# == MzML
|
17
|
+
#
|
18
|
+
# A non-validating mzML v 1.1.0 parser. Most annotation is left as XML DOM
|
19
|
+
# objects. See the Nokogiri::XML::Node and Nokogiri::XML::NodeSet
|
20
|
+
# documentation on how to work with these.
|
21
|
+
#
|
22
|
+
# ===USAGE:
|
23
|
+
#
|
24
|
+
# require 'mzml'
|
25
|
+
# mzml = MzML::Doc.new("test.mzML")
|
26
|
+
|
27
|
+
module MzML
|
28
|
+
|
29
|
+
# An internal module containing useful regular expressions
|
30
|
+
module RGX
  # The file byte offset of the start of the file index.
  # Capture 1: the offset digits.
  INDEX_OFFSET = /<indexListOffset>(\d+)<\/indexListOffset>/
  # The start tag of either a spectrumList or a chromatogramList.
  # Capture 1: "spectrum" or "chromatogram"; capture 2: the count attribute.
  # [^>]*? keeps the match inside the one start tag (it also spans newlines),
  # so a count attribute of a later element is never captured.
  DATA_LIST_START = /<(spectrum|chromatogram)List\s[^>]*?\bcount=["'](\d+)/
  # The start tag of a spectrum or chromatogram element.
  # Capture 1: "spectrum" or "chromatogram"; capture 2: the id attribute.
  # As above, the match is confined to a single start tag.
  DATA_START = /<(spectrum|chromatogram)\s[^>]*?\bid=["']([^'"]+)["']/
  # The end tag of a spectrum or chromatogram element.
  # Capture 1: the whole closing tag; capture 2: the element name.
  DATA_END = /(<\/(spectrum|chromatogram)>)/
end
|
40
|
+
|
41
|
+
# Parses +xml+ with Nokogiri and returns the root element of the resulting
# document. Defined as a plain instance method of the enclosing module, so
# it is reachable only from objects that include or extend that module.
def parse(xml)
  document = Nokogiri::XML.parse(xml)
  document.root
end
|
44
|
+
|
45
|
+
# Raised when a file name does not carry the expected mzML extension.
# Derives from StandardError (not Exception) so that a bare `rescue`
# catches it like any other application error.
class UnsupportedFileFormat < StandardError
end
|
47
|
+
# Raised when a spectrum or chromatogram id is not present in the index.
# Derives from StandardError (not Exception) so that a bare `rescue`
# catches it like any other application error.
class BadIdentifier < StandardError
end
|
49
|
+
|
50
|
+
# Random-access reader for one mzML file. The object is the open File
# itself; spectra and chromatograms are located through a byte-offset index
# that is either read from the file's index list or, when the file has no
# <indexListOffset>, computed by scanning the file.
class Doc < File
  # Number of trailing bytes searched for the <indexListOffset> element.
  INDEX_TAIL_BYTES = 200
  # Number of bytes read per step while scanning the file.
  CHUNK_BYTES = 1024
  # Start tag of a spectrum or chromatogram. Capture 1 is the element name,
  # capture 2 the id attribute; [^>]*? confines the match to that one tag.
  ENTRY_START = /<(spectrum|chromatogram)\s[^>]*?\bid=["']([^"']+)["']/
  # Length of the longest closing tag matched by MzML::RGX::DATA_END.
  LONGEST_END_TAG = "</chromatogram>".length

  # index::              { :spectrum => { id => byte offset }, :chromatogram => {...} }
  # fname::              the file name given to the constructor
  # spectrum_count::     number of indexed spectra
  # chromatogram_count:: number of indexed chromatograms
  # node::               never assigned by this class; kept for interface compatibility
  attr_reader :index, :fname, :spectrum_count, :chromatogram_count, :node

  # Opens +mz_fname+ read-only and loads (or computes) its index.
  # Raises MzML::UnsupportedFileFormat unless the name ends in ".mzML".
  def initialize(mz_fname)
    unless mz_fname.to_s =~ /\.mzML\z/
      raise MzML::UnsupportedFileFormat.new("File extension must be \".mzML\"")
    end
    super(mz_fname, "r")
    @fname = mz_fname
    @index = parse_index_list
    @spectrum_count = (@index[:spectrum] || {}).size
    @chromatogram_count = (@index[:chromatogram] || {}).size
  end

  # Returns the parsed XML of the chromatogram with the given id (whatever
  # Nokogiri::XML.parse yields for the element's text).
  # Raises MzML::BadIdentifier when the id is not indexed.
  def chromatogram(chromatogram_id)
    seek_to_entry(:chromatogram, chromatogram_id)
    parse_next
  end

  # Returns a Spectrum built from the spectrum element with the given id.
  # Raises MzML::BadIdentifier when the id is not indexed.
  def spectrum(spectrum_id)
    seek_to_entry(:spectrum, spectrum_id)
    Spectrum.new(parse_next)
  end

  # Reads the index list referenced by <indexListOffset> near the end of the
  # file and returns it as a hash of hashes. Falls back to
  # compute_index_list when no usable index is present.
  def parse_index_list
    # clamp at 0 so files shorter than the tail window do not cause a negative seek
    seek([stat.size - INDEX_TAIL_BYTES, 0].max)
    offset = read[MzML::RGX::INDEX_OFFSET, 1]
    return compute_index_list if offset.nil?

    seek(offset.to_i)
    index_list = Nokogiri::XML.parse(read).root
    return compute_index_list if index_list.nil?

    @index = {}
    index_list.css("index").each do |idx|
      entries = @index[idx[:name].to_sym] = {}
      idx.css("offset").each do |o|
        entries[o[:idRef]] = o.text.to_i
      end
    end
    @index
  end

  # Builds the index by scanning the whole file for spectrum and
  # chromatogram start tags, recording the byte offset of each tag's "<".
  def compute_index_list
    @index = {}
    rewind
    buffer = String.new   # binary, like the chunks returned by read(length)
    buffer_start = 0      # file offset of buffer[0]
    until eof?
      buffer << read(CHUNK_BYTES)
      scanned_to = 0
      while (found = ENTRY_START.match(buffer, scanned_to))
        (@index[found[1].to_sym] ||= {})[found[2]] = buffer_start + found.begin(0)
        scanned_to = found.end(0)
      end
      # carry an unfinished tag over to the next chunk, drop everything else
      keep_from = unfinished_tag_start(buffer, scanned_to)
      buffer_start += keep_from
      buffer = buffer[keep_from..-1]
    end
    @index
  end

  # Reads from the current position up to and including the next
  # </spectrum> or </chromatogram> tag (or to end of file when none is
  # found) and returns that text parsed by Nokogiri.
  def parse_next
    buffer = String.new
    until eof?
      # a closing tag may straddle two chunks, so back up by one tag length
      search_from = [buffer.length - LONGEST_END_TAG, 0].max
      buffer << read(CHUNK_BYTES)
      closing = MzML::RGX::DATA_END.match(buffer, search_from)
      next if closing.nil?
      buffer = buffer[0, closing.end(0)]
      break
    end
    Nokogiri::XML.parse(buffer)
  end

  private

  # Positions the file at the indexed offset of +id+ within +type+
  # (:spectrum or :chromatogram); raises MzML::BadIdentifier if unknown.
  def seek_to_entry(type, id)
    offset = (@index[type] || {})[id]
    raise MzML::BadIdentifier.new("Invalid ID '#{id}'") if offset.nil?
    seek(offset)
  end

  # Returns the position of a trailing "<" that has no ">" after it and has
  # not been consumed by a match yet, or buffer.length when there is none.
  def unfinished_tag_start(buffer, scanned_to)
    open_pos = buffer.rindex("<")
    return buffer.length if open_pos.nil? || open_pos < scanned_to
    buffer.index(">", open_pos) ? buffer.length : open_pos
  end
end
|
145
|
+
|
146
|
+
# One mzML <spectrum> element with its annotation pulled into attributes
# and its m/z and intensity arrays decoded into Ruby float arrays.
class Spectrum
  attr_accessor :id, :default_array_length, :spot_id, :type,
                :charge, :precursor, :base_peak_mz, :base_peak_intensity, :ms_level,
                :high_mz, :low_mz, :title, :tic, :polarity, :representation, :mz_node, :intensity_node,
                :mz, :intensity, :precursor_list, :scan_list, :retention_time
  # node::   the <spectrum> element this object was built from
  # params:: cvParam name => value for the element's direct cvParam children
  attr_reader :node, :params

  # +spectrum_node+ is either the <spectrum> element itself or a parsed
  # document whose root is that element (Doc#spectrum passes the latter).
  # Raises ArgumentError when no element is available.
  def initialize(spectrum_node)
    # every lookup below is relative to the <spectrum> element, so unwrap a document
    @node = spectrum_node.respond_to?(:root) ? spectrum_node.root : spectrum_node
    raise ArgumentError, "no spectrum element given" if @node.nil?
    @params = {}
    parse_element
  end

  protected

  # Pulls all annotation and binary data out of the XML node.
  def parse_element
    @id = @node[:id]
    @default_array_length = @node[:defaultArrayLength]
    @spot_id = @node[:spotID]
    # now reaching into params
    @params = {}
    @node.xpath("cvParam").each { |prm| @params[prm[:name]] = prm[:value] }
    @ms_level = @params["ms level"].to_i
    @low_mz = float_param("lowest observed m/z")
    @high_mz = float_param("highest observed m/z")
    # these values are written as decimals or in scientific notation, so
    # to_i would truncate them (e.g. "1.5e07".to_i == 1)
    @tic = float_param("total ion current")
    @base_peak_mz = float_param("base peak m/z")
    @base_peak_intensity = float_param("base peak intensity")
    # polarity and representation are not extracted yet
    parse_precursor_list
    parse_scan_list
    parse_binary_data
  end

  # Returns the named cvParam value as a Float, or nil when it is absent.
  def float_param(name)
    @params.has_key?(name) ? @params[name].to_f : nil
  end

  # Sets @precursor_list to an array of [spectrumRef, precursor node]
  # pairs, or to nil when the spectrum has no precursors.
  def parse_precursor_list
    precursors = @node.xpath("precursorList/precursor")
    @precursor_list = precursors.empty? ? nil : precursors.map { |p| [p[:spectrumRef], p] }
  end

  # Sets @scan_list to the <scan> nodes (nil when there are none) and
  # @retention_time to the scan's MS:1000016 cvParam node (nil when absent).
  # Returns the scan list.
  def parse_scan_list
    scans = @node.xpath("scanList/scan")
    @scan_list = scans.empty? ? nil : scans
    @retention_time = @node.xpath("scanList/scan/cvParam[@accession='MS:1000016']").first
    @scan_list
  end

  # Locates and decodes the m/z (MS:1000514) and intensity (MS:1000515)
  # arrays. A missing array leaves the node and the values nil.
  def parse_binary_data
    @mz_node = binary_array_node("MS:1000514")
    @mz = decode_binary_array(@mz_node)
    @intensity_node = binary_array_node("MS:1000515")
    @intensity = decode_binary_array(@intensity_node)
  end

  # Returns the <binaryDataArray> carrying a cvParam with +accession+, or nil.
  def binary_array_node(accession)
    @node.xpath("binaryDataArrayList/binaryDataArray[cvParam[@accession='#{accession}']]").first
  end

  # Decodes one <binaryDataArray> into an array of floats: Base64 decode,
  # inflate when MS:1000574 marks the data as compressed, then unpack as
  # little-endian doubles when MS:1000523 is present and as little-endian
  # single-precision floats otherwise.
  def decode_binary_array(array_node)
    return nil if array_node.nil?
    encoded = array_node.xpath("binary").map { |b| b.text }.join
    data = Base64.decode64(encoded)
    unless array_node.xpath("cvParam[@accession='MS:1000574']").empty?
      # need to uncompress the data
      data = Zlib::Inflate.inflate(data)
    end
    # 64-bit floats? default is 32-bit
    dtype = array_node.xpath("cvParam[@accession='MS:1000523']").empty? ? "e*" : "E*"
    data.unpack(dtype)
  end
end
|
228
|
+
end
|
data/mzml.gemspec
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = %q{mzml}
|
8
|
+
s.version = "0.1.0"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["Angel Pizarro"]
|
12
|
+
s.date = %q{2009-12-06}
|
13
|
+
s.description = %q{A non-validating mzML parser. MzML is a standard data format for representing mass spectrometry data.}
|
14
|
+
s.email = %q{angel@delagoya.com}
|
15
|
+
s.extra_rdoc_files = [
|
16
|
+
"LICENSE",
|
17
|
+
"README.rdoc"
|
18
|
+
]
|
19
|
+
s.files = [
|
20
|
+
".document",
|
21
|
+
".gitignore",
|
22
|
+
".yardoc",
|
23
|
+
"LICENSE",
|
24
|
+
"README.rdoc",
|
25
|
+
"Rakefile",
|
26
|
+
"VERSION",
|
27
|
+
"lib/mzml.rb",
|
28
|
+
"mzml.gemspec",
|
29
|
+
"spec/mzml_spec.rb",
|
30
|
+
"spec/small.compressed.mzML",
|
31
|
+
"spec/small.mgf",
|
32
|
+
"spec/small.mzML",
|
33
|
+
"spec/spec.opts",
|
34
|
+
"spec/spec_helper.rb"
|
35
|
+
]
|
36
|
+
s.homepage = %q{http://github.com/delagoya/mzml}
|
37
|
+
s.rdoc_options = ["--charset=UTF-8"]
|
38
|
+
s.require_paths = ["lib"]
|
39
|
+
s.rubygems_version = %q{1.3.5}
|
40
|
+
s.summary = %q{A non-validating mzML parser}
|
41
|
+
s.test_files = [
|
42
|
+
"spec/mzml_spec.rb",
|
43
|
+
"spec/spec_helper.rb"
|
44
|
+
]
|
45
|
+
|
46
|
+
if s.respond_to? :specification_version then
|
47
|
+
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
48
|
+
s.specification_version = 3
|
49
|
+
|
50
|
+
if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
|
51
|
+
s.add_development_dependency(%q<rspec>, [">= 1.2.9"])
|
52
|
+
s.add_runtime_dependency(%q<nokogiri>, [">= 1.3.3"])
|
53
|
+
else
|
54
|
+
s.add_dependency(%q<rspec>, [">= 1.2.9"])
|
55
|
+
s.add_dependency(%q<nokogiri>, [">= 1.3.3"])
|
56
|
+
end
|
57
|
+
else
|
58
|
+
s.add_dependency(%q<rspec>, [">= 1.2.9"])
|
59
|
+
s.add_dependency(%q<nokogiri>, [">= 1.3.3"])
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
data/spec/mzml_spec.rb
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
|
+
|
3
|
+
# Behavioural specs for the mzML reader. They run against the fixture files
# that live next to this spec; parse_mgf is not defined in this file and is
# expected to come from the required spec_helper.
describe MzML do
  before(:all) do
    # set the input file names
    @file = File.join(File.dirname(__FILE__), "small.mzML")
    @compressed = File.join(File.dirname(__FILE__), "small.compressed.mzML")
    @mgf = File.join(File.dirname(__FILE__), "small.mgf")
  end

  context "Given a valid mzML file" do
    it "I should be able to open the mzML file" do
      file = MzML::Doc.new(@file)
      file.should(be_a_kind_of(MzML::Doc))
    end
    it "should read the index" do
      file = MzML::Doc.new(@file)
      file.index.should_not be_nil
    end
    it "should get the first spectrum" do
      # actually fetch a spectrum instead of re-checking the index
      file = MzML::Doc.new(@file)
      spectrum = file.spectrum(file.index[:spectrum].keys.first)
      spectrum.should be_a_kind_of(MzML::Spectrum)
    end
    it "should unmarshall a 64-bit mz array" do
      mz = MzML::Doc.new(@file)
      s = mz.spectrum(mz.index[:spectrum].keys.first)
      s.mz.should_not be_nil
    end
    it "should unmarshall a 32-bit intensity array" do
      mz = MzML::Doc.new(@file)
      s = mz.spectrum(mz.index[:spectrum].keys.first)
      s.intensity.should_not be_nil
    end

    it "should be the same mz array as the MGF file" do
      mgf = parse_mgf(@mgf)
      mz = MzML::Doc.new(@file)
      # grab this same spectrum from the mzML file
      s = mz.spectrum(mgf.title)
      # truncate to three decimals before comparing
      i = s.intensity.map {|e| (e * 1000).to_i() / 1000.0}
      m = s.mz.map {|e| (e * 1000).to_i() / 1000.0}
      i.join(", ").should be == mgf.intensity.join(", ")
      m.join(", ").should be == mgf.mz.join(", ")
    end
  end

  context "Given a valid mzML file that uses compression" do
    it "should unmarshall and uncompress the 64-bit mz array" do
      mz = MzML::Doc.new(@compressed)
      s = mz.spectrum(mz.index[:spectrum].keys.first)
      s.mz.should_not be_nil
    end

    it "should unmarshall and uncompress the 32-bit intensity array" do
      mz = MzML::Doc.new(@compressed)
      s = mz.spectrum(mz.index[:spectrum].keys.first)
      s.intensity.should_not be_nil
    end
  end
end
|