mzml 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +3 -0
- data/Gemfile +4 -0
- data/Rakefile +15 -39
- data/bin/{mzML2mgf.rb → mzml2mgf} +1 -17
- data/lib/mzml/chromatogram.rb +80 -0
- data/lib/mzml/doc.rb +185 -0
- data/lib/mzml/spectrum.rb +107 -0
- data/lib/mzml/version.rb +4 -0
- data/lib/mzml.rb +4 -244
- data/mzml.gemspec +15 -59
- data/spec/mzml_spec.rb +16 -0
- data/spec/sample.unindexed.mzML +221 -0
- data/test/fixtures/sample.compressed.mzML +2699 -0
- data/test/fixtures/sample.mgf +25548 -0
- data/test/fixtures/sample.mzML +2688 -0
- data/test/test_mzml-helper.rb +15 -0
- data/test/test_mzml.rb +94 -0
- metadata +83 -76
- data/.document +0 -5
- data/.yardoc +0 -0
- data/VERSION +0 -1
data/.gitignore
CHANGED
data/Gemfile
ADDED
data/Rakefile
CHANGED
@@ -1,44 +1,20 @@
|
|
1
|
-
require '
|
2
|
-
|
3
|
-
|
4
|
-
begin
|
5
|
-
require 'jeweler'
|
6
|
-
Jeweler::Tasks.new do |gem|
|
7
|
-
gem.name = "mzml"
|
8
|
-
gem.summary = %Q{A non-validating mzML parser}
|
9
|
-
gem.description = %Q{A non-validating mzML parser. MzML is a standard data format for representing mass spectrometry data.}
|
10
|
-
gem.email = "angel@delagoya.com"
|
11
|
-
gem.homepage = "http://github.com/delagoya/mzml"
|
12
|
-
gem.authors = ["Angel Pizarro"]
|
13
|
-
gem.add_development_dependency "rspec", "1.3.0"
|
14
|
-
# gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
|
15
|
-
gem.add_dependency "nokogiri", "1.4.1"
|
16
|
-
|
17
|
-
end
|
18
|
-
Jeweler::GemcutterTasks.new
|
19
|
-
rescue LoadError
|
20
|
-
puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
|
21
|
-
end
|
22
|
-
|
23
|
-
require 'spec/rake/spectask'
|
24
|
-
Spec::Rake::SpecTask.new(:spec) do |spec|
|
25
|
-
spec.libs << 'lib' << 'spec'
|
26
|
-
spec.spec_files = FileList['spec/**/*_spec.rb']
|
27
|
-
end
|
1
|
+
require 'bundler'
|
2
|
+
Bundler::GemHelper.install_tasks
|
28
3
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
4
|
+
# test
|
5
|
+
require 'rake'
|
6
|
+
require 'rake/testtask'
|
7
|
+
Rake::TestTask.new do |t|
|
8
|
+
t.libs << "test"
|
9
|
+
t.test_files = FileList['test/test*.rb']
|
10
|
+
t.verbose = true
|
33
11
|
end
|
34
12
|
|
35
|
-
|
36
|
-
|
37
|
-
task :default => :spec
|
38
|
-
|
13
|
+
# documentation
|
39
14
|
require 'yard'
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
15
|
+
require 'yard/rake/yardoc_task'
|
16
|
+
YARD::Rake::YardocTask.new do |t|
|
17
|
+
t.files = ['lib/**/*.rb', '-' , "README.rdoc","LICENSE"]
|
18
|
+
t.options = ["-r","README.rdoc"]
|
44
19
|
end
|
20
|
+
|
@@ -1,23 +1,7 @@
|
|
1
|
-
#!/
|
2
|
-
|
3
|
-
################################
|
4
|
-
####
|
5
|
-
##
|
6
|
-
# David Austin - UPENN
|
7
|
-
# converts mzML to MGF format
|
8
|
-
# set up to replicate msconvert but muuchh slower
|
9
|
-
#
|
10
|
-
|
11
|
-
require 'rubygems'
|
1
|
+
#!/usr/bin/env ruby
|
12
2
|
require 'mzml'
|
13
|
-
|
14
|
-
|
15
|
-
#first load nokogiri document
|
16
|
-
|
17
3
|
mzml = MzML::Doc.new(ARGV[0])
|
18
4
|
|
19
|
-
#now loop through each spectrum.. sort first to be the same as msconvert
|
20
|
-
|
21
5
|
sorted_keys = mzml.parse_index_list[:spectrum].keys.sort{ |x,y| x.split('=')[3].to_i <=> y.split('=')[3].to_i }
|
22
6
|
|
23
7
|
sorted_keys.each do |k|
|
@@ -0,0 +1,80 @@
|
|
1
|
+
require 'base64'
|
2
|
+
require 'zlib'
|
3
|
+
|
4
|
+
module MzML
|
5
|
+
class Chromatogram
|
6
|
+
# Canonical ID of the chromatogram
|
7
|
+
attr_reader :id
|
8
|
+
|
9
|
+
attr_reader :default_array_length
|
10
|
+
|
11
|
+
# The positional index of the chromatogram in the mzML document
|
12
|
+
attr_reader :index_position
|
13
|
+
alias_method :index, :index_position
|
14
|
+
|
15
|
+
attr_reader :default_processing_ref
|
16
|
+
|
17
|
+
# Time values (one per data point) decoded from the time array
|
18
|
+
attr_reader :timepoint
|
19
|
+
|
20
|
+
# The unit of time that the timepoints are measured in (e.g. seconds, minutes, ...)
|
21
|
+
attr_reader :time_unit
|
22
|
+
|
23
|
+
# Intensity array of values
|
24
|
+
attr_reader :intensity
|
25
|
+
|
26
|
+
# Nokogiri::XML::Node of the document
|
27
|
+
attr_reader :node
|
28
|
+
|
29
|
+
# CV param attributes
|
30
|
+
attr_reader :params
|
31
|
+
|
32
|
+
def initialize(node)
|
33
|
+
@node = node
|
34
|
+
@params = {}
|
35
|
+
parse_element()
|
36
|
+
end
|
37
|
+
|
38
|
+
protected
|
39
|
+
def parse_element
|
40
|
+
@id = @node[:id]
|
41
|
+
@index_position = @node[:index].to_i
|
42
|
+
@default_array_length = @node[:defaultArrayLength].to_i
|
43
|
+
# CV parameters
|
44
|
+
@params = @node.xpath("./cvParam").inject([]) do |memo,prm|
|
45
|
+
memo << {:name => prm[:name],
|
46
|
+
:value => prm[:value],
|
47
|
+
:accession => prm[:accession],
|
48
|
+
:cv => prm[:cvRef]}
|
49
|
+
memo
|
50
|
+
end
|
51
|
+
# binary data
|
52
|
+
parse_binary_data()
|
53
|
+
end
|
54
|
+
|
55
|
+
def parse_binary_data
|
56
|
+
@node.xpath("./binaryDataArrayList/binaryDataArray").each do |bd|
|
57
|
+
if bd.xpath("cvParam/@accession='MS:1000523'")
|
58
|
+
# "64-bit float"
|
59
|
+
decode_type = "E*"
|
60
|
+
else
|
61
|
+
# 32-bit float
|
62
|
+
decode_type = "e*"
|
63
|
+
end
|
64
|
+
data = Base64.decode64(bd.xpath("binary").text)
|
65
|
+
# compressed?
|
66
|
+
if bd.xpath("cvParam/@accession='MS:1000574'")
|
67
|
+
data = Zlib::Inflate.inflate(data)
|
68
|
+
end
|
69
|
+
# time or intensity data?
|
70
|
+
if bd.xpath("cvParam/@accession='MS:1000595'")
|
71
|
+
# parse the time units
|
72
|
+
@time_unit = bd.xpath("cvParam[@accession='MS:1000595']")[0].attributes["unitName"].value
|
73
|
+
@timepoint = data.unpack(decode_type)
|
74
|
+
else
|
75
|
+
@intensity = data.unpack(decode_type)
|
76
|
+
end
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
80
|
+
end
|
data/lib/mzml/doc.rb
ADDED
@@ -0,0 +1,185 @@
|
|
1
|
+
#--
|
2
|
+
# This program is free software; you can redistribute it and/or modify
|
3
|
+
# it under the terms of the GNU Library or "Lesser" General Public
|
4
|
+
# License (LGPL) as published by the Free Software Foundation;
|
5
|
+
# either version 2 of the License, or (at your option) any later
|
6
|
+
# version.
|
7
|
+
# Author: Angel Pizarro
|
8
|
+
# Date: 12/05/2009
|
9
|
+
# Copyright: Angel Pizarro, Copyright (c) University of Pennsylvania. All rights reserved.
|
10
|
+
#
|
11
|
+
|
12
|
+
# == MzML
|
13
|
+
#
|
14
|
+
# A non-validating mzML v 1.1.0 parser. Most annotation is left as XML DOM
|
15
|
+
# objects. See the Nokogiri::XML::Node and Nokogiri::XML::NodeSet
|
16
|
+
# documentation on how to work with these.
|
17
|
+
#
|
18
|
+
# ===USAGE:
|
19
|
+
#
|
20
|
+
# require 'mzml'
|
21
|
+
# mzml = MzML::Doc.new("test.mzML")
|
22
|
+
module MzML
|
23
|
+
|
24
|
+
# An internal module containing useful regular expressions
|
25
|
+
module RGX
|
26
|
+
# The file byte offset of the start of the file index
|
27
|
+
INDEX_OFFSET = /<indexListOffset>(\d+)<\/indexListOffset>/
|
28
|
+
# The start of either a spectrumList or a chromatogramList
|
29
|
+
DATA_LIST_START = /<(spectrum|chromatogram)List\s.*count\=["'](\d+)/m
|
30
|
+
# The start of a spectrum or chromatogram element
|
31
|
+
DATA_START = /<(spectrum|chromatogram)\s.*id=["']([^'"]+)["']/m
|
32
|
+
# The end of a spectrum or chromatogram element
|
33
|
+
DATA_END = /(<\/(spectrum|chromatogram)>)/
|
34
|
+
end
|
35
|
+
|
36
|
+
class UnsupportedFileFormat < Exception
|
37
|
+
end
|
38
|
+
class BadIdentifier < Exception
|
39
|
+
end
|
40
|
+
|
41
|
+
# The main mzML parser class. It is a subclass of the File class from the
|
42
|
+
# Ruby standard library in that it places a read cursor on the mzML file,
|
43
|
+
# and will skip around using byte-offsets. We utilize the index at the
|
44
|
+
# end of mzML files to facilitate random access of spectra.
|
45
|
+
#
|
46
|
+
# The {#each} method will cycle through all of the spectra in a file, starting
|
47
|
+
# from the first one each time. If you would rather access the spectra randomly,
|
48
|
+
# the {#spectrum_list} attribute contains the ordered list of spectrum identifiers.
|
49
|
+
# You can access the MzML::Spectrum objects by feeding these identifiers to the {#spectrum}
|
50
|
+
# method.
|
51
|
+
class Doc < ::File
|
52
|
+
|
53
|
+
# Open a file handle to a mzML document
|
54
|
+
def initialize(mz_fname)
|
55
|
+
unless mz_fname =~ /\.mzML$/
|
56
|
+
raise MzML::UnsupportedFileFormat.new "File extension must be .\"mzML\""
|
57
|
+
end
|
58
|
+
super(mz_fname, "r")
|
59
|
+
@fname = mz_fname
|
60
|
+
@index = parse_index_list
|
61
|
+
@spectrum_count = @spectrum_list.length
|
62
|
+
@chromatogram_count = @chromatogram_list.length
|
63
|
+
@current_spectrum_index = 0
|
64
|
+
end
|
65
|
+
attr_reader :index, :fname, :spectrum_list, :spectrum_count, :chromatogram_list, :chromatogram_count
|
66
|
+
|
67
|
+
# Fetch a {MzML::Chromatogram} from the file, given the identifier
|
68
|
+
# @param chromatogram_id String
|
69
|
+
# @return {MzML::Chromatogram}
|
70
|
+
def chromatogram(chromatogram_id)
|
71
|
+
if @index[:chromatogram].has_key? chromatogram_id
|
72
|
+
self.seek @index[:chromatogram][chromatogram_id]
|
73
|
+
return MzML::Chromatogram.new(parse_next)
|
74
|
+
else
|
75
|
+
raise MzML::BadIdentifier.new("Invalid ID '#{chromatogram_id}'")
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
def spectrum(spectrum_id)
|
80
|
+
if @index[:spectrum].has_key? spectrum_id
|
81
|
+
self.seek @index[:spectrum][spectrum_id]
|
82
|
+
return MzML::Spectrum.new(parse_next())
|
83
|
+
else
|
84
|
+
raise MzML::BadIdentifier.new("Invalid ID '#{spectrum_id}'")
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
def each &block
|
89
|
+
@spectrum_list.each do |spectrum_id|
|
90
|
+
block.call(self.spectrum(spectrum_id))
|
91
|
+
@current_spectrum_index += 1
|
92
|
+
end
|
93
|
+
end
|
94
|
+
alias_method :each_spectrum, :each
|
95
|
+
|
96
|
+
def next &block
|
97
|
+
if @current_spectrum_index < @spectrum_list.length
|
98
|
+
@current_spectrum_index += 1
|
99
|
+
self.spectrum(@spectrum_list[@current_spectrum_index - 1])
|
100
|
+
else
|
101
|
+
nil
|
102
|
+
end
|
103
|
+
end
|
104
|
+
alias_method :next_spectrum, :next
|
105
|
+
|
106
|
+
def rewind
|
107
|
+
super
|
108
|
+
@current_spectrum_index = 0
|
109
|
+
end
|
110
|
+
|
111
|
+
private
|
112
|
+
# Parses the IndexList
|
113
|
+
def parse_index_list
|
114
|
+
self.seek(self.stat.size - 200)
|
115
|
+
# parse the index offset
|
116
|
+
tmp = self.read
|
117
|
+
tmp =~ MzML::RGX::INDEX_OFFSET
|
118
|
+
offset = $1
|
119
|
+
# if I didn't match anything, compute the index and return
|
120
|
+
unless (offset)
|
121
|
+
return compute_index_list
|
122
|
+
end
|
123
|
+
@index = {}
|
124
|
+
@spectrum_list = []
|
125
|
+
@chromatogram_list = []
|
126
|
+
self.seek(offset.to_i)
|
127
|
+
tmp = Nokogiri::XML.parse(self.read).root
|
128
|
+
tmp.css("index").each do |idx|
|
129
|
+
index_type = idx[:name].to_sym
|
130
|
+
@index[index_type] = {}
|
131
|
+
idx.css("offset").each do |o|
|
132
|
+
@index[index_type][o[:idRef]] = o.text().to_i
|
133
|
+
if index_type == :spectrum
|
134
|
+
@spectrum_list << o[:idRef]
|
135
|
+
else
|
136
|
+
@chromatogram_list << o[:idRef]
|
137
|
+
end
|
138
|
+
end
|
139
|
+
end
|
140
|
+
self.rewind
|
141
|
+
return @index
|
142
|
+
end
|
143
|
+
|
144
|
+
def compute_index_list
|
145
|
+
@index = Hash.new {|h,k| h[k] = {} }
|
146
|
+
# start at the beginning.
|
147
|
+
self.rewind
|
148
|
+
# fast forward to the first spectrum or chromatogram
|
149
|
+
buffer = ''
|
150
|
+
while !self.eof
|
151
|
+
buffer += self.read(1024)
|
152
|
+
if start_pos = buffer =~ MzML::RGX::DATA_START
|
153
|
+
self.seek start_pos
|
154
|
+
break
|
155
|
+
end
|
156
|
+
end
|
157
|
+
|
158
|
+
# for each particular entity start to fill in the index
|
159
|
+
buffer = ''
|
160
|
+
rgx_start = /<(spectrum|chromatogram)\s.*id=["']([^"']+)["']/
|
161
|
+
while !self.eof
|
162
|
+
buffer += self.read(1024)
|
163
|
+
if start_pos = buffer =~ rgx_start
|
164
|
+
start_pos = self.pos - buffer.length + start_pos
|
165
|
+
@index[$1.to_sym][$2] = start_pos
|
166
|
+
buffer = ''
|
167
|
+
end
|
168
|
+
end
|
169
|
+
return @index
|
170
|
+
end
|
171
|
+
|
172
|
+
def parse_next
|
173
|
+
buffer = ''
|
174
|
+
while(!self.eof)
|
175
|
+
if end_pos = buffer =~ MzML::RGX::DATA_END
|
176
|
+
extra_content = buffer.slice!((end_pos + $1.length)..-1)
|
177
|
+
self.pos -= (extra_content.length)
|
178
|
+
break
|
179
|
+
end
|
180
|
+
buffer += self.read(1024)
|
181
|
+
end
|
182
|
+
return Nokogiri::XML.parse(buffer).root
|
183
|
+
end
|
184
|
+
end
|
185
|
+
end
|
@@ -0,0 +1,107 @@
|
|
1
|
+
require 'base64'
|
2
|
+
require 'zlib'
|
3
|
+
|
4
|
+
module MzML
|
5
|
+
class Spectrum
|
6
|
+
attr_reader :id, :default_array_length, :type,
|
7
|
+
:precursor, :base_peak_mz, :base_peak_intensity, :ms_level,
|
8
|
+
:high_mz, :low_mz, :title, :tic, :polarity, :representation, :mz_node, :intensity_node,
|
9
|
+
:mz, :intensity, :precursor_list, :scan_list, :retention_time, :precursor_mass,
|
10
|
+
:precursor_intensity, :node, :params
|
11
|
+
|
12
|
+
def initialize(node)
|
13
|
+
@node = node
|
14
|
+
@params = {}
|
15
|
+
@precursor_list = []
|
16
|
+
parse_element()
|
17
|
+
end
|
18
|
+
|
19
|
+
protected
|
20
|
+
def parse_element
|
21
|
+
|
22
|
+
# id
|
23
|
+
@id = @node.attributes["id"].value
|
24
|
+
@index = @node.attributes["index"].value.to_i
|
25
|
+
@default_array_length = @node.attributes["defaultArrayLength"].value.to_i
|
26
|
+
|
27
|
+
# now reaching into params
|
28
|
+
@params = @node.xpath("cvParam").inject({}) do |memo,prm|
|
29
|
+
memo[prm[:name]] = prm[:value]
|
30
|
+
memo
|
31
|
+
end
|
32
|
+
|
33
|
+
@ms_level = @params["ms level"].to_i
|
34
|
+
@low_mz = @params["lowest observed m/z"].to_f if @params.has_key?("lowest observed m/z")
|
35
|
+
@high_mz = @params["highest observed m/z"].to_f if @params.has_key?("highest observed m/z")
|
36
|
+
@tic = @params["total ion current"].to_i if @params.has_key?("total ion current")
|
37
|
+
@base_peak_mz = @params["base peak m/z"].to_f if @params.has_key?("base peak m/z")
|
38
|
+
@base_peak_intensity = @params["base peak intensity"].to_f if @params.has_key?("base peak intensity")
|
39
|
+
|
40
|
+
# precursor list
|
41
|
+
if @node.xpath("precursorList/precursor").length > 0
|
42
|
+
parse_precursor_list()
|
43
|
+
get_parent_info()
|
44
|
+
else
|
45
|
+
@precursor_list = []
|
46
|
+
end
|
47
|
+
|
48
|
+
# scan list
|
49
|
+
if (@node.xpath("scanList/scan").length > 0)
|
50
|
+
@scan_list = parse_scan_list()
|
51
|
+
else
|
52
|
+
@scan_list = nil
|
53
|
+
end
|
54
|
+
# binary data
|
55
|
+
parse_binary_data()
|
56
|
+
end
|
57
|
+
|
58
|
+
def parse_precursor_list
|
59
|
+
@precursor_list = []
|
60
|
+
@node.xpath("precursorList/precursor").each do |p|
|
61
|
+
@precursor_list << [p[:spectrumRef], p]
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
def get_parent_info
|
66
|
+
unless @precursor_list.empty?
|
67
|
+
if @precursor_list[0][1].xpath("selectedIonList/selectedIon/cvParam/@accession='MS:1000744'")
|
68
|
+
@precursor_mass = @precursor_list[0][1].xpath("selectedIonList/selectedIon/cvParam[@accession='MS:1000744']")[0][:value].to_f
|
69
|
+
end
|
70
|
+
if @precursor_list[0][1].xpath("selectedIonList/selectedIon/cvParam/@accession='MS:1000042'")
|
71
|
+
@precursor_intensity = @precursor_list[0][1].xpath("selectedIonList/selectedIon/cvParam[@accession='MS:1000042']")[0][:value].to_f
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
def parse_scan_list
|
77
|
+
@scan_list = @node.xpath("scanList/scan")
|
78
|
+
if @node.xpath("scanList/scan/cvParam/@accession='MS:1000016'")
|
79
|
+
@retention_time = @node.xpath("scanList/scan/cvParam[@accession='MS:1000016']")[0][:value]
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
83
|
+
def parse_binary_data
|
84
|
+
@node.xpath("binaryDataArrayList/binaryDataArray").each do |bd|
|
85
|
+
if bd.xpath("cvParam/@accession='MS:1000523'")
|
86
|
+
# "64-bit float"
|
87
|
+
decode_type = "E*"
|
88
|
+
else
|
89
|
+
# 32-bit float
|
90
|
+
decode_type = "e*"
|
91
|
+
end
|
92
|
+
data = Base64.decode64(bd.xpath("binary").text)
|
93
|
+
# compressed?
|
94
|
+
if bd.xpath("cvParam/@accession='MS:1000574'")
|
95
|
+
data = Zlib::Inflate.inflate(data)
|
96
|
+
end
|
97
|
+
# m/z or intensity data?
|
98
|
+
if bd.xpath("cvParam/@accession='MS:1000514'")
|
99
|
+
# m/z data
|
100
|
+
@mz = data.unpack(decode_type)
|
101
|
+
else
|
102
|
+
@intensity = data.unpack(decode_type)
|
103
|
+
end
|
104
|
+
end
|
105
|
+
end
|
106
|
+
end
|
107
|
+
end
|
data/lib/mzml/version.rb
ADDED