mzml 0.2.2 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +3 -0
- data/Gemfile +4 -0
- data/Rakefile +15 -39
- data/bin/{mzML2mgf.rb → mzml2mgf} +1 -17
- data/lib/mzml/chromatogram.rb +80 -0
- data/lib/mzml/doc.rb +185 -0
- data/lib/mzml/spectrum.rb +107 -0
- data/lib/mzml/version.rb +4 -0
- data/lib/mzml.rb +4 -244
- data/mzml.gemspec +15 -59
- data/spec/mzml_spec.rb +16 -0
- data/spec/sample.unindexed.mzML +221 -0
- data/test/fixtures/sample.compressed.mzML +2699 -0
- data/test/fixtures/sample.mgf +25548 -0
- data/test/fixtures/sample.mzML +2688 -0
- data/test/test_mzml-helper.rb +15 -0
- data/test/test_mzml.rb +94 -0
- metadata +83 -76
- data/.document +0 -5
- data/.yardoc +0 -0
- data/VERSION +0 -1
data/.gitignore
CHANGED
data/Gemfile
ADDED
data/Rakefile
CHANGED
@@ -1,44 +1,20 @@
|
|
1
|
-
require '
|
2
|
-
|
3
|
-
|
4
|
-
begin
|
5
|
-
require 'jeweler'
|
6
|
-
Jeweler::Tasks.new do |gem|
|
7
|
-
gem.name = "mzml"
|
8
|
-
gem.summary = %Q{A non-validating mzML parser}
|
9
|
-
gem.description = %Q{A non-validating mzML parser. MzML is a standard data format for representing mass spectrometry data.}
|
10
|
-
gem.email = "angel@delagoya.com"
|
11
|
-
gem.homepage = "http://github.com/delagoya/mzml"
|
12
|
-
gem.authors = ["Angel Pizarro"]
|
13
|
-
gem.add_development_dependency "rspec", "1.3.0"
|
14
|
-
# gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
|
15
|
-
gem.add_dependency "nokogiri", "1.4.1"
|
16
|
-
|
17
|
-
end
|
18
|
-
Jeweler::GemcutterTasks.new
|
19
|
-
rescue LoadError
|
20
|
-
puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
|
21
|
-
end
|
22
|
-
|
23
|
-
require 'spec/rake/spectask'
|
24
|
-
Spec::Rake::SpecTask.new(:spec) do |spec|
|
25
|
-
spec.libs << 'lib' << 'spec'
|
26
|
-
spec.spec_files = FileList['spec/**/*_spec.rb']
|
27
|
-
end
|
1
|
+
require 'bundler'
|
2
|
+
Bundler::GemHelper.install_tasks
|
28
3
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
4
|
+
# test
|
5
|
+
require 'rake'
|
6
|
+
require 'rake/testtask'
|
7
|
+
Rake::TestTask.new do |t|
|
8
|
+
t.libs << "test"
|
9
|
+
t.test_files = FileList['test/test*.rb']
|
10
|
+
t.verbose = true
|
33
11
|
end
|
34
12
|
|
35
|
-
|
36
|
-
|
37
|
-
task :default => :spec
|
38
|
-
|
13
|
+
# documentation
|
39
14
|
require 'yard'
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
15
|
+
require 'yard/rake/yardoc_task'
|
16
|
+
YARD::Rake::YardocTask.new do |t|
|
17
|
+
t.files = ['lib/**/*.rb', '-' , "README.rdoc","LICENSE"]
|
18
|
+
t.options = ["-r","README.rdoc"]
|
44
19
|
end
|
20
|
+
|
@@ -1,23 +1,7 @@
|
|
1
|
-
#!/
|
2
|
-
|
3
|
-
################################
|
4
|
-
####
|
5
|
-
##
|
6
|
-
# David Austin - UPENN
|
7
|
-
# converts mzML to MGF format
|
8
|
-
# set up to replicate msconvert but muuchh slower
|
9
|
-
#
|
10
|
-
|
11
|
-
require 'rubygems'
|
1
|
+
#!/usr/bin/env ruby
|
12
2
|
require 'mzml'
|
13
|
-
|
14
|
-
|
15
|
-
#first load nokogiri document
|
16
|
-
|
17
3
|
mzml = MzML::Doc.new(ARGV[0])
|
18
4
|
|
19
|
-
#now loop through each spectrum.. sort first to be the same as msconvert
|
20
|
-
|
21
5
|
sorted_keys = mzml.parse_index_list[:spectrum].keys.sort{ |x,y| x.split('=')[3].to_i <=> y.split('=')[3].to_i }
|
22
6
|
|
23
7
|
sorted_keys.each do |k|
|
@@ -0,0 +1,80 @@
|
|
1
|
+
require 'base64'
|
2
|
+
require 'zlib'
|
3
|
+
|
4
|
+
module MzML
|
5
|
+
class Chromatogram
|
6
|
+
# Canonical ID of the chromatogram
|
7
|
+
attr_reader :id
|
8
|
+
|
9
|
+
attr_reader :default_array_length
|
10
|
+
|
11
|
+
# The positional index of the chromatogram in the mzML document
|
12
|
+
attr_reader :index_position
|
13
|
+
alias_method :index, :index_position
|
14
|
+
|
15
|
+
attr_reader :default_processing_ref
|
16
|
+
|
17
|
+
# Timepoint values (the time axis), decoded from the time binary data array
|
18
|
+
attr_reader :timepoint
|
19
|
+
|
20
|
+
# The unit of time that the timepoints are measured in (e.g. seconds, minutes, ...)
|
21
|
+
attr_reader :time_unit
|
22
|
+
|
23
|
+
# Intensity array of values
|
24
|
+
attr_reader :intensity
|
25
|
+
|
26
|
+
# Nokogiri::XML::Node of the document
|
27
|
+
attr_reader :node
|
28
|
+
|
29
|
+
# CV param attributes
|
30
|
+
attr_reader :params
|
31
|
+
|
32
|
+
def initialize(node)
|
33
|
+
@node = node
|
34
|
+
@params = {}
|
35
|
+
parse_element()
|
36
|
+
end
|
37
|
+
|
38
|
+
protected
|
39
|
+
def parse_element
|
40
|
+
@id = @node[:id]
|
41
|
+
@index_position = @node[:index].to_i
|
42
|
+
@default_array_length = @node[:defaultArrayLength].to_i
|
43
|
+
# CV parameters
|
44
|
+
@params = @node.xpath("./cvParam").inject([]) do |memo,prm|
|
45
|
+
memo << {:name => prm[:name],
|
46
|
+
:value => prm[:value],
|
47
|
+
:accession => prm[:accession],
|
48
|
+
:cv => prm[:cvRef]}
|
49
|
+
memo
|
50
|
+
end
|
51
|
+
# binary data
|
52
|
+
parse_binary_data()
|
53
|
+
end
|
54
|
+
|
55
|
+
def parse_binary_data
|
56
|
+
@node.xpath("./binaryDataArrayList/binaryDataArray").each do |bd|
|
57
|
+
if bd.xpath("cvParam/@accession='MS:1000523'")
|
58
|
+
# "64-bit float"
|
59
|
+
decode_type = "E*"
|
60
|
+
else
|
61
|
+
# 32-bit float
|
62
|
+
decode_type = "e*"
|
63
|
+
end
|
64
|
+
data = Base64.decode64(bd.xpath("binary").text)
|
65
|
+
# compressed?
|
66
|
+
if bd.xpath("cvParam/@accession='MS:1000574'")
|
67
|
+
data = Zlib::Inflate.inflate(data)
|
68
|
+
end
|
69
|
+
# time or intensity data?
|
70
|
+
if bd.xpath("cvParam/@accession='MS:1000595'")
|
71
|
+
# parse the time units
|
72
|
+
@time_unit = bd.xpath("cvParam[@accession='MS:1000595']")[0].attributes["unitName"].value
|
73
|
+
@timepoint = data.unpack(decode_type)
|
74
|
+
else
|
75
|
+
@intensity = data.unpack(decode_type)
|
76
|
+
end
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
80
|
+
end
|
data/lib/mzml/doc.rb
ADDED
@@ -0,0 +1,185 @@
|
|
1
|
+
#--
|
2
|
+
# This program is free software; you can redistribute it and/or modify
|
3
|
+
# it under the terms of the GNU Library or "Lesser" General Public
|
4
|
+
# License (LGPL) as published by the Free Software Foundation;
|
5
|
+
# either version 2 of the License, or (at your option) any later
|
6
|
+
# version.
|
7
|
+
# Author: Angel Pizarro
|
8
|
+
# Date: 12/05/2009
|
9
|
+
# Copyright: Angel Pizarro, Copyright (c) University of Pennsylvania. All rights reserved.
|
10
|
+
#
|
11
|
+
|
12
|
+
# == MzML
|
13
|
+
#
|
14
|
+
# A non-validating mzML v 1.1.0 parser. Most annotation is left as XML DOM
|
15
|
+
# objects. See the Nokogiri::XML::Node and Nokogiri::XML::NodeSet
|
16
|
+
# documentation on how to work with these.
|
17
|
+
#
|
18
|
+
# ===USAGE:
|
19
|
+
#
|
20
|
+
# require 'mzml'
|
21
|
+
# mzml = MzML::Doc.new("test.mzML")
|
22
|
+
module MzML
|
23
|
+
|
24
|
+
# An internal module containing useful regular expressions
|
25
|
+
module RGX
|
26
|
+
# The file byte offset of the start of the file index
|
27
|
+
INDEX_OFFSET = /<indexListOffset>(\d+)<\/indexListOffset>/
|
28
|
+
# The start of either a spectrumList or a chromatogramList
|
29
|
+
DATA_LIST_START = /<(spectrum|chromatogram)List\s.*count\=["'](\d+)/m
|
30
|
+
# The start spectrum or chromatogram element
|
31
|
+
DATA_START = /<(spectrum|chromatogram)\s.*id=["']([^'"]+)["']/m
|
32
|
+
# The end spectrum or chromatogram element
|
33
|
+
DATA_END = /(<\/(spectrum|chromatogram)>)/
|
34
|
+
end
|
35
|
+
|
36
|
+
class UnsupportedFileFormat < Exception
|
37
|
+
end
|
38
|
+
class BadIdentifier < Exception
|
39
|
+
end
|
40
|
+
|
41
|
+
# The main mzML parser class, it is a subclass of the File class from the
|
42
|
+
# Ruby standard library in that it places a read cursor on the mzML file,
|
43
|
+
# and will skip around using byte-offsets. We utilize the index at the
|
44
|
+
# end of mzML files to facilitate random access of spectra.
|
45
|
+
#
|
46
|
+
# The {#each} method will cycle through all of the spectra in a file, starting
|
47
|
+
# from the first one each time. If you would rather access the spectra randomly,
|
48
|
+
# the {#spectrum_list} attribute contains the ordered list of spectrum identifiers.
|
49
|
+
# You can access the MzML::Spectrum objects by feeding these identifiers to the {#spectrum}
|
50
|
+
# method.
|
51
|
+
class Doc < ::File
|
52
|
+
|
53
|
+
# Open a file handle to a mzML document
|
54
|
+
def initialize(mz_fname)
|
55
|
+
unless mz_fname =~ /\.mzML$/
|
56
|
+
raise MzML::UnsupportedFileFormat.new "File extension must be .\"mzML\""
|
57
|
+
end
|
58
|
+
super(mz_fname, "r")
|
59
|
+
@fname = mz_fname
|
60
|
+
@index = parse_index_list
|
61
|
+
@spectrum_count = @spectrum_list.length
|
62
|
+
@chromatogram_count = @chromatogram_list.length
|
63
|
+
@current_spectrum_index = 0
|
64
|
+
end
|
65
|
+
attr_reader :index, :fname, :spectrum_list, :spectrum_count, :chromatogram_list, :chromatogram_count
|
66
|
+
|
67
|
+
# Fetch a {MzML::Chromatogram} from the file, given the identifier
|
68
|
+
# @param chromatogram_id String
|
69
|
+
# @return {MzML::Chromatogram}
|
70
|
+
def chromatogram(chromatogram_id)
|
71
|
+
if @index[:chromatogram].has_key? chromatogram_id
|
72
|
+
self.seek @index[:chromatogram][chromatogram_id]
|
73
|
+
return MzML::Chromatogram.new(parse_next)
|
74
|
+
else
|
75
|
+
raise MzML::BadIdentifier.new("Invalid ID '#{chromatogram_id}'")
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
def spectrum(spectrum_id)
|
80
|
+
if @index[:spectrum].has_key? spectrum_id
|
81
|
+
self.seek @index[:spectrum][spectrum_id]
|
82
|
+
return MzML::Spectrum.new(parse_next())
|
83
|
+
else
|
84
|
+
raise MzML::BadIdentifier.new("Invalid ID '#{spectrum_id}'")
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
def each &block
|
89
|
+
@spectrum_list.each do |spectrum_id|
|
90
|
+
block.call(self.spectrum(spectrum_id))
|
91
|
+
@current_spectrum_index += 1
|
92
|
+
end
|
93
|
+
end
|
94
|
+
alias_method :each_spectrum, :each
|
95
|
+
|
96
|
+
def next &block
|
97
|
+
if @current_spectrum_index < @spectrum_list.length
|
98
|
+
@current_spectrum_index += 1
|
99
|
+
self.spectrum(@spectrum_list[@current_spectrum_index - 1])
|
100
|
+
else
|
101
|
+
nil
|
102
|
+
end
|
103
|
+
end
|
104
|
+
alias_method :next_spectrum, :next
|
105
|
+
|
106
|
+
def rewind
|
107
|
+
super
|
108
|
+
@current_spectrum_index = 0
|
109
|
+
end
|
110
|
+
|
111
|
+
private
|
112
|
+
# Parses the IndexList
|
113
|
+
def parse_index_list
|
114
|
+
self.seek(self.stat.size - 200)
|
115
|
+
# parse the index offset
|
116
|
+
tmp = self.read
|
117
|
+
tmp =~ MzML::RGX::INDEX_OFFSET
|
118
|
+
offset = $1
|
119
|
+
# if I didn't match anything, compute the index and return
|
120
|
+
unless (offset)
|
121
|
+
return compute_index_list
|
122
|
+
end
|
123
|
+
@index = {}
|
124
|
+
@spectrum_list = []
|
125
|
+
@chromatogram_list = []
|
126
|
+
self.seek(offset.to_i)
|
127
|
+
tmp = Nokogiri::XML.parse(self.read).root
|
128
|
+
tmp.css("index").each do |idx|
|
129
|
+
index_type = idx[:name].to_sym
|
130
|
+
@index[index_type] = {}
|
131
|
+
idx.css("offset").each do |o|
|
132
|
+
@index[index_type][o[:idRef]] = o.text().to_i
|
133
|
+
if index_type == :spectrum
|
134
|
+
@spectrum_list << o[:idRef]
|
135
|
+
else
|
136
|
+
@chromatogram_list << o[:idRef]
|
137
|
+
end
|
138
|
+
end
|
139
|
+
end
|
140
|
+
self.rewind
|
141
|
+
return @index
|
142
|
+
end
|
143
|
+
|
144
|
+
def compute_index_list
|
145
|
+
@index = Hash.new {|h,k| h[k] = {} }
|
146
|
+
# start at the beginning.
|
147
|
+
self.rewind
|
148
|
+
# fast forward to the first spectrum or chromatogram
|
149
|
+
buffer = ''
|
150
|
+
while !self.eof
|
151
|
+
buffer += self.read(1024)
|
152
|
+
if start_pos = buffer =~ MzML::RGX::DATA_START
|
153
|
+
self.seek start_pos
|
154
|
+
break
|
155
|
+
end
|
156
|
+
end
|
157
|
+
|
158
|
+
# for each particular entity start to fill in the index
|
159
|
+
buffer = ''
|
160
|
+
rgx_start = /<(spectrum|chromatogram)\s.*id=["']([^"']+)["']/
|
161
|
+
while !self.eof
|
162
|
+
buffer += self.read(1024)
|
163
|
+
if start_pos = buffer =~ rgx_start
|
164
|
+
start_pos = self.pos - buffer.length + start_pos
|
165
|
+
@index[$1.to_sym][$2] = start_pos
|
166
|
+
buffer = ''
|
167
|
+
end
|
168
|
+
end
|
169
|
+
return @index
|
170
|
+
end
|
171
|
+
|
172
|
+
def parse_next
|
173
|
+
buffer = ''
|
174
|
+
while(!self.eof)
|
175
|
+
if end_pos = buffer =~ MzML::RGX::DATA_END
|
176
|
+
extra_content = buffer.slice!((end_pos + $1.length)..-1)
|
177
|
+
self.pos -= (extra_content.length)
|
178
|
+
break
|
179
|
+
end
|
180
|
+
buffer += self.read(1024)
|
181
|
+
end
|
182
|
+
return Nokogiri::XML.parse(buffer).root
|
183
|
+
end
|
184
|
+
end
|
185
|
+
end
|
@@ -0,0 +1,107 @@
|
|
1
|
+
require 'base64'
|
2
|
+
require 'zlib'
|
3
|
+
|
4
|
+
module MzML
|
5
|
+
class Spectrum
|
6
|
+
attr_reader :id, :default_array_length, :type,
|
7
|
+
:precursor, :base_peak_mz, :base_peak_intensity, :ms_level,
|
8
|
+
:high_mz, :low_mz, :title, :tic, :polarity, :representation, :mz_node, :intensity_node,
|
9
|
+
:mz, :intensity, :precursor_list, :scan_list, :retention_time, :precursor_mass,
|
10
|
+
:precursor_intensity, :node, :params
|
11
|
+
|
12
|
+
def initialize(node)
|
13
|
+
@node = node
|
14
|
+
@params = {}
|
15
|
+
@precursor_list = []
|
16
|
+
parse_element()
|
17
|
+
end
|
18
|
+
|
19
|
+
protected
|
20
|
+
def parse_element
|
21
|
+
|
22
|
+
# id
|
23
|
+
@id = @node.attributes["id"].value
|
24
|
+
@index = @node.attributes["index"].value.to_i
|
25
|
+
@default_array_length = @node.attributes["defaultArrayLength"].value.to_i
|
26
|
+
|
27
|
+
# now reaching into params
|
28
|
+
@params = @node.xpath("cvParam").inject({}) do |memo,prm|
|
29
|
+
memo[prm[:name]] = prm[:value]
|
30
|
+
memo
|
31
|
+
end
|
32
|
+
|
33
|
+
@ms_level = @params["ms level"].to_i
|
34
|
+
@low_mz = @params["lowest observed m/z"].to_f if @params.has_key?("lowest observed m/z")
|
35
|
+
@high_mz = @params["highest observed m/z"].to_f if @params.has_key?("highest observed m/z")
|
36
|
+
@tic = @params["total ion current"].to_i if @params.has_key?("total ion current")
|
37
|
+
@base_peak_mz = @params["base peak m/z"].to_f if @params.has_key?("base peak m/z")
|
38
|
+
@base_peak_intensity = @params["base peak intensity"].to_f if @params.has_key?("base peak intensity")
|
39
|
+
|
40
|
+
# precursor list
|
41
|
+
if @node.xpath("precursorList/precursor").length > 0
|
42
|
+
parse_precursor_list()
|
43
|
+
get_parent_info()
|
44
|
+
else
|
45
|
+
@precursor_list = []
|
46
|
+
end
|
47
|
+
|
48
|
+
# scan list
|
49
|
+
if (@node.xpath("scanList/scan").length > 0)
|
50
|
+
@scan_list = parse_scan_list()
|
51
|
+
else
|
52
|
+
@scan_list = nil
|
53
|
+
end
|
54
|
+
# binary data
|
55
|
+
parse_binary_data()
|
56
|
+
end
|
57
|
+
|
58
|
+
def parse_precursor_list
|
59
|
+
@precursor_list = []
|
60
|
+
@node.xpath("precursorList/precursor").each do |p|
|
61
|
+
@precursor_list << [p[:spectrumRef], p]
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
def get_parent_info
|
66
|
+
unless @precursor_list.empty?
|
67
|
+
if @precursor_list[0][1].xpath("selectedIonList/selectedIon/cvParam/@accession='MS:1000744'")
|
68
|
+
@precursor_mass = @precursor_list[0][1].xpath("selectedIonList/selectedIon/cvParam[@accession='MS:1000744']")[0][:value].to_f
|
69
|
+
end
|
70
|
+
if @precursor_list[0][1].xpath("selectedIonList/selectedIon/cvParam/@accession='MS:1000042'")
|
71
|
+
@precursor_intensity = @precursor_list[0][1].xpath("selectedIonList/selectedIon/cvParam[@accession='MS:1000042']")[0][:value].to_f
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
def parse_scan_list
|
77
|
+
@scan_list = @node.xpath("scanList/scan")
|
78
|
+
if @node.xpath("scanList/scan/cvParam/@accession='MS:1000016'")
|
79
|
+
@retention_time = @node.xpath("scanList/scan/cvParam[@accession='MS:1000016']")[0][:value]
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
83
|
+
def parse_binary_data
|
84
|
+
@node.xpath("binaryDataArrayList/binaryDataArray").each do |bd|
|
85
|
+
if bd.xpath("cvParam/@accession='MS:1000523'")
|
86
|
+
# "64-bit float"
|
87
|
+
decode_type = "E*"
|
88
|
+
else
|
89
|
+
# 32-bit float
|
90
|
+
decode_type = "e*"
|
91
|
+
end
|
92
|
+
data = Base64.decode64(bd.xpath("binary").text)
|
93
|
+
# compressed?
|
94
|
+
if bd.xpath("cvParam/@accession='MS:1000574'")
|
95
|
+
data = Zlib::Inflate.inflate(data)
|
96
|
+
end
|
97
|
+
# m/z or intensity data?
|
98
|
+
if bd.xpath("cvParam/@accession='MS:1000514'")
|
99
|
+
# m/z data
|
100
|
+
@mz = data.unpack(decode_type)
|
101
|
+
else
|
102
|
+
@intensity = data.unpack(decode_type)
|
103
|
+
end
|
104
|
+
end
|
105
|
+
end
|
106
|
+
end
|
107
|
+
end
|
data/lib/mzml/version.rb
ADDED