mzml 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore CHANGED
@@ -14,9 +14,12 @@ tmtags
14
14
  *.swp
15
15
 
16
16
  ## PROJECT::GENERAL
17
+ Gemfile.lock
17
18
  coverage
18
19
  rdoc
19
20
  doc
20
21
  pkg
22
+ .yardoc
21
23
 
22
24
  ## PROJECT::SPECIFIC
25
+ scratch.rb
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source "http://rubygems.org"
2
+
3
+ # Specify your gem's dependencies in mzml.gemspec
4
+ gemspec
data/Rakefile CHANGED
@@ -1,44 +1,20 @@
1
- require 'rubygems'
2
- require 'rake'
3
-
4
- begin
5
- require 'jeweler'
6
- Jeweler::Tasks.new do |gem|
7
- gem.name = "mzml"
8
- gem.summary = %Q{A non-validating mzML parser}
9
- gem.description = %Q{A non-validating mzML parser. MzML is a standard data format for representing mass spectrometry data.}
10
- gem.email = "angel@delagoya.com"
11
- gem.homepage = "http://github.com/delagoya/mzml"
12
- gem.authors = ["Angel Pizarro"]
13
- gem.add_development_dependency "rspec", "1.3.0"
14
- # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
15
- gem.add_dependency "nokogiri", "1.4.1"
16
-
17
- end
18
- Jeweler::GemcutterTasks.new
19
- rescue LoadError
20
- puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
21
- end
22
-
23
- require 'spec/rake/spectask'
24
- Spec::Rake::SpecTask.new(:spec) do |spec|
25
- spec.libs << 'lib' << 'spec'
26
- spec.spec_files = FileList['spec/**/*_spec.rb']
27
- end
1
+ require 'bundler'
2
+ Bundler::GemHelper.install_tasks
28
3
 
29
- Spec::Rake::SpecTask.new(:rcov) do |spec|
30
- spec.libs << 'lib' << 'spec'
31
- spec.pattern = 'spec/**/*_spec.rb'
32
- spec.rcov = true
4
+ # test
5
+ require 'rake'
6
+ require 'rake/testtask'
7
+ Rake::TestTask.new do |t|
8
+ t.libs << "test"
9
+ t.test_files = FileList['test/test*.rb']
10
+ t.verbose = true
33
11
  end
34
12
 
35
- task :spec => :check_dependencies
36
-
37
- task :default => :spec
38
-
13
+ # documentation
39
14
  require 'yard'
40
- YARD::Rake::YardocTask.new do |yardoc|
41
- version = File.exist?('VERSION') ? File.read('VERSION') : ""
42
- yardoc.options = ["--title", "mzml #{version}", "-r", "README.rdoc"]
43
- yardoc.files = ['README*','lib/**/*.rb']
15
+ require 'yard/rake/yardoc_task'
16
+ YARD::Rake::YardocTask.new do |t|
17
+ t.files = ['lib/**/*.rb', '-' , "README.rdoc","LICENSE"]
18
+ t.options = ["-r","README.rdoc"]
44
19
  end
20
+
@@ -1,23 +1,7 @@
1
- #!/opt/local/bin/ruby
2
-
3
- ################################
4
- ####
5
- ##
6
- # David Austin - UPENN
7
- # converts mzML to MGF format
8
- # set up to replicate msconvert but muuchh slower
9
- #
10
-
11
- require 'rubygems'
1
+ #!/usr/bin/env ruby
12
2
  require 'mzml'
13
-
14
-
15
- #first load nokogiri document
16
-
17
3
  mzml = MzML::Doc.new(ARGV[0])
18
4
 
19
- #now loop through each spectrum.. sort first to be the same as msconvert
20
-
21
5
  sorted_keys = mzml.parse_index_list[:spectrum].keys.sort{ |x,y| x.split('=')[3].to_i <=> y.split('=')[3].to_i }
22
6
 
23
7
  sorted_keys.each do |k|
@@ -0,0 +1,80 @@
1
+ require 'base64'
2
+ require 'zlib'
3
+
4
+ module MzML
5
+ class Chromatogram
6
+ # Canonical ID of the chromatogram
7
+ attr_reader :id
8
+
9
+ attr_reader :default_array_length
10
+
11
+ # The positional index of the chromatogram in the mzML document
12
+ attr_reader :index_position
13
+ alias_method :index, :index_position
14
+
15
+ attr_reader :default_processing_ref
16
+
17
+ # Timepoints intensity values
18
+ attr_reader :timepoint
19
+
20
+ # The unit of time that the timepoints are measured in (e.g. seconds, minutes, ...)
21
+ attr_reader :time_unit
22
+
23
+ # Intensity array of values
24
+ attr_reader :intensity
25
+
26
+ # Nokogiri::XML::Node of the document
27
+ attr_reader :node
28
+
29
+ # CV param attributes
30
+ attr_reader :params
31
+
32
+ def initialize(node)
33
+ @node = node
34
+ @params = {}
35
+ parse_element()
36
+ end
37
+
38
+ protected
39
+ def parse_element
40
+ @id = @node[:id]
41
+ @index_position = @node[:index].to_i
42
+ @default_array_length = @node[:defaultArrayLength].to_i
43
+ # CV parameters
44
+ @params = @node.xpath("./cvParam").inject([]) do |memo,prm|
45
+ memo << {:name => prm[:name],
46
+ :value => prm[:value],
47
+ :accession => prm[:accession],
48
+ :cv => prm[:cvRef]}
49
+ memo
50
+ end
51
+ # binary data
52
+ parse_binary_data()
53
+ end
54
+
55
+ def parse_binary_data
56
+ @node.xpath("./binaryDataArrayList/binaryDataArray").each do |bd|
57
+ if bd.xpath("cvParam/@accession='MS:1000523'")
58
+ # "64-bit float"
59
+ decode_type = "E*"
60
+ else
61
+ # 32-bit float
62
+ decode_type = "e*"
63
+ end
64
+ data = Base64.decode64(bd.xpath("binary").text)
65
+ # compressed?
66
+ if bd.xpath("cvParam/@accession='MS:1000574'")
67
+ data = Zlib::Inflate.inflate(data)
68
+ end
69
+ # time or intensity data?
70
+ if bd.xpath("cvParam/@accession='MS:1000595'")
71
+ # parse the time units
72
+ @time_unit = bd.xpath("cvParam[@accession='MS:1000595']")[0].attributes["unitName"].value
73
+ @timepoint = data.unpack(decode_type)
74
+ else
75
+ @intensity = data.unpack(decode_type)
76
+ end
77
+ end
78
+ end
79
+ end
80
+ end
data/lib/mzml/doc.rb ADDED
@@ -0,0 +1,185 @@
1
+ #--
2
+ # This program is free software; you can redistribute it and/or modify
3
+ # it under the terms of the GNU Library or "Lesser" General Public
4
+ # License (LGPL) as published by the Free Software Foundation;
5
+ # either version 2 of the License, or (at your option) any later
6
+ # version.
7
+ # Author: Angel Pizarro
8
+ # Date: 12/05/2009
9
+ # Copyright: Angel Pizarro, Copyright (c) University of Pennsylvania. All rights reserved.
10
+ #
11
+
12
+ # == MzML
13
+ #
14
+ # A non-validating mzML v 1.1.0 parser. Most annotation is left as XML DOM
15
+ # objects. See the Nokogiri::XML::Node and Nokogiri::XML::NodeSet
16
+ # documentation on how to work with these.
17
+ #
18
+ # ===USAGE:
19
+ #
20
+ # require 'mzml'
21
+ # mzml = MzML::Doc.new("test.mzML")
22
+ module MzML
23
+
24
+ # An internal module containing useful regular expressions
25
+ module RGX
26
+ # The file byte offset of the start of the file index
27
+ INDEX_OFFSET = /<indexListOffset>(\d+)<\/indexListOffset>/
28
+ # The start of either a spectrumList or a chromatogramList
29
+ DATA_LIST_START = /<(spectrum|chromatogram)List\s.*count\=["'](\d+)/m
30
+ # The start of a spectrum or chromatogram element
31
+ DATA_START = /<(spectrum|chromatogram)\s.*id=["']([^'"]+)["']/m
32
+ # The end of a spectrum or chromatogram element
33
+ DATA_END = /(<\/(spectrum|chromatogram)>)/
34
+ end
35
+
36
+ class UnsupportedFileFormat < Exception
37
+ end
38
+ class BadIdentifier < Exception
39
+ end
40
+
41
+ # The main mzML parser class, it is a subclass of the File class from the
42
+ # Ruby standard library in that it places a read cursor on the mzML file,
43
+ # and will skip around using byte-offsets. We utilize the index at the
44
+ # end of mzML files to facilitate random access of spectra.
45
+ #
46
+ # The {#each} method will cycle through all of the spectra in a file, starting
47
+ # from the first one each time. If you would rather access the spectra randomly,
48
+ # the {#spectrum_list} attribute contains the ordered list of spectrum identifiers.
49
+ # You can access the MzML::Spectrum objects by feeding these identifiers to the {#spectrum}
50
+ # method.
51
+ class Doc < ::File
52
+
53
+ # Open a file handle to a mzML document
54
+ def initialize(mz_fname)
55
+ unless mz_fname =~ /\.mzML$/
56
+ raise MzML::UnsupportedFileFormat.new "File extension must be .\"mzML\""
57
+ end
58
+ super(mz_fname, "r")
59
+ @fname = mz_fname
60
+ @index = parse_index_list
61
+ @spectrum_count = @spectrum_list.length
62
+ @chromatogram_count = @chromatogram_list.length
63
+ @current_spectrum_index = 0
64
+ end
65
+ attr_reader :index, :fname, :spectrum_list, :spectrum_count, :chromatogram_list, :chromatogram_count
66
+
67
+ # Fetch a {MzML::Chromatogram} from the file, given the identifier
68
+ # @param chromatogram_id String
69
+ # @return {MzML::Chromatogram}
70
+ def chromatogram(chromatogram_id)
71
+ if @index[:chromatogram].has_key? chromatogram_id
72
+ self.seek @index[:chromatogram][chromatogram_id]
73
+ return MzML::Chromatogram.new(parse_next)
74
+ else
75
+ raise MzML::BadIdentifier.new("Invalid ID '#{chromatogram_id}'")
76
+ end
77
+ end
78
+
79
+ def spectrum(spectrum_id)
80
+ if @index[:spectrum].has_key? spectrum_id
81
+ self.seek @index[:spectrum][spectrum_id]
82
+ return MzML::Spectrum.new(parse_next())
83
+ else
84
+ raise MzML::BadIdentifier.new("Invalid ID '#{spectrum_id}'")
85
+ end
86
+ end
87
+
88
+ def each &block
89
+ @spectrum_list.each do |spectrum_id|
90
+ block.call(self.spectrum(spectrum_id))
91
+ @current_spectrum_index += 1
92
+ end
93
+ end
94
+ alias_method :each_spectrum, :each
95
+
96
+ def next &block
97
+ if @current_spectrum_index < @spectrum_list.length
98
+ @current_spectrum_index += 1
99
+ self.spectrum(@spectrum_list[@current_spectrum_index - 1])
100
+ else
101
+ nil
102
+ end
103
+ end
104
+ alias_method :next_spectrum, :next
105
+
106
+ def rewind
107
+ super
108
+ @current_spectrum_index = 0
109
+ end
110
+
111
+ private
112
+ # Parses the IndexList
113
+ def parse_index_list
114
+ self.seek(self.stat.size - 200)
115
+ # parse the index offset
116
+ tmp = self.read
117
+ tmp =~ MzML::RGX::INDEX_OFFSET
118
+ offset = $1
119
+ # if I didn't match anything, compute the index and return
120
+ unless (offset)
121
+ return compute_index_list
122
+ end
123
+ @index = {}
124
+ @spectrum_list = []
125
+ @chromatogram_list = []
126
+ self.seek(offset.to_i)
127
+ tmp = Nokogiri::XML.parse(self.read).root
128
+ tmp.css("index").each do |idx|
129
+ index_type = idx[:name].to_sym
130
+ @index[index_type] = {}
131
+ idx.css("offset").each do |o|
132
+ @index[index_type][o[:idRef]] = o.text().to_i
133
+ if index_type == :spectrum
134
+ @spectrum_list << o[:idRef]
135
+ else
136
+ @chromatogram_list << o[:idRef]
137
+ end
138
+ end
139
+ end
140
+ self.rewind
141
+ return @index
142
+ end
143
+
144
+ def compute_index_list
145
+ @index = Hash.new {|h,k| h[k] = {} }
146
+ # start at the beginning.
147
+ self.rewind
148
+ # fast forward to the first spectrum or chromatogram
149
+ buffer = ''
150
+ while !self.eof
151
+ buffer += self.read(1024)
152
+ if start_pos = buffer =~ MzML::RGX::DATA_START
153
+ self.seek start_pos
154
+ break
155
+ end
156
+ end
157
+
158
+ # for each particular entity start to fill in the index
159
+ buffer = ''
160
+ rgx_start = /<(spectrum|chromatogram)\s.*id=["']([^"']+)["']/
161
+ while !self.eof
162
+ buffer += self.read(1024)
163
+ if start_pos = buffer =~ rgx_start
164
+ start_pos = self.pos - buffer.length + start_pos
165
+ @index[$1.to_sym][$2] = start_pos
166
+ buffer = ''
167
+ end
168
+ end
169
+ return @index
170
+ end
171
+
172
+ def parse_next
173
+ buffer = ''
174
+ while(!self.eof)
175
+ if end_pos = buffer =~ MzML::RGX::DATA_END
176
+ extra_content = buffer.slice!((end_pos + $1.length)..-1)
177
+ self.pos -= (extra_content.length)
178
+ break
179
+ end
180
+ buffer += self.read(1024)
181
+ end
182
+ return Nokogiri::XML.parse(buffer).root
183
+ end
184
+ end
185
+ end
@@ -0,0 +1,107 @@
1
+ require 'base64'
2
+ require 'zlib'
3
+
4
+ module MzML
5
+ class Spectrum
6
+ attr_reader :id, :default_array_length, :type,
7
+ :precursor, :base_peak_mz, :base_peak_intensity, :ms_level,
8
+ :high_mz, :low_mz, :title, :tic, :polarity, :representation, :mz_node, :intensity_node,
9
+ :mz, :intensity, :precursor_list, :scan_list, :retention_time, :precursor_mass,
10
+ :precursor_intensity, :node, :params
11
+
12
+ def initialize(node)
13
+ @node = node
14
+ @params = {}
15
+ @precursor_list = []
16
+ parse_element()
17
+ end
18
+
19
+ protected
20
+ def parse_element
21
+
22
+ # id
23
+ @id = @node.attributes["id"].value
24
+ @index = @node.attributes["index"].value.to_i
25
+ @default_array_length = @node.attributes["defaultArrayLength"].value.to_i
26
+
27
+ # now reaching into params
28
+ @params = @node.xpath("cvParam").inject({}) do |memo,prm|
29
+ memo[prm[:name]] = prm[:value]
30
+ memo
31
+ end
32
+
33
+ @ms_level = @params["ms level"].to_i
34
+ @low_mz = @params["lowest observed m/z"].to_f if @params.has_key?("lowest observed m/z")
35
+ @high_mz = @params["highest observed m/z"].to_f if @params.has_key?("highest observed m/z")
36
+ @tic = @params["total ion current"].to_i if @params.has_key?("total ion current")
37
+ @base_peak_mz = @params["base peak m/z"].to_f if @params.has_key?("base peak m/z")
38
+ @base_peak_intensity = @params["base peak intensity"].to_f if @params.has_key?("base peak intensity")
39
+
40
+ # precursor list
41
+ if @node.xpath("precursorList/precursor").length > 0
42
+ parse_precursor_list()
43
+ get_parent_info()
44
+ else
45
+ @precursor_list = []
46
+ end
47
+
48
+ # scan list
49
+ if (@node.xpath("scanList/scan").length > 0)
50
+ @scan_list = parse_scan_list()
51
+ else
52
+ @scan_list = nil
53
+ end
54
+ # binary data
55
+ parse_binary_data()
56
+ end
57
+
58
+ def parse_precursor_list
59
+ @precursor_list = []
60
+ @node.xpath("precursorList/precursor").each do |p|
61
+ @precursor_list << [p[:spectrumRef], p]
62
+ end
63
+ end
64
+
65
+ def get_parent_info
66
+ unless @precursor_list.empty?
67
+ if @precursor_list[0][1].xpath("selectedIonList/selectedIon/cvParam/@accession='MS:1000744'")
68
+ @precursor_mass = @precursor_list[0][1].xpath("selectedIonList/selectedIon/cvParam[@accession='MS:1000744']")[0][:value].to_f
69
+ end
70
+ if @precursor_list[0][1].xpath("selectedIonList/selectedIon/cvParam/@accession='MS:1000042'")
71
+ @precursor_intensity = @precursor_list[0][1].xpath("selectedIonList/selectedIon/cvParam[@accession='MS:1000042']")[0][:value].to_f
72
+ end
73
+ end
74
+ end
75
+
76
+ def parse_scan_list
77
+ @scan_list = @node.xpath("scanList/scan")
78
+ if @node.xpath("scanList/scan/cvParam/@accession='MS:1000016'")
79
+ @retention_time = @node.xpath("scanList/scan/cvParam[@accession='MS:1000016']")[0][:value]
80
+ end
81
+ end
82
+
83
+ def parse_binary_data
84
+ @node.xpath("binaryDataArrayList/binaryDataArray").each do |bd|
85
+ if bd.xpath("cvParam/@accession='MS:1000523'")
86
+ # "64-bit float"
87
+ decode_type = "E*"
88
+ else
89
+ # 32-bit float
90
+ decode_type = "e*"
91
+ end
92
+ data = Base64.decode64(bd.xpath("binary").text)
93
+ # compressed?
94
+ if bd.xpath("cvParam/@accession='MS:1000574'")
95
+ data = Zlib::Inflate.inflate(data)
96
+ end
97
+ # m/z or intensity data?
98
+ if bd.xpath("cvParam/@accession='MS:1000514'")
99
+ # m/z data
100
+ @mz = data.unpack(decode_type)
101
+ else
102
+ @intensity = data.unpack(decode_type)
103
+ end
104
+ end
105
+ end
106
+ end
107
+ end
@@ -0,0 +1,4 @@
1
+ module MzML
2
+ VERSION = "0.3.0"
3
+ end
4
+