mzml 0.2.2 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore CHANGED
@@ -14,9 +14,12 @@ tmtags
14
14
  *.swp
15
15
 
16
16
  ## PROJECT::GENERAL
17
+ Gemfile.lock
17
18
  coverage
18
19
  rdoc
19
20
  doc
20
21
  pkg
22
+ .yardoc
21
23
 
22
24
  ## PROJECT::SPECIFIC
25
+ scratch.rb
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source "http://rubygems.org"
2
+
3
+ # Specify your gem's dependencies in mzml.gemspec
4
+ gemspec
data/Rakefile CHANGED
@@ -1,44 +1,20 @@
1
- require 'rubygems'
2
- require 'rake'
3
-
4
- begin
5
- require 'jeweler'
6
- Jeweler::Tasks.new do |gem|
7
- gem.name = "mzml"
8
- gem.summary = %Q{A non-validating mzML parser}
9
- gem.description = %Q{A non-validating mzML parser. MzML is a standard data format for representing mass spectrometry data.}
10
- gem.email = "angel@delagoya.com"
11
- gem.homepage = "http://github.com/delagoya/mzml"
12
- gem.authors = ["Angel Pizarro"]
13
- gem.add_development_dependency "rspec", "1.3.0"
14
- # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
15
- gem.add_dependency "nokogiri", "1.4.1"
16
-
17
- end
18
- Jeweler::GemcutterTasks.new
19
- rescue LoadError
20
- puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
21
- end
22
-
23
- require 'spec/rake/spectask'
24
- Spec::Rake::SpecTask.new(:spec) do |spec|
25
- spec.libs << 'lib' << 'spec'
26
- spec.spec_files = FileList['spec/**/*_spec.rb']
27
- end
1
+ require 'bundler'
2
+ Bundler::GemHelper.install_tasks
28
3
 
29
- Spec::Rake::SpecTask.new(:rcov) do |spec|
30
- spec.libs << 'lib' << 'spec'
31
- spec.pattern = 'spec/**/*_spec.rb'
32
- spec.rcov = true
4
+ # test
5
+ require 'rake'
6
+ require 'rake/testtask'
7
+ Rake::TestTask.new do |t|
8
+ t.libs << "test"
9
+ t.test_files = FileList['test/test*.rb']
10
+ t.verbose = true
33
11
  end
34
12
 
35
- task :spec => :check_dependencies
36
-
37
- task :default => :spec
38
-
13
+ # documentation
39
14
  require 'yard'
40
- YARD::Rake::YardocTask.new do |yardoc|
41
- version = File.exist?('VERSION') ? File.read('VERSION') : ""
42
- yardoc.options = ["--title", "mzml #{version}", "-r", "README.rdoc"]
43
- yardoc.files = ['README*','lib/**/*.rb']
15
+ require 'yard/rake/yardoc_task'
16
+ YARD::Rake::YardocTask.new do |t|
17
+ t.files = ['lib/**/*.rb', '-' , "README.rdoc","LICENSE"]
18
+ t.options = ["-r","README.rdoc"]
44
19
  end
20
+
@@ -1,23 +1,7 @@
1
- #!/opt/local/bin/ruby
2
-
3
- ################################
4
- ####
5
- ##
6
- # David Austin - UPENN
7
- # converts mzML to MGF format
8
- # set up to replicate msconvert but muuchh slower
9
- #
10
-
11
- require 'rubygems'
1
+ #!/usr/bin/env ruby
12
2
  require 'mzml'
13
-
14
-
15
- #first load nokogiri document
16
-
17
3
  mzml = MzML::Doc.new(ARGV[0])
18
4
 
19
- #now loop through each spectrum.. sort first to be the same as msconvert
20
-
21
5
  sorted_keys = mzml.parse_index_list[:spectrum].keys.sort{ |x,y| x.split('=')[3].to_i <=> y.split('=')[3].to_i }
22
6
 
23
7
  sorted_keys.each do |k|
@@ -0,0 +1,80 @@
1
+ require 'base64'
2
+ require 'zlib'
3
+
4
module MzML
  # A single chromatogram element of an mzML document.
  #
  # The object is built from the XML node of one chromatogram element. The
  # element attributes, its direct cvParam children and the decoded binary
  # arrays are all read once, in the constructor.
  class Chromatogram
    # cvParam accession flagging a binary array as 64-bit floats; an array
    # without it is decoded as 32-bit floats.
    FLOAT_64_ACCESSION = 'MS:1000523'
    # cvParam accession flagging a binary array as zlib-compressed.
    ZLIB_ACCESSION = 'MS:1000574'
    # cvParam accession flagging a binary array as the time array.
    TIME_ARRAY_ACCESSION = 'MS:1000595'

    # Canonical ID of the chromatogram
    attr_reader :id

    # Value of the defaultArrayLength attribute, as an Integer
    attr_reader :default_array_length

    # The positional index of the chromatogram in the mzML document
    attr_reader :index_position
    alias_method :index, :index_position

    # Never assigned by this class; kept for interface compatibility
    attr_reader :default_processing_ref

    # Decoded values of the time array
    attr_reader :timepoint

    # The unit of time that the timepoints are measured in (e.g. seconds,
    # minutes, ...); nil when the time array's cvParam has no unitName
    attr_reader :time_unit

    # Decoded values of the intensity array
    attr_reader :intensity

    # Nokogiri::XML::Node of the document
    attr_reader :node

    # CV param attributes: an Array with one Hash per direct cvParam child,
    # keyed by :name, :value, :accession and :cv
    attr_reader :params

    # @param node the XML node of the chromatogram element; it must answer
    #   to #[] for attributes and to #xpath
    def initialize(node)
      @node = node
      # an Array from the start, matching what parse_element stores
      @params = []
      parse_element
    end

    protected

    # Reads the element attributes, the cvParam children and the binary data.
    def parse_element
      @id = @node[:id]
      @index_position = @node[:index].to_i
      @default_array_length = @node[:defaultArrayLength].to_i
      # CV parameters
      @params = @node.xpath("./cvParam").map do |prm|
        { :name => prm[:name],
          :value => prm[:value],
          :accession => prm[:accession],
          :cv => prm[:cvRef] }
      end
      # binary data
      parse_binary_data
    end

    # Decodes every binaryDataArray. The array carrying the time-array
    # cvParam becomes {#timepoint}; any other array becomes {#intensity}.
    def parse_binary_data
      @node.xpath("./binaryDataArrayList/binaryDataArray").each do |bd|
        values = decode_binary_array(bd)
        time_param = bd.xpath("cvParam[@accession='#{TIME_ARRAY_ACCESSION}']").first
        if time_param
          # node[:unitName] is nil when the attribute is absent, so a
          # missing unit no longer aborts parsing
          @time_unit = time_param[:unitName]
          @timepoint = values
        else
          @intensity = values
        end
      end
    end

    # Base64-decodes, optionally inflates and unpacks one binaryDataArray.
    # @return [Array<Float>]
    def decode_binary_array(bd)
      # "E*" = little-endian doubles, "e*" = little-endian singles
      decode_type = cv_param?(bd, FLOAT_64_ACCESSION) ? "E*" : "e*"
      data = Base64.decode64(bd.xpath("binary").text)
      data = Zlib::Inflate.inflate(data) if cv_param?(bd, ZLIB_ACCESSION)
      data.unpack(decode_type)
    end

    # True when the element has a direct cvParam child with the accession.
    # The XPath expression is a comparison, so it evaluates to a boolean.
    def cv_param?(element, accession)
      element.xpath("cvParam/@accession='#{accession}'")
    end
  end
end
data/lib/mzml/doc.rb ADDED
@@ -0,0 +1,185 @@
1
+ #--
2
+ # This program is free software; you can redistribute it and/or modify
3
+ # it under the terms of the GNU Library or "Lesser" General Public
4
+ # License (LGPL) as published by the Free Software Foundation;
5
+ # either version 2 of the License, or (at your option) any later
6
+ # version.
7
+ # Author: Angel Pizarro
8
+ # Date: 12/05/2009
9
+ # Copyright: Angel Pizarro, Copyright (c) University of Pennsylvania. All rights reserved.
10
+ #
11
+
12
+ # == MzML
13
+ #
14
+ # A non-validating mzML v 1.1.0 parser. Most annotation is left as XML DOM
15
+ # objects. See the Nokogiri::XML::Node and Nokogiri::XML::NodeSet
16
+ # documentation on how to work with these.
17
+ #
18
+ # ===USAGE:
19
+ #
20
+ # require 'mzml'
21
+ # mzml = MzML::Doc.new("test.mzML")
22
+ module MzML
23
+
24
+ # An internal module containing useful regular expressions
25
+ module RGX
26
+ # The file byte offset of the start of the file index
27
+ INDEX_OFFSET = /<indexListOffset>(\d+)<\/indexListOffset>/
28
+ # The start of a either a spectrumList or chromatographList
29
+ DATA_LIST_START = /<(spectrum|chromatogram)List\s.*count\=["'](\d+)/m
30
+ # The start spectrum or chromatogram element
31
+ DATA_START = /<(spectrum|chromatogram)\s.*id=["']([^'"]+)["']/m
32
+ # The end spectrum or chromatogram element
33
+ DATA_END = /(<\/(spectrum|chromatogram)>)/
34
+ end
35
+
36
+ class UnsupportedFileFormat < Exception
37
+ end
38
+ class BadIdentifier < Exception
39
+ end
40
+
41
+ # The main mzML parser class, it is a subclass of the File class from the
42
+ # Ruby standard library in that it places a read cursor on the mzML file,
43
+ # and will skip around using byte-offsets. We utilize the index at the
44
+ # end of mzML files to facilitate random access of spectra.
45
+ #
46
+ # The {#each} method will cycle through all of the spectrum in a file, starting
47
+ # from the first one each time. If you would rather access the spectra randomly,
48
+ # the {#spectrum_list} attribute contains the ordered list of specturm identifiers.
49
+ # You can access the MzML::Spectrum objects by feeding these identifiers to the {#spectrum}
50
+ # method.
51
+ class Doc < ::File
52
+
53
+ # Open a file handle to a mzML document
54
+ def initialize(mz_fname)
55
+ unless mz_fname =~ /\.mzML$/
56
+ raise MzML::UnsupportedFileFormat.new "File extension must be .\"mzML\""
57
+ end
58
+ super(mz_fname, "r")
59
+ @fname = mz_fname
60
+ @index = parse_index_list
61
+ @spectrum_count = @spectrum_list.length
62
+ @chromatogram_count = @chromatogram_list.length
63
+ @current_spectrum_index = 0
64
+ end
65
+ attr_reader :index, :fname, :spectrum_list, :spectrum_count, :chromatogram_list, :chromatogram_count
66
+
67
+ # Fetch a {MzML::Chromatogram} from the file, given the identifier
68
+ # @param chromatogram_id String
69
+ # @return {MzML::Chromatogram}
70
+ def chromatogram(chromatogram_id)
71
+ if @index[:chromatogram].has_key? chromatogram_id
72
+ self.seek @index[:chromatogram][chromatogram_id]
73
+ return MzML::Chromatogram.new(parse_next)
74
+ else
75
+ raise MzML::BadIdentifier.new("Invalid ID '#{chromatogram_id}'")
76
+ end
77
+ end
78
+
79
+ def spectrum(spectrum_id)
80
+ if @index[:spectrum].has_key? spectrum_id
81
+ self.seek @index[:spectrum][spectrum_id]
82
+ return MzML::Spectrum.new(parse_next())
83
+ else
84
+ raise MzML::BadIdentifier.new("Invalid ID '#{spectrum_id}'")
85
+ end
86
+ end
87
+
88
+ def each &block
89
+ @spectrum_list.each do |spectrum_id|
90
+ block.call(self.spectrum(spectrum_id))
91
+ @current_spectrum_index += 1
92
+ end
93
+ end
94
+ alias_method :each_spectrum, :each
95
+
96
+ def next &block
97
+ if @current_spectrum_index < @spectrum_list.length
98
+ @current_spectrum_index += 1
99
+ self.spectrum(@spectrum_list[@current_spectrum_index - 1])
100
+ else
101
+ nil
102
+ end
103
+ end
104
+ alias_method :next_spectrum, :next
105
+
106
+ def rewind
107
+ super
108
+ @current_spectrum_index = 0
109
+ end
110
+
111
+ private
112
+ # Parses the IndexList
113
+ def parse_index_list
114
+ self.seek(self.stat.size - 200)
115
+ # parse the index offset
116
+ tmp = self.read
117
+ tmp =~ MzML::RGX::INDEX_OFFSET
118
+ offset = $1
119
+ # if I didn't match anything, compute the index and return
120
+ unless (offset)
121
+ return compute_index_list
122
+ end
123
+ @index = {}
124
+ @spectrum_list = []
125
+ @chromatogram_list = []
126
+ self.seek(offset.to_i)
127
+ tmp = Nokogiri::XML.parse(self.read).root
128
+ tmp.css("index").each do |idx|
129
+ index_type = idx[:name].to_sym
130
+ @index[index_type] = {}
131
+ idx.css("offset").each do |o|
132
+ @index[index_type][o[:idRef]] = o.text().to_i
133
+ if index_type == :spectrum
134
+ @spectrum_list << o[:idRef]
135
+ else
136
+ @chromatogram_list << o[:idRef]
137
+ end
138
+ end
139
+ end
140
+ self.rewind
141
+ return @index
142
+ end
143
+
144
+ def compute_index_list
145
+ @index = Hash.new {|h,k| h[k] = {} }
146
+ # start at the beginning.
147
+ self.rewind
148
+ # fast forward to the first spectrum or chromatograph
149
+ buffer = ''
150
+ while !self.eof
151
+ buffer += self.read(1024)
152
+ if start_pos = buffer =~ MzML::RGX::DATA_START
153
+ self.seek start_pos
154
+ break
155
+ end
156
+ end
157
+
158
+ # for each particular entity start to fill in the index
159
+ buffer = ''
160
+ rgx_start = /<(spectrum|chromatogram)\s.*id=["']([^"']+)["']/
161
+ while !self.eof
162
+ buffer += self.read(1024)
163
+ if start_pos = buffer =~ rgx_start
164
+ start_pos = self.pos - buffer.length + start_pos
165
+ @index[$1.to_sym][$2] = start_pos
166
+ buffer = ''
167
+ end
168
+ end
169
+ return @index
170
+ end
171
+
172
+ def parse_next
173
+ buffer = ''
174
+ while(!self.eof)
175
+ if end_pos = buffer =~ MzML::RGX::DATA_END
176
+ extra_content = buffer.slice!((end_pos + $1.length)..-1)
177
+ self.pos -= (extra_content.length)
178
+ break
179
+ end
180
+ buffer += self.read(1024)
181
+ end
182
+ return Nokogiri::XML.parse(buffer).root
183
+ end
184
+ end
185
+ end
@@ -0,0 +1,107 @@
1
+ require 'base64'
2
+ require 'zlib'
3
+
4
module MzML
  # A single spectrum element of an mzML document.
  #
  # The object is built from the XML node of one spectrum element. The
  # element attributes, its direct cvParam children, the precursor and scan
  # lists and the decoded binary arrays are all read once, in the constructor.
  #
  # Several readers (type, title, polarity, representation, mz_node,
  # intensity_node) are declared for interface compatibility but are never
  # assigned by this class.
  class Spectrum
    attr_reader :id, :index, :default_array_length, :type,
      :precursor, :base_peak_mz, :base_peak_intensity, :ms_level,
      :high_mz, :low_mz, :title, :tic, :polarity, :representation, :mz_node, :intensity_node,
      :mz, :intensity, :precursor_list, :scan_list, :retention_time, :precursor_mass,
      :precursor_intensity, :node, :params

    # @param node the XML node of the spectrum element; it must answer to
    #   #[] for attributes and to #xpath
    def initialize(node)
      @node = node
      @params = {}
      @precursor_list = []
      parse_element
    end

    protected

    # Reads attributes, cvParams, precursor/scan lists and binary data.
    def parse_element
      # node[:name] is nil for a missing attribute instead of raising
      @id = @node[:id]
      @index = @node[:index].to_i
      @default_array_length = @node[:defaultArrayLength].to_i

      # direct cvParam children as a name => value Hash
      @params = @node.xpath("cvParam").each_with_object({}) do |prm, memo|
        memo[prm[:name]] = prm[:value]
      end

      @ms_level = @params["ms level"].to_i
      @low_mz = float_param("lowest observed m/z")
      @high_mz = float_param("highest observed m/z")
      # Parse as a float first: String#to_i stops at the first non-digit and
      # would turn a value such as "1.66755e+07" into 1.
      @tic = @params["total ion current"].to_f.to_i if @params.has_key?("total ion current")
      @base_peak_mz = float_param("base peak m/z")
      @base_peak_intensity = float_param("base peak intensity")

      parse_precursor_list
      get_parent_info
      parse_scan_list
      parse_binary_data
    end

    # The named cvParam value as a Float, or nil when the spectrum has no
    # such cvParam.
    def float_param(name)
      @params[name].to_f if @params.has_key?(name)
    end

    # Collects [spectrumRef, precursor node] pairs; empty when there are no
    # precursors.
    def parse_precursor_list
      @precursor_list = @node.xpath("precursorList/precursor").map do |p|
        [p[:spectrumRef], p]
      end
    end

    # Reads precursor mass (MS:1000744) and intensity (MS:1000042) from the
    # selected ion of the first precursor, when those cvParams are present.
    def get_parent_info
      return if @precursor_list.empty?
      precursor = @precursor_list[0][1]
      mass = selected_ion_param(precursor, 'MS:1000744')
      @precursor_mass = mass[:value].to_f if mass
      intensity = selected_ion_param(precursor, 'MS:1000042')
      @precursor_intensity = intensity[:value].to_f if intensity
    end

    # First selectedIon cvParam with the given accession, or nil.
    def selected_ion_param(precursor, accession)
      precursor.xpath("selectedIonList/selectedIon/cvParam[@accession='#{accession}']").first
    end

    # Stores the scan nodes in @scan_list (nil when the spectrum has none)
    # and the value of the first MS:1000016 scan cvParam, as a String, in
    # @retention_time.
    # @return the scan list, so assigning the result keeps @scan_list intact
    def parse_scan_list
      scans = @node.xpath("scanList/scan")
      if scans.empty?
        @scan_list = nil
        return nil
      end
      @scan_list = scans
      rt = @node.xpath("scanList/scan/cvParam[@accession='MS:1000016']").first
      @retention_time = rt[:value] if rt
      @scan_list
    end

    # Decodes every binaryDataArray. The array carrying MS:1000514 becomes
    # @mz; any other array becomes @intensity.
    def parse_binary_data
      @node.xpath("binaryDataArrayList/binaryDataArray").each do |bd|
        # MS:1000523 marks 64-bit floats ("E*"); otherwise 32-bit ("e*")
        decode_type = cv_param?(bd, 'MS:1000523') ? "E*" : "e*"
        data = Base64.decode64(bd.xpath("binary").text)
        # MS:1000574 marks a zlib-compressed array
        data = Zlib::Inflate.inflate(data) if cv_param?(bd, 'MS:1000574')
        values = data.unpack(decode_type)
        # m/z or intensity data?
        if cv_param?(bd, 'MS:1000514')
          @mz = values
        else
          @intensity = values
        end
      end
    end

    # True when the element has a direct cvParam child with the accession.
    # The XPath expression is a comparison, so it evaluates to a boolean.
    def cv_param?(element, accession)
      element.xpath("cvParam/@accession='#{accession}'")
    end
  end
end
@@ -0,0 +1,4 @@
1
module MzML
  # Release version of the mzml gem. Frozen so the shared constant cannot
  # be mutated in place.
  VERSION = "0.3.0".freeze
end
4
+