ms-msrun 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,297 @@
1
+
2
+ require 'ms/scan'
3
+ require 'ms/precursor'
4
+ require 'ms/spectrum'
5
+ require 'ms/msrun/search'
6
+
7
+ module Ms; end
8
+ class Ms::Msrun
9
+
10
# Retention times, in seconds, of the first and of the last scan. Both come
# from the scans themselves, regardless of any meta-data written in the
# header.
attr_accessor :start_time, :end_time

# An array of scans.
attr_accessor :scans

# The filetype. Valid types (for parsing) are :mzxml, :mzdata and :mzml.
attr_accessor :filetype

# The string passed in to open the file for reading.
attr_accessor :filename

# The version string of this type of file.
attr_accessor :version

# The total number of scans (writer only).
attr_writer :scan_count

# The basename of the parent file listed (e.g., a .RAW file). Note that in
# v1 mzXML this will be *.mzXML while in later versions it's *.RAW.
# See parent_basename_noext for a more robust value.
attr_accessor :parent_basename

# The location of the parent file (e.g., a .RAW file). In mzXML v1 this
# will be nil.
attr_accessor :parent_location
41
+
42
# Opens +filename+ for reading, builds an Msrun on the open handle and hands
# it to the block. The handle is closed when the block returns; the block's
# value is returned.
def self.open(filename, &block)
  File.open(filename) do |io|
    msrun = self.new(io, filename)
    block.call(msrun)
  end
end
46
+
47
# Builds an Msrun from an io object. Prefer Msrun.open, which keeps the io
# object available for the lazy evaluation of spectra.
#
# The filetype and version are sniffed from the io, the matching parser is
# looked up, and that parser fills in this object.
def initialize(io, filename=nil)
  @filename = filename
  @scan_counts = nil
  @filetype, @version = Utils.filetype_and_version(io)
  Utils.get_parser(@filetype, @version).new.parse(self, io, @version)
  @scan_counts = nil # <- to keep warnings away
end
58
+
59
# The parent basename with its file extension (if any) removed.
def parent_basename_noext
  extension = File.extname(@parent_basename)
  @parent_basename.chomp(extension)
end
62
+
63
# Iterates over the scans, passing each one to the block.
def each(&block)
  all_scans = scans
  all_scans.each(&block)
end
67
+
68
# Opens +filename+ and passes every scan of the run to the block.
def self.foreach(filename, &block)
  self.open(filename) { |msrun| msrun.each(&block) }
end
74
+
75
# Groups the scans by ms level.
#
# Returns an array indexed by ms level. Each populated slot holds the array
# of scans at that level, in their original order; slots for levels that do
# not occur are nil.
#
# (Previously each slot was overwritten and ended up holding only the last
# scan seen at that level.)
def scans_by_ms_level
  by_level = []
  scans.each do |scan|
    (by_level[scan.ms_level] ||= []) << scan
  end
  by_level
end
82
+
83
# Returns an array of scan counts indexed by ms level: [0] = all the scans,
# [1] = number of ms level 1 scans, [2] = number of ms level 2 scans, ...
# The result is computed once and cached.
def scan_counts
  @scan_counts ||= scans.inject([0]) do |tally, scan|
    lvl = scan.ms_level
    tally[lvl] = (tally[lvl] || 0) + 1
    tally[0] += 1
    tally
  end
end
99
+
100
# Number of scans at the given ms level (0, the default, means all scans).
#
# Uses the cached per-level counts when they exist. Otherwise level 0 is
# answered from the stored total and any other level is counted by walking
# the scans.
def scan_count(mslevel=0)
  return @scan_counts[mslevel] if @scan_counts
  return @scan_count if mslevel == 0
  scans.inject(0) do |total, sc|
    sc.ms_level == mslevel ? total + 1 : total
  end
end
117
+
118
# Returns [start_mz (rounded down to nearest int), end_mz (rounded up to
# nearest int)] for the scans of the given ms level.
#
# For level 1 the first scan of that level is asked for its start_mz/end_mz
# attributes and, when both are present, they are used directly (level 1
# scans are expected to share the same range). Otherwise every scan of the
# level contributes its start_mz/end_mz attributes when it has them, or
# else the first and last m/z of its spectrum, and the overall min/max is
# taken.
#
# Returns [nil, nil] when there is no scan (or no m/z data) at that level.
#
# NOTE(review): the spectrum fallback relies on spectrum.mzs being sorted
# ascending (first = lowest, last = highest) -- confirm.
def start_and_end_mz(mslevel=1)
  level_scans = scans.select {|sc| sc.ms_level == mslevel }
  return [nil, nil] if level_scans.empty?

  if mslevel == 1
    first_scan = level_scans.first
    if first_scan.start_mz && first_scan.end_mz
      return [first_scan.start_mz.floor, first_scan.end_mz.ceil]
    end
  end

  lo_mz = nil
  hi_mz = nil
  level_scans.each do |sc|
    if sc.start_mz && sc.end_mz
      lo = sc.start_mz
      hi = sc.end_mz
    else
      # attributes missing: fall back to the peak data itself
      mz_ar = sc.spectrum.mzs
      lo = mz_ar.first
      hi = mz_ar.last
    end
    next unless lo && hi # e.g. an empty spectrum
    lo_mz = lo if lo_mz.nil? || lo < lo_mz
    hi_mz = hi if hi_mz.nil? || hi > hi_mz
  end
  [lo_mz && lo_mz.floor, hi_mz && hi_mz.ceil]
end
180
+
181
# Returns [times, spectra]: an array of scan times and a parallel array of
# the matching spectrum objects.
#   ms_level = 0  -> every scan
#   ms_level = n  -> only the scans whose ms level is n
def times_and_spectra(ms_level=0)
  chosen =
    if ms_level == 0
      @scans
    else
      @scans.select { |scan| ms_level == scan.ms_level }
    end
  times = []
  spectra = []
  chosen.each do |scan|
    spectra << scan.spectrum
    times << scan.time
  end
  [times, spectra]
end
203
+ end
204
+
205
+
206
+ module Ms::Msrun::Axml ; end # so we can get our parser
207
+
208
+ module Ms::Msrun::Utils
209
+
210
# Loads "ms/msrun/axml/<filetype>" and returns the parser class of the same
# (capitalized) name under Ms::Msrun::Axml. +version+ is accepted but not
# consulted here.
#
# Raises RuntimeError when the file was loaded but defines no such class.
def self.get_parser(filetype, version)
  require "ms/msrun/axml/#{filetype}"
  namespace = Ms::Msrun::Axml
  class_name = filetype.to_s.capitalize
  unless namespace.const_defined? class_name
    raise RuntimeError, "no class #{namespace}::#{class_name}"
  end
  namespace.const_get class_name
end
220
+
221
# Links every scan with ms level > 1 to its parent scan by filling slot [2]
# of the scan's precursor. Only adds the parent if one is not already
# present!
#
# The parent is tracked with a stack: when the ms level rises, the previous
# scan becomes the current parent; when it falls, the stack is popped once
# per level dropped.
#
# When +add_intensities+ is true, slot [1] of the precursor is set to the
# parent spectrum's intensity at the precursor m/z (slot [0]).
#
# Tolerates an empty scan list, scans above level 1 that carry no
# precursor, and precursors for which no parent scan is known (these are
# left without a parent / recomputed intensity). Returns +scans+.
def self.add_parent_scan(scans, add_intensities=false)
  return scans if scans.empty?

  prev_scan = nil
  parent_stack = [nil]
  # we want to set the level to be the first mslevel we come to
  prev_level = scans.first.ms_level
  scans.each do |scan|
    level = scan.ms_level
    if prev_level < level
      parent_stack.unshift prev_scan
    elsif prev_level > level
      (prev_level - level).times { parent_stack.shift }
    end
    if level > 1
      precursor = scan.precursor
      if precursor
        precursor[2] = parent_stack.first unless precursor[2]
        if add_intensities && precursor[2]
          precursor[1] = precursor[2].spectrum.intensity_at_mz(precursor[0])
        end
      end
    end
    prev_level = level
    prev_scan = scan
  end
end
249
+
250
# Schema locations that identify an mzXML file, e.g.:
#   'http://sashimi.sourceforge.net/schema/MsXML.xsd'            # version 1
#   'http://sashimi.sourceforge.net/schema_revision/mzXML_X.X'   # others
# Capture 2 is the schema name (e.g. 'MsXML.xsd' or 'mzXML_2.1').
Mzxml_regexp = /http:\/\/sashimi.sourceforge.net\/schema(_revision)?\/([\w\d_\.]+)/o
# An <mzData ...> tag carrying a version attribute (capture 1).
Mzdata_regexp = /<mzData.*version="([\d\.]+)"/m
# Skips to byte offset 2, then takes every other byte (8 of them). Applied
# to the first 18 bytes of a file this yields 'Finnigan' for a RAW file
# (presumably a 2-byte magic followed by a 2-byte-per-char signature).
Raw_header_unpack_code = '@2axaxaxaxaxaxaxa'
# The mzML schema location; capture 1 is the version.
Mzml_regexp = /http:\/\/psi.hupo.org\/schema_revision\/mzML_([\w\d\.]+)/o

# Sniffs the file type and version.
#
# file_or_io:: an IO-like object (anything responding to #gets and #rewind,
#              so StringIO works too) or a filename to open.
#
# Returns [filetype, version] where filetype is :raw, :mzml, :mzxml or
# :mzdata (version is nil for :raw), or nil when nothing is recognized.
# An IO passed in is rewound before returning.
#
# NOTE(review): an unrecognized mzXML schema name still calls +abort+,
# which terminates the process; kept as is for compatibility.
def self.filetype_and_version(file_or_io)
  unless file_or_io.respond_to?(:gets) && file_or_io.respond_to?(:rewind)
    return File.open(file_or_io) {|io| filetype_and_version(io) }
  end

  io = file_or_io
  # Test for RAW file. Inputs shorter than the header (or empty, where
  # read returns nil) cannot be RAW and must not reach unpack.
  header = io.read(18)
  is_raw = header && header.size == 18 &&
    header.unpack(Raw_header_unpack_code).join == 'Finnigan'
  io.rewind
  return [:raw, nil] if is_raw

  found = nil
  while (line = io.gets)
    found =
      case line
      when Mzml_regexp
        [:mzml, Regexp.last_match(1).dup]
      when Mzxml_regexp
        case Regexp.last_match(2)
        when /mzXML_([\d\.]+)/
          [:mzxml, Regexp.last_match(1).dup]
        when /MsXML/
          [:mzxml, '1.0']
        else
          abort "Cannot determine mzXML version!"
        end
      when Mzdata_regexp
        [:mzdata, Regexp.last_match(1).dup]
      end
    break if found
  end
  io.rewind
  found
end
297
+ end
@@ -0,0 +1,141 @@
1
+
2
+ require 'ms/msrun'
3
+ require 'ms/precursor'
4
+ require 'axml'
5
+
6
+ module Ms
7
+ class Msrun
8
+ module Axml
9
+ end
10
+ end
11
+ end
12
+
13
+ class Ms::Msrun::Axml::Mzxml
14
+ NetworkOrder = true
15
+
16
# Parses an mzXML document from +io+ and fills in +msrun_obj+ (parent file
# name/location, scan count, scans, start and end time).
#
# version:: the mzXML version as a string
#
# Returns the array of scans.
def parse(msrun_obj, io, version)
  root = AXML.parse(io, :text_indices => 'peaks', :parser => :xmlparser)
  msrun_n = msrun_node(root, version)

  # The filename (backslashes normalized on a copy so the attribute string
  # held by the node is left untouched)
  parent_n = msrun_n.find_first_child('parentFile')
  fn = parent_n['fileName'].gsub(/\\/, '/')
  msrun_obj.parent_basename = File.basename(fn)
  dn = File.dirname(fn)
  dn = nil if dn == '.' && !fn.include?('/')
  msrun_obj.parent_location = dn

  ## HEADER
  scan_count = msrun_n['scanCount'].to_i
  msrun_obj.scan_count = scan_count

  # lookup table: scan number -> scan
  scans_by_num = Array.new(scan_count + 1)

  ## SPECTRUM
  # Grown as scans are added rather than presized from scanCount, so a
  # header that overstates the count cannot leave nil entries behind.
  scans = []

  if version >= '3.0'
    warn '[version 3.0 parsing may fail if > 1 peak list per scan]'
    # note that mzXML version 3.0 *can* have more than one peak...
    # I'm not sure how to deal with that since I have one spectrum/scan
  end

  scan_nodes = msrun_n.find_children('scan')
  add_scan_nodes(scan_nodes, scans, 0, scans_by_num, version, io)

  unless scans.empty?
    ## update the scan's parents
    Ms::Msrun::Utils.add_parent_scan(scans)

    # note that startTime and endTime are optional AND in >2.2 are dateTime
    # instead of duration types!, so we will just use scan times...
    # Also, note that startTime and endTime are BROKEN on readw -> mzXML 2.0
    # export. They give the start and end time in seconds, but they are
    # really minutes. All the more reason to use the first and last scans!
    msrun_obj.start_time = scans.first.time
    msrun_obj.end_time = scans.last.time
  end
  msrun_obj.scans = scans
end
62
+
63
# Builds a scan object from a scan node.
#
# The node's children are walked once: a 'precursorMz' child becomes the
# scan's precursor (slot [7]; its parent is looked up in +scans_by_num+ when
# the node names a precursorScanNum) and a 'peaks' child becomes a lazily
# read spectrum (slot [8]).
#
# Raises RuntimeError if the node has more than one precursorMz child.
def create_scan(scan_n, scans_by_num, io=nil)
  scan = new_scan_from_node(scan_n)
  precursor = nil
  scan_n.each do |child|
    child_name = child.name
    if child_name == 'precursorMz'
      unless precursor.nil?
        raise RuntimeError, "the msrun object can only handle one precursor!"
      end
      precursor = Ms::Precursor.new
      precursor[1] = child['precursorIntensity'].to_f
      precursor[0] = child.content.to_f
      parent_num = child['precursorScanNum']
      precursor[2] = scans_by_num[parent_num.to_i] if parent_num
    elsif child_name == 'peaks'
      # assumes that parsing was done with a LazyPeaks parser!
      indices = child.text
      precision = child['precision'].to_i
      scan[8] = Ms::Spectrum.lazy(io, indices.first, indices.last, precision, NetworkOrder)
    end
  end
  scan[7] = precursor
  scan
end
95
+
96
+
97
# Creates a scan for every node in +nodes+, storing it in +scans+ at the
# running index and in +scans_by_num+ under its scan number (slot [0]).
# For versions above '1.0' any scan nodes nested inside a scan node are
# added recursively, right after their enclosing scan.
#
# Returns the next free index in +scans+.
def add_scan_nodes(nodes, scans, scn_index, scans_by_num, version, io)
  nodes.each do |scan_n|
    scan = create_scan(scan_n, scans_by_num, io)
    scans[scn_index] = scan
    scans_by_num[scan[0]] = scan
    scn_index += 1
    next unless version > '1.0'
    nested = scan_n.find('child::scan')
    next unless nested.size > 0
    scn_index = add_scan_nodes(nested, scans, scn_index, scans_by_num, version, io)
  end
  scn_index
end
115
+
116
# Returns the node that holds the run: for versions >= '2.0' the (single)
# 'msRun' child of +node+, otherwise +node+ itself.
#
# Raises NotImplementedError if there is more than one msRun child.
def msrun_node(node, version)
  return node unless version >= '2.0'
  runs = node.children.select { |child| child.name == 'msRun' }
  raise(NotImplementedError, "one msrun per doc right now") if runs.size > 1
  runs.first
end
125
+
126
# Creates a scan from the attributes of a scan node.
#
# Slots filled here: [0] num, [1] msLevel, [2] retentionTime in seconds,
# [3] startMz, [4] endMz, [5] peaksCount, [6] totIonCurrent. Slots [3] and
# [4] are only set when the node carries a startMz attribute; retentionTime
# only when present.
#
# peaksCount and totIonCurrent are read regardless of startMz (an absent
# attribute reads as 0 / 0.0). They used to be skipped for scans without
# startMz, leaving nil in slots that are compared numerically elsewhere.
def new_scan_from_node(node)
  scan = Ms::Scan.new # array class creates one with 9 positions
  scan[0] = node['num'].to_i
  scan[1] = node['msLevel'].to_i
  if rt = node['retentionTime']
    # drops the first two characters and the last one before converting
    # (presumably a duration such as "PT12.5S" -- confirm)
    scan[2] = rt[2...-1].to_f
  end
  if start_mz = node['startMz']
    scan[3] = start_mz.to_f
    scan[4] = node['endMz'].to_f
  end
  scan[5] = node['peaksCount'].to_i
  scan[6] = node['totIonCurrent'].to_f
  scan
end
141
+ end
@@ -0,0 +1,118 @@
1
+
2
+ module Ms
3
+ class Msrun
4
+
5
+ #config :first_scan, 0, :short => 'F', &c.integer # first scan
6
+ #config :last_scan, 1e12, :short => 'L', &c.integer # last scan
7
+ ## if not determined to be +1, then create these charge states
8
+ #config( :charge_states, [2,3], :short => 'c') {|v| v.split(',') }
9
+ #config :bottom_mh, 0, :short => 'B', &c.float # bottom MH+
10
+ #config :top_mh, -1.0, :short => 'T', &c.float # top MH+
11
+ #config :min_peaks, 0, :short => 'P', &c.integer # minimum peak count
12
+ #config :ms_levels, 2..-1, :short => 'M', &c.range # ms levels to export
13
+
14
module Search
  require 'stringio' # any_input hands out a StringIO when no destination is given

  PROTON_MASS = 1.007276

  # Exports the scans as MGF.
  #
  # file_or_io:: nil (build and return a String), a filename (String) to
  #              write to, or an IO-like object to write into.
  # opts::
  #   :first_scan / :last_scan  scan number window (:last_scan nil or -1 =
  #                             number of the last scan)
  #   :ms_levels                Integer or Range of ms levels to export; a
  #                             Range ending in -1 runs to the last level
  #   :min_peaks                minimum number of peaks a scan must have
  #   :bottom_mh / :top_mh      MH+ window (:top_mh nil or -1 = no limit)
  #   :charge_states            charge states to write for scans that are
  #                             not determined to be +1 (default [2, 3])
  #   :prec_mz_precision, :prec_int_precision,
  #   :frag_mz_precision, :frag_int_precision   decimals to print
  #
  # Scans that have no precursor m/z are skipped.
  #
  # Returns the MGF text when file_or_io is nil; otherwise the number of
  # spectra (BEGIN IONS blocks) written.
  def to_mgf(file_or_io=nil, opts={})
    opts = {
      :bottom_mh => 0.0,
      :top_mh => nil,
      :ms_levels => (2..-1), # range or integer, -1 at end will be substituted for last level
      :min_peaks => 0,
      :first_scan => 0,
      :last_scan => nil,
      :charge_states => [2, 3],
      :prec_mz_precision => 6,
      :prec_int_precision => 2,
      :frag_mz_precision => 5,
      :frag_int_precision => 1,
    }.merge(opts)

    first_scan = opts[:first_scan]
    last_scan = opts[:last_scan]
    bottom_mh = opts[:bottom_mh]
    top_mh = opts[:top_mh]
    ms_levels = opts[:ms_levels]
    min_peaks = opts[:min_peaks]
    default_charges = opts[:charge_states] || [2, 3]

    sep = ' '

    top_mh = nil if top_mh == -1

    if last_scan.nil? || last_scan == -1
      # stays nil for an empty run, where the scan loop never executes
      last_scan = scans.last && scans.last.num
    end

    if !ms_levels.is_a?(Integer) && ms_levels.last == -1
      ms_levels = ((ms_levels.first)..(scan_counts.size - 1))
    end

    prec_string = "PEPMASS=%0.#{opts[:prec_mz_precision]}f %0.#{opts[:prec_int_precision]}f\n"
    frag_string = "%0.#{opts[:frag_mz_precision]}f%s%0.#{opts[:frag_int_precision]}f\n"

    any_input(file_or_io) do |out, out_type|
      count = 0
      scans.each do |scan|
        sn = scan.num

        # === so that both an Integer and a Range of levels work
        next unless ms_levels === scan.ms_level
        next unless sn >= first_scan && sn <= last_scan
        next unless scan.num_peaks.to_i >= min_peaks

        precursor = scan.precursor
        pmz = precursor && precursor.mz
        next unless pmz # MH+ cannot be computed without a precursor m/z

        # tic under precursor > 95% and true = save the spectrum info
        scan.spectrum.save!
        # decided per scan: a +1 scan must not change the charge states
        # used for the scans that follow it
        charges = scan.plus1?(0.95) ? [1] : default_charges

        charges.each do |z|
          # (scanHeader.precursorMZ * iCharge) - (iCharge - 1)*dChargeMass;
          mh = (pmz * z) - (z - 1) * PROTON_MASS
          next unless mh >= bottom_mh
          next if top_mh && mh > top_mh
          out.puts "BEGIN IONS"
          out.puts "TITLE=#{self.parent_basename_noext}.#{sn}.#{sn}.#{z}"
          out.puts "CHARGE=#{z}+"
          out.printf(prec_string, pmz, precursor.intensity.to_f)
          scan.spectrum.peaks do |mz, int|
            out.printf(frag_string, mz, sep, int)
          end
          out.puts "END IONS\n\n"
          count += 1
        end

        scan.spectrum.flush!
      end

      out_type == :string_io ? out.string : count
    end
  end

  # Yields an output object and the type of input it came from:
  #   an IO-like object (responds to puts and printf) -> itself, :io
  #   a String  -> that file opened for writing, :filename
  #   anything else (e.g. nil) -> a fresh StringIO, :string_io
  # Returns the value of the block.
  def any_input(arg, &block)
    if arg.respond_to?(:puts) && arg.respond_to?(:printf)
      block.call(arg, :io)
    elsif arg.is_a?(String)
      File.open(arg, 'w') do |io|
        block.call(io, :filename)
      end
    else
      block.call(StringIO.new, :string_io)
    end
  end

end
115
+
116
+ include Search
117
+ end
118
+ end