ms-msrun 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,297 @@
1
+
2
+ require 'ms/scan'
3
+ require 'ms/precursor'
4
+ require 'ms/spectrum'
5
+ require 'ms/msrun/search'
6
+
7
module Ms; end

# Holds the scans of one mass spectrometry run together with a little
# meta-data about the file they were read from. Scan objects are expected to
# respond to ms_level, start_mz, end_mz, time and spectrum (and the spectrum
# to mzs); those classes are defined outside this file.
class Ms::Msrun

  # the retention time in seconds of the first scan (regardless of any
  # meta-data written in the header)
  attr_accessor :start_time
  # the retention time in seconds of the last scan (regardless of any
  # meta-data written in the header)
  attr_accessor :end_time
  # an array of scans
  attr_accessor :scans

  # The filetype. Valid types (for parsing) are:
  #   :mzxml
  #   :mzdata
  #   :mzml
  attr_accessor :filetype

  # the string passed in to open the file for reading
  attr_accessor :filename

  # The version string of this type of file
  attr_accessor :version
  # the total number of scans (as reported by the parser; read it back with
  # scan_count)
  attr_writer :scan_count

  # The basename of the parent file listed (e.g., a .RAW file). Note that in
  # v1 mzXML this will be *.mzXML while in later versions it's *.RAW.
  # See parent_basename_noext for more robust value
  attr_accessor :parent_basename

  # The location of the parent file (e.g., a .RAW file). In version mzXML v1
  # this will be nil.
  attr_accessor :parent_location

  # Opens the filename, builds an Msrun from it and hands it to the block.
  # The file is closed when the block returns; the block's value is returned.
  def self.open(filename, &block)
    File.open(filename) { |io| block.call(new(io, filename)) }
  end

  # takes an io object. The preferred way to access Msrun objects is through
  # the open method since it ensures that the io object will be available for
  # the lazy evaluation of spectra.
  def initialize(io, filename=nil)
    @scan_counts = nil
    @filename = filename
    @filetype, @version = Utils.filetype_and_version(io)
    parser = Utils.get_parser(@filetype, @version)
    parser.new.parse(self, io, @version)
    # reset after parsing so scan_counts is always computed from the final
    # list of scans
    @scan_counts = nil
  end

  # parent_basename with its file extension removed (nil when no parent
  # basename is known).
  def parent_basename_noext
    return nil unless @parent_basename
    @parent_basename.chomp(File.extname(@parent_basename))
  end

  # yields each scan
  def each(&block)
    scans.each(&block)
  end

  # opens the file and yields each scan in the block
  def self.foreach(filename, &block)
    self.open(filename) do |obj|
      obj.each(&block)
    end
  end

  # returns an array indexed by ms level; each used index holds the array of
  # scans of that level (in run order), unused indices are nil.
  def scans_by_ms_level
    by_level = []
    scans.each do |scan|
      # append rather than assign: assigning kept only the last scan per level
      (by_level[scan.ms_level] ||= []) << scan
    end
    by_level
  end

  # returns an array whose indices give the number of scans at each ms level:
  # [0] = all the scans, [1] = mslevel 1, [2] = mslevel 2, ...
  # (levels without scans are nil). The result is memoized.
  def scan_counts
    return @scan_counts if @scan_counts
    ar = []
    ar[0] = 0
    scans.each do |sc|
      level = sc.ms_level
      ar[level] ||= 0
      ar[level] += 1
      ar[0] += 1
    end
    @scan_counts = ar
  end

  # the number of scans at the given ms level (0 = all scans). Uses the
  # memoized scan_counts when available; otherwise level 0 answers with the
  # value given to scan_count= and other levels are counted directly.
  def scan_count(mslevel=0)
    if @scan_counts
      # a level with no scans is 0, matching the counting branch below
      @scan_counts[mslevel] || 0
    elsif mslevel == 0
      @scan_count
    else
      scans.select { |sc| sc.ms_level == mslevel }.size
    end
  end

  # Finds the m/z range covered by the scans of the given ms level.
  #
  # for level 1, asks the first such scan for start_mz/end_mz and, when both
  # are present, returns them as they are (unrounded). Otherwise every scan
  # of the level contributes its start_mz/end_mz (or, lacking those, the
  # first and last m/z of its spectrum) and the overall min/max is returned
  # as [start_mz (rounded down to nearest int), end_mz (rounded up to nearest
  # int)]. Returns nil if there is no scan (or no m/z value) at that level.
  def start_and_end_mz(mslevel=1)
    level_scans = scans.select { |sc| sc.ms_level == mslevel }
    return nil if level_scans.empty?

    if mslevel == 1
      # special case for mslevel 1 (where we expect scans to be same length)
      first = level_scans.first
      return [first.start_mz, first.end_mz] if first.start_mz && first.end_mz
    end

    lo_mz = nil
    hi_mz = nil
    level_scans.each do |sc|
      lo, hi = mz_bounds(sc)
      lo_mz = lo if lo && (lo_mz.nil? || lo < lo_mz)
      hi_mz = hi if hi && (hi_mz.nil? || hi > hi_mz)
    end
    return nil unless lo_mz && hi_mz
    [lo_mz.floor, hi_mz.ceil]
  end

  # returns an array of times and parallel array of spectra objects.
  #   ms_level = 0  then all spectra and times
  #   ms_level = 1  then all spectra of ms_level 1
  def times_and_spectra(ms_level=0)
    chosen =
      if ms_level == 0
        @scans
      else
        @scans.select { |scan| scan.ms_level == ms_level }
      end
    [chosen.map { |scan| scan.time }, chosen.map { |scan| scan.spectrum }]
  end

  private

  # [low, high] m/z of a single scan: its start_mz/end_mz attributes when
  # both are set, otherwise the first and last entries of its spectrum's mzs
  # (this relies on mzs being in ascending order -- TODO confirm in
  # Ms::Spectrum).
  def mz_bounds(scan)
    if scan.start_mz && scan.end_mz
      [scan.start_mz, scan.end_mz]
    else
      mzs = scan.spectrum.mzs
      [mzs.first, mzs.last]
    end
  end
end
204
+
205
+
206
module Ms
  class Msrun
    module Axml ; end  # so we can get our parser

    # Helpers for Msrun: sniffing the file type, locating the parser class
    # for it and linking scans to their parent scans.
    module Utils

      # Requires "ms/msrun/axml/<filetype>" and returns the parser class
      # Ms::Msrun::Axml::<Filetype> (the capitalized filetype). version is
      # accepted but not used here.
      #
      # Raises ArgumentError when filetype is nil (the type could not be
      # detected) and RuntimeError when the required file does not define
      # the expected class.
      def self.get_parser(filetype, version)
        if filetype.nil?
          # otherwise the require below fails with an obscure LoadError
          raise ArgumentError, "cannot pick a parser: filetype was not determined"
        end
        require "ms/msrun/axml/#{filetype}"
        parser_class = filetype.to_s.capitalize
        base_class = Ms::Msrun::Axml
        if base_class.const_defined? parser_class
          base_class.const_get parser_class
        else
          raise RuntimeError, "no class #{base_class}::#{parser_class}"
        end
      end

      # Walks the scans in order and, for each scan with ms_level > 1, stores
      # the most recent scan of a lower level in slot [2] of its precursor.
      # only adds the parent if one is not already present!
      #
      # With add_intensities, slot [1] of the precursor is set to the
      # parent's spectrum.intensity_at_mz(precursor[0]) (skipped when no
      # parent is known). Scans of level > 1 without a precursor are left
      # untouched. Returns scans.
      def self.add_parent_scan(scans, add_intensities=false)
        return scans if scans.empty?
        prev_scan = nil
        parent_stack = [nil]
        ## we want to set the level to be the first mslevel we come to
        prev_level = scans.first.ms_level
        scans.each do |scan|
          level = scan.ms_level
          # going deeper: the previous scan becomes the current parent
          parent_stack.unshift(prev_scan) if prev_level < level
          # coming back up: drop one parent per level climbed
          (prev_level - level).times { parent_stack.shift } if prev_level > level
          if level > 1
            precursor = scan.precursor
            if precursor
              precursor[2] = parent_stack.first unless precursor[2]
              if add_intensities && precursor[2]
                precursor[1] = precursor[2].spectrum.intensity_at_mz(precursor[0])
              end
            end
          end
          prev_level = level
          prev_scan = scan
        end
      end

      Mzxml_regexp = /http:\/\/sashimi.sourceforge.net\/schema(_revision)?\/([\w\d_\.]+)/o
      # 'http://sashimi.sourceforge.net/schema/MsXML.xsd' # version 1
      # 'http://sashimi.sourceforge.net/schema_revision/mzXML_X.X' # others
      Mzdata_regexp = /<mzData.*version="([\d\.]+)"/m
      Raw_header_unpack_code = '@2axaxaxaxaxaxaxa'
      Mzml_regexp = /http:\/\/psi.hupo.org\/schema_revision\/mzML_([\w\d\.]+)/o

      # Determines the type and version of a mass spec file.
      #
      # file_or_io is either a filename or an object that can be read line
      # by line and rewound (an IO, a StringIO, ...). Returns
      # [filetype, version] where filetype is :raw, :mzml, :mzxml or :mzdata
      # and version is a string (nil for :raw); returns nil when nothing
      # matched. An io is rewound before returning, except when :raw is
      # detected.
      def self.filetype_and_version(file_or_io)
        unless file_or_io.respond_to?(:gets) && file_or_io.respond_to?(:rewind)
          return File.open(file_or_io) { |io| filetype_and_version(io) }
        end
        io = file_or_io

        # Test for RAW file: every other byte after the first two must spell
        # 'Finnigan'. The unpack code touches 17 bytes, so shorter (or empty)
        # input cannot be a RAW file and must not be unpacked.
        header = io.read(18)
        if header && header.bytesize >= 17 &&
            header.unpack(Raw_header_unpack_code).join == 'Finnigan'
          return [:raw, nil]
        end
        io.rewind

        found = nil
        while (line = io.gets)
          found =
            case line
            when Mzml_regexp
              [:mzml, $1.dup]
            when Mzxml_regexp
              mtch = $2.dup
              case mtch
              when /mzXML_([\d\.]+)/
                [:mzxml, $1.dup]
              when /MsXML/
                [:mzxml, '1.0']
              else
                # NOTE(review): abort exits the whole process; kept for
                # backward compatibility
                abort "Cannot determine mzXML version!"
              end
            when Mzdata_regexp
              [:mzdata, $1.dup]
            end
          break if found
        end
        io.rewind
        found
      end
    end
  end
end
@@ -0,0 +1,141 @@
1
+
2
+ require 'ms/msrun'
3
+ require 'ms/precursor'
4
+ require 'axml'
5
+
6
module Ms
  class Msrun
    module Axml
    end
  end
end

# Parser that fills an Ms::Msrun object from an mzXML document. The XML tree
# comes from AXML; nodes are used through [] (attribute lookup), name,
# content, text, each, children, find, find_children and find_first_child.
class Ms::Msrun::Axml::Mzxml
  # passed to Ms::Spectrum.lazy as its last argument for every peak list
  NetworkOrder = true

  # Parses the mzXML document on io and stores parent file information, scan
  # count, scans and start/end times in msrun_obj. Returns the array of
  # scans.
  #
  # version is a string
  def parse(msrun_obj, io, version)
    root = AXML.parse(io, :text_indices => 'peaks', :parser => :xmlparser)
    msrun_n = msrun_node(root, version)

    # The filename (left unset when the document names no parent file)
    parent_n = msrun_n.find_first_child('parentFile')
    fn = parent_n && parent_n['fileName']
    if fn
      # copy instead of gsub! so the node's attribute string is not modified
      fn = fn.tr('\\', '/')
      msrun_obj.parent_basename = File.basename(fn)
      dn = File.dirname(fn)
      dn = nil if dn == '.' && !fn.include?('/')
      msrun_obj.parent_location = dn
    end

    ## HEADER
    scan_count = msrun_n['scanCount'].to_i
    msrun_obj.scan_count = scan_count

    scans_by_num = Array.new(scan_count + 1)

    ## SPECTRUM
    # grown as scans are found; pre-sizing it from the header left trailing
    # nils whenever scanCount was larger than the number of scan elements
    scans = []

    if version.to_f >= 3.0
      warn '[version 3.0 parsing may fail if > 1 peak list per scan]'
      # note that mzXML version 3.0 *can* have more than one peak...
      # I'm not sure how to deal with that since I have one spectrum/scan
    end

    scan_nodes = msrun_n.find_children('scan')
    add_scan_nodes(scan_nodes, scans, 0, scans_by_num, version, io)

    ## update the scan's parents
    Ms::Msrun::Utils.add_parent_scan(scans) unless scans.empty?

    # note that startTime and endTime are optional AND in >2.2 are dateTime
    # instead of duration types!, so we will just use scan times...
    # Also, note that startTime and endTime are BROKEN on readw -> mzXML 2.0
    # export. They give the start and end time in seconds, but they are
    # really minutes. All the more reason to use the first and last scans!
    msrun_obj.start_time = scans.first && scans.first.time
    msrun_obj.end_time = scans.last && scans.last.time
    msrun_obj.scans = scans
  end

  # takes a scan node and creates a scan object
  #
  # A 'precursorMz' child becomes an Ms::Precursor stored in scan[7]
  # ([0] = m/z, [1] = intensity, [2] = the scan numbered precursorScanNum,
  # looked up in scans_by_num, when that attribute is given). A 'peaks'
  # child becomes a lazy Ms::Spectrum in scan[8] that reads from io.
  # Raises RuntimeError if the scan has more than one precursorMz.
  def create_scan(scan_n, scans_by_num, io=nil)
    scan = new_scan_from_node(scan_n)
    prec = nil
    scan_n.each do |node|
      case node.name
      when 'precursorMz'
        raise RuntimeError, "the msrun object can only handle one precursor!" unless prec.nil?
        prec = Ms::Precursor.new
        prec[1] = node['precursorIntensity'].to_f
        prec[0] = node.content.to_f
        scan_num = node['precursorScanNum']
        prec[2] = scans_by_num[scan_num.to_i] if scan_num
      when 'peaks'
        # assumes that parsing was done with a LazyPeaks parser!
        # (presumably node.text then holds the position of the peak data in
        # io rather than the data itself -- confirm against AXML)
        nc = node.text
        scan[8] = Ms::Spectrum.lazy(io, nc.first, nc.last, node['precision'].to_i, NetworkOrder)
      end
    end
    scan[7] = prec
    scan
  end

  # Creates a scan for every node in nodes, writing it to scans (starting at
  # index scn_index) and to scans_by_num (at the scan's number). For versions
  # after 1.0 the scan elements nested inside each node are added right
  # after it, recursively. Returns the next free index in scans.
  def add_scan_nodes(nodes, scans, scn_index, scans_by_num, version, io)
    nodes.each do |scan_n|
      scan = create_scan(scan_n, scans_by_num, io)
      scans[scn_index] = scan
      scans_by_num[scan[0]] = scan
      scn_index += 1
      if version.to_f > 1.0
        new_nodes = scan_n.find('child::scan')
        if new_nodes.size > 0
          scn_index = add_scan_nodes(new_nodes, scans, scn_index, scans_by_num, version, io)
        end
      end
    end
    scn_index
  end

  # Returns the node holding the run: for version 2.0 and later the single
  # 'msRun' child of node, for earlier versions node itself.
  # Raises NotImplementedError for more than one msRun child and
  # RuntimeError when there is none.
  def msrun_node(node, version)
    return node unless version.to_f >= 2.0
    kids = node.children.select { |v| v.name == 'msRun' }
    raise(NotImplementedError, "one msrun per doc right now") if kids.size > 1
    raise(RuntimeError, "no msRun element found in document") if kids.empty?
    kids.first
  end

  # Builds an Ms::Scan from the attributes of a scan node:
  #   [0] num, [1] msLevel, [2] retentionTime, [3] startMz, [4] endMz,
  #   [5] peaksCount, [6] totIonCurrent
  # Slots [2]..[6] are only set when the attribute is present. Each one is
  # read on its own, so a scan without startMz still gets its peak count and
  # total ion current.
  def new_scan_from_node(node)
    scan = Ms::Scan.new # array class creates one with 9 positions
    scan[0] = node['num'].to_i
    scan[1] = node['msLevel'].to_i

    rt = node['retentionTime']
    # drops the first two characters and the last one before converting
    # (e.g. "PT12.5S" -> 12.5)
    scan[2] = rt[2...-1].to_f if rt

    start_mz = node['startMz']
    scan[3] = start_mz.to_f if start_mz
    end_mz = node['endMz']
    scan[4] = end_mz.to_f if end_mz
    peaks_count = node['peaksCount']
    scan[5] = peaks_count.to_i if peaks_count
    tic = node['totIonCurrent']
    scan[6] = tic.to_f if tic
    scan
  end
end
@@ -0,0 +1,118 @@
1
+
2
require 'stringio'

module Ms
  class Msrun

    #config :first_scan, 0, :short => 'F', &c.integer # first scan
    #config :last_scan, 1e12, :short => 'L', &c.integer # last scan
    ## if not determined to be +1, then create these charge states
    #config( :charge_states, [2,3], :short => 'c') {|v| v.split(',') }
    #config :bottom_mh, 0, :short => 'B', &c.float # bottom MH+
    #config :top_mh, -1.0, :short => 'T', &c.float # top MH+
    #config :min_peaks, 0, :short => 'P', &c.integer # minimum peak count
    #config :ms_levels, 2..-1, :short => 'M', &c.range # ms levels to export

    # Export of scans to search engine input formats. Relies on the
    # including class for scans, scan_counts and parent_basename_noext.
    module Search

      PROTON_MASS = 1.007276

      # Writes the scans in MGF format, one BEGIN IONS/END IONS entry per
      # scan and charge state.
      #
      # returns a string, or writes the string to file if given an
      # out_filename. if given a filename or IO object, returns the number
      # of spectra written
      #
      # opts (defaults are listed in the hash below):
      #   :ms_levels      range or integer of ms levels to export; a range
      #                   ending in -1 runs to the highest level in the run
      #   :first_scan, :last_scan   scan number window (nil or -1 as
      #                   last_scan means the number of the last scan)
      #   :min_peaks      minimum num_peaks of a scan
      #   :charge_states  charges written for a scan unless
      #                   scan.plus1?(0.95) is true, in which case only
      #                   charge 1 is written for that scan
      #   :bottom_mh, :top_mh   limits on the MH+ computed from precursor
      #                   m/z and charge (nil or -1 as top_mh = no limit)
      #   :*_precision    decimal places for precursor/fragment m/z and
      #                   intensity
      #
      # Scans without a precursor m/z are skipped.
      def to_mgf(file_or_io=nil, opts={})
        opts = {
          :bottom_mh => 0.0,
          :top_mh => nil,
          :ms_levels => (2..-1),  # range or integer, -1 at end will be substituted for last level
          :min_peaks => 0,
          :first_scan => 0,
          :last_scan => nil,
          :charge_states => [2, 3],
          :prec_mz_precision => 6,
          :prec_int_precision => 2,
          :frag_mz_precision => 5,
          :frag_int_precision => 1,
        }.merge(opts)
        (_first_scan, _last_scan, _bottom_mh, _top_mh, _ms_levels, _min_peaks, _charge_states, _prec_mz_precision, _prec_int_precision, _frag_mz_precision, _frag_int_precision) = opts.values_at(:first_scan, :last_scan, :bottom_mh, :top_mh, :ms_levels, :min_peaks, :charge_states, :prec_mz_precision, :prec_int_precision, :frag_mz_precision, :frag_int_precision)

        sep = ' '

        _top_mh = nil if _top_mh == -1

        if _last_scan.nil? || _last_scan == -1
          # only consulted inside the scan loop, so nil is harmless when
          # there are no scans
          _last_scan = scans.last && scans.last.num
        end

        if !_ms_levels.is_a?(Integer) && _ms_levels.last == -1
          _ms_levels = ((_ms_levels.first)..(scan_counts.size-1))
        end

        prec_string = "PEPMASS=%0.#{_prec_mz_precision}f %0.#{_prec_int_precision}f\n"
        frag_string = "%0.#{_frag_mz_precision}f%s%0.#{_frag_int_precision}f\n"

        count = 0  # number of BEGIN IONS entries written
        any_input(file_or_io) do |out, out_type|
          scans.each do |scan|
            sn = scan.num

            next unless _ms_levels === scan.ms_level
            next unless sn >= _first_scan && sn <= _last_scan
            # an unknown peak count is treated as zero peaks
            next unless (scan.num_peaks || 0) >= _min_peaks

            pmz = scan.precursor && scan.precursor.mz
            next unless pmz

            # tic under precursor > 95% and true = save the spectrum info
            scan.spectrum.save!
            # decided per scan: one +1 scan must not force charge 1 on all
            # the scans that follow it
            charge_states = scan.plus1?(0.95) ? [1] : _charge_states

            # (scanHeader.precursorMZ * iCharge) - (iCharge - 1)*dChargeMass;
            charge_states.each do |z|
              mh = (pmz * z) - (z - 1)*PROTON_MASS
              next unless mh >= _bottom_mh
              next if _top_mh && mh > _top_mh
              out.puts "BEGIN IONS"
              out.puts "TITLE=#{self.parent_basename_noext}.#{sn}.#{sn}.#{z}"
              out.puts "CHARGE=#{z}+"
              out.printf(prec_string, pmz, scan.precursor.intensity)
              scan.spectrum.peaks do |mz,int|
                out.printf(frag_string, mz, sep, int )
              end
              out.puts "END IONS\n\n"
              count += 1
            end

            scan.spectrum.flush!
          end

          if out_type == :string_io
            out.string
          else
            count
          end
        end
      end

      # yields an IO object and the type input (:io, :filename, :string_io)
      # and returns the value of the block.
      #
      # A String is opened as a file for writing (closed afterwards); an IO,
      # or any object responding to puts and printf (e.g. a StringIO), is
      # yielded as it is; anything else (nil) gets a fresh StringIO.
      def any_input(arg, &block)
        if arg.is_a?(String)  # open the file
          File.open(arg, 'w') do |io|
            block.call(io, :filename)
          end
        elsif arg.is_a?(IO) || (arg.respond_to?(:puts) && arg.respond_to?(:printf))
          block.call(arg, :io)
        else  # nil
          block.call(StringIO.new, :string_io)
        end
      end

    end

    include Search
  end
end