ms-msrun 0.0.1 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,4 +1,7 @@
1
1
 
2
+ require 'ms/spectrum'
3
+ require 'ms/data'
4
+ require 'ms/data/lazy_io'
2
5
  require 'ms/msrun'
3
6
  require 'ms/precursor'
4
7
  require 'axml'
@@ -48,7 +51,7 @@ class Ms::Msrun::Axml::Mzxml
48
51
  add_scan_nodes(scan_nodes, scans, scn_index, scans_by_num, version, io)
49
52
 
50
53
  ## update the scan's parents
51
- Ms::Msrun::Utils.add_parent_scan(scans)
54
+ Ms::Msrun.add_parent_scan(scans)
52
55
 
53
56
  # note that startTime and endTime are optional AND in >2.2 are dateTime
54
57
  # instead of duration types!, so we will just use scan times...
@@ -80,13 +83,8 @@ class Ms::Msrun::Axml::Mzxml
80
83
  when 'peaks'
81
84
  # assumes that parsing was done with a LazyPeaks parser!
82
85
  nc = node.text
83
- #p nc
84
- #if nc.first < 0 || nc.last < 0
85
- # puts "PROBLEM: "
86
- # p nc
87
- # abort 'here'
88
- #end
89
- scan[8] = Ms::Spectrum.lazy(io, nc.first, nc.last, node['precision'].to_i, NetworkOrder)
86
+ data = Ms::Data::LazyIO.new(io, nc.first, nc.last, Ms::Data::LazyIO.unpack_code(node['precision'].to_i, NetworkOrder))
87
+ scan[8] = Ms::Spectrum.new(Ms::Data::Interleaved.new(data))
90
88
  end
91
89
  end
92
90
  scan[7] = prec
@@ -0,0 +1,130 @@
1
+ require 'ms/msrun'
2
+
3
+ module Ms ; end
4
+ class Ms::Msrun ; end
5
+
6
+ # an index by scan number of the doublets where each doublet = [start_byte,
7
+ # length] for the scan. Index objects are enumerable and yield doublets.
8
+ # Index#scan_nums gives an array of the scan numbers.
9
+ # Index#first and Index#last return the first and the last scan, regardless of
10
+ # the scan numbers.
11
+ #
12
+ # index.scan_nums # -> [1,2,3,4]
13
+ # index.each do |starting_byte, length|
14
+ # IO.read(myfile.mzXML, length, starting_byte) # -> xml for each scan
15
+ # end
16
+ # index[0] # -> nil
17
+ # index.first # -> [<start_byte>, <length>] # for scan number 1
18
+ class Ms::Msrun::Index < Array
19
+ include Enumerable
20
+
21
+ MZXML_INDEX_TAG = 'indexOffset'
22
+ MZML_INDEX_TAG = 'indexListOffset'
23
+
24
+ # returns the length from the start to the first scan
25
+ def header_length
26
+ self.each {|pair| return (pair.first) }
27
+ end
28
+
29
+ # returns an array of the scan numbers
30
+ attr_reader :scan_nums
31
+
32
+ # takes an mzxml filename or io object
33
+ # and returns an array of offsets and lengths for the scans
34
+ # note that the offset
35
+ def initialize(filename_or_io)
36
+ (ft, version) = Ms::Msrun.filetype_and_version(filename_or_io)
37
+ tag = case ft
38
+ when :mzml : MZML_INDEX_TAG
39
+ when :mzxml : MZXML_INDEX_TAG
40
+ end
41
+ fn =
42
+ if filename_or_io.is_a? String
43
+ filename_or_io # a filename
44
+ else # a File object
45
+ filename_or_io.path
46
+ end
47
+ size = File.size(fn)
48
+ io =
49
+ if filename_or_io.is_a? String
50
+ File.open(filename_or_io)
51
+ else
52
+ filename_or_io
53
+ end
54
+ (offset, length) = index_offset(io, size, tag)
55
+ io.pos = offset
56
+ xml = io.read(length)
57
+ io.close if filename_or_io.is_a?(String)
58
+ self.replace( index_to_array(xml, offset, ft) )
59
+ self
60
+ end
61
+
62
+ def each(&block)
63
+ scan_nums.each do |scan_num|
64
+ block.call( self[scan_num] )
65
+ end
66
+ end
67
+
68
+ def first
69
+ self[scan_nums.first]
70
+ end
71
+
72
+ def last
73
+ self[scan_nums.last]
74
+ end
75
+
76
+ # returns [index_offset, length_of_index]
77
+ def index_offset(io, size, tag=MZML_INDEX_TAG, bytes_backwards=150) # :nodoc:
78
+ tag_re = /<#{tag}>([\-\d]+)<\/#{tag}>/
79
+ io.pos = size-1
80
+ io.pos = io.pos - bytes_backwards
81
+ index_offset = nil
82
+ index_end = nil
83
+ io.each do |line|
84
+ if line =~ tag_re
85
+ index_offset = $1.to_i
86
+ index_end = io.pos - line.size
87
+ break
88
+ end
89
+ end
90
+ if index_offset
91
+ [index_offset, index_end - index_offset]
92
+ else
93
+ [nil,nil]
94
+ end
95
+ end
96
+
97
+ # last_offset is used to calculate the length of the last scan object (or
98
+ # whatever)
99
+ def index_to_array(xml_string, last_offset, type=:mzml) # :nodoc:
100
+ indices = []
101
+ @scan_nums = []
102
+ case type
103
+ when :mzxml
104
+ xml_string.each_line("\n") do |line|
105
+ if line =~ /id="(\d+)".*>(\d+)</
106
+ @scan_nums << $1.to_i
107
+ indices << $2.to_i
108
+ end
109
+ end
110
+ #doc = Nokogiri::XML.parse(xml_string, *Ms::Msrun::Nokogiri::PARSER_ARGS)
111
+ #root_el = doc.root
112
+ #raise RuntimeError, "expecting scan index!" unless root_el['name'] == 'scan'
113
+ #root_el.children.each do |el|
114
+ # indices << el.text.to_i
115
+ # @scan_nums << el['id'].to_i
116
+ #end
117
+ when :mzml
118
+ raise NotImplementedError
119
+ end
120
+ indices << last_offset
121
+
122
+ new_indices = []
123
+ 0.upto(indices.size-2) do |i|
124
+ val = indices[i]
125
+ next unless val
126
+ new_indices[@scan_nums[i]] = [indices[i], indices[i+1] - indices[i]]
127
+ end
128
+ new_indices
129
+ end
130
+ end
@@ -0,0 +1,12 @@
1
+
2
+ require 'ms/spectrum'
3
+ require 'ms/data'
4
+ require 'ms/data/lazy_io'
5
+ require 'ms/msrun'
6
+ require 'libxml'
7
+
8
+
9
+ LibXML::XML.default_keep_blanks = false
10
+
11
+
12
+ LibXML::XML::Reader.io(
@@ -0,0 +1,12 @@
1
+
2
+
3
module Ms ; end
class Ms::Msrun ; end

# Parser settings shared by the Nokogiri-based readers.
module Ms::Msrun::Nokogiri
  # Nokogiri's default XML parse options with NOBLANKS or'ed in
  NOBLANKS = ::Nokogiri::XML::ParseOptions::DEFAULT_XML |
    ::Nokogiri::XML::ParseOptions::NOBLANKS

  # trailing arguments splatted after the xml string, as in
  # Nokogiri::XML.parse(string, *PARSER_ARGS)
  PARSER_ARGS = [nil, nil, NOBLANKS]
end
11
+
12
+
@@ -0,0 +1,168 @@
1
+
2
+ require 'nokogiri'
3
+ require 'ms/msrun/nokogiri'
4
+ require 'ms/msrun'
5
+ require 'ms/spectrum'
6
+ require 'ms/data'
7
+ require 'ms/data/lazy_io'
8
+ require 'ms/precursor'
9
+ require 'ms/mzxml'
10
+
11
+
12
# Reads the header and individual scans of an mzXML document from an io
# object, handing the xml fragments to Nokogiri.
class Ms::Msrun::Nokogiri::Mzxml
  NetworkOrder = true

  attr_accessor :msrun, :io, :version

  # msrun_object:: the object filled in by parse_header
  # io:: the io object holding the mzXML document
  # version:: the mzXML version as a String (compared against '2.0')
  def initialize(msrun_object, io, version)
    @msrun = msrun_object
    @io = io
    @version = version
  end

  # Takes either the byte length of the header (which is then read from the
  # start of the io) or the header string itself.  Sets scan_count,
  # start_time, end_time, parent_basename and parent_location on the msrun.
  # start_time and end_time are only set when the attributes are present.
  #
  # returns the msrun
  def parse_header(byte_length_or_header_string)
    string =
      if byte_length_or_header_string.is_a? Integer
        @io.rewind
        @io.read(byte_length_or_header_string)
      else
        byte_length_or_header_string
      end
    doc = ::Nokogiri::XML.parse(string, *Ms::Msrun::Nokogiri::PARSER_ARGS)
    msrun_n = doc.root
    if @version >= '2.0'
      # for these versions the run node is the first child of the root
      msrun_n = msrun_n.child
    end
    @msrun.scan_count = msrun_n['scanCount'].to_i
    # the times are stripped of their first two and last characters before
    # conversion (presumably a duration such as "PT0.5S")
    if start_time = msrun_n['startTime']
      @msrun.start_time = start_time[2...-1].to_f
    end
    if end_time = msrun_n['endTime']
      @msrun.end_time = end_time[2...-1].to_f
    end

    filename = msrun_n.search("parentFile").first['fileName']
    (bn, dn) = Ms::Mzxml.parent_basename_and_dir(filename)
    @msrun.parent_basename = bn
    @msrun.parent_location = dn
    @msrun
  end

  # Scans the lines starting at start_byte for an msLevel attribute, giving
  # up once more than length characters have been read.  The io position is
  # restored afterwards.
  #
  # returns the ms_level as an Integer, nil if it cannot be found.
  def parse_ms_level(start_byte, length)
    start_io_pos = @io.pos
    @io.pos = start_byte
    ms_level = nil
    total_length = 0
    @io.each("\n") do |line|
      if line =~ /msLevel="(\d+)"/o
        ms_level = $1.to_i
        break
      end
      total_length += line.size
      break if total_length > length
    end
    @io.pos = start_io_pos
    ms_level
  end

  # Parses the scan element that starts at start_byte and is length long.
  # Returns an Ms::Scan object
  # options:
  #     :spectrum => true | false (default is true)
  #     :precursor => true | false (default is true)
  #
  # Note that if both :spectrum and :precursor are set to false, the basic
  # information in the scan node *is* parsed (such as ms_level)
  #
  # The io position is moved by this call and is not restored.
  def parse_scan(start_byte, length, options={})
    opts = {:spectrum => true, :precursor => true}.merge(options)
    @io.pos = start_byte

    string =
      if opts[:spectrum]
        @io.read(length)
      else
        # don't bother reading all the peak information if we aren't wanting it
        # and can avoid it!  This is important for high res instruments
        # especially since the peak data is huge.
        read_scan_without_peaks(length)
      end

    doc = ::Nokogiri::XML.parse(string, *Ms::Msrun::Nokogiri::PARSER_ARGS)
    scan_n = doc.root
    scan = new_scan_from_node( scan_n )
    # nil when the scan has no child left (e.g. no precursor and the peaks
    # were not read)
    first_child = scan_n.child

    peaks_n =
      if first_child && first_child.name == 'precursorMz'
        scan.precursor = precursor_from_node(first_child) if opts[:precursor]
        first_child.next_sibling
      else
        first_child # this is a peaks node (or nil)
      end

    if opts[:spectrum] && peaks_n
      # all mzXML (at least versions 1--3.0) *must* be 'network' byte order!
      # data is stored as the base64 string until we actually try to access
      # it!  At that point the string is decoded and knows it is interleaved
      # data.  So, no spectrum is actually decoded unless it is accessed!
      unpack_code = Ms::Data::LazyIO.unpack_code(peaks_n['precision'].to_i, NetworkOrder)
      peaks_data = Ms::Data.new_interleaved(Ms::Data::LazyString.new(peaks_n.text, unpack_code))
      scan[8] = Ms::Spectrum.new(peaks_data)
    end
    scan
  end

  # Extracts the two numbers of the bracketed "[start-end]" range from a
  # filter line such as
  # "ITMS + c NSI d Full ms3 654.79@cid35.00 630.24@cid35.00 [160.00-1275.00]"
  #
  # returns [start, end] as Floats, or nil if the line has no such range
  def start_end_from_filter_line(line)
    md = /\[([^-]+)-([^-]+)\]/.match(line)
    md && md[1,2].map {|v| v.to_f }
  end

  # Builds an Ms::Scan from the attributes of a scan node:
  #     scan[0] num, scan[1] msLevel, scan[2] retentionTime (when present),
  #     scan[3] / scan[4] startMz / endMz (when startMz is present),
  #     scan[5] peaksCount, scan[6] totIonCurrent
  # If a filterLine attribute carries a [start-end] range, that range
  # overrides scan[3] and scan[4].
  def new_scan_from_node(node)
    scan = Ms::Scan.new  # array class creates one with 9 positions
    scan[0] = node['num'].to_i
    scan[1] = node['msLevel'].to_i
    if x = node['retentionTime']
      scan[2] = x[2...-1].to_f
    end
    if x = node['startMz']
      scan[3] = x.to_f
      scan[4] = node['endMz'].to_f
    end
    scan[5] = node['peaksCount'].to_i
    scan[6] = node['totIonCurrent'].to_f
    if fl = node['filterLine']
      # leave scan[3] / scan[4] alone when the filter line has no range
      if range = start_end_from_filter_line(fl)
        (scan[3], scan[4]) = range
      end
    end
    scan
  end

  private

  # Reads lines from the current io position up to (not including) the
  # <peaks tag, or until length characters have been collected, whichever
  # comes first.  Returns the collected string.
  def read_scan_without_peaks(length)
    string = ""
    @io.each do |line|
      if md = %r{<peaks}.match(line)
        # just add the part of the line before the <peaks tag
        string << line[0, md.begin(0)]
        break
      end
      string << line
      if string.size >= length
        # drop whatever was read beyond the end of this scan
        string.slice!(length..-1)
        break
      end
    end
    string
  end

  # Builds an Ms::Precursor from a precursorMz node:
  #     prec[0] the node text, prec[1] precursorIntensity,
  #     prec[3] [precursorCharge] (only when the attribute is present)
  def precursor_from_node(prec_n)
    prec = Ms::Precursor.new
    prec[1] = prec_n['precursorIntensity'].to_f
    prec[0] = prec_n.text.to_f
    if x = prec_n['precursorCharge']
      prec[3] = [x.to_i]
    end
    prec
  end

end
167
+
168
+
@@ -0,0 +1,126 @@
1
+
2
+ require 'ms/msrun'
3
+ require 'ms/spectrum'
4
+ require 'ms/data'
5
+ require 'ms/data/lazy_io'
6
+ require 'ms/precursor'
7
+ require 'ms/mzxml'
8
+
9
module Ms
  class Msrun
    module Regexp
    end
  end
end

# Line-oriented, regular expression based reader for mzXML headers and scans.
#
# The scan parsing helpers are class methods that accept the io to read from
# as an optional trailing argument; the instance methods of the same name
# call them with the instance's own io.
#
# NOTE(review): building the spectrum from the peaks element is not
# implemented here; parse_peaks only reads past the element.
class Ms::Msrun::Regexp::Mzxml

  # name="value" or name='value' pairs on a line
  ATTRIBUTE_RE = /(\w+)=["']([^"']*)["']/

  attr_accessor :msrun, :io, :version

  def initialize(msrun_object, io, version)
    @msrun = msrun_object
    @io = io
    @version = version
  end

  # Reads lines from the io until a line starting a <scan element is found
  # (that line is consumed), setting parent_basename, parent_location,
  # scan_count, start_time and end_time on the msrun as their attributes are
  # encountered.
  #
  # returns the msrun
  def parse_header
    while line = @io.gets
      break if line =~ /^\s*<scan/
      if line =~ %r{\s+fileName=['"](.*?)['"]}
        (bn, dn) = Ms::Mzxml.parent_basename_and_dir($1)
        @msrun.parent_basename = bn
        @msrun.parent_location = dn
      end
      if line =~ /\s+scanCount=['"](\w+)['"]/
        @msrun.scan_count = $1.to_i
      end
      # the times are stripped of their first two and last characters
      # before conversion
      if line =~ /startTime=['"]([\w\.]+)['"]/
        @msrun.start_time = $1[2...-1].to_f
      end
      if line =~ /endTime=['"]([\w\.]+)['"]/
        @msrun.end_time = $1[2...-1].to_f
      end
    end
    @msrun
  end

  # Parses a precursorMz element beginning on line.  If the closing
  # </precursorMz> is not on that line, further lines are read from io (when
  # one is given) until it is found or the io is exhausted.
  #
  # returns an Ms::Precursor with
  #     prec[0] the element text, prec[1] precursorIntensity,
  #     prec[3] [precursorCharge]
  # (each only when found)
  def self.parse_precursor(line, io=nil)
    prec = Ms::Precursor.new
    while line
      if line =~ /precursorIntensity=['"]([\d\.]+)['"]/
        prec[1] = $1.to_f
      end
      if line =~ /precursorCharge=["'](\d+)["']/
        prec[3] = [$1.to_i]
      end
      if line =~ %r{>([\d\.]+)</precursorMz>}
        prec[0] = $1.to_f
        break
      end
      line = io && io.gets
    end
    prec
  end

  # Reads lines from io through the one containing </peaks>, noting the
  # precision and byteOrder attributes on the way.
  #
  # returns [precision, byte_order] (defaults: 32 and 'network')
  def self.parse_peaks(io=@io)
    precision = 32
    byte_order = 'network'
    while line = io.gets
      line.scan(/(precision|byteOrder)=["'](\w+)["']/) do |name, value|
        case name
        when 'precision'
          precision = value.to_i
        when 'byteOrder'
          byte_order = value
        end
      end
      break if line =~ %r{</peaks>}
    end
    [precision, byte_order]
  end

  # Parses the scan element that starts at start_byte of io, reading no
  # further than start_byte + length.  Every attribute seen before the
  # precursorMz element (or the end of the scan) is collected and handed to
  # new_scan_from_hash; a precursor, when present, is attached to the scan.
  #
  # Returns an Ms::Scan object
  def self.parse_scan(start_byte, length, io=@io)
    io.pos = start_byte
    stop_at = start_byte + length
    hash = {}
    prec = nil
    while io.pos < stop_at && (line = io.gets)
      if line =~ /^\s*<precursorMz/
        prec = parse_precursor(line, io)
        parse_peaks(io)
        break
      end
      line.scan(ATTRIBUTE_RE) {|key, value| hash[key] = value }
    end
    scan = new_scan_from_hash(hash)
    scan.precursor = prec if prec
    scan
  end

  # Builds an Ms::Scan from a hash of scan attribute strings:
  #     scan[0] num, scan[1] msLevel, scan[2] retentionTime (when present),
  #     scan[3] / scan[4] startMz / endMz (when startMz is present),
  #     scan[5] peaksCount, scan[6] totIonCurrent
  def self.new_scan_from_hash(hash)
    scan = Ms::Scan.new  # array class creates one with 9 positions
    scan[0] = hash['num'].to_i
    scan[1] = hash['msLevel'].to_i
    if x = hash['retentionTime']
      scan[2] = x[2...-1].to_f
    end
    if x = hash['startMz']
      scan[3] = x.to_f
      scan[4] = hash['endMz'].to_f
    end
    scan[5] = hash['peaksCount'].to_i
    scan[6] = hash['totIonCurrent'].to_f
    scan
  end

  # same as the class method, reading from this object's io
  def parse_scan(start_byte, length)
    self.class.parse_scan(start_byte, length, @io)
  end

  # same as the class method
  def new_scan_from_hash(hash)
    self.class.new_scan_from_hash(hash)
  end

end
125
+
126
+