ms-msrun 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,7 @@
1
1
 
2
+ require 'ms/spectrum'
3
+ require 'ms/data'
4
+ require 'ms/data/lazy_io'
2
5
  require 'ms/msrun'
3
6
  require 'ms/precursor'
4
7
  require 'axml'
@@ -48,7 +51,7 @@ class Ms::Msrun::Axml::Mzxml
48
51
  add_scan_nodes(scan_nodes, scans, scn_index, scans_by_num, version, io)
49
52
 
50
53
  ## update the scan's parents
51
- Ms::Msrun::Utils.add_parent_scan(scans)
54
+ Ms::Msrun.add_parent_scan(scans)
52
55
 
53
56
  # note that startTime and endTime are optional AND in >2.2 are dateTime
54
57
  # instead of duration types!, so we will just use scan times...
@@ -80,13 +83,8 @@ class Ms::Msrun::Axml::Mzxml
80
83
  when 'peaks'
81
84
  # assumes that parsing was done with a LazyPeaks parser!
82
85
  nc = node.text
83
- #p nc
84
- #if nc.first < 0 || nc.last < 0
85
- # puts "PROBLEM: "
86
- # p nc
87
- # abort 'here'
88
- #end
89
- scan[8] = Ms::Spectrum.lazy(io, nc.first, nc.last, node['precision'].to_i, NetworkOrder)
86
+ data = Ms::Data::LazyIO.new(io, nc.first, nc.last, Ms::Data::LazyIO.unpack_code(node['precision'].to_i, NetworkOrder))
87
+ scan[8] = Ms::Spectrum.new(Ms::Data::Interleaved.new(data))
90
88
  end
91
89
  end
92
90
  scan[7] = prec
@@ -0,0 +1,130 @@
1
+ require 'ms/msrun'
2
+
3
+ module Ms ; end
4
+ class Ms::Msrun ; end
5
+
6
+ # an index by scan number of the doublets where each doublet = [start_byte,
7
+ # length] for the scan. Index objects are enumerable and yield doublets.
8
+ # Index#scan_nums gives an array of the scan numbers.
9
+ # Index#first and Index#last return the first and the last scan, regardless of
10
+ # the scan numbers.
11
+ #
12
+ # index.scan_nums # -> [1,2,3,4]
13
+ # index.each do |starting_byte, length|
14
+ # IO.read(myfile.mzXML, length, starting_byte) # -> xml for each scan
15
+ # end
16
+ # index[0] # -> nil
17
+ # index.first # -> [<start_byte>, <length>] # for scan number 1
18
+ class Ms::Msrun::Index < Array
19
+ include Enumerable
20
+
21
+ MZXML_INDEX_TAG = 'indexOffset'
22
+ MZML_INDEX_TAG = 'indexListOffset'
23
+
24
# Returns the start byte of the first indexed scan, which equals the number
# of bytes that precede the first scan in the file (the header length).
# Returns nil when the index holds no scans.
def header_length
  # #each yields [start_byte, length] doublets; the first one is all we need
  each { |pair| return pair.first }
  nil
end
28
+
29
+ # returns an array of the scan numbers
30
+ attr_reader :scan_nums
31
+
32
# Builds the index from a file.
#
# filename_or_io:: either a filename (String) or an open file object that
#                  responds to #path, #pos= and #read
#
# The file type is asked of Ms::Msrun.filetype_and_version and selects which
# index-offset tag to look for. The index XML is then read and converted by
# #index_to_array, whose result becomes the content of this array.
#
# A file opened here (filename given) is always closed again; an io object
# passed in by the caller is left open.
#
# Raises ArgumentError when no index-offset tag can be located.
def initialize(filename_or_io)
  (ft, _version) = Ms::Msrun.filetype_and_version(filename_or_io)
  tag =
    case ft
    when :mzml then MZML_INDEX_TAG
    when :mzxml then MZXML_INDEX_TAG
    end
  given_filename = filename_or_io.is_a?(String)
  fn = given_filename ? filename_or_io : filename_or_io.path
  size = File.size(fn)
  io = given_filename ? File.open(fn) : filename_or_io
  begin
    (offset, length) = index_offset(io, size, tag)
    # index_offset answers [nil, nil] when the tag is missing
    raise ArgumentError, "no <#{tag}> element found near the end of #{fn}" unless offset
    io.pos = offset
    xml = io.read(length)
  ensure
    io.close if given_filename
  end
  # the index start doubles as the end of the last scan
  replace(index_to_array(xml, offset, ft))
end
61
+
62
# Yields the [start_byte, length] doublet of every scan, following the order
# of #scan_nums (so empty slots of the underlying array are never yielded).
# Returns an Enumerator when called without a block.
def each(&block)
  return to_enum(:each) unless block
  scan_nums.each { |scan_num| block.call(self[scan_num]) }
end
67
+
68
# Returns the doublet of the first scan number listed in #scan_nums (not
# slot 0 of the underlying array), or nil when there are no scans.
def first
  scan_num = scan_nums.first
  scan_num && self[scan_num]
end
71
+
72
# Returns the doublet of the last scan number listed in #scan_nums, or nil
# when there are no scans.
def last
  scan_num = scan_nums.last
  scan_num && self[scan_num]
end
75
+
76
# Locates the index section by reading the offset tag near the end of the
# file.
#
# io::              readable, seekable object positioned anywhere
# size::            total size of the file in bytes
# tag::             element name holding the index offset
# bytes_backwards:: how far before the last byte the search starts
#
# Returns [index_offset, length_of_index], where the length runs from the
# offset up to the start of the line carrying the tag. Returns [nil, nil]
# when no such tag is found.
def index_offset(io, size, tag=MZML_INDEX_TAG, bytes_backwards=150) # :nodoc:
  tag_re = /<#{tag}>([\-\d]+)<\/#{tag}>/
  # never seek before the start of a file shorter than the search window
  io.pos = [size - 1 - bytes_backwards, 0].max
  io.each_line do |line|
    next unless line =~ tag_re
    offset = $1.to_i
    # io.pos is just past this line; positions are bytes, so use bytesize
    index_end = io.pos - line.bytesize
    return [offset, index_end - offset]
  end
  [nil, nil]
end
96
+
97
# Converts the index XML into an array addressed by scan number, where each
# filled slot is a [start_byte, length] doublet. Also (re)sets @scan_nums to
# the scan numbers in the order they appear in the index.
#
# xml_string::  the text of the index section
# last_offset:: byte position that ends the last scan; used to compute the
#               length of the last entry
# type::        :mzxml is parsed; :mzml raises NotImplementedError; any
#               other value yields an empty array
#
# Each length is the distance to the next listed offset, so entries are
# expected in ascending file order.
def index_to_array(xml_string, last_offset, type=:mzml) # :nodoc:
  offsets = []
  @scan_nums = []
  case type
  when :mzxml
    xml_string.each_line("\n") do |line|
      # one <offset id="N">BYTE</offset> per line; either quote style
      next unless line =~ /id=["'](\d+)["'].*>(\d+)</
      @scan_nums << $1.to_i
      offsets << $2.to_i
    end
  when :mzml
    raise NotImplementedError, "mzML index parsing is not implemented"
  end
  offsets << last_offset

  by_scan_num = []
  @scan_nums.each_with_index do |scan_num, i|
    by_scan_num[scan_num] = [offsets[i], offsets[i + 1] - offsets[i]]
  end
  by_scan_num
end
130
+ end
@@ -0,0 +1,12 @@
1
+
2
+ require 'ms/spectrum'
3
+ require 'ms/data'
4
+ require 'ms/data/lazy_io'
5
+ require 'ms/msrun'
6
+ require 'libxml'
7
+
8
+
9
+ LibXML::XML.default_keep_blanks = false
10
+
11
+
12
+ LibXML::XML::Reader.io(
@@ -0,0 +1,12 @@
1
+
2
+
3
# the constants below reference ::Nokogiri, so make sure it is loaded
require 'nokogiri'

module Ms
  class Msrun
    # Shared settings for the Nokogiri based parsers.
    module Nokogiri
      # default XML parse options plus removal of blank nodes
      NOBLANKS = ::Nokogiri::XML::ParseOptions::DEFAULT_XML | ::Nokogiri::XML::ParseOptions::NOBLANKS
      # splatted after the document string in Nokogiri::XML.parse calls;
      # the last element carries the parse options
      PARSER_ARGS = [nil, nil, NOBLANKS].freeze
    end
  end
end
11
+
12
+
@@ -0,0 +1,168 @@
1
+
2
+ require 'nokogiri'
3
+ require 'ms/msrun/nokogiri'
4
+ require 'ms/msrun'
5
+ require 'ms/spectrum'
6
+ require 'ms/data'
7
+ require 'ms/data/lazy_io'
8
+ require 'ms/precursor'
9
+ require 'ms/mzxml'
10
+
11
+
12
+ class Ms::Msrun::Nokogiri::Mzxml
13
+ NetworkOrder = true
14
+
15
+ attr_accessor :msrun, :io, :version
16
+
17
# Remembers the run object to fill in, the io to read from and the file
# format version; all three are exposed through the accessors above.
def initialize(msrun_object, io, version)
  @msrun, @io, @version = msrun_object, io, version
end
22
+
23
# Parses the file header and stores scan count, start/end time and the
# parent file's basename and location on the msrun object.
#
# byte_length_or_header_string:: an Integer number of bytes to read from the
#                                start of @io, or the header text itself
#
# For versions >= '2.0' the attributes are read from the first child of the
# root element, otherwise from the root itself. startTime and endTime are
# only set when the attribute is present.
#
# Returns the msrun.
def parse_header(byte_length_or_header_string)
  string =
    if byte_length_or_header_string.is_a? Integer
      @io.rewind
      @io.read(byte_length_or_header_string)
    else
      byte_length_or_header_string
    end
  doc = Nokogiri::XML.parse(string, *Ms::Msrun::Nokogiri::PARSER_ARGS)
  msrun_n = doc.root
  msrun_n = msrun_n.child if @version >= '2.0'

  @msrun.scan_count = msrun_n['scanCount'].to_i
  # values look like "PT12.3S": drop the 2 leading chars and the last one
  if x = msrun_n['startTime']
    @msrun.start_time = x[2...-1].to_f
  end
  if x = msrun_n['endTime']
    @msrun.end_time = x[2...-1].to_f
  end

  filename = msrun_n.search("parentFile").first['fileName']
  (bn, dn) = Ms::Mzxml.parent_basename_and_dir(filename)
  @msrun.parent_basename = bn
  @msrun.parent_location = dn
  @msrun
end
47
+
48
# Returns the ms level of the scan as an Integer, or nil if no msLevel
# attribute is found within the scan's bytes.
#
# start_byte:: byte position where the scan starts
# length::     number of bytes belonging to the scan
#
# Reads line by line and stops at the first msLevel attribute or once
# `length` bytes have been consumed. The position of @io is restored
# afterwards, also when reading raises.
def parse_ms_level(start_byte, length)
  saved_pos = @io.pos
  begin
    @io.pos = start_byte
    consumed = 0
    @io.each_line("\n") do |line|
      return $1.to_i if line =~ /msLevel="(\d+)"/
      consumed += line.bytesize
      # >= : once every byte of this scan is read, the next line belongs
      # to a different element
      break if consumed >= length
    end
    nil
  ensure
    @io.pos = saved_pos
  end
end
65
+
66
# Parses the scan stored at start_byte and returns the scan object built by
# #new_scan_from_node.
#
# start_byte:: byte position where the scan element starts
# length::     number of bytes of the scan element
# options:
#   :spectrum  => true | false (default is true)
#   :precursor => true | false (default is true)
#
# Note that if both :spectrum and :precursor are set to false, the basic
# information in the scan node *is* parsed (such as ms_level).
#
# The position of @io is put back to where it was before the call, in the
# same way #parse_ms_level does.
def parse_scan(start_byte, length, options={})
  opts = {:spectrum => true, :precursor => true}.merge(options)
  start_io_pos = @io.pos

  string = ""
  begin
    @io.pos = start_byte
    if opts[:spectrum]
      string = @io.read(length)
    else
      # don't bother reading all the peak information if we aren't wanting
      # it and can avoid it! This is important for high res instruments
      # especially since the peak data is huge.
      @io.each_line do |line|
        if md = %r{<peaks}.match(line)
          # keep only the part of the line before the <peaks tag
          string << line[0, md.begin(0)]
          break
        end
        string << line
        if string.size >= length
          # keep the first `length` characters, dropping anything beyond
          string = string[0, length]
          break
        end
      end
    end
  ensure
    @io.pos = start_io_pos
  end

  doc = Nokogiri::XML.parse(string, *Ms::Msrun::Nokogiri::PARSER_ARGS)
  scan_n = doc.root
  scan = new_scan_from_node(scan_n)
  # nil when the (possibly truncated) scan element has no children
  first_child = scan_n.child

  peaks_n =
    if first_child && first_child.name == 'precursorMz'
      if opts[:precursor]
        prec = Ms::Precursor.new
        prec[1] = first_child['precursorIntensity'].to_f
        prec[0] = first_child.text.to_f
        if x = first_child['precursorCharge']
          prec[3] = [x.to_i]
        end
        scan.precursor = prec
      end
      first_child.next_sibling
    else
      first_child # this is a peaks node
    end

  if opts[:spectrum]
    # all mzXML (at least versions 1--3.0) *must* be 'network' byte order!
    # The base64 text is wrapped in a LazyString together with its unpack
    # code; presumably decoding is deferred until the data is accessed --
    # NOTE(review): confirm against Ms::Data::LazyString.
    peaks_data = Ms::Data.new_interleaved(Ms::Data::LazyString.new(peaks_n.text, Ms::Data::LazyIO.unpack_code(peaks_n['precision'].to_i, NetworkOrder)))
    scan[8] = Ms::Spectrum.new(peaks_data)
  end
  scan
end
141
+
142
# Extracts the bracketed range at the end of a filter line, e.g.
# "ITMS + c NSI d Full ms3 654.79@cid35.00 630.24@cid35.00 [160.00-1275.00]"
# gives [160.0, 1275.0].
#
# Returns [start, end] as Floats, or nil when the line holds no such range.
def start_end_from_filter_line(line)
  md = /\[([^-]+)-([^-]+)\]/.match(line)
  md && md[1, 2].map { |v| v.to_f }
end
146
+
147
# Creates an Ms::Scan and fills slots 0-6 from the attributes of a scan
# node: num, msLevel, retentionTime, startMz/endMz, peaksCount and
# totIonCurrent. When a filterLine attribute is present, its range replaces
# the values in slots 3 and 4.
def new_scan_from_node(node)
  scan = Ms::Scan.new # array class creates one with 9 positions
  scan[0] = node['num'].to_i
  scan[1] = node['msLevel'].to_i

  retention = node['retentionTime']
  # drop the 2 leading characters and the trailing one around the number
  scan[2] = retention[2...-1].to_f if retention

  start_mz = node['startMz']
  if start_mz
    scan[3] = start_mz.to_f
    scan[4] = node['endMz'].to_f
  end

  scan[5] = node['peaksCount'].to_i
  scan[6] = node['totIonCurrent'].to_f

  filter_line = node['filterLine']
  if filter_line
    (scan[3], scan[4]) = start_end_from_filter_line(filter_line)
  end
  scan
end
165
+
166
+ end
167
+
168
+
@@ -0,0 +1,126 @@
1
+
2
+ require 'ms/msrun'
3
+ require 'ms/spectrum'
4
+ require 'ms/data'
5
+ require 'ms/data/lazy_io'
6
+ require 'ms/precursor'
7
+ require 'ms/mzxml'
8
+
9
# Declares the namespace that holds the regular-expression based parsers.
module Ms
  class Msrun
    module Regexp; end
  end
end
15
+
16
+ class Ms::Msrun::Regexp::Mzxml
17
+
18
+ attr_accessor :msrun, :io, :version
19
+
20
# Remembers the run object to fill in, the io to read from and the file
# format version; all three are exposed through the accessors above.
def initialize(msrun_object, io, version)
  @msrun, @io, @version = msrun_object, io, version
end
25
+
26
# Reads @io line by line from its current position and copies header values
# onto the msrun object: parent file basename/location (from fileName),
# scan_count, start_time and end_time. Reading stops at the first line that
# opens a scan element (that line is consumed) or at end of file.
#
# Returns the msrun.
def parse_header
  while line = @io.gets
    # the header ends where the first scan begins
    break if line =~ /^\s*<scan/

    if line =~ %r{\s+fileName=['"](.*?)['"]}
      (bn, dn) = Ms::Mzxml.parent_basename_and_dir($1)
      @msrun.parent_basename = bn
      @msrun.parent_location = dn
    end
    if line =~ /\s+scanCount=['"](\w+)['"]/
      @msrun.scan_count = $1.to_i
    end
    # times look like "PT12.3S": drop the 2 leading chars and the last one
    if line =~ /startTime=['"]([\w\.]+)['"]/
      @msrun.start_time = $1[2...-1].to_f
    end
    if line =~ /endTime=['"]([\w\.]+)['"]/
      @msrun.end_time = $1[2...-1].to_f
    end
  end
  @msrun
end
49
+
50
# Builds an Ms::Precursor from a precursorMz element.
#
# line:: the line on which the element starts
# io::   optional source of further lines (anything responding to #gets),
#        consulted when the element does not close on `line`
#
# Slot 1 receives precursorIntensity, slot 3 a one-element array with
# precursorCharge, slot 0 the m/z value found before </precursorMz>.
# Parsing stops at the closing tag or when no further line is available.
#
# Returns the precursor.
def self.parse_precursor(line, io = nil)
  prec = Ms::Precursor.new
  while line
    if line =~ /precursorIntensity=['"]([\d\.]+)['"]/
      prec[1] = $1.to_f
    end
    if line =~ /precursorCharge=["'](\d+)["']/
      prec[3] = [$1.to_i]
    end
    if line =~ %r{>([\d\.]+)</precursorMz>}
      prec[0] = $1.to_f
      break
    end
    line = io && io.gets
  end
  prec
end
66
+
67
# Consumes lines from @io up to and including the line that closes the
# peaks element, picking up the precision and byteOrder attributes on the
# way (defaults: 32 and 'network').
#
# NOTE(review): this method is unfinished. Nothing is built from the values
# it collects and it returns nil. The original also computed, but never
# used, line.index('>') and @io.pos + line.rindex("</peaks>") on the closing
# line, presumably to bound the encoded data -- confirm the intent before
# completing it.
# NOTE(review): being defined with `self.`, @io here is an instance variable
# of the receiver of this method, not the one assigned in #initialize.
def self.parse_peaks
  precision = 32
  byte_order = 'network'
  while line = @io.gets
    # both attributes may share a line, so look at every match
    line.scan(/(precision|byteOrder)=["'](\w+)["']/) do |name, value|
      case name
      when 'precision'
        precision = value.to_i
      when 'byteOrder'
        byte_order = value
      end
    end
    break if line =~ %r{</peaks>}
  end
  nil
end
87
+
88
+
89
# Positions @io at start_byte and collects the attributes found on the
# lines that follow into a hash (name => value, both Strings), until a line
# opening a precursorMz element is met or the io is exhausted. On a
# precursorMz line, parse_precursor and parse_peaks are invoked and reading
# stops. The hash is then handed to new_scan_from_hash, whose result is
# returned.
#
# start_byte:: byte position where the scan element starts
# length::     currently unused
#
# NOTE(review): the result of parse_precursor is discarded, and this method
# is defined with `self.` while new_scan_from_hash is an instance method --
# confirm how this is meant to be called.
def self.parse_scan(start_byte, length)
  @io.pos = start_byte
  hash = {}
  while line = @io.gets
    if line =~ /^\s*<precursorMz/
      self.parse_precursor(line)
      self.parse_peaks
      break
    end
    # take every attribute on the line; values may hold any character but
    # a quote (e.g. "PT1.5S")
    line.scan(/(\w+)=["']([^"']*)["']/) { |key, value| hash[key] = value }
  end
  new_scan_from_hash(hash)
end
106
+
107
# Creates an Ms::Scan and fills slots 0-6 from a hash of attribute strings:
# num, msLevel, retentionTime, startMz/endMz, peaksCount and totIonCurrent.
# peaksCount and totIonCurrent are stored whether or not startMz is given,
# matching what the Nokogiri based parser does for scan nodes.
#
# hash:: attribute name => attribute value, both Strings
#
# Returns the scan.
def new_scan_from_hash(hash)
  scan = Ms::Scan.new # array class creates one with 9 positions
  scan[0] = hash['num'].to_i
  scan[1] = hash['msLevel'].to_i
  if x = hash['retentionTime']
    # drop the 2 leading characters and the trailing one around the number
    scan[2] = x[2...-1].to_f
  end
  if x = hash['startMz']
    scan[3] = x.to_f
    scan[4] = hash['endMz'].to_f
  end
  scan[5] = hash['peaksCount'].to_i
  scan[6] = hash['totIonCurrent'].to_f
  scan
end
122
+
123
+
124
+ end
125
+
126
+