ms-msrun 0.0.1 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README +3 -4
- data/Rakefile +20 -5
- data/bin/base64_to_array.rb +3 -4
- data/bin/ms_to_obiwarp.rb +6 -15
- data/bin/ms_to_search.rb +15 -22
- data/lib/lmat.rb +47 -53
- data/lib/ms/msrun.rb +98 -108
- data/lib/ms/msrun/axml/mzxml.rb +6 -8
- data/lib/ms/msrun/index.rb +130 -0
- data/lib/ms/msrun/mzxml.rb +12 -0
- data/lib/ms/msrun/nokogiri.rb +12 -0
- data/lib/ms/msrun/nokogiri/mzxml.rb +168 -0
- data/lib/ms/msrun/regexp/mzxml.rb +126 -0
- data/lib/ms/msrun/search.rb +25 -21
- data/lib/ms/msrun/sha1.rb +36 -0
- data/lib/ms/mzxml.rb +12 -0
- data/lib/ms/precursor.rb +3 -2
- data/lib/ms/precursor/lazy_parent.rb +28 -0
- data/lib/ms/scan.rb +2 -29
- data/lib/ms/spectrum/compare.rb +42 -2
- data/lib/ms/spectrum/filter.rb +1 -1
- data/spec/ms/msrun/index_spec.rb +60 -0
- data/spec/ms/msrun/scan_spec.rb +78 -0
- data/spec/ms/msrun/search_spec.rb +6 -7
- data/spec/ms/msrun/sha1_spec.rb +23 -0
- data/spec/ms/msrun_spec.rb +111 -3
- data/spec/ms/scan_spec.rb +2 -2
- data/spec/ms/spectrum/compare_spec.rb +13 -6
- data/spec/ms/spectrum/filter_spec.rb +3 -3
- metadata +42 -21
- data/lib/bsearch.rb +0 -120
- data/lib/ms/spectrum.rb +0 -373
data/lib/ms/msrun/axml/mzxml.rb
CHANGED
@@ -1,4 +1,7 @@
|
|
1
1
|
|
2
|
+
require 'ms/spectrum'
|
3
|
+
require 'ms/data'
|
4
|
+
require 'ms/data/lazy_io'
|
2
5
|
require 'ms/msrun'
|
3
6
|
require 'ms/precursor'
|
4
7
|
require 'axml'
|
@@ -48,7 +51,7 @@ class Ms::Msrun::Axml::Mzxml
|
|
48
51
|
add_scan_nodes(scan_nodes, scans, scn_index, scans_by_num, version, io)
|
49
52
|
|
50
53
|
## update the scan's parents
|
51
|
-
Ms::Msrun
|
54
|
+
Ms::Msrun.add_parent_scan(scans)
|
52
55
|
|
53
56
|
# note that startTime and endTime are optional AND in >2.2 are dateTime
|
54
57
|
# instead of duration types!, so we will just use scan times...
|
@@ -80,13 +83,8 @@ class Ms::Msrun::Axml::Mzxml
|
|
80
83
|
when 'peaks'
|
81
84
|
# assumes that parsing was done with a LazyPeaks parser!
|
82
85
|
nc = node.text
|
83
|
-
|
84
|
-
|
85
|
-
# puts "PROBLEM: "
|
86
|
-
# p nc
|
87
|
-
# abort 'here'
|
88
|
-
#end
|
89
|
-
scan[8] = Ms::Spectrum.lazy(io, nc.first, nc.last, node['precision'].to_i, NetworkOrder)
|
86
|
+
data = Ms::Data::LazyIO.new(io, nc.first, nc.last, Ms::Data::LazyIO.unpack_code(node['precision'].to_i, NetworkOrder))
|
87
|
+
scan[8] = Ms::Spectrum.new(Ms::Data::Interleaved.new(data))
|
90
88
|
end
|
91
89
|
end
|
92
90
|
scan[7] = prec
|
@@ -0,0 +1,130 @@
|
|
1
|
+
require 'ms/msrun'
|
2
|
+
|
3
|
+
module Ms ; end
class Ms::Msrun ; end

# An index, keyed by scan number, of doublets where each doublet is
# [start_byte, length] for one scan in the file.  Index objects are enumerable
# and yield the doublets in the order the scans are listed in the file's index.
#
# Index#scan_nums gives an array of the scan numbers.
# Index#first and Index#last return the doublet of the first and the last
# scan, regardless of the scan numbers.
#
#     index.scan_nums  # -> [1,2,3,4]
#     index.each do |starting_byte, length|
#       IO.read(myfile.mzXML, length, starting_byte) # -> xml for each scan
#     end
#     index[0]  # -> nil  (there is no scan number 0)
#     index.first # -> [<start_byte>, <length>]  # for scan number 1
class Ms::Msrun::Index < Array
  include Enumerable

  MZXML_INDEX_TAG = 'indexOffset'
  MZML_INDEX_TAG = 'indexListOffset'

  # an array of the scan numbers, in the order they appear in the index
  attr_reader :scan_nums

  # Takes an mzXML filename or an io object that responds to #path and fills
  # the receiver so that self[scan_num] == [start_byte, length].
  #
  # A file opened here (filename given) is always closed again, even when
  # parsing fails; an io passed in by the caller is left open.
  #
  # Raises RuntimeError if the index offset tag cannot be found near the end
  # of the file and NotImplementedError for mzML files.
  def initialize(filename_or_io)
    super()
    (ft, _version) = Ms::Msrun.filetype_and_version(filename_or_io)
    # NOTE: "when x : y" was Ruby 1.8-only syntax; "then" works everywhere.
    tag =
      case ft
      when :mzml then MZML_INDEX_TAG
      when :mzxml then MZXML_INDEX_TAG
      end
    given_filename = filename_or_io.is_a?(String)
    fn = given_filename ? filename_or_io : filename_or_io.path
    size = File.size(fn)
    # binary mode so that byte offsets are never disturbed by newline translation
    io = given_filename ? File.open(fn, 'rb') : filename_or_io
    begin
      (offset, length) = index_offset(io, size, tag)
      unless offset
        raise RuntimeError, "could not find an index offset tag (#{tag.inspect}) near the end of #{fn}"
      end
      io.pos = offset
      xml = io.read(length)
    ensure
      io.close if given_filename
    end
    replace( index_to_array(xml, offset, ft) )
  end

  # returns the length from the start of the file to the first scan (nil if
  # the index holds no scans)
  def header_length
    each {|pair| return pair.first }
    nil
  end

  # yields the [start_byte, length] doublet of each scan, in scan_nums order.
  # Returns an Enumerator when no block is given.
  def each(&block)
    return to_enum(:each) unless block
    scan_nums.each do |scan_num|
      block.call( self[scan_num] )
    end
  end

  # the doublet of the first indexed scan (nil if there are no scans)
  def first
    scan_nums.empty? ? nil : self[scan_nums.first]
  end

  # the doublet of the last indexed scan (nil if there are no scans)
  def last
    scan_nums.empty? ? nil : self[scan_nums.last]
  end

  # Searches the final bytes_backwards bytes of io for
  # <tag>NUMBER</tag> and returns [index_offset, length_of_index], or
  # [nil, nil] if the tag is not found.  The index is taken to end where the
  # line holding the tag begins.
  def index_offset(io, size, tag=MZML_INDEX_TAG, bytes_backwards=150) # :nodoc:
    tag_re = /<#{tag}>([\-\d]+)<\/#{tag}>/
    # never seek before the start of a (very small) file
    start = size - 1 - bytes_backwards
    io.pos = start < 0 ? 0 : start
    io.each do |line|
      if line =~ tag_re
        found_offset = $1.to_i
        # io.pos counts bytes, so measure the line in bytes as well
        index_end = io.pos - line.bytesize
        return [found_offset, index_end - found_offset]
      end
    end
    [nil, nil]
  end

  # Turns the xml of the index into an array where
  # array[scan_num] = [start_byte, length].  Also sets @scan_nums.
  #
  # last_offset is used to calculate the length of the last scan object (or
  # whatever)
  def index_to_array(xml_string, last_offset, type=:mzml) # :nodoc:
    offsets = []
    @scan_nums = []
    case type
    when :mzxml
      xml_string.each_line("\n") do |line|
        if line =~ /id="(\d+)".*>(\d+)</
          @scan_nums << $1.to_i
          offsets << $2.to_i
        end
      end
    when :mzml
      raise NotImplementedError
    end
    # sentinel: the byte after the last scan is where the index begins
    offsets << last_offset

    by_scan_num = []
    @scan_nums.each_with_index do |scan_num, i|
      by_scan_num[scan_num] = [offsets[i], offsets[i+1] - offsets[i]]
    end
    by_scan_num
  end
end
|
@@ -0,0 +1,168 @@
|
|
1
|
+
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'ms/msrun/nokogiri'
|
4
|
+
require 'ms/msrun'
|
5
|
+
require 'ms/spectrum'
|
6
|
+
require 'ms/data'
|
7
|
+
require 'ms/data/lazy_io'
|
8
|
+
require 'ms/precursor'
|
9
|
+
require 'ms/mzxml'
|
10
|
+
|
11
|
+
|
12
|
+
# Parses the header and individual scans of an mzXML file with Nokogiri,
# reading from an io object at byte positions supplied by the caller
# (typically taken from an Ms::Msrun::Index).
class Ms::Msrun::Nokogiri::Mzxml
  NetworkOrder = true

  attr_accessor :msrun, :io, :version

  # msrun_object:: the object that parse_header fills in
  # io::           the io object the mzXML is read from
  # version::      the mzXML version as a String (compared against '2.0')
  def initialize(msrun_object, io, version)
    @msrun = msrun_object
    @io = io
    @version = version
  end

  # Parses the header and sets scan_count, start_time, end_time,
  # parent_basename and parent_location on the msrun.  The argument is either
  # the header length in bytes (the header is then read from the start of the
  # io) or the header string itself.  Returns the msrun.
  def parse_header(byte_length_or_header_string)
    string =
      if byte_length_or_header_string.is_a? Integer
        @io.rewind
        @io.read(byte_length_or_header_string)
      else
        byte_length_or_header_string
      end
    # ::Nokogiri is the xml library (not the Ms::Msrun::Nokogiri namespace)
    doc = ::Nokogiri::XML.parse(string, *Ms::Msrun::Nokogiri::PARSER_ARGS)
    msrun_n = doc.root
    # NOTE(review): this is a String comparison ('10.0' < '2.0'); fine for
    # single digit major versions -- confirm if that ever changes.
    if @version >= '2.0'
      msrun_n = msrun_n.child
    end
    @msrun.scan_count = msrun_n['scanCount'].to_i
    # startTime and endTime are optional attributes; the values are sliced
    # with [2...-1], i.e. without the first two and the last character
    if x = msrun_n['startTime']
      @msrun.start_time = x[2...-1].to_f
    end
    if x = msrun_n['endTime']
      @msrun.end_time = x[2...-1].to_f
    end

    filename = msrun_n.search("parentFile").first['fileName']
    (bn, dn) = Ms::Mzxml.parent_basename_and_dir(filename)
    @msrun.parent_basename = bn
    @msrun.parent_location = dn
    @msrun
  end

  # returns the ms_level as an Integer, nil if it cannot be found within
  # (roughly) length bytes of start_byte.  The io position is restored.
  def parse_ms_level(start_byte, length)
    start_io_pos = @io.pos
    @io.pos = start_byte
    ms_level = nil
    total_length = 0
    @io.each("\n") do |line|
      if line =~ /msLevel="(\d+)"/
        ms_level = $1.to_i
        break
      end
      total_length += line.size
      break if total_length > length
    end
    @io.pos = start_io_pos
    ms_level
  end

  # Reads the scan element that starts at start_byte and is length bytes long
  # and returns an Ms::Scan object.
  # options:
  #     :spectrum => true | false (default is true)
  #     :precursor => true | false (default is true)
  #
  # Note that if both :spectrum and :precursor are set to false, the basic
  # information in the scan node *is* parsed (such as ms_level)
  #
  # NOTE(review): unlike parse_ms_level, the io position is *not* restored
  # afterwards (the original recorded it but never used it); left unchanged.
  def parse_scan(start_byte, length, options={})
    opts = {:spectrum => true, :precursor => true}.merge(options)
    @io.pos = start_byte

    string = ""
    if opts[:spectrum]
      string = @io.read(length)
    else
      # don't bother reading all the peak information if we aren't wanting it
      # and can avoid it!  This is important for high res instruments
      # especially since the peak data is huge.
      @io.each do |line|
        if md = %r{<peaks}.match(line)
          # just add the part of the line before the <peaks> tag
          string << md.pre_match
          break
        else
          string << line
          if string.size >= length
            # keep only the scan itself (slice!(0,length) kept the *rest*)
            string = string[0, length]
            break
          end
        end
      end
    end

    doc = ::Nokogiri::XML.parse(string, *Ms::Msrun::Nokogiri::PARSER_ARGS)
    scan_n = doc.root
    scan = new_scan_from_node( scan_n )
    # nil when the scan has no children at all (e.g. a scan without a
    # precursor whose peaks were skipped above)
    first_child = scan_n.child

    peaks_n =
      if first_child && first_child.name == 'precursorMz'
        if opts[:precursor]
          prec = Ms::Precursor.new
          prec[1] = first_child['precursorIntensity'].to_f
          prec[0] = first_child.text.to_f
          if x = first_child['precursorCharge']
            prec[3] = [x.to_i]
          end
          scan.precursor = prec
        end
        first_child.next_sibling
      else
        first_child # this is a peaks node
      end

    if opts[:spectrum] && peaks_n
      # all mzXML (at least versions 1--3.0) *must* be 'network' byte order!
      # data is stored as the base64 string until we actually try to access
      # it!  At that point the string is decoded and knows it is interleaved
      # data.  So, no spectrum is actually decoded unless it is accessed!
      peaks_data = Ms::Data.new_interleaved(Ms::Data::LazyString.new(peaks_n.text, Ms::Data::LazyIO.unpack_code(peaks_n['precision'].to_i, NetworkOrder)))
      scan[8] = Ms::Spectrum.new(peaks_data)
    end
    scan
  end

  # Pulls the m/z range out of a filter line such as
  # "ITMS + c NSI d Full ms3 654.79@cid35.00 630.24@cid35.00 [160.00-1275.00]"
  # and returns [start, end] as Floats, or nil if the line holds no
  # "[start-end]" range.
  def start_end_from_filter_line(line)
    md = /\[([^-]+)-([^-]+)\]/.match(line)
    md && md[1,2].map {|v| v.to_f }
  end

  # builds an Ms::Scan from the attributes of a scan node.  Positions set:
  # 0 num, 1 msLevel, 2 retentionTime, 3 start m/z, 4 end m/z, 5 peaksCount,
  # 6 totIonCurrent.  A range found in filterLine overrides startMz/endMz.
  def new_scan_from_node(node)
    scan = Ms::Scan.new  # array class creates one with 9 positions
    scan[0] = node['num'].to_i
    scan[1] = node['msLevel'].to_i
    if x = node['retentionTime']
      scan[2] = x[2...-1].to_f
    end
    if x = node['startMz']
      scan[3] = x.to_f
      scan[4] = node['endMz'].to_f
    end
    scan[5] = node['peaksCount'].to_i
    scan[6] = node['totIonCurrent'].to_f
    if (fl = node['filterLine']) && (range = start_end_from_filter_line(fl))
      (scan[3], scan[4]) = range
    end
    scan
  end

end
|
167
|
+
|
168
|
+
|
@@ -0,0 +1,126 @@
|
|
1
|
+
|
2
|
+
require 'ms/msrun'
|
3
|
+
require 'ms/spectrum'
|
4
|
+
require 'ms/data'
|
5
|
+
require 'ms/data/lazy_io'
|
6
|
+
require 'ms/precursor'
|
7
|
+
require 'ms/mzxml'
|
8
|
+
|
9
|
+
module Ms
  class Msrun
    module Regexp
    end
  end
end

# A line-oriented, regular expression based mzXML parser (no xml library
# needed).  It reads from an io object one line at a time.
#
# NOTE(review): turning the <peaks> data into an Ms::Spectrum was never
# implemented here; scans returned by this parser carry no spectrum.
class Ms::Msrun::Regexp::Mzxml

  attr_accessor :msrun, :io, :version

  def initialize(msrun_object, io, version)
    @msrun = msrun_object
    @io = io
    @version = version
  end

  # Reads lines from the io up to and including the first line that opens a
  # <scan element, setting parent_basename, parent_location, scan_count,
  # start_time and end_time on the msrun as they are found.
  # returns the msrun
  def parse_header
    while line = @io.gets
      if line =~ %r{\s+fileName=['"](.*?)['"]}
        (bn, dn) = Ms::Mzxml.parent_basename_and_dir($1)
        @msrun.parent_basename = bn
        @msrun.parent_location = dn
      end
      if line =~ /\s+scanCount=['"](\w+)['"]/
        @msrun.scan_count = $1.to_i
      end
      if line =~ /startTime=['"]([\w\.]+)['"]/
        @msrun.start_time = $1[2...-1].to_f
      end
      if line =~ /endTime=['"]([\w\.]+)['"]/
        @msrun.end_time = $1[2...-1].to_f
      end
      # the header ends where the first scan begins (test the *line*, not
      # the io object)
      break if line =~ /^\s*<scan/
    end
    @msrun
  end

  # Parses a precursorMz element beginning on line and returns an
  # Ms::Precursor with [0] = m/z, [1] = intensity and [3] = [charge] set
  # where found.  If the element is not closed on line, further lines are
  # read from io (when one is given) until it is.
  def self.parse_precursor(line, io=nil)
    prec = Ms::Precursor.new
    while line
      if line =~ /precursorIntensity=['"]([\d\.]+)['"]/
        prec[1] = $1.to_f
      end
      if line =~ /precursorCharge=["'](\d+)["']/
        prec[3] = [$1.to_i]
      end
      if line =~ %r{>([\d\.]+)</precursorMz>}
        prec[0] = $1.to_f
        break
      end
      line = io && io.gets
    end
    prec
  end

  # Reads lines from io through the one containing </peaks> and returns
  # [precision, byte_order] (defaults: 32 and 'network').
  #
  # NOTE(review): the peak data itself is not turned into a spectrum yet.
  def self.parse_peaks(io=@io)
    precision = 32
    byte_order = 'network'
    while line = io.gets
      line.scan(/(precision|byteOrder)=["'](\w+)["']/) do |key, val|
        case key
        when 'precision'
          precision = val.to_i
        when 'byteOrder'
          byte_order = val
        end
      end
      break if line =~ %r{</peaks>}
    end
    [precision, byte_order]
  end

  # Parses the scan element that starts at start_byte of io and is length
  # bytes long.  Returns an Ms::Scan object (with its precursor set if the
  # scan has one).  Reading stops at the precursor/peaks of the scan or once
  # length bytes were consumed, so attributes of later scans are never
  # picked up.
  def self.parse_scan(start_byte, length, io=@io)
    io.pos = start_byte
    hash = {}
    prec = nil
    consumed = 0
    while line = io.gets
      consumed += line.bytesize
      if line =~ /^\s*<precursorMz/
        prec = parse_precursor(line, io)
        parse_peaks(io)
        break
      end
      # a scan without precursor: its own attributes end where peaks begin
      break if line =~ /^\s*<peaks/
      # every attribute on the line; values may hold '.', '+', etc.
      line.scan(/(\w+)=["']([^"']*)["']/) do |key, val|
        hash[key] = val
      end
      break if consumed >= length
    end
    scan = new_scan_from_hash(hash)
    scan.precursor = prec if prec
    scan
  end

  # instance level convenience: parses the scan from this parser's io
  def parse_scan(start_byte, length)
    self.class.parse_scan(start_byte, length, @io)
  end

  # builds an Ms::Scan from a hash of scan attributes (String keys and
  # values).  Positions set: 0 num, 1 msLevel, 2 retentionTime, 3 startMz,
  # 4 endMz, 5 peaksCount, 6 totIonCurrent.
  def self.new_scan_from_hash(hash)
    scan = Ms::Scan.new  # array class creates one with 9 positions
    scan[0] = hash['num'].to_i
    scan[1] = hash['msLevel'].to_i
    if x = hash['retentionTime']
      scan[2] = x[2...-1].to_f
    end
    if x = hash['startMz']
      scan[3] = x.to_f
      scan[4] = hash['endMz'].to_f
    end
    # set regardless of startMz (as the Nokogiri based parser does)
    scan[5] = hash['peaksCount'].to_i
    scan[6] = hash['totIonCurrent'].to_f
    scan
  end

  # see Mzxml.new_scan_from_hash
  def new_scan_from_hash(hash)
    self.class.new_scan_from_hash(hash)
  end

end
|
125
|
+
|
126
|
+
|