ms-msrun 0.0.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +3 -4
- data/Rakefile +20 -5
- data/bin/base64_to_array.rb +3 -4
- data/bin/ms_to_obiwarp.rb +6 -15
- data/bin/ms_to_search.rb +15 -22
- data/lib/lmat.rb +47 -53
- data/lib/ms/msrun.rb +98 -108
- data/lib/ms/msrun/axml/mzxml.rb +6 -8
- data/lib/ms/msrun/index.rb +130 -0
- data/lib/ms/msrun/mzxml.rb +12 -0
- data/lib/ms/msrun/nokogiri.rb +12 -0
- data/lib/ms/msrun/nokogiri/mzxml.rb +168 -0
- data/lib/ms/msrun/regexp/mzxml.rb +126 -0
- data/lib/ms/msrun/search.rb +25 -21
- data/lib/ms/msrun/sha1.rb +36 -0
- data/lib/ms/mzxml.rb +12 -0
- data/lib/ms/precursor.rb +3 -2
- data/lib/ms/precursor/lazy_parent.rb +28 -0
- data/lib/ms/scan.rb +2 -29
- data/lib/ms/spectrum/compare.rb +42 -2
- data/lib/ms/spectrum/filter.rb +1 -1
- data/spec/ms/msrun/index_spec.rb +60 -0
- data/spec/ms/msrun/scan_spec.rb +78 -0
- data/spec/ms/msrun/search_spec.rb +6 -7
- data/spec/ms/msrun/sha1_spec.rb +23 -0
- data/spec/ms/msrun_spec.rb +111 -3
- data/spec/ms/scan_spec.rb +2 -2
- data/spec/ms/spectrum/compare_spec.rb +13 -6
- data/spec/ms/spectrum/filter_spec.rb +3 -3
- metadata +42 -21
- data/lib/bsearch.rb +0 -120
- data/lib/ms/spectrum.rb +0 -373
data/lib/ms/msrun/axml/mzxml.rb
CHANGED
@@ -1,4 +1,7 @@
|
|
1
1
|
|
2
|
+
require 'ms/spectrum'
|
3
|
+
require 'ms/data'
|
4
|
+
require 'ms/data/lazy_io'
|
2
5
|
require 'ms/msrun'
|
3
6
|
require 'ms/precursor'
|
4
7
|
require 'axml'
|
@@ -48,7 +51,7 @@ class Ms::Msrun::Axml::Mzxml
|
|
48
51
|
add_scan_nodes(scan_nodes, scans, scn_index, scans_by_num, version, io)
|
49
52
|
|
50
53
|
## update the scan's parents
|
51
|
-
Ms::Msrun
|
54
|
+
Ms::Msrun.add_parent_scan(scans)
|
52
55
|
|
53
56
|
# note that startTime and endTime are optional AND in >2.2 are dateTime
|
54
57
|
# instead of duration types!, so we will just use scan times...
|
@@ -80,13 +83,8 @@ class Ms::Msrun::Axml::Mzxml
|
|
80
83
|
when 'peaks'
|
81
84
|
# assumes that parsing was done with a LazyPeaks parser!
|
82
85
|
nc = node.text
|
83
|
-
|
84
|
-
|
85
|
-
# puts "PROBLEM: "
|
86
|
-
# p nc
|
87
|
-
# abort 'here'
|
88
|
-
#end
|
89
|
-
scan[8] = Ms::Spectrum.lazy(io, nc.first, nc.last, node['precision'].to_i, NetworkOrder)
|
86
|
+
data = Ms::Data::LazyIO.new(io, nc.first, nc.last, Ms::Data::LazyIO.unpack_code(node['precision'].to_i, NetworkOrder))
|
87
|
+
scan[8] = Ms::Spectrum.new(Ms::Data::Interleaved.new(data))
|
90
88
|
end
|
91
89
|
end
|
92
90
|
scan[7] = prec
|
@@ -0,0 +1,130 @@
|
|
1
|
+
require 'ms/msrun'
|
2
|
+
|
3
|
+
module Ms; end
class Ms::Msrun; end

# An index, keyed by scan number, of doublets where each doublet is
# [start_byte, length] for one scan.  Because the scan number is used as the
# array position, positions with no scan hold nil.
#
# Index#each yields only the doublets of real scans, in index order.
# Index#scan_nums gives an array of the scan numbers.
# Index#first and Index#last return the doublets of the first and the last
# scan, regardless of the scan numbers.
#
#     index.scan_nums  # -> [1,2,3,4]
#     index.each do |starting_byte, length|
#       IO.read('myfile.mzXML', length, starting_byte) # -> xml for each scan
#     end
#     index[0]     # -> nil
#     index.first  # -> [<start_byte>, <length>]  # for scan number 1
class Ms::Msrun::Index < Array
  # Array already mixes in Enumerable; kept to mirror the original declaration.
  include Enumerable

  MZXML_INDEX_TAG = 'indexOffset'
  MZML_INDEX_TAG = 'indexListOffset'

  # returns an array of the scan numbers, in the order the index lists them
  attr_reader :scan_nums

  # Takes an mzXML filename or an io object that responds to #path and fills
  # this array with [start_byte, length] doublets for the scans.
  #
  # A file opened here is always closed again, even on error; an io that was
  # passed in is left open (its position will have moved).
  #
  # Raises RuntimeError when the file type has no known index tag or when the
  # tag cannot be found near the end of the file.  Raises NotImplementedError
  # for mzML, whose index is not parsed yet.
  def initialize(filename_or_io)
    (ft, _version) = Ms::Msrun.filetype_and_version(filename_or_io)
    tag =
      case ft
      when :mzml then MZML_INDEX_TAG
      when :mzxml then MZXML_INDEX_TAG
      end
    given_path = filename_or_io.is_a?(String)
    path = given_path ? filename_or_io : filename_or_io.path
    raise "cannot index file of type #{ft.inspect}: #{path}" unless tag

    size = File.size(path)
    # offsets are byte positions, so read the file in binary mode
    io = given_path ? File.open(path, 'rb') : filename_or_io
    begin
      (offset, length) = index_offset(io, size, tag)
      raise "could not locate <#{tag}> near the end of #{path}" unless offset
      io.pos = offset
      xml = io.read(length)
    ensure
      io.close if given_path
    end
    replace(index_to_array(xml, offset, ft))
  end

  # returns the length from the start of the file to the first scan (i.e. the
  # start byte of the first indexed scan), or nil if no scan is indexed
  def header_length
    doublet = first
    doublet && doublet.first
  end

  # yields the doublet of every indexed scan, in index order.  Without a block
  # an enumerator is returned.
  def each(&block)
    return enum_for(:each) unless block
    scan_nums.each do |scan_num|
      block.call( self[scan_num] )
    end
  end

  # the doublet of the first indexed scan (nil if there are none)
  def first
    scan_num = scan_nums.first
    scan_num && self[scan_num]
  end

  # the doublet of the last indexed scan (nil if there are none)
  def last
    scan_num = scan_nums.last
    scan_num && self[scan_num]
  end

  # Searches the last bytes_backwards bytes of io for <tag>NUMBER</tag>.
  # returns [index_offset, length_of_index], or [nil, nil] if the tag is not
  # found.  The index is taken to end where the line holding the tag begins.
  def index_offset(io, size, tag=MZML_INDEX_TAG, bytes_backwards=150) # :nodoc:
    tag_re = /<#{tag}>([\-\d]+)<\/#{tag}>/
    # never seek before the start of a file shorter than the look-back window
    io.pos = [size - 1 - bytes_backwards, 0].max
    io.each do |line|
      next unless line =~ tag_re
      index_start = $1.to_i
      # io.pos is a byte position, so subtract the line's size in bytes
      index_end = io.pos - line.bytesize
      return [index_start, index_end - index_start]
    end
    [nil, nil]
  end

  # Turns the xml of the index into an array holding [start_byte, length] at
  # the position of each scan number, and records the scan numbers in
  # @scan_nums.  last_offset is used to calculate the length of the last
  # scan: every scan is taken to run up to the start of the next one, and the
  # last one up to last_offset.  (The index lines are read with a regexp
  # rather than an XML parser.)
  def index_to_array(xml_string, last_offset, type=:mzml) # :nodoc:
    offsets = []
    @scan_nums = []
    case type
    when :mzxml
      xml_string.each_line("\n") do |line|
        if line =~ /id="(\d+)".*>(\d+)</
          @scan_nums << $1.to_i
          offsets << $2.to_i
        end
      end
    when :mzml
      raise NotImplementedError
    end
    offsets << last_offset

    doublets = []
    @scan_nums.each_with_index do |scan_num, i|
      doublets[scan_num] = [offsets[i], offsets[i + 1] - offsets[i]]
    end
    doublets
  end
end
|
@@ -0,0 +1,168 @@
|
|
1
|
+
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'ms/msrun/nokogiri'
|
4
|
+
require 'ms/msrun'
|
5
|
+
require 'ms/spectrum'
|
6
|
+
require 'ms/data'
|
7
|
+
require 'ms/data/lazy_io'
|
8
|
+
require 'ms/precursor'
|
9
|
+
require 'ms/mzxml'
|
10
|
+
|
11
|
+
|
12
|
+
# Parses pieces of an mzXML file (the header, or a single scan located by
# byte offset and length) with Nokogiri and fills in Ms::Msrun / Ms::Scan
# objects.
class Ms::Msrun::Nokogiri::Mzxml
  NetworkOrder = true

  attr_accessor :msrun, :io, :version

  # msrun_object:: the object that parse_header fills in
  # io::           the io the mzXML is read from
  # version::      the mzXML version as a String (compared against '2.0')
  def initialize(msrun_object, io, version)
    @msrun = msrun_object
    @io = io
    @version = version
  end

  # Takes either the header xml itself or the number of bytes to read from
  # the start of the io to obtain it.  Sets scan_count, start_time, end_time,
  # parent_basename and parent_location on the msrun.
  #
  # start_time and end_time are only set when the attribute is present (they
  # are optional in mzXML).
  #
  # returns the msrun
  def parse_header(byte_length_or_header_string)
    string =
      if byte_length_or_header_string.is_a? Integer
        @io.rewind
        @io.read(byte_length_or_header_string)
      else
        byte_length_or_header_string
      end
    doc = Nokogiri::XML.parse(string, *Ms::Msrun::Nokogiri::PARSER_ARGS)
    msrun_n = doc.root
    # from version 2.0 on, the msRun node is taken to be the first child of
    # the document root (note: this is a String comparison)
    if @version >= '2.0'
      msrun_n = msrun_n.child
    end
    @msrun.scan_count = msrun_n['scanCount'].to_i
    # [2...-1] drops the first two characters and the last one
    # (presumably a duration written like "PT1.5S" -- TODO confirm)
    if start_time = msrun_n['startTime']
      @msrun.start_time = start_time[2...-1].to_f
    end
    if end_time = msrun_n['endTime']
      @msrun.end_time = end_time[2...-1].to_f
    end

    filename = msrun_n.search("parentFile").first['fileName']
    (bn, dn) = Ms::Mzxml.parent_basename_and_dir(filename)
    @msrun.parent_basename = bn
    @msrun.parent_location = dn
    @msrun
  end

  # Scans the lines starting at start_byte for an msLevel attribute, giving
  # up once more than length bytes have been read.  The io position is
  # restored afterwards, even on error.
  #
  # returns the ms_level as an Integer, nil if it cannot be found.
  def parse_ms_level(start_byte, length)
    start_io_pos = @io.pos
    begin
      @io.pos = start_byte
      ms_level = nil
      total_length = 0
      @io.each("\n") do |line|
        if line =~ /msLevel=["'](\d+)["']/
          ms_level = $1.to_i
          break
        end
        total_length += line.bytesize
        break if total_length > length
      end
      ms_level
    ensure
      @io.pos = start_io_pos
    end
  end

  # Reads the scan element that starts at start_byte and is length bytes
  # long.  The io position is restored afterwards.  Returns an Ms::Scan object
  # options:
  #     :spectrum => true | false (default is true)
  #     :precursor => true | false (default is true)
  #
  # Note that if both :spectrum and :precursor are set to false, the basic
  # information in the scan node *is* parsed (such as ms_level)
  def parse_scan(start_byte, length, options={})
    opts = {:spectrum => true, :precursor => true}.merge(options)
    string = read_scan_xml(start_byte, length, opts[:spectrum])

    doc = Nokogiri::XML.parse(string, *Ms::Msrun::Nokogiri::PARSER_ARGS)
    scan_n = doc.root
    scan = new_scan_from_node( scan_n )
    # may be nil when the xml was cut off before a peaks node with no
    # precursor in front of it
    first_n = scan_n.child

    peaks_n =
      if first_n && first_n.name == 'precursorMz'
        if opts[:precursor]
          prec = Ms::Precursor.new
          prec[1] = first_n['precursorIntensity'].to_f
          prec[0] = first_n.text.to_f
          if x = first_n['precursorCharge']
            prec[3] = [x.to_i]
          end
          scan.precursor = prec
        end
        first_n.next_sibling
      else
        first_n # this is a peaks node
      end

    # NOTE: the precursorScanNum attribute is not used here to link the
    # precursor to its parent scan.

    if opts[:spectrum]
      # all mzXML (at least versions 1--3.0) *must* be 'network' byte order!
      # data is stored as the base64 string until we actually try to access
      # it!  At that point the string is decoded and knows it is interleaved
      # data.  So, no spectrum is actually decoded unless it is accessed!
      peaks_data = Ms::Data.new_interleaved(Ms::Data::LazyString.new(peaks_n.text, Ms::Data::LazyIO.unpack_code(peaks_n['precision'].to_i, NetworkOrder)))
      scan[8] = Ms::Spectrum.new(peaks_data)
    end
    scan
  end

  # Pulls the two numbers out of the trailing "[start-end]" of a filter line
  # such as
  # "ITMS + c NSI d Full ms3 654.79@cid35.00 630.24@cid35.00 [160.00-1275.00]"
  #
  # returns [start, end] as Floats, or nil when the line holds no such range
  def start_end_from_filter_line(line)
    md = /\[([^-]+)-([^-]+)\]/.match(line)
    md && md[1,2].map {|v| v.to_f }
  end

  # Builds an Ms::Scan from the attributes of a scan node: num, msLevel,
  # retentionTime, startMz/endMz, peaksCount and totIonCurrent go into
  # positions 0 through 6.  A range found in filterLine takes precedence over
  # startMz/endMz.
  def new_scan_from_node(node)
    scan = Ms::Scan.new  # array class creates one with 9 positions
    scan[0] = node['num'].to_i
    scan[1] = node['msLevel'].to_i
    if x = node['retentionTime']
      scan[2] = x[2...-1].to_f
    end
    if x = node['startMz']
      scan[3] = x.to_f
      scan[4] = node['endMz'].to_f
    end
    scan[5] = node['peaksCount'].to_i
    scan[6] = node['totIonCurrent'].to_f
    if fl = node['filterLine']
      # leave startMz/endMz untouched when the filter line has no range
      if range = start_end_from_filter_line(fl)
        (scan[3], scan[4]) = range
      end
    end
    scan
  end

  private

  # Returns the xml of the scan at start_byte.  With with_peaks the whole
  # length bytes are read.  Otherwise lines are read only up to (and not
  # including) the first <peaks tag, or until length is reached, so that the
  # potentially huge peak data (high res instruments especially) is never
  # loaded when it isn't wanted.
  def read_scan_xml(start_byte, length, with_peaks)
    start_io_pos = @io.pos
    begin
      @io.pos = start_byte
      return @io.read(length) if with_peaks

      string = ""
      @io.each do |line|
        if md = %r{<peaks}.match(line)
          # just add the part of the line before the <peaks tag
          string << md.pre_match
          break
        end
        string << line
        if string.size >= length
          # keep the first length characters, dropping anything past them
          string.slice!(length..-1)
          break
        end
      end
      string
    ensure
      @io.pos = start_io_pos
    end
  end

end
|
167
|
+
|
168
|
+
|
@@ -0,0 +1,126 @@
|
|
1
|
+
|
2
|
+
require 'ms/msrun'
|
3
|
+
require 'ms/spectrum'
|
4
|
+
require 'ms/data'
|
5
|
+
require 'ms/data/lazy_io'
|
6
|
+
require 'ms/precursor'
|
7
|
+
require 'ms/mzxml'
|
8
|
+
|
9
|
+
module Ms
  class Msrun
    module Regexp
    end
  end
end

# Reads mzXML line by line with regular expressions instead of an XML parser.
#
# NOTE(review): this class is unfinished.  parse_header is an instance method
# reading the instance's @io, while parse_precursor, parse_peaks and
# parse_scan are class-level methods reading the *class's* @io, which nothing
# in this file assigns.  No spectrum is built from the peaks yet.
class Ms::Msrun::Regexp::Mzxml

  attr_accessor :msrun, :io, :version

  def initialize(msrun_object, io, version)
    @msrun = msrun_object
    @io = io
    @version = version
  end

  # Reads lines from the io up to and including the first line that opens a
  # <scan element, setting parent_basename, parent_location, scan_count,
  # start_time and end_time on the msrun from whichever attributes it meets.
  #
  # returns the msrun
  def parse_header
    while line = @io.gets
      if line =~ %r{\s+fileName=['"](.*?)['"]}
        (bn, dn) = Ms::Mzxml.parent_basename_and_dir($1)
        @msrun.parent_basename = bn
        @msrun.parent_location = dn
      end
      if line =~ /\s+scanCount=['"](\w+)['"]/
        @msrun.scan_count = $1.to_i
      end
      # [2...-1] drops the first two characters and the last one
      # (presumably a duration written like "PT1.5S" -- TODO confirm)
      if line =~ /startTime=['"]([\w\.]+)['"]/
        @msrun.start_time = $1[2...-1].to_f
      end
      if line =~ /endTime=['"]([\w\.]+)['"]/
        @msrun.end_time = $1[2...-1].to_f
      end
      # the header ends where the first scan begins
      break if line =~ /^\s*<scan/
    end
    @msrun
  end

  # Starting with line (and reading further lines from the class-level @io as
  # needed), collects precursorIntensity into position 1, precursorCharge (as
  # a one element array) into position 3 and the m/z text before
  # </precursorMz> into position 0.  Stops at </precursorMz> or at the end of
  # the io.
  #
  # returns the Ms::Precursor
  def self.parse_precursor(line)
    prec = Ms::Precursor.new
    while line
      if line =~ /precursorIntensity=['"]([^'"]+)['"]/
        prec[1] = $1.to_f
      end
      if line =~ /precursorCharge=["'](\d+)["']/
        prec[3] = [$1.to_i]
      end
      if line =~ %r{>([\d\.]+)</precursorMz>}
        prec[0] = $1.to_f
        break
      end
      line = @io.gets
    end
    prec
  end

  # Reads lines from the class-level @io through the line holding </peaks>,
  # picking up the precision and byteOrder attributes on the way (defaults:
  # 32 and 'network').
  #
  # returns [precision, byte_order]
  #
  # TODO: the peak data itself is skipped; no Ms::Spectrum is built from it.
  def self.parse_peaks
    precision = 32
    byte_order = 'network'
    while line = @io.gets
      # both attributes may sit on the same line
      line.scan(/(precision|byteOrder)=["'](\w+)["']/) do |name, value|
        case name
        when 'precision'
          precision = value.to_i
        when 'byteOrder'
          byte_order = value
        end
      end
      break if line =~ %r{</peaks>}
    end
    [precision, byte_order]
  end

  # Reads the scan element that starts at start_byte of the class-level @io.
  # Attributes are gathered from every line until a <precursorMz line is met
  # (the precursor is then parsed and the peaks skipped) or until length
  # bytes have been read, so the attributes of a following scan are never
  # picked up.  A parsed precursor is stored in position 7 of the scan.
  #
  # Returns an Ms::Scan object
  def self.parse_scan(start_byte, length)
    @io.pos = start_byte
    attrs = {}
    prec = nil
    consumed = 0
    while line = @io.gets
      if line =~ /^\s*<precursorMz/
        prec = parse_precursor(line)
        parse_peaks
        break
      end
      # every attribute on the line; values may hold dots etc. ("PT0.44S")
      line.scan(/(\w+)=["']([^"']*)["']/) {|name, value| attrs[name] = value }
      consumed += line.bytesize
      break if consumed >= length
    end
    scan = new_scan_from_hash(attrs)
    scan[7] = prec if prec
    scan
  end

  # Builds an Ms::Scan from a hash of attribute name => string value: num,
  # msLevel, retentionTime, startMz/endMz, peaksCount and totIonCurrent go
  # into positions 0 through 6.
  def self.new_scan_from_hash(hash)
    scan = Ms::Scan.new  # array class creates one with 9 positions
    scan[0] = hash['num'].to_i
    scan[1] = hash['msLevel'].to_i
    if x = hash['retentionTime']
      scan[2] = x[2...-1].to_f
    end
    if x = hash['startMz']
      scan[3] = x.to_f
      scan[4] = hash['endMz'].to_f
    end
    # set whether or not startMz is present
    scan[5] = hash['peaksCount'].to_i
    scan[6] = hash['totIonCurrent'].to_f
    scan
  end

  # instance-level access to Mzxml.new_scan_from_hash
  def new_scan_from_hash(hash)
    self.class.new_scan_from_hash(hash)
  end

end
|
125
|
+
|
126
|
+
|