mspire 0.7.8 → 0.7.9
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +1 -0
- data/VERSION +1 -1
- data/lib/mspire/mzml.rb +5 -1
- data/lib/mspire/mzml/data_array.rb +8 -1
- data/lib/mspire/peak.rb +8 -0
- data/lib/mspire/peak_list.rb +206 -136
- data/script/mzml_read_binary.rb +1 -1
- data/script/quant_compare_direct_injections.rb +110 -0
- data/spec/mspire/peak_list_spec.rb +189 -44
- metadata +32 -71
- data/mspire.gemspec +0 -236
data/Rakefile
CHANGED
@@ -19,6 +19,7 @@ Jeweler::Tasks.new do |gem|
|
|
19
19
|
gem.add_dependency "builder", "~> 3.0.0"
|
20
20
|
gem.add_dependency "bio", "~> 1.4.2"
|
21
21
|
gem.add_dependency "trollop", "~> 1.16.2"
|
22
|
+
gem.add_dependency "uuid", ">= 2.3.5"
|
22
23
|
# this should be a real dependency, but need to document getting this
|
23
24
|
# working on windows first!
|
24
25
|
gem.add_development_dependency "fftw3", "~> 0.3"
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.7.8
|
1
|
+
0.7.9
|
data/lib/mspire/mzml.rb
CHANGED
@@ -150,7 +150,11 @@ module Mspire
|
|
150
150
|
case arg
|
151
151
|
when IO
|
152
152
|
@io = arg
|
153
|
-
|
153
|
+
begin
|
154
|
+
@encoding = @io.bookmark(true) {|io| io.readline.match(/encoding=["'](.*?)["']/)[1] }
|
155
|
+
rescue EOFError
|
156
|
+
raise RuntimeError, "no encoding present in XML! (Is this even an xml file?)"
|
157
|
+
end
|
154
158
|
@index_list = get_index_list
|
155
159
|
read_header!
|
156
160
|
when Hash
|
@@ -89,7 +89,14 @@ module Mspire
|
|
89
89
|
end
|
90
90
|
end
|
91
91
|
data = base64.unpack("m*").first
|
92
|
-
|
92
|
+
# some implementations leave data blank if there aren't peaks
|
93
|
+
# even if they say it is zlib compressed...
|
94
|
+
unzipped =
|
95
|
+
if data.size > 0
|
96
|
+
compressed ? Zlib::Inflate.inflate(data) : data
|
97
|
+
else
|
98
|
+
data
|
99
|
+
end
|
93
100
|
self.new( unzipped.unpack(precision_unpack) )
|
94
101
|
end
|
95
102
|
|
data/lib/mspire/peak.rb
CHANGED
data/lib/mspire/peak_list.rb
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
require 'mspire/bin'
|
2
2
|
|
3
3
|
module Mspire
|
4
|
-
# a collection of peak objects
|
4
|
+
# a collection of peak objects. At a minimum, each peak should respond to
|
5
|
+
# :x and :y
|
5
6
|
class PeakList < Array
|
6
7
|
|
7
8
|
def lo_x
|
@@ -21,22 +22,46 @@ module Mspire
|
|
21
22
|
:centroided => true,
|
22
23
|
}
|
23
24
|
|
25
|
+
# Intensity-weighted mean x of the list (for spectral peaks, this is the
# weighted m/z): the sum over all peaks of x * (y / total y).
#
# Each peak only needs to respond to :x and :y.
#
# Returns a Float; 0.0 for an empty list. If the y values sum to zero the
# weights are undefined and no finite result is produced (NaN).
def weighted_x
  tot_intensity = inject(0.0) {|sum, peak| sum + peak.y }
  inject(0.0) do |weighted, peak|
    # each peak contributes its x scaled by its share of the total intensity
    weighted + (peak.x * (peak.y / tot_intensity))
  end
end
|
36
|
+
|
37
|
+
# class methods
|
24
38
|
class << self
|
25
39
|
|
40
|
+
# creates a new peak list (via self.new) from the given peaks, wrapping each
|
41
|
+
# argument with Mspire::Peak.new. If your peaks already behave like peaks
|
42
|
+
# (respond to :x and :y) you should use .new instead
|
43
|
+
def [](*peaks)
|
44
|
+
self.new( peaks.map {|peak| Mspire::Peak.new(peak) } )
|
45
|
+
end
|
46
|
+
|
26
47
|
def create_bins(peaklists, opts)
|
27
|
-
min, max =
|
48
|
+
min, max = min_max_x(peaklists)
|
28
49
|
|
29
50
|
divisions = []
|
30
51
|
bin_width = opts[:bin_width]
|
31
52
|
use_ppm = (opts[:bin_unit] == :ppm)
|
32
|
-
|
53
|
+
|
54
|
+
puts "using bin width: #{bin_width}" if $VERBOSE
|
55
|
+
puts "using ppm for bins: #{use_ppm}" if $VERBOSE
|
56
|
+
|
57
|
+
current_x = min
|
33
58
|
loop do
|
34
|
-
if
|
59
|
+
if current_x >= max
|
35
60
|
divisions << max
|
36
61
|
break
|
37
62
|
else
|
38
|
-
divisions <<
|
39
|
-
|
63
|
+
divisions << current_x
|
64
|
+
current_x += ( use_ppm ? current_x./(1e6).*(bin_width) : bin_width )
|
40
65
|
end
|
41
66
|
end
|
42
67
|
# make each bin exclusive so there is no overlap
|
@@ -46,10 +71,10 @@ module Mspire
|
|
46
71
|
bins
|
47
72
|
end
|
48
73
|
|
49
|
-
def
|
74
|
+
def min_max_x(peaklists)
|
50
75
|
# find the min and max across all spectra
|
51
76
|
first_peaklist = peaklists.first
|
52
|
-
min = first_peaklist.first
|
77
|
+
min = first_peaklist.first.x; max = first_peaklist.last.x
|
53
78
|
peaklists.each do |peaklist|
|
54
79
|
min = peaklist.lo_x if peaklist.lo_x < min
|
55
80
|
max = peaklist.hi_x if peaklist.hi_x > max
|
@@ -58,53 +83,62 @@ module Mspire
|
|
58
83
|
end
|
59
84
|
|
60
85
|
def merge_centroids(peaklists, opts={})
|
86
|
+
opts[:return_data] = true if opts[:only_data]
|
61
87
|
|
62
88
|
# Create Mspire::Bin objects
|
63
89
|
bins = opts[:bins] ? opts[:bins] : create_bins(peaklists, opts)
|
90
|
+
puts "created #{bins.size} bins" if $VERBOSE
|
64
91
|
|
65
92
|
peaklists.each do |peaklist|
|
66
|
-
Mspire::Bin.bin(bins, peaklist, &:
|
93
|
+
Mspire::Bin.bin(bins, peaklist, &:x)
|
67
94
|
end
|
68
95
|
|
69
96
|
pseudo_peaks = bins.map do |bin|
|
70
|
-
[bin, bin.data.reduce(0.0) {|sum,peak| sum + peak
|
97
|
+
Mspire::Peak.new( [bin, bin.data.reduce(0.0) {|sum,peak| sum + peak.y }] )
|
71
98
|
end
|
72
99
|
|
73
100
|
pseudo_peaklist = Mspire::PeakList.new(pseudo_peaks)
|
74
101
|
|
75
|
-
|
102
|
+
separate_peaklists = pseudo_peaklist.split(opts[:split])
|
103
|
+
|
104
|
+
normalize_factor = opts[:normalize] ? peaklists.size : 1
|
76
105
|
|
77
106
|
return_data = []
|
78
|
-
final_peaklist = []
|
79
|
-
peak_lists.each_with_index do |peak_list,i|
|
80
|
-
#peaks.each do |peak|
|
81
|
-
tot_intensity = peak_list.map(&:last).reduce(:+)
|
82
|
-
return_data_per_peak = [] if opts[:return_data]
|
83
|
-
weighted_mz = 0.0
|
84
|
-
peak_list.each do |peak|
|
85
|
-
pre_scaled_intensity = peak[0].data.reduce(0.0) {|sum,v| sum + v.last }
|
86
|
-
post_scaled_intensity = peak[1]
|
87
|
-
# some peaks may have been shared. In this case the intensity
|
88
|
-
# for that peak was downweighted. However, the actual data
|
89
|
-
# composing that peak is not altered when the intensity is
|
90
|
-
# shared. So, to calculate a proper weighted avg we need to
|
91
|
-
# downweight the intensity of any data point found within a bin
|
92
|
-
# whose intensity was scaled.
|
93
|
-
correction_factor =
|
94
|
-
if pre_scaled_intensity != post_scaled_intensity
|
95
|
-
post_scaled_intensity / pre_scaled_intensity
|
96
|
-
else
|
97
|
-
1.0
|
98
|
-
end
|
107
|
+
final_peaklist = Mspire::PeakList.new unless opts[:only_data]
|
99
108
|
|
100
|
-
|
109
|
+
separate_peaklists.each do |pseudo_peaklist|
|
110
|
+
data_peaklist = Mspire::PeakList.new
|
111
|
+
weight_x = 0.0
|
112
|
+
tot_intensity = pseudo_peaklist.inject(0.0) {|sum, bin_peak| sum + bin_peak.y }
|
113
|
+
#puts "TOT INTENSITY:"
|
114
|
+
#p tot_intensity
|
115
|
+
calc_from_lil_bins = pseudo_peaklist.inject(0.0) {|sum, bin_peak| sum + bin_peak.x.data.map(&:y).reduce(:+) }
|
116
|
+
#puts "LILBINS: "
|
117
|
+
#p calc_from_lil_bins
|
118
|
+
pseudo_peaklist.each do |bin_peak|
|
101
119
|
|
102
|
-
|
103
|
-
|
120
|
+
# For the :share method, the pseudo_peak intensity may have been
|
121
|
+
# adjusted, but the individual peaks were not. Correct this.
|
122
|
+
if opts[:split] == :share
|
123
|
+
post_scaled_y = bin_peak.y
|
124
|
+
pre_scaled_y = bin_peak.x.data.reduce(0.0) {|sum,peak| sum + peak.last }
|
125
|
+
#puts "PRESCALED Y:"
|
126
|
+
#p pre_scaled_y
|
127
|
+
if (post_scaled_y - pre_scaled_y).abs.round(10) != 0.0
|
128
|
+
correction = post_scaled_y / pre_scaled_y
|
129
|
+
bin_peak.x.data.each {|peak| peak.y = (peak.y * correction) }
|
130
|
+
end
|
104
131
|
end
|
132
|
+
|
133
|
+
unless opts[:only_data]
|
134
|
+
bin_peak.x.data.each do |peak|
|
135
|
+
weight_x += peak.x * ( peak.y.to_f / tot_intensity)
|
136
|
+
end
|
137
|
+
end
|
138
|
+
(data_peaklist.push( *bin_peak.x.data )) if opts[:return_data]
|
105
139
|
end
|
106
|
-
|
107
|
-
|
140
|
+
final_peaklist << Mspire::Peak.new([weight_x, tot_intensity / normalize_factor]) unless opts[:only_data]
|
141
|
+
return_data << data_peaklist if opts[:return_data]
|
108
142
|
end
|
109
143
|
[final_peaklist, return_data]
|
110
144
|
end
|
@@ -116,13 +150,13 @@ module Mspire
|
|
116
150
|
# first):
|
117
151
|
#
|
118
152
|
# :bin_width => 5
|
119
|
-
# :bin_unit => :ppm
|
153
|
+
# :bin_unit => :ppm|:amu interpret bin_width as ppm or amu
|
120
154
|
# :bins => array of Mspire::Bin objects for custom bins (overrides other bin options)
|
121
155
|
# :normalize => true if true, divides total intensity by
|
122
156
|
# number of spectra
|
123
157
|
# :return_data => false returns a parallel array containing
|
124
158
|
# the peaks associated with each returned peak
|
125
|
-
# :split =>
|
159
|
+
# :split => :zero|:greedy_y|:share see Mspire::Peak#split
|
126
160
|
# :centroided => true treat the data as centroided
|
127
161
|
#
|
128
162
|
# The binning algorithm is roughly the fastest possible algorithm that
|
@@ -130,131 +164,167 @@ module Mspire
|
|
130
164
|
# algorithm O(n + m))
|
131
165
|
#
|
132
166
|
# Assumes the peaklists are already sorted by m/z.
|
167
|
+
#
|
168
|
+
# Note that the peaks themselves will be altered if using the :share
|
169
|
+
# split method.
|
133
170
|
def merge(peaklists, opts={})
|
134
171
|
opts = DEFAULT_MERGE.merge(opts)
|
135
172
|
|
136
173
|
(peaklist, returned_data) =
|
137
174
|
if opts[:centroided]
|
138
|
-
merge_centroids(peaklists, opts)
|
175
|
+
merge_centroids(peaklists, opts.dup)
|
139
176
|
else
|
140
177
|
raise NotImplementedError, "need to implement profile merging"
|
141
178
|
end
|
142
179
|
|
143
|
-
if opts[:
|
144
|
-
|
145
|
-
|
146
|
-
end
|
147
|
-
if opts[:return_data]
|
148
|
-
$stderr.puts "returning peaklist (#{peaklist.size}) and data" if $VERBOSE
|
180
|
+
if opts[:only_data]
|
181
|
+
returned_data
|
182
|
+
elsif opts[:return_data]
|
149
183
|
[peaklist, returned_data]
|
150
184
|
else
|
151
|
-
|
152
|
-
peaklist
|
185
|
+
peaklist
|
153
186
|
end
|
154
187
|
end
|
188
|
+
end # end class << self
|
189
|
+
|
190
|
+
|
191
|
+
# Returns an array of index arrays, one per run of consecutive peaks whose
# y is greater than +gt+. Within each index array the first index is the
# start of the run and the last index is its end; interior indices are
# local minima (a point lower than the point before it and followed by a
# higher point). A run with only two indices therefore has no local
# minima; a run consisting of a single point yields [i, i].
#
# Returns an empty array when the list is empty or no y exceeds +gt+.
def peak_boundaries(gt=0.0)
  in_peak = false
  prev_y = gt
  prev_prev_y = gt
  peak_inds = []
  each_with_index do |peak, index|
    curr_y = peak.y
    if curr_y > gt
      if !in_peak
        # first point above the threshold opens a new run
        in_peak = true
        peak_inds << [index]
      elsif prev_y < curr_y && prev_prev_y > prev_y
        # we were on a downslope and are now on an upslope, so the
        # previous index is a local min
        peak_inds.last << (index - 1)
      end
    elsif in_peak
      # dropped to (or below) the threshold: the previous point ended the run
      peak_inds.last << (index - 1)
      in_peak = false
    end
    prev_prev_y = prev_y
    prev_y = curr_y
  end
  # a run still open after the final point ends at the last index
  # (checking in_peak rather than self[-1] keeps an empty list from failing)
  peak_inds.last << (size - 1) if in_peak
  peak_inds
end
|
156
231
|
|
232
|
+
# returns an array of new lists (self.class.new), one per index list in given_peak_boundaries (sliced first..last); when that is nil, peak_boundaries(0.0) is used
|
233
|
+
def split_on_zeros(given_peak_boundaries=nil)
|
234
|
+
pk_bounds = given_peak_boundaries || peak_boundaries(0.0)
|
235
|
+
pk_bounds.map do |indices|
|
236
|
+
self.class.new self[indices.first..indices.last]
|
237
|
+
end
|
238
|
+
end
|
239
|
+
|
240
|
+
# Returns an array of peak lists (instances of self.class), splitting this
# list at each local minimum.
#
# Assumes that this is one connected list of peaks (i.e., no
# zeros/whitespace on the edges or internally):
#
#        /\
#       /  \/\
#      /      \
#
# methd decides which side gets each local-minimum point:
#
#     :greedy_y = the point goes to the side whose neighboring point has
#                 the higher y (tie goes to the lower/left side)
#     :share    = the point is duplicated onto both sides, its y divided in
#                 proportion to the y of the two neighboring points (new
#                 peak objects are built with first.class.new([x, y]))
#
# local_min_indices are indices into self; when nil they are taken from the
# interior indices of the first run reported by peak_boundaries.
#
# If there are no local minima, just returns self inside an array.
# Raises ArgumentError if a split is needed and methd is not recognized.
def split_contiguous(methd=:greedy_y, local_min_indices=nil)
  unless local_min_indices
    # boundaries look like [start, *local_minima, stop]; keep the interior
    boundaries = peak_boundaries.first
    local_min_indices = boundaries ? boundaries[1...-1] : []
  end
  return [self] if local_min_indices.empty?

  peak_class = first.class
  # index 0 is seeded into the first list below, so copying starts at index 1
  prev_lm_i = 0
  peak_lists = [ self.class.new([self[0]]) ]
  local_min_indices.each do |lm_i|
    # everything after the previous local min up to just before this one
    peak_lists.last.push( *self[(prev_lm_i+1)..(lm_i-1)] )
    lm = self[lm_i]
    before_y = self[lm_i-1].y
    after_y = self[lm_i+1].y
    case methd
    when :greedy_y
      if before_y >= after_y
        peak_lists.last << lm
        peak_lists << self.class.new
      else
        peak_lists << self.class.new( [lm] )
      end
    when :share
      sum = before_y + after_y
      # push onto the last peaklist its portion of the local min
      peak_lists.last << peak_class.new( [lm.x, lm.y * (before_y.to_f/sum)] )
      # start a new peaklist holding the remaining portion of the local min
      peak_lists << self.class.new( [peak_class.new([lm.x, lm.y * (after_y.to_f/sum)])] )
    else
      # without this the local-min point would be silently dropped
      raise ArgumentError, "only recognize :greedy_y or :share for the method in #split_contiguous"
    end
    prev_lm_i = lm_i
  end
  peak_lists.last.push( *self[(prev_lm_i+1)...size] )
  peak_lists
end
|
157
284
|
|
158
285
|
# returns an Array of peaklist objects. Splits run of 1 or more local
|
159
286
|
# minima into multiple peaklists. When a point is 'shared' between two
|
160
287
|
# adjacent hill-ish areas, the choice of how to resolve multi-hills (runs
|
161
288
|
# of data above zero) is one of:
|
162
289
|
#
|
163
|
-
#
|
164
|
-
# :share
|
290
|
+
# :zero = only split on zeros
|
291
|
+
# :share = give each peak its rightful portion of shared peaks, dividing the
|
165
292
|
# intensity based on the intensity of adjacent peaks
|
166
|
-
# :greedy_y
|
293
|
+
# :greedy_y = give the point to the peak with highest point next to
|
167
294
|
# the point in question. tie goes lower.
|
168
295
|
#
|
169
|
-
#
|
170
|
-
#
|
171
|
-
#
|
296
|
+
# Note that the peak surrounding a local_minima may be altered if using
|
297
|
+
# :share
|
298
|
+
#
|
172
299
|
# assumes that a new peak can be made with an array containing the x
|
173
300
|
# value and the y value.
|
174
|
-
def split(
|
175
|
-
if
|
176
|
-
|
177
|
-
$stderr.print "splitting on local minima ..." if $VERBOSE
|
178
|
-
no_local_minima_peaks = zeroed_peaks.zip(local_min_ind_ar).map do |peak, lm_indices|
|
179
|
-
new_peaks = [ peak.class.new ]
|
180
|
-
if lm_indices.size > 0
|
181
|
-
prev_lm_i = -1 # <- it's okay, we don't use until it is zero
|
182
|
-
lm_indices.each do |lm_i|
|
183
|
-
lm = peak[lm_i]
|
184
|
-
point_class = lm.class
|
185
|
-
|
186
|
-
# push onto the last peak all the points from right after the previous local min
|
187
|
-
# to just before this local min
|
188
|
-
new_peaks.last.push( *peak[(prev_lm_i+1)..(lm_i-1)] )
|
189
|
-
before_pnt = peak[lm_i-1]
|
190
|
-
after_pnt = peak[lm_i+1]
|
191
|
-
|
192
|
-
case split_multipeaks
|
193
|
-
when :share
|
194
|
-
sum = before_pnt[1] + after_pnt[1]
|
195
|
-
# push onto the last peak its portion of the local min
|
196
|
-
new_peaks.last << point_class.new( [lm[0], lm[1] * (before_pnt[1].to_f/sum)] )
|
197
|
-
# create a new peak that contains its portion of the local min
|
198
|
-
new_peaks << self.class.new( [point_class.new([lm[0], lm[1] * (after_pnt[1].to_f/sum)])] )
|
199
|
-
prev_lm_i = lm_i
|
200
|
-
when :greedy_y
|
201
|
-
if before_pnt[1] >= after_pnt[1]
|
202
|
-
new_peaks.last << lm
|
203
|
-
new_peaks << self.class.new
|
204
|
-
prev_lm_i = lm_i
|
205
|
-
else
|
206
|
-
new_peaks << self.class.new( [lm] )
|
207
|
-
prev_lm_i = lm_i
|
208
|
-
end
|
209
|
-
else
|
210
|
-
raise ArgumentError, "only recognize :share, :greedy_y, or false for the arg in #split(arg)"
|
211
|
-
end
|
212
|
-
end
|
213
|
-
new_peaks.last.push( *peak[(prev_lm_i+1)...peak.size] )
|
214
|
-
new_peaks
|
215
|
-
else
|
216
|
-
[peak]
|
217
|
-
end
|
218
|
-
end.flatten(1) # end zip
|
219
|
-
$stderr.puts "now #{no_local_minima_peaks.size} peaks." if $VERBOSE
|
220
|
-
no_local_minima_peaks
|
301
|
+
def split(split_multipeaks_mthd=:zero)
|
302
|
+
if split_multipeaks_mthd == :zero
|
303
|
+
split_on_zeros
|
221
304
|
else
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
peak_lists << self.class.new([peak])
|
235
|
-
local_min_ind_ar << []
|
236
|
-
else
|
237
|
-
peak_lists.last << peak
|
238
|
-
# if on_upslope(previous_y, point[1])
|
239
|
-
if previous_y < peak[1]
|
240
|
-
# If we were previously on a downslope and we are now on an upslope
|
241
|
-
# then the previous index is a local min
|
242
|
-
prev_previous_y = self[index - 2][1]
|
243
|
-
# on_downslope(prev_previous_y, previous_y)
|
244
|
-
if prev_previous_y > previous_y
|
245
|
-
# We have found a local min
|
246
|
-
local_min_ind_ar.last << (in_peak-1)
|
247
|
-
end
|
248
|
-
end # end if (upslope)
|
249
|
-
end # end if !in_peak
|
250
|
-
in_peak += 1
|
251
|
-
elsif in_peak
|
252
|
-
in_peak = false
|
253
|
-
end # end if point[1] > 0
|
305
|
+
boundaries = peak_boundaries(0.0)
|
306
|
+
no_lm_pklsts = []
|
307
|
+
boundaries.each do |indices|
|
308
|
+
peak = self[indices.first..indices.last]
|
309
|
+
if indices.size == 2
|
310
|
+
no_lm_pklsts << peak
|
311
|
+
else # have local minima
|
312
|
+
multipeak = PeakList.new(peak)
|
313
|
+
local_min_inds = indices[1..-2].map {|i| i-indices.first}
|
314
|
+
peaklists = multipeak.split_contiguous(split_multipeaks_mthd, local_min_inds)
|
315
|
+
no_lm_pklsts.push *peaklists
|
316
|
+
end
|
254
317
|
end
|
255
|
-
|
256
|
-
|
257
|
-
end
|
318
|
+
#$stderr.puts "now #{no_lm_pklsts.size} peaks." if $VERBOSE
|
319
|
+
no_lm_pklsts
|
320
|
+
end
|
258
321
|
end # def split
|
259
322
|
end
|
260
323
|
end
|
324
|
+
|
325
|
+
|
326
|
+
|
327
|
+
=begin
|
328
|
+
if !opts[:only_data]
|
329
|
+
=end
|
330
|
+
|