mspire 0.7.8 → 0.7.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +1 -0
- data/VERSION +1 -1
- data/lib/mspire/mzml.rb +5 -1
- data/lib/mspire/mzml/data_array.rb +8 -1
- data/lib/mspire/peak.rb +8 -0
- data/lib/mspire/peak_list.rb +206 -136
- data/script/mzml_read_binary.rb +1 -1
- data/script/quant_compare_direct_injections.rb +110 -0
- data/spec/mspire/peak_list_spec.rb +189 -44
- metadata +32 -71
- data/mspire.gemspec +0 -236
data/Rakefile
CHANGED
@@ -19,6 +19,7 @@ Jeweler::Tasks.new do |gem|
|
|
19
19
|
gem.add_dependency "builder", "~> 3.0.0"
|
20
20
|
gem.add_dependency "bio", "~> 1.4.2"
|
21
21
|
gem.add_dependency "trollop", "~> 1.16.2"
|
22
|
+
gem.add_dependency "uuid", ">= 2.3.5"
|
22
23
|
# this should be a real dependency, but need to document getting this
|
23
24
|
# working on windows first!
|
24
25
|
gem.add_development_dependency "fftw3", "~> 0.3"
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.7.8
|
1
|
+
0.7.9
|
data/lib/mspire/mzml.rb
CHANGED
@@ -150,7 +150,11 @@ module Mspire
|
|
150
150
|
case arg
|
151
151
|
when IO
|
152
152
|
@io = arg
|
153
|
-
|
153
|
+
begin
|
154
|
+
@encoding = @io.bookmark(true) {|io| io.readline.match(/encoding=["'](.*?)["']/)[1] }
|
155
|
+
rescue EOFError
|
156
|
+
raise RuntimeError, "no encoding present in XML! (Is this even an xml file?)"
|
157
|
+
end
|
154
158
|
@index_list = get_index_list
|
155
159
|
read_header!
|
156
160
|
when Hash
|
data/lib/mspire/mzml/data_array.rb
CHANGED
@@ -89,7 +89,14 @@ module Mspire
|
|
89
89
|
end
|
90
90
|
end
|
91
91
|
data = base64.unpack("m*").first
|
92
|
-
|
92
|
+
# some implementations leave data blank if there aren't peaks
|
93
|
+
# even if they say it is zlib compressed...
|
94
|
+
unzipped =
|
95
|
+
if data.size > 0
|
96
|
+
compressed ? Zlib::Inflate.inflate(data) : data
|
97
|
+
else
|
98
|
+
data
|
99
|
+
end
|
93
100
|
self.new( unzipped.unpack(precision_unpack) )
|
94
101
|
end
|
95
102
|
|
data/lib/mspire/peak.rb
CHANGED
data/lib/mspire/peak_list.rb
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
require 'mspire/bin'
|
2
2
|
|
3
3
|
module Mspire
|
4
|
-
# a collection of peak objects
|
4
|
+
# a collection of peak objects. At a minimum, each peak should respond to
|
5
|
+
# :x and :y
|
5
6
|
class PeakList < Array
|
6
7
|
|
7
8
|
def lo_x
|
@@ -21,22 +22,46 @@ module Mspire
|
|
21
22
|
:centroided => true,
|
22
23
|
}
|
23
24
|
|
25
|
+
# for spectral peaks, this is the weighted m/z
|
26
|
+
def weighted_x
|
27
|
+
tot_intensity = self.inject(0.0) {|sum,peak| sum + peak.y }
|
28
|
+
_weighted_x = 0.0
|
29
|
+
self.each do |peak|
|
30
|
+
int = peak.y
|
31
|
+
signal_by_sample_index[peak.sample_id] += int
|
32
|
+
_weighted_x += (peak.first * (int/tot_intensity))
|
33
|
+
end
|
34
|
+
_weighted_x
|
35
|
+
end
|
36
|
+
|
37
|
+
# class methods
|
24
38
|
class << self
|
25
39
|
|
40
|
+
# creates a new Mspire::PeakList and coerces each peak into an
|
41
|
+
# Mspire::Peak. If your peaks already behave like peaks you should use
|
42
|
+
# .new
|
43
|
+
def [](*peaks)
|
44
|
+
self.new( peaks.map {|peak| Mspire::Peak.new(peak) } )
|
45
|
+
end
|
46
|
+
|
26
47
|
def create_bins(peaklists, opts)
|
27
|
-
min, max =
|
48
|
+
min, max = min_max_x(peaklists)
|
28
49
|
|
29
50
|
divisions = []
|
30
51
|
bin_width = opts[:bin_width]
|
31
52
|
use_ppm = (opts[:bin_unit] == :ppm)
|
32
|
-
|
53
|
+
|
54
|
+
puts "using bin width: #{bin_width}" if $VERBOSE
|
55
|
+
puts "using ppm for bins: #{use_ppm}" if $VERBOSE
|
56
|
+
|
57
|
+
current_x = min
|
33
58
|
loop do
|
34
|
-
if
|
59
|
+
if current_x >= max
|
35
60
|
divisions << max
|
36
61
|
break
|
37
62
|
else
|
38
|
-
divisions <<
|
39
|
-
|
63
|
+
divisions << current_x
|
64
|
+
current_x += ( use_ppm ? current_x./(1e6).*(bin_width) : bin_width )
|
40
65
|
end
|
41
66
|
end
|
42
67
|
# make each bin exclusive so there is no overlap
|
@@ -46,10 +71,10 @@ module Mspire
|
|
46
71
|
bins
|
47
72
|
end
|
48
73
|
|
49
|
-
def
|
74
|
+
def min_max_x(peaklists)
|
50
75
|
# find the min and max across all spectra
|
51
76
|
first_peaklist = peaklists.first
|
52
|
-
min = first_peaklist.first
|
77
|
+
min = first_peaklist.first.x; max = first_peaklist.last.x
|
53
78
|
peaklists.each do |peaklist|
|
54
79
|
min = peaklist.lo_x if peaklist.lo_x < min
|
55
80
|
max = peaklist.hi_x if peaklist.hi_x > max
|
@@ -58,53 +83,62 @@ module Mspire
|
|
58
83
|
end
|
59
84
|
|
60
85
|
def merge_centroids(peaklists, opts={})
|
86
|
+
opts[:return_data] = true if opts[:only_data]
|
61
87
|
|
62
88
|
# Create Mspire::Bin objects
|
63
89
|
bins = opts[:bins] ? opts[:bins] : create_bins(peaklists, opts)
|
90
|
+
puts "created #{bins.size} bins" if $VERBOSE
|
64
91
|
|
65
92
|
peaklists.each do |peaklist|
|
66
|
-
Mspire::Bin.bin(bins, peaklist, &:
|
93
|
+
Mspire::Bin.bin(bins, peaklist, &:x)
|
67
94
|
end
|
68
95
|
|
69
96
|
pseudo_peaks = bins.map do |bin|
|
70
|
-
[bin, bin.data.reduce(0.0) {|sum,peak| sum + peak
|
97
|
+
Mspire::Peak.new( [bin, bin.data.reduce(0.0) {|sum,peak| sum + peak.y }] )
|
71
98
|
end
|
72
99
|
|
73
100
|
pseudo_peaklist = Mspire::PeakList.new(pseudo_peaks)
|
74
101
|
|
75
|
-
|
102
|
+
separate_peaklists = pseudo_peaklist.split(opts[:split])
|
103
|
+
|
104
|
+
normalize_factor = opts[:normalize] ? peaklists.size : 1
|
76
105
|
|
77
106
|
return_data = []
|
78
|
-
final_peaklist = []
|
79
|
-
peak_lists.each_with_index do |peak_list,i|
|
80
|
-
#peaks.each do |peak|
|
81
|
-
tot_intensity = peak_list.map(&:last).reduce(:+)
|
82
|
-
return_data_per_peak = [] if opts[:return_data]
|
83
|
-
weighted_mz = 0.0
|
84
|
-
peak_list.each do |peak|
|
85
|
-
pre_scaled_intensity = peak[0].data.reduce(0.0) {|sum,v| sum + v.last }
|
86
|
-
post_scaled_intensity = peak[1]
|
87
|
-
# some peaks may have been shared. In this case the intensity
|
88
|
-
# for that peak was downweighted. However, the actual data
|
89
|
-
# composing that peak is not altered when the intensity is
|
90
|
-
# shared. So, to calculate a proper weighted avg we need to
|
91
|
-
# downweight the intensity of any data point found within a bin
|
92
|
-
# whose intensity was scaled.
|
93
|
-
correction_factor =
|
94
|
-
if pre_scaled_intensity != post_scaled_intensity
|
95
|
-
post_scaled_intensity / pre_scaled_intensity
|
96
|
-
else
|
97
|
-
1.0
|
98
|
-
end
|
107
|
+
final_peaklist = Mspire::PeakList.new unless opts[:only_data]
|
99
108
|
|
100
|
-
|
109
|
+
separate_peaklists.each do |pseudo_peaklist|
|
110
|
+
data_peaklist = Mspire::PeakList.new
|
111
|
+
weight_x = 0.0
|
112
|
+
tot_intensity = pseudo_peaklist.inject(0.0) {|sum, bin_peak| sum + bin_peak.y }
|
113
|
+
#puts "TOT INTENSITY:"
|
114
|
+
#p tot_intensity
|
115
|
+
calc_from_lil_bins = pseudo_peaklist.inject(0.0) {|sum, bin_peak| sum + bin_peak.x.data.map(&:y).reduce(:+) }
|
116
|
+
#puts "LILBINS: "
|
117
|
+
#p calc_from_lil_bins
|
118
|
+
pseudo_peaklist.each do |bin_peak|
|
101
119
|
|
102
|
-
|
103
|
-
|
120
|
+
# For the :share method, the pseudo_peak intensity may have been
|
121
|
+
# adjusted, but the individual peaks were not. Correct this.
|
122
|
+
if opts[:split] == :share
|
123
|
+
post_scaled_y = bin_peak.y
|
124
|
+
pre_scaled_y = bin_peak.x.data.reduce(0.0) {|sum,peak| sum + peak.last }
|
125
|
+
#puts "PRESCALED Y:"
|
126
|
+
#p pre_scaled_y
|
127
|
+
if (post_scaled_y - pre_scaled_y).abs.round(10) != 0.0
|
128
|
+
correction = post_scaled_y / pre_scaled_y
|
129
|
+
bin_peak.x.data.each {|peak| peak.y = (peak.y * correction) }
|
130
|
+
end
|
104
131
|
end
|
132
|
+
|
133
|
+
unless opts[:only_data]
|
134
|
+
bin_peak.x.data.each do |peak|
|
135
|
+
weight_x += peak.x * ( peak.y.to_f / tot_intensity)
|
136
|
+
end
|
137
|
+
end
|
138
|
+
(data_peaklist.push( *bin_peak.x.data )) if opts[:return_data]
|
105
139
|
end
|
106
|
-
|
107
|
-
|
140
|
+
final_peaklist << Mspire::Peak.new([weight_x, tot_intensity / normalize_factor]) unless opts[:only_data]
|
141
|
+
return_data << data_peaklist if opts[:return_data]
|
108
142
|
end
|
109
143
|
[final_peaklist, return_data]
|
110
144
|
end
|
@@ -116,13 +150,13 @@ module Mspire
|
|
116
150
|
# first):
|
117
151
|
#
|
118
152
|
# :bin_width => 5
|
119
|
-
# :bin_unit => :ppm
|
153
|
+
# :bin_unit => :ppm|:amu interpret bin_width as ppm or amu
|
120
154
|
# :bins => array of Mspire::Bin objects for custom bins (overrides other bin options)
|
121
155
|
# :normalize => true if true, divides total intensity by
|
122
156
|
# number of spectra
|
123
157
|
# :return_data => false returns a parallel array containing
|
124
158
|
# the peaks associated with each returned peak
|
125
|
-
# :split =>
|
159
|
+
# :split => :zero|:greedy_y|:share see Mspire::Peak#split
|
126
160
|
# :centroided => true treat the data as centroided
|
127
161
|
#
|
128
162
|
# The binning algorithm is roughly the fastest possible algorithm that
|
@@ -130,131 +164,167 @@ module Mspire
|
|
130
164
|
# algorithm O(n + m))
|
131
165
|
#
|
132
166
|
# Assumes the peaklists are already sorted by m/z.
|
167
|
+
#
|
168
|
+
# Note that the peaks themselves will be altered if using the :share
|
169
|
+
# split method.
|
133
170
|
def merge(peaklists, opts={})
|
134
171
|
opts = DEFAULT_MERGE.merge(opts)
|
135
172
|
|
136
173
|
(peaklist, returned_data) =
|
137
174
|
if opts[:centroided]
|
138
|
-
merge_centroids(peaklists, opts)
|
175
|
+
merge_centroids(peaklists, opts.dup)
|
139
176
|
else
|
140
177
|
raise NotImplementedError, "need to implement profile merging"
|
141
178
|
end
|
142
179
|
|
143
|
-
if opts[:
|
144
|
-
|
145
|
-
|
146
|
-
end
|
147
|
-
if opts[:return_data]
|
148
|
-
$stderr.puts "returning peaklist (#{peaklist.size}) and data" if $VERBOSE
|
180
|
+
if opts[:only_data]
|
181
|
+
returned_data
|
182
|
+
elsif opts[:return_data]
|
149
183
|
[peaklist, returned_data]
|
150
184
|
else
|
151
|
-
|
152
|
-
peaklist
|
185
|
+
peaklist
|
153
186
|
end
|
154
187
|
end
|
188
|
+
end # end class << self
|
189
|
+
|
190
|
+
|
191
|
+
# returns an array with the indices outlining each peak. The first index
|
192
|
+
# is the start of the peak, the last index is the last of the peak.
|
193
|
+
# Interior indices represent local minima. So, peaks that have only two
|
194
|
+
# indices have no local minima.
|
195
|
+
def peak_boundaries(gt=0.0)
|
196
|
+
in_peak = false
|
197
|
+
prev_y = gt
|
198
|
+
prev_prev_y = gt
|
199
|
+
peak_inds = []
|
200
|
+
self.each_with_index do |peak, index|
|
201
|
+
curr_y = peak.y
|
202
|
+
if curr_y > gt
|
203
|
+
if !in_peak
|
204
|
+
in_peak = true
|
205
|
+
peak_inds << [index]
|
206
|
+
else
|
207
|
+
# if on_upslope
|
208
|
+
if prev_y < curr_y
|
209
|
+
# If we were previously on a downslope and we are now on an upslope
|
210
|
+
# then the previous index is a local min
|
211
|
+
# on_downslope(prev_previous_y, prev_y)
|
212
|
+
if prev_prev_y > prev_y
|
213
|
+
# We have found a local min
|
214
|
+
peak_inds.last << (index - 1)
|
215
|
+
end
|
216
|
+
end # end if (upslope)
|
217
|
+
end # end if !in_peak
|
218
|
+
elsif in_peak
|
219
|
+
peak_inds.last << (index - 1)
|
220
|
+
in_peak = false
|
221
|
+
end
|
222
|
+
prev_prev_y = prev_y
|
223
|
+
prev_y = curr_y
|
224
|
+
end
|
225
|
+
# add the last one to the last peak if it is a boundary
|
226
|
+
if self[-1].y > gt
|
227
|
+
peak_inds.last << (self.size-1)
|
228
|
+
end
|
229
|
+
peak_inds
|
155
230
|
end
|
156
231
|
|
232
|
+
# returns an array of PeakList objects
|
233
|
+
def split_on_zeros(given_peak_boundaries=nil)
|
234
|
+
pk_bounds = given_peak_boundaries || peak_boundaries(0.0)
|
235
|
+
pk_bounds.map do |indices|
|
236
|
+
self.class.new self[indices.first..indices.last]
|
237
|
+
end
|
238
|
+
end
|
239
|
+
|
240
|
+
# returns an array of PeakList objects
|
241
|
+
# assumes that this is one connected list of peaks (i.e., no
|
242
|
+
# zeros/whitespace on the edges or internally)
|
243
|
+
#
|
244
|
+
# /\
|
245
|
+
# / \/\
|
246
|
+
# / \
|
247
|
+
#
|
248
|
+
# if there are no local minima, returns self as-is (NOTE: not wrapped in an array)
|
249
|
+
def split_contiguous(methd=:greedy_y, local_min_indices=nil)
|
250
|
+
local_min_indices ||= ((pb=peak_boundaries.first) && pb.shift && pb.pop && pb)
|
251
|
+
|
252
|
+
if local_min_indices.size == 0
|
253
|
+
self
|
254
|
+
else
|
255
|
+
peak_class = first.class
|
256
|
+
prev_lm_i = 0 # <- don't worry, will be set to bumped to zero
|
257
|
+
peak_lists = [ self.class.new([self[0]]) ]
|
258
|
+
local_min_indices.each do |lm_i|
|
259
|
+
peak_lists.last.push( *self[(prev_lm_i+1)..(lm_i-1)] )
|
260
|
+
case methd
|
261
|
+
when :greedy_y
|
262
|
+
if self[lm_i-1].y >= self[lm_i+1].y
|
263
|
+
peak_lists.last << self[lm_i]
|
264
|
+
peak_lists << self.class.new
|
265
|
+
else
|
266
|
+
peak_lists << self.class.new( [self[lm_i]] )
|
267
|
+
end
|
268
|
+
when :share
|
269
|
+
# for each local min, two new peaks will be created, with
|
270
|
+
# intensity shared between adjacent peak_lists
|
271
|
+
lm = self[lm_i]
|
272
|
+
sum = self[lm_i-1].y + self[lm_i+1].y
|
273
|
+
# push onto the last peaklist its portion of the local min
|
274
|
+
peak_lists.last << peak_class.new( [lm.x, lm.y * (self[lm_i-1].y.to_f/sum)] )
|
275
|
+
# create a new peaklist that contains its portion of the local min
|
276
|
+
peak_lists << self.class.new( [peak_class.new([lm.x, lm.y * (self[lm_i+1].y.to_f/sum)])] )
|
277
|
+
end
|
278
|
+
prev_lm_i = lm_i
|
279
|
+
end
|
280
|
+
peak_lists.last.push(*self[(prev_lm_i+1)...(self.size)] )
|
281
|
+
peak_lists
|
282
|
+
end
|
283
|
+
end
|
157
284
|
|
158
285
|
# returns an Array of peaklist objects. Splits run of 1 or more local
|
159
286
|
# minima into multiple peaklists. When a point is 'shared' between two
|
160
287
|
# adjacent hill-ish areas, the choice of how to resolve multi-hills (runs
|
161
288
|
# of data above zero) is one of:
|
162
289
|
#
|
163
|
-
#
|
164
|
-
# :share
|
290
|
+
# :zero = only split on zeros
|
291
|
+
# :share = give each peak its rightful portion of shared peaks, dividing the
|
165
292
|
# intensity based on the intensity of adjacent peaks
|
166
|
-
# :greedy_y
|
293
|
+
# :greedy_y = give the point to the peak with highest point next to
|
167
294
|
# the point in question. tie goes lower.
|
168
295
|
#
|
169
|
-
#
|
170
|
-
#
|
171
|
-
#
|
296
|
+
# Note that the peaks surrounding a local minimum may be altered if using
|
297
|
+
# :share
|
298
|
+
#
|
172
299
|
# assumes that a new peak can be made with an array containing the x
|
173
300
|
# value and the y value.
|
174
|
-
def split(
|
175
|
-
if
|
176
|
-
|
177
|
-
$stderr.print "splitting on local minima ..." if $VERBOSE
|
178
|
-
no_local_minima_peaks = zeroed_peaks.zip(local_min_ind_ar).map do |peak, lm_indices|
|
179
|
-
new_peaks = [ peak.class.new ]
|
180
|
-
if lm_indices.size > 0
|
181
|
-
prev_lm_i = -1 # <- it's okay, we don't use until it is zero
|
182
|
-
lm_indices.each do |lm_i|
|
183
|
-
lm = peak[lm_i]
|
184
|
-
point_class = lm.class
|
185
|
-
|
186
|
-
# push onto the last peak all the points from right after the previous local min
|
187
|
-
# to just before this local min
|
188
|
-
new_peaks.last.push( *peak[(prev_lm_i+1)..(lm_i-1)] )
|
189
|
-
before_pnt = peak[lm_i-1]
|
190
|
-
after_pnt = peak[lm_i+1]
|
191
|
-
|
192
|
-
case split_multipeaks
|
193
|
-
when :share
|
194
|
-
sum = before_pnt[1] + after_pnt[1]
|
195
|
-
# push onto the last peak its portion of the local min
|
196
|
-
new_peaks.last << point_class.new( [lm[0], lm[1] * (before_pnt[1].to_f/sum)] )
|
197
|
-
# create a new peak that contains its portion of the local min
|
198
|
-
new_peaks << self.class.new( [point_class.new([lm[0], lm[1] * (after_pnt[1].to_f/sum)])] )
|
199
|
-
prev_lm_i = lm_i
|
200
|
-
when :greedy_y
|
201
|
-
if before_pnt[1] >= after_pnt[1]
|
202
|
-
new_peaks.last << lm
|
203
|
-
new_peaks << self.class.new
|
204
|
-
prev_lm_i = lm_i
|
205
|
-
else
|
206
|
-
new_peaks << self.class.new( [lm] )
|
207
|
-
prev_lm_i = lm_i
|
208
|
-
end
|
209
|
-
else
|
210
|
-
raise ArgumentError, "only recognize :share, :greedy_y, or false for the arg in #split(arg)"
|
211
|
-
end
|
212
|
-
end
|
213
|
-
new_peaks.last.push( *peak[(prev_lm_i+1)...peak.size] )
|
214
|
-
new_peaks
|
215
|
-
else
|
216
|
-
[peak]
|
217
|
-
end
|
218
|
-
end.flatten(1) # end zip
|
219
|
-
$stderr.puts "now #{no_local_minima_peaks.size} peaks." if $VERBOSE
|
220
|
-
no_local_minima_peaks
|
301
|
+
def split(split_multipeaks_mthd=:zero)
|
302
|
+
if split_multipeaks_mthd == :zero
|
303
|
+
split_on_zeros
|
221
304
|
else
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
peak_lists << self.class.new([peak])
|
235
|
-
local_min_ind_ar << []
|
236
|
-
else
|
237
|
-
peak_lists.last << peak
|
238
|
-
# if on_upslope(previous_y, point[1])
|
239
|
-
if previous_y < peak[1]
|
240
|
-
# If we were previously on a downslope and we are now on an upslope
|
241
|
-
# then the previous index is a local min
|
242
|
-
prev_previous_y = self[index - 2][1]
|
243
|
-
# on_downslope(prev_previous_y, previous_y)
|
244
|
-
if prev_previous_y > previous_y
|
245
|
-
# We have found a local min
|
246
|
-
local_min_ind_ar.last << (in_peak-1)
|
247
|
-
end
|
248
|
-
end # end if (upslope)
|
249
|
-
end # end if !in_peak
|
250
|
-
in_peak += 1
|
251
|
-
elsif in_peak
|
252
|
-
in_peak = false
|
253
|
-
end # end if point[1] > 0
|
305
|
+
boundaries = peak_boundaries(0.0)
|
306
|
+
no_lm_pklsts = []
|
307
|
+
boundaries.each do |indices|
|
308
|
+
peak = self[indices.first..indices.last]
|
309
|
+
if indices.size == 2
|
310
|
+
no_lm_pklsts << peak
|
311
|
+
else # have local minima
|
312
|
+
multipeak = PeakList.new(peak)
|
313
|
+
local_min_inds = indices[1..-2].map {|i| i-indices.first}
|
314
|
+
peaklists = multipeak.split_contiguous(split_multipeaks_mthd, local_min_inds)
|
315
|
+
no_lm_pklsts.push *peaklists
|
316
|
+
end
|
254
317
|
end
|
255
|
-
|
256
|
-
|
257
|
-
end
|
318
|
+
#$stderr.puts "now #{no_lm_pklsts.size} peaks." if $VERBOSE
|
319
|
+
no_lm_pklsts
|
320
|
+
end
|
258
321
|
end # def split
|
259
322
|
end
|
260
323
|
end
|
324
|
+
|
325
|
+
|
326
|
+
|
327
|
+
=begin
|
328
|
+
if !opts[:only_data]
|
329
|
+
=end
|
330
|
+
|