mspire 0.7.8 → 0.7.9

Sign up to get free protection for your applications and to get access to all the features.
data/Rakefile CHANGED
@@ -19,6 +19,7 @@ Jeweler::Tasks.new do |gem|
19
19
  gem.add_dependency "builder", "~> 3.0.0"
20
20
  gem.add_dependency "bio", "~> 1.4.2"
21
21
  gem.add_dependency "trollop", "~> 1.16.2"
22
+ gem.add_dependency "uuid", ">= 2.3.5"
22
23
  # this should be a real dependency, but need to document getting this
23
24
  # working on windows first!
24
25
  gem.add_development_dependency "fftw3", "~> 0.3"
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.7.8
1
+ 0.7.9
@@ -150,7 +150,11 @@ module Mspire
150
150
  case arg
151
151
  when IO
152
152
  @io = arg
153
- @encoding = @io.bookmark(true) {|io| io.readline.match(/encoding=["'](.*?)["']/)[1] }
153
+ begin
154
+ @encoding = @io.bookmark(true) {|io| io.readline.match(/encoding=["'](.*?)["']/)[1] }
155
+ rescue EOFError
156
+ raise RuntimeError, "no encoding present in XML! (Is this even an xml file?)"
157
+ end
154
158
  @index_list = get_index_list
155
159
  read_header!
156
160
  when Hash
@@ -89,7 +89,14 @@ module Mspire
89
89
  end
90
90
  end
91
91
  data = base64.unpack("m*").first
92
- unzipped = compressed ? Zlib::Inflate.inflate(data) : data
92
+ # some implementations leave data blank if there aren't peaks
93
+ # even if they say it is zlib compressed...
94
+ unzipped =
95
+ if data.size > 0
96
+ compressed ? Zlib::Inflate.inflate(data) : data
97
+ else
98
+ data
99
+ end
93
100
  self.new( unzipped.unpack(precision_unpack) )
94
101
  end
95
102
 
@@ -7,5 +7,13 @@ module Mspire
7
7
  class Peak < Array
8
8
  alias_method :x, :first
9
9
  alias_method :y, :last
10
+
11
+ def x=(val)
12
+ self[0] = val
13
+ end
14
+
15
+ def y=(val)
16
+ self[1] = val
17
+ end
10
18
  end
11
19
  end
@@ -1,7 +1,8 @@
1
1
  require 'mspire/bin'
2
2
 
3
3
  module Mspire
4
- # a collection of peak objects
4
+ # a collection of peak objects. At a minimum, each peak should respond to
5
+ # :x and :y
5
6
  class PeakList < Array
6
7
 
7
8
  def lo_x
@@ -21,22 +22,46 @@ module Mspire
21
22
  :centroided => true,
22
23
  }
23
24
 
25
+ # for spectral peaks, this is the weighted m/z
26
+ def weighted_x
27
+ tot_intensity = self.inject(0.0) {|sum,peak| sum + peak.y }
28
+ _weighted_x = 0.0
29
+ self.each do |peak|
30
+ int = peak.y
31
+ signal_by_sample_index[peak.sample_id] += int
32
+ _weighted_x += (peak.first * (int/tot_intensity))
33
+ end
34
+ _weighted_x
35
+ end
36
+
37
+ # class methods
24
38
  class << self
25
39
 
40
+ # creates a new Mspire::PeakList and coerces each peak into an
41
+ # Mspire::Peak. If your peaks already behave like peaks you should use
42
+ # .new
43
+ def [](*peaks)
44
+ self.new( peaks.map {|peak| Mspire::Peak.new(peak) } )
45
+ end
46
+
26
47
  def create_bins(peaklists, opts)
27
- min, max = min_max_mz(peaklists)
48
+ min, max = min_max_x(peaklists)
28
49
 
29
50
  divisions = []
30
51
  bin_width = opts[:bin_width]
31
52
  use_ppm = (opts[:bin_unit] == :ppm)
32
- current_mz = min
53
+
54
+ puts "using bin width: #{bin_width}" if $VERBOSE
55
+ puts "using ppm for bins: #{use_ppm}" if $VERBOSE
56
+
57
+ current_x = min
33
58
  loop do
34
- if current_mz >= max
59
+ if current_x >= max
35
60
  divisions << max
36
61
  break
37
62
  else
38
- divisions << current_mz
39
- current_mz += ( use_ppm ? current_mz./(1e6).*(bin_width) : bin_width )
63
+ divisions << current_x
64
+ current_x += ( use_ppm ? current_x./(1e6).*(bin_width) : bin_width )
40
65
  end
41
66
  end
42
67
  # make each bin exclusive so there is no overlap
@@ -46,10 +71,10 @@ module Mspire
46
71
  bins
47
72
  end
48
73
 
49
- def min_max_mz(peaklists)
74
+ def min_max_x(peaklists)
50
75
  # find the min and max across all spectra
51
76
  first_peaklist = peaklists.first
52
- min = first_peaklist.first[0]; max = first_peaklist.last[0]
77
+ min = first_peaklist.first.x; max = first_peaklist.last.x
53
78
  peaklists.each do |peaklist|
54
79
  min = peaklist.lo_x if peaklist.lo_x < min
55
80
  max = peaklist.hi_x if peaklist.hi_x > max
@@ -58,53 +83,62 @@ module Mspire
58
83
  end
59
84
 
60
85
  def merge_centroids(peaklists, opts={})
86
+ opts[:return_data] = true if opts[:only_data]
61
87
 
62
88
  # Create Mspire::Bin objects
63
89
  bins = opts[:bins] ? opts[:bins] : create_bins(peaklists, opts)
90
+ puts "created #{bins.size} bins" if $VERBOSE
64
91
 
65
92
  peaklists.each do |peaklist|
66
- Mspire::Bin.bin(bins, peaklist, &:first)
93
+ Mspire::Bin.bin(bins, peaklist, &:x)
67
94
  end
68
95
 
69
96
  pseudo_peaks = bins.map do |bin|
70
- [bin, bin.data.reduce(0.0) {|sum,peak| sum + peak[1] }]
97
+ Mspire::Peak.new( [bin, bin.data.reduce(0.0) {|sum,peak| sum + peak.y }] )
71
98
  end
72
99
 
73
100
  pseudo_peaklist = Mspire::PeakList.new(pseudo_peaks)
74
101
 
75
- peak_lists = pseudo_peaklist.split(opts[:split])
102
+ separate_peaklists = pseudo_peaklist.split(opts[:split])
103
+
104
+ normalize_factor = opts[:normalize] ? peaklists.size : 1
76
105
 
77
106
  return_data = []
78
- final_peaklist = []
79
- peak_lists.each_with_index do |peak_list,i|
80
- #peaks.each do |peak|
81
- tot_intensity = peak_list.map(&:last).reduce(:+)
82
- return_data_per_peak = [] if opts[:return_data]
83
- weighted_mz = 0.0
84
- peak_list.each do |peak|
85
- pre_scaled_intensity = peak[0].data.reduce(0.0) {|sum,v| sum + v.last }
86
- post_scaled_intensity = peak[1]
87
- # some peaks may have been shared. In this case the intensity
88
- # for that peak was downweighted. However, the actual data
89
- # composing that peak is not altered when the intensity is
90
- # shared. So, to calculate a proper weighted avg we need to
91
- # downweight the intensity of any data point found within a bin
92
- # whose intensity was scaled.
93
- correction_factor =
94
- if pre_scaled_intensity != post_scaled_intensity
95
- post_scaled_intensity / pre_scaled_intensity
96
- else
97
- 1.0
98
- end
107
+ final_peaklist = Mspire::PeakList.new unless opts[:only_data]
99
108
 
100
- return_data_per_peak.push(*peak[0].data) if opts[:return_data]
109
+ separate_peaklists.each do |pseudo_peaklist|
110
+ data_peaklist = Mspire::PeakList.new
111
+ weight_x = 0.0
112
+ tot_intensity = pseudo_peaklist.inject(0.0) {|sum, bin_peak| sum + bin_peak.y }
113
+ #puts "TOT INTENSITY:"
114
+ #p tot_intensity
115
+ calc_from_lil_bins = pseudo_peaklist.inject(0.0) {|sum, bin_peak| sum + bin_peak.x.data.map(&:y).reduce(:+) }
116
+ #puts "LILBINS: "
117
+ #p calc_from_lil_bins
118
+ pseudo_peaklist.each do |bin_peak|
101
119
 
102
- peak[0].data.each do |lil_point|
103
- weighted_mz += lil_point[0] * ( (lil_point[1].to_f * correction_factor) / tot_intensity)
120
+ # For the :share method, the pseudo_peak intensity may have been
121
+ # adjusted, but the individual peaks were not. Correct this.
122
+ if opts[:split] == :share
123
+ post_scaled_y = bin_peak.y
124
+ pre_scaled_y = bin_peak.x.data.reduce(0.0) {|sum,peak| sum + peak.last }
125
+ #puts "PRESCALED Y:"
126
+ #p pre_scaled_y
127
+ if (post_scaled_y - pre_scaled_y).abs.round(10) != 0.0
128
+ correction = post_scaled_y / pre_scaled_y
129
+ bin_peak.x.data.each {|peak| peak.y = (peak.y * correction) }
130
+ end
104
131
  end
132
+
133
+ unless opts[:only_data]
134
+ bin_peak.x.data.each do |peak|
135
+ weight_x += peak.x * ( peak.y.to_f / tot_intensity)
136
+ end
137
+ end
138
+ (data_peaklist.push( *bin_peak.x.data )) if opts[:return_data]
105
139
  end
106
- return_data << return_data_per_peak if opts[:return_data]
107
- final_peaklist << Mspire::Peak.new([weighted_mz, tot_intensity])
140
+ final_peaklist << Mspire::Peak.new([weight_x, tot_intensity / normalize_factor]) unless opts[:only_data]
141
+ return_data << data_peaklist if opts[:return_data]
108
142
  end
109
143
  [final_peaklist, return_data]
110
144
  end
@@ -116,13 +150,13 @@ module Mspire
116
150
  # first):
117
151
  #
118
152
  # :bin_width => 5
119
- # :bin_unit => :ppm | :amu interpret bin_width as ppm or amu
153
+ # :bin_unit => :ppm|:amu interpret bin_width as ppm or amu
120
154
  # :bins => array of Mspire::Bin objects for custom bins (overrides other bin options)
121
155
  # :normalize => true if true, divides total intensity by
122
156
  # number of spectra
123
157
  # :return_data => false returns a parallel array containing
124
158
  # the peaks associated with each returned peak
125
- # :split => false | :share | :greedy_y see Mspire::Peak#split
159
+ # :split => :zero|:greedy_y|:share see Mspire::Peak#split
126
160
  # :centroided => true treat the data as centroided
127
161
  #
128
162
  # The binning algorithm is roughly the fastest possible algorithm that
@@ -130,131 +164,167 @@ module Mspire
130
164
  # algorithm O(n + m))
131
165
  #
132
166
  # Assumes the peaklists are already sorted by m/z.
167
+ #
168
+ # Note that the peaks themselves will be altered if using the :share
169
+ # split method.
133
170
  def merge(peaklists, opts={})
134
171
  opts = DEFAULT_MERGE.merge(opts)
135
172
 
136
173
  (peaklist, returned_data) =
137
174
  if opts[:centroided]
138
- merge_centroids(peaklists, opts)
175
+ merge_centroids(peaklists, opts.dup)
139
176
  else
140
177
  raise NotImplementedError, "need to implement profile merging"
141
178
  end
142
179
 
143
- if opts[:normalize]
144
- sz = peaklists.size
145
- peaklist.each {|peak| peak[1] = peak[1].to_f / sz }
146
- end
147
- if opts[:return_data]
148
- $stderr.puts "returning peaklist (#{peaklist.size}) and data" if $VERBOSE
180
+ if opts[:only_data]
181
+ returned_data
182
+ elsif opts[:return_data]
149
183
  [peaklist, returned_data]
150
184
  else
151
- $stderr.puts "returning peaklist (#{peaklist.size})" if $VERBOSE
152
- peaklist
185
+ peaklist
153
186
  end
154
187
  end
188
+ end # end class << self
189
+
190
+
191
+ # returns an array with the indices outlining each peak. The first index
192
+ # is the start of the peak, the last index is the last of the peak.
193
+ # Interior indices represent local minima. So, peaks that have only two
194
+ # indices have no local minima.
195
+ def peak_boundaries(gt=0.0)
196
+ in_peak = false
197
+ prev_y = gt
198
+ prev_prev_y = gt
199
+ peak_inds = []
200
+ self.each_with_index do |peak, index|
201
+ curr_y = peak.y
202
+ if curr_y > gt
203
+ if !in_peak
204
+ in_peak = true
205
+ peak_inds << [index]
206
+ else
207
+ # if on_upslope
208
+ if prev_y < curr_y
209
+ # If we were previously on a downslope and we are now on an upslope
210
+ # then the previous index is a local min
211
+ # on_downslope(prev_previous_y, prev_y)
212
+ if prev_prev_y > prev_y
213
+ # We have found a local min
214
+ peak_inds.last << (index - 1)
215
+ end
216
+ end # end if (upslope)
217
+ end # end if !in_peak
218
+ elsif in_peak
219
+ peak_inds.last << (index - 1)
220
+ in_peak = false
221
+ end
222
+ prev_prev_y = prev_y
223
+ prev_y = curr_y
224
+ end
225
+ # add the last one to the last peak if it is a boundary
226
+ if self[-1].y > gt
227
+ peak_inds.last << (self.size-1)
228
+ end
229
+ peak_inds
155
230
  end
156
231
 
232
+ # returns an array of PeakList objects
233
+ def split_on_zeros(given_peak_boundaries=nil)
234
+ pk_bounds = given_peak_boundaries || peak_boundaries(0.0)
235
+ pk_bounds.map do |indices|
236
+ self.class.new self[indices.first..indices.last]
237
+ end
238
+ end
239
+
240
+ # returns an array of PeakList objects
241
+ # assumes that this is one connected list of peaks (i.e., no
242
+ # zeros/whitespace on the edges or internally)
243
+ #
244
+ # /\
245
+ # / \/\
246
+ # / \
247
+ #
248
+ # if there are no local minima, just returns self inside the array
249
+ def split_contiguous(methd=:greedy_y, local_min_indices=nil)
250
+ local_min_indices ||= ((pb=peak_boundaries.first) && pb.shift && pb.pop && pb)
251
+
252
+ if local_min_indices.size == 0
253
+ self
254
+ else
255
+ peak_class = first.class
256
+ prev_lm_i = 0 # <- don't worry, will be set to bumped to zero
257
+ peak_lists = [ self.class.new([self[0]]) ]
258
+ local_min_indices.each do |lm_i|
259
+ peak_lists.last.push( *self[(prev_lm_i+1)..(lm_i-1)] )
260
+ case methd
261
+ when :greedy_y
262
+ if self[lm_i-1].y >= self[lm_i+1].y
263
+ peak_lists.last << self[lm_i]
264
+ peak_lists << self.class.new
265
+ else
266
+ peak_lists << self.class.new( [self[lm_i]] )
267
+ end
268
+ when :share
269
+ # for each local min, two new peaks will be created, with
270
+ # intensity shared between adjacent peak_lists
271
+ lm = self[lm_i]
272
+ sum = self[lm_i-1].y + self[lm_i+1].y
273
+ # push onto the last peaklist its portion of the local min
274
+ peak_lists.last << peak_class.new( [lm.x, lm.y * (self[lm_i-1].y.to_f/sum)] )
275
+ # create a new peaklist that contains its portion of the local min
276
+ peak_lists << self.class.new( [peak_class.new([lm.x, lm.y * (self[lm_i+1].y.to_f/sum)])] )
277
+ end
278
+ prev_lm_i = lm_i
279
+ end
280
+ peak_lists.last.push(*self[(prev_lm_i+1)...(self.size)] )
281
+ peak_lists
282
+ end
283
+ end
157
284
 
158
285
  # returns an Array of peaklist objects. Splits run of 1 or more local
159
286
  # minima into multiple peaklists. When a point is 'shared' between two
160
287
  # adjacent hill-ish areas, the choice of how to resolve multi-hills (runs
161
288
  # of data above zero) is one of:
162
289
  #
163
- # false/nil => only split on zeros
164
- # :share => give each peak its rightful portion of shared peaks, dividing the
290
+ # :zero = only split on zeros
291
+ # :share = give each peak its rightful portion of shared peaks, dividing the
165
292
  # intensity based on the intensity of adjacent peaks
166
- # :greedy_y => give the point to the peak with highest point next to
293
+ # :greedy_y = give the point to the peak with highest point next to
167
294
  # the point in question. tie goes lower.
168
295
  #
169
- # if return_local_minima is true, a parallel array of local minima indices is
170
- # returned (only makes sense if split_multipeaks is false)
171
- #
296
+ # Note that the peak surrounding a local_minima may be altered if using
297
+ # :share
298
+ #
172
299
  # assumes that a new peak can be made with an array containing the x
173
300
  # value and the y value.
174
- def split(split_multipeaks=false, return_local_minima=false)
175
- if split_multipeaks
176
- (zeroed_peaks, local_min_ind_ar) = self.split(false, true)
177
- $stderr.print "splitting on local minima ..." if $VERBOSE
178
- no_local_minima_peaks = zeroed_peaks.zip(local_min_ind_ar).map do |peak, lm_indices|
179
- new_peaks = [ peak.class.new ]
180
- if lm_indices.size > 0
181
- prev_lm_i = -1 # <- it's okay, we don't use until it is zero
182
- lm_indices.each do |lm_i|
183
- lm = peak[lm_i]
184
- point_class = lm.class
185
-
186
- # push onto the last peak all the points from right after the previous local min
187
- # to just before this local min
188
- new_peaks.last.push( *peak[(prev_lm_i+1)..(lm_i-1)] )
189
- before_pnt = peak[lm_i-1]
190
- after_pnt = peak[lm_i+1]
191
-
192
- case split_multipeaks
193
- when :share
194
- sum = before_pnt[1] + after_pnt[1]
195
- # push onto the last peak its portion of the local min
196
- new_peaks.last << point_class.new( [lm[0], lm[1] * (before_pnt[1].to_f/sum)] )
197
- # create a new peak that contains its portion of the local min
198
- new_peaks << self.class.new( [point_class.new([lm[0], lm[1] * (after_pnt[1].to_f/sum)])] )
199
- prev_lm_i = lm_i
200
- when :greedy_y
201
- if before_pnt[1] >= after_pnt[1]
202
- new_peaks.last << lm
203
- new_peaks << self.class.new
204
- prev_lm_i = lm_i
205
- else
206
- new_peaks << self.class.new( [lm] )
207
- prev_lm_i = lm_i
208
- end
209
- else
210
- raise ArgumentError, "only recognize :share, :greedy_y, or false for the arg in #split(arg)"
211
- end
212
- end
213
- new_peaks.last.push( *peak[(prev_lm_i+1)...peak.size] )
214
- new_peaks
215
- else
216
- [peak]
217
- end
218
- end.flatten(1) # end zip
219
- $stderr.puts "now #{no_local_minima_peaks.size} peaks." if $VERBOSE
220
- no_local_minima_peaks
301
+ def split(split_multipeaks_mthd=:zero)
302
+ if split_multipeaks_mthd == :zero
303
+ split_on_zeros
221
304
  else
222
- $stderr.print "splitting on zeros..." if $VERBOSE
223
- # first, split the peaks based on zero intensity values
224
- # and simultaneously keep track of the local minima within each
225
- # resulting peak
226
- peak_lists = []
227
- local_min_ind_ar = []
228
- in_peak = false
229
- self.each_with_index do |peak, index|
230
- previous_y = self[index - 1][1]
231
- if peak[1] > 0
232
- if !in_peak
233
- in_peak = 0
234
- peak_lists << self.class.new([peak])
235
- local_min_ind_ar << []
236
- else
237
- peak_lists.last << peak
238
- # if on_upslope(previous_y, point[1])
239
- if previous_y < peak[1]
240
- # If we were previously on a downslope and we are now on an upslope
241
- # then the previous index is a local min
242
- prev_previous_y = self[index - 2][1]
243
- # on_downslope(prev_previous_y, previous_y)
244
- if prev_previous_y > previous_y
245
- # We have found a local min
246
- local_min_ind_ar.last << (in_peak-1)
247
- end
248
- end # end if (upslope)
249
- end # end if !in_peak
250
- in_peak += 1
251
- elsif in_peak
252
- in_peak = false
253
- end # end if point[1] > 0
305
+ boundaries = peak_boundaries(0.0)
306
+ no_lm_pklsts = []
307
+ boundaries.each do |indices|
308
+ peak = self[indices.first..indices.last]
309
+ if indices.size == 2
310
+ no_lm_pklsts << peak
311
+ else # have local minima
312
+ multipeak = PeakList.new(peak)
313
+ local_min_inds = indices[1..-2].map {|i| i-indices.first}
314
+ peaklists = multipeak.split_contiguous(split_multipeaks_mthd, local_min_inds)
315
+ no_lm_pklsts.push *peaklists
316
+ end
254
317
  end
255
- $stderr.puts "#{peak_lists.size} no-whitespace-inside peak_lists." if $VERBOSE
256
- return_local_minima ? [peak_lists, local_min_ind_ar] : peak_lists
257
- end #
318
+ #$stderr.puts "now #{no_lm_pklsts.size} peaks." if $VERBOSE
319
+ no_lm_pklsts
320
+ end
258
321
  end # def split
259
322
  end
260
323
  end
324
+
325
+
326
+
327
+ =begin
328
+ if !opts[:only_data]
329
+ =end
330
+