mspire 0.7.8 → 0.7.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Rakefile CHANGED
@@ -19,6 +19,7 @@ Jeweler::Tasks.new do |gem|
19
19
  gem.add_dependency "builder", "~> 3.0.0"
20
20
  gem.add_dependency "bio", "~> 1.4.2"
21
21
  gem.add_dependency "trollop", "~> 1.16.2"
22
+ gem.add_dependency "uuid", ">= 2.3.5"
22
23
  # this should be a real dependency, but need to document getting this
23
24
  # working on windows first!
24
25
  gem.add_development_dependency "fftw3", "~> 0.3"
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.7.8
1
+ 0.7.9
@@ -150,7 +150,11 @@ module Mspire
150
150
  case arg
151
151
  when IO
152
152
  @io = arg
153
- @encoding = @io.bookmark(true) {|io| io.readline.match(/encoding=["'](.*?)["']/)[1] }
153
+ begin
154
+ @encoding = @io.bookmark(true) {|io| io.readline.match(/encoding=["'](.*?)["']/)[1] }
155
+ rescue EOFError
156
+ raise RuntimeError, "no encoding present in XML! (Is this even an xml file?)"
157
+ end
154
158
  @index_list = get_index_list
155
159
  read_header!
156
160
  when Hash
@@ -89,7 +89,14 @@ module Mspire
89
89
  end
90
90
  end
91
91
  data = base64.unpack("m*").first
92
- unzipped = compressed ? Zlib::Inflate.inflate(data) : data
92
+ # some implementations leave data blank if there aren't peaks
93
+ # even if they say it is zlib compressed...
94
+ unzipped =
95
+ if data.size > 0
96
+ compressed ? Zlib::Inflate.inflate(data) : data
97
+ else
98
+ data
99
+ end
93
100
  self.new( unzipped.unpack(precision_unpack) )
94
101
  end
95
102
 
@@ -7,5 +7,13 @@ module Mspire
7
7
  class Peak < Array
8
8
  alias_method :x, :first
9
9
  alias_method :y, :last
10
+
11
+ def x=(val)
12
+ self[0] = val
13
+ end
14
+
15
+ def y=(val)
16
+ self[1] = val
17
+ end
10
18
  end
11
19
  end
@@ -1,7 +1,8 @@
1
1
  require 'mspire/bin'
2
2
 
3
3
  module Mspire
4
- # a collection of peak objects
4
+ # a collection of peak objects. At a minimum, each peak should respond to
5
+ # :x and :y
5
6
  class PeakList < Array
6
7
 
7
8
  def lo_x
@@ -21,22 +22,46 @@ module Mspire
21
22
  :centroided => true,
22
23
  }
23
24
 
25
+ # for spectral peaks, this is the weighted m/z
26
+ def weighted_x
27
+ tot_intensity = self.inject(0.0) {|sum,peak| sum + peak.y }
28
+ _weighted_x = 0.0
29
+ self.each do |peak|
30
+ int = peak.y
31
+ signal_by_sample_index[peak.sample_id] += int
32
+ _weighted_x += (peak.first * (int/tot_intensity))
33
+ end
34
+ _weighted_x
35
+ end
36
+
37
+ # class methods
24
38
  class << self
25
39
 
40
+ # creates a new Mspire::PeakList and coerces each peak into an
41
+ # Mspire::Peak. If your peaks already behave like peaks you should use
42
+ # .new
43
+ def [](*peaks)
44
+ self.new( peaks.map {|peak| Mspire::Peak.new(peak) } )
45
+ end
46
+
26
47
  def create_bins(peaklists, opts)
27
- min, max = min_max_mz(peaklists)
48
+ min, max = min_max_x(peaklists)
28
49
 
29
50
  divisions = []
30
51
  bin_width = opts[:bin_width]
31
52
  use_ppm = (opts[:bin_unit] == :ppm)
32
- current_mz = min
53
+
54
+ puts "using bin width: #{bin_width}" if $VERBOSE
55
+ puts "using ppm for bins: #{use_ppm}" if $VERBOSE
56
+
57
+ current_x = min
33
58
  loop do
34
- if current_mz >= max
59
+ if current_x >= max
35
60
  divisions << max
36
61
  break
37
62
  else
38
- divisions << current_mz
39
- current_mz += ( use_ppm ? current_mz./(1e6).*(bin_width) : bin_width )
63
+ divisions << current_x
64
+ current_x += ( use_ppm ? current_x./(1e6).*(bin_width) : bin_width )
40
65
  end
41
66
  end
42
67
  # make each bin exclusive so there is no overlap
@@ -46,10 +71,10 @@ module Mspire
46
71
  bins
47
72
  end
48
73
 
49
- def min_max_mz(peaklists)
74
+ def min_max_x(peaklists)
50
75
  # find the min and max across all spectra
51
76
  first_peaklist = peaklists.first
52
- min = first_peaklist.first[0]; max = first_peaklist.last[0]
77
+ min = first_peaklist.first.x; max = first_peaklist.last.x
53
78
  peaklists.each do |peaklist|
54
79
  min = peaklist.lo_x if peaklist.lo_x < min
55
80
  max = peaklist.hi_x if peaklist.hi_x > max
@@ -58,53 +83,62 @@ module Mspire
58
83
  end
59
84
 
60
85
  def merge_centroids(peaklists, opts={})
86
+ opts[:return_data] = true if opts[:only_data]
61
87
 
62
88
  # Create Mspire::Bin objects
63
89
  bins = opts[:bins] ? opts[:bins] : create_bins(peaklists, opts)
90
+ puts "created #{bins.size} bins" if $VERBOSE
64
91
 
65
92
  peaklists.each do |peaklist|
66
- Mspire::Bin.bin(bins, peaklist, &:first)
93
+ Mspire::Bin.bin(bins, peaklist, &:x)
67
94
  end
68
95
 
69
96
  pseudo_peaks = bins.map do |bin|
70
- [bin, bin.data.reduce(0.0) {|sum,peak| sum + peak[1] }]
97
+ Mspire::Peak.new( [bin, bin.data.reduce(0.0) {|sum,peak| sum + peak.y }] )
71
98
  end
72
99
 
73
100
  pseudo_peaklist = Mspire::PeakList.new(pseudo_peaks)
74
101
 
75
- peak_lists = pseudo_peaklist.split(opts[:split])
102
+ separate_peaklists = pseudo_peaklist.split(opts[:split])
103
+
104
+ normalize_factor = opts[:normalize] ? peaklists.size : 1
76
105
 
77
106
  return_data = []
78
- final_peaklist = []
79
- peak_lists.each_with_index do |peak_list,i|
80
- #peaks.each do |peak|
81
- tot_intensity = peak_list.map(&:last).reduce(:+)
82
- return_data_per_peak = [] if opts[:return_data]
83
- weighted_mz = 0.0
84
- peak_list.each do |peak|
85
- pre_scaled_intensity = peak[0].data.reduce(0.0) {|sum,v| sum + v.last }
86
- post_scaled_intensity = peak[1]
87
- # some peaks may have been shared. In this case the intensity
88
- # for that peak was downweighted. However, the actual data
89
- # composing that peak is not altered when the intensity is
90
- # shared. So, to calculate a proper weighted avg we need to
91
- # downweight the intensity of any data point found within a bin
92
- # whose intensity was scaled.
93
- correction_factor =
94
- if pre_scaled_intensity != post_scaled_intensity
95
- post_scaled_intensity / pre_scaled_intensity
96
- else
97
- 1.0
98
- end
107
+ final_peaklist = Mspire::PeakList.new unless opts[:only_data]
99
108
 
100
- return_data_per_peak.push(*peak[0].data) if opts[:return_data]
109
+ separate_peaklists.each do |pseudo_peaklist|
110
+ data_peaklist = Mspire::PeakList.new
111
+ weight_x = 0.0
112
+ tot_intensity = pseudo_peaklist.inject(0.0) {|sum, bin_peak| sum + bin_peak.y }
113
+ #puts "TOT INTENSITY:"
114
+ #p tot_intensity
115
+ calc_from_lil_bins = pseudo_peaklist.inject(0.0) {|sum, bin_peak| sum + bin_peak.x.data.map(&:y).reduce(:+) }
116
+ #puts "LILBINS: "
117
+ #p calc_from_lil_bins
118
+ pseudo_peaklist.each do |bin_peak|
101
119
 
102
- peak[0].data.each do |lil_point|
103
- weighted_mz += lil_point[0] * ( (lil_point[1].to_f * correction_factor) / tot_intensity)
120
+ # For the :share method, the psuedo_peak intensity may have been
121
+ # adjusted, but the individual peaks were not. Correct this.
122
+ if opts[:split] == :share
123
+ post_scaled_y = bin_peak.y
124
+ pre_scaled_y = bin_peak.x.data.reduce(0.0) {|sum,peak| sum + peak.last }
125
+ #puts "PRESCALED Y:"
126
+ #p pre_scaled_y
127
+ if (post_scaled_y - pre_scaled_y).abs.round(10) != 0.0
128
+ correction = post_scaled_y / pre_scaled_y
129
+ bin_peak.x.data.each {|peak| peak.y = (peak.y * correction) }
130
+ end
104
131
  end
132
+
133
+ unless opts[:only_data]
134
+ bin_peak.x.data.each do |peak|
135
+ weight_x += peak.x * ( peak.y.to_f / tot_intensity)
136
+ end
137
+ end
138
+ (data_peaklist.push( *bin_peak.x.data )) if opts[:return_data]
105
139
  end
106
- return_data << return_data_per_peak if opts[:return_data]
107
- final_peaklist << Mspire::Peak.new([weighted_mz, tot_intensity])
140
+ final_peaklist << Mspire::Peak.new([weight_x, tot_intensity / normalize_factor]) unless opts[:only_data]
141
+ return_data << data_peaklist if opts[:return_data]
108
142
  end
109
143
  [final_peaklist, return_data]
110
144
  end
@@ -116,13 +150,13 @@ module Mspire
116
150
  # first):
117
151
  #
118
152
  # :bin_width => 5
119
- # :bin_unit => :ppm | :amu interpret bin_width as ppm or amu
153
+ # :bin_unit => :ppm|:amu interpret bin_width as ppm or amu
120
154
  # :bins => array of Mspire::Bin objects for custom bins (overides other bin options)
121
155
  # :normalize => true if true, divides total intensity by
122
156
  # number of spectra
123
157
  # :return_data => false returns a parallel array containing
124
158
  # the peaks associated with each returned peak
125
- # :split => false | :share | :greedy_y see Mspire::Peak#split
159
+ # :split => :zero|:greedy_y|:share see Mspire::Peak#split
126
160
  # :centroided => true treat the data as centroided
127
161
  #
128
162
  # The binning algorithm is roughly the fastest possible algorithm that
@@ -130,131 +164,167 @@ module Mspire
130
164
  # algorithm O(n + m))
131
165
  #
132
166
  # Assumes the peaklists are already sorted by m/z.
167
+ #
168
+ # Note that the peaks themselves will be altered if using the :share
169
+ # split method.
133
170
  def merge(peaklists, opts={})
134
171
  opts = DEFAULT_MERGE.merge(opts)
135
172
 
136
173
  (peaklist, returned_data) =
137
174
  if opts[:centroided]
138
- merge_centroids(peaklists, opts)
175
+ merge_centroids(peaklists, opts.dup)
139
176
  else
140
177
  raise NotImplementedError, "need to implement profile merging"
141
178
  end
142
179
 
143
- if opts[:normalize]
144
- sz = peaklists.size
145
- peaklist.each {|peak| peak[1] = peak[1].to_f / sz }
146
- end
147
- if opts[:return_data]
148
- $stderr.puts "returning peaklist (#{peaklist.size}) and data" if $VERBOSE
180
+ if opts[:only_data]
181
+ returned_data
182
+ elsif opts[:return_data]
149
183
  [peaklist, returned_data]
150
184
  else
151
- $stderr.puts "returning peaklist (#{peaklist.size})" if $VERBOSE
152
- peaklist
185
+ peaklist
153
186
  end
154
187
  end
188
+ end # end class << self
189
+
190
+
191
+ # returns an array with the indices outlining each peak. The first index
192
+ # is the start of the peak, the last index is the last of the peak.
193
+ # Interior indices represent local minima. So, peaks that have only two
194
+ # indices have no local minima.
195
+ def peak_boundaries(gt=0.0)
196
+ in_peak = false
197
+ prev_y = gt
198
+ prev_prev_y = gt
199
+ peak_inds = []
200
+ self.each_with_index do |peak, index|
201
+ curr_y = peak.y
202
+ if curr_y > gt
203
+ if !in_peak
204
+ in_peak = true
205
+ peak_inds << [index]
206
+ else
207
+ # if on_upslope
208
+ if prev_y < curr_y
209
+ # If we were previously on a downslope and we are now on an upslope
210
+ # then the previous index is a local min
211
+ # on_downslope(prev_previous_y, prev_y)
212
+ if prev_prev_y > prev_y
213
+ # We have found a local min
214
+ peak_inds.last << (index - 1)
215
+ end
216
+ end # end if (upslope)
217
+ end # end if !in_peak
218
+ elsif in_peak
219
+ peak_inds.last << (index - 1)
220
+ in_peak = false
221
+ end
222
+ prev_prev_y = prev_y
223
+ prev_y = curr_y
224
+ end
225
+ # add the last one to the last peak if it is a boundary
226
+ if self[-1].y > gt
227
+ peak_inds.last << (self.size-1)
228
+ end
229
+ peak_inds
155
230
  end
156
231
 
232
+ # returns an array of PeakList objects
233
+ def split_on_zeros(given_peak_boundaries=nil)
234
+ pk_bounds = given_peak_boundaries || peak_boundaries(0.0)
235
+ pk_bounds.map do |indices|
236
+ self.class.new self[indices.first..indices.last]
237
+ end
238
+ end
239
+
240
+ # returns an array of PeakList objects
241
+ # assumes that this is one connected list of peaks (i.e., no
242
+ # zeros/whitespace on the edges or internally)
243
+ #
244
+ # /\
245
+ # / \/\
246
+ # / \
247
+ #
248
+ # if there are no local minima, just returns self inside the array
249
+ def split_contiguous(methd=:greedy_y, local_min_indices=nil)
250
+ local_min_indices ||= ((pb=peak_boundaries.first) && pb.shift && pb.pop && pb)
251
+
252
+ if local_min_indices.size == 0
253
+ self
254
+ else
255
+ peak_class = first.class
256
+ prev_lm_i = 0 # <- don't worry, will be set to bumped to zero
257
+ peak_lists = [ self.class.new([self[0]]) ]
258
+ local_min_indices.each do |lm_i|
259
+ peak_lists.last.push( *self[(prev_lm_i+1)..(lm_i-1)] )
260
+ case methd
261
+ when :greedy_y
262
+ if self[lm_i-1].y >= self[lm_i+1].y
263
+ peak_lists.last << self[lm_i]
264
+ peak_lists << self.class.new
265
+ else
266
+ peak_lists << self.class.new( [self[lm_i]] )
267
+ end
268
+ when :share
269
+ # for each local min, two new peaks will be created, with
270
+ # intensity shared between adjacent peak_lists
271
+ lm = self[lm_i]
272
+ sum = self[lm_i-1].y + self[lm_i+1].y
273
+ # push onto the last peaklist its portion of the local min
274
+ peak_lists.last << peak_class.new( [lm.x, lm.y * (self[lm_i-1].y.to_f/sum)] )
275
+ # create a new peaklist that contains its portion of the local min
276
+ peak_lists << self.class.new( [peak_class.new([lm.x, lm.y * (self[lm_i+1].y.to_f/sum)])] )
277
+ end
278
+ prev_lm_i = lm_i
279
+ end
280
+ peak_lists.last.push(*self[(prev_lm_i+1)...(self.size)] )
281
+ peak_lists
282
+ end
283
+ end
157
284
 
158
285
  # returns an Array of peaklist objects. Splits run of 1 or more local
159
286
  # minima into multiple peaklists. When a point is 'shared' between two
160
287
  # adjacent hill-ish areas, the choice of how to resolve multi-hills (runs
161
288
  # of data above zero) is one of:
162
289
  #
163
- # false/nil => only split on zeros
164
- # :share => give each peak its rightful portion of shared peaks, dividing the
290
+ # :zero = only split on zeros
291
+ # :share = give each peak its rightful portion of shared peaks, dividing the
165
292
  # intensity based on the intensity of adjacent peaks
166
- # :greedy_y => give the point to the peak with highest point next to
293
+ # :greedy_y = give the point to the peak with highest point next to
167
294
  # the point in question. tie goes lower.
168
295
  #
169
- # if return_local_minima is true, a parallel array of local minima indices is
170
- # returned (only makes sense if split_multipeaks is false)
171
- #
296
+ # Note that the peak surrounding a local_minima may be altered if using
297
+ # :share
298
+ #
172
299
  # assumes that a new peak can be made with an array containing the x
173
300
  # value and the y value.
174
- def split(split_multipeaks=false, return_local_minima=false)
175
- if split_multipeaks
176
- (zeroed_peaks, local_min_ind_ar) = self.split(false, true)
177
- $stderr.print "splitting on local minima ..." if $VERBOSE
178
- no_local_minima_peaks = zeroed_peaks.zip(local_min_ind_ar).map do |peak, lm_indices|
179
- new_peaks = [ peak.class.new ]
180
- if lm_indices.size > 0
181
- prev_lm_i = -1 # <- it's okay, we don't use until it is zero
182
- lm_indices.each do |lm_i|
183
- lm = peak[lm_i]
184
- point_class = lm.class
185
-
186
- # push onto the last peak all the points from right after the previous local min
187
- # to just before this local min
188
- new_peaks.last.push( *peak[(prev_lm_i+1)..(lm_i-1)] )
189
- before_pnt = peak[lm_i-1]
190
- after_pnt = peak[lm_i+1]
191
-
192
- case split_multipeaks
193
- when :share
194
- sum = before_pnt[1] + after_pnt[1]
195
- # push onto the last peak its portion of the local min
196
- new_peaks.last << point_class.new( [lm[0], lm[1] * (before_pnt[1].to_f/sum)] )
197
- # create a new peak that contains its portion of the local min
198
- new_peaks << self.class.new( [point_class.new([lm[0], lm[1] * (after_pnt[1].to_f/sum)])] )
199
- prev_lm_i = lm_i
200
- when :greedy_y
201
- if before_pnt[1] >= after_pnt[1]
202
- new_peaks.last << lm
203
- new_peaks << self.class.new
204
- prev_lm_i = lm_i
205
- else
206
- new_peaks << self.class.new( [lm] )
207
- prev_lm_i = lm_i
208
- end
209
- else
210
- raise ArgumentError, "only recognize :share, :greedy_y, or false for the arg in #split(arg)"
211
- end
212
- end
213
- new_peaks.last.push( *peak[(prev_lm_i+1)...peak.size] )
214
- new_peaks
215
- else
216
- [peak]
217
- end
218
- end.flatten(1) # end zip
219
- $stderr.puts "now #{no_local_minima_peaks.size} peaks." if $VERBOSE
220
- no_local_minima_peaks
301
+ def split(split_multipeaks_mthd=:zero)
302
+ if split_multipeaks_mthd == :zero
303
+ split_on_zeros
221
304
  else
222
- $stderr.print "splitting on zeros..." if $VERBOSE
223
- # first, split the peaks based on zero intensity values
224
- # and simultaneously keep track of the local minima within each
225
- # resulting peak
226
- peak_lists = []
227
- local_min_ind_ar = []
228
- in_peak = false
229
- self.each_with_index do |peak, index|
230
- previous_y = self[index - 1][1]
231
- if peak[1] > 0
232
- if !in_peak
233
- in_peak = 0
234
- peak_lists << self.class.new([peak])
235
- local_min_ind_ar << []
236
- else
237
- peak_lists.last << peak
238
- # if on_upslope(previous_y, point[1])
239
- if previous_y < peak[1]
240
- # If we were previously on a downslope and we are now on an upslope
241
- # then the previous index is a local min
242
- prev_previous_y = self[index - 2][1]
243
- # on_downslope(prev_previous_y, previous_y)
244
- if prev_previous_y > previous_y
245
- # We have found a local min
246
- local_min_ind_ar.last << (in_peak-1)
247
- end
248
- end # end if (upslope)
249
- end # end if !in_peak
250
- in_peak += 1
251
- elsif in_peak
252
- in_peak = false
253
- end # end if point[1] > 0
305
+ boundaries = peak_boundaries(0.0)
306
+ no_lm_pklsts = []
307
+ boundaries.each do |indices|
308
+ peak = self[indices.first..indices.last]
309
+ if indices.size == 2
310
+ no_lm_pklsts << peak
311
+ else # have local minima
312
+ multipeak = PeakList.new(peak)
313
+ local_min_inds = indices[1..-2].map {|i| i-indices.first}
314
+ peaklists = multipeak.split_contiguous(split_multipeaks_mthd, local_min_inds)
315
+ no_lm_pklsts.push *peaklists
316
+ end
254
317
  end
255
- $stderr.puts "#{peak_lists.size} no-whitespace-inside peak_lists." if $VERBOSE
256
- return_local_minima ? [peak_lists, local_min_ind_ar] : peak_lists
257
- end #
318
+ #$stderr.puts "now #{no_lm_pklsts.size} peaks." if $VERBOSE
319
+ no_lm_pklsts
320
+ end
258
321
  end # def split
259
322
  end
260
323
  end
324
+
325
+
326
+
327
+ =begin
328
+ if !opts[:only_data]
329
+ =end
330
+