peak-performance 0.6.4-py3-none-any.whl → 0.7.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- peak_performance/models.py +130 -54
- peak_performance/pipeline.py +28 -25
- peak_performance/plots.py +50 -44
- peak_performance/test_models.py +127 -32
- peak_performance/test_pipeline.py +8 -1
- peak_performance-0.7.1.dist-info/METADATA +48 -0
- peak_performance-0.7.1.dist-info/RECORD +13 -0
- {peak_performance-0.6.4.dist-info → peak_performance-0.7.1.dist-info}/WHEEL +1 -1
- peak_performance-0.6.4.dist-info/METADATA +0 -67
- peak_performance-0.6.4.dist-info/RECORD +0 -13
- {peak_performance-0.6.4.dist-info → peak_performance-0.7.1.dist-info}/LICENSE.md +0 -0
- {peak_performance-0.6.4.dist-info → peak_performance-0.7.1.dist-info}/top_level.txt +0 -0
peak_performance/models.py
CHANGED
@@ -1,19 +1,20 @@
-"""
-
-Copyright (C) 2023 Forschungszentrum Jülich GmbH
+# PeakPerformance
+# Copyright (C) 2023 Forschungszentrum Jülich GmbH
 
-This program is free software: you can redistribute it and/or modify
-it under the terms of the GNU Affero General Public License as published
-by the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
 
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU Affero General Public License for more details.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
 
-You should have received a copy of the GNU Affero General Public License
-along with this program. If not, see <https://www.gnu.org/licenses/>.
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+"""
+This module contains functions for creating various kinds of peak models and to make initial guesses for their parameters.
 """
 
 from enum import Enum
@@ -28,12 +29,39 @@ import scipy.stats as st
 
 
 class ModelType(str, Enum):
-    """
+    """Enum of default model types."""
 
     Normal = "normal"
+    """Shape of a Gaussian Normal PDF."""
+
     SkewNormal = "skew_normal"
+    """Shape of a skewed Normal PDF."""
+
     DoubleNormal = "double_normal"
+    """Superposition of two ``Normal`` peaks."""
+
     DoubleSkewNormal = "double_skew_normal"
+    """Superposition of two ``SkewedNormal`` peaks."""
+
+
+def guess_noise(intensity):
+    """
+    Function for providing a guess for the noise width of a given signal
+    based on the first and last 15 % of data points in a time series.
+
+    Parameters
+    ----------
+    time
+        NumPy array with the time values of the relevant timeframe.
+    intensity
+        NumPy array with the intensity values of the relevant timeframe.
+    """
+    n = len(intensity)
+    ifrom = int(np.ceil(0.15 * n))
+    ito = int(np.floor(0.85 * n))
+    start_ints = intensity[:ifrom]
+    end_ints = intensity[ito:]
+    return np.std([*(start_ints - np.mean(start_ints)), *(end_ints - np.mean(end_ints))])
 
 
 def initial_guesses(time: np.ndarray, intensity: np.ndarray):
@@ -79,12 +107,16 @@ def initial_guesses(time: np.ndarray, intensity: np.ndarray):
     # use the indeces in noise_index to get the time and intensity of all noise data points
     noise_time = [time[n] for n in noise_index]
     noise_intensity = [intensity[n] for n in noise_index]
-    # calculate the width of the noise
-    noise_width_guess = max(noise_intensity) - min(noise_intensity)
 
     # use scipy to fit a linear regression through the noise as a prior for the eventual baseline
     baseline_fit = st.linregress(noise_time, noise_intensity)
 
+    # calculate the width of the noise
+    noise_width_guess = guess_noise(intensity)
+
+    # clip the noise to at least 10
+    noise_width_guess = np.clip(noise_width_guess, 10, np.inf)
+
     return baseline_fit.slope, baseline_fit.intercept, noise_width_guess
 
 
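Taken together, these two hunks change how `initial_guesses` estimates the noise level: instead of the peak-to-peak range of the detected noise points, the guess is now the standard deviation of the mean-centered first and last 15 % of the intensity series, floored at 10. A minimal sketch of the new behavior (the `guess_noise` body and the clip floor are copied from the diff; the synthetic signal is made up for illustration):

```python
import numpy as np

def guess_noise(intensity):
    """Std. dev. of the mean-centered first and last 15 % of data points."""
    n = len(intensity)
    ifrom = int(np.ceil(0.15 * n))
    ito = int(np.floor(0.85 * n))
    start_ints = intensity[:ifrom]
    end_ints = intensity[ito:]
    return np.std([*(start_ints - np.mean(start_ints)), *(end_ints - np.mean(end_ints))])

# Flat baselines with sigma=5 noise on both sides of a tall peak:
rng = np.random.default_rng(42)
intensity = np.concatenate([
    rng.normal(100, 5, size=150),    # leading baseline
    rng.normal(1000, 5, size=700),   # peak region, ignored by the edge-based guess
    rng.normal(100, 5, size=150),    # trailing baseline
])
noise_width_guess = np.clip(guess_noise(intensity), 10, np.inf)
print(noise_width_guess)  # ≈ 10: the raw guess of ≈ 5 is below the clip floor
```

Unlike the old range-based guess, the standard deviation is not dominated by a single outlier among the noise points.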
@@ -118,9 +150,9 @@ def baseline_slope_prior_params(slope_guess: Union[float, int]) -> Mapping[str,
     }
 
 
-def
+def normal_peak_shape(baseline, time: np.ndarray, mean, std, *, height):
     """
-    Model a peak shaped like
+    Model a peak shaped like a normal distribution.
 
     Parameters
     ----------

@@ -166,7 +198,7 @@ def define_model_normal(time: np.ndarray, intensity: np.ndarray) -> pm.Model:
     # add guesses to the pmodel as ConstantData
     pm.ConstantData("intercept_guess", intercept_guess)
     pm.ConstantData("slope_guess", slope_guess)
-    pm.ConstantData("noise_width_guess", noise_width_guess)
+    noise_guess = pm.ConstantData("noise_width_guess", noise_width_guess)
 
     # priors plus error handling in case of mathematically impermissible values
     baseline_intercept = pm.Normal(

@@ -174,7 +206,7 @@ def define_model_normal(time: np.ndarray, intensity: np.ndarray) -> pm.Model:
     )
     baseline_slope = pm.Normal("baseline_slope", **baseline_slope_prior_params(slope_guess))
     baseline = pm.Deterministic("baseline", baseline_intercept + baseline_slope * time)
-    noise = pm.LogNormal("noise",
+    noise = pm.LogNormal("noise", pt.log(noise_guess))
     # define priors for parameters of a normally distributed posterior
     mean = pm.Normal("mean", np.mean(time[[0, -1]]), np.ptp(time) / 2)
     std = pm.HalfNormal("std", np.ptp(time) / 3)
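The previously truncated `noise` prior line is now complete: the LogNormal prior is centered on the clipped noise guess via `pt.log(noise_guess)`, since `pm.LogNormal` takes `mu` on the log scale. Because a LogNormal's median is `exp(mu)` regardless of `sigma`, this makes the prior median equal to the guess. A quick sanity check:

```python
import numpy as np
import pymc as pm

noise_width_guess = 25.0
with pm.Model():
    # mu on the log scale => the prior median equals the guess
    noise = pm.LogNormal("noise", np.log(noise_width_guess))

draws = pm.draw(noise, draws=10_000, random_seed=1)
print(np.median(draws))  # ≈ 25
```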
@@ -182,7 +214,7 @@ def define_model_normal(time: np.ndarray, intensity: np.ndarray) -> pm.Model:
     pm.Deterministic("area", height / (1 / (std * np.sqrt(2 * np.pi))))
     pm.Deterministic("sn", height / noise)
     # posterior
-    y =
+    y = normal_peak_shape(baseline, time, mean, std, height=height)
     y = pm.Deterministic("y", y)
 
     # likelihood

@@ -193,7 +225,7 @@ def define_model_normal(time: np.ndarray, intensity: np.ndarray) -> pm.Model:
 
 def double_model_mean_prior(time):
     """
-    Function creating prior probability distributions for
+    Function creating prior probability distributions for the mean retention times of a dual-peak.
 
     Parameters
     ----------

@@ -203,31 +235,75 @@ def double_model_mean_prior(time):
     Returns
     -------
     mean
-        Normally distributed prior for the ordered means of the
+        Normally distributed prior for the ordered means of the multi-peak model.
     diff
-        Difference between
+        Difference between the group mean and peak-wise mean.
     meanmean
-        Normally distributed prior for the mean of the
+        Normally distributed prior for the group mean of the peak means.
+    """
+    tmin = np.min(time)
+    tdelta = np.ptp(time)
+    meanmean = pm.Normal("meanmean", mu=tmin + tdelta / 2, sigma=tdelta / 6)
+    separation = pm.Gamma(
+        "separation",
+        mu=tdelta / 6,
+        sigma=tdelta / 12,
+    )
+    offset = pm.Deterministic("offset", pt.stack([-separation / 2, separation / 2]), dims="subpeak")
+    mean = pm.Deterministic(
+        "mean",
+        meanmean + offset,
+        dims=("subpeak",),
+    )
+    return mean, offset, meanmean
+
+
+def multi_peak_means_prior(time):
+    """
+    Function creating prior probability distributions for multi-peaks using a ZeroSumNormal distribution.
+
+    The number of peaks is determined from the `"subpeak"` model coordinates.
+
+    Parameters
+    ----------
+    time
+        NumPy array with the time values of the relevant timeframe.
+
+    Returns
+    -------
+    mean
+        Normally distributed prior for the ordered means of the multi-peak model.
+    offset
+        Time offset between the group mean and peak-wise mean.
+    meanmean
+        Normally distributed prior for the group mean of the peak means.
     """
+    pmodel = pm.modelcontext(None)
     meanmean = pm.Normal("meanmean", mu=np.min(time) + np.ptp(time) / 2, sigma=np.ptp(time) / 6)
-
-    "
-        sigma=
-
+    offset_unsorted = pm.ZeroSumNormal(
+        "offset_unsorted",
+        sigma=2,
+        # Support arbitrary number of subpeaks
+        shape=len(pmodel.coords["subpeak"]),
+        # NOTE: As of PyMC v5.14, the OrderedTransform and ZeroSumTransform are incompatible.
+        # See https://github.com/pymc-devs/pymc/issues/6975.
+        # As a workaround we'll call pt.sort a few lines below.
     )
-
+    offset = pm.Deterministic("offset", pt.sort(offset_unsorted), dims="subpeak")
+    mean = pm.Deterministic(
         "mean",
-
-
-
+        meanmean + offset,
+        # Introduce a small jitter to the subpeak means to decouple them
+        # from the strictly asymmetric ZeroSumNormal entries.
+        # This reduces the chances of unwanted bimodality.
         dims=("subpeak",),
     )
-    return mean,
+    return mean, offset, meanmean
 
 
-def
+def double_normal_peak_shape(baseline, time: np.ndarray, mean, std, *, height):
     """
-
+    Model a peak shaped like a univariate ordered normal distribution.
 
     Parameters
     ----------
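The `pt.sort` workaround in the new `multi_peak_means_prior` deserves a closer look: because the `OrderedTransform` and `ZeroSumTransform` cannot be combined (see the PyMC issue linked in the code comment), the ordering is imposed on a `Deterministic` downstream of the unordered zero-sum variable rather than through a transform. A minimal sketch of the pattern, stripped of the peak model:

```python
import numpy as np
import pymc as pm
import pytensor.tensor as pt

with pm.Model(coords={"subpeak": [0, 1]}) as pmodel:
    meanmean = pm.Normal("meanmean", mu=10.0, sigma=1.0)
    # Free variable: zero-sum across subpeaks, but unordered.
    offset_unsorted = pm.ZeroSumNormal("offset_unsorted", sigma=2, shape=2)
    # Ordering happens outside the transform machinery.
    offset = pm.Deterministic("offset", pt.sort(offset_unsorted), dims="subpeak")
    mean = pm.Deterministic("mean", meanmean + offset, dims=("subpeak",))

drawn = pm.draw(pmodel["offset"], draws=100, random_seed=0)
assert np.all(drawn[:, 0] <= drawn[:, 1])  # subpeak 0 always comes first
```

The new `test_zsn_sorting` in `test_models.py` further down in this diff verifies exactly this property for both prior and MCMC draws.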
@@ -281,7 +357,7 @@ def define_model_double_normal(time: np.ndarray, intensity: np.ndarray) -> pm.Model:
     # add guesses to the pmodel as ConstantData
     pm.ConstantData("intercept_guess", intercept_guess)
     pm.ConstantData("slope_guess", slope_guess)
-    pm.ConstantData("noise_width_guess", noise_width_guess)
+    noise_guess = pm.ConstantData("noise_width_guess", noise_width_guess)
 
     # priors
     baseline_intercept = pm.Normal(

@@ -289,8 +365,9 @@ def define_model_double_normal(time: np.ndarray, intensity: np.ndarray) -> pm.Model:
     )
     baseline_slope = pm.Normal("baseline_slope", **baseline_slope_prior_params(slope_guess))
     baseline = pm.Deterministic("baseline", baseline_intercept + baseline_slope * time)
-    noise = pm.LogNormal("noise",
-
+    noise = pm.LogNormal("noise", pt.log(noise_guess))
+    # NOTE: We expect dobule-peaks to be narrower w.r.t. the time frame, compare to single peaks.
+    std = pm.HalfNormal("std", sigma=[np.ptp(time) / 6, np.ptp(time) / 6], dims=("subpeak",))
     height = pm.HalfNormal(
         "height", sigma=[0.95 * np.max(intensity), 0.95 * np.max(intensity)], dims=("subpeak",)
     )

@@ -302,7 +379,7 @@ def define_model_double_normal(time: np.ndarray, intensity: np.ndarray) -> pm.Model:
     mean, diff, meanmean = double_model_mean_prior(time)
 
     # posterior
-    y =
+    y = double_normal_peak_shape(baseline, time, mean, std, height=height)
     y = pm.Deterministic("y", y)
 
     # likelihood

@@ -323,10 +400,9 @@ def std_skew_calculation(scale, alpha):
         Skewness parameter of the skew normal distribution.
 
     Returns
-
+    -------
     std
         Standard deviation of a skew normal distribution.
-    -------
     """
     return np.sqrt(scale**2 * (1 - (2 * alpha**2) / ((alpha**2 + 1) * np.pi)))
 

@@ -345,7 +421,7 @@ def mean_skew_calculation(loc, scale, alpha):
         Skewness parameter of the skew normal distribution.
 
     Returns
-
+    -------
     mean
         Arithmetic mean of a skew normal distribution.
     """

@@ -419,7 +495,7 @@ def height_calculation(area, loc, scale, alpha, mode_skew):
         Mode of the skew normal distribution.
 
     Returns
-
+    -------
     mean
         Arithmetic mean of a skew normal distribution.
     """

@@ -430,9 +506,9 @@ def height_calculation(area, loc, scale, alpha, mode_skew):
     )
 
 
-def
+def skew_normal_peak_shape(baseline, time, mean, std, alpha, *, area):
     """
-
+    Model a peak shaped like a skew normal distribution.
 
     Parameters
     ----------

@@ -489,7 +565,7 @@ def define_model_skew(time: np.ndarray, intensity: np.ndarray) -> pm.Model:
     # add guesses to the pmodel as ConstantData
     pm.ConstantData("intercept_guess", intercept_guess)
     pm.ConstantData("slope_guess", slope_guess)
-    pm.ConstantData("noise_width_guess", noise_width_guess)
+    noise_guess = pm.ConstantData("noise_width_guess", noise_width_guess)
 
     # priors plus error handling in case of mathematically impermissible values
     baseline_intercept = pm.Normal(

@@ -497,7 +573,7 @@ def define_model_skew(time: np.ndarray, intensity: np.ndarray) -> pm.Model:
     )
     baseline_slope = pm.Normal("baseline_slope", **baseline_slope_prior_params(slope_guess))
     baseline = pm.Deterministic("baseline", baseline_intercept + baseline_slope * time)
-    noise = pm.LogNormal("noise",
+    noise = pm.LogNormal("noise", pt.log(noise_guess))
     mean = pm.Normal("mean", np.mean(time[[0, -1]]), np.ptp(time) / 2)
     std = pm.HalfNormal("std", np.ptp(time) / 3)
     alpha = pm.Normal("alpha", 0, 3.5)

@@ -528,7 +604,7 @@ def define_model_skew(time: np.ndarray, intensity: np.ndarray) -> pm.Model:
         height_formula,
     )
     pm.Deterministic("sn", height / noise)
-    y =
+    y = skew_normal_peak_shape(baseline, time, mean, std, alpha, area=area)
     y = pm.Deterministic("y", y)
 
     # likelihood

@@ -537,9 +613,9 @@ def define_model_skew(time: np.ndarray, intensity: np.ndarray) -> pm.Model:
     return pmodel
 
 
-def
+def double_skew_normal_peak_shape(baseline, time: np.ndarray, mean, std, alpha, *, area):
     """
-
+    Model a peak shaped like the a univariate ordered skew normal distribution.
 
     Parameters
     ----------

@@ -605,7 +681,7 @@ def define_model_double_skew_normal(time: np.ndarray, intensity: np.ndarray) -> pm.Model:
     # add guesses to the pmodel as ConstantData
     pm.ConstantData("intercept_guess", intercept_guess)
     pm.ConstantData("slope_guess", slope_guess)
-    pm.ConstantData("noise_width_guess", noise_width_guess)
+    noise_guess = pm.ConstantData("noise_width_guess", noise_width_guess)
 
     # priors plus error handling in case of mathematically impermissible values
     baseline_intercept = pm.Normal(

@@ -613,7 +689,7 @@ def define_model_double_skew_normal(time: np.ndarray, intensity: np.ndarray) -> pm.Model:
     )
     baseline_slope = pm.Normal("baseline_slope", **baseline_slope_prior_params(slope_guess))
     baseline = pm.Deterministic("baseline", baseline_intercept + baseline_slope * time)
-    noise = pm.LogNormal("noise",
+    noise = pm.LogNormal("noise", pt.log(noise_guess))
     # use univariate ordered normal distribution for the mean values
     # use a zero sum normal distribution to describe the distance of the mean values
     # from the mean of the mean values ("meanmean")

@@ -656,7 +732,7 @@ def define_model_double_skew_normal(time: np.ndarray, intensity: np.ndarray) -> pm.Model:
     pm.Deterministic("sn", height / noise, dims=("subpeak",))
 
     # posterior
-    y =
+    y = double_skew_normal_peak_shape(baseline, time, mean, std, alpha, area=area)
     y = pm.Deterministic("y", y)
 
     # likelihood
peak_performance/pipeline.py
CHANGED
@@ -1,19 +1,20 @@
-"""
-
-Copyright (C) 2023 Forschungszentrum Jülich GmbH
+# PeakPerformance
+# Copyright (C) 2023 Forschungszentrum Jülich GmbH
 
-This program is free software: you can redistribute it and/or modify
-it under the terms of the GNU Affero General Public License as published
-by the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
 
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU Affero General Public License for more details.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
 
-You should have received a copy of the GNU Affero General Public License
-along with this program. If not, see <https://www.gnu.org/licenses/>.
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+"""
+Defines steps for a pipeline to process LC-MS-MS data.
 """
 
 import importlib

@@ -489,6 +490,7 @@ def sampling(pmodel, **sample_kwargs):
     idata
         Inference data object.
     """
+    sample_kwargs.setdefault("chains", 4)
    sample_kwargs.setdefault("tune", 2000)
    sample_kwargs.setdefault("draws", 2000)
    # check if nutpie is available; if so, use it to enhance performance
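`sampling` now also pins the number of chains to 4 by default. Because `dict.setdefault` only fills in missing keys, any user-supplied value still wins. The semantics in isolation:

```python
sample_kwargs = {"chains": 2}          # user-supplied
sample_kwargs.setdefault("chains", 4)  # no effect: the key already exists
sample_kwargs.setdefault("tune", 2000)
sample_kwargs.setdefault("draws", 2000)
print(sample_kwargs)  # {'chains': 2, 'tune': 2000, 'draws': 2000}
```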
@@ -647,7 +649,7 @@ def posterior_predictive_sampling(pmodel, idata):
        Inference data object updated with the posterior predictive samples.
    """
    with pmodel:
-        idata.extend(pm.sample_posterior_predictive(idata
+        idata.extend(pm.sample_posterior_predictive(idata))
    return idata
 
 

@@ -1185,7 +1187,7 @@ def pipeline(
        Data format (suffix) of the raw data, default is '.npy'.
 
    Returns
-
+    -------
    path_results
        Path variable pointing to the newly created folder for this batch.
    """

@@ -1222,7 +1224,7 @@ def pipeline_restart(
        Path variable pointing to the directory of the broken PeakPerformance batch
 
    Returns
-
+    -------
    path_results_new
        Path variable pointing to the newly created folder for the restarted batch.
    """

@@ -1321,7 +1323,7 @@ def parse_files_for_model_selection(signals: pandas.DataFrame) -> Dict[str, str]:
        DataFrame containing the signals tab of Template.xlsx.
 
    Returns
-
+    -------
    files_for_selection
        Dict with file names as keys and unique identifiers as values.
    """

@@ -1409,7 +1411,7 @@ def selected_models_to_template(
 
 
 def model_selection_check(
-    result_df: pandas.DataFrame, ic: str, elpd_threshold: Union[str, float] =
+    result_df: pandas.DataFrame, ic: str, elpd_threshold: Union[str, float] = 35
 ) -> str:
    """
    During model seleciton, double peak models are sometimes incorrectly preferred due to their increased complexity.

@@ -1428,17 +1430,18 @@ def model_selection_check(
        to be accepted.
 
    Returns
-
+    -------
    selected_model
        Name of the selected model type.
    """
    selected_model = str(result_df.index[0])
    if "double" in selected_model:
        df_single_peak_models = result_df[~result_df.index.str.contains("double")]
-
-
-
-
+        if len(df_single_peak_models) > 0:
+            elpd_single = max(list(df_single_peak_models[f"elpd_{ic}"]))
+            elpd_double = max(list(result_df[f"elpd_{ic}"]))
+            if not elpd_double > elpd_single + elpd_threshold:
+                selected_model = str(df_single_peak_models.index[0])
    return selected_model
 
 
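The guard that was blank in this rendering is now visible: when a double-peak model ranks first, it is only kept if its best elpd score beats the best single-peak score by more than `elpd_threshold` (now defaulting to 35). A worked example, mirroring the test case further down in this diff:

```python
import pandas
from peak_performance import pipeline as pl

result_df = pandas.DataFrame(
    {"elpd_loo": [50, 30, 20, -5], "ic": ["loo", "loo", "loo", "loo"]},
    index=["double_normal", "double_skew_normal", "normal", "skew_normal"],
)
# Best double-peak elpd is 50, best single-peak elpd is 20:
print(pl.model_selection_check(result_df, "loo", 25))  # "double_normal" (50 > 20 + 25)
print(pl.model_selection_check(result_df, "loo", 35))  # "normal" (50 is not > 20 + 35)
```

The added `if len(df_single_peak_models) > 0` check also covers batches where only double-peak models were fitted, which is tested as "case 3" below.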
@@ -1470,7 +1473,7 @@ def selection_loop(
        "waic": widely applicable information criterion)
 
    Returns
-
+    -------
    result_df
        DataFrame containing the ranking and scores of the model selection.
    model_dict

@@ -1562,7 +1565,7 @@ def model_selection(path_raw_data: Union[str, os.PathLike], *, ic: str = "loo"):
        "waic": widely applicable information criterion)
 
    Returns
-
+    -------
    comparison_results
        DataFrame containing all rankings from model selection.
    model_dict
peak_performance/plots.py
CHANGED
@@ -1,24 +1,25 @@
-"""
-
-Copyright (C) 2023 Forschungszentrum Jülich GmbH
+# PeakPerformance
+# Copyright (C) 2023 Forschungszentrum Jülich GmbH
 
-This program is free software: you can redistribute it and/or modify
-it under the terms of the GNU Affero General Public License as published
-by the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
 
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU Affero General Public License for more details.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
 
-You should have received a copy of the GNU Affero General Public License
-along with this program. If not, see <https://www.gnu.org/licenses/>.
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+"""
+Functions for preparing diagnostic and QC plots.
 """
 
 import os
 from pathlib import Path
-from typing import Sequence, Union
+from typing import Optional, Sequence, Union
 
 import arviz as az
 import numpy as np

@@ -31,7 +32,7 @@ def plot_raw_data(
    identifier: str,
    time: np.ndarray,
    intensity: np.ndarray,
-    path: Union[str, os.PathLike],
+    path: Optional[Union[str, os.PathLike]],
    save_formats: Sequence[str] = ("png", "svg"),
 ):
    """

@@ -62,9 +63,10 @@ def plot_raw_data(
    plt.xticks(size=11.5)
    plt.yticks(size=11.5)
    fig.tight_layout()
-
-
-
+    if path is not None:
+        for format in save_formats:
+            fig.savefig(Path(path) / f"{identifier}_NoPeak.{format}", format=format)
+    plt.close(fig)
 
    return
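With `path` now `Optional`, the plotting functions can be called without writing any files; the `fig.savefig` loop is skipped and the figure is simply closed. A usage sketch (the argument values are placeholders; the argument order follows the signatures in the diff):

```python
import numpy as np
from peak_performance import plots

time = np.linspace(25.0, 27.0, 100)
intensity = np.random.normal(1000, 30, size=100)

# With a path: writes <identifier>_NoPeak.png and .svg into that directory.
plots.plot_raw_data("A2t2R1Part1_132", time, intensity, "./results")

# With path=None: nothing is written (useful for dry runs).
plots.plot_raw_data("A2t2R1Part1_132", time, intensity, None)
```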
@@ -74,7 +76,6 @@ def plot_density(
 ):
    """
    Method to plot the original data points alongside the posterior predictive plot (percentiles marked with a black, dashed line).
-    Serves as a more accurate comparison between data and model than comparing data and posterior distribution.
 
    Parameters
    ----------

@@ -135,7 +136,7 @@ def plot_posterior_predictive(
    identifier: str,
    time: np.ndarray,
    intensity: np.ndarray,
-    path: Union[str, os.PathLike],
+    path: Optional[Union[str, os.PathLike]],
    idata: az.InferenceData,
    discarded: bool,
    save_formats: Sequence[str] = ("png", "svg"),

@@ -168,7 +169,7 @@ def plot_posterior_predictive(
    plot_density(
        ax=ax,
        x=time,
-        samples=idata.posterior_predictive.
+        samples=idata.posterior_predictive["L"].stack(sample=("chain", "draw")).T.values,
        percentiles=(2.5, 97.5),
    )
    # plot the raw data points
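The `samples` argument was truncated in this rendering; it now explicitly selects the likelihood variable `"L"` from the posterior predictive group. `stack(sample=("chain", "draw"))` merges the chain and draw dimensions into a single trailing `sample` dimension, so the transpose yields one row per posterior predictive draw. A sketch with a synthetic stand-in for a real `InferenceData` from sampling:

```python
import arviz as az
import numpy as np

# Synthetic posterior predictive of the likelihood "L": (chain, draw, time)
idata = az.from_dict(posterior_predictive={"L": np.random.rand(2, 50, 120)})
samples = idata.posterior_predictive["L"].stack(sample=("chain", "draw")).T.values
print(samples.shape)  # (100, 120): 2 chains x 50 draws, one row per draw
```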
@@ -179,16 +180,19 @@ def plot_posterior_predictive(
    plt.yticks(size=11.5)
    plt.legend()
    fig.tight_layout()
-
-
-
-
-
-
-
-
-
-
+    if path is not None:
+        # if signal was discarded, add a "_NoPeak" to the file name
+        if discarded:
+            for format in save_formats:
+                fig.savefig(
+                    Path(path) / f"{identifier}_predictive_posterior_NoPeak.{format}", format=format
+                )
+        else:
+            for format in save_formats:
+                fig.savefig(
+                    Path(path) / f"{identifier}_predictive_posterior.{format}", format=format
+                )
+    plt.close(fig)
 
    return

@@ -197,7 +201,7 @@ def plot_posterior(
    identifier: str,
    time: np.ndarray,
    intensity: np.ndarray,
-    path: Union[str, os.PathLike],
+    path: Optional[Union[str, os.PathLike]],
    idata: az.InferenceData,
    discarded: bool,
    save_formats: Sequence[str] = ("png", "svg"),

@@ -246,14 +250,15 @@ def plot_posterior(
    plt.xticks(size=11.5)
    plt.yticks(size=11.5)
    fig.tight_layout()
-
-
-
-
-
-
-
-
+    if path is not None:
+        # if signal was discarded, add a "_NoPeak" to the file name
+        if discarded:
+            for format in save_formats:
+                fig.savefig(Path(path) / f"{identifier}_posterior_NoPeak.{format}", format=format)
+        else:
+            for format in save_formats:
+                fig.savefig(Path(path) / f"{identifier}_posterior.{format}", format=format)
+    plt.close(fig)
 
    return

@@ -261,7 +266,7 @@ def plot_posterior(
 def plot_model_comparison(
    df_comp: pandas.DataFrame,
    identifier: str,
-    path: Union[str, os.PathLike],
+    path: Optional[Union[str, os.PathLike]],
    save_formats: Sequence[str] = ("png", "svg"),
 ):
    """

@@ -282,8 +287,9 @@ def plot_model_comparison(
    axes = az.plot_compare(df_comp, insample_dev=False)
    fig = axes.figure
    plt.tight_layout()
-
-
-
+    if path is not None:
+        for format in save_formats:
+            fig.savefig(Path(path) / f"model_comparison_{identifier}.{format}", format=format)
+    plt.close(fig)
 
    return
peak_performance/test_models.py
CHANGED
@@ -3,34 +3,110 @@ from pathlib import Path
 import arviz as az
 import numpy as np
 import pymc as pm
+import pytensor.tensor as pt
 import pytest
 import scipy.integrate
 import scipy.stats as st
 
 from peak_performance import models
 
+_DP_ROOT = Path(__file__).absolute().parent.parent
+_REQUIRED_VARIABLES = {
+    "baseline_slope",
+    "baseline_intercept",
+    "baseline",
+    "std",
+    "height",
+    "area",
+    "sn",
+    "mean",
+    "y",
+    "noise",
+}
+_REQUIRED_DATA = {
+    "slope_guess",
+    "intercept_guess",
+    "noise_width_guess",
+}
+
+
+def test_noise_guessing():
+    expected = 0.7
+    intensities = [
+        *np.random.normal(10, expected, size=200),
+        *np.random.normal(0, 6, size=600),
+        *np.random.normal(40, expected, size=200),
+    ]
+    actual = models.guess_noise(intensities)
+    assert 0.6 < actual < 0.8
+    pass
+
 
 def test_initial_guesses():
    # define time and intensity for example with known result
    time = 2 + 0.1 * np.arange(17)
    intensity = [1, 5, 3] + 11 * [1000] + [7, 9, 11]
    # define expected results
-    expected_noise_width = np.ptp([1, 5, 3, 7, 9, 11])
    expected_baseline_fit = st.linregress([2, 2.1, 2.2, 3.4, 3.5, 3.6], [1, 5, 3, 7, 9, 11])
    # get the values from the initial guesses function
    slope, intercept, noise_width = models.initial_guesses(time, intensity)
    # compare the outcome with the expected values
    assert expected_baseline_fit.slope == slope
    assert expected_baseline_fit.intercept == intercept
-
+    # With this example the noise is clipped to at least 10
+    assert noise_width == 10
+    pass
+
+
+def test_zsn_sorting():
+    """This tests a workaround that we rely on for multi-peak models."""
+    coords = {
+        "thing": ["left", "center", "right"],
+    }
+    with pm.Model(coords=coords) as pmodel:
+        hyper = pm.Normal("hyper", mu=0, sigma=3)
+        offset_unsorted = pm.ZeroSumNormal(
+            "offset_unsorted",
+            sigma=1,
+            shape=3,
+        )
+        # Create a sorted deterministic without using transforms
+        offset = pm.Deterministic("offset", pt.sort(offset_unsorted), dims="thing")
+        pos = pm.Deterministic(
+            "pos",
+            hyper + offset,
+            dims="thing",
+        )
+        # Observe the two things in incorrect order to provoke the model 😈
+        dat = pm.Data("dat", [0.2, 0.05, -0.3], dims="thing")
+        pm.Normal("L", pos, observed=dat, dims="thing")
+
+    # Check draws from the prior
+    drawn = pm.draw(offset, draws=69)
+    np.testing.assert_array_less(drawn[:, 0], drawn[:, 1])
+
+    # And check MCMC draws too
+    with pmodel:
+        idata = pm.sample(
+            chains=1, tune=10, draws=69, step=pm.Metropolis(), compute_convergence_checks=False
+        )
+    for vname in ["offset", "pos"]:
+        np.testing.assert_array_less(
+            idata.posterior[vname].sel(thing="left"),
+            idata.posterior[vname].sel(thing="center"),
+        )
+        np.testing.assert_array_less(
+            idata.posterior[vname].sel(thing="center"),
+            idata.posterior[vname].sel(thing="right"),
+        )
    pass
 
 
 class TestDistributions:
-    def
+    def test_normal_peak_shape(self):
        x = np.linspace(-5, 10, 10000)
        expected = st.norm.pdf(x, 3, 2)
-        actual_pt = models.
+        actual_pt = models.normal_peak_shape(0, x, 3, 2, height=np.max(expected))
        # cast arrays to float data type in order to avoid error of np.testing.assert_allclose() due to using np.isfinite under the hood
        actual = actual_pt.eval().astype(float)
        expected = expected.astype(float)

@@ -38,11 +114,11 @@ class TestDistributions:
        np.testing.assert_allclose(expected, actual, atol=0.0000001)
        pass
 
-    def
+    def test_double_normal_peak_shape(self):
        x = np.linspace(5, 12, 10000)
        y1 = st.norm.pdf(x, loc=7.5, scale=0.6)
        y2 = st.norm.pdf(x, loc=9, scale=0.4) * 2
-        y_double_pt = models.
+        y_double_pt = models.double_normal_peak_shape(
            0, x, (7.5, 9), (0.6, 0.4), height=(np.max(y1), np.max(y2))
        )
        y_double = y_double_pt.eval().astype(float)

@@ -105,11 +181,11 @@ class TestDistributions:
        np.testing.assert_allclose(expected_mode_skew, actual_mode, atol=5e-3)
        pass
 
-    def
+    def test_skew_normal_peak_shape(self):
        x = np.linspace(-1, 5.5, 10000)
        # test first with positive alpha
        expected = st.skewnorm.pdf(x, 3, loc=1.2, scale=1.1)
-        actual_pt = models.
+        actual_pt = models.skew_normal_peak_shape(0, x, 1.2, 1.1, 3, area=1)
        # cast arrays to float data type in order to avoid error of np.testing.assert_allclose() due to using np.isfinite under the hood
        actual = actual_pt.eval().astype(float)
        expected = expected.astype(float)

@@ -118,7 +194,7 @@ class TestDistributions:
 
        # test again with negative alpha
        expected = st.skewnorm.pdf(x, -3, loc=1.2, scale=1.1)
-        actual_pt = models.
+        actual_pt = models.skew_normal_peak_shape(0, x, 1.2, 1.1, -3, area=1)
        # cast arrays to float data type in order to avoid error of np.testing.assert_allclose() due to using np.isfinite under the hood
        actual = actual_pt.eval().astype(float)
        expected = expected.astype(float)

@@ -133,8 +209,8 @@ class TestDistributions:
        height = np.max(y)
        area = scipy.integrate.quad(lambda x: st.norm.pdf(x, loc=1, scale=1), -10, 10)[0]
        x = np.linspace(-10, 10, 10000)
-        y_actual_pt = models.
-        y_skew_actual_pt = models.
+        y_actual_pt = models.normal_peak_shape(0, x, 1, 1, height=height)
+        y_skew_actual_pt = models.skew_normal_peak_shape(0, x, 1, 1, 0, area=area)
        y_actual = y_actual_pt.eval().astype(float)
        y_skew_actual = y_skew_actual_pt.eval().astype(float)
        # many values are extremely close to zero so rtol was increased.

@@ -142,7 +218,7 @@ class TestDistributions:
        np.testing.assert_allclose(y_skew_actual, y_actual, atol=1e-20, rtol=0.9)
        pass
 
-    def
+    def test_double_skew_normal_peak_shape(self):
        x1 = np.arange(4, 6, 0.1)
        x2 = np.arange(6, 8, 0.1)
        alpha = 5

@@ -150,7 +226,7 @@ class TestDistributions:
        y2 = st.skewnorm.pdf(x2, alpha, loc=6.3, scale=0.2)
        time = np.array(list(x1) + list(x2))
        intensity = np.array(list(y1) + list(y2))
-        y_double_pt = models.
+        y_double_pt = models.double_skew_normal_peak_shape(
            0, time, (5, 6.3), (0.2, 0.2), (5, 5), area=(1, 1)
        )
        y_double = y_double_pt.eval().astype(float)

@@ -158,33 +234,52 @@ class TestDistributions:
 
 
 @pytest.mark.parametrize(
-    "
+    "define_func",
+    [
+        models.define_model_normal,
+        models.define_model_skew,
+    ],
 )
-def
-    timeseries = np.load(
-        Path(__file__).absolute().parent.parent / "example" / "A2t2R1Part1_132_85.9_86.1.npy"
-    )
+def test_singlepeak_sampling(define_func):
+    timeseries = np.load(_DP_ROOT / "example" / "A2t2R1Part1_132_85.9_86.1.npy")
 
-
-    pmodel = models.define_model_normal(timeseries[0], timeseries[1])
-    elif model_type == models.ModelType.SkewNormal:
-        pmodel = models.define_model_skew(timeseries[0], timeseries[1])
-    elif model_type == models.ModelType.DoubleNormal:
-        pmodel = models.define_model_double_normal(timeseries[0], timeseries[1])
-    elif model_type == models.ModelType.DoubleSkewNormal:
-        pmodel = models.define_model_double_skew_normal(timeseries[0], timeseries[1])
+    pmodel = define_func(timeseries[0], timeseries[1])
    with pmodel:
        idata = pm.sample(cores=2, chains=2, tune=3, draws=5)
-
-
-
-
-
+    assert set(idata.posterior.keys()) >= _REQUIRED_VARIABLES
+    assert set(idata.constant_data.keys()) >= _REQUIRED_DATA
+    pass
+
+
+@pytest.mark.parametrize(
+    "define_func",
+    [
+        models.define_model_double_normal,
+        models.define_model_double_skew_normal,
+    ],
+)
+def test_doublepeak_sampling(define_func):
+    timeseries = np.load(_DP_ROOT / "example" / "A2t2R1Part1_132_85.9_86.1.npy")
+
+    pmodel = define_func(timeseries[0], timeseries[1])
+    with pmodel:
+        idata = pm.sample(cores=2, chains=2, tune=3, draws=5)
+    assert set(idata.posterior.keys()) >= _REQUIRED_VARIABLES
+    assert set(idata.constant_data.keys()) >= _REQUIRED_DATA
+    # Confirm the order of peaks is as intended
+    np.testing.assert_array_less(
+        idata.posterior["offset"].sel(subpeak=0),
+        idata.posterior["offset"].sel(subpeak=1),
+    )
+    np.testing.assert_array_less(
+        idata.posterior["mean"].sel(subpeak=0),
+        idata.posterior["mean"].sel(subpeak=1),
+    )
    pass
 
 
 def test_model_comparison():
-    path =
+    path = _DP_ROOT / "test_data/test_model_comparison"
    idata_normal = az.from_netcdf(path / "idata_normal.nc")
    idata_skew = az.from_netcdf(path / "idata_skew.nc")
    compare_dict = {
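The copy-pasted per-model-type sampling test was replaced by two parametrized tests, so each `define_model_*` function shows up as its own test case. To run only these four cases (a sketch; any standard pytest invocation works):

```python
import pytest

# Selects test_singlepeak_sampling and test_doublepeak_sampling,
# each parametrized over two define_model_* functions (4 cases total).
pytest.main(["peak_performance/test_models.py", "-k", "sampling"])
```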
peak_performance/test_pipeline.py
CHANGED
@@ -636,11 +636,18 @@ def test_model_selection_check():
    assert selected_model == "normal"
    # case 2: double peak exceeds elpd score difference threshold and is thusly accepted
    result_df = pandas.DataFrame(
-        {"elpd_loo": [50, 30,
+        {"elpd_loo": [50, 30, 20, -5], "ic": ["loo", "loo", "loo", "loo"]},
        index=["double_normal", "double_skew_normal", "normal", "skew_normal"],
    )
    selected_model = pl.model_selection_check(result_df, "loo", 25)
    assert selected_model == "double_normal"
+    # case 3: single peak models were excluded
+    result_df = pandas.DataFrame(
+        {"elpd_loo": [50, 30], "ic": ["loo", "loo"]},
+        index=["double_normal", "double_skew_normal"],
+    )
+    selected_model = pl.model_selection_check(result_df, "loo", 25)
+    assert selected_model == "double_normal"
    pass
 
 
peak_performance-0.7.1.dist-info/METADATA
ADDED
@@ -0,0 +1,48 @@
+Metadata-Version: 2.1
+Name: peak-performance
+Version: 0.7.1
+Summary: A Python toolbox to fit chromatography peaks with uncertainty.
+Author-email: Jochen Nießer <j.niesser@fz-juelich.de>, Michael Osthege <m.osthege@fz-juelich.de>
+License: AGPLv3
+Project-URL: homepage, https://jugit.fz-juelich.de/IBG-1/micropro/peak-performance
+Project-URL: documentation, https://jugit.fz-juelich.de/IBG-1/micropro/peak-performance
+Project-URL: repository, https://jugit.fz-juelich.de/IBG-1/micropro/peak-performance
+Keywords: hplc,mass-spectrometry,uncertainty quantification
+Classifier: Programming Language :: Python :: 3
+Classifier: Operating System :: OS Independent
+Classifier: License :: OSI Approved :: GNU Affero General Public License v3
+Classifier: Intended Audience :: Science/Research
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE.md
+Requires-Dist: arviz
+Requires-Dist: matplotlib
+Requires-Dist: numpy
+Requires-Dist: pandas
+Requires-Dist: pymc>=5.9.1
+Requires-Dist: pytensor
+Requires-Dist: scipy
+Requires-Dist: openpyxl
+
+[](https://pypi.org/project/peak-performance/)
+[](https://github.com/JuBiotech/peak-performance/actions)
+[](https://app.codecov.io/gh/JuBiotech/peak-performance)
+[](https://peak-performance.readthedocs.io/en/latest)
+[](https://zenodo.org/doi/10.5281/zenodo.10255543)
+
+# About PeakPerformance
+PeakPerformance employs Bayesian modeling for chromatographic peak data fitting.
+This has the innate advantage of providing uncertainty quantification while jointly estimating all peak parameters united in a single peak model.
+As Markov Chain Monte Carlo (MCMC) methods are utilized to infer the posterior probability distribution, convergence checks and the aformentioned uncertainty quantification are applied as novel quality metrics for a robust peak recognition.
+
+# First steps
+Be sure to check out our thorough [documentation](https://peak-performance.readthedocs.io/en/latest). It contains not only information on how to install PeakPerformance and prepare raw data for its application but also detailed treatises about the implemented model structures, validation with both synthetic and experimental data against a commercially available vendor software, exemplary usage of diagnostic plots and investigation of various effects.
+Furthermore, you will find example notebooks and data sets showcasing different aspects of PeakPerformance.
+
+# How to contribute
+If you encounter bugs while using PeakPerformance, please bring them to our attention by opening an issue. When doing so, describe the problem in detail and add screenshots/code snippets and whatever other helpful material you can provide.
+When contributing code, create a local clone of PeakPerformance, create a new branch, and open a pull request (PR).
+
+# How to cite
+Head over to Zenodo to [generate a BibTeX citation](https://doi.org/10.5281/zenodo.10255543) for the latest release.
+A publication has just been submitted to a scientific journal. Once published, this section will be updated.
peak_performance-0.7.1.dist-info/RECORD
ADDED
@@ -0,0 +1,13 @@
+peak_performance/__init__.py,sha256=yTq4THYewbWRnrs2Qkv4nCd-7MyvDlu_t0fPeWeKxQc,261
+peak_performance/models.py,sha256=m32qCkEW00E3WV5d8xDlcMVHvdmcLH0fRnziPLsgDMk,27755
+peak_performance/pipeline.py,sha256=O38AtmtGTA4fFYj78S836TgcFa1nuyf6npsbIM7DGec,64456
+peak_performance/plots.py,sha256=JToIsNxGF-uh09t8IJvN9cWRTsL3opjDE8DMqGocYJQ,9528
+peak_performance/test_main.py,sha256=xQiLDjhldxZzY5sp3RyIJUTtXxX46auWY9Qy7nuifxw,97
+peak_performance/test_models.py,sha256=r6kqAVBtAbycf4IoRaXcSCZp6Lras3afK6o9qcLZbH8,11592
+peak_performance/test_pipeline.py,sha256=gTZAxcJEVwJ0XW4IewmIWGLmx1n7KaK8egrovKHsCFI,22961
+peak_performance/test_plots.py,sha256=lGwPWzezAhzEnyu_NMx2lFtyzzb1wxy-jnRMtOaaniY,4100
+peak_performance-0.7.1.dist-info/LICENSE.md,sha256=zj-4LZ7oChyw5Uj5sFYOrVI3juK06Cb9lFm0rPcHXYk,32387
+peak_performance-0.7.1.dist-info/METADATA,sha256=62R5sa4j-zdBzwVQRBaUD9cfS6pXvfPqUBpEUx0rVmk,3388
+peak_performance-0.7.1.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
+peak_performance-0.7.1.dist-info/top_level.txt,sha256=-lZSmgn2fZA-xPVmddLwaRt2hQeeWj7TYVefOk7_T58,17
+peak_performance-0.7.1.dist-info/RECORD,,
peak_performance-0.6.4.dist-info/METADATA
REMOVED
@@ -1,67 +0,0 @@
-Metadata-Version: 2.1
-Name: peak-performance
-Version: 0.6.4
-Summary: A Python toolbox to fit chromatography peaks with uncertainty.
-Author-email: Jochen Nießer <j.niesser@fz-juelich.de>, Michael Osthege <m.osthege@fz-juelich.de>
-License: AGPLv3
-Project-URL: homepage, https://jugit.fz-juelich.de/IBG-1/micropro/peak-performance
-Project-URL: documentation, https://jugit.fz-juelich.de/IBG-1/micropro/peak-performance
-Project-URL: repository, https://jugit.fz-juelich.de/IBG-1/micropro/peak-performance
-Keywords: hplc,mass-spectrometry,uncertainty quantification
-Classifier: Programming Language :: Python :: 3
-Classifier: Operating System :: OS Independent
-Classifier: License :: OSI Approved :: GNU Affero General Public License v3
-Classifier: Intended Audience :: Science/Research
-Requires-Python: >=3.9
-Description-Content-Type: text/markdown
-License-File: LICENSE.md
-Requires-Dist: arviz
-Requires-Dist: matplotlib
-Requires-Dist: numpy
-Requires-Dist: pandas
-Requires-Dist: pymc >=5.9.1
-Requires-Dist: pytensor
-Requires-Dist: scipy
-Requires-Dist: openpyxl
-Requires-Dist: numpy <1.26.0
-Provides-Extra: test
-Requires-Dist: pytest ; extra == 'test'
-Requires-Dist: pytest-cov ; extra == 'test'
-Requires-Dist: twine ; extra == 'test'
-
-[](https://pypi.org/project/peak-performance/)
-[](https://github.com/JuBiotech/peak-performance/actions)
-[](https://app.codecov.io/gh/JuBiotech/peak-performance)
-
-# How to use PeakPerformance
-For installation instructions, see `Installation.md`.
-For instructions regarding the use of PeakPerformance, check out the example notebook(s) under `notebooks`, the complementary example data under `example`, and the following introductory explanations.
-
-## Preparing raw data
-This step is crucial when using PeakPerformance. Raw data has to be supplied as time series meaning for each signal you want to analyze, save a NumPy array consisting of time in the first dimension and intensity in the second dimension (compare example data). Both time and intensity should also be NumPy arrays. If you e.g. have time and intensity of a singal as lists, you can use the following code to convert, format, and save them in the correct manner:
-```
-import numpy as np
-from pathlib import Path
-
-time_series = np.array([np.array(time), np.array(intensity)])
-np.save(Path(r"example_path/time_series.npy"), time_series)
-```
-The naming convention of raw data files is `<acquisition name>_<precursor ion m/z or experiment number>_<product ion m/z start>_<product ion m/z end>.npy`. There should be no underscores within the named sections such as `acquisition name`. Essentially, the raw data names include the acquisition and mass trace, thus yielding a recognizable and unique name for each isotopomer/fragment/metabolite/sample.
-
-## Model selection
-When it comes to selecting models, PeakPerformance has a function performing an automated selection process by analyzing one acquisiton per mass trace with all implemented models. Subsequently, all models are ranked based on an information criterion (either pareto-smoothed importance sampling leave-one-out cross-validation or widely applicable information criterion). For this process to work as intended, you need to specify acquisitions with representative peaks for each mass trace (see example notebook 1). If e.g. most peaks of an analyte show a skewed shape, then select an acquisition where this is the case. For double peaks, select an acquision where the peaks are as distinct and comparable in height as possible.
-Since model selection is a computationally demanding and time consuming process, it is suggested to state the model type as the user (see example notebook 1) if possible.
-
-## Troubleshooting
-### A batch run broke and I want to restart it.
-If an error occured in the middle of a batch run, then you can use the `pipeline_restart` function in the `pipeline` module to create a new batch which will analyze only those samples, which have not been analyzed previously.
-
-### The model parameters don't converge and/or the fit does not describe the raw data well.
-Check the separate file `How to adapt PeakPerformance to you data`.
-
-# How to contribute
-If you encounter bugs while using PeakPerformance, please bring them to our attention by opening an issue. When doing so, describe the problem in detail and add screenshots/code snippets and whatever other helpful material you can provide.
-When contributing code, create a local clone of PeakPerformance, create a new branch, and open a pull request (PR).
-
-# How to cite
-Will be updated once the paper has been released and a zenodo DOI has been created.
peak_performance-0.6.4.dist-info/RECORD
REMOVED
@@ -1,13 +0,0 @@
-peak_performance/__init__.py,sha256=yTq4THYewbWRnrs2Qkv4nCd-7MyvDlu_t0fPeWeKxQc,261
-peak_performance/models.py,sha256=L47mNU1HItYv5cB-cs2H0ooswhdcLfBdg8X1MHeiTUY,25130
-peak_performance/pipeline.py,sha256=A-eIwhbn9hCIvWgrG5ksfQLn--ISBKVBjq09nVDwFO8,64311
-peak_performance/plots.py,sha256=OO5rSC-kTCzH8-Fh0diz0Cq86fyrZ_FSOiDjcboZRAU,9280
-peak_performance/test_main.py,sha256=xQiLDjhldxZzY5sp3RyIJUTtXxX46auWY9Qy7nuifxw,97
-peak_performance/test_models.py,sha256=X3fy-kNih7TNrr4jKzgcx8qRnmh6cA27hSr2b6Tmf18,9334
-peak_performance/test_pipeline.py,sha256=wyzVgVYT0pK_Lnh5VZEgL8Rxn8sjiCa1dRp1tF79foM,22652
-peak_performance/test_plots.py,sha256=lGwPWzezAhzEnyu_NMx2lFtyzzb1wxy-jnRMtOaaniY,4100
-peak_performance-0.6.4.dist-info/LICENSE.md,sha256=zj-4LZ7oChyw5Uj5sFYOrVI3juK06Cb9lFm0rPcHXYk,32387
-peak_performance-0.6.4.dist-info/METADATA,sha256=X7qWgjCWDwi9KseQnDasaijG0k9u-L6CbGTH0qj8Zd4,4796
-peak_performance-0.6.4.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
-peak_performance-0.6.4.dist-info/top_level.txt,sha256=-lZSmgn2fZA-xPVmddLwaRt2hQeeWj7TYVefOk7_T58,17
-peak_performance-0.6.4.dist-info/RECORD,,
{peak_performance-0.6.4.dist-info → peak_performance-0.7.1.dist-info}/LICENSE.md
File without changes
{peak_performance-0.6.4.dist-info → peak_performance-0.7.1.dist-info}/top_level.txt
File without changes