peak-performance 0.6.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- peak_performance/__init__.py +13 -0
- peak_performance/models.py +711 -0
- peak_performance/pipeline.py +1596 -0
- peak_performance/plots.py +289 -0
- peak_performance/test_main.py +4 -0
- peak_performance/test_models.py +196 -0
- peak_performance/test_pipeline.py +662 -0
- peak_performance/test_plots.py +122 -0
- peak_performance-0.6.3.dist-info/LICENSE.md +619 -0
- peak_performance-0.6.3.dist-info/METADATA +63 -0
- peak_performance-0.6.3.dist-info/RECORD +13 -0
- peak_performance-0.6.3.dist-info/WHEEL +5 -0
- peak_performance-0.6.3.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,662 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import shutil
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
import arviz as az
|
|
6
|
+
import numpy as np
|
|
7
|
+
import pandas
|
|
8
|
+
import pytest
|
|
9
|
+
|
|
10
|
+
from peak_performance import pipeline as pl
|
|
11
|
+
|
|
12
|
+
# define columns for empty summary DataFrame for results
COLUMNS = [
    # posterior summary statistics as produced by arviz.summary()
    "mean",
    "sd",
    "hdi_3%",
    "hdi_97%",
    "mcse_mean",
    "mcse_sd",
    "ess_bulk",
    "ess_tail",
    "r_hat",
    # sample / signal metadata parsed from the raw-data file name
    "acquisition",
    "experiment_or_precursor_mz",
    "product_mz_start",
    "product_mz_end",
    # peak acceptance bookkeeping added by the pipeline
    "is_peak",
    "cause_for_rejection",
    "model_type",
    "subpeak",
]
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def test_user_input_class():
    """Instantiate UserInput and verify attribute parsing and input validation."""
    path = Path(__file__).absolute().parent.parent / "example"
    timeseries = np.load(path / "A1t1R1Part2_110_109.9_110.1.npy")
    # positional constructor arguments in the order UserInput expects them
    ui_args = [
        path,
        ["A1t1R1Part2_110_109.9_110.1.npy"],  # raw data files
        ".npy",                               # data file format
        ["normal"],                           # model type
        [22.5],                               # retention time estimate
        1.5,                                  # peak width estimate
        True,                                 # pre-filtering
        5,                                    # minimum signal-to-noise ratio
        timeseries,
        "A1t1R1",                             # acquisition
        118,                                  # precursor m/z
        "71.9",                               # product m/z start (string on purpose)
        72.1,                                 # product m/z end
    ]
    # test instantiation of the UserInput class
    ui = pl.UserInput(*ui_args)
    assert ui.timeseries.all() == timeseries.all()
    assert ui.precursor == 118
    assert ui.product_mz_start == 71.9
    assert ui.product_mz_end == 72.1
    # test some of the error handling of the parameter setter of the UserInput class:
    # a non-string acquisition must be rejected
    bad_acquisition = list(ui_args)
    bad_acquisition[9] = 5
    with pytest.raises(pl.InputError):
        ui = pl.UserInput(*bad_acquisition)
    # a non-numeric precursor m/z must be rejected
    bad_precursor = list(ui_args)
    bad_precursor[10] = "mz"
    with pytest.raises(pl.InputError):
        ui = pl.UserInput(*bad_precursor)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def test_detect_raw_data():
    """detect_raw_data should find every .npy file in the example directory."""
    path = Path(__file__).absolute().parent.parent / "example"
    expected_files = [
        "A1t1R1Part2_110_109.9_110.1.npy",
        "A1t1R1Part2_111_109.9_110.1.npy",
        "A1t1R1Part2_111_110.9_111.1.npy",
        "A1t1R1Part2_112_110.9_111.1.npy",
        "A1t1R1Part2_112_111.9_112.1.npy",
        "A2t2R1Part1_132_85.9_86.1.npy",
        "A4t4R1Part2_137_72.9_73.1.npy",
    ]
    detected = pl.detect_raw_data(path, data_type=".npy")
    # sort both sides so the comparison is independent of directory order
    assert sorted(detected) == sorted(expected_files)
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def test_parse_data():
    """parse_data should split a raw-data file into timeseries and metadata."""
    path = Path(__file__).absolute().parent.parent / "example"
    result = pl.parse_data(path, "A1t1R1Part2_110_109.9_110.1.npy", ".npy")
    timeseries, acquisition, precursor, mz_start, mz_end = result
    # time and intensity must come back as numpy arrays
    assert isinstance(timeseries[0], np.ndarray)
    assert isinstance(timeseries[1], np.ndarray)
    # metadata is parsed from the underscore-separated file name
    assert acquisition == "A1t1R1Part2"
    assert precursor == 110.0
    assert mz_start == 109.9
    assert mz_end == 110.1
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def test_initiate():
    """initiate should return an empty summary frame and create the run directory."""
    base = Path(__file__).absolute().parent.parent / "example"
    df_summary, run_path = pl.initiate(base, run_dir="test")
    expected = pandas.DataFrame(columns=COLUMNS)
    assert expected.values.all() == df_summary.values.all()
    assert expected.columns.all() == df_summary.columns.all()
    assert run_path == Path(__file__).absolute().parent.parent / "example" / "test"
    assert run_path.exists()
    # clean up the directory created by initiate()
    shutil.rmtree(run_path)
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def test_prefiltering():
    """Pre-filtering should accept a clear peak and reject bad rt / low s/n."""
    path = Path(__file__).absolute().parent.parent / "example"
    df_summary = pandas.DataFrame(columns=COLUMNS)

    def make_ui(filename, retention_time_estimate):
        # build a UserInput for the given raw-data file; all other
        # parameters are held constant across the three scenarios below
        data = np.load(Path(__file__).absolute().parent.parent / "example" / filename)
        return pl.UserInput(
            path,
            [filename],
            ".npy",
            ["normal"],
            retention_time_estimate,
            1.5,    # peak width estimate
            True,   # pre-filtering
            5,      # minimum signal-to-noise ratio
            data,
            "A1t1R1",
            118,
            71.9,
            72.1,
        )

    # positive test: peak lies within the expected retention time window
    filename = "A1t1R1Part2_110_109.9_110.1.npy"
    found_peak, df_summary_1 = pl.prefiltering(filename, make_ui(filename, [26.3]), 108, df_summary)
    assert found_peak
    assert df_summary_1.values.all() == df_summary.values.all()
    assert df_summary_1.columns.all() == df_summary.columns.all()
    # negative test due to retention time: estimate far outside the signal
    found_peak, df_summary_1 = pl.prefiltering(filename, make_ui(filename, [0]), 108, df_summary)
    assert not found_peak
    assert len(df_summary_1.loc[:, "mean"].values) == 8
    assert list(df_summary_1.columns) == COLUMNS
    assert all(pandas.isna(df_summary_1["mean"]))
    # negative test due to signal-to-noise ratio
    filename = "A4t4R1Part2_137_72.9_73.1.npy"
    found_peak, df_summary_2 = pl.prefiltering(filename, make_ui(filename, [26.3]), 108, df_summary)
    assert not found_peak
    assert len(df_summary_2.loc[:, "mean"].values) == 8
    assert list(df_summary_2.columns) == COLUMNS
    assert all(pandas.isna(df_summary_2["mean"]))
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def test_postfiltering_success():
    """A well-converged double-normal fit should pass post-filtering."""
    # load exemplary inference data object
    idata = az.from_netcdf(
        Path(__file__).absolute().parent.parent
        / "test_data/test_postfiltering_success/idata_double_normal.nc"
    )
    df_summary = pandas.DataFrame(columns=COLUMNS)
    path = Path(__file__).absolute().parent.parent / "example"
    filename = "A2t2R1Part1_132_85.9_86.1.npy"
    ui = pl.UserInput(
        path,
        [filename],                 # raw data files
        ".npy",                     # data file format
        ["double_normal"],          # model type
        [22.5],                     # retention time estimate
        1,                          # peak width estimate
        True,                       # pre-filtering
        5,                          # minimum signal-to-noise ratio
        np.load(Path(__file__).absolute().parent.parent / "example" / filename),
        "A2t2R1Part1",
        132,
        85.9,
        86.1,
    )
    resample, discard, df_summary = pl.postfiltering(filename, idata, ui, df_summary)
    # a converged fit should neither be resampled nor discarded
    assert not resample
    assert not discard
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
def test_postfiltering_resample():
    """A fit whose r_hat is too high should be flagged for resampling."""
    # load exemplary inference data object with convergence problems
    idata = az.from_netcdf(
        Path(__file__).absolute().parent.parent
        / "test_data/test_postfiltering_resample/idata_double_skew_rhat_too_high.nc"
    )
    df_summary = pandas.DataFrame(columns=COLUMNS)
    path = Path(__file__).absolute().parent.parent / "example"
    filename = "A2t2R1Part1_132_85.9_86.1.npy"
    ui = pl.UserInput(
        path,
        [filename],                 # raw data files
        ".npy",                     # data file format
        ["double_normal"],          # model type
        [22.5],                     # retention time estimate
        1,                          # peak width estimate
        True,                       # pre-filtering
        5,                          # minimum signal-to-noise ratio
        np.load(Path(__file__).absolute().parent.parent / "example" / filename),
        "A2t2R1Part1",
        132,
        85.9,
        86.1,
    )
    resample, discard, df_summary = pl.postfiltering(filename, idata, ui, df_summary)
    # convergence failure triggers resampling, not discarding
    assert resample
    assert not discard
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
def test_single_peak_report_add_nan_to_summary():
    """report_add_nan_to_summary must append all-NaN rows for a single-peak model."""
    df_summary = pandas.DataFrame(columns=COLUMNS)
    path = Path(__file__).absolute().parent.parent / "example"
    filename = "A1t1R1Part2_110_109.9_110.1.npy"
    ui = pl.UserInput(
        path,
        [filename],                 # raw data files
        ".npy",                     # data file format
        ["skew_normal"],            # model type
        [22.5],                     # retention time estimate
        1.5,                        # peak width estimate
        True,                       # pre-filtering
        5,                          # minimum signal-to-noise ratio
        np.load(Path(__file__).absolute().parent.parent / "example" / filename),
        "A1t1R1",
        118,
        71.9,
        72.1,
    )
    rejection_msg = "because I said so"
    df_summary = pl.report_add_nan_to_summary(filename, ui, df_summary, rejection_msg)
    n_rows = len(df_summary.index)
    # one NaN row per reported statistic, metadata filled from the UserInput
    assert len(df_summary.loc[:, "mean"].values) == 8
    assert list(df_summary.columns) == COLUMNS
    assert all(pandas.isna(df_summary["mean"]))
    assert list(df_summary.loc[:, "acquisition"]) == n_rows * ["A1t1R1"]
    assert list(df_summary.loc[:, "experiment_or_precursor_mz"]) == n_rows * [118]
    assert list(df_summary.loc[:, "product_mz_start"]) == n_rows * [71.9]
    assert list(df_summary.loc[:, "product_mz_end"]) == n_rows * [72.1]
    assert not any(df_summary["is_peak"])
    assert all(df_summary["cause_for_rejection"] == rejection_msg)
    assert list(df_summary.loc[:, "model_type"]) == n_rows * ["skew_normal"]
|
|
395
|
+
|
|
396
|
+
|
|
397
|
+
def test_double_peak_report_add_nan_to_summary():
    """report_add_nan_to_summary must append all-NaN rows for a double-peak model."""
    df_summary = pandas.DataFrame(columns=COLUMNS)
    path = Path(__file__).absolute().parent.parent / "example"
    filename = "A1t1R1Part2_110_109.9_110.1.npy"
    ui = pl.UserInput(
        path,
        [filename],                 # raw data files
        ".npy",                     # data file format
        ["double_normal"],          # model type
        [22.5],                     # retention time estimate
        1.5,                        # peak width estimate
        True,                       # pre-filtering
        5,                          # minimum signal-to-noise ratio
        np.load(Path(__file__).absolute().parent.parent / "example" / filename),
        "A1t1R1",
        118,
        71.9,
        72.1,
    )
    rejection_msg = "because I said so"
    df_summary = pl.report_add_nan_to_summary(filename, ui, df_summary, rejection_msg)
    n_rows = len(df_summary.index)
    # one NaN row per reported statistic, metadata filled from the UserInput
    assert len(df_summary.loc[:, "mean"].values) == 8
    assert list(df_summary.columns) == COLUMNS
    assert all(pandas.isna(df_summary["mean"]))
    assert list(df_summary.loc[:, "acquisition"]) == n_rows * ["A1t1R1"]
    assert list(df_summary.loc[:, "experiment_or_precursor_mz"]) == n_rows * [118]
    assert list(df_summary.loc[:, "product_mz_start"]) == n_rows * [71.9]
    assert list(df_summary.loc[:, "product_mz_end"]) == n_rows * [72.1]
    assert not any(df_summary["is_peak"])
    assert all(df_summary["cause_for_rejection"] == rejection_msg)
    assert list(df_summary.loc[:, "model_type"]) == n_rows * ["double_normal"]
|
|
446
|
+
|
|
447
|
+
|
|
448
|
+
def test_single_peak_report():
    """report_add_data_to_summary must append the posterior summary of a single peak."""
    # load exemplary inference data object
    idata = az.from_netcdf(
        Path(__file__).absolute().parent.parent / "test_data/test_single_peak_report/idata.nc"
    )
    df_summary = pandas.DataFrame(columns=COLUMNS)
    path = Path(__file__).absolute().parent.parent / "example"
    filename = "A1t1R1Part2_110_109.9_110.1.npy"
    ui = pl.UserInput(
        path,
        [filename],                 # raw data files
        ".npy",                     # data file format
        ["skew_normal"],            # model type
        [22.5],                     # retention time estimate
        1.5,                        # peak width estimate
        True,                       # pre-filtering
        5,                          # minimum signal-to-noise ratio
        np.load(Path(__file__).absolute().parent.parent / "example" / filename),
        "A1t1R1",
        118,
        71.9,
        72.1,
    )
    # add data to df_summary
    df_summary = pl.report_add_data_to_summary(filename, idata, df_summary, ui, True)
    n_rows = len(df_summary.index)
    assert len(df_summary.loc[:, "mean"].values) == 8
    assert list(df_summary.columns) == COLUMNS
    # posterior means are fixed by the stored inference data object
    assert list(df_summary.loc[:, "mean"]) == [
        5.565,
        8.446,
        25.989,
        132.743,
        0.516,
        2180.529,
        2762.695,
        20.924,
    ]
    assert list(df_summary.loc[:, "acquisition"]) == n_rows * ["A1t1R1"]
    assert list(df_summary.loc[:, "experiment_or_precursor_mz"]) == n_rows * [118]
    assert list(df_summary.loc[:, "product_mz_start"]) == n_rows * [71.9]
    assert list(df_summary.loc[:, "product_mz_end"]) == n_rows * [72.1]
    assert all(df_summary["is_peak"])
    assert all(df_summary["cause_for_rejection"] == "")
    assert list(df_summary.loc[:, "model_type"]) == n_rows * ["skew_normal"]
|
|
510
|
+
|
|
511
|
+
|
|
512
|
+
@pytest.mark.parametrize("idata", ["idata_double_normal.nc", "idata_double_skew_normal.nc"])
def test_double_peak_report(idata):
    """Double-peak reports must yield 16 rows tagged as 1st and 2nd subpeaks."""
    # load exemplary inference data object (parametrized file name)
    idata = az.from_netcdf(
        Path(__file__).absolute().parent.parent / "test_data/test_double_peak_report" / idata
    )
    df_summary = pandas.DataFrame(columns=COLUMNS)
    path = Path(__file__).absolute().parent.parent / "example"
    filename = "A2t2R1Part1_132_85.9_86.1.npy"
    ui = pl.UserInput(
        path,
        [filename],                 # raw data files
        ".npy",                     # data file format
        ["double_skew_normal"],     # model type
        [22.5],                     # retention time estimate
        1.5,                        # peak width estimate
        True,                       # pre-filtering
        5,                          # minimum signal-to-noise ratio
        np.load(Path(__file__).absolute().parent.parent / "example" / filename),
        "A1t1R1",
        132,
        85.9,
        86.1,
    )
    # add data to df_summary
    df_summary = pl.report_add_data_to_summary(filename, idata, df_summary, ui, True)
    n_rows = len(df_summary.index)
    assert list(df_summary.columns) == COLUMNS
    # two subpeaks, eight statistics each
    assert n_rows == 16
    assert list(df_summary.loc[:, "acquisition"]) == n_rows * ["A1t1R1"]
    assert list(df_summary.loc[:, "experiment_or_precursor_mz"]) == n_rows * [132]
    assert list(df_summary.loc[:, "product_mz_start"]) == n_rows * [85.9]
    assert list(df_summary.loc[:, "product_mz_end"]) == n_rows * [86.1]
    assert all(df_summary["is_peak"])
    assert all(df_summary["cause_for_rejection"] == "")
    assert list(df_summary.loc[:, "model_type"]) == 16 * ["double_skew_normal"]
    assert list(df_summary.loc[:, "subpeak"]) == 8 * ["1st"] + 8 * ["2nd"]
|
|
566
|
+
|
|
567
|
+
|
|
568
|
+
def test_parse_unique_identifiers():
    """Identifiers are file names with the acquisition prefix and suffix stripped."""
    files = [
        "A1t1R1Part2_110_109.9_110.1.npy",
        "A1t1R1Part2_111_109.9_110.1.npy",
        "A1t1R1Part2_111_110.9_111.1.npy",
        "A1t1R1Part2_112_110.9_111.1.npy",
    ]
    expected = [
        "110_109.9_110.1",
        "111_109.9_110.1",
        "111_110.9_111.1",
        "112_110.9_111.1",
    ]
    # order is irrelevant, so compare sorted lists
    assert sorted(pl.parse_unique_identifiers(files)) == sorted(expected)
|
|
585
|
+
|
|
586
|
+
|
|
587
|
+
def test_excel_template_prepare():
    """excel_template_prepare should copy Template.xlsx into the raw-data directory."""
    repo_root = Path(__file__).absolute().parent.parent
    path_raw_data = repo_root / "example"
    pl.excel_template_prepare(
        path_raw_data, repo_root, ["mp3", "flac", "wav", "m4a"], ["1", "2", "3"]
    )
    template = Path(path_raw_data / "Template.xlsx")
    # Template.xlsx must have been copied from peak-performance to example
    assert template.exists()
    # clean up so the example directory stays pristine
    os.remove(template)
|
|
598
|
+
|
|
599
|
+
|
|
600
|
+
def test_parse_files_for_model_selection():
    """Exercise the error handling and both accepted layouts of the signals sheet."""
    repo_root = Path(__file__).absolute().parent.parent
    # load empty signals sheet from Template.xlsx
    signals = pandas.read_excel(Path(repo_root) / "Template.xlsx", sheet_name="signals")
    signals["unique_identifier"] = ["1", "2", "3", "4", "5", "6", "7"]
    # no model types and no acquisitions supplied at all -> error
    with pytest.raises(pl.InputError):
        files = pl.parse_files_for_model_selection(signals)
    # one unique_identifier has neither model nor acquisition while several
    # different acquisitions were defined for the others -> error
    signals["acquisition_for_choosing_model_type"] = ["A1", "B1", "C1", "D1", "E1", "F1", np.nan]
    signals["model_type"] = 7 * [np.nan]
    with pytest.raises(pl.InputError):
        files = pl.parse_files_for_model_selection(signals)
    # models supplied for every unique identifier -> nothing left to select -> error
    signals["model_type"] = 7 * ["normal"]
    with pytest.raises(pl.InputError):
        files = pl.parse_files_for_model_selection(signals)
    # mixture: one supplied model, distinct acquisitions for the remaining identifiers
    signals["acquisition_for_choosing_model_type"] = [np.nan, "B1", "C1", "D1", "E1", "F1", "G1"]
    signals["model_type"] = ["normal"] + 6 * [np.nan]
    files = pl.parse_files_for_model_selection(signals)
    assert files == {"B1_2": "2", "C1_3": "3", "D1_4": "4", "E1_5": "5", "F1_6": "6", "G1_7": "7"}
    # mixture: one supplied model, a single acquisition reused for the rest
    signals["acquisition_for_choosing_model_type"] = [np.nan, "B1"] + 5 * [np.nan]
    files = pl.parse_files_for_model_selection(signals)
    assert files == {"B1_2": "2", "B1_3": "3", "B1_4": "4", "B1_5": "5", "B1_6": "6", "B1_7": "7"}
|
|
627
|
+
|
|
628
|
+
|
|
629
|
+
def test_model_selection_check():
    """Double-peak models need a clear elpd margin over single-peak models."""
    model_index = ["double_normal", "double_skew_normal", "normal", "skew_normal"]
    # case 1: double peak is too close to single peak in elpd score,
    # so the best single-peak model wins
    scores = pandas.DataFrame(
        {"elpd_loo": [50, 30, 29, -5], "ic": 4 * ["loo"]},
        index=model_index,
    )
    assert pl.model_selection_check(scores, "loo", 25) == "normal"
    # case 2: double peak exceeds the elpd score difference threshold
    # and is therefore accepted
    scores = pandas.DataFrame(
        {"elpd_loo": [50, 30, 10, -5], "ic": 4 * ["loo"]},
        index=model_index,
    )
    assert pl.model_selection_check(scores, "loo", 25) == "double_normal"
|
|
645
|
+
|
|
646
|
+
|
|
647
|
+
def test_model_selection():
    """Test the model_selection function from the pipeline module.

    The function contains the model selection pipeline. Template.xlsx is
    updated by the pipeline, so a fresh copy is made first and removed at
    the end.
    """
    path = Path(__file__).absolute().parent.parent / "test_data/test_model_selection"
    shutil.copy(path / "template/Template.xlsx", path / "Template.xlsx")
    result, model_dict = pl.model_selection(path)
    # make sure that the excluded model was really excluded
    assert "double_normal" not in result.index
    assert "normal" in result.index
    assert "skew_normal" in result.index
    assert model_dict
    os.remove(path / "Template.xlsx")
|