peak-performance 0.6.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- peak_performance/__init__.py +13 -0
- peak_performance/models.py +711 -0
- peak_performance/pipeline.py +1596 -0
- peak_performance/plots.py +289 -0
- peak_performance/test_main.py +4 -0
- peak_performance/test_models.py +196 -0
- peak_performance/test_pipeline.py +662 -0
- peak_performance/test_plots.py +122 -0
- peak_performance-0.6.3.dist-info/LICENSE.md +619 -0
- peak_performance-0.6.3.dist-info/METADATA +63 -0
- peak_performance-0.6.3.dist-info/RECORD +13 -0
- peak_performance-0.6.3.dist-info/WHEEL +5 -0
- peak_performance-0.6.3.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1596 @@
|
|
|
1
|
+
"""
|
|
2
|
+
PeakPerformance
|
|
3
|
+
Copyright (C) 2023 Forschungszentrum Jülich GmbH
|
|
4
|
+
|
|
5
|
+
This program is free software: you can redistribute it and/or modify
|
|
6
|
+
it under the terms of the GNU Affero General Public License as published
|
|
7
|
+
by the Free Software Foundation, either version 3 of the License, or
|
|
8
|
+
(at your option) any later version.
|
|
9
|
+
|
|
10
|
+
This program is distributed in the hope that it will be useful,
|
|
11
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
12
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
13
|
+
GNU Affero General Public License for more details.
|
|
14
|
+
|
|
15
|
+
You should have received a copy of the GNU Affero General Public License
|
|
16
|
+
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
import importlib
|
|
20
|
+
import os
|
|
21
|
+
import re
|
|
22
|
+
import shutil
|
|
23
|
+
import warnings
|
|
24
|
+
from datetime import date, datetime
|
|
25
|
+
from pathlib import Path
|
|
26
|
+
from typing import Dict, List, Mapping, Sequence, Tuple, Union
|
|
27
|
+
|
|
28
|
+
import arviz as az
|
|
29
|
+
import numpy as np
|
|
30
|
+
import pandas
|
|
31
|
+
import pymc as pm
|
|
32
|
+
import scipy.integrate
|
|
33
|
+
import scipy.signal
|
|
34
|
+
from openpyxl import load_workbook
|
|
35
|
+
from openpyxl.utils.dataframe import dataframe_to_rows
|
|
36
|
+
|
|
37
|
+
from peak_performance import models, plots
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class ParsingError(Exception):
|
|
41
|
+
"""Base type of parsing exceptions."""
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class InputError(Exception):
|
|
45
|
+
"""Base type of exceptions related to information given by the user."""
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class UserInput:
|
|
49
|
+
"""Collect all information required from the user and format them in the correct manner."""
|
|
50
|
+
|
|
51
|
+
def __init__(
|
|
52
|
+
self,
|
|
53
|
+
path: Union[str, os.PathLike],
|
|
54
|
+
files: Sequence[str],
|
|
55
|
+
raw_data_file_format: str,
|
|
56
|
+
peak_model: Sequence[str],
|
|
57
|
+
retention_time_estimate: Union[Sequence[float], Sequence[int]],
|
|
58
|
+
peak_width_estimate: Union[float, int],
|
|
59
|
+
pre_filtering: bool,
|
|
60
|
+
minimum_sn: Union[float, int],
|
|
61
|
+
timeseries: np.ndarray,
|
|
62
|
+
acquisition: str,
|
|
63
|
+
precursor: Union[float, int],
|
|
64
|
+
product_mz_start: Union[float, int],
|
|
65
|
+
product_mz_end: Union[float, int],
|
|
66
|
+
):
|
|
67
|
+
"""
|
|
68
|
+
Parameters
|
|
69
|
+
----------
|
|
70
|
+
path
|
|
71
|
+
Path to the folder containing the results of the current run.
|
|
72
|
+
files
|
|
73
|
+
List of raw data file names in path.
|
|
74
|
+
raw_data_file_format
|
|
75
|
+
Data format (suffix) of the raw data, default is '.npy'.
|
|
76
|
+
peak_model
|
|
77
|
+
List specifying models for peak fitting in the same order as files.
|
|
78
|
+
("normal", "skew_normal", "double_normal", "double_skew_normal")
|
|
79
|
+
retention_time_estimate
|
|
80
|
+
In case you set pre_filtering to True, give a retention time estimate (float) for each signal in files.
|
|
81
|
+
In case of a double peak, give two retention times (in chronological order) as a tuple containing two floats.
|
|
82
|
+
peak_width_estimate
|
|
83
|
+
Rough estimate of the average peak width in minutes expected for the LC-MS method with which the data was obtained.
|
|
84
|
+
pre_filtering
|
|
85
|
+
If True, potential peaks will be filtered based on retention time and signal to noise ratio before sampling.
|
|
86
|
+
minimum_sn
|
|
87
|
+
Minimum signal to noise ratio for a signal to be recognized as a peak during pre-filtering.
|
|
88
|
+
timeseries
|
|
89
|
+
NumPy Array containing time (at first position) and intensity (at second position) data as NumPy arrays.
|
|
90
|
+
acquisition
|
|
91
|
+
Name of a single acquisition.
|
|
92
|
+
precursor
|
|
93
|
+
Can be one of the following:
|
|
94
|
+
Either the experiment number of the signal within the acquisition (each experiment = one mass trace)
|
|
95
|
+
or the mass to charge ratio of the precursor ion selected in Q1.
|
|
96
|
+
product_mz_start
|
|
97
|
+
Start of the mass to charge ratio range of the product ion in the TOF.
|
|
98
|
+
product_mz_end
|
|
99
|
+
End of the mass to charge ratio range of the product ion in the TOF.
|
|
100
|
+
"""
|
|
101
|
+
self.path = path
|
|
102
|
+
self.files = list(files)
|
|
103
|
+
self.raw_data_file_format = raw_data_file_format
|
|
104
|
+
self.peak_model = peak_model
|
|
105
|
+
self.retention_time_estimate = retention_time_estimate
|
|
106
|
+
self.peak_width_estimate = peak_width_estimate
|
|
107
|
+
self.pre_filtering = pre_filtering
|
|
108
|
+
self.minimum_sn = minimum_sn
|
|
109
|
+
self.timeseries = timeseries
|
|
110
|
+
self.acquisition = acquisition
|
|
111
|
+
self.precursor = precursor
|
|
112
|
+
self.product_mz_start = product_mz_start
|
|
113
|
+
self.product_mz_end = product_mz_end
|
|
114
|
+
super().__init__()
|
|
115
|
+
|
|
116
|
+
@property
|
|
117
|
+
def timeseries(self):
|
|
118
|
+
"""
|
|
119
|
+
Getting the value of the timeseries attribute.
|
|
120
|
+
(NumPy Array containing time (at first position) and intensity (at second position) data as NumPy arrays.)
|
|
121
|
+
"""
|
|
122
|
+
return self._timeseries
|
|
123
|
+
|
|
124
|
+
@timeseries.setter
|
|
125
|
+
def timeseries(self, data):
|
|
126
|
+
"""Setting the value of the timeseries attribute."""
|
|
127
|
+
if data is None:
|
|
128
|
+
raise InputError("The timeseries parameter is a None type.")
|
|
129
|
+
self._timeseries = np.asarray(data)
|
|
130
|
+
|
|
131
|
+
@property
|
|
132
|
+
def acquisition(self):
|
|
133
|
+
"""Getting the value of the acquisition attribute (name of a single acquisition)."""
|
|
134
|
+
return self._acquisition
|
|
135
|
+
|
|
136
|
+
@acquisition.setter
|
|
137
|
+
def acquisition(self, name):
|
|
138
|
+
"""Setting the value of the acquisition attribute."""
|
|
139
|
+
if not isinstance(name, str):
|
|
140
|
+
raise InputError(
|
|
141
|
+
f"The acquisition parameter {name} is {type(name)} but needs to be a string."
|
|
142
|
+
)
|
|
143
|
+
if name is None:
|
|
144
|
+
raise InputError("The acquisition parameter is a None type.")
|
|
145
|
+
self._acquisition = name
|
|
146
|
+
|
|
147
|
+
@property
|
|
148
|
+
def precursor(self):
|
|
149
|
+
"""
|
|
150
|
+
Getting the value of the precursor attribute which can be one of the following:
|
|
151
|
+
Either the experiment number of the signal within the acquisition (each experiment = one mass trace)
|
|
152
|
+
or the mass to charge ratio of the precursor ion selected in Q1.
|
|
153
|
+
"""
|
|
154
|
+
return self._precursor
|
|
155
|
+
|
|
156
|
+
@precursor.setter
|
|
157
|
+
def precursor(self, mz):
|
|
158
|
+
"""Setting the value of the precursor attribute."""
|
|
159
|
+
if not isinstance(mz, int) and not isinstance(mz, float):
|
|
160
|
+
try:
|
|
161
|
+
mz = float(mz)
|
|
162
|
+
except ValueError as ex:
|
|
163
|
+
raise InputError(
|
|
164
|
+
f"The precursor parameter {mz} is {type(mz)} but needs to be an int or a float."
|
|
165
|
+
) from ex
|
|
166
|
+
if mz is None:
|
|
167
|
+
raise InputError("The precursor parameter is a None type.")
|
|
168
|
+
self._precursor = mz
|
|
169
|
+
|
|
170
|
+
@property
|
|
171
|
+
def product_mz_start(self):
|
|
172
|
+
"""Getting the value of the product_mz_start attribute."""
|
|
173
|
+
return self._product_mz_start
|
|
174
|
+
|
|
175
|
+
@product_mz_start.setter
|
|
176
|
+
def product_mz_start(self, mz):
|
|
177
|
+
"""
|
|
178
|
+
Setting the value of the product_mz_start attribute.
|
|
179
|
+
(Start of the mass to charge ratio range of the product ion in the TOF.)
|
|
180
|
+
"""
|
|
181
|
+
if not isinstance(mz, int) and not isinstance(mz, float):
|
|
182
|
+
try:
|
|
183
|
+
mz = float(mz)
|
|
184
|
+
except ValueError as ex:
|
|
185
|
+
raise InputError(
|
|
186
|
+
f"The product_mz parameter {mz} is {type(mz)} but needs to be an int or a float."
|
|
187
|
+
) from ex
|
|
188
|
+
if mz is None:
|
|
189
|
+
raise InputError("The product_mz_start parameter is a None type.")
|
|
190
|
+
self._product_mz_start = mz
|
|
191
|
+
|
|
192
|
+
@property
|
|
193
|
+
def product_mz_end(self):
|
|
194
|
+
"""
|
|
195
|
+
Getting the value of the product_mz_end attribute.
|
|
196
|
+
(End of the mass to charge ratio range of the product ion in the TOF.)
|
|
197
|
+
"""
|
|
198
|
+
return self._product_mz_end
|
|
199
|
+
|
|
200
|
+
@product_mz_end.setter
|
|
201
|
+
def product_mz_end(self, mz):
|
|
202
|
+
"""Setting the value of the product_mz_end attribute."""
|
|
203
|
+
if not isinstance(mz, int) and not isinstance(mz, float):
|
|
204
|
+
try:
|
|
205
|
+
mz = float(mz)
|
|
206
|
+
except ValueError as ex:
|
|
207
|
+
raise InputError(
|
|
208
|
+
f"The product_mz parameter is {type(mz)} but needs to be an int or a float."
|
|
209
|
+
) from ex
|
|
210
|
+
if mz is None:
|
|
211
|
+
raise InputError("The product_mz_end parameter is a None type.")
|
|
212
|
+
self._product_mz_end = mz
|
|
213
|
+
|
|
214
|
+
@property
|
|
215
|
+
def user_info(self):
|
|
216
|
+
"""Create a dictionary with the necessary user information based on the class attributes."""
|
|
217
|
+
# first, some sanity checks
|
|
218
|
+
if len(self.files) != len(self.peak_model):
|
|
219
|
+
raise InputError(
|
|
220
|
+
f"The length of 'files' ({len(self.files)}) and of 'peak_model' ({len(self.peak_model)}) are not identical."
|
|
221
|
+
)
|
|
222
|
+
if self.pre_filtering:
|
|
223
|
+
# check length of lists
|
|
224
|
+
if len(self.files) != len(self.peak_model) or len(self.peak_model) != len(
|
|
225
|
+
self.retention_time_estimate
|
|
226
|
+
):
|
|
227
|
+
raise InputError(
|
|
228
|
+
f"The length of 'files' ({len(self.files)}), 'peak_model' ({self.peak_model}), "
|
|
229
|
+
f"and retention_time_estimate ({len(self.retention_time_estimate)}) are not identical."
|
|
230
|
+
)
|
|
231
|
+
else:
|
|
232
|
+
# if pre_filtering is False, then retention_time_estimate is not needed
|
|
233
|
+
# but the dictionary still needs to be created without errors -> set it to np.nan
|
|
234
|
+
if len(self.retention_time_estimate) == 1:
|
|
235
|
+
self.retention_time_estimate = len(self.files) * [np.nan]
|
|
236
|
+
elif not self.retention_time_estimate:
|
|
237
|
+
self.retention_time_estimate = len(self.files) * [np.nan]
|
|
238
|
+
if np.any(np.array(self.retention_time_estimate) < 0):
|
|
239
|
+
raise InputError("Retention time estimates below 0 are not valid.")
|
|
240
|
+
# actually create the dictionary
|
|
241
|
+
user_info = dict(zip(self.files, zip(self.peak_model, self.retention_time_estimate)))
|
|
242
|
+
user_info["peak_width_estimate"] = self.peak_width_estimate
|
|
243
|
+
user_info["pre_filtering"] = self.pre_filtering
|
|
244
|
+
user_info["minimum_sn"] = self.minimum_sn
|
|
245
|
+
return user_info
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def detect_raw_data(path: Union[str, os.PathLike], *, data_type: str = ".npy"):
|
|
249
|
+
"""
|
|
250
|
+
Detect all .npy files with time and intensity data for peaks in a given directory.
|
|
251
|
+
|
|
252
|
+
Parameters
|
|
253
|
+
----------
|
|
254
|
+
path
|
|
255
|
+
Path to the folder containing raw data.
|
|
256
|
+
data_type
|
|
257
|
+
Data format of the raw data files (e.g. '.npy').
|
|
258
|
+
|
|
259
|
+
Returns
|
|
260
|
+
-------
|
|
261
|
+
files
|
|
262
|
+
List with names of all files of the specified data type in path.
|
|
263
|
+
"""
|
|
264
|
+
all_files = os.listdir(path)
|
|
265
|
+
files = [file for file in all_files if data_type in file]
|
|
266
|
+
if not files:
|
|
267
|
+
raise FileNotFoundError(
|
|
268
|
+
f"In the given directory '{path}', there are no '{data_type}' files."
|
|
269
|
+
)
|
|
270
|
+
return files
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def parse_data(
|
|
274
|
+
path: Union[str, os.PathLike], filename: str, raw_data_file_format: str
|
|
275
|
+
) -> Tuple[np.ndarray, str, float, float, float]:
|
|
276
|
+
"""
|
|
277
|
+
Extract names of data files.
|
|
278
|
+
|
|
279
|
+
Parameters
|
|
280
|
+
----------
|
|
281
|
+
path
|
|
282
|
+
Path to the raw data files.
|
|
283
|
+
filename
|
|
284
|
+
Name of a raw date file containing a NumPy array with a time series (time as first, intensity as second element of the array).
|
|
285
|
+
raw_data_file_format
|
|
286
|
+
Data format (suffix) of the raw data, default is '.npy'.
|
|
287
|
+
|
|
288
|
+
Returns
|
|
289
|
+
-------
|
|
290
|
+
timeseries
|
|
291
|
+
Updated NumPy array containing time and intensity data as NumPy arrays in first and second row, respectively.
|
|
292
|
+
NaN values have been replaced with zeroes.
|
|
293
|
+
acquisition
|
|
294
|
+
Name of a single acquisition.
|
|
295
|
+
precursor
|
|
296
|
+
Can be one of the following:
|
|
297
|
+
Either the experiment number of the signal within the acquisition (each experiment = one mass trace)
|
|
298
|
+
or the mass to charge ratio of the precursor ion selected in Q1.
|
|
299
|
+
product_mz_start
|
|
300
|
+
Start of the mass to charge ratio range of the product ion in the TOF.
|
|
301
|
+
product_mz_end
|
|
302
|
+
End of the mass to charge ratio range of the product ion in the TOF.
|
|
303
|
+
"""
|
|
304
|
+
# load time series
|
|
305
|
+
timeseries = np.load(Path(path) / filename)
|
|
306
|
+
# if NaN are in time or intensity, replace it with 0.0
|
|
307
|
+
timeseries = np.nan_to_num(timeseries)
|
|
308
|
+
# get information from the raw data file name
|
|
309
|
+
splits = filename.split("_")
|
|
310
|
+
if len(splits) != 4:
|
|
311
|
+
raise InputError(
|
|
312
|
+
f"""The standardized naming scheme was violated by file {filename}.
|
|
313
|
+
\nThe name should be divided by underscores into the sections acquisition name, precursor, product_mz_start, and product_mz_end.
|
|
314
|
+
"""
|
|
315
|
+
)
|
|
316
|
+
try:
|
|
317
|
+
pattern = r"(.*?)_(\d+\.?\d*)_(\d+\.?\d*)_(\d+\.?\d*).*"
|
|
318
|
+
m = re.match(pattern, filename)
|
|
319
|
+
if m is not None:
|
|
320
|
+
acquisition, precursor, mz_start, mz_end = m.groups()
|
|
321
|
+
precursor_converted = float(precursor)
|
|
322
|
+
product_mz_start_converted = float(mz_start)
|
|
323
|
+
product_mz_end_converted = float(mz_end)
|
|
324
|
+
except ValueError as ex:
|
|
325
|
+
raise InputError(
|
|
326
|
+
f"The name of file {filename} does not follow the standardized naming convention."
|
|
327
|
+
) from ex
|
|
328
|
+
|
|
329
|
+
return (
|
|
330
|
+
timeseries,
|
|
331
|
+
acquisition,
|
|
332
|
+
precursor_converted,
|
|
333
|
+
product_mz_start_converted,
|
|
334
|
+
product_mz_end_converted,
|
|
335
|
+
)
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
def parse_unique_identifiers(raw_data_files: Sequence[str]) -> List[str]:
|
|
339
|
+
"""
|
|
340
|
+
Get a set of all mass traces based on the standardized raw data file names (excluding acquisitions).
|
|
341
|
+
Used to automatically fill out the unique_identifiers column in the Template.xlsx' signals tab.
|
|
342
|
+
|
|
343
|
+
Parameters
|
|
344
|
+
----------
|
|
345
|
+
raw_data_files
|
|
346
|
+
Names of all files of the specified data type in path_raw_data.
|
|
347
|
+
|
|
348
|
+
Returns
|
|
349
|
+
-------
|
|
350
|
+
unique_identifiers
|
|
351
|
+
List with all unique combinations of targeted molecules.
|
|
352
|
+
(i.e. experiment number or precursor ion m/z ratio and product ion m/z ratio range)
|
|
353
|
+
"""
|
|
354
|
+
# remove acquisition from file names
|
|
355
|
+
identifiers = []
|
|
356
|
+
for filename in raw_data_files:
|
|
357
|
+
pattern = r"(.*?)_(\d+\.?\d*)_(\d+\.?\d*)_(\d+\.?\d*).*"
|
|
358
|
+
m = re.match(pattern, filename)
|
|
359
|
+
if m is not None:
|
|
360
|
+
acquisition, precursor, mz_start, mz_end = m.groups()
|
|
361
|
+
identifiers.append("_".join([precursor, mz_start, mz_end]))
|
|
362
|
+
|
|
363
|
+
# select only unique identifiers
|
|
364
|
+
unique_identifiers = list(set(identifiers))
|
|
365
|
+
return unique_identifiers
|
|
366
|
+
|
|
367
|
+
|
|
368
|
+
def initiate(path: Union[str, os.PathLike], *, run_dir: str = ""):
|
|
369
|
+
"""
|
|
370
|
+
Create a folder for the results. Also create a zip file inside that folder. Also create df_summary.
|
|
371
|
+
|
|
372
|
+
Parameters
|
|
373
|
+
----------
|
|
374
|
+
path
|
|
375
|
+
Path to the directory containing the raw data.
|
|
376
|
+
run_dir
|
|
377
|
+
Name of the directory created to store the results of the current run (default: current date and time).
|
|
378
|
+
|
|
379
|
+
Returns
|
|
380
|
+
-------
|
|
381
|
+
df_summary
|
|
382
|
+
DataFrame for collecting the results (i.e. peak parameters) of every signal of a given pipeline.
|
|
383
|
+
path
|
|
384
|
+
Updated path variable pointing to the newly created folder for this batch.
|
|
385
|
+
"""
|
|
386
|
+
# get current date and time
|
|
387
|
+
if not run_dir:
|
|
388
|
+
today = str(date.today())
|
|
389
|
+
now = datetime.now().strftime("%H-%M-%S")
|
|
390
|
+
timestamp = today + "_" + now
|
|
391
|
+
run_dir = timestamp + "_run"
|
|
392
|
+
# create a directory
|
|
393
|
+
path = Path(path) / run_dir
|
|
394
|
+
path.mkdir(exist_ok=True)
|
|
395
|
+
# create DataFrame for data report
|
|
396
|
+
df_summary = pandas.DataFrame(
|
|
397
|
+
columns=[
|
|
398
|
+
"mean",
|
|
399
|
+
"sd",
|
|
400
|
+
"hdi_3%",
|
|
401
|
+
"hdi_97%",
|
|
402
|
+
"mcse_mean",
|
|
403
|
+
"mcse_sd",
|
|
404
|
+
"ess_bulk",
|
|
405
|
+
"ess_tail",
|
|
406
|
+
"r_hat",
|
|
407
|
+
"acquisition",
|
|
408
|
+
"experiment_or_precursor_mz",
|
|
409
|
+
"product_mz_start",
|
|
410
|
+
"product_mz_end",
|
|
411
|
+
"is_peak",
|
|
412
|
+
"cause_for_rejection",
|
|
413
|
+
"model_type",
|
|
414
|
+
"subpeak",
|
|
415
|
+
]
|
|
416
|
+
)
|
|
417
|
+
return df_summary, path
|
|
418
|
+
|
|
419
|
+
|
|
420
|
+
def prefiltering(
|
|
421
|
+
filename: str, ui: UserInput, noise_width_guess: float, df_summary: pandas.DataFrame
|
|
422
|
+
):
|
|
423
|
+
"""
|
|
424
|
+
Optional method to skip signals where clearly no peak is present. Saves a lot of computation time.
|
|
425
|
+
|
|
426
|
+
Parameters
|
|
427
|
+
----------
|
|
428
|
+
filename
|
|
429
|
+
Name of the raw data file.
|
|
430
|
+
ui
|
|
431
|
+
Instance of the UserInput class
|
|
432
|
+
noise_width_guess
|
|
433
|
+
Estimated width of the noise of a particular measurement.
|
|
434
|
+
|
|
435
|
+
Returns
|
|
436
|
+
-------
|
|
437
|
+
found_peak
|
|
438
|
+
True, if any peak candidate was found within the time frame; False, if not.
|
|
439
|
+
df_summary
|
|
440
|
+
DataFrame for collecting the results (i.e. peak parameters) of every signal of a given pipeline.
|
|
441
|
+
"""
|
|
442
|
+
# pre-fit tests for peaks to save computation time (optional)
|
|
443
|
+
t_ret = ui.user_info[filename][1]
|
|
444
|
+
est_width = ui.peak_width_estimate
|
|
445
|
+
# find all potential peaks with scipy
|
|
446
|
+
peaks, _ = scipy.signal.find_peaks(ui.timeseries[1])
|
|
447
|
+
peak_candidates = []
|
|
448
|
+
# differentiate between single and double peaks
|
|
449
|
+
for peak in peaks:
|
|
450
|
+
# define conditions for passing the pre-filtering
|
|
451
|
+
# check proximity of any peak candidate to the estimated retention time
|
|
452
|
+
retention_time_condition = t_ret - est_width <= ui.timeseries[0][peak] <= t_ret + est_width
|
|
453
|
+
# check signal to noise ratio
|
|
454
|
+
signal_to_noise_condition = (
|
|
455
|
+
ui.timeseries[1][peak] / (noise_width_guess + 0.1) > ui.minimum_sn
|
|
456
|
+
)
|
|
457
|
+
# check the neighbouring data points to prevent classification of a single elevated data point as a peak
|
|
458
|
+
check_preceding_point = ui.timeseries[1][peak - 1] / (noise_width_guess + 0.1) > 2
|
|
459
|
+
check_succeeding_point = ui.timeseries[1][peak + 1] / (noise_width_guess + 0.1) > 2
|
|
460
|
+
if (
|
|
461
|
+
retention_time_condition
|
|
462
|
+
and signal_to_noise_condition
|
|
463
|
+
and check_preceding_point
|
|
464
|
+
and check_succeeding_point
|
|
465
|
+
):
|
|
466
|
+
peak_candidates.append(peak)
|
|
467
|
+
if not peak_candidates:
|
|
468
|
+
df_summary = report_add_nan_to_summary(filename, ui, df_summary, "pre-filtering")
|
|
469
|
+
return False, df_summary
|
|
470
|
+
return True, df_summary
|
|
471
|
+
|
|
472
|
+
|
|
473
|
+
def sampling(pmodel, **sample_kwargs):
|
|
474
|
+
"""Performs sampling.
|
|
475
|
+
|
|
476
|
+
Parameters
|
|
477
|
+
----------
|
|
478
|
+
pmodel
|
|
479
|
+
A PyMC model.
|
|
480
|
+
**kwargs
|
|
481
|
+
The keyword arguments are used in pm.sample().
|
|
482
|
+
tune
|
|
483
|
+
Number of tuning samples (default = 2000).
|
|
484
|
+
draws
|
|
485
|
+
Number of samples after tuning (default = 2000).
|
|
486
|
+
|
|
487
|
+
Returns
|
|
488
|
+
-------
|
|
489
|
+
idata
|
|
490
|
+
Inference data object.
|
|
491
|
+
"""
|
|
492
|
+
sample_kwargs.setdefault("tune", 2000)
|
|
493
|
+
sample_kwargs.setdefault("draws", 2000)
|
|
494
|
+
# check if nutpie is available; if so, use it to enhance performance
|
|
495
|
+
if importlib.util.find_spec("nutpie"):
|
|
496
|
+
nuts_sampler = "nutpie"
|
|
497
|
+
else:
|
|
498
|
+
nuts_sampler = "pymc"
|
|
499
|
+
with pmodel:
|
|
500
|
+
idata = pm.sample_prior_predictive()
|
|
501
|
+
idata.extend(pm.sample(nuts_sampler=nuts_sampler, **sample_kwargs))
|
|
502
|
+
return idata
|
|
503
|
+
|
|
504
|
+
|
|
505
|
+
def postfiltering(filename: str, idata, ui: UserInput, df_summary: pandas.DataFrame):
|
|
506
|
+
"""
|
|
507
|
+
Method to filter out false positive peaks after sampling based on the obtained uncertainties of several peak parameters.
|
|
508
|
+
|
|
509
|
+
Parameters
|
|
510
|
+
----------
|
|
511
|
+
filename
|
|
512
|
+
Name of the raw data file.
|
|
513
|
+
idata
|
|
514
|
+
Inference data object resulting from sampling.
|
|
515
|
+
ui
|
|
516
|
+
Instance of the UserInput class.
|
|
517
|
+
df_summary
|
|
518
|
+
DataFrame for collecting the results (i.e. peak parameters) of every signal of a given pipeline.
|
|
519
|
+
|
|
520
|
+
Returns
|
|
521
|
+
-------
|
|
522
|
+
acceptance
|
|
523
|
+
True if the signal was accepted as a peak -> save data and continue with next signal.
|
|
524
|
+
False if the signal was not accepted as a peak -> re-sampling with more tuning samples or discard signal.
|
|
525
|
+
resample
|
|
526
|
+
True: re-sample with more tuning samples, False: don't.
|
|
527
|
+
discard
|
|
528
|
+
True: discard sample.
|
|
529
|
+
"""
|
|
530
|
+
# check whether convergence, i.e. r_hat <= 1.05, was not reached OR peak criteria were not met
|
|
531
|
+
model = ui.user_info[filename][0]
|
|
532
|
+
resample = False
|
|
533
|
+
discard = False
|
|
534
|
+
rejection_msg = ""
|
|
535
|
+
az_summary: pandas.DataFrame = az.summary(idata)
|
|
536
|
+
if model in ["normal", "skew_normal"]:
|
|
537
|
+
# for single peak
|
|
538
|
+
# Get data needed for rejection decisions
|
|
539
|
+
max_rhat = max(az_summary.loc[:, "r_hat"])
|
|
540
|
+
std = az_summary.loc["std", "mean"]
|
|
541
|
+
area_sd = az_summary.loc["area", "sd"]
|
|
542
|
+
area_mean = az_summary.loc["area", "mean"]
|
|
543
|
+
height_sd = az_summary.loc["height", "sd"]
|
|
544
|
+
height_mean = az_summary.loc["height", "mean"]
|
|
545
|
+
|
|
546
|
+
# decide whether to discard signal or sample with more tune samples based on size of sigma parameter
|
|
547
|
+
# of normal distribution (std) and on the relative sizes of standard deviations of area and height
|
|
548
|
+
reject_reasons = []
|
|
549
|
+
if max_rhat > 1.05:
|
|
550
|
+
reject_reasons.append(f"maximum Rhat ({max_rhat:.3f}) was too high")
|
|
551
|
+
if std <= ui.peak_width_estimate / 100:
|
|
552
|
+
reject_reasons.append(f"standard deviation estimate ({std:.2f}) was too low")
|
|
553
|
+
if area_sd > area_mean * 0.2:
|
|
554
|
+
reject_reasons.append(f"area estimate ({area_mean} ± {area_sd}) was too uncertain")
|
|
555
|
+
if height_sd > height_mean * 0.2:
|
|
556
|
+
reject_reasons.append(
|
|
557
|
+
f"height estimate ({height_mean} ± {height_sd}) was too uncertain"
|
|
558
|
+
)
|
|
559
|
+
|
|
560
|
+
if len(reject_reasons) == 1 and "Rhat" in reject_reasons[0]:
|
|
561
|
+
# r_hat failed but rest of post-fit check passed
|
|
562
|
+
# sample again with more tune samples to possibly reach convergence yet
|
|
563
|
+
resample = True
|
|
564
|
+
discard = False
|
|
565
|
+
elif reject_reasons:
|
|
566
|
+
rejection_msg = " and ".join(reject_reasons)
|
|
567
|
+
df_summary = report_add_nan_to_summary(filename, ui, df_summary, rejection_msg)
|
|
568
|
+
resample = False
|
|
569
|
+
discard = True
|
|
570
|
+
|
|
571
|
+
elif model in ["double_normal", "double_skew_normal"]:
|
|
572
|
+
# for double peak
|
|
573
|
+
max_rhat = max(az_summary.loc[:, "r_hat"])
|
|
574
|
+
std = az_summary.loc["std[0]", "mean"]
|
|
575
|
+
area_sd = az_summary.loc["area[0]", "sd"]
|
|
576
|
+
area_mean = az_summary.loc["area[0]", "mean"]
|
|
577
|
+
height_sd = az_summary.loc["height[0]", "sd"]
|
|
578
|
+
height_mean = az_summary.loc["height[0]", "mean"]
|
|
579
|
+
std2 = az_summary.loc["std[1]", "mean"]
|
|
580
|
+
area_sd2 = az_summary.loc["area[1]", "sd"]
|
|
581
|
+
area_mean2 = az_summary.loc["area[1]", "mean"]
|
|
582
|
+
height_sd2 = az_summary.loc["height[1]", "sd"]
|
|
583
|
+
height_mean2 = az_summary.loc["height[1]", "mean"]
|
|
584
|
+
|
|
585
|
+
if max_rhat > 1.05:
|
|
586
|
+
resample = True
|
|
587
|
+
discard = False
|
|
588
|
+
return resample, discard, df_summary
|
|
589
|
+
# Booleans to differentiate which peak is or is not detected
|
|
590
|
+
double_not_found_first = False
|
|
591
|
+
double_not_found_second = False
|
|
592
|
+
if std <= 1 / 100 or area_sd > area_mean * 0.2 or height_sd > height_mean * 0.2:
|
|
593
|
+
# post-fit check failed
|
|
594
|
+
# add NaN values to summary DataFrame
|
|
595
|
+
double_not_found_first = True
|
|
596
|
+
if std2 <= 1 / 100 or area_sd2 > area_mean2 * 0.2 or height_sd2 > height_mean2 * 0.2:
|
|
597
|
+
# post-fit check failed
|
|
598
|
+
# add NaN values to summary DataFrame
|
|
599
|
+
double_not_found_second = True
|
|
600
|
+
# if both peaks failed the peak criteria tests, then reject peaks
|
|
601
|
+
if double_not_found_first and double_not_found_second:
|
|
602
|
+
reject_reasons = []
|
|
603
|
+
if std <= ui.peak_width_estimate / 100:
|
|
604
|
+
reject_reasons.append(f"standard deviation estimate ({std:.2f}) was too low")
|
|
605
|
+
if std2 <= ui.peak_width_estimate / 100:
|
|
606
|
+
reject_reasons.append(f"standard deviation estimate ({std2:.2f}) was too low")
|
|
607
|
+
if area_sd > area_mean * 0.2:
|
|
608
|
+
reject_reasons.append(f"area estimate ({area_mean} ± {area_sd}) was too uncertain")
|
|
609
|
+
if area_sd2 > area_mean2 * 0.2:
|
|
610
|
+
reject_reasons.append(
|
|
611
|
+
f"area estimate ({area_mean2} ± {area_sd2}) was too uncertain"
|
|
612
|
+
)
|
|
613
|
+
if height_sd > height_mean * 0.2:
|
|
614
|
+
reject_reasons.append(
|
|
615
|
+
f"height estimate ({height_mean} ± {height_sd}) was too uncertain"
|
|
616
|
+
)
|
|
617
|
+
if height_sd2 > height_mean2 * 0.2:
|
|
618
|
+
reject_reasons.append(
|
|
619
|
+
f"height estimate ({height_mean2} ± {height_sd2}) was too uncertain"
|
|
620
|
+
)
|
|
621
|
+
|
|
622
|
+
if reject_reasons:
|
|
623
|
+
rejection_msg = " and ".join(reject_reasons)
|
|
624
|
+
|
|
625
|
+
df_summary = report_add_nan_to_summary(filename, ui, df_summary, rejection_msg)
|
|
626
|
+
resample = False
|
|
627
|
+
discard = True
|
|
628
|
+
|
|
629
|
+
else:
|
|
630
|
+
raise NotImplementedError(f"The model {model} is not implemented.")
|
|
631
|
+
return resample, discard, df_summary
|
|
632
|
+
|
|
633
|
+
|
|
634
|
+
def posterior_predictive_sampling(pmodel, idata):
|
|
635
|
+
"""Performs posterior predictive sampling for signals recognized as peaks.
|
|
636
|
+
|
|
637
|
+
Parameters
|
|
638
|
+
----------
|
|
639
|
+
pmodel
|
|
640
|
+
A PyMC model.
|
|
641
|
+
idata
|
|
642
|
+
Previously sampled inference data object.
|
|
643
|
+
|
|
644
|
+
Returns
|
|
645
|
+
-------
|
|
646
|
+
idata
|
|
647
|
+
Inference data object updated with the posterior predictive samples.
|
|
648
|
+
"""
|
|
649
|
+
with pmodel:
|
|
650
|
+
idata.extend(pm.sample_posterior_predictive(idata, var_names=["y"]))
|
|
651
|
+
return idata
|
|
652
|
+
|
|
653
|
+
|
|
654
|
+
def report_save_idata(idata, ui: UserInput, filename: str):
|
|
655
|
+
"""
|
|
656
|
+
Saves inference data object as a .nc file.
|
|
657
|
+
|
|
658
|
+
Parameters
|
|
659
|
+
----------
|
|
660
|
+
idata
|
|
661
|
+
Inference data object resulting from sampling.
|
|
662
|
+
ui
|
|
663
|
+
Instance of the UserInput class.
|
|
664
|
+
filename
|
|
665
|
+
Name of a raw date file containing a NumPy array with a time series (time as first, intensity as second element of the array).
|
|
666
|
+
"""
|
|
667
|
+
fp = Path(ui.path) / f"{filename}.nc"
|
|
668
|
+
idata.to_netcdf(str(fp.absolute()))
|
|
669
|
+
return
|
|
670
|
+
|
|
671
|
+
|
|
672
|
+
def report_add_data_to_summary(
|
|
673
|
+
filename: str,
|
|
674
|
+
idata,
|
|
675
|
+
df_summary: pandas.DataFrame,
|
|
676
|
+
ui: UserInput,
|
|
677
|
+
is_peak: bool,
|
|
678
|
+
rejection_cause: str = "",
|
|
679
|
+
):
|
|
680
|
+
"""
|
|
681
|
+
Extracts the relevant information from idata, concatenates it to the summary DataFrame, and saves the DataFrame as an Excel file.
|
|
682
|
+
Error handling prevents stop of the pipeline in case the saving doesn't work (e.g. because the file was opened by someone).
|
|
683
|
+
|
|
684
|
+
Parameters
|
|
685
|
+
----------
|
|
686
|
+
idata
|
|
687
|
+
Inference data object resulting from sampling.
|
|
688
|
+
df_summary
|
|
689
|
+
DataFrame for collecting the results (i.e. peak parameters) of every signal of a given pipeline.
|
|
690
|
+
ui
|
|
691
|
+
Instance of the UserInput class.
|
|
692
|
+
is_peak
|
|
693
|
+
Boolean stating whether a signal was recognized as a peak (True) or not (False).
|
|
694
|
+
rejection_cause
|
|
695
|
+
Cause for rejecting a given signal.
|
|
696
|
+
|
|
697
|
+
Returns
|
|
698
|
+
-------
|
|
699
|
+
df_summary
|
|
700
|
+
Updated DataFrame for collecting the results (i.e. peak parameters) of every signal of a given pipeline.
|
|
701
|
+
"""
|
|
702
|
+
az_summary: pandas.DataFrame = az.summary(idata)
|
|
703
|
+
model = ui.user_info[filename][0]
|
|
704
|
+
# split double peak into first and second peak (when extracting the data from az.summary(idata))
|
|
705
|
+
if model in ["double_normal", "double_skew_normal"]:
|
|
706
|
+
# first peak of double peak
|
|
707
|
+
parameters = [
|
|
708
|
+
"baseline_intercept",
|
|
709
|
+
"baseline_slope",
|
|
710
|
+
"mean[0]",
|
|
711
|
+
"noise",
|
|
712
|
+
"std[0]",
|
|
713
|
+
"area[0]",
|
|
714
|
+
"height[0]",
|
|
715
|
+
"sn[0]",
|
|
716
|
+
]
|
|
717
|
+
df = az_summary.loc[parameters, :]
|
|
718
|
+
df = df.rename(
|
|
719
|
+
index={
|
|
720
|
+
"mean[0]": "mean",
|
|
721
|
+
"std[0]": "std",
|
|
722
|
+
"area[0]": "area",
|
|
723
|
+
"height[0]": "height",
|
|
724
|
+
"sn[0]": "sn",
|
|
725
|
+
}
|
|
726
|
+
)
|
|
727
|
+
df["acquisition"] = len(parameters) * [f"{ui.acquisition}"]
|
|
728
|
+
df["experiment_or_precursor_mz"] = len(parameters) * [ui.precursor]
|
|
729
|
+
df["product_mz_start"] = len(parameters) * [ui.product_mz_start]
|
|
730
|
+
df["product_mz_end"] = len(parameters) * [ui.product_mz_end]
|
|
731
|
+
df["is_peak"] = is_peak
|
|
732
|
+
df["cause_for_rejection"] = rejection_cause
|
|
733
|
+
df["model_type"] = len(parameters) * [model]
|
|
734
|
+
df["subpeak"] = len(parameters) * ["1st"]
|
|
735
|
+
|
|
736
|
+
# second peak of double peak
|
|
737
|
+
parameters = [
|
|
738
|
+
"baseline_intercept",
|
|
739
|
+
"baseline_slope",
|
|
740
|
+
"mean[1]",
|
|
741
|
+
"noise",
|
|
742
|
+
"std[1]",
|
|
743
|
+
"area[1]",
|
|
744
|
+
"height[1]",
|
|
745
|
+
"sn[1]",
|
|
746
|
+
]
|
|
747
|
+
df2 = az_summary.loc[parameters, :]
|
|
748
|
+
df2 = df2.rename(
|
|
749
|
+
index={
|
|
750
|
+
"area[1]": "area",
|
|
751
|
+
"height[1]": "height",
|
|
752
|
+
"sn[1]": "sn",
|
|
753
|
+
"std[1]": "std",
|
|
754
|
+
"mean[1]": "mean",
|
|
755
|
+
}
|
|
756
|
+
)
|
|
757
|
+
df2["acquisition"] = len(parameters) * [f"{ui.acquisition}"]
|
|
758
|
+
df2["experiment_or_precursor_mz"] = len(parameters) * [ui.precursor]
|
|
759
|
+
df2["product_mz_start"] = len(parameters) * [ui.product_mz_start]
|
|
760
|
+
df2["product_mz_end"] = len(parameters) * [ui.product_mz_end]
|
|
761
|
+
df2["is_peak"] = is_peak
|
|
762
|
+
df2["cause_for_rejection"] = rejection_cause
|
|
763
|
+
df2["model_type"] = len(parameters) * [model]
|
|
764
|
+
df2["subpeak"] = len(parameters) * ["2nd"]
|
|
765
|
+
df_double = pandas.concat([df, df2])
|
|
766
|
+
df_summary = pandas.concat([df_summary, df_double])
|
|
767
|
+
|
|
768
|
+
else:
|
|
769
|
+
# for single peak
|
|
770
|
+
parameters = [
|
|
771
|
+
"baseline_intercept",
|
|
772
|
+
"baseline_slope",
|
|
773
|
+
"mean",
|
|
774
|
+
"noise",
|
|
775
|
+
"std",
|
|
776
|
+
"area",
|
|
777
|
+
"height",
|
|
778
|
+
"sn",
|
|
779
|
+
]
|
|
780
|
+
df = az_summary.loc[parameters, :]
|
|
781
|
+
df["acquisition"] = len(parameters) * [f"{ui.acquisition}"]
|
|
782
|
+
df["experiment_or_precursor_mz"] = len(parameters) * [ui.precursor]
|
|
783
|
+
df["product_mz_start"] = len(parameters) * [ui.product_mz_start]
|
|
784
|
+
df["product_mz_end"] = len(parameters) * [ui.product_mz_end]
|
|
785
|
+
df["is_peak"] = is_peak
|
|
786
|
+
df["cause_for_rejection"] = rejection_cause
|
|
787
|
+
df["model_type"] = len(parameters) * [model]
|
|
788
|
+
df["subpeak"] = len(parameters) * [""]
|
|
789
|
+
df_summary = pandas.concat([df_summary, df])
|
|
790
|
+
# pandas.concat(df_summary, df)
|
|
791
|
+
# save summary df as Excel file
|
|
792
|
+
with pandas.ExcelWriter(
|
|
793
|
+
path=rf"{ui.path}/peak_data_summary.xlsx", engine="openpyxl", mode="w"
|
|
794
|
+
) as writer:
|
|
795
|
+
df_summary.to_excel(writer)
|
|
796
|
+
return df_summary
|
|
797
|
+
|
|
798
|
+
|
|
799
|
+
def report_area_sheet(path: Union[str, os.PathLike], df_summary: pandas.DataFrame):
|
|
800
|
+
"""
|
|
801
|
+
Save a different, more minimalist report sheet focussing on the area data.
|
|
802
|
+
|
|
803
|
+
Parameters
|
|
804
|
+
----------
|
|
805
|
+
path
|
|
806
|
+
Path to the directory containing the raw data.
|
|
807
|
+
df_summary
|
|
808
|
+
DataFrame for collecting the results (i.e. peak parameters) of every signal of a given pipeline.
|
|
809
|
+
"""
|
|
810
|
+
# also save a version of df_summary only for areas with correct order and only necessary data
|
|
811
|
+
df_area_summary = df_summary[df_summary.index == "area"]
|
|
812
|
+
sorted_area_summary = df_area_summary.sort_values(
|
|
813
|
+
["acquisition", "experiment_or_precursor_mz", "product_mz_start"]
|
|
814
|
+
)
|
|
815
|
+
sorted_area_summary = sorted_area_summary.drop(
|
|
816
|
+
labels=["mcse_mean", "mcse_sd", "ess_bulk", "ess_tail"], axis=1
|
|
817
|
+
)
|
|
818
|
+
sorted_area_summary.to_excel(rf"{path}/area_summary.xlsx")
|
|
819
|
+
return
|
|
820
|
+
|
|
821
|
+
|
|
822
|
+
def report_add_nan_to_summary(
|
|
823
|
+
filename: str, ui: UserInput, df_summary: pandas.DataFrame, rejection_cause: str
|
|
824
|
+
):
|
|
825
|
+
"""
|
|
826
|
+
Method to add NaN values to the summary DataFrame in case a signal did not contain a peak.
|
|
827
|
+
|
|
828
|
+
Parameters
|
|
829
|
+
----------
|
|
830
|
+
ui
|
|
831
|
+
Instance of the UserInput class.
|
|
832
|
+
df_summary
|
|
833
|
+
DataFrame for collecting the results (i.e. peak parameters) of every signal of a given pipeline.
|
|
834
|
+
rejection_cause
|
|
835
|
+
Cause for rejecting a given signal.
|
|
836
|
+
|
|
837
|
+
Returns
|
|
838
|
+
-------
|
|
839
|
+
df_summary
|
|
840
|
+
Updated DataFrame for collecting the results (i.e. peak parameters) of every signal of a given pipeline.
|
|
841
|
+
"""
|
|
842
|
+
model = ui.user_info[filename][0]
|
|
843
|
+
# create DataFrame with correct format and fill it with NaN
|
|
844
|
+
nan_dictionary = {
|
|
845
|
+
"mean": np.nan,
|
|
846
|
+
"sd": np.nan,
|
|
847
|
+
"hdi_3%": np.nan,
|
|
848
|
+
"hdi_97%": np.nan,
|
|
849
|
+
"mcse_mean": np.nan,
|
|
850
|
+
"mcse_sd": np.nan,
|
|
851
|
+
"ess_bulk": np.nan,
|
|
852
|
+
"ess_tail": np.nan,
|
|
853
|
+
"r_hat": np.nan,
|
|
854
|
+
}
|
|
855
|
+
df = pandas.DataFrame(
|
|
856
|
+
{
|
|
857
|
+
"baseline_intercept": nan_dictionary,
|
|
858
|
+
"baseline_slope": nan_dictionary,
|
|
859
|
+
"mean": nan_dictionary,
|
|
860
|
+
"noise": nan_dictionary,
|
|
861
|
+
"std": nan_dictionary,
|
|
862
|
+
"area": nan_dictionary,
|
|
863
|
+
"height": nan_dictionary,
|
|
864
|
+
"sn": nan_dictionary,
|
|
865
|
+
}
|
|
866
|
+
).transpose()
|
|
867
|
+
# add information about the signal
|
|
868
|
+
df["acquisition"] = len(df.index) * [f"{ui.acquisition}"]
|
|
869
|
+
df["experiment_or_precursor_mz"] = len(df.index) * [ui.precursor]
|
|
870
|
+
df["product_mz_start"] = len(df.index) * [ui.product_mz_start]
|
|
871
|
+
df["product_mz_end"] = len(df.index) * [ui.product_mz_end]
|
|
872
|
+
df["is_peak"] = len(df.index) * [False]
|
|
873
|
+
df["cause_for_rejection"] = len(df.index) * [rejection_cause]
|
|
874
|
+
# if no peak was detected, there is no need for splitting double peaks, just give the info whether one was expected or not
|
|
875
|
+
df["model_type"] = len(df.index) * [model]
|
|
876
|
+
df["subpeak"] = len(df.index) * [""]
|
|
877
|
+
# concatenate to existing summary DataFrame
|
|
878
|
+
df_summary = pandas.concat([df_summary, df])
|
|
879
|
+
# save summary df as Excel file
|
|
880
|
+
with pandas.ExcelWriter(
|
|
881
|
+
path=rf"{ui.path}/peak_data_summary.xlsx", engine="openpyxl", mode="w"
|
|
882
|
+
) as writer:
|
|
883
|
+
df_summary.to_excel(writer)
|
|
884
|
+
return df_summary
|
|
885
|
+
|
|
886
|
+
|
|
887
|
+
def pipeline_read_template(path_raw_data: Union[str, os.PathLike]):
|
|
888
|
+
"""
|
|
889
|
+
Function to read and check the input settings and data from Template.xlsx when running the data pipeline.
|
|
890
|
+
|
|
891
|
+
Parameters
|
|
892
|
+
----------
|
|
893
|
+
path_raw_data
|
|
894
|
+
Path to the raw data files. Files should be in the given raw_data_file_format, default is '.npy'.
|
|
895
|
+
The `.npy` files are expected to be (2, ?)-shaped 2D NumPy arrays with time and intensity in the first dimension.
|
|
896
|
+
|
|
897
|
+
Returns
|
|
898
|
+
-------
|
|
899
|
+
pre_filtering
|
|
900
|
+
If True, potential peaks will be filtered based on retention time and signal to noise ratio before sampling.
|
|
901
|
+
plotting
|
|
902
|
+
If True, PeakPerformance will plot results.
|
|
903
|
+
peak_width_estimate
|
|
904
|
+
Rough estimate of the average peak width in minutes expected for the LC-MS method with which the data was obtained.
|
|
905
|
+
minimum_sn
|
|
906
|
+
Minimum signal to noise ratio for a signal to be recognized as a peak during pre-filtering.
|
|
907
|
+
df_signals
|
|
908
|
+
Read-out of the signals tab from Template.xlsx as a DataFrame.
|
|
909
|
+
unique_identifiers
|
|
910
|
+
List of unique identifiers from the signals tab of Template.xlsx.
|
|
911
|
+
"""
|
|
912
|
+
# read data and user input from the settings tab of Template.xlsx
|
|
913
|
+
df_settings = pandas.read_excel(
|
|
914
|
+
Path(path_raw_data) / "Template.xlsx", sheet_name="settings", index_col="parameter"
|
|
915
|
+
)
|
|
916
|
+
pre_filtering = eval(df_settings.loc["pre_filtering", "setting"])
|
|
917
|
+
if not isinstance(pre_filtering, bool):
|
|
918
|
+
raise InputError("pre_filtering under settings in Template.xlsx must be a bool.")
|
|
919
|
+
plotting = eval(df_settings.loc["plotting", "setting"])
|
|
920
|
+
if not isinstance(plotting, bool):
|
|
921
|
+
raise InputError("plotting under settings in Template.xlsx must be a bool.")
|
|
922
|
+
peak_width_estimate = df_settings.loc["peak_width_estimate", "setting"]
|
|
923
|
+
if not isinstance(peak_width_estimate, float) and not isinstance(peak_width_estimate, int):
|
|
924
|
+
try:
|
|
925
|
+
peak_width_estimate = float(peak_width_estimate)
|
|
926
|
+
except: # noqa: E722
|
|
927
|
+
raise InputError(
|
|
928
|
+
"peak_width_estimate under settings in Template.xlsx must be an int or float."
|
|
929
|
+
)
|
|
930
|
+
minimum_sn = df_settings.loc["minimum_sn", "setting"]
|
|
931
|
+
if not isinstance(minimum_sn, float) and not isinstance(minimum_sn, int):
|
|
932
|
+
try:
|
|
933
|
+
minimum_sn = float(minimum_sn)
|
|
934
|
+
except: # noqa: E722
|
|
935
|
+
raise InputError("minimum_sn under settings in Template.xlsx must be an int or float.")
|
|
936
|
+
|
|
937
|
+
# read data and user input from the signals tab of Template.xlsx
|
|
938
|
+
df_signals = pandas.read_excel(Path(path_raw_data) / "Template.xlsx", sheet_name="signals")
|
|
939
|
+
unique_identifiers = list(df_signals["unique_identifier"].replace("", np.nan).dropna())
|
|
940
|
+
unique_identifiers = [str(identifier) for identifier in unique_identifiers]
|
|
941
|
+
if not unique_identifiers:
|
|
942
|
+
raise InputError(
|
|
943
|
+
"The list in column unique_identifier in the signals tab of Template.xlsx must not be empty."
|
|
944
|
+
)
|
|
945
|
+
if len(set(unique_identifiers)) != len(unique_identifiers):
|
|
946
|
+
raise InputError(
|
|
947
|
+
"The list in column unique_identifier in the signals tab of Template.xlsx must contain only unique entries."
|
|
948
|
+
)
|
|
949
|
+
# test whether df_signals is filled out correctly
|
|
950
|
+
for x in range(len(df_signals)):
|
|
951
|
+
if not df_signals.isnull()["unique_identifier"][x] and df_signals.isnull()["model_type"][x]:
|
|
952
|
+
raise InputError(
|
|
953
|
+
f"In the signals tab of Template.xlsx, the unique identifier in row {x + 2} has no model type."
|
|
954
|
+
)
|
|
955
|
+
if pre_filtering:
|
|
956
|
+
if (
|
|
957
|
+
not df_signals.isnull()["unique_identifier"][x]
|
|
958
|
+
and df_signals.isnull()["retention_time_estimate"][x]
|
|
959
|
+
):
|
|
960
|
+
raise InputError(
|
|
961
|
+
f"In the signals tab of Template.xlsx, the unique_identifier in row {x + 2} has no retention time estimate."
|
|
962
|
+
)
|
|
963
|
+
df_signals.set_index("unique_identifier", inplace=True)
|
|
964
|
+
return pre_filtering, plotting, peak_width_estimate, minimum_sn, df_signals, unique_identifiers
|
|
965
|
+
|
|
966
|
+
|
|
967
|
+
def pipeline_loop(
|
|
968
|
+
path_raw_data: Union[str, os.PathLike],
|
|
969
|
+
path_results: Union[str, os.PathLike],
|
|
970
|
+
raw_data_file_format: str,
|
|
971
|
+
df_summary: pandas.DataFrame,
|
|
972
|
+
*,
|
|
973
|
+
restart: bool = False,
|
|
974
|
+
):
|
|
975
|
+
"""
|
|
976
|
+
Function to run the complete PeakPerformance pipeline.
|
|
977
|
+
|
|
978
|
+
Parameters
|
|
979
|
+
----------
|
|
980
|
+
path_raw_data
|
|
981
|
+
Path to the raw data files. Files should be in the given raw_data_file_format, default is '.npy'.
|
|
982
|
+
The `.npy` files are expected to be (2, ?)-shaped 2D NumPy arrays with time and intensity in the first dimension.
|
|
983
|
+
path_results
|
|
984
|
+
Path to the directory for the results of a given Batch run of PeakPerformance.
|
|
985
|
+
raw_data_file_format
|
|
986
|
+
Data format (suffix) of the raw data, default is '.npy'.
|
|
987
|
+
df_summary
|
|
988
|
+
DataFrame for collecting the results (i.e. peak parameters) of every signal of a given pipeline.
|
|
989
|
+
restart
|
|
990
|
+
If a pipeline broke for some reason, it can be restarted by setting restart to True.
|
|
991
|
+
That way, already analyzed files won't be analyzed again.
|
|
992
|
+
"""
|
|
993
|
+
# read data and user input from the settings tab of Template.xlsx
|
|
994
|
+
(
|
|
995
|
+
pre_filtering,
|
|
996
|
+
plotting,
|
|
997
|
+
peak_width_estimate,
|
|
998
|
+
minimum_sn,
|
|
999
|
+
df_signals,
|
|
1000
|
+
unique_identifiers,
|
|
1001
|
+
) = pipeline_read_template(path_raw_data)
|
|
1002
|
+
peak_model_list = []
|
|
1003
|
+
retention_time_estimate_list = []
|
|
1004
|
+
# synchronize the lists of raw data files, peak models, and retention times
|
|
1005
|
+
# they will be converted to the user_info dict when instantiating the UserInput class below
|
|
1006
|
+
df_files = pandas.read_excel(Path(path_raw_data) / "Template.xlsx", sheet_name="files")
|
|
1007
|
+
raw_data_files = list(df_files.loc[:, "file_name"])
|
|
1008
|
+
# in case of a restart, update raw_data_files to only contain files which have not been analyzed
|
|
1009
|
+
if restart:
|
|
1010
|
+
analyzed_files = os.listdir(path_results)
|
|
1011
|
+
for raw in raw_data_files:
|
|
1012
|
+
for analyzed in analyzed_files:
|
|
1013
|
+
if raw in analyzed:
|
|
1014
|
+
raw_data_files.remove(raw)
|
|
1015
|
+
for file in raw_data_files:
|
|
1016
|
+
for identifier in unique_identifiers:
|
|
1017
|
+
if identifier in file:
|
|
1018
|
+
peak_model_list.append(str(df_signals.loc[identifier, "model_type"]))
|
|
1019
|
+
retention_time_estimate_list.append(
|
|
1020
|
+
df_signals.loc[identifier, "retention_time_estimate"]
|
|
1021
|
+
)
|
|
1022
|
+
|
|
1023
|
+
# loop over filenames
|
|
1024
|
+
for file in raw_data_files:
|
|
1025
|
+
# parse the data and extract information from the (standardized) file name
|
|
1026
|
+
(
|
|
1027
|
+
timeseries,
|
|
1028
|
+
acquisition,
|
|
1029
|
+
precursor,
|
|
1030
|
+
product_mz_start,
|
|
1031
|
+
product_mz_end,
|
|
1032
|
+
) = parse_data(path_raw_data, file, raw_data_file_format)
|
|
1033
|
+
# instantiate the UserInput class all given information
|
|
1034
|
+
ui = UserInput(
|
|
1035
|
+
path_results,
|
|
1036
|
+
raw_data_files,
|
|
1037
|
+
raw_data_file_format,
|
|
1038
|
+
peak_model_list,
|
|
1039
|
+
retention_time_estimate_list,
|
|
1040
|
+
peak_width_estimate,
|
|
1041
|
+
pre_filtering,
|
|
1042
|
+
minimum_sn,
|
|
1043
|
+
timeseries,
|
|
1044
|
+
acquisition,
|
|
1045
|
+
precursor,
|
|
1046
|
+
product_mz_start,
|
|
1047
|
+
product_mz_end,
|
|
1048
|
+
)
|
|
1049
|
+
# apply pre-sampling filter (if selected)
|
|
1050
|
+
if pre_filtering:
|
|
1051
|
+
# test if necessary settings were provided by the user
|
|
1052
|
+
if not retention_time_estimate_list:
|
|
1053
|
+
raise InputError(
|
|
1054
|
+
"If selecting pre-filtering, provide a list of retention time estimate in Template.xlsx."
|
|
1055
|
+
)
|
|
1056
|
+
if not minimum_sn:
|
|
1057
|
+
raise InputError(
|
|
1058
|
+
"If selecting pre-filtering, provide a minimum signal-to-noise ratio in Template.xlsx."
|
|
1059
|
+
)
|
|
1060
|
+
if not peak_width_estimate:
|
|
1061
|
+
raise InputError(
|
|
1062
|
+
"If selecting pre-filtering, provide a rough estimate of the general peak width in Template.xlsx."
|
|
1063
|
+
)
|
|
1064
|
+
|
|
1065
|
+
# calculate noise guess for pre-filtering
|
|
1066
|
+
slope_guess, intercept_guess, noise_guess = models.initial_guesses(
|
|
1067
|
+
ui.timeseries[0], ui.timeseries[1]
|
|
1068
|
+
)
|
|
1069
|
+
prefilter, df_summary = prefiltering(file, ui, noise_guess, df_summary)
|
|
1070
|
+
if not prefilter:
|
|
1071
|
+
# if no peak candidates were found, continue with the next signal
|
|
1072
|
+
if plotting:
|
|
1073
|
+
plots.plot_raw_data(
|
|
1074
|
+
file[: -len(ui.raw_data_file_format)],
|
|
1075
|
+
ui.timeseries[0],
|
|
1076
|
+
ui.timeseries[1],
|
|
1077
|
+
ui.path,
|
|
1078
|
+
)
|
|
1079
|
+
continue
|
|
1080
|
+
# select model based on information in UserInput
|
|
1081
|
+
model = ui.user_info[file][0]
|
|
1082
|
+
if model == models.ModelType.Normal:
|
|
1083
|
+
pmodel = models.define_model_normal(ui.timeseries[0], ui.timeseries[1])
|
|
1084
|
+
elif model == models.ModelType.SkewNormal:
|
|
1085
|
+
pmodel = models.define_model_skew(ui.timeseries[0], ui.timeseries[1])
|
|
1086
|
+
elif model == models.ModelType.DoubleNormal:
|
|
1087
|
+
pmodel = models.define_model_double_normal(ui.timeseries[0], ui.timeseries[1])
|
|
1088
|
+
elif model == models.ModelType.DoubleSkewNormal:
|
|
1089
|
+
pmodel = models.define_model_double_skew_normal(ui.timeseries[0], ui.timeseries[1])
|
|
1090
|
+
else:
|
|
1091
|
+
raise NotImplementedError(
|
|
1092
|
+
f"The model '{model}' specified for file '{file}' is not implemented."
|
|
1093
|
+
)
|
|
1094
|
+
|
|
1095
|
+
# sample the chosen model
|
|
1096
|
+
idata = sampling(pmodel)
|
|
1097
|
+
# apply post-sampling filter
|
|
1098
|
+
resample, discard, df_summary = postfiltering(file, idata, ui, df_summary)
|
|
1099
|
+
# if peak was discarded, continue with the next signal
|
|
1100
|
+
if discard:
|
|
1101
|
+
if plotting:
|
|
1102
|
+
plots.plot_posterior(
|
|
1103
|
+
file[: -len(ui.raw_data_file_format)],
|
|
1104
|
+
ui.timeseries[0],
|
|
1105
|
+
ui.timeseries[1],
|
|
1106
|
+
ui.path,
|
|
1107
|
+
idata,
|
|
1108
|
+
True,
|
|
1109
|
+
)
|
|
1110
|
+
continue
|
|
1111
|
+
# if convergence was not yet reached, sample again with more tuning samples
|
|
1112
|
+
if resample:
|
|
1113
|
+
if "double" in model:
|
|
1114
|
+
idata = sampling(pmodel, tune=16000)
|
|
1115
|
+
else:
|
|
1116
|
+
idata = sampling(pmodel, tune=6000)
|
|
1117
|
+
resample, discard, df_summary = postfiltering(file, idata, ui, df_summary)
|
|
1118
|
+
if discard:
|
|
1119
|
+
plots.plot_posterior(
|
|
1120
|
+
file[: -len(ui.raw_data_file_format)],
|
|
1121
|
+
ui.timeseries[0],
|
|
1122
|
+
ui.timeseries[1],
|
|
1123
|
+
ui.path,
|
|
1124
|
+
idata,
|
|
1125
|
+
True,
|
|
1126
|
+
)
|
|
1127
|
+
continue
|
|
1128
|
+
if resample:
|
|
1129
|
+
# if signal was flagged for re-sampling a second time, discard it
|
|
1130
|
+
rejection_msg = "postfiltering: signal was flagged for re-sampling with increased sample number twice"
|
|
1131
|
+
df_summary = report_add_data_to_summary(
|
|
1132
|
+
file, idata, df_summary, ui, False, rejection_msg
|
|
1133
|
+
)
|
|
1134
|
+
if plotting:
|
|
1135
|
+
plots.plot_posterior(
|
|
1136
|
+
file[: -len(ui.raw_data_file_format)],
|
|
1137
|
+
ui.timeseries[0],
|
|
1138
|
+
ui.timeseries[1],
|
|
1139
|
+
ui.path,
|
|
1140
|
+
idata,
|
|
1141
|
+
True,
|
|
1142
|
+
)
|
|
1143
|
+
continue
|
|
1144
|
+
# perform posterior predictive sampling
|
|
1145
|
+
idata = posterior_predictive_sampling(pmodel, idata)
|
|
1146
|
+
# add inference data to df_summary and save it as an Excel file
|
|
1147
|
+
df_summary = report_add_data_to_summary(file, idata, df_summary, ui, True)
|
|
1148
|
+
# save the inference data object as a netcdf file
|
|
1149
|
+
report_save_idata(idata, ui, file[: -len(ui.raw_data_file_format)])
|
|
1150
|
+
# plot data
|
|
1151
|
+
if plotting:
|
|
1152
|
+
plots.plot_posterior_predictive(
|
|
1153
|
+
file[: -len(ui.raw_data_file_format)],
|
|
1154
|
+
ui.timeseries[0],
|
|
1155
|
+
ui.timeseries[1],
|
|
1156
|
+
ui.path,
|
|
1157
|
+
idata,
|
|
1158
|
+
False,
|
|
1159
|
+
)
|
|
1160
|
+
plots.plot_posterior(
|
|
1161
|
+
file[: -len(ui.raw_data_file_format)],
|
|
1162
|
+
ui.timeseries[0],
|
|
1163
|
+
ui.timeseries[1],
|
|
1164
|
+
ui.path,
|
|
1165
|
+
idata,
|
|
1166
|
+
False,
|
|
1167
|
+
)
|
|
1168
|
+
# save condesed Excel file with area data
|
|
1169
|
+
report_area_sheet(path_results, df_summary)
|
|
1170
|
+
|
|
1171
|
+
|
|
1172
|
+
def pipeline(
|
|
1173
|
+
path_raw_data: Union[str, os.PathLike],
|
|
1174
|
+
raw_data_file_format: str,
|
|
1175
|
+
):
|
|
1176
|
+
"""
|
|
1177
|
+
Function to run the complete PeakPerformance pipeline.
|
|
1178
|
+
|
|
1179
|
+
Parameters
|
|
1180
|
+
----------
|
|
1181
|
+
path_raw_data
|
|
1182
|
+
Path to the raw data files. Files should be in the given raw_data_file_format, default is '.npy'.
|
|
1183
|
+
The `.npy` files are expected to be (2, ?)-shaped 2D NumPy arrays with time and intensity in the first dimension.
|
|
1184
|
+
raw_data_file_format
|
|
1185
|
+
Data format (suffix) of the raw data, default is '.npy'.
|
|
1186
|
+
|
|
1187
|
+
Returns
|
|
1188
|
+
----------
|
|
1189
|
+
path_results
|
|
1190
|
+
Path variable pointing to the newly created folder for this batch.
|
|
1191
|
+
"""
|
|
1192
|
+
# create data structure and DataFrame(s) for results
|
|
1193
|
+
df_summary, path_results = initiate(path_raw_data)
|
|
1194
|
+
pipeline_loop(
|
|
1195
|
+
path_raw_data,
|
|
1196
|
+
path_results,
|
|
1197
|
+
raw_data_file_format,
|
|
1198
|
+
df_summary,
|
|
1199
|
+
)
|
|
1200
|
+
return path_results
|
|
1201
|
+
|
|
1202
|
+
|
|
1203
|
+
def pipeline_restart(
|
|
1204
|
+
path_raw_data: Union[str, os.PathLike],
|
|
1205
|
+
raw_data_file_format: str,
|
|
1206
|
+
path_results: Union[str, os.PathLike],
|
|
1207
|
+
):
|
|
1208
|
+
"""
|
|
1209
|
+
Function to restart a broken PeakPerformance pipeline.
|
|
1210
|
+
Files which are in the results directory of the broken pipeline will not be analyzed again.
|
|
1211
|
+
WARNING: This only works once! If a pipeline fails more than once, copy all files (except the Excel report sheets)
|
|
1212
|
+
into one directory and specify this directory as the path_results argument.
|
|
1213
|
+
|
|
1214
|
+
Parameters
|
|
1215
|
+
----------
|
|
1216
|
+
path_raw_data
|
|
1217
|
+
Path to the raw data files. Files should be in the given raw_data_file_format, default is '.npy'.
|
|
1218
|
+
The `.npy` files are expected to be (2, ?)-shaped 2D NumPy arrays with time and intensity in the first dimension.
|
|
1219
|
+
raw_data_file_format
|
|
1220
|
+
Data format (suffix) of the raw data, default is '.npy'.
|
|
1221
|
+
path_results
|
|
1222
|
+
Path variable pointing to the directory of the broken PeakPerformance batch
|
|
1223
|
+
|
|
1224
|
+
Returns
|
|
1225
|
+
----------
|
|
1226
|
+
path_results_new
|
|
1227
|
+
Path variable pointing to the newly created folder for the restarted batch.
|
|
1228
|
+
"""
|
|
1229
|
+
df_summary, path_results_new = initiate(path_raw_data)
|
|
1230
|
+
pipeline_loop(
|
|
1231
|
+
path_raw_data,
|
|
1232
|
+
path_results,
|
|
1233
|
+
raw_data_file_format,
|
|
1234
|
+
df_summary,
|
|
1235
|
+
restart=True,
|
|
1236
|
+
)
|
|
1237
|
+
return path_results_new
|
|
1238
|
+
|
|
1239
|
+
|
|
1240
|
+
def excel_template_prepare(
|
|
1241
|
+
path_raw_data: Union[str, os.PathLike],
|
|
1242
|
+
path_peak_performance: Union[str, os.PathLike],
|
|
1243
|
+
raw_data_files: Union[List[str], Tuple[str]],
|
|
1244
|
+
unique_identifiers: Union[List[str], Tuple[str]],
|
|
1245
|
+
):
|
|
1246
|
+
"""
|
|
1247
|
+
Function to copy Template.xlsx from the peak performance directory to the directory containing the raw data files.
|
|
1248
|
+
Subsequently, update Template.xlsx with a list of all raw data files and of all unique_identifiers.
|
|
1249
|
+
|
|
1250
|
+
Parameters
|
|
1251
|
+
----------
|
|
1252
|
+
path_raw_data
|
|
1253
|
+
Path to the folder containing raw data.
|
|
1254
|
+
path_peak_performance
|
|
1255
|
+
Path to the folder containing PeakPerformance.
|
|
1256
|
+
raw_data_files
|
|
1257
|
+
List with names of all files of the specified data type in path_raw_data.
|
|
1258
|
+
unique_identifiers
|
|
1259
|
+
List with all unique combinations of targeted molecules.
|
|
1260
|
+
(i.e. experiment number or precursor ion m/z ratio and product ion m/z ratio range)
|
|
1261
|
+
"""
|
|
1262
|
+
# copy Template.xlsx from PeakPerformance to the directory with the raw data
|
|
1263
|
+
try:
|
|
1264
|
+
shutil.copy(
|
|
1265
|
+
Path(path_peak_performance) / "Template.xlsx", Path(path_raw_data) / "Template.xlsx"
|
|
1266
|
+
)
|
|
1267
|
+
except FileNotFoundError:
|
|
1268
|
+
raise ParsingError(f"Template.xlsx was not found in {path_peak_performance}.")
|
|
1269
|
+
except Exception:
|
|
1270
|
+
raise ParsingError(
|
|
1271
|
+
f"Error while copying Template.xlsx from {path_peak_performance} into {path_raw_data}."
|
|
1272
|
+
)
|
|
1273
|
+
# load Template.xlsx
|
|
1274
|
+
wb = load_workbook(Path(path_raw_data) / "Template.xlsx")
|
|
1275
|
+
# add list of all files names to the files tab
|
|
1276
|
+
wb_files = wb["files"]
|
|
1277
|
+
df1 = pandas.DataFrame({"file_name": raw_data_files})
|
|
1278
|
+
for r in dataframe_to_rows(df1, index=False, header=False):
|
|
1279
|
+
wb_files.append(r)
|
|
1280
|
+
# add list of all unique identifiers (i.e. mass traces) to the signals tab
|
|
1281
|
+
wb_signals = wb["signals"]
|
|
1282
|
+
df2 = pandas.DataFrame({"unique_identifier": unique_identifiers})
|
|
1283
|
+
for r in dataframe_to_rows(df2, index=False, header=False):
|
|
1284
|
+
wb_signals.append(r)
|
|
1285
|
+
wb.save(Path(path_raw_data) / "Template.xlsx")
|
|
1286
|
+
return
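
# The loop above streams a pandas DataFrame into an openpyxl worksheet row by row.
# A minimal, self-contained sketch of the same pattern, using a fresh in-memory
# workbook instead of Template.xlsx (file names are made up for illustration):
def _example_append_dataframe_to_sheet():
    from openpyxl import Workbook

    wb = Workbook()
    ws = wb.active
    ws.title = "files"
    ws.append(["file_name"])  # header row, analogous to the one expected in Template.xlsx
    df = pandas.DataFrame({"file_name": ["A1_110.2_110.35.npy", "A2_110.2_110.35.npy"]})
    # index=False and header=False because only the data rows should be appended
    for row in dataframe_to_rows(df, index=False, header=False):
        ws.append(row)
    return wb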
|
|
1287
|
+
|
|
1288
|
+
|
|
1289
|
+
def prepare_model_selection(
|
|
1290
|
+
path_raw_data: Union[str, os.PathLike],
|
|
1291
|
+
path_template: Union[str, os.PathLike],
|
|
1292
|
+
):
|
|
1293
|
+
"""
|
|
1294
|
+
Function to prepare model selection by providing and pre-filling the Excel template
|
|
1295
|
+
Template.xlsx. After this step, the user has to provide relevant information in Template.xlsx
|
|
1296
|
+
which is finally used for model selection.
|
|
1297
|
+
|
|
1298
|
+
Parameters
|
|
1299
|
+
----------
|
|
1300
|
+
path_raw_data
|
|
1301
|
+
Path to the folder containing raw data.
|
|
1302
|
+
path_template
|
|
1303
|
+
Path to the folder containing Template.xlsx from PeakPerformance.
|
|
1304
|
+
"""
|
|
1305
|
+
# detect raw data files
|
|
1306
|
+
raw_data_files = detect_raw_data(path_raw_data)
|
|
1307
|
+
# parse unique identifiers
|
|
1308
|
+
identifiers = parse_unique_identifiers(raw_data_files)
|
|
1309
|
+
# copy Template.xlsx into the raw data directory and add data from the previous commands
|
|
1310
|
+
excel_template_prepare(path_raw_data, path_template, raw_data_files, identifiers)
|
|
1311
|
+
return
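
# Typical call of the helper above (directories are hypothetical examples); afterwards
# the user completes the remaining columns of Template.xlsx before running model
# selection:
#
#     prepare_model_selection(
#         path_raw_data=Path("./raw_data"),
#         path_template=Path("./peak_performance"),
#     )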
|
|
1312
|
+
|
|
1313
|
+
|
|
1314
|
+
def parse_files_for_model_selection(signals: pandas.DataFrame) -> Dict[str, str]:
|
|
1315
|
+
"""
|
|
1316
|
+
Function to parse the file names for model selection.
|
|
1317
|
+
|
|
1318
|
+
Parameters
|
|
1319
|
+
----------
|
|
1320
|
+
signals
|
|
1321
|
+
DataFrame containing the signals tab of Template.xlsx.
|
|
1322
|
+
|
|
1323
|
+
Returns
|
|
1324
|
+
----------
|
|
1325
|
+
files_for_selection
|
|
1326
|
+
Dict with file names as keys and unique identifiers as values.
|
|
1327
|
+
"""
|
|
1328
|
+
identifier_list = list(signals["unique_identifier"].replace("", np.nan).dropna())
|
|
1329
|
+
model_list = list(signals["model_type"].replace("", np.nan).dropna())
|
|
1330
|
+
acquisition_list = list(
|
|
1331
|
+
signals["acquisition_for_choosing_model_type"].replace("", np.nan).dropna()
|
|
1332
|
+
)
|
|
1333
|
+
# sanity checks
|
|
1334
|
+
if not identifier_list:
|
|
1335
|
+
raise InputError("In the signals tab of Template.xlsx, there are no unqiue_identifiers.")
|
|
1336
|
+
if not model_list and not acquisition_list:
|
|
1337
|
+
raise InputError(
|
|
1338
|
+
"In the signals tab of Template.xlsx, no model or acquisition(s) for model selection were provided."
|
|
1339
|
+
)
|
|
1340
|
+
if len(identifier_list) == len(model_list):
|
|
1341
|
+
raise InputError(
|
|
1342
|
+
"""In the signals tab of Template.xlsx, for each unique identifier a model type was provided.
|
|
1343
|
+
Thus, no model selection is performed."""
|
|
1344
|
+
)
|
|
1345
|
+
# multiple scenarios have to be covered
|
|
1346
|
+
files_for_selection: Dict[str, str] = {}
|
|
1347
|
+
signals = signals.fillna("")
|
|
1348
|
+
if len(model_list) == len(signals.index):
|
|
1349
|
+
# scenario 1: a model was specified for every unique identifier (by the user) -> model selection obsolete
|
|
1350
|
+
return files_for_selection
|
|
1351
|
+
elif len(signals.index) - len(model_list) > 1 and len(acquisition_list) == 1:
|
|
1352
|
+
# scenario 2: for more than one unique identifier no model was specified by the user
|
|
1353
|
+
# but a single acquisition was given for model selection -> model selection from one acquisition
|
|
1354
|
+
acquisition = acquisition_list[0]
|
|
1355
|
+
# remove possible whitespace before or after an entry made by the user
|
|
1356
|
+
acquisition = acquisition.strip()
|
|
1357
|
+
for idx, row in signals.iterrows():
|
|
1358
|
+
if not signals.loc[idx, "model_type"]:
|
|
1359
|
+
unique_identifier = getattr(row, "unique_identifier")
|
|
1360
|
+
filename = "_".join([acquisition, unique_identifier])
|
|
1361
|
+
files_for_selection[filename] = unique_identifier
|
|
1362
|
+
elif len(signals.index) - len(model_list) == len(acquisition_list):
|
|
1363
|
+
# scenario 3: for every unique identifier for which no model was specified by the user,
|
|
1364
|
+
# they provided an acquisition for model selection
|
|
1365
|
+
for idx, row in signals.iterrows():
|
|
1366
|
+
if not signals.loc[idx, "model_type"]:
|
|
1367
|
+
acquisition = getattr(row, "acquisition_for_choosing_model_type")
|
|
1368
|
+
unique_identifier = getattr(row, "unique_identifier")
|
|
1369
|
+
filename = "_".join([acquisition, unique_identifier])
|
|
1370
|
+
files_for_selection[filename] = unique_identifier
|
|
1371
|
+
else:
|
|
1372
|
+
raise InputError(
|
|
1373
|
+
"When using model selection, provide either one acquisition or one acquisition per unique identifier (no in-betweens)."
|
|
1374
|
+
)
|
|
1375
|
+
return files_for_selection
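
# A minimal sketch of scenario 2 above: two signals without a model_type and a single
# acquisition given for choosing the model type. All values are made up for illustration.
def _example_parse_files_for_model_selection():
    signals = pandas.DataFrame(
        {
            "unique_identifier": ["110.2_110.35", "111.5_111.62"],
            "model_type": ["", ""],
            "acquisition_for_choosing_model_type": ["A2t5", ""],
        }
    )
    # expected result:
    # {"A2t5_110.2_110.35": "110.2_110.35", "A2t5_111.5_111.62": "111.5_111.62"}
    return parse_files_for_model_selection(signals)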
|
|
1376
|
+
|
|
1377
|
+
|
|
1378
|
+
def selected_models_to_template(
|
|
1379
|
+
path_raw_data: Union[str, os.PathLike],
|
|
1380
|
+
signals: pandas.DataFrame,
|
|
1381
|
+
model_dict: Mapping[str, str],
|
|
1382
|
+
):
|
|
1383
|
+
"""
|
|
1384
|
+
Function to update Template.xlsx with the selected model types.
|
|
1385
|
+
|
|
1386
|
+
Parameters
|
|
1387
|
+
----------
|
|
1388
|
+
path_raw_data
|
|
1389
|
+
Path to the folder containing raw data.
|
|
1390
|
+
signals
|
|
1391
|
+
DataFrame containing the signals tab of Template.xlsx.
|
|
1392
|
+
model_dict
|
|
1393
|
+
Dict with unique identifiers as keys and model types as values.
|
|
1394
|
+
"""
|
|
1395
|
+
signals = signals.fillna("")
|
|
1396
|
+
for idx, row in signals.iterrows():
|
|
1397
|
+
if not signals.loc[idx, "model_type"]:
|
|
1398
|
+
unique_identifier = getattr(row, "unique_identifier")
|
|
1399
|
+
signals.loc[idx, "model_type"] = model_dict[unique_identifier]
|
|
1400
|
+
# update in Excel
|
|
1401
|
+
wb = load_workbook(Path(path_raw_data) / "Template.xlsx")
|
|
1402
|
+
# update signals tab with model types by deleting rows and appending signals
|
|
1403
|
+
wb_signals = wb["signals"]
|
|
1404
|
+
wb_signals.delete_rows(wb_signals.min_row + 1, wb_signals.max_row)
|
|
1405
|
+
for r in dataframe_to_rows(signals, index=False, header=False):
|
|
1406
|
+
wb_signals.append(r)
|
|
1407
|
+
wb.save(Path(path_raw_data) / "Template.xlsx")
|
|
1408
|
+
return
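
# Illustrative call of the helper above, assuming model selection produced a mapping
# like the (made-up) one below for two signals that had no model_type yet:
#
#     signals = pandas.read_excel(Path("./raw_data") / "Template.xlsx", sheet_name="signals")
#     model_dict = {"110.2_110.35": "skew_normal", "111.5_111.62": "normal"}
#     selected_models_to_template(Path("./raw_data"), signals, model_dict)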
|
|
1409
|
+
|
|
1410
|
+
|
|
1411
|
+
def model_selection_check(
|
|
1412
|
+
result_df: pandas.DataFrame, ic: str, elpd_threshold: Union[int, float] = 25
|
|
1413
|
+
) -> str:
|
|
1414
|
+
"""
|
|
1415
|
+
During model selection, double peak models are sometimes incorrectly preferred due to their increased complexity.
|
|
1416
|
+
Therefore, they have to outperform single peak models by an empirically determined margin of the elpd (expected log pointwise predictive density).
|
|
1417
|
+
|
|
1418
|
+
Parameters
|
|
1419
|
+
----------
|
|
1420
|
+
result_df
|
|
1421
|
+
DataFrame with the result of model comparison via az.compare().
|
|
1422
|
+
ic
|
|
1423
|
+
Information criterion to be used for model selection.
|
|
1424
|
+
("loo": pareto-smoothed importance sampling leave-one-out cross-validation,
|
|
1425
|
+
"waic": widely applicable information criterion)
|
|
1426
|
+
elpd_threshold
|
|
1427
|
+
Threshold of the elpd difference between a double and a single peak model for the double peak model
|
|
1428
|
+
to be accepted.
|
|
1429
|
+
|
|
1430
|
+
Returns
|
|
1431
|
+
----------
|
|
1432
|
+
selected_model
|
|
1433
|
+
Name of the selected model type.
|
|
1434
|
+
"""
|
|
1435
|
+
selected_model = str(result_df.index[0])
|
|
1436
|
+
if "double" in selected_model:
|
|
1437
|
+
df_single_peak_models = result_df[~result_df.index.str.contains("double")]
|
|
1438
|
+
elpd_single = max(list(df_single_peak_models[f"elpd_{ic}"]))
|
|
1439
|
+
elpd_double = max(list(result_df[f"elpd_{ic}"]))
|
|
1440
|
+
if not elpd_double > elpd_single + elpd_threshold:
|
|
1441
|
+
selected_model = str(df_single_peak_models.index[0])
|
|
1442
|
+
return selected_model
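
# A minimal sketch of the elpd threshold above, using a toy ranking shaped like an
# az.compare() result (all values are made up). The double peak model ranks first but
# wins by only 10 elpd points, which is below the default threshold of 25, so the best
# single peak model is returned instead.
def _example_model_selection_check():
    toy_ranking = pandas.DataFrame(
        {"elpd_loo": [-100.0, -110.0, -130.0]},
        index=["double_normal", "normal", "skew_normal"],
    )
    return model_selection_check(toy_ranking, "loo")  # -> "normal"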
|
|
1443
|
+
|
|
1444
|
+
|
|
1445
|
+
def selection_loop(
|
|
1446
|
+
path_raw_data: Union[str, os.PathLike],
|
|
1447
|
+
*,
|
|
1448
|
+
files_for_selection: Mapping[str, str],
|
|
1449
|
+
raw_data_files: Union[List[str], Tuple[str, ...]],
|
|
1450
|
+
ic: str,
|
|
1451
|
+
signals: pandas.DataFrame,
|
|
1452
|
+
):
|
|
1453
|
+
"""
|
|
1454
|
+
Function containing the loop over all filenames intended for the model selection.
|
|
1455
|
+
Involves sampling every model featured by PeakPerformance, computing the log-likelihood
|
|
1456
|
+
and an information criterion, and comparing the results to ascertain the best model for every file.
|
|
1457
|
+
|
|
1458
|
+
Parameters
|
|
1459
|
+
----------
|
|
1460
|
+
path_raw_data
|
|
1461
|
+
Path to the folder containing raw data.
|
|
1462
|
+
files_for_selection
|
|
1463
|
+
Dict with file names as keys and unique identifiers as values.
|
|
1464
|
+
raw_data_files
|
|
1465
|
+
List of raw data files returned by the detect_raw_data() function.
|
|
1466
|
+
Is needed here only to get access to the file format.
|
|
1467
|
+
ic
|
|
1468
|
+
Information criterion to be used for model selection.
|
|
1469
|
+
("loo": pareto-smoothed importance sampling leave-one-out cross-validation,
|
|
1470
|
+
"waic": widely applicable information criterion)
|
|
1471
|
+
|
|
1472
|
+
Returns
|
|
1473
|
+
----------
|
|
1474
|
+
result_df
|
|
1475
|
+
DataFrame containing the ranking and scores of the model selection.
|
|
1476
|
+
model_dict
|
|
1477
|
+
Dict with unique identifiers as keys and model types as values.
|
|
1478
|
+
"""
|
|
1479
|
+
model_dict = {}
|
|
1480
|
+
# get data file format from raw_data_files
|
|
1481
|
+
file_format = raw_data_files[0].split(".")[-1]
|
|
1482
|
+
# loop over all filenames in files_for_selection
|
|
1483
|
+
for filename in files_for_selection.keys():
|
|
1484
|
+
# load time series
|
|
1485
|
+
timeseries = np.load(Path(path_raw_data) / (filename + "." + file_format))
|
|
1486
|
+
idata_dict = {}
|
|
1487
|
+
# get all implemented models, then remove those which were excluded
|
|
1488
|
+
# from model selection by the user
|
|
1489
|
+
models_to_exclude = str(
|
|
1490
|
+
signals.loc[files_for_selection[filename], "models_to_exclude_from_selection"]
|
|
1491
|
+
)
|
|
1492
|
+
model_list = set(models.ModelType)
|
|
1493
|
+
if models_to_exclude:
|
|
1494
|
+
exclude_models = {mex.strip() for mex in models_to_exclude.split(",")}
|
|
1495
|
+
model_list = model_list - exclude_models # type: ignore[operator]
|
|
1496
|
+
if models.ModelType.Normal in model_list:
|
|
1497
|
+
pmodel_normal = models.define_model_normal(timeseries[0], timeseries[1])
|
|
1498
|
+
idata_normal = sampling(pmodel_normal, tune=6000)
|
|
1499
|
+
idata_normal = models.compute_log_likelihood(pmodel_normal, idata_normal)
|
|
1500
|
+
idata_normal_summary = az.summary(idata_normal)
|
|
1501
|
+
idata_dict["normal"] = [idata_normal_summary, idata_normal]
|
|
1502
|
+
if models.ModelType.SkewNormal in model_list:
|
|
1503
|
+
pmodel_skew = models.define_model_skew(timeseries[0], timeseries[1])
|
|
1504
|
+
idata_skew = sampling(pmodel_skew, tune=6000)
|
|
1505
|
+
idata_skew = models.compute_log_likelihood(pmodel_skew, idata_skew)
|
|
1506
|
+
idata_skew_normal_summary = az.summary(idata_skew)
|
|
1507
|
+
idata_dict["skew_normal"] = [idata_skew_normal_summary, idata_skew]
|
|
1508
|
+
if models.ModelType.DoubleNormal in model_list:
|
|
1509
|
+
pmodel_double_normal = models.define_model_double_normal(timeseries[0], timeseries[1])
|
|
1510
|
+
idata_double_normal = sampling(pmodel_double_normal, tune=6000)
|
|
1511
|
+
idata_double_normal = models.compute_log_likelihood(
|
|
1512
|
+
pmodel_double_normal, idata_double_normal
|
|
1513
|
+
)
|
|
1514
|
+
idata_double_normal_summary = az.summary(idata_double_normal)
|
|
1515
|
+
idata_dict["double_normal"] = [idata_double_normal_summary, idata_double_normal]
|
|
1516
|
+
if models.ModelType.DoubleSkewNormal in model_list:
|
|
1517
|
+
pmodel_double_skew = models.define_model_double_skew_normal(
|
|
1518
|
+
timeseries[0], timeseries[1]
|
|
1519
|
+
)
|
|
1520
|
+
idata_double_skew = sampling(pmodel_double_skew, tune=6000)
|
|
1521
|
+
idata_double_skew = models.compute_log_likelihood(pmodel_double_skew, idata_double_skew)
|
|
1522
|
+
idata_double_skew_normal_summary = az.summary(idata_double_skew)
|
|
1523
|
+
idata_dict["double_skew_normal"] = [idata_double_skew_normal_summary, idata_double_skew]
|
|
1524
|
+
|
|
1525
|
+
# add model to compare_dict for model selection only if convergence criterion was met (r_hat <= 1.05)
|
|
1526
|
+
compare_dict = {}
|
|
1527
|
+
for model in idata_dict.keys():
|
|
1528
|
+
if not (idata_dict[model][0].loc[:, "r_hat"] > 1.05).any():
|
|
1529
|
+
compare_dict[model] = idata_dict[model][1]
|
|
1530
|
+
# compare_dict needs at least two entries for model comparison
|
|
1531
|
+
# if not enough pass the r_hat test, accept all for now to avoid error
|
|
1532
|
+
if len(compare_dict) < 2:
|
|
1533
|
+
warnings.warn(
|
|
1534
|
+
f"Only one or less models converged during model selection for {filename}."
|
|
1535
|
+
)
|
|
1536
|
+
for model in idata_dict.keys():
|
|
1537
|
+
compare_dict[model] = idata_dict[model][1]
|
|
1538
|
+
# perform the actual model comparison
|
|
1539
|
+
result_df = models.model_comparison(compare_dict, ic)
|
|
1540
|
+
# double peak models are sometimes incorrectly preferred due to their increased complexity
|
|
1541
|
+
# therefore, they have to outperform single peak models by an empirically determined value of the elpd
|
|
1542
|
+
selected_model = model_selection_check(result_df, ic)
|
|
1543
|
+
# update model_dict with unique_identifier as key and selected_model as value
|
|
1544
|
+
model_dict[files_for_selection[filename]] = selected_model
|
|
1545
|
+
# optional: plot the results of model comparison
|
|
1546
|
+
return result_df, model_dict
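
# A minimal sketch of the convergence gate used in the loop above: a model only enters
# the comparison if none of the parameters in its az.summary() table has r_hat > 1.05.
# The summary tables below are toy stand-ins with made-up r_hat values.
def _example_r_hat_gate():
    summaries = {
        "normal": pandas.DataFrame({"r_hat": [1.00, 1.01]}, index=["mean", "std"]),
        "skew_normal": pandas.DataFrame({"r_hat": [1.00, 1.23]}, index=["mean", "alpha"]),
    }
    converged = [
        model
        for model, summary in summaries.items()
        if not (summary.loc[:, "r_hat"] > 1.05).any()
    ]
    return converged  # -> ["normal"]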
|
|
1547
|
+
|
|
1548
|
+
|
|
1549
|
+
def model_selection(path_raw_data: Union[str, os.PathLike], *, ic: str = "loo"):
|
|
1550
|
+
"""
|
|
1551
|
+
Function to select the best model for every signal (i.e. combination of experiment number or precursor ion m/z ratio
|
|
1552
|
+
and product ion m/z ratio). This is realized by analyzing one representative sample of the batch with all models and
|
|
1553
|
+
comparing the results based on an information criterion.
|
|
1554
|
+
|
|
1555
|
+
Parameters
|
|
1556
|
+
----------
|
|
1557
|
+
path_raw_data
|
|
1558
|
+
Path to the folder containing raw data.
|
|
1559
|
+
ic
|
|
1560
|
+
Information criterion to be used for model selection.
|
|
1561
|
+
("loo": pareto-smoothed importance sampling leave-one-out cross-validation,
|
|
1562
|
+
"waic": widely applicable information criterion)
|
|
1563
|
+
|
|
1564
|
+
Returns
|
|
1565
|
+
----------
|
|
1566
|
+
comparison_results
|
|
1567
|
+
DataFrame containing all rankings from model selection.
|
|
1568
|
+
model_dict
|
|
1569
|
+
Dict with unique identifiers as keys and model types as values.
|
|
1570
|
+
"""
|
|
1571
|
+
# check for which signals model selection is wished and whether from one or different acquisitions
|
|
1572
|
+
df_signals = pandas.read_excel(Path(path_raw_data) / "Template.xlsx", sheet_name="signals")
|
|
1573
|
+
files_for_selection = parse_files_for_model_selection(df_signals)
|
|
1574
|
+
# get raw_data_files to get automatic access to the file format in selection_loop
|
|
1575
|
+
raw_data_files = detect_raw_data(path_raw_data)
|
|
1576
|
+
# loop over all files_for_selection
|
|
1577
|
+
df_signals.set_index("unique_identifier", inplace=True)
|
|
1578
|
+
comparison_results = pandas.DataFrame()
|
|
1579
|
+
result_df, model_dict = selection_loop(
|
|
1580
|
+
path_raw_data,
|
|
1581
|
+
files_for_selection=files_for_selection,
|
|
1582
|
+
raw_data_files=raw_data_files,
|
|
1583
|
+
ic=ic,
|
|
1584
|
+
signals=df_signals,
|
|
1585
|
+
)
|
|
1586
|
+
comparison_results = pandas.concat([comparison_results, result_df])
|
|
1587
|
+
# update signals tab of Template.xlsx; read again to reset index
|
|
1588
|
+
df_signals = pandas.read_excel(Path(path_raw_data) / "Template.xlsx", sheet_name="signals")
|
|
1589
|
+
try:
|
|
1590
|
+
selected_models_to_template(path_raw_data, df_signals, model_dict)
|
|
1591
|
+
except PermissionError:
|
|
1592
|
+
warnings.warn(
|
|
1593
|
+
"""Since Template.xlsx was open during model selection, it could not be updated.
|
|
1594
|
+
Use the returned variables and pl.selected_models_to_template() to update it."""
|
|
1595
|
+
)
|
|
1596
|
+
return comparison_results, model_dict
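
# A minimal end-to-end sketch of the model selection workflow defined above
# (directories are hypothetical examples):
#
#     prepare_model_selection(Path("./raw_data"), Path("./peak_performance"))
#     # ... complete the signals tab of Template.xlsx by hand ...
#     comparison_results, model_dict = model_selection(Path("./raw_data"), ic="loo")
#     # model_dict maps each unique identifier to the selected model type; the signals
#     # tab of Template.xlsx is updated via selected_models_to_template() unless the
#     # file is currently open.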
|