peak-performance 0.6.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1596 @@
1
+ """
2
+ PeakPerformance
3
+ Copyright (C) 2023 Forschungszentrum Jülich GmbH
4
+
5
+ This program is free software: you can redistribute it and/or modify
6
+ it under the terms of the GNU Affero General Public License as published
7
+ by the Free Software Foundation, either version 3 of the License, or
8
+ (at your option) any later version.
9
+
10
+ This program is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ GNU Affero General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Affero General Public License
16
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
17
+ """
18
+
19
+ import importlib.util
20
+ import os
21
+ import re
22
+ import shutil
23
+ import warnings
24
+ from datetime import date, datetime
25
+ from pathlib import Path
26
+ from typing import Dict, List, Mapping, Sequence, Tuple, Union
27
+
28
+ import arviz as az
29
+ import numpy as np
30
+ import pandas
31
+ import pymc as pm
32
+ import scipy.integrate
33
+ import scipy.signal
34
+ from openpyxl import load_workbook
35
+ from openpyxl.utils.dataframe import dataframe_to_rows
36
+
37
+ from peak_performance import models, plots
38
+
39
+
40
+ class ParsingError(Exception):
41
+ """Base type of parsing exceptions."""
42
+
43
+
44
+ class InputError(Exception):
45
+ """Base type of exceptions related to information given by the user."""
46
+
47
+
48
+ class UserInput:
49
+ """Collect all information required from the user and format them in the correct manner."""
50
+
51
+ def __init__(
52
+ self,
53
+ path: Union[str, os.PathLike],
54
+ files: Sequence[str],
55
+ raw_data_file_format: str,
56
+ peak_model: Sequence[str],
57
+ retention_time_estimate: Union[Sequence[float], Sequence[int]],
58
+ peak_width_estimate: Union[float, int],
59
+ pre_filtering: bool,
60
+ minimum_sn: Union[float, int],
61
+ timeseries: np.ndarray,
62
+ acquisition: str,
63
+ precursor: Union[float, int],
64
+ product_mz_start: Union[float, int],
65
+ product_mz_end: Union[float, int],
66
+ ):
67
+ """
68
+ Parameters
69
+ ----------
70
+ path
71
+ Path to the folder containing the results of the current run.
72
+ files
73
+ List of raw data file names in path.
74
+ raw_data_file_format
75
+ Data format (suffix) of the raw data, default is '.npy'.
76
+ peak_model
77
+ List specifying models for peak fitting in the same order as files.
78
+ ("normal", "skew_normal", "double_normal", "double_skew_normal")
79
+ retention_time_estimate
80
+ In case you set pre_filtering to True, give a retention time estimate (float) for each signal in files.
81
+ In case of a double peak, give two retention times (in chronological order) as a tuple containing two floats.
82
+ peak_width_estimate
83
+ Rough estimate of the average peak width in minutes expected for the LC-MS method with which the data was obtained.
84
+ pre_filtering
85
+ If True, potential peaks will be filtered based on retention time and signal to noise ratio before sampling.
86
+ minimum_sn
87
+ Minimum signal to noise ratio for a signal to be recognized as a peak during pre-filtering.
88
+ timeseries
89
+ NumPy Array containing time (at first position) and intensity (at second position) data as NumPy arrays.
90
+ acquisition
91
+ Name of a single acquisition.
92
+ precursor
93
+ Can be one of the following:
94
+ Either the experiment number of the signal within the acquisition (each experiment = one mass trace)
95
+ or the mass to charge ratio of the precursor ion selected in Q1.
96
+ product_mz_start
97
+ Start of the mass to charge ratio range of the product ion in the TOF.
98
+ product_mz_end
99
+ End of the mass to charge ratio range of the product ion in the TOF.
100
+ """
101
+ self.path = path
102
+ self.files = list(files)
103
+ self.raw_data_file_format = raw_data_file_format
104
+ self.peak_model = peak_model
105
+ self.retention_time_estimate = retention_time_estimate
106
+ self.peak_width_estimate = peak_width_estimate
107
+ self.pre_filtering = pre_filtering
108
+ self.minimum_sn = minimum_sn
109
+ self.timeseries = timeseries
110
+ self.acquisition = acquisition
111
+ self.precursor = precursor
112
+ self.product_mz_start = product_mz_start
113
+ self.product_mz_end = product_mz_end
114
+ super().__init__()
115
+
116
+ @property
117
+ def timeseries(self):
118
+ """
119
+ Getting the value of the timeseries attribute.
120
+ (NumPy Array containing time (at first position) and intensity (at second position) data as NumPy arrays.)
121
+ """
122
+ return self._timeseries
123
+
124
+ @timeseries.setter
125
+ def timeseries(self, data):
126
+ """Setting the value of the timeseries attribute."""
127
+ if data is None:
128
+ raise InputError("The timeseries parameter is a None type.")
129
+ self._timeseries = np.asarray(data)
130
+
131
+ @property
132
+ def acquisition(self):
133
+ """Getting the value of the acquisition attribute (name of a single acquisition)."""
134
+ return self._acquisition
135
+
136
+ @acquisition.setter
137
+ def acquisition(self, name):
138
+ """Setting the value of the acquisition attribute."""
139
+ if not isinstance(name, str):
140
+ raise InputError(
141
+ f"The acquisition parameter {name} is {type(name)} but needs to be a string."
142
+ )
143
+ if name is None:
144
+ raise InputError("The acquisition parameter is a None type.")
145
+ self._acquisition = name
146
+
147
+ @property
148
+ def precursor(self):
149
+ """
150
+ Getting the value of the precursor attribute which can be one of the following:
151
+ Either the experiment number of the signal within the acquisition (each experiment = one mass trace)
152
+ or the mass to charge ratio of the precursor ion selected in Q1.
153
+ """
154
+ return self._precursor
155
+
156
+ @precursor.setter
157
+ def precursor(self, mz):
158
+ """Setting the value of the precursor attribute."""
159
+ if not isinstance(mz, int) and not isinstance(mz, float):
160
+ try:
161
+ mz = float(mz)
162
+ except (TypeError, ValueError) as ex:
163
+ raise InputError(
164
+ f"The precursor parameter {mz} is {type(mz)} but needs to be an int or a float."
165
+ ) from ex
166
+ if mz is None:
167
+ raise InputError("The precursor parameter is a None type.")
168
+ self._precursor = mz
169
+
170
+ @property
171
+ def product_mz_start(self):
172
+ """Getting the value of the product_mz_start attribute."""
173
+ return self._product_mz_start
174
+
175
+ @product_mz_start.setter
176
+ def product_mz_start(self, mz):
177
+ """
178
+ Setting the value of the product_mz_start attribute.
179
+ (Start of the mass to charge ratio range of the product ion in the TOF.)
180
+ """
181
+ if not isinstance(mz, int) and not isinstance(mz, float):
182
+ try:
183
+ mz = float(mz)
184
+ except (TypeError, ValueError) as ex:
185
+ raise InputError(
186
+ f"The product_mz parameter {mz} is {type(mz)} but needs to be an int or a float."
187
+ ) from ex
188
+ if mz is None:
189
+ raise InputError("The product_mz_start parameter is a None type.")
190
+ self._product_mz_start = mz
191
+
192
+ @property
193
+ def product_mz_end(self):
194
+ """
195
+ Getting the value of the product_mz_end attribute.
196
+ (End of the mass to charge ratio range of the product ion in the TOF.)
197
+ """
198
+ return self._product_mz_end
199
+
200
+ @product_mz_end.setter
201
+ def product_mz_end(self, mz):
202
+ """Setting the value of the product_mz_end attribute."""
203
+ if not isinstance(mz, int) and not isinstance(mz, float):
204
+ try:
205
+ mz = float(mz)
206
+ except (TypeError, ValueError) as ex:
207
+ raise InputError(
208
+ f"The product_mz parameter is {type(mz)} but needs to be an int or a float."
209
+ ) from ex
210
+ if mz is None:
211
+ raise InputError("The product_mz_end parameter is a None type.")
212
+ self._product_mz_end = mz
213
+
214
+ @property
215
+ def user_info(self):
216
+ """Create a dictionary with the necessary user information based on the class attributes."""
217
+ # first, some sanity checks
218
+ if len(self.files) != len(self.peak_model):
219
+ raise InputError(
220
+ f"The length of 'files' ({len(self.files)}) and of 'peak_model' ({len(self.peak_model)}) are not identical."
221
+ )
222
+ if self.pre_filtering:
223
+ # check length of lists
224
+ if len(self.files) != len(self.peak_model) or len(self.peak_model) != len(
225
+ self.retention_time_estimate
226
+ ):
227
+ raise InputError(
228
+ f"The length of 'files' ({len(self.files)}), 'peak_model' ({self.peak_model}), "
229
+ f"and retention_time_estimate ({len(self.retention_time_estimate)}) are not identical."
230
+ )
231
+ else:
232
+ # if pre_filtering is False, then retention_time_estimate is not needed
233
+ # but the dictionary still needs to be created without errors -> set it to np.nan
234
+ if len(self.retention_time_estimate) == 1:
235
+ self.retention_time_estimate = len(self.files) * [np.nan]
236
+ elif not self.retention_time_estimate:
237
+ self.retention_time_estimate = len(self.files) * [np.nan]
238
+ if np.any(np.array(self.retention_time_estimate) < 0):
239
+ raise InputError("Retention time estimates below 0 are not valid.")
240
+ # actually create the dictionary
241
+ user_info = dict(zip(self.files, zip(self.peak_model, self.retention_time_estimate)))
242
+ user_info["peak_width_estimate"] = self.peak_width_estimate
243
+ user_info["pre_filtering"] = self.pre_filtering
244
+ user_info["minimum_sn"] = self.minimum_sn
245
+ return user_info
246
+
247
+
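+ # Illustrative use of UserInput (a minimal sketch; every path, file name, and number
+ # below is hypothetical, and the module is assumed to be importable as
+ # `from peak_performance import pipeline as pl`). The user_info property then maps
+ # each file to its (peak_model, retention_time_estimate) pair plus the global settings.
+ # >>> import numpy as np
+ # >>> from peak_performance import pipeline as pl
+ # >>> timeseries = np.load("/path/to/raw_data/A1t1R1_110.2_56.1_56.3.npy")
+ # >>> ui = pl.UserInput(
+ # ...     "/path/to/results", ["A1t1R1_110.2_56.1_56.3.npy"], ".npy",
+ # ...     ["normal"], [12.3], 0.5, True, 5,
+ # ...     timeseries, "A1t1R1", 110.2, 56.1, 56.3,
+ # ... )
+ # >>> ui.user_info["A1t1R1_110.2_56.1_56.3.npy"]
+ # ('normal', 12.3)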
248
+ def detect_raw_data(path: Union[str, os.PathLike], *, data_type: str = ".npy"):
249
+ """
250
+ Detect all .npy files with time and intensity data for peaks in a given directory.
251
+
252
+ Parameters
253
+ ----------
254
+ path
255
+ Path to the folder containing raw data.
256
+ data_type
257
+ Data format of the raw data files (e.g. '.npy').
258
+
259
+ Returns
260
+ -------
261
+ files
262
+ List with names of all files of the specified data type in path.
263
+ """
264
+ all_files = os.listdir(path)
265
+ files = [file for file in all_files if file.endswith(data_type)]
266
+ if not files:
267
+ raise FileNotFoundError(
268
+ f"In the given directory '{path}', there are no '{data_type}' files."
269
+ )
270
+ return files
271
+
272
+
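+ # Illustrative use of detect_raw_data() (sketch with a hypothetical directory and
+ # hypothetical file names; `pl` as in `from peak_performance import pipeline as pl`):
+ # >>> from peak_performance import pipeline as pl
+ # >>> files = pl.detect_raw_data("/path/to/raw_data", data_type=".npy")
+ # >>> files
+ # ['A1t1R1_110.2_56.1_56.3.npy', 'A1t2R1_110.2_56.1_56.3.npy']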
273
+ def parse_data(
274
+ path: Union[str, os.PathLike], filename: str, raw_data_file_format: str
275
+ ) -> Tuple[np.ndarray, str, float, float, float]:
276
+ """
277
+ Load a raw data file and extract the signal metadata from its standardized file name.
278
+
279
+ Parameters
280
+ ----------
281
+ path
282
+ Path to the raw data files.
283
+ filename
284
+ Name of a raw date file containing a NumPy array with a time series (time as first, intensity as second element of the array).
285
+ raw_data_file_format
286
+ Data format (suffix) of the raw data, default is '.npy'.
287
+
288
+ Returns
289
+ -------
290
+ timeseries
291
+ Updated NumPy array containing time and intensity data as NumPy arrays in first and second row, respectively.
292
+ NaN values have been replaced with zeroes.
293
+ acquisition
294
+ Name of a single acquisition.
295
+ precursor
296
+ Can be one of the following:
297
+ Either the experiment number of the signal within the acquisition (each experiment = one mass trace)
298
+ or the mass to charge ratio of the precursor ion selected in Q1.
299
+ product_mz_start
300
+ Start of the mass to charge ratio range of the product ion in the TOF.
301
+ product_mz_end
302
+ End of the mass to charge ratio range of the product ion in the TOF.
303
+ """
304
+ # load time series
305
+ timeseries = np.load(Path(path) / filename)
306
+ # if NaNs are in time or intensity, replace them with 0.0
307
+ timeseries = np.nan_to_num(timeseries)
308
+ # get information from the raw data file name
309
+ splits = filename.split("_")
310
+ if len(splits) != 4:
311
+ raise InputError(
312
+ f"""The standardized naming scheme was violated by file {filename}.
313
+ \nThe name should be divided by underscores into the sections acquisition name, precursor, product_mz_start, and product_mz_end.
314
+ """
315
+ )
316
+ try:
317
+ pattern = r"(.*?)_(\d+\.?\d*)_(\d+\.?\d*)_(\d+\.?\d*).*"
318
+ m = re.match(pattern, filename)
319
+ if m is not None:
320
+ acquisition, precursor, mz_start, mz_end = m.groups()
321
+ precursor_converted = float(precursor)
322
+ product_mz_start_converted = float(mz_start)
323
+ product_mz_end_converted = float(mz_end)
324
+ except ValueError as ex:
325
+ raise InputError(
326
+ f"The name of file {filename} does not follow the standardized naming convention."
327
+ ) from ex
328
+
329
+ return (
330
+ timeseries,
331
+ acquisition,
332
+ precursor_converted,
333
+ product_mz_start_converted,
334
+ product_mz_end_converted,
335
+ )
336
+
337
+
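+ # Illustrative use of parse_data(): the standardized file name scheme is
+ # "<acquisition>_<precursor>_<product_mz_start>_<product_mz_end><suffix>", so the
+ # acquisition name itself must not contain underscores. The file below is hypothetical.
+ # >>> from peak_performance import pipeline as pl
+ # >>> timeseries, acquisition, precursor, mz_start, mz_end = pl.parse_data(
+ # ...     "/path/to/raw_data", "A1t1R1_110.2_56.1_56.3.npy", ".npy"
+ # ... )
+ # >>> acquisition, precursor, mz_start, mz_end
+ # ('A1t1R1', 110.2, 56.1, 56.3)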
338
+ def parse_unique_identifiers(raw_data_files: Sequence[str]) -> List[str]:
339
+ """
340
+ Get a set of all mass traces based on the standardized raw data file names (excluding acquisitions).
341
+ Used to automatically fill out the unique_identifier column in the signals tab of Template.xlsx.
342
+
343
+ Parameters
344
+ ----------
345
+ raw_data_files
346
+ Names of all files of the specified data type in path_raw_data.
347
+
348
+ Returns
349
+ -------
350
+ unique_identifiers
351
+ List with all unique combinations of targeted molecules.
352
+ (i.e. experiment number or precursor ion m/z ratio and product ion m/z ratio range)
353
+ """
354
+ # remove acquisition from file names
355
+ identifiers = []
356
+ for filename in raw_data_files:
357
+ pattern = r"(.*?)_(\d+\.?\d*)_(\d+\.?\d*)_(\d+\.?\d*).*"
358
+ m = re.match(pattern, filename)
359
+ if m is not None:
360
+ acquisition, precursor, mz_start, mz_end = m.groups()
361
+ identifiers.append("_".join([precursor, mz_start, mz_end]))
362
+
363
+ # select only unique identifiers
364
+ unique_identifiers = list(set(identifiers))
365
+ return unique_identifiers
366
+
367
+
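+ # Illustrative use of parse_unique_identifiers(): the unique identifier of a mass trace
+ # is the file name without the acquisition part, so replicate acquisitions of the same
+ # trace collapse to a single entry (file names below are hypothetical).
+ # >>> from peak_performance import pipeline as pl
+ # >>> pl.parse_unique_identifiers(
+ # ...     ["A1t1R1_110.2_56.1_56.3.npy", "A1t2R1_110.2_56.1_56.3.npy"]
+ # ... )
+ # ['110.2_56.1_56.3']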
368
+ def initiate(path: Union[str, os.PathLike], *, run_dir: str = ""):
369
+ """
370
+ Create a folder for the results of the current run and create df_summary for collecting the results.
371
+
372
+ Parameters
373
+ ----------
374
+ path
375
+ Path to the directory containing the raw data.
376
+ run_dir
377
+ Name of the directory created to store the results of the current run (default: current date and time).
378
+
379
+ Returns
380
+ -------
381
+ df_summary
382
+ DataFrame for collecting the results (i.e. peak parameters) of every signal of a given pipeline.
383
+ path
384
+ Updated path variable pointing to the newly created folder for this batch.
385
+ """
386
+ # get current date and time
387
+ if not run_dir:
388
+ today = str(date.today())
389
+ now = datetime.now().strftime("%H-%M-%S")
390
+ timestamp = today + "_" + now
391
+ run_dir = timestamp + "_run"
392
+ # create a directory
393
+ path = Path(path) / run_dir
394
+ path.mkdir(exist_ok=True)
395
+ # create DataFrame for data report
396
+ df_summary = pandas.DataFrame(
397
+ columns=[
398
+ "mean",
399
+ "sd",
400
+ "hdi_3%",
401
+ "hdi_97%",
402
+ "mcse_mean",
403
+ "mcse_sd",
404
+ "ess_bulk",
405
+ "ess_tail",
406
+ "r_hat",
407
+ "acquisition",
408
+ "experiment_or_precursor_mz",
409
+ "product_mz_start",
410
+ "product_mz_end",
411
+ "is_peak",
412
+ "cause_for_rejection",
413
+ "model_type",
414
+ "subpeak",
415
+ ]
416
+ )
417
+ return df_summary, path
418
+
419
+
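+ # Illustrative use of initiate(): creates a timestamped results folder inside the given
+ # directory and returns an empty summary DataFrame with the columns listed above
+ # (the path is hypothetical).
+ # >>> from peak_performance import pipeline as pl
+ # >>> df_summary, path_results = pl.initiate("/path/to/raw_data")
+ # >>> path_results  # e.g. PosixPath('/path/to/raw_data/2023-08-24_13-05-59_run')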
420
+ def prefiltering(
421
+ filename: str, ui: UserInput, noise_width_guess: float, df_summary: pandas.DataFrame
422
+ ):
423
+ """
424
+ Optional method to skip signals where clearly no peak is present. Saves a lot of computation time.
425
+
426
+ Parameters
427
+ ----------
428
+ filename
429
+ Name of the raw data file.
430
+ ui
431
+ Instance of the UserInput class
432
+ noise_width_guess
433
+ Estimated width of the noise of a particular measurement.
+ df_summary
+ DataFrame for collecting the results (i.e. peak parameters) of every signal of a given pipeline.
434
+
435
+ Returns
436
+ -------
437
+ found_peak
438
+ True, if any peak candidate was found within the time frame; False, if not.
439
+ df_summary
440
+ DataFrame for collecting the results (i.e. peak parameters) of every signal of a given pipeline.
441
+ """
442
+ # pre-fit tests for peaks to save computation time (optional)
443
+ t_ret = ui.user_info[filename][1]
444
+ est_width = ui.peak_width_estimate
445
+ # find all potential peaks with scipy
446
+ peaks, _ = scipy.signal.find_peaks(ui.timeseries[1])
447
+ peak_candidates = []
448
+ # differentiate between single and double peaks
449
+ for peak in peaks:
450
+ # define conditions for passing the pre-filtering
451
+ # check proximity of any peak candidate to the estimated retention time
452
+ retention_time_condition = t_ret - est_width <= ui.timeseries[0][peak] <= t_ret + est_width
453
+ # check signal to noise ratio
454
+ signal_to_noise_condition = (
455
+ ui.timeseries[1][peak] / (noise_width_guess + 0.1) > ui.minimum_sn
456
+ )
457
+ # check the neighbouring data points to prevent classification of a single elevated data point as a peak
458
+ check_preceding_point = ui.timeseries[1][peak - 1] / (noise_width_guess + 0.1) > 2
459
+ check_succeeding_point = ui.timeseries[1][peak + 1] / (noise_width_guess + 0.1) > 2
460
+ if (
461
+ retention_time_condition
462
+ and signal_to_noise_condition
463
+ and check_preceding_point
464
+ and check_succeeding_point
465
+ ):
466
+ peak_candidates.append(peak)
467
+ if not peak_candidates:
468
+ df_summary = report_add_nan_to_summary(filename, ui, df_summary, "pre-filtering")
469
+ return False, df_summary
470
+ return True, df_summary
471
+
472
+
473
+ def sampling(pmodel, **sample_kwargs):
474
+ """Performs sampling.
475
+
476
+ Parameters
477
+ ----------
478
+ pmodel
479
+ A PyMC model.
480
+ **sample_kwargs
481
+ The keyword arguments are used in pm.sample().
482
+ tune
483
+ Number of tuning samples (default = 2000).
484
+ draws
485
+ Number of samples after tuning (default = 2000).
486
+
487
+ Returns
488
+ -------
489
+ idata
490
+ Inference data object.
491
+ """
492
+ sample_kwargs.setdefault("tune", 2000)
493
+ sample_kwargs.setdefault("draws", 2000)
494
+ # check if nutpie is available; if so, use it to enhance performance
495
+ if importlib.util.find_spec("nutpie"):
496
+ nuts_sampler = "nutpie"
497
+ else:
498
+ nuts_sampler = "pymc"
499
+ with pmodel:
500
+ idata = pm.sample_prior_predictive()
501
+ idata.extend(pm.sample(nuts_sampler=nuts_sampler, **sample_kwargs))
502
+ return idata
503
+
504
+
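+ # Illustrative use of sampling(): any pm.sample() keyword is passed through, e.g. more
+ # tuning steps or a fixed random seed. The model factory call assumes a hypothetical
+ # (2, N) `timeseries` array with time in row 0 and intensity in row 1.
+ # >>> pmodel = models.define_model_normal(timeseries[0], timeseries[1])
+ # >>> idata = sampling(pmodel, tune=6000, draws=2000, random_seed=1234)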
505
+ def postfiltering(filename: str, idata, ui: UserInput, df_summary: pandas.DataFrame):
506
+ """
507
+ Method to filter out false positive peaks after sampling based on the obtained uncertainties of several peak parameters.
508
+
509
+ Parameters
510
+ ----------
511
+ filename
512
+ Name of the raw data file.
513
+ idata
514
+ Inference data object resulting from sampling.
515
+ ui
516
+ Instance of the UserInput class.
517
+ df_summary
518
+ DataFrame for collecting the results (i.e. peak parameters) of every signal of a given pipeline.
519
+
520
+ Returns
521
+ -------
522
+ resample
+ True: re-sample with more tuning samples; False: don't.
+ discard
+ True: the signal was rejected as a peak and is discarded; False: the signal was accepted -> save data and continue with the next signal.
+ df_summary
+ Updated DataFrame for collecting the results (i.e. peak parameters) of every signal of a given pipeline.
529
+ """
530
+ # check whether convergence, i.e. r_hat <= 1.05, was not reached OR peak criteria were not met
531
+ model = ui.user_info[filename][0]
532
+ resample = False
533
+ discard = False
534
+ rejection_msg = ""
535
+ az_summary: pandas.DataFrame = az.summary(idata)
536
+ if model in ["normal", "skew_normal"]:
537
+ # for single peak
538
+ # Get data needed for rejection decisions
539
+ max_rhat = max(az_summary.loc[:, "r_hat"])
540
+ std = az_summary.loc["std", "mean"]
541
+ area_sd = az_summary.loc["area", "sd"]
542
+ area_mean = az_summary.loc["area", "mean"]
543
+ height_sd = az_summary.loc["height", "sd"]
544
+ height_mean = az_summary.loc["height", "mean"]
545
+
546
+ # decide whether to discard signal or sample with more tune samples based on size of sigma parameter
547
+ # of normal distribution (std) and on the relative sizes of standard deviations of area and height
548
+ reject_reasons = []
549
+ if max_rhat > 1.05:
550
+ reject_reasons.append(f"maximum Rhat ({max_rhat:.3f}) was too high")
551
+ if std <= ui.peak_width_estimate / 100:
552
+ reject_reasons.append(f"standard deviation estimate ({std:.2f}) was too low")
553
+ if area_sd > area_mean * 0.2:
554
+ reject_reasons.append(f"area estimate ({area_mean} ± {area_sd}) was too uncertain")
555
+ if height_sd > height_mean * 0.2:
556
+ reject_reasons.append(
557
+ f"height estimate ({height_mean} ± {height_sd}) was too uncertain"
558
+ )
559
+
560
+ if len(reject_reasons) == 1 and "Rhat" in reject_reasons[0]:
561
+ # r_hat failed but rest of post-fit check passed
562
+ # sample again with more tune samples to possibly reach convergence yet
563
+ resample = True
564
+ discard = False
565
+ elif reject_reasons:
566
+ rejection_msg = " and ".join(reject_reasons)
567
+ df_summary = report_add_nan_to_summary(filename, ui, df_summary, rejection_msg)
568
+ resample = False
569
+ discard = True
570
+
571
+ elif model in ["double_normal", "double_skew_normal"]:
572
+ # for double peak
573
+ max_rhat = max(az_summary.loc[:, "r_hat"])
574
+ std = az_summary.loc["std[0]", "mean"]
575
+ area_sd = az_summary.loc["area[0]", "sd"]
576
+ area_mean = az_summary.loc["area[0]", "mean"]
577
+ height_sd = az_summary.loc["height[0]", "sd"]
578
+ height_mean = az_summary.loc["height[0]", "mean"]
579
+ std2 = az_summary.loc["std[1]", "mean"]
580
+ area_sd2 = az_summary.loc["area[1]", "sd"]
581
+ area_mean2 = az_summary.loc["area[1]", "mean"]
582
+ height_sd2 = az_summary.loc["height[1]", "sd"]
583
+ height_mean2 = az_summary.loc["height[1]", "mean"]
584
+
585
+ if max_rhat > 1.05:
586
+ resample = True
587
+ discard = False
588
+ return resample, discard, df_summary
589
+ # Booleans to differentiate which peak is or is not detected
590
+ double_not_found_first = False
591
+ double_not_found_second = False
592
+ if std <= 1 / 100 or area_sd > area_mean * 0.2 or height_sd > height_mean * 0.2:
593
+ # post-fit check failed
594
+ # add NaN values to summary DataFrame
595
+ double_not_found_first = True
596
+ if std2 <= 1 / 100 or area_sd2 > area_mean2 * 0.2 or height_sd2 > height_mean2 * 0.2:
597
+ # post-fit check failed
598
+ # add NaN values to summary DataFrame
599
+ double_not_found_second = True
600
+ # if both peaks failed the peak criteria tests, then reject peaks
601
+ if double_not_found_first and double_not_found_second:
602
+ reject_reasons = []
603
+ if std <= ui.peak_width_estimate / 100:
604
+ reject_reasons.append(f"standard deviation estimate ({std:.2f}) was too low")
605
+ if std2 <= ui.peak_width_estimate / 100:
606
+ reject_reasons.append(f"standard deviation estimate ({std2:.2f}) was too low")
607
+ if area_sd > area_mean * 0.2:
608
+ reject_reasons.append(f"area estimate ({area_mean} ± {area_sd}) was too uncertain")
609
+ if area_sd2 > area_mean2 * 0.2:
610
+ reject_reasons.append(
611
+ f"area estimate ({area_mean2} ± {area_sd2}) was too uncertain"
612
+ )
613
+ if height_sd > height_mean * 0.2:
614
+ reject_reasons.append(
615
+ f"height estimate ({height_mean} ± {height_sd}) was too uncertain"
616
+ )
617
+ if height_sd2 > height_mean2 * 0.2:
618
+ reject_reasons.append(
619
+ f"height estimate ({height_mean2} ± {height_sd2}) was too uncertain"
620
+ )
621
+
622
+ if reject_reasons:
623
+ rejection_msg = " and ".join(reject_reasons)
624
+
625
+ df_summary = report_add_nan_to_summary(filename, ui, df_summary, rejection_msg)
626
+ resample = False
627
+ discard = True
628
+
629
+ else:
630
+ raise NotImplementedError(f"The model {model} is not implemented.")
631
+ return resample, discard, df_summary
632
+
633
+
634
+ def posterior_predictive_sampling(pmodel, idata):
635
+ """Performs posterior predictive sampling for signals recognized as peaks.
636
+
637
+ Parameters
638
+ ----------
639
+ pmodel
640
+ A PyMC model.
641
+ idata
642
+ Previously sampled inference data object.
643
+
644
+ Returns
645
+ -------
646
+ idata
647
+ Inference data object updated with the posterior predictive samples.
648
+ """
649
+ with pmodel:
650
+ idata.extend(pm.sample_posterior_predictive(idata, var_names=["y"]))
651
+ return idata
652
+
653
+
654
+ def report_save_idata(idata, ui: UserInput, filename: str):
655
+ """
656
+ Saves inference data object as a .nc file.
657
+
658
+ Parameters
659
+ ----------
660
+ idata
661
+ Inference data object resulting from sampling.
662
+ ui
663
+ Instance of the UserInput class.
664
+ filename
665
+ Name of a raw data file containing a NumPy array with a time series (time as first, intensity as second element of the array).
666
+ """
667
+ fp = Path(ui.path) / f"{filename}.nc"
668
+ idata.to_netcdf(str(fp.absolute()))
669
+ return
670
+
671
+
672
+ def report_add_data_to_summary(
673
+ filename: str,
674
+ idata,
675
+ df_summary: pandas.DataFrame,
676
+ ui: UserInput,
677
+ is_peak: bool,
678
+ rejection_cause: str = "",
679
+ ):
680
+ """
681
+ Extracts the relevant information from idata, concatenates it to the summary DataFrame, and saves the DataFrame as an Excel file.
682
+ The summary is saved as 'peak_data_summary.xlsx' in the results directory; saving fails if that file is currently opened elsewhere (e.g. in Excel).
683
+
684
+ Parameters
685
+ ----------
686
+ idata
687
+ Inference data object resulting from sampling.
688
+ df_summary
689
+ DataFrame for collecting the results (i.e. peak parameters) of every signal of a given pipeline.
690
+ ui
691
+ Instance of the UserInput class.
692
+ is_peak
693
+ Boolean stating whether a signal was recognized as a peak (True) or not (False).
694
+ rejection_cause
695
+ Cause for rejecting a given signal.
696
+
697
+ Returns
698
+ -------
699
+ df_summary
700
+ Updated DataFrame for collecting the results (i.e. peak parameters) of every signal of a given pipeline.
701
+ """
702
+ az_summary: pandas.DataFrame = az.summary(idata)
703
+ model = ui.user_info[filename][0]
704
+ # split double peak into first and second peak (when extracting the data from az.summary(idata))
705
+ if model in ["double_normal", "double_skew_normal"]:
706
+ # first peak of double peak
707
+ parameters = [
708
+ "baseline_intercept",
709
+ "baseline_slope",
710
+ "mean[0]",
711
+ "noise",
712
+ "std[0]",
713
+ "area[0]",
714
+ "height[0]",
715
+ "sn[0]",
716
+ ]
717
+ df = az_summary.loc[parameters, :]
718
+ df = df.rename(
719
+ index={
720
+ "mean[0]": "mean",
721
+ "std[0]": "std",
722
+ "area[0]": "area",
723
+ "height[0]": "height",
724
+ "sn[0]": "sn",
725
+ }
726
+ )
727
+ df["acquisition"] = len(parameters) * [f"{ui.acquisition}"]
728
+ df["experiment_or_precursor_mz"] = len(parameters) * [ui.precursor]
729
+ df["product_mz_start"] = len(parameters) * [ui.product_mz_start]
730
+ df["product_mz_end"] = len(parameters) * [ui.product_mz_end]
731
+ df["is_peak"] = is_peak
732
+ df["cause_for_rejection"] = rejection_cause
733
+ df["model_type"] = len(parameters) * [model]
734
+ df["subpeak"] = len(parameters) * ["1st"]
735
+
736
+ # second peak of double peak
737
+ parameters = [
738
+ "baseline_intercept",
739
+ "baseline_slope",
740
+ "mean[1]",
741
+ "noise",
742
+ "std[1]",
743
+ "area[1]",
744
+ "height[1]",
745
+ "sn[1]",
746
+ ]
747
+ df2 = az_summary.loc[parameters, :]
748
+ df2 = df2.rename(
749
+ index={
750
+ "area[1]": "area",
751
+ "height[1]": "height",
752
+ "sn[1]": "sn",
753
+ "std[1]": "std",
754
+ "mean[1]": "mean",
755
+ }
756
+ )
757
+ df2["acquisition"] = len(parameters) * [f"{ui.acquisition}"]
758
+ df2["experiment_or_precursor_mz"] = len(parameters) * [ui.precursor]
759
+ df2["product_mz_start"] = len(parameters) * [ui.product_mz_start]
760
+ df2["product_mz_end"] = len(parameters) * [ui.product_mz_end]
761
+ df2["is_peak"] = is_peak
762
+ df2["cause_for_rejection"] = rejection_cause
763
+ df2["model_type"] = len(parameters) * [model]
764
+ df2["subpeak"] = len(parameters) * ["2nd"]
765
+ df_double = pandas.concat([df, df2])
766
+ df_summary = pandas.concat([df_summary, df_double])
767
+
768
+ else:
769
+ # for single peak
770
+ parameters = [
771
+ "baseline_intercept",
772
+ "baseline_slope",
773
+ "mean",
774
+ "noise",
775
+ "std",
776
+ "area",
777
+ "height",
778
+ "sn",
779
+ ]
780
+ df = az_summary.loc[parameters, :]
781
+ df["acquisition"] = len(parameters) * [f"{ui.acquisition}"]
782
+ df["experiment_or_precursor_mz"] = len(parameters) * [ui.precursor]
783
+ df["product_mz_start"] = len(parameters) * [ui.product_mz_start]
784
+ df["product_mz_end"] = len(parameters) * [ui.product_mz_end]
785
+ df["is_peak"] = is_peak
786
+ df["cause_for_rejection"] = rejection_cause
787
+ df["model_type"] = len(parameters) * [model]
788
+ df["subpeak"] = len(parameters) * [""]
789
+ df_summary = pandas.concat([df_summary, df])
790
+ # pandas.concat(df_summary, df)
791
+ # save summary df as Excel file
792
+ with pandas.ExcelWriter(
793
+ path=rf"{ui.path}/peak_data_summary.xlsx", engine="openpyxl", mode="w"
794
+ ) as writer:
795
+ df_summary.to_excel(writer)
796
+ return df_summary
797
+
798
+
799
+ def report_area_sheet(path: Union[str, os.PathLike], df_summary: pandas.DataFrame):
800
+ """
801
+ Save a different, more minimalist report sheet focussing on the area data.
802
+
803
+ Parameters
804
+ ----------
805
+ path
806
+ Path to the directory containing the raw data.
807
+ df_summary
808
+ DataFrame for collecting the results (i.e. peak parameters) of every signal of a given pipeline.
809
+ """
810
+ # also save a version of df_summary only for areas with correct order and only necessary data
811
+ df_area_summary = df_summary[df_summary.index == "area"]
812
+ sorted_area_summary = df_area_summary.sort_values(
813
+ ["acquisition", "experiment_or_precursor_mz", "product_mz_start"]
814
+ )
815
+ sorted_area_summary = sorted_area_summary.drop(
816
+ labels=["mcse_mean", "mcse_sd", "ess_bulk", "ess_tail"], axis=1
817
+ )
818
+ sorted_area_summary.to_excel(rf"{path}/area_summary.xlsx")
819
+ return
820
+
821
+
822
+ def report_add_nan_to_summary(
823
+ filename: str, ui: UserInput, df_summary: pandas.DataFrame, rejection_cause: str
824
+ ):
825
+ """
826
+ Method to add NaN values to the summary DataFrame in case a signal did not contain a peak.
827
+
828
+ Parameters
829
+ ----------
830
+ filename
+ Name of the raw data file.
+ ui
831
+ Instance of the UserInput class.
832
+ df_summary
833
+ DataFrame for collecting the results (i.e. peak parameters) of every signal of a given pipeline.
834
+ rejection_cause
835
+ Cause for rejecting a given signal.
836
+
837
+ Returns
838
+ -------
839
+ df_summary
840
+ Updated DataFrame for collecting the results (i.e. peak parameters) of every signal of a given pipeline.
841
+ """
842
+ model = ui.user_info[filename][0]
843
+ # create DataFrame with correct format and fill it with NaN
844
+ nan_dictionary = {
845
+ "mean": np.nan,
846
+ "sd": np.nan,
847
+ "hdi_3%": np.nan,
848
+ "hdi_97%": np.nan,
849
+ "mcse_mean": np.nan,
850
+ "mcse_sd": np.nan,
851
+ "ess_bulk": np.nan,
852
+ "ess_tail": np.nan,
853
+ "r_hat": np.nan,
854
+ }
855
+ df = pandas.DataFrame(
856
+ {
857
+ "baseline_intercept": nan_dictionary,
858
+ "baseline_slope": nan_dictionary,
859
+ "mean": nan_dictionary,
860
+ "noise": nan_dictionary,
861
+ "std": nan_dictionary,
862
+ "area": nan_dictionary,
863
+ "height": nan_dictionary,
864
+ "sn": nan_dictionary,
865
+ }
866
+ ).transpose()
867
+ # add information about the signal
868
+ df["acquisition"] = len(df.index) * [f"{ui.acquisition}"]
869
+ df["experiment_or_precursor_mz"] = len(df.index) * [ui.precursor]
870
+ df["product_mz_start"] = len(df.index) * [ui.product_mz_start]
871
+ df["product_mz_end"] = len(df.index) * [ui.product_mz_end]
872
+ df["is_peak"] = len(df.index) * [False]
873
+ df["cause_for_rejection"] = len(df.index) * [rejection_cause]
874
+ # if no peak was detected, there is no need for splitting double peaks, just give the info whether one was expected or not
875
+ df["model_type"] = len(df.index) * [model]
876
+ df["subpeak"] = len(df.index) * [""]
877
+ # concatenate to existing summary DataFrame
878
+ df_summary = pandas.concat([df_summary, df])
879
+ # save summary df as Excel file
880
+ with pandas.ExcelWriter(
881
+ path=rf"{ui.path}/peak_data_summary.xlsx", engine="openpyxl", mode="w"
882
+ ) as writer:
883
+ df_summary.to_excel(writer)
884
+ return df_summary
885
+
886
+
887
+ def pipeline_read_template(path_raw_data: Union[str, os.PathLike]):
888
+ """
889
+ Function to read and check the input settings and data from Template.xlsx when running the data pipeline.
890
+
891
+ Parameters
892
+ ----------
893
+ path_raw_data
894
+ Path to the raw data files. Files should be in the given raw_data_file_format, default is '.npy'.
895
+ The `.npy` files are expected to be (2, ?)-shaped 2D NumPy arrays with time and intensity in the first dimension.
896
+
897
+ Returns
898
+ -------
899
+ pre_filtering
900
+ If True, potential peaks will be filtered based on retention time and signal to noise ratio before sampling.
901
+ plotting
902
+ If True, PeakPerformance will plot results.
903
+ peak_width_estimate
904
+ Rough estimate of the average peak width in minutes expected for the LC-MS method with which the data was obtained.
905
+ minimum_sn
906
+ Minimum signal to noise ratio for a signal to be recognized as a peak during pre-filtering.
907
+ df_signals
908
+ Read-out of the signals tab from Template.xlsx as a DataFrame.
909
+ unique_identifiers
910
+ List of unique identifiers from the signals tab of Template.xlsx.
911
+ """
912
+ # read data and user input from the settings tab of Template.xlsx
913
+ df_settings = pandas.read_excel(
914
+ Path(path_raw_data) / "Template.xlsx", sheet_name="settings", index_col="parameter"
915
+ )
916
+ pre_filtering = eval(df_settings.loc["pre_filtering", "setting"])
917
+ if not isinstance(pre_filtering, bool):
918
+ raise InputError("pre_filtering under settings in Template.xlsx must be a bool.")
919
+ plotting = eval(df_settings.loc["plotting", "setting"])
920
+ if not isinstance(plotting, bool):
921
+ raise InputError("plotting under settings in Template.xlsx must be a bool.")
922
+ peak_width_estimate = df_settings.loc["peak_width_estimate", "setting"]
923
+ if not isinstance(peak_width_estimate, float) and not isinstance(peak_width_estimate, int):
924
+ try:
925
+ peak_width_estimate = float(peak_width_estimate)
926
+ except: # noqa: E722
927
+ raise InputError(
928
+ "peak_width_estimate under settings in Template.xlsx must be an int or float."
929
+ )
930
+ minimum_sn = df_settings.loc["minimum_sn", "setting"]
931
+ if not isinstance(minimum_sn, float) and not isinstance(minimum_sn, int):
932
+ try:
933
+ minimum_sn = float(minimum_sn)
934
+ except: # noqa: E722
935
+ raise InputError("minimum_sn under settings in Template.xlsx must be an int or float.")
936
+
937
+ # read data and user input from the signals tab of Template.xlsx
938
+ df_signals = pandas.read_excel(Path(path_raw_data) / "Template.xlsx", sheet_name="signals")
939
+ unique_identifiers = list(df_signals["unique_identifier"].replace("", np.nan).dropna())
940
+ unique_identifiers = [str(identifier) for identifier in unique_identifiers]
941
+ if not unique_identifiers:
942
+ raise InputError(
943
+ "The list in column unique_identifier in the signals tab of Template.xlsx must not be empty."
944
+ )
945
+ if len(set(unique_identifiers)) != len(unique_identifiers):
946
+ raise InputError(
947
+ "The list in column unique_identifier in the signals tab of Template.xlsx must contain only unique entries."
948
+ )
949
+ # test whether df_signals is filled out correctly
950
+ for x in range(len(df_signals)):
951
+ if not df_signals.isnull()["unique_identifier"][x] and df_signals.isnull()["model_type"][x]:
952
+ raise InputError(
953
+ f"In the signals tab of Template.xlsx, the unique identifier in row {x + 2} has no model type."
954
+ )
955
+ if pre_filtering:
956
+ if (
957
+ not df_signals.isnull()["unique_identifier"][x]
958
+ and df_signals.isnull()["retention_time_estimate"][x]
959
+ ):
960
+ raise InputError(
961
+ f"In the signals tab of Template.xlsx, the unique_identifier in row {x + 2} has no retention time estimate."
962
+ )
963
+ df_signals.set_index("unique_identifier", inplace=True)
964
+ return pre_filtering, plotting, peak_width_estimate, minimum_sn, df_signals, unique_identifiers
965
+
966
+
967
+ def pipeline_loop(
968
+ path_raw_data: Union[str, os.PathLike],
969
+ path_results: Union[str, os.PathLike],
970
+ raw_data_file_format: str,
971
+ df_summary: pandas.DataFrame,
972
+ *,
973
+ restart: bool = False,
974
+ ):
975
+ """
976
+ Function to run the complete PeakPerformance pipeline.
977
+
978
+ Parameters
979
+ ----------
980
+ path_raw_data
981
+ Path to the raw data files. Files should be in the given raw_data_file_format, default is '.npy'.
982
+ The `.npy` files are expected to be (2, ?)-shaped 2D NumPy arrays with time and intensity in the first dimension.
983
+ path_results
984
+ Path to the directory for the results of a given Batch run of PeakPerformance.
985
+ raw_data_file_format
986
+ Data format (suffix) of the raw data, default is '.npy'.
987
+ df_summary
988
+ DataFrame for collecting the results (i.e. peak parameters) of every signal of a given pipeline.
989
+ restart
990
+ If a pipeline broke for some reason, it can be restarted by setting restart to True.
991
+ That way, already analyzed files won't be analyzed again.
992
+ """
993
+ # read data and user input from the settings tab of Template.xlsx
994
+ (
995
+ pre_filtering,
996
+ plotting,
997
+ peak_width_estimate,
998
+ minimum_sn,
999
+ df_signals,
1000
+ unique_identifiers,
1001
+ ) = pipeline_read_template(path_raw_data)
1002
+ peak_model_list = []
1003
+ retention_time_estimate_list = []
1004
+ # synchronize the lists of raw data files, peak models, and retention times
1005
+ # they will be converted to the user_info dict when instantiating the UserInput class below
1006
+ df_files = pandas.read_excel(Path(path_raw_data) / "Template.xlsx", sheet_name="files")
1007
+ raw_data_files = list(df_files.loc[:, "file_name"])
1008
+ # in case of a restart, update raw_data_files to only contain files which have not been analyzed
1009
+ if restart:
1010
+ analyzed_files = os.listdir(path_results)
1011
+ # rebuild the list instead of removing entries while iterating over it
+ raw_data_files = [
+ raw for raw in raw_data_files if not any(raw in analyzed for analyzed in analyzed_files)
+ ]
1015
+ for file in raw_data_files:
1016
+ for identifier in unique_identifiers:
1017
+ if identifier in file:
1018
+ peak_model_list.append(str(df_signals.loc[identifier, "model_type"]))
1019
+ retention_time_estimate_list.append(
1020
+ df_signals.loc[identifier, "retention_time_estimate"]
1021
+ )
1022
+
1023
+ # loop over filenames
1024
+ for file in raw_data_files:
1025
+ # parse the data and extract information from the (standardized) file name
1026
+ (
1027
+ timeseries,
1028
+ acquisition,
1029
+ precursor,
1030
+ product_mz_start,
1031
+ product_mz_end,
1032
+ ) = parse_data(path_raw_data, file, raw_data_file_format)
1033
+ # instantiate the UserInput class with all given information
1034
+ ui = UserInput(
1035
+ path_results,
1036
+ raw_data_files,
1037
+ raw_data_file_format,
1038
+ peak_model_list,
1039
+ retention_time_estimate_list,
1040
+ peak_width_estimate,
1041
+ pre_filtering,
1042
+ minimum_sn,
1043
+ timeseries,
1044
+ acquisition,
1045
+ precursor,
1046
+ product_mz_start,
1047
+ product_mz_end,
1048
+ )
1049
+ # apply pre-sampling filter (if selected)
1050
+ if pre_filtering:
1051
+ # test if necessary settings were provided by the user
1052
+ if not retention_time_estimate_list:
1053
+ raise InputError(
1054
+ "If selecting pre-filtering, provide a list of retention time estimate in Template.xlsx."
1055
+ )
1056
+ if not minimum_sn:
1057
+ raise InputError(
1058
+ "If selecting pre-filtering, provide a minimum signal-to-noise ratio in Template.xlsx."
1059
+ )
1060
+ if not peak_width_estimate:
1061
+ raise InputError(
1062
+ "If selecting pre-filtering, provide a rough estimate of the general peak width in Template.xlsx."
1063
+ )
1064
+
1065
+ # calculate noise guess for pre-filtering
1066
+ slope_guess, intercept_guess, noise_guess = models.initial_guesses(
1067
+ ui.timeseries[0], ui.timeseries[1]
1068
+ )
1069
+ prefilter, df_summary = prefiltering(file, ui, noise_guess, df_summary)
1070
+ if not prefilter:
1071
+ # if no peak candidates were found, continue with the next signal
1072
+ if plotting:
1073
+ plots.plot_raw_data(
1074
+ file[: -len(ui.raw_data_file_format)],
1075
+ ui.timeseries[0],
1076
+ ui.timeseries[1],
1077
+ ui.path,
1078
+ )
1079
+ continue
1080
+ # select model based on information in UserInput
1081
+ model = ui.user_info[file][0]
1082
+ if model == models.ModelType.Normal:
1083
+ pmodel = models.define_model_normal(ui.timeseries[0], ui.timeseries[1])
1084
+ elif model == models.ModelType.SkewNormal:
1085
+ pmodel = models.define_model_skew(ui.timeseries[0], ui.timeseries[1])
1086
+ elif model == models.ModelType.DoubleNormal:
1087
+ pmodel = models.define_model_double_normal(ui.timeseries[0], ui.timeseries[1])
1088
+ elif model == models.ModelType.DoubleSkewNormal:
1089
+ pmodel = models.define_model_double_skew_normal(ui.timeseries[0], ui.timeseries[1])
1090
+ else:
1091
+ raise NotImplementedError(
1092
+ f"The model '{model}' specified for file '{file}' is not implemented."
1093
+ )
1094
+
1095
+ # sample the chosen model
1096
+ idata = sampling(pmodel)
1097
+ # apply post-sampling filter
1098
+ resample, discard, df_summary = postfiltering(file, idata, ui, df_summary)
1099
+ # if peak was discarded, continue with the next signal
1100
+ if discard:
1101
+ if plotting:
1102
+ plots.plot_posterior(
1103
+ file[: -len(ui.raw_data_file_format)],
1104
+ ui.timeseries[0],
1105
+ ui.timeseries[1],
1106
+ ui.path,
1107
+ idata,
1108
+ True,
1109
+ )
1110
+ continue
1111
+ # if convergence was not yet reached, sample again with more tuning samples
1112
+ if resample:
1113
+ if "double" in model:
1114
+ idata = sampling(pmodel, tune=16000)
1115
+ else:
1116
+ idata = sampling(pmodel, tune=6000)
1117
+ resample, discard, df_summary = postfiltering(file, idata, ui, df_summary)
1118
+ if discard:
1119
+ plots.plot_posterior(
1120
+ file[: -len(ui.raw_data_file_format)],
1121
+ ui.timeseries[0],
1122
+ ui.timeseries[1],
1123
+ ui.path,
1124
+ idata,
1125
+ True,
1126
+ )
1127
+ continue
1128
+ if resample:
1129
+ # if signal was flagged for re-sampling a second time, discard it
1130
+ rejection_msg = "postfiltering: signal was flagged for re-sampling with increased sample number twice"
1131
+ df_summary = report_add_data_to_summary(
1132
+ file, idata, df_summary, ui, False, rejection_msg
1133
+ )
1134
+ if plotting:
1135
+ plots.plot_posterior(
1136
+ file[: -len(ui.raw_data_file_format)],
1137
+ ui.timeseries[0],
1138
+ ui.timeseries[1],
1139
+ ui.path,
1140
+ idata,
1141
+ True,
1142
+ )
1143
+ continue
1144
+ # perform posterior predictive sampling
1145
+ idata = posterior_predictive_sampling(pmodel, idata)
1146
+ # add inference data to df_summary and save it as an Excel file
1147
+ df_summary = report_add_data_to_summary(file, idata, df_summary, ui, True)
1148
+ # save the inference data object as a netcdf file
1149
+ report_save_idata(idata, ui, file[: -len(ui.raw_data_file_format)])
1150
+ # plot data
1151
+ if plotting:
1152
+ plots.plot_posterior_predictive(
1153
+ file[: -len(ui.raw_data_file_format)],
1154
+ ui.timeseries[0],
1155
+ ui.timeseries[1],
1156
+ ui.path,
1157
+ idata,
1158
+ False,
1159
+ )
1160
+ plots.plot_posterior(
1161
+ file[: -len(ui.raw_data_file_format)],
1162
+ ui.timeseries[0],
1163
+ ui.timeseries[1],
1164
+ ui.path,
1165
+ idata,
1166
+ False,
1167
+ )
1168
+ # save condensed Excel file with area data
1169
+ report_area_sheet(path_results, df_summary)
1170
+
1171
+
1172
+ def pipeline(
1173
+ path_raw_data: Union[str, os.PathLike],
1174
+ raw_data_file_format: str,
1175
+ ):
1176
+ """
1177
+ Function to run the complete PeakPerformance pipeline.
1178
+
1179
+ Parameters
1180
+ ----------
1181
+ path_raw_data
1182
+ Path to the raw data files. Files should be in the given raw_data_file_format, default is '.npy'.
1183
+ The `.npy` files are expected to be (2, ?)-shaped 2D NumPy arrays with time and intensity in the first dimension.
1184
+ raw_data_file_format
1185
+ Data format (suffix) of the raw data, default is '.npy'.
1186
+
1187
+ Returns
1188
+ -------
1189
+ path_results
1190
+ Path variable pointing to the newly created folder for this batch.
1191
+ """
1192
+ # create data structure and DataFrame(s) for results
1193
+ df_summary, path_results = initiate(path_raw_data)
1194
+ pipeline_loop(
1195
+ path_raw_data,
1196
+ path_results,
1197
+ raw_data_file_format,
1198
+ df_summary,
1199
+ )
1200
+ return path_results
1201
+
1202
+
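+ # Illustrative use of pipeline(): a typical batch analysis, assuming Template.xlsx has
+ # already been prepared in the (hypothetical) raw data directory.
+ # >>> from peak_performance import pipeline as pl
+ # >>> path_results = pl.pipeline("/path/to/raw_data", ".npy")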
1203
+ def pipeline_restart(
1204
+ path_raw_data: Union[str, os.PathLike],
1205
+ raw_data_file_format: str,
1206
+ path_results: Union[str, os.PathLike],
1207
+ ):
1208
+ """
1209
+ Function to restart a broken PeakPerformance pipeline.
1210
+ Files which are in the results directory of the broken pipeline will not be analyzed again.
1211
+ WARNING: This only works once! If a pipeline fails more than once, copy all files (except the Excel report sheets)
1212
+ into one directory and specify this directory as the path_results argument.
1213
+
1214
+ Parameters
1215
+ ----------
1216
+ path_raw_data
1217
+ Path to the raw data files. Files should be in the given raw_data_file_format, default is '.npy'.
1218
+ The `.npy` files are expected to be (2, ?)-shaped 2D NumPy arrays with time and intensity in the first dimension.
1219
+ raw_data_file_format
1220
+ Data format (suffix) of the raw data, default is '.npy'.
1221
+ path_results
1222
+ Path variable pointing to the directory of the broken PeakPerformance batch
1223
+
1224
+ Returns
1225
+ -------
1226
+ path_results_new
1227
+ Path variable pointing to the newly created folder for the restarted batch.
1228
+ """
1229
+ df_summary, path_results_new = initiate(path_raw_data)
1230
+ pipeline_loop(
1231
+ path_raw_data,
1232
+ path_results,
1233
+ raw_data_file_format,
1234
+ df_summary,
1235
+ restart=True,
1236
+ )
1237
+ return path_results_new
1238
+
1239
+
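+ # Illustrative use of pipeline_restart(): re-uses the raw data directory and points to
+ # the results folder of the failed run so already finished files are skipped
+ # (both paths are hypothetical).
+ # >>> from peak_performance import pipeline as pl
+ # >>> path_results_new = pl.pipeline_restart(
+ # ...     "/path/to/raw_data", ".npy", "/path/to/raw_data/2023-08-24_13-05-59_run"
+ # ... )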
1240
+ def excel_template_prepare(
1241
+ path_raw_data: Union[str, os.PathLike],
1242
+ path_peak_performance: Union[str, os.PathLike],
1243
+ raw_data_files: Union[List[str], Tuple[str]],
1244
+ unique_identifiers: Union[List[str], Tuple[str]],
1245
+ ):
1246
+ """
1247
+ Function to copy Template.xlsx from the PeakPerformance directory to the directory containing the raw data files.
1248
+ Subsequently, update Template.xlsx with a list of all raw data files and of all unique_identifiers.
1249
+
1250
+ Parameters
1251
+ ----------
1252
+ path_raw_data
1253
+ Path to the folder containing raw data.
1254
+ path_peak_performance
1255
+ Path to the folder containing PeakPerformance.
1256
+ raw_data_files
1257
+ List with names of all files of the specified data type in path_raw_data.
1258
+ unique_identifiers
1259
+ List with all unique combinations of targeted molecules.
1260
+ (i.e. experiment number or precursor ion m/z ratio and product ion m/z ratio range)
1261
+ """
1262
+ # copy Template.xlsx from PeakPerformance to the directory with the raw data
1263
+ try:
1264
+ shutil.copy(
1265
+ Path(path_peak_performance) / "Template.xlsx", Path(path_raw_data) / "Template.xlsx"
1266
+ )
1267
+ except FileNotFoundError:
1268
+ raise ParsingError(f"Template.xlsx was not found in {path_peak_performance}.")
1269
+ except Exception:
1270
+ raise ParsingError(
1271
+ f"Error while copying Template.xlsx from {path_peak_performance} into {path_raw_data}."
1272
+ )
1273
+ # load Template.xlsx
1274
+ wb = load_workbook(Path(path_raw_data) / "Template.xlsx")
1275
+ # add list of all file names to the files tab
1276
+ wb_files = wb["files"]
1277
+ df1 = pandas.DataFrame({"file_name": raw_data_files})
1278
+ for r in dataframe_to_rows(df1, index=False, header=False):
1279
+ wb_files.append(r)
1280
+ # add list of all unique identifiers (i.e. mass traces) to the signals tab
1281
+ wb_signals = wb["signals"]
1282
+ df2 = pandas.DataFrame({"unique_identifier": unique_identifiers})
1283
+ for r in dataframe_to_rows(df2, index=False, header=False):
1284
+ wb_signals.append(r)
1285
+ wb.save(Path(path_raw_data) / "Template.xlsx")
1286
+ return
1287
+
1288
+
1289
+ def prepare_model_selection(
1290
+ path_raw_data: Union[str, os.PathLike],
1291
+ path_template: Union[str, os.PathLike],
1292
+ ):
1293
+ """
1294
+ Function to prepare model selection by providing and mostly filling out an Excel template
1295
+ Template.xlsx. After this step, the user has to provide relevant information in Template.xlsx
1296
+ which is finally used for model selection.
1297
+
1298
+ Parameters
1299
+ ----------
1300
+ path_raw_data
1301
+ Path to the folder containing raw data.
1302
+ path_template
1303
+ Path to the folder containing Template.xlsx from PeakPerformance.
1304
+ """
1305
+ # detect raw data files
1306
+ raw_data_files = detect_raw_data(path_raw_data)
1307
+ # parse unique identifiers
1308
+ identifiers = parse_unique_identifiers(raw_data_files)
1309
+ # copy Template.xlsx into the raw data directory and add data from the previous commands
1310
+ excel_template_prepare(path_raw_data, path_template, raw_data_files, identifiers)
1311
+ return
1312
+
1313
+
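+ # Illustrative use of prepare_model_selection(): step 1 of the model selection workflow.
+ # Afterwards, the user completes the signals tab of Template.xlsx before calling
+ # model_selection(). Both paths are hypothetical; path_template must contain
+ # PeakPerformance's Template.xlsx.
+ # >>> from peak_performance import pipeline as pl
+ # >>> pl.prepare_model_selection("/path/to/raw_data", "/path/to/PeakPerformance")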
1314
+ def parse_files_for_model_selection(signals: pandas.DataFrame) -> Dict[str, str]:
1315
+ """
1316
+ Function to parse the file names for model selection.
1317
+
1318
+ Parameters
1319
+ ----------
1320
+ signals
1321
+ DataFrame containing the signals tab of Template.xlsx.
1322
+
1323
+ Returns
1324
+ -------
1325
+ files_for_selection
1326
+ Dict with file names as keys and unique identifiers as values.
1327
+ """
1328
+ identifier_list = list(signals["unique_identifier"].replace("", np.nan).dropna())
1329
+ model_list = list(signals["model_type"].replace("", np.nan).dropna())
1330
+ acquisition_list = list(
1331
+ signals["acquisition_for_choosing_model_type"].replace("", np.nan).dropna()
1332
+ )
1333
+ # sanity checks
1334
+ if not identifier_list:
1335
+ raise InputError("In the signals tab of Template.xlsx, there are no unqiue_identifiers.")
1336
+ if not model_list and not acquisition_list:
1337
+ raise InputError(
1338
+ "In the signals tab of Template.xlsx, no model or acquisition(s) for model selection were provided."
1339
+ )
1340
+ if len(identifier_list) == len(model_list):
1341
+ raise InputError(
1342
+ """In the signals tab of Template.xlsx, for each unique identifier a model type was provided.
1343
+ Thus, no model selection is performed."""
1344
+ )
1345
+ # multiple scenarios have to be covered
1346
+ files_for_selection: Dict[str, str] = {}
1347
+ signals = signals.fillna("")
1348
+ if len(model_list) == len(signals.index):
1349
+ # scenario 1: a model was specified for every unique identifier (by the user) -> model selection obsolete
1350
+ return files_for_selection
1351
+ elif len(signals.index) - len(model_list) > 1 and len(acquisition_list) == 1:
1352
+ # scenario 2: for more than one unique identifier no model was specified by the user
1353
+ # but a single acquisition was given for model selection -> model selection from one acquisition
1354
+ acquisition = acquisition_list[0]
1355
+ # remove possible leading or trailing whitespace from the user's entry
1356
+ acquisition = acquisition.strip()
1357
+ for idx, row in signals.iterrows():
1358
+ if not signals.loc[idx, "model_type"]:
1359
+ unique_identifier = getattr(row, "unique_identifier")
1360
+ filename = "_".join([acquisition, unique_identifier])
1361
+ files_for_selection[filename] = unique_identifier
1362
+ elif len(signals.index) - len(model_list) == len(acquisition_list):
1363
+ # scenario 3: for every unique identifier for which no model was specified by the user,
1364
+ # they provided an acquistion for model selection
1365
+ for idx, row in signals.iterrows():
1366
+ if not signals.loc[idx, "model_type"]:
1367
+ acquisition = getattr(row, "acquisition_for_choosing_model_type")
1368
+ unique_identifier = getattr(row, "unique_identifier")
1369
+ filename = "_".join([acquisition, unique_identifier])
1370
+ files_for_selection[filename] = unique_identifier
1371
+ else:
1372
+ raise InputError(
1373
+ "When using model selection, provide either one acquisition or one acquisition per unique identifier (no in-betweens)."
1374
+ )
1375
+ return files_for_selection
1376
+
1377
+
1378
+ def selected_models_to_template(
1379
+ path_raw_data: Union[str, os.PathLike],
1380
+ signals: pandas.DataFrame,
1381
+ model_dict: Mapping[str, str],
1382
+ ):
1383
+ """
1384
+ Function to update Template.xlsx with the selected model types.
1385
+
1386
+ Parameters
1387
+ ----------
1388
+ path_raw_data
1389
+ Path to the folder containing raw data.
1390
+ signals
1391
+ DataFrame containing the signals tab of Template.xlsx.
1392
+ model_dict
1393
+ Dict with unique identifiers as keys and model types as values.
1394
+ """
1395
+ signals = signals.fillna("")
1396
+ for idx, row in signals.iterrows():
1397
+ if not signals.loc[idx, "model_type"]:
1398
+ unique_identifier = getattr(row, "unique_identifier")
1399
+ signals.loc[idx, "model_type"] = model_dict[unique_identifier]
1400
+ # update in Excel
1401
+ wb = load_workbook(Path(path_raw_data) / "Template.xlsx")
1402
+ # update signals tab with model types by deleting rows and appending signals
1403
+ wb_signals = wb["signals"]
1404
+ wb_signals.delete_rows(wb_signals.min_row + 1, wb_signals.max_row)
1405
+ for r in dataframe_to_rows(signals, index=False, header=False):
1406
+ wb_signals.append(r)
1407
+ wb.save(Path(path_raw_data) / "Template.xlsx")
1408
+ return
1409
+
1410
+
1411
+ def model_selection_check(
1412
+ result_df: pandas.DataFrame, ic: str, elpd_threshold: Union[int, float] = 25
1413
+ ) -> str:
1414
+ """
1415
+ During model selection, double peak models are sometimes incorrectly preferred due to their increased complexity.
1416
+ Therefore, they have to outperform single peak models by an empirically determined value of the elpd.
1417
+
1418
+ Parameters
1419
+ ----------
1420
+ result_df
1421
+ DataFrame with the result of model comparison via az.compare().
1422
+ ic
1423
+ Information criterion to be used for model selection.
1424
+ ("loo": pareto-smoothed importance sampling leave-one-out cross-validation,
1425
+ "waic": widely applicable information criterion)
1426
+ elpd_threshold
1427
+ Threshold of the elpd difference between a double and a single peak model for the double peak model
1428
+ to be accepted.
1429
+
1430
+ Returns
1431
+ -------
1432
+ selected_model
1433
+ Name of the selected model type.
1434
+ """
1435
+ selected_model = str(result_df.index[0])
1436
+ if "double" in selected_model:
1437
+ df_single_peak_models = result_df[~result_df.index.str.contains("double")]
1438
+ elpd_single = max(list(df_single_peak_models[f"elpd_{ic}"]))
1439
+ elpd_double = max(list(result_df[f"elpd_{ic}"]))
1440
+ if not elpd_double > elpd_single + elpd_threshold:
1441
+ selected_model = str(df_single_peak_models.index[0])
1442
+ return selected_model
1443
+
1444
+
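+ # Illustrative use of model_selection_check() with a hand-made comparison DataFrame
+ # (all numbers invented): the double peak model ranks first but beats the best single
+ # peak model by less than the elpd threshold, so the single peak model is returned.
+ # >>> toy = pandas.DataFrame(
+ # ...     {"elpd_loo": [-100.0, -110.0]}, index=["double_normal", "normal"]
+ # ... )
+ # >>> model_selection_check(toy, "loo", elpd_threshold=25)
+ # 'normal'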
1445
+ def selection_loop(
1446
+ path_raw_data: Union[str, os.PathLike],
1447
+ *,
1448
+ files_for_selection: Mapping[str, str],
1449
+ raw_data_files: Union[List[str], Tuple[str]],
1450
+ ic: str,
1451
+ signals: pandas.DataFrame,
1452
+ ):
1453
+ """
1454
+ Function containing the loop over all filenames intended for the model selection.
1455
+ Involves sampling every model featured by PeakPerformance, computing the loglikelihood
1456
+ and an information criterion, and comparing the results to ascertain the best model for every file.
1457
+
1458
+ Parameters
1459
+ ----------
1460
+ path_raw_data
1461
+ Path to the folder containing raw data.
1462
+ files_for_selection
1463
+ Dict with file names as keys and unique identifiers as values.
1464
+ raw_data_files
1465
+ List of raw data files returned by the detect_raw_data() function.
1466
+ Is needed here only to get access to the file format.
1467
+ ic
1468
+ Information criterion to be used for model selection.
1469
+ ("loo": pareto-smoothed importance sampling leave-one-out cross-validation,
1470
+ "waic": widely applicable information criterion)
1471
+
1472
+ Returns
1473
+ -------
1474
+ result_df
1475
+ DataFrame containing the ranking and scores of the model selection.
1476
+ model_dict
1477
+ Dict with unique identifiers as keys and model types as values.
1478
+ """
1479
+ model_dict = {}
1480
+ # get data file format from raw_data_files
1481
+ file_format = raw_data_files[0].split(".")[-1]
1482
+ # loop over all filenames in files_for_selection
1483
+ for filename in files_for_selection.keys():
1484
+ # load time series
1485
+ timeseries = np.load(Path(path_raw_data) / (filename + "." + file_format))
1486
+ idata_dict = {}
1487
+ # get all implemented models, then remove those which were excluded
1488
+ # from model selection by the user
1489
+ models_to_exclude = str(
1490
+ signals.loc[files_for_selection[filename], "models_to_exclude_from_selection"]
1491
+ )
1492
+ model_list = set(models.ModelType)
1493
+ if models_to_exclude:
1494
+ exclude_models = {mex.strip() for mex in models_to_exclude.split(",")}
1495
+ model_list = model_list - exclude_models # type: ignore[operator]
1496
+ if models.ModelType.Normal in model_list:
1497
+ pmodel_normal = models.define_model_normal(timeseries[0], timeseries[1])
1498
+ idata_normal = sampling(pmodel_normal, tune=6000)
1499
+ idata_normal = models.compute_log_likelihood(pmodel_normal, idata_normal)
1500
+ idata_normal_summary = az.summary(idata_normal)
1501
+ idata_dict["normal"] = [idata_normal_summary, idata_normal]
1502
+ if models.ModelType.SkewNormal in model_list:
1503
+ pmodel_skew = models.define_model_skew(timeseries[0], timeseries[1])
1504
+ idata_skew = sampling(pmodel_skew, tune=6000)
1505
+ idata_skew = models.compute_log_likelihood(pmodel_skew, idata_skew)
1506
+ idata_skew_normal_summary = az.summary(idata_skew)
1507
+ idata_dict["skew_normal"] = [idata_skew_normal_summary, idata_skew]
1508
+ if models.ModelType.DoubleNormal in model_list:
1509
+ pmodel_double_normal = models.define_model_double_normal(timeseries[0], timeseries[1])
1510
+ idata_double_normal = sampling(pmodel_double_normal, tune=6000)
1511
+ idata_double_normal = models.compute_log_likelihood(
1512
+ pmodel_double_normal, idata_double_normal
1513
+ )
1514
+ idata_double_normal_summary = az.summary(idata_double_normal)
1515
+ idata_dict["double_normal"] = [idata_double_normal_summary, idata_double_normal]
1516
+ if models.ModelType.DoubleSkewNormal in model_list:
1517
+ pmodel_double_skew = models.define_model_double_skew_normal(
1518
+ timeseries[0], timeseries[1]
1519
+ )
1520
+ idata_double_skew = sampling(pmodel_double_skew, tune=6000)
1521
+ idata_double_skew = models.compute_log_likelihood(pmodel_double_skew, idata_double_skew)
1522
+ idata_double_skew_normal_summary = az.summary(idata_double_skew)
1523
+ idata_dict["double_skew_normal"] = [idata_double_skew_normal_summary, idata_double_skew]
1524
+
1525
+ # add model to compare_dict for model selection only if convergence criterion was met (r_hat <= 1.05)
1526
+ compare_dict = {}
1527
+ for model in idata_dict.keys():
1528
+ if not (idata_dict[model][0].loc[:, "r_hat"] > 1.05).any():
1529
+ compare_dict[model] = idata_dict[model][1]
1530
+ # compare_dict needs at least two entries for model comparison
1531
+ # if not enough pass the r_hat test, accept all for now to avoid error
1532
+ if len(compare_dict) < 2:
1533
+ warnings.warn(
1534
+ f"Only one or less models converged during model selection for {filename}."
1535
+ )
1536
+ for model in idata_dict.keys():
1537
+ compare_dict[model] = idata_dict[model][1]
1538
+ # perform the actual model comparison
1539
+ result_df = models.model_comparison(compare_dict, ic)
1540
+ # double peak models are sometimes incorrectly preferred due to their increased complexity
1541
+ # therefore, they have to outperform single peak models by an empirically determined value of the elpd
1542
+ selected_model = model_selection_check(result_df, ic)
1543
+ # update model_dict with unique_identifier as key and selected_model as value
1544
+ model_dict[files_for_selection[filename]] = selected_model
1545
+ # optional: plot the results of model comparison
1546
+ return result_df, model_dict
1547
+
1548
+
1549
+ def model_selection(path_raw_data: Union[str, os.PathLike], *, ic: str = "loo"):
1550
+ """
1551
+ Method to select the best model for every signal (i.e. combination of experiment number or precursor ion m/z ratio
1552
+ and product ion m/z ratio). This is realized by analyzing one representative sample of the batch with all models and
1553
+ comparing the results based on an information criterion.
1554
+
1555
+ Parameters
1556
+ ----------
1557
+ path_raw_data
1558
+ Path to the folder containing raw data.
1559
+ ic
1560
+ Information criterion to be used for model selection.
1561
+ ("loo": pareto-smoothed importance sampling leave-one-out cross-validation,
1562
+ "waic": widely applicable information criterion)
1563
+
1564
+ Returns
1565
+ -------
1566
+ comparison_results
1567
+ DataFrame containing all rankings from model selection.
1568
+ model_dict
1569
+ Dict with unique identifiers as keys and model types as values.
1570
+ """
1571
+ # check for which signals model selection is requested and whether it uses one or several acquisitions
1572
+ df_signals = pandas.read_excel(Path(path_raw_data) / "Template.xlsx", sheet_name="signals")
1573
+ files_for_selection = parse_files_for_model_selection(df_signals)
1574
+ # get raw_data_files to get automatic access to the file format in selection_loop
1575
+ raw_data_files = detect_raw_data(path_raw_data)
1576
+ # loop over all files_for_selection
1577
+ df_signals.set_index("unique_identifier", inplace=True)
1578
+ comparison_results = pandas.DataFrame()
1579
+ result_df, model_dict = selection_loop(
1580
+ path_raw_data,
1581
+ files_for_selection=files_for_selection,
1582
+ raw_data_files=raw_data_files,
1583
+ ic=ic,
1584
+ signals=df_signals,
1585
+ )
1586
+ comparison_results = pandas.concat([comparison_results, result_df])
1587
+ # update signals tab of Template.xlsx; read again to reset index
1588
+ df_signals = pandas.read_excel(Path(path_raw_data) / "Template.xlsx", sheet_name="signals")
1589
+ try:
1590
+ selected_models_to_template(path_raw_data, df_signals, model_dict)
1591
+ except PermissionError:
1592
+ warnings.warn(
1593
+ """Since Template.xlsx was open during model selection, it could not be updated.
1594
+ Use the returned variables and pl.selected_models_to_template() to update it."""
1595
+ )
1596
+ return comparison_results, model_dict
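+ # Illustrative end-to-end sketch of a new batch (all paths hypothetical):
+ # >>> from peak_performance import pipeline as pl
+ # >>> pl.prepare_model_selection("/path/to/raw_data", "/path/to/PeakPerformance")
+ # >>> # ... user completes the settings and signals tabs of Template.xlsx ...
+ # >>> comparison_results, model_dict = pl.model_selection("/path/to/raw_data", ic="loo")
+ # >>> path_results = pl.pipeline("/path/to/raw_data", ".npy")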