paradigma 1.0.4__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,7 @@
1
- from typing import List
1
+ import json
2
+ import logging
3
+ from importlib.resources import files
4
+ from pathlib import Path
2
5
 
3
6
  import numpy as np
4
7
  import pandas as pd
@@ -6,7 +9,7 @@ from scipy.signal import welch
6
9
  from scipy.signal.windows import hamming, hann
7
10
 
8
11
  from paradigma.classification import ClassifierPackage
9
- from paradigma.config import PulseRateConfig
12
+ from paradigma.config import PPGConfig, PulseRateConfig
10
13
  from paradigma.constants import DataColumns
11
14
  from paradigma.feature_extraction import (
12
15
  compute_auto_correlation,
@@ -21,6 +24,7 @@ from paradigma.pipelines.pulse_rate_utils import (
21
24
  extract_pr_from_segment,
22
25
  extract_pr_segments,
23
26
  )
27
+ from paradigma.preprocessing import preprocess_ppg_data
24
28
  from paradigma.segmenting import WindowedDataExtractor, tabulate_windows
25
29
  from paradigma.util import aggregate_parameter
26
30
 
@@ -33,9 +37,11 @@ def extract_signal_quality_features(
33
37
  ) -> pd.DataFrame:
34
38
  """
35
39
  Extract signal quality features from the PPG signal.
36
- The features are extracted from the temporal and spectral domain of the PPG signal.
37
- The temporal domain features include variance, mean, median, kurtosis, skewness, signal-to-noise ratio, and autocorrelation.
38
- The spectral domain features include the dominant frequency, relative power, spectral entropy.
40
+ The features are extracted from the temporal and spectral domain of the
41
+ PPG signal. The temporal domain features include variance, mean, median,
42
+ kurtosis, skewness, signal-to-noise ratio, and autocorrelation. The
43
+ spectral domain features include the dominant frequency, relative power,
44
+ spectral entropy.
39
45
 
40
46
  Parameters
41
47
  ----------
@@ -44,9 +50,11 @@ def extract_signal_quality_features(
44
50
  df_acc : pd.DataFrame
45
51
  The DataFrame containing the accelerometer signal.
46
52
  ppg_config: PulseRateConfig
47
- The configuration for the signal quality feature extraction of the PPG signal.
53
+ The configuration for the signal quality feature extraction of the PPG
54
+ signal.
48
55
  acc_config: PulseRateConfig
49
- The configuration for the signal quality feature extraction of the accelerometer signal.
56
+ The configuration for the signal quality feature extraction of the
57
+ accelerometer signal.
50
58
 
51
59
  Returns
52
60
  -------
@@ -68,9 +76,9 @@ def extract_signal_quality_features(
68
76
  extractor = WindowedDataExtractor(ppg_windowed_colnames)
69
77
  idx_time = extractor.get_index(ppg_config.time_colname)
70
78
  idx_ppg = extractor.get_index(ppg_config.ppg_colname)
71
- start_time_ppg = np.min(
72
- ppg_windowed[:, :, idx_time], axis=1
73
- ) # Start time of the window is relative to the first datapoint in the PPG data
79
+ # Start time of the window is relative to the first datapoint in the PPG
80
+ # data
81
+ start_time_ppg = np.min(ppg_windowed[:, :, idx_time], axis=1)
74
82
  ppg_values_windowed = ppg_windowed[:, :, idx_ppg]
75
83
 
76
84
  df_features = pd.DataFrame(start_time_ppg, columns=[ppg_config.time_colname])
@@ -125,15 +133,22 @@ def signal_quality_classification(
125
133
  df: pd.DataFrame, config: PulseRateConfig, clf_package: ClassifierPackage
126
134
  ) -> pd.DataFrame:
127
135
  """
128
- Classify the signal quality of the PPG signal using a logistic regression classifier. A probability close to 1 indicates a high-quality signal, while a probability close to 0 indicates a low-quality signal.
129
- The classifier is trained on features extracted from the PPG signal. The features are extracted using the extract_signal_quality_features function.
130
- The accelerometer signal is used to determine the signal quality based on the power ratio of the accelerometer signal and returns a binary label based on a threshold.
131
- A value of 1 on the indicates no/minor periodic motion influence of the accelerometer on the PPG signal, 0 indicates major periodic motion influence.
136
+ Classify the signal quality of the PPG signal using a logistic regression
137
+ classifier. A probability close to 1 indicates a high-quality signal,
138
+ while a probability close to 0 indicates a low-quality signal. The
139
+ classifier is trained on features extracted from the PPG signal. The
140
+ features are extracted using the extract_signal_quality_features
141
+ function. The accelerometer signal is used to determine the signal
142
+ quality based on the power ratio of the accelerometer signal and returns
143
+ a binary label based on a threshold. A value of 1 on the indicates
144
+ no/minor periodic motion influence of the accelerometer on the PPG
145
+ signal, 0 indicates major periodic motion influence.
132
146
 
133
147
  Parameters
134
148
  ----------
135
149
  df : pd.DataFrame
136
- The DataFrame containing the PPG features and the accelerometer feature for signal quality classification.
150
+ The DataFrame containing the PPG features and the accelerometer
151
+ feature for signal quality classification.
137
152
  config : PulseRateConfig
138
153
  The configuration for the signal quality classification.
139
154
  clf_package : ClassifierPackage
@@ -142,7 +157,9 @@ def signal_quality_classification(
142
157
  Returns
143
158
  -------
144
159
  df_sqa pd.DataFrame
145
- The DataFrame containing the PPG signal quality predictions (both probabilities of the PPG signal quality classification and the accelerometer label based on the threshold).
160
+ The DataFrame containing the PPG signal quality predictions (both
161
+ probabilities of the PPG signal quality classification and the
162
+ accelerometer label based on the threshold).
146
163
  """
147
164
  # Set classifier
148
165
  clf = clf_package.classifier # Load the logistic regression classifier
@@ -152,16 +169,16 @@ def signal_quality_classification(
152
169
  df.loc[:, clf.feature_names_in]
153
170
  ) # Apply scaling to the features
154
171
 
155
- # Make predictions for PPG signal quality assessment, and assign the probabilities to the DataFrame and drop the features
172
+ # Make predictions for PPG signal quality assessment, and assign the
173
+ # probabilities to the DataFrame and drop the features
156
174
  df[DataColumns.PRED_SQA_PROBA] = clf.predict_proba(scaled_features)[:, 0]
157
175
  keep_cols = [config.time_colname, DataColumns.PRED_SQA_PROBA]
158
176
 
159
177
  if DataColumns.ACC_POWER_RATIO in df.columns:
178
+ # Assign accelerometer label to the DataFrame based on the threshold
160
179
  df[DataColumns.PRED_SQA_ACC_LABEL] = (
161
180
  df[DataColumns.ACC_POWER_RATIO] < config.threshold_sqa_accelerometer
162
- ).astype(
163
- int
164
- ) # Assign accelerometer label to the DataFrame based on the threshold
181
+ ).astype(int)
165
182
  keep_cols += [DataColumns.PRED_SQA_ACC_LABEL]
166
183
 
167
184
  return df[keep_cols]
@@ -252,9 +269,9 @@ def estimate_pulse_rate(
252
269
  config.kern_params,
253
270
  )
254
271
  n_pr = len(pr_est) # Number of pulse rate estimates
255
- end_idx_time = (
256
- n_pr * step_size + start_idx
257
- ) # Calculate end index for time, different from end_idx since it is always a multiple of step_size, while end_idx is not
272
+ # Calculate end index for time, different from end_idx since it is
273
+ # always a multiple of step_size, while end_idx is not
274
+ end_idx_time = n_pr * step_size + start_idx
258
275
 
259
276
  # Extract relative time for PR estimates
260
277
  pr_time = ppg_preprocessed[start_idx:end_idx_time:step_size, time_idx]
@@ -270,7 +287,7 @@ def estimate_pulse_rate(
270
287
 
271
288
 
272
289
  def aggregate_pulse_rate(
273
- pr_values: np.ndarray, aggregates: List[str] = ["mode", "99p"]
290
+ pr_values: np.ndarray, aggregates: list[str] = ["mode", "99p"]
274
291
  ) -> dict:
275
292
  """
276
293
  Aggregate the pulse rate estimates using the specified aggregation methods.
@@ -280,7 +297,8 @@ def aggregate_pulse_rate(
280
297
  pr_values : np.ndarray
281
298
  The array containing the pulse rate estimates
282
299
  aggregates : List[str]
283
- The list of aggregation methods to be used for the pulse rate estimates. The default is ['mode', '99p'].
300
+ The list of aggregation methods to be used for the pulse rate
301
+ estimates. The default is ['mode', '99p'].
284
302
 
285
303
  Returns
286
304
  -------
@@ -306,10 +324,12 @@ def aggregate_pulse_rate(
306
324
  def extract_temporal_domain_features(
307
325
  ppg_windowed: np.ndarray,
308
326
  config: PulseRateConfig,
309
- quality_stats: List[str] = ["mean", "std"],
327
+ quality_stats: list[str] = ["mean", "std"],
310
328
  ) -> pd.DataFrame:
311
329
  """
312
- Compute temporal domain features for the ppg signal. The features are added to the dataframe. Therefore the original dataframe is modified, and the modified dataframe is returned.
330
+ Compute temporal domain features for the ppg signal. The features are
331
+ added to the dataframe. Therefore the original dataframe is modified,
332
+ and the modified dataframe is returned.
313
333
 
314
334
  Parameters
315
335
  ----------
@@ -320,7 +340,8 @@ def extract_temporal_domain_features(
320
340
  The configuration object containing the parameters for the feature extraction
321
341
 
322
342
  quality_stats: list, optional
323
- The statistics to be computed for the gravity component of the accelerometer signal (default: ['mean', 'std'])
343
+ The statistics to be computed for the gravity component of the
344
+ accelerometer signal (default: ['mean', 'std'])
324
345
 
325
346
  Returns
326
347
  -------
@@ -344,9 +365,11 @@ def extract_spectral_domain_features(
344
365
  config: PulseRateConfig,
345
366
  ) -> pd.DataFrame:
346
367
  """
347
- Calculate the spectral features (dominant frequency, relative power, and spectral entropy)
348
- for each segment of a PPG signal using a single Welch's method computation. The features are added to the dataframe.
349
- Therefore the original dataframe is modified, and the modified dataframe is returned.
368
+ Calculate the spectral features (dominant frequency, relative power, and
369
+ spectral entropy) for each segment of a PPG signal using a single
370
+ Welch's method computation. The features are added to the dataframe.
371
+ Therefore the original dataframe is modified, and the modified dataframe
372
+ is returned.
350
373
 
351
374
  Parameters
352
375
  ----------
@@ -386,7 +409,10 @@ def extract_spectral_domain_features(
386
409
 
387
410
 
388
411
  def extract_acc_power_feature(
389
- f1: np.ndarray, PSD_acc: np.ndarray, f2: np.ndarray, PSD_ppg: np.ndarray
412
+ f1: np.ndarray,
413
+ psd_acc: np.ndarray,
414
+ f2: np.ndarray,
415
+ psd_ppg: np.ndarray,
390
416
  ) -> np.ndarray:
391
417
  """
392
418
  Extract the accelerometer power feature in the PPG frequency range.
@@ -395,11 +421,11 @@ def extract_acc_power_feature(
395
421
  ----------
396
422
  f1: np.ndarray
397
423
  The frequency bins of the accelerometer signal.
398
- PSD_acc: np.ndarray
424
+ psd_acc: np.ndarray
399
425
  The power spectral density of the accelerometer signal.
400
426
  f2: np.ndarray
401
427
  The frequency bins of the PPG signal.
402
- PSD_ppg: np.ndarray
428
+ psd_ppg: np.ndarray
403
429
  The power spectral density of the PPG signal.
404
430
 
405
431
  Returns
@@ -409,32 +435,33 @@ def extract_acc_power_feature(
409
435
  """
410
436
 
411
437
  # Find the index of the maximum PSD value in the PPG signal
412
- max_PPG_psd_idx = np.argmax(PSD_ppg, axis=1)
413
- max_PPG_freq_psd = f2[max_PPG_psd_idx]
438
+ max_ppg_psd_idx = np.argmax(psd_ppg, axis=1)
439
+ max_ppg_freq_psd = f2[max_ppg_psd_idx]
414
440
 
415
441
  # Find the neighboring indices of the maximum PSD value in the PPG signal
416
442
  df_idx = np.column_stack(
417
- (max_PPG_psd_idx - 1, max_PPG_psd_idx, max_PPG_psd_idx + 1)
443
+ (max_ppg_psd_idx - 1, max_ppg_psd_idx, max_ppg_psd_idx + 1)
418
444
  )
419
445
 
420
- # Find the index of the closest frequency in the accelerometer signal to the first harmonic of the PPG frequency
421
- corr_acc_psd_fh_idx = np.argmin(np.abs(f1[:, None] - max_PPG_freq_psd * 2), axis=0)
446
+ # Find the index of the closest frequency in the accelerometer signal
447
+ # to the first harmonic of the PPG frequency
448
+ corr_acc_psd_fh_idx = np.argmin(np.abs(f1[:, None] - max_ppg_freq_psd * 2), axis=0)
422
449
  fh_idx = np.column_stack(
423
450
  (corr_acc_psd_fh_idx - 1, corr_acc_psd_fh_idx, corr_acc_psd_fh_idx + 1)
424
451
  )
425
452
 
426
453
  # Compute the power in the ranges corresponding to the PPG frequency
427
- acc_power_PPG_range = np.trapz(
428
- PSD_acc[np.arange(PSD_acc.shape[0])[:, None], df_idx], f1[df_idx], axis=1
429
- ) + np.trapz(
430
- PSD_acc[np.arange(PSD_acc.shape[0])[:, None], fh_idx], f1[fh_idx], axis=1
454
+ acc_power_ppg_range = np.trapezoid(
455
+ psd_acc[np.arange(psd_acc.shape[0])[:, None], df_idx], f1[df_idx], axis=1
456
+ ) + np.trapezoid(
457
+ psd_acc[np.arange(psd_acc.shape[0])[:, None], fh_idx], f1[fh_idx], axis=1
431
458
  )
432
459
 
433
460
  # Compute the total power across the entire frequency range
434
- acc_power_total = np.trapz(PSD_acc, f1)
461
+ acc_power_total = np.trapezoid(psd_acc, f1)
435
462
 
436
463
  # Compute the power ratio of the accelerometer signal in the PPG frequency range
437
- acc_power_ratio = acc_power_PPG_range / acc_power_total
464
+ acc_power_ratio = acc_power_ppg_range / acc_power_total
438
465
 
439
466
  return acc_power_ratio
440
467
 
@@ -443,7 +470,8 @@ def extract_accelerometer_feature(
443
470
  acc_windowed: np.ndarray, ppg_windowed: np.ndarray, config: PulseRateConfig
444
471
  ) -> pd.DataFrame:
445
472
  """
446
- Extract accelerometer features from the accelerometer signal in the PPG frequency range.
473
+ Extract accelerometer features from the accelerometer signal in the PPG
474
+ frequency range.
447
475
 
448
476
  Parameters
449
477
  ----------
@@ -493,3 +521,207 @@ def extract_accelerometer_feature(
493
521
  )
494
522
 
495
523
  return pd.DataFrame(acc_power_ratio, columns=["acc_power_ratio"])
524
+
525
+
526
def run_pulse_rate_pipeline(
    df_ppg_prepared: pd.DataFrame,
    output_dir: str | Path,
    store_intermediate: list[str] | None = None,
    pulse_rate_config: PulseRateConfig | None = None,
    ppg_config: PPGConfig | None = None,
    logging_level: int = logging.INFO,
    custom_logger: logging.Logger | None = None,
) -> pd.DataFrame:
    """
    High-level pulse rate analysis pipeline for a single segment.

    The pipeline runs the following steps in order and stops at the first
    one that fails:

    1. Preprocess the PPG data (only the time and PPG columns of the input
       are passed on to preprocessing)
    2. Extract signal quality features
    3. Signal quality classification
    4. Pulse rate estimation
    5. Quantify pulse rate (select relevant columns, mask low-quality
       estimates when a ``signal_quality`` column is present)

    Parameters
    ----------
    df_ppg_prepared : pd.DataFrame
        Prepared sensor data. Must contain the ``DataColumns.TIME`` and
        ``DataColumns.PPG`` columns.
    output_dir : str or Path
        Output directory for intermediate results. It is created (including
        parents) even when nothing is stored.
    store_intermediate : list of str, optional
        Which intermediate results to store. Recognised entries are
        ``"preprocessing"`` (writes
        ``preprocessing/ppg_preprocessed.parquet``), ``"pulse_rate"``
        (writes ``pulse_rate/signal_quality_features.parquet``) and
        ``"quantification"`` (writes
        ``quantification/pulse_rate_quantification.parquet`` plus a
        ``pulse_rate_quantification_meta.json`` summary). ``None`` (the
        default) stores nothing.
    pulse_rate_config : PulseRateConfig, optional
        Pulse rate analysis configuration. A default ``PulseRateConfig()``
        is created when omitted.
    ppg_config : PPGConfig, optional
        PPG preprocessing configuration. A default ``PPGConfig()`` is
        created when omitted.
    logging_level : int, default logging.INFO
        Logging level using standard logging constants. It is applied to
        this module's logger only when no ``custom_logger`` is given, and it
        also switches the verbosity passed to ``preprocess_ppg_data``.
    custom_logger : logging.Logger, optional
        Custom logger instance. Its level is left untouched.

    Returns
    -------
    pd.DataFrame
        Quantified pulse rate data restricted to whichever of these columns
        the estimation step produced:

        - ``DataColumns.TIME``
        - ``DataColumns.PULSE_RATE``
        - ``signal_quality``

        An empty DataFrame is returned when required input columns are
        missing, when any of steps 1-4 raises, or when none of the columns
        above is available. Errors raised while writing the quantification
        results in step 5 are not caught and propagate to the caller.
    """
    # Setup logger. The module logger's level is only changed when the caller
    # did not supply their own logger.
    if custom_logger is not None:
        active_logger = custom_logger
    else:
        active_logger = logging.getLogger(__name__)
        active_logger.setLevel(logging_level)

    # Resolve optional arguments. ``None`` replaces the former mutable ``[]``
    # default so no list object is shared between calls.
    if store_intermediate is None:
        store_intermediate = []
    if pulse_rate_config is None:
        pulse_rate_config = PulseRateConfig()
    if ppg_config is None:
        ppg_config = PPGConfig()

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Validate input data columns (time and PPG are required)
    required_columns = [DataColumns.TIME, DataColumns.PPG]
    missing_columns = [
        col for col in required_columns if col not in df_ppg_prepared.columns
    ]
    if missing_columns:
        active_logger.warning(
            f"Missing required columns for pulse rate pipeline: {missing_columns}"
        )
        return pd.DataFrame()

    # Step 1: Preprocess PPG and accelerometer data (following tutorial)
    active_logger.info("Step 1: Preprocessing PPG and accelerometer data")
    try:
        # Work on a copy of just the required columns so the caller's frame
        # is never modified by preprocessing.
        df_ppg = df_ppg_prepared[required_columns].copy()

        df_ppg_proc, _ = preprocess_ppg_data(
            df_ppg=df_ppg,
            ppg_config=ppg_config,
            verbose=1 if logging_level <= logging.INFO else 0,
        )

        if "preprocessing" in store_intermediate:
            preprocessing_dir = output_dir / "preprocessing"
            preprocessing_dir.mkdir(exist_ok=True)
            df_ppg_proc.to_parquet(preprocessing_dir / "ppg_preprocessed.parquet")
            active_logger.info(f"Saved preprocessed data to {preprocessing_dir}")

    except Exception as e:
        # Best-effort pipeline: report the failure (with traceback, so the
        # root cause is not lost) and hand back an empty result.
        active_logger.exception(f"Preprocessing failed: {e}")
        return pd.DataFrame()

    # Step 2: Extract signal quality features
    active_logger.info("Step 2: Extracting signal quality features")
    try:
        df_features = extract_signal_quality_features(df_ppg_proc, pulse_rate_config)

        if "pulse_rate" in store_intermediate:
            pulse_rate_dir = output_dir / "pulse_rate"
            pulse_rate_dir.mkdir(exist_ok=True)
            df_features.to_parquet(pulse_rate_dir / "signal_quality_features.parquet")
            active_logger.info(f"Saved signal quality features to {pulse_rate_dir}")

    except Exception as e:
        active_logger.exception(f"Feature extraction failed: {e}")
        return pd.DataFrame()

    # Step 3: Signal quality classification
    active_logger.info("Step 3: Signal quality classification")
    try:
        # The classifier is loaded from the package's own bundled assets.
        classifier_path = files("paradigma.assets") / "ppg_quality_clf_package.pkl"
        classifier_package = ClassifierPackage.load(classifier_path)

        df_classified = signal_quality_classification(
            df_features, pulse_rate_config, classifier_package
        )

    except Exception as e:
        active_logger.exception(f"Signal quality classification failed: {e}")
        return pd.DataFrame()

    # Step 4: Pulse rate estimation
    active_logger.info("Step 4: Pulse rate estimation")
    try:
        df_pulse_rates = estimate_pulse_rate(
            df_sqa=df_classified,
            df_ppg_preprocessed=df_ppg_proc,
            config=pulse_rate_config,
        )

    except Exception as e:
        active_logger.exception(f"Pulse rate estimation failed: {e}")
        return pd.DataFrame()

    # Step 5: Quantify pulse rate (select relevant columns and apply quality filtering)
    active_logger.info("Step 5: Quantifying pulse rate")

    # Keep the quantification columns that the estimation step produced,
    # in a fixed order.
    candidate_columns = (DataColumns.TIME, DataColumns.PULSE_RATE, "signal_quality")
    available_columns = [
        col for col in candidate_columns if col in df_pulse_rates.columns
    ]
    if not available_columns:
        active_logger.warning("No valid quantification columns found")
        return pd.DataFrame()

    df_quantification = df_pulse_rates[available_columns].copy()
    has_pulse_rate = DataColumns.PULSE_RATE in df_quantification.columns

    # Apply quality filtering if signal quality is available: estimates from
    # windows below the threshold are blanked rather than dropped, so the
    # number of rows (windows) is preserved.
    if has_pulse_rate and "signal_quality" in df_quantification.columns:
        quality_threshold = getattr(pulse_rate_config, "threshold_sqa", 0.5)
        low_quality_mask = df_quantification["signal_quality"] < quality_threshold
        df_quantification.loc[low_quality_mask, DataColumns.PULSE_RATE] = np.nan

    # Number of non-missing pulse rate estimates; computed once and reused
    # for both the metadata file and the summary log line. Cast to a plain
    # int so it is JSON serialisable.
    n_valid_estimates = (
        int(df_quantification[DataColumns.PULSE_RATE].count())
        if has_pulse_rate
        else 0
    )

    if "quantification" in store_intermediate:
        quantification_dir = output_dir / "quantification"
        quantification_dir.mkdir(exist_ok=True)
        df_quantification.to_parquet(
            quantification_dir / "pulse_rate_quantification.parquet"
        )

        # Save quantification metadata
        quantification_meta = {
            "total_windows": len(df_quantification),
            "valid_pulse_rate_estimates": n_valid_estimates,
            "columns": list(df_quantification.columns),
        }
        meta_path = quantification_dir / "pulse_rate_quantification_meta.json"
        # Explicit encoding keeps the file identical across platforms.
        with open(meta_path, "w", encoding="utf-8") as f:
            json.dump(quantification_meta, f, indent=2)

        active_logger.info(f"Saved pulse rate quantification to {quantification_dir}")

    active_logger.info(
        f"Pulse rate analysis completed: {n_valid_estimates} valid pulse "
        f"rate estimates from {len(df_quantification)} total windows"
    )

    return df_quantification