paradigma-1.0.3-py3-none-any.whl → paradigma-1.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,130 +1,339 @@
-import json
+from datetime import datetime
+
 import numpy as np
 import pandas as pd
-import tsdf
-from pathlib import Path
 from scipy import signal
 from scipy.interpolate import interp1d
-from typing import List, Tuple, Union
-from datetime import datetime

-from paradigma.constants import TimeUnit, DataColumns
-from paradigma.config import PPGConfig, IMUConfig
-from paradigma.util import write_df_data, read_metadata, invert_watch_side
+from paradigma.config import IMUConfig, PPGConfig
+from paradigma.segmenting import create_segments, discard_segments
+from paradigma.util import invert_watch_side


 def resample_data(
     df: pd.DataFrame,
-    time_column : str,
-    values_column_names: List[str],
-    sampling_frequency: int,
-    resampling_frequency: int,
-    tolerance: float | None = None
+    time_column: str = "time",
+    values_column_names: list[str] | None = None,
+    sampling_frequency: int | None = None,
+    resampling_frequency: int | None = None,
+    tolerance: float | None = None,
+    validate_contiguous: bool = True,
+    auto_segment: bool = False,
+    max_segment_gap_s: float | None = None,
+    min_segment_length_s: float | None = None,
+    verbose: int = 2,
 ) -> pd.DataFrame:
     """
-    Resamples sensor data to a specified frequency using cubic interpolation.
+    Unified resampling function with optional auto-segmentation for non-contiguous data.
+
+    This function supports:
+    - Automatic frequency detection or explicit specification
+    - Contiguity validation with configurable tolerance
+    - Automatic segmentation of non-contiguous data
+    - Preservation of non-numeric columns

     Parameters
     ----------
     df : pd.DataFrame
         The input DataFrame containing the sensor data.
-    time_column : str
+    time_column : str, default 'time'
         The name of the column containing the time data.
-    values_column_names : List[str]
-        A list of column names that should be resampled.
-    sampling_frequency : int
-        The original sampling frequency of the data (in Hz).
-    resampling_frequency : int
-        The frequency to which the data should be resampled (in Hz).
+    values_column_names : List[str], optional
+        Column names to resample. If None, auto-detects all numeric columns except time.
+    sampling_frequency : int, optional
+        Original sampling frequency (Hz). If None, auto-detected from data.
+    resampling_frequency : int, optional
+        Target sampling frequency in Hz.
     tolerance : float, optional
-        The tolerance added to the expected difference when checking
-        for contiguous timestamps. If not provided, it defaults to
-        three times the expected interval.
+        Tolerance for contiguity checking (seconds). Defaults to IMUConfig tolerance.
+    validate_contiguous : bool, default True
+        Whether to validate data contiguity. If False, gaps are silently interpolated.
+    auto_segment : bool, default False
+        If True, automatically split non-contiguous data into segments and
+        process each. Adds 'data_segment_nr' column to output. If False and
+        data is non-contiguous with validate_contiguous=True, raises
+        ValueError.
+    max_segment_gap_s : float, optional
+        Maximum gap (seconds) before starting new segment. Used when auto_segment=True.
+        Defaults to IMUConfig.max_segment_gap_s (1.5s).
+    min_segment_length_s : float, optional
+        Minimum segment length (seconds) to keep. Used when auto_segment=True.
+        Defaults to IMUConfig.min_segment_length_s (1.5s).
+    verbose : int, default 2
+        Logging verbosity: 0=errors only, 1=basic info, 2+=detailed info.
+        Note: This function still uses verbose for backward compatibility
+        with existing code that calls it directly.

     Returns
     -------
     pd.DataFrame
-        A DataFrame with the resampled data, where each column contains resampled values.
-        The time column will reflect the new resampling frequency.
+        Resampled DataFrame. If auto_segment=True and multiple segments found,
+        includes 'data_segment_nr' column identifying each contiguous data segment.

     Raises
     ------
     ValueError
-        If the time array is not strictly increasing.
+        - If time array is not strictly increasing
+        - If time array is not contiguous and validate_contiguous=True
+          and auto_segment=False
+        - If no numeric columns found for resampling
+        - If all segments are discarded due to min_segment_length_s

     Notes
     -----
-    - Uses cubic interpolation for smooth resampling if there are enough points.
-    - If only two timestamps are available, it falls back to linear interpolation.
-    """
-    # Set default tolerance if not provided to three times the expected interval
-    if tolerance is None:
-        tolerance = 3 * 1 / sampling_frequency
+    - Uses cubic interpolation for smooth resampling if there are enough points
+    - Falls back to linear interpolation if only 2-3 points available
+    - Non-numeric columns are preserved (first value copied to all rows)
+    - Backwards compatible with both old resample_data signatures
+
+    Examples
+    --------
+    # Auto-detection mode
+    df_resampled = resample_data(df, resampling_frequency=100)
+
+    # Explicit mode
+    df_resampled = resample_data(
+        df, time_column='time', values_column_names=['acc_x', 'acc_y'],
+        sampling_frequency=128, resampling_frequency=100
+    )

-    # Extract time and values
+    # Auto-segmentation mode
+    df_segmented = resample_data(
+        df, resampling_frequency=100, auto_segment=True,
+        max_segment_gap_s=2.0, min_segment_length_s=3.0
+    )
+    """
+    df = df.copy()
+
+    if time_column not in df.columns:
+        raise ValueError(f"Time column '{time_column}' not found in DataFrame")
+
+    # Validate resampling frequency
+    if resampling_frequency is None:
+        raise ValueError("resampling_frequency must be provided")
+
+    resampling_frequency = float(resampling_frequency)
+
+    # Auto-detect or use provided column names
+    if values_column_names is None:
+        numeric_columns = df.select_dtypes(include=[np.number]).columns
+        values_column_names = [
+            col
+            for col in numeric_columns
+            if col != time_column and col != "data_segment_nr"
+        ]
+        if not values_column_names:
+            raise ValueError("No numeric columns found for resampling")
+        if verbose >= 2:
+            print(f"Auto-detected {len(values_column_names)} columns for resampling")
+
+    # Auto-detect or use provided sampling frequency
     time_abs_array = np.array(df[time_column])
-    values_array = np.array(df[values_column_names])
+    if sampling_frequency is None:
+        time_diff = df[time_column].diff().dropna()
+        current_dt = time_diff.median()
+        sampling_frequency = 1.0 / current_dt
+        if verbose >= 2:
+            print(f"Auto-detected sampling frequency: {sampling_frequency:.2f} Hz")
+    else:
+        sampling_frequency = float(sampling_frequency)

-    # Ensure the time array is strictly increasing
+    # Ensure time array is strictly increasing
     if not np.all(np.diff(time_abs_array) > 0):
         raise ValueError("Time array is not strictly increasing")
-
-    # Ensure the time array is contiguous
+
+    # Set default tolerance if not provided
+    if tolerance is None:
+        tolerance = IMUConfig().tolerance
+
+    # Set default segmentation parameters
+    if auto_segment:
+        if max_segment_gap_s is None:
+            max_segment_gap_s = 1.5  # IMUConfig default
+        if min_segment_length_s is None:
+            min_segment_length_s = 1.5  # IMUConfig default
+
+    # Check contiguity
     expected_interval = 1 / sampling_frequency
     timestamp_diffs = np.diff(time_abs_array)
-    if np.any(np.abs(timestamp_diffs - expected_interval) > tolerance):
-        raise ValueError("Time array is not contiguous")
+    is_contiguous = not np.any(np.abs(timestamp_diffs - expected_interval) > tolerance)
+
+    if not is_contiguous:
+        if validate_contiguous and not auto_segment:
+            raise ValueError(
+                "Time array is not contiguous. Consider enabling automatic "
+                "segmentation to split and process non-contiguous segments, or "
+                "disable contiguity validation to interpolate over gaps."
+            )
+        elif auto_segment:
+            # Split into segments
+            if verbose >= 1:
+                print("Non-contiguous data detected. Auto-segmenting...")
+
+            # Create segments based on gaps
+            segment_array = create_segments(
+                time_array=time_abs_array,
+                max_segment_gap_s=max_segment_gap_s,
+            )
+            df["data_segment_nr"] = segment_array
+
+            # Discard segments that are too short
+            df = discard_segments(
+                df=df,
+                segment_nr_colname="data_segment_nr",
+                min_segment_length_s=min_segment_length_s,
+                fs=int(sampling_frequency),
+                format="timestamps",
+            )
+
+            n_segments = df["data_segment_nr"].nunique()
+            if verbose >= 1:
+                segment_durations = []
+                for seg_nr in df["data_segment_nr"].unique():
+                    seg_df = df[df["data_segment_nr"] == seg_nr]
+                    duration = (
+                        seg_df[time_column].iloc[-1] - seg_df[time_column].iloc[0]
+                    )
+                    segment_durations.append(f"{duration:.1f}s")
+                print(f"Created {n_segments} segments: {', '.join(segment_durations)}")
+
+            # Resample each segment independently
+            resampled_segments = []
+            for seg_nr in df["data_segment_nr"].unique():
+                seg_df = df[df["data_segment_nr"] == seg_nr].copy()
+                seg_time = np.array(seg_df[time_column])
+                seg_values = np.array(seg_df[values_column_names])
+
+                # Resample this segment
+                duration = seg_time[-1] - seg_time[0]
+                n_samples = int(np.round(duration * resampling_frequency)) + 1
+                t_resampled = np.linspace(seg_time[0], seg_time[-1], n_samples)
+
+                interpolation_kind = "cubic" if len(seg_time) > 3 else "linear"
+                interpolator = interp1d(
+                    seg_time,
+                    seg_values,
+                    axis=0,
+                    kind=interpolation_kind,
+                    fill_value="extrapolate",
+                )
+                resampled_values = interpolator(t_resampled)
+
+                # Create resampled segment DataFrame
+                df_seg_resampled = pd.DataFrame(
+                    resampled_values, columns=values_column_names
+                )
+                df_seg_resampled[time_column] = t_resampled
+                df_seg_resampled["data_segment_nr"] = seg_nr
+
+                # Copy non-numeric columns from first row of segment
+                for column in seg_df.columns:
+                    if (
+                        column not in df_seg_resampled.columns
+                        and column != "data_segment_nr"
+                    ):
+                        df_seg_resampled[column] = seg_df[column].iloc[0]
+
+                resampled_segments.append(df_seg_resampled)
+
+            # Concatenate all segments
+            df_resampled = pd.concat(resampled_segments, ignore_index=True)
+
+            # Ensure correct column order
+            resampled_columns = (
+                [time_column] + values_column_names + ["data_segment_nr"]
+            )
+            other_cols = [
+                col for col in df_resampled.columns if col not in resampled_columns
+            ]
+            df_resampled = df_resampled[resampled_columns + other_cols]
+
+            if verbose >= 1:
+                print(
+                    f"Resampled: {len(df)} -> {len(df_resampled)} rows at "
+                    f"{resampling_frequency} Hz"
+                )
+
+            return df_resampled
+
+        elif verbose >= 2:
+            print(
+                "Warning: Data is not contiguous but validation is disabled. "
+                "Interpolating over gaps."
+            )
+
+    # Standard resampling for contiguous data (or when validation is disabled)
+    values_array = np.array(df[values_column_names])
+
+    # Resample the time data
+    t_resampled = np.arange(
+        time_abs_array[0], time_abs_array[-1], 1 / resampling_frequency
+    )

-    # Resample the time data using the specified frequency
-    t_resampled = np.arange(time_abs_array[0], time_abs_array[-1], 1 / resampling_frequency)
-
     # Choose interpolation method
     interpolation_kind = "cubic" if len(time_abs_array) > 3 else "linear"
-    interpolator = interp1d(time_abs_array, values_array, axis=0, kind=interpolation_kind, fill_value="extrapolate")
-
+    interpolator = interp1d(
+        time_abs_array,
+        values_array,
+        axis=0,
+        kind=interpolation_kind,
+        fill_value="extrapolate",
+    )
+
     # Interpolate
     resampled_values = interpolator(t_resampled)

-    # Create a DataFrame with the resampled data
+    # Create resampled DataFrame
     df_resampled = pd.DataFrame(resampled_values, columns=values_column_names)
     df_resampled[time_column] = t_resampled

-    # Return the DataFrame with columns in the correct order
-    return df_resampled[[time_column] + values_column_names]
+    # Return with correct column order
+    resampled_columns = [time_column] + values_column_names
+    df_resampled = df_resampled[resampled_columns]
+
+    if verbose >= 1:
+        print(
+            f"Resampled: {len(df)} -> {len(df_resampled)} rows at "
+            f"{resampling_frequency} Hz"
+        )
+
+    return df_resampled


 def butterworth_filter(
     data: np.ndarray,
     order: int,
-    cutoff_frequency: Union[float, List[float]],
+    cutoff_frequency: float | list[float],
     passband: str,
     sampling_frequency: int,
 ):
     """
     Applies a Butterworth filter to 1D or 2D sensor data.

-    This function applies a low-pass, high-pass, or band-pass Butterworth filter to the
-    input data. The filter is designed using the specified order, cutoff frequency,
+    This function applies a low-pass, high-pass, or band-pass Butterworth filter to the
+    input data. The filter is designed using the specified order, cutoff frequency,
     and passband type. The function can handle both 1D and 2D data arrays.

     Parameters
     ----------
     data : np.ndarray
-        The sensor data to be filtered. Can be 1D (e.g., a single signal) or 2D
+        The sensor data to be filtered. Can be 1D (e.g., a single signal) or 2D
         (e.g., multi-axis sensor data).
     order : int
         The order of the Butterworth filter. Higher values result in a steeper roll-off.
-    cutoff_frequency : float or List[float]
-        The cutoff frequency (or frequencies) for the filter. For a low-pass or high-pass filter,
-        this is a single float. For a band-pass filter, this should be a list of two floats,
-        specifying the lower and upper cutoff frequencies.
+    cutoff_frequency : float or list of float
+        The cutoff frequency (or frequencies) for the filter. For a low-pass
+        or high-pass filter, this is a single float. For a band-pass filter,
+        this should be a list of two floats, specifying the lower and upper
+        cutoff frequencies.
     passband : str
         The type of passband to apply. Options are:
         - 'hp' : high-pass filter
         - 'lp' : low-pass filter
         - 'band' : band-pass filter
     sampling_frequency : int
-        The sampling frequency of the data in Hz. This is used to normalize the cutoff frequency.
+        The sampling frequency of the data in Hz. This is used to normalize
+        the cutoff frequency.

     Returns
     -------
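A minimal sketch of the new auto-segmentation path for readers upgrading from 1.0.3, assuming the IMUConfig defaults quoted in the docstring above (1.5 s gap threshold and minimum segment length); the module path and column names are illustrative, not confirmed by this diff:

import numpy as np
import pandas as pd
from paradigma.preprocessing import resample_data  # module path assumed

# Two contiguous 100 Hz bursts separated by a 5 s gap
t = np.concatenate([np.arange(0, 10, 0.01), np.arange(15, 25, 0.01)])
df = pd.DataFrame({"time": t, "acc_x": np.sin(t)})  # column names illustrative

# In 1.0.3 a gap raised ValueError("Time array is not contiguous").
# In 1.1.0, auto_segment=True splits at the gap, resamples each segment
# independently, and labels rows with a 'data_segment_nr' column.
df_res = resample_data(df, resampling_frequency=50, auto_segment=True)
print(sorted(df_res["data_segment_nr"].unique()))  # two segment labels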
@@ -134,12 +343,14 @@ def butterworth_filter(
     Raises
     ------
     ValueError
-        If the input data has more than two dimensions, or if an invalid passband is specified.
+        If the input data has more than two dimensions, or if an invalid
+        passband is specified.

     Notes
     -----
-    The function uses `scipy.signal.butter` to design the filter and `scipy.signal.sosfiltfilt`
-    to apply it using second-order sections (SOS) to improve numerical stability.
+    The function uses `scipy.signal.butter` to design the filter and
+    `scipy.signal.sosfiltfilt` to apply it using second-order sections (SOS)
+    to improve numerical stability.
     """
     # Design the filter using second-order sections (SOS)
     sos = signal.butter(
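The butterworth_filter contract itself is unchanged in 1.1.0 (only type hints and docstring wrapping moved); a short usage sketch against synthetic data, with illustrative cutoffs, mirroring the hp/lp pair that preprocess_imu_data applies further down:

import numpy as np

fs = 100  # Hz, illustrative
t = np.arange(0, 5, 1 / fs)
# 1 Hz motion component plus 20 Hz noise, replicated on three axes
data = np.column_stack(
    [np.sin(2 * np.pi * t) + 0.1 * np.sin(2 * np.pi * 20 * t)] * 3
)

# Low-pass keeps the slow (gravity-like) component; high-pass with the
# same cutoff keeps the faster dynamics.
grav = butterworth_filter(data, order=4, cutoff_frequency=5,
                          passband="lp", sampling_frequency=fs)
dynamic = butterworth_filter(data, order=4, cutoff_frequency=5,
                             passband="hp", sampling_frequency=fs)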
@@ -159,7 +370,14 @@
     else:
         raise ValueError("Data must be either 1D or 2D.")

-def preprocess_imu_data(df: pd.DataFrame, config: IMUConfig, sensor: str, watch_side: str) -> pd.DataFrame:
+
+def preprocess_imu_data(
+    df: pd.DataFrame,
+    config: IMUConfig,
+    sensor: str,
+    watch_side: str,
+    verbose: int = 1,
+) -> pd.DataFrame:
     """
     Preprocesses IMU data by resampling and applying filters.

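A hedged call sketch for the new preprocess_imu_data signature; the module path, default-constructed IMUConfig, and df_imu input are assumptions for illustration:

from paradigma.config import IMUConfig
from paradigma.preprocessing import preprocess_imu_data  # module path assumed

config = IMUConfig()  # defaults assumed, not shown in this diff
df_pre = preprocess_imu_data(
    df=df_imu,           # placeholder: raw IMU DataFrame with a time column
    config=config,
    sensor="both",       # or "accelerometer" / "gyroscope"
    watch_side="right",  # right-wrist data is axis-inverted via invert_watch_side
    verbose=0,           # new in 1.1.0: suppress resampling prints
)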
@@ -168,8 +386,9 @@ def preprocess_imu_data(df: pd.DataFrame, config: IMUConfig, sensor: str, watch_
     df : pd.DataFrame
         The DataFrame containing raw accelerometer and/or gyroscope data.
     config : IMUConfig
-        Configuration object containing various settings, such as time column name, accelerometer and/or gyroscope columns,
-        filter settings, and sampling frequency.
+        Configuration object containing various settings, such as time column
+        name, accelerometer and/or gyroscope columns, filter settings, and
+        sampling frequency.
     sensor: str
         Name of the sensor data to be preprocessed. Must be one of:
         - "accelerometer": Preprocess accelerometer data only.
@@ -179,169 +398,281 @@ def preprocess_imu_data(df: pd.DataFrame, config: IMUConfig, sensor: str, watch_
         The side of the watch where the data was collected. Must be one of:
         - "left": Data was collected from the left wrist.
         - "right": Data was collected from the right wrist.
+    verbose : int, default 1
+        Logging verbosity level: 0=errors only, 1=basic info, 2+=detailed info.

     Returns
     -------
     pd.DataFrame
-        The preprocessed accelerometer and or gyroscope data with the following transformations:
+        The preprocessed accelerometer and/or gyroscope data with the
+        following transformations:
         - Resampled data at the specified frequency.
-        - Filtered accelerometer data with high-pass and low-pass filtering applied.
-
+        - Filtered accelerometer data with high-pass and low-pass filtering
+          applied.
+
     Notes
     -----
-    - The function applies Butterworth filters to accelerometer data, both high-pass and low-pass.
+    - The function applies Butterworth filters to accelerometer data, both
+      high-pass and low-pass.
     """
+    # Make a copy to avoid SettingWithCopyWarning
+    df = df.copy()

     # Extract sensor column
-    if sensor == 'accelerometer':
-        values_colnames = config.accelerometer_cols
-    elif sensor == 'gyroscope':
-        values_colnames = config.gyroscope_cols
-    elif sensor == 'both':
-        values_colnames = config.accelerometer_cols + config.gyroscope_cols
+    if sensor == "accelerometer":
+        values_colnames = config.accelerometer_colnames
+    elif sensor == "gyroscope":
+        values_colnames = config.gyroscope_colnames
+    elif sensor == "both":
+        values_colnames = config.accelerometer_colnames + config.gyroscope_colnames
     else:
-        raise('Sensor should be either accelerometer, gyroscope, or both')
-
-    # Resample the data to the specified frequency
-    df = resample_data(
-        df=df,
-        time_column=DataColumns.TIME,
-        values_column_names=values_colnames,
-        sampling_frequency=config.sampling_frequency,
-        resampling_frequency=config.resampling_frequency
-    )
+        raise ValueError("Sensor should be either accelerometer, gyroscope, or both")
+
+    # Check if data needs resampling
+    # Skip resampling if already at target frequency or if data has been pre-segmented
+    needs_resampling = True
+    validate_contiguous = True
+
+    if "data_segment_nr" in df.columns:
+        # Data has been pre-segmented, skip contiguity validation
+        validate_contiguous = False
+
+    # Check current sampling frequency
+    time_diff = df[config.time_colname].diff().dropna()
+    current_dt = time_diff.median()
+    current_frequency = 1.0 / current_dt
+
+    if abs(current_frequency - config.resampling_frequency) < 0.1:
+        needs_resampling = False
+
+    if needs_resampling:
+        # Resample the data to the specified frequency
+        df = resample_data(
+            df=df,
+            time_column=config.time_colname,
+            values_column_names=values_colnames,
+            sampling_frequency=config.sampling_frequency,
+            resampling_frequency=config.resampling_frequency,
+            tolerance=config.tolerance,
+            validate_contiguous=validate_contiguous,
+            verbose=verbose,
+        )

     # Invert the IMU data if the watch was worn on the right wrist
     df = invert_watch_side(df, watch_side, sensor)
-
-    if sensor in ['accelerometer', 'both']:
-
+
+    if sensor in ["accelerometer", "both"]:
+
         # Extract accelerometer data for filtering
-        accel_data = df[config.accelerometer_cols].values
+        accel_data = df[config.accelerometer_colnames].values

         # Define filter configurations for high-pass and low-pass
         filter_renaming_configs = {
-            "hp": {"result_columns": config.accelerometer_cols, "replace_original": True},
-            "lp": {"result_columns": [f'{col}_grav' for col in config.accelerometer_cols], "replace_original": False},
+            "hp": {
+                "result_columns": config.accelerometer_colnames,
+                "replace_original": True,
+            },
+            "lp": {
+                "result_columns": [
+                    f"{col}_grav" for col in config.accelerometer_colnames
+                ],
+                "replace_original": False,
+            },
         }

         # Apply filters in a loop
         for passband, filter_config in filter_renaming_configs.items():
             filtered_data = butterworth_filter(
-                data=accel_data,
-                order=config.filter_order,
-                cutoff_frequency=config.lower_cutoff_frequency,
-                passband=passband,
-                sampling_frequency=config.sampling_frequency,
+                data=accel_data,
+                order=config.filter_order,
+                cutoff_frequency=config.lower_cutoff_frequency,
+                passband=passband,
+                sampling_frequency=config.sampling_frequency,
             )

             # Replace or add new columns based on configuration
             df[filter_config["result_columns"]] = filtered_data

-        values_colnames += config.gravity_cols
+        values_colnames += config.gravity_colnames

-    df = df[[DataColumns.TIME, *values_colnames]]
+    df = df[[config.time_colname, *values_colnames]]

     return df


-def preprocess_ppg_data(df_ppg: pd.DataFrame, df_acc: pd.DataFrame, ppg_config: PPGConfig,
-                        imu_config: IMUConfig, start_time_ppg: str, start_time_imu: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
+def preprocess_ppg_data(
+    df_ppg: pd.DataFrame,
+    ppg_config: PPGConfig,
+    start_time_ppg: str | None = None,
+    df_acc: pd.DataFrame | None = None,
+    imu_config: IMUConfig | None = None,
+    start_time_imu: str | None = None,
+    verbose: int = 1,
+) -> tuple[pd.DataFrame, pd.DataFrame | None]:
     """
-    Preprocess PPG and IMU (accelerometer only) data by resampling, filtering, and aligning the data segments.
+    This function preprocesses PPG and accelerometer data by resampling,
+    filtering and aligning the data segments of both sensors (if applicable).
+    Aligning is done using the extract_overlapping_segments function which is
+    based on the provided start times of the PPG and IMU data and returns
+    only the data points where both signals overlap in time. The remaining
+    data points are discarded.
+    After alignment, the function resamples the data to the specified
+    frequency and applies Butterworth filters to both PPG and accelerometer
+    data (if applicable).
+    The output is two DataFrames: one for the preprocessed PPG data and
+    another for the preprocessed accelerometer data (if provided, otherwise
+    return is None).

     Parameters
     ----------
     df_ppg : pd.DataFrame
         DataFrame containing PPG data.
-    df_acc : pd.DataFrame
-        DataFrame containing accelerometer from IMU data.
     ppg_config : PPGPreprocessingConfig
         Configuration object for PPG preprocessing.
-    imu_config : IMUPreprocessingConfig
-        Configuration object for IMU preprocessing.
     start_time_ppg : str
         iso8601 formatted start time of the PPG data.
+    df_acc : pd.DataFrame
+        DataFrame containing accelerometer from IMU data.
+    imu_config : IMUPreprocessingConfig
+        Configuration object for IMU preprocessing.
     start_time_imu : str
         iso8601 formatted start time of the IMU data.
+    verbose : int, default 1
+        Logging verbosity level: 0=errors only, 1=basic info, 2+=detailed info.

     Returns
     -------
-    Tuple[pd.DataFrame, pd.DataFrame]
-        Preprocessed PPG and IMU data as DataFrames.
-
+    Tuple[pd.DataFrame, pd.DataFrame | None]
+        A tuple containing two DataFrames:
+        - Preprocessed PPG data with the following transformations:
+            - Resampled data at the specified frequency.
+            - Filtered PPG data with bandpass filtering applied.
+        - Preprocessed accelerometer data (if provided, otherwise return is
+          None) with the following transformations:
+            - Resampled data at the specified frequency.
+            - Filtered accelerometer data with high-pass and low-pass
+              filtering applied.
+
+    Notes
+    -----
+    - If accelerometer data or IMU configuration is not provided, the
+      function only preprocesses PPG data.
+    - The function applies Butterworth filters to PPG and accelerometer
+      (if applicable) data, both high-pass and low-pass.
+
     """
+    # Make copies to avoid SettingWithCopyWarning
+    df_ppg = df_ppg.copy()
+    if df_acc is not None:
+        df_acc = df_acc.copy()
+
+    if df_acc is not None and imu_config is not None:
+        # Extract overlapping segments
+        df_ppg_overlapping, df_acc_overlapping = extract_overlapping_segments(
+            df_ppg=df_ppg,
+            df_acc=df_acc,
+            time_colname_ppg=ppg_config.time_colname,
+            time_colname_imu=imu_config.time_colname,
+            start_time_ppg=start_time_ppg,
+            start_time_acc=start_time_imu,
+        )

-    # Extract overlapping segments
-    df_ppg_overlapping, df_acc_overlapping = extract_overlapping_segments(df_ppg, df_acc, start_time_ppg, start_time_imu)
-
-    # Resample accelerometer data
-    df_acc_proc = resample_data(
-        df=df_acc_overlapping,
-        time_column=DataColumns.TIME,
-        values_column_names = list(imu_config.d_channels_accelerometer.keys()),
-        sampling_frequency=imu_config.sampling_frequency,
-        resampling_frequency=imu_config.sampling_frequency
-    )
+        # Resample accelerometer data
+        # Skip contiguity validation if data has been pre-segmented
+        validate_contiguous_acc = "data_segment_nr" not in df_acc_overlapping.columns
+        df_acc_proc = resample_data(
+            df=df_acc_overlapping,
+            time_column=imu_config.time_colname,
+            values_column_names=list(imu_config.d_channels_accelerometer.keys()),
+            sampling_frequency=imu_config.sampling_frequency,
+            resampling_frequency=imu_config.resampling_frequency,
+            tolerance=imu_config.tolerance,
+            validate_contiguous=validate_contiguous_acc,
+            verbose=verbose,
+        )
+
+        # Extract accelerometer data for filtering
+        accel_data = df_acc_proc[imu_config.accelerometer_colnames].values
+
+        # Define filter configurations for high-pass and low-pass
+        filter_renaming_configs = {
+            "hp": {
+                "result_columns": imu_config.accelerometer_colnames,
+                "replace_original": True,
+            }
+        }
+
+        # Apply filters in a loop
+        for passband, filter_config in filter_renaming_configs.items():
+            filtered_data = butterworth_filter(
+                data=accel_data,
+                order=imu_config.filter_order,
+                cutoff_frequency=imu_config.lower_cutoff_frequency,
+                passband=passband,
+                sampling_frequency=imu_config.sampling_frequency,
+            )
+
+            # Replace or add new columns based on configuration
+            df_acc_proc[filter_config["result_columns"]] = filtered_data
+
+    else:
+        df_ppg_overlapping = df_ppg

     # Resample PPG data
+    # Skip contiguity validation if data has been pre-segmented
+    validate_contiguous_ppg = "data_segment_nr" not in df_ppg_overlapping.columns
     df_ppg_proc = resample_data(
         df=df_ppg_overlapping,
-        time_column=DataColumns.TIME,
-        values_column_names = list(ppg_config.d_channels_ppg.keys()),
+        time_column=ppg_config.time_colname,
+        values_column_names=list(ppg_config.d_channels_ppg.keys()),
         sampling_frequency=ppg_config.sampling_frequency,
-        resampling_frequency=ppg_config.sampling_frequency
+        resampling_frequency=ppg_config.resampling_frequency,
+        tolerance=ppg_config.tolerance,
+        validate_contiguous=validate_contiguous_ppg,
+        verbose=verbose,
     )

-
-    # Extract accelerometer data for filtering
-    accel_data = df_acc_proc[imu_config.accelerometer_cols].values
-
-    # Define filter configurations for high-pass and low-pass
-    filter_renaming_configs = {
-        "hp": {"result_columns": imu_config.accelerometer_cols, "replace_original": True}}
-
-    # Apply filters in a loop
-    for passband, filter_config in filter_renaming_configs.items():
-        filtered_data = butterworth_filter(
-            data=accel_data,
-            order=imu_config.filter_order,
-            cutoff_frequency=imu_config.lower_cutoff_frequency,
-            passband=passband,
-            sampling_frequency=imu_config.sampling_frequency,
-        )
-
-        # Replace or add new columns based on configuration
-        df_acc_proc[filter_config["result_columns"]] = filtered_data
-
     # Extract accelerometer data for filtering
     ppg_data = df_ppg_proc[ppg_config.ppg_colname].values

     # Define filter configurations for high-pass and low-pass
     filter_renaming_configs = {
-        "bandpass": {"result_columns": ppg_config.ppg_colname, "replace_original": True}}
+        "bandpass": {"result_columns": ppg_config.ppg_colname, "replace_original": True}
+    }

     # Apply filters in a loop
     for passband, filter_config in filter_renaming_configs.items():
         filtered_data = butterworth_filter(
-            data=ppg_data,
-            order=ppg_config.filter_order,
-            cutoff_frequency=[ppg_config.lower_cutoff_frequency, ppg_config.upper_cutoff_frequency],
-            passband=passband,
-            sampling_frequency=ppg_config.sampling_frequency,
+            data=ppg_data,
+            order=ppg_config.filter_order,
+            cutoff_frequency=[
+                ppg_config.lower_cutoff_frequency,
+                ppg_config.upper_cutoff_frequency,
+            ],
+            passband=passband,
+            sampling_frequency=ppg_config.sampling_frequency,
         )

         # Replace or add new columns based on configuration
         df_ppg_proc[filter_config["result_columns"]] = filtered_data
-
-    return df_ppg_proc, df_acc_proc
-

+    if df_acc is not None and imu_config is not None:
+        return df_ppg_proc, df_acc_proc
+    else:
+        return df_ppg_proc, None


-def extract_overlapping_segments(df_ppg: pd.DataFrame, df_acc: pd.DataFrame, start_time_ppg: str, start_time_acc: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
+def extract_overlapping_segments(
+    df_ppg: pd.DataFrame,
+    df_acc: pd.DataFrame,
+    time_colname_ppg: str,
+    time_colname_imu: str,
+    start_time_ppg: str,
+    start_time_acc: str,
+) -> tuple[pd.DataFrame, pd.DataFrame]:
     """
-    Extract DataFrames with overlapping data segments between accelerometer (from the IMU) and PPG datasets based on their timestamps.
+    Extract DataFrames with overlapping data segments between accelerometer
+    (from the IMU) and PPG datasets based on their timestamps.

     Parameters
     ----------
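A sketch of the two calling modes the reordered preprocess_ppg_data signature now allows; df_ppg, df_acc, the start times, and the default-constructed configs are placeholders:

from paradigma.config import IMUConfig, PPGConfig
from paradigma.preprocessing import preprocess_ppg_data  # module path assumed

ppg_config = PPGConfig()  # defaults assumed

# New in 1.1.0: PPG-only mode, no accelerometer inputs required
df_ppg_proc, df_acc_proc = preprocess_ppg_data(df_ppg=df_ppg, ppg_config=ppg_config)
assert df_acc_proc is None  # second element is None without IMU input

# Paired mode, carried over from 1.0.3 (note the new keyword order)
df_ppg_proc, df_acc_proc = preprocess_ppg_data(
    df_ppg=df_ppg,
    ppg_config=ppg_config,
    start_time_ppg="2024-01-01T00:00:00Z",
    df_acc=df_acc,
    imu_config=IMUConfig(),
    start_time_imu="2024-01-01T00:00:05Z",
)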
@@ -349,6 +680,10 @@ def extract_overlapping_segments(df_ppg: pd.DataFrame, df_acc: pd.DataFrame, sta
         DataFrame containing PPG data.
     df_acc : pd.DataFrame
         DataFrame containing accelerometer data from the IMU.
+    time_colname_ppg : str
+        The name of the column containing the time data in the PPG dataframe.
+    time_colname_imu : str
+        The name of the column containing the time data in the IMU dataframe.
     start_time_ppg : str
         iso8601 formatted start time of the PPG data.
     start_time_acc : str
@@ -357,7 +692,8 @@ def extract_overlapping_segments(df_ppg: pd.DataFrame, df_acc: pd.DataFrame, sta
     Returns
     -------
     Tuple[pd.DataFrame, pd.DataFrame]
-        DataFrames containing the overlapping segments (time and values) of PPG and accelerometer data.
+        DataFrames containing the overlapping segments (time and values) of
+        PPG and accelerometer data.
     """
     # Convert start times to Unix timestamps
     datetime_ppg_start = datetime.fromisoformat(start_time_ppg.replace("Z", "+00:00"))
@@ -365,22 +701,23 @@ def extract_overlapping_segments(df_ppg: pd.DataFrame, df_acc: pd.DataFrame, sta
     datetime_acc_start = datetime.fromisoformat(start_time_acc.replace("Z", "+00:00"))
     start_acc_ppg = int(datetime_acc_start.timestamp())

-    # Calculate the time in Unix timestamps for each dataset because the timestamps are relative to the start time
-    ppg_time = df_ppg[DataColumns.TIME] + start_unix_ppg
-    acc_time = df_acc[DataColumns.TIME] + start_acc_ppg
+    # Calculate the time in Unix timestamps for each dataset because the
+    # timestamps are relative to the start time
+    ppg_time = df_ppg[time_colname_ppg] + start_unix_ppg
+    acc_time = df_acc[time_colname_imu] + start_acc_ppg

     # Determine the overlapping time interval
     start_time = max(ppg_time.iloc[0], acc_time.iloc[0])
     end_time = min(ppg_time.iloc[-1], acc_time.iloc[-1])

     # Extract indices for overlapping segments
-    ppg_start_index = np.searchsorted(ppg_time, start_time, 'left')
-    ppg_end_index = np.searchsorted(ppg_time, end_time, 'right') - 1
-    acc_start_index = np.searchsorted(acc_time, start_time, 'left')
-    acc_end_index = np.searchsorted(acc_time, end_time, 'right') - 1
+    ppg_start_index = np.searchsorted(ppg_time, start_time, "left")
+    ppg_end_index = np.searchsorted(ppg_time, end_time, "right") - 1
+    acc_start_index = np.searchsorted(acc_time, start_time, "left")
+    acc_end_index = np.searchsorted(acc_time, end_time, "right") - 1

     # Extract overlapping segments from DataFrames
-    df_ppg_overlapping = df_ppg.iloc[ppg_start_index:ppg_end_index + 1]
-    df_acc_overlapping = df_acc.iloc[acc_start_index:acc_end_index + 1]
+    df_ppg_overlapping = df_ppg.iloc[ppg_start_index : ppg_end_index + 1]
+    df_acc_overlapping = df_acc.iloc[acc_start_index : acc_end_index + 1]

-    return df_ppg_overlapping, df_acc_overlapping
+    return df_ppg_overlapping, df_acc_overlapping
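To close, a worked sketch of the overlap arithmetic above, with synthetic relative timestamps and made-up ISO 8601 start times:

import numpy as np
import pandas as pd
from paradigma.preprocessing import extract_overlapping_segments  # path assumed

# Relative timestamps: seconds since each sensor's own start
df_ppg = pd.DataFrame({"time": np.arange(0, 60, 1 / 30)})   # 30 Hz PPG
df_acc = pd.DataFrame({"time": np.arange(0, 60, 1 / 100)})  # 100 Hz IMU

# The IMU started 10 s after the PPG, so only PPG seconds 10-60 overlap
# with IMU seconds 0-50; everything outside that window is dropped.
df_ppg_ov, df_acc_ov = extract_overlapping_segments(
    df_ppg=df_ppg,
    df_acc=df_acc,
    time_colname_ppg="time",
    time_colname_imu="time",
    start_time_ppg="2024-01-01T00:00:00Z",
    start_time_acc="2024-01-01T00:00:10Z",
)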