paradigma 1.0.4__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,4 @@
1
1
  from datetime import datetime
2
- from typing import List, Tuple, Union
3
2
 
4
3
  import numpy as np
5
4
  import pandas as pd
@@ -7,72 +6,266 @@ from scipy import signal
7
6
  from scipy.interpolate import interp1d
8
7
 
9
8
  from paradigma.config import IMUConfig, PPGConfig
9
+ from paradigma.segmenting import create_segments, discard_segments
10
10
  from paradigma.util import invert_watch_side
11
11
 
12
12
 
13
13
  def resample_data(
14
14
  df: pd.DataFrame,
15
- time_column: str,
16
- values_column_names: List[str],
17
- sampling_frequency: int,
18
- resampling_frequency: int,
15
+ time_column: str = "time",
16
+ values_column_names: list[str] | None = None,
17
+ sampling_frequency: int | None = None,
18
+ resampling_frequency: int | None = None,
19
19
  tolerance: float | None = None,
20
+ validate_contiguous: bool = True,
21
+ auto_segment: bool = False,
22
+ max_segment_gap_s: float | None = None,
23
+ min_segment_length_s: float | None = None,
24
+ verbose: int = 2,
20
25
  ) -> pd.DataFrame:
21
26
  """
22
- Resamples sensor data to a specified frequency using cubic interpolation.
27
+ Unified resampling function with optional auto-segmentation for non-contiguous data.
28
+
29
+ This function supports:
30
+ - Automatic frequency detection or explicit specification
31
+ - Contiguity validation with configurable tolerance
32
+ - Automatic segmentation of non-contiguous data
33
+ - Preservation of non-numeric columns
23
34
 
24
35
  Parameters
25
36
  ----------
26
37
  df : pd.DataFrame
27
38
  The input DataFrame containing the sensor data.
28
- time_column : str
39
+ time_column : str, default 'time'
29
40
  The name of the column containing the time data.
30
- values_column_names : List[str]
31
- A list of column names that should be resampled.
32
- sampling_frequency : int
33
- The original sampling frequency of the data (in Hz).
34
- resampling_frequency : int
35
- The frequency to which the data should be resampled (in Hz).
41
+ values_column_names : List[str], optional
42
+ Column names to resample. If None, auto-detects all numeric columns except time.
43
+ sampling_frequency : int, optional
44
+ Original sampling frequency (Hz). If None, auto-detected from data.
45
+ resampling_frequency : int, optional
46
+ Target sampling frequency in Hz.
36
47
  tolerance : float, optional
37
- The tolerance added to the expected difference when checking
38
- for contiguous timestamps. If not provided, it defaults to the tolerance specified in IMUConfig.
48
+ Tolerance for contiguity checking (seconds). Defaults to IMUConfig tolerance.
49
+ validate_contiguous : bool, default True
50
+ Whether to validate data contiguity. If False, gaps are silently interpolated.
51
+ auto_segment : bool, default False
52
+ If True, automatically split non-contiguous data into segments and
53
+ process each. Adds 'data_segment_nr' column to output. If False and
54
+ data is non-contiguous with validate_contiguous=True, raises
55
+ ValueError.
56
+ max_segment_gap_s : float, optional
57
+ Maximum gap (seconds) before starting new segment. Used when auto_segment=True.
58
+ Defaults to 1.5 s (hard-coded to mirror the IMUConfig default).
59
+ min_segment_length_s : float, optional
60
+ Minimum segment length (seconds) to keep. Used when auto_segment=True.
61
+ Defaults to 1.5 s (hard-coded to mirror the IMUConfig default).
62
+ verbose : int, default 2
63
+ Logging verbosity level: 0=errors only, 1=basic info, 2+=detailed info.
64
+ Note: This function still uses verbose for backward compatibility
65
+ with existing code that calls it directly.
39
66
 
40
67
  Returns
41
68
  -------
42
69
  pd.DataFrame
43
- A DataFrame with the resampled data, where each column contains resampled values.
44
- The time column will reflect the new resampling frequency.
70
+ Resampled DataFrame. If auto_segment=True and the data is non-contiguous,
71
+ includes 'data_segment_nr' column identifying each contiguous data segment.
45
72
 
46
73
  Raises
47
74
  ------
48
75
  ValueError
49
- If the time array is not strictly increasing.
76
+ - If time array is not strictly increasing
77
+ - If time array is not contiguous and validate_contiguous=True
78
+ and auto_segment=False
79
+ - If no numeric columns found for resampling
80
+ - If all segments are discarded due to min_segment_length_s
50
81
 
51
82
  Notes
52
83
  -----
53
- - Uses cubic interpolation for smooth resampling if there are enough points.
54
- - If only two timestamps are available, it falls back to linear interpolation.
55
- """
56
-
57
- # Set default tolerance if not provided to tolerance specified in IMUConfig
58
- if tolerance is None:
59
- tolerance = IMUConfig().tolerance
84
+ - Uses cubic interpolation for smooth resampling if there are enough points
85
+ - Falls back to linear interpolation if only 2-3 points available
86
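+ - Columns that are not resampled are kept only when auto-segmenting (each segment's first value is copied to all of its rows); otherwise only the time and value columns are returned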
87
+ - Backwards compatible with both old resample_data signatures
88
+
89
+ Examples
90
+ --------
91
+ # Auto-detection mode
92
+ df_resampled = resample_data(df, resampling_frequency=100)
93
+
94
+ # Explicit mode
95
+ df_resampled = resample_data(
96
+ df, time_column='time', values_column_names=['acc_x', 'acc_y'],
97
+ sampling_frequency=128, resampling_frequency=100
98
+ )
60
99
 
61
- # Extract time and values
100
+ # Auto-segmentation mode
101
+ df_segmented = resample_data(
102
+ df, resampling_frequency=100, auto_segment=True,
103
+ max_segment_gap_s=2.0, min_segment_length_s=3.0
104
+ )
105
+ """
106
+ df = df.copy()
107
+
108
+ if time_column not in df.columns:
109
+ raise ValueError(f"Time column '{time_column}' not found in DataFrame")
110
+
111
+ # Validate resampling frequency
112
+ if resampling_frequency is None:
113
+ raise ValueError("resampling_frequency must be provided")
114
+
115
+ resampling_frequency = float(resampling_frequency)
116
+
117
+ # Auto-detect or use provided column names
118
+ if values_column_names is None:
119
+ numeric_columns = df.select_dtypes(include=[np.number]).columns
120
+ values_column_names = [
121
+ col
122
+ for col in numeric_columns
123
+ if col != time_column and col != "data_segment_nr"
124
+ ]
125
+ if not values_column_names:
126
+ raise ValueError("No numeric columns found for resampling")
127
+ if verbose >= 2:
128
+ print(f"Auto-detected {len(values_column_names)} columns for resampling")
129
+
130
+ # Auto-detect or use provided sampling frequency
62
131
  time_abs_array = np.array(df[time_column])
63
- values_array = np.array(df[values_column_names])
132
+ if sampling_frequency is None:
133
+ time_diff = df[time_column].diff().dropna()
134
+ current_dt = time_diff.median()
135
+ sampling_frequency = 1.0 / current_dt
136
+ if verbose >= 2:
137
+ print(f"Auto-detected sampling frequency: {sampling_frequency:.2f} Hz")
138
+ else:
139
+ sampling_frequency = float(sampling_frequency)
64
140
 
65
- # Ensure the time array is strictly increasing
141
+ # Ensure time array is strictly increasing
66
142
  if not np.all(np.diff(time_abs_array) > 0):
67
143
  raise ValueError("Time array is not strictly increasing")
68
144
 
69
- # Ensure the time array is contiguous
145
+ # Set default tolerance if not provided
146
+ if tolerance is None:
147
+ tolerance = IMUConfig().tolerance
148
+
149
+ # Set default segmentation parameters
150
+ if auto_segment:
151
+ if max_segment_gap_s is None:
152
+ max_segment_gap_s = 1.5 # IMUConfig default
153
+ if min_segment_length_s is None:
154
+ min_segment_length_s = 1.5 # IMUConfig default
155
+
156
+ # Check contiguity
70
157
  expected_interval = 1 / sampling_frequency
71
158
  timestamp_diffs = np.diff(time_abs_array)
72
- if np.any(np.abs(timestamp_diffs - expected_interval) > tolerance):
73
- raise ValueError("Time array is not contiguous")
159
+ is_contiguous = not np.any(np.abs(timestamp_diffs - expected_interval) > tolerance)
160
+
161
+ if not is_contiguous:
162
+ if validate_contiguous and not auto_segment:
163
+ raise ValueError(
164
+ "Time array is not contiguous. Consider enabling automatic "
165
+ "segmentation to split and process non-contiguous segments, or "
166
+ "disable contiguity validation to interpolate over gaps."
167
+ )
168
+ elif auto_segment:
169
+ # Split into segments
170
+ if verbose >= 1:
171
+ print("Non-contiguous data detected. Auto-segmenting...")
172
+
173
+ # Create segments based on gaps
174
+ segment_array = create_segments(
175
+ time_array=time_abs_array,
176
+ max_segment_gap_s=max_segment_gap_s,
177
+ )
178
+ df["data_segment_nr"] = segment_array
179
+
180
+ # Discard segments that are too short
181
+ df = discard_segments(
182
+ df=df,
183
+ segment_nr_colname="data_segment_nr",
184
+ min_segment_length_s=min_segment_length_s,
185
+ fs=int(sampling_frequency),
186
+ format="timestamps",
187
+ )
74
188
 
75
- # Resample the time data using the specified frequency
189
+ n_segments = df["data_segment_nr"].nunique()
190
+ if verbose >= 1:
191
+ segment_durations = []
192
+ for seg_nr in df["data_segment_nr"].unique():
193
+ seg_df = df[df["data_segment_nr"] == seg_nr]
194
+ duration = (
195
+ seg_df[time_column].iloc[-1] - seg_df[time_column].iloc[0]
196
+ )
197
+ segment_durations.append(f"{duration:.1f}s")
198
+ print(f"Created {n_segments} segments: {', '.join(segment_durations)}")
199
+
200
+ # Resample each segment independently
201
+ resampled_segments = []
202
+ for seg_nr in df["data_segment_nr"].unique():
203
+ seg_df = df[df["data_segment_nr"] == seg_nr].copy()
204
+ seg_time = np.array(seg_df[time_column])
205
+ seg_values = np.array(seg_df[values_column_names])
206
+
207
+ # Resample this segment
208
+ duration = seg_time[-1] - seg_time[0]
209
+ n_samples = int(np.round(duration * resampling_frequency)) + 1
210
+ t_resampled = np.linspace(seg_time[0], seg_time[-1], n_samples)
211
+
212
+ interpolation_kind = "cubic" if len(seg_time) > 3 else "linear"
213
+ interpolator = interp1d(
214
+ seg_time,
215
+ seg_values,
216
+ axis=0,
217
+ kind=interpolation_kind,
218
+ fill_value="extrapolate",
219
+ )
220
+ resampled_values = interpolator(t_resampled)
221
+
222
+ # Create resampled segment DataFrame
223
+ df_seg_resampled = pd.DataFrame(
224
+ resampled_values, columns=values_column_names
225
+ )
226
+ df_seg_resampled[time_column] = t_resampled
227
+ df_seg_resampled["data_segment_nr"] = seg_nr
228
+
229
+ # Copy non-numeric columns from first row of segment
230
+ for column in seg_df.columns:
231
+ if (
232
+ column not in df_seg_resampled.columns
233
+ and column != "data_segment_nr"
234
+ ):
235
+ df_seg_resampled[column] = seg_df[column].iloc[0]
236
+
237
+ resampled_segments.append(df_seg_resampled)
238
+
239
+ # Concatenate all segments
240
+ df_resampled = pd.concat(resampled_segments, ignore_index=True)
241
+
242
+ # Ensure correct column order
243
+ resampled_columns = (
244
+ [time_column] + values_column_names + ["data_segment_nr"]
245
+ )
246
+ other_cols = [
247
+ col for col in df_resampled.columns if col not in resampled_columns
248
+ ]
249
+ df_resampled = df_resampled[resampled_columns + other_cols]
250
+
251
+ if verbose >= 1:
252
+ print(
253
+ f"Resampled: {len(df)} -> {len(df_resampled)} rows at "
254
+ f"{resampling_frequency} Hz"
255
+ )
256
+
257
+ return df_resampled
258
+
259
+ elif verbose >= 2:
260
+ print(
261
+ "Warning: Data is not contiguous but validation is disabled. "
262
+ "Interpolating over gaps."
263
+ )
264
+
265
+ # Standard resampling for contiguous data (or when validation is disabled)
266
+ values_array = np.array(df[values_column_names])
267
+
268
+ # Resample the time data
76
269
  t_resampled = np.arange(
77
270
  time_abs_array[0], time_abs_array[-1], 1 / resampling_frequency
78
271
  )
@@ -90,18 +283,27 @@ def resample_data(
90
283
  # Interpolate
91
284
  resampled_values = interpolator(t_resampled)
92
285
 
93
- # Create a DataFrame with the resampled data
286
+ # Create resampled DataFrame
94
287
  df_resampled = pd.DataFrame(resampled_values, columns=values_column_names)
95
288
  df_resampled[time_column] = t_resampled
96
289
 
97
- # Return the DataFrame with columns in the correct order
98
- return df_resampled[[time_column] + values_column_names]
290
+ # Return with correct column order
291
+ resampled_columns = [time_column] + values_column_names
292
+ df_resampled = df_resampled[resampled_columns]
293
+
294
+ if verbose >= 1:
295
+ print(
296
+ f"Resampled: {len(df)} -> {len(df_resampled)} rows at "
297
+ f"{resampling_frequency} Hz"
298
+ )
299
+
300
+ return df_resampled
99
301
 
100
302
 
101
303
  def butterworth_filter(
102
304
  data: np.ndarray,
103
305
  order: int,
104
- cutoff_frequency: Union[float, List[float]],
306
+ cutoff_frequency: float | list[float],
105
307
  passband: str,
106
308
  sampling_frequency: int,
107
309
  ):
@@ -119,17 +321,19 @@ def butterworth_filter(
119
321
  (e.g., multi-axis sensor data).
120
322
  order : int
121
323
  The order of the Butterworth filter. Higher values result in a steeper roll-off.
122
- cutoff_frequency : float or List[float]
123
- The cutoff frequency (or frequencies) for the filter. For a low-pass or high-pass filter,
124
- this is a single float. For a band-pass filter, this should be a list of two floats,
125
- specifying the lower and upper cutoff frequencies.
324
+ cutoff_frequency : float or list of float
325
+ The cutoff frequency (or frequencies) for the filter. For a low-pass
326
+ or high-pass filter, this is a single float. For a band-pass filter,
327
+ this should be a list of two floats, specifying the lower and upper
328
+ cutoff frequencies.
126
329
  passband : str
127
330
  The type of passband to apply. Options are:
128
331
  - 'hp' : high-pass filter
129
332
  - 'lp' : low-pass filter
130
333
  - 'band' : band-pass filter
131
334
  sampling_frequency : int
132
- The sampling frequency of the data in Hz. This is used to normalize the cutoff frequency.
335
+ The sampling frequency of the data in Hz. This is used to normalize
336
+ the cutoff frequency.
133
337
 
134
338
  Returns
135
339
  -------
@@ -139,12 +343,14 @@ def butterworth_filter(
139
343
  Raises
140
344
  ------
141
345
  ValueError
142
- If the input data has more than two dimensions, or if an invalid passband is specified.
346
+ If the input data has more than two dimensions, or if an invalid
347
+ passband is specified.
143
348
 
144
349
  Notes
145
350
  -----
146
- The function uses `scipy.signal.butter` to design the filter and `scipy.signal.sosfiltfilt`
147
- to apply it using second-order sections (SOS) to improve numerical stability.
351
+ The function uses `scipy.signal.butter` to design the filter and
352
+ `scipy.signal.sosfiltfilt` to apply it using second-order sections (SOS)
353
+ to improve numerical stability.
148
354
  """
149
355
  # Design the filter using second-order sections (SOS)
150
356
  sos = signal.butter(
@@ -166,7 +372,11 @@ def butterworth_filter(
166
372
 
167
373
 
168
374
  def preprocess_imu_data(
169
- df: pd.DataFrame, config: IMUConfig, sensor: str, watch_side: str
375
+ df: pd.DataFrame,
376
+ config: IMUConfig,
377
+ sensor: str,
378
+ watch_side: str,
379
+ verbose: int = 1,
170
380
  ) -> pd.DataFrame:
171
381
  """
172
382
  Preprocesses IMU data by resampling and applying filters.
@@ -176,8 +386,9 @@ def preprocess_imu_data(
176
386
  df : pd.DataFrame
177
387
  The DataFrame containing raw accelerometer and/or gyroscope data.
178
388
  config : IMUConfig
179
- Configuration object containing various settings, such as time column name, accelerometer and/or gyroscope columns,
180
- filter settings, and sampling frequency.
389
+ Configuration object containing various settings, such as time column
390
+ name, accelerometer and/or gyroscope columns, filter settings, and
391
+ sampling frequency.
181
392
  sensor: str
182
393
  Name of the sensor data to be preprocessed. Must be one of:
183
394
  - "accelerometer": Preprocess accelerometer data only.
@@ -187,18 +398,25 @@ def preprocess_imu_data(
187
398
  The side of the watch where the data was collected. Must be one of:
188
399
  - "left": Data was collected from the left wrist.
189
400
  - "right": Data was collected from the right wrist.
401
+ verbose : int, default 1
402
+ Logging verbose level: 0=errors only, 1=basic info, 2+=detailed info.
190
403
 
191
404
  Returns
192
405
  -------
193
406
  pd.DataFrame
194
- The preprocessed accelerometer and or gyroscope data with the following transformations:
407
+ The preprocessed accelerometer and or gyroscope data with the
408
+ following transformations:
195
409
  - Resampled data at the specified frequency.
196
- - Filtered accelerometer data with high-pass and low-pass filtering applied.
410
+ - Filtered accelerometer data with high-pass and low-pass filtering
411
+ applied.
197
412
 
198
413
  Notes
199
414
  -----
200
- - The function applies Butterworth filters to accelerometer data, both high-pass and low-pass.
415
+ - The function applies Butterworth filters to accelerometer data, both
416
+ high-pass and low-pass.
201
417
  """
418
+ # Make a copy to avoid SettingWithCopyWarning
419
+ df = df.copy()
202
420
 
203
421
  # Extract sensor column
204
422
  if sensor == "accelerometer":
@@ -210,15 +428,35 @@ def preprocess_imu_data(
210
428
  else:
211
429
  raise ("Sensor should be either accelerometer, gyroscope, or both")
212
430
 
213
- # Resample the data to the specified frequency
214
- df = resample_data(
215
- df=df,
216
- time_column=config.time_colname,
217
- values_column_names=values_colnames,
218
- sampling_frequency=config.sampling_frequency,
219
- resampling_frequency=config.resampling_frequency,
220
- tolerance=config.tolerance,
221
- )
431
+ # Check if data needs resampling
432
+ # Skip resampling if already at target frequency; pre-segmented data only skips contiguity validation
433
+ needs_resampling = True
434
+ validate_contiguous = True
435
+
436
+ if "data_segment_nr" in df.columns:
437
+ # Data has been pre-segmented, skip contiguity validation
438
+ validate_contiguous = False
439
+
440
+ # Check current sampling frequency
441
+ time_diff = df[config.time_colname].diff().dropna()
442
+ current_dt = time_diff.median()
443
+ current_frequency = 1.0 / current_dt
444
+
445
+ if abs(current_frequency - config.resampling_frequency) < 0.1:
446
+ needs_resampling = False
447
+
448
+ if needs_resampling:
449
+ # Resample the data to the specified frequency
450
+ df = resample_data(
451
+ df=df,
452
+ time_column=config.time_colname,
453
+ values_column_names=values_colnames,
454
+ sampling_frequency=config.sampling_frequency,
455
+ resampling_frequency=config.resampling_frequency,
456
+ tolerance=config.tolerance,
457
+ validate_contiguous=validate_contiguous,
458
+ verbose=verbose,
459
+ )
222
460
 
223
461
  # Invert the IMU data if the watch was worn on the right wrist
224
462
  df = invert_watch_side(df, watch_side, sensor)
@@ -269,28 +507,38 @@ def preprocess_ppg_data(
269
507
  df_acc: pd.DataFrame | None = None,
270
508
  imu_config: IMUConfig | None = None,
271
509
  start_time_imu: str | None = None,
272
- ) -> Tuple[pd.DataFrame, pd.DataFrame | None]:
510
+ verbose: int = 1,
511
+ ) -> tuple[pd.DataFrame, pd.DataFrame | None]:
273
512
  """
274
- This function preprocesses PPG and accelerometer data by resampling, filtering and aligning the data segments of both sensors (if applicable).
275
- Aligning is done using the extract_overlapping_segments function which is based on the provided start times of the PPG and IMU data and returns
276
- only the data points where both signals overlap in time. The remaining data points are discarded.
277
- After alignment, the function resamples the data to the specified frequency and applies Butterworth filters to both PPG and accelerometer data (if applicable).
278
- The output is two DataFrames: one for the preprocessed PPG data and another for the preprocessed accelerometer data (if provided, otherwise return is None).
513
+ This function preprocesses PPG and accelerometer data by resampling,
514
+ filtering and aligning the data segments of both sensors (if applicable).
515
+ Aligning is done using the extract_overlapping_segments function which is
516
+ based on the provided start times of the PPG and IMU data and returns
517
+ only the data points where both signals overlap in time. The remaining
518
+ data points are discarded.
519
+ After alignment, the function resamples the data to the specified
520
+ frequency and applies Butterworth filters to both PPG and accelerometer
521
+ data (if applicable).
522
+ The output is two DataFrames: one for the preprocessed PPG data and
523
+ another for the preprocessed accelerometer data (if provided, otherwise
524
+ return is None).
279
525
 
280
526
  Parameters
281
527
  ----------
282
528
  df_ppg : pd.DataFrame
283
529
  DataFrame containing PPG data.
284
- df_acc : pd.DataFrame
285
- DataFrame containing accelerometer from IMU data.
286
530
  ppg_config : PPGPreprocessingConfig
287
531
  Configuration object for PPG preprocessing.
288
- imu_config : IMUPreprocessingConfig
289
- Configuration object for IMU preprocessing.
290
532
  start_time_ppg : str
291
533
  iso8601 formatted start time of the PPG data.
534
+ df_acc : pd.DataFrame, optional
535
+ DataFrame containing accelerometer from IMU data.
536
+ imu_config : IMUConfig, optional
537
+ Configuration object for IMU preprocessing.
292
538
  start_time_imu : str
293
539
  iso8601 formatted start time of the IMU data.
540
+ verbose : int, default 1
541
+ Logging verbose level: 0=errors only, 1=basic info, 2+=detailed info.
294
542
 
295
543
  Returns
296
544
  -------
@@ -299,16 +547,25 @@ def preprocess_ppg_data(
299
547
  - Preprocessed PPG data with the following transformations:
300
548
  - Resampled data at the specified frequency.
301
549
  - Filtered PPG data with bandpass filtering applied.
302
- - Preprocessed accelerometer data (if provided, otherwise return is None) with the following transformations:
550
+ - Preprocessed accelerometer data (if provided, otherwise return is
551
+ None) with the following transformations:
303
552
  - Resampled data at the specified frequency.
304
- - Filtered accelerometer data with high-pass and low-pass filtering applied.
553
+ - Filtered accelerometer data with high-pass and low-pass
554
+ filtering applied.
305
555
 
306
556
  Notes
307
557
  -----
308
- - If accelerometer data or IMU configuration is not provided, the function only preprocesses PPG data.
309
- - The function applies Butterworth filters to PPG and accelerometer (if applicable) data, both high-pass and low-pass.
558
+ - If accelerometer data or IMU configuration is not provided, the
559
+ function only preprocesses PPG data.
560
+ - The function applies Butterworth filters to PPG and accelerometer
561
+ (if applicable) data, both high-pass and low-pass.
310
562
 
311
563
  """
564
+ # Make copies to avoid SettingWithCopyWarning
565
+ df_ppg = df_ppg.copy()
566
+ if df_acc is not None:
567
+ df_acc = df_acc.copy()
568
+
312
569
  if df_acc is not None and imu_config is not None:
313
570
  # Extract overlapping segments
314
571
  df_ppg_overlapping, df_acc_overlapping = extract_overlapping_segments(
@@ -321,6 +578,8 @@ def preprocess_ppg_data(
321
578
  )
322
579
 
323
580
  # Resample accelerometer data
581
+ # Skip contiguity validation if data has been pre-segmented
582
+ validate_contiguous_acc = "data_segment_nr" not in df_acc_overlapping.columns
324
583
  df_acc_proc = resample_data(
325
584
  df=df_acc_overlapping,
326
585
  time_column=imu_config.time_colname,
@@ -328,6 +587,8 @@ def preprocess_ppg_data(
328
587
  sampling_frequency=imu_config.sampling_frequency,
329
588
  resampling_frequency=imu_config.resampling_frequency,
330
589
  tolerance=imu_config.tolerance,
590
+ validate_contiguous=validate_contiguous_acc,
591
+ verbose=verbose,
331
592
  )
332
593
 
333
594
  # Extract accelerometer data for filtering
@@ -358,6 +619,8 @@ def preprocess_ppg_data(
358
619
  df_ppg_overlapping = df_ppg
359
620
 
360
621
  # Resample PPG data
622
+ # Skip contiguity validation if data has been pre-segmented
623
+ validate_contiguous_ppg = "data_segment_nr" not in df_ppg_overlapping.columns
361
624
  df_ppg_proc = resample_data(
362
625
  df=df_ppg_overlapping,
363
626
  time_column=ppg_config.time_colname,
@@ -365,6 +628,8 @@ def preprocess_ppg_data(
365
628
  sampling_frequency=ppg_config.sampling_frequency,
366
629
  resampling_frequency=ppg_config.resampling_frequency,
367
630
  tolerance=ppg_config.tolerance,
631
+ validate_contiguous=validate_contiguous_ppg,
632
+ verbose=verbose,
368
633
  )
369
634
 
370
635
  # Extract accelerometer data for filtering
@@ -404,9 +669,10 @@ def extract_overlapping_segments(
404
669
  time_colname_imu: str,
405
670
  start_time_ppg: str,
406
671
  start_time_acc: str,
407
- ) -> Tuple[pd.DataFrame, pd.DataFrame]:
672
+ ) -> tuple[pd.DataFrame, pd.DataFrame]:
408
673
  """
409
- Extract DataFrames with overlapping data segments between accelerometer (from the IMU) and PPG datasets based on their timestamps.
674
+ Extract DataFrames with overlapping data segments between accelerometer
675
+ (from the IMU) and PPG datasets based on their timestamps.
410
676
 
411
677
  Parameters
412
678
  ----------
@@ -426,7 +692,8 @@ def extract_overlapping_segments(
426
692
  Returns
427
693
  -------
428
694
  Tuple[pd.DataFrame, pd.DataFrame]
429
- DataFrames containing the overlapping segments (time and values) of PPG and accelerometer data.
695
+ DataFrames containing the overlapping segments (time and values) of
696
+ PPG and accelerometer data.
430
697
  """
431
698
  # Convert start times to Unix timestamps
432
699
  datetime_ppg_start = datetime.fromisoformat(start_time_ppg.replace("Z", "+00:00"))
@@ -434,7 +701,8 @@ def extract_overlapping_segments(
434
701
  datetime_acc_start = datetime.fromisoformat(start_time_acc.replace("Z", "+00:00"))
435
702
  start_acc_ppg = int(datetime_acc_start.timestamp())
436
703
 
437
- # Calculate the time in Unix timestamps for each dataset because the timestamps are relative to the start time
704
+ # Calculate the time in Unix timestamps for each dataset because the
705
+ # timestamps are relative to the start time
438
706
  ppg_time = df_ppg[time_colname_ppg] + start_unix_ppg
439
707
  acc_time = df_acc[time_colname_imu] + start_acc_ppg
440
708