paradigma 1.0.4__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- paradigma/__init__.py +10 -1
- paradigma/classification.py +14 -14
- paradigma/config.py +38 -29
- paradigma/constants.py +10 -2
- paradigma/feature_extraction.py +106 -75
- paradigma/load.py +476 -0
- paradigma/orchestrator.py +670 -0
- paradigma/pipelines/gait_pipeline.py +488 -97
- paradigma/pipelines/pulse_rate_pipeline.py +278 -46
- paradigma/pipelines/pulse_rate_utils.py +176 -137
- paradigma/pipelines/tremor_pipeline.py +292 -72
- paradigma/prepare_data.py +409 -0
- paradigma/preprocessing.py +345 -77
- paradigma/segmenting.py +57 -42
- paradigma/testing.py +14 -9
- paradigma/util.py +36 -22
- paradigma-1.1.0.dist-info/METADATA +229 -0
- paradigma-1.1.0.dist-info/RECORD +26 -0
- {paradigma-1.0.4.dist-info → paradigma-1.1.0.dist-info}/WHEEL +1 -1
- paradigma-1.0.4.dist-info/METADATA +0 -140
- paradigma-1.0.4.dist-info/RECORD +0 -23
- {paradigma-1.0.4.dist-info → paradigma-1.1.0.dist-info}/entry_points.txt +0 -0
- {paradigma-1.0.4.dist-info → paradigma-1.1.0.dist-info}/licenses/LICENSE +0 -0
paradigma/preprocessing.py
CHANGED
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
from datetime import datetime
|
|
2
|
-
from typing import List, Tuple, Union
|
|
3
2
|
|
|
4
3
|
import numpy as np
|
|
5
4
|
import pandas as pd
|
|
@@ -7,72 +6,266 @@ from scipy import signal
|
|
|
7
6
|
from scipy.interpolate import interp1d
|
|
8
7
|
|
|
9
8
|
from paradigma.config import IMUConfig, PPGConfig
|
|
9
|
+
from paradigma.segmenting import create_segments, discard_segments
|
|
10
10
|
from paradigma.util import invert_watch_side
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
def resample_data(
|
|
14
14
|
df: pd.DataFrame,
|
|
15
|
-
time_column: str,
|
|
16
|
-
values_column_names:
|
|
17
|
-
sampling_frequency: int,
|
|
18
|
-
resampling_frequency: int,
|
|
15
|
+
time_column: str = "time",
|
|
16
|
+
values_column_names: list[str] | None = None,
|
|
17
|
+
sampling_frequency: int | None = None,
|
|
18
|
+
resampling_frequency: int | None = None,
|
|
19
19
|
tolerance: float | None = None,
|
|
20
|
+
validate_contiguous: bool = True,
|
|
21
|
+
auto_segment: bool = False,
|
|
22
|
+
max_segment_gap_s: float | None = None,
|
|
23
|
+
min_segment_length_s: float | None = None,
|
|
24
|
+
verbose: int = 2,
|
|
20
25
|
) -> pd.DataFrame:
|
|
21
26
|
"""
|
|
22
|
-
|
|
27
|
+
Unified resampling function with optional auto-segmentation for non-contiguous data.
|
|
28
|
+
|
|
29
|
+
This function supports:
|
|
30
|
+
- Automatic frequency detection or explicit specification
|
|
31
|
+
- Contiguity validation with configurable tolerance
|
|
32
|
+
- Automatic segmentation of non-contiguous data
|
|
33
|
+
- Preservation of non-numeric columns
|
|
23
34
|
|
|
24
35
|
Parameters
|
|
25
36
|
----------
|
|
26
37
|
df : pd.DataFrame
|
|
27
38
|
The input DataFrame containing the sensor data.
|
|
28
|
-
time_column : str
|
|
39
|
+
time_column : str, default 'time'
|
|
29
40
|
The name of the column containing the time data.
|
|
30
|
-
values_column_names : List[str]
|
|
31
|
-
|
|
32
|
-
sampling_frequency : int
|
|
33
|
-
|
|
34
|
-
resampling_frequency : int
|
|
35
|
-
|
|
41
|
+
values_column_names : list of str, optional
|
|
42
|
+
Column names to resample. If None, auto-detects all numeric columns except time.
|
|
43
|
+
sampling_frequency : int, optional
|
|
44
|
+
Original sampling frequency (Hz). If None, auto-detected from data.
|
|
45
|
+
resampling_frequency : int, optional
|
|
46
|
+
Target sampling frequency in Hz.
|
|
36
47
|
tolerance : float, optional
|
|
37
|
-
|
|
38
|
-
|
|
48
|
+
Tolerance for contiguity checking (seconds). Defaults to IMUConfig tolerance.
|
|
49
|
+
validate_contiguous : bool, default True
|
|
50
|
+
Whether to validate data contiguity. If False, gaps are silently interpolated.
|
|
51
|
+
auto_segment : bool, default False
|
|
52
|
+
If True, automatically split non-contiguous data into segments and
|
|
53
|
+
process each. Adds 'data_segment_nr' column to output. If False and
|
|
54
|
+
data is non-contiguous with validate_contiguous=True, raises
|
|
55
|
+
ValueError.
|
|
56
|
+
max_segment_gap_s : float, optional
|
|
57
|
+
Maximum gap (seconds) before starting new segment. Used when auto_segment=True.
|
|
58
|
+
Defaults to 1.5 s (hard-coded to match the IMUConfig default).
|
|
59
|
+
min_segment_length_s : float, optional
|
|
60
|
+
Minimum segment length (seconds) to keep. Used when auto_segment=True.
|
|
61
|
+
Defaults to 1.5 s (hard-coded to match the IMUConfig default).
|
|
62
|
+
verbose : int, default 2
|
|
63
|
+
Logging verbosity level: 0=errors only, 1=basic info, 2+=detailed info.
|
|
64
|
+
Note: This function still uses verbose for backward compatibility
|
|
65
|
+
with existing code that calls it directly.
|
|
39
66
|
|
|
40
67
|
Returns
|
|
41
68
|
-------
|
|
42
69
|
pd.DataFrame
|
|
43
|
-
|
|
44
|
-
|
|
70
|
+
Resampled DataFrame. If auto_segment=True and multiple segments found,
|
|
71
|
+
includes 'data_segment_nr' column identifying each contiguous data segment.
|
|
45
72
|
|
|
46
73
|
Raises
|
|
47
74
|
------
|
|
48
75
|
ValueError
|
|
49
|
-
If
|
|
76
|
+
- If time array is not strictly increasing
|
|
77
|
+
- If time array is not contiguous and validate_contiguous=True
|
|
78
|
+
and auto_segment=False
|
|
79
|
+
- If no numeric columns found for resampling
|
|
80
|
+
- If all segments are discarded due to min_segment_length_s
|
|
50
81
|
|
|
51
82
|
Notes
|
|
52
83
|
-----
|
|
53
|
-
- Uses cubic interpolation for smooth resampling if there are enough points
|
|
54
|
-
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
84
|
+
- Uses cubic interpolation for smooth resampling if there are enough points
|
|
85
|
+
- Falls back to linear interpolation if only 2-3 points available
|
|
86
|
+
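- Columns that are not resampled are carried over only in auto-segment mode (first value of each segment copied to all of its rows); otherwise only the time and value columns are returned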
|
|
87
|
+
- Backwards compatible with both old resample_data signatures
|
|
88
|
+
|
|
89
|
+
Examples
|
|
90
|
+
--------
|
|
91
|
+
# Auto-detection mode
|
|
92
|
+
df_resampled = resample_data(df, resampling_frequency=100)
|
|
93
|
+
|
|
94
|
+
# Explicit mode
|
|
95
|
+
df_resampled = resample_data(
|
|
96
|
+
df, time_column='time', values_column_names=['acc_x', 'acc_y'],
|
|
97
|
+
sampling_frequency=128, resampling_frequency=100
|
|
98
|
+
)
|
|
60
99
|
|
|
61
|
-
#
|
|
100
|
+
# Auto-segmentation mode
|
|
101
|
+
df_segmented = resample_data(
|
|
102
|
+
df, resampling_frequency=100, auto_segment=True,
|
|
103
|
+
max_segment_gap_s=2.0, min_segment_length_s=3.0
|
|
104
|
+
)
|
|
105
|
+
"""
|
|
106
|
+
df = df.copy()
|
|
107
|
+
|
|
108
|
+
if time_column not in df.columns:
|
|
109
|
+
raise ValueError(f"Time column '{time_column}' not found in DataFrame")
|
|
110
|
+
|
|
111
|
+
# Validate resampling frequency
|
|
112
|
+
if resampling_frequency is None:
|
|
113
|
+
raise ValueError("resampling_frequency must be provided")
|
|
114
|
+
|
|
115
|
+
resampling_frequency = float(resampling_frequency)
|
|
116
|
+
|
|
117
|
+
# Auto-detect or use provided column names
|
|
118
|
+
if values_column_names is None:
|
|
119
|
+
numeric_columns = df.select_dtypes(include=[np.number]).columns
|
|
120
|
+
values_column_names = [
|
|
121
|
+
col
|
|
122
|
+
for col in numeric_columns
|
|
123
|
+
if col != time_column and col != "data_segment_nr"
|
|
124
|
+
]
|
|
125
|
+
if not values_column_names:
|
|
126
|
+
raise ValueError("No numeric columns found for resampling")
|
|
127
|
+
if verbose >= 2:
|
|
128
|
+
print(f"Auto-detected {len(values_column_names)} columns for resampling")
|
|
129
|
+
|
|
130
|
+
# Auto-detect or use provided sampling frequency
|
|
62
131
|
time_abs_array = np.array(df[time_column])
|
|
63
|
-
|
|
132
|
+
if sampling_frequency is None:
|
|
133
|
+
time_diff = df[time_column].diff().dropna()
|
|
134
|
+
current_dt = time_diff.median()
|
|
135
|
+
sampling_frequency = 1.0 / current_dt
|
|
136
|
+
if verbose >= 2:
|
|
137
|
+
print(f"Auto-detected sampling frequency: {sampling_frequency:.2f} Hz")
|
|
138
|
+
else:
|
|
139
|
+
sampling_frequency = float(sampling_frequency)
|
|
64
140
|
|
|
65
|
-
# Ensure
|
|
141
|
+
# Ensure time array is strictly increasing
|
|
66
142
|
if not np.all(np.diff(time_abs_array) > 0):
|
|
67
143
|
raise ValueError("Time array is not strictly increasing")
|
|
68
144
|
|
|
69
|
-
#
|
|
145
|
+
# Set default tolerance if not provided
|
|
146
|
+
if tolerance is None:
|
|
147
|
+
tolerance = IMUConfig().tolerance
|
|
148
|
+
|
|
149
|
+
# Set default segmentation parameters
|
|
150
|
+
if auto_segment:
|
|
151
|
+
if max_segment_gap_s is None:
|
|
152
|
+
max_segment_gap_s = 1.5 # IMUConfig default
|
|
153
|
+
if min_segment_length_s is None:
|
|
154
|
+
min_segment_length_s = 1.5 # IMUConfig default
|
|
155
|
+
|
|
156
|
+
# Check contiguity
|
|
70
157
|
expected_interval = 1 / sampling_frequency
|
|
71
158
|
timestamp_diffs = np.diff(time_abs_array)
|
|
72
|
-
|
|
73
|
-
|
|
159
|
+
is_contiguous = not np.any(np.abs(timestamp_diffs - expected_interval) > tolerance)
|
|
160
|
+
|
|
161
|
+
if not is_contiguous:
|
|
162
|
+
if validate_contiguous and not auto_segment:
|
|
163
|
+
raise ValueError(
|
|
164
|
+
"Time array is not contiguous. Consider enabling automatic "
|
|
165
|
+
"segmentation to split and process non-contiguous segments, or "
|
|
166
|
+
"disable contiguity validation to interpolate over gaps."
|
|
167
|
+
)
|
|
168
|
+
elif auto_segment:
|
|
169
|
+
# Split into segments
|
|
170
|
+
if verbose >= 1:
|
|
171
|
+
print("Non-contiguous data detected. Auto-segmenting...")
|
|
172
|
+
|
|
173
|
+
# Create segments based on gaps
|
|
174
|
+
segment_array = create_segments(
|
|
175
|
+
time_array=time_abs_array,
|
|
176
|
+
max_segment_gap_s=max_segment_gap_s,
|
|
177
|
+
)
|
|
178
|
+
df["data_segment_nr"] = segment_array
|
|
179
|
+
|
|
180
|
+
# Discard segments that are too short
|
|
181
|
+
df = discard_segments(
|
|
182
|
+
df=df,
|
|
183
|
+
segment_nr_colname="data_segment_nr",
|
|
184
|
+
min_segment_length_s=min_segment_length_s,
|
|
185
|
+
fs=int(sampling_frequency),
|
|
186
|
+
format="timestamps",
|
|
187
|
+
)
|
|
74
188
|
|
|
75
|
-
|
|
189
|
+
n_segments = df["data_segment_nr"].nunique()
|
|
190
|
+
if verbose >= 1:
|
|
191
|
+
segment_durations = []
|
|
192
|
+
for seg_nr in df["data_segment_nr"].unique():
|
|
193
|
+
seg_df = df[df["data_segment_nr"] == seg_nr]
|
|
194
|
+
duration = (
|
|
195
|
+
seg_df[time_column].iloc[-1] - seg_df[time_column].iloc[0]
|
|
196
|
+
)
|
|
197
|
+
segment_durations.append(f"{duration:.1f}s")
|
|
198
|
+
print(f"Created {n_segments} segments: {', '.join(segment_durations)}")
|
|
199
|
+
|
|
200
|
+
# Resample each segment independently
|
|
201
|
+
resampled_segments = []
|
|
202
|
+
for seg_nr in df["data_segment_nr"].unique():
|
|
203
|
+
seg_df = df[df["data_segment_nr"] == seg_nr].copy()
|
|
204
|
+
seg_time = np.array(seg_df[time_column])
|
|
205
|
+
seg_values = np.array(seg_df[values_column_names])
|
|
206
|
+
|
|
207
|
+
# Resample this segment
|
|
208
|
+
duration = seg_time[-1] - seg_time[0]
|
|
209
|
+
n_samples = int(np.round(duration * resampling_frequency)) + 1
|
|
210
|
+
t_resampled = np.linspace(seg_time[0], seg_time[-1], n_samples)
|
|
211
|
+
|
|
212
|
+
interpolation_kind = "cubic" if len(seg_time) > 3 else "linear"
|
|
213
|
+
interpolator = interp1d(
|
|
214
|
+
seg_time,
|
|
215
|
+
seg_values,
|
|
216
|
+
axis=0,
|
|
217
|
+
kind=interpolation_kind,
|
|
218
|
+
fill_value="extrapolate",
|
|
219
|
+
)
|
|
220
|
+
resampled_values = interpolator(t_resampled)
|
|
221
|
+
|
|
222
|
+
# Create resampled segment DataFrame
|
|
223
|
+
df_seg_resampled = pd.DataFrame(
|
|
224
|
+
resampled_values, columns=values_column_names
|
|
225
|
+
)
|
|
226
|
+
df_seg_resampled[time_column] = t_resampled
|
|
227
|
+
df_seg_resampled["data_segment_nr"] = seg_nr
|
|
228
|
+
|
|
229
|
+
# Copy non-numeric columns from first row of segment
|
|
230
|
+
for column in seg_df.columns:
|
|
231
|
+
if (
|
|
232
|
+
column not in df_seg_resampled.columns
|
|
233
|
+
and column != "data_segment_nr"
|
|
234
|
+
):
|
|
235
|
+
df_seg_resampled[column] = seg_df[column].iloc[0]
|
|
236
|
+
|
|
237
|
+
resampled_segments.append(df_seg_resampled)
|
|
238
|
+
|
|
239
|
+
# Concatenate all segments
|
|
240
|
+
df_resampled = pd.concat(resampled_segments, ignore_index=True)
|
|
241
|
+
|
|
242
|
+
# Ensure correct column order
|
|
243
|
+
resampled_columns = (
|
|
244
|
+
[time_column] + values_column_names + ["data_segment_nr"]
|
|
245
|
+
)
|
|
246
|
+
other_cols = [
|
|
247
|
+
col for col in df_resampled.columns if col not in resampled_columns
|
|
248
|
+
]
|
|
249
|
+
df_resampled = df_resampled[resampled_columns + other_cols]
|
|
250
|
+
|
|
251
|
+
if verbose >= 1:
|
|
252
|
+
print(
|
|
253
|
+
f"Resampled: {len(df)} -> {len(df_resampled)} rows at "
|
|
254
|
+
f"{resampling_frequency} Hz"
|
|
255
|
+
)
|
|
256
|
+
|
|
257
|
+
return df_resampled
|
|
258
|
+
|
|
259
|
+
elif verbose >= 2:
|
|
260
|
+
print(
|
|
261
|
+
"Warning: Data is not contiguous but validation is disabled. "
|
|
262
|
+
"Interpolating over gaps."
|
|
263
|
+
)
|
|
264
|
+
|
|
265
|
+
# Standard resampling for contiguous data (or when validation is disabled)
|
|
266
|
+
values_array = np.array(df[values_column_names])
|
|
267
|
+
|
|
268
|
+
# Resample the time data
|
|
76
269
|
t_resampled = np.arange(
|
|
77
270
|
time_abs_array[0], time_abs_array[-1], 1 / resampling_frequency
|
|
78
271
|
)
|
|
@@ -90,18 +283,27 @@ def resample_data(
|
|
|
90
283
|
# Interpolate
|
|
91
284
|
resampled_values = interpolator(t_resampled)
|
|
92
285
|
|
|
93
|
-
# Create
|
|
286
|
+
# Create resampled DataFrame
|
|
94
287
|
df_resampled = pd.DataFrame(resampled_values, columns=values_column_names)
|
|
95
288
|
df_resampled[time_column] = t_resampled
|
|
96
289
|
|
|
97
|
-
# Return
|
|
98
|
-
|
|
290
|
+
# Return with correct column order
|
|
291
|
+
resampled_columns = [time_column] + values_column_names
|
|
292
|
+
df_resampled = df_resampled[resampled_columns]
|
|
293
|
+
|
|
294
|
+
if verbose >= 1:
|
|
295
|
+
print(
|
|
296
|
+
f"Resampled: {len(df)} -> {len(df_resampled)} rows at "
|
|
297
|
+
f"{resampling_frequency} Hz"
|
|
298
|
+
)
|
|
299
|
+
|
|
300
|
+
return df_resampled
|
|
99
301
|
|
|
100
302
|
|
|
101
303
|
def butterworth_filter(
|
|
102
304
|
data: np.ndarray,
|
|
103
305
|
order: int,
|
|
104
|
-
cutoff_frequency:
|
|
306
|
+
cutoff_frequency: float | list[float],
|
|
105
307
|
passband: str,
|
|
106
308
|
sampling_frequency: int,
|
|
107
309
|
):
|
|
@@ -119,17 +321,19 @@ def butterworth_filter(
|
|
|
119
321
|
(e.g., multi-axis sensor data).
|
|
120
322
|
order : int
|
|
121
323
|
The order of the Butterworth filter. Higher values result in a steeper roll-off.
|
|
122
|
-
cutoff_frequency : float or
|
|
123
|
-
The cutoff frequency (or frequencies) for the filter. For a low-pass
|
|
124
|
-
this is a single float. For a band-pass filter,
|
|
125
|
-
specifying the lower and upper
|
|
324
|
+
cutoff_frequency : float or list of float
|
|
325
|
+
The cutoff frequency (or frequencies) for the filter. For a low-pass
|
|
326
|
+
or high-pass filter, this is a single float. For a band-pass filter,
|
|
327
|
+
this should be a list of two floats, specifying the lower and upper
|
|
328
|
+
cutoff frequencies.
|
|
126
329
|
passband : str
|
|
127
330
|
The type of passband to apply. Options are:
|
|
128
331
|
- 'hp' : high-pass filter
|
|
129
332
|
- 'lp' : low-pass filter
|
|
130
333
|
- 'band' : band-pass filter
|
|
131
334
|
sampling_frequency : int
|
|
132
|
-
The sampling frequency of the data in Hz. This is used to normalize
|
|
335
|
+
The sampling frequency of the data in Hz. This is used to normalize
|
|
336
|
+
the cutoff frequency.
|
|
133
337
|
|
|
134
338
|
Returns
|
|
135
339
|
-------
|
|
@@ -139,12 +343,14 @@ def butterworth_filter(
|
|
|
139
343
|
Raises
|
|
140
344
|
------
|
|
141
345
|
ValueError
|
|
142
|
-
If the input data has more than two dimensions, or if an invalid
|
|
346
|
+
If the input data has more than two dimensions, or if an invalid
|
|
347
|
+
passband is specified.
|
|
143
348
|
|
|
144
349
|
Notes
|
|
145
350
|
-----
|
|
146
|
-
The function uses `scipy.signal.butter` to design the filter and
|
|
147
|
-
to apply it using second-order sections (SOS)
|
|
351
|
+
The function uses `scipy.signal.butter` to design the filter and
|
|
352
|
+
`scipy.signal.sosfiltfilt` to apply it using second-order sections (SOS)
|
|
353
|
+
to improve numerical stability.
|
|
148
354
|
"""
|
|
149
355
|
# Design the filter using second-order sections (SOS)
|
|
150
356
|
sos = signal.butter(
|
|
@@ -166,7 +372,11 @@ def butterworth_filter(
|
|
|
166
372
|
|
|
167
373
|
|
|
168
374
|
def preprocess_imu_data(
|
|
169
|
-
df: pd.DataFrame,
|
|
375
|
+
df: pd.DataFrame,
|
|
376
|
+
config: IMUConfig,
|
|
377
|
+
sensor: str,
|
|
378
|
+
watch_side: str,
|
|
379
|
+
verbose: int = 1,
|
|
170
380
|
) -> pd.DataFrame:
|
|
171
381
|
"""
|
|
172
382
|
Preprocesses IMU data by resampling and applying filters.
|
|
@@ -176,8 +386,9 @@ def preprocess_imu_data(
|
|
|
176
386
|
df : pd.DataFrame
|
|
177
387
|
The DataFrame containing raw accelerometer and/or gyroscope data.
|
|
178
388
|
config : IMUConfig
|
|
179
|
-
Configuration object containing various settings, such as time column
|
|
180
|
-
filter settings, and
|
|
389
|
+
Configuration object containing various settings, such as time column
|
|
390
|
+
name, accelerometer and/or gyroscope columns, filter settings, and
|
|
391
|
+
sampling frequency.
|
|
181
392
|
sensor: str
|
|
182
393
|
Name of the sensor data to be preprocessed. Must be one of:
|
|
183
394
|
- "accelerometer": Preprocess accelerometer data only.
|
|
@@ -187,18 +398,25 @@ def preprocess_imu_data(
|
|
|
187
398
|
The side of the watch where the data was collected. Must be one of:
|
|
188
399
|
- "left": Data was collected from the left wrist.
|
|
189
400
|
- "right": Data was collected from the right wrist.
|
|
401
|
+
verbose : int, default 1
|
|
402
|
+
Logging verbose level: 0=errors only, 1=basic info, 2+=detailed info.
|
|
190
403
|
|
|
191
404
|
Returns
|
|
192
405
|
-------
|
|
193
406
|
pd.DataFrame
|
|
194
|
-
The preprocessed accelerometer and or gyroscope data with the
|
|
407
|
+
The preprocessed accelerometer and/or gyroscope data with the
|
|
408
|
+
following transformations:
|
|
195
409
|
- Resampled data at the specified frequency.
|
|
196
|
-
- Filtered accelerometer data with high-pass and low-pass filtering
|
|
410
|
+
- Filtered accelerometer data with high-pass and low-pass filtering
|
|
411
|
+
applied.
|
|
197
412
|
|
|
198
413
|
Notes
|
|
199
414
|
-----
|
|
200
|
-
- The function applies Butterworth filters to accelerometer data, both
|
|
415
|
+
- The function applies Butterworth filters to accelerometer data, both
|
|
416
|
+
high-pass and low-pass.
|
|
201
417
|
"""
|
|
418
|
+
# Make a copy to avoid SettingWithCopyWarning
|
|
419
|
+
df = df.copy()
|
|
202
420
|
|
|
203
421
|
# Extract sensor column
|
|
204
422
|
if sensor == "accelerometer":
|
|
@@ -210,15 +428,35 @@ def preprocess_imu_data(
|
|
|
210
428
|
else:
|
|
211
429
|
raise ("Sensor should be either accelerometer, gyroscope, or both")
|
|
212
430
|
|
|
213
|
-
#
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
431
|
+
# Check if data needs resampling
|
|
432
|
+
# Skip resampling if already at target frequency or if data has been pre-segmented
|
|
433
|
+
needs_resampling = True
|
|
434
|
+
validate_contiguous = True
|
|
435
|
+
|
|
436
|
+
if "data_segment_nr" in df.columns:
|
|
437
|
+
# Data has been pre-segmented, skip contiguity validation
|
|
438
|
+
validate_contiguous = False
|
|
439
|
+
|
|
440
|
+
# Check current sampling frequency
|
|
441
|
+
time_diff = df[config.time_colname].diff().dropna()
|
|
442
|
+
current_dt = time_diff.median()
|
|
443
|
+
current_frequency = 1.0 / current_dt
|
|
444
|
+
|
|
445
|
+
if abs(current_frequency - config.resampling_frequency) < 0.1:
|
|
446
|
+
needs_resampling = False
|
|
447
|
+
|
|
448
|
+
if needs_resampling:
|
|
449
|
+
# Resample the data to the specified frequency
|
|
450
|
+
df = resample_data(
|
|
451
|
+
df=df,
|
|
452
|
+
time_column=config.time_colname,
|
|
453
|
+
values_column_names=values_colnames,
|
|
454
|
+
sampling_frequency=config.sampling_frequency,
|
|
455
|
+
resampling_frequency=config.resampling_frequency,
|
|
456
|
+
tolerance=config.tolerance,
|
|
457
|
+
validate_contiguous=validate_contiguous,
|
|
458
|
+
verbose=verbose,
|
|
459
|
+
)
|
|
222
460
|
|
|
223
461
|
# Invert the IMU data if the watch was worn on the right wrist
|
|
224
462
|
df = invert_watch_side(df, watch_side, sensor)
|
|
@@ -269,28 +507,38 @@ def preprocess_ppg_data(
|
|
|
269
507
|
df_acc: pd.DataFrame | None = None,
|
|
270
508
|
imu_config: IMUConfig | None = None,
|
|
271
509
|
start_time_imu: str | None = None,
|
|
272
|
-
|
|
510
|
+
verbose: int = 1,
|
|
511
|
+
) -> tuple[pd.DataFrame, pd.DataFrame | None]:
|
|
273
512
|
"""
|
|
274
|
-
This function preprocesses PPG and accelerometer data by resampling,
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
513
|
+
This function preprocesses PPG and accelerometer data by resampling,
|
|
514
|
+
filtering and aligning the data segments of both sensors (if applicable).
|
|
515
|
+
Aligning is done using the extract_overlapping_segments function which is
|
|
516
|
+
based on the provided start times of the PPG and IMU data and returns
|
|
517
|
+
only the data points where both signals overlap in time. The remaining
|
|
518
|
+
data points are discarded.
|
|
519
|
+
After alignment, the function resamples the data to the specified
|
|
520
|
+
frequency and applies Butterworth filters to both PPG and accelerometer
|
|
521
|
+
data (if applicable).
|
|
522
|
+
The output is two DataFrames: one for the preprocessed PPG data and
|
|
523
|
+
another for the preprocessed accelerometer data (if provided, otherwise
|
|
524
|
+
return is None).
|
|
279
525
|
|
|
280
526
|
Parameters
|
|
281
527
|
----------
|
|
282
528
|
df_ppg : pd.DataFrame
|
|
283
529
|
DataFrame containing PPG data.
|
|
284
|
-
df_acc : pd.DataFrame
|
|
285
|
-
DataFrame containing accelerometer from IMU data.
|
|
286
530
|
ppg_config : PPGConfig
|
|
287
531
|
Configuration object for PPG preprocessing.
|
|
288
|
-
imu_config : IMUPreprocessingConfig
|
|
289
|
-
Configuration object for IMU preprocessing.
|
|
290
532
|
start_time_ppg : str
|
|
291
533
|
iso8601 formatted start time of the PPG data.
|
|
534
|
+
df_acc : pd.DataFrame, optional
|
|
535
|
+
DataFrame containing accelerometer from IMU data.
|
|
536
|
+
imu_config : IMUConfig, optional
|
|
537
|
+
Configuration object for IMU preprocessing.
|
|
292
538
|
start_time_imu : str
|
|
293
539
|
iso8601 formatted start time of the IMU data.
|
|
540
|
+
verbose : int, default 1
|
|
541
|
+
Logging verbose level: 0=errors only, 1=basic info, 2+=detailed info.
|
|
294
542
|
|
|
295
543
|
Returns
|
|
296
544
|
-------
|
|
@@ -299,16 +547,25 @@ def preprocess_ppg_data(
|
|
|
299
547
|
- Preprocessed PPG data with the following transformations:
|
|
300
548
|
- Resampled data at the specified frequency.
|
|
301
549
|
- Filtered PPG data with bandpass filtering applied.
|
|
302
|
-
- Preprocessed accelerometer data (if provided, otherwise return is
|
|
550
|
+
- Preprocessed accelerometer data (if provided, otherwise return is
|
|
551
|
+
None) with the following transformations:
|
|
303
552
|
- Resampled data at the specified frequency.
|
|
304
|
-
- Filtered accelerometer data with high-pass and low-pass
|
|
553
|
+
- Filtered accelerometer data with high-pass and low-pass
|
|
554
|
+
filtering applied.
|
|
305
555
|
|
|
306
556
|
Notes
|
|
307
557
|
-----
|
|
308
|
-
- If accelerometer data or IMU configuration is not provided, the
|
|
309
|
-
|
|
558
|
+
- If accelerometer data or IMU configuration is not provided, the
|
|
559
|
+
function only preprocesses PPG data.
|
|
560
|
+
- The function applies Butterworth filters to PPG and accelerometer
|
|
561
|
+
(if applicable) data, both high-pass and low-pass.
|
|
310
562
|
|
|
311
563
|
"""
|
|
564
|
+
# Make copies to avoid SettingWithCopyWarning
|
|
565
|
+
df_ppg = df_ppg.copy()
|
|
566
|
+
if df_acc is not None:
|
|
567
|
+
df_acc = df_acc.copy()
|
|
568
|
+
|
|
312
569
|
if df_acc is not None and imu_config is not None:
|
|
313
570
|
# Extract overlapping segments
|
|
314
571
|
df_ppg_overlapping, df_acc_overlapping = extract_overlapping_segments(
|
|
@@ -321,6 +578,8 @@ def preprocess_ppg_data(
|
|
|
321
578
|
)
|
|
322
579
|
|
|
323
580
|
# Resample accelerometer data
|
|
581
|
+
# Skip contiguity validation if data has been pre-segmented
|
|
582
|
+
validate_contiguous_acc = "data_segment_nr" not in df_acc_overlapping.columns
|
|
324
583
|
df_acc_proc = resample_data(
|
|
325
584
|
df=df_acc_overlapping,
|
|
326
585
|
time_column=imu_config.time_colname,
|
|
@@ -328,6 +587,8 @@ def preprocess_ppg_data(
|
|
|
328
587
|
sampling_frequency=imu_config.sampling_frequency,
|
|
329
588
|
resampling_frequency=imu_config.resampling_frequency,
|
|
330
589
|
tolerance=imu_config.tolerance,
|
|
590
|
+
validate_contiguous=validate_contiguous_acc,
|
|
591
|
+
verbose=verbose,
|
|
331
592
|
)
|
|
332
593
|
|
|
333
594
|
# Extract accelerometer data for filtering
|
|
@@ -358,6 +619,8 @@ def preprocess_ppg_data(
|
|
|
358
619
|
df_ppg_overlapping = df_ppg
|
|
359
620
|
|
|
360
621
|
# Resample PPG data
|
|
622
|
+
# Skip contiguity validation if data has been pre-segmented
|
|
623
|
+
validate_contiguous_ppg = "data_segment_nr" not in df_ppg_overlapping.columns
|
|
361
624
|
df_ppg_proc = resample_data(
|
|
362
625
|
df=df_ppg_overlapping,
|
|
363
626
|
time_column=ppg_config.time_colname,
|
|
@@ -365,6 +628,8 @@ def preprocess_ppg_data(
|
|
|
365
628
|
sampling_frequency=ppg_config.sampling_frequency,
|
|
366
629
|
resampling_frequency=ppg_config.resampling_frequency,
|
|
367
630
|
tolerance=ppg_config.tolerance,
|
|
631
|
+
validate_contiguous=validate_contiguous_ppg,
|
|
632
|
+
verbose=verbose,
|
|
368
633
|
)
|
|
369
634
|
|
|
370
635
|
# Extract accelerometer data for filtering
|
|
@@ -404,9 +669,10 @@ def extract_overlapping_segments(
|
|
|
404
669
|
time_colname_imu: str,
|
|
405
670
|
start_time_ppg: str,
|
|
406
671
|
start_time_acc: str,
|
|
407
|
-
) ->
|
|
672
|
+
) -> tuple[pd.DataFrame, pd.DataFrame]:
|
|
408
673
|
"""
|
|
409
|
-
Extract DataFrames with overlapping data segments between accelerometer
|
|
674
|
+
Extract DataFrames with overlapping data segments between accelerometer
|
|
675
|
+
(from the IMU) and PPG datasets based on their timestamps.
|
|
410
676
|
|
|
411
677
|
Parameters
|
|
412
678
|
----------
|
|
@@ -426,7 +692,8 @@ def extract_overlapping_segments(
|
|
|
426
692
|
Returns
|
|
427
693
|
-------
|
|
428
694
|
Tuple[pd.DataFrame, pd.DataFrame]
|
|
429
|
-
DataFrames containing the overlapping segments (time and values) of
|
|
695
|
+
DataFrames containing the overlapping segments (time and values) of
|
|
696
|
+
PPG and accelerometer data.
|
|
430
697
|
"""
|
|
431
698
|
# Convert start times to Unix timestamps
|
|
432
699
|
datetime_ppg_start = datetime.fromisoformat(start_time_ppg.replace("Z", "+00:00"))
|
|
@@ -434,7 +701,8 @@ def extract_overlapping_segments(
|
|
|
434
701
|
datetime_acc_start = datetime.fromisoformat(start_time_acc.replace("Z", "+00:00"))
|
|
435
702
|
start_acc_ppg = int(datetime_acc_start.timestamp())
|
|
436
703
|
|
|
437
|
-
# Calculate the time in Unix timestamps for each dataset because the
|
|
704
|
+
# Calculate the time in Unix timestamps for each dataset because the
|
|
705
|
+
# timestamps are relative to the start time
|
|
438
706
|
ppg_time = df_ppg[time_colname_ppg] + start_unix_ppg
|
|
439
707
|
acc_time = df_acc[time_colname_imu] + start_acc_ppg
|
|
440
708
|
|