paradigma 1.0.3__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- paradigma/__init__.py +10 -1
- paradigma/classification.py +38 -21
- paradigma/config.py +187 -123
- paradigma/constants.py +48 -35
- paradigma/feature_extraction.py +345 -255
- paradigma/load.py +476 -0
- paradigma/orchestrator.py +670 -0
- paradigma/pipelines/gait_pipeline.py +685 -246
- paradigma/pipelines/pulse_rate_pipeline.py +456 -155
- paradigma/pipelines/pulse_rate_utils.py +289 -248
- paradigma/pipelines/tremor_pipeline.py +405 -132
- paradigma/prepare_data.py +409 -0
- paradigma/preprocessing.py +500 -163
- paradigma/segmenting.py +180 -140
- paradigma/testing.py +370 -178
- paradigma/util.py +190 -101
- paradigma-1.1.0.dist-info/METADATA +229 -0
- paradigma-1.1.0.dist-info/RECORD +26 -0
- {paradigma-1.0.3.dist-info → paradigma-1.1.0.dist-info}/WHEEL +1 -1
- paradigma-1.1.0.dist-info/entry_points.txt +4 -0
- {paradigma-1.0.3.dist-info → paradigma-1.1.0.dist-info/licenses}/LICENSE +0 -1
- paradigma-1.0.3.dist-info/METADATA +0 -138
- paradigma-1.0.3.dist-info/RECORD +0 -22
paradigma/preprocessing.py
CHANGED
|
@@ -1,130 +1,339 @@
|
|
|
1
|
-
import
|
|
1
|
+
from datetime import datetime
|
|
2
|
+
|
|
2
3
|
import numpy as np
|
|
3
4
|
import pandas as pd
|
|
4
|
-
import tsdf
|
|
5
|
-
from pathlib import Path
|
|
6
5
|
from scipy import signal
|
|
7
6
|
from scipy.interpolate import interp1d
|
|
8
|
-
from typing import List, Tuple, Union
|
|
9
|
-
from datetime import datetime
|
|
10
7
|
|
|
11
|
-
from paradigma.
|
|
12
|
-
from paradigma.
|
|
13
|
-
from paradigma.util import
|
|
8
|
+
from paradigma.config import IMUConfig, PPGConfig
|
|
9
|
+
from paradigma.segmenting import create_segments, discard_segments
|
|
10
|
+
from paradigma.util import invert_watch_side
|
|
14
11
|
|
|
15
12
|
|
|
16
13
|
def resample_data(
|
|
17
14
|
df: pd.DataFrame,
|
|
18
|
-
time_column
|
|
19
|
-
values_column_names:
|
|
20
|
-
sampling_frequency: int,
|
|
21
|
-
resampling_frequency: int,
|
|
22
|
-
tolerance: float | None = None
|
|
15
|
+
time_column: str = "time",
|
|
16
|
+
values_column_names: list[str] | None = None,
|
|
17
|
+
sampling_frequency: int | None = None,
|
|
18
|
+
resampling_frequency: int | None = None,
|
|
19
|
+
tolerance: float | None = None,
|
|
20
|
+
validate_contiguous: bool = True,
|
|
21
|
+
auto_segment: bool = False,
|
|
22
|
+
max_segment_gap_s: float | None = None,
|
|
23
|
+
min_segment_length_s: float | None = None,
|
|
24
|
+
verbose: int = 2,
|
|
23
25
|
) -> pd.DataFrame:
|
|
24
26
|
"""
|
|
25
|
-
|
|
27
|
+
Unified resampling function with optional auto-segmentation for non-contiguous data.
|
|
28
|
+
|
|
29
|
+
This function supports:
|
|
30
|
+
- Automatic frequency detection or explicit specification
|
|
31
|
+
- Contiguity validation with configurable tolerance
|
|
32
|
+
- Automatic segmentation of non-contiguous data
|
|
33
|
+
- Preservation of non-numeric columns
|
|
26
34
|
|
|
27
35
|
Parameters
|
|
28
36
|
----------
|
|
29
37
|
df : pd.DataFrame
|
|
30
38
|
The input DataFrame containing the sensor data.
|
|
31
|
-
time_column : str
|
|
39
|
+
time_column : str, default 'time'
|
|
32
40
|
The name of the column containing the time data.
|
|
33
|
-
values_column_names : List[str]
|
|
34
|
-
|
|
35
|
-
sampling_frequency : int
|
|
36
|
-
|
|
37
|
-
resampling_frequency : int
|
|
38
|
-
|
|
41
|
+
values_column_names : List[str], optional
|
|
42
|
+
Column names to resample. If None, auto-detects all numeric columns except time.
|
|
43
|
+
sampling_frequency : int, optional
|
|
44
|
+
Original sampling frequency (Hz). If None, auto-detected from data.
|
|
45
|
+
resampling_frequency : int, optional
|
|
46
|
+
Target sampling frequency in Hz. Required in practice: a ValueError is raised if None.
|
|
39
47
|
tolerance : float, optional
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
48
|
+
Tolerance for contiguity checking (seconds). Defaults to IMUConfig tolerance.
|
|
49
|
+
validate_contiguous : bool, default True
|
|
50
|
+
Whether to validate data contiguity. If False, gaps are silently interpolated.
|
|
51
|
+
auto_segment : bool, default False
|
|
52
|
+
If True, automatically split non-contiguous data into segments and
|
|
53
|
+
process each. Adds 'data_segment_nr' column to output. If False and
|
|
54
|
+
data is non-contiguous with validate_contiguous=True, raises
|
|
55
|
+
ValueError.
|
|
56
|
+
max_segment_gap_s : float, optional
|
|
57
|
+
Maximum gap (seconds) before starting new segment. Used when auto_segment=True.
|
|
58
|
+
Defaults to IMUConfig.max_segment_gap_s (1.5s).
|
|
59
|
+
min_segment_length_s : float, optional
|
|
60
|
+
Minimum segment length (seconds) to keep. Used when auto_segment=True.
|
|
61
|
+
Defaults to IMUConfig.min_segment_length_s (1.5s).
|
|
62
|
+
verbose : int, default 2
|
|
63
|
+
Logging verbosity level: 0=errors only, 1=basic info, 2+=detailed info.
|
|
64
|
+
Note: This function still uses verbose for backward compatibility
|
|
65
|
+
with existing code that calls it directly.
|
|
43
66
|
|
|
44
67
|
Returns
|
|
45
68
|
-------
|
|
46
69
|
pd.DataFrame
|
|
47
|
-
|
|
48
|
-
|
|
70
|
+
Resampled DataFrame. If auto_segment=True and multiple segments found,
|
|
71
|
+
includes 'data_segment_nr' column identifying each contiguous data segment.
|
|
49
72
|
|
|
50
73
|
Raises
|
|
51
74
|
------
|
|
52
75
|
ValueError
|
|
53
|
-
If
|
|
76
|
+
- If time array is not strictly increasing
|
|
77
|
+
- If time array is not contiguous and validate_contiguous=True
|
|
78
|
+
and auto_segment=False
|
|
79
|
+
- If no numeric columns found for resampling
|
|
80
|
+
- If all segments are discarded due to min_segment_length_s
|
|
54
81
|
|
|
55
82
|
Notes
|
|
56
83
|
-----
|
|
57
|
-
- Uses cubic interpolation for smooth resampling if there are enough points
|
|
58
|
-
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
84
|
+
- Uses cubic interpolation for smooth resampling if there are enough points
|
|
85
|
+
- Falls back to linear interpolation if only 2-3 points available
|
|
86
|
+
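- When auto-segmenting, columns that are not resampled are preserved (first value of each segment copied to all its rows); otherwise only the time and resampled columns are returned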
|
|
87
|
+
- Backwards compatible with both old resample_data signatures
|
|
88
|
+
|
|
89
|
+
Examples
|
|
90
|
+
--------
|
|
91
|
+
# Auto-detection mode
|
|
92
|
+
df_resampled = resample_data(df, resampling_frequency=100)
|
|
93
|
+
|
|
94
|
+
# Explicit mode
|
|
95
|
+
df_resampled = resample_data(
|
|
96
|
+
df, time_column='time', values_column_names=['acc_x', 'acc_y'],
|
|
97
|
+
sampling_frequency=128, resampling_frequency=100
|
|
98
|
+
)
|
|
63
99
|
|
|
64
|
-
#
|
|
100
|
+
# Auto-segmentation mode
|
|
101
|
+
df_segmented = resample_data(
|
|
102
|
+
df, resampling_frequency=100, auto_segment=True,
|
|
103
|
+
max_segment_gap_s=2.0, min_segment_length_s=3.0
|
|
104
|
+
)
|
|
105
|
+
"""
|
|
106
|
+
df = df.copy()
|
|
107
|
+
|
|
108
|
+
if time_column not in df.columns:
|
|
109
|
+
raise ValueError(f"Time column '{time_column}' not found in DataFrame")
|
|
110
|
+
|
|
111
|
+
# Validate resampling frequency
|
|
112
|
+
if resampling_frequency is None:
|
|
113
|
+
raise ValueError("resampling_frequency must be provided")
|
|
114
|
+
|
|
115
|
+
resampling_frequency = float(resampling_frequency)
|
|
116
|
+
|
|
117
|
+
# Auto-detect or use provided column names
|
|
118
|
+
if values_column_names is None:
|
|
119
|
+
numeric_columns = df.select_dtypes(include=[np.number]).columns
|
|
120
|
+
values_column_names = [
|
|
121
|
+
col
|
|
122
|
+
for col in numeric_columns
|
|
123
|
+
if col != time_column and col != "data_segment_nr"
|
|
124
|
+
]
|
|
125
|
+
if not values_column_names:
|
|
126
|
+
raise ValueError("No numeric columns found for resampling")
|
|
127
|
+
if verbose >= 2:
|
|
128
|
+
print(f"Auto-detected {len(values_column_names)} columns for resampling")
|
|
129
|
+
|
|
130
|
+
# Auto-detect or use provided sampling frequency
|
|
65
131
|
time_abs_array = np.array(df[time_column])
|
|
66
|
-
|
|
132
|
+
if sampling_frequency is None:
|
|
133
|
+
time_diff = df[time_column].diff().dropna()
|
|
134
|
+
current_dt = time_diff.median()
|
|
135
|
+
sampling_frequency = 1.0 / current_dt
|
|
136
|
+
if verbose >= 2:
|
|
137
|
+
print(f"Auto-detected sampling frequency: {sampling_frequency:.2f} Hz")
|
|
138
|
+
else:
|
|
139
|
+
sampling_frequency = float(sampling_frequency)
|
|
67
140
|
|
|
68
|
-
# Ensure
|
|
141
|
+
# Ensure time array is strictly increasing
|
|
69
142
|
if not np.all(np.diff(time_abs_array) > 0):
|
|
70
143
|
raise ValueError("Time array is not strictly increasing")
|
|
71
|
-
|
|
72
|
-
#
|
|
144
|
+
|
|
145
|
+
# Set default tolerance if not provided
|
|
146
|
+
if tolerance is None:
|
|
147
|
+
tolerance = IMUConfig().tolerance
|
|
148
|
+
|
|
149
|
+
# Set default segmentation parameters
|
|
150
|
+
if auto_segment:
|
|
151
|
+
if max_segment_gap_s is None:
|
|
152
|
+
max_segment_gap_s = 1.5 # IMUConfig default
|
|
153
|
+
if min_segment_length_s is None:
|
|
154
|
+
min_segment_length_s = 1.5 # IMUConfig default
|
|
155
|
+
|
|
156
|
+
# Check contiguity
|
|
73
157
|
expected_interval = 1 / sampling_frequency
|
|
74
158
|
timestamp_diffs = np.diff(time_abs_array)
|
|
75
|
-
|
|
76
|
-
|
|
159
|
+
is_contiguous = not np.any(np.abs(timestamp_diffs - expected_interval) > tolerance)
|
|
160
|
+
|
|
161
|
+
if not is_contiguous:
|
|
162
|
+
if validate_contiguous and not auto_segment:
|
|
163
|
+
raise ValueError(
|
|
164
|
+
"Time array is not contiguous. Consider enabling automatic "
|
|
165
|
+
"segmentation to split and process non-contiguous segments, or "
|
|
166
|
+
"disable contiguity validation to interpolate over gaps."
|
|
167
|
+
)
|
|
168
|
+
elif auto_segment:
|
|
169
|
+
# Split into segments
|
|
170
|
+
if verbose >= 1:
|
|
171
|
+
print("Non-contiguous data detected. Auto-segmenting...")
|
|
172
|
+
|
|
173
|
+
# Create segments based on gaps
|
|
174
|
+
segment_array = create_segments(
|
|
175
|
+
time_array=time_abs_array,
|
|
176
|
+
max_segment_gap_s=max_segment_gap_s,
|
|
177
|
+
)
|
|
178
|
+
df["data_segment_nr"] = segment_array
|
|
179
|
+
|
|
180
|
+
# Discard segments that are too short
|
|
181
|
+
df = discard_segments(
|
|
182
|
+
df=df,
|
|
183
|
+
segment_nr_colname="data_segment_nr",
|
|
184
|
+
min_segment_length_s=min_segment_length_s,
|
|
185
|
+
fs=int(sampling_frequency),
|
|
186
|
+
format="timestamps",
|
|
187
|
+
)
|
|
188
|
+
|
|
189
|
+
n_segments = df["data_segment_nr"].nunique()
|
|
190
|
+
if verbose >= 1:
|
|
191
|
+
segment_durations = []
|
|
192
|
+
for seg_nr in df["data_segment_nr"].unique():
|
|
193
|
+
seg_df = df[df["data_segment_nr"] == seg_nr]
|
|
194
|
+
duration = (
|
|
195
|
+
seg_df[time_column].iloc[-1] - seg_df[time_column].iloc[0]
|
|
196
|
+
)
|
|
197
|
+
segment_durations.append(f"{duration:.1f}s")
|
|
198
|
+
print(f"Created {n_segments} segments: {', '.join(segment_durations)}")
|
|
199
|
+
|
|
200
|
+
# Resample each segment independently
|
|
201
|
+
resampled_segments = []
|
|
202
|
+
for seg_nr in df["data_segment_nr"].unique():
|
|
203
|
+
seg_df = df[df["data_segment_nr"] == seg_nr].copy()
|
|
204
|
+
seg_time = np.array(seg_df[time_column])
|
|
205
|
+
seg_values = np.array(seg_df[values_column_names])
|
|
206
|
+
|
|
207
|
+
# Resample this segment
|
|
208
|
+
duration = seg_time[-1] - seg_time[0]
|
|
209
|
+
n_samples = int(np.round(duration * resampling_frequency)) + 1
|
|
210
|
+
t_resampled = np.linspace(seg_time[0], seg_time[-1], n_samples)
|
|
211
|
+
|
|
212
|
+
interpolation_kind = "cubic" if len(seg_time) > 3 else "linear"
|
|
213
|
+
interpolator = interp1d(
|
|
214
|
+
seg_time,
|
|
215
|
+
seg_values,
|
|
216
|
+
axis=0,
|
|
217
|
+
kind=interpolation_kind,
|
|
218
|
+
fill_value="extrapolate",
|
|
219
|
+
)
|
|
220
|
+
resampled_values = interpolator(t_resampled)
|
|
221
|
+
|
|
222
|
+
# Create resampled segment DataFrame
|
|
223
|
+
df_seg_resampled = pd.DataFrame(
|
|
224
|
+
resampled_values, columns=values_column_names
|
|
225
|
+
)
|
|
226
|
+
df_seg_resampled[time_column] = t_resampled
|
|
227
|
+
df_seg_resampled["data_segment_nr"] = seg_nr
|
|
228
|
+
|
|
229
|
+
# Copy non-numeric columns from first row of segment
|
|
230
|
+
for column in seg_df.columns:
|
|
231
|
+
if (
|
|
232
|
+
column not in df_seg_resampled.columns
|
|
233
|
+
and column != "data_segment_nr"
|
|
234
|
+
):
|
|
235
|
+
df_seg_resampled[column] = seg_df[column].iloc[0]
|
|
236
|
+
|
|
237
|
+
resampled_segments.append(df_seg_resampled)
|
|
238
|
+
|
|
239
|
+
# Concatenate all segments
|
|
240
|
+
df_resampled = pd.concat(resampled_segments, ignore_index=True)
|
|
241
|
+
|
|
242
|
+
# Ensure correct column order
|
|
243
|
+
resampled_columns = (
|
|
244
|
+
[time_column] + values_column_names + ["data_segment_nr"]
|
|
245
|
+
)
|
|
246
|
+
other_cols = [
|
|
247
|
+
col for col in df_resampled.columns if col not in resampled_columns
|
|
248
|
+
]
|
|
249
|
+
df_resampled = df_resampled[resampled_columns + other_cols]
|
|
250
|
+
|
|
251
|
+
if verbose >= 1:
|
|
252
|
+
print(
|
|
253
|
+
f"Resampled: {len(df)} -> {len(df_resampled)} rows at "
|
|
254
|
+
f"{resampling_frequency} Hz"
|
|
255
|
+
)
|
|
256
|
+
|
|
257
|
+
return df_resampled
|
|
258
|
+
|
|
259
|
+
elif verbose >= 2:
|
|
260
|
+
print(
|
|
261
|
+
"Warning: Data is not contiguous but validation is disabled. "
|
|
262
|
+
"Interpolating over gaps."
|
|
263
|
+
)
|
|
264
|
+
|
|
265
|
+
# Standard resampling for contiguous data (or when validation is disabled)
|
|
266
|
+
values_array = np.array(df[values_column_names])
|
|
267
|
+
|
|
268
|
+
# Resample the time data
|
|
269
|
+
t_resampled = np.arange(
|
|
270
|
+
time_abs_array[0], time_abs_array[-1], 1 / resampling_frequency
|
|
271
|
+
)
|
|
77
272
|
|
|
78
|
-
# Resample the time data using the specified frequency
|
|
79
|
-
t_resampled = np.arange(time_abs_array[0], time_abs_array[-1], 1 / resampling_frequency)
|
|
80
|
-
|
|
81
273
|
# Choose interpolation method
|
|
82
274
|
interpolation_kind = "cubic" if len(time_abs_array) > 3 else "linear"
|
|
83
|
-
interpolator = interp1d(
|
|
84
|
-
|
|
275
|
+
interpolator = interp1d(
|
|
276
|
+
time_abs_array,
|
|
277
|
+
values_array,
|
|
278
|
+
axis=0,
|
|
279
|
+
kind=interpolation_kind,
|
|
280
|
+
fill_value="extrapolate",
|
|
281
|
+
)
|
|
282
|
+
|
|
85
283
|
# Interpolate
|
|
86
284
|
resampled_values = interpolator(t_resampled)
|
|
87
285
|
|
|
88
|
-
# Create
|
|
286
|
+
# Create resampled DataFrame
|
|
89
287
|
df_resampled = pd.DataFrame(resampled_values, columns=values_column_names)
|
|
90
288
|
df_resampled[time_column] = t_resampled
|
|
91
289
|
|
|
92
|
-
# Return
|
|
93
|
-
|
|
290
|
+
# Return with correct column order
|
|
291
|
+
resampled_columns = [time_column] + values_column_names
|
|
292
|
+
df_resampled = df_resampled[resampled_columns]
|
|
293
|
+
|
|
294
|
+
if verbose >= 1:
|
|
295
|
+
print(
|
|
296
|
+
f"Resampled: {len(df)} -> {len(df_resampled)} rows at "
|
|
297
|
+
f"{resampling_frequency} Hz"
|
|
298
|
+
)
|
|
299
|
+
|
|
300
|
+
return df_resampled
|
|
94
301
|
|
|
95
302
|
|
|
96
303
|
def butterworth_filter(
|
|
97
304
|
data: np.ndarray,
|
|
98
305
|
order: int,
|
|
99
|
-
cutoff_frequency:
|
|
306
|
+
cutoff_frequency: float | list[float],
|
|
100
307
|
passband: str,
|
|
101
308
|
sampling_frequency: int,
|
|
102
309
|
):
|
|
103
310
|
"""
|
|
104
311
|
Applies a Butterworth filter to 1D or 2D sensor data.
|
|
105
312
|
|
|
106
|
-
This function applies a low-pass, high-pass, or band-pass Butterworth filter to the
|
|
107
|
-
input data. The filter is designed using the specified order, cutoff frequency,
|
|
313
|
+
This function applies a low-pass, high-pass, or band-pass Butterworth filter to the
|
|
314
|
+
input data. The filter is designed using the specified order, cutoff frequency,
|
|
108
315
|
and passband type. The function can handle both 1D and 2D data arrays.
|
|
109
316
|
|
|
110
317
|
Parameters
|
|
111
318
|
----------
|
|
112
319
|
data : np.ndarray
|
|
113
|
-
The sensor data to be filtered. Can be 1D (e.g., a single signal) or 2D
|
|
320
|
+
The sensor data to be filtered. Can be 1D (e.g., a single signal) or 2D
|
|
114
321
|
(e.g., multi-axis sensor data).
|
|
115
322
|
order : int
|
|
116
323
|
The order of the Butterworth filter. Higher values result in a steeper roll-off.
|
|
117
|
-
cutoff_frequency : float or
|
|
118
|
-
The cutoff frequency (or frequencies) for the filter. For a low-pass
|
|
119
|
-
this is a single float. For a band-pass filter,
|
|
120
|
-
specifying the lower and upper
|
|
324
|
+
cutoff_frequency : float or list of float
|
|
325
|
+
The cutoff frequency (or frequencies) for the filter. For a low-pass
|
|
326
|
+
or high-pass filter, this is a single float. For a band-pass filter,
|
|
327
|
+
this should be a list of two floats, specifying the lower and upper
|
|
328
|
+
cutoff frequencies.
|
|
121
329
|
passband : str
|
|
122
330
|
The type of passband to apply. Options are:
|
|
123
331
|
- 'hp' : high-pass filter
|
|
124
332
|
- 'lp' : low-pass filter
|
|
125
333
|
- 'band' : band-pass filter
|
|
126
334
|
sampling_frequency : int
|
|
127
|
-
The sampling frequency of the data in Hz. This is used to normalize
|
|
335
|
+
The sampling frequency of the data in Hz. This is used to normalize
|
|
336
|
+
the cutoff frequency.
|
|
128
337
|
|
|
129
338
|
Returns
|
|
130
339
|
-------
|
|
@@ -134,12 +343,14 @@ def butterworth_filter(
|
|
|
134
343
|
Raises
|
|
135
344
|
------
|
|
136
345
|
ValueError
|
|
137
|
-
If the input data has more than two dimensions, or if an invalid
|
|
346
|
+
If the input data has more than two dimensions, or if an invalid
|
|
347
|
+
passband is specified.
|
|
138
348
|
|
|
139
349
|
Notes
|
|
140
350
|
-----
|
|
141
|
-
The function uses `scipy.signal.butter` to design the filter and
|
|
142
|
-
to apply it using second-order sections (SOS)
|
|
351
|
+
The function uses `scipy.signal.butter` to design the filter and
|
|
352
|
+
`scipy.signal.sosfiltfilt` to apply it using second-order sections (SOS)
|
|
353
|
+
to improve numerical stability.
|
|
143
354
|
"""
|
|
144
355
|
# Design the filter using second-order sections (SOS)
|
|
145
356
|
sos = signal.butter(
|
|
@@ -159,7 +370,14 @@ def butterworth_filter(
|
|
|
159
370
|
else:
|
|
160
371
|
raise ValueError("Data must be either 1D or 2D.")
|
|
161
372
|
|
|
162
|
-
|
|
373
|
+
|
|
374
|
+
def preprocess_imu_data(
|
|
375
|
+
df: pd.DataFrame,
|
|
376
|
+
config: IMUConfig,
|
|
377
|
+
sensor: str,
|
|
378
|
+
watch_side: str,
|
|
379
|
+
verbose: int = 1,
|
|
380
|
+
) -> pd.DataFrame:
|
|
163
381
|
"""
|
|
164
382
|
Preprocesses IMU data by resampling and applying filters.
|
|
165
383
|
|
|
@@ -168,8 +386,9 @@ def preprocess_imu_data(df: pd.DataFrame, config: IMUConfig, sensor: str, watch_
|
|
|
168
386
|
df : pd.DataFrame
|
|
169
387
|
The DataFrame containing raw accelerometer and/or gyroscope data.
|
|
170
388
|
config : IMUConfig
|
|
171
|
-
Configuration object containing various settings, such as time column
|
|
172
|
-
filter settings, and
|
|
389
|
+
Configuration object containing various settings, such as time column
|
|
390
|
+
name, accelerometer and/or gyroscope columns, filter settings, and
|
|
391
|
+
sampling frequency.
|
|
173
392
|
sensor: str
|
|
174
393
|
Name of the sensor data to be preprocessed. Must be one of:
|
|
175
394
|
- "accelerometer": Preprocess accelerometer data only.
|
|
@@ -179,169 +398,281 @@ def preprocess_imu_data(df: pd.DataFrame, config: IMUConfig, sensor: str, watch_
|
|
|
179
398
|
The side of the watch where the data was collected. Must be one of:
|
|
180
399
|
- "left": Data was collected from the left wrist.
|
|
181
400
|
- "right": Data was collected from the right wrist.
|
|
401
|
+
verbose : int, default 1
|
|
402
|
+
Logging verbosity level: 0=errors only, 1=basic info, 2+=detailed info.
|
|
182
403
|
|
|
183
404
|
Returns
|
|
184
405
|
-------
|
|
185
406
|
pd.DataFrame
|
|
186
|
-
The preprocessed accelerometer and or gyroscope data with the
|
|
407
|
+
The preprocessed accelerometer and/or gyroscope data with the
|
|
408
|
+
following transformations:
|
|
187
409
|
- Resampled data at the specified frequency.
|
|
188
|
-
- Filtered accelerometer data with high-pass and low-pass filtering
|
|
189
|
-
|
|
410
|
+
- Filtered accelerometer data with high-pass and low-pass filtering
|
|
411
|
+
applied.
|
|
412
|
+
|
|
190
413
|
Notes
|
|
191
414
|
-----
|
|
192
|
-
- The function applies Butterworth filters to accelerometer data, both
|
|
415
|
+
- The function applies Butterworth filters to accelerometer data, both
|
|
416
|
+
high-pass and low-pass.
|
|
193
417
|
"""
|
|
418
|
+
# Make a copy to avoid SettingWithCopyWarning
|
|
419
|
+
df = df.copy()
|
|
194
420
|
|
|
195
421
|
# Extract sensor column
|
|
196
|
-
if sensor ==
|
|
197
|
-
values_colnames = config.
|
|
198
|
-
elif sensor ==
|
|
199
|
-
values_colnames = config.
|
|
200
|
-
elif sensor ==
|
|
201
|
-
values_colnames = config.
|
|
422
|
+
if sensor == "accelerometer":
|
|
423
|
+
values_colnames = config.accelerometer_colnames
|
|
424
|
+
elif sensor == "gyroscope":
|
|
425
|
+
values_colnames = config.gyroscope_colnames
|
|
426
|
+
elif sensor == "both":
|
|
427
|
+
values_colnames = config.accelerometer_colnames + config.gyroscope_colnames
|
|
202
428
|
else:
|
|
203
|
-
raise(
|
|
204
|
-
|
|
205
|
-
#
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
429
|
+
raise ("Sensor should be either accelerometer, gyroscope, or both")
|
|
430
|
+
|
|
431
|
+
# Check if data needs resampling
|
|
432
|
+
# Skip resampling if already at target frequency; skip contiguity validation if data has been pre-segmented
|
|
433
|
+
needs_resampling = True
|
|
434
|
+
validate_contiguous = True
|
|
435
|
+
|
|
436
|
+
if "data_segment_nr" in df.columns:
|
|
437
|
+
# Data has been pre-segmented, skip contiguity validation
|
|
438
|
+
validate_contiguous = False
|
|
439
|
+
|
|
440
|
+
# Check current sampling frequency
|
|
441
|
+
time_diff = df[config.time_colname].diff().dropna()
|
|
442
|
+
current_dt = time_diff.median()
|
|
443
|
+
current_frequency = 1.0 / current_dt
|
|
444
|
+
|
|
445
|
+
if abs(current_frequency - config.resampling_frequency) < 0.1:
|
|
446
|
+
needs_resampling = False
|
|
447
|
+
|
|
448
|
+
if needs_resampling:
|
|
449
|
+
# Resample the data to the specified frequency
|
|
450
|
+
df = resample_data(
|
|
451
|
+
df=df,
|
|
452
|
+
time_column=config.time_colname,
|
|
453
|
+
values_column_names=values_colnames,
|
|
454
|
+
sampling_frequency=config.sampling_frequency,
|
|
455
|
+
resampling_frequency=config.resampling_frequency,
|
|
456
|
+
tolerance=config.tolerance,
|
|
457
|
+
validate_contiguous=validate_contiguous,
|
|
458
|
+
verbose=verbose,
|
|
459
|
+
)
|
|
213
460
|
|
|
214
461
|
# Invert the IMU data if the watch was worn on the right wrist
|
|
215
462
|
df = invert_watch_side(df, watch_side, sensor)
|
|
216
|
-
|
|
217
|
-
if sensor in [
|
|
218
|
-
|
|
463
|
+
|
|
464
|
+
if sensor in ["accelerometer", "both"]:
|
|
465
|
+
|
|
219
466
|
# Extract accelerometer data for filtering
|
|
220
|
-
accel_data = df[config.
|
|
467
|
+
accel_data = df[config.accelerometer_colnames].values
|
|
221
468
|
|
|
222
469
|
# Define filter configurations for high-pass and low-pass
|
|
223
470
|
filter_renaming_configs = {
|
|
224
|
-
|
|
225
|
-
|
|
471
|
+
"hp": {
|
|
472
|
+
"result_columns": config.accelerometer_colnames,
|
|
473
|
+
"replace_original": True,
|
|
474
|
+
},
|
|
475
|
+
"lp": {
|
|
476
|
+
"result_columns": [
|
|
477
|
+
f"{col}_grav" for col in config.accelerometer_colnames
|
|
478
|
+
],
|
|
479
|
+
"replace_original": False,
|
|
480
|
+
},
|
|
226
481
|
}
|
|
227
482
|
|
|
228
483
|
# Apply filters in a loop
|
|
229
484
|
for passband, filter_config in filter_renaming_configs.items():
|
|
230
485
|
filtered_data = butterworth_filter(
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
486
|
+
data=accel_data,
|
|
487
|
+
order=config.filter_order,
|
|
488
|
+
cutoff_frequency=config.lower_cutoff_frequency,
|
|
489
|
+
passband=passband,
|
|
490
|
+
sampling_frequency=config.sampling_frequency,
|
|
236
491
|
)
|
|
237
492
|
|
|
238
493
|
# Replace or add new columns based on configuration
|
|
239
494
|
df[filter_config["result_columns"]] = filtered_data
|
|
240
495
|
|
|
241
|
-
values_colnames += config.
|
|
496
|
+
values_colnames += config.gravity_colnames
|
|
242
497
|
|
|
243
|
-
df = df[[
|
|
498
|
+
df = df[[config.time_colname, *values_colnames]]
|
|
244
499
|
|
|
245
500
|
return df
|
|
246
501
|
|
|
247
502
|
|
|
248
|
-
def preprocess_ppg_data(
|
|
249
|
-
|
|
503
|
+
def preprocess_ppg_data(
|
|
504
|
+
df_ppg: pd.DataFrame,
|
|
505
|
+
ppg_config: PPGConfig,
|
|
506
|
+
start_time_ppg: str | None = None,
|
|
507
|
+
df_acc: pd.DataFrame | None = None,
|
|
508
|
+
imu_config: IMUConfig | None = None,
|
|
509
|
+
start_time_imu: str | None = None,
|
|
510
|
+
verbose: int = 1,
|
|
511
|
+
) -> tuple[pd.DataFrame, pd.DataFrame | None]:
|
|
250
512
|
"""
|
|
251
|
-
|
|
513
|
+
This function preprocesses PPG and accelerometer data by resampling,
|
|
514
|
+
filtering and aligning the data segments of both sensors (if applicable).
|
|
515
|
+
Aligning is done using the extract_overlapping_segments function which is
|
|
516
|
+
based on the provided start times of the PPG and IMU data and returns
|
|
517
|
+
only the data points where both signals overlap in time. The remaining
|
|
518
|
+
data points are discarded.
|
|
519
|
+
After alignment, the function resamples the data to the specified
|
|
520
|
+
frequency and applies Butterworth filters to both PPG and accelerometer
|
|
521
|
+
data (if applicable).
|
|
522
|
+
The output is two DataFrames: one for the preprocessed PPG data and
|
|
523
|
+
another for the preprocessed accelerometer data (if provided, otherwise
|
|
524
|
+
return is None).
|
|
252
525
|
|
|
253
526
|
Parameters
|
|
254
527
|
----------
|
|
255
528
|
df_ppg : pd.DataFrame
|
|
256
529
|
DataFrame containing PPG data.
|
|
257
|
-
df_acc : pd.DataFrame
|
|
258
|
-
DataFrame containing accelerometer from IMU data.
|
|
259
530
|
ppg_config : PPGConfig
|
|
260
531
|
Configuration object for PPG preprocessing.
|
|
261
|
-
imu_config : IMUPreprocessingConfig
|
|
262
|
-
Configuration object for IMU preprocessing.
|
|
263
532
|
start_time_ppg : str, optional
|
|
264
533
|
iso8601 formatted start time of the PPG data.
|
|
534
|
+
df_acc : pd.DataFrame, optional
|
|
535
|
+
DataFrame containing accelerometer data from the IMU.
|
|
536
|
+
imu_config : IMUConfig, optional
|
|
537
|
+
Configuration object for IMU preprocessing.
|
|
265
538
|
start_time_imu : str, optional
|
|
266
539
|
iso8601 formatted start time of the IMU data.
|
|
540
|
+
verbose : int, default 1
|
|
541
|
+
Logging verbosity level: 0=errors only, 1=basic info, 2+=detailed info.
|
|
267
542
|
|
|
268
543
|
Returns
|
|
269
544
|
-------
|
|
270
|
-
Tuple[pd.DataFrame, pd.DataFrame]
|
|
271
|
-
|
|
272
|
-
|
|
545
|
+
Tuple[pd.DataFrame, pd.DataFrame | None]
|
|
546
|
+
A tuple containing two DataFrames:
|
|
547
|
+
- Preprocessed PPG data with the following transformations:
|
|
548
|
+
- Resampled data at the specified frequency.
|
|
549
|
+
- Filtered PPG data with bandpass filtering applied.
|
|
550
|
+
- Preprocessed accelerometer data (if provided, otherwise return is
|
|
551
|
+
None) with the following transformations:
|
|
552
|
+
- Resampled data at the specified frequency.
|
|
553
|
+
- Filtered accelerometer data with high-pass and low-pass
|
|
554
|
+
filtering applied.
|
|
555
|
+
|
|
556
|
+
Notes
|
|
557
|
+
-----
|
|
558
|
+
- If accelerometer data or IMU configuration is not provided, the
|
|
559
|
+
function only preprocesses PPG data.
|
|
560
|
+
- The function applies Butterworth filters to PPG and accelerometer
|
|
561
|
+
(if applicable) data: band-pass for PPG and high-pass for accelerometer.
|
|
562
|
+
|
|
273
563
|
"""
|
|
564
|
+
# Make copies to avoid SettingWithCopyWarning
|
|
565
|
+
df_ppg = df_ppg.copy()
|
|
566
|
+
if df_acc is not None:
|
|
567
|
+
df_acc = df_acc.copy()
|
|
568
|
+
|
|
569
|
+
if df_acc is not None and imu_config is not None:
|
|
570
|
+
# Extract overlapping segments
|
|
571
|
+
df_ppg_overlapping, df_acc_overlapping = extract_overlapping_segments(
|
|
572
|
+
df_ppg=df_ppg,
|
|
573
|
+
df_acc=df_acc,
|
|
574
|
+
time_colname_ppg=ppg_config.time_colname,
|
|
575
|
+
time_colname_imu=imu_config.time_colname,
|
|
576
|
+
start_time_ppg=start_time_ppg,
|
|
577
|
+
start_time_acc=start_time_imu,
|
|
578
|
+
)
|
|
274
579
|
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
580
|
+
# Resample accelerometer data
|
|
581
|
+
# Skip contiguity validation if data has been pre-segmented
|
|
582
|
+
validate_contiguous_acc = "data_segment_nr" not in df_acc_overlapping.columns
|
|
583
|
+
df_acc_proc = resample_data(
|
|
584
|
+
df=df_acc_overlapping,
|
|
585
|
+
time_column=imu_config.time_colname,
|
|
586
|
+
values_column_names=list(imu_config.d_channels_accelerometer.keys()),
|
|
587
|
+
sampling_frequency=imu_config.sampling_frequency,
|
|
588
|
+
resampling_frequency=imu_config.resampling_frequency,
|
|
589
|
+
tolerance=imu_config.tolerance,
|
|
590
|
+
validate_contiguous=validate_contiguous_acc,
|
|
591
|
+
verbose=verbose,
|
|
592
|
+
)
|
|
593
|
+
|
|
594
|
+
# Extract accelerometer data for filtering
|
|
595
|
+
accel_data = df_acc_proc[imu_config.accelerometer_colnames].values
|
|
596
|
+
|
|
597
|
+
# Define filter configuration for high-pass filtering
|
|
598
|
+
filter_renaming_configs = {
|
|
599
|
+
"hp": {
|
|
600
|
+
"result_columns": imu_config.accelerometer_colnames,
|
|
601
|
+
"replace_original": True,
|
|
602
|
+
}
|
|
603
|
+
}
|
|
604
|
+
|
|
605
|
+
# Apply filters in a loop
|
|
606
|
+
for passband, filter_config in filter_renaming_configs.items():
|
|
607
|
+
filtered_data = butterworth_filter(
|
|
608
|
+
data=accel_data,
|
|
609
|
+
order=imu_config.filter_order,
|
|
610
|
+
cutoff_frequency=imu_config.lower_cutoff_frequency,
|
|
611
|
+
passband=passband,
|
|
612
|
+
sampling_frequency=imu_config.sampling_frequency,
|
|
613
|
+
)
|
|
614
|
+
|
|
615
|
+
# Replace or add new columns based on configuration
|
|
616
|
+
df_acc_proc[filter_config["result_columns"]] = filtered_data
|
|
617
|
+
|
|
618
|
+
else:
|
|
619
|
+
df_ppg_overlapping = df_ppg
|
|
286
620
|
|
|
287
621
|
# Resample PPG data
|
|
622
|
+
# Skip contiguity validation if data has been pre-segmented
|
|
623
|
+
validate_contiguous_ppg = "data_segment_nr" not in df_ppg_overlapping.columns
|
|
288
624
|
df_ppg_proc = resample_data(
|
|
289
625
|
df=df_ppg_overlapping,
|
|
290
|
-
time_column=
|
|
291
|
-
values_column_names
|
|
626
|
+
time_column=ppg_config.time_colname,
|
|
627
|
+
values_column_names=list(ppg_config.d_channels_ppg.keys()),
|
|
292
628
|
sampling_frequency=ppg_config.sampling_frequency,
|
|
293
|
-
resampling_frequency=ppg_config.
|
|
629
|
+
resampling_frequency=ppg_config.resampling_frequency,
|
|
630
|
+
tolerance=ppg_config.tolerance,
|
|
631
|
+
validate_contiguous=validate_contiguous_ppg,
|
|
632
|
+
verbose=verbose,
|
|
294
633
|
)
|
|
295
634
|
|
|
296
|
-
|
|
297
|
-
# Extract accelerometer data for filtering
|
|
298
|
-
accel_data = df_acc_proc[imu_config.accelerometer_cols].values
|
|
299
|
-
|
|
300
|
-
# Define filter configurations for high-pass and low-pass
|
|
301
|
-
filter_renaming_configs = {
|
|
302
|
-
"hp": {"result_columns": imu_config.accelerometer_cols, "replace_original": True}}
|
|
303
|
-
|
|
304
|
-
# Apply filters in a loop
|
|
305
|
-
for passband, filter_config in filter_renaming_configs.items():
|
|
306
|
-
filtered_data = butterworth_filter(
|
|
307
|
-
data=accel_data,
|
|
308
|
-
order=imu_config.filter_order,
|
|
309
|
-
cutoff_frequency=imu_config.lower_cutoff_frequency,
|
|
310
|
-
passband=passband,
|
|
311
|
-
sampling_frequency=imu_config.sampling_frequency,
|
|
312
|
-
)
|
|
313
|
-
|
|
314
|
-
# Replace or add new columns based on configuration
|
|
315
|
-
df_acc_proc[filter_config["result_columns"]] = filtered_data
|
|
316
|
-
|
|
317
635
|
# Extract PPG data for filtering
|
|
318
636
|
ppg_data = df_ppg_proc[ppg_config.ppg_colname].values
|
|
319
637
|
|
|
320
638
|
# Define filter configuration for band-pass filtering
|
|
321
639
|
filter_renaming_configs = {
|
|
322
|
-
|
|
640
|
+
"bandpass": {"result_columns": ppg_config.ppg_colname, "replace_original": True}
|
|
641
|
+
}
|
|
323
642
|
|
|
324
643
|
# Apply filters in a loop
|
|
325
644
|
for passband, filter_config in filter_renaming_configs.items():
|
|
326
645
|
filtered_data = butterworth_filter(
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
646
|
+
data=ppg_data,
|
|
647
|
+
order=ppg_config.filter_order,
|
|
648
|
+
cutoff_frequency=[
|
|
649
|
+
ppg_config.lower_cutoff_frequency,
|
|
650
|
+
ppg_config.upper_cutoff_frequency,
|
|
651
|
+
],
|
|
652
|
+
passband=passband,
|
|
653
|
+
sampling_frequency=ppg_config.sampling_frequency,
|
|
332
654
|
)
|
|
333
655
|
|
|
334
656
|
# Replace or add new columns based on configuration
|
|
335
657
|
df_ppg_proc[filter_config["result_columns"]] = filtered_data
|
|
336
|
-
|
|
337
|
-
return df_ppg_proc, df_acc_proc
|
|
338
|
-
|
|
339
658
|
|
|
659
|
+
if df_acc is not None and imu_config is not None:
|
|
660
|
+
return df_ppg_proc, df_acc_proc
|
|
661
|
+
else:
|
|
662
|
+
return df_ppg_proc, None
|
|
340
663
|
|
|
341
664
|
|
|
342
|
-
def extract_overlapping_segments(
|
|
665
|
+
def extract_overlapping_segments(
|
|
666
|
+
df_ppg: pd.DataFrame,
|
|
667
|
+
df_acc: pd.DataFrame,
|
|
668
|
+
time_colname_ppg: str,
|
|
669
|
+
time_colname_imu: str,
|
|
670
|
+
start_time_ppg: str,
|
|
671
|
+
start_time_acc: str,
|
|
672
|
+
) -> tuple[pd.DataFrame, pd.DataFrame]:
|
|
343
673
|
"""
|
|
344
|
-
Extract DataFrames with overlapping data segments between accelerometer
|
|
674
|
+
Extract DataFrames with overlapping data segments between accelerometer
|
|
675
|
+
(from the IMU) and PPG datasets based on their timestamps.
|
|
345
676
|
|
|
346
677
|
Parameters
|
|
347
678
|
----------
|
|
@@ -349,6 +680,10 @@ def extract_overlapping_segments(df_ppg: pd.DataFrame, df_acc: pd.DataFrame, sta
|
|
|
349
680
|
DataFrame containing PPG data.
|
|
350
681
|
df_acc : pd.DataFrame
|
|
351
682
|
DataFrame containing accelerometer data from the IMU.
|
|
683
|
+
time_colname_ppg : str
|
|
684
|
+
The name of the column containing the time data in the PPG dataframe.
|
|
685
|
+
time_colname_imu : str
|
|
686
|
+
The name of the column containing the time data in the IMU dataframe.
|
|
352
687
|
start_time_ppg : str
|
|
353
688
|
iso8601 formatted start time of the PPG data.
|
|
354
689
|
start_time_acc : str
|
|
@@ -357,7 +692,8 @@ def extract_overlapping_segments(df_ppg: pd.DataFrame, df_acc: pd.DataFrame, sta
|
|
|
357
692
|
Returns
|
|
358
693
|
-------
|
|
359
694
|
Tuple[pd.DataFrame, pd.DataFrame]
|
|
360
|
-
DataFrames containing the overlapping segments (time and values) of
|
|
695
|
+
DataFrames containing the overlapping segments (time and values) of
|
|
696
|
+
PPG and accelerometer data.
|
|
361
697
|
"""
|
|
362
698
|
# Convert start times to Unix timestamps
|
|
363
699
|
datetime_ppg_start = datetime.fromisoformat(start_time_ppg.replace("Z", "+00:00"))
|
|
@@ -365,22 +701,23 @@ def extract_overlapping_segments(df_ppg: pd.DataFrame, df_acc: pd.DataFrame, sta
|
|
|
365
701
|
datetime_acc_start = datetime.fromisoformat(start_time_acc.replace("Z", "+00:00"))
|
|
366
702
|
start_acc_ppg = int(datetime_acc_start.timestamp())
|
|
367
703
|
|
|
368
|
-
# Calculate the time in Unix timestamps for each dataset because the
|
|
369
|
-
|
|
370
|
-
|
|
704
|
+
# Calculate the time in Unix timestamps for each dataset because the
|
|
705
|
+
# timestamps are relative to the start time
|
|
706
|
+
ppg_time = df_ppg[time_colname_ppg] + start_unix_ppg
|
|
707
|
+
acc_time = df_acc[time_colname_imu] + start_acc_ppg
|
|
371
708
|
|
|
372
709
|
# Determine the overlapping time interval
|
|
373
710
|
start_time = max(ppg_time.iloc[0], acc_time.iloc[0])
|
|
374
711
|
end_time = min(ppg_time.iloc[-1], acc_time.iloc[-1])
|
|
375
712
|
|
|
376
713
|
# Extract indices for overlapping segments
|
|
377
|
-
ppg_start_index = np.searchsorted(ppg_time, start_time,
|
|
378
|
-
ppg_end_index = np.searchsorted(ppg_time, end_time,
|
|
379
|
-
acc_start_index = np.searchsorted(acc_time, start_time,
|
|
380
|
-
acc_end_index = np.searchsorted(acc_time, end_time,
|
|
714
|
+
ppg_start_index = np.searchsorted(ppg_time, start_time, "left")
|
|
715
|
+
ppg_end_index = np.searchsorted(ppg_time, end_time, "right") - 1
|
|
716
|
+
acc_start_index = np.searchsorted(acc_time, start_time, "left")
|
|
717
|
+
acc_end_index = np.searchsorted(acc_time, end_time, "right") - 1
|
|
381
718
|
|
|
382
719
|
# Extract overlapping segments from DataFrames
|
|
383
|
-
df_ppg_overlapping = df_ppg.iloc[ppg_start_index:ppg_end_index + 1]
|
|
384
|
-
df_acc_overlapping = df_acc.iloc[acc_start_index:acc_end_index + 1]
|
|
720
|
+
df_ppg_overlapping = df_ppg.iloc[ppg_start_index : ppg_end_index + 1]
|
|
721
|
+
df_acc_overlapping = df_acc.iloc[acc_start_index : acc_end_index + 1]
|
|
385
722
|
|
|
386
|
-
return df_ppg_overlapping, df_acc_overlapping
|
|
723
|
+
return df_ppg_overlapping, df_acc_overlapping
|