paradigma 1.0.4__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,409 @@
1
+ """
2
+ Data preparation module for ParaDigMa toolbox.
3
+
4
+ This module provides functions to prepare raw sensor data for analysis:
5
+ - Unit conversion (m/s² to g, rad/s to deg/s)
6
+ - Time column formatting
7
+ - Column name standardization
8
+ - Watch side orientation correction
9
+ - Resampling to 100 Hz
10
+
11
+ Based on data_preparation tutorial.
12
+ """
13
+
14
+ import logging
15
+
16
+ import pandas as pd
17
+
18
+ from paradigma.constants import DataColumns, TimeUnit
19
+ from paradigma.preprocessing import resample_data
20
+ from paradigma.util import (
21
+ convert_units_accelerometer,
22
+ convert_units_gyroscope,
23
+ transform_time_array,
24
+ )
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
+ def standardize_column_names(
30
+ df: pd.DataFrame,
31
+ column_mapping: dict[str, str] | None = None,
32
+ ) -> pd.DataFrame:
33
+ """
34
+ Standardize column names to ParaDigMa conventions.
35
+
36
+ Parameters
37
+ ----------
38
+ df : pd.DataFrame
39
+ Input DataFrame.
40
+ column_mapping : dict, optional
41
+ Custom column mapping.
42
+
43
+ Returns
44
+ -------
45
+ pd.DataFrame
46
+ DataFrame with standardized column names.
47
+ """
48
+ df = df.copy()
49
+
50
+ # Apply mapping for existing columns only
51
+ if column_mapping is None:
52
+ return df
53
+
54
+ existing_mapping = {k: v for k, v in column_mapping.items() if k in df.columns}
55
+ df = df.rename(columns=existing_mapping)
56
+
57
+ if existing_mapping:
58
+ logger.debug(f"Standardized columns: {existing_mapping}")
59
+ return df
60
+
61
+
62
def convert_sensor_units(
    df: pd.DataFrame,
    accelerometer_units: str = "m/s^2",
    gyroscope_units: str = "deg/s",
) -> pd.DataFrame:
    """
    Convert sensor units to ParaDigMa expected format (g for acceleration,
    deg/s for gyroscope).

    Sensor columns that are absent from the DataFrame are skipped, and the
    conversion is a no-op when the data is already in the expected unit.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame with sensor data.
    accelerometer_units : str, default 'm/s^2'
        Current units of accelerometer data.
    gyroscope_units : str, default 'deg/s'
        Current units of gyroscope data.

    Returns
    -------
    pd.DataFrame
        DataFrame with converted units.
    """
    converted = df.copy()

    # Accelerometer: ParaDigMa expects gravitational units (g).
    accel_present = [
        axis_col
        for axis_col in (
            DataColumns.ACCELEROMETER_X,
            DataColumns.ACCELEROMETER_Y,
            DataColumns.ACCELEROMETER_Z,
        )
        if axis_col in converted.columns
    ]
    if accel_present and accelerometer_units != "g":
        logger.debug(f"Converting accelerometer units from {accelerometer_units} to g")
        converted[accel_present] = convert_units_accelerometer(
            data=converted[accel_present].values, units=accelerometer_units
        )

    # Gyroscope: ParaDigMa expects degrees per second.
    gyro_present = [
        axis_col
        for axis_col in (
            DataColumns.GYROSCOPE_X,
            DataColumns.GYROSCOPE_Y,
            DataColumns.GYROSCOPE_Z,
        )
        if axis_col in converted.columns
    ]
    if gyro_present and gyroscope_units != "deg/s":
        logger.debug(f"Converting gyroscope units from {gyroscope_units} to deg/s")
        converted[gyro_present] = convert_units_gyroscope(
            data=converted[gyro_present].values, units=gyroscope_units
        )

    return converted
124
+
125
+
126
def prepare_time_column(
    df: pd.DataFrame,
    time_column: str = DataColumns.TIME,
    input_unit_type: TimeUnit = TimeUnit.RELATIVE_S,
    output_unit_type: TimeUnit = TimeUnit.RELATIVE_S,
) -> pd.DataFrame:
    """
    Prepare time column to start from 0 seconds.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame.
    time_column : str, default DataColumns.TIME
        Name of time column.
    input_unit_type : TimeUnit, default TimeUnit.RELATIVE_S
        Input time unit type.
    output_unit_type : TimeUnit, default TimeUnit.RELATIVE_S
        Output time unit type.

    Returns
    -------
    pd.DataFrame
        DataFrame with prepared time column.

    Raises
    ------
    ValueError
        If ``time_column`` is not a column of ``df``.
    """
    # Fail fast if the requested time column is missing.
    if time_column not in df.columns:
        raise ValueError(f"Time column '{time_column}' not found in DataFrame")

    prepared = df.copy()

    logger.debug(f"Preparing time column: {input_unit_type} -> {output_unit_type}")

    prepared[time_column] = transform_time_array(
        time_array=prepared[time_column],
        input_unit_type=input_unit_type,
        output_unit_type=output_unit_type,
    )

    return prepared
165
+
166
+
167
+ def correct_watch_orientation(
168
+ df: pd.DataFrame,
169
+ device_orientation: list[str] | None = None,
170
+ sensor: str = "both",
171
+ ) -> pd.DataFrame:
172
+ """
173
+ Apply custom device orientation mapping if provided.
174
+
175
+ Note: Watch-side inversion is handled separately during preprocessing
176
+ in the pipeline functions (preprocess_imu_data), not during data preparation.
177
+
178
+ Parameters
179
+ ----------
180
+ df : pd.DataFrame
181
+ Input DataFrame with sensor data
182
+ device_orientation : list of str, optional
183
+ Custom orientation correction multipliers for each axis.
184
+ Maps device axes to standard [x, y, z] orientation.
185
+ sensor: str, optional
186
+ Sensor to correct ('accelerometer', 'gyroscope', or 'both').
187
+
188
+ Returns
189
+ -------
190
+ pd.DataFrame
191
+ DataFrame with corrected device orientation (if custom mapping provided)
192
+ """
193
+ out = df.copy()
194
+
195
+ target_orientation = ["x", "y", "z"]
196
+ valid_axes = ["x", "-x", "y", "-y", "z", "-z"]
197
+
198
+ if sensor == "both":
199
+ sensors_to_correct = ["accelerometer", "gyroscope"]
200
+ elif sensor in ["accelerometer", "gyroscope"]:
201
+ sensors_to_correct = [sensor]
202
+ else:
203
+ raise ValueError("Sensor must be 'accelerometer', 'gyroscope', or 'both'")
204
+
205
+ if device_orientation is not None:
206
+ if any([axis not in valid_axes for axis in device_orientation]):
207
+ raise ValueError(
208
+ f"Invalid device_orientation values. Must be one of {valid_axes}"
209
+ )
210
+ if len(device_orientation) != 3:
211
+ raise ValueError("device_orientation must have exactly 3 elements")
212
+
213
+ if all([device_orientation[x] == target_orientation[x] for x in range(3)]):
214
+ logger.debug(
215
+ "Device orientation matches target orientation, "
216
+ "no correction applied"
217
+ )
218
+ else:
219
+ for sensor_type in sensors_to_correct:
220
+ for target_axis, mapping in zip(["x", "y", "z"], device_orientation):
221
+ sign = -1 if mapping.startswith("-") else 1
222
+ source_axis = mapping[-1]
223
+
224
+ out[f"{sensor_type}_{target_axis}"] = (
225
+ sign * df[f"{sensor_type}_{source_axis}"]
226
+ )
227
+
228
+ logger.debug(
229
+ f"Applied custom orientation: {sensor_type} "
230
+ f"{target_axis} mapped from {mapping}"
231
+ )
232
+
233
+ return out
234
+
235
+
236
def validate_prepared_data(df: pd.DataFrame) -> dict[str, bool | list[str]]:
    """
    Validate that data is properly prepared for ParaDigMa analysis.

    Runs a series of non-destructive checks and collects the outcome;
    nothing in ``df`` is modified. Missing required data produces errors,
    while suspicious-but-usable data produces warnings.

    Parameters
    ----------
    df : pd.DataFrame
        Prepared DataFrame

    Returns
    -------
    dict
        Validation results with keys 'valid' (bool), 'errors' (list of str)
        and 'warnings' (list of str).
    """
    validation: dict[str, bool | list[str]] = {
        "valid": True,
        "errors": [],
        "warnings": [],
    }

    # Check required columns
    if DataColumns.TIME not in df.columns:
        validation["errors"].append(f"Missing required time column: {DataColumns.TIME}")

    # Check for at least accelerometer or gyroscope data
    accel_cols = [
        DataColumns.ACCELEROMETER_X,
        DataColumns.ACCELEROMETER_Y,
        DataColumns.ACCELEROMETER_Z,
    ]
    gyro_cols = [
        DataColumns.GYROSCOPE_X,
        DataColumns.GYROSCOPE_Y,
        DataColumns.GYROSCOPE_Z,
    ]

    has_accel = all(col in df.columns for col in accel_cols)
    has_gyro = all(col in df.columns for col in gyro_cols)

    if not has_accel and not has_gyro:
        validation["errors"].append("Missing accelerometer and gyroscope data")
    elif not has_accel:
        validation["warnings"].append("Missing accelerometer data")
    elif not has_gyro:
        validation["warnings"].append("Missing gyroscope data")

    # Check time column format. Guard len(df) > 0: iloc[0] on an empty
    # frame would raise IndexError instead of producing a validation result.
    time_diff = None
    if DataColumns.TIME in df.columns and len(df) > 0:
        time_values = df[DataColumns.TIME]
        if time_values.iloc[0] != 0:
            validation["warnings"].append("Time column does not start at 0")

        # Computed once here and reused for the sampling-frequency check.
        time_diff = time_values.diff().dropna()
        mean_dt = time_diff.mean()
        # Coefficient of variation of the sampling interval; skip when the
        # mean interval is not positive (constant or reversed timestamps).
        if len(time_diff) > 0 and mean_dt > 0 and time_diff.std() / mean_dt > 0.1:
            validation["warnings"].append("Time column has irregular sampling")

    # Check for NaN values
    nan_columns = df.columns[df.isnull().any()].tolist()
    if nan_columns:
        validation["warnings"].append(f"Columns with NaN values: {nan_columns}")

    # Check sampling frequency (needs at least two samples and a positive
    # median interval to avoid division by zero)
    if time_diff is not None and len(df) > 1:
        current_dt = time_diff.median()
        if current_dt > 0:
            current_frequency = 1.0 / current_dt
            if abs(current_frequency - 100.0) > 5.0:
                validation["warnings"].append(
                    f"Sampling frequency {current_frequency:.2f} Hz differs from "
                    f"expected 100 Hz"
                )

    # Set overall validity
    validation["valid"] = len(validation["errors"]) == 0

    return validation
308
+
309
+
310
def prepare_raw_data(
    df: pd.DataFrame,
    accelerometer_units: str = "m/s^2",
    gyroscope_units: str = "deg/s",
    time_input_unit: TimeUnit = TimeUnit.RELATIVE_S,
    resampling_frequency: float = 100.0,
    column_mapping: dict[str, str] | None = None,
    device_orientation: list[str] | None = None,
    validate: bool = True,
    auto_segment: bool = False,
    max_segment_gap_s: float | None = None,
    min_segment_length_s: float | None = None,
) -> pd.DataFrame:
    """
    Complete data preparation pipeline for raw sensor data.

    Applies, in order: column-name standardization, unit conversion, time
    column preparation, device orientation correction, resampling, and an
    optional final validation.

    Parameters
    ----------
    df : pd.DataFrame
        Raw sensor data
    accelerometer_units : str, default 'm/s^2'
        Current units of accelerometer data
    gyroscope_units : str, default 'deg/s'
        Current units of gyroscope data
    time_input_unit : TimeUnit, default TimeUnit.RELATIVE_S
        Input time unit type
    resampling_frequency : float, default 100.0
        Target sampling frequency in Hz
    column_mapping : dict of str to str, optional
        Custom column name mapping
    device_orientation : list of str, optional
        Custom orientation correction, e.g. ['x', '-y', 'z'], as accepted by
        :func:`correct_watch_orientation`.
    validate : bool, default True
        Whether to validate the prepared data
    auto_segment : bool, default False
        If True, automatically split non-contiguous data into segments.
        Adds 'data_segment_nr' column to output.
    max_segment_gap_s : float, optional
        Maximum gap (seconds) before starting new segment. Used when auto_segment=True.
        Defaults to 1.5s.
    min_segment_length_s : float, optional
        Minimum segment length (seconds) to keep. Used when auto_segment=True.
        Defaults to 1.5s.

    Returns
    -------
    pd.DataFrame
        Prepared data ready for ParaDigMa analysis. If auto_segment=True and multiple
        segments found, includes 'data_segment_nr' column.

    Raises
    ------
    ValueError
        If ``validate`` is True and the prepared data fails validation.
    """
    logger.info("Starting data preparation pipeline")

    # Step 1: Standardize column names
    logger.info("Step 1: Standardizing column names")
    if column_mapping is None:
        # Nothing is renamed without a mapping; say so instead of implying
        # a default mapping is applied.
        logger.debug("No column mapping provided; keeping original column names")
    else:
        df = standardize_column_names(df, column_mapping)

    # Step 2: Convert units
    logger.info("Step 2: Converting sensor units")
    df = convert_sensor_units(df, accelerometer_units, gyroscope_units)

    # Step 3: Prepare time column
    logger.info("Step 3: Preparing time column")
    df = prepare_time_column(df, input_unit_type=time_input_unit)

    # Step 4: Correct device orientation
    logger.info("Step 4: Correcting device orientation")
    df = correct_watch_orientation(df, device_orientation=device_orientation)

    # Step 5: Resample to target frequency
    logger.info(f"Step 5: Resampling to {resampling_frequency} Hz")
    df = resample_data(
        df,
        resampling_frequency=resampling_frequency,
        auto_segment=auto_segment,
        max_segment_gap_s=max_segment_gap_s,
        min_segment_length_s=min_segment_length_s,
        verbose=1 if logger.level <= logging.INFO else 0,
    )

    # Step 6: Validate prepared data
    if validate:
        logger.info("Step 6: Validating prepared data")
        validation = validate_prepared_data(df)

        if validation["warnings"]:
            for warning in validation["warnings"]:
                logger.warning(warning)

        # Errors are logged individually before aborting the pipeline.
        if not validation["valid"]:
            for error in validation["errors"]:
                logger.error(error)
            raise ValueError("Data preparation validation failed")

    logger.info(
        f"Data preparation completed: {df.shape[0]} rows, {df.shape[1]} columns"
    )
    return df