paradigma 1.0.3-py3-none-any.whl → 1.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
paradigma/load.py ADDED
@@ -0,0 +1,476 @@
+ """
+ Data loading module for ParaDigMa toolbox.
+
+ This module provides functions to load sensor data from various formats:
+ - Raw data: TSDF (.meta/.bin), Empatica (.avro), Axivity (.CWA)
+ - Prepared data: parquet, pickle, csv, json (TSDF)
+
+ Based on the device_specific_data_loading tutorial.
+ """
+
+ import logging
+ import pickle
+ from pathlib import Path
+
+ import pandas as pd
+ from avro.datafile import DataFileReader
+ from avro.io import DatumReader
+
+ from paradigma.util import load_tsdf_dataframe
+
+ logger = logging.getLogger(__name__)
+
+
+ def load_tsdf_data(
+     data_path: str | Path,
+     prefix: str = "IMU",
+ ) -> tuple[pd.DataFrame, dict, dict]:
+     """
+     Load TSDF data from .meta and .bin files.
+
+     Parameters
+     ----------
+     data_path : str or Path
+         Path to directory containing TSDF files.
+     prefix : str, default "IMU"
+         Prefix for TSDF files (e.g., "IMU_segment0001").
+
+     Returns
+     -------
+     tuple
+         Tuple containing (DataFrame with loaded data, time metadata dict,
+         values metadata dict).
+     """
+     data_path = Path(data_path)
+     logger.info(f"Loading TSDF data from {data_path} with prefix '{prefix}'")
+
+     df, time_meta, values_meta = load_tsdf_dataframe(
+         path_to_data=data_path, prefix=prefix
+     )
+
+     logger.info(f"Loaded TSDF data: {df.shape[0]} rows, {df.shape[1]} columns")
+     return df, time_meta, values_meta
+
+
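# Usage sketch (editorial, not part of the diff): loading one TSDF segment.
# The directory layout and prefix below are hypothetical.
from paradigma.load import load_tsdf_data

df, time_meta, values_meta = load_tsdf_data("data/tsdf", prefix="IMU_segment0001")
print(df.shape, list(time_meta), list(values_meta))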
+ def load_empatica_data(
+     file_path: str | Path,
+ ) -> pd.DataFrame:
+     """
+     Load Empatica .avro file.
+
+     Parameters
+     ----------
+     file_path : str or Path
+         Path to .avro file.
+
+     Returns
+     -------
+     pd.DataFrame
+         DataFrame with columns: time, time_dt, accelerometer_x/y/z,
+         gyroscope_x/y/z (if available).
+     """
+     file_path = Path(file_path)
+     logger.info(f"Loading Empatica data from {file_path}")
+
+     with open(file_path, "rb") as f:
+         reader = DataFileReader(f, DatumReader())
+         empatica_data = next(reader)
+
+     accel_data = empatica_data["rawData"]["accelerometer"]
+
+     # Check for gyroscope data; it is optional, matching the docstring
+     gyro_data = None
+     if (
+         "gyroscope" in empatica_data["rawData"]
+         and len(empatica_data["rawData"]["gyroscope"]["x"]) > 0
+     ):
+         gyro_data = empatica_data["rawData"]["gyroscope"]
+     else:
+         logger.warning(
+             "No gyroscope data found in Empatica file; loading accelerometer only."
+         )
+
+     # Check Avro schema version for conversion
+     avro_version = (
+         empatica_data["schemaVersion"]["major"],
+         empatica_data["schemaVersion"]["minor"],
+         empatica_data["schemaVersion"]["patch"],
+     )
+
+     # Convert accelerometer data based on schema version
+     if avro_version < (6, 5, 0):
+         physical_range = (
+             accel_data["imuParams"]["physicalMax"]
+             - accel_data["imuParams"]["physicalMin"]
+         )
+         digital_range = (
+             accel_data["imuParams"]["digitalMax"]
+             - accel_data["imuParams"]["digitalMin"]
+         )
+         conversion_factor = physical_range / digital_range
+     else:
+         conversion_factor = accel_data["imuParams"]["conversionFactor"]
+
+     accel_x = [val * conversion_factor for val in accel_data["x"]]
+     accel_y = [val * conversion_factor for val in accel_data["y"]]
+     accel_z = [val * conversion_factor for val in accel_data["z"]]
+
+     sampling_frequency = accel_data["samplingFrequency"]
+     nrows = len(accel_x)
+
+     # Create time arrays
+     t_start = accel_data["timestampStart"]
+     t_array = [t_start + i * (1e6 / sampling_frequency) for i in range(nrows)]
+     t_from_0_array = [(x - t_array[0]) / 1e6 for x in t_array]
+
+     # Build DataFrame
+     df_data = {
+         "time": t_from_0_array,
+         "time_dt": pd.to_datetime(t_array, unit="us"),
+         "accelerometer_x": accel_x,
+         "accelerometer_y": accel_y,
+         "accelerometer_z": accel_z,
+     }
+
+     # Add gyroscope data if available
+     if gyro_data:
+         # Apply the accelerometer conversion factor to the gyroscope as well
+         gyro_x = [val * conversion_factor for val in gyro_data["x"]]
+         gyro_y = [val * conversion_factor for val in gyro_data["y"]]
+         gyro_z = [val * conversion_factor for val in gyro_data["z"]]
+
+         df_data.update(
+             {
+                 "gyroscope_x": gyro_x,
+                 "gyroscope_y": gyro_y,
+                 "gyroscope_z": gyro_z,
+             }
+         )
+
+     df = pd.DataFrame(df_data)
+
+     logger.info(f"Loaded Empatica data: {nrows} rows at {sampling_frequency} Hz")
+     logger.debug(f"Start time: {pd.to_datetime(t_start, unit='us')}")
+     logger.debug(f"Columns: {list(df.columns)}")
+
+     return df
+
+
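# Worked sketch (editorial, not part of the diff): how the pre-6.5.0
# conversion factor comes out of imuParams. The ranges below are
# hypothetical example values, not Empatica specifications.
physical_range = 16 - (-16)       # physicalMax - physicalMin: 32 g
digital_range = 32767 - (-32768)  # digitalMax - digitalMin: 65535 counts
conversion_factor = physical_range / digital_range  # ~4.88e-4 g per count
print(1000 * conversion_factor)   # a raw count of 1000 maps to ~0.49 g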
157
+ def load_axivity_data(
158
+ file_path: str | Path,
159
+ ) -> pd.DataFrame:
160
+ """
161
+ Load Axivity .CWA file.
162
+
163
+ Parameters
164
+ ----------
165
+ file_path : str or Path
166
+ Path to .CWA file.
167
+
168
+ Returns
169
+ -------
170
+ pd.DataFrame
171
+ DataFrame with columns: time, time_dt, accelerometer_x/y/z,
172
+ gyroscope_x/y/z (if available).
173
+ """
174
+ try:
175
+ from openmovement.load import CwaData
176
+ except ImportError:
177
+ raise ImportError(
178
+ "openmovement package required for Axivity data loading. "
179
+ "Install with: pip install git+https://github.com/digitalinteraction/openmovement-python.git@master"
180
+ )
181
+
182
+ file_path = Path(file_path)
183
+ logger.info(f"Loading Axivity data from {file_path}")
184
+
185
+ with CwaData(
186
+ filename=file_path,
187
+ include_gyro=True, # Set to False for AX3 devices without gyroscope
188
+ include_temperature=False,
189
+ ) as cwa_data:
190
+ logger.debug(f"Data format info: {cwa_data.data_format}")
191
+ df = cwa_data.get_samples()
192
+
193
+ # Set time to start at 0 seconds
194
+ df["time_dt"] = df["time"].copy()
195
+ df["time"] = (df["time"] - df["time"].iloc[0]).dt.total_seconds()
196
+
197
+ # Standardize column names
198
+ column_mapping = {}
199
+ if "accel_x" in df.columns:
200
+ column_mapping.update(
201
+ {
202
+ "accel_x": "accelerometer_x",
203
+ "accel_y": "accelerometer_y",
204
+ "accel_z": "accelerometer_z",
205
+ }
206
+ )
207
+ if "gyro_x" in df.columns:
208
+ column_mapping.update(
209
+ {"gyro_x": "gyroscope_x", "gyro_y": "gyroscope_y", "gyro_z": "gyroscope_z"}
210
+ )
211
+
212
+ df = df.rename(columns=column_mapping)
213
+
214
+ logger.info(f"Loaded Axivity data: {df.shape[0]} rows, {df.shape[1]} columns")
215
+ logger.debug(f"Columns: {list(df.columns)}")
216
+
217
+ return df
218
+
219
+
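# Usage sketch (editorial, not part of the diff): loading an AX6 recording.
# The path is hypothetical; on an AX3 (no gyroscope), only the accelerometer
# columns would survive the renaming step.
from paradigma.load import load_axivity_data

df = load_axivity_data("data/recording.cwa")
print(df[["time", "accelerometer_x", "accelerometer_y", "accelerometer_z"]].head())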
+ def load_prepared_data(
+     file_path: str | Path,
+ ) -> pd.DataFrame:
+     """
+     Load prepared data from various formats (parquet, pickle, csv, json).
+     For .json files, TSDF metadata with corresponding .bin files is expected.
+
+     Parameters
+     ----------
+     file_path : str or Path
+         Path to prepared data file.
+
+     Returns
+     -------
+     pd.DataFrame
+         DataFrame with prepared data.
+     """
+     file_path = Path(file_path)
+     logger.info(f"Loading prepared data from {file_path}")
+
+     if not file_path.exists():
+         raise FileNotFoundError(f"File not found: {file_path}")
+
+     # Determine file format and load accordingly
+     suffix = file_path.suffix.lower()
+
+     if suffix == ".parquet":
+         df = pd.read_parquet(file_path)
+     elif suffix == ".csv":
+         df = pd.read_csv(file_path)
+     elif suffix in [".pkl", ".pickle"]:
+         with open(file_path, "rb") as f:
+             df = pickle.load(f)
+     elif suffix == ".json":
+         # Load TSDF from JSON metadata and corresponding .bin files
+         df, _, _ = load_tsdf_dataframe(
+             path_to_data=file_path.parent,
+             prefix=file_path.stem.replace("_meta", ""),
+         )
+     else:
+         raise ValueError(
+             f"Unsupported file format: {suffix}. "
+             f"Supported: .parquet, .csv, .pkl, .pickle, .json"
+         )
+
+     logger.info(f"Loaded {file_path.name}: {df.shape[0]} rows, {df.shape[1]} columns")
+     logger.debug(f"Columns: {list(df.columns)}")
+
+     return df
+
+
+ def detect_file_format(file_path: str | Path) -> str:
+     """
+     Detect the format of a data file based on its extension.
+
+     Parameters
+     ----------
+     file_path : str or Path
+         Path to data file.
+
+     Returns
+     -------
+     str
+         File format: 'tsdf', 'empatica', 'axivity', or 'prepared'.
+     """
+     file_path = Path(file_path)
+     suffix = file_path.suffix.lower()
+
+     if suffix == ".json":
+         return "tsdf"
+     elif suffix == ".avro":
+         return "empatica"
+     elif suffix == ".cwa":
+         return "axivity"
+     elif suffix in [".parquet", ".csv", ".pkl", ".pickle"]:
+         return "prepared"
+     else:
+         raise ValueError(f"Unknown file format: {suffix}")
+
+
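# Quick check (editorial, not part of the diff): the extension-to-format
# mapping in practice. File names are hypothetical.
from paradigma.load import detect_file_format

assert detect_file_format("IMU_meta.json") == "tsdf"
assert detect_file_format("wrist.avro") == "empatica"
assert detect_file_format("recording.CWA") == "axivity"  # case-insensitive
assert detect_file_format("gait.parquet") == "prepared"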
+ def get_data_file_paths(
+     data_path: str | Path,
+     file_patterns: list[str] | str | None = None,
+ ) -> list[Path]:
+     """
+     Get list of data file paths without loading them.
+
+     This function is useful for memory-efficient processing where you want to
+     load and process files one at a time instead of loading all at once.
+
+     Parameters
+     ----------
+     data_path : str or Path
+         Path to directory containing data files.
+     file_patterns : str or list of str, optional
+         File extensions to consider (e.g. ["parquet", "csv", "cwa"]).
+         If None, all supported formats are considered.
+
+     Returns
+     -------
+     list of Path
+         List of file paths found in the directory.
+     """
+     data_path = Path(data_path)
+
+     if not data_path.exists():
+         raise FileNotFoundError(f"Directory not found: {data_path}")
+
+     valid_file_patterns = ["parquet", "csv", "pkl", "pickle", "json", "avro", "cwa"]
+
+     if file_patterns is None:
+         file_patterns = valid_file_patterns
+     elif isinstance(file_patterns, str):
+         file_patterns = [file_patterns]
+
+     # Collect candidate files whose extension (without the dot) matches
+     all_files = [
+         f
+         for f in data_path.iterdir()
+         if f.is_file() and f.suffix[1:].lower() in file_patterns
+     ]
+
+     logger.info(f"Found {len(all_files)} data files in {data_path}")
+
+     return all_files
+
+
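# Usage sketch (editorial, not part of the diff): the memory-efficient
# pattern the docstring describes, processing one file at a time instead of
# loading everything. Paths and the process_dataframe helper are hypothetical.
from paradigma.load import get_data_file_paths, load_single_data_file

for path in get_data_file_paths("data/raw", file_patterns=["cwa", "avro"]):
    file_key, df = load_single_data_file(path)
    process_dataframe(file_key, df)  # hypothetical per-file processing step
    del df                           # free memory before the next file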
+ def load_single_data_file(
+     file_path: str | Path,
+ ) -> tuple[str, pd.DataFrame]:
+     """
+     Load a single data file with automatic format detection.
+
+     Parameters
+     ----------
+     file_path : str or Path
+         Path to data file.
+
+     Returns
+     -------
+     tuple
+         Tuple of (file_key, DataFrame), where file_key is the file name
+         without extension (and without a trailing "_meta" for TSDF files).
+     """
+     file_path = Path(file_path)
+
+     if not file_path.exists():
+         raise FileNotFoundError(f"File not found: {file_path}")
+
+     try:
+         file_format = detect_file_format(file_path)
+
+         if file_format == "tsdf":
+             # detect_file_format only maps .json files to "tsdf"; infer the
+             # prefix from the metadata file name
+             prefix = file_path.stem.replace("_meta", "")
+             df, _, _ = load_tsdf_data(file_path.parent, prefix)
+             return prefix, df
+
+         elif file_format == "empatica":
+             df = load_empatica_data(file_path)
+             return file_path.stem, df
+
+         elif file_format == "axivity":
+             df = load_axivity_data(file_path)
+             return file_path.stem, df
+
+         elif file_format == "prepared":
+             df = load_prepared_data(file_path)
+             prefix = file_path.stem.replace("_meta", "")
+             return prefix, df
+
+         else:
+             raise ValueError(f"Unknown file format for {file_path}")
+
+     except Exception as e:
+         logger.error(f"Failed to load {file_path}: {e}")
+         raise
+
+
+ def load_data_files(
+     data_path: str | Path,
+     file_patterns: list[str] | str | None = None,
+ ) -> dict[str, pd.DataFrame]:
+     """
+     Load all data files from a directory with automatic format detection.
+
+     Note: This function loads all files into memory at once. For large datasets,
+     consider using get_data_file_paths() and load_single_data_file() to process
+     files one at a time.
+
+     Parameters
+     ----------
+     data_path : str or Path
+         Path to directory containing data files.
+     file_patterns : str or list of str, optional
+         File extensions to consider (e.g. ["parquet", "csv", "cwa"]).
+         If None, all supported formats are considered.
+
+     Returns
+     -------
+     dict
+         Dictionary mapping file names (without extension) to DataFrames.
+     """
+     # Get all file paths
+     all_files = get_data_file_paths(data_path, file_patterns)
+
+     loaded_files = {}
+
+     # Load each file; failures are logged and skipped
+     for file_path in all_files:
+         try:
+             file_key, df = load_single_data_file(file_path)
+             loaded_files[file_key] = df
+         except Exception as e:
+             logger.warning(f"Failed to load {file_path}: {e}")
+
+     if len(loaded_files) == 0:
+         logger.warning("No data files were loaded.")
+     else:
+         logger.info(f"Successfully loaded {len(loaded_files)} files")
+
+     return loaded_files
+
+
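# Usage sketch (editorial, not part of the diff): bulk loading, suitable when
# everything fits in memory. The directory is hypothetical.
from paradigma.load import load_data_files

dataframes = load_data_files("data/prepared", file_patterns=["parquet"])
for file_key, df in dataframes.items():
    print(file_key, df.shape)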
+ def save_prepared_data(
+     df: pd.DataFrame,
+     file_path: str | Path,
+     file_format: str = "parquet",
+ ) -> None:
+     """
+     Save prepared data to file.
+
+     Parameters
+     ----------
+     df : pd.DataFrame
+         DataFrame to save.
+     file_path : str or Path
+         Path for output file.
+     file_format : str, default 'parquet'
+         Output format: 'parquet', 'csv', or 'pickle'.
+     """
+     file_path = Path(file_path)
+
+     # Ensure directory exists
+     file_path.parent.mkdir(parents=True, exist_ok=True)
+
+     if file_format == "parquet":
+         df.to_parquet(file_path, index=False)
+     elif file_format == "csv":
+         df.to_csv(file_path, index=False)
+     elif file_format == "pickle":
+         with open(file_path, "wb") as f:
+             pickle.dump(df, f)
+     else:
+         raise ValueError(
+             f"Unsupported file_format: {file_format}. "
+             f"Supported: 'parquet', 'csv', 'pickle'"
+         )
+
+     logger.info(f"Saved prepared data to {file_path}")