paradigma-1.0.3-py3-none-any.whl → paradigma-1.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- paradigma/__init__.py +10 -1
- paradigma/classification.py +38 -21
- paradigma/config.py +187 -123
- paradigma/constants.py +48 -35
- paradigma/feature_extraction.py +345 -255
- paradigma/load.py +476 -0
- paradigma/orchestrator.py +670 -0
- paradigma/pipelines/gait_pipeline.py +685 -246
- paradigma/pipelines/pulse_rate_pipeline.py +456 -155
- paradigma/pipelines/pulse_rate_utils.py +289 -248
- paradigma/pipelines/tremor_pipeline.py +405 -132
- paradigma/prepare_data.py +409 -0
- paradigma/preprocessing.py +500 -163
- paradigma/segmenting.py +180 -140
- paradigma/testing.py +370 -178
- paradigma/util.py +190 -101
- paradigma-1.1.0.dist-info/METADATA +229 -0
- paradigma-1.1.0.dist-info/RECORD +26 -0
- {paradigma-1.0.3.dist-info → paradigma-1.1.0.dist-info}/WHEEL +1 -1
- paradigma-1.1.0.dist-info/entry_points.txt +4 -0
- {paradigma-1.0.3.dist-info → paradigma-1.1.0.dist-info/licenses}/LICENSE +0 -1
- paradigma-1.0.3.dist-info/METADATA +0 -138
- paradigma-1.0.3.dist-info/RECORD +0 -22
paradigma/util.py
CHANGED
@@ -1,17 +1,45 @@
+import functools
 import os
+import warnings
+from datetime import datetime, timedelta
+
 import numpy as np
 import pandas as pd
-
+import tsdf
 from dateutil import parser
-from typing import List, Tuple, Optional
 from scipy.stats import gaussian_kde
-
-import tsdf
 from tsdf import TSDFMetadata
 
 from paradigma.constants import DataColumns, TimeUnit
 
 
+def deprecated(reason: str = ""):
+    """
+    Decorator to mark functions as deprecated. It will show a warning when the
+    function is used.
+
+    Parameters
+    ----------
+    reason : str, optional
+        Additional message to explain why it is deprecated and what to use
+        instead.
+    """
+
+    def decorator(func):
+        message = f"Function {func.__name__} is deprecated."
+        if reason:
+            message += f" {reason}"
+
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            warnings.warn(message, category=DeprecationWarning, stacklevel=2)
+            return func(*args, **kwargs)
+
+        return wrapper
+
+    return decorator
+
+
 def parse_iso8601_to_datetime(date_str):
     return parser.parse(date_str)
 
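The hunk above introduces a `deprecated` decorator. A minimal usage sketch (the decorated function `old_helper` and the suggested replacement are hypothetical; only the decorator itself comes from the diff):

import warnings

from paradigma.util import deprecated


@deprecated(reason="Use new_helper() instead.")
def old_helper(x):
    # Hypothetical function kept only for backwards compatibility.
    return x * 2


with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    result = old_helper(3)      # still returns 6
    print(caught[0].category)   # <class 'DeprecationWarning'>
    print(caught[0].message)    # Function old_helper is deprecated. Use new_helper() instead.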
@@ -28,7 +56,7 @@ def get_end_iso8601(start_iso8601, window_length_seconds):
 
 def write_np_data(
     metadata_time: TSDFMetadata,
-    np_array_time: np.ndarray,
+    np_array_time: np.ndarray,
     metadata_values: TSDFMetadata,
     np_array_values: np.ndarray,
     output_path: str,
@@ -53,7 +81,7 @@ def write_np_data(
         The filename for the metadata.
 
     """
-
+
     if not os.path.exists(output_path):
         os.makedirs(output_path)
 
@@ -62,9 +90,19 @@ def write_np_data(
     metadata_values.file_dir_path = output_path
 
     # store binaries and metadata
-    time_tsdf = tsdf.write_binary_file(
+    time_tsdf = tsdf.write_binary_file(
+        file_dir=output_path,
+        file_name=metadata_time.file_name,
+        data=np_array_time,
+        metadata=metadata_time.get_plain_tsdf_dict_copy(),
+    )
 
-    samples_tsdf = tsdf.write_binary_file(
+    samples_tsdf = tsdf.write_binary_file(
+        file_dir=output_path,
+        file_name=metadata_values.file_name,
+        data=np_array_values,
+        metadata=metadata_values.get_plain_tsdf_dict_copy(),
+    )
 
     tsdf.write_metadata([time_tsdf, samples_tsdf], output_filename)
 
@@ -118,7 +156,7 @@ def write_df_data(
 
 def read_metadata(
     input_path: str, meta_filename: str, time_filename: str, values_filename: str
-) ->
+) -> tuple[TSDFMetadata, TSDFMetadata]:
     metadata_dict = tsdf.load_metadata_from_path(
         os.path.join(input_path, meta_filename)
     )
@@ -127,20 +165,30 @@ def read_metadata(
     return metadata_time, metadata_values
 
 
-def load_tsdf_dataframe(
+def load_tsdf_dataframe(
+    path_to_data,
+    prefix,
+    meta_suffix="meta.json",
+    time_suffix="time.bin",
+    values_suffix="values.bin",
+):
     meta_filename = f"{prefix}_{meta_suffix}"
     time_filename = f"{prefix}_{time_suffix}"
     values_filename = f"{prefix}_{values_suffix}"
 
-    metadata_time, metadata_values = read_metadata(
-
+    metadata_time, metadata_values = read_metadata(
+        path_to_data, meta_filename, time_filename, values_filename
+    )
+    df = tsdf.load_dataframe_from_binaries(
+        [metadata_time, metadata_values], tsdf.constants.ConcatenationType.columns
+    )
 
     return df, metadata_time, metadata_values
 
 
 def load_metadata_list(
-    dir_path: str, meta_filename: str, filenames:
-) ->
+    dir_path: str, meta_filename: str, filenames: list[str]
+) -> list[TSDFMetadata]:
     """
     Load the metadata objects from a metadata file according to the specified binaries.
 
@@ -152,11 +200,9 @@ def load_metadata_list(
         The filename of the metadata file.
     filenames : List[str]
         The list of binary files of which the metadata files need to be loaded
-
-    """
-    metadata_dict = tsdf.load_metadata_from_path(
-        os.path.join(dir_path, meta_filename)
-    )
+
+    """
+    metadata_dict = tsdf.load_metadata_from_path(os.path.join(dir_path, meta_filename))
     metadata_list = []
     for filename in filenames:
         metadata_list.append(metadata_dict[filename])
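The hunk above gives `load_tsdf_dataframe` an explicit signature with configurable filename suffixes. A usage sketch (the data folder and file prefix are made up; the suffix defaults come from the signature above):

from paradigma.util import load_tsdf_dataframe

# Expects <prefix>_meta.json, <prefix>_time.bin and <prefix>_values.bin in the folder.
df, metadata_time, metadata_values = load_tsdf_dataframe(
    path_to_data="data/imu",       # hypothetical folder
    prefix="IMU_segment0001",      # hypothetical file prefix
)
print(df.head())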
@@ -171,7 +217,8 @@ def transform_time_array(
     start_time: float = 0.0,
 ) -> np.ndarray:
     """
-    Transforms the time array to relative time (when defined in delta time)
+    Transforms the time array to relative time (when defined in delta time)
+    and scales the values.
 
     Parameters
     ----------
@@ -180,7 +227,8 @@ def transform_time_array(
     input_unit_type : str
         The time unit type of the input time array.
     output_unit_type : str
-        The time unit type of the output time array. ParaDigMa expects
+        The time unit type of the output time array. ParaDigMa expects
+        `TimeUnit.RELATIVE_S`.
     start_time : float, optional
         The start time of the time array in UNIX seconds (default is 0.0)
 
@@ -191,41 +239,65 @@ def transform_time_array(
 
     Notes
     -----
-    - The function handles different time units (`TimeUnit.RELATIVE_MS`,
-
-
+    - The function handles different time units (`TimeUnit.RELATIVE_MS`,
+      `TimeUnit.RELATIVE_S`, `TimeUnit.ABSOLUTE_MS`, `TimeUnit.ABSOLUTE_S`,
+      `TimeUnit.DIFFERENCE_MS`, `TimeUnit.DIFFERENCE_S`).
+    - The transformation allows for scaling of the time array, converting
+      between time unit types (e.g., relative, absolute, or difference).
+    - When converting to `TimeUnit.RELATIVE_MS`, the function calculates the
+      relative time starting from the provided or default start time.
     """
-    input_units = input_unit_type.split(
-    output_units = output_unit_type.split(
+    input_units = input_unit_type.split("_")[-1].lower()
+    output_units = output_unit_type.split("_")[-1].lower()
 
     if input_units == output_units:
         scale_factor = 1
-    elif input_units ==
+    elif input_units == "s" and output_units == "ms":
         scale_factor = 1e3
-    elif input_units ==
+    elif input_units == "ms" and output_units == "s":
         scale_factor = 1 / 1e3
     else:
-        raise ValueError(
-
-
-
-    #
+        raise ValueError(
+            f"Unsupported time units conversion: {input_units} to {output_units}"
+        )
+
+    # Transform to relative time (`TimeUnit.RELATIVE_MS`)
+    if (
+        input_unit_type == TimeUnit.DIFFERENCE_MS
+        or input_unit_type == TimeUnit.DIFFERENCE_S
+    ):
+        # Convert a series of differences into cumulative sum to
+        # reconstruct original time series.
         time_array = np.cumsum(np.double(time_array))
-    elif
+    elif (
+        input_unit_type == TimeUnit.ABSOLUTE_MS
+        or input_unit_type == TimeUnit.ABSOLUTE_S
+    ):
         # Set the start time if not provided.
         if np.isclose(start_time, 0.0, rtol=1e-09, atol=1e-09):
             start_time = time_array[0]
         # Convert absolute time stamps into a time series relative to start_time.
-        time_array =
-
-    # Transform the time array from `TimeUnit.RELATIVE_MS` to the
-
+        time_array = time_array - start_time
+
+    # Transform the time array from `TimeUnit.RELATIVE_MS` to the
+    # specified time unit type
+    if (
+        output_unit_type == TimeUnit.ABSOLUTE_MS
+        or output_unit_type == TimeUnit.ABSOLUTE_S
+    ):
         # Converts time array to absolute time by adding the start time to each element.
         time_array = time_array + start_time
-    elif
-
+    elif (
+        output_unit_type == TimeUnit.DIFFERENCE_MS
+        or output_unit_type == TimeUnit.DIFFERENCE_S
+    ):
+        # Creates a new array starting with 0, followed by the
+        # differences between consecutive elements.
         time_array = np.diff(np.insert(time_array, 0, start_time))
-    elif
+    elif (
+        output_unit_type == TimeUnit.RELATIVE_MS
+        or output_unit_type == TimeUnit.RELATIVE_S
+    ):
        # The array is already in relative format, do nothing.
        pass
 
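The rewritten branches above decide how the time axis is converted. The core arithmetic can be illustrated in isolation with plain NumPy (a sketch of the same logic, not a call into ParaDigMa; the sample values are made up):

import numpy as np

# Absolute UNIX timestamps in seconds -> time relative to the first sample.
absolute_s = np.array([1_700_000_000.00, 1_700_000_000.01, 1_700_000_000.02])
relative_s = absolute_s - absolute_s[0]              # [0.0, 0.01, 0.02]

# Per-sample differences -> relative time reconstructed via the cumulative sum.
difference_s = np.array([0.01, 0.01, 0.01])
reconstructed = np.cumsum(np.double(difference_s))   # [0.01, 0.02, 0.03]

# Scaling between seconds and milliseconds, as in the scale_factor branches.
relative_ms = relative_s * 1e3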
@@ -256,25 +328,25 @@ def convert_units_accelerometer(data: np.ndarray, units: str) -> np.ndarray:
         return data
     else:
         raise ValueError(f"Unsupported unit: {units}")
-
+
 
 def convert_units_gyroscope(data: np.ndarray, units: str) -> np.ndarray:
     """
     Convert gyroscope data to deg/s.
-
+
     Parameters
     ----------
     data : np.ndarray
         The gyroscope data.
-
+
     units : str
         The unit of the data (currently supports deg/s and rad/s).
-
+
     Returns
     -------
     np.ndarray
         The gyroscope data in deg/s.
-
+
     """
     if units == "deg/s":
         return data
@@ -282,9 +354,9 @@ def convert_units_gyroscope(data: np.ndarray, units: str) -> np.ndarray:
         return np.degrees(data)
     else:
         raise ValueError(f"Unsupported unit: {units}")
-
 
-
+
+def invert_watch_side(df: pd.DataFrame, side: str, sensor="both") -> np.ndarray:
     """
     Invert the data based on the watch side.
 
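A short usage sketch of the gyroscope unit conversion touched above (the input values are made up):

import numpy as np

from paradigma.util import convert_units_gyroscope

rad_per_s = np.array([0.0, np.pi / 2, np.pi])
deg_per_s = convert_units_gyroscope(rad_per_s, units="rad/s")  # [0., 90., 180.]
unchanged = convert_units_gyroscope(deg_per_s, units="deg/s")  # returned unchanged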
@@ -305,78 +377,88 @@ def invert_watch_side(df: pd.DataFrame, side: str, sensor='both') -> np.ndarray:
     """
     if side not in ["left", "right"]:
         raise ValueError(f"Unsupported side: {side}")
-    if sensor not in [
+    if sensor not in ["accelerometer", "gyroscope", "both"]:
         raise ValueError(f"Unsupported sensor: {sensor}")
 
     elif side == "right":
-        if sensor in [
+        if sensor in ["gyroscope", "both"]:
             df[DataColumns.GYROSCOPE_Y] *= -1
             df[DataColumns.GYROSCOPE_Z] *= -1
-        if sensor in [
+        if sensor in ["accelerometer", "both"]:
             df[DataColumns.ACCELEROMETER_X] *= -1
 
     return df
 
-
+
+def aggregate_parameter(
+    parameter: np.ndarray,
+    aggregate: str,
+    evaluation_points: np.ndarray | None = None,
+) -> np.ndarray | int:
     """
     Aggregate a parameter based on the specified method.
-
+
     Parameters
     ----------
     parameter : np.ndarray
         The parameter to aggregate.
-
+
     aggregate : str
         The aggregation method to apply.
 
     evaluation_points : np.ndarray, optional
-        Should be specified if the mode is derived for a continuous parameter.
-        Defines the evaluation points for the kernel density estimation
+        Should be specified if the mode is derived for a continuous parameter.
+        Defines the evaluation points for the kernel density estimation
+        function, from which the maximum is derived as the mode.
 
     Returns
     -------
     np.ndarray
         The aggregated parameter.
     """
-    if aggregate ==
+    if aggregate == "mean":
         return np.mean(parameter)
-    elif aggregate ==
+    elif aggregate == "median":
         return np.median(parameter)
-    elif aggregate ==
+    elif aggregate == "mode_binned":
         if evaluation_points is None:
-            raise ValueError(
+            raise ValueError(
+                "evaluation_points must be provided for 'mode_binned' aggregation."
+            )
         else:
             kde = gaussian_kde(parameter)
             kde_values = kde(evaluation_points)
             max_index = np.argmax(kde_values)
             return evaluation_points[max_index]
-    elif aggregate ==
+    elif aggregate == "mode":
         unique_values, counts = np.unique(parameter, return_counts=True)
         return unique_values[np.argmax(counts)]
-    elif aggregate ==
+    elif aggregate == "90p":
         return np.percentile(parameter, 90)
-    elif aggregate ==
+    elif aggregate == "95p":
         return np.percentile(parameter, 95)
-    elif aggregate ==
+    elif aggregate == "99p":
         return np.percentile(parameter, 99)
-    elif aggregate ==
+    elif aggregate == "std":
         return np.std(parameter)
-    elif aggregate ==
+    elif aggregate == "cov":
         mean_value = np.mean(parameter)
         return np.std(parameter) / mean_value if mean_value != 0 else 0
     else:
         raise ValueError(f"Invalid aggregation method: {aggregate}")
 
+
 def merge_predictions_with_timestamps(
-
-
-
-
-
-
+    df_ts: pd.DataFrame,
+    df_predictions: pd.DataFrame,
+    pred_proba_colname: str,
+    window_length_s: float,
+    fs: int,
+) -> pd.DataFrame:
     """
-    Merges prediction probabilities with timestamps by expanding overlapping
-    into individual timestamps and averaging probabilities per unique
+    Merges prediction probabilities with timestamps by expanding overlapping
+    windows into individual timestamps and averaging probabilities per unique
+    timestamp.
 
     Parameters:
     ----------
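The hunk above adds `aggregate_parameter`, which maps an aggregation name to a NumPy/SciPy reduction. A usage sketch (the input array is made up):

import numpy as np

from paradigma.util import aggregate_parameter

values = np.array([18.0, 22.0, 25.0, 30.0, 21.0])

print(aggregate_parameter(values, "median"))  # 22.0
print(aggregate_parameter(values, "95p"))     # 95th percentile
print(aggregate_parameter(values, "cov"))     # std / mean

# 'mode_binned' estimates the mode of a continuous parameter with a kernel
# density estimate evaluated on a user-supplied grid.
grid = np.linspace(15, 35, 201)
print(aggregate_parameter(values, "mode_binned", evaluation_points=grid))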
@@ -385,10 +467,11 @@ def merge_predictions_with_timestamps(
         Must include the timestamp column specified in `DataColumns.TIME`.
 
     df_predictions : pd.DataFrame
-        DataFrame containing prediction windows with start times and
-        Must include:
+        DataFrame containing prediction windows with start times and
+        probabilities. Must include:
         - A column for window start times (defined by `DataColumns.TIME`).
-        - A column for prediction probabilities (defined by
+        - A column for prediction probabilities (defined by
+          `DataColumns.PRED_GAIT_PROBA`).
 
     pred_proba_colname : str
         The column name for the prediction probabilities in `df_predictions`.
@@ -398,7 +481,7 @@
 
     fs : int
         The sampling frequency of the data.
-
+
     Returns:
     -------
     pd.DataFrame
@@ -419,22 +502,18 @@ def merge_predictions_with_timestamps(
     # Step 1: Generate all timestamps for prediction windows using NumPy broadcasting
     window_length = int(window_length_s * fs)
     timestamps = (
-        df_predictions[DataColumns.TIME].values[:, None]
-        np.arange(0, window_length) / fs
+        df_predictions[DataColumns.TIME].values[:, None]
+        + np.arange(0, window_length) / fs
     )
-
+
     # Flatten timestamps and probabilities into a single array for efficient processing
     flat_timestamps = timestamps.ravel()
-    flat_proba = np.repeat(
-        df_predictions[pred_proba_colname].values,
-        window_length
-    )
+    flat_proba = np.repeat(df_predictions[pred_proba_colname].values, window_length)
 
     # Step 2: Create a DataFrame for expanded data
-    expanded_df = pd.DataFrame(
-        DataColumns.TIME: flat_timestamps,
-
-    })
+    expanded_df = pd.DataFrame(
+        {DataColumns.TIME: flat_timestamps, pred_proba_colname: flat_proba}
+    )
 
     # Step 3: Round timestamps and aggregate probabilities
     expanded_df[DataColumns.TIME] = expanded_df[DataColumns.TIME].round(2)
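The broadcasting step above expands each prediction window into per-sample timestamps. The same operation in isolation (window starts, probabilities, and sampling settings are made up):

import numpy as np

fs = 100                     # samples per second
window_length_s = 0.04       # 4-sample windows, for illustration
window_length = int(window_length_s * fs)

window_starts = np.array([0.0, 0.02])   # start time of each prediction window
proba = np.array([0.9, 0.2])            # one probability per window

# Each window start expands into window_length per-sample timestamps.
timestamps = window_starts[:, None] + np.arange(0, window_length) / fs
flat_timestamps = timestamps.ravel()          # ~[0, 0.01, 0.02, 0.03, 0.02, 0.03, 0.04, 0.05]
flat_proba = np.repeat(proba, window_length)  # [0.9, 0.9, 0.9, 0.9, 0.2, 0.2, 0.2, 0.2]
# Overlapping timestamps (0.02, 0.03) are then rounded and averaged per unique timestamp.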
@@ -442,14 +521,15 @@ def merge_predictions_with_timestamps(
 
     # Step 4: Round timestamps in `df_ts` and merge
     df_ts[DataColumns.TIME] = df_ts[DataColumns.TIME].round(2)
-    df_ts = pd.merge(df_ts, mean_proba, how=
+    df_ts = pd.merge(df_ts, mean_proba, how="left", on=DataColumns.TIME)
     df_ts = df_ts.dropna(subset=[pred_proba_colname])
 
     return df_ts
 
 
-def select_hours(
-
+def select_hours(
+    df: pd.DataFrame, select_hours_start: str, select_hours_end: str
+) -> pd.DataFrame:
     """
     Select hours of interest from the data to include in the aggregation step.
 
@@ -460,7 +540,7 @@ def select_hours(df: pd.DataFrame, select_hours_start: str, select_hours_end: st
 
     select_hours_start: str
         The start time of the selected hours in "HH:MM" format.
-
+
     select_hours_end: str
        The end time of the selected hours in "HH:MM" format.
 
@@ -471,14 +551,18 @@ def select_hours(df: pd.DataFrame, select_hours_start: str, select_hours_end: st
 
     """
 
-    select_hours_start = datetime.strptime(
-
-
+    select_hours_start = datetime.strptime(
+        select_hours_start, "%H:%M"
+    ).time()  # convert to time object
+    select_hours_end = datetime.strptime(select_hours_end, "%H:%M").time()
+    df_subset = df[
+        df["time_dt"].dt.time.between(select_hours_start, select_hours_end)
+    ]  # select the hours of interest
 
     return df_subset
 
-def select_days(df: pd.DataFrame, min_hours_per_day: int) -> pd.DataFrame:
 
+def select_days(df: pd.DataFrame, min_hours_per_day: int) -> pd.DataFrame:
     """
     Select days of interest from the data to include in the aggregation step.
 
@@ -488,7 +572,8 @@ def select_days(df: pd.DataFrame, min_hours_per_day: int) -> pd.DataFrame:
         Input data with column 'time_dt' in which the date is stored.
 
     min_hours_per_day: int
-        The minimum number of hours per day required for including the day
+        The minimum number of hours per day required for including the day
+        in the aggregation step.
 
 
     Returns
@@ -499,8 +584,12 @@ def select_days(df: pd.DataFrame, min_hours_per_day: int) -> pd.DataFrame:
     """
 
     min_s_per_day = min_hours_per_day * 3600
-    window_length_s =
+    window_length_s = (
+        df["time_dt"].diff().dt.total_seconds().iloc[1]
+    )  # determine the length of the first window in seconds
     min_windows_per_day = min_s_per_day / window_length_s
-    df_subset = df.groupby(df[
+    df_subset = df.groupby(df["time_dt"].dt.date).filter(
+        lambda x: len(x) >= min_windows_per_day
+    )
 
-    return df_subset
+    return df_subset
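Finally, a usage sketch of the two selection helpers changed above (the DataFrame is made up; both functions expect a datetime column named 'time_dt'):

import pandas as pd

from paradigma.util import select_days, select_hours

# One row every 30 minutes over three days, for illustration.
df = pd.DataFrame(
    {"time_dt": pd.date_range("2024-01-01 00:00", periods=3 * 48, freq="30min")}
)

df_daytime = select_hours(df, select_hours_start="08:00", select_hours_end="22:00")
df_full_days = select_days(df_daytime, min_hours_per_day=10)
print(len(df), len(df_daytime), len(df_full_days))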