paradigma 1.0.3__py3-none-any.whl → 1.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- paradigma/classification.py +28 -11
- paradigma/config.py +157 -102
- paradigma/constants.py +39 -34
- paradigma/feature_extraction.py +270 -211
- paradigma/pipelines/gait_pipeline.py +232 -184
- paradigma/pipelines/pulse_rate_pipeline.py +202 -133
- paradigma/pipelines/pulse_rate_utils.py +144 -142
- paradigma/pipelines/tremor_pipeline.py +138 -85
- paradigma/preprocessing.py +179 -110
- paradigma/segmenting.py +138 -113
- paradigma/testing.py +359 -172
- paradigma/util.py +158 -83
- {paradigma-1.0.3.dist-info → paradigma-1.0.4.dist-info}/METADATA +31 -29
- paradigma-1.0.4.dist-info/RECORD +23 -0
- {paradigma-1.0.3.dist-info → paradigma-1.0.4.dist-info}/WHEEL +1 -1
- paradigma-1.0.4.dist-info/entry_points.txt +4 -0
- {paradigma-1.0.3.dist-info → paradigma-1.0.4.dist-info/licenses}/LICENSE +0 -1
- paradigma-1.0.3.dist-info/RECORD +0 -22
paradigma/segmenting.py
CHANGED
|
@@ -1,25 +1,26 @@
|
|
|
1
|
-
|
|
1
|
+
from typing import List
|
|
2
|
+
|
|
2
3
|
import numpy as np
|
|
4
|
+
import pandas as pd
|
|
3
5
|
|
|
4
|
-
from typing import List
|
|
5
6
|
from paradigma.constants import DataColumns
|
|
7
|
+
from paradigma.util import deprecated
|
|
6
8
|
|
|
7
|
-
import numpy as np
|
|
8
9
|
|
|
9
10
|
def tabulate_windows(
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
11
|
+
df: pd.DataFrame,
|
|
12
|
+
columns: List[str],
|
|
13
|
+
window_length_s: float,
|
|
14
|
+
window_step_length_s: float,
|
|
15
|
+
fs: int,
|
|
16
|
+
) -> np.ndarray:
|
|
16
17
|
"""
|
|
17
18
|
Split the given DataFrame into overlapping windows of specified length and step size.
|
|
18
19
|
|
|
19
20
|
This function extracts windows of data from the specified columns of the DataFrame, based on
|
|
20
21
|
the window length and step size provided in the configuration. The windows are returned in
|
|
21
22
|
a 3D NumPy array, where the first dimension represents the window index, the second dimension
|
|
22
|
-
represents the time steps within the window, and the third dimension represents the columns
|
|
23
|
+
represents the time steps within the window, and the third dimension represents the columns
|
|
23
24
|
of the data.
|
|
24
25
|
|
|
25
26
|
Parameters
|
|
@@ -42,7 +43,7 @@ def tabulate_windows(
|
|
|
42
43
|
- `n_windows` is the number of windows that can be formed from the data.
|
|
43
44
|
- `window_size` is the length of each window in terms of the number of time steps.
|
|
44
45
|
- `n_columns` is the number of columns in the input DataFrame specified by `columns`.
|
|
45
|
-
|
|
46
|
+
|
|
46
47
|
If the length of the data is shorter than the specified window size, an empty array is returned.
|
|
47
48
|
|
|
48
49
|
Notes
|
|
@@ -66,12 +67,14 @@ def tabulate_windows(
|
|
|
66
67
|
|
|
67
68
|
# Check if data length is sufficient
|
|
68
69
|
if len(data) < window_size:
|
|
69
|
-
return np.empty(
|
|
70
|
-
|
|
70
|
+
return np.empty(
|
|
71
|
+
(0, window_size, n_columns)
|
|
72
|
+
) # Return an empty array if insufficient data
|
|
73
|
+
|
|
71
74
|
windows = np.lib.stride_tricks.sliding_window_view(
|
|
72
75
|
data, window_shape=(window_size, n_columns)
|
|
73
|
-
|
|
74
|
-
|
|
76
|
+
)[::window_step_size].squeeze()
|
|
77
|
+
|
|
75
78
|
# Ensure 3D shape (n_windows, window_size, n_columns)
|
|
76
79
|
if windows.ndim == 2: # Single window case
|
|
77
80
|
windows = windows[np.newaxis, :, :] # Add a new axis at the start
|
|
@@ -79,88 +82,108 @@ def tabulate_windows(
|
|
|
79
82
|
return windows
|
|
80
83
|
|
|
81
84
|
|
|
82
|
-
def tabulate_windows_legacy(config, df, agg_func=
|
|
85
|
+
def tabulate_windows_legacy(config, df, agg_func="first"):
|
|
83
86
|
"""
|
|
84
87
|
Efficiently creates a windowed dataframe from the input dataframe using vectorized operations.
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
88
|
+
|
|
89
|
+
Parameters
|
|
90
|
+
----------
|
|
91
|
+
config : object
|
|
92
|
+
A configuration object containing:
|
|
93
|
+
- `window_length_s`: The number of seconds per window.
|
|
94
|
+
- `window_step_length_s`: The number of seconds to shift between windows.
|
|
95
|
+
- `sampling_frequency`: The sampling frequency in Hz.
|
|
96
|
+
- `single_value_colnames`: List of column names where a single value (e.g., mean) is needed.
|
|
97
|
+
- `list_value_colnames`: List of column names where all values within each window should be stored in a list.
|
|
98
|
+
agg_func : str or callable, optional
|
|
99
|
+
Aggregation function for single-value columns. Can be 'mean', 'first', or a custom callable.
|
|
100
|
+
Default is 'first'.
|
|
101
|
+
|
|
102
|
+
Returns
|
|
103
|
+
-------
|
|
104
|
+
pd.DataFrame
|
|
105
|
+
A new DataFrame where each row corresponds to a window, containing:
|
|
106
|
+
- `window_nr`: The window number (starting from 1).
|
|
107
|
+
- `window_start`: The start time of the window.
|
|
108
|
+
- `window_end`: The end time of the window.
|
|
109
|
+
- Aggregated values for `single_value_colnames`.
|
|
110
|
+
- Lists of values for `list_value_colnames`.
|
|
111
|
+
|
|
96
112
|
"""
|
|
97
|
-
# If
|
|
98
|
-
if config.
|
|
99
|
-
config.
|
|
100
|
-
if config.
|
|
101
|
-
config.
|
|
113
|
+
# If single_value_colnames or list_value_colnames is None, default to an empty list
|
|
114
|
+
if config.single_value_colnames is None:
|
|
115
|
+
config.single_value_colnames = []
|
|
116
|
+
if config.list_value_colnames is None:
|
|
117
|
+
config.list_value_colnames = []
|
|
102
118
|
|
|
103
119
|
window_length = int(config.window_length_s * config.sampling_frequency)
|
|
104
120
|
window_step_size = int(config.window_step_length_s * config.sampling_frequency)
|
|
105
121
|
|
|
106
122
|
n_rows = len(df)
|
|
107
123
|
if window_length > n_rows:
|
|
108
|
-
raise ValueError(
|
|
109
|
-
|
|
110
|
-
|
|
124
|
+
raise ValueError(
|
|
125
|
+
f"Window size ({window_length}) cannot be greater than the number of rows ({n_rows}) in the dataframe."
|
|
126
|
+
)
|
|
127
|
+
|
|
128
|
+
# Create indices for window start positions
|
|
111
129
|
window_starts = np.arange(0, n_rows - window_length + 1, window_step_size)
|
|
112
|
-
|
|
130
|
+
|
|
113
131
|
# Prepare the result for the final DataFrame
|
|
114
132
|
result = []
|
|
115
|
-
|
|
133
|
+
|
|
116
134
|
# Handle single value columns with vectorized operations
|
|
117
135
|
agg_func_map = {
|
|
118
|
-
|
|
119
|
-
|
|
136
|
+
"mean": np.mean,
|
|
137
|
+
"first": lambda x: x[0],
|
|
120
138
|
}
|
|
121
139
|
|
|
122
140
|
# Check if agg_func is a callable (custom function) or get the function from the map
|
|
123
141
|
if callable(agg_func):
|
|
124
142
|
agg_func_np = agg_func
|
|
125
143
|
else:
|
|
126
|
-
agg_func_np = agg_func_map.get(
|
|
144
|
+
agg_func_np = agg_func_map.get(
|
|
145
|
+
agg_func, agg_func_map["mean"]
|
|
146
|
+
) # Default to 'mean' if agg_func is not recognized
|
|
127
147
|
|
|
128
|
-
|
|
129
148
|
for window_nr, start in enumerate(window_starts, 1):
|
|
130
149
|
end = start + window_length
|
|
131
150
|
window = df.iloc[start:end]
|
|
132
151
|
|
|
133
152
|
agg_data = {
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
153
|
+
"window_nr": window_nr,
|
|
154
|
+
"window_start": window[DataColumns.TIME].iloc[0],
|
|
155
|
+
"window_end": window[DataColumns.TIME].iloc[-1],
|
|
137
156
|
}
|
|
138
|
-
|
|
157
|
+
|
|
139
158
|
# Aggregate single-value columns
|
|
140
|
-
for col in config.
|
|
159
|
+
for col in config.single_value_colnames:
|
|
141
160
|
if col in window.columns: # Only process columns that exist in the window
|
|
142
161
|
agg_data[col] = agg_func_np(window[col].values)
|
|
143
|
-
|
|
162
|
+
|
|
144
163
|
# Collect list-value columns efficiently using numpy slicing
|
|
145
|
-
for col in config.
|
|
164
|
+
for col in config.list_value_colnames:
|
|
146
165
|
if col in window.columns: # Only process columns that exist in the window
|
|
147
166
|
agg_data[col] = window[col].values.tolist()
|
|
148
167
|
|
|
149
168
|
result.append(agg_data)
|
|
150
|
-
|
|
169
|
+
|
|
151
170
|
# Convert result list into a DataFrame
|
|
152
171
|
windowed_df = pd.DataFrame(result)
|
|
153
|
-
|
|
172
|
+
|
|
154
173
|
# Ensure the column order is as desired: window_nr, window_start, window_end, then the single-value and list-value columns
|
|
155
|
-
desired_order =
|
|
156
|
-
|
|
174
|
+
desired_order = (
|
|
175
|
+
["window_nr", "window_start", "window_end"]
|
|
176
|
+
+ config.single_value_colnames
|
|
177
|
+
+ config.list_value_colnames
|
|
178
|
+
)
|
|
179
|
+
|
|
157
180
|
return windowed_df[desired_order]
|
|
158
181
|
|
|
159
182
|
|
|
160
183
|
def create_segments(
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
184
|
+
time_array: np.ndarray,
|
|
185
|
+
max_segment_gap_s: float,
|
|
186
|
+
):
|
|
164
187
|
# Calculate the difference between consecutive time values
|
|
165
188
|
time_diff = np.diff(time_array, prepend=0.0)
|
|
166
189
|
|
|
@@ -174,17 +197,17 @@ def create_segments(
|
|
|
174
197
|
|
|
175
198
|
|
|
176
199
|
def discard_segments(
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
200
|
+
df: pd.DataFrame,
|
|
201
|
+
segment_nr_colname: str,
|
|
202
|
+
min_segment_length_s: float,
|
|
203
|
+
fs: int,
|
|
204
|
+
format: str = "timestamps",
|
|
205
|
+
) -> pd.DataFrame:
|
|
183
206
|
"""
|
|
184
207
|
Remove segments smaller than a specified size and reset segment enumeration.
|
|
185
208
|
|
|
186
|
-
This function filters out segments from the DataFrame that are smaller than a
|
|
187
|
-
given minimum size, based on the configuration. After removing small segments,
|
|
209
|
+
This function filters out segments from the DataFrame that are smaller than a
|
|
210
|
+
given minimum size, based on the configuration. After removing small segments,
|
|
188
211
|
the segment numbers are reset to start from 1.
|
|
189
212
|
|
|
190
213
|
Parameters
|
|
@@ -201,7 +224,7 @@ def discard_segments(
|
|
|
201
224
|
Returns
|
|
202
225
|
-------
|
|
203
226
|
pd.DataFrame
|
|
204
|
-
A filtered DataFrame where small segments have been removed and segment
|
|
227
|
+
A filtered DataFrame where small segments have been removed and segment
|
|
205
228
|
numbers have been reset to start from 1.
|
|
206
229
|
|
|
207
230
|
Example
|
|
@@ -221,17 +244,17 @@ def discard_segments(
|
|
|
221
244
|
# 4 2 4
|
|
222
245
|
"""
|
|
223
246
|
# Minimum segment size in number of samples
|
|
224
|
-
if format ==
|
|
247
|
+
if format == "timestamps":
|
|
225
248
|
min_samples = min_segment_length_s * fs
|
|
226
|
-
elif format ==
|
|
249
|
+
elif format == "windows":
|
|
227
250
|
min_samples = min_segment_length_s
|
|
228
251
|
else:
|
|
229
252
|
raise ValueError("Invalid format. Must be 'timestamps' or 'windows'.")
|
|
230
253
|
|
|
231
254
|
# Group by segment and filter out small segments in one step
|
|
232
255
|
valid_segment_mask = (
|
|
233
|
-
df.groupby(segment_nr_colname)[segment_nr_colname]
|
|
234
|
-
|
|
256
|
+
df.groupby(segment_nr_colname)[segment_nr_colname].transform("size")
|
|
257
|
+
>= min_samples
|
|
235
258
|
)
|
|
236
259
|
|
|
237
260
|
df = df[valid_segment_mask].copy()
|
|
@@ -246,18 +269,19 @@ def discard_segments(
|
|
|
246
269
|
return df
|
|
247
270
|
|
|
248
271
|
|
|
249
|
-
|
|
272
|
+
@deprecated("This will be removed in v1.1.")
|
|
273
|
+
def categorize_segments(df, fs, format="timestamps", window_step_length_s=None):
|
|
250
274
|
"""
|
|
251
275
|
Categorize segments based on their duration.
|
|
252
276
|
|
|
253
|
-
This function categorizes segments into four categories based on their duration
|
|
277
|
+
This function categorizes segments into four categories based on their duration
|
|
254
278
|
in seconds. The categories are defined as:
|
|
255
279
|
- Category 1: Segments shorter than 5 seconds
|
|
256
280
|
- Category 2: Segments between 5 and 10 seconds
|
|
257
281
|
- Category 3: Segments between 10 and 20 seconds
|
|
258
282
|
- Category 4: Segments longer than 20 seconds
|
|
259
283
|
|
|
260
|
-
The duration of each segment is calculated based on the sampling frequency and
|
|
284
|
+
The duration of each segment is calculated based on the sampling frequency and
|
|
261
285
|
the number of rows (data points) in the segment.
|
|
262
286
|
|
|
263
287
|
Parameters
|
|
@@ -278,44 +302,43 @@ def categorize_segments(df, fs, format='timestamps', window_step_length_s=None):
|
|
|
278
302
|
- 'long' for segments between 10 and 20 seconds
|
|
279
303
|
- 'very_long' for segments > 20 seconds
|
|
280
304
|
"""
|
|
281
|
-
if format ==
|
|
305
|
+
if format == "windows" and window_step_length_s is None:
|
|
282
306
|
raise ValueError("Window step length must be provided for 'windows' format.")
|
|
283
|
-
|
|
307
|
+
|
|
284
308
|
# Define duration thresholds in seconds
|
|
285
|
-
d_max_duration = {
|
|
286
|
-
|
|
287
|
-
'moderately_long': 10,
|
|
288
|
-
'long': 20
|
|
289
|
-
}
|
|
290
|
-
|
|
309
|
+
d_max_duration = {"short": 5, "moderately_long": 10, "long": 20}
|
|
310
|
+
|
|
291
311
|
# Convert thresholds to rows if format is 'timestamps'
|
|
292
|
-
if format ==
|
|
312
|
+
if format == "timestamps":
|
|
293
313
|
d_max_duration = {k: v * fs for k, v in d_max_duration.items()}
|
|
294
314
|
|
|
295
315
|
# Count rows per segment
|
|
296
316
|
segment_sizes = df[DataColumns.SEGMENT_NR].value_counts()
|
|
297
317
|
|
|
298
318
|
# Convert segment sizes to duration in seconds
|
|
299
|
-
if format ==
|
|
319
|
+
if format == "windows":
|
|
300
320
|
segment_sizes *= window_step_length_s
|
|
301
321
|
|
|
302
322
|
# Group by the segment column and apply the categorization
|
|
303
323
|
def categorize(segment_size):
|
|
304
|
-
if segment_size < d_max_duration[
|
|
305
|
-
return
|
|
306
|
-
elif segment_size < d_max_duration[
|
|
307
|
-
return
|
|
308
|
-
elif segment_size < d_max_duration[
|
|
309
|
-
return
|
|
324
|
+
if segment_size < d_max_duration["short"]:
|
|
325
|
+
return "short"
|
|
326
|
+
elif segment_size < d_max_duration["moderately_long"]:
|
|
327
|
+
return "moderately_long"
|
|
328
|
+
elif segment_size < d_max_duration["long"]:
|
|
329
|
+
return "long"
|
|
310
330
|
else:
|
|
311
|
-
return
|
|
331
|
+
return "very_long"
|
|
312
332
|
|
|
313
333
|
# Apply categorization to the DataFrame
|
|
314
|
-
return
|
|
334
|
+
return (
|
|
335
|
+
df[DataColumns.SEGMENT_NR].map(segment_sizes).map(categorize).astype("category")
|
|
336
|
+
)
|
|
337
|
+
|
|
315
338
|
|
|
316
339
|
class WindowedDataExtractor:
|
|
317
340
|
"""
|
|
318
|
-
A utility class for extracting specific column indices and slices
|
|
341
|
+
A utility class for extracting specific column indices and slices
|
|
319
342
|
from a list of windowed column names.
|
|
320
343
|
|
|
321
344
|
Attributes
|
|
@@ -325,31 +348,31 @@ class WindowedDataExtractor:
|
|
|
325
348
|
|
|
326
349
|
Methods
|
|
327
350
|
-------
|
|
328
|
-
get_index(
|
|
329
|
-
Returns the index of a specific column.
|
|
330
|
-
get_slice(
|
|
331
|
-
Returns a slice object for a range of consecutive
|
|
351
|
+
get_index(colname)
|
|
352
|
+
Returns the index of a specific column name.
|
|
353
|
+
get_slice(colnames)
|
|
354
|
+
Returns a slice object for a range of consecutive column names.
|
|
332
355
|
"""
|
|
333
356
|
|
|
334
|
-
def __init__(self,
|
|
357
|
+
def __init__(self, windowed_colnames: List[str]):
|
|
335
358
|
"""
|
|
336
359
|
Initialize the WindowedDataExtractor.
|
|
337
360
|
|
|
338
361
|
Parameters
|
|
339
362
|
----------
|
|
340
|
-
|
|
363
|
+
windowed_colnames : list of str
|
|
341
364
|
A list of column names in the windowed data.
|
|
342
365
|
|
|
343
366
|
Raises
|
|
344
367
|
------
|
|
345
368
|
ValueError
|
|
346
|
-
If the list of `
|
|
369
|
+
If the list of `windowed_colnames` is empty.
|
|
347
370
|
"""
|
|
348
|
-
if not
|
|
371
|
+
if not windowed_colnames:
|
|
349
372
|
raise ValueError("The list of windowed columns cannot be empty.")
|
|
350
|
-
self.column_indices = {col: idx for idx, col in enumerate(
|
|
373
|
+
self.column_indices = {col: idx for idx, col in enumerate(windowed_colnames)}
|
|
351
374
|
|
|
352
|
-
def get_index(self,
|
|
375
|
+
def get_index(self, colname: str) -> int:
|
|
353
376
|
"""
|
|
354
377
|
Get the index of a specific column.
|
|
355
378
|
|
|
@@ -366,19 +389,19 @@ class WindowedDataExtractor:
|
|
|
366
389
|
Raises
|
|
367
390
|
------
|
|
368
391
|
ValueError
|
|
369
|
-
If the column is not found in the `
|
|
392
|
+
If the column is not found in the `windowed_colnames` list.
|
|
370
393
|
"""
|
|
371
|
-
if
|
|
372
|
-
raise ValueError(f"Column '{
|
|
373
|
-
return self.column_indices[
|
|
394
|
+
if colname not in self.column_indices:
|
|
395
|
+
raise ValueError(f"Column name '{colname}' not found in windowed_colnames.")
|
|
396
|
+
return self.column_indices[colname]
|
|
374
397
|
|
|
375
|
-
def get_slice(self,
|
|
398
|
+
def get_slice(self, colnames: List[str]) -> slice:
|
|
376
399
|
"""
|
|
377
400
|
Get a slice object for a range of consecutive columns.
|
|
378
401
|
|
|
379
402
|
Parameters
|
|
380
403
|
----------
|
|
381
|
-
|
|
404
|
+
colnames : list of str
|
|
382
405
|
A list of consecutive column names to define the slice.
|
|
383
406
|
|
|
384
407
|
Returns
|
|
@@ -389,11 +412,13 @@ class WindowedDataExtractor:
|
|
|
389
412
|
Raises
|
|
390
413
|
------
|
|
391
414
|
ValueError
|
|
392
|
-
If one or more columns in `
|
|
415
|
+
If one or more columns in `colnames` are not found in the `windowed_colnames` list.
|
|
393
416
|
"""
|
|
394
|
-
if not all(col in self.column_indices for col in
|
|
395
|
-
missing = [col for col in
|
|
396
|
-
raise ValueError(
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
417
|
+
if not all(col in self.column_indices for col in colnames):
|
|
418
|
+
missing = [col for col in colnames if col not in self.column_indices]
|
|
419
|
+
raise ValueError(
|
|
420
|
+
f"The following columns are missing from windowed_colnames: {missing}"
|
|
421
|
+
)
|
|
422
|
+
start_idx = self.column_indices[colnames[0]]
|
|
423
|
+
end_idx = self.column_indices[colnames[-1]] + 1
|
|
424
|
+
return slice(start_idx, end_idx)
|