paradigma 1.0.3__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
paradigma/segmenting.py CHANGED
@@ -1,26 +1,27 @@
1
- import pandas as pd
2
1
  import numpy as np
2
+ import pandas as pd
3
3
 
4
- from typing import List
5
4
  from paradigma.constants import DataColumns
5
+ from paradigma.util import deprecated
6
6
 
7
- import numpy as np
8
7
 
9
8
  def tabulate_windows(
10
- df: pd.DataFrame,
11
- columns: List[str],
12
- window_length_s: float,
13
- window_step_length_s: float,
14
- fs: int,
15
- ) -> np.ndarray:
9
+ df: pd.DataFrame,
10
+ columns: list[str],
11
+ window_length_s: float,
12
+ window_step_length_s: float,
13
+ fs: int,
14
+ ) -> np.ndarray:
16
15
  """
17
- Split the given DataFrame into overlapping windows of specified length and step size.
16
+ Split the given DataFrame into overlapping windows of specified length
17
+ and step size.
18
18
 
19
- This function extracts windows of data from the specified columns of the DataFrame, based on
20
- the window length and step size provided in the configuration. The windows are returned in
21
- a 3D NumPy array, where the first dimension represents the window index, the second dimension
22
- represents the time steps within the window, and the third dimension represents the columns
23
- of the data.
19
+ This function extracts windows of data from the specified columns of the
20
+ DataFrame, based on the window length and step size provided in the
21
+ configuration. The windows are returned in a 3D NumPy array, where the
22
+ first dimension represents the window index, the second dimension
23
+ represents the time steps within the window, and the third dimension
24
+ represents the columns of the data.
24
25
 
25
26
  Parameters
26
27
  ----------
@@ -39,17 +40,22 @@ def tabulate_windows(
39
40
  -------
40
41
  np.ndarray
41
42
  A 3D NumPy array of shape (n_windows, window_size, n_columns), where:
42
- - `n_windows` is the number of windows that can be formed from the data.
43
- - `window_size` is the length of each window in terms of the number of time steps.
44
- - `n_columns` is the number of columns in the input DataFrame specified by `columns`.
45
-
46
- If the length of the data is shorter than the specified window size, an empty array is returned.
43
+ - `n_windows` is the number of windows that can be formed from the
44
+ data.
45
+ - `window_size` is the length of each window in terms of the number
46
+ of time steps.
47
+ - `n_columns` is the number of columns in the input DataFrame
48
+ specified by `columns`.
49
+
50
+ If the length of the data is shorter than the specified window size,
51
+ an empty array is returned.
47
52
 
48
53
  Notes
49
54
  -----
50
- This function uses `np.lib.stride_tricks.sliding_window_view` to generate sliding windows of data.
51
- The step size is applied to extract windows at intervals.
52
- If the data is insufficient for at least one window, an empty array will be returned.
55
+ This function uses `np.lib.stride_tricks.sliding_window_view` to
56
+ generate sliding windows of data. The step size is applied to extract
57
+ windows at intervals. If the data is insufficient for at least one
58
+ window, an empty array will be returned.
53
59
 
54
60
  Example
55
61
  -------
@@ -66,12 +72,14 @@ def tabulate_windows(
66
72
 
67
73
  # Check if data length is sufficient
68
74
  if len(data) < window_size:
69
- return np.empty((0, window_size, n_columns)) # Return an empty array if insufficient data
70
-
75
+ return np.empty(
76
+ (0, window_size, n_columns)
77
+ ) # Return an empty array if insufficient data
78
+
71
79
  windows = np.lib.stride_tricks.sliding_window_view(
72
80
  data, window_shape=(window_size, n_columns)
73
- )[::window_step_size].squeeze()
74
-
81
+ )[::window_step_size].squeeze()
82
+
75
83
  # Ensure 3D shape (n_windows, window_size, n_columns)
76
84
  if windows.ndim == 2: # Single window case
77
85
  windows = windows[np.newaxis, :, :] # Add a new axis at the start
@@ -79,88 +87,113 @@ def tabulate_windows(
79
87
  return windows
80
88
 
81
89
 
82
- def tabulate_windows_legacy(config, df, agg_func='first'):
90
+ def tabulate_windows_legacy(config, df, agg_func="first"):
83
91
  """
84
- Efficiently creates a windowed dataframe from the input dataframe using vectorized operations.
85
-
86
- Args:
87
- df: The input dataframe, where each row represents a timestamp (0.01 sec).
88
- window_length_s: The number of seconds per window.
89
- window_step_length_s: The number of seconds to shift between windows.
90
- single_value_cols: List of columns where a single value (e.g., mean) is needed.
91
- list_value_cols: List of columns where all 600 values should be stored in a list.
92
- agg_func: Aggregation function for single-value columns (e.g., 'mean', 'first').
93
-
94
- Returns:
95
- The windowed dataframe.
92
+ Efficiently creates a windowed dataframe from the input dataframe using
93
+ vectorized operations.
94
+
95
+ Parameters
96
+ ----------
97
+ config : object
98
+ A configuration object containing:
99
+ - `window_length_s`: The number of seconds per window.
100
+ - `window_step_length_s`: The number of seconds to shift between windows.
101
+ - `sampling_frequency`: The sampling frequency in Hz.
102
+ - `single_value_colnames`: List of column names where a single value
103
+ (e.g., mean) is needed.
104
+ - `list_value_colnames`: List of column names where all window values
105
+ should be stored in a list.
106
+ agg_func : str or callable, optional
107
+ Aggregation function for single-value columns. Can be 'mean',
108
+ 'first', or a custom callable. Default is 'first'.
109
+
110
+ Returns
111
+ -------
112
+ pd.DataFrame
113
+ A new DataFrame where each row corresponds to a window, containing:
114
+ - `window_nr`: The window number (starting from 1).
115
+ - `window_start`: The start time of the window.
116
+ - `window_end`: The end time of the window.
117
+ - Aggregated values for `single_value_colnames`.
118
+ - Lists of values for `list_value_colnames`.
119
+
96
120
  """
97
- # If single_value_cols or list_value_cols is None, default to an empty list
98
- if config.single_value_cols is None:
99
- config.single_value_cols = []
100
- if config.list_value_cols is None:
101
- config.list_value_cols = []
121
+ # If single_value_colnames or list_value_colnames is None, default to an empty list
122
+ if config.single_value_colnames is None:
123
+ config.single_value_colnames = []
124
+ if config.list_value_colnames is None:
125
+ config.list_value_colnames = []
102
126
 
103
127
  window_length = int(config.window_length_s * config.sampling_frequency)
104
128
  window_step_size = int(config.window_step_length_s * config.sampling_frequency)
105
129
 
106
130
  n_rows = len(df)
107
131
  if window_length > n_rows:
108
- raise ValueError(f"Window size ({window_length}) cannot be greater than the number of rows ({n_rows}) in the dataframe.")
109
-
110
- # Create indices for window start positions
132
+ raise ValueError(
133
+ f"Window size ({window_length}) cannot be greater than the "
134
+ f"number of rows ({n_rows}) in the dataframe."
135
+ )
136
+
137
+ # Create indices for window start positions
111
138
  window_starts = np.arange(0, n_rows - window_length + 1, window_step_size)
112
-
139
+
113
140
  # Prepare the result for the final DataFrame
114
141
  result = []
115
-
142
+
116
143
  # Handle single value columns with vectorized operations
117
144
  agg_func_map = {
118
- 'mean': np.mean,
119
- 'first': lambda x: x[0],
145
+ "mean": np.mean,
146
+ "first": lambda x: x[0],
120
147
  }
121
148
 
122
149
  # Check if agg_func is a callable (custom function) or get the function from the map
123
150
  if callable(agg_func):
124
151
  agg_func_np = agg_func
125
152
  else:
126
- agg_func_np = agg_func_map.get(agg_func, agg_func_map['mean']) # Default to 'mean' if agg_func is not recognized
153
+ agg_func_np = agg_func_map.get(
154
+ agg_func, agg_func_map["mean"]
155
+ ) # Default to 'mean' if agg_func is not recognized
127
156
 
128
-
129
157
  for window_nr, start in enumerate(window_starts, 1):
130
158
  end = start + window_length
131
159
  window = df.iloc[start:end]
132
160
 
133
161
  agg_data = {
134
- 'window_nr': window_nr,
135
- 'window_start': window[DataColumns.TIME].iloc[0],
136
- 'window_end': window[DataColumns.TIME].iloc[-1],
162
+ "window_nr": window_nr,
163
+ "window_start": window[DataColumns.TIME].iloc[0],
164
+ "window_end": window[DataColumns.TIME].iloc[-1],
137
165
  }
138
-
166
+
139
167
  # Aggregate single-value columns
140
- for col in config.single_value_cols:
168
+ for col in config.single_value_colnames:
141
169
  if col in window.columns: # Only process columns that exist in the window
142
170
  agg_data[col] = agg_func_np(window[col].values)
143
-
171
+
144
172
  # Collect list-value columns efficiently using numpy slicing
145
- for col in config.list_value_cols:
173
+ for col in config.list_value_colnames:
146
174
  if col in window.columns: # Only process columns that exist in the window
147
175
  agg_data[col] = window[col].values.tolist()
148
176
 
149
177
  result.append(agg_data)
150
-
178
+
151
179
  # Convert result list into a DataFrame
152
180
  windowed_df = pd.DataFrame(result)
153
-
154
- # Ensure the column order is as desired: window_nr, window_start, window_end, pre_or_post, and then the rest
155
- desired_order = ['window_nr', 'window_start', 'window_end'] + config.single_value_cols + config.list_value_cols
156
-
181
+
182
+ # Ensure the column order is as desired: window_nr, window_start,
183
+ # window_end, and then the single-value and list-value columns
184
+ desired_order = (
185
+ ["window_nr", "window_start", "window_end"]
186
+ + config.single_value_colnames
187
+ + config.list_value_colnames
188
+ )
189
+
157
190
  return windowed_df[desired_order]
158
191
 
159
192
 
160
193
  def create_segments(
161
- time_array: np.ndarray,
162
- max_segment_gap_s: float,
163
- ):
194
+ time_array: np.ndarray,
195
+ max_segment_gap_s: float,
196
+ ):
164
197
  # Calculate the difference between consecutive time values
165
198
  time_diff = np.diff(time_array, prepend=0.0)
166
199
 
@@ -168,23 +201,23 @@ def create_segments(
168
201
  gap_exceeds = time_diff > max_segment_gap_s
169
202
 
170
203
  # Create the segment number based on the cumulative sum of the gap_exceeds mask
171
- segments = gap_exceeds.cumsum()
204
+ segments = gap_exceeds.cumsum() + 1
172
205
 
173
206
  return segments
174
207
 
175
208
 
176
209
  def discard_segments(
177
- df: pd.DataFrame,
178
- segment_nr_colname: str,
179
- min_segment_length_s: float,
180
- fs: int,
181
- format: str='timestamps'
182
- ) -> pd.DataFrame:
210
+ df: pd.DataFrame,
211
+ segment_nr_colname: str,
212
+ min_segment_length_s: float,
213
+ fs: int,
214
+ format: str = "timestamps",
215
+ ) -> pd.DataFrame:
183
216
  """
184
217
  Remove segments smaller than a specified size and reset segment enumeration.
185
218
 
186
- This function filters out segments from the DataFrame that are smaller than a
187
- given minimum size, based on the configuration. After removing small segments,
219
+ This function filters out segments from the DataFrame that are smaller than a
220
+ given minimum size, based on the configuration. After removing small segments,
188
221
  the segment numbers are reset to start from 1.
189
222
 
190
223
  Parameters
@@ -201,12 +234,13 @@ def discard_segments(
201
234
  Returns
202
235
  -------
203
236
  pd.DataFrame
204
- A filtered DataFrame where small segments have been removed and segment
237
+ A filtered DataFrame where small segments have been removed and segment
205
238
  numbers have been reset to start from 1.
206
239
 
207
240
  Example
208
241
  -------
209
- config = Config(min_segment_length_s=2, sampling_frequency=100, segment_nr_colname='segment')
242
+ config = Config(min_segment_length_s=2, sampling_frequency=100,
243
+ segment_nr_colname='segment')
210
244
  df = pd.DataFrame({
211
245
  'segment': [1, 1, 2, 2, 2],
212
246
  'time': [0, 1, 2, 3, 4]
@@ -221,43 +255,44 @@ def discard_segments(
221
255
  # 4 2 4
222
256
  """
223
257
  # Minimum segment size in number of samples
224
- if format == 'timestamps':
225
- min_samples = min_segment_length_s * fs
226
- elif format == 'windows':
227
- min_samples = min_segment_length_s
258
+ if format == "timestamps":
259
+ min_samples = int(min_segment_length_s * fs)
260
+ elif format == "windows":
261
+ min_samples = int(min_segment_length_s)
228
262
  else:
229
263
  raise ValueError("Invalid format. Must be 'timestamps' or 'windows'.")
230
264
 
231
- # Group by segment and filter out small segments in one step
232
- valid_segment_mask = (
233
- df.groupby(segment_nr_colname)[segment_nr_colname]
234
- .transform('size') >= min_samples
235
- )
265
+ # Count samples per segment
266
+ segment_counts = df.groupby(segment_nr_colname).size()
236
267
 
237
- df = df[valid_segment_mask].copy()
268
+ # Filter rows for valid segments (>= min samples)
269
+ counts_map = segment_counts.to_dict()
270
+ df = df[df[segment_nr_colname].map(counts_map) >= min_samples].copy()
238
271
 
239
272
  if df.empty:
240
- raise ValueError("All segments were removed.")
273
+ raise ValueError(
274
+ f"All segments were removed: no segment ≥ {min_samples} samples."
275
+ )
241
276
 
242
- # Reset segment numbers in a single step
243
- unique_segments = pd.factorize(df[segment_nr_colname])[0] + 1
244
- df[segment_nr_colname] = unique_segments
277
+ # Reset segment numbers
278
+ df[segment_nr_colname] = pd.factorize(df[segment_nr_colname])[0] + 1
245
279
 
246
280
  return df
247
281
 
248
282
 
249
- def categorize_segments(df, fs, format='timestamps', window_step_length_s=None):
283
+ @deprecated("This will be removed in v1.1.")
284
+ def categorize_segments(df, fs, format="timestamps", window_step_length_s=None):
250
285
  """
251
286
  Categorize segments based on their duration.
252
287
 
253
- This function categorizes segments into four categories based on their duration
288
+ This function categorizes segments into four categories based on their duration
254
289
  in seconds. The categories are defined as:
255
290
  - Category 1: Segments shorter than 5 seconds
256
291
  - Category 2: Segments between 5 and 10 seconds
257
292
  - Category 3: Segments between 10 and 20 seconds
258
293
  - Category 4: Segments longer than 20 seconds
259
294
 
260
- The duration of each segment is calculated based on the sampling frequency and
295
+ The duration of each segment is calculated based on the sampling frequency and
261
296
  the number of rows (data points) in the segment.
262
297
 
263
298
  Parameters
@@ -278,44 +313,46 @@ def categorize_segments(df, fs, format='timestamps', window_step_length_s=None):
278
313
  - 'long' for segments between 10 and 20 seconds
279
314
  - 'very_long' for segments > 20 seconds
280
315
  """
281
- if format == 'windows' and window_step_length_s is None:
316
+ if format == "windows" and window_step_length_s is None:
282
317
  raise ValueError("Window step length must be provided for 'windows' format.")
283
-
318
+
284
319
  # Define duration thresholds in seconds
285
- d_max_duration = {
286
- 'short': 5,
287
- 'moderately_long': 10,
288
- 'long': 20
289
- }
290
-
320
+ d_max_duration = {"short": 5, "moderately_long": 10, "long": 20}
321
+
291
322
  # Convert thresholds to rows if format is 'timestamps'
292
- if format == 'timestamps':
323
+ if format == "timestamps":
293
324
  d_max_duration = {k: v * fs for k, v in d_max_duration.items()}
294
325
 
295
326
  # Count rows per segment
296
- segment_sizes = df[DataColumns.SEGMENT_NR].value_counts()
327
+ segment_sizes = df[DataColumns.GAIT_SEGMENT_NR].value_counts()
297
328
 
298
329
  # Convert segment sizes to duration in seconds
299
- if format == 'windows':
330
+ if format == "windows":
300
331
  segment_sizes *= window_step_length_s
301
332
 
302
333
  # Group by the segment column and apply the categorization
303
334
  def categorize(segment_size):
304
- if segment_size < d_max_duration['short']:
305
- return 'short'
306
- elif segment_size < d_max_duration['moderately_long']:
307
- return 'moderately_long'
308
- elif segment_size < d_max_duration['long']:
309
- return 'long'
335
+ if segment_size < d_max_duration["short"]:
336
+ return "short"
337
+ elif segment_size < d_max_duration["moderately_long"]:
338
+ return "moderately_long"
339
+ elif segment_size < d_max_duration["long"]:
340
+ return "long"
310
341
  else:
311
- return 'very_long'
342
+ return "very_long"
312
343
 
313
344
  # Apply categorization to the DataFrame
314
- return df[DataColumns.SEGMENT_NR].map(segment_sizes).map(categorize).astype('category')
345
+ return (
346
+ df[DataColumns.GAIT_SEGMENT_NR]
347
+ .map(segment_sizes)
348
+ .map(categorize)
349
+ .astype("category")
350
+ )
351
+
315
352
 
316
353
  class WindowedDataExtractor:
317
354
  """
318
- A utility class for extracting specific column indices and slices
355
+ A utility class for extracting specific column indices and slices
319
356
  from a list of windowed column names.
320
357
 
321
358
  Attributes
@@ -325,31 +362,31 @@ class WindowedDataExtractor:
325
362
 
326
363
  Methods
327
364
  -------
328
- get_index(col)
329
- Returns the index of a specific column.
330
- get_slice(cols)
331
- Returns a slice object for a range of consecutive columns.
365
+ get_index(colname)
366
+ Returns the index of a specific column name.
367
+ get_slice(colnames)
368
+ Returns a slice object for a range of consecutive column names.
332
369
  """
333
370
 
334
- def __init__(self, windowed_cols):
371
+ def __init__(self, windowed_colnames: list[str]):
335
372
  """
336
373
  Initialize the WindowedDataExtractor.
337
374
 
338
375
  Parameters
339
376
  ----------
340
- windowed_cols : list of str
377
+ windowed_colnames : list of str
341
378
  A list of column names in the windowed data.
342
379
 
343
380
  Raises
344
381
  ------
345
382
  ValueError
346
- If the list of `windowed_cols` is empty.
383
+ If the list of `windowed_colnames` is empty.
347
384
  """
348
- if not windowed_cols:
385
+ if not windowed_colnames:
349
386
  raise ValueError("The list of windowed columns cannot be empty.")
350
- self.column_indices = {col: idx for idx, col in enumerate(windowed_cols)}
387
+ self.column_indices = {col: idx for idx, col in enumerate(windowed_colnames)}
351
388
 
352
- def get_index(self, col):
389
+ def get_index(self, colname: str) -> int:
353
390
  """
354
391
  Get the index of a specific column.
355
392
 
@@ -366,19 +403,19 @@ class WindowedDataExtractor:
366
403
  Raises
367
404
  ------
368
405
  ValueError
369
- If the column is not found in the `windowed_cols` list.
406
+ If the column is not found in the `windowed_colnames` list.
370
407
  """
371
- if col not in self.column_indices:
372
- raise ValueError(f"Column '{col}' not found in windowed_cols.")
373
- return self.column_indices[col]
408
+ if colname not in self.column_indices:
409
+ raise ValueError(f"Column name '{colname}' not found in windowed_colnames.")
410
+ return self.column_indices[colname]
374
411
 
375
- def get_slice(self, cols):
412
+ def get_slice(self, colnames: list[str]) -> slice:
376
413
  """
377
414
  Get a slice object for a range of consecutive columns.
378
415
 
379
416
  Parameters
380
417
  ----------
381
- cols : list of str
418
+ colnames : list of str
382
419
  A list of consecutive column names to define the slice.
383
420
 
384
421
  Returns
@@ -389,11 +426,14 @@ class WindowedDataExtractor:
389
426
  Raises
390
427
  ------
391
428
  ValueError
392
- If one or more columns in `cols` are not found in the `windowed_cols` list.
429
+ If one or more columns in `colnames` are not found in the
430
+ `windowed_colnames` list.
393
431
  """
394
- if not all(col in self.column_indices for col in cols):
395
- missing = [col for col in cols if col not in self.column_indices]
396
- raise ValueError(f"The following columns are missing from windowed_cols: {missing}")
397
- start_idx = self.column_indices[cols[0]]
398
- end_idx = self.column_indices[cols[-1]] + 1
399
- return slice(start_idx, end_idx)
432
+ if not all(col in self.column_indices for col in colnames):
433
+ missing = [col for col in colnames if col not in self.column_indices]
434
+ raise ValueError(
435
+ f"The following columns are missing from windowed_colnames: {missing}"
436
+ )
437
+ start_idx = self.column_indices[colnames[0]]
438
+ end_idx = self.column_indices[colnames[-1]] + 1
439
+ return slice(start_idx, end_idx)