paradigma-1.0.2-py3-none-any.whl → paradigma-1.0.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
paradigma/segmenting.py CHANGED
@@ -1,25 +1,26 @@
-import pandas as pd
+from typing import List
+
 import numpy as np
+import pandas as pd
 
-from typing import List
 from paradigma.constants import DataColumns
+from paradigma.util import deprecated
 
-import numpy as np
 
 def tabulate_windows(
-    df: pd.DataFrame,
-    columns: List[str],
-    window_length_s: float,
-    window_step_length_s: float,
-    fs: int,
-) -> np.ndarray:
+    df: pd.DataFrame,
+    columns: List[str],
+    window_length_s: float,
+    window_step_length_s: float,
+    fs: int,
+) -> np.ndarray:
     """
     Split the given DataFrame into overlapping windows of specified length and step size.
 
     This function extracts windows of data from the specified columns of the DataFrame, based on
     the window length and step size provided in the configuration. The windows are returned in
     a 3D NumPy array, where the first dimension represents the window index, the second dimension
-    represents the time steps within the window, and the third dimension represents the columns
+    represents the time steps within the window, and the third dimension represents the columns
     of the data.
 
     Parameters
@@ -42,7 +43,7 @@ def tabulate_windows(
         - `n_windows` is the number of windows that can be formed from the data.
         - `window_size` is the length of each window in terms of the number of time steps.
         - `n_columns` is the number of columns in the input DataFrame specified by `columns`.
-
+
     If the length of the data is shorter than the specified window size, an empty array is returned.
 
     Notes
@@ -66,12 +67,14 @@ def tabulate_windows(
 
     # Check if data length is sufficient
     if len(data) < window_size:
-        return np.empty((0, window_size, n_columns))  # Return an empty array if insufficient data
-
+        return np.empty(
+            (0, window_size, n_columns)
+        )  # Return an empty array if insufficient data
+
     windows = np.lib.stride_tricks.sliding_window_view(
         data, window_shape=(window_size, n_columns)
-    )[::window_step_size].squeeze()
-
+    )[::window_step_size].squeeze()
+
     # Ensure 3D shape (n_windows, window_size, n_columns)
     if windows.ndim == 2:  # Single window case
         windows = windows[np.newaxis, :, :]  # Add a new axis at the start
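
For orientation, a minimal usage sketch of `tabulate_windows` as shown in this diff; the accelerometer column names and the toy data are illustrative, not taken from the package:

    import numpy as np
    import pandas as pd

    from paradigma.segmenting import tabulate_windows

    # Illustrative 10-second recording at 100 Hz with three hypothetical columns
    fs = 100
    df = pd.DataFrame(
        np.random.randn(10 * fs, 3),
        columns=["accelerometer_x", "accelerometer_y", "accelerometer_z"],
    )

    windows = tabulate_windows(
        df=df,
        columns=["accelerometer_x", "accelerometer_y", "accelerometer_z"],
        window_length_s=6.0,
        window_step_length_s=1.0,
        fs=fs,
    )

    # Shape is (n_windows, window_size, n_columns): here (5, 600, 3), since
    # 600-sample windows advance in steps of 100 samples over 1000 rows.
    print(windows.shape)
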
@@ -79,88 +82,108 @@ def tabulate_windows(
     return windows
 
 
-def tabulate_windows_legacy(config, df, agg_func='first'):
+def tabulate_windows_legacy(config, df, agg_func="first"):
     """
     Efficiently creates a windowed dataframe from the input dataframe using vectorized operations.
-
-    Args:
-        df: The input dataframe, where each row represents a timestamp (0.01 sec).
-        window_length_s: The number of seconds per window.
-        window_step_length_s: The number of seconds to shift between windows.
-        single_value_cols: List of columns where a single value (e.g., mean) is needed.
-        list_value_cols: List of columns where all 600 values should be stored in a list.
-        agg_func: Aggregation function for single-value columns (e.g., 'mean', 'first').
-
-    Returns:
-        The windowed dataframe.
+
+    Parameters
+    ----------
+    config : object
+        A configuration object containing:
+        - `window_length_s`: The number of seconds per window.
+        - `window_step_length_s`: The number of seconds to shift between windows.
+        - `sampling_frequency`: The sampling frequency in Hz.
+        - `single_value_colnames`: List of column names where a single value (e.g., mean) is needed.
+        - `list_value_colnames`: List of column names where all 600 values should be stored in a list.
+    agg_func : str or callable, optional
+        Aggregation function for single-value columns. Can be 'mean', 'first', or a custom callable.
+        Default is 'first'.
+
+    Returns
+    -------
+    pd.DataFrame
+        A new DataFrame where each row corresponds to a window, containing:
+        - `window_nr`: The window number (starting from 1).
+        - `window_start`: The start time of the window.
+        - `window_end`: The end time of the window.
+        - Aggregated values for `single_value_colnames`.
+        - Lists of values for `list_value_colnames`.
+
     """
-    # If single_value_cols or list_value_cols is None, default to an empty list
-    if config.single_value_cols is None:
-        config.single_value_cols = []
-    if config.list_value_cols is None:
-        config.list_value_cols = []
+    # If single_value_colnames or list_value_colnames is None, default to an empty list
+    if config.single_value_colnames is None:
+        config.single_value_colnames = []
+    if config.list_value_colnames is None:
+        config.list_value_colnames = []
 
     window_length = int(config.window_length_s * config.sampling_frequency)
     window_step_size = int(config.window_step_length_s * config.sampling_frequency)
 
     n_rows = len(df)
     if window_length > n_rows:
-        raise ValueError(f"Window size ({window_length}) cannot be greater than the number of rows ({n_rows}) in the dataframe.")
-
-    # Create indices for window start positions
+        raise ValueError(
+            f"Window size ({window_length}) cannot be greater than the number of rows ({n_rows}) in the dataframe."
+        )
+
+    # Create indices for window start positions
     window_starts = np.arange(0, n_rows - window_length + 1, window_step_size)
-
+
     # Prepare the result for the final DataFrame
     result = []
-
+
     # Handle single value columns with vectorized operations
     agg_func_map = {
-        'mean': np.mean,
-        'first': lambda x: x[0],
+        "mean": np.mean,
+        "first": lambda x: x[0],
     }
 
     # Check if agg_func is a callable (custom function) or get the function from the map
     if callable(agg_func):
         agg_func_np = agg_func
     else:
-        agg_func_np = agg_func_map.get(agg_func, agg_func_map['mean'])  # Default to 'mean' if agg_func is not recognized
+        agg_func_np = agg_func_map.get(
+            agg_func, agg_func_map["mean"]
+        )  # Default to 'mean' if agg_func is not recognized
 
-
     for window_nr, start in enumerate(window_starts, 1):
         end = start + window_length
         window = df.iloc[start:end]
 
         agg_data = {
-            'window_nr': window_nr,
-            'window_start': window[DataColumns.TIME].iloc[0],
-            'window_end': window[DataColumns.TIME].iloc[-1],
+            "window_nr": window_nr,
+            "window_start": window[DataColumns.TIME].iloc[0],
+            "window_end": window[DataColumns.TIME].iloc[-1],
         }
-
+
         # Aggregate single-value columns
-        for col in config.single_value_cols:
+        for col in config.single_value_colnames:
             if col in window.columns:  # Only process columns that exist in the window
                 agg_data[col] = agg_func_np(window[col].values)
-
+
         # Collect list-value columns efficiently using numpy slicing
-        for col in config.list_value_cols:
+        for col in config.list_value_colnames:
             if col in window.columns:  # Only process columns that exist in the window
                 agg_data[col] = window[col].values.tolist()
 
         result.append(agg_data)
-
+
     # Convert result list into a DataFrame
     windowed_df = pd.DataFrame(result)
-
+
     # Ensure the column order is as desired: window_nr, window_start, window_end, pre_or_post, and then the rest
-    desired_order = ['window_nr', 'window_start', 'window_end'] + config.single_value_cols + config.list_value_cols
-
+    desired_order = (
+        ["window_nr", "window_start", "window_end"]
+        + config.single_value_colnames
+        + config.list_value_colnames
+    )
+
     return windowed_df[desired_order]
 
 
 def create_segments(
-    time_array: np.ndarray,
-    max_segment_gap_s: float,
-):
+    time_array: np.ndarray,
+    max_segment_gap_s: float,
+):
     # Calculate the difference between consecutive time values
     time_diff = np.diff(time_array, prepend=0.0)
 
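
The legacy windowing helper now reads `single_value_colnames` and `list_value_colnames` from the config object instead of the old `*_cols` attributes. A minimal sketch of a stand-in config carrying the renamed attributes follows; the dataclass below is hypothetical, not a paradigma config class, and only exposes the attributes the function reads:

    from dataclasses import dataclass
    from typing import List, Optional

    from paradigma.segmenting import tabulate_windows_legacy

    # Hypothetical stand-in for a paradigma config object
    @dataclass
    class WindowingConfig:
        window_length_s: float = 6.0
        window_step_length_s: float = 1.0
        sampling_frequency: int = 100
        single_value_colnames: Optional[List[str]] = None
        list_value_colnames: Optional[List[str]] = None

    config = WindowingConfig(
        single_value_colnames=["pre_or_post"],
        list_value_colnames=["accelerometer_x"],
    )

    # df must contain the DataColumns.TIME column plus the columns named above.
    # windowed_df = tabulate_windows_legacy(config, df, agg_func="first")
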
@@ -174,17 +197,17 @@ def create_segments(
 
 
 def discard_segments(
-    df: pd.DataFrame,
-    segment_nr_colname: str,
-    min_segment_length_s: float,
-    fs: int,
-    format: str='timestamps'
-) -> pd.DataFrame:
+    df: pd.DataFrame,
+    segment_nr_colname: str,
+    min_segment_length_s: float,
+    fs: int,
+    format: str = "timestamps",
+) -> pd.DataFrame:
     """
     Remove segments smaller than a specified size and reset segment enumeration.
 
-    This function filters out segments from the DataFrame that are smaller than a
-    given minimum size, based on the configuration. After removing small segments,
+    This function filters out segments from the DataFrame that are smaller than a
+    given minimum size, based on the configuration. After removing small segments,
     the segment numbers are reset to start from 1.
 
     Parameters
@@ -201,7 +224,7 @@ def discard_segments(
     Returns
     -------
     pd.DataFrame
-        A filtered DataFrame where small segments have been removed and segment
+        A filtered DataFrame where small segments have been removed and segment
         numbers have been reset to start from 1.
 
     Example
@@ -221,17 +244,17 @@ def discard_segments(
     # 4 2 4
     """
     # Minimum segment size in number of samples
-    if format == 'timestamps':
+    if format == "timestamps":
         min_samples = min_segment_length_s * fs
-    elif format == 'windows':
+    elif format == "windows":
         min_samples = min_segment_length_s
     else:
         raise ValueError("Invalid format. Must be 'timestamps' or 'windows'.")
 
     # Group by segment and filter out small segments in one step
     valid_segment_mask = (
-        df.groupby(segment_nr_colname)[segment_nr_colname]
-        .transform('size') >= min_samples
+        df.groupby(segment_nr_colname)[segment_nr_colname].transform("size")
+        >= min_samples
     )
 
     df = df[valid_segment_mask].copy()
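
A small, self-contained sketch of the behaviour shown above; the toy data and the `time`/`segment_nr` column names are illustrative:

    import pandas as pd

    from paradigma.segmenting import discard_segments

    # Toy input: segment 1 has 2 rows, segment 2 has 4 rows
    df_segmented = pd.DataFrame(
        {
            "time": [0.00, 0.01, 0.02, 0.03, 0.04, 0.05],
            "segment_nr": [1, 1, 2, 2, 2, 2],
        }
    )

    # With format="timestamps" the threshold is min_segment_length_s * fs samples,
    # here 0.03 s * 100 Hz = 3 rows: segment 1 is dropped, segment 2 is kept and
    # renumbered to start from 1. With format="windows", each row is one window
    # and min_segment_length_s is used directly as the row-count threshold.
    df_filtered = discard_segments(
        df=df_segmented,
        segment_nr_colname="segment_nr",
        min_segment_length_s=0.03,
        fs=100,
        format="timestamps",
    )
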
@@ -246,18 +269,19 @@ def discard_segments(
     return df
 
 
-def categorize_segments(df, fs, format='timestamps', window_step_length_s=None):
+@deprecated("This will be removed in v1.1.")
+def categorize_segments(df, fs, format="timestamps", window_step_length_s=None):
     """
     Categorize segments based on their duration.
 
-    This function categorizes segments into four categories based on their duration
+    This function categorizes segments into four categories based on their duration
     in seconds. The categories are defined as:
     - Category 1: Segments shorter than 5 seconds
     - Category 2: Segments between 5 and 10 seconds
     - Category 3: Segments between 10 and 20 seconds
     - Category 4: Segments longer than 20 seconds
 
-    The duration of each segment is calculated based on the sampling frequency and
+    The duration of each segment is calculated based on the sampling frequency and
     the number of rows (data points) in the segment.
 
     Parameters
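
Version 1.0.4 also imports `deprecated` from `paradigma.util` and marks `categorize_segments` with it. The decorator's implementation is not part of this diff; a minimal sketch of a decorator matching the call pattern used above could look like this (an assumption for illustration, not the package's actual code):

    import functools
    import warnings

    def deprecated(message: str):
        """Decorator factory emitting a DeprecationWarning with the given message."""
        def decorator(func):
            @functools.wraps(func)
            def wrapper(*args, **kwargs):
                warnings.warn(
                    f"{func.__name__} is deprecated. {message}",
                    DeprecationWarning,
                    stacklevel=2,
                )
                return func(*args, **kwargs)
            return wrapper
        return decorator
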
@@ -278,44 +302,43 @@ def categorize_segments(df, fs, format='timestamps', window_step_length_s=None):
         - 'long' for segments between 10 and 20 seconds
         - 'very_long' for segments > 20 seconds
     """
-    if format == 'windows' and window_step_length_s is None:
+    if format == "windows" and window_step_length_s is None:
         raise ValueError("Window step length must be provided for 'windows' format.")
-
+
     # Define duration thresholds in seconds
-    d_max_duration = {
-        'short': 5,
-        'moderately_long': 10,
-        'long': 20
-    }
-
+    d_max_duration = {"short": 5, "moderately_long": 10, "long": 20}
+
     # Convert thresholds to rows if format is 'timestamps'
-    if format == 'timestamps':
+    if format == "timestamps":
         d_max_duration = {k: v * fs for k, v in d_max_duration.items()}
 
     # Count rows per segment
     segment_sizes = df[DataColumns.SEGMENT_NR].value_counts()
 
     # Convert segment sizes to duration in seconds
-    if format == 'windows':
+    if format == "windows":
         segment_sizes *= window_step_length_s
 
     # Group by the segment column and apply the categorization
     def categorize(segment_size):
-        if segment_size < d_max_duration['short']:
-            return 'short'
-        elif segment_size < d_max_duration['moderately_long']:
-            return 'moderately_long'
-        elif segment_size < d_max_duration['long']:
-            return 'long'
+        if segment_size < d_max_duration["short"]:
+            return "short"
+        elif segment_size < d_max_duration["moderately_long"]:
+            return "moderately_long"
+        elif segment_size < d_max_duration["long"]:
+            return "long"
         else:
-            return 'very_long'
+            return "very_long"
 
     # Apply categorization to the DataFrame
-    return df[DataColumns.SEGMENT_NR].map(segment_sizes).map(categorize).astype('category')
+    return (
+        df[DataColumns.SEGMENT_NR].map(segment_sizes).map(categorize).astype("category")
+    )
+
 
 class WindowedDataExtractor:
     """
-    A utility class for extracting specific column indices and slices
+    A utility class for extracting specific column indices and slices
     from a list of windowed column names.
 
     Attributes
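
To make the threshold arithmetic concrete (a worked example under assumed inputs, not output from the package):

    from paradigma.segmenting import categorize_segments

    # format="timestamps": thresholds are converted to row counts, so at fs = 100 Hz
    # a segment is 'short' below 500 rows, 'moderately_long' below 1000,
    # 'long' below 2000, and 'very_long' otherwise.
    #
    # format="windows": each row is one window, so the row count is multiplied by
    # window_step_length_s first. A segment of 12 windows with a 1-second step
    # therefore counts as 12 s and is categorized as 'long'.
    categories = categorize_segments(
        df=df_segmented,  # assumed to contain the DataColumns.SEGMENT_NR column
        fs=100,
        format="windows",
        window_step_length_s=1.0,
    )
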
@@ -325,31 +348,31 @@ class WindowedDataExtractor:
 
 
     Methods
-    get_index(col)
-        Returns the index of a specific column.
-    get_slice(cols)
-        Returns a slice object for a range of consecutive columns.
+    get_index(colname)
+        Returns the index of a specific column name.
+    get_slice(colnames)
+        Returns a slice object for a range of consecutive column names.
     """
 
-    def __init__(self, windowed_cols):
+    def __init__(self, windowed_colnames: List[str]):
         """
         Initialize the WindowedDataExtractor.
 
         Parameters
         ----------
-        windowed_cols : list of str
+        windowed_colnames : list of str
             A list of column names in the windowed data.
 
         Raises
         ------
         ValueError
-            If the list of `windowed_cols` is empty.
+            If the list of `windowed_colnames` is empty.
         """
-        if not windowed_cols:
+        if not windowed_colnames:
             raise ValueError("The list of windowed columns cannot be empty.")
-        self.column_indices = {col: idx for idx, col in enumerate(windowed_cols)}
+        self.column_indices = {col: idx for idx, col in enumerate(windowed_colnames)}
 
-    def get_index(self, col):
+    def get_index(self, colname: str) -> int:
         """
         Get the index of a specific column.
 
@@ -366,19 +389,19 @@ class WindowedDataExtractor:
         Raises
         ------
         ValueError
-            If the column is not found in the `windowed_cols` list.
+            If the column is not found in the `windowed_colnames` list.
         """
-        if col not in self.column_indices:
-            raise ValueError(f"Column '{col}' not found in windowed_cols.")
-        return self.column_indices[col]
+        if colname not in self.column_indices:
+            raise ValueError(f"Column name '{colname}' not found in windowed_colnames.")
+        return self.column_indices[colname]
 
-    def get_slice(self, cols):
+    def get_slice(self, colnames: List[str]) -> slice:
         """
         Get a slice object for a range of consecutive columns.
 
         Parameters
         ----------
-        cols : list of str
+        colnames : list of str
             A list of consecutive column names to define the slice.
 
         Returns
@@ -389,11 +412,13 @@ class WindowedDataExtractor:
         Raises
         ------
         ValueError
-            If one or more columns in `cols` are not found in the `windowed_cols` list.
+            If one or more columns in `colnames` are not found in the `windowed_colnames` list.
         """
-        if not all(col in self.column_indices for col in cols):
-            missing = [col for col in cols if col not in self.column_indices]
-            raise ValueError(f"The following columns are missing from windowed_cols: {missing}")
-        start_idx = self.column_indices[cols[0]]
-        end_idx = self.column_indices[cols[-1]] + 1
-        return slice(start_idx, end_idx)
+        if not all(col in self.column_indices for col in colnames):
+            missing = [col for col in colnames if col not in self.column_indices]
+            raise ValueError(
+                f"The following columns are missing from windowed_colnames: {missing}"
+            )
+        start_idx = self.column_indices[colnames[0]]
+        end_idx = self.column_indices[colnames[-1]] + 1
+        return slice(start_idx, end_idx)
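
Finally, a brief usage sketch of the renamed WindowedDataExtractor API; the column names below are illustrative:

    import numpy as np

    from paradigma.segmenting import WindowedDataExtractor

    # Column order of a hypothetical windowed array
    windowed_colnames = [
        "grav_x", "grav_y", "grav_z",
        "accelerometer_x", "accelerometer_y", "accelerometer_z",
    ]
    extractor = WindowedDataExtractor(windowed_colnames)

    idx_grav_x = extractor.get_index("grav_x")  # 0
    acc_slice = extractor.get_slice(
        ["accelerometer_x", "accelerometer_y", "accelerometer_z"]
    )  # slice(3, 6)

    # Applied to a (n_windows, window_size, n_columns) array such as the output
    # of tabulate_windows, the slice picks out the accelerometer columns.
    windows = np.zeros((5, 600, len(windowed_colnames)))
    acc_windows = windows[:, :, acc_slice]  # shape (5, 600, 3)
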