eegdash 0.4.0.dev173498563__py3-none-any.whl → 0.4.1.dev185__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of eegdash might be problematic.

@@ -18,22 +18,41 @@ from braindecode.datasets.base import (
 
 from ..logging import logger
 
+__all__ = [
+    "FeaturesDataset",
+    "FeaturesConcatDataset",
+]
+
 
 class FeaturesDataset(EEGWindowsDataset):
-    """Returns samples from a pandas DataFrame object along with a target.
+    """A dataset of features extracted from EEG windows.
 
-    Dataset which serves samples from a pandas DataFrame object along with a
-    target. The target is unique for the dataset, and is obtained through the
-    `description` attribute.
+    This class holds features in a pandas DataFrame and provides an interface
+    compatible with braindecode's dataset structure. Each row in the feature
+    DataFrame corresponds to a single sample (e.g., an EEG window).
 
     Parameters
     ----------
-    features : a pandas DataFrame
-        Tabular data.
-    description : dict | pandas.Series | None
-        Holds additional description about the continuous signal / subject.
-    transform : callable | None
-        On-the-fly transform applied to the example before it is returned.
+    features : pandas.DataFrame
+        A DataFrame where each row is a sample and each column is a feature.
+    metadata : pandas.DataFrame, optional
+        A DataFrame containing metadata for each sample, indexed consistently
+        with `features`. Must include columns 'i_window_in_trial',
+        'i_start_in_trial', 'i_stop_in_trial', and 'target'.
+    description : dict or pandas.Series, optional
+        Additional high-level information about the dataset (e.g., subject ID).
+    transform : callable, optional
+        A function or transform to apply to the feature data on-the-fly.
+    raw_info : dict, optional
+        Information about the original raw recording, for provenance.
+    raw_preproc_kwargs : dict, optional
+        Keyword arguments used for preprocessing the raw data.
+    window_kwargs : dict, optional
+        Keyword arguments used for windowing the data.
+    window_preproc_kwargs : dict, optional
+        Keyword arguments used for preprocessing the windowed data.
+    features_kwargs : dict, optional
+        Keyword arguments used for feature extraction.
 
     """
 
@@ -65,7 +84,21 @@ class FeaturesDataset(EEGWindowsDataset):
         ].to_numpy()
         self.y = metadata.loc[:, "target"].to_list()
 
-    def __getitem__(self, index):
+    def __getitem__(self, index: int) -> tuple[np.ndarray, int, list]:
+        """Get a single sample from the dataset.
+
+        Parameters
+        ----------
+        index : int
+            The index of the sample to retrieve.
+
+        Returns
+        -------
+        tuple
+            A tuple containing the feature vector (X), the target (y), and the
+            cropping indices.
+
+        """
         crop_inds = self.crop_inds[index].tolist()
         X = self.features.iloc[index].to_numpy()
         X = X.copy()
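The rewritten FeaturesDataset docstring and __getitem__ contract above suggest usage along these lines. A minimal sketch, assuming the constructor accepts the documented parameters; the feature names and values are purely illustrative:

    import pandas as pd
    from eegdash.features.datasets import FeaturesDataset

    # Two illustrative features for three EEG windows.
    features = pd.DataFrame(
        {"alpha_power": [1.2, 0.8, 1.1], "beta_power": [0.4, 0.5, 0.3]}
    )
    # Metadata must carry the bookkeeping columns named in the docstring.
    metadata = pd.DataFrame(
        {
            "i_window_in_trial": [0, 1, 2],
            "i_start_in_trial": [0, 100, 200],
            "i_stop_in_trial": [100, 200, 300],
            "target": [0, 1, 0],
        }
    )
    ds = FeaturesDataset(features, metadata=metadata, description={"subject": 1})

    X, y, crop_inds = ds[0]  # feature vector, target, [i_window, i_start, i_stop]
    assert len(ds) == 3      # one sample per feature row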
@@ -75,18 +108,27 @@ class FeaturesDataset(EEGWindowsDataset):
         y = self.y[index]
         return X, y, crop_inds
 
-    def __len__(self):
+    def __len__(self) -> int:
+        """Return the number of samples in the dataset.
+
+        Returns
+        -------
+        int
+            The total number of feature samples.
+
+        """
         return len(self.features.index)
 
 
 def _compute_stats(
     ds: FeaturesDataset,
-    return_count=False,
-    return_mean=False,
-    return_var=False,
-    ddof=1,
-    numeric_only=False,
-):
+    return_count: bool = False,
+    return_mean: bool = False,
+    return_var: bool = False,
+    ddof: int = 1,
+    numeric_only: bool = False,
+) -> tuple:
+    """Compute statistics for a single :class:`~eegdash.features.datasets.FeaturesDataset`."""
    res = []
     if return_count:
         res.append(ds.features.count(numeric_only=numeric_only))
@@ -97,7 +139,14 @@ def _compute_stats(
     return tuple(res)
 
 
-def _pooled_var(counts, means, variances, ddof, ddof_in=None):
+def _pooled_var(
+    counts: np.ndarray,
+    means: np.ndarray,
+    variances: np.ndarray,
+    ddof: int,
+    ddof_in: int | None = None,
+) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """Compute pooled variance across multiple datasets."""
     if ddof_in is None:
         ddof_in = ddof
     count = counts.sum(axis=0)
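The _pooled_var helper combines per-dataset counts, means, and variances into dataset-wide statistics. Its full body is not shown in this hunk; the following numpy sketch is only the standard pooled-variance formula it is named after, with assumed array shapes and ddof handling, not a copy of the package's code:

    import numpy as np

    counts = np.array([[10.0], [20.0]])      # n_i, one row per dataset, one column per feature
    means = np.array([[1.0], [2.0]])         # m_i
    variances = np.array([[0.5], [0.8]])     # v_i, computed with ddof_in degrees of freedom
    ddof, ddof_in = 1, 1

    count = counts.sum(axis=0)
    mean = ((counts / count) * means).sum(axis=0)
    # Within-dataset spread plus between-dataset mean shifts, renormalized with ddof.
    ssq = ((counts - ddof_in) * variances + counts * (means - mean) ** 2).sum(axis=0)
    var = ssq / (count - ddof)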
@@ -110,17 +159,20 @@ def _pooled_var(counts, means, variances, ddof, ddof_in=None):
 
 
 class FeaturesConcatDataset(BaseConcatDataset):
-    """A base class for concatenated datasets.
+    """A concatenated dataset of :class:`~eegdash.features.datasets.FeaturesDataset` objects.
+
+    This class holds a list of :class:`~eegdash.features.datasets.FeaturesDataset` instances and allows
+    them to be treated as a single, larger dataset. It provides methods for
 
-    Holds either mne.Raw or mne.Epoch in self.datasets and has
-    a pandas DataFrame with additional description.
+    splitting, saving, and performing DataFrame-like operations (e.g., `mean`,
+    `var`, `fillna`) across all contained datasets.
 
     Parameters
     ----------
-    list_of_ds : list
-        list of BaseDataset, BaseConcatDataset or WindowsDataset
-    target_transform : callable | None
-        Optional function to call on targets before returning them.
+    list_of_ds : list of ~eegdash.features.datasets.FeaturesDataset
+        A list of :class:`~eegdash.features.datasets.FeaturesDataset` objects to concatenate.
+    target_transform : callable, optional
+        A function to apply to the target values before they are returned.
 
     """
 
@@ -140,26 +192,28 @@ class FeaturesConcatDataset(BaseConcatDataset):
         self,
         by: str | list[int] | list[list[int]] | dict[str, list[int]],
     ) -> dict[str, FeaturesConcatDataset]:
-        """Split the dataset based on information listed in its description.
+        """Split the dataset into subsets.
 
-        The format could be based on a DataFrame or based on indices.
+        The splitting can be done based on a column in the description
+        DataFrame or by providing explicit indices for each split.
 
         Parameters
         ----------
-        by : str | list | dict
-            If ``by`` is a string, splitting is performed based on the
-            description DataFrame column with this name.
-            If ``by`` is a (list of) list of integers, the position in the first
-            list corresponds to the split id and the integers to the
-            datapoints of that split.
-            If a dict then each key will be used in the returned
-            splits dict and each value should be a list of int.
+        by : str or list or dict
+            - If a string, splits are created for each unique value in the
+              description column `by`.
+            - If a list of integers, a single split is created containing the
+              datasets at the specified indices.
+            - If a list of lists of integers, multiple splits are created, one
+              for each sublist of indices.
+            - If a dictionary, keys are used as split names and values are
+              lists of dataset indices.
 
         Returns
         -------
-        splits : dict
-            A dictionary with the name of the split (a string) as key and the
-            dataset as value.
+        dict[str, ~eegdash.features.datasets.FeaturesConcatDataset]
+            A dictionary where keys are split names and values are the new
+            :class:`~eegdash.features.datasets.FeaturesConcatDataset` subsets.
 
         """
         if isinstance(by, str):
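Per the rewritten split docstring, all four forms of `by` resolve to named subsets. A short sketch, with concat_ds standing in for a FeaturesConcatDataset and the 'subject' column purely illustrative:

    by_subject = concat_ds.split("subject")                    # one subset per unique value
    first_two = concat_ds.split([0, 1])                        # single split of datasets 0 and 1
    folds = concat_ds.split([[0, 1], [2, 3]])                  # split "0" and split "1"
    valid = concat_ds.split({"train": [0, 1], "valid": [2]})["valid"]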
@@ -184,14 +238,22 @@ class FeaturesConcatDataset(BaseConcatDataset):
         }
 
     def get_metadata(self) -> pd.DataFrame:
-        """Concatenate the metadata and description of the wrapped Epochs.
+        """Get the metadata of all datasets as a single DataFrame.
+
+        Concatenates the metadata from all contained datasets and adds columns
+        from their `description` attributes.
 
         Returns
         -------
-        metadata : pd.DataFrame
-            DataFrame containing as many rows as there are windows in the
-            BaseConcatDataset, with the metadata and description information
-            for each window.
+        pandas.DataFrame
+            A DataFrame containing the metadata for every sample in the
+            concatenated dataset.
+
+        Raises
+        ------
+        TypeError
+            If any of the contained datasets is not a
+            :class:`~eegdash.features.datasets.FeaturesDataset`.
 
         """
         if not all([isinstance(ds, FeaturesDataset) for ds in self.datasets]):
@@ -202,60 +264,59 @@ class FeaturesConcatDataset(BaseConcatDataset):
 
         all_dfs = list()
         for ds in self.datasets:
-            df = ds.metadata
+            df = ds.metadata.copy()
             for k, v in ds.description.items():
                 df[k] = v
             all_dfs.append(df)
 
         return pd.concat(all_dfs)
 
-    def save(self, path: str, overwrite: bool = False, offset: int = 0):
-        """Save datasets to files by creating one subdirectory for each dataset:
-        path/
-            0/
-                0-feat.parquet
-                metadata_df.pkl
-                description.json
-                raw-info.fif (if raw info was saved)
-                raw_preproc_kwargs.json (if raws were preprocessed)
-                window_kwargs.json (if this is a windowed dataset)
-                window_preproc_kwargs.json (if windows were preprocessed)
-                features_kwargs.json
-            1/
-                1-feat.parquet
-                metadata_df.pkl
-                description.json
-                raw-info.fif (if raw info was saved)
-                raw_preproc_kwargs.json (if raws were preprocessed)
-                window_kwargs.json (if this is a windowed dataset)
-                window_preproc_kwargs.json (if windows were preprocessed)
-                features_kwargs.json
+    def save(self, path: str, overwrite: bool = False, offset: int = 0) -> None:
+        """Save the concatenated dataset to a directory.
+
+        Creates a directory structure where each contained dataset is saved in
+        its own numbered subdirectory.
+
+        .. code-block::
+
+            path/
+                0/
+                    0-feat.parquet
+                    metadata_df.pkl
+                    description.json
+                    ...
+                1/
+                    1-feat.parquet
+                    ...
 
         Parameters
         ----------
         path : str
-            Directory in which subdirectories are created to store
-            -feat.parquet and .json files to.
-        overwrite : bool
-            Whether to delete old subdirectories that will be saved to in this
-            call.
-        offset : int
-            If provided, the integer is added to the id of the dataset in the
-            concat. This is useful in the setting of very large datasets, where
-            one dataset has to be processed and saved at a time to account for
-            its original position.
+            The directory where the dataset will be saved.
+        overwrite : bool, default False
+            If True, any existing subdirectories that conflict with the new
+            ones will be removed.
+        offset : int, default 0
+            An integer to add to the subdirectory names. Useful for saving
+            datasets in chunks.
+
+        Raises
+        ------
+        ValueError
+            If the dataset is empty.
+        FileExistsError
+            If a subdirectory already exists and `overwrite` is False.
 
         """
         if len(self.datasets) == 0:
             raise ValueError("Expect at least one dataset")
         path_contents = os.listdir(path)
-        n_sub_dirs = len([os.path.isdir(e) for e in path_contents])
+        n_sub_dirs = len([os.path.isdir(os.path.join(path, e)) for e in path_contents])
         for i_ds, ds in enumerate(self.datasets):
-            # remove subdirectory from list of untouched files / subdirectories
-            if str(i_ds + offset) in path_contents:
-                path_contents.remove(str(i_ds + offset))
-            # save_dir/i_ds/
-            sub_dir = os.path.join(path, str(i_ds + offset))
+            sub_dir_name = str(i_ds + offset)
+            if sub_dir_name in path_contents:
+                path_contents.remove(sub_dir_name)
+            sub_dir = os.path.join(path, sub_dir_name)
             if os.path.exists(sub_dir):
                 if overwrite:
                     shutil.rmtree(sub_dir)
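Given the layout documented above, saving is a single call against an existing directory (the path here is illustrative); the Raises section covers the empty-dataset and existing-subdirectory cases:

    import os

    os.makedirs("features_out", exist_ok=True)   # save() lists the directory, so it must exist
    concat_ds.save("features_out", overwrite=True)
    # -> features_out/0/0-feat.parquet, features_out/0/metadata_df.pkl, ...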
@@ -265,35 +326,21 @@ class FeaturesConcatDataset(BaseConcatDataset):
                         f" a different directory, set overwrite=True, or "
                         f"resolve manually."
                     )
-            # save_dir/{i_ds+offset}/
             os.makedirs(sub_dir)
-            # save_dir/{i_ds+offset}/{i_ds+offset}-feat.parquet
             self._save_features(sub_dir, ds, i_ds, offset)
-            # save_dir/{i_ds+offset}/metadata_df.pkl
             self._save_metadata(sub_dir, ds)
-            # save_dir/{i_ds+offset}/description.json
             self._save_description(sub_dir, ds.description)
-            # save_dir/{i_ds+offset}/raw-info.fif
             self._save_raw_info(sub_dir, ds)
-            # save_dir/{i_ds+offset}/raw_preproc_kwargs.json
-            # save_dir/{i_ds+offset}/window_kwargs.json
-            # save_dir/{i_ds+offset}/window_preproc_kwargs.json
-            # save_dir/{i_ds+offset}/features_kwargs.json
             self._save_kwargs(sub_dir, ds)
-            if overwrite:
-                # the following will be True for all datasets preprocessed and
-                # stored in parallel with braindecode.preprocessing.preprocess
-                if i_ds + 1 + offset < n_sub_dirs:
-                    logger.warning(
-                        f"The number of saved datasets ({i_ds + 1 + offset}) "
-                        f"does not match the number of existing "
-                        f"subdirectories ({n_sub_dirs}). You may now "
-                        f"encounter a mix of differently preprocessed "
-                        f"datasets!",
-                        UserWarning,
-                    )
-        # if path contains files or directories that were not touched, raise
-        # warning
+            if overwrite and i_ds + 1 + offset < n_sub_dirs:
+                logger.warning(
+                    f"The number of saved datasets ({i_ds + 1 + offset}) "
+                    f"does not match the number of existing "
+                    f"subdirectories ({n_sub_dirs}). You may now "
+                    f"encounter a mix of differently preprocessed "
+                    f"datasets!",
+                    UserWarning,
+                )
         if path_contents:
             logger.warning(
                 f"Chosen directory {path} contains other "
@@ -301,20 +348,37 @@ class FeaturesConcatDataset(BaseConcatDataset):
             )
 
     @staticmethod
-    def _save_features(sub_dir, ds, i_ds, offset):
+    def _save_features(sub_dir: str, ds: FeaturesDataset, i_ds: int, offset: int):
+        """Save the feature DataFrame to a Parquet file."""
         parquet_file_name = f"{i_ds + offset}-feat.parquet"
         parquet_file_path = os.path.join(sub_dir, parquet_file_name)
         ds.features.to_parquet(parquet_file_path)
 
     @staticmethod
-    def _save_raw_info(sub_dir, ds):
-        if hasattr(ds, "raw_info"):
+    def _save_metadata(sub_dir: str, ds: FeaturesDataset):
+        """Save the metadata DataFrame to a pickle file."""
+        metadata_file_name = "metadata_df.pkl"
+        metadata_file_path = os.path.join(sub_dir, metadata_file_name)
+        ds.metadata.to_pickle(metadata_file_path)
+
+    @staticmethod
+    def _save_description(sub_dir: str, description: pd.Series):
+        """Save the description Series to a JSON file."""
+        desc_file_name = "description.json"
+        desc_file_path = os.path.join(sub_dir, desc_file_name)
+        description.to_json(desc_file_path)
+
+    @staticmethod
+    def _save_raw_info(sub_dir: str, ds: FeaturesDataset):
+        """Save the raw info dictionary to a FIF file if it exists."""
+        if hasattr(ds, "raw_info") and ds.raw_info is not None:
             fif_file_name = "raw-info.fif"
             fif_file_path = os.path.join(sub_dir, fif_file_name)
-            ds.raw_info.save(fif_file_path)
+            ds.raw_info.save(fif_file_path, overwrite=True)
 
     @staticmethod
-    def _save_kwargs(sub_dir, ds):
+    def _save_kwargs(sub_dir: str, ds: FeaturesDataset):
+        """Save various keyword argument dictionaries to JSON files."""
         for kwargs_name in [
             "raw_preproc_kwargs",
             "window_kwargs",
@@ -322,10 +386,10 @@ class FeaturesConcatDataset(BaseConcatDataset):
             "features_kwargs",
         ]:
             if hasattr(ds, kwargs_name):
-                kwargs_file_name = ".".join([kwargs_name, "json"])
-                kwargs_file_path = os.path.join(sub_dir, kwargs_file_name)
                 kwargs = getattr(ds, kwargs_name)
                 if kwargs is not None:
+                    kwargs_file_name = ".".join([kwargs_name, "json"])
+                    kwargs_file_path = os.path.join(sub_dir, kwargs_file_name)
                     with open(kwargs_file_path, "w") as f:
                         json.dump(kwargs, f)
 
@@ -334,7 +398,25 @@ class FeaturesConcatDataset(BaseConcatDataset):
         include_metadata: bool | str | List[str] = False,
         include_target: bool = False,
         include_crop_inds: bool = False,
-    ):
+    ) -> pd.DataFrame:
+        """Convert the dataset to a single pandas DataFrame.
+
+        Parameters
+        ----------
+        include_metadata : bool or str or list of str, default False
+            If True, include all metadata columns. If a string or list of
+            strings, include only the specified metadata columns.
+        include_target : bool, default False
+            If True, include the 'target' column.
+        include_crop_inds : bool, default False
+            If True, include window cropping index columns.
+
+        Returns
+        -------
+        pandas.DataFrame
+            A DataFrame containing the features and requested metadata.
+
+        """
         if (
             not isinstance(include_metadata, bool)
             or include_metadata
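A usage sketch for the documented to_dataframe options, with concat_ds again standing in for a FeaturesConcatDataset:

    df = concat_ds.to_dataframe(include_target=True, include_crop_inds=True)
    # one row per window: feature columns plus 'target', 'i_dataset', and the i_*_in_trial columns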
@@ -343,7 +425,7 @@ class FeaturesConcatDataset(BaseConcatDataset):
             include_dataset = False
             if isinstance(include_metadata, bool) and include_metadata:
                 include_dataset = True
-                cols = self.datasets[0].metadata.columns
+                cols = self.datasets[0].metadata.columns.tolist()
             else:
                 cols = include_metadata
                 if isinstance(cols, bool) and not cols:
@@ -352,13 +434,14 @@ class FeaturesConcatDataset(BaseConcatDataset):
                     cols = [cols]
             cols = set(cols)
             if include_crop_inds:
-                cols = {
-                    "i_dataset",
-                    "i_window_in_trial",
-                    "i_start_in_trial",
-                    "i_stop_in_trial",
-                    *cols,
-                }
+                cols.update(
+                    {
+                        "i_dataset",
+                        "i_window_in_trial",
+                        "i_start_in_trial",
+                        "i_stop_in_trial",
+                    }
+                )
             if include_target:
                 cols.add("target")
             cols = list(cols)
@@ -381,10 +464,26 @@ class FeaturesConcatDataset(BaseConcatDataset):
         dataframes = [ds.features for ds in self.datasets]
         return pd.concat(dataframes, axis=0, ignore_index=True)
 
-    def _numeric_columns(self):
+    def _numeric_columns(self) -> pd.Index:
+        """Get the names of numeric columns from the feature DataFrames."""
         return self.datasets[0].features.select_dtypes(include=np.number).columns
 
-    def count(self, numeric_only=False, n_jobs=1):
+    def count(self, numeric_only: bool = False, n_jobs: int = 1) -> pd.Series:
+        """Count non-NA cells for each feature column.
+
+        Parameters
+        ----------
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        n_jobs : int, default 1
+            Number of jobs to run in parallel.
+
+        Returns
+        -------
+        pandas.Series
+            The count of non-NA cells for each column.
+
+        """
         stats = Parallel(n_jobs)(
             delayed(_compute_stats)(ds, return_count=True, numeric_only=numeric_only)
             for ds in self.datasets
@@ -393,7 +492,22 @@ class FeaturesConcatDataset(BaseConcatDataset):
         count = counts.sum(axis=0)
         return pd.Series(count, index=self._numeric_columns())
 
-    def mean(self, numeric_only=False, n_jobs=1):
+    def mean(self, numeric_only: bool = False, n_jobs: int = 1) -> pd.Series:
+        """Compute the mean for each feature column.
+
+        Parameters
+        ----------
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        n_jobs : int, default 1
+            Number of jobs to run in parallel.
+
+        Returns
+        -------
+        pandas.Series
+            The mean of each column.
+
+        """
         stats = Parallel(n_jobs)(
             delayed(_compute_stats)(
                 ds, return_count=True, return_mean=True, numeric_only=numeric_only
@@ -405,7 +519,26 @@ class FeaturesConcatDataset(BaseConcatDataset):
         mean = np.sum((counts / count) * means, axis=0)
         return pd.Series(mean, index=self._numeric_columns())
 
-    def var(self, ddof=1, numeric_only=False, n_jobs=1):
+    def var(
+        self, ddof: int = 1, numeric_only: bool = False, n_jobs: int = 1
+    ) -> pd.Series:
+        """Compute the variance for each feature column.
+
+        Parameters
+        ----------
+        ddof : int, default 1
+            Delta Degrees of Freedom. The divisor used in calculations is N - ddof.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        n_jobs : int, default 1
+            Number of jobs to run in parallel.
+
+        Returns
+        -------
+        pandas.Series
+            The variance of each column.
+
+        """
         stats = Parallel(n_jobs)(
             delayed(_compute_stats)(
                 ds,
@@ -425,12 +558,50 @@ class FeaturesConcatDataset(BaseConcatDataset):
         _, _, var = _pooled_var(counts, means, variances, ddof, ddof_in=0)
         return pd.Series(var, index=self._numeric_columns())
 
-    def std(self, ddof=1, numeric_only=False, eps=0, n_jobs=1):
+    def std(
+        self, ddof: int = 1, numeric_only: bool = False, eps: float = 0, n_jobs: int = 1
+    ) -> pd.Series:
+        """Compute the standard deviation for each feature column.
+
+        Parameters
+        ----------
+        ddof : int, default 1
+            Delta Degrees of Freedom.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        eps : float, default 0
+            A small epsilon value to add to the variance before taking the
+            square root to avoid numerical instability.
+        n_jobs : int, default 1
+            Number of jobs to run in parallel.
+
+        Returns
+        -------
+        pandas.Series
+            The standard deviation of each column.
+
+        """
         return np.sqrt(
             self.var(ddof=ddof, numeric_only=numeric_only, n_jobs=n_jobs) + eps
         )
 
-    def zscore(self, ddof=1, numeric_only=False, eps=0, n_jobs=1):
+    def zscore(
+        self, ddof: int = 1, numeric_only: bool = False, eps: float = 0, n_jobs: int = 1
+    ) -> None:
+        """Apply z-score normalization to numeric columns in-place.
+
+        Parameters
+        ----------
+        ddof : int, default 1
+            Delta Degrees of Freedom for variance calculation.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        eps : float, default 0
+            Epsilon for numerical stability.
+        n_jobs : int, default 1
+            Number of jobs to run in parallel for statistics computation.
+
+        """
         stats = Parallel(n_jobs)(
             delayed(_compute_stats)(
                 ds,
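Taken together, the statistics helpers documented above support a whole-dataset normalization pass along these lines (parameter values are illustrative):

    mu = concat_ds.mean(numeric_only=True, n_jobs=4)
    sigma = concat_ds.std(numeric_only=True, eps=1e-8, n_jobs=4)
    concat_ds.zscore(numeric_only=True, eps=1e-8, n_jobs=4)   # standardizes ds.features in-place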
@@ -450,10 +621,13 @@ class FeaturesConcatDataset(BaseConcatDataset):
         _, mean, var = _pooled_var(counts, means, variances, ddof, ddof_in=0)
         std = np.sqrt(var + eps)
         for ds in self.datasets:
-            ds.features = (ds.features - mean) / std
+            ds.features.loc[:, self._numeric_columns()] = (
+                ds.features.loc[:, self._numeric_columns()] - mean
+            ) / std
 
     @staticmethod
-    def _enforce_inplace_operations(func_name, kwargs):
+    def _enforce_inplace_operations(func_name: str, kwargs: dict):
+        """Raise an error if 'inplace=False' is passed to a method."""
         if "inplace" in kwargs and kwargs["inplace"] is False:
             raise ValueError(
                 f"{func_name} only works inplace, please change "
@@ -461,33 +635,49 @@ class FeaturesConcatDataset(BaseConcatDataset):
             )
         kwargs["inplace"] = True
 
-    def fillna(self, *args, **kwargs):
+    def fillna(self, *args, **kwargs) -> None:
+        """Fill NA/NaN values in-place. See :meth:`pandas.DataFrame.fillna`."""
         FeaturesConcatDataset._enforce_inplace_operations("fillna", kwargs)
         for ds in self.datasets:
             ds.features.fillna(*args, **kwargs)
 
-    def replace(self, *args, **kwargs):
+    def replace(self, *args, **kwargs) -> None:
+        """Replace values in-place. See :meth:`pandas.DataFrame.replace`."""
         FeaturesConcatDataset._enforce_inplace_operations("replace", kwargs)
         for ds in self.datasets:
             ds.features.replace(*args, **kwargs)
 
-    def interpolate(self, *args, **kwargs):
+    def interpolate(self, *args, **kwargs) -> None:
+        """Interpolate values in-place. See :meth:`pandas.DataFrame.interpolate`."""
         FeaturesConcatDataset._enforce_inplace_operations("interpolate", kwargs)
         for ds in self.datasets:
             ds.features.interpolate(*args, **kwargs)
 
-    def dropna(self, *args, **kwargs):
+    def dropna(self, *args, **kwargs) -> None:
+        """Remove missing values in-place. See :meth:`pandas.DataFrame.dropna`."""
         FeaturesConcatDataset._enforce_inplace_operations("dropna", kwargs)
         for ds in self.datasets:
             ds.features.dropna(*args, **kwargs)
 
-    def drop(self, *args, **kwargs):
+    def drop(self, *args, **kwargs) -> None:
+        """Drop specified labels from rows or columns in-place. See :meth:`pandas.DataFrame.drop`."""
         FeaturesConcatDataset._enforce_inplace_operations("drop", kwargs)
         for ds in self.datasets:
             ds.features.drop(*args, **kwargs)
 
-    def join(self, concat_dataset: FeaturesConcatDataset, **kwargs):
+    def join(self, concat_dataset: FeaturesConcatDataset, **kwargs) -> None:
+        """Join columns with other FeaturesConcatDataset in-place.
+
+        Parameters
+        ----------
+        concat_dataset : FeaturesConcatDataset
+            The dataset to join with. Must have the same number of datasets,
+            and each corresponding dataset must have the same length.
+        **kwargs
+            Keyword arguments to pass to :meth:`pandas.DataFrame.join`.
+
+        """
         assert len(self.datasets) == len(concat_dataset.datasets)
         for ds1, ds2 in zip(self.datasets, concat_dataset.datasets):
             assert len(ds1) == len(ds2)
-            ds1.features.join(ds2, **kwargs)
+            ds1.features = ds1.features.join(ds2.features, **kwargs)
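Finally, the in-place DataFrame-style helpers, including the corrected join, would be used roughly as follows; other_concat_ds is a hypothetical second FeaturesConcatDataset with matching datasets, and the column name is illustrative:

    concat_ds.fillna(0.0)                                # passing inplace=False raises ValueError
    concat_ds.drop(columns=["beta_power"])               # applied to each ds.features in turn
    concat_ds.join(other_concat_ds, rsuffix="_extra")    # joins ds.features pairwise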