eegdash 0.3.9.dev170082126__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.

Potentially problematic release: this version of eegdash might be problematic.

@@ -3,7 +3,6 @@ from __future__ import annotations
 import json
 import os
 import shutil
-import warnings
 from collections.abc import Callable
 from typing import Dict, List

@@ -17,22 +16,38 @@ from braindecode.datasets.base import (
     _create_description,
 )

+from ..logging import logger
+

 class FeaturesDataset(EEGWindowsDataset):
-    """Returns samples from a pandas DataFrame object along with a target.
+    """A dataset of features extracted from EEG windows.

-    Dataset which serves samples from a pandas DataFrame object along with a
-    target. The target is unique for the dataset, and is obtained through the
-    `description` attribute.
+    This class holds features in a pandas DataFrame and provides an interface
+    compatible with braindecode's dataset structure. Each row in the feature
+    DataFrame corresponds to a single sample (e.g., an EEG window).

     Parameters
     ----------
-    features : a pandas DataFrame
-        Tabular data.
-    description : dict | pandas.Series | None
-        Holds additional description about the continuous signal / subject.
-    transform : callable | None
-        On-the-fly transform applied to the example before it is returned.
+    features : pandas.DataFrame
+        A DataFrame where each row is a sample and each column is a feature.
+    metadata : pandas.DataFrame, optional
+        A DataFrame containing metadata for each sample, indexed consistently
+        with `features`. Must include columns 'i_window_in_trial',
+        'i_start_in_trial', 'i_stop_in_trial', and 'target'.
+    description : dict or pandas.Series, optional
+        Additional high-level information about the dataset (e.g., subject ID).
+    transform : callable, optional
+        A function or transform to apply to the feature data on-the-fly.
+    raw_info : dict, optional
+        Information about the original raw recording, for provenance.
+    raw_preproc_kwargs : dict, optional
+        Keyword arguments used for preprocessing the raw data.
+    window_kwargs : dict, optional
+        Keyword arguments used for windowing the data.
+    window_preproc_kwargs : dict, optional
+        Keyword arguments used for preprocessing the windowed data.
+    features_kwargs : dict, optional
+        Keyword arguments used for feature extraction.

     """

@@ -64,7 +79,21 @@ class FeaturesDataset(EEGWindowsDataset):
         ].to_numpy()
         self.y = metadata.loc[:, "target"].to_list()

-    def __getitem__(self, index):
+    def __getitem__(self, index: int) -> tuple[np.ndarray, int, list]:
+        """Get a single sample from the dataset.
+
+        Parameters
+        ----------
+        index : int
+            The index of the sample to retrieve.
+
+        Returns
+        -------
+        tuple
+            A tuple containing the feature vector (X), the target (y), and the
+            cropping indices.
+
+        """
         crop_inds = self.crop_inds[index].tolist()
         X = self.features.iloc[index].to_numpy()
         X = X.copy()
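Taken together, the rewritten class docstring and `__getitem__` describe a plain tabular container. A minimal construction sketch follows; the import path, column names, and values are illustrative assumptions, not taken from the diff:

    import pandas as pd

    # Assumed import path; the diff does not show the module location.
    from eegdash.features.datasets import FeaturesDataset

    # Toy feature table: one row per EEG window, one column per feature.
    features = pd.DataFrame(
        {"alpha_power": [1.0, 2.0, 3.0], "beta_power": [0.5, 0.4, 0.6]}
    )
    # Metadata must carry the bookkeeping columns named in the docstring.
    metadata = pd.DataFrame(
        {
            "i_window_in_trial": [0, 1, 2],
            "i_start_in_trial": [0, 100, 200],
            "i_stop_in_trial": [100, 200, 300],
            "target": [0, 1, 0],
        }
    )

    ds = FeaturesDataset(features, metadata=metadata, description={"subject": "01"})
    X, y, crop_inds = ds[0]  # feature vector, target, crop indices
    print(len(ds))           # 3 windows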
@@ -74,18 +103,27 @@ class FeaturesDataset(EEGWindowsDataset):
         y = self.y[index]
         return X, y, crop_inds

-    def __len__(self):
+    def __len__(self) -> int:
+        """Return the number of samples in the dataset.
+
+        Returns
+        -------
+        int
+            The total number of feature samples.
+
+        """
         return len(self.features.index)


 def _compute_stats(
     ds: FeaturesDataset,
-    return_count=False,
-    return_mean=False,
-    return_var=False,
-    ddof=1,
-    numeric_only=False,
-):
+    return_count: bool = False,
+    return_mean: bool = False,
+    return_var: bool = False,
+    ddof: int = 1,
+    numeric_only: bool = False,
+) -> tuple:
+    """Compute statistics for a single FeaturesDataset."""
     res = []
     if return_count:
         res.append(ds.features.count(numeric_only=numeric_only))
@@ -96,7 +134,14 @@ def _compute_stats(
     return tuple(res)


-def _pooled_var(counts, means, variances, ddof, ddof_in=None):
+def _pooled_var(
+    counts: np.ndarray,
+    means: np.ndarray,
+    variances: np.ndarray,
+    ddof: int,
+    ddof_in: int | None = None,
+) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """Compute pooled variance across multiple datasets."""
     if ddof_in is None:
         ddof_in = ddof
     count = counts.sum(axis=0)
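For context, `_pooled_var` combines per-dataset counts, means, and variances so that global statistics never require concatenating the feature tables. A small NumPy sanity check of the pooled-variance identity this relies on (independent of the helper's exact signature):

    import numpy as np

    rng = np.random.default_rng(0)
    chunks = [rng.normal(size=n) for n in (50, 80, 120)]  # three "datasets"

    counts = np.array([len(c) for c in chunks])
    means = np.array([c.mean() for c in chunks])
    variances = np.array([c.var(ddof=0) for c in chunks])  # per-chunk, ddof_in=0

    count = counts.sum()
    mean = np.sum((counts / count) * means)
    # Within-chunk spread plus spread of the chunk means, normalized with ddof=1.
    var = np.sum(counts * (variances + (means - mean) ** 2)) / (count - 1)

    assert np.isclose(var, np.concatenate(chunks).var(ddof=1))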
@@ -109,17 +154,20 @@ def _pooled_var(counts, means, variances, ddof, ddof_in=None):


 class FeaturesConcatDataset(BaseConcatDataset):
-    """A base class for concatenated datasets.
+    """A concatenated dataset of `FeaturesDataset` objects.
+
+    This class holds a list of :class:`FeaturesDataset` instances and allows
+    them to be treated as a single, larger dataset. It provides methods for

-    Holds either mne.Raw or mne.Epoch in self.datasets and has
-    a pandas DataFrame with additional description.
+    splitting, saving, and performing DataFrame-like operations (e.g., `mean`,
+    `var`, `fillna`) across all contained datasets.

     Parameters
     ----------
-    list_of_ds : list
-        list of BaseDataset, BaseConcatDataset or WindowsDataset
-    target_transform : callable | None
-        Optional function to call on targets before returning them.
+    list_of_ds : list of FeaturesDataset
+        A list of :class:`FeaturesDataset` objects to concatenate.
+    target_transform : callable, optional
+        A function to apply to the target values before they are returned.

     """

@@ -139,26 +187,28 @@ class FeaturesConcatDataset(BaseConcatDataset):
         self,
         by: str | list[int] | list[list[int]] | dict[str, list[int]],
     ) -> dict[str, FeaturesConcatDataset]:
-        """Split the dataset based on information listed in its description.
+        """Split the dataset into subsets.

-        The format could be based on a DataFrame or based on indices.
+        The splitting can be done based on a column in the description
+        DataFrame or by providing explicit indices for each split.

         Parameters
         ----------
-        by : str | list | dict
-            If ``by`` is a string, splitting is performed based on the
-            description DataFrame column with this name.
-            If ``by`` is a (list of) list of integers, the position in the first
-            list corresponds to the split id and the integers to the
-            datapoints of that split.
-            If a dict then each key will be used in the returned
-            splits dict and each value should be a list of int.
+        by : str or list or dict
+            - If a string, splits are created for each unique value in the
+              description column `by`.
+            - If a list of integers, a single split is created containing the
+              datasets at the specified indices.
+            - If a list of lists of integers, multiple splits are created, one
+              for each sublist of indices.
+            - If a dictionary, keys are used as split names and values are
+              lists of dataset indices.

         Returns
         -------
-        splits : dict
-            A dictionary with the name of the split (a string) as key and the
-            dataset as value.
+        dict[str, FeaturesConcatDataset]
+            A dictionary where keys are split names and values are the new
+            :class:`FeaturesConcatDataset` subsets.

         """
         if isinstance(by, str):
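The four documented forms of `by` could be exercised roughly as follows; `concat` and the "subject" description column are illustrative, not part of the diff:

    by_subject = concat.split("subject")                   # one split per unique subject value
    single = concat.split([0, 1])                          # one split holding datasets 0 and 1
    folds = concat.split([[0, 1], [2, 3]])                 # two splits, one per index list
    named = concat.split({"train": [0, 1], "valid": [2]})  # dict keys become split names
    train = named["train"]                                 # each value is a FeaturesConcatDataset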
@@ -183,14 +233,21 @@ class FeaturesConcatDataset(BaseConcatDataset):
         }

     def get_metadata(self) -> pd.DataFrame:
-        """Concatenate the metadata and description of the wrapped Epochs.
+        """Get the metadata of all datasets as a single DataFrame.
+
+        Concatenates the metadata from all contained datasets and adds columns
+        from their `description` attributes.

         Returns
         -------
-        metadata : pd.DataFrame
-            DataFrame containing as many rows as there are windows in the
-            BaseConcatDataset, with the metadata and description information
-            for each window.
+        pandas.DataFrame
+            A DataFrame containing the metadata for every sample in the
+            concatenated dataset.
+
+        Raises
+        ------
+        TypeError
+            If any of the contained datasets is not a :class:`FeaturesDataset`.

         """
         if not all([isinstance(ds, FeaturesDataset) for ds in self.datasets]):
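Based on the docstring above, a quick usage sketch, again assuming a `concat` object built from FeaturesDataset instances with a "subject" entry in each description:

    meta = concat.get_metadata()
    # One row per window; description keys such as "subject" appear as extra columns.
    print(meta[["target", "subject"]].head())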
@@ -201,60 +258,59 @@ class FeaturesConcatDataset(BaseConcatDataset):

         all_dfs = list()
         for ds in self.datasets:
-            df = ds.metadata
+            df = ds.metadata.copy()
             for k, v in ds.description.items():
                 df[k] = v
             all_dfs.append(df)

         return pd.concat(all_dfs)

-    def save(self, path: str, overwrite: bool = False, offset: int = 0):
-        """Save datasets to files by creating one subdirectory for each dataset:
-        path/
-            0/
-                0-feat.parquet
-                metadata_df.pkl
-                description.json
-                raw-info.fif (if raw info was saved)
-                raw_preproc_kwargs.json (if raws were preprocessed)
-                window_kwargs.json (if this is a windowed dataset)
-                window_preproc_kwargs.json (if windows were preprocessed)
-                features_kwargs.json
-            1/
-                1-feat.parquet
-                metadata_df.pkl
-                description.json
-                raw-info.fif (if raw info was saved)
-                raw_preproc_kwargs.json (if raws were preprocessed)
-                window_kwargs.json (if this is a windowed dataset)
-                window_preproc_kwargs.json (if windows were preprocessed)
-                features_kwargs.json
+    def save(self, path: str, overwrite: bool = False, offset: int = 0) -> None:
+        """Save the concatenated dataset to a directory.
+
+        Creates a directory structure where each contained dataset is saved in
+        its own numbered subdirectory.
+
+        .. code-block::
+
+            path/
+                0/
+                    0-feat.parquet
+                    metadata_df.pkl
+                    description.json
+                    ...
+                1/
+                    1-feat.parquet
+                    ...

         Parameters
         ----------
         path : str
-            Directory in which subdirectories are created to store
-            -feat.parquet and .json files to.
-        overwrite : bool
-            Whether to delete old subdirectories that will be saved to in this
-            call.
-        offset : int
-            If provided, the integer is added to the id of the dataset in the
-            concat. This is useful in the setting of very large datasets, where
-            one dataset has to be processed and saved at a time to account for
-            its original position.
+            The directory where the dataset will be saved.
+        overwrite : bool, default False
+            If True, any existing subdirectories that conflict with the new
+            ones will be removed.
+        offset : int, default 0
+            An integer to add to the subdirectory names. Useful for saving
+            datasets in chunks.
+
+        Raises
+        ------
+        ValueError
+            If the dataset is empty.
+        FileExistsError
+            If a subdirectory already exists and `overwrite` is False.

         """
         if len(self.datasets) == 0:
             raise ValueError("Expect at least one dataset")
         path_contents = os.listdir(path)
-        n_sub_dirs = len([os.path.isdir(e) for e in path_contents])
+        n_sub_dirs = len([os.path.isdir(os.path.join(path, e)) for e in path_contents])
         for i_ds, ds in enumerate(self.datasets):
-            # remove subdirectory from list of untouched files / subdirectories
-            if str(i_ds + offset) in path_contents:
-                path_contents.remove(str(i_ds + offset))
-            # save_dir/i_ds/
-            sub_dir = os.path.join(path, str(i_ds + offset))
+            sub_dir_name = str(i_ds + offset)
+            if sub_dir_name in path_contents:
+                path_contents.remove(sub_dir_name)
+            sub_dir = os.path.join(path, sub_dir_name)
             if os.path.exists(sub_dir):
                 if overwrite:
                     shutil.rmtree(sub_dir)
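A sketch of chunked saving with `offset`, as described in the new docstring; the directory name and the two concat objects are made up:

    import os

    out_dir = "features_cache"  # illustrative; save() expects the directory to exist
    os.makedirs(out_dir, exist_ok=True)

    concat_a.save(out_dir)                                  # writes 0/, 1/, ...
    concat_b.save(out_dir, offset=len(concat_a.datasets))   # continues the numbering
    # With overwrite=True, conflicting subdirectories are removed instead of
    # raising FileExistsError.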
@@ -264,56 +320,59 @@ class FeaturesConcatDataset(BaseConcatDataset):
                         f" a different directory, set overwrite=True, or "
                         f"resolve manually."
                     )
-            # save_dir/{i_ds+offset}/
             os.makedirs(sub_dir)
-            # save_dir/{i_ds+offset}/{i_ds+offset}-feat.parquet
             self._save_features(sub_dir, ds, i_ds, offset)
-            # save_dir/{i_ds+offset}/metadata_df.pkl
             self._save_metadata(sub_dir, ds)
-            # save_dir/{i_ds+offset}/description.json
             self._save_description(sub_dir, ds.description)
-            # save_dir/{i_ds+offset}/raw-info.fif
             self._save_raw_info(sub_dir, ds)
-            # save_dir/{i_ds+offset}/raw_preproc_kwargs.json
-            # save_dir/{i_ds+offset}/window_kwargs.json
-            # save_dir/{i_ds+offset}/window_preproc_kwargs.json
-            # save_dir/{i_ds+offset}/features_kwargs.json
             self._save_kwargs(sub_dir, ds)
-            if overwrite:
-                # the following will be True for all datasets preprocessed and
-                # stored in parallel with braindecode.preprocessing.preprocess
-                if i_ds + 1 + offset < n_sub_dirs:
-                    warnings.warn(
-                        f"The number of saved datasets ({i_ds + 1 + offset}) "
-                        f"does not match the number of existing "
-                        f"subdirectories ({n_sub_dirs}). You may now "
-                        f"encounter a mix of differently preprocessed "
-                        f"datasets!",
-                        UserWarning,
-                    )
-        # if path contains files or directories that were not touched, raise
-        # warning
+            if overwrite and i_ds + 1 + offset < n_sub_dirs:
+                logger.warning(
+                    f"The number of saved datasets ({i_ds + 1 + offset}) "
+                    f"does not match the number of existing "
+                    f"subdirectories ({n_sub_dirs}). You may now "
+                    f"encounter a mix of differently preprocessed "
+                    f"datasets!",
+                    UserWarning,
+                )
         if path_contents:
-            warnings.warn(
+            logger.warning(
                 f"Chosen directory {path} contains other "
                 f"subdirectories or files {path_contents}."
             )

     @staticmethod
-    def _save_features(sub_dir, ds, i_ds, offset):
+    def _save_features(sub_dir: str, ds: FeaturesDataset, i_ds: int, offset: int):
+        """Save the feature DataFrame to a Parquet file."""
         parquet_file_name = f"{i_ds + offset}-feat.parquet"
         parquet_file_path = os.path.join(sub_dir, parquet_file_name)
         ds.features.to_parquet(parquet_file_path)

     @staticmethod
-    def _save_raw_info(sub_dir, ds):
-        if hasattr(ds, "raw_info"):
+    def _save_metadata(sub_dir: str, ds: FeaturesDataset):
+        """Save the metadata DataFrame to a pickle file."""
+        metadata_file_name = "metadata_df.pkl"
+        metadata_file_path = os.path.join(sub_dir, metadata_file_name)
+        ds.metadata.to_pickle(metadata_file_path)
+
+    @staticmethod
+    def _save_description(sub_dir: str, description: pd.Series):
+        """Save the description Series to a JSON file."""
+        desc_file_name = "description.json"
+        desc_file_path = os.path.join(sub_dir, desc_file_name)
+        description.to_json(desc_file_path)
+
+    @staticmethod
+    def _save_raw_info(sub_dir: str, ds: FeaturesDataset):
+        """Save the raw info dictionary to a FIF file if it exists."""
+        if hasattr(ds, "raw_info") and ds.raw_info is not None:
             fif_file_name = "raw-info.fif"
             fif_file_path = os.path.join(sub_dir, fif_file_name)
-            ds.raw_info.save(fif_file_path)
+            ds.raw_info.save(fif_file_path, overwrite=True)

     @staticmethod
-    def _save_kwargs(sub_dir, ds):
+    def _save_kwargs(sub_dir: str, ds: FeaturesDataset):
+        """Save various keyword argument dictionaries to JSON files."""
         for kwargs_name in [
             "raw_preproc_kwargs",
             "window_kwargs",
@@ -321,10 +380,10 @@ class FeaturesConcatDataset(BaseConcatDataset):
             "features_kwargs",
         ]:
             if hasattr(ds, kwargs_name):
-                kwargs_file_name = ".".join([kwargs_name, "json"])
-                kwargs_file_path = os.path.join(sub_dir, kwargs_file_name)
                 kwargs = getattr(ds, kwargs_name)
                 if kwargs is not None:
+                    kwargs_file_name = ".".join([kwargs_name, "json"])
+                    kwargs_file_path = os.path.join(sub_dir, kwargs_file_name)
                     with open(kwargs_file_path, "w") as f:
                         json.dump(kwargs, f)

@@ -333,7 +392,25 @@ class FeaturesConcatDataset(BaseConcatDataset):
         include_metadata: bool | str | List[str] = False,
         include_target: bool = False,
         include_crop_inds: bool = False,
-    ):
+    ) -> pd.DataFrame:
+        """Convert the dataset to a single pandas DataFrame.
+
+        Parameters
+        ----------
+        include_metadata : bool or str or list of str, default False
+            If True, include all metadata columns. If a string or list of
+            strings, include only the specified metadata columns.
+        include_target : bool, default False
+            If True, include the 'target' column.
+        include_crop_inds : bool, default False
+            If True, include window cropping index columns.
+
+        Returns
+        -------
+        pandas.DataFrame
+            A DataFrame containing the features and requested metadata.
+
+        """
         if (
             not isinstance(include_metadata, bool)
             or include_metadata
@@ -342,7 +419,7 @@ class FeaturesConcatDataset(BaseConcatDataset):
             include_dataset = False
             if isinstance(include_metadata, bool) and include_metadata:
                 include_dataset = True
-                cols = self.datasets[0].metadata.columns
+                cols = self.datasets[0].metadata.columns.tolist()
             else:
                 cols = include_metadata
                 if isinstance(cols, bool) and not cols:
@@ -351,13 +428,14 @@ class FeaturesConcatDataset(BaseConcatDataset):
                     cols = [cols]
                 cols = set(cols)
             if include_crop_inds:
-                cols = {
-                    "i_dataset",
-                    "i_window_in_trial",
-                    "i_start_in_trial",
-                    "i_stop_in_trial",
-                    *cols,
-                }
+                cols.update(
+                    {
+                        "i_dataset",
+                        "i_window_in_trial",
+                        "i_start_in_trial",
+                        "i_stop_in_trial",
+                    }
+                )
             if include_target:
                 cols.add("target")
             cols = list(cols)
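Given the new `to_dataframe` docstring and the column handling above, usage presumably looks like this (the `concat` object is illustrative):

    feats = concat.to_dataframe()        # features only, one row per window
    df = concat.to_dataframe(
        include_target=True,             # adds the 'target' column
        include_crop_inds=True,          # adds i_dataset and window index columns
    )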
@@ -380,10 +458,26 @@ class FeaturesConcatDataset(BaseConcatDataset):
         dataframes = [ds.features for ds in self.datasets]
         return pd.concat(dataframes, axis=0, ignore_index=True)

-    def _numeric_columns(self):
+    def _numeric_columns(self) -> pd.Index:
+        """Get the names of numeric columns from the feature DataFrames."""
         return self.datasets[0].features.select_dtypes(include=np.number).columns

-    def count(self, numeric_only=False, n_jobs=1):
+    def count(self, numeric_only: bool = False, n_jobs: int = 1) -> pd.Series:
+        """Count non-NA cells for each feature column.
+
+        Parameters
+        ----------
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        n_jobs : int, default 1
+            Number of jobs to run in parallel.
+
+        Returns
+        -------
+        pandas.Series
+            The count of non-NA cells for each column.
+
+        """
         stats = Parallel(n_jobs)(
             delayed(_compute_stats)(ds, return_count=True, numeric_only=numeric_only)
             for ds in self.datasets
@@ -392,7 +486,22 @@ class FeaturesConcatDataset(BaseConcatDataset):
         count = counts.sum(axis=0)
         return pd.Series(count, index=self._numeric_columns())

-    def mean(self, numeric_only=False, n_jobs=1):
+    def mean(self, numeric_only: bool = False, n_jobs: int = 1) -> pd.Series:
+        """Compute the mean for each feature column.
+
+        Parameters
+        ----------
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        n_jobs : int, default 1
+            Number of jobs to run in parallel.
+
+        Returns
+        -------
+        pandas.Series
+            The mean of each column.
+
+        """
         stats = Parallel(n_jobs)(
             delayed(_compute_stats)(
                 ds, return_count=True, return_mean=True, numeric_only=numeric_only
@@ -404,7 +513,26 @@ class FeaturesConcatDataset(BaseConcatDataset):
         mean = np.sum((counts / count) * means, axis=0)
         return pd.Series(mean, index=self._numeric_columns())

-    def var(self, ddof=1, numeric_only=False, n_jobs=1):
+    def var(
+        self, ddof: int = 1, numeric_only: bool = False, n_jobs: int = 1
+    ) -> pd.Series:
+        """Compute the variance for each feature column.
+
+        Parameters
+        ----------
+        ddof : int, default 1
+            Delta Degrees of Freedom. The divisor used in calculations is N - ddof.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        n_jobs : int, default 1
+            Number of jobs to run in parallel.
+
+        Returns
+        -------
+        pandas.Series
+            The variance of each column.
+
+        """
         stats = Parallel(n_jobs)(
             delayed(_compute_stats)(
                 ds,
@@ -424,12 +552,50 @@ class FeaturesConcatDataset(BaseConcatDataset):
         _, _, var = _pooled_var(counts, means, variances, ddof, ddof_in=0)
         return pd.Series(var, index=self._numeric_columns())

-    def std(self, ddof=1, numeric_only=False, eps=0, n_jobs=1):
+    def std(
+        self, ddof: int = 1, numeric_only: bool = False, eps: float = 0, n_jobs: int = 1
+    ) -> pd.Series:
+        """Compute the standard deviation for each feature column.
+
+        Parameters
+        ----------
+        ddof : int, default 1
+            Delta Degrees of Freedom.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        eps : float, default 0
+            A small epsilon value to add to the variance before taking the
+            square root to avoid numerical instability.
+        n_jobs : int, default 1
+            Number of jobs to run in parallel.
+
+        Returns
+        -------
+        pandas.Series
+            The standard deviation of each column.
+
+        """
         return np.sqrt(
             self.var(ddof=ddof, numeric_only=numeric_only, n_jobs=n_jobs) + eps
         )

-    def zscore(self, ddof=1, numeric_only=False, eps=0, n_jobs=1):
+    def zscore(
+        self, ddof: int = 1, numeric_only: bool = False, eps: float = 0, n_jobs: int = 1
+    ) -> None:
+        """Apply z-score normalization to numeric columns in-place.
+
+        Parameters
+        ----------
+        ddof : int, default 1
+            Delta Degrees of Freedom for variance calculation.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        eps : float, default 0
+            Epsilon for numerical stability.
+        n_jobs : int, default 1
+            Number of jobs to run in parallel for statistics computation.
+
+        """
         stats = Parallel(n_jobs)(
             delayed(_compute_stats)(
                 ds,
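The statistics methods reuse the pooled formulas shown earlier, so they scale to many datasets without concatenation. A usage sketch, again assuming an existing `concat` object:

    mu = concat.mean(numeric_only=True, n_jobs=2)   # pandas Series, one entry per feature
    sd = concat.std(numeric_only=True, eps=1e-12)   # eps guards against zero variance
    concat.zscore(numeric_only=True, eps=1e-12)     # normalizes every dataset in place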
@@ -449,10 +615,13 @@ class FeaturesConcatDataset(BaseConcatDataset):
         _, mean, var = _pooled_var(counts, means, variances, ddof, ddof_in=0)
         std = np.sqrt(var + eps)
         for ds in self.datasets:
-            ds.features = (ds.features - mean) / std
+            ds.features.loc[:, self._numeric_columns()] = (
+                ds.features.loc[:, self._numeric_columns()] - mean
+            ) / std

     @staticmethod
-    def _enforce_inplace_operations(func_name, kwargs):
+    def _enforce_inplace_operations(func_name: str, kwargs: dict):
+        """Raise an error if 'inplace=False' is passed to a method."""
         if "inplace" in kwargs and kwargs["inplace"] is False:
             raise ValueError(
                 f"{func_name} only works inplace, please change "
@@ -460,33 +629,49 @@ class FeaturesConcatDataset(BaseConcatDataset):
             )
         kwargs["inplace"] = True

-    def fillna(self, *args, **kwargs):
+    def fillna(self, *args, **kwargs) -> None:
+        """Fill NA/NaN values in-place. See :meth:`pandas.DataFrame.fillna`."""
         FeaturesConcatDataset._enforce_inplace_operations("fillna", kwargs)
         for ds in self.datasets:
             ds.features.fillna(*args, **kwargs)

-    def replace(self, *args, **kwargs):
+    def replace(self, *args, **kwargs) -> None:
+        """Replace values in-place. See :meth:`pandas.DataFrame.replace`."""
         FeaturesConcatDataset._enforce_inplace_operations("replace", kwargs)
         for ds in self.datasets:
             ds.features.replace(*args, **kwargs)

-    def interpolate(self, *args, **kwargs):
+    def interpolate(self, *args, **kwargs) -> None:
+        """Interpolate values in-place. See :meth:`pandas.DataFrame.interpolate`."""
         FeaturesConcatDataset._enforce_inplace_operations("interpolate", kwargs)
         for ds in self.datasets:
             ds.features.interpolate(*args, **kwargs)

-    def dropna(self, *args, **kwargs):
+    def dropna(self, *args, **kwargs) -> None:
+        """Remove missing values in-place. See :meth:`pandas.DataFrame.dropna`."""
         FeaturesConcatDataset._enforce_inplace_operations("dropna", kwargs)
         for ds in self.datasets:
             ds.features.dropna(*args, **kwargs)

-    def drop(self, *args, **kwargs):
+    def drop(self, *args, **kwargs) -> None:
+        """Drop specified labels from rows or columns in-place. See :meth:`pandas.DataFrame.drop`."""
         FeaturesConcatDataset._enforce_inplace_operations("drop", kwargs)
         for ds in self.datasets:
             ds.features.drop(*args, **kwargs)

-    def join(self, concat_dataset: FeaturesConcatDataset, **kwargs):
+    def join(self, concat_dataset: FeaturesConcatDataset, **kwargs) -> None:
+        """Join columns with other FeaturesConcatDataset in-place.
+
+        Parameters
+        ----------
+        concat_dataset : FeaturesConcatDataset
+            The dataset to join with. Must have the same number of datasets,
+            and each corresponding dataset must have the same length.
+        **kwargs
+            Keyword arguments to pass to :meth:`pandas.DataFrame.join`.
+
+        """
         assert len(self.datasets) == len(concat_dataset.datasets)
         for ds1, ds2 in zip(self.datasets, concat_dataset.datasets):
             assert len(ds1) == len(ds2)
-            ds1.features.join(ds2, **kwargs)
+            ds1.features = ds1.features.join(ds2.features, **kwargs)
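Finally, the DataFrame-style helpers operate in place (passing inplace=False raises ValueError), and the corrected `join` now assigns the joined frame back to each dataset. A usage sketch, where `concat` and `extra_feats` are assumed FeaturesConcatDataset objects with matching datasets and lengths:

    concat.fillna(0.0)                           # fill NaNs in every contained feature table
    concat.drop(columns=["beta_power"])          # column name is illustrative
    concat.join(extra_feats, rsuffix="_extra")   # column-wise join, dataset by dataset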