eegdash 0.3.3.dev61__py3-none-any.whl → 0.5.0.dev180784713__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. eegdash/__init__.py +19 -6
  2. eegdash/api.py +336 -539
  3. eegdash/bids_eeg_metadata.py +495 -0
  4. eegdash/const.py +349 -0
  5. eegdash/dataset/__init__.py +28 -0
  6. eegdash/dataset/base.py +311 -0
  7. eegdash/dataset/bids_dataset.py +641 -0
  8. eegdash/dataset/dataset.py +692 -0
  9. eegdash/dataset/dataset_summary.csv +255 -0
  10. eegdash/dataset/registry.py +287 -0
  11. eegdash/downloader.py +197 -0
  12. eegdash/features/__init__.py +15 -13
  13. eegdash/features/datasets.py +329 -138
  14. eegdash/features/decorators.py +105 -13
  15. eegdash/features/extractors.py +233 -63
  16. eegdash/features/feature_bank/__init__.py +12 -12
  17. eegdash/features/feature_bank/complexity.py +22 -20
  18. eegdash/features/feature_bank/connectivity.py +27 -28
  19. eegdash/features/feature_bank/csp.py +3 -1
  20. eegdash/features/feature_bank/dimensionality.py +6 -6
  21. eegdash/features/feature_bank/signal.py +29 -30
  22. eegdash/features/feature_bank/spectral.py +40 -44
  23. eegdash/features/feature_bank/utils.py +8 -0
  24. eegdash/features/inspect.py +126 -15
  25. eegdash/features/serialization.py +58 -17
  26. eegdash/features/utils.py +90 -16
  27. eegdash/hbn/__init__.py +28 -0
  28. eegdash/hbn/preprocessing.py +105 -0
  29. eegdash/hbn/windows.py +428 -0
  30. eegdash/logging.py +54 -0
  31. eegdash/mongodb.py +55 -24
  32. eegdash/paths.py +52 -0
  33. eegdash/utils.py +29 -1
  34. eegdash-0.5.0.dev180784713.dist-info/METADATA +121 -0
  35. eegdash-0.5.0.dev180784713.dist-info/RECORD +38 -0
  36. eegdash-0.5.0.dev180784713.dist-info/licenses/LICENSE +29 -0
  37. eegdash/data_config.py +0 -34
  38. eegdash/data_utils.py +0 -687
  39. eegdash/dataset.py +0 -69
  40. eegdash/preprocessing.py +0 -63
  41. eegdash-0.3.3.dev61.dist-info/METADATA +0 -192
  42. eegdash-0.3.3.dev61.dist-info/RECORD +0 -28
  43. eegdash-0.3.3.dev61.dist-info/licenses/LICENSE +0 -23
  44. {eegdash-0.3.3.dev61.dist-info → eegdash-0.5.0.dev180784713.dist-info}/WHEEL +0 -0
  45. {eegdash-0.3.3.dev61.dist-info → eegdash-0.5.0.dev180784713.dist-info}/top_level.txt +0 -0
@@ -3,7 +3,6 @@ from __future__ import annotations
  import json
  import os
  import shutil
- import warnings
  from collections.abc import Callable
  from typing import Dict, List
 
@@ -17,22 +16,43 @@ from braindecode.datasets.base import (
  _create_description,
  )

+ from ..logging import logger
+
+ __all__ = [
+ "FeaturesDataset",
+ "FeaturesConcatDataset",
+ ]
+

  class FeaturesDataset(EEGWindowsDataset):
- """Returns samples from a pandas DataFrame object along with a target.
+ """A dataset of features extracted from EEG windows.

- Dataset which serves samples from a pandas DataFrame object along with a
- target. The target is unique for the dataset, and is obtained through the
- `description` attribute.
+ This class holds features in a pandas DataFrame and provides an interface
+ compatible with braindecode's dataset structure. Each row in the feature
+ DataFrame corresponds to a single sample (e.g., an EEG window).

  Parameters
  ----------
- features : a pandas DataFrame
- Tabular data.
- description : dict | pandas.Series | None
- Holds additional description about the continuous signal / subject.
- transform : callable | None
- On-the-fly transform applied to the example before it is returned.
+ features : pandas.DataFrame
+ A DataFrame where each row is a sample and each column is a feature.
+ metadata : pandas.DataFrame, optional
+ A DataFrame containing metadata for each sample, indexed consistently
+ with `features`. Must include columns 'i_window_in_trial',
+ 'i_start_in_trial', 'i_stop_in_trial', and 'target'.
+ description : dict or pandas.Series, optional
+ Additional high-level information about the dataset (e.g., subject ID).
+ transform : callable, optional
+ A function or transform to apply to the feature data on-the-fly.
+ raw_info : dict, optional
+ Information about the original raw recording, for provenance.
+ raw_preproc_kwargs : dict, optional
+ Keyword arguments used for preprocessing the raw data.
+ window_kwargs : dict, optional
+ Keyword arguments used for windowing the data.
+ window_preproc_kwargs : dict, optional
+ Keyword arguments used for preprocessing the windowed data.
+ features_kwargs : dict, optional
+ Keyword arguments used for feature extraction.

  """
 
@@ -64,7 +84,21 @@ class FeaturesDataset(EEGWindowsDataset):
  ].to_numpy()
  self.y = metadata.loc[:, "target"].to_list()

- def __getitem__(self, index):
+ def __getitem__(self, index: int) -> tuple[np.ndarray, int, list]:
+ """Get a single sample from the dataset.
+
+ Parameters
+ ----------
+ index : int
+ The index of the sample to retrieve.
+
+ Returns
+ -------
+ tuple
+ A tuple containing the feature vector (X), the target (y), and the
+ cropping indices.
+
+ """
  crop_inds = self.crop_inds[index].tolist()
  X = self.features.iloc[index].to_numpy()
  X = X.copy()
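
Taken together, the new constructor parameters and the typed __getitem__ suggest usage along these lines. This is a minimal sketch: the feature names, metadata values, and the "subject" description entry are invented for illustration, and the import path follows the module shown in this diff.

import pandas as pd
from eegdash.features.datasets import FeaturesDataset

# Hypothetical feature table: two windows, three features.
features = pd.DataFrame(
    {"alpha_power": [1.2, 0.9], "beta_power": [0.4, 0.5], "entropy": [2.1, 1.8]}
)
# Metadata must carry the bookkeeping columns named in the docstring.
metadata = pd.DataFrame(
    {
        "i_window_in_trial": [0, 1],
        "i_start_in_trial": [0, 256],
        "i_stop_in_trial": [256, 512],
        "target": [0, 1],
    }
)
ds_a = FeaturesDataset(features, metadata=metadata, description={"subject": "01"})
X, y, crop_inds = ds_a[0]  # feature vector, target, window crop indices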
@@ -74,18 +108,27 @@ class FeaturesDataset(EEGWindowsDataset):
  y = self.y[index]
  return X, y, crop_inds

- def __len__(self):
+ def __len__(self) -> int:
+ """Return the number of samples in the dataset.
+
+ Returns
+ -------
+ int
+ The total number of feature samples.
+
+ """
  return len(self.features.index)


  def _compute_stats(
  ds: FeaturesDataset,
- return_count=False,
- return_mean=False,
- return_var=False,
- ddof=1,
- numeric_only=False,
- ):
+ return_count: bool = False,
+ return_mean: bool = False,
+ return_var: bool = False,
+ ddof: int = 1,
+ numeric_only: bool = False,
+ ) -> tuple:
+ """Compute statistics for a single :class:`~eegdash.features.datasets.FeaturesDataset`."""
  res = []
  if return_count:
  res.append(ds.features.count(numeric_only=numeric_only))
@@ -96,7 +139,14 @@ def _compute_stats(
  return tuple(res)


- def _pooled_var(counts, means, variances, ddof, ddof_in=None):
+ def _pooled_var(
+ counts: np.ndarray,
+ means: np.ndarray,
+ variances: np.ndarray,
+ ddof: int,
+ ddof_in: int | None = None,
+ ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+ """Compute pooled variance across multiple datasets."""
  if ddof_in is None:
  ddof_in = ddof
  count = counts.sum(axis=0)
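
The refactor of _pooled_var adds annotations without changing the underlying identity: the variance of a concatenation can be rebuilt from per-dataset counts, means, and ddof=0 variances. A standalone NumPy check of that identity, independent of eegdash:

import numpy as np

rng = np.random.default_rng(0)
parts = [rng.normal(size=(50, 3)), rng.normal(loc=1.0, size=(80, 3))]

counts = np.array([p.shape[0] for p in parts])[:, None]
means = np.array([p.mean(axis=0) for p in parts])
variances = np.array([p.var(axis=0, ddof=0) for p in parts])

count = counts.sum(axis=0)
mean = (counts / count * means).sum(axis=0)
# within-group sums of squares plus between-group spread, ddof applied once at the end
var = (counts * variances + counts * (means - mean) ** 2).sum(axis=0) / (count - 1)

assert np.allclose(var, np.concatenate(parts).var(axis=0, ddof=1))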
@@ -109,17 +159,20 @@ def _pooled_var(counts, means, variances, ddof, ddof_in=None):


  class FeaturesConcatDataset(BaseConcatDataset):
- """A base class for concatenated datasets.
+ """A concatenated dataset of :class:`~eegdash.features.datasets.FeaturesDataset` objects.

- Holds either mne.Raw or mne.Epoch in self.datasets and has
- a pandas DataFrame with additional description.
+ This class holds a list of :class:`~eegdash.features.datasets.FeaturesDataset` instances and allows
+ them to be treated as a single, larger dataset. It provides methods for
+
+ splitting, saving, and performing DataFrame-like operations (e.g., `mean`,
+ `var`, `fillna`) across all contained datasets.

  Parameters
  ----------
- list_of_ds : list
- list of BaseDataset, BaseConcatDataset or WindowsDataset
- target_transform : callable | None
- Optional function to call on targets before returning them.
+ list_of_ds : list of ~eegdash.features.datasets.FeaturesDataset
+ A list of :class:`~eegdash.features.datasets.FeaturesDataset` objects to concatenate.
+ target_transform : callable, optional
+ A function to apply to the target values before they are returned.

  """
 
@@ -139,26 +192,28 @@ class FeaturesConcatDataset(BaseConcatDataset):
  self,
  by: str | list[int] | list[list[int]] | dict[str, list[int]],
  ) -> dict[str, FeaturesConcatDataset]:
- """Split the dataset based on information listed in its description.
+ """Split the dataset into subsets.

- The format could be based on a DataFrame or based on indices.
+ The splitting can be done based on a column in the description
+ DataFrame or by providing explicit indices for each split.

  Parameters
  ----------
- by : str | list | dict
- If ``by`` is a string, splitting is performed based on the
- description DataFrame column with this name.
- If ``by`` is a (list of) list of integers, the position in the first
- list corresponds to the split id and the integers to the
- datapoints of that split.
- If a dict then each key will be used in the returned
- splits dict and each value should be a list of int.
+ by : str or list or dict
+ - If a string, splits are created for each unique value in the
+ description column `by`.
+ - If a list of integers, a single split is created containing the
+ datasets at the specified indices.
+ - If a list of lists of integers, multiple splits are created, one
+ for each sublist of indices.
+ - If a dictionary, keys are used as split names and values are
+ lists of dataset indices.

  Returns
  -------
- splits : dict
- A dictionary with the name of the split (a string) as key and the
- dataset as value.
+ dict[str, ~eegdash.features.datasets.FeaturesConcatDataset]
+ A dictionary where keys are split names and values are the new
+ :class:`~eegdash.features.datasets.FeaturesConcatDataset` subsets.

  """
  if isinstance(by, str):
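
The documented forms of `by` could be exercised roughly as follows. This sketch reuses the FeaturesDataset from the earlier example; a second dataset `ds_b` built the same way, with "subject": "02" in its description, is assumed.

from eegdash.features.datasets import FeaturesConcatDataset

concat_ds = FeaturesConcatDataset([ds_a, ds_b])

by_subject = concat_ds.split("subject")                # one split per unique subject value
named = concat_ds.split({"train": [0], "valid": [1]})  # explicit, named splits
print(list(by_subject), list(named))                   # split names -> FeaturesConcatDataset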
@@ -183,14 +238,22 @@ class FeaturesConcatDataset(BaseConcatDataset):
  }

  def get_metadata(self) -> pd.DataFrame:
- """Concatenate the metadata and description of the wrapped Epochs.
+ """Get the metadata of all datasets as a single DataFrame.
+
+ Concatenates the metadata from all contained datasets and adds columns
+ from their `description` attributes.

  Returns
  -------
- metadata : pd.DataFrame
- DataFrame containing as many rows as there are windows in the
- BaseConcatDataset, with the metadata and description information
- for each window.
+ pandas.DataFrame
+ A DataFrame containing the metadata for every sample in the
+ concatenated dataset.
+
+ Raises
+ ------
+ TypeError
+ If any of the contained datasets is not a
+ :class:`~eegdash.features.datasets.FeaturesDataset`.

  """
  if not all([isinstance(ds, FeaturesDataset) for ds in self.datasets]):
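
In use, get_metadata() yields one row per window, with each dataset's description entries (for example the "subject" value) repeated onto its rows. A hedged sketch continuing from the objects above:

meta = concat_ds.get_metadata()
print(meta[["i_window_in_trial", "target", "subject"]].head())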
@@ -201,60 +264,59 @@ class FeaturesConcatDataset(BaseConcatDataset):

  all_dfs = list()
  for ds in self.datasets:
- df = ds.metadata
+ df = ds.metadata.copy()
  for k, v in ds.description.items():
  df[k] = v
  all_dfs.append(df)

  return pd.concat(all_dfs)

- def save(self, path: str, overwrite: bool = False, offset: int = 0):
- """Save datasets to files by creating one subdirectory for each dataset:
- path/
- 0/
- 0-feat.parquet
- metadata_df.pkl
- description.json
- raw-info.fif (if raw info was saved)
- raw_preproc_kwargs.json (if raws were preprocessed)
- window_kwargs.json (if this is a windowed dataset)
- window_preproc_kwargs.json (if windows were preprocessed)
- features_kwargs.json
- 1/
- 1-feat.parquet
- metadata_df.pkl
- description.json
- raw-info.fif (if raw info was saved)
- raw_preproc_kwargs.json (if raws were preprocessed)
- window_kwargs.json (if this is a windowed dataset)
- window_preproc_kwargs.json (if windows were preprocessed)
- features_kwargs.json
+ def save(self, path: str, overwrite: bool = False, offset: int = 0) -> None:
+ """Save the concatenated dataset to a directory.
+
+ Creates a directory structure where each contained dataset is saved in
+ its own numbered subdirectory.
+
+ .. code-block::
+
+ path/
+ 0/
+ 0-feat.parquet
+ metadata_df.pkl
+ description.json
+ ...
+ 1/
+ 1-feat.parquet
+ ...

  Parameters
  ----------
  path : str
- Directory in which subdirectories are created to store
- -feat.parquet and .json files to.
- overwrite : bool
- Whether to delete old subdirectories that will be saved to in this
- call.
- offset : int
- If provided, the integer is added to the id of the dataset in the
- concat. This is useful in the setting of very large datasets, where
- one dataset has to be processed and saved at a time to account for
- its original position.
+ The directory where the dataset will be saved.
+ overwrite : bool, default False
+ If True, any existing subdirectories that conflict with the new
+ ones will be removed.
+ offset : int, default 0
+ An integer to add to the subdirectory names. Useful for saving
+ datasets in chunks.
+
+ Raises
+ ------
+ ValueError
+ If the dataset is empty.
+ FileExistsError
+ If a subdirectory already exists and `overwrite` is False.

  """
  if len(self.datasets) == 0:
  raise ValueError("Expect at least one dataset")
  path_contents = os.listdir(path)
- n_sub_dirs = len([os.path.isdir(e) for e in path_contents])
+ n_sub_dirs = len([os.path.isdir(os.path.join(path, e)) for e in path_contents])
  for i_ds, ds in enumerate(self.datasets):
- # remove subdirectory from list of untouched files / subdirectories
- if str(i_ds + offset) in path_contents:
- path_contents.remove(str(i_ds + offset))
- # save_dir/i_ds/
- sub_dir = os.path.join(path, str(i_ds + offset))
+ sub_dir_name = str(i_ds + offset)
+ if sub_dir_name in path_contents:
+ path_contents.remove(sub_dir_name)
+ sub_dir = os.path.join(path, sub_dir_name)
  if os.path.exists(sub_dir):
  if overwrite:
  shutil.rmtree(sub_dir)
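
A sketch of the documented on-disk layout, assuming the hypothetical `concat_ds` from the earlier examples; note that save() lists `path` before writing, so the target directory must already exist.

import os

out_dir = "features_out"  # hypothetical output directory
os.makedirs(out_dir, exist_ok=True)
concat_ds.save(out_dir, overwrite=True)
# Expected layout: features_out/0/0-feat.parquet, metadata_df.pkl,
# description.json, plus the optional *-kwargs.json and raw-info.fif files.
print(sorted(os.listdir(out_dir)))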
@@ -264,56 +326,59 @@ class FeaturesConcatDataset(BaseConcatDataset):
  f" a different directory, set overwrite=True, or "
  f"resolve manually."
  )
- # save_dir/{i_ds+offset}/
  os.makedirs(sub_dir)
- # save_dir/{i_ds+offset}/{i_ds+offset}-feat.parquet
  self._save_features(sub_dir, ds, i_ds, offset)
- # save_dir/{i_ds+offset}/metadata_df.pkl
  self._save_metadata(sub_dir, ds)
- # save_dir/{i_ds+offset}/description.json
  self._save_description(sub_dir, ds.description)
- # save_dir/{i_ds+offset}/raw-info.fif
  self._save_raw_info(sub_dir, ds)
- # save_dir/{i_ds+offset}/raw_preproc_kwargs.json
- # save_dir/{i_ds+offset}/window_kwargs.json
- # save_dir/{i_ds+offset}/window_preproc_kwargs.json
- # save_dir/{i_ds+offset}/features_kwargs.json
  self._save_kwargs(sub_dir, ds)
- if overwrite:
- # the following will be True for all datasets preprocessed and
- # stored in parallel with braindecode.preprocessing.preprocess
- if i_ds + 1 + offset < n_sub_dirs:
- warnings.warn(
- f"The number of saved datasets ({i_ds + 1 + offset}) "
- f"does not match the number of existing "
- f"subdirectories ({n_sub_dirs}). You may now "
- f"encounter a mix of differently preprocessed "
- f"datasets!",
- UserWarning,
- )
- # if path contains files or directories that were not touched, raise
- # warning
+ if overwrite and i_ds + 1 + offset < n_sub_dirs:
+ logger.warning(
+ f"The number of saved datasets ({i_ds + 1 + offset}) "
+ f"does not match the number of existing "
+ f"subdirectories ({n_sub_dirs}). You may now "
+ f"encounter a mix of differently preprocessed "
+ f"datasets!",
+ UserWarning,
+ )
  if path_contents:
- warnings.warn(
+ logger.warning(
  f"Chosen directory {path} contains other "
  f"subdirectories or files {path_contents}."
  )

  @staticmethod
- def _save_features(sub_dir, ds, i_ds, offset):
+ def _save_features(sub_dir: str, ds: FeaturesDataset, i_ds: int, offset: int):
+ """Save the feature DataFrame to a Parquet file."""
  parquet_file_name = f"{i_ds + offset}-feat.parquet"
  parquet_file_path = os.path.join(sub_dir, parquet_file_name)
  ds.features.to_parquet(parquet_file_path)

  @staticmethod
- def _save_raw_info(sub_dir, ds):
- if hasattr(ds, "raw_info"):
+ def _save_metadata(sub_dir: str, ds: FeaturesDataset):
+ """Save the metadata DataFrame to a pickle file."""
+ metadata_file_name = "metadata_df.pkl"
+ metadata_file_path = os.path.join(sub_dir, metadata_file_name)
+ ds.metadata.to_pickle(metadata_file_path)
+
+ @staticmethod
+ def _save_description(sub_dir: str, description: pd.Series):
+ """Save the description Series to a JSON file."""
+ desc_file_name = "description.json"
+ desc_file_path = os.path.join(sub_dir, desc_file_name)
+ description.to_json(desc_file_path)
+
+ @staticmethod
+ def _save_raw_info(sub_dir: str, ds: FeaturesDataset):
+ """Save the raw info dictionary to a FIF file if it exists."""
+ if hasattr(ds, "raw_info") and ds.raw_info is not None:
  fif_file_name = "raw-info.fif"
  fif_file_path = os.path.join(sub_dir, fif_file_name)
- ds.raw_info.save(fif_file_path)
+ ds.raw_info.save(fif_file_path, overwrite=True)

  @staticmethod
- def _save_kwargs(sub_dir, ds):
+ def _save_kwargs(sub_dir: str, ds: FeaturesDataset):
+ """Save various keyword argument dictionaries to JSON files."""
  for kwargs_name in [
  "raw_preproc_kwargs",
  "window_kwargs",
@@ -321,10 +386,10 @@ class FeaturesConcatDataset(BaseConcatDataset):
  "features_kwargs",
  ]:
  if hasattr(ds, kwargs_name):
- kwargs_file_name = ".".join([kwargs_name, "json"])
- kwargs_file_path = os.path.join(sub_dir, kwargs_file_name)
  kwargs = getattr(ds, kwargs_name)
  if kwargs is not None:
+ kwargs_file_name = ".".join([kwargs_name, "json"])
+ kwargs_file_path = os.path.join(sub_dir, kwargs_file_name)
  with open(kwargs_file_path, "w") as f:
  json.dump(kwargs, f)
 
@@ -333,7 +398,25 @@ class FeaturesConcatDataset(BaseConcatDataset):
  include_metadata: bool | str | List[str] = False,
  include_target: bool = False,
  include_crop_inds: bool = False,
- ):
+ ) -> pd.DataFrame:
+ """Convert the dataset to a single pandas DataFrame.
+
+ Parameters
+ ----------
+ include_metadata : bool or str or list of str, default False
+ If True, include all metadata columns. If a string or list of
+ strings, include only the specified metadata columns.
+ include_target : bool, default False
+ If True, include the 'target' column.
+ include_crop_inds : bool, default False
+ If True, include window cropping index columns.
+
+ Returns
+ -------
+ pandas.DataFrame
+ A DataFrame containing the features and requested metadata.
+
+ """
  if (
  not isinstance(include_metadata, bool)
  or include_metadata
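
The options documented above could be combined as in this sketch, again using the hypothetical `concat_ds` from the earlier examples:

df_all = concat_ds.to_dataframe(include_metadata=True)
df_win = concat_ds.to_dataframe(include_target=True, include_crop_inds=True)
# Feature columns plus 'target', 'i_dataset' and the i_*_in_trial index columns.
print(df_win.columns.tolist())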
@@ -342,7 +425,7 @@ class FeaturesConcatDataset(BaseConcatDataset):
  include_dataset = False
  if isinstance(include_metadata, bool) and include_metadata:
  include_dataset = True
- cols = self.datasets[0].metadata.columns
+ cols = self.datasets[0].metadata.columns.tolist()
  else:
  cols = include_metadata
  if isinstance(cols, bool) and not cols:
@@ -351,13 +434,14 @@ class FeaturesConcatDataset(BaseConcatDataset):
  cols = [cols]
  cols = set(cols)
  if include_crop_inds:
- cols = {
- "i_dataset",
- "i_window_in_trial",
- "i_start_in_trial",
- "i_stop_in_trial",
- *cols,
- }
+ cols.update(
+ {
+ "i_dataset",
+ "i_window_in_trial",
+ "i_start_in_trial",
+ "i_stop_in_trial",
+ }
+ )
  if include_target:
  cols.add("target")
  cols = list(cols)
@@ -380,10 +464,26 @@ class FeaturesConcatDataset(BaseConcatDataset):
  dataframes = [ds.features for ds in self.datasets]
  return pd.concat(dataframes, axis=0, ignore_index=True)

- def _numeric_columns(self):
+ def _numeric_columns(self) -> pd.Index:
+ """Get the names of numeric columns from the feature DataFrames."""
  return self.datasets[0].features.select_dtypes(include=np.number).columns

- def count(self, numeric_only=False, n_jobs=1):
+ def count(self, numeric_only: bool = False, n_jobs: int = 1) -> pd.Series:
+ """Count non-NA cells for each feature column.
+
+ Parameters
+ ----------
+ numeric_only : bool, default False
+ Include only float, int, boolean columns.
+ n_jobs : int, default 1
+ Number of jobs to run in parallel.
+
+ Returns
+ -------
+ pandas.Series
+ The count of non-NA cells for each column.
+
+ """
  stats = Parallel(n_jobs)(
  delayed(_compute_stats)(ds, return_count=True, numeric_only=numeric_only)
  for ds in self.datasets
@@ -392,7 +492,22 @@ class FeaturesConcatDataset(BaseConcatDataset):
  count = counts.sum(axis=0)
  return pd.Series(count, index=self._numeric_columns())

- def mean(self, numeric_only=False, n_jobs=1):
+ def mean(self, numeric_only: bool = False, n_jobs: int = 1) -> pd.Series:
+ """Compute the mean for each feature column.
+
+ Parameters
+ ----------
+ numeric_only : bool, default False
+ Include only float, int, boolean columns.
+ n_jobs : int, default 1
+ Number of jobs to run in parallel.
+
+ Returns
+ -------
+ pandas.Series
+ The mean of each column.
+
+ """
  stats = Parallel(n_jobs)(
  delayed(_compute_stats)(
  ds, return_count=True, return_mean=True, numeric_only=numeric_only
@@ -404,7 +519,26 @@ class FeaturesConcatDataset(BaseConcatDataset):
  mean = np.sum((counts / count) * means, axis=0)
  return pd.Series(mean, index=self._numeric_columns())

- def var(self, ddof=1, numeric_only=False, n_jobs=1):
+ def var(
+ self, ddof: int = 1, numeric_only: bool = False, n_jobs: int = 1
+ ) -> pd.Series:
+ """Compute the variance for each feature column.
+
+ Parameters
+ ----------
+ ddof : int, default 1
+ Delta Degrees of Freedom. The divisor used in calculations is N - ddof.
+ numeric_only : bool, default False
+ Include only float, int, boolean columns.
+ n_jobs : int, default 1
+ Number of jobs to run in parallel.
+
+ Returns
+ -------
+ pandas.Series
+ The variance of each column.
+
+ """
  stats = Parallel(n_jobs)(
  delayed(_compute_stats)(
  ds,
@@ -424,12 +558,50 @@ class FeaturesConcatDataset(BaseConcatDataset):
  _, _, var = _pooled_var(counts, means, variances, ddof, ddof_in=0)
  return pd.Series(var, index=self._numeric_columns())

- def std(self, ddof=1, numeric_only=False, eps=0, n_jobs=1):
+ def std(
+ self, ddof: int = 1, numeric_only: bool = False, eps: float = 0, n_jobs: int = 1
+ ) -> pd.Series:
+ """Compute the standard deviation for each feature column.
+
+ Parameters
+ ----------
+ ddof : int, default 1
+ Delta Degrees of Freedom.
+ numeric_only : bool, default False
+ Include only float, int, boolean columns.
+ eps : float, default 0
+ A small epsilon value to add to the variance before taking the
+ square root to avoid numerical instability.
+ n_jobs : int, default 1
+ Number of jobs to run in parallel.
+
+ Returns
+ -------
+ pandas.Series
+ The standard deviation of each column.
+
+ """
  return np.sqrt(
  self.var(ddof=ddof, numeric_only=numeric_only, n_jobs=n_jobs) + eps
  )

- def zscore(self, ddof=1, numeric_only=False, eps=0, n_jobs=1):
+ def zscore(
+ self, ddof: int = 1, numeric_only: bool = False, eps: float = 0, n_jobs: int = 1
+ ) -> None:
+ """Apply z-score normalization to numeric columns in-place.
+
+ Parameters
+ ----------
+ ddof : int, default 1
+ Delta Degrees of Freedom for variance calculation.
+ numeric_only : bool, default False
+ Include only float, int, boolean columns.
+ eps : float, default 0
+ Epsilon for numerical stability.
+ n_jobs : int, default 1
+ Number of jobs to run in parallel for statistics computation.
+
+ """
  stats = Parallel(n_jobs)(
  delayed(_compute_stats)(
  ds,
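
The statistics methods and zscore are meant to operate across all contained datasets at once; a sketch with the hypothetical `concat_ds`:

mu = concat_ds.mean(numeric_only=True)
sigma = concat_ds.std(numeric_only=True, eps=1e-12)
concat_ds.zscore(numeric_only=True, eps=1e-12)     # rescales ds.features in place
print(concat_ds.mean(numeric_only=True).round(6))  # ~0 for every feature column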
@@ -449,10 +621,13 @@ class FeaturesConcatDataset(BaseConcatDataset):
  _, mean, var = _pooled_var(counts, means, variances, ddof, ddof_in=0)
  std = np.sqrt(var + eps)
  for ds in self.datasets:
- ds.features = (ds.features - mean) / std
+ ds.features.loc[:, self._numeric_columns()] = (
+ ds.features.loc[:, self._numeric_columns()] - mean
+ ) / std

  @staticmethod
- def _enforce_inplace_operations(func_name, kwargs):
+ def _enforce_inplace_operations(func_name: str, kwargs: dict):
+ """Raise an error if 'inplace=False' is passed to a method."""
  if "inplace" in kwargs and kwargs["inplace"] is False:
  raise ValueError(
  f"{func_name} only works inplace, please change "
@@ -460,33 +635,49 @@ class FeaturesConcatDataset(BaseConcatDataset):
  )
  kwargs["inplace"] = True

- def fillna(self, *args, **kwargs):
+ def fillna(self, *args, **kwargs) -> None:
+ """Fill NA/NaN values in-place. See :meth:`pandas.DataFrame.fillna`."""
  FeaturesConcatDataset._enforce_inplace_operations("fillna", kwargs)
  for ds in self.datasets:
  ds.features.fillna(*args, **kwargs)

- def replace(self, *args, **kwargs):
+ def replace(self, *args, **kwargs) -> None:
+ """Replace values in-place. See :meth:`pandas.DataFrame.replace`."""
  FeaturesConcatDataset._enforce_inplace_operations("replace", kwargs)
  for ds in self.datasets:
  ds.features.replace(*args, **kwargs)

- def interpolate(self, *args, **kwargs):
+ def interpolate(self, *args, **kwargs) -> None:
+ """Interpolate values in-place. See :meth:`pandas.DataFrame.interpolate`."""
  FeaturesConcatDataset._enforce_inplace_operations("interpolate", kwargs)
  for ds in self.datasets:
  ds.features.interpolate(*args, **kwargs)

- def dropna(self, *args, **kwargs):
+ def dropna(self, *args, **kwargs) -> None:
+ """Remove missing values in-place. See :meth:`pandas.DataFrame.dropna`."""
  FeaturesConcatDataset._enforce_inplace_operations("dropna", kwargs)
  for ds in self.datasets:
  ds.features.dropna(*args, **kwargs)

- def drop(self, *args, **kwargs):
+ def drop(self, *args, **kwargs) -> None:
+ """Drop specified labels from rows or columns in-place. See :meth:`pandas.DataFrame.drop`."""
  FeaturesConcatDataset._enforce_inplace_operations("drop", kwargs)
  for ds in self.datasets:
  ds.features.drop(*args, **kwargs)

- def join(self, concat_dataset: FeaturesConcatDataset, **kwargs):
+ def join(self, concat_dataset: FeaturesConcatDataset, **kwargs) -> None:
+ """Join columns with other FeaturesConcatDataset in-place.
+
+ Parameters
+ ----------
+ concat_dataset : FeaturesConcatDataset
+ The dataset to join with. Must have the same number of datasets,
+ and each corresponding dataset must have the same length.
+ **kwargs
+ Keyword arguments to pass to :meth:`pandas.DataFrame.join`.
+
+ """
  assert len(self.datasets) == len(concat_dataset.datasets)
  for ds1, ds2 in zip(self.datasets, concat_dataset.datasets):
  assert len(ds1) == len(ds2)
- ds1.features.join(ds2, **kwargs)
+ ds1.features = ds1.features.join(ds2.features, **kwargs)
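
Finally, the in-place DataFrame wrappers forward to pandas on each per-dataset feature table and reject an explicit inplace=False; a sketch using the hypothetical `concat_ds`:

concat_ds.fillna(0.0)                     # applied to every ds.features in place
concat_ds.dropna(axis="columns")          # likewise in place, per dataset
try:
    concat_ds.fillna(0.0, inplace=False)  # rejected by _enforce_inplace_operations
except ValueError as err:
    print(err)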