eegdash 0.4.0.dev153__py3-none-any.whl → 0.4.0.dev171__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -20,20 +20,34 @@ from ..logging import logger
 
 
 class FeaturesDataset(EEGWindowsDataset):
-    """Returns samples from a pandas DataFrame object along with a target.
+    """A dataset of features extracted from EEG windows.
 
-    Dataset which serves samples from a pandas DataFrame object along with a
-    target. The target is unique for the dataset, and is obtained through the
-    `description` attribute.
+    This class holds features in a pandas DataFrame and provides an interface
+    compatible with braindecode's dataset structure. Each row in the feature
+    DataFrame corresponds to a single sample (e.g., an EEG window).
 
     Parameters
     ----------
-    features : a pandas DataFrame
-        Tabular data.
-    description : dict | pandas.Series | None
-        Holds additional description about the continuous signal / subject.
-    transform : callable | None
-        On-the-fly transform applied to the example before it is returned.
+    features : pandas.DataFrame
+        A DataFrame where each row is a sample and each column is a feature.
+    metadata : pandas.DataFrame, optional
+        A DataFrame containing metadata for each sample, indexed consistently
+        with `features`. Must include columns 'i_window_in_trial',
+        'i_start_in_trial', 'i_stop_in_trial', and 'target'.
+    description : dict or pandas.Series, optional
+        Additional high-level information about the dataset (e.g., subject ID).
+    transform : callable, optional
+        A function or transform to apply to the feature data on-the-fly.
+    raw_info : dict, optional
+        Information about the original raw recording, for provenance.
+    raw_preproc_kwargs : dict, optional
+        Keyword arguments used for preprocessing the raw data.
+    window_kwargs : dict, optional
+        Keyword arguments used for windowing the data.
+    window_preproc_kwargs : dict, optional
+        Keyword arguments used for preprocessing the windowed data.
+    features_kwargs : dict, optional
+        Keyword arguments used for feature extraction.
 
     """
 
@@ -65,7 +79,21 @@ class FeaturesDataset(EEGWindowsDataset):
         ].to_numpy()
         self.y = metadata.loc[:, "target"].to_list()
 
-    def __getitem__(self, index):
+    def __getitem__(self, index: int) -> tuple[np.ndarray, int, list]:
+        """Get a single sample from the dataset.
+
+        Parameters
+        ----------
+        index : int
+            The index of the sample to retrieve.
+
+        Returns
+        -------
+        tuple
+            A tuple containing the feature vector (X), the target (y), and the
+            cropping indices.
+
+        """
         crop_inds = self.crop_inds[index].tolist()
         X = self.features.iloc[index].to_numpy()
         X = X.copy()
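
For orientation, a minimal usage sketch of the class documented in the two hunks above. The import path and the concrete column values are assumptions for illustration; only the parameter names and the (X, y, crop_inds) return shape come from the diff itself.

    import pandas as pd
    from eegdash.features import FeaturesDataset  # assumed import path

    # Three windows, two features per window.
    features = pd.DataFrame(
        {"alpha_power": [1.2, 0.7, 0.9], "beta_power": [0.3, 0.5, 0.4]}
    )
    # Metadata must carry the window bookkeeping columns and the target.
    metadata = pd.DataFrame(
        {
            "i_window_in_trial": [0, 1, 2],
            "i_start_in_trial": [0, 200, 400],
            "i_stop_in_trial": [200, 400, 600],
            "target": [0, 1, 0],
        }
    )
    ds = FeaturesDataset(features, metadata=metadata, description={"subject": 1})
    X, y, crop_inds = ds[0]  # feature vector, target, cropping indices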
@@ -75,18 +103,27 @@ class FeaturesDataset(EEGWindowsDataset):
         y = self.y[index]
         return X, y, crop_inds
 
-    def __len__(self):
+    def __len__(self) -> int:
+        """Return the number of samples in the dataset.
+
+        Returns
+        -------
+        int
+            The total number of feature samples.
+
+        """
         return len(self.features.index)
 
 
 def _compute_stats(
     ds: FeaturesDataset,
-    return_count=False,
-    return_mean=False,
-    return_var=False,
-    ddof=1,
-    numeric_only=False,
-):
+    return_count: bool = False,
+    return_mean: bool = False,
+    return_var: bool = False,
+    ddof: int = 1,
+    numeric_only: bool = False,
+) -> tuple:
+    """Compute statistics for a single FeaturesDataset."""
     res = []
     if return_count:
         res.append(ds.features.count(numeric_only=numeric_only))
@@ -97,7 +134,14 @@ def _compute_stats(
     return tuple(res)
 
 
-def _pooled_var(counts, means, variances, ddof, ddof_in=None):
+def _pooled_var(
+    counts: np.ndarray,
+    means: np.ndarray,
+    variances: np.ndarray,
+    ddof: int,
+    ddof_in: int | None = None,
+) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """Compute pooled variance across multiple datasets."""
     if ddof_in is None:
         ddof_in = ddof
     count = counts.sum(axis=0)
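
The `_pooled_var` helper combines per-dataset counts, means, and variances into statistics of the concatenated data without materializing it. Below is a self-contained numeric check of that combination rule, written independently of the package; the variable names mirror the signature above, with `ddof`/`ddof_in` used as in the diff.

    import numpy as np

    a = np.array([1.0, 2.0, 3.0, 4.0])
    b = np.array([10.0, 11.0, 12.0])

    counts = np.array([a.size, b.size], dtype=float)
    means = np.array([a.mean(), b.mean()])
    variances = np.array([a.var(ddof=0), b.var(ddof=0)])  # per-dataset, ddof_in=0

    count = counts.sum()
    mean = np.sum((counts / count) * means)
    ddof, ddof_in = 1, 0
    # Pooled variance = within-dataset spread + between-dataset spread.
    var = (
        np.sum((counts - ddof_in) * variances)
        + np.sum(counts * (means - mean) ** 2)
    ) / (count - ddof)

    full = np.concatenate([a, b])
    assert np.isclose(mean, full.mean())
    assert np.isclose(var, full.var(ddof=1))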
@@ -110,17 +154,20 @@ def _pooled_var(counts, means, variances, ddof, ddof_in=None):
 
 
 class FeaturesConcatDataset(BaseConcatDataset):
-    """A base class for concatenated datasets.
+    """A concatenated dataset of `FeaturesDataset` objects.
 
-    Holds either mne.Raw or mne.Epoch in self.datasets and has
-    a pandas DataFrame with additional description.
+    This class holds a list of :class:`FeaturesDataset` instances and allows
+    them to be treated as a single, larger dataset. It provides methods for
+
+    splitting, saving, and performing DataFrame-like operations (e.g., `mean`,
+    `var`, `fillna`) across all contained datasets.
 
     Parameters
     ----------
-    list_of_ds : list
-        list of BaseDataset, BaseConcatDataset or WindowsDataset
-    target_transform : callable | None
-        Optional function to call on targets before returning them.
+    list_of_ds : list of FeaturesDataset
+        A list of :class:`FeaturesDataset` objects to concatenate.
+    target_transform : callable, optional
+        A function to apply to the target values before they are returned.
 
     """
 
@@ -140,26 +187,28 @@ class FeaturesConcatDataset(BaseConcatDataset):
         self,
         by: str | list[int] | list[list[int]] | dict[str, list[int]],
     ) -> dict[str, FeaturesConcatDataset]:
-        """Split the dataset based on information listed in its description.
+        """Split the dataset into subsets.
 
-        The format could be based on a DataFrame or based on indices.
+        The splitting can be done based on a column in the description
+        DataFrame or by providing explicit indices for each split.
 
         Parameters
        ----------
-        by : str | list | dict
-            If ``by`` is a string, splitting is performed based on the
-            description DataFrame column with this name.
-            If ``by`` is a (list of) list of integers, the position in the first
-            list corresponds to the split id and the integers to the
-            datapoints of that split.
-            If a dict then each key will be used in the returned
-            splits dict and each value should be a list of int.
+        by : str or list or dict
+            - If a string, splits are created for each unique value in the
+              description column `by`.
+            - If a list of integers, a single split is created containing the
+              datasets at the specified indices.
+            - If a list of lists of integers, multiple splits are created, one
+              for each sublist of indices.
+            - If a dictionary, keys are used as split names and values are
+              lists of dataset indices.
 
         Returns
         -------
-        splits : dict
-            A dictionary with the name of the split (a string) as key and the
-            dataset as value.
+        dict[str, FeaturesConcatDataset]
+            A dictionary where keys are split names and values are the new
+            :class:`FeaturesConcatDataset` subsets.
 
         """
         if isinstance(by, str):
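
A hedged usage sketch of the splitting options listed above, assuming `concat_ds` is a `FeaturesConcatDataset` whose description table has a `subject` column:

    by_subject = concat_ds.split("subject")        # one split per unique subject value
    single = concat_ds.split([0, 1])                # one split holding datasets 0 and 1
    folds = concat_ds.split([[0, 1], [2, 3]])       # two splits, one per sublist
    named = concat_ds.split({"train": [0, 1], "valid": [2]})
    train_ds, valid_ds = named["train"], named["valid"]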
@@ -184,14 +233,21 @@ class FeaturesConcatDataset(BaseConcatDataset):
         }
 
     def get_metadata(self) -> pd.DataFrame:
-        """Concatenate the metadata and description of the wrapped Epochs.
+        """Get the metadata of all datasets as a single DataFrame.
+
+        Concatenates the metadata from all contained datasets and adds columns
+        from their `description` attributes.
 
         Returns
         -------
-        metadata : pd.DataFrame
-            DataFrame containing as many rows as there are windows in the
-            BaseConcatDataset, with the metadata and description information
-            for each window.
+        pandas.DataFrame
+            A DataFrame containing the metadata for every sample in the
+            concatenated dataset.
+
+        Raises
+        ------
+        TypeError
+            If any of the contained datasets is not a :class:`FeaturesDataset`.
 
         """
         if not all([isinstance(ds, FeaturesDataset) for ds in self.datasets]):
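
A short hedged example of `get_metadata`; the `subject` column is assumed to come from the per-dataset description:

    meta = concat_ds.get_metadata()                 # one row per window
    print(meta[["target", "subject"]].head())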
@@ -202,60 +258,59 @@ class FeaturesConcatDataset(BaseConcatDataset):
 
         all_dfs = list()
         for ds in self.datasets:
-            df = ds.metadata
+            df = ds.metadata.copy()
             for k, v in ds.description.items():
                 df[k] = v
             all_dfs.append(df)
 
         return pd.concat(all_dfs)
 
-    def save(self, path: str, overwrite: bool = False, offset: int = 0):
-        """Save datasets to files by creating one subdirectory for each dataset:
-        path/
-            0/
-                0-feat.parquet
-                metadata_df.pkl
-                description.json
-                raw-info.fif (if raw info was saved)
-                raw_preproc_kwargs.json (if raws were preprocessed)
-                window_kwargs.json (if this is a windowed dataset)
-                window_preproc_kwargs.json (if windows were preprocessed)
-                features_kwargs.json
-            1/
-                1-feat.parquet
-                metadata_df.pkl
-                description.json
-                raw-info.fif (if raw info was saved)
-                raw_preproc_kwargs.json (if raws were preprocessed)
-                window_kwargs.json (if this is a windowed dataset)
-                window_preproc_kwargs.json (if windows were preprocessed)
-                features_kwargs.json
+    def save(self, path: str, overwrite: bool = False, offset: int = 0) -> None:
+        """Save the concatenated dataset to a directory.
+
+        Creates a directory structure where each contained dataset is saved in
+        its own numbered subdirectory.
+
+        .. code-block::
+
+            path/
+                0/
+                    0-feat.parquet
+                    metadata_df.pkl
+                    description.json
+                    ...
+                1/
+                    1-feat.parquet
+                    ...
 
         Parameters
         ----------
         path : str
-            Directory in which subdirectories are created to store
-            -feat.parquet and .json files to.
-        overwrite : bool
-            Whether to delete old subdirectories that will be saved to in this
-            call.
-        offset : int
-            If provided, the integer is added to the id of the dataset in the
-            concat. This is useful in the setting of very large datasets, where
-            one dataset has to be processed and saved at a time to account for
-            its original position.
+            The directory where the dataset will be saved.
+        overwrite : bool, default False
+            If True, any existing subdirectories that conflict with the new
+            ones will be removed.
+        offset : int, default 0
+            An integer to add to the subdirectory names. Useful for saving
+            datasets in chunks.
+
+        Raises
+        ------
+        ValueError
+            If the dataset is empty.
+        FileExistsError
+            If a subdirectory already exists and `overwrite` is False.
 
         """
         if len(self.datasets) == 0:
             raise ValueError("Expect at least one dataset")
         path_contents = os.listdir(path)
-        n_sub_dirs = len([os.path.isdir(e) for e in path_contents])
+        n_sub_dirs = len([os.path.isdir(os.path.join(path, e)) for e in path_contents])
         for i_ds, ds in enumerate(self.datasets):
-            # remove subdirectory from list of untouched files / subdirectories
-            if str(i_ds + offset) in path_contents:
-                path_contents.remove(str(i_ds + offset))
-            # save_dir/i_ds/
-            sub_dir = os.path.join(path, str(i_ds + offset))
+            sub_dir_name = str(i_ds + offset)
+            if sub_dir_name in path_contents:
+                path_contents.remove(sub_dir_name)
+            sub_dir = os.path.join(path, sub_dir_name)
             if os.path.exists(sub_dir):
                 if overwrite:
                     shutil.rmtree(sub_dir)
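
A hedged sketch of chunked saving with the `offset` parameter documented above; `part1` and `part2` stand for two `FeaturesConcatDataset` chunks processed separately:

    import os

    out_dir = "extracted_features"
    os.makedirs(out_dir, exist_ok=True)
    part1.save(out_dir, overwrite=True)                              # writes 0/, 1/, ...
    part2.save(out_dir, overwrite=True, offset=len(part1.datasets))  # numbering continues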
@@ -265,35 +320,21 @@ class FeaturesConcatDataset(BaseConcatDataset):
                         f" a different directory, set overwrite=True, or "
                         f"resolve manually."
                     )
-            # save_dir/{i_ds+offset}/
             os.makedirs(sub_dir)
-            # save_dir/{i_ds+offset}/{i_ds+offset}-feat.parquet
             self._save_features(sub_dir, ds, i_ds, offset)
-            # save_dir/{i_ds+offset}/metadata_df.pkl
             self._save_metadata(sub_dir, ds)
-            # save_dir/{i_ds+offset}/description.json
             self._save_description(sub_dir, ds.description)
-            # save_dir/{i_ds+offset}/raw-info.fif
             self._save_raw_info(sub_dir, ds)
-            # save_dir/{i_ds+offset}/raw_preproc_kwargs.json
-            # save_dir/{i_ds+offset}/window_kwargs.json
-            # save_dir/{i_ds+offset}/window_preproc_kwargs.json
-            # save_dir/{i_ds+offset}/features_kwargs.json
             self._save_kwargs(sub_dir, ds)
-            if overwrite:
-                # the following will be True for all datasets preprocessed and
-                # stored in parallel with braindecode.preprocessing.preprocess
-                if i_ds + 1 + offset < n_sub_dirs:
-                    logger.warning(
-                        f"The number of saved datasets ({i_ds + 1 + offset}) "
-                        f"does not match the number of existing "
-                        f"subdirectories ({n_sub_dirs}). You may now "
-                        f"encounter a mix of differently preprocessed "
-                        f"datasets!",
-                        UserWarning,
-                    )
-        # if path contains files or directories that were not touched, raise
-        # warning
+            if overwrite and i_ds + 1 + offset < n_sub_dirs:
+                logger.warning(
+                    f"The number of saved datasets ({i_ds + 1 + offset}) "
+                    f"does not match the number of existing "
+                    f"subdirectories ({n_sub_dirs}). You may now "
+                    f"encounter a mix of differently preprocessed "
+                    f"datasets!",
+                    UserWarning,
+                )
         if path_contents:
             logger.warning(
                 f"Chosen directory {path} contains other "
@@ -301,20 +342,37 @@ class FeaturesConcatDataset(BaseConcatDataset):
             )
 
     @staticmethod
-    def _save_features(sub_dir, ds, i_ds, offset):
+    def _save_features(sub_dir: str, ds: FeaturesDataset, i_ds: int, offset: int):
+        """Save the feature DataFrame to a Parquet file."""
         parquet_file_name = f"{i_ds + offset}-feat.parquet"
         parquet_file_path = os.path.join(sub_dir, parquet_file_name)
         ds.features.to_parquet(parquet_file_path)
 
     @staticmethod
-    def _save_raw_info(sub_dir, ds):
-        if hasattr(ds, "raw_info"):
+    def _save_metadata(sub_dir: str, ds: FeaturesDataset):
+        """Save the metadata DataFrame to a pickle file."""
+        metadata_file_name = "metadata_df.pkl"
+        metadata_file_path = os.path.join(sub_dir, metadata_file_name)
+        ds.metadata.to_pickle(metadata_file_path)
+
+    @staticmethod
+    def _save_description(sub_dir: str, description: pd.Series):
+        """Save the description Series to a JSON file."""
+        desc_file_name = "description.json"
+        desc_file_path = os.path.join(sub_dir, desc_file_name)
+        description.to_json(desc_file_path)
+
+    @staticmethod
+    def _save_raw_info(sub_dir: str, ds: FeaturesDataset):
+        """Save the raw info dictionary to a FIF file if it exists."""
+        if hasattr(ds, "raw_info") and ds.raw_info is not None:
             fif_file_name = "raw-info.fif"
             fif_file_path = os.path.join(sub_dir, fif_file_name)
-            ds.raw_info.save(fif_file_path)
+            ds.raw_info.save(fif_file_path, overwrite=True)
 
     @staticmethod
-    def _save_kwargs(sub_dir, ds):
+    def _save_kwargs(sub_dir: str, ds: FeaturesDataset):
+        """Save various keyword argument dictionaries to JSON files."""
         for kwargs_name in [
             "raw_preproc_kwargs",
             "window_kwargs",
@@ -322,10 +380,10 @@ class FeaturesConcatDataset(BaseConcatDataset):
             "features_kwargs",
         ]:
             if hasattr(ds, kwargs_name):
-                kwargs_file_name = ".".join([kwargs_name, "json"])
-                kwargs_file_path = os.path.join(sub_dir, kwargs_file_name)
                 kwargs = getattr(ds, kwargs_name)
                 if kwargs is not None:
+                    kwargs_file_name = ".".join([kwargs_name, "json"])
+                    kwargs_file_path = os.path.join(sub_dir, kwargs_file_name)
                     with open(kwargs_file_path, "w") as f:
                         json.dump(kwargs, f)
 
@@ -334,7 +392,25 @@ class FeaturesConcatDataset(BaseConcatDataset):
         include_metadata: bool | str | List[str] = False,
         include_target: bool = False,
         include_crop_inds: bool = False,
-    ):
+    ) -> pd.DataFrame:
+        """Convert the dataset to a single pandas DataFrame.
+
+        Parameters
+        ----------
+        include_metadata : bool or str or list of str, default False
+            If True, include all metadata columns. If a string or list of
+            strings, include only the specified metadata columns.
+        include_target : bool, default False
+            If True, include the 'target' column.
+        include_crop_inds : bool, default False
+            If True, include window cropping index columns.
+
+        Returns
+        -------
+        pandas.DataFrame
+            A DataFrame containing the features and requested metadata.
+
+        """
         if (
             not isinstance(include_metadata, bool)
             or include_metadata
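
A hedged usage sketch of the conversion documented above. The method name `to_dataframe` is not visible in this hunk and is assumed from the surrounding class; the flags are the ones listed in the new docstring:

    df_all = concat_ds.to_dataframe(include_metadata=True, include_target=True)
    df_min = concat_ds.to_dataframe(include_metadata=["subject"], include_crop_inds=True)
    print(df_all.shape, df_min.columns.tolist())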
@@ -343,7 +419,7 @@ class FeaturesConcatDataset(BaseConcatDataset):
         include_dataset = False
         if isinstance(include_metadata, bool) and include_metadata:
             include_dataset = True
-            cols = self.datasets[0].metadata.columns
+            cols = self.datasets[0].metadata.columns.tolist()
         else:
             cols = include_metadata
         if isinstance(cols, bool) and not cols:
@@ -352,13 +428,14 @@ class FeaturesConcatDataset(BaseConcatDataset):
             cols = [cols]
         cols = set(cols)
         if include_crop_inds:
-            cols = {
-                "i_dataset",
-                "i_window_in_trial",
-                "i_start_in_trial",
-                "i_stop_in_trial",
-                *cols,
-            }
+            cols.update(
+                {
+                    "i_dataset",
+                    "i_window_in_trial",
+                    "i_start_in_trial",
+                    "i_stop_in_trial",
+                }
+            )
         if include_target:
             cols.add("target")
         cols = list(cols)
@@ -381,10 +458,26 @@ class FeaturesConcatDataset(BaseConcatDataset):
         dataframes = [ds.features for ds in self.datasets]
         return pd.concat(dataframes, axis=0, ignore_index=True)
 
-    def _numeric_columns(self):
+    def _numeric_columns(self) -> pd.Index:
+        """Get the names of numeric columns from the feature DataFrames."""
         return self.datasets[0].features.select_dtypes(include=np.number).columns
 
-    def count(self, numeric_only=False, n_jobs=1):
+    def count(self, numeric_only: bool = False, n_jobs: int = 1) -> pd.Series:
+        """Count non-NA cells for each feature column.
+
+        Parameters
+        ----------
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        n_jobs : int, default 1
+            Number of jobs to run in parallel.
+
+        Returns
+        -------
+        pandas.Series
+            The count of non-NA cells for each column.
+
+        """
         stats = Parallel(n_jobs)(
             delayed(_compute_stats)(ds, return_count=True, numeric_only=numeric_only)
             for ds in self.datasets
@@ -393,7 +486,22 @@ class FeaturesConcatDataset(BaseConcatDataset):
         count = counts.sum(axis=0)
         return pd.Series(count, index=self._numeric_columns())
 
-    def mean(self, numeric_only=False, n_jobs=1):
+    def mean(self, numeric_only: bool = False, n_jobs: int = 1) -> pd.Series:
+        """Compute the mean for each feature column.
+
+        Parameters
+        ----------
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        n_jobs : int, default 1
+            Number of jobs to run in parallel.
+
+        Returns
+        -------
+        pandas.Series
+            The mean of each column.
+
+        """
         stats = Parallel(n_jobs)(
             delayed(_compute_stats)(
                 ds, return_count=True, return_mean=True, numeric_only=numeric_only
@@ -405,7 +513,26 @@ class FeaturesConcatDataset(BaseConcatDataset):
         mean = np.sum((counts / count) * means, axis=0)
         return pd.Series(mean, index=self._numeric_columns())
 
-    def var(self, ddof=1, numeric_only=False, n_jobs=1):
+    def var(
+        self, ddof: int = 1, numeric_only: bool = False, n_jobs: int = 1
+    ) -> pd.Series:
+        """Compute the variance for each feature column.
+
+        Parameters
+        ----------
+        ddof : int, default 1
+            Delta Degrees of Freedom. The divisor used in calculations is N - ddof.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        n_jobs : int, default 1
+            Number of jobs to run in parallel.
+
+        Returns
+        -------
+        pandas.Series
+            The variance of each column.
+
+        """
         stats = Parallel(n_jobs)(
             delayed(_compute_stats)(
                 ds,
@@ -425,12 +552,50 @@ class FeaturesConcatDataset(BaseConcatDataset):
         _, _, var = _pooled_var(counts, means, variances, ddof, ddof_in=0)
         return pd.Series(var, index=self._numeric_columns())
 
-    def std(self, ddof=1, numeric_only=False, eps=0, n_jobs=1):
+    def std(
+        self, ddof: int = 1, numeric_only: bool = False, eps: float = 0, n_jobs: int = 1
+    ) -> pd.Series:
+        """Compute the standard deviation for each feature column.
+
+        Parameters
+        ----------
+        ddof : int, default 1
+            Delta Degrees of Freedom.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        eps : float, default 0
+            A small epsilon value to add to the variance before taking the
+            square root to avoid numerical instability.
+        n_jobs : int, default 1
+            Number of jobs to run in parallel.
+
+        Returns
+        -------
+        pandas.Series
+            The standard deviation of each column.
+
+        """
         return np.sqrt(
             self.var(ddof=ddof, numeric_only=numeric_only, n_jobs=n_jobs) + eps
         )
 
-    def zscore(self, ddof=1, numeric_only=False, eps=0, n_jobs=1):
+    def zscore(
+        self, ddof: int = 1, numeric_only: bool = False, eps: float = 0, n_jobs: int = 1
+    ) -> None:
+        """Apply z-score normalization to numeric columns in-place.
+
+        Parameters
+        ----------
+        ddof : int, default 1
+            Delta Degrees of Freedom for variance calculation.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        eps : float, default 0
+            Epsilon for numerical stability.
+        n_jobs : int, default 1
+            Number of jobs to run in parallel for statistics computation.
+
+        """
         stats = Parallel(n_jobs)(
             delayed(_compute_stats)(
                 ds,
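
A hedged sketch of the pooled statistics and the in-place normalization described above, assuming `concat_ds` holds numeric feature columns:

    mu = concat_ds.mean(numeric_only=True, n_jobs=2)    # pooled across all datasets
    sigma = concat_ds.std(numeric_only=True, eps=1e-12)
    concat_ds.zscore(numeric_only=True, eps=1e-12)      # modifies each ds.features in place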
@@ -450,10 +615,13 @@ class FeaturesConcatDataset(BaseConcatDataset):
         _, mean, var = _pooled_var(counts, means, variances, ddof, ddof_in=0)
         std = np.sqrt(var + eps)
         for ds in self.datasets:
-            ds.features = (ds.features - mean) / std
+            ds.features.loc[:, self._numeric_columns()] = (
+                ds.features.loc[:, self._numeric_columns()] - mean
+            ) / std
 
     @staticmethod
-    def _enforce_inplace_operations(func_name, kwargs):
+    def _enforce_inplace_operations(func_name: str, kwargs: dict):
+        """Raise an error if 'inplace=False' is passed to a method."""
         if "inplace" in kwargs and kwargs["inplace"] is False:
             raise ValueError(
                 f"{func_name} only works inplace, please change "
@@ -461,33 +629,49 @@ class FeaturesConcatDataset(BaseConcatDataset):
             )
         kwargs["inplace"] = True
 
-    def fillna(self, *args, **kwargs):
+    def fillna(self, *args, **kwargs) -> None:
+        """Fill NA/NaN values in-place. See :meth:`pandas.DataFrame.fillna`."""
         FeaturesConcatDataset._enforce_inplace_operations("fillna", kwargs)
         for ds in self.datasets:
             ds.features.fillna(*args, **kwargs)
 
-    def replace(self, *args, **kwargs):
+    def replace(self, *args, **kwargs) -> None:
+        """Replace values in-place. See :meth:`pandas.DataFrame.replace`."""
         FeaturesConcatDataset._enforce_inplace_operations("replace", kwargs)
         for ds in self.datasets:
             ds.features.replace(*args, **kwargs)
 
-    def interpolate(self, *args, **kwargs):
+    def interpolate(self, *args, **kwargs) -> None:
+        """Interpolate values in-place. See :meth:`pandas.DataFrame.interpolate`."""
         FeaturesConcatDataset._enforce_inplace_operations("interpolate", kwargs)
         for ds in self.datasets:
             ds.features.interpolate(*args, **kwargs)
 
-    def dropna(self, *args, **kwargs):
+    def dropna(self, *args, **kwargs) -> None:
+        """Remove missing values in-place. See :meth:`pandas.DataFrame.dropna`."""
         FeaturesConcatDataset._enforce_inplace_operations("dropna", kwargs)
         for ds in self.datasets:
             ds.features.dropna(*args, **kwargs)
 
-    def drop(self, *args, **kwargs):
+    def drop(self, *args, **kwargs) -> None:
+        """Drop specified labels from rows or columns in-place. See :meth:`pandas.DataFrame.drop`."""
         FeaturesConcatDataset._enforce_inplace_operations("drop", kwargs)
         for ds in self.datasets:
             ds.features.drop(*args, **kwargs)
 
-    def join(self, concat_dataset: FeaturesConcatDataset, **kwargs):
+    def join(self, concat_dataset: FeaturesConcatDataset, **kwargs) -> None:
+        """Join columns with other FeaturesConcatDataset in-place.
+
+        Parameters
+        ----------
+        concat_dataset : FeaturesConcatDataset
+            The dataset to join with. Must have the same number of datasets,
+            and each corresponding dataset must have the same length.
+        **kwargs
+            Keyword arguments to pass to :meth:`pandas.DataFrame.join`.
+
+        """
         assert len(self.datasets) == len(concat_dataset.datasets)
         for ds1, ds2 in zip(self.datasets, concat_dataset.datasets):
             assert len(ds1) == len(ds2)
-            ds1.features.join(ds2, **kwargs)
+            ds1.features = ds1.features.join(ds2.features, **kwargs)
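
A hedged sketch of `join` under the constraints above: both objects must contain the same number of datasets with matching lengths; `lsuffix`/`rsuffix` are the usual pandas join keywords, and `spectral`/`temporal` are illustrative names:

    spectral.join(temporal, lsuffix="_spec", rsuffix="_temp")
    # spectral now carries both feature blocks side by side.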