eegdash 0.3.3.dev61__py3-none-any.whl → 0.5.0.dev180784713__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eegdash/__init__.py +19 -6
- eegdash/api.py +336 -539
- eegdash/bids_eeg_metadata.py +495 -0
- eegdash/const.py +349 -0
- eegdash/dataset/__init__.py +28 -0
- eegdash/dataset/base.py +311 -0
- eegdash/dataset/bids_dataset.py +641 -0
- eegdash/dataset/dataset.py +692 -0
- eegdash/dataset/dataset_summary.csv +255 -0
- eegdash/dataset/registry.py +287 -0
- eegdash/downloader.py +197 -0
- eegdash/features/__init__.py +15 -13
- eegdash/features/datasets.py +329 -138
- eegdash/features/decorators.py +105 -13
- eegdash/features/extractors.py +233 -63
- eegdash/features/feature_bank/__init__.py +12 -12
- eegdash/features/feature_bank/complexity.py +22 -20
- eegdash/features/feature_bank/connectivity.py +27 -28
- eegdash/features/feature_bank/csp.py +3 -1
- eegdash/features/feature_bank/dimensionality.py +6 -6
- eegdash/features/feature_bank/signal.py +29 -30
- eegdash/features/feature_bank/spectral.py +40 -44
- eegdash/features/feature_bank/utils.py +8 -0
- eegdash/features/inspect.py +126 -15
- eegdash/features/serialization.py +58 -17
- eegdash/features/utils.py +90 -16
- eegdash/hbn/__init__.py +28 -0
- eegdash/hbn/preprocessing.py +105 -0
- eegdash/hbn/windows.py +428 -0
- eegdash/logging.py +54 -0
- eegdash/mongodb.py +55 -24
- eegdash/paths.py +52 -0
- eegdash/utils.py +29 -1
- eegdash-0.5.0.dev180784713.dist-info/METADATA +121 -0
- eegdash-0.5.0.dev180784713.dist-info/RECORD +38 -0
- eegdash-0.5.0.dev180784713.dist-info/licenses/LICENSE +29 -0
- eegdash/data_config.py +0 -34
- eegdash/data_utils.py +0 -687
- eegdash/dataset.py +0 -69
- eegdash/preprocessing.py +0 -63
- eegdash-0.3.3.dev61.dist-info/METADATA +0 -192
- eegdash-0.3.3.dev61.dist-info/RECORD +0 -28
- eegdash-0.3.3.dev61.dist-info/licenses/LICENSE +0 -23
- {eegdash-0.3.3.dev61.dist-info → eegdash-0.5.0.dev180784713.dist-info}/WHEEL +0 -0
- {eegdash-0.3.3.dev61.dist-info → eegdash-0.5.0.dev180784713.dist-info}/top_level.txt +0 -0
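The headline change is a restructuring: the monolithic `data_utils.py`, `dataset.py`, and `preprocessing.py` are removed in favor of the new `dataset/` and `hbn/` subpackages plus dedicated `downloader`, `paths`, and `logging` modules. A hedged sketch of imports under the new layout, derived only from the file paths above and the `__all__` added in the diff below:

```python
# Confirmed by the diff shown below:
from eegdash.features.datasets import FeaturesDataset, FeaturesConcatDataset
from eegdash.logging import logger

# Plausible from the file list above, but not shown in this diff (assumptions):
# from eegdash.dataset import ...
# from eegdash.hbn import ...
```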
eegdash/features/datasets.py
CHANGED
@@ -3,7 +3,6 @@ from __future__ import annotations
 import json
 import os
 import shutil
-import warnings
 from collections.abc import Callable
 from typing import Dict, List
 
@@ -17,22 +16,43 @@ from braindecode.datasets.base import (
     _create_description,
 )
 
+from ..logging import logger
+
+__all__ = [
+    "FeaturesDataset",
+    "FeaturesConcatDataset",
+]
+
 
 class FeaturesDataset(EEGWindowsDataset):
-    """
+    """A dataset of features extracted from EEG windows.
 
-
-
-
+    This class holds features in a pandas DataFrame and provides an interface
+    compatible with braindecode's dataset structure. Each row in the feature
+    DataFrame corresponds to a single sample (e.g., an EEG window).
 
     Parameters
     ----------
-    features :
-
-
-
-
-
+    features : pandas.DataFrame
+        A DataFrame where each row is a sample and each column is a feature.
+    metadata : pandas.DataFrame, optional
+        A DataFrame containing metadata for each sample, indexed consistently
+        with `features`. Must include columns 'i_window_in_trial',
+        'i_start_in_trial', 'i_stop_in_trial', and 'target'.
+    description : dict or pandas.Series, optional
+        Additional high-level information about the dataset (e.g., subject ID).
+    transform : callable, optional
+        A function or transform to apply to the feature data on-the-fly.
+    raw_info : dict, optional
+        Information about the original raw recording, for provenance.
+    raw_preproc_kwargs : dict, optional
+        Keyword arguments used for preprocessing the raw data.
+    window_kwargs : dict, optional
+        Keyword arguments used for windowing the data.
+    window_preproc_kwargs : dict, optional
+        Keyword arguments used for preprocessing the windowed data.
+    features_kwargs : dict, optional
+        Keyword arguments used for feature extraction.
 
     """
 
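The new docstring pins down the constructor contract. A minimal sketch of building a `FeaturesDataset` by hand, assuming the `__init__` parameters match the documented ones (the diff shows the docstring, not the constructor body):

```python
import pandas as pd
from eegdash.features.datasets import FeaturesDataset

features = pd.DataFrame({"alpha_power": [1.2, 0.8], "beta_power": [0.4, 0.5]})
metadata = pd.DataFrame(
    {
        # columns required by the docstring above
        "i_window_in_trial": [0, 1],
        "i_start_in_trial": [0, 256],
        "i_stop_in_trial": [256, 512],
        "target": [0, 1],
    }
)
ds = FeaturesDataset(features, metadata=metadata, description={"subject": 1})
```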
@@ -64,7 +84,21 @@ class FeaturesDataset(EEGWindowsDataset):
         ].to_numpy()
         self.y = metadata.loc[:, "target"].to_list()
 
-    def __getitem__(self, index):
+    def __getitem__(self, index: int) -> tuple[np.ndarray, int, list]:
+        """Get a single sample from the dataset.
+
+        Parameters
+        ----------
+        index : int
+            The index of the sample to retrieve.
+
+        Returns
+        -------
+        tuple
+            A tuple containing the feature vector (X), the target (y), and the
+            cropping indices.
+
+        """
         crop_inds = self.crop_inds[index].tolist()
         X = self.features.iloc[index].to_numpy()
         X = X.copy()
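Continuing the sketch above, indexing mirrors braindecode's windowed-dataset interface:

```python
# Continuing the FeaturesDataset sketch from earlier.
X, y, crop_inds = ds[0]
# X is one row of `features` as a NumPy array, y its target, and crop_inds
# the [i_window_in_trial, i_start_in_trial, i_stop_in_trial] triple.
print(len(ds))  # number of feature rows
```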
@@ -74,18 +108,27 @@ class FeaturesDataset(EEGWindowsDataset):
         y = self.y[index]
         return X, y, crop_inds
 
-    def __len__(self):
+    def __len__(self) -> int:
+        """Return the number of samples in the dataset.
+
+        Returns
+        -------
+        int
+            The total number of feature samples.
+
+        """
         return len(self.features.index)
 
 
 def _compute_stats(
     ds: FeaturesDataset,
-    return_count=False,
-    return_mean=False,
-    return_var=False,
-    ddof=1,
-    numeric_only=False,
-):
+    return_count: bool = False,
+    return_mean: bool = False,
+    return_var: bool = False,
+    ddof: int = 1,
+    numeric_only: bool = False,
+) -> tuple:
+    """Compute statistics for a single :class:`~eegdash.features.datasets.FeaturesDataset`."""
     res = []
     if return_count:
         res.append(ds.features.count(numeric_only=numeric_only))
@@ -96,7 +139,14 @@ def _compute_stats(
     return tuple(res)
 
 
-def _pooled_var(counts, means, variances, ddof, ddof_in=None):
+def _pooled_var(
+    counts: np.ndarray,
+    means: np.ndarray,
+    variances: np.ndarray,
+    ddof: int,
+    ddof_in: int | None = None,
+) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """Compute pooled variance across multiple datasets."""
     if ddof_in is None:
         ddof_in = ddof
     count = counts.sum(axis=0)
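`_pooled_var` merges per-dataset summary statistics without ever touching the raw rows. Its body is only partially visible in this diff, so the following is a reference sketch of the standard pooling identity the signature implies (function name `pooled_stats` is ours, not the package's):

```python
import numpy as np

def pooled_stats(counts, means, variances, ddof=1, ddof_in=1):
    # counts/means/variances: shape (n_datasets, n_features) summary arrays.
    count = counts.sum(axis=0)
    mean = ((counts / count) * means).sum(axis=0)
    # Within-dataset sums of squares plus the between-dataset correction.
    ss_within = ((counts - ddof_in) * variances).sum(axis=0)
    ss_between = (counts * (means - mean) ** 2).sum(axis=0)
    var = (ss_within + ss_between) / (count - ddof)
    return count, mean, var

# Sanity check against a direct computation on concatenated data.
x = [np.random.randn(100, 3), np.random.randn(50, 3) + 1.0]
c = np.array([[len(a)] * 3 for a in x])
m = np.array([a.mean(axis=0) for a in x])
v = np.array([a.var(axis=0, ddof=1) for a in x])
_, mu, var = pooled_stats(c, m, v)
assert np.allclose(var, np.concatenate(x).var(axis=0, ddof=1))
```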
@@ -109,17 +159,20 @@ def _pooled_var(counts, means, variances, ddof, ddof_in=None):
 
 
 class FeaturesConcatDataset(BaseConcatDataset):
-    """A
+    """A concatenated dataset of :class:`~eegdash.features.datasets.FeaturesDataset` objects.
 
-
-    a
+    This class holds a list of :class:`~eegdash.features.datasets.FeaturesDataset` instances and allows
+    them to be treated as a single, larger dataset. It provides methods for
+    splitting, saving, and performing DataFrame-like operations (e.g., `mean`,
+    `var`, `fillna`) across all contained datasets.
+
 
     Parameters
     ----------
-    list_of_ds : list
-        list of
-    target_transform : callable
-
+    list_of_ds : list of ~eegdash.features.datasets.FeaturesDataset
+        A list of :class:`~eegdash.features.datasets.FeaturesDataset` objects to concatenate.
+    target_transform : callable, optional
+        A function to apply to the target values before they are returned.
 
     """
 
@@ -139,26 +192,28 @@ class FeaturesConcatDataset(BaseConcatDataset):
         self,
         by: str | list[int] | list[list[int]] | dict[str, list[int]],
     ) -> dict[str, FeaturesConcatDataset]:
-        """Split the dataset
+        """Split the dataset into subsets.
 
-        The
+        The splitting can be done based on a column in the description
+        DataFrame or by providing explicit indices for each split.
 
         Parameters
         ----------
-        by : str
-            If
-
-            If
-
-
-
-
+        by : str or list or dict
+            - If a string, splits are created for each unique value in the
+              description column `by`.
+            - If a list of integers, a single split is created containing the
+              datasets at the specified indices.
+            - If a list of lists of integers, multiple splits are created, one
+              for each sublist of indices.
+            - If a dictionary, keys are used as split names and values are
+              lists of dataset indices.
 
         Returns
         -------
-
-            A dictionary
-
+        dict[str, ~eegdash.features.datasets.FeaturesConcatDataset]
+            A dictionary where keys are split names and values are the new
+            :class:`~eegdash.features.datasets.FeaturesConcatDataset` subsets.
 
         """
         if isinstance(by, str):
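A usage sketch of the four documented `by` modes (the dataset variables and the `"subject"` description column are hypothetical):

```python
from eegdash.features.datasets import FeaturesConcatDataset

concat = FeaturesConcatDataset([ds_a, ds_b, ds_c])  # hypothetical FeaturesDatasets
by_subject = concat.split("subject")                # one split per unique value
subset = concat.split([0, 2])                       # single split of datasets 0 and 2
folds = concat.split([[0, 1], [2]])                 # one split per sublist
named = concat.split({"train": [0, 1], "valid": [2]})  # caller-chosen names
```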
@@ -183,14 +238,22 @@ class FeaturesConcatDataset(BaseConcatDataset):
         }
 
     def get_metadata(self) -> pd.DataFrame:
-        """
+        """Get the metadata of all datasets as a single DataFrame.
+
+        Concatenates the metadata from all contained datasets and adds columns
+        from their `description` attributes.
 
         Returns
         -------
-
-            DataFrame containing
-
-
+        pandas.DataFrame
+            A DataFrame containing the metadata for every sample in the
+            concatenated dataset.
+
+        Raises
+        ------
+        TypeError
+            If any of the contained datasets is not a
+            :class:`~eegdash.features.datasets.FeaturesDataset`.
 
         """
         if not all([isinstance(ds, FeaturesDataset) for ds in self.datasets]):
@@ -201,60 +264,59 @@ class FeaturesConcatDataset(BaseConcatDataset):
 
         all_dfs = list()
         for ds in self.datasets:
-            df = ds.metadata
+            df = ds.metadata.copy()
             for k, v in ds.description.items():
                 df[k] = v
             all_dfs.append(df)
 
         return pd.concat(all_dfs)
 
-    def save(self, path: str, overwrite: bool = False, offset: int = 0):
-        """Save
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            raw_preproc_kwargs.json (if raws were preprocessed)
-            window_kwargs.json (if this is a windowed dataset)
-            window_preproc_kwargs.json (if windows were preprocessed)
-            features_kwargs.json
+    def save(self, path: str, overwrite: bool = False, offset: int = 0) -> None:
+        """Save the concatenated dataset to a directory.
+
+        Creates a directory structure where each contained dataset is saved in
+        its own numbered subdirectory.
+
+        .. code-block::
+
+            path/
+                0/
+                    0-feat.parquet
+                    metadata_df.pkl
+                    description.json
+                    ...
+                1/
+                    1-feat.parquet
+                    ...
 
         Parameters
         ----------
         path : str
-
-
-
-
-
-
-
-
-
-
+            The directory where the dataset will be saved.
+        overwrite : bool, default False
+            If True, any existing subdirectories that conflict with the new
+            ones will be removed.
+        offset : int, default 0
+            An integer to add to the subdirectory names. Useful for saving
+            datasets in chunks.
+
+        Raises
+        ------
+        ValueError
+            If the dataset is empty.
+        FileExistsError
+            If a subdirectory already exists and `overwrite` is False.
 
         """
         if len(self.datasets) == 0:
             raise ValueError("Expect at least one dataset")
         path_contents = os.listdir(path)
-        n_sub_dirs = len([os.path.isdir(e) for e in path_contents])
+        n_sub_dirs = len([os.path.isdir(os.path.join(path, e)) for e in path_contents])
         for i_ds, ds in enumerate(self.datasets):
-
-            if
-                path_contents.remove(
-
-            sub_dir = os.path.join(path, str(i_ds + offset))
+            sub_dir_name = str(i_ds + offset)
+            if sub_dir_name in path_contents:
+                path_contents.remove(sub_dir_name)
+            sub_dir = os.path.join(path, sub_dir_name)
             if os.path.exists(sub_dir):
                 if overwrite:
                     shutil.rmtree(sub_dir)
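As the rebuilt docstring spells out, `save` writes one numbered subdirectory per contained dataset and (as the next hunk shows) now warns through `logger` instead of `warnings`. A usage sketch; note that `path` must already exist, because the method starts with `os.listdir(path)`:

```python
import os

os.makedirs("features_out", exist_ok=True)
concat.save("features_out", overwrite=True)             # writes 0/, 1/, 2/
concat.save("features_out", overwrite=True, offset=3)   # chunked: writes 3/, 4/, 5/
```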
@@ -264,56 +326,59 @@ class FeaturesConcatDataset(BaseConcatDataset):
                     f" a different directory, set overwrite=True, or "
                     f"resolve manually."
                 )
-            # save_dir/{i_ds+offset}/
             os.makedirs(sub_dir)
-            # save_dir/{i_ds+offset}/{i_ds+offset}-feat.parquet
             self._save_features(sub_dir, ds, i_ds, offset)
-            # save_dir/{i_ds+offset}/metadata_df.pkl
             self._save_metadata(sub_dir, ds)
-            # save_dir/{i_ds+offset}/description.json
             self._save_description(sub_dir, ds.description)
-            # save_dir/{i_ds+offset}/raw-info.fif
             self._save_raw_info(sub_dir, ds)
-            # save_dir/{i_ds+offset}/raw_preproc_kwargs.json
-            # save_dir/{i_ds+offset}/window_kwargs.json
-            # save_dir/{i_ds+offset}/window_preproc_kwargs.json
-            # save_dir/{i_ds+offset}/features_kwargs.json
             self._save_kwargs(sub_dir, ds)
-            if overwrite:
-
-
-
-
-
-
-
-
-                    f"datasets!",
-                    UserWarning,
-                )
-        # if path contains files or directories that were not touched, raise
-        # warning
+            if overwrite and i_ds + 1 + offset < n_sub_dirs:
+                logger.warning(
+                    f"The number of saved datasets ({i_ds + 1 + offset}) "
+                    f"does not match the number of existing "
+                    f"subdirectories ({n_sub_dirs}). You may now "
+                    f"encounter a mix of differently preprocessed "
+                    f"datasets!",
+                    UserWarning,
+                )
         if path_contents:
-
+            logger.warning(
                 f"Chosen directory {path} contains other "
                 f"subdirectories or files {path_contents}."
             )
 
     @staticmethod
-    def _save_features(sub_dir, ds, i_ds, offset):
+    def _save_features(sub_dir: str, ds: FeaturesDataset, i_ds: int, offset: int):
+        """Save the feature DataFrame to a Parquet file."""
         parquet_file_name = f"{i_ds + offset}-feat.parquet"
         parquet_file_path = os.path.join(sub_dir, parquet_file_name)
         ds.features.to_parquet(parquet_file_path)
 
     @staticmethod
-    def
-
+    def _save_metadata(sub_dir: str, ds: FeaturesDataset):
+        """Save the metadata DataFrame to a pickle file."""
+        metadata_file_name = "metadata_df.pkl"
+        metadata_file_path = os.path.join(sub_dir, metadata_file_name)
+        ds.metadata.to_pickle(metadata_file_path)
+
+    @staticmethod
+    def _save_description(sub_dir: str, description: pd.Series):
+        """Save the description Series to a JSON file."""
+        desc_file_name = "description.json"
+        desc_file_path = os.path.join(sub_dir, desc_file_name)
+        description.to_json(desc_file_path)
+
+    @staticmethod
+    def _save_raw_info(sub_dir: str, ds: FeaturesDataset):
+        """Save the raw info dictionary to a FIF file if it exists."""
+        if hasattr(ds, "raw_info") and ds.raw_info is not None:
             fif_file_name = "raw-info.fif"
             fif_file_path = os.path.join(sub_dir, fif_file_name)
-            ds.raw_info.save(fif_file_path)
+            ds.raw_info.save(fif_file_path, overwrite=True)
 
     @staticmethod
-    def _save_kwargs(sub_dir, ds):
+    def _save_kwargs(sub_dir: str, ds: FeaturesDataset):
+        """Save various keyword argument dictionaries to JSON files."""
         for kwargs_name in [
             "raw_preproc_kwargs",
             "window_kwargs",
@@ -321,10 +386,10 @@ class FeaturesConcatDataset(BaseConcatDataset):
             "features_kwargs",
         ]:
             if hasattr(ds, kwargs_name):
-                kwargs_file_name = ".".join([kwargs_name, "json"])
-                kwargs_file_path = os.path.join(sub_dir, kwargs_file_name)
                 kwargs = getattr(ds, kwargs_name)
                 if kwargs is not None:
+                    kwargs_file_name = ".".join([kwargs_name, "json"])
+                    kwargs_file_path = os.path.join(sub_dir, kwargs_file_name)
                     with open(kwargs_file_path, "w") as f:
                         json.dump(kwargs, f)
 
@@ -333,7 +398,25 @@ class FeaturesConcatDataset(BaseConcatDataset):
         include_metadata: bool | str | List[str] = False,
         include_target: bool = False,
         include_crop_inds: bool = False,
-    ):
+    ) -> pd.DataFrame:
+        """Convert the dataset to a single pandas DataFrame.
+
+        Parameters
+        ----------
+        include_metadata : bool or str or list of str, default False
+            If True, include all metadata columns. If a string or list of
+            strings, include only the specified metadata columns.
+        include_target : bool, default False
+            If True, include the 'target' column.
+        include_crop_inds : bool, default False
+            If True, include window cropping index columns.
+
+        Returns
+        -------
+        pandas.DataFrame
+            A DataFrame containing the features and requested metadata.
+
+        """
         if (
             not isinstance(include_metadata, bool)
             or include_metadata
@@ -342,7 +425,7 @@ class FeaturesConcatDataset(BaseConcatDataset):
             include_dataset = False
             if isinstance(include_metadata, bool) and include_metadata:
                 include_dataset = True
-                cols = self.datasets[0].metadata.columns
+                cols = self.datasets[0].metadata.columns.tolist()
             else:
                 cols = include_metadata
                 if isinstance(cols, bool) and not cols:
@@ -351,13 +434,14 @@ class FeaturesConcatDataset(BaseConcatDataset):
                    cols = [cols]
                cols = set(cols)
            if include_crop_inds:
-                cols
-
-
-
-
-
-
+                cols.update(
+                    {
+                        "i_dataset",
+                        "i_window_in_trial",
+                        "i_start_in_trial",
+                        "i_stop_in_trial",
+                    }
+                )
            if include_target:
                cols.add("target")
            cols = list(cols)
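A sketch of the documented `to_dataframe` options, continuing the `concat` example from earlier:

```python
df = concat.to_dataframe()                            # features only
df_meta = concat.to_dataframe(include_metadata=True)  # plus all metadata columns
df_tgt = concat.to_dataframe(include_target=True)     # plus the 'target' column
df_inds = concat.to_dataframe(include_crop_inds=True)
# include_crop_inds adds i_dataset, i_window_in_trial, i_start_in_trial and
# i_stop_in_trial, per the rebuilt cols.update() above.
```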
@@ -380,10 +464,26 @@ class FeaturesConcatDataset(BaseConcatDataset):
         dataframes = [ds.features for ds in self.datasets]
         return pd.concat(dataframes, axis=0, ignore_index=True)
 
-    def _numeric_columns(self):
+    def _numeric_columns(self) -> pd.Index:
+        """Get the names of numeric columns from the feature DataFrames."""
         return self.datasets[0].features.select_dtypes(include=np.number).columns
 
-    def count(self, numeric_only=False, n_jobs=1):
+    def count(self, numeric_only: bool = False, n_jobs: int = 1) -> pd.Series:
+        """Count non-NA cells for each feature column.
+
+        Parameters
+        ----------
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        n_jobs : int, default 1
+            Number of jobs to run in parallel.
+
+        Returns
+        -------
+        pandas.Series
+            The count of non-NA cells for each column.
+
+        """
         stats = Parallel(n_jobs)(
             delayed(_compute_stats)(ds, return_count=True, numeric_only=numeric_only)
             for ds in self.datasets
@@ -392,7 +492,22 @@ class FeaturesConcatDataset(BaseConcatDataset):
         count = counts.sum(axis=0)
         return pd.Series(count, index=self._numeric_columns())
 
-    def mean(self, numeric_only=False, n_jobs=1):
+    def mean(self, numeric_only: bool = False, n_jobs: int = 1) -> pd.Series:
+        """Compute the mean for each feature column.
+
+        Parameters
+        ----------
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        n_jobs : int, default 1
+            Number of jobs to run in parallel.
+
+        Returns
+        -------
+        pandas.Series
+            The mean of each column.
+
+        """
         stats = Parallel(n_jobs)(
             delayed(_compute_stats)(
                 ds, return_count=True, return_mean=True, numeric_only=numeric_only
@@ -404,7 +519,26 @@ class FeaturesConcatDataset(BaseConcatDataset):
         mean = np.sum((counts / count) * means, axis=0)
         return pd.Series(mean, index=self._numeric_columns())
 
-    def var(self, ddof=1, numeric_only=False, n_jobs=1):
+    def var(
+        self, ddof: int = 1, numeric_only: bool = False, n_jobs: int = 1
+    ) -> pd.Series:
+        """Compute the variance for each feature column.
+
+        Parameters
+        ----------
+        ddof : int, default 1
+            Delta Degrees of Freedom. The divisor used in calculations is N - ddof.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        n_jobs : int, default 1
+            Number of jobs to run in parallel.
+
+        Returns
+        -------
+        pandas.Series
+            The variance of each column.
+
+        """
         stats = Parallel(n_jobs)(
             delayed(_compute_stats)(
                 ds,
@@ -424,12 +558,50 @@ class FeaturesConcatDataset(BaseConcatDataset):
         _, _, var = _pooled_var(counts, means, variances, ddof, ddof_in=0)
         return pd.Series(var, index=self._numeric_columns())
 
-    def std(self, ddof=1, numeric_only=False, eps=0, n_jobs=1):
+    def std(
+        self, ddof: int = 1, numeric_only: bool = False, eps: float = 0, n_jobs: int = 1
+    ) -> pd.Series:
+        """Compute the standard deviation for each feature column.
+
+        Parameters
+        ----------
+        ddof : int, default 1
+            Delta Degrees of Freedom.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        eps : float, default 0
+            A small epsilon value to add to the variance before taking the
+            square root to avoid numerical instability.
+        n_jobs : int, default 1
+            Number of jobs to run in parallel.
+
+        Returns
+        -------
+        pandas.Series
+            The standard deviation of each column.
+
+        """
         return np.sqrt(
             self.var(ddof=ddof, numeric_only=numeric_only, n_jobs=n_jobs) + eps
         )
 
-    def zscore(self, ddof=1, numeric_only=False, eps=0, n_jobs=1):
+    def zscore(
+        self, ddof: int = 1, numeric_only: bool = False, eps: float = 0, n_jobs: int = 1
+    ) -> None:
+        """Apply z-score normalization to numeric columns in-place.
+
+        Parameters
+        ----------
+        ddof : int, default 1
+            Delta Degrees of Freedom for variance calculation.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        eps : float, default 0
+            Epsilon for numerical stability.
+        n_jobs : int, default 1
+            Number of jobs to run in parallel for statistics computation.
+
+        """
         stats = Parallel(n_jobs)(
             delayed(_compute_stats)(
                 ds,
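These statistics are computed per dataset in parallel via `_compute_stats` and merged with `_pooled_var`, so the full feature table is never concatenated in memory. A usage sketch, continuing the `concat` example:

```python
n = concat.count(numeric_only=True)            # pandas.Series of non-NA counts
mu = concat.mean(numeric_only=True, n_jobs=4)  # pooled means across datasets
sd = concat.std(ddof=1, eps=1e-12)             # sqrt(pooled variance + eps)
concat.zscore(eps=1e-12)  # in-place: (features - mu) / sd on every dataset
```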
@@ -449,10 +621,13 @@ class FeaturesConcatDataset(BaseConcatDataset):
         _, mean, var = _pooled_var(counts, means, variances, ddof, ddof_in=0)
         std = np.sqrt(var + eps)
         for ds in self.datasets:
-            ds.features
+            ds.features.loc[:, self._numeric_columns()] = (
+                ds.features.loc[:, self._numeric_columns()] - mean
+            ) / std
 
     @staticmethod
-    def _enforce_inplace_operations(func_name, kwargs):
+    def _enforce_inplace_operations(func_name: str, kwargs: dict):
+        """Raise an error if 'inplace=False' is passed to a method."""
         if "inplace" in kwargs and kwargs["inplace"] is False:
             raise ValueError(
                 f"{func_name} only works inplace, please change "
@@ -460,33 +635,49 @@ class FeaturesConcatDataset(BaseConcatDataset):
             )
         kwargs["inplace"] = True
 
-    def fillna(self, *args, **kwargs):
+    def fillna(self, *args, **kwargs) -> None:
+        """Fill NA/NaN values in-place. See :meth:`pandas.DataFrame.fillna`."""
         FeaturesConcatDataset._enforce_inplace_operations("fillna", kwargs)
         for ds in self.datasets:
             ds.features.fillna(*args, **kwargs)
 
-    def replace(self, *args, **kwargs):
+    def replace(self, *args, **kwargs) -> None:
+        """Replace values in-place. See :meth:`pandas.DataFrame.replace`."""
         FeaturesConcatDataset._enforce_inplace_operations("replace", kwargs)
         for ds in self.datasets:
             ds.features.replace(*args, **kwargs)
 
-    def interpolate(self, *args, **kwargs):
+    def interpolate(self, *args, **kwargs) -> None:
+        """Interpolate values in-place. See :meth:`pandas.DataFrame.interpolate`."""
         FeaturesConcatDataset._enforce_inplace_operations("interpolate", kwargs)
         for ds in self.datasets:
             ds.features.interpolate(*args, **kwargs)
 
-    def dropna(self, *args, **kwargs):
+    def dropna(self, *args, **kwargs) -> None:
+        """Remove missing values in-place. See :meth:`pandas.DataFrame.dropna`."""
         FeaturesConcatDataset._enforce_inplace_operations("dropna", kwargs)
         for ds in self.datasets:
             ds.features.dropna(*args, **kwargs)
 
-    def drop(self, *args, **kwargs):
+    def drop(self, *args, **kwargs) -> None:
+        """Drop specified labels from rows or columns in-place. See :meth:`pandas.DataFrame.drop`."""
         FeaturesConcatDataset._enforce_inplace_operations("drop", kwargs)
         for ds in self.datasets:
             ds.features.drop(*args, **kwargs)
 
-    def join(self, concat_dataset: FeaturesConcatDataset, **kwargs):
+    def join(self, concat_dataset: FeaturesConcatDataset, **kwargs) -> None:
+        """Join columns with other FeaturesConcatDataset in-place.
+
+        Parameters
+        ----------
+        concat_dataset : FeaturesConcatDataset
+            The dataset to join with. Must have the same number of datasets,
+            and each corresponding dataset must have the same length.
+        **kwargs
+            Keyword arguments to pass to :meth:`pandas.DataFrame.join`.
+
+        """
         assert len(self.datasets) == len(concat_dataset.datasets)
         for ds1, ds2 in zip(self.datasets, concat_dataset.datasets):
             assert len(ds1) == len(ds2)
-            ds1.features.join(ds2, **kwargs)
+            ds1.features = ds1.features.join(ds2.features, **kwargs)