eegdash 0.3.9.dev182388821__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
Potentially problematic release. This version of eegdash might be problematic.
- eegdash/__init__.py +12 -1
- eegdash/api.py +297 -295
- eegdash/bids_eeg_metadata.py +297 -56
- eegdash/const.py +43 -0
- eegdash/data_utils.py +327 -430
- eegdash/dataset/__init__.py +19 -1
- eegdash/dataset/dataset.py +61 -33
- eegdash/dataset/dataset_summary.csv +255 -256
- eegdash/dataset/registry.py +163 -11
- eegdash/downloader.py +197 -0
- eegdash/features/datasets.py +323 -138
- eegdash/features/decorators.py +88 -3
- eegdash/features/extractors.py +203 -55
- eegdash/features/feature_bank/complexity.py +7 -3
- eegdash/features/feature_bank/dimensionality.py +1 -1
- eegdash/features/inspect.py +80 -5
- eegdash/features/serialization.py +49 -17
- eegdash/features/utils.py +75 -8
- eegdash/hbn/__init__.py +11 -0
- eegdash/hbn/preprocessing.py +61 -19
- eegdash/hbn/windows.py +157 -34
- eegdash/logging.py +54 -0
- eegdash/mongodb.py +55 -24
- eegdash/paths.py +28 -5
- eegdash/utils.py +29 -1
- {eegdash-0.3.9.dev182388821.dist-info → eegdash-0.4.0.dist-info}/METADATA +11 -59
- eegdash-0.4.0.dist-info/RECORD +37 -0
- eegdash-0.3.9.dev182388821.dist-info/RECORD +0 -35
- {eegdash-0.3.9.dev182388821.dist-info → eegdash-0.4.0.dist-info}/WHEEL +0 -0
- {eegdash-0.3.9.dev182388821.dist-info → eegdash-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {eegdash-0.3.9.dev182388821.dist-info → eegdash-0.4.0.dist-info}/top_level.txt +0 -0
eegdash/features/datasets.py
CHANGED
@@ -3,7 +3,6 @@ from __future__ import annotations
 import json
 import os
 import shutil
-import warnings
 from collections.abc import Callable
 from typing import Dict, List
 
@@ -17,22 +16,38 @@ from braindecode.datasets.base import (
     _create_description,
 )
 
+from ..logging import logger
+
 
 class FeaturesDataset(EEGWindowsDataset):
-    """
+    """A dataset of features extracted from EEG windows.
 
-
-
-
+    This class holds features in a pandas DataFrame and provides an interface
+    compatible with braindecode's dataset structure. Each row in the feature
+    DataFrame corresponds to a single sample (e.g., an EEG window).
 
     Parameters
     ----------
-    features :
-
-
-
-
-
+    features : pandas.DataFrame
+        A DataFrame where each row is a sample and each column is a feature.
+    metadata : pandas.DataFrame, optional
+        A DataFrame containing metadata for each sample, indexed consistently
+        with `features`. Must include columns 'i_window_in_trial',
+        'i_start_in_trial', 'i_stop_in_trial', and 'target'.
+    description : dict or pandas.Series, optional
+        Additional high-level information about the dataset (e.g., subject ID).
+    transform : callable, optional
+        A function or transform to apply to the feature data on-the-fly.
+    raw_info : dict, optional
+        Information about the original raw recording, for provenance.
+    raw_preproc_kwargs : dict, optional
+        Keyword arguments used for preprocessing the raw data.
+    window_kwargs : dict, optional
+        Keyword arguments used for windowing the data.
+    window_preproc_kwargs : dict, optional
+        Keyword arguments used for preprocessing the windowed data.
+    features_kwargs : dict, optional
+        Keyword arguments used for feature extraction.
 
     """
 
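For orientation, a minimal sketch of constructing the class against the docstring above. The required metadata columns are taken from the diff; the feature names, values, and description content are invented:

    import pandas as pd

    from eegdash.features.datasets import FeaturesDataset

    # Two windows, two features; metadata columns as required by the docstring.
    features = pd.DataFrame({"alpha_power": [0.52, 0.48], "sample_entropy": [1.21, 1.07]})
    metadata = pd.DataFrame(
        {
            "i_window_in_trial": [0, 1],
            "i_start_in_trial": [0, 256],
            "i_stop_in_trial": [256, 512],
            "target": [0, 1],
        }
    )
    ds = FeaturesDataset(features, metadata=metadata, description={"subject": "01"})
    X, y, crop_inds = ds[0]  # see __getitem__ below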
@@ -64,7 +79,21 @@ class FeaturesDataset(EEGWindowsDataset):
         ].to_numpy()
         self.y = metadata.loc[:, "target"].to_list()
 
-    def __getitem__(self, index):
+    def __getitem__(self, index: int) -> tuple[np.ndarray, int, list]:
+        """Get a single sample from the dataset.
+
+        Parameters
+        ----------
+        index : int
+            The index of the sample to retrieve.
+
+        Returns
+        -------
+        tuple
+            A tuple containing the feature vector (X), the target (y), and the
+            cropping indices.
+
+        """
         crop_inds = self.crop_inds[index].tolist()
         X = self.features.iloc[index].to_numpy()
         X = X.copy()
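Because `__getitem__` returns plain `(X, y, crop_inds)` tuples, a `FeaturesDataset` should be usable with a standard PyTorch DataLoader; a sketch reusing `ds` from the example above (the default collate behavior for `crop_inds` is an assumption, not something this diff shows):

    from torch.utils.data import DataLoader

    loader = DataLoader(ds, batch_size=2)
    for X, y, crop_inds in loader:
        pass  # X: feature vectors, y: targets, crop_inds: per-window indices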
@@ -74,18 +103,27 @@ class FeaturesDataset(EEGWindowsDataset):
         y = self.y[index]
         return X, y, crop_inds
 
-    def __len__(self):
+    def __len__(self) -> int:
+        """Return the number of samples in the dataset.
+
+        Returns
+        -------
+        int
+            The total number of feature samples.
+
+        """
         return len(self.features.index)
 
 
 def _compute_stats(
     ds: FeaturesDataset,
-    return_count=False,
-    return_mean=False,
-    return_var=False,
-    ddof=1,
-    numeric_only=False,
-):
+    return_count: bool = False,
+    return_mean: bool = False,
+    return_var: bool = False,
+    ddof: int = 1,
+    numeric_only: bool = False,
+) -> tuple:
+    """Compute statistics for a single FeaturesDataset."""
     res = []
     if return_count:
         res.append(ds.features.count(numeric_only=numeric_only))
@@ -96,7 +134,14 @@ def _compute_stats(
     return tuple(res)
 
 
-def _pooled_var(counts, means, variances, ddof, ddof_in=None):
+def _pooled_var(
+    counts: np.ndarray,
+    means: np.ndarray,
+    variances: np.ndarray,
+    ddof: int,
+    ddof_in: int | None = None,
+) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """Compute pooled variance across multiple datasets."""
     if ddof_in is None:
         ddof_in = ddof
     count = counts.sum(axis=0)
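The pooled statistics used by `mean` and `var` below combine per-dataset counts, means, and variances via the standard identity: total sum of squares = within-chunk sum of squares + the squared shift of each chunk mean from the grand mean. A self-contained numpy check of that identity (written independently of `_pooled_var`, whose exact ddof handling may differ):

    import numpy as np

    rng = np.random.default_rng(0)
    chunks = [rng.normal(size=(50, 3)), rng.normal(size=(80, 3))]  # two "datasets"

    counts = np.array([[c.shape[0]] * c.shape[1] for c in chunks])
    means = np.array([c.mean(axis=0) for c in chunks])
    variances = np.array([c.var(axis=0, ddof=1) for c in chunks])

    count = counts.sum(axis=0)
    mean = np.sum((counts / count) * means, axis=0)  # grand mean, as in mean() below
    # total SS = within-chunk SS + between-chunk shift of each chunk mean
    ss = np.sum((counts - 1) * variances + counts * (means - mean) ** 2, axis=0)
    var = ss / (count - 1)

    assert np.allclose(var, np.concatenate(chunks).var(axis=0, ddof=1))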
@@ -109,17 +154,20 @@ def _pooled_var(counts, means, variances, ddof, ddof_in=None):
 
 
 class FeaturesConcatDataset(BaseConcatDataset):
-    """A
+    """A concatenated dataset of `FeaturesDataset` objects.
+
+    This class holds a list of :class:`FeaturesDataset` instances and allows
+    them to be treated as a single, larger dataset. It provides methods for
 
-
-
+    splitting, saving, and performing DataFrame-like operations (e.g., `mean`,
+    `var`, `fillna`) across all contained datasets.
 
     Parameters
     ----------
-    list_of_ds : list
-        list of
-    target_transform : callable
-
+    list_of_ds : list of FeaturesDataset
+        A list of :class:`FeaturesDataset` objects to concatenate.
+    target_transform : callable, optional
+        A function to apply to the target values before they are returned.
 
     """
 
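A sketch of building the concatenated view, reusing `ds` from the FeaturesDataset example above (a real list would hold one dataset per recording or subject):

    concat_ds = FeaturesConcatDataset([ds])
    print(len(concat_ds))  # total number of feature windows across datasets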
@@ -139,26 +187,28 @@ class FeaturesConcatDataset(BaseConcatDataset):
         self,
         by: str | list[int] | list[list[int]] | dict[str, list[int]],
     ) -> dict[str, FeaturesConcatDataset]:
-        """Split the dataset
+        """Split the dataset into subsets.
 
-        The
+        The splitting can be done based on a column in the description
+        DataFrame or by providing explicit indices for each split.
 
         Parameters
         ----------
-        by : str
-            If
-
-            If
-
-
-
-
+        by : str or list or dict
+            - If a string, splits are created for each unique value in the
+              description column `by`.
+            - If a list of integers, a single split is created containing the
+              datasets at the specified indices.
+            - If a list of lists of integers, multiple splits are created, one
+              for each sublist of indices.
+            - If a dictionary, keys are used as split names and values are
+              lists of dataset indices.
 
         Returns
         -------
-
-            A dictionary
-
+        dict[str, FeaturesConcatDataset]
+            A dictionary where keys are split names and values are the new
+            :class:`FeaturesConcatDataset` subsets.
 
         """
         if isinstance(by, str):
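A sketch of the documented call forms, assuming `concat_ds` from above has a 'subject' column in its description; the "0"/"1" key naming for index-based splits follows braindecode's convention and is an assumption here:

    by_subject = concat_ds.split("subject")      # one split per unique value
    first_two = concat_ds.split([0, 1])          # single split
    folds = concat_ds.split([[0, 1], [2, 3]])    # two splits
    named = concat_ds.split({"train": [0, 1], "valid": [2]})
    train_ds = named["train"]                    # a FeaturesConcatDataset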
@@ -183,14 +233,21 @@ class FeaturesConcatDataset(BaseConcatDataset):
         }
 
     def get_metadata(self) -> pd.DataFrame:
-        """
+        """Get the metadata of all datasets as a single DataFrame.
+
+        Concatenates the metadata from all contained datasets and adds columns
+        from their `description` attributes.
 
         Returns
         -------
-
-            DataFrame containing
-
-
+        pandas.DataFrame
+            A DataFrame containing the metadata for every sample in the
+            concatenated dataset.
+
+        Raises
+        ------
+        TypeError
+            If any of the contained datasets is not a :class:`FeaturesDataset`.
 
         """
         if not all([isinstance(ds, FeaturesDataset) for ds in self.datasets]):
@@ -201,60 +258,59 @@ class FeaturesConcatDataset(BaseConcatDataset):
 
         all_dfs = list()
         for ds in self.datasets:
-            df = ds.metadata
+            df = ds.metadata.copy()
             for k, v in ds.description.items():
                 df[k] = v
             all_dfs.append(df)
 
         return pd.concat(all_dfs)
 
-    def save(self, path: str, overwrite: bool = False, offset: int = 0):
-        """Save
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            raw_preproc_kwargs.json (if raws were preprocessed)
-            window_kwargs.json (if this is a windowed dataset)
-            window_preproc_kwargs.json (if windows were preprocessed)
-            features_kwargs.json
+    def save(self, path: str, overwrite: bool = False, offset: int = 0) -> None:
+        """Save the concatenated dataset to a directory.
+
+        Creates a directory structure where each contained dataset is saved in
+        its own numbered subdirectory.
+
+        .. code-block::
+
+            path/
+                0/
+                    0-feat.parquet
+                    metadata_df.pkl
+                    description.json
+                    ...
+                1/
+                    1-feat.parquet
+                    ...
 
         Parameters
         ----------
         path : str
-
-
-
-
-
-
-
-
-
-
+            The directory where the dataset will be saved.
+        overwrite : bool, default False
+            If True, any existing subdirectories that conflict with the new
+            ones will be removed.
+        offset : int, default 0
+            An integer to add to the subdirectory names. Useful for saving
+            datasets in chunks.
+
+        Raises
+        ------
+        ValueError
+            If the dataset is empty.
+        FileExistsError
+            If a subdirectory already exists and `overwrite` is False.
 
         """
         if len(self.datasets) == 0:
             raise ValueError("Expect at least one dataset")
         path_contents = os.listdir(path)
-        n_sub_dirs = len([os.path.isdir(e) for e in path_contents])
+        n_sub_dirs = len([os.path.isdir(os.path.join(path, e)) for e in path_contents])
         for i_ds, ds in enumerate(self.datasets):
-
-            if
-                path_contents.remove(
-
-            sub_dir = os.path.join(path, str(i_ds + offset))
+            sub_dir_name = str(i_ds + offset)
+            if sub_dir_name in path_contents:
+                path_contents.remove(sub_dir_name)
+            sub_dir = os.path.join(path, sub_dir_name)
             if os.path.exists(sub_dir):
                 if overwrite:
                     shutil.rmtree(sub_dir)
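A sketch of chunked saving with `offset`, matching the directory layout in the docstring above (paths and split sizes invented; the target directory must already exist, since `save` lists it before writing):

    import os

    os.makedirs("features_out", exist_ok=True)
    parts = concat_ds.split([[0, 1], [2, 3]])
    parts["0"].save("features_out")            # writes features_out/0/, features_out/1/
    parts["1"].save("features_out", offset=2)  # writes features_out/2/, features_out/3/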
@@ -264,56 +320,59 @@ class FeaturesConcatDataset(BaseConcatDataset):
                         f" a different directory, set overwrite=True, or "
                         f"resolve manually."
                     )
-            # save_dir/{i_ds+offset}/
             os.makedirs(sub_dir)
-            # save_dir/{i_ds+offset}/{i_ds+offset}-feat.parquet
             self._save_features(sub_dir, ds, i_ds, offset)
-            # save_dir/{i_ds+offset}/metadata_df.pkl
             self._save_metadata(sub_dir, ds)
-            # save_dir/{i_ds+offset}/description.json
             self._save_description(sub_dir, ds.description)
-            # save_dir/{i_ds+offset}/raw-info.fif
             self._save_raw_info(sub_dir, ds)
-            # save_dir/{i_ds+offset}/raw_preproc_kwargs.json
-            # save_dir/{i_ds+offset}/window_kwargs.json
-            # save_dir/{i_ds+offset}/window_preproc_kwargs.json
-            # save_dir/{i_ds+offset}/features_kwargs.json
             self._save_kwargs(sub_dir, ds)
-        if overwrite:
-
-
-
-
-
-
-
-
-                f"datasets!",
-                UserWarning,
-            )
-        # if path contains files or directories that were not touched, raise
-        # warning
+        if overwrite and i_ds + 1 + offset < n_sub_dirs:
+            logger.warning(
+                f"The number of saved datasets ({i_ds + 1 + offset}) "
+                f"does not match the number of existing "
+                f"subdirectories ({n_sub_dirs}). You may now "
+                f"encounter a mix of differently preprocessed "
+                f"datasets!",
+                UserWarning,
+            )
         if path_contents:
-
+            logger.warning(
                 f"Chosen directory {path} contains other "
                 f"subdirectories or files {path_contents}."
            )
 
     @staticmethod
-    def _save_features(sub_dir, ds, i_ds, offset):
+    def _save_features(sub_dir: str, ds: FeaturesDataset, i_ds: int, offset: int):
+        """Save the feature DataFrame to a Parquet file."""
         parquet_file_name = f"{i_ds + offset}-feat.parquet"
         parquet_file_path = os.path.join(sub_dir, parquet_file_name)
         ds.features.to_parquet(parquet_file_path)
 
     @staticmethod
-    def
-
+    def _save_metadata(sub_dir: str, ds: FeaturesDataset):
+        """Save the metadata DataFrame to a pickle file."""
+        metadata_file_name = "metadata_df.pkl"
+        metadata_file_path = os.path.join(sub_dir, metadata_file_name)
+        ds.metadata.to_pickle(metadata_file_path)
+
+    @staticmethod
+    def _save_description(sub_dir: str, description: pd.Series):
+        """Save the description Series to a JSON file."""
+        desc_file_name = "description.json"
+        desc_file_path = os.path.join(sub_dir, desc_file_name)
+        description.to_json(desc_file_path)
+
+    @staticmethod
+    def _save_raw_info(sub_dir: str, ds: FeaturesDataset):
+        """Save the raw info dictionary to a FIF file if it exists."""
+        if hasattr(ds, "raw_info") and ds.raw_info is not None:
             fif_file_name = "raw-info.fif"
             fif_file_path = os.path.join(sub_dir, fif_file_name)
-            ds.raw_info.save(fif_file_path)
+            ds.raw_info.save(fif_file_path, overwrite=True)
 
     @staticmethod
-    def _save_kwargs(sub_dir, ds):
+    def _save_kwargs(sub_dir: str, ds: FeaturesDataset):
+        """Save various keyword argument dictionaries to JSON files."""
         for kwargs_name in [
             "raw_preproc_kwargs",
             "window_kwargs",
@@ -321,10 +380,10 @@ class FeaturesConcatDataset(BaseConcatDataset):
             "features_kwargs",
         ]:
             if hasattr(ds, kwargs_name):
-                kwargs_file_name = ".".join([kwargs_name, "json"])
-                kwargs_file_path = os.path.join(sub_dir, kwargs_file_name)
                 kwargs = getattr(ds, kwargs_name)
                 if kwargs is not None:
+                    kwargs_file_name = ".".join([kwargs_name, "json"])
+                    kwargs_file_path = os.path.join(sub_dir, kwargs_file_name)
                     with open(kwargs_file_path, "w") as f:
                         json.dump(kwargs, f)
 
@@ -333,7 +392,25 @@ class FeaturesConcatDataset(BaseConcatDataset):
         include_metadata: bool | str | List[str] = False,
         include_target: bool = False,
         include_crop_inds: bool = False,
-    ):
+    ) -> pd.DataFrame:
+        """Convert the dataset to a single pandas DataFrame.
+
+        Parameters
+        ----------
+        include_metadata : bool or str or list of str, default False
+            If True, include all metadata columns. If a string or list of
+            strings, include only the specified metadata columns.
+        include_target : bool, default False
+            If True, include the 'target' column.
+        include_crop_inds : bool, default False
+            If True, include window cropping index columns.
+
+        Returns
+        -------
+        pandas.DataFrame
+            A DataFrame containing the features and requested metadata.
+
+        """
         if (
             not isinstance(include_metadata, bool)
             or include_metadata
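A sketch of the documented call forms:

    df = concat_ds.to_dataframe(include_metadata=["subject"], include_target=True)
    df_full = concat_ds.to_dataframe(include_crop_inds=True)
    # df_full additionally carries the i_dataset, i_window_in_trial,
    # i_start_in_trial, and i_stop_in_trial columns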
@@ -342,7 +419,7 @@ class FeaturesConcatDataset(BaseConcatDataset):
         include_dataset = False
         if isinstance(include_metadata, bool) and include_metadata:
             include_dataset = True
-            cols = self.datasets[0].metadata.columns
+            cols = self.datasets[0].metadata.columns.tolist()
         else:
             cols = include_metadata
         if isinstance(cols, bool) and not cols:
@@ -351,13 +428,14 @@ class FeaturesConcatDataset(BaseConcatDataset):
             cols = [cols]
         cols = set(cols)
         if include_crop_inds:
-            cols
-
-
-
-
-
-
+            cols.update(
+                {
+                    "i_dataset",
+                    "i_window_in_trial",
+                    "i_start_in_trial",
+                    "i_stop_in_trial",
+                }
+            )
         if include_target:
             cols.add("target")
         cols = list(cols)
@@ -380,10 +458,26 @@ class FeaturesConcatDataset(BaseConcatDataset):
         dataframes = [ds.features for ds in self.datasets]
         return pd.concat(dataframes, axis=0, ignore_index=True)
 
-    def _numeric_columns(self):
+    def _numeric_columns(self) -> pd.Index:
+        """Get the names of numeric columns from the feature DataFrames."""
         return self.datasets[0].features.select_dtypes(include=np.number).columns
 
-    def count(self, numeric_only=False, n_jobs=1):
+    def count(self, numeric_only: bool = False, n_jobs: int = 1) -> pd.Series:
+        """Count non-NA cells for each feature column.
+
+        Parameters
+        ----------
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        n_jobs : int, default 1
+            Number of jobs to run in parallel.
+
+        Returns
+        -------
+        pandas.Series
+            The count of non-NA cells for each column.
+
+        """
         stats = Parallel(n_jobs)(
             delayed(_compute_stats)(ds, return_count=True, numeric_only=numeric_only)
             for ds in self.datasets
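The statistics helpers all follow the same pattern: per-dataset results computed in parallel with joblib, then pooled. A usage sketch:

    n = concat_ds.count(numeric_only=True)  # per-column non-NA counts
    mu = concat_ds.mean(n_jobs=4)           # pooled mean across all datasets
    sd = concat_ds.std(eps=1e-12)           # sqrt(pooled var + eps)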
@@ -392,7 +486,22 @@ class FeaturesConcatDataset(BaseConcatDataset):
         count = counts.sum(axis=0)
         return pd.Series(count, index=self._numeric_columns())
 
-    def mean(self, numeric_only=False, n_jobs=1):
+    def mean(self, numeric_only: bool = False, n_jobs: int = 1) -> pd.Series:
+        """Compute the mean for each feature column.
+
+        Parameters
+        ----------
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        n_jobs : int, default 1
+            Number of jobs to run in parallel.
+
+        Returns
+        -------
+        pandas.Series
+            The mean of each column.
+
+        """
         stats = Parallel(n_jobs)(
             delayed(_compute_stats)(
                 ds, return_count=True, return_mean=True, numeric_only=numeric_only
@@ -404,7 +513,26 @@ class FeaturesConcatDataset(BaseConcatDataset):
         mean = np.sum((counts / count) * means, axis=0)
         return pd.Series(mean, index=self._numeric_columns())
 
-    def var(
+    def var(
+        self, ddof: int = 1, numeric_only: bool = False, n_jobs: int = 1
+    ) -> pd.Series:
+        """Compute the variance for each feature column.
+
+        Parameters
+        ----------
+        ddof : int, default 1
+            Delta Degrees of Freedom. The divisor used in calculations is N - ddof.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        n_jobs : int, default 1
+            Number of jobs to run in parallel.
+
+        Returns
+        -------
+        pandas.Series
+            The variance of each column.
+
+        """
         stats = Parallel(n_jobs)(
             delayed(_compute_stats)(
                 ds,
@@ -424,12 +552,50 @@ class FeaturesConcatDataset(BaseConcatDataset):
         _, _, var = _pooled_var(counts, means, variances, ddof, ddof_in=0)
         return pd.Series(var, index=self._numeric_columns())
 
-    def std(
+    def std(
+        self, ddof: int = 1, numeric_only: bool = False, eps: float = 0, n_jobs: int = 1
+    ) -> pd.Series:
+        """Compute the standard deviation for each feature column.
+
+        Parameters
+        ----------
+        ddof : int, default 1
+            Delta Degrees of Freedom.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        eps : float, default 0
+            A small epsilon value to add to the variance before taking the
+            square root to avoid numerical instability.
+        n_jobs : int, default 1
+            Number of jobs to run in parallel.
+
+        Returns
+        -------
+        pandas.Series
+            The standard deviation of each column.
+
+        """
         return np.sqrt(
             self.var(ddof=ddof, numeric_only=numeric_only, n_jobs=n_jobs) + eps
         )
 
-    def zscore(
+    def zscore(
+        self, ddof: int = 1, numeric_only: bool = False, eps: float = 0, n_jobs: int = 1
+    ) -> None:
+        """Apply z-score normalization to numeric columns in-place.
+
+        Parameters
+        ----------
+        ddof : int, default 1
+            Delta Degrees of Freedom for variance calculation.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        eps : float, default 0
+            Epsilon for numerical stability.
+        n_jobs : int, default 1
+            Number of jobs to run in parallel for statistics computation.
+
+        """
         stats = Parallel(n_jobs)(
             delayed(_compute_stats)(
                 ds,
@@ -449,10 +615,13 @@ class FeaturesConcatDataset(BaseConcatDataset):
         _, mean, var = _pooled_var(counts, means, variances, ddof, ddof_in=0)
         std = np.sqrt(var + eps)
         for ds in self.datasets:
-            ds.features
+            ds.features.loc[:, self._numeric_columns()] = (
+                ds.features.loc[:, self._numeric_columns()] - mean
+            ) / std
 
     @staticmethod
-    def _enforce_inplace_operations(func_name, kwargs):
+    def _enforce_inplace_operations(func_name: str, kwargs: dict):
+        """Raise an error if 'inplace=False' is passed to a method."""
         if "inplace" in kwargs and kwargs["inplace"] is False:
             raise ValueError(
                 f"{func_name} only works inplace, please change "
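Note that `zscore` pools the statistics over the whole concatenation rather than normalizing each dataset separately, so all contained DataFrames end up on a common scale. A usage sketch:

    concat_ds.zscore(eps=1e-8, n_jobs=2)  # mutates every ds.features in place
    print(concat_ds.mean().round(6))      # pooled means are now ~0
    print(concat_ds.std().round(6))       # pooled stds are ~1 (up to eps)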
@@ -460,33 +629,49 @@ class FeaturesConcatDataset(BaseConcatDataset):
             )
         kwargs["inplace"] = True
 
-    def fillna(self, *args, **kwargs):
+    def fillna(self, *args, **kwargs) -> None:
+        """Fill NA/NaN values in-place. See :meth:`pandas.DataFrame.fillna`."""
         FeaturesConcatDataset._enforce_inplace_operations("fillna", kwargs)
         for ds in self.datasets:
             ds.features.fillna(*args, **kwargs)
 
-    def replace(self, *args, **kwargs):
+    def replace(self, *args, **kwargs) -> None:
+        """Replace values in-place. See :meth:`pandas.DataFrame.replace`."""
         FeaturesConcatDataset._enforce_inplace_operations("replace", kwargs)
         for ds in self.datasets:
             ds.features.replace(*args, **kwargs)
 
-    def interpolate(self, *args, **kwargs):
+    def interpolate(self, *args, **kwargs) -> None:
+        """Interpolate values in-place. See :meth:`pandas.DataFrame.interpolate`."""
         FeaturesConcatDataset._enforce_inplace_operations("interpolate", kwargs)
         for ds in self.datasets:
             ds.features.interpolate(*args, **kwargs)
 
-    def dropna(self, *args, **kwargs):
+    def dropna(self, *args, **kwargs) -> None:
+        """Remove missing values in-place. See :meth:`pandas.DataFrame.dropna`."""
         FeaturesConcatDataset._enforce_inplace_operations("dropna", kwargs)
         for ds in self.datasets:
             ds.features.dropna(*args, **kwargs)
 
-    def drop(self, *args, **kwargs):
+    def drop(self, *args, **kwargs) -> None:
+        """Drop specified labels from rows or columns in-place. See :meth:`pandas.DataFrame.drop`."""
         FeaturesConcatDataset._enforce_inplace_operations("drop", kwargs)
         for ds in self.datasets:
             ds.features.drop(*args, **kwargs)
 
-    def join(self, concat_dataset: FeaturesConcatDataset, **kwargs):
+    def join(self, concat_dataset: FeaturesConcatDataset, **kwargs) -> None:
+        """Join columns with other FeaturesConcatDataset in-place.
+
+        Parameters
+        ----------
+        concat_dataset : FeaturesConcatDataset
+            The dataset to join with. Must have the same number of datasets,
+            and each corresponding dataset must have the same length.
+        **kwargs
+            Keyword arguments to pass to :meth:`pandas.DataFrame.join`.
+
+        """
         assert len(self.datasets) == len(concat_dataset.datasets)
         for ds1, ds2 in zip(self.datasets, concat_dataset.datasets):
             assert len(ds1) == len(ds2)
-            ds1.features.join(ds2, **kwargs)
+            ds1.features = ds1.features.join(ds2.features, **kwargs)