eegdash 0.4.0.dev173498563__py3-none-any.whl → 0.4.1.dev185__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eegdash/__init__.py +3 -3
- eegdash/api.py +143 -526
- eegdash/bids_eeg_metadata.py +139 -39
- eegdash/const.py +25 -0
- eegdash/dataset/__init__.py +8 -2
- eegdash/dataset/base.py +311 -0
- eegdash/dataset/bids_dataset.py +443 -0
- eegdash/dataset/dataset.py +542 -17
- eegdash/dataset/dataset_summary.csv +255 -255
- eegdash/dataset/registry.py +69 -4
- eegdash/downloader.py +95 -9
- eegdash/features/datasets.py +326 -136
- eegdash/features/decorators.py +96 -3
- eegdash/features/extractors.py +212 -55
- eegdash/features/feature_bank/complexity.py +7 -3
- eegdash/features/feature_bank/dimensionality.py +1 -1
- eegdash/features/feature_bank/signal.py +11 -10
- eegdash/features/feature_bank/utils.py +8 -0
- eegdash/features/inspect.py +97 -11
- eegdash/features/serialization.py +56 -19
- eegdash/features/utils.py +90 -16
- eegdash/hbn/preprocessing.py +50 -17
- eegdash/hbn/windows.py +145 -32
- eegdash/logging.py +19 -0
- eegdash/mongodb.py +44 -27
- eegdash/paths.py +15 -5
- eegdash/utils.py +16 -1
- {eegdash-0.4.0.dev173498563.dist-info → eegdash-0.4.1.dev185.dist-info}/METADATA +7 -8
- eegdash-0.4.1.dev185.dist-info/RECORD +38 -0
- eegdash/data_utils.py +0 -677
- eegdash-0.4.0.dev173498563.dist-info/RECORD +0 -37
- {eegdash-0.4.0.dev173498563.dist-info → eegdash-0.4.1.dev185.dist-info}/WHEEL +0 -0
- {eegdash-0.4.0.dev173498563.dist-info → eegdash-0.4.1.dev185.dist-info}/licenses/LICENSE +0 -0
- {eegdash-0.4.0.dev173498563.dist-info → eegdash-0.4.1.dev185.dist-info}/top_level.txt +0 -0
eegdash/features/datasets.py
CHANGED
@@ -18,22 +18,41 @@ from braindecode.datasets.base import (
 
 from ..logging import logger
 
+__all__ = [
+    "FeaturesDataset",
+    "FeaturesConcatDataset",
+]
+
 
 class FeaturesDataset(EEGWindowsDataset):
-    """
+    """A dataset of features extracted from EEG windows.
 
-
-
-
+    This class holds features in a pandas DataFrame and provides an interface
+    compatible with braindecode's dataset structure. Each row in the feature
+    DataFrame corresponds to a single sample (e.g., an EEG window).
 
     Parameters
     ----------
-    features :
-
-
-
-
-
+    features : pandas.DataFrame
+        A DataFrame where each row is a sample and each column is a feature.
+    metadata : pandas.DataFrame, optional
+        A DataFrame containing metadata for each sample, indexed consistently
+        with `features`. Must include columns 'i_window_in_trial',
+        'i_start_in_trial', 'i_stop_in_trial', and 'target'.
+    description : dict or pandas.Series, optional
+        Additional high-level information about the dataset (e.g., subject ID).
+    transform : callable, optional
+        A function or transform to apply to the feature data on-the-fly.
+    raw_info : dict, optional
+        Information about the original raw recording, for provenance.
+    raw_preproc_kwargs : dict, optional
+        Keyword arguments used for preprocessing the raw data.
+    window_kwargs : dict, optional
+        Keyword arguments used for windowing the data.
+    window_preproc_kwargs : dict, optional
+        Keyword arguments used for preprocessing the windowed data.
+    features_kwargs : dict, optional
+        Keyword arguments used for feature extraction.
 
     """
 
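The new class docstring pins down the constructor contract. For orientation, a minimal sketch of using it on synthetic data (feature names and values are invented; assumes the new wheel is installed):

```python
# Minimal sketch: building a FeaturesDataset from synthetic data.
# Assumes eegdash is installed; feature names and values are invented.
import numpy as np
import pandas as pd

from eegdash.features.datasets import FeaturesDataset

n_windows = 4
features = pd.DataFrame(
    {
        "alpha_power": np.random.rand(n_windows),
        "spectral_entropy": np.random.rand(n_windows),
    }
)
# The metadata columns the docstring documents as required:
metadata = pd.DataFrame(
    {
        "i_window_in_trial": np.arange(n_windows),
        "i_start_in_trial": np.arange(n_windows) * 100,
        "i_stop_in_trial": (np.arange(n_windows) + 1) * 100,
        "target": [0, 1, 0, 1],
    }
)

ds = FeaturesDataset(features, metadata=metadata, description={"subject": 1})
X, y, crop_inds = ds[0]  # feature vector, target, cropping indices
print(len(ds), X.shape, y, crop_inds)
```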
@@ -65,7 +84,21 @@ class FeaturesDataset(EEGWindowsDataset):
         ].to_numpy()
         self.y = metadata.loc[:, "target"].to_list()
 
-    def __getitem__(self, index):
+    def __getitem__(self, index: int) -> tuple[np.ndarray, int, list]:
+        """Get a single sample from the dataset.
+
+        Parameters
+        ----------
+        index : int
+            The index of the sample to retrieve.
+
+        Returns
+        -------
+        tuple
+            A tuple containing the feature vector (X), the target (y), and the
+            cropping indices.
+
+        """
         crop_inds = self.crop_inds[index].tolist()
         X = self.features.iloc[index].to_numpy()
         X = X.copy()
@@ -75,18 +108,27 @@ class FeaturesDataset(EEGWindowsDataset):
         y = self.y[index]
         return X, y, crop_inds
 
-    def __len__(self):
+    def __len__(self) -> int:
+        """Return the number of samples in the dataset.
+
+        Returns
+        -------
+        int
+            The total number of feature samples.
+
+        """
         return len(self.features.index)
 
 
 def _compute_stats(
     ds: FeaturesDataset,
-    return_count=False,
-    return_mean=False,
-    return_var=False,
-    ddof=1,
-    numeric_only=False,
-):
+    return_count: bool = False,
+    return_mean: bool = False,
+    return_var: bool = False,
+    ddof: int = 1,
+    numeric_only: bool = False,
+) -> tuple:
+    """Compute statistics for a single :class:`~eegdash.features.datasets.FeaturesDataset`."""
     res = []
     if return_count:
         res.append(ds.features.count(numeric_only=numeric_only))
@@ -97,7 +139,14 @@ def _compute_stats(
     return tuple(res)
 
 
-def _pooled_var(counts, means, variances, ddof, ddof_in=None):
+def _pooled_var(
+    counts: np.ndarray,
+    means: np.ndarray,
+    variances: np.ndarray,
+    ddof: int,
+    ddof_in: int | None = None,
+) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """Compute pooled variance across multiple datasets."""
    if ddof_in is None:
        ddof_in = ddof
    count = counts.sum(axis=0)
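`_pooled_var` combines per-dataset counts, means, and variances into global statistics. As a sanity check of the identity it relies on, here is a standalone illustration (my own, not eegdash code): ddof=0 within-chunk variances plus the between-chunk spread recover the variance of the concatenation.

```python
# Standalone check of the pooled-variance identity (not eegdash code):
# total sum of squares = within-chunk SS + between-chunk SS.
import numpy as np

chunks = [np.random.randn(50), np.random.randn(80) + 1.0]
counts = np.array([len(c) for c in chunks])
means = np.array([c.mean() for c in chunks])
variances = np.array([c.var(ddof=0) for c in chunks])  # ddof_in = 0

n = counts.sum()
mean = (counts / n) @ means
ddof = 1
var = ((counts * variances).sum() + (counts * (means - mean) ** 2).sum()) / (n - ddof)

assert np.isclose(var, np.concatenate(chunks).var(ddof=1))
```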
@@ -110,17 +159,20 @@ def _pooled_var(counts, means, variances, ddof, ddof_in=None):
 
 
 class FeaturesConcatDataset(BaseConcatDataset):
-    """A
+    """A concatenated dataset of :class:`~eegdash.features.datasets.FeaturesDataset` objects.
+
+    This class holds a list of :class:`~eegdash.features.datasets.FeaturesDataset` instances and allows
+    them to be treated as a single, larger dataset. It provides methods for
 
-
-
+    splitting, saving, and performing DataFrame-like operations (e.g., `mean`,
+    `var`, `fillna`) across all contained datasets.
 
     Parameters
     ----------
-    list_of_ds : list
-        list of
-    target_transform : callable
-
+    list_of_ds : list of ~eegdash.features.datasets.FeaturesDataset
+        A list of :class:`~eegdash.features.datasets.FeaturesDataset` objects to concatenate.
+    target_transform : callable, optional
+        A function to apply to the target values before they are returned.
 
     """
 
@@ -140,26 +192,28 @@ class FeaturesConcatDataset(BaseConcatDataset):
         self,
         by: str | list[int] | list[list[int]] | dict[str, list[int]],
     ) -> dict[str, FeaturesConcatDataset]:
-        """Split the dataset
+        """Split the dataset into subsets.
 
-        The
+        The splitting can be done based on a column in the description
+        DataFrame or by providing explicit indices for each split.
 
         Parameters
         ----------
-        by : str
-            If
-
-            If
-
-
-
+        by : str or list or dict
+            - If a string, splits are created for each unique value in the
+              description column `by`.
+            - If a list of integers, a single split is created containing the
+              datasets at the specified indices.
+            - If a list of lists of integers, multiple splits are created, one
+              for each sublist of indices.
+            - If a dictionary, keys are used as split names and values are
+              lists of dataset indices.
 
         Returns
         -------
-
-            A dictionary
-
+        dict[str, ~eegdash.features.datasets.FeaturesConcatDataset]
+            A dictionary where keys are split names and values are the new
+            :class:`~eegdash.features.datasets.FeaturesConcatDataset` subsets.
 
         """
         if isinstance(by, str):
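A sketch of the documented `split` modes, building a tiny concatenated dataset from synthetic per-subject datasets (assumes eegdash is installed; all names and values invented):

```python
# Sketch of the documented split modes (synthetic data, invented names).
import numpy as np
import pandas as pd

from eegdash.features.datasets import FeaturesConcatDataset, FeaturesDataset

def make_ds(subject: int) -> FeaturesDataset:
    features = pd.DataFrame({"alpha_power": np.random.rand(3)})
    metadata = pd.DataFrame(
        {
            "i_window_in_trial": range(3),
            "i_start_in_trial": [0, 100, 200],
            "i_stop_in_trial": [100, 200, 300],
            "target": [0, 1, 0],
        }
    )
    return FeaturesDataset(features, metadata=metadata, description={"subject": subject})

concat_ds = FeaturesConcatDataset([make_ds(s) for s in (1, 1, 2)])

by_subject = concat_ds.split("subject")  # one split per unique subject value
named = concat_ds.split({"train": [0, 1], "valid": [2]})  # named index groups
print(list(by_subject), {k: len(v.datasets) for k, v in named.items()})
```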
@@ -184,14 +238,22 @@ class FeaturesConcatDataset(BaseConcatDataset):
         }
 
     def get_metadata(self) -> pd.DataFrame:
-        """
+        """Get the metadata of all datasets as a single DataFrame.
+
+        Concatenates the metadata from all contained datasets and adds columns
+        from their `description` attributes.
 
         Returns
         -------
-
-            DataFrame containing
-
-
+        pandas.DataFrame
+            A DataFrame containing the metadata for every sample in the
+            concatenated dataset.
+
+        Raises
+        ------
+        TypeError
+            If any of the contained datasets is not a
+            :class:`~eegdash.features.datasets.FeaturesDataset`.
 
         """
         if not all([isinstance(ds, FeaturesDataset) for ds in self.datasets]):
@@ -202,60 +264,59 @@ class FeaturesConcatDataset(BaseConcatDataset):
 
         all_dfs = list()
         for ds in self.datasets:
-            df = ds.metadata
+            df = ds.metadata.copy()
             for k, v in ds.description.items():
                 df[k] = v
             all_dfs.append(df)
 
         return pd.concat(all_dfs)
 
-    def save(self, path: str, overwrite: bool = False, offset: int = 0):
-        """Save
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            raw_preproc_kwargs.json (if raws were preprocessed)
-            window_kwargs.json (if this is a windowed dataset)
-            window_preproc_kwargs.json (if windows were preprocessed)
-            features_kwargs.json
+    def save(self, path: str, overwrite: bool = False, offset: int = 0) -> None:
+        """Save the concatenated dataset to a directory.
+
+        Creates a directory structure where each contained dataset is saved in
+        its own numbered subdirectory.
+
+        .. code-block::
+
+            path/
+                0/
+                    0-feat.parquet
+                    metadata_df.pkl
+                    description.json
+                    ...
+                1/
+                    1-feat.parquet
+                    ...
 
         Parameters
         ----------
         path : str
-
-
-
-
-
-
-
-
-
-
+            The directory where the dataset will be saved.
+        overwrite : bool, default False
+            If True, any existing subdirectories that conflict with the new
+            ones will be removed.
+        offset : int, default 0
+            An integer to add to the subdirectory names. Useful for saving
+            datasets in chunks.
+
+        Raises
+        ------
+        ValueError
+            If the dataset is empty.
+        FileExistsError
+            If a subdirectory already exists and `overwrite` is False.
 
         """
         if len(self.datasets) == 0:
             raise ValueError("Expect at least one dataset")
         path_contents = os.listdir(path)
-        n_sub_dirs = len([os.path.isdir(e) for e in path_contents])
+        n_sub_dirs = len([os.path.isdir(os.path.join(path, e)) for e in path_contents])
         for i_ds, ds in enumerate(self.datasets):
-
-            if
-            path_contents.remove(
-
-            sub_dir = os.path.join(path, str(i_ds + offset))
+            sub_dir_name = str(i_ds + offset)
+            if sub_dir_name in path_contents:
+                path_contents.remove(sub_dir_name)
+            sub_dir = os.path.join(path, sub_dir_name)
             if os.path.exists(sub_dir):
                 if overwrite:
                     shutil.rmtree(sub_dir)
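The documented `offset` parameter enables chunked saving into one store. A sketch, continuing with `concat_ds` from the split example above ("features_store" is an invented path; note the target directory must already exist, since `save` lists it):

```python
# Sketch of chunked saving with `offset`, continuing the earlier sketch.
import os

os.makedirs("features_store", exist_ok=True)
parts = concat_ds.split({"first": [0, 1], "second": [2]})
parts["first"].save("features_store")  # writes 0/ and 1/
# offset continues the numbering, so the second chunk lands in 2/
parts["second"].save("features_store", offset=len(parts["first"].datasets))
```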
@@ -265,35 +326,21 @@ class FeaturesConcatDataset(BaseConcatDataset):
                         f" a different directory, set overwrite=True, or "
                         f"resolve manually."
                     )
-            # save_dir/{i_ds+offset}/
             os.makedirs(sub_dir)
-            # save_dir/{i_ds+offset}/{i_ds+offset}-feat.parquet
             self._save_features(sub_dir, ds, i_ds, offset)
-            # save_dir/{i_ds+offset}/metadata_df.pkl
             self._save_metadata(sub_dir, ds)
-            # save_dir/{i_ds+offset}/description.json
             self._save_description(sub_dir, ds.description)
-            # save_dir/{i_ds+offset}/raw-info.fif
             self._save_raw_info(sub_dir, ds)
-            # save_dir/{i_ds+offset}/raw_preproc_kwargs.json
-            # save_dir/{i_ds+offset}/window_kwargs.json
-            # save_dir/{i_ds+offset}/window_preproc_kwargs.json
-            # save_dir/{i_ds+offset}/features_kwargs.json
             self._save_kwargs(sub_dir, ds)
-        if overwrite:
-
-
-
-
-
-
-
-
-                f"datasets!",
-                UserWarning,
-            )
-        # if path contains files or directories that were not touched, raise
-        # warning
+        if overwrite and i_ds + 1 + offset < n_sub_dirs:
+            logger.warning(
+                f"The number of saved datasets ({i_ds + 1 + offset}) "
+                f"does not match the number of existing "
+                f"subdirectories ({n_sub_dirs}). You may now "
+                f"encounter a mix of differently preprocessed "
+                f"datasets!",
+                UserWarning,
+            )
         if path_contents:
             logger.warning(
                 f"Chosen directory {path} contains other "
@@ -301,20 +348,37 @@ class FeaturesConcatDataset(BaseConcatDataset):
             )
 
     @staticmethod
-    def _save_features(sub_dir, ds, i_ds, offset):
+    def _save_features(sub_dir: str, ds: FeaturesDataset, i_ds: int, offset: int):
+        """Save the feature DataFrame to a Parquet file."""
         parquet_file_name = f"{i_ds + offset}-feat.parquet"
         parquet_file_path = os.path.join(sub_dir, parquet_file_name)
         ds.features.to_parquet(parquet_file_path)
 
     @staticmethod
-    def
-
+    def _save_metadata(sub_dir: str, ds: FeaturesDataset):
+        """Save the metadata DataFrame to a pickle file."""
+        metadata_file_name = "metadata_df.pkl"
+        metadata_file_path = os.path.join(sub_dir, metadata_file_name)
+        ds.metadata.to_pickle(metadata_file_path)
+
+    @staticmethod
+    def _save_description(sub_dir: str, description: pd.Series):
+        """Save the description Series to a JSON file."""
+        desc_file_name = "description.json"
+        desc_file_path = os.path.join(sub_dir, desc_file_name)
+        description.to_json(desc_file_path)
+
+    @staticmethod
+    def _save_raw_info(sub_dir: str, ds: FeaturesDataset):
+        """Save the raw info dictionary to a FIF file if it exists."""
+        if hasattr(ds, "raw_info") and ds.raw_info is not None:
             fif_file_name = "raw-info.fif"
             fif_file_path = os.path.join(sub_dir, fif_file_name)
-            ds.raw_info.save(fif_file_path)
+            ds.raw_info.save(fif_file_path, overwrite=True)
 
     @staticmethod
-    def _save_kwargs(sub_dir, ds):
+    def _save_kwargs(sub_dir: str, ds: FeaturesDataset):
+        """Save various keyword argument dictionaries to JSON files."""
         for kwargs_name in [
             "raw_preproc_kwargs",
             "window_kwargs",
@@ -322,10 +386,10 @@ class FeaturesConcatDataset(BaseConcatDataset):
             "features_kwargs",
         ]:
             if hasattr(ds, kwargs_name):
-                kwargs_file_name = ".".join([kwargs_name, "json"])
-                kwargs_file_path = os.path.join(sub_dir, kwargs_file_name)
                 kwargs = getattr(ds, kwargs_name)
                 if kwargs is not None:
+                    kwargs_file_name = ".".join([kwargs_name, "json"])
+                    kwargs_file_path = os.path.join(sub_dir, kwargs_file_name)
                     with open(kwargs_file_path, "w") as f:
                         json.dump(kwargs, f)
 
@@ -334,7 +398,25 @@ class FeaturesConcatDataset(BaseConcatDataset):
         include_metadata: bool | str | List[str] = False,
         include_target: bool = False,
         include_crop_inds: bool = False,
-    ):
+    ) -> pd.DataFrame:
+        """Convert the dataset to a single pandas DataFrame.
+
+        Parameters
+        ----------
+        include_metadata : bool or str or list of str, default False
+            If True, include all metadata columns. If a string or list of
+            strings, include only the specified metadata columns.
+        include_target : bool, default False
+            If True, include the 'target' column.
+        include_crop_inds : bool, default False
+            If True, include window cropping index columns.
+
+        Returns
+        -------
+        pandas.DataFrame
+            A DataFrame containing the features and requested metadata.
+
+        """
         if (
             not isinstance(include_metadata, bool)
             or include_metadata
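Continuing the same sketch, the documented `to_dataframe` flags flatten everything into one frame:

```python
# Continuing the sketch: features plus targets and window indices in one frame.
df = concat_ds.to_dataframe(include_target=True, include_crop_inds=True)
print(df.columns.tolist())
# expected: feature columns plus "target", "i_dataset",
# "i_window_in_trial", "i_start_in_trial", "i_stop_in_trial"
```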
@@ -343,7 +425,7 @@ class FeaturesConcatDataset(BaseConcatDataset):
             include_dataset = False
             if isinstance(include_metadata, bool) and include_metadata:
                 include_dataset = True
-                cols = self.datasets[0].metadata.columns
+                cols = self.datasets[0].metadata.columns.tolist()
             else:
                 cols = include_metadata
             if isinstance(cols, bool) and not cols:
@@ -352,13 +434,14 @@ class FeaturesConcatDataset(BaseConcatDataset):
                 cols = [cols]
             cols = set(cols)
             if include_crop_inds:
-                cols
-
-
-
-
-
-
+                cols.update(
+                    {
+                        "i_dataset",
+                        "i_window_in_trial",
+                        "i_start_in_trial",
+                        "i_stop_in_trial",
+                    }
+                )
             if include_target:
                 cols.add("target")
             cols = list(cols)
@@ -381,10 +464,26 @@ class FeaturesConcatDataset(BaseConcatDataset):
         dataframes = [ds.features for ds in self.datasets]
         return pd.concat(dataframes, axis=0, ignore_index=True)
 
-    def _numeric_columns(self):
+    def _numeric_columns(self) -> pd.Index:
+        """Get the names of numeric columns from the feature DataFrames."""
         return self.datasets[0].features.select_dtypes(include=np.number).columns
 
-    def count(self, numeric_only=False, n_jobs=1):
+    def count(self, numeric_only: bool = False, n_jobs: int = 1) -> pd.Series:
+        """Count non-NA cells for each feature column.
+
+        Parameters
+        ----------
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        n_jobs : int, default 1
+            Number of jobs to run in parallel.
+
+        Returns
+        -------
+        pandas.Series
+            The count of non-NA cells for each column.
+
+        """
         stats = Parallel(n_jobs)(
             delayed(_compute_stats)(ds, return_count=True, numeric_only=numeric_only)
             for ds in self.datasets
@@ -393,7 +492,22 @@ class FeaturesConcatDataset(BaseConcatDataset):
         count = counts.sum(axis=0)
         return pd.Series(count, index=self._numeric_columns())
 
-    def mean(self, numeric_only=False, n_jobs=1):
+    def mean(self, numeric_only: bool = False, n_jobs: int = 1) -> pd.Series:
+        """Compute the mean for each feature column.
+
+        Parameters
+        ----------
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        n_jobs : int, default 1
+            Number of jobs to run in parallel.
+
+        Returns
+        -------
+        pandas.Series
+            The mean of each column.
+
+        """
         stats = Parallel(n_jobs)(
             delayed(_compute_stats)(
                 ds, return_count=True, return_mean=True, numeric_only=numeric_only
@@ -405,7 +519,26 @@ class FeaturesConcatDataset(BaseConcatDataset):
         mean = np.sum((counts / count) * means, axis=0)
         return pd.Series(mean, index=self._numeric_columns())
 
-    def var(
+    def var(
+        self, ddof: int = 1, numeric_only: bool = False, n_jobs: int = 1
+    ) -> pd.Series:
+        """Compute the variance for each feature column.
+
+        Parameters
+        ----------
+        ddof : int, default 1
+            Delta Degrees of Freedom. The divisor used in calculations is N - ddof.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        n_jobs : int, default 1
+            Number of jobs to run in parallel.
+
+        Returns
+        -------
+        pandas.Series
+            The variance of each column.
+
+        """
         stats = Parallel(n_jobs)(
             delayed(_compute_stats)(
                 ds,
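The statistics methods pool per-dataset partial results (via `_compute_stats` and `_pooled_var`), and `n_jobs > 1` distributes the per-dataset passes through joblib. Continuing the sketch:

```python
# Continuing the sketch: pooled statistics across all contained datasets.
n = concat_ds.count(numeric_only=True)
mu = concat_ds.mean(numeric_only=True)
s2 = concat_ds.var(ddof=1, numeric_only=True, n_jobs=2)
print(n["alpha_power"], mu["alpha_power"], s2["alpha_power"])
```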
@@ -425,12 +558,50 @@ class FeaturesConcatDataset(BaseConcatDataset):
         _, _, var = _pooled_var(counts, means, variances, ddof, ddof_in=0)
         return pd.Series(var, index=self._numeric_columns())
 
-    def std(
+    def std(
+        self, ddof: int = 1, numeric_only: bool = False, eps: float = 0, n_jobs: int = 1
+    ) -> pd.Series:
+        """Compute the standard deviation for each feature column.
+
+        Parameters
+        ----------
+        ddof : int, default 1
+            Delta Degrees of Freedom.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        eps : float, default 0
+            A small epsilon value to add to the variance before taking the
+            square root to avoid numerical instability.
+        n_jobs : int, default 1
+            Number of jobs to run in parallel.
+
+        Returns
+        -------
+        pandas.Series
+            The standard deviation of each column.
+
+        """
         return np.sqrt(
             self.var(ddof=ddof, numeric_only=numeric_only, n_jobs=n_jobs) + eps
         )
 
-    def zscore(
+    def zscore(
+        self, ddof: int = 1, numeric_only: bool = False, eps: float = 0, n_jobs: int = 1
+    ) -> None:
+        """Apply z-score normalization to numeric columns in-place.
+
+        Parameters
+        ----------
+        ddof : int, default 1
+            Delta Degrees of Freedom for variance calculation.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        eps : float, default 0
+            Epsilon for numerical stability.
+        n_jobs : int, default 1
+            Number of jobs to run in parallel for statistics computation.
+
+        """
         stats = Parallel(n_jobs)(
             delayed(_compute_stats)(
                 ds,
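`zscore` is the in-place counterpart built on the same pooled statistics. Continuing the sketch:

```python
# Continuing the sketch: normalize every numeric feature column in-place;
# eps guards against zero-variance columns.
concat_ds.zscore(eps=1e-12)
print(concat_ds.mean(numeric_only=True).abs().max())  # ~0 after normalization
```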
@@ -450,10 +621,13 @@ class FeaturesConcatDataset(BaseConcatDataset):
         _, mean, var = _pooled_var(counts, means, variances, ddof, ddof_in=0)
         std = np.sqrt(var + eps)
         for ds in self.datasets:
-            ds.features
+            ds.features.loc[:, self._numeric_columns()] = (
+                ds.features.loc[:, self._numeric_columns()] - mean
+            ) / std
 
     @staticmethod
-    def _enforce_inplace_operations(func_name, kwargs):
+    def _enforce_inplace_operations(func_name: str, kwargs: dict):
+        """Raise an error if 'inplace=False' is passed to a method."""
        if "inplace" in kwargs and kwargs["inplace"] is False:
            raise ValueError(
                f"{func_name} only works inplace, please change "
@@ -461,33 +635,49 @@ class FeaturesConcatDataset(BaseConcatDataset):
             )
         kwargs["inplace"] = True
 
-    def fillna(self, *args, **kwargs):
+    def fillna(self, *args, **kwargs) -> None:
+        """Fill NA/NaN values in-place. See :meth:`pandas.DataFrame.fillna`."""
         FeaturesConcatDataset._enforce_inplace_operations("fillna", kwargs)
         for ds in self.datasets:
             ds.features.fillna(*args, **kwargs)
 
-    def replace(self, *args, **kwargs):
+    def replace(self, *args, **kwargs) -> None:
+        """Replace values in-place. See :meth:`pandas.DataFrame.replace`."""
         FeaturesConcatDataset._enforce_inplace_operations("replace", kwargs)
         for ds in self.datasets:
             ds.features.replace(*args, **kwargs)
 
-    def interpolate(self, *args, **kwargs):
+    def interpolate(self, *args, **kwargs) -> None:
+        """Interpolate values in-place. See :meth:`pandas.DataFrame.interpolate`."""
         FeaturesConcatDataset._enforce_inplace_operations("interpolate", kwargs)
         for ds in self.datasets:
             ds.features.interpolate(*args, **kwargs)
 
-    def dropna(self, *args, **kwargs):
+    def dropna(self, *args, **kwargs) -> None:
+        """Remove missing values in-place. See :meth:`pandas.DataFrame.dropna`."""
         FeaturesConcatDataset._enforce_inplace_operations("dropna", kwargs)
         for ds in self.datasets:
             ds.features.dropna(*args, **kwargs)
 
-    def drop(self, *args, **kwargs):
+    def drop(self, *args, **kwargs) -> None:
+        """Drop specified labels from rows or columns in-place. See :meth:`pandas.DataFrame.drop`."""
         FeaturesConcatDataset._enforce_inplace_operations("drop", kwargs)
         for ds in self.datasets:
             ds.features.drop(*args, **kwargs)
 
-    def join(self, concat_dataset: FeaturesConcatDataset, **kwargs):
+    def join(self, concat_dataset: FeaturesConcatDataset, **kwargs) -> None:
+        """Join columns with other FeaturesConcatDataset in-place.
+
+        Parameters
+        ----------
+        concat_dataset : FeaturesConcatDataset
+            The dataset to join with. Must have the same number of datasets,
+            and each corresponding dataset must have the same length.
+        **kwargs
+            Keyword arguments to pass to :meth:`pandas.DataFrame.join`.
+
+        """
         assert len(self.datasets) == len(concat_dataset.datasets)
         for ds1, ds2 in zip(self.datasets, concat_dataset.datasets):
             assert len(ds1) == len(ds2)
-            ds1.features.join(ds2, **kwargs)
+            ds1.features = ds1.features.join(ds2.features, **kwargs)
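The final hunk fixes a real bug: the old code joined against the dataset object rather than its `features` DataFrame and discarded the result, even though `pandas.DataFrame.join` returns a new frame instead of mutating in place. A minimal pandas-only illustration of why the assignment matters:

```python
# Why the join fix matters: DataFrame.join returns a new frame,
# so the result must be assigned back.
import pandas as pd

a = pd.DataFrame({"f1": [1, 2]})
b = pd.DataFrame({"f2": [3, 4]})

a.join(b)                  # result discarded: a is unchanged
a = a.join(b)              # the fixed pattern: assign the joined frame
print(a.columns.tolist())  # ['f1', 'f2']
```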