eegdash 0.4.0.dev173498563__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eegdash/__init__.py +1 -1
- eegdash/api.py +183 -88
- eegdash/bids_eeg_metadata.py +139 -39
- eegdash/const.py +25 -0
- eegdash/data_utils.py +333 -276
- eegdash/dataset/dataset.py +35 -13
- eegdash/dataset/dataset_summary.csv +255 -255
- eegdash/dataset/registry.py +69 -4
- eegdash/downloader.py +95 -9
- eegdash/features/datasets.py +325 -136
- eegdash/features/decorators.py +96 -3
- eegdash/features/extractors.py +212 -55
- eegdash/features/feature_bank/complexity.py +7 -3
- eegdash/features/feature_bank/dimensionality.py +1 -1
- eegdash/features/feature_bank/signal.py +11 -10
- eegdash/features/feature_bank/utils.py +8 -0
- eegdash/features/inspect.py +88 -5
- eegdash/features/serialization.py +51 -19
- eegdash/features/utils.py +80 -8
- eegdash/hbn/preprocessing.py +50 -17
- eegdash/hbn/windows.py +145 -32
- eegdash/logging.py +19 -0
- eegdash/mongodb.py +44 -27
- eegdash/paths.py +14 -5
- eegdash/utils.py +16 -1
- {eegdash-0.4.0.dev173498563.dist-info → eegdash-0.4.1.dist-info}/METADATA +6 -8
- eegdash-0.4.1.dist-info/RECORD +37 -0
- eegdash-0.4.0.dev173498563.dist-info/RECORD +0 -37
- {eegdash-0.4.0.dev173498563.dist-info → eegdash-0.4.1.dist-info}/WHEEL +0 -0
- {eegdash-0.4.0.dev173498563.dist-info → eegdash-0.4.1.dist-info}/licenses/LICENSE +0 -0
- {eegdash-0.4.0.dev173498563.dist-info → eegdash-0.4.1.dist-info}/top_level.txt +0 -0
eegdash/features/datasets.py
CHANGED
@@ -18,22 +18,41 @@ from braindecode.datasets.base import (
 
 from ..logging import logger
 
+__all__ = [
+    "FeaturesDataset",
+    "FeaturesConcatDataset",
+]
+
 
 class FeaturesDataset(EEGWindowsDataset):
-    """
+    """A dataset of features extracted from EEG windows.
 
-
-
-
+    This class holds features in a pandas DataFrame and provides an interface
+    compatible with braindecode's dataset structure. Each row in the feature
+    DataFrame corresponds to a single sample (e.g., an EEG window).
 
     Parameters
     ----------
-    features :
-
-
-
-
-
+    features : pandas.DataFrame
+        A DataFrame where each row is a sample and each column is a feature.
+    metadata : pandas.DataFrame, optional
+        A DataFrame containing metadata for each sample, indexed consistently
+        with `features`. Must include columns 'i_window_in_trial',
+        'i_start_in_trial', 'i_stop_in_trial', and 'target'.
+    description : dict or pandas.Series, optional
+        Additional high-level information about the dataset (e.g., subject ID).
+    transform : callable, optional
+        A function or transform to apply to the feature data on-the-fly.
+    raw_info : dict, optional
+        Information about the original raw recording, for provenance.
+    raw_preproc_kwargs : dict, optional
+        Keyword arguments used for preprocessing the raw data.
+    window_kwargs : dict, optional
+        Keyword arguments used for windowing the data.
+    window_preproc_kwargs : dict, optional
+        Keyword arguments used for preprocessing the windowed data.
+    features_kwargs : dict, optional
+        Keyword arguments used for feature extraction.
 
     """
 
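For illustration (not part of the diff), a `FeaturesDataset` could be assembled from a small feature table plus the four required metadata columns. The call below is a sketch based only on the parameters documented above; all values and column names are made up:

    import pandas as pd
    from eegdash.features.datasets import FeaturesDataset

    # Toy data: 3 windows, 2 features per window.
    features = pd.DataFrame(
        {"alpha_power": [1.0, 2.0, 3.0], "beta_power": [0.5, 0.4, 0.3]}
    )
    # Metadata must provide the four columns named in the docstring.
    metadata = pd.DataFrame(
        {
            "i_window_in_trial": [0, 1, 2],
            "i_start_in_trial": [0, 100, 200],
            "i_stop_in_trial": [100, 200, 300],
            "target": [0, 1, 0],
        }
    )
    ds = FeaturesDataset(features, metadata=metadata, description={"subject": "01"})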
@@ -65,7 +84,21 @@ class FeaturesDataset(EEGWindowsDataset):
         ].to_numpy()
         self.y = metadata.loc[:, "target"].to_list()
 
-    def __getitem__(self, index):
+    def __getitem__(self, index: int) -> tuple[np.ndarray, int, list]:
+        """Get a single sample from the dataset.
+
+        Parameters
+        ----------
+        index : int
+            The index of the sample to retrieve.
+
+        Returns
+        -------
+        tuple
+            A tuple containing the feature vector (X), the target (y), and the
+            cropping indices.
+
+        """
         crop_inds = self.crop_inds[index].tolist()
         X = self.features.iloc[index].to_numpy()
         X = X.copy()
@@ -75,18 +108,27 @@ class FeaturesDataset(EEGWindowsDataset):
         y = self.y[index]
         return X, y, crop_inds
 
-    def __len__(self):
+    def __len__(self) -> int:
+        """Return the number of samples in the dataset.
+
+        Returns
+        -------
+        int
+            The total number of feature samples.
+
+        """
         return len(self.features.index)
 
 
 def _compute_stats(
     ds: FeaturesDataset,
-    return_count=False,
-    return_mean=False,
-    return_var=False,
-    ddof=1,
-    numeric_only=False,
-):
+    return_count: bool = False,
+    return_mean: bool = False,
+    return_var: bool = False,
+    ddof: int = 1,
+    numeric_only: bool = False,
+) -> tuple:
+    """Compute statistics for a single FeaturesDataset."""
     res = []
     if return_count:
         res.append(ds.features.count(numeric_only=numeric_only))
@@ -97,7 +139,14 @@ def _compute_stats(
     return tuple(res)
 
 
-def _pooled_var(counts, means, variances, ddof, ddof_in=None):
+def _pooled_var(
+    counts: np.ndarray,
+    means: np.ndarray,
+    variances: np.ndarray,
+    ddof: int,
+    ddof_in: int | None = None,
+) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """Compute pooled variance across multiple datasets."""
     if ddof_in is None:
         ddof_in = ddof
     count = counts.sum(axis=0)
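The typed `_pooled_var` helper combines per-dataset counts, means, and variances into global statistics. For reference, a self-contained sketch of the standard pooled-variance formula it corresponds to (this mirrors the intent, not necessarily the exact library code), assuming arrays of shape (n_datasets, n_features):

    import numpy as np

    def pooled_var_sketch(counts, means, variances, ddof=1, ddof_in=1):
        # Total count and count-weighted mean across datasets.
        count = counts.sum(axis=0)
        mean = np.sum((counts / count) * means, axis=0)
        # Within-group sum of squares plus between-group correction.
        ssw = np.sum((counts - ddof_in) * variances, axis=0)
        ssb = np.sum(counts * (means - mean) ** 2, axis=0)
        var = (ssw + ssb) / (count - ddof)
        return count, mean, var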
@@ -110,17 +159,20 @@ def _pooled_var(counts, means, variances, ddof, ddof_in=None):
 
 
 class FeaturesConcatDataset(BaseConcatDataset):
-    """A
+    """A concatenated dataset of `FeaturesDataset` objects.
+
+    This class holds a list of :class:`FeaturesDataset` instances and allows
+    them to be treated as a single, larger dataset. It provides methods for
 
-
-
+    splitting, saving, and performing DataFrame-like operations (e.g., `mean`,
+    `var`, `fillna`) across all contained datasets.
 
     Parameters
     ----------
-    list_of_ds : list
-        list of
-    target_transform : callable
-
+    list_of_ds : list of FeaturesDataset
+        A list of :class:`FeaturesDataset` objects to concatenate.
+    target_transform : callable, optional
+        A function to apply to the target values before they are returned.
 
     """
 
@@ -140,26 +192,28 @@ class FeaturesConcatDataset(BaseConcatDataset):
         self,
         by: str | list[int] | list[list[int]] | dict[str, list[int]],
     ) -> dict[str, FeaturesConcatDataset]:
-        """Split the dataset
+        """Split the dataset into subsets.
 
-        The
+        The splitting can be done based on a column in the description
+        DataFrame or by providing explicit indices for each split.
 
         Parameters
         ----------
-        by : str
-            If
-
-            If
-
-
-
-
+        by : str or list or dict
+            - If a string, splits are created for each unique value in the
+              description column `by`.
+            - If a list of integers, a single split is created containing the
+              datasets at the specified indices.
+            - If a list of lists of integers, multiple splits are created, one
+              for each sublist of indices.
+            - If a dictionary, keys are used as split names and values are
+              lists of dataset indices.
 
         Returns
         -------
-
-            A dictionary
-
+        dict[str, FeaturesConcatDataset]
+            A dictionary where keys are split names and values are the new
+            :class:`FeaturesConcatDataset` subsets.
 
         """
         if isinstance(by, str):
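Usage of `split`, for reference; `concat_ds` is assumed to be a `FeaturesConcatDataset`, and the `"subject"` column and index values are illustrative:

    splits = concat_ds.split("subject")                       # one split per unique value
    first_two = concat_ds.split([0, 1])                       # single split: datasets 0 and 1
    folds = concat_ds.split({"train": [0, 1], "valid": [2]})  # named splits
    train_ds = folds["train"]                                 # each value is a FeaturesConcatDataset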
@@ -184,14 +238,21 @@ class FeaturesConcatDataset(BaseConcatDataset):
         }
 
     def get_metadata(self) -> pd.DataFrame:
-        """
+        """Get the metadata of all datasets as a single DataFrame.
+
+        Concatenates the metadata from all contained datasets and adds columns
+        from their `description` attributes.
 
         Returns
         -------
-
-            DataFrame containing
-
-
+        pandas.DataFrame
+            A DataFrame containing the metadata for every sample in the
+            concatenated dataset.
+
+        Raises
+        ------
+        TypeError
+            If any of the contained datasets is not a :class:`FeaturesDataset`.
 
         """
         if not all([isinstance(ds, FeaturesDataset) for ds in self.datasets]):
@@ -202,60 +263,59 @@ class FeaturesConcatDataset(BaseConcatDataset):
 
         all_dfs = list()
         for ds in self.datasets:
-            df = ds.metadata
+            df = ds.metadata.copy()
             for k, v in ds.description.items():
                 df[k] = v
             all_dfs.append(df)
 
         return pd.concat(all_dfs)
 
-    def save(self, path: str, overwrite: bool = False, offset: int = 0):
-        """Save
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            raw_preproc_kwargs.json (if raws were preprocessed)
-            window_kwargs.json (if this is a windowed dataset)
-            window_preproc_kwargs.json (if windows were preprocessed)
-            features_kwargs.json
+    def save(self, path: str, overwrite: bool = False, offset: int = 0) -> None:
+        """Save the concatenated dataset to a directory.
+
+        Creates a directory structure where each contained dataset is saved in
+        its own numbered subdirectory.
+
+        .. code-block::
+
+            path/
+                0/
+                    0-feat.parquet
+                    metadata_df.pkl
+                    description.json
+                    ...
+                1/
+                    1-feat.parquet
+                    ...
 
         Parameters
         ----------
         path : str
-
-
-
-
-
-
-
-
-
-
+            The directory where the dataset will be saved.
+        overwrite : bool, default False
+            If True, any existing subdirectories that conflict with the new
+            ones will be removed.
+        offset : int, default 0
+            An integer to add to the subdirectory names. Useful for saving
+            datasets in chunks.
+
+        Raises
+        ------
+        ValueError
+            If the dataset is empty.
+        FileExistsError
+            If a subdirectory already exists and `overwrite` is False.
 
         """
         if len(self.datasets) == 0:
             raise ValueError("Expect at least one dataset")
         path_contents = os.listdir(path)
-        n_sub_dirs = len([os.path.isdir(e) for e in path_contents])
+        n_sub_dirs = len([os.path.isdir(os.path.join(path, e)) for e in path_contents])
         for i_ds, ds in enumerate(self.datasets):
-
-            if
-            path_contents.remove(
-
-            sub_dir = os.path.join(path, str(i_ds + offset))
+            sub_dir_name = str(i_ds + offset)
+            if sub_dir_name in path_contents:
+                path_contents.remove(sub_dir_name)
+            sub_dir = os.path.join(path, sub_dir_name)
             if os.path.exists(sub_dir):
                 if overwrite:
                     shutil.rmtree(sub_dir)
@@ -265,35 +325,21 @@ class FeaturesConcatDataset(BaseConcatDataset):
                     f" a different directory, set overwrite=True, or "
                     f"resolve manually."
                 )
-            # save_dir/{i_ds+offset}/
             os.makedirs(sub_dir)
-            # save_dir/{i_ds+offset}/{i_ds+offset}-feat.parquet
             self._save_features(sub_dir, ds, i_ds, offset)
-            # save_dir/{i_ds+offset}/metadata_df.pkl
             self._save_metadata(sub_dir, ds)
-            # save_dir/{i_ds+offset}/description.json
             self._save_description(sub_dir, ds.description)
-            # save_dir/{i_ds+offset}/raw-info.fif
             self._save_raw_info(sub_dir, ds)
-            # save_dir/{i_ds+offset}/raw_preproc_kwargs.json
-            # save_dir/{i_ds+offset}/window_kwargs.json
-            # save_dir/{i_ds+offset}/window_preproc_kwargs.json
-            # save_dir/{i_ds+offset}/features_kwargs.json
             self._save_kwargs(sub_dir, ds)
-            if overwrite:
-
-
-
-
-
-
-
-
-                    f"datasets!",
-                    UserWarning,
-                )
-            # if path contains files or directories that were not touched, raise
-            # warning
+            if overwrite and i_ds + 1 + offset < n_sub_dirs:
+                logger.warning(
+                    f"The number of saved datasets ({i_ds + 1 + offset}) "
+                    f"does not match the number of existing "
+                    f"subdirectories ({n_sub_dirs}). You may now "
+                    f"encounter a mix of differently preprocessed "
+                    f"datasets!",
+                    UserWarning,
+                )
         if path_contents:
             logger.warning(
                 f"Chosen directory {path} contains other "
@@ -301,20 +347,37 @@ class FeaturesConcatDataset(BaseConcatDataset):
             )
 
     @staticmethod
-    def _save_features(sub_dir, ds, i_ds, offset):
+    def _save_features(sub_dir: str, ds: FeaturesDataset, i_ds: int, offset: int):
+        """Save the feature DataFrame to a Parquet file."""
         parquet_file_name = f"{i_ds + offset}-feat.parquet"
         parquet_file_path = os.path.join(sub_dir, parquet_file_name)
         ds.features.to_parquet(parquet_file_path)
 
     @staticmethod
-    def
-
+    def _save_metadata(sub_dir: str, ds: FeaturesDataset):
+        """Save the metadata DataFrame to a pickle file."""
+        metadata_file_name = "metadata_df.pkl"
+        metadata_file_path = os.path.join(sub_dir, metadata_file_name)
+        ds.metadata.to_pickle(metadata_file_path)
+
+    @staticmethod
+    def _save_description(sub_dir: str, description: pd.Series):
+        """Save the description Series to a JSON file."""
+        desc_file_name = "description.json"
+        desc_file_path = os.path.join(sub_dir, desc_file_name)
+        description.to_json(desc_file_path)
+
+    @staticmethod
+    def _save_raw_info(sub_dir: str, ds: FeaturesDataset):
+        """Save the raw info dictionary to a FIF file if it exists."""
+        if hasattr(ds, "raw_info") and ds.raw_info is not None:
             fif_file_name = "raw-info.fif"
             fif_file_path = os.path.join(sub_dir, fif_file_name)
-            ds.raw_info.save(fif_file_path)
+            ds.raw_info.save(fif_file_path, overwrite=True)
 
     @staticmethod
-    def _save_kwargs(sub_dir, ds):
+    def _save_kwargs(sub_dir: str, ds: FeaturesDataset):
+        """Save various keyword argument dictionaries to JSON files."""
         for kwargs_name in [
             "raw_preproc_kwargs",
             "window_kwargs",
@@ -322,10 +385,10 @@ class FeaturesConcatDataset(BaseConcatDataset):
             "features_kwargs",
         ]:
             if hasattr(ds, kwargs_name):
-                kwargs_file_name = ".".join([kwargs_name, "json"])
-                kwargs_file_path = os.path.join(sub_dir, kwargs_file_name)
                 kwargs = getattr(ds, kwargs_name)
                 if kwargs is not None:
+                    kwargs_file_name = ".".join([kwargs_name, "json"])
+                    kwargs_file_path = os.path.join(sub_dir, kwargs_file_name)
                     with open(kwargs_file_path, "w") as f:
                         json.dump(kwargs, f)
 
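Putting the `save` API together (the path is illustrative, `concat_ds` is assumed to be a `FeaturesConcatDataset`, and the diff does not show a corresponding loader here):

    import os

    out_dir = "./features_cache"          # illustrative location; save() expects it to exist
    os.makedirs(out_dir, exist_ok=True)
    concat_ds.save(out_dir, overwrite=True)
    # Per the docstring, this yields out_dir/0/0-feat.parquet, metadata_df.pkl,
    # description.json, plus raw-info.fif and *_kwargs.json when available.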
@@ -334,7 +397,25 @@ class FeaturesConcatDataset(BaseConcatDataset):
         include_metadata: bool | str | List[str] = False,
         include_target: bool = False,
         include_crop_inds: bool = False,
-    ):
+    ) -> pd.DataFrame:
+        """Convert the dataset to a single pandas DataFrame.
+
+        Parameters
+        ----------
+        include_metadata : bool or str or list of str, default False
+            If True, include all metadata columns. If a string or list of
+            strings, include only the specified metadata columns.
+        include_target : bool, default False
+            If True, include the 'target' column.
+        include_crop_inds : bool, default False
+            If True, include window cropping index columns.
+
+        Returns
+        -------
+        pandas.DataFrame
+            A DataFrame containing the features and requested metadata.
+
+        """
         if (
             not isinstance(include_metadata, bool)
             or include_metadata
@@ -343,7 +424,7 @@ class FeaturesConcatDataset(BaseConcatDataset):
         include_dataset = False
         if isinstance(include_metadata, bool) and include_metadata:
             include_dataset = True
-            cols = self.datasets[0].metadata.columns
+            cols = self.datasets[0].metadata.columns.tolist()
         else:
             cols = include_metadata
         if isinstance(cols, bool) and not cols:
@@ -352,13 +433,14 @@ class FeaturesConcatDataset(BaseConcatDataset):
             cols = [cols]
         cols = set(cols)
         if include_crop_inds:
-            cols
-
-
-
-
-
-
+            cols.update(
+                {
+                    "i_dataset",
+                    "i_window_in_trial",
+                    "i_start_in_trial",
+                    "i_stop_in_trial",
+                }
+            )
         if include_target:
             cols.add("target")
         cols = list(cols)
@@ -381,10 +463,26 @@ class FeaturesConcatDataset(BaseConcatDataset):
         dataframes = [ds.features for ds in self.datasets]
         return pd.concat(dataframes, axis=0, ignore_index=True)
 
-    def _numeric_columns(self):
+    def _numeric_columns(self) -> pd.Index:
+        """Get the names of numeric columns from the feature DataFrames."""
         return self.datasets[0].features.select_dtypes(include=np.number).columns
 
-    def count(self, numeric_only=False, n_jobs=1):
+    def count(self, numeric_only: bool = False, n_jobs: int = 1) -> pd.Series:
+        """Count non-NA cells for each feature column.
+
+        Parameters
+        ----------
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        n_jobs : int, default 1
+            Number of jobs to run in parallel.
+
+        Returns
+        -------
+        pandas.Series
+            The count of non-NA cells for each column.
+
+        """
         stats = Parallel(n_jobs)(
             delayed(_compute_stats)(ds, return_count=True, numeric_only=numeric_only)
             for ds in self.datasets
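A sketch of `to_dataframe` usage based on the parameters documented above, again assuming `concat_ds` is a `FeaturesConcatDataset`:

    # All features plus targets and window indices in one table.
    df = concat_ds.to_dataframe(include_target=True, include_crop_inds=True)
    # Restrict metadata to a subset of columns.
    df_sub = concat_ds.to_dataframe(include_metadata=["i_window_in_trial"])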
@@ -393,7 +491,22 @@ class FeaturesConcatDataset(BaseConcatDataset):
         count = counts.sum(axis=0)
         return pd.Series(count, index=self._numeric_columns())
 
-    def mean(self, numeric_only=False, n_jobs=1):
+    def mean(self, numeric_only: bool = False, n_jobs: int = 1) -> pd.Series:
+        """Compute the mean for each feature column.
+
+        Parameters
+        ----------
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        n_jobs : int, default 1
+            Number of jobs to run in parallel.
+
+        Returns
+        -------
+        pandas.Series
+            The mean of each column.
+
+        """
         stats = Parallel(n_jobs)(
             delayed(_compute_stats)(
                 ds, return_count=True, return_mean=True, numeric_only=numeric_only
@@ -405,7 +518,26 @@ class FeaturesConcatDataset(BaseConcatDataset):
         mean = np.sum((counts / count) * means, axis=0)
         return pd.Series(mean, index=self._numeric_columns())
 
-    def var(
+    def var(
+        self, ddof: int = 1, numeric_only: bool = False, n_jobs: int = 1
+    ) -> pd.Series:
+        """Compute the variance for each feature column.
+
+        Parameters
+        ----------
+        ddof : int, default 1
+            Delta Degrees of Freedom. The divisor used in calculations is N - ddof.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        n_jobs : int, default 1
+            Number of jobs to run in parallel.
+
+        Returns
+        -------
+        pandas.Series
+            The variance of each column.
+
+        """
         stats = Parallel(n_jobs)(
             delayed(_compute_stats)(
                 ds,
@@ -425,12 +557,50 @@ class FeaturesConcatDataset(BaseConcatDataset):
         _, _, var = _pooled_var(counts, means, variances, ddof, ddof_in=0)
         return pd.Series(var, index=self._numeric_columns())
 
-    def std(
+    def std(
+        self, ddof: int = 1, numeric_only: bool = False, eps: float = 0, n_jobs: int = 1
+    ) -> pd.Series:
+        """Compute the standard deviation for each feature column.
+
+        Parameters
+        ----------
+        ddof : int, default 1
+            Delta Degrees of Freedom.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        eps : float, default 0
+            A small epsilon value to add to the variance before taking the
+            square root to avoid numerical instability.
+        n_jobs : int, default 1
+            Number of jobs to run in parallel.
+
+        Returns
+        -------
+        pandas.Series
+            The standard deviation of each column.
+
+        """
         return np.sqrt(
             self.var(ddof=ddof, numeric_only=numeric_only, n_jobs=n_jobs) + eps
         )
 
-    def zscore(
+    def zscore(
+        self, ddof: int = 1, numeric_only: bool = False, eps: float = 0, n_jobs: int = 1
+    ) -> None:
+        """Apply z-score normalization to numeric columns in-place.
+
+        Parameters
+        ----------
+        ddof : int, default 1
+            Delta Degrees of Freedom for variance calculation.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        eps : float, default 0
+            Epsilon for numerical stability.
+        n_jobs : int, default 1
+            Number of jobs to run in parallel for statistics computation.
+
+        """
         stats = Parallel(n_jobs)(
             delayed(_compute_stats)(
                 ds,
@@ -450,10 +620,13 @@ class FeaturesConcatDataset(BaseConcatDataset):
         _, mean, var = _pooled_var(counts, means, variances, ddof, ddof_in=0)
         std = np.sqrt(var + eps)
         for ds in self.datasets:
-            ds.features
+            ds.features.loc[:, self._numeric_columns()] = (
+                ds.features.loc[:, self._numeric_columns()] - mean
+            ) / std
 
     @staticmethod
-    def _enforce_inplace_operations(func_name, kwargs):
+    def _enforce_inplace_operations(func_name: str, kwargs: dict):
+        """Raise an error if 'inplace=False' is passed to a method."""
         if "inplace" in kwargs and kwargs["inplace"] is False:
             raise ValueError(
                 f"{func_name} only works inplace, please change "
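The statistics helpers pool results across all contained datasets, optionally in parallel; typical calls might look like this (values are illustrative):

    counts = concat_ds.count(numeric_only=True)        # non-NA cells per feature
    mu = concat_ds.mean(numeric_only=True, n_jobs=4)    # pooled mean per feature
    sigma = concat_ds.std(ddof=1, eps=1e-12)            # pooled standard deviation
    concat_ds.zscore(eps=1e-12)                         # standardizes features in place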
@@ -461,33 +634,49 @@ class FeaturesConcatDataset(BaseConcatDataset):
             )
         kwargs["inplace"] = True
 
-    def fillna(self, *args, **kwargs):
+    def fillna(self, *args, **kwargs) -> None:
+        """Fill NA/NaN values in-place. See :meth:`pandas.DataFrame.fillna`."""
         FeaturesConcatDataset._enforce_inplace_operations("fillna", kwargs)
         for ds in self.datasets:
             ds.features.fillna(*args, **kwargs)
 
-    def replace(self, *args, **kwargs):
+    def replace(self, *args, **kwargs) -> None:
+        """Replace values in-place. See :meth:`pandas.DataFrame.replace`."""
         FeaturesConcatDataset._enforce_inplace_operations("replace", kwargs)
         for ds in self.datasets:
             ds.features.replace(*args, **kwargs)
 
-    def interpolate(self, *args, **kwargs):
+    def interpolate(self, *args, **kwargs) -> None:
+        """Interpolate values in-place. See :meth:`pandas.DataFrame.interpolate`."""
         FeaturesConcatDataset._enforce_inplace_operations("interpolate", kwargs)
         for ds in self.datasets:
             ds.features.interpolate(*args, **kwargs)
 
-    def dropna(self, *args, **kwargs):
+    def dropna(self, *args, **kwargs) -> None:
+        """Remove missing values in-place. See :meth:`pandas.DataFrame.dropna`."""
         FeaturesConcatDataset._enforce_inplace_operations("dropna", kwargs)
         for ds in self.datasets:
             ds.features.dropna(*args, **kwargs)
 
-    def drop(self, *args, **kwargs):
+    def drop(self, *args, **kwargs) -> None:
+        """Drop specified labels from rows or columns in-place. See :meth:`pandas.DataFrame.drop`."""
         FeaturesConcatDataset._enforce_inplace_operations("drop", kwargs)
         for ds in self.datasets:
             ds.features.drop(*args, **kwargs)
 
-    def join(self, concat_dataset: FeaturesConcatDataset, **kwargs):
+    def join(self, concat_dataset: FeaturesConcatDataset, **kwargs) -> None:
+        """Join columns with other FeaturesConcatDataset in-place.
+
+        Parameters
+        ----------
+        concat_dataset : FeaturesConcatDataset
+            The dataset to join with. Must have the same number of datasets,
+            and each corresponding dataset must have the same length.
+        **kwargs
+            Keyword arguments to pass to :meth:`pandas.DataFrame.join`.
+
+        """
         assert len(self.datasets) == len(concat_dataset.datasets)
         for ds1, ds2 in zip(self.datasets, concat_dataset.datasets):
             assert len(ds1) == len(ds2)
-            ds1.features.join(ds2, **kwargs)
+            ds1.features = ds1.features.join(ds2.features, **kwargs)