eegdash 0.4.0.dev153__py3-none-any.whl → 0.4.0.dev162__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eegdash/__init__.py +1 -1
- eegdash/api.py +180 -86
- eegdash/bids_eeg_metadata.py +139 -39
- eegdash/const.py +25 -0
- eegdash/data_utils.py +239 -173
- eegdash/dataset/dataset.py +35 -13
- eegdash/dataset/dataset_summary.csv +1 -1
- eegdash/dataset/registry.py +69 -4
- eegdash/downloader.py +95 -9
- eegdash/features/datasets.py +320 -136
- eegdash/features/decorators.py +88 -3
- eegdash/features/extractors.py +201 -55
- eegdash/features/inspect.py +78 -5
- eegdash/features/serialization.py +45 -19
- eegdash/features/utils.py +75 -8
- eegdash/hbn/preprocessing.py +50 -17
- eegdash/hbn/windows.py +145 -32
- eegdash/logging.py +19 -0
- eegdash/mongodb.py +44 -27
- eegdash/paths.py +14 -5
- eegdash/utils.py +16 -1
- {eegdash-0.4.0.dev153.dist-info → eegdash-0.4.0.dev162.dist-info}/METADATA +1 -1
- eegdash-0.4.0.dev162.dist-info/RECORD +37 -0
- eegdash-0.4.0.dev153.dist-info/RECORD +0 -37
- {eegdash-0.4.0.dev153.dist-info → eegdash-0.4.0.dev162.dist-info}/WHEEL +0 -0
- {eegdash-0.4.0.dev153.dist-info → eegdash-0.4.0.dev162.dist-info}/licenses/LICENSE +0 -0
- {eegdash-0.4.0.dev153.dist-info → eegdash-0.4.0.dev162.dist-info}/top_level.txt +0 -0
eegdash/features/datasets.py
CHANGED
@@ -20,20 +20,34 @@ from ..logging import logger
 
 
 class FeaturesDataset(EEGWindowsDataset):
-    """
+    """A dataset of features extracted from EEG windows.
 
-
-
-
+    This class holds features in a pandas DataFrame and provides an interface
+    compatible with braindecode's dataset structure. Each row in the feature
+    DataFrame corresponds to a single sample (e.g., an EEG window).
 
     Parameters
    ----------
-    features :
-
-
-
-
-
+    features : pandas.DataFrame
+        A DataFrame where each row is a sample and each column is a feature.
+    metadata : pandas.DataFrame, optional
+        A DataFrame containing metadata for each sample, indexed consistently
+        with `features`. Must include columns 'i_window_in_trial',
+        'i_start_in_trial', 'i_stop_in_trial', and 'target'.
+    description : dict or pandas.Series, optional
+        Additional high-level information about the dataset (e.g., subject ID).
+    transform : callable, optional
+        A function or transform to apply to the feature data on-the-fly.
+    raw_info : dict, optional
+        Information about the original raw recording, for provenance.
+    raw_preproc_kwargs : dict, optional
+        Keyword arguments used for preprocessing the raw data.
+    window_kwargs : dict, optional
+        Keyword arguments used for windowing the data.
+    window_preproc_kwargs : dict, optional
+        Keyword arguments used for preprocessing the windowed data.
+    features_kwargs : dict, optional
+        Keyword arguments used for feature extraction.
 
     """
 
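For orientation, a minimal sketch of the interface documented above; it assumes the constructor accepts these documented parameters, and the feature/column names are purely illustrative:

    import numpy as np
    import pandas as pd
    from eegdash.features.datasets import FeaturesDataset

    # Three windows, two hypothetical feature columns.
    features = pd.DataFrame(
        np.random.randn(3, 2), columns=["alpha_power", "beta_power"]
    )
    metadata = pd.DataFrame(
        {
            "i_window_in_trial": [0, 1, 2],
            "i_start_in_trial": [0, 256, 512],
            "i_stop_in_trial": [256, 512, 768],
            "target": [0, 1, 0],
        }
    )
    ds = FeaturesDataset(features, metadata=metadata, description={"subject": "01"})
    X, y, crop_inds = ds[0]  # feature vector, target, cropping indices
    print(len(ds), X.shape, y, crop_inds)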
@@ -65,7 +79,21 @@ class FeaturesDataset(EEGWindowsDataset):
         ].to_numpy()
         self.y = metadata.loc[:, "target"].to_list()
 
-    def __getitem__(self, index):
+    def __getitem__(self, index: int) -> tuple[np.ndarray, int, list]:
+        """Get a single sample from the dataset.
+
+        Parameters
+        ----------
+        index : int
+            The index of the sample to retrieve.
+
+        Returns
+        -------
+        tuple
+            A tuple containing the feature vector (X), the target (y), and the
+            cropping indices.
+
+        """
         crop_inds = self.crop_inds[index].tolist()
         X = self.features.iloc[index].to_numpy()
         X = X.copy()
@@ -75,18 +103,27 @@ class FeaturesDataset(EEGWindowsDataset):
         y = self.y[index]
         return X, y, crop_inds
 
-    def __len__(self):
+    def __len__(self) -> int:
+        """Return the number of samples in the dataset.
+
+        Returns
+        -------
+        int
+            The total number of feature samples.
+
+        """
         return len(self.features.index)
 
 
 def _compute_stats(
     ds: FeaturesDataset,
-    return_count=False,
-    return_mean=False,
-    return_var=False,
-    ddof=1,
-    numeric_only=False,
-):
+    return_count: bool = False,
+    return_mean: bool = False,
+    return_var: bool = False,
+    ddof: int = 1,
+    numeric_only: bool = False,
+) -> tuple:
+    """Compute statistics for a single FeaturesDataset."""
     res = []
     if return_count:
         res.append(ds.features.count(numeric_only=numeric_only))
@@ -97,7 +134,14 @@ def _compute_stats(
     return tuple(res)
 
 
-def _pooled_var(counts, means, variances, ddof, ddof_in=None):
+def _pooled_var(
+    counts: np.ndarray,
+    means: np.ndarray,
+    variances: np.ndarray,
+    ddof: int,
+    ddof_in: int | None = None,
+) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """Compute pooled variance across multiple datasets."""
     if ddof_in is None:
         ddof_in = ddof
     count = counts.sum(axis=0)
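The pooling helper combines per-dataset counts, means, and variances so concatenation-wide statistics never need all features in memory at once. A self-contained numpy illustration of that identity (not the library's private helper; ddof=0 for simplicity):

    import numpy as np

    rng = np.random.default_rng(0)
    parts = [rng.normal(size=50), rng.normal(size=80)]

    counts = np.array([p.size for p in parts], dtype=float)
    means = np.array([p.mean() for p in parts])
    variances = np.array([p.var(ddof=0) for p in parts])

    total = counts.sum()
    pooled_mean = (counts * means).sum() / total
    # Law of total variance: within-group variance plus between-group spread.
    pooled_var = (counts * (variances + (means - pooled_mean) ** 2)).sum() / total

    full = np.concatenate(parts)
    assert np.isclose(pooled_mean, full.mean())
    assert np.isclose(pooled_var, full.var(ddof=0))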
@@ -110,17 +154,20 @@ def _pooled_var(counts, means, variances, ddof, ddof_in=None):
 
 
 class FeaturesConcatDataset(BaseConcatDataset):
-    """A
+    """A concatenated dataset of `FeaturesDataset` objects.
 
-
-    a
+    This class holds a list of :class:`FeaturesDataset` instances and allows
+    them to be treated as a single, larger dataset. It provides methods for
+
+    splitting, saving, and performing DataFrame-like operations (e.g., `mean`,
+    `var`, `fillna`) across all contained datasets.
 
     Parameters
     ----------
-    list_of_ds : list
-        list of
-    target_transform : callable
-
+    list_of_ds : list of FeaturesDataset
+        A list of :class:`FeaturesDataset` objects to concatenate.
+    target_transform : callable, optional
+        A function to apply to the target values before they are returned.
 
     """
 
@@ -140,26 +187,28 @@ class FeaturesConcatDataset(BaseConcatDataset):
         self,
         by: str | list[int] | list[list[int]] | dict[str, list[int]],
     ) -> dict[str, FeaturesConcatDataset]:
-        """Split the dataset
+        """Split the dataset into subsets.
 
-        The
+        The splitting can be done based on a column in the description
+        DataFrame or by providing explicit indices for each split.
 
         Parameters
         ----------
-        by : str
-            If
-
-            If
-
-
-
-
+        by : str or list or dict
+            - If a string, splits are created for each unique value in the
+              description column `by`.
+            - If a list of integers, a single split is created containing the
+              datasets at the specified indices.
+            - If a list of lists of integers, multiple splits are created, one
+              for each sublist of indices.
+            - If a dictionary, keys are used as split names and values are
+              lists of dataset indices.
 
         Returns
         -------
-
-            A dictionary
-
+        dict[str, FeaturesConcatDataset]
+            A dictionary where keys are split names and values are the new
+            :class:`FeaturesConcatDataset` subsets.
 
         """
         if isinstance(by, str):
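A hedged sketch of the documented `by` options, assuming an existing `FeaturesConcatDataset` named `concat_ds` whose description has a `subject` column:

    # One subset per unique value of the "subject" description column.
    by_subject = concat_ds.split("subject")

    # A single subset holding the datasets at indices 0 and 1.
    first_two = concat_ds.split([0, 1])

    # Named subsets from explicit dataset indices.
    folds = concat_ds.split({"train": [0, 1], "valid": [2]})
    print(list(folds))  # ["train", "valid"]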
@@ -184,14 +233,21 @@ class FeaturesConcatDataset(BaseConcatDataset):
         }
 
     def get_metadata(self) -> pd.DataFrame:
-        """
+        """Get the metadata of all datasets as a single DataFrame.
+
+        Concatenates the metadata from all contained datasets and adds columns
+        from their `description` attributes.
 
         Returns
         -------
-
-            DataFrame containing
-
-
+        pandas.DataFrame
+            A DataFrame containing the metadata for every sample in the
+            concatenated dataset.
+
+        Raises
+        ------
+        TypeError
+            If any of the contained datasets is not a :class:`FeaturesDataset`.
 
         """
         if not all([isinstance(ds, FeaturesDataset) for ds in self.datasets]):
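Sketch of inspecting the combined table, under the same assumption about `concat_ds`:

    meta = concat_ds.get_metadata()
    print(meta.columns.tolist())  # window indices, "target", plus description keys
    print(meta[["i_window_in_trial", "target"]].head())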
@@ -202,60 +258,59 @@ class FeaturesConcatDataset(BaseConcatDataset):
 
         all_dfs = list()
         for ds in self.datasets:
-            df = ds.metadata
+            df = ds.metadata.copy()
             for k, v in ds.description.items():
                 df[k] = v
             all_dfs.append(df)
 
         return pd.concat(all_dfs)
 
-    def save(self, path: str, overwrite: bool = False, offset: int = 0):
-        """Save
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            raw_preproc_kwargs.json (if raws were preprocessed)
-            window_kwargs.json (if this is a windowed dataset)
-            window_preproc_kwargs.json (if windows were preprocessed)
-            features_kwargs.json
+    def save(self, path: str, overwrite: bool = False, offset: int = 0) -> None:
+        """Save the concatenated dataset to a directory.
+
+        Creates a directory structure where each contained dataset is saved in
+        its own numbered subdirectory.
+
+        .. code-block::
+
+            path/
+                0/
+                    0-feat.parquet
+                    metadata_df.pkl
+                    description.json
+                    ...
+                1/
+                    1-feat.parquet
+                    ...
 
         Parameters
         ----------
         path : str
-
-
-
-
-
-
-
-
-
-
+            The directory where the dataset will be saved.
+        overwrite : bool, default False
+            If True, any existing subdirectories that conflict with the new
+            ones will be removed.
+        offset : int, default 0
+            An integer to add to the subdirectory names. Useful for saving
+            datasets in chunks.
+
+        Raises
+        ------
+        ValueError
+            If the dataset is empty.
+        FileExistsError
+            If a subdirectory already exists and `overwrite` is False.
 
         """
         if len(self.datasets) == 0:
             raise ValueError("Expect at least one dataset")
         path_contents = os.listdir(path)
-        n_sub_dirs = len([os.path.isdir(e) for e in path_contents])
+        n_sub_dirs = len([os.path.isdir(os.path.join(path, e)) for e in path_contents])
         for i_ds, ds in enumerate(self.datasets):
-
-            if
-            path_contents.remove(
-
-            sub_dir = os.path.join(path, str(i_ds + offset))
+            sub_dir_name = str(i_ds + offset)
+            if sub_dir_name in path_contents:
+                path_contents.remove(sub_dir_name)
+            sub_dir = os.path.join(path, sub_dir_name)
             if os.path.exists(sub_dir):
                 if overwrite:
                     shutil.rmtree(sub_dir)
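Usage sketch for `save`: the target directory must already exist (its contents are listed), and `offset` lets a later call continue the numbering; directory names here are illustrative:

    import os

    out_dir = "extracted_features"
    os.makedirs(out_dir, exist_ok=True)
    concat_ds.save(out_dir, overwrite=True)
    # A later chunk could continue the numbering, e.g.:
    # next_chunk.save(out_dir, offset=len(concat_ds.datasets))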
@@ -265,35 +320,21 @@ class FeaturesConcatDataset(BaseConcatDataset):
                         f" a different directory, set overwrite=True, or "
                         f"resolve manually."
                     )
-            # save_dir/{i_ds+offset}/
             os.makedirs(sub_dir)
-            # save_dir/{i_ds+offset}/{i_ds+offset}-feat.parquet
             self._save_features(sub_dir, ds, i_ds, offset)
-            # save_dir/{i_ds+offset}/metadata_df.pkl
             self._save_metadata(sub_dir, ds)
-            # save_dir/{i_ds+offset}/description.json
             self._save_description(sub_dir, ds.description)
-            # save_dir/{i_ds+offset}/raw-info.fif
             self._save_raw_info(sub_dir, ds)
-            # save_dir/{i_ds+offset}/raw_preproc_kwargs.json
-            # save_dir/{i_ds+offset}/window_kwargs.json
-            # save_dir/{i_ds+offset}/window_preproc_kwargs.json
-            # save_dir/{i_ds+offset}/features_kwargs.json
             self._save_kwargs(sub_dir, ds)
-            if overwrite:
-
-
-
-
-
-
-
-
-                    f"datasets!",
-                    UserWarning,
-                )
-            # if path contains files or directories that were not touched, raise
-            # warning
+            if overwrite and i_ds + 1 + offset < n_sub_dirs:
+                logger.warning(
+                    f"The number of saved datasets ({i_ds + 1 + offset}) "
+                    f"does not match the number of existing "
+                    f"subdirectories ({n_sub_dirs}). You may now "
+                    f"encounter a mix of differently preprocessed "
+                    f"datasets!",
+                    UserWarning,
+                )
         if path_contents:
             logger.warning(
                 f"Chosen directory {path} contains other "
@@ -301,20 +342,37 @@ class FeaturesConcatDataset(BaseConcatDataset):
             )
 
     @staticmethod
-    def _save_features(sub_dir, ds, i_ds, offset):
+    def _save_features(sub_dir: str, ds: FeaturesDataset, i_ds: int, offset: int):
+        """Save the feature DataFrame to a Parquet file."""
         parquet_file_name = f"{i_ds + offset}-feat.parquet"
         parquet_file_path = os.path.join(sub_dir, parquet_file_name)
         ds.features.to_parquet(parquet_file_path)
 
     @staticmethod
-    def
-
+    def _save_metadata(sub_dir: str, ds: FeaturesDataset):
+        """Save the metadata DataFrame to a pickle file."""
+        metadata_file_name = "metadata_df.pkl"
+        metadata_file_path = os.path.join(sub_dir, metadata_file_name)
+        ds.metadata.to_pickle(metadata_file_path)
+
+    @staticmethod
+    def _save_description(sub_dir: str, description: pd.Series):
+        """Save the description Series to a JSON file."""
+        desc_file_name = "description.json"
+        desc_file_path = os.path.join(sub_dir, desc_file_name)
+        description.to_json(desc_file_path)
+
+    @staticmethod
+    def _save_raw_info(sub_dir: str, ds: FeaturesDataset):
+        """Save the raw info dictionary to a FIF file if it exists."""
+        if hasattr(ds, "raw_info") and ds.raw_info is not None:
             fif_file_name = "raw-info.fif"
             fif_file_path = os.path.join(sub_dir, fif_file_name)
-            ds.raw_info.save(fif_file_path)
+            ds.raw_info.save(fif_file_path, overwrite=True)
 
     @staticmethod
-    def _save_kwargs(sub_dir, ds):
+    def _save_kwargs(sub_dir: str, ds: FeaturesDataset):
+        """Save various keyword argument dictionaries to JSON files."""
         for kwargs_name in [
             "raw_preproc_kwargs",
             "window_kwargs",
@@ -322,10 +380,10 @@ class FeaturesConcatDataset(BaseConcatDataset):
             "features_kwargs",
         ]:
             if hasattr(ds, kwargs_name):
-                kwargs_file_name = ".".join([kwargs_name, "json"])
-                kwargs_file_path = os.path.join(sub_dir, kwargs_file_name)
                 kwargs = getattr(ds, kwargs_name)
                 if kwargs is not None:
+                    kwargs_file_name = ".".join([kwargs_name, "json"])
+                    kwargs_file_path = os.path.join(sub_dir, kwargs_file_name)
                     with open(kwargs_file_path, "w") as f:
                         json.dump(kwargs, f)
 
@@ -334,7 +392,25 @@ class FeaturesConcatDataset(BaseConcatDataset):
         include_metadata: bool | str | List[str] = False,
         include_target: bool = False,
         include_crop_inds: bool = False,
-    ):
+    ) -> pd.DataFrame:
+        """Convert the dataset to a single pandas DataFrame.
+
+        Parameters
+        ----------
+        include_metadata : bool or str or list of str, default False
+            If True, include all metadata columns. If a string or list of
+            strings, include only the specified metadata columns.
+        include_target : bool, default False
+            If True, include the 'target' column.
+        include_crop_inds : bool, default False
+            If True, include window cropping index columns.
+
+        Returns
+        -------
+        pandas.DataFrame
+            A DataFrame containing the features and requested metadata.
+
+        """
         if (
             not isinstance(include_metadata, bool)
             or include_metadata
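Sketch of the documented `to_dataframe` options, again assuming `concat_ds` and a `subject` description column:

    feats_only = concat_ds.to_dataframe()
    with_meta = concat_ds.to_dataframe(include_metadata=True)
    for_model = concat_ds.to_dataframe(include_metadata=["subject"], include_target=True)
    windows = concat_ds.to_dataframe(include_crop_inds=True)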
@@ -343,7 +419,7 @@ class FeaturesConcatDataset(BaseConcatDataset):
             include_dataset = False
             if isinstance(include_metadata, bool) and include_metadata:
                 include_dataset = True
-                cols = self.datasets[0].metadata.columns
+                cols = self.datasets[0].metadata.columns.tolist()
             else:
                 cols = include_metadata
             if isinstance(cols, bool) and not cols:
@@ -352,13 +428,14 @@ class FeaturesConcatDataset(BaseConcatDataset):
                 cols = [cols]
             cols = set(cols)
             if include_crop_inds:
-                cols
-
-
-
-
-
-
+                cols.update(
+                    {
+                        "i_dataset",
+                        "i_window_in_trial",
+                        "i_start_in_trial",
+                        "i_stop_in_trial",
+                    }
+                )
             if include_target:
                 cols.add("target")
             cols = list(cols)
@@ -381,10 +458,26 @@ class FeaturesConcatDataset(BaseConcatDataset):
         dataframes = [ds.features for ds in self.datasets]
         return pd.concat(dataframes, axis=0, ignore_index=True)
 
-    def _numeric_columns(self):
+    def _numeric_columns(self) -> pd.Index:
+        """Get the names of numeric columns from the feature DataFrames."""
         return self.datasets[0].features.select_dtypes(include=np.number).columns
 
-    def count(self, numeric_only=False, n_jobs=1):
+    def count(self, numeric_only: bool = False, n_jobs: int = 1) -> pd.Series:
+        """Count non-NA cells for each feature column.
+
+        Parameters
+        ----------
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        n_jobs : int, default 1
+            Number of jobs to run in parallel.
+
+        Returns
+        -------
+        pandas.Series
+            The count of non-NA cells for each column.
+
+        """
         stats = Parallel(n_jobs)(
             delayed(_compute_stats)(ds, return_count=True, numeric_only=numeric_only)
             for ds in self.datasets
@@ -393,7 +486,22 @@ class FeaturesConcatDataset(BaseConcatDataset):
         count = counts.sum(axis=0)
         return pd.Series(count, index=self._numeric_columns())
 
-    def mean(self, numeric_only=False, n_jobs=1):
+    def mean(self, numeric_only: bool = False, n_jobs: int = 1) -> pd.Series:
+        """Compute the mean for each feature column.
+
+        Parameters
+        ----------
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        n_jobs : int, default 1
+            Number of jobs to run in parallel.
+
+        Returns
+        -------
+        pandas.Series
+            The mean of each column.
+
+        """
         stats = Parallel(n_jobs)(
             delayed(_compute_stats)(
                 ds, return_count=True, return_mean=True, numeric_only=numeric_only
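Sketch of the per-column statistics; computation is spread over the contained datasets via Parallel/delayed, so `n_jobs` controls parallelism:

    n_valid = concat_ds.count(numeric_only=True)
    mu = concat_ds.mean(numeric_only=True, n_jobs=2)
    sigma2 = concat_ds.var(ddof=1, numeric_only=True, n_jobs=2)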
@@ -405,7 +513,26 @@ class FeaturesConcatDataset(BaseConcatDataset):
         mean = np.sum((counts / count) * means, axis=0)
         return pd.Series(mean, index=self._numeric_columns())
 
-    def var(
+    def var(
+        self, ddof: int = 1, numeric_only: bool = False, n_jobs: int = 1
+    ) -> pd.Series:
+        """Compute the variance for each feature column.
+
+        Parameters
+        ----------
+        ddof : int, default 1
+            Delta Degrees of Freedom. The divisor used in calculations is N - ddof.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        n_jobs : int, default 1
+            Number of jobs to run in parallel.
+
+        Returns
+        -------
+        pandas.Series
+            The variance of each column.
+
+        """
         stats = Parallel(n_jobs)(
             delayed(_compute_stats)(
                 ds,
@@ -425,12 +552,50 @@ class FeaturesConcatDataset(BaseConcatDataset):
         _, _, var = _pooled_var(counts, means, variances, ddof, ddof_in=0)
         return pd.Series(var, index=self._numeric_columns())
 
-    def std(
+    def std(
+        self, ddof: int = 1, numeric_only: bool = False, eps: float = 0, n_jobs: int = 1
+    ) -> pd.Series:
+        """Compute the standard deviation for each feature column.
+
+        Parameters
+        ----------
+        ddof : int, default 1
+            Delta Degrees of Freedom.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        eps : float, default 0
+            A small epsilon value to add to the variance before taking the
+            square root to avoid numerical instability.
+        n_jobs : int, default 1
+            Number of jobs to run in parallel.
+
+        Returns
+        -------
+        pandas.Series
+            The standard deviation of each column.
+
+        """
         return np.sqrt(
             self.var(ddof=ddof, numeric_only=numeric_only, n_jobs=n_jobs) + eps
         )
 
-    def zscore(
+    def zscore(
+        self, ddof: int = 1, numeric_only: bool = False, eps: float = 0, n_jobs: int = 1
+    ) -> None:
+        """Apply z-score normalization to numeric columns in-place.
+
+        Parameters
+        ----------
+        ddof : int, default 1
+            Delta Degrees of Freedom for variance calculation.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        eps : float, default 0
+            Epsilon for numerical stability.
+        n_jobs : int, default 1
+            Number of jobs to run in parallel for statistics computation.
+
+        """
         stats = Parallel(n_jobs)(
             delayed(_compute_stats)(
                 ds,
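Sketch of standardizing the numeric feature columns in place across the whole concatenation, with a small `eps` guarding near-zero variances:

    concat_ds.zscore(ddof=1, eps=1e-12, n_jobs=2)
    sigma = concat_ds.std(ddof=1, eps=1e-12)  # roughly 1.0 per column afterwards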
@@ -450,10 +615,13 @@ class FeaturesConcatDataset(BaseConcatDataset):
         _, mean, var = _pooled_var(counts, means, variances, ddof, ddof_in=0)
         std = np.sqrt(var + eps)
         for ds in self.datasets:
-            ds.features
+            ds.features.loc[:, self._numeric_columns()] = (
+                ds.features.loc[:, self._numeric_columns()] - mean
+            ) / std
 
     @staticmethod
-    def _enforce_inplace_operations(func_name, kwargs):
+    def _enforce_inplace_operations(func_name: str, kwargs: dict):
+        """Raise an error if 'inplace=False' is passed to a method."""
         if "inplace" in kwargs and kwargs["inplace"] is False:
             raise ValueError(
                 f"{func_name} only works inplace, please change "
@@ -461,33 +629,49 @@ class FeaturesConcatDataset(BaseConcatDataset):
             )
         kwargs["inplace"] = True
 
-    def fillna(self, *args, **kwargs):
+    def fillna(self, *args, **kwargs) -> None:
+        """Fill NA/NaN values in-place. See :meth:`pandas.DataFrame.fillna`."""
         FeaturesConcatDataset._enforce_inplace_operations("fillna", kwargs)
         for ds in self.datasets:
             ds.features.fillna(*args, **kwargs)
 
-    def replace(self, *args, **kwargs):
+    def replace(self, *args, **kwargs) -> None:
+        """Replace values in-place. See :meth:`pandas.DataFrame.replace`."""
         FeaturesConcatDataset._enforce_inplace_operations("replace", kwargs)
         for ds in self.datasets:
             ds.features.replace(*args, **kwargs)
 
-    def interpolate(self, *args, **kwargs):
+    def interpolate(self, *args, **kwargs) -> None:
+        """Interpolate values in-place. See :meth:`pandas.DataFrame.interpolate`."""
         FeaturesConcatDataset._enforce_inplace_operations("interpolate", kwargs)
         for ds in self.datasets:
             ds.features.interpolate(*args, **kwargs)
 
-    def dropna(self, *args, **kwargs):
+    def dropna(self, *args, **kwargs) -> None:
+        """Remove missing values in-place. See :meth:`pandas.DataFrame.dropna`."""
         FeaturesConcatDataset._enforce_inplace_operations("dropna", kwargs)
         for ds in self.datasets:
             ds.features.dropna(*args, **kwargs)
 
-    def drop(self, *args, **kwargs):
+    def drop(self, *args, **kwargs) -> None:
+        """Drop specified labels from rows or columns in-place. See :meth:`pandas.DataFrame.drop`."""
         FeaturesConcatDataset._enforce_inplace_operations("drop", kwargs)
         for ds in self.datasets:
             ds.features.drop(*args, **kwargs)
 
-    def join(self, concat_dataset: FeaturesConcatDataset, **kwargs):
+    def join(self, concat_dataset: FeaturesConcatDataset, **kwargs) -> None:
+        """Join columns with other FeaturesConcatDataset in-place.
+
+        Parameters
+        ----------
+        concat_dataset : FeaturesConcatDataset
+            The dataset to join with. Must have the same number of datasets,
+            and each corresponding dataset must have the same length.
+        **kwargs
+            Keyword arguments to pass to :meth:`pandas.DataFrame.join`.
+
+        """
         assert len(self.datasets) == len(concat_dataset.datasets)
         for ds1, ds2 in zip(self.datasets, concat_dataset.datasets):
             assert len(ds1) == len(ds2)
-            ds1.features.join(ds2, **kwargs)
+            ds1.features = ds1.features.join(ds2.features, **kwargs)