eegdash 0.3.3.dev61__py3-none-any.whl → 0.5.0.dev180784713__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eegdash/__init__.py +19 -6
- eegdash/api.py +336 -539
- eegdash/bids_eeg_metadata.py +495 -0
- eegdash/const.py +349 -0
- eegdash/dataset/__init__.py +28 -0
- eegdash/dataset/base.py +311 -0
- eegdash/dataset/bids_dataset.py +641 -0
- eegdash/dataset/dataset.py +692 -0
- eegdash/dataset/dataset_summary.csv +255 -0
- eegdash/dataset/registry.py +287 -0
- eegdash/downloader.py +197 -0
- eegdash/features/__init__.py +15 -13
- eegdash/features/datasets.py +329 -138
- eegdash/features/decorators.py +105 -13
- eegdash/features/extractors.py +233 -63
- eegdash/features/feature_bank/__init__.py +12 -12
- eegdash/features/feature_bank/complexity.py +22 -20
- eegdash/features/feature_bank/connectivity.py +27 -28
- eegdash/features/feature_bank/csp.py +3 -1
- eegdash/features/feature_bank/dimensionality.py +6 -6
- eegdash/features/feature_bank/signal.py +29 -30
- eegdash/features/feature_bank/spectral.py +40 -44
- eegdash/features/feature_bank/utils.py +8 -0
- eegdash/features/inspect.py +126 -15
- eegdash/features/serialization.py +58 -17
- eegdash/features/utils.py +90 -16
- eegdash/hbn/__init__.py +28 -0
- eegdash/hbn/preprocessing.py +105 -0
- eegdash/hbn/windows.py +428 -0
- eegdash/logging.py +54 -0
- eegdash/mongodb.py +55 -24
- eegdash/paths.py +52 -0
- eegdash/utils.py +29 -1
- eegdash-0.5.0.dev180784713.dist-info/METADATA +121 -0
- eegdash-0.5.0.dev180784713.dist-info/RECORD +38 -0
- eegdash-0.5.0.dev180784713.dist-info/licenses/LICENSE +29 -0
- eegdash/data_config.py +0 -34
- eegdash/data_utils.py +0 -687
- eegdash/dataset.py +0 -69
- eegdash/preprocessing.py +0 -63
- eegdash-0.3.3.dev61.dist-info/METADATA +0 -192
- eegdash-0.3.3.dev61.dist-info/RECORD +0 -28
- eegdash-0.3.3.dev61.dist-info/licenses/LICENSE +0 -23
- {eegdash-0.3.3.dev61.dist-info → eegdash-0.5.0.dev180784713.dist-info}/WHEEL +0 -0
- {eegdash-0.3.3.dev61.dist-info → eegdash-0.5.0.dev180784713.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,692 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
from docstring_inheritance import NumpyDocstringInheritanceInitMeta
|
|
5
|
+
from mne_bids import find_matching_paths
|
|
6
|
+
from rich.console import Console
|
|
7
|
+
from rich.panel import Panel
|
|
8
|
+
from rich.text import Text
|
|
9
|
+
|
|
10
|
+
from braindecode.datasets import BaseConcatDataset
|
|
11
|
+
|
|
12
|
+
from .. import downloader
|
|
13
|
+
from ..bids_eeg_metadata import (
|
|
14
|
+
build_query_from_kwargs,
|
|
15
|
+
merge_participants_fields,
|
|
16
|
+
normalize_key,
|
|
17
|
+
)
|
|
18
|
+
from ..const import (
|
|
19
|
+
ALLOWED_QUERY_FIELDS,
|
|
20
|
+
RELEASE_TO_OPENNEURO_DATASET_MAP,
|
|
21
|
+
SUBJECT_MINI_RELEASE_MAP,
|
|
22
|
+
)
|
|
23
|
+
from ..logging import logger
|
|
24
|
+
from ..paths import get_default_cache_dir
|
|
25
|
+
from .base import EEGDashBaseDataset
|
|
26
|
+
from .bids_dataset import EEGBIDSDataset
|
|
27
|
+
from .registry import register_openneuro_datasets
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class EEGDashDataset(BaseConcatDataset, metaclass=NumpyDocstringInheritanceInitMeta):
    """Create a new EEGDashDataset from a given query or local BIDS dataset directory
    and dataset name. An EEGDashDataset is a pooled collection of EEGDashBaseDataset
    instances (individual recordings) and is a subclass of braindecode's BaseConcatDataset.

    Examples
    --------
    Basic usage with dataset and subject filtering:

    >>> from eegdash import EEGDashDataset
    >>> dataset = EEGDashDataset(
    ...     cache_dir="./data",
    ...     dataset="ds002718",
    ...     subject="012"
    ... )
    >>> print(f"Number of recordings: {len(dataset)}")

    Filter by multiple subjects and specific task:

    >>> subjects = ["012", "013", "014"]
    >>> dataset = EEGDashDataset(
    ...     cache_dir="./data",
    ...     dataset="ds002718",
    ...     subject=subjects,
    ...     task="RestingState"
    ... )

    Load and inspect EEG data from recordings:

    >>> if len(dataset) > 0:
    ...     recording = dataset[0]
    ...     raw = recording.load()
    ...     print(f"Sampling rate: {raw.info['sfreq']} Hz")
    ...     print(f"Number of channels: {len(raw.ch_names)}")
    ...     print(f"Duration: {raw.times[-1]:.1f} seconds")

    Advanced filtering with raw MongoDB queries:

    >>> from eegdash import EEGDashDataset
    >>> query = {
    ...     "dataset": "ds002718",
    ...     "subject": {"$in": ["012", "013"]},
    ...     "task": "RestingState"
    ... }
    >>> dataset = EEGDashDataset(cache_dir="./data", query=query)

    Working with dataset collections and braindecode integration:

    >>> # EEGDashDataset is a braindecode BaseConcatDataset
    >>> for i, recording in enumerate(dataset):
    ...     if i >= 2:  # limit output
    ...         break
    ...     print(f"Recording {i}: {recording.description}")
    ...     raw = recording.load()
    ...     print(f"  Channels: {len(raw.ch_names)}, Duration: {raw.times[-1]:.1f}s")

    Parameters
    ----------
    cache_dir : str | Path
        Directory where data are cached locally. If empty (``""``) or
        ``None``, the eegdash default cache directory is used. The directory
        is created when it does not exist.
    query : dict | None
        Raw MongoDB query to filter records. If provided, it is merged with
        keyword filtering arguments (see ``**kwargs``) using logical AND.
        You must provide at least a ``dataset`` (either in ``query`` or
        as a keyword argument). Only fields in ``ALLOWED_QUERY_FIELDS`` are
        considered for filtering. The dictionary passed in is copied and is
        never modified.
    dataset : str
        Dataset identifier (e.g., ``"ds002718"``). Required if ``query`` does
        not already specify a dataset.
    task : str | list[str]
        Task name(s) to filter by (e.g., ``"RestingState"``).
    subject : str | list[str]
        Subject identifier(s) to filter by (e.g., ``"NDARCA153NKE"``).
    session : str | list[str]
        Session identifier(s) to filter by (e.g., ``"1"``).
    run : str | list[str]
        Run identifier(s) to filter by (e.g., ``"1"``).
    description_fields : list[str]
        Fields to extract from each record and include in dataset descriptions
        (e.g., "subject", "session", "run", "task").
    s3_bucket : str | None
        Optional S3 bucket URI (e.g., "s3://mybucket") to use instead of the
        default OpenNeuro bucket when downloading data files.
    records : list[dict] | None
        Pre-fetched metadata records. If provided, the dataset is constructed
        directly from these records and no MongoDB query is performed.
    download : bool, default True
        If False, load from local BIDS files only. Local data are expected
        under ``cache_dir / dataset``; no DB or S3 access is attempted.
    n_jobs : int
        Number of parallel jobs to use where applicable (-1 uses all cores).
    eeg_dash_instance : EEGDash | None
        Optional existing EEGDash client to reuse for DB queries. If None,
        a new client is created on demand, not used in the case of no download.
    **kwargs : dict
        Additional keyword arguments serving two purposes:

        - Filtering: any keys present in ``ALLOWED_QUERY_FIELDS`` are treated as
          query filters (e.g., ``dataset``, ``subject``, ``task``, ...).
        - Dataset options: remaining keys are forwarded to
          ``EEGDashBaseDataset``.

    """

    def __init__(
        self,
        cache_dir: str | Path,
        query: dict[str, Any] | None = None,
        description_fields: list[str] | None = None,
        s3_bucket: str | None = None,
        records: list[dict] | None = None,
        download: bool = True,
        n_jobs: int = -1,
        eeg_dash_instance: Any = None,
        **kwargs,
    ):
        # Parameters that don't need validation
        _suppress_comp_warning: bool = kwargs.pop("_suppress_comp_warning", False)
        self.s3_bucket = s3_bucket
        self.records = records
        self.download = download
        self.n_jobs = n_jobs
        self.eeg_dash_instance = eeg_dash_instance

        if description_fields is None:
            description_fields = [
                "subject",
                "session",
                "run",
                "task",
                "age",
                "gender",
                "sex",
            ]

        self.cache_dir = cache_dir
        if self.cache_dir == "" or self.cache_dir is None:
            self.cache_dir = get_default_cache_dir()
            logger.warning(
                f"Cache directory is empty, using the eegdash default path: {self.cache_dir}"
            )

        self.cache_dir = Path(self.cache_dir)

        if not self.cache_dir.exists():
            logger.warning(
                f"Cache directory does not exist, creating it: {self.cache_dir}"
            )
            self.cache_dir.mkdir(exist_ok=True, parents=True)

        # Separate query kwargs from other kwargs passed to the BaseDataset
        # constructor. Work on a copy so the caller's ``query`` dict is never
        # mutated by the merge below.
        self.query = dict(query) if query else {}
        self.query.update(
            {k: v for k, v in kwargs.items() if k in ALLOWED_QUERY_FIELDS}
        )
        base_dataset_kwargs = {k: v for k, v in kwargs.items() if k not in self.query}

        if "dataset" not in self.query:
            # If explicit records are provided, infer dataset from records
            inferred = None
            if isinstance(records, list) and records and isinstance(records[0], dict):
                inferred = records[0].get("dataset")
            if not inferred:
                raise ValueError("You must provide a 'dataset' argument")
            self.query["dataset"] = inferred

        # Decide on a dataset subfolder name for cache isolation. If using
        # challenge/preprocessed buckets (e.g., BDF, mini subsets), append
        # informative suffixes to avoid overlapping with the original dataset.
        dataset_folder = self.query["dataset"]
        if self.s3_bucket:
            bucket_lower = str(self.s3_bucket).lower()
            suffixes: list[str] = [
                tag for tag in ("bdf", "mini") if tag in bucket_lower
            ]
            if suffixes:
                dataset_folder = f"{dataset_folder}-{'-'.join(suffixes)}"

        self.data_dir = self.cache_dir / dataset_folder

        if (
            not _suppress_comp_warning
            and self.query["dataset"] in RELEASE_TO_OPENNEURO_DATASET_MAP.values()
        ):
            message_text = Text.from_markup(
                "[italic]This notice is only for users who are participating in the [link=https://eeg2025.github.io/]EEG 2025 Competition[/link].[/italic]\n\n"
                "[bold]EEG 2025 Competition Data Notice![/bold]\n"
                "You are loading one of the datasets that is used in competition, but via `EEGDashDataset`.\n\n"
                "[bold red]IMPORTANT[/bold red]: \n"
                "If you download data from `EEGDashDataset`, it is [u]NOT[/u] identical to the official \n"
                "competition data, which is accessed via `EEGChallengeDataset`. "
                "The competition data has been downsampled and filtered.\n\n"
                "[bold]If you are participating in the competition, \nyou must use the `EEGChallengeDataset` object to ensure consistency.[/bold] \n\n"
                "If you are not participating in the competition, you can ignore this message."
            )
            warning_panel = Panel(
                message_text,
                title="[yellow]EEG 2025 Competition Data Notice[/yellow]",
                subtitle="[cyan]Source: EEGDashDataset[/cyan]",
                border_style="yellow",
            )

            # Fall back to the logger when the rich console cannot render.
            try:
                Console().print(warning_panel)
            except Exception:
                logger.warning(str(message_text))

        if records is not None:
            # Pre-fetched records: build the recordings directly, no DB query.
            datasets = [
                EEGDashBaseDataset(
                    record,
                    self.cache_dir,
                    self.s3_bucket,
                    **base_dataset_kwargs,
                )
                for record in self.records
            ]
        elif not download:  # only assume local data is complete if not downloading
            datasets = self._build_local_datasets(
                description_fields=description_fields,
                base_dataset_kwargs=base_dataset_kwargs,
            )
        elif self.query:
            if self.eeg_dash_instance is None:
                # to avoid circular import
                from ..api import EEGDash

                self.eeg_dash_instance = EEGDash()
            datasets = self._find_datasets(
                query=build_query_from_kwargs(**self.query),
                description_fields=description_fields,
                base_dataset_kwargs=base_dataset_kwargs,
            )
            # We only need filesystem if we need to access S3
            self.filesystem = downloader.get_s3_filesystem()
        else:
            raise ValueError(
                "You must provide either 'records', a 'data_dir', or a query/keyword arguments for filtering."
            )

        super().__init__(datasets)

    def _build_local_datasets(
        self,
        description_fields: list[str],
        base_dataset_kwargs: dict,
    ) -> list:
        """Build recordings from local BIDS files only (offline mode).

        Parameters
        ----------
        description_fields : list of str
            Fields requested for each recording's description.
        base_dataset_kwargs : dict
            Additional keyword arguments forwarded to
            :class:`EEGDashBaseDataset`.

        Returns
        -------
        list of EEGDashBaseDataset
            One dataset per EEG file found under ``self.data_dir``.

        Raises
        ------
        ValueError
            If ``self.data_dir`` does not exist.

        """
        if not self.data_dir.exists():
            raise ValueError(
                f"Offline mode is enabled, but local data_dir {self.data_dir} does not exist."
            )
        # NOTE(review): ``self.records`` is left untouched here (as before);
        # only the DB-query path stores the records it found.
        records = self._find_local_bids_records(self.data_dir, self.query)

        # Try to enrich from local participants.tsv to restore requested fields
        try:
            bids_ds = EEGBIDSDataset(
                data_dir=str(self.data_dir), dataset=self.query["dataset"]
            )  # type: ignore[index]
        except Exception as exc:
            # Best effort: descriptions fall back to filename entities only.
            logger.debug("Could not open local BIDS dataset %s: %s", self.data_dir, exc)
            bids_ds = None

        datasets = []
        for record in records:
            # Start with entity values from filename
            desc: dict[str, Any] = {
                k: record.get(k)
                for k in ("subject", "session", "run", "task")
                if record.get(k) is not None
            }

            if bids_ds is not None:
                try:
                    rel_from_dataset = Path(record["bidspath"]).relative_to(
                        record["dataset"]
                    )  # type: ignore[index]
                    local_file = (self.data_dir / rel_from_dataset).as_posix()
                    part_row = bids_ds.subject_participant_tsv(local_file)
                    desc = merge_participants_fields(
                        description=desc,
                        participants_row=part_row
                        if isinstance(part_row, dict)
                        else None,
                        description_fields=description_fields,
                    )
                except Exception as exc:
                    # Best effort: keep the entity-only description.
                    logger.debug(
                        "Could not merge participants.tsv fields for %s: %s",
                        record.get("bidspath"),
                        exc,
                    )

            datasets.append(
                EEGDashBaseDataset(
                    record=record,
                    cache_dir=self.cache_dir,
                    s3_bucket=self.s3_bucket,
                    description=desc,
                    **base_dataset_kwargs,
                )
            )
        return datasets

    def _find_local_bids_records(
        self, dataset_root: Path, filters: dict[str, Any]
    ) -> list[dict]:
        """Discover local BIDS EEG files and build minimal records.

        Enumerates EEG recordings under ``dataset_root`` using
        ``mne_bids.find_matching_paths`` and applies entity filters to produce
        records suitable for :class:`EEGDashBaseDataset`. No network access is
        performed, and files are not read.

        Parameters
        ----------
        dataset_root : Path
            Local dataset directory (e.g., ``/path/to/cache/ds005509``).
        filters : dict
            Query filters. Must include ``'dataset'`` and may include BIDS
            entities like ``'subject'``, ``'session'``, etc.

        Returns
        -------
        list of dict
            A list of records, one for each matched EEG file. Each record
            contains BIDS entities, paths, and minimal metadata for offline use.

        Notes
        -----
        Matching is performed for ``datatypes=['eeg']`` and ``suffixes=['eeg']``.
        The ``bidspath`` is normalized to ensure it starts with the dataset ID,
        even for suffixed cache directories.

        """
        dataset_id = filters["dataset"]
        # find_matching_paths keyword -> entity name used in ``filters``
        arg_map = {
            "subjects": "subject",
            "sessions": "session",
            "tasks": "task",
            "runs": "run",
        }
        matching_args: dict[str, list[str]] = {}
        for finder_key, entity_key in arg_map.items():
            entity_val = filters.get(entity_key)
            if entity_val is None:
                continue
            if isinstance(entity_val, (list, tuple, set)):
                entity_vals = list(entity_val)
                # An empty collection means "no restriction" for this entity.
                if entity_vals:
                    matching_args[finder_key] = entity_vals
            else:
                matching_args[finder_key] = [entity_val]

        matched_paths = find_matching_paths(
            root=str(dataset_root),
            datatypes=["eeg"],
            suffixes=["eeg"],
            ignore_json=True,
            **matching_args,
        )
        records_out: list[dict] = []

        for bids_path in matched_paths:
            # Build bidspath as dataset_id / relative_path_from_dataset_root (POSIX)
            rel_from_root = (
                Path(bids_path.fpath)
                .resolve()
                .relative_to(Path(bids_path.root).resolve())
            )
            bidspath = f"{dataset_id}/{rel_from_root.as_posix()}"

            records_out.append(
                {
                    "data_name": f"{dataset_id}_{Path(bids_path.fpath).name}",
                    "dataset": dataset_id,
                    "bidspath": bidspath,
                    "subject": (bids_path.subject or None),
                    "session": (bids_path.session or None),
                    "task": (bids_path.task or None),
                    "run": (bids_path.run or None),
                    # minimal fields to satisfy BaseDataset from eegdash
                    "bidsdependencies": [],  # not needed to just run.
                    "modality": "eeg",
                    # minimal numeric defaults for offline length calculation
                    "sampling_frequency": None,
                    "nchans": None,
                    "ntimes": None,
                }
            )

        return records_out

    def _find_key_in_nested_dict(self, data: Any, target_key: str) -> Any:
        """Recursively search for a key in nested dicts/lists.

        Keys are compared after passing both sides through ``normalize_key``.

        Parameters
        ----------
        data : Any
            The nested data structure (dicts, lists) to search.
        target_key : str
            The key to search for.

        Returns
        -------
        Any
            The value of the first matching key, or None if not found.

        """
        # Normalize the target once instead of on every recursive call.
        norm_target = normalize_key(target_key)

        def _search(node: Any) -> Any:
            if isinstance(node, dict):
                for key, value in node.items():
                    if normalize_key(key) == norm_target:
                        return value
                    found = _search(value)
                    if found is not None:
                        return found
            elif isinstance(node, list):
                for item in node:
                    found = _search(item)
                    if found is not None:
                        return found
            return None

        return _search(data)

    def _find_datasets(
        self,
        query: dict[str, Any] | None,
        description_fields: list[str],
        base_dataset_kwargs: dict,
    ) -> list[EEGDashBaseDataset]:
        """Find and construct datasets from a MongoDB query.

        Queries the database, then creates a list of
        :class:`EEGDashBaseDataset` objects from the results. The records
        returned by the query are stored in ``self.records``.

        Parameters
        ----------
        query : dict, optional
            The MongoDB query to execute.
        description_fields : list of str
            Fields to extract from each record for the dataset description.
        base_dataset_kwargs : dict
            Additional keyword arguments to pass to the
            :class:`EEGDashBaseDataset` constructor.

        Returns
        -------
        list of EEGDashBaseDataset
            A list of dataset objects matching the query.

        """
        datasets: list[EEGDashBaseDataset] = []
        self.records = self.eeg_dash_instance.find(query)

        for record in self.records:
            description: dict[str, Any] = {}
            # Requested fields first (normalized matching)
            for field in description_fields:
                value = self._find_key_in_nested_dict(record, field)
                if value is not None:
                    description[field] = value
            # Merge all participants.tsv columns generically
            part = self._find_key_in_nested_dict(record, "participant_tsv")
            if isinstance(part, dict):
                description = merge_participants_fields(
                    description=description,
                    participants_row=part,
                    description_fields=description_fields,
                )
            datasets.append(
                EEGDashBaseDataset(
                    record,
                    cache_dir=self.cache_dir,
                    s3_bucket=self.s3_bucket,
                    description=description,
                    **base_dataset_kwargs,
                )
            )
        return datasets

    # just to fix the docstring inheritance until we solved it in braindecode.
    def save(self, path, overwrite=False):
        """Save the dataset to disk.

        Parameters
        ----------
        path : str or Path
            Destination file path.
        overwrite : bool, default False
            If True, overwrite existing file.

        Returns
        -------
        None

        """
        return super().save(path, overwrite=overwrite)
|
|
515
|
+
|
|
516
|
+
|
|
517
|
+
class EEGChallengeDataset(EEGDashDataset):
    """A dataset helper for the EEG 2025 Challenge.

    This class simplifies access to the EEG 2025 Challenge datasets. It is a
    specialized version of :class:`EEGDashDataset` that is
    pre-configured for the challenge's data releases. It automatically maps a
    release name (e.g., "R1") to the corresponding OpenNeuro dataset and handles
    the selection of subject subsets (e.g., "mini" release).

    Parameters
    ----------
    release : str
        The name of the challenge release to load. Must be one of the keys in
        :const:`~eegdash.const.RELEASE_TO_OPENNEURO_DATASET_MAP`
        (e.g., "R1", "R2", ..., "R11").
    cache_dir : str
        The local directory where the dataset will be downloaded and cached.
    mini : bool, default True
        If True, the dataset is restricted to the official "mini" subset of
        subjects for the specified release. If False, all subjects for the
        release are included.
    query : dict, optional
        An additional MongoDB-style query to apply as a filter. This query is
        combined with the release and subject filters using a logical AND.
        The query must not contain the ``dataset`` key, as this is determined
        by the ``release`` parameter.
    s3_bucket : str, optional
        The base S3 bucket URI where the challenge data is stored. Defaults to
        the official challenge bucket; ``None`` also selects that default.
    **kwargs
        Additional keyword arguments that are passed directly to the
        :class:`EEGDashDataset` constructor.

    Raises
    ------
    ValueError
        If ``release`` is not a string or is unknown, or if the ``query``
        argument contains a ``dataset`` key. Also raised if ``mini`` is True
        and a requested subject is not part of the official mini-release subset.

    See Also
    --------
    EEGDashDataset : The base class for creating datasets from queries.

    """

    # Bucket used when ``s3_bucket`` is passed explicitly as ``None``; mirrors
    # the default value in the ``__init__`` signature.
    _DEFAULT_S3_BUCKET = "s3://nmdatasets/NeurIPS25"

    def __init__(
        self,
        release: str,
        cache_dir: str,
        mini: bool = True,
        query: dict | None = None,
        s3_bucket: str | None = "s3://nmdatasets/NeurIPS25",
        **kwargs,
    ):
        self.release = release
        self.mini = mini

        # Check the type first: an unhashable value (e.g. a list) would
        # otherwise raise TypeError in the membership test below.
        if not isinstance(release, str):
            raise ValueError(
                f"Unknown release type: {type(release)}, the expected type is str."
            )
        if release not in RELEASE_TO_OPENNEURO_DATASET_MAP:
            raise ValueError(
                f"Unknown release: {release}, expected one of {list(RELEASE_TO_OPENNEURO_DATASET_MAP.keys())}"
            )

        if query and "dataset" in query:
            raise ValueError(
                "Query using the parameters `dataset` with the class EEGChallengeDataset is not possible. "
                "Please use the release argument instead, or the object EEGDashDataset instead."
            )

        # ``None`` would otherwise be formatted into the literal bucket
        # name "None/<release>_...".
        if s3_bucket is None:
            s3_bucket = self._DEFAULT_S3_BUCKET

        if self.mini:
            # When using the mini release, restrict subjects to the predefined subset.
            # If the user specifies subject(s), ensure they all belong to the mini subset;
            # otherwise, default to the full mini subject list for this release.
            allowed_subjects = set(SUBJECT_MINI_RELEASE_MAP[release])

            # Normalize potential 'subjects' -> 'subject' for convenience
            if "subjects" in kwargs and "subject" not in kwargs:
                kwargs["subject"] = kwargs.pop("subjects")

            requested_subjects = self._collect_requested_subjects(query, kwargs)

            # Validate if any subjects were explicitly requested
            if requested_subjects:
                invalid = sorted(
                    {s for s in requested_subjects if s not in allowed_subjects}
                )
                if invalid:
                    raise ValueError(
                        "Some requested subject(s) are not part of the mini release for "
                        f"{release}: {invalid}. Allowed subjects: {sorted(allowed_subjects)}"
                    )
                # Do not override user selection; keep their (validated) subjects as-is.
            else:
                # No subject specified by the user: default to the full mini subset
                kwargs["subject"] = sorted(allowed_subjects)

            s3_bucket = f"{s3_bucket}/{release}_mini_L100_bdf"
        else:
            s3_bucket = f"{s3_bucket}/{release}_L100_bdf"

        message_text = Text.from_markup(
            "This object loads the HBN dataset that has been preprocessed for the EEG Challenge:\n"
            " * Downsampled from 500Hz to 100Hz\n"
            " * Bandpass filtered (0.5-50 Hz)\n\n"
            "For full preprocessing applied for competition details, see:\n"
            " [link=https://github.com/eeg2025/downsample-datasets]https://github.com/eeg2025/downsample-datasets[/link]\n\n"
            "The HBN dataset have some preprocessing applied by the HBN team:\n"
            " * Re-reference (Cz Channel)\n\n"
            "[bold red]IMPORTANT[/bold red]: The data accessed via `EEGChallengeDataset` is [u]NOT[/u] identical to what you get from [link=https://github.com/eegdash/EEGDash/blob/develop/eegdash/api.py]EEGDashDataset[/link] directly.\n"
            "If you are participating in the competition, always use `EEGChallengeDataset` to ensure consistency with the challenge data."
        )

        warning_panel = Panel(
            message_text,
            title="[yellow]EEG 2025 Competition Data Notice[/yellow]",
            subtitle="[cyan]Source: EEGChallengeDataset[/cyan]",
            border_style="yellow",
        )

        # Render the panel directly to the console so it displays in IPython/terminals
        try:
            Console().print(warning_panel)
        except Exception:
            warning_message = str(message_text)
            logger.warning(warning_message)

        # The base-class competition notice is suppressed because this class
        # has just printed its own.
        super().__init__(
            dataset=RELEASE_TO_OPENNEURO_DATASET_MAP[release],
            query=query,
            cache_dir=cache_dir,
            s3_bucket=s3_bucket,
            _suppress_comp_warning=True,
            **kwargs,
        )

    @staticmethod
    def _collect_requested_subjects(query: dict | None, kwargs: dict) -> list:
        """Gather the subjects explicitly requested via kwargs and/or query.

        Parameters
        ----------
        query : dict | None
            Raw query; only its top-level ``subject`` entry is inspected.
            Scalars, list/tuple/set values and ``{"$in": [...]}`` are handled.
        kwargs : dict
            Keyword arguments; only ``subject`` is inspected.

        Returns
        -------
        list
            Requested subjects, kwargs first then query; empty if none.

        """
        requested: list = []

        # From kwargs: canonicalize through the shared query builder so that
        # scalars and lists are handled the same way as everywhere else.
        subject_kw = kwargs.get("subject")
        if subject_kw is not None:
            built = build_query_from_kwargs(subject=subject_kw)
            s_val = built.get("subject")
            if isinstance(s_val, dict) and "$in" in s_val:
                requested.extend(s_val["$in"])
            elif s_val is not None:
                requested.append(s_val)  # type: ignore[arg-type]

        # From query (top-level only)
        if query and isinstance(query, dict) and "subject" in query:
            qval = query["subject"]
            if isinstance(qval, dict) and "$in" in qval:
                requested.extend(qval["$in"])
            elif isinstance(qval, (list, tuple, set)):
                requested.extend(qval)
            elif qval is not None:
                requested.append(qval)

        return requested
|
|
683
|
+
|
|
684
|
+
|
|
685
|
+
# Register one EEGDashDataset subclass per entry of the bundled summary CSV;
# the classes are injected into this module's namespace by the registry helper.
_summary_csv = Path(__file__).with_name("dataset_summary.csv")
registered_classes = register_openneuro_datasets(
    summary_file=_summary_csv,
    base_class=EEGDashDataset,
    namespace=globals(),
)


# Public API: the two hand-written classes plus every registered class name.
__all__ = ["EEGDashDataset", "EEGChallengeDataset", *registered_classes.keys()]
|