eegdash 0.3.9.dev182388821__py3-none-any.whl → 0.4.0.dev144__py3-none-any.whl
This diff compares the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- eegdash/__init__.py +12 -1
- eegdash/api.py +128 -155
- eegdash/bids_eeg_metadata.py +160 -27
- eegdash/const.py +18 -0
- eegdash/data_utils.py +74 -254
- eegdash/dataset/__init__.py +19 -1
- eegdash/dataset/dataset.py +27 -21
- eegdash/dataset/dataset_summary.csv +0 -1
- eegdash/dataset/registry.py +96 -9
- eegdash/downloader.py +187 -0
- eegdash/features/datasets.py +4 -3
- eegdash/features/serialization.py +8 -4
- eegdash/hbn/__init__.py +11 -0
- eegdash/hbn/preprocessing.py +11 -2
- eegdash/hbn/windows.py +12 -2
- eegdash/logging.py +33 -0
- eegdash/mongodb.py +11 -0
- eegdash/paths.py +11 -0
- eegdash/utils.py +10 -0
- {eegdash-0.3.9.dev182388821.dist-info → eegdash-0.4.0.dev144.dist-info}/METADATA +6 -56
- eegdash-0.4.0.dev144.dist-info/RECORD +37 -0
- eegdash-0.3.9.dev182388821.dist-info/RECORD +0 -35
- {eegdash-0.3.9.dev182388821.dist-info → eegdash-0.4.0.dev144.dist-info}/WHEEL +0 -0
- {eegdash-0.3.9.dev182388821.dist-info → eegdash-0.4.0.dev144.dist-info}/licenses/LICENSE +0 -0
- {eegdash-0.3.9.dev182388821.dist-info → eegdash-0.4.0.dev144.dist-info}/top_level.txt +0 -0
eegdash/data_utils.py
CHANGED

@@ -1,10 +1,19 @@
+# Authors: The EEGDash contributors.
+# License: GNU General Public License
+# Copyright the EEGDash contributors.
+
+"""Data utilities and dataset classes for EEG data handling.
+
+This module provides core dataset classes for working with EEG data in the EEGDash ecosystem,
+including classes for individual recordings and collections of datasets. It integrates with
+braindecode for machine learning workflows and handles data loading from both local and remote sources.
+"""
+
 import io
 import json
-import logging
 import os
 import re
 import traceback
-import warnings
 from contextlib import redirect_stderr
 from pathlib import Path
 from typing import Any
@@ -13,9 +22,7 @@ import mne
 import mne_bids
 import numpy as np
 import pandas as pd
-import s3fs
 from bids import BIDSLayout
-from fsspec.callbacks import TqdmCallback
 from joblib import Parallel, delayed
 from mne._fiff.utils import _read_segments_file
 from mne.io import BaseRaw
@@ -23,10 +30,11 @@ from mne_bids import BIDSPath
 
 from braindecode.datasets import BaseDataset
 
+from . import downloader
+from .bids_eeg_metadata import enrich_from_participants
+from .logging import logger
 from .paths import get_default_cache_dir
 
-logger = logging.getLogger("eegdash")
-
 
 class EEGDashBaseDataset(BaseDataset):
     """A single EEG recording hosted on AWS S3 and cached locally upon first access.
@@ -73,6 +81,7 @@ class EEGDashBaseDataset(BaseDataset):
         # Compute a dataset folder name under cache_dir that encodes preprocessing
         # (e.g., bdf, mini) to avoid overlapping with the original dataset cache.
         self.dataset_folder = record.get("dataset", "")
+        # TODO: remove this hack when competition is over
         if s3_bucket:
             suffixes: list[str] = []
             bucket_lower = str(s3_bucket).lower()
@@ -91,6 +100,7 @@ class EEGDashBaseDataset(BaseDataset):
             rel = Path(self.dataset_folder) / rel
         self.filecache = self.cache_dir / rel
         self.bids_root = self.cache_dir / self.dataset_folder
+
         self.bidspath = BIDSPath(
             root=self.bids_root,
             datatype="eeg",
@@ -98,113 +108,18 @@ class EEGDashBaseDataset(BaseDataset):
             **self.bids_kwargs,
         )
 
-        self.s3file = self._get_s3path(record["bidspath"])
+        self.s3file = downloader.get_s3path(self.s3_bucket, record["bidspath"])
         self.bids_dependencies = record["bidsdependencies"]
-
-        #
+        self.bids_dependencies_original = record["bidsdependencies"]
+        # TODO: removing temporary fix for BIDS dependencies path
+        # when the competition is over and dataset is digested properly
         if not self.s3_open_neuro:
-            self.bids_dependencies_original = self.bids_dependencies
             self.bids_dependencies = [
                 dep.split("/", 1)[1] for dep in self.bids_dependencies
             ]
 
         self._raw = None
 
-    def _get_s3path(self, filepath: str) -> str:
-        """Helper to form an AWS S3 URI for the given relative filepath."""
-        return f"{self.s3_bucket}/{filepath}"
-
-    def _download_s3(self) -> None:
-        """Download function that gets the raw EEG data from S3."""
-        filesystem = s3fs.S3FileSystem(
-            anon=True, client_kwargs={"region_name": "us-east-2"}
-        )
-        if not self.s3_open_neuro:
-            self.s3file = re.sub(r"(^|/)ds\d{6}/", r"\1", self.s3file, count=1)
-            if self.s3file.endswith(".set"):
-                self.s3file = self.s3file[:-4] + ".bdf"
-                self.filecache = self.filecache.with_suffix(".bdf")
-
-        self.filecache.parent.mkdir(parents=True, exist_ok=True)
-        info = filesystem.info(self.s3file)
-        size = info.get("size") or info.get("Size")
-
-        callback = TqdmCallback(
-            size=size,
-            tqdm_kwargs=dict(
-                desc=f"Downloading {Path(self.s3file).name}",
-                unit="B",
-                unit_scale=True,
-                unit_divisor=1024,
-                dynamic_ncols=True,
-                leave=True,
-                mininterval=0.2,
-                smoothing=0.1,
-                miniters=1,
-                bar_format="{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} "
-                "[{elapsed}<{remaining}, {rate_fmt}]",
-            ),
-        )
-        filesystem.get(self.s3file, self.filecache, callback=callback)
-
-        self.filenames = [self.filecache]
-
-    def _download_dependencies(self) -> None:
-        """Download all BIDS dependency files (metadata files, recording sidecar files)
-        from S3 and cache them locally.
-        """
-        filesystem = s3fs.S3FileSystem(
-            anon=True, client_kwargs={"region_name": "us-east-2"}
-        )
-        for i, dep in enumerate(self.bids_dependencies):
-            if not self.s3_open_neuro:
-                # fix this when our bucket is integrated into the
-                # mongodb
-                # if the file have ".set" replace to ".bdf"
-                if dep.endswith(".set"):
-                    dep = dep[:-4] + ".bdf"
-
-            s3path = self._get_s3path(dep)
-            if not self.s3_open_neuro:
-                dep = self.bids_dependencies_original[i]
-
-            dep_path = Path(dep)
-            if dep_path.parts and dep_path.parts[0] == self.record.get("dataset"):
-                dep_local = Path(self.dataset_folder, *dep_path.parts[1:])
-            else:
-                dep_local = Path(self.dataset_folder) / dep_path
-            filepath = self.cache_dir / dep_local
-            if not self.s3_open_neuro:
-                if filepath.suffix == ".set":
-                    filepath = filepath.with_suffix(".bdf")
-                if self.filecache.suffix == ".set":
-                    self.filecache = self.filecache.with_suffix(".bdf")
-
-            # here, we download the dependency and it is fine
-            # in the case of the competition.
-            if not filepath.exists():
-                filepath.parent.mkdir(parents=True, exist_ok=True)
-                info = filesystem.info(s3path)
-                size = info.get("size") or info.get("Size")
-
-                callback = TqdmCallback(
-                    size=size,
-                    tqdm_kwargs=dict(
-                        desc=f"Downloading {Path(s3path).name}",
-                        unit="B",
-                        unit_scale=True,
-                        unit_divisor=1024,
-                        dynamic_ncols=True,
-                        leave=True,
-                        mininterval=0.2,
-                        smoothing=0.1,
-                        miniters=1,
-                        bar_format="{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} "
-                        "[{elapsed}<{remaining}, {rate_fmt}]",
-                    ),
-                )
-                filesystem.get(s3path, filepath, callback=callback)
-
     def _get_raw_bids_args(self) -> dict[str, Any]:
         """Helper to restrict the metadata record to the fields needed to locate a BIDS
         recording.
@@ -222,130 +137,43 @@ class EEGDashBaseDataset(BaseDataset):
 
         if not os.path.exists(self.filecache):  # not preload
             if self.bids_dependencies:
-                self._download_dependencies()
-            self._download_s3()
+                downloader.download_dependencies(
+                    s3_bucket=self.s3_bucket,
+                    bids_dependencies=self.bids_dependencies,
+                    bids_dependencies_original=self.bids_dependencies_original,
+                    cache_dir=self.cache_dir,
+                    dataset_folder=self.dataset_folder,
+                    record=self.record,
+                    s3_open_neuro=self.s3_open_neuro,
+                )
+            self.filecache = downloader.download_s3_file(
+                self.s3file, self.filecache, self.s3_open_neuro
+            )
+            self.filenames = [self.filecache]
         if self._raw is None:
-            # capturing any warnings
-            # to-do: remove this once is fixed on the mne-bids side.
-            with warnings.catch_warnings(record=True) as w:
-                # Ensure all warnings are captured into 'w' and not shown to users
-                warnings.simplefilter("always")
-                try:
-                    # mne-bids emits RuntimeWarnings to stderr; silence stderr during read
-                    _stderr_buffer = io.StringIO()
-                    with redirect_stderr(_stderr_buffer):
-                        self._raw = mne_bids.read_raw_bids(
-                            bids_path=self.bidspath, verbose="ERROR"
-                        )
-                    # Parse unmapped participants.tsv fields reported by mne-bids and
-                    # inject them into Raw.info and the dataset description generically.
-                    extras = self._extract_unmapped_participants_from_warnings(w)
-                    if extras:
-                        # 1) Attach to Raw.info under subject_info.participants_extras
-                        try:
-                            subject_info = self._raw.info.get("subject_info") or {}
-                            if not isinstance(subject_info, dict):
-                                subject_info = {}
-                            pe = subject_info.get("participants_extras") or {}
-                            if not isinstance(pe, dict):
-                                pe = {}
-                            # Merge without overwriting
-                            for k, v in extras.items():
-                                pe.setdefault(k, v)
-                            subject_info["participants_extras"] = pe
-                            self._raw.info["subject_info"] = subject_info
-                        except Exception:
-                            # Non-fatal; continue
-                            pass
-
-                        # 2) Also add to this dataset's description, if possible, so
-                        # targets can be selected later without naming specifics.
-                        try:
-                            if isinstance(self.description, dict):
-                                for k, v in extras.items():
-                                    self.description.setdefault(k, v)
-                            elif isinstance(self.description, pd.Series):
-                                for k, v in extras.items():
-                                    if k not in self.description.index:
-                                        self.description.loc[k] = v
-                        except Exception:
-                            pass
-                except Exception as e:
-                    logger.error(
-                        f"Error while reading BIDS file: {self.bidspath}\n"
-                        "This may be due to a missing or corrupted file.\n"
-                        "Please check the file and try again."
-                    )
-                    logger.error(f"Exception: {e}")
-                    logger.error(traceback.format_exc())
-                    raise e
-                # Filter noisy mapping notices from mne-bids; surface others
-                for captured_warning in w:
-                    try:
-                        msg = str(captured_warning.message)
-                    except Exception:
-                        continue
-                    # Suppress verbose participants mapping messages
-                    if "Unable to map the following column" in msg and "MNE" in msg:
-                        logger.debug(
-                            "Suppressed mne-bids mapping warning while reading BIDS file: %s",
-                            msg,
-                        )
-                        continue
-
-    def _extract_unmapped_participants_from_warnings(
-        self, warnings_list: list[Any]
-    ) -> dict[str, Any]:
-        """Scan captured warnings from mne-bids and extract unmapped participants.tsv
-        entries in a generic way.
-
-        Optionally, the column name can carry a note in parentheses that we ignore
-        for key/value extraction. Returns a mapping of column name -> raw value.
-        """
-        extras: dict[str, Any] = {}
-        header = "Unable to map the following column(s) to MNE:"
-        for wr in warnings_list:
-            try:
-                msg = str(wr.message)
-            except Exception:
-                continue
-            if header not in msg:
-                continue
-            lines = msg.splitlines()
-            # Find the header line, then parse subsequent lines as entries
             try:
-                …
-        """Main function to access a sample from the dataset."""
-        X = self.raw[:, index][0]
-        y = None
-        if self.target_name is not None:
-            y = self.description[self.target_name]
-            if isinstance(y, pd.Series):
-                y = y.to_list()
-        if self.transform is not None:
-            X = self.transform(X)
-        return X, y
+                # mne-bids can emit noisy warnings to stderr; keep user logs clean
+                _stderr_buffer = io.StringIO()
+                with redirect_stderr(_stderr_buffer):
+                    self._raw = mne_bids.read_raw_bids(
+                        bids_path=self.bidspath, verbose="ERROR"
+                    )
+                # Enrich Raw.info and description with participants.tsv extras
+                enrich_from_participants(
+                    self.bids_root, self.bidspath, self._raw, self.description
+                )
+
+            except Exception as e:
+                logger.error(
+                    f"Error while reading BIDS file: {self.bidspath}\n"
+                    "This may be due to a missing or corrupted file.\n"
+                    "Please check the file and try again.\n"
+                    "Usually erasing the local cache and re-downloading helps.\n"
+                    f"`rm {self.bidspath}`"
+                )
                logger.error(f"Exception: {e}")
                logger.error(traceback.format_exc())
                raise e
 
     def __len__(self) -> int:
         """Return the number of samples in the dataset."""
@@ -426,13 +254,16 @@ class EEGDashBaseRaw(BaseRaw):
             ch_types.append(chtype)
         info = mne.create_info(ch_names=ch_names, sfreq=sfreq, ch_types=ch_types)
 
-        self.s3file = self._get_s3path(input_fname)
+        self.s3file = downloader.get_s3path(self._AWS_BUCKET, input_fname)
         self.cache_dir = Path(cache_dir) if cache_dir else get_default_cache_dir()
         self.filecache = self.cache_dir / input_fname
         self.bids_dependencies = bids_dependencies
 
         if preload and not os.path.exists(self.filecache):
-            self._download_s3()
+            self.filecache = downloader.download_s3_file(
+                self.s3file, self.filecache, self.s3_open_neuro
+            )
+            self.filenames = [self.filecache]
             preload = self.filecache
 
         super().__init__(
@@ -443,35 +274,24 @@ class EEGDashBaseRaw(BaseRaw):
             verbose=verbose,
         )
 
-    def _get_s3path(self, filepath):
-        return f"{self._AWS_BUCKET}/{filepath}"
-
-    def _download_s3(self) -> None:
-        self.filecache.parent.mkdir(parents=True, exist_ok=True)
-        filesystem = s3fs.S3FileSystem(
-            anon=True, client_kwargs={"region_name": "us-east-2"}
-        )
-        filesystem.download(self.s3file, self.filecache)
-        self.filenames = [self.filecache]
-
-    def _download_dependencies(self):
-        filesystem = s3fs.S3FileSystem(
-            anon=True, client_kwargs={"region_name": "us-east-2"}
-        )
-        for dep in self.bids_dependencies:
-            s3path = self._get_s3path(dep)
-            filepath = self.cache_dir / dep
-            if not filepath.exists():
-                filepath.parent.mkdir(parents=True, exist_ok=True)
-                filesystem.download(s3path, filepath)
-
     def _read_segment(
         self, start=0, stop=None, sel=None, data_buffer=None, *, verbose=None
     ):
         if not os.path.exists(self.filecache):  # not preload
-            if self.bids_dependencies:
-                self._download_dependencies()
-            self._download_s3()
+            if self.bids_dependencies:  # this is use only to sidecars for now
+                downloader.download_dependencies(
+                    s3_bucket=self._AWS_BUCKET,
+                    bids_dependencies=self.bids_dependencies,
+                    bids_dependencies_original=None,
+                    cache_dir=self.cache_dir,
+                    dataset_folder=self.filecache,
+                    record={},
+                    s3_open_neuro=self.s3_open_neuro,
+                )
+            self.filecache = downloader.download_s3_file(
+                self.s3file, self.filecache, self.s3_open_neuro
+            )
+            self.filenames = [self.filecache]
         else:  # not preload and file is not cached
            self.filenames = [self.filecache]
        return super()._read_segment(start, stop, sel, data_buffer, verbose=verbose)
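
These call sites are the only view this diff gives of the new eegdash.downloader module (added as eegdash/downloader.py, +187 lines, not shown here). As a rough sketch of how the pieces fit together, with the bucket, record, and cache paths below purely hypothetical and the signatures inferred from the call sites above only:

    from pathlib import Path

    from eegdash import downloader

    cache_dir = Path.home() / "eegdash"  # hypothetical cache location
    record = {  # hypothetical metadata record, mirroring the fields used above
        "dataset": "ds004844",
        "bidspath": "ds004844/sub-001/eeg/sub-001_task-rest_eeg.set",
        "bidsdependencies": ["ds004844/participants.tsv"],
    }
    bucket = "s3://openneuro.org"  # hypothetical bucket name

    # Form the S3 URI, fetch the BIDS sidecar files, then the recording itself.
    s3file = downloader.get_s3path(bucket, record["bidspath"])
    downloader.download_dependencies(
        s3_bucket=bucket,
        bids_dependencies=record["bidsdependencies"],
        bids_dependencies_original=record["bidsdependencies"],
        cache_dir=cache_dir,
        dataset_folder=record["dataset"],
        record=record,
        s3_open_neuro=True,
    )
    # The returned path is authoritative: for the competition buckets the
    # helper may substitute .set with .bdf, as the removed _download_s3 did.
    filecache = downloader.download_s3_file(
        s3file, cache_dir / record["bidspath"], True  # third arg: s3_open_neuro
    )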

eegdash/dataset/__init__.py
CHANGED

@@ -1,4 +1,22 @@
+"""Public API for dataset helpers and dynamically generated datasets."""
+
+from . import dataset as _dataset_mod  # triggers dynamic class registration
 from .dataset import EEGChallengeDataset
 from .registry import register_openneuro_datasets
 
-
+# Re-export dynamically generated dataset classes at the package level so that
+# ``eegdash.dataset`` shows them in the API docs and users can import as
+# ``from eegdash.dataset import DSXXXXX``.
+_dyn_names = []
+for _name in getattr(_dataset_mod, "__all__", []):
+    if _name == "EEGChallengeDataset":
+        # Already imported explicitly above
+        continue
+    _obj = getattr(_dataset_mod, _name, None)
+    if _obj is not None:
+        globals()[_name] = _obj
+        _dyn_names.append(_name)
+
+__all__ = ["EEGChallengeDataset", "register_openneuro_datasets"] + _dyn_names
+
+del _dataset_mod, _name, _obj, _dyn_names
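
Assuming registration succeeds at import time, this re-export loop makes each generated class importable directly from the package. A minimal usage sketch (DS004844 is illustrative; ds004844 appears in dataset_summary.csv, and the upper-case class naming follows the generated docstring examples in registry.py):

    from eegdash.dataset import EEGChallengeDataset  # explicit export
    from eegdash.dataset import DS004844  # dynamically generated and re-exported

    ds = DS004844(cache_dir="./data")  # hypothetical cache directory
    print(len(ds))  # recordings matching the pre-configured dataset filter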

eegdash/dataset/dataset.py
CHANGED

@@ -1,15 +1,15 @@
-import logging
 from pathlib import Path
 
-from …
+from rich.console import Console
+from rich.panel import Panel
+from rich.text import Text
 
 from ..api import EEGDashDataset
 from ..bids_eeg_metadata import build_query_from_kwargs
 from ..const import RELEASE_TO_OPENNEURO_DATASET_MAP, SUBJECT_MINI_RELEASE_MAP
+from ..logging import logger
 from .registry import register_openneuro_datasets
 
-logger = logging.getLogger("eegdash")
-
 
 class EEGChallengeDataset(EEGDashDataset):
     """EEG 2025 Challenge dataset helper.
@@ -23,8 +23,6 @@ class EEGChallengeDataset(EEGDashDataset):
     ----------
     release : str
         Release name. One of ["R1", ..., "R11"].
-    cache_dir : str
-        Local cache directory for data files.
     mini : bool, default True
         If True, restrict subjects to the challenge mini subset.
     query : dict | None
@@ -123,24 +121,32 @@ class EEGChallengeDataset(EEGDashDataset):
         else:
             s3_bucket = f"{s3_bucket}/{release}_L100_bdf"
 
-        …
-            "\n\n"
-            "[EEGChallengeDataset] EEG 2025 Competition Data Notice:\n"
-            "-------------------------------------------------------\n"
+        message_text = Text.from_markup(
             "This object loads the HBN dataset that has been preprocessed for the EEG Challenge:\n"
-            "…
-            "…
-            "…
-            "…
-            "…
-            "\n"
-            "IMPORTANT: The data accessed via `EEGChallengeDataset` is NOT identical to what you get from …
-            "If you are participating in the competition, always use `EEGChallengeDataset` to ensure consistency with the challenge data …
-            …
+            " * Downsampled from 500Hz to 100Hz\n"
+            " * Bandpass filtered (0.5-50 Hz)\n\n"
+            "For full preprocessing applied for competition details, see:\n"
+            " [link=https://github.com/eeg2025/downsample-datasets]https://github.com/eeg2025/downsample-datasets[/link]\n\n"
+            "The HBN dataset have some preprocessing applied by the HBN team:\n"
+            " * Re-reference (Cz Channel)\n\n"
+            "[bold red]IMPORTANT[/bold red]: The data accessed via `EEGChallengeDataset` is [u]NOT[/u] identical to what you get from [link=https://github.com/sccn/EEGDash/blob/develop/eegdash/api.py]EEGDashDataset[/link] directly.\n"
+            "If you are participating in the competition, always use `EEGChallengeDataset` to ensure consistency with the challenge data."
+        )
+
+        warning_panel = Panel(
+            message_text,
+            title="[yellow]EEG 2025 Competition Data Notice[/yellow]",
+            subtitle="[cyan]Source: EEGChallengeDataset[/cyan]",
+            border_style="yellow",
         )
 
+        # Render the panel directly to the console so it displays in IPython/terminals
+        try:
+            Console().print(warning_panel)
+        except Exception:
+            warning_message = str(message_text)
+            logger.warning(warning_message)
+
         super().__init__(
             dataset=RELEASE_TO_OPENNEURO_DATASET_MAP[release],
             query=query,
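
The constructor now renders the notice as a rich Panel and falls back to a plain logger.warning if console rendering fails. The same pattern, reduced to a standalone sketch (the show_notice helper is illustrative, not part of eegdash; the rich APIs are the ones imported above):

    from rich.console import Console
    from rich.panel import Panel
    from rich.text import Text

    from eegdash.logging import logger  # central logger added in this release


    def show_notice(markup: str) -> None:
        """Render a rich panel; degrade to a plain log record if rendering fails."""
        text = Text.from_markup(markup)
        panel = Panel(text, title="Notice", border_style="yellow")
        try:
            Console().print(panel)
        except Exception:
            # str() on a rich Text drops the markup styling, leaving plain text
            logger.warning(str(text))


    show_notice("[bold red]IMPORTANT[/bold red]: challenge data is preprocessed.")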

eegdash/dataset/dataset_summary.csv
CHANGED

@@ -10,7 +10,6 @@
 8,ds005508,3342,324,10,129,500,269.281,229.81 GB,246753736933,0,,,,,
 9,ds005507,1812,184,10,129,500,168.649,139.37 GB,149646718160,0,,,,,
 10,ds005506,1405,150,10,129,500,127.896,111.88 GB,120126449650,0,,,,,
-11,test,2,1,1,64,500,20.556,0 B,0,0,,,,,
 12,ds004854,1,1,1,64,128,0.535,79.21 MB,83057080,0,,,,,
 13,ds004853,1,1,1,64,128,0.535,79.21 MB,83057080,0,,,,,
 14,ds004844,68,17,1,64,1024,21.252,22.33 GB,23976121966,0,ds004844,,,Multisensory,Decision-making

eegdash/dataset/registry.py
CHANGED

@@ -57,14 +57,8 @@ def register_openneuro_datasets(
 
         init = make_init(dataset_id)
 
-        …
-        …
-    {_markdown_table(row_series)}
-
-    This class is a thin convenience wrapper for the dataset ``{dataset_id}``.
-    Constructor arguments are forwarded to :class:`{base_class.__name__}`; see the
-    base class documentation for parameter details and examples.
-    """
+        # Generate rich docstring with dataset metadata
+        doc = _generate_rich_docstring(dataset_id, row_series, base_class)
 
         # init.__doc__ = doc
 
@@ -90,6 +84,94 @@ def register_openneuro_datasets(
     return registered
 
 
+def _generate_rich_docstring(dataset_id: str, row_series: pd.Series, base_class) -> str:
+    """Generate a comprehensive docstring for a dataset class."""
+    # Extract metadata with safe defaults
+    n_subjects = row_series.get("n_subjects", "Unknown")
+    n_records = row_series.get("n_records", "Unknown")
+    n_tasks = row_series.get("n_tasks", "Unknown")
+    modality = row_series.get("modality of exp", "")
+    exp_type = row_series.get("type of exp", "")
+    subject_type = row_series.get("Type Subject", "")
+    duration = row_series.get("duration_hours_total", "Unknown")
+    size = row_series.get("size", "Unknown")
+
+    # Create description based on available metadata
+    description_parts = []
+    if modality and str(modality).strip():
+        description_parts.append(f"**Modality**: {modality}")
+    if exp_type and str(exp_type).strip():
+        description_parts.append(f"**Type**: {exp_type}")
+    if subject_type and str(subject_type).strip():
+        description_parts.append(f"**Subjects**: {subject_type}")
+
+    description = (
+        " | ".join(description_parts)
+        if description_parts
+        else "EEG dataset from OpenNeuro"
+    )
+
+    # Generate the docstring
+    docstring = f"""OpenNeuro dataset ``{dataset_id}``.
+
+{description}
+
+This dataset contains {n_subjects} subjects with {n_records} recordings across {n_tasks} tasks.
+Total duration: {duration} hours. Dataset size: {size}.
+
+{_markdown_table(row_series)}
+
+This dataset class provides convenient access to the ``{dataset_id}`` dataset through the EEGDash interface.
+It inherits all functionality from :class:`~{base_class.__module__}.{base_class.__name__}` with the dataset filter pre-configured.
+
+Parameters
+----------
+cache_dir : str
+    Directory to cache downloaded data.
+query : dict, optional
+    Additional MongoDB-style filters to AND with the dataset selection.
+    Must not contain the key ``dataset``.
+s3_bucket : str, optional
+    Base S3 bucket used to locate the data.
+**kwargs
+    Additional arguments passed to the base dataset class.
+
+Examples
+--------
+Basic usage:
+
+>>> from eegdash.dataset import {dataset_id.upper()}
+>>> dataset = {dataset_id.upper()}(cache_dir="./data")
+>>> print(f"Number of recordings: {{len(dataset)}}")
+
+Load a specific recording:
+
+>>> if len(dataset) > 0:
+...     recording = dataset[0]
+...     raw = recording.load()
+...     print(f"Sampling rate: {{raw.info['sfreq']}} Hz")
+...     print(f"Number of channels: {{len(raw.ch_names)}}")
+
+Filter by additional criteria:
+
+>>> # Get subset with specific task or subject
+>>> filtered_dataset = {dataset_id.upper()}(
+...     cache_dir="./data",
+...     query={{"task": "RestingState"}}  # if applicable
... )
+
+Notes
+-----
+More details available in the `NEMAR documentation <https://nemar.org/dataexplorer/detail?dataset_id={dataset_id}>`__.
+
+See Also
+--------
+{base_class.__name__} : Base dataset class with full API documentation
+"""
+
+    return docstring
+
+
 def _markdown_table(row_series: pd.Series) -> str:
     """Create a reStructuredText grid table from a pandas Series."""
     if row_series.empty:
@@ -128,7 +210,12 @@ def _markdown_table(row_series: pd.Series) -> str:
     table = tabulate(df, headers="keys", tablefmt="rst", showindex=False)
 
     # Add a caption for the table
-    …
+    # Use an anonymous external link (double underscore) to avoid duplicate
+    # target warnings when this docstring is repeated across many classes.
+    caption = (
+        f"Short overview of dataset {dataset_id} more details in the "
+        f"`NeMAR documentation <https://nemar.org/dataexplorer/detail?dataset_id={dataset_id}>`__."
+    )
     # adding caption below the table
     # Indent the table to fit within the admonition block
     indented_table = "\n".join(" " + line for line in table.split("\n"))
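
The comment about anonymous external links refers to standard reStructuredText behavior: a single trailing underscore registers a named hyperlink target, so repeating the same link text with different URLs across many generated docstrings makes Sphinx warn about duplicate targets, while a double underscore creates an anonymous target that can be repeated freely. A small illustration, using the NeMAR URL pattern from the code above:

    # Named target (single "_"): registers the name "NeMAR documentation";
    # emitting this with a different URL in another docstring triggers
    # "Duplicate explicit target name" warnings in Sphinx.
    named = "`NeMAR documentation <https://nemar.org/dataexplorer/detail?dataset_id=ds004853>`_"

    # Anonymous target (double "__"): no name is registered, so each generated
    # class docstring can safely carry its own copy of the caption.
    anonymous = "`NeMAR documentation <https://nemar.org/dataexplorer/detail?dataset_id=ds004854>`__"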