eegdash 0.3.6.dev183416654__py3-none-any.whl → 0.3.7.dev105__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eegdash/__init__.py +5 -4
- eegdash/api.py +445 -413
- eegdash/bids_eeg_metadata.py +184 -0
- eegdash/{dataset.py → const.py} +46 -93
- eegdash/data_utils.py +68 -28
- eegdash/dataset/__init__.py +4 -0
- eegdash/dataset/dataset.py +161 -0
- eegdash/{registry.py → dataset/registry.py} +3 -3
- eegdash/utils.py +1 -1
- {eegdash-0.3.6.dev183416654.dist-info → eegdash-0.3.7.dev105.dist-info}/METADATA +1 -1
- {eegdash-0.3.6.dev183416654.dist-info → eegdash-0.3.7.dev105.dist-info}/RECORD +14 -14
- eegdash/data_config.py +0 -34
- eegdash/dataset_summary.csv +0 -256
- eegdash/preprocessing.py +0 -63
- {eegdash-0.3.6.dev183416654.dist-info → eegdash-0.3.7.dev105.dist-info}/WHEEL +0 -0
- {eegdash-0.3.6.dev183416654.dist-info → eegdash-0.3.7.dev105.dist-info}/licenses/LICENSE +0 -0
- {eegdash-0.3.6.dev183416654.dist-info → eegdash-0.3.7.dev105.dist-info}/top_level.txt +0 -0
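The `api.py` diff below dominates this release. For code that imported the moved modules, the renames in the file list above (`dataset.py → const.py`, `registry.py → dataset/registry.py`, new `bids_eeg_metadata.py`) suggest an import migration roughly like the following sketch. The diff does not show each module's re-exports, so treat these paths as assumptions to verify against the installed package.

```python
# Import-path migration sketch (assumed from the file renames above).
from eegdash.const import ALLOWED_QUERY_FIELDS       # constants moved out of dataset.py
from eegdash.bids_eeg_metadata import (              # new module in 0.3.7
    build_query_from_kwargs,
    load_eeg_attrs_from_bids_file,
)
from eegdash.dataset import registry                 # was eegdash/registry.py
```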
eegdash/api.py
CHANGED
@@ -3,6 +3,7 @@ import os
 import tempfile
 from pathlib import Path
 from typing import Any, Mapping
+from urllib.parse import urlsplit
 
 import mne
 import numpy as np
@@ -11,13 +12,18 @@ import xarray as xr
 from dotenv import load_dotenv
 from joblib import Parallel, delayed
 from mne.utils import warn
-from mne_bids import get_bids_path_from_fname, read_raw_bids
+from mne_bids import find_matching_paths, get_bids_path_from_fname, read_raw_bids
 from pymongo import InsertOne, UpdateOne
 from s3fs import S3FileSystem
 
 from braindecode.datasets import BaseConcatDataset
 
-from .
+from .bids_eeg_metadata import build_query_from_kwargs, load_eeg_attrs_from_bids_file
+from .const import (
+    ALLOWED_QUERY_FIELDS,
+    RELEASE_TO_OPENNEURO_DATASET_MAP,
+)
+from .const import config as data_config
 from .data_utils import EEGBIDSDataset, EEGDashBaseDataset
 from .mongodb import MongoConnectionManager
@@ -25,46 +31,31 @@ logger = logging.getLogger("eegdash")
 
 
 class EEGDash:
-    """
+    """High-level interface to the EEGDash metadata database.
 
-
-    EEGDash database (or
-
-
-    While this class provides basic support for loading EEG data, please see
-    the EEGDashDataset class for a more complete way to retrieve and work with full
-    datasets.
+    Provides methods to query, insert, and update metadata records stored in the
+    EEGDash MongoDB database (public or private). Also includes utilities to load
+    EEG data from S3 for matched records.
 
+    For working with collections of
+    recordings as PyTorch datasets, prefer :class:`EEGDashDataset`.
     """
 
-    _ALLOWED_QUERY_FIELDS = {
-        "data_name",
-        "dataset",
-        "subject",
-        "task",
-        "session",
-        "run",
-        "modality",
-        "sampling_frequency",
-        "nchans",
-        "ntimes",
-    }
-
    def __init__(self, *, is_public: bool = True, is_staging: bool = False) -> None:
-        """Create new
+        """Create a new EEGDash client.
 
         Parameters
         ----------
-        is_public: bool
-
-            private database instance
-            (or
-        is_staging: bool
-            If True
-            production database (
-
-
-
+        is_public : bool, default True
+            Connect to the public MongoDB database. If ``False``, connect to a
+            private database instance using the ``DB_CONNECTION_STRING`` environment
+            variable (or value from a ``.env`` file).
+        is_staging : bool, default False
+            If ``True``, use the staging database (``eegdashstaging``); otherwise
+            use the production database (``eegdash``).
+
+        Examples
+        --------
         >>> eegdash = EEGDash()
 
         """
@@ -105,23 +96,25 @@ class EEGDash:
 
         Parameters
         ----------
-        query: dict, optional
-
-
-
-
+        query : dict, optional
+            Complete MongoDB query dictionary. This is a positional-only
+            argument.
+        **kwargs
+            User-friendly field filters that are converted to a MongoDB query.
+            Values can be scalars (e.g., ``"sub-01"``) or sequences (translated
+            to ``$in`` queries).
 
         Returns
         -------
-        list
-
+        list of dict
+            DB records that match the query.
 
         """
         final_query: dict[str, Any] | None = None
 
         # Accept explicit empty dict {} to mean "match all"
         raw_query = query if isinstance(query, dict) else None
-        kwargs_query =
+        kwargs_query = build_query_from_kwargs(**kwargs) if kwargs else None
 
         # Determine presence, treating {} as a valid raw query
         has_raw = isinstance(raw_query, dict)
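A usage sketch of the reworked `find()` path, assuming a reachable public database; the entity values (`"012"`, `"RestingState"`) are illustrative, not taken from this diff:

```python
from eegdash.api import EEGDash

eegdash = EEGDash()  # public production DB
# Scalar kwargs become exact matches; sequences compile to {"$in": [...]}.
records = eegdash.find(dataset="ds002718", subject=["012", "013"])
# A raw query (positional) and kwargs can be combined; both are ANDed after validation.
records = eegdash.find({"task": "RestingState"}, dataset="ds002718")
```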
@@ -238,59 +231,12 @@ class EEGDash:
         return record
 
     def _build_query_from_kwargs(self, **kwargs) -> dict[str, Any]:
-        """
+        """Internal helper to build a validated MongoDB query from keyword args.
 
-
-
-        - For list/tuple/set values: strip strings, drop None/empties, deduplicate, and use `$in`
-        - Preserve scalars as exact matches
+        This delegates to the module-level builder used across the package and
+        is exposed here for testing and convenience.
         """
-
-        unknown_fields = set(kwargs.keys()) - self._ALLOWED_QUERY_FIELDS
-        if unknown_fields:
-            raise ValueError(
-                f"Unsupported query field(s): {', '.join(sorted(unknown_fields))}. "
-                f"Allowed fields are: {', '.join(sorted(self._ALLOWED_QUERY_FIELDS))}"
-            )
-
-        # 2. Construct the query dictionary
-        query = {}
-        for key, value in kwargs.items():
-            # None is not a valid constraint
-            if value is None:
-                raise ValueError(
-                    f"Received None for query parameter '{key}'. Provide a concrete value."
-                )
-
-            # Handle list-like values as multi-constraints
-            if isinstance(value, (list, tuple, set)):
-                cleaned: list[Any] = []
-                for item in value:
-                    if item is None:
-                        continue
-                    if isinstance(item, str):
-                        item = item.strip()
-                        if not item:
-                            continue
-                    cleaned.append(item)
-                # Deduplicate while preserving order
-                cleaned = list(dict.fromkeys(cleaned))
-                if not cleaned:
-                    raise ValueError(
-                        f"Received an empty list for query parameter '{key}'. This is not supported."
-                    )
-                query[key] = {"$in": cleaned}
-            else:
-                # Scalars: trim strings and validate
-                if isinstance(value, str):
-                    value = value.strip()
-                    if not value:
-                        raise ValueError(
-                            f"Received an empty string for query parameter '{key}'."
-                        )
-                query[key] = value
-
-        return query
+        return build_query_from_kwargs(**kwargs)
 
     # --- Query merging and conflict detection helpers ---
     def _extract_simple_constraint(self, query: dict[str, Any], key: str):
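The validation semantics of the removed inline builder (order-preserving dedup, `$in` for sequences, rejection of unknown or empty fields) are presumably preserved by the module-level `build_query_from_kwargs`; a sketch under that assumption:

```python
from eegdash.bids_eeg_metadata import build_query_from_kwargs

# Lists are cleaned (None and empty strings dropped, duplicates removed while
# preserving order) and compiled to $in; scalars stay exact matches.
q = build_query_from_kwargs(dataset="ds002718", subject=["012", "012", None])
# q == {"dataset": "ds002718", "subject": {"$in": ["012"]}}

# build_query_from_kwargs(foo="bar")  # would raise ValueError: unsupported field
```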
@@ -323,8 +269,8 @@ class EEGDash:
             return
 
         # Only consider fields we generally allow; skip meta operators like $and
-        raw_keys = set(raw_query.keys()) & self._ALLOWED_QUERY_FIELDS
-        kw_keys = set(kwargs_query.keys()) & self._ALLOWED_QUERY_FIELDS
+        raw_keys = set(raw_query.keys()) & ALLOWED_QUERY_FIELDS
+        kw_keys = set(kwargs_query.keys()) & ALLOWED_QUERY_FIELDS
         dup_keys = raw_keys & kw_keys
         for key in dup_keys:
             rc = self._extract_simple_constraint(raw_query, key)
@@ -359,44 +305,95 @@ class EEGDash:
         )
 
     def load_eeg_data_from_s3(self, s3path: str) -> xr.DataArray:
-        """Load
+        """Load EEG data from an S3 URI into an ``xarray.DataArray``.
+
+        Preserves the original filename, downloads sidecar files when applicable
+        (e.g., ``.fdt`` for EEGLAB, ``.vmrk``/``.eeg`` for BrainVision), and uses
+        MNE's direct readers.
 
         Parameters
         ----------
         s3path : str
-            An S3 URI (should start with "s3://")
+            An S3 URI (should start with "s3://").
 
         Returns
         -------
         xr.DataArray
-
+            EEG data with dimensions ``("channel", "time")``.
 
-
-
-
-
-        >>> mydata = eegdash.load_eeg_data_from_s3(mypath)
+        Raises
+        ------
+        ValueError
+            If the file extension is unsupported.
 
         """
-
-
-
-
-
-
-
+        # choose a temp dir so sidecars can be colocated
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # Derive local filenames from the S3 key to keep base name consistent
+            s3_key = urlsplit(s3path).path  # e.g., "/dsXXXX/sub-.../..._eeg.set"
+            basename = Path(s3_key).name
+            ext = Path(basename).suffix.lower()
+            local_main = Path(tmpdir) / basename
+
+            # Download main file
+            with (
+                self.filesystem.open(s3path, mode="rb") as fsrc,
+                open(local_main, "wb") as fdst,
+            ):
+                fdst.write(fsrc.read())
+
+            # Determine and fetch any required sidecars
+            sidecars: list[str] = []
+            if ext == ".set":  # EEGLAB
+                sidecars = [".fdt"]
+            elif ext == ".vhdr":  # BrainVision
+                sidecars = [".vmrk", ".eeg", ".dat", ".raw"]
+
+            for sc_ext in sidecars:
+                sc_key = s3_key[: -len(ext)] + sc_ext
+                sc_uri = f"s3://{urlsplit(s3path).netloc}{sc_key}"
+                try:
+                    # If sidecar exists, download next to the main file
+                    info = self.filesystem.info(sc_uri)
+                    if info:
+                        sc_local = Path(tmpdir) / Path(sc_key).name
+                        with (
+                            self.filesystem.open(sc_uri, mode="rb") as fsrc,
+                            open(sc_local, "wb") as fdst,
+                        ):
+                            fdst.write(fsrc.read())
+                except Exception:
+                    # Sidecar not present; skip silently
+                    pass
+
+            # Read using appropriate MNE reader
+            raw = mne.io.read_raw(str(local_main), preload=True, verbose=False)
+
+            data = raw.get_data()
+            fs = raw.info["sfreq"]
+            max_time = data.shape[1] / fs
+            time_steps = np.linspace(0, max_time, data.shape[1]).squeeze()
+            channel_names = raw.ch_names
+
+            return xr.DataArray(
+                data=data,
+                dims=["channel", "time"],
+                coords={"time": time_steps, "channel": channel_names},
+            )
 
     def load_eeg_data_from_bids_file(self, bids_file: str) -> xr.DataArray:
-        """Load EEG data from a local file
+        """Load EEG data from a local BIDS-formatted file.
 
         Parameters
         ----------
         bids_file : str
-            Path to
+            Path to a BIDS-compliant EEG file (e.g., ``*_eeg.edf``, ``*_eeg.bdf``,
+            ``*_eeg.vhdr``, ``*_eeg.set``).
 
-
-
-
+        Returns
+        -------
+        xr.DataArray
+            EEG data with dimensions ``("channel", "time")``.
 
         """
         bids_path = get_bids_path_from_fname(bids_file, verbose=False)
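The sidecar handling above is plain key arithmetic on the S3 URI; a self-contained sketch of the same derivation, using a hypothetical OpenNeuro-style key:

```python
from pathlib import Path
from urllib.parse import urlsplit

s3path = "s3://openneuro.org/ds002718/sub-012/eeg/sub-012_task-rest_eeg.set"
s3_key = urlsplit(s3path).path          # "/ds002718/.../sub-012_task-rest_eeg.set"
ext = Path(s3_key).suffix.lower()       # ".set" -> EEGLAB, may need a ".fdt" sidecar
sc_key = s3_key[: -len(ext)] + ".fdt"   # swap the extension on the same key
sc_uri = f"s3://{urlsplit(s3path).netloc}{sc_key}"
assert sc_uri == "s3://openneuro.org/ds002718/sub-012/eeg/sub-012_task-rest_eeg.fdt"
```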
@@ -416,140 +413,25 @@ class EEGDash:
         )
         return eeg_xarray
 
-    def get_raw_extensions(
-        self, bids_file: str, bids_dataset: EEGBIDSDataset
-    ) -> list[str]:
-        """Helper to find paths to additional "sidecar" files that may be associated
-        with a given main data file in a BIDS dataset; paths are returned as relative to
-        the parent dataset path.
-
-        For example, if the input file is a .set file, this will return the relative path
-        to a corresponding .fdt file (if any).
-        """
-        bids_file = Path(bids_file)
-        extensions = {
-            ".set": [".set", ".fdt"],  # eeglab
-            ".edf": [".edf"],  # european
-            ".vhdr": [".eeg", ".vhdr", ".vmrk", ".dat", ".raw"],  # brainvision
-            ".bdf": [".bdf"],  # biosemi
-        }
-        return [
-            str(bids_dataset.get_relative_bidspath(bids_file.with_suffix(suffix)))
-            for suffix in extensions[bids_file.suffix]
-            if bids_file.with_suffix(suffix).exists()
-        ]
-
-    def load_eeg_attrs_from_bids_file(
-        self, bids_dataset: EEGBIDSDataset, bids_file: str
-    ) -> dict[str, Any]:
-        """Build the metadata record for a given BIDS file (single recording) in a BIDS dataset.
-
-        Attributes are at least the ones defined in data_config attributes (set to None if missing),
-        but are typically a superset, and include, among others, the paths to relevant
-        meta-data files needed to load and interpret the file in question.
-
-        Parameters
-        ----------
-        bids_dataset : EEGBIDSDataset
-            The BIDS dataset object containing the file.
-        bids_file : str
-            The path to the BIDS file within the dataset.
-
-        Returns
-        -------
-        dict:
-            A dictionary representing the metadata record for the given file. This is the
-            same format as the records stored in the database.
-
-        """
-        if bids_file not in bids_dataset.files:
-            raise ValueError(f"{bids_file} not in {bids_dataset.dataset}")
-
-        # Initialize attrs with None values for all expected fields
-        attrs = {field: None for field in self.config["attributes"].keys()}
-
-        file = Path(bids_file).name
-        dsnumber = bids_dataset.dataset
-        # extract openneuro path by finding the first occurrence of the dataset name in the filename and remove the path before that
-        openneuro_path = dsnumber + bids_file.split(dsnumber)[1]
-
-        # Update with actual values where available
-        try:
-            participants_tsv = bids_dataset.subject_participant_tsv(bids_file)
-        except Exception as e:
-            logger.error("Error getting participants_tsv: %s", str(e))
-            participants_tsv = None
-
-        try:
-            eeg_json = bids_dataset.eeg_json(bids_file)
-        except Exception as e:
-            logger.error("Error getting eeg_json: %s", str(e))
-            eeg_json = None
-
-        bids_dependencies_files = self.config["bids_dependencies_files"]
-        bidsdependencies = []
-        for extension in bids_dependencies_files:
-            try:
-                dep_path = bids_dataset.get_bids_metadata_files(bids_file, extension)
-                dep_path = [
-                    str(bids_dataset.get_relative_bidspath(dep)) for dep in dep_path
-                ]
-                bidsdependencies.extend(dep_path)
-            except Exception:
-                pass
-
-        bidsdependencies.extend(self.get_raw_extensions(bids_file, bids_dataset))
-
-        # Define field extraction functions with error handling
-        field_extractors = {
-            "data_name": lambda: f"{bids_dataset.dataset}_{file}",
-            "dataset": lambda: bids_dataset.dataset,
-            "bidspath": lambda: openneuro_path,
-            "subject": lambda: bids_dataset.get_bids_file_attribute(
-                "subject", bids_file
-            ),
-            "task": lambda: bids_dataset.get_bids_file_attribute("task", bids_file),
-            "session": lambda: bids_dataset.get_bids_file_attribute(
-                "session", bids_file
-            ),
-            "run": lambda: bids_dataset.get_bids_file_attribute("run", bids_file),
-            "modality": lambda: bids_dataset.get_bids_file_attribute(
-                "modality", bids_file
-            ),
-            "sampling_frequency": lambda: bids_dataset.get_bids_file_attribute(
-                "sfreq", bids_file
-            ),
-            "nchans": lambda: bids_dataset.get_bids_file_attribute("nchans", bids_file),
-            "ntimes": lambda: bids_dataset.get_bids_file_attribute("ntimes", bids_file),
-            "participant_tsv": lambda: participants_tsv,
-            "eeg_json": lambda: eeg_json,
-            "bidsdependencies": lambda: bidsdependencies,
-        }
-
-        # Dynamically populate attrs with error handling
-        for field, extractor in field_extractors.items():
-            try:
-                attrs[field] = extractor()
-            except Exception as e:
-                logger.error("Error extracting %s : %s", field, str(e))
-                attrs[field] = None
-
-        return attrs
-
     def add_bids_dataset(
         self, dataset: str, data_dir: str, overwrite: bool = True
     ) -> None:
-        """
-        under the given dataset name.
+        """Scan a local BIDS dataset and upsert records into MongoDB.
 
         Parameters
         ----------
-        dataset : str
-
+        dataset : str
+            Dataset identifier (e.g., ``"ds002718"``).
         data_dir : str
-
-        overwrite : bool
-
+            Path to the local BIDS dataset directory.
+        overwrite : bool, default True
+            If ``True``, update existing records when encountered; otherwise,
+            skip records that already exist.
+
+        Raises
+        ------
+        ValueError
+            If called on a public client (``is_public=True``).
 
         """
         if self.is_public:
@@ -564,7 +446,7 @@ class EEGDash:
                 dataset=dataset,
             )
         except Exception as e:
-            logger.error("Error creating bids dataset %s:
+            logger.error("Error creating bids dataset %s: %s", dataset, str(e))
             raise e
         requests = []
         for bids_file in bids_dataset.get_files():
@@ -573,15 +455,13 @@ class EEGDash:
 
                 if self.exist({"data_name": data_id}):
                     if overwrite:
-                        eeg_attrs = self.load_eeg_attrs_from_bids_file(
+                        eeg_attrs = load_eeg_attrs_from_bids_file(
                             bids_dataset, bids_file
                         )
-                        requests.append(self.update_request(eeg_attrs))
+                        requests.append(self._update_request(eeg_attrs))
                     else:
-                        eeg_attrs = self.load_eeg_attrs_from_bids_file(
-                            bids_dataset, bids_file
-                        )
-                        requests.append(self.add_request(eeg_attrs))
+                        eeg_attrs = load_eeg_attrs_from_bids_file(bids_dataset, bids_file)
+                        requests.append(self._add_request(eeg_attrs))
             except Exception as e:
                 logger.error("Error adding record %s", bids_file)
                 logger.error(str(e))
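Ingestion remains restricted to private clients. A sketch assuming `DB_CONNECTION_STRING` is set in the environment (or a `.env` file) and a local BIDS tree exists at a hypothetical path:

```python
from eegdash.api import EEGDash

client = EEGDash(is_public=False)   # add_bids_dataset raises on a public client
client.add_bids_dataset(
    dataset="ds002718",             # dataset identifier
    data_dir="/data/ds002718",      # hypothetical local BIDS root
    overwrite=True,                 # update records that already exist
)
```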
@@ -597,22 +477,22 @@ class EEGDash:
         logger.info("Errors: %s ", result.bulk_api_result.get("writeErrors", []))
 
     def get(self, query: dict[str, Any]) -> list[xr.DataArray]:
-        """
-        the `find()` method for details on the query format.
+        """Download and return EEG data arrays for records matching a query.
 
         Parameters
         ----------
         query : dict
-
-            document that is used to match records in the MongoDB collection.
+            MongoDB query used to select records.
 
         Returns
         -------
-
+        list of xr.DataArray
+            EEG data for each matching record, with dimensions ``("channel", "time")``.
 
         Notes
         -----
-        Retrieval
+        Retrieval runs in parallel. Downloaded files are read and discarded
+        (no on-disk caching here).
 
         """
         sessions = self.find(query)
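`get()` composes `find()` with the parallel S3 loader above; a sketch that requires network access to the public bucket (dataset id and subject value are illustrative):

```python
from eegdash.api import EEGDash

eegdash = EEGDash()
arrays = eegdash.get({"dataset": "ds002718", "subject": "012"})
for arr in arrays:
    print(arr.dims)  # ("channel", "time")
```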
@@ -622,12 +502,40 @@ class EEGDash:
         results = Parallel(
             n_jobs=-1 if len(sessions) > 1 else 1, prefer="threads", verbose=1
         )(
-            delayed(self.load_eeg_data_from_s3)(self.get_s3path(session))
+            delayed(self.load_eeg_data_from_s3)(self._get_s3path(session))
             for session in sessions
         )
         return results
 
-    def get_s3path(self, record):
+    def _get_s3path(self, record: Mapping[str, Any] | str) -> str:
+        """Build an S3 URI from a DB record or a relative path.
+
+        Parameters
+        ----------
+        record : dict or str
+            Either a DB record containing a ``'bidspath'`` key, or a relative
+            path string under the OpenNeuro bucket.
+
+        Returns
+        -------
+        str
+            Fully qualified S3 URI.
+
+        Raises
+        ------
+        ValueError
+            If a mapping is provided but ``'bidspath'`` is missing.
+
+        """
+        if isinstance(record, str):
+            rel = record
+        else:
+            rel = record.get("bidspath")
+            if not rel:
+                raise ValueError("Record missing 'bidspath' for S3 path resolution")
+        return f"s3://openneuro.org/{rel}"
+
+    def _add_request(self, record: dict):
         """Internal helper method to create a MongoDB insertion request for a record."""
         return InsertOne(record)
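The new `_get_s3path` (an internal helper, per the leading underscore) accepts either a full record or a bare relative path; a sketch of both call forms using a hypothetical `bidspath`:

```python
from eegdash.api import EEGDash

client = EEGDash()
rec = {"bidspath": "ds002718/sub-012/eeg/sub-012_task-rest_eeg.set"}
assert client._get_s3path(rec) == (
    "s3://openneuro.org/ds002718/sub-012/eeg/sub-012_task-rest_eeg.set"
)
assert client._get_s3path(rec["bidspath"]) == client._get_s3path(rec)
# client._get_s3path({})  # would raise ValueError: missing 'bidspath'
```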
@@ -641,12 +549,19 @@ class EEGDash:
         except:
             logger.error("Error adding record: %s ", record["data_name"])
 
-    def update_request(self, record: dict):
+    def _update_request(self, record: dict):
         """Internal helper method to create a MongoDB update request for a record."""
         return UpdateOne({"data_name": record["data_name"]}, {"$set": record})
 
     def update(self, record: dict):
-        """Update a single record in the MongoDB collection."""
+        """Update a single record in the MongoDB collection.
+
+        Parameters
+        ----------
+        record : dict
+            Record content to set at the matching ``data_name``.
+
+        """
         try:
             self.__collection.update_one(
                 {"data_name": record["data_name"]}, {"$set": record}
@@ -654,15 +569,33 @@ class EEGDash:
         except:  # silent failure
             logger.error("Error updating record: %s", record["data_name"])
 
+    def exists(self, query: dict[str, Any]) -> bool:
+        """Alias for :meth:`exist` provided for API clarity."""
+        return self.exist(query)
+
     def remove_field(self, record, field):
-        """Remove a specific field from a record in the MongoDB collection."""
+        """Remove a specific field from a record in the MongoDB collection.
+
+        Parameters
+        ----------
+        record : dict
+            Record identifying object with ``data_name``.
+        field : str
+            Field name to remove.
+
+        """
         self.__collection.update_one(
             {"data_name": record["data_name"]}, {"$unset": {field: 1}}
         )
 
     def remove_field_from_db(self, field):
-        """
-
+        """Remove a field from all records (destructive).
+
+        Parameters
+        ----------
+        field : str
+            Field name to remove from every document.
+
         """
         self.__collection.update_many({}, {"$unset": {field: 1}})
@@ -672,11 +605,13 @@ class EEGDash:
         return self.__collection
 
     def close(self):
-        """
+        """Backward-compatibility no-op; connections are managed globally.
+
+        Notes
+        -----
+        Connections are managed by :class:`MongoConnectionManager`. Use
+        :meth:`close_all_connections` to explicitly close all clients.
 
-        Note: Since MongoDB clients are now managed by a singleton,
-        this method no longer closes connections. Use close_all_connections()
-        class method to close all connections if needed.
         """
         # Individual instances no longer close the shared client
         pass
@@ -687,7 +622,7 @@ class EEGDash:
         MongoConnectionManager.close_all()
 
     def __del__(self):
-        """
+        """Destructor; no explicit action needed due to global connection manager."""
         # No longer needed since we're using singleton pattern
         pass
 
@@ -707,16 +642,16 @@ class EEGDashDataset(BaseConcatDataset):
             "sex",
         ],
         s3_bucket: str | None = None,
-        eeg_dash_instance=None,
         records: list[dict] | None = None,
-        offline_mode: bool = False,
+        download: bool = True,
+        n_jobs: int = -1,
+        eeg_dash_instance: EEGDash | None = None,
         **kwargs,
     ):
         """Create a new EEGDashDataset from a given query or local BIDS dataset directory
         and dataset name. An EEGDashDataset is pooled collection of EEGDashBaseDataset
         instances (individual recordings) and is a subclass of braindecode's BaseConcatDataset.
 
-
         Querying Examples:
         ------------------
         # Find by single subject
@@ -732,108 +667,267 @@ class EEGDashDataset(BaseConcatDataset):
 
         Parameters
         ----------
+        cache_dir : str | Path
+            Directory where data are cached locally. If not specified, a default
+            cache directory under the user cache is used.
         query : dict | None
-
-
-
-
-
-            A directory where the dataset will be cached locally.
-        data_dir : str | None
-            Optionally a string specifying a local BIDS dataset directory from which to load the EEG data files. Exactly one
-            of query or data_dir must be provided.
-        dataset : str | None
-            If data_dir is given, a name for the dataset to be loaded.
+            Raw MongoDB query to filter records. If provided, it is merged with
+            keyword filtering arguments (see ``**kwargs``) using logical AND.
+            You must provide at least a ``dataset`` (either in ``query`` or
+            as a keyword argument). Only fields in ``ALLOWED_QUERY_FIELDS`` are
+            considered for filtering.
         description_fields : list[str]
-
-
-            subject metadata fields such as "subject", "session", "run", "task", etc.;
-            see also data_config.description_fields for the default set of fields.
+            Fields to extract from each record and include in dataset descriptions
+            (e.g., "subject", "session", "run", "task").
         s3_bucket : str | None
-
-            default OpenNeuro bucket
+            Optional S3 bucket URI (e.g., "s3://mybucket") to use instead of the
+            default OpenNeuro bucket when downloading data files.
         records : list[dict] | None
-
-
-
-            If
-
-
-
-
+            Pre-fetched metadata records. If provided, the dataset is constructed
+            directly from these records and no MongoDB query is performed.
+        download : bool, default True
+            If False, load from local BIDS files only. Local data are expected
+            under ``cache_dir / dataset``; no DB or S3 access is attempted.
+        n_jobs : int
+            Number of parallel jobs to use where applicable (-1 uses all cores).
+        eeg_dash_instance : EEGDash | None
+            Optional existing EEGDash client to reuse for DB queries. If None,
+            a new client is created on demand (not used when ``download=False``).
+        **kwargs : dict
+            Additional keyword arguments serving two purposes:
+            - Filtering: any keys present in ``ALLOWED_QUERY_FIELDS`` are treated
+              as query filters (e.g., ``dataset``, ``subject``, ``task``, ...).
+            - Dataset options: remaining keys are forwarded to the
+              ``EEGDashBaseDataset`` constructor.
 
         """
+        # Parameters that don't need validation
+        _suppress_comp_warning: bool = kwargs.pop("_suppress_comp_warning", False)
+        self.s3_bucket = s3_bucket
+        self.records = records
+        self.download = download
+        self.n_jobs = n_jobs
+        self.eeg_dash_instance = eeg_dash_instance or EEGDash()
+
         self.cache_dir = Path(cache_dir or platformdirs.user_cache_dir("EEGDash"))
+
         if not self.cache_dir.exists():
             warn(f"Cache directory does not exist, creating it: {self.cache_dir}")
             self.cache_dir.mkdir(exist_ok=True, parents=True)
-        self.s3_bucket = s3_bucket
-        self.eeg_dash = eeg_dash_instance
 
         # Separate query kwargs from other kwargs passed to the BaseDataset constructor
         self.query = query or {}
         self.query.update(
-            {k: v for k, v in kwargs.items() if k in
+            {k: v for k, v in kwargs.items() if k in ALLOWED_QUERY_FIELDS}
         )
         base_dataset_kwargs = {k: v for k, v in kwargs.items() if k not in self.query}
         if "dataset" not in self.query:
-
-
-
-
-
-            if self.eeg_dash is None and records is None:
-                self.eeg_dash = EEGDash()
-                _owns_client = True
-
-            try:
-                if records is not None:
-                    self.records = records
-                    datasets = [
-                        EEGDashBaseDataset(
-                            record,
-                            self.cache_dir,
-                            self.s3_bucket,
-                            **base_dataset_kwargs,
-                        )
-                        for record in self.records
-                    ]
-                elif offline_mode:  # only assume local data is complete if in offline mode
-                    if self.data_dir.exists():
-                        # This path loads from a local directory and is not affected by DB query logic
-                        datasets = self.load_bids_dataset(
-                            dataset=self.query["dataset"],
-                            data_dir=self.data_dir,
-                            description_fields=description_fields,
-                            s3_bucket=s3_bucket,
-                            **base_dataset_kwargs,
-                        )
+            # If explicit records are provided, infer dataset from records
+            if isinstance(records, list) and records and isinstance(records[0], dict):
+                inferred = records[0].get("dataset")
+                if inferred:
+                    self.query["dataset"] = inferred
                 else:
-                    raise ValueError(
-                        f"Offline mode is enabled, but local data_dir {self.data_dir} does not exist."
-                    )
-                elif self.query:
-                    # This is the DB query path that we are improving
-                    datasets = self._find_datasets(
-                        query=self.eeg_dash._build_query_from_kwargs(**self.query),
-                        description_fields=description_fields,
-                        base_dataset_kwargs=base_dataset_kwargs,
-                    )
-                    # We only need filesystem if we need to access S3
-                    self.filesystem = S3FileSystem(
-                        anon=True, client_kwargs={"region_name": "us-east-2"}
-                    )
+                    raise ValueError("You must provide a 'dataset' argument")
             else:
+                raise ValueError("You must provide a 'dataset' argument")
+
+        # Decide on a dataset subfolder name for cache isolation. If using
+        # challenge/preprocessed buckets (e.g., BDF, mini subsets), append
+        # informative suffixes to avoid overlapping with the original dataset.
+        dataset_folder = self.query["dataset"]
+        if self.s3_bucket:
+            suffixes: list[str] = []
+            bucket_lower = str(self.s3_bucket).lower()
+            if "bdf" in bucket_lower:
+                suffixes.append("bdf")
+            if "mini" in bucket_lower:
+                suffixes.append("mini")
+            if suffixes:
+                dataset_folder = f"{dataset_folder}-{'-'.join(suffixes)}"
+
+        self.data_dir = self.cache_dir / dataset_folder
+
+        if (
+            not _suppress_comp_warning
+            and self.query["dataset"] in RELEASE_TO_OPENNEURO_DATASET_MAP.values()
+        ):
+            warn(
+                "If you are not participating in the competition, you can ignore this warning!"
+                "\n\n"
+                "EEG 2025 Competition Data Notice:\n"
+                "---------------------------------\n"
+                "You are loading a dataset that is used in the EEG 2025 Competition.\n"
+                "IMPORTANT: The data accessed via `EEGDashDataset` is NOT identical to what you get from `EEGChallengeDataset` directly,\n"
+                "and it is not what you will use for the competition. Downsampling and filtering were applied to the data "
+                "to allow more people to participate.\n"
+                "\n"
+                "If you are participating in the competition, always use `EEGChallengeDataset` to ensure consistency with the challenge data.\n"
+                "\n",
+                UserWarning,
+                module="eegdash",
+            )
+        if records is not None:
+            self.records = records
+            datasets = [
+                EEGDashBaseDataset(
+                    record,
+                    self.cache_dir,
+                    self.s3_bucket,
+                    **base_dataset_kwargs,
+                )
+                for record in self.records
+            ]
+        elif not download:  # only assume local data is complete if not downloading
+            if not self.data_dir.exists():
                 raise ValueError(
-                    "
+                    f"Offline mode is enabled, but local data_dir {self.data_dir} does not exist."
+                )
+            records = self._find_local_bids_records(self.data_dir, self.query)
+            datasets = [
+                EEGDashBaseDataset(
+                    record=record,
+                    cache_dir=self.cache_dir,
+                    s3_bucket=self.s3_bucket,
+                    description={
+                        k: record.get(k)
+                        for k in description_fields
+                        if record.get(k) is not None
+                    },
+                    **base_dataset_kwargs,
                 )
-
-
-
+                for record in records
+            ]
+        elif self.query:
+            # This is the DB query path that we are improving
+            datasets = self._find_datasets(
+                query=build_query_from_kwargs(**self.query),
+                description_fields=description_fields,
+                base_dataset_kwargs=base_dataset_kwargs,
+            )
+            # We only need filesystem if we need to access S3
+            self.filesystem = S3FileSystem(
+                anon=True, client_kwargs={"region_name": "us-east-2"}
+            )
+        else:
+            raise ValueError(
+                "You must provide either 'records', a 'data_dir', or a query/keyword arguments for filtering."
+            )
 
         super().__init__(datasets)
 
-    def find_key_in_nested_dict(self, data: Any, target_key: str) -> Any:
+    def _find_local_bids_records(
+        self, dataset_root: Path, filters: dict[str, Any]
+    ) -> list[dict]:
+        """Discover local BIDS EEG files and build minimal records.
+
+        This helper enumerates EEG recordings under ``dataset_root`` via
+        ``mne_bids.find_matching_paths`` and applies entity filters to produce a
+        list of records suitable for ``EEGDashBaseDataset``. No network access
+        is performed and files are not read.
+
+        Parameters
+        ----------
+        dataset_root : Path
+            Local dataset directory. May be the plain dataset folder (e.g.,
+            ``ds005509``) or a suffixed cache variant (e.g.,
+            ``ds005509-bdf-mini``).
+        filters : dict of {str, Any}
+            Query filters. Must include ``'dataset'`` with the dataset id (without
+            local suffixes). May include BIDS entities ``'subject'``,
+            ``'session'``, ``'task'``, and ``'run'``. Each value can be a scalar
+            or a sequence of scalars.
+
+        Returns
+        -------
+        records : list of dict
+            One record per matched EEG file with at least:
+
+            - ``'data_name'``
+            - ``'dataset'`` (dataset id, without suffixes)
+            - ``'bidspath'`` (normalized to start with the dataset id)
+            - ``'subject'``, ``'session'``, ``'task'``, ``'run'`` (may be None)
+            - ``'bidsdependencies'`` (empty list)
+            - ``'modality'`` (``"eeg"``)
+            - ``'sampling_frequency'``, ``'nchans'``, ``'ntimes'`` (minimal
+              defaults for offline usage)
+
+        Notes
+        -----
+        - Matching uses ``datatypes=['eeg']`` and ``suffixes=['eeg']``.
+        - ``bidspath`` is constructed as
+          ``<dataset_id> / <relative_path_from_dataset_root>`` to ensure the
+          first path component is the dataset id (without local cache suffixes).
+        - Minimal defaults are set for ``sampling_frequency``, ``nchans``, and
+          ``ntimes`` to satisfy dataset length requirements offline.
+
+        """
+        dataset_id = filters["dataset"]
+        arg_map = {
+            "subjects": "subject",
+            "sessions": "session",
+            "tasks": "task",
+            "runs": "run",
+        }
+        matching_args: dict[str, list[str]] = {}
+        for finder_key, entity_key in arg_map.items():
+            entity_val = filters.get(entity_key)
+            if entity_val is None:
+                continue
+            if isinstance(entity_val, (list, tuple, set)):
+                entity_vals = list(entity_val)
+                if not entity_vals:
+                    continue
+                matching_args[finder_key] = entity_vals
+            else:
+                matching_args[finder_key] = [entity_val]
+
+        paths = find_matching_paths(
+            root=str(dataset_root),
+            datatypes=["eeg"],
+            suffixes=["eeg"],
+            ignore_json=True,
+            **matching_args,
+        )
+
+        records: list[dict] = []
+        seen_files: set[str] = set()
+
+        for bids_path in paths:
+            fpath = str(Path(bids_path.fpath).resolve())
+            if fpath in seen_files:
+                continue
+            seen_files.add(fpath)
+
+            # Build bidspath as dataset_id / relative_path_from_dataset_root (POSIX)
+            rel_from_root = (
+                Path(bids_path.fpath)
+                .resolve()
+                .relative_to(Path(bids_path.root).resolve())
+            )
+            bidspath = f"{dataset_id}/{rel_from_root.as_posix()}"
+
+            rec = {
+                "data_name": f"{dataset_id}_{Path(bids_path.fpath).name}",
+                "dataset": dataset_id,
+                "bidspath": bidspath,
+                "subject": (bids_path.subject or None),
+                "session": (bids_path.session or None),
+                "task": (bids_path.task or None),
+                "run": (bids_path.run or None),
+                # minimal fields to satisfy BaseDataset
+                "bidsdependencies": [],  # not needed to just run.
+                "modality": "eeg",
+                # this information is from eegdash schema but not available locally
+                "sampling_frequency": 1.0,
+                "nchans": 1,
+                "ntimes": 1,
+            }
+            records.append(rec)
+
+        return records
+
+    def _find_key_in_nested_dict(self, data: Any, target_key: str) -> Any:
         """Helper to recursively search for a key in a nested dictionary structure; returns
         the value associated with the first occurrence of the key, or None if not found.
         """
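An end-to-end constructor sketch covering the query and offline paths described above; the dataset id and entity values are illustrative, and `cache_dir` is a hypothetical local path:

```python
from eegdash.api import EEGDashDataset

# 1) Query path: hits MongoDB, lazily fetches from S3 into the cache.
ds = EEGDashDataset(
    cache_dir="./eegdash_cache",
    dataset="ds002718",
    subject=["012", "013"],   # keys in ALLOWED_QUERY_FIELDS act as filters
)

# 2) Offline path: no DB/S3 access; records are discovered with mne_bids
#    under ./eegdash_cache/ds002718 (raises if that folder is absent).
ds_local = EEGDashDataset(
    cache_dir="./eegdash_cache",
    dataset="ds002718",
    download=False,
)
```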
@@ -841,7 +935,7 @@ class EEGDashDataset(BaseConcatDataset):
             if target_key in data:
                 return data[target_key]
             for value in data.values():
-                result = self.find_key_in_nested_dict(value, target_key)
+                result = self._find_key_in_nested_dict(value, target_key)
                 if result is not None:
                     return result
         return None
@@ -872,13 +966,12 @@ class EEGDashDataset(BaseConcatDataset):
 
         """
         datasets: list[EEGDashBaseDataset] = []
-
-        self.records = self.eeg_dash.find(query)
+        self.records = self.eeg_dash_instance.find(query)
 
         for record in self.records:
             description = {}
             for field in description_fields:
-                value = self.find_key_in_nested_dict(record, field)
+                value = self._find_key_in_nested_dict(record, field)
                 if value is not None:
                     description[field] = value
             datasets.append(
@@ -891,64 +984,3 @@ class EEGDashDataset(BaseConcatDataset):
                 )
             )
         return datasets
-
-    def load_bids_dataset(
-        self,
-        dataset: str,
-        data_dir: str | Path,
-        description_fields: list[str],
-        s3_bucket: str | None = None,
-        **kwargs,
-    ):
-        """Helper method to load a single local BIDS dataset and return it as a list of
-        EEGDashBaseDatasets (one for each recording in the dataset).
-
-        Parameters
-        ----------
-        dataset : str
-            A name for the dataset to be loaded (e.g., "ds002718").
-        data_dir : str
-            The path to the local BIDS dataset directory.
-        description_fields : list[str]
-            A list of fields to be extracted from the dataset records
-            and included in the returned dataset description(s).
-
-        """
-        bids_dataset = EEGBIDSDataset(
-            data_dir=data_dir,
-            dataset=dataset,
-        )
-        datasets = Parallel(n_jobs=-1, prefer="threads", verbose=1)(
-            delayed(self.get_base_dataset_from_bids_file)(
-                bids_dataset=bids_dataset,
-                bids_file=bids_file,
-                s3_bucket=s3_bucket,
-                description_fields=description_fields,
-                **kwargs,
-            )
-            for bids_file in bids_dataset.get_files()
-        )
-        return datasets
-
-    def get_base_dataset_from_bids_file(
-        self,
-        bids_dataset: "EEGBIDSDataset",
-        bids_file: str,
-        s3_bucket: str | None,
-        description_fields: list[str],
-        **kwargs,
-    ) -> "EEGDashBaseDataset":
-        """Instantiate a single EEGDashBaseDataset given a local BIDS file (metadata only)."""
-        record = self.eeg_dash.load_eeg_attrs_from_bids_file(bids_dataset, bids_file)
-        description = {}
-        for field in description_fields:
-            value = self.find_key_in_nested_dict(record, field)
-            if value is not None:
-                description[field] = value
-        return EEGDashBaseDataset(
-            record,
-            self.cache_dir,
-            s3_bucket,
-            description=description,
-            **kwargs,
-        )