eegdash 0.4.0.dev173498563__py3-none-any.whl → 0.4.1.dev185__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eegdash/__init__.py +3 -3
- eegdash/api.py +143 -526
- eegdash/bids_eeg_metadata.py +139 -39
- eegdash/const.py +25 -0
- eegdash/dataset/__init__.py +8 -2
- eegdash/dataset/base.py +311 -0
- eegdash/dataset/bids_dataset.py +443 -0
- eegdash/dataset/dataset.py +542 -17
- eegdash/dataset/dataset_summary.csv +255 -255
- eegdash/dataset/registry.py +69 -4
- eegdash/downloader.py +95 -9
- eegdash/features/datasets.py +326 -136
- eegdash/features/decorators.py +96 -3
- eegdash/features/extractors.py +212 -55
- eegdash/features/feature_bank/complexity.py +7 -3
- eegdash/features/feature_bank/dimensionality.py +1 -1
- eegdash/features/feature_bank/signal.py +11 -10
- eegdash/features/feature_bank/utils.py +8 -0
- eegdash/features/inspect.py +97 -11
- eegdash/features/serialization.py +56 -19
- eegdash/features/utils.py +90 -16
- eegdash/hbn/preprocessing.py +50 -17
- eegdash/hbn/windows.py +145 -32
- eegdash/logging.py +19 -0
- eegdash/mongodb.py +44 -27
- eegdash/paths.py +15 -5
- eegdash/utils.py +16 -1
- {eegdash-0.4.0.dev173498563.dist-info → eegdash-0.4.1.dev185.dist-info}/METADATA +7 -8
- eegdash-0.4.1.dev185.dist-info/RECORD +38 -0
- eegdash/data_utils.py +0 -677
- eegdash-0.4.0.dev173498563.dist-info/RECORD +0 -37
- {eegdash-0.4.0.dev173498563.dist-info → eegdash-0.4.1.dev185.dist-info}/WHEEL +0 -0
- {eegdash-0.4.0.dev173498563.dist-info → eegdash-0.4.1.dev185.dist-info}/licenses/LICENSE +0 -0
- {eegdash-0.4.0.dev173498563.dist-info → eegdash-0.4.1.dev185.dist-info}/top_level.txt +0 -0
eegdash/api.py
CHANGED
@@ -15,35 +15,20 @@ from pathlib import Path
 from typing import Any, Mapping
 
 import mne
-from
-from dotenv import load_dotenv
-from mne_bids import find_matching_paths
+from mne.utils import _soft_import
 from pymongo import InsertOne, UpdateOne
-from rich.console import Console
-from rich.panel import Panel
-from rich.text import Text
 
-from braindecode.datasets import BaseConcatDataset
-
-from . import downloader
 from .bids_eeg_metadata import (
     build_query_from_kwargs,
     load_eeg_attrs_from_bids_file,
-    merge_participants_fields,
-    normalize_key,
 )
 from .const import (
     ALLOWED_QUERY_FIELDS,
-    RELEASE_TO_OPENNEURO_DATASET_MAP,
 )
 from .const import config as data_config
-from .data_utils import (
-    EEGBIDSDataset,
-    EEGDashBaseDataset,
-)
+from .dataset.bids_dataset import EEGBIDSDataset
 from .logging import logger
 from .mongodb import MongoConnectionManager
-from .paths import get_default_cache_dir
 from .utils import _init_mongo_client
 
 
@@ -89,7 +74,8 @@ class EEGDash:
         except Exception:
             DB_CONNECTION_STRING = None
         else:
-            load_dotenv()
+            dotenv = _soft_import("dotenv", "eegdash[full] is necessary.")
+            dotenv.load_dotenv()
             DB_CONNECTION_STRING = os.getenv("DB_CONNECTION_STRING")
 
         # Use singleton to get MongoDB client, database, and collection
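
The hunk above swaps a hard `dotenv` import for a lazy one via `mne.utils._soft_import`, so a base install no longer pulls in `python-dotenv`. A minimal sketch of the same optional-import pattern using only the standard library (the helper name and message here are illustrative, not eegdash's API):

import importlib

def optional_import(name: str, hint: str):
    """Return the module if importable, else fail with an actionable message."""
    try:
        return importlib.import_module(name)
    except ImportError as err:  # dependency not installed
        raise ImportError(f"Missing optional dependency '{name}'. {hint}") from err

dotenv = optional_import("dotenv", "Install it with: pip install eegdash[full]")
dotenv.load_dotenv()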
@@ -212,17 +198,22 @@ class EEGDash:
         return doc is not None
 
     def _validate_input(self, record: dict[str, Any]) -> dict[str, Any]:
-        """
+        """Validate the input record against the expected schema.
 
         Parameters
         ----------
-        record: dict
+        record : dict
             A dictionary representing the EEG data record to be validated.
 
         Returns
         -------
-        dict
-
+        dict
+            The record itself on success.
+
+        Raises
+        ------
+        ValueError
+            If the record is missing required keys or has values of the wrong type.
 
         """
         input_types = {
@@ -252,20 +243,44 @@ class EEGDash:
         return record
 
     def _build_query_from_kwargs(self, **kwargs) -> dict[str, Any]:
-        """
+        """Build a validated MongoDB query from keyword arguments.
+
+        This delegates to the module-level builder used across the package.
+
+        Parameters
+        ----------
+        **kwargs
+            Keyword arguments to convert into a MongoDB query.
+
+        Returns
+        -------
+        dict
+            A MongoDB query dictionary.
 
-        This delegates to the module-level builder used across the package and
-        is exposed here for testing and convenience.
         """
         return build_query_from_kwargs(**kwargs)
 
-
-
+    def _extract_simple_constraint(
+        self, query: dict[str, Any], key: str
+    ) -> tuple[str, Any] | None:
         """Extract a simple constraint for a given key from a query dict.
 
-        Supports
-
-
+        Supports top-level equality (e.g., ``{'subject': '01'}``) and ``$in``
+        (e.g., ``{'subject': {'$in': ['01', '02']}}``) constraints.
+
+        Parameters
+        ----------
+        query : dict
+            The MongoDB query dictionary.
+        key : str
+            The key for which to extract the constraint.
+
+        Returns
+        -------
+        tuple or None
+            A tuple of (kind, value) where kind is "eq" or "in", or None if the
+            constraint is not present or unsupported.
+
         """
         if not isinstance(query, dict) or key not in query:
             return None
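
The new `_extract_simple_constraint` docstring pins down the two query shapes the conflict checker understands. A standalone sketch of that eq/`$in` extraction, approximating the method body shown around it (not the package's exact code):

from typing import Any

def extract_simple_constraint(query: dict[str, Any], key: str) -> tuple[str, Any] | None:
    # Equality:   {'subject': '01'}                    -> ('eq', '01')
    # Membership: {'subject': {'$in': ['01', '02']}}   -> ('in', ['01', '02'])
    if not isinstance(query, dict) or key not in query:
        return None
    val = query[key]
    if isinstance(val, dict):
        if set(val) == {"$in"} and isinstance(val["$in"], (list, tuple)):
            return ("in", list(val["$in"]))
        return None  # unsupported operator shape for conflict checking
    return ("eq", val)

assert extract_simple_constraint({"subject": "01"}, "subject") == ("eq", "01")
assert extract_simple_constraint({"task": {"$in": ["a", "b"]}}, "task") == ("in", ["a", "b"])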
@@ -275,16 +290,28 @@ class EEGDash:
                 return ("in", list(val["$in"]))
             return None  # unsupported operator shape for conflict checking
         else:
-            return
+            return "eq", val
 
     def _raise_if_conflicting_constraints(
         self, raw_query: dict[str, Any], kwargs_query: dict[str, Any]
     ) -> None:
-        """Raise ValueError if
+        """Raise ValueError if query sources have incompatible constraints.
+
+        Checks for mutually exclusive constraints on the same field to avoid
+        silent empty results.
+
+        Parameters
+        ----------
+        raw_query : dict
+            The raw MongoDB query dictionary.
+        kwargs_query : dict
+            The query dictionary built from keyword arguments.
+
+        Raises
+        ------
+        ValueError
+            If conflicting constraints are found.
 
-        We conservatively check only top-level fields with simple equality or $in
-        constraints. If a field appears in both queries and constraints are mutually
-        exclusive, raise an explicit error to avoid silent empty result sets.
         """
         if not raw_query or not kwargs_query:
             return
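
`_raise_if_conflicting_constraints` exists to fail fast rather than return a silently empty result set when the raw query and the keyword filters disagree on a field. The compatibility rule it implies for two simple constraints, as a sketch (approximate logic, not the package's implementation):

def constraints_conflict(a: tuple, b: tuple) -> bool:
    kind_a, val_a = a
    kind_b, val_b = b
    if kind_a == kind_b == "eq":
        return val_a != val_b               # two different exact values
    if kind_a == kind_b == "in":
        return not set(val_a) & set(val_b)  # disjoint membership sets
    eq_val = val_a if kind_a == "eq" else val_b
    in_vals = val_b if kind_a == "eq" else val_a
    return eq_val not in in_vals            # exact value outside the $in set

# e.g. subject='01' from kwargs vs subject in {'02', '03'} from the raw query
assert constraints_conflict(("eq", "01"), ("in", ["02", "03"]))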
@@ -388,12 +415,31 @@ class EEGDash:
         logger.info("Upserted: %s", result.upserted_count)
         logger.info("Errors: %s ", result.bulk_api_result.get("writeErrors", []))
 
-    def _add_request(self, record: dict):
-        """
+    def _add_request(self, record: dict) -> InsertOne:
+        """Create a MongoDB insertion request for a record.
+
+        Parameters
+        ----------
+        record : dict
+            The record to insert.
+
+        Returns
+        -------
+        InsertOne
+            A PyMongo ``InsertOne`` object.
+
+        """
         return InsertOne(record)
 
-    def add(self, record: dict):
-        """Add a single record to the MongoDB collection.
+    def add(self, record: dict) -> None:
+        """Add a single record to the MongoDB collection.
+
+        Parameters
+        ----------
+        record : dict
+            The record to add.
+
+        """
         try:
             self.__collection.insert_one(record)
         except ValueError as e:
@@ -405,11 +451,23 @@ class EEGDash:
             )
             logger.debug("Add operation failed", exc_info=exc)
 
-    def _update_request(self, record: dict):
-        """
+    def _update_request(self, record: dict) -> UpdateOne:
+        """Create a MongoDB update request for a record.
+
+        Parameters
+        ----------
+        record : dict
+            The record to update.
+
+        Returns
+        -------
+        UpdateOne
+            A PyMongo ``UpdateOne`` object.
+
+        """
         return UpdateOne({"data_name": record["data_name"]}, {"$set": record})
 
-    def update(self, record: dict):
+    def update(self, record: dict) -> None:
         """Update a single record in the MongoDB collection.
 
         Parameters
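
`_add_request` and `_update_request` wrap records in PyMongo `InsertOne`/`UpdateOne` request objects, which the ingestion code batches (the earlier hunk logs `result.upserted_count` from a bulk result). A self-contained sketch of that batching with PyMongo's `bulk_write`; the connection string and documents are illustrative:

from pymongo import InsertOne, MongoClient, UpdateOne

collection = MongoClient("mongodb://localhost:27017")["eegdash"]["records"]

requests = [
    InsertOne({"data_name": "ds002718_sub-012_task-RestingState_eeg.set"}),
    UpdateOne(
        {"data_name": "ds002718_sub-013_task-RestingState_eeg.set"},
        {"$set": {"modality": "eeg"}},
        upsert=True,
    ),
]
result = collection.bulk_write(requests)
print(result.inserted_count, result.modified_count, result.upserted_count)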
@@ -429,525 +487,84 @@ class EEGDash:
             logger.debug("Update operation failed", exc_info=exc)
 
     def exists(self, query: dict[str, Any]) -> bool:
-        """
+        """Check if at least one record matches the query.
+
+        This is an alias for :meth:`exist`.
+
+        Parameters
+        ----------
+        query : dict
+            MongoDB query to check for existence.
+
+        Returns
+        -------
+        bool
+            True if a matching record exists, False otherwise.
+
+        """
         return self.exist(query)
 
-    def remove_field(self, record, field):
-        """Remove a
+    def remove_field(self, record: dict, field: str) -> None:
+        """Remove a field from a specific record in the MongoDB collection.
 
         Parameters
         ----------
         record : dict
-            Record
+            Record-identifying object with a ``data_name`` key.
         field : str
-
+            The name of the field to remove.
 
         """
         self.__collection.update_one(
             {"data_name": record["data_name"]}, {"$unset": {field: 1}}
         )
 
-    def remove_field_from_db(self, field):
-        """Remove a field from all records
+    def remove_field_from_db(self, field: str) -> None:
+        """Remove a field from all records in the database.
+
+        .. warning::
+            This is a destructive operation and cannot be undone.
 
         Parameters
         ----------
         field : str
-
+            The name of the field to remove from all documents.
 
         """
         self.__collection.update_many({}, {"$unset": {field: 1}})
 
     @property
     def collection(self):
-        """
-
+        """The underlying PyMongo ``Collection`` object.
+
+        Returns
+        -------
+        pymongo.collection.Collection
+            The collection object used for database interactions.
 
-
-
+        """
+        return self.__collection
 
-
-
-        Connections are managed by :class:`MongoConnectionManager`. Use
-        :meth:`close_all_connections` to explicitly close all clients.
+    def close(self) -> None:
+        """Close the MongoDB connection.
 
+        .. deprecated:: 0.1
+            Connections are now managed globally by :class:`MongoConnectionManager`.
+            This method is a no-op and will be removed in a future version.
+            Use :meth:`EEGDash.close_all_connections` to close all clients.
         """
         # Individual instances no longer close the shared client
         pass
 
     @classmethod
-    def close_all_connections(cls):
-        """Close all MongoDB client connections managed by the singleton."""
+    def close_all_connections(cls) -> None:
+        """Close all MongoDB client connections managed by the singleton manager."""
         MongoConnectionManager.close_all()
 
-    def __del__(self):
+    def __del__(self) -> None:
         """Destructor; no explicit action needed due to global connection manager."""
         # No longer needed since we're using singleton pattern
         pass
 
 
-class EEGDashDataset(BaseConcatDataset):
-    """Create a new EEGDashDataset from a given query or local BIDS dataset directory
-    and dataset name. An EEGDashDataset is pooled collection of EEGDashBaseDataset
-    instances (individual recordings) and is a subclass of braindecode's BaseConcatDataset.
-
-    Examples
-    --------
-    Basic usage with dataset and subject filtering:
-
-    >>> from eegdash import EEGDashDataset
-    >>> dataset = EEGDashDataset(
-    ...     cache_dir="./data",
-    ...     dataset="ds002718",
-    ...     subject="012"
-    ... )
-    >>> print(f"Number of recordings: {len(dataset)}")
-
-    Filter by multiple subjects and specific task:
-
-    >>> subjects = ["012", "013", "014"]
-    >>> dataset = EEGDashDataset(
-    ...     cache_dir="./data",
-    ...     dataset="ds002718",
-    ...     subject=subjects,
-    ...     task="RestingState"
-    ... )
-
-    Load and inspect EEG data from recordings:
-
-    >>> if len(dataset) > 0:
-    ...     recording = dataset[0]
-    ...     raw = recording.load()
-    ...     print(f"Sampling rate: {raw.info['sfreq']} Hz")
-    ...     print(f"Number of channels: {len(raw.ch_names)}")
-    ...     print(f"Duration: {raw.times[-1]:.1f} seconds")
-
-    Advanced filtering with raw MongoDB queries:
-
-    >>> from eegdash import EEGDashDataset
-    >>> query = {
-    ...     "dataset": "ds002718",
-    ...     "subject": {"$in": ["012", "013"]},
-    ...     "task": "RestingState"
-    ... }
-    >>> dataset = EEGDashDataset(cache_dir="./data", query=query)
-
-    Working with dataset collections and braindecode integration:
-
-    >>> # EEGDashDataset is a braindecode BaseConcatDataset
-    >>> for i, recording in enumerate(dataset):
-    ...     if i >= 2:  # limit output
-    ...         break
-    ...     print(f"Recording {i}: {recording.description}")
-    ...     raw = recording.load()
-    ...     print(f" Channels: {len(raw.ch_names)}, Duration: {raw.times[-1]:.1f}s")
-
-    Parameters
-    ----------
-    cache_dir : str | Path
-        Directory where data are cached locally.
-    query : dict | None
-        Raw MongoDB query to filter records. If provided, it is merged with
-        keyword filtering arguments (see ``**kwargs``) using logical AND.
-        You must provide at least a ``dataset`` (either in ``query`` or
-        as a keyword argument). Only fields in ``ALLOWED_QUERY_FIELDS`` are
-        considered for filtering.
-    dataset : str
-        Dataset identifier (e.g., ``"ds002718"``). Required if ``query`` does
-        not already specify a dataset.
-    task : str | list[str]
-        Task name(s) to filter by (e.g., ``"RestingState"``).
-    subject : str | list[str]
-        Subject identifier(s) to filter by (e.g., ``"NDARCA153NKE"``).
-    session : str | list[str]
-        Session identifier(s) to filter by (e.g., ``"1"``).
-    run : str | list[str]
-        Run identifier(s) to filter by (e.g., ``"1"``).
-    description_fields : list[str]
-        Fields to extract from each record and include in dataset descriptions
-        (e.g., "subject", "session", "run", "task").
-    s3_bucket : str | None
-        Optional S3 bucket URI (e.g., "s3://mybucket") to use instead of the
-        default OpenNeuro bucket when downloading data files.
-    records : list[dict] | None
-        Pre-fetched metadata records. If provided, the dataset is constructed
-        directly from these records and no MongoDB query is performed.
-    download : bool, default True
-        If False, load from local BIDS files only. Local data are expected
-        under ``cache_dir / dataset``; no DB or S3 access is attempted.
-    n_jobs : int
-        Number of parallel jobs to use where applicable (-1 uses all cores).
-    eeg_dash_instance : EEGDash | None
-        Optional existing EEGDash client to reuse for DB queries. If None,
-        a new client is created on demand, not used in the case of no download.
-    **kwargs : dict
-        Additional keyword arguments serving two purposes:
-
-        - Filtering: any keys present in ``ALLOWED_QUERY_FIELDS`` are treated as
-          query filters (e.g., ``dataset``, ``subject``, ``task``, ...).
-        - Dataset options: remaining keys are forwarded to
-          ``EEGDashBaseDataset``.
-
-    """
-
-    def __init__(
-        self,
-        cache_dir: str | Path,
-        query: dict[str, Any] = None,
-        description_fields: list[str] = [
-            "subject",
-            "session",
-            "run",
-            "task",
-            "age",
-            "gender",
-            "sex",
-        ],
-        s3_bucket: str | None = None,
-        records: list[dict] | None = None,
-        download: bool = True,
-        n_jobs: int = -1,
-        eeg_dash_instance: EEGDash | None = None,
-        **kwargs,
-    ):
-        # Parameters that don't need validation
-        _suppress_comp_warning: bool = kwargs.pop("_suppress_comp_warning", False)
-        self.s3_bucket = s3_bucket
-        self.records = records
-        self.download = download
-        self.n_jobs = n_jobs
-        self.eeg_dash_instance = eeg_dash_instance
-
-        self.cache_dir = cache_dir
-        if self.cache_dir == "" or self.cache_dir is None:
-            self.cache_dir = get_default_cache_dir()
-            logger.warning(
-                f"Cache directory is empty, using the eegdash default path: {self.cache_dir}"
-            )
-
-        self.cache_dir = Path(self.cache_dir)
-
-        if not self.cache_dir.exists():
-            logger.warning(
-                f"Cache directory does not exist, creating it: {self.cache_dir}"
-            )
-            self.cache_dir.mkdir(exist_ok=True, parents=True)
-
-        # Separate query kwargs from other kwargs passed to the BaseDataset constructor
-        self.query = query or {}
-        self.query.update(
-            {k: v for k, v in kwargs.items() if k in ALLOWED_QUERY_FIELDS}
-        )
-        base_dataset_kwargs = {k: v for k, v in kwargs.items() if k not in self.query}
-        if "dataset" not in self.query:
-            # If explicit records are provided, infer dataset from records
-            if isinstance(records, list) and records and isinstance(records[0], dict):
-                inferred = records[0].get("dataset")
-                if inferred:
-                    self.query["dataset"] = inferred
-                else:
-                    raise ValueError("You must provide a 'dataset' argument")
-            else:
-                raise ValueError("You must provide a 'dataset' argument")
-
-        # Decide on a dataset subfolder name for cache isolation. If using
-        # challenge/preprocessed buckets (e.g., BDF, mini subsets), append
-        # informative suffixes to avoid overlapping with the original dataset.
-        dataset_folder = self.query["dataset"]
-        if self.s3_bucket:
-            suffixes: list[str] = []
-            bucket_lower = str(self.s3_bucket).lower()
-            if "bdf" in bucket_lower:
-                suffixes.append("bdf")
-            if "mini" in bucket_lower:
-                suffixes.append("mini")
-            if suffixes:
-                dataset_folder = f"{dataset_folder}-{'-'.join(suffixes)}"
-
-        self.data_dir = self.cache_dir / dataset_folder
-
-        if (
-            not _suppress_comp_warning
-            and self.query["dataset"] in RELEASE_TO_OPENNEURO_DATASET_MAP.values()
-        ):
-            message_text = Text.from_markup(
-                "[italic]This notice is only for users who are participating in the [link=https://eeg2025.github.io/]EEG 2025 Competition[/link].[/italic]\n\n"
-                "[bold]EEG 2025 Competition Data Notice![/bold]\n"
-                "You are loading one of the datasets that is used in competition, but via `EEGDashDataset`.\n\n"
-                "[bold red]IMPORTANT[/bold red]: \n"
-                "If you download data from `EEGDashDataset`, it is [u]NOT[/u] identical to the official \n"
-                "competition data, which is accessed via `EEGChallengeDataset`. "
-                "The competition data has been downsampled and filtered.\n\n"
-                "[bold]If you are participating in the competition, \nyou must use the `EEGChallengeDataset` object to ensure consistency.[/bold] \n\n"
-                "If you are not participating in the competition, you can ignore this message."
-            )
-            warning_panel = Panel(
-                message_text,
-                title="[yellow]EEG 2025 Competition Data Notice[/yellow]",
-                subtitle="[cyan]Source: EEGDashDataset[/cyan]",
-                border_style="yellow",
-            )
-
-            try:
-                Console().print(warning_panel)
-            except Exception:
-                logger.warning(str(message_text))
-
-        if records is not None:
-            self.records = records
-            datasets = [
-                EEGDashBaseDataset(
-                    record,
-                    self.cache_dir,
-                    self.s3_bucket,
-                    **base_dataset_kwargs,
-                )
-                for record in self.records
-            ]
-        elif not download:  # only assume local data is complete if not downloading
-            if not self.data_dir.exists():
-                raise ValueError(
-                    f"Offline mode is enabled, but local data_dir {self.data_dir} does not exist."
-                )
-            records = self._find_local_bids_records(self.data_dir, self.query)
-            # Try to enrich from local participants.tsv to restore requested fields
-            try:
-                bids_ds = EEGBIDSDataset(
-                    data_dir=str(self.data_dir), dataset=self.query["dataset"]
-                )  # type: ignore[index]
-            except Exception:
-                bids_ds = None
-
-            datasets = []
-            for record in records:
-                # Start with entity values from filename
-                desc: dict[str, Any] = {
-                    k: record.get(k)
-                    for k in ("subject", "session", "run", "task")
-                    if record.get(k) is not None
-                }
-
-                if bids_ds is not None:
-                    try:
-                        rel_from_dataset = Path(record["bidspath"]).relative_to(
-                            record["dataset"]
-                        )  # type: ignore[index]
-                        local_file = (self.data_dir / rel_from_dataset).as_posix()
-                        part_row = bids_ds.subject_participant_tsv(local_file)
-                        desc = merge_participants_fields(
-                            description=desc,
-                            participants_row=part_row
-                            if isinstance(part_row, dict)
-                            else None,
-                            description_fields=description_fields,
-                        )
-                    except Exception:
-                        pass
-
-                datasets.append(
-                    EEGDashBaseDataset(
-                        record=record,
-                        cache_dir=self.cache_dir,
-                        s3_bucket=self.s3_bucket,
-                        description=desc,
-                        **base_dataset_kwargs,
-                    )
-                )
-        elif self.query:
-            if self.eeg_dash_instance is None:
-                self.eeg_dash_instance = EEGDash()
-            datasets = self._find_datasets(
-                query=build_query_from_kwargs(**self.query),
-                description_fields=description_fields,
-                base_dataset_kwargs=base_dataset_kwargs,
-            )
-            # We only need filesystem if we need to access S3
-            self.filesystem = downloader.get_s3_filesystem()
-        else:
-            raise ValueError(
-                "You must provide either 'records', a 'data_dir', or a query/keyword arguments for filtering."
-            )
-
-        super().__init__(datasets)
-
-    def _find_local_bids_records(
-        self, dataset_root: Path, filters: dict[str, Any]
-    ) -> list[dict]:
-        """Discover local BIDS EEG files and build minimal records.
-
-        This helper enumerates EEG recordings under ``dataset_root`` via
-        ``mne_bids.find_matching_paths`` and applies entity filters to produce a
-        list of records suitable for ``EEGDashBaseDataset``. No network access
-        is performed and files are not read.
-
-        Parameters
-        ----------
-        dataset_root : Path
-            Local dataset directory. May be the plain dataset folder (e.g.,
-            ``ds005509``) or a suffixed cache variant (e.g.,
-            ``ds005509-bdf-mini``).
-        filters : dict of {str, Any}
-            Query filters. Must include ``'dataset'`` with the dataset id (without
-            local suffixes). May include BIDS entities ``'subject'``,
-            ``'session'``, ``'task'``, and ``'run'``. Each value can be a scalar
-            or a sequence of scalars.
-
-        Returns
-        -------
-        records : list of dict
-            One record per matched EEG file with at least:
-
-            - ``'data_name'``
-            - ``'dataset'`` (dataset id, without suffixes)
-            - ``'bidspath'`` (normalized to start with the dataset id)
-            - ``'subject'``, ``'session'``, ``'task'``, ``'run'`` (may be None)
-            - ``'bidsdependencies'`` (empty list)
-            - ``'modality'`` (``"eeg"``)
-            - ``'sampling_frequency'``, ``'nchans'``, ``'ntimes'`` (minimal
-              defaults for offline usage)
-
-        Notes
-        -----
-        - Matching uses ``datatypes=['eeg']`` and ``suffixes=['eeg']``.
-        - ``bidspath`` is constructed as
-          ``<dataset_id> / <relative_path_from_dataset_root>`` to ensure the
-          first path component is the dataset id (without local cache suffixes).
-        - Minimal defaults are set for ``sampling_frequency``, ``nchans``, and
-          ``ntimes`` to satisfy dataset length requirements offline.
-
-        """
-        dataset_id = filters["dataset"]
-        arg_map = {
-            "subjects": "subject",
-            "sessions": "session",
-            "tasks": "task",
-            "runs": "run",
-        }
-        matching_args: dict[str, list[str]] = {}
-        for finder_key, entity_key in arg_map.items():
-            entity_val = filters.get(entity_key)
-            if entity_val is None:
-                continue
-            if isinstance(entity_val, (list, tuple, set)):
-                entity_vals = list(entity_val)
-                if not entity_vals:
-                    continue
-                matching_args[finder_key] = entity_vals
-            else:
-                matching_args[finder_key] = [entity_val]
-
-        matched_paths = find_matching_paths(
-            root=str(dataset_root),
-            datatypes=["eeg"],
-            suffixes=["eeg"],
-            ignore_json=True,
-            **matching_args,
-        )
-        records_out: list[dict] = []
-
-        for bids_path in matched_paths:
-            # Build bidspath as dataset_id / relative_path_from_dataset_root (POSIX)
-            rel_from_root = (
-                Path(bids_path.fpath)
-                .resolve()
-                .relative_to(Path(bids_path.root).resolve())
-            )
-            bidspath = f"{dataset_id}/{rel_from_root.as_posix()}"
-
-            rec = {
-                "data_name": f"{dataset_id}_{Path(bids_path.fpath).name}",
-                "dataset": dataset_id,
-                "bidspath": bidspath,
-                "subject": (bids_path.subject or None),
-                "session": (bids_path.session or None),
-                "task": (bids_path.task or None),
-                "run": (bids_path.run or None),
-                # minimal fields to satisfy BaseDataset from eegdash
-                "bidsdependencies": [],  # not needed to just run.
-                "modality": "eeg",
-                # minimal numeric defaults for offline length calculation
-                "sampling_frequency": None,
-                "nchans": None,
-                "ntimes": None,
-            }
-            records_out.append(rec)
-
-        return records_out
-
-    def _find_key_in_nested_dict(self, data: Any, target_key: str) -> Any:
-        """Recursively search for target_key in nested dicts/lists with normalized matching.
-
-        This makes lookups tolerant to naming differences like "p-factor" vs "p_factor".
-        Returns the first match or None.
-        """
-        norm_target = normalize_key(target_key)
-        if isinstance(data, dict):
-            for k, v in data.items():
-                if normalize_key(k) == norm_target:
-                    return v
-                res = self._find_key_in_nested_dict(v, target_key)
-                if res is not None:
-                    return res
-        elif isinstance(data, list):
-            for item in data:
-                res = self._find_key_in_nested_dict(item, target_key)
-                if res is not None:
-                    return res
-        return None
-
-    def _find_datasets(
-        self,
-        query: dict[str, Any] | None,
-        description_fields: list[str],
-        base_dataset_kwargs: dict,
-    ) -> list[EEGDashBaseDataset]:
-        """Helper method to find datasets in the MongoDB collection that satisfy the
-        given query and return them as a list of EEGDashBaseDataset objects.
-
-        Parameters
-        ----------
-        query : dict
-            The query object, as in EEGDash.find().
-        description_fields : list[str]
-            A list of fields to be extracted from the dataset records and included in
-            the returned dataset description(s).
-        kwargs: additional keyword arguments to be passed to the EEGDashBaseDataset
-            constructor.
-
-        Returns
-        -------
-        list :
-            A list of EEGDashBaseDataset objects that match the query.
-
-        """
-        datasets: list[EEGDashBaseDataset] = []
-        self.records = self.eeg_dash_instance.find(query)
-
-        for record in self.records:
-            description: dict[str, Any] = {}
-            # Requested fields first (normalized matching)
-            for field in description_fields:
-                value = self._find_key_in_nested_dict(record, field)
-                if value is not None:
-                    description[field] = value
-            # Merge all participants.tsv columns generically
-            part = self._find_key_in_nested_dict(record, "participant_tsv")
-            if isinstance(part, dict):
-                description = merge_participants_fields(
-                    description=description,
-                    participants_row=part,
-                    description_fields=description_fields,
-                )
-            datasets.append(
-                EEGDashBaseDataset(
-                    record,
-                    cache_dir=self.cache_dir,
-                    s3_bucket=self.s3_bucket,
-                    description=description,
-                    **base_dataset_kwargs,
-                )
-            )
-        return datasets
-
-
-__all__ = ["EEGDash", "EEGDashDataset"]
+__all__ = ["EEGDash"]