eegdash 0.4.0.dev173498563__py3-none-any.whl → 0.4.1.dev185__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of eegdash might be problematic. Click here for more details.

@@ -1,37 +1,562 @@
1
1
  from pathlib import Path
2
+ from typing import Any
2
3
 
4
+ from docstring_inheritance import NumpyDocstringInheritanceInitMeta
5
+ from mne_bids import find_matching_paths
3
6
  from rich.console import Console
4
7
  from rich.panel import Panel
5
8
  from rich.text import Text
6
9
 
7
- from ..api import EEGDashDataset
8
- from ..bids_eeg_metadata import build_query_from_kwargs
9
- from ..const import RELEASE_TO_OPENNEURO_DATASET_MAP, SUBJECT_MINI_RELEASE_MAP
10
+ from braindecode.datasets import BaseConcatDataset
11
+
12
+ from .. import downloader
13
+ from ..bids_eeg_metadata import (
14
+ build_query_from_kwargs,
15
+ merge_participants_fields,
16
+ normalize_key,
17
+ )
18
+ from ..const import (
19
+ ALLOWED_QUERY_FIELDS,
20
+ RELEASE_TO_OPENNEURO_DATASET_MAP,
21
+ SUBJECT_MINI_RELEASE_MAP,
22
+ )
10
23
  from ..logging import logger
24
+ from ..paths import get_default_cache_dir
25
+ from .base import EEGDashBaseDataset
26
+ from .bids_dataset import EEGBIDSDataset
11
27
  from .registry import register_openneuro_datasets
12
28
 
13
29
 
30
+ class EEGDashDataset(BaseConcatDataset, metaclass=NumpyDocstringInheritanceInitMeta):
31
+ """Create a new EEGDashDataset from a given query or local BIDS dataset directory
32
+ and dataset name. An EEGDashDataset is a pooled collection of EEGDashBaseDataset
33
+ instances (individual recordings) and is a subclass of braindecode's BaseConcatDataset.
34
+
35
+ Examples
36
+ --------
37
+ Basic usage with dataset and subject filtering:
38
+
39
+ >>> from eegdash import EEGDashDataset
40
+ >>> dataset = EEGDashDataset(
41
+ ... cache_dir="./data",
42
+ ... dataset="ds002718",
43
+ ... subject="012"
44
+ ... )
45
+ >>> print(f"Number of recordings: {len(dataset)}")
46
+
47
+ Filter by multiple subjects and specific task:
48
+
49
+ >>> subjects = ["012", "013", "014"]
50
+ >>> dataset = EEGDashDataset(
51
+ ... cache_dir="./data",
52
+ ... dataset="ds002718",
53
+ ... subject=subjects,
54
+ ... task="RestingState"
55
+ ... )
56
+
57
+ Load and inspect EEG data from recordings:
58
+
59
+ >>> if len(dataset) > 0:
60
+ ... recording = dataset[0]
61
+ ... raw = recording.load()
62
+ ... print(f"Sampling rate: {raw.info['sfreq']} Hz")
63
+ ... print(f"Number of channels: {len(raw.ch_names)}")
64
+ ... print(f"Duration: {raw.times[-1]:.1f} seconds")
65
+
66
+ Advanced filtering with raw MongoDB queries:
67
+
68
+ >>> from eegdash import EEGDashDataset
69
+ >>> query = {
70
+ ... "dataset": "ds002718",
71
+ ... "subject": {"$in": ["012", "013"]},
72
+ ... "task": "RestingState"
73
+ ... }
74
+ >>> dataset = EEGDashDataset(cache_dir="./data", query=query)
75
+
76
+ Working with dataset collections and braindecode integration:
77
+
78
+ >>> # EEGDashDataset is a braindecode BaseConcatDataset
79
+ >>> for i, recording in enumerate(dataset):
80
+ ... if i >= 2: # limit output
81
+ ... break
82
+ ... print(f"Recording {i}: {recording.description}")
83
+ ... raw = recording.load()
84
+ ... print(f" Channels: {len(raw.ch_names)}, Duration: {raw.times[-1]:.1f}s")
85
+
86
+ Parameters
87
+ ----------
88
+ cache_dir : str | Path
89
+ Directory where data are cached locally.
90
+ query : dict | None
91
+ Raw MongoDB query to filter records. If provided, it is merged with
92
+ keyword filtering arguments (see ``**kwargs``) using logical AND.
93
+ You must provide at least a ``dataset`` (either in ``query`` or
94
+ as a keyword argument). Only fields in ``ALLOWED_QUERY_FIELDS`` are
95
+ considered for filtering.
96
+ dataset : str
97
+ Dataset identifier (e.g., ``"ds002718"``). Required if ``query`` does
98
+ not already specify a dataset.
99
+ task : str | list[str]
100
+ Task name(s) to filter by (e.g., ``"RestingState"``).
101
+ subject : str | list[str]
102
+ Subject identifier(s) to filter by (e.g., ``"NDARCA153NKE"``).
103
+ session : str | list[str]
104
+ Session identifier(s) to filter by (e.g., ``"1"``).
105
+ run : str | list[str]
106
+ Run identifier(s) to filter by (e.g., ``"1"``).
107
+ description_fields : list[str]
108
+ Fields to extract from each record and include in dataset descriptions
109
+ (e.g., "subject", "session", "run", "task").
110
+ s3_bucket : str | None
111
+ Optional S3 bucket URI (e.g., "s3://mybucket") to use instead of the
112
+ default OpenNeuro bucket when downloading data files.
113
+ records : list[dict] | None
114
+ Pre-fetched metadata records. If provided, the dataset is constructed
115
+ directly from these records and no MongoDB query is performed.
116
+ download : bool, default True
117
+ If False, load from local BIDS files only. Local data are expected
118
+ under ``cache_dir / dataset``; no DB or S3 access is attempted.
119
+ n_jobs : int
120
+ Number of parallel jobs to use where applicable (-1 uses all cores).
121
+ eeg_dash_instance : EEGDash | None
122
+ Optional existing EEGDash client to reuse for DB queries. If None,
123
+ a new client is created on demand. Ignored when ``records`` is given or when ``download=False``.
124
+ **kwargs : dict
125
+ Additional keyword arguments serving two purposes:
126
+
127
+ - Filtering: any keys present in ``ALLOWED_QUERY_FIELDS`` are treated as
128
+ query filters (e.g., ``dataset``, ``subject``, ``task``, ...).
129
+ - Dataset options: remaining keys are forwarded to
130
+ ``EEGDashBaseDataset``.
131
+
132
+ """
133
+
134
def __init__(
    self,
    cache_dir: str | Path,
    query: dict[str, Any] | None = None,
    description_fields: list[str] | None = None,
    s3_bucket: str | None = None,
    records: list[dict] | None = None,
    download: bool = True,
    n_jobs: int = -1,
    eeg_dash_instance: Any = None,
    **kwargs,
):
    # Build the list of per-recording datasets from one of three sources,
    # checked in this order: explicit ``records``, local BIDS files
    # (``download=False``), or a database query. The parameters themselves
    # are documented in the class docstring.

    # Private switch (not part of the query, not forwarded to the base
    # dataset) that silences the competition notice below.
    _suppress_comp_warning: bool = kwargs.pop("_suppress_comp_warning", False)
    self.s3_bucket = s3_bucket
    self.records = records
    self.download = download
    self.n_jobs = n_jobs
    self.eeg_dash_instance = eeg_dash_instance

    if description_fields is None:
        description_fields = [
            "subject",
            "session",
            "run",
            "task",
            "age",
            "gender",
            "sex",
        ]

    # Fall back to the package default when no cache directory is given.
    self.cache_dir = cache_dir
    if self.cache_dir == "" or self.cache_dir is None:
        self.cache_dir = get_default_cache_dir()
        logger.warning(
            f"Cache directory is empty, using the eegdash default path: {self.cache_dir}"
        )

    self.cache_dir = Path(self.cache_dir)

    if not self.cache_dir.exists():
        logger.warning(
            f"Cache directory does not exist, creating it: {self.cache_dir}"
        )
        self.cache_dir.mkdir(exist_ok=True, parents=True)

    # Separate query kwargs from other kwargs passed to the BaseDataset constructor.
    # Work on a copy so the caller's ``query`` dict is never mutated by the
    # ``update`` below (or by the dataset inference further down).
    self.query = dict(query) if query else {}
    self.query.update(
        {k: v for k, v in kwargs.items() if k in ALLOWED_QUERY_FIELDS}
    )
    base_dataset_kwargs = {k: v for k, v in kwargs.items() if k not in self.query}
    if "dataset" not in self.query:
        # If explicit records are provided, infer dataset from records
        if isinstance(records, list) and records and isinstance(records[0], dict):
            inferred = records[0].get("dataset")
            if inferred:
                self.query["dataset"] = inferred
            else:
                raise ValueError("You must provide a 'dataset' argument")
        else:
            raise ValueError("You must provide a 'dataset' argument")

    # Decide on a dataset subfolder name for cache isolation. If using
    # challenge/preprocessed buckets (e.g., BDF, mini subsets), append
    # informative suffixes to avoid overlapping with the original dataset.
    dataset_folder = self.query["dataset"]
    if self.s3_bucket:
        suffixes: list[str] = []
        bucket_lower = str(self.s3_bucket).lower()
        if "bdf" in bucket_lower:
            suffixes.append("bdf")
        if "mini" in bucket_lower:
            suffixes.append("mini")
        if suffixes:
            dataset_folder = f"{dataset_folder}-{'-'.join(suffixes)}"

    self.data_dir = self.cache_dir / dataset_folder

    if (
        not _suppress_comp_warning
        and self.query["dataset"] in RELEASE_TO_OPENNEURO_DATASET_MAP.values()
    ):
        message_text = Text.from_markup(
            "[italic]This notice is only for users who are participating in the [link=https://eeg2025.github.io/]EEG 2025 Competition[/link].[/italic]\n\n"
            "[bold]EEG 2025 Competition Data Notice![/bold]\n"
            "You are loading one of the datasets that is used in competition, but via `EEGDashDataset`.\n\n"
            "[bold red]IMPORTANT[/bold red]: \n"
            "If you download data from `EEGDashDataset`, it is [u]NOT[/u] identical to the official \n"
            "competition data, which is accessed via `EEGChallengeDataset`. "
            "The competition data has been downsampled and filtered.\n\n"
            "[bold]If you are participating in the competition, \nyou must use the `EEGChallengeDataset` object to ensure consistency.[/bold] \n\n"
            "If you are not participating in the competition, you can ignore this message."
        )
        warning_panel = Panel(
            message_text,
            title="[yellow]EEG 2025 Competition Data Notice[/yellow]",
            subtitle="[cyan]Source: EEGDashDataset[/cyan]",
            border_style="yellow",
        )

        # Fall back to the plain logger if rich cannot render the panel.
        try:
            Console().print(warning_panel)
        except Exception:
            logger.warning(str(message_text))

    if records is not None:
        # Pre-fetched records: no database or filesystem lookup.
        self.records = records
        datasets = [
            EEGDashBaseDataset(
                record,
                self.cache_dir,
                self.s3_bucket,
                **base_dataset_kwargs,
            )
            for record in self.records
        ]
    elif not download:  # only assume local data is complete if not downloading
        if not self.data_dir.exists():
            raise ValueError(
                f"Offline mode is enabled, but local data_dir {self.data_dir} does not exist."
            )
        records = self._find_local_bids_records(self.data_dir, self.query)
        # Try to enrich from local participants.tsv to restore requested fields
        try:
            bids_ds = EEGBIDSDataset(
                data_dir=str(self.data_dir), dataset=self.query["dataset"]
            )  # type: ignore[index]
        except Exception:
            bids_ds = None

        datasets = []
        for record in records:
            # Start with entity values from filename
            desc: dict[str, Any] = {
                k: record.get(k)
                for k in ("subject", "session", "run", "task")
                if record.get(k) is not None
            }

            if bids_ds is not None:
                try:
                    rel_from_dataset = Path(record["bidspath"]).relative_to(
                        record["dataset"]
                    )  # type: ignore[index]
                    local_file = (self.data_dir / rel_from_dataset).as_posix()
                    part_row = bids_ds.subject_participant_tsv(local_file)
                    desc = merge_participants_fields(
                        description=desc,
                        participants_row=part_row
                        if isinstance(part_row, dict)
                        else None,
                        description_fields=description_fields,
                    )
                except Exception as exc:
                    # Enrichment is best-effort: keep the filename-derived
                    # description, but leave a trace instead of failing silently.
                    logger.debug(
                        "Could not merge participants.tsv fields for "
                        f"{record.get('bidspath')}: {exc}"
                    )

            datasets.append(
                EEGDashBaseDataset(
                    record=record,
                    cache_dir=self.cache_dir,
                    s3_bucket=self.s3_bucket,
                    description=desc,
                    **base_dataset_kwargs,
                )
            )
    elif self.query:
        if self.eeg_dash_instance is None:
            # to avoid circular import
            from ..api import EEGDash

            self.eeg_dash_instance = EEGDash()
        datasets = self._find_datasets(
            query=build_query_from_kwargs(**self.query),
            description_fields=description_fields,
            base_dataset_kwargs=base_dataset_kwargs,
        )
        # We only need filesystem if we need to access S3
        self.filesystem = downloader.get_s3_filesystem()
    else:
        # NOTE(review): ``self.query`` always holds "dataset" by this point,
        # so this branch looks unreachable; kept as a defensive guard.
        raise ValueError(
            "You must provide either 'records', a 'data_dir', or a query/keyword arguments for filtering."
        )

    super().__init__(datasets)
319
+
320
def _find_local_bids_records(
    self, dataset_root: Path, filters: dict[str, Any]
) -> list[dict]:
    """Build minimal records for the EEG files found under a local BIDS root.

    The search is delegated to ``mne_bids.find_matching_paths`` (restricted to
    ``datatypes=['eeg']`` and ``suffixes=['eeg']``); each matched path is
    turned into a record dict usable by :class:`EEGDashBaseDataset`.

    Parameters
    ----------
    dataset_root : Path
        Local dataset directory (e.g., ``/path/to/cache/ds005509``).
    filters : dict
        Query filters. ``'dataset'`` is required; ``'subject'``,
        ``'session'``, ``'task'`` and ``'run'`` are used as entity filters
        when present and not ``None``. Each may be a scalar or a
        list/tuple/set; empty collections are ignored.

    Returns
    -------
    list of dict
        One record per matched file, holding the BIDS entities, a
        ``bidspath`` of the form ``<dataset>/<path relative to the root>``
        and placeholder metadata (``None`` / empty) for fields that are not
        derived here.

    Notes
    -----
    ``bidspath`` is always prefixed with the dataset ID taken from
    ``filters``, regardless of the name of ``dataset_root``.

    """
    dataset_id = filters["dataset"]

    # (find_matching_paths keyword, filter key) pairs, in call order.
    entity_args = (
        ("subjects", "subject"),
        ("sessions", "session"),
        ("tasks", "task"),
        ("runs", "run"),
    )
    matching_args: dict[str, list[str]] = {}
    for finder_key, entity_key in entity_args:
        requested = filters.get(entity_key)
        if requested is None:
            continue
        if not isinstance(requested, (list, tuple, set)):
            # Scalars are wrapped so the finder always receives a list.
            matching_args[finder_key] = [requested]
            continue
        as_list = list(requested)
        if as_list:
            matching_args[finder_key] = as_list

    matched_paths = find_matching_paths(
        root=str(dataset_root),
        datatypes=["eeg"],
        suffixes=["eeg"],
        ignore_json=True,
        **matching_args,
    )

    records_out: list[dict] = []
    for bids_path in matched_paths:
        file_path = Path(bids_path.fpath)
        root_path = Path(bids_path.root)
        # POSIX path of the file relative to the dataset root.
        relative = file_path.resolve().relative_to(root_path.resolve())
        records_out.append(
            {
                "data_name": f"{dataset_id}_{file_path.name}",
                "dataset": dataset_id,
                "bidspath": f"{dataset_id}/{relative.as_posix()}",
                # Falsy entities (e.g. empty strings) are stored as None.
                "subject": (bids_path.subject or None),
                "session": (bids_path.session or None),
                "task": (bids_path.task or None),
                "run": (bids_path.run or None),
                # minimal fields to satisfy BaseDataset from eegdash
                "bidsdependencies": [],
                "modality": "eeg",
                # numeric metadata is not available without reading the files
                "sampling_frequency": None,
                "nchans": None,
                "ntimes": None,
            }
        )

    return records_out
408
+
409
def _find_key_in_nested_dict(self, data: Any, target_key: str) -> Any:
    """Depth-first lookup of a key inside nested dicts and lists.

    Keys are compared after passing both sides through ``normalize_key``.

    Parameters
    ----------
    data : Any
        Structure to search. Only ``dict`` and ``list`` values are
        descended into; anything else yields ``None``.
    target_key : str
        The key to look for.

    Returns
    -------
    Any
        The value stored under the first matching key encountered, or
        ``None`` when nothing matches. A matching key whose value is
        ``None`` therefore looks the same as "not found" to the caller.

    """
    wanted = normalize_key(target_key)

    if isinstance(data, dict):
        for key, value in data.items():
            # A direct hit wins over anything nested under this key.
            if normalize_key(key) == wanted:
                return value
            nested = self._find_key_in_nested_dict(value, target_key)
            if nested is not None:
                return nested
        return None

    if isinstance(data, list):
        # Lazily search each element and stop at the first non-None result.
        hits = (self._find_key_in_nested_dict(element, target_key) for element in data)
        return next((hit for hit in hits if hit is not None), None)

    return None
441
+
442
def _find_datasets(
    self,
    query: dict[str, Any] | None,
    description_fields: list[str],
    base_dataset_kwargs: dict,
) -> list[EEGDashBaseDataset]:
    """Run a database query and wrap every hit in an ``EEGDashBaseDataset``.

    The records returned by ``self.eeg_dash_instance.find(query)`` are stored
    on ``self.records`` and then converted one by one.

    Parameters
    ----------
    query : dict, optional
        The MongoDB query to execute.
    description_fields : list of str
        Fields looked up (anywhere in the nested record) and copied into
        each dataset's description when a non-``None`` value is found.
    base_dataset_kwargs : dict
        Extra keyword arguments forwarded to the
        :class:`EEGDashBaseDataset` constructor.

    Returns
    -------
    list of EEGDashBaseDataset
        One dataset object per record, in the order returned by the query.

    """
    self.records = self.eeg_dash_instance.find(query)

    datasets: list[EEGDashBaseDataset] = []
    for record in self.records:
        # Requested fields first (normalized matching); missing ones are skipped.
        looked_up = (
            (field, self._find_key_in_nested_dict(record, field))
            for field in description_fields
        )
        description: dict[str, Any] = {
            field: value for field, value in looked_up if value is not None
        }

        # Merge participants.tsv columns when the record carries them as a dict.
        participants = self._find_key_in_nested_dict(record, "participant_tsv")
        if isinstance(participants, dict):
            description = merge_participants_fields(
                description=description,
                participants_row=participants,
                description_fields=description_fields,
            )

        base_dataset = EEGDashBaseDataset(
            record,
            cache_dir=self.cache_dir,
            s3_bucket=self.s3_bucket,
            description=description,
            **base_dataset_kwargs,
        )
        datasets.append(base_dataset)

    return datasets
497
+
498
+ # just to fix the docstring inheritance until we solved it in braindecode.
499
def save(self, path, overwrite=False):
    """Save the dataset to disk by delegating to the parent class.

    Parameters
    ----------
    path : str or Path
        Destination path, passed through unchanged to the parent ``save``.
    overwrite : bool, default False
        Forwarded as the ``overwrite`` keyword of the parent ``save``.

    Returns
    -------
    None
        Whatever the parent implementation returns is passed back unchanged.

    """
    return super().save(path, overwrite=overwrite)
515
+
516
+
14
517
  class EEGChallengeDataset(EEGDashDataset):
15
- """EEG 2025 Challenge dataset helper.
518
+ """A dataset helper for the EEG 2025 Challenge.
16
519
 
17
- This class provides a convenient wrapper around :class:`EEGDashDataset`
18
- configured for the EEG 2025 Challenge releases. It maps a given
19
- ``release`` to its corresponding OpenNeuro dataset and optionally restricts
20
- to the official "mini" subject subset.
520
+ This class simplifies access to the EEG 2025 Challenge datasets. It is a
521
+ specialized version of :class:`~eegdash.api.EEGDashDataset` that is
522
+ pre-configured for the challenge's data releases. It automatically maps a
523
+ release name (e.g., "R1") to the corresponding OpenNeuro dataset and handles
524
+ the selection of subject subsets (e.g., "mini" release).
21
525
 
22
526
  Parameters
23
527
  ----------
24
528
  release : str
25
- Release name. One of ["R1", ..., "R11"].
529
+ The name of the challenge release to load. Must be one of the keys in
530
+ :const:`~eegdash.const.RELEASE_TO_OPENNEURO_DATASET_MAP`
531
+ (e.g., "R1", "R2", ..., "R11").
532
+ cache_dir : str
533
+ The local directory where the dataset will be downloaded and cached.
26
534
  mini : bool, default True
27
- If True, restrict subjects to the challenge mini subset.
28
- query : dict | None
29
- Additional MongoDB-style filters to AND with the release selection.
30
- Must not contain the key ``dataset``.
31
- s3_bucket : str | None, default "s3://nmdatasets/NeurIPS25"
32
- Base S3 bucket used to locate the challenge data.
535
+ If True, the dataset is restricted to the official "mini" subset of
536
+ subjects for the specified release. If False, all subjects for the
537
+ release are included.
538
+ query : dict, optional
539
+ An additional MongoDB-style query to apply as a filter. This query is
540
+ combined with the release and subject filters using a logical AND.
541
+ The query must not contain the ``dataset`` key, as this is determined
542
+ by the ``release`` parameter.
543
+ s3_bucket : str, optional
544
+ The base S3 bucket URI where the challenge data is stored. Defaults to
545
+ the official challenge bucket.
33
546
  **kwargs
34
- Passed through to :class:`EEGDashDataset`.
547
+ Additional keyword arguments that are passed directly to the
548
+ :class:`~eegdash.api.EEGDashDataset` constructor.
549
+
550
+ Raises
551
+ ------
552
+ ValueError
553
+ If the specified ``release`` is unknown, or if the ``query`` argument
554
+ contains a ``dataset`` key. Also raised if ``mini`` is True and a
555
+ requested subject is not part of the official mini-release subset.
556
+
557
+ See Also
558
+ --------
559
+ EEGDashDataset : The base class for creating datasets from queries.
35
560
 
36
561
  """
37
562
 
@@ -164,4 +689,4 @@ registered_classes = register_openneuro_datasets(
164
689
  )
165
690
 
166
691
 
167
- __all__ = ["EEGChallengeDataset"] + list(registered_classes.keys())
692
+ __all__ = ["EEGDashDataset", "EEGChallengeDataset"] + list(registered_classes.keys())