eegdash 0.3.3.dev61__py3-none-any.whl → 0.5.0.dev180784713__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. eegdash/__init__.py +19 -6
  2. eegdash/api.py +336 -539
  3. eegdash/bids_eeg_metadata.py +495 -0
  4. eegdash/const.py +349 -0
  5. eegdash/dataset/__init__.py +28 -0
  6. eegdash/dataset/base.py +311 -0
  7. eegdash/dataset/bids_dataset.py +641 -0
  8. eegdash/dataset/dataset.py +692 -0
  9. eegdash/dataset/dataset_summary.csv +255 -0
  10. eegdash/dataset/registry.py +287 -0
  11. eegdash/downloader.py +197 -0
  12. eegdash/features/__init__.py +15 -13
  13. eegdash/features/datasets.py +329 -138
  14. eegdash/features/decorators.py +105 -13
  15. eegdash/features/extractors.py +233 -63
  16. eegdash/features/feature_bank/__init__.py +12 -12
  17. eegdash/features/feature_bank/complexity.py +22 -20
  18. eegdash/features/feature_bank/connectivity.py +27 -28
  19. eegdash/features/feature_bank/csp.py +3 -1
  20. eegdash/features/feature_bank/dimensionality.py +6 -6
  21. eegdash/features/feature_bank/signal.py +29 -30
  22. eegdash/features/feature_bank/spectral.py +40 -44
  23. eegdash/features/feature_bank/utils.py +8 -0
  24. eegdash/features/inspect.py +126 -15
  25. eegdash/features/serialization.py +58 -17
  26. eegdash/features/utils.py +90 -16
  27. eegdash/hbn/__init__.py +28 -0
  28. eegdash/hbn/preprocessing.py +105 -0
  29. eegdash/hbn/windows.py +428 -0
  30. eegdash/logging.py +54 -0
  31. eegdash/mongodb.py +55 -24
  32. eegdash/paths.py +52 -0
  33. eegdash/utils.py +29 -1
  34. eegdash-0.5.0.dev180784713.dist-info/METADATA +121 -0
  35. eegdash-0.5.0.dev180784713.dist-info/RECORD +38 -0
  36. eegdash-0.5.0.dev180784713.dist-info/licenses/LICENSE +29 -0
  37. eegdash/data_config.py +0 -34
  38. eegdash/data_utils.py +0 -687
  39. eegdash/dataset.py +0 -69
  40. eegdash/preprocessing.py +0 -63
  41. eegdash-0.3.3.dev61.dist-info/METADATA +0 -192
  42. eegdash-0.3.3.dev61.dist-info/RECORD +0 -28
  43. eegdash-0.3.3.dev61.dist-info/licenses/LICENSE +0 -23
  44. {eegdash-0.3.3.dev61.dist-info → eegdash-0.5.0.dev180784713.dist-info}/WHEEL +0 -0
  45. {eegdash-0.3.3.dev61.dist-info → eegdash-0.5.0.dev180784713.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,641 @@
1
+ # Authors: The EEGDash contributors.
2
+ # License: BSD-3-Clause
3
+ # Copyright the EEGDash contributors.
4
+
5
+ """Local BIDS dataset interface for EEGDash.
6
+
7
+ This module provides the EEGBIDSDataset class for interfacing with local BIDS
8
+ datasets on the filesystem, parsing metadata, and retrieving BIDS-related information.
9
+ """
10
+
11
+ import json
12
+ import os
13
+ import re
14
+ from pathlib import Path
15
+ from typing import Any
16
+
17
+ import pandas as pd
18
+ from mne_bids import BIDSPath, find_matching_paths
19
+ from mne_bids.config import ALLOWED_DATATYPE_EXTENSIONS, EPHY_ALLOWED_DATATYPES, reader
20
+
21
# Companion (sidecar) extensions that accompany a primary recording file.
# Keys are primary extensions; values list the extra extensions that must be
# downloaded together with the primary file. Extensions missing from this
# mapping are looked up with an empty-list default by
# EEGBIDSDataset.RAW_EXTENSIONS, i.e. they have no companions.
_COMPANION_FILES = {
    ".set": [".fdt"],  # EEGLAB: data file
    ".vhdr": [".eeg", ".vmrk"],  # BrainVision: data + marker files
}
27
+
28
+
29
class EEGBIDSDataset:
    """An interface to a local BIDS dataset containing electrophysiology recordings.

    This class centralizes interactions with a BIDS dataset on the local
    filesystem, providing methods to parse metadata, find files, and
    retrieve BIDS-related information. Recording discovery is driven by the
    MNE-BIDS constants imported at module level, so the searched modalities
    and extensions follow the installed MNE-BIDS version.

    Parameters
    ----------
    data_dir : str or Path
        The path to the local BIDS dataset directory. It must exist.
    dataset : str
        A name for the dataset (e.g., "ds002718"). The directory name must
        either equal this name or start with it followed by ``-``
        (e.g., "ds002718-bdf").
    allow_symlinks : bool, default False
        If True, accept broken symlinks (e.g., git-annex) for metadata extraction.
        If False, require actual readable files for data loading.
        Set to True when doing metadata digestion without loading raw data.
    modalities : list of str or None, default None
        Modalities to search for (e.g., ["eeg", "meg"]).
        If None, defaults to ``mne_bids.config.EPHY_ALLOWED_DATATYPES``.

    Attributes
    ----------
    RAW_EXTENSIONS : dict
        Maps every extension known to ``mne_bids.config.reader`` to a list
        holding that extension followed by its companion extensions.
    files : list of str
        Recording file paths discovered at construction time. Discovery stops
        at the first modality/extension combination that yields files.
    detected_modality : str
        The lower-cased modality of the first entry in ``files``
        (e.g., 'eeg', 'meg').

    Raises
    ------
    ValueError
        If ``data_dir`` is None or does not exist.
    AssertionError
        If the directory name does not correspond to ``dataset`` or if no
        recordings are found.

    Examples
    --------
    >>> # Load EEG-only dataset
    >>> dataset = EEGBIDSDataset(
    ...     data_dir="/path/to/ds002718",
    ...     dataset="ds002718",
    ...     modalities=["eeg"]
    ... )

    >>> # Load dataset with multiple modalities
    >>> dataset = EEGBIDSDataset(
    ...     data_dir="/path/to/ds005810",
    ...     dataset="ds005810",
    ...     modalities=["meg", "eeg"]
    ... )

    >>> # Metadata extraction from git-annex (symlinks)
    >>> dataset = EEGBIDSDataset(
    ...     data_dir="/path/to/dataset",
    ...     dataset="ds000001",
    ...     allow_symlinks=True
    ... )

    """

    # Built from mne_bids.config.reader, whose keys are the file extensions
    # MNE-BIDS knows how to read. Each value is the primary extension followed
    # by its companions, i.e. the files that must be downloaded together.
    RAW_EXTENSIONS = {
        ext: [ext] + _COMPANION_FILES.get(ext, []) for ext in reader.keys()
    }
98
+
99
def __init__(
    self,
    data_dir=None,  # location of bids dataset
    dataset="",  # dataset name
    allow_symlinks=False,  # allow broken symlinks for digestion
    modalities=None,
):
    """Validate the dataset location and discover its recordings.

    Parameters
    ----------
    data_dir : str or Path
        Path to the local BIDS dataset directory; must exist.
    dataset : str
        Dataset name. The directory name must equal it or start with it
        followed by ``-`` (e.g., "ds002718-bdf").
    allow_symlinks : bool, default False
        Whether broken symlinks count as valid recordings.
    modalities : str, iterable of str or None, default None
        Modalities to search. A single string is wrapped in a list, any
        other iterable is copied into a list, and None selects
        ``EPHY_ALLOWED_DATATYPES``.

    Raises
    ------
    ValueError
        If ``data_dir`` is None or does not exist.
    AssertionError
        If the directory name does not correspond to ``dataset`` or if no
        recordings are found for the requested modalities.

    """
    if data_dir is None or not os.path.exists(data_dir):
        raise ValueError("data_dir must be specified and must exist")

    self.bidsdir = Path(data_dir)
    self.dataset = dataset
    self.data_dir = data_dir
    self.allow_symlinks = allow_symlinks

    # Normalise modalities to a private list.
    if modalities is None:
        # Copy so that mutating self.modalities can never alter the shared
        # MNE-BIDS module constant.
        self.modalities = list(EPHY_ALLOWED_DATATYPES)
    elif isinstance(modalities, str):
        self.modalities = [modalities]
    else:
        try:
            # Tuples, sets and other iterables become a list of modalities
            # instead of a single bogus list element.
            self.modalities = list(modalities)
        except TypeError:
            self.modalities = [modalities]

    # Accept exact dataset folder or a variant with informative suffixes
    # (e.g., dsXXXXX-bdf, dsXXXXX-bdf-mini) to avoid collisions.
    dir_name = self.bidsdir.name
    if not (dir_name == self.dataset or dir_name.startswith(self.dataset + "-")):
        raise AssertionError(
            f"BIDS directory '{dir_name}' does not correspond to dataset '{self.dataset}'"
        )

    # Populates self.files and the BIDSPath cache.
    self._init_bids_paths()

    # Raised explicitly (not via ``assert``) so the check survives ``python -O``;
    # AssertionError is kept because that is what callers received before.
    if not self.files:
        raise AssertionError(
            f"Unable to construct dataset. No recordings found for modalities: {self.modalities}"
        )

    # Store the detected modality for later use
    self.detected_modality = self.get_bids_file_attribute(
        "modality", self.files[0]
    ).lower()
141
+
142
def check_eeg_dataset(self) -> bool:
    """Tell whether the detected modality of this dataset is EEG.

    Returns
    -------
    bool
        True when ``self.detected_modality`` equals ``"eeg"``, False otherwise.

    """
    modality = self.detected_modality
    return modality == "eeg"
152
+
153
def _init_bids_paths(self) -> None:
    """Discover recording files and reset the BIDSPath cache.

    Iterates over ``self.modalities`` and, for each one, over the extensions
    listed for it in ``ALLOWED_DATATYPE_EXTENSIONS``. Each combination is
    searched with ``_find_bids_files``; the first combination that returns
    any file becomes ``self.files`` and the search stops there. When nothing
    is found ``self.files`` is left as an empty list.

    ``self.allow_symlinks`` is forwarded to the search so that broken
    symlinks (e.g., git-annex) can be accepted for metadata extraction.
    """
    # Fresh cache for BIDSPath objects, keyed by data file path.
    self._bids_path_cache = {}
    self.files = []

    # Lazy: a search only runs when the previous ones came back empty.
    search_results = (
        _find_bids_files(
            self.bidsdir,
            extension,
            modalities=[datatype],
            allow_symlinks=self.allow_symlinks,
        )
        for datatype in self.modalities
        for extension in ALLOWED_DATATYPE_EXTENSIONS.get(datatype, [])
    )
    for hits in search_results:
        if hits:
            self.files = hits
            return
183
+
184
def _get_bids_path_from_file(self, data_filepath: str):
    """Get a BIDSPath object for a data file with caching.

    The modality is taken from the closest enclosing directory named after
    a known datatype (BIDS layout: ``.../sub-XX/[ses-YY/]<modality>/file``),
    so that unrelated ancestor directories such as ``/data/eeg/...`` cannot
    shadow the real datatype folder. ``"eeg"`` is used when none is found.

    Parameters
    ----------
    data_filepath : str
        The path to the data file.

    Returns
    -------
    BIDSPath
        The BIDSPath object for the file, built from the ``sub-``, ``ses-``,
        ``task-`` and ``run-`` entities found in the file name.

    """
    if data_filepath in self._bids_path_cache:
        return self._bids_path_cache[data_filepath]

    filepath = Path(data_filepath)
    filename = filepath.name

    # Walk directories from the file outwards; the file name itself is
    # excluded because it is not a datatype folder.
    modality = "eeg"  # default
    for part in reversed(filepath.parts[:-1]):
        if part in ("eeg", "meg", "ieeg", "emg", "nirs"):
            modality = part
            break

    # Extract entities from filename using BIDS pattern
    # Expected format: sub-<label>[_ses-<label>][_task-<label>][_run-<label>]_<modality>.<ext>
    subject = re.search(r"sub-([^_]*)", filename)
    session = re.search(r"ses-([^_]*)", filename)
    task = re.search(r"task-([^_]*)", filename)
    run = re.search(r"run-([^_]*)", filename)

    run_value = None
    if run:
        run_label = run.group(1)
        try:
            run_value = int(run_label)
        except ValueError:
            # Non-numeric run label: pass it through as text instead of
            # failing the whole lookup.
            run_value = run_label or None

    bids_path = BIDSPath(
        subject=subject.group(1) if subject else None,
        session=session.group(1) if session else None,
        task=task.group(1) if task else None,
        run=run_value,
        datatype=modality,
        extension=filepath.suffix,
        root=self.bidsdir,
    )
    self._bids_path_cache[data_filepath] = bids_path
    return bids_path
231
+
232
def _get_json_with_inheritance(
    self, data_filepath: str, json_filename: str
) -> dict:
    """Get JSON metadata with BIDS inheritance handling.

    Walks from the data file's directory up to the dataset root and merges
    every applicable sidecar. A sidecar applies when it is named exactly
    ``json_filename`` or ``<entities>_<json_filename>`` where every entity
    (e.g. ``task-rest``) also appears in the data file name. Values from
    more specific sidecars (deeper directories, then more entities) take
    precedence over less specific ones.

    Parameters
    ----------
    data_filepath : str
        The path to the data file.
    json_filename : str
        The name of the JSON file to find (e.g., "eeg.json").

    Returns
    -------
    dict
        The merged JSON metadata; empty when no sidecar applies.

    """
    data_path = Path(data_filepath)
    root_dir = Path(self.bidsdir)
    data_tokens = set(data_path.stem.split("_"))
    sidecar_tail = f"_{json_filename}"

    def _applicable_sidecars(directory: Path) -> list:
        """Return the sidecars of ``directory`` that apply, most specific first."""
        ranked = []
        for candidate in directory.glob(f"*{sidecar_tail}"):
            if not candidate.is_file():
                continue
            entities = candidate.name[: -len(sidecar_tail)].split("_")
            if set(entities) <= data_tokens:
                ranked.append((-len(entities), candidate.name, candidate))
        # More entities first; the name breaks ties deterministically.
        ranked.sort(key=lambda item: item[:2])
        sidecars = [item[2] for item in ranked]
        bare = directory / json_filename
        if bare.is_file():
            # The entity-less file is the least specific one at this level.
            sidecars.append(bare)
        return sidecars

    json_dict = {}
    current_dir = data_path.parent
    while True:
        for sidecar in _applicable_sidecars(current_dir):
            with open(sidecar, encoding="utf-8") as f:
                loaded = json.load(f)
            # Sidecars are visited from most to least specific, so a key
            # that is already present must not be overwritten.
            for key, value in loaded.items():
                json_dict.setdefault(key, value)

        # Stop at BIDS root (contains dataset_description.json)
        if (current_dir / "dataset_description.json").exists():
            break
        # Also stop at the configured root and at the filesystem root.
        if current_dir == root_dir or current_dir.parent == current_dir:
            break
        current_dir = current_dir.parent

    return json_dict
284
+
285
+ def _merge_json_inheritance(self, json_files: list[str | Path]) -> dict:
286
+ """Merge a list of JSON files according to BIDS inheritance."""
287
+ json_files.reverse()
288
+ json_dict = {}
289
+ for f in json_files:
290
+ with open(f) as fp:
291
+ json_dict.update(json.load(fp))
292
+ return json_dict
293
+
294
+ def _get_bids_file_inheritance(
295
+ self, path: str | Path, basename: str, extension: str
296
+ ) -> list[Path]:
297
+ """Find all applicable metadata files using BIDS inheritance."""
298
+ top_level_files = ["README", "dataset_description.json", "participants.tsv"]
299
+ bids_files = []
300
+
301
+ if isinstance(path, str):
302
+ path = Path(path)
303
+ if not path.exists():
304
+ raise ValueError(f"path {path} does not exist")
305
+
306
+ for file in os.listdir(path):
307
+ if os.path.isfile(path / file) and file.endswith(extension):
308
+ bids_files.append(path / file)
309
+
310
+ if any(file in os.listdir(path) for file in top_level_files):
311
+ return bids_files
312
+ else:
313
+ bids_files.extend(
314
+ self._get_bids_file_inheritance(path.parent, basename, extension)
315
+ )
316
+ return bids_files
317
+
318
def get_bids_metadata_files(
    self, filepath: str | Path, metadata_file_extension: str
) -> list[Path]:
    """Retrieve the metadata files found for a given data file.

    The data file is validated first (honouring ``self.allow_symlinks``),
    then the lookup is delegated to ``_get_bids_file_inheritance`` starting
    from the data file's directory.

    Parameters
    ----------
    filepath : str or Path
        The path to the data file.
    metadata_file_extension : str
        The ending of the metadata file to search for (e.g., "channels.tsv").

    Returns
    -------
    list of Path
        A list of paths to the matching metadata files.

    Raises
    ------
    ValueError
        If the data file is not valid for the current symlink mode.

    """
    filepath = Path(filepath) if isinstance(filepath, str) else filepath

    file_is_usable = _is_valid_eeg_file(filepath, allow_symlinks=self.allow_symlinks)
    if not file_is_usable:
        raise ValueError(
            f"filepath {filepath} does not exist. "
            f"If doing metadata extraction from git-annex, set allow_symlinks=True"
        )

    directory, filename = os.path.split(filepath)
    # Everything before the last underscore, i.e. the name without its suffix.
    cut = filename.rfind("_")
    basename = filename[:cut]
    return self._get_bids_file_inheritance(
        directory, basename, metadata_file_extension
    )
355
+
356
def get_files(self) -> list[str]:
    """Return the recording file paths discovered for this dataset.

    Returns
    -------
    list of str
        The ``files`` list itself (not a copy).

    """
    recordings = self.files
    return recordings
366
+
367
def get_bids_file_attribute(self, attribute: str, data_filepath: str) -> Any:
    """Look up one attribute of a recording from its BIDS metadata.

    Entity attributes (``subject``, ``session``, ``task``, ``run``,
    ``modality``) come from the file's BIDSPath. ``sfreq``, ``ntimes`` and
    ``nchans`` come from the modality-specific JSON sidecar
    (``<modality>.json``, falling back to ``eeg.json`` when the BIDSPath has
    no datatype).

    Parameters
    ----------
    attribute : str
        The name of the attribute to retrieve (e.g., "sfreq", "subject").
    data_filepath : str
        The path to the data file.

    Returns
    -------
    Any
        The value of the requested attribute, or None if not found.

    """
    bids_path = self._get_bids_path_from_file(data_filepath)

    # Attribute name -> BIDSPath property that holds it.
    entity_properties = {
        "subject": "subject",
        "session": "session",
        "task": "task",
        "run": "run",
        "modality": "datatype",
    }
    if attribute in entity_properties:
        return getattr(bids_path, entity_properties[attribute])

    # Everything else is read from the sidecar of the file's modality.
    datatype = bids_path.datatype or "eeg"
    sidecar = self._get_json_with_inheritance(data_filepath, f"{datatype}.json")

    if attribute == "sfreq":
        return sidecar.get("SamplingFrequency")
    if attribute == "ntimes":
        return sidecar.get("RecordingDuration")
    if attribute == "nchans":
        return (
            sidecar.get("EEGChannelCount")
            or sidecar.get("MEGChannelCount")
            or sidecar.get("iEEGChannelCount")
        )
    return None
412
+
413
def channel_labels(self, data_filepath: str) -> list[str]:
    """Get a list of channel labels from channels.tsv.

    The file is looked up in the data file's own directory: first a plain
    ``channels.tsv``, then the ``*_channels.tsv`` whose entity prefix matches
    the data file name on an entity boundary (so ``run-1`` never matches a
    ``run-10`` recording). When several prefixes match, the longest one wins.

    Parameters
    ----------
    data_filepath : str
        The path to the data file.

    Returns
    -------
    list of str
        The values of the ``name`` column.

    Raises
    ------
    FileNotFoundError
        If no matching channels.tsv exists next to the data file.

    """
    filepath = Path(data_filepath)
    parent_dir = filepath.parent

    # Try the standard channels.tsv first
    channels_tsv_path = parent_dir / "channels.tsv"
    if not channels_tsv_path.exists():
        base_name = filepath.stem  # filename without extension
        tail = "_channels.tsv"
        matches = []
        for tsv_file in parent_dir.glob(f"*{tail}"):
            prefix = tsv_file.name[: -len(tail)]
            # Require the prefix to end on an entity boundary.
            if base_name == prefix or base_name.startswith(prefix + "_"):
                matches.append((len(prefix), tsv_file.name, tsv_file))
        if matches:
            # Longest prefix = most specific file; name breaks ties.
            channels_tsv_path = max(matches, key=lambda item: item[:2])[2]

    if not channels_tsv_path.exists():
        raise FileNotFoundError(f"No channels.tsv found for {data_filepath}")

    channels_tsv = pd.read_csv(channels_tsv_path, sep="\t")
    return channels_tsv["name"].tolist()
449
+
450
def channel_types(self, data_filepath: str) -> list[str]:
    """Get a list of channel types from channels.tsv.

    The file is looked up in the data file's own directory: first a plain
    ``channels.tsv``, then the ``*_channels.tsv`` whose entity prefix matches
    the data file name on an entity boundary (so ``run-1`` never matches a
    ``run-10`` recording). When several prefixes match, the longest one wins.

    Parameters
    ----------
    data_filepath : str
        The path to the data file.

    Returns
    -------
    list of str
        The values of the ``type`` column.

    Raises
    ------
    FileNotFoundError
        If no matching channels.tsv exists next to the data file.

    """
    filepath = Path(data_filepath)
    parent_dir = filepath.parent

    # Try the standard channels.tsv first
    channels_tsv_path = parent_dir / "channels.tsv"
    if not channels_tsv_path.exists():
        base_name = filepath.stem  # filename without extension
        tail = "_channels.tsv"
        matches = []
        for tsv_file in parent_dir.glob(f"*{tail}"):
            prefix = tsv_file.name[: -len(tail)]
            # Require the prefix to end on an entity boundary.
            if base_name == prefix or base_name.startswith(prefix + "_"):
                matches.append((len(prefix), tsv_file.name, tsv_file))
        if matches:
            # Longest prefix = most specific file; name breaks ties.
            channels_tsv_path = max(matches, key=lambda item: item[:2])[2]

    if not channels_tsv_path.exists():
        raise FileNotFoundError(f"No channels.tsv found for {data_filepath}")

    channels_tsv = pd.read_csv(channels_tsv_path, sep="\t")
    return channels_tsv["type"].tolist()
486
+
487
def num_times(self, data_filepath: str) -> int:
    """Get the number of time points in the recording.

    Calculated as ``SamplingFrequency * RecordingDuration`` from the merged
    eeg.json. A field that is missing, null or not numeric (e.g. ``"n/a"``)
    counts as 0, so the result is 0 whenever either field is unusable.

    Parameters
    ----------
    data_filepath : str
        The path to the data file.

    Returns
    -------
    int
        The approximate number of time points.

    """
    import math

    eeg_json_dict = self._get_json_with_inheritance(data_filepath, "eeg.json")

    def _numeric(key):
        """Return the sidecar value for ``key`` as a number, or 0 if unusable."""
        value = eeg_json_dict.get(key, 0)
        if isinstance(value, (int, float)):
            return value
        try:
            return float(value)
        except (TypeError, ValueError):
            return 0

    product = _numeric("SamplingFrequency") * _numeric("RecordingDuration")
    # NaN/inf cannot be converted to int; treat them like missing data.
    return int(product) if math.isfinite(product) else 0
508
+
509
def subject_participant_tsv(self, data_filepath: str) -> dict[str, Any]:
    """Get the participants.tsv record for a subject.

    Uses the closest participants.tsv reported by
    ``get_bids_metadata_files``. An empty dictionary is returned when no
    participants.tsv is found, when the table is empty or lacks a
    ``participant_id`` column, or when the subject has no row. If the subject
    appears more than once, the first row is used.

    Parameters
    ----------
    data_filepath : str
        The path to a data file belonging to the subject.

    Returns
    -------
    dict
        A dictionary of the subject's information from participants.tsv
        (without the ``participant_id`` column).

    """
    candidates = self.get_bids_metadata_files(data_filepath, "participants.tsv")
    if not candidates:
        return {}

    participants_tsv = pd.read_csv(candidates[0], sep="\t")
    if participants_tsv.empty or "participant_id" not in participants_tsv.columns:
        return {}

    by_participant = participants_tsv.set_index("participant_id")
    subject = f"sub-{self.get_bids_file_attribute('subject', data_filepath)}"
    if subject not in by_participant.index:
        return {}

    record = by_participant.loc[subject]
    if isinstance(record, pd.DataFrame):
        # Duplicate participant_id rows: keep the first one so the result
        # stays a flat column -> value mapping.
        record = record.iloc[0]
    return record.to_dict()
532
+
533
def eeg_json(self, data_filepath: str) -> dict[str, Any]:
    """Return the eeg.json metadata that applies to a data file.

    Thin wrapper around ``_get_json_with_inheritance`` with the sidecar name
    fixed to ``"eeg.json"``.

    Parameters
    ----------
    data_filepath : str
        The path to the data file.

    Returns
    -------
    dict
        The merged eeg.json metadata.

    """
    merged_sidecar = self._get_json_with_inheritance(data_filepath, "eeg.json")
    return merged_sidecar
548
+
549
+
550
def _is_valid_eeg_file(filepath: Path, allow_symlinks: bool = False) -> bool:
    """Decide whether a path can be used as a recording file.

    Parameters
    ----------
    filepath : Path
        The file path to check.
    allow_symlinks : bool, default False
        If True, a symlink whose target is missing (e.g., a git-annex
        pointer) is accepted as well. If False, the path must exist.

    Returns
    -------
    bool
        True if the path exists, or if it is a symlink and
        ``allow_symlinks`` is set; False otherwise.

    """
    # exists() follows symlinks, so a dangling link only passes via the
    # second clause.
    return bool(filepath.exists() or (allow_symlinks and filepath.is_symlink()))
572
+
573
+
574
def _find_bids_files(
    bidsdir: Path,
    extension: str,
    modalities: list[str] = None,
    allow_symlinks: bool = False,
) -> list[str]:
    """Find recording files with a given extension under a BIDS directory.

    For every modality two searches are combined: ``find_matching_paths``
    from MNE-BIDS (only when ``allow_symlinks`` is False; any error it raises
    is ignored) and a recursive glob for ``**/<modality>/*<extension>``
    filtered through ``_is_valid_eeg_file``.

    Parameters
    ----------
    bidsdir : Path
        The BIDS dataset root directory.
    extension : str
        File extension to search for (e.g., '.set', '.bdf', '.fif').
    modalities : list of str, optional
        List of modalities to search (e.g., ["eeg", "meg", "ieeg"]).
        If None, defaults to EPHY_ALLOWED_DATATYPES from mne_bids.config.
    allow_symlinks : bool, default False
        If True, include broken symlinks in results (for metadata extraction).
        If False, only return files that exist (for data loading).

    Returns
    -------
    list of str
        File paths found, without duplicates, in discovery order.

    """
    datatypes = EPHY_ALLOWED_DATATYPES if modalities is None else modalities

    collected = []
    for datatype in datatypes:
        if not allow_symlinks:
            # MNE-BIDS lookup; best effort, the glob below covers failures.
            try:
                matches = find_matching_paths(
                    bidsdir, datatypes=datatype, extensions=extension
                )
                if matches:
                    collected.extend([str(match.fpath) for match in matches])
            except Exception:
                pass  # Continue to fallback search

        # Manual glob search (finds symlinks too), filtered by validation mode.
        collected += [
            str(hit)
            for hit in bidsdir.glob(f"**/{datatype}/*{extension}")
            if _is_valid_eeg_file(hit, allow_symlinks=allow_symlinks)
        ]

    # dict keys keep first-seen order, which drops duplicates stably.
    return list(dict.fromkeys(collected))
639
+
640
+
641
# Public API of this module; the underscore-prefixed helpers stay private.
__all__ = ["EEGBIDSDataset"]