eegdash 0.4.0.dev173498563__py3-none-any.whl → 0.4.1.dev185__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of eegdash might be problematic. Click here for more details.

@@ -0,0 +1,443 @@
1
+ # Authors: The EEGDash contributors.
2
+ # License: GNU General Public License
3
+ # Copyright the EEGDash contributors.
4
+
5
+ """Local BIDS dataset interface for EEGDash.
6
+
7
+ This module provides the EEGBIDSDataset class for interfacing with local BIDS
8
+ datasets on the filesystem, parsing metadata, and retrieving BIDS-related information.
9
+ """
10
+
11
+ import json
12
+ import os
13
+ import re
14
+ from pathlib import Path
15
+ from typing import Any
16
+
17
+ import pandas as pd
18
+ from mne_bids import BIDSPath, find_matching_paths
19
+
20
+
21
+ class EEGBIDSDataset:
22
+ """An interface to a local BIDS dataset containing EEG recordings.
23
+
24
+ This class centralizes interactions with a BIDS dataset on the local
25
+ filesystem, providing methods to parse metadata, find files, and
26
+ retrieve BIDS-related information.
27
+
28
+ Parameters
29
+ ----------
30
+ data_dir : str or Path
31
+ The path to the local BIDS dataset directory.
32
+ dataset : str
33
+ A name for the dataset (e.g., "ds002718").
34
+
35
+ """
36
+
37
+ ALLOWED_FILE_FORMAT = ["eeglab", "brainvision", "biosemi", "european"]
38
+ RAW_EXTENSIONS = {
39
+ ".set": [".set", ".fdt"], # eeglab
40
+ ".edf": [".edf"], # european
41
+ ".vhdr": [".eeg", ".vhdr", ".vmrk", ".dat", ".raw"], # brainvision
42
+ ".bdf": [".bdf"], # biosemi
43
+ }
44
+ METADATA_FILE_EXTENSIONS = [
45
+ "eeg.json",
46
+ "channels.tsv",
47
+ "electrodes.tsv",
48
+ "events.tsv",
49
+ "events.json",
50
+ ]
51
+
52
+ def __init__(
53
+ self,
54
+ data_dir=None, # location of bids dataset
55
+ dataset="", # dataset name
56
+ ):
57
+ if data_dir is None or not os.path.exists(data_dir):
58
+ raise ValueError("data_dir must be specified and must exist")
59
+
60
+ self.bidsdir = Path(data_dir)
61
+ self.dataset = dataset
62
+ self.data_dir = data_dir
63
+
64
+ # Accept exact dataset folder or a variant with informative suffixes
65
+ # (e.g., dsXXXXX-bdf, dsXXXXX-bdf-mini) to avoid collisions.
66
+ dir_name = self.bidsdir.name
67
+ if not (dir_name == self.dataset or dir_name.startswith(self.dataset + "-")):
68
+ raise AssertionError(
69
+ f"BIDS directory '{dir_name}' does not correspond to dataset '{self.dataset}'"
70
+ )
71
+
72
+ # Initialize BIDS paths using fast mne_bids approach instead of pybids
73
+ self._init_bids_paths()
74
+
75
+ # get all recording files in the bids directory
76
+ assert len(self.files) > 0, ValueError(
77
+ "Unable to construct EEG dataset. No EEG recordings found."
78
+ )
79
+ assert self.check_eeg_dataset(), ValueError("Dataset is not an EEG dataset.")
80
+
81
+ def check_eeg_dataset(self) -> bool:
82
+ """Check if the BIDS dataset contains EEG data.
83
+
84
+ Returns
85
+ -------
86
+ bool
87
+ True if the dataset's modality is EEG, False otherwise.
88
+
89
+ """
90
+ return self.get_bids_file_attribute("modality", self.files[0]).lower() == "eeg"
91
+
92
+ def _init_bids_paths(self) -> None:
93
+ """Initialize BIDS file paths using mne_bids for fast discovery.
94
+
95
+ Uses mne_bids.find_matching_paths() for efficient pattern-based file
96
+ discovery instead of heavy pybids BIDSLayout indexing.
97
+ """
98
+ # Initialize cache for BIDSPath objects
99
+ self._bids_path_cache = {}
100
+
101
+ # Find all EEG recordings using pattern matching (fast!)
102
+ self.files = []
103
+ for ext in self.RAW_EXTENSIONS.keys():
104
+ # find_matching_paths returns BIDSPath objects
105
+ paths = find_matching_paths(self.bidsdir, datatypes="eeg", extensions=ext)
106
+ if paths:
107
+ # Convert BIDSPath objects to filename strings
108
+ self.files = [str(p.fpath) for p in paths]
109
+ break
110
+
111
+ def _get_bids_path_from_file(self, data_filepath: str):
112
+ """Get a BIDSPath object for a data file with caching.
113
+
114
+ Parameters
115
+ ----------
116
+ data_filepath : str
117
+ The path to the data file.
118
+
119
+ Returns
120
+ -------
121
+ BIDSPath
122
+ The BIDSPath object for the file.
123
+
124
+ """
125
+ if data_filepath not in self._bids_path_cache:
126
+ # Parse the filename to extract BIDS entities
127
+ filepath = Path(data_filepath)
128
+ filename = filepath.name
129
+
130
+ # Extract entities from filename using BIDS pattern
131
+ # Expected format: sub-<label>[_ses-<label>][_task-<label>][_run-<label>]_eeg.<ext>
132
+ subject = re.search(r"sub-([^_]*)", filename)
133
+ session = re.search(r"ses-([^_]*)", filename)
134
+ task = re.search(r"task-([^_]*)", filename)
135
+ run = re.search(r"run-([^_]*)", filename)
136
+
137
+ bids_path = BIDSPath(
138
+ subject=subject.group(1) if subject else None,
139
+ session=session.group(1) if session else None,
140
+ task=task.group(1) if task else None,
141
+ run=int(run.group(1)) if run else None,
142
+ datatype="eeg",
143
+ extension=filepath.suffix,
144
+ root=self.bidsdir,
145
+ )
146
+ self._bids_path_cache[data_filepath] = bids_path
147
+
148
+ return self._bids_path_cache[data_filepath]
149
+
150
+ def _get_json_with_inheritance(
151
+ self, data_filepath: str, json_filename: str
152
+ ) -> dict:
153
+ """Get JSON metadata with BIDS inheritance handling.
154
+
155
+ Walks up the directory tree to find and merge JSON files following
156
+ BIDS inheritance principles.
157
+
158
+ Parameters
159
+ ----------
160
+ data_filepath : str
161
+ The path to the data file.
162
+ json_filename : str
163
+ The name of the JSON file to find (e.g., "eeg.json").
164
+
165
+ Returns
166
+ -------
167
+ dict
168
+ The merged JSON metadata.
169
+
170
+ """
171
+ json_dict = {}
172
+ current_dir = Path(data_filepath).parent
173
+ root_dir = self.bidsdir
174
+
175
+ # Walk up from file directory to root, collecting JSON files
176
+ while current_dir >= root_dir:
177
+ json_path = current_dir / json_filename
178
+ if json_path.exists():
179
+ with open(json_path) as f:
180
+ json_dict.update(json.load(f))
181
+
182
+ # Stop at BIDS root (contains dataset_description.json)
183
+ if (current_dir / "dataset_description.json").exists():
184
+ break
185
+
186
+ current_dir = current_dir.parent
187
+
188
+ return json_dict
189
+
190
+ def _merge_json_inheritance(self, json_files: list[str | Path]) -> dict:
191
+ """Merge a list of JSON files according to BIDS inheritance."""
192
+ json_files.reverse()
193
+ json_dict = {}
194
+ for f in json_files:
195
+ with open(f) as fp:
196
+ json_dict.update(json.load(fp))
197
+ return json_dict
198
+
199
+ def _get_bids_file_inheritance(
200
+ self, path: str | Path, basename: str, extension: str
201
+ ) -> list[Path]:
202
+ """Find all applicable metadata files using BIDS inheritance."""
203
+ top_level_files = ["README", "dataset_description.json", "participants.tsv"]
204
+ bids_files = []
205
+
206
+ if isinstance(path, str):
207
+ path = Path(path)
208
+ if not path.exists():
209
+ raise ValueError(f"path {path} does not exist")
210
+
211
+ for file in os.listdir(path):
212
+ if os.path.isfile(path / file) and file.endswith(extension):
213
+ bids_files.append(path / file)
214
+
215
+ if any(file in os.listdir(path) for file in top_level_files):
216
+ return bids_files
217
+ else:
218
+ bids_files.extend(
219
+ self._get_bids_file_inheritance(path.parent, basename, extension)
220
+ )
221
+ return bids_files
222
+
223
+ def get_bids_metadata_files(
224
+ self, filepath: str | Path, metadata_file_extension: str
225
+ ) -> list[Path]:
226
+ """Retrieve all metadata files that apply to a given data file.
227
+
228
+ Follows the BIDS inheritance principle to find all relevant metadata
229
+ files (e.g., ``channels.tsv``, ``eeg.json``) for a specific recording.
230
+
231
+ Parameters
232
+ ----------
233
+ filepath : str or Path
234
+ The path to the data file.
235
+ metadata_file_extension : str
236
+ The extension of the metadata file to search for (e.g., "channels.tsv").
237
+
238
+ Returns
239
+ -------
240
+ list of Path
241
+ A list of paths to the matching metadata files.
242
+
243
+ """
244
+ if isinstance(filepath, str):
245
+ filepath = Path(filepath)
246
+ if not filepath.exists():
247
+ raise ValueError(f"filepath {filepath} does not exist")
248
+ path, filename = os.path.split(filepath)
249
+ basename = filename[: filename.rfind("_")]
250
+ meta_files = self._get_bids_file_inheritance(
251
+ path, basename, metadata_file_extension
252
+ )
253
+ return meta_files
254
+
255
+ def get_files(self) -> list[str]:
256
+ """Get all EEG recording file paths in the BIDS dataset.
257
+
258
+ Returns
259
+ -------
260
+ list of str
261
+ A list of file paths for all valid EEG recordings.
262
+
263
+ """
264
+ return self.files
265
+
266
+ def get_bids_file_attribute(self, attribute: str, data_filepath: str) -> Any:
267
+ """Retrieve a specific attribute from BIDS metadata.
268
+
269
+ Parameters
270
+ ----------
271
+ attribute : str
272
+ The name of the attribute to retrieve (e.g., "sfreq", "subject").
273
+ data_filepath : str
274
+ The path to the data file.
275
+
276
+ Returns
277
+ -------
278
+ Any
279
+ The value of the requested attribute, or None if not found.
280
+
281
+ """
282
+ bids_path = self._get_bids_path_from_file(data_filepath)
283
+
284
+ # Direct BIDSPath properties for entities
285
+ direct_attrs = {
286
+ "subject": bids_path.subject,
287
+ "session": bids_path.session,
288
+ "task": bids_path.task,
289
+ "run": bids_path.run,
290
+ "modality": bids_path.datatype,
291
+ }
292
+
293
+ if attribute in direct_attrs:
294
+ return direct_attrs[attribute]
295
+
296
+ # For JSON-based attributes, read and cache eeg.json
297
+ eeg_json = self._get_json_with_inheritance(data_filepath, "eeg.json")
298
+ json_attrs = {
299
+ "sfreq": eeg_json.get("SamplingFrequency"),
300
+ "ntimes": eeg_json.get("RecordingDuration"),
301
+ "nchans": eeg_json.get("EEGChannelCount"),
302
+ }
303
+
304
+ return json_attrs.get(attribute)
305
+
306
+ def channel_labels(self, data_filepath: str) -> list[str]:
307
+ """Get a list of channel labels from channels.tsv.
308
+
309
+ Parameters
310
+ ----------
311
+ data_filepath : str
312
+ The path to the data file.
313
+
314
+ Returns
315
+ -------
316
+ list of str
317
+ A list of channel names.
318
+
319
+ """
320
+ # Find channels.tsv in the same directory as the data file
321
+ # It can be named either "channels.tsv" or "*_channels.tsv"
322
+ filepath = Path(data_filepath)
323
+ parent_dir = filepath.parent
324
+
325
+ # Try the standard channels.tsv first
326
+ channels_tsv_path = parent_dir / "channels.tsv"
327
+ if not channels_tsv_path.exists():
328
+ # Try to find *_channels.tsv matching the filename prefix
329
+ base_name = filepath.stem # filename without extension
330
+ for tsv_file in parent_dir.glob("*_channels.tsv"):
331
+ # Check if it matches by looking at task/run components
332
+ tsv_name = tsv_file.stem.replace("_channels", "")
333
+ if base_name.startswith(tsv_name):
334
+ channels_tsv_path = tsv_file
335
+ break
336
+
337
+ if not channels_tsv_path.exists():
338
+ raise FileNotFoundError(f"No channels.tsv found for {data_filepath}")
339
+
340
+ channels_tsv = pd.read_csv(channels_tsv_path, sep="\t")
341
+ return channels_tsv["name"].tolist()
342
+
343
+ def channel_types(self, data_filepath: str) -> list[str]:
344
+ """Get a list of channel types from channels.tsv.
345
+
346
+ Parameters
347
+ ----------
348
+ data_filepath : str
349
+ The path to the data file.
350
+
351
+ Returns
352
+ -------
353
+ list of str
354
+ A list of channel types.
355
+
356
+ """
357
+ # Find channels.tsv in the same directory as the data file
358
+ # It can be named either "channels.tsv" or "*_channels.tsv"
359
+ filepath = Path(data_filepath)
360
+ parent_dir = filepath.parent
361
+
362
+ # Try the standard channels.tsv first
363
+ channels_tsv_path = parent_dir / "channels.tsv"
364
+ if not channels_tsv_path.exists():
365
+ # Try to find *_channels.tsv matching the filename prefix
366
+ base_name = filepath.stem # filename without extension
367
+ for tsv_file in parent_dir.glob("*_channels.tsv"):
368
+ # Check if it matches by looking at task/run components
369
+ tsv_name = tsv_file.stem.replace("_channels", "")
370
+ if base_name.startswith(tsv_name):
371
+ channels_tsv_path = tsv_file
372
+ break
373
+
374
+ if not channels_tsv_path.exists():
375
+ raise FileNotFoundError(f"No channels.tsv found for {data_filepath}")
376
+
377
+ channels_tsv = pd.read_csv(channels_tsv_path, sep="\t")
378
+ return channels_tsv["type"].tolist()
379
+
380
+ def num_times(self, data_filepath: str) -> int:
381
+ """Get the number of time points in the recording.
382
+
383
+ Calculated from ``SamplingFrequency`` and ``RecordingDuration`` in eeg.json.
384
+
385
+ Parameters
386
+ ----------
387
+ data_filepath : str
388
+ The path to the data file.
389
+
390
+ Returns
391
+ -------
392
+ int
393
+ The approximate number of time points.
394
+
395
+ """
396
+ eeg_json_dict = self._get_json_with_inheritance(data_filepath, "eeg.json")
397
+ return int(
398
+ eeg_json_dict.get("SamplingFrequency", 0)
399
+ * eeg_json_dict.get("RecordingDuration", 0)
400
+ )
401
+
402
+ def subject_participant_tsv(self, data_filepath: str) -> dict[str, Any]:
403
+ """Get the participants.tsv record for a subject.
404
+
405
+ Parameters
406
+ ----------
407
+ data_filepath : str
408
+ The path to a data file belonging to the subject.
409
+
410
+ Returns
411
+ -------
412
+ dict
413
+ A dictionary of the subject's information from participants.tsv.
414
+
415
+ """
416
+ participants_tsv_path = self.get_bids_metadata_files(
417
+ data_filepath, "participants.tsv"
418
+ )[0]
419
+ participants_tsv = pd.read_csv(participants_tsv_path, sep="\t")
420
+ if participants_tsv.empty:
421
+ return {}
422
+ participants_tsv.set_index("participant_id", inplace=True)
423
+ subject = f"sub-{self.get_bids_file_attribute('subject', data_filepath)}"
424
+ return participants_tsv.loc[subject].to_dict()
425
+
426
+ def eeg_json(self, data_filepath: str) -> dict[str, Any]:
427
+ """Get the merged eeg.json metadata for a data file.
428
+
429
+ Parameters
430
+ ----------
431
+ data_filepath : str
432
+ The path to the data file.
433
+
434
+ Returns
435
+ -------
436
+ dict
437
+ The merged eeg.json metadata.
438
+
439
+ """
440
+ return self._get_json_with_inheritance(data_filepath, "eeg.json")
441
+
442
+
443
+ __all__ = ["EEGBIDSDataset"]