eegdash 0.3.9.dev182388821__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eegdash/__init__.py +12 -1
- eegdash/api.py +297 -295
- eegdash/bids_eeg_metadata.py +297 -56
- eegdash/const.py +43 -0
- eegdash/data_utils.py +327 -430
- eegdash/dataset/__init__.py +19 -1
- eegdash/dataset/dataset.py +61 -33
- eegdash/dataset/dataset_summary.csv +255 -256
- eegdash/dataset/registry.py +163 -11
- eegdash/downloader.py +197 -0
- eegdash/features/datasets.py +323 -138
- eegdash/features/decorators.py +88 -3
- eegdash/features/extractors.py +203 -55
- eegdash/features/feature_bank/complexity.py +7 -3
- eegdash/features/feature_bank/dimensionality.py +1 -1
- eegdash/features/inspect.py +80 -5
- eegdash/features/serialization.py +49 -17
- eegdash/features/utils.py +75 -8
- eegdash/hbn/__init__.py +11 -0
- eegdash/hbn/preprocessing.py +61 -19
- eegdash/hbn/windows.py +157 -34
- eegdash/logging.py +54 -0
- eegdash/mongodb.py +55 -24
- eegdash/paths.py +28 -5
- eegdash/utils.py +29 -1
- {eegdash-0.3.9.dev182388821.dist-info → eegdash-0.4.0.dist-info}/METADATA +11 -59
- eegdash-0.4.0.dist-info/RECORD +37 -0
- eegdash-0.3.9.dev182388821.dist-info/RECORD +0 -35
- {eegdash-0.3.9.dev182388821.dist-info → eegdash-0.4.0.dist-info}/WHEEL +0 -0
- {eegdash-0.3.9.dev182388821.dist-info → eegdash-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {eegdash-0.3.9.dev182388821.dist-info → eegdash-0.4.0.dist-info}/top_level.txt +0 -0
eegdash/data_utils.py
CHANGED
@@ -1,10 +1,19 @@
+# Authors: The EEGDash contributors.
+# License: GNU General Public License
+# Copyright the EEGDash contributors.
+
+"""Data utilities and dataset classes for EEG data handling.
+
+This module provides core dataset classes for working with EEG data in the EEGDash ecosystem,
+including classes for individual recordings and collections of datasets. It integrates with
+braindecode for machine learning workflows and handles data loading from both local and remote sources.
+"""
+
 import io
 import json
-import logging
 import os
 import re
 import traceback
-import warnings
 from contextlib import redirect_stderr
 from pathlib import Path
 from typing import Any
@@ -13,26 +22,41 @@ import mne
 import mne_bids
 import numpy as np
 import pandas as pd
-import s3fs
-from bids import BIDSLayout
-from fsspec.callbacks import TqdmCallback
 from joblib import Parallel, delayed
 from mne._fiff.utils import _read_segments_file
 from mne.io import BaseRaw
+from mne.utils.check import _soft_import
 from mne_bids import BIDSPath
 
 from braindecode.datasets import BaseDataset
 
+from . import downloader
+from .bids_eeg_metadata import enrich_from_participants
+from .logging import logger
 from .paths import get_default_cache_dir
 
-logger = logging.getLogger("eegdash")
-
 
 class EEGDashBaseDataset(BaseDataset):
-    """A single EEG recording
+    """A single EEG recording dataset.
+
+    Represents a single EEG recording, typically hosted on a remote server (like AWS S3)
+    and cached locally upon first access. This class is a subclass of
+    :class:`braindecode.datasets.BaseDataset` and can be used with braindecode's
+    preprocessing and training pipelines.
+
+    Parameters
+    ----------
+    record : dict
+        A fully resolved metadata record for the data to load.
+    cache_dir : str
+        The local directory where the data will be cached.
+    s3_bucket : str, optional
+        The S3 bucket to download data from. If not provided, defaults to the
+        OpenNeuro bucket.
+    **kwargs
+        Additional keyword arguments passed to the
+        :class:`braindecode.datasets.BaseDataset` constructor.
 
-    This is a subclass of braindecode's BaseDataset, which can consequently be used in
-    conjunction with the preprocessing and training pipelines of braindecode.
     """
 
     _AWS_BUCKET = "s3://openneuro.org"
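Note: the class docstring above now documents the constructor parameters directly. As a hedged usage sketch (the record fields and paths below are hypothetical; in practice these objects are built for you by `EEGDashDataset` from a database query or a local BIDS folder):

    from eegdash.data_utils import EEGDashBaseDataset

    record = {  # minimal metadata record, hypothetical values
        "dataset": "ds002718",
        "bidspath": "ds002718/sub-012/eeg/sub-012_task-rest_eeg.set",
        "bidsdependencies": [],
        "subject": "012", "session": None, "task": "rest", "run": None,
    }
    ds = EEGDashBaseDataset(record, cache_dir="~/eegdash_cache")
    raw = ds.raw  # first access downloads from s3://openneuro.org and caches locally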
@@ -44,20 +68,6 @@ class EEGDashBaseDataset(BaseDataset):
         s3_bucket: str | None = None,
         **kwargs,
     ):
-        """Create a new EEGDashBaseDataset instance. Users do not usually need to call this
-        directly -- instead use the EEGDashDataset class to load a collection of these
-        recordings from a local BIDS folder or using a database query.
-
-        Parameters
-        ----------
-        record : dict
-            A fully resolved metadata record for the data to load.
-        cache_dir : str
-            A local directory where the data will be cached.
-        kwargs : dict
-            Additional keyword arguments to pass to the BaseDataset constructor.
-
-        """
         super().__init__(None, **kwargs)
         self.record = record
         self.cache_dir = Path(cache_dir)
@@ -73,6 +83,7 @@ class EEGDashBaseDataset(BaseDataset):
         # Compute a dataset folder name under cache_dir that encodes preprocessing
         # (e.g., bdf, mini) to avoid overlapping with the original dataset cache.
         self.dataset_folder = record.get("dataset", "")
+        # TODO: remove this hack when competition is over
         if s3_bucket:
             suffixes: list[str] = []
             bucket_lower = str(s3_bucket).lower()
@@ -91,6 +102,7 @@ class EEGDashBaseDataset(BaseDataset):
             rel = Path(self.dataset_folder) / rel
         self.filecache = self.cache_dir / rel
         self.bids_root = self.cache_dir / self.dataset_folder
+
         self.bidspath = BIDSPath(
             root=self.bids_root,
             datatype="eeg",
@@ -98,122 +110,25 @@ class EEGDashBaseDataset(BaseDataset):
             **self.bids_kwargs,
         )
 
-        self.s3file = self._get_s3path(record["bidspath"])
+        self.s3file = downloader.get_s3path(self.s3_bucket, record["bidspath"])
         self.bids_dependencies = record["bidsdependencies"]
-
-        #
+        self.bids_dependencies_original = record["bidsdependencies"]
+        # TODO: removing temporary fix for BIDS dependencies path
+        # when the competition is over and dataset is digested properly
         if not self.s3_open_neuro:
-            self.bids_dependencies_original = self.bids_dependencies
             self.bids_dependencies = [
                 dep.split("/", 1)[1] for dep in self.bids_dependencies
             ]
 
         self._raw = None
 
-    def _get_s3path(self, filepath: str) -> str:
-        """Helper to form an AWS S3 URI for the given relative filepath."""
-        return f"{self.s3_bucket}/{filepath}"
-
-    def _download_s3(self) -> None:
-        """Download function that gets the raw EEG data from S3."""
-        filesystem = s3fs.S3FileSystem(
-            anon=True, client_kwargs={"region_name": "us-east-2"}
-        )
-        if not self.s3_open_neuro:
-            self.s3file = re.sub(r"(^|/)ds\d{6}/", r"\1", self.s3file, count=1)
-            if self.s3file.endswith(".set"):
-                self.s3file = self.s3file[:-4] + ".bdf"
-                self.filecache = self.filecache.with_suffix(".bdf")
-
-        self.filecache.parent.mkdir(parents=True, exist_ok=True)
-        info = filesystem.info(self.s3file)
-        size = info.get("size") or info.get("Size")
-
-        callback = TqdmCallback(
-            size=size,
-            tqdm_kwargs=dict(
-                desc=f"Downloading {Path(self.s3file).name}",
-                unit="B",
-                unit_scale=True,
-                unit_divisor=1024,
-                dynamic_ncols=True,
-                leave=True,
-                mininterval=0.2,
-                smoothing=0.1,
-                miniters=1,
-                bar_format="{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} "
-                "[{elapsed}<{remaining}, {rate_fmt}]",
-            ),
-        )
-        filesystem.get(self.s3file, self.filecache, callback=callback)
-
-        self.filenames = [self.filecache]
-
-    def _download_dependencies(self) -> None:
-        """Download all BIDS dependency files (metadata files, recording sidecar files)
-        from S3 and cache them locally.
-        """
-        filesystem = s3fs.S3FileSystem(
-            anon=True, client_kwargs={"region_name": "us-east-2"}
-        )
-        for i, dep in enumerate(self.bids_dependencies):
-            if not self.s3_open_neuro:
-                # fix this when our bucket is integrated into the
-                # mongodb
-                # if the file have ".set" replace to ".bdf"
-                if dep.endswith(".set"):
-                    dep = dep[:-4] + ".bdf"
-
-            s3path = self._get_s3path(dep)
-            if not self.s3_open_neuro:
-                dep = self.bids_dependencies_original[i]
-
-            dep_path = Path(dep)
-            if dep_path.parts and dep_path.parts[0] == self.record.get("dataset"):
-                dep_local = Path(self.dataset_folder, *dep_path.parts[1:])
-            else:
-                dep_local = Path(self.dataset_folder) / dep_path
-            filepath = self.cache_dir / dep_local
-            if not self.s3_open_neuro:
-                if filepath.suffix == ".set":
-                    filepath = filepath.with_suffix(".bdf")
-                if self.filecache.suffix == ".set":
-                    self.filecache = self.filecache.with_suffix(".bdf")
-
-            # here, we download the dependency and it is fine
-            # in the case of the competition.
-            if not filepath.exists():
-                filepath.parent.mkdir(parents=True, exist_ok=True)
-                info = filesystem.info(s3path)
-                size = info.get("size") or info.get("Size")
-
-                callback = TqdmCallback(
-                    size=size,
-                    tqdm_kwargs=dict(
-                        desc=f"Downloading {Path(s3path).name}",
-                        unit="B",
-                        unit_scale=True,
-                        unit_divisor=1024,
-                        dynamic_ncols=True,
-                        leave=True,
-                        mininterval=0.2,
-                        smoothing=0.1,
-                        miniters=1,
-                        bar_format="{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} "
-                        "[{elapsed}<{remaining}, {rate_fmt}]",
-                    ),
-                )
-                filesystem.get(s3path, filepath, callback=callback)
-
     def _get_raw_bids_args(self) -> dict[str, Any]:
-        """
-        recording.
-        """
+        """Extract BIDS-related arguments from the metadata record."""
         desired_fields = ["subject", "session", "task", "run"]
         return {k: self.record[k] for k in desired_fields if self.record[k]}
 
     def _ensure_raw(self) -> None:
-        """
+        """Ensure the raw data file and its dependencies are cached locally."""
         # TO-DO: remove this once is fixed on the our side
         # for the competition
         if not self.s3_open_neuro:
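Note: the S3 helpers removed here moved to the new eegdash/downloader.py (added in this release, +197 lines). Judging from the removed `_get_s3path` and the call sites, `downloader.get_s3path` keeps the same trivial contract; a minimal sketch of that assumption:

    def get_s3path(s3_bucket: str, filepath: str) -> str:
        # mirrors the removed EEGDashBaseDataset._get_s3path helper
        return f"{s3_bucket}/{filepath}"

    # expected behavior at the call site above
    assert (
        get_s3path("s3://openneuro.org", "ds002718/sub-012_eeg.set")
        == "s3://openneuro.org/ds002718/sub-012_eeg.set"
    )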
@@ -222,130 +137,43 @@ class EEGDashBaseDataset(BaseDataset):
 
         if not os.path.exists(self.filecache):  # not preload
             if self.bids_dependencies:
-                self._download_dependencies()
-            self._download_s3()
+                downloader.download_dependencies(
+                    s3_bucket=self.s3_bucket,
+                    bids_dependencies=self.bids_dependencies,
+                    bids_dependencies_original=self.bids_dependencies_original,
+                    cache_dir=self.cache_dir,
+                    dataset_folder=self.dataset_folder,
+                    record=self.record,
+                    s3_open_neuro=self.s3_open_neuro,
+                )
+            self.filecache = downloader.download_s3_file(
+                self.s3file, self.filecache, self.s3_open_neuro
+            )
+            self.filenames = [self.filecache]
         if self._raw is None:
-            # capturing any warnings
-            # to-do: remove this once is fixed on the mne-bids side.
-            with warnings.catch_warnings(record=True) as w:
-                # Ensure all warnings are captured into 'w' and not shown to users
-                warnings.simplefilter("always")
-                try:
-                    # mne-bids emits RuntimeWarnings to stderr; silence stderr during read
-                    _stderr_buffer = io.StringIO()
-                    with redirect_stderr(_stderr_buffer):
-                        self._raw = mne_bids.read_raw_bids(
-                            bids_path=self.bidspath, verbose="ERROR"
-                        )
-                    # Parse unmapped participants.tsv fields reported by mne-bids and
-                    # inject them into Raw.info and the dataset description generically.
-                    extras = self._extract_unmapped_participants_from_warnings(w)
-                    if extras:
-                        # 1) Attach to Raw.info under subject_info.participants_extras
-                        try:
-                            subject_info = self._raw.info.get("subject_info") or {}
-                            if not isinstance(subject_info, dict):
-                                subject_info = {}
-                            pe = subject_info.get("participants_extras") or {}
-                            if not isinstance(pe, dict):
-                                pe = {}
-                            # Merge without overwriting
-                            for k, v in extras.items():
-                                pe.setdefault(k, v)
-                            subject_info["participants_extras"] = pe
-                            self._raw.info["subject_info"] = subject_info
-                        except Exception:
-                            # Non-fatal; continue
-                            pass
-
-                        # 2) Also add to this dataset's description, if possible, so
-                        # targets can be selected later without naming specifics.
-                        try:
-                            if isinstance(self.description, dict):
-                                for k, v in extras.items():
-                                    self.description.setdefault(k, v)
-                            elif isinstance(self.description, pd.Series):
-                                for k, v in extras.items():
-                                    if k not in self.description.index:
-                                        self.description.loc[k] = v
-                        except Exception:
-                            pass
-                except Exception as e:
-                    logger.error(
-                        f"Error while reading BIDS file: {self.bidspath}\n"
-                        "This may be due to a missing or corrupted file.\n"
-                        "Please check the file and try again."
-                    )
-                    logger.error(f"Exception: {e}")
-                    logger.error(traceback.format_exc())
-                    raise e
-                # Filter noisy mapping notices from mne-bids; surface others
-                for captured_warning in w:
-                    try:
-                        msg = str(captured_warning.message)
-                    except Exception:
-                        continue
-                    # Suppress verbose participants mapping messages
-                    if "Unable to map the following column" in msg and "MNE" in msg:
-                        logger.debug(
-                            "Suppressed mne-bids mapping warning while reading BIDS file: %s",
-                            msg,
-                        )
-                        continue
-
-    def _extract_unmapped_participants_from_warnings(
-        self, warnings_list: list[Any]
-    ) -> dict[str, Any]:
-        """Scan captured warnings from mne-bids and extract unmapped participants.tsv
-        entries in a generic way.
-
-        Optionally, the column name can carry a note in parentheses that we ignore
-        for key/value extraction. Returns a mapping of column name -> raw value.
-        """
-        extras: dict[str, Any] = {}
-        header = "Unable to map the following column(s) to MNE:"
-        for wr in warnings_list:
-            try:
-                msg = str(wr.message)
-            except Exception:
-                continue
-            if header not in msg:
-                continue
-            lines = msg.splitlines()
-            # Find the header line, then parse subsequent lines as entries
             try:
-                [... 22 removed lines lost in the diff rendering ...]
-        """Main function to access a sample from the dataset."""
-        X = self.raw[:, index][0]
-        y = None
-        if self.target_name is not None:
-            y = self.description[self.target_name]
-            if isinstance(y, pd.Series):
-                y = y.to_list()
-        if self.transform is not None:
-            X = self.transform(X)
-        return X, y
+                # mne-bids can emit noisy warnings to stderr; keep user logs clean
+                _stderr_buffer = io.StringIO()
+                with redirect_stderr(_stderr_buffer):
+                    self._raw = mne_bids.read_raw_bids(
+                        bids_path=self.bidspath, verbose="ERROR"
+                    )
+                    # Enrich Raw.info and description with participants.tsv extras
+                    enrich_from_participants(
+                        self.bids_root, self.bidspath, self._raw, self.description
+                    )
+
+            except Exception as e:
+                logger.error(
+                    f"Error while reading BIDS file: {self.bidspath}\n"
+                    "This may be due to a missing or corrupted file.\n"
+                    "Please check the file and try again.\n"
+                    "Usually erasing the local cache and re-downloading helps.\n"
+                    f"`rm {self.bidspath}`"
+                )
+                logger.error(f"Exception: {e}")
+                logger.error(traceback.format_exc())
+                raise e
 
     def __len__(self) -> int:
         """Return the number of samples in the dataset."""
@@ -362,42 +190,53 @@ class EEGDashBaseDataset(BaseDataset):
         return len(self._raw)
 
     @property
-    def raw(self):
-        """
-
+    def raw(self) -> BaseRaw:
+        """The MNE Raw object for this recording.
+
+        Accessing this property triggers the download and caching of the data
+        if it has not been accessed before.
+
+        Returns
+        -------
+        mne.io.BaseRaw
+            The loaded MNE Raw object.
+
         """
         if self._raw is None:
             self._ensure_raw()
         return self._raw
 
     @raw.setter
-    def raw(self, raw):
+    def raw(self, raw: BaseRaw):
         self._raw = raw
 
 
 class EEGDashBaseRaw(BaseRaw):
-    """
-
+    """MNE BaseRaw wrapper for automatic S3 data fetching.
+
+    This class extends :class:`mne.io.BaseRaw` to automatically fetch data
+    from an S3 bucket and cache it locally when data is first accessed.
+    It is intended for internal use within the EEGDash ecosystem.
 
     Parameters
     ----------
-    input_fname :
-
+    input_fname : str
+        The path to the file on the S3 bucket (relative to the bucket root).
     metadata : dict
-        The metadata record for the recording
-        [... removed lines lost in the diff rendering ...]
-        alongside the main recording
-    verbose : str
-
+        The metadata record for the recording, containing information like
+        sampling frequency, channel names, etc.
+    preload : bool, default False
+        If True, preload the data into memory.
+    cache_dir : str, optional
+        Local directory for caching data. If None, a default directory is used.
+    bids_dependencies : list of str, default []
+        A list of BIDS metadata files to download alongside the main recording.
+    verbose : str, int, or None, default None
+        The MNE verbosity level.
 
     See Also
     --------
-    mne.io.Raw :
+    mne.io.Raw : The base class for Raw objects in MNE.
 
     """
 
@@ -413,7 +252,6 @@ class EEGDashBaseRaw(BaseRaw):
         bids_dependencies: list[str] = [],
         verbose: Any = None,
     ):
-        """Get to work with S3 endpoint first, no caching"""
         # Create a simple RawArray
         sfreq = metadata["sfreq"]  # Sampling frequency
         n_times = metadata["n_times"]
@@ -426,13 +264,16 @@ class EEGDashBaseRaw(BaseRaw):
             ch_types.append(chtype)
         info = mne.create_info(ch_names=ch_names, sfreq=sfreq, ch_types=ch_types)
 
-        self.s3file = self._get_s3path(input_fname)
+        self.s3file = downloader.get_s3path(self._AWS_BUCKET, input_fname)
         self.cache_dir = Path(cache_dir) if cache_dir else get_default_cache_dir()
         self.filecache = self.cache_dir / input_fname
         self.bids_dependencies = bids_dependencies
 
         if preload and not os.path.exists(self.filecache):
-            self._download_s3()
+            self.filecache = downloader.download_s3_file(
+                self.s3file, self.filecache, self.s3_open_neuro
+            )
+            self.filenames = [self.filecache]
             preload = self.filecache
 
         super().__init__(
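Note: the preload branch above and the `raw` property earlier implement the same lazy-download pattern: nothing is fetched until the data is actually needed. A minimal sketch of the pattern, detached from S3:

    class LazyRecording:
        def __init__(self):
            self._raw = None

        def _ensure_raw(self):
            # placeholder for the download-and-parse step
            self._raw = "downloaded-and-parsed-data"

        @property
        def raw(self):
            if self._raw is None:  # first access triggers the fetch
                self._ensure_raw()
            return self._raw

    assert LazyRecording().raw == "downloaded-and-parsed-data"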
@@ -443,56 +284,47 @@ class EEGDashBaseRaw(BaseRaw):
             verbose=verbose,
         )
 
-    def _get_s3path(self, filepath):
-        return f"{self._AWS_BUCKET}/{filepath}"
-
-    def _download_s3(self) -> None:
-        self.filecache.parent.mkdir(parents=True, exist_ok=True)
-        filesystem = s3fs.S3FileSystem(
-            anon=True, client_kwargs={"region_name": "us-east-2"}
-        )
-        filesystem.download(self.s3file, self.filecache)
-        self.filenames = [self.filecache]
-
-    def _download_dependencies(self):
-        filesystem = s3fs.S3FileSystem(
-            anon=True, client_kwargs={"region_name": "us-east-2"}
-        )
-        for dep in self.bids_dependencies:
-            s3path = self._get_s3path(dep)
-            filepath = self.cache_dir / dep
-            if not filepath.exists():
-                filepath.parent.mkdir(parents=True, exist_ok=True)
-                filesystem.download(s3path, filepath)
-
     def _read_segment(
         self, start=0, stop=None, sel=None, data_buffer=None, *, verbose=None
     ):
+        """Read a segment of data, downloading if necessary."""
         if not os.path.exists(self.filecache):  # not preload
-            if self.bids_dependencies:
-                self._download_dependencies()
-            self._download_s3()
+            if self.bids_dependencies:  # this is use only to sidecars for now
+                downloader.download_dependencies(
+                    s3_bucket=self._AWS_BUCKET,
+                    bids_dependencies=self.bids_dependencies,
+                    bids_dependencies_original=None,
+                    cache_dir=self.cache_dir,
+                    dataset_folder=self.filecache,
+                    record={},
+                    s3_open_neuro=self.s3_open_neuro,
+                )
+            self.filecache = downloader.download_s3_file(
+                self.s3file, self.filecache, self.s3_open_neuro
+            )
+            self.filenames = [self.filecache]
         else:  # not preload and file is not cached
             self.filenames = [self.filecache]
         return super()._read_segment(start, stop, sel, data_buffer, verbose=verbose)
 
     def _read_segment_file(self, data, idx, fi, start, stop, cals, mult):
-        """Read a chunk of data from
+        """Read a chunk of data from a local file."""
         _read_segments_file(self, data, idx, fi, start, stop, cals, mult, dtype="<f4")
 
 
 class EEGBIDSDataset:
-    """
+    """An interface to a local BIDS dataset containing EEG recordings.
 
-    This
-
+    This class centralizes interactions with a BIDS dataset on the local
+    filesystem, providing methods to parse metadata, find files, and
+    retrieve BIDS-related information.
 
     Parameters
     ----------
-    data_dir : str
+    data_dir : str or Path
         The path to the local BIDS dataset directory.
     dataset : str
-        A name for the dataset.
+        A name for the dataset (e.g., "ds002718").
 
     """
 
@@ -516,6 +348,14 @@ class EEGBIDSDataset:
         data_dir=None,  # location of bids dataset
         dataset="",  # dataset name
     ):
+        bids_lib = _soft_import("bids", purpose="digestion of datasets", strict=False)
+
+        if bids_lib is None:
+            raise ImportError(
+                "The 'pybids' package is required to use EEGBIDSDataset. "
+                "Please install it via 'pip install eegdash[digestion]'."
+            )
+
         if data_dir is None or not os.path.exists(data_dir):
             raise ValueError("data_dir must be specified and must exist")
         self.bidsdir = Path(data_dir)
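Note: `_soft_import` is a private MNE helper; with `strict=False` it returns the module or None instead of raising. For readers who want the same optional-dependency guard without relying on a private API, a standard-library equivalent:

    import importlib

    def soft_import(name: str):
        """Return the imported module, or None if it is not installed."""
        try:
            return importlib.import_module(name)
        except ImportError:
            return None

    bids_lib = soft_import("bids")  # pybids
    if bids_lib is None:
        raise ImportError("pip install eegdash[digestion]")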
@@ -527,7 +367,7 @@ class EEGBIDSDataset:
             raise AssertionError(
                 f"BIDS directory '{dir_name}' does not correspond to dataset '{self.dataset}'"
             )
-        self.layout = BIDSLayout(data_dir)
+        self.layout = bids_lib.BIDSLayout(data_dir)
 
         # get all recording files in the bids directory
         self.files = self._get_recordings(self.layout)
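Note: with the layout in place, the class is ready to answer metadata queries. A hedged usage sketch (the dataset path is hypothetical, and pybids must be installed):

    from eegdash.data_utils import EEGBIDSDataset

    bids = EEGBIDSDataset(data_dir="/data/ds002718", dataset="ds002718")
    for f in bids.get_files()[:3]:
        print(f, bids.channel_labels(f)[:5])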
@@ -537,10 +377,17 @@ class EEGBIDSDataset:
         assert self.check_eeg_dataset(), ValueError("Dataset is not an EEG dataset.")
 
     def check_eeg_dataset(self) -> bool:
-        """Check if the dataset
+        """Check if the BIDS dataset contains EEG data.
+
+        Returns
+        -------
+        bool
+            True if the dataset's modality is EEG, False otherwise.
+
+        """
         return self.get_bids_file_attribute("modality", self.files[0]).lower() == "eeg"
 
-    def _get_recordings(self, layout: BIDSLayout) -> list[str]:
+    def _get_recordings(self, layout) -> list[str]:
         """Get a list of all EEG recording files in the BIDS layout."""
         files = []
         for ext, exts in self.RAW_EXTENSIONS.items():
@@ -550,14 +397,12 @@ class EEGBIDSDataset:
         return files
 
     def _get_relative_bidspath(self, filename: str) -> str:
-        """Make
+        """Make a file path relative to the BIDS parent directory."""
         bids_parent_dir = self.bidsdir.parent.absolute()
         return str(Path(filename).relative_to(bids_parent_dir))
 
     def _get_property_from_filename(self, property: str, filename: str) -> str:
-        """Parse a
-        if not found.
-        """
+        """Parse a BIDS entity from a filename."""
         import platform
 
         if platform.system() == "Windows":
@@ -567,159 +412,106 @@ class EEGBIDSDataset:
         return lookup.group(1) if lookup else ""
 
     def _merge_json_inheritance(self, json_files: list[str | Path]) -> dict:
-        """
-        expecting the order (from left to right) is from lowest
-        level to highest level, and return a merged dictionary
-        """
+        """Merge a list of JSON files according to BIDS inheritance."""
         json_files.reverse()
         json_dict = {}
         for f in json_files:
-            json_dict.update(json.load(open(f)))
+            with open(f) as fp:
+                json_dict.update(json.load(fp))
         return json_dict
 
     def _get_bids_file_inheritance(
         self, path: str | Path, basename: str, extension: str
     ) -> list[Path]:
-        """
-        and that end with the specified suffix, recursively searching parent directories
-        (following the BIDS inheritance principle in the order of lowest level first).
-
-        Parameters
-        ----------
-        path : str | Path
-            The directory path to search for files.
-        basename : str
-            BIDS file basename without _eeg.set extension for example
-        extension : str
-            Only consider files that end with the specified suffix; e.g. channels.tsv
-
-        Returns
-        -------
-        list[Path]
-            A list of file paths that match the given basename and extension.
-
-        """
+        """Find all applicable metadata files using BIDS inheritance."""
         top_level_files = ["README", "dataset_description.json", "participants.tsv"]
         bids_files = []
 
-        # check if path is str object
         if isinstance(path, str):
             path = Path(path)
-        if not path.exists:
-            raise ValueError("path {path} does not exist")
+        if not path.exists():
+            raise ValueError(f"path {path} does not exist")
 
-        # check if file is in current path
         for file in os.listdir(path):
-            [... removed lines lost in the diff rendering ...]
-            # check if file basename has extension
-            if file.endswith(extension):
-                filepath = path / file
-                bids_files.append(filepath)
-
-        # check if file is in top level directory
+            if os.path.isfile(path / file) and file.endswith(extension):
+                bids_files.append(path / file)
+
         if any(file in os.listdir(path) for file in top_level_files):
             return bids_files
         else:
-            # call get_bids_file_inheritance recursively with parent directory
             bids_files.extend(
                 self._get_bids_file_inheritance(path.parent, basename, extension)
             )
             return bids_files
 
     def get_bids_metadata_files(
-        self, filepath: str | Path, metadata_file_extension:
+        self, filepath: str | Path, metadata_file_extension: str
     ) -> list[Path]:
-        """Retrieve all metadata
-
+        """Retrieve all metadata files that apply to a given data file.
+
+        Follows the BIDS inheritance principle to find all relevant metadata
+        files (e.g., ``channels.tsv``, ``eeg.json``) for a specific recording.
 
         Parameters
         ----------
-        filepath: str
-            The
+        filepath : str or Path
+            The path to the data file.
         metadata_file_extension : str
-
-            e.g., channels.tsv or eeg.json
+            The extension of the metadata file to search for (e.g., "channels.tsv").
 
         Returns
         -------
-        list
-            A list of
+        list of Path
+            A list of paths to the matching metadata files.
 
         """
         if isinstance(filepath, str):
             filepath = Path(filepath)
-        if not filepath.exists:
-            raise ValueError("filepath {filepath} does not exist")
+        if not filepath.exists():
+            raise ValueError(f"filepath {filepath} does not exist")
         path, filename = os.path.split(filepath)
         basename = filename[: filename.rfind("_")]
-        # metadata files
         meta_files = self._get_bids_file_inheritance(
             path, basename, metadata_file_extension
         )
         return meta_files
 
     def _scan_directory(self, directory: str, extension: str) -> list[Path]:
-        """
-        directory. Ignores certain special directories like .git, .datalad, derivatives,
-        and code.
-        """
+        """Scan a directory for files with a given extension."""
         result_files = []
         directory_to_ignore = [".git", ".datalad", "derivatives", "code"]
         with os.scandir(directory) as entries:
             for entry in entries:
                 if entry.is_file() and entry.name.endswith(extension):
-                    result_files.append(entry.path)
-                elif entry.is_dir()
-                    [... removed lines lost in the diff rendering ...]
+                    result_files.append(Path(entry.path))
+                elif entry.is_dir() and not any(
+                    name in entry.name for name in directory_to_ignore
+                ):
+                    result_files.append(Path(entry.path))
         return result_files
 
     def _get_files_with_extension_parallel(
         self, directory: str, extension: str = ".set", max_workers: int = -1
     ) -> list[Path]:
-        """
-        the given extension.
-
-        Parameters
-        ----------
-        directory : str
-            The root directory to scan for files.
-        extension : str
-            Only consider files that end with this suffix, e.g. '.set'.
-        max_workers : int
-            Optionally specify the maximum number of worker threads to use for parallel scanning.
-            Defaults to all available CPU cores if set to -1.
-
-        Returns
-        -------
-        list[Path]:
-            A list of filepaths for all matching metadata files
-
-        """
+        """Scan a directory tree in parallel for files with a given extension."""
         result_files = []
         dirs_to_scan = [directory]
 
-        # Use joblib.Parallel and delayed to parallelize directory scanning
         while dirs_to_scan:
             logger.info(
                 f"Directories to scan: {len(dirs_to_scan)}, files: {dirs_to_scan}"
             )
-            # Run the scan_directory function in parallel across directories
             results = Parallel(n_jobs=max_workers, prefer="threads", verbose=1)(
                 delayed(self._scan_directory)(d, extension) for d in dirs_to_scan
            )
 
-            # Reset the directories to scan and process the results
             dirs_to_scan = []
             for res in results:
                 for path in res:
                     if os.path.isdir(path):
-                        dirs_to_scan.append(path)
+                        dirs_to_scan.append(path)
                     else:
-                        result_files.append(path)
+                        result_files.append(path)
         logger.info(f"Found {len(result_files)} files.")
 
         return result_files
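Note: the merge order in `_merge_json_inheritance` above matters: files arrive most-specific first, so reversing the list and calling `update` repeatedly lets specific sidecars override generic ones. A self-contained demonstration of that ordering (values hypothetical):

    import json
    import tempfile

    payloads = [
        {"SamplingFrequency": 256},                            # most specific first
        {"SamplingFrequency": 500, "PowerLineFrequency": 50},  # most generic last
    ]
    files = []
    for payload in payloads:
        f = tempfile.NamedTemporaryFile("w", suffix=".json", delete=False)
        json.dump(payload, f)
        f.close()
        files.append(f.name)

    files.reverse()  # generic first, so later (more specific) updates win
    merged = {}
    for name in files:
        with open(name) as fp:
            merged.update(json.load(fp))
    assert merged == {"SamplingFrequency": 256, "PowerLineFrequency": 50}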
@@ -727,19 +519,29 @@ class EEGBIDSDataset:
     def load_and_preprocess_raw(
         self, raw_file: str, preprocess: bool = False
     ) -> np.ndarray:
-        """
-
-
+        """Load and optionally preprocess a raw data file.
+
+        This is a utility function for testing or debugging, not for general use.
+
+        Parameters
+        ----------
+        raw_file : str
+            Path to the raw EEGLAB file (.set).
+        preprocess : bool, default False
+            If True, apply a high-pass filter, notch filter, and resample the data.
+
+        Returns
+        -------
+        numpy.ndarray
+            The loaded and processed data as a NumPy array.
+
         """
         logger.info(f"Loading raw data from {raw_file}")
         EEG = mne.io.read_raw_eeglab(raw_file, preload=True, verbose="error")
 
         if preprocess:
-            # highpass filter
             EEG = EEG.filter(l_freq=0.25, h_freq=25, verbose=False)
-            # remove 60Hz line noise
             EEG = EEG.notch_filter(freqs=(60), verbose=False)
-            # bring to common sampling rate
             sfreq = 128
             if EEG.info["sfreq"] != sfreq:
                 EEG = EEG.resample(sfreq)
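Note: a hedged usage sketch for the debugging helper documented above (the .set path is hypothetical):

    from eegdash.data_utils import EEGBIDSDataset

    bids = EEGBIDSDataset(data_dir="/data/ds002718", dataset="ds002718")
    arr = bids.load_and_preprocess_raw(
        "/data/ds002718/sub-012/eeg/sub-012_task-rest_eeg.set", preprocess=True
    )
    print(arr.shape)  # channels x times; filtered, notch-filtered, resampled to 128 Hz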
@@ -750,26 +552,35 @@ class EEGBIDSDataset:
             raise ValueError("Expect raw data to be CxT dimension")
         return mat_data
 
-    def get_files(self) -> list[
-        """Get all EEG recording file paths
+    def get_files(self) -> list[str]:
+        """Get all EEG recording file paths in the BIDS dataset.
+
+        Returns
+        -------
+        list of str
+            A list of file paths for all valid EEG recordings.
+
+        """
         return self.files
 
     def resolve_bids_json(self, json_files: list[str]) -> dict:
-        """Resolve
+        """Resolve BIDS JSON inheritance and merge files.
 
         Parameters
         ----------
-        json_files : list
-            A list of JSON file paths
+        json_files : list of str
+            A list of JSON file paths, ordered from the lowest (most specific)
+            to highest level of the BIDS hierarchy.
 
         Returns
        -------
-
+        dict
+            A dictionary containing the merged JSON data.
 
         """
-        if
+        if not json_files:
             raise ValueError("No JSON files provided")
-        json_files.reverse()
+        json_files.reverse()
 
         json_dict = {}
         for json_file in json_files:
@@ -778,8 +589,20 @@ class EEGBIDSDataset:
         return json_dict
 
     def get_bids_file_attribute(self, attribute: str, data_filepath: str) -> Any:
-        """Retrieve a specific attribute from
-
+        """Retrieve a specific attribute from BIDS metadata.
+
+        Parameters
+        ----------
+        attribute : str
+            The name of the attribute to retrieve (e.g., "sfreq", "subject").
+        data_filepath : str
+            The path to the data file.
+
+        Returns
+        -------
+        Any
+            The value of the requested attribute, or None if not found.
+
         """
         entities = self.layout.parse_file_entities(data_filepath)
         bidsfile = self.layout.get(**entities)[0]
@@ -798,21 +621,59 @@ class EEGBIDSDataset:
         return attribute_value
 
     def channel_labels(self, data_filepath: str) -> list[str]:
-        """Get a list of channel labels
+        """Get a list of channel labels from channels.tsv.
+
+        Parameters
+        ----------
+        data_filepath : str
+            The path to the data file.
+
+        Returns
+        -------
+        list of str
+            A list of channel names.
+
+        """
         channels_tsv = pd.read_csv(
             self.get_bids_metadata_files(data_filepath, "channels.tsv")[0], sep="\t"
         )
         return channels_tsv["name"].tolist()
 
     def channel_types(self, data_filepath: str) -> list[str]:
-        """Get a list of channel types
+        """Get a list of channel types from channels.tsv.
+
+        Parameters
+        ----------
+        data_filepath : str
+            The path to the data file.
+
+        Returns
+        -------
+        list of str
+            A list of channel types.
+
+        """
         channels_tsv = pd.read_csv(
             self.get_bids_metadata_files(data_filepath, "channels.tsv")[0], sep="\t"
         )
         return channels_tsv["type"].tolist()
 
     def num_times(self, data_filepath: str) -> int:
-        """Get the
+        """Get the number of time points in the recording.
+
+        Calculated from ``SamplingFrequency`` and ``RecordingDuration`` in eeg.json.
+
+        Parameters
+        ----------
+        data_filepath : str
+            The path to the data file.
+
+        Returns
+        -------
+        int
+            The approximate number of time points.
+
+        """
         eeg_jsons = self.get_bids_metadata_files(data_filepath, "eeg.json")
         eeg_json_dict = self._merge_json_inheritance(eeg_jsons)
         return int(
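Note: `num_times` derives the sample count arithmetically from the merged eeg.json rather than opening the recording. The computation it wraps (sidecar values hypothetical):

    eeg_json = {"SamplingFrequency": 256.0, "RecordingDuration": 120.5}

    n_times = int(eeg_json["SamplingFrequency"] * eeg_json["RecordingDuration"])
    assert n_times == 30848  # approximate: duration is often rounded in the sidecar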
@@ -820,35 +681,71 @@ class EEGBIDSDataset:
         )
 
     def subject_participant_tsv(self, data_filepath: str) -> dict[str, Any]:
-        """Get
-
+        """Get the participants.tsv record for a subject.
+
+        Parameters
+        ----------
+        data_filepath : str
+            The path to a data file belonging to the subject.
+
+        Returns
+        -------
+        dict
+            A dictionary of the subject's information from participants.tsv.
+
         """
-        participants_tsv = pd.read_csv(
-            self.get_bids_metadata_files(data_filepath, "participants.tsv")[0], sep="\t"
-        )
-
+        participants_tsv_path = self.get_bids_metadata_files(
+            data_filepath, "participants.tsv"
+        )[0]
+        participants_tsv = pd.read_csv(participants_tsv_path, sep="\t")
         if participants_tsv.empty:
             return {}
-        # set 'participant_id' as index
         participants_tsv.set_index("participant_id", inplace=True)
         subject = f"sub-{self.get_bids_file_attribute('subject', data_filepath)}"
         return participants_tsv.loc[subject].to_dict()
 
     def eeg_json(self, data_filepath: str) -> dict[str, Any]:
-        """Get
+        """Get the merged eeg.json metadata for a data file.
+
+        Parameters
+        ----------
+        data_filepath : str
+            The path to the data file.
+
+        Returns
+        -------
+        dict
+            The merged eeg.json metadata.
+
+        """
         eeg_jsons = self.get_bids_metadata_files(data_filepath, "eeg.json")
-        eeg_json_dict = self._merge_json_inheritance(eeg_jsons)
-        return eeg_json_dict
+        return self._merge_json_inheritance(eeg_jsons)
 
     def channel_tsv(self, data_filepath: str) -> dict[str, Any]:
-        """Get
-
+        """Get the channels.tsv metadata as a dictionary.
+
+        Parameters
+        ----------
+        data_filepath : str
+            The path to the data file.
+
+        Returns
+        -------
+        dict
+            The channels.tsv data, with columns as keys.
+
         """
-        [... removed lines lost in the diff rendering ...]
+        channels_tsv_path = self.get_bids_metadata_files(data_filepath, "channels.tsv")[
+            0
+        ]
+        channels_tsv = pd.read_csv(channels_tsv_path, sep="\t")
+        channel_tsv_dict = channels_tsv.to_dict()
         for list_field in ["name", "type", "units"]:
-            [... removed lines lost in the diff rendering ...]
+            if list_field in channel_tsv_dict:
+                channel_tsv_dict[list_field] = list(
+                    channel_tsv_dict[list_field].values()
+                )
+        return channel_tsv_dict
+
+
+__all__ = ["EEGDashBaseDataset", "EEGBIDSDataset", "EEGDashBaseRaw"]
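Note: `channel_tsv` above converts the pandas `to_dict` output (column -> {row index: value}) back into plain lists for the name/type/units columns. The transformation in miniature:

    import pandas as pd

    channels = pd.DataFrame(
        {"name": ["Cz", "Pz"], "type": ["EEG", "EEG"], "units": ["uV", "uV"]}
    )
    d = channels.to_dict()  # {'name': {0: 'Cz', 1: 'Pz'}, ...}
    for col in ("name", "type", "units"):
        if col in d:
            d[col] = list(d[col].values())
    assert d["name"] == ["Cz", "Pz"]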