eegdash 0.0.9__py3-none-any.whl → 0.2.0__py3-none-any.whl

eegdash/data_utils.py CHANGED
@@ -1,90 +1,122 @@
+import json
+import logging
 import os
-import sys
-from joblib import Parallel, delayed
+import re
+from pathlib import Path
+from typing import Any
+
 import mne
+import mne_bids
 import numpy as np
 import pandas as pd
-from pathlib import Path
-import re
-import json
-from mne.io import BaseRaw
-from mne._fiff.utils import _find_channels, _read_segments_file
 import s3fs
-import tempfile
+from bids import BIDSLayout
+from joblib import Parallel, delayed
 from mne._fiff.utils import _read_segments_file
-from braindecode.datasets import BaseDataset
-import mne_bids
+from mne.io import BaseRaw
 from mne_bids import (
     BIDSPath,
 )
-from bids import BIDSLayout
 
-class EEGDashBaseDataset(BaseDataset):
-    """Returns samples from an mne.io.Raw object along with a target.
+from braindecode.datasets import BaseDataset
 
-    Dataset which serves samples from an mne.io.Raw object along with a target.
-    The target is unique for the dataset, and is obtained through the
-    `description` attribute.
+logger = logging.getLogger("eegdash")
 
-    Parameters
-    ----------
-    raw : mne.io.Raw
-        Continuous data.
-    description : dict | pandas.Series | None
-        Holds additional description about the continuous signal / subject.
-    target_name : str | tuple | None
-        Name(s) of the index in `description` that should be used to provide the
-        target (e.g., to be used in a prediction task later on).
-    transform : callable | None
-        On-the-fly transform applied to the example before it is returned.
+
+class EEGDashBaseDataset(BaseDataset):
+    """A single EEG recording hosted on AWS S3 and cached locally upon first access.
+
+    This is a subclass of braindecode's BaseDataset and can consequently be used in
+    conjunction with the preprocessing and training pipelines of braindecode.
     """
-    AWS_BUCKET = 's3://openneuro.org'
-    def __init__(self, record, cache_dir, **kwargs):
+
+    AWS_BUCKET = "s3://openneuro.org"
+
+    def __init__(
+        self,
+        record: dict[str, Any],
+        cache_dir: str,
+        s3_bucket: str | None = None,
+        **kwargs,
+    ):
+        """Create a new EEGDashBaseDataset instance. Users do not usually need to call
+        this directly -- instead use the EEGDashDataset class to load a collection of
+        these recordings from a local BIDS folder or via a database query.
+
+        Parameters
+        ----------
+        record : dict
+            A fully resolved metadata record for the data to load.
+        cache_dir : str
+            A local directory where the data will be cached.
+        s3_bucket : str | None
+            Optional S3 bucket to fetch the data from; defaults to AWS_BUCKET.
+        kwargs : dict
+            Additional keyword arguments to pass to the BaseDataset constructor.
+
+        """
         super().__init__(None, **kwargs)
         self.record = record
         self.cache_dir = Path(cache_dir)
         bids_kwargs = self.get_raw_bids_args()
-        self.bidspath = BIDSPath(root=self.cache_dir / record['dataset'], datatype='eeg', suffix='eeg', **bids_kwargs)
-        self.s3file = self.get_s3path(record['bidspath'])
-        self.filecache = self.cache_dir / record['bidspath']
-        self.bids_dependencies = record['bidsdependencies']
+
+        self.bidspath = BIDSPath(
+            root=self.cache_dir / record["dataset"],
+            datatype="eeg",
+            suffix="eeg",
+            **bids_kwargs,
+        )
+        self.s3_bucket = s3_bucket if s3_bucket else self.AWS_BUCKET
+        self.s3file = self.get_s3path(record["bidspath"])
+        self.filecache = self.cache_dir / record["bidspath"]
+        self.bids_dependencies = record["bidsdependencies"]
         self._raw = None
-        # if os.path.exists(self.filecache):
-        #     self.raw = mne_bids.read_raw_bids(self.bidspath, verbose=False)
 
-    def get_s3path(self, filepath):
-        return f"{self.AWS_BUCKET}/{filepath}"
+    def get_s3path(self, filepath: str) -> str:
+        """Helper to form an AWS S3 URI for the given relative filepath."""
+        return f"{self.s3_bucket}/{filepath}"
 
-    def _download_s3(self):
+    def _download_s3(self) -> None:
+        """Fetch the given data from its S3 location and cache it locally."""
         self.filecache.parent.mkdir(parents=True, exist_ok=True)
-        filesystem = s3fs.S3FileSystem(anon=True, client_kwargs={'region_name': 'us-east-2'})
+        filesystem = s3fs.S3FileSystem(
+            anon=True, client_kwargs={"region_name": "us-east-2"}
+        )
         filesystem.download(self.s3file, self.filecache)
         self.filenames = [self.filecache]
 
-    def _download_dependencies(self):
-        filesystem = s3fs.S3FileSystem(anon=True, client_kwargs={'region_name': 'us-east-2'})
+    def _download_dependencies(self) -> None:
+        """Download all BIDS dependency files (metadata files, recording sidecar files)
+        from S3 and cache them locally.
+        """
+        filesystem = s3fs.S3FileSystem(
+            anon=True, client_kwargs={"region_name": "us-east-2"}
+        )
         for dep in self.bids_dependencies:
             s3path = self.get_s3path(dep)
             filepath = self.cache_dir / dep
             if not filepath.exists():
                 filepath.parent.mkdir(parents=True, exist_ok=True)
-                filesystem.download(s3path, filepath)
+                filesystem.download(s3path, filepath)
 
-    def get_raw_bids_args(self):
-        desired_fields = ['subject', 'session', 'task', 'run']
+    def get_raw_bids_args(self) -> dict[str, Any]:
+        """Helper to restrict the metadata record to the fields needed to locate a BIDS
+        recording.
+        """
+        desired_fields = ["subject", "session", "task", "run"]
         return {k: self.record[k] for k in desired_fields if self.record[k]}
 
-    def check_and_get_raw(self):
-        if not os.path.exists(self.filecache): # not preload
+    def check_and_get_raw(self) -> None:
+        """Download the S3 file and BIDS dependencies if not already cached."""
+        if not os.path.exists(self.filecache):  # not preload
             if self.bids_dependencies:
                 self._download_dependencies()
             self._download_s3()
         if self._raw is None:
             self._raw = mne_bids.read_raw_bids(self.bidspath, verbose=False)
 
-    def __getitem__(self, index):
-        # self.check_and_get_raw()
+    # === BaseDataset and PyTorch Dataset interface ===
 
+    def __getitem__(self, index):
+        """Main function to access a sample from the dataset."""
         X = self.raw[:, index][0]
         y = None
         if self.target_name is not None:
@@ -94,15 +126,21 @@ class EEGDashBaseDataset(BaseDataset):
         if self.transform is not None:
             X = self.transform(X)
         return X, y
-
-    def __len__(self):
+
+    def __len__(self) -> int:
+        """Return the number of samples in the dataset."""
         if self._raw is None:
-            return int(self.record['ntimes'] * self.record['sampling_frequency'])
+            # FIXME: this is a bit strange and should definitely not change as a side effect
+            # of accessing the data (which it will, since ntimes is the actual length but rounded down)
+            return int(self.record["ntimes"] * self.record["sampling_frequency"])
         else:
             return len(self._raw)
 
     @property
     def raw(self):
+        """Return the MNE Raw object for this recording. This will perform the actual
+        retrieval if it has not been done yet.
+        """
         if self._raw is None:
             self.check_and_get_raw()
         return self._raw
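For context, here is a minimal usage sketch of the class above. The record values are hypothetical (such records are normally produced for you by EEGDashDataset); the keys mirror those read by `__init__` and `__len__`:

```python
# Hypothetical metadata record; keys mirror those read in __init__/__len__.
record = {
    "dataset": "ds002718",
    "bidspath": "ds002718/sub-012/eeg/sub-012_task-FaceRecognition_eeg.set",
    "bidsdependencies": [],        # sidecar files fetched before the recording
    "subject": "012",
    "session": None,
    "task": "FaceRecognition",
    "run": None,
    "ntimes": 288.0,               # duration in seconds (see the FIXME in __len__)
    "sampling_frequency": 250.0,
}

ds = EEGDashBaseDataset(record, cache_dir="./.eegdash_cache")
print(len(ds))  # metadata-based estimate; nothing downloaded yet
X, y = ds[0]    # first access downloads from S3 and reads via mne_bids
```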
@@ -111,59 +149,55 @@ class EEGDashBaseDataset(BaseDataset):
     def raw(self, raw):
         self._raw = raw
 
+
 class EEGDashBaseRaw(BaseRaw):
-    r"""MNE Raw object from EEG-Dash connection with Openneuro S3 file.
+    """Wrapper around the MNE BaseRaw class that automatically fetches the data from
+    S3 (when _read_segment is called) and caches it locally. Currently for internal
+    use.
 
     Parameters
     ----------
     input_fname : path-like
         Path to the S3 file
-    eog : list | tuple | 'auto'
-        Names or indices of channels that should be designated EOG channels.
-        If 'auto', the channel names containing ``EOG`` or ``EYE`` are used.
-        Defaults to empty tuple.
-    %(preload)s
-        Note that preload=False will be effective only if the data is stored
-        in a separate binary file.
-    %(uint16_codec)s
-    %(montage_units)s
-    %(verbose)s
+    metadata : dict
+        The metadata record for the recording (e.g., from the database).
+    preload : bool
+        Whether to preload the data before the first access.
+    cache_dir : str
+        Local path under which the data will be cached.
+    bids_dependencies : list
+        List of additional BIDS metadata files that should be downloaded and cached
+        alongside the main recording file.
+    verbose : str | int | None
+        Optionally the verbosity level for MNE logging (see the MNE documentation
+        for possible values).
 
     See Also
     --------
     mne.io.Raw : Documentation of attributes and methods.
 
-    Notes
-    -----
-    .. versionadded:: 0.11.0
     """
 
-    AWS_BUCKET = 's3://openneuro.org'
+    AWS_BUCKET = "s3://openneuro.org"
+
     def __init__(
         self,
-        input_fname,
-        metadata,
-        eog=(),
-        preload=False,
+        input_fname: str,
+        metadata: dict[str, Any],
+        preload: bool = False,
         *,
-        cache_dir='./.eegdash_cache',
-        bids_dependencies:list = [],
-        uint16_codec=None,
-        montage_units="auto",
-        verbose=None,
+        cache_dir: str = "./.eegdash_cache",
+        bids_dependencies: list[str] = [],
+        verbose: Any = None,
     ):
-        '''
-        Get to work with S3 endpoint first, no caching
-        '''
+        """Get to work with the S3 endpoint first, no caching."""
         # Create a simple RawArray
-        sfreq = metadata['sfreq'] # Sampling frequency
-        n_times = metadata['n_times']
-        ch_names = metadata['ch_names']
+        sfreq = metadata["sfreq"]  # Sampling frequency
+        n_times = metadata["n_times"]
+        ch_names = metadata["ch_names"]
         ch_types = []
-        for ch in metadata['ch_types']:
+        for ch in metadata["ch_types"]:
             chtype = ch.lower()
-            if chtype == 'heog' or chtype == 'veog':
-                chtype = 'eog'
+            if chtype == "heog" or chtype == "veog":
+                chtype = "eog"
             ch_types.append(chtype)
         info = mne.create_info(ch_names=ch_names, sfreq=sfreq, ch_types=ch_types)
         self.s3file = self.get_s3path(input_fname)
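The constructor above builds its mne.Info purely from the supplied metadata record. A sketch of the expected shape (all values hypothetical):

```python
# Hypothetical metadata record; keys match what __init__ reads above.
metadata = {
    "sfreq": 250.0,        # sampling frequency in Hz
    "n_times": 72000,      # number of samples in the recording
    "ch_names": ["Cz", "Pz", "HEOG"],
    "ch_types": ["EEG", "EEG", "HEOG"],  # 'heog'/'veog' are normalized to 'eog'
}

raw = EEGDashBaseRaw(
    "ds002718/sub-012/eeg/sub-012_task-FaceRecognition_eeg.set",
    metadata,
    preload=False,  # data is only fetched from S3 once _read_segment runs
)
```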
@@ -178,7 +212,7 @@ class EEGDashBaseRaw(BaseRaw):
         super().__init__(
             info,
             preload,
-            last_samps=[n_times-1],
+            last_samps=[n_times - 1],
             orig_format="single",
             verbose=verbose,
         )
@@ -188,12 +222,16 @@ class EEGDashBaseRaw(BaseRaw):
 
     def _download_s3(self):
         self.filecache.parent.mkdir(parents=True, exist_ok=True)
-        filesystem = s3fs.S3FileSystem(anon=True, client_kwargs={'region_name': 'us-east-2'})
+        filesystem = s3fs.S3FileSystem(
+            anon=True, client_kwargs={"region_name": "us-east-2"}
+        )
         filesystem.download(self.s3file, self.filecache)
         self.filenames = [self.filecache]
 
     def _download_dependencies(self):
-        filesystem = s3fs.S3FileSystem(anon=True, client_kwargs={'region_name': 'us-east-2'})
+        filesystem = s3fs.S3FileSystem(
+            anon=True, client_kwargs={"region_name": "us-east-2"}
+        )
         for dep in self.bids_dependencies:
             s3path = self.get_s3path(dep)
             filepath = self.cache_dir / dep
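Both download helpers rely on the same anonymous s3fs pattern. The equivalent standalone calls, with hypothetical paths, would be:

```python
import os

import s3fs

# Anonymous access works because the OpenNeuro bucket is public (us-east-2).
filesystem = s3fs.S3FileSystem(anon=True, client_kwargs={"region_name": "us-east-2"})

os.makedirs("./.eegdash_cache/ds002718", exist_ok=True)  # mirrors the mkdir above
filesystem.download(
    "s3://openneuro.org/ds002718/dataset_description.json",
    "./.eegdash_cache/ds002718/dataset_description.json",
)
```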
@@ -204,34 +242,56 @@ class EEGDashBaseRaw(BaseRaw):
     def _read_segment(
         self, start=0, stop=None, sel=None, data_buffer=None, *, verbose=None
     ):
-        if not os.path.exists(self.filecache): # not preload
+        if not os.path.exists(self.filecache):  # not preload
             if self.bids_dependencies:
                 self._download_dependencies()
             self._download_s3()
-        else: # not preload and file is not cached
+        else:  # not preload and file is already cached
             self.filenames = [self.filecache]
         return super()._read_segment(start, stop, sel, data_buffer, verbose=verbose)
-
+
     def _read_segment_file(self, data, idx, fi, start, stop, cals, mult):
         """Read a chunk of data from the file."""
         _read_segments_file(self, data, idx, fi, start, stop, cals, mult, dtype="<f4")
 
 
-class EEGBIDSDataset():
-    ALLOWED_FILE_FORMAT = ['eeglab', 'brainvision', 'biosemi', 'european']
+class EEGBIDSDataset:
+    """A one-stop shop interface to a local BIDS dataset containing EEG recordings.
+
+    This is mainly tailored to the needs of the EEGDash application and is used to
+    centralize interactions with the BIDS dataset, such as parsing the metadata.
+
+    Parameters
+    ----------
+    data_dir : str | Path
+        The path to the local BIDS dataset directory.
+    dataset : str
+        A name for the dataset.
+
+    """
+
+    ALLOWED_FILE_FORMAT = ["eeglab", "brainvision", "biosemi", "european"]
     RAW_EXTENSIONS = {
-        '.set': ['.set', '.fdt'], # eeglab
-        '.edf': ['.edf'], # european
-        '.vhdr': ['.eeg', '.vhdr', '.vmrk', '.dat', '.raw'], # brainvision
-        '.bdf': ['.bdf'], # biosemi
-    }
-    METADATA_FILE_EXTENSIONS = ['eeg.json', 'channels.tsv', 'electrodes.tsv', 'events.tsv', 'events.json']
-    def __init__(self,
-                 data_dir=None, # location of bids dataset
-                 dataset='', # dataset name
-                 ):
+        ".set": [".set", ".fdt"],  # eeglab
+        ".edf": [".edf"],  # european
+        ".vhdr": [".eeg", ".vhdr", ".vmrk", ".dat", ".raw"],  # brainvision
+        ".bdf": [".bdf"],  # biosemi
+    }
+    METADATA_FILE_EXTENSIONS = [
+        "eeg.json",
+        "channels.tsv",
+        "electrodes.tsv",
+        "events.tsv",
+        "events.json",
+    ]
+
+    def __init__(
+        self,
+        data_dir=None,  # location of bids dataset
+        dataset="",  # dataset name
+    ):
         if data_dir is None or not os.path.exists(data_dir):
-            raise ValueError('data_dir must be specified and must exist')
+            raise ValueError("data_dir must be specified and must exist")
         self.bidsdir = Path(data_dir)
         self.dataset = dataset
         assert str(self.bidsdir).endswith(self.dataset)
@@ -239,73 +299,87 @@ class EEGBIDSDataset():
 
         # get all recording files in the bids directory
         self.files = self.get_recordings(self.layout)
-        assert len(self.files) > 0, ValueError('Unable to construct EEG dataset. No EEG recordings found.')
-        assert self.check_eeg_dataset(), ValueError('Dataset is not an EEG dataset.')
-        # temp_dir = (Path().resolve() / 'data')
-        # if not os.path.exists(temp_dir):
-        #     os.mkdir(temp_dir)
-        # if not os.path.exists(temp_dir / f'{dataset}_files.npy'):
-        #     self.files = self.get_files_with_extension_parallel(self.bidsdir, extension=self.RAW_EXTENSION[self.raw_format])
-        #     np.save(temp_dir / f'{dataset}_files.npy', self.files)
-        # else:
-        #     self.files = np.load(temp_dir / f'{dataset}_files.npy', allow_pickle=True)
-
-    def check_eeg_dataset(self):
-        return self.get_bids_file_attribute('modality', self.files[0]).lower() == 'eeg'
-
-    def get_recordings(self, layout:BIDSLayout):
+        assert len(self.files) > 0, ValueError(
+            "Unable to construct EEG dataset. No EEG recordings found."
+        )
+        assert self.check_eeg_dataset(), ValueError("Dataset is not an EEG dataset.")
+
+    def check_eeg_dataset(self) -> bool:
+        """Check if the dataset is an EEG dataset."""
+        return self.get_bids_file_attribute("modality", self.files[0]).lower() == "eeg"
+
+    def get_recordings(self, layout: BIDSLayout) -> list[str]:
+        """Get a list of all EEG recording files in the BIDS layout."""
         files = []
         for ext, exts in self.RAW_EXTENSIONS.items():
-            files = layout.get(extension=ext, return_type='filename')
+            files = layout.get(extension=ext, return_type="filename")
             if files:
                 break
         return files
 
-    def get_relative_bidspath(self, filename):
-        bids_parent_dir = self.bidsdir.parent
+    def get_relative_bidspath(self, filename: str) -> str:
+        """Make the given file path relative to the parent of the BIDS directory."""
+        bids_parent_dir = self.bidsdir.parent.absolute()
         return str(Path(filename).relative_to(bids_parent_dir))
 
-    def get_property_from_filename(self, property, filename):
+    def get_property_from_filename(self, property: str, filename: str) -> str:
+        """Parse a property out of a BIDS-compliant filename. Returns an empty string
+        if not found.
+        """
         import platform
+
         if platform.system() == "Windows":
-            lookup = re.search(rf'{property}-(.*?)[_\\]', filename)
+            lookup = re.search(rf"{property}-(.*?)[_\\]", filename)
         else:
-            lookup = re.search(rf'{property}-(.*?)[_\/]', filename)
-        return lookup.group(1) if lookup else ''
-
-    def merge_json_inheritance(self, json_files):
-        '''
-        Merge list of json files found by get_bids_file_inheritance,
-        expecting the order (from left to right) is from lowest level to highest level,
-        and return a merged dictionary
-        '''
+            lookup = re.search(rf"{property}-(.*?)[_\/]", filename)
+        return lookup.group(1) if lookup else ""
+
+    def merge_json_inheritance(self, json_files: list[str | Path]) -> dict:
+        """Internal helper to merge the list of JSON files found by
+        get_bids_file_inheritance, expecting the order (from left to right) to run
+        from lowest level to highest level, and return a merged dictionary.
+        """
         json_files.reverse()
         json_dict = {}
         for f in json_files:
-            json_dict.update(json.load(open(f)))
+            json_dict.update(json.load(open(f)))  # FIXME: should close file
         return json_dict
 
-    def get_bids_file_inheritance(self, path, basename, extension):
-        '''
-        Get all files with given extension that applies to the basename file
-        following the BIDS inheritance principle in the order of lowest level first
-        @param
-            basename: bids file basename without _eeg.set extension for example
-            extension: e.g. channels.tsv
-        '''
-        top_level_files = ['README', 'dataset_description.json', 'participants.tsv']
+    def get_bids_file_inheritance(
+        self, path: str | Path, basename: str, extension: str
+    ) -> list[Path]:
+        """Get all file paths that apply to the basename file in the specified directory
+        and that end with the specified suffix, recursively searching parent directories
+        (following the BIDS inheritance principle in the order of lowest level first).
+
+        Parameters
+        ----------
+        path : str | Path
+            The directory path to search for files.
+        basename : str
+            The BIDS file basename, e.g. without the _eeg.set extension.
+        extension : str
+            Only consider files that end with the specified suffix, e.g. channels.tsv.
+
+        Returns
+        -------
+        list[Path]
+            A list of file paths that match the given basename and extension.
+
+        """
+        top_level_files = ["README", "dataset_description.json", "participants.tsv"]
         bids_files = []
 
         # check if path is str object
         if isinstance(path, str):
             path = Path(path)
         if not path.exists():
-            raise ValueError('path {path} does not exist')
+            raise ValueError(f"path {path} does not exist")
 
         # check if file is in current path
         for file in os.listdir(path):
             # target_file = path / f"{cur_file_basename}_{extension}"
-            if os.path.isfile(path/file):
+            if os.path.isfile(path / file):
                 # check if file has the desired extension
                 # check if file basename has extension
                 if file.endswith(extension):
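To make the inheritance walk concrete, here is a sketch against a hypothetical layout:

```python
# Hypothetical tree; get_bids_metadata_files climbs from the recording's
# directory to the dataset root, collecting every file ending in the suffix:
#
#   ds002718/
#       eeg.json                                   <- dataset level (collected last)
#       sub-012/eeg/
#           sub-012_task-FaceRecognition_eeg.json  <- recording level (collected first)
#           sub-012_task-FaceRecognition_eeg.set

bids = EEGBIDSDataset(data_dir="ds002718", dataset="ds002718")
jsons = bids.get_bids_metadata_files(
    "ds002718/sub-012/eeg/sub-012_task-FaceRecognition_eeg.set", "eeg.json"
)
# The list is lowest level first; merge_json_inheritance reverses it before
# merging, so recording-level keys override dataset-level ones.
merged = bids.merge_json_inheritance(jsons)
```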
@@ -317,38 +391,54 @@ class EEGBIDSDataset():
                 return bids_files
         else:
             # call get_bids_file_inheritance recursively with parent directory
-            bids_files.extend(self.get_bids_file_inheritance(path.parent, basename, extension))
+            bids_files.extend(
+                self.get_bids_file_inheritance(path.parent, basename, extension)
+            )
             return bids_files
 
-    def get_bids_metadata_files(self, filepath, metadata_file_extension):
-        """
-        (Wrapper for self.get_bids_file_inheritance)
-        Get all BIDS metadata files that are associated with the given filepath, following the BIDS inheritance principle.
-
-        Args:
-            filepath (str or Path): The filepath to get the associated metadata files for.
-            metadata_files_extensions (list): A list of file extensions to search for metadata files.
-
-        Returns:
-            list: A list of filepaths for all the associated metadata files
+    def get_bids_metadata_files(
+        self, filepath: str | Path, metadata_file_extension: list[str]
+    ) -> list[Path]:
+        """Retrieve all metadata file paths that apply to a given data file path and
+        that end with a specific suffix (following the BIDS inheritance principle).
+
+        Parameters
+        ----------
+        filepath : str | Path
+            The filepath to get the associated metadata files for.
+        metadata_file_extension : str
+            Consider only metadata files that end with the specified suffix,
+            e.g., channels.tsv or eeg.json.
+
+        Returns
+        -------
+        list[Path]
+            A list of filepaths for all matching metadata files.
+
         """
         if isinstance(filepath, str):
             filepath = Path(filepath)
         if not filepath.exists():
-            raise ValueError('filepath {filepath} does not exist')
+            raise ValueError(f"filepath {filepath} does not exist")
         path, filename = os.path.split(filepath)
-        basename = filename[:filename.rfind('_')]
+        basename = filename[: filename.rfind("_")]
         # metadata files
-        meta_files = self.get_bids_file_inheritance(path, basename, metadata_file_extension)
+        meta_files = self.get_bids_file_inheritance(
+            path, basename, metadata_file_extension
+        )
         return meta_files
-
-    def scan_directory(self, directory, extension):
+
+    def scan_directory(self, directory: str, extension: str) -> list[Path]:
+        """Return a list of file paths that end with the given extension in the
+        specified directory. Ignores certain special directories such as .git,
+        .datalad, derivatives, and code.
+        """
         result_files = []
-        directory_to_ignore = ['.git', '.datalad', 'derivatives', 'code']
+        directory_to_ignore = [".git", ".datalad", "derivatives", "code"]
         with os.scandir(directory) as entries:
             for entry in entries:
                 if entry.is_file() and entry.name.endswith(extension):
-                    print('Adding ', entry.path)
+                    print("Adding ", entry.path)
                     result_files.append(entry.path)
                 elif entry.is_dir():
                     # check that entry path doesn't contain any name in ignore list
@@ -356,18 +446,41 @@ class EEGBIDSDataset():
                     result_files.append(entry.path)  # Add directory to scan later
         return result_files
 
-    def get_files_with_extension_parallel(self, directory, extension='.set', max_workers=-1):
+    def get_files_with_extension_parallel(
+        self, directory: str, extension: str = ".set", max_workers: int = -1
+    ) -> list[Path]:
+        """Efficiently scan a directory and its subdirectories for files that end with
+        the given extension.
+
+        Parameters
+        ----------
+        directory : str
+            The root directory to scan for files.
+        extension : str
+            Only consider files that end with this suffix, e.g. '.set'.
+        max_workers : int
+            Optionally specify the maximum number of worker threads to use for
+            parallel scanning. Defaults to all available CPU cores if set to -1.
+
+        Returns
+        -------
+        list[Path]
+            A list of filepaths for all matching files.
+
+        """
         result_files = []
         dirs_to_scan = [directory]
 
         # Use joblib.Parallel and delayed to parallelize directory scanning
         while dirs_to_scan:
-            print(f"Scanning {len(dirs_to_scan)} directories...", dirs_to_scan)
+            logger.info(
+                f"Scanning {len(dirs_to_scan)} directories: {dirs_to_scan}"
+            )
             # Run the scan_directory function in parallel across directories
             results = Parallel(n_jobs=max_workers, prefer="threads", verbose=1)(
                 delayed(self.scan_directory)(d, extension) for d in dirs_to_scan
             )
-
+
             # Reset the directories to scan and process the results
             dirs_to_scan = []
             for res in results:
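The loop above implements a breadth-first walk in which each level of directories is scanned in parallel threads. The same pattern in standalone form (the function names here are mine, not the package's):

```python
import os

from joblib import Parallel, delayed

def scan_one(directory: str, extension: str) -> list[str]:
    # One level of the walk: matching files plus subdirectories to visit next.
    found = []
    with os.scandir(directory) as entries:
        for entry in entries:
            if entry.is_file() and entry.name.endswith(extension):
                found.append(entry.path)
            elif entry.is_dir():
                found.append(entry.path)
    return found

def scan_tree(root: str, extension: str = ".set", n_jobs: int = -1) -> list[str]:
    files, frontier = [], [root]
    while frontier:  # one Parallel call per depth level of the tree
        results = Parallel(n_jobs=n_jobs, prefer="threads")(
            delayed(scan_one)(d, extension) for d in frontier
        )
        frontier = []
        for res in results:
            for path in res:
                (frontier if os.path.isdir(path) else files).append(path)
    return files
```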
@@ -376,14 +489,20 @@ class EEGBIDSDataset():
                     dirs_to_scan.append(path)  # Queue up subdirectories to scan
                 else:
                     result_files.append(path)  # Add files to the final result
-            print(f"Current number of files: {len(result_files)}")
+            logger.info(f"Found {len(result_files)} files.")
 
         return result_files
 
-    def load_and_preprocess_raw(self, raw_file, preprocess=False):
-        print(f"Loading {raw_file}")
-        EEG = mne.io.read_raw_eeglab(raw_file, preload=True, verbose='error')
-
+    def load_and_preprocess_raw(
+        self, raw_file: str, preprocess: bool = False
+    ) -> np.ndarray:
+        """Utility function to load a raw data file with MNE, optionally apply some
+        simple (hardcoded) preprocessing, and return the result as a numpy array.
+        Not meant for purposes other than testing or debugging.
+        """
+        logger.info(f"Loading raw data from {raw_file}")
+        EEG = mne.io.read_raw_eeglab(raw_file, preload=True, verbose="error")
+
         if preprocess:
             # highpass filter
             EEG = EEG.filter(l_freq=0.25, h_freq=25, verbose=False)
@@ -391,33 +510,35 @@ class EEGBIDSDataset():
             EEG = EEG.notch_filter(freqs=(60), verbose=False)
             # bring to common sampling rate
             sfreq = 128
-            if EEG.info['sfreq'] != sfreq:
+            if EEG.info["sfreq"] != sfreq:
                 EEG = EEG.resample(sfreq)
-        # # normalize data to zero mean and unit variance
-        # scalar = preprocessing.StandardScaler()
-        # mat_data = scalar.fit_transform(mat_data.T).T # scalar normalize for each feature and expects shape data x features
 
         mat_data = EEG.get_data()
 
         if len(mat_data.shape) > 2:
-            raise ValueError('Expect raw data to be CxT dimension')
+            raise ValueError("Expect raw data to be CxT dimension")
         return mat_data
-
-    def get_files(self):
+
+    def get_files(self) -> list[Path]:
+        """Get all EEG recording file paths (with valid extensions) in the BIDS folder."""
         return self.files
-
-    def resolve_bids_json(self, json_files: list):
-        """
-        Resolve the BIDS JSON files and return a dictionary of the resolved values.
-        Args:
-            json_files (list): A list of JSON files to resolve in order of leaf level first
 
-        Returns:
+    def resolve_bids_json(self, json_files: list[str]) -> dict:
+        """Resolve the BIDS JSON files and return a dictionary of the resolved values.
+
+        Parameters
+        ----------
+        json_files : list
+            A list of JSON file paths to resolve in order of leaf level first.
+
+        Returns
+        -------
             dict: A dictionary of the resolved values.
+
         """
         if len(json_files) == 0:
-            raise ValueError('No JSON files provided')
-        json_files.reverse() # TODO undeterministic
+            raise ValueError("No JSON files provided")
+        json_files.reverse()  # TODO: non-deterministic
 
         json_dict = {}
         for json_file in json_files:
@@ -425,56 +546,78 @@ class EEGBIDSDataset():
                 json_dict.update(json.load(f))
         return json_dict
 
-    def get_bids_file_attribute(self, attribute, data_filepath):
+    def get_bids_file_attribute(self, attribute: str, data_filepath: str) -> Any:
+        """Retrieve a specific attribute from the BIDS file metadata applicable
+        to the provided recording file path.
+        """
         entities = self.layout.parse_file_entities(data_filepath)
         bidsfile = self.layout.get(**entities)[0]
-        attributes = bidsfile.get_entities(metadata='all')
+        attributes = bidsfile.get_entities(metadata="all")
         attribute_mapping = {
-            'sfreq': 'SamplingFrequency',
-            'modality': 'datatype',
-            'task': 'task',
-            'session': 'session',
-            'run': 'run',
-            'subject': 'subject',
-            'ntimes': 'RecordingDuration',
-            'nchans': 'EEGChannelCount'
+            "sfreq": "SamplingFrequency",
+            "modality": "datatype",
+            "task": "task",
+            "session": "session",
+            "run": "run",
+            "subject": "subject",
+            "ntimes": "RecordingDuration",
+            "nchans": "EEGChannelCount",
         }
         attribute_value = attributes.get(attribute_mapping.get(attribute), None)
         return attribute_value
 
-    def channel_labels(self, data_filepath):
-        channels_tsv = pd.read_csv(self.get_bids_metadata_files(data_filepath, 'channels.tsv')[0], sep='\t')
-        return channels_tsv['name'].tolist()
-
-    def channel_types(self, data_filepath):
-        channels_tsv = pd.read_csv(self.get_bids_metadata_files(data_filepath, 'channels.tsv')[0], sep='\t')
-        return channels_tsv['type'].tolist()
-
-    def num_times(self, data_filepath):
-        eeg_jsons = self.get_bids_metadata_files(data_filepath, 'eeg.json')
+    def channel_labels(self, data_filepath: str) -> list[str]:
+        """Get a list of channel labels for the given data file path."""
+        channels_tsv = pd.read_csv(
+            self.get_bids_metadata_files(data_filepath, "channels.tsv")[0], sep="\t"
+        )
+        return channels_tsv["name"].tolist()
+
+    def channel_types(self, data_filepath: str) -> list[str]:
+        """Get a list of channel types for the given data file path."""
+        channels_tsv = pd.read_csv(
+            self.get_bids_metadata_files(data_filepath, "channels.tsv")[0], sep="\t"
+        )
+        return channels_tsv["type"].tolist()
+
+    def num_times(self, data_filepath: str) -> int:
+        """Get the approximate number of time points in the EEG recording based on the BIDS metadata."""
+        eeg_jsons = self.get_bids_metadata_files(data_filepath, "eeg.json")
         eeg_json_dict = self.merge_json_inheritance(eeg_jsons)
-        return int(eeg_json_dict['SamplingFrequency'] * eeg_json_dict['RecordingDuration'])
-
-    def subject_participant_tsv(self, data_filepath):
-        '''Get participants_tsv info of a subject based on filepath'''
-        participants_tsv = pd.read_csv(self.get_bids_metadata_files(data_filepath, 'participants.tsv')[0], sep='\t')
+        return int(
+            eeg_json_dict["SamplingFrequency"] * eeg_json_dict["RecordingDuration"]
+        )
+
+    def subject_participant_tsv(self, data_filepath: str) -> dict[str, Any]:
+        """Get the BIDS participants.tsv record for the subject to which the given
+        file path corresponds, as a dictionary.
+        """
+        participants_tsv = pd.read_csv(
+            self.get_bids_metadata_files(data_filepath, "participants.tsv")[0], sep="\t"
+        )
         # if participants_tsv is not empty
         if participants_tsv.empty:
             return {}
         # set 'participant_id' as index
-        participants_tsv.set_index('participant_id', inplace=True)
+        participants_tsv.set_index("participant_id", inplace=True)
         subject = f"sub-{self.get_bids_file_attribute('subject', data_filepath)}"
         return participants_tsv.loc[subject].to_dict()
-
-    def eeg_json(self, data_filepath):
-        eeg_jsons = self.get_bids_metadata_files(data_filepath, 'eeg.json')
+
+    def eeg_json(self, data_filepath: str) -> dict[str, Any]:
+        """Get the BIDS eeg.json metadata for the given data file path."""
+        eeg_jsons = self.get_bids_metadata_files(data_filepath, "eeg.json")
         eeg_json_dict = self.merge_json_inheritance(eeg_jsons)
         return eeg_json_dict
-
-    def channel_tsv(self, data_filepath):
-        channels_tsv = pd.read_csv(self.get_bids_metadata_files(data_filepath, 'channels.tsv')[0], sep='\t')
+
+    def channel_tsv(self, data_filepath: str) -> dict[str, Any]:
+        """Get the BIDS channels.tsv metadata for the given data file path, as a
+        dictionary of lists and/or single values.
+        """
+        channels_tsv = pd.read_csv(
+            self.get_bids_metadata_files(data_filepath, "channels.tsv")[0], sep="\t"
+        )
         channel_tsv = channels_tsv.to_dict()
         # 'name' and 'type' now have a dictionary of index-value. Convert them to list
-        for list_field in ['name', 'type', 'units']:
+        for list_field in ["name", "type", "units"]:
             channel_tsv[list_field] = list(channel_tsv[list_field].values())
-        return channel_tsv
+        return channel_tsv
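Taken together, these accessors resolve all per-recording metadata from the BIDS sidecar files. A minimal sketch of how they compose (the dataset path is hypothetical):

```python
bids = EEGBIDSDataset(data_dir="ds002718", dataset="ds002718")
f = bids.get_files()[0]

bids.get_bids_file_attribute("sfreq", f)  # e.g. 250.0, from eeg.json
bids.channel_labels(f)                    # channel names from channels.tsv
bids.num_times(f)                         # SamplingFrequency * RecordingDuration
bids.subject_participant_tsv(f)           # the subject's participants.tsv row
```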