eegdash 0.3.7.dev177024734__py3-none-any.whl → 0.3.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eegdash/__init__.py +5 -5
- eegdash/api.py +528 -460
- eegdash/bids_eeg_metadata.py +254 -0
- eegdash/const.py +48 -0
- eegdash/data_utils.py +177 -45
- eegdash/dataset/__init__.py +4 -0
- eegdash/{dataset.py → dataset/dataset.py} +53 -10
- eegdash/dataset/dataset_summary.csv +256 -0
- eegdash/{registry.py → dataset/registry.py} +3 -3
- eegdash/hbn/__init__.py +17 -0
- eegdash/hbn/windows.py +305 -0
- eegdash/paths.py +28 -0
- eegdash/utils.py +1 -1
- {eegdash-0.3.7.dev177024734.dist-info → eegdash-0.3.8.dist-info}/METADATA +11 -5
- eegdash-0.3.8.dist-info/RECORD +35 -0
- eegdash/data_config.py +0 -34
- eegdash/dataset_summary.csv +0 -256
- eegdash-0.3.7.dev177024734.dist-info/RECORD +0 -31
- /eegdash/{preprocessing.py → hbn/preprocessing.py} +0 -0
- {eegdash-0.3.7.dev177024734.dist-info → eegdash-0.3.8.dist-info}/WHEEL +0 -0
- {eegdash-0.3.7.dev177024734.dist-info → eegdash-0.3.8.dist-info}/licenses/LICENSE +0 -0
- {eegdash-0.3.7.dev177024734.dist-info → eegdash-0.3.8.dist-info}/top_level.txt +0 -0
eegdash/data_utils.py
CHANGED
```diff
@@ -1,9 +1,11 @@
+import io
 import json
 import logging
 import os
 import re
 import traceback
 import warnings
+from contextlib import redirect_stderr
 from pathlib import Path
 from typing import Any
 
@@ -21,6 +23,8 @@ from mne_bids import BIDSPath
 
 from braindecode.datasets import BaseDataset
 
+from .paths import get_default_cache_dir
+
 logger = logging.getLogger("eegdash")
 
 
@@ -57,7 +61,7 @@ class EEGDashBaseDataset(BaseDataset):
         super().__init__(None, **kwargs)
         self.record = record
         self.cache_dir = Path(cache_dir)
-        self.bids_kwargs = self.
+        self.bids_kwargs = self._get_raw_bids_args()
 
         if s3_bucket:
             self.s3_bucket = s3_bucket
@@ -66,8 +70,27 @@ class EEGDashBaseDataset(BaseDataset):
             self.s3_bucket = self._AWS_BUCKET
             self.s3_open_neuro = True
 
-
-
+        # Compute a dataset folder name under cache_dir that encodes preprocessing
+        # (e.g., bdf, mini) to avoid overlapping with the original dataset cache.
+        self.dataset_folder = record.get("dataset", "")
+        if s3_bucket:
+            suffixes: list[str] = []
+            bucket_lower = str(s3_bucket).lower()
+            if "bdf" in bucket_lower:
+                suffixes.append("bdf")
+            if "mini" in bucket_lower:
+                suffixes.append("mini")
+            if suffixes:
+                self.dataset_folder = f"{self.dataset_folder}-{'-'.join(suffixes)}"
+
+        # Place files under the dataset-specific folder (with suffix if any)
+        rel = Path(record["bidspath"])  # usually starts with dataset id
+        if rel.parts and rel.parts[0] == record.get("dataset"):
+            rel = Path(self.dataset_folder, *rel.parts[1:])
+        else:
+            rel = Path(self.dataset_folder) / rel
+        self.filecache = self.cache_dir / rel
+        self.bids_root = self.cache_dir / self.dataset_folder
         self.bidspath = BIDSPath(
             root=self.bids_root,
             datatype="eeg",
```
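The net effect of this block: caches for preprocessed variants of a dataset get their own folder, derived from the bucket name. A minimal standalone sketch of the suffix logic (the function name, dataset ID, and bucket URL below are made up for illustration):

```python
def cache_folder_for(dataset: str, s3_bucket: str | None) -> str:
    """Sketch of the suffix derivation above (not the package's API)."""
    suffixes: list[str] = []
    bucket_lower = str(s3_bucket or "").lower()
    if "bdf" in bucket_lower:
        suffixes.append("bdf")
    if "mini" in bucket_lower:
        suffixes.append("mini")
    return f"{dataset}-{'-'.join(suffixes)}" if suffixes else dataset


# A hypothetical mini/BDF bucket maps to a distinct cache folder:
print(cache_folder_for("ds002718", "s3://example-bucket/R1_mini_L100_bdf"))
# -> ds002718-bdf-mini
print(cache_folder_for("ds002718", None))
# -> ds002718
```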
```diff
@@ -75,7 +98,7 @@ class EEGDashBaseDataset(BaseDataset):
             **self.bids_kwargs,
         )
 
-        self.s3file = self.
+        self.s3file = self._get_s3path(record["bidspath"])
         self.bids_dependencies = record["bidsdependencies"]
         # Temporary fix for BIDS dependencies path
         # just to release to the competition
@@ -87,7 +110,7 @@ class EEGDashBaseDataset(BaseDataset):
 
         self._raw = None
 
-    def
+    def _get_s3path(self, filepath: str) -> str:
         """Helper to form an AWS S3 URI for the given relative filepath."""
         return f"{self.s3_bucket}/{filepath}"
 
@@ -141,14 +164,22 @@ class EEGDashBaseDataset(BaseDataset):
             if dep.endswith(".set"):
                 dep = dep[:-4] + ".bdf"
 
-            s3path = self.
+            s3path = self._get_s3path(dep)
             if not self.s3_open_neuro:
                 dep = self.bids_dependencies_original[i]
 
-
+            dep_path = Path(dep)
+            if dep_path.parts and dep_path.parts[0] == self.record.get("dataset"):
+                dep_local = Path(self.dataset_folder, *dep_path.parts[1:])
+            else:
+                dep_local = Path(self.dataset_folder) / dep_path
+            filepath = self.cache_dir / dep_local
             if not self.s3_open_neuro:
+                if filepath.suffix == ".set":
+                    filepath = filepath.with_suffix(".bdf")
                 if self.filecache.suffix == ".set":
                     self.filecache = self.filecache.with_suffix(".bdf")
+
             # here, we download the dependency and it is fine
             # in the case of the competition.
             if not filepath.exists():
```
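The same first-path-component swap is applied to each BIDS dependency, so sidecars land in the suffixed cache folder rather than the plain dataset folder. A self-contained sketch with invented paths:

```python
from pathlib import Path

dataset = "ds002718"                  # invented dataset ID
dataset_folder = "ds002718-bdf-mini"  # suffixed cache folder from above
cache_dir = Path("/tmp/eegdash")

dep_path = Path("ds002718/sub-012/eeg/sub-012_task-rest_eeg.json")
if dep_path.parts and dep_path.parts[0] == dataset:
    dep_local = Path(dataset_folder, *dep_path.parts[1:])
else:
    dep_local = Path(dataset_folder) / dep_path
print(cache_dir / dep_local)
# /tmp/eegdash/ds002718-bdf-mini/sub-012/eeg/sub-012_task-rest_eeg.json (on POSIX)
```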
```diff
@@ -174,15 +205,21 @@ class EEGDashBaseDataset(BaseDataset):
                 )
                 filesystem.get(s3path, filepath, callback=callback)
 
-    def
+    def _get_raw_bids_args(self) -> dict[str, Any]:
         """Helper to restrict the metadata record to the fields needed to locate a BIDS
         recording.
         """
         desired_fields = ["subject", "session", "task", "run"]
         return {k: self.record[k] for k in desired_fields if self.record[k]}
 
-    def
+    def _ensure_raw(self) -> None:
         """Download the S3 file and BIDS dependencies if not already cached."""
+        # TO-DO: remove this once is fixed on the our side
+        # for the competition
+        if not self.s3_open_neuro:
+            self.bidspath = self.bidspath.update(extension=".bdf")
+            self.filecache = self.filecache.with_suffix(".bdf")
+
         if not os.path.exists(self.filecache):  # not preload
             if self.bids_dependencies:
                 self._download_dependencies()
```
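`_get_raw_bids_args` is a plain dict comprehension over the metadata record: empty or None entities are dropped, so `BIDSPath` only receives entities that are actually set. For illustration (record values invented):

```python
from typing import Any

record: dict[str, Any] = {
    "subject": "012",
    "session": "",                # falsy -> omitted
    "task": "rest",
    "run": None,                  # falsy -> omitted
    "sampling_frequency": 500.0,  # not a locating field -> never considered
}
desired_fields = ["subject", "session", "task", "run"]
print({k: record[k] for k in desired_fields if record[k]})
# {'subject': '012', 'task': 'rest'}
```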
```diff
@@ -191,14 +228,48 @@ class EEGDashBaseDataset(BaseDataset):
         # capturing any warnings
         # to-do: remove this once is fixed on the mne-bids side.
         with warnings.catch_warnings(record=True) as w:
+            # Ensure all warnings are captured into 'w' and not shown to users
+            warnings.simplefilter("always")
             try:
-                #
-
-
-
-
-
-
+                # mne-bids emits RuntimeWarnings to stderr; silence stderr during read
+                _stderr_buffer = io.StringIO()
+                with redirect_stderr(_stderr_buffer):
+                    self._raw = mne_bids.read_raw_bids(
+                        bids_path=self.bidspath, verbose="ERROR"
+                    )
+                # Parse unmapped participants.tsv fields reported by mne-bids and
+                # inject them into Raw.info and the dataset description generically.
+                extras = self._extract_unmapped_participants_from_warnings(w)
+                if extras:
+                    # 1) Attach to Raw.info under subject_info.participants_extras
+                    try:
+                        subject_info = self._raw.info.get("subject_info") or {}
+                        if not isinstance(subject_info, dict):
+                            subject_info = {}
+                        pe = subject_info.get("participants_extras") or {}
+                        if not isinstance(pe, dict):
+                            pe = {}
+                        # Merge without overwriting
+                        for k, v in extras.items():
+                            pe.setdefault(k, v)
+                        subject_info["participants_extras"] = pe
+                        self._raw.info["subject_info"] = subject_info
+                    except Exception:
+                        # Non-fatal; continue
+                        pass
+
+                    # 2) Also add to this dataset's description, if possible, so
+                    # targets can be selected later without naming specifics.
+                    try:
+                        if isinstance(self.description, dict):
+                            for k, v in extras.items():
+                                self.description.setdefault(k, v)
+                        elif isinstance(self.description, pd.Series):
+                            for k, v in extras.items():
+                                if k not in self.description.index:
+                                    self.description.loc[k] = v
+                    except Exception:
+                        pass
             except Exception as e:
                 logger.error(
                     f"Error while reading BIDS file: {self.bidspath}\n"
```
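The combination of `catch_warnings(record=True)`, `simplefilter("always")`, and `redirect_stderr` is pure standard-library behavior and can be reproduced in isolation; the `noisy_reader` stand-in below plays the role of `mne_bids.read_raw_bids`:

```python
import io
import sys
import warnings
from contextlib import redirect_stderr


def noisy_reader() -> str:
    """Stand-in for a reader that warns and writes progress to stderr."""
    warnings.warn("Unable to map the following column(s) to MNE: ...", RuntimeWarning)
    print("progress spam", file=sys.stderr)
    return "raw"


with warnings.catch_warnings(record=True) as w:
    warnings.simplefilter("always")  # warnings land in `w`, not the console
    buf = io.StringIO()
    with redirect_stderr(buf):       # stderr output is held in the buffer
        result = noisy_reader()

print(f"{len(w)} warning(s) captured; stderr held: {buf.getvalue().strip()!r}")
# 1 warning(s) captured; stderr held: 'progress spam'
```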
```diff
@@ -208,10 +279,59 @@ class EEGDashBaseDataset(BaseDataset):
                 logger.error(f"Exception: {e}")
                 logger.error(traceback.format_exc())
                 raise e
-
-
-
-
+            # Filter noisy mapping notices from mne-bids; surface others
+            for captured_warning in w:
+                try:
+                    msg = str(captured_warning.message)
+                except Exception:
+                    continue
+                # Suppress verbose participants mapping messages
+                if "Unable to map the following column" in msg and "MNE" in msg:
+                    logger.debug(
+                        "Suppressed mne-bids mapping warning while reading BIDS file: %s",
+                        msg,
+                    )
+                    continue
+
+    def _extract_unmapped_participants_from_warnings(
+        self, warnings_list: list[Any]
+    ) -> dict[str, Any]:
+        """Scan captured warnings from mne-bids and extract unmapped participants.tsv
+        entries in a generic way.
+
+        Optionally, the column name can carry a note in parentheses that we ignore
+        for key/value extraction. Returns a mapping of column name -> raw value.
+        """
+        extras: dict[str, Any] = {}
+        header = "Unable to map the following column(s) to MNE:"
+        for wr in warnings_list:
+            try:
+                msg = str(wr.message)
+            except Exception:
+                continue
+            if header not in msg:
+                continue
+            lines = msg.splitlines()
+            # Find the header line, then parse subsequent lines as entries
+            try:
+                idx = next(i for i, ln in enumerate(lines) if header in ln)
+            except StopIteration:
+                idx = -1
+            for line in lines[idx + 1 :]:
+                line = line.strip()
+                if not line:
+                    continue
+                # Pattern: <col>(optional note): <value>
+                # Examples: "gender: F", "Ethnicity: Indian", "foo (ignored): bar"
+                m = re.match(r"^([^:]+?)(?:\s*\([^)]*\))?\s*:\s*(.*)$", line)
+                if not m:
+                    continue
+                col = m.group(1).strip()
+                val = m.group(2).strip()
+                # Keep original column names as provided to stay agnostic
+                if col and col not in extras:
+                    extras[col] = val
+        return extras
 
     # === BaseDataset and PyTorch Dataset interface ===
 
```
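The extraction regex can be exercised on a fabricated message of the shape mne-bids emits; the header line is skipped, and a parenthesized note after a column name is ignored:

```python
import re

# Sample warning text, invented to match the documented shape:
msg = """Unable to map the following column(s) to MNE:
gender: F
Ethnicity: Indian
handedness (ignored): right"""

extras: dict[str, str] = {}
for line in msg.splitlines()[1:]:  # skip the header line
    m = re.match(r"^([^:]+?)(?:\s*\([^)]*\))?\s*:\s*(.*)$", line.strip())
    if m:
        extras.setdefault(m.group(1).strip(), m.group(2).strip())
print(extras)
# {'gender': 'F', 'Ethnicity': 'Indian', 'handedness': 'right'}
```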
```diff
@@ -230,11 +350,16 @@ class EEGDashBaseDataset(BaseDataset):
     def __len__(self) -> int:
         """Return the number of samples in the dataset."""
         if self._raw is None:
-
-
-
-
-
+            if (
+                self.record["ntimes"] is None
+                or self.record["sampling_frequency"] is None
+            ):
+                self._ensure_raw()
+            else:
+                # FIXME: this is a bit strange and should definitely not change as a side effect
+                # of accessing the data (which it will, since ntimes is the actual length but rounded down)
+                return int(self.record["ntimes"] * self.record["sampling_frequency"])
+        return len(self._raw)
 
     @property
     def raw(self):
```
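Restated as a free function, the `__len__` fallback avoids touching S3 when the record already carries enough metadata. A sketch with invented record values; the `RuntimeError` marks where the real method calls `self._ensure_raw()`:

```python
from typing import Any, Optional


def dataset_len(record: dict[str, Any], raw: Optional[list]) -> int:
    if raw is None:
        if record["ntimes"] is None or record["sampling_frequency"] is None:
            raise RuntimeError("metadata missing: would download via _ensure_raw()")
        # ntimes appears to hold the recording duration in seconds
        return int(record["ntimes"] * record["sampling_frequency"])
    return len(raw)


print(dataset_len({"ntimes": 338.2, "sampling_frequency": 500.0}, None))  # 169100
```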
```diff
@@ -242,7 +367,7 @@ class EEGDashBaseDataset(BaseDataset):
         retrieval if not yet done so.
         """
         if self._raw is None:
-            self.
+            self._ensure_raw()
         return self._raw
 
     @raw.setter
@@ -284,7 +409,7 @@ class EEGDashBaseRaw(BaseRaw):
         metadata: dict[str, Any],
         preload: bool = False,
         *,
-        cache_dir: str =
+        cache_dir: str | None = None,
         bids_dependencies: list[str] = [],
         verbose: Any = None,
     ):
@@ -300,8 +425,9 @@ class EEGDashBaseRaw(BaseRaw):
                 chtype = "eog"
             ch_types.append(chtype)
         info = mne.create_info(ch_names=ch_names, sfreq=sfreq, ch_types=ch_types)
-
-        self.
+
+        self.s3file = self._get_s3path(input_fname)
+        self.cache_dir = Path(cache_dir) if cache_dir else get_default_cache_dir()
         self.filecache = self.cache_dir / input_fname
         self.bids_dependencies = bids_dependencies
 
@@ -317,7 +443,7 @@ class EEGDashBaseRaw(BaseRaw):
             verbose=verbose,
         )
 
-    def
+    def _get_s3path(self, filepath):
         return f"{self._AWS_BUCKET}/{filepath}"
 
     def _download_s3(self) -> None:
@@ -333,7 +459,7 @@ class EEGDashBaseRaw(BaseRaw):
                 anon=True, client_kwargs={"region_name": "us-east-2"}
             )
             for dep in self.bids_dependencies:
-                s3path = self.
+                s3path = self._get_s3path(dep)
                 filepath = self.cache_dir / dep
                 if not filepath.exists():
                     filepath.parent.mkdir(parents=True, exist_ok=True)
@@ -394,11 +520,17 @@ class EEGBIDSDataset:
             raise ValueError("data_dir must be specified and must exist")
         self.bidsdir = Path(data_dir)
         self.dataset = dataset
-
+        # Accept exact dataset folder or a variant with informative suffixes
+        # (e.g., dsXXXXX-bdf, dsXXXXX-bdf-mini) to avoid collisions.
+        dir_name = self.bidsdir.name
+        if not (dir_name == self.dataset or dir_name.startswith(self.dataset + "-")):
+            raise AssertionError(
+                f"BIDS directory '{dir_name}' does not correspond to dataset '{self.dataset}'"
+            )
         self.layout = BIDSLayout(data_dir)
 
         # get all recording files in the bids directory
-        self.files = self.
+        self.files = self._get_recordings(self.layout)
         assert len(self.files) > 0, ValueError(
             "Unable to construct EEG dataset. No EEG recordings found."
         )
```
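The relaxed check accepts the plain dataset folder as well as the suffixed cache variants, while still rejecting unrelated folders; the names below are illustrative:

```python
dataset = "ds002718"  # invented ID
for dir_name in ("ds002718", "ds002718-bdf", "ds002718-bdf-mini", "ds002719"):
    ok = dir_name == dataset or dir_name.startswith(dataset + "-")
    print(f"{dir_name}: {'accepted' if ok else 'rejected'}")
# only ds002719 is rejected
```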
```diff
@@ -408,7 +540,7 @@ class EEGBIDSDataset:
         """Check if the dataset is EEG."""
         return self.get_bids_file_attribute("modality", self.files[0]).lower() == "eeg"
 
-    def
+    def _get_recordings(self, layout: BIDSLayout) -> list[str]:
         """Get a list of all EEG recording files in the BIDS layout."""
         files = []
         for ext, exts in self.RAW_EXTENSIONS.items():
@@ -417,12 +549,12 @@ class EEGBIDSDataset:
                 break
         return files
 
-    def
+    def _get_relative_bidspath(self, filename: str) -> str:
         """Make the given file path relative to the BIDS directory."""
         bids_parent_dir = self.bidsdir.parent.absolute()
         return str(Path(filename).relative_to(bids_parent_dir))
 
-    def
+    def _get_property_from_filename(self, property: str, filename: str) -> str:
         """Parse a property out of a BIDS-compliant filename. Returns an empty string
         if not found.
         """
@@ -434,7 +566,7 @@ class EEGBIDSDataset:
         lookup = re.search(rf"{property}-(.*?)[_\/]", filename)
         return lookup.group(1) if lookup else ""
 
-    def
+    def _merge_json_inheritance(self, json_files: list[str | Path]) -> dict:
         """Internal helper to merge list of json files found by get_bids_file_inheritance,
         expecting the order (from left to right) is from lowest
         level to highest level, and return a merged dictionary
@@ -445,7 +577,7 @@ class EEGBIDSDataset:
             json_dict.update(json.load(open(f)))  # FIXME: should close file
         return json_dict
 
-    def
+    def _get_bids_file_inheritance(
         self, path: str | Path, basename: str, extension: str
     ) -> list[Path]:
         """Get all file paths that apply to the basename file in the specified directory
```
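`_merge_json_inheritance` reduces the file list with `dict.update`, so keys from files later in the list overwrite earlier ones. A runnable sketch of that merge order using two temporary sidecar files (contents invented):

```python
import json
from pathlib import Path
from tempfile import TemporaryDirectory

with TemporaryDirectory() as tmp:
    low = Path(tmp, "sub-01_eeg.json")   # lower (more specific) level
    high = Path(tmp, "eeg.json")         # higher (more general) level
    low.write_text(json.dumps({"SamplingFrequency": 500, "EEGReference": "Cz"}))
    high.write_text(json.dumps({"SamplingFrequency": 512, "PowerLineFrequency": 50}))

    merged: dict = {}
    for f in [low, high]:  # lowest level first, as the helper expects
        merged.update(json.loads(f.read_text()))
    print(merged)
    # {'SamplingFrequency': 512, 'EEGReference': 'Cz', 'PowerLineFrequency': 50}
```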
```diff
@@ -492,7 +624,7 @@ class EEGBIDSDataset:
         else:
             # call get_bids_file_inheritance recursively with parent directory
             bids_files.extend(
-                self.
+                self._get_bids_file_inheritance(path.parent, basename, extension)
             )
         return bids_files
 
@@ -523,12 +655,12 @@ class EEGBIDSDataset:
         path, filename = os.path.split(filepath)
         basename = filename[: filename.rfind("_")]
         # metadata files
-        meta_files = self.
+        meta_files = self._get_bids_file_inheritance(
             path, basename, metadata_file_extension
         )
         return meta_files
 
-    def
+    def _scan_directory(self, directory: str, extension: str) -> list[Path]:
         """Return a list of file paths that end with the given extension in the specified
         directory. Ignores certain special directories like .git, .datalad, derivatives,
         and code.
@@ -545,7 +677,7 @@ class EEGBIDSDataset:
                 result_files.append(entry.path)  # Add directory to scan later
         return result_files
 
-    def
+    def _get_files_with_extension_parallel(
         self, directory: str, extension: str = ".set", max_workers: int = -1
     ) -> list[Path]:
         """Efficiently scan a directory and its subdirectories for files that end with
@@ -577,7 +709,7 @@ class EEGBIDSDataset:
         )
         # Run the scan_directory function in parallel across directories
         results = Parallel(n_jobs=max_workers, prefer="threads", verbose=1)(
-            delayed(self.
+            delayed(self._scan_directory)(d, extension) for d in dirs_to_scan
         )
 
         # Reset the directories to scan and process the results
```
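The joblib pattern above fans a scan function out over directories on a thread pool and returns one result list per directory; a minimal self-contained version (the directories and extension are placeholders):

```python
from pathlib import Path

from joblib import Parallel, delayed


def scan_directory(directory: str, extension: str) -> list[str]:
    """Toy scanner: list matching files directly inside one directory."""
    return [str(p) for p in Path(directory).iterdir() if p.name.endswith(extension)]


dirs_to_scan = [".", ".."]  # stand-ins for subject/session folders
results = Parallel(n_jobs=-1, prefer="threads")(
    delayed(scan_directory)(d, ".json") for d in dirs_to_scan
)
files = [f for per_dir in results for f in per_dir]  # flatten per-directory lists
print(files)
```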
```diff
@@ -682,7 +814,7 @@ class EEGBIDSDataset:
     def num_times(self, data_filepath: str) -> int:
         """Get the approximate number of time points in the EEG recording based on the BIDS metadata."""
         eeg_jsons = self.get_bids_metadata_files(data_filepath, "eeg.json")
-        eeg_json_dict = self.
+        eeg_json_dict = self._merge_json_inheritance(eeg_jsons)
         return int(
             eeg_json_dict["SamplingFrequency"] * eeg_json_dict["RecordingDuration"]
         )
```
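The estimate is simply sampling rate times duration from the merged sidecar; with hypothetical values:

```python
eeg_json_dict = {"SamplingFrequency": 500, "RecordingDuration": 338.2}
n_times = int(eeg_json_dict["SamplingFrequency"] * eeg_json_dict["RecordingDuration"])
print(n_times)  # 169100
```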
```diff
@@ -705,7 +837,7 @@ class EEGBIDSDataset:
     def eeg_json(self, data_filepath: str) -> dict[str, Any]:
         """Get BIDS eeg.json metadata for the given data file path."""
         eeg_jsons = self.get_bids_metadata_files(data_filepath, "eeg.json")
-        eeg_json_dict = self.
+        eeg_json_dict = self._merge_json_inheritance(eeg_jsons)
         return eeg_json_dict
 
     def channel_tsv(self, data_filepath: str) -> dict[str, Any]:
```
eegdash/dataset/dataset.py
RENAMED
```diff
@@ -3,8 +3,9 @@ from pathlib import Path
 
 from mne.utils import warn
 
-from
-from
+from ..api import EEGDashDataset
+from ..bids_eeg_metadata import build_query_from_kwargs
+from ..const import RELEASE_TO_OPENNEURO_DATASET_MAP, SUBJECT_MINI_RELEASE_MAP
 from .registry import register_openneuro_datasets
 
 logger = logging.getLogger("eegdash")
```
```diff
@@ -68,15 +69,56 @@ class EEGChallengeDataset(EEGDashDataset):
         )
 
         if self.mini:
-            #
-            #
-
-
-
-
-
+            # When using the mini release, restrict subjects to the predefined subset.
+            # If the user specifies subject(s), ensure they all belong to the mini subset;
+            # otherwise, default to the full mini subject list for this release.
+
+            allowed_subjects = set(SUBJECT_MINI_RELEASE_MAP[release])
+
+            # Normalize potential 'subjects' -> 'subject' for convenience
+            if "subjects" in kwargs and "subject" not in kwargs:
+                kwargs["subject"] = kwargs.pop("subjects")
+
+            # Collect user-requested subjects from kwargs/query. We canonicalize
+            # kwargs via build_query_from_kwargs to leverage existing validation,
+            # and support Mongo-style {"$in": [...]} shapes from a raw query.
+            requested_subjects: list[str] = []
+
+            # From kwargs
+            if "subject" in kwargs and kwargs["subject"] is not None:
+                # Use the shared query builder to normalize scalars/lists
+                built = build_query_from_kwargs(subject=kwargs["subject"])
+                s_val = built.get("subject")
+                if isinstance(s_val, dict) and "$in" in s_val:
+                    requested_subjects.extend(list(s_val["$in"]))
+                elif s_val is not None:
+                    requested_subjects.append(s_val)  # type: ignore[arg-type]
+
+            # From query (top-level only)
+            if query and isinstance(query, dict) and "subject" in query:
+                qval = query["subject"]
+                if isinstance(qval, dict) and "$in" in qval:
+                    requested_subjects.extend(list(qval["$in"]))
+                elif isinstance(qval, (list, tuple, set)):
+                    requested_subjects.extend(list(qval))
+                elif qval is not None:
+                    requested_subjects.append(qval)
+
+            # Validate if any subjects were explicitly requested
+            if requested_subjects:
+                invalid = sorted(
+                    {s for s in requested_subjects if s not in allowed_subjects}
                 )
-
+                if invalid:
+                    raise ValueError(
+                        "Some requested subject(s) are not part of the mini release for "
+                        f"{release}: {invalid}. Allowed subjects: {sorted(allowed_subjects)}"
+                    )
+                # Do not override user selection; keep their (validated) subjects as-is.
+            else:
+                # No subject specified by the user: default to the full mini subset
+                kwargs["subject"] = sorted(allowed_subjects)
+
             s3_bucket = f"{s3_bucket}/{release}_mini_L100_bdf"
         else:
             s3_bucket = f"{s3_bucket}/{release}_L100_bdf"
```
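Taken together, the mini-release handling means an out-of-subset subject now raises immediately instead of silently matching nothing. A usage sketch; the import path and constructor arguments are inferred from this diff and may not match the released signature exactly, and the subject ID is a placeholder:

```python
from eegdash.dataset import EEGChallengeDataset  # inferred import path

try:
    ds = EEGChallengeDataset(release="R1", mini=True, subject="NDARXX000XXX")
except ValueError as err:
    print(err)  # names the invalid subject(s) and lists the allowed mini subset

# Omitting `subject` defaults to every subject in the mini subset.
ds = EEGChallengeDataset(release="R1", mini=True)
```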
```diff
@@ -104,6 +146,7 @@ class EEGChallengeDataset(EEGDashDataset):
             query=query,
             cache_dir=cache_dir,
             s3_bucket=s3_bucket,
+            _suppress_comp_warning=True,
             **kwargs,
         )
 
```