eegdash 0.3.7.dev105__tar.gz → 0.3.7.dev107__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of eegdash might be problematic.
- {eegdash-0.3.7.dev105/eegdash.egg-info → eegdash-0.3.7.dev107}/PKG-INFO +1 -1
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/eegdash/__init__.py +1 -1
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/eegdash/api.py +88 -41
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/eegdash/bids_eeg_metadata.py +75 -5
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/eegdash/data_utils.py +118 -26
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107/eegdash.egg-info}/PKG-INFO +1 -1
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/eegdash.egg-info/SOURCES.txt +0 -1
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/pyproject.toml +7 -6
- eegdash-0.3.7.dev107/tests/test_offline.py +157 -0
- eegdash-0.3.7.dev105/tests/test_offline.py +0 -51
- eegdash-0.3.7.dev105/tests/test_offline_bids_matching.py +0 -119
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/LICENSE +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/MANIFEST.in +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/README.md +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/docs/Makefile +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/docs/source/conf.py +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/docs/source/dataset_summary.rst +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/docs/source/index.rst +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/docs/source/install/install.rst +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/docs/source/install/install_pip.rst +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/docs/source/install/install_source.rst +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/docs/source/overview.rst +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/eegdash/const.py +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/eegdash/dataset/__init__.py +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/eegdash/dataset/dataset.py +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/eegdash/dataset/registry.py +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/eegdash/features/__init__.py +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/eegdash/features/datasets.py +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/eegdash/features/decorators.py +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/eegdash/features/extractors.py +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/eegdash/features/feature_bank/__init__.py +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/eegdash/features/feature_bank/complexity.py +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/eegdash/features/feature_bank/connectivity.py +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/eegdash/features/feature_bank/csp.py +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/eegdash/features/feature_bank/dimensionality.py +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/eegdash/features/feature_bank/signal.py +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/eegdash/features/feature_bank/spectral.py +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/eegdash/features/feature_bank/utils.py +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/eegdash/features/inspect.py +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/eegdash/features/serialization.py +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/eegdash/features/utils.py +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/eegdash/mongodb.py +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/eegdash/utils.py +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/eegdash.egg-info/dependency_links.txt +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/eegdash.egg-info/requires.txt +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/eegdash.egg-info/top_level.txt +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/setup.cfg +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/tests/test_api.py +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/tests/test_cache_folder_suffix.py +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/tests/test_challenge_kwargs.py +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/tests/test_correctness.py +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/tests/test_dataset.py +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/tests/test_dataset_registration.py +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/tests/test_eegdash.py +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/tests/test_functional.py +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/tests/test_init.py +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/tests/test_minirelease.py +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/tests/test_mongo_connection.py +0 -0
- {eegdash-0.3.7.dev105 → eegdash-0.3.7.dev107}/tests/test_query.py +0 -0
--- eegdash-0.3.7.dev105/eegdash.egg-info/PKG-INFO
+++ eegdash-0.3.7.dev107/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: eegdash
-Version: 0.3.7.dev105
+Version: 0.3.7.dev107
 Summary: EEG data for machine learning
 Author-email: Young Truong <dt.young112@gmail.com>, Arnaud Delorme <adelorme@gmail.com>, Aviv Dotan <avivd220@gmail.com>, Oren Shriki <oren70@gmail.com>, Bruno Aristimunha <b.aristimunha@gmail.com>
 License-Expression: GPL-3.0-only
--- eegdash-0.3.7.dev105/eegdash/api.py
+++ eegdash-0.3.7.dev107/eegdash/api.py
@@ -18,13 +18,21 @@ from s3fs import S3FileSystem
 
 from braindecode.datasets import BaseConcatDataset
 
-from .bids_eeg_metadata import build_query_from_kwargs, load_eeg_attrs_from_bids_file
+from .bids_eeg_metadata import (
+    build_query_from_kwargs,
+    load_eeg_attrs_from_bids_file,
+    merge_participants_fields,
+    normalize_key,
+)
 from .const import (
     ALLOWED_QUERY_FIELDS,
     RELEASE_TO_OPENNEURO_DATASET_MAP,
 )
 from .const import config as data_config
-from .data_utils import EEGDashBaseDataset, EEGBIDSDataset
+from .data_utils import (
+    EEGBIDSDataset,
+    EEGDashBaseDataset,
+)
 from .mongodb import MongoConnectionManager
 
 logger = logging.getLogger("eegdash")
@@ -784,20 +792,49 @@ class EEGDashDataset(BaseConcatDataset):
                     f"Offline mode is enabled, but local data_dir {self.data_dir} does not exist."
                 )
             records = self._find_local_bids_records(self.data_dir, self.query)
-            datasets = [
-                EEGDashBaseDataset(
-                    record=record,
-                    cache_dir=self.cache_dir,
-                    s3_bucket=self.s3_bucket,
-                    description={
-                        k: record.get(k)
-                        for k in ("subject", "session", "run", "task")
-                        if record.get(k) is not None
-                    },
-                    **base_dataset_kwargs,
+            # Try to enrich from local participants.tsv to restore requested fields
+            try:
+                bids_ds = EEGBIDSDataset(
+                    data_dir=str(self.data_dir), dataset=self.query["dataset"]
+                )  # type: ignore[index]
+            except Exception:
+                bids_ds = None
+
+            datasets = []
+            for record in records:
+                # Start with entity values from filename
+                desc: dict[str, Any] = {
+                    k: record.get(k)
+                    for k in ("subject", "session", "run", "task")
+                    if record.get(k) is not None
+                }
+
+                if bids_ds is not None:
+                    try:
+                        rel_from_dataset = Path(record["bidspath"]).relative_to(
+                            record["dataset"]
+                        )  # type: ignore[index]
+                        local_file = (self.data_dir / rel_from_dataset).as_posix()
+                        part_row = bids_ds.subject_participant_tsv(local_file)
+                        desc = merge_participants_fields(
+                            description=desc,
+                            participants_row=part_row
+                            if isinstance(part_row, dict)
+                            else None,
+                            description_fields=description_fields,
+                        )
+                    except Exception:
+                        pass
+
+                datasets.append(
+                    EEGDashBaseDataset(
+                        record=record,
+                        cache_dir=self.cache_dir,
+                        s3_bucket=self.s3_bucket,
+                        description=desc,
+                        **base_dataset_kwargs,
+                    )
                 )
-                for record in records
-            ]
         elif self.query:
             # This is the DB query path that we are improving
             datasets = self._find_datasets(
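The offline path above builds a minimal description from the filename entities first and only then overlays participants.tsv columns, never overwriting what is already there. A minimal sketch of that merge order, using plain dicts and a hypothetical participants row rather than the library's own classes:

    # Sketch: filename entities first, participants.tsv second (no overwrites).
    record = {"subject": "01", "session": None, "run": "1", "task": "rest"}
    participants_row = {"age": 12, "subject": "should-not-overwrite"}  # hypothetical row

    desc = {k: v for k, v in record.items() if v is not None}
    for col, val in participants_row.items():
        desc.setdefault(col, val)  # existing entity values win

    print(desc)  # {'subject': '01', 'run': '1', 'task': 'rest', 'age': 12}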
@@ -882,23 +919,16 @@ class EEGDashDataset(BaseConcatDataset):
             else:
                 matching_args[finder_key] = [entity_val]
 
-        paths = find_matching_paths(
+        matched_paths = find_matching_paths(
             root=str(dataset_root),
             datatypes=["eeg"],
             suffixes=["eeg"],
             ignore_json=True,
             **matching_args,
         )
+        records_out: list[dict] = []
 
-        records: list[dict] = []
-        seen_files: set[str] = set()
-
-        for bids_path in paths:
-            fpath = str(Path(bids_path.fpath).resolve())
-            if fpath in seen_files:
-                continue
-            seen_files.add(fpath)
-
+        for bids_path in matched_paths:
             # Build bidspath as dataset_id / relative_path_from_dataset_root (POSIX)
             rel_from_root = (
                 Path(bids_path.fpath)
@@ -915,29 +945,37 @@ class EEGDashDataset(BaseConcatDataset):
                 "session": (bids_path.session or None),
                 "task": (bids_path.task or None),
                 "run": (bids_path.run or None),
-                # minimal fields to satisfy BaseDataset
+                # minimal fields to satisfy BaseDataset from eegdash
                 "bidsdependencies": [],  # not needed to just run.
                 "modality": "eeg",
-                #
-                "sampling_frequency":
-                "nchans":
-                "ntimes":
+                # minimal numeric defaults for offline length calculation
+                "sampling_frequency": None,
+                "nchans": None,
+                "ntimes": None,
             }
-            records.append(rec)
+            records_out.append(rec)
 
-        return records
+        return records_out
 
     def _find_key_in_nested_dict(self, data: Any, target_key: str) -> Any:
-        """
-        Recursively search for target_key in a nested dictionary.
+        """Recursively search for target_key in nested dicts/lists with normalized matching.
+
+        This makes lookups tolerant to naming differences like "p-factor" vs "p_factor".
+        Returns the first match or None.
         """
+        norm_target = normalize_key(target_key)
         if isinstance(data, dict):
-            if target_key in data:
-                return data[target_key]
-            for value in data.values():
-                result = self._find_key_in_nested_dict(value, target_key)
-                if result is not None:
-                    return result
+            for k, v in data.items():
+                if normalize_key(k) == norm_target:
+                    return v
+                res = self._find_key_in_nested_dict(v, target_key)
+                if res is not None:
+                    return res
+        elif isinstance(data, list):
+            for item in data:
+                res = self._find_key_in_nested_dict(item, target_key)
+                if res is not None:
+                    return res
         return None
 
     def _find_datasets(
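The rewritten _find_key_in_nested_dict is a depth-first search keyed on normalized names. The same idea as a self-contained sketch (a standalone function, not the class method):

    import re

    def normalize_key(key: str) -> str:
        return re.sub(r"[^a-z0-9]+", "_", str(key).lower()).strip("_")

    def find_key(data, target):
        """Depth-first search over dicts/lists; keys match after normalization."""
        norm_target = normalize_key(target)
        if isinstance(data, dict):
            for k, v in data.items():
                if normalize_key(k) == norm_target:
                    return v
                res = find_key(v, target)
                if res is not None:
                    return res
        elif isinstance(data, list):
            for item in data:
                res = find_key(item, target)
                if res is not None:
                    return res
        return None

    record = {"participant_tsv": {"p-factor": 2.5}}  # hypothetical record
    print(find_key(record, "p_factor"))  # 2.5 -- "p-factor" matches "p_factor"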
@@ -969,11 +1007,20 @@ class EEGDashDataset(BaseConcatDataset):
         self.records = self.eeg_dash_instance.find(query)
 
         for record in self.records:
-            description = {}
+            description: dict[str, Any] = {}
+            # Requested fields first (normalized matching)
             for field in description_fields:
                 value = self._find_key_in_nested_dict(record, field)
                 if value is not None:
                     description[field] = value
+            # Merge all participants.tsv columns generically
+            part = self._find_key_in_nested_dict(record, "participant_tsv")
+            if isinstance(part, dict):
+                description = merge_participants_fields(
+                    description=description,
+                    participants_row=part,
+                    description_fields=description_fields,
+                )
             datasets.append(
                 EEGDashBaseDataset(
                     record,
--- eegdash-0.3.7.dev105/eegdash/bids_eeg_metadata.py
+++ eegdash-0.3.7.dev107/eegdash/bids_eeg_metadata.py
@@ -1,16 +1,18 @@
 import logging
+import re
 from pathlib import Path
 from typing import Any
 
 from .const import ALLOWED_QUERY_FIELDS
 from .const import config as data_config
-from .data_utils import EEGBIDSDataset
 
 logger = logging.getLogger("eegdash")
 
 __all__ = [
     "build_query_from_kwargs",
     "load_eeg_attrs_from_bids_file",
+    "merge_participants_fields",
+    "normalize_key",
 ]
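Dropping `from .data_utils import EEGBIDSDataset` here (and untyping the two signatures below) looks like a fix for an import cycle, since data_utils and api already import from this module. If the annotation were still wanted, a common alternative is a type-checking-only import, sketched here under that assumption:

    from __future__ import annotations

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:  # evaluated by static type checkers, never at runtime
        from .data_utils import EEGBIDSDataset

    def _get_raw_extensions(bids_file: str, bids_dataset: "EEGBIDSDataset") -> list[str]:
        ...  # annotation survives without creating a runtime import cycle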
@@ -70,7 +72,7 @@ def build_query_from_kwargs(**kwargs) -> dict[str, Any]:
     return query
 
 
-def _get_raw_extensions(bids_file: str, bids_dataset: EEGBIDSDataset) -> list[str]:
+def _get_raw_extensions(bids_file: str, bids_dataset) -> list[str]:
     """Helper to find paths to additional "sidecar" files that may be associated
     with a given main data file in a BIDS dataset; paths are returned as relative to
     the parent dataset path.
@@ -92,9 +94,7 @@ def _get_raw_extensions(bids_file: str, bids_dataset: EEGBIDSDataset) -> list[str]:
     ]
 
 
-def load_eeg_attrs_from_bids_file(
-    bids_dataset: EEGBIDSDataset, bids_file: str
-) -> dict[str, Any]:
+def load_eeg_attrs_from_bids_file(bids_dataset, bids_file: str) -> dict[str, Any]:
     """Build the metadata record for a given BIDS file (single recording) in a BIDS dataset.
 
     Attributes are at least the ones defined in data_config attributes (set to None if missing),
@@ -182,3 +182,73 @@ def load_eeg_attrs_from_bids_file(
             attrs[field] = None
 
     return attrs
+
+
+def normalize_key(key: str) -> str:
+    """Normalize a metadata key for robust matching.
+
+    Lowercase and replace non-alphanumeric characters with underscores, then strip
+    leading/trailing underscores. This allows tolerant matching such as
+    "p-factor" ≈ "p_factor" ≈ "P Factor".
+    """
+    return re.sub(r"[^a-z0-9]+", "_", str(key).lower()).strip("_")
+
+
+def merge_participants_fields(
+    description: dict[str, Any],
+    participants_row: dict[str, Any] | None,
+    description_fields: list[str] | None = None,
+) -> dict[str, Any]:
+    """Merge participants.tsv fields into a dataset description dictionary.
+
+    - Preserves existing entries in ``description`` (no overwrites).
+    - Fills requested ``description_fields`` first, preserving their original names.
+    - Adds all remaining participants columns generically using normalized keys
+      unless a matching requested field already captured them.
+
+    Parameters
+    ----------
+    description : dict
+        Current description to be enriched in-place and returned.
+    participants_row : dict | None
+        A mapping of participants.tsv columns for the current subject.
+    description_fields : list[str] | None
+        Optional list of requested description fields. When provided, matching is
+        performed by normalized names; the original requested field names are kept.
+
+    Returns
+    -------
+    dict
+        The enriched description (same object as input for convenience).
+
+    """
+    if not isinstance(description, dict) or not isinstance(participants_row, dict):
+        return description
+
+    # Normalize participants keys and keep first non-None value per normalized key
+    norm_map: dict[str, Any] = {}
+    for part_key, part_value in participants_row.items():
+        norm_key = normalize_key(part_key)
+        if norm_key not in norm_map and part_value is not None:
+            norm_map[norm_key] = part_value
+
+    # Ensure description_fields is a list for matching
+    requested = list(description_fields or [])
+
+    # 1) Fill requested fields first using normalized matching, preserving names
+    for key in requested:
+        if key in description:
+            continue
+        requested_norm_key = normalize_key(key)
+        if requested_norm_key in norm_map:
+            description[key] = norm_map[requested_norm_key]
+
+    # 2) Add remaining participants columns generically under normalized names,
+    #    unless a requested field already captured them
+    requested_norm = {normalize_key(k) for k in requested}
+    for norm_key, part_value in norm_map.items():
+        if norm_key in requested_norm:
+            continue
+        if norm_key not in description:
+            description[norm_key] = part_value
+    return description
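Taken together, the two new helpers behave as in this illustrative run (the row values are made up):

    from eegdash.bids_eeg_metadata import merge_participants_fields, normalize_key

    assert normalize_key("P Factor") == "p_factor"
    assert normalize_key("p-factor") == "p_factor"

    desc = {"subject": "01"}
    row = {"p-factor": 2.5, "Age": 12, "subject": "ignored"}  # hypothetical participants.tsv row
    merged = merge_participants_fields(desc, row, description_fields=["p_factor"])
    # The requested field keeps its requested spelling, the remaining columns
    # arrive under normalized names, and existing keys are never overwritten.
    assert merged == {"subject": "01", "p_factor": 2.5, "age": 12}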
--- eegdash-0.3.7.dev105/eegdash/data_utils.py
+++ eegdash-0.3.7.dev107/eegdash/data_utils.py
@@ -1,9 +1,11 @@
+import io
 import json
 import logging
 import os
 import re
 import traceback
 import warnings
+from contextlib import redirect_stderr
 from pathlib import Path
 from typing import Any
 
@@ -91,19 +93,8 @@ class EEGDashBaseDataset(BaseDataset):
             root=self.bids_root,
             datatype="eeg",
             suffix="eeg",
-            # extension='.bdf',
             **self.bids_kwargs,
         )
-        # TO-DO: remove this once find a better solution using mne-bids or update competition dataset
-        try:
-            _ = str(self.bidspath)
-        except RuntimeError:
-            try:
-                self.bidspath = self.bidspath.update(extension=".bdf")
-                self.filecache = self.filecache.with_suffix(".bdf")
-            except Exception as e:
-                logger.error(f"Error while updating BIDS path: {e}")
-                raise e
 
         self.s3file = self._get_s3path(record["bidspath"])
         self.bids_dependencies = record["bidsdependencies"]
@@ -182,8 +173,11 @@ class EEGDashBaseDataset(BaseDataset):
             dep_local = Path(self.dataset_folder) / dep_path
             filepath = self.cache_dir / dep_local
             if not self.s3_open_neuro:
+                if filepath.suffix == ".set":
+                    filepath = filepath.with_suffix(".bdf")
                 if self.filecache.suffix == ".set":
                     self.filecache = self.filecache.with_suffix(".bdf")
+
             # here, we download the dependency and it is fine
             # in the case of the competition.
             if not filepath.exists():
@@ -218,6 +212,12 @@ class EEGDashBaseDataset(BaseDataset):
 
     def _ensure_raw(self) -> None:
         """Download the S3 file and BIDS dependencies if not already cached."""
+        # TO-DO: remove this once is fixed on the our side
+        # for the competition
+        if not self.s3_open_neuro:
+            self.bidspath = self.bidspath.update(extension=".bdf")
+            self.filecache = self.filecache.with_suffix(".bdf")
+
         if not os.path.exists(self.filecache):  # not preload
             if self.bids_dependencies:
                 self._download_dependencies()
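The `.set` to `.bdf` rewrite in these two hunks leans on `pathlib.Path.with_suffix`, which swaps only the final extension; a quick illustration with a made-up path:

    from pathlib import Path

    f = Path("ds005509/sub-01/eeg/sub-01_task-rest_eeg.set")  # hypothetical cache path
    print(f.with_suffix(".bdf"))  # ds005509/sub-01/eeg/sub-01_task-rest_eeg.bdf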
@@ -226,13 +226,50 @@ class EEGDashBaseDataset(BaseDataset):
         # capturing any warnings
         # to-do: remove this once is fixed on the mne-bids side.
         with warnings.catch_warnings(record=True) as w:
+            # Ensure all warnings are captured into 'w' and not shown to users
+            warnings.simplefilter("always")
             try:
-                # read the raw BIDS recording
-                self._raw = mne_bids.read_raw_bids(
-                    bids_path=self.bidspath,
-                    verbose="ERROR",
-                )
-
+                # mne-bids emits RuntimeWarnings to stderr; silence stderr during read
+                _stderr_buffer = io.StringIO()
+                with redirect_stderr(_stderr_buffer):
+                    self._raw = mne_bids.read_raw_bids(
+                        bids_path=self.bidspath, verbose="ERROR"
+                    )
+                # Parse unmapped participants.tsv fields reported by mne-bids and
+                # inject them into Raw.info and the dataset description generically.
+                extras = self._extract_unmapped_participants_from_warnings(w)
+                if extras:
+                    # 1) Attach to Raw.info under subject_info.participants_extras
+                    try:
+                        subject_info = self._raw.info.get("subject_info") or {}
+                        if not isinstance(subject_info, dict):
+                            subject_info = {}
+                        pe = subject_info.get("participants_extras") or {}
+                        if not isinstance(pe, dict):
+                            pe = {}
+                        # Merge without overwriting
+                        for k, v in extras.items():
+                            pe.setdefault(k, v)
+                        subject_info["participants_extras"] = pe
+                        self._raw.info["subject_info"] = subject_info
+                    except Exception:
+                        # Non-fatal; continue
+                        pass
+
+                    # 2) Also add to this dataset's description, if possible, so
+                    #    targets can be selected later without naming specifics.
+                    try:
+                        import pandas as _pd  # local import to avoid top-level cost
+
+                        if isinstance(self.description, dict):
+                            for k, v in extras.items():
+                                self.description.setdefault(k, v)
+                        elif isinstance(self.description, _pd.Series):
+                            for k, v in extras.items():
+                                if k not in self.description.index:
+                                    self.description.loc[k] = v
+                    except Exception:
+                        pass
             except Exception as e:
                 logger.error(
                     f"Error while reading BIDS file: {self.bidspath}\n"
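The capture pattern added above combines three standard-library pieces; here is the same shape in isolation, with a synthetic warning standing in for mne-bids:

    import io
    import warnings
    from contextlib import redirect_stderr

    buf = io.StringIO()
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")  # record everything, even once-filtered warnings
        with redirect_stderr(buf):       # anything printed to stderr lands in buf
            warnings.warn("Unable to map the following column(s) to MNE:\ngender: F")

    assert "Unable to map" in str(w[0].message)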
@@ -242,10 +279,60 @@ class EEGDashBaseDataset(BaseDataset):
                 logger.error(f"Exception: {e}")
                 logger.error(traceback.format_exc())
                 raise e
-            for warning in w:
-                logger.warning(
-                    f"Warning while reading BIDS file: {warning.message}"
-                )
+            # Filter noisy mapping notices from mne-bids; surface others
+            for captured_warning in w:
+                try:
+                    msg = str(captured_warning.message)
+                except Exception:
+                    continue
+                # Suppress verbose participants mapping messages
+                if "Unable to map the following column" in msg and "MNE" in msg:
+                    logger.debug(
+                        "Suppressed mne-bids mapping warning while reading BIDS file: %s",
+                        msg,
+                    )
+                    continue
+                logger.warning("Warning while reading BIDS file: %s", msg)
+
+    def _extract_unmapped_participants_from_warnings(
+        self, warnings_list: list[Any]
+    ) -> dict[str, Any]:
+        """Scan captured warnings from mne-bids and extract unmapped participants.tsv
+        entries in a generic way.
+
+        Optionally, the column name can carry a note in parentheses that we ignore
+        for key/value extraction. Returns a mapping of column name -> raw value.
+        """
+        extras: dict[str, Any] = {}
+        header = "Unable to map the following column(s) to MNE:"
+        for wr in warnings_list:
+            try:
+                msg = str(wr.message)
+            except Exception:
+                continue
+            if header not in msg:
+                continue
+            lines = msg.splitlines()
+            # Find the header line, then parse subsequent lines as entries
+            try:
+                idx = next(i for i, ln in enumerate(lines) if header in ln)
+            except StopIteration:
+                idx = -1
+            for line in lines[idx + 1 :]:
+                line = line.strip()
+                if not line:
+                    continue
+                # Pattern: <col>(optional note): <value>
+                # Examples: "gender: F", "Ethnicity: Indian", "foo (ignored): bar"
+                m = re.match(r"^([^:]+?)(?:\s*\([^)]*\))?\s*:\s*(.*)$", line)
+                if not m:
+                    continue
+                col = m.group(1).strip()
+                val = m.group(2).strip()
+                # Keep original column names as provided to stay agnostic
+                if col and col not in extras:
+                    extras[col] = val
+        return extras
 
     # === BaseDataset and PyTorch Dataset interface ===
 
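The `<col>(optional note): <value>` regex in `_extract_unmapped_participants_from_warnings` can be exercised in isolation; the sample lines mirror the comment in the diff:

    import re

    pattern = re.compile(r"^([^:]+?)(?:\s*\([^)]*\))?\s*:\s*(.*)$")

    for line in ["gender: F", "Ethnicity: Indian", "foo (ignored): bar"]:
        m = pattern.match(line)
        print(m.group(1).strip(), "->", m.group(2).strip())
    # gender -> F
    # Ethnicity -> Indian
    # foo -> bar   (the parenthesized note is dropped)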
@@ -264,11 +351,16 @@ class EEGDashBaseDataset(BaseDataset):
     def __len__(self) -> int:
         """Return the number of samples in the dataset."""
         if self._raw is None:
-            return int(
-                self.record["ntimes"] * self.record["sampling_frequency"]
-            )
-        else:
-            return len(self._raw)
+            if (
+                self.record["ntimes"] is None
+                or self.record["sampling_frequency"] is None
+            ):
+                self._ensure_raw()
+            else:
+                # FIXME: this is a bit strange and should definitely not change as a side effect
+                # of accessing the data (which it will, since ntimes is the actual length but rounded down)
+                return int(self.record["ntimes"] * self.record["sampling_frequency"])
+        return len(self._raw)
 
     @property
     def raw(self):
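The new `__len__` branch derives the sample count from metadata instead of loading the recording: with `ntimes` stored as a duration in seconds, 162.4 s at a 500 Hz sampling frequency gives `int(162.4 * 500)` = 81200 samples. A minimal check with hypothetical metadata values:

    record = {"ntimes": 162.4, "sampling_frequency": 500}  # hypothetical metadata
    n_samples = int(record["ntimes"] * record["sampling_frequency"])
    assert n_samples == 81200  # computed without touching the raw file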
--- eegdash-0.3.7.dev105/PKG-INFO
+++ eegdash-0.3.7.dev107/eegdash.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: eegdash
-Version: 0.3.7.dev105
+Version: 0.3.7.dev107
 Summary: EEG data for machine learning
 Author-email: Young Truong <dt.young112@gmail.com>, Arnaud Delorme <adelorme@gmail.com>, Aviv Dotan <avivd220@gmail.com>, Oren Shriki <oren70@gmail.com>, Bruno Aristimunha <b.aristimunha@gmail.com>
 License-Expression: GPL-3.0-only
--- eegdash-0.3.7.dev105/pyproject.toml
+++ eegdash-0.3.7.dev107/pyproject.toml
@@ -114,13 +114,14 @@ line-length = 88
 target-version = "py311"
 
 [tool.isort]
-skip
-skip_glob = ["examples/*"]
-py_version
-profile
-sections
-known_first_party
+skip = [".gitignore"]
+# Format examples too; pre-commit selects files, no need for skip_glob
+py_version = 312
+profile = "black"
+sections = ["FUTURE", "STDLIB", "THIRDPARTY", "FIRSTPARTY", "LOCALFOLDER"]
+known_first_party = ["eegdash", "braindecode"]
 lines_between_sections = 1
+atomic = false
 
 [pytest]
 testpaths = ["tests"]
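Under this isort configuration (profile "black", explicit sections, `braindecode` treated as first-party), imports in the package would be grouped roughly like this illustrative block:

    from __future__ import annotations  # FUTURE

    import logging                      # STDLIB
    from pathlib import Path

    import numpy as np                  # THIRDPARTY

    from braindecode.datasets import BaseConcatDataset  # FIRSTPARTY (known_first_party)
    from eegdash.const import ALLOWED_QUERY_FIELDS

    from .mongodb import MongoConnectionManager         # LOCALFOLDER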
--- /dev/null
+++ eegdash-0.3.7.dev107/tests/test_offline.py
@@ -0,0 +1,157 @@
+from pathlib import Path
+
+import platformdirs
+
+from eegdash.const import RELEASE_TO_OPENNEURO_DATASET_MAP
+from eegdash.dataset.dataset import EEGChallengeDataset
+
+
+def test_offline_real_data_end_to_end():
+    """Use real data like in the tutorial: prefetch (online) then go offline.
+
+    - Prefetch via EEGChallengeDataset (mini release) to the user cache
+    - Instantiate an offline EEGChallengeDataset pointing at the cache
+    - Compare raw shapes for one subject and basic description columns
+    """
+    release = "R2"
+    _ = RELEASE_TO_OPENNEURO_DATASET_MAP[release]
+    cache_dir = Path(platformdirs.user_cache_dir("EEGDash"))
+    cache_dir.mkdir(parents=True, exist_ok=True)
+
+    # Online: construct challenge dataset (mini) and prefetch first subject
+    # Limit to a single subject to keep the test lean
+    subject_id = "NDARAB793GL3"  # part of R2 mini set
+    ds_online = EEGChallengeDataset(
+        release=release,
+        cache_dir=cache_dir,
+        task="RestingState",
+        mini=True,
+        subject=subject_id,
+    )
+    assert len(ds_online.datasets) > 0
+    first_online = ds_online.datasets[0]
+    # Trigger download of this subject's files (raw + sidecars)
+    _ = first_online.raw
+
+    # Offline: enumerate locally cached data
+    ds_offline = EEGChallengeDataset(
+        release=release,
+        cache_dir=cache_dir,
+        task="RestingState",
+        download=False,
+        subject=subject_id,
+    )
+    assert len(ds_offline.datasets) == 1
+    first_offline = ds_offline.datasets[0]
+
+    # Compare raw shapes for the same subject online vs offline
+    shape_online = first_online.raw.get_data().shape
+    shape_offline = first_offline.raw.get_data().shape
+    assert shape_online == shape_offline
+
+    # Basic description columns present
+    for col in ("subject", "task"):
+        assert col in ds_offline.description.columns
+
+
+def test_offline_real_bidspath_and_cache_suffix():
+    """Verify bidspath root and local cache folder for real data (tutorial style)."""
+    release = "R2"
+    dataset_id = RELEASE_TO_OPENNEURO_DATASET_MAP[release]
+    cache_dir = Path(platformdirs.user_cache_dir("EEGDash"))
+    cache_dir.mkdir(parents=True, exist_ok=True)
+
+    subject_id = "NDARAB793GL3"
+    ds_offline = EEGChallengeDataset(
+        release=release,
+        cache_dir=cache_dir,
+        task="RestingState",
+        download=False,
+        subject=subject_id,
+    )
+    assert len(ds_offline.datasets) == 1
+    base = ds_offline.datasets[0]
+    # bidspath must start with dataset id (not suffixed cache folder)
+    assert base.record["bidspath"].split("/")[0] == dataset_id
+    # local BIDS root points to suffixed folder used by the challenge
+    assert (cache_dir / f"{dataset_id}-bdf-mini").exists()
+    assert base.bids_root == cache_dir / f"{dataset_id}-bdf-mini"
+
+
+def test_offline_real_records_description_shape():
+    """Reconstruct from records and compare description row counts (tutorial-like)."""
+    release = "R2"
+    cache_dir = Path(platformdirs.user_cache_dir("EEGDash"))
+    cache_dir.mkdir(parents=True, exist_ok=True)
+
+    subject_id = "NDARAB793GL3"
+    ds_offline = EEGChallengeDataset(
+        release=release,
+        cache_dir=cache_dir,
+        task="RestingState",
+        download=False,
+        subject=subject_id,
+    )
+    assert len(ds_offline.datasets) == 1
+
+    # Recreate a dataset from the exact offline records
+    records = [bd.record for bd in ds_offline.datasets]
+    ds_from_records = EEGChallengeDataset(
+        release=release, cache_dir=cache_dir, task="RestingState", records=records
+    )
+
+    assert ds_offline.description.shape[0] == ds_from_records.description.shape[0]
+
+
+def test_online_vs_records_vs_offline_single_subject():
+    """Compare online vs records-injection vs offline for a single subject.
+
+    Ensures consistent row counts and identical raw data shapes across modes.
+    """
+    release = "R2"
+    subject_id = "NDARAB793GL3"
+    cache_dir = Path(platformdirs.user_cache_dir("EEGDash"))
+    cache_dir.mkdir(parents=True, exist_ok=True)
+
+    # Online for a single subject, and prefetch raw
+    ds_online = EEGChallengeDataset(
+        release=release,
+        cache_dir=cache_dir,
+        task="RestingState",
+        mini=True,
+        subject=subject_id,
+    )
+    assert len(ds_online.datasets) == 1
+    online_base = ds_online.datasets[0]
+    _ = online_base.raw
+
+    # From records (inject the online records directly)
+    records = [d.record for d in ds_online.datasets]
+    ds_records = EEGChallengeDataset(
+        release=release,
+        cache_dir=cache_dir,
+        task="RestingState",
+        records=records,
+    )
+    assert len(ds_records.datasets) == 1
+
+    # Offline: enumerate from cache for same subject
+    ds_offline = EEGChallengeDataset(
+        release=release,
+        cache_dir=cache_dir,
+        task="RestingState",
+        download=False,
+        subject=subject_id,
+    )
+    assert len(ds_offline.datasets) == 1
+
+    # Compare row counts in description
+    assert ds_online.description.shape[0] == 1
+    assert ds_records.description.shape[0] == 1
+    assert ds_offline.description.shape[0] == 1
+
+    # Compare raw shapes across modes
+    shape_online = ds_online.datasets[0].raw.get_data().shape
+    shape_records = ds_records.datasets[0].raw.get_data().shape
+    shape_offline = ds_offline.datasets[0].raw.get_data().shape
+    assert shape_online == shape_records == shape_offline
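Per its docstrings, the rewritten offline suite hits the real R2 mini release, so it needs one online pass to warm the cache; after that it can be run directly with pytest:

    pytest tests/test_offline.py -v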
--- eegdash-0.3.7.dev105/tests/test_offline.py
+++ /dev/null
@@ -1,51 +0,0 @@
-from pathlib import Path
-
-import pytest
-
-from eegdash import EEGDash, EEGDashDataset
-
-CACHE_DIR = (Path.home() / "mne_data" / "openneuro").resolve()
-CACHE_DIR.mkdir(parents=True, exist_ok=True)
-
-
-def test_dataset_loads_without_eegdash(monkeypatch):
-    """Dataset should load from records without contacting network resources."""
-    eeg_dash = EEGDash()
-
-    records = eeg_dash.find(
-        dataset="ds005509", subject="NDARAC350XUM", task="RestingState"
-    )
-
-    # test with internet
-    dataset_internet = EEGDashDataset(
-        query=dict(task="RestingState", subject="NDARAC350XUM", dataset="ds005509"),
-        cache_dir=CACHE_DIR,
-        eeg_dash_instance=eeg_dash,
-    )
-
-    # Monkeypatch any network calls inside EEGDashDataset to raise if called
-    monkeypatch.setattr(
-        EEGDashDataset,
-        "_find_datasets",
-        lambda *args, **kwargs: pytest.skip(
-            "Skipping network download in offline test"
-        ),
-    )
-    monkeypatch.setattr(
-        EEGDashDataset,
-        "_find_datasets",
-        lambda *args, **kwargs: pytest.skip(
-            "Skipping network download in offline test"
-        ),
-    )
-    # TO-DO: discover way to do this pytest
-
-    dataset_without_internet = EEGDashDataset(
-        dataset="ds005509", records=records, cache_dir=CACHE_DIR, eeg_dash_instance=None
-    )
-
-    assert dataset_internet.datasets[0].raw == dataset_without_internet.datasets[0].raw
-    assert (
-        dataset_internet.datasets[0].record
-        == dataset_without_internet.datasets[0].record
-    )
--- eegdash-0.3.7.dev105/tests/test_offline_bids_matching.py
+++ /dev/null
@@ -1,119 +0,0 @@
-from pathlib import Path
-
-import pytest
-
-from eegdash.api import EEGDashDataset
-
-
-def _touch(p: Path):
-    p.parent.mkdir(parents=True, exist_ok=True)
-    p.touch()
-
-
-def make_minimal_bids(tmp_path: Path, dataset_id: str, folder_name: str | None = None):
-    """Create minimal BIDS-like structure under tmp_path/folder_name or dataset_id.
-
-    The filenames will always embed the dataset_id in bidspath semantics; the folder
-    name can include suffixes to simulate cache suffixing (e.g., ds-xxx-bdf-mini).
-    """
-    root = tmp_path / (folder_name or dataset_id)
-    # Create a few EEG files with different entities
-    _touch(
-        root / "sub-01" / "ses-01" / "eeg" / "sub-01_ses-01_task-rest_run-01_eeg.edf"
-    )
-    _touch(
-        root / "sub-02" / "ses-01" / "eeg" / "sub-02_ses-01_task-rest_run-01_eeg.edf"
-    )
-    _touch(root / "sub-02" / "ses-02" / "eeg" / "sub-02_ses-02_task-eo_run-01_eeg.bdf")
-    return root
-
-
-def test_offline_match_all(tmp_path: Path):
-    dataset_id = "ds-local"
-    make_minimal_bids(tmp_path, dataset_id)
-    ds = EEGDashDataset(cache_dir=tmp_path, dataset=dataset_id, download=False)
-    assert len(ds.datasets) == 3
-
-
-def test_offline_filter_subject(tmp_path: Path):
-    dataset_id = "ds-local"
-    make_minimal_bids(tmp_path, dataset_id)
-    ds = EEGDashDataset(
-        cache_dir=tmp_path, dataset=dataset_id, subject="01", download=False
-    )
-    assert len(ds.datasets) == 1
-    rec = ds.datasets[0].record
-    assert rec["subject"] == "01"
-    assert rec["task"] == "rest"
-
-
-def test_offline_filter_lists(tmp_path: Path):
-    dataset_id = "ds-local"
-    make_minimal_bids(tmp_path, dataset_id)
-    ds = EEGDashDataset(
-        cache_dir=tmp_path,
-        dataset=dataset_id,
-        subject=["01", "02"],
-        task=["rest"],
-        download=False,
-    )
-    # two rest recordings across subjects
-    assert len(ds.datasets) == 2
-    tasks = sorted([d.record["task"] for d in ds.datasets])
-    assert tasks == ["rest", "rest"]
-
-
-def test_offline_filter_session(tmp_path: Path):
-    dataset_id = "ds-local"
-    make_minimal_bids(tmp_path, dataset_id)
-    ds = EEGDashDataset(
-        cache_dir=tmp_path, dataset=dataset_id, session="02", download=False
-    )
-    assert len(ds.datasets) == 1
-    rec = ds.datasets[0].record
-    assert rec["session"] == "02"
-    assert rec["task"] == "eo"
-
-
-def test_offline_bidspath_and_suffix_rewrite(tmp_path: Path, monkeypatch):
-    """Bidspath should start with dataset id (no suffix) while files are stored
-    under suffixed cache root when s3_bucket indicates preprocessing.
-    Also ensure no S3 is touched in offline path.
-    """
-    dataset_id = "ds-local"
-    folder_name = f"{dataset_id}-bdf-mini"
-    make_minimal_bids(tmp_path, dataset_id, folder_name=folder_name)
-
-    # Make S3 usage explode if called; offline should not call it
-    import eegdash.api as api_mod
-
-    class Boom:
-        def __init__(self, *a, **k):
-            raise AssertionError(
-                "S3FileSystem should not be instantiated in offline mode"
-            )
-
-    monkeypatch.setattr(api_mod, "S3FileSystem", Boom)
-
-    ds = EEGDashDataset(
-        cache_dir=tmp_path,
-        dataset=dataset_id,
-        download=False,
-        s3_bucket="s3://example/some_bdf_mini_bucket",
-        eeg_dash_instance=object(),  # prevent constructing real EEGDash (which touches S3FileSystem)
-    )
-    assert len(ds.datasets) == 3
-    base = ds.datasets[0]
-
-    # Records should keep bidspath starting with dataset id (no suffix)
-    assert base.record["bidspath"].split("/")[0] == dataset_id
-
-    # Local writes should target suffixed folder
-    assert base.bids_root == tmp_path / folder_name
-    assert str(base.filecache).startswith(str((tmp_path / folder_name).resolve()))
-
-
-def test_offline_missing_dir_raises(tmp_path: Path):
-    dataset_id = "ds-does-not-exist"
-    with pytest.raises(ValueError, match="Offline mode is enabled, but local data_dir"):
-        EEGDashDataset(cache_dir=tmp_path, dataset=dataset_id, download=False)