PyPI - eegdash - Versions diffs - 0.3.7.dev105__py3-none-any.whl → 0.3.7.dev107__py3-none-any.whl - Mend

eegdash 0.3.7.dev105__py3-none-any.whl → 0.3.7.dev107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of eegdash might be problematic. Click here for more details.

Files changed (9) hide show

eegdash/__init__.py CHANGED Viewed

@@ -8,4 +8,4 @@ _init_mongo_client()
 __all__ = ["EEGDash", "EEGDashDataset", "EEGChallengeDataset", "preprocessing"]
-__version__ = "0.3.7.dev105"
+__version__ = "0.3.7.dev107"

eegdash/api.py CHANGED Viewed

@@ -18,13 +18,21 @@ from s3fs import S3FileSystem
 from braindecode.datasets import BaseConcatDataset
-from .bids_eeg_metadata import build_query_from_kwargs, load_eeg_attrs_from_bids_file
+from .bids_eeg_metadata import (
+    build_query_from_kwargs,
+    load_eeg_attrs_from_bids_file,
+    merge_participants_fields,
+    normalize_key,
+)
 from .const import (
     ALLOWED_QUERY_FIELDS,
     RELEASE_TO_OPENNEURO_DATASET_MAP,
 )
 from .const import config as data_config
-from .data_utils import EEGBIDSDataset, EEGDashBaseDataset
+from .data_utils import (
+    EEGBIDSDataset,
+    EEGDashBaseDataset,
+)
 from .mongodb import MongoConnectionManager
 logger = logging.getLogger("eegdash")
@@ -784,20 +792,49 @@ class EEGDashDataset(BaseConcatDataset):
                     f"Offline mode is enabled, but local data_dir {self.data_dir} does not exist."
                 )
             records = self._find_local_bids_records(self.data_dir, self.query)
-            datasets = [
-                EEGDashBaseDataset(
-                    record=record,
-                    cache_dir=self.cache_dir,
-                    s3_bucket=self.s3_bucket,
-                    description={
-                        k: record.get(k)
-                        for k in description_fields
-                        if record.get(k) is not None
-                    },
-                    **base_dataset_kwargs,
+            # Try to enrich from local participants.tsv to restore requested fields
+            try:
+                bids_ds = EEGBIDSDataset(
+                    data_dir=str(self.data_dir), dataset=self.query["dataset"]
+                )  # type: ignore[index]
+            except Exception:
+                bids_ds = None
+            datasets = []
+            for record in records:
+                # Start with entity values from filename
+                desc: dict[str, Any] = {
+                    k: record.get(k)
+                    for k in ("subject", "session", "run", "task")
+                    if record.get(k) is not None
+                }
+                if bids_ds is not None:
+                    try:
+                        rel_from_dataset = Path(record["bidspath"]).relative_to(
+                            record["dataset"]
+                        )  # type: ignore[index]
+                        local_file = (self.data_dir / rel_from_dataset).as_posix()
+                        part_row = bids_ds.subject_participant_tsv(local_file)
+                        desc = merge_participants_fields(
+                            description=desc,
+                            participants_row=part_row
+                            if isinstance(part_row, dict)
+                            else None,
+                            description_fields=description_fields,
+                        )
+                    except Exception:
+                        pass
+                datasets.append(
+                    EEGDashBaseDataset(
+                        record=record,
+                        cache_dir=self.cache_dir,
+                        s3_bucket=self.s3_bucket,
+                        description=desc,
+                        **base_dataset_kwargs,
+                    )
                 )
-                for record in records
-            ]
         elif self.query:
             # This is the DB query path that we are improving
             datasets = self._find_datasets(
@@ -882,23 +919,16 @@ class EEGDashDataset(BaseConcatDataset):
             else:
                 matching_args[finder_key] = [entity_val]
-        paths = find_matching_paths(
+        matched_paths = find_matching_paths(
             root=str(dataset_root),
             datatypes=["eeg"],
             suffixes=["eeg"],
             ignore_json=True,
             **matching_args,
         )
+        records_out: list[dict] = []
-        records: list[dict] = []
-        seen_files: set[str] = set()
-        for bids_path in paths:
-            fpath = str(Path(bids_path.fpath).resolve())
-            if fpath in seen_files:
-                continue
-            seen_files.add(fpath)
+        for bids_path in matched_paths:
             # Build bidspath as dataset_id / relative_path_from_dataset_root (POSIX)
             rel_from_root = (
                 Path(bids_path.fpath)
@@ -915,29 +945,37 @@ class EEGDashDataset(BaseConcatDataset):
                 "session": (bids_path.session or None),
                 "task": (bids_path.task or None),
                 "run": (bids_path.run or None),
-                # minimal fields to satisfy BaseDataset
+                # minimal fields to satisfy BaseDataset from eegdash
                 "bidsdependencies": [],  # not needed to just run.
                 "modality": "eeg",
-                # this information is from eegdash schema but not available locally
-                "sampling_frequency": 1.0,
-                "nchans": 1,
-                "ntimes": 1,
+                # minimal numeric defaults for offline length calculation
+                "sampling_frequency": None,
+                "nchans": None,
+                "ntimes": None,
             }
-            records.append(rec)
+            records_out.append(rec)
-        return records
+        return records_out
     def _find_key_in_nested_dict(self, data: Any, target_key: str) -> Any:
-        """Helper to recursively search for a key in a nested dictionary structure; returns
-        the value associated with the first occurrence of the key, or None if not found.
+        """Recursively search for target_key in nested dicts/lists with normalized matching.
+        This makes lookups tolerant to naming differences like "p-factor" vs "p_factor".
+        Returns the first match or None.
         """
+        norm_target = normalize_key(target_key)
         if isinstance(data, dict):
-            if target_key in data:
-                return data[target_key]
-            for value in data.values():
-                result = self._find_key_in_nested_dict(value, target_key)
-                if result is not None:
-                    return result
+            for k, v in data.items():
+                if normalize_key(k) == norm_target:
+                    return v
+                res = self._find_key_in_nested_dict(v, target_key)
+                if res is not None:
+                    return res
+        elif isinstance(data, list):
+            for item in data:
+                res = self._find_key_in_nested_dict(item, target_key)
+                if res is not None:
+                    return res
         return None
     def _find_datasets(
@@ -969,11 +1007,20 @@ class EEGDashDataset(BaseConcatDataset):
         self.records = self.eeg_dash_instance.find(query)
         for record in self.records:
-            description = {}
+            description: dict[str, Any] = {}
+            # Requested fields first (normalized matching)
             for field in description_fields:
                 value = self._find_key_in_nested_dict(record, field)
                 if value is not None:
                     description[field] = value
+            # Merge all participants.tsv columns generically
+            part = self._find_key_in_nested_dict(record, "participant_tsv")
+            if isinstance(part, dict):
+                description = merge_participants_fields(
+                    description=description,
+                    participants_row=part,
+                    description_fields=description_fields,
+                )
             datasets.append(
                 EEGDashBaseDataset(
                     record,

eegdash/bids_eeg_metadata.py CHANGED Viewed

@@ -1,16 +1,18 @@
 import logging
+import re
 from pathlib import Path
 from typing import Any
 from .const import ALLOWED_QUERY_FIELDS
 from .const import config as data_config
-from .data_utils import EEGBIDSDataset
 logger = logging.getLogger("eegdash")
 __all__ = [
     "build_query_from_kwargs",
     "load_eeg_attrs_from_bids_file",
+    "merge_participants_fields",
+    "normalize_key",
 ]
@@ -70,7 +72,7 @@ def build_query_from_kwargs(**kwargs) -> dict[str, Any]:
     return query
-def _get_raw_extensions(bids_file: str, bids_dataset: EEGBIDSDataset) -> list[str]:
+def _get_raw_extensions(bids_file: str, bids_dataset) -> list[str]:
     """Helper to find paths to additional "sidecar" files that may be associated
     with a given main data file in a BIDS dataset; paths are returned as relative to
     the parent dataset path.
@@ -92,9 +94,7 @@ def _get_raw_extensions(bids_file: str, bids_dataset: EEGBIDSDataset) -> list[st
     ]
-def load_eeg_attrs_from_bids_file(
-    bids_dataset: EEGBIDSDataset, bids_file: str
-) -> dict[str, Any]:
+def load_eeg_attrs_from_bids_file(bids_dataset, bids_file: str) -> dict[str, Any]:
     """Build the metadata record for a given BIDS file (single recording) in a BIDS dataset.
     Attributes are at least the ones defined in data_config attributes (set to None if missing),
@@ -182,3 +182,73 @@ def load_eeg_attrs_from_bids_file(
             attrs[field] = None
     return attrs
+def normalize_key(key: str) -> str:
+    """Normalize a metadata key for robust matching.
+    Lowercase and replace non-alphanumeric characters with underscores, then strip
+    leading/trailing underscores. This allows tolerant matching such as
+    "p-factor" ≈ "p_factor" ≈ "P Factor".
+    """
+    return re.sub(r"[^a-z0-9]+", "_", str(key).lower()).strip("_")
+def merge_participants_fields(
+    description: dict[str, Any],
+    participants_row: dict[str, Any] | None,
+    description_fields: list[str] | None = None,
+) -> dict[str, Any]:
+    """Merge participants.tsv fields into a dataset description dictionary.
+    - Preserves existing entries in ``description`` (no overwrites).
+    - Fills requested ``description_fields`` first, preserving their original names.
+    - Adds all remaining participants columns generically using normalized keys
+      unless a matching requested field already captured them.
+    Parameters
+    ----------
+    description : dict
+        Current description to be enriched in-place and returned.
+    participants_row : dict | None
+        A mapping of participants.tsv columns for the current subject.
+    description_fields : list[str] | None
+        Optional list of requested description fields. When provided, matching is
+        performed by normalized names; the original requested field names are kept.
+    Returns
+    -------
+    dict
+        The enriched description (same object as input for convenience).
+    """
+    if not isinstance(description, dict) or not isinstance(participants_row, dict):
+        return description
+    # Normalize participants keys and keep first non-None value per normalized key
+    norm_map: dict[str, Any] = {}
+    for part_key, part_value in participants_row.items():
+        norm_key = normalize_key(part_key)
+        if norm_key not in norm_map and part_value is not None:
+            norm_map[norm_key] = part_value
+    # Ensure description_fields is a list for matching
+    requested = list(description_fields or [])
+    # 1) Fill requested fields first using normalized matching, preserving names
+    for key in requested:
+        if key in description:
+            continue
+        requested_norm_key = normalize_key(key)
+        if requested_norm_key in norm_map:
+            description[key] = norm_map[requested_norm_key]
+    # 2) Add remaining participants columns generically under normalized names,
+    #    unless a requested field already captured them
+    requested_norm = {normalize_key(k) for k in requested}
+    for norm_key, part_value in norm_map.items():
+        if norm_key in requested_norm:
+            continue
+        if norm_key not in description:
+            description[norm_key] = part_value
+    return description

eegdash/data_utils.py CHANGED Viewed

@@ -1,9 +1,11 @@
+import io
 import json
 import logging
 import os
 import re
 import traceback
 import warnings
+from contextlib import redirect_stderr
 from pathlib import Path
 from typing import Any
@@ -91,19 +93,8 @@ class EEGDashBaseDataset(BaseDataset):
             root=self.bids_root,
             datatype="eeg",
             suffix="eeg",
-            # extension='.bdf',
             **self.bids_kwargs,
         )
-        # TO-DO: remove this once find a better solution using mne-bids or update competition dataset
-        try:
-            _ = str(self.bidspath)
-        except RuntimeError:
-            try:
-                self.bidspath = self.bidspath.update(extension=".bdf")
-                self.filecache = self.filecache.with_suffix(".bdf")
-            except Exception as e:
-                logger.error(f"Error while updating BIDS path: {e}")
-                raise e
         self.s3file = self._get_s3path(record["bidspath"])
         self.bids_dependencies = record["bidsdependencies"]
@@ -182,8 +173,11 @@ class EEGDashBaseDataset(BaseDataset):
                 dep_local = Path(self.dataset_folder) / dep_path
             filepath = self.cache_dir / dep_local
             if not self.s3_open_neuro:
+                if filepath.suffix == ".set":
+                    filepath = filepath.with_suffix(".bdf")
                 if self.filecache.suffix == ".set":
                     self.filecache = self.filecache.with_suffix(".bdf")
             # here, we download the dependency and it is fine
             # in the case of the competition.
             if not filepath.exists():
@@ -218,6 +212,12 @@ class EEGDashBaseDataset(BaseDataset):
     def _ensure_raw(self) -> None:
         """Download the S3 file and BIDS dependencies if not already cached."""
+        # TO-DO: remove this once is fixed on the our side
+        # for the competition
+        if not self.s3_open_neuro:
+            self.bidspath = self.bidspath.update(extension=".bdf")
+            self.filecache = self.filecache.with_suffix(".bdf")
         if not os.path.exists(self.filecache):  # not preload
             if self.bids_dependencies:
                 self._download_dependencies()
@@ -226,13 +226,50 @@ class EEGDashBaseDataset(BaseDataset):
             # capturing any warnings
             # to-do: remove this once is fixed on the mne-bids side.
             with warnings.catch_warnings(record=True) as w:
+                # Ensure all warnings are captured into 'w' and not shown to users
+                warnings.simplefilter("always")
                 try:
-                    # TO-DO: remove this once is fixed on the our side
-                    if not self.s3_open_neuro:
-                        self.bidspath = self.bidspath.update(extension=".bdf")
-                    self._raw = mne_bids.read_raw_bids(
-                        bids_path=self.bidspath, verbose="ERROR"
-                    )
+                    # mne-bids emits RuntimeWarnings to stderr; silence stderr during read
+                    _stderr_buffer = io.StringIO()
+                    with redirect_stderr(_stderr_buffer):
+                        self._raw = mne_bids.read_raw_bids(
+                            bids_path=self.bidspath, verbose="ERROR"
+                        )
+                    # Parse unmapped participants.tsv fields reported by mne-bids and
+                    # inject them into Raw.info and the dataset description generically.
+                    extras = self._extract_unmapped_participants_from_warnings(w)
+                    if extras:
+                        # 1) Attach to Raw.info under subject_info.participants_extras
+                        try:
+                            subject_info = self._raw.info.get("subject_info") or {}
+                            if not isinstance(subject_info, dict):
+                                subject_info = {}
+                            pe = subject_info.get("participants_extras") or {}
+                            if not isinstance(pe, dict):
+                                pe = {}
+                            # Merge without overwriting
+                            for k, v in extras.items():
+                                pe.setdefault(k, v)
+                            subject_info["participants_extras"] = pe
+                            self._raw.info["subject_info"] = subject_info
+                        except Exception:
+                            # Non-fatal; continue
+                            pass
+                        # 2) Also add to this dataset's description, if possible, so
+                        #    targets can be selected later without naming specifics.
+                        try:
+                            import pandas as _pd  # local import to avoid top-level cost
+                            if isinstance(self.description, dict):
+                                for k, v in extras.items():
+                                    self.description.setdefault(k, v)
+                            elif isinstance(self.description, _pd.Series):
+                                for k, v in extras.items():
+                                    if k not in self.description.index:
+                                        self.description.loc[k] = v
+                        except Exception:
+                            pass
                 except Exception as e:
                     logger.error(
                         f"Error while reading BIDS file: {self.bidspath}\n"
@@ -242,10 +279,60 @@ class EEGDashBaseDataset(BaseDataset):
                     logger.error(f"Exception: {e}")
                     logger.error(traceback.format_exc())
                     raise e
-                for warning in w:
-                    logger.warning(
-                        f"Warning while reading BIDS file: {warning.message}"
-                    )
+                # Filter noisy mapping notices from mne-bids; surface others
+                for captured_warning in w:
+                    try:
+                        msg = str(captured_warning.message)
+                    except Exception:
+                        continue
+                    # Suppress verbose participants mapping messages
+                    if "Unable to map the following column" in msg and "MNE" in msg:
+                        logger.debug(
+                            "Suppressed mne-bids mapping warning while reading BIDS file: %s",
+                            msg,
+                        )
+                        continue
+                    logger.warning("Warning while reading BIDS file: %s", msg)
+    def _extract_unmapped_participants_from_warnings(
+        self, warnings_list: list[Any]
+    ) -> dict[str, Any]:
+        """Scan captured warnings from mne-bids and extract unmapped participants.tsv
+        entries in a generic way.
+        Optionally, the column name can carry a note in parentheses that we ignore
+        for key/value extraction. Returns a mapping of column name -> raw value.
+        """
+        extras: dict[str, Any] = {}
+        header = "Unable to map the following column(s) to MNE:"
+        for wr in warnings_list:
+            try:
+                msg = str(wr.message)
+            except Exception:
+                continue
+            if header not in msg:
+                continue
+            lines = msg.splitlines()
+            # Find the header line, then parse subsequent lines as entries
+            try:
+                idx = next(i for i, ln in enumerate(lines) if header in ln)
+            except StopIteration:
+                idx = -1
+            for line in lines[idx + 1 :]:
+                line = line.strip()
+                if not line:
+                    continue
+                # Pattern:  <col>(optional note): <value>
+                # Examples: "gender: F", "Ethnicity: Indian", "foo (ignored): bar"
+                m = re.match(r"^([^:]+?)(?:\s*\([^)]*\))?\s*:\s*(.*)$", line)
+                if not m:
+                    continue
+                col = m.group(1).strip()
+                val = m.group(2).strip()
+                # Keep original column names as provided to stay agnostic
+                if col and col not in extras:
+                    extras[col] = val
+        return extras
     # === BaseDataset and PyTorch Dataset interface ===
@@ -264,11 +351,16 @@ class EEGDashBaseDataset(BaseDataset):
     def __len__(self) -> int:
         """Return the number of samples in the dataset."""
         if self._raw is None:
-            # FIXME: this is a bit strange and should definitely not change as a side effect
-            #  of accessing the data (which it will, since ntimes is the actual length but rounded down)
-            return int(self.record["ntimes"] * self.record["sampling_frequency"])
-        else:
-            return len(self._raw)
+            if (
+                self.record["ntimes"] is None
+                or self.record["sampling_frequency"] is None
+            ):
+                self._ensure_raw()
+            else:
+                # FIXME: this is a bit strange and should definitely not change as a side effect
+                #  of accessing the data (which it will, since ntimes is the actual length but rounded down)
+                return int(self.record["ntimes"] * self.record["sampling_frequency"])
+        return len(self._raw)
     @property
     def raw(self):

{eegdash-0.3.7.dev105.dist-info → eegdash-0.3.7.dev107.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: eegdash
-Version: 0.3.7.dev105
+Version: 0.3.7.dev107
 Summary: EEG data for machine learning
 Author-email: Young Truong <dt.young112@gmail.com>, Arnaud Delorme <adelorme@gmail.com>, Aviv Dotan <avivd220@gmail.com>, Oren Shriki <oren70@gmail.com>, Bruno Aristimunha <b.aristimunha@gmail.com>
 License-Expression: GPL-3.0-only

{eegdash-0.3.7.dev105.dist-info → eegdash-0.3.7.dev107.dist-info}/RECORD RENAMED Viewed

@@ -1,8 +1,8 @@
-eegdash/__init__.py,sha256=yplAyK95jkzGdCrV7kNsoqxXC0jxOJzkD87WI29VA5A,285
-eegdash/api.py,sha256=s6dG52Or7YlEsJFR_BJQ8Az2-BwyuUScFoy8UZLgsdY,38097
-eegdash/bids_eeg_metadata.py,sha256=iAHJKxeleDWzeei4sNqc-9NlZJk4QY2BYk6D1t2mkOI,6984
+eegdash/__init__.py,sha256=UHfMZV_pHqDbXX4q_0FLvnQkrQG5kbXG_nS3iekTts8,285
+eegdash/api.py,sha256=YyJrs2vTiJ67UPV5aK_xbr5wCa1onX9Jv4o6S3oVDsE,40091
+eegdash/bids_eeg_metadata.py,sha256=LZrGPGVdnGUbZlD4M_aAW4kEItzwTTeZFicH-jyqDyc,9712
 eegdash/const.py,sha256=qdFBEL9kIrsj9CdxbXhBkR61R3CrTGSaj5Iq0YOACIs,7313
-eegdash/data_utils.py,sha256=sPglGH1w3USBuDn6uNpsfPxji8NVy2QmVg789uMhe_E,29739
+eegdash/data_utils.py,sha256=U8QbqpDvZeHsC2shk1_A2gUBEm8GrzVT8vHN_VZpY_g,34257
 eegdash/mongodb.py,sha256=GD3WgA253oFgpzOHrYaj4P1mRjNtDMT5Oj4kVvHswjI,2006
 eegdash/utils.py,sha256=7TfQ9D0LrAJ7FgnSXEvWgeHWK2QqaqS-_WcWXD86ObQ,408
 eegdash/dataset/__init__.py,sha256=Qmzki5G8GaFlzTb10e4SmC3WkKuJyo1Ckii15tCEHAo,157
@@ -23,8 +23,8 @@ eegdash/features/feature_bank/dimensionality.py,sha256=j_Ds71Y1AbV2uLFQj8EuXQ4kz
 eegdash/features/feature_bank/signal.py,sha256=3Tb8z9gX7iZipxQJ9DSyy30JfdmW58kgvimSyZX74p8,3404
 eegdash/features/feature_bank/spectral.py,sha256=bNB7skusePs1gX7NOU6yRlw_Gr4UOCkO_ylkCgybzug,3319
 eegdash/features/feature_bank/utils.py,sha256=DGh-Q7-XFIittP7iBBxvsJaZrlVvuY5mw-G7q6C-PCI,1237
-eegdash-0.3.7.dev105.dist-info/licenses/LICENSE,sha256=asisR-xupy_NrQBFXnx6yqXeZcYWLvbAaiETl25iXT0,931
-eegdash-0.3.7.dev105.dist-info/METADATA,sha256=Eyu7l05oYbcZVSJGfZ2H3sC9a4Ekjc3oc4Z9fuZrk48,10053
-eegdash-0.3.7.dev105.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-eegdash-0.3.7.dev105.dist-info/top_level.txt,sha256=zavO69HQ6MyZM0aQMR2zUS6TAFc7bnN5GEpDpOpFZzU,8
-eegdash-0.3.7.dev105.dist-info/RECORD,,
+eegdash-0.3.7.dev107.dist-info/licenses/LICENSE,sha256=asisR-xupy_NrQBFXnx6yqXeZcYWLvbAaiETl25iXT0,931
+eegdash-0.3.7.dev107.dist-info/METADATA,sha256=KjKTOdpKaADi0ZuBTScvYRRV4V0YoIQPd9ZpnALfmZ0,10053
+eegdash-0.3.7.dev107.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+eegdash-0.3.7.dev107.dist-info/top_level.txt,sha256=zavO69HQ6MyZM0aQMR2zUS6TAFc7bnN5GEpDpOpFZzU,8
+eegdash-0.3.7.dev107.dist-info/RECORD,,

{eegdash-0.3.7.dev105.dist-info → eegdash-0.3.7.dev107.dist-info}/WHEEL RENAMED Viewed

File without changes

{eegdash-0.3.7.dev105.dist-info → eegdash-0.3.7.dev107.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{eegdash-0.3.7.dev105.dist-info → eegdash-0.3.7.dev107.dist-info}/top_level.txt RENAMED Viewed

File without changes

eegdash 0.3.7.dev105__py3-none-any.whl → 0.3.7.dev107__py3-none-any.whl

Potentially problematic release.

eegdash 0.3.7.dev105__py3-none-any.whl → 0.3.7.dev107__py3-none-any.whl