eegdash 0.3.6.dev182011805__py3-none-any.whl → 0.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -0,0 +1,254 @@
+ import logging
+ import re
+ from pathlib import Path
+ from typing import Any
+
+ from .const import ALLOWED_QUERY_FIELDS
+ from .const import config as data_config
+
+ logger = logging.getLogger("eegdash")
+
+ __all__ = [
+     "build_query_from_kwargs",
+     "load_eeg_attrs_from_bids_file",
+     "merge_participants_fields",
+     "normalize_key",
+ ]
+
+
+ def build_query_from_kwargs(**kwargs) -> dict[str, Any]:
+     """Build and validate a MongoDB query from user-friendly keyword arguments.
+
+     Improvements:
+     - Reject None values and empty/whitespace-only strings
+     - For list/tuple/set values: strip strings, drop None/empties, deduplicate, and use `$in`
+     - Preserve scalars as exact matches
+     """
+     # 1. Validate that all provided keys are allowed for querying
+     unknown_fields = set(kwargs.keys()) - ALLOWED_QUERY_FIELDS
+     if unknown_fields:
+         raise ValueError(
+             f"Unsupported query field(s): {', '.join(sorted(unknown_fields))}. "
+             f"Allowed fields are: {', '.join(sorted(ALLOWED_QUERY_FIELDS))}"
+         )
+
+     # 2. Construct the query dictionary
+     query = {}
+     for key, value in kwargs.items():
+         # None is not a valid constraint
+         if value is None:
+             raise ValueError(
+                 f"Received None for query parameter '{key}'. Provide a concrete value."
+             )
+
+         # Handle list-like values as multi-constraints
+         if isinstance(value, (list, tuple, set)):
+             cleaned: list[Any] = []
+             for item in value:
+                 if item is None:
+                     continue
+                 if isinstance(item, str):
+                     item = item.strip()
+                     if not item:
+                         continue
+                 cleaned.append(item)
+             # Deduplicate while preserving order
+             cleaned = list(dict.fromkeys(cleaned))
+             if not cleaned:
+                 raise ValueError(
+                     f"Received an empty list for query parameter '{key}'. This is not supported."
+                 )
+             query[key] = {"$in": cleaned}
+         else:
+             # Scalars: trim strings and validate
+             if isinstance(value, str):
+                 value = value.strip()
+                 if not value:
+                     raise ValueError(
+                         f"Received an empty string for query parameter '{key}'."
+                     )
+             query[key] = value
+
+     return query
+
+
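A quick illustration of the validation rules above (the import path is an assumption; the diff does not name the new module):

    from eegdash.utils import build_query_from_kwargs  # hypothetical import path

    # List values are cleaned (strings stripped, None/empties dropped),
    # deduplicated, and expressed as a MongoDB $in constraint:
    build_query_from_kwargs(dataset="ds005516", subject=["01", " 02 ", "01", None])
    # -> {'dataset': 'ds005516', 'subject': {'$in': ['01', '02']}}

    # Unknown fields fail fast:
    build_query_from_kwargs(foo="bar")
    # ValueError: Unsupported query field(s): foo. Allowed fields are: ...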
+ def _get_raw_extensions(bids_file: str, bids_dataset) -> list[str]:
+     """Helper to find paths to additional "sidecar" files that may be associated
+     with a given main data file in a BIDS dataset; paths are returned as relative to
+     the parent dataset path.
+
+     For example, if the input file is a .set file, this will return the relative path
+     to a corresponding .fdt file (if any).
+     """
+     bids_file = Path(bids_file)
+     extensions = {
+         ".set": [".set", ".fdt"],  # eeglab
+         ".edf": [".edf"],  # european
+         ".vhdr": [".eeg", ".vhdr", ".vmrk", ".dat", ".raw"],  # brainvision
+         ".bdf": [".bdf"],  # biosemi
+     }
+     return [
+         str(bids_dataset._get_relative_bidspath(bids_file.with_suffix(suffix)))
+         for suffix in extensions[bids_file.suffix]
+         if bids_file.with_suffix(suffix).exists()
+     ]
+
+
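To see the helper's behavior without a real dataset object, one can stub the single method it calls; everything below (the stub class, the temp layout, the file names) is illustrative only:

    import tempfile
    from pathlib import Path

    class _StubDataset:
        """Stand-in exposing only the _get_relative_bidspath method used above."""

        def __init__(self, root: Path):
            self.root = root

        def _get_relative_bidspath(self, path: Path) -> Path:
            return path.relative_to(self.root)

    root = Path(tempfile.mkdtemp())
    eeg_dir = root / "sub-01" / "eeg"
    eeg_dir.mkdir(parents=True)
    (eeg_dir / "sub-01_task-rest_eeg.set").touch()
    (eeg_dir / "sub-01_task-rest_eeg.fdt").touch()

    _get_raw_extensions(str(eeg_dir / "sub-01_task-rest_eeg.set"), _StubDataset(root))
    # -> ['sub-01/eeg/sub-01_task-rest_eeg.set', 'sub-01/eeg/sub-01_task-rest_eeg.fdt']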
+ def load_eeg_attrs_from_bids_file(bids_dataset, bids_file: str) -> dict[str, Any]:
+     """Build the metadata record for a given BIDS file (single recording) in a BIDS dataset.
+
+     Attributes are at least the ones defined in data_config attributes (set to None if missing),
+     but are typically a superset, and include, among others, the paths to relevant
+     meta-data files needed to load and interpret the file in question.
+
+     Parameters
+     ----------
+     bids_dataset : EEGBIDSDataset
+         The BIDS dataset object containing the file.
+     bids_file : str
+         The path to the BIDS file within the dataset.
+
+     Returns
+     -------
+     dict:
+         A dictionary representing the metadata record for the given file. This is the
+         same format as the records stored in the database.
+
+     """
+     if bids_file not in bids_dataset.files:
+         raise ValueError(f"{bids_file} not in {bids_dataset.dataset}")
+
+     # Initialize attrs with None values for all expected fields
+     attrs = {field: None for field in data_config["attributes"].keys()}
+
+     file = Path(bids_file).name
+     dsnumber = bids_dataset.dataset
+     # extract openneuro path by finding the first occurrence of the dataset name in the filename and remove the path before that
+     openneuro_path = dsnumber + bids_file.split(dsnumber)[1]
+
+     # Update with actual values where available
+     try:
+         participants_tsv = bids_dataset.subject_participant_tsv(bids_file)
+     except Exception as e:
+         logger.error("Error getting participants_tsv: %s", str(e))
+         participants_tsv = None
+
+     try:
+         eeg_json = bids_dataset.eeg_json(bids_file)
+     except Exception as e:
+         logger.error("Error getting eeg_json: %s", str(e))
+         eeg_json = None
+
+     bids_dependencies_files = data_config["bids_dependencies_files"]
+     bidsdependencies = []
+     for extension in bids_dependencies_files:
+         try:
+             dep_path = bids_dataset.get_bids_metadata_files(bids_file, extension)
+             dep_path = [
+                 str(bids_dataset.get_relative_bidspath(dep)) for dep in dep_path
+             ]
+             bidsdependencies.extend(dep_path)
+         except Exception:
+             pass
+
+     bidsdependencies.extend(_get_raw_extensions(bids_file, bids_dataset))
+
+     # Define field extraction functions with error handling
+     field_extractors = {
+         "data_name": lambda: f"{bids_dataset.dataset}_{file}",
+         "dataset": lambda: bids_dataset.dataset,
+         "bidspath": lambda: openneuro_path,
+         "subject": lambda: bids_dataset.get_bids_file_attribute("subject", bids_file),
+         "task": lambda: bids_dataset.get_bids_file_attribute("task", bids_file),
+         "session": lambda: bids_dataset.get_bids_file_attribute("session", bids_file),
+         "run": lambda: bids_dataset.get_bids_file_attribute("run", bids_file),
+         "modality": lambda: bids_dataset.get_bids_file_attribute("modality", bids_file),
+         "sampling_frequency": lambda: bids_dataset.get_bids_file_attribute(
+             "sfreq", bids_file
+         ),
+         "nchans": lambda: bids_dataset.get_bids_file_attribute("nchans", bids_file),
+         "ntimes": lambda: bids_dataset.get_bids_file_attribute("ntimes", bids_file),
+         "participant_tsv": lambda: participants_tsv,
+         "eeg_json": lambda: eeg_json,
+         "bidsdependencies": lambda: bidsdependencies,
+     }
+
+     # Dynamically populate attrs with error handling
+     for field, extractor in field_extractors.items():
+         try:
+             attrs[field] = extractor()
+         except Exception as e:
+             logger.error("Error extracting %s : %s", field, str(e))
+             attrs[field] = None
+
+     return attrs
+
+
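The `field_extractors` table implements a defensive-extraction pattern: each attribute is computed by its own callable, so a failure in one extractor degrades only that field to None instead of aborting the whole record. A self-contained sketch of the same idea:

    import logging

    logger = logging.getLogger("eegdash")

    record = {}
    extractors = {
        "nchans": lambda: int("64"),    # succeeds
        "task": lambda: {}["missing"],  # deliberately raises KeyError
    }
    for field, extract in extractors.items():
        try:
            record[field] = extract()
        except Exception as e:
            logger.error("Error extracting %s : %s", field, e)
            record[field] = None

    # record == {'nchans': 64, 'task': None}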
+ def normalize_key(key: str) -> str:
+     """Normalize a metadata key for robust matching.
+
+     Lowercase and replace non-alphanumeric characters with underscores, then strip
+     leading/trailing underscores. This allows tolerant matching such as
+     "p-factor" ≈ "p_factor" ≈ "P Factor".
+     """
+     return re.sub(r"[^a-z0-9]+", "_", str(key).lower()).strip("_")
+
+
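Doctest-style, the docstring's examples all collapse to the same key:

    assert normalize_key("p-factor") == "p_factor"
    assert normalize_key("P Factor") == "p_factor"
    assert normalize_key(" p.factor! ") == "p_factor"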
+ def merge_participants_fields(
+     description: dict[str, Any],
+     participants_row: dict[str, Any] | None,
+     description_fields: list[str] | None = None,
+ ) -> dict[str, Any]:
+     """Merge participants.tsv fields into a dataset description dictionary.
+
+     - Preserves existing entries in ``description`` (no overwrites).
+     - Fills requested ``description_fields`` first, preserving their original names.
+     - Adds all remaining participants columns generically using normalized keys
+       unless a matching requested field already captured them.
+
+     Parameters
+     ----------
+     description : dict
+         Current description to be enriched in-place and returned.
+     participants_row : dict | None
+         A mapping of participants.tsv columns for the current subject.
+     description_fields : list[str] | None
+         Optional list of requested description fields. When provided, matching is
+         performed by normalized names; the original requested field names are kept.
+
+     Returns
+     -------
+     dict
+         The enriched description (same object as input for convenience).
+
+     """
+     if not isinstance(description, dict) or not isinstance(participants_row, dict):
+         return description
+
+     # Normalize participants keys and keep first non-None value per normalized key
+     norm_map: dict[str, Any] = {}
+     for part_key, part_value in participants_row.items():
+         norm_key = normalize_key(part_key)
+         if norm_key not in norm_map and part_value is not None:
+             norm_map[norm_key] = part_value
+
+     # Ensure description_fields is a list for matching
+     requested = list(description_fields or [])
+
+     # 1) Fill requested fields first using normalized matching, preserving names
+     for key in requested:
+         if key in description:
+             continue
+         requested_norm_key = normalize_key(key)
+         if requested_norm_key in norm_map:
+             description[key] = norm_map[requested_norm_key]
+
+     # 2) Add remaining participants columns generically under normalized names,
+     #    unless a requested field already captured them
+     requested_norm = {normalize_key(k) for k in requested}
+     for norm_key, part_value in norm_map.items():
+         if norm_key in requested_norm:
+             continue
+         if norm_key not in description:
+             description[norm_key] = part_value
+     return description
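Taken together, the two passes behave as follows (a worked example with illustrative values):

    description = {"subject": "01"}
    row = {"Age": 23, "p-factor": 1.9, "subject": "sub-01"}
    merge_participants_fields(description, row, description_fields=["Age"])
    # -> {'subject': '01', 'Age': 23, 'p_factor': 1.9}
    # 'subject' is never overwritten, the requested 'Age' keeps its original
    # spelling, and 'p-factor' is added generically under its normalized name.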
@@ -1,7 +1,15 @@
- from pathlib import Path
-
- from .api import EEGDashDataset
- from .registry import register_openneuro_datasets
+ ALLOWED_QUERY_FIELDS = {
+     "data_name",
+     "dataset",
+     "subject",
+     "task",
+     "session",
+     "run",
+     "modality",
+     "sampling_frequency",
+     "nchans",
+     "ntimes",
+ }
 
  RELEASE_TO_OPENNEURO_DATASET_MAP = {
      "R11": "ds005516",
@@ -262,92 +270,37 @@ SUBJECT_MINI_RELEASE_MAP = {
      ],
  }
 
-
- class EEGChallengeDataset(EEGDashDataset):
-     def __init__(
-         self,
-         release: str,
-         cache_dir: str,
-         mini: bool = True,
-         query: dict | None = None,
-         s3_bucket: str | None = "s3://nmdatasets/NeurIPS25",
-         **kwargs,
-     ):
-         """Create a new EEGDashDataset from a given query or local BIDS dataset directory
-         and dataset name. An EEGDashDataset is pooled collection of EEGDashBaseDataset
-         instances (individual recordings) and is a subclass of braindecode's BaseConcatDataset.
-
-         Parameters
-         ----------
-         release: str
-             Release name. Can be one of ["R1", ..., "R11"]
-         mini: bool, default True
-             Whether to use the mini-release version of the dataset. It is recommended
-             to use the mini version for faster training and evaluation.
-         query : dict | None
-             Optionally a dictionary that specifies a query to be executed,
-             in addition to the dataset (automatically inferred from the release argument).
-             See EEGDash.find() for details on the query format.
-         cache_dir : str
-             A directory where the dataset will be cached locally.
-         s3_bucket : str | None
-             An optional S3 bucket URI to use instead of the
-             default OpenNeuro bucket for loading data files.
-         kwargs : dict
-             Additional keyword arguments to be passed to the EEGDashDataset
-             constructor.
-
-         """
-         self.release = release
-         self.mini = mini
-
-         if release not in RELEASE_TO_OPENNEURO_DATASET_MAP:
-             raise ValueError(
-                 f"Unknown release: {release}, expected one of {list(RELEASE_TO_OPENNEURO_DATASET_MAP.keys())}"
-             )
-
-         dataset_parameters = []
-         if isinstance(release, str):
-             dataset_parameters.append(RELEASE_TO_OPENNEURO_DATASET_MAP[release])
-         else:
-             raise ValueError(
-                 f"Unknown release type: {type(release)}, the expected type is str."
-             )
-
-         if query and "dataset" in query:
-             raise ValueError(
-                 "Query using the parameters `dataset` with the class EEGChallengeDataset is not possible."
-                 "Please use the release argument instead, or the object EEGDashDataset instead."
-             )
-
-         if self.mini:
-             # Disallow mixing subject selection with mini=True since mini already
-             # applies a predefined subject subset.
-             if (query and "subject" in query) or ("subject" in kwargs):
-                 raise ValueError(
-                     "Query using the parameters `subject` with the class EEGChallengeDataset and `mini==True` is not possible."
-                     "Please don't use the `subject` selection twice."
-                     "Set `mini=False` to use the `subject` selection."
-                 )
-             kwargs["subject"] = SUBJECT_MINI_RELEASE_MAP[release]
-             s3_bucket = f"{s3_bucket}/{release}_mini_L100_bdf"
-         else:
-             s3_bucket = f"{s3_bucket}/{release}_L100_bdf"
-
-         super().__init__(
-             dataset=dataset_parameters,
-             query=query,
-             cache_dir=cache_dir,
-             s3_bucket=s3_bucket,
-             **kwargs,
-         )
-
-
- registered_classes = register_openneuro_datasets(
-     summary_file=Path(__file__).with_name("dataset_summary.csv"),
-     base_class=EEGDashDataset,
-     namespace=globals(),
- )
-
-
- __all__ = ["EEGChallengeDataset"] + list(registered_classes.keys())
+ config = {
+     "required_fields": ["data_name"],
+     # Default set of user-facing primary record attributes expected in the database. Records
+     # where any of these are missing will be loaded with the respective attribute set to None.
+     # Additional fields may be returned if they are present in the database, notably bidsdependencies.
+     "attributes": {
+         "data_name": "str",
+         "dataset": "str",
+         "bidspath": "str",
+         "subject": "str",
+         "task": "str",
+         "session": "str",
+         "run": "str",
+         "sampling_frequency": "float",
+         "modality": "str",
+         "nchans": "int",
+         "ntimes": "int",  # note: this is really the number of seconds in the data, rounded down
+     },
+     # queryable descriptive fields for a given recording
+     "description_fields": ["subject", "session", "run", "task", "age", "gender", "sex"],
+     # list of filenames that may be present in the BIDS dataset directory that are used
+     # to load and interpret a given BIDS recording.
+     "bids_dependencies_files": [
+         "dataset_description.json",
+         "participants.tsv",
+         "events.tsv",
+         "events.json",
+         "eeg.json",
+         "electrodes.tsv",
+         "channels.tsv",
+         "coordsystem.json",
+     ],
+     "accepted_query_fields": ["data_name", "dataset"],
+ }
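The `attributes` table doubles as the record schema consumed by `load_eeg_attrs_from_bids_file` in the first hunk: records are pre-seeded with every expected attribute set to None and then filled where extraction succeeds. A minimal sketch (the `eegdash.const` import path follows the `from .const import ...` lines above but is still an assumption):

    from eegdash.const import config as data_config  # assumed import path

    attrs = {field: None for field in data_config["attributes"]}
    attrs["dataset"] = "ds005516"  # fill whatever can be extracted
    missing = [k for k, v in attrs.items() if v is None]
    # all schema keys are present; unextracted ones simply remain None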