eegdash 0.3.3.dev61__py3-none-any.whl → 0.5.0.dev180784713__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions exactly as they appear in their public registries.
Files changed (45)
  1. eegdash/__init__.py +19 -6
  2. eegdash/api.py +336 -539
  3. eegdash/bids_eeg_metadata.py +495 -0
  4. eegdash/const.py +349 -0
  5. eegdash/dataset/__init__.py +28 -0
  6. eegdash/dataset/base.py +311 -0
  7. eegdash/dataset/bids_dataset.py +641 -0
  8. eegdash/dataset/dataset.py +692 -0
  9. eegdash/dataset/dataset_summary.csv +255 -0
  10. eegdash/dataset/registry.py +287 -0
  11. eegdash/downloader.py +197 -0
  12. eegdash/features/__init__.py +15 -13
  13. eegdash/features/datasets.py +329 -138
  14. eegdash/features/decorators.py +105 -13
  15. eegdash/features/extractors.py +233 -63
  16. eegdash/features/feature_bank/__init__.py +12 -12
  17. eegdash/features/feature_bank/complexity.py +22 -20
  18. eegdash/features/feature_bank/connectivity.py +27 -28
  19. eegdash/features/feature_bank/csp.py +3 -1
  20. eegdash/features/feature_bank/dimensionality.py +6 -6
  21. eegdash/features/feature_bank/signal.py +29 -30
  22. eegdash/features/feature_bank/spectral.py +40 -44
  23. eegdash/features/feature_bank/utils.py +8 -0
  24. eegdash/features/inspect.py +126 -15
  25. eegdash/features/serialization.py +58 -17
  26. eegdash/features/utils.py +90 -16
  27. eegdash/hbn/__init__.py +28 -0
  28. eegdash/hbn/preprocessing.py +105 -0
  29. eegdash/hbn/windows.py +428 -0
  30. eegdash/logging.py +54 -0
  31. eegdash/mongodb.py +55 -24
  32. eegdash/paths.py +52 -0
  33. eegdash/utils.py +29 -1
  34. eegdash-0.5.0.dev180784713.dist-info/METADATA +121 -0
  35. eegdash-0.5.0.dev180784713.dist-info/RECORD +38 -0
  36. eegdash-0.5.0.dev180784713.dist-info/licenses/LICENSE +29 -0
  37. eegdash/data_config.py +0 -34
  38. eegdash/data_utils.py +0 -687
  39. eegdash/dataset.py +0 -69
  40. eegdash/preprocessing.py +0 -63
  41. eegdash-0.3.3.dev61.dist-info/METADATA +0 -192
  42. eegdash-0.3.3.dev61.dist-info/RECORD +0 -28
  43. eegdash-0.3.3.dev61.dist-info/licenses/LICENSE +0 -23
  44. {eegdash-0.3.3.dev61.dist-info → eegdash-0.5.0.dev180784713.dist-info}/WHEEL +0 -0
  45. {eegdash-0.3.3.dev61.dist-info → eegdash-0.5.0.dev180784713.dist-info}/top_level.txt +0 -0
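Most of the churn above is a package restructure: the flat data_utils.py, data_config.py, dataset.py, and preprocessing.py modules are removed in favor of the new dataset/, hbn/, and sibling modules. For downstream code, a minimal migration sketch, assuming the moved names keep their old behavior (the new paths below are the ones visible in the api.py import block of this diff):

    # Old (0.3.3.dev61) import paths, removed in 0.5.0:
    # from eegdash.data_utils import EEGBIDSDataset
    # from eegdash.data_config import config

    # New (0.5.0.dev180784713) paths, as imported by eegdash/api.py:
    from eegdash.dataset.bids_dataset import EEGBIDSDataset
    from eegdash.const import config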
eegdash/api.py CHANGED
@@ -1,54 +1,65 @@
- import logging
+ # Authors: The EEGDash contributors.
+ # License: BSD-3-Clause
+ # Copyright the EEGDash contributors.
+
+ """High-level interface to the EEGDash metadata database.
+
+ This module provides the main EEGDash class which serves as the primary entry point for
+ interacting with the EEGDash ecosystem. It offers methods to query, insert, and update
+ metadata records stored in the EEGDash MongoDB database, and includes utilities to load
+ EEG data from S3 for matched records.
+ """
+
+ import json
  import os
- import tempfile
  from pathlib import Path
  from typing import Any, Mapping

  import mne
  import numpy as np
- import xarray as xr
- from dotenv import load_dotenv
- from joblib import Parallel, delayed
- from pymongo import InsertOne, UpdateOne
- from s3fs import S3FileSystem
-
- from braindecode.datasets import BaseConcatDataset
-
- from .data_config import config as data_config
- from .data_utils import EEGBIDSDataset, EEGDashBaseDataset
+ import pandas as pd
+ from mne.utils import _soft_import
+
+ from .bids_eeg_metadata import (
+     build_query_from_kwargs,
+     load_eeg_attrs_from_bids_file,
+ )
+ from .const import (
+     ALLOWED_QUERY_FIELDS,
+ )
+ from .const import config as data_config
+ from .dataset.bids_dataset import EEGBIDSDataset
+ from .logging import logger
  from .mongodb import MongoConnectionManager
-
- logger = logging.getLogger("eegdash")
+ from .utils import _init_mongo_client


  class EEGDash:
-     """A high-level interface to the EEGDash database.
-
-     This class is primarily used to interact with the metadata records stored in the
-     EEGDash database (or a private instance of it), allowing users to find, add, and
-     update EEG data records.
+     """High-level interface to the EEGDash metadata database.

-     While this class provides basic support for loading EEG data, please see
-     the EEGDashDataset class for a more complete way to retrieve and work with full
-     datasets.
+     Provides methods to query, insert, and update metadata records stored in the
+     EEGDash MongoDB database (public or private). Also includes utilities to load
+     EEG data from S3 for matched records.

+     For working with collections of
+     recordings as PyTorch datasets, prefer :class:`EEGDashDataset`.
      """

      def __init__(self, *, is_public: bool = True, is_staging: bool = False) -> None:
-         """Create new instance of the EEGDash Database client.
+         """Create a new EEGDash client.

          Parameters
          ----------
-         is_public: bool
-             Whether to connect to the public MongoDB database; if False, connect to a
-             private database instance as per the DB_CONNECTION_STRING env variable
-             (or .env file entry).
-         is_staging: bool
-             If True, use staging MongoDB database ("eegdashstaging"); otherwise use the
-             production database ("eegdash").
-
-         Example
-         -------
+         is_public : bool, default True
+             Connect to the public MongoDB database. If ``False``, connect to a
+             private database instance using the ``DB_CONNECTION_STRING`` environment
+             variable (or value from a ``.env`` file).
+         is_staging : bool, default False
+             If ``True``, use the staging database (``eegdashstaging``); otherwise
+             use the production database (``eegdash``).
+
+         Examples
+         --------
          >>> eegdash = EEGDash()

          """
@@ -58,47 +69,91 @@ class EEGDash:

          if self.is_public:
              DB_CONNECTION_STRING = mne.utils.get_config("EEGDASH_DB_URI")
+             if not DB_CONNECTION_STRING:
+                 try:
+                     _init_mongo_client()
+                     DB_CONNECTION_STRING = mne.utils.get_config("EEGDASH_DB_URI")
+                 except Exception:
+                     DB_CONNECTION_STRING = None
          else:
-             load_dotenv()
+             dotenv = _soft_import("dotenv", "eegdash[full] is necessary.")
+             dotenv.load_dotenv()
              DB_CONNECTION_STRING = os.getenv("DB_CONNECTION_STRING")

          # Use singleton to get MongoDB client, database, and collection
+         if not DB_CONNECTION_STRING:
+             raise RuntimeError(
+                 "No MongoDB connection string configured. Set MNE config 'EEGDASH_DB_URI' "
+                 "or environment variable 'DB_CONNECTION_STRING'."
+             )
          self.__client, self.__db, self.__collection = MongoConnectionManager.get_client(
              DB_CONNECTION_STRING, is_staging
          )

-         self.filesystem = S3FileSystem(
-             anon=True, client_kwargs={"region_name": "us-east-2"}
-         )
+     def find(
+         self, query: dict[str, Any] = None, /, **kwargs
+     ) -> list[Mapping[str, Any]]:
+         """Find records in the MongoDB collection.

-     def find(self, query: dict[str, Any], *args, **kwargs) -> list[Mapping[str, Any]]:
-         """Find records in the MongoDB collection that satisfy the given query.
+         Examples
+         --------
+         >>> eegdash.find({"dataset": "ds002718", "subject": {"$in": ["012", "013"]}})  # pre-built query
+         >>> eegdash.find(dataset="ds002718", subject="012")  # keyword filters
+         >>> eegdash.find(dataset="ds002718", subject=["012", "013"])  # sequence -> $in
+         >>> eegdash.find({})  # fetch all (use with care)
+         >>> eegdash.find({"dataset": "ds002718"}, subject=["012", "013"])  # combine query + kwargs (AND)

          Parameters
          ----------
-         query: dict
-             A dictionary that specifies the query to be executed; this is a reference
-             document that is used to match records in the MongoDB collection.
-         args:
-             Additional positional arguments for the MongoDB find() method; see
-             https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.find
-         kwargs:
-             Additional keyword arguments for the MongoDB find() method.
+         query : dict, optional
+             Complete MongoDB query dictionary. This is a positional-only
+             argument.
+         **kwargs
+             User-friendly field filters that are converted to a MongoDB query.
+             Values can be scalars (e.g., ``"sub-01"``) or sequences (translated
+             to ``$in`` queries).

          Returns
          -------
-         list:
-             A list of DB records (string-keyed dictionaries) that match the query.
-
-         Example
-         -------
-         >>> eegdash = EEGDash()
-         >>> eegdash.find({"dataset": "ds002718", "subject": "012"})
+         list of dict
+             DB records that match the query.

          """
-         results = self.__collection.find(query, *args, **kwargs)
+         final_query: dict[str, Any] | None = None
+
+         # Accept explicit empty dict {} to mean "match all"
+         raw_query = query if isinstance(query, dict) else None
+         kwargs_query = build_query_from_kwargs(**kwargs) if kwargs else None
+
+         # Determine presence, treating {} as a valid raw query
+         has_raw = isinstance(raw_query, dict)
+         has_kwargs = kwargs_query is not None
+
+         if has_raw and has_kwargs:
+             # Detect conflicting constraints on the same field (e.g., task specified
+             # differently in both places) and raise a clear error instead of silently
+             # producing an empty result.
+             self._raise_if_conflicting_constraints(raw_query, kwargs_query)
+             # Merge with logical AND so both constraints apply
+             if raw_query:  # non-empty dict adds constraints
+                 final_query = {"$and": [raw_query, kwargs_query]}
+             else:  # {} adds nothing; use kwargs_query only
+                 final_query = kwargs_query
+         elif has_raw:
+             # May be {} meaning match-all, or a non-empty dict
+             final_query = raw_query
+         elif has_kwargs:
+             final_query = kwargs_query
+         else:
+             # Avoid accidental full scans
+             raise ValueError(
+                 "find() requires a query dictionary or at least one keyword argument. "
+                 "To find all documents, use find({})."
+             )

-         return [result for result in results]
+         results = self.__collection.find(final_query)
+
+         return list(results)

      def exist(self, query: dict[str, Any]) -> bool:
          """Return True if at least one record matches the query, else False.
@@ -145,17 +200,22 @@ class EEGDash:
          return doc is not None

      def _validate_input(self, record: dict[str, Any]) -> dict[str, Any]:
-         """Internal method to validate the input record against the expected schema.
+         """Validate the input record against the expected schema.

          Parameters
          ----------
-         record: dict
+         record : dict
              A dictionary representing the EEG data record to be validated.

          Returns
          -------
-         dict:
-             Returns the record itself on success, or raises a ValueError if the record is invalid.
+         dict
+             The record itself on success.
+
+         Raises
+         ------
+         ValueError
+             If the record is missing required keys or has values of the wrong type.

          """
          input_types = {
@@ -184,548 +244,285 @@ class EEGDash:

          return record

-     def load_eeg_data_from_s3(self, s3path: str) -> xr.DataArray:
-         """Load an EEGLAB .set file from an AWS S3 URI and return it as an xarray DataArray.
+     def _build_query_from_kwargs(self, **kwargs) -> dict[str, Any]:
+         """Build a validated MongoDB query from keyword arguments.
+
+         This delegates to the module-level builder used across the package.

          Parameters
          ----------
-         s3path : str
-             An S3 URI (should start with "s3://") for the file in question.
+         **kwargs
+             Keyword arguments to convert into a MongoDB query.

          Returns
          -------
-         xr.DataArray
-             A DataArray containing the EEG data, with dimensions "channel" and "time".
-
-         Example
-         -------
-         >>> eegdash = EEGDash()
-         >>> mypath = "s3://openneuro.org/path/to/your/eeg_data.set"
-         >>> mydata = eegdash.load_eeg_data_from_s3(mypath)
+         dict
+             A MongoDB query dictionary.

          """
-         with tempfile.NamedTemporaryFile(delete=False, suffix=".set") as tmp:
-             with self.filesystem.open(s3path) as s3_file:
-                 tmp.write(s3_file.read())
-             tmp_path = tmp.name
-         eeg_data = self.load_eeg_data_from_bids_file(tmp_path)
-         os.unlink(tmp_path)
-         return eeg_data
-
-     def load_eeg_data_from_bids_file(self, bids_file: str) -> xr.DataArray:
-         """Load EEG data from a local file and return it as a xarray DataArray.
-
-         Parameters
-         ----------
-         bids_file : str
-             Path to the file on the local filesystem.
-
-         Notes
-         -----
-         Currently, only non-epoched .set files are supported.
-
-         """
-         raw_object = mne.io.read_raw(bids_file)
-         eeg_data = raw_object.get_data()
-
-         fs = raw_object.info["sfreq"]
-         max_time = eeg_data.shape[1] / fs
-         time_steps = np.linspace(0, max_time, eeg_data.shape[1]).squeeze()  # in seconds
-
-         channel_names = raw_object.ch_names
+         return build_query_from_kwargs(**kwargs)

-         eeg_xarray = xr.DataArray(
-             data=eeg_data,
-             dims=["channel", "time"],
-             coords={"time": time_steps, "channel": channel_names},
-         )
-         return eeg_xarray
-
-     def get_raw_extensions(
-         self, bids_file: str, bids_dataset: EEGBIDSDataset
-     ) -> list[str]:
-         """Helper to find paths to additional "sidecar" files that may be associated
-         with a given main data file in a BIDS dataset; paths are returned as relative to
-         the parent dataset path.
+     def _extract_simple_constraint(
+         self, query: dict[str, Any], key: str
+     ) -> tuple[str, Any] | None:
+         """Extract a simple constraint for a given key from a query dict.

-         For example, if the input file is a .set file, this will return the relative path
-         to a corresponding .fdt file (if any).
-         """
-         bids_file = Path(bids_file)
-         extensions = {
-             ".set": [".set", ".fdt"],  # eeglab
-             ".edf": [".edf"],  # european
-             ".vhdr": [".eeg", ".vhdr", ".vmrk", ".dat", ".raw"],  # brainvision
-             ".bdf": [".bdf"],  # biosemi
-         }
-         return [
-             str(bids_dataset.get_relative_bidspath(bids_file.with_suffix(suffix)))
-             for suffix in extensions[bids_file.suffix]
-             if bids_file.with_suffix(suffix).exists()
-         ]
-
-     def load_eeg_attrs_from_bids_file(
-         self, bids_dataset: EEGBIDSDataset, bids_file: str
-     ) -> dict[str, Any]:
-         """Build the metadata record for a given BIDS file (single recording) in a BIDS dataset.
-
-         Attributes are at least the ones defined in data_config attributes (set to None if missing),
-         but are typically a superset, and include, among others, the paths to relevant
-         meta-data files needed to load and interpret the file in question.
+         Supports top-level equality (e.g., ``{'subject': '01'}``) and ``$in``
+         (e.g., ``{'subject': {'$in': ['01', '02']}}``) constraints.

          Parameters
          ----------
-         bids_dataset : EEGBIDSDataset
-             The BIDS dataset object containing the file.
-         bids_file : str
-             The path to the BIDS file within the dataset.
+         query : dict
+             The MongoDB query dictionary.
+         key : str
+             The key for which to extract the constraint.

          Returns
          -------
-         dict:
-             A dictionary representing the metadata record for the given file. This is the
-             same format as the records stored in the database.
+         tuple or None
+             A tuple of (kind, value) where kind is "eq" or "in", or None if the
+             constraint is not present or unsupported.

          """
-         if bids_file not in bids_dataset.files:
-             raise ValueError(f"{bids_file} not in {bids_dataset.dataset}")
+         if not isinstance(query, dict) or key not in query:
+             return None
+         val = query[key]
+         if isinstance(val, dict):
+             if "$in" in val and isinstance(val["$in"], (list, tuple)):
+                 return ("in", list(val["$in"]))
+             return None  # unsupported operator shape for conflict checking
+         else:
+             return "eq", val

-         # Initialize attrs with None values for all expected fields
-         attrs = {field: None for field in self.config["attributes"].keys()}
+     def _raise_if_conflicting_constraints(
+         self, raw_query: dict[str, Any], kwargs_query: dict[str, Any]
+     ) -> None:
+         """Raise ValueError if query sources have incompatible constraints.

-         file = Path(bids_file).name
-         dsnumber = bids_dataset.dataset
-         # extract openneuro path by finding the first occurrence of the dataset name in the filename and remove the path before that
-         openneuro_path = dsnumber + bids_file.split(dsnumber)[1]
+         Checks for mutually exclusive constraints on the same field to avoid
+         silent empty results.

-         # Update with actual values where available
-         try:
-             participants_tsv = bids_dataset.subject_participant_tsv(bids_file)
-         except Exception as e:
-             logger.error("Error getting participants_tsv: %s", str(e))
-             participants_tsv = None
+         Parameters
+         ----------
+         raw_query : dict
+             The raw MongoDB query dictionary.
+         kwargs_query : dict
+             The query dictionary built from keyword arguments.

-         try:
-             eeg_json = bids_dataset.eeg_json(bids_file)
-         except Exception as e:
-             logger.error("Error getting eeg_json: %s", str(e))
-             eeg_json = None
-
-         bids_dependencies_files = self.config["bids_dependencies_files"]
-         bidsdependencies = []
-         for extension in bids_dependencies_files:
-             try:
-                 dep_path = bids_dataset.get_bids_metadata_files(bids_file, extension)
-                 dep_path = [
-                     str(bids_dataset.get_relative_bidspath(dep)) for dep in dep_path
-                 ]
-                 bidsdependencies.extend(dep_path)
-             except Exception:
-                 pass
-
-         bidsdependencies.extend(self.get_raw_extensions(bids_file, bids_dataset))
-
-         # Define field extraction functions with error handling
-         field_extractors = {
-             "data_name": lambda: f"{bids_dataset.dataset}_{file}",
-             "dataset": lambda: bids_dataset.dataset,
-             "bidspath": lambda: openneuro_path,
-             "subject": lambda: bids_dataset.get_bids_file_attribute(
-                 "subject", bids_file
-             ),
-             "task": lambda: bids_dataset.get_bids_file_attribute("task", bids_file),
-             "session": lambda: bids_dataset.get_bids_file_attribute(
-                 "session", bids_file
-             ),
-             "run": lambda: bids_dataset.get_bids_file_attribute("run", bids_file),
-             "modality": lambda: bids_dataset.get_bids_file_attribute(
-                 "modality", bids_file
-             ),
-             "sampling_frequency": lambda: bids_dataset.get_bids_file_attribute(
-                 "sfreq", bids_file
-             ),
-             "nchans": lambda: bids_dataset.get_bids_file_attribute("nchans", bids_file),
-             "ntimes": lambda: bids_dataset.get_bids_file_attribute("ntimes", bids_file),
-             "participant_tsv": lambda: participants_tsv,
-             "eeg_json": lambda: eeg_json,
-             "bidsdependencies": lambda: bidsdependencies,
-         }
+         Raises
+         ------
+         ValueError
+             If conflicting constraints are found.

-         # Dynamically populate attrs with error handling
-         for field, extractor in field_extractors.items():
-             try:
-                 attrs[field] = extractor()
-             except Exception as e:
-                 logger.error("Error extracting %s : %s", field, str(e))
-                 attrs[field] = None
+         """
+         if not raw_query or not kwargs_query:
+             return

-         return attrs
+         # Only consider fields we generally allow; skip meta operators like $and
+         raw_keys = set(raw_query.keys()) & ALLOWED_QUERY_FIELDS
+         kw_keys = set(kwargs_query.keys()) & ALLOWED_QUERY_FIELDS
+         dup_keys = raw_keys & kw_keys
+         for key in dup_keys:
+             rc = self._extract_simple_constraint(raw_query, key)
+             kc = self._extract_simple_constraint(kwargs_query, key)
+             if rc is None or kc is None:
+                 # If either side is non-simple, skip conflict detection for this key
+                 continue
+
+             r_kind, r_val = rc
+             k_kind, k_val = kc
+
+             # Normalize to sets when appropriate for simpler checks
+             if r_kind == "eq" and k_kind == "eq":
+                 if r_val != k_val:
+                     raise ValueError(
+                         f"Conflicting constraints for '{key}': query={r_val!r} vs kwargs={k_val!r}"
+                     )
+             elif r_kind == "in" and k_kind == "eq":
+                 if k_val not in r_val:
+                     raise ValueError(
+                         f"Conflicting constraints for '{key}': query in {r_val!r} vs kwargs={k_val!r}"
+                     )
+             elif r_kind == "eq" and k_kind == "in":
+                 if r_val not in k_val:
+                     raise ValueError(
+                         f"Conflicting constraints for '{key}': query={r_val!r} vs kwargs in {k_val!r}"
+                     )
+             elif r_kind == "in" and k_kind == "in":
+                 if len(set(r_val).intersection(k_val)) == 0:
+                     raise ValueError(
+                         f"Conflicting constraints for '{key}': disjoint sets {r_val!r} and {k_val!r}"
+                     )

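The two helpers above back the conflict check that find() runs before merging a raw query with keyword filters. A sketch of what does and does not trigger it, based on the eq/$in cases handled in the code:

    # Compatible: "012" lies inside the raw $in list, so the queries merge.
    eegdash.find({"subject": {"$in": ["012", "013"]}}, subject="012")

    # Conflicting: equality constraints on the same field with different values
    # raise ValueError instead of silently returning an empty result.
    try:
        eegdash.find({"task": "RestingState"}, task="Video")
    except ValueError as err:
        print(err)  # Conflicting constraints for 'task': ...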
      def add_bids_dataset(
-         self, dataset: str, data_dir: str, overwrite: bool = True
-     ) -> None:
-         """Traverse the BIDS dataset at data_dir and add its records to the MongoDB database,
-         under the given dataset name.
+         self,
+         dataset: str,
+         data_dir: str,
+         overwrite: bool = True,
+         output_path: str | Path | None = None,
+     ) -> dict[str, Any]:
+         """Collect metadata for a local BIDS dataset as JSON-ready records.
+
+         Instead of inserting records directly into MongoDB, this method scans
+         ``data_dir`` and returns a JSON-serializable manifest describing every
+         EEG recording that was discovered. The manifest can be written to disk
+         or forwarded to the EEGDash ingestion API for persistence.

          Parameters
          ----------
-         dataset : str)
-             The name of the dataset to be added (e.g., "ds002718").
+         dataset : str
+             Dataset identifier (e.g., ``"ds002718"``).
          data_dir : str
-             The path to the BIDS dataset directory.
-         overwrite : bool
-             Whether to overwrite/update existing records in the database.
+             Path to the local BIDS dataset directory.
+         overwrite : bool, default True
+             If ``False``, skip records that already exist in the database based
+             on ``data_name`` lookups.
+         output_path : str | Path | None, optional
+             If provided, the manifest is written to the given JSON file.

-         """
-         if self.is_public:
-             raise ValueError("This operation is not allowed for public users")
+         Returns
+         -------
+         dict
+             A manifest with keys ``dataset``, ``source``, ``records`` and, when
+             applicable, ``skipped`` or ``errors``.

-         if not overwrite and self.exist({"dataset": dataset}):
-             logger.info("Dataset %s already exists in the database", dataset)
-             return
+         """
+         source_dir = Path(data_dir).expanduser()
          try:
              bids_dataset = EEGBIDSDataset(
-                 data_dir=data_dir,
+                 data_dir=str(source_dir),
                  dataset=dataset,
              )
-         except Exception as e:
-             logger.error("Error creating bids dataset %s: $s", dataset, str(e))
-             raise e
-         requests = []
+         except Exception as exc:
+             logger.error("Error creating BIDS dataset %s: %s", dataset, exc)
+             raise exc
+
+         records: list[dict[str, Any]] = []
+         skipped: list[str] = []
+         errors: list[dict[str, str]] = []
+
          for bids_file in bids_dataset.get_files():
-             try:
-                 data_id = f"{dataset}_{Path(bids_file).name}"
-
-                 if self.exist({"data_name": data_id}):
-                     if overwrite:
-                         eeg_attrs = self.load_eeg_attrs_from_bids_file(
-                             bids_dataset, bids_file
-                         )
-                         requests.append(self.update_request(eeg_attrs))
-                 else:
-                     eeg_attrs = self.load_eeg_attrs_from_bids_file(
-                         bids_dataset, bids_file
+             data_id = f"{dataset}_{Path(bids_file).name}"
+             if not overwrite:
+                 try:
+                     if self.exist({"data_name": data_id}):
+                         skipped.append(data_id)
+                         continue
+                 except Exception as exc:
+                     logger.warning(
+                         "Could not verify existing record %s due to: %s",
+                         data_id,
+                         exc,
                      )
-                     requests.append(self.add_request(eeg_attrs))
-             except Exception as e:
-                 logger.error("Error adding record %s", bids_file)
-                 logger.error(str(e))

-         logger.info("Number of requests: %s", len(requests))
+             try:
+                 eeg_attrs = load_eeg_attrs_from_bids_file(bids_dataset, bids_file)
+                 records.append(eeg_attrs)
+             except Exception as exc:  # log and continue collecting
+                 logger.error("Error extracting metadata for %s", bids_file)
+                 logger.error(str(exc))
+                 errors.append({"file": str(bids_file), "error": str(exc)})
+
+         manifest: dict[str, Any] = {
+             "dataset": dataset,
+             "source": str(source_dir.resolve()),
+             "record_count": len(records),
+             "records": records,
+         }
+         if skipped:
+             manifest["skipped"] = skipped
+         if errors:
+             manifest["errors"] = errors
+
+         if output_path is not None:
+             output_path = Path(output_path)
+             output_path.parent.mkdir(parents=True, exist_ok=True)
+             with output_path.open("w", encoding="utf-8") as fh:
+                 json.dump(
+                     manifest,
+                     fh,
+                     indent=2,
+                     sort_keys=True,
+                     default=_json_default,
+                 )
+             logger.info(
+                 "Wrote EEGDash ingestion manifest for %s to %s",
+                 dataset,
+                 output_path,
+             )
+
+         logger.info(
+             "Prepared %s records for dataset %s (skipped=%s, errors=%s)",
+             len(records),
+             dataset,
+             len(skipped),
+             len(errors),
+         )
+
+         return manifest

-         if requests:
-             result = self.__collection.bulk_write(requests, ordered=False)
-             logger.info("Inserted: %s ", result.inserted_count)
-             logger.info("Modified: %s ", result.modified_count)
-             logger.info("Deleted: %s", result.deleted_count)
-             logger.info("Upserted: %s", result.upserted_count)
-             logger.info("Errors: %s ", result.bulk_api_result.get("writeErrors", []))
+     def exists(self, query: dict[str, Any]) -> bool:
+         """Check if at least one record matches the query.

-     def get(self, query: dict[str, Any]) -> list[xr.DataArray]:
-         """Retrieve a list of EEG data arrays that match the given query. See also
-         the `find()` method for details on the query format.
+         This is an alias for :meth:`exist`.

          Parameters
          ----------
          query : dict
-             A dictionary that specifies the query to be executed; this is a reference
-             document that is used to match records in the MongoDB collection.
+             MongoDB query to check for existence.

          Returns
          -------
-         A list of xarray DataArray objects containing the EEG data for each matching record.
-
-         Notes
-         -----
-         Retrieval is done in parallel, and the downloaded data are not cached locally.
-
-         """
-         sessions = self.find(query)
-         results = []
-         if sessions:
-             logger.info("Found %s records", len(sessions))
-             results = Parallel(
-                 n_jobs=-1 if len(sessions) > 1 else 1, prefer="threads", verbose=1
-             )(
-                 delayed(self.load_eeg_data_from_s3)(self.get_s3path(session))
-                 for session in sessions
-             )
-         return results
-
-     def add_request(self, record: dict):
-         """Internal helper method to create a MongoDB insertion request for a record."""
-         return InsertOne(record)
-
-     def add(self, record: dict):
-         """Add a single record to the MongoDB collection."""
-         try:
-             self.__collection.insert_one(record)
-         except ValueError as e:
-             logger.error("Validation error for record: %s ", record["data_name"])
-             logger.error(e)
-         except:
-             logger.error("Error adding record: %s ", record["data_name"])
-
-     def update_request(self, record: dict):
-         """Internal helper method to create a MongoDB update request for a record."""
-         return UpdateOne({"data_name": record["data_name"]}, {"$set": record})
-
-     def update(self, record: dict):
-         """Update a single record in the MongoDB collection."""
-         try:
-             self.__collection.update_one(
-                 {"data_name": record["data_name"]}, {"$set": record}
-             )
-         except:  # silent failure
-             logger.error("Error updating record: %s", record["data_name"])
-
-     def remove_field(self, record, field):
-         """Remove a specific field from a record in the MongoDB collection."""
-         self.__collection.update_one(
-             {"data_name": record["data_name"]}, {"$unset": {field: 1}}
-         )
+         bool
+             True if a matching record exists, False otherwise.

-     def remove_field_from_db(self, field):
-         """Removed all occurrences of a specific field from all records in the MongoDB
-         collection. WARNING: this operation is destructive and should be used with caution.
          """
-         self.__collection.update_many({}, {"$unset": {field: 1}})
+         return self.exist(query)

      @property
      def collection(self):
-         """Return the MongoDB collection object."""
-         return self.__collection
+         """The underlying PyMongo ``Collection`` object.

-     def close(self):
-         """Close the MongoDB client connection.
+         Returns
+         -------
+         pymongo.collection.Collection
+             The collection object used for database interactions.

-         Note: Since MongoDB clients are now managed by a singleton,
-         this method no longer closes connections. Use close_all_connections()
-         class method to close all connections if needed.
          """
-         # Individual instances no longer close the shared client
-         pass
+         return self.__collection

      @classmethod
-     def close_all_connections(cls):
-         """Close all MongoDB client connections managed by the singleton."""
+     def close_all_connections(cls) -> None:
+         """Close all MongoDB client connections managed by the singleton manager."""
          MongoConnectionManager.close_all()

-     def __del__(self):
-         """Ensure connection is closed when object is deleted."""
-         # No longer needed since we're using singleton pattern
-         pass
-
-
- class EEGDashDataset(BaseConcatDataset):
-     def __init__(
-         self,
-         query: dict | None = None,
-         data_dir: str | list | None = None,
-         dataset: str | list | None = None,
-         description_fields: list[str] = [
-             "subject",
-             "session",
-             "run",
-             "task",
-             "age",
-             "gender",
-             "sex",
-         ],
-         cache_dir: str = "~/eegdash_cache",
-         s3_bucket: str | None = None,
-         eeg_dash_instance=None,
-         **kwargs,
-     ):
-         """Create a new EEGDashDataset from a given query or local BIDS dataset directory
-         and dataset name. An EEGDashDataset is pooled collection of EEGDashBaseDataset
-         instances (individual recordings) and is a subclass of braindecode's BaseConcatDataset.
-
-         Parameters
-         ----------
-         query : dict | None
-             Optionally a dictionary that specifies the query to be executed; see
-             EEGDash.find() for details on the query format.
-         data_dir : str | list[str] | None
-             Optionally a string or a list of strings specifying one or more local
-             BIDS dataset directories from which to load the EEG data files. Exactly one
-             of query or data_dir must be provided.
-         dataset : str | list[str] | None
-             If data_dir is given, a name or list of names for for the dataset(s) to be loaded.
-         description_fields : list[str]
-             A list of fields to be extracted from the dataset records
-             and included in the returned data description(s). Examples are typical
-             subject metadata fields such as "subject", "session", "run", "task", etc.;
-             see also data_config.description_fields for the default set of fields.
-         cache_dir : str
-             A directory where the dataset will be cached locally.
-         s3_bucket : str | None
-             An optional S3 bucket URI (e.g., "s3://mybucket") to use instead of the
-             default OpenNeuro bucket for loading data files
-         kwargs : dict
-             Additional keyword arguments to be passed to the EEGDashBaseDataset
-             constructor.
-
-         """
-         self.cache_dir = cache_dir
-         self.s3_bucket = s3_bucket
-         self.eeg_dash = eeg_dash_instance or EEGDash()
-         _owns_client = eeg_dash_instance is None
-
-         try:
-             if query:
-                 datasets = self.find_datasets(query, description_fields, **kwargs)
-             elif data_dir:
-                 if isinstance(data_dir, str):
-                     datasets = self.load_bids_dataset(
-                         dataset, data_dir, description_fields, s3_bucket, **kwargs
-                     )
-                 else:
-                     assert len(data_dir) == len(dataset), (
-                         "Number of datasets and their directories must match"
-                     )
-                     datasets = []
-                     for i, _ in enumerate(data_dir):
-                         datasets.extend(
-                             self.load_bids_dataset(
-                                 dataset[i],
-                                 data_dir[i],
-                                 description_fields,
-                                 s3_bucket,
-                                 **kwargs,
-                             )
-                         )
-             else:
-                 raise ValueError(
-                     "Exactly one of 'query' or 'data_dir' must be provided."
-                 )
-         finally:
-             # If we created the client, close it now that construction is done.
-             if _owns_client:
-                 try:
-                     self.eeg_dash.close()
-                 except Exception:
-                     # Don't let close errors break construction
-                     pass
-
-         self.filesystem = S3FileSystem(
-             anon=True, client_kwargs={"region_name": "us-east-2"}
-         )
-
-         self.eeg_dash.close()
-
-         super().__init__(datasets)
-
-     def find_key_in_nested_dict(self, data: Any, target_key: str) -> Any:
-         """Helper to recursively search for a key in a nested dictionary structure; returns
-         the value associated with the first occurrence of the key, or None if not found.
-         """
-         if isinstance(data, dict):
-             if target_key in data:
-                 return data[target_key]
-             for value in data.values():
-                 result = self.find_key_in_nested_dict(value, target_key)
-                 if result is not None:
-                     return result
-         return None
-
-     def find_datasets(
-         self, query: dict[str, Any], description_fields: list[str], **kwargs
-     ) -> list[EEGDashBaseDataset]:
-         """Helper method to find datasets in the MongoDB collection that satisfy the
-         given query and return them as a list of EEGDashBaseDataset objects.
-
-         Parameters
-         ----------
-         query : dict
-             The query object, as in EEGDash.find().
-         description_fields : list[str]
-             A list of fields to be extracted from the dataset records and included in
-             the returned dataset description(s).
-         kwargs: additional keyword arguments to be passed to the EEGDashBaseDataset
-             constructor.

-         Returns
-         -------
-         list :
-             A list of EEGDashBaseDataset objects that match the query.
+ def _json_default(value: Any) -> Any:
+     """Fallback serializer for complex objects when exporting ingestion JSON."""
+     try:
+         if isinstance(value, (np.generic,)):
+             return value.item()
+         if isinstance(value, np.ndarray):
+             return value.tolist()
+     except Exception:
+         pass

-         """
-         datasets: list[EEGDashBaseDataset] = []
-         for record in self.eeg_dash.find(query):
-             description = {}
-             for field in description_fields:
-                 value = self.find_key_in_nested_dict(record, field)
-                 if value is not None:
-                     description[field] = value
-             datasets.append(
-                 EEGDashBaseDataset(
-                     record,
-                     self.cache_dir,
-                     self.s3_bucket,
-                     description=description,
-                     **kwargs,
-                 )
-             )
-         return datasets
+     try:
+         if value is pd.NA:
+             return None
+         if isinstance(value, (pd.Timestamp, pd.Timedelta)):
+             return value.isoformat()
+         if isinstance(value, pd.Series):
+             return value.to_dict()
+     except Exception:
+         pass

-     def load_bids_dataset(
-         self,
-         dataset,
-         data_dir,
-         description_fields: list[str],
-         s3_bucket: str | None = None,
-         **kwargs,
-     ):
-         """Helper method to load a single local BIDS dataset and return it as a list of
-         EEGDashBaseDatasets (one for each recording in the dataset).
+     if isinstance(value, Path):
+         return value.as_posix()
+     if isinstance(value, set):
+         return sorted(value)

-         Parameters
-         ----------
-         dataset : str
-             A name for the dataset to be loaded (e.g., "ds002718").
-         data_dir : str
-             The path to the local BIDS dataset directory.
-         description_fields : list[str]
-             A list of fields to be extracted from the dataset records
-             and included in the returned dataset description(s).
+     raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")

-         """
-         bids_dataset = EEGBIDSDataset(
-             data_dir=data_dir,
-             dataset=dataset,
-         )
-         datasets = Parallel(n_jobs=-1, prefer="threads", verbose=1)(
-             delayed(self.get_base_dataset_from_bids_file)(
-                 bids_dataset=bids_dataset,
-                 bids_file=bids_file,
-                 s3_bucket=s3_bucket,
-                 description_fields=description_fields,
-                 **kwargs,
-             )
-             for bids_file in bids_dataset.get_files()
-         )
-         return datasets

-     def get_base_dataset_from_bids_file(
-         self,
-         bids_dataset: "EEGBIDSDataset",
-         bids_file: str,
-         s3_bucket: str | None,
-         description_fields: list[str],
-         **kwargs,
-     ) -> "EEGDashBaseDataset":
-         """Instantiate a single EEGDashBaseDataset given a local BIDS file (metadata only)."""
-         record = self.eeg_dash.load_eeg_attrs_from_bids_file(bids_dataset, bids_file)
-         description = {}
-         for field in description_fields:
-             value = self.find_key_in_nested_dict(record, field)
-             if value is not None:
-                 description[field] = value
-         return EEGDashBaseDataset(
-             record,
-             self.cache_dir,
-             s3_bucket,
-             description=description,
-             **kwargs,
-         )
+ __all__ = ["EEGDash"]
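Taken together, the reworked add_bids_dataset() and the _json_default() fallback turn ingestion into a two-step flow: collect a JSON manifest locally, then persist it separately. A hedged end-to-end sketch; the local dataset path is a placeholder:

    from eegdash.api import EEGDash

    eegdash = EEGDash()

    # Scan a local BIDS tree and write the manifest to disk; numpy/pandas/Path
    # values inside the records are serialized through _json_default.
    manifest = eegdash.add_bids_dataset(
        dataset="ds002718",
        data_dir="/data/ds002718",  # placeholder path
        overwrite=False,            # skip records already in the database
        output_path="ds002718_manifest.json",
    )

    print(manifest["record_count"], len(manifest.get("skipped", [])))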