PyPI - juniper-data - Versions diffs - 0.4.2__py3-none-any.whl - Mend

juniper-data 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (95) hide show

juniper_data/__init__.py +88 -0
juniper_data/__main__.py +78 -0
juniper_data/api/__init__.py +10 -0
juniper_data/api/app.py +111 -0
juniper_data/api/middleware.py +95 -0
juniper_data/api/routes/__init__.py +9 -0
juniper_data/api/routes/datasets.py +414 -0
juniper_data/api/routes/generators.py +125 -0
juniper_data/api/routes/health.py +49 -0
juniper_data/api/security.py +238 -0
juniper_data/api/settings.py +109 -0
juniper_data/core/__init__.py +32 -0
juniper_data/core/artifacts.py +63 -0
juniper_data/core/dataset_id.py +38 -0
juniper_data/core/models.py +135 -0
juniper_data/core/split.py +120 -0
juniper_data/generators/__init__.py +15 -0
juniper_data/generators/arc_agi/__init__.py +11 -0
juniper_data/generators/arc_agi/generator.py +229 -0
juniper_data/generators/arc_agi/params.py +56 -0
juniper_data/generators/checkerboard/__init__.py +15 -0
juniper_data/generators/checkerboard/generator.py +114 -0
juniper_data/generators/checkerboard/params.py +32 -0
juniper_data/generators/circles/__init__.py +11 -0
juniper_data/generators/circles/generator.py +112 -0
juniper_data/generators/circles/params.py +31 -0
juniper_data/generators/csv_import/__init__.py +15 -0
juniper_data/generators/csv_import/generator.py +198 -0
juniper_data/generators/csv_import/params.py +48 -0
juniper_data/generators/gaussian/__init__.py +11 -0
juniper_data/generators/gaussian/generator.py +149 -0
juniper_data/generators/gaussian/params.py +53 -0
juniper_data/generators/mnist/__init__.py +11 -0
juniper_data/generators/mnist/generator.py +124 -0
juniper_data/generators/mnist/params.py +39 -0
juniper_data/generators/spiral/__init__.py +57 -0
juniper_data/generators/spiral/defaults.py +39 -0
juniper_data/generators/spiral/generator.py +206 -0
juniper_data/generators/spiral/params.py +148 -0
juniper_data/generators/xor/__init__.py +11 -0
juniper_data/generators/xor/generator.py +162 -0
juniper_data/generators/xor/params.py +30 -0
juniper_data/storage/__init__.py +120 -0
juniper_data/storage/base.py +279 -0
juniper_data/storage/cached.py +211 -0
juniper_data/storage/hf_store.py +257 -0
juniper_data/storage/kaggle_store.py +333 -0
juniper_data/storage/local_fs.py +232 -0
juniper_data/storage/memory.py +136 -0
juniper_data/storage/postgres_store.py +373 -0
juniper_data/storage/redis_store.py +264 -0
juniper_data/tests/__init__.py +1 -0
juniper_data/tests/conftest.py +68 -0
juniper_data/tests/fixtures/generate_golden_datasets.py +199 -0
juniper_data/tests/integration/__init__.py +1 -0
juniper_data/tests/integration/test_api.py +283 -0
juniper_data/tests/integration/test_e2e_workflow.py +378 -0
juniper_data/tests/integration/test_lifecycle_api.py +304 -0
juniper_data/tests/integration/test_security_integration.py +189 -0
juniper_data/tests/integration/test_storage_workflow.py +259 -0
juniper_data/tests/performance/__init__.py +1 -0
juniper_data/tests/performance/test_generator_benchmarks.py +178 -0
juniper_data/tests/performance/test_storage_benchmarks.py +257 -0
juniper_data/tests/unit/__init__.py +1 -0
juniper_data/tests/unit/test_api_app.py +206 -0
juniper_data/tests/unit/test_api_routes.py +407 -0
juniper_data/tests/unit/test_api_settings.py +100 -0
juniper_data/tests/unit/test_arc_agi_generator.py +525 -0
juniper_data/tests/unit/test_artifacts.py +145 -0
juniper_data/tests/unit/test_cached_store.py +423 -0
juniper_data/tests/unit/test_checkerboard_generator.py +232 -0
juniper_data/tests/unit/test_circles_generator.py +256 -0
juniper_data/tests/unit/test_csv_import_generator.py +345 -0
juniper_data/tests/unit/test_dataset_id.py +181 -0
juniper_data/tests/unit/test_gaussian_generator.py +333 -0
juniper_data/tests/unit/test_hf_store.py +416 -0
juniper_data/tests/unit/test_init.py +93 -0
juniper_data/tests/unit/test_kaggle_store.py +469 -0
juniper_data/tests/unit/test_lifecycle.py +394 -0
juniper_data/tests/unit/test_main.py +127 -0
juniper_data/tests/unit/test_middleware.py +79 -0
juniper_data/tests/unit/test_mnist_generator.py +370 -0
juniper_data/tests/unit/test_postgres_store.py +490 -0
juniper_data/tests/unit/test_redis_store.py +500 -0
juniper_data/tests/unit/test_security.py +281 -0
juniper_data/tests/unit/test_security_boundaries.py +517 -0
juniper_data/tests/unit/test_spiral_generator.py +566 -0
juniper_data/tests/unit/test_split.py +245 -0
juniper_data/tests/unit/test_storage.py +767 -0
juniper_data/tests/unit/test_xor_generator.py +223 -0
juniper_data-0.4.2.dist-info/METADATA +216 -0
juniper_data-0.4.2.dist-info/RECORD +95 -0
juniper_data-0.4.2.dist-info/WHEEL +5 -0
juniper_data-0.4.2.dist-info/licenses/LICENSE +9 -0
juniper_data-0.4.2.dist-info/top_level.txt +1 -0

juniper_data/storage/kaggle_store.py ADDED Viewed

@@ -0,0 +1,333 @@
+"""Kaggle datasets integration for downloading and caching datasets."""
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import Any
+import numpy as np
+from juniper_data.core.models import DatasetMeta
+from .base import DatasetStore
+from .memory import InMemoryDatasetStore
+try:
+    from kaggle.api.kaggle_api_extended import KaggleApi
+    KAGGLE_AVAILABLE = True
+except ImportError:
+    KAGGLE_AVAILABLE = False
+    KaggleApi = None  # type: ignore[assignment, misc]
+class KaggleDatasetStore(DatasetStore):
+    """Kaggle API integration for downloading datasets.
+    Downloads datasets from Kaggle and caches them locally.
+    Primarily used as a data source, not for persistent storage.
+    Every DatasetStore operation (save, get_meta, exists, ...) is delegated
+    to the cache store supplied at construction time.
+    Requires the `kaggle` package: pip install kaggle
+    Also requires Kaggle API credentials in ~/.kaggle/kaggle.json
+    or via KAGGLE_USERNAME and KAGGLE_KEY environment variables.
+    """
+    def __init__(
+        self,
+        download_path: Path | None = None,
+        cache_store: DatasetStore | None = None,
+        auto_authenticate: bool = True,
+    ) -> None:
+        """Initialize the Kaggle store.
+        Args:
+            download_path: Path for downloading and extracting datasets.
+                Defaults to ./data/kaggle; the directory is created if missing.
+            cache_store: Optional store for caching loaded datasets.
+                Defaults to a fresh InMemoryDatasetStore.
+            auto_authenticate: Automatically authenticate with Kaggle API.
+        Raises:
+            ImportError: If kaggle package is not installed.
+        """
+        if not KAGGLE_AVAILABLE:
+            raise ImportError("Kaggle package not installed. Install with: pip install kaggle")
+        # Path(...) also tolerates callers that pass a plain string.
+        self._download_path = Path(download_path) if download_path is not None else Path("./data/kaggle")
+        self._download_path.mkdir(parents=True, exist_ok=True)
+        # Compare against None so a store that happens to be falsy is still honoured.
+        self._cache_store = cache_store if cache_store is not None else InMemoryDatasetStore()
+        self._api: Any | None = None
+        if auto_authenticate:
+            self._authenticate()
+    def _authenticate(self) -> None:
+        """Create the Kaggle API client and authenticate it."""
+        self._api = KaggleApi()
+        self._api.authenticate()
+    def download_dataset(
+        self,
+        dataset_ref: str,
+        unzip: bool = True,
+        force: bool = False,
+    ) -> Path:
+        """Download a dataset from Kaggle.
+        Args:
+            dataset_ref: Dataset reference in format "owner/dataset-name".
+            unzip: Whether to unzip downloaded files.
+            force: Force re-download even if already exists.
+        Returns:
+            Path to the downloaded/extracted dataset directory.
+        Raises:
+            RuntimeError: If authentication failed or API not available.
+        """
+        if self._api is None:
+            raise RuntimeError("Kaggle API not authenticated. Call _authenticate() first.")
+        dataset_path = self._download_path / dataset_ref.replace("/", "_")
+        # An empty directory is what an interrupted earlier download leaves
+        # behind, so only a populated directory counts as already downloaded.
+        if not force and dataset_path.is_dir() and any(dataset_path.iterdir()):
+            return dataset_path
+        dataset_path.mkdir(parents=True, exist_ok=True)
+        self._api.dataset_download_files(
+            dataset_ref,
+            path=str(dataset_path),
+            unzip=unzip,
+            force=force,
+        )
+        return dataset_path
+    @staticmethod
+    def _resolve_csv_path(dataset_path: Path, file_name: str) -> Path:
+        """Return the requested file, or the first CSV (in sorted order) as a fallback."""
+        file_path = dataset_path / file_name
+        if file_path.exists():
+            return file_path
+        # Sorted so the fallback choice does not depend on directory listing order;
+        # directories are skipped so one named "x.csv" can never be selected.
+        all_files = sorted(p for p in dataset_path.rglob("*") if p.is_file())
+        csv_files = [p for p in all_files if p.suffix.lower() == ".csv"]
+        if csv_files:
+            return csv_files[0]
+        raise FileNotFoundError(
+            f"File '{file_name}' not found in dataset. Available files: {[f.name for f in all_files]}"
+        )
+    @staticmethod
+    def _read_csv_rows(file_path: Path, delimiter: str) -> tuple[list[str], list[dict[str, Any]]]:
+        """Read a delimited text file and return (header column names, row dicts)."""
+        import csv
+        # newline="" is what the csv module asks for so quoted line breaks survive.
+        with open(file_path, encoding="utf-8", newline="") as f:
+            reader = csv.DictReader(f, delimiter=delimiter)
+            rows = list(reader)
+            # dict.fromkeys de-duplicates repeated header names while keeping order.
+            columns = list(dict.fromkeys(reader.fieldnames or []))
+        return columns, rows
+    @staticmethod
+    def _to_float(value: Any) -> float:
+        """Convert a CSV cell to float; unparsable or missing cells become 0.0."""
+        try:
+            return float(value)
+        except (ValueError, TypeError):
+            return 0.0
+    def load_kaggle_dataset(
+        self,
+        dataset_ref: str,
+        file_name: str,
+        feature_columns: list[str] | None = None,
+        label_column: str = "label",
+        delimiter: str = ",",
+        n_samples: int | None = None,
+        seed: int | None = None,
+        one_hot_labels: bool = True,
+        normalize_features: bool = False,
+        train_ratio: float = 0.8,
+    ) -> tuple[str, DatasetMeta, dict[str, np.ndarray]]:
+        """Download and load a CSV dataset from Kaggle.
+        The loaded dataset is also saved to the cache store under the
+        returned dataset_id.
+        Args:
+            dataset_ref: Dataset reference in format "owner/dataset-name".
+            file_name: Name of the CSV file within the dataset. If it is
+                missing, the first CSV found in the dataset is used instead.
+            feature_columns: Column names for features (None = every header
+                column except the label column).
+            label_column: Column name for labels.
+            delimiter: CSV delimiter.
+            n_samples: Optional limit on number of samples.
+            seed: Random seed for shuffling (None = keep file order).
+            one_hot_labels: One-hot encode labels.
+            normalize_features: Normalize features to [0, 1].
+            train_ratio: Ratio for train/test split.
+        Returns:
+            Tuple of (dataset_id, metadata, arrays).
+        Raises:
+            RuntimeError: If the Kaggle API is not authenticated.
+            FileNotFoundError: If the dataset contains no usable CSV file.
+            ValueError: If train_ratio is outside [0, 1], the file has no
+                rows, the label column is absent, or n_samples leaves no rows.
+        """
+        # Validate before touching the network so bad arguments fail fast.
+        if not 0.0 <= train_ratio <= 1.0:
+            raise ValueError(f"train_ratio must be between 0 and 1, got {train_ratio}")
+        dataset_path = self.download_dataset(dataset_ref)
+        file_path = self._resolve_csv_path(dataset_path, file_name)
+        all_columns, data = self._read_csv_rows(file_path, delimiter)
+        if not data:
+            raise ValueError("No data found in CSV file")
+        # Without this check every label would silently become the string "None".
+        if label_column not in all_columns:
+            raise ValueError(
+                f"Label column '{label_column}' not found in CSV file. Available columns: {all_columns}"
+            )
+        if seed is not None:
+            import random
+            # A private generator yields the same order as random.seed(seed)
+            # followed by random.shuffle, without reseeding the global RNG.
+            random.Random(seed).shuffle(data)
+        if n_samples is not None:
+            data = data[:n_samples]
+            if not data:
+                raise ValueError(f"n_samples={n_samples} leaves no rows to load")
+        if feature_columns is None:
+            feature_columns = [c for c in all_columns if c != label_column]
+        features = [[self._to_float(row.get(col, 0)) for col in feature_columns] for row in data]
+        labels = [row.get(label_column) for row in data]
+        X = np.array(features, dtype=np.float32)
+        if normalize_features:
+            X_min = X.min(axis=0, keepdims=True)
+            X_max = X.max(axis=0, keepdims=True)
+            X_range = X_max - X_min
+            # Constant columns would divide by zero; a range of 1 maps them to 0.
+            X_range[X_range == 0] = 1
+            X = (X - X_min) / X_range
+        # Stringify before de-duplicating so None and "None" cannot yield two
+        # identical class names.
+        unique_labels = sorted({str(lbl) for lbl in labels})
+        label_to_idx = {label: idx for idx, label in enumerate(unique_labels)}
+        n_classes = len(unique_labels)
+        label_indices = np.array([label_to_idx[str(lbl)] for lbl in labels], dtype=np.intp)
+        if one_hot_labels:
+            y = np.zeros((len(labels), n_classes), dtype=np.float32)
+            y[np.arange(len(labels)), label_indices] = 1.0
+        else:
+            y = label_indices.astype(np.float32).reshape(-1, 1)
+        # Keys are the class indices (as strings), not the original label text.
+        class_distribution = {str(i): int((label_indices == i).sum()) for i in range(n_classes)}
+        n_train = int(len(X) * train_ratio)
+        X_train, X_test = X[:n_train], X[n_train:]
+        y_train, y_test = y[:n_train], y[n_train:]
+        # NOTE(review): the id depends only on dataset_ref and the row count, so
+        # loads that differ in file, seed or encoding options overwrite each
+        # other in the cache store - confirm whether that is intended.
+        dataset_id = f"kaggle-{dataset_ref.replace('/', '-')}-{len(X)}"
+        meta = DatasetMeta(
+            dataset_id=dataset_id,
+            generator="kaggle",
+            generator_version="1.0.0",
+            params={
+                "dataset_ref": dataset_ref,
+                "file_name": file_name,
+                "n_samples": len(X),
+                "seed": seed,
+                "normalize_features": normalize_features,
+                "one_hot_labels": one_hot_labels,
+            },
+            n_samples=len(X),
+            n_features=X.shape[1],
+            n_classes=n_classes,
+            n_train=n_train,
+            n_test=len(X) - n_train,
+            class_distribution=class_distribution,
+            created_at=datetime.now(UTC),
+            tags=["kaggle", dataset_ref.split("/")[0]],
+        )
+        arrays = {
+            "X_train": X_train,
+            "y_train": y_train,
+            "X_test": X_test,
+            "y_test": y_test,
+            "X_full": X,
+            "y_full": y,
+        }
+        self._cache_store.save(dataset_id, meta, arrays)
+        return dataset_id, meta, arrays
+    def list_competitions(self, search: str | None = None) -> list[dict]:
+        """List available Kaggle competitions.
+        Args:
+            search: Optional search term.
+        Returns:
+            List of competition info dictionaries with the keys
+            "ref", "title", "deadline" and "category".
+        Raises:
+            RuntimeError: If the Kaggle API is not authenticated.
+        """
+        if self._api is None:
+            raise RuntimeError("Kaggle API not authenticated.")
+        competitions = self._api.competitions_list(search=search)
+        return [
+            {
+                "ref": c.ref,
+                "title": c.title,
+                "deadline": c.deadline,
+                "category": c.category,
+            }
+            for c in competitions
+        ]
+    def list_kaggle_datasets(self, search: str | None = None, page: int = 1) -> list[dict]:
+        """List available Kaggle datasets.
+        Args:
+            search: Optional search term.
+            page: Page number for pagination.
+        Returns:
+            List of dataset info dictionaries with the keys
+            "ref", "title", "size" and "lastUpdated".
+        Raises:
+            RuntimeError: If the Kaggle API is not authenticated.
+        """
+        if self._api is None:
+            raise RuntimeError("Kaggle API not authenticated.")
+        datasets = self._api.dataset_list(search=search, page=page)
+        return [
+            {
+                "ref": d.ref,
+                "title": d.title,
+                "size": d.totalBytes,
+                "lastUpdated": d.lastUpdated,
+            }
+            for d in datasets
+        ]
+    def save(
+        self,
+        dataset_id: str,
+        meta: DatasetMeta,
+        arrays: dict[str, np.ndarray],
+    ) -> None:
+        """Save to cache store."""
+        self._cache_store.save(dataset_id, meta, arrays)
+    def get_meta(self, dataset_id: str) -> DatasetMeta | None:
+        """Get from cache store."""
+        return self._cache_store.get_meta(dataset_id)
+    def get_artifact_bytes(self, dataset_id: str) -> bytes | None:
+        """Get from cache store."""
+        return self._cache_store.get_artifact_bytes(dataset_id)
+    def exists(self, dataset_id: str) -> bool:
+        """Check cache store."""
+        return self._cache_store.exists(dataset_id)
+    def delete(self, dataset_id: str) -> bool:
+        """Delete from cache store."""
+        return self._cache_store.delete(dataset_id)
+    def list_datasets(self, limit: int = 100, offset: int = 0) -> list[str]:
+        """List from cache store."""
+        return self._cache_store.list_datasets(limit, offset)
+    def update_meta(self, dataset_id: str, meta: DatasetMeta) -> bool:
+        """Update in cache store."""
+        return self._cache_store.update_meta(dataset_id, meta)
+    def list_all_metadata(self) -> list[DatasetMeta]:
+        """List from cache store."""
+        return self._cache_store.list_all_metadata()

juniper_data/storage/local_fs.py ADDED Viewed

@@ -0,0 +1,232 @@
+"""Local filesystem dataset store."""
+import io
+import json
+import logging
+from datetime import datetime
+from pathlib import Path
+# from typing import Any, Dict, List, Optional
+from typing import Any
+import numpy as np
+from juniper_data.core.models import DatasetMeta
+from juniper_data.storage.base import DatasetStore
+def _json_serializer(obj: Any) -> str:
+    """Fallback encoder for ``json.dumps(default=...)``.
+    Datetime values are rendered as ISO 8601 strings; any other type is
+    rejected with a TypeError.
+    """
+    if not isinstance(obj, datetime):
+        raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")
+    return obj.isoformat()
+class LocalFSDatasetStore(DatasetStore):
+    """Local filesystem implementation of DatasetStore.
+    Stores datasets as JSON metadata files and NPZ array files.
+    Storage layout:
+        {base_path}/{dataset_id}.meta.json
+        {base_path}/{dataset_id}.npz
+    Files are written to a ``.tmp`` sibling first and then moved into place,
+    so readers never observe a half-written file.
+    """
+    # Suffix that marks a metadata file; also used to recover the dataset id.
+    _META_SUFFIX = ".meta.json"
+    def __init__(self, base_path: Path) -> None:
+        """Initialize the local filesystem store.
+        Args:
+            base_path: Base directory for storing datasets. Created if it doesn't exist.
+        """
+        self._base_path = Path(base_path)
+        self._base_path.mkdir(parents=True, exist_ok=True)
+    def _meta_path(self, dataset_id: str) -> Path:
+        """Get path to metadata file."""
+        return self._base_path / f"{dataset_id}{self._META_SUFFIX}"
+    def _npz_path(self, dataset_id: str) -> Path:
+        """Get path to NPZ file."""
+        return self._base_path / f"{dataset_id}.npz"
+    @staticmethod
+    def _tmp_path(path: Path) -> Path:
+        """Get the temporary sibling path used while writing ``path``."""
+        return path.with_suffix(path.suffix + ".tmp")
+    @staticmethod
+    def _serialize_meta(meta: DatasetMeta) -> str:
+        """Render metadata as indented JSON text."""
+        return json.dumps(
+            meta.model_dump(),
+            default=_json_serializer,
+            indent=2,
+        )
+    @staticmethod
+    def _discard_tmp(tmp_path: Path, description: str) -> None:
+        """Best-effort removal of a temporary file after a failed write."""
+        try:
+            tmp_path.unlink(missing_ok=True)
+        except OSError:
+            logging.debug(
+                "Failed to remove temporary %s file %s during cleanup",
+                description,
+                tmp_path,
+                exc_info=True,
+            )
+    def _dataset_id_from_meta_file(self, meta_file: Path) -> str:
+        """Recover the dataset id from a metadata file name.
+        Only the trailing ".meta.json" is stripped, so ids that themselves
+        contain ".meta" (e.g. "run.meta.v2") are returned intact.
+        """
+        return meta_file.name.removesuffix(self._META_SUFFIX)
+    def save(
+        self,
+        dataset_id: str,
+        meta: DatasetMeta,
+        arrays: dict[str, np.ndarray],
+    ) -> None:
+        """Save dataset metadata and arrays to filesystem.
+        Args:
+            dataset_id: Unique identifier for the dataset.
+            meta: Dataset metadata.
+            arrays: Dictionary of numpy arrays.
+        Raises:
+            IOError: If the save operation fails.
+        """
+        meta_path = self._meta_path(dataset_id)
+        npz_path = self._npz_path(dataset_id)
+        # Write to temporary files first, then atomically replace the final files
+        tmp_meta_path = self._tmp_path(meta_path)
+        tmp_npz_path = self._tmp_path(npz_path)
+        meta_json = self._serialize_meta(meta)
+        try:
+            # Write metadata JSON to temporary file
+            tmp_meta_path.write_text(meta_json, encoding="utf-8")
+            # Write NPZ data to temporary file
+            buffer = io.BytesIO()
+            np.savez_compressed(buffer, **arrays)  # type: ignore[arg-type]  # numpy stubs incomplete for **kwargs
+            tmp_npz_path.write_bytes(buffer.getvalue())
+            # Atomically replace final files with the temporary ones.
+            # Write NPZ first so we never have metadata without its NPZ.
+            tmp_npz_path.replace(npz_path)
+            tmp_meta_path.replace(meta_path)
+        except Exception:
+            # Best-effort cleanup of temporary files on failure
+            self._discard_tmp(tmp_meta_path, "metadata")
+            self._discard_tmp(tmp_npz_path, "NPZ")
+            raise
+    def get_meta(self, dataset_id: str) -> DatasetMeta | None:
+        """Get dataset metadata from filesystem.
+        Args:
+            dataset_id: Unique identifier for the dataset.
+        Returns:
+            Dataset metadata if found, None otherwise.
+        """
+        # Read directly instead of checking exists() first, so a concurrent
+        # delete between the check and the read cannot raise.
+        try:
+            meta_json = self._meta_path(dataset_id).read_text(encoding="utf-8")
+        except (FileNotFoundError, NotADirectoryError):
+            return None
+        return DatasetMeta(**json.loads(meta_json))
+    def get_artifact_bytes(self, dataset_id: str) -> bytes | None:
+        """Get dataset artifact as NPZ bytes.
+        Args:
+            dataset_id: Unique identifier for the dataset.
+        Returns:
+            NPZ file contents as bytes if found, None otherwise.
+        """
+        try:
+            return self._npz_path(dataset_id).read_bytes()
+        except (FileNotFoundError, NotADirectoryError):
+            return None
+    def exists(self, dataset_id: str) -> bool:
+        """Check if dataset exists on filesystem.
+        Args:
+            dataset_id: Unique identifier for the dataset.
+        Returns:
+            True if both metadata and NPZ files exist, False otherwise.
+        """
+        return self._meta_path(dataset_id).exists() and self._npz_path(dataset_id).exists()
+    def delete(self, dataset_id: str) -> bool:
+        """Delete dataset from filesystem.
+        Args:
+            dataset_id: Unique identifier for the dataset.
+        Returns:
+            True if the dataset was deleted, False if it didn't exist.
+        """
+        removed = False
+        # Metadata goes first so a partially deleted dataset is no longer listed.
+        for path in (self._meta_path(dataset_id), self._npz_path(dataset_id)):
+            try:
+                path.unlink()
+            except (FileNotFoundError, NotADirectoryError):
+                continue
+            removed = True
+        return removed
+    def list_datasets(self, limit: int = 100, offset: int = 0) -> list[str]:
+        """List dataset IDs from filesystem.
+        Finds datasets by globbing for .meta.json files.
+        Args:
+            limit: Maximum number of dataset IDs to return.
+            offset: Number of dataset IDs to skip.
+        Returns:
+            List of dataset IDs, in sorted file-name order.
+        """
+        meta_files = sorted(self._base_path.glob(f"*{self._META_SUFFIX}"))
+        dataset_ids = [self._dataset_id_from_meta_file(f) for f in meta_files]
+        return dataset_ids[offset : offset + limit]
+    @property
+    def base_path(self) -> Path:
+        """Get the base storage path."""
+        return self._base_path
+    def update_meta(self, dataset_id: str, meta: DatasetMeta) -> bool:
+        """Update dataset metadata on filesystem.
+        Args:
+            dataset_id: Unique identifier for the dataset.
+            meta: Updated dataset metadata.
+        Returns:
+            True if the dataset was updated, False if it didn't exist.
+        """
+        meta_path = self._meta_path(dataset_id)
+        if not meta_path.exists():
+            return False
+        meta_json = self._serialize_meta(meta)
+        # Same temp-file-then-replace scheme as save(), so a crash mid-write
+        # cannot leave a truncated metadata file behind.
+        tmp_meta_path = self._tmp_path(meta_path)
+        try:
+            tmp_meta_path.write_text(meta_json, encoding="utf-8")
+            tmp_meta_path.replace(meta_path)
+        except Exception:
+            self._discard_tmp(tmp_meta_path, "metadata")
+            raise
+        return True
+    def list_all_metadata(self) -> list[DatasetMeta]:
+        """List all dataset metadata from filesystem.
+        Returns:
+            List of all DatasetMeta objects, in sorted file-name order.
+        """
+        result = []
+        # Sorted so the order matches list_datasets() and is stable across runs.
+        for meta_file in sorted(self._base_path.glob(f"*{self._META_SUFFIX}")):
+            meta = self.get_meta(self._dataset_id_from_meta_file(meta_file))
+            if meta is not None:
+                result.append(meta)
+        return result

juniper_data/storage/memory.py ADDED Viewed

@@ -0,0 +1,136 @@
+"""In-memory dataset store for testing and development."""
+import io
+import numpy as np
+from juniper_data.core.models import DatasetMeta
+from juniper_data.storage.base import DatasetStore
+class InMemoryDatasetStore(DatasetStore):
+    """DatasetStore backed by plain dictionaries held in process memory.
+    Intended for tests and local development: nothing is persisted, so all
+    datasets disappear when the process exits.
+    """
+    def __init__(self) -> None:
+        """Create an empty store."""
+        self._metadata: dict[str, DatasetMeta] = {}
+        self._arrays: dict[str, dict[str, np.ndarray]] = {}
+    def save(
+        self,
+        dataset_id: str,
+        meta: DatasetMeta,
+        arrays: dict[str, np.ndarray],
+    ) -> None:
+        """Store metadata and a private copy of every array.
+        Args:
+            dataset_id: Unique identifier for the dataset.
+            meta: Dataset metadata (kept by reference).
+            arrays: Dictionary of numpy arrays; each one is copied.
+        """
+        self._metadata[dataset_id] = meta
+        snapshot: dict[str, np.ndarray] = {}
+        for name, array in arrays.items():
+            # Copy so later mutation by the caller cannot alter stored data.
+            snapshot[name] = array.copy()
+        self._arrays[dataset_id] = snapshot
+    def get_meta(self, dataset_id: str) -> DatasetMeta | None:
+        """Look up the metadata stored for a dataset.
+        Args:
+            dataset_id: Unique identifier for the dataset.
+        Returns:
+            The stored metadata, or None when the id is unknown.
+        """
+        return self._metadata.get(dataset_id)
+    def get_artifact_bytes(self, dataset_id: str) -> bytes | None:
+        """Serialize the stored arrays into a compressed NPZ archive.
+        Args:
+            dataset_id: Unique identifier for the dataset.
+        Returns:
+            NPZ archive bytes, or None when the id is unknown.
+        """
+        stored = self._arrays.get(dataset_id)
+        if stored is None:
+            return None
+        # Archive members are written in key order so the bytes do not
+        # depend on the order in which the dict was built.
+        ordered = dict(sorted(stored.items(), key=lambda item: item[0]))
+        sink = io.BytesIO()
+        np.savez_compressed(sink, **ordered)
+        return sink.getvalue()
+    def exists(self, dataset_id: str) -> bool:
+        """Report whether metadata is stored for the given id.
+        Args:
+            dataset_id: Unique identifier for the dataset.
+        Returns:
+            True if the dataset exists, False otherwise.
+        """
+        return dataset_id in self._metadata
+    def delete(self, dataset_id: str) -> bool:
+        """Remove a dataset's metadata and arrays.
+        Args:
+            dataset_id: Unique identifier for the dataset.
+        Returns:
+            True if something was removed, False if the id was unknown.
+        """
+        if dataset_id in self._metadata:
+            self._metadata.pop(dataset_id)
+            self._arrays.pop(dataset_id)
+            return True
+        return False
+    def list_datasets(self, limit: int = 100, offset: int = 0) -> list[str]:
+        """Return one page of dataset ids in sorted order.
+        Args:
+            limit: Maximum number of dataset IDs to return.
+            offset: Number of dataset IDs to skip.
+        Returns:
+            List of dataset IDs.
+        """
+        ordered_ids = sorted(self._metadata)
+        stop = offset + limit
+        return ordered_ids[offset:stop]
+    def clear(self) -> None:
+        """Drop every stored dataset. Handy for test teardown."""
+        for table in (self._metadata, self._arrays):
+            table.clear()
+    def update_meta(self, dataset_id: str, meta: DatasetMeta) -> bool:
+        """Replace the metadata of an existing dataset.
+        Args:
+            dataset_id: Unique identifier for the dataset.
+            meta: Updated dataset metadata.
+        Returns:
+            True if the dataset was updated, False if it didn't exist.
+        """
+        if dataset_id in self._metadata:
+            self._metadata[dataset_id] = meta
+            return True
+        return False
+    def list_all_metadata(self) -> list[DatasetMeta]:
+        """Return the metadata of every stored dataset.
+        Returns:
+            List of all DatasetMeta objects.
+        """
+        return [*self._metadata.values()]