PyPI - juniper-data - Versions diffs - 0.4.2__py3-none-any.whl - Mend

juniper-data 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (95) hide show

juniper_data/__init__.py +88 -0
juniper_data/__main__.py +78 -0
juniper_data/api/__init__.py +10 -0
juniper_data/api/app.py +111 -0
juniper_data/api/middleware.py +95 -0
juniper_data/api/routes/__init__.py +9 -0
juniper_data/api/routes/datasets.py +414 -0
juniper_data/api/routes/generators.py +125 -0
juniper_data/api/routes/health.py +49 -0
juniper_data/api/security.py +238 -0
juniper_data/api/settings.py +109 -0
juniper_data/core/__init__.py +32 -0
juniper_data/core/artifacts.py +63 -0
juniper_data/core/dataset_id.py +38 -0
juniper_data/core/models.py +135 -0
juniper_data/core/split.py +120 -0
juniper_data/generators/__init__.py +15 -0
juniper_data/generators/arc_agi/__init__.py +11 -0
juniper_data/generators/arc_agi/generator.py +229 -0
juniper_data/generators/arc_agi/params.py +56 -0
juniper_data/generators/checkerboard/__init__.py +15 -0
juniper_data/generators/checkerboard/generator.py +114 -0
juniper_data/generators/checkerboard/params.py +32 -0
juniper_data/generators/circles/__init__.py +11 -0
juniper_data/generators/circles/generator.py +112 -0
juniper_data/generators/circles/params.py +31 -0
juniper_data/generators/csv_import/__init__.py +15 -0
juniper_data/generators/csv_import/generator.py +198 -0
juniper_data/generators/csv_import/params.py +48 -0
juniper_data/generators/gaussian/__init__.py +11 -0
juniper_data/generators/gaussian/generator.py +149 -0
juniper_data/generators/gaussian/params.py +53 -0
juniper_data/generators/mnist/__init__.py +11 -0
juniper_data/generators/mnist/generator.py +124 -0
juniper_data/generators/mnist/params.py +39 -0
juniper_data/generators/spiral/__init__.py +57 -0
juniper_data/generators/spiral/defaults.py +39 -0
juniper_data/generators/spiral/generator.py +206 -0
juniper_data/generators/spiral/params.py +148 -0
juniper_data/generators/xor/__init__.py +11 -0
juniper_data/generators/xor/generator.py +162 -0
juniper_data/generators/xor/params.py +30 -0
juniper_data/storage/__init__.py +120 -0
juniper_data/storage/base.py +279 -0
juniper_data/storage/cached.py +211 -0
juniper_data/storage/hf_store.py +257 -0
juniper_data/storage/kaggle_store.py +333 -0
juniper_data/storage/local_fs.py +232 -0
juniper_data/storage/memory.py +136 -0
juniper_data/storage/postgres_store.py +373 -0
juniper_data/storage/redis_store.py +264 -0
juniper_data/tests/__init__.py +1 -0
juniper_data/tests/conftest.py +68 -0
juniper_data/tests/fixtures/generate_golden_datasets.py +199 -0
juniper_data/tests/integration/__init__.py +1 -0
juniper_data/tests/integration/test_api.py +283 -0
juniper_data/tests/integration/test_e2e_workflow.py +378 -0
juniper_data/tests/integration/test_lifecycle_api.py +304 -0
juniper_data/tests/integration/test_security_integration.py +189 -0
juniper_data/tests/integration/test_storage_workflow.py +259 -0
juniper_data/tests/performance/__init__.py +1 -0
juniper_data/tests/performance/test_generator_benchmarks.py +178 -0
juniper_data/tests/performance/test_storage_benchmarks.py +257 -0
juniper_data/tests/unit/__init__.py +1 -0
juniper_data/tests/unit/test_api_app.py +206 -0
juniper_data/tests/unit/test_api_routes.py +407 -0
juniper_data/tests/unit/test_api_settings.py +100 -0
juniper_data/tests/unit/test_arc_agi_generator.py +525 -0
juniper_data/tests/unit/test_artifacts.py +145 -0
juniper_data/tests/unit/test_cached_store.py +423 -0
juniper_data/tests/unit/test_checkerboard_generator.py +232 -0
juniper_data/tests/unit/test_circles_generator.py +256 -0
juniper_data/tests/unit/test_csv_import_generator.py +345 -0
juniper_data/tests/unit/test_dataset_id.py +181 -0
juniper_data/tests/unit/test_gaussian_generator.py +333 -0
juniper_data/tests/unit/test_hf_store.py +416 -0
juniper_data/tests/unit/test_init.py +93 -0
juniper_data/tests/unit/test_kaggle_store.py +469 -0
juniper_data/tests/unit/test_lifecycle.py +394 -0
juniper_data/tests/unit/test_main.py +127 -0
juniper_data/tests/unit/test_middleware.py +79 -0
juniper_data/tests/unit/test_mnist_generator.py +370 -0
juniper_data/tests/unit/test_postgres_store.py +490 -0
juniper_data/tests/unit/test_redis_store.py +500 -0
juniper_data/tests/unit/test_security.py +281 -0
juniper_data/tests/unit/test_security_boundaries.py +517 -0
juniper_data/tests/unit/test_spiral_generator.py +566 -0
juniper_data/tests/unit/test_split.py +245 -0
juniper_data/tests/unit/test_storage.py +767 -0
juniper_data/tests/unit/test_xor_generator.py +223 -0
juniper_data-0.4.2.dist-info/METADATA +216 -0
juniper_data-0.4.2.dist-info/RECORD +95 -0
juniper_data-0.4.2.dist-info/WHEEL +5 -0
juniper_data-0.4.2.dist-info/licenses/LICENSE +9 -0
juniper_data-0.4.2.dist-info/top_level.txt +1 -0

juniper_data/tests/integration/test_e2e_workflow.py ADDED Viewed

@@ -0,0 +1,378 @@
+"""End-to-End integration tests for the complete JuniperData workflow.
+These tests verify the full flow:
+1. Start JuniperData service (via TestClient)
+2. Create dataset via REST API
+3. Download NPZ artifact
+4. Verify data integrity (shapes, dtypes, determinism)
+Marked with @pytest.mark.slow for weekly CI runs.
+"""
+import io
+import numpy as np
+import pytest
+from fastapi.testclient import TestClient
+from juniper_data.api.app import create_app
+from juniper_data.api.routes import datasets
+from juniper_data.api.settings import Settings
+from juniper_data.storage.memory import InMemoryDatasetStore
+@pytest.fixture
+def e2e_store() -> InMemoryDatasetStore:
+    """Create a fresh in-memory store for E2E tests."""
+    return InMemoryDatasetStore()
+@pytest.fixture
+def e2e_settings() -> Settings:
+    """Create E2E test settings."""
+    return Settings(storage_path="/tmp/juniper_data_e2e_test")
+@pytest.fixture
+def e2e_client(e2e_store: InMemoryDatasetStore, e2e_settings: Settings) -> TestClient:
+    """Create an E2E test client with in-memory storage.
+    The application is built from ``e2e_settings`` and the datasets routes
+    module is then pointed at ``e2e_store`` through ``datasets.set_store``.
+    NOTE(review): ``set_store`` is called on the imported routes module and is
+    not undone by this fixture - confirm no other test relies on an earlier store.
+    """
+    app = create_app(settings=e2e_settings)
+    # Inject the per-test store only after the app has been created.
+    datasets.set_store(e2e_store)
+    return TestClient(app)
+@pytest.mark.integration
+@pytest.mark.slow
+class TestE2EModernAlgorithm:
+    """E2E tests for the modern spiral generation algorithm."""
+    @pytest.fixture
+    def modern_request(self) -> dict:
+        """Request for modern algorithm spiral dataset."""
+        return {
+            "generator": "spiral",
+            "params": {
+                "n_spirals": 2,
+                "n_points_per_spiral": 100,
+                "seed": 42,
+                "algorithm": "modern",
+                "noise": 0.1,
+                "train_ratio": 0.8,
+                "test_ratio": 0.2,
+            },
+            "persist": True,
+        }
+    def test_e2e_create_download_verify_modern(self, e2e_client: TestClient, modern_request: dict) -> None:
+        """Complete E2E flow: create dataset, download NPZ, verify integrity."""
+        create_response = e2e_client.post("/v1/datasets", json=modern_request)
+        assert create_response.status_code == 201
+        dataset_id = create_response.json()["dataset_id"]
+        artifact_response = e2e_client.get(f"/v1/datasets/{dataset_id}/artifact")
+        assert artifact_response.status_code == 200
+        assert artifact_response.headers["content-type"] == "application/octet-stream"
+        with np.load(io.BytesIO(artifact_response.content)) as data:
+            assert "X_train" in data.files
+            assert "y_train" in data.files
+            assert "X_test" in data.files
+            assert "y_test" in data.files
+            assert "X_full" in data.files
+            assert "y_full" in data.files
+            X_train = data["X_train"]
+            y_train = data["y_train"]
+            X_test = data["X_test"]
+            y_test = data["y_test"]
+            X_full = data["X_full"]
+            y_full = data["y_full"]
+            assert X_train.dtype == np.float32
+            assert y_train.dtype == np.float32
+            assert X_test.dtype == np.float32
+            assert y_test.dtype == np.float32
+            assert X_full.dtype == np.float32
+            assert y_full.dtype == np.float32
+            n_total = 2 * 100
+            n_train = int(n_total * 0.8)
+            n_test = n_total - n_train
+            n_spirals = 2
+            assert X_train.shape == (n_train, 2)
+            assert y_train.shape == (n_train, n_spirals)
+            assert X_test.shape == (n_test, 2)
+            assert y_test.shape == (n_test, n_spirals)
+            assert X_full.shape == (n_total, 2)
+            assert y_full.shape == (n_total, n_spirals)
+    def test_e2e_deterministic_with_seed(self, e2e_client: TestClient, modern_request: dict) -> None:
+        """Same seed produces identical data (determinism verification)."""
+        create_response1 = e2e_client.post("/v1/datasets", json=modern_request)
+        dataset_id1 = create_response1.json()["dataset_id"]
+        artifact_response1 = e2e_client.get(f"/v1/datasets/{dataset_id1}/artifact")
+        modern_request["params"]["seed"] = 42
+        create_response2 = e2e_client.post("/v1/datasets", json=modern_request)
+        dataset_id2 = create_response2.json()["dataset_id"]
+        artifact_response2 = e2e_client.get(f"/v1/datasets/{dataset_id2}/artifact")
+        assert dataset_id1 == dataset_id2
+        with np.load(io.BytesIO(artifact_response1.content)) as data1:
+            with np.load(io.BytesIO(artifact_response2.content)) as data2:
+                np.testing.assert_array_equal(data1["X_full"], data2["X_full"])
+                np.testing.assert_array_equal(data1["y_full"], data2["y_full"])
+    def test_e2e_different_seed_different_data(self, e2e_client: TestClient, modern_request: dict) -> None:
+        """Different seeds produce different data."""
+        modern_request["params"]["seed"] = 42
+        create_response1 = e2e_client.post("/v1/datasets", json=modern_request)
+        dataset_id1 = create_response1.json()["dataset_id"]
+        artifact_response1 = e2e_client.get(f"/v1/datasets/{dataset_id1}/artifact")
+        modern_request["params"]["seed"] = 123
+        create_response2 = e2e_client.post("/v1/datasets", json=modern_request)
+        dataset_id2 = create_response2.json()["dataset_id"]
+        artifact_response2 = e2e_client.get(f"/v1/datasets/{dataset_id2}/artifact")
+        assert dataset_id1 != dataset_id2
+        with np.load(io.BytesIO(artifact_response1.content)) as data1:
+            with np.load(io.BytesIO(artifact_response2.content)) as data2:
+                assert not np.array_equal(data1["X_full"], data2["X_full"])
+@pytest.mark.integration
+@pytest.mark.slow
+class TestE2ELegacyCascorAlgorithm:
+    """E2E tests for the legacy_cascor spiral generation algorithm."""
+    @pytest.fixture
+    def legacy_request(self) -> dict:
+        """Request for legacy_cascor algorithm spiral dataset."""
+        return {
+            "generator": "spiral",
+            "params": {
+                "n_spirals": 2,
+                "n_points_per_spiral": 100,
+                "seed": 42,
+                "algorithm": "legacy_cascor",
+                "radius": 10.0,
+                "origin": [0.0, 0.0],
+                "noise": 0.1,
+                "train_ratio": 0.8,
+                "test_ratio": 0.2,
+            },
+            "persist": True,
+        }
+    def test_e2e_create_download_verify_legacy(self, e2e_client: TestClient, legacy_request: dict) -> None:
+        """Complete E2E flow for legacy_cascor algorithm."""
+        create_response = e2e_client.post("/v1/datasets", json=legacy_request)
+        assert create_response.status_code == 201
+        dataset_id = create_response.json()["dataset_id"]
+        artifact_response = e2e_client.get(f"/v1/datasets/{dataset_id}/artifact")
+        assert artifact_response.status_code == 200
+        with np.load(io.BytesIO(artifact_response.content)) as data:
+            expected_keys = ["X_train", "y_train", "X_test", "y_test", "X_full", "y_full"]
+            for key in expected_keys:
+                assert key in data.files, f"Missing key: {key}"
+            X_full = data["X_full"]
+            y_full = data["y_full"]
+            assert X_full.dtype == np.float32
+            assert y_full.dtype == np.float32
+            n_total = 2 * 100
+            assert X_full.shape == (n_total, 2)
+            assert y_full.shape == (n_total, 2)
+    def test_e2e_legacy_vs_modern_different(self, e2e_client: TestClient) -> None:
+        """Legacy and modern algorithms produce different data with same seed."""
+        base_params = {
+            "n_spirals": 2,
+            "n_points_per_spiral": 50,
+            "seed": 42,
+            "noise": 0.1,
+        }
+        modern_request = {
+            "generator": "spiral",
+            "params": {**base_params, "algorithm": "modern"},
+            "persist": True,
+        }
+        legacy_request = {
+            "generator": "spiral",
+            "params": {**base_params, "algorithm": "legacy_cascor", "radius": 10.0},
+            "persist": True,
+        }
+        modern_response = e2e_client.post("/v1/datasets", json=modern_request)
+        legacy_response = e2e_client.post("/v1/datasets", json=legacy_request)
+        modern_id = modern_response.json()["dataset_id"]
+        legacy_id = legacy_response.json()["dataset_id"]
+        assert modern_id != legacy_id
+        modern_artifact = e2e_client.get(f"/v1/datasets/{modern_id}/artifact")
+        legacy_artifact = e2e_client.get(f"/v1/datasets/{legacy_id}/artifact")
+        with np.load(io.BytesIO(modern_artifact.content)) as modern_data:
+            with np.load(io.BytesIO(legacy_artifact.content)) as legacy_data:
+                assert not np.array_equal(modern_data["X_full"], legacy_data["X_full"])
+@pytest.mark.integration
+@pytest.mark.slow
+class TestE2EDataContract:
+    """E2E tests verifying the NPZ data contract for consumers."""
+    @pytest.fixture
+    def contract_request(self) -> dict:
+        """Standard request for data contract verification."""
+        return {
+            "generator": "spiral",
+            "params": {
+                "n_spirals": 2,
+                "n_points_per_spiral": 50,
+                "seed": 12345,
+                "train_ratio": 0.7,
+                "test_ratio": 0.3,
+            },
+            "persist": True,
+        }
+    def test_e2e_npz_keys_contract(self, e2e_client: TestClient, contract_request: dict) -> None:
+        """Verify NPZ contains exactly the expected keys per data contract."""
+        create_response = e2e_client.post("/v1/datasets", json=contract_request)
+        dataset_id = create_response.json()["dataset_id"]
+        artifact_response = e2e_client.get(f"/v1/datasets/{dataset_id}/artifact")
+        with np.load(io.BytesIO(artifact_response.content)) as data:
+            expected_keys = {"X_train", "y_train", "X_test", "y_test", "X_full", "y_full"}
+            actual_keys = set(data.files)
+            assert actual_keys == expected_keys, f"Keys mismatch: expected {expected_keys}, got {actual_keys}"
+    def test_e2e_feature_dimensions(self, e2e_client: TestClient, contract_request: dict) -> None:
+        """Verify features have 2 dimensions (x, y coordinates)."""
+        create_response = e2e_client.post("/v1/datasets", json=contract_request)
+        dataset_id = create_response.json()["dataset_id"]
+        artifact_response = e2e_client.get(f"/v1/datasets/{dataset_id}/artifact")
+        with np.load(io.BytesIO(artifact_response.content)) as data:
+            assert data["X_train"].shape[1] == 2
+            assert data["X_test"].shape[1] == 2
+            assert data["X_full"].shape[1] == 2
+    def test_e2e_one_hot_labels(self, e2e_client: TestClient, contract_request: dict) -> None:
+        """Verify labels are one-hot encoded with correct class count."""
+        create_response = e2e_client.post("/v1/datasets", json=contract_request)
+        dataset_id = create_response.json()["dataset_id"]
+        artifact_response = e2e_client.get(f"/v1/datasets/{dataset_id}/artifact")
+        with np.load(io.BytesIO(artifact_response.content)) as data:
+            y_full = data["y_full"]
+            n_spirals = contract_request["params"]["n_spirals"]
+            assert y_full.shape[1] == n_spirals
+            row_sums = y_full.sum(axis=1)
+            np.testing.assert_array_almost_equal(row_sums, np.ones(len(y_full)))
+            assert set(np.unique(y_full)) == {0.0, 1.0}
+    def test_e2e_train_test_split_ratios(self, e2e_client: TestClient, contract_request: dict) -> None:
+        """Verify train/test split matches requested ratios."""
+        create_response = e2e_client.post("/v1/datasets", json=contract_request)
+        dataset_id = create_response.json()["dataset_id"]
+        artifact_response = e2e_client.get(f"/v1/datasets/{dataset_id}/artifact")
+        with np.load(io.BytesIO(artifact_response.content)) as data:
+            n_train = len(data["X_train"])
+            n_test = len(data["X_test"])
+            n_full = len(data["X_full"])
+            assert n_train + n_test == n_full
+            expected_train_ratio = 0.7
+            actual_train_ratio = n_train / n_full
+            assert abs(actual_train_ratio - expected_train_ratio) < 0.05
+    def test_e2e_metadata_consistency(self, e2e_client: TestClient, contract_request: dict) -> None:
+        """Verify metadata matches actual data dimensions."""
+        create_response = e2e_client.post("/v1/datasets", json=contract_request)
+        data = create_response.json()
+        dataset_id = data["dataset_id"]
+        meta = data["meta"]
+        artifact_response = e2e_client.get(f"/v1/datasets/{dataset_id}/artifact")
+        with np.load(io.BytesIO(artifact_response.content)) as npz_data:
+            assert meta["n_samples"] == len(npz_data["X_full"])
+            assert meta["n_train"] == len(npz_data["X_train"])
+            assert meta["n_test"] == len(npz_data["X_test"])
+            assert meta["n_features"] == npz_data["X_full"].shape[1]
+            assert meta["n_classes"] == npz_data["y_full"].shape[1]
+@pytest.mark.integration
+@pytest.mark.slow
+class TestE2EErrorHandling:
+    """E2E tests for error handling scenarios."""
+    def test_e2e_invalid_generator_name(self, e2e_client: TestClient) -> None:
+        """Invalid generator name returns error (400 or 404)."""
+        request = {
+            "generator": "nonexistent_generator",
+            "params": {},
+            "persist": True,
+        }
+        response = e2e_client.post("/v1/datasets", json=request)
+        assert response.status_code in (400, 404)
+        assert "detail" in response.json()
+    def test_e2e_invalid_params(self, e2e_client: TestClient) -> None:
+        """Invalid parameters return 400/422."""
+        request = {
+            "generator": "spiral",
+            "params": {
+                "n_spirals": -1,
+                "n_points_per_spiral": 100,
+            },
+            "persist": True,
+        }
+        response = e2e_client.post("/v1/datasets", json=request)
+        assert response.status_code in (400, 422)
+    def test_e2e_nonexistent_dataset_artifact(self, e2e_client: TestClient) -> None:
+        """Requesting artifact for nonexistent dataset returns 404."""
+        response = e2e_client.get("/v1/datasets/nonexistent-id-12345/artifact")
+        assert response.status_code == 404
+    def test_e2e_delete_and_verify_gone(self, e2e_client: TestClient) -> None:
+        """Deleted dataset cannot be retrieved."""
+        request = {
+            "generator": "spiral",
+            "params": {"n_spirals": 2, "n_points_per_spiral": 10, "seed": 1},
+            "persist": True,
+        }
+        create_response = e2e_client.post("/v1/datasets", json=request)
+        dataset_id = create_response.json()["dataset_id"]
+        get_response = e2e_client.get(f"/v1/datasets/{dataset_id}")
+        assert get_response.status_code == 200
+        delete_response = e2e_client.delete(f"/v1/datasets/{dataset_id}")
+        assert delete_response.status_code == 204
+        get_after_delete = e2e_client.get(f"/v1/datasets/{dataset_id}")
+        assert get_after_delete.status_code == 404
+        artifact_after_delete = e2e_client.get(f"/v1/datasets/{dataset_id}/artifact")
+        assert artifact_after_delete.status_code == 404

juniper_data/tests/integration/test_lifecycle_api.py ADDED Viewed

@@ -0,0 +1,304 @@
+"""Integration tests for dataset lifecycle management API endpoints (DATA-016).
+Tests for:
+- POST /v1/datasets with tags and TTL
+- GET /v1/datasets/filter
+- POST /v1/datasets/batch-delete
+- PATCH /v1/datasets/{id}/tags
+- GET /v1/datasets/stats
+- POST /v1/datasets/cleanup-expired
+"""
+from datetime import UTC, datetime, timedelta
+import pytest
+from fastapi.testclient import TestClient
+from juniper_data.api.app import create_app
+from juniper_data.api.routes import datasets
+from juniper_data.api.settings import Settings
+from juniper_data.storage.memory import InMemoryDatasetStore
+# from typing import Dict
+@pytest.fixture
+def lifecycle_store() -> InMemoryDatasetStore:
+    """Create a fresh in-memory store for lifecycle tests."""
+    return InMemoryDatasetStore()
+@pytest.fixture
+def lifecycle_settings() -> Settings:
+    """Create lifecycle test settings."""
+    return Settings(storage_path="/tmp/juniper_data_lifecycle_test")
+@pytest.fixture
+def lifecycle_client(lifecycle_store: InMemoryDatasetStore, lifecycle_settings: Settings) -> TestClient:
+    """Create a lifecycle test client with in-memory storage.
+    The application is built from ``lifecycle_settings`` and the datasets
+    routes module is then pointed at ``lifecycle_store`` via ``datasets.set_store``,
+    so tests can also inspect or mutate that same store object directly.
+    NOTE(review): ``set_store`` is called on the imported routes module and is
+    not undone by this fixture - confirm no other test relies on an earlier store.
+    """
+    app = create_app(settings=lifecycle_settings)
+    # Inject the per-test store only after the app has been created.
+    datasets.set_store(lifecycle_store)
+    return TestClient(app)
+def _create_spiral_request(
+    n_points: int = 50,
+    seed: int = 42,
+    tags: list[str] | None = None,
+    ttl_seconds: int | None = None,
+) -> dict:
+    """Create a spiral dataset request."""
+    request = {
+        "generator": "spiral",
+        "params": {"n_spirals": 2, "n_points_per_spiral": n_points, "seed": seed},
+        "persist": True,
+    }
+    if tags:
+        request["tags"] = tags
+    if ttl_seconds:
+        request["ttl_seconds"] = ttl_seconds
+    return request
+@pytest.mark.integration
+class TestCreateDatasetWithLifecycle:
+    """Tests for creating datasets with lifecycle features."""
+    def test_create_dataset_with_tags(self, lifecycle_client: TestClient) -> None:
+        """Create dataset with tags."""
+        request = _create_spiral_request(tags=["train", "experiment-1"])
+        response = lifecycle_client.post("/v1/datasets", json=request)
+        assert response.status_code == 201
+        meta = response.json()["meta"]
+        assert "train" in meta["tags"]
+        assert "experiment-1" in meta["tags"]
+    def test_create_dataset_with_ttl(self, lifecycle_client: TestClient) -> None:
+        """Create dataset with TTL."""
+        request = _create_spiral_request(ttl_seconds=3600)
+        response = lifecycle_client.post("/v1/datasets", json=request)
+        assert response.status_code == 201
+        meta = response.json()["meta"]
+        assert meta["ttl_seconds"] == 3600
+        assert meta["expires_at"] is not None
+@pytest.mark.integration
+class TestFilterDatasets:
+    """Tests for the filter datasets endpoint."""
+    @pytest.fixture
+    def populated_client(self, lifecycle_client: TestClient) -> TestClient:
+        """Create multiple datasets for filtering tests."""
+        requests = [
+            _create_spiral_request(n_points=50, seed=1, tags=["train", "v1"]),
+            _create_spiral_request(n_points=100, seed=2, tags=["train", "v2"]),
+            _create_spiral_request(n_points=150, seed=3, tags=["test", "v1"]),
+            _create_spiral_request(n_points=200, seed=4, tags=["test", "v2"]),
+        ]
+        for req in requests:
+            lifecycle_client.post("/v1/datasets", json=req)
+        return lifecycle_client
+    def test_filter_by_generator(self, populated_client: TestClient) -> None:
+        """Filter by generator name."""
+        response = populated_client.get("/v1/datasets/filter?generator=spiral")
+        assert response.status_code == 200
+        data = response.json()
+        assert data["total"] == 4
+        assert all(d["generator"] == "spiral" for d in data["datasets"])
+    def test_filter_by_tags_any(self, populated_client: TestClient) -> None:
+        """Filter by tags with any match."""
+        response = populated_client.get("/v1/datasets/filter?tags=train&tags_match=any")
+        assert response.status_code == 200
+        data = response.json()
+        assert data["total"] == 2
+    def test_filter_by_tags_all(self, populated_client: TestClient) -> None:
+        """Filter by tags with all match."""
+        response = populated_client.get("/v1/datasets/filter?tags=train,v1&tags_match=all")
+        assert response.status_code == 200
+        data = response.json()
+        assert data["total"] == 1
+    def test_filter_by_sample_count(self, populated_client: TestClient) -> None:
+        """Filter by sample count range."""
+        response = populated_client.get("/v1/datasets/filter?min_samples=250&max_samples=350")
+        assert response.status_code == 200
+        data = response.json()
+        assert data["total"] == 1
+    def test_filter_with_pagination(self, populated_client: TestClient) -> None:
+        """Filter with pagination."""
+        response = populated_client.get("/v1/datasets/filter?limit=2&offset=0")
+        assert response.status_code == 200
+        data = response.json()
+        assert data["total"] == 4
+        assert len(data["datasets"]) == 2
+        assert data["limit"] == 2
+        assert data["offset"] == 0
+@pytest.mark.integration
+class TestBatchDelete:
+    """Tests for the batch delete endpoint."""
+    def test_batch_delete_existing(self, lifecycle_client: TestClient) -> None:
+        """Batch delete existing datasets."""
+        ids = []
+        for seed in range(3):
+            response = lifecycle_client.post("/v1/datasets", json=_create_spiral_request(seed=seed))
+            ids.append(response.json()["dataset_id"])
+        response = lifecycle_client.post("/v1/datasets/batch-delete", json={"dataset_ids": ids[:2]})
+        assert response.status_code == 200
+        data = response.json()
+        assert len(data["deleted"]) == 2
+        assert data["not_found"] == []
+        assert data["total_deleted"] == 2
+        for deleted_id in ids[:2]:
+            get_response = lifecycle_client.get(f"/v1/datasets/{deleted_id}")
+            assert get_response.status_code == 404
+        get_response = lifecycle_client.get(f"/v1/datasets/{ids[2]}")
+        assert get_response.status_code == 200
+    def test_batch_delete_mixed(self, lifecycle_client: TestClient) -> None:
+        """Batch delete with some nonexistent IDs."""
+        response = lifecycle_client.post("/v1/datasets", json=_create_spiral_request(seed=42))
+        existing_id = response.json()["dataset_id"]
+        response = lifecycle_client.post(
+            "/v1/datasets/batch-delete", json={"dataset_ids": [existing_id, "fake-id-1", "fake-id-2"]}
+        )
+        assert response.status_code == 200
+        data = response.json()
+        assert data["deleted"] == [existing_id]
+        assert set(data["not_found"]) == {"fake-id-1", "fake-id-2"}
+@pytest.mark.integration
+class TestUpdateTags:
+    """Tests for the update tags endpoint."""
+    def test_add_tags(self, lifecycle_client: TestClient) -> None:
+        """Add tags to existing dataset."""
+        response = lifecycle_client.post("/v1/datasets", json=_create_spiral_request(tags=["original"]))
+        dataset_id = response.json()["dataset_id"]
+        response = lifecycle_client.patch(
+            f"/v1/datasets/{dataset_id}/tags", json={"add_tags": ["new-tag-1", "new-tag-2"]}
+        )
+        assert response.status_code == 200
+        tags = response.json()["tags"]
+        assert "original" in tags
+        assert "new-tag-1" in tags
+        assert "new-tag-2" in tags
+    def test_remove_tags(self, lifecycle_client: TestClient) -> None:
+        """Remove tags from existing dataset."""
+        response = lifecycle_client.post("/v1/datasets", json=_create_spiral_request(tags=["keep", "remove"]))
+        dataset_id = response.json()["dataset_id"]
+        response = lifecycle_client.patch(f"/v1/datasets/{dataset_id}/tags", json={"remove_tags": ["remove"]})
+        assert response.status_code == 200
+        tags = response.json()["tags"]
+        assert "keep" in tags
+        assert "remove" not in tags
+    def test_add_and_remove_tags(self, lifecycle_client: TestClient) -> None:
+        """Add and remove tags in single request."""
+        response = lifecycle_client.post("/v1/datasets", json=_create_spiral_request(tags=["a", "b"]))
+        dataset_id = response.json()["dataset_id"]
+        response = lifecycle_client.patch(
+            f"/v1/datasets/{dataset_id}/tags", json={"add_tags": ["c"], "remove_tags": ["a"]}
+        )
+        assert response.status_code == 200
+        tags = response.json()["tags"]
+        assert set(tags) == {"b", "c"}
+    def test_update_tags_not_found(self, lifecycle_client: TestClient) -> None:
+        """Update tags on nonexistent dataset returns 404."""
+        response = lifecycle_client.patch("/v1/datasets/nonexistent-id/tags", json={"add_tags": ["test"]})
+        assert response.status_code == 404
+@pytest.mark.integration
+class TestDatasetStats:
+    """Tests for the stats endpoint (``GET /v1/datasets/stats``)."""
+    def test_stats_empty(self, lifecycle_client: TestClient) -> None:
+        """Stats for empty store."""
+        # No dataset has been created through this client, so both counters are zero.
+        response = lifecycle_client.get("/v1/datasets/stats")
+        assert response.status_code == 200
+        data = response.json()
+        assert data["total_datasets"] == 0
+        assert data["total_samples"] == 0
+    def test_stats_populated(self, lifecycle_client: TestClient) -> None:
+        """Stats for populated store."""
+        # Three spiral datasets with 50, 100 and 150 points per spiral; the
+        # request helper always asks for 2 spirals.
+        lifecycle_client.post("/v1/datasets", json=_create_spiral_request(n_points=50, seed=1, tags=["train"]))
+        lifecycle_client.post("/v1/datasets", json=_create_spiral_request(n_points=100, seed=2, tags=["train", "v2"]))
+        lifecycle_client.post("/v1/datasets", json=_create_spiral_request(n_points=150, seed=3, tags=["test"]))
+        response = lifecycle_client.get("/v1/datasets/stats")
+        assert response.status_code == 200
+        data = response.json()
+        assert data["total_datasets"] == 3
+        # 600 = 2 * (50 + 100 + 150); assumes sample count is
+        # n_spirals * n_points_per_spiral - TODO confirm against the generator.
+        assert data["total_samples"] == 600
+        assert data["by_generator"] == {"spiral": 3}
+        # Two of the three requests carry the "train" tag.
+        assert data["by_tag"]["train"] == 2
+@pytest.mark.integration
+class TestCleanupExpired:
+    """Tests for the cleanup-expired endpoint (``POST /v1/datasets/cleanup-expired``)."""
+    def test_cleanup_expired_none(self, lifecycle_client: TestClient) -> None:
+        """Cleanup with no expired datasets."""
+        # Both datasets are created without a TTL.
+        lifecycle_client.post("/v1/datasets", json=_create_spiral_request(seed=1))
+        lifecycle_client.post("/v1/datasets", json=_create_spiral_request(seed=2))
+        response = lifecycle_client.post("/v1/datasets/cleanup-expired")
+        assert response.status_code == 200
+        # The endpoint answers with a list; it is empty when nothing was removed.
+        assert response.json() == []
+    def test_cleanup_expired_with_ttl(
+        self, lifecycle_client: TestClient, lifecycle_store: InMemoryDatasetStore
+    ) -> None:
+        """Cleanup datasets with expired TTL requires manipulating store directly."""
+        response = lifecycle_client.post("/v1/datasets", json=_create_spiral_request(seed=1, ttl_seconds=3600))
+        dataset_id = response.json()["dataset_id"]
+        # Rather than waiting an hour, rewrite the stored expiry to a moment in
+        # the past through the store object the client fixture was given.
+        meta = lifecycle_store.get_meta(dataset_id)
+        assert meta is not None
+        meta.expires_at = datetime.now(UTC) - timedelta(hours=1)
+        lifecycle_store.update_meta(dataset_id, meta)
+        response = lifecycle_client.post("/v1/datasets/cleanup-expired")
+        assert response.status_code == 200
+        # The response lists the IDs that were removed.
+        deleted = response.json()
+        assert dataset_id in deleted
+        # The dataset must no longer be retrievable after cleanup.
+        get_response = lifecycle_client.get(f"/v1/datasets/{dataset_id}")
+        assert get_response.status_code == 404