PyPI - juniper-data - Versions diffs - 0.4.2__py3-none-any.whl - Mend

juniper-data 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (95) hide show

juniper_data/__init__.py +88 -0
juniper_data/__main__.py +78 -0
juniper_data/api/__init__.py +10 -0
juniper_data/api/app.py +111 -0
juniper_data/api/middleware.py +95 -0
juniper_data/api/routes/__init__.py +9 -0
juniper_data/api/routes/datasets.py +414 -0
juniper_data/api/routes/generators.py +125 -0
juniper_data/api/routes/health.py +49 -0
juniper_data/api/security.py +238 -0
juniper_data/api/settings.py +109 -0
juniper_data/core/__init__.py +32 -0
juniper_data/core/artifacts.py +63 -0
juniper_data/core/dataset_id.py +38 -0
juniper_data/core/models.py +135 -0
juniper_data/core/split.py +120 -0
juniper_data/generators/__init__.py +15 -0
juniper_data/generators/arc_agi/__init__.py +11 -0
juniper_data/generators/arc_agi/generator.py +229 -0
juniper_data/generators/arc_agi/params.py +56 -0
juniper_data/generators/checkerboard/__init__.py +15 -0
juniper_data/generators/checkerboard/generator.py +114 -0
juniper_data/generators/checkerboard/params.py +32 -0
juniper_data/generators/circles/__init__.py +11 -0
juniper_data/generators/circles/generator.py +112 -0
juniper_data/generators/circles/params.py +31 -0
juniper_data/generators/csv_import/__init__.py +15 -0
juniper_data/generators/csv_import/generator.py +198 -0
juniper_data/generators/csv_import/params.py +48 -0
juniper_data/generators/gaussian/__init__.py +11 -0
juniper_data/generators/gaussian/generator.py +149 -0
juniper_data/generators/gaussian/params.py +53 -0
juniper_data/generators/mnist/__init__.py +11 -0
juniper_data/generators/mnist/generator.py +124 -0
juniper_data/generators/mnist/params.py +39 -0
juniper_data/generators/spiral/__init__.py +57 -0
juniper_data/generators/spiral/defaults.py +39 -0
juniper_data/generators/spiral/generator.py +206 -0
juniper_data/generators/spiral/params.py +148 -0
juniper_data/generators/xor/__init__.py +11 -0
juniper_data/generators/xor/generator.py +162 -0
juniper_data/generators/xor/params.py +30 -0
juniper_data/storage/__init__.py +120 -0
juniper_data/storage/base.py +279 -0
juniper_data/storage/cached.py +211 -0
juniper_data/storage/hf_store.py +257 -0
juniper_data/storage/kaggle_store.py +333 -0
juniper_data/storage/local_fs.py +232 -0
juniper_data/storage/memory.py +136 -0
juniper_data/storage/postgres_store.py +373 -0
juniper_data/storage/redis_store.py +264 -0
juniper_data/tests/__init__.py +1 -0
juniper_data/tests/conftest.py +68 -0
juniper_data/tests/fixtures/generate_golden_datasets.py +199 -0
juniper_data/tests/integration/__init__.py +1 -0
juniper_data/tests/integration/test_api.py +283 -0
juniper_data/tests/integration/test_e2e_workflow.py +378 -0
juniper_data/tests/integration/test_lifecycle_api.py +304 -0
juniper_data/tests/integration/test_security_integration.py +189 -0
juniper_data/tests/integration/test_storage_workflow.py +259 -0
juniper_data/tests/performance/__init__.py +1 -0
juniper_data/tests/performance/test_generator_benchmarks.py +178 -0
juniper_data/tests/performance/test_storage_benchmarks.py +257 -0
juniper_data/tests/unit/__init__.py +1 -0
juniper_data/tests/unit/test_api_app.py +206 -0
juniper_data/tests/unit/test_api_routes.py +407 -0
juniper_data/tests/unit/test_api_settings.py +100 -0
juniper_data/tests/unit/test_arc_agi_generator.py +525 -0
juniper_data/tests/unit/test_artifacts.py +145 -0
juniper_data/tests/unit/test_cached_store.py +423 -0
juniper_data/tests/unit/test_checkerboard_generator.py +232 -0
juniper_data/tests/unit/test_circles_generator.py +256 -0
juniper_data/tests/unit/test_csv_import_generator.py +345 -0
juniper_data/tests/unit/test_dataset_id.py +181 -0
juniper_data/tests/unit/test_gaussian_generator.py +333 -0
juniper_data/tests/unit/test_hf_store.py +416 -0
juniper_data/tests/unit/test_init.py +93 -0
juniper_data/tests/unit/test_kaggle_store.py +469 -0
juniper_data/tests/unit/test_lifecycle.py +394 -0
juniper_data/tests/unit/test_main.py +127 -0
juniper_data/tests/unit/test_middleware.py +79 -0
juniper_data/tests/unit/test_mnist_generator.py +370 -0
juniper_data/tests/unit/test_postgres_store.py +490 -0
juniper_data/tests/unit/test_redis_store.py +500 -0
juniper_data/tests/unit/test_security.py +281 -0
juniper_data/tests/unit/test_security_boundaries.py +517 -0
juniper_data/tests/unit/test_spiral_generator.py +566 -0
juniper_data/tests/unit/test_split.py +245 -0
juniper_data/tests/unit/test_storage.py +767 -0
juniper_data/tests/unit/test_xor_generator.py +223 -0
juniper_data-0.4.2.dist-info/METADATA +216 -0
juniper_data-0.4.2.dist-info/RECORD +95 -0
juniper_data-0.4.2.dist-info/WHEEL +5 -0
juniper_data-0.4.2.dist-info/licenses/LICENSE +9 -0
juniper_data-0.4.2.dist-info/top_level.txt +1 -0

juniper_data/tests/performance/test_storage_benchmarks.py ADDED Viewed

@@ -0,0 +1,257 @@
+#####################################################################################################################################################################################################
+# Project:       Juniper
+# Sub-Project:   JuniperData
+# Application:   juniper_data
+# File Name:     test_storage_benchmarks.py
+# Author:        Paul Calnon
+# Version:       0.4.2
+#
+# Date Created:  2026-02-25
+# Last Modified: 2026-02-25
+#
+# License:       MIT License
+# Copyright:     Copyright (c) 2024-2026 Paul Calnon
+#
+# Description:
+#    Performance benchmarks for storage backends.
+#    Measures throughput for save, retrieve, list, and delete operations
+#    on InMemoryDatasetStore and LocalFSDatasetStore.
+#
+# Usage:
+#    # Run benchmarks with timing (disabled by default in addopts):
+#    pytest juniper_data/tests/performance/test_storage_benchmarks.py --benchmark-enable -v
+#
+#    # Run with autosave for regression tracking:
+#    pytest juniper_data/tests/performance/test_storage_benchmarks.py --benchmark-enable --benchmark-autosave
+#
+# References:
+#    - RD-009: Performance Test Infrastructure
+#    - pytest-benchmark: https://pytest-benchmark.readthedocs.io/
+#####################################################################################################################################################################################################
+"""Performance benchmarks for storage backends.
+Benchmarks measure throughput for core storage operations (save, get_meta,
+get_artifact_bytes, list, delete) on InMemoryDatasetStore (baseline) and
+LocalFSDatasetStore (I/O-bound). Optional backends (Redis, PostgreSQL,
+HuggingFace, Kaggle) are excluded as they require external services.
+"""
+from datetime import UTC, datetime
+import numpy as np
+import pytest
+from juniper_data.core.models import DatasetMeta
+from juniper_data.storage.local_fs import LocalFSDatasetStore
+from juniper_data.storage.memory import InMemoryDatasetStore
+def _make_meta(dataset_id: str) -> DatasetMeta:
+    """Create a representative DatasetMeta for benchmarks."""
+    return DatasetMeta(
+        dataset_id=dataset_id,
+        generator="spiral",
+        generator_version="0.4.2",
+        params={"n_spirals": 2, "n_points_per_spiral": 500, "seed": 42},
+        n_samples=1000,
+        n_features=2,
+        n_classes=2,
+        n_train=800,
+        n_test=200,
+        class_distribution={"0": 500, "1": 500},
+        created_at=datetime.now(UTC),
+    )
+def _make_arrays(n_train: int = 800, n_test: int = 200, n_features: int = 2) -> dict[str, np.ndarray]:
+    """Create representative dataset arrays for benchmarks."""
+    rng = np.random.default_rng(42)
+    return {
+        "X_train": rng.random((n_train, n_features), dtype=np.float32),
+        "y_train": rng.random((n_train, 2), dtype=np.float32),
+        "X_test": rng.random((n_test, n_features), dtype=np.float32),
+        "y_test": rng.random((n_test, 2), dtype=np.float32),
+        "X_full": rng.random((n_train + n_test, n_features), dtype=np.float32),
+        "y_full": rng.random((n_train + n_test, 2), dtype=np.float32),
+    }
+# ═══════════════════════════════════════════════════════════════════════════════
+# InMemoryDatasetStore Benchmarks (Baseline)
+# ═══════════════════════════════════════════════════════════════════════════════
+@pytest.mark.performance
+class TestInMemoryStoreBenchmarks:
+    """Benchmark InMemoryDatasetStore operations.
+    In-memory store provides the baseline measurement for storage
+    operations without filesystem or network I/O overhead.
+    """
+    def test_save(self, benchmark):
+        """Benchmark save operation."""
+        store = InMemoryDatasetStore()
+        meta = _make_meta("bench-save")
+        arrays = _make_arrays()
+        benchmark(store.save, "bench-save", meta, arrays)
+        assert store.exists("bench-save")
+    def test_get_meta(self, benchmark):
+        """Benchmark metadata retrieval."""
+        store = InMemoryDatasetStore()
+        store.save("bench-meta", _make_meta("bench-meta"), _make_arrays())
+        result = benchmark(store.get_meta, "bench-meta")
+        assert result is not None
+        assert result.dataset_id == "bench-meta"
+    def test_get_artifact_bytes(self, benchmark):
+        """Benchmark artifact retrieval (NPZ bytes)."""
+        store = InMemoryDatasetStore()
+        store.save("bench-artifact", _make_meta("bench-artifact"), _make_arrays())
+        result = benchmark(store.get_artifact_bytes, "bench-artifact")
+        assert result is not None
+        assert len(result) > 0
+    def test_exists(self, benchmark):
+        """Benchmark existence check."""
+        store = InMemoryDatasetStore()
+        store.save("bench-exists", _make_meta("bench-exists"), _make_arrays())
+        result = benchmark(store.exists, "bench-exists")
+        assert result is True
+    def test_list_datasets(self, benchmark):
+        """Benchmark list operation with 50 datasets."""
+        store = InMemoryDatasetStore()
+        arrays = _make_arrays()
+        for i in range(50):
+            store.save(f"bench-list-{i:03d}", _make_meta(f"bench-list-{i:03d}"), arrays)
+        result = benchmark(store.list_datasets, 50, 0)
+        assert len(result) == 50
+    def test_delete(self, benchmark):
+        """Benchmark delete operation."""
+        store = InMemoryDatasetStore()
+        arrays = _make_arrays()
+        def save_and_delete():
+            store.save("bench-delete", _make_meta("bench-delete"), arrays)
+            return store.delete("bench-delete")
+        result = benchmark(save_and_delete)
+        assert result is True
+# ═══════════════════════════════════════════════════════════════════════════════
+# LocalFSDatasetStore Benchmarks (I/O-bound)
+# ═══════════════════════════════════════════════════════════════════════════════
+@pytest.mark.performance
+class TestLocalFSStoreBenchmarks:
+    """Benchmark LocalFSDatasetStore operations.
+    Filesystem store measures I/O-bound performance for JSON metadata
+    writes and NPZ artifact serialization/deserialization.
+    """
+    def test_save(self, benchmark, tmp_path):
+        """Benchmark save operation (JSON meta + NPZ artifact)."""
+        store = LocalFSDatasetStore(str(tmp_path))
+        meta = _make_meta("bench-save")
+        arrays = _make_arrays()
+        benchmark(store.save, "bench-save", meta, arrays)
+        assert store.exists("bench-save")
+    def test_get_meta(self, benchmark, tmp_path):
+        """Benchmark metadata retrieval from filesystem."""
+        store = LocalFSDatasetStore(str(tmp_path))
+        store.save("bench-meta", _make_meta("bench-meta"), _make_arrays())
+        result = benchmark(store.get_meta, "bench-meta")
+        assert result is not None
+        assert result.dataset_id == "bench-meta"
+    def test_get_artifact_bytes(self, benchmark, tmp_path):
+        """Benchmark artifact retrieval from filesystem."""
+        store = LocalFSDatasetStore(str(tmp_path))
+        store.save("bench-artifact", _make_meta("bench-artifact"), _make_arrays())
+        result = benchmark(store.get_artifact_bytes, "bench-artifact")
+        assert result is not None
+        assert len(result) > 0
+    def test_exists(self, benchmark, tmp_path):
+        """Benchmark existence check on filesystem."""
+        store = LocalFSDatasetStore(str(tmp_path))
+        store.save("bench-exists", _make_meta("bench-exists"), _make_arrays())
+        result = benchmark(store.exists, "bench-exists")
+        assert result is True
+    def test_list_datasets(self, benchmark, tmp_path):
+        """Benchmark list operation with 50 datasets on filesystem."""
+        store = LocalFSDatasetStore(str(tmp_path))
+        arrays = _make_arrays()
+        for i in range(50):
+            store.save(f"bench-list-{i:03d}", _make_meta(f"bench-list-{i:03d}"), arrays)
+        result = benchmark(store.list_datasets, 50, 0)
+        assert len(result) == 50
+    def test_delete(self, benchmark, tmp_path):
+        """Benchmark delete operation on filesystem."""
+        store = LocalFSDatasetStore(str(tmp_path))
+        arrays = _make_arrays()
+        def save_and_delete():
+            store.save("bench-delete", _make_meta("bench-delete"), arrays)
+            return store.delete("bench-delete")
+        result = benchmark(save_and_delete)
+        assert result is True
+# ═══════════════════════════════════════════════════════════════════════════════
+# Dataset Size Scaling Benchmarks
+# ═══════════════════════════════════════════════════════════════════════════════
+@pytest.mark.performance
+class TestStorageScaling:
+    """Benchmark storage throughput across dataset sizes.
+    Measures how save/retrieve times scale with increasing dataset
+    sizes (points * features), using InMemoryDatasetStore to isolate
+    serialization overhead from filesystem I/O.
+    """
+    @pytest.mark.parametrize(
+        ("n_train", "n_test"),
+        [(80, 20), (800, 200), (4000, 1000), (8000, 2000)],
+        ids=["100pts", "1000pts", "5000pts", "10000pts"],
+    )
+    def test_save_scaling(self, benchmark, n_train, n_test):
+        """Benchmark save with increasing dataset sizes."""
+        store = InMemoryDatasetStore()
+        meta = _make_meta("bench-scale")
+        meta.n_train = n_train
+        meta.n_test = n_test
+        meta.n_samples = n_train + n_test
+        arrays = _make_arrays(n_train=n_train, n_test=n_test)
+        benchmark(store.save, "bench-scale", meta, arrays)
+        assert store.exists("bench-scale")
+    @pytest.mark.parametrize(
+        ("n_train", "n_test"),
+        [(80, 20), (800, 200), (4000, 1000), (8000, 2000)],
+        ids=["100pts", "1000pts", "5000pts", "10000pts"],
+    )
+    def test_retrieve_scaling(self, benchmark, n_train, n_test):
+        """Benchmark artifact retrieval with increasing dataset sizes."""
+        store = InMemoryDatasetStore()
+        meta = _make_meta("bench-scale")
+        meta.n_train = n_train
+        meta.n_test = n_test
+        meta.n_samples = n_train + n_test
+        arrays = _make_arrays(n_train=n_train, n_test=n_test)
+        store.save("bench-scale", meta, arrays)
+        result = benchmark(store.get_artifact_bytes, "bench-scale")
+        assert result is not None

juniper_data/tests/unit/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ """Unit tests for Juniper Data."""

juniper_data/tests/unit/test_api_app.py ADDED Viewed

@@ -0,0 +1,206 @@
+"""Unit tests for the FastAPI application factory and configuration."""
+import logging
+from unittest.mock import MagicMock, patch
+import pytest
+from fastapi import FastAPI
+from fastapi.testclient import TestClient
+from juniper_data import __version__
+from juniper_data.api.app import create_app, lifespan
+from juniper_data.api.routes import datasets
+from juniper_data.api.settings import Settings
+from juniper_data.storage.memory import InMemoryDatasetStore
+@pytest.fixture
+def test_settings() -> Settings:
+    """Create test settings."""
+    return Settings(
+        storage_path="/tmp/juniper_test",
+        host="127.0.0.1",
+        port=8200,
+        log_level="DEBUG",
+        cors_origins=["http://localhost:3000"],
+    )
+@pytest.fixture
+def memory_store() -> InMemoryDatasetStore:
+    """Create in-memory store for testing."""
+    return InMemoryDatasetStore()
+@pytest.mark.unit
+class TestCreateApp:
+    """Tests for the create_app factory function."""
+    def test_create_app_returns_fastapi_instance(self, test_settings: Settings) -> None:
+        """Test create_app returns a FastAPI instance."""
+        app = create_app(settings=test_settings)
+        assert isinstance(app, FastAPI)
+    def test_create_app_sets_title(self, test_settings: Settings) -> None:
+        """Test app has correct title."""
+        app = create_app(settings=test_settings)
+        assert app.title == "Juniper Data API"
+    def test_create_app_sets_version(self, test_settings: Settings) -> None:
+        """Test app has correct version."""
+        app = create_app(settings=test_settings)
+        assert app.version == __version__
+    def test_create_app_stores_settings(self, test_settings: Settings) -> None:
+        """Test settings are stored in app state."""
+        app = create_app(settings=test_settings)
+        assert app.state.settings == test_settings
+    def test_create_app_includes_health_router(self, test_settings: Settings) -> None:
+        """Test health router is included."""
+        app = create_app(settings=test_settings)
+        routes = [getattr(route, "path", None) for route in app.routes]
+        assert "/v1/health" in routes
+    def test_create_app_includes_generators_router(self, test_settings: Settings) -> None:
+        """Test generators router is included."""
+        app = create_app(settings=test_settings)
+        routes = [getattr(route, "path", None) for route in app.routes]
+        assert "/v1/generators" in routes
+    def test_create_app_includes_datasets_router(self, test_settings: Settings) -> None:
+        """Test datasets router is included."""
+        app = create_app(settings=test_settings)
+        routes = [getattr(route, "path", None) for route in app.routes]
+        assert "/v1/datasets" in routes
+    def test_create_app_uses_default_settings_when_none_provided(self) -> None:
+        """Test create_app loads settings from environment when not provided."""
+        with patch("juniper_data.api.app.get_settings") as mock_get:
+            mock_settings = Settings()
+            mock_get.return_value = mock_settings
+            app = create_app(settings=None)
+            mock_get.assert_called_once()
+            assert app.state.settings == mock_settings
+    def test_create_app_cors_middleware_added(self, test_settings: Settings) -> None:
+        """Test CORS middleware is configured."""
+        app = create_app(settings=test_settings)
+        middleware_classes = [getattr(m.cls, "__name__", None) for m in app.user_middleware]
+        assert "CORSMiddleware" in middleware_classes
+@pytest.mark.unit
+class TestExceptionHandlers:
+    """Tests for custom exception handlers."""
+    def test_value_error_returns_400(self, test_settings: Settings, memory_store: InMemoryDatasetStore) -> None:
+        """Test ValueError is handled with 400 status."""
+        app = create_app(settings=test_settings)
+        datasets.set_store(memory_store)
+        @app.get("/test-value-error")
+        async def raise_value_error():
+            raise ValueError("Test error message")
+        client = TestClient(app, raise_server_exceptions=False)
+        response = client.get("/test-value-error")
+        assert response.status_code == 400
+        assert response.json()["detail"] == "Test error message"
+    def test_general_exception_returns_500(self, test_settings: Settings, memory_store: InMemoryDatasetStore) -> None:
+        """Test unhandled Exception returns 500 status."""
+        app = create_app(settings=test_settings)
+        datasets.set_store(memory_store)
+        @app.get("/test-general-error")
+        async def raise_general_error():
+            raise RuntimeError("Unexpected error")
+        client = TestClient(app, raise_server_exceptions=False)
+        with patch("logging.Logger.exception"):
+            response = client.get("/test-general-error")
+        assert response.status_code == 500
+        assert response.json()["detail"] == "Internal server error"
+@pytest.mark.unit
+class TestLifespan:
+    """Tests for the lifespan context manager."""
+    @pytest.mark.asyncio
+    async def test_lifespan_initializes_store(self, test_settings: Settings) -> None:
+        """Test lifespan sets up the dataset store."""
+        app = FastAPI()
+        app.state.settings = test_settings
+        with patch("juniper_data.api.app.LocalFSDatasetStore") as MockStore:
+            mock_store = MagicMock()
+            MockStore.return_value = mock_store
+            with patch("juniper_data.api.app.datasets") as mock_datasets:
+                async with lifespan(app):
+                    MockStore.assert_called_once()
+                    mock_datasets.set_store.assert_called_once_with(mock_store)
+    @pytest.mark.asyncio
+    async def test_lifespan_logs_startup_message(self, test_settings: Settings) -> None:
+        """Test lifespan logs startup message."""
+        app = FastAPI()
+        app.state.settings = test_settings
+        with patch("juniper_data.api.app.LocalFSDatasetStore"):
+            with patch("juniper_data.api.app.datasets"):
+                with patch("logging.Logger.info") as mock_info:
+                    async with lifespan(app):
+                        startup_calls = [call for call in mock_info.call_args_list if "starting" in str(call).lower()]
+                        assert len(startup_calls) >= 1
+    @pytest.mark.asyncio
+    async def test_lifespan_logs_shutdown_message(self, test_settings: Settings) -> None:
+        """Test lifespan logs shutdown message."""
+        app = FastAPI()
+        app.state.settings = test_settings
+        with patch("juniper_data.api.app.LocalFSDatasetStore"):
+            with patch("juniper_data.api.app.datasets"):
+                with patch("logging.Logger.info") as mock_info:
+                    async with lifespan(app):
+                        pass
+                    shutdown_calls = [call for call in mock_info.call_args_list if "shutting" in str(call).lower()]
+                    assert len(shutdown_calls) >= 1
+    @pytest.mark.asyncio
+    async def test_lifespan_configures_logging(self, test_settings: Settings) -> None:
+        """Test lifespan configures logging with correct level."""
+        app = FastAPI()
+        app.state.settings = test_settings
+        with patch("juniper_data.api.app.LocalFSDatasetStore"):
+            with patch("juniper_data.api.app.datasets"):
+                with patch("logging.basicConfig") as mock_config:
+                    async with lifespan(app):
+                        mock_config.assert_called_once()
+                        call_kwargs = mock_config.call_args[1]
+                        assert call_kwargs["level"] == logging.DEBUG
+@pytest.mark.unit
+class TestGlobalApp:
+    """Tests for the global app instance."""
+    def test_global_app_exists(self) -> None:
+        """Test global app is created at module level."""
+        from juniper_data.api.app import app
+        assert isinstance(app, FastAPI)
+    def test_global_app_has_correct_title(self) -> None:
+        """Test global app has correct title."""
+        from juniper_data.api.app import app
+        assert app.title == "Juniper Data API"