PyPI - juniper-data - Versions diffs - 0.4.2__py3-none-any.whl - Mend

juniper-data 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (95) hide show

juniper_data/__init__.py +88 -0
juniper_data/__main__.py +78 -0
juniper_data/api/__init__.py +10 -0
juniper_data/api/app.py +111 -0
juniper_data/api/middleware.py +95 -0
juniper_data/api/routes/__init__.py +9 -0
juniper_data/api/routes/datasets.py +414 -0
juniper_data/api/routes/generators.py +125 -0
juniper_data/api/routes/health.py +49 -0
juniper_data/api/security.py +238 -0
juniper_data/api/settings.py +109 -0
juniper_data/core/__init__.py +32 -0
juniper_data/core/artifacts.py +63 -0
juniper_data/core/dataset_id.py +38 -0
juniper_data/core/models.py +135 -0
juniper_data/core/split.py +120 -0
juniper_data/generators/__init__.py +15 -0
juniper_data/generators/arc_agi/__init__.py +11 -0
juniper_data/generators/arc_agi/generator.py +229 -0
juniper_data/generators/arc_agi/params.py +56 -0
juniper_data/generators/checkerboard/__init__.py +15 -0
juniper_data/generators/checkerboard/generator.py +114 -0
juniper_data/generators/checkerboard/params.py +32 -0
juniper_data/generators/circles/__init__.py +11 -0
juniper_data/generators/circles/generator.py +112 -0
juniper_data/generators/circles/params.py +31 -0
juniper_data/generators/csv_import/__init__.py +15 -0
juniper_data/generators/csv_import/generator.py +198 -0
juniper_data/generators/csv_import/params.py +48 -0
juniper_data/generators/gaussian/__init__.py +11 -0
juniper_data/generators/gaussian/generator.py +149 -0
juniper_data/generators/gaussian/params.py +53 -0
juniper_data/generators/mnist/__init__.py +11 -0
juniper_data/generators/mnist/generator.py +124 -0
juniper_data/generators/mnist/params.py +39 -0
juniper_data/generators/spiral/__init__.py +57 -0
juniper_data/generators/spiral/defaults.py +39 -0
juniper_data/generators/spiral/generator.py +206 -0
juniper_data/generators/spiral/params.py +148 -0
juniper_data/generators/xor/__init__.py +11 -0
juniper_data/generators/xor/generator.py +162 -0
juniper_data/generators/xor/params.py +30 -0
juniper_data/storage/__init__.py +120 -0
juniper_data/storage/base.py +279 -0
juniper_data/storage/cached.py +211 -0
juniper_data/storage/hf_store.py +257 -0
juniper_data/storage/kaggle_store.py +333 -0
juniper_data/storage/local_fs.py +232 -0
juniper_data/storage/memory.py +136 -0
juniper_data/storage/postgres_store.py +373 -0
juniper_data/storage/redis_store.py +264 -0
juniper_data/tests/__init__.py +1 -0
juniper_data/tests/conftest.py +68 -0
juniper_data/tests/fixtures/generate_golden_datasets.py +199 -0
juniper_data/tests/integration/__init__.py +1 -0
juniper_data/tests/integration/test_api.py +283 -0
juniper_data/tests/integration/test_e2e_workflow.py +378 -0
juniper_data/tests/integration/test_lifecycle_api.py +304 -0
juniper_data/tests/integration/test_security_integration.py +189 -0
juniper_data/tests/integration/test_storage_workflow.py +259 -0
juniper_data/tests/performance/__init__.py +1 -0
juniper_data/tests/performance/test_generator_benchmarks.py +178 -0
juniper_data/tests/performance/test_storage_benchmarks.py +257 -0
juniper_data/tests/unit/__init__.py +1 -0
juniper_data/tests/unit/test_api_app.py +206 -0
juniper_data/tests/unit/test_api_routes.py +407 -0
juniper_data/tests/unit/test_api_settings.py +100 -0
juniper_data/tests/unit/test_arc_agi_generator.py +525 -0
juniper_data/tests/unit/test_artifacts.py +145 -0
juniper_data/tests/unit/test_cached_store.py +423 -0
juniper_data/tests/unit/test_checkerboard_generator.py +232 -0
juniper_data/tests/unit/test_circles_generator.py +256 -0
juniper_data/tests/unit/test_csv_import_generator.py +345 -0
juniper_data/tests/unit/test_dataset_id.py +181 -0
juniper_data/tests/unit/test_gaussian_generator.py +333 -0
juniper_data/tests/unit/test_hf_store.py +416 -0
juniper_data/tests/unit/test_init.py +93 -0
juniper_data/tests/unit/test_kaggle_store.py +469 -0
juniper_data/tests/unit/test_lifecycle.py +394 -0
juniper_data/tests/unit/test_main.py +127 -0
juniper_data/tests/unit/test_middleware.py +79 -0
juniper_data/tests/unit/test_mnist_generator.py +370 -0
juniper_data/tests/unit/test_postgres_store.py +490 -0
juniper_data/tests/unit/test_redis_store.py +500 -0
juniper_data/tests/unit/test_security.py +281 -0
juniper_data/tests/unit/test_security_boundaries.py +517 -0
juniper_data/tests/unit/test_spiral_generator.py +566 -0
juniper_data/tests/unit/test_split.py +245 -0
juniper_data/tests/unit/test_storage.py +767 -0
juniper_data/tests/unit/test_xor_generator.py +223 -0
juniper_data-0.4.2.dist-info/METADATA +216 -0
juniper_data-0.4.2.dist-info/RECORD +95 -0
juniper_data-0.4.2.dist-info/WHEEL +5 -0
juniper_data-0.4.2.dist-info/licenses/LICENSE +9 -0
juniper_data-0.4.2.dist-info/top_level.txt +1 -0

juniper_data/tests/fixtures/generate_golden_datasets.py ADDED Viewed

@@ -0,0 +1,199 @@
+#!/usr/bin/env python
+"""
+Golden Dataset Generator for JuniperData Parity Testing
+This script generates golden reference datasets from the existing JuniperCascor
+SpiralProblem implementation for use in validating the new JuniperData implementation.
+Run this script from the JuniperCascor environment to generate the golden datasets.
+Usage:
+    # Optionally set environment variables to configure paths:
+    #   JUNIPER_CASCOR_SRC   - path to the JuniperCascor 'src' directory
+    #   GOLDEN_DATASETS_DIR  - output directory for generated golden datasets
+    #
+    # Example:
+    #   export JUNIPER_CASCOR_SRC=/path/to/JuniperCascor/juniper_cascor/src
+    #   export GOLDEN_DATASETS_DIR=/path/to/JuniperData/tests/fixtures/golden_datasets
+    #   python -m juniper_data.tests.fixtures.generate_golden_datasets
+"""
+import json
+import os
+import sys
+from pathlib import Path
+import numpy as np
+# Prepend the JuniperCascor source directory to sys.path (insert at index 0) so
+# the `spiral_problem` import below resolves during local script execution.
+# The path can be configured via the JUNIPER_CASCOR_SRC environment variable.
+# If not set, we fall back to a path derived relative to this file.
+_default_cascor_src = Path(__file__).resolve().parents[3] / "JuniperCascor" / "juniper_cascor" / "src"
+JUNIPER_CASCOR_SRC = Path(os.environ.get("JUNIPER_CASCOR_SRC", str(_default_cascor_src)))
+sys.path.insert(0, str(JUNIPER_CASCOR_SRC))
+# This import must stay below the sys.path change above, hence the E402 waiver.
+from spiral_problem.spiral_problem import SpiralProblem  # noqa: E402
+# Directory where golden datasets will be written. Can be overridden via the
+# GOLDEN_DATASETS_DIR environment variable; by default, we use a directory
+# named 'golden_datasets' alongside this script.
+GOLDEN_DATASETS_DIR = Path(
+    os.environ.get(
+        "GOLDEN_DATASETS_DIR",
+        str(Path(__file__).resolve().parent / "golden_datasets"),
+    )
+)
+# One entry per golden dataset to generate. "name" is used by main() as the
+# dataset label and by save_golden_dataset() as the output file stem; the
+# remaining keys are read by generate_golden_dataset().
+DATASET_CONFIGS = [
+    {
+        "name": "2_spiral",
+        "n_spirals": 2,
+        "n_points": 100,
+        "noise": 0.1,
+        "seed": 42,
+        "train_ratio": 0.8,
+        "test_ratio": 0.2,
+    },
+    {
+        "name": "3_spiral",
+        "n_spirals": 3,
+        "n_points": 50,
+        "noise": 0.05,
+        "seed": 42,
+        "train_ratio": 0.8,
+        "test_ratio": 0.2,
+    },
+]
+def generate_golden_dataset(config: dict) -> dict:
+    """Generate a golden dataset with the specified configuration."""
+    np.random.seed(config["seed"])
+    import torch  # noqa: E402
+    torch.manual_seed(config["seed"])
+    problem = SpiralProblem(
+        _SpiralProblem__n_spirals=config["n_spirals"],
+        _SpiralProblem__n_points=config["n_points"],
+        _SpiralProblem__noise=config["noise"],
+        _SpiralProblem__random_seed=config["seed"],
+        _SpiralProblem__train_ratio=config["train_ratio"],
+        _SpiralProblem__test_ratio=config["test_ratio"],
+    )
+    (X_train, y_train), (X_test, y_test), (X_full, y_full) = problem.generate_n_spiral_dataset(
+        n_spirals=config["n_spirals"],
+        n_points=config["n_points"],
+        noise_level=config["noise"],
+        train_ratio=config["train_ratio"],
+        test_ratio=config["test_ratio"],
+    )
+    X_train_np = X_train.numpy()
+    y_train_np = y_train.numpy()
+    X_test_np = X_test.numpy()
+    y_test_np = y_test.numpy()
+    metadata = {
+        "config": config,
+        "shapes": {
+            "X_train": list(X_train_np.shape),
+            "y_train": list(y_train_np.shape),
+            "X_test": list(X_test_np.shape),
+            "y_test": list(y_test_np.shape),
+        },
+        "dtypes": {
+            "X_train": str(X_train_np.dtype),
+            "y_train": str(y_train_np.dtype),
+            "X_test": str(X_test_np.dtype),
+            "y_test": str(y_test_np.dtype),
+        },
+        "class_distribution": {
+            "train": compute_class_distribution(y_train_np),
+            "test": compute_class_distribution(y_test_np),
+        },
+        "value_ranges": {
+            "X_train": {"min": float(X_train_np.min()), "max": float(X_train_np.max())},
+            "X_test": {"min": float(X_test_np.min()), "max": float(X_test_np.max())},
+        },
+    }
+    return {
+        "X_train": X_train_np,
+        "y_train": y_train_np,
+        "X_test": X_test_np,
+        "y_test": y_test_np,
+        "metadata": metadata,
+    }
+def compute_class_distribution(y: np.ndarray) -> dict:
+    """Compute class distribution from one-hot encoded labels."""
+    class_indices = np.argmax(y, axis=1)
+    unique, counts = np.unique(class_indices, return_counts=True)
+    return {f"class_{int(c)}": int(cnt) for c, cnt in zip(unique, counts)}
+def save_golden_dataset(data: dict, name: str) -> None:
+    """Save golden dataset as NPZ file with metadata JSON."""
+    GOLDEN_DATASETS_DIR.mkdir(parents=True, exist_ok=True)
+    npz_path = GOLDEN_DATASETS_DIR / f"{name}.npz"
+    np.savez(
+        npz_path,
+        X_train=data["X_train"],
+        y_train=data["y_train"],
+        X_test=data["X_test"],
+        y_test=data["y_test"],
+    )
+    print(f"Saved: {npz_path}")
+    metadata_path = GOLDEN_DATASETS_DIR / f"{name}_metadata.json"
+    with open(metadata_path, "w") as f:
+        json.dump(data["metadata"], f, indent=2)
+    print(f"Saved: {metadata_path}")
+def print_dataset_info(data: dict, name: str) -> None:
+    """Print dataset information for verification."""
+    meta = data["metadata"]
+    print(f"\n{'=' * 60}")
+    print(f"Dataset: {name}")
+    print(f"{'=' * 60}")
+    print("Configuration:")
+    for key, value in meta["config"].items():
+        print(f"  {key}: {value}")
+    print("\nShapes:")
+    for key, shape in meta["shapes"].items():
+        print(f"  {key}: {shape}")
+    print("\nDtypes:")
+    for key, dtype in meta["dtypes"].items():
+        print(f"  {key}: {dtype}")
+    print("\nClass Distribution:")
+    for split, dist in meta["class_distribution"].items():
+        print(f"  {split}: {dist}")
+    print("\nValue Ranges:")
+    for key, ranges in meta["value_ranges"].items():
+        print(f"  {key}: min={ranges['min']:.6f}, max={ranges['max']:.6f}")
+def main():
+    """Generate all golden datasets."""
+    print("Generating Golden Reference Datasets")
+    print("=" * 60)
+    for config in DATASET_CONFIGS:
+        print(f"\nGenerating {config['name']} dataset...")
+        data = generate_golden_dataset(config)
+        print_dataset_info(data, config["name"])
+        save_golden_dataset(data, config["name"])
+    print("\n" + "=" * 60)
+    print("Golden dataset generation complete!")
+    print(f"Output directory: {GOLDEN_DATASETS_DIR}")
+if __name__ == "__main__":
+    main()

juniper_data/tests/integration/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ """Integration tests for Juniper Data."""

juniper_data/tests/integration/test_api.py ADDED Viewed

@@ -0,0 +1,283 @@
+"""Integration tests for the FastAPI REST API.
+Tests cover all endpoints:
+- Health check
+- Generators listing and schema
+- Dataset CRUD operations
+- Artifact download
+- Preview functionality
+"""
+import io
+import numpy as np
+import pytest
+from fastapi.testclient import TestClient
+from juniper_data import __version__
+from juniper_data.api.app import create_app
+from juniper_data.api.routes import datasets
+from juniper_data.api.settings import Settings
+from juniper_data.storage.memory import InMemoryDatasetStore
+@pytest.fixture
+def memory_store() -> InMemoryDatasetStore:
+    """Create a fresh in-memory store for each test."""
+    return InMemoryDatasetStore()
+@pytest.fixture
+def test_settings() -> Settings:
+    """Create test settings."""
+    return Settings(storage_path="/tmp/juniper_data_test")
+@pytest.fixture
+def client(memory_store: InMemoryDatasetStore, test_settings: Settings) -> TestClient:
+    """Create a test client with in-memory storage."""
+    app = create_app(settings=test_settings)
+    datasets.set_store(memory_store)
+    return TestClient(app)
+@pytest.fixture
+def spiral_request() -> dict:
+    """Default spiral dataset creation request."""
+    return {
+        "generator": "spiral",
+        "params": {
+            "n_spirals": 2,
+            "n_points_per_spiral": 50,
+            "seed": 42,
+        },
+        "persist": True,
+    }
+@pytest.mark.integration
+class TestHealthEndpoint:
+    """Tests for the /v1/health endpoints."""
+    def test_health_returns_ok(self, client: TestClient) -> None:
+        """GET /v1/health returns {"status": "ok"}."""
+        response = client.get("/v1/health")
+        assert response.status_code == 200
+        data = response.json()
+        assert data["status"] == "ok"
+    def test_health_includes_version(self, client: TestClient) -> None:
+        """Response includes version string."""
+        response = client.get("/v1/health")
+        assert response.status_code == 200
+        data = response.json()
+        assert "version" in data
+        assert data["version"] == __version__
+    def test_liveness_probe(self, client: TestClient) -> None:
+        """GET /v1/health/live returns liveness status."""
+        response = client.get("/v1/health/live")
+        assert response.status_code == 200
+        data = response.json()
+        assert data["status"] == "alive"
+    def test_readiness_probe(self, client: TestClient) -> None:
+        """GET /v1/health/ready returns readiness status with version."""
+        response = client.get("/v1/health/ready")
+        assert response.status_code == 200
+        data = response.json()
+        assert data["status"] == "ready"
+        assert data["version"] == __version__
+@pytest.mark.integration
+class TestGeneratorsEndpoint:
+    """Tests for the /v1/generators endpoints."""
+    def test_list_generators(self, client: TestClient) -> None:
+        """GET /v1/generators returns list with "spiral"."""
+        response = client.get("/v1/generators")
+        assert response.status_code == 200
+        data = response.json()
+        assert isinstance(data, list)
+        assert len(data) >= 1
+        generator_names = [g["name"] for g in data]
+        assert "spiral" in generator_names
+    def test_get_generator_schema(self, client: TestClient) -> None:
+        """GET /v1/generators/spiral/schema returns valid schema."""
+        response = client.get("/v1/generators/spiral/schema")
+        assert response.status_code == 200
+        schema = response.json()
+        assert isinstance(schema, dict)
+        assert "properties" in schema
+        assert "n_spirals" in schema["properties"]
+        assert "n_points_per_spiral" in schema["properties"]
+    def test_unknown_generator_404(self, client: TestClient) -> None:
+        """GET /v1/generators/unknown/schema returns 404."""
+        response = client.get("/v1/generators/unknown/schema")
+        assert response.status_code == 404
+        data = response.json()
+        assert "detail" in data
+        assert "unknown" in data["detail"].lower()
+@pytest.mark.integration
+class TestDatasetsEndpoint:
+    """Tests for the /v1/datasets endpoints."""
+    def test_create_spiral_dataset(self, client: TestClient, spiral_request: dict) -> None:
+        """POST /v1/datasets creates dataset and returns meta."""
+        response = client.post("/v1/datasets", json=spiral_request)
+        assert response.status_code == 201
+        data = response.json()
+        assert "dataset_id" in data
+        assert "meta" in data
+        assert data["generator"] == "spiral"
+        assert data["meta"]["generator"] == "spiral"
+        assert data["meta"]["n_samples"] == 100
+    def test_create_returns_artifact_url(self, client: TestClient, spiral_request: dict) -> None:
+        """Response includes artifact_url."""
+        response = client.post("/v1/datasets", json=spiral_request)
+        assert response.status_code == 201
+        data = response.json()
+        assert "artifact_url" in data
+        assert "/v1/datasets/" in data["artifact_url"]
+        assert "/artifact" in data["artifact_url"]
+    def test_list_datasets(self, client: TestClient, spiral_request: dict) -> None:
+        """GET /v1/datasets returns list after creation."""
+        client.post("/v1/datasets", json=spiral_request)
+        response = client.get("/v1/datasets")
+        assert response.status_code == 200
+        data = response.json()
+        assert isinstance(data, list)
+        assert len(data) >= 1
+    def test_get_dataset_meta(self, client: TestClient, spiral_request: dict) -> None:
+        """GET /v1/datasets/{id} returns metadata."""
+        create_response = client.post("/v1/datasets", json=spiral_request)
+        dataset_id = create_response.json()["dataset_id"]
+        response = client.get(f"/v1/datasets/{dataset_id}")
+        assert response.status_code == 200
+        data = response.json()
+        assert data["dataset_id"] == dataset_id
+        assert data["generator"] == "spiral"
+        assert "n_samples" in data
+    def test_get_dataset_404(self, client: TestClient) -> None:
+        """GET /v1/datasets/nonexistent returns 404."""
+        response = client.get("/v1/datasets/nonexistent")
+        assert response.status_code == 404
+        data = response.json()
+        assert "detail" in data
+    def test_delete_dataset(self, client: TestClient, spiral_request: dict) -> None:
+        """DELETE /v1/datasets/{id} returns 204."""
+        create_response = client.post("/v1/datasets", json=spiral_request)
+        dataset_id = create_response.json()["dataset_id"]
+        response = client.delete(f"/v1/datasets/{dataset_id}")
+        assert response.status_code == 204
+        get_response = client.get(f"/v1/datasets/{dataset_id}")
+        assert get_response.status_code == 404
+    def test_caching_same_params(self, client: TestClient, spiral_request: dict) -> None:
+        """Same params twice returns same dataset_id (no regeneration)."""
+        response1 = client.post("/v1/datasets", json=spiral_request)
+        response2 = client.post("/v1/datasets", json=spiral_request)
+        assert response1.status_code == 201
+        assert response2.status_code == 201
+        data1 = response1.json()
+        data2 = response2.json()
+        assert data1["dataset_id"] == data2["dataset_id"]
+@pytest.mark.integration
+class TestArtifactEndpoint:
+    """Tests for the /v1/datasets/{id}/artifact endpoint."""
+    def test_download_artifact(self, client: TestClient, spiral_request: dict) -> None:
+        """GET /v1/datasets/{id}/artifact returns NPZ bytes."""
+        create_response = client.post("/v1/datasets", json=spiral_request)
+        dataset_id = create_response.json()["dataset_id"]
+        response = client.get(f"/v1/datasets/{dataset_id}/artifact")
+        assert response.status_code == 200
+        assert response.headers["content-type"] == "application/octet-stream"
+        assert len(response.content) > 0
+        with np.load(io.BytesIO(response.content)) as data:
+            assert len(data.files) > 0
+    def test_artifact_contains_expected_keys(self, client: TestClient, spiral_request: dict) -> None:
+        """NPZ has X_train, y_train, X_test, y_test."""
+        create_response = client.post("/v1/datasets", json=spiral_request)
+        dataset_id = create_response.json()["dataset_id"]
+        response = client.get(f"/v1/datasets/{dataset_id}/artifact")
+        assert response.status_code == 200
+        with np.load(io.BytesIO(response.content)) as data:
+            assert "X_train" in data.files
+            assert "y_train" in data.files
+            assert "X_test" in data.files
+            assert "y_test" in data.files
+@pytest.mark.integration
+class TestPreviewEndpoint:
+    """Tests for the /v1/datasets/{id}/preview endpoint."""
+    def test_preview_returns_samples(self, client: TestClient, spiral_request: dict) -> None:
+        """GET /v1/datasets/{id}/preview returns JSON with samples."""
+        create_response = client.post("/v1/datasets", json=spiral_request)
+        dataset_id = create_response.json()["dataset_id"]
+        response = client.get(f"/v1/datasets/{dataset_id}/preview")
+        assert response.status_code == 200
+        data = response.json()
+        assert "n_samples" in data
+        assert "X_sample" in data
+        assert "y_sample" in data
+        assert isinstance(data["X_sample"], list)
+        assert isinstance(data["y_sample"], list)
+        assert len(data["X_sample"]) > 0
+        assert len(data["y_sample"]) > 0
+    def test_preview_respects_n_param(self, client: TestClient, spiral_request: dict) -> None:
+        """?n=10 returns 10 samples."""
+        create_response = client.post("/v1/datasets", json=spiral_request)
+        dataset_id = create_response.json()["dataset_id"]
+        response = client.get(f"/v1/datasets/{dataset_id}/preview?n=10")
+        assert response.status_code == 200
+        data = response.json()
+        assert data["n_samples"] == 10
+        assert len(data["X_sample"]) == 10
+        assert len(data["y_sample"]) == 10