PyPI - juniper-data - Versions diffs - 0.4.2__py3-none-any.whl - Mend

juniper-data 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (95) hide show

juniper_data/__init__.py +88 -0
juniper_data/__main__.py +78 -0
juniper_data/api/__init__.py +10 -0
juniper_data/api/app.py +111 -0
juniper_data/api/middleware.py +95 -0
juniper_data/api/routes/__init__.py +9 -0
juniper_data/api/routes/datasets.py +414 -0
juniper_data/api/routes/generators.py +125 -0
juniper_data/api/routes/health.py +49 -0
juniper_data/api/security.py +238 -0
juniper_data/api/settings.py +109 -0
juniper_data/core/__init__.py +32 -0
juniper_data/core/artifacts.py +63 -0
juniper_data/core/dataset_id.py +38 -0
juniper_data/core/models.py +135 -0
juniper_data/core/split.py +120 -0
juniper_data/generators/__init__.py +15 -0
juniper_data/generators/arc_agi/__init__.py +11 -0
juniper_data/generators/arc_agi/generator.py +229 -0
juniper_data/generators/arc_agi/params.py +56 -0
juniper_data/generators/checkerboard/__init__.py +15 -0
juniper_data/generators/checkerboard/generator.py +114 -0
juniper_data/generators/checkerboard/params.py +32 -0
juniper_data/generators/circles/__init__.py +11 -0
juniper_data/generators/circles/generator.py +112 -0
juniper_data/generators/circles/params.py +31 -0
juniper_data/generators/csv_import/__init__.py +15 -0
juniper_data/generators/csv_import/generator.py +198 -0
juniper_data/generators/csv_import/params.py +48 -0
juniper_data/generators/gaussian/__init__.py +11 -0
juniper_data/generators/gaussian/generator.py +149 -0
juniper_data/generators/gaussian/params.py +53 -0
juniper_data/generators/mnist/__init__.py +11 -0
juniper_data/generators/mnist/generator.py +124 -0
juniper_data/generators/mnist/params.py +39 -0
juniper_data/generators/spiral/__init__.py +57 -0
juniper_data/generators/spiral/defaults.py +39 -0
juniper_data/generators/spiral/generator.py +206 -0
juniper_data/generators/spiral/params.py +148 -0
juniper_data/generators/xor/__init__.py +11 -0
juniper_data/generators/xor/generator.py +162 -0
juniper_data/generators/xor/params.py +30 -0
juniper_data/storage/__init__.py +120 -0
juniper_data/storage/base.py +279 -0
juniper_data/storage/cached.py +211 -0
juniper_data/storage/hf_store.py +257 -0
juniper_data/storage/kaggle_store.py +333 -0
juniper_data/storage/local_fs.py +232 -0
juniper_data/storage/memory.py +136 -0
juniper_data/storage/postgres_store.py +373 -0
juniper_data/storage/redis_store.py +264 -0
juniper_data/tests/__init__.py +1 -0
juniper_data/tests/conftest.py +68 -0
juniper_data/tests/fixtures/generate_golden_datasets.py +199 -0
juniper_data/tests/integration/__init__.py +1 -0
juniper_data/tests/integration/test_api.py +283 -0
juniper_data/tests/integration/test_e2e_workflow.py +378 -0
juniper_data/tests/integration/test_lifecycle_api.py +304 -0
juniper_data/tests/integration/test_security_integration.py +189 -0
juniper_data/tests/integration/test_storage_workflow.py +259 -0
juniper_data/tests/performance/__init__.py +1 -0
juniper_data/tests/performance/test_generator_benchmarks.py +178 -0
juniper_data/tests/performance/test_storage_benchmarks.py +257 -0
juniper_data/tests/unit/__init__.py +1 -0
juniper_data/tests/unit/test_api_app.py +206 -0
juniper_data/tests/unit/test_api_routes.py +407 -0
juniper_data/tests/unit/test_api_settings.py +100 -0
juniper_data/tests/unit/test_arc_agi_generator.py +525 -0
juniper_data/tests/unit/test_artifacts.py +145 -0
juniper_data/tests/unit/test_cached_store.py +423 -0
juniper_data/tests/unit/test_checkerboard_generator.py +232 -0
juniper_data/tests/unit/test_circles_generator.py +256 -0
juniper_data/tests/unit/test_csv_import_generator.py +345 -0
juniper_data/tests/unit/test_dataset_id.py +181 -0
juniper_data/tests/unit/test_gaussian_generator.py +333 -0
juniper_data/tests/unit/test_hf_store.py +416 -0
juniper_data/tests/unit/test_init.py +93 -0
juniper_data/tests/unit/test_kaggle_store.py +469 -0
juniper_data/tests/unit/test_lifecycle.py +394 -0
juniper_data/tests/unit/test_main.py +127 -0
juniper_data/tests/unit/test_middleware.py +79 -0
juniper_data/tests/unit/test_mnist_generator.py +370 -0
juniper_data/tests/unit/test_postgres_store.py +490 -0
juniper_data/tests/unit/test_redis_store.py +500 -0
juniper_data/tests/unit/test_security.py +281 -0
juniper_data/tests/unit/test_security_boundaries.py +517 -0
juniper_data/tests/unit/test_spiral_generator.py +566 -0
juniper_data/tests/unit/test_split.py +245 -0
juniper_data/tests/unit/test_storage.py +767 -0
juniper_data/tests/unit/test_xor_generator.py +223 -0
juniper_data-0.4.2.dist-info/METADATA +216 -0
juniper_data-0.4.2.dist-info/RECORD +95 -0
juniper_data-0.4.2.dist-info/WHEEL +5 -0
juniper_data-0.4.2.dist-info/licenses/LICENSE +9 -0
juniper_data-0.4.2.dist-info/top_level.txt +1 -0

juniper_data/tests/unit/test_dataset_id.py ADDED Viewed

@@ -0,0 +1,181 @@
+"""Unit tests for dataset ID generation.
+Tests cover:
+- Deterministic ID generation
+- Different params produce different IDs
+- ID format matches expected pattern
+"""
+import pytest
+from juniper_data.core.dataset_id import generate_dataset_id
+@pytest.mark.unit
+class TestDatasetIdGeneration:
+    """Tests for deterministic dataset ID generation."""
+    def test_deterministic_id_generation(self) -> None:
+        """Verify same inputs produce identical IDs."""
+        params = {
+            "n_spirals": 2,
+            "n_points_per_spiral": 100,
+            "seed": 42,
+        }
+        id1 = generate_dataset_id("spiral", "v1.0.0", params)
+        id2 = generate_dataset_id("spiral", "v1.0.0", params)
+        assert id1 == id2
+    def test_multiple_calls_identical(self) -> None:
+        """Verify multiple sequential calls produce identical IDs."""
+        params = {"n_spirals": 3, "n_points": 50}
+        ids = [generate_dataset_id("spiral", "v1.0.0", params) for _ in range(5)]
+        assert all(id_ == ids[0] for id_ in ids)
+    def test_different_params_produce_different_ids(self) -> None:
+        """Verify different params produce different IDs."""
+        params1 = {"n_spirals": 2, "seed": 42}
+        params2 = {"n_spirals": 3, "seed": 42}
+        id1 = generate_dataset_id("spiral", "v1.0.0", params1)
+        id2 = generate_dataset_id("spiral", "v1.0.0", params2)
+        assert id1 != id2
+    def test_different_generator_produces_different_id(self) -> None:
+        """Verify different generator names produce different IDs."""
+        params = {"n_spirals": 2}
+        id1 = generate_dataset_id("spiral", "v1.0.0", params)
+        id2 = generate_dataset_id("circle", "v1.0.0", params)
+        assert id1 != id2
+    def test_different_version_produces_different_id(self) -> None:
+        """Verify different versions produce different IDs."""
+        params = {"n_spirals": 2}
+        id1 = generate_dataset_id("spiral", "v1.0.0", params)
+        id2 = generate_dataset_id("spiral", "v2.0.0", params)
+        assert id1 != id2
+@pytest.mark.unit
+class TestDatasetIdFormat:
+    """Tests for dataset ID format validation."""
+    def test_id_format_matches_pattern(self) -> None:
+        """Verify ID format is '{generator}-{version}-{hash[:16]}'."""
+        params = {"n_spirals": 2}
+        dataset_id = generate_dataset_id("spiral", "v1.0.0", params)
+        assert dataset_id.startswith("spiral-v1.0.0-")
+        parts = dataset_id.split("-")
+        assert len(parts) == 3
+        assert parts[0] == "spiral"
+        assert parts[1] == "v1.0.0"
+        assert len(parts[2]) == 16
+    def test_hash_is_hex(self) -> None:
+        """Verify hash portion is valid hexadecimal."""
+        params = {"n_spirals": 2}
+        dataset_id = generate_dataset_id("spiral", "v1.0.0", params)
+        hash_part = dataset_id.split("-")[-1]
+        int(hash_part, 16)
+    def test_id_length_consistent(self) -> None:
+        """Verify ID has consistent length structure."""
+        params1 = {"n_spirals": 2}
+        params2 = {"n_spirals": 3, "noise": 0.5, "seed": 12345}
+        id1 = generate_dataset_id("spiral", "v1.0.0", params1)
+        id2 = generate_dataset_id("spiral", "v1.0.0", params2)
+        hash1 = id1.split("-")[-1]
+        hash2 = id2.split("-")[-1]
+        assert len(hash1) == 16
+        assert len(hash2) == 16
+@pytest.mark.unit
+class TestDatasetIdEdgeCases:
+    """Tests for edge cases in dataset ID generation."""
+    def test_empty_params(self) -> None:
+        """Verify empty params dict works."""
+        dataset_id = generate_dataset_id("spiral", "v1.0.0", {})
+        assert dataset_id.startswith("spiral-v1.0.0-")
+        assert len(dataset_id.split("-")[-1]) == 16
+    def test_nested_params(self) -> None:
+        """Verify nested params are handled correctly."""
+        params = {
+            "n_spirals": 2,
+            "advanced": {"noise": 0.5, "scale": 1.0},
+        }
+        dataset_id = generate_dataset_id("spiral", "v1.0.0", params)
+        assert dataset_id.startswith("spiral-v1.0.0-")
+    def test_params_order_independent(self) -> None:
+        """Verify param order doesn't affect ID (sorted keys)."""
+        params1 = {"a": 1, "b": 2, "c": 3}
+        params2 = {"c": 3, "a": 1, "b": 2}
+        id1 = generate_dataset_id("spiral", "v1.0.0", params1)
+        id2 = generate_dataset_id("spiral", "v1.0.0", params2)
+        assert id1 == id2
+    def test_float_params(self) -> None:
+        """Verify float params work correctly."""
+        params = {"noise": 0.25, "ratio": 0.8}
+        dataset_id = generate_dataset_id("spiral", "v1.0.0", params)
+        assert dataset_id.startswith("spiral-v1.0.0-")
+    def test_boolean_params(self) -> None:
+        """Verify boolean params work correctly."""
+        params = {"clockwise": True, "shuffle": False}
+        dataset_id = generate_dataset_id("spiral", "v1.0.0", params)
+        assert dataset_id.startswith("spiral-v1.0.0-")
+    def test_none_param_value(self) -> None:
+        """Verify None param values work correctly."""
+        params = {"seed": None, "n_spirals": 2}
+        dataset_id = generate_dataset_id("spiral", "v1.0.0", params)
+        assert dataset_id.startswith("spiral-v1.0.0-")
+    def test_special_characters_in_generator_name(self) -> None:
+        """Verify special characters in generator name are handled."""
+        params = {"n_spirals": 2}
+        dataset_id = generate_dataset_id("spiral_v2", "v1.0.0", params)
+        assert dataset_id.startswith("spiral_v2-v1.0.0-")
+    def test_different_float_precision_different_id(self) -> None:
+        """Verify different float values produce different IDs."""
+        params1 = {"noise": 0.25}
+        params2 = {"noise": 0.250001}
+        id1 = generate_dataset_id("spiral", "v1.0.0", params1)
+        id2 = generate_dataset_id("spiral", "v1.0.0", params2)
+        assert id1 != id2

juniper_data/tests/unit/test_gaussian_generator.py ADDED Viewed

@@ -0,0 +1,333 @@
+"""Unit tests for the Gaussian blobs dataset generator."""
+import numpy as np
+import pytest
+from juniper_data.generators.gaussian import VERSION, GaussianGenerator, GaussianParams, get_schema
+class TestGaussianParams:
+    """Tests for GaussianParams validation."""
+    def test_default_params(self) -> None:
+        """Default parameters should be valid."""
+        params = GaussianParams()
+        assert params.n_classes == 2
+        assert params.n_samples_per_class == 50
+        assert params.n_features == 2
+        assert params.class_std == 1.0
+        assert params.centers is None
+        assert params.center_radius == 3.0
+        assert params.noise == 0.0
+        assert params.train_ratio == 0.8
+        assert params.test_ratio == 0.2
+        assert params.shuffle is True
+    def test_custom_params(self) -> None:
+        """Custom parameters should be accepted."""
+        params = GaussianParams(
+            n_classes=3,
+            n_samples_per_class=100,
+            n_features=4,
+            class_std=0.5,
+            seed=42,
+        )
+        assert params.n_classes == 3
+        assert params.n_samples_per_class == 100
+        assert params.n_features == 4
+        assert params.class_std == 0.5
+        assert params.seed == 42
+    def test_list_class_std(self) -> None:
+        """List of class_std values should be accepted."""
+        params = GaussianParams(n_classes=3, class_std=[0.5, 1.0, 1.5])
+        assert params.class_std == [0.5, 1.0, 1.5]
+    def test_invalid_class_std_negative(self) -> None:
+        """Negative class_std should raise validation error."""
+        with pytest.raises(ValueError, match="positive"):
+            GaussianParams(class_std=-0.5)
+    def test_invalid_class_std_list_negative(self) -> None:
+        """List with negative class_std should raise validation error."""
+        with pytest.raises(ValueError, match="positive"):
+            GaussianParams(class_std=[0.5, -1.0, 1.5])
+    def test_custom_centers(self) -> None:
+        """Custom centers should be accepted."""
+        centers = [[0.0, 0.0], [5.0, 5.0]]
+        params = GaussianParams(n_classes=2, n_features=2, centers=centers)
+        assert params.centers == centers
+    def test_empty_centers_invalid(self) -> None:
+        """Empty centers list should raise validation error."""
+        with pytest.raises(ValueError, match="cannot be empty"):
+            GaussianParams(centers=[])
+    def test_invalid_n_classes_too_low(self) -> None:
+        """n_classes less than 2 should raise validation error."""
+        with pytest.raises(ValueError):
+            GaussianParams(n_classes=1)
+    def test_invalid_n_classes_too_high(self) -> None:
+        """n_classes greater than 10 should raise validation error."""
+        with pytest.raises(ValueError):
+            GaussianParams(n_classes=11)
+class TestGaussianGenerator:
+    """Tests for GaussianGenerator."""
+    def test_generate_returns_expected_keys(self) -> None:
+        """Generated data should contain all expected keys."""
+        params = GaussianParams(seed=42)
+        result = GaussianGenerator.generate(params)
+        expected_keys = {"X_train", "y_train", "X_test", "y_test", "X_full", "y_full"}
+        assert set(result.keys()) == expected_keys
+    def test_generate_shapes(self) -> None:
+        """Generated arrays should have correct shapes."""
+        params = GaussianParams(
+            n_classes=3,
+            n_samples_per_class=40,
+            n_features=5,
+            seed=42,
+        )
+        result = GaussianGenerator.generate(params)
+        total_samples = 3 * 40
+        assert result["X_full"].shape == (total_samples, 5)
+        assert result["y_full"].shape == (total_samples, 3)
+    def test_generate_dtypes(self) -> None:
+        """Generated arrays should have float32 dtype."""
+        params = GaussianParams(seed=42)
+        result = GaussianGenerator.generate(params)
+        assert result["X_train"].dtype == np.float32
+        assert result["y_train"].dtype == np.float32
+        assert result["X_full"].dtype == np.float32
+        assert result["y_full"].dtype == np.float32
+    def test_determinism_with_seed(self) -> None:
+        """Same seed should produce identical results."""
+        params = GaussianParams(seed=123)
+        result1 = GaussianGenerator.generate(params)
+        result2 = GaussianGenerator.generate(params)
+        np.testing.assert_array_equal(result1["X_full"], result2["X_full"])
+        np.testing.assert_array_equal(result1["y_full"], result2["y_full"])
+    def test_different_seeds_produce_different_data(self) -> None:
+        """Different seeds should produce different results."""
+        params1 = GaussianParams(seed=42)
+        params2 = GaussianParams(seed=43)
+        result1 = GaussianGenerator.generate(params1)
+        result2 = GaussianGenerator.generate(params2)
+        assert not np.allclose(result1["X_full"], result2["X_full"])
+    def test_one_hot_labels(self) -> None:
+        """Labels should be valid one-hot encoded."""
+        params = GaussianParams(n_classes=4, seed=42)
+        result = GaussianGenerator.generate(params)
+        row_sums = result["y_full"].sum(axis=1)
+        np.testing.assert_array_almost_equal(row_sums, np.ones(len(row_sums)))
+        for row in result["y_full"]:
+            assert np.sum(row == 1.0) == 1
+            assert np.sum(row == 0.0) == params.n_classes - 1
+    def test_class_distribution(self) -> None:
+        """Each class should have n_samples_per_class samples."""
+        params = GaussianParams(n_classes=3, n_samples_per_class=50, seed=42)
+        result = GaussianGenerator.generate(params)
+        class_counts = result["y_full"].sum(axis=0)
+        np.testing.assert_array_equal(class_counts, [50, 50, 50])
+    def test_train_test_split_ratio(self) -> None:
+        """Train/test split should respect configured ratios."""
+        params = GaussianParams(
+            n_samples_per_class=50,
+            train_ratio=0.7,
+            test_ratio=0.3,
+            seed=42,
+        )
+        result = GaussianGenerator.generate(params)
+        total = 2 * 50
+        expected_train = int(total * 0.7)
+        expected_test = int(total * 0.3)
+        assert len(result["X_train"]) == expected_train
+        assert len(result["X_test"]) == expected_test
+    def test_custom_centers(self) -> None:
+        """Custom centers should position class means correctly."""
+        centers = [[0.0, 0.0], [10.0, 10.0]]
+        params = GaussianParams(
+            n_classes=2,
+            n_samples_per_class=100,
+            centers=centers,
+            class_std=0.1,
+            noise=0.0,
+            seed=42,
+        )
+        result = GaussianGenerator.generate(params)
+        class_0_samples = result["X_full"][:100]
+        class_1_samples = result["X_full"][100:]
+        class_0_mean = class_0_samples.mean(axis=0)
+        class_1_mean = class_1_samples.mean(axis=0)
+        np.testing.assert_array_almost_equal(class_0_mean, [0.0, 0.0], decimal=0)
+        np.testing.assert_array_almost_equal(class_1_mean, [10.0, 10.0], decimal=0)
+    def test_centers_dimension_mismatch_raises_error(self) -> None:
+        """Centers with wrong dimensions should raise error."""
+        centers = [[0.0, 0.0, 0.0], [1.0, 1.0, 1.0]]
+        params = GaussianParams(
+            n_classes=2,
+            n_features=2,
+            centers=centers,
+            seed=42,
+        )
+        with pytest.raises(ValueError, match="n_features"):
+            GaussianGenerator.generate(params)
+    def test_centers_count_mismatch_raises_error(self) -> None:
+        """Wrong number of centers should raise error."""
+        centers = [[0.0, 0.0]]
+        params = GaussianParams(
+            n_classes=2,
+            n_features=2,
+            centers=centers,
+            seed=42,
+        )
+        with pytest.raises(ValueError, match="n_classes"):
+            GaussianGenerator.generate(params)
+    def test_noise_adds_variation(self) -> None:
+        """Noise parameter should increase data variance."""
+        params_no_noise = GaussianParams(
+            n_samples_per_class=100,
+            class_std=0.5,
+            noise=0.0,
+            seed=42,
+        )
+        params_with_noise = GaussianParams(
+            n_samples_per_class=100,
+            class_std=0.5,
+            noise=1.0,
+            seed=42,
+        )
+        result_no_noise = GaussianGenerator.generate(params_no_noise)
+        result_with_noise = GaussianGenerator.generate(params_with_noise)
+        var_no_noise = np.var(result_no_noise["X_full"])
+        var_with_noise = np.var(result_with_noise["X_full"])
+        assert var_with_noise > var_no_noise
+    def test_auto_center_placement(self) -> None:
+        """Auto-placed centers should be on a circle."""
+        params = GaussianParams(
+            n_classes=4,
+            n_samples_per_class=100,
+            center_radius=5.0,
+            class_std=0.1,
+            seed=42,
+        )
+        result = GaussianGenerator.generate(params)
+        for i in range(4):
+            start = i * 100
+            end = start + 100
+            class_mean = result["X_full"][start:end].mean(axis=0)
+            distance_from_origin = np.linalg.norm(class_mean)
+            np.testing.assert_almost_equal(distance_from_origin, 5.0, decimal=0)
+    def test_generate_with_list_class_std(self) -> None:
+        """Per-class std list should apply different stds to each class."""
+        params = GaussianParams(
+            n_classes=3,
+            n_samples_per_class=100,
+            class_std=[0.1, 0.5, 2.0],
+            seed=42,
+        )
+        result = GaussianGenerator.generate(params)
+        assert result["X_full"].shape == (300, 2)
+    def test_generate_single_feature(self) -> None:
+        """Single feature should skip sin component in center placement."""
+        params = GaussianParams(
+            n_classes=2,
+            n_samples_per_class=50,
+            n_features=1,
+            seed=42,
+        )
+        result = GaussianGenerator.generate(params)
+        assert result["X_full"].shape == (100, 1)
+    def test_get_stds_scalar(self) -> None:
+        """Scalar class_std should return a list of repeated values."""
+        params = GaussianParams(n_classes=3, class_std=0.5)
+        stds = GaussianGenerator._get_stds(params)
+        assert stds == [0.5, 0.5, 0.5]
+    def test_get_stds_list(self) -> None:
+        """List class_std should be returned as-is."""
+        params = GaussianParams(n_classes=3, class_std=[0.1, 0.5, 2.0])
+        stds = GaussianGenerator._get_stds(params)
+        assert stds == [0.1, 0.5, 2.0]
+class TestGetSchema:
+    """Tests for get_schema function."""
+    def test_returns_dict(self) -> None:
+        """get_schema should return a dictionary."""
+        schema = get_schema()
+        assert isinstance(schema, dict)
+    def test_schema_has_properties(self) -> None:
+        """Schema should have properties key."""
+        schema = get_schema()
+        assert "properties" in schema
+    def test_schema_includes_all_params(self) -> None:
+        """Schema should include all parameter names."""
+        schema = get_schema()
+        expected_params = {
+            "n_classes",
+            "n_samples_per_class",
+            "n_features",
+            "class_std",
+            "centers",
+            "center_radius",
+            "noise",
+            "seed",
+            "train_ratio",
+            "test_ratio",
+            "shuffle",
+        }
+        assert expected_params.issubset(set(schema["properties"].keys()))
+class TestVersion:
+    """Tests for VERSION constant."""
+    def test_version_format(self) -> None:
+        """VERSION should be a valid semver string."""
+        parts = VERSION.split(".")
+        assert len(parts) == 3
+        assert all(part.isdigit() for part in parts)