juniper-data 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- juniper_data/__init__.py +88 -0
- juniper_data/__main__.py +78 -0
- juniper_data/api/__init__.py +10 -0
- juniper_data/api/app.py +111 -0
- juniper_data/api/middleware.py +95 -0
- juniper_data/api/routes/__init__.py +9 -0
- juniper_data/api/routes/datasets.py +414 -0
- juniper_data/api/routes/generators.py +125 -0
- juniper_data/api/routes/health.py +49 -0
- juniper_data/api/security.py +238 -0
- juniper_data/api/settings.py +109 -0
- juniper_data/core/__init__.py +32 -0
- juniper_data/core/artifacts.py +63 -0
- juniper_data/core/dataset_id.py +38 -0
- juniper_data/core/models.py +135 -0
- juniper_data/core/split.py +120 -0
- juniper_data/generators/__init__.py +15 -0
- juniper_data/generators/arc_agi/__init__.py +11 -0
- juniper_data/generators/arc_agi/generator.py +229 -0
- juniper_data/generators/arc_agi/params.py +56 -0
- juniper_data/generators/checkerboard/__init__.py +15 -0
- juniper_data/generators/checkerboard/generator.py +114 -0
- juniper_data/generators/checkerboard/params.py +32 -0
- juniper_data/generators/circles/__init__.py +11 -0
- juniper_data/generators/circles/generator.py +112 -0
- juniper_data/generators/circles/params.py +31 -0
- juniper_data/generators/csv_import/__init__.py +15 -0
- juniper_data/generators/csv_import/generator.py +198 -0
- juniper_data/generators/csv_import/params.py +48 -0
- juniper_data/generators/gaussian/__init__.py +11 -0
- juniper_data/generators/gaussian/generator.py +149 -0
- juniper_data/generators/gaussian/params.py +53 -0
- juniper_data/generators/mnist/__init__.py +11 -0
- juniper_data/generators/mnist/generator.py +124 -0
- juniper_data/generators/mnist/params.py +39 -0
- juniper_data/generators/spiral/__init__.py +57 -0
- juniper_data/generators/spiral/defaults.py +39 -0
- juniper_data/generators/spiral/generator.py +206 -0
- juniper_data/generators/spiral/params.py +148 -0
- juniper_data/generators/xor/__init__.py +11 -0
- juniper_data/generators/xor/generator.py +162 -0
- juniper_data/generators/xor/params.py +30 -0
- juniper_data/storage/__init__.py +120 -0
- juniper_data/storage/base.py +279 -0
- juniper_data/storage/cached.py +211 -0
- juniper_data/storage/hf_store.py +257 -0
- juniper_data/storage/kaggle_store.py +333 -0
- juniper_data/storage/local_fs.py +232 -0
- juniper_data/storage/memory.py +136 -0
- juniper_data/storage/postgres_store.py +373 -0
- juniper_data/storage/redis_store.py +264 -0
- juniper_data/tests/__init__.py +1 -0
- juniper_data/tests/conftest.py +68 -0
- juniper_data/tests/fixtures/generate_golden_datasets.py +199 -0
- juniper_data/tests/integration/__init__.py +1 -0
- juniper_data/tests/integration/test_api.py +283 -0
- juniper_data/tests/integration/test_e2e_workflow.py +378 -0
- juniper_data/tests/integration/test_lifecycle_api.py +304 -0
- juniper_data/tests/integration/test_security_integration.py +189 -0
- juniper_data/tests/integration/test_storage_workflow.py +259 -0
- juniper_data/tests/performance/__init__.py +1 -0
- juniper_data/tests/performance/test_generator_benchmarks.py +178 -0
- juniper_data/tests/performance/test_storage_benchmarks.py +257 -0
- juniper_data/tests/unit/__init__.py +1 -0
- juniper_data/tests/unit/test_api_app.py +206 -0
- juniper_data/tests/unit/test_api_routes.py +407 -0
- juniper_data/tests/unit/test_api_settings.py +100 -0
- juniper_data/tests/unit/test_arc_agi_generator.py +525 -0
- juniper_data/tests/unit/test_artifacts.py +145 -0
- juniper_data/tests/unit/test_cached_store.py +423 -0
- juniper_data/tests/unit/test_checkerboard_generator.py +232 -0
- juniper_data/tests/unit/test_circles_generator.py +256 -0
- juniper_data/tests/unit/test_csv_import_generator.py +345 -0
- juniper_data/tests/unit/test_dataset_id.py +181 -0
- juniper_data/tests/unit/test_gaussian_generator.py +333 -0
- juniper_data/tests/unit/test_hf_store.py +416 -0
- juniper_data/tests/unit/test_init.py +93 -0
- juniper_data/tests/unit/test_kaggle_store.py +469 -0
- juniper_data/tests/unit/test_lifecycle.py +394 -0
- juniper_data/tests/unit/test_main.py +127 -0
- juniper_data/tests/unit/test_middleware.py +79 -0
- juniper_data/tests/unit/test_mnist_generator.py +370 -0
- juniper_data/tests/unit/test_postgres_store.py +490 -0
- juniper_data/tests/unit/test_redis_store.py +500 -0
- juniper_data/tests/unit/test_security.py +281 -0
- juniper_data/tests/unit/test_security_boundaries.py +517 -0
- juniper_data/tests/unit/test_spiral_generator.py +566 -0
- juniper_data/tests/unit/test_split.py +245 -0
- juniper_data/tests/unit/test_storage.py +767 -0
- juniper_data/tests/unit/test_xor_generator.py +223 -0
- juniper_data-0.4.2.dist-info/METADATA +216 -0
- juniper_data-0.4.2.dist-info/RECORD +95 -0
- juniper_data-0.4.2.dist-info/WHEEL +5 -0
- juniper_data-0.4.2.dist-info/licenses/LICENSE +9 -0
- juniper_data-0.4.2.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
"""Unit tests for the split and shuffle utilities.
|
|
2
|
+
|
|
3
|
+
Tests cover:
|
|
4
|
+
- shuffle_data maintains X/y correspondence
|
|
5
|
+
- split_data produces correct sizes
|
|
6
|
+
- shuffle_and_split integration
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
import pytest
|
|
11
|
+
|
|
12
|
+
from juniper_data.core.split import shuffle_and_split, shuffle_data, split_data
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@pytest.mark.unit
class TestShuffleData:
    """Tests for shuffle_data function.

    Every test that compares "before" and "after" takes explicit copies of
    the fixture arrays first, so the assertions hold whether shuffle_data
    returns new arrays or shuffles its inputs in place.
    """

    def test_shuffle_maintains_correspondence(self, sample_arrays: dict[str, np.ndarray]) -> None:
        """Verify shuffling maintains correspondence between X and y.

        Each shuffled feature row is looked up in a snapshot of the original
        features; the shuffled label at the same position must equal the
        original label stored at the matching index.
        """
        X = sample_arrays["X"]
        y = sample_arrays["y"]

        # Snapshot BOTH arrays: comparing against the live fixture would make
        # the check depend on whether shuffle_data mutates its arguments.
        X_original = X.copy()
        y_original = y.copy()

        rng = np.random.default_rng(42)
        X_shuffled, y_shuffled = shuffle_data(X, y, rng)

        assert X_shuffled.shape == X_original.shape
        assert y_shuffled.shape == y_original.shape

        for x_row, y_row in zip(X_shuffled, y_shuffled):
            matches = np.where((X_original == x_row).all(axis=1))[0]
            assert matches.size > 0, "shuffled row not found in original X"
            # The first match is used; this presumes the fixture's feature
            # rows are unique -- TODO confirm against conftest.
            np.testing.assert_array_equal(y_row, y_original[matches[0]])

    def test_shuffle_changes_order(self, sample_arrays: dict[str, np.ndarray]) -> None:
        """Verify shuffling actually changes the order."""
        X = sample_arrays["X"]
        y = sample_arrays["y"]
        X_original = X.copy()

        rng = np.random.default_rng(42)
        X_shuffled, _ = shuffle_data(X, y, rng)

        # Compare against the snapshot, not the (possibly mutated) input.
        assert not np.array_equal(X_original, X_shuffled)

    def test_shuffle_preserves_all_values(self, sample_arrays: dict[str, np.ndarray]) -> None:
        """Verify shuffling preserves all original values, including duplicates."""
        X = sample_arrays["X"]
        y = sample_arrays["y"]
        X_original = X.copy()
        y_original = y.copy()

        rng = np.random.default_rng(42)
        X_shuffled, y_shuffled = shuffle_data(X, y, rng)

        # Sorted row lists give multiset equality. A plain set() would collapse
        # repeated rows (label rows repeat) and hide changed class counts.
        assert sorted(map(tuple, X_original.tolist())) == sorted(map(tuple, X_shuffled.tolist()))
        assert sorted(map(tuple, y_original.tolist())) == sorted(map(tuple, y_shuffled.tolist()))

    def test_shuffle_mismatched_samples_raises(self) -> None:
        """Verify mismatched X and y sample counts raise ValueError."""
        X = np.arange(20).reshape(10, 2).astype(np.float32)  # 10 samples
        y = np.eye(2, dtype=np.float32)[:5]  # slicing a 2x2 identity: 2 rows, fewer than X

        rng = np.random.default_rng(42)
        with pytest.raises(ValueError, match="same number of samples"):
            shuffle_data(X, y, rng)

    def test_shuffle_deterministic_with_same_seed(self, sample_arrays: dict[str, np.ndarray]) -> None:
        """Verify same seed produces same shuffle order."""
        X = sample_arrays["X"]
        y = sample_arrays["y"]

        # Fresh copies per call so the two runs start from identical inputs.
        rng1 = np.random.default_rng(42)
        X_shuffled1, y_shuffled1 = shuffle_data(X.copy(), y.copy(), rng1)

        rng2 = np.random.default_rng(42)
        X_shuffled2, y_shuffled2 = shuffle_data(X.copy(), y.copy(), rng2)

        np.testing.assert_array_equal(X_shuffled1, X_shuffled2)
        np.testing.assert_array_equal(y_shuffled1, y_shuffled2)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
@pytest.mark.unit
class TestSplitData:
    """Exercise split_data: partition sizes, shapes, disjointness and validation."""

    def test_split_produces_correct_sizes(self, sample_arrays: dict[str, np.ndarray]) -> None:
        """An 80/20 split of the fixture yields 8 training and 2 test rows."""
        parts = split_data(
            sample_arrays["X"],
            sample_arrays["y"],
            train_ratio=0.8,
            test_ratio=0.2,
        )

        expected_rows = (
            ("X_train", 8),
            ("y_train", 8),
            ("X_test", 2),
            ("y_test", 2),
        )
        for key, n_rows in expected_rows:
            assert parts[key].shape[0] == n_rows

    def test_split_maintains_feature_dimensions(self, sample_arrays: dict[str, np.ndarray]) -> None:
        """Splitting keeps the second axis of both features and labels."""
        features = sample_arrays["X"]
        labels = sample_arrays["y"]

        parts = split_data(features, labels, train_ratio=0.6, test_ratio=0.4)

        for key in ("X_train", "X_test"):
            assert parts[key].shape[1] == features.shape[1]
        for key in ("y_train", "y_test"):
            assert parts[key].shape[1] == labels.shape[1]

    def test_split_with_custom_ratios(self) -> None:
        """Ratios that do not sum to 1.0 give sizes within one row of the rounded target."""
        n_samples = 50
        features = np.arange(2 * n_samples).reshape(n_samples, 2).astype(np.float32)
        labels = np.eye(2, dtype=np.float32)[np.arange(n_samples) % 2]

        parts = split_data(features, labels, train_ratio=0.6, test_ratio=0.3)

        target_train = int(np.round(n_samples * 0.6))
        target_test = int(np.round(n_samples * 0.3))

        train_error = parts["X_train"].shape[0] - target_train
        test_error = parts["X_test"].shape[0] - target_test
        assert abs(train_error) <= 1
        assert abs(test_error) <= 1

    def test_split_no_overlap(self, sample_arrays: dict[str, np.ndarray]) -> None:
        """No feature row appears in both the training and the test partition."""
        parts = split_data(
            sample_arrays["X"],
            sample_arrays["y"],
            train_ratio=0.6,
            test_ratio=0.4,
        )

        # Rows become hashable tuples so the partitions can be compared as sets.
        train_rows = {tuple(row) for row in parts["X_train"].tolist()}
        test_rows = {tuple(row) for row in parts["X_test"].tolist()}

        assert train_rows.isdisjoint(test_rows)

    def test_split_mismatched_samples_raises(self) -> None:
        """X and y with different sample counts are rejected with ValueError."""
        features = np.arange(20).reshape(10, 2).astype(np.float32)
        labels = np.eye(2, dtype=np.float32)[:5]

        with pytest.raises(ValueError, match="same number of samples"):
            split_data(features, labels, train_ratio=0.8, test_ratio=0.2)

    def test_split_invalid_train_ratio_raises(self, sample_arrays: dict[str, np.ndarray]) -> None:
        """A train_ratio of 1.5 is rejected with ValueError."""
        features, labels = sample_arrays["X"], sample_arrays["y"]

        with pytest.raises(ValueError, match="train_ratio"):
            split_data(features, labels, train_ratio=1.5, test_ratio=0.2)

    def test_split_invalid_test_ratio_raises(self, sample_arrays: dict[str, np.ndarray]) -> None:
        """A test_ratio of 1.5 is rejected with ValueError."""
        features, labels = sample_arrays["X"], sample_arrays["y"]

        with pytest.raises(ValueError, match="test_ratio"):
            split_data(features, labels, train_ratio=0.8, test_ratio=1.5)

    def test_split_ratios_exceed_one_raises(self, sample_arrays: dict[str, np.ndarray]) -> None:
        """Individually valid ratios whose sum is above 1.0 are rejected."""
        features, labels = sample_arrays["X"], sample_arrays["y"]

        with pytest.raises(ValueError, match="must not exceed 1.0"):
            split_data(features, labels, train_ratio=0.7, test_ratio=0.5)
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
@pytest.mark.unit
class TestShuffleAndSplit:
    """Tests for shuffle_and_split integration function.

    Label arrays (``y_*``) are asserted alongside the feature arrays
    (``X_*``) so that a bug affecting only the label side cannot pass.
    """

    def test_shuffle_and_split_integration(self, sample_arrays: dict[str, np.ndarray]) -> None:
        """Verify shuffle_and_split combines both operations correctly."""
        X = sample_arrays["X"]
        y = sample_arrays["y"]

        result = shuffle_and_split(
            X=X,
            y=y,
            train_ratio=0.8,
            test_ratio=0.2,
            seed=42,
            shuffle=True,
        )

        for key in ("X_train", "y_train", "X_test", "y_test"):
            assert key in result

        assert result["X_train"].shape[0] == 8
        assert result["X_test"].shape[0] == 2
        # Labels must be partitioned into the same sizes as the features.
        assert result["y_train"].shape[0] == 8
        assert result["y_test"].shape[0] == 2

    def test_shuffle_and_split_deterministic(self, sample_arrays: dict[str, np.ndarray]) -> None:
        """Verify same seed produces identical results for features and labels."""
        X = sample_arrays["X"]
        y = sample_arrays["y"]

        result1 = shuffle_and_split(X, y, 0.8, 0.2, seed=42, shuffle=True)
        result2 = shuffle_and_split(X, y, 0.8, 0.2, seed=42, shuffle=True)

        for key in ("X_train", "X_test", "y_train", "y_test"):
            np.testing.assert_array_equal(result1[key], result2[key])

    def test_shuffle_and_split_no_shuffle(self, sample_arrays: dict[str, np.ndarray]) -> None:
        """Verify shuffle=False preserves original order."""
        X = sample_arrays["X"]
        y = sample_arrays["y"]

        result = shuffle_and_split(X, y, 0.8, 0.2, seed=42, shuffle=False)

        # Without shuffling, the split is a plain head/tail slice of the inputs.
        np.testing.assert_array_equal(result["X_train"], X[:8])
        np.testing.assert_array_equal(result["X_test"], X[8:])
        np.testing.assert_array_equal(result["y_train"], y[:8])
        np.testing.assert_array_equal(result["y_test"], y[8:])

    def test_shuffle_and_split_different_seeds(self, sample_arrays: dict[str, np.ndarray]) -> None:
        """Verify different seeds produce different shuffles."""
        X = sample_arrays["X"]
        y = sample_arrays["y"]

        result1 = shuffle_and_split(X, y, 0.8, 0.2, seed=42, shuffle=True)
        result2 = shuffle_and_split(X, y, 0.8, 0.2, seed=99, shuffle=True)

        assert not np.array_equal(result1["X_train"], result2["X_train"])

    def test_split_adjusts_test_size_when_rounding_exceeds_samples(self) -> None:
        """Verify test size is adjusted when train+test rounding exceeds total samples.

        With 3 samples, train_ratio=0.5, test_ratio=0.5:
        - n_train = round(3 * 0.5) = round(1.5) = 2
        - n_test = round(3 * 0.5) = round(1.5) = 2
        - n_train + n_test = 4 > 3, so n_test should be adjusted to 1
        """
        # NOTE: this test calls split_data directly rather than
        # shuffle_and_split; it is kept in this class so its test id is stable.
        X = np.arange(6).reshape(3, 2).astype(np.float32)
        y = np.array([[1, 0], [0, 1], [1, 0]], dtype=np.float32)

        result = split_data(X, y, train_ratio=0.5, test_ratio=0.5)

        total_split = result["X_train"].shape[0] + result["X_test"].shape[0]
        assert total_split == 3
        assert result["X_train"].shape[0] == 2
        assert result["X_test"].shape[0] == 1
|