juniper-data 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- juniper_data/__init__.py +88 -0
- juniper_data/__main__.py +78 -0
- juniper_data/api/__init__.py +10 -0
- juniper_data/api/app.py +111 -0
- juniper_data/api/middleware.py +95 -0
- juniper_data/api/routes/__init__.py +9 -0
- juniper_data/api/routes/datasets.py +414 -0
- juniper_data/api/routes/generators.py +125 -0
- juniper_data/api/routes/health.py +49 -0
- juniper_data/api/security.py +238 -0
- juniper_data/api/settings.py +109 -0
- juniper_data/core/__init__.py +32 -0
- juniper_data/core/artifacts.py +63 -0
- juniper_data/core/dataset_id.py +38 -0
- juniper_data/core/models.py +135 -0
- juniper_data/core/split.py +120 -0
- juniper_data/generators/__init__.py +15 -0
- juniper_data/generators/arc_agi/__init__.py +11 -0
- juniper_data/generators/arc_agi/generator.py +229 -0
- juniper_data/generators/arc_agi/params.py +56 -0
- juniper_data/generators/checkerboard/__init__.py +15 -0
- juniper_data/generators/checkerboard/generator.py +114 -0
- juniper_data/generators/checkerboard/params.py +32 -0
- juniper_data/generators/circles/__init__.py +11 -0
- juniper_data/generators/circles/generator.py +112 -0
- juniper_data/generators/circles/params.py +31 -0
- juniper_data/generators/csv_import/__init__.py +15 -0
- juniper_data/generators/csv_import/generator.py +198 -0
- juniper_data/generators/csv_import/params.py +48 -0
- juniper_data/generators/gaussian/__init__.py +11 -0
- juniper_data/generators/gaussian/generator.py +149 -0
- juniper_data/generators/gaussian/params.py +53 -0
- juniper_data/generators/mnist/__init__.py +11 -0
- juniper_data/generators/mnist/generator.py +124 -0
- juniper_data/generators/mnist/params.py +39 -0
- juniper_data/generators/spiral/__init__.py +57 -0
- juniper_data/generators/spiral/defaults.py +39 -0
- juniper_data/generators/spiral/generator.py +206 -0
- juniper_data/generators/spiral/params.py +148 -0
- juniper_data/generators/xor/__init__.py +11 -0
- juniper_data/generators/xor/generator.py +162 -0
- juniper_data/generators/xor/params.py +30 -0
- juniper_data/storage/__init__.py +120 -0
- juniper_data/storage/base.py +279 -0
- juniper_data/storage/cached.py +211 -0
- juniper_data/storage/hf_store.py +257 -0
- juniper_data/storage/kaggle_store.py +333 -0
- juniper_data/storage/local_fs.py +232 -0
- juniper_data/storage/memory.py +136 -0
- juniper_data/storage/postgres_store.py +373 -0
- juniper_data/storage/redis_store.py +264 -0
- juniper_data/tests/__init__.py +1 -0
- juniper_data/tests/conftest.py +68 -0
- juniper_data/tests/fixtures/generate_golden_datasets.py +199 -0
- juniper_data/tests/integration/__init__.py +1 -0
- juniper_data/tests/integration/test_api.py +283 -0
- juniper_data/tests/integration/test_e2e_workflow.py +378 -0
- juniper_data/tests/integration/test_lifecycle_api.py +304 -0
- juniper_data/tests/integration/test_security_integration.py +189 -0
- juniper_data/tests/integration/test_storage_workflow.py +259 -0
- juniper_data/tests/performance/__init__.py +1 -0
- juniper_data/tests/performance/test_generator_benchmarks.py +178 -0
- juniper_data/tests/performance/test_storage_benchmarks.py +257 -0
- juniper_data/tests/unit/__init__.py +1 -0
- juniper_data/tests/unit/test_api_app.py +206 -0
- juniper_data/tests/unit/test_api_routes.py +407 -0
- juniper_data/tests/unit/test_api_settings.py +100 -0
- juniper_data/tests/unit/test_arc_agi_generator.py +525 -0
- juniper_data/tests/unit/test_artifacts.py +145 -0
- juniper_data/tests/unit/test_cached_store.py +423 -0
- juniper_data/tests/unit/test_checkerboard_generator.py +232 -0
- juniper_data/tests/unit/test_circles_generator.py +256 -0
- juniper_data/tests/unit/test_csv_import_generator.py +345 -0
- juniper_data/tests/unit/test_dataset_id.py +181 -0
- juniper_data/tests/unit/test_gaussian_generator.py +333 -0
- juniper_data/tests/unit/test_hf_store.py +416 -0
- juniper_data/tests/unit/test_init.py +93 -0
- juniper_data/tests/unit/test_kaggle_store.py +469 -0
- juniper_data/tests/unit/test_lifecycle.py +394 -0
- juniper_data/tests/unit/test_main.py +127 -0
- juniper_data/tests/unit/test_middleware.py +79 -0
- juniper_data/tests/unit/test_mnist_generator.py +370 -0
- juniper_data/tests/unit/test_postgres_store.py +490 -0
- juniper_data/tests/unit/test_redis_store.py +500 -0
- juniper_data/tests/unit/test_security.py +281 -0
- juniper_data/tests/unit/test_security_boundaries.py +517 -0
- juniper_data/tests/unit/test_spiral_generator.py +566 -0
- juniper_data/tests/unit/test_split.py +245 -0
- juniper_data/tests/unit/test_storage.py +767 -0
- juniper_data/tests/unit/test_xor_generator.py +223 -0
- juniper_data-0.4.2.dist-info/METADATA +216 -0
- juniper_data-0.4.2.dist-info/RECORD +95 -0
- juniper_data-0.4.2.dist-info/WHEEL +5 -0
- juniper_data-0.4.2.dist-info/licenses/LICENSE +9 -0
- juniper_data-0.4.2.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,469 @@
|
|
|
1
|
+
"""Unit tests for KaggleDatasetStore."""
|
|
2
|
+
|
|
3
|
+
import csv
|
|
4
|
+
from datetime import UTC, datetime
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from unittest.mock import MagicMock, patch
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
import pytest
|
|
10
|
+
|
|
11
|
+
from juniper_data.core.models import DatasetMeta
|
|
12
|
+
from juniper_data.storage.memory import InMemoryDatasetStore
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@pytest.fixture
def sample_meta() -> DatasetMeta:
    """Return a fixed DatasetMeta describing a 100-sample, two-class dataset.

    The counts are self-consistent (80 train + 20 test, 50 samples per class);
    only ``created_at`` varies between invocations because it is taken from
    the current UTC time.
    """
    fields = {
        "dataset_id": "test-dataset",
        "generator": "test",
        "generator_version": "1.0.0",
        "params": {"seed": 42},
        "n_samples": 100,
        "n_features": 2,
        "n_classes": 2,
        "n_train": 80,
        "n_test": 20,
        "class_distribution": {"0": 50, "1": 50},
        "created_at": datetime.now(UTC),
    }
    return DatasetMeta(**fields)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@pytest.fixture
def sample_arrays() -> dict[str, np.ndarray]:
    """Return seeded float32 train/test arrays keyed by split name.

    All four arrays are drawn from one generator seeded with 42, in the order
    X_train, y_train, X_test, y_test, so the values are reproducible.
    """
    generator = np.random.default_rng(42)
    # Order matters: each draw advances the shared generator state.
    layout = (
        ("X_train", (80, 2)),
        ("y_train", (80, 2)),
        ("X_test", (20, 2)),
        ("y_test", (20, 2)),
    )
    return {key: generator.standard_normal(shape).astype(np.float32) for key, shape in layout}
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@pytest.fixture
def mock_kaggle_module():
    """Patch kaggle_store so it sees a mocked KaggleApi and KAGGLE_AVAILABLE=True.

    Yields:
        A ``(api_class, api_instance)`` tuple, where calling ``api_class()``
        returns ``api_instance``. Both patches are undone when the test ends.
    """
    api_instance = MagicMock()
    api_class = MagicMock(return_value=api_instance)

    with (
        patch("juniper_data.storage.kaggle_store.KAGGLE_AVAILABLE", True),
        patch("juniper_data.storage.kaggle_store.KaggleApi", api_class),
    ):
        yield api_class, api_instance
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _write_csv(path: Path, rows: list[dict]) -> None:
|
|
58
|
+
"""Helper to write a CSV file."""
|
|
59
|
+
path.parent.mkdir(parents=True, exist_ok=True)
|
|
60
|
+
with open(path, "w", newline="", encoding="utf-8") as f:
|
|
61
|
+
writer = csv.DictWriter(f, fieldnames=rows[0].keys())
|
|
62
|
+
writer.writeheader()
|
|
63
|
+
writer.writerows(rows)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
@pytest.mark.unit
@pytest.mark.storage
class TestKaggleDatasetStoreInit:
    """Construction behaviour of KaggleDatasetStore."""

    def test_init_default(self, mock_kaggle_module, tmp_path) -> None:
        """Defaults authenticate once and fall back to an in-memory cache."""
        _api_class, api = mock_kaggle_module
        from juniper_data.storage.kaggle_store import KaggleDatasetStore

        download_root = tmp_path / "kaggle"
        kaggle_store = KaggleDatasetStore(download_path=download_root)

        api.authenticate.assert_called_once()
        assert isinstance(kaggle_store._cache_store, InMemoryDatasetStore)

    def test_init_custom_cache_store(self, mock_kaggle_module, tmp_path) -> None:
        """A caller-supplied cache store is used as-is (same object)."""
        from juniper_data.storage.kaggle_store import KaggleDatasetStore

        supplied_cache = InMemoryDatasetStore()
        kaggle_store = KaggleDatasetStore(
            download_path=tmp_path / "kaggle",
            cache_store=supplied_cache,
        )

        assert kaggle_store._cache_store is supplied_cache

    def test_init_no_auto_authenticate(self, mock_kaggle_module, tmp_path) -> None:
        """With auto_authenticate=False the store holds no API handle."""
        _, _api = mock_kaggle_module
        from juniper_data.storage.kaggle_store import KaggleDatasetStore

        kaggle_store = KaggleDatasetStore(
            download_path=tmp_path / "kaggle",
            auto_authenticate=False,
        )

        assert kaggle_store._api is None

    def test_init_raises_without_kaggle(self) -> None:
        """Construction fails with ImportError when KAGGLE_AVAILABLE is False."""
        availability_off = patch("juniper_data.storage.kaggle_store.KAGGLE_AVAILABLE", False)
        with availability_off:
            from juniper_data.storage.kaggle_store import KaggleDatasetStore

            with pytest.raises(ImportError, match="Kaggle package not installed"):
                KaggleDatasetStore()
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
@pytest.mark.unit
@pytest.mark.storage
class TestKaggleDatasetStoreDownload:
    """Behaviour of KaggleDatasetStore.download_dataset."""

    def test_download_dataset(self, mock_kaggle_module, tmp_path) -> None:
        """A dataset with no local directory triggers exactly one API download."""
        _, api = mock_kaggle_module
        from juniper_data.storage.kaggle_store import KaggleDatasetStore

        kaggle_store = KaggleDatasetStore(download_path=tmp_path / "kaggle")

        downloaded_to = kaggle_store.download_dataset("owner/dataset-name")

        assert isinstance(downloaded_to, Path)
        api.dataset_download_files.assert_called_once()

    def test_download_dataset_cached(self, mock_kaggle_module, tmp_path) -> None:
        """An existing dataset directory is returned without calling the API."""
        _, api = mock_kaggle_module
        from juniper_data.storage.kaggle_store import KaggleDatasetStore

        download_root = tmp_path / "kaggle"
        kaggle_store = KaggleDatasetStore(download_path=download_root)
        # Pre-create the directory the store would download into.
        existing_dir = download_root / "owner_dataset-name"
        existing_dir.mkdir(parents=True, exist_ok=True)

        downloaded_to = kaggle_store.download_dataset("owner/dataset-name")

        assert downloaded_to == existing_dir
        api.dataset_download_files.assert_not_called()

    def test_download_dataset_force(self, mock_kaggle_module, tmp_path) -> None:
        """force=True downloads again even though the directory already exists."""
        _, api = mock_kaggle_module
        from juniper_data.storage.kaggle_store import KaggleDatasetStore

        download_root = tmp_path / "kaggle"
        kaggle_store = KaggleDatasetStore(download_path=download_root)
        existing_dir = download_root / "owner_dataset-name"
        existing_dir.mkdir(parents=True, exist_ok=True)

        kaggle_store.download_dataset("owner/dataset-name", force=True)

        api.dataset_download_files.assert_called_once()

    def test_download_dataset_not_authenticated(self, mock_kaggle_module, tmp_path) -> None:
        """Downloading from a store built without authentication raises RuntimeError."""
        from juniper_data.storage.kaggle_store import KaggleDatasetStore

        kaggle_store = KaggleDatasetStore(
            download_path=tmp_path / "kaggle",
            auto_authenticate=False,
        )

        with pytest.raises(RuntimeError, match="not authenticated"):
            kaggle_store.download_dataset("owner/dataset")
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
@pytest.mark.unit
@pytest.mark.storage
class TestKaggleDatasetStoreLoadDataset:
    """Behaviour of KaggleDatasetStore.load_kaggle_dataset.

    Each test pre-creates the ``<owner>_<name>`` directory under the download
    path and writes its CSV there before calling the loader.
    """

    def test_load_csv_dataset(self, mock_kaggle_module, tmp_path) -> None:
        """A three-class, two-feature CSV yields matching metadata and array shape."""
        _, _api = mock_kaggle_module
        from juniper_data.storage.kaggle_store import KaggleDatasetStore

        download_root = tmp_path / "kaggle"
        kaggle_store = KaggleDatasetStore(download_path=download_root)

        iris_dir = download_root / "owner_iris"
        iris_dir.mkdir(parents=True, exist_ok=True)
        measurements = [
            ("5.1", "3.5", "0"),
            ("7.0", "3.2", "1"),
            ("6.3", "3.3", "2"),
            ("5.0", "3.6", "0"),
            ("6.7", "3.1", "1"),
        ]
        rows = [
            {"sepal_length": length, "sepal_width": width, "label": label}
            for length, width, label in measurements
        ]
        _write_csv(iris_dir / "data.csv", rows)

        dataset_id, meta, arrays = kaggle_store.load_kaggle_dataset("owner/iris", file_name="data.csv")

        assert "kaggle-owner-iris" in dataset_id
        assert meta.generator == "kaggle"
        assert meta.n_samples == 5
        assert meta.n_features == 2
        assert meta.n_classes == 3
        assert arrays["X_full"].shape == (5, 2)

    def test_load_with_auto_detect_csv(self, mock_kaggle_module, tmp_path) -> None:
        """When the named file is absent, another CSV in the directory is loaded."""
        _, _api = mock_kaggle_module
        from juniper_data.storage.kaggle_store import KaggleDatasetStore

        download_root = tmp_path / "kaggle"
        kaggle_store = KaggleDatasetStore(download_path=download_root)

        data_dir = download_root / "owner_test"
        data_dir.mkdir(parents=True, exist_ok=True)
        rows = [
            {"feature": "1.0", "label": "a"},
            {"feature": "2.0", "label": "b"},
        ]
        _write_csv(data_dir / "actual.csv", rows)

        _dataset_id, meta, _arrays = kaggle_store.load_kaggle_dataset("owner/test", file_name="missing.csv")

        assert meta.n_samples == 2

    def test_load_file_not_found(self, mock_kaggle_module, tmp_path) -> None:
        """A dataset directory containing no CSV raises FileNotFoundError."""
        from juniper_data.storage.kaggle_store import KaggleDatasetStore

        download_root = tmp_path / "kaggle"
        kaggle_store = KaggleDatasetStore(download_path=download_root)
        data_dir = download_root / "owner_test"
        data_dir.mkdir(parents=True, exist_ok=True)

        with pytest.raises(FileNotFoundError, match="not found"):
            kaggle_store.load_kaggle_dataset("owner/test", file_name="missing.csv")

    def test_load_empty_csv(self, mock_kaggle_module, tmp_path) -> None:
        """A CSV with a header but no data rows raises ValueError."""
        from juniper_data.storage.kaggle_store import KaggleDatasetStore

        download_root = tmp_path / "kaggle"
        kaggle_store = KaggleDatasetStore(download_path=download_root)
        data_dir = download_root / "owner_empty"
        data_dir.mkdir(parents=True, exist_ok=True)
        header_only = data_dir / "data.csv"
        header_only.write_text("col1,col2,label\n")

        with pytest.raises(ValueError, match="No data found"):
            kaggle_store.load_kaggle_dataset("owner/empty", file_name="data.csv")

    def test_load_with_seed(self, mock_kaggle_module, tmp_path) -> None:
        """Two loads with the same seed produce identical feature arrays."""
        from juniper_data.storage.kaggle_store import KaggleDatasetStore

        download_root = tmp_path / "kaggle"
        kaggle_store = KaggleDatasetStore(download_path=download_root)
        data_dir = download_root / "owner_seed"
        data_dir.mkdir(parents=True, exist_ok=True)
        rows = [{"feature": str(idx), "label": str(idx % 2)} for idx in range(10)]
        _write_csv(data_dir / "data.csv", rows)

        _, _meta_first, first = kaggle_store.load_kaggle_dataset("owner/seed", file_name="data.csv", seed=42)
        _, _meta_second, second = kaggle_store.load_kaggle_dataset("owner/seed", file_name="data.csv", seed=42)

        np.testing.assert_array_equal(first["X_full"], second["X_full"])

    def test_load_with_n_samples(self, mock_kaggle_module, tmp_path) -> None:
        """n_samples=5 caps a 20-row CSV at five samples in the metadata."""
        from juniper_data.storage.kaggle_store import KaggleDatasetStore

        download_root = tmp_path / "kaggle"
        kaggle_store = KaggleDatasetStore(download_path=download_root)
        data_dir = download_root / "owner_limit"
        data_dir.mkdir(parents=True, exist_ok=True)
        rows = [{"feature": str(idx), "label": str(idx % 2)} for idx in range(20)]
        _write_csv(data_dir / "data.csv", rows)

        _, meta, _ = kaggle_store.load_kaggle_dataset("owner/limit", file_name="data.csv", n_samples=5)

        assert meta.n_samples == 5

    def test_load_without_one_hot(self, mock_kaggle_module, tmp_path) -> None:
        """one_hot_labels=False yields a single label column."""
        from juniper_data.storage.kaggle_store import KaggleDatasetStore

        download_root = tmp_path / "kaggle"
        kaggle_store = KaggleDatasetStore(download_path=download_root)
        data_dir = download_root / "owner_nohot"
        data_dir.mkdir(parents=True, exist_ok=True)
        rows = [{"feature": str(idx), "label": str(idx % 2)} for idx in range(10)]
        _write_csv(data_dir / "data.csv", rows)

        _, _, arrays = kaggle_store.load_kaggle_dataset("owner/nohot", file_name="data.csv", one_hot_labels=False)

        labels = arrays["y_full"]
        assert labels.shape[1] == 1

    def test_load_with_normalization(self, mock_kaggle_module, tmp_path) -> None:
        """normalize_features=True keeps every feature within [0, 1] (±1e-6)."""
        from juniper_data.storage.kaggle_store import KaggleDatasetStore

        download_root = tmp_path / "kaggle"
        kaggle_store = KaggleDatasetStore(download_path=download_root)
        data_dir = download_root / "owner_norm"
        data_dir.mkdir(parents=True, exist_ok=True)
        rows = [{"feature": str(idx * 10), "label": str(idx % 2)} for idx in range(10)]
        _write_csv(data_dir / "data.csv", rows)

        _, _, arrays = kaggle_store.load_kaggle_dataset("owner/norm", file_name="data.csv", normalize_features=True)

        assert arrays["X_full"].max() <= 1.0 + 1e-6
        assert arrays["X_full"].min() >= 0.0 - 1e-6

    def test_load_with_invalid_values(self, mock_kaggle_module, tmp_path) -> None:
        """A non-numeric feature cell loads as 0.0; numeric cells are unchanged."""
        from juniper_data.storage.kaggle_store import KaggleDatasetStore

        download_root = tmp_path / "kaggle"
        kaggle_store = KaggleDatasetStore(download_path=download_root)
        data_dir = download_root / "owner_bad"
        data_dir.mkdir(parents=True, exist_ok=True)
        rows = [
            {"feature": "abc", "label": "0"},
            {"feature": "1.5", "label": "1"},
        ]
        _write_csv(data_dir / "data.csv", rows)

        _, _, arrays = kaggle_store.load_kaggle_dataset("owner/bad", file_name="data.csv")

        assert arrays["X_full"][0, 0] == 0.0
        assert arrays["X_full"][1, 0] == 1.5

    def test_load_with_feature_columns(self, mock_kaggle_module, tmp_path) -> None:
        """Selecting two of three feature columns gives n_features == 2."""
        from juniper_data.storage.kaggle_store import KaggleDatasetStore

        download_root = tmp_path / "kaggle"
        kaggle_store = KaggleDatasetStore(download_path=download_root)
        data_dir = download_root / "owner_cols"
        data_dir.mkdir(parents=True, exist_ok=True)
        rows = [
            {"a": "1", "b": "2", "c": "3", "label": "0"},
            {"a": "4", "b": "5", "c": "6", "label": "1"},
        ]
        _write_csv(data_dir / "data.csv", rows)

        _, meta, _arrays = kaggle_store.load_kaggle_dataset(
            "owner/cols",
            file_name="data.csv",
            feature_columns=["a", "b"],
        )

        assert meta.n_features == 2

    def test_load_saves_to_cache(self, mock_kaggle_module, tmp_path) -> None:
        """After loading, the returned dataset id exists in the cache store."""
        from juniper_data.storage.kaggle_store import KaggleDatasetStore

        download_root = tmp_path / "kaggle"
        kaggle_store = KaggleDatasetStore(download_path=download_root)
        data_dir = download_root / "owner_cache"
        data_dir.mkdir(parents=True, exist_ok=True)
        rows = [{"feature": "1", "label": "0"}, {"feature": "2", "label": "1"}]
        _write_csv(data_dir / "data.csv", rows)

        dataset_id, _, _ = kaggle_store.load_kaggle_dataset("owner/cache", file_name="data.csv")

        assert kaggle_store._cache_store.exists(dataset_id)
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
@pytest.mark.unit
@pytest.mark.storage
class TestKaggleDatasetStoreListCompetitions:
    """Behaviour of KaggleDatasetStore.list_competitions."""

    def test_list_competitions(self, mock_kaggle_module, tmp_path) -> None:
        """One competition from the API comes back as one dict with ref and title."""
        _, api = mock_kaggle_module
        from juniper_data.storage.kaggle_store import KaggleDatasetStore

        kaggle_store = KaggleDatasetStore(download_path=tmp_path / "kaggle")

        competition = MagicMock()
        competition.ref = "competition-1"
        competition.title = "Test Competition"
        competition.deadline = "2026-12-31"
        competition.category = "Getting Started"
        api.competitions_list.return_value = [competition]

        listed = kaggle_store.list_competitions(search="test")

        assert len(listed) == 1
        only_entry = listed[0]
        assert only_entry["ref"] == "competition-1"
        assert only_entry["title"] == "Test Competition"

    def test_list_competitions_not_authenticated(self, mock_kaggle_module, tmp_path) -> None:
        """Listing from a store built without authentication raises RuntimeError."""
        from juniper_data.storage.kaggle_store import KaggleDatasetStore

        kaggle_store = KaggleDatasetStore(
            download_path=tmp_path / "kaggle",
            auto_authenticate=False,
        )

        with pytest.raises(RuntimeError, match="not authenticated"):
            kaggle_store.list_competitions()
|
|
364
|
+
|
|
365
|
+
|
|
366
|
+
@pytest.mark.unit
@pytest.mark.storage
class TestKaggleDatasetStoreListKaggleDatasets:
    """Behaviour of KaggleDatasetStore.list_kaggle_datasets."""

    def test_list_kaggle_datasets(self, mock_kaggle_module, tmp_path) -> None:
        """Search and page are forwarded to the API and one entry is returned."""
        _, api = mock_kaggle_module
        from juniper_data.storage.kaggle_store import KaggleDatasetStore

        kaggle_store = KaggleDatasetStore(download_path=tmp_path / "kaggle")

        remote_dataset = MagicMock()
        remote_dataset.ref = "owner/dataset"
        remote_dataset.title = "Test Dataset"
        remote_dataset.totalBytes = 1024
        remote_dataset.lastUpdated = "2026-01-01"
        api.dataset_list.return_value = [remote_dataset]

        listed = kaggle_store.list_kaggle_datasets(search="test", page=2)

        assert len(listed) == 1
        assert listed[0]["ref"] == "owner/dataset"
        api.dataset_list.assert_called_once_with(search="test", page=2)

    def test_list_kaggle_datasets_not_authenticated(self, mock_kaggle_module, tmp_path) -> None:
        """Listing from a store built without authentication raises RuntimeError."""
        from juniper_data.storage.kaggle_store import KaggleDatasetStore

        kaggle_store = KaggleDatasetStore(
            download_path=tmp_path / "kaggle",
            auto_authenticate=False,
        )

        with pytest.raises(RuntimeError, match="not authenticated"):
            kaggle_store.list_kaggle_datasets()
|
|
398
|
+
|
|
399
|
+
|
|
400
|
+
@pytest.mark.unit
@pytest.mark.storage
class TestKaggleDatasetStoreDelegation:
    """Store-interface methods of KaggleDatasetStore that act on its cache store.

    Most tests seed the cache store directly and then go through the
    KaggleDatasetStore method under test.
    """

    def test_save_delegates(self, mock_kaggle_module, tmp_path, sample_meta, sample_arrays) -> None:
        """Saving through the store makes the id visible in the cache store."""
        from juniper_data.storage.kaggle_store import KaggleDatasetStore

        kaggle_store = KaggleDatasetStore(download_path=tmp_path / "kaggle")

        kaggle_store.save("test-1", sample_meta, sample_arrays)

        assert kaggle_store._cache_store.exists("test-1")

    def test_get_meta_delegates(self, mock_kaggle_module, tmp_path, sample_meta, sample_arrays) -> None:
        """Metadata saved in the cache store is retrievable via get_meta."""
        from juniper_data.storage.kaggle_store import KaggleDatasetStore

        kaggle_store = KaggleDatasetStore(download_path=tmp_path / "kaggle")
        kaggle_store._cache_store.save("test-1", sample_meta, sample_arrays)

        fetched_meta = kaggle_store.get_meta("test-1")

        assert fetched_meta is not None

    def test_get_artifact_bytes_delegates(self, mock_kaggle_module, tmp_path, sample_meta, sample_arrays) -> None:
        """Artifact bytes for a cached dataset are retrievable via the store."""
        from juniper_data.storage.kaggle_store import KaggleDatasetStore

        kaggle_store = KaggleDatasetStore(download_path=tmp_path / "kaggle")
        kaggle_store._cache_store.save("test-1", sample_meta, sample_arrays)

        payload = kaggle_store.get_artifact_bytes("test-1")

        assert payload is not None

    def test_exists_delegates(self, mock_kaggle_module, tmp_path) -> None:
        """exists returns False for an id that was never saved."""
        from juniper_data.storage.kaggle_store import KaggleDatasetStore

        kaggle_store = KaggleDatasetStore(download_path=tmp_path / "kaggle")

        assert kaggle_store.exists("nonexistent") is False

    def test_delete_delegates(self, mock_kaggle_module, tmp_path, sample_meta, sample_arrays) -> None:
        """Deleting a cached dataset through the store returns True."""
        from juniper_data.storage.kaggle_store import KaggleDatasetStore

        kaggle_store = KaggleDatasetStore(download_path=tmp_path / "kaggle")
        kaggle_store._cache_store.save("test-1", sample_meta, sample_arrays)

        assert kaggle_store.delete("test-1") is True

    def test_list_datasets_delegates(self, mock_kaggle_module, tmp_path, sample_meta, sample_arrays) -> None:
        """A dataset saved in the cache store appears in list_datasets."""
        from juniper_data.storage.kaggle_store import KaggleDatasetStore

        kaggle_store = KaggleDatasetStore(download_path=tmp_path / "kaggle")
        kaggle_store._cache_store.save("test-1", sample_meta, sample_arrays)

        assert "test-1" in kaggle_store.list_datasets()

    def test_update_meta_delegates(self, mock_kaggle_module, tmp_path, sample_meta, sample_arrays) -> None:
        """Updating metadata for a cached dataset through the store returns True."""
        from juniper_data.storage.kaggle_store import KaggleDatasetStore

        kaggle_store = KaggleDatasetStore(download_path=tmp_path / "kaggle")
        kaggle_store._cache_store.save("test-1", sample_meta, sample_arrays)

        assert kaggle_store.update_meta("test-1", sample_meta) is True

    def test_list_all_metadata_delegates(self, mock_kaggle_module, tmp_path, sample_meta, sample_arrays) -> None:
        """With one cached dataset, list_all_metadata returns exactly one entry."""
        from juniper_data.storage.kaggle_store import KaggleDatasetStore

        kaggle_store = KaggleDatasetStore(download_path=tmp_path / "kaggle")
        kaggle_store._cache_store.save("test-1", sample_meta, sample_arrays)

        all_metadata = kaggle_store.list_all_metadata()

        assert len(all_metadata) == 1
|