juniper-data 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95)
  1. juniper_data/__init__.py +88 -0
  2. juniper_data/__main__.py +78 -0
  3. juniper_data/api/__init__.py +10 -0
  4. juniper_data/api/app.py +111 -0
  5. juniper_data/api/middleware.py +95 -0
  6. juniper_data/api/routes/__init__.py +9 -0
  7. juniper_data/api/routes/datasets.py +414 -0
  8. juniper_data/api/routes/generators.py +125 -0
  9. juniper_data/api/routes/health.py +49 -0
  10. juniper_data/api/security.py +238 -0
  11. juniper_data/api/settings.py +109 -0
  12. juniper_data/core/__init__.py +32 -0
  13. juniper_data/core/artifacts.py +63 -0
  14. juniper_data/core/dataset_id.py +38 -0
  15. juniper_data/core/models.py +135 -0
  16. juniper_data/core/split.py +120 -0
  17. juniper_data/generators/__init__.py +15 -0
  18. juniper_data/generators/arc_agi/__init__.py +11 -0
  19. juniper_data/generators/arc_agi/generator.py +229 -0
  20. juniper_data/generators/arc_agi/params.py +56 -0
  21. juniper_data/generators/checkerboard/__init__.py +15 -0
  22. juniper_data/generators/checkerboard/generator.py +114 -0
  23. juniper_data/generators/checkerboard/params.py +32 -0
  24. juniper_data/generators/circles/__init__.py +11 -0
  25. juniper_data/generators/circles/generator.py +112 -0
  26. juniper_data/generators/circles/params.py +31 -0
  27. juniper_data/generators/csv_import/__init__.py +15 -0
  28. juniper_data/generators/csv_import/generator.py +198 -0
  29. juniper_data/generators/csv_import/params.py +48 -0
  30. juniper_data/generators/gaussian/__init__.py +11 -0
  31. juniper_data/generators/gaussian/generator.py +149 -0
  32. juniper_data/generators/gaussian/params.py +53 -0
  33. juniper_data/generators/mnist/__init__.py +11 -0
  34. juniper_data/generators/mnist/generator.py +124 -0
  35. juniper_data/generators/mnist/params.py +39 -0
  36. juniper_data/generators/spiral/__init__.py +57 -0
  37. juniper_data/generators/spiral/defaults.py +39 -0
  38. juniper_data/generators/spiral/generator.py +206 -0
  39. juniper_data/generators/spiral/params.py +148 -0
  40. juniper_data/generators/xor/__init__.py +11 -0
  41. juniper_data/generators/xor/generator.py +162 -0
  42. juniper_data/generators/xor/params.py +30 -0
  43. juniper_data/storage/__init__.py +120 -0
  44. juniper_data/storage/base.py +279 -0
  45. juniper_data/storage/cached.py +211 -0
  46. juniper_data/storage/hf_store.py +257 -0
  47. juniper_data/storage/kaggle_store.py +333 -0
  48. juniper_data/storage/local_fs.py +232 -0
  49. juniper_data/storage/memory.py +136 -0
  50. juniper_data/storage/postgres_store.py +373 -0
  51. juniper_data/storage/redis_store.py +264 -0
  52. juniper_data/tests/__init__.py +1 -0
  53. juniper_data/tests/conftest.py +68 -0
  54. juniper_data/tests/fixtures/generate_golden_datasets.py +199 -0
  55. juniper_data/tests/integration/__init__.py +1 -0
  56. juniper_data/tests/integration/test_api.py +283 -0
  57. juniper_data/tests/integration/test_e2e_workflow.py +378 -0
  58. juniper_data/tests/integration/test_lifecycle_api.py +304 -0
  59. juniper_data/tests/integration/test_security_integration.py +189 -0
  60. juniper_data/tests/integration/test_storage_workflow.py +259 -0
  61. juniper_data/tests/performance/__init__.py +1 -0
  62. juniper_data/tests/performance/test_generator_benchmarks.py +178 -0
  63. juniper_data/tests/performance/test_storage_benchmarks.py +257 -0
  64. juniper_data/tests/unit/__init__.py +1 -0
  65. juniper_data/tests/unit/test_api_app.py +206 -0
  66. juniper_data/tests/unit/test_api_routes.py +407 -0
  67. juniper_data/tests/unit/test_api_settings.py +100 -0
  68. juniper_data/tests/unit/test_arc_agi_generator.py +525 -0
  69. juniper_data/tests/unit/test_artifacts.py +145 -0
  70. juniper_data/tests/unit/test_cached_store.py +423 -0
  71. juniper_data/tests/unit/test_checkerboard_generator.py +232 -0
  72. juniper_data/tests/unit/test_circles_generator.py +256 -0
  73. juniper_data/tests/unit/test_csv_import_generator.py +345 -0
  74. juniper_data/tests/unit/test_dataset_id.py +181 -0
  75. juniper_data/tests/unit/test_gaussian_generator.py +333 -0
  76. juniper_data/tests/unit/test_hf_store.py +416 -0
  77. juniper_data/tests/unit/test_init.py +93 -0
  78. juniper_data/tests/unit/test_kaggle_store.py +469 -0
  79. juniper_data/tests/unit/test_lifecycle.py +394 -0
  80. juniper_data/tests/unit/test_main.py +127 -0
  81. juniper_data/tests/unit/test_middleware.py +79 -0
  82. juniper_data/tests/unit/test_mnist_generator.py +370 -0
  83. juniper_data/tests/unit/test_postgres_store.py +490 -0
  84. juniper_data/tests/unit/test_redis_store.py +500 -0
  85. juniper_data/tests/unit/test_security.py +281 -0
  86. juniper_data/tests/unit/test_security_boundaries.py +517 -0
  87. juniper_data/tests/unit/test_spiral_generator.py +566 -0
  88. juniper_data/tests/unit/test_split.py +245 -0
  89. juniper_data/tests/unit/test_storage.py +767 -0
  90. juniper_data/tests/unit/test_xor_generator.py +223 -0
  91. juniper_data-0.4.2.dist-info/METADATA +216 -0
  92. juniper_data-0.4.2.dist-info/RECORD +95 -0
  93. juniper_data-0.4.2.dist-info/WHEEL +5 -0
  94. juniper_data-0.4.2.dist-info/licenses/LICENSE +9 -0
  95. juniper_data-0.4.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,767 @@
1
+ """Unit tests for storage module."""
2
+
3
+ import contextlib
4
+ import io
5
+ import tempfile
6
+ from datetime import UTC, datetime
7
+ from pathlib import Path
8
+
9
+ import numpy as np
10
+ import pytest
11
+
12
+ from juniper_data.core.models import DatasetMeta
13
+ from juniper_data.storage import DatasetStore, InMemoryDatasetStore, LocalFSDatasetStore
14
+
15
+
16
@pytest.fixture
def sample_meta() -> DatasetMeta:
    """Provide a fixed two-class spiral ``DatasetMeta`` shared by the store tests."""
    # Collect the fields first so the record can be read as a plain table.
    fields = {
        "dataset_id": "test-dataset-001",
        "generator": "spiral",
        "generator_version": "1.0.0",
        "params": {"n_spirals": 2, "n_points_per_spiral": 100, "noise": 0.1},
        "n_samples": 200,
        "n_features": 2,
        "n_classes": 2,
        "n_train": 160,
        "n_test": 40,
        "class_distribution": {"0": 100, "1": 100},
        "artifact_formats": ["npz"],
        "created_at": datetime(2026, 1, 30, 12, 0, 0),
        "checksum": "abc123",
    }
    return DatasetMeta(**fields)
34
+
35
+
36
@pytest.fixture
def sample_arrays() -> dict[str, np.ndarray]:
    """Create deterministic sample arrays for testing.

    Returns:
        A dict with ``X_train`` (160, 2), ``y_train`` (160, 2), ``X_test``
        (40, 2) and ``y_test`` (40, 2), all ``float32``. The ``y_*`` arrays
        are one-hot rows over two classes.
    """
    # A locally seeded generator keeps every run reproducible and leaves the
    # process-wide NumPy random state untouched.
    rng = np.random.default_rng(20260130)
    one_hot = np.eye(2, dtype=np.float32)

    def _make_split(n_rows: int) -> tuple[np.ndarray, np.ndarray]:
        """Return (features, one-hot labels) with ``n_rows`` rows each."""
        features = rng.standard_normal((n_rows, 2)).astype(np.float32)
        labels = one_hot[rng.integers(0, 2, n_rows)]
        return features, labels

    X_train, y_train = _make_split(160)
    X_test, y_test = _make_split(40)
    return {
        "X_train": X_train,
        "y_train": y_train,
        "X_test": X_test,
        "y_test": y_test,
    }
45
+
46
+
47
@pytest.fixture
def memory_store() -> InMemoryDatasetStore:
    """Provide a newly constructed ``InMemoryDatasetStore`` for each test."""
    store = InMemoryDatasetStore()
    return store
51
+
52
+
53
@pytest.fixture
def temp_dir():
    """Yield a ``Path`` to a temporary directory that is removed after the test."""
    scratch = tempfile.TemporaryDirectory()
    try:
        yield Path(scratch.name)
    finally:
        # Same teardown the context-manager form performs on exit.
        scratch.cleanup()
58
+
59
+
60
@pytest.fixture
def fs_store(temp_dir: Path) -> LocalFSDatasetStore:
    """Provide a ``LocalFSDatasetStore`` rooted at the per-test temporary directory."""
    store = LocalFSDatasetStore(temp_dir)
    return store
64
+
65
+
66
class TestInMemoryDatasetStore:
    """Behavioural tests for ``InMemoryDatasetStore``."""

    @pytest.mark.unit
    def test_init_creates_empty_store(self, memory_store: InMemoryDatasetStore):
        """A newly built store lists no datasets."""
        listed = memory_store.list_datasets()
        assert listed == []

    @pytest.mark.unit
    def test_save_and_get_meta(
        self, memory_store: InMemoryDatasetStore, sample_meta: DatasetMeta, sample_arrays: dict[str, np.ndarray]
    ):
        """Metadata read back after ``save`` matches what was stored."""
        memory_store.save("ds-001", sample_meta, sample_arrays)
        fetched = memory_store.get_meta("ds-001")

        assert fetched is not None
        assert fetched.dataset_id == sample_meta.dataset_id
        assert fetched.generator == sample_meta.generator
        assert fetched.n_samples == sample_meta.n_samples

    @pytest.mark.unit
    def test_get_meta_nonexistent(self, memory_store: InMemoryDatasetStore):
        """``get_meta`` yields ``None`` for an unknown id."""
        missing = memory_store.get_meta("nonexistent")
        assert missing is None

    @pytest.mark.unit
    def test_save_and_get_artifact_bytes(
        self, memory_store: InMemoryDatasetStore, sample_meta: DatasetMeta, sample_arrays: dict[str, np.ndarray]
    ):
        """Artifact bytes decode to an archive holding the saved arrays."""
        memory_store.save("ds-001", sample_meta, sample_arrays)
        payload = memory_store.get_artifact_bytes("ds-001")

        assert payload is not None
        assert len(payload) > 0

        archive = np.load(io.BytesIO(payload))
        assert set(archive.files) == set(sample_arrays.keys())
        for name, expected in sample_arrays.items():
            np.testing.assert_array_almost_equal(archive[name], expected)

    @pytest.mark.unit
    def test_get_artifact_bytes_nonexistent(self, memory_store: InMemoryDatasetStore):
        """``get_artifact_bytes`` yields ``None`` for an unknown id."""
        missing = memory_store.get_artifact_bytes("nonexistent")
        assert missing is None

    @pytest.mark.unit
    def test_exists_true(
        self, memory_store: InMemoryDatasetStore, sample_meta: DatasetMeta, sample_arrays: dict[str, np.ndarray]
    ):
        """``exists`` is ``True`` once a dataset has been saved."""
        memory_store.save("ds-001", sample_meta, sample_arrays)
        present = memory_store.exists("ds-001")
        assert present is True

    @pytest.mark.unit
    def test_exists_false(self, memory_store: InMemoryDatasetStore):
        """``exists`` is ``False`` for an unknown id."""
        present = memory_store.exists("nonexistent")
        assert present is False

    @pytest.mark.unit
    def test_delete_existing(
        self, memory_store: InMemoryDatasetStore, sample_meta: DatasetMeta, sample_arrays: dict[str, np.ndarray]
    ):
        """Deleting a saved dataset returns ``True`` and removes it."""
        memory_store.save("ds-001", sample_meta, sample_arrays)
        outcome = memory_store.delete("ds-001")
        assert outcome is True
        assert memory_store.exists("ds-001") is False
        assert memory_store.get_meta("ds-001") is None

    @pytest.mark.unit
    def test_delete_nonexistent(self, memory_store: InMemoryDatasetStore):
        """Deleting an unknown id returns ``False``."""
        outcome = memory_store.delete("nonexistent")
        assert outcome is False

    @pytest.mark.unit
    def test_list_datasets_empty(self, memory_store: InMemoryDatasetStore):
        """Listing an empty store gives an empty list."""
        listed = memory_store.list_datasets()
        assert listed == []

    @pytest.mark.unit
    def test_list_datasets_multiple(
        self, memory_store: InMemoryDatasetStore, sample_meta: DatasetMeta, sample_arrays: dict[str, np.ndarray]
    ):
        """Five saved datasets are all listed, in sorted order."""
        for ds_id in [f"ds-00{i}" for i in range(5)]:
            memory_store.save(ds_id, sample_meta, sample_arrays)

        listed = memory_store.list_datasets()
        assert len(listed) == 5
        assert listed == sorted(listed)

    @pytest.mark.unit
    def test_list_datasets_with_limit(
        self, memory_store: InMemoryDatasetStore, sample_meta: DatasetMeta, sample_arrays: dict[str, np.ndarray]
    ):
        """``limit`` caps the number of ids returned."""
        for ds_id in [f"ds-{i:03d}" for i in range(10)]:
            memory_store.save(ds_id, sample_meta, sample_arrays)

        page = memory_store.list_datasets(limit=3)
        assert len(page) == 3

    @pytest.mark.unit
    def test_list_datasets_with_offset(
        self, memory_store: InMemoryDatasetStore, sample_meta: DatasetMeta, sample_arrays: dict[str, np.ndarray]
    ):
        """``offset`` skips the leading ids."""
        for ds_id in [f"ds-{i:03d}" for i in range(10)]:
            memory_store.save(ds_id, sample_meta, sample_arrays)

        page = memory_store.list_datasets(offset=5)
        assert len(page) == 5
        assert page[0] == "ds-005"

    @pytest.mark.unit
    def test_list_datasets_with_limit_and_offset(
        self, memory_store: InMemoryDatasetStore, sample_meta: DatasetMeta, sample_arrays: dict[str, np.ndarray]
    ):
        """``limit`` and ``offset`` together select a window of ids."""
        for ds_id in [f"ds-{i:03d}" for i in range(10)]:
            memory_store.save(ds_id, sample_meta, sample_arrays)

        page = memory_store.list_datasets(limit=3, offset=2)
        assert len(page) == 3
        assert page == ["ds-002", "ds-003", "ds-004"]

    @pytest.mark.unit
    def test_clear(
        self, memory_store: InMemoryDatasetStore, sample_meta: DatasetMeta, sample_arrays: dict[str, np.ndarray]
    ):
        """``clear`` empties a populated store."""
        for ds_id in [f"ds-00{i}" for i in range(5)]:
            memory_store.save(ds_id, sample_meta, sample_arrays)

        assert len(memory_store.list_datasets()) == 5
        memory_store.clear()
        assert len(memory_store.list_datasets()) == 0

    @pytest.mark.unit
    def test_save_copies_arrays(
        self, memory_store: InMemoryDatasetStore, sample_meta: DatasetMeta, sample_arrays: dict[str, np.ndarray]
    ):
        """Mutating the caller's array after ``save`` does not alter stored data."""
        memory_store.save("ds-001", sample_meta, sample_arrays)

        train_features = sample_arrays["X_train"]
        value_before = train_features[0, 0].copy()
        train_features[0, 0] = 999.0

        payload = memory_store.get_artifact_bytes("ds-001")
        assert payload is not None
        archive = np.load(io.BytesIO(payload))
        assert archive["X_train"][0, 0] == value_before

    @pytest.mark.unit
    def test_overwrite_existing(
        self, memory_store: InMemoryDatasetStore, sample_meta: DatasetMeta, sample_arrays: dict[str, np.ndarray]
    ):
        """A second ``save`` under the same id replaces the stored metadata."""
        memory_store.save("ds-001", sample_meta, sample_arrays)

        replacement_arrays = {"X": np.zeros((10, 2), dtype=np.float32)}
        replacement_meta = DatasetMeta(
            dataset_id="ds-001-updated",
            generator="spiral",
            generator_version="2.0.0",
            params={},
            n_samples=100,
            n_features=2,
            n_classes=2,
            n_train=80,
            n_test=20,
            class_distribution={"0": 50, "1": 50},
            created_at=datetime.now(),
        )

        memory_store.save("ds-001", replacement_meta, replacement_arrays)

        fetched = memory_store.get_meta("ds-001")
        assert fetched is not None
        assert fetched.generator_version == "2.0.0"

    @pytest.mark.unit
    def test_record_access_nonexistent_noop(self, memory_store: InMemoryDatasetStore):
        """``record_access`` on an unknown id does not create the dataset."""
        memory_store.record_access("nonexistent-id")
        assert not memory_store.exists("nonexistent-id")
255
+
256
+
257
class TestLocalFSDatasetStore:
    """Tests for LocalFSDatasetStore.

    The on-disk layout exercised here is ``<id>.meta.json`` plus ``<id>.npz``
    directly under ``base_path``.
    """

    @pytest.mark.unit
    def test_init_creates_directory(self, temp_dir: Path):
        """Test that initialization creates the base directory."""
        subdir = temp_dir / "datasets" / "nested"
        store = LocalFSDatasetStore(subdir)
        assert subdir.exists()
        assert store.base_path == subdir

    @pytest.mark.unit
    def test_save_creates_files(
        self, fs_store: LocalFSDatasetStore, sample_meta: DatasetMeta, sample_arrays: dict[str, np.ndarray]
    ):
        """Test that save creates meta and npz files."""
        fs_store.save("ds-001", sample_meta, sample_arrays)

        meta_path = fs_store.base_path / "ds-001.meta.json"
        npz_path = fs_store.base_path / "ds-001.npz"

        assert meta_path.exists()
        assert npz_path.exists()

    @pytest.mark.unit
    def test_save_and_get_meta(
        self, fs_store: LocalFSDatasetStore, sample_meta: DatasetMeta, sample_arrays: dict[str, np.ndarray]
    ):
        """Test saving and retrieving metadata."""
        fs_store.save("ds-001", sample_meta, sample_arrays)
        retrieved = fs_store.get_meta("ds-001")

        assert retrieved is not None
        assert retrieved.dataset_id == sample_meta.dataset_id
        assert retrieved.generator == sample_meta.generator
        assert retrieved.n_samples == sample_meta.n_samples
        assert retrieved.created_at == sample_meta.created_at

    @pytest.mark.unit
    def test_get_meta_nonexistent(self, fs_store: LocalFSDatasetStore):
        """Test getting metadata for nonexistent dataset returns None."""
        assert fs_store.get_meta("nonexistent") is None

    @pytest.mark.unit
    def test_save_and_get_artifact_bytes(
        self, fs_store: LocalFSDatasetStore, sample_meta: DatasetMeta, sample_arrays: dict[str, np.ndarray]
    ):
        """Test saving and retrieving artifact bytes, including array contents."""
        fs_store.save("ds-001", sample_meta, sample_arrays)
        artifact_bytes = fs_store.get_artifact_bytes("ds-001")

        assert artifact_bytes is not None
        assert len(artifact_bytes) > 0

        loaded = np.load(io.BytesIO(artifact_bytes))
        assert set(loaded.files) == set(sample_arrays.keys())
        # Check the payload too, as the in-memory store test does, so a
        # corrupted archive with the right keys cannot pass.
        for key, expected in sample_arrays.items():
            np.testing.assert_array_almost_equal(loaded[key], expected)

    @pytest.mark.unit
    def test_get_artifact_bytes_nonexistent(self, fs_store: LocalFSDatasetStore):
        """Test getting artifact bytes for nonexistent dataset returns None."""
        assert fs_store.get_artifact_bytes("nonexistent") is None

    @pytest.mark.unit
    def test_exists_true(
        self, fs_store: LocalFSDatasetStore, sample_meta: DatasetMeta, sample_arrays: dict[str, np.ndarray]
    ):
        """Test exists returns True for saved dataset."""
        fs_store.save("ds-001", sample_meta, sample_arrays)
        assert fs_store.exists("ds-001") is True

    @pytest.mark.unit
    def test_exists_false(self, fs_store: LocalFSDatasetStore):
        """Test exists returns False for nonexistent dataset."""
        assert fs_store.exists("nonexistent") is False

    @pytest.mark.unit
    def test_exists_partial_files(
        self, fs_store: LocalFSDatasetStore, sample_meta: DatasetMeta, sample_arrays: dict[str, np.ndarray]
    ):
        """Test exists returns False when only one file exists."""
        fs_store.save("ds-001", sample_meta, sample_arrays)

        # Drop the array archive so only the metadata file is left.
        (fs_store.base_path / "ds-001.npz").unlink()
        assert fs_store.exists("ds-001") is False

    @pytest.mark.unit
    def test_delete_existing(
        self, fs_store: LocalFSDatasetStore, sample_meta: DatasetMeta, sample_arrays: dict[str, np.ndarray]
    ):
        """Test deleting an existing dataset returns True."""
        fs_store.save("ds-001", sample_meta, sample_arrays)
        deleted = fs_store.delete("ds-001")
        assert deleted is True
        assert fs_store.exists("ds-001") is False

        assert not (fs_store.base_path / "ds-001.meta.json").exists()
        assert not (fs_store.base_path / "ds-001.npz").exists()

    @pytest.mark.unit
    def test_delete_nonexistent(self, fs_store: LocalFSDatasetStore):
        """Test deleting a nonexistent dataset returns False."""
        deleted = fs_store.delete("nonexistent")
        assert deleted is False

    @pytest.mark.unit
    def test_delete_partial_files(
        self, fs_store: LocalFSDatasetStore, sample_meta: DatasetMeta, sample_arrays: dict[str, np.ndarray]
    ):
        """Test deleting when only meta file exists."""
        fs_store.save("ds-001", sample_meta, sample_arrays)

        # Create the partial state this test is named for: without this step
        # both files are present and the test only repeats the plain delete.
        # NOTE(review): assumes delete() reports True when only the meta file
        # remains, mirroring the npz-only edge-case test; confirm against the
        # store implementation.
        (fs_store.base_path / "ds-001.npz").unlink()

        deleted = fs_store.delete("ds-001")
        assert deleted is True
        # Nothing is left, so a second delete reports that nothing was removed.
        assert fs_store.delete("ds-001") is False
        assert not (fs_store.base_path / "ds-001.meta.json").exists()

    @pytest.mark.unit
    def test_list_datasets_empty(self, fs_store: LocalFSDatasetStore):
        """Test listing datasets in empty store."""
        assert fs_store.list_datasets() == []

    @pytest.mark.unit
    def test_list_datasets_multiple(
        self, fs_store: LocalFSDatasetStore, sample_meta: DatasetMeta, sample_arrays: dict[str, np.ndarray]
    ):
        """Test listing multiple datasets."""
        for i in range(5):
            fs_store.save(f"ds-00{i}", sample_meta, sample_arrays)

        datasets = fs_store.list_datasets()
        assert len(datasets) == 5
        assert datasets == sorted(datasets)

    @pytest.mark.unit
    def test_list_datasets_with_limit(
        self, fs_store: LocalFSDatasetStore, sample_meta: DatasetMeta, sample_arrays: dict[str, np.ndarray]
    ):
        """Test listing datasets with limit."""
        for i in range(10):
            fs_store.save(f"ds-{i:03d}", sample_meta, sample_arrays)

        datasets = fs_store.list_datasets(limit=3)
        assert len(datasets) == 3

    @pytest.mark.unit
    def test_list_datasets_with_offset(
        self, fs_store: LocalFSDatasetStore, sample_meta: DatasetMeta, sample_arrays: dict[str, np.ndarray]
    ):
        """Test listing datasets with offset."""
        for i in range(10):
            fs_store.save(f"ds-{i:03d}", sample_meta, sample_arrays)

        datasets = fs_store.list_datasets(offset=5)
        assert len(datasets) == 5
        assert datasets[0] == "ds-005"

    @pytest.mark.unit
    def test_list_datasets_with_limit_and_offset(
        self, fs_store: LocalFSDatasetStore, sample_meta: DatasetMeta, sample_arrays: dict[str, np.ndarray]
    ):
        """Test listing datasets with both limit and offset."""
        for i in range(10):
            fs_store.save(f"ds-{i:03d}", sample_meta, sample_arrays)

        datasets = fs_store.list_datasets(limit=3, offset=2)
        assert len(datasets) == 3
        assert datasets == ["ds-002", "ds-003", "ds-004"]

    @pytest.mark.unit
    def test_base_path_property(self, temp_dir: Path):
        """Test base_path property returns correct path."""
        store = LocalFSDatasetStore(temp_dir)
        assert store.base_path == temp_dir

    @pytest.mark.unit
    def test_datetime_serialization(self, fs_store: LocalFSDatasetStore, sample_arrays: dict[str, np.ndarray]):
        """Test that datetime is properly serialized and deserialized."""
        specific_time = datetime(2026, 6, 15, 10, 30, 45)
        meta = DatasetMeta(
            dataset_id="dt-test",
            generator="test",
            generator_version="1.0.0",
            params={},
            n_samples=100,
            n_features=2,
            n_classes=2,
            n_train=80,
            n_test=20,
            class_distribution={"0": 50, "1": 50},
            created_at=specific_time,
        )

        fs_store.save("dt-test", meta, sample_arrays)
        retrieved = fs_store.get_meta("dt-test")

        assert retrieved is not None
        assert retrieved.created_at == specific_time
454
+
455
+
456
class TestDatasetStoreInterface:
    """Checks that the concrete stores are instances of ``DatasetStore``."""

    @pytest.mark.unit
    def test_memory_store_is_dataset_store(self, memory_store: InMemoryDatasetStore):
        """The in-memory store is a ``DatasetStore``."""
        conforms = isinstance(memory_store, DatasetStore)
        assert conforms

    @pytest.mark.unit
    def test_fs_store_is_dataset_store(self, fs_store: LocalFSDatasetStore):
        """The local filesystem store is a ``DatasetStore``."""
        conforms = isinstance(fs_store, DatasetStore)
        assert conforms
468
+
469
+
470
class TestLocalFSUpdateAndList:
    """Tests for LocalFSDatasetStore update_meta, list_all_metadata and save failure paths."""

    @pytest.mark.unit
    def test_update_meta(self, fs_store, sample_meta, sample_arrays):
        """Test update_meta updates metadata."""
        fs_store.save("ds-upd", sample_meta, sample_arrays)
        new_meta = DatasetMeta(
            dataset_id="ds-upd-v2",
            generator="spiral",
            generator_version="2.0.0",
            params={},
            n_samples=200,
            n_features=2,
            n_classes=2,
            n_train=160,
            n_test=40,
            class_distribution={"0": 100, "1": 100},
            created_at=datetime(2026, 2, 1, 12, 0, 0),
        )
        result = fs_store.update_meta("ds-upd", new_meta)
        assert result is True
        retrieved = fs_store.get_meta("ds-upd")
        # Guard first so a missing record fails as a clear assertion, not as
        # an AttributeError on None; sibling tests use the same guard.
        assert retrieved is not None
        assert retrieved.generator_version == "2.0.0"

    @pytest.mark.unit
    def test_update_meta_nonexistent(self, fs_store, sample_meta):
        """Test update_meta returns False for nonexistent dataset."""
        assert fs_store.update_meta("nonexistent", sample_meta) is False

    @pytest.mark.unit
    def test_list_all_metadata(self, fs_store, sample_meta, sample_arrays):
        """Test list_all_metadata returns all stored metadata."""
        fs_store.save("ds-a", sample_meta, sample_arrays)
        fs_store.save("ds-b", sample_meta, sample_arrays)
        result = fs_store.list_all_metadata()
        assert len(result) == 2

    @pytest.mark.unit
    def test_list_all_metadata_empty(self, fs_store):
        """Test list_all_metadata returns empty list when no datasets."""
        result = fs_store.list_all_metadata()
        assert result == []

    @pytest.mark.unit
    def test_save_error_cleanup(self, fs_store, sample_meta, sample_arrays):
        """Test save cleans up temp files on error."""
        from unittest.mock import patch

        # Make the array write fail and expect the error to reach the caller.
        with (
            patch("numpy.savez_compressed", side_effect=OSError("disk full")),
            pytest.raises(OSError, match="disk full"),
        ):
            fs_store.save("ds-fail", sample_meta, sample_arrays)

        # No "*.tmp" leftovers may remain in the store directory.
        tmp_files = list(fs_store.base_path.glob("*.tmp"))
        assert not tmp_files

    @pytest.mark.unit
    def test_save_cleanup_oserror_suppressed(self, fs_store, sample_meta, sample_arrays):
        """Test that an OSError raised while cleaning up temp files does not mask the original save error."""
        from unittest.mock import patch

        path_cls = type(fs_store.base_path)
        original_replace = path_cls.replace
        original_unlink = path_cls.unlink

        def failing_replace(self_path, target):
            # Fail only the rename of the temporary array archive.
            if str(self_path).endswith(".npz.tmp"):
                raise OSError("Simulated disk error during replace")
            return original_replace(self_path, target)

        def failing_unlink(self_path, missing_ok=False):
            # Fail removal of any temporary file so cleanup itself errors.
            if str(self_path).endswith(".tmp"):
                raise OSError("Simulated permission error during cleanup")
            return original_unlink(self_path, missing_ok=missing_ok)

        # The replace error must surface, not the secondary cleanup error.
        # IOError is an alias of OSError, so OSError is used directly.
        with (
            patch.object(path_cls, "replace", failing_replace),
            patch.object(path_cls, "unlink", failing_unlink),
            pytest.raises(OSError, match="Simulated disk error"),
        ):
            fs_store.save("ds-cleanup-err", sample_meta, sample_arrays)
548
+
549
+
550
class TestLocalFSEdgeCases:
    """Edge-case coverage for ``LocalFSDatasetStore``."""

    @pytest.mark.unit
    def test_json_serializer_raises_for_unknown_type(self):
        """``_json_serializer`` rejects an arbitrary object with ``TypeError``."""
        from juniper_data.storage.local_fs import _json_serializer

        unsupported = object()
        with pytest.raises(TypeError) as exc_info:
            _json_serializer(unsupported)

        message = str(exc_info.value)
        assert "not JSON serializable" in message

    @pytest.mark.unit
    def test_get_meta_skips_datetime_conversion_for_non_string(
        self, fs_store: LocalFSDatasetStore, sample_arrays: dict[str, np.ndarray]
    ):
        """``get_meta`` still returns a record when the stored ``created_at`` is an integer."""
        import json

        stored_meta = DatasetMeta(
            dataset_id="test-date-type",
            generator="test",
            generator_version="1.0.0",
            params={},
            n_samples=100,
            n_features=2,
            n_classes=2,
            n_train=80,
            n_test=20,
            class_distribution={"0": 50, "1": 50},
            created_at=datetime(2026, 1, 30, 12, 0, 0),
        )

        fs_store.save("test-date-type", stored_meta, sample_arrays)

        # Rewrite the metadata file on disk with a non-string timestamp.
        on_disk = fs_store._meta_path("test-date-type")
        raw = json.loads(on_disk.read_text())
        assert isinstance(raw["created_at"], str)
        raw["created_at"] = 1234567890
        on_disk.write_text(json.dumps(raw))

        fetched = fs_store.get_meta("test-date-type")
        assert fetched is not None

    @pytest.mark.unit
    def test_delete_only_npz_exists(
        self, fs_store: LocalFSDatasetStore, sample_meta: DatasetMeta, sample_arrays: dict[str, np.ndarray]
    ):
        """``delete`` returns ``True`` and removes the archive when the meta file is already gone."""
        fs_store.save("ds-partial-npz", sample_meta, sample_arrays)

        root = fs_store.base_path
        (root / "ds-partial-npz.meta.json").unlink()

        outcome = fs_store.delete("ds-partial-npz")
        assert outcome is True
        assert not (root / "ds-partial-npz.npz").exists()

    @pytest.mark.unit
    def test_get_meta_with_timezone_aware_datetime(
        self, fs_store: LocalFSDatasetStore, sample_arrays: dict[str, np.ndarray]
    ):
        """A timezone-aware ``created_at`` survives a save/load round trip (year and month checked)."""
        aware_stamp = datetime(2026, 6, 15, 10, 30, 45, tzinfo=UTC)
        stored_meta = DatasetMeta(
            dataset_id="tz-test",
            generator="test",
            generator_version="1.0.0",
            params={},
            n_samples=100,
            n_features=2,
            n_classes=2,
            n_train=80,
            n_test=20,
            class_distribution={"0": 50, "1": 50},
            created_at=aware_stamp,
        )

        fs_store.save("tz-test", stored_meta, sample_arrays)
        fetched = fs_store.get_meta("tz-test")

        assert fetched is not None
        stamp = fetched.created_at
        assert stamp is not None
        assert stamp.year == 2026
        assert stamp.month == 6
636
+
637
+
638
class TestDatasetStoreAbstractMethods:
    """Checks on the abstract contract declared by ``DatasetStore``."""

    @pytest.mark.unit
    def test_cannot_instantiate_abstract_base(self):
        """Instantiating ``DatasetStore`` directly raises ``TypeError``."""
        with pytest.raises(TypeError):
            DatasetStore()

    @pytest.mark.unit
    def test_abstract_methods_exist(self):
        """Every expected store operation is flagged abstract on ``DatasetStore``."""
        import inspect

        functions = inspect.getmembers(DatasetStore, predicate=inspect.isfunction)
        declared_abstract = []
        for name, member in functions:
            if getattr(member, "__isabstractmethod__", False):
                declared_abstract.append(name)

        for method in ("save", "get_meta", "get_artifact_bytes", "exists", "delete", "list_datasets"):
            assert method in declared_abstract, f"Missing abstract method: {method}"
661
+
662
+
663
class TestStorageModuleFactories:
    """Tests for storage module factory functions and imports."""

    @pytest.mark.unit
    def test_storage_module_exports(self):
        """Test that the storage module exports expected classes."""
        from juniper_data.storage import CachedDatasetStore, DatasetStore, InMemoryDatasetStore, LocalFSDatasetStore

        assert DatasetStore is not None
        assert InMemoryDatasetStore is not None
        assert LocalFSDatasetStore is not None
        assert CachedDatasetStore is not None

    @pytest.mark.unit
    def test_optional_imports_are_none_or_class(self):
        """Test that optional store classes are None or importable."""
        import juniper_data.storage as storage_mod

        for attr in ["RedisDatasetStore", "HuggingFaceDatasetStore", "PostgresDatasetStore", "KaggleDatasetStore"]:
            val = getattr(storage_mod, attr, None)
            assert val is None or isinstance(val, type)

    @pytest.mark.unit
    def test_get_redis_store_raises_import_error(self):
        """Test get_redis_store either succeeds or fails only with ImportError.

        ImportError is tolerated, not required; any other exception fails the test.
        """
        from juniper_data.storage import get_redis_store

        with contextlib.suppress(ImportError):
            get_redis_store()

    @pytest.mark.unit
    def test_get_hf_store_raises_import_error(self):
        """Test get_hf_store either succeeds or fails only with ImportError.

        ImportError is tolerated, not required; any other exception fails the test.
        """
        from juniper_data.storage import get_hf_store

        with contextlib.suppress(ImportError):
            get_hf_store()

    @pytest.mark.unit
    def test_get_postgres_store_raises_import_error(self):
        """Test get_postgres_store either succeeds or fails only with ImportError.

        ImportError is tolerated, not required; any other exception fails the test.
        """
        from juniper_data.storage import get_postgres_store

        with contextlib.suppress(ImportError):
            get_postgres_store()

    @pytest.mark.unit
    def test_get_kaggle_store_raises_import_error(self):
        """Test get_kaggle_store either succeeds or fails only with ImportError.

        ImportError is tolerated, not required; any other exception fails the test.
        """
        from juniper_data.storage import get_kaggle_store

        with contextlib.suppress(ImportError):
            get_kaggle_store()

    @pytest.mark.unit
    def test_optional_imports_fallback_to_none(self):
        """Test that optional imports fall back to None when packages are missing.

        The storage package is re-imported while ``builtins.__import__``
        rejects the optional backend modules. Afterwards the original module
        objects are restored, both in ``sys.modules`` and on the parent
        package.
        """
        import builtins
        import importlib
        import sys

        modules_to_block = [
            "juniper_data.storage.redis_store",
            "juniper_data.storage.hf_store",
            "juniper_data.storage.postgres_store",
            "juniper_data.storage.kaggle_store",
        ]

        # Evict the cached optional modules and the storage package itself so
        # the import below re-executes the package's import logic.
        saved = {mod: sys.modules.pop(mod, None) for mod in modules_to_block}
        saved["juniper_data.storage"] = sys.modules.pop("juniper_data.storage", None)
        parent_pkg = sys.modules.get("juniper_data")

        original_import = builtins.__import__

        def blocking_import(name, *args, **kwargs):
            # Simulate a missing optional dependency for the blocked modules.
            if name in modules_to_block:
                raise ImportError(f"Mocked: {name} not installed")
            return original_import(name, *args, **kwargs)

        try:
            with pytest.MonkeyPatch.context() as mp:
                mp.setattr(builtins, "__import__", blocking_import)
                storage_mod = importlib.import_module("juniper_data.storage")

                assert storage_mod.RedisDatasetStore is None
                assert storage_mod.HuggingFaceDatasetStore is None
                assert storage_mod.PostgresDatasetStore is None
                assert storage_mod.KaggleDatasetStore is None
                assert "RedisDatasetStore" not in storage_mod.__all__
                assert "HuggingFaceDatasetStore" not in storage_mod.__all__
                assert "PostgresDatasetStore" not in storage_mod.__all__
                assert "KaggleDatasetStore" not in storage_mod.__all__
        finally:
            # Restore original modules
            for mod, val in saved.items():
                if val is not None:
                    sys.modules[mod] = val
                else:
                    sys.modules.pop(mod, None)
            # Importing a submodule also binds it as an attribute on its
            # parent package, so the re-import above left
            # ``juniper_data.storage`` pointing at the degraded module. Rebind
            # the original so later ``import juniper_data.storage as x`` sees it.
            original_storage = saved["juniper_data.storage"]
            if parent_pkg is not None and original_storage is not None:
                setattr(parent_pkg, "storage", original_storage)
            # Force reimport to restore normal state
            importlib.import_module("juniper_data.storage")