juniper-data 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95) hide show
  1. juniper_data/__init__.py +88 -0
  2. juniper_data/__main__.py +78 -0
  3. juniper_data/api/__init__.py +10 -0
  4. juniper_data/api/app.py +111 -0
  5. juniper_data/api/middleware.py +95 -0
  6. juniper_data/api/routes/__init__.py +9 -0
  7. juniper_data/api/routes/datasets.py +414 -0
  8. juniper_data/api/routes/generators.py +125 -0
  9. juniper_data/api/routes/health.py +49 -0
  10. juniper_data/api/security.py +238 -0
  11. juniper_data/api/settings.py +109 -0
  12. juniper_data/core/__init__.py +32 -0
  13. juniper_data/core/artifacts.py +63 -0
  14. juniper_data/core/dataset_id.py +38 -0
  15. juniper_data/core/models.py +135 -0
  16. juniper_data/core/split.py +120 -0
  17. juniper_data/generators/__init__.py +15 -0
  18. juniper_data/generators/arc_agi/__init__.py +11 -0
  19. juniper_data/generators/arc_agi/generator.py +229 -0
  20. juniper_data/generators/arc_agi/params.py +56 -0
  21. juniper_data/generators/checkerboard/__init__.py +15 -0
  22. juniper_data/generators/checkerboard/generator.py +114 -0
  23. juniper_data/generators/checkerboard/params.py +32 -0
  24. juniper_data/generators/circles/__init__.py +11 -0
  25. juniper_data/generators/circles/generator.py +112 -0
  26. juniper_data/generators/circles/params.py +31 -0
  27. juniper_data/generators/csv_import/__init__.py +15 -0
  28. juniper_data/generators/csv_import/generator.py +198 -0
  29. juniper_data/generators/csv_import/params.py +48 -0
  30. juniper_data/generators/gaussian/__init__.py +11 -0
  31. juniper_data/generators/gaussian/generator.py +149 -0
  32. juniper_data/generators/gaussian/params.py +53 -0
  33. juniper_data/generators/mnist/__init__.py +11 -0
  34. juniper_data/generators/mnist/generator.py +124 -0
  35. juniper_data/generators/mnist/params.py +39 -0
  36. juniper_data/generators/spiral/__init__.py +57 -0
  37. juniper_data/generators/spiral/defaults.py +39 -0
  38. juniper_data/generators/spiral/generator.py +206 -0
  39. juniper_data/generators/spiral/params.py +148 -0
  40. juniper_data/generators/xor/__init__.py +11 -0
  41. juniper_data/generators/xor/generator.py +162 -0
  42. juniper_data/generators/xor/params.py +30 -0
  43. juniper_data/storage/__init__.py +120 -0
  44. juniper_data/storage/base.py +279 -0
  45. juniper_data/storage/cached.py +211 -0
  46. juniper_data/storage/hf_store.py +257 -0
  47. juniper_data/storage/kaggle_store.py +333 -0
  48. juniper_data/storage/local_fs.py +232 -0
  49. juniper_data/storage/memory.py +136 -0
  50. juniper_data/storage/postgres_store.py +373 -0
  51. juniper_data/storage/redis_store.py +264 -0
  52. juniper_data/tests/__init__.py +1 -0
  53. juniper_data/tests/conftest.py +68 -0
  54. juniper_data/tests/fixtures/generate_golden_datasets.py +199 -0
  55. juniper_data/tests/integration/__init__.py +1 -0
  56. juniper_data/tests/integration/test_api.py +283 -0
  57. juniper_data/tests/integration/test_e2e_workflow.py +378 -0
  58. juniper_data/tests/integration/test_lifecycle_api.py +304 -0
  59. juniper_data/tests/integration/test_security_integration.py +189 -0
  60. juniper_data/tests/integration/test_storage_workflow.py +259 -0
  61. juniper_data/tests/performance/__init__.py +1 -0
  62. juniper_data/tests/performance/test_generator_benchmarks.py +178 -0
  63. juniper_data/tests/performance/test_storage_benchmarks.py +257 -0
  64. juniper_data/tests/unit/__init__.py +1 -0
  65. juniper_data/tests/unit/test_api_app.py +206 -0
  66. juniper_data/tests/unit/test_api_routes.py +407 -0
  67. juniper_data/tests/unit/test_api_settings.py +100 -0
  68. juniper_data/tests/unit/test_arc_agi_generator.py +525 -0
  69. juniper_data/tests/unit/test_artifacts.py +145 -0
  70. juniper_data/tests/unit/test_cached_store.py +423 -0
  71. juniper_data/tests/unit/test_checkerboard_generator.py +232 -0
  72. juniper_data/tests/unit/test_circles_generator.py +256 -0
  73. juniper_data/tests/unit/test_csv_import_generator.py +345 -0
  74. juniper_data/tests/unit/test_dataset_id.py +181 -0
  75. juniper_data/tests/unit/test_gaussian_generator.py +333 -0
  76. juniper_data/tests/unit/test_hf_store.py +416 -0
  77. juniper_data/tests/unit/test_init.py +93 -0
  78. juniper_data/tests/unit/test_kaggle_store.py +469 -0
  79. juniper_data/tests/unit/test_lifecycle.py +394 -0
  80. juniper_data/tests/unit/test_main.py +127 -0
  81. juniper_data/tests/unit/test_middleware.py +79 -0
  82. juniper_data/tests/unit/test_mnist_generator.py +370 -0
  83. juniper_data/tests/unit/test_postgres_store.py +490 -0
  84. juniper_data/tests/unit/test_redis_store.py +500 -0
  85. juniper_data/tests/unit/test_security.py +281 -0
  86. juniper_data/tests/unit/test_security_boundaries.py +517 -0
  87. juniper_data/tests/unit/test_spiral_generator.py +566 -0
  88. juniper_data/tests/unit/test_split.py +245 -0
  89. juniper_data/tests/unit/test_storage.py +767 -0
  90. juniper_data/tests/unit/test_xor_generator.py +223 -0
  91. juniper_data-0.4.2.dist-info/METADATA +216 -0
  92. juniper_data-0.4.2.dist-info/RECORD +95 -0
  93. juniper_data-0.4.2.dist-info/WHEEL +5 -0
  94. juniper_data-0.4.2.dist-info/licenses/LICENSE +9 -0
  95. juniper_data-0.4.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,414 @@
1
+ """Dataset endpoints for creating, listing, and retrieving datasets."""
2
+
3
+ import io
4
+ from datetime import UTC, datetime, timedelta
5
+
6
+ import numpy as np
7
+ from fastapi import APIRouter, Depends, HTTPException, Query
8
+ from fastapi.responses import StreamingResponse
9
+
10
+ from juniper_data.core.dataset_id import generate_dataset_id
11
+ from juniper_data.core.models import (
12
+ BatchDeleteRequest,
13
+ BatchDeleteResponse,
14
+ CreateDatasetRequest,
15
+ CreateDatasetResponse,
16
+ DatasetListResponse,
17
+ DatasetMeta,
18
+ DatasetStats,
19
+ PreviewData,
20
+ UpdateTagsRequest,
21
+ )
22
+ from juniper_data.storage import DatasetStore
23
+
24
+ from .generators import GENERATOR_REGISTRY
25
+
26
+ # from typing import List, Optional
27
+
28
+
29
+ router = APIRouter(prefix="/datasets", tags=["datasets"])
30
+
31
+ _store: DatasetStore | None = None
32
+
33
+
34
def get_store() -> DatasetStore:
    """Return the module-level dataset store for use as a FastAPI dependency.

    Returns:
        The store previously registered through ``set_store``.

    Raises:
        HTTPException: 500 if no store has been registered yet.
    """
    store = _store
    if store is not None:
        return store
    raise HTTPException(status_code=500, detail="Storage not initialized")
39
+
40
+
41
def set_store(store: DatasetStore) -> None:
    """Register the dataset store that ``get_store`` hands to the routes.

    Args:
        store: Storage backend to install as the module-level store.
    """
    # Rebinds the module-level variable read by get_store().
    global _store
    _store = store
45
+
46
+
47
@router.post("", response_model=CreateDatasetResponse, status_code=201)
async def create_dataset(
    request: CreateDatasetRequest,
    store: DatasetStore = Depends(get_store),
) -> CreateDatasetResponse:
    """Create or generate a new dataset.

    If a dataset with the same generator, version and parameters already
    exists in the store, its existing metadata is returned without
    regenerating anything (caching behavior).

    Args:
        request: Dataset creation request with generator name and parameters.
        store: Dataset storage backend.

    Returns:
        Dataset metadata and artifact URL.

    Raises:
        HTTPException: 400 if generator not found or parameters invalid.
    """
    if request.generator not in GENERATOR_REGISTRY:
        raise HTTPException(
            status_code=400,
            detail=f"Unknown generator '{request.generator}'. Available: {list(GENERATOR_REGISTRY.keys())}",
        )

    generator_info = GENERATOR_REGISTRY[request.generator]
    generator_class = generator_info["generator"]
    params_class = generator_info["params_class"]
    version = generator_info["version"]

    try:
        params = params_class(**request.params)
    except Exception as e:
        # Chain the original error so the traceback keeps the validation cause.
        raise HTTPException(status_code=400, detail=f"Invalid parameters: {e}") from e

    dataset_id = generate_dataset_id(
        generator=request.generator,
        version=version,
        params=params.model_dump(),
    )

    # Cache hit: identical generator/version/params map to the same ID.
    existing_meta = store.get_meta(dataset_id)
    if existing_meta is not None:
        return CreateDatasetResponse(
            dataset_id=dataset_id,
            generator=request.generator,
            meta=existing_meta,
            artifact_url=f"/v1/datasets/{dataset_id}/artifact",
        )

    arrays = generator_class.generate(params)

    X_train, X_test = arrays["X_train"], arrays["X_test"]
    y_train, y_test = arrays["y_train"], arrays["y_test"]

    n_train = len(X_train)
    n_test = len(X_test)
    n_samples = n_train + n_test

    # Only stack the splits when the generator did not supply the full labels.
    y_full = arrays["y_full"] if "y_full" in arrays else np.vstack([y_train, y_test])

    if n_train > 0:
        n_features = X_train.shape[1]
        n_classes = y_train.shape[1]
    else:
        # Empty train split: fall back to the test split, then to the
        # historical default of 2 features.
        n_features = X_test.shape[1] if n_test > 0 else 2
        # Parameter classes are not guaranteed to define ``n_spirals``; use it
        # when present and otherwise read the class count off the label matrix.
        spiral_count = getattr(params, "n_spirals", None)
        n_classes = spiral_count if spiral_count is not None else y_full.shape[1]

    # argmax over axis 1 treats each label row as a per-class score vector.
    class_labels = np.argmax(y_full, axis=1)
    unique, counts = np.unique(class_labels, return_counts=True)
    class_distribution = {str(int(k)): int(v) for k, v in zip(unique, counts)}

    now = datetime.now(UTC)
    expires_at = None
    if request.ttl_seconds is not None:
        expires_at = now + timedelta(seconds=request.ttl_seconds)

    meta = DatasetMeta(
        dataset_id=dataset_id,
        generator=request.generator,
        generator_version=version,
        params=params.model_dump(),
        n_samples=n_samples,
        n_features=n_features,
        n_classes=n_classes,
        n_train=n_train,
        n_test=n_test,
        class_distribution=class_distribution,
        artifact_formats=["npz"],
        created_at=now,
        tags=request.tags,
        ttl_seconds=request.ttl_seconds,
        expires_at=expires_at,
    )

    # Datasets are only written to the store when the caller asks for it.
    if request.persist:
        store.save(dataset_id, meta, arrays)

    return CreateDatasetResponse(
        dataset_id=dataset_id,
        generator=request.generator,
        meta=meta,
        artifact_url=f"/v1/datasets/{dataset_id}/artifact",
    )
143
+
144
+
145
@router.get("", response_model=list[str])
async def list_datasets(
    limit: int = Query(default=100, ge=1, le=1000),
    offset: int = Query(default=0, ge=0),
    store: DatasetStore = Depends(get_store),
) -> list[str]:
    """Return one page of dataset IDs from the store.

    Args:
        limit: Maximum number of dataset IDs to return (1-1000, default 100).
        offset: Number of dataset IDs to skip (default 0).
        store: Dataset storage backend.

    Returns:
        List of dataset IDs.
    """
    dataset_ids = store.list_datasets(limit=limit, offset=offset)
    return dataset_ids
162
+
163
+
164
@router.get("/filter", response_model=DatasetListResponse)
async def filter_datasets(
    generator: str | None = Query(default=None, description="Filter by generator name"),
    tags: str | None = Query(default=None, description="Comma-separated list of tags to filter by"),
    tags_match: str = Query(
        default="any", pattern="^(any|all)$", description="Tag matching mode: 'any' (OR) or 'all' (AND)"
    ),
    created_after: datetime | None = Query(default=None, description="Filter by creation date (after)"),
    created_before: datetime | None = Query(default=None, description="Filter by creation date (before)"),
    min_samples: int | None = Query(default=None, ge=1, description="Minimum number of samples"),
    max_samples: int | None = Query(default=None, ge=1, description="Maximum number of samples"),
    include_expired: bool = Query(default=False, description="Include expired datasets"),
    limit: int = Query(default=100, ge=1, le=1000),
    offset: int = Query(default=0, ge=0),
    store: DatasetStore = Depends(get_store),
) -> DatasetListResponse:
    """Filter datasets by various criteria.

    Args:
        generator: Filter by generator name.
        tags: Comma-separated list of tags. Surrounding whitespace is stripped
            and blank entries (e.g. from ``"a,,b"`` or a trailing comma) are
            ignored; if nothing remains, no tag filter is applied.
        tags_match: Tag matching mode: 'any' (OR) or 'all' (AND).
        created_after: Filter by creation date (after).
        created_before: Filter by creation date (before).
        min_samples: Minimum number of samples.
        max_samples: Maximum number of samples.
        include_expired: Include expired datasets.
        limit: Maximum number of results.
        offset: Number of results to skip.
        store: Dataset storage backend.

    Returns:
        Filtered list of dataset metadata with pagination info.
    """
    tag_list = None
    if tags:
        # Drop blank entries so an empty string is never passed on as a tag.
        stripped = (raw.strip() for raw in tags.split(","))
        tag_list = [tag for tag in stripped if tag] or None

    datasets, total = store.filter_datasets(
        generator=generator,
        tags=tag_list,
        tags_match=tags_match,
        created_after=created_after,
        created_before=created_before,
        min_samples=min_samples,
        max_samples=max_samples,
        include_expired=include_expired,
        limit=limit,
        offset=offset,
    )

    return DatasetListResponse(
        datasets=datasets,
        total=total,
        limit=limit,
        offset=offset,
    )
219
+
220
+
221
@router.get("/stats", response_model=DatasetStats)
async def get_dataset_stats(
    store: DatasetStore = Depends(get_store),
) -> DatasetStats:
    """Return aggregate statistics for the stored datasets.

    Args:
        store: Dataset storage backend.

    Returns:
        Dataset statistics built from the mapping returned by the store.
    """
    return DatasetStats(**store.get_stats())  # type: ignore[arg-type]
235
+
236
+
237
@router.post("/batch-delete", response_model=BatchDeleteResponse)
async def batch_delete_datasets(
    request: BatchDeleteRequest,
    store: DatasetStore = Depends(get_store),
) -> BatchDeleteResponse:
    """Delete several datasets with one request.

    Args:
        request: Batch delete request carrying the dataset IDs to remove.
        store: Dataset storage backend.

    Returns:
        Response listing the deleted IDs, the IDs that were not found, and
        the number of deleted datasets.
    """
    removed_ids, missing_ids = store.batch_delete(request.dataset_ids)
    response = BatchDeleteResponse(
        deleted=removed_ids,
        not_found=missing_ids,
        total_deleted=len(removed_ids),
    )
    return response
258
+
259
+
260
@router.post("/cleanup-expired", response_model=list[str])
async def cleanup_expired_datasets(
    store: DatasetStore = Depends(get_store),
) -> list[str]:
    """Ask the store to delete every expired dataset.

    Args:
        store: Dataset storage backend.

    Returns:
        List of deleted dataset IDs, as reported by the store.
    """
    purged_ids = store.delete_expired()
    return purged_ids
273
+
274
+
275
@router.get("/{dataset_id}", response_model=DatasetMeta)
async def get_dataset_metadata(
    dataset_id: str,
    store: DatasetStore = Depends(get_store),
) -> DatasetMeta:
    """Look up the metadata of a single dataset.

    Args:
        dataset_id: Unique dataset identifier.
        store: Dataset storage backend.

    Returns:
        Dataset metadata.

    Raises:
        HTTPException: 404 if dataset not found.
    """
    found = store.get_meta(dataset_id)
    if found is not None:
        return found
    raise HTTPException(status_code=404, detail=f"Dataset '{dataset_id}' not found")
296
+
297
+
298
@router.get("/{dataset_id}/artifact")
async def download_artifact(
    dataset_id: str,
    store: DatasetStore = Depends(get_store),
) -> StreamingResponse:
    """Serve a dataset's NPZ artifact as a file download.

    Args:
        dataset_id: Unique dataset identifier.
        store: Dataset storage backend.

    Returns:
        Streaming response with the NPZ file contents, sent as an attachment
        named ``<dataset_id>.npz``.

    Raises:
        HTTPException: 404 if dataset not found.
    """
    payload = store.get_artifact_bytes(dataset_id)
    if payload is None:
        raise HTTPException(status_code=404, detail=f"Dataset '{dataset_id}' not found")

    download_headers = {"Content-Disposition": f"attachment; filename={dataset_id}.npz"}
    body = io.BytesIO(payload)
    return StreamingResponse(
        body,
        media_type="application/octet-stream",
        headers=download_headers,
    )
324
+
325
+
326
@router.get("/{dataset_id}/preview", response_model=PreviewData)
async def preview_dataset(
    dataset_id: str,
    n: int = Query(default=100, ge=1, le=1000),
    store: DatasetStore = Depends(get_store),
) -> PreviewData:
    """Return the first N samples of a dataset as JSON-friendly lists.

    Args:
        dataset_id: Unique dataset identifier.
        n: Number of samples to preview (default 100, max 1000).
        store: Dataset storage backend.

    Returns:
        Preview data with sample features and labels.

    Raises:
        HTTPException: 404 if dataset not found.
    """
    raw = store.get_artifact_bytes(dataset_id)
    if raw is None:
        raise HTTPException(status_code=404, detail=f"Dataset '{dataset_id}' not found")

    with np.load(io.BytesIO(raw)) as archive:
        # Use the stored full arrays when both exist; otherwise rebuild them
        # by stacking the train split on top of the test split.
        has_full_arrays = "X_full" in archive and "y_full" in archive
        if has_full_arrays:
            features = archive["X_full"]
            labels = archive["y_full"]
        else:
            features = np.vstack([archive["X_train"], archive["X_test"]])
            labels = np.vstack([archive["y_train"], archive["y_test"]])

    # Never report more samples than the dataset holds.
    count = min(n, len(features))

    return PreviewData(
        n_samples=count,
        X_sample=features[:count].tolist(),
        y_sample=labels[:count].tolist(),
    )
364
+
365
+
366
@router.delete("/{dataset_id}", status_code=204)
async def delete_dataset(
    dataset_id: str,
    store: DatasetStore = Depends(get_store),
) -> None:
    """Remove a single dataset from the store.

    Args:
        dataset_id: Unique dataset identifier.
        store: Dataset storage backend.

    Raises:
        HTTPException: 404 if dataset not found.
    """
    if store.delete(dataset_id):
        return
    raise HTTPException(status_code=404, detail=f"Dataset '{dataset_id}' not found")
383
+
384
+
385
@router.patch("/{dataset_id}/tags", response_model=DatasetMeta)
async def update_dataset_tags(
    dataset_id: str,
    request: UpdateTagsRequest,
    store: DatasetStore = Depends(get_store),
) -> DatasetMeta:
    """Apply tag additions and removals to a dataset.

    Additions are applied before removals, so a tag listed in both
    ``add_tags`` and ``remove_tags`` ends up removed. The resulting tags are
    stored in sorted order.

    Args:
        dataset_id: Unique dataset identifier.
        request: Tags to add and/or remove.
        store: Dataset storage backend.

    Returns:
        Updated dataset metadata.

    Raises:
        HTTPException: 404 if dataset not found.
    """
    meta = store.get_meta(dataset_id)
    if meta is None:
        raise HTTPException(status_code=404, detail=f"Dataset '{dataset_id}' not found")

    with_additions = set(meta.tags) | set(request.add_tags)
    meta.tags = sorted(with_additions - set(request.remove_tags))

    store.update_meta(dataset_id, meta)
    return meta
@@ -0,0 +1,125 @@
1
+ """Generator endpoints for listing and describing available generators."""
2
+
3
+ from typing import Any
4
+
5
+ from fastapi import APIRouter, HTTPException
6
+
7
+ from juniper_data.core.models import GeneratorInfo
8
+ from juniper_data.generators.arc_agi import VERSION as ARC_AGI_VERSION
9
+ from juniper_data.generators.arc_agi import ArcAgiGenerator, ArcAgiParams
10
+ from juniper_data.generators.checkerboard import VERSION as CHECKERBOARD_VERSION
11
+ from juniper_data.generators.checkerboard import CheckerboardGenerator, CheckerboardParams
12
+ from juniper_data.generators.circles import VERSION as CIRCLES_VERSION
13
+ from juniper_data.generators.circles import CirclesGenerator, CirclesParams
14
+ from juniper_data.generators.csv_import import VERSION as CSV_IMPORT_VERSION
15
+ from juniper_data.generators.csv_import import CsvImportGenerator, CsvImportParams
16
+ from juniper_data.generators.gaussian import VERSION as GAUSSIAN_VERSION
17
+ from juniper_data.generators.gaussian import GaussianGenerator, GaussianParams
18
+ from juniper_data.generators.mnist import VERSION as MNIST_VERSION
19
+ from juniper_data.generators.mnist import MnistGenerator, MnistParams
20
+ from juniper_data.generators.spiral import VERSION as SPIRAL_VERSION
21
+ from juniper_data.generators.spiral import SpiralGenerator, SpiralParams
22
+ from juniper_data.generators.xor import VERSION as XOR_VERSION
23
+ from juniper_data.generators.xor import XorGenerator, XorParams
24
+
25
+ router = APIRouter(prefix="/generators", tags=["generators"])
26
+
27
# Maps each public generator name to its generator class, parameter model,
# version constant and human-readable description. Insertion order is the
# order in which generators are listed by the API.
GENERATOR_REGISTRY: dict[str, dict[str, Any]] = {
    "spiral": {
        "generator": SpiralGenerator,
        "params_class": SpiralParams,
        "version": SPIRAL_VERSION,
        "description": (
            "Multi-spiral classification dataset generator. "
            "Generates N interleaved spiral arms with configurable points, rotations, and noise."
        ),
    },
    "xor": {
        "generator": XorGenerator,
        "params_class": XorParams,
        "version": XOR_VERSION,
        "description": (
            "XOR classification dataset generator. "
            "Generates points in 4 quadrants with opposite classes in diagonal quadrants."
        ),
    },
    "gaussian": {
        "generator": GaussianGenerator,
        "params_class": GaussianParams,
        "version": GAUSSIAN_VERSION,
        "description": (
            "Gaussian blobs classification dataset generator. "
            "Generates mixture-of-Gaussians with configurable centers and covariance."
        ),
    },
    "circles": {
        "generator": CirclesGenerator,
        "params_class": CirclesParams,
        "version": CIRCLES_VERSION,
        "description": (
            "Concentric circles classification dataset generator. "
            "Generates binary classification with inner and outer circle classes."
        ),
    },
    "checkerboard": {
        "generator": CheckerboardGenerator,
        "params_class": CheckerboardParams,
        "version": CHECKERBOARD_VERSION,
        "description": (
            "Checkerboard pattern classification dataset generator. "
            "Generates 2D grid with alternating class squares."
        ),
    },
    "csv_import": {
        "generator": CsvImportGenerator,
        "params_class": CsvImportParams,
        "version": CSV_IMPORT_VERSION,
        "description": (
            "CSV/JSON import generator for custom datasets. "
            "Import data from CSV or JSON files with configurable feature and label columns."
        ),
    },
    "mnist": {
        "generator": MnistGenerator,
        "params_class": MnistParams,
        "version": MNIST_VERSION,
        "description": (
            "MNIST and Fashion-MNIST dataset generator. "
            "Downloads and prepares standard handwritten digit or fashion item classification datasets."
        ),
    },
    "arc_agi": {
        "generator": ArcAgiGenerator,
        "params_class": ArcAgiParams,
        "version": ARC_AGI_VERSION,
        "description": (
            "ARC-AGI (Abstraction and Reasoning Corpus) dataset generator. "
            "Generates visual reasoning tasks from the ARC benchmark."
        ),
    },
}
85
+
86
+
87
@router.get("", response_model=list[GeneratorInfo])
async def list_generators() -> list[GeneratorInfo]:
    """Describe every generator registered in ``GENERATOR_REGISTRY``.

    Returns:
        One information object per registry entry, in registry order, each
        carrying the generator's name, version, description and the JSON
        schema of its parameter model.
    """
    return [
        GeneratorInfo(
            name=generator_name,
            version=entry["version"],
            description=entry["description"],
            schema=entry["params_class"].model_json_schema(),
        )
        for generator_name, entry in GENERATOR_REGISTRY.items()
    ]
106
+
107
+
108
@router.get("/{name}/schema")
async def get_generator_schema(name: str) -> dict[str, Any]:
    """Return the JSON schema of one generator's parameter model.

    Args:
        name: Generator name (e.g., "spiral").

    Returns:
        JSON schema dictionary describing the generator's parameters.

    Raises:
        HTTPException: 404 if generator not found.
    """
    entry = GENERATOR_REGISTRY.get(name)
    if entry is None:
        raise HTTPException(status_code=404, detail=f"Generator '{name}' not found")

    return entry["params_class"].model_json_schema()
@@ -0,0 +1,49 @@
1
+ """Health check endpoints for container orchestration.
2
+
3
+ Provides three health check endpoints:
4
+ - /v1/health: Combined health check (backward compatible)
5
+ - /v1/health/live: Liveness probe - is the process running?
6
+ - /v1/health/ready: Readiness probe - is the service ready to accept traffic?
7
+ """
8
+
9
+ from fastapi import APIRouter
10
+
11
+ from juniper_data import __version__
12
+
13
+ router = APIRouter(tags=["health"])
14
+
15
+
16
@router.get("/health")
async def health_check() -> dict:
    """Report overall service health (kept for backward compatibility).

    Returns:
        Dictionary with a fixed ``"ok"`` status and the package version.
    """
    payload = {"status": "ok", "version": __version__}
    return payload
24
+
25
+
26
@router.get("/health/live")
async def liveness_probe() -> dict:
    """Answer the liveness probe used by container orchestration.

    Intended for Kubernetes/Docker to decide whether the container should be
    restarted. The handler performs no checks, so a response only shows that
    the process is running and able to serve requests.

    Returns:
        Dictionary with liveness status.
    """
    payload = {"status": "alive"}
    return payload
37
+
38
+
39
@router.get("/health/ready")
async def readiness_probe() -> dict:
    """Answer the readiness probe used by container orchestration.

    Intended for Kubernetes/Docker to decide whether the container can accept
    traffic. The handler performs no checks of its own and always reports
    ``"ready"``.

    Returns:
        Dictionary with readiness status and version.
    """
    payload = {"status": "ready", "version": __version__}
    return payload