juniper-data 0.4.2__tar.gz → 0.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {juniper_data-0.4.2 → juniper_data-0.6.0}/PKG-INFO +69 -22
- {juniper_data-0.4.2 → juniper_data-0.6.0}/README.md +63 -19
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/__init__.py +1 -1
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/api/app.py +47 -17
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/api/middleware.py +48 -0
- juniper_data-0.6.0/juniper_data/api/models/__init__.py +1 -0
- juniper_data-0.6.0/juniper_data/api/models/health.py +26 -0
- juniper_data-0.6.0/juniper_data/api/observability.py +227 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/api/routes/datasets.py +228 -5
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/api/routes/generators.py +8 -16
- juniper_data-0.6.0/juniper_data/api/routes/health.py +76 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/api/settings.py +49 -2
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/core/models.py +83 -0
- juniper_data-0.6.0/juniper_data/core/secrets.py +25 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/core/split.py +3 -10
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/generators/arc_agi/generator.py +1 -4
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/generators/csv_import/generator.py +10 -17
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/generators/spiral/params.py +1 -3
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/storage/base.py +79 -5
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/storage/kaggle_store.py +1 -3
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/storage/postgres_store.py +88 -14
- juniper_data-0.6.0/juniper_data/tests/api/__init__.py +1 -0
- juniper_data-0.6.0/juniper_data/tests/api/test_batch_operations.py +449 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/tests/integration/test_api.py +4 -2
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/tests/integration/test_lifecycle_api.py +4 -12
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/tests/integration/test_storage_workflow.py +1 -3
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/tests/unit/test_api_app.py +8 -7
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/tests/unit/test_api_routes.py +5 -5
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/tests/unit/test_api_settings.py +5 -5
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/tests/unit/test_arc_agi_generator.py +5 -15
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/tests/unit/test_checkerboard_generator.py +1 -3
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/tests/unit/test_csv_import_generator.py +71 -89
- juniper_data-0.6.0/juniper_data/tests/unit/test_dataset_versioning.py +663 -0
- juniper_data-0.6.0/juniper_data/tests/unit/test_health_enhanced.py +138 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/tests/unit/test_hf_store.py +7 -21
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/tests/unit/test_main.py +6 -4
- juniper_data-0.6.0/juniper_data/tests/unit/test_observability.py +352 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/tests/unit/test_postgres_store.py +207 -6
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/tests/unit/test_redis_store.py +3 -9
- juniper_data-0.6.0/juniper_data/tests/unit/test_secrets.py +85 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/tests/unit/test_security_boundaries.py +14 -21
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/tests/unit/test_storage.py +26 -80
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data.egg-info/PKG-INFO +69 -22
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data.egg-info/SOURCES.txt +10 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data.egg-info/requires.txt +6 -2
- {juniper_data-0.4.2 → juniper_data-0.6.0}/pyproject.toml +13 -6
- juniper_data-0.4.2/juniper_data/api/routes/health.py +0 -49
- {juniper_data-0.4.2 → juniper_data-0.6.0}/LICENSE +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/__main__.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/api/__init__.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/api/routes/__init__.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/api/security.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/core/__init__.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/core/artifacts.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/core/dataset_id.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/generators/__init__.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/generators/arc_agi/__init__.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/generators/arc_agi/params.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/generators/checkerboard/__init__.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/generators/checkerboard/generator.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/generators/checkerboard/params.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/generators/circles/__init__.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/generators/circles/generator.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/generators/circles/params.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/generators/csv_import/__init__.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/generators/csv_import/params.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/generators/gaussian/__init__.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/generators/gaussian/generator.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/generators/gaussian/params.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/generators/mnist/__init__.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/generators/mnist/generator.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/generators/mnist/params.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/generators/spiral/__init__.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/generators/spiral/defaults.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/generators/spiral/generator.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/generators/xor/__init__.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/generators/xor/generator.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/generators/xor/params.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/storage/__init__.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/storage/cached.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/storage/hf_store.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/storage/local_fs.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/storage/memory.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/storage/redis_store.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/tests/__init__.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/tests/conftest.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/tests/fixtures/generate_golden_datasets.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/tests/integration/__init__.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/tests/integration/test_e2e_workflow.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/tests/integration/test_security_integration.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/tests/performance/__init__.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/tests/performance/test_generator_benchmarks.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/tests/performance/test_storage_benchmarks.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/tests/unit/__init__.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/tests/unit/test_artifacts.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/tests/unit/test_cached_store.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/tests/unit/test_circles_generator.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/tests/unit/test_dataset_id.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/tests/unit/test_gaussian_generator.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/tests/unit/test_init.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/tests/unit/test_kaggle_store.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/tests/unit/test_lifecycle.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/tests/unit/test_middleware.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/tests/unit/test_mnist_generator.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/tests/unit/test_security.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/tests/unit/test_spiral_generator.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/tests/unit/test_split.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data/tests/unit/test_xor_generator.py +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data.egg-info/dependency_links.txt +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/juniper_data.egg-info/top_level.txt +0 -0
- {juniper_data-0.4.2 → juniper_data-0.6.0}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: juniper-data
|
|
3
|
-
Version: 0.4.2
|
|
3
|
+
Version: 0.6.0
|
|
4
4
|
Summary: Dataset generation and management service for the Juniper ecosystem
|
|
5
5
|
Author: Paul Calnon
|
|
6
6
|
License: MIT
|
|
@@ -25,14 +25,17 @@ Requires-Dist: pytest-benchmark>=4.0.0; extra == "test"
|
|
|
25
25
|
Requires-Dist: httpx>=0.24.0; extra == "test"
|
|
26
26
|
Requires-Dist: coverage[toml]>=7.0.0; extra == "test"
|
|
27
27
|
Requires-Dist: juniper-data-client>=0.3.0; extra == "test"
|
|
28
|
+
Provides-Extra: observability
|
|
29
|
+
Requires-Dist: prometheus-client>=0.20.0; extra == "observability"
|
|
30
|
+
Requires-Dist: sentry-sdk[fastapi]>=2.0.0; extra == "observability"
|
|
28
31
|
Provides-Extra: dev
|
|
29
32
|
Requires-Dist: ruff>=0.9.0; extra == "dev"
|
|
30
33
|
Requires-Dist: mypy>=1.0.0; extra == "dev"
|
|
31
|
-
Requires-Dist: bandit[sarif]>=1.
|
|
34
|
+
Requires-Dist: bandit[sarif]>=1.9.4; extra == "dev"
|
|
32
35
|
Requires-Dist: pip-audit>=2.7.0; extra == "dev"
|
|
33
36
|
Requires-Dist: pre-commit>=3.0.0; extra == "dev"
|
|
34
37
|
Provides-Extra: all
|
|
35
|
-
Requires-Dist: juniper-data[api,arc-agi,dev,test]; extra == "all"
|
|
38
|
+
Requires-Dist: juniper-data[api,arc-agi,dev,observability,test]; extra == "all"
|
|
36
39
|
Dynamic: license-file
|
|
37
40
|
|
|
38
41
|
# Juniper Data
|
|
@@ -52,7 +55,7 @@ Verified compatible versions:
|
|
|
52
55
|
|---|---|---|---|---|---|
|
|
53
56
|
| 0.4.x | 0.3.x | 0.2.x | >=0.3.1 | >=0.1.0 | >=0.1.0 |
|
|
54
57
|
|
|
55
|
-
For full-stack Docker deployment and integration tests, see
|
|
58
|
+
For full-stack Docker deployment and integration tests, see `juniper-deploy`.
|
|
56
59
|
|
|
57
60
|
## Architecture
|
|
58
61
|
|
|
@@ -94,10 +97,20 @@ JuniperData is the **foundational data layer** of the Juniper ecosystem. Juniper
|
|
|
94
97
|
|
|
95
98
|
```bash
|
|
96
99
|
# Full stack with all three services:
|
|
97
|
-
git clone https://github.com/pcalnon/juniper-deploy.git
|
|
100
|
+
git clone https://github.com/pcalnon/juniper-deploy.git # (private repository)
|
|
98
101
|
cd juniper-deploy && docker compose up --build
|
|
99
102
|
```
|
|
100
103
|
|
|
104
|
+
## Dependency Lockfile
|
|
105
|
+
|
|
106
|
+
The `requirements.lock` file pins exact dependency versions for reproducible Docker builds. The `pyproject.toml` retains flexible `>=` ranges for local development.
|
|
107
|
+
|
|
108
|
+
**Regenerate after changing dependencies in `pyproject.toml`:**
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
uv pip compile pyproject.toml --extra api --extra observability -o requirements.lock
|
|
112
|
+
```
|
|
113
|
+
|
|
101
114
|
## Installation
|
|
102
115
|
|
|
103
116
|
### Basic Installation
|
|
@@ -143,28 +156,62 @@ uvicorn juniper_data.api.app:app --reload
|
|
|
143
156
|
|
|
144
157
|
## API Endpoints
|
|
145
158
|
|
|
146
|
-
| Endpoint
|
|
147
|
-
|
|
|
148
|
-
| `/v1/health`
|
|
149
|
-
| `/v1/
|
|
150
|
-
| `/v1/
|
|
151
|
-
| `/v1/generators
|
|
152
|
-
| `/v1/generators/
|
|
159
|
+
| Endpoint | Method | Description |
|
|
160
|
+
| ------------------------------------- | ------ | ---------------------------------------------------- |
|
|
161
|
+
| `/v1/health` | GET | Health check |
|
|
162
|
+
| `/v1/health/live` | GET | Liveness probe |
|
|
163
|
+
| `/v1/health/ready` | GET | Readiness probe (checks storage) |
|
|
164
|
+
| `/v1/generators` | GET | List all generators with schemas |
|
|
165
|
+
| `/v1/generators/{name}/schema` | GET | Get parameter schema for a generator |
|
|
166
|
+
| `/v1/datasets` | POST | Create dataset (or return cached dataset) |
|
|
167
|
+
| `/v1/datasets` | GET | List dataset IDs |
|
|
168
|
+
| `/v1/datasets/filter` | GET | Filter metadata by generator/tags/date/name/version |
|
|
169
|
+
| `/v1/datasets/stats` | GET | Aggregate dataset statistics |
|
|
170
|
+
| `/v1/datasets/versions` | GET | List all versions for a logical dataset name |
|
|
171
|
+
| `/v1/datasets/latest` | GET | Get latest version for a logical dataset name |
|
|
172
|
+
| `/v1/datasets/batch-create` | POST | Create multiple datasets |
|
|
173
|
+
| `/v1/datasets/batch-delete` | POST | Delete multiple datasets |
|
|
174
|
+
| `/v1/datasets/batch-tags` | PATCH | Update tags on multiple datasets |
|
|
175
|
+
| `/v1/datasets/batch-export` | POST | Export multiple datasets as ZIP |
|
|
176
|
+
| `/v1/datasets/cleanup-expired` | POST | Delete expired datasets |
|
|
177
|
+
| `/v1/datasets/{id}` | GET | Get dataset metadata |
|
|
178
|
+
| `/v1/datasets/{id}` | DELETE | Delete a dataset |
|
|
179
|
+
| `/v1/datasets/{id}/artifact` | GET | Download NPZ artifact |
|
|
180
|
+
| `/v1/datasets/{id}/preview` | GET | Preview first N samples as JSON |
|
|
181
|
+
| `/v1/datasets/{id}/tags` | PATCH | Add/remove tags on one dataset |
|
|
182
|
+
|
|
183
|
+
See [docs/api/JUNIPER_DATA_API.md](docs/api/JUNIPER_DATA_API.md) for full endpoint documentation including filtering, batch operations, and tagging.
|
|
184
|
+
|
|
185
|
+
### Named Dataset Versioning
|
|
186
|
+
|
|
187
|
+
`POST /v1/datasets` supports logical names for versioned datasets:
|
|
188
|
+
|
|
189
|
+
- Set `name` to group related datasets into a version series.
|
|
190
|
+
- Persisted creates with the same `name` auto-increment `meta.dataset_version` (`1`, `2`, `3`, ...).
|
|
191
|
+
- Repeating an identical request returns the cached dataset and keeps its existing version.
|
|
192
|
+
- Use `GET /v1/datasets/versions?name=<dataset_name>` to view history and `GET /v1/datasets/latest?name=<dataset_name>` to resolve the latest.
|
|
153
193
|
|
|
154
194
|
## Project Structure
|
|
155
195
|
|
|
156
196
|
```bash
|
|
157
|
-
|
|
197
|
+
juniper-data/
|
|
158
198
|
├── juniper_data/
|
|
159
199
|
│ ├── core/ # Core functionality and base classes
|
|
160
|
-
│ ├── generators/ # Dataset generators
|
|
161
|
-
│ │
|
|
200
|
+
│ ├── generators/ # Dataset generators (8 types)
|
|
201
|
+
│ │ ├── spiral/ # Multi-spiral classification
|
|
202
|
+
│ │ ├── xor/ # XOR classification
|
|
203
|
+
│ │ ├── gaussian/ # Mixture of Gaussians
|
|
204
|
+
│ │ ├── circles/ # Concentric circles
|
|
205
|
+
│ │ ├── checkerboard/ # 2D checkerboard pattern
|
|
206
|
+
│ │ ├── csv_import/ # CSV/JSON file import
|
|
207
|
+
│ │ ├── mnist/ # MNIST / Fashion-MNIST
|
|
208
|
+
│ │ └── arc_agi/ # ARC-AGI visual reasoning
|
|
162
209
|
│ ├── storage/ # Dataset persistence layer
|
|
163
|
-
│
|
|
164
|
-
│
|
|
165
|
-
|
|
166
|
-
│
|
|
167
|
-
│
|
|
210
|
+
│ ├── api/ # FastAPI application
|
|
211
|
+
│ │ └── routes/ # API route handlers
|
|
212
|
+
│ └── tests/ # Test suite
|
|
213
|
+
│ ├── unit/ # Unit tests
|
|
214
|
+
│ └── integration/ # Integration tests
|
|
168
215
|
├── pyproject.toml # Project configuration
|
|
169
216
|
└── README.md # This file
|
|
170
217
|
```
|
|
@@ -186,8 +233,8 @@ pytest --cov=juniper_data --cov-report=html
|
|
|
186
233
|
### Code Formatting
|
|
187
234
|
|
|
188
235
|
```bash
|
|
189
|
-
|
|
190
|
-
|
|
236
|
+
ruff format juniper_data tests
|
|
237
|
+
ruff check --fix juniper_data tests
|
|
191
238
|
```
|
|
192
239
|
|
|
193
240
|
### Type Checking
|
|
@@ -15,7 +15,7 @@ Verified compatible versions:
|
|
|
15
15
|
|---|---|---|---|---|---|
|
|
16
16
|
| 0.4.x | 0.3.x | 0.2.x | >=0.3.1 | >=0.1.0 | >=0.1.0 |
|
|
17
17
|
|
|
18
|
-
For full-stack Docker deployment and integration tests, see
|
|
18
|
+
For full-stack Docker deployment and integration tests, see `juniper-deploy`.
|
|
19
19
|
|
|
20
20
|
## Architecture
|
|
21
21
|
|
|
@@ -57,10 +57,20 @@ JuniperData is the **foundational data layer** of the Juniper ecosystem. Juniper
|
|
|
57
57
|
|
|
58
58
|
```bash
|
|
59
59
|
# Full stack with all three services:
|
|
60
|
-
git clone https://github.com/pcalnon/juniper-deploy.git
|
|
60
|
+
git clone https://github.com/pcalnon/juniper-deploy.git # (private repository)
|
|
61
61
|
cd juniper-deploy && docker compose up --build
|
|
62
62
|
```
|
|
63
63
|
|
|
64
|
+
## Dependency Lockfile
|
|
65
|
+
|
|
66
|
+
The `requirements.lock` file pins exact dependency versions for reproducible Docker builds. The `pyproject.toml` retains flexible `>=` ranges for local development.
|
|
67
|
+
|
|
68
|
+
**Regenerate after changing dependencies in `pyproject.toml`:**
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
uv pip compile pyproject.toml --extra api --extra observability -o requirements.lock
|
|
72
|
+
```
|
|
73
|
+
|
|
64
74
|
## Installation
|
|
65
75
|
|
|
66
76
|
### Basic Installation
|
|
@@ -106,28 +116,62 @@ uvicorn juniper_data.api.app:app --reload
|
|
|
106
116
|
|
|
107
117
|
## API Endpoints
|
|
108
118
|
|
|
109
|
-
| Endpoint
|
|
110
|
-
|
|
|
111
|
-
| `/v1/health`
|
|
112
|
-
| `/v1/
|
|
113
|
-
| `/v1/
|
|
114
|
-
| `/v1/generators
|
|
115
|
-
| `/v1/generators/
|
|
119
|
+
| Endpoint | Method | Description |
|
|
120
|
+
| ------------------------------------- | ------ | ---------------------------------------------------- |
|
|
121
|
+
| `/v1/health` | GET | Health check |
|
|
122
|
+
| `/v1/health/live` | GET | Liveness probe |
|
|
123
|
+
| `/v1/health/ready` | GET | Readiness probe (checks storage) |
|
|
124
|
+
| `/v1/generators` | GET | List all generators with schemas |
|
|
125
|
+
| `/v1/generators/{name}/schema` | GET | Get parameter schema for a generator |
|
|
126
|
+
| `/v1/datasets` | POST | Create dataset (or return cached dataset) |
|
|
127
|
+
| `/v1/datasets` | GET | List dataset IDs |
|
|
128
|
+
| `/v1/datasets/filter` | GET | Filter metadata by generator/tags/date/name/version |
|
|
129
|
+
| `/v1/datasets/stats` | GET | Aggregate dataset statistics |
|
|
130
|
+
| `/v1/datasets/versions` | GET | List all versions for a logical dataset name |
|
|
131
|
+
| `/v1/datasets/latest` | GET | Get latest version for a logical dataset name |
|
|
132
|
+
| `/v1/datasets/batch-create` | POST | Create multiple datasets |
|
|
133
|
+
| `/v1/datasets/batch-delete` | POST | Delete multiple datasets |
|
|
134
|
+
| `/v1/datasets/batch-tags` | PATCH | Update tags on multiple datasets |
|
|
135
|
+
| `/v1/datasets/batch-export` | POST | Export multiple datasets as ZIP |
|
|
136
|
+
| `/v1/datasets/cleanup-expired` | POST | Delete expired datasets |
|
|
137
|
+
| `/v1/datasets/{id}` | GET | Get dataset metadata |
|
|
138
|
+
| `/v1/datasets/{id}` | DELETE | Delete a dataset |
|
|
139
|
+
| `/v1/datasets/{id}/artifact` | GET | Download NPZ artifact |
|
|
140
|
+
| `/v1/datasets/{id}/preview` | GET | Preview first N samples as JSON |
|
|
141
|
+
| `/v1/datasets/{id}/tags` | PATCH | Add/remove tags on one dataset |
|
|
142
|
+
|
|
143
|
+
See [docs/api/JUNIPER_DATA_API.md](docs/api/JUNIPER_DATA_API.md) for full endpoint documentation including filtering, batch operations, and tagging.
|
|
144
|
+
|
|
145
|
+
### Named Dataset Versioning
|
|
146
|
+
|
|
147
|
+
`POST /v1/datasets` supports logical names for versioned datasets:
|
|
148
|
+
|
|
149
|
+
- Set `name` to group related datasets into a version series.
|
|
150
|
+
- Persisted creates with the same `name` auto-increment `meta.dataset_version` (`1`, `2`, `3`, ...).
|
|
151
|
+
- Repeating an identical request returns the cached dataset and keeps its existing version.
|
|
152
|
+
- Use `GET /v1/datasets/versions?name=<dataset_name>` to view history and `GET /v1/datasets/latest?name=<dataset_name>` to resolve the latest.
|
|
116
153
|
|
|
117
154
|
## Project Structure
|
|
118
155
|
|
|
119
156
|
```bash
|
|
120
|
-
|
|
157
|
+
juniper-data/
|
|
121
158
|
├── juniper_data/
|
|
122
159
|
│ ├── core/ # Core functionality and base classes
|
|
123
|
-
│ ├── generators/ # Dataset generators
|
|
124
|
-
│ │
|
|
160
|
+
│ ├── generators/ # Dataset generators (8 types)
|
|
161
|
+
│ │ ├── spiral/ # Multi-spiral classification
|
|
162
|
+
│ │ ├── xor/ # XOR classification
|
|
163
|
+
│ │ ├── gaussian/ # Mixture of Gaussians
|
|
164
|
+
│ │ ├── circles/ # Concentric circles
|
|
165
|
+
│ │ ├── checkerboard/ # 2D checkerboard pattern
|
|
166
|
+
│ │ ├── csv_import/ # CSV/JSON file import
|
|
167
|
+
│ │ ├── mnist/ # MNIST / Fashion-MNIST
|
|
168
|
+
│ │ └── arc_agi/ # ARC-AGI visual reasoning
|
|
125
169
|
│ ├── storage/ # Dataset persistence layer
|
|
126
|
-
│
|
|
127
|
-
│
|
|
128
|
-
|
|
129
|
-
│
|
|
130
|
-
│
|
|
170
|
+
│ ├── api/ # FastAPI application
|
|
171
|
+
│ │ └── routes/ # API route handlers
|
|
172
|
+
│ └── tests/ # Test suite
|
|
173
|
+
│ ├── unit/ # Unit tests
|
|
174
|
+
│ └── integration/ # Integration tests
|
|
131
175
|
├── pyproject.toml # Project configuration
|
|
132
176
|
└── README.md # This file
|
|
133
177
|
```
|
|
@@ -149,8 +193,8 @@ pytest --cov=juniper_data --cov-report=html
|
|
|
149
193
|
### Code Formatting
|
|
150
194
|
|
|
151
195
|
```bash
|
|
152
|
-
|
|
153
|
-
|
|
196
|
+
ruff format juniper_data tests
|
|
197
|
+
ruff check --fix juniper_data tests
|
|
154
198
|
```
|
|
155
199
|
|
|
156
200
|
### Type Checking
|
|
@@ -12,7 +12,15 @@ from fastapi.responses import JSONResponse
|
|
|
12
12
|
from juniper_data import __version__
|
|
13
13
|
from juniper_data.storage import LocalFSDatasetStore
|
|
14
14
|
|
|
15
|
-
from .middleware import SecurityMiddleware
|
|
15
|
+
from .middleware import RequestBodyLimitMiddleware, SecurityHeadersMiddleware, SecurityMiddleware
|
|
16
|
+
from .observability import (
|
|
17
|
+
PrometheusMiddleware,
|
|
18
|
+
RequestIdMiddleware,
|
|
19
|
+
configure_logging,
|
|
20
|
+
configure_sentry,
|
|
21
|
+
get_prometheus_app,
|
|
22
|
+
set_build_info,
|
|
23
|
+
)
|
|
16
24
|
from .routes import datasets, generators, health
|
|
17
25
|
from .security import APIKeyAuth, RateLimiter
|
|
18
26
|
from .settings import Settings, get_settings
|
|
@@ -26,10 +34,11 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
|
|
|
26
34
|
store = LocalFSDatasetStore(storage_path)
|
|
27
35
|
datasets.set_store(store)
|
|
28
36
|
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
37
|
+
configure_logging(settings.log_level, settings.log_format, "juniper-data")
|
|
38
|
+
configure_sentry(settings.sentry_dsn, "juniper-data", __version__, send_pii=settings.sentry_send_pii, traces_sample_rate=settings.sentry_traces_sample_rate)
|
|
39
|
+
if settings.metrics_enabled:
|
|
40
|
+
set_build_info("juniper_data", __version__)
|
|
41
|
+
|
|
33
42
|
logger = logging.getLogger("juniper_data")
|
|
34
43
|
logger.info(f"JuniperData API v{__version__} starting")
|
|
35
44
|
logger.info(f"Storage path: {storage_path.absolute()}")
|
|
@@ -52,28 +61,37 @@ def create_app(settings: Settings | None = None) -> FastAPI:
|
|
|
52
61
|
if settings is None:
|
|
53
62
|
settings = get_settings()
|
|
54
63
|
|
|
64
|
+
# Disable interactive API docs when authentication is enabled (production).
|
|
65
|
+
docs_enabled = not settings.api_keys
|
|
55
66
|
app = FastAPI(
|
|
56
67
|
title="Juniper Data API",
|
|
57
68
|
description="Dataset generation and management service for the Juniper ecosystem",
|
|
58
69
|
version=__version__,
|
|
59
70
|
lifespan=lifespan,
|
|
71
|
+
docs_url="/docs" if docs_enabled else None,
|
|
72
|
+
redoc_url="/redoc" if docs_enabled else None,
|
|
73
|
+
openapi_url="/openapi.json" if docs_enabled else None,
|
|
60
74
|
)
|
|
61
75
|
|
|
62
76
|
app.state.settings = settings
|
|
63
77
|
|
|
64
|
-
#
|
|
65
|
-
# Browsers do not permit Access-Control-Allow-Credentials: true with a wildcard
|
|
66
|
-
# origin (Access-Control-Allow-Origin: "*"), so the default ["*"] intentionally
|
|
67
|
-
# disables credentials unless concrete origins are configured.
|
|
78
|
+
# CORS: only enable when origins are explicitly configured.
|
|
68
79
|
allow_credentials = bool(settings.cors_origins) and "*" not in settings.cors_origins
|
|
69
80
|
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
81
|
+
if settings.cors_origins:
|
|
82
|
+
app.add_middleware(
|
|
83
|
+
CORSMiddleware,
|
|
84
|
+
allow_origins=settings.cors_origins,
|
|
85
|
+
allow_credentials=allow_credentials,
|
|
86
|
+
allow_methods=["*"],
|
|
87
|
+
allow_headers=["*"],
|
|
88
|
+
)
|
|
89
|
+
|
|
90
|
+
# Request body size limit
|
|
91
|
+
app.add_middleware(RequestBodyLimitMiddleware)
|
|
92
|
+
|
|
93
|
+
# Security headers (outermost — runs on every response)
|
|
94
|
+
app.add_middleware(SecurityHeadersMiddleware)
|
|
77
95
|
|
|
78
96
|
api_key_auth = APIKeyAuth(settings.api_keys)
|
|
79
97
|
rate_limiter = RateLimiter(
|
|
@@ -86,15 +104,27 @@ def create_app(settings: Settings | None = None) -> FastAPI:
|
|
|
86
104
|
rate_limiter=rate_limiter,
|
|
87
105
|
)
|
|
88
106
|
|
|
107
|
+
# Observability middleware (added after SecurityMiddleware, before CORS)
|
|
108
|
+
# Middleware execution is LIFO: last added runs first.
|
|
109
|
+
# Order: RequestIdMiddleware → PrometheusMiddleware → SecurityMiddleware → SecurityHeaders → CORS
|
|
110
|
+
if settings.metrics_enabled:
|
|
111
|
+
app.add_middleware(PrometheusMiddleware, service_name="juniper-data", namespace="juniper_data")
|
|
112
|
+
app.add_middleware(RequestIdMiddleware)
|
|
113
|
+
|
|
89
114
|
app.include_router(health.router, prefix="/v1")
|
|
90
115
|
app.include_router(generators.router, prefix="/v1")
|
|
91
116
|
app.include_router(datasets.router, prefix="/v1")
|
|
92
117
|
|
|
118
|
+
# Mount Prometheus metrics endpoint
|
|
119
|
+
if settings.metrics_enabled:
|
|
120
|
+
app.mount("/metrics", get_prometheus_app())
|
|
121
|
+
|
|
93
122
|
@app.exception_handler(ValueError)
|
|
94
123
|
async def value_error_handler(request: Request, exc: ValueError) -> JSONResponse:
|
|
124
|
+
logging.getLogger("juniper_data").debug("Validation error: %s", exc)
|
|
95
125
|
return JSONResponse(
|
|
96
126
|
status_code=400,
|
|
97
|
-
content={"detail":
|
|
127
|
+
content={"detail": "Invalid request parameters"},
|
|
98
128
|
)
|
|
99
129
|
|
|
100
130
|
@app.exception_handler(Exception)
|
|
@@ -16,6 +16,54 @@ EXEMPT_PATHS = {
|
|
|
16
16
|
"/redoc",
|
|
17
17
|
}
|
|
18
18
|
|
|
19
|
+
# Default Content-Security-Policy for API-only services.
|
|
20
|
+
_DEFAULT_CSP = "default-src 'none'; frame-ancestors 'none'"
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class SecurityHeadersMiddleware(BaseHTTPMiddleware):
|
|
24
|
+
"""Add security headers to all responses.
|
|
25
|
+
|
|
26
|
+
Injects standard security headers (X-Content-Type-Options, X-Frame-Options,
|
|
27
|
+
Referrer-Policy, Permissions-Policy, CSP, and conditional HSTS) into every
|
|
28
|
+
HTTP response.
|
|
29
|
+
"""
|
|
30
|
+
|
|
31
|
+
def __init__(self, app: ASGIApp, content_security_policy: str = _DEFAULT_CSP) -> None:
|
|
32
|
+
super().__init__(app)
|
|
33
|
+
self._csp = content_security_policy
|
|
34
|
+
|
|
35
|
+
async def dispatch(self, request: Request, call_next: RequestResponseEndpoint) -> Response:
|
|
36
|
+
response = await call_next(request)
|
|
37
|
+
|
|
38
|
+
response.headers["X-Content-Type-Options"] = "nosniff"
|
|
39
|
+
response.headers["X-Frame-Options"] = "DENY"
|
|
40
|
+
response.headers["Referrer-Policy"] = "strict-origin-when-cross-origin"
|
|
41
|
+
response.headers["Permissions-Policy"] = "camera=(), microphone=(), geolocation=()"
|
|
42
|
+
response.headers["Content-Security-Policy"] = self._csp
|
|
43
|
+
|
|
44
|
+
# Only add HSTS when the request arrived over TLS (via reverse proxy)
|
|
45
|
+
if request.headers.get("X-Forwarded-Proto") == "https":
|
|
46
|
+
response.headers["Strict-Transport-Security"] = "max-age=31536000; includeSubDomains"
|
|
47
|
+
|
|
48
|
+
return response
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
_MAX_REQUEST_BODY_BYTES = 10 * 1024 * 1024 # 10 MB
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class RequestBodyLimitMiddleware(BaseHTTPMiddleware):
|
|
55
|
+
"""Reject requests whose Content-Length exceeds a configurable limit."""
|
|
56
|
+
|
|
57
|
+
def __init__(self, app: ASGIApp, max_bytes: int = _MAX_REQUEST_BODY_BYTES) -> None:
|
|
58
|
+
super().__init__(app)
|
|
59
|
+
self._max_bytes = max_bytes
|
|
60
|
+
|
|
61
|
+
async def dispatch(self, request: Request, call_next: RequestResponseEndpoint) -> Response:
|
|
62
|
+
content_length = request.headers.get("content-length")
|
|
63
|
+
if content_length is not None and int(content_length) > self._max_bytes:
|
|
64
|
+
return JSONResponse(status_code=413, content={"detail": "Request body too large"})
|
|
65
|
+
return await call_next(request)
|
|
66
|
+
|
|
19
67
|
|
|
20
68
|
class SecurityMiddleware(BaseHTTPMiddleware):
|
|
21
69
|
"""Middleware for API key authentication and rate limiting.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""API response models."""
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""Health check response models for standardized readiness reporting."""
|
|
2
|
+
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from typing import Literal
|
|
5
|
+
|
|
6
|
+
from pydantic import BaseModel, Field
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class DependencyStatus(BaseModel):
|
|
10
|
+
"""Health status of a single dependency."""
|
|
11
|
+
|
|
12
|
+
name: str
|
|
13
|
+
status: Literal["healthy", "unhealthy", "degraded", "not_configured"]
|
|
14
|
+
latency_ms: float | None = None
|
|
15
|
+
message: str | None = None
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class ReadinessResponse(BaseModel):
|
|
19
|
+
"""Standard /v1/health/ready response for all Juniper services."""
|
|
20
|
+
|
|
21
|
+
status: Literal["ready", "degraded", "not_ready"]
|
|
22
|
+
version: str
|
|
23
|
+
service: str
|
|
24
|
+
timestamp: float = Field(default_factory=lambda: datetime.now().timestamp())
|
|
25
|
+
dependencies: dict[str, DependencyStatus] = {}
|
|
26
|
+
details: dict[str, object] = {}
|