radix-edge 2.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- radix_edge-2.0.0/PKG-INFO +24 -0
- radix_edge-2.0.0/pyproject.toml +47 -0
- radix_edge-2.0.0/radix_edge/__init__.py +3 -0
- radix_edge-2.0.0/radix_edge/__main__.py +20 -0
- radix_edge-2.0.0/radix_edge/api/__init__.py +0 -0
- radix_edge-2.0.0/radix_edge/api/app.py +205 -0
- radix_edge-2.0.0/radix_edge/api/auth.py +50 -0
- radix_edge-2.0.0/radix_edge/api/routes_admin.py +82 -0
- radix_edge-2.0.0/radix_edge/api/routes_gpus.py +28 -0
- radix_edge-2.0.0/radix_edge/api/routes_health.py +121 -0
- radix_edge-2.0.0/radix_edge/api/routes_jobs.py +120 -0
- radix_edge-2.0.0/radix_edge/api/schemas.py +30 -0
- radix_edge-2.0.0/radix_edge/cli/__init__.py +0 -0
- radix_edge-2.0.0/radix_edge/cli/main.py +227 -0
- radix_edge-2.0.0/radix_edge/config.py +133 -0
- radix_edge-2.0.0/radix_edge/db.py +232 -0
- radix_edge-2.0.0/radix_edge/enforcer/__init__.py +0 -0
- radix_edge-2.0.0/radix_edge/enforcer/admission.py +107 -0
- radix_edge-2.0.0/radix_edge/enforcer/fairness.py +100 -0
- radix_edge-2.0.0/radix_edge/enforcer/rate_limiter.py +46 -0
- radix_edge-2.0.0/radix_edge/executor/__init__.py +0 -0
- radix_edge-2.0.0/radix_edge/executor/executor.py +249 -0
- radix_edge-2.0.0/radix_edge/executor/gpu_allocator.py +126 -0
- radix_edge-2.0.0/radix_edge/mesh/__init__.py +1 -0
- radix_edge-2.0.0/radix_edge/mesh/coordinator.py +134 -0
- radix_edge-2.0.0/radix_edge/mesh/discovery.py +174 -0
- radix_edge-2.0.0/radix_edge/mesh/peer_client.py +127 -0
- radix_edge-2.0.0/radix_edge/mesh/peer_registry.py +98 -0
- radix_edge-2.0.0/radix_edge/mesh/routes_mesh.py +216 -0
- radix_edge-2.0.0/radix_edge/mesh/security.py +53 -0
- radix_edge-2.0.0/radix_edge/mesh/sync.py +196 -0
- radix_edge-2.0.0/radix_edge/meta/__init__.py +0 -0
- radix_edge-2.0.0/radix_edge/meta/brain_autotuner.py +286 -0
- radix_edge-2.0.0/radix_edge/meta/fep_controller.py +224 -0
- radix_edge-2.0.0/radix_edge/meta/generative_model.py +76 -0
- radix_edge-2.0.0/radix_edge/models.py +140 -0
- radix_edge-2.0.0/radix_edge/observer/__init__.py +0 -0
- radix_edge-2.0.0/radix_edge/observer/gpu_observer.py +256 -0
- radix_edge-2.0.0/radix_edge/observer/gpu_topology.py +101 -0
- radix_edge-2.0.0/radix_edge/recovery.py +130 -0
- radix_edge-2.0.0/radix_edge/scheduler/__init__.py +0 -0
- radix_edge-2.0.0/radix_edge/scheduler/policies.py +90 -0
- radix_edge-2.0.0/radix_edge/scheduler/scheduler.py +161 -0
- radix_edge-2.0.0/radix_edge/scheduler/scoring_model.py +257 -0
- radix_edge-2.0.0/radix_edge/scheduler/sinkhorn.py +77 -0
- radix_edge-2.0.0/radix_edge/upstream/__init__.py +0 -0
- radix_edge-2.0.0/radix_edge/upstream/client.py +84 -0
- radix_edge-2.0.0/radix_edge/upstream/heartbeat.py +62 -0
- radix_edge-2.0.0/radix_edge/upstream/job_sync.py +156 -0
- radix_edge-2.0.0/radix_edge/upstream/token_manager.py +84 -0
- radix_edge-2.0.0/radix_edge/upstream/translate.py +42 -0
- radix_edge-2.0.0/radix_edge.egg-info/PKG-INFO +24 -0
- radix_edge-2.0.0/radix_edge.egg-info/SOURCES.txt +75 -0
- radix_edge-2.0.0/radix_edge.egg-info/dependency_links.txt +1 -0
- radix_edge-2.0.0/radix_edge.egg-info/entry_points.txt +2 -0
- radix_edge-2.0.0/radix_edge.egg-info/requires.txt +21 -0
- radix_edge-2.0.0/radix_edge.egg-info/top_level.txt +1 -0
- radix_edge-2.0.0/setup.cfg +4 -0
- radix_edge-2.0.0/tests/test_brain_autotuner.py +141 -0
- radix_edge-2.0.0/tests/test_enforcer.py +214 -0
- radix_edge-2.0.0/tests/test_executor.py +297 -0
- radix_edge-2.0.0/tests/test_fep_controller.py +196 -0
- radix_edge-2.0.0/tests/test_gpu_integration.py +124 -0
- radix_edge-2.0.0/tests/test_mesh_coordinator.py +178 -0
- radix_edge-2.0.0/tests/test_mesh_discovery.py +158 -0
- radix_edge-2.0.0/tests/test_mesh_registry.py +125 -0
- radix_edge-2.0.0/tests/test_mesh_routes.py +210 -0
- radix_edge-2.0.0/tests/test_mesh_scheduling.py +174 -0
- radix_edge-2.0.0/tests/test_mesh_security.py +86 -0
- radix_edge-2.0.0/tests/test_mesh_sync.py +190 -0
- radix_edge-2.0.0/tests/test_observer.py +337 -0
- radix_edge-2.0.0/tests/test_scoring_model.py +255 -0
- radix_edge-2.0.0/tests/test_security.py +290 -0
- radix_edge-2.0.0/tests/test_skeleton.py +130 -0
- radix_edge-2.0.0/tests/test_upstream_client.py +101 -0
- radix_edge-2.0.0/tests/test_upstream_job_sync.py +225 -0
- radix_edge-2.0.0/tests/test_upstream_token.py +101 -0
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: radix-edge
|
|
3
|
+
Version: 2.0.0
|
|
4
|
+
Summary: Radix Edge Agent — GPU orchestration with information-theoretic scheduling
|
|
5
|
+
Requires-Python: >=3.10
|
|
6
|
+
Requires-Dist: fastapi>=0.109.0
|
|
7
|
+
Requires-Dist: uvicorn[standard]>=0.27.0
|
|
8
|
+
Requires-Dist: pydantic>=2.5.0
|
|
9
|
+
Requires-Dist: pydantic-settings>=2.1.0
|
|
10
|
+
Requires-Dist: prometheus-client>=0.19.0
|
|
11
|
+
Requires-Dist: numpy>=1.24.0
|
|
12
|
+
Requires-Dist: scipy>=1.10.0
|
|
13
|
+
Requires-Dist: docker>=6.0.0
|
|
14
|
+
Requires-Dist: httpx>=0.25.0
|
|
15
|
+
Requires-Dist: typer>=0.9.0
|
|
16
|
+
Requires-Dist: rich>=13.0.0
|
|
17
|
+
Provides-Extra: mesh
|
|
18
|
+
Requires-Dist: zeroconf>=0.131.0; extra == "mesh"
|
|
19
|
+
Provides-Extra: dev
|
|
20
|
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
|
21
|
+
Requires-Dist: pytest-asyncio>=0.23.0; extra == "dev"
|
|
22
|
+
Requires-Dist: pytest-cov>=5.0.0; extra == "dev"
|
|
23
|
+
Requires-Dist: respx>=0.21.0; extra == "dev"
|
|
24
|
+
Requires-Dist: ruff>=0.3.0; extra == "dev"
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
[build-system]
requires = ["setuptools>=68.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "radix-edge"
version = "2.0.0"
description = "Radix Edge Agent — GPU orchestration with information-theoretic scheduling"
requires-python = ">=3.10"
dependencies = [
    "fastapi>=0.109.0",
    "uvicorn[standard]>=0.27.0",
    "pydantic>=2.5.0",
    "pydantic-settings>=2.1.0",
    "prometheus-client>=0.19.0",
    "numpy>=1.24.0",
    "scipy>=1.10.0",
    "docker>=6.0.0",
    "httpx>=0.25.0",
    "typer>=0.9.0",
    "rich>=13.0.0",
]

[project.optional-dependencies]
mesh = ["zeroconf>=0.131.0"]
dev = [
    "pytest>=8.0.0",
    "pytest-asyncio>=0.23.0",
    "pytest-cov>=5.0.0",
    "respx>=0.21.0",
    "ruff>=0.3.0",
]

[project.scripts]
radix-edge = "radix_edge.cli.main:app"

[tool.setuptools.packages.find]
include = ["radix_edge*"]

[tool.pytest.ini_options]
testpaths = ["tests"]
asyncio_mode = "auto"
markers = ["gpu: GPU integration tests (require --run-gpu flag or RADIX_GPU_TESTS=1)"]

[tool.ruff]
line-length = 120
# FIX: was "py311", inconsistent with requires-python = ">=3.10" above —
# ruff could have suggested 3.11-only syntax for a package that must run
# on Python 3.10.
target-version = "py310"
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
"""Entry point for running the edge agent server."""

import uvicorn

from radix_edge.config import get_config


def main():
    """Launch the agent's API server with uvicorn, configured from get_config()."""
    cfg = get_config()
    server_opts = {
        "factory": True,
        "host": cfg.host,
        "port": cfg.port,
        "log_level": cfg.log_level.lower(),
    }
    uvicorn.run("radix_edge.api.app:create_app", **server_opts)


if __name__ == "__main__":
    main()
|
|
File without changes
|
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
"""FastAPI application factory for the Radix Edge Agent."""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import logging
|
|
5
|
+
from contextlib import asynccontextmanager
|
|
6
|
+
|
|
7
|
+
from fastapi import Depends, FastAPI
|
|
8
|
+
|
|
9
|
+
from radix_edge.config import get_config
|
|
10
|
+
from radix_edge.db import init_db, close_db
|
|
11
|
+
from radix_edge.api.routes_health import router as health_router
|
|
12
|
+
from radix_edge.api.routes_gpus import router as gpus_router
|
|
13
|
+
from radix_edge.api.routes_jobs import router as jobs_router
|
|
14
|
+
from radix_edge.api.routes_admin import router as admin_router
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger("radix_edge")
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan — startup and shutdown.

    Startup order: logging → database init → state recovery → observer /
    scheduler / FEP / auto-tuner loops → optional upstream control-plane
    loops → optional mesh stack. Shutdown cancels every task started here,
    then closes the upstream client, mesh peer clients, discovery, and DB.
    """
    config = get_config()
    logging.basicConfig(
        level=getattr(logging, config.log_level.upper(), logging.INFO),
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    )

    logger.info("Radix Edge Agent starting")
    logger.info("Database: %s", config.db_path)

    # Initialize database
    init_db()
    logger.info("Database initialized (12 tables)")

    # Reconcile state after restart
    from radix_edge.recovery import reconcile_on_startup
    recovery_summary = reconcile_on_startup()
    logger.info("Recovery: %s", recovery_summary)

    # Start GPU observer background task
    from radix_edge.observer.gpu_observer import observer_loop
    observer_task = asyncio.create_task(observer_loop())
    logger.info("GPU observer started")

    # Start scheduler background task
    from radix_edge.scheduler.scheduler import scheduling_loop
    scheduler_task = asyncio.create_task(scheduling_loop())
    logger.info("Scheduler started")

    # Start FEP controller and brain auto-tuner
    from radix_edge.meta.fep_controller import fep_loop
    from radix_edge.meta.brain_autotuner import autotuner_loop
    fep_task = asyncio.create_task(fep_loop())
    autotuner_task = asyncio.create_task(autotuner_loop())
    logger.info("FEP controller and brain auto-tuner started")

    # Start upstream control plane communication (if configured)
    upstream_tasks = []
    upstream_client = None
    if config.upstream_enabled and config.api_base and config.cluster_id:
        from radix_edge.upstream.token_manager import resolve_token
        from radix_edge.upstream.client import UpstreamClient
        from radix_edge.upstream.heartbeat import heartbeat_loop
        from radix_edge.upstream.job_sync import job_sync_loop

        cluster_token = await resolve_token(
            api_base=config.api_base,
            cluster_id=config.cluster_id,
            tenant_id=config.tenant_id,
            cluster_token=config.cluster_token,
            one_time_token=config.one_time_token,
            token_file=config.token_file,
        )

        upstream_client = UpstreamClient(
            api_base=config.api_base,
            cluster_id=config.cluster_id,
            tenant_id=config.tenant_id,
            cluster_token=cluster_token,
        )

        upstream_tasks.append(asyncio.create_task(heartbeat_loop(upstream_client)))
        upstream_tasks.append(asyncio.create_task(job_sync_loop(upstream_client)))
        logger.info("Upstream control plane connected (cluster=%s)", config.cluster_id)
    else:
        logger.info("Upstream control plane disabled")

    # Start mesh networking (if configured)
    mesh_tasks = []
    mesh_peer_clients: dict = {}
    mesh_discovery_inst = None
    if config.mesh_enabled and config.mesh_key:
        import uuid

        # BUG FIX: get_db is used below but was never imported — the module
        # only imports init_db/close_db from radix_edge.db, so enabling mesh
        # without an explicit mesh_node_id raised NameError. Import it here
        # alongside the other mesh-only imports.
        from radix_edge.db import get_db
        from radix_edge.mesh.peer_registry import PeerRegistry
        from radix_edge.mesh.coordinator import MeshCoordinator
        from radix_edge.mesh.discovery import MeshDiscovery, ManualPeerList, ZEROCONF_AVAILABLE
        from radix_edge.mesh.sync import (
            mesh_heartbeat_loop,
            mesh_discovery_loop,
            election_monitor_loop,
            coordinator_loop,
        )

        # Generate or load node_id (persisted in the config table so the
        # identity survives restarts)
        node_id = config.mesh_node_id
        if not node_id:
            with get_db() as db_conn:
                row = db_conn.execute("SELECT value FROM config WHERE key = 'mesh_node_id'").fetchone()
                if row:
                    node_id = row["value"]
                else:
                    node_id = f"node-{uuid.uuid4().hex[:12]}"
                    db_conn.execute(
                        "INSERT INTO config (key, value) VALUES ('mesh_node_id', ?)",
                        (node_id,),
                    )

        # Create mesh objects
        mesh_registry = PeerRegistry(stale_threshold=config.mesh_stale_threshold)
        mesh_coordinator = MeshCoordinator(node_id, mesh_registry)

        # Discovery: prefer mDNS, fall back to manual
        if ZEROCONF_AVAILABLE and not config.mesh_peers:
            mesh_discovery_inst = MeshDiscovery(
                node_id=node_id,
                port=config.port,
                mesh_key=config.mesh_key,
            )
            await mesh_discovery_inst.start()
        else:
            mesh_discovery_inst = ManualPeerList(config.mesh_peers)

        # Store in app.state for route access
        app.state.mesh_registry = mesh_registry
        app.state.mesh_coordinator = mesh_coordinator
        app.state.mesh_key = config.mesh_key
        app.state.mesh_node_id = node_id

        # Include mesh router
        from radix_edge.mesh.routes_mesh import router as mesh_router
        app.include_router(mesh_router)

        # Start background tasks
        mesh_tasks.append(asyncio.create_task(
            mesh_heartbeat_loop(config, mesh_registry, mesh_peer_clients, node_id, mesh_coordinator)
        ))
        mesh_tasks.append(asyncio.create_task(
            mesh_discovery_loop(config, mesh_discovery_inst, mesh_registry, mesh_peer_clients, config.mesh_key)
        ))
        mesh_tasks.append(asyncio.create_task(
            election_monitor_loop(config, mesh_coordinator, mesh_registry, mesh_peer_clients)
        ))
        mesh_tasks.append(asyncio.create_task(
            coordinator_loop(config, mesh_coordinator, mesh_peer_clients)
        ))

        # Run initial election
        mesh_coordinator.elect()
        logger.info("Mesh networking started (node=%s, peers=%d)", node_id, mesh_registry.peer_count)
    else:
        logger.info("Mesh networking disabled")

    logger.info("Radix Edge Agent ready on %s:%d", config.host, config.port)

    yield

    # Shutdown: cancel every background task started above and await its exit
    logger.info("Radix Edge Agent shutting down")
    for task in (observer_task, scheduler_task, fep_task, autotuner_task, *upstream_tasks, *mesh_tasks):
        task.cancel()
        try:
            await task
        except asyncio.CancelledError:
            pass
    if upstream_client:
        await upstream_client.close()
    # Close mesh resources
    for pc in mesh_peer_clients.values():
        await pc.close()
    if mesh_discovery_inst and hasattr(mesh_discovery_inst, "stop"):
        await mesh_discovery_inst.stop()
    close_db()
    logger.info("Radix Edge Agent stopped")
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def create_app() -> FastAPI:
    """Create and configure the FastAPI application.

    Health endpoints are mounted without authentication; every other router
    is guarded by the Bearer-token dependency from radix_edge.api.auth.
    """
    from radix_edge.api.auth import require_auth

    app = FastAPI(
        title="Radix Edge Agent",
        description="GPU orchestration with information-theoretic scheduling",
        # FIX: was hard-coded "0.1.0", out of sync with the package version
        # (2.0.0 in pyproject.toml / PKG-INFO).
        version="2.0.0",
        lifespan=lifespan,
    )

    # Health endpoints — no auth required
    app.include_router(health_router)

    # All other endpoints — require Bearer token
    app.include_router(gpus_router, dependencies=[Depends(require_auth)])
    app.include_router(jobs_router, dependencies=[Depends(require_auth)])
    app.include_router(admin_router, dependencies=[Depends(require_auth)])

    return app
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""Bearer token authentication for the edge agent API."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import logging
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
from fastapi import Depends, HTTPException, Request, status
|
|
8
|
+
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
|
|
9
|
+
|
|
10
|
+
from radix_edge.config import get_config
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger("radix_edge.auth")
|
|
13
|
+
|
|
14
|
+
_bearer_scheme = HTTPBearer(auto_error=False)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
async def require_auth(
    request: Request,
    credentials: Optional[HTTPAuthorizationCredentials] = Depends(_bearer_scheme),
) -> str:
    """Validate Bearer token. Returns the token string on success.

    Raises 401 if:
    - No Authorization header provided
    - Token doesn't match configured value

    If no auth token is configured at all, access is allowed (local mode).
    """
    import secrets  # stdlib; for constant-time secret comparison

    config = get_config()

    if not config.auth_token:
        # No token configured — allow unauthenticated local access
        logger.debug("No auth token configured; allowing local access")
        return "local"

    if credentials is None:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Missing authorization header",
            headers={"WWW-Authenticate": "Bearer"},
        )

    # SECURITY FIX: compare with secrets.compare_digest instead of '!=' —
    # a plain string comparison short-circuits on the first differing byte
    # and leaks token prefixes through a timing side-channel.
    if not secrets.compare_digest(credentials.credentials, config.auth_token):
        logger.warning("Invalid auth token from %s", request.client.host if request.client else "unknown")
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Invalid authentication token",
            headers={"WWW-Authenticate": "Bearer"},
        )

    return credentials.credentials
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
"""Admin API endpoints for configuration and quota management."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
from fastapi import APIRouter
|
|
8
|
+
from pydantic import BaseModel, Field
|
|
9
|
+
|
|
10
|
+
from radix_edge.db import get_db
|
|
11
|
+
from radix_edge.enforcer.admission import get_user_quota, set_user_quota
|
|
12
|
+
from radix_edge.enforcer.fairness import get_user_usage, get_all_user_usage
|
|
13
|
+
from radix_edge.models import BrainParams
|
|
14
|
+
|
|
15
|
+
router = APIRouter(prefix="/v1/config", tags=["admin"])
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# --- Quota management ---
|
|
19
|
+
|
|
20
|
+
class QuotaUpdateRequest(BaseModel):
    """Quota update payload; all fields optional.

    NOTE(review): fields left as None are presumably treated as
    "leave unchanged" — confirm against set_user_quota's handling.
    """

    # Cap on simultaneously running GPU jobs for the user (1-100).
    max_concurrent_gpu_jobs: Optional[int] = Field(None, ge=1, le=100)
    # Cap on the total number of GPUs the user may hold (1-256).
    max_gpus_total: Optional[int] = Field(None, ge=1, le=256)
    # Relative fair-share scheduling weight (0.01-100.0).
    fair_share_weight: Optional[float] = Field(None, ge=0.01, le=100.0)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@router.get("/quotas/{user_id}")
async def get_quota(user_id: str):
    """Return the quota settings stored for *user_id*."""
    quota = get_user_quota(user_id)
    return quota
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@router.put("/quotas/{user_id}")
async def update_quota(user_id: str, req: QuotaUpdateRequest):
    """Apply the supplied quota changes, then return the resulting quota."""
    changes = {
        "max_concurrent": req.max_concurrent_gpu_jobs,
        "max_gpus": req.max_gpus_total,
        "fair_share_weight": req.fair_share_weight,
    }
    set_user_quota(user_id, **changes)
    return get_user_quota(user_id)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@router.get("/users")
async def list_users():
    """Return every user that has a quota row, joined with 24h usage."""
    with get_db() as db:
        quota_rows = db.execute("SELECT * FROM user_quotas ORDER BY user_id").fetchall()
        usage_rows = get_all_user_usage(hours=24, db_conn=db)

    by_user = {entry["user_id"]: entry for entry in usage_rows}
    no_usage = {"gpu_seconds": 0, "job_count": 0}

    def _summary(row):
        # One API record per quota row, merged with that user's usage
        # (zeros when the user had no jobs in the window).
        used = by_user.get(row["user_id"], no_usage)
        return {
            "user_id": row["user_id"],
            "max_concurrent_gpu_jobs": row["max_concurrent_gpu_jobs"],
            "max_gpus_total": row["max_gpus_total"],
            "fair_share_weight": row["fair_share_weight"],
            "gpu_seconds_24h": used["gpu_seconds"],
            "job_count_24h": used["job_count"],
        }

    return {"users": [_summary(row) for row in quota_rows]}
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
# --- Brain params ---
|
|
68
|
+
|
|
69
|
+
@router.get("/brain")
async def get_brain_params():
    """Return the current brain scheduling weights (empty dict if unset)."""
    with get_db() as db:
        row = db.execute("SELECT * FROM brain_params WHERE key = 'current'").fetchone()
    return BrainParams.from_row(row).to_dict() if row else {}
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
@router.get("/usage/{user_id}")
async def get_usage(user_id: str, hours: int = 24):
    """Report a user's resource consumption over the trailing *hours* window."""
    usage = get_user_usage(user_id, hours=hours)
    return usage
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""GPU state API endpoints."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from fastapi import APIRouter, HTTPException
|
|
6
|
+
|
|
7
|
+
from radix_edge.db import get_db
|
|
8
|
+
from radix_edge.models import GPU
|
|
9
|
+
|
|
10
|
+
router = APIRouter(prefix="/v1/gpus", tags=["gpus"])
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@router.get("")
async def list_gpus():
    """Return every detected GPU together with its latest observed state."""
    with get_db() as db:
        rows = db.execute("SELECT * FROM gpus ORDER BY gpu_index").fetchall()
    payload = [GPU.from_row(row).to_dict() for row in rows]
    return {"gpus": payload}
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@router.get("/{gpu_index}")
async def get_gpu(gpu_index: int):
    """Return one GPU's state, or 404 when the index is unknown."""
    with get_db() as db:
        row = db.execute("SELECT * FROM gpus WHERE gpu_index = ?", (gpu_index,)).fetchone()
    if not row:
        raise HTTPException(status_code=404, detail=f"GPU {gpu_index} not found")
    return GPU.from_row(row).to_dict()
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
"""Health, readiness, and metrics endpoints."""
|
|
2
|
+
|
|
3
|
+
from fastapi import APIRouter
|
|
4
|
+
from fastapi.responses import PlainTextResponse
|
|
5
|
+
|
|
6
|
+
from prometheus_client import (
|
|
7
|
+
CollectorRegistry, Gauge, Counter, generate_latest,
|
|
8
|
+
)
|
|
9
|
+
|
|
10
|
+
from radix_edge.db import get_db
|
|
11
|
+
|
|
12
|
+
router = APIRouter()
|
|
13
|
+
|
|
14
|
+
# --- Prometheus metrics ---
# All collectors are registered on a private CollectorRegistry (not the
# process-global default) so /metrics exposes only what this module defines.

registry = CollectorRegistry()

# GPU metrics — one time series per GPU, labeled by index.
gpu_utilization = Gauge("radix_gpu_utilization_pct", "GPU utilization %", ["gpu_index"], registry=registry)
gpu_memory_used = Gauge("radix_gpu_memory_used_mb", "GPU memory used (MB)", ["gpu_index"], registry=registry)
gpu_temperature = Gauge("radix_gpu_temperature_celsius", "GPU temperature (C)", ["gpu_index"], registry=registry)
gpu_power_draw = Gauge("radix_gpu_power_draw_watts", "GPU power draw (W)", ["gpu_index"], registry=registry)

# Job metrics — counts mirrored from the jobs table on each scrape.
jobs_total = Gauge("radix_jobs_total", "Total jobs by status", ["status"], registry=registry)
job_queue_depth = Gauge("radix_job_queue_depth", "Number of queued jobs", registry=registry)

# Scheduler metrics
scheduler_decisions = Counter("radix_scheduler_decisions_total", "Scheduler decisions made", registry=registry)
scheduler_exploration_ratio = Gauge("radix_scheduler_exploration_ratio", "Current exploration ratio", registry=registry)

# FEP metrics — taken from the most recent fep_decisions row.
fep_free_energy = Gauge("radix_fep_free_energy", "Current variational free energy", registry=registry)
fep_latent_state = Gauge("radix_fep_latent_state_prob", "Latent state probability", ["state"], registry=registry)

# Brain weights — one gauge per scoring dimension.
brain_weight = Gauge("radix_brain_weight", "Brain scheduling weight", ["dimension"], registry=registry)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _update_metrics():
    """Refresh every Prometheus collector from the current database state."""
    try:
        with get_db() as db:
            # Per-GPU gauges
            for g in db.execute("SELECT * FROM gpus").fetchall():
                label = str(g["gpu_index"])
                gpu_utilization.labels(gpu_index=label).set(g["utilization_pct"])
                gpu_memory_used.labels(gpu_index=label).set(g["memory_used_mb"])
                gpu_temperature.labels(gpu_index=label).set(g["temperature_c"])
                gpu_power_draw.labels(gpu_index=label).set(g["power_draw_w"])

            # Job counts per status
            for row in db.execute(
                "SELECT status, COUNT(*) as c FROM jobs GROUP BY status"
            ).fetchall():
                jobs_total.labels(status=row["status"]).set(row["c"])

            queued = db.execute("SELECT COUNT(*) as c FROM jobs WHERE status = 'queued'").fetchone()
            job_queue_depth.set(queued["c"])

            # Brain weights
            brain = db.execute("SELECT * FROM brain_params WHERE key = 'current'").fetchone()
            if brain:
                for dim in ("queue_depth", "gpu_saturation", "node_pressure"):
                    brain_weight.labels(dimension=dim).set(brain["w_" + dim])

            # Most recent FEP decision
            fep = db.execute(
                "SELECT * FROM fep_decisions ORDER BY decided_at DESC LIMIT 1"
            ).fetchone()
            if fep:
                fep_free_energy.set(fep["free_energy"])
                import json
                raw = fep["state_probs"]
                for state, prob in (json.loads(raw) if raw else {}).items():
                    fep_latent_state.labels(state=state).set(prob)

    except Exception:
        pass  # Metrics endpoint should never fail

    # Scheduler model metrics
    try:
        from radix_edge.scheduler.scheduler import get_model
        snapshot = get_model().get_metrics()
        # Counter can only go up, so use a gauge for current total
        scheduler_exploration_ratio.set(snapshot.get("exploration_ratio", 0))
    except Exception:
        pass
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
# --- Endpoints ---
|
|
96
|
+
|
|
97
|
+
@router.get("/healthz")
async def health_check():
    """Liveness probe — reports healthy whenever the process is serving."""
    payload = {"status": "healthy"}
    return payload
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
@router.get("/readyz")
async def readiness_check():
    """Readiness probe — verifies database is accessible.

    Returns 200 {"status": "ready"} when a trivial query succeeds, or a
    503 response with the error message when the database is unreachable.
    """
    try:
        with get_db() as db:
            db.execute("SELECT 1")
        return {"status": "ready"}
    except Exception as e:
        # BUG FIX: the original `return {...}, 503` does NOT set the HTTP
        # status in FastAPI — the (dict, int) tuple is serialized as the
        # body of a 200 response, so orchestrators never saw the failure.
        # An explicit JSONResponse carries the real 503.
        from fastapi.responses import JSONResponse
        return JSONResponse(
            status_code=503,
            content={"status": "not_ready", "error": str(e)},
        )
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
@router.get("/metrics")
async def metrics():
    """Render all collectors in the Prometheus text exposition format."""
    _update_metrics()
    body = generate_latest(registry).decode("utf-8")
    return PlainTextResponse(
        body,
        media_type="text/plain; version=0.0.4; charset=utf-8",
    )
|