radix-edge 2.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. radix_edge-2.0.0/PKG-INFO +24 -0
  2. radix_edge-2.0.0/pyproject.toml +47 -0
  3. radix_edge-2.0.0/radix_edge/__init__.py +3 -0
  4. radix_edge-2.0.0/radix_edge/__main__.py +20 -0
  5. radix_edge-2.0.0/radix_edge/api/__init__.py +0 -0
  6. radix_edge-2.0.0/radix_edge/api/app.py +205 -0
  7. radix_edge-2.0.0/radix_edge/api/auth.py +50 -0
  8. radix_edge-2.0.0/radix_edge/api/routes_admin.py +82 -0
  9. radix_edge-2.0.0/radix_edge/api/routes_gpus.py +28 -0
  10. radix_edge-2.0.0/radix_edge/api/routes_health.py +121 -0
  11. radix_edge-2.0.0/radix_edge/api/routes_jobs.py +120 -0
  12. radix_edge-2.0.0/radix_edge/api/schemas.py +30 -0
  13. radix_edge-2.0.0/radix_edge/cli/__init__.py +0 -0
  14. radix_edge-2.0.0/radix_edge/cli/main.py +227 -0
  15. radix_edge-2.0.0/radix_edge/config.py +133 -0
  16. radix_edge-2.0.0/radix_edge/db.py +232 -0
  17. radix_edge-2.0.0/radix_edge/enforcer/__init__.py +0 -0
  18. radix_edge-2.0.0/radix_edge/enforcer/admission.py +107 -0
  19. radix_edge-2.0.0/radix_edge/enforcer/fairness.py +100 -0
  20. radix_edge-2.0.0/radix_edge/enforcer/rate_limiter.py +46 -0
  21. radix_edge-2.0.0/radix_edge/executor/__init__.py +0 -0
  22. radix_edge-2.0.0/radix_edge/executor/executor.py +249 -0
  23. radix_edge-2.0.0/radix_edge/executor/gpu_allocator.py +126 -0
  24. radix_edge-2.0.0/radix_edge/mesh/__init__.py +1 -0
  25. radix_edge-2.0.0/radix_edge/mesh/coordinator.py +134 -0
  26. radix_edge-2.0.0/radix_edge/mesh/discovery.py +174 -0
  27. radix_edge-2.0.0/radix_edge/mesh/peer_client.py +127 -0
  28. radix_edge-2.0.0/radix_edge/mesh/peer_registry.py +98 -0
  29. radix_edge-2.0.0/radix_edge/mesh/routes_mesh.py +216 -0
  30. radix_edge-2.0.0/radix_edge/mesh/security.py +53 -0
  31. radix_edge-2.0.0/radix_edge/mesh/sync.py +196 -0
  32. radix_edge-2.0.0/radix_edge/meta/__init__.py +0 -0
  33. radix_edge-2.0.0/radix_edge/meta/brain_autotuner.py +286 -0
  34. radix_edge-2.0.0/radix_edge/meta/fep_controller.py +224 -0
  35. radix_edge-2.0.0/radix_edge/meta/generative_model.py +76 -0
  36. radix_edge-2.0.0/radix_edge/models.py +140 -0
  37. radix_edge-2.0.0/radix_edge/observer/__init__.py +0 -0
  38. radix_edge-2.0.0/radix_edge/observer/gpu_observer.py +256 -0
  39. radix_edge-2.0.0/radix_edge/observer/gpu_topology.py +101 -0
  40. radix_edge-2.0.0/radix_edge/recovery.py +130 -0
  41. radix_edge-2.0.0/radix_edge/scheduler/__init__.py +0 -0
  42. radix_edge-2.0.0/radix_edge/scheduler/policies.py +90 -0
  43. radix_edge-2.0.0/radix_edge/scheduler/scheduler.py +161 -0
  44. radix_edge-2.0.0/radix_edge/scheduler/scoring_model.py +257 -0
  45. radix_edge-2.0.0/radix_edge/scheduler/sinkhorn.py +77 -0
  46. radix_edge-2.0.0/radix_edge/upstream/__init__.py +0 -0
  47. radix_edge-2.0.0/radix_edge/upstream/client.py +84 -0
  48. radix_edge-2.0.0/radix_edge/upstream/heartbeat.py +62 -0
  49. radix_edge-2.0.0/radix_edge/upstream/job_sync.py +156 -0
  50. radix_edge-2.0.0/radix_edge/upstream/token_manager.py +84 -0
  51. radix_edge-2.0.0/radix_edge/upstream/translate.py +42 -0
  52. radix_edge-2.0.0/radix_edge.egg-info/PKG-INFO +24 -0
  53. radix_edge-2.0.0/radix_edge.egg-info/SOURCES.txt +75 -0
  54. radix_edge-2.0.0/radix_edge.egg-info/dependency_links.txt +1 -0
  55. radix_edge-2.0.0/radix_edge.egg-info/entry_points.txt +2 -0
  56. radix_edge-2.0.0/radix_edge.egg-info/requires.txt +21 -0
  57. radix_edge-2.0.0/radix_edge.egg-info/top_level.txt +1 -0
  58. radix_edge-2.0.0/setup.cfg +4 -0
  59. radix_edge-2.0.0/tests/test_brain_autotuner.py +141 -0
  60. radix_edge-2.0.0/tests/test_enforcer.py +214 -0
  61. radix_edge-2.0.0/tests/test_executor.py +297 -0
  62. radix_edge-2.0.0/tests/test_fep_controller.py +196 -0
  63. radix_edge-2.0.0/tests/test_gpu_integration.py +124 -0
  64. radix_edge-2.0.0/tests/test_mesh_coordinator.py +178 -0
  65. radix_edge-2.0.0/tests/test_mesh_discovery.py +158 -0
  66. radix_edge-2.0.0/tests/test_mesh_registry.py +125 -0
  67. radix_edge-2.0.0/tests/test_mesh_routes.py +210 -0
  68. radix_edge-2.0.0/tests/test_mesh_scheduling.py +174 -0
  69. radix_edge-2.0.0/tests/test_mesh_security.py +86 -0
  70. radix_edge-2.0.0/tests/test_mesh_sync.py +190 -0
  71. radix_edge-2.0.0/tests/test_observer.py +337 -0
  72. radix_edge-2.0.0/tests/test_scoring_model.py +255 -0
  73. radix_edge-2.0.0/tests/test_security.py +290 -0
  74. radix_edge-2.0.0/tests/test_skeleton.py +130 -0
  75. radix_edge-2.0.0/tests/test_upstream_client.py +101 -0
  76. radix_edge-2.0.0/tests/test_upstream_job_sync.py +225 -0
  77. radix_edge-2.0.0/tests/test_upstream_token.py +101 -0
@@ -0,0 +1,24 @@
1
+ Metadata-Version: 2.4
2
+ Name: radix-edge
3
+ Version: 2.0.0
4
+ Summary: Radix Edge Agent — GPU orchestration with information-theoretic scheduling
5
+ Requires-Python: >=3.10
6
+ Requires-Dist: fastapi>=0.109.0
7
+ Requires-Dist: uvicorn[standard]>=0.27.0
8
+ Requires-Dist: pydantic>=2.5.0
9
+ Requires-Dist: pydantic-settings>=2.1.0
10
+ Requires-Dist: prometheus-client>=0.19.0
11
+ Requires-Dist: numpy>=1.24.0
12
+ Requires-Dist: scipy>=1.10.0
13
+ Requires-Dist: docker>=6.0.0
14
+ Requires-Dist: httpx>=0.25.0
15
+ Requires-Dist: typer>=0.9.0
16
+ Requires-Dist: rich>=13.0.0
17
+ Provides-Extra: mesh
18
+ Requires-Dist: zeroconf>=0.131.0; extra == "mesh"
19
+ Provides-Extra: dev
20
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
21
+ Requires-Dist: pytest-asyncio>=0.23.0; extra == "dev"
22
+ Requires-Dist: pytest-cov>=5.0.0; extra == "dev"
23
+ Requires-Dist: respx>=0.21.0; extra == "dev"
24
+ Requires-Dist: ruff>=0.3.0; extra == "dev"
@@ -0,0 +1,47 @@
1
[build-system]
requires = ["setuptools>=68.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "radix-edge"
version = "2.0.0"
description = "Radix Edge Agent — GPU orchestration with information-theoretic scheduling"
requires-python = ">=3.10"
dependencies = [
    "fastapi>=0.109.0",
    "uvicorn[standard]>=0.27.0",
    "pydantic>=2.5.0",
    "pydantic-settings>=2.1.0",
    "prometheus-client>=0.19.0",
    "numpy>=1.24.0",
    "scipy>=1.10.0",
    "docker>=6.0.0",
    "httpx>=0.25.0",
    "typer>=0.9.0",
    "rich>=13.0.0",
]

[project.optional-dependencies]
mesh = ["zeroconf>=0.131.0"]
dev = [
    "pytest>=8.0.0",
    "pytest-asyncio>=0.23.0",
    "pytest-cov>=5.0.0",
    "respx>=0.21.0",
    "ruff>=0.3.0",
]

[project.scripts]
radix-edge = "radix_edge.cli.main:app"

[tool.setuptools.packages.find]
include = ["radix_edge*"]

[tool.pytest.ini_options]
testpaths = ["tests"]
asyncio_mode = "auto"
markers = ["gpu: GPU integration tests (require --run-gpu flag or RADIX_GPU_TESTS=1)"]

[tool.ruff]
line-length = 120
# BUG FIX: was "py311", which contradicts requires-python = ">=3.10" above —
# ruff would silently allow 3.11-only syntax that fails on the oldest
# supported interpreter. Lint for the minimum supported version instead.
target-version = "py310"
@@ -0,0 +1,3 @@
1
+ """Radix Edge Agent — GPU orchestration with information-theoretic scheduling."""
2
+
3
+ __version__ = "2.0.0"
@@ -0,0 +1,20 @@
1
+ """Entry point for running the edge agent server."""
2
+
3
+ import uvicorn
4
+
5
+ from radix_edge.config import get_config
6
+
7
+
8
+ def main():
9
+ config = get_config()
10
+ uvicorn.run(
11
+ "radix_edge.api.app:create_app",
12
+ factory=True,
13
+ host=config.host,
14
+ port=config.port,
15
+ log_level=config.log_level.lower(),
16
+ )
17
+
18
+
19
+ if __name__ == "__main__":
20
+ main()
File without changes
@@ -0,0 +1,205 @@
1
+ """FastAPI application factory for the Radix Edge Agent."""
2
+
3
+ import asyncio
4
+ import logging
5
+ from contextlib import asynccontextmanager
6
+
7
+ from fastapi import Depends, FastAPI
8
+
9
+ from radix_edge.config import get_config
10
+ from radix_edge.db import init_db, close_db
11
+ from radix_edge.api.routes_health import router as health_router
12
+ from radix_edge.api.routes_gpus import router as gpus_router
13
+ from radix_edge.api.routes_jobs import router as jobs_router
14
+ from radix_edge.api.routes_admin import router as admin_router
15
+
16
+ logger = logging.getLogger("radix_edge")
17
+
18
+
19
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan — startup and shutdown.

    Startup order: logging -> database -> crash recovery -> core background
    loops (GPU observer, scheduler, FEP controller, brain auto-tuner) ->
    optional upstream control-plane sync -> optional mesh networking.
    Shutdown cancels every spawned task, then closes clients and the DB.
    """
    config = get_config()
    logging.basicConfig(
        level=getattr(logging, config.log_level.upper(), logging.INFO),
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    )

    logger.info("Radix Edge Agent starting")
    logger.info("Database: %s", config.db_path)

    # Initialize database
    init_db()
    logger.info("Database initialized (12 tables)")

    # Reconcile persisted state after a restart (imported lazily to keep
    # module import side-effect free, matching the other subsystems below).
    from radix_edge.recovery import reconcile_on_startup
    recovery_summary = reconcile_on_startup()
    logger.info("Recovery: %s", recovery_summary)

    # Start GPU observer background task
    from radix_edge.observer.gpu_observer import observer_loop
    observer_task = asyncio.create_task(observer_loop())
    logger.info("GPU observer started")

    # Start scheduler background task
    from radix_edge.scheduler.scheduler import scheduling_loop
    scheduler_task = asyncio.create_task(scheduling_loop())
    logger.info("Scheduler started")

    # Start FEP controller and brain auto-tuner
    from radix_edge.meta.fep_controller import fep_loop
    from radix_edge.meta.brain_autotuner import autotuner_loop
    fep_task = asyncio.create_task(fep_loop())
    autotuner_task = asyncio.create_task(autotuner_loop())
    logger.info("FEP controller and brain auto-tuner started")

    # Start upstream control plane communication (if configured)
    upstream_tasks = []
    upstream_client = None
    if config.upstream_enabled and config.api_base and config.cluster_id:
        from radix_edge.upstream.token_manager import resolve_token
        from radix_edge.upstream.client import UpstreamClient
        from radix_edge.upstream.heartbeat import heartbeat_loop
        from radix_edge.upstream.job_sync import job_sync_loop

        cluster_token = await resolve_token(
            api_base=config.api_base,
            cluster_id=config.cluster_id,
            tenant_id=config.tenant_id,
            cluster_token=config.cluster_token,
            one_time_token=config.one_time_token,
            token_file=config.token_file,
        )

        upstream_client = UpstreamClient(
            api_base=config.api_base,
            cluster_id=config.cluster_id,
            tenant_id=config.tenant_id,
            cluster_token=cluster_token,
        )

        upstream_tasks.append(asyncio.create_task(heartbeat_loop(upstream_client)))
        upstream_tasks.append(asyncio.create_task(job_sync_loop(upstream_client)))
        logger.info("Upstream control plane connected (cluster=%s)", config.cluster_id)
    else:
        logger.info("Upstream control plane disabled")

    # Start mesh networking (if configured)
    mesh_tasks = []
    mesh_peer_clients: dict = {}
    mesh_discovery_inst = None
    if config.mesh_enabled and config.mesh_key:
        import uuid
        # BUG FIX: this module imports only init_db/close_db at the top, so
        # the node-id bootstrap below raised NameError on get_db whenever
        # mesh was enabled without a configured node id. Import it here,
        # alongside the other lazily-imported mesh dependencies.
        from radix_edge.db import get_db
        from radix_edge.mesh.peer_registry import PeerRegistry
        from radix_edge.mesh.coordinator import MeshCoordinator
        from radix_edge.mesh.discovery import MeshDiscovery, ManualPeerList, ZEROCONF_AVAILABLE
        from radix_edge.mesh.sync import (
            mesh_heartbeat_loop,
            mesh_discovery_loop,
            election_monitor_loop,
            coordinator_loop,
        )

        # Generate or load node_id (persisted in the config table so the
        # node keeps a stable identity across restarts)
        node_id = config.mesh_node_id
        if not node_id:
            with get_db() as db_conn:
                row = db_conn.execute("SELECT value FROM config WHERE key = 'mesh_node_id'").fetchone()
                if row:
                    node_id = row["value"]
                else:
                    node_id = f"node-{uuid.uuid4().hex[:12]}"
                    db_conn.execute(
                        "INSERT INTO config (key, value) VALUES ('mesh_node_id', ?)",
                        (node_id,),
                    )

        # Create mesh objects
        mesh_registry = PeerRegistry(stale_threshold=config.mesh_stale_threshold)
        mesh_coordinator = MeshCoordinator(node_id, mesh_registry)

        # Discovery: prefer mDNS, fall back to a static peer list
        if ZEROCONF_AVAILABLE and not config.mesh_peers:
            mesh_discovery_inst = MeshDiscovery(
                node_id=node_id,
                port=config.port,
                mesh_key=config.mesh_key,
            )
            await mesh_discovery_inst.start()
        else:
            mesh_discovery_inst = ManualPeerList(config.mesh_peers)

        # Store in app.state for route access
        app.state.mesh_registry = mesh_registry
        app.state.mesh_coordinator = mesh_coordinator
        app.state.mesh_key = config.mesh_key
        app.state.mesh_node_id = node_id

        # Include mesh router; this runs before the server starts accepting
        # requests, so adding routes here is safe.
        from radix_edge.mesh.routes_mesh import router as mesh_router
        app.include_router(mesh_router)

        # Start background tasks
        mesh_tasks.append(asyncio.create_task(
            mesh_heartbeat_loop(config, mesh_registry, mesh_peer_clients, node_id, mesh_coordinator)
        ))
        mesh_tasks.append(asyncio.create_task(
            mesh_discovery_loop(config, mesh_discovery_inst, mesh_registry, mesh_peer_clients, config.mesh_key)
        ))
        mesh_tasks.append(asyncio.create_task(
            election_monitor_loop(config, mesh_coordinator, mesh_registry, mesh_peer_clients)
        ))
        mesh_tasks.append(asyncio.create_task(
            coordinator_loop(config, mesh_coordinator, mesh_peer_clients)
        ))

        # Run initial election
        mesh_coordinator.elect()
        logger.info("Mesh networking started (node=%s, peers=%d)", node_id, mesh_registry.peer_count)
    else:
        logger.info("Mesh networking disabled")

    logger.info("Radix Edge Agent ready on %s:%d", config.host, config.port)

    yield

    # Shutdown: cancel every background task and wait for each to unwind.
    logger.info("Radix Edge Agent shutting down")
    for task in (observer_task, scheduler_task, fep_task, autotuner_task, *upstream_tasks, *mesh_tasks):
        task.cancel()
        try:
            await task
        except asyncio.CancelledError:
            pass
    if upstream_client:
        await upstream_client.close()
    # Close mesh resources
    for pc in mesh_peer_clients.values():
        await pc.close()
    if mesh_discovery_inst and hasattr(mesh_discovery_inst, "stop"):
        await mesh_discovery_inst.stop()
    close_db()
    logger.info("Radix Edge Agent stopped")
184
+
185
+
186
def create_app() -> FastAPI:
    """Create and configure the FastAPI application.

    Health endpoints are left unauthenticated (liveness/readiness probes);
    every other router is guarded by the Bearer-token dependency.
    """
    from radix_edge.api.auth import require_auth
    # BUG FIX: version was hard-coded to "0.1.0" while the package ships as
    # 2.0.0; derive it from the package so the API docs cannot drift again.
    from radix_edge import __version__

    app = FastAPI(
        title="Radix Edge Agent",
        description="GPU orchestration with information-theoretic scheduling",
        version=__version__,
        lifespan=lifespan,
    )

    # Health endpoints — no auth required
    app.include_router(health_router)

    # All other endpoints — require Bearer token
    app.include_router(gpus_router, dependencies=[Depends(require_auth)])
    app.include_router(jobs_router, dependencies=[Depends(require_auth)])
    app.include_router(admin_router, dependencies=[Depends(require_auth)])

    return app
@@ -0,0 +1,50 @@
1
+ """Bearer token authentication for the edge agent API."""
2
+ from __future__ import annotations
3
+
4
+ import logging
5
+ from typing import Optional
6
+
7
+ from fastapi import Depends, HTTPException, Request, status
8
+ from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
9
+
10
+ from radix_edge.config import get_config
11
+
12
+ logger = logging.getLogger("radix_edge.auth")
13
+
14
+ _bearer_scheme = HTTPBearer(auto_error=False)
15
+
16
+
17
async def require_auth(
    request: Request,
    credentials: Optional[HTTPAuthorizationCredentials] = Depends(_bearer_scheme),
) -> str:
    """Validate the Bearer token. Returns the accepted token string.

    If no auth token is configured, the agent runs open: every request is
    accepted and the placeholder string "local" is returned.
    (DOC FIX: the previous docstring claimed a 401 in this case, which
    contradicted the code.)

    Raises 401 when a token IS configured and:
    - no Authorization header was provided, or
    - the presented token does not match the configured value.
    """
    import hmac

    config = get_config()

    if not config.auth_token:
        # No token configured — allow unauthenticated local access
        logger.debug("No auth token configured; allowing local access")
        return "local"

    if credentials is None:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Missing authorization header",
            headers={"WWW-Authenticate": "Bearer"},
        )

    # SECURITY FIX: compare in constant time. A plain `!=` short-circuits on
    # the first differing byte, leaking token prefixes via response timing.
    if not hmac.compare_digest(credentials.credentials, config.auth_token):
        logger.warning("Invalid auth token from %s", request.client.host if request.client else "unknown")
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Invalid authentication token",
            headers={"WWW-Authenticate": "Bearer"},
        )

    return credentials.credentials
@@ -0,0 +1,82 @@
1
+ """Admin API endpoints for configuration and quota management."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Optional
6
+
7
+ from fastapi import APIRouter
8
+ from pydantic import BaseModel, Field
9
+
10
+ from radix_edge.db import get_db
11
+ from radix_edge.enforcer.admission import get_user_quota, set_user_quota
12
+ from radix_edge.enforcer.fairness import get_user_usage, get_all_user_usage
13
+ from radix_edge.models import BrainParams
14
+
15
+ router = APIRouter(prefix="/v1/config", tags=["admin"])
16
+
17
+
18
+ # --- Quota management ---
19
+
20
class QuotaUpdateRequest(BaseModel):
    """Request body for PUT /v1/config/quotas/{user_id}."""
    # All fields are optional; each is forwarded verbatim to set_user_quota.
    # Presumably None means "leave unchanged" — confirm against
    # radix_edge.enforcer.admission.set_user_quota.
    max_concurrent_gpu_jobs: Optional[int] = Field(None, ge=1, le=100)
    max_gpus_total: Optional[int] = Field(None, ge=1, le=256)
    fair_share_weight: Optional[float] = Field(None, ge=0.01, le=100.0)
24
+
25
+
26
+ @router.get("/quotas/{user_id}")
27
+ async def get_quota(user_id: str):
28
+ """Get quota settings for a user."""
29
+ return get_user_quota(user_id)
30
+
31
+
32
+ @router.put("/quotas/{user_id}")
33
+ async def update_quota(user_id: str, req: QuotaUpdateRequest):
34
+ """Update quota settings for a user."""
35
+ set_user_quota(
36
+ user_id,
37
+ max_concurrent=req.max_concurrent_gpu_jobs,
38
+ max_gpus=req.max_gpus_total,
39
+ fair_share_weight=req.fair_share_weight,
40
+ )
41
+ return get_user_quota(user_id)
42
+
43
+
44
+ @router.get("/users")
45
+ async def list_users():
46
+ """List all users with quotas and usage."""
47
+ with get_db() as db:
48
+ quotas = db.execute("SELECT * FROM user_quotas ORDER BY user_id").fetchall()
49
+ usage = get_all_user_usage(hours=24, db_conn=db)
50
+ usage_map = {u["user_id"]: u for u in usage}
51
+
52
+ result = []
53
+ for q in quotas:
54
+ u = usage_map.get(q["user_id"], {"gpu_seconds": 0, "job_count": 0})
55
+ result.append({
56
+ "user_id": q["user_id"],
57
+ "max_concurrent_gpu_jobs": q["max_concurrent_gpu_jobs"],
58
+ "max_gpus_total": q["max_gpus_total"],
59
+ "fair_share_weight": q["fair_share_weight"],
60
+ "gpu_seconds_24h": u["gpu_seconds"],
61
+ "job_count_24h": u["job_count"],
62
+ })
63
+
64
+ return {"users": result}
65
+
66
+
67
+ # --- Brain params ---
68
+
69
+ @router.get("/brain")
70
+ async def get_brain_params():
71
+ """Get current brain scheduling weights."""
72
+ with get_db() as db:
73
+ row = db.execute("SELECT * FROM brain_params WHERE key = 'current'").fetchone()
74
+ if not row:
75
+ return {}
76
+ return BrainParams.from_row(row).to_dict()
77
+
78
+
79
+ @router.get("/usage/{user_id}")
80
+ async def get_usage(user_id: str, hours: int = 24):
81
+ """Get resource usage for a user."""
82
+ return get_user_usage(user_id, hours=hours)
@@ -0,0 +1,28 @@
1
+ """GPU state API endpoints."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from fastapi import APIRouter, HTTPException
6
+
7
+ from radix_edge.db import get_db
8
+ from radix_edge.models import GPU
9
+
10
+ router = APIRouter(prefix="/v1/gpus", tags=["gpus"])
11
+
12
+
13
+ @router.get("")
14
+ async def list_gpus():
15
+ """List all detected GPUs with live state."""
16
+ with get_db() as db:
17
+ rows = db.execute("SELECT * FROM gpus ORDER BY gpu_index").fetchall()
18
+ return {"gpus": [GPU.from_row(r).to_dict() for r in rows]}
19
+
20
+
21
+ @router.get("/{gpu_index}")
22
+ async def get_gpu(gpu_index: int):
23
+ """Get a single GPU's state."""
24
+ with get_db() as db:
25
+ row = db.execute("SELECT * FROM gpus WHERE gpu_index = ?", (gpu_index,)).fetchone()
26
+ if not row:
27
+ raise HTTPException(status_code=404, detail=f"GPU {gpu_index} not found")
28
+ return GPU.from_row(row).to_dict()
@@ -0,0 +1,121 @@
1
+ """Health, readiness, and metrics endpoints."""
2
+
3
+ from fastapi import APIRouter
4
+ from fastapi.responses import PlainTextResponse
5
+
6
+ from prometheus_client import (
7
+ CollectorRegistry, Gauge, Counter, generate_latest,
8
+ )
9
+
10
+ from radix_edge.db import get_db
11
+
12
router = APIRouter()

# --- Prometheus metrics ---

# A dedicated registry (rather than prometheus_client's default global one)
# keeps these metrics isolated; only this registry is rendered by /metrics.
registry = CollectorRegistry()

# GPU metrics — one labelled sample per detected GPU, refreshed from the
# gpus table on each scrape (see _update_metrics).
gpu_utilization = Gauge("radix_gpu_utilization_pct", "GPU utilization %", ["gpu_index"], registry=registry)
gpu_memory_used = Gauge("radix_gpu_memory_used_mb", "GPU memory used (MB)", ["gpu_index"], registry=registry)
gpu_temperature = Gauge("radix_gpu_temperature_celsius", "GPU temperature (C)", ["gpu_index"], registry=registry)
gpu_power_draw = Gauge("radix_gpu_power_draw_watts", "GPU power draw (W)", ["gpu_index"], registry=registry)

# Job metrics — counts derived from the jobs table per scrape.
jobs_total = Gauge("radix_jobs_total", "Total jobs by status", ["status"], registry=registry)
job_queue_depth = Gauge("radix_job_queue_depth", "Number of queued jobs", registry=registry)

# Scheduler metrics
# NOTE(review): scheduler_decisions is never incremented anywhere in this
# module — confirm it is updated by the scheduler, or it will always read 0.
scheduler_decisions = Counter("radix_scheduler_decisions_total", "Scheduler decisions made", registry=registry)
scheduler_exploration_ratio = Gauge("radix_scheduler_exploration_ratio", "Current exploration ratio", registry=registry)

# FEP metrics — taken from the latest row of fep_decisions on each scrape.
fep_free_energy = Gauge("radix_fep_free_energy", "Current variational free energy", registry=registry)
fep_latent_state = Gauge("radix_fep_latent_state_prob", "Latent state probability", ["state"], registry=registry)

# Brain weights — the three scheduling weights from brain_params.
brain_weight = Gauge("radix_brain_weight", "Brain scheduling weight", ["dimension"], registry=registry)
38
+
39
+
40
def _update_metrics() -> None:
    """Update all Prometheus metrics from current database state.

    Best-effort by design: every failure is swallowed so a scrape can never
    500 — at worst the gauges keep their previous (stale) values.
    """
    try:
        with get_db() as db:
            # GPU metrics — one labelled sample per row in the gpus table.
            gpu_rows = db.execute("SELECT * FROM gpus").fetchall()
            for g in gpu_rows:
                idx = str(g["gpu_index"])  # label values must be strings
                gpu_utilization.labels(gpu_index=idx).set(g["utilization_pct"])
                gpu_memory_used.labels(gpu_index=idx).set(g["memory_used_mb"])
                gpu_temperature.labels(gpu_index=idx).set(g["temperature_c"])
                gpu_power_draw.labels(gpu_index=idx).set(g["power_draw_w"])

            # Job metrics — per-status totals plus the queued backlog.
            status_counts = db.execute(
                "SELECT status, COUNT(*) as c FROM jobs GROUP BY status"
            ).fetchall()
            for r in status_counts:
                jobs_total.labels(status=r["status"]).set(r["c"])

            queued = db.execute("SELECT COUNT(*) as c FROM jobs WHERE status = 'queued'").fetchone()
            job_queue_depth.set(queued["c"])

            # Brain weights — the 'current' row of brain_params, if present.
            brain = db.execute("SELECT * FROM brain_params WHERE key = 'current'").fetchone()
            if brain:
                brain_weight.labels(dimension="queue_depth").set(brain["w_queue_depth"])
                brain_weight.labels(dimension="gpu_saturation").set(brain["w_gpu_saturation"])
                brain_weight.labels(dimension="node_pressure").set(brain["w_node_pressure"])

            # FEP latest decision — most recent row wins.
            fep = db.execute(
                "SELECT * FROM fep_decisions ORDER BY decided_at DESC LIMIT 1"
            ).fetchone()
            if fep:
                fep_free_energy.set(fep["free_energy"])
                import json
                # state_probs is stored as a JSON object mapping state -> probability.
                probs = json.loads(fep["state_probs"]) if fep["state_probs"] else {}
                for state, prob in probs.items():
                    fep_latent_state.labels(state=state).set(prob)

    except Exception:
        pass  # Metrics endpoint should never fail

    # Scheduler model metrics — separate try block so a scheduler import
    # failure cannot mask the DB-backed metrics above.
    try:
        from radix_edge.scheduler.scheduler import get_model
        model = get_model()
        metrics = model.get_metrics()
        # Counter can only go up, so use a gauge for current total
        scheduler_exploration_ratio.set(metrics.get("exploration_ratio", 0))
    except Exception:
        pass
93
+
94
+
95
+ # --- Endpoints ---
96
+
97
+ @router.get("/healthz")
98
+ async def health_check():
99
+ """Liveness probe."""
100
+ return {"status": "healthy"}
101
+
102
+
103
+ @router.get("/readyz")
104
+ async def readiness_check():
105
+ """Readiness probe — verifies database is accessible."""
106
+ try:
107
+ with get_db() as db:
108
+ db.execute("SELECT 1")
109
+ return {"status": "ready"}
110
+ except Exception as e:
111
+ return {"status": "not_ready", "error": str(e)}, 503
112
+
113
+
114
+ @router.get("/metrics")
115
+ async def metrics():
116
+ """Prometheus metrics endpoint."""
117
+ _update_metrics()
118
+ return PlainTextResponse(
119
+ generate_latest(registry).decode("utf-8"),
120
+ media_type="text/plain; version=0.0.4; charset=utf-8",
121
+ )