cortex-engine 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cortex_engine/__init__.py +12 -0
- cortex_engine/cli.py +51 -0
- cortex_engine/config.py +35 -0
- cortex_engine/dependencies.py +41 -0
- cortex_engine/main.py +180 -0
- cortex_engine/models/__init__.py +1 -0
- cortex_engine/models/schemas.py +158 -0
- cortex_engine/py.typed +1 -0
- cortex_engine/routers/__init__.py +1 -0
- cortex_engine/routers/api.py +86 -0
- cortex_engine/routers/inference.py +38 -0
- cortex_engine/services/__init__.py +1 -0
- cortex_engine/services/cache_manager.py +111 -0
- cortex_engine/services/evaluator.py +103 -0
- cortex_engine/services/feedback.py +57 -0
- cortex_engine/services/orchestrator.py +138 -0
- cortex_engine/services/registry.py +186 -0
- cortex_engine/services/router.py +214 -0
- cortex_engine/services/scheduler.py +111 -0
- cortex_engine-0.1.0.dist-info/METADATA +352 -0
- cortex_engine-0.1.0.dist-info/RECORD +23 -0
- cortex_engine-0.1.0.dist-info/WHEEL +4 -0
- cortex_engine-0.1.0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Cortex-Engine — AI Operating System for Coding.
|
|
3
|
+
|
|
4
|
+
Routes queries across 30–50+ language models, manages GPU scheduling,
|
|
5
|
+
LRU caching, output evaluation, and continuous feedback-driven routing.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
__version__ = "0.1.0"
|
|
9
|
+
__author__ = "Cortex-Engine Contributors"
|
|
10
|
+
__license__ = "MIT"
|
|
11
|
+
|
|
12
|
+
from cortex_engine.main import app # noqa: F401 — expose ASGI app at top level
|
cortex_engine/cli.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Cortex-Engine — CLI entry point.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
cortex-engine # start the API server (default)
|
|
6
|
+
cortex-engine serve # same as above
|
|
7
|
+
cortex-engine --help
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import argparse
|
|
13
|
+
import sys
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def main() -> None:
    """Parse command-line arguments and launch the Cortex-Engine API server."""
    parser = argparse.ArgumentParser(
        prog="cortex-engine",
        description="Cortex-Engine — AI Operating System for Coding.",
    )
    parser.add_argument(
        "command",
        nargs="?",
        default="serve",
        choices=["serve"],
        help="Command to run (default: serve)",
    )
    parser.add_argument("--host", default="0.0.0.0", help="Bind host (default: 0.0.0.0)")
    parser.add_argument("--port", type=int, default=8000, help="Bind port (default: 8000)")
    parser.add_argument("--workers", type=int, default=1, help="Worker count (default: 1)")
    parser.add_argument("--reload", action="store_true", help="Enable auto-reload (dev mode)")
    args = parser.parse_args()

    # Guard clause: argparse already restricts `command` to "serve", so any
    # other value is unreachable today; the check documents the intent.
    if args.command != "serve":
        return

    # uvicorn is imported lazily so `--help` works without it installed.
    try:
        import uvicorn
    except ImportError:
        print("ERROR: uvicorn is required. Run: uv add uvicorn", file=sys.stderr)
        sys.exit(1)

    uvicorn.run(
        "cortex_engine.main:app",
        host=args.host,
        port=args.port,
        workers=args.workers,
        reload=args.reload,
    )
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
if __name__ == "__main__":
|
|
51
|
+
main()
|
cortex_engine/config.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Cortex-Engine — Configuration
|
|
3
|
+
Reads from environment variables with sensible defaults.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
from typing import List
|
|
8
|
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class Settings(BaseSettings):
    """Runtime configuration with sensible defaults.

    Values are read from environment variables (and a local ``.env`` file),
    which override the defaults declared below — standard pydantic-settings
    behaviour via ``SettingsConfigDict``.
    """

    model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8")

    # Redis
    redis_url: str = "redis://localhost:6379/0"
    redis_max_connections: int = 50  # cap for the shared async connection pool

    # Server
    host: str = "0.0.0.0"
    port: int = 8000
    workers: int = 1
    reload: bool = False  # dev-mode auto-reload

    # CORS
    # NOTE(review): wide-open default ("*") — tighten for production deployments.
    allowed_origins: List[str] = ["*"]

    # Cache
    cache_ttl_seconds: int = 3600  # 1 hour default TTL for cached responses

    # Feature flags
    enable_evaluation: bool = True
    enable_feedback: bool = True
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
settings = Settings()
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Cortex-Engine — Dependency Injection
|
|
3
|
+
All FastAPI Depends() providers live here.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from functools import lru_cache
|
|
9
|
+
from typing import AsyncGenerator
|
|
10
|
+
|
|
11
|
+
import redis.asyncio as redis
|
|
12
|
+
from fastapi import Request
|
|
13
|
+
|
|
14
|
+
from cortex_engine.services.cache_manager import CacheManager
|
|
15
|
+
from cortex_engine.services.evaluator import EvaluationEngine
|
|
16
|
+
from cortex_engine.services.feedback import FeedbackSystem
|
|
17
|
+
from cortex_engine.services.orchestrator import ModelOrchestrator
|
|
18
|
+
from cortex_engine.services.registry import ModelRegistry
|
|
19
|
+
from cortex_engine.services.router import RouterEngine
|
|
20
|
+
from cortex_engine.services.scheduler import Scheduler
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# ── Low-level services (pulled from app.state) ────────────────────────────────
|
|
24
|
+
|
|
25
|
+
def get_redis(request: Request) -> redis.Redis:
    """Resolve the shared Redis client installed on app.state at startup."""
    state = request.app.state
    return state.redis
|
|
27
|
+
|
|
28
|
+
def get_registry(request: Request) -> ModelRegistry:
    """Resolve the ModelRegistry service from application state."""
    state = request.app.state
    return state.registry
|
|
30
|
+
|
|
31
|
+
def get_cache(request: Request) -> CacheManager:
    """Resolve the CacheManager service from application state."""
    state = request.app.state
    return state.cache
|
|
33
|
+
|
|
34
|
+
def get_scheduler(request: Request) -> Scheduler:
    """Resolve the Scheduler service from application state."""
    state = request.app.state
    return state.scheduler
|
|
36
|
+
|
|
37
|
+
def get_feedback(request: Request) -> FeedbackSystem:
    """Resolve the FeedbackSystem service from application state."""
    state = request.app.state
    return state.feedback
|
|
39
|
+
|
|
40
|
+
def get_orchestrator(request: Request) -> ModelOrchestrator:
    """Resolve the ModelOrchestrator service from application state."""
    state = request.app.state
    return state.orchestrator
|
cortex_engine/main.py
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Cortex-Engine — Main Application
|
|
3
|
+
FastAPI entry point. Bootstraps all services on startup.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
import time
|
|
10
|
+
from contextlib import asynccontextmanager
|
|
11
|
+
|
|
12
|
+
import redis.asyncio as redis_asyncio
|
|
13
|
+
from fastapi import FastAPI
|
|
14
|
+
from fastapi.middleware.cors import CORSMiddleware
|
|
15
|
+
from fastapi.middleware.gzip import GZipMiddleware
|
|
16
|
+
|
|
17
|
+
from cortex_engine.config import settings
|
|
18
|
+
from cortex_engine.routers.inference import router as inference_router
|
|
19
|
+
from cortex_engine.routers.api import registry_router, admin_router
|
|
20
|
+
from cortex_engine.services.cache_manager import CacheManager
|
|
21
|
+
from cortex_engine.services.evaluator import EvaluationEngine
|
|
22
|
+
from cortex_engine.services.feedback import FeedbackSystem
|
|
23
|
+
from cortex_engine.services.orchestrator import ModelOrchestrator
|
|
24
|
+
from cortex_engine.services.registry import ModelRegistry
|
|
25
|
+
from cortex_engine.services.router import RouterEngine
|
|
26
|
+
from cortex_engine.services.scheduler import Scheduler
|
|
27
|
+
|
|
28
|
+
# Root logging configuration: timestamped, level-aligned records shared by all services.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)-8s | %(name)s — %(message)s",
)
log = logging.getLogger("cortex_engine")

# Wall-clock timestamp of process start; used by /health and /metrics to report uptime.
_BOOT_TIME = time.time()
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# ── Lifespan (startup / shutdown) ─────────────────────────────────────────────
|
|
38
|
+
|
|
39
|
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan: build all services on startup, tear down on exit.

    Fix: the shutdown path is now inside ``try/finally``. Without it, an
    exception thrown into the generator at ``yield`` (e.g. a failed shutdown
    or cancellation propagated by the ASGI server) would skip the cleanup
    and leak the Redis client and connection pool.
    """
    log.info("🚀 Cortex-Engine booting …")

    # Redis — one shared async pool for every service.
    pool = redis_asyncio.ConnectionPool.from_url(
        settings.redis_url,
        max_connections=settings.redis_max_connections,
        decode_responses=True,
    )
    r = redis_asyncio.Redis(connection_pool=pool)
    await r.ping()  # fail fast if Redis is unreachable
    log.info("✅ Redis connected: %s", settings.redis_url)

    # Core services.
    registry = ModelRegistry(r)
    cache = CacheManager(r)
    scheduler = Scheduler(r, registry)
    router = RouterEngine(registry=registry)
    evaluator = EvaluationEngine()
    feedback = FeedbackSystem(r)

    # Seed the registry before the orchestrator starts routing against it.
    await registry.seed()

    orchestrator = ModelOrchestrator(
        registry=registry,
        router=router,
        scheduler=scheduler,
        cache=cache,
        evaluator=evaluator,
        feedback=feedback,
    )

    # Expose services via app.state so the Depends() providers can find them.
    app.state.redis = r
    app.state.registry = registry
    app.state.cache = cache
    app.state.scheduler = scheduler
    app.state.router = router
    app.state.evaluator = evaluator
    app.state.feedback = feedback
    app.state.orchestrator = orchestrator

    log.info("✅ All services initialised.")
    try:
        yield
    finally:
        # Shutdown — always release Redis resources, even on error.
        log.info("🛑 Cortex-Engine shutting down …")
        await r.aclose()
        await pool.aclose()
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
# ── App ───────────────────────────────────────────────────────────────────────
|
|
92
|
+
|
|
93
|
+
# ASGI application. All services are created in `lifespan` and attached to
# app.state; routers and middleware are registered below at import time.
app = FastAPI(
    title="Cortex-Engine",
    description=(
        "Distributed AI system: routes queries to 30–50+ models, "
        "manages GPU scheduling, caching, and continuous feedback."
    ),
    version="0.1.0",
    lifespan=lifespan,
    docs_url="/docs",
    redoc_url="/redoc",
)

# Compress responses larger than 1000 bytes.
app.add_middleware(GZipMiddleware, minimum_size=1000)
# CORS policy comes from settings (default "*" — see config.py note).
app.add_middleware(
    CORSMiddleware,
    allow_origins=settings.allowed_origins,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
# ── Latency middleware ─────────────────────────────────────────────────────────
|
|
115
|
+
|
|
116
|
+
import time as _time
|
|
117
|
+
from fastapi import Request, Response
|
|
118
|
+
|
|
119
|
+
@app.middleware("http")
async def latency_header(request: Request, call_next):
    """Attach an ``X-Process-Time-Ms`` header recording handler latency."""
    started = _time.perf_counter()
    response: Response = await call_next(request)
    elapsed_ms = round((_time.perf_counter() - started) * 1000, 2)
    response.headers["X-Process-Time-Ms"] = str(elapsed_ms)
    return response
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
# ── Routers ───────────────────────────────────────────────────────────────────
|
|
130
|
+
|
|
131
|
+
app.include_router(inference_router)
|
|
132
|
+
app.include_router(registry_router)
|
|
133
|
+
app.include_router(admin_router)
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
# ── Health endpoints ───────────────────────────────────────────────────────────
|
|
137
|
+
|
|
138
|
+
@app.get("/", tags=["Health"])
|
|
139
|
+
async def root():
|
|
140
|
+
return {"service": "Cortex-Engine", "version": "0.1.0", "status": "ok"}
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
@app.get("/health", tags=["Health"])
|
|
144
|
+
async def health(request: Request):
|
|
145
|
+
r: redis_asyncio.Redis = request.app.state.redis
|
|
146
|
+
try:
|
|
147
|
+
await r.ping()
|
|
148
|
+
redis_ok = True
|
|
149
|
+
except Exception:
|
|
150
|
+
redis_ok = False
|
|
151
|
+
|
|
152
|
+
reg_stats = await request.app.state.registry.stats()
|
|
153
|
+
cache_info = await request.app.state.cache.stats()
|
|
154
|
+
|
|
155
|
+
return {
|
|
156
|
+
"status": "ok" if redis_ok else "degraded",
|
|
157
|
+
"uptime_seconds": round(time.time() - _BOOT_TIME, 1),
|
|
158
|
+
"redis": "up" if redis_ok else "down",
|
|
159
|
+
"models": reg_stats,
|
|
160
|
+
"cache": cache_info,
|
|
161
|
+
}
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
@app.get("/metrics", tags=["Health"])
|
|
165
|
+
async def metrics(request: Request):
|
|
166
|
+
fb_stats = await request.app.state.feedback.accuracy_stats()
|
|
167
|
+
cache_stats = await request.app.state.cache.stats()
|
|
168
|
+
reg_stats = await request.app.state.registry.stats()
|
|
169
|
+
queue_depth = await request.app.state.scheduler.queue_depth()
|
|
170
|
+
gpu_loads = await request.app.state.scheduler.gpu_loads()
|
|
171
|
+
|
|
172
|
+
return {
|
|
173
|
+
"uptime_seconds": round(time.time() - _BOOT_TIME, 1),
|
|
174
|
+
"routing_accuracy": fb_stats["accuracy"],
|
|
175
|
+
"cache_hit_rate": cache_stats["hit_rate"],
|
|
176
|
+
"models": reg_stats,
|
|
177
|
+
"queue_depth": queue_depth,
|
|
178
|
+
"gpu_loads": gpu_loads,
|
|
179
|
+
"feedback": fb_stats,
|
|
180
|
+
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# models sub-package
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Cortex-Engine — Pydantic Schemas
|
|
3
|
+
Defines all request/response models used across the system.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from enum import Enum
|
|
9
|
+
from typing import Any, Dict, List, Optional
|
|
10
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# ─────────────────────────────────────────────
|
|
14
|
+
# Enums
|
|
15
|
+
# ─────────────────────────────────────────────
|
|
16
|
+
|
|
17
|
+
class ModelType(str, Enum):
    """Task category a model is specialised for (used as a routing dimension)."""

    CODING = "coding"
    DEBUGGING = "debugging"
    EXPLANATION = "explanation"
    TESTING = "testing"
    REFACTORING = "refactoring"
    DOCS = "docs"
    GENERAL = "general"
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class ModelStatus(str, Enum):
    """Serving state of a registered model."""

    AVAILABLE = "available"
    LOADING = "loading"
    BUSY = "busy"
    OFFLINE = "offline"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class EvalMethod(str, Enum):
    """Strategy used by the evaluation engine to score a model's output."""

    UNIT_TEST = "unit_test"
    STATIC_ANALYSIS = "static_analysis"
    LLM_GRADING = "llm_grading"
    EXECUTION = "execution"
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# ─────────────────────────────────────────────
|
|
42
|
+
# Model Registry Schemas
|
|
43
|
+
# ─────────────────────────────────────────────
|
|
44
|
+
|
|
45
|
+
class ModelMetadata(BaseModel):
    """Registry entry describing one deployed model and its serving profile."""

    model_name: str
    type: ModelType
    cluster: str  # e.g. "python", "typescript", "rust"
    size: str  # e.g. "7B", "14B", "70B"
    latency_ms: int = Field(ge=0)  # expected p50 latency in ms
    cost_per_tok: float = Field(ge=0.0)  # USD per 1K tokens
    gpu_location: str  # e.g. "gpu-0", "gpu-3"
    status: ModelStatus = ModelStatus.AVAILABLE
    priority: int = 5  # 1 (lowest) – 10 (highest)
    tags: List[str] = []

    # Fix: protected_namespaces=() silences Pydantic v2's UserWarning that the
    # `model_name` field collides with the reserved `model_` namespace.
    model_config = ConfigDict(use_enum_values=True, protected_namespaces=())
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class ModelRegistryResponse(BaseModel):
    """Envelope returned by GET /registry/: full model list plus summary counts."""

    models: List[ModelMetadata]
    total: int  # total registered models
    available: int  # count with status AVAILABLE (per registry.stats())
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
# ─────────────────────────────────────────────
|
|
67
|
+
# Inference Schemas
|
|
68
|
+
# ─────────────────────────────────────────────
|
|
69
|
+
|
|
70
|
+
class InferenceRequest(BaseModel):
    """Client request for a single inference call via POST /inference/."""

    query: str = Field(..., min_length=1, max_length=32_000)
    preferred_type: Optional[ModelType] = None  # optional routing hint
    preferred_model: Optional[str] = None  # override routing
    max_tokens: int = Field(default=2048, ge=64, le=16384)
    temperature: float = Field(default=0.2, ge=0.0, le=2.0)
    evaluate: bool = True  # whether to score the output after inference
    user_id: Optional[str] = None
    session_id: Optional[str] = None
    metadata: Dict[str, Any] = {}  # free-form extras; Pydantic copies this default per instance
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class RouteDecision(BaseModel):
    """Outcome of the routing step: which cluster/model was chosen and why."""

    cluster: str
    selected_model: str
    confidence: float = Field(ge=0.0, le=1.0)
    fallback_models: List[str] = []  # alternates, presumably tried in order — confirm with RouterEngine
    routing_latency_ms: float = 0.0  # time spent making this decision
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
class EvaluationResult(BaseModel):
    """Score assigned to a model output by the evaluation engine."""

    success: bool
    score: float = Field(ge=0.0, le=1.0)
    method: EvalMethod  # which evaluation strategy produced the score
    details: str = ""
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
class InferenceResponse(BaseModel):
    """Result of an inference call, including the routing decision taken."""

    request_id: str
    output: str
    model_used: str
    route: RouteDecision
    evaluation: Optional[EvaluationResult] = None  # absent when evaluation is disabled
    total_latency_ms: float
    cached: bool = False  # True when served from the cache
    tokens_used: int = 0

    # Fix: silence Pydantic v2's protected-namespace UserWarning for the
    # `model_used` field (collides with the reserved `model_` prefix).
    model_config = ConfigDict(protected_namespaces=())
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
# ─────────────────────────────────────────────
|
|
109
|
+
# Scheduler Schemas
|
|
110
|
+
# ─────────────────────────────────────────────
|
|
111
|
+
|
|
112
|
+
class ScheduleRequest(BaseModel):
    """Ask the scheduler for a GPU slot to run the named model."""

    model_name: str
    gpu_hint: Optional[str] = None  # prefer specific GPU
    priority: int = 5

    # Fix: silence Pydantic v2's protected-namespace UserWarning for `model_name`.
    model_config = ConfigDict(protected_namespaces=())
|
|
116
|
+
|
|
117
|
+
class ScheduleResult(BaseModel):
    """Scheduler's placement decision for one model run."""

    model_name: str
    gpu_location: str
    queue_position: int
    estimated_wait_ms: float

    # Fix: silence Pydantic v2's protected-namespace UserWarning for `model_name`.
    model_config = ConfigDict(protected_namespaces=())
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
# ─────────────────────────────────────────────
|
|
125
|
+
# Feedback Schemas
|
|
126
|
+
# ─────────────────────────────────────────────
|
|
127
|
+
|
|
128
|
+
class FeedbackEntry(BaseModel):
    """One routing-outcome record used to tune future routing decisions."""

    request_id: str
    query: str
    predicted_route: str  # model the router selected
    actual_route: Optional[str] = None  # model that ultimately served, if different
    success: bool
    score: float
    user_correction: Optional[str] = None  # what the correct model should have been
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
# ─────────────────────────────────────────────
|
|
139
|
+
# Health / Metrics
|
|
140
|
+
# ─────────────────────────────────────────────
|
|
141
|
+
|
|
142
|
+
class GPUStats(BaseModel):
    """Point-in-time utilisation snapshot for a single GPU."""

    gpu_id: str
    utilization: float  # 0.0 – 1.0
    memory_used: float  # GB
    memory_total: float  # GB
    models_loaded: List[str] = []
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
class KernelMetrics(BaseModel):
    """System-wide metrics aggregate (see the /metrics endpoint)."""

    uptime_seconds: float
    total_requests: int
    requests_per_second: float
    cache_hit_rate: float  # 0.0 – 1.0
    avg_latency_ms: float
    routing_accuracy: float  # 0.0 – 1.0, from the feedback system
    gpu_stats: List[GPUStats] = []
    models_available: int
|
cortex_engine/py.typed
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# This file marks cortex_engine as a typed package (PEP 561).
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# routers sub-package
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Cortex-Engine — /registry and /admin routes
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from fastapi import APIRouter, Depends, HTTPException, status
|
|
6
|
+
from cortex_engine.models.schemas import ModelMetadata, ModelRegistryResponse, ModelStatus
|
|
7
|
+
from cortex_engine.services.registry import ModelRegistry
|
|
8
|
+
from cortex_engine.services.scheduler import Scheduler
|
|
9
|
+
from cortex_engine.services.cache_manager import CacheManager
|
|
10
|
+
from cortex_engine.services.feedback import FeedbackSystem
|
|
11
|
+
from cortex_engine.dependencies import get_registry, get_scheduler, get_cache, get_feedback
|
|
12
|
+
|
|
13
|
+
registry_router = APIRouter(prefix="/registry", tags=["Registry"])
|
|
14
|
+
admin_router = APIRouter(prefix="/admin", tags=["Admin"])
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# ── Registry ──────────────────────────────────────────────────────────────────
|
|
18
|
+
|
|
19
|
+
@registry_router.get("/", response_model=ModelRegistryResponse)
async def list_models(reg: ModelRegistry = Depends(get_registry)):
    """Return every registered model together with total/available counts."""
    all_models = await reg.list_all()
    summary = await reg.stats()
    return ModelRegistryResponse(
        models=all_models,
        total=summary["total"],
        available=summary["available"],
    )
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@registry_router.post("/", status_code=status.HTTP_201_CREATED)
async def register_model(model: ModelMetadata, reg: ModelRegistry = Depends(get_registry)):
    """Register (or overwrite) a model entry; responds 201 with the name."""
    registered_name = model.model_name
    await reg.register(model)
    return {"registered": registered_name}
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@registry_router.get("/{model_name}")
async def get_model(model_name: str, reg: ModelRegistry = Depends(get_registry)):
    """Fetch one model's metadata; 404 when it is not registered."""
    found = await reg.get(model_name)
    if not found:
        raise HTTPException(404, f"Model '{model_name}' not found")
    return found
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@registry_router.patch("/{model_name}/status")
async def set_model_status(
    model_name: str, status: ModelStatus,
    reg: ModelRegistry = Depends(get_registry),
):
    """Set a model's status; 404 when the model is unknown.

    NOTE: the ``status`` query parameter shadows the imported
    ``fastapi.status`` module inside this function; the module is not used
    here, so the shadowing is harmless (renaming would change the API).
    """
    updated = await reg.update_status(model_name, status)
    if not updated:
        raise HTTPException(404, f"Model '{model_name}' not found")
    return {"updated": model_name, "status": status}
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@registry_router.delete("/{model_name}", status_code=204)
async def deregister_model(model_name: str, reg: ModelRegistry = Depends(get_registry)):
    """Remove a model from the registry; 404 when absent, 204 on success."""
    removed = await reg.deregister(model_name)
    if not removed:
        raise HTTPException(404, f"Model '{model_name}' not found")
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
# ── Admin ─────────────────────────────────────────────────────────────────────
|
|
59
|
+
|
|
60
|
+
@admin_router.get("/queue")
async def queue_status(sched: Scheduler = Depends(get_scheduler)):
    """Scheduler snapshot: queue depth, running jobs, and per-GPU load."""
    depth = await sched.queue_depth()
    running = await sched.running_jobs()
    loads = await sched.gpu_loads()
    return {
        "queue_depth": depth,
        "running_jobs": running,
        "gpu_loads": loads,
    }
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
@admin_router.get("/cache")
async def cache_status(cache: CacheManager = Depends(get_cache)):
    """Cache introspection: stats, warm pool, and the 20 most recent evictions."""
    stat_block = await cache.stats()
    warm = await cache.warm_pool()
    evictions = await cache.eviction_log(20)
    return {
        "stats": stat_block,
        "warm_pool": warm,
        "eviction_log": evictions,
    }
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
@admin_router.get("/feedback")
async def feedback_status(
    limit: int = 50,
    fb: FeedbackSystem = Depends(get_feedback),
):
    """Routing-accuracy summary plus the most recent feedback entries."""
    summary = await fb.accuracy_stats()
    recent_entries = await fb.recent(limit)
    return {
        "accuracy_stats": summary,
        "recent": recent_entries,
    }
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Cortex-Engine — /inference routes
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from fastapi import APIRouter, Depends, HTTPException, status
|
|
6
|
+
from cortex_engine.models.schemas import InferenceRequest, InferenceResponse
|
|
7
|
+
from cortex_engine.services.orchestrator import ModelOrchestrator
|
|
8
|
+
from cortex_engine.dependencies import get_orchestrator
|
|
9
|
+
|
|
10
|
+
router = APIRouter(prefix="/inference", tags=["Inference"])
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@router.post(
    "/",
    response_model=InferenceResponse,
    status_code=status.HTTP_200_OK,
    summary="Submit a query for AI inference",
)
async def infer(
    req: InferenceRequest,
    orchestrator: ModelOrchestrator = Depends(get_orchestrator),
) -> InferenceResponse:
    """Run a query through the orchestrator end-to-end.

    Raises:
        HTTPException(503): when the orchestrator raises RuntimeError
            (e.g. no model can serve the request).
    """
    try:
        return await orchestrator.infer(req)
    except RuntimeError as exc:
        # Fix: chain the cause (`from exc`) so server logs show the
        # underlying orchestrator error, not just the 503 (ruff B904).
        raise HTTPException(status_code=503, detail=str(exc)) from exc
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@router.get(
    "/route-preview",
    summary="Preview which model would be selected without running inference",
)
async def route_preview(
    query: str,
    orchestrator: ModelOrchestrator = Depends(get_orchestrator),
):
    # Dry-run of the routing step only — no scheduling, caching, or model call.
    # NOTE(review): reaches into the orchestrator's private `_router` attribute;
    # consider exposing a public route-preview method on ModelOrchestrator.
    route = await orchestrator._router.route(query=query)
    return route
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# services sub-package
|