cortex-engine 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cortex_engine-0.1.0/PKG-INFO +352 -0
- cortex_engine-0.1.0/README.md +307 -0
- cortex_engine-0.1.0/pyproject.toml +185 -0
- cortex_engine-0.1.0/src/cortex_engine/__init__.py +12 -0
- cortex_engine-0.1.0/src/cortex_engine/cli.py +51 -0
- cortex_engine-0.1.0/src/cortex_engine/config.py +35 -0
- cortex_engine-0.1.0/src/cortex_engine/dependencies.py +41 -0
- cortex_engine-0.1.0/src/cortex_engine/main.py +180 -0
- cortex_engine-0.1.0/src/cortex_engine/models/__init__.py +1 -0
- cortex_engine-0.1.0/src/cortex_engine/models/schemas.py +158 -0
- cortex_engine-0.1.0/src/cortex_engine/py.typed +1 -0
- cortex_engine-0.1.0/src/cortex_engine/routers/__init__.py +1 -0
- cortex_engine-0.1.0/src/cortex_engine/routers/api.py +86 -0
- cortex_engine-0.1.0/src/cortex_engine/routers/inference.py +38 -0
- cortex_engine-0.1.0/src/cortex_engine/services/__init__.py +1 -0
- cortex_engine-0.1.0/src/cortex_engine/services/cache_manager.py +111 -0
- cortex_engine-0.1.0/src/cortex_engine/services/evaluator.py +103 -0
- cortex_engine-0.1.0/src/cortex_engine/services/feedback.py +57 -0
- cortex_engine-0.1.0/src/cortex_engine/services/orchestrator.py +138 -0
- cortex_engine-0.1.0/src/cortex_engine/services/registry.py +186 -0
- cortex_engine-0.1.0/src/cortex_engine/services/router.py +214 -0
- cortex_engine-0.1.0/src/cortex_engine/services/scheduler.py +111 -0
|
@@ -0,0 +1,352 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: cortex-engine
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: AI Operating System for Coding — routes queries across 30-50+ LLMs with GPU scheduling, LRU caching, evaluation, and feedback-driven routing.
|
|
5
|
+
Keywords: ai,llm,inference,routing,orchestration,gpu,fastapi,coding-assistant,model-serving
|
|
6
|
+
Author: Cortex-Engine Contributors
|
|
7
|
+
Author-email: Cortex-Engine Contributors <hello@cortex-engine.dev>
|
|
8
|
+
License: MIT
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
19
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
20
|
+
Classifier: Topic :: Internet :: WWW/HTTP :: HTTP Servers
|
|
21
|
+
Classifier: Framework :: FastAPI
|
|
22
|
+
Classifier: Typing :: Typed
|
|
23
|
+
Requires-Dist: fastapi>=0.115,<1.0
|
|
24
|
+
Requires-Dist: uvicorn[standard]>=0.34,<1.0
|
|
25
|
+
Requires-Dist: redis>=5.2,<6.0
|
|
26
|
+
Requires-Dist: pydantic>=2.10,<3.0
|
|
27
|
+
Requires-Dist: pydantic-settings>=2.7,<3.0
|
|
28
|
+
Requires-Dist: httpx>=0.28,<1.0
|
|
29
|
+
Requires-Dist: sentence-transformers>=3.4,<4.0 ; extra == 'embeddings'
|
|
30
|
+
Requires-Dist: faiss-cpu>=1.10,<2.0 ; extra == 'embeddings'
|
|
31
|
+
Requires-Dist: cortex-engine[embeddings,qdrant,postgres] ; extra == 'full'
|
|
32
|
+
Requires-Dist: sqlalchemy[asyncio]>=2.0,<3.0 ; extra == 'postgres'
|
|
33
|
+
Requires-Dist: asyncpg>=0.30,<1.0 ; extra == 'postgres'
|
|
34
|
+
Requires-Dist: qdrant-client>=1.14,<2.0 ; extra == 'qdrant'
|
|
35
|
+
Requires-Python: >=3.11
|
|
36
|
+
Project-URL: Bug Tracker, https://github.com/imnotdev25/cortex-engine/issues
|
|
37
|
+
Project-URL: Changelog, https://github.com/imnotdev25/cortex-engine/blob/main/CHANGELOG.md
|
|
38
|
+
Project-URL: Homepage, https://github.com/imnotdev25/cortex-engine
|
|
39
|
+
Project-URL: Repository, https://github.com/imnotdev25/cortex-engine
|
|
40
|
+
Provides-Extra: embeddings
|
|
41
|
+
Provides-Extra: full
|
|
42
|
+
Provides-Extra: postgres
|
|
43
|
+
Provides-Extra: qdrant
|
|
44
|
+
Description-Content-Type: text/markdown
|
|
45
|
+
|
|
46
|
+
# 🧠 Cortex-Engine
|
|
47
|
+
|
|
48
|
+
> An AI Operating System for coding — routes queries across 30–50+ models, manages GPU scheduling, caching, evaluation, and continuous feedback.
|
|
49
|
+
|
|
50
|
+
```
|
|
51
|
+
models = processes | GPU = CPU | router = scheduler | kernel = control plane
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
---
|
|
55
|
+
|
|
56
|
+
## Architecture
|
|
57
|
+
|
|
58
|
+
```
|
|
59
|
+
User Query
|
|
60
|
+
│
|
|
61
|
+
▼
|
|
62
|
+
┌─────────────────────────────────────────────────────────────────┐
|
|
63
|
+
│ Cortex-Engine API (FastAPI) │
|
|
64
|
+
│ │
|
|
65
|
+
│ POST /inference │
|
|
66
|
+
│ │ │
|
|
67
|
+
│ ▼ │
|
|
68
|
+
│ ┌─────────────────────────────────────────────────────────┐ │
|
|
69
|
+
│ │ Model Orchestrator (central brain) │ │
|
|
70
|
+
│ │ │ │
|
|
71
|
+
│ │ ① Cache Check ──────────► Redis LRU Cache │ │
|
|
72
|
+
│ │ │ miss │ │
|
|
73
|
+
│ │ ▼ │ │
|
|
74
|
+
│ │ ② Router Engine ─────────► Cluster Detection │ │
|
|
75
|
+
│ │ │ Model Selection │ │
|
|
76
|
+
│ │ ▼ Confidence Score │ │
|
|
77
|
+
│ │ ③ Scheduler ──────────────► GPU Assignment │ │
|
|
78
|
+
│ │ │ Priority Queue │ │
|
|
79
|
+
│ │ ▼ │ │
|
|
80
|
+
│ │ ④ Model Worker ──────────► vLLM / Triton (Phase 2) │ │
|
|
81
|
+
│ │ │ │ │
|
|
82
|
+
│ │ ▼ │ │
|
|
83
|
+
│ │ ⑤ Evaluator ─────────────► Static Analysis │ │
|
|
84
|
+
│ │ │ LLM Grading (Phase 2) │ │
|
|
85
|
+
│ │ ▼ │ │
|
|
86
|
+
│ │ ⑥ Feedback Log ──────────► Redis (rolling 10k) │ │
|
|
87
|
+
│ └─────────────────────────────────────────────────────────┘ │
|
|
88
|
+
└─────────────────────────────────────────────────────────────────┘
|
|
89
|
+
│
|
|
90
|
+
▼
|
|
91
|
+
Model Registry (Redis Hash)
|
|
92
|
+
7 seed models → 50+ in Phase 3
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
---
|
|
96
|
+
|
|
97
|
+
## Latency Targets
|
|
98
|
+
|
|
99
|
+
| Component | Target |
|
|
100
|
+
|-------------|---------|
|
|
101
|
+
| Router | < 50 ms |
|
|
102
|
+
| Cache check | < 5 ms |
|
|
103
|
+
| Scheduling | < 10 ms |
|
|
104
|
+
| Total overhead | < 100 ms |
|
|
105
|
+
|
|
106
|
+
---
|
|
107
|
+
|
|
108
|
+
## Project Structure
|
|
109
|
+
|
|
110
|
+
```
|
|
111
|
+
cortex_engine/
|
|
112
|
+
├── main.py ← FastAPI app, lifespan, middleware, health/metrics
|
|
113
|
+
├── config.py ← Pydantic Settings (env-based)
|
|
114
|
+
├── dependencies.py ← All Depends() providers
|
|
115
|
+
│
|
|
116
|
+
├── models/
|
|
117
|
+
│ └── schemas.py ← All Pydantic request/response models + enums
|
|
118
|
+
│
|
|
119
|
+
├── services/
|
|
120
|
+
│ ├── registry.py ← Redis-backed model registry (7 seed models)
|
|
121
|
+
│ ├── router.py ← Keyword/heuristic router with tiebreak scoring
|
|
122
|
+
│ ├── cache_manager.py ← LRU response cache + warm pool + eviction log
|
|
123
|
+
│ ├── scheduler.py ← Priority queue scheduler (Redis sorted sets)
|
|
124
|
+
│ ├── evaluator.py ← Static analysis + LLM-grade stub
|
|
125
|
+
│ ├── feedback.py ← Rolling feedback log + accuracy stats
|
|
126
|
+
│ └── orchestrator.py ← Central brain: routes→schedules→infers→evals
|
|
127
|
+
│
|
|
128
|
+
├── routers/
|
|
129
|
+
│ ├── inference.py ← POST /inference, GET /inference/route-preview
|
|
130
|
+
│ └── api.py ← /registry CRUD, /admin (queue/cache/feedback)
|
|
131
|
+
│
|
|
132
|
+
├── tests/
|
|
133
|
+
│ ├── conftest.py ← Async fixtures with fakeredis (no real Redis needed)
|
|
134
|
+
│ ├── test_registry.py ← 11 tests
|
|
135
|
+
│ ├── test_router.py ← 11 tests
|
|
136
|
+
│ ├── test_cache.py ← 7 tests
|
|
137
|
+
│ ├── test_evaluator.py ← 6 tests
|
|
138
|
+
│ ├── test_scheduler.py ← 7 tests
|
|
139
|
+
│ ├── test_orchestrator.py ← 11 tests (integration)
|
|
140
|
+
│ └── test_feedback.py ← 4 tests (57 total → all pass)
|
|
141
|
+
│
|
|
142
|
+
├── Dockerfile
|
|
143
|
+
├── docker-compose.yml
|
|
144
|
+
├── requirements.txt
|
|
145
|
+
├── pytest.ini
|
|
146
|
+
└── .env.example
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
---
|
|
150
|
+
|
|
151
|
+
## Quickstart
|
|
152
|
+
|
|
153
|
+
### Option A — Local (needs Redis)
|
|
154
|
+
|
|
155
|
+
```bash
|
|
156
|
+
# 1. Clone & install
|
|
157
|
+
git clone <repo>
|
|
158
|
+
cd cortex_engine
|
|
159
|
+
pip install -r requirements.txt
|
|
160
|
+
|
|
161
|
+
# 2. Start Redis
|
|
162
|
+
docker run -d -p 6379:6379 redis:7.4-alpine
|
|
163
|
+
|
|
164
|
+
# 3. Configure
|
|
165
|
+
cp .env.example .env
|
|
166
|
+
|
|
167
|
+
# 4. Run
|
|
168
|
+
uvicorn main:app --reload --port 8000
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
### Option B — Docker Compose (recommended)
|
|
172
|
+
|
|
173
|
+
```bash
|
|
174
|
+
docker compose up --build
|
|
175
|
+
# Optional: include Redis Commander UI
|
|
176
|
+
docker compose --profile dev up --build
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
### Run Tests (no Redis needed)
|
|
180
|
+
|
|
181
|
+
```bash
|
|
182
|
+
pip install -r requirements.txt
|
|
183
|
+
pytest tests/ -v
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
---
|
|
187
|
+
|
|
188
|
+
## API Reference
|
|
189
|
+
|
|
190
|
+
### `POST /inference`
|
|
191
|
+
|
|
192
|
+
Route a query to the best model and get a response.
|
|
193
|
+
|
|
194
|
+
```json
|
|
195
|
+
// Request
|
|
196
|
+
{
|
|
197
|
+
"query": "Write a pytest test for a function that reverses a string",
|
|
198
|
+
"preferred_type": null, // optional: debugging|testing|explanation|...
|
|
199
|
+
"preferred_model": null, // optional: override routing entirely
|
|
200
|
+
"max_tokens": 2048,
|
|
201
|
+
"temperature": 0.2,
|
|
202
|
+
"evaluate": true
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
// Response
|
|
206
|
+
{
|
|
207
|
+
"request_id": "a1b2c3d4e5f6",
|
|
208
|
+
"output": "...",
|
|
209
|
+
"model_used": "starcoder2-7b",
|
|
210
|
+
"route": {
|
|
211
|
+
"cluster": "testing",
|
|
212
|
+
"selected_model": "starcoder2-7b",
|
|
213
|
+
"confidence": 0.72,
|
|
214
|
+
"fallback_models": ["phi-3-mini"],
|
|
215
|
+
"routing_latency_ms": 1.4
|
|
216
|
+
},
|
|
217
|
+
"evaluation": {
|
|
218
|
+
"success": true,
|
|
219
|
+
"score": 0.8,
|
|
220
|
+
"method": "static_analysis",
|
|
221
|
+
"details": "has_content=✓; has_code=✓; no_apology=✓; syntax_ok=✓; keyword_coverage=✓"
|
|
222
|
+
},
|
|
223
|
+
"total_latency_ms": 14.2,
|
|
224
|
+
"cached": false,
|
|
225
|
+
"tokens_used": 87
|
|
226
|
+
}
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
### `GET /inference/route-preview?query=...`
|
|
230
|
+
|
|
231
|
+
Dry-run: see which model would be selected without running inference.
|
|
232
|
+
|
|
233
|
+
### `GET /registry/`
|
|
234
|
+
|
|
235
|
+
List all registered models with status and metadata.
|
|
236
|
+
|
|
237
|
+
### `POST /registry/`
|
|
238
|
+
|
|
239
|
+
Register a new model.
|
|
240
|
+
|
|
241
|
+
### `PATCH /registry/{model_name}/status`
|
|
242
|
+
|
|
243
|
+
Set model status: `available | loading | busy | offline`
|
|
244
|
+
|
|
245
|
+
### `GET /admin/queue`
|
|
246
|
+
|
|
247
|
+
Current GPU queue depth, running jobs, per-GPU load counts.
|
|
248
|
+
|
|
249
|
+
### `GET /admin/cache`
|
|
250
|
+
|
|
251
|
+
Cache hit rate, warm pool, recent evictions.
|
|
252
|
+
|
|
253
|
+
### `GET /admin/feedback`
|
|
254
|
+
|
|
255
|
+
Routing accuracy stats and recent feedback log.
|
|
256
|
+
|
|
257
|
+
### `GET /health`
|
|
258
|
+
|
|
259
|
+
Liveness check — Redis connectivity + model counts.
|
|
260
|
+
|
|
261
|
+
### `GET /metrics`
|
|
262
|
+
|
|
263
|
+
Full system metrics: routing accuracy, cache hit rate, GPU loads, queue depth.
|
|
264
|
+
|
|
265
|
+
---
|
|
266
|
+
|
|
267
|
+
## Cluster → Model Routing
|
|
268
|
+
|
|
269
|
+
| Cluster | Trigger Keywords | Models |
|
|
270
|
+
|---------|-----------------|--------|
|
|
271
|
+
| `debugging` | error, exception, traceback, crash, fix bug | codellama-7b |
|
|
272
|
+
| `testing` | pytest, jest, unit test, mock, assert | starcoder2-7b |
|
|
273
|
+
| `explanation` | explain, what is, how does, document | mistral-7b-instruct |
|
|
274
|
+
| `refactoring` | refactor, optimize, clean up, simplify | qwen-coder-14b |
|
|
275
|
+
| `python` | python, django, fastapi, flask, .py | qwen-coder-7b, qwen-coder-14b |
|
|
276
|
+
| `general_code` | code, class, implement, build | deepseek-coder-6.7b |
|
|
277
|
+
| `fallback` | everything else | phi-3-mini |
|
|
278
|
+
|
|
279
|
+
---
|
|
280
|
+
|
|
281
|
+
## Seed Models (Phase 1 MVP)
|
|
282
|
+
|
|
283
|
+
| Model | Cluster | Size | Latency | GPU |
|
|
284
|
+
|-------|---------|------|---------|-----|
|
|
285
|
+
| `qwen-coder-7b` | python | 7B | 250ms | gpu-0 |
|
|
286
|
+
| `qwen-coder-14b` | python | 14B | 400ms | gpu-1 |
|
|
287
|
+
| `deepseek-coder-6.7b` | general_code | 6.7B | 220ms | gpu-0 |
|
|
288
|
+
| `codellama-7b` | debugging | 7B | 270ms | gpu-1 |
|
|
289
|
+
| `mistral-7b-instruct` | explanation | 7B | 300ms | gpu-2 |
|
|
290
|
+
| `starcoder2-7b` | testing | 7B | 260ms | gpu-2 |
|
|
291
|
+
| `phi-3-mini` | fallback | 3.8B | 150ms | gpu-3 |
|
|
292
|
+
|
|
293
|
+
---
|
|
294
|
+
|
|
295
|
+
## Roadmap
|
|
296
|
+
|
|
297
|
+
### Phase 1 (now — MVP)
|
|
298
|
+
- [x] FastAPI kernel with all core services
|
|
299
|
+
- [x] Redis-backed model registry (7 models)
|
|
300
|
+
- [x] Keyword/heuristic router with cluster detection
|
|
301
|
+
- [x] LRU response cache + warm pool + eviction log
|
|
302
|
+
- [x] Priority queue GPU scheduler (Redis sorted sets)
|
|
303
|
+
- [x] Static analysis evaluator
|
|
304
|
+
- [x] Rolling feedback system + accuracy tracking
|
|
305
|
+
- [x] 57 tests (all passing, no real Redis needed)
|
|
306
|
+
- [x] Docker Compose stack
|
|
307
|
+
|
|
308
|
+
### Phase 2
|
|
309
|
+
- [ ] Swap inference stub → real vLLM HTTP calls
|
|
310
|
+
- [ ] Embedding-based routing (BGE/E5 + FAISS/Qdrant)
|
|
311
|
+
- [ ] 15 models across more language clusters
|
|
312
|
+
- [ ] Ray Serve for distributed model workers
|
|
313
|
+
- [ ] LLM judge model for evaluation (phi-3-mini)
|
|
314
|
+
|
|
315
|
+
### Phase 3
|
|
316
|
+
- [ ] 30–50+ models
|
|
317
|
+
- [ ] Self-improving router (daily retraining from feedback)
|
|
318
|
+
- [ ] Cost-aware routing (balance quality vs. $/token)
|
|
319
|
+
- [ ] Multi-model collaboration (chain models)
|
|
320
|
+
- [ ] PostgreSQL for persistent metadata
|
|
321
|
+
- [ ] Reinforcement learning router
|
|
322
|
+
|
|
323
|
+
---
|
|
324
|
+
|
|
325
|
+
## Environment Variables
|
|
326
|
+
|
|
327
|
+
| Variable | Default | Description |
|
|
328
|
+
|----------|---------|-------------|
|
|
329
|
+
| `REDIS_URL` | `redis://localhost:6379/0` | Redis connection URL |
|
|
330
|
+
| `REDIS_MAX_CONNECTIONS` | `50` | Connection pool size |
|
|
331
|
+
| `PORT` | `8000` | API server port |
|
|
332
|
+
| `WORKERS` | `1` | Uvicorn worker count |
|
|
333
|
+
| `CACHE_TTL_SECONDS` | `3600` | Response cache TTL |
|
|
334
|
+
| `ENABLE_EVALUATION` | `true` | Run evaluator on outputs |
|
|
335
|
+
| `ENABLE_FEEDBACK` | `true` | Log routing feedback |
|
|
336
|
+
|
|
337
|
+
---
|
|
338
|
+
|
|
339
|
+
## Tech Stack
|
|
340
|
+
|
|
341
|
+
| Layer | Tech |
|
|
342
|
+
|-------|------|
|
|
343
|
+
| API | FastAPI + Uvicorn |
|
|
344
|
+
| Cache / State | Redis 7 (asyncio) |
|
|
345
|
+
| Scheduling | Redis sorted sets |
|
|
346
|
+
| Config | Pydantic Settings |
|
|
347
|
+
| Testing | pytest-asyncio + fakeredis |
|
|
348
|
+
| Containers | Docker + Docker Compose |
|
|
349
|
+
| Phase 2 Serving | vLLM + Triton |
|
|
350
|
+
| Phase 2 Orchestration | Ray Serve + Kubernetes |
|
|
351
|
+
| Phase 2 Vector DB | FAISS / Qdrant |
|
|
352
|
+
| Phase 3 Metadata | PostgreSQL + SQLAlchemy |
|
|
@@ -0,0 +1,307 @@
|
|
|
1
|
+
# 🧠 Cortex-Engine
|
|
2
|
+
|
|
3
|
+
> An AI Operating System for coding — routes queries across 30–50+ models, manages GPU scheduling, caching, evaluation, and continuous feedback.
|
|
4
|
+
|
|
5
|
+
```
|
|
6
|
+
models = processes | GPU = CPU | router = scheduler | kernel = control plane
|
|
7
|
+
```
|
|
8
|
+
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
## Architecture
|
|
12
|
+
|
|
13
|
+
```
|
|
14
|
+
User Query
|
|
15
|
+
│
|
|
16
|
+
▼
|
|
17
|
+
┌─────────────────────────────────────────────────────────────────┐
|
|
18
|
+
│ Cortex-Engine API (FastAPI) │
|
|
19
|
+
│ │
|
|
20
|
+
│ POST /inference │
|
|
21
|
+
│ │ │
|
|
22
|
+
│ ▼ │
|
|
23
|
+
│ ┌─────────────────────────────────────────────────────────┐ │
|
|
24
|
+
│ │ Model Orchestrator (central brain) │ │
|
|
25
|
+
│ │ │ │
|
|
26
|
+
│ │ ① Cache Check ──────────► Redis LRU Cache │ │
|
|
27
|
+
│ │ │ miss │ │
|
|
28
|
+
│ │ ▼ │ │
|
|
29
|
+
│ │ ② Router Engine ─────────► Cluster Detection │ │
|
|
30
|
+
│ │ │ Model Selection │ │
|
|
31
|
+
│ │ ▼ Confidence Score │ │
|
|
32
|
+
│ │ ③ Scheduler ──────────────► GPU Assignment │ │
|
|
33
|
+
│ │ │ Priority Queue │ │
|
|
34
|
+
│ │ ▼ │ │
|
|
35
|
+
│ │ ④ Model Worker ──────────► vLLM / Triton (Phase 2) │ │
|
|
36
|
+
│ │ │ │ │
|
|
37
|
+
│ │ ▼ │ │
|
|
38
|
+
│ │ ⑤ Evaluator ─────────────► Static Analysis │ │
|
|
39
|
+
│ │ │ LLM Grading (Phase 2) │ │
|
|
40
|
+
│ │ ▼ │ │
|
|
41
|
+
│ │ ⑥ Feedback Log ──────────► Redis (rolling 10k) │ │
|
|
42
|
+
│ └─────────────────────────────────────────────────────────┘ │
|
|
43
|
+
└─────────────────────────────────────────────────────────────────┘
|
|
44
|
+
│
|
|
45
|
+
▼
|
|
46
|
+
Model Registry (Redis Hash)
|
|
47
|
+
7 seed models → 50+ in Phase 3
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
---
|
|
51
|
+
|
|
52
|
+
## Latency Targets
|
|
53
|
+
|
|
54
|
+
| Component | Target |
|
|
55
|
+
|-------------|---------|
|
|
56
|
+
| Router | < 50 ms |
|
|
57
|
+
| Cache check | < 5 ms |
|
|
58
|
+
| Scheduling | < 10 ms |
|
|
59
|
+
| Total overhead | < 100 ms |
|
|
60
|
+
|
|
61
|
+
---
|
|
62
|
+
|
|
63
|
+
## Project Structure
|
|
64
|
+
|
|
65
|
+
```
|
|
66
|
+
cortex_engine/
|
|
67
|
+
├── main.py ← FastAPI app, lifespan, middleware, health/metrics
|
|
68
|
+
├── config.py ← Pydantic Settings (env-based)
|
|
69
|
+
├── dependencies.py ← All Depends() providers
|
|
70
|
+
│
|
|
71
|
+
├── models/
|
|
72
|
+
│ └── schemas.py ← All Pydantic request/response models + enums
|
|
73
|
+
│
|
|
74
|
+
├── services/
|
|
75
|
+
│ ├── registry.py ← Redis-backed model registry (7 seed models)
|
|
76
|
+
│ ├── router.py ← Keyword/heuristic router with tiebreak scoring
|
|
77
|
+
│ ├── cache_manager.py ← LRU response cache + warm pool + eviction log
|
|
78
|
+
│ ├── scheduler.py ← Priority queue scheduler (Redis sorted sets)
|
|
79
|
+
│ ├── evaluator.py ← Static analysis + LLM-grade stub
|
|
80
|
+
│ ├── feedback.py ← Rolling feedback log + accuracy stats
|
|
81
|
+
│ └── orchestrator.py ← Central brain: routes→schedules→infers→evals
|
|
82
|
+
│
|
|
83
|
+
├── routers/
|
|
84
|
+
│ ├── inference.py ← POST /inference, GET /inference/route-preview
|
|
85
|
+
│ └── api.py ← /registry CRUD, /admin (queue/cache/feedback)
|
|
86
|
+
│
|
|
87
|
+
├── tests/
|
|
88
|
+
│ ├── conftest.py ← Async fixtures with fakeredis (no real Redis needed)
|
|
89
|
+
│ ├── test_registry.py ← 11 tests
|
|
90
|
+
│ ├── test_router.py ← 11 tests
|
|
91
|
+
│ ├── test_cache.py ← 7 tests
|
|
92
|
+
│ ├── test_evaluator.py ← 6 tests
|
|
93
|
+
│ ├── test_scheduler.py ← 7 tests
|
|
94
|
+
│ ├── test_orchestrator.py ← 11 tests (integration)
|
|
95
|
+
│ └── test_feedback.py ← 4 tests (57 total → all pass)
|
|
96
|
+
│
|
|
97
|
+
├── Dockerfile
|
|
98
|
+
├── docker-compose.yml
|
|
99
|
+
├── requirements.txt
|
|
100
|
+
├── pytest.ini
|
|
101
|
+
└── .env.example
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
---
|
|
105
|
+
|
|
106
|
+
## Quickstart
|
|
107
|
+
|
|
108
|
+
### Option A — Local (needs Redis)
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
# 1. Clone & install
|
|
112
|
+
git clone <repo>
|
|
113
|
+
cd cortex_engine
|
|
114
|
+
pip install -r requirements.txt
|
|
115
|
+
|
|
116
|
+
# 2. Start Redis
|
|
117
|
+
docker run -d -p 6379:6379 redis:7.4-alpine
|
|
118
|
+
|
|
119
|
+
# 3. Configure
|
|
120
|
+
cp .env.example .env
|
|
121
|
+
|
|
122
|
+
# 4. Run
|
|
123
|
+
uvicorn main:app --reload --port 8000
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
### Option B — Docker Compose (recommended)
|
|
127
|
+
|
|
128
|
+
```bash
|
|
129
|
+
docker compose up --build
|
|
130
|
+
# Optional: include Redis Commander UI
|
|
131
|
+
docker compose --profile dev up --build
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
### Run Tests (no Redis needed)
|
|
135
|
+
|
|
136
|
+
```bash
|
|
137
|
+
pip install -r requirements.txt
|
|
138
|
+
pytest tests/ -v
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
---
|
|
142
|
+
|
|
143
|
+
## API Reference
|
|
144
|
+
|
|
145
|
+
### `POST /inference`
|
|
146
|
+
|
|
147
|
+
Route a query to the best model and get a response.
|
|
148
|
+
|
|
149
|
+
```json
|
|
150
|
+
// Request
|
|
151
|
+
{
|
|
152
|
+
"query": "Write a pytest test for a function that reverses a string",
|
|
153
|
+
"preferred_type": null, // optional: debugging|testing|explanation|...
|
|
154
|
+
"preferred_model": null, // optional: override routing entirely
|
|
155
|
+
"max_tokens": 2048,
|
|
156
|
+
"temperature": 0.2,
|
|
157
|
+
"evaluate": true
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
// Response
|
|
161
|
+
{
|
|
162
|
+
"request_id": "a1b2c3d4e5f6",
|
|
163
|
+
"output": "...",
|
|
164
|
+
"model_used": "starcoder2-7b",
|
|
165
|
+
"route": {
|
|
166
|
+
"cluster": "testing",
|
|
167
|
+
"selected_model": "starcoder2-7b",
|
|
168
|
+
"confidence": 0.72,
|
|
169
|
+
"fallback_models": ["phi-3-mini"],
|
|
170
|
+
"routing_latency_ms": 1.4
|
|
171
|
+
},
|
|
172
|
+
"evaluation": {
|
|
173
|
+
"success": true,
|
|
174
|
+
"score": 0.8,
|
|
175
|
+
"method": "static_analysis",
|
|
176
|
+
"details": "has_content=✓; has_code=✓; no_apology=✓; syntax_ok=✓; keyword_coverage=✓"
|
|
177
|
+
},
|
|
178
|
+
"total_latency_ms": 14.2,
|
|
179
|
+
"cached": false,
|
|
180
|
+
"tokens_used": 87
|
|
181
|
+
}
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
### `GET /inference/route-preview?query=...`
|
|
185
|
+
|
|
186
|
+
Dry-run: see which model would be selected without running inference.
|
|
187
|
+
|
|
188
|
+
### `GET /registry/`
|
|
189
|
+
|
|
190
|
+
List all registered models with status and metadata.
|
|
191
|
+
|
|
192
|
+
### `POST /registry/`
|
|
193
|
+
|
|
194
|
+
Register a new model.
|
|
195
|
+
|
|
196
|
+
### `PATCH /registry/{model_name}/status`
|
|
197
|
+
|
|
198
|
+
Set model status: `available | loading | busy | offline`
|
|
199
|
+
|
|
200
|
+
### `GET /admin/queue`
|
|
201
|
+
|
|
202
|
+
Current GPU queue depth, running jobs, per-GPU load counts.
|
|
203
|
+
|
|
204
|
+
### `GET /admin/cache`
|
|
205
|
+
|
|
206
|
+
Cache hit rate, warm pool, recent evictions.
|
|
207
|
+
|
|
208
|
+
### `GET /admin/feedback`
|
|
209
|
+
|
|
210
|
+
Routing accuracy stats and recent feedback log.
|
|
211
|
+
|
|
212
|
+
### `GET /health`
|
|
213
|
+
|
|
214
|
+
Liveness check — Redis connectivity + model counts.
|
|
215
|
+
|
|
216
|
+
### `GET /metrics`
|
|
217
|
+
|
|
218
|
+
Full system metrics: routing accuracy, cache hit rate, GPU loads, queue depth.
|
|
219
|
+
|
|
220
|
+
---
|
|
221
|
+
|
|
222
|
+
## Cluster → Model Routing
|
|
223
|
+
|
|
224
|
+
| Cluster | Trigger Keywords | Models |
|
|
225
|
+
|---------|-----------------|--------|
|
|
226
|
+
| `debugging` | error, exception, traceback, crash, fix bug | codellama-7b |
|
|
227
|
+
| `testing` | pytest, jest, unit test, mock, assert | starcoder2-7b |
|
|
228
|
+
| `explanation` | explain, what is, how does, document | mistral-7b-instruct |
|
|
229
|
+
| `refactoring` | refactor, optimize, clean up, simplify | qwen-coder-14b |
|
|
230
|
+
| `python` | python, django, fastapi, flask, .py | qwen-coder-7b, qwen-coder-14b |
|
|
231
|
+
| `general_code` | code, class, implement, build | deepseek-coder-6.7b |
|
|
232
|
+
| `fallback` | everything else | phi-3-mini |
|
|
233
|
+
|
|
234
|
+
---
|
|
235
|
+
|
|
236
|
+
## Seed Models (Phase 1 MVP)
|
|
237
|
+
|
|
238
|
+
| Model | Cluster | Size | Latency | GPU |
|
|
239
|
+
|-------|---------|------|---------|-----|
|
|
240
|
+
| `qwen-coder-7b` | python | 7B | 250ms | gpu-0 |
|
|
241
|
+
| `qwen-coder-14b` | python | 14B | 400ms | gpu-1 |
|
|
242
|
+
| `deepseek-coder-6.7b` | general_code | 6.7B | 220ms | gpu-0 |
|
|
243
|
+
| `codellama-7b` | debugging | 7B | 270ms | gpu-1 |
|
|
244
|
+
| `mistral-7b-instruct` | explanation | 7B | 300ms | gpu-2 |
|
|
245
|
+
| `starcoder2-7b` | testing | 7B | 260ms | gpu-2 |
|
|
246
|
+
| `phi-3-mini` | fallback | 3.8B | 150ms | gpu-3 |
|
|
247
|
+
|
|
248
|
+
---
|
|
249
|
+
|
|
250
|
+
## Roadmap
|
|
251
|
+
|
|
252
|
+
### Phase 1 (now — MVP)
|
|
253
|
+
- [x] FastAPI kernel with all core services
|
|
254
|
+
- [x] Redis-backed model registry (7 models)
|
|
255
|
+
- [x] Keyword/heuristic router with cluster detection
|
|
256
|
+
- [x] LRU response cache + warm pool + eviction log
|
|
257
|
+
- [x] Priority queue GPU scheduler (Redis sorted sets)
|
|
258
|
+
- [x] Static analysis evaluator
|
|
259
|
+
- [x] Rolling feedback system + accuracy tracking
|
|
260
|
+
- [x] 57 tests (all passing, no real Redis needed)
|
|
261
|
+
- [x] Docker Compose stack
|
|
262
|
+
|
|
263
|
+
### Phase 2
|
|
264
|
+
- [ ] Swap inference stub → real vLLM HTTP calls
|
|
265
|
+
- [ ] Embedding-based routing (BGE/E5 + FAISS/Qdrant)
|
|
266
|
+
- [ ] 15 models across more language clusters
|
|
267
|
+
- [ ] Ray Serve for distributed model workers
|
|
268
|
+
- [ ] LLM judge model for evaluation (phi-3-mini)
|
|
269
|
+
|
|
270
|
+
### Phase 3
|
|
271
|
+
- [ ] 30–50+ models
|
|
272
|
+
- [ ] Self-improving router (daily retraining from feedback)
|
|
273
|
+
- [ ] Cost-aware routing (balance quality vs. $/token)
|
|
274
|
+
- [ ] Multi-model collaboration (chain models)
|
|
275
|
+
- [ ] PostgreSQL for persistent metadata
|
|
276
|
+
- [ ] Reinforcement learning router
|
|
277
|
+
|
|
278
|
+
---
|
|
279
|
+
|
|
280
|
+
## Environment Variables
|
|
281
|
+
|
|
282
|
+
| Variable | Default | Description |
|
|
283
|
+
|----------|---------|-------------|
|
|
284
|
+
| `REDIS_URL` | `redis://localhost:6379/0` | Redis connection URL |
|
|
285
|
+
| `REDIS_MAX_CONNECTIONS` | `50` | Connection pool size |
|
|
286
|
+
| `PORT` | `8000` | API server port |
|
|
287
|
+
| `WORKERS` | `1` | Uvicorn worker count |
|
|
288
|
+
| `CACHE_TTL_SECONDS` | `3600` | Response cache TTL |
|
|
289
|
+
| `ENABLE_EVALUATION` | `true` | Run evaluator on outputs |
|
|
290
|
+
| `ENABLE_FEEDBACK` | `true` | Log routing feedback |
|
|
291
|
+
|
|
292
|
+
---
|
|
293
|
+
|
|
294
|
+
## Tech Stack
|
|
295
|
+
|
|
296
|
+
| Layer | Tech |
|
|
297
|
+
|-------|------|
|
|
298
|
+
| API | FastAPI + Uvicorn |
|
|
299
|
+
| Cache / State | Redis 7 (asyncio) |
|
|
300
|
+
| Scheduling | Redis sorted sets |
|
|
301
|
+
| Config | Pydantic Settings |
|
|
302
|
+
| Testing | pytest-asyncio + fakeredis |
|
|
303
|
+
| Containers | Docker + Docker Compose |
|
|
304
|
+
| Phase 2 Serving | vLLM + Triton |
|
|
305
|
+
| Phase 2 Orchestration | Ray Serve + Kubernetes |
|
|
306
|
+
| Phase 2 Vector DB | FAISS / Qdrant |
|
|
307
|
+
| Phase 3 Metadata | PostgreSQL + SQLAlchemy |
|