codex-api-proxy 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codex_api_proxy/__init__.py +3 -0
- codex_api_proxy/app_server_runner.py +554 -0
- codex_api_proxy/cli.py +570 -0
- codex_api_proxy/codex_runner.py +278 -0
- codex_api_proxy/config.py +83 -0
- codex_api_proxy/main.py +561 -0
- codex_api_proxy/prompt.py +31 -0
- codex_api_proxy/schemas.py +48 -0
- codex_api_proxy-0.1.0.dist-info/METADATA +347 -0
- codex_api_proxy-0.1.0.dist-info/RECORD +13 -0
- codex_api_proxy-0.1.0.dist-info/WHEEL +5 -0
- codex_api_proxy-0.1.0.dist-info/entry_points.txt +2 -0
- codex_api_proxy-0.1.0.dist-info/top_level.txt +1 -0
codex_api_proxy/main.py
ADDED
|
@@ -0,0 +1,561 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import json
|
|
5
|
+
import logging
|
|
6
|
+
import shutil
|
|
7
|
+
import time
|
|
8
|
+
import uuid
|
|
9
|
+
from collections.abc import AsyncIterator, Awaitable, Callable
|
|
10
|
+
from contextlib import asynccontextmanager
|
|
11
|
+
|
|
12
|
+
from fastapi import Depends, FastAPI, Header, HTTPException
|
|
13
|
+
from fastapi.responses import StreamingResponse
|
|
14
|
+
|
|
15
|
+
from . import __version__
|
|
16
|
+
from .app_server_runner import AppServerBusy, AppServerRunError, create_stdio_app_server_pool
|
|
17
|
+
from .codex_runner import CodexRunError, run_codex_exec
|
|
18
|
+
from .config import Settings, SettingsError
|
|
19
|
+
from .prompt import messages_to_prompt
|
|
20
|
+
from .schemas import ChatCompletionChoice, ChatCompletionRequest, ChatCompletionResponse, ChoiceMessage
|
|
21
|
+
|
|
22
|
+
# Callable shape of the "exec" engine runner: create_app awaits it with
# keyword arguments and uses the awaited result as the completion text.
Runner = Callable[..., Awaitable[str]]
# Callable shape of the "app-server" engine runner: create_app calls it with
# keyword arguments and iterates the result with ``async for`` to get text chunks.
AppServerRunner = Callable[..., AsyncIterator[str]]
# Dedicated logger that receives the compact JSON latency records.
latency_logger = logging.getLogger("codex_api_proxy.latency")
# uvicorn's error logger; _log_latency mirrors every latency record to it.
uvicorn_logger = logging.getLogger("uvicorn.error")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _elapsed_ms(started_at: float) -> float:
|
|
29
|
+
return round((time.perf_counter() - started_at) * 1000, 2)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _log_latency(payload: dict[str, object]) -> None:
    """Write ``payload`` as one compact JSON line to both latency sinks.

    The record is serialized once (non-ASCII characters kept verbatim, no
    whitespace between separators) and then logged at INFO level on the
    dedicated latency logger and, with a ``codex_api_proxy.latency`` prefix,
    on uvicorn's error logger.
    """
    encoded = json.dumps(payload, separators=(",", ":"), ensure_ascii=False)
    latency_logger.info(encoded)
    uvicorn_logger.info("codex_api_proxy.latency %s", encoded)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _sse_event(data: dict[str, object] | str) -> str:
|
|
39
|
+
if isinstance(data, str):
|
|
40
|
+
return f"data: {data}\n\n"
|
|
41
|
+
return f"data: {json.dumps(data, ensure_ascii=False, separators=(',', ':'))}\n\n"
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _stream_chat_completion(
    *,
    completion_id: str,
    created: int,
    model: str,
    content: str,
    request_id: str,
    request_started_at: float,
):
    """Yield an already-complete answer as an OpenAI-style SSE chunk sequence.

    The generator logs a ``chat_completion_first_sse`` latency record when it
    is first advanced, then yields four frames: the assistant role chunk, one
    chunk carrying the whole ``content``, the ``finish_reason: "stop"`` chunk
    and the ``[DONE]`` terminator.

    Args:
        completion_id: Value for the ``id`` field of every chunk.
        created: Value for the ``created`` field of every chunk.
        model: Value for the ``model`` field of every chunk.
        content: Full completion text, sent in a single content chunk.
        request_id: Identifier included in the latency record.
        request_started_at: ``time.perf_counter()`` reading taken when the
            request started; used for the time-to-first-SSE measurement.
    """

    def render_chunk(delta: dict[str, object], finish_reason: str | None) -> str:
        # Key order matches the serialized wire format: envelope first, choices last.
        return _sse_event(
            {
                "id": completion_id,
                "object": "chat.completion.chunk",
                "created": created,
                "model": model,
                "choices": [{"index": 0, "delta": delta, "finish_reason": finish_reason}],
            }
        )

    first_sse_record = {
        "event": "chat_completion_first_sse",
        "request_id": request_id,
        "time_to_first_sse_ms": _elapsed_ms(request_started_at),
    }
    _log_latency(first_sse_record)

    frames = (
        ({"role": "assistant"}, None),
        ({"content": content}, None),
        ({}, "stop"),
    )
    for delta, finish_reason in frames:
        yield render_chunk(delta, finish_reason)
    yield _sse_event("[DONE]")
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
async def _stream_chat_completion_from_deltas(
    *,
    completion_id: str,
    created: int,
    model: str,
    deltas: AsyncIterator[str],
    request_id: str,
    request_started_at: float,
    phases_ms: dict[str, float],
    on_complete: Callable[[], None] | None = None,
    on_error: Callable[[], None] | None = None,
):
    """Stream text deltas to the client as OpenAI-style SSE chunks.

    Frames are yielded in this order: the assistant role chunk, one content
    chunk per item produced by ``deltas``, the ``finish_reason: "stop"``
    chunk and the ``[DONE]`` terminator.

    Latency records written along the way:
      * ``chat_completion_first_sse`` before the role chunk,
      * ``chat_completion_first_content_sse`` before the first content chunk,
      * ``chat_completion_latency`` with ``status: "ok"`` after a clean
        finish, or with ``status: "error"`` when streaming fails.

    Args:
        completion_id: Value for the ``id`` field of every chunk.
        created: Value for the ``created`` field of every chunk.
        model: Value for the ``model`` field of chunks and latency records.
        deltas: Async iterator producing the completion text piece by piece.
        request_id: Identifier included in every latency record.
        request_started_at: ``time.perf_counter()`` reading taken when the
            request started.
        phases_ms: Mutable per-request timing map; ``app_server_exec`` is
            written into it once iteration over ``deltas`` ends, whether it
            ended normally or not.
        on_complete: Optional callback invoked after the stream finished
            without an exception.
        on_error: Optional callback invoked when an ``Exception`` escapes the
            stream; the exception is re-raised afterwards.

    Raises:
        Exception: Whatever ``deltas`` (or frame rendering) raises is
            propagated unchanged after the error record and ``on_error``.
    """
    base = {
        "id": completion_id,
        "object": "chat.completion.chunk",
        "created": created,
        "model": model,
    }

    def render_chunk(delta: dict[str, object], finish_reason: str | None) -> str:
        return _sse_event(
            {
                **base,
                "choices": [{"index": 0, "delta": delta, "finish_reason": finish_reason}],
            }
        )

    _log_latency(
        {
            "event": "chat_completion_first_sse",
            "request_id": request_id,
            "time_to_first_sse_ms": _elapsed_ms(request_started_at),
        }
    )
    yield render_chunk({"role": "assistant"}, None)

    first_content = True
    started_at = time.perf_counter()
    try:
        try:
            async for delta in deltas:
                if first_content:
                    first_content = False
                    _log_latency(
                        {
                            "event": "chat_completion_first_content_sse",
                            "request_id": request_id,
                            "time_to_first_content_sse_ms": _elapsed_ms(request_started_at),
                        }
                    )
                yield render_chunk({"content": delta}, None)
        finally:
            # Recorded on success, failure and early close alike.
            phases_ms["app_server_exec"] = _elapsed_ms(started_at)
        yield render_chunk({}, "stop")
        yield _sse_event("[DONE]")
    except Exception as exc:
        # Streaming failures used to leave no latency record at all. No
        # "http_status" key is written because the response status line has
        # already been sent by the time a delta can fail.
        _log_latency(
            {
                "event": "chat_completion_latency",
                "request_id": request_id,
                "status": "error",
                "error_type": type(exc).__name__,
                "stream": True,
                "model": model,
                "engine": "app-server",
                "phases_ms": {**phases_ms, "total": _elapsed_ms(request_started_at)},
            }
        )
        if on_error:
            on_error()
        raise
    # NOTE: GeneratorExit (generator closed early) is not an ``Exception``,
    # so neither callback runs in that case.
    if on_complete:
        on_complete()
    _log_latency(
        {
            "event": "chat_completion_latency",
            "request_id": request_id,
            "status": "ok",
            "stream": True,
            "model": model,
            "engine": "app-server",
            "phases_ms": {**phases_ms, "total": _elapsed_ms(request_started_at)},
        }
    )
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def create_app(
    settings: Settings | None = None,
    runner: Runner = run_codex_exec,
    app_server_runner: AppServerRunner | None = None,
) -> FastAPI:
    """Build the FastAPI application for the Codex OpenAI-compatible proxy.

    Routes registered on the returned app:
      * ``GET /health`` and ``GET /ready``: report whether the configured
        codex binary is found on ``PATH``.
      * ``GET /metrics``: in-process request counters and uptime.
      * ``GET /v1/models``: a single static ``codex-local`` model entry.
      * ``POST /v1/chat/completions``: runs the prompt through the configured
        engine (``app-server`` or ``exec``), optionally as an SSE stream.

    The two ``/v1`` routes require ``Authorization: Bearer <api_key>`` when
    ``settings.api_key`` is set; the other routes are unauthenticated.

    Args:
        settings: Proxy configuration; loaded with ``Settings.from_env()``
            when omitted.
        runner: Coroutine function used by the ``exec`` engine. It is awaited
            with keyword arguments and its result is the completion text.
        app_server_runner: Factory of async text-chunk iterators used by the
            ``app-server`` engine. When omitted, a stdio app-server pool is
            created lazily on first use and closed on application shutdown.

    Returns:
        The configured ``FastAPI`` instance. ``app.state`` carries
        ``settings``, ``app_server_pool``, ``started_at`` and ``metrics``.
    """
    # Local import: only needed for the constant-time token comparison below.
    import hmac

    active_settings = settings or Settings.from_env()

    @asynccontextmanager
    async def lifespan(app: FastAPI):
        # Nothing to do at startup; on shutdown close the pool if one was created.
        try:
            yield
        finally:
            pool = app.state.app_server_pool
            if pool is not None:
                await pool.close()

    app = FastAPI(title="Codex OpenAI-Compatible Proxy", version=__version__, lifespan=lifespan)
    app.state.settings = active_settings
    app.state.app_server_pool = None
    app.state.started_at = time.time()
    app.state.metrics = {
        "requests_total": 0,
        "requests_ok": 0,
        "requests_error": 0,
        "errors_by_status": {},
    }
    # Bounds the number of concurrently running "exec" engine invocations.
    semaphore = asyncio.Semaphore(active_settings.max_concurrency)

    def record_request_start() -> None:
        app.state.metrics["requests_total"] += 1

    def record_request_ok() -> None:
        app.state.metrics["requests_ok"] += 1

    def record_request_error(status_code: int) -> None:
        app.state.metrics["requests_error"] += 1
        errors_by_status = app.state.metrics["errors_by_status"]
        key = str(status_code)
        errors_by_status[key] = errors_by_status.get(key, 0) + 1

    async def default_app_server_runner(**kwargs) -> AsyncIterator[str]:
        # The pool is created on the first request that needs it, then reused.
        pool = app.state.app_server_pool
        if pool is None:
            pool = create_stdio_app_server_pool(
                codex_bin=active_settings.codex_bin,
                proxy=active_settings.proxy,
                codex_home=active_settings.app_server_codex_home,
                codex_configs=active_settings.codex_configs or [],
                workers=active_settings.workers,
                max_queue_size=active_settings.max_queue_size,
                queue_timeout_seconds=active_settings.queue_timeout_seconds,
                timeout_seconds=active_settings.request_timeout_seconds,
            )
            app.state.app_server_pool = pool
        async for chunk in pool.stream_completion(**kwargs):
            yield chunk

    active_app_server_runner = app_server_runner or default_app_server_runner

    async def require_auth(authorization: str | None = Header(default=None)) -> None:
        # Authentication is disabled when no API key is configured.
        if not active_settings.api_key:
            return
        expected = f"Bearer {active_settings.api_key}"
        supplied = authorization or ""
        # Compare bytes in constant time so response timing does not leak how
        # much of the token matched. Encoding first also keeps non-ASCII
        # header values from raising inside compare_digest.
        if not hmac.compare_digest(supplied.encode("utf-8"), expected.encode("utf-8")):
            raise HTTPException(status_code=401, detail="Missing or invalid bearer token")

    @app.get("/health")
    async def health() -> dict[str, object]:
        return {
            "status": "ok",
            "codex_available": shutil.which(active_settings.codex_bin) is not None,
            "codex_bin": active_settings.codex_bin,
        }

    @app.get("/ready")
    async def ready() -> dict[str, object]:
        codex_available = shutil.which(active_settings.codex_bin) is not None
        return {
            "status": "ready" if codex_available else "not_ready",
            "engine": active_settings.engine,
            "codex_available": codex_available,
            "codex_bin": active_settings.codex_bin,
            "app_server_pool_started": app.state.app_server_pool is not None,
        }

    @app.get("/metrics")
    async def metrics() -> dict[str, object]:
        return {
            **app.state.metrics,
            "engine": active_settings.engine,
            "uptime_seconds": round(time.time() - app.state.started_at, 3),
            "app_server_pool_started": app.state.app_server_pool is not None,
        }

    @app.get("/v1/models")
    async def models(_: None = Depends(require_auth)) -> dict[str, object]:
        return {
            "object": "list",
            "data": [
                {
                    "id": "codex-local",
                    "object": "model",
                    "created": 0,
                    "owned_by": "local",
                }
            ],
        }

    @app.post("/v1/chat/completions", response_model=None)
    async def chat_completions(
        request: ChatCompletionRequest,
        _: None = Depends(require_auth),
    ) -> ChatCompletionResponse | StreamingResponse:
        record_request_start()
        request_id = uuid.uuid4().hex
        request_started_at = time.perf_counter()
        phases_ms: dict[str, float] = {}

        def fail(
            exc: Exception,
            status_code: int,
            *,
            engine: str | None = None,
            detail: str | None = None,
        ) -> HTTPException:
            # Log the error latency record, bump the error metrics and build
            # the HTTPException for the caller to raise. "engine" is only
            # written for engine-specific failures, as before.
            record: dict[str, object] = {
                "event": "chat_completion_latency",
                "request_id": request_id,
                "status": "error",
                "error_type": type(exc).__name__,
                "http_status": status_code,
                "stream": request.stream,
                "model": request.model,
            }
            if engine is not None:
                record["engine"] = engine
            record["phases_ms"] = {**phases_ms, "total": _elapsed_ms(request_started_at)}
            _log_latency(record)
            record_request_error(status_code)
            return HTTPException(
                status_code=status_code,
                detail=str(exc) if detail is None else detail,
            )

        def succeed(engine: str, *, stream: bool) -> None:
            # Log the success latency record and bump the success metric.
            _log_latency(
                {
                    "event": "chat_completion_latency",
                    "request_id": request_id,
                    "status": "ok",
                    "stream": stream,
                    "model": request.model,
                    "engine": engine,
                    "phases_ms": {**phases_ms, "total": _elapsed_ms(request_started_at)},
                }
            )
            record_request_ok()

        metadata = request.metadata or {}
        cwd_started_at = time.perf_counter()
        try:
            cwd = active_settings.resolve_cwd(metadata.get("cwd"))
        except SettingsError as exc:
            phases_ms["cwd_resolve"] = _elapsed_ms(cwd_started_at)
            raise fail(exc, 400) from exc
        phases_ms["cwd_resolve"] = _elapsed_ms(cwd_started_at)

        messages = [message.model_dump() for message in request.messages]
        prompt_started_at = time.perf_counter()
        prompt = messages_to_prompt(messages)
        phases_ms["prompt_build"] = _elapsed_ms(prompt_started_at)
        queue_started_at = time.perf_counter()

        def record_codex_phase(name: str, elapsed_ms: float) -> None:
            # Handed to the runners so they can report their own phase timings.
            phases_ms[name] = round(elapsed_ms, 2)

        if active_settings.engine == "app-server":
            phases_ms["queue_wait"] = _elapsed_ms(queue_started_at)
            response_started_at = time.perf_counter()
            completion_id = f"chatcmpl-{request_id}"
            created = int(time.time())
            app_server_kwargs = {
                "cwd": cwd,
                "prompt": prompt,
                # "codex-local" is an alias for the configured model.
                "model": active_settings.model if request.model == "codex-local" else request.model,
                "codex_configs": active_settings.codex_configs or [],
                "ephemeral": active_settings.ephemeral,
                "timeout_seconds": active_settings.request_timeout_seconds,
                "latency_callback": record_codex_phase,
            }
            if request.stream:
                phases_ms["response_build"] = _elapsed_ms(response_started_at)
                # The runner is consumed while the response body is being
                # sent, so failures surface inside the stream and are all
                # counted as 502 by the on_error callback.
                return StreamingResponse(
                    _stream_chat_completion_from_deltas(
                        completion_id=completion_id,
                        created=created,
                        model=request.model,
                        deltas=active_app_server_runner(**app_server_kwargs),
                        request_id=request_id,
                        request_started_at=request_started_at,
                        phases_ms=phases_ms,
                        on_complete=record_request_ok,
                        on_error=lambda: record_request_error(502),
                    ),
                    media_type="text/event-stream",
                )

            app_server_started_at = time.perf_counter()
            try:
                chunks = [chunk async for chunk in active_app_server_runner(**app_server_kwargs)]
            except AppServerBusy as exc:
                phases_ms["app_server_exec"] = _elapsed_ms(app_server_started_at)
                raise fail(exc, 429, engine="app-server") from exc
            except TimeoutError as exc:
                phases_ms["app_server_exec"] = _elapsed_ms(app_server_started_at)
                raise fail(exc, 504, engine="app-server") from exc
            except AppServerRunError as exc:
                phases_ms["app_server_exec"] = _elapsed_ms(app_server_started_at)
                raise fail(exc, 502, engine="app-server") from exc
            content = "".join(chunks)
            phases_ms["app_server_exec"] = _elapsed_ms(app_server_started_at)
            response = ChatCompletionResponse(
                id=completion_id,
                created=created,
                model=request.model,
                choices=[ChatCompletionChoice(message=ChoiceMessage(content=content))],
            )
            phases_ms["response_build"] = _elapsed_ms(response_started_at)
            succeed("app-server", stream=False)
            return response

        if active_settings.engine != "exec":
            record_request_error(400)
            raise HTTPException(status_code=400, detail=f"unsupported engine: {active_settings.engine}")

        try:
            async with semaphore:
                phases_ms["queue_wait"] = _elapsed_ms(queue_started_at)
                codex_started_at = time.perf_counter()
                content = await runner(
                    codex_bin=active_settings.codex_bin,
                    cwd=cwd,
                    prompt=prompt,
                    timeout_seconds=active_settings.request_timeout_seconds,
                    proxy=active_settings.proxy,
                    model=active_settings.model,
                    codex_configs=active_settings.codex_configs or [],
                    ephemeral=active_settings.ephemeral,
                    latency_callback=record_codex_phase,
                )
                phases_ms["codex_exec"] = _elapsed_ms(codex_started_at)
        except SettingsError as exc:
            # setdefault keeps the real queue wait if the semaphore was acquired.
            phases_ms.setdefault("queue_wait", _elapsed_ms(queue_started_at))
            raise fail(exc, 400) from exc
        except TimeoutError as exc:
            phases_ms.setdefault("queue_wait", _elapsed_ms(queue_started_at))
            raise fail(exc, 504) from exc
        except FileNotFoundError as exc:
            phases_ms.setdefault("queue_wait", _elapsed_ms(queue_started_at))
            raise fail(
                exc,
                503,
                detail=f"codex executable not found: {active_settings.codex_bin}",
            ) from exc
        except CodexRunError as exc:
            phases_ms.setdefault("queue_wait", _elapsed_ms(queue_started_at))
            raise fail(exc, 502) from exc

        response_started_at = time.perf_counter()
        completion_id = f"chatcmpl-{request_id}"
        created = int(time.time())
        if request.stream:
            # The exec engine has the full text already; it is replayed as a
            # short SSE sequence, so success is recorded before streaming.
            phases_ms["response_build"] = _elapsed_ms(response_started_at)
            succeed("exec", stream=True)
            return StreamingResponse(
                _stream_chat_completion(
                    completion_id=completion_id,
                    created=created,
                    model=request.model,
                    content=content,
                    request_id=request_id,
                    request_started_at=request_started_at,
                ),
                media_type="text/event-stream",
            )

        response = ChatCompletionResponse(
            id=completion_id,
            created=created,
            model=request.model,
            choices=[ChatCompletionChoice(message=ChoiceMessage(content=content))],
        )
        phases_ms["response_build"] = _elapsed_ms(response_started_at)
        succeed("exec", stream=False)
        return response

    return app
|
|
559
|
+
|
|
560
|
+
|
|
561
|
+
# Module-level application instance, built at import time. With no arguments
# create_app() loads its configuration through Settings.from_env().
# Presumably the ASGI import target (e.g. "codex_api_proxy.main:app") - confirm against the CLI.
app = create_app()
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def _content_to_text(content: Any) -> str:
|
|
7
|
+
if content is None:
|
|
8
|
+
return ""
|
|
9
|
+
if isinstance(content, str):
|
|
10
|
+
return content
|
|
11
|
+
if isinstance(content, list):
|
|
12
|
+
parts: list[str] = []
|
|
13
|
+
for item in content:
|
|
14
|
+
if isinstance(item, dict) and item.get("type") in {"text", "input_text"}:
|
|
15
|
+
parts.append(str(item.get("text", "")))
|
|
16
|
+
elif isinstance(item, dict):
|
|
17
|
+
item_type = item.get("type", "unknown")
|
|
18
|
+
parts.append(f"[{item_type} omitted by codex-api-proxy v1]")
|
|
19
|
+
else:
|
|
20
|
+
parts.append(str(item))
|
|
21
|
+
return "\n".join(part for part in parts if part)
|
|
22
|
+
return str(content)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def messages_to_prompt(messages: list[dict[str, Any]]) -> str:
    """Render chat messages as one plain-text prompt.

    Each message becomes a block made of a ``[role]`` header line followed by
    the flattened message content. A missing or blank role falls back to
    ``user``. Blocks are separated by a blank line and the prompt ends with a
    single newline.
    """

    def render(message: dict[str, Any]) -> str:
        role = str(message.get("role", "user")).strip() or "user"
        content = _content_to_text(message.get("content"))
        return f"[{role}]\n{content}"

    return "\n\n".join(map(render, messages)) + "\n"
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any, Literal
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel, Field
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# One message of an incoming chat completion request.
class ChatMessage(BaseModel):
    # Speaker label; any string is accepted.
    role: str
    # Deliberately untyped (Any), so any JSON value passes validation here.
    content: Any
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# Request body of the chat completions endpoint.
class ChatCompletionRequest(BaseModel):
    # Requested model name; defaults to "codex-local".
    model: str = "codex-local"
    # Conversation to run; validation requires at least one message.
    messages: list[ChatMessage] = Field(min_length=1)
    # When true the caller wants a streamed response instead of a single JSON body.
    stream: bool = False
    # Optional sampling/length parameters. They are validated here when given;
    # NOTE(review): how (or whether) they are forwarded is not visible in this module.
    temperature: float | None = None
    top_p: float | None = None
    max_tokens: int | None = None
    presence_penalty: float | None = None
    frequency_penalty: float | None = None
    # Optional free-form dictionary of extra request options.
    metadata: dict[str, Any] | None = None
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# Assistant message carried inside a completion choice.
class ChoiceMessage(BaseModel):
    # Fixed to "assistant"; the Literal type rejects any other value.
    role: Literal["assistant"] = "assistant"
    # Completion text.
    content: str
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# A single choice entry of a chat completion response.
class ChatCompletionChoice(BaseModel):
    # Position of the choice in the response; defaults to 0.
    index: int = 0
    # The assistant message for this choice.
    message: ChoiceMessage
    # Why generation ended; defaults to "stop".
    finish_reason: str = "stop"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# Non-streaming chat completion response body.
class ChatCompletionResponse(BaseModel):
    # Completion identifier supplied by the caller that builds the response.
    id: str
    # Fixed object tag "chat.completion".
    object: Literal["chat.completion"] = "chat.completion"
    # Creation timestamp as an integer (presumably Unix seconds - confirm against the caller).
    created: int
    # Model name reported back to the client.
    model: str
    # Completion choices.
    choices: list[ChatCompletionChoice]
    # Token usage counters; a fresh all-zero dict is created per instance when
    # no usage is supplied.
    usage: dict[str, int] = Field(
        default_factory=lambda: {
            "prompt_tokens": 0,
            "completion_tokens": 0,
            "total_tokens": 0,
        }
    )
|