codex-api-proxy 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,561 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import json
5
+ import logging
6
+ import shutil
7
+ import time
8
+ import uuid
9
+ from collections.abc import AsyncIterator, Awaitable, Callable
10
+ from contextlib import asynccontextmanager
11
+
12
+ from fastapi import Depends, FastAPI, Header, HTTPException
13
+ from fastapi.responses import StreamingResponse
14
+
15
+ from . import __version__
16
+ from .app_server_runner import AppServerBusy, AppServerRunError, create_stdio_app_server_pool
17
+ from .codex_runner import CodexRunError, run_codex_exec
18
+ from .config import Settings, SettingsError
19
+ from .prompt import messages_to_prompt
20
+ from .schemas import ChatCompletionChoice, ChatCompletionRequest, ChatCompletionResponse, ChoiceMessage
21
+
22
# Shape of the "exec" engine runner: called with keyword arguments and awaited
# for the complete response text.
Runner = Callable[..., Awaitable[str]]
# Shape of the "app-server" engine runner: called with keyword arguments and
# iterated asynchronously for text chunks.
AppServerRunner = Callable[..., AsyncIterator[str]]
# Dedicated logger that receives the JSON latency records from _log_latency.
latency_logger = logging.getLogger("codex_api_proxy.latency")
# uvicorn's error logger; _log_latency mirrors every latency record to it.
uvicorn_logger = logging.getLogger("uvicorn.error")
26
+
27
+
28
+ def _elapsed_ms(started_at: float) -> float:
29
+ return round((time.perf_counter() - started_at) * 1000, 2)
30
+
31
+
32
def _log_latency(payload: dict[str, object]) -> None:
    """Emit ``payload`` as one compact JSON line on both latency log channels."""
    encoded = json.dumps(payload, separators=(",", ":"), ensure_ascii=False)
    # The dedicated latency logger gets the bare JSON ...
    latency_logger.info(encoded)
    # ... and uvicorn's error logger gets the same JSON behind a fixed prefix.
    uvicorn_logger.info("codex_api_proxy.latency %s", encoded)
36
+
37
+
38
+ def _sse_event(data: dict[str, object] | str) -> str:
39
+ if isinstance(data, str):
40
+ return f"data: {data}\n\n"
41
+ return f"data: {json.dumps(data, ensure_ascii=False, separators=(',', ':'))}\n\n"
42
+
43
+
44
def _stream_chat_completion(
    *,
    completion_id: str,
    created: int,
    model: str,
    content: str,
    request_id: str,
    request_started_at: float,
):
    """Yield an already-computed answer as a ``chat.completion.chunk`` SSE stream.

    The stream is always four frames: the assistant role, the whole ``content``
    in a single delta, a ``finish_reason: "stop"`` frame and the ``[DONE]``
    sentinel. A ``chat_completion_first_sse`` latency record is logged when the
    generator is first advanced, just before the first frame is produced.
    """
    _log_latency(
        {
            "event": "chat_completion_first_sse",
            "request_id": request_id,
            "time_to_first_sse_ms": _elapsed_ms(request_started_at),
        }
    )
    envelope = {
        "id": completion_id,
        "object": "chat.completion.chunk",
        "created": created,
        "model": model,
    }
    # (delta, finish_reason) for each JSON frame, in emission order.
    frames = (
        ({"role": "assistant"}, None),
        ({"content": content}, None),
        ({}, "stop"),
    )
    for delta, finish_reason in frames:
        chunk = dict(envelope)
        chunk["choices"] = [{"index": 0, "delta": delta, "finish_reason": finish_reason}]
        yield _sse_event(chunk)
    yield _sse_event("[DONE]")
85
+
86
+
87
async def _stream_chat_completion_from_deltas(
    *,
    completion_id: str,
    created: int,
    model: str,
    deltas: AsyncIterator[str],
    request_id: str,
    request_started_at: float,
    phases_ms: dict[str, float],
    on_complete: Callable[[], None] | None = None,
    on_error: Callable[[], None] | None = None,
):
    """Relay text deltas as an OpenAI-style ``chat.completion.chunk`` SSE stream.

    Frames are emitted in this order: the assistant role, one content frame
    per item of ``deltas``, a ``finish_reason: "stop"`` frame and the
    ``[DONE]`` sentinel.

    Args:
        completion_id: Value of the ``id`` field of every chunk.
        created: Value of the ``created`` field of every chunk.
        model: Value of the ``model`` field of every chunk and of the final
            latency record.
        deltas: Async iterator producing the content pieces.
        request_id: Identifier attached to every latency record.
        request_started_at: ``time.perf_counter()`` reading taken when the
            request started; latency figures are measured from it.
        phases_ms: Mutable phase-timing map. ``app_server_exec`` is written
            into it once iteration over ``deltas`` ends, however it ends.
        on_complete: Called once after ``[DONE]`` has been yielded.
        on_error: Called when an ``Exception`` escapes the stream.

    Raises:
        Exception: Whatever ``deltas`` raises is re-raised after ``on_error``
            has run and an error latency record has been logged.
            ``GeneratorExit`` is not an ``Exception`` subclass, so closing the
            generator early bypasses both callbacks.
    """

    def chunk(delta: dict[str, object], finish_reason: str | None) -> str:
        # Every JSON frame shares the same envelope; only the choice differs.
        return _sse_event(
            {
                "id": completion_id,
                "object": "chat.completion.chunk",
                "created": created,
                "model": model,
                "choices": [{"index": 0, "delta": delta, "finish_reason": finish_reason}],
            }
        )

    def final_record(status: str, **extra: object) -> dict[str, object]:
        # Terminal latency record; ``extra`` lands between status and stream.
        return {
            "event": "chat_completion_latency",
            "request_id": request_id,
            "status": status,
            **extra,
            "stream": True,
            "model": model,
            "engine": "app-server",
            "phases_ms": {**phases_ms, "total": _elapsed_ms(request_started_at)},
        }

    _log_latency(
        {
            "event": "chat_completion_first_sse",
            "request_id": request_id,
            "time_to_first_sse_ms": _elapsed_ms(request_started_at),
        }
    )
    yield chunk({"role": "assistant"}, None)

    first_content = True
    started_at = time.perf_counter()
    try:
        try:
            async for delta in deltas:
                if first_content:
                    first_content = False
                    _log_latency(
                        {
                            "event": "chat_completion_first_content_sse",
                            "request_id": request_id,
                            "time_to_first_content_sse_ms": _elapsed_ms(request_started_at),
                        }
                    )
                yield chunk({"content": delta}, None)
        finally:
            # Recorded on success, failure and early close alike.
            phases_ms["app_server_exec"] = _elapsed_ms(started_at)
        yield chunk({}, "stop")
        yield _sse_event("[DONE]")
    except Exception as exc:
        if on_error:
            on_error()
        # The response has already started, so no HTTP status can be reported
        # here; still emit a terminal record so failed streams are visible in
        # the latency log like every non-streaming error is.
        _log_latency(final_record("error", error_type=type(exc).__name__))
        raise
    if on_complete:
        on_complete()
    _log_latency(final_record("ok"))
164
+
165
+
166
def create_app(
    settings: Settings | None = None,
    runner: Runner = run_codex_exec,
    app_server_runner: AppServerRunner | None = None,
) -> FastAPI:
    """Build the FastAPI application exposing the OpenAI-compatible endpoints.

    Args:
        settings: Configuration to use. When falsy, ``Settings.from_env()`` is
            called instead.
        runner: Coroutine function used by the ``exec`` engine; it is awaited
            for the complete response text.
        app_server_runner: Async-iterator factory used by the ``app-server``
            engine. When omitted, a pool is created lazily on first use with
            ``create_stdio_app_server_pool`` and closed on application shutdown.

    Returns:
        The application with ``/health``, ``/ready``, ``/metrics``,
        ``/v1/models`` and ``/v1/chat/completions`` registered. Settings,
        counters and the pool handle are stored on ``app.state``.
    """
    # Function-scope import: only the bearer-token check below needs it.
    import hmac

    active_settings = settings or Settings.from_env()

    @asynccontextmanager
    async def lifespan(app: FastAPI):
        # Nothing to do at startup; at shutdown close the pool if one was made.
        try:
            yield
        finally:
            pool = app.state.app_server_pool
            if pool is not None:
                await pool.close()

    app = FastAPI(title="Codex OpenAI-Compatible Proxy", version=__version__, lifespan=lifespan)
    app.state.settings = active_settings
    app.state.app_server_pool = None
    app.state.started_at = time.time()
    app.state.metrics = {
        "requests_total": 0,
        "requests_ok": 0,
        "requests_error": 0,
        "errors_by_status": {},
    }
    # Bounds concurrent runs of the ``exec`` engine only.
    semaphore = asyncio.Semaphore(active_settings.max_concurrency)

    def record_request_start() -> None:
        app.state.metrics["requests_total"] += 1

    def record_request_ok() -> None:
        app.state.metrics["requests_ok"] += 1

    def record_request_error(status_code: int) -> None:
        app.state.metrics["requests_error"] += 1
        errors_by_status = app.state.metrics["errors_by_status"]
        key = str(status_code)
        errors_by_status[key] = errors_by_status.get(key, 0) + 1

    async def default_app_server_runner(**kwargs) -> AsyncIterator[str]:
        # Create the pool on first use and remember it on app.state.
        pool = app.state.app_server_pool
        if pool is None:
            pool = create_stdio_app_server_pool(
                codex_bin=active_settings.codex_bin,
                proxy=active_settings.proxy,
                codex_home=active_settings.app_server_codex_home,
                codex_configs=active_settings.codex_configs or [],
                workers=active_settings.workers,
                max_queue_size=active_settings.max_queue_size,
                queue_timeout_seconds=active_settings.queue_timeout_seconds,
                timeout_seconds=active_settings.request_timeout_seconds,
            )
            app.state.app_server_pool = pool
        async for chunk in pool.stream_completion(**kwargs):
            yield chunk

    active_app_server_runner = app_server_runner or default_app_server_runner

    async def require_auth(authorization: str | None = Header(default=None)) -> None:
        # Authentication is disabled when no API key is configured.
        if not active_settings.api_key:
            return
        expected = f"Bearer {active_settings.api_key}"
        supplied = authorization or ""
        # Constant-time comparison so response timing does not reveal how much
        # of the token matched. Bytes are compared because compare_digest
        # rejects non-ASCII str arguments.
        if not hmac.compare_digest(supplied.encode("utf-8"), expected.encode("utf-8")):
            raise HTTPException(status_code=401, detail="Missing or invalid bearer token")

    @app.get("/health")
    async def health() -> dict[str, object]:
        return {
            "status": "ok",
            "codex_available": shutil.which(active_settings.codex_bin) is not None,
            "codex_bin": active_settings.codex_bin,
        }

    @app.get("/ready")
    async def ready() -> dict[str, object]:
        codex_available = shutil.which(active_settings.codex_bin) is not None
        return {
            "status": "ready" if codex_available else "not_ready",
            "engine": active_settings.engine,
            "codex_available": codex_available,
            "codex_bin": active_settings.codex_bin,
            "app_server_pool_started": app.state.app_server_pool is not None,
        }

    @app.get("/metrics")
    async def metrics() -> dict[str, object]:
        return {
            **app.state.metrics,
            "engine": active_settings.engine,
            "uptime_seconds": round(time.time() - app.state.started_at, 3),
            "app_server_pool_started": app.state.app_server_pool is not None,
        }

    @app.get("/v1/models")
    async def models(_: None = Depends(require_auth)) -> dict[str, object]:
        return {
            "object": "list",
            "data": [
                {
                    "id": "codex-local",
                    "object": "model",
                    "created": 0,
                    "owned_by": "local",
                }
            ],
        }

    @app.post("/v1/chat/completions", response_model=None)
    async def chat_completions(
        request: ChatCompletionRequest,
        _: None = Depends(require_auth),
    ) -> ChatCompletionResponse | StreamingResponse:
        record_request_start()
        request_id = uuid.uuid4().hex
        request_started_at = time.perf_counter()
        phases_ms: dict[str, float] = {}

        def phases_snapshot() -> dict[str, float]:
            # Copy of the phase timings plus the running total.
            return {**phases_ms, "total": _elapsed_ms(request_started_at)}

        def fail(
            exc: Exception,
            http_status: int,
            *,
            engine: str | None = None,
            detail: str | None = None,
        ) -> HTTPException:
            # Log the error latency record, count the failure, and build the
            # HTTPException for the caller to raise (``raise fail(...) from exc``).
            record: dict[str, object] = {
                "event": "chat_completion_latency",
                "request_id": request_id,
                "status": "error",
                "error_type": type(exc).__name__,
                "http_status": http_status,
                "stream": request.stream,
                "model": request.model,
            }
            if engine is not None:
                record["engine"] = engine
            record["phases_ms"] = phases_snapshot()
            _log_latency(record)
            record_request_error(http_status)
            return HTTPException(
                status_code=http_status,
                detail=str(exc) if detail is None else detail,
            )

        def succeed(*, stream: bool, engine: str) -> None:
            # Log the success latency record, then count the request as ok.
            _log_latency(
                {
                    "event": "chat_completion_latency",
                    "request_id": request_id,
                    "status": "ok",
                    "stream": stream,
                    "model": request.model,
                    "engine": engine,
                    "phases_ms": phases_snapshot(),
                }
            )
            record_request_ok()

        metadata = request.metadata or {}
        cwd_started_at = time.perf_counter()
        try:
            cwd = active_settings.resolve_cwd(metadata.get("cwd"))
        except SettingsError as exc:
            phases_ms["cwd_resolve"] = _elapsed_ms(cwd_started_at)
            raise fail(exc, 400) from exc
        phases_ms["cwd_resolve"] = _elapsed_ms(cwd_started_at)

        messages = [message.model_dump() for message in request.messages]
        prompt_started_at = time.perf_counter()
        prompt = messages_to_prompt(messages)
        phases_ms["prompt_build"] = _elapsed_ms(prompt_started_at)
        queue_started_at = time.perf_counter()

        def record_codex_phase(name: str, elapsed_ms: float) -> None:
            # Handed to the runners as ``latency_callback``.
            phases_ms[name] = round(elapsed_ms, 2)

        if active_settings.engine == "app-server":
            phases_ms["queue_wait"] = _elapsed_ms(queue_started_at)
            response_started_at = time.perf_counter()
            completion_id = f"chatcmpl-{request_id}"
            created = int(time.time())
            app_server_kwargs = {
                "cwd": cwd,
                "prompt": prompt,
                # "codex-local" is the placeholder id listed by /v1/models; it
                # maps to the configured model, anything else passes through.
                "model": active_settings.model if request.model == "codex-local" else request.model,
                "codex_configs": active_settings.codex_configs or [],
                "ephemeral": active_settings.ephemeral,
                "timeout_seconds": active_settings.request_timeout_seconds,
                "latency_callback": record_codex_phase,
            }
            if request.stream:
                phases_ms["response_build"] = _elapsed_ms(response_started_at)
                # The generator does its own ok/error bookkeeping through the
                # callbacks, since the work happens after this handler returns.
                return StreamingResponse(
                    _stream_chat_completion_from_deltas(
                        completion_id=completion_id,
                        created=created,
                        model=request.model,
                        deltas=active_app_server_runner(**app_server_kwargs),
                        request_id=request_id,
                        request_started_at=request_started_at,
                        phases_ms=phases_ms,
                        on_complete=record_request_ok,
                        on_error=lambda: record_request_error(502),
                    ),
                    media_type="text/event-stream",
                )

            app_server_started_at = time.perf_counter()
            try:
                chunks = [chunk async for chunk in active_app_server_runner(**app_server_kwargs)]
            except (AppServerBusy, TimeoutError, AppServerRunError) as exc:
                phases_ms["app_server_exec"] = _elapsed_ms(app_server_started_at)
                # Checked in the same priority order as separate except clauses.
                if isinstance(exc, AppServerBusy):
                    http_status = 429
                elif isinstance(exc, TimeoutError):
                    http_status = 504
                else:
                    http_status = 502
                raise fail(exc, http_status, engine="app-server") from exc
            content = "".join(chunks)
            phases_ms["app_server_exec"] = _elapsed_ms(app_server_started_at)
            response = ChatCompletionResponse(
                id=completion_id,
                created=created,
                model=request.model,
                choices=[ChatCompletionChoice(message=ChoiceMessage(content=content))],
            )
            phases_ms["response_build"] = _elapsed_ms(response_started_at)
            succeed(stream=False, engine="app-server")
            return response

        if active_settings.engine != "exec":
            record_request_error(400)
            raise HTTPException(status_code=400, detail=f"unsupported engine: {active_settings.engine}")

        try:
            async with semaphore:
                phases_ms["queue_wait"] = _elapsed_ms(queue_started_at)
                codex_started_at = time.perf_counter()
                content = await runner(
                    codex_bin=active_settings.codex_bin,
                    cwd=cwd,
                    prompt=prompt,
                    timeout_seconds=active_settings.request_timeout_seconds,
                    proxy=active_settings.proxy,
                    model=active_settings.model,
                    codex_configs=active_settings.codex_configs or [],
                    ephemeral=active_settings.ephemeral,
                    latency_callback=record_codex_phase,
                )
                phases_ms["codex_exec"] = _elapsed_ms(codex_started_at)
        except SettingsError as exc:
            phases_ms.setdefault("queue_wait", _elapsed_ms(queue_started_at))
            raise fail(exc, 400) from exc
        except TimeoutError as exc:
            phases_ms.setdefault("queue_wait", _elapsed_ms(queue_started_at))
            raise fail(exc, 504) from exc
        except FileNotFoundError as exc:
            phases_ms.setdefault("queue_wait", _elapsed_ms(queue_started_at))
            detail = f"codex executable not found: {active_settings.codex_bin}"
            raise fail(exc, 503, detail=detail) from exc
        except CodexRunError as exc:
            phases_ms.setdefault("queue_wait", _elapsed_ms(queue_started_at))
            raise fail(exc, 502) from exc

        response_started_at = time.perf_counter()
        completion_id = f"chatcmpl-{request_id}"
        created = int(time.time())
        if request.stream:
            # The full text is already available, so success is recorded before
            # the (purely formatting) stream is handed back.
            phases_ms["response_build"] = _elapsed_ms(response_started_at)
            succeed(stream=True, engine="exec")
            return StreamingResponse(
                _stream_chat_completion(
                    completion_id=completion_id,
                    created=created,
                    model=request.model,
                    content=content,
                    request_id=request_id,
                    request_started_at=request_started_at,
                ),
                media_type="text/event-stream",
            )

        response = ChatCompletionResponse(
            id=completion_id,
            created=created,
            model=request.model,
            choices=[ChatCompletionChoice(message=ChoiceMessage(content=content))],
        )
        phases_ms["response_build"] = _elapsed_ms(response_started_at)
        succeed(stream=False, engine="exec")
        return response

    return app
559
+
560
+
561
# Module-level application instance. create_app() is called with no arguments,
# so settings are loaded through Settings.from_env() when this module is imported.
app = create_app()
@@ -0,0 +1,31 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+
6
+ def _content_to_text(content: Any) -> str:
7
+ if content is None:
8
+ return ""
9
+ if isinstance(content, str):
10
+ return content
11
+ if isinstance(content, list):
12
+ parts: list[str] = []
13
+ for item in content:
14
+ if isinstance(item, dict) and item.get("type") in {"text", "input_text"}:
15
+ parts.append(str(item.get("text", "")))
16
+ elif isinstance(item, dict):
17
+ item_type = item.get("type", "unknown")
18
+ parts.append(f"[{item_type} omitted by codex-api-proxy v1]")
19
+ else:
20
+ parts.append(str(item))
21
+ return "\n".join(part for part in parts if part)
22
+ return str(content)
23
+
24
+
25
def messages_to_prompt(messages: list[dict[str, Any]]) -> str:
    """Render chat messages as one prompt made of ``[role]``-headed blocks.

    Each message becomes ``[role]`` on one line followed by its flattened
    content. Blocks are separated by a blank line and the prompt always ends
    with a newline. A missing or blank role falls back to ``user``.
    """

    def render(entry: dict[str, Any]) -> str:
        speaker = str(entry.get("role", "user")).strip()
        if not speaker:
            speaker = "user"
        body = _content_to_text(entry.get("content"))
        return f"[{speaker}]\n{body}"

    return "\n\n".join(map(render, messages)) + "\n"
@@ -0,0 +1,48 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Literal
4
+
5
+ from pydantic import BaseModel, Field
6
+
7
+
8
# One entry of a chat-completion request's ``messages`` list.
class ChatMessage(BaseModel):
    # Speaker role; any string is accepted.
    role: str
    # Message body; typed Any, so no structure is enforced by this model.
    content: Any
11
+
12
+
13
# Request body model for a chat completion.
class ChatCompletionRequest(BaseModel):
    # Requested model id; defaults to the "codex-local" placeholder.
    model: str = "codex-local"
    # Conversation so far; at least one message is required.
    messages: list[ChatMessage] = Field(min_length=1)
    # Whether the caller wants a streamed response.
    stream: bool = False
    # Sampling/limit parameters, all optional.
    # NOTE(review): the request handler visible in this package does not read
    # these five fields — confirm they are intentionally accepted and ignored.
    temperature: float | None = None
    top_p: float | None = None
    max_tokens: int | None = None
    presence_penalty: float | None = None
    frequency_penalty: float | None = None
    # Free-form caller metadata; optional.
    metadata: dict[str, Any] | None = None
23
+
24
+
25
# The assistant message carried by a completion choice.
class ChoiceMessage(BaseModel):
    # Fixed to "assistant"; no other value validates.
    role: Literal["assistant"] = "assistant"
    # Generated text.
    content: str
28
+
29
+
30
# One choice of a chat-completion response.
class ChatCompletionChoice(BaseModel):
    # Position of the choice in the response; defaults to 0.
    index: int = 0
    # The generated assistant message.
    message: ChoiceMessage
    # Why generation ended; defaults to "stop".
    finish_reason: str = "stop"
34
+
35
+
36
# Response body model for a non-streaming chat completion.
class ChatCompletionResponse(BaseModel):
    # Completion identifier supplied by the caller of this model.
    id: str
    # Fixed object tag; only "chat.completion" validates.
    object: Literal["chat.completion"] = "chat.completion"
    # Creation timestamp as an integer.
    created: int
    # Model id reported back to the client.
    model: str
    # Generated choices.
    choices: list[ChatCompletionChoice]
    # Token accounting; defaults to all zeros. default_factory builds a fresh
    # dict for each instance.
    usage: dict[str, int] = Field(
        default_factory=lambda: {
            "prompt_tokens": 0,
            "completion_tokens": 0,
            "total_tokens": 0,
        }
    )