fusionkit-server 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: fusionkit-server
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: OpenAI-compatible HTTP server for fusionkit.
|
|
5
|
+
Requires-Dist: fastapi>=0.124.4
|
|
6
|
+
Requires-Dist: fusionkit-core==0.1.0
|
|
7
|
+
Requires-Dist: uvicorn>=0.38.0
|
|
8
|
+
Requires-Dist: uvicorn[standard]>=0.38.0 ; extra == 'server'
|
|
9
|
+
Requires-Python: >=3.11
|
|
10
|
+
Provides-Extra: server
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "fusionkit-server"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "OpenAI-compatible HTTP server for fusionkit."
|
|
5
|
+
requires-python = ">=3.11"
|
|
6
|
+
dependencies = [
|
|
7
|
+
"fastapi>=0.124.4",
|
|
8
|
+
"fusionkit-core==0.1.0",
|
|
9
|
+
"uvicorn>=0.38.0",
|
|
10
|
+
]
|
|
11
|
+
|
|
12
|
+
[project.optional-dependencies]
|
|
13
|
+
server = ["uvicorn[standard]>=0.38.0"]
|
|
14
|
+
|
|
15
|
+
[build-system]
|
|
16
|
+
requires = ["uv_build>=0.11.21,<0.12.0"]
|
|
17
|
+
build-backend = "uv_build"
|
|
@@ -0,0 +1,746 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import re
|
|
5
|
+
import time
|
|
6
|
+
import traceback
|
|
7
|
+
import uuid
|
|
8
|
+
from collections.abc import AsyncIterator
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
from fastapi import FastAPI, Header, Query
|
|
13
|
+
from fastapi.responses import JSONResponse, StreamingResponse
|
|
14
|
+
from fusionkit_core.artifacts import LocalArtifactStore
|
|
15
|
+
from fusionkit_core.clients import ChatClient, build_clients
|
|
16
|
+
from fusionkit_core.config import FusionConfig, FusionMode
|
|
17
|
+
from fusionkit_core.contracts import (
|
|
18
|
+
FusionRunRequestV1,
|
|
19
|
+
HarnessTrajectoryV1,
|
|
20
|
+
contract_metadata,
|
|
21
|
+
)
|
|
22
|
+
from fusionkit_core.fusion import FusionEngine
|
|
23
|
+
from fusionkit_core.run import (
|
|
24
|
+
CreateRunResult,
|
|
25
|
+
FusionRunManager,
|
|
26
|
+
NativeRunError,
|
|
27
|
+
RunInspection,
|
|
28
|
+
ToolExecutionMode,
|
|
29
|
+
ToolExecutionPolicy,
|
|
30
|
+
ToolResultSubmission,
|
|
31
|
+
hash_json,
|
|
32
|
+
make_id,
|
|
33
|
+
)
|
|
34
|
+
from fusionkit_core.run_store import FileSystemRunStore
|
|
35
|
+
from fusionkit_core.trace import TRACE_ID_HEADER, TRACE_SPAN_HEADER, new_span_id
|
|
36
|
+
from fusionkit_core.trace import emit as trace_emit
|
|
37
|
+
from fusionkit_core.types import ChatMessage, ModelResponse, ToolCall
|
|
38
|
+
from pydantic import BaseModel, Field
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class FusionToolExecutionOptions(BaseModel):
    """Tool-execution settings a caller may attach to a chat request."""

    # "disabled" unless the caller opts in; this module also branches on
    # the values "external" and "executor".
    mode: ToolExecutionMode = "disabled"
    # default_factory hands every instance its own list.
    allowed_side_effects: list[str] = Field(default_factory=lambda: ["read_only"])
    environment: str | None = None
    policy_id: str | None = None
    dedupe_read_only: bool = True
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class FusionOptions(BaseModel):
    """FusionKit-specific options nested under a chat request's ``fusion`` key."""

    # None means "not chosen by the caller".
    mode: FusionMode | None = None
    panel_models: list[str] | None = None
    # Optional, but must be >= 1 when supplied.
    sample_count: int | None = Field(default=None, ge=1)
    verify: bool = False
    # A fresh default options object is created per instance.
    tool_execution: FusionToolExecutionOptions = Field(default_factory=FusionToolExecutionOptions)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class FusionRequest(BaseModel):
    """Request body accepted by ``POST /v1/chat/completions``."""

    model: str = "fusionkit/router"
    messages: list[ChatMessage]
    # Sampling overrides; None leaves the configured default in place.
    temperature: float | None = None
    top_p: float | None = None
    max_tokens: int | None = None
    # When true the handler answers with a server-sent-event stream.
    stream: bool = False
    fusion: FusionOptions = Field(default_factory=FusionOptions)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class TrajectoryStepInput(BaseModel):
    """One step of a candidate trajectory as submitted by a client."""

    # Required fields.
    index: int
    type: str
    # Optional detail; every field below defaults to None.
    text: str | None = None
    tool_name: str | None = None
    tool_call_id: str | None = None
    tool_input: str | None = None
    is_error: bool | None = None
    output_hash: str | None = None
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
class TrajectoryVerificationInput(BaseModel):
    """Verification outcome optionally attached to a submitted trajectory."""

    status: str
    evidence: list[str] | None = None
    exit_code: int | None = None
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
class TrajectoryInput(BaseModel):
    """A candidate trajectory as received over HTTP.

    The handlers dump this model with ``exclude_none=True`` and validate the
    result as a ``HarnessTrajectoryV1``.
    """

    trajectory_id: str
    model_id: str
    status: str
    # Defaults to an empty list.
    steps: list[TrajectoryStepInput] = Field(default_factory=list)
    final_output: str
    # Optional fields; unset ones are dropped before contract validation.
    candidate_id: str | None = None
    model: str | None = None
    harness_kind: str | None = None
    diff: str | None = None
    verification: TrajectoryVerificationInput | None = None
    metadata: dict[str, Any] | None = None
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
class FuseTrajectoriesRequest(BaseModel):
    """Request body accepted by ``POST /v1/fusion/trajectories:fuse``."""

    messages: list[ChatMessage]
    trajectories: list[TrajectoryInput]
    # Per-request overrides; the handler falls back to the configured models.
    judge_model: str | None = None
    synthesizer_model: str | None = None
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
class StepTrajectoriesRequest(BaseModel):
    """Request body for one front-door turn.

    The judge produces the next step (a tool call or the final answer) from
    the candidate trajectories together with the live conversation.
    """

    model: str = "fusionkit/router"
    # Messages arrive in raw OpenAI chat shape: assistant tool_calls nested
    # under `function`, tool results carrying `tool_call_id`, and content that
    # may be a parts array. The handler normalizes them to ChatMessage.
    messages: list[dict[str, Any]]
    trajectories: list[TrajectoryInput] = Field(default_factory=list)
    tools: list[dict[str, Any]] | None = None
    tool_choice: str | dict[str, Any] | None = None
    judge_model: str | None = None
    stream: bool = False
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def create_app(
    config: FusionConfig,
    clients: dict[str, ChatClient] | None = None,
    run_manager: FusionRunManager | None = None,
    run_store_path: Path | None = None,
) -> FastAPI:
    """Build the FusionKit FastAPI application and register its routes.

    Args:
        config: Fusion configuration; supplies the endpoint list, the sampling
            defaults and the resolved judge/synthesizer model ids.
        clients: Chat clients keyed by model id. When omitted (or empty) they
            are built from ``config``.
        run_manager: Manager backing the ``/v1/fusion/runs`` routes. When
            omitted a filesystem-backed manager is created.
        run_store_path: Root directory handed to the default run manager.

    Returns:
        The configured ``FastAPI`` application.
    """
    app = FastAPI(title="fusionkit", version="0.1.0")
    model_clients = clients or build_clients(config)
    engine = FusionEngine(config=config, clients=model_clients)
    native_runs = run_manager or _create_run_manager(engine, run_store_path)

    def harness_trajectories(
        inputs: list[TrajectoryInput],
    ) -> list[HarnessTrajectoryV1] | JSONResponse:
        """Convert request trajectories into contract objects.

        Returns a 400 OpenAI-style error response instead of raising when a
        trajectory fails contract validation, so bad client input does not
        surface as an unhandled server error.
        """
        try:
            return [
                HarnessTrajectoryV1.model_validate(
                    {
                        **contract_metadata("harness-trajectory.v1"),
                        **trajectory.model_dump(exclude_none=True),
                    }
                )
                for trajectory in inputs
            ]
        except ValueError as exc:  # pydantic's ValidationError is a ValueError
            return _openai_error_response(
                "invalid_trajectory",
                f"Invalid trajectory: {exc}",
                status_code=400,
            )

    @app.get("/health")
    async def health() -> dict[str, str]:
        return {"status": "ok"}

    @app.get("/v1/models")
    async def models() -> dict[str, Any]:
        # The router pseudo-model is always listed first, then each endpoint.
        data = [{"id": "fusionkit/router", "object": "model"}]
        data.extend({"id": endpoint.id, "object": "model"} for endpoint in config.endpoints)
        return {"object": "list", "data": data}

    @app.post("/v1/fusion/runs")
    async def create_fusion_run(
        request: FusionRunRequestV1,
        idempotency_key: str | None = Header(default=None, alias="Idempotency-Key"),
    ) -> JSONResponse:
        result = await native_runs.create_and_run(request, idempotency_key=idempotency_key)
        if isinstance(result, CreateRunResult):
            if result.idempotency_outcome == "conflict":
                return _native_error_response(result.terminal_error, status_code=409)
            return _json_response(_create_run_payload(result))
        # Any other result is reported as a freshly created run, with the
        # full result dump attached under "inspection".
        return _json_response(
            {
                "run_id": result.run_id,
                "trace_id": result.trace_id,
                "state": result.state,
                "status": result.status,
                "event_cursor": result.event_cursor,
                "idempotency_outcome": "created",
                "terminal_error": _dump_optional(result.terminal_error),
                "inspection": result.model_dump(mode="json"),
            }
        )

    @app.get("/v1/fusion/runs/{run_id}")
    async def get_fusion_run(run_id: str) -> JSONResponse:
        try:
            return _json_response(native_runs.store.read_summary(run_id).model_dump(mode="json"))
        except FileNotFoundError:
            return _run_not_found_response()

    @app.get("/v1/fusion/runs/{run_id}/inspect")
    async def inspect_fusion_run(run_id: str) -> JSONResponse:
        try:
            return _json_response(native_runs.store.inspect_run(run_id).model_dump(mode="json"))
        except FileNotFoundError:
            return _run_not_found_response()

    @app.get("/v1/fusion/runs/{run_id}/events")
    async def fusion_run_events(
        run_id: str,
        after: int | None = Query(default=None, ge=0),
    ) -> JSONResponse:
        # Reading the summary first turns an unknown run id into a 404.
        try:
            native_runs.store.read_summary(run_id)
        except FileNotFoundError:
            return _run_not_found_response()
        return _json_response(native_runs.store.event_page(run_id, after).model_dump(mode="json"))

    @app.post("/v1/fusion/runs/{run_id}/tool-results")
    async def submit_tool_results(
        run_id: str,
        submission: ToolResultSubmission,
    ) -> JSONResponse:
        try:
            result = native_runs.submit_tool_result(run_id, submission)
        except FileNotFoundError:
            return _run_not_found_response()
        if isinstance(result, NativeRunError):
            return _native_error_response(result, status_code=409)
        return _json_response(result.model_dump(mode="json"))

    @app.post("/v1/chat/completions", response_model=None)
    async def chat_completions(
        request: FusionRequest,
    ) -> dict[str, Any] | JSONResponse | StreamingResponse:
        resolved = await _resolve_native_chat(native_runs, request, config)
        if isinstance(resolved, JSONResponse):
            return resolved
        final_output, metadata = resolved
        if request.stream:
            return StreamingResponse(
                _chat_completion_sse(request.model, final_output, metadata),
                media_type="text/event-stream",
            )
        return _openai_chat_response(request.model, final_output, metadata)

    @app.post("/v1/fusion/trajectories:fuse", response_model=None)
    async def fuse_trajectories(
        request: FuseTrajectoriesRequest,
        trace_id: str | None = Header(default=None, alias=TRACE_ID_HEADER),
        span_id: str | None = Header(default=None, alias=TRACE_SPAN_HEADER),
    ) -> dict[str, Any] | JSONResponse:
        if not request.trajectories:
            return _openai_error_response(
                "no_trajectories",
                "At least one trajectory is required.",
                status_code=400,
            )
        judge_model = request.judge_model or config.resolved_judge_model
        synthesizer_model = request.synthesizer_model or config.resolved_synthesizer_model
        try:
            judge_client = engine.clients[judge_model]
            synthesizer_client = engine.clients[synthesizer_model]
        except KeyError as exc:
            return _openai_error_response(
                "unknown_model",
                f"Unknown model endpoint {exc}.",
                status_code=400,
            )
        trajectories = harness_trajectories(request.trajectories)
        if isinstance(trajectories, JSONResponse):
            return trajectories
        synthesis_span = new_span_id()
        trace_emit(
            component="synthesis",
            event_type="log",
            trace_id=trace_id,
            span_id=synthesis_span,
            parent_span_id=span_id,
            payload={
                "message": "trajectories:fuse received",
                "judge_model": judge_model,
                "synthesizer_model": synthesizer_model,
                "input_trajectory_ids": [trajectory.trajectory_id for trajectory in trajectories],
                "input_model_ids": [trajectory.model_id for trajectory in trajectories],
            },
        )
        try:
            result = await engine.judge_synthesizer.synthesize_trajectories(
                request.messages,
                trajectories,
                judge_client=judge_client,
                synthesizer_client=synthesizer_client,
                # The judge is pinned to temperature 0.0; the synthesizer
                # keeps the configured sampling.
                judge_sampling=config.sampling.model_copy(update={"temperature": 0.0}),
                synthesis_sampling=config.sampling,
                trace_id=trace_id,
                span_id=synthesis_span,
            )
        except Exception as exc:  # noqa: BLE001 - surface as an OpenAI-style error body
            # Mirrors the trajectory:step handler: a failure in the model
            # call is reported as a structured 502.
            traceback.print_exc()
            return _openai_error_response(
                exc.__class__.__name__,
                f"trajectory fusion failed: {exc}",
                status_code=502,
            )
        return {
            "final_output": result.final_output,
            "synthesis_id": result.record.synthesis_id,
            "decision": result.record.decision,
            "rationale": result.record.rationale,
            "judge_synthesis_record": result.record.model_dump(mode="json"),
        }

    @app.post("/v1/fusion/trajectory:step", response_model=None)
    async def step_trajectory(
        request: StepTrajectoriesRequest,
        trace_id: str | None = Header(default=None, alias=TRACE_ID_HEADER),
        span_id: str | None = Header(default=None, alias=TRACE_SPAN_HEADER),
    ) -> dict[str, Any] | JSONResponse | StreamingResponse:
        judge_model = request.judge_model or config.resolved_judge_model
        try:
            judge_client = engine.clients[judge_model]
        except KeyError as exc:
            return _openai_error_response(
                "unknown_model",
                f"Unknown model endpoint {exc}.",
                status_code=400,
            )
        trajectories = harness_trajectories(request.trajectories)
        if isinstance(trajectories, JSONResponse):
            return trajectories
        try:
            response = await engine.judge_synthesizer.step(
                [_to_chat_message(message) for message in request.messages],
                trajectories,
                judge_client=judge_client,
                sampling=config.sampling,
                tools=_normalize_tools(request.tools),
                tool_choice=_normalize_tool_choice(request.tool_choice),
                trace_id=trace_id,
                span_id=span_id or new_span_id(),
            )
        except Exception as exc:  # noqa: BLE001 - surface as an OpenAI-style error body
            traceback.print_exc()
            return _openai_error_response(
                exc.__class__.__name__,
                f"trajectory step failed: {exc}",
                status_code=502,
            )
        if request.stream:
            return StreamingResponse(
                _step_completion_sse(request.model, response),
                media_type="text/event-stream",
            )
        return _openai_step_response(request.model, response)

    return app
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
async def _resolve_native_chat(
    native_runs: FusionRunManager,
    request: FusionRequest,
    config: FusionConfig,
) -> tuple[str, dict[str, Any]] | JSONResponse:
    """Execute a chat request as a native run.

    Returns ``(final_output, metadata)`` when the run completed with output,
    otherwise an OpenAI-style error ``JSONResponse``.
    """
    outcome = await native_runs.create_and_run(_fusion_request_to_run_request(request, config))
    if isinstance(outcome, CreateRunResult):
        if outcome.idempotency_outcome == "conflict":
            return _openai_native_error_response(outcome.terminal_error, status_code=409)
        run_id = outcome.run_id
        if run_id is None:
            return _openai_error_response(
                "run_not_available",
                "Native run did not return a run id.",
                status_code=500,
            )
        # A create result carries no output; load the full inspection.
        inspection = native_runs.store.inspect_run(run_id)
    else:
        inspection = outcome
    succeeded = inspection.state == "completed" and inspection.final_output is not None
    if not succeeded:
        return _openai_native_error_response(inspection.terminal_error, status_code=500)
    return inspection.final_output, _chat_fusion_metadata(inspection)
|
|
357
|
+
|
|
358
|
+
|
|
359
|
+
async def _chat_completion_sse(
    model: str,
    content: str,
    metadata: dict[str, Any],
) -> AsyncIterator[str]:
    """Yield ``content`` as OpenAI ``chat.completion.chunk`` SSE frames.

    Frame order: a role frame, one frame per content piece, a final frame
    with ``finish_reason="stop"`` (which also carries ``metadata`` under
    ``fusionkit``), then the ``[DONE]`` sentinel.
    """
    completion_id = f"chatcmpl-{uuid.uuid4()}"
    created = int(time.time())

    def frame(delta: dict[str, Any], finish_reason: str | None = None) -> str:
        body: dict[str, Any] = {
            "id": completion_id,
            "object": "chat.completion.chunk",
            "created": created,
            "model": model,
            "choices": [
                {"index": 0, "delta": delta, "finish_reason": finish_reason},
            ],
        }
        # Only the terminating frame carries the FusionKit metadata.
        if finish_reason is not None:
            body["fusionkit"] = metadata
        return f"data: {json.dumps(body)}\n\n"

    yield frame({"role": "assistant"})
    for piece in _stream_pieces(content):
        yield frame({"content": piece})
    yield frame({}, "stop")
    yield "data: [DONE]\n\n"
|
|
386
|
+
|
|
387
|
+
|
|
388
|
+
def _stream_pieces(content: str) -> list[str]:
    """Split ``content`` into pieces whose concatenation equals ``content``.

    Each piece is a run of non-whitespace plus the whitespace that follows
    it; whitespace at the very start forms a piece of its own. Empty input
    yields an empty list.
    """
    if not content:
        return []
    pieces: list[str] = []
    for match in re.finditer(r"\S+\s*|\s+", content):
        piece = match.group(0)
        if piece:
            pieces.append(piece)
    return pieces
|
|
394
|
+
|
|
395
|
+
|
|
396
|
+
def _create_run_manager(
    engine: FusionEngine,
    run_store_path: Path | None,
) -> FusionRunManager:
    """Create a run manager whose run and artifact stores share one root.

    The root is ``run_store_path`` when given, else ``.fusionkit/runs``.
    """
    root = run_store_path if run_store_path else Path(".fusionkit/runs")
    run_store = FileSystemRunStore(root)
    artifact_store = LocalArtifactStore(root)
    return FusionRunManager(engine, run_store, artifact_store)
|
|
406
|
+
|
|
407
|
+
|
|
408
|
+
def _create_run_payload(result: CreateRunResult) -> dict[str, Any]:
    """Render a ``CreateRunResult`` as the JSON body of the create-run route."""
    copied_fields = (
        "run_id",
        "trace_id",
        "state",
        "status",
        "event_cursor",
        "idempotency_outcome",
    )
    payload: dict[str, Any] = {field: getattr(result, field) for field in copied_fields}
    # The terminal error is a model (or None) and needs a JSON dump.
    payload["terminal_error"] = _dump_optional(result.terminal_error)
    return payload
|
|
418
|
+
|
|
419
|
+
|
|
420
|
+
def _fusion_request_to_run_request(
    request: FusionRequest,
    config: FusionConfig,
) -> FusionRunRequestV1:
    """Translate an OpenAI-style chat request into a ``FusionRunRequestV1``.

    Sampling starts from ``config.sampling`` and is overridden by whichever
    of temperature / top_p / max_tokens the request sets. Only the ``role``
    and ``content`` of each message are forwarded.
    """
    overrides: dict[str, Any] = {}
    for field in ("temperature", "top_p", "max_tokens"):
        value = getattr(request, field)
        if value is not None:
            overrides[field] = value
    sampling = config.sampling.model_copy(update=overrides)

    fusion = request.fusion
    payload = {
        **contract_metadata("fusion-run-request.v1"),
        "request_id": make_id("chat_request"),
        "mode": _mode_from_request(request),
        "messages": [
            message.model_dump(mode="json", include={"role", "content"})
            for message in request.messages
        ],
        "sampling": sampling.model_dump(mode="json"),
        "sample_count": fusion.sample_count,
        "verify": fusion.verify,
        "requested_models": fusion.panel_models,
        "tool_policy": _tool_policy_from_options(fusion.tool_execution),
    }
    # The hash covers the model name, the forwarded messages, the effective
    # sampling and the full fusion options.
    hashed = {
        "model": request.model,
        "messages": payload["messages"],
        "sampling": payload["sampling"],
        "fusion": fusion.model_dump(mode="json"),
    }
    payload["request_hash"] = hash_json(hashed)
    return FusionRunRequestV1.model_validate(payload)
|
|
458
|
+
|
|
459
|
+
|
|
460
|
+
def _tool_policy_from_options(options: FusionToolExecutionOptions) -> str:
    """Map a tool-execution mode to the run request's ``tool_policy`` string.

    ``"external"`` becomes ``"external_pause"``, ``"executor"`` becomes
    ``"allowed"``, and anything else becomes ``"disabled"``.
    """
    policy_by_mode = (
        ("external", "external_pause"),
        ("executor", "allowed"),
    )
    for mode, policy in policy_by_mode:
        if options.mode == mode:
            return policy
    return "disabled"
|
|
466
|
+
|
|
467
|
+
|
|
468
|
+
def _tool_execution_policy_from_options(options: FusionToolExecutionOptions) -> ToolExecutionPolicy:
    """Validate the request's tool-execution options as a ``ToolExecutionPolicy``."""
    copied_fields = (
        "mode",
        "allowed_side_effects",
        "environment",
        "policy_id",
        "dedupe_read_only",
    )
    raw_policy = {field: getattr(options, field) for field in copied_fields}
    return ToolExecutionPolicy.model_validate(raw_policy)
|
|
478
|
+
|
|
479
|
+
|
|
480
|
+
def _chat_fusion_metadata(inspection: RunInspection) -> dict[str, Any]:
    """Summarize a run inspection for the ``fusionkit`` field of chat responses."""
    candidates = inspection.candidates
    metadata: dict[str, Any] = {
        field: getattr(inspection, field)
        for field in ("run_id", "trace_id", "state", "status", "event_cursor")
    }
    metadata["candidate_count"] = len(candidates)
    metadata["candidate_model_ids"] = [candidate.model_id for candidate in candidates]
    return metadata
|
|
490
|
+
|
|
491
|
+
|
|
492
|
+
def _native_error_response(
    error: NativeRunError | None,
    status_code: int,
) -> JSONResponse:
    """Wrap a native run error as ``{"error": ...}`` with ``status_code``.

    A generic internal error stands in when no error object is supplied.
    """
    if error:
        resolved_error = error
    else:
        resolved_error = NativeRunError(
            error_kind="internal_error",
            error_code="unknown_native_run_error",
            retryable=False,
            owner="fusionkit",
            terminal_reason="unknown_native_run_error",
        )
    body = {"error": resolved_error.model_dump(mode="json")}
    return _json_response(body, status_code)
|
|
504
|
+
|
|
505
|
+
|
|
506
|
+
def _openai_native_error_response(
    error: NativeRunError | None,
    status_code: int,
) -> JSONResponse:
    """Render a native run error in the OpenAI error-body shape.

    The message is the error's ``message`` when set, else its
    ``terminal_reason``. A generic ``native_run_failed`` error stands in
    when no error object is supplied.
    """
    if error:
        resolved_error = error
    else:
        resolved_error = NativeRunError(
            error_kind="internal_error",
            error_code="native_run_failed",
            retryable=False,
            owner="fusionkit",
            terminal_reason="native_run_failed",
        )
    message = resolved_error.message or resolved_error.terminal_reason
    return _openai_error_response(resolved_error.error_code, message, status_code=status_code)
|
|
522
|
+
|
|
523
|
+
|
|
524
|
+
def _openai_error_response(
    error_code: str,
    message: str,
    status_code: int,
    error_type: str | None = None,
) -> JSONResponse:
    """Build an OpenAI-style ``{"error": {...}}`` JSON response.

    Args:
        error_code: Machine-readable code placed in ``error.code``.
        message: Human-readable text placed in ``error.message``.
        status_code: HTTP status of the response.
        error_type: Explicit ``error.type``. When omitted it is derived from
            ``status_code``: ``"server_error"`` for 5xx, otherwise
            ``"invalid_request_error"``.

    Returns:
        A ``JSONResponse`` carrying the error body.
    """
    if error_type is None:
        # This module emits 500/502 through this helper; labelling those as
        # "invalid_request_error" would blame the caller for a server-side
        # or upstream failure.
        error_type = "server_error" if status_code >= 500 else "invalid_request_error"
    return _json_response(
        {
            "error": {
                "message": message,
                "type": error_type,
                "code": error_code,
            }
        },
        status_code=status_code,
    )
|
|
535
|
+
|
|
536
|
+
|
|
537
|
+
def _run_not_found_response() -> JSONResponse:
    """Return the 404 response used when a run id is unknown to the store."""
    not_found = NativeRunError(
        error_kind="validation_error",
        error_code="run_not_found",
        retryable=False,
        owner="fusionkit",
        terminal_reason="unknown_run",
    )
    return _native_error_response(not_found, status_code=404)
|
|
548
|
+
|
|
549
|
+
|
|
550
|
+
def _json_response(payload: Any, status_code: int = 200) -> JSONResponse:
    """Wrap ``payload`` in a ``JSONResponse`` with the given HTTP status."""
    response = JSONResponse(content=payload, status_code=status_code)
    return response
|
|
552
|
+
|
|
553
|
+
|
|
554
|
+
def _dump_optional(model: BaseModel | None) -> dict[str, Any] | None:
    """Dump ``model`` in JSON mode, passing ``None`` through unchanged."""
    if model is None:
        return None
    return model.model_dump(mode="json")
|
|
556
|
+
|
|
557
|
+
|
|
558
|
+
def _mode_from_request(request: FusionRequest) -> FusionMode:
    """Pick the fusion mode for a chat request.

    An explicit ``request.fusion.mode`` always wins. Otherwise the mode is
    the last ``/``-separated segment of ``request.model`` when that segment
    is ``single``, ``self`` or ``panel``; every other model name (including
    ``.../router``) resolves to ``"router"``.
    """
    explicit = request.fusion.mode
    if explicit is not None:
        return explicit
    suffix = request.model.rsplit("/", maxsplit=1)[-1]
    # "router" is also the fallback, so it needs no branch of its own.
    if suffix in ("single", "self", "panel"):
        return suffix
    return "router"
|
|
571
|
+
|
|
572
|
+
|
|
573
|
+
def _coerce_message_content(content: Any) -> str:
    """Flatten OpenAI message content into a plain string.

    Strings pass through. A list is joined from its string items and from
    the ``text`` value of dict items whose ``text`` is a string; other items
    are ignored. Any other content type yields ``""``.
    """
    if isinstance(content, str):
        return content
    if not isinstance(content, list):
        return ""
    fragments: list[str] = []
    for part in content:
        if isinstance(part, str):
            fragments.append(part)
            continue
        if isinstance(part, dict):
            text = part.get("text")
            if isinstance(text, str):
                fragments.append(text)
    return "".join(fragments)
|
|
585
|
+
|
|
586
|
+
|
|
587
|
+
def _to_chat_message(message: dict[str, Any]) -> ChatMessage:
    """Convert a raw OpenAI chat message dict into a FusionKit ``ChatMessage``.

    Tool calls may be nested (``{"id": ..., "function": {"name", "arguments"}}``)
    or already flat; both are reduced to ``ToolCall(id, name, arguments)``.
    """
    fields: dict[str, Any] = {
        "role": message.get("role", "user"),
        "content": _coerce_message_content(message.get("content")),
    }
    # Optional keys are copied only when present and truthy.
    if message.get("tool_call_id"):
        fields["tool_call_id"] = message["tool_call_id"]
    if message.get("name"):
        fields["name"] = message["name"]

    raw_calls = message.get("tool_calls")
    if raw_calls:
        converted: list[ToolCall] = []
        for raw in raw_calls:
            is_mapping = isinstance(raw, dict)
            spec = raw.get("function") if is_mapping and "function" in raw else raw
            if not isinstance(spec, dict):
                spec = {}
            call_id = raw.get("id", "") if is_mapping else ""
            converted.append(
                ToolCall(
                    id=call_id,
                    name=spec.get("name", ""),
                    # Missing or empty arguments fall back to "{}".
                    arguments=spec.get("arguments", "{}") or "{}",
                )
            )
        if converted:
            fields["tool_calls"] = converted
    return ChatMessage(**fields)
|
|
616
|
+
|
|
617
|
+
|
|
618
|
+
def _normalize_tools(tools: list[dict[str, Any]] | None) -> list[dict[str, Any]] | None:
    """Reduce tool definitions to flat ``{name, description, parameters}`` dicts.

    Accepts both the OpenAI-nested form (``{"type": "function", "function":
    {...}}``) and an already-flat form. Returns ``None`` when nothing usable
    remains.
    """
    if not tools:
        return None
    flat: list[dict[str, Any]] = []
    for entry in tools:
        nested = isinstance(entry, dict) and "function" in entry
        spec = entry.get("function") if nested else entry
        if not isinstance(spec, dict):
            continue
        name = spec.get("name", "")
        # Some agent CLIs advertise custom or freeform tool shapes that end
        # up with an empty name; providers reject those, so drop them.
        if not (isinstance(name, str) and name):
            continue
        flat.append(
            {
                "name": name,
                "description": spec.get("description", ""),
                "parameters": spec.get("parameters", {"type": "object", "properties": {}}),
            }
        )
    if not flat:
        return None
    return flat
|
|
643
|
+
|
|
644
|
+
|
|
645
|
+
def _normalize_tool_choice(choice: str | dict[str, Any] | None) -> str | dict[str, Any] | None:
    """Normalize an OpenAI ``tool_choice`` value.

    Strings and ``None`` pass through. A dict (nested under ``function`` or
    flat) becomes ``{"name": <name>}`` when it names a tool with a non-empty
    string; every other input yields ``None``.
    """
    if isinstance(choice, dict):
        target = choice.get("function") if "function" in choice else choice
        if isinstance(target, dict):
            name = target.get("name")
            if isinstance(name, str) and name:
                return {"name": name}
        return None
    if isinstance(choice, str):
        return choice
    return None
|
|
654
|
+
|
|
655
|
+
|
|
656
|
+
def _tool_calls_payload(response: ModelResponse) -> list[dict[str, Any]]:
    """Render the response's tool calls in OpenAI's nested ``function`` shape.

    A call without an id is given ``call_<position>`` as its id.
    """
    rendered: list[dict[str, Any]] = []
    for position, call in enumerate(response.tool_calls):
        call_id = call.id or f"call_{position}"
        rendered.append(
            {
                "id": call_id,
                "type": "function",
                "function": {"name": call.name, "arguments": call.arguments},
            }
        )
    return rendered
|
|
665
|
+
|
|
666
|
+
|
|
667
|
+
def _usage_dict(response: ModelResponse) -> dict[str, Any]:
    """Copy the response's token counts into an OpenAI ``usage`` dict."""
    usage = response.usage
    counters = ("prompt_tokens", "completion_tokens", "total_tokens")
    return {counter: getattr(usage, counter) for counter in counters}
|
|
673
|
+
|
|
674
|
+
|
|
675
|
+
def _openai_step_response(model: str, response: ModelResponse) -> dict[str, Any]:
    """Build a non-streaming ``chat.completion`` body for one judge step.

    When the response has tool calls they are attached to the assistant
    message and ``finish_reason`` is ``"tool_calls"``; otherwise the model's
    own finish reason is used, defaulting to ``"stop"``.
    """
    assistant: dict[str, Any] = {"role": "assistant", "content": response.content or ""}
    calls = _tool_calls_payload(response)
    if calls:
        assistant["tool_calls"] = calls
        finish_reason = "tool_calls"
    else:
        finish_reason = response.finish_reason or "stop"
    choice = {"index": 0, "message": assistant, "finish_reason": finish_reason}
    return {
        "id": f"chatcmpl-{uuid.uuid4()}",
        "object": "chat.completion",
        "created": int(time.time()),
        "model": model,
        "choices": [choice],
        "usage": _usage_dict(response),
    }
|
|
689
|
+
|
|
690
|
+
|
|
691
|
+
async def _step_completion_sse(model: str, response: ModelResponse) -> AsyncIterator[str]:
    """Yield one judge step as OpenAI ``chat.completion.chunk`` SSE frames.

    Frame order: a role frame, content pieces (if any), a single frame with
    all tool calls (if any), a finish frame, then the ``[DONE]`` sentinel.
    """
    completion_id = f"chatcmpl-{uuid.uuid4()}"
    created = int(time.time())
    calls = _tool_calls_payload(response)
    if calls:
        final_reason = "tool_calls"
    else:
        final_reason = response.finish_reason or "stop"

    def frame(delta: dict[str, Any], finish: str | None = None) -> str:
        envelope = {
            "id": completion_id,
            "object": "chat.completion.chunk",
            "created": created,
            "model": model,
            "choices": [{"index": 0, "delta": delta, "finish_reason": finish}],
        }
        return f"data: {json.dumps(envelope)}\n\n"

    yield frame({"role": "assistant"})
    if response.content:
        for piece in _stream_pieces(response.content):
            yield frame({"content": piece})
    if calls:
        # Each streamed tool call carries its position as "index".
        indexed_calls = [{"index": position, **call} for position, call in enumerate(calls)]
        yield frame({"tool_calls": indexed_calls})
    yield frame({}, final_reason)
    yield "data: [DONE]\n\n"
|
|
722
|
+
|
|
723
|
+
|
|
724
|
+
def _openai_chat_response(model: str, content: str, metadata: dict[str, Any]) -> dict[str, Any]:
    """Build a non-streaming ``chat.completion`` body for a finished run.

    Token usage is reported as ``None`` for every counter, and ``metadata``
    is attached under the ``fusionkit`` key.
    """
    completion_id = f"chatcmpl-{uuid.uuid4()}"
    created = int(time.time())
    choice = {
        "index": 0,
        "finish_reason": "stop",
        "message": {"role": "assistant", "content": content},
    }
    usage = dict.fromkeys(("prompt_tokens", "completion_tokens", "total_tokens"))
    return {
        "id": completion_id,
        "object": "chat.completion",
        "created": created,
        "model": model,
        "choices": [choice],
        "usage": usage,
        "fusionkit": metadata,
    }
|
|
@@ -0,0 +1,296 @@
|
|
|
1
|
+
"""Front one model endpoint (any provider) as an OpenAI Chat Completions server.
|
|
2
|
+
|
|
3
|
+
This is the cloud analogue of the local MLX server: it serves a single
|
|
4
|
+
``/v1/chat/completions`` endpoint backed by FusionKit's provider clients
|
|
5
|
+
(``build_client``), so an OpenAI/Anthropic/Google model can be consumed by any
|
|
6
|
+
OpenAI-compatible caller (for example, HandoffKit's per-candidate coding
|
|
7
|
+
harness). One process fronts exactly one model, which keeps per-model routing
|
|
8
|
+
trivial for the caller.
|
|
9
|
+
|
|
10
|
+
Single-threaded on purpose (one model, low volume); each request runs the async
|
|
11
|
+
provider client in a fresh event loop so the SDK's HTTP client is bound to the
|
|
12
|
+
loop that drives it. Shared by the ``fusionkit serve-endpoint`` CLI command and
|
|
13
|
+
``scripts/simple_openai_server.py``.
|
|
14
|
+
"""
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import asyncio
|
|
18
|
+
import json
|
|
19
|
+
import time
|
|
20
|
+
import traceback
|
|
21
|
+
import uuid
|
|
22
|
+
from http.server import BaseHTTPRequestHandler, HTTPServer
|
|
23
|
+
from typing import Any, cast
|
|
24
|
+
|
|
25
|
+
from fusionkit_core.clients import build_client
|
|
26
|
+
from fusionkit_core.config import ModelEndpoint, ProviderKind, SamplingConfig
|
|
27
|
+
from fusionkit_core.trace import (
|
|
28
|
+
TRACE_CANDIDATE_HEADER,
|
|
29
|
+
TRACE_ID_HEADER,
|
|
30
|
+
TRACE_SPAN_HEADER,
|
|
31
|
+
new_span_id,
|
|
32
|
+
)
|
|
33
|
+
from fusionkit_core.trace import emit as trace_emit
|
|
34
|
+
from fusionkit_core.types import ChatMessage, ToolCall
|
|
35
|
+
|
|
36
|
+
# Default API roots, keyed by provider name, used when the operator does not
# pass an explicit base URL. These are bare hosts on purpose: the OpenAI client
# appends `/v1`, while the Anthropic SDK takes the root.
PROVIDER_DEFAULT_BASE_URL = dict(
    openai="https://api.openai.com",
    anthropic="https://api.anthropic.com",
    google="https://generativelanguage.googleapis.com",
)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _content_text(content: Any) -> str:
    """Reduce an OpenAI message ``content`` value to plain text.

    Strings are returned unchanged. A list is treated as OpenAI content parts:
    bare strings and ``{"type": "text", "text": "..."}`` parts are concatenated
    in order, and every other part (images, audio, ...) is skipped. Anything
    else, including ``None``, becomes the empty string.
    """
    if isinstance(content, str):
        return content
    if not isinstance(content, list):
        return ""
    pieces: list[str] = []
    for part in content:
        if isinstance(part, str):
            pieces.append(part)
        elif (
            isinstance(part, dict)
            and part.get("type") == "text"
            and isinstance(part.get("text"), str)
        ):
            pieces.append(part["text"])
    return "".join(pieces)


def _to_chat_message(message: dict[str, Any]) -> ChatMessage:
    """Convert one OpenAI-style message dict into a ``ChatMessage``.

    * ``role`` defaults to ``"user"`` when absent.
    * ``content`` is flattened to text by ``_content_text``, so list-form
      content (text parts) is preserved instead of being dropped.
    * ``tool_call_id`` is forwarded only when present and non-empty.
    * ``tool_calls`` entries become ``ToolCall`` objects; missing ids and names
      default to ``""`` and missing arguments to ``"{}"``.
    """
    fields: dict[str, Any] = {
        "role": message.get("role", "user"),
        "content": _content_text(message.get("content")),
    }
    tool_call_id = message.get("tool_call_id")
    if tool_call_id:
        fields["tool_call_id"] = tool_call_id
    raw_calls = message.get("tool_calls")
    if raw_calls:
        converted = []
        for call in raw_calls:
            # `"function": null` must not crash the conversion.
            function = call.get("function") or {}
            converted.append(
                ToolCall(
                    id=call.get("id", ""),
                    name=function.get("name", ""),
                    arguments=function.get("arguments", "{}"),
                )
            )
        fields["tool_calls"] = converted
    return ChatMessage(**fields)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _to_tools(tools: Any) -> list[dict[str, Any]] | None:
    """Flatten OpenAI ``tools`` entries into name/description/parameters dicts.

    Returns ``None`` when ``tools`` is empty or missing. Each entry may either
    wrap its definition under a ``"function"`` key or be the definition
    itself. Missing names and descriptions default to ``""``; missing
    parameters default to an empty object schema.
    """

    def flatten(spec: dict[str, Any]) -> dict[str, Any]:
        body = spec.get("function", spec)
        return {
            "name": body.get("name", ""),
            "description": body.get("description", ""),
            # A fresh default dict per entry, so results never share state.
            "parameters": body.get("parameters", {"type": "object", "properties": {}}),
        }

    return [flatten(spec) for spec in tools] if tools else None
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _float_or_default(value: Any, default: float) -> float:
    """Return ``float(value)``, using ``default`` only when ``value`` is ``None``.

    A plain ``value or default`` would also replace an explicit ``0`` (for
    example ``"temperature": 0``), which is a legitimate caller choice.
    """
    return default if value is None else float(value)


def make_handler(endpoint: ModelEndpoint) -> type[BaseHTTPRequestHandler]:
    """Build a ``BaseHTTPRequestHandler`` subclass bound to ``endpoint``.

    The returned class answers:

    * ``GET /health`` and ``GET /v1/health`` with a small status document,
    * ``GET /v1/models`` with a one-entry model list,
    * ``POST /v1/chat/completions`` by forwarding the request to the client
      returned by ``build_client(endpoint)``.

    Every other path gets a 404 JSON error body. Any exception raised while
    handling a POST is printed, traced, and returned as a 500 response with
    an ``{"error": {...}}`` body.
    """

    class Handler(BaseHTTPRequestHandler):
        server_version = "fusionkit-openai-bridge/0.1"

        def log_message(self, format: str, *args: Any) -> None:
            """Write the access-log line to stdout, flushed immediately."""
            prefix = f"{self.address_string()} - - [{self.log_date_time_string()}] "
            print(prefix + format % args, flush=True)

        def _send_json(self, status: int, payload: dict[str, Any]) -> None:
            """Send ``payload`` as a JSON response with the given status."""
            body = json.dumps(payload).encode("utf-8")
            self.send_response(status)
            self.send_header("content-type", "application/json")
            self.send_header("content-length", str(len(body)))
            self.end_headers()
            self.wfile.write(body)

        def do_GET(self) -> None:
            """Serve the health and model-listing routes."""
            if self.path in ("/health", "/v1/health"):
                self._send_json(
                    200,
                    {"status": "ok", "model": endpoint.id, "provider": endpoint.provider},
                )
                return
            if self.path == "/v1/models":
                self._send_json(
                    200,
                    {
                        "object": "list",
                        "data": [
                            {"id": endpoint.id, "object": "model", "owned_by": endpoint.provider}
                        ],
                    },
                )
                return
            self._send_json(404, {"error": {"message": "not found"}})

        def do_POST(self) -> None:
            """Run one chat completion against the bound endpoint."""
            if self.path != "/v1/chat/completions":
                self._send_json(404, {"error": {"message": "not found"}})
                return
            # Trace context is propagated from the caller through headers; the
            # span for this call is always freshly generated.
            trace_id = self.headers.get(TRACE_ID_HEADER)
            candidate_id = self.headers.get(TRACE_CANDIDATE_HEADER)
            parent_span = self.headers.get(TRACE_SPAN_HEADER)
            call_span = new_span_id()
            try:
                length = int(self.headers.get("content-length", "0"))
                request = json.loads(self.rfile.read(length).decode("utf-8"))
                messages = [
                    _to_chat_message(message) for message in (request.get("messages") or [])
                ]
                tools = _to_tools(request.get("tools"))
                tool_choice = request.get("tool_choice")
                sampling = SamplingConfig(
                    # Defaults apply only to missing/null values, so an
                    # explicit 0 (e.g. temperature 0) is honoured.
                    temperature=_float_or_default(request.get("temperature"), 0.2),
                    top_p=_float_or_default(request.get("top_p"), 0.95),
                    # A zero or null token budget falls back to the default.
                    max_tokens=int(request.get("max_tokens", 1024) or 1024),
                )
                trace_emit(
                    component="panel-model",
                    event_type="model.call.started",
                    trace_id=trace_id,
                    span_id=call_span,
                    parent_span_id=parent_span,
                    candidate_id=candidate_id,
                    model_id=endpoint.id,
                    payload={
                        "model": endpoint.model,
                        "provider": endpoint.provider,
                        "message_count": len(messages),
                        "tool_count": len(tools) if tools else 0,
                    },
                )

                async def run() -> Any:
                    # The client is built inside the coroutine so it is
                    # created under the event loop that drives it.
                    client = build_client(endpoint)
                    return await client.chat(
                        messages,
                        sampling,
                        tools=tools,
                        # Only string tool_choice values are forwarded.
                        tool_choice=tool_choice if isinstance(tool_choice, str) else None,
                    )

                started = time.perf_counter()
                # asyncio.run creates and closes a fresh event loop per request.
                response = asyncio.run(run())
                latency_s = time.perf_counter() - started
                if response.tool_calls:
                    finish_reason = "tool_calls"
                else:
                    finish_reason = response.finish_reason or "stop"
                trace_emit(
                    component="panel-model",
                    event_type="model.call.finished",
                    trace_id=trace_id,
                    span_id=call_span,
                    parent_span_id=parent_span,
                    candidate_id=candidate_id,
                    model_id=endpoint.id,
                    payload={
                        "model": endpoint.model,
                        "provider": endpoint.provider,
                        "latency_s": round(latency_s, 3),
                        "finish_reason": finish_reason,
                        "tool_call_count": len(response.tool_calls),
                        "content_preview": (response.content or "")[:400],
                        "usage": {
                            "prompt_tokens": response.usage.prompt_tokens,
                            "completion_tokens": response.usage.completion_tokens,
                            "total_tokens": response.usage.total_tokens,
                        },
                    },
                )
                print(
                    json.dumps(
                        {
                            "event": "chat_completion",
                            "model": endpoint.id,
                            "provider": endpoint.provider,
                            "latency_s": round(latency_s, 3),
                        }
                    ),
                    flush=True,
                )
                message_body: dict[str, Any] = {"role": "assistant", "content": response.content}
                if response.tool_calls:
                    message_body["tool_calls"] = [
                        {
                            # Synthesise an id when the provider gave none.
                            "id": call.id or f"call_{uuid.uuid4().hex[:8]}",
                            "type": "function",
                            "function": {"name": call.name, "arguments": call.arguments},
                        }
                        for call in response.tool_calls
                    ]
                self._send_json(
                    200,
                    {
                        "id": f"chatcmpl-{uuid.uuid4()}",
                        "object": "chat.completion",
                        "created": int(time.time()),
                        "model": endpoint.model,
                        "choices": [
                            {
                                "index": 0,
                                "message": message_body,
                                "finish_reason": finish_reason,
                            }
                        ],
                        "usage": {
                            "prompt_tokens": response.usage.prompt_tokens,
                            "completion_tokens": response.usage.completion_tokens,
                            "total_tokens": response.usage.total_tokens,
                        },
                    },
                )
            except Exception as exc:  # noqa: BLE001 - surface as an OpenAI error body
                traceback.print_exc()
                trace_emit(
                    component="panel-model",
                    event_type="model.call.finished",
                    trace_id=trace_id,
                    span_id=call_span,
                    parent_span_id=parent_span,
                    candidate_id=candidate_id,
                    model_id=endpoint.id,
                    payload={
                        "model": endpoint.model,
                        "provider": endpoint.provider,
                        "error": str(exc),
                        "error_type": exc.__class__.__name__,
                    },
                )
                self._send_json(
                    500,
                    {"error": {"message": str(exc), "type": exc.__class__.__name__}},
                )

    return Handler
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def build_endpoint(
    *,
    id: str,
    model: str,
    provider: str = "openai",
    base_url: str | None = None,
    api_key_env: str | None = None,
    timeout_s: float = 120.0,
) -> ModelEndpoint:
    """Assemble a ``ModelEndpoint`` from loosely-typed, CLI-style arguments.

    An empty or missing ``base_url`` falls back to the provider's entry in
    ``PROVIDER_DEFAULT_BASE_URL``, and to ``http://127.0.0.1`` for providers
    that have no entry there.
    """
    if base_url:
        effective_base_url = base_url
    else:
        effective_base_url = PROVIDER_DEFAULT_BASE_URL.get(provider, "http://127.0.0.1")
    # `provider` is a free-form string at this point; the cast only satisfies
    # the type checker. ModelEndpoint validates it against ProviderKind when
    # constructed (pydantic raises on misuse).
    provider_kind = cast(ProviderKind, provider)
    return ModelEndpoint(
        id=id,
        model=model,
        provider=provider_kind,
        base_url=effective_base_url,
        api_key_env=api_key_env,
        timeout_s=timeout_s,
    )
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
def serve_single_endpoint(endpoint: ModelEndpoint, *, host: str = "127.0.0.1", port: int) -> None:
    """Serve ``endpoint`` over HTTP on ``host:port`` until interrupted.

    Prints a JSON ``starting`` event before binding and a JSON ``listening``
    event once the socket is bound, then blocks in ``serve_forever()``. The
    server is used as a context manager so the listening socket is closed
    however the loop exits (normal shutdown, ``KeyboardInterrupt``, or error).
    """
    print(
        json.dumps(
            {
                "event": "starting",
                "id": endpoint.id,
                "provider": endpoint.provider,
                "model": endpoint.model,
            }
        ),
        flush=True,
    )
    with HTTPServer((host, port), make_handler(endpoint)) as server:
        print(json.dumps({"event": "listening", "host": host, "port": port}), flush=True)
        server.serve_forever()
|