fusionkit-server 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,10 @@
1
+ Metadata-Version: 2.3
2
+ Name: fusionkit-server
3
+ Version: 0.1.0
4
+ Summary: OpenAI-compatible HTTP server for fusionkit.
5
+ Requires-Dist: fastapi>=0.124.4
6
+ Requires-Dist: fusionkit-core==0.1.0
7
+ Requires-Dist: uvicorn>=0.38.0
8
+ Requires-Dist: uvicorn[standard]>=0.38.0 ; extra == 'server'
9
+ Requires-Python: >=3.11
10
+ Provides-Extra: server
@@ -0,0 +1,17 @@
1
+ [project]
2
+ name = "fusionkit-server"
3
+ version = "0.1.0"
4
+ description = "OpenAI-compatible HTTP server for fusionkit."
5
+ requires-python = ">=3.11"
6
+ dependencies = [
7
+ "fastapi>=0.124.4",
8
+ "fusionkit-core==0.1.0",
9
+ "uvicorn>=0.38.0",
10
+ ]
11
+
12
+ [project.optional-dependencies]
13
+ server = ["uvicorn[standard]>=0.38.0"]
14
+
15
+ [build-system]
16
+ requires = ["uv_build>=0.11.21,<0.12.0"]
17
+ build-backend = "uv_build"
@@ -0,0 +1,3 @@
1
+ from fusionkit_server.app import create_app
2
+
3
+ __all__ = ["create_app"]
@@ -0,0 +1,746 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import re
5
+ import time
6
+ import traceback
7
+ import uuid
8
+ from collections.abc import AsyncIterator
9
+ from pathlib import Path
10
+ from typing import Any
11
+
12
+ from fastapi import FastAPI, Header, Query
13
+ from fastapi.responses import JSONResponse, StreamingResponse
14
+ from fusionkit_core.artifacts import LocalArtifactStore
15
+ from fusionkit_core.clients import ChatClient, build_clients
16
+ from fusionkit_core.config import FusionConfig, FusionMode
17
+ from fusionkit_core.contracts import (
18
+ FusionRunRequestV1,
19
+ HarnessTrajectoryV1,
20
+ contract_metadata,
21
+ )
22
+ from fusionkit_core.fusion import FusionEngine
23
+ from fusionkit_core.run import (
24
+ CreateRunResult,
25
+ FusionRunManager,
26
+ NativeRunError,
27
+ RunInspection,
28
+ ToolExecutionMode,
29
+ ToolExecutionPolicy,
30
+ ToolResultSubmission,
31
+ hash_json,
32
+ make_id,
33
+ )
34
+ from fusionkit_core.run_store import FileSystemRunStore
35
+ from fusionkit_core.trace import TRACE_ID_HEADER, TRACE_SPAN_HEADER, new_span_id
36
+ from fusionkit_core.trace import emit as trace_emit
37
+ from fusionkit_core.types import ChatMessage, ModelResponse, ToolCall
38
+ from pydantic import BaseModel, Field
39
+
40
+
41
class FusionToolExecutionOptions(BaseModel):
    """Tool-execution settings a caller may attach to a chat request.

    On the chat-completions path in this module only ``mode`` is consumed: it
    is mapped to a run-level tool policy string by
    ``_tool_policy_from_options``. All five fields are forwarded verbatim to
    ``ToolExecutionPolicy`` by ``_tool_execution_policy_from_options``.
    """

    # Execution mode; tools are disabled unless the caller opts in.
    mode: ToolExecutionMode = "disabled"
    # A fresh ["read_only"] list is created per instance (no shared default).
    allowed_side_effects: list[str] = Field(default_factory=lambda: ["read_only"])
    # Optional; None when not supplied by the caller.
    environment: str | None = None
    # Optional; None when not supplied by the caller.
    policy_id: str | None = None
    # Defaults to True. NOTE(review): presumably de-duplicates repeated
    # read-only tool calls; the semantics live in ToolExecutionPolicy - confirm.
    dedupe_read_only: bool = True
47
+
48
+
49
class FusionOptions(BaseModel):
    """FusionKit-specific extension block of a chat-completions request."""

    # Explicit fusion mode; when None, _mode_from_request derives the mode
    # from the suffix of the request's model name.
    mode: FusionMode | None = None
    # Forwarded to the run request as "requested_models".
    panel_models: list[str] | None = None
    # Optional; validated to be >= 1 when provided.
    sample_count: int | None = Field(default=None, ge=1)
    # Forwarded to the run request as "verify".
    verify: bool = False
    # Nested tool-execution options; a fresh default instance per request.
    tool_execution: FusionToolExecutionOptions = Field(
        default_factory=FusionToolExecutionOptions
    )
57
+
58
+
59
class FusionRequest(BaseModel):
    """Request body accepted by ``POST /v1/chat/completions``."""

    # Model name; its last "/"-separated segment can select the fusion mode
    # (see _mode_from_request). Echoed back in the response's "model" field.
    model: str = "fusionkit/router"
    # Conversation so far; only "role" and "content" are forwarded to the run.
    messages: list[ChatMessage]
    # Sampling overrides: each replaces the configured sampling value only
    # when it is not None (see _fusion_request_to_run_request).
    temperature: float | None = None
    top_p: float | None = None
    max_tokens: int | None = None
    # When True the handler answers with a text/event-stream response.
    stream: bool = False
    # FusionKit-specific options; a fresh default instance per request.
    fusion: FusionOptions = Field(default_factory=FusionOptions)
67
+
68
+
69
class TrajectoryStepInput(BaseModel):
    """One step of a candidate trajectory as submitted by a caller.

    Only ``index`` and ``type`` are required; every other field is optional
    and is dropped from the contract payload when left as None, because the
    enclosing trajectory is dumped with ``exclude_none=True``.
    """

    # NOTE(review): presumably the step's position within the trajectory -
    # confirm against the HarnessTrajectoryV1 contract.
    index: int
    # Step kind; the accepted values are defined by the contract, not here.
    type: str
    text: str | None = None
    tool_name: str | None = None
    tool_call_id: str | None = None
    tool_input: str | None = None
    is_error: bool | None = None
    output_hash: str | None = None
78
+
79
+
80
class TrajectoryVerificationInput(BaseModel):
    """Optional verification result attached to a submitted trajectory."""

    # Required; the accepted values are defined by the trajectory contract.
    status: str
    # Optional supporting evidence strings.
    evidence: list[str] | None = None
    # Optional. NOTE(review): presumably the verifier process exit code - confirm.
    exit_code: int | None = None
84
+
85
+
86
class TrajectoryInput(BaseModel):
    """A candidate trajectory submitted to the trajectory fuse/step routes.

    The handlers dump each instance with ``exclude_none=True``, merge it with
    the ``harness-trajectory.v1`` contract metadata, and validate the result
    as ``HarnessTrajectoryV1``; fields left as None are therefore omitted.
    """

    # Required identifiers and outcome.
    trajectory_id: str
    model_id: str
    status: str
    # Recorded steps; a fresh empty list per instance when omitted.
    steps: list[TrajectoryStepInput] = Field(default_factory=list)
    final_output: str
    # Optional descriptive fields, passed through when present.
    candidate_id: str | None = None
    model: str | None = None
    harness_kind: str | None = None
    diff: str | None = None
    verification: TrajectoryVerificationInput | None = None
    metadata: dict[str, Any] | None = None
98
+
99
+
100
class FuseTrajectoriesRequest(BaseModel):
    """Request body accepted by ``POST /v1/fusion/trajectories:fuse``."""

    # Conversation passed unchanged to the judge/synthesizer.
    messages: list[ChatMessage]
    # Candidate trajectories; the handler rejects an empty list with HTTP 400.
    trajectories: list[TrajectoryInput]
    # Optional overrides; when None the handler falls back to
    # config.resolved_judge_model / config.resolved_synthesizer_model.
    judge_model: str | None = None
    synthesizer_model: str | None = None
105
+
106
+
107
class StepTrajectoriesRequest(BaseModel):
    """A single front-door turn: the judge synthesizes the next step (tool call
    or final answer) from the candidate trajectories + the live conversation.

    Request body accepted by ``POST /v1/fusion/trajectory:step``.
    """

    # Echoed back as the "model" field of the response.
    model: str = "fusionkit/router"
    # Raw OpenAI chat messages (assistant tool_calls are nested under `function`,
    # tool results carry `tool_call_id`, content may be a parts array); normalized
    # to FusionKit ChatMessage in the handler.
    messages: list[dict[str, Any]]
    # Candidate trajectories; may be empty (defaults to a fresh empty list).
    trajectories: list[TrajectoryInput] = Field(default_factory=list)
    # Tool definitions; flattened by _normalize_tools in the handler.
    tools: list[dict[str, Any]] | None = None
    # Tool choice; normalized by _normalize_tool_choice in the handler.
    tool_choice: str | dict[str, Any] | None = None
    # Optional override; falls back to config.resolved_judge_model when None.
    judge_model: str | None = None
    # When True the handler answers with a text/event-stream response.
    stream: bool = False
121
+
122
+
123
def _to_harness_trajectories(
    inputs: list[TrajectoryInput],
) -> list[HarnessTrajectoryV1]:
    """Convert request trajectories into validated ``HarnessTrajectoryV1`` objects.

    Each input is dumped with ``exclude_none=True`` and merged over the
    ``harness-trajectory.v1`` contract metadata before validation.

    Raises:
        ValueError: if a payload fails contract validation (pydantic's
            ``ValidationError`` is a ``ValueError`` subclass).
    """
    return [
        HarnessTrajectoryV1.model_validate(
            {
                **contract_metadata("harness-trajectory.v1"),
                **item.model_dump(exclude_none=True),
            }
        )
        for item in inputs
    ]


def create_app(
    config: FusionConfig,
    clients: dict[str, ChatClient] | None = None,
    run_manager: FusionRunManager | None = None,
    run_store_path: Path | None = None,
) -> FastAPI:
    """Build the FusionKit FastAPI application.

    Args:
        config: Fusion configuration (endpoints, sampling, judge/synthesizer
            model resolution).
        clients: Optional pre-built model clients keyed by endpoint id; when
            falsy they are built from ``config``.
        run_manager: Optional run manager; when falsy one is created on top of
            a filesystem run store.
        run_store_path: Root directory for the default run store; only used
            when ``run_manager`` is not supplied.

    Returns:
        A ``FastAPI`` app exposing health, model listing, native fusion run
        routes, an OpenAI-compatible chat-completions route, and the
        trajectory fuse/step routes.
    """
    app = FastAPI(title="fusionkit", version="0.1.0")
    model_clients = clients or build_clients(config)
    engine = FusionEngine(config=config, clients=model_clients)
    native_runs = run_manager or _create_run_manager(engine, run_store_path)

    @app.get("/health")
    async def health() -> dict[str, str]:
        return {"status": "ok"}

    @app.get("/v1/models")
    async def models() -> dict[str, Any]:
        # The virtual router model is always listed first, followed by every
        # configured endpoint.
        data = [{"id": "fusionkit/router", "object": "model"}]
        data.extend({"id": endpoint.id, "object": "model"} for endpoint in config.endpoints)
        return {"object": "list", "data": data}

    @app.post("/v1/fusion/runs")
    async def create_fusion_run(
        request: FusionRunRequestV1,
        idempotency_key: str | None = Header(default=None, alias="Idempotency-Key"),
    ) -> JSONResponse:
        result = await native_runs.create_and_run(request, idempotency_key=idempotency_key)
        if isinstance(result, CreateRunResult):
            if result.idempotency_outcome == "conflict":
                return _native_error_response(result.terminal_error, status_code=409)
            return _json_response(_create_run_payload(result))
        # Any other result is reported as a freshly created run together with
        # its full inspection dump.
        return _json_response(
            {
                "run_id": result.run_id,
                "trace_id": result.trace_id,
                "state": result.state,
                "status": result.status,
                "event_cursor": result.event_cursor,
                "idempotency_outcome": "created",
                "terminal_error": _dump_optional(result.terminal_error),
                "inspection": result.model_dump(mode="json"),
            }
        )

    @app.get("/v1/fusion/runs/{run_id}")
    async def get_fusion_run(run_id: str) -> JSONResponse:
        try:
            return _json_response(native_runs.store.read_summary(run_id).model_dump(mode="json"))
        except FileNotFoundError:
            return _run_not_found_response()

    @app.get("/v1/fusion/runs/{run_id}/inspect")
    async def inspect_fusion_run(run_id: str) -> JSONResponse:
        try:
            return _json_response(native_runs.store.inspect_run(run_id).model_dump(mode="json"))
        except FileNotFoundError:
            return _run_not_found_response()

    @app.get("/v1/fusion/runs/{run_id}/events")
    async def fusion_run_events(
        run_id: str,
        after: int | None = Query(default=None, ge=0),
    ) -> JSONResponse:
        # Reading the summary first turns an unknown run id into a 404 before
        # the event page is requested.
        try:
            native_runs.store.read_summary(run_id)
        except FileNotFoundError:
            return _run_not_found_response()
        return _json_response(native_runs.store.event_page(run_id, after).model_dump(mode="json"))

    @app.post("/v1/fusion/runs/{run_id}/tool-results")
    async def submit_tool_results(
        run_id: str,
        submission: ToolResultSubmission,
    ) -> JSONResponse:
        try:
            result = native_runs.submit_tool_result(run_id, submission)
        except FileNotFoundError:
            return _run_not_found_response()
        if isinstance(result, NativeRunError):
            return _native_error_response(result, status_code=409)
        return _json_response(result.model_dump(mode="json"))

    @app.post("/v1/chat/completions", response_model=None)
    async def chat_completions(
        request: FusionRequest,
    ) -> dict[str, Any] | JSONResponse | StreamingResponse:
        resolved = await _resolve_native_chat(native_runs, request, config)
        if isinstance(resolved, JSONResponse):
            # Already an error response; pass it through unchanged.
            return resolved
        final_output, metadata = resolved
        if request.stream:
            return StreamingResponse(
                _chat_completion_sse(request.model, final_output, metadata),
                media_type="text/event-stream",
            )
        return _openai_chat_response(request.model, final_output, metadata)

    @app.post("/v1/fusion/trajectories:fuse", response_model=None)
    async def fuse_trajectories(
        request: FuseTrajectoriesRequest,
        trace_id: str | None = Header(default=None, alias=TRACE_ID_HEADER),
        span_id: str | None = Header(default=None, alias=TRACE_SPAN_HEADER),
    ) -> dict[str, Any] | JSONResponse:
        if not request.trajectories:
            return _openai_error_response(
                "no_trajectories",
                "At least one trajectory is required.",
                status_code=400,
            )
        judge_model = request.judge_model or config.resolved_judge_model
        synthesizer_model = request.synthesizer_model or config.resolved_synthesizer_model
        try:
            judge_client = engine.clients[judge_model]
            synthesizer_client = engine.clients[synthesizer_model]
        except KeyError as exc:
            return _openai_error_response(
                "unknown_model",
                f"Unknown model endpoint {exc}.",
                status_code=400,
            )
        # A trajectory that fails contract validation is a caller error, so it
        # is reported as a 400 instead of surfacing as an unhandled 500.
        try:
            trajectories = _to_harness_trajectories(request.trajectories)
        except ValueError as exc:
            return _openai_error_response(
                "invalid_trajectory",
                f"Invalid trajectory payload: {exc}",
                status_code=400,
            )
        synthesis_span = new_span_id()
        trace_emit(
            component="synthesis",
            event_type="log",
            trace_id=trace_id,
            span_id=synthesis_span,
            parent_span_id=span_id,
            payload={
                "message": "trajectories:fuse received",
                "judge_model": judge_model,
                "synthesizer_model": synthesizer_model,
                "input_trajectory_ids": [trajectory.trajectory_id for trajectory in trajectories],
                "input_model_ids": [trajectory.model_id for trajectory in trajectories],
            },
        )
        # Mirror step_trajectory: failures of the engine call are returned as
        # an OpenAI-style 502 body rather than a bare 500.
        try:
            result = await engine.judge_synthesizer.synthesize_trajectories(
                request.messages,
                trajectories,
                judge_client=judge_client,
                synthesizer_client=synthesizer_client,
                # The judge always samples with temperature 0.0.
                judge_sampling=config.sampling.model_copy(update={"temperature": 0.0}),
                synthesis_sampling=config.sampling,
                trace_id=trace_id,
                span_id=synthesis_span,
            )
        except Exception as exc:  # noqa: BLE001 - surface as an OpenAI-style error body
            traceback.print_exc()
            return _openai_error_response(
                exc.__class__.__name__,
                f"trajectory fusion failed: {exc}",
                status_code=502,
            )
        return {
            "final_output": result.final_output,
            "synthesis_id": result.record.synthesis_id,
            "decision": result.record.decision,
            "rationale": result.record.rationale,
            "judge_synthesis_record": result.record.model_dump(mode="json"),
        }

    @app.post("/v1/fusion/trajectory:step", response_model=None)
    async def step_trajectory(
        request: StepTrajectoriesRequest,
        trace_id: str | None = Header(default=None, alias=TRACE_ID_HEADER),
        span_id: str | None = Header(default=None, alias=TRACE_SPAN_HEADER),
    ) -> dict[str, Any] | JSONResponse | StreamingResponse:
        judge_model = request.judge_model or config.resolved_judge_model
        try:
            judge_client = engine.clients[judge_model]
        except KeyError as exc:
            return _openai_error_response(
                "unknown_model",
                f"Unknown model endpoint {exc}.",
                status_code=400,
            )
        # Invalid trajectories are a caller error (400), not a server failure.
        try:
            trajectories = _to_harness_trajectories(request.trajectories)
        except ValueError as exc:
            return _openai_error_response(
                "invalid_trajectory",
                f"Invalid trajectory payload: {exc}",
                status_code=400,
            )
        try:
            response = await engine.judge_synthesizer.step(
                [_to_chat_message(message) for message in request.messages],
                trajectories,
                judge_client=judge_client,
                sampling=config.sampling,
                tools=_normalize_tools(request.tools),
                tool_choice=_normalize_tool_choice(request.tool_choice),
                trace_id=trace_id,
                # Start a new span when the caller did not provide one.
                span_id=span_id or new_span_id(),
            )
        except Exception as exc:  # noqa: BLE001 - surface as an OpenAI-style error body
            traceback.print_exc()
            return _openai_error_response(
                exc.__class__.__name__,
                f"trajectory step failed: {exc}",
                status_code=502,
            )
        if request.stream:
            return StreamingResponse(
                _step_completion_sse(request.model, response),
                media_type="text/event-stream",
            )
        return _openai_step_response(request.model, response)

    return app
335
+
336
+
337
async def _resolve_native_chat(
    native_runs: FusionRunManager,
    request: FusionRequest,
    config: FusionConfig,
) -> tuple[str, dict[str, Any]] | JSONResponse:
    """Run a chat request as a native fusion run and resolve its outcome.

    Returns:
        ``(final_output, metadata)`` when the run completed with an output,
        otherwise a ready-to-send OpenAI-style error ``JSONResponse``
        (409 for an idempotency conflict, 500 for every other failure).
    """
    outcome = await native_runs.create_and_run(
        _fusion_request_to_run_request(request, config)
    )
    inspection = outcome
    if isinstance(outcome, CreateRunResult):
        if outcome.idempotency_outcome == "conflict":
            return _openai_native_error_response(outcome.terminal_error, status_code=409)
        if outcome.run_id is None:
            return _openai_error_response(
                "run_not_available",
                "Native run did not return a run id.",
                status_code=500,
            )
        # A creation receipt carries no output; load the full inspection.
        inspection = native_runs.store.inspect_run(outcome.run_id)

    succeeded = inspection.state == "completed" and inspection.final_output is not None
    if not succeeded:
        return _openai_native_error_response(inspection.terminal_error, status_code=500)
    return inspection.final_output, _chat_fusion_metadata(inspection)
357
+
358
+
359
async def _chat_completion_sse(
    model: str,
    content: str,
    metadata: dict[str, Any],
) -> AsyncIterator[str]:
    """Replay an already-complete answer as OpenAI chat-completion SSE frames.

    Emits a role frame, one content frame per piece from ``_stream_pieces``,
    a final ``stop`` frame that also carries ``metadata`` under the
    ``fusionkit`` key, and the ``[DONE]`` sentinel.
    """
    stream_id = f"chatcmpl-{uuid.uuid4()}"
    started_at = int(time.time())

    def frame(delta: dict[str, Any], finish_reason: str | None = None) -> str:
        body: dict[str, Any] = {
            "id": stream_id,
            "object": "chat.completion.chunk",
            "created": started_at,
            "model": model,
            "choices": [
                {"index": 0, "delta": delta, "finish_reason": finish_reason},
            ],
        }
        # Only the terminating frame carries the FusionKit metadata.
        if finish_reason is not None:
            body["fusionkit"] = metadata
        return "data: " + json.dumps(body) + "\n\n"

    yield frame({"role": "assistant"})
    for fragment in _stream_pieces(content):
        yield frame({"content": fragment})
    yield frame({}, "stop")
    yield "data: [DONE]\n\n"
386
+
387
+
388
def _stream_pieces(content: str) -> list[str]:
    """Split ``content`` into streamable pieces that re-join to the original.

    Each piece is either a run of non-whitespace followed by its trailing
    whitespace, or a run of whitespace on its own (only possible at the start
    of the text). Empty content yields an empty list.
    """
    if content:
        # Every character belongs to exactly one match, so concatenating the
        # pieces reproduces the input exactly.
        return [found.group(0) for found in re.finditer(r"\S+\s*|\s+", content)]
    return []
394
+
395
+
396
def _create_run_manager(
    engine: FusionEngine,
    run_store_path: Path | None,
) -> FusionRunManager:
    """Create the default run manager backed by on-disk stores.

    The run store and the artifact store share one root directory:
    ``run_store_path`` when given, otherwise ``.fusionkit/runs``.
    """
    storage_root = run_store_path if run_store_path else Path(".fusionkit/runs")
    run_store = FileSystemRunStore(storage_root)
    artifact_store = LocalArtifactStore(storage_root)
    return FusionRunManager(engine, run_store, artifact_store)
406
+
407
+
408
def _create_run_payload(result: CreateRunResult) -> dict[str, Any]:
    """Build the JSON body describing a run-creation result."""
    copied_fields = (
        "run_id",
        "trace_id",
        "state",
        "status",
        "event_cursor",
        "idempotency_outcome",
    )
    body: dict[str, Any] = {name: getattr(result, name) for name in copied_fields}
    # The terminal error is a model (or None) and needs a JSON-mode dump.
    body["terminal_error"] = _dump_optional(result.terminal_error)
    return body
418
+
419
+
420
def _fusion_request_to_run_request(
    request: FusionRequest,
    config: FusionConfig,
) -> FusionRunRequestV1:
    """Translate a chat-completions request into a native run request.

    Sampling starts from ``config.sampling`` and is overridden by any of
    ``temperature``/``top_p``/``max_tokens`` the caller supplied. Only the
    ``role`` and ``content`` of each message are forwarded. The request hash
    covers the model name, messages, effective sampling and fusion options.
    """
    requested_sampling = (
        ("temperature", request.temperature),
        ("top_p", request.top_p),
        ("max_tokens", request.max_tokens),
    )
    overrides = {name: value for name, value in requested_sampling if value is not None}
    sampling = config.sampling.model_copy(update=overrides)

    payload = dict(contract_metadata("fusion-run-request.v1"))
    payload.update(
        {
            "request_id": make_id("chat_request"),
            "mode": _mode_from_request(request),
            "messages": [
                entry.model_dump(mode="json", include={"role", "content"})
                for entry in request.messages
            ],
            "sampling": sampling.model_dump(mode="json"),
            "sample_count": request.fusion.sample_count,
            "verify": request.fusion.verify,
            "requested_models": request.fusion.panel_models,
            "tool_policy": _tool_policy_from_options(request.fusion.tool_execution),
        }
    )
    hash_input = {
        "model": request.model,
        "messages": payload["messages"],
        "sampling": payload["sampling"],
        "fusion": request.fusion.model_dump(mode="json"),
    }
    payload["request_hash"] = hash_json(hash_input)
    return FusionRunRequestV1.model_validate(payload)
458
+
459
+
460
def _tool_policy_from_options(options: FusionToolExecutionOptions) -> str:
    """Map the request's tool-execution mode to a run-level tool policy.

    ``external`` becomes ``external_pause``, ``executor`` becomes ``allowed``,
    and any other mode becomes ``disabled``.
    """
    policy_by_mode = {"external": "external_pause", "executor": "allowed"}
    return policy_by_mode.get(options.mode, "disabled")
466
+
467
+
468
def _tool_execution_policy_from_options(options: FusionToolExecutionOptions) -> ToolExecutionPolicy:
    """Build a validated ``ToolExecutionPolicy`` from request-level options.

    The five option fields are copied verbatim under the same names.
    """
    copied_fields = (
        "mode",
        "allowed_side_effects",
        "environment",
        "policy_id",
        "dedupe_read_only",
    )
    raw_policy = {name: getattr(options, name) for name in copied_fields}
    return ToolExecutionPolicy.model_validate(raw_policy)
478
+
479
+
480
def _chat_fusion_metadata(inspection: RunInspection) -> dict[str, Any]:
    """Summarize a run inspection for the ``fusionkit`` block of a chat response."""
    candidate_model_ids = [entry.model_id for entry in inspection.candidates]
    summary: dict[str, Any] = {
        name: getattr(inspection, name)
        for name in ("run_id", "trace_id", "state", "status", "event_cursor")
    }
    summary["candidate_count"] = len(inspection.candidates)
    summary["candidate_model_ids"] = candidate_model_ids
    return summary
490
+
491
+
492
def _native_error_response(
    error: NativeRunError | None,
    status_code: int,
) -> JSONResponse:
    """Wrap a native run error in a ``{"error": ...}`` JSON response.

    When no error object is available, a generic non-retryable
    ``unknown_native_run_error`` is reported instead.
    """
    if not error:
        error = NativeRunError(
            error_kind="internal_error",
            error_code="unknown_native_run_error",
            retryable=False,
            owner="fusionkit",
            terminal_reason="unknown_native_run_error",
        )
    body = {"error": error.model_dump(mode="json")}
    return _json_response(body, status_code)
504
+
505
+
506
def _openai_native_error_response(
    error: NativeRunError | None,
    status_code: int,
) -> JSONResponse:
    """Render a native run error in the OpenAI error-body shape.

    The error code becomes the OpenAI ``code``; the message is the error's
    ``message`` when set, otherwise its ``terminal_reason``. A missing error
    is reported as a generic ``native_run_failed``.
    """
    if not error:
        error = NativeRunError(
            error_kind="internal_error",
            error_code="native_run_failed",
            retryable=False,
            owner="fusionkit",
            terminal_reason="native_run_failed",
        )
    code = error.error_code
    description = error.message or error.terminal_reason
    return _openai_error_response(code, description, status_code=status_code)
522
+
523
+
524
def _openai_error_response(
    error_code: str,
    message: str,
    status_code: int,
    error_type: str | None = None,
) -> JSONResponse:
    """Build an OpenAI-style ``{"error": {...}}`` JSON response.

    Args:
        error_code: Machine-readable code placed in ``error.code``.
        message: Human-readable description placed in ``error.message``.
        status_code: HTTP status of the response.
        error_type: Optional explicit ``error.type``. When omitted it is
            derived from ``status_code``: ``server_error`` for 5xx responses
            and ``invalid_request_error`` otherwise, so server/upstream
            failures are no longer reported as the caller's mistake.

    Returns:
        The JSON response carrying the error body.
    """
    if error_type is None:
        error_type = "server_error" if status_code >= 500 else "invalid_request_error"
    return _json_response(
        {
            "error": {
                "message": message,
                "type": error_type,
                "code": error_code,
            }
        },
        status_code=status_code,
    )
535
+
536
+
537
def _run_not_found_response() -> JSONResponse:
    """Return the 404 response used when a run id is unknown."""
    missing_run = NativeRunError(
        error_kind="validation_error",
        error_code="run_not_found",
        retryable=False,
        owner="fusionkit",
        terminal_reason="unknown_run",
    )
    return _native_error_response(missing_run, status_code=404)
548
+
549
+
550
def _json_response(payload: Any, status_code: int = 200) -> JSONResponse:
    """Wrap ``payload`` in a ``JSONResponse`` with the given status (default 200)."""
    response = JSONResponse(status_code=status_code, content=payload)
    return response
552
+
553
+
554
def _dump_optional(model: BaseModel | None) -> dict[str, Any] | None:
    """Return the JSON-mode dump of ``model``, or None when there is no model."""
    if model is None:
        return None
    return model.model_dump(mode="json")
556
+
557
+
558
def _mode_from_request(request: FusionRequest) -> FusionMode:
    """Pick the fusion mode for a chat request.

    An explicit ``fusion.mode`` wins. Otherwise the last ``/``-separated
    segment of the model name selects ``single``, ``self`` or ``panel``;
    anything else (including ``router``) resolves to ``router``.
    """
    explicit = request.fusion.mode
    if explicit is not None:
        return explicit
    _, _, tail = request.model.rpartition("/")
    return tail if tail in ("single", "self", "panel") else "router"
571
+
572
+
573
def _coerce_message_content(content: Any) -> str:
    """Flatten OpenAI message content into a plain string.

    Strings pass through. For a parts list, string parts and the string
    ``text`` of dict parts are concatenated in order; other parts are
    ignored. Any other content type yields an empty string.
    """
    if isinstance(content, str):
        return content
    if not isinstance(content, list):
        return ""

    def text_of(part: Any) -> str | None:
        if isinstance(part, str):
            return part
        if isinstance(part, dict):
            candidate = part.get("text")
            if isinstance(candidate, str):
                return candidate
        return None

    extracted = (text_of(part) for part in content)
    return "".join(text for text in extracted if text is not None)
585
+
586
+
587
def _to_chat_message(message: dict[str, Any]) -> ChatMessage:
    """Convert a raw OpenAI chat message dict into a FusionKit ``ChatMessage``.

    The role defaults to ``user`` and content is flattened to a string.
    ``tool_call_id`` and ``name`` are copied only when truthy. Tool calls may
    be OpenAI-nested (``{"function": {"name", "arguments"}}``) or already
    flat; both are converted to ``ToolCall`` objects.
    """

    def flatten(raw_call: Any) -> ToolCall:
        is_mapping = isinstance(raw_call, dict)
        spec = raw_call.get("function") if is_mapping and "function" in raw_call else raw_call
        if not isinstance(spec, dict):
            spec = {}
        return ToolCall(
            id=raw_call.get("id", "") if is_mapping else "",
            name=spec.get("name", ""),
            # Missing or empty arguments fall back to an empty JSON object.
            arguments=spec.get("arguments", "{}") or "{}",
        )

    fields: dict[str, Any] = {
        "role": message.get("role", "user"),
        "content": _coerce_message_content(message.get("content")),
    }
    for optional_key in ("tool_call_id", "name"):
        if message.get(optional_key):
            fields[optional_key] = message[optional_key]

    raw_calls = message.get("tool_calls")
    if raw_calls:
        converted: list[ToolCall] = [flatten(raw_call) for raw_call in raw_calls]
        if converted:
            fields["tool_calls"] = converted
    return ChatMessage(**fields)
616
+
617
+
618
def _normalize_tools(tools: list[dict[str, Any]] | None) -> list[dict[str, Any]] | None:
    """Flatten tool definitions to the ``{name, description, parameters}`` shape.

    Accepts OpenAI-nested (``{"type": "function", "function": {...}}``) and
    already-flat entries. Entries that are not dicts, or whose name is not a
    non-empty string, are dropped. Returns None when nothing usable remains.
    """
    flat: list[dict[str, Any]] = []
    for entry in tools or ():
        nested = isinstance(entry, dict) and "function" in entry
        spec = entry.get("function") if nested else entry
        if not isinstance(spec, dict):
            continue
        tool_name = spec.get("name", "")
        # Some agent CLIs advertise custom/freeform tools that resolve to an
        # empty name, which providers reject, so those are left out.
        if isinstance(tool_name, str) and tool_name:
            flat.append(
                {
                    "name": tool_name,
                    "description": spec.get("description", ""),
                    "parameters": spec.get("parameters", {"type": "object", "properties": {}}),
                }
            )
    return flat if flat else None
643
+
644
+
645
def _normalize_tool_choice(choice: str | dict[str, Any] | None) -> str | dict[str, Any] | None:
    """Normalize an OpenAI ``tool_choice`` value.

    None and strings pass through unchanged. A dict (nested under
    ``function`` or flat) is reduced to ``{"name": <name>}`` when it carries
    a non-empty string name; everything else becomes None.
    """
    if choice is None or isinstance(choice, str):
        return choice
    if not isinstance(choice, dict):
        return None
    target = choice.get("function", choice)
    if not isinstance(target, dict):
        return None
    chosen = target.get("name")
    if isinstance(chosen, str) and chosen:
        return {"name": chosen}
    return None
654
+
655
+
656
def _tool_calls_payload(response: ModelResponse) -> list[dict[str, Any]]:
    """Render the response's tool calls in OpenAI's nested ``function`` shape.

    A call without an id receives a positional ``call_<index>`` id.
    """
    rendered: list[dict[str, Any]] = []
    for position, tool_call in enumerate(response.tool_calls):
        call_id = tool_call.id or f"call_{position}"
        function_part = {"name": tool_call.name, "arguments": tool_call.arguments}
        rendered.append({"id": call_id, "type": "function", "function": function_part})
    return rendered
665
+
666
+
667
def _usage_dict(response: ModelResponse) -> dict[str, Any]:
    """Copy the response's token counters into an OpenAI ``usage`` dict."""
    usage = response.usage
    counters = ("prompt_tokens", "completion_tokens", "total_tokens")
    return {counter: getattr(usage, counter) for counter in counters}
673
+
674
+
675
def _openai_step_response(model: str, response: ModelResponse) -> dict[str, Any]:
    """Render a judge step as a non-streaming OpenAI chat completion.

    The finish reason is ``tool_calls`` whenever tool calls are present,
    otherwise the response's own finish reason, falling back to ``stop``.
    """
    calls = _tool_calls_payload(response)
    assistant_message: dict[str, Any] = {
        "role": "assistant",
        "content": response.content or "",
    }
    if calls:
        assistant_message["tool_calls"] = calls
        finish_reason = "tool_calls"
    else:
        finish_reason = response.finish_reason or "stop"
    choice = {"index": 0, "message": assistant_message, "finish_reason": finish_reason}
    return {
        "id": f"chatcmpl-{uuid.uuid4()}",
        "object": "chat.completion",
        "created": int(time.time()),
        "model": model,
        "choices": [choice],
        "usage": _usage_dict(response),
    }
689
+
690
+
691
async def _step_completion_sse(model: str, response: ModelResponse) -> AsyncIterator[str]:
    """Replay a completed judge step as OpenAI chat-completion SSE frames.

    Emits a role frame, content frames (when there is content), a single
    frame holding all tool calls with their indices (when there are any), a
    terminating frame with the finish reason, and the ``[DONE]`` sentinel.
    """
    stream_id = f"chatcmpl-{uuid.uuid4()}"
    started_at = int(time.time())
    calls = _tool_calls_payload(response)
    if calls:
        final_reason = "tool_calls"
    else:
        final_reason = response.finish_reason or "stop"

    def frame(delta: dict[str, Any], finish: str | None = None) -> str:
        body = {
            "id": stream_id,
            "object": "chat.completion.chunk",
            "created": started_at,
            "model": model,
            "choices": [{"index": 0, "delta": delta, "finish_reason": finish}],
        }
        return "data: " + json.dumps(body) + "\n\n"

    yield frame({"role": "assistant"})
    if response.content:
        for fragment in _stream_pieces(response.content):
            yield frame({"content": fragment})
    if calls:
        indexed_calls = [
            {"index": position, **entry} for position, entry in enumerate(calls)
        ]
        yield frame({"tool_calls": indexed_calls})
    yield frame({}, final_reason)
    yield "data: [DONE]\n\n"
722
+
723
+
724
def _openai_chat_response(model: str, content: str, metadata: dict[str, Any]) -> dict[str, Any]:
    """Build a non-streaming OpenAI chat completion for a finished fusion run.

    Token usage is not tracked on this path, so all usage counters are None.
    The run metadata is attached under the ``fusionkit`` key.
    """
    assistant_message = {"role": "assistant", "content": content}
    only_choice = {"index": 0, "finish_reason": "stop", "message": assistant_message}
    unknown_usage = dict.fromkeys(
        ("prompt_tokens", "completion_tokens", "total_tokens")
    )
    return {
        "id": f"chatcmpl-{uuid.uuid4()}",
        "object": "chat.completion",
        "created": int(time.time()),
        "model": model,
        "choices": [only_choice],
        "usage": unknown_usage,
        "fusionkit": metadata,
    }
@@ -0,0 +1,296 @@
1
+ """Front one model endpoint (any provider) as an OpenAI Chat Completions server.
2
+
3
+ This is the cloud analogue of the local MLX server: it serves a single
4
+ ``/v1/chat/completions`` endpoint backed by FusionKit's provider clients
5
+ (``build_client``), so an OpenAI/Anthropic/Google model can be consumed by any
6
+ OpenAI-compatible caller (for example, HandoffKit's per-candidate coding
7
+ harness). One process fronts exactly one model, which keeps per-model routing
8
+ trivial for the caller.
9
+
10
+ Single-threaded on purpose (one model, low volume); each request runs the async
11
+ provider client in a fresh event loop so the SDK's HTTP client is bound to the
12
+ loop that drives it. Shared by the ``fusionkit serve-endpoint`` CLI command and
13
+ ``scripts/simple_openai_server.py``.
14
+ """
15
+ from __future__ import annotations
16
+
17
+ import asyncio
18
+ import json
19
+ import time
20
+ import traceback
21
+ import uuid
22
+ from http.server import BaseHTTPRequestHandler, HTTPServer
23
+ from typing import Any, cast
24
+
25
+ from fusionkit_core.clients import build_client
26
+ from fusionkit_core.config import ModelEndpoint, ProviderKind, SamplingConfig
27
+ from fusionkit_core.trace import (
28
+ TRACE_CANDIDATE_HEADER,
29
+ TRACE_ID_HEADER,
30
+ TRACE_SPAN_HEADER,
31
+ new_span_id,
32
+ )
33
+ from fusionkit_core.trace import emit as trace_emit
34
+ from fusionkit_core.types import ChatMessage, ToolCall
35
+
36
# Default base URL per provider name, used when the operator does not pass an
# explicit base URL. The OpenAI client appends `/v1`; the Anthropic SDK takes
# the root. Providers missing from this table are handled by the caller's own
# fallback (see `build_endpoint`).
PROVIDER_DEFAULT_BASE_URL = {
    "openai": "https://api.openai.com",
    "anthropic": "https://api.anthropic.com",
    "google": "https://generativelanguage.googleapis.com",
}
43
+
44
+
45
def _flatten_content(content: Any) -> str:
    """Collapse OpenAI message content into plain text.

    Chat Completions allows ``content`` to be either a string or a list of
    typed parts. Text parts are concatenated in order; non-text parts and any
    other value (including ``None``) contribute nothing.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        return "".join(
            part["text"]
            for part in content
            if isinstance(part, dict)
            and part.get("type") == "text"
            and isinstance(part.get("text"), str)
        )
    return ""


def _to_chat_message(message: dict[str, Any]) -> ChatMessage:
    """Convert one OpenAI-style message dict into a ``ChatMessage``.

    Args:
        message: A message object from the request's ``messages`` array.

    Returns:
        A ``ChatMessage`` whose role defaults to ``"user"`` and whose content is
        always a string. ``tool_call_id`` and ``tool_calls`` are passed through
        only when present and non-empty.
    """
    fields: dict[str, Any] = {
        "role": message.get("role", "user"),
        # List-of-parts content used to be dropped to ""; keep its text instead.
        "content": _flatten_content(message.get("content")),
    }
    if message.get("tool_call_id"):
        fields["tool_call_id"] = message["tool_call_id"]

    tool_calls = message.get("tool_calls")
    if tool_calls:
        converted = []
        for call in tool_calls:
            # Tolerate an explicit `"function": null` rather than crashing.
            function = call.get("function") or {}
            converted.append(
                ToolCall(
                    id=call.get("id", ""),
                    name=function.get("name", ""),
                    arguments=function.get("arguments", "{}"),
                )
            )
        fields["tool_calls"] = converted
    return ChatMessage(**fields)
64
+
65
+
66
+ def _to_tools(tools: Any) -> list[dict[str, Any]] | None:
67
+ if not tools:
68
+ return None
69
+ converted = []
70
+ for entry in tools:
71
+ function = entry.get("function", entry)
72
+ converted.append(
73
+ {
74
+ "name": function.get("name", ""),
75
+ "description": function.get("description", ""),
76
+ "parameters": function.get("parameters", {"type": "object", "properties": {}}),
77
+ }
78
+ )
79
+ return converted
80
+
81
+
82
def make_handler(endpoint: ModelEndpoint) -> type[BaseHTTPRequestHandler]:
    """Build a request-handler class that fronts ``endpoint``.

    The returned class serves:

    * ``GET /health`` and ``GET /v1/health``: a status payload naming the model.
    * ``GET /v1/models``: a one-element model list.
    * ``POST /v1/chat/completions``: a non-streaming chat completion produced
      by the provider client built from ``endpoint``.

    Any other path gets a 404 JSON error body. Any exception raised while
    handling a chat completion is printed, traced, and returned as a 500 with
    an OpenAI-style ``error`` object.

    Args:
        endpoint: The single model endpoint this handler proxies.

    Returns:
        A ``BaseHTTPRequestHandler`` subclass closed over ``endpoint``.
    """

    class Handler(BaseHTTPRequestHandler):
        server_version = "fusionkit-openai-bridge/0.1"

        def log_message(self, format: str, *args: Any) -> None:
            # Write the access log via print() with an explicit flush.
            prefix = f"{self.address_string()} - - [{self.log_date_time_string()}] "
            print(prefix + format % args, flush=True)

        def _send_json(self, status: int, payload: dict[str, Any]) -> None:
            """Send ``payload`` as a complete JSON response with ``status``."""
            body = json.dumps(payload).encode("utf-8")
            self.send_response(status)
            self.send_header("content-type", "application/json")
            self.send_header("content-length", str(len(body)))
            self.end_headers()
            self.wfile.write(body)

        def do_GET(self) -> None:
            """Serve the health and model-listing endpoints."""
            if self.path in ("/health", "/v1/health"):
                self._send_json(
                    200,
                    {"status": "ok", "model": endpoint.id, "provider": endpoint.provider},
                )
                return
            if self.path == "/v1/models":
                self._send_json(
                    200,
                    {
                        "object": "list",
                        "data": [
                            {"id": endpoint.id, "object": "model", "owned_by": endpoint.provider}
                        ],
                    },
                )
                return
            self._send_json(404, {"error": {"message": "not found"}})

        def do_POST(self) -> None:
            """Handle ``POST /v1/chat/completions`` for the fronted model."""
            if self.path != "/v1/chat/completions":
                self._send_json(404, {"error": {"message": "not found"}})
                return
            trace_id = self.headers.get(TRACE_ID_HEADER)
            candidate_id = self.headers.get(TRACE_CANDIDATE_HEADER)
            parent_span = self.headers.get(TRACE_SPAN_HEADER)
            call_span = new_span_id()
            try:
                length = int(self.headers.get("content-length", "0"))
                request = json.loads(self.rfile.read(length).decode("utf-8"))
                messages = [
                    _to_chat_message(message) for message in (request.get("messages") or [])
                ]
                tools = _to_tools(request.get("tools"))
                tool_choice = request.get("tool_choice")
                # Only a missing/null value falls back to the default. An
                # explicit 0 (e.g. temperature=0 for deterministic output) is
                # a legitimate request and must not be replaced.
                temperature = request.get("temperature")
                top_p = request.get("top_p")
                sampling = SamplingConfig(
                    temperature=0.2 if temperature is None else float(temperature),
                    top_p=0.95 if top_p is None else float(top_p),
                    # A zero/null token budget is not usable; keep the default.
                    max_tokens=int(request.get("max_tokens", 1024) or 1024),
                )
                trace_emit(
                    component="panel-model",
                    event_type="model.call.started",
                    trace_id=trace_id,
                    span_id=call_span,
                    parent_span_id=parent_span,
                    candidate_id=candidate_id,
                    model_id=endpoint.id,
                    payload={
                        "model": endpoint.model,
                        "provider": endpoint.provider,
                        "message_count": len(messages),
                        "tool_count": len(tools) if tools else 0,
                    },
                )

                async def run() -> Any:
                    # Build the client inside the coroutine so it is created on
                    # the event loop that asyncio.run() starts for this request.
                    client = build_client(endpoint)
                    return await client.chat(
                        messages,
                        sampling,
                        tools=tools,
                        # Only string tool_choice values are forwarded.
                        tool_choice=tool_choice if isinstance(tool_choice, str) else None,
                    )

                started = time.perf_counter()
                response = asyncio.run(run())
                latency_s = time.perf_counter() - started
                # Tool calls take precedence over the reported finish reason.
                if response.tool_calls:
                    finish_reason = "tool_calls"
                else:
                    finish_reason = response.finish_reason or "stop"
                trace_emit(
                    component="panel-model",
                    event_type="model.call.finished",
                    trace_id=trace_id,
                    span_id=call_span,
                    parent_span_id=parent_span,
                    candidate_id=candidate_id,
                    model_id=endpoint.id,
                    payload={
                        "model": endpoint.model,
                        "provider": endpoint.provider,
                        "latency_s": round(latency_s, 3),
                        "finish_reason": finish_reason,
                        "tool_call_count": len(response.tool_calls),
                        "content_preview": (response.content or "")[:400],
                        "usage": {
                            "prompt_tokens": response.usage.prompt_tokens,
                            "completion_tokens": response.usage.completion_tokens,
                            "total_tokens": response.usage.total_tokens,
                        },
                    },
                )
                print(
                    json.dumps(
                        {
                            "event": "chat_completion",
                            "model": endpoint.id,
                            "provider": endpoint.provider,
                            "latency_s": round(latency_s, 3),
                        }
                    ),
                    flush=True,
                )
                message_body: dict[str, Any] = {"role": "assistant", "content": response.content}
                if response.tool_calls:
                    message_body["tool_calls"] = [
                        {
                            # Synthesize an id when the provider did not supply one.
                            "id": call.id or f"call_{uuid.uuid4().hex[:8]}",
                            "type": "function",
                            "function": {"name": call.name, "arguments": call.arguments},
                        }
                        for call in response.tool_calls
                    ]
                self._send_json(
                    200,
                    {
                        "id": f"chatcmpl-{uuid.uuid4()}",
                        "object": "chat.completion",
                        "created": int(time.time()),
                        "model": endpoint.model,
                        "choices": [
                            {
                                "index": 0,
                                "message": message_body,
                                "finish_reason": finish_reason,
                            }
                        ],
                        "usage": {
                            "prompt_tokens": response.usage.prompt_tokens,
                            "completion_tokens": response.usage.completion_tokens,
                            "total_tokens": response.usage.total_tokens,
                        },
                    },
                )
            except Exception as exc:  # noqa: BLE001 - surface as an OpenAI error body
                traceback.print_exc()
                trace_emit(
                    component="panel-model",
                    event_type="model.call.finished",
                    trace_id=trace_id,
                    span_id=call_span,
                    parent_span_id=parent_span,
                    candidate_id=candidate_id,
                    model_id=endpoint.id,
                    payload={
                        "model": endpoint.model,
                        "provider": endpoint.provider,
                        "error": str(exc),
                        "error_type": exc.__class__.__name__,
                    },
                )
                self._send_json(
                    500,
                    {"error": {"message": str(exc), "type": exc.__class__.__name__}},
                )

    return Handler
258
+
259
+
260
def build_endpoint(
    *,
    id: str,
    model: str,
    provider: str = "openai",
    base_url: str | None = None,
    api_key_env: str | None = None,
    timeout_s: float = 120.0,
) -> ModelEndpoint:
    """Assemble a ``ModelEndpoint`` from loosely typed (CLI-style) arguments.

    Args:
        id: Endpoint identifier.
        model: Provider-side model name.
        provider: Provider name as a free-form string.
        base_url: Explicit base URL; when empty or ``None`` the provider's entry
            in ``PROVIDER_DEFAULT_BASE_URL`` is used, or ``http://127.0.0.1``
            for a provider with no entry.
        api_key_env: Passed through to ``ModelEndpoint`` unchanged.
        timeout_s: Passed through to ``ModelEndpoint`` unchanged.

    Returns:
        The constructed ``ModelEndpoint``.
    """
    if base_url:
        effective_base_url = base_url
    else:
        effective_base_url = PROVIDER_DEFAULT_BASE_URL.get(provider, "http://127.0.0.1")

    # `provider` arrives as a free string from the CLI; ModelEndpoint validates
    # it against ProviderKind at construction time (pydantic raises on misuse).
    # The cast only satisfies the type checker.
    typed_provider = cast(ProviderKind, provider)
    return ModelEndpoint(
        id=id,
        model=model,
        base_url=effective_base_url,
        provider=typed_provider,
        api_key_env=api_key_env,
        timeout_s=timeout_s,
    )
280
+
281
+
282
def serve_single_endpoint(endpoint: ModelEndpoint, *, host: str = "127.0.0.1", port: int) -> None:
    """Serve ``endpoint`` over HTTP until the process is interrupted.

    Prints a ``starting`` JSON line, binds an ``HTTPServer`` on
    ``(host, port)``, prints a ``listening`` JSON line, then blocks in
    ``serve_forever()``.

    Args:
        endpoint: The model endpoint to front.
        host: Interface to bind.
        port: TCP port to bind (keyword-only, required).
    """
    print(
        json.dumps(
            {
                "event": "starting",
                "id": endpoint.id,
                "provider": endpoint.provider,
                "model": endpoint.model,
            }
        ),
        flush=True,
    )
    # The context manager calls server_close() on exit, so the listening socket
    # is released even when serve_forever() is interrupted by an exception.
    with HTTPServer((host, port), make_handler(endpoint)) as server:
        print(json.dumps({"event": "listening", "host": host, "port": port}), flush=True)
        server.serve_forever()