opencode-llmstack 0.6.0__py3-none-any.whl

@@ -0,0 +1,790 @@
+ """AWS Bedrock backend for the auto-router.
+
+ What this does
+ ==============
+
+ A tier in ``models.ini`` declared with ``backend = bedrock`` (or just
+ ``aws_model_id = ...``) is *not* loaded by llama-swap. Instead, when the
+ router selects that tier, this module:
+
+ 1. Builds a per-tier ``boto3`` ``bedrock-runtime`` client using the
+    credentials from that tier's :class:`~llmstack.tiers.BedrockConfig`
+    (region/profile/explicit creds/assume-role -- whichever the
+    operator declared, falling back to boto3's default chain).
+ 2. Translates the inbound OpenAI-style chat/completions body to
+    Bedrock's `Converse`_ shape (``system``, ``messages``,
+    ``inferenceConfig``, ``toolConfig``).
+ 3. Calls :py:meth:`bedrock-runtime.converse_stream` (streaming) or
+    :py:meth:`bedrock-runtime.converse` (non-streaming).
+ 4. Translates the response back to OpenAI's chat-completion or SSE
+    ``chat.completion.chunk`` format so existing clients (opencode,
+    curl, anything pointed at the router) don't have to know Bedrock
+    exists.
+
+ Each tier gets its own client + session because each tier may live in
+ a different region / AWS account / role -- credentials are scoped to
+ the tier, not globalised in ``[DEFAULT]``.
+
+ .. _Converse: https://docs.aws.amazon.com/bedrock/latest/APIReference/API_runtime_Converse.html
+
+ Limitations
+ ===========
+
+ Text in / text out + tool calling. Multimodal (image) parts are passed
+ through as text where possible and dropped silently otherwise -- the
+ local stack is text-first and that's the 95% case for the agent loop
+ opencode drives.
+ """
+
+ from __future__ import annotations
+
+ import asyncio
+ import json
+ import logging
+ import os
+ import time
+ import uuid
+ from collections.abc import AsyncIterator
+ from threading import Lock
+ from typing import Any
+
+ from fastapi import Request
+ from fastapi.responses import JSONResponse, StreamingResponse
+
+ from llmstack.tiers import BedrockConfig, Tier
+
+ log = logging.getLogger("router.bedrock")
+
+ USE_NEXT_ENV = "LLMSTACK_USE_NEXT"
+
+ # Lazy boto3 import: don't make every llmstack action depend on the AWS
+ # SDK -- only the router needs it, and only when a bedrock tier is hit.
+ _boto3 = None  # type: ignore[var-annotated]
+ _botocore = None  # type: ignore[var-annotated]
+ _clients: dict[str, Any] = {}
+ _clients_lock = Lock()
+
+
+ def _use_next() -> bool:
+     """Read the ``--next`` channel flag from the router's environment.
+
+     ``llmstack start --next`` exports ``LLMSTACK_USE_NEXT=1`` to the
+     router subprocess so that bedrock tiers swap to ``aws_model_id_next``
+     in lock-step with gguf tiers swapping to ``hf_file_next``.
+     """
+     return os.environ.get(USE_NEXT_ENV, "").strip().lower() in ("1", "true", "yes", "on")
+
+
+ class BedrockUnavailableError(RuntimeError):
+     """Raised when boto3 is not installed but a bedrock tier was hit."""
+
+
+ def _require_boto3() -> tuple[Any, Any]:
+     global _boto3, _botocore
+     if _boto3 is not None and _botocore is not None:
+         return _boto3, _botocore
+     try:
+         import boto3 as _b  # type: ignore[import-not-found]
+         import botocore  # type: ignore[import-not-found]
+     except ImportError as exc:  # pragma: no cover - import-time only
+         raise BedrockUnavailableError(
+             "boto3 is required for bedrock-backed tiers; install with "
+             "`pip install 'llmstack[bedrock]'`"
+         ) from exc
+     _boto3, _botocore = _b, botocore
+     return _b, botocore
+
+
+ def _client_cache_key(cfg: BedrockConfig) -> str:
+     """One client per distinct (profile, region, endpoint) tuple.
+
+     Two tiers that point at the same profile + region collapse onto a
+     single boto3 client; switching channel (current/next) builds a new
+     client only when the next channel resolves to a different region --
+     ``model_id`` itself is not part of the key.
+     """
+     return "|".join([
+         cfg.profile or "",
+         cfg.region or "",
+         cfg.endpoint_url or "",
+     ])
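+
+ # E.g. (illustrative, assuming BedrockConfig is keyword-constructible):
+ #   _client_cache_key(BedrockConfig(profile="prod", region="us-east-1", ...))
+ #   -> "prod|us-east-1|"
+ # Any second tier resolving to the same triple reuses the cached client.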
+
+
+ def _build_client(cfg: BedrockConfig):
+     """Construct a ``bedrock-runtime`` client for a tier.
+
+     All credential resolution (long-term keys, SSO, role chaining via
+     ``role_arn`` + ``source_profile`` in ``~/.aws/config``, MFA, IMDS)
+     is delegated to boto3 by passing ``profile_name``. We never touch
+     raw secrets here.
+     """
+     boto3, botocore = _require_boto3()
+
+     session_kwargs: dict[str, Any] = {}
+     if cfg.profile:
+         session_kwargs["profile_name"] = cfg.profile
+     if cfg.region:
+         session_kwargs["region_name"] = cfg.region
+     session = boto3.session.Session(**session_kwargs)
+
+     client_kwargs: dict[str, Any] = {}
+     if cfg.endpoint_url:
+         client_kwargs["endpoint_url"] = cfg.endpoint_url
+     # Bedrock InvokeModelWithResponseStream / Converse can hold the
+     # connection open for a while on slow models -- give it a generous
+     # read timeout while keeping connect tight.
+     client_kwargs["config"] = botocore.config.Config(
+         connect_timeout=10,
+         read_timeout=600,
+         retries={"max_attempts": 2, "mode": "standard"},
+     )
+     return session.client("bedrock-runtime", **client_kwargs)
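+
+ # For example, a tier with ``profile = prod-bedrock`` can chain through a
+ # role declared entirely in ~/.aws/config (standard boto3 behaviour; the
+ # profile names and ARN below are illustrative):
+ #
+ #   [profile prod-bedrock]
+ #   role_arn       = arn:aws:iam::123456789012:role/BedrockInvoke
+ #   source_profile = base
+ #   region         = us-east-1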
141
+
142
+
143
+ def get_client(cfg: BedrockConfig):
144
+ """Return a process-wide cached client for the given tier config."""
145
+ key = _client_cache_key(cfg)
146
+ with _clients_lock:
147
+ c = _clients.get(key)
148
+ if c is not None:
149
+ return c
150
+ c = _build_client(cfg)
151
+ _clients[key] = c
152
+ return c
153
+
154
+
155
+ # ---------------------------------------------------------------------------
156
+ # OpenAI -> Bedrock Converse translation
157
+ # ---------------------------------------------------------------------------
158
+
159
+ def _coerce_text(content: Any) -> str:
160
+ """Turn an OpenAI message ``content`` into a plain string.
161
+
162
+ OpenAI accepts either a string or an array of typed parts. We keep
163
+ only the ``text`` parts -- multimodal blobs (``image_url`` etc.) are
164
+ dropped since the Bedrock text models we target won't accept them
165
+ anyway and translating multimodal end-to-end is out of scope.
166
+ """
167
+ if content is None:
168
+ return ""
169
+ if isinstance(content, str):
170
+ return content
171
+ if isinstance(content, list):
172
+ bits: list[str] = []
173
+ for part in content:
174
+ if isinstance(part, dict):
175
+ t = part.get("text")
176
+ if isinstance(t, str):
177
+ bits.append(t)
178
+ return "\n".join(bits)
179
+ return str(content)
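+
+ # Doctest-style sketch of the flattening:
+ #   _coerce_text([{"type": "text", "text": "hi"},
+ #                 {"type": "image_url", "image_url": {"url": "..."}},
+ #                 {"type": "text", "text": "there"}])
+ #   -> "hi\nthere"    (the image_url part is silently dropped)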
180
+
181
+
182
+ def _system_blocks(messages: list[dict[str, Any]]) -> list[dict[str, str]]:
183
+ out: list[dict[str, str]] = []
184
+ for m in messages:
185
+ if m.get("role") != "system":
186
+ continue
187
+ text = _coerce_text(m.get("content"))
188
+ if text:
189
+ out.append({"text": text})
190
+ return out
191
+
192
+
193
+ _ORPHAN_TOOL_RESULT_TEXT = (
194
+ "(no result; tool call was cancelled or interrupted -- treat as failed)"
195
+ )
196
+
197
+
198
+ def _converse_messages(messages: list[dict[str, Any]]) -> list[dict[str, Any]]:
199
+ """Translate the non-system slice of OpenAI messages to Converse shape.
200
+
201
+ Tool calls (assistant) and tool results (``role: "tool"``) round-trip
202
+ via ``toolUse`` / ``toolResult`` content blocks. The OpenAI shape and
203
+ the Bedrock Converse shape disagree on two things, so we normalise:
204
+
205
+ 1. **Tool results merge into the next user turn.** OpenAI emits one
206
+ ``role: "tool"`` message per tool result; Converse expects all
207
+ toolResults from a single assistant turn to live in *one*
208
+ following user turn (along with any subsequent user text).
209
+ Without this, Bedrock 400s with
210
+ ``Expected toolResult blocks at messages.N.content``.
211
+
212
+ 2. **Strict role alternation.** Bedrock rejects consecutive
213
+ same-role turns. We collapse any run of consecutive user (or
214
+ tool-as-user) messages into a single user message by
215
+ concatenating their content blocks.
216
+
217
+ On top of that we **inject stub toolResults for orphan toolUse
218
+ blocks** -- assistant turns whose tool_calls were never resolved
219
+ (user cancelled, transport dropped the result, etc.). Without the
220
+ stub, Bedrock surfaces the same "Expected toolResult blocks" error
221
+ even though the missing resolution is the *previous* run's fault.
222
+ Stubs carry ``status: "error"`` so the model knows the call failed
223
+ rather than silently treating an empty payload as success.
224
+ """
225
+ raw: list[tuple[str, list[dict[str, Any]]]] = []
226
+
227
+ for m in messages:
228
+ role = m.get("role")
229
+ if role == "system":
230
+ continue
231
+
232
+ if role == "tool":
233
+ tool_call_id = m.get("tool_call_id") or m.get("id") or ""
234
+ text = _coerce_text(m.get("content"))
235
+ raw.append(("user", [{
236
+ "toolResult": {
237
+ "toolUseId": tool_call_id,
238
+ "content": [{"text": text}],
239
+ },
240
+ }]))
241
+ continue
242
+
243
+ blocks: list[dict[str, Any]] = []
244
+ text = _coerce_text(m.get("content"))
245
+ if text:
246
+ blocks.append({"text": text})
247
+
248
+ if role == "assistant":
249
+ for tc in m.get("tool_calls") or []:
250
+ fn = (tc or {}).get("function") or {}
251
+ name = fn.get("name") or ""
252
+ raw_args = fn.get("arguments") or "{}"
253
+ try:
254
+ parsed = json.loads(raw_args) if isinstance(raw_args, str) else raw_args
255
+ except json.JSONDecodeError:
256
+ parsed = {"_raw": raw_args}
257
+ blocks.append({
258
+ "toolUse": {
259
+ "toolUseId": tc.get("id") or f"tool_{uuid.uuid4().hex[:12]}",
260
+ "name": name,
261
+ "input": parsed if isinstance(parsed, dict) else {"value": parsed},
262
+ },
263
+ })
264
+
265
+ if not blocks:
266
+ continue
267
+ raw.append(("assistant" if role == "assistant" else "user", blocks))
268
+
269
+ # Pass 1: collapse runs of same-role messages into one. Tool results
270
+ # already arrive as ``("user", [...])`` entries above, so this naturally
271
+ # gathers them with each other and with any following user text.
272
+ merged: list[list[Any]] = []
273
+ for role, blocks in raw:
274
+ if merged and merged[-1][0] == role:
275
+ merged[-1][1].extend(blocks)
276
+ else:
277
+ merged.append([role, list(blocks)])
278
+
279
+ # Pass 2: for every assistant turn that emits toolUse blocks, ensure
280
+ # the next user turn carries a matching toolResult for each id. Inject
281
+ # a stub error result for any orphan id; create a stub user turn if
282
+ # none follows at all.
283
+ i = 0
284
+ while i < len(merged):
285
+ role, blocks = merged[i]
286
+ if role != "assistant":
287
+ i += 1
288
+ continue
289
+ tool_use_ids = [
290
+ (b["toolUse"] or {}).get("toolUseId")
291
+ for b in blocks
292
+ if isinstance(b, dict) and "toolUse" in b
293
+ ]
294
+ tool_use_ids = [tid for tid in tool_use_ids if tid]
295
+ if not tool_use_ids:
296
+ i += 1
297
+ continue
298
+
299
+ if i + 1 >= len(merged) or merged[i + 1][0] != "user":
300
+ merged.insert(i + 1, ["user", []])
301
+ next_blocks = merged[i + 1][1]
302
+ provided = {
303
+ (b["toolResult"] or {}).get("toolUseId")
304
+ for b in next_blocks
305
+ if isinstance(b, dict) and "toolResult" in b
306
+ }
307
+ # Prepend any missing stubs so toolResults sit before user text,
308
+ # which matches what callers naturally produce.
309
+ stubs: list[dict[str, Any]] = []
310
+ for tid in tool_use_ids:
311
+ if tid in provided:
312
+ continue
313
+ stubs.append({
314
+ "toolResult": {
315
+ "toolUseId": tid,
316
+ "content": [{"text": _ORPHAN_TOOL_RESULT_TEXT}],
317
+ "status": "error",
318
+ },
319
+ })
320
+ if stubs:
321
+ log.debug(
322
+ "bedrock: injected %d orphan toolResult stub(s) for ids=%s",
323
+ len(stubs), [s["toolResult"]["toolUseId"] for s in stubs],
324
+ )
325
+ merged[i + 1][1] = stubs + next_blocks
326
+ i += 1
327
+
328
+ return [
329
+ {"role": role, "content": blocks}
330
+ for role, blocks in merged
331
+ if blocks
332
+ ]
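+
+ # Worked sketch of the normalisation (values illustrative):
+ #
+ #   _converse_messages([
+ #       {"role": "user", "content": "list files"},
+ #       {"role": "assistant", "tool_calls": [{"id": "t1", "type": "function",
+ #           "function": {"name": "ls", "arguments": "{\"path\": \".\"}"}}]},
+ #       {"role": "tool", "tool_call_id": "t1", "content": "a.py b.py"},
+ #       {"role": "user", "content": "now read a.py"},
+ #   ])
+ #
+ # yields strictly alternating turns, with the tool result folded into the
+ # same user turn as the follow-up text:
+ #
+ #   [{"role": "user", "content": [{"text": "list files"}]},
+ #    {"role": "assistant", "content": [{"toolUse": {"toolUseId": "t1",
+ #        "name": "ls", "input": {"path": "."}}}]},
+ #    {"role": "user", "content": [
+ #        {"toolResult": {"toolUseId": "t1", "content": [{"text": "a.py b.py"}]}},
+ #        {"text": "now read a.py"}]}]
+ #
+ # Had the ``role: "tool"`` message been missing, pass 2 would instead
+ # prepend a ``status: "error"`` stub toolResult for "t1".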
+
+
+ def _messages_reference_tools(converse_messages: list[dict[str, Any]]) -> set[str]:
+     """Return the set of tool *names* referenced by toolUse blocks in history.
+
+     Used to synthesise a minimum ``toolConfig`` when the inbound request
+     body has no ``tools`` array but the message history replays prior
+     tool calls -- Bedrock rejects that combination outright with
+     ``The toolConfig field must be defined when using toolUse and
+     toolResult content blocks``. ToolResult blocks only carry the tool
+     *id*, not the name, so we recover names from the matching toolUse
+     blocks earlier in the conversation.
+     """
+     names: set[str] = set()
+     for m in converse_messages:
+         for b in m.get("content") or []:
+             if not isinstance(b, dict):
+                 continue
+             tu = b.get("toolUse")
+             if isinstance(tu, dict):
+                 name = tu.get("name")
+                 if isinstance(name, str) and name:
+                     names.add(name)
+     return names
+
+
+ def _stub_tool_config(names: set[str]) -> dict[str, Any]:
+     """Minimum-viable ``toolConfig`` for replaying tool history.
+
+     The schema is permissive (``{"type": "object"}``) since we're only
+     declaring tools to satisfy Bedrock's validator -- the model is meant
+     to summarise / continue, not invoke a fresh call. If it does call
+     one, opencode will resolve it on the next loop with the real schema
+     in scope.
+     """
+     return {
+         "tools": [
+             {
+                 "toolSpec": {
+                     "name": name,
+                     "description": "(replayed from history; schema unavailable)",
+                     "inputSchema": {"json": {"type": "object"}},
+                 },
+             }
+             for name in sorted(names)
+         ],
+     }
+
+
+ def _tool_config(tools: list[dict[str, Any]] | None) -> dict[str, Any] | None:
+     if not tools:
+         return None
+     specs: list[dict[str, Any]] = []
+     for t in tools:
+         fn = (t or {}).get("function") or {}
+         name = fn.get("name")
+         if not name:
+             continue
+         specs.append({
+             "toolSpec": {
+                 "name": name,
+                 "description": fn.get("description") or "",
+                 "inputSchema": {"json": fn.get("parameters") or {"type": "object"}},
+             }
+         })
+     if not specs:
+         return None
+     return {"tools": specs}
+
+
+ def _inference_config(body: dict[str, Any]) -> dict[str, Any]:
+     # We forward only what the Converse `inferenceConfig` schema accepts:
+     # `temperature`, `topP`, `maxTokens`, `stopSequences`. Other sampler
+     # knobs (`top_k`, `min_p`, `repetition_penalty`) have no Converse-
+     # standard mapping and are silently dropped here -- they're llama.cpp
+     # extensions used only by local GGUF tiers.
+     #
+     # Per-model rules about which of these are valid (e.g. Claude Opus
+     # 4.7 rejects ALL sampler params; Claude Sonnet 4.5 accepts either
+     # `temperature` or `top_p` but not both) are NOT enforced here. They
+     # live in models.ini -- whichever sampler keys are declared on the
+     # tier are what the router injects into the body, and that's what we
+     # forward. Configure Bedrock tiers in models.ini accordingly: omit
+     # the `sampler =` line for Opus 4.7+, and pick the one allowed knob
+     # for Sonnet 4.5 / Haiku 4.5.
+     cfg: dict[str, Any] = {}
+     if "temperature" in body:
+         try:
+             cfg["temperature"] = float(body["temperature"])
+         except (TypeError, ValueError):
+             pass
+     if "top_p" in body:
+         try:
+             cfg["topP"] = float(body["top_p"])
+         except (TypeError, ValueError):
+             pass
+     if "max_tokens" in body or "max_completion_tokens" in body:
+         try:
+             cfg["maxTokens"] = int(body.get("max_tokens") or body.get("max_completion_tokens"))
+         except (TypeError, ValueError):
+             pass
+     stop = body.get("stop")
+     if isinstance(stop, str):
+         cfg["stopSequences"] = [stop]
+     elif isinstance(stop, list):
+         cfg["stopSequences"] = [s for s in stop if isinstance(s, str)]
+     return cfg
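+
+ # E.g. (illustrative):
+ #   _inference_config({"temperature": 0.2, "top_p": 0.9, "max_tokens": 1024,
+ #                      "stop": ["</s>"], "top_k": 40})
+ #   -> {"temperature": 0.2, "topP": 0.9, "maxTokens": 1024,
+ #       "stopSequences": ["</s>"]}      # top_k is dropped, as noted above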
+
+
+ def _build_converse_kwargs(tier: Tier, body: dict[str, Any], cfg: BedrockConfig) -> dict[str, Any]:
+     """OpenAI-style request body -> kwargs for ``converse[_stream]``.
+
+     ``cfg`` is the channel-resolved :class:`BedrockConfig` (current vs.
+     next), passed in so the caller controls the channel and we don't
+     re-read the env mid-call.
+     """
+     assert tier.bedrock is not None
+     messages = body.get("messages")
+     if not isinstance(messages, list):
+         # /v1/completions style: synthesise a single user message
+         prompt = body.get("prompt") or ""
+         messages = [{"role": "user", "content": prompt}]
+
+     converse_messages = _converse_messages(messages)
+     converse_kwargs: dict[str, Any] = {
+         "modelId": cfg.model_id,
+         "messages": converse_messages,
+     }
+     sys_blocks = _system_blocks(messages)
+     if sys_blocks:
+         converse_kwargs["system"] = sys_blocks
+
+     inference = _inference_config(body)
+     if inference:
+         converse_kwargs["inferenceConfig"] = inference
+
+     tools = _tool_config(body.get("tools"))
+     if tools is None:
+         # Body didn't ship `tools`, but the message history might replay
+         # prior tool calls (e.g. opencode continuing a conversation that
+         # started with tools registered). Bedrock requires toolConfig
+         # whenever any toolUse/toolResult block is present in messages,
+         # so we synthesise stub specs from the names referenced in the
+         # converted history.
+         referenced = _messages_reference_tools(converse_messages)
+         if referenced:
+             tools = _stub_tool_config(referenced)
+             log.debug(
+                 "bedrock: synthesised stub toolConfig for replayed names=%s",
+                 sorted(referenced),
+             )
+     if tools:
+         converse_kwargs["toolConfig"] = tools
+     return converse_kwargs
+
+
+ # ---------------------------------------------------------------------------
+ # Bedrock -> OpenAI translation
+ # ---------------------------------------------------------------------------
+
+ _STOP_REASON_MAP = {
+     "end_turn": "stop",
+     "stop_sequence": "stop",
+     "max_tokens": "length",
+     "tool_use": "tool_calls",
+     "guardrail_intervened": "content_filter",
+     "content_filtered": "content_filter",
+ }
+
+
+ def _completion_id() -> str:
+     return f"chatcmpl-{uuid.uuid4().hex[:24]}"
+
+
+ def _now_unix() -> int:
+     return int(time.time())
+
+
+ def _openai_message_from_converse(resp: dict[str, Any]) -> tuple[dict[str, Any], str | None]:
+     """Pull text + tool calls out of a non-streaming Converse response."""
+     msg = (resp.get("output") or {}).get("message") or {}
+     blocks = msg.get("content") or []
+     text_parts: list[str] = []
+     tool_calls: list[dict[str, Any]] = []
+     for b in blocks:
+         if "text" in b and b["text"]:
+             text_parts.append(b["text"])
+         elif "toolUse" in b:
+             tu = b["toolUse"] or {}
+             tool_calls.append({
+                 "id": tu.get("toolUseId") or f"tool_{uuid.uuid4().hex[:12]}",
+                 "type": "function",
+                 "function": {
+                     "name": tu.get("name") or "",
+                     "arguments": json.dumps(tu.get("input") or {}),
+                 },
+             })
+     out: dict[str, Any] = {
+         "role": "assistant",
+         "content": "".join(text_parts) if text_parts else None,
+     }
+     if tool_calls:
+         out["tool_calls"] = tool_calls
+     finish = _STOP_REASON_MAP.get(resp.get("stopReason") or "", "stop")
+     return out, finish
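+
+ # E.g. a Converse response with one text block and one toolUse (illustrative):
+ #   _openai_message_from_converse({
+ #       "stopReason": "tool_use",
+ #       "output": {"message": {"content": [
+ #           {"text": "Let me check."},
+ #           {"toolUse": {"toolUseId": "t1", "name": "ls",
+ #                        "input": {"path": "."}}}]}}})
+ #   -> ({"role": "assistant", "content": "Let me check.",
+ #        "tool_calls": [{"id": "t1", "type": "function",
+ #            "function": {"name": "ls", "arguments": '{"path": "."}'}}]},
+ #       "tool_calls")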
+
+
+ # ---------------------------------------------------------------------------
+ # Dispatch entry points (called by app.py)
+ # ---------------------------------------------------------------------------
+
+ async def dispatch(req: Request, tier: Tier, body: dict[str, Any]) -> StreamingResponse | JSONResponse:
+     """Top-level entry: turn an OpenAI-style request into a Bedrock call.
+
+     Streams when the request asked for ``stream: true``; otherwise
+     returns a single chat-completion JSON object.
+     """
+     if tier.bedrock is None:
+         return JSONResponse(
+             status_code=500,
+             content={"error": {"message": f"tier {tier.name!r} has backend=bedrock but no aws_model_id"}},
+         )
+
+     streaming = bool(body.get("stream"))
+     use_next = _use_next()
+     cfg = tier.bedrock.resolved(use_next=use_next)
+     channel = "next" if (use_next and tier.bedrock.has_next) else "current"
+     converse_kwargs = _build_converse_kwargs(tier, body, cfg)
+     log.info(
+         "bedrock dispatch tier=%s model=%s region=%s channel=%s stream=%s",
+         tier.name, cfg.model_id, cfg.region or "(default)", channel, streaming,
+     )
+
+     try:
+         client = get_client(cfg)
+     except BedrockUnavailableError as exc:
+         return JSONResponse(status_code=500, content={"error": {"message": str(exc)}})
+
+     if streaming:
+         return await _stream_response(client, tier, converse_kwargs)
+     return await _complete_response(client, tier, converse_kwargs)
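+
+ # From a client's perspective this whole path stays OpenAI-shaped
+ # (illustrative request; the exact mount path is app.py's concern):
+ #   POST /v1/chat/completions
+ #   {"model": "<tier-name>", "stream": true,
+ #    "messages": [{"role": "user", "content": "hello"}]}
+ # The router picks the tier from ``model``; everything Bedrock-specific
+ # happens behind this function.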
+
+
+ async def _complete_response(client: Any, tier: Tier, converse_kwargs: dict[str, Any]) -> JSONResponse:
+     try:
+         resp = await asyncio.to_thread(client.converse, **converse_kwargs)
+     except Exception as exc:  # noqa: BLE001 - surface upstream error verbatim
+         log.warning("bedrock converse failed: %s", exc)
+         return JSONResponse(status_code=502, content={"error": _error_payload(exc)})
+
+     message, finish = _openai_message_from_converse(resp)
+     usage_in = (resp.get("usage") or {})
+     payload = {
+         "id": _completion_id(),
+         "object": "chat.completion",
+         "created": _now_unix(),
+         "model": tier.name,
+         "choices": [{
+             "index": 0,
+             "message": message,
+             "finish_reason": finish or "stop",
+         }],
+         "usage": {
+             "prompt_tokens": int(usage_in.get("inputTokens") or 0),
+             "completion_tokens": int(usage_in.get("outputTokens") or 0),
+             "total_tokens": int(usage_in.get("totalTokens") or 0),
+         },
+     }
+     return JSONResponse(content=payload)
+
+
+ def _error_payload(exc: Exception) -> dict[str, Any]:
+     out: dict[str, Any] = {"message": str(exc), "type": exc.__class__.__name__}
+     response = getattr(exc, "response", None)
+     if isinstance(response, dict):
+         err = response.get("Error") or {}
+         if err.get("Code"):
+             out["code"] = err["Code"]
+     return out
+
+
+ async def _stream_response(client: Any, tier: Tier, converse_kwargs: dict[str, Any]) -> StreamingResponse:
+     completion_id = _completion_id()
+     created = _now_unix()
+     model_label = tier.name
+
+     def _sse(payload: dict[str, Any]) -> bytes:
+         return f"data: {json.dumps(payload, separators=(',', ':'))}\n\n".encode()
+
+     def _frame(delta: dict[str, Any], *, finish: str | None = None) -> dict[str, Any]:
+         choice: dict[str, Any] = {
+             "index": 0,
+             "delta": delta,
+             "finish_reason": finish,
+         }
+         return {
+             "id": completion_id,
+             "object": "chat.completion.chunk",
+             "created": created,
+             "model": model_label,
+             "choices": [choice],
+         }
+
+     async def gen() -> AsyncIterator[bytes]:
+         # Open the converse stream in a worker thread; the EventStream
+         # iterator is sync, so we read it off the loop and bridge to an
+         # asyncio queue.
+         queue: asyncio.Queue = asyncio.Queue(maxsize=64)
+         loop = asyncio.get_running_loop()
+         sentinel = object()
+
+         def _pump() -> None:
+             try:
+                 resp = client.converse_stream(**converse_kwargs)
+                 stream = resp.get("stream")
+                 if stream is None:
+                     raise RuntimeError("converse_stream returned no stream")
+                 for event in stream:
+                     asyncio.run_coroutine_threadsafe(queue.put(event), loop).result()
+             except Exception as exc:  # noqa: BLE001
+                 asyncio.run_coroutine_threadsafe(queue.put(("__error__", exc)), loop).result()
+             finally:
+                 asyncio.run_coroutine_threadsafe(queue.put(sentinel), loop).result()
+
+         pump_task = asyncio.create_task(asyncio.to_thread(_pump))
+
+         # First chunk: announce the assistant role so OpenAI clients can
+         # initialise their accumulator.
+         yield _sse(_frame({"role": "assistant"}))
+
+         # Per-content-block state: index -> "text" | "tool_use"
+         block_kinds: dict[int, str] = {}
+         # tool_use blocks need the OpenAI tool_calls index to map to.
+         tool_call_index: dict[int, int] = {}
+         next_tool_call_index = 0
+         finish_reason: str | None = None
+
+         try:
+             while True:
+                 item = await queue.get()
+                 if item is sentinel:
+                     break
+                 if isinstance(item, tuple) and len(item) == 2 and item[0] == "__error__":
+                     err = item[1]
+                     log.warning("bedrock stream failed: %s", err)
+                     yield _sse(_frame({}, finish="error"))
+                     yield b"data: " + json.dumps({"error": _error_payload(err)}).encode() + b"\n\n"
+                     return
+
+                 event = item
+                 if "messageStart" in event:
+                     continue
+                 if "contentBlockStart" in event:
+                     cbs = event["contentBlockStart"]
+                     idx = cbs.get("contentBlockIndex", 0)
+                     start = cbs.get("start") or {}
+                     if "toolUse" in start:
+                         block_kinds[idx] = "tool_use"
+                         oai_idx = next_tool_call_index
+                         tool_call_index[idx] = oai_idx
+                         next_tool_call_index += 1
+                         tu = start["toolUse"]
+                         yield _sse(_frame({
+                             "tool_calls": [{
+                                 "index": oai_idx,
+                                 "id": tu.get("toolUseId") or f"tool_{uuid.uuid4().hex[:12]}",
+                                 "type": "function",
+                                 "function": {"name": tu.get("name") or "", "arguments": ""},
+                             }]
+                         }))
+                     else:
+                         block_kinds[idx] = "text"
+                     continue
+
+                 if "contentBlockDelta" in event:
+                     cbd = event["contentBlockDelta"]
+                     idx = cbd.get("contentBlockIndex", 0)
+                     delta = cbd.get("delta") or {}
+                     kind = block_kinds.get(idx, "text")
+                     if kind == "text":
+                         text = delta.get("text")
+                         if text:
+                             yield _sse(_frame({"content": text}))
+                     elif kind == "tool_use":
+                         # toolUse deltas carry partial JSON in `input`.
+                         tu = delta.get("toolUse") or {}
+                         partial = tu.get("input")
+                         if partial is None:
+                             partial = ""
+                         if not isinstance(partial, str):
+                             partial = json.dumps(partial)
+                         if partial:
+                             yield _sse(_frame({
+                                 "tool_calls": [{
+                                     "index": tool_call_index.get(idx, 0),
+                                     "function": {"arguments": partial},
+                                 }]
+                             }))
+                     continue
+
+                 if "contentBlockStop" in event:
+                     continue
+
+                 if "messageStop" in event:
+                     finish_reason = _STOP_REASON_MAP.get(
+                         event["messageStop"].get("stopReason") or "", "stop",
+                     )
+                     continue
+
+                 if "metadata" in event:
+                     # Could attach token counts here via an
+                     # `x-llmstack-usage` event, but OpenAI's chunk schema
+                     # has no usage field on intermediate chunks; skip.
+                     continue
+         finally:
+             await pump_task
+
+         yield _sse(_frame({}, finish=finish_reason or "stop"))
+         yield b"data: [DONE]\n\n"
+
+     return StreamingResponse(
+         gen(),
+         media_type="text/event-stream",
+         headers={"cache-control": "no-cache", "connection": "keep-alive"},
+     )
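+
+ # Illustrative wire shape for a short text-only stream (ids/fields
+ # abbreviated; real frames use compact JSON separators):
+ #   data: {"id":"chatcmpl-...","object":"chat.completion.chunk",...,
+ #          "choices":[{"index":0,"delta":{"role":"assistant"},"finish_reason":null}]}
+ #   data: {...,"choices":[{"index":0,"delta":{"content":"Hel"},"finish_reason":null}]}
+ #   data: {...,"choices":[{"index":0,"delta":{"content":"lo"},"finish_reason":null}]}
+ #   data: {...,"choices":[{"index":0,"delta":{},"finish_reason":"stop"}]}
+ #   data: [DONE]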
+
+
+ # ---------------------------------------------------------------------------
+ # /v1/models metadata
+ # ---------------------------------------------------------------------------
+
+ def model_descriptor(tier: Tier) -> dict[str, Any]:
+     """Return an OpenAI-style ``/v1/models`` entry for a bedrock tier."""
+     assert tier.bedrock is not None
+     use_next = _use_next()
+     active = tier.bedrock.resolved(use_next=use_next)
+     channel = "next" if (use_next and tier.bedrock.has_next) else "current"
+     metadata: dict[str, Any] = {
+         "model_id": active.model_id,
+         "region": active.region or os.environ.get("AWS_REGION") or "",
+         "ctx_size": tier.ctx_size,
+         "channel": channel,
+     }
+     if tier.bedrock.has_next:
+         metadata["model_id_next"] = tier.bedrock.model_id_next
+         if tier.bedrock.region_next:
+             metadata["region_next"] = tier.bedrock.region_next
+     return {
+         "id": tier.name,
+         "object": "model",
+         "created": 0,
+         "owned_by": "aws-bedrock",
+         "name": tier.description,
+         "description": tier.description,
+         "tier": tier.role,
+         "backend": "bedrock",
+         "metadata": metadata,
+     }
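+
+ # Illustrative descriptor for a bedrock tier (values depend on models.ini):
+ #   {"id": "coder-large", "object": "model", "created": 0,
+ #    "owned_by": "aws-bedrock", "name": "...", "description": "...",
+ #    "tier": "...", "backend": "bedrock",
+ #    "metadata": {"model_id": "anthropic.claude-sonnet-4-5-v1:0",
+ #                 "region": "us-east-1", "ctx_size": 200000,
+ #                 "channel": "current"}}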