npm - veryfront - Versions diffs - 0.1.207 → 0.1.209 - Mend

veryfront 0.1.207 → 0.1.209

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

package/esm/deno.js +1 -1
package/esm/src/provider/runtime-loader.d.ts +47 -0
package/esm/src/provider/runtime-loader.d.ts.map +1 -1
package/esm/src/provider/runtime-loader.js +1386 -70
package/esm/src/provider/types.d.ts +2 -0
package/esm/src/provider/types.d.ts.map +1 -1
package/esm/src/utils/version-constant.d.ts +1 -1
package/esm/src/utils/version-constant.js +1 -1
package/package.json +1 -1
package/src/deno.js +1 -1
package/src/src/provider/runtime-loader.ts +1872 -102
package/src/src/provider/types.ts +2 -0
package/src/src/utils/version-constant.ts +1 -1

package/src/src/provider/runtime-loader.ts CHANGED Viewed

@@ -36,6 +36,18 @@ type RuntimePromptMessage =
         input: unknown;
         providerExecuted?: boolean;
       }
+      | {
+        // Anthropic thinking block replay. Carries the original signed
+        // thinking trace so that on the next turn Anthropic can verify
+        // the signature and let Claude continue reasoning from the same
+        // point. `text` + `signature` are the normal pair for an
+        // un-redacted thinking block; `redactedData` is set instead of
+        // both when Anthropic returned an encrypted opaque payload.
+        type: "reasoning";
+        text?: string;
+        signature?: string;
+        redactedData?: string;
+      }
     >;
   }
   | {
@@ -60,6 +72,67 @@ type RuntimeToolDefinition =
     id: `${string}.${string}`;
     args: Record<string, unknown>;
   };
+/**
+ * TTL for a single prompt-cache breakpoint.
+ *
+ * `true` and `"5m"` both map to Anthropic's default ephemeral (5-minute) cache.
+ * `"1h"` maps to the extended 1-hour cache at a 2x write cost. Callers can
+ * pick per breakpoint target.
+ */
+type ProviderCacheTtl = boolean | "5m" | "1h";
+/**
+ * Per-provider prompt / context caching controls.
+ *
+ * For Anthropic, flipping these on emits `cache_control: { type: "ephemeral" }`
+ * breakpoints on the assembled system prompt and/or the last tool definition
+ * sent to the Messages API, enabling Anthropic's explicit prompt cache.
+ *
+ * OpenAI's prompt cache is automatic on gpt-4o+ and has no request-side
+ * directive to emit, so this option is a no-op for the OpenAI runtime. Google
+ * uses a separate `cachedContent` resource model that is intentionally not
+ * covered by this option (it belongs on a dedicated Gemini-specific surface).
+ */
+type ProviderCacheControlOption = {
+  /**
+   * Attach a cache breakpoint to the final system-prompt text block.
+   * Use when the system prompt is large and reused across requests.
+   */
+  system?: ProviderCacheTtl;
+  /**
+   * Attach a cache breakpoint to the last tool definition in `tools`.
+   * Use when the tool schemas are large and identical across requests.
+   */
+  tools?: ProviderCacheTtl;
+};
+/**
+ * Unified effort level for extended reasoning / thinking. Maps to
+ * per-provider knobs: Anthropic `thinking.budget_tokens`, OpenAI
+ * `reasoning_effort`, Gemini `thinkingConfig.thinkingBudget`.
+ */
+type ProviderReasoningEffort = "low" | "medium" | "high" | "max";
+/**
+ * Unified reasoning / thinking request option.
+ *
+ * Setting `enabled: true` turns on extended thinking on providers that
+ * support it (Anthropic Claude 4.x, OpenAI o-series, Gemini 2.5+). The
+ * `effort` field picks a coarse budget; when `budgetTokens` is set it
+ * wins for providers that take a numeric budget (Anthropic, Gemini).
+ *
+ * Providers that do not support reasoning treat this as a no-op. On
+ * Anthropic + OpenAI, enabling reasoning also disables sampling params
+ * that the providers reject in combination (`temperature`, `topP`,
+ * `topK`, `presencePenalty`, `frequencyPenalty`) — silently dropping
+ * them rather than failing the request.
+ */
+type ProviderReasoningOption = {
+  enabled?: boolean;
+  effort?: ProviderReasoningEffort;
+  budgetTokens?: number;
+};
 type OpenAICompatibleLanguageOptions = {
   prompt: RuntimePromptMessage[];
   maxOutputTokens?: number;
@@ -76,6 +149,128 @@ type OpenAICompatibleLanguageOptions = {
   providerOptions?: Record<string, unknown>;
   includeRawChunks?: boolean;
   abortSignal?: AbortSignal;
+  /**
+   * Per-provider prompt / context caching controls. See
+   * {@link ProviderCacheControlOption}. When unset, caching behaviour is
+   * unchanged on every provider.
+   */
+  cacheControl?: ProviderCacheControlOption;
+  /**
+   * Enable extended reasoning / thinking on providers that support it.
+   * See {@link ProviderReasoningOption}. When unset, reasoning behaviour
+   * is unchanged on every provider.
+   */
+  reasoning?: ProviderReasoningOption;
+  /**
+   * Stable per-user identifier for rate-limiting, abuse detection, and
+   * billing attribution. Maps to:
+   *  - Anthropic: `metadata.user_id`
+   *  - OpenAI: `user`
+   *  - Google: `labels.user_id` (when {@link requestLabels} is unset)
+   */
+  userId?: string;
+  /**
+   * Provider-specific label map for Google Gemini's `labels` field.
+   * Anthropic and OpenAI don't have an arbitrary-label equivalent, so
+   * this is intentionally Google-only. When unset, no labels are sent.
+   */
+  requestLabels?: Record<string, string>;
+  /**
+   * OpenAI-specific. Maps to the `service_tier` field on Chat Completions
+   * which trades latency for cost. Documented values:
+   *
+   *  - `default` — standard processing (default if unset)
+   *  - `flex` — lower-priority queue, lower per-token cost, longer
+   *    expected latency. Useful for batchy or non-interactive workloads.
+   *  - `scale` — reserved-capacity tier with strict latency SLOs.
+   *  - `auto` — let OpenAI pick.
+   *
+   * Forwarded verbatim. Anthropic and Google have no equivalent and
+   * the field is silently omitted on those providers.
+   */
+  serviceTier?: "auto" | "default" | "flex" | "scale";
+  /**
+   * OpenAI-specific. When `false`, OpenAI runs tool calls sequentially
+   * instead of in parallel. Useful for ordered side effects where
+   * concurrent calls would race. Default behaviour (unset) is parallel.
+   */
+  parallelToolCalls?: boolean;
+  /**
+   * Structured-output response format. Maps to OpenAI's `response_format`
+   * field on Chat Completions (and Responses). Three variants:
+   *
+   *  - `{ type: "text" }` — the default (no constraint).
+   *  - `{ type: "json" }` — emits OpenAI's `response_format:
+   *    { type: "json_object" }` to force the model to return valid JSON.
+   *  - `{ type: "json_schema", name, schema, strict? }` — emits
+   *    OpenAI's `response_format: { type: "json_schema", json_schema: {
+   *    name, schema, strict } }` for fully constrained structured
+   *    outputs (gpt-4o-2024-08-06+).
+   *
+   * On Anthropic and Google this option emits an "unsupported-setting"
+   * warning when set to anything other than `text` (those providers
+   * have their own structured-output surfaces and need a dedicated
+   * follow-up to wire them in).
+   */
+  responseFormat?:
+    | { type: "text" }
+    | { type: "json" }
+    | {
+      type: "json_schema";
+      name: string;
+      schema: unknown;
+      description?: string;
+      strict?: boolean;
+    };
+  /**
+   * Anthropic-specific. `container` field for programmatic tool calling
+   * and agent skills. Anthropic uses this to scope a session to a
+   * sandboxed container (e.g. for Computer Use, code execution
+   * sandboxes, or skills loaded from a container). Forwarded verbatim.
+   *
+   * The shape varies — string container id or a structured object
+   * depending on the feature. Caller passes whatever Anthropic's docs
+   * specify for the target feature.
+   */
+  anthropicContainer?: unknown;
+  /**
+   * Google-specific. Reference to a previously-created Gemini cached
+   * content resource (created via the separate caches API) to attach
+   * to this request. Resource name format:
+   * `cachedContents/<id>`. See https://ai.google.dev/gemini-api/docs/caching.
+   *
+   * Cache creation itself is out of scope for the runtime — callers
+   * use the Gemini REST API or SDK to create the cache, then pass the
+   * resource name here on each subsequent generate call to attach the
+   * cached prefix and avoid re-paying for it.
+   */
+  googleCachedContent?: string;
+  /**
+   * Google-specific. Per-request safety filter configuration for
+   * Gemini. Each entry pairs a HARM_CATEGORY_* with a threshold
+   * (BLOCK_NONE / BLOCK_LOW_AND_ABOVE / BLOCK_MEDIUM_AND_ABOVE /
+   * BLOCK_ONLY_HIGH). Forwarded verbatim as the `safetySettings`
+   * field. See https://ai.google.dev/gemini-api/docs/safety-settings.
+   */
+  googleSafetySettings?: Array<{
+    category: string;
+    threshold: string;
+  }>;
+  /**
+   * Anthropic-specific. Native MCP server definitions to pass directly
+   * on the Messages API request body. Lets callers register MCP servers
+   * server-side instead of reloading them into local function tools.
+   *
+   * Caller must opt into the MCP beta by adding the matching header to
+   * `headers`, e.g. `{ "anthropic-beta": "mcp-client-2025-04-04" }`.
+   * Without that header Anthropic will reject the request.
+   *
+   * Each entry is forwarded with camelCase keys converted to snake_case
+   * so `authorizationToken` → `authorization_token`,
+   * `toolConfiguration.allowedTools` → `tool_configuration.allowed_tools`,
+   * etc.
+   */
+  mcpServers?: Array<Record<string, unknown>>;
 };
 type OpenAICompatibleChatMessage =
   | { role: "system"; content: string }
@@ -142,7 +337,12 @@ type AnthropicCompatibleRequest = {
   messages: AnthropicCompatibleMessage[];
   max_tokens: number;
   stream?: boolean;
-  system?: string;
+  /**
+   * String form is the classic shorthand. Array-of-blocks form is required
+   * when the system prompt carries a cache_control breakpoint, because
+   * cache_control lives on an individual content block, not on a raw string.
+   */
+  system?: string | Array<Record<string, unknown>>;
   temperature?: number;
   top_p?: number;
   stop_sequences?: string[];
@@ -168,9 +368,7 @@ type GoogleCompatibleRequest = {
   systemInstruction?: {
     parts: Array<{ text: string }>;
   };
-  tools?: Array<{
-    functionDeclarations: Array<Record<string, unknown>>;
-  }>;
+  tools?: Array<Record<string, unknown>>;
   toolConfig?: {
     functionCallingConfig: Record<string, unknown>;
   };
@@ -198,6 +396,10 @@ function getOpenAIChatCompletionsUrl(baseURL?: string): string {
   return joinUrl(baseURL ?? DEFAULT_OPENAI_BASE_URL, "chat/completions");
 }
+function getOpenAIResponsesUrl(baseURL?: string): string {
+  return joinUrl(baseURL ?? DEFAULT_OPENAI_BASE_URL, "responses");
+}
 function getGoogleGenerateContentUrl(baseURL: string | undefined, modelId: string): string {
   return joinUrl(
     baseURL ?? DEFAULT_GOOGLE_BASE_URL,
@@ -287,9 +489,203 @@ function extractGoogleUsageTokens(payload: unknown): number | undefined {
   return typeof promptTokenCount === "number" ? promptTokenCount : undefined;
 }
-async function readErrorMessage(response: Response): Promise<string> {
-  const text = await response.text();
-  return text.trim() || `${response.status} ${response.statusText}`.trim();
+type ProviderKind = "anthropic" | "openai" | "google";
+/**
+ * Structured warning emitted when a provider runtime drops or rewrites a
+ * caller-provided option. Mirrors the AI ecosystem convention (Vercel AI
+ * SDK, LangChain) of returning `unsupported-setting` warnings on the
+ * runtime result so callers can discover silently-dropped fields without
+ * having to read the source.
+ */
+export type ProviderWarning = {
+  type: "unsupported-setting" | "other";
+  setting?: string;
+  details?: string;
+  provider: ProviderKind;
+};
+/**
+ * Mutable warning collector handed to per-provider request builders so
+ * they can append entries during the build pass instead of plumbing a
+ * return-tuple shape through every helper.
+ */
+type WarningCollector = {
+  push(warning: ProviderWarning): void;
+  drain(): ProviderWarning[];
+};
+function createWarningCollector(): WarningCollector {
+  const list: ProviderWarning[] = [];
+  return {
+    push(warning) {
+      list.push(warning);
+    },
+    drain() {
+      return list.slice();
+    },
+  };
+}
+/**
+ * Base class for typed provider errors. The `retryable` flag is the
+ * primary signal for callers (or a retry wrapper) to decide whether to
+ * re-issue the request. `retryAfterMs` is set when the provider gave an
+ * explicit delay hint (Retry-After header, Retry-Info trailer).
+ */
+export class ProviderError extends Error {
+  readonly provider: ProviderKind;
+  readonly status: number;
+  readonly retryable: boolean;
+  readonly retryAfterMs?: number;
+  constructor(options: {
+    provider: ProviderKind;
+    status: number;
+    message: string;
+    retryable: boolean;
+    retryAfterMs?: number;
+  }) {
+    super(options.message);
+    this.name = new.target.name;
+    this.provider = options.provider;
+    this.status = options.status;
+    this.retryable = options.retryable;
+    if (options.retryAfterMs !== undefined) {
+      this.retryAfterMs = options.retryAfterMs;
+    }
+  }
+}
+/** Provider reports it is overloaded (Anthropic 529, OpenAI/Google 503). */
+export class ProviderOverloadedError extends ProviderError {}
+/** Provider is rate limiting this API key (OpenAI/Google 429 with Retry-After). */
+export class ProviderRateLimitError extends ProviderError {}
+/** Provider account quota is exhausted — non-retryable. */
+export class ProviderQuotaError extends ProviderError {}
+/** Non-retryable 4xx/5xx that doesn't fit another bucket. */
+export class ProviderRequestError extends ProviderError {}
+function parseRetryAfterMs(header: string | null): number | undefined {
+  if (!header) return undefined;
+  const asNumber = Number(header);
+  if (Number.isFinite(asNumber) && asNumber >= 0) {
+    return Math.round(asNumber * 1000);
+  }
+  // HTTP-date form (rare in practice for LLM providers).
+  const parsed = Date.parse(header);
+  if (!Number.isNaN(parsed)) {
+    return Math.max(0, parsed - Date.now());
+  }
+  return undefined;
+}
+/**
+ * Inspect a non-2xx response and build the most specific ProviderError
+ * subclass we can. Reads the response body as text (it's already dead
+ * on the wire by this point). Body classification handles the cases
+ * where HTTP status alone is ambiguous — notably OpenAI
+ * `insufficient_quota` vs `rate_limit_exceeded` both arriving as 429.
+ */
+async function buildProviderError(
+  provider: ProviderKind,
+  response: Response,
+): Promise<ProviderError> {
+  const rawBody = await response.text();
+  const message = rawBody.trim() || `${response.status} ${response.statusText}`.trim();
+  const status = response.status;
+  const retryAfterMs = parseRetryAfterMs(response.headers.get("retry-after"));
+  const parsedBody = (() => {
+    try {
+      return JSON.parse(rawBody) as Record<string, unknown>;
+    } catch {
+      return undefined;
+    }
+  })();
+  const errorRecord = readRecord(parsedBody?.error);
+  const errorCode = typeof errorRecord?.code === "string"
+    ? errorRecord.code
+    : typeof errorRecord?.type === "string"
+    ? errorRecord.type
+    : typeof errorRecord?.status === "string"
+    ? errorRecord.status
+    : undefined;
+  // Anthropic 529 = overloaded. Anthropic surfaces this with
+  // { error: { type: "overloaded_error" } } in the body.
+  if (provider === "anthropic" && status === 529) {
+    return new ProviderOverloadedError({
+      provider,
+      status,
+      message,
+      retryable: true,
+      ...(retryAfterMs !== undefined ? { retryAfterMs } : {}),
+    });
+  }
+  // OpenAI / Google 503 = overloaded.
+  if ((provider === "openai" || provider === "google") && status === 503) {
+    return new ProviderOverloadedError({
+      provider,
+      status,
+      message,
+      retryable: true,
+      ...(retryAfterMs !== undefined ? { retryAfterMs } : {}),
+    });
+  }
+  // OpenAI 429 splits based on the error code in the body:
+  //  - insufficient_quota → hard quota, non-retryable
+  //  - rate_limit_exceeded / tokens_per_min_exceeded → retry with Retry-After
+  if (provider === "openai" && status === 429) {
+    if (errorCode === "insufficient_quota") {
+      return new ProviderQuotaError({
+        provider,
+        status,
+        message,
+        retryable: false,
+      });
+    }
+    return new ProviderRateLimitError({
+      provider,
+      status,
+      message,
+      retryable: true,
+      ...(retryAfterMs !== undefined ? { retryAfterMs } : {}),
+    });
+  }
+  // Google 429 RESOURCE_EXHAUSTED is almost always the daily free-tier
+  // quota — surface as a hard quota error so callers don't hot-loop on
+  // retries that can't possibly succeed until midnight UTC.
+  if (provider === "google" && status === 429) {
+    if (errorCode === "RESOURCE_EXHAUSTED") {
+      return new ProviderQuotaError({
+        provider,
+        status,
+        message,
+        retryable: false,
+      });
+    }
+    return new ProviderRateLimitError({
+      provider,
+      status,
+      message,
+      retryable: true,
+      ...(retryAfterMs !== undefined ? { retryAfterMs } : {}),
+    });
+  }
+  return new ProviderRequestError({
+    provider,
+    status,
+    message,
+    retryable: false,
+  });
 }
 async function requestJson(options: {
@@ -297,11 +693,13 @@ async function requestJson(options: {
   fetchImpl: typeof globalThis.fetch;
   init: RequestInit;
   providerLabel: string;
+  providerKind: ProviderKind;
 }): Promise<unknown> {
   const response = await options.fetchImpl(options.url, options.init);
   if (!response.ok) {
-    const message = await readErrorMessage(response);
-    throw new Error(`${options.providerLabel} request failed: ${message}`);
+    const err = await buildProviderError(options.providerKind, response);
+    err.message = `${options.providerLabel} request failed: ${err.message}`;
+    throw err;
   }
   return response.json();
@@ -312,15 +710,22 @@ async function requestStream(options: {
   fetchImpl: typeof globalThis.fetch;
   init: RequestInit;
   providerLabel: string;
+  providerKind: ProviderKind;
 }): Promise<ReadableStream<Uint8Array>> {
   const response = await options.fetchImpl(options.url, options.init);
   if (!response.ok) {
-    const message = await readErrorMessage(response);
-    throw new Error(`${options.providerLabel} request failed: ${message}`);
+    const err = await buildProviderError(options.providerKind, response);
+    err.message = `${options.providerLabel} request failed: ${err.message}`;
+    throw err;
   }
   if (!response.body) {
-    throw new Error(`${options.providerLabel} request failed: stream body missing`);
+    throw new ProviderRequestError({
+      provider: options.providerKind,
+      status: response.status,
+      message: `${options.providerLabel} request failed: stream body missing`,
+      retryable: false,
+    });
   }
   return response.body;
@@ -366,6 +771,11 @@ function toOpenAICompatibleMessages(prompt: RuntimePromptMessage[]): OpenAICompa
             text += part.text;
             continue;
           }
+          // OpenAI Chat Completions has no roundtrip slot for Anthropic
+          // thinking blocks — they get dropped on replay. Anthropic-only.
+          if (part.type === "reasoning") {
+            continue;
+          }
           toolCalls.push({
             id: part.toolCallId,
@@ -473,9 +883,15 @@ function normalizeAnthropicFinishReason(
   }
 }
-function extractAnthropicUsage(payload: unknown):
-  | { inputTokens?: number; outputTokens?: number; totalTokens?: number }
-  | undefined {
+type RuntimeUsage = {
+  inputTokens?: number;
+  outputTokens?: number;
+  totalTokens?: number;
+  cacheCreationInputTokens?: number;
+  cacheReadInputTokens?: number;
+};
+function extractAnthropicUsage(payload: unknown): RuntimeUsage | undefined {
   const record = readRecord(payload);
   const usage = readRecord(record?.usage);
   if (!usage) {
@@ -484,6 +900,8 @@ function extractAnthropicUsage(payload: unknown):
   const inputTokens = usage.input_tokens;
   const outputTokens = usage.output_tokens;
+  const cacheCreationInputTokens = usage.cache_creation_input_tokens;
+  const cacheReadInputTokens = usage.cache_read_input_tokens;
   return {
     inputTokens: typeof inputTokens === "number" ? inputTokens : undefined,
@@ -492,17 +910,15 @@ function extractAnthropicUsage(payload: unknown):
       ? (typeof inputTokens === "number" ? inputTokens : 0) +
         (typeof outputTokens === "number" ? outputTokens : 0)
       : undefined,
+    ...(typeof cacheCreationInputTokens === "number" ? { cacheCreationInputTokens } : {}),
+    ...(typeof cacheReadInputTokens === "number" ? { cacheReadInputTokens } : {}),
   };
 }
 function mergeUsage(
-  current:
-    | { inputTokens?: number; outputTokens?: number; totalTokens?: number }
-    | undefined,
-  next:
-    | { inputTokens?: number; outputTokens?: number; totalTokens?: number }
-    | undefined,
-): { inputTokens?: number; outputTokens?: number; totalTokens?: number } | undefined {
+  current: RuntimeUsage | undefined,
+  next: RuntimeUsage | undefined,
+): RuntimeUsage | undefined {
   if (!current) {
     return next;
   }
@@ -513,11 +929,16 @@ function mergeUsage(
   const inputTokens = next.inputTokens ?? current.inputTokens;
   const outputTokens = next.outputTokens ?? current.outputTokens;
+  const cacheCreationInputTokens = next.cacheCreationInputTokens ??
+    current.cacheCreationInputTokens;
+  const cacheReadInputTokens = next.cacheReadInputTokens ?? current.cacheReadInputTokens;
   return {
     inputTokens,
     outputTokens,
     totalTokens: (inputTokens ?? 0) + (outputTokens ?? 0),
+    ...(cacheCreationInputTokens !== undefined ? { cacheCreationInputTokens } : {}),
+    ...(cacheReadInputTokens !== undefined ? { cacheReadInputTokens } : {}),
   };
 }
@@ -538,6 +959,26 @@ function toSnakeCaseRecord(record: Record<string, unknown>): Record<string, unkn
   );
 }
+/**
+ * Recursive snake_case key converter for nested config objects (used for
+ * Anthropic mcp_servers, where authorizationToken / toolConfiguration /
+ * allowedTools all need conversion).
+ */
+function deepSnakeCase(value: unknown): unknown {
+  if (Array.isArray(value)) {
+    return value.map(deepSnakeCase);
+  }
+  if (value !== null && typeof value === "object") {
+    return Object.fromEntries(
+      Object.entries(value as Record<string, unknown>).map(([key, v]) => [
+        key.replace(/[A-Z]/g, (match) => `_${match.toLowerCase()}`),
+        deepSnakeCase(v),
+      ]),
+    );
+  }
+  return value;
+}
 function pushAnthropicUserContent(
   messages: AnthropicCompatibleMessage[],
   content: Array<Record<string, unknown>>,
@@ -558,9 +999,32 @@ function pushAnthropicUserContent(
   });
 }
+/**
+ * Resolves a {@link ProviderCacheTtl} into Anthropic's `cache_control` shape.
+ *
+ * Returns `undefined` when caching is not requested (`false` / `undefined`),
+ * `{ type: "ephemeral" }` for the 5-minute default (`true` / `"5m"`), or
+ * `{ type: "ephemeral", ttl: "1h" }` for the extended 1-hour cache.
+ */
+function resolveAnthropicCacheControlBlock(
+  ttl: ProviderCacheTtl | undefined,
+): { type: "ephemeral"; ttl?: "1h" } | undefined {
+  if (ttl === undefined || ttl === false) {
+    return undefined;
+  }
+  if (ttl === "1h") {
+    return { type: "ephemeral", ttl: "1h" };
+  }
+  return { type: "ephemeral" };
+}
 function toAnthropicMessages(
   prompt: RuntimePromptMessage[],
-): { system?: string; messages: AnthropicCompatibleMessage[] } {
+  systemCacheControl?: { type: "ephemeral"; ttl?: "1h" },
+): {
+  system?: string | Array<Record<string, unknown>>;
+  messages: AnthropicCompatibleMessage[];
+} {
   const systemParts: string[] = [];
   const messages: AnthropicCompatibleMessage[] = [];
@@ -580,14 +1044,33 @@ function toAnthropicMessages(
       case "assistant":
         messages.push({
           role: "assistant",
-          content: message.content.map((part) =>
-            part.type === "text" ? { type: "text", text: part.text } : {
+          content: message.content.map((part) => {
+            if (part.type === "text") {
+              return { type: "text", text: part.text };
+            }
+            if (part.type === "reasoning") {
+              // Redacted thinking blocks roundtrip as the encrypted blob
+              // form Anthropic gave us. Plain thinking blocks need the
+              // signature to verify on the server.
+              if (typeof part.redactedData === "string") {
+                return {
+                  type: "redacted_thinking",
+                  data: part.redactedData,
+                };
+              }
+              return {
+                type: "thinking",
+                thinking: part.text ?? "",
+                ...(typeof part.signature === "string" ? { signature: part.signature } : {}),
+              };
+            }
+            return {
               type: "tool_use",
               id: part.toolCallId,
               name: part.toolName,
               input: part.input,
-            }
-          ),
+            };
+          }),
         });
         break;
       case "tool":
@@ -603,14 +1086,63 @@ function toAnthropicMessages(
     }
   }
-  return {
-    ...(systemParts.length > 0 ? { system: systemParts.join("\n\n") } : {}),
-    messages,
-  };
+  if (systemParts.length === 0) {
+    return { messages };
+  }
+  const joined = systemParts.join("\n\n");
+  // Cache-controlled system prompts must use the array-of-blocks form so the
+  // breakpoint lands on an individual content block. Callers that don't opt
+  // in keep the legacy raw-string form for backward compatibility.
+  if (systemCacheControl) {
+    return {
+      system: [{
+        type: "text",
+        text: joined,
+        cache_control: systemCacheControl,
+      }],
+      messages,
+    };
+  }
+  return { system: joined, messages };
+}
+/**
+ * Short-name → latest-versioned-type alias map for Anthropic provider tools.
+ *
+ * Anthropic tool types are date-stamped (e.g. `code_execution_20260120`) so
+ * callers either pin a version or get the latest. We accept both: a caller
+ * can pass `anthropic.code_execution` and we map to the latest known version,
+ * or pass `anthropic.code_execution_20250522` and we forward verbatim.
+ *
+ * Versions chosen here are the latest documented releases as of 2026-04-15
+ * — see https://docs.claude.com/en/docs/agents-and-tools/tool-use/overview.
+ * When Anthropic ships newer versions, update this map.
+ */
+const ANTHROPIC_TOOL_VERSION_ALIASES: Record<string, string> = {
+  code_execution: "code_execution_20260120",
+  computer_use: "computer_20250124",
+  computer: "computer_20250124",
+  text_editor: "text_editor_20250728",
+  bash: "bash_20250124",
+  memory: "memory_20250818",
+  web_search: "web_search_20250305",
+  web_fetch: "web_fetch_20250910",
+};
+function resolveAnthropicProviderType(rawType: string): string {
+  // Already-versioned types (contain a date stamp suffix) pass through verbatim.
+  if (/_\d{8}$/.test(rawType)) {
+    return rawType;
+  }
+  return ANTHROPIC_TOOL_VERSION_ALIASES[rawType] ?? rawType;
 }
 function toAnthropicTools(
   tools: RuntimeToolDefinition[] | undefined,
+  toolsCacheControl?: { type: "ephemeral"; ttl?: "1h" },
 ): Array<Record<string, unknown>> | undefined {
   if (!tools) {
     return undefined;
@@ -632,19 +1164,35 @@ function toAnthropicTools(
       continue;
     }
-    const providerType = tool.id.slice("anthropic.".length);
-    if (providerType.length === 0) {
+    const rawType = tool.id.slice("anthropic.".length);
+    if (rawType.length === 0) {
       continue;
     }
     normalized.push({
-      type: providerType,
+      type: resolveAnthropicProviderType(rawType),
       name: tool.name,
       ...toSnakeCaseRecord(tool.args),
     });
   }
-  return normalized.length > 0 ? normalized : undefined;
+  if (normalized.length === 0) {
+    return undefined;
+  }
+  // Attach the cache breakpoint to the final tool entry so Anthropic caches
+  // the entire tools block up to and including that definition. Earlier tool
+  // entries are implicitly covered by the same breakpoint per Anthropic's
+  // walk-backward cache lookup behaviour.
+  if (toolsCacheControl) {
+    const lastIndex = normalized.length - 1;
+    normalized[lastIndex] = {
+      ...normalized[lastIndex],
+      cache_control: toolsCacheControl,
+    };
+  }
+  return normalized;
 }
 function createAnthropicRequestHeaders(options: {
@@ -717,47 +1265,244 @@ function resolveAnthropicMaxTokens(
   return requested;
 }
+/**
+ * Map a unified reasoning effort level to an Anthropic `thinking.budget_tokens`
+ * value. Anthropic's minimum accepted budget is 1024; higher tiers give Claude
+ * more headroom to explore. `max` maps to the upper bound documented for
+ * Claude 4.x family (32k tokens of thinking — caller can override via
+ * `budgetTokens` if they need more).
+ */
+function resolveAnthropicThinkingBudget(
+  option: ProviderReasoningOption | undefined,
+): number | undefined {
+  if (!option || option.enabled !== true) {
+    return undefined;
+  }
+  if (typeof option.budgetTokens === "number" && option.budgetTokens >= 1024) {
+    return option.budgetTokens;
+  }
+  switch (option.effort) {
+    case "low":
+      return 1024;
+    case "high":
+      return 16_384;
+    case "max":
+      return 32_768;
+    case "medium":
+    default:
+      return 4096;
+  }
+}
 function buildAnthropicMessagesRequest(
   modelId: string,
   providerName: string,
   options: OpenAICompatibleLanguageOptions,
   stream: boolean,
+  warnings: WarningCollector,
 ): AnthropicCompatibleRequest {
-  const { system, messages } = toAnthropicMessages(options.prompt);
+  const systemCacheControl = resolveAnthropicCacheControlBlock(
+    options.cacheControl?.system,
+  );
+  const toolsCacheControl = resolveAnthropicCacheControlBlock(
+    options.cacheControl?.tools,
+  );
+  const { system, messages } = toAnthropicMessages(options.prompt, systemCacheControl);
+  const anthropicTools = toAnthropicTools(options.tools, toolsCacheControl);
+  const thinkingBudget = resolveAnthropicThinkingBudget(options.reasoning);
+  const thinkingEnabled = thinkingBudget !== undefined;
+  // Anthropic doesn't support these unified options at all — emit warnings
+  // so callers don't quietly pass values that have zero effect.
+  if (options.presencePenalty !== undefined) {
+    warnings.push({
+      type: "unsupported-setting",
+      provider: "anthropic",
+      setting: "presencePenalty",
+      details: "Anthropic Messages API has no equivalent and the value was dropped.",
+    });
+  }
+  if (options.frequencyPenalty !== undefined) {
+    warnings.push({
+      type: "unsupported-setting",
+      provider: "anthropic",
+      setting: "frequencyPenalty",
+      details: "Anthropic Messages API has no equivalent and the value was dropped.",
+    });
+  }
+  if (options.seed !== undefined) {
+    warnings.push({
+      type: "unsupported-setting",
+      provider: "anthropic",
+      setting: "seed",
+      details: "Anthropic Messages API does not support deterministic seeding.",
+    });
+  }
+  if (options.topK !== undefined) {
+    warnings.push({
+      type: "unsupported-setting",
+      provider: "anthropic",
+      setting: "topK",
+      details: "Anthropic Messages API does not expose top_k on this surface.",
+    });
+  }
+  if (
+    options.stopSequences && options.stopSequences.length > 4
+  ) {
+    warnings.push({
+      type: "unsupported-setting",
+      provider: "anthropic",
+      setting: "stopSequences",
+      details:
+        `Anthropic accepts at most 4 stop sequences; ${options.stopSequences.length} were provided and the extras were truncated.`,
+    });
+  }
+  if (thinkingEnabled && options.temperature !== undefined) {
+    warnings.push({
+      type: "unsupported-setting",
+      provider: "anthropic",
+      setting: "temperature",
+      details:
+        "Dropped because Anthropic rejects sampling params when extended thinking is enabled.",
+    });
+  }
+  if (thinkingEnabled && options.topP !== undefined) {
+    warnings.push({
+      type: "unsupported-setting",
+      provider: "anthropic",
+      setting: "topP",
+      details:
+        "Dropped because Anthropic rejects sampling params when extended thinking is enabled.",
+    });
+  }
+  if (options.responseFormat && options.responseFormat.type !== "text") {
+    warnings.push({
+      type: "unsupported-setting",
+      provider: "anthropic",
+      setting: "responseFormat",
+      details:
+        "Anthropic Messages API does not have a structured-output response_format equivalent. Use a tool with the schema as input_schema instead.",
+    });
+  }
+  // Anthropic requires max_tokens > budget_tokens when thinking is enabled.
+  // Growing max_tokens by the thinking budget preserves the caller's intended
+  // output budget, and we clamp the sum at the model's advertised maximum so
+  // the request never exceeds the API's hard cap.
+  const baseMaxTokens = resolveAnthropicMaxTokens(modelId, options.maxOutputTokens);
+  const maxTokens = thinkingEnabled
+    ? Math.min(
+      baseMaxTokens + (thinkingBudget ?? 0),
+      getAnthropicModelCapabilities(modelId).maxOutputTokens,
+    )
+    : baseMaxTokens;
   const body: AnthropicCompatibleRequest = {
     model: modelId,
     messages,
-    max_tokens: resolveAnthropicMaxTokens(modelId, options.maxOutputTokens),
+    max_tokens: maxTokens,
     ...(stream ? { stream: true } : {}),
     ...(system ? { system } : {}),
-    ...(options.temperature !== undefined ? { temperature: options.temperature } : {}),
-    ...(options.topP !== undefined ? { top_p: options.topP } : {}),
+    // Sampling params are mutually exclusive with thinking on Anthropic — the
+    // API rejects the combo outright. Drop them silently when thinking is on
+    // (callers see thinking's output instead of what they'd have gotten from
+    // custom sampling, which is the documented tradeoff).
+    ...(!thinkingEnabled && options.temperature !== undefined
+      ? { temperature: options.temperature }
+      : {}),
+    ...(!thinkingEnabled && options.topP !== undefined ? { top_p: options.topP } : {}),
     ...(options.stopSequences && options.stopSequences.length > 0
-      ? { stop_sequences: options.stopSequences }
+      ? { stop_sequences: options.stopSequences.slice(0, 4) }
       : {}),
-    ...(toAnthropicTools(options.tools) ? { tools: toAnthropicTools(options.tools) } : {}),
+    ...(anthropicTools ? { tools: anthropicTools } : {}),
     ...(options.toolChoice !== undefined
       ? { tool_choice: normalizeAnthropicToolChoice(options.toolChoice) }
       : {}),
+    ...(thinkingEnabled ? { thinking: { type: "enabled", budget_tokens: thinkingBudget } } : {}),
+    ...(typeof options.userId === "string" && options.userId.length > 0
+      ? { metadata: { user_id: options.userId } }
+      : {}),
+    ...(options.mcpServers && options.mcpServers.length > 0
+      ? { mcp_servers: deepSnakeCase(options.mcpServers) as unknown[] }
+      : {}),
+    ...(options.anthropicContainer !== undefined ? { container: options.anthropicContainer } : {}),
   };
   Object.assign(body, readProviderOptions(options.providerOptions, "anthropic", providerName));
   return body;
 }
+type AnthropicReasoningContent = {
+  type: "reasoning";
+  text?: string;
+  signature?: string;
+  redactedData?: string;
+};
+type AnthropicCitation = {
+  type: string;
+  citedText?: string;
+  url?: string;
+  title?: string;
+  startCharIndex?: number;
+  endCharIndex?: number;
+  startBlockIndex?: number;
+  endBlockIndex?: number;
+  startPageNumber?: number;
+  endPageNumber?: number;
+  documentIndex?: number;
+  documentTitle?: string;
+};
+type AnthropicTextContent = {
+  type: "text";
+  text: string;
+  citations?: AnthropicCitation[];
+};
+/**
+ * Best-effort camelCase normalization of a single Anthropic citation
+ * record. Handles the union of fields across web_search_result_location,
+ * web_fetch_result_location, char_location, page_location, and
+ * content_block_location citation kinds — see
+ * https://docs.claude.com/en/docs/build-with-claude/citations
+ */
+function normalizeAnthropicCitation(raw: unknown): AnthropicCitation | undefined {
+  const r = readRecord(raw);
+  if (!r) return undefined;
+  const typeStr = typeof r.type === "string" ? r.type : undefined;
+  if (!typeStr) return undefined;
+  const out: AnthropicCitation = { type: typeStr };
+  if (typeof r.cited_text === "string") out.citedText = r.cited_text;
+  if (typeof r.url === "string") out.url = r.url;
+  if (typeof r.title === "string") out.title = r.title;
+  if (typeof r.start_char_index === "number") out.startCharIndex = r.start_char_index;
+  if (typeof r.end_char_index === "number") out.endCharIndex = r.end_char_index;
+  if (typeof r.start_block_index === "number") out.startBlockIndex = r.start_block_index;
+  if (typeof r.end_block_index === "number") out.endBlockIndex = r.end_block_index;
+  if (typeof r.start_page_number === "number") out.startPageNumber = r.start_page_number;
+  if (typeof r.end_page_number === "number") out.endPageNumber = r.end_page_number;
+  if (typeof r.document_index === "number") out.documentIndex = r.document_index;
+  if (typeof r.document_title === "string") out.documentTitle = r.document_title;
+  return out;
+}
 function buildAnthropicGenerateResult(payload: unknown): {
   content: Array<
-    | { type: "text"; text: string }
+    | AnthropicTextContent
+    | AnthropicReasoningContent
     | { type: "tool-call"; toolCallId: string; toolName: string; input: string }
     | { type: "tool-result"; toolCallId: string; toolName: string; result: unknown }
   >;
   finishReason?: string | { unified: string; raw: string } | null;
-  usage?: { inputTokens?: number; outputTokens?: number; totalTokens?: number };
+  usage?: RuntimeUsage;
 } {
   const record = readRecord(payload);
   const content = Array.isArray(record?.content) ? record.content : [];
   const normalized: Array<
-    | { type: "text"; text: string }
+    | AnthropicTextContent
+    | AnthropicReasoningContent
     | { type: "tool-call"; toolCallId: string; toolName: string; input: string }
     | { type: "tool-result"; toolCallId: string; toolName: string; result: unknown }
   > = [];
@@ -767,7 +1512,42 @@ function buildAnthropicGenerateResult(payload: unknown): {
     const blockType = typeof block?.type === "string" ? block.type : undefined;
     if (blockType === "text" && typeof block?.text === "string" && block.text.length > 0) {
-      normalized.push({ type: "text", text: block.text });
+      const citationsRaw = Array.isArray(block.citations) ? block.citations : undefined;
+      const citations = citationsRaw
+        ?.flatMap((c) => {
+          const normalizedCitation = normalizeAnthropicCitation(c);
+          return normalizedCitation ? [normalizedCitation] : [];
+        });
+      normalized.push({
+        type: "text",
+        text: block.text,
+        ...(citations && citations.length > 0 ? { citations } : {}),
+      });
+      continue;
+    }
+    // Thinking blocks carry the cleartext trace plus a signature that
+    // Anthropic uses to verify on subsequent turns. Surfacing both lets
+    // callers persist them as `reasoning` content parts and replay on
+    // the next turn so Claude can continue from the same thinking.
+    if (blockType === "thinking") {
+      normalized.push({
+        type: "reasoning",
+        ...(typeof block?.thinking === "string" ? { text: block.thinking } : {}),
+        ...(typeof block?.signature === "string" ? { signature: block.signature } : {}),
+      });
+      continue;
+    }
+    // Redacted thinking blocks arrive when Claude's safety classifier
+    // hides the trace. Pass the encrypted blob through opaquely so the
+    // caller can replay it on the next turn (Anthropic still needs the
+    // blob to verify continuity even though it can't read it).
+    if (blockType === "redacted_thinking" && typeof block?.data === "string") {
+      normalized.push({
+        type: "reasoning",
+        redactedData: block.data,
+      });
       continue;
     }
@@ -857,7 +1637,7 @@ async function* streamAnthropicCompatibleParts(
   const toolCalls = new Map<number, AnthropicStreamToolCallState>();
   const reasoningBlocks = new Map<number, AnthropicStreamReasoningState>();
   let finishReason: string | { unified: string; raw: string } | null = null;
-  let usage: { inputTokens?: number; outputTokens?: number; totalTokens?: number } | undefined;
+  let usage: RuntimeUsage | undefined;
   for await (const chunk of stream) {
     buffer += decoder.decode(chunk, { stream: true });
@@ -909,6 +1689,20 @@ async function* streamAnthropicCompatibleParts(
           continue;
         }
+        // Redacted thinking blocks arrive as opaque encrypted payloads when
+        // Claude's safety classifier flags the reasoning trace. Surface them
+        // as a zero-length reasoning block so callers know thinking happened
+        // without leaking the (legitimately hidden) contents.
+        if (blockType === "redacted_thinking") {
+          const reasoningId = `thinking-${index}`;
+          reasoningBlocks.set(index, { id: reasoningId });
+          yield {
+            type: "reasoning-start",
+            id: reasoningId,
+          };
+          continue;
+        }
         if (
           (blockType === "tool_use" || blockType === "server_tool_use") &&
           typeof contentBlock?.id === "string" &&
@@ -1094,9 +1888,7 @@ function normalizeOpenAIFinishReason(
   return raw;
 }
-function extractOpenAIUsage(payload: unknown):
-  | { inputTokens?: number; outputTokens?: number; totalTokens?: number }
-  | undefined {
+function extractOpenAIUsage(payload: unknown): RuntimeUsage | undefined {
   const record = readRecord(payload);
   const usage = readRecord(record?.usage);
   if (!usage) {
@@ -1106,11 +1898,14 @@ function extractOpenAIUsage(payload: unknown):
   const inputTokens = usage.prompt_tokens;
   const outputTokens = usage.completion_tokens;
   const totalTokens = usage.total_tokens;
+  const promptTokensDetails = readRecord(usage.prompt_tokens_details);
+  const cachedTokens = promptTokensDetails?.cached_tokens;
   return {
     inputTokens: typeof inputTokens === "number" ? inputTokens : undefined,
     outputTokens: typeof outputTokens === "number" ? outputTokens : undefined,
     totalTokens: typeof totalTokens === "number" ? totalTokens : undefined,
+    ...(typeof cachedTokens === "number" ? { cacheReadInputTokens: cachedTokens } : {}),
   };
 }
@@ -1165,19 +1960,95 @@ function extractOpenAIToolCalls(message: Record<string, unknown>): Array<{
   return normalized;
 }
+/**
+ * OpenAI reasoning models (o1 / o3 / o4 family) use the completion path but
+ * have different constraints than chat models: sampling params are rejected,
+ * and they accept a `reasoning_effort` field. We detect them by model id
+ * prefix so callers don't have to configure it per runtime.
+ */
+function isOpenAIReasoningModel(modelId: string): boolean {
+  return /^o[134](-|$)/.test(modelId);
+}
+/**
+ * Map the unified reasoning effort to OpenAI's `reasoning_effort` enum.
+ * OpenAI doesn't accept "max" — we collapse it to "high".
+ */
+function resolveOpenAIReasoningEffort(
+  option: ProviderReasoningOption | undefined,
+): "low" | "medium" | "high" | undefined {
+  if (!option || option.enabled !== true) {
+    return undefined;
+  }
+  switch (option.effort) {
+    case "low":
+      return "low";
+    case "high":
+    case "max":
+      return "high";
+    case "medium":
+    default:
+      return "medium";
+  }
+}
 function buildOpenAIChatRequest(
   modelId: string,
   providerName: string,
   options: OpenAICompatibleLanguageOptions,
   stream: boolean,
+  warnings: WarningCollector,
 ): OpenAICompatibleChatRequest {
+  const isReasoningModel = isOpenAIReasoningModel(modelId);
+  const reasoningEffort = resolveOpenAIReasoningEffort(options.reasoning);
+  const reasoningEnabled = isReasoningModel || reasoningEffort !== undefined;
+  // OpenAI Chat Completions has no top_k surface (it's exposed only on the
+  // Responses API for some reasoning models). Quietly accepting it would
+  // mislead callers into thinking it took effect.
+  if (options.topK !== undefined) {
+    warnings.push({
+      type: "unsupported-setting",
+      provider: "openai",
+      setting: "topK",
+      details: "OpenAI Chat Completions does not expose top_k; the value was dropped.",
+    });
+  }
+  // Reasoning models (o1 / o3 / o4) reject sampling params outright. Emit
+  // warnings at build time so callers see *why* the value didn't apply
+  // rather than a 400 from the API.
+  if (reasoningEnabled) {
+    const dropped: Array<[keyof typeof options, string]> = [
+      ["temperature", "temperature"],
+      ["topP", "top_p"],
+      ["presencePenalty", "presence_penalty"],
+      ["frequencyPenalty", "frequency_penalty"],
+    ];
+    for (const [key, openaiName] of dropped) {
+      if (options[key] !== undefined) {
+        warnings.push({
+          type: "unsupported-setting",
+          provider: "openai",
+          setting: key,
+          details:
+            `Dropped because OpenAI reasoning models reject ${openaiName}. Reasoning was active for this request.`,
+        });
+      }
+    }
+  }
   const body: OpenAICompatibleChatRequest = {
     model: modelId,
     messages: toOpenAICompatibleMessages(options.prompt),
     ...(stream ? { stream: true, stream_options: { include_usage: true } } : {}),
     ...(options.maxOutputTokens !== undefined ? { max_tokens: options.maxOutputTokens } : {}),
-    ...(options.temperature !== undefined ? { temperature: options.temperature } : {}),
-    ...(options.topP !== undefined ? { top_p: options.topP } : {}),
+    // OpenAI reasoning models reject temperature / top_p / frequency / presence.
+    // Drop them silently rather than letting the API bounce the request.
+    ...(!reasoningEnabled && options.temperature !== undefined
+      ? { temperature: options.temperature }
+      : {}),
+    ...(!reasoningEnabled && options.topP !== undefined ? { top_p: options.topP } : {}),
     ...(options.stopSequences && options.stopSequences.length > 0
       ? { stop: options.stopSequences }
       : {}),
@@ -1186,10 +2057,37 @@ function buildOpenAIChatRequest(
       : {}),
     ...(options.toolChoice !== undefined ? { tool_choice: options.toolChoice } : {}),
     ...(options.seed !== undefined ? { seed: options.seed } : {}),
-    ...(options.presencePenalty !== undefined ? { presence_penalty: options.presencePenalty } : {}),
-    ...(options.frequencyPenalty !== undefined
+    ...(!reasoningEnabled && options.presencePenalty !== undefined
+      ? { presence_penalty: options.presencePenalty }
+      : {}),
+    ...(!reasoningEnabled && options.frequencyPenalty !== undefined
       ? { frequency_penalty: options.frequencyPenalty }
       : {}),
+    ...(reasoningEffort !== undefined ? { reasoning_effort: reasoningEffort } : {}),
+    ...(typeof options.userId === "string" && options.userId.length > 0
+      ? { user: options.userId }
+      : {}),
+    ...(options.serviceTier !== undefined ? { service_tier: options.serviceTier } : {}),
+    ...(options.parallelToolCalls !== undefined
+      ? { parallel_tool_calls: options.parallelToolCalls }
+      : {}),
+    ...(options.responseFormat && options.responseFormat.type !== "text"
+      ? {
+        response_format: options.responseFormat.type === "json" ? { type: "json_object" } : {
+          type: "json_schema",
+          json_schema: {
+            name: options.responseFormat.name,
+            ...(typeof options.responseFormat.description === "string"
+              ? { description: options.responseFormat.description }
+              : {}),
+            schema: unwrapToolInputSchema(options.responseFormat.schema),
+            ...(options.responseFormat.strict !== undefined
+              ? { strict: options.responseFormat.strict }
+              : {}),
+          },
+        },
+      }
+      : {}),
   };
   Object.assign(body, readProviderOptions(options.providerOptions, "openai", providerName));
@@ -1216,9 +2114,7 @@ function normalizeGoogleFinishReason(
   }
 }
-function extractGoogleUsage(payload: unknown):
-  | { inputTokens?: number; outputTokens?: number; totalTokens?: number }
-  | undefined {
+function extractGoogleUsage(payload: unknown): RuntimeUsage | undefined {
   const record = readRecord(payload);
   const usage = readRecord(record?.usageMetadata);
   if (!usage) {
@@ -1228,11 +2124,15 @@ function extractGoogleUsage(payload: unknown):
   const inputTokens = usage.promptTokenCount;
   const outputTokens = usage.candidatesTokenCount;
   const totalTokens = usage.totalTokenCount;
+  const cachedContentTokenCount = usage.cachedContentTokenCount;
   return {
     inputTokens: typeof inputTokens === "number" ? inputTokens : undefined,
     outputTokens: typeof outputTokens === "number" ? outputTokens : undefined,
     totalTokens: typeof totalTokens === "number" ? totalTokens : undefined,
+    ...(typeof cachedContentTokenCount === "number"
+      ? { cacheReadInputTokens: cachedContentTokenCount }
+      : {}),
   };
 }
@@ -1258,20 +2158,29 @@ function toGoogleContents(
           parts: [{ text: readTextParts(message.content) }],
         });
         break;
-      case "assistant":
-        contents.push({
-          role: "model",
-          parts: message.content.map((part) =>
-            part.type === "text" ? { text: part.text } : {
-              functionCall: {
-                id: part.toolCallId,
-                name: part.toolName,
-                args: part.input,
-              },
-            }
-          ),
-        });
+      case "assistant": {
+        // Anthropic-only `reasoning` parts have no Gemini equivalent
+        // and are dropped on replay.
+        const parts: Array<Record<string, unknown>> = [];
+        for (const part of message.content) {
+          if (part.type === "text") {
+            parts.push({ text: part.text });
+            continue;
+          }
+          if (part.type === "reasoning") {
+            continue;
+          }
+          parts.push({
+            functionCall: {
+              id: part.toolCallId,
+              name: part.toolName,
+              args: part.input,
+            },
+          });
+        }
+        contents.push({ role: "model", parts });
         break;
+      }
       case "tool":
         contents.push({
           role: "user",
@@ -1299,22 +2208,45 @@ function toGoogleContents(
 function toGoogleTools(
   tools: RuntimeToolDefinition[] | undefined,
-): GoogleCompatibleRequest["tools"] | undefined {
+): Array<Record<string, unknown>> | undefined {
   if (!tools) {
     return undefined;
   }
-  const functionDeclarations = tools.flatMap((tool) =>
-    tool.type === "function"
-      ? [{
-        name: tool.name,
+  const functionDeclarations: Array<Record<string, unknown>> = [];
+  const providerEntries: Array<Record<string, unknown>> = [];
+  for (const tool of tools) {
+    if (tool.type === "function") {
+      functionDeclarations.push({
+        name: tool.name,
         ...(typeof tool.description === "string" ? { description: tool.description } : {}),
         parameters: unwrapToolInputSchema(tool.inputSchema),
-      }]
-      : []
-  );
+      });
+      continue;
+    }
-  return functionDeclarations.length > 0 ? [{ functionDeclarations }] : undefined;
+    // Gemini provider tools — code_execution, google_search,
+    // google_search_retrieval — each lives in its own tools[] entry
+    // with a single key keyed by the camelCase tool name and an
+    // optional config payload (caller-provided tool.args).
+    if (!tool.id.startsWith("google.")) {
+      continue;
+    }
+    const providerType = tool.id.slice("google.".length);
+    if (providerType.length === 0) {
+      continue;
+    }
+    const camelKey = providerType.replace(/_([a-z])/g, (_, ch) => ch.toUpperCase());
+    providerEntries.push({ [camelKey]: tool.args ?? {} });
+  }
+  const result: Array<Record<string, unknown>> = [];
+  if (functionDeclarations.length > 0) {
+    result.push({ functionDeclarations });
+  }
+  result.push(...providerEntries);
+  return result.length > 0 ? result : undefined;
 }
 function unwrapToolInputSchema(inputSchema: unknown): unknown {
@@ -1346,7 +2278,11 @@ function normalizeGoogleToolChoice(toolChoice: unknown):
   }
   const record = readRecord(toolChoice);
-  if (record?.type === "tool" && typeof record.name === "string") {
+  if (!record) return undefined;
+  // Single-tool restriction: { type: "tool", name } — pin to one
+  // function via mode: ANY + allowedFunctionNames: [name].
+  if (record.type === "tool" && typeof record.name === "string") {
     return {
       functionCallingConfig: {
         mode: "ANY",
@@ -1355,12 +2291,74 @@ function normalizeGoogleToolChoice(toolChoice: unknown):
     };
   }
+  // Multi-tool restriction: { type: "tools", names: string[] } — pin
+  // to a subset via mode: ANY + the full allowedFunctionNames array.
+  if (record.type === "tools" && Array.isArray(record.names)) {
+    const names = record.names.filter((n): n is string => typeof n === "string");
+    if (names.length > 0) {
+      return {
+        functionCallingConfig: {
+          mode: "ANY",
+          allowedFunctionNames: names,
+        },
+      };
+    }
+  }
+  // Explicit mode forms: { type: "auto" | "none" | "any" }.
+  if (record.type === "auto") {
+    return { functionCallingConfig: { mode: "AUTO" } };
+  }
+  if (record.type === "none") {
+    return { functionCallingConfig: { mode: "NONE" } };
+  }
+  if (record.type === "any" || record.type === "required") {
+    return { functionCallingConfig: { mode: "ANY" } };
+  }
   return undefined;
 }
+/**
+ * Map the unified reasoning option to Gemini's thinkingConfig. Gemini 2.5+
+ * accepts `includeThoughts: true` to stream back `thought` parts, and
+ * `thinkingBudget: N` to cap the thinking token count. The effort levels
+ * here follow Google's own guidance (low ~= 512, medium ~= 2048,
+ * high ~= 8192, max = -1 means "dynamic/no cap").
+ */
+function resolveGoogleThinkingConfig(
+  option: ProviderReasoningOption | undefined,
+): Record<string, unknown> | undefined {
+  if (!option || option.enabled !== true) {
+    return undefined;
+  }
+  const config: Record<string, unknown> = { includeThoughts: true };
+  if (typeof option.budgetTokens === "number") {
+    config.thinkingBudget = option.budgetTokens;
+    return config;
+  }
+  switch (option.effort) {
+    case "low":
+      config.thinkingBudget = 512;
+      break;
+    case "high":
+      config.thinkingBudget = 8192;
+      break;
+    case "max":
+      config.thinkingBudget = -1;
+      break;
+    case "medium":
+    default:
+      config.thinkingBudget = 2048;
+      break;
+  }
+  return config;
+}
 function buildGoogleGenerationConfig(
   options: OpenAICompatibleLanguageOptions,
 ): Record<string, unknown> | undefined {
+  const thinkingConfig = resolveGoogleThinkingConfig(options.reasoning);
   const config: Record<string, unknown> = {
     ...(options.maxOutputTokens !== undefined ? { maxOutputTokens: options.maxOutputTokens } : {}),
     ...(options.temperature !== undefined ? { temperature: options.temperature } : {}),
@@ -1370,6 +2368,7 @@ function buildGoogleGenerationConfig(
       ? { stopSequences: options.stopSequences }
       : {}),
     ...(options.seed !== undefined ? { seed: options.seed } : {}),
+    ...(thinkingConfig ? { thinkingConfig } : {}),
   };
   return Object.keys(config).length > 0 ? config : undefined;
@@ -1378,8 +2377,47 @@ function buildGoogleGenerationConfig(
 function buildGoogleGenerateContentRequest(
   providerName: string,
   options: OpenAICompatibleLanguageOptions,
+  warnings: WarningCollector,
 ): GoogleCompatibleRequest {
+  // Google generate-content surface doesn't accept presence/frequency
+  // penalties on most current models. Emit warnings and let the request
+  // through without them.
+  if (options.presencePenalty !== undefined) {
+    warnings.push({
+      type: "unsupported-setting",
+      provider: "google",
+      setting: "presencePenalty",
+      details: "Gemini generateContent does not accept presencePenalty; the value was dropped.",
+    });
+  }
+  if (options.frequencyPenalty !== undefined) {
+    warnings.push({
+      type: "unsupported-setting",
+      provider: "google",
+      setting: "frequencyPenalty",
+      details: "Gemini generateContent does not accept frequencyPenalty; the value was dropped.",
+    });
+  }
+  if (options.responseFormat && options.responseFormat.type !== "text") {
+    warnings.push({
+      type: "unsupported-setting",
+      provider: "google",
+      setting: "responseFormat",
+      details:
+        "Gemini uses generationConfig.responseMimeType + responseSchema for structured outputs, which is a separate surface and not yet wired through this option.",
+    });
+  }
   const { systemInstruction, contents } = toGoogleContents(options.prompt);
+  const generationConfig = buildGoogleGenerationConfig(options);
+  // requestLabels wins over userId-derived labels: when callers explicitly
+  // provide a label map, that's the source of truth. Otherwise fall back
+  // to {user_id} derived from the unified userId option.
+  const labels = options.requestLabels && Object.keys(options.requestLabels).length > 0
+    ? options.requestLabels
+    : typeof options.userId === "string" && options.userId.length > 0
+    ? { user_id: options.userId }
+    : undefined;
   const body: GoogleCompatibleRequest = {
     contents,
     ...(systemInstruction ? { systemInstruction } : {}),
@@ -1387,8 +2425,13 @@ function buildGoogleGenerateContentRequest(
     ...(normalizeGoogleToolChoice(options.toolChoice)
       ? { toolConfig: normalizeGoogleToolChoice(options.toolChoice) }
       : {}),
-    ...(buildGoogleGenerationConfig(options)
-      ? { generationConfig: buildGoogleGenerationConfig(options) }
+    ...(generationConfig ? { generationConfig } : {}),
+    ...(labels ? { labels } : {}),
+    ...(typeof options.googleCachedContent === "string" && options.googleCachedContent.length > 0
+      ? { cachedContent: options.googleCachedContent }
+      : {}),
+    ...(options.googleSafetySettings && options.googleSafetySettings.length > 0
+      ? { safetySettings: options.googleSafetySettings }
       : {}),
   };
@@ -1426,7 +2469,8 @@ function buildGoogleGenerateResult(payload: unknown): {
     | { type: "tool-call"; toolCallId: string; toolName: string; input: string }
   >;
   finishReason?: string | { unified: string; raw: string } | null;
-  usage?: { inputTokens?: number; outputTokens?: number; totalTokens?: number };
+  usage?: RuntimeUsage;
+  groundingMetadata?: Record<string, unknown>;
 } {
   const parts = extractGoogleCandidateParts(payload);
   const content: Array<
@@ -1451,10 +2495,19 @@ function buildGoogleGenerateResult(payload: unknown): {
     }
   }
+  // Gemini grounding (google_search / google_search_retrieval) returns
+  // a per-candidate groundingMetadata object with web search queries,
+  // grounding chunks, and citation indices into the response text.
+  // Pass it through opaquely so callers can render footnotes / source
+  // chips / "Search results" UI without parsing the wire shape.
+  const candidate = extractFirstGoogleCandidate(payload);
+  const groundingMetadata = readRecord(candidate?.groundingMetadata);
   return {
     content,
-    finishReason: normalizeGoogleFinishReason(extractFirstGoogleCandidate(payload)?.finishReason),
+    finishReason: normalizeGoogleFinishReason(candidate?.finishReason),
     usage: extractGoogleUsage(payload),
+    ...(groundingMetadata ? { groundingMetadata } : {}),
   };
 }
@@ -1467,7 +2520,7 @@ async function* streamGoogleCompatibleParts(
   let reasoningId: string | null = null;
   let reasoningIndex = 0;
   let finishReason: string | { unified: string; raw: string } | null = null;
-  let usage: { inputTokens?: number; outputTokens?: number; totalTokens?: number } | undefined;
+  let usage: RuntimeUsage | undefined;
   for await (const chunk of stream) {
     buffer += decoder.decode(chunk, { stream: true });
@@ -1599,7 +2652,7 @@ function buildOpenAIGenerateResult(payload: unknown): {
     }
   >;
   finishReason?: string | { unified: string; raw: string } | null;
-  usage?: { inputTokens?: number; outputTokens?: number; totalTokens?: number };
+  usage?: RuntimeUsage;
 } {
   const choice = extractFirstChoice(payload);
   const message = readRecord(choice?.message);
@@ -1630,7 +2683,7 @@ async function* streamOpenAICompatibleParts(
   let reasoningId: string | null = null;
   let reasoningIndex = 0;
   let finishReason: string | { unified: string; raw: string } | null = null;
-  let usage: { inputTokens?: number; outputTokens?: number; totalTokens?: number } | undefined;
+  let usage: RuntimeUsage | undefined;
   for await (const chunk of stream) {
     buffer += decoder.decode(chunk, { stream: true });
@@ -1788,11 +2841,19 @@ export function createOpenAIModelRuntime(
     doGenerate(optionsForRuntime: unknown) {
       const options = optionsForRuntime as OpenAICompatibleLanguageOptions;
       const url = getOpenAIChatCompletionsUrl(config.baseURL);
-      const body = buildOpenAIChatRequest(modelId, config.name ?? "openai", options, false);
+      const warnings = createWarningCollector();
+      const body = buildOpenAIChatRequest(
+        modelId,
+        config.name ?? "openai",
+        options,
+        false,
+        warnings,
+      );
       return requestJson({
         url,
         fetchImpl,
         providerLabel: config.name ?? "openai",
+        providerKind: "openai",
         init: {
           method: "POST",
           headers: createRequestHeaders({
@@ -1803,16 +2864,30 @@ export function createOpenAIModelRuntime(
           body: JSON.stringify(body),
           signal: options.abortSignal,
         },
-      }).then(buildOpenAIGenerateResult);
+      }).then((payload) => {
+        const drained = warnings.drain();
+        return {
+          ...buildOpenAIGenerateResult(payload),
+          ...(drained.length > 0 ? { warnings: drained } : {}),
+        };
+      });
     },
     doStream(optionsForRuntime: unknown) {
       const options = optionsForRuntime as OpenAICompatibleLanguageOptions;
       const url = getOpenAIChatCompletionsUrl(config.baseURL);
-      const body = buildOpenAIChatRequest(modelId, config.name ?? "openai", options, true);
+      const warnings = createWarningCollector();
+      const body = buildOpenAIChatRequest(
+        modelId,
+        config.name ?? "openai",
+        options,
+        true,
+        warnings,
+      );
       return requestStream({
         url,
         fetchImpl,
         providerLabel: config.name ?? "openai",
+        providerKind: "openai",
         init: {
           method: "POST",
           headers: createRequestHeaders({
@@ -1823,9 +2898,664 @@ export function createOpenAIModelRuntime(
           body: JSON.stringify(body),
           signal: options.abortSignal,
         },
-      }).then((responseStream) => ({
-        stream: ReadableStream.from(streamOpenAICompatibleParts(responseStream)),
-      }));
+      }).then((responseStream) => {
+        const drained = warnings.drain();
+        return {
+          stream: ReadableStream.from(streamOpenAICompatibleParts(responseStream)),
+          ...(drained.length > 0 ? { warnings: drained } : {}),
+        };
+      });
+    },
+  };
+}
+// =============================================================================
+// OpenAI Responses API runtime (#1077, deferred from #1052 C4)
+// =============================================================================
+//
+// The Responses API (/v1/responses) is a different surface than Chat
+// Completions. Same provider, different request shape, different streaming
+// event grammar, different response shape, and different reasoning-summary
+// surface. This runtime is parallel to createOpenAIModelRuntime so each
+// path stays focused on one wire format.
+//
+// Why parallel runtimes instead of a flag? See the rationale in #1077.
+//
+// docs: https://platform.openai.com/docs/api-reference/responses
+type OpenAIResponsesInputItem = Record<string, unknown>;
+type OpenAIResponsesRequest = {
+  model: string;
+  input: OpenAIResponsesInputItem[];
+  instructions?: string;
+  stream?: boolean;
+  max_output_tokens?: number;
+  temperature?: number;
+  top_p?: number;
+  tools?: Array<Record<string, unknown>>;
+  tool_choice?: unknown;
+  reasoning?: { effort?: string; summary?: string };
+  metadata?: Record<string, string>;
+  user?: string;
+  service_tier?: string;
+  parallel_tool_calls?: boolean;
+  text?: { format: Record<string, unknown> };
+  [key: string]: unknown;
+};
+/**
+ * Convert the unified RuntimePromptMessage[] to the Responses API `input`
+ * array shape. Differences from Chat Completions:
+ *  - System prompts go on the top-level `instructions` field, not inline.
+ *  - Content parts use `input_text` / `output_text` discriminants instead
+ *    of the Chat Completions plain-text shorthand.
+ *  - Assistant tool calls become standalone `function_call` items in the
+ *    input array, not nested `tool_calls` on a message.
+ *  - Tool results become standalone `function_call_output` items.
+ *  - Reasoning content parts roundtrip as `reasoning` items so callers can
+ *    replay multi-turn conversations with chain-of-thought intact.
+ */
+function toOpenAIResponsesInput(
+  prompt: RuntimePromptMessage[],
+): { instructions?: string; input: OpenAIResponsesInputItem[] } {
+  const instructionsParts: string[] = [];
+  const input: OpenAIResponsesInputItem[] = [];
+  for (const message of prompt) {
+    switch (message.role) {
+      case "system":
+        if (message.content.length > 0) {
+          instructionsParts.push(message.content);
+        }
+        break;
+      case "user":
+        input.push({
+          role: "user",
+          content: [{ type: "input_text", text: readTextParts(message.content) }],
+        });
+        break;
+      case "assistant": {
+        const messageContent: Array<Record<string, unknown>> = [];
+        for (const part of message.content) {
+          if (part.type === "text") {
+            messageContent.push({ type: "output_text", text: part.text });
+            continue;
+          }
+          if (part.type === "reasoning") {
+            // Reasoning items are top-level entries in the input array,
+            // not nested inside the assistant message — flush whatever
+            // text we've accumulated first, then push the reasoning item.
+            if (messageContent.length > 0) {
+              input.push({ role: "assistant", content: [...messageContent] });
+              messageContent.length = 0;
+            }
+            const summary: Array<Record<string, unknown>> = [];
+            if (typeof part.text === "string" && part.text.length > 0) {
+              summary.push({ type: "summary_text", text: part.text });
+            }
+            input.push({
+              type: "reasoning",
+              ...(typeof part.signature === "string" ? { encrypted_content: part.signature } : {}),
+              summary,
+            });
+            continue;
+          }
+          // tool-call: flush message content, then push as standalone
+          // function_call item per Responses API shape.
+          if (messageContent.length > 0) {
+            input.push({ role: "assistant", content: [...messageContent] });
+            messageContent.length = 0;
+          }
+          input.push({
+            type: "function_call",
+            call_id: part.toolCallId,
+            name: part.toolName,
+            arguments: stringifyJsonValue(part.input),
+          });
+        }
+        if (messageContent.length > 0) {
+          input.push({ role: "assistant", content: messageContent });
+        }
+        break;
+      }
+      case "tool":
+        for (const part of message.content) {
+          input.push({
+            type: "function_call_output",
+            call_id: part.toolCallId,
+            output: stringifyJsonValue(part.output.value),
+          });
+        }
+        break;
+    }
+  }
+  return {
+    ...(instructionsParts.length > 0 ? { instructions: instructionsParts.join("\n\n") } : {}),
+    input,
+  };
+}
+/**
+ * Tools on the Responses API differ from Chat Completions: instead of
+ * `{ type: "function", function: { name, parameters } }` the function
+ * shape lifts the name/parameters/strict to the top of the entry. Native
+ * tools (web_search, file_search, computer_use, code_interpreter) live
+ * alongside function tools in the same array.
+ */
+function toOpenAIResponsesTools(
+  tools: RuntimeToolDefinition[] | undefined,
+): Array<Record<string, unknown>> | undefined {
+  if (!tools) return undefined;
+  const normalized: Array<Record<string, unknown>> = [];
+  for (const tool of tools) {
+    if (tool.type === "function") {
+      normalized.push({
+        type: "function",
+        name: tool.name,
+        ...(typeof tool.description === "string" ? { description: tool.description } : {}),
+        parameters: unwrapToolInputSchema(tool.inputSchema),
+      });
+      continue;
+    }
+    if (!tool.id.startsWith("openai.")) continue;
+    const providerType = tool.id.slice("openai.".length);
+    if (providerType.length === 0) continue;
+    normalized.push({
+      type: providerType,
+      ...toSnakeCaseRecord(tool.args),
+    });
+  }
+  return normalized.length > 0 ? normalized : undefined;
+}
+function buildOpenAIResponsesRequest(
+  modelId: string,
+  providerName: string,
+  options: OpenAICompatibleLanguageOptions,
+  stream: boolean,
+  warnings: WarningCollector,
+): OpenAIResponsesRequest {
+  const isReasoningModel = isOpenAIReasoningModel(modelId);
+  const reasoningEffort = resolveOpenAIReasoningEffort(options.reasoning);
+  const reasoningEnabled = isReasoningModel || reasoningEffort !== undefined;
+  // Same param-sanitization rules as Chat Completions: reasoning models
+  // reject sampling params. Drop with a warning.
+  if (options.topK !== undefined) {
+    warnings.push({
+      type: "unsupported-setting",
+      provider: "openai",
+      setting: "topK",
+      details: "OpenAI Responses API does not expose top_k; the value was dropped.",
+    });
+  }
+  if (reasoningEnabled) {
+    const dropped: Array<[keyof typeof options, string]> = [
+      ["temperature", "temperature"],
+      ["topP", "top_p"],
+      ["presencePenalty", "presence_penalty"],
+      ["frequencyPenalty", "frequency_penalty"],
+    ];
+    for (const [key, openaiName] of dropped) {
+      if (options[key] !== undefined) {
+        warnings.push({
+          type: "unsupported-setting",
+          provider: "openai",
+          setting: key,
+          details:
+            `Dropped because OpenAI reasoning models reject ${openaiName}. Reasoning was active for this request.`,
+        });
+      }
+    }
+  }
+  const { instructions, input } = toOpenAIResponsesInput(options.prompt);
+  const responsesTools = toOpenAIResponsesTools(options.tools);
+  const body: OpenAIResponsesRequest = {
+    model: modelId,
+    input,
+    ...(instructions !== undefined ? { instructions } : {}),
+    ...(stream ? { stream: true } : {}),
+    ...(options.maxOutputTokens !== undefined
+      ? { max_output_tokens: options.maxOutputTokens }
+      : {}),
+    ...(!reasoningEnabled && options.temperature !== undefined
+      ? { temperature: options.temperature }
+      : {}),
+    ...(!reasoningEnabled && options.topP !== undefined ? { top_p: options.topP } : {}),
+    ...(responsesTools ? { tools: responsesTools } : {}),
+    ...(options.toolChoice !== undefined ? { tool_choice: options.toolChoice } : {}),
+    // The Responses API surfaces reasoning effort + summary verbosity
+    // in a structured `reasoning` object instead of a flat field. We
+    // request "auto" summary so callers see structured summary parts
+    // without having to opt into them per request.
+    ...(reasoningEffort !== undefined
+      ? { reasoning: { effort: reasoningEffort, summary: "auto" } }
+      : {}),
+    ...(typeof options.userId === "string" && options.userId.length > 0
+      ? { user: options.userId }
+      : {}),
+    ...(options.serviceTier !== undefined ? { service_tier: options.serviceTier } : {}),
+    ...(options.parallelToolCalls !== undefined
+      ? { parallel_tool_calls: options.parallelToolCalls }
+      : {}),
+    // Responses API uses `text.format` instead of Chat Completions'
+    // `response_format`. The shape is similar but nested under `text`.
+    ...(options.responseFormat && options.responseFormat.type !== "text"
+      ? {
+        text: {
+          format: options.responseFormat.type === "json" ? { type: "json_object" } : {
+            type: "json_schema",
+            name: options.responseFormat.name,
+            ...(typeof options.responseFormat.description === "string"
+              ? { description: options.responseFormat.description }
+              : {}),
+            schema: unwrapToolInputSchema(options.responseFormat.schema),
+            ...(options.responseFormat.strict !== undefined
+              ? { strict: options.responseFormat.strict }
+              : {}),
+          },
+        },
+      }
+      : {}),
+  };
+  Object.assign(body, readProviderOptions(options.providerOptions, "openai", providerName));
+  return body;
+}
+/**
+ * The Responses API uses `input_tokens` / `output_tokens` field names
+ * instead of Chat Completions' `prompt_tokens` / `completion_tokens`.
+ * It also nests cached input tokens under `input_tokens_details` and
+ * exposes reasoning tokens via `output_tokens_details.reasoning_tokens`.
+ */
+function extractOpenAIResponsesUsage(payload: unknown): RuntimeUsage | undefined {
+  const record = readRecord(payload);
+  // Streaming usage lives on response.completed inside `response.usage`;
+  // non-streaming has it at the top level.
+  const responseRecord = readRecord(record?.response);
+  const usage = readRecord(responseRecord?.usage) ?? readRecord(record?.usage);
+  if (!usage) return undefined;
+  const inputTokens = typeof usage.input_tokens === "number" ? usage.input_tokens : undefined;
+  const outputTokens = typeof usage.output_tokens === "number" ? usage.output_tokens : undefined;
+  const totalTokens = typeof usage.total_tokens === "number"
+    ? usage.total_tokens
+    : (inputTokens !== undefined || outputTokens !== undefined
+      ? (inputTokens ?? 0) + (outputTokens ?? 0)
+      : undefined);
+  const inputDetails = readRecord(usage.input_tokens_details);
+  const cachedTokens = inputDetails?.cached_tokens;
+  return {
+    inputTokens,
+    outputTokens,
+    totalTokens,
+    ...(typeof cachedTokens === "number" ? { cacheReadInputTokens: cachedTokens } : {}),
+  };
+}
+function normalizeOpenAIResponsesFinishReason(
+  raw: unknown,
+): string | { unified: string; raw: string } | null {
+  if (typeof raw !== "string") return null;
+  switch (raw) {
+    case "completed":
+      return { unified: "stop", raw };
+    case "incomplete":
+      return { unified: "length", raw };
+    case "failed":
+      return { unified: "error", raw };
+    case "in_progress":
+      return null;
+    default:
+      return raw;
+  }
+}
+type OpenAIResponsesContentPart =
+  | { type: "text"; text: string }
+  | {
+    type: "reasoning";
+    summaries?: Array<{ id?: string; text: string }>;
+    signature?: string;
+  }
+  | { type: "tool-call"; toolCallId: string; toolName: string; input: string };
+function buildOpenAIResponsesGenerateResult(payload: unknown): {
+  content: OpenAIResponsesContentPart[];
+  finishReason?: string | { unified: string; raw: string } | null;
+  usage?: RuntimeUsage;
+} {
+  const record = readRecord(payload);
+  const output = Array.isArray(record?.output) ? record.output : [];
+  const content: OpenAIResponsesContentPart[] = [];
+  for (const item of output) {
+    const itemRecord = readRecord(item);
+    const itemType = typeof itemRecord?.type === "string" ? itemRecord.type : undefined;
+    if (itemType === "message" && Array.isArray(itemRecord?.content)) {
+      // A message item bundles one or more output_text parts. Concat
+      // their texts into a single text content entry.
+      let text = "";
+      for (const part of itemRecord.content) {
+        const p = readRecord(part);
+        if (typeof p?.type === "string" && p.type === "output_text" && typeof p.text === "string") {
+          text += p.text;
+        }
+      }
+      if (text.length > 0) {
+        content.push({ type: "text", text });
+      }
+      continue;
+    }
+    if (itemType === "function_call") {
+      content.push({
+        type: "tool-call",
+        toolCallId: typeof itemRecord?.call_id === "string"
+          ? itemRecord.call_id
+          : (typeof itemRecord?.id === "string" ? itemRecord.id : ""),
+        toolName: typeof itemRecord?.name === "string" ? itemRecord.name : "",
+        input: typeof itemRecord?.arguments === "string"
+          ? itemRecord.arguments
+          : stringifyJsonValue(itemRecord?.arguments ?? {}),
+      });
+      continue;
+    }
+    if (itemType === "reasoning") {
+      const summary = Array.isArray(itemRecord?.summary) ? itemRecord.summary : [];
+      const summaries: Array<{ id?: string; text: string }> = [];
+      for (const s of summary) {
+        const sr = readRecord(s);
+        if (typeof sr?.text === "string" && sr.text.length > 0) {
+          summaries.push({
+            ...(typeof sr?.id === "string" ? { id: sr.id } : {}),
+            text: sr.text,
+          });
+        }
+      }
+      content.push({
+        type: "reasoning",
+        ...(summaries.length > 0 ? { summaries } : {}),
+        ...(typeof itemRecord?.encrypted_content === "string"
+          ? { signature: itemRecord.encrypted_content }
+          : {}),
+      });
+      continue;
+    }
+  }
+  return {
+    content,
+    finishReason: normalizeOpenAIResponsesFinishReason(record?.status),
+    usage: extractOpenAIResponsesUsage(payload),
+  };
+}
+type OpenAIResponsesStreamReasoningState = {
+  id: string;
+  emittedStart: boolean;
+};
+type OpenAIResponsesStreamFunctionCallState = {
+  id: string;
+  toolCallId: string;
+  name: string;
+  arguments: string;
+};
+/**
+ * Parse the Responses API streaming event grammar into the same UI part
+ * shapes the existing OpenAI / Anthropic / Google streams emit. The
+ * Responses API uses a strict event-typed protocol — every event has a
+ * `type` field naming the lifecycle phase — instead of the loose
+ * `delta`-based shape Chat Completions uses.
+ */
+async function* streamOpenAIResponsesParts(
+  stream: ReadableStream<Uint8Array>,
+): AsyncIterable<unknown> {
+  const decoder = new TextDecoder();
+  let buffer = "";
+  const reasoningBlocks = new Map<string, OpenAIResponsesStreamReasoningState>();
+  const functionCalls = new Map<string, OpenAIResponsesStreamFunctionCallState>();
+  const startedToolCalls = new Set<string>();
+  let finishReason: string | { unified: string; raw: string } | null = null;
+  let usage: RuntimeUsage | undefined;
+  let reasoningCounter = 0;
+  for await (const chunk of stream) {
+    buffer += decoder.decode(chunk, { stream: true });
+    const parsed = parseSseChunk(buffer);
+    buffer = parsed.remainder;
+    for (const event of parsed.events) {
+      if (event === "[DONE]") continue;
+      const record = readRecord(event);
+      const type = typeof record?.type === "string" ? record.type : undefined;
+      if (!type) continue;
+      // response.output_item.added: a new output item begins. Track
+      // function_call items so their argument deltas can be attributed,
+      // and reasoning items so summary deltas can group correctly.
+      if (type === "response.output_item.added") {
+        const item = readRecord(record?.item);
+        const itemType = typeof item?.type === "string" ? item.type : undefined;
+        const itemId = typeof item?.id === "string" ? item.id : undefined;
+        if (itemType === "function_call" && itemId) {
+          const callId = typeof item?.call_id === "string" ? item.call_id : itemId;
+          const name = typeof item?.name === "string" ? item.name : "";
+          functionCalls.set(itemId, {
+            id: itemId,
+            toolCallId: callId,
+            name,
+            arguments: "",
+          });
+        }
+        if (itemType === "reasoning" && itemId) {
+          reasoningBlocks.set(itemId, {
+            id: `reasoning-${reasoningCounter++}`,
+            emittedStart: false,
+          });
+        }
+        continue;
+      }
+      // response.output_text.delta: text chunk for a message item.
+      if (type === "response.output_text.delta" && typeof record?.delta === "string") {
+        if (record.delta.length > 0) {
+          yield { type: "text-delta", delta: record.delta };
+        }
+        continue;
+      }
+      // response.reasoning_summary_text.delta: reasoning summary text
+      // chunk. The first delta on an item lazily emits the
+      // reasoning-start event so callers can group deltas into a part.
+      if (type === "response.reasoning_summary_text.delta" && typeof record?.delta === "string") {
+        const itemId = typeof record?.item_id === "string" ? record.item_id : undefined;
+        const state = itemId ? reasoningBlocks.get(itemId) : undefined;
+        if (state && record.delta.length > 0) {
+          if (!state.emittedStart) {
+            yield { type: "reasoning-start", id: state.id };
+            state.emittedStart = true;
+          }
+          yield { type: "reasoning-delta", id: state.id, delta: record.delta };
+        }
+        continue;
+      }
+      // response.function_call_arguments.delta: tool call argument
+      // chunk. The first delta lazily emits tool-input-start.
+      if (type === "response.function_call_arguments.delta" && typeof record?.delta === "string") {
+        const itemId = typeof record?.item_id === "string" ? record.item_id : undefined;
+        const state = itemId ? functionCalls.get(itemId) : undefined;
+        if (state && record.delta.length > 0) {
+          if (!startedToolCalls.has(state.id)) {
+            yield {
+              type: "tool-input-start",
+              id: state.toolCallId,
+              toolName: state.name,
+            };
+            startedToolCalls.add(state.id);
+          }
+          state.arguments += record.delta;
+          yield {
+            type: "tool-input-delta",
+            id: state.toolCallId,
+            delta: record.delta,
+          };
+        }
+        continue;
+      }
+      // response.output_item.done: an item has finished emitting deltas.
+      // Close any reasoning or function-call streams that were open.
+      if (type === "response.output_item.done") {
+        const item = readRecord(record?.item);
+        const itemType = typeof item?.type === "string" ? item.type : undefined;
+        const itemId = typeof item?.id === "string" ? item.id : undefined;
+        if (itemType === "reasoning" && itemId) {
+          const state = reasoningBlocks.get(itemId);
+          if (state?.emittedStart) {
+            yield { type: "reasoning-end", id: state.id };
+          }
+          reasoningBlocks.delete(itemId);
+        }
+        if (itemType === "function_call" && itemId) {
+          const state = functionCalls.get(itemId);
+          if (state) {
+            yield {
+              type: "tool-call",
+              toolCallId: state.toolCallId,
+              toolName: state.name,
+              input: state.arguments,
+            };
+          }
+          functionCalls.delete(itemId);
+        }
+        continue;
+      }
+      // response.completed: terminal event with the final response object
+      // (status + usage). Capture both for the final finish part.
+      if (type === "response.completed") {
+        usage = extractOpenAIResponsesUsage(record) ?? usage;
+        const responseRecord = readRecord(record?.response);
+        finishReason = normalizeOpenAIResponsesFinishReason(responseRecord?.status);
+        continue;
+      }
+      if (type === "response.failed" || type === "response.incomplete") {
+        const responseRecord = readRecord(record?.response);
+        finishReason = normalizeOpenAIResponsesFinishReason(responseRecord?.status) ??
+          (type === "response.failed"
+            ? { unified: "error", raw: "failed" }
+            : { unified: "length", raw: "incomplete" });
+        usage = extractOpenAIResponsesUsage(record) ?? usage;
+        continue;
+      }
+    }
+  }
+  // Close any reasoning streams still open at end-of-stream (defensive
+  // — a clean Responses API stream always closes them via output_item.done).
+  for (const state of reasoningBlocks.values()) {
+    if (state.emittedStart) {
+      yield { type: "reasoning-end", id: state.id };
+    }
+  }
+  yield {
+    type: "finish",
+    finishReason,
+    ...(usage ? { usage } : {}),
+  };
+}
+export function createOpenAIResponsesRuntime(
+  config: OpenAIRuntimeConfig,
+  modelId: string,
+): ModelRuntime {
+  const fetchImpl = config.fetch ?? globalThis.fetch;
+  return {
+    provider: config.name ?? "openai",
+    modelId,
+    specificationVersion: "v3",
+    supportedUrls: {},
+    doGenerate(optionsForRuntime: unknown) {
+      const options = optionsForRuntime as OpenAICompatibleLanguageOptions;
+      const url = getOpenAIResponsesUrl(config.baseURL);
+      const warnings = createWarningCollector();
+      const body = buildOpenAIResponsesRequest(
+        modelId,
+        config.name ?? "openai",
+        options,
+        false,
+        warnings,
+      );
+      return requestJson({
+        url,
+        fetchImpl,
+        providerLabel: config.name ?? "openai",
+        providerKind: "openai",
+        init: {
+          method: "POST",
+          headers: createRequestHeaders({
+            apiKeyHeaderName: "authorization",
+            apiKey: `Bearer ${config.apiKey}`,
+            extraHeaders: options.headers,
+          }),
+          body: JSON.stringify(body),
+          signal: options.abortSignal,
+        },
+      }).then((payload) => {
+        const drained = warnings.drain();
+        return {
+          ...buildOpenAIResponsesGenerateResult(payload),
+          ...(drained.length > 0 ? { warnings: drained } : {}),
+        };
+      });
+    },
+    doStream(optionsForRuntime: unknown) {
+      const options = optionsForRuntime as OpenAICompatibleLanguageOptions;
+      const url = getOpenAIResponsesUrl(config.baseURL);
+      const warnings = createWarningCollector();
+      const body = buildOpenAIResponsesRequest(
+        modelId,
+        config.name ?? "openai",
+        options,
+        true,
+        warnings,
+      );
+      return requestStream({
+        url,
+        fetchImpl,
+        providerLabel: config.name ?? "openai",
+        providerKind: "openai",
+        init: {
+          method: "POST",
+          headers: createRequestHeaders({
+            apiKeyHeaderName: "authorization",
+            apiKey: `Bearer ${config.apiKey}`,
+            extraHeaders: options.headers,
+          }),
+          body: JSON.stringify(body),
+          signal: options.abortSignal,
+        },
+      }).then((responseStream) => {
+        const drained = warnings.drain();
+        return {
+          stream: ReadableStream.from(streamOpenAIResponsesParts(responseStream)),
+          ...(drained.length > 0 ? { warnings: drained } : {}),
+        };
+      });
     },
   };
 }
@@ -1843,16 +3573,19 @@ export function createAnthropicModelRuntime(
     doGenerate(optionsForRuntime: unknown) {
       const options = optionsForRuntime as OpenAICompatibleLanguageOptions;
       const url = getAnthropicMessagesUrl(config.baseURL);
+      const warnings = createWarningCollector();
       const body = buildAnthropicMessagesRequest(
         modelId,
         config.name ?? "anthropic",
         options,
         false,
+        warnings,
       );
       return requestJson({
         url,
         fetchImpl,
         providerLabel: config.name ?? "anthropic",
+        providerKind: "anthropic",
         init: {
           method: "POST",
           headers: createAnthropicRequestHeaders({
@@ -1863,21 +3596,30 @@ export function createAnthropicModelRuntime(
           body: JSON.stringify(body),
           signal: options.abortSignal,
         },
-      }).then(buildAnthropicGenerateResult);
+      }).then((payload) => {
+        const drained = warnings.drain();
+        return {
+          ...buildAnthropicGenerateResult(payload),
+          ...(drained.length > 0 ? { warnings: drained } : {}),
+        };
+      });
     },
     doStream(optionsForRuntime: unknown) {
       const options = optionsForRuntime as OpenAICompatibleLanguageOptions;
       const url = getAnthropicMessagesUrl(config.baseURL);
+      const warnings = createWarningCollector();
       const body = buildAnthropicMessagesRequest(
         modelId,
         config.name ?? "anthropic",
         options,
         true,
+        warnings,
       );
       return requestStream({
         url,
         fetchImpl,
         providerLabel: config.name ?? "anthropic",
+        providerKind: "anthropic",
         init: {
           method: "POST",
           headers: createAnthropicRequestHeaders({
@@ -1888,9 +3630,13 @@ export function createAnthropicModelRuntime(
           body: JSON.stringify(body),
           signal: options.abortSignal,
         },
-      }).then((responseStream) => ({
-        stream: ReadableStream.from(streamAnthropicCompatibleParts(responseStream)),
-      }));
+      }).then((responseStream) => {
+        const drained = warnings.drain();
+        return {
+          stream: ReadableStream.from(streamAnthropicCompatibleParts(responseStream)),
+          ...(drained.length > 0 ? { warnings: drained } : {}),
+        };
+      });
     },
   };
 }
@@ -1908,11 +3654,17 @@ export function createGoogleModelRuntime(
     doGenerate(optionsForRuntime: unknown) {
       const options = optionsForRuntime as OpenAICompatibleLanguageOptions;
       const url = getGoogleGenerateContentUrl(config.baseURL, modelId);
-      const body = buildGoogleGenerateContentRequest(config.name ?? "google", options);
+      const warnings = createWarningCollector();
+      const body = buildGoogleGenerateContentRequest(
+        config.name ?? "google",
+        options,
+        warnings,
+      );
       return requestJson({
         url,
         fetchImpl,
         providerLabel: config.name ?? "google",
+        providerKind: "google",
         init: {
           method: "POST",
           headers: createRequestHeaders({
@@ -1923,16 +3675,28 @@ export function createGoogleModelRuntime(
           body: JSON.stringify(body),
           signal: options.abortSignal,
         },
-      }).then(buildGoogleGenerateResult);
+      }).then((payload) => {
+        const drained = warnings.drain();
+        return {
+          ...buildGoogleGenerateResult(payload),
+          ...(drained.length > 0 ? { warnings: drained } : {}),
+        };
+      });
     },
     doStream(optionsForRuntime: unknown) {
       const options = optionsForRuntime as OpenAICompatibleLanguageOptions;
       const url = getGoogleStreamGenerateContentUrl(config.baseURL, modelId);
-      const body = buildGoogleGenerateContentRequest(config.name ?? "google", options);
+      const warnings = createWarningCollector();
+      const body = buildGoogleGenerateContentRequest(
+        config.name ?? "google",
+        options,
+        warnings,
+      );
       return requestStream({
         url,
         fetchImpl,
         providerLabel: config.name ?? "google",
+        providerKind: "google",
         init: {
           method: "POST",
           headers: createRequestHeaders({
@@ -1943,9 +3707,13 @@ export function createGoogleModelRuntime(
           body: JSON.stringify(body),
           signal: options.abortSignal,
         },
-      }).then((responseStream) => ({
-        stream: ReadableStream.from(streamGoogleCompatibleParts(responseStream)),
-      }));
+      }).then((responseStream) => {
+        const drained = warnings.drain();
+        return {
+          stream: ReadableStream.from(streamGoogleCompatibleParts(responseStream)),
+          ...(drained.length > 0 ? { warnings: drained } : {}),
+        };
+      });
     },
   };
 }
@@ -1973,6 +3741,7 @@ export function createOpenAIEmbeddingRuntime(
         url,
         fetchImpl,
         providerLabel: config.name ?? "openai",
+        providerKind: "openai",
         init: {
           method: "POST",
           headers: {
@@ -2021,6 +3790,7 @@ export function createGoogleEmbeddingRuntime(
           url,
           fetchImpl,
           providerLabel: config.name ?? "google",
+          providerKind: "google",
           init: {
             method: "POST",
             headers: {