veryfront 0.1.207 → 0.1.209

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -36,6 +36,18 @@ type RuntimePromptMessage =
36
36
  input: unknown;
37
37
  providerExecuted?: boolean;
38
38
  }
39
+ | {
40
+ // Anthropic thinking block replay. Carries the original signed
41
+ // thinking trace so that on the next turn Anthropic can verify
42
+ // the signature and let Claude continue reasoning from the same
43
+ // point. `text` + `signature` are the normal pair for an
44
+ // un-redacted thinking block; `redactedData` is set instead of
45
+ // both when Anthropic returned an encrypted opaque payload.
46
+ type: "reasoning";
47
+ text?: string;
48
+ signature?: string;
49
+ redactedData?: string;
50
+ }
39
51
  >;
40
52
  }
41
53
  | {
@@ -60,6 +72,67 @@ type RuntimeToolDefinition =
60
72
  id: `${string}.${string}`;
61
73
  args: Record<string, unknown>;
62
74
  };
75
+ /**
76
+ * TTL for a single prompt-cache breakpoint.
77
+ *
78
+ * `true` and `"5m"` both map to Anthropic's default ephemeral (5-minute) cache.
79
+ * `"1h"` maps to the extended 1-hour cache at a 2x write cost. Callers can
80
+ * pick per breakpoint target.
81
+ */
82
+ type ProviderCacheTtl = boolean | "5m" | "1h";
83
+
84
+ /**
85
+ * Per-provider prompt / context caching controls.
86
+ *
87
+ * For Anthropic, flipping these on emits `cache_control: { type: "ephemeral" }`
88
+ * breakpoints on the assembled system prompt and/or the last tool definition
89
+ * sent to the Messages API, enabling Anthropic's explicit prompt cache.
90
+ *
91
+ * OpenAI's prompt cache is automatic on gpt-4o+ and has no request-side
92
+ * directive to emit, so this option is a no-op for the OpenAI runtime. Google
93
+ * uses a separate `cachedContent` resource model that is intentionally not
94
+ * covered by this option (it belongs on a dedicated Gemini-specific surface).
95
+ */
96
+ type ProviderCacheControlOption = {
97
+ /**
98
+ * Attach a cache breakpoint to the final system-prompt text block.
99
+ * Use when the system prompt is large and reused across requests.
100
+ */
101
+ system?: ProviderCacheTtl;
102
+ /**
103
+ * Attach a cache breakpoint to the last tool definition in `tools`.
104
+ * Use when the tool schemas are large and identical across requests.
105
+ */
106
+ tools?: ProviderCacheTtl;
107
+ };
108
+
109
+ /**
110
+ * Unified effort level for extended reasoning / thinking. Maps to
111
+ * per-provider knobs: Anthropic `thinking.budget_tokens`, OpenAI
112
+ * `reasoning_effort`, Gemini `thinkingConfig.thinkingBudget`.
113
+ */
114
+ type ProviderReasoningEffort = "low" | "medium" | "high" | "max";
115
+
116
+ /**
117
+ * Unified reasoning / thinking request option.
118
+ *
119
+ * Setting `enabled: true` turns on extended thinking on providers that
120
+ * support it (Anthropic Claude 4.x, OpenAI o-series, Gemini 2.5+). The
121
+ * `effort` field picks a coarse budget; when `budgetTokens` is set it
122
+ * wins for providers that take a numeric budget (Anthropic, Gemini).
123
+ *
124
+ * Providers that do not support reasoning treat this as a no-op. On
125
+ * Anthropic + OpenAI, enabling reasoning also disables sampling params
126
+ * that the providers reject in combination (`temperature`, `topP`,
127
+ * `topK`, `presencePenalty`, `frequencyPenalty`) — silently dropping
128
+ * them rather than failing the request.
129
+ */
130
+ type ProviderReasoningOption = {
131
+ enabled?: boolean;
132
+ effort?: ProviderReasoningEffort;
133
+ budgetTokens?: number;
134
+ };
135
+
63
136
  type OpenAICompatibleLanguageOptions = {
64
137
  prompt: RuntimePromptMessage[];
65
138
  maxOutputTokens?: number;
@@ -76,6 +149,128 @@ type OpenAICompatibleLanguageOptions = {
76
149
  providerOptions?: Record<string, unknown>;
77
150
  includeRawChunks?: boolean;
78
151
  abortSignal?: AbortSignal;
152
+ /**
153
+ * Per-provider prompt / context caching controls. See
154
+ * {@link ProviderCacheControlOption}. When unset, caching behaviour is
155
+ * unchanged on every provider.
156
+ */
157
+ cacheControl?: ProviderCacheControlOption;
158
+ /**
159
+ * Enable extended reasoning / thinking on providers that support it.
160
+ * See {@link ProviderReasoningOption}. When unset, reasoning behaviour
161
+ * is unchanged on every provider.
162
+ */
163
+ reasoning?: ProviderReasoningOption;
164
+ /**
165
+ * Stable per-user identifier for rate-limiting, abuse detection, and
166
+ * billing attribution. Maps to:
167
+ * - Anthropic: `metadata.user_id`
168
+ * - OpenAI: `user`
169
+ * - Google: `labels.user_id` (when {@link requestLabels} is unset)
170
+ */
171
+ userId?: string;
172
+ /**
173
+ * Provider-specific label map for Google Gemini's `labels` field.
174
+ * Anthropic and OpenAI don't have an arbitrary-label equivalent, so
175
+ * this is intentionally Google-only. When unset, no labels are sent.
176
+ */
177
+ requestLabels?: Record<string, string>;
178
+ /**
179
+ * OpenAI-specific. Maps to the `service_tier` field on Chat Completions
180
+ * which trades latency for cost. Documented values:
181
+ *
182
+ * - `default` — standard processing (default if unset)
183
+ * - `flex` — lower-priority queue, lower per-token cost, longer
184
+ * expected latency. Useful for batchy or non-interactive workloads.
185
+ * - `scale` — reserved-capacity tier with strict latency SLOs.
186
+ * - `auto` — let OpenAI pick.
187
+ *
188
+ * Forwarded verbatim. Anthropic and Google have no equivalent and
189
+ * the field is silently omitted on those providers.
190
+ */
191
+ serviceTier?: "auto" | "default" | "flex" | "scale";
192
+ /**
193
+ * OpenAI-specific. When `false`, OpenAI runs tool calls sequentially
194
+ * instead of in parallel. Useful for ordered side effects where
195
+ * concurrent calls would race. Default behaviour (unset) is parallel.
196
+ */
197
+ parallelToolCalls?: boolean;
198
+ /**
199
+ * Structured-output response format. Maps to OpenAI's `response_format`
200
+ * field on Chat Completions (and Responses). Three variants:
201
+ *
202
+ * - `{ type: "text" }` — the default (no constraint).
203
+ * - `{ type: "json" }` — emits OpenAI's `response_format:
204
+ * { type: "json_object" }` to force the model to return valid JSON.
205
+ * - `{ type: "json_schema", name, schema, strict? }` — emits
206
+ * OpenAI's `response_format: { type: "json_schema", json_schema: {
207
+ * name, schema, strict } }` for fully constrained structured
208
+ * outputs (gpt-4o-2024-08-06+).
209
+ *
210
+ * On Anthropic and Google this option emits an "unsupported-setting"
211
+ * warning when set to anything other than `text` (those providers
212
+ * have their own structured-output surfaces and need a dedicated
213
+ * follow-up to wire them in).
214
+ */
215
+ responseFormat?:
216
+ | { type: "text" }
217
+ | { type: "json" }
218
+ | {
219
+ type: "json_schema";
220
+ name: string;
221
+ schema: unknown;
222
+ description?: string;
223
+ strict?: boolean;
224
+ };
225
+ /**
226
+ * Anthropic-specific. `container` field for programmatic tool calling
227
+ * and agent skills. Anthropic uses this to scope a session to a
228
+ * sandboxed container (e.g. for Computer Use, code execution
229
+ * sandboxes, or skills loaded from a container). Forwarded verbatim.
230
+ *
231
+ * The shape varies — string container id or a structured object
232
+ * depending on the feature. Caller passes whatever Anthropic's docs
233
+ * specify for the target feature.
234
+ */
235
+ anthropicContainer?: unknown;
236
+ /**
237
+ * Google-specific. Reference to a previously-created Gemini cached
238
+ * content resource (created via the separate caches API) to attach
239
+ * to this request. Resource name format:
240
+ * `cachedContents/<id>`. See https://ai.google.dev/gemini-api/docs/caching.
241
+ *
242
+ * Cache creation itself is out of scope for the runtime — callers
243
+ * use the Gemini REST API or SDK to create the cache, then pass the
244
+ * resource name here on each subsequent generate call to attach the
245
+ * cached prefix and avoid re-paying for it.
246
+ */
247
+ googleCachedContent?: string;
248
+ /**
249
+ * Google-specific. Per-request safety filter configuration for
250
+ * Gemini. Each entry pairs a HARM_CATEGORY_* with a threshold
251
+ * (BLOCK_NONE / BLOCK_LOW_AND_ABOVE / BLOCK_MEDIUM_AND_ABOVE /
252
+ * BLOCK_ONLY_HIGH). Forwarded verbatim as the `safetySettings`
253
+ * field. See https://ai.google.dev/gemini-api/docs/safety-settings.
254
+ */
255
+ googleSafetySettings?: Array<{
256
+ category: string;
257
+ threshold: string;
258
+ }>;
259
+ /**
260
+ * Anthropic-specific. Native MCP server definitions to pass directly
261
+ * on the Messages API request body. Lets callers register MCP servers
262
+ * server-side instead of reloading them into local function tools.
263
+ *
264
+ * Caller must opt into the MCP beta by adding the matching header to
265
+ * `headers`, e.g. `{ "anthropic-beta": "mcp-client-2025-04-04" }`.
266
+ * Without that header Anthropic will reject the request.
267
+ *
268
+ * Each entry is forwarded with camelCase keys converted to snake_case
269
+ * so `authorizationToken` → `authorization_token`,
270
+ * `toolConfiguration.allowedTools` → `tool_configuration.allowed_tools`,
271
+ * etc.
272
+ */
273
+ mcpServers?: Array<Record<string, unknown>>;
79
274
  };
80
275
  type OpenAICompatibleChatMessage =
81
276
  | { role: "system"; content: string }
@@ -142,7 +337,12 @@ type AnthropicCompatibleRequest = {
142
337
  messages: AnthropicCompatibleMessage[];
143
338
  max_tokens: number;
144
339
  stream?: boolean;
145
- system?: string;
340
+ /**
341
+ * String form is the classic shorthand. Array-of-blocks form is required
342
+ * when the system prompt carries a cache_control breakpoint, because
343
+ * cache_control lives on an individual content block, not on a raw string.
344
+ */
345
+ system?: string | Array<Record<string, unknown>>;
146
346
  temperature?: number;
147
347
  top_p?: number;
148
348
  stop_sequences?: string[];
@@ -168,9 +368,7 @@ type GoogleCompatibleRequest = {
168
368
  systemInstruction?: {
169
369
  parts: Array<{ text: string }>;
170
370
  };
171
- tools?: Array<{
172
- functionDeclarations: Array<Record<string, unknown>>;
173
- }>;
371
+ tools?: Array<Record<string, unknown>>;
174
372
  toolConfig?: {
175
373
  functionCallingConfig: Record<string, unknown>;
176
374
  };
@@ -198,6 +396,10 @@ function getOpenAIChatCompletionsUrl(baseURL?: string): string {
198
396
  return joinUrl(baseURL ?? DEFAULT_OPENAI_BASE_URL, "chat/completions");
199
397
  }
200
398
 
399
/**
 * Builds the URL of the OpenAI Responses endpoint.
 *
 * @param baseURL Optional base URL override; `DEFAULT_OPENAI_BASE_URL` is
 *   used when it is `undefined`.
 * @returns The base URL joined with the `responses` path segment.
 */
function getOpenAIResponsesUrl(baseURL?: string): string {
  const root = baseURL ?? DEFAULT_OPENAI_BASE_URL;
  return joinUrl(root, "responses");
}
402
+
201
403
  function getGoogleGenerateContentUrl(baseURL: string | undefined, modelId: string): string {
202
404
  return joinUrl(
203
405
  baseURL ?? DEFAULT_GOOGLE_BASE_URL,
@@ -287,9 +489,203 @@ function extractGoogleUsageTokens(payload: unknown): number | undefined {
287
489
  return typeof promptTokenCount === "number" ? promptTokenCount : undefined;
288
490
  }
289
491
 
290
- async function readErrorMessage(response: Response): Promise<string> {
291
- const text = await response.text();
292
- return text.trim() || `${response.status} ${response.statusText}`.trim();
492
+ type ProviderKind = "anthropic" | "openai" | "google";
493
+
494
+ /**
495
+ * Structured warning emitted when a provider runtime drops or rewrites a
496
+ * caller-provided option. Mirrors the AI ecosystem convention (Vercel AI
497
+ * SDK, LangChain) of returning `unsupported-setting` warnings on the
498
+ * runtime result so callers can discover silently-dropped fields without
499
+ * having to read the source.
500
+ */
501
+ export type ProviderWarning = {
502
+ type: "unsupported-setting" | "other";
503
+ setting?: string;
504
+ details?: string;
505
+ provider: ProviderKind;
506
+ };
507
+
508
+ /**
509
+ * Mutable warning collector handed to per-provider request builders so
510
+ * they can append entries during the build pass instead of plumbing a
511
+ * return-tuple shape through every helper.
512
+ */
513
+ type WarningCollector = {
514
+ push(warning: ProviderWarning): void;
515
+ drain(): ProviderWarning[];
516
+ };
517
+
518
/**
 * Creates an in-memory {@link WarningCollector}.
 *
 * `push` appends a warning to a private array. `drain` returns a shallow
 * copy of everything pushed so far; it does NOT empty the collector, so
 * calling it again returns the same warnings plus anything pushed since.
 */
function createWarningCollector(): WarningCollector {
  const collected: ProviderWarning[] = [];

  const push = (warning: ProviderWarning): void => {
    collected.push(warning);
  };

  // Copy so callers cannot mutate the collector's internal state.
  const drain = (): ProviderWarning[] => [...collected];

  return { push, drain };
}
529
+
530
/**
 * Common base for all typed provider failures.
 *
 * Callers (or a retry wrapper) should look at `retryable` first to decide
 * whether re-issuing the request makes sense. `retryAfterMs` is only
 * assigned when the constructor receives a defined value, i.e. when the
 * provider supplied an explicit delay hint.
 *
 * `name` is taken from the concrete class being constructed, so subclasses
 * report their own class name without overriding anything.
 */
export class ProviderError extends Error {
  readonly provider: ProviderKind;
  readonly status: number;
  readonly retryable: boolean;
  readonly retryAfterMs?: number;

  constructor(options: {
    provider: ProviderKind;
    status: number;
    message: string;
    retryable: boolean;
    retryAfterMs?: number;
  }) {
    super(options.message);

    const { provider, status, retryable, retryAfterMs } = options;
    this.name = new.target.name;
    this.provider = provider;
    this.status = status;
    this.retryable = retryable;

    // Leave the property unassigned when no hint was given.
    if (retryAfterMs !== undefined) {
      this.retryAfterMs = retryAfterMs;
    }
  }
}
559
+
560
+ /** Provider reports it is overloaded (Anthropic 529, OpenAI/Google 503). */
561
+ export class ProviderOverloadedError extends ProviderError {}
562
+
563
+ /** Provider is rate limiting this API key (OpenAI/Google 429 with Retry-After). */
564
+ export class ProviderRateLimitError extends ProviderError {}
565
+
566
+ /** Provider account quota is exhausted — non-retryable. */
567
+ export class ProviderQuotaError extends ProviderError {}
568
+
569
+ /** Non-retryable 4xx/5xx that doesn't fit another bucket. */
570
+ export class ProviderRequestError extends ProviderError {}
571
+
572
/**
 * Parses an HTTP `Retry-After` header value into a delay in milliseconds.
 *
 * Two forms are accepted:
 * - delay-seconds: a non-negative decimal number (a fractional part is
 *   tolerated), e.g. `"2"` → `2000`, `"1.5"` → `1500`.
 * - HTTP-date: converted to the time remaining until that instant,
 *   floored at `0` for dates in the past.
 *
 * @param header Raw header value, or `null` when the header is absent.
 * @returns The delay in milliseconds, or `undefined` when the header is
 *   absent, blank, or not in one of the two accepted forms.
 */
function parseRetryAfterMs(header: string | null): number | undefined {
  const value = header?.trim();
  if (!value) return undefined;

  // Match the numeric form explicitly: `Number()` on its own also accepts
  // hex ("0x10"), exponents ("1e3") and whitespace-only strings (as 0).
  if (/^\d+(?:\.\d+)?$/.test(value)) {
    const seconds = Number(value);
    return Number.isFinite(seconds) ? Math.round(seconds * 1000) : undefined;
  }

  // HTTP-date form (rare in practice for LLM providers). Every HTTP-date
  // format starts with a day name, so require a leading letter; otherwise
  // lenient Date.parse implementations turn inputs like "-5" into a date.
  if (/^[A-Za-z]/.test(value)) {
    const parsed = Date.parse(value);
    if (!Number.isNaN(parsed)) {
      return Math.max(0, parsed - Date.now());
    }
  }

  return undefined;
}
585
+
586
/**
 * Inspect a non-2xx response and build the most specific ProviderError
 * subclass we can.
 *
 * Consumes the response body as text. The trimmed body (or
 * `"<status> <statusText>"` when the body is empty) becomes the error
 * message. The body is also parsed as JSON on a best-effort basis so the
 * provider's error code can disambiguate cases where the HTTP status alone
 * is ambiguous — notably OpenAI `insufficient_quota` vs
 * `rate_limit_exceeded`, which both arrive as 429.
 *
 * Classification:
 * - Anthropic 529, OpenAI/Google 503 → ProviderOverloadedError (retryable)
 * - 429 with OpenAI `insufficient_quota` or Google `RESOURCE_EXHAUSTED`
 *   → ProviderQuotaError (non-retryable)
 * - any other 429, including Anthropic → ProviderRateLimitError (retryable)
 * - everything else → ProviderRequestError (non-retryable)
 *
 * Retryable errors carry `retryAfterMs` when the response had a parseable
 * `retry-after` header.
 *
 * @param provider Which provider produced the response.
 * @param response The failed (non-2xx) fetch response; its body is read here.
 * @returns The constructed error. It is returned, not thrown.
 */
async function buildProviderError(
  provider: ProviderKind,
  response: Response,
): Promise<ProviderError> {
  const rawBody = await response.text();
  const message = rawBody.trim() || `${response.status} ${response.statusText}`.trim();
  const status = response.status;
  const retryAfterMs = parseRetryAfterMs(response.headers.get("retry-after"));

  // Best-effort JSON parse; non-JSON error bodies are expected.
  const parsedBody = (() => {
    try {
      return JSON.parse(rawBody) as Record<string, unknown>;
    } catch {
      return undefined;
    }
  })();
  const errorRecord = readRecord(parsedBody?.error);

  // First string-valued field wins, checked in order: `code`, `type`, `status`.
  const errorCode = [errorRecord?.code, errorRecord?.type, errorRecord?.status]
    .find((candidate): candidate is string => typeof candidate === "string");

  const base = { provider, status, message };
  const retryHint = retryAfterMs !== undefined ? { retryAfterMs } : {};

  // Anthropic 529 = overloaded. Anthropic surfaces this with
  // { error: { type: "overloaded_error" } } in the body.
  // OpenAI / Google 503 = overloaded.
  const isOverloaded = (provider === "anthropic" && status === 529) ||
    ((provider === "openai" || provider === "google") && status === 503);
  if (isOverloaded) {
    return new ProviderOverloadedError({ ...base, retryable: true, ...retryHint });
  }

  if (status === 429) {
    // OpenAI 429 splits based on the error code in the body:
    //   - insufficient_quota → hard quota, non-retryable
    //   - rate_limit_exceeded / tokens_per_min_exceeded → retry with Retry-After
    //
    // Google 429 RESOURCE_EXHAUSTED is treated as a hard quota error so
    // callers don't hot-loop on retries that cannot succeed until the quota
    // resets.
    // NOTE(review): if Google reports per-minute limits with the same
    // status string, those are also classified as non-retryable here —
    // confirm against real responses.
    const isHardQuota = (provider === "openai" && errorCode === "insufficient_quota") ||
      (provider === "google" && errorCode === "RESOURCE_EXHAUSTED");
    if (isHardQuota) {
      return new ProviderQuotaError({ ...base, retryable: false });
    }

    // Every remaining 429 is ordinary rate limiting. This includes
    // Anthropic, which previously fell through to the non-retryable bucket
    // and lost its Retry-After hint.
    return new ProviderRateLimitError({ ...base, retryable: true, ...retryHint });
  }

  return new ProviderRequestError({ ...base, retryable: false });
}
294
690
 
295
691
  async function requestJson(options: {
@@ -297,11 +693,13 @@ async function requestJson(options: {
297
693
  fetchImpl: typeof globalThis.fetch;
298
694
  init: RequestInit;
299
695
  providerLabel: string;
696
+ providerKind: ProviderKind;
300
697
  }): Promise<unknown> {
301
698
  const response = await options.fetchImpl(options.url, options.init);
302
699
  if (!response.ok) {
303
- const message = await readErrorMessage(response);
304
- throw new Error(`${options.providerLabel} request failed: ${message}`);
700
+ const err = await buildProviderError(options.providerKind, response);
701
+ err.message = `${options.providerLabel} request failed: ${err.message}`;
702
+ throw err;
305
703
  }
306
704
 
307
705
  return response.json();
@@ -312,15 +710,22 @@ async function requestStream(options: {
312
710
  fetchImpl: typeof globalThis.fetch;
313
711
  init: RequestInit;
314
712
  providerLabel: string;
713
+ providerKind: ProviderKind;
315
714
  }): Promise<ReadableStream<Uint8Array>> {
316
715
  const response = await options.fetchImpl(options.url, options.init);
317
716
  if (!response.ok) {
318
- const message = await readErrorMessage(response);
319
- throw new Error(`${options.providerLabel} request failed: ${message}`);
717
+ const err = await buildProviderError(options.providerKind, response);
718
+ err.message = `${options.providerLabel} request failed: ${err.message}`;
719
+ throw err;
320
720
  }
321
721
 
322
722
  if (!response.body) {
323
- throw new Error(`${options.providerLabel} request failed: stream body missing`);
723
+ throw new ProviderRequestError({
724
+ provider: options.providerKind,
725
+ status: response.status,
726
+ message: `${options.providerLabel} request failed: stream body missing`,
727
+ retryable: false,
728
+ });
324
729
  }
325
730
 
326
731
  return response.body;
@@ -366,6 +771,11 @@ function toOpenAICompatibleMessages(prompt: RuntimePromptMessage[]): OpenAICompa
366
771
  text += part.text;
367
772
  continue;
368
773
  }
774
+ // OpenAI Chat Completions has no roundtrip slot for Anthropic
775
+ // thinking blocks — they get dropped on replay. Anthropic-only.
776
+ if (part.type === "reasoning") {
777
+ continue;
778
+ }
369
779
 
370
780
  toolCalls.push({
371
781
  id: part.toolCallId,
@@ -473,9 +883,15 @@ function normalizeAnthropicFinishReason(
473
883
  }
474
884
  }
475
885
 
476
- function extractAnthropicUsage(payload: unknown):
477
- | { inputTokens?: number; outputTokens?: number; totalTokens?: number }
478
- | undefined {
886
+ type RuntimeUsage = {
887
+ inputTokens?: number;
888
+ outputTokens?: number;
889
+ totalTokens?: number;
890
+ cacheCreationInputTokens?: number;
891
+ cacheReadInputTokens?: number;
892
+ };
893
+
894
+ function extractAnthropicUsage(payload: unknown): RuntimeUsage | undefined {
479
895
  const record = readRecord(payload);
480
896
  const usage = readRecord(record?.usage);
481
897
  if (!usage) {
@@ -484,6 +900,8 @@ function extractAnthropicUsage(payload: unknown):
484
900
 
485
901
  const inputTokens = usage.input_tokens;
486
902
  const outputTokens = usage.output_tokens;
903
+ const cacheCreationInputTokens = usage.cache_creation_input_tokens;
904
+ const cacheReadInputTokens = usage.cache_read_input_tokens;
487
905
 
488
906
  return {
489
907
  inputTokens: typeof inputTokens === "number" ? inputTokens : undefined,
@@ -492,17 +910,15 @@ function extractAnthropicUsage(payload: unknown):
492
910
  ? (typeof inputTokens === "number" ? inputTokens : 0) +
493
911
  (typeof outputTokens === "number" ? outputTokens : 0)
494
912
  : undefined,
913
+ ...(typeof cacheCreationInputTokens === "number" ? { cacheCreationInputTokens } : {}),
914
+ ...(typeof cacheReadInputTokens === "number" ? { cacheReadInputTokens } : {}),
495
915
  };
496
916
  }
497
917
 
498
918
  function mergeUsage(
499
- current:
500
- | { inputTokens?: number; outputTokens?: number; totalTokens?: number }
501
- | undefined,
502
- next:
503
- | { inputTokens?: number; outputTokens?: number; totalTokens?: number }
504
- | undefined,
505
- ): { inputTokens?: number; outputTokens?: number; totalTokens?: number } | undefined {
919
+ current: RuntimeUsage | undefined,
920
+ next: RuntimeUsage | undefined,
921
+ ): RuntimeUsage | undefined {
506
922
  if (!current) {
507
923
  return next;
508
924
  }
@@ -513,11 +929,16 @@ function mergeUsage(
513
929
 
514
930
  const inputTokens = next.inputTokens ?? current.inputTokens;
515
931
  const outputTokens = next.outputTokens ?? current.outputTokens;
932
+ const cacheCreationInputTokens = next.cacheCreationInputTokens ??
933
+ current.cacheCreationInputTokens;
934
+ const cacheReadInputTokens = next.cacheReadInputTokens ?? current.cacheReadInputTokens;
516
935
 
517
936
  return {
518
937
  inputTokens,
519
938
  outputTokens,
520
939
  totalTokens: (inputTokens ?? 0) + (outputTokens ?? 0),
940
+ ...(cacheCreationInputTokens !== undefined ? { cacheCreationInputTokens } : {}),
941
+ ...(cacheReadInputTokens !== undefined ? { cacheReadInputTokens } : {}),
521
942
  };
522
943
  }
523
944
 
@@ -538,6 +959,26 @@ function toSnakeCaseRecord(record: Record<string, unknown>): Record<string, unkn
538
959
  );
539
960
  }
540
961
 
962
/**
 * Recursively rewrites object keys from camelCase to snake_case.
 *
 * Arrays are mapped element by element, objects get every key converted
 * (each uppercase ASCII letter becomes `_` plus its lowercase form) and
 * every value converted recursively, and all other values are returned
 * unchanged. Used for nested config such as Anthropic `mcp_servers`, where
 * `authorizationToken`, `toolConfiguration` and `allowedTools` all need
 * conversion.
 */
function deepSnakeCase(value: unknown): unknown {
  if (Array.isArray(value)) {
    return value.map((item) => deepSnakeCase(item));
  }

  if (typeof value !== "object" || value === null) {
    return value;
  }

  const convertedEntries: Array<[string, unknown]> = [];
  for (const [key, nested] of Object.entries(value as Record<string, unknown>)) {
    const snakeKey = key.replace(/[A-Z]/g, (match) => `_${match.toLowerCase()}`);
    convertedEntries.push([snakeKey, deepSnakeCase(nested)]);
  }
  return Object.fromEntries(convertedEntries);
}
981
+
541
982
  function pushAnthropicUserContent(
542
983
  messages: AnthropicCompatibleMessage[],
543
984
  content: Array<Record<string, unknown>>,
@@ -558,9 +999,32 @@ function pushAnthropicUserContent(
558
999
  });
559
1000
  }
560
1001
 
1002
/**
 * Translates a {@link ProviderCacheTtl} into the `cache_control` object
 * Anthropic expects on a content block or tool definition.
 *
 * - `undefined` / `false` → `undefined` (no breakpoint requested)
 * - `"1h"` → `{ type: "ephemeral", ttl: "1h" }` (extended 1-hour cache)
 * - `true` / `"5m"` → `{ type: "ephemeral" }` (5-minute default)
 */
function resolveAnthropicCacheControlBlock(
  ttl: ProviderCacheTtl | undefined,
): { type: "ephemeral"; ttl?: "1h" } | undefined {
  switch (ttl) {
    case undefined:
    case false:
      return undefined;
    case "1h":
      return { type: "ephemeral", ttl: "1h" };
    default:
      return { type: "ephemeral" };
  }
}
1020
+
561
1021
  function toAnthropicMessages(
562
1022
  prompt: RuntimePromptMessage[],
563
- ): { system?: string; messages: AnthropicCompatibleMessage[] } {
1023
+ systemCacheControl?: { type: "ephemeral"; ttl?: "1h" },
1024
+ ): {
1025
+ system?: string | Array<Record<string, unknown>>;
1026
+ messages: AnthropicCompatibleMessage[];
1027
+ } {
564
1028
  const systemParts: string[] = [];
565
1029
  const messages: AnthropicCompatibleMessage[] = [];
566
1030
 
@@ -580,14 +1044,33 @@ function toAnthropicMessages(
580
1044
  case "assistant":
581
1045
  messages.push({
582
1046
  role: "assistant",
583
- content: message.content.map((part) =>
584
- part.type === "text" ? { type: "text", text: part.text } : {
1047
+ content: message.content.map((part) => {
1048
+ if (part.type === "text") {
1049
+ return { type: "text", text: part.text };
1050
+ }
1051
+ if (part.type === "reasoning") {
1052
+ // Redacted thinking blocks roundtrip as the encrypted blob
1053
+ // form Anthropic gave us. Plain thinking blocks need the
1054
+ // signature to verify on the server.
1055
+ if (typeof part.redactedData === "string") {
1056
+ return {
1057
+ type: "redacted_thinking",
1058
+ data: part.redactedData,
1059
+ };
1060
+ }
1061
+ return {
1062
+ type: "thinking",
1063
+ thinking: part.text ?? "",
1064
+ ...(typeof part.signature === "string" ? { signature: part.signature } : {}),
1065
+ };
1066
+ }
1067
+ return {
585
1068
  type: "tool_use",
586
1069
  id: part.toolCallId,
587
1070
  name: part.toolName,
588
1071
  input: part.input,
589
- }
590
- ),
1072
+ };
1073
+ }),
591
1074
  });
592
1075
  break;
593
1076
  case "tool":
@@ -603,14 +1086,63 @@ function toAnthropicMessages(
603
1086
  }
604
1087
  }
605
1088
 
606
- return {
607
- ...(systemParts.length > 0 ? { system: systemParts.join("\n\n") } : {}),
608
- messages,
609
- };
1089
+ if (systemParts.length === 0) {
1090
+ return { messages };
1091
+ }
1092
+
1093
+ const joined = systemParts.join("\n\n");
1094
+
1095
+ // Cache-controlled system prompts must use the array-of-blocks form so the
1096
+ // breakpoint lands on an individual content block. Callers that don't opt
1097
+ // in keep the legacy raw-string form for backward compatibility.
1098
+ if (systemCacheControl) {
1099
+ return {
1100
+ system: [{
1101
+ type: "text",
1102
+ text: joined,
1103
+ cache_control: systemCacheControl,
1104
+ }],
1105
+ messages,
1106
+ };
1107
+ }
1108
+
1109
+ return { system: joined, messages };
1110
+ }
1111
+
1112
/**
 * Short-name → latest-versioned-type alias map for Anthropic provider tools.
 *
 * Anthropic tool types are date-stamped (e.g. `code_execution_20260120`) so
 * callers either pin a version or get the latest. We accept both: a caller
 * can pass `anthropic.code_execution` and we map to the latest known version,
 * or pass `anthropic.code_execution_20250522` and we forward verbatim.
 *
 * Versions chosen here are the latest documented releases as of 2026-04-15
 * — see https://docs.claude.com/en/docs/agents-and-tools/tool-use/overview.
 * When Anthropic ships newer versions, update this map.
 */
const ANTHROPIC_TOOL_VERSION_ALIASES: Record<string, string> = {
  code_execution: "code_execution_20260120",
  computer_use: "computer_20250124",
  computer: "computer_20250124",
  text_editor: "text_editor_20250728",
  bash: "bash_20250124",
  memory: "memory_20250818",
  web_search: "web_search_20250305",
  web_fetch: "web_fetch_20250910",
};

/**
 * Resolves the provider-tool type string sent to Anthropic.
 *
 * @param rawType The tool type as supplied by the caller (the part of the
 *   tool id after the `anthropic.` prefix).
 * @returns `rawType` unchanged when it already ends in an 8-digit date
 *   stamp or is not a known alias; otherwise the versioned type from
 *   {@link ANTHROPIC_TOOL_VERSION_ALIASES}.
 */
function resolveAnthropicProviderType(rawType: string): string {
  // Already-versioned types (contain a date stamp suffix) pass through verbatim.
  if (/_\d{8}$/.test(rawType)) {
    return rawType;
  }

  // Own-property check: a bare `ALIASES[rawType]` lookup would also find
  // inherited members, so "constructor", "toString" or "__proto__" would
  // resolve to a function/object instead of a string.
  const isKnownAlias = Object.prototype.hasOwnProperty.call(
    ANTHROPIC_TOOL_VERSION_ALIASES,
    rawType,
  );
  return isKnownAlias ? (ANTHROPIC_TOOL_VERSION_ALIASES[rawType] ?? rawType) : rawType;
}
611
1142
 
612
1143
  function toAnthropicTools(
613
1144
  tools: RuntimeToolDefinition[] | undefined,
1145
+ toolsCacheControl?: { type: "ephemeral"; ttl?: "1h" },
614
1146
  ): Array<Record<string, unknown>> | undefined {
615
1147
  if (!tools) {
616
1148
  return undefined;
@@ -632,19 +1164,35 @@ function toAnthropicTools(
632
1164
  continue;
633
1165
  }
634
1166
 
635
- const providerType = tool.id.slice("anthropic.".length);
636
- if (providerType.length === 0) {
1167
+ const rawType = tool.id.slice("anthropic.".length);
1168
+ if (rawType.length === 0) {
637
1169
  continue;
638
1170
  }
639
1171
 
640
1172
  normalized.push({
641
- type: providerType,
1173
+ type: resolveAnthropicProviderType(rawType),
642
1174
  name: tool.name,
643
1175
  ...toSnakeCaseRecord(tool.args),
644
1176
  });
645
1177
  }
646
1178
 
647
- return normalized.length > 0 ? normalized : undefined;
1179
+ if (normalized.length === 0) {
1180
+ return undefined;
1181
+ }
1182
+
1183
+ // Attach the cache breakpoint to the final tool entry so Anthropic caches
1184
+ // the entire tools block up to and including that definition. Earlier tool
1185
+ // entries are implicitly covered by the same breakpoint per Anthropic's
1186
+ // walk-backward cache lookup behaviour.
1187
+ if (toolsCacheControl) {
1188
+ const lastIndex = normalized.length - 1;
1189
+ normalized[lastIndex] = {
1190
+ ...normalized[lastIndex],
1191
+ cache_control: toolsCacheControl,
1192
+ };
1193
+ }
1194
+
1195
+ return normalized;
648
1196
  }
649
1197
 
650
1198
  function createAnthropicRequestHeaders(options: {
@@ -717,47 +1265,244 @@ function resolveAnthropicMaxTokens(
717
1265
  return requested;
718
1266
  }
719
1267
 
1268
/**
 * Map a unified reasoning option to an Anthropic `thinking.budget_tokens`
 * value.
 *
 * Anthropic's minimum accepted budget is 1024; higher tiers give Claude more
 * headroom to explore. `max` maps to the upper bound documented for the
 * Claude 4.x family (32k tokens of thinking — callers can override via
 * `budgetTokens` if they need more).
 *
 * @param option - Unified reasoning option; thinking is only enabled when
 *   `option.enabled === true`.
 * @returns `undefined` when thinking is off. Otherwise the explicit
 *   `budgetTokens` (floored to a whole number) when it is finite and at least
 *   1024, else the budget for `option.effort` (`low` 1024, `medium`/unset
 *   4096, `high` 16384, `max` 32768).
 */
function resolveAnthropicThinkingBudget(
  option: ProviderReasoningOption | undefined,
): number | undefined {
  if (!option || option.enabled !== true) {
    return undefined;
  }

  const explicitBudget = option.budgetTokens;
  // Non-finite numbers would pass a bare `>= 1024` check (Infinity) and then
  // serialize to `null` in the JSON body; fractional budgets are floored so
  // the API always receives a whole number of tokens.
  if (typeof explicitBudget === "number" && Number.isFinite(explicitBudget)) {
    const wholeBudget = Math.floor(explicitBudget);
    if (wholeBudget >= 1024) {
      return wholeBudget;
    }
  }

  // No usable explicit budget: fall back to the effort tier.
  switch (option.effort) {
    case "low":
      return 1024;
    case "high":
      return 16_384;
    case "max":
      return 32_768;
    case "medium":
    default:
      return 4096;
  }
}
1296
+
720
1297
/**
 * Build the JSON body for an Anthropic Messages API request from the unified
 * language-model options.
 *
 * Every unified option that is dropped or adjusted on the way is reported by
 * pushing an entry onto `warnings`; unsupported settings never throw here.
 *
 * @param modelId - Sent as `model` and used to resolve token limits.
 * @param providerName - Forwarded to `readProviderOptions` to select
 *   provider-specific overrides.
 * @param options - Unified call options (prompt, tools, sampling, reasoning, ...).
 * @param stream - When true the body carries `stream: true`.
 * @param warnings - Collector that receives one entry per dropped or adjusted
 *   setting.
 * @returns The request body. Provider options are merged last, so they
 *   override same-named fields built here.
 */
function buildAnthropicMessagesRequest(
  modelId: string,
  providerName: string,
  options: OpenAICompatibleLanguageOptions,
  stream: boolean,
  warnings: WarningCollector,
): AnthropicCompatibleRequest {
  const systemCacheControl = resolveAnthropicCacheControlBlock(
    options.cacheControl?.system,
  );
  const toolsCacheControl = resolveAnthropicCacheControlBlock(
    options.cacheControl?.tools,
  );

  const { system, messages } = toAnthropicMessages(options.prompt, systemCacheControl);
  const anthropicTools = toAnthropicTools(options.tools, toolsCacheControl);
  const thinkingBudget = resolveAnthropicThinkingBudget(options.reasoning);
  const thinkingEnabled = thinkingBudget !== undefined;

  // Anthropic doesn't support these unified options at all — emit warnings
  // so callers don't quietly pass values that have zero effect.
  if (options.presencePenalty !== undefined) {
    warnings.push({
      type: "unsupported-setting",
      provider: "anthropic",
      setting: "presencePenalty",
      details: "Anthropic Messages API has no equivalent and the value was dropped.",
    });
  }
  if (options.frequencyPenalty !== undefined) {
    warnings.push({
      type: "unsupported-setting",
      provider: "anthropic",
      setting: "frequencyPenalty",
      details: "Anthropic Messages API has no equivalent and the value was dropped.",
    });
  }
  if (options.seed !== undefined) {
    warnings.push({
      type: "unsupported-setting",
      provider: "anthropic",
      setting: "seed",
      details: "Anthropic Messages API does not support deterministic seeding.",
    });
  }
  if (options.topK !== undefined) {
    warnings.push({
      type: "unsupported-setting",
      provider: "anthropic",
      setting: "topK",
      details: "Anthropic Messages API does not expose top_k on this surface.",
    });
  }
  if (
    options.stopSequences && options.stopSequences.length > 4
  ) {
    warnings.push({
      type: "unsupported-setting",
      provider: "anthropic",
      setting: "stopSequences",
      details:
        `Anthropic accepts at most 4 stop sequences; ${options.stopSequences.length} were provided and the extras were truncated.`,
    });
  }
  if (thinkingEnabled && options.temperature !== undefined) {
    warnings.push({
      type: "unsupported-setting",
      provider: "anthropic",
      setting: "temperature",
      details:
        "Dropped because Anthropic rejects sampling params when extended thinking is enabled.",
    });
  }
  if (thinkingEnabled && options.topP !== undefined) {
    warnings.push({
      type: "unsupported-setting",
      provider: "anthropic",
      setting: "topP",
      details:
        "Dropped because Anthropic rejects sampling params when extended thinking is enabled.",
    });
  }
  if (options.responseFormat && options.responseFormat.type !== "text") {
    warnings.push({
      type: "unsupported-setting",
      provider: "anthropic",
      setting: "responseFormat",
      details:
        "Anthropic Messages API does not have a structured-output response_format equivalent. Use a tool with the schema as input_schema instead.",
    });
  }

  // Anthropic requires max_tokens > budget_tokens when thinking is enabled.
  // Growing max_tokens by the thinking budget preserves the caller's intended
  // output budget, and we clamp the sum at the model's advertised maximum so
  // the request never exceeds the API's hard cap.
  const baseMaxTokens = resolveAnthropicMaxTokens(modelId, options.maxOutputTokens);
  const maxTokens = thinkingEnabled
    ? Math.min(
      baseMaxTokens + (thinkingBudget ?? 0),
      getAnthropicModelCapabilities(modelId).maxOutputTokens,
    )
    : baseMaxTokens;

  // The clamp above can pull max_tokens down to (or below) the thinking
  // budget, e.g. when the budget alone is as large as the model cap. Shrink
  // the budget so max_tokens > budget_tokens still holds, but never below
  // Anthropic's 1024-token minimum. If the cap itself is <= 1024 the pair
  // cannot be made valid here and the API will report it.
  const minThinkingBudget = 1024;
  const budgetTokens = thinkingBudget !== undefined && thinkingBudget >= maxTokens
    ? Math.max(maxTokens - 1, minThinkingBudget)
    : thinkingBudget;
  if (thinkingBudget !== undefined && budgetTokens !== thinkingBudget) {
    warnings.push({
      type: "unsupported-setting",
      provider: "anthropic",
      setting: "reasoning",
      details:
        `Thinking budget was reduced from ${thinkingBudget} to ${budgetTokens} tokens because Anthropic requires max_tokens (${maxTokens}) to be greater than budget_tokens.`,
    });
  }

  const body: AnthropicCompatibleRequest = {
    model: modelId,
    messages,
    max_tokens: maxTokens,
    ...(stream ? { stream: true } : {}),
    ...(system ? { system } : {}),
    // Sampling params are mutually exclusive with thinking on Anthropic — the
    // API rejects the combo outright. Drop them when thinking is on; the
    // matching warnings were already recorded above.
    ...(!thinkingEnabled && options.temperature !== undefined
      ? { temperature: options.temperature }
      : {}),
    ...(!thinkingEnabled && options.topP !== undefined ? { top_p: options.topP } : {}),
    ...(options.stopSequences && options.stopSequences.length > 0
      ? { stop_sequences: options.stopSequences.slice(0, 4) }
      : {}),
    ...(anthropicTools ? { tools: anthropicTools } : {}),
    ...(options.toolChoice !== undefined
      ? { tool_choice: normalizeAnthropicToolChoice(options.toolChoice) }
      : {}),
    ...(thinkingEnabled ? { thinking: { type: "enabled", budget_tokens: budgetTokens } } : {}),
    ...(typeof options.userId === "string" && options.userId.length > 0
      ? { metadata: { user_id: options.userId } }
      : {}),
    ...(options.mcpServers && options.mcpServers.length > 0
      ? { mcp_servers: deepSnakeCase(options.mcpServers) as unknown[] }
      : {}),
    ...(options.anthropicContainer !== undefined ? { container: options.anthropicContainer } : {}),
  };

  // Provider-specific options win over anything assembled above.
  Object.assign(body, readProviderOptions(options.providerOptions, "anthropic", providerName));
  return body;
}
747
1435
 
1436
/**
 * Normalized form of an Anthropic `thinking` / `redacted_thinking` block.
 * A cleartext block carries `text` and `signature`; a redacted block carries
 * only `redactedData`.
 */
type AnthropicReasoningContent = {
  type: "reasoning";
  // Cleartext thinking trace and the signature returned alongside it.
  text?: string;
  signature?: string;
  // Opaque payload of a redacted thinking block.
  redactedData?: string;
};

/**
 * camelCase view of one Anthropic citation record. Only `type` is always
 * present; the remaining fields are the union across the citation kinds and
 * are set only when the wire record carried them.
 */
type AnthropicCitation = {
  type: string;
  citedText?: string;

  // Web result locations.
  url?: string;
  title?: string;

  // Document locations, by character, content block, or page.
  documentIndex?: number;
  documentTitle?: string;
  startCharIndex?: number;
  endCharIndex?: number;
  startBlockIndex?: number;
  endBlockIndex?: number;
  startPageNumber?: number;
  endPageNumber?: number;
};

/** Normalized text block, with citations attached when any were returned. */
type AnthropicTextContent = {
  type: "text";
  text: string;
  citations?: AnthropicCitation[];
};
1463
+
1464
/**
 * Best-effort camelCase normalization of a single Anthropic citation
 * record. Handles the union of fields across web_search_result_location,
 * web_fetch_result_location, char_location, page_location, and
 * content_block_location citation kinds — see
 * https://docs.claude.com/en/docs/build-with-claude/citations
 *
 * @param raw - One entry of a text block's `citations` array, untrusted.
 * @returns The normalized citation, or `undefined` when `raw` is not a record
 *   or has no non-empty string `type`. Fields with an unexpected runtime type
 *   are omitted rather than coerced.
 */
function normalizeAnthropicCitation(raw: unknown): AnthropicCitation | undefined {
  const r = readRecord(raw);
  if (!r) {
    return undefined;
  }
  if (typeof r.type !== "string" || r.type.length === 0) {
    return undefined;
  }

  // Each field is copied only when it has the expected primitive type; the
  // spread order fixes the key order of the resulting object.
  return {
    type: r.type,
    ...(typeof r.cited_text === "string" ? { citedText: r.cited_text } : {}),
    ...(typeof r.url === "string" ? { url: r.url } : {}),
    ...(typeof r.title === "string" ? { title: r.title } : {}),
    ...(typeof r.start_char_index === "number" ? { startCharIndex: r.start_char_index } : {}),
    ...(typeof r.end_char_index === "number" ? { endCharIndex: r.end_char_index } : {}),
    ...(typeof r.start_block_index === "number" ? { startBlockIndex: r.start_block_index } : {}),
    ...(typeof r.end_block_index === "number" ? { endBlockIndex: r.end_block_index } : {}),
    ...(typeof r.start_page_number === "number" ? { startPageNumber: r.start_page_number } : {}),
    ...(typeof r.end_page_number === "number" ? { endPageNumber: r.end_page_number } : {}),
    ...(typeof r.document_index === "number" ? { documentIndex: r.document_index } : {}),
    ...(typeof r.document_title === "string" ? { documentTitle: r.document_title } : {}),
  };
}
1490
+
748
1491
  function buildAnthropicGenerateResult(payload: unknown): {
749
1492
  content: Array<
750
- | { type: "text"; text: string }
1493
+ | AnthropicTextContent
1494
+ | AnthropicReasoningContent
751
1495
  | { type: "tool-call"; toolCallId: string; toolName: string; input: string }
752
1496
  | { type: "tool-result"; toolCallId: string; toolName: string; result: unknown }
753
1497
  >;
754
1498
  finishReason?: string | { unified: string; raw: string } | null;
755
- usage?: { inputTokens?: number; outputTokens?: number; totalTokens?: number };
1499
+ usage?: RuntimeUsage;
756
1500
  } {
757
1501
  const record = readRecord(payload);
758
1502
  const content = Array.isArray(record?.content) ? record.content : [];
759
1503
  const normalized: Array<
760
- | { type: "text"; text: string }
1504
+ | AnthropicTextContent
1505
+ | AnthropicReasoningContent
761
1506
  | { type: "tool-call"; toolCallId: string; toolName: string; input: string }
762
1507
  | { type: "tool-result"; toolCallId: string; toolName: string; result: unknown }
763
1508
  > = [];
@@ -767,7 +1512,42 @@ function buildAnthropicGenerateResult(payload: unknown): {
767
1512
  const blockType = typeof block?.type === "string" ? block.type : undefined;
768
1513
 
769
1514
  if (blockType === "text" && typeof block?.text === "string" && block.text.length > 0) {
770
- normalized.push({ type: "text", text: block.text });
1515
+ const citationsRaw = Array.isArray(block.citations) ? block.citations : undefined;
1516
+ const citations = citationsRaw
1517
+ ?.flatMap((c) => {
1518
+ const normalizedCitation = normalizeAnthropicCitation(c);
1519
+ return normalizedCitation ? [normalizedCitation] : [];
1520
+ });
1521
+ normalized.push({
1522
+ type: "text",
1523
+ text: block.text,
1524
+ ...(citations && citations.length > 0 ? { citations } : {}),
1525
+ });
1526
+ continue;
1527
+ }
1528
+
1529
+ // Thinking blocks carry the cleartext trace plus a signature that
1530
+ // Anthropic uses to verify on subsequent turns. Surfacing both lets
1531
+ // callers persist them as `reasoning` content parts and replay on
1532
+ // the next turn so Claude can continue from the same thinking.
1533
+ if (blockType === "thinking") {
1534
+ normalized.push({
1535
+ type: "reasoning",
1536
+ ...(typeof block?.thinking === "string" ? { text: block.thinking } : {}),
1537
+ ...(typeof block?.signature === "string" ? { signature: block.signature } : {}),
1538
+ });
1539
+ continue;
1540
+ }
1541
+
1542
+ // Redacted thinking blocks arrive when Claude's safety classifier
1543
+ // hides the trace. Pass the encrypted blob through opaquely so the
1544
+ // caller can replay it on the next turn (Anthropic still needs the
1545
+ // blob to verify continuity even though it can't read it).
1546
+ if (blockType === "redacted_thinking" && typeof block?.data === "string") {
1547
+ normalized.push({
1548
+ type: "reasoning",
1549
+ redactedData: block.data,
1550
+ });
771
1551
  continue;
772
1552
  }
773
1553
 
@@ -857,7 +1637,7 @@ async function* streamAnthropicCompatibleParts(
857
1637
  const toolCalls = new Map<number, AnthropicStreamToolCallState>();
858
1638
  const reasoningBlocks = new Map<number, AnthropicStreamReasoningState>();
859
1639
  let finishReason: string | { unified: string; raw: string } | null = null;
860
- let usage: { inputTokens?: number; outputTokens?: number; totalTokens?: number } | undefined;
1640
+ let usage: RuntimeUsage | undefined;
861
1641
 
862
1642
  for await (const chunk of stream) {
863
1643
  buffer += decoder.decode(chunk, { stream: true });
@@ -909,6 +1689,20 @@ async function* streamAnthropicCompatibleParts(
909
1689
  continue;
910
1690
  }
911
1691
 
1692
+ // Redacted thinking blocks arrive as opaque encrypted payloads when
1693
+ // Claude's safety classifier flags the reasoning trace. Surface them
1694
+ // as a zero-length reasoning block so callers know thinking happened
1695
+ // without leaking the (legitimately hidden) contents.
1696
+ if (blockType === "redacted_thinking") {
1697
+ const reasoningId = `thinking-${index}`;
1698
+ reasoningBlocks.set(index, { id: reasoningId });
1699
+ yield {
1700
+ type: "reasoning-start",
1701
+ id: reasoningId,
1702
+ };
1703
+ continue;
1704
+ }
1705
+
912
1706
  if (
913
1707
  (blockType === "tool_use" || blockType === "server_tool_use") &&
914
1708
  typeof contentBlock?.id === "string" &&
@@ -1094,9 +1888,7 @@ function normalizeOpenAIFinishReason(
1094
1888
  return raw;
1095
1889
  }
1096
1890
 
1097
- function extractOpenAIUsage(payload: unknown):
1098
- | { inputTokens?: number; outputTokens?: number; totalTokens?: number }
1099
- | undefined {
1891
+ function extractOpenAIUsage(payload: unknown): RuntimeUsage | undefined {
1100
1892
  const record = readRecord(payload);
1101
1893
  const usage = readRecord(record?.usage);
1102
1894
  if (!usage) {
@@ -1106,11 +1898,14 @@ function extractOpenAIUsage(payload: unknown):
1106
1898
  const inputTokens = usage.prompt_tokens;
1107
1899
  const outputTokens = usage.completion_tokens;
1108
1900
  const totalTokens = usage.total_tokens;
1901
+ const promptTokensDetails = readRecord(usage.prompt_tokens_details);
1902
+ const cachedTokens = promptTokensDetails?.cached_tokens;
1109
1903
 
1110
1904
  return {
1111
1905
  inputTokens: typeof inputTokens === "number" ? inputTokens : undefined,
1112
1906
  outputTokens: typeof outputTokens === "number" ? outputTokens : undefined,
1113
1907
  totalTokens: typeof totalTokens === "number" ? totalTokens : undefined,
1908
+ ...(typeof cachedTokens === "number" ? { cacheReadInputTokens: cachedTokens } : {}),
1114
1909
  };
1115
1910
  }
1116
1911
 
@@ -1165,19 +1960,95 @@ function extractOpenAIToolCalls(message: Record<string, unknown>): Array<{
1165
1960
  return normalized;
1166
1961
  }
1167
1962
 
1963
/**
 * OpenAI reasoning models (o1 / o3 / o4 family) use the completion path but
 * have different constraints than chat models: sampling params are rejected,
 * and they accept a `reasoning_effort` field. They are detected by model id
 * prefix so callers don't have to configure it per runtime.
 *
 * @param modelId - Model id as passed to the request.
 * @returns `true` when the id is exactly `o1`, `o3` or `o4`, or starts with
 *   one of those followed by `-`.
 */
function isOpenAIReasoningModel(modelId: string): boolean {
  const family = modelId.slice(0, 2);
  if (family !== "o1" && family !== "o3" && family !== "o4") {
    return false;
  }
  // The family token must be the whole id or be followed by a dash, so ids
  // that merely share the first two characters are not matched.
  return modelId.length === 2 || modelId.charAt(2) === "-";
}
1972
+
1973
/**
 * Map the unified reasoning effort to OpenAI's `reasoning_effort` enum.
 * OpenAI doesn't accept "max" — it is collapsed to "high".
 *
 * @param option - Unified reasoning option; ignored unless
 *   `option.enabled === true`.
 * @returns `undefined` when reasoning is not enabled, otherwise `"low"`,
 *   `"high"` (for `high` and `max`), or `"medium"` for anything else,
 *   including an unset effort.
 */
function resolveOpenAIReasoningEffort(
  option: ProviderReasoningOption | undefined,
): "low" | "medium" | "high" | undefined {
  if (!option) {
    return undefined;
  }
  if (option.enabled !== true) {
    return undefined;
  }

  const requested = option.effort;
  if (requested === "low") {
    return "low";
  }
  if (requested === "high" || requested === "max") {
    return "high";
  }
  // "medium", unset, and unrecognized values all land on the middle tier.
  return "medium";
}
1994
+
1168
1995
  function buildOpenAIChatRequest(
1169
1996
  modelId: string,
1170
1997
  providerName: string,
1171
1998
  options: OpenAICompatibleLanguageOptions,
1172
1999
  stream: boolean,
2000
+ warnings: WarningCollector,
1173
2001
  ): OpenAICompatibleChatRequest {
2002
+ const isReasoningModel = isOpenAIReasoningModel(modelId);
2003
+ const reasoningEffort = resolveOpenAIReasoningEffort(options.reasoning);
2004
+ const reasoningEnabled = isReasoningModel || reasoningEffort !== undefined;
2005
+
2006
+ // OpenAI Chat Completions has no top_k surface (it's exposed only on the
2007
+ // Responses API for some reasoning models). Quietly accepting it would
2008
+ // mislead callers into thinking it took effect.
2009
+ if (options.topK !== undefined) {
2010
+ warnings.push({
2011
+ type: "unsupported-setting",
2012
+ provider: "openai",
2013
+ setting: "topK",
2014
+ details: "OpenAI Chat Completions does not expose top_k; the value was dropped.",
2015
+ });
2016
+ }
2017
+
2018
+ // Reasoning models (o1 / o3 / o4) reject sampling params outright. Emit
2019
+ // warnings at build time so callers see *why* the value didn't apply
2020
+ // rather than a 400 from the API.
2021
+ if (reasoningEnabled) {
2022
+ const dropped: Array<[keyof typeof options, string]> = [
2023
+ ["temperature", "temperature"],
2024
+ ["topP", "top_p"],
2025
+ ["presencePenalty", "presence_penalty"],
2026
+ ["frequencyPenalty", "frequency_penalty"],
2027
+ ];
2028
+ for (const [key, openaiName] of dropped) {
2029
+ if (options[key] !== undefined) {
2030
+ warnings.push({
2031
+ type: "unsupported-setting",
2032
+ provider: "openai",
2033
+ setting: key,
2034
+ details:
2035
+ `Dropped because OpenAI reasoning models reject ${openaiName}. Reasoning was active for this request.`,
2036
+ });
2037
+ }
2038
+ }
2039
+ }
2040
+
1174
2041
  const body: OpenAICompatibleChatRequest = {
1175
2042
  model: modelId,
1176
2043
  messages: toOpenAICompatibleMessages(options.prompt),
1177
2044
  ...(stream ? { stream: true, stream_options: { include_usage: true } } : {}),
1178
2045
  ...(options.maxOutputTokens !== undefined ? { max_tokens: options.maxOutputTokens } : {}),
1179
- ...(options.temperature !== undefined ? { temperature: options.temperature } : {}),
1180
- ...(options.topP !== undefined ? { top_p: options.topP } : {}),
2046
+ // OpenAI reasoning models reject temperature / top_p / frequency / presence.
2047
+ // Drop them here (a warning per dropped param was recorded above) rather than letting the API bounce the request.
2048
+ ...(!reasoningEnabled && options.temperature !== undefined
2049
+ ? { temperature: options.temperature }
2050
+ : {}),
2051
+ ...(!reasoningEnabled && options.topP !== undefined ? { top_p: options.topP } : {}),
1181
2052
  ...(options.stopSequences && options.stopSequences.length > 0
1182
2053
  ? { stop: options.stopSequences }
1183
2054
  : {}),
@@ -1186,10 +2057,37 @@ function buildOpenAIChatRequest(
1186
2057
  : {}),
1187
2058
  ...(options.toolChoice !== undefined ? { tool_choice: options.toolChoice } : {}),
1188
2059
  ...(options.seed !== undefined ? { seed: options.seed } : {}),
1189
- ...(options.presencePenalty !== undefined ? { presence_penalty: options.presencePenalty } : {}),
1190
- ...(options.frequencyPenalty !== undefined
2060
+ ...(!reasoningEnabled && options.presencePenalty !== undefined
2061
+ ? { presence_penalty: options.presencePenalty }
2062
+ : {}),
2063
+ ...(!reasoningEnabled && options.frequencyPenalty !== undefined
1191
2064
  ? { frequency_penalty: options.frequencyPenalty }
1192
2065
  : {}),
2066
+ ...(reasoningEffort !== undefined ? { reasoning_effort: reasoningEffort } : {}),
2067
+ ...(typeof options.userId === "string" && options.userId.length > 0
2068
+ ? { user: options.userId }
2069
+ : {}),
2070
+ ...(options.serviceTier !== undefined ? { service_tier: options.serviceTier } : {}),
2071
+ ...(options.parallelToolCalls !== undefined
2072
+ ? { parallel_tool_calls: options.parallelToolCalls }
2073
+ : {}),
2074
+ ...(options.responseFormat && options.responseFormat.type !== "text"
2075
+ ? {
2076
+ response_format: options.responseFormat.type === "json" ? { type: "json_object" } : {
2077
+ type: "json_schema",
2078
+ json_schema: {
2079
+ name: options.responseFormat.name,
2080
+ ...(typeof options.responseFormat.description === "string"
2081
+ ? { description: options.responseFormat.description }
2082
+ : {}),
2083
+ schema: unwrapToolInputSchema(options.responseFormat.schema),
2084
+ ...(options.responseFormat.strict !== undefined
2085
+ ? { strict: options.responseFormat.strict }
2086
+ : {}),
2087
+ },
2088
+ },
2089
+ }
2090
+ : {}),
1193
2091
  };
1194
2092
 
1195
2093
  Object.assign(body, readProviderOptions(options.providerOptions, "openai", providerName));
@@ -1216,9 +2114,7 @@ function normalizeGoogleFinishReason(
1216
2114
  }
1217
2115
  }
1218
2116
 
1219
- function extractGoogleUsage(payload: unknown):
1220
- | { inputTokens?: number; outputTokens?: number; totalTokens?: number }
1221
- | undefined {
2117
+ function extractGoogleUsage(payload: unknown): RuntimeUsage | undefined {
1222
2118
  const record = readRecord(payload);
1223
2119
  const usage = readRecord(record?.usageMetadata);
1224
2120
  if (!usage) {
@@ -1228,11 +2124,15 @@ function extractGoogleUsage(payload: unknown):
1228
2124
  const inputTokens = usage.promptTokenCount;
1229
2125
  const outputTokens = usage.candidatesTokenCount;
1230
2126
  const totalTokens = usage.totalTokenCount;
2127
+ const cachedContentTokenCount = usage.cachedContentTokenCount;
1231
2128
 
1232
2129
  return {
1233
2130
  inputTokens: typeof inputTokens === "number" ? inputTokens : undefined,
1234
2131
  outputTokens: typeof outputTokens === "number" ? outputTokens : undefined,
1235
2132
  totalTokens: typeof totalTokens === "number" ? totalTokens : undefined,
2133
+ ...(typeof cachedContentTokenCount === "number"
2134
+ ? { cacheReadInputTokens: cachedContentTokenCount }
2135
+ : {}),
1236
2136
  };
1237
2137
  }
1238
2138
 
@@ -1258,20 +2158,29 @@ function toGoogleContents(
1258
2158
  parts: [{ text: readTextParts(message.content) }],
1259
2159
  });
1260
2160
  break;
1261
- case "assistant":
1262
- contents.push({
1263
- role: "model",
1264
- parts: message.content.map((part) =>
1265
- part.type === "text" ? { text: part.text } : {
1266
- functionCall: {
1267
- id: part.toolCallId,
1268
- name: part.toolName,
1269
- args: part.input,
1270
- },
1271
- }
1272
- ),
1273
- });
2161
+ case "assistant": {
2162
+ // Anthropic-only `reasoning` parts have no Gemini equivalent
2163
+ // and are dropped on replay.
2164
+ const parts: Array<Record<string, unknown>> = [];
2165
+ for (const part of message.content) {
2166
+ if (part.type === "text") {
2167
+ parts.push({ text: part.text });
2168
+ continue;
2169
+ }
2170
+ if (part.type === "reasoning") {
2171
+ continue;
2172
+ }
2173
+ parts.push({
2174
+ functionCall: {
2175
+ id: part.toolCallId,
2176
+ name: part.toolName,
2177
+ args: part.input,
2178
+ },
2179
+ });
2180
+ }
2181
+ contents.push({ role: "model", parts });
1274
2182
  break;
2183
+ }
1275
2184
  case "tool":
1276
2185
  contents.push({
1277
2186
  role: "user",
@@ -1299,22 +2208,45 @@ function toGoogleContents(
1299
2208
 
1300
2209
/**
 * Convert unified tool definitions into the Gemini `tools[]` array.
 *
 * Function tools are gathered into a single `{ functionDeclarations }` entry,
 * emitted first. Gemini provider tools (code_execution, google_search,
 * google_search_retrieval) each get their own entry: one key — the camelCase
 * form of the name after the `google.` prefix — whose value is the
 * caller-provided `tool.args`, or `{}` when absent. Provider tools whose id
 * does not start with `google.`, or has nothing after the prefix, are skipped.
 *
 * @param tools - Unified tool definitions, or `undefined`.
 * @returns The Gemini tool entries, or `undefined` when there is nothing to send.
 */
function toGoogleTools(
  tools: RuntimeToolDefinition[] | undefined,
): Array<Record<string, unknown>> | undefined {
  if (!tools) {
    return undefined;
  }

  const providerPrefix = "google.";
  const declarations: Array<Record<string, unknown>> = [];
  const builtIns: Array<Record<string, unknown>> = [];

  for (const tool of tools) {
    if (tool.type === "function") {
      const declaration: Record<string, unknown> = { name: tool.name };
      if (typeof tool.description === "string") {
        declaration.description = tool.description;
      }
      declaration.parameters = unwrapToolInputSchema(tool.inputSchema);
      declarations.push(declaration);
    } else if (
      tool.id.startsWith(providerPrefix) && tool.id.length > providerPrefix.length
    ) {
      const snakeName = tool.id.slice(providerPrefix.length);
      // snake_case -> camelCase, e.g. google_search -> googleSearch.
      const camelName = snakeName.replace(/_([a-z])/g, (_, letter) => letter.toUpperCase());
      builtIns.push({ [camelName]: tool.args ?? {} });
    }
  }

  const entries: Array<Record<string, unknown>> = declarations.length > 0
    ? [{ functionDeclarations: declarations }, ...builtIns]
    : builtIns;
  return entries.length > 0 ? entries : undefined;
}
1319
2251
 
1320
2252
  function unwrapToolInputSchema(inputSchema: unknown): unknown {
@@ -1346,7 +2278,11 @@ function normalizeGoogleToolChoice(toolChoice: unknown):
1346
2278
  }
1347
2279
 
1348
2280
  const record = readRecord(toolChoice);
1349
- if (record?.type === "tool" && typeof record.name === "string") {
2281
+ if (!record) return undefined;
2282
+
2283
+ // Single-tool restriction: { type: "tool", name } — pin to one
2284
+ // function via mode: ANY + allowedFunctionNames: [name].
2285
+ if (record.type === "tool" && typeof record.name === "string") {
1350
2286
  return {
1351
2287
  functionCallingConfig: {
1352
2288
  mode: "ANY",
@@ -1355,12 +2291,74 @@ function normalizeGoogleToolChoice(toolChoice: unknown):
1355
2291
  };
1356
2292
  }
1357
2293
 
2294
+ // Multi-tool restriction: { type: "tools", names: string[] } — pin
2295
+ // to a subset via mode: ANY + the full allowedFunctionNames array.
2296
+ if (record.type === "tools" && Array.isArray(record.names)) {
2297
+ const names = record.names.filter((n): n is string => typeof n === "string");
2298
+ if (names.length > 0) {
2299
+ return {
2300
+ functionCallingConfig: {
2301
+ mode: "ANY",
2302
+ allowedFunctionNames: names,
2303
+ },
2304
+ };
2305
+ }
2306
+ }
2307
+
2308
+ // Explicit mode forms: { type: "auto" | "none" | "any" }.
2309
+ if (record.type === "auto") {
2310
+ return { functionCallingConfig: { mode: "AUTO" } };
2311
+ }
2312
+ if (record.type === "none") {
2313
+ return { functionCallingConfig: { mode: "NONE" } };
2314
+ }
2315
+ if (record.type === "any" || record.type === "required") {
2316
+ return { functionCallingConfig: { mode: "ANY" } };
2317
+ }
2318
+
1358
2319
  return undefined;
1359
2320
  }
1360
2321
 
2322
/**
 * Map the unified reasoning option to Gemini's thinkingConfig. Gemini 2.5+
 * accepts `includeThoughts: true` to stream back `thought` parts, and
 * `thinkingBudget: N` to cap the thinking token count. The effort levels
 * here follow Google's own guidance (low ~= 512, medium ~= 2048,
 * high ~= 8192, max = -1 means "dynamic/no cap").
 *
 * @param option - Unified reasoning option; ignored unless
 *   `option.enabled === true`.
 * @returns `undefined` when reasoning is not enabled. Otherwise a config with
 *   `includeThoughts: true` and a `thinkingBudget` taken from a finite
 *   `budgetTokens` (truncated to a whole number) or, failing that, from
 *   `option.effort`.
 */
function resolveGoogleThinkingConfig(
  option: ProviderReasoningOption | undefined,
): Record<string, unknown> | undefined {
  if (!option || option.enabled !== true) {
    return undefined;
  }

  const config: Record<string, unknown> = { includeThoughts: true };

  const explicitBudget = option.budgetTokens;
  // NaN / Infinity would serialize to `null` in the JSON request, so only a
  // finite value counts as an explicit budget. 0 and -1 are passed through
  // untouched; fractional values are truncated to a whole token count.
  if (typeof explicitBudget === "number" && Number.isFinite(explicitBudget)) {
    config.thinkingBudget = Math.trunc(explicitBudget);
    return config;
  }

  switch (option.effort) {
    case "low":
      config.thinkingBudget = 512;
      break;
    case "high":
      config.thinkingBudget = 8192;
      break;
    case "max":
      config.thinkingBudget = -1;
      break;
    case "medium":
    default:
      config.thinkingBudget = 2048;
      break;
  }
  return config;
}
2357
+
1361
2358
  function buildGoogleGenerationConfig(
1362
2359
  options: OpenAICompatibleLanguageOptions,
1363
2360
  ): Record<string, unknown> | undefined {
2361
+ const thinkingConfig = resolveGoogleThinkingConfig(options.reasoning);
1364
2362
  const config: Record<string, unknown> = {
1365
2363
  ...(options.maxOutputTokens !== undefined ? { maxOutputTokens: options.maxOutputTokens } : {}),
1366
2364
  ...(options.temperature !== undefined ? { temperature: options.temperature } : {}),
@@ -1370,6 +2368,7 @@ function buildGoogleGenerationConfig(
1370
2368
  ? { stopSequences: options.stopSequences }
1371
2369
  : {}),
1372
2370
  ...(options.seed !== undefined ? { seed: options.seed } : {}),
2371
+ ...(thinkingConfig ? { thinkingConfig } : {}),
1373
2372
  };
1374
2373
 
1375
2374
  return Object.keys(config).length > 0 ? config : undefined;
@@ -1378,8 +2377,47 @@ function buildGoogleGenerationConfig(
1378
2377
  function buildGoogleGenerateContentRequest(
1379
2378
  providerName: string,
1380
2379
  options: OpenAICompatibleLanguageOptions,
2380
+ warnings: WarningCollector,
1381
2381
  ): GoogleCompatibleRequest {
2382
+ // Google generate-content surface doesn't accept presence/frequency
2383
+ // penalties on most current models. Emit warnings and let the request
2384
+ // through without them.
2385
+ if (options.presencePenalty !== undefined) {
2386
+ warnings.push({
2387
+ type: "unsupported-setting",
2388
+ provider: "google",
2389
+ setting: "presencePenalty",
2390
+ details: "Gemini generateContent does not accept presencePenalty; the value was dropped.",
2391
+ });
2392
+ }
2393
+ if (options.frequencyPenalty !== undefined) {
2394
+ warnings.push({
2395
+ type: "unsupported-setting",
2396
+ provider: "google",
2397
+ setting: "frequencyPenalty",
2398
+ details: "Gemini generateContent does not accept frequencyPenalty; the value was dropped.",
2399
+ });
2400
+ }
2401
+ if (options.responseFormat && options.responseFormat.type !== "text") {
2402
+ warnings.push({
2403
+ type: "unsupported-setting",
2404
+ provider: "google",
2405
+ setting: "responseFormat",
2406
+ details:
2407
+ "Gemini uses generationConfig.responseMimeType + responseSchema for structured outputs, which is a separate surface and not yet wired through this option.",
2408
+ });
2409
+ }
2410
+
1382
2411
  const { systemInstruction, contents } = toGoogleContents(options.prompt);
2412
+ const generationConfig = buildGoogleGenerationConfig(options);
2413
+ // requestLabels wins over userId-derived labels: when callers explicitly
2414
+ // provide a label map, that's the source of truth. Otherwise fall back
2415
+ // to {user_id} derived from the unified userId option.
2416
+ const labels = options.requestLabels && Object.keys(options.requestLabels).length > 0
2417
+ ? options.requestLabels
2418
+ : typeof options.userId === "string" && options.userId.length > 0
2419
+ ? { user_id: options.userId }
2420
+ : undefined;
1383
2421
  const body: GoogleCompatibleRequest = {
1384
2422
  contents,
1385
2423
  ...(systemInstruction ? { systemInstruction } : {}),
@@ -1387,8 +2425,13 @@ function buildGoogleGenerateContentRequest(
1387
2425
  ...(normalizeGoogleToolChoice(options.toolChoice)
1388
2426
  ? { toolConfig: normalizeGoogleToolChoice(options.toolChoice) }
1389
2427
  : {}),
1390
- ...(buildGoogleGenerationConfig(options)
1391
- ? { generationConfig: buildGoogleGenerationConfig(options) }
2428
+ ...(generationConfig ? { generationConfig } : {}),
2429
+ ...(labels ? { labels } : {}),
2430
+ ...(typeof options.googleCachedContent === "string" && options.googleCachedContent.length > 0
2431
+ ? { cachedContent: options.googleCachedContent }
2432
+ : {}),
2433
+ ...(options.googleSafetySettings && options.googleSafetySettings.length > 0
2434
+ ? { safetySettings: options.googleSafetySettings }
1392
2435
  : {}),
1393
2436
  };
1394
2437
 
@@ -1426,7 +2469,8 @@ function buildGoogleGenerateResult(payload: unknown): {
1426
2469
  | { type: "tool-call"; toolCallId: string; toolName: string; input: string }
1427
2470
  >;
1428
2471
  finishReason?: string | { unified: string; raw: string } | null;
1429
- usage?: { inputTokens?: number; outputTokens?: number; totalTokens?: number };
2472
+ usage?: RuntimeUsage;
2473
+ groundingMetadata?: Record<string, unknown>;
1430
2474
  } {
1431
2475
  const parts = extractGoogleCandidateParts(payload);
1432
2476
  const content: Array<
@@ -1451,10 +2495,19 @@ function buildGoogleGenerateResult(payload: unknown): {
1451
2495
  }
1452
2496
  }
1453
2497
 
2498
+ // Gemini grounding (google_search / google_search_retrieval) returns
2499
+ // a per-candidate groundingMetadata object with web search queries,
2500
+ // grounding chunks, and citation indices into the response text.
2501
+ // Pass it through opaquely so callers can render footnotes / source
2502
+ // chips / "Search results" UI without parsing the wire shape.
2503
+ const candidate = extractFirstGoogleCandidate(payload);
2504
+ const groundingMetadata = readRecord(candidate?.groundingMetadata);
2505
+
1454
2506
  return {
1455
2507
  content,
1456
- finishReason: normalizeGoogleFinishReason(extractFirstGoogleCandidate(payload)?.finishReason),
2508
+ finishReason: normalizeGoogleFinishReason(candidate?.finishReason),
1457
2509
  usage: extractGoogleUsage(payload),
2510
+ ...(groundingMetadata ? { groundingMetadata } : {}),
1458
2511
  };
1459
2512
  }
1460
2513
 
@@ -1467,7 +2520,7 @@ async function* streamGoogleCompatibleParts(
1467
2520
  let reasoningId: string | null = null;
1468
2521
  let reasoningIndex = 0;
1469
2522
  let finishReason: string | { unified: string; raw: string } | null = null;
1470
- let usage: { inputTokens?: number; outputTokens?: number; totalTokens?: number } | undefined;
2523
+ let usage: RuntimeUsage | undefined;
1471
2524
 
1472
2525
  for await (const chunk of stream) {
1473
2526
  buffer += decoder.decode(chunk, { stream: true });
@@ -1599,7 +2652,7 @@ function buildOpenAIGenerateResult(payload: unknown): {
1599
2652
  }
1600
2653
  >;
1601
2654
  finishReason?: string | { unified: string; raw: string } | null;
1602
- usage?: { inputTokens?: number; outputTokens?: number; totalTokens?: number };
2655
+ usage?: RuntimeUsage;
1603
2656
  } {
1604
2657
  const choice = extractFirstChoice(payload);
1605
2658
  const message = readRecord(choice?.message);
@@ -1630,7 +2683,7 @@ async function* streamOpenAICompatibleParts(
1630
2683
  let reasoningId: string | null = null;
1631
2684
  let reasoningIndex = 0;
1632
2685
  let finishReason: string | { unified: string; raw: string } | null = null;
1633
- let usage: { inputTokens?: number; outputTokens?: number; totalTokens?: number } | undefined;
2686
+ let usage: RuntimeUsage | undefined;
1634
2687
 
1635
2688
  for await (const chunk of stream) {
1636
2689
  buffer += decoder.decode(chunk, { stream: true });
@@ -1788,11 +2841,19 @@ export function createOpenAIModelRuntime(
1788
2841
  doGenerate(optionsForRuntime: unknown) {
1789
2842
  const options = optionsForRuntime as OpenAICompatibleLanguageOptions;
1790
2843
  const url = getOpenAIChatCompletionsUrl(config.baseURL);
1791
- const body = buildOpenAIChatRequest(modelId, config.name ?? "openai", options, false);
2844
+ const warnings = createWarningCollector();
2845
+ const body = buildOpenAIChatRequest(
2846
+ modelId,
2847
+ config.name ?? "openai",
2848
+ options,
2849
+ false,
2850
+ warnings,
2851
+ );
1792
2852
  return requestJson({
1793
2853
  url,
1794
2854
  fetchImpl,
1795
2855
  providerLabel: config.name ?? "openai",
2856
+ providerKind: "openai",
1796
2857
  init: {
1797
2858
  method: "POST",
1798
2859
  headers: createRequestHeaders({
@@ -1803,16 +2864,30 @@ export function createOpenAIModelRuntime(
1803
2864
  body: JSON.stringify(body),
1804
2865
  signal: options.abortSignal,
1805
2866
  },
1806
- }).then(buildOpenAIGenerateResult);
2867
+ }).then((payload) => {
2868
+ const drained = warnings.drain();
2869
+ return {
2870
+ ...buildOpenAIGenerateResult(payload),
2871
+ ...(drained.length > 0 ? { warnings: drained } : {}),
2872
+ };
2873
+ });
1807
2874
  },
1808
2875
  doStream(optionsForRuntime: unknown) {
1809
2876
  const options = optionsForRuntime as OpenAICompatibleLanguageOptions;
1810
2877
  const url = getOpenAIChatCompletionsUrl(config.baseURL);
1811
- const body = buildOpenAIChatRequest(modelId, config.name ?? "openai", options, true);
2878
+ const warnings = createWarningCollector();
2879
+ const body = buildOpenAIChatRequest(
2880
+ modelId,
2881
+ config.name ?? "openai",
2882
+ options,
2883
+ true,
2884
+ warnings,
2885
+ );
1812
2886
  return requestStream({
1813
2887
  url,
1814
2888
  fetchImpl,
1815
2889
  providerLabel: config.name ?? "openai",
2890
+ providerKind: "openai",
1816
2891
  init: {
1817
2892
  method: "POST",
1818
2893
  headers: createRequestHeaders({
@@ -1823,9 +2898,664 @@ export function createOpenAIModelRuntime(
1823
2898
  body: JSON.stringify(body),
1824
2899
  signal: options.abortSignal,
1825
2900
  },
1826
- }).then((responseStream) => ({
1827
- stream: ReadableStream.from(streamOpenAICompatibleParts(responseStream)),
1828
- }));
2901
+ }).then((responseStream) => {
2902
+ const drained = warnings.drain();
2903
+ return {
2904
+ stream: ReadableStream.from(streamOpenAICompatibleParts(responseStream)),
2905
+ ...(drained.length > 0 ? { warnings: drained } : {}),
2906
+ };
2907
+ });
2908
+ },
2909
+ };
2910
+ }
2911
+
2912
+ // =============================================================================
2913
+ // OpenAI Responses API runtime (#1077, deferred from #1052 C4)
2914
+ // =============================================================================
2915
+ //
2916
+ // The Responses API (/v1/responses) is a different surface than Chat
2917
+ // Completions. Same provider, different request shape, different streaming
2918
+ // event grammar, different response shape, and different reasoning-summary
2919
+ // surface. This runtime is parallel to createOpenAIModelRuntime so each
2920
+ // path stays focused on one wire format.
2921
+ //
2922
+ // Why parallel runtimes instead of a flag? See the rationale in #1077.
2923
+ //
2924
+ // docs: https://platform.openai.com/docs/api-reference/responses
2925
+
2926
/**
 * One entry of the Responses API `input` array. Left as a loose record
 * because messages, function calls, function-call outputs, and reasoning
 * items all share the array but have different shapes.
 */
type OpenAIResponsesInputItem = Record<string, unknown>;

/**
 * JSON body posted to the Responses endpoint. Only the fields this runtime
 * sets itself are spelled out; the index signature lets provider options
 * merge extra wire fields into the body.
 */
type OpenAIResponsesRequest = {
  /** Model identifier forwarded verbatim. */
  model: string;
  /** Conversation items (messages, tool calls, tool outputs, reasoning). */
  input: OpenAIResponsesInputItem[];
  /** Joined system prompt text. */
  instructions?: string;
  /** Set to `true` for streaming requests. */
  stream?: boolean;
  max_output_tokens?: number;
  temperature?: number;
  top_p?: number;
  /** Function tools and provider-native tools share one array. */
  tools?: Array<Record<string, unknown>>;
  tool_choice?: unknown;
  /** Structured reasoning controls (effort + summary verbosity). */
  reasoning?: { effort?: string; summary?: string };
  metadata?: Record<string, string>;
  user?: string;
  service_tier?: string;
  parallel_tool_calls?: boolean;
  /** Structured-output configuration (replaces `response_format`). */
  text?: { format: Record<string, unknown> };
  [key: string]: unknown;
};
2946
+
2947
/**
 * Convert the unified RuntimePromptMessage[] into the pieces of a Responses
 * API request: a top-level `instructions` string plus the `input` array.
 *
 * Mapping, by role:
 *   - system: non-empty contents are collected and joined with a blank line
 *     into `instructions`; they never appear in `input`.
 *   - user: one `user` message holding a single `input_text` part whose text
 *     comes from `readTextParts`.
 *   - assistant: consecutive text parts are grouped into an `assistant`
 *     message of `output_text` parts. Tool calls and reasoning parts are
 *     standalone top-level items, so any pending text is flushed before
 *     they are pushed to keep the original ordering.
 *   - tool: each result becomes a standalone `function_call_output` item.
 *
 * Reasoning parts map `text` to a `summary_text` entry and `signature` to
 * `encrypted_content`. A reasoning part that carries neither (for example
 * one that only has `redactedData`) is skipped: it would otherwise be sent
 * as an empty reasoning item with nothing for the API to replay.
 *
 * @param prompt Unified prompt messages in conversation order.
 * @returns `input` items, plus `instructions` when any system text exists.
 */
function toOpenAIResponsesInput(
  prompt: RuntimePromptMessage[],
): { instructions?: string; input: OpenAIResponsesInputItem[] } {
  const instructionsParts: string[] = [];
  const input: OpenAIResponsesInputItem[] = [];

  for (const message of prompt) {
    switch (message.role) {
      case "system":
        if (message.content.length > 0) {
          instructionsParts.push(message.content);
        }
        break;
      case "user":
        input.push({
          role: "user",
          content: [{ type: "input_text", text: readTextParts(message.content) }],
        });
        break;
      case "assistant": {
        let pendingText: Array<Record<string, unknown>> = [];
        // Emits the accumulated output_text parts as one assistant message.
        const flushPendingText = (): void => {
          if (pendingText.length === 0) return;
          input.push({ role: "assistant", content: pendingText });
          pendingText = [];
        };

        for (const part of message.content) {
          if (part.type === "text") {
            pendingText.push({ type: "output_text", text: part.text });
            continue;
          }

          // Everything below is a top-level item, so text seen so far must
          // be emitted first.
          flushPendingText();

          if (part.type === "reasoning") {
            const summary: Array<Record<string, unknown>> = [];
            if (typeof part.text === "string" && part.text.length > 0) {
              summary.push({ type: "summary_text", text: part.text });
            }
            // Nothing to replay: no summary text and no encrypted payload.
            if (summary.length === 0 && typeof part.signature !== "string") {
              continue;
            }
            input.push({
              type: "reasoning",
              ...(typeof part.signature === "string" ? { encrypted_content: part.signature } : {}),
              summary,
            });
            continue;
          }

          // Remaining variant is a tool call.
          input.push({
            type: "function_call",
            call_id: part.toolCallId,
            name: part.toolName,
            arguments: stringifyJsonValue(part.input),
          });
        }

        flushPendingText();
        break;
      }
      case "tool":
        for (const part of message.content) {
          input.push({
            type: "function_call_output",
            call_id: part.toolCallId,
            output: stringifyJsonValue(part.output.value),
          });
        }
        break;
    }
  }

  return {
    ...(instructionsParts.length > 0 ? { instructions: instructionsParts.join("\n\n") } : {}),
    input,
  };
}
3039
+
3040
/**
 * Build the Responses API `tools` array from unified tool definitions.
 *
 * Function tools are emitted flat, as `{ type: "function", name,
 * description?, parameters }`, rather than nested under a `function` key
 * as in Chat Completions. Provider tools are only forwarded when their id
 * has the `openai.` prefix and a non-empty suffix; the suffix becomes the
 * entry's `type` and the tool args are spread in after conversion by
 * `toSnakeCaseRecord`. Other provider tools are ignored.
 *
 * @param tools Unified tool definitions, if any.
 * @returns The wire entries, or `undefined` when there is nothing to send.
 */
function toOpenAIResponsesTools(
  tools: RuntimeToolDefinition[] | undefined,
): Array<Record<string, unknown>> | undefined {
  if (!tools) return undefined;

  const providerPrefix = "openai.";
  const entries = tools.flatMap((tool): Array<Record<string, unknown>> => {
    if (tool.type === "function") {
      return [{
        type: "function",
        name: tool.name,
        ...(typeof tool.description === "string" ? { description: tool.description } : {}),
        parameters: unwrapToolInputSchema(tool.inputSchema),
      }];
    }

    // Provider-defined tool: keep only the ones addressed to OpenAI.
    const providerType = tool.id.startsWith(providerPrefix)
      ? tool.id.slice(providerPrefix.length)
      : "";
    if (providerType.length === 0) return [];
    return [{ type: providerType, ...toSnakeCaseRecord(tool.args) }];
  });

  return entries.length > 0 ? entries : undefined;
}
3072
+
3073
/**
 * Assemble the JSON body for a Responses API call.
 *
 * Settings that cannot be forwarded are reported through `warnings`
 * instead of being dropped silently:
 *   - `topK` is never sent.
 *   - When reasoning is active (reasoning model id or an explicit effort),
 *     `temperature`, `topP`, `presencePenalty` and `frequencyPenalty` are
 *     withheld.
 *   - `presencePenalty` / `frequencyPenalty` are not part of the request
 *     body built here in any mode, so they are also reported when
 *     reasoning is not active.
 *
 * Provider options for "openai" / `providerName` are merged last and can
 * therefore override any field set here.
 *
 * @param modelId Model identifier placed in `model`.
 * @param providerName Provider label used to look up provider options.
 * @param options Unified call options.
 * @param stream Whether to request a streaming response.
 * @param warnings Collector receiving unsupported-setting warnings.
 * @returns The request body.
 */
function buildOpenAIResponsesRequest(
  modelId: string,
  providerName: string,
  options: OpenAICompatibleLanguageOptions,
  stream: boolean,
  warnings: WarningCollector,
): OpenAIResponsesRequest {
  const isReasoningModel = isOpenAIReasoningModel(modelId);
  const reasoningEffort = resolveOpenAIReasoningEffort(options.reasoning);
  const reasoningEnabled = isReasoningModel || reasoningEffort !== undefined;

  if (options.topK !== undefined) {
    warnings.push({
      type: "unsupported-setting",
      provider: "openai",
      setting: "topK",
      details: "OpenAI Responses API does not expose top_k; the value was dropped.",
    });
  }

  if (reasoningEnabled) {
    // Same param-sanitization rules as Chat Completions: reasoning models
    // reject sampling params. Drop with a warning.
    const dropped: Array<[keyof typeof options, string]> = [
      ["temperature", "temperature"],
      ["topP", "top_p"],
      ["presencePenalty", "presence_penalty"],
      ["frequencyPenalty", "frequency_penalty"],
    ];
    for (const [key, openaiName] of dropped) {
      if (options[key] !== undefined) {
        warnings.push({
          type: "unsupported-setting",
          provider: "openai",
          setting: key,
          details:
            `Dropped because OpenAI reasoning models reject ${openaiName}. Reasoning was active for this request.`,
        });
      }
    }
  } else {
    // The body below never carries penalty fields, so surface the drop
    // here as well rather than discarding the caller's value unnoticed.
    const unsupported: Array<[keyof typeof options, string]> = [
      ["presencePenalty", "presence_penalty"],
      ["frequencyPenalty", "frequency_penalty"],
    ];
    for (const [key, openaiName] of unsupported) {
      if (options[key] !== undefined) {
        warnings.push({
          type: "unsupported-setting",
          provider: "openai",
          setting: key,
          details: `OpenAI Responses API does not expose ${openaiName}; the value was dropped.`,
        });
      }
    }
  }

  const { instructions, input } = toOpenAIResponsesInput(options.prompt);
  const responsesTools = toOpenAIResponsesTools(options.tools);

  const body: OpenAIResponsesRequest = {
    model: modelId,
    input,
    ...(instructions !== undefined ? { instructions } : {}),
    ...(stream ? { stream: true } : {}),
    ...(options.maxOutputTokens !== undefined
      ? { max_output_tokens: options.maxOutputTokens }
      : {}),
    ...(!reasoningEnabled && options.temperature !== undefined
      ? { temperature: options.temperature }
      : {}),
    ...(!reasoningEnabled && options.topP !== undefined ? { top_p: options.topP } : {}),
    ...(responsesTools ? { tools: responsesTools } : {}),
    // NOTE(review): toolChoice is forwarded in its unified shape; confirm it
    // matches the Responses API `tool_choice` wire format.
    ...(options.toolChoice !== undefined ? { tool_choice: options.toolChoice } : {}),
    // The Responses API surfaces reasoning effort + summary verbosity
    // in a structured `reasoning` object instead of a flat field. We
    // request "auto" summary so callers see structured summary parts
    // without having to opt into them per request.
    ...(reasoningEffort !== undefined
      ? { reasoning: { effort: reasoningEffort, summary: "auto" } }
      : {}),
    ...(typeof options.userId === "string" && options.userId.length > 0
      ? { user: options.userId }
      : {}),
    ...(options.serviceTier !== undefined ? { service_tier: options.serviceTier } : {}),
    ...(options.parallelToolCalls !== undefined
      ? { parallel_tool_calls: options.parallelToolCalls }
      : {}),
    // Responses API uses `text.format` instead of Chat Completions'
    // `response_format`. The shape is similar but nested under `text`.
    ...(options.responseFormat && options.responseFormat.type !== "text"
      ? {
        text: {
          format: options.responseFormat.type === "json" ? { type: "json_object" } : {
            type: "json_schema",
            name: options.responseFormat.name,
            ...(typeof options.responseFormat.description === "string"
              ? { description: options.responseFormat.description }
              : {}),
            schema: unwrapToolInputSchema(options.responseFormat.schema),
            ...(options.responseFormat.strict !== undefined
              ? { strict: options.responseFormat.strict }
              : {}),
          },
        },
      }
      : {}),
  };

  // Provider options are applied last so callers can override or extend
  // any wire field.
  Object.assign(body, readProviderOptions(options.providerOptions, "openai", providerName));
  return body;
}
3169
+
3170
/**
 * Read token usage from a Responses API payload.
 *
 * The usage record is looked up at `payload.response.usage` first (the
 * shape of the streaming terminal events) and then at `payload.usage`
 * (non-streaming responses). The wire names are `input_tokens` /
 * `output_tokens` / `total_tokens`; when the total is absent it is derived
 * from whichever of the other two counts is present. Cached input tokens
 * are taken from `input_tokens_details.cached_tokens`.
 *
 * Reasoning token counts (`output_tokens_details.reasoning_tokens`) are
 * not surfaced by this function.
 *
 * @param payload Parsed response body or stream event.
 * @returns Normalized usage, or `undefined` when no usage record exists.
 */
function extractOpenAIResponsesUsage(payload: unknown): RuntimeUsage | undefined {
  const root = readRecord(payload);
  const nested = readRecord(root?.response);
  const usage = readRecord(nested?.usage) ?? readRecord(root?.usage);
  if (!usage) return undefined;

  const asNumber = (value: unknown): number | undefined =>
    typeof value === "number" ? value : undefined;

  const inputTokens = asNumber(usage.input_tokens);
  const outputTokens = asNumber(usage.output_tokens);

  let totalTokens = asNumber(usage.total_tokens);
  if (totalTokens === undefined && (inputTokens !== undefined || outputTokens !== undefined)) {
    totalTokens = (inputTokens ?? 0) + (outputTokens ?? 0);
  }

  const cachedTokens = asNumber(readRecord(usage.input_tokens_details)?.cached_tokens);

  return {
    inputTokens,
    outputTokens,
    totalTokens,
    ...(cachedTokens !== undefined ? { cacheReadInputTokens: cachedTokens } : {}),
  };
}
3201
+
3202
/**
 * Map a Responses API `status` value to a finish reason.
 *
 *   - "completed"  -> { unified: "stop", raw }
 *   - "incomplete" -> { unified: "length", raw }
 *   - "failed"     -> { unified: "error", raw }
 *   - "in_progress" or any non-string value -> null
 *   - any other string is returned unchanged
 *
 * @param raw The status value as found on the response object.
 * @returns The finish reason, or `null` when none can be derived yet.
 */
function normalizeOpenAIResponsesFinishReason(
  raw: unknown,
): string | { unified: string; raw: string } | null {
  if (typeof raw !== "string" || raw === "in_progress") return null;

  // A Map (not an object literal) so statuses such as "constructor" cannot
  // hit inherited properties.
  const unifiedByStatus = new Map<string, string>([
    ["completed", "stop"],
    ["incomplete", "length"],
    ["failed", "error"],
  ]);

  const unified = unifiedByStatus.get(raw);
  return unified === undefined ? raw : { unified, raw };
}
3219
+
3220
/**
 * Content entries produced when a non-streaming Responses API result is
 * converted to the unified shape.
 */
type OpenAIResponsesContentPart =
  // Concatenated output text of one message item.
  | { type: "text"; text: string }
  // One reasoning item: its summary entries and, when present, the
  // encrypted content carried as `signature`.
  | {
    type: "reasoning";
    summaries?: Array<{ id?: string; text: string }>;
    signature?: string;
  }
  // One function call; `input` is the JSON-encoded argument string.
  | { type: "tool-call"; toolCallId: string; toolName: string; input: string };
3228
+
3229
/**
 * Convert a non-streaming Responses API payload into unified content,
 * finish reason and usage.
 *
 * Output items are handled by type:
 *   - `message`: the texts of its `output_text` parts are concatenated
 *     into one text entry (omitted when empty).
 *   - `function_call`: becomes a tool-call entry; `call_id` is preferred
 *     over `id`, and non-string arguments are JSON-encoded.
 *   - `reasoning`: non-empty summary texts are kept as `summaries`, and
 *     `encrypted_content` is exposed as `signature`.
 * Other item types are ignored.
 *
 * The finish reason starts from the response `status` and is refined with
 * information the status alone does not carry:
 *   - a completed response that contains function calls reports
 *     "tool-calls" rather than "stop";
 *   - an incomplete response whose `incomplete_details.reason` is
 *     "content_filter" reports "content-filter" rather than "length".
 *
 * @param payload Parsed JSON response body.
 * @returns Unified generate result.
 */
function buildOpenAIResponsesGenerateResult(payload: unknown): {
  content: OpenAIResponsesContentPart[];
  finishReason?: string | { unified: string; raw: string } | null;
  usage?: RuntimeUsage;
} {
  const record = readRecord(payload);
  const output = Array.isArray(record?.output) ? record.output : [];
  const content: OpenAIResponsesContentPart[] = [];
  let sawToolCall = false;

  for (const item of output) {
    const itemRecord = readRecord(item);
    const itemType = typeof itemRecord?.type === "string" ? itemRecord.type : undefined;

    if (itemType === "message" && Array.isArray(itemRecord?.content)) {
      // A message item bundles one or more output_text parts. Concat
      // their texts into a single text content entry.
      let text = "";
      for (const part of itemRecord.content) {
        const p = readRecord(part);
        if (typeof p?.type === "string" && p.type === "output_text" && typeof p.text === "string") {
          text += p.text;
        }
      }
      if (text.length > 0) {
        content.push({ type: "text", text });
      }
      continue;
    }

    if (itemType === "function_call") {
      sawToolCall = true;
      content.push({
        type: "tool-call",
        toolCallId: typeof itemRecord?.call_id === "string"
          ? itemRecord.call_id
          : (typeof itemRecord?.id === "string" ? itemRecord.id : ""),
        toolName: typeof itemRecord?.name === "string" ? itemRecord.name : "",
        input: typeof itemRecord?.arguments === "string"
          ? itemRecord.arguments
          : stringifyJsonValue(itemRecord?.arguments ?? {}),
      });
      continue;
    }

    if (itemType === "reasoning") {
      const summary = Array.isArray(itemRecord?.summary) ? itemRecord.summary : [];
      const summaries: Array<{ id?: string; text: string }> = [];
      for (const s of summary) {
        const sr = readRecord(s);
        if (typeof sr?.text === "string" && sr.text.length > 0) {
          summaries.push({
            ...(typeof sr?.id === "string" ? { id: sr.id } : {}),
            text: sr.text,
          });
        }
      }
      content.push({
        type: "reasoning",
        ...(summaries.length > 0 ? { summaries } : {}),
        ...(typeof itemRecord?.encrypted_content === "string"
          ? { signature: itemRecord.encrypted_content }
          : {}),
      });
      continue;
    }
  }

  // `status` cannot distinguish "finished with text" from "finished by
  // requesting tools", nor a token cutoff from a content filter; refine it.
  let finishReason = normalizeOpenAIResponsesFinishReason(record?.status);
  if (typeof finishReason === "object" && finishReason !== null) {
    if (finishReason.unified === "stop" && sawToolCall) {
      finishReason = { unified: "tool-calls", raw: finishReason.raw };
    } else if (
      finishReason.unified === "length" &&
      readRecord(record?.incomplete_details)?.reason === "content_filter"
    ) {
      finishReason = { unified: "content-filter", raw: finishReason.raw };
    }
  }

  return {
    content,
    finishReason,
    usage: extractOpenAIResponsesUsage(payload),
  };
}
3301
+
3302
/** Per-item bookkeeping for a reasoning output item while streaming. */
type OpenAIResponsesStreamReasoningState = {
  /** Identifier used on the emitted reasoning-start/delta/end parts. */
  id: string;
  /** Whether reasoning-start has already been emitted for this item. */
  emittedStart: boolean;
};

/** Per-item bookkeeping for a function_call output item while streaming. */
type OpenAIResponsesStreamFunctionCallState = {
  /** Output item id; argument delta events refer to it via `item_id`. */
  id: string;
  /** Call id reported on the emitted tool parts. */
  toolCallId: string;
  /** Function name. */
  name: string;
  /** Argument JSON text accumulated from delta events. */
  arguments: string;
};
3313
+
3314
/**
 * Parse the Responses API server-sent event stream into unified stream
 * parts. Every event carries a `type` naming its lifecycle phase; events
 * without a string `type`, and the `[DONE]` sentinel, are ignored.
 *
 * Emitted parts:
 *   - `text-delta` for each non-empty `response.output_text.delta`.
 *   - `reasoning-start` / `reasoning-delta` / `reasoning-end` around the
 *     summary text of each reasoning item. The start part is emitted
 *     lazily on the first non-empty delta.
 *   - `tool-input-start` / `tool-input-delta` while function-call
 *     arguments stream in, then one `tool-call` when the item is done.
 *   - a final `finish` part with the finish reason and, if seen, usage.
 *
 * The finish reason is derived from the terminal event's response status
 * and refined: "tool-calls" when the response completed after emitting at
 * least one tool call, "content-filter" when an incomplete response
 * reports `incomplete_details.reason === "content_filter"`.
 *
 * @param stream Raw response body bytes.
 * @returns Async sequence of unified stream parts.
 */
async function* streamOpenAIResponsesParts(
  stream: ReadableStream<Uint8Array>,
): AsyncIterable<unknown> {
  const decoder = new TextDecoder();
  let buffer = "";
  const reasoningBlocks = new Map<string, OpenAIResponsesStreamReasoningState>();
  const functionCalls = new Map<string, OpenAIResponsesStreamFunctionCallState>();
  const startedToolCalls = new Set<string>();
  let finishReason: string | { unified: string; raw: string } | null = null;
  let usage: RuntimeUsage | undefined;
  let reasoningCounter = 0;
  let emittedToolCall = false;

  for await (const chunk of stream) {
    buffer += decoder.decode(chunk, { stream: true });
    const parsed = parseSseChunk(buffer);
    buffer = parsed.remainder;

    for (const event of parsed.events) {
      if (event === "[DONE]") continue;
      const record = readRecord(event);
      const type = typeof record?.type === "string" ? record.type : undefined;
      if (!type) continue;

      // response.output_item.added: a new output item begins. Track
      // function_call items so their argument deltas can be attributed,
      // and reasoning items so summary deltas can group correctly.
      if (type === "response.output_item.added") {
        const item = readRecord(record?.item);
        const itemType = typeof item?.type === "string" ? item.type : undefined;
        const itemId = typeof item?.id === "string" ? item.id : undefined;
        if (itemType === "function_call" && itemId) {
          const callId = typeof item?.call_id === "string" ? item.call_id : itemId;
          const name = typeof item?.name === "string" ? item.name : "";
          functionCalls.set(itemId, {
            id: itemId,
            toolCallId: callId,
            name,
            arguments: "",
          });
        }
        if (itemType === "reasoning" && itemId) {
          reasoningBlocks.set(itemId, {
            id: `reasoning-${reasoningCounter++}`,
            emittedStart: false,
          });
        }
        continue;
      }

      // response.output_text.delta: text chunk for a message item.
      if (type === "response.output_text.delta" && typeof record?.delta === "string") {
        if (record.delta.length > 0) {
          yield { type: "text-delta", delta: record.delta };
        }
        continue;
      }

      // response.reasoning_summary_text.delta: reasoning summary text
      // chunk. The first delta on an item lazily emits the
      // reasoning-start event so callers can group deltas into a part.
      if (type === "response.reasoning_summary_text.delta" && typeof record?.delta === "string") {
        const itemId = typeof record?.item_id === "string" ? record.item_id : undefined;
        const state = itemId ? reasoningBlocks.get(itemId) : undefined;
        if (state && record.delta.length > 0) {
          if (!state.emittedStart) {
            yield { type: "reasoning-start", id: state.id };
            state.emittedStart = true;
          }
          yield { type: "reasoning-delta", id: state.id, delta: record.delta };
        }
        continue;
      }

      // response.function_call_arguments.delta: tool call argument
      // chunk. The first delta lazily emits tool-input-start.
      if (type === "response.function_call_arguments.delta" && typeof record?.delta === "string") {
        const itemId = typeof record?.item_id === "string" ? record.item_id : undefined;
        const state = itemId ? functionCalls.get(itemId) : undefined;
        if (state && record.delta.length > 0) {
          if (!startedToolCalls.has(state.id)) {
            yield {
              type: "tool-input-start",
              id: state.toolCallId,
              toolName: state.name,
            };
            startedToolCalls.add(state.id);
          }
          state.arguments += record.delta;
          yield {
            type: "tool-input-delta",
            id: state.toolCallId,
            delta: record.delta,
          };
        }
        continue;
      }

      // response.output_item.done: an item has finished emitting deltas.
      // Close any reasoning or function-call streams that were open.
      if (type === "response.output_item.done") {
        const item = readRecord(record?.item);
        const itemType = typeof item?.type === "string" ? item.type : undefined;
        const itemId = typeof item?.id === "string" ? item.id : undefined;
        if (itemType === "reasoning" && itemId) {
          const state = reasoningBlocks.get(itemId);
          if (state?.emittedStart) {
            yield { type: "reasoning-end", id: state.id };
          }
          reasoningBlocks.delete(itemId);
        }
        if (itemType === "function_call" && itemId) {
          const state = functionCalls.get(itemId);
          if (state) {
            // Accumulated deltas win when present. If no argument deltas
            // arrived, fall back to the arguments on the finished item so
            // the call is not reported with an empty input.
            const input = state.arguments.length === 0 && typeof item?.arguments === "string"
              ? item.arguments
              : state.arguments;
            yield {
              type: "tool-call",
              toolCallId: state.toolCallId,
              toolName: state.name,
              input,
            };
            emittedToolCall = true;
          }
          functionCalls.delete(itemId);
        }
        continue;
      }

      // response.completed: terminal event with the final response object
      // (status + usage). Capture both for the final finish part.
      if (type === "response.completed") {
        usage = extractOpenAIResponsesUsage(record) ?? usage;
        const responseRecord = readRecord(record?.response);
        finishReason = normalizeOpenAIResponsesFinishReason(responseRecord?.status);
        continue;
      }

      if (type === "response.failed" || type === "response.incomplete") {
        const responseRecord = readRecord(record?.response);
        let terminalReason = normalizeOpenAIResponsesFinishReason(responseRecord?.status) ??
          (type === "response.failed"
            ? { unified: "error", raw: "failed" }
            : { unified: "length", raw: "incomplete" });
        // An incomplete response is not always a token cutoff; the details
        // say when the content filter stopped it.
        if (
          typeof terminalReason === "object" &&
          terminalReason.unified === "length" &&
          readRecord(responseRecord?.incomplete_details)?.reason === "content_filter"
        ) {
          terminalReason = { unified: "content-filter", raw: terminalReason.raw };
        }
        finishReason = terminalReason;
        usage = extractOpenAIResponsesUsage(record) ?? usage;
        continue;
      }
    }
  }

  // Close any reasoning streams still open at end-of-stream (defensive:
  // output_item.done normally closes them).
  for (const state of reasoningBlocks.values()) {
    if (state.emittedStart) {
      yield { type: "reasoning-end", id: state.id };
    }
  }

  // The response status is "completed" whether the model answered or asked
  // for tools; report the latter as "tool-calls".
  if (
    emittedToolCall &&
    typeof finishReason === "object" &&
    finishReason !== null &&
    finishReason.unified === "stop"
  ) {
    finishReason = { unified: "tool-calls", raw: finishReason.raw };
  }

  yield {
    type: "finish",
    finishReason,
    ...(usage ? { usage } : {}),
  };
}
3481
+
3482
/**
 * Create a model runtime that talks to the OpenAI Responses endpoint.
 *
 * Both entry points build the request body with
 * `buildOpenAIResponsesRequest`, POST it with a bearer `authorization`
 * header (plus any per-call headers), and attach collected warnings to the
 * result only when there are any.
 *
 * @param config Provider configuration (name, base URL, API key, fetch).
 * @param modelId Model identifier sent with every request.
 * @returns The runtime exposing `doGenerate` and `doStream`.
 */
export function createOpenAIResponsesRuntime(
  config: OpenAIRuntimeConfig,
  modelId: string,
): ModelRuntime {
  const fetchImpl = config.fetch ?? globalThis.fetch;

  // Fetch init shared by the JSON and streaming calls.
  const buildInit = (options: OpenAICompatibleLanguageOptions, body: OpenAIResponsesRequest) => ({
    method: "POST",
    headers: createRequestHeaders({
      apiKeyHeaderName: "authorization",
      apiKey: `Bearer ${config.apiKey}`,
      extraHeaders: options.headers,
    }),
    body: JSON.stringify(body),
    signal: options.abortSignal,
  });

  return {
    provider: config.name ?? "openai",
    modelId,
    specificationVersion: "v3",
    supportedUrls: {},
    doGenerate(optionsForRuntime: unknown) {
      const options = optionsForRuntime as OpenAICompatibleLanguageOptions;
      const providerLabel = config.name ?? "openai";
      const url = getOpenAIResponsesUrl(config.baseURL);
      const warnings = createWarningCollector();
      const body = buildOpenAIResponsesRequest(modelId, providerLabel, options, false, warnings);

      return requestJson({
        url,
        fetchImpl,
        providerLabel,
        providerKind: "openai",
        init: buildInit(options, body),
      }).then((payload) => {
        const drained = warnings.drain();
        const result = buildOpenAIResponsesGenerateResult(payload);
        return drained.length > 0 ? { ...result, warnings: drained } : result;
      });
    },
    doStream(optionsForRuntime: unknown) {
      const options = optionsForRuntime as OpenAICompatibleLanguageOptions;
      const providerLabel = config.name ?? "openai";
      const url = getOpenAIResponsesUrl(config.baseURL);
      const warnings = createWarningCollector();
      const body = buildOpenAIResponsesRequest(modelId, providerLabel, options, true, warnings);

      return requestStream({
        url,
        fetchImpl,
        providerLabel,
        providerKind: "openai",
        init: buildInit(options, body),
      }).then((responseStream) => {
        const drained = warnings.drain();
        const result = {
          stream: ReadableStream.from(streamOpenAIResponsesParts(responseStream)),
        };
        return drained.length > 0 ? { ...result, warnings: drained } : result;
      });
    },
  };
}
@@ -1843,16 +3573,19 @@ export function createAnthropicModelRuntime(
1843
3573
  doGenerate(optionsForRuntime: unknown) {
1844
3574
  const options = optionsForRuntime as OpenAICompatibleLanguageOptions;
1845
3575
  const url = getAnthropicMessagesUrl(config.baseURL);
3576
+ const warnings = createWarningCollector();
1846
3577
  const body = buildAnthropicMessagesRequest(
1847
3578
  modelId,
1848
3579
  config.name ?? "anthropic",
1849
3580
  options,
1850
3581
  false,
3582
+ warnings,
1851
3583
  );
1852
3584
  return requestJson({
1853
3585
  url,
1854
3586
  fetchImpl,
1855
3587
  providerLabel: config.name ?? "anthropic",
3588
+ providerKind: "anthropic",
1856
3589
  init: {
1857
3590
  method: "POST",
1858
3591
  headers: createAnthropicRequestHeaders({
@@ -1863,21 +3596,30 @@ export function createAnthropicModelRuntime(
1863
3596
  body: JSON.stringify(body),
1864
3597
  signal: options.abortSignal,
1865
3598
  },
1866
- }).then(buildAnthropicGenerateResult);
3599
+ }).then((payload) => {
3600
+ const drained = warnings.drain();
3601
+ return {
3602
+ ...buildAnthropicGenerateResult(payload),
3603
+ ...(drained.length > 0 ? { warnings: drained } : {}),
3604
+ };
3605
+ });
1867
3606
  },
1868
3607
  doStream(optionsForRuntime: unknown) {
1869
3608
  const options = optionsForRuntime as OpenAICompatibleLanguageOptions;
1870
3609
  const url = getAnthropicMessagesUrl(config.baseURL);
3610
+ const warnings = createWarningCollector();
1871
3611
  const body = buildAnthropicMessagesRequest(
1872
3612
  modelId,
1873
3613
  config.name ?? "anthropic",
1874
3614
  options,
1875
3615
  true,
3616
+ warnings,
1876
3617
  );
1877
3618
  return requestStream({
1878
3619
  url,
1879
3620
  fetchImpl,
1880
3621
  providerLabel: config.name ?? "anthropic",
3622
+ providerKind: "anthropic",
1881
3623
  init: {
1882
3624
  method: "POST",
1883
3625
  headers: createAnthropicRequestHeaders({
@@ -1888,9 +3630,13 @@ export function createAnthropicModelRuntime(
1888
3630
  body: JSON.stringify(body),
1889
3631
  signal: options.abortSignal,
1890
3632
  },
1891
- }).then((responseStream) => ({
1892
- stream: ReadableStream.from(streamAnthropicCompatibleParts(responseStream)),
1893
- }));
3633
+ }).then((responseStream) => {
3634
+ const drained = warnings.drain();
3635
+ return {
3636
+ stream: ReadableStream.from(streamAnthropicCompatibleParts(responseStream)),
3637
+ ...(drained.length > 0 ? { warnings: drained } : {}),
3638
+ };
3639
+ });
1894
3640
  },
1895
3641
  };
1896
3642
  }
@@ -1908,11 +3654,17 @@ export function createGoogleModelRuntime(
1908
3654
  doGenerate(optionsForRuntime: unknown) {
1909
3655
  const options = optionsForRuntime as OpenAICompatibleLanguageOptions;
1910
3656
  const url = getGoogleGenerateContentUrl(config.baseURL, modelId);
1911
- const body = buildGoogleGenerateContentRequest(config.name ?? "google", options);
3657
+ const warnings = createWarningCollector();
3658
+ const body = buildGoogleGenerateContentRequest(
3659
+ config.name ?? "google",
3660
+ options,
3661
+ warnings,
3662
+ );
1912
3663
  return requestJson({
1913
3664
  url,
1914
3665
  fetchImpl,
1915
3666
  providerLabel: config.name ?? "google",
3667
+ providerKind: "google",
1916
3668
  init: {
1917
3669
  method: "POST",
1918
3670
  headers: createRequestHeaders({
@@ -1923,16 +3675,28 @@ export function createGoogleModelRuntime(
1923
3675
  body: JSON.stringify(body),
1924
3676
  signal: options.abortSignal,
1925
3677
  },
1926
- }).then(buildGoogleGenerateResult);
3678
+ }).then((payload) => {
3679
+ const drained = warnings.drain();
3680
+ return {
3681
+ ...buildGoogleGenerateResult(payload),
3682
+ ...(drained.length > 0 ? { warnings: drained } : {}),
3683
+ };
3684
+ });
1927
3685
  },
1928
3686
  doStream(optionsForRuntime: unknown) {
1929
3687
  const options = optionsForRuntime as OpenAICompatibleLanguageOptions;
1930
3688
  const url = getGoogleStreamGenerateContentUrl(config.baseURL, modelId);
1931
- const body = buildGoogleGenerateContentRequest(config.name ?? "google", options);
3689
+ const warnings = createWarningCollector();
3690
+ const body = buildGoogleGenerateContentRequest(
3691
+ config.name ?? "google",
3692
+ options,
3693
+ warnings,
3694
+ );
1932
3695
  return requestStream({
1933
3696
  url,
1934
3697
  fetchImpl,
1935
3698
  providerLabel: config.name ?? "google",
3699
+ providerKind: "google",
1936
3700
  init: {
1937
3701
  method: "POST",
1938
3702
  headers: createRequestHeaders({
@@ -1943,9 +3707,13 @@ export function createGoogleModelRuntime(
1943
3707
  body: JSON.stringify(body),
1944
3708
  signal: options.abortSignal,
1945
3709
  },
1946
- }).then((responseStream) => ({
1947
- stream: ReadableStream.from(streamGoogleCompatibleParts(responseStream)),
1948
- }));
3710
+ }).then((responseStream) => {
3711
+ const drained = warnings.drain();
3712
+ return {
3713
+ stream: ReadableStream.from(streamGoogleCompatibleParts(responseStream)),
3714
+ ...(drained.length > 0 ? { warnings: drained } : {}),
3715
+ };
3716
+ });
1949
3717
  },
1950
3718
  };
1951
3719
  }
@@ -1973,6 +3741,7 @@ export function createOpenAIEmbeddingRuntime(
1973
3741
  url,
1974
3742
  fetchImpl,
1975
3743
  providerLabel: config.name ?? "openai",
3744
+ providerKind: "openai",
1976
3745
  init: {
1977
3746
  method: "POST",
1978
3747
  headers: {
@@ -2021,6 +3790,7 @@ export function createGoogleEmbeddingRuntime(
2021
3790
  url,
2022
3791
  fetchImpl,
2023
3792
  providerLabel: config.name ?? "google",
3793
+ providerKind: "google",
2024
3794
  init: {
2025
3795
  method: "POST",
2026
3796
  headers: {