npm - @prometheus-ai/ai - Versions diffs - 0.5.4 → 0.5.8 - Mend

@prometheus-ai/ai 0.5.4 → 0.5.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (377)

package/dist/types/auth-broker/remote-store.d.ts +2 -1
package/dist/types/auth-broker/wire-schemas.d.ts +4 -1
package/dist/types/auth-gateway/server.d.ts +19 -0
package/dist/types/auth-gateway/types.d.ts +9 -3
package/dist/types/auth-retry.d.ts +119 -0
package/dist/types/auth-storage.d.ts +217 -8
package/dist/types/errors.d.ts +24 -0
package/dist/types/index.d.ts +5 -9
package/dist/types/provider-details.d.ts +1 -1
package/dist/types/providers/amazon-bedrock.d.ts +12 -6
package/dist/types/providers/anthropic-client.d.ts +10 -3
package/dist/types/providers/anthropic-messages-server-schema.d.ts +2 -2
package/dist/types/providers/anthropic-messages-server.d.ts +3 -3
package/dist/types/providers/anthropic-wire.d.ts +3 -3
package/dist/types/providers/anthropic.d.ts +41 -34
package/dist/types/providers/aws-credentials.d.ts +8 -0
package/dist/types/providers/azure-openai-responses.d.ts +1 -0
package/dist/types/providers/google-gemini-cli.d.ts +22 -1
package/dist/types/providers/google-shared.d.ts +22 -0
package/dist/types/providers/google-types.d.ts +13 -1
package/dist/types/providers/mock.d.ts +8 -3
package/dist/types/providers/ollama.d.ts +6 -0
package/dist/types/providers/openai-chat-server-schema.d.ts +6 -3
package/dist/types/providers/openai-chat-server.d.ts +3 -3
package/dist/types/providers/openai-chat-wire.d.ts +644 -0
package/dist/types/providers/openai-codex/request-transformer.d.ts +8 -0
package/dist/types/providers/openai-codex/response-handler.d.ts +9 -0
package/dist/types/providers/openai-codex-responses.d.ts +31 -2
package/dist/types/providers/openai-completions-compat.d.ts +2 -25
package/dist/types/providers/openai-completions.d.ts +2 -10
package/dist/types/providers/openai-responses-server-schema.d.ts +4 -4
package/dist/types/providers/openai-responses-server.d.ts +2 -2
package/dist/types/providers/openai-responses-shared.d.ts +49 -9
package/dist/types/providers/openai-responses-wire.d.ts +6065 -0
package/dist/types/providers/openai-responses.d.ts +13 -4
package/dist/types/providers/prometheus-native-client.d.ts +9 -0
package/dist/types/providers/prometheus-native-server.d.ts +4 -3
package/dist/types/providers/transform-messages.d.ts +1 -2
package/dist/types/rate-limit-utils.d.ts +3 -2
package/dist/types/registry/aimlapi.d.ts +4 -0
package/dist/types/registry/alibaba-coding-plan.d.ts +7 -0
package/dist/types/registry/amazon-bedrock.d.ts +5 -0
package/dist/types/registry/anthropic.d.ts +10 -0
package/dist/types/{utils/oauth → registry}/api-key-login.d.ts +8 -2
package/dist/types/{utils/oauth → registry}/api-key-validation.d.ts +15 -0
package/dist/types/registry/cerebras.d.ts +7 -0
package/dist/types/registry/cloudflare-ai-gateway.d.ts +13 -0
package/dist/types/registry/cursor.d.ts +7 -0
package/dist/types/registry/deepseek.d.ts +8 -0
package/dist/types/registry/derived.d.ts +5 -0
package/dist/types/registry/firepass.d.ts +16 -0
package/dist/types/registry/fireworks.d.ts +7 -0
package/dist/types/registry/github-copilot.d.ts +7 -0
package/dist/types/registry/gitlab-duo.d.ts +9 -0
package/dist/types/registry/google-antigravity.d.ts +9 -0
package/dist/types/registry/google-gemini-cli.d.ts +9 -0
package/dist/types/registry/google-vertex.d.ts +5 -0
package/dist/types/registry/google.d.ts +4 -0
package/dist/types/registry/groq.d.ts +4 -0
package/dist/types/registry/huggingface.d.ts +7 -0
package/dist/types/registry/index.d.ts +4 -0
package/dist/types/registry/kagi.d.ts +14 -0
package/dist/types/registry/kilo.d.ts +7 -0
package/dist/types/registry/kimi-code.d.ts +7 -0
package/dist/types/registry/litellm.d.ts +13 -0
package/dist/types/registry/lm-studio.d.ts +8 -0
package/dist/types/registry/minimax-code-cn.d.ts +6 -0
package/dist/types/registry/minimax-code.d.ts +6 -0
package/dist/types/registry/minimax.d.ts +4 -0
package/dist/types/registry/mistral.d.ts +4 -0
package/dist/types/registry/moonshot.d.ts +7 -0
package/dist/types/registry/nanogpt.d.ts +7 -0
package/dist/types/registry/nvidia.d.ts +7 -0
package/dist/types/registry/oauth/__tests__/xai-oauth.test.d.ts +1 -0
package/dist/types/{utils → registry}/oauth/anthropic.d.ts +2 -1
package/dist/types/{utils → registry}/oauth/github-copilot.d.ts +15 -23
package/dist/types/{utils → registry}/oauth/index.d.ts +1 -0
package/dist/types/{utils → registry}/oauth/minimax-code.d.ts +5 -5
package/dist/types/{utils → registry}/oauth/types.d.ts +6 -1
package/dist/types/{utils → registry}/oauth/xai-oauth.d.ts +2 -1
package/dist/types/registry/ollama-cloud.d.ts +7 -0
package/dist/types/registry/ollama.d.ts +12 -0
package/dist/types/registry/openai-codex-device.d.ts +8 -0
package/dist/types/registry/openai-codex.d.ts +9 -0
package/dist/types/registry/openai.d.ts +4 -0
package/dist/types/registry/opencode-go.d.ts +6 -0
package/dist/types/registry/opencode-zen.d.ts +6 -0
package/dist/types/registry/openrouter.d.ts +13 -0
package/dist/types/registry/parallel.d.ts +14 -0
package/dist/types/registry/perplexity.d.ts +7 -0
package/dist/types/registry/qianfan.d.ts +7 -0
package/dist/types/registry/qwen-portal.d.ts +7 -0
package/dist/types/registry/registry.d.ts +272 -0
package/dist/types/registry/synthetic.d.ts +6 -0
package/dist/types/registry/tavily.d.ts +14 -0
package/dist/types/registry/together.d.ts +6 -0
package/dist/types/registry/types.d.ts +51 -0
package/dist/types/registry/venice.d.ts +13 -0
package/dist/types/registry/vercel-ai-gateway.d.ts +7 -0
package/dist/types/registry/vllm.d.ts +7 -0
package/dist/types/registry/wafer-pass.d.ts +6 -0
package/dist/types/registry/wafer-serverless.d.ts +6 -0
package/dist/types/registry/xai-oauth.d.ts +7 -0
package/dist/types/registry/xai.d.ts +4 -0
package/dist/types/registry/xiaomi-token-plan-ams.d.ts +6 -0
package/dist/types/registry/xiaomi-token-plan-cn.d.ts +6 -0
package/dist/types/registry/xiaomi-token-plan-sgp.d.ts +6 -0
package/dist/types/registry/xiaomi.d.ts +6 -0
package/dist/types/registry/zai.d.ts +7 -0
package/dist/types/registry/zenmux.d.ts +7 -0
package/dist/types/registry/zhipu-coding-plan.d.ts +7 -0
package/dist/types/stream.d.ts +9 -1
package/dist/types/types.d.ts +56 -295
package/dist/types/usage/google-antigravity.d.ts +15 -1
package/dist/types/usage/openai-codex-reset.d.ts +79 -0
package/dist/types/usage/openai-codex.d.ts +1 -0
package/dist/types/usage.d.ts +77 -4
package/dist/types/utils/abort.d.ts +6 -0
package/dist/types/utils/event-stream.d.ts +2 -0
package/dist/types/utils/http-inspector.d.ts +0 -1
package/dist/types/utils/idle-iterator.d.ts +35 -0
package/dist/types/utils/openai-http.d.ts +58 -0
package/dist/types/utils/request-debug.d.ts +3 -0
package/dist/types/utils/retry-after.d.ts +1 -0
package/dist/types/utils/schema/fields.d.ts +5 -0
package/dist/types/utils/schema/json-schema-validator.d.ts +8 -0
package/dist/types/utils/schema/stamps.d.ts +7 -15
package/dist/types/utils/sse-debug.d.ts +0 -5
package/dist/types/utils/stream-markup-healing.d.ts +2 -0
package/dist/types/utils.d.ts +1 -5
package/package.json +17 -29
package/src/auth-broker/remote-store.ts +10 -1
package/src/auth-broker/snapshot-cache.ts +1 -1
package/src/auth-broker/wire-schemas.ts +1 -1
package/src/auth-gateway/http.ts +1 -1
package/src/auth-gateway/server.ts +95 -30
package/src/auth-gateway/types.ts +10 -2
package/src/auth-retry.ts +238 -0
package/src/auth-storage.ts +935 -430
package/src/errors.ts +32 -0
package/src/index.ts +9 -14
package/src/provider-details.ts +1 -1
package/src/providers/__tests__/google-auth.test.ts +144 -0
package/src/providers/amazon-bedrock.ts +70 -40
package/src/providers/anthropic-client.ts +15 -13
package/src/providers/anthropic-messages-server-schema.ts +17 -7
package/src/providers/anthropic-messages-server.ts +88 -20
package/src/providers/anthropic-wire.ts +4 -3
package/src/providers/anthropic.ts +1234 -621
package/src/providers/aws-credentials.ts +47 -5
package/src/providers/aws-eventstream.ts +5 -0
package/src/providers/azure-openai-responses.ts +117 -67
package/src/providers/cursor.ts +30 -30
package/src/providers/github-copilot-headers.ts +1 -1
package/src/providers/gitlab-duo.ts +36 -29
package/src/providers/google-auth.ts +71 -8
package/src/providers/google-gemini-cli.ts +118 -22
package/src/providers/google-shared.ts +163 -43
package/src/providers/google-types.ts +10 -1
package/src/providers/kimi.ts +1 -1
package/src/providers/mock.ts +11 -3
package/src/providers/ollama.ts +64 -7
package/src/providers/openai-anthropic-shim.ts +17 -8
package/src/providers/openai-chat-server-schema.ts +9 -3
package/src/providers/openai-chat-server.ts +82 -16
package/src/providers/openai-chat-wire.ts +847 -0
package/src/providers/openai-codex/request-transformer.ts +129 -34
package/src/providers/openai-codex/response-handler.ts +22 -1
package/src/providers/openai-codex-responses.ts +699 -247
package/src/providers/openai-completions-compat.ts +8 -308
package/src/providers/openai-completions.ts +416 -267
package/src/providers/openai-responses-server-schema.ts +15 -9
package/src/providers/openai-responses-server.ts +162 -114
package/src/providers/openai-responses-shared.ts +320 -82
package/src/providers/openai-responses-wire.ts +6391 -0
package/src/providers/openai-responses.ts +382 -176
package/src/providers/prometheus-native-client.ts +27 -11
package/src/providers/prometheus-native-server.ts +44 -17
package/src/providers/transform-messages.ts +311 -120
package/src/providers/vision-guard.ts +5 -3
package/src/rate-limit-utils.ts +13 -3
package/src/registry/aimlapi.ts +6 -0
package/src/{utils/oauth → registry}/alibaba-coding-plan.ts +8 -18
package/src/registry/amazon-bedrock.ts +22 -0
package/src/registry/anthropic.ts +26 -0
package/src/{utils/oauth → registry}/api-key-login.ts +25 -3
package/src/{utils/oauth → registry}/api-key-validation.ts +62 -2
package/src/{utils/oauth → registry}/cerebras.ts +8 -1
package/src/{utils/oauth → registry}/cloudflare-ai-gateway.ts +8 -12
package/src/registry/cursor.ts +20 -0
package/src/{utils/oauth → registry}/deepseek.ts +9 -17
package/src/registry/derived.ts +9 -0
package/src/{utils/oauth → registry}/firepass.ts +10 -2
package/src/{utils/oauth → registry}/fireworks.ts +8 -1
package/src/registry/github-copilot.ts +22 -0
package/src/registry/gitlab-duo.ts +19 -0
package/src/registry/google-antigravity.ts +21 -0
package/src/registry/google-gemini-cli.ts +21 -0
package/src/registry/google-vertex.ts +38 -0
package/src/registry/google.ts +6 -0
package/src/registry/groq.ts +6 -0
package/src/{utils/oauth → registry}/huggingface.ts +8 -19
package/src/registry/index.ts +4 -0
package/src/{utils/oauth → registry}/kagi.ts +9 -11
package/src/{utils/oauth → registry}/kilo.ts +11 -6
package/src/registry/kimi-code.ts +17 -0
package/src/{utils/oauth → registry}/litellm.ts +8 -12
package/src/{utils/oauth → registry}/lm-studio.ts +9 -17
package/src/registry/minimax-code-cn.ts +12 -0
package/src/registry/minimax-code.ts +12 -0
package/src/registry/minimax.ts +6 -0
package/src/registry/mistral.ts +6 -0
package/src/{utils/oauth → registry}/moonshot.ts +8 -9
package/src/{utils/oauth → registry}/nanogpt.ts +8 -1
package/src/{utils/oauth → registry}/nvidia.ts +8 -18
package/src/{utils → registry}/oauth/__tests__/xai-oauth.test.ts +4 -7
package/src/{utils → registry}/oauth/anthropic.ts +38 -17
package/src/{utils → registry}/oauth/github-copilot.ts +79 -115
package/src/registry/oauth/gitlab-duo.ts +198 -0
package/src/{utils → registry}/oauth/google-antigravity.ts +1 -4
package/src/{utils → registry}/oauth/google-gemini-cli.ts +1 -4
package/src/registry/oauth/index.ts +164 -0
package/src/{utils → registry}/oauth/minimax-code.ts +16 -14
package/src/{utils → registry}/oauth/types.ts +7 -51
package/src/{utils → registry}/oauth/wafer.ts +1 -1
package/src/{utils → registry}/oauth/xai-oauth.ts +16 -8
package/src/{utils → registry}/oauth/xiaomi.ts +9 -4
package/src/{utils/oauth → registry}/ollama-cloud.ts +8 -1
package/src/{utils/oauth → registry}/ollama.ts +8 -13
package/src/registry/openai-codex-device.ts +18 -0
package/src/registry/openai-codex.ts +19 -0
package/src/registry/openai.ts +6 -0
package/src/registry/opencode-go.ts +12 -0
package/src/registry/opencode-zen.ts +12 -0
package/src/{utils/oauth → registry}/openrouter.ts +10 -2
package/src/{utils/oauth → registry}/parallel.ts +9 -11
package/src/registry/perplexity.ts +13 -0
package/src/{utils/oauth → registry}/qianfan.ts +8 -17
package/src/{utils/oauth → registry}/qwen-portal.ts +8 -19
package/src/registry/registry.ts +149 -0
package/src/{utils/oauth → registry}/synthetic.ts +7 -1
package/src/{utils/oauth → registry}/tavily.ts +10 -12
package/src/{utils/oauth → registry}/together.ts +7 -1
package/src/registry/types.ts +56 -0
package/src/{utils/oauth → registry}/venice.ts +8 -12
package/src/{utils/oauth → registry}/vercel-ai-gateway.ts +8 -18
package/src/{utils/oauth → registry}/vllm.ts +9 -16
package/src/registry/wafer-pass.ts +12 -0
package/src/registry/wafer-serverless.ts +12 -0
package/src/registry/xai-oauth.ts +17 -0
package/src/registry/xai.ts +6 -0
package/src/registry/xiaomi-token-plan-ams.ts +12 -0
package/src/registry/xiaomi-token-plan-cn.ts +12 -0
package/src/registry/xiaomi-token-plan-sgp.ts +12 -0
package/src/registry/xiaomi.ts +12 -0
package/src/{utils/oauth → registry}/zai.ts +10 -22
package/src/{utils/oauth → registry}/zenmux.ts +8 -1
package/src/{utils/oauth/zhipu.ts → registry/zhipu-coding-plan.ts} +9 -21
package/src/stream.ts +229 -199
package/src/types.ts +63 -384
package/src/usage/claude.ts +4 -2
package/src/usage/github-copilot.ts +4 -2
package/src/usage/google-antigravity.ts +196 -28
package/src/usage/kimi.ts +1 -1
package/src/usage/minimax-code.ts +5 -6
package/src/usage/openai-codex-reset.ts +174 -0
package/src/usage/openai-codex.ts +19 -2
package/src/usage/zai.ts +2 -1
package/src/usage.ts +93 -4
package/src/utils/abort.ts +14 -0
package/src/utils/event-stream.ts +17 -0
package/src/utils/http-inspector.ts +4 -12
package/src/utils/idle-iterator.ts +250 -79
package/src/utils/openai-http.ts +157 -0
package/src/utils/request-debug.ts +67 -19
package/src/utils/retry-after.ts +1 -1
package/src/utils/retry.ts +23 -2
package/src/utils/schema/CONSTRAINTS.md +4 -2
package/src/utils/schema/fields.ts +16 -0
package/src/utils/schema/json-schema-validator.ts +19 -1
package/src/utils/schema/normalize.ts +80 -8
package/src/utils/schema/stamps.ts +22 -10
package/src/utils/schema/wire.ts +2 -2
package/src/utils/sse-debug.ts +0 -271
package/src/utils/stream-markup-healing.ts +50 -8
package/src/utils/validation.ts +49 -13
package/src/utils.ts +2 -26
package/dist/types/model-cache.d.ts +0 -17
package/dist/types/model-manager.d.ts +0 -64
package/dist/types/model-thinking.d.ts +0 -100
package/dist/types/models.d.ts +0 -12
package/dist/types/provider-models/bundled-references.d.ts +0 -4
package/dist/types/provider-models/descriptors.d.ts +0 -50
package/dist/types/provider-models/google.d.ts +0 -24
package/dist/types/provider-models/index.d.ts +0 -5
package/dist/types/provider-models/ollama.d.ts +0 -7
package/dist/types/provider-models/openai-compat.d.ts +0 -323
package/dist/types/provider-models/special.d.ts +0 -16
package/dist/types/utils/discovery/antigravity.d.ts +0 -61
package/dist/types/utils/discovery/codex.d.ts +0 -38
package/dist/types/utils/discovery/cursor.d.ts +0 -23
package/dist/types/utils/discovery/gemini.d.ts +0 -25
package/dist/types/utils/discovery/index.d.ts +0 -4
package/dist/types/utils/discovery/openai-compatible.d.ts +0 -72
package/dist/types/utils/oauth/alibaba-coding-plan.d.ts +0 -18
package/dist/types/utils/oauth/cerebras.d.ts +0 -1
package/dist/types/utils/oauth/cloudflare-ai-gateway.d.ts +0 -18
package/dist/types/utils/oauth/deepseek.d.ts +0 -10
package/dist/types/utils/oauth/firepass.d.ts +0 -1
package/dist/types/utils/oauth/fireworks.d.ts +0 -1
package/dist/types/utils/oauth/huggingface.d.ts +0 -19
package/dist/types/utils/oauth/kagi.d.ts +0 -17
package/dist/types/utils/oauth/kilo.d.ts +0 -5
package/dist/types/utils/oauth/litellm.d.ts +0 -18
package/dist/types/utils/oauth/lm-studio.d.ts +0 -17
package/dist/types/utils/oauth/moonshot.d.ts +0 -1
package/dist/types/utils/oauth/nanogpt.d.ts +0 -1
package/dist/types/utils/oauth/nvidia.d.ts +0 -18
package/dist/types/utils/oauth/ollama-cloud.d.ts +0 -2
package/dist/types/utils/oauth/ollama.d.ts +0 -18
package/dist/types/utils/oauth/openrouter.d.ts +0 -1
package/dist/types/utils/oauth/parallel.d.ts +0 -17
package/dist/types/utils/oauth/qianfan.d.ts +0 -17
package/dist/types/utils/oauth/qwen-portal.d.ts +0 -19
package/dist/types/utils/oauth/synthetic.d.ts +0 -1
package/dist/types/utils/oauth/tavily.d.ts +0 -17
package/dist/types/utils/oauth/together.d.ts +0 -1
package/dist/types/utils/oauth/venice.d.ts +0 -18
package/dist/types/utils/oauth/vercel-ai-gateway.d.ts +0 -18
package/dist/types/utils/oauth/vllm.d.ts +0 -16
package/dist/types/utils/oauth/zai.d.ts +0 -18
package/dist/types/utils/oauth/zenmux.d.ts +0 -1
package/dist/types/utils/oauth/zhipu.d.ts +0 -18
package/src/model-cache.ts +0 -129
package/src/model-manager.ts +0 -469
package/src/model-thinking.ts +0 -756
package/src/models.json +0 -60287
package/src/models.json.d.ts +0 -9
package/src/models.ts +0 -56
package/src/provider-models/bundled-references.ts +0 -38
package/src/provider-models/descriptors.ts +0 -364
package/src/provider-models/google.ts +0 -88
package/src/provider-models/index.ts +0 -5
package/src/provider-models/ollama.ts +0 -153
package/src/provider-models/openai-compat.ts +0 -2904
package/src/provider-models/special.ts +0 -67
package/src/utils/discovery/antigravity.ts +0 -261
package/src/utils/discovery/codex.ts +0 -371
package/src/utils/discovery/cursor.ts +0 -306
package/src/utils/discovery/gemini.ts +0 -248
package/src/utils/discovery/index.ts +0 -4
package/src/utils/discovery/openai-compatible.ts +0 -224
package/src/utils/oauth/gitlab-duo.ts +0 -123
package/src/utils/oauth/index.ts +0 -502
/package/dist/types/{utils/oauth/__tests__/xai-oauth.test.d.ts → providers/__tests__/google-auth.test.d.ts} +0 -0
/package/dist/types/{utils → registry}/oauth/callback-server.d.ts +0 -0
/package/dist/types/{utils → registry}/oauth/cursor.d.ts +0 -0
/package/dist/types/{utils → registry}/oauth/gitlab-duo.d.ts +0 -0
/package/dist/types/{utils → registry}/oauth/google-antigravity.d.ts +0 -0
/package/dist/types/{utils → registry}/oauth/google-gemini-cli.d.ts +0 -0
/package/dist/types/{utils → registry}/oauth/google-oauth-shared.d.ts +0 -0
/package/dist/types/{utils → registry}/oauth/kimi.d.ts +0 -0
/package/dist/types/{utils → registry}/oauth/openai-codex.d.ts +0 -0
/package/dist/types/{utils → registry}/oauth/opencode.d.ts +0 -0
/package/dist/types/{utils → registry}/oauth/perplexity.d.ts +0 -0
/package/dist/types/{utils → registry}/oauth/pkce.d.ts +0 -0
/package/dist/types/{utils → registry}/oauth/wafer.d.ts +0 -0
/package/dist/types/{utils → registry}/oauth/xiaomi.d.ts +0 -0
/package/src/{utils → registry}/oauth/callback-server.ts +0 -0
/package/src/{utils → registry}/oauth/cursor.ts +0 -0
/package/src/{utils → registry}/oauth/google-oauth-shared.ts +0 -0
/package/src/{utils → registry}/oauth/kimi.ts +0 -0
/package/src/{utils → registry}/oauth/oauth.html +0 -0
/package/src/{utils → registry}/oauth/openai-codex.ts +0 -0
/package/src/{utils → registry}/oauth/opencode.ts +0 -0
/package/src/{utils → registry}/oauth/perplexity.ts +0 -0
/package/src/{utils → registry}/oauth/pkce.ts +0 -0

package/src/providers/openai-completions.ts CHANGED

@@ -1,27 +1,24 @@
-import { $env, APP_DISPLAY_NAME, extractHttpStatusFromError } from "@prometheus-ai/utils";
-import OpenAI, { APIConnectionTimeoutError as OpenAIConnectionTimeoutError } from "openai";
-import type {
-	ChatCompletionAssistantMessageParam,
-	ChatCompletionChunk,
-	ChatCompletionContentPart,
-	ChatCompletionContentPartImage,
-	ChatCompletionContentPartText,
-	ChatCompletionMessageParam,
-	ChatCompletionToolMessageParam,
-} from "openai/resources/chat/completions";
+import type { Effort } from "@prometheus-ai/catalog/effort";
+import { toFirepassWireModelId, toFireworksWireModelId } from "@prometheus-ai/catalog/fireworks-model-id";
+import { isDeepseekModelIdOrName } from "@prometheus-ai/catalog/identity";
+import { getSupportedEfforts, resolveWireModelId } from "@prometheus-ai/catalog/model-thinking";
+import { calculateCost } from "@prometheus-ai/catalog/models";
+import type { ResolvedOpenAICompat } from "@prometheus-ai/catalog/types";
+import { parseGitHubCopilotApiKey } from "@prometheus-ai/catalog/wire/github-copilot";
+import { $env, extractHttpStatusFromError } from "@prometheus-ai/utils";
 import packageJson from "../../package.json" with { type: "json" };
-import { type Effort, getSupportedEfforts } from "../model-thinking";
-import { calculateCost } from "../models";
+import { getKimiCommonHeaders } from "../registry/oauth/kimi";
 import { getEnvApiKey } from "../stream";
 import {
 	type AssistantMessage,
 	type Context,
-	type FetchImpl,
 	type Message,
 	type MessageAttribution,
 	type Model,
+	OPENAI_MAX_OUTPUT_TOKENS,
 	type OpenAICompat,
 	type ProviderSessionState,
+	type RawSseEvent,
 	resolveServiceTier,
 	type ServiceTier,
 	type StopReason,
@@ -38,7 +35,6 @@ import {
 import { normalizeSystemPrompts } from "../utils";
 import { createAbortSourceTracker } from "../utils/abort";
 import { AssistantMessageEventStream } from "../utils/event-stream";
-import { toFirepassWireModelId, toFireworksWireModelId } from "../utils/fireworks-model-id";
 import {
 	type CapturedHttpErrorResponse,
 	finalizeErrorMessage,
@@ -49,27 +45,38 @@ import {
 	getOpenAIStreamFirstEventTimeoutMs,
 	getOpenAIStreamIdleTimeoutMs,
 	iterateWithIdleTimeout,
+	iterateWithTerminalGrace,
 } from "../utils/idle-iterator";
 import { parseStreamingJson, parseStreamingJsonThrottled } from "../utils/json-parse";
-import { parseGitHubCopilotApiKey } from "../utils/oauth/github-copilot";
-import { getKimiCommonHeaders } from "../utils/oauth/kimi";
+import { OpenAIHttpError, postOpenAIStream } from "../utils/openai-http";
 import { notifyProviderResponse } from "../utils/provider-response";
 import { callWithCopilotModelRetry } from "../utils/retry";
 import { adaptSchemaForStrict, NO_STRICT, toolWireSchema } from "../utils/schema";
-import { wrapFetchForSseDebug } from "../utils/sse-debug";
 import {
 	getStreamMarkupHealingPattern,
 	type HealedToolCall,
+	modelMayLeakThinkingTags,
 	StreamMarkupHealing,
 	type StreamMarkupHealingEvent,
 } from "../utils/stream-markup-healing";
 import { isForcedToolChoice, mapToOpenAICompletionsToolChoice } from "../utils/tool-choice";
+import { parseAzureDeploymentNameMap } from "./azure-openai-responses";
 import {
 	buildCopilotDynamicHeaders,
 	hasCopilotVisionInput,
 	resolveGitHubCopilotBaseUrl,
 } from "./github-copilot-headers";
-import { detectOpenAICompat, type ResolvedOpenAICompat, resolveOpenAICompat } from "./openai-completions-compat";
+import type {
+	ChatCompletionAssistantMessageParam,
+	ChatCompletionChunk,
+	ChatCompletionContentPart,
+	ChatCompletionContentPartImage,
+	ChatCompletionContentPartText,
+	ChatCompletionCreateParamsStreaming,
+	ChatCompletionMessageParam,
+	ChatCompletionTool,
+	ChatCompletionToolMessageParam,
+} from "./openai-chat-wire";
 import { createInitialResponsesAssistantMessage } from "./openai-responses-shared";
 import { transformMessages } from "./transform-messages";
 import {
@@ -107,10 +114,16 @@ function resolveOpenAICompletionsModelId(
 	model: Model<"openai-completions">,
 	options: OpenAICompletionsOptions | undefined,
 ): string {
-	if (model.provider === "firepass") return toFirepassWireModelId(model.id);
-	if (model.provider === "fireworks") return toFireworksWireModelId(model.id);
-	if (model.provider === "openrouter") return applyOpenRouterRoutingVariant(model.id, options?.openrouterVariant);
-	return model.id;
+	// Effort-tier variants route per request effort (off → bare id, efforts →
+	// the thinking backing id); catalog variants (Copilot long-context `-1m`
+	// entries) pin via `requestModelId`; everything else serializes `model.id`.
+	const effort =
+		options?.reasoning && !options.disableReasoning && model.reasoning ? (options.reasoning as Effort) : undefined;
+	const wireId = resolveWireModelId(model, effort);
+	if (model.provider === "firepass") return toFirepassWireModelId(wireId);
+	if (model.provider === "fireworks") return toFireworksWireModelId(wireId);
+	if (model.provider === "openrouter") return applyOpenRouterRoutingVariant(wireId, options?.openrouterVariant);
+	return wireId;
 }
 /**
@@ -255,7 +268,7 @@ export interface OpenAICompletionsOptions extends StreamOptions {
 	openrouterVariant?: string;
 }
-type OpenAICompletionsParams = OpenAI.Chat.Completions.ChatCompletionCreateParamsStreaming & {
+type OpenAICompletionsParams = ChatCompletionCreateParamsStreaming & {
 	top_k?: number;
 	min_p?: number;
 	repetition_penalty?: number;
@@ -271,8 +284,10 @@ type AppliedToolStrictMode = "mixed" | "all_strict" | "none";
 type ToolStrictModeOverride = Exclude<ResolvedOpenAICompat["toolStrictMode"], "mixed"> | undefined;
 type BuiltOpenAICompletionTools = {
-	tools: OpenAI.Chat.Completions.ChatCompletionTool[];
+	tools: ChatCompletionTool[];
 	toolStrictMode: AppliedToolStrictMode;
+	/** True when at least one wire tool was sent with `strict: true`. */
+	strictToolsApplied: boolean;
 };
 const OPENAI_COMPLETIONS_PROVIDER_SESSION_STATE_PREFIX = "openai-completions:";
@@ -385,25 +400,13 @@ function getTrailingPartialDeepseekToken(text: string): string {
 }
 const OPENAI_COMPLETIONS_FIRST_EVENT_TIMEOUT_MESSAGE =
 	"OpenAI completions stream timed out while waiting for the first event";
-const GLM_CODING_PLAN_STREAM_IDLE_TIMEOUT_MS = 600_000;
-const GLM_CODING_PLAN_MODEL_PATTERN = /^glm-5(?:[.-]|$)/i;
-/** Returns the widened OpenAI stream watchdog floor for slow GLM coding-plan reasoning models. */
-export function getOpenAICompletionsStreamIdleTimeoutFallbackMs(
-	model: Model<"openai-completions">,
-): number | undefined {
-	if (!GLM_CODING_PLAN_MODEL_PATTERN.test(model.id)) return undefined;
-	if (model.provider === "zhipu-coding-plan" || model.provider === "zai")
-		return GLM_CODING_PLAN_STREAM_IDLE_TIMEOUT_MS;
-	const baseUrl = model.baseUrl.toLowerCase();
-	if (baseUrl.includes("open.bigmodel.cn") || baseUrl.includes("api.z.ai")) {
-		return GLM_CODING_PLAN_STREAM_IDLE_TIMEOUT_MS;
-	}
-	return undefined;
-}
+// How long to keep draining the stream after a `finish_reason` chunk arrived.
+// Compliant hosts follow it (almost) immediately with an optional usage-only
+// chunk and the `[DONE]` sentinel, so the window only ever elapses on hosts
+// that hold the connection open after the response logically completed —
+// without it the turn parks on `iterator.next()` until the idle watchdog
+// converts the already-successful response into a timeout error.
+const OPENAI_COMPLETIONS_POST_FINISH_GRACE_MS = 2_500;
 export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
 	model: Model<"openai-completions">,
@@ -415,41 +418,55 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
 	(async () => {
 		const startTime = Date.now();
 		let firstTokenTime: number | undefined;
-		let getCapturedErrorResponse: (() => CapturedHttpErrorResponse | undefined) | undefined;
 		const output: AssistantMessage = createInitialResponsesAssistantMessage(model.api, model.provider, model.id);
 		let rawRequestDump: RawHttpRequestDump | undefined;
 		const abortTracker = createAbortSourceTracker(options?.signal);
 		const firstEventTimeoutAbortError = new Error(OPENAI_COMPLETIONS_FIRST_EVENT_TIMEOUT_MESSAGE);
 		const { requestAbortController, requestSignal } = abortTracker;
+		const onSseEvent = options?.onSseEvent;
+		const rawSseObserver = onSseEvent
+			? (event: RawSseEvent) => {
+					if (!event.event && event.data && event.data !== "[DONE]") {
+						try {
+							const parsed = JSON.parse(event.data);
+							const resolvedEvent =
+								typeof parsed.type === "string"
+									? parsed.type
+									: typeof parsed.object === "string"
+										? parsed.object
+										: null;
+							if (resolvedEvent) {
+								event.event = resolvedEvent;
+								event.raw = [`event: ${resolvedEvent}`, ...event.raw];
+							}
+						} catch {}
+					}
+					onSseEvent(event, model);
+				}
+			: undefined;
+		// Assigned once the block helpers exist (they are scoped to the `try`);
+		// the catch handler uses it to close any open blocks before emitting the
+		// terminal error so both exit paths obey the same block lifecycle.
+		let finishOpenBlocksOnError: () => void = () => {};
 		try {
 			const apiKey = options?.apiKey || getEnvApiKey(model.provider) || "";
-			const idleTimeoutFallbackMs = getOpenAICompletionsStreamIdleTimeoutFallbackMs(model);
+			const idleTimeoutFallbackMs = model.compat.streamIdleTimeoutMs;
 			const idleTimeoutMs = options?.streamIdleTimeoutMs ?? getOpenAIStreamIdleTimeoutMs(idleTimeoutFallbackMs);
 			const firstEventTimeoutMs =
 				options?.streamFirstEventTimeoutMs ?? getOpenAIStreamFirstEventTimeoutMs(idleTimeoutMs);
 			const requestTimeoutMs =
 				firstEventTimeoutMs !== undefined && firstEventTimeoutMs > 0 ? firstEventTimeoutMs : undefined;
-			const {
-				client,
-				copilotPremiumRequests,
-				baseUrl,
-				requestHeaders,
-				getCapturedErrorResponse: captureErrorResponse,
-				clearCapturedErrorResponse,
-			} = await createClient(
+			const { copilotPremiumRequests, baseUrl, headers, query, requestHeaders } = await createRequestSetup(
 				model,
 				context,
 				apiKey,
 				options?.headers,
 				options?.initiatorOverride,
-				options?.onSseEvent,
-				options?.fetch,
 			);
 			const premiumRequestsTotal = copilotPremiumRequests;
-			getCapturedErrorResponse = captureErrorResponse;
-			let appliedToolStrictMode: AppliedToolStrictMode = "mixed";
+			let appliedStrictTools = false;
 			const providerSessionState = getOpenAICompletionsProviderSessionState(
 				model,
 				baseUrl,
@@ -457,31 +474,29 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
 			);
 			let disableStrictTools = providerSessionState?.strictToolsDisabled ?? false;
 			let strictFallbackErrorMessage: string | undefined;
+			const trimmedBaseUrl = baseUrl.replace(/\/+$/, "");
+			const completionsUrl = query
+				? `${trimmedBaseUrl}/chat/completions?${new URLSearchParams(query)}`
+				: `${trimmedBaseUrl}/chat/completions`;
 			const createCompletionsStream = async (toolStrictModeOverride?: ToolStrictModeOverride) => {
-				clearCapturedErrorResponse();
 				const effectiveToolStrictModeOverride = disableStrictTools ? "none" : toolStrictModeOverride;
-				const { params, toolStrictMode } = buildParams(
+				const { params, strictToolsApplied } = buildParams(
 					model,
 					context,
 					options,
-					baseUrl,
 					effectiveToolStrictModeOverride,
 				);
-				appliedToolStrictMode = toolStrictMode;
+				appliedStrictTools = strictToolsApplied;
 				options?.onPayload?.(params);
 				rawRequestDump = {
 					provider: model.provider,
 					api: output.api,
 					model: model.id,
 					method: "POST",
-					url: `${baseUrl}/chat/completions`,
+					url: completionsUrl,
 					headers: requestHeaders,
 					body: params,
 				};
-				const requestOptions =
-					requestTimeoutMs === undefined
-						? { signal: requestSignal }
-						: { signal: requestSignal, timeout: requestTimeoutMs };
 				let requestTimeout: NodeJS.Timeout | undefined;
 				if (requestTimeoutMs !== undefined) {
 					requestTimeout = setTimeout(
@@ -490,17 +505,26 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
 					);
 				}
 				try {
-					const { data, response, request_id } = await client.chat.completions
-						.create(params, requestOptions)
-						.withResponse();
-					await notifyProviderResponse(options, response, model, request_id);
-					return data;
-				} catch (error) {
-					if (error instanceof OpenAIConnectionTimeoutError && !abortTracker.wasCallerAbort()) {
-						throw firstEventTimeoutAbortError;
+					const headersWithTimeout = { ...headers };
+					if (requestTimeoutMs !== undefined) {
+						headersWithTimeout["X-Stainless-Timeout"] = Math.floor(requestTimeoutMs / 1000).toString();
 					}
-					throw error;
+					const { events, response, requestId } = await postOpenAIStream<ChatCompletionChunk>({
+						url: completionsUrl,
+						headers: headersWithTimeout,
+						body: params,
+						signal: requestSignal,
+						fetch: options?.fetch,
+						// With a first-event watchdog armed, transport retries must
+						// not silently extend the deadline (old SDK `maxRetries: 0`).
+						maxAttempts: requestTimeoutMs === undefined ? undefined : 1,
+						onSseEvent: rawSseObserver,
+					});
+					await notifyProviderResponse(options, response, model, requestId);
+					return events;
 				} finally {
+					// Headers arrived (or the request failed); from here the
+					// first-event deadline is enforced by `iterateWithIdleTimeout`.
 					if (requestTimeout !== undefined) clearTimeout(requestTimeout);
 				}
 			};
@@ -511,7 +535,7 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
 					signal: requestSignal,
 				});
 			} catch (error) {
-				const capturedErrorResponse = getCapturedErrorResponse();
+				const capturedErrorResponse = error instanceof OpenAIHttpError ? error.captured : undefined;
 				if (
 					isOpenRouterAnthropicModel(model) &&
 					!disableStrictTools &&
@@ -525,9 +549,15 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
 					disableStrictTools = true;
 					openaiStream = await createCompletionsStream("none");
 				} else {
-					if (!shouldRetryWithoutStrictTools(error, capturedErrorResponse, appliedToolStrictMode, context.tools)) {
+					if (!shouldRetryWithoutStrictTools(error, capturedErrorResponse, appliedStrictTools, context.tools)) {
 						throw error;
 					}
+					// Remember the rejection for the rest of the session so every
+					// subsequent request doesn't pay a strict-400 + retry round-trip.
+					if (providerSessionState) {
+						providerSessionState.strictToolsDisabled = true;
+					}
+					disableStrictTools = true;
 					openaiStream = await createCompletionsStream("none");
 				}
 			}
@@ -536,13 +566,12 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
 			}
 			stream.push({ type: "start", partial: output });
-			const parseMiniMaxThinkTags = model.provider === "minimax-code" || model.provider === "minimax-code-cn";
 			// Some OpenAI-compatible DeepSeek hosts (including NVIDIA NIM and DeepSeek's
 			// native API) leak chat-template tool-call markers in `delta.content` even
 			// though tool calls are also surfaced structurally. Strip the leaked markers
 			// so users don't see raw `<｜...｜>` tokens.
 			const stripDeepseekChatTemplateTokens =
-				/deepseek/i.test(model.id) && (model.provider === "nvidia" || model.provider === "deepseek");
+				isDeepseekModelIdOrName(model.id) && (model.provider === "nvidia" || model.provider === "deepseek");
 			type ToolCallStreamBlock = ToolCall & {
 				partialArgs?: string | Record<string, unknown>;
 				streamIndex?: number;
@@ -560,6 +589,20 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
 				if (block.partialArgs === undefined) return;
 				const contentIndex = blockIndex(block);
 				if (contentIndex < 0) return;
+				// Object-shaped `partialArgs` came from MiniMax-compatible hosts that stream
+				// `function.arguments` as an object. The per-chunk handler holds them with an
+				// empty wire delta (see the object branch below) because emitting each chunk's
+				// `JSON.stringify(rawArgs)` would feed concat-based downstream consumers
+				// (proxy.ts, openai-chat-server, openai-responses-server, anthropic-messages-server)
+				// an invalid concatenation like `{"input":"a"}{"input":"b"}`. Flush the final
+				// merged object as one concat-safe delta now so those consumers reconstruct the
+				// args correctly before observing `toolcall_end`.
+				if (typeof block.partialArgs === "object" && !Array.isArray(block.partialArgs)) {
+					const fullJson = JSON.stringify(block.partialArgs);
+					if (fullJson.length > 0 && fullJson !== "{}") {
+						stream.push({ type: "toolcall_delta", contentIndex, delta: fullJson, partial: output });
+					}
+				}
 				block.arguments =
 					typeof block.partialArgs === "string" ? parseStreamingJson(block.partialArgs) : block.partialArgs;
 				delete block.partialArgs;
@@ -591,13 +634,21 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
 				}
 				finishToolCallBlock(block);
 			};
+			finishOpenBlocksOnError = () => {
+				if (currentBlock?.type !== "toolCall") finishCurrentBlock(currentBlock);
+				finishPendingToolCallBlocks();
+			};
 			const appendText = (
 				message: AssistantMessage,
 				eventStream: AssistantMessageEventStream,
 				text: string,
 			): void => {
 				if (currentBlock?.type !== "text") {
-					finishCurrentBlock(currentBlock);
+					// Leave toolCall blocks pending across text transitions: chunks after
+					// the first typically carry only `index`, so a finished (de-registered)
+					// call would be reborn as a nameless phantom block when its arguments
+					// resume. The stream-end sweep finalizes pending calls.
+					if (currentBlock?.type !== "toolCall") finishCurrentBlock(currentBlock);
 					currentBlock = { type: "text", text: "" };
 					message.content.push(currentBlock);
 					eventStream.push({ type: "text_start", contentIndex: blockIndex(currentBlock), partial: message });
@@ -620,7 +671,9 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
 					currentBlock?.type !== "thinking" ||
 					(signature !== undefined && currentBlock.thinkingSignature !== signature)
 				) {
-					finishCurrentBlock(currentBlock);
+					// Same as appendText: leave toolCall blocks pending so index-only
+					// continuation deltas can still find them.
+					if (currentBlock?.type !== "toolCall") finishCurrentBlock(currentBlock);
 					currentBlock = { type: "thinking", thinking: "", thinkingSignature: signature };
 					message.content.push(currentBlock);
 					eventStream.push({
@@ -646,10 +699,32 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
 				if (!firstTokenTime) firstTokenTime = Date.now();
 				appendText(output, stream, text);
 			};
-			const appendThinkingDelta = (thinking: string, signature?: string): void => {
+			// Tracks the last full cumulative reasoning snapshot per signature (the
+			// reasoning field name) so dedup survives block transitions. Required
+			// for MiniMax-M3: once `</think>` and visible text arrive, currentBlock
+			// flips to "text", but later chunks keep carrying the same cumulative
+			// `reasoning_content` snapshot. Without an external tracker the guard
+			// below misses and the snapshot gets re-emitted as a fresh thinking
+			// block after the answer has started.
+			const lastCumulativeReasoningBySignature = new Map<string, string>();
+			const appendThinkingDelta = (
+				thinking: string,
+				signature?: string,
+				source: "delta" | "cumulative" = "delta",
+			): void => {
 				if (!thinking) return;
+				let emittedThinking = thinking;
+				if (source === "cumulative") {
+					const key = signature ?? "";
+					const lastSnapshot = lastCumulativeReasoningBySignature.get(key) ?? "";
+					if (thinking.startsWith(lastSnapshot)) {
+						emittedThinking = thinking.slice(lastSnapshot.length);
+					}
+					lastCumulativeReasoningBySignature.set(key, thinking);
+					if (!emittedThinking) return;
+				}
 				if (!firstTokenTime) firstTokenTime = Date.now();
-				appendThinking(output, stream, thinking, signature);
+				appendThinking(output, stream, emittedThinking, signature);
 			};
 			let deepseekStripBuffer = "";
@@ -676,13 +751,11 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
 					appendTextDelta(processedText);
 				}
 			};
-			const streamMarkupHealingPattern = getStreamMarkupHealingPattern(model.provider, model.id, {
-				parseThinkingTags: parseMiniMaxThinkTags,
-			});
+			const streamMarkupHealingPattern = getStreamMarkupHealingPattern(model.provider, model.id);
 			const streamMarkupHealing = streamMarkupHealingPattern
 				? new StreamMarkupHealing({ pattern: streamMarkupHealingPattern })
 				: undefined;
+			const explicitReasoningDeltasMayBeCumulative = modelMayLeakThinkingTags(model.provider, model.id);
 			let healedToolCallEmitted = false;
 			const emitHealedToolCall = (call: HealedToolCall): void => {
 				finishCurrentBlock(currentBlock);
@@ -722,7 +795,12 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
 				for (const call of calls) emitHealedToolCall(call);
 			};
-			for await (const chunk of iterateWithIdleTimeout(openaiStream, {
+			// Terminal-chunk bookkeeping for the post-finish grace window below.
+			// `streamFinishedAt` flips when a chunk carries `finish_reason`;
+			// `sawUsagePayload` flips when any usage payload was parsed.
+			let streamFinishedAt: number | undefined;
+			let sawUsagePayload = false;
+			const timedOpenaiStream = iterateWithIdleTimeout(openaiStream, {
 				idleTimeoutMs,
 				firstItemTimeoutMs: firstEventTimeoutMs,
 				firstItemErrorMessage: OPENAI_COMPLETIONS_FIRST_EVENT_TIMEOUT_MESSAGE,
@@ -731,24 +809,48 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
 				onFirstItemTimeout: () => abortTracker.abortLocally(firstEventTimeoutAbortError),
 				abortSignal: options?.signal,
 				isProgressItem: isOpenAICompletionsProgressChunk,
-			})) {
+			});
+			const terminalAwareStream = iterateWithTerminalGrace(timedOpenaiStream, {
+				finishedAtMs: () => streamFinishedAt,
+				graceMs: OPENAI_COMPLETIONS_POST_FINISH_GRACE_MS,
+				// The inner idle-timeout generator is parked mid-`next()` when the
+				// grace window closes, so abort the transport to settle that read
+				// and release the socket immediately (a queued `.return()` alone
+				// would wait on the never-arriving next chunk).
+				onGraceEnd: () => requestAbortController.abort(),
+			});
+			for await (const chunk of terminalAwareStream) {
 				if (!chunk || typeof chunk !== "object") continue;
 				// OpenAI documents ChatCompletionChunk.id as the unique chat completion identifier,
 				// and each chunk in a streamed completion carries the same id.
 				output.responseId ||= chunk.id;
+				// Aggregators (OpenRouter, Vercel AI Gateway, …) report the upstream
+				// provider that actually served the request via a top-level `provider`
+				// field present on every chunk. Capture the first non-empty value so
+				// callers can attribute routing without re-parsing the raw stream.
+				output.upstreamProvider ||= getOptionalStringProperty(chunk, "provider");
 				if (chunk.usage) {
 					output.usage = parseChunkUsage(chunk.usage, model, premiumRequestsTotal);
+					sawUsagePayload = true;
 				}
 				const choice = Array.isArray(chunk.choices) ? chunk.choices[0] : undefined;
-				if (!choice) continue;
+				if (!choice) {
+					// Trailing usage-only chunk (`stream_options.include_usage`) after
+					// `finish_reason`: the response is complete — stop pulling instead
+					// of waiting for `[DONE]`/close from hosts that never send either.
+					if (streamFinishedAt !== undefined && sawUsagePayload) break;
+					continue;
+				}
 				if (!chunk.usage) {
 					const choiceUsage = getChoiceUsage(choice);
 					if (choiceUsage) {
 						output.usage = parseChunkUsage(choiceUsage, model, premiumRequestsTotal);
+						sawUsagePayload = true;
 					}
 				}
@@ -758,14 +860,42 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
 					if (finishReasonResult.errorMessage) {
 						output.errorMessage = finishReasonResult.errorMessage;
 					}
+					streamFinishedAt ??= Date.now();
 				}
 				if (choice.delta) {
+					// Some endpoints return reasoning in reasoning_content (llama.cpp),
+					// or reasoning (other openai compatible endpoints). Use the first
+					// non-empty reasoning field to avoid duplication when a chunk carries
+					// multiple aliases for the same reasoning text.
+					const reasoningFields = ["reasoning_content", "reasoning", "reasoning_text"];
+					const deltaRecord = choice.delta as Record<string, unknown>;
+					let foundReasoningField: string | undefined;
+					let foundReasoningDelta = "";
+					for (const field of reasoningFields) {
+						const reasoningDelta = deltaRecord[field];
+						if (typeof reasoningDelta === "string" && reasoningDelta.length > 0) {
+							foundReasoningField = field;
+							foundReasoningDelta = reasoningDelta;
+							break;
+						}
+					}
+					if (foundReasoningField) {
+						appendThinkingDelta(
+							foundReasoningDelta,
+							foundReasoningField,
+							explicitReasoningDeltasMayBeCumulative ? "cumulative" : "delta",
+						);
+					}
 					const normalizedDeltaText = normalizeStreamingContentText(choice.delta.content);
 					if (normalizedDeltaText.length > 0) {
 						if (!firstTokenTime) firstTokenTime = Date.now();
 						const hasStructuredToolCalls =
 							Array.isArray(choice.delta.tool_calls) && choice.delta.tool_calls.length > 0;
+						const suppressContentThinking =
+							foundReasoningField !== undefined && streamMarkupHealing?.pattern === "thinking";
 						if (streamMarkupHealing) {
 							if (hasStructuredToolCalls) {
@@ -776,6 +906,7 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
 								appendProcessedText(streamMarkupHealing.consumeWithoutCalls(normalizedDeltaText));
 							} else {
 								for (const event of streamMarkupHealing.feedEvents(normalizedDeltaText)) {
+									if (suppressContentThinking && event.type === "thinking") continue;
 									emitHealingEvent(event);
 								}
 							}
@@ -784,30 +915,6 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
 						}
 					}
-					// Some endpoints return reasoning in reasoning_content (llama.cpp),
-					// or reasoning (other openai compatible endpoints)
-					// Use the first non-empty reasoning field to avoid duplication
-					// (e.g., chutes.ai returns both reasoning_content and reasoning with same content)
-					const reasoningFields = ["reasoning_content", "reasoning", "reasoning_text"];
-					let foundReasoningField: string | null = null;
-					for (const field of reasoningFields) {
-						if (
-							(choice.delta as any)[field] !== null &&
-							(choice.delta as any)[field] !== undefined &&
-							(choice.delta as any)[field].length > 0
-						) {
-							if (!foundReasoningField) {
-								foundReasoningField = field;
-								break;
-							}
-						}
-					}
-					if (foundReasoningField) {
-						const delta = (choice.delta as any)[foundReasoningField];
-						appendThinkingDelta(delta, foundReasoningField);
-					}
 					if (choice?.delta?.tool_calls && choice.delta.tool_calls.length > 0) {
 						for (const toolCall of choice.delta.tool_calls) {
 							const streamIndex = typeof toolCall.index === "number" ? toolCall.index : undefined;
@@ -845,6 +952,11 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
 									partial: output,
 								});
 							} else {
+								// Resuming a pending call after interleaved text/thinking:
+								// close the text/thinking block we drifted into.
+								if (currentBlock !== block && currentBlock && currentBlock.type !== "toolCall") {
+									finishCurrentBlock(currentBlock);
+								}
 								currentBlock = block;
 								if (streamIndex !== undefined && block.streamIndex === undefined) {
 									block.streamIndex = streamIndex;
@@ -871,13 +983,37 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
 									}
 								}
 							} else if (rawArgs && typeof rawArgs === "object" && !Array.isArray(rawArgs)) {
-								// MiniMax-compatible hosts stream `function.arguments` as a complete object in a
-								// single delta instead of the OpenAI JSON-string contract. Hold the object directly
-								// — no `[object Object]` round-trip through the string buffer — and serialize once for
-								// the wire delta that proxy servers forward verbatim as `input_json_delta`.
-								block.partialArgs = rawArgs;
-								block.arguments = rawArgs;
-								delta = JSON.stringify(rawArgs);
+								// MiniMax-compatible hosts stream `function.arguments` as an object instead of the
+								// OpenAI JSON-string contract. Most chunks carry the complete object in one delta,
+								// but cannot rely on that: replacing per-chunk drops earlier keys (and earlier
+								// string content for the same key) when the host fragments the args across deltas.
+								// Shallow-merge into the accumulated object; for shared string keys, detect
+								// cumulative-vs-delta semantics with `startsWith` so we neither duplicate cumulative
+								// payloads nor lose delta fragments. Degenerates to the previous "last wins"
+								// behaviour for the common single-chunk shape (no prior value to merge with).
+								//
+								// `delta` stays empty here: emitting `JSON.stringify(rawArgs)` per chunk feeds
+								// downstream concat-based accumulators (proxy.ts, openai-chat-server,
+								// openai-responses-server, anthropic-messages-server) an invalid sequence like
+								// `{"input":"a"}{"input":"b"}`. The merged object is flushed as a single
+								// concat-safe delta in `finishToolCallBlock` before `toolcall_end` instead.
+								const prev =
+									block.partialArgs &&
+									typeof block.partialArgs === "object" &&
+									!Array.isArray(block.partialArgs)
+										? (block.partialArgs as Record<string, unknown>)
+										: undefined;
+								const merged: Record<string, unknown> = prev ? { ...prev } : {};
+								for (const [key, value] of Object.entries(rawArgs)) {
+									const prevValue = merged[key];
+									if (typeof prevValue === "string" && typeof value === "string") {
+										merged[key] = value.startsWith(prevValue) ? value : prevValue + value;
+									} else {
+										merged[key] = value;
+									}
+								}
+								block.partialArgs = merged;
+								block.arguments = merged;
 							}
 							stream.push({
 								type: "toolcall_delta",
@@ -902,6 +1038,12 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
 						}
 					}
 				}
+				// `finish_reason` + usage both observed: the chat-completions
+				// contract has nothing left to deliver. Break instead of waiting
+				// for `[DONE]`/connection close so hosts that hold the socket open
+				// can't park the turn until the idle watchdog errors it out.
+				if (streamFinishedAt !== undefined && sawUsagePayload) break;
 			}
 			if (streamMarkupHealing) {
@@ -962,13 +1104,20 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
 			stream.push({ type: "done", reason: output.stopReason, message: output });
 			stream.end();
 		} catch (error) {
+			// Close open blocks first so consumers tracking text_/thinking_/toolcall_
+			// lifecycles never see orphaned starts on the error path. Best-effort: a
+			// throw here must not prevent the terminal error event below.
+			try {
+				finishOpenBlocksOnError();
+			} catch {}
 			for (const block of output.content) delete (block as any).index;
 			const firstEventTimeoutError = abortTracker.getLocalAbortReason();
 			output.stopReason = abortTracker.wasCallerAbort() ? "aborted" : "error";
-			output.errorStatus = extractHttpStatusFromError(error) ?? getCapturedErrorResponse?.()?.status;
+			const capturedErrorResponse = error instanceof OpenAIHttpError ? error.captured : undefined;
+			output.errorStatus = extractHttpStatusFromError(error) ?? capturedErrorResponse?.status;
 			output.errorMessage =
 				firstEventTimeoutError?.message ??
-				(await finalizeErrorMessage(error, rawRequestDump, getCapturedErrorResponse?.()));
+				(await finalizeErrorMessage(error, rawRequestDump, capturedErrorResponse));
 			// Some providers via OpenRouter include extra details here.
 			const rawMetadata = (error as { error?: { metadata?: { raw?: string } } })?.error?.metadata?.raw;
 			if (rawMetadata) output.errorMessage += `\n${rawMetadata}`;
@@ -983,21 +1132,21 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
 	return stream;
 };
-async function createClient(
+async function createRequestSetup(
 	model: Model<"openai-completions">,
 	context: Context,
 	apiKey?: string,
 	extraHeaders?: Record<string, string>,
 	initiatorOverride?: MessageAttribution,
-	onSseEvent?: OpenAICompletionsOptions["onSseEvent"],
-	fetchOverride?: FetchImpl,
 ): Promise<{
-	client: OpenAI;
 	copilotPremiumRequests: number | undefined;
-	baseUrl: string | undefined;
+	baseUrl: string;
+	/** Headers sent on the wire, including `Authorization`. */
+	headers: Record<string, string>;
+	/** Query params appended to the request URL (Azure `api-version`). */
+	query: Record<string, string> | undefined;
+	/** Headers recorded in `rawRequestDump` (sans `Authorization`). */
 	requestHeaders: Record<string, string>;
-	getCapturedErrorResponse: () => CapturedHttpErrorResponse | undefined;
-	clearCapturedErrorResponse: () => void;
 }> {
 	if (!apiKey) {
 		if (!$env.OPENAI_API_KEY) {
@@ -1015,12 +1164,12 @@ async function createClient(
 		// analytics. `HTTP-Referer` is the unique app identifier; without it nothing is
 		// tracked. `X-OpenRouter-Title` is the display name (`X-Title` is the legacy
 		// alias kept for back-compat). `X-OpenRouter-Categories` slots us into the
-		// `cli-agent` marketplace category. `User-Agent` overrides the default OpenAI
-		// SDK UA so traffic is identifiable in upstream provider logs.
+		// `cli-agent` marketplace category. `User-Agent` makes our traffic
+		// identifiable in upstream provider logs.
 		// https://openrouter.ai/docs/app-attribution
-		headers["User-Agent"] = `${APP_DISPLAY_NAME}/${packageJson.version}`;
-		headers["HTTP-Referer"] = "https://prometheus.trivlab.com/";
-		headers["X-OpenRouter-Title"] = APP_DISPLAY_NAME;
+		headers["User-Agent"] = `Prometheus/${packageJson.version}`;
+		headers["HTTP-Referer"] = "https://prometheus.sh/";
+		headers["X-OpenRouter-Title"] = "Prometheus";
 		headers["X-OpenRouter-Categories"] = "cli-agent";
 		// Always-on response caching: identical requests return cached responses for free.
 		// TTL 1h; first call hits the provider, every identical call within the window
@@ -1055,114 +1204,68 @@ async function createClient(
 	if (baseUrl?.includes(".openai.azure.com")) {
 		const apiVersion = $env.AZURE_OPENAI_API_VERSION || "2024-10-21";
 		if (!baseUrl.includes("/deployments/")) {
-			baseUrl = `${baseUrl}/deployments/${model.id}`;
+			// Honor AZURE_OPENAI_DEPLOYMENT_NAME_MAP like the responses provider:
+			// deployment names routinely differ from catalog model ids.
+			const deploymentName =
+				parseAzureDeploymentNameMap($env.AZURE_OPENAI_DEPLOYMENT_NAME_MAP).get(model.id) ?? model.id;
+			baseUrl = `${baseUrl}/deployments/${deploymentName}`;
 		}
 		azureDefaultQuery = { "api-version": apiVersion };
 	}
-	let capturedErrorResponse: CapturedHttpErrorResponse | undefined;
-	const baseFetch = fetchOverride ?? fetch;
-	const wrappedFetch = Object.assign(
-		async (input: string | URL | Request, init?: RequestInit): Promise<Response> => {
-			const response = await baseFetch(input, init);
-			if (response.ok) {
-				capturedErrorResponse = undefined;
-				return response;
-			}
-			let bodyText: string | undefined;
-			let bodyJson: unknown;
-			try {
-				bodyText = await response.clone().text();
-				if (bodyText.trim().length > 0) {
-					try {
-						bodyJson = JSON.parse(bodyText);
-					} catch {}
-				}
-			} catch {}
-			capturedErrorResponse = {
-				status: response.status,
-				headers: response.headers,
-				bodyText,
-				bodyJson,
-			};
-			return response;
-		},
-		baseFetch.preconnect ? { preconnect: baseFetch.preconnect } : {},
-	);
-	const debugFetch = onSseEvent ? wrapFetchForSseDebug(wrappedFetch, event => onSseEvent(event, model)) : wrappedFetch;
+	// The removed SDK client resolved its base URL as
+	// `baseURL ?? $OPENAI_BASE_URL ?? https://api.openai.com/v1`; keep that
+	// resolution explicit now that we build the request URL ourselves.
+	const resolvedBaseUrl = baseUrl ?? ($env.OPENAI_BASE_URL?.trim() || "https://api.openai.com/v1");
 	return {
-		client: new OpenAI({
-			apiKey,
-			baseURL: baseUrl,
-			dangerouslyAllowBrowser: true,
-			maxRetries: 5,
-			defaultHeaders: headers,
-			defaultQuery: azureDefaultQuery,
-			fetch: debugFetch,
-		}),
 		copilotPremiumRequests,
-		baseUrl,
+		baseUrl: resolvedBaseUrl,
+		headers: { Authorization: `Bearer ${apiKey}`, ...headers },
+		query: azureDefaultQuery,
 		requestHeaders: headers,
-		getCapturedErrorResponse: () => capturedErrorResponse,
-		clearCapturedErrorResponse: () => {
-			capturedErrorResponse = undefined;
-		},
 	};
 }
+function getForcedCompletionsToolName(toolChoice: OpenAICompletionsParams["tool_choice"]): string | undefined { // Returns the function name pinned by an object-shaped `tool_choice`, or `undefined` when no named function is forced.
+	if (typeof toolChoice !== "object" || toolChoice === null || !("function" in toolChoice)) return undefined; // Non-object selectors (e.g. "auto", "none"), `undefined`/`null`, and objects without a `function` member name no tool.
+	return toolChoice.function.name; // Reached only for object selectors that carry a `function` member.
+}
 function buildParams(
 	model: Model<"openai-completions">,
 	context: Context,
 	options: OpenAICompletionsOptions | undefined,
-	resolvedBaseUrl?: string,
 	toolStrictModeOverride?: ToolStrictModeOverride,
-): { params: OpenAICompletionsParams; toolStrictMode: AppliedToolStrictMode } {
-	const compat = getCompat(model, resolvedBaseUrl);
-	// Opencode Zen's gateway (https://opencode.ai/zen/go/v1) gates
-	// `reasoning_content` on the request's thinking state for every model it
-	// fronts (Kimi K2.x, DeepSeek V4, GLM-5.x, Qwen3.x, MiMo, MiniMax, …): it
-	// 400s with `Extra inputs are not permitted` when thinking is off but the
-	// field is supplied (#1071), and 400s with `thinking is enabled but
-	// reasoning_content is missing in assistant tool call message at index N`
-	// (#1484) when thinking is on and the field is absent. `detectOpenAICompat`
-	// only set `requiresReasoningContentForToolCalls` for the DeepSeek family
-	// (and previously for Kimi until #1071 carved out opencode); reactivate it
-	// per request for every opencode model whenever this turn is in thinking
-	// mode so prior tool-call turns replay reasoning_content. Forced-tool
-	// turns are excluded because the later `disableReasoningOnForcedToolChoice`
-	// guard at the bottom of `buildParams` strips thinking from the wire body
-	// for Kimi-style models — keeping the replay on under those conditions
-	// would resurrect the #1071 failure.
-	//
-	// `allowsSyntheticReasoningContentForToolCalls` is forced to `false` on
-	// the same path: the gateway specifically requires `reasoning_content`,
-	// and the default synthetic-friendly behavior would echo whichever field
-	// the upstream streamed (e.g. `reasoning` for many opencode turns),
-	// landing the replay in the wrong key and re-triggering the 400.
-	const isOpenCodeProvider = model.provider === "opencode-go" || model.provider === "opencode-zen";
+): { params: OpenAICompletionsParams; toolStrictMode: AppliedToolStrictMode; strictToolsApplied: boolean } {
+	let compat = model.compat;
 	const thinkingEnabledForRequest =
 		Boolean(options?.reasoning) && !options?.disableReasoning && Boolean(model.reasoning);
 	const forcedToolChoiceSuppressesThinking =
 		compat.disableReasoningOnForcedToolChoice &&
+		compat.supportsForcedToolChoice &&
 		isForcedToolChoice(mapToOpenAICompletionsToolChoice(options?.toolChoice));
-	if (isOpenCodeProvider && thinkingEnabledForRequest && !forcedToolChoiceSuppressesThinking) {
-		compat.requiresReasoningContentForToolCalls = true;
-		compat.allowsSyntheticReasoningContentForToolCalls = false;
-		compat.reasoningContentField = "reasoning_content";
+	if (compat.whenThinking && thinkingEnabledForRequest && !forcedToolChoiceSuppressesThinking) {
+		compat = compat.whenThinking; // precomputed at model build — pointer swap, no allocation
 	}
-	const isKimiModelId = model.id.includes("moonshotai/kimi") || /(^|\/)kimi[-.]/i.test(model.id);
 	const messages = convertMessages(model, context, compat);
 	maybeAddAnthropicCacheControl(compat, messages);
-	const supportsReasoningParams = model.provider !== "github-copilot";
-	// Kimi (including via OpenRouter and Fireworks router-form IDs such as
-	// `accounts/fireworks/routers/kimi-*`) calculates TPM rate limits based on
-	// max_tokens, not actual output. The official Kimi K2 model guidance
-	// (https://docs.fireworks.ai/models/kimi-k2) also requires `max_tokens` for
-	// every call since the family can otherwise emit very long reasoning traces
-	// before the final answer. Always send max_tokens — match the same
-	// Kimi-family regex used by the compat detector.
-	// Note: Direct kimi-code provider is handled by the dedicated Kimi provider in kimi.ts.
-	const effectiveMaxTokens = options?.maxTokens ?? (isKimiModelId ? model.maxTokens : undefined);
+	const supportsReasoningParams = compat.supportsReasoningParams;
+	// Kimi-family models calculate TPM rate limits from max_tokens (not actual
+	// output) and the official guidance requires sending it on every call —
+	// `compat.alwaysSendMaxTokens` carries that detection.
+	const requestedMaxTokens =
+		options?.maxTokens ?? (compat.alwaysSendMaxTokens ? (model.maxTokens ?? OPENAI_MAX_OUTPUT_TOKENS) : undefined);
+	// OpenRouter fans out to upstreams whose output caps differ from the catalog
+	// value (which tracks the highest-cap provider). A max_tokens above the routed
+	// upstream's cap makes OpenRouter silently skip that provider (e.g. Cerebras
+	// GLM-4.7, ~40k) for a higher-cap one, defeating `provider.order`/`only`. Omit
+	// it for OpenRouter so each upstream self-caps and routing is honored — unless
+	// the model always requires max_tokens (Kimi TPM accounting, see above).
+	const omitMaxTokensForRouting = compat.isOpenRouterHost && !compat.alwaysSendMaxTokens;
+	const effectiveMaxTokens =
+		requestedMaxTokens === undefined || omitMaxTokensForRouting
+			? undefined
+			: Math.min(requestedMaxTokens, model.maxTokens ?? Number.POSITIVE_INFINITY, OPENAI_MAX_OUTPUT_TOKENS);
 	const requestModelId = resolveOpenAICompletionsModelId(model, options);
 	const params: OpenAICompletionsParams = {
@@ -1171,6 +1274,7 @@ function buildParams(
 		stream: true,
 	};
 	let toolStrictMode: AppliedToolStrictMode = "none";
+	let strictToolsApplied = false;
 	if (compat.supportsUsageInStreaming !== false) {
 		params.stream_options = { include_usage: true };
@@ -1224,6 +1328,7 @@ function buildParams(
 		const builtTools = convertTools(context.tools, compat, toolStrictModeOverride);
 		params.tools = builtTools.tools;
 		toolStrictMode = builtTools.toolStrictMode;
+		strictToolsApplied = builtTools.strictToolsApplied;
 	} else if (context.tools === undefined && hasToolHistory(context.messages)) {
 		// Anthropic (via LiteLLM/proxy) requires the `tools` param when the conversation
 		// contains tool_calls/tool_results, even when no tools are offered this turn.
@@ -1238,6 +1343,12 @@ function buildParams(
 	if (options?.toolChoice && compat.supportsToolChoice) {
 		params.tool_choice = mapToOpenAICompletionsToolChoice(options.toolChoice);
 	}
+	if (isForcedToolChoice(params.tool_choice) && !compat.supportsForcedToolChoice) {
+		// Some thinking-required OpenAI-compatible models reject forced
+		// `tool_choice` while still accepting tools with the default auto
+		// selector. Keep the tool available and let the model choose it.
+		params.tool_choice = "auto";
+	}
 	if (params.tool_choice === "none" && (!Array.isArray(params.tools) || params.tools.length === 0)) {
 		// `tool_choice: "none"` with no tools to gate is redundant and also
@@ -1251,6 +1362,19 @@ function buildParams(
 		delete params.tool_choice;
 	}
+	const forcedToolName = getForcedCompletionsToolName(params.tool_choice);
+	if (
+		forcedToolName !== undefined &&
+		(!Array.isArray(params.tools) ||
+			!params.tools.some(tool => tool.type === "function" && tool.function.name === forcedToolName))
+	) {
+		// A forced named tool_choice is only valid when the same request offers
+		// that function in `tools`. Active-tool filtering normally enforces this
+		// before provider dispatch; this guard keeps raw provider callers from
+		// emitting a self-inconsistent OpenAI-compatible payload.
+		delete params.tool_choice;
+	}
 	if (supportsReasoningParams && compat.thinkingFormat === "zai" && model.reasoning) {
 		// Z.ai uses binary thinking: { type: "enabled" | "disabled" }
 		// Must explicitly disable since z.ai defaults to thinking enabled.
@@ -1278,7 +1402,10 @@ function buildParams(
 			openRouterParams.reasoning = { enabled: false };
 		} else if (options?.reasoning) {
 			openRouterParams.reasoning = {
-				effort: mapReasoningEffort(options.reasoning, compat.reasoningEffortMap),
+				effort:
+					compat.reasoningEffortMap?.[options.reasoning] ??
+					model.thinking?.effortMap?.[options.reasoning] ??
+					options.reasoning,
 			};
 		}
 	} else if (
@@ -1289,7 +1416,9 @@ function buildParams(
 		compat.supportsReasoningEffort
 	) {
 		// OpenAI-style reasoning_effort
-		params.reasoning_effort = mapReasoningEffort(options.reasoning, compat.reasoningEffortMap) as Effort;
+		params.reasoning_effort = (compat.reasoningEffortMap?.[options.reasoning] ??
+			model.thinking?.effortMap?.[options.reasoning] ??
+			options.reasoning) as Effort;
 	} else if (
 		supportsReasoningParams &&
 		options?.disableReasoning &&
@@ -1304,7 +1433,9 @@ function buildParams(
 		if (minEffort === undefined) {
 			throw new Error(`Model ${model.provider}/${model.id} has no supported reasoning efforts`);
 		}
-		params.reasoning_effort = mapReasoningEffort(minEffort, compat.reasoningEffortMap) as Effort;
+		params.reasoning_effort = (compat.reasoningEffortMap?.[minEffort] ??
+			model.thinking?.effortMap?.[minEffort] ??
+			minEffort) as Effort;
 	}
 	if (compat.disableReasoningOnToolChoice && params.tool_choice !== undefined) {
@@ -1327,13 +1458,13 @@ function buildParams(
 	}
 	// OpenRouter provider routing preferences
-	if (model.baseUrl.includes("openrouter.ai") && compat.openRouterRouting) {
+	if (compat.isOpenRouterHost && compat.openRouterRouting) {
 		params.provider = compat.openRouterRouting;
 	}
 	// Vercel AI Gateway provider routing preferences
-	if (model.baseUrl.includes("ai-gateway.vercel.sh") && model.compat?.vercelGatewayRouting) {
-		const routing = model.compat.vercelGatewayRouting;
+	if (compat.isVercelGatewayHost && compat.vercelGatewayRouting) {
+		const routing = compat.vercelGatewayRouting;
 		if (routing.only || routing.order) {
 			const gatewayOptions: Record<string, string[]> = {};
 			if (routing.only) gatewayOptions.only = routing.only;
@@ -1344,9 +1475,14 @@ function buildParams(
 	if (compat.extraBody) {
 		Object.assign(params, compat.extraBody);
+		if (model.provider === "fireworks" && params.reasoning_effort !== undefined) {
+			// Fireworks rejects simultaneous DeepSeek-style `thinking` toggles and
+			// OpenAI-style `reasoning_effort`; the effort field carries the user's level.
+			delete params.thinking;
+		}
 	}
-	return { params, toolStrictMode };
+	return { params, toolStrictMode, strictToolsApplied };
 }
 function getOptionalNumberProperty(value: object, key: string): number | undefined {
@@ -1354,6 +1490,11 @@ function getOptionalNumberProperty(value: object, key: string): number | undefin
 	return typeof property === "number" ? property : undefined;
 }
+function getOptionalStringProperty(value: object, key: string): string | undefined { // Returns the `key` property of `value` only when it is a non-empty string; otherwise undefined.
+	const property = Reflect.get(value, key); // dynamic lookup: `value` is typed as a bare `object`, so `key` is not a statically known member
+	return typeof property === "string" && property.length > 0 ? property : undefined; // an empty string is treated the same as a missing or non-string property
+}
 function getOptionalObjectProperty(value: object, key: string): object | undefined {
 	const property = Reflect.get(value, key);
 	return typeof property === "object" && property !== null ? property : undefined;
@@ -1430,13 +1571,6 @@ export function parseChunkUsage(
 	return usage;
 }
-function mapReasoningEffort(
-	effort: NonNullable<OpenAICompletionsOptions["reasoning"]>,
-	reasoningEffortMap: Partial<Record<NonNullable<OpenAICompletionsOptions["reasoning"]>, string>>,
-): string {
-	return reasoningEffortMap[effort] ?? effort;
-}
 function maybeAddAnthropicCacheControl(compat: ResolvedOpenAICompat, messages: ChatCompletionMessageParam[]): void {
 	if (compat.cacheControlFormat !== "anthropic") return;
 	// Anthropic-style caching requires cache_control on a text part. Add a breakpoint
@@ -1447,6 +1581,7 @@ function maybeAddAnthropicCacheControl(compat: ResolvedOpenAICompat, messages: C
 		const content = msg.content;
 		if (typeof content === "string") {
+			if (content.trim().length === 0) continue;
 			msg.content = [
 				Object.assign({ type: "text" as const, text: content }, { cache_control: { type: "ephemeral" } }),
 			];
@@ -1455,10 +1590,12 @@ function maybeAddAnthropicCacheControl(compat: ResolvedOpenAICompat, messages: C
 		if (!Array.isArray(content)) continue;
-		// Find last text part and add cache_control
+		// Find last non-empty text part and add cache_control. Empty assistant
+		// content is valid for tool-call replay, but Anthropic/OpenRouter reject
+		// empty text blocks once cache_control turns it into structured content.
 		for (let j = content.length - 1; j >= 0; j--) {
 			const part = content[j];
-			if (part?.type === "text") {
+			if (part?.type === "text" && part.text.trim().length > 0) {
 				Object.assign(part, { cache_control: { type: "ephemeral" } });
 				return;
 			}
@@ -1473,6 +1610,12 @@ export function convertMessages(
 ): ChatCompletionMessageParam[] {
 	const params: ChatCompletionMessageParam[] = [];
+	const maxNormalizedToolCallIdLength = compat.requiresMistralToolIds
+		? 9
+		: model.provider === "openai"
+			? 40
+			: undefined;
+	const duplicateToolCallIdSuffixPrefix = compat.requiresMistralToolIds ? "dup" : undefined;
 	const normalizeToolCallId = (id: string): string => {
 		if (compat.requiresMistralToolIds) return normalizeMistralToolId(id, true);
@@ -1489,7 +1632,13 @@ export function convertMessages(
 		if (model.provider === "openai") return id.length > 40 ? id.slice(0, 40) : id;
 		return id;
 	};
-	const transformedMessages = transformMessages(context.messages, model, id => normalizeToolCallId(id));
+	const transformedMessages = transformMessages(
+		context.messages,
+		model,
+		id => normalizeToolCallId(id),
+		maxNormalizedToolCallIdLength,
+		duplicateToolCallIdSuffixPrefix,
+	);
 	const remappedToolCallIds = new Map<string, string[]>();
 	let generatedToolCallIdCounter = 0;
@@ -1586,6 +1735,8 @@ export function convertMessages(
 							type: "image_url",
 							image_url: {
 								url: `data:${item.mimeType};base64,${item.data}`,
+								// Chat Completions has no "original"; omit it (provider default).
+								...(item.detail && item.detail !== "original" ? { detail: item.detail } : {}),
 							},
 						} satisfies ChatCompletionContentPartImage);
 					} else {
@@ -1628,12 +1779,12 @@ export function convertMessages(
 				if (compat.requiresThinkingAsText) {
 					// Convert thinking blocks to plain text (no tags to avoid model mimicking them)
 					const thinkingText = nonEmptyThinkingBlocks.map(b => b.thinking).join("\n\n");
-					const textContent = assistantMsg.content as Array<{ type: "text"; text: string }> | null;
-					if (textContent) {
-						textContent.unshift({ type: "text", text: thinkingText });
-					} else {
-						assistantMsg.content = [{ type: "text", text: thinkingText }];
-					}
+					// `content` is a plain string at this point (set above) or null —
+					// never an array. Prepend the thinking text to the string form.
+					assistantMsg.content =
+						typeof assistantMsg.content === "string" && assistantMsg.content.length > 0
+							? `${thinkingText}\n\n${assistantMsg.content}`
+							: thinkingText;
 				} else if (compat.requiresReasoningContentForToolCalls) {
 					// Use the streamed signature when the backend accepts whichever
 					// recognized field name was emitted (allowsSynthetic=true). Backends
@@ -1934,16 +2085,19 @@ function convertTools(
 			};
 		}),
 		toolStrictMode,
+		strictToolsApplied:
+			tools.length > 0 &&
+			(toolStrictMode === "all_strict" || (toolStrictMode === "mixed" && adaptedTools.some(tool => tool.strict))),
 	};
 }
 function shouldRetryWithoutStrictTools(
 	error: unknown,
 	capturedErrorResponse: CapturedHttpErrorResponse | undefined,
-	toolStrictMode: AppliedToolStrictMode,
+	strictToolsApplied: boolean,
 	tools: Tool[] | undefined,
 ): boolean {
-	if (!tools || tools.length === 0 || toolStrictMode !== "all_strict") {
+	if (!tools || tools.length === 0 || !strictToolsApplied) {
 		return false;
 	}
 	const status = extractHttpStatusFromError(error) ?? capturedErrorResponse?.status;
@@ -1953,7 +2107,14 @@ function shouldRetryWithoutStrictTools(
 	const messageParts = [error instanceof Error ? error.message : undefined, capturedErrorResponse?.bodyText]
 		.filter((value): value is string => typeof value === "string" && value.trim().length > 0)
 		.join("\n");
-	return /wrong_api_format|mixed values for 'strict'|tool[s]?\b.*strict|\bstrict\b.*tool/i.test(messageParts);
+	// Last two alternatives catch upstream tool-schema validators rejecting our
+	// strictified schemas outright (e.g. OpenRouter DeepSeek's "Invalid tool
+	// parameters schema : field `anyOf`: missing field `type`", #2270, and
+	// OpenAI's own "Invalid schema for function 'x'"). Retrying non-strict sends
+	// the unmodified base schemas, which those validators accept.
+	return /wrong_api_format|mixed values for 'strict'|tool[s]?\b.*strict|\bstrict\b.*tool|tool parameters? schema|invalid schema for function/i.test(
+		messageParts,
+	);
 }
 function mapStopReason(reason: ChatCompletionChunk.Choice["finish_reason"] | string): {
@@ -1974,6 +2135,13 @@ function mapStopReason(reason: ChatCompletionChunk.Choice["finish_reason"] | str
 			return { stopReason: "error", errorMessage: "Provider finish_reason: content_filter" };
 		case "network_error":
 			return { stopReason: "error", errorMessage: "Provider finish_reason: network_error" };
+		case "error":
+			// Gateways (OpenRouter, Vercel AI Gateway, …) report upstream model
+			// failures as a bare `finish_reason: "error"` with no detail. These are
+			// almost always transient (e.g. Gemini MALFORMED_FUNCTION_CALL), so word
+			// the message to match the session retry classifier's transient-transport
+			// pattern (`provider.?returned.?error`) and get the turn auto-retried.
+			return { stopReason: "error", errorMessage: "Provider returned error finish_reason" };
 		default:
 			return {
 				stopReason: "error",
@@ -1981,22 +2149,3 @@ function mapStopReason(reason: ChatCompletionChunk.Choice["finish_reason"] | str
 			};
 	}
 }
-/**
- * Detect compatibility settings from provider and baseUrl for known providers.
- * Provider takes precedence over URL-based detection since it's explicitly configured.
- * Returns a fully resolved OpenAICompat object with all fields set.
- */
-export function detectCompat(model: Model<"openai-completions">): ResolvedOpenAICompat {
-	return detectOpenAICompat(model);
-}
-/**
- * Get resolved compatibility settings for a model.
- * Uses explicit model.compat if provided, otherwise auto-detects from provider/URL.
- * @param model - The model configuration
- * @param resolvedBaseUrl - Optional resolved base URL (e.g., after GitHub Copilot proxy-ep resolution).
- */
-function getCompat(model: Model<"openai-completions">, resolvedBaseUrl?: string): ResolvedOpenAICompat {
-	return resolveOpenAICompat(model, resolvedBaseUrl);
-}