@prometheus-ai/ai 0.5.4 → 0.5.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (377) hide show
  1. package/dist/types/auth-broker/remote-store.d.ts +2 -1
  2. package/dist/types/auth-broker/wire-schemas.d.ts +4 -1
  3. package/dist/types/auth-gateway/server.d.ts +19 -0
  4. package/dist/types/auth-gateway/types.d.ts +9 -3
  5. package/dist/types/auth-retry.d.ts +119 -0
  6. package/dist/types/auth-storage.d.ts +217 -8
  7. package/dist/types/errors.d.ts +24 -0
  8. package/dist/types/index.d.ts +5 -9
  9. package/dist/types/provider-details.d.ts +1 -1
  10. package/dist/types/providers/amazon-bedrock.d.ts +12 -6
  11. package/dist/types/providers/anthropic-client.d.ts +10 -3
  12. package/dist/types/providers/anthropic-messages-server-schema.d.ts +2 -2
  13. package/dist/types/providers/anthropic-messages-server.d.ts +3 -3
  14. package/dist/types/providers/anthropic-wire.d.ts +3 -3
  15. package/dist/types/providers/anthropic.d.ts +41 -34
  16. package/dist/types/providers/aws-credentials.d.ts +8 -0
  17. package/dist/types/providers/azure-openai-responses.d.ts +1 -0
  18. package/dist/types/providers/google-gemini-cli.d.ts +22 -1
  19. package/dist/types/providers/google-shared.d.ts +22 -0
  20. package/dist/types/providers/google-types.d.ts +13 -1
  21. package/dist/types/providers/mock.d.ts +8 -3
  22. package/dist/types/providers/ollama.d.ts +6 -0
  23. package/dist/types/providers/openai-chat-server-schema.d.ts +6 -3
  24. package/dist/types/providers/openai-chat-server.d.ts +3 -3
  25. package/dist/types/providers/openai-chat-wire.d.ts +644 -0
  26. package/dist/types/providers/openai-codex/request-transformer.d.ts +8 -0
  27. package/dist/types/providers/openai-codex/response-handler.d.ts +9 -0
  28. package/dist/types/providers/openai-codex-responses.d.ts +31 -2
  29. package/dist/types/providers/openai-completions-compat.d.ts +2 -25
  30. package/dist/types/providers/openai-completions.d.ts +2 -10
  31. package/dist/types/providers/openai-responses-server-schema.d.ts +4 -4
  32. package/dist/types/providers/openai-responses-server.d.ts +2 -2
  33. package/dist/types/providers/openai-responses-shared.d.ts +49 -9
  34. package/dist/types/providers/openai-responses-wire.d.ts +6065 -0
  35. package/dist/types/providers/openai-responses.d.ts +13 -4
  36. package/dist/types/providers/prometheus-native-client.d.ts +9 -0
  37. package/dist/types/providers/prometheus-native-server.d.ts +4 -3
  38. package/dist/types/providers/transform-messages.d.ts +1 -2
  39. package/dist/types/rate-limit-utils.d.ts +3 -2
  40. package/dist/types/registry/aimlapi.d.ts +4 -0
  41. package/dist/types/registry/alibaba-coding-plan.d.ts +7 -0
  42. package/dist/types/registry/amazon-bedrock.d.ts +5 -0
  43. package/dist/types/registry/anthropic.d.ts +10 -0
  44. package/dist/types/{utils/oauth → registry}/api-key-login.d.ts +8 -2
  45. package/dist/types/{utils/oauth → registry}/api-key-validation.d.ts +15 -0
  46. package/dist/types/registry/cerebras.d.ts +7 -0
  47. package/dist/types/registry/cloudflare-ai-gateway.d.ts +13 -0
  48. package/dist/types/registry/cursor.d.ts +7 -0
  49. package/dist/types/registry/deepseek.d.ts +8 -0
  50. package/dist/types/registry/derived.d.ts +5 -0
  51. package/dist/types/registry/firepass.d.ts +16 -0
  52. package/dist/types/registry/fireworks.d.ts +7 -0
  53. package/dist/types/registry/github-copilot.d.ts +7 -0
  54. package/dist/types/registry/gitlab-duo.d.ts +9 -0
  55. package/dist/types/registry/google-antigravity.d.ts +9 -0
  56. package/dist/types/registry/google-gemini-cli.d.ts +9 -0
  57. package/dist/types/registry/google-vertex.d.ts +5 -0
  58. package/dist/types/registry/google.d.ts +4 -0
  59. package/dist/types/registry/groq.d.ts +4 -0
  60. package/dist/types/registry/huggingface.d.ts +7 -0
  61. package/dist/types/registry/index.d.ts +4 -0
  62. package/dist/types/registry/kagi.d.ts +14 -0
  63. package/dist/types/registry/kilo.d.ts +7 -0
  64. package/dist/types/registry/kimi-code.d.ts +7 -0
  65. package/dist/types/registry/litellm.d.ts +13 -0
  66. package/dist/types/registry/lm-studio.d.ts +8 -0
  67. package/dist/types/registry/minimax-code-cn.d.ts +6 -0
  68. package/dist/types/registry/minimax-code.d.ts +6 -0
  69. package/dist/types/registry/minimax.d.ts +4 -0
  70. package/dist/types/registry/mistral.d.ts +4 -0
  71. package/dist/types/registry/moonshot.d.ts +7 -0
  72. package/dist/types/registry/nanogpt.d.ts +7 -0
  73. package/dist/types/registry/nvidia.d.ts +7 -0
  74. package/dist/types/registry/oauth/__tests__/xai-oauth.test.d.ts +1 -0
  75. package/dist/types/{utils → registry}/oauth/anthropic.d.ts +2 -1
  76. package/dist/types/{utils → registry}/oauth/github-copilot.d.ts +15 -23
  77. package/dist/types/{utils → registry}/oauth/index.d.ts +1 -0
  78. package/dist/types/{utils → registry}/oauth/minimax-code.d.ts +5 -5
  79. package/dist/types/{utils → registry}/oauth/types.d.ts +6 -1
  80. package/dist/types/{utils → registry}/oauth/xai-oauth.d.ts +2 -1
  81. package/dist/types/registry/ollama-cloud.d.ts +7 -0
  82. package/dist/types/registry/ollama.d.ts +12 -0
  83. package/dist/types/registry/openai-codex-device.d.ts +8 -0
  84. package/dist/types/registry/openai-codex.d.ts +9 -0
  85. package/dist/types/registry/openai.d.ts +4 -0
  86. package/dist/types/registry/opencode-go.d.ts +6 -0
  87. package/dist/types/registry/opencode-zen.d.ts +6 -0
  88. package/dist/types/registry/openrouter.d.ts +13 -0
  89. package/dist/types/registry/parallel.d.ts +14 -0
  90. package/dist/types/registry/perplexity.d.ts +7 -0
  91. package/dist/types/registry/qianfan.d.ts +7 -0
  92. package/dist/types/registry/qwen-portal.d.ts +7 -0
  93. package/dist/types/registry/registry.d.ts +272 -0
  94. package/dist/types/registry/synthetic.d.ts +6 -0
  95. package/dist/types/registry/tavily.d.ts +14 -0
  96. package/dist/types/registry/together.d.ts +6 -0
  97. package/dist/types/registry/types.d.ts +51 -0
  98. package/dist/types/registry/venice.d.ts +13 -0
  99. package/dist/types/registry/vercel-ai-gateway.d.ts +7 -0
  100. package/dist/types/registry/vllm.d.ts +7 -0
  101. package/dist/types/registry/wafer-pass.d.ts +6 -0
  102. package/dist/types/registry/wafer-serverless.d.ts +6 -0
  103. package/dist/types/registry/xai-oauth.d.ts +7 -0
  104. package/dist/types/registry/xai.d.ts +4 -0
  105. package/dist/types/registry/xiaomi-token-plan-ams.d.ts +6 -0
  106. package/dist/types/registry/xiaomi-token-plan-cn.d.ts +6 -0
  107. package/dist/types/registry/xiaomi-token-plan-sgp.d.ts +6 -0
  108. package/dist/types/registry/xiaomi.d.ts +6 -0
  109. package/dist/types/registry/zai.d.ts +7 -0
  110. package/dist/types/registry/zenmux.d.ts +7 -0
  111. package/dist/types/registry/zhipu-coding-plan.d.ts +7 -0
  112. package/dist/types/stream.d.ts +9 -1
  113. package/dist/types/types.d.ts +56 -295
  114. package/dist/types/usage/google-antigravity.d.ts +15 -1
  115. package/dist/types/usage/openai-codex-reset.d.ts +79 -0
  116. package/dist/types/usage/openai-codex.d.ts +1 -0
  117. package/dist/types/usage.d.ts +77 -4
  118. package/dist/types/utils/abort.d.ts +6 -0
  119. package/dist/types/utils/event-stream.d.ts +2 -0
  120. package/dist/types/utils/http-inspector.d.ts +0 -1
  121. package/dist/types/utils/idle-iterator.d.ts +35 -0
  122. package/dist/types/utils/openai-http.d.ts +58 -0
  123. package/dist/types/utils/request-debug.d.ts +3 -0
  124. package/dist/types/utils/retry-after.d.ts +1 -0
  125. package/dist/types/utils/schema/fields.d.ts +5 -0
  126. package/dist/types/utils/schema/json-schema-validator.d.ts +8 -0
  127. package/dist/types/utils/schema/stamps.d.ts +7 -15
  128. package/dist/types/utils/sse-debug.d.ts +0 -5
  129. package/dist/types/utils/stream-markup-healing.d.ts +2 -0
  130. package/dist/types/utils.d.ts +1 -5
  131. package/package.json +17 -29
  132. package/src/auth-broker/remote-store.ts +10 -1
  133. package/src/auth-broker/snapshot-cache.ts +1 -1
  134. package/src/auth-broker/wire-schemas.ts +1 -1
  135. package/src/auth-gateway/http.ts +1 -1
  136. package/src/auth-gateway/server.ts +95 -30
  137. package/src/auth-gateway/types.ts +10 -2
  138. package/src/auth-retry.ts +238 -0
  139. package/src/auth-storage.ts +935 -430
  140. package/src/errors.ts +32 -0
  141. package/src/index.ts +9 -14
  142. package/src/provider-details.ts +1 -1
  143. package/src/providers/__tests__/google-auth.test.ts +144 -0
  144. package/src/providers/amazon-bedrock.ts +70 -40
  145. package/src/providers/anthropic-client.ts +15 -13
  146. package/src/providers/anthropic-messages-server-schema.ts +17 -7
  147. package/src/providers/anthropic-messages-server.ts +88 -20
  148. package/src/providers/anthropic-wire.ts +4 -3
  149. package/src/providers/anthropic.ts +1234 -621
  150. package/src/providers/aws-credentials.ts +47 -5
  151. package/src/providers/aws-eventstream.ts +5 -0
  152. package/src/providers/azure-openai-responses.ts +117 -67
  153. package/src/providers/cursor.ts +30 -30
  154. package/src/providers/github-copilot-headers.ts +1 -1
  155. package/src/providers/gitlab-duo.ts +36 -29
  156. package/src/providers/google-auth.ts +71 -8
  157. package/src/providers/google-gemini-cli.ts +118 -22
  158. package/src/providers/google-shared.ts +163 -43
  159. package/src/providers/google-types.ts +10 -1
  160. package/src/providers/kimi.ts +1 -1
  161. package/src/providers/mock.ts +11 -3
  162. package/src/providers/ollama.ts +64 -7
  163. package/src/providers/openai-anthropic-shim.ts +17 -8
  164. package/src/providers/openai-chat-server-schema.ts +9 -3
  165. package/src/providers/openai-chat-server.ts +82 -16
  166. package/src/providers/openai-chat-wire.ts +847 -0
  167. package/src/providers/openai-codex/request-transformer.ts +129 -34
  168. package/src/providers/openai-codex/response-handler.ts +22 -1
  169. package/src/providers/openai-codex-responses.ts +699 -247
  170. package/src/providers/openai-completions-compat.ts +8 -308
  171. package/src/providers/openai-completions.ts +416 -267
  172. package/src/providers/openai-responses-server-schema.ts +15 -9
  173. package/src/providers/openai-responses-server.ts +162 -114
  174. package/src/providers/openai-responses-shared.ts +320 -82
  175. package/src/providers/openai-responses-wire.ts +6391 -0
  176. package/src/providers/openai-responses.ts +382 -176
  177. package/src/providers/prometheus-native-client.ts +27 -11
  178. package/src/providers/prometheus-native-server.ts +44 -17
  179. package/src/providers/transform-messages.ts +311 -120
  180. package/src/providers/vision-guard.ts +5 -3
  181. package/src/rate-limit-utils.ts +13 -3
  182. package/src/registry/aimlapi.ts +6 -0
  183. package/src/{utils/oauth → registry}/alibaba-coding-plan.ts +8 -18
  184. package/src/registry/amazon-bedrock.ts +22 -0
  185. package/src/registry/anthropic.ts +26 -0
  186. package/src/{utils/oauth → registry}/api-key-login.ts +25 -3
  187. package/src/{utils/oauth → registry}/api-key-validation.ts +62 -2
  188. package/src/{utils/oauth → registry}/cerebras.ts +8 -1
  189. package/src/{utils/oauth → registry}/cloudflare-ai-gateway.ts +8 -12
  190. package/src/registry/cursor.ts +20 -0
  191. package/src/{utils/oauth → registry}/deepseek.ts +9 -17
  192. package/src/registry/derived.ts +9 -0
  193. package/src/{utils/oauth → registry}/firepass.ts +10 -2
  194. package/src/{utils/oauth → registry}/fireworks.ts +8 -1
  195. package/src/registry/github-copilot.ts +22 -0
  196. package/src/registry/gitlab-duo.ts +19 -0
  197. package/src/registry/google-antigravity.ts +21 -0
  198. package/src/registry/google-gemini-cli.ts +21 -0
  199. package/src/registry/google-vertex.ts +38 -0
  200. package/src/registry/google.ts +6 -0
  201. package/src/registry/groq.ts +6 -0
  202. package/src/{utils/oauth → registry}/huggingface.ts +8 -19
  203. package/src/registry/index.ts +4 -0
  204. package/src/{utils/oauth → registry}/kagi.ts +9 -11
  205. package/src/{utils/oauth → registry}/kilo.ts +11 -6
  206. package/src/registry/kimi-code.ts +17 -0
  207. package/src/{utils/oauth → registry}/litellm.ts +8 -12
  208. package/src/{utils/oauth → registry}/lm-studio.ts +9 -17
  209. package/src/registry/minimax-code-cn.ts +12 -0
  210. package/src/registry/minimax-code.ts +12 -0
  211. package/src/registry/minimax.ts +6 -0
  212. package/src/registry/mistral.ts +6 -0
  213. package/src/{utils/oauth → registry}/moonshot.ts +8 -9
  214. package/src/{utils/oauth → registry}/nanogpt.ts +8 -1
  215. package/src/{utils/oauth → registry}/nvidia.ts +8 -18
  216. package/src/{utils → registry}/oauth/__tests__/xai-oauth.test.ts +4 -7
  217. package/src/{utils → registry}/oauth/anthropic.ts +38 -17
  218. package/src/{utils → registry}/oauth/github-copilot.ts +79 -115
  219. package/src/registry/oauth/gitlab-duo.ts +198 -0
  220. package/src/{utils → registry}/oauth/google-antigravity.ts +1 -4
  221. package/src/{utils → registry}/oauth/google-gemini-cli.ts +1 -4
  222. package/src/registry/oauth/index.ts +164 -0
  223. package/src/{utils → registry}/oauth/minimax-code.ts +16 -14
  224. package/src/{utils → registry}/oauth/types.ts +7 -51
  225. package/src/{utils → registry}/oauth/wafer.ts +1 -1
  226. package/src/{utils → registry}/oauth/xai-oauth.ts +16 -8
  227. package/src/{utils → registry}/oauth/xiaomi.ts +9 -4
  228. package/src/{utils/oauth → registry}/ollama-cloud.ts +8 -1
  229. package/src/{utils/oauth → registry}/ollama.ts +8 -13
  230. package/src/registry/openai-codex-device.ts +18 -0
  231. package/src/registry/openai-codex.ts +19 -0
  232. package/src/registry/openai.ts +6 -0
  233. package/src/registry/opencode-go.ts +12 -0
  234. package/src/registry/opencode-zen.ts +12 -0
  235. package/src/{utils/oauth → registry}/openrouter.ts +10 -2
  236. package/src/{utils/oauth → registry}/parallel.ts +9 -11
  237. package/src/registry/perplexity.ts +13 -0
  238. package/src/{utils/oauth → registry}/qianfan.ts +8 -17
  239. package/src/{utils/oauth → registry}/qwen-portal.ts +8 -19
  240. package/src/registry/registry.ts +149 -0
  241. package/src/{utils/oauth → registry}/synthetic.ts +7 -1
  242. package/src/{utils/oauth → registry}/tavily.ts +10 -12
  243. package/src/{utils/oauth → registry}/together.ts +7 -1
  244. package/src/registry/types.ts +56 -0
  245. package/src/{utils/oauth → registry}/venice.ts +8 -12
  246. package/src/{utils/oauth → registry}/vercel-ai-gateway.ts +8 -18
  247. package/src/{utils/oauth → registry}/vllm.ts +9 -16
  248. package/src/registry/wafer-pass.ts +12 -0
  249. package/src/registry/wafer-serverless.ts +12 -0
  250. package/src/registry/xai-oauth.ts +17 -0
  251. package/src/registry/xai.ts +6 -0
  252. package/src/registry/xiaomi-token-plan-ams.ts +12 -0
  253. package/src/registry/xiaomi-token-plan-cn.ts +12 -0
  254. package/src/registry/xiaomi-token-plan-sgp.ts +12 -0
  255. package/src/registry/xiaomi.ts +12 -0
  256. package/src/{utils/oauth → registry}/zai.ts +10 -22
  257. package/src/{utils/oauth → registry}/zenmux.ts +8 -1
  258. package/src/{utils/oauth/zhipu.ts → registry/zhipu-coding-plan.ts} +9 -21
  259. package/src/stream.ts +229 -199
  260. package/src/types.ts +63 -384
  261. package/src/usage/claude.ts +4 -2
  262. package/src/usage/github-copilot.ts +4 -2
  263. package/src/usage/google-antigravity.ts +196 -28
  264. package/src/usage/kimi.ts +1 -1
  265. package/src/usage/minimax-code.ts +5 -6
  266. package/src/usage/openai-codex-reset.ts +174 -0
  267. package/src/usage/openai-codex.ts +19 -2
  268. package/src/usage/zai.ts +2 -1
  269. package/src/usage.ts +93 -4
  270. package/src/utils/abort.ts +14 -0
  271. package/src/utils/event-stream.ts +17 -0
  272. package/src/utils/http-inspector.ts +4 -12
  273. package/src/utils/idle-iterator.ts +250 -79
  274. package/src/utils/openai-http.ts +157 -0
  275. package/src/utils/request-debug.ts +67 -19
  276. package/src/utils/retry-after.ts +1 -1
  277. package/src/utils/retry.ts +23 -2
  278. package/src/utils/schema/CONSTRAINTS.md +4 -2
  279. package/src/utils/schema/fields.ts +16 -0
  280. package/src/utils/schema/json-schema-validator.ts +19 -1
  281. package/src/utils/schema/normalize.ts +80 -8
  282. package/src/utils/schema/stamps.ts +22 -10
  283. package/src/utils/schema/wire.ts +2 -2
  284. package/src/utils/sse-debug.ts +0 -271
  285. package/src/utils/stream-markup-healing.ts +50 -8
  286. package/src/utils/validation.ts +49 -13
  287. package/src/utils.ts +2 -26
  288. package/dist/types/model-cache.d.ts +0 -17
  289. package/dist/types/model-manager.d.ts +0 -64
  290. package/dist/types/model-thinking.d.ts +0 -100
  291. package/dist/types/models.d.ts +0 -12
  292. package/dist/types/provider-models/bundled-references.d.ts +0 -4
  293. package/dist/types/provider-models/descriptors.d.ts +0 -50
  294. package/dist/types/provider-models/google.d.ts +0 -24
  295. package/dist/types/provider-models/index.d.ts +0 -5
  296. package/dist/types/provider-models/ollama.d.ts +0 -7
  297. package/dist/types/provider-models/openai-compat.d.ts +0 -323
  298. package/dist/types/provider-models/special.d.ts +0 -16
  299. package/dist/types/utils/discovery/antigravity.d.ts +0 -61
  300. package/dist/types/utils/discovery/codex.d.ts +0 -38
  301. package/dist/types/utils/discovery/cursor.d.ts +0 -23
  302. package/dist/types/utils/discovery/gemini.d.ts +0 -25
  303. package/dist/types/utils/discovery/index.d.ts +0 -4
  304. package/dist/types/utils/discovery/openai-compatible.d.ts +0 -72
  305. package/dist/types/utils/oauth/alibaba-coding-plan.d.ts +0 -18
  306. package/dist/types/utils/oauth/cerebras.d.ts +0 -1
  307. package/dist/types/utils/oauth/cloudflare-ai-gateway.d.ts +0 -18
  308. package/dist/types/utils/oauth/deepseek.d.ts +0 -10
  309. package/dist/types/utils/oauth/firepass.d.ts +0 -1
  310. package/dist/types/utils/oauth/fireworks.d.ts +0 -1
  311. package/dist/types/utils/oauth/huggingface.d.ts +0 -19
  312. package/dist/types/utils/oauth/kagi.d.ts +0 -17
  313. package/dist/types/utils/oauth/kilo.d.ts +0 -5
  314. package/dist/types/utils/oauth/litellm.d.ts +0 -18
  315. package/dist/types/utils/oauth/lm-studio.d.ts +0 -17
  316. package/dist/types/utils/oauth/moonshot.d.ts +0 -1
  317. package/dist/types/utils/oauth/nanogpt.d.ts +0 -1
  318. package/dist/types/utils/oauth/nvidia.d.ts +0 -18
  319. package/dist/types/utils/oauth/ollama-cloud.d.ts +0 -2
  320. package/dist/types/utils/oauth/ollama.d.ts +0 -18
  321. package/dist/types/utils/oauth/openrouter.d.ts +0 -1
  322. package/dist/types/utils/oauth/parallel.d.ts +0 -17
  323. package/dist/types/utils/oauth/qianfan.d.ts +0 -17
  324. package/dist/types/utils/oauth/qwen-portal.d.ts +0 -19
  325. package/dist/types/utils/oauth/synthetic.d.ts +0 -1
  326. package/dist/types/utils/oauth/tavily.d.ts +0 -17
  327. package/dist/types/utils/oauth/together.d.ts +0 -1
  328. package/dist/types/utils/oauth/venice.d.ts +0 -18
  329. package/dist/types/utils/oauth/vercel-ai-gateway.d.ts +0 -18
  330. package/dist/types/utils/oauth/vllm.d.ts +0 -16
  331. package/dist/types/utils/oauth/zai.d.ts +0 -18
  332. package/dist/types/utils/oauth/zenmux.d.ts +0 -1
  333. package/dist/types/utils/oauth/zhipu.d.ts +0 -18
  334. package/src/model-cache.ts +0 -129
  335. package/src/model-manager.ts +0 -469
  336. package/src/model-thinking.ts +0 -756
  337. package/src/models.json +0 -60287
  338. package/src/models.json.d.ts +0 -9
  339. package/src/models.ts +0 -56
  340. package/src/provider-models/bundled-references.ts +0 -38
  341. package/src/provider-models/descriptors.ts +0 -364
  342. package/src/provider-models/google.ts +0 -88
  343. package/src/provider-models/index.ts +0 -5
  344. package/src/provider-models/ollama.ts +0 -153
  345. package/src/provider-models/openai-compat.ts +0 -2904
  346. package/src/provider-models/special.ts +0 -67
  347. package/src/utils/discovery/antigravity.ts +0 -261
  348. package/src/utils/discovery/codex.ts +0 -371
  349. package/src/utils/discovery/cursor.ts +0 -306
  350. package/src/utils/discovery/gemini.ts +0 -248
  351. package/src/utils/discovery/index.ts +0 -4
  352. package/src/utils/discovery/openai-compatible.ts +0 -224
  353. package/src/utils/oauth/gitlab-duo.ts +0 -123
  354. package/src/utils/oauth/index.ts +0 -502
  355. /package/dist/types/{utils/oauth/__tests__/xai-oauth.test.d.ts → providers/__tests__/google-auth.test.d.ts} +0 -0
  356. /package/dist/types/{utils → registry}/oauth/callback-server.d.ts +0 -0
  357. /package/dist/types/{utils → registry}/oauth/cursor.d.ts +0 -0
  358. /package/dist/types/{utils → registry}/oauth/gitlab-duo.d.ts +0 -0
  359. /package/dist/types/{utils → registry}/oauth/google-antigravity.d.ts +0 -0
  360. /package/dist/types/{utils → registry}/oauth/google-gemini-cli.d.ts +0 -0
  361. /package/dist/types/{utils → registry}/oauth/google-oauth-shared.d.ts +0 -0
  362. /package/dist/types/{utils → registry}/oauth/kimi.d.ts +0 -0
  363. /package/dist/types/{utils → registry}/oauth/openai-codex.d.ts +0 -0
  364. /package/dist/types/{utils → registry}/oauth/opencode.d.ts +0 -0
  365. /package/dist/types/{utils → registry}/oauth/perplexity.d.ts +0 -0
  366. /package/dist/types/{utils → registry}/oauth/pkce.d.ts +0 -0
  367. /package/dist/types/{utils → registry}/oauth/wafer.d.ts +0 -0
  368. /package/dist/types/{utils → registry}/oauth/xiaomi.d.ts +0 -0
  369. /package/src/{utils → registry}/oauth/callback-server.ts +0 -0
  370. /package/src/{utils → registry}/oauth/cursor.ts +0 -0
  371. /package/src/{utils → registry}/oauth/google-oauth-shared.ts +0 -0
  372. /package/src/{utils → registry}/oauth/kimi.ts +0 -0
  373. /package/src/{utils → registry}/oauth/oauth.html +0 -0
  374. /package/src/{utils → registry}/oauth/openai-codex.ts +0 -0
  375. /package/src/{utils → registry}/oauth/opencode.ts +0 -0
  376. /package/src/{utils → registry}/oauth/perplexity.ts +0 -0
  377. /package/src/{utils → registry}/oauth/pkce.ts +0 -0
@@ -1,27 +1,24 @@
1
- import { $env, APP_DISPLAY_NAME, extractHttpStatusFromError } from "@prometheus-ai/utils";
2
- import OpenAI, { APIConnectionTimeoutError as OpenAIConnectionTimeoutError } from "openai";
3
- import type {
4
- ChatCompletionAssistantMessageParam,
5
- ChatCompletionChunk,
6
- ChatCompletionContentPart,
7
- ChatCompletionContentPartImage,
8
- ChatCompletionContentPartText,
9
- ChatCompletionMessageParam,
10
- ChatCompletionToolMessageParam,
11
- } from "openai/resources/chat/completions";
1
+ import type { Effort } from "@prometheus-ai/catalog/effort";
2
+ import { toFirepassWireModelId, toFireworksWireModelId } from "@prometheus-ai/catalog/fireworks-model-id";
3
+ import { isDeepseekModelIdOrName } from "@prometheus-ai/catalog/identity";
4
+ import { getSupportedEfforts, resolveWireModelId } from "@prometheus-ai/catalog/model-thinking";
5
+ import { calculateCost } from "@prometheus-ai/catalog/models";
6
+ import type { ResolvedOpenAICompat } from "@prometheus-ai/catalog/types";
7
+ import { parseGitHubCopilotApiKey } from "@prometheus-ai/catalog/wire/github-copilot";
8
+ import { $env, extractHttpStatusFromError } from "@prometheus-ai/utils";
12
9
  import packageJson from "../../package.json" with { type: "json" };
13
- import { type Effort, getSupportedEfforts } from "../model-thinking";
14
- import { calculateCost } from "../models";
10
+ import { getKimiCommonHeaders } from "../registry/oauth/kimi";
15
11
  import { getEnvApiKey } from "../stream";
16
12
  import {
17
13
  type AssistantMessage,
18
14
  type Context,
19
- type FetchImpl,
20
15
  type Message,
21
16
  type MessageAttribution,
22
17
  type Model,
18
+ OPENAI_MAX_OUTPUT_TOKENS,
23
19
  type OpenAICompat,
24
20
  type ProviderSessionState,
21
+ type RawSseEvent,
25
22
  resolveServiceTier,
26
23
  type ServiceTier,
27
24
  type StopReason,
@@ -38,7 +35,6 @@ import {
38
35
  import { normalizeSystemPrompts } from "../utils";
39
36
  import { createAbortSourceTracker } from "../utils/abort";
40
37
  import { AssistantMessageEventStream } from "../utils/event-stream";
41
- import { toFirepassWireModelId, toFireworksWireModelId } from "../utils/fireworks-model-id";
42
38
  import {
43
39
  type CapturedHttpErrorResponse,
44
40
  finalizeErrorMessage,
@@ -49,27 +45,38 @@ import {
49
45
  getOpenAIStreamFirstEventTimeoutMs,
50
46
  getOpenAIStreamIdleTimeoutMs,
51
47
  iterateWithIdleTimeout,
48
+ iterateWithTerminalGrace,
52
49
  } from "../utils/idle-iterator";
53
50
  import { parseStreamingJson, parseStreamingJsonThrottled } from "../utils/json-parse";
54
- import { parseGitHubCopilotApiKey } from "../utils/oauth/github-copilot";
55
- import { getKimiCommonHeaders } from "../utils/oauth/kimi";
51
+ import { OpenAIHttpError, postOpenAIStream } from "../utils/openai-http";
56
52
  import { notifyProviderResponse } from "../utils/provider-response";
57
53
  import { callWithCopilotModelRetry } from "../utils/retry";
58
54
  import { adaptSchemaForStrict, NO_STRICT, toolWireSchema } from "../utils/schema";
59
- import { wrapFetchForSseDebug } from "../utils/sse-debug";
60
55
  import {
61
56
  getStreamMarkupHealingPattern,
62
57
  type HealedToolCall,
58
+ modelMayLeakThinkingTags,
63
59
  StreamMarkupHealing,
64
60
  type StreamMarkupHealingEvent,
65
61
  } from "../utils/stream-markup-healing";
66
62
  import { isForcedToolChoice, mapToOpenAICompletionsToolChoice } from "../utils/tool-choice";
63
+ import { parseAzureDeploymentNameMap } from "./azure-openai-responses";
67
64
  import {
68
65
  buildCopilotDynamicHeaders,
69
66
  hasCopilotVisionInput,
70
67
  resolveGitHubCopilotBaseUrl,
71
68
  } from "./github-copilot-headers";
72
- import { detectOpenAICompat, type ResolvedOpenAICompat, resolveOpenAICompat } from "./openai-completions-compat";
69
+ import type {
70
+ ChatCompletionAssistantMessageParam,
71
+ ChatCompletionChunk,
72
+ ChatCompletionContentPart,
73
+ ChatCompletionContentPartImage,
74
+ ChatCompletionContentPartText,
75
+ ChatCompletionCreateParamsStreaming,
76
+ ChatCompletionMessageParam,
77
+ ChatCompletionTool,
78
+ ChatCompletionToolMessageParam,
79
+ } from "./openai-chat-wire";
73
80
  import { createInitialResponsesAssistantMessage } from "./openai-responses-shared";
74
81
  import { transformMessages } from "./transform-messages";
75
82
  import {
@@ -107,10 +114,16 @@ function resolveOpenAICompletionsModelId(
107
114
  model: Model<"openai-completions">,
108
115
  options: OpenAICompletionsOptions | undefined,
109
116
  ): string {
110
- if (model.provider === "firepass") return toFirepassWireModelId(model.id);
111
- if (model.provider === "fireworks") return toFireworksWireModelId(model.id);
112
- if (model.provider === "openrouter") return applyOpenRouterRoutingVariant(model.id, options?.openrouterVariant);
113
- return model.id;
117
+ // Effort-tier variants route per request effort (off → bare id, efforts →
118
+ // the thinking backing id); catalog variants (Copilot long-context `-1m`
119
+ // entries) pin via `requestModelId`; everything else serializes `model.id`.
120
+ const effort =
121
+ options?.reasoning && !options.disableReasoning && model.reasoning ? (options.reasoning as Effort) : undefined;
122
+ const wireId = resolveWireModelId(model, effort);
123
+ if (model.provider === "firepass") return toFirepassWireModelId(wireId);
124
+ if (model.provider === "fireworks") return toFireworksWireModelId(wireId);
125
+ if (model.provider === "openrouter") return applyOpenRouterRoutingVariant(wireId, options?.openrouterVariant);
126
+ return wireId;
114
127
  }
115
128
 
116
129
  /**
@@ -255,7 +268,7 @@ export interface OpenAICompletionsOptions extends StreamOptions {
255
268
  openrouterVariant?: string;
256
269
  }
257
270
 
258
- type OpenAICompletionsParams = OpenAI.Chat.Completions.ChatCompletionCreateParamsStreaming & {
271
+ type OpenAICompletionsParams = ChatCompletionCreateParamsStreaming & {
259
272
  top_k?: number;
260
273
  min_p?: number;
261
274
  repetition_penalty?: number;
@@ -271,8 +284,10 @@ type AppliedToolStrictMode = "mixed" | "all_strict" | "none";
271
284
  type ToolStrictModeOverride = Exclude<ResolvedOpenAICompat["toolStrictMode"], "mixed"> | undefined;
272
285
 
273
286
  type BuiltOpenAICompletionTools = {
274
- tools: OpenAI.Chat.Completions.ChatCompletionTool[];
287
+ tools: ChatCompletionTool[];
275
288
  toolStrictMode: AppliedToolStrictMode;
289
+ /** True when at least one wire tool was sent with `strict: true`. */
290
+ strictToolsApplied: boolean;
276
291
  };
277
292
 
278
293
  const OPENAI_COMPLETIONS_PROVIDER_SESSION_STATE_PREFIX = "openai-completions:";
@@ -385,25 +400,13 @@ function getTrailingPartialDeepseekToken(text: string): string {
385
400
  }
386
401
  const OPENAI_COMPLETIONS_FIRST_EVENT_TIMEOUT_MESSAGE =
387
402
  "OpenAI completions stream timed out while waiting for the first event";
388
-
389
- const GLM_CODING_PLAN_STREAM_IDLE_TIMEOUT_MS = 600_000;
390
- const GLM_CODING_PLAN_MODEL_PATTERN = /^glm-5(?:[.-]|$)/i;
391
-
392
- /** Returns the widened OpenAI stream watchdog floor for slow GLM coding-plan reasoning models. */
393
- export function getOpenAICompletionsStreamIdleTimeoutFallbackMs(
394
- model: Model<"openai-completions">,
395
- ): number | undefined {
396
- if (!GLM_CODING_PLAN_MODEL_PATTERN.test(model.id)) return undefined;
397
- if (model.provider === "zhipu-coding-plan" || model.provider === "zai")
398
- return GLM_CODING_PLAN_STREAM_IDLE_TIMEOUT_MS;
399
-
400
- const baseUrl = model.baseUrl.toLowerCase();
401
- if (baseUrl.includes("open.bigmodel.cn") || baseUrl.includes("api.z.ai")) {
402
- return GLM_CODING_PLAN_STREAM_IDLE_TIMEOUT_MS;
403
- }
404
-
405
- return undefined;
406
- }
403
+ // How long to keep draining the stream after a `finish_reason` chunk arrived.
404
+ // Compliant hosts follow it (almost) immediately with an optional usage-only
405
+ // chunk and the `[DONE]` sentinel, so the window only ever elapses on hosts
406
+ // that hold the connection open after the response logically completed —
407
+ // without it the turn parks on `iterator.next()` until the idle watchdog
408
+ // converts the already-successful response into a timeout error.
409
+ const OPENAI_COMPLETIONS_POST_FINISH_GRACE_MS = 2_500;
407
410
 
408
411
  export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
409
412
  model: Model<"openai-completions">,
@@ -415,41 +418,55 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
415
418
  (async () => {
416
419
  const startTime = Date.now();
417
420
  let firstTokenTime: number | undefined;
418
- let getCapturedErrorResponse: (() => CapturedHttpErrorResponse | undefined) | undefined;
419
421
 
420
422
  const output: AssistantMessage = createInitialResponsesAssistantMessage(model.api, model.provider, model.id);
421
423
  let rawRequestDump: RawHttpRequestDump | undefined;
422
424
  const abortTracker = createAbortSourceTracker(options?.signal);
423
425
  const firstEventTimeoutAbortError = new Error(OPENAI_COMPLETIONS_FIRST_EVENT_TIMEOUT_MESSAGE);
424
426
  const { requestAbortController, requestSignal } = abortTracker;
427
+ const onSseEvent = options?.onSseEvent;
428
+ const rawSseObserver = onSseEvent
429
+ ? (event: RawSseEvent) => {
430
+ if (!event.event && event.data && event.data !== "[DONE]") {
431
+ try {
432
+ const parsed = JSON.parse(event.data);
433
+ const resolvedEvent =
434
+ typeof parsed.type === "string"
435
+ ? parsed.type
436
+ : typeof parsed.object === "string"
437
+ ? parsed.object
438
+ : null;
439
+ if (resolvedEvent) {
440
+ event.event = resolvedEvent;
441
+ event.raw = [`event: ${resolvedEvent}`, ...event.raw];
442
+ }
443
+ } catch {}
444
+ }
445
+ onSseEvent(event, model);
446
+ }
447
+ : undefined;
448
+ // Assigned once the block helpers exist (they are scoped to the `try`);
449
+ // the catch handler uses it to close any open blocks before emitting the
450
+ // terminal error so both exit paths obey the same block lifecycle.
451
+ let finishOpenBlocksOnError: () => void = () => {};
425
452
 
426
453
  try {
427
454
  const apiKey = options?.apiKey || getEnvApiKey(model.provider) || "";
428
- const idleTimeoutFallbackMs = getOpenAICompletionsStreamIdleTimeoutFallbackMs(model);
455
+ const idleTimeoutFallbackMs = model.compat.streamIdleTimeoutMs;
429
456
  const idleTimeoutMs = options?.streamIdleTimeoutMs ?? getOpenAIStreamIdleTimeoutMs(idleTimeoutFallbackMs);
430
457
  const firstEventTimeoutMs =
431
458
  options?.streamFirstEventTimeoutMs ?? getOpenAIStreamFirstEventTimeoutMs(idleTimeoutMs);
432
459
  const requestTimeoutMs =
433
460
  firstEventTimeoutMs !== undefined && firstEventTimeoutMs > 0 ? firstEventTimeoutMs : undefined;
434
- const {
435
- client,
436
- copilotPremiumRequests,
437
- baseUrl,
438
- requestHeaders,
439
- getCapturedErrorResponse: captureErrorResponse,
440
- clearCapturedErrorResponse,
441
- } = await createClient(
461
+ const { copilotPremiumRequests, baseUrl, headers, query, requestHeaders } = await createRequestSetup(
442
462
  model,
443
463
  context,
444
464
  apiKey,
445
465
  options?.headers,
446
466
  options?.initiatorOverride,
447
- options?.onSseEvent,
448
- options?.fetch,
449
467
  );
450
468
  const premiumRequestsTotal = copilotPremiumRequests;
451
- getCapturedErrorResponse = captureErrorResponse;
452
- let appliedToolStrictMode: AppliedToolStrictMode = "mixed";
469
+ let appliedStrictTools = false;
453
470
  const providerSessionState = getOpenAICompletionsProviderSessionState(
454
471
  model,
455
472
  baseUrl,
@@ -457,31 +474,29 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
457
474
  );
458
475
  let disableStrictTools = providerSessionState?.strictToolsDisabled ?? false;
459
476
  let strictFallbackErrorMessage: string | undefined;
477
+ const trimmedBaseUrl = baseUrl.replace(/\/+$/, "");
478
+ const completionsUrl = query
479
+ ? `${trimmedBaseUrl}/chat/completions?${new URLSearchParams(query)}`
480
+ : `${trimmedBaseUrl}/chat/completions`;
460
481
  const createCompletionsStream = async (toolStrictModeOverride?: ToolStrictModeOverride) => {
461
- clearCapturedErrorResponse();
462
482
  const effectiveToolStrictModeOverride = disableStrictTools ? "none" : toolStrictModeOverride;
463
- const { params, toolStrictMode } = buildParams(
483
+ const { params, strictToolsApplied } = buildParams(
464
484
  model,
465
485
  context,
466
486
  options,
467
- baseUrl,
468
487
  effectiveToolStrictModeOverride,
469
488
  );
470
- appliedToolStrictMode = toolStrictMode;
489
+ appliedStrictTools = strictToolsApplied;
471
490
  options?.onPayload?.(params);
472
491
  rawRequestDump = {
473
492
  provider: model.provider,
474
493
  api: output.api,
475
494
  model: model.id,
476
495
  method: "POST",
477
- url: `${baseUrl}/chat/completions`,
496
+ url: completionsUrl,
478
497
  headers: requestHeaders,
479
498
  body: params,
480
499
  };
481
- const requestOptions =
482
- requestTimeoutMs === undefined
483
- ? { signal: requestSignal }
484
- : { signal: requestSignal, timeout: requestTimeoutMs };
485
500
  let requestTimeout: NodeJS.Timeout | undefined;
486
501
  if (requestTimeoutMs !== undefined) {
487
502
  requestTimeout = setTimeout(
@@ -490,17 +505,26 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
490
505
  );
491
506
  }
492
507
  try {
493
- const { data, response, request_id } = await client.chat.completions
494
- .create(params, requestOptions)
495
- .withResponse();
496
- await notifyProviderResponse(options, response, model, request_id);
497
- return data;
498
- } catch (error) {
499
- if (error instanceof OpenAIConnectionTimeoutError && !abortTracker.wasCallerAbort()) {
500
- throw firstEventTimeoutAbortError;
508
+ const headersWithTimeout = { ...headers };
509
+ if (requestTimeoutMs !== undefined) {
510
+ headersWithTimeout["X-Stainless-Timeout"] = Math.floor(requestTimeoutMs / 1000).toString();
501
511
  }
502
- throw error;
512
+ const { events, response, requestId } = await postOpenAIStream<ChatCompletionChunk>({
513
+ url: completionsUrl,
514
+ headers: headersWithTimeout,
515
+ body: params,
516
+ signal: requestSignal,
517
+ fetch: options?.fetch,
518
+ // With a first-event watchdog armed, transport retries must
519
+ // not silently extend the deadline (old SDK `maxRetries: 0`).
520
+ maxAttempts: requestTimeoutMs === undefined ? undefined : 1,
521
+ onSseEvent: rawSseObserver,
522
+ });
523
+ await notifyProviderResponse(options, response, model, requestId);
524
+ return events;
503
525
  } finally {
526
+ // Headers arrived (or the request failed); from here the
527
+ // first-event deadline is enforced by `iterateWithIdleTimeout`.
504
528
  if (requestTimeout !== undefined) clearTimeout(requestTimeout);
505
529
  }
506
530
  };
@@ -511,7 +535,7 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
511
535
  signal: requestSignal,
512
536
  });
513
537
  } catch (error) {
514
- const capturedErrorResponse = getCapturedErrorResponse();
538
+ const capturedErrorResponse = error instanceof OpenAIHttpError ? error.captured : undefined;
515
539
  if (
516
540
  isOpenRouterAnthropicModel(model) &&
517
541
  !disableStrictTools &&
@@ -525,9 +549,15 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
525
549
  disableStrictTools = true;
526
550
  openaiStream = await createCompletionsStream("none");
527
551
  } else {
528
- if (!shouldRetryWithoutStrictTools(error, capturedErrorResponse, appliedToolStrictMode, context.tools)) {
552
+ if (!shouldRetryWithoutStrictTools(error, capturedErrorResponse, appliedStrictTools, context.tools)) {
529
553
  throw error;
530
554
  }
555
+ // Remember the rejection for the rest of the session so every
556
+ // subsequent request doesn't pay a strict-400 + retry round-trip.
557
+ if (providerSessionState) {
558
+ providerSessionState.strictToolsDisabled = true;
559
+ }
560
+ disableStrictTools = true;
531
561
  openaiStream = await createCompletionsStream("none");
532
562
  }
533
563
  }
@@ -536,13 +566,12 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
536
566
  }
537
567
  stream.push({ type: "start", partial: output });
538
568
 
539
- const parseMiniMaxThinkTags = model.provider === "minimax-code" || model.provider === "minimax-code-cn";
540
569
  // Some OpenAI-compatible DeepSeek hosts (including NVIDIA NIM and DeepSeek's
541
570
  // native API) leak chat-template tool-call markers in `delta.content` even
542
571
  // though tool calls are also surfaced structurally. Strip the leaked markers
543
572
  // so users don't see raw `<|...|>` tokens.
544
573
  const stripDeepseekChatTemplateTokens =
545
- /deepseek/i.test(model.id) && (model.provider === "nvidia" || model.provider === "deepseek");
574
+ isDeepseekModelIdOrName(model.id) && (model.provider === "nvidia" || model.provider === "deepseek");
546
575
  type ToolCallStreamBlock = ToolCall & {
547
576
  partialArgs?: string | Record<string, unknown>;
548
577
  streamIndex?: number;
@@ -560,6 +589,20 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
560
589
  if (block.partialArgs === undefined) return;
561
590
  const contentIndex = blockIndex(block);
562
591
  if (contentIndex < 0) return;
592
+ // Object-shaped `partialArgs` came from MiniMax-compatible hosts that stream
593
+ // `function.arguments` as an object. The per-chunk handler holds them with an
594
+ // empty wire delta (see the object branch below) because emitting each chunk's
595
+ // `JSON.stringify(rawArgs)` would feed concat-based downstream consumers
596
+ // (proxy.ts, openai-chat-server, openai-responses-server, anthropic-messages-server)
597
+ // an invalid concatenation like `{"input":"a"}{"input":"b"}`. Flush the final
598
+ // merged object as one concat-safe delta now so those consumers reconstruct the
599
+ // args correctly before observing `toolcall_end`.
600
+ if (typeof block.partialArgs === "object" && !Array.isArray(block.partialArgs)) {
601
+ const fullJson = JSON.stringify(block.partialArgs);
602
+ if (fullJson.length > 0 && fullJson !== "{}") {
603
+ stream.push({ type: "toolcall_delta", contentIndex, delta: fullJson, partial: output });
604
+ }
605
+ }
563
606
  block.arguments =
564
607
  typeof block.partialArgs === "string" ? parseStreamingJson(block.partialArgs) : block.partialArgs;
565
608
  delete block.partialArgs;
@@ -591,13 +634,21 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
591
634
  }
592
635
  finishToolCallBlock(block);
593
636
  };
637
+ finishOpenBlocksOnError = () => {
638
+ if (currentBlock?.type !== "toolCall") finishCurrentBlock(currentBlock);
639
+ finishPendingToolCallBlocks();
640
+ };
594
641
  const appendText = (
595
642
  message: AssistantMessage,
596
643
  eventStream: AssistantMessageEventStream,
597
644
  text: string,
598
645
  ): void => {
599
646
  if (currentBlock?.type !== "text") {
600
- finishCurrentBlock(currentBlock);
647
+ // Leave toolCall blocks pending across text transitions: chunks after
648
+ // the first typically carry only `index`, so a finished (de-registered)
649
+ // call would be reborn as a nameless phantom block when its arguments
650
+ // resume. The stream-end sweep finalizes pending calls.
651
+ if (currentBlock?.type !== "toolCall") finishCurrentBlock(currentBlock);
601
652
  currentBlock = { type: "text", text: "" };
602
653
  message.content.push(currentBlock);
603
654
  eventStream.push({ type: "text_start", contentIndex: blockIndex(currentBlock), partial: message });
@@ -620,7 +671,9 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
620
671
  currentBlock?.type !== "thinking" ||
621
672
  (signature !== undefined && currentBlock.thinkingSignature !== signature)
622
673
  ) {
623
- finishCurrentBlock(currentBlock);
674
+ // Same as appendText: leave toolCall blocks pending so index-only
675
+ // continuation deltas can still find them.
676
+ if (currentBlock?.type !== "toolCall") finishCurrentBlock(currentBlock);
624
677
  currentBlock = { type: "thinking", thinking: "", thinkingSignature: signature };
625
678
  message.content.push(currentBlock);
626
679
  eventStream.push({
@@ -646,10 +699,32 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
646
699
  if (!firstTokenTime) firstTokenTime = Date.now();
647
700
  appendText(output, stream, text);
648
701
  };
649
- const appendThinkingDelta = (thinking: string, signature?: string): void => {
702
+ // Tracks the last full cumulative reasoning snapshot per signature (the
703
+ // reasoning field name) so dedup survives block transitions. Required
704
+ // for MiniMax-M3: once `</think>` and visible text arrive, currentBlock
705
+ // flips to "text", but later chunks keep carrying the same cumulative
706
+ // `reasoning_content` snapshot. Without an external tracker the guard
707
+ // below misses and the snapshot gets re-emitted as a fresh thinking
708
+ // block after the answer has started.
709
+ const lastCumulativeReasoningBySignature = new Map<string, string>();
710
+ const appendThinkingDelta = (
711
+ thinking: string,
712
+ signature?: string,
713
+ source: "delta" | "cumulative" = "delta",
714
+ ): void => {
650
715
  if (!thinking) return;
716
+ let emittedThinking = thinking;
717
+ if (source === "cumulative") {
718
+ const key = signature ?? "";
719
+ const lastSnapshot = lastCumulativeReasoningBySignature.get(key) ?? "";
720
+ if (thinking.startsWith(lastSnapshot)) {
721
+ emittedThinking = thinking.slice(lastSnapshot.length);
722
+ }
723
+ lastCumulativeReasoningBySignature.set(key, thinking);
724
+ if (!emittedThinking) return;
725
+ }
651
726
  if (!firstTokenTime) firstTokenTime = Date.now();
652
- appendThinking(output, stream, thinking, signature);
727
+ appendThinking(output, stream, emittedThinking, signature);
653
728
  };
654
729
 
655
730
  let deepseekStripBuffer = "";
@@ -676,13 +751,11 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
676
751
  appendTextDelta(processedText);
677
752
  }
678
753
  };
679
-
680
- const streamMarkupHealingPattern = getStreamMarkupHealingPattern(model.provider, model.id, {
681
- parseThinkingTags: parseMiniMaxThinkTags,
682
- });
754
+ const streamMarkupHealingPattern = getStreamMarkupHealingPattern(model.provider, model.id);
683
755
  const streamMarkupHealing = streamMarkupHealingPattern
684
756
  ? new StreamMarkupHealing({ pattern: streamMarkupHealingPattern })
685
757
  : undefined;
758
+ const explicitReasoningDeltasMayBeCumulative = modelMayLeakThinkingTags(model.provider, model.id);
686
759
  let healedToolCallEmitted = false;
687
760
  const emitHealedToolCall = (call: HealedToolCall): void => {
688
761
  finishCurrentBlock(currentBlock);
@@ -722,7 +795,12 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
722
795
  for (const call of calls) emitHealedToolCall(call);
723
796
  };
724
797
 
725
- for await (const chunk of iterateWithIdleTimeout(openaiStream, {
798
+ // Terminal-chunk bookkeeping for the post-finish grace window below.
799
+ // `streamFinishedAt` flips when a chunk carries `finish_reason`;
800
+ // `sawUsagePayload` flips when any usage payload was parsed.
801
+ let streamFinishedAt: number | undefined;
802
+ let sawUsagePayload = false;
803
+ const timedOpenaiStream = iterateWithIdleTimeout(openaiStream, {
726
804
  idleTimeoutMs,
727
805
  firstItemTimeoutMs: firstEventTimeoutMs,
728
806
  firstItemErrorMessage: OPENAI_COMPLETIONS_FIRST_EVENT_TIMEOUT_MESSAGE,
@@ -731,24 +809,48 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
731
809
  onFirstItemTimeout: () => abortTracker.abortLocally(firstEventTimeoutAbortError),
732
810
  abortSignal: options?.signal,
733
811
  isProgressItem: isOpenAICompletionsProgressChunk,
734
- })) {
812
+ });
813
+ const terminalAwareStream = iterateWithTerminalGrace(timedOpenaiStream, {
814
+ finishedAtMs: () => streamFinishedAt,
815
+ graceMs: OPENAI_COMPLETIONS_POST_FINISH_GRACE_MS,
816
+ // The inner idle-timeout generator is parked mid-`next()` when the
817
+ // grace window closes, so abort the transport to settle that read
818
+ // and release the socket immediately (a queued `.return()` alone
819
+ // would wait on the never-arriving next chunk).
820
+ onGraceEnd: () => requestAbortController.abort(),
821
+ });
822
+ for await (const chunk of terminalAwareStream) {
735
823
  if (!chunk || typeof chunk !== "object") continue;
736
824
 
737
825
  // OpenAI documents ChatCompletionChunk.id as the unique chat completion identifier,
738
826
  // and each chunk in a streamed completion carries the same id.
739
827
  output.responseId ||= chunk.id;
740
828
 
829
+ // Aggregators (OpenRouter, Vercel AI Gateway, …) report the upstream
830
+ // provider that actually served the request via a top-level `provider`
831
+ // field present on every chunk. Capture the first non-empty value so
832
+ // callers can attribute routing without re-parsing the raw stream.
833
+ output.upstreamProvider ||= getOptionalStringProperty(chunk, "provider");
834
+
741
835
  if (chunk.usage) {
742
836
  output.usage = parseChunkUsage(chunk.usage, model, premiumRequestsTotal);
837
+ sawUsagePayload = true;
743
838
  }
744
839
 
745
840
  const choice = Array.isArray(chunk.choices) ? chunk.choices[0] : undefined;
746
- if (!choice) continue;
841
+ if (!choice) {
842
+ // Trailing usage-only chunk (`stream_options.include_usage`) after
843
+ // `finish_reason`: the response is complete — stop pulling instead
844
+ // of waiting for `[DONE]`/close from hosts that never send either.
845
+ if (streamFinishedAt !== undefined && sawUsagePayload) break;
846
+ continue;
847
+ }
747
848
 
748
849
  if (!chunk.usage) {
749
850
  const choiceUsage = getChoiceUsage(choice);
750
851
  if (choiceUsage) {
751
852
  output.usage = parseChunkUsage(choiceUsage, model, premiumRequestsTotal);
853
+ sawUsagePayload = true;
752
854
  }
753
855
  }
754
856
 
@@ -758,14 +860,42 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
758
860
  if (finishReasonResult.errorMessage) {
759
861
  output.errorMessage = finishReasonResult.errorMessage;
760
862
  }
863
+ streamFinishedAt ??= Date.now();
761
864
  }
762
865
 
763
866
  if (choice.delta) {
867
+ // Some endpoints return reasoning in reasoning_content (llama.cpp),
868
+ // or reasoning (other OpenAI-compatible endpoints). Use the first
869
+ // non-empty reasoning field to avoid duplication when a chunk carries
870
+ // multiple aliases for the same reasoning text.
871
+ const reasoningFields = ["reasoning_content", "reasoning", "reasoning_text"];
872
+ const deltaRecord = choice.delta as Record<string, unknown>;
873
+ let foundReasoningField: string | undefined;
874
+ let foundReasoningDelta = "";
875
+ for (const field of reasoningFields) {
876
+ const reasoningDelta = deltaRecord[field];
877
+ if (typeof reasoningDelta === "string" && reasoningDelta.length > 0) {
878
+ foundReasoningField = field;
879
+ foundReasoningDelta = reasoningDelta;
880
+ break;
881
+ }
882
+ }
883
+
884
+ if (foundReasoningField) {
885
+ appendThinkingDelta(
886
+ foundReasoningDelta,
887
+ foundReasoningField,
888
+ explicitReasoningDeltasMayBeCumulative ? "cumulative" : "delta",
889
+ );
890
+ }
891
+
764
892
  const normalizedDeltaText = normalizeStreamingContentText(choice.delta.content);
765
893
  if (normalizedDeltaText.length > 0) {
766
894
  if (!firstTokenTime) firstTokenTime = Date.now();
767
895
  const hasStructuredToolCalls =
768
896
  Array.isArray(choice.delta.tool_calls) && choice.delta.tool_calls.length > 0;
897
+ const suppressContentThinking =
898
+ foundReasoningField !== undefined && streamMarkupHealing?.pattern === "thinking";
769
899
 
770
900
  if (streamMarkupHealing) {
771
901
  if (hasStructuredToolCalls) {
@@ -776,6 +906,7 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
776
906
  appendProcessedText(streamMarkupHealing.consumeWithoutCalls(normalizedDeltaText));
777
907
  } else {
778
908
  for (const event of streamMarkupHealing.feedEvents(normalizedDeltaText)) {
909
+ if (suppressContentThinking && event.type === "thinking") continue;
779
910
  emitHealingEvent(event);
780
911
  }
781
912
  }
@@ -784,30 +915,6 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
784
915
  }
785
916
  }
786
917
 
787
- // Some endpoints return reasoning in reasoning_content (llama.cpp),
788
- // or reasoning (other openai compatible endpoints)
789
- // Use the first non-empty reasoning field to avoid duplication
790
- // (e.g., chutes.ai returns both reasoning_content and reasoning with same content)
791
- const reasoningFields = ["reasoning_content", "reasoning", "reasoning_text"];
792
- let foundReasoningField: string | null = null;
793
- for (const field of reasoningFields) {
794
- if (
795
- (choice.delta as any)[field] !== null &&
796
- (choice.delta as any)[field] !== undefined &&
797
- (choice.delta as any)[field].length > 0
798
- ) {
799
- if (!foundReasoningField) {
800
- foundReasoningField = field;
801
- break;
802
- }
803
- }
804
- }
805
-
806
- if (foundReasoningField) {
807
- const delta = (choice.delta as any)[foundReasoningField];
808
- appendThinkingDelta(delta, foundReasoningField);
809
- }
810
-
811
918
  if (choice?.delta?.tool_calls && choice.delta.tool_calls.length > 0) {
812
919
  for (const toolCall of choice.delta.tool_calls) {
813
920
  const streamIndex = typeof toolCall.index === "number" ? toolCall.index : undefined;
@@ -845,6 +952,11 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
845
952
  partial: output,
846
953
  });
847
954
  } else {
955
+ // Resuming a pending call after interleaved text/thinking:
956
+ // close the text/thinking block we drifted into.
957
+ if (currentBlock !== block && currentBlock && currentBlock.type !== "toolCall") {
958
+ finishCurrentBlock(currentBlock);
959
+ }
848
960
  currentBlock = block;
849
961
  if (streamIndex !== undefined && block.streamIndex === undefined) {
850
962
  block.streamIndex = streamIndex;
@@ -871,13 +983,37 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
871
983
  }
872
984
  }
873
985
  } else if (rawArgs && typeof rawArgs === "object" && !Array.isArray(rawArgs)) {
874
- // MiniMax-compatible hosts stream `function.arguments` as a complete object in a
875
- // single delta instead of the OpenAI JSON-string contract. Hold the object directly
876
- // no `[object Object]` round-trip through the string buffer and serialize once for
877
- // the wire delta that proxy servers forward verbatim as `input_json_delta`.
878
- block.partialArgs = rawArgs;
879
- block.arguments = rawArgs;
880
- delta = JSON.stringify(rawArgs);
986
+ // MiniMax-compatible hosts stream `function.arguments` as an object instead of the
987
+ // OpenAI JSON-string contract. Most chunks carry the complete object in one delta,
988
+ // but we cannot rely on that: replacing per-chunk drops earlier keys (and earlier
989
+ // string content for the same key) when the host fragments the args across deltas.
990
+ // Shallow-merge into the accumulated object; for shared string keys, detect
991
+ // cumulative-vs-delta semantics with `startsWith` so we neither duplicate cumulative
992
+ // payloads nor lose delta fragments. Degenerates to the previous "last wins"
993
+ // behaviour for the common single-chunk shape (no prior value to merge with).
994
+ //
995
+ // `delta` stays empty here: emitting `JSON.stringify(rawArgs)` per chunk feeds
996
+ // downstream concat-based accumulators (proxy.ts, openai-chat-server,
997
+ // openai-responses-server, anthropic-messages-server) an invalid sequence like
998
+ // `{"input":"a"}{"input":"b"}`. The merged object is flushed as a single
999
+ // concat-safe delta in `finishToolCallBlock` before `toolcall_end` instead.
1000
+ const prev =
1001
+ block.partialArgs &&
1002
+ typeof block.partialArgs === "object" &&
1003
+ !Array.isArray(block.partialArgs)
1004
+ ? (block.partialArgs as Record<string, unknown>)
1005
+ : undefined;
1006
+ const merged: Record<string, unknown> = prev ? { ...prev } : {};
1007
+ for (const [key, value] of Object.entries(rawArgs)) {
1008
+ const prevValue = merged[key];
1009
+ if (typeof prevValue === "string" && typeof value === "string") {
1010
+ merged[key] = value.startsWith(prevValue) ? value : prevValue + value;
1011
+ } else {
1012
+ merged[key] = value;
1013
+ }
1014
+ }
1015
+ block.partialArgs = merged;
1016
+ block.arguments = merged;
881
1017
  }
882
1018
  stream.push({
883
1019
  type: "toolcall_delta",
@@ -902,6 +1038,12 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
902
1038
  }
903
1039
  }
904
1040
  }
1041
+
1042
+ // `finish_reason` + usage both observed: the chat-completions
1043
+ // contract has nothing left to deliver. Break instead of waiting
1044
+ // for `[DONE]`/connection close so hosts that hold the socket open
1045
+ // can't park the turn until the idle watchdog errors it out.
1046
+ if (streamFinishedAt !== undefined && sawUsagePayload) break;
905
1047
  }
906
1048
 
907
1049
  if (streamMarkupHealing) {
@@ -962,13 +1104,20 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
962
1104
  stream.push({ type: "done", reason: output.stopReason, message: output });
963
1105
  stream.end();
964
1106
  } catch (error) {
1107
+ // Close open blocks first so consumers tracking text_/thinking_/toolcall_
1108
+ // lifecycles never see orphaned starts on the error path. Best-effort: a
1109
+ // throw here must not prevent the terminal error event below.
1110
+ try {
1111
+ finishOpenBlocksOnError();
1112
+ } catch {}
965
1113
  for (const block of output.content) delete (block as any).index;
966
1114
  const firstEventTimeoutError = abortTracker.getLocalAbortReason();
967
1115
  output.stopReason = abortTracker.wasCallerAbort() ? "aborted" : "error";
968
- output.errorStatus = extractHttpStatusFromError(error) ?? getCapturedErrorResponse?.()?.status;
1116
+ const capturedErrorResponse = error instanceof OpenAIHttpError ? error.captured : undefined;
1117
+ output.errorStatus = extractHttpStatusFromError(error) ?? capturedErrorResponse?.status;
969
1118
  output.errorMessage =
970
1119
  firstEventTimeoutError?.message ??
971
- (await finalizeErrorMessage(error, rawRequestDump, getCapturedErrorResponse?.()));
1120
+ (await finalizeErrorMessage(error, rawRequestDump, capturedErrorResponse));
972
1121
  // Some providers via OpenRouter include extra details here.
973
1122
  const rawMetadata = (error as { error?: { metadata?: { raw?: string } } })?.error?.metadata?.raw;
974
1123
  if (rawMetadata) output.errorMessage += `\n${rawMetadata}`;
@@ -983,21 +1132,21 @@ export const streamOpenAICompletions: StreamFunction<"openai-completions"> = (
983
1132
  return stream;
984
1133
  };
985
1134
 
986
- async function createClient(
1135
+ async function createRequestSetup(
987
1136
  model: Model<"openai-completions">,
988
1137
  context: Context,
989
1138
  apiKey?: string,
990
1139
  extraHeaders?: Record<string, string>,
991
1140
  initiatorOverride?: MessageAttribution,
992
- onSseEvent?: OpenAICompletionsOptions["onSseEvent"],
993
- fetchOverride?: FetchImpl,
994
1141
  ): Promise<{
995
- client: OpenAI;
996
1142
  copilotPremiumRequests: number | undefined;
997
- baseUrl: string | undefined;
1143
+ baseUrl: string;
1144
+ /** Headers sent on the wire, including `Authorization`. */
1145
+ headers: Record<string, string>;
1146
+ /** Query params appended to the request URL (Azure `api-version`). */
1147
+ query: Record<string, string> | undefined;
1148
+ /** Headers recorded in `rawRequestDump` (sans `Authorization`). */
998
1149
  requestHeaders: Record<string, string>;
999
- getCapturedErrorResponse: () => CapturedHttpErrorResponse | undefined;
1000
- clearCapturedErrorResponse: () => void;
1001
1150
  }> {
1002
1151
  if (!apiKey) {
1003
1152
  if (!$env.OPENAI_API_KEY) {
@@ -1015,12 +1164,12 @@ async function createClient(
1015
1164
  // analytics. `HTTP-Referer` is the unique app identifier; without it nothing is
1016
1165
  // tracked. `X-OpenRouter-Title` is the display name (`X-Title` is the legacy
1017
1166
  // alias kept for back-compat). `X-OpenRouter-Categories` slots us into the
1018
- // `cli-agent` marketplace category. `User-Agent` overrides the default OpenAI
1019
- // SDK UA so traffic is identifiable in upstream provider logs.
1167
+ // `cli-agent` marketplace category. `User-Agent` makes our traffic
1168
+ // identifiable in upstream provider logs.
1020
1169
  // https://openrouter.ai/docs/app-attribution
1021
- headers["User-Agent"] = `${APP_DISPLAY_NAME}/${packageJson.version}`;
1022
- headers["HTTP-Referer"] = "https://prometheus.trivlab.com/";
1023
- headers["X-OpenRouter-Title"] = APP_DISPLAY_NAME;
1170
+ headers["User-Agent"] = `Prometheus/${packageJson.version}`;
1171
+ headers["HTTP-Referer"] = "https://prometheus.sh/";
1172
+ headers["X-OpenRouter-Title"] = "Prometheus";
1024
1173
  headers["X-OpenRouter-Categories"] = "cli-agent";
1025
1174
  // Always-on response caching: identical requests return cached responses for free.
1026
1175
  // TTL 1h; first call hits the provider, every identical call within the window
@@ -1055,114 +1204,68 @@ async function createClient(
1055
1204
  if (baseUrl?.includes(".openai.azure.com")) {
1056
1205
  const apiVersion = $env.AZURE_OPENAI_API_VERSION || "2024-10-21";
1057
1206
  if (!baseUrl.includes("/deployments/")) {
1058
- baseUrl = `${baseUrl}/deployments/${model.id}`;
1207
+ // Honor AZURE_OPENAI_DEPLOYMENT_NAME_MAP like the responses provider:
1208
+ // deployment names routinely differ from catalog model ids.
1209
+ const deploymentName =
1210
+ parseAzureDeploymentNameMap($env.AZURE_OPENAI_DEPLOYMENT_NAME_MAP).get(model.id) ?? model.id;
1211
+ baseUrl = `${baseUrl}/deployments/${deploymentName}`;
1059
1212
  }
1060
1213
  azureDefaultQuery = { "api-version": apiVersion };
1061
1214
  }
1062
- let capturedErrorResponse: CapturedHttpErrorResponse | undefined;
1063
- const baseFetch = fetchOverride ?? fetch;
1064
- const wrappedFetch = Object.assign(
1065
- async (input: string | URL | Request, init?: RequestInit): Promise<Response> => {
1066
- const response = await baseFetch(input, init);
1067
- if (response.ok) {
1068
- capturedErrorResponse = undefined;
1069
- return response;
1070
- }
1071
- let bodyText: string | undefined;
1072
- let bodyJson: unknown;
1073
- try {
1074
- bodyText = await response.clone().text();
1075
- if (bodyText.trim().length > 0) {
1076
- try {
1077
- bodyJson = JSON.parse(bodyText);
1078
- } catch {}
1079
- }
1080
- } catch {}
1081
- capturedErrorResponse = {
1082
- status: response.status,
1083
- headers: response.headers,
1084
- bodyText,
1085
- bodyJson,
1086
- };
1087
- return response;
1088
- },
1089
- baseFetch.preconnect ? { preconnect: baseFetch.preconnect } : {},
1090
- );
1091
- const debugFetch = onSseEvent ? wrapFetchForSseDebug(wrappedFetch, event => onSseEvent(event, model)) : wrappedFetch;
1215
+ // The removed SDK client resolved its base URL as
1216
+ // `baseURL ?? $OPENAI_BASE_URL ?? https://api.openai.com/v1`; keep that
1217
+ // resolution explicit now that we build the request URL ourselves.
1218
+ const resolvedBaseUrl = baseUrl ?? ($env.OPENAI_BASE_URL?.trim() || "https://api.openai.com/v1");
1092
1219
  return {
1093
- client: new OpenAI({
1094
- apiKey,
1095
- baseURL: baseUrl,
1096
- dangerouslyAllowBrowser: true,
1097
- maxRetries: 5,
1098
- defaultHeaders: headers,
1099
- defaultQuery: azureDefaultQuery,
1100
- fetch: debugFetch,
1101
- }),
1102
1220
  copilotPremiumRequests,
1103
- baseUrl,
1221
+ baseUrl: resolvedBaseUrl,
1222
+ headers: { Authorization: `Bearer ${apiKey}`, ...headers },
1223
+ query: azureDefaultQuery,
1104
1224
  requestHeaders: headers,
1105
- getCapturedErrorResponse: () => capturedErrorResponse,
1106
- clearCapturedErrorResponse: () => {
1107
- capturedErrorResponse = undefined;
1108
- },
1109
1225
  };
1110
1226
  }
1111
1227
 
1228
+ function getForcedCompletionsToolName(toolChoice: OpenAICompletionsParams["tool_choice"]): string | undefined {
1229
+ if (typeof toolChoice !== "object" || toolChoice === null || !("function" in toolChoice)) return undefined;
1230
+ return toolChoice.function.name;
1231
+ }
1232
+
1112
1233
  function buildParams(
1113
1234
  model: Model<"openai-completions">,
1114
1235
  context: Context,
1115
1236
  options: OpenAICompletionsOptions | undefined,
1116
- resolvedBaseUrl?: string,
1117
1237
  toolStrictModeOverride?: ToolStrictModeOverride,
1118
- ): { params: OpenAICompletionsParams; toolStrictMode: AppliedToolStrictMode } {
1119
- const compat = getCompat(model, resolvedBaseUrl);
1120
- // Opencode Zen's gateway (https://opencode.ai/zen/go/v1) gates
1121
- // `reasoning_content` on the request's thinking state for every model it
1122
- // fronts (Kimi K2.x, DeepSeek V4, GLM-5.x, Qwen3.x, MiMo, MiniMax, …): it
1123
- // 400s with `Extra inputs are not permitted` when thinking is off but the
1124
- // field is supplied (#1071), and 400s with `thinking is enabled but
1125
- // reasoning_content is missing in assistant tool call message at index N`
1126
- // (#1484) when thinking is on and the field is absent. `detectOpenAICompat`
1127
- // only set `requiresReasoningContentForToolCalls` for the DeepSeek family
1128
- // (and previously for Kimi until #1071 carved out opencode); reactivate it
1129
- // per request for every opencode model whenever this turn is in thinking
1130
- // mode so prior tool-call turns replay reasoning_content. Forced-tool
1131
- // turns are excluded because the later `disableReasoningOnForcedToolChoice`
1132
- // guard at the bottom of `buildParams` strips thinking from the wire body
1133
- // for Kimi-style models — keeping the replay on under those conditions
1134
- // would resurrect the #1071 failure.
1135
- //
1136
- // `allowsSyntheticReasoningContentForToolCalls` is forced to `false` on
1137
- // the same path: the gateway specifically requires `reasoning_content`,
1138
- // and the default synthetic-friendly behavior would echo whichever field
1139
- // the upstream streamed (e.g. `reasoning` for many opencode turns),
1140
- // landing the replay in the wrong key and re-triggering the 400.
1141
- const isOpenCodeProvider = model.provider === "opencode-go" || model.provider === "opencode-zen";
1238
+ ): { params: OpenAICompletionsParams; toolStrictMode: AppliedToolStrictMode; strictToolsApplied: boolean } {
1239
+ let compat = model.compat;
1142
1240
  const thinkingEnabledForRequest =
1143
1241
  Boolean(options?.reasoning) && !options?.disableReasoning && Boolean(model.reasoning);
1144
1242
  const forcedToolChoiceSuppressesThinking =
1145
1243
  compat.disableReasoningOnForcedToolChoice &&
1244
+ compat.supportsForcedToolChoice &&
1146
1245
  isForcedToolChoice(mapToOpenAICompletionsToolChoice(options?.toolChoice));
1147
- if (isOpenCodeProvider && thinkingEnabledForRequest && !forcedToolChoiceSuppressesThinking) {
1148
- compat.requiresReasoningContentForToolCalls = true;
1149
- compat.allowsSyntheticReasoningContentForToolCalls = false;
1150
- compat.reasoningContentField = "reasoning_content";
1246
+ if (compat.whenThinking && thinkingEnabledForRequest && !forcedToolChoiceSuppressesThinking) {
1247
+ compat = compat.whenThinking; // precomputed at model build — pointer swap, no allocation
1151
1248
  }
1152
- const isKimiModelId = model.id.includes("moonshotai/kimi") || /(^|\/)kimi[-.]/i.test(model.id);
1153
1249
  const messages = convertMessages(model, context, compat);
1154
1250
  maybeAddAnthropicCacheControl(compat, messages);
1155
- const supportsReasoningParams = model.provider !== "github-copilot";
1156
-
1157
- // Kimi (including via OpenRouter and Fireworks router-form IDs such as
1158
- // `accounts/fireworks/routers/kimi-*`) calculates TPM rate limits based on
1159
- // max_tokens, not actual output. The official Kimi K2 model guidance
1160
- // (https://docs.fireworks.ai/models/kimi-k2) also requires `max_tokens` for
1161
- // every call since the family can otherwise emit very long reasoning traces
1162
- // before the final answer. Always send max_tokens for models that match the same
1163
- // Kimi-family regex used by the compat detector.
1164
- // Note: Direct kimi-code provider is handled by the dedicated Kimi provider in kimi.ts.
1165
- const effectiveMaxTokens = options?.maxTokens ?? (isKimiModelId ? model.maxTokens : undefined);
1251
+ const supportsReasoningParams = compat.supportsReasoningParams;
1252
+
1253
+ // Kimi-family models calculate TPM rate limits from max_tokens (not actual
1254
+ // output) and the official guidance requires sending it on every call —
1255
+ // `compat.alwaysSendMaxTokens` carries that detection.
1256
+ const requestedMaxTokens =
1257
+ options?.maxTokens ?? (compat.alwaysSendMaxTokens ? (model.maxTokens ?? OPENAI_MAX_OUTPUT_TOKENS) : undefined);
1258
+ // OpenRouter fans out to upstreams whose output caps differ from the catalog
1259
+ // value (which tracks the highest-cap provider). A max_tokens above the routed
1260
+ // upstream's cap makes OpenRouter silently skip that provider (e.g. Cerebras
1261
+ // GLM-4.7, ~40k) for a higher-cap one, defeating `provider.order`/`only`. Omit
1262
+ // it for OpenRouter so each upstream self-caps and routing is honored — unless
1263
+ // the model always requires max_tokens (Kimi TPM accounting, see above).
1264
+ const omitMaxTokensForRouting = compat.isOpenRouterHost && !compat.alwaysSendMaxTokens;
1265
+ const effectiveMaxTokens =
1266
+ requestedMaxTokens === undefined || omitMaxTokensForRouting
1267
+ ? undefined
1268
+ : Math.min(requestedMaxTokens, model.maxTokens ?? Number.POSITIVE_INFINITY, OPENAI_MAX_OUTPUT_TOKENS);
1166
1269
 
1167
1270
  const requestModelId = resolveOpenAICompletionsModelId(model, options);
1168
1271
  const params: OpenAICompletionsParams = {
@@ -1171,6 +1274,7 @@ function buildParams(
1171
1274
  stream: true,
1172
1275
  };
1173
1276
  let toolStrictMode: AppliedToolStrictMode = "none";
1277
+ let strictToolsApplied = false;
1174
1278
 
1175
1279
  if (compat.supportsUsageInStreaming !== false) {
1176
1280
  params.stream_options = { include_usage: true };
@@ -1224,6 +1328,7 @@ function buildParams(
1224
1328
  const builtTools = convertTools(context.tools, compat, toolStrictModeOverride);
1225
1329
  params.tools = builtTools.tools;
1226
1330
  toolStrictMode = builtTools.toolStrictMode;
1331
+ strictToolsApplied = builtTools.strictToolsApplied;
1227
1332
  } else if (context.tools === undefined && hasToolHistory(context.messages)) {
1228
1333
  // Anthropic (via LiteLLM/proxy) requires the `tools` param when the conversation
1229
1334
  // contains tool_calls/tool_results, even when no tools are offered this turn.
@@ -1238,6 +1343,12 @@ function buildParams(
1238
1343
  if (options?.toolChoice && compat.supportsToolChoice) {
1239
1344
  params.tool_choice = mapToOpenAICompletionsToolChoice(options.toolChoice);
1240
1345
  }
1346
+ if (isForcedToolChoice(params.tool_choice) && !compat.supportsForcedToolChoice) {
1347
+ // Some thinking-required OpenAI-compatible models reject forced
1348
+ // `tool_choice` while still accepting tools with the default auto
1349
+ // selector. Keep the tool available and let the model choose it.
1350
+ params.tool_choice = "auto";
1351
+ }
1241
1352
 
1242
1353
  if (params.tool_choice === "none" && (!Array.isArray(params.tools) || params.tools.length === 0)) {
1243
1354
  // `tool_choice: "none"` with no tools to gate is redundant and also
@@ -1251,6 +1362,19 @@ function buildParams(
1251
1362
  delete params.tool_choice;
1252
1363
  }
1253
1364
 
1365
+ const forcedToolName = getForcedCompletionsToolName(params.tool_choice);
1366
+ if (
1367
+ forcedToolName !== undefined &&
1368
+ (!Array.isArray(params.tools) ||
1369
+ !params.tools.some(tool => tool.type === "function" && tool.function.name === forcedToolName))
1370
+ ) {
1371
+ // A forced named tool_choice is only valid when the same request offers
1372
+ // that function in `tools`. Active-tool filtering normally enforces this
1373
+ // before provider dispatch; this guard keeps raw provider callers from
1374
+ // emitting a self-inconsistent OpenAI-compatible payload.
1375
+ delete params.tool_choice;
1376
+ }
1377
+
1254
1378
  if (supportsReasoningParams && compat.thinkingFormat === "zai" && model.reasoning) {
1255
1379
  // Z.ai uses binary thinking: { type: "enabled" | "disabled" }
1256
1380
  // Must explicitly disable since z.ai defaults to thinking enabled.
@@ -1278,7 +1402,10 @@ function buildParams(
1278
1402
  openRouterParams.reasoning = { enabled: false };
1279
1403
  } else if (options?.reasoning) {
1280
1404
  openRouterParams.reasoning = {
1281
- effort: mapReasoningEffort(options.reasoning, compat.reasoningEffortMap),
1405
+ effort:
1406
+ compat.reasoningEffortMap?.[options.reasoning] ??
1407
+ model.thinking?.effortMap?.[options.reasoning] ??
1408
+ options.reasoning,
1282
1409
  };
1283
1410
  }
1284
1411
  } else if (
@@ -1289,7 +1416,9 @@ function buildParams(
1289
1416
  compat.supportsReasoningEffort
1290
1417
  ) {
1291
1418
  // OpenAI-style reasoning_effort
1292
- params.reasoning_effort = mapReasoningEffort(options.reasoning, compat.reasoningEffortMap) as Effort;
1419
+ params.reasoning_effort = (compat.reasoningEffortMap?.[options.reasoning] ??
1420
+ model.thinking?.effortMap?.[options.reasoning] ??
1421
+ options.reasoning) as Effort;
1293
1422
  } else if (
1294
1423
  supportsReasoningParams &&
1295
1424
  options?.disableReasoning &&
@@ -1304,7 +1433,9 @@ function buildParams(
1304
1433
  if (minEffort === undefined) {
1305
1434
  throw new Error(`Model ${model.provider}/${model.id} has no supported reasoning efforts`);
1306
1435
  }
1307
- params.reasoning_effort = mapReasoningEffort(minEffort, compat.reasoningEffortMap) as Effort;
1436
+ params.reasoning_effort = (compat.reasoningEffortMap?.[minEffort] ??
1437
+ model.thinking?.effortMap?.[minEffort] ??
1438
+ minEffort) as Effort;
1308
1439
  }
1309
1440
 
1310
1441
  if (compat.disableReasoningOnToolChoice && params.tool_choice !== undefined) {
@@ -1327,13 +1458,13 @@ function buildParams(
1327
1458
  }
1328
1459
 
1329
1460
  // OpenRouter provider routing preferences
1330
- if (model.baseUrl.includes("openrouter.ai") && compat.openRouterRouting) {
1461
+ if (compat.isOpenRouterHost && compat.openRouterRouting) {
1331
1462
  params.provider = compat.openRouterRouting;
1332
1463
  }
1333
1464
 
1334
1465
  // Vercel AI Gateway provider routing preferences
1335
- if (model.baseUrl.includes("ai-gateway.vercel.sh") && model.compat?.vercelGatewayRouting) {
1336
- const routing = model.compat.vercelGatewayRouting;
1466
+ if (compat.isVercelGatewayHost && compat.vercelGatewayRouting) {
1467
+ const routing = compat.vercelGatewayRouting;
1337
1468
  if (routing.only || routing.order) {
1338
1469
  const gatewayOptions: Record<string, string[]> = {};
1339
1470
  if (routing.only) gatewayOptions.only = routing.only;
@@ -1344,9 +1475,14 @@ function buildParams(
1344
1475
 
1345
1476
  if (compat.extraBody) {
1346
1477
  Object.assign(params, compat.extraBody);
1478
+ if (model.provider === "fireworks" && params.reasoning_effort !== undefined) {
1479
+ // Fireworks rejects simultaneous DeepSeek-style `thinking` toggles and
1480
+ // OpenAI-style `reasoning_effort`; the effort field carries the user's level.
1481
+ delete params.thinking;
1482
+ }
1347
1483
  }
1348
1484
 
1349
- return { params, toolStrictMode };
1485
+ return { params, toolStrictMode, strictToolsApplied };
1350
1486
  }
1351
1487
 
1352
1488
  function getOptionalNumberProperty(value: object, key: string): number | undefined {
@@ -1354,6 +1490,11 @@ function getOptionalNumberProperty(value: object, key: string): number | undefin
1354
1490
  return typeof property === "number" ? property : undefined;
1355
1491
  }
1356
1492
 
1493
+ function getOptionalStringProperty(value: object, key: string): string | undefined {
1494
+ const property = Reflect.get(value, key);
1495
+ return typeof property === "string" && property.length > 0 ? property : undefined;
1496
+ }
1497
+
1357
1498
  function getOptionalObjectProperty(value: object, key: string): object | undefined {
1358
1499
  const property = Reflect.get(value, key);
1359
1500
  return typeof property === "object" && property !== null ? property : undefined;
@@ -1430,13 +1571,6 @@ export function parseChunkUsage(
1430
1571
  return usage;
1431
1572
  }
1432
1573
 
1433
- function mapReasoningEffort(
1434
- effort: NonNullable<OpenAICompletionsOptions["reasoning"]>,
1435
- reasoningEffortMap: Partial<Record<NonNullable<OpenAICompletionsOptions["reasoning"]>, string>>,
1436
- ): string {
1437
- return reasoningEffortMap[effort] ?? effort;
1438
- }
1439
-
1440
1574
  function maybeAddAnthropicCacheControl(compat: ResolvedOpenAICompat, messages: ChatCompletionMessageParam[]): void {
1441
1575
  if (compat.cacheControlFormat !== "anthropic") return;
1442
1576
  // Anthropic-style caching requires cache_control on a text part. Add a breakpoint
@@ -1447,6 +1581,7 @@ function maybeAddAnthropicCacheControl(compat: ResolvedOpenAICompat, messages: C
1447
1581
 
1448
1582
  const content = msg.content;
1449
1583
  if (typeof content === "string") {
1584
+ if (content.trim().length === 0) continue;
1450
1585
  msg.content = [
1451
1586
  Object.assign({ type: "text" as const, text: content }, { cache_control: { type: "ephemeral" } }),
1452
1587
  ];
@@ -1455,10 +1590,12 @@ function maybeAddAnthropicCacheControl(compat: ResolvedOpenAICompat, messages: C
1455
1590
 
1456
1591
  if (!Array.isArray(content)) continue;
1457
1592
 
1458
- // Find last text part and add cache_control
1593
+ // Find last non-empty text part and add cache_control. Empty assistant
1594
+ // content is valid for tool-call replay, but Anthropic/OpenRouter reject
1595
+ // empty text blocks once cache_control turns it into structured content.
1459
1596
  for (let j = content.length - 1; j >= 0; j--) {
1460
1597
  const part = content[j];
1461
- if (part?.type === "text") {
1598
+ if (part?.type === "text" && part.text.trim().length > 0) {
1462
1599
  Object.assign(part, { cache_control: { type: "ephemeral" } });
1463
1600
  return;
1464
1601
  }
@@ -1473,6 +1610,12 @@ export function convertMessages(
1473
1610
  ): ChatCompletionMessageParam[] {
1474
1611
  const params: ChatCompletionMessageParam[] = [];
1475
1612
 
1613
+ const maxNormalizedToolCallIdLength = compat.requiresMistralToolIds
1614
+ ? 9
1615
+ : model.provider === "openai"
1616
+ ? 40
1617
+ : undefined;
1618
+ const duplicateToolCallIdSuffixPrefix = compat.requiresMistralToolIds ? "dup" : undefined;
1476
1619
  const normalizeToolCallId = (id: string): string => {
1477
1620
  if (compat.requiresMistralToolIds) return normalizeMistralToolId(id, true);
1478
1621
 
@@ -1489,7 +1632,13 @@ export function convertMessages(
1489
1632
  if (model.provider === "openai") return id.length > 40 ? id.slice(0, 40) : id;
1490
1633
  return id;
1491
1634
  };
1492
- const transformedMessages = transformMessages(context.messages, model, id => normalizeToolCallId(id));
1635
+ const transformedMessages = transformMessages(
1636
+ context.messages,
1637
+ model,
1638
+ id => normalizeToolCallId(id),
1639
+ maxNormalizedToolCallIdLength,
1640
+ duplicateToolCallIdSuffixPrefix,
1641
+ );
1493
1642
 
1494
1643
  const remappedToolCallIds = new Map<string, string[]>();
1495
1644
  let generatedToolCallIdCounter = 0;
@@ -1586,6 +1735,8 @@ export function convertMessages(
1586
1735
  type: "image_url",
1587
1736
  image_url: {
1588
1737
  url: `data:${item.mimeType};base64,${item.data}`,
1738
+ // Chat Completions has no "original"; omit it (provider default).
1739
+ ...(item.detail && item.detail !== "original" ? { detail: item.detail } : {}),
1589
1740
  },
1590
1741
  } satisfies ChatCompletionContentPartImage);
1591
1742
  } else {
@@ -1628,12 +1779,12 @@ export function convertMessages(
1628
1779
  if (compat.requiresThinkingAsText) {
1629
1780
  // Convert thinking blocks to plain text (no tags to avoid model mimicking them)
1630
1781
  const thinkingText = nonEmptyThinkingBlocks.map(b => b.thinking).join("\n\n");
1631
- const textContent = assistantMsg.content as Array<{ type: "text"; text: string }> | null;
1632
- if (textContent) {
1633
- textContent.unshift({ type: "text", text: thinkingText });
1634
- } else {
1635
- assistantMsg.content = [{ type: "text", text: thinkingText }];
1636
- }
1782
+ // `content` is a plain string at this point (set above) or null —
1783
+ // never an array. Prepend the thinking text to the string form.
1784
+ assistantMsg.content =
1785
+ typeof assistantMsg.content === "string" && assistantMsg.content.length > 0
1786
+ ? `${thinkingText}\n\n${assistantMsg.content}`
1787
+ : thinkingText;
1637
1788
  } else if (compat.requiresReasoningContentForToolCalls) {
1638
1789
  // Use the streamed signature when the backend accepts whichever
1639
1790
  // recognized field name was emitted (allowsSynthetic=true). Backends
@@ -1934,16 +2085,19 @@ function convertTools(
1934
2085
  };
1935
2086
  }),
1936
2087
  toolStrictMode,
2088
+ strictToolsApplied:
2089
+ tools.length > 0 &&
2090
+ (toolStrictMode === "all_strict" || (toolStrictMode === "mixed" && adaptedTools.some(tool => tool.strict))),
1937
2091
  };
1938
2092
  }
1939
2093
 
1940
2094
  function shouldRetryWithoutStrictTools(
1941
2095
  error: unknown,
1942
2096
  capturedErrorResponse: CapturedHttpErrorResponse | undefined,
1943
- toolStrictMode: AppliedToolStrictMode,
2097
+ strictToolsApplied: boolean,
1944
2098
  tools: Tool[] | undefined,
1945
2099
  ): boolean {
1946
- if (!tools || tools.length === 0 || toolStrictMode !== "all_strict") {
2100
+ if (!tools || tools.length === 0 || !strictToolsApplied) {
1947
2101
  return false;
1948
2102
  }
1949
2103
  const status = extractHttpStatusFromError(error) ?? capturedErrorResponse?.status;
@@ -1953,7 +2107,14 @@ function shouldRetryWithoutStrictTools(
1953
2107
  const messageParts = [error instanceof Error ? error.message : undefined, capturedErrorResponse?.bodyText]
1954
2108
  .filter((value): value is string => typeof value === "string" && value.trim().length > 0)
1955
2109
  .join("\n");
1956
- return /wrong_api_format|mixed values for 'strict'|tool[s]?\b.*strict|\bstrict\b.*tool/i.test(messageParts);
2110
+ // Last two alternatives catch upstream tool-schema validators rejecting our
2111
+ // strictified schemas outright (e.g. OpenRouter DeepSeek's "Invalid tool
2112
+ // parameters schema : field `anyOf`: missing field `type`", #2270, and
2113
+ // OpenAI's own "Invalid schema for function 'x'"). Retrying non-strict sends
2114
+ // the unmodified base schemas, which those validators accept.
2115
+ return /wrong_api_format|mixed values for 'strict'|tool[s]?\b.*strict|\bstrict\b.*tool|tool parameters? schema|invalid schema for function/i.test(
2116
+ messageParts,
2117
+ );
1957
2118
  }
1958
2119
 
1959
2120
  function mapStopReason(reason: ChatCompletionChunk.Choice["finish_reason"] | string): {
@@ -1974,6 +2135,13 @@ function mapStopReason(reason: ChatCompletionChunk.Choice["finish_reason"] | str
1974
2135
  return { stopReason: "error", errorMessage: "Provider finish_reason: content_filter" };
1975
2136
  case "network_error":
1976
2137
  return { stopReason: "error", errorMessage: "Provider finish_reason: network_error" };
2138
+ case "error":
2139
+ // Gateways (OpenRouter, Vercel AI Gateway, …) report upstream model
2140
+ // failures as a bare `finish_reason: "error"` with no detail. These are
2141
+ // almost always transient (e.g. Gemini MALFORMED_FUNCTION_CALL), so word
2142
+ // the message to match the session retry classifier's transient-transport
2143
+ // pattern (`provider.?returned.?error`) and get the turn auto-retried.
2144
+ return { stopReason: "error", errorMessage: "Provider returned error finish_reason" };
1977
2145
  default:
1978
2146
  return {
1979
2147
  stopReason: "error",
@@ -1981,22 +2149,3 @@ function mapStopReason(reason: ChatCompletionChunk.Choice["finish_reason"] | str
1981
2149
  };
1982
2150
  }
1983
2151
  }
1984
-
1985
- /**
1986
- * Detect compatibility settings from provider and baseUrl for known providers.
1987
- * Provider takes precedence over URL-based detection since it's explicitly configured.
1988
- * Returns a fully resolved OpenAICompat object with all fields set.
1989
- */
1990
- export function detectCompat(model: Model<"openai-completions">): ResolvedOpenAICompat {
1991
- return detectOpenAICompat(model);
1992
- }
1993
-
1994
- /**
1995
- * Get resolved compatibility settings for a model.
1996
- * Uses explicit model.compat if provided, otherwise auto-detects from provider/URL.
1997
- * @param model - The model configuration
1998
- * @param resolvedBaseUrl - Optional resolved base URL (e.g., after GitHub Copilot proxy-ep resolution).
1999
- */
2000
- function getCompat(model: Model<"openai-completions">, resolvedBaseUrl?: string): ResolvedOpenAICompat {
2001
- return resolveOpenAICompat(model, resolvedBaseUrl);
2002
- }