@oh-my-pi/pi-catalog 16.1.7 → 16.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -2,6 +2,24 @@
2
2
 
3
3
  ## [Unreleased]
4
4
 
5
+ ## [16.1.9] - 2026-06-21
6
+
7
+ ### Fixed
8
+
9
+ - Fixed the `moonshot` provider having no path to the Kimi China API: model discovery now honors a `MOONSHOT_BASE_URL` override (for example, to redirect to `api.moonshot.cn`), and `KIMI_API_KEY` is accepted as a fallback for `MOONSHOT_API_KEY`. ([#2883](https://github.com/can1357/oh-my-pi/issues/2883))
10
+ - Fixed LiteLLM model discovery so that models whose ids collide with models.dev entries (for example `ollama-cloud` `deepseek-v4-flash`) keep the LiteLLM `openai-completions` provider transport instead of preserving the colliding models.dev transport metadata. ([#3162](https://github.com/can1357/oh-my-pi/issues/3162))
11
+
12
+ ### Removed
13
+
14
+ - Removed bundled Wafer Pass (`wafer-pass`) catalog entries and generation support; Wafer Serverless remains available as `wafer-serverless`.
15
+
16
+ ## [16.1.8] - 2026-06-20
17
+
18
+ ### Fixed
19
+
20
+ - Fixed Fireworks-hosted Qwen turns (e.g. `fireworks/qwen3.7-plus`) failing with `400 Extra inputs are not permitted, field: 'enable_thinking'`. Fireworks serves Qwen3 with controllable thinking via the OpenAI-style `reasoning_effort` parameter and rejects the top-level `enable_thinking` boolean that Alibaba DashScope expects; `buildOpenAICompat` was selecting `thinkingFormat: "qwen"` from the `qwen` id pattern regardless of host. Fireworks-hosted Qwen models now resolve to `thinkingFormat: "openai"`.
21
+ - Fixed MiMo models on OpenAI-compatible gateways to expose only the accepted `low`, `medium`, and `high` reasoning tiers and to map unsupported raw `minimal`/`xhigh` requests to safe wire values (`low` and `high`, respectively). ([#2864](https://github.com/can1357/oh-my-pi/issues/2864))
22
+
5
23
  ## [16.1.7] - 2026-06-20
6
24
 
7
25
  ### Fixed
@@ -8,3 +8,16 @@ export declare function toFireworksWireModelId(modelId: string): string;
8
8
  */
9
9
  export declare function toFirepassPublicModelId(modelId: string): string;
10
10
  export declare function toFirepassWireModelId(modelId: string): string;
11
+ /**
12
+ * Public-id suffix marking a Fireworks "Fast" serving-path variant. Fast is a
13
+ * higher-throughput route (100+ tok/s) exposed under a dedicated router id
14
+ * (`accounts/fireworks/routers/<id>-fast`), not a separate model — same weights,
15
+ * higher price, no Priority tier. We keep a friendly `<id>-fast` public id and
16
+ * translate it to the router wire form at request time (compat
17
+ * `wireModelIdMode: "firepass"`). See https://docs.fireworks.ai/serverless/serving-paths.
18
+ */
19
+ export declare const FIREWORKS_FAST_SUFFIX = "-fast";
20
+ /** True for a Fireworks public model id that selects the Fast serving path. */
21
+ export declare function isFireworksFastModelId(modelId: string): boolean;
22
+ /** Strip the Fast suffix to recover the base (Standard-tier) model id. */
23
+ export declare function toFireworksBaseModelId(modelId: string): string;
@@ -173,7 +173,7 @@ export declare const CATALOG_PROVIDERS: readonly [{
173
173
  }, {
174
174
  readonly id: "moonshot";
175
175
  readonly defaultModel: "kimi-k2.7-code";
176
- readonly envVars: readonly ["MOONSHOT_API_KEY"];
176
+ readonly envVars: readonly ["MOONSHOT_API_KEY", "KIMI_API_KEY"];
177
177
  readonly createModelManagerOptions: (config: ModelManagerConfig) => import("..").ModelManagerOptions<"openai-completions", unknown>;
178
178
  readonly catalogDiscovery: {
179
179
  readonly label: "Moonshot";
@@ -310,15 +310,6 @@ export declare const CATALOG_PROVIDERS: readonly [{
310
310
  readonly label: "vLLM";
311
311
  readonly allowUnauthenticated: true;
312
312
  };
313
- }, {
314
- readonly id: "wafer-pass";
315
- readonly defaultModel: "GLM-5.1";
316
- readonly envVars: readonly ["WAFER_PASS_API_KEY"];
317
- readonly createModelManagerOptions: (config: ModelManagerConfig) => import("..").ModelManagerOptions<"openai-completions", unknown>;
318
- readonly catalogDiscovery: {
319
- readonly label: "Wafer Pass";
320
- readonly oauthProvider: "wafer-pass";
321
- };
322
313
  }, {
323
314
  readonly id: "wafer-serverless";
324
315
  readonly defaultModel: "GLM-5.1";
@@ -179,6 +179,13 @@ export declare const KIMI_K27_CODE_RECOMMENDED_MAX_TOKENS = 32768;
179
179
  export declare function isKimiK27CodeModelId(modelId: string): boolean;
180
180
  export declare function clampKimiK27CodeMaxTokens(modelId: string, candidate: number): number;
181
181
  export declare function clampKimiK27CodeMaxTokens(modelId: string, candidate: number | null): number | null;
182
+ /**
183
+ * Build the Fireworks Fast seed by projecting each base bundled spec into a
184
+ * `<id>-fast` variant. Pushed into the generated catalog (Fast routers never
185
+ * appear in the serverless control-plane list, so discovery cannot surface
186
+ * them) and deduped behind any identical previous-snapshot entry.
187
+ */
188
+ export declare function buildFireworksFastSeed(): ModelSpec<"openai-completions">[];
182
189
  /**
183
190
  * Fireworks DeepSeek V4 accepts effort via `reasoning_effort` but rejects the
184
191
  * DeepSeek-native binary `thinking` toggle when both are present.
@@ -208,7 +215,6 @@ export interface WaferModelManagerConfig {
208
215
  baseUrl?: string;
209
216
  fetch?: FetchImpl;
210
217
  }
211
- export declare function waferPassModelManagerOptions(config?: WaferModelManagerConfig): ModelManagerOptions<"openai-completions">;
212
218
  export declare function waferServerlessModelManagerOptions(config?: WaferModelManagerConfig): ModelManagerOptions<"openai-completions">;
213
219
  export interface MistralModelManagerConfig {
214
220
  apiKey?: string;
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "type": "module",
3
3
  "name": "@oh-my-pi/pi-catalog",
4
- "version": "16.1.7",
4
+ "version": "16.1.9",
5
5
  "description": "Model catalog for omp: bundled model database, provider discovery descriptors, model identity, classification, and equivalence",
6
6
  "homepage": "https://omp.sh",
7
7
  "author": "Can Boluk",
@@ -34,12 +34,12 @@
34
34
  },
35
35
  "dependencies": {
36
36
  "@bufbuild/protobuf": "^2.12.0",
37
- "@oh-my-pi/pi-utils": "16.1.7",
37
+ "@oh-my-pi/pi-utils": "16.1.9",
38
38
  "arktype": "^2.2.0",
39
39
  "zod": "^4"
40
40
  },
41
41
  "devDependencies": {
42
- "@oh-my-pi/pi-ai": "16.1.7",
42
+ "@oh-my-pi/pi-ai": "16.1.9",
43
43
  "@types/bun": "^1.3.14"
44
44
  },
45
45
  "engines": {
@@ -7,6 +7,7 @@
7
7
  * complete alternate views. Request handlers read `model.compat` fields and
8
8
  * never detect, resolve, or allocate.
9
9
  */
10
+ import { isFireworksFastModelId } from "../fireworks-model-id";
10
11
  import { hostMatchesUrl, modelMatchesHost } from "../hosts";
11
12
  import {
12
13
  isAnthropicNamespacedModelId,
@@ -130,6 +131,16 @@ const OPENCODE_WHEN_THINKING: NonNullable<OpenAICompat["whenThinking"]> = {
130
131
  reasoningContentField: "reasoning_content",
131
132
  };
132
133
 
134
+ const MIMO_REASONING_EFFORT_MAP: NonNullable<OpenAICompat["reasoningEffortMap"]> = {
135
+ minimal: "low",
136
+ xhigh: "high",
137
+ };
138
+
139
+ function mergeMimoReasoningEffortMap(compat: ResolvedOpenAISharedCompat, enabled: boolean): void {
140
+ if (!enabled) return;
141
+ compat.reasoningEffortMap = { ...MIMO_REASONING_EFFORT_MAP, ...compat.reasoningEffortMap };
142
+ }
143
+
133
144
  function detectStrictModeSupport(provider: string, baseUrl: string): boolean {
134
145
  if (
135
146
  provider === "openai" ||
@@ -184,6 +195,8 @@ export function buildOpenAICompat(spec: ModelSpec<"openai-completions">): Resolv
184
195
  const lowerName = (spec.name ?? "").toLowerCase();
185
196
  const isXiaomiHost = modelMatchesHost(hostModel, "xiaomi");
186
197
  const isXiaomiMimo = isXiaomiHost && (isMimoModelIdOrName(spec.id) || isMimoModelIdOrName(spec.name ?? ""));
198
+ const isMimoReasoningEffortModel =
199
+ !isXiaomiHost && (isMimoModelIdOrName(spec.id) || isMimoModelIdOrName(spec.name ?? ""));
187
200
  // OpenCode Zen's `big-pickle` is a DeepSeek reasoning alias; the upstream
188
201
  // 400s come from DeepSeek and require exact reasoning_content replay.
189
202
  const isOpenCodeDeepseekAlias =
@@ -238,17 +251,21 @@ export function buildOpenAICompat(spec: ModelSpec<"openai-completions">): Resolv
238
251
  const isGroqHost = modelMatchesHost(hostModel, "groq");
239
252
  const isCopilotHost = provider === "github-copilot";
240
253
  const isZenmuxHost = provider === "zenmux";
241
- // Endpoints that MUST receive a single system block. MiniMax's OpenAI
242
- // endpoint returns error 2013 on multiple system messages; Alibaba's
243
- // Dashscope and Qwen Portal serve Qwen models whose chat template
244
- // raises "System message must be at the beginning" if any system
245
- // message appears past index 0.
254
+ // Endpoints/models that MUST receive a single system block. MiniMax's OpenAI
255
+ // endpoint returns error 2013 on multiple system messages; the Qwen 3.5+ chat
256
+ // template raises "System message must be at the beginning" / 500s with an
257
+ // internal_server_error when any system block appears past index 0. That
258
+ // template ships with the weights, so every Qwen-serving vLLM/SGLang host
259
+ // hits it — confirmed on Alibaba Dashscope, Qwen Portal, and Fireworks
260
+ // (`fireworks/qwen3.7-plus` 500'd on two leading system blocks). Gate on the
261
+ // Qwen family itself, not per-host: coalescing only trades away KV-cache reuse.
246
262
  const isMiniMaxHost = modelMatchesHost(hostModel, "minimax");
247
263
  const isQwenPortal = modelMatchesHost(hostModel, "qwenPortal");
248
264
  const supportsMultipleSystemMessagesDefault =
249
265
  !isMiniMaxHost &&
250
266
  !isAlibaba &&
251
267
  !isQwenPortal &&
268
+ !isQwen &&
252
269
  (isOpenAIHost ||
253
270
  isAzureHost ||
254
271
  isOpenRouter ||
@@ -276,8 +293,12 @@ export function buildOpenAICompat(spec: ModelSpec<"openai-completions">): Resolv
276
293
  ? DEEPSEEK_REASONING_STREAM_IDLE_TIMEOUT_MS
277
294
  : undefined;
278
295
 
296
+ // Fireworks "Fast" variants (`<id>-fast`) are served from the router
297
+ // namespace (`accounts/fireworks/routers/<id>-fast`), like Fire Pass, rather
298
+ // than the `models/` namespace the rest of the `fireworks` provider uses.
299
+ const isFireworksFastRouter = provider === "fireworks" && isFireworksFastModelId(spec.id);
279
300
  const wireModelIdMode: ResolvedOpenAISharedCompat["wireModelIdMode"] =
280
- provider === "firepass"
301
+ provider === "firepass" || isFireworksFastRouter
281
302
  ? "firepass"
282
303
  : provider === "fireworks"
283
304
  ? "fireworks"
@@ -291,9 +312,11 @@ export function buildOpenAICompat(spec: ModelSpec<"openai-completions">): Resolv
291
312
  ? "openrouter"
292
313
  : isQwen && isNvidiaNim
293
314
  ? "qwen-chat-template"
294
- : isAlibaba || isQwen
295
- ? "qwen"
296
- : "openai";
315
+ : isQwen && isFireworks
316
+ ? "openai"
317
+ : isAlibaba || isQwen
318
+ ? "qwen"
319
+ : "openai";
297
320
 
298
321
  const compat: ResolvedOpenAICompat = {
299
322
  supportsStore: !isNonStandard,
@@ -308,7 +331,7 @@ export function buildOpenAICompat(spec: ModelSpec<"openai-completions">): Resolv
308
331
  supportsReasoningEffort: !isGrok && !isXiaomiMimo && (!(isZai || isZhipu) || supportsZaiReasoningEffort),
309
332
  // GitHub Copilot's chat-completions endpoint rejects reasoning params wholesale.
310
333
  supportsReasoningParams: provider !== "github-copilot",
311
- reasoningEffortMap: {},
334
+ reasoningEffortMap: isMimoReasoningEffortModel ? MIMO_REASONING_EFFORT_MAP : {},
312
335
  supportsUsageInStreaming: !isCerebras,
313
336
  // pi-ai's thinking-loop guard is gemini-only; default the flag from the
314
337
  // family classifier so OpenAI-compat proxies serving Gemini are covered.
@@ -400,6 +423,7 @@ export function buildOpenAICompat(spec: ModelSpec<"openai-completions">): Resolv
400
423
  compat.omitReasoningEffort = true;
401
424
  }
402
425
  mergeOllamaReasoningEffortMap(compat, provider, spec.reasoning);
426
+ mergeMimoReasoningEffortMap(compat, isMimoReasoningEffortModel);
403
427
 
404
428
  const whenThinkingPolicy =
405
429
  spec.compat?.whenThinking ?? (isOpenCodeProvider && spec.reasoning ? OPENCODE_WHEN_THINKING : undefined);
@@ -413,6 +437,7 @@ export function buildOpenAICompat(spec: ModelSpec<"openai-completions">): Resolv
413
437
  variant.omitReasoningEffort = true;
414
438
  }
415
439
  mergeOllamaReasoningEffortMap(variant, provider, spec.reasoning);
440
+ mergeMimoReasoningEffortMap(variant, isMimoReasoningEffortModel);
416
441
  compat.whenThinking = variant;
417
442
  }
418
443
 
@@ -28,3 +28,23 @@ export function toFirepassWireModelId(modelId: string): string {
28
28
  const stripped = modelId.startsWith(FIREPASS_WIRE_PREFIX) ? modelId.slice(FIREPASS_WIRE_PREFIX.length) : modelId;
29
29
  return `${FIREPASS_WIRE_PREFIX}${stripped.replace(VERSION_DOT_PATTERN, "p")}`;
30
30
  }
31
+
32
+ /**
33
+ * Public-id suffix marking a Fireworks "Fast" serving-path variant. Fast is a
34
+ * higher-throughput route (100+ tok/s) exposed under a dedicated router id
35
+ * (`accounts/fireworks/routers/<id>-fast`), not a separate model — same weights,
36
+ * higher price, no Priority tier. We keep a friendly `<id>-fast` public id and
37
+ * translate it to the router wire form at request time (compat
38
+ * `wireModelIdMode: "firepass"`). See https://docs.fireworks.ai/serverless/serving-paths.
39
+ */
40
+ export const FIREWORKS_FAST_SUFFIX = "-fast";
41
+
42
+ /** True for a Fireworks public model id that selects the Fast serving path. */
43
+ export function isFireworksFastModelId(modelId: string): boolean {
44
+ return modelId.endsWith(FIREWORKS_FAST_SUFFIX);
45
+ }
46
+
47
+ /** Strip the Fast suffix to recover the base (Standard-tier) model id. */
48
+ export function toFireworksBaseModelId(modelId: string): string {
49
+ return modelId.endsWith(FIREWORKS_FAST_SUFFIX) ? modelId.slice(0, -FIREWORKS_FAST_SUFFIX.length) : modelId;
50
+ }
@@ -24,6 +24,7 @@ import {
24
24
  findThinkingVariantToken,
25
25
  isDeepseekModelIdOrName,
26
26
  isGlm52ReasoningEffortModelId,
27
+ isMimoModelIdOrName,
27
28
  isMinimaxM2FamilyModelId,
28
29
  isMinimaxM3FamilyModelId,
29
30
  isOpenAIGptOssModelId,
@@ -89,6 +90,10 @@ const ZAI_GLM_52_REASONING_EFFORT_MAP: Readonly<EffortMap> = {
89
90
  const GLM_52_XHIGH_MAX_EFFORT_MAP: Readonly<EffortMap> = {
90
91
  [Effort.XHigh]: "max",
91
92
  };
93
+ const MIMO_REASONING_EFFORT_MAP: Readonly<EffortMap> = {
94
+ [Effort.Minimal]: "low",
95
+ [Effort.XHigh]: "high",
96
+ };
92
97
 
93
98
  /**
94
99
  * Effort → wire-value map for the 5-tier adaptive scale (Opus 4.7+ and
@@ -296,7 +301,10 @@ function getModelDefinedEfforts<TApi extends Api>(
296
301
  return GLM_52_HIGH_MAX_REASONING_EFFORTS;
297
302
  }
298
303
  }
299
- return isOpenAICompatReasoningApi(spec.api) && (isMinimaxM2FamilyModelId(spec.id) || isOpenAIGptOssModelId(spec.id))
304
+ return isOpenAICompatReasoningApi(spec.api) &&
305
+ (isMinimaxM2FamilyModelId(spec.id) ||
306
+ isOpenAIGptOssModelId(spec.id) ||
307
+ isOpenAICompatMimoReasoningEffortModel(spec, compat))
300
308
  ? LOW_MEDIUM_HIGH_REASONING_EFFORTS
301
309
  : undefined;
302
310
  }
@@ -309,6 +317,19 @@ function isMinimaxReasoningModelOnAnthropicEndpoint<TApi extends Api>(spec: Mode
309
317
  return spec.api === "anthropic-messages" && (isMinimaxM2FamilyModelId(spec.id) || isMinimaxM3FamilyModelId(spec.id));
310
318
  }
311
319
 
320
+ function isOpenAICompatMimoReasoningEffortModel<TApi extends Api>(
321
+ spec: ModelSpec<TApi>,
322
+ compat: CompatOf<TApi>,
323
+ ): boolean {
324
+ if (!isOpenAICompatReasoningApi(spec.api)) return false;
325
+ if (!isMimoModelIdOrName(spec.id) && !isMimoModelIdOrName(spec.name ?? "")) return false;
326
+ const resolved = compat as ResolvedOpenAICompat | undefined;
327
+ return (
328
+ (resolved?.thinkingFormat === "openai" || resolved?.thinkingFormat === "openrouter") &&
329
+ resolved.supportsReasoningEffort
330
+ );
331
+ }
332
+
312
333
  function readCompatEffortMap(compat: CompatOf<Api>): EffortMap | undefined {
313
334
  if (compat === undefined || !("reasoningEffortMap" in compat)) {
314
335
  return undefined;
@@ -364,6 +385,8 @@ function inferDetectedEffortMap<TApi extends Api>(
364
385
  map = GROQ_QWEN3_32B_REASONING_EFFORT_MAP;
365
386
  } else if (isDeepseekReasoningModel(spec)) {
366
387
  map = DEEPSEEK_REASONING_EFFORT_MAP;
388
+ } else if (isOpenAICompatMimoReasoningEffortModel(spec, compat)) {
389
+ map = MIMO_REASONING_EFFORT_MAP;
367
390
  } else if (modelMatchesHost(spec, "openrouter")) {
368
391
  map = getOpenRouterAnthropicReasoningEffortMap(spec.id);
369
392
  } else if (modelMatchesHost(spec, "fireworks")) {
@@ -485,6 +508,8 @@ function inferAnthropicSupportedEfforts<TApi extends Api>(
485
508
  }
486
509
 
487
510
  function inferFallbackEfforts<TApi extends Api>(spec: ModelSpec<TApi>, compat: CompatOf<TApi>): readonly Effort[] {
511
+ const modelDefinedEfforts = getModelDefinedEfforts(spec, compat);
512
+ if (modelDefinedEfforts !== undefined) return modelDefinedEfforts;
488
513
  if (isMinimaxReasoningModelOnAnthropicEndpoint(spec)) {
489
514
  return LOW_MEDIUM_HIGH_REASONING_EFFORTS;
490
515
  }
package/src/models.json CHANGED
@@ -7208,11 +7208,9 @@
7208
7208
  "thinking": {
7209
7209
  "mode": "effort",
7210
7210
  "efforts": [
7211
- "minimal",
7212
7211
  "low",
7213
7212
  "medium",
7214
- "high",
7215
- "xhigh"
7213
+ "high"
7216
7214
  ]
7217
7215
  }
7218
7216
  },
@@ -7238,11 +7236,9 @@
7238
7236
  "thinking": {
7239
7237
  "mode": "effort",
7240
7238
  "efforts": [
7241
- "minimal",
7242
7239
  "low",
7243
7240
  "medium",
7244
- "high",
7245
- "xhigh"
7241
+ "high"
7246
7242
  ]
7247
7243
  }
7248
7244
  },
@@ -7267,11 +7263,9 @@
7267
7263
  "thinking": {
7268
7264
  "mode": "effort",
7269
7265
  "efforts": [
7270
- "minimal",
7271
7266
  "low",
7272
7267
  "medium",
7273
- "high",
7274
- "xhigh"
7268
+ "high"
7275
7269
  ]
7276
7270
  }
7277
7271
  }
@@ -14801,6 +14795,38 @@
14801
14795
  }
14802
14796
  }
14803
14797
  },
14798
+ "glm-5.1-fast": {
14799
+ "id": "glm-5.1-fast",
14800
+ "name": "GLM-5.1 Fast",
14801
+ "api": "openai-completions",
14802
+ "provider": "fireworks",
14803
+ "baseUrl": "https://api.fireworks.ai/inference/v1",
14804
+ "reasoning": true,
14805
+ "input": [
14806
+ "text"
14807
+ ],
14808
+ "cost": {
14809
+ "input": 2.8,
14810
+ "output": 8.8,
14811
+ "cacheRead": 0.52,
14812
+ "cacheWrite": 0
14813
+ },
14814
+ "contextWindow": 202752,
14815
+ "maxTokens": 131072,
14816
+ "thinking": {
14817
+ "mode": "effort",
14818
+ "efforts": [
14819
+ "minimal",
14820
+ "low",
14821
+ "medium",
14822
+ "high",
14823
+ "xhigh"
14824
+ ],
14825
+ "effortMap": {
14826
+ "minimal": "none"
14827
+ }
14828
+ }
14829
+ },
14804
14830
  "glm-5.2": {
14805
14831
  "id": "glm-5.2",
14806
14832
  "name": "GLM-5.2",
@@ -14947,6 +14973,39 @@
14947
14973
  }
14948
14974
  }
14949
14975
  },
14976
+ "kimi-k2.6-fast": {
14977
+ "id": "kimi-k2.6-fast",
14978
+ "name": "Kimi K2.6 Fast",
14979
+ "api": "openai-completions",
14980
+ "provider": "fireworks",
14981
+ "baseUrl": "https://api.fireworks.ai/inference/v1",
14982
+ "reasoning": true,
14983
+ "input": [
14984
+ "text",
14985
+ "image"
14986
+ ],
14987
+ "cost": {
14988
+ "input": 2,
14989
+ "output": 8,
14990
+ "cacheRead": 0.3,
14991
+ "cacheWrite": 0
14992
+ },
14993
+ "contextWindow": 262144,
14994
+ "maxTokens": 32768,
14995
+ "thinking": {
14996
+ "mode": "effort",
14997
+ "efforts": [
14998
+ "minimal",
14999
+ "low",
15000
+ "medium",
15001
+ "high",
15002
+ "xhigh"
15003
+ ],
15004
+ "effortMap": {
15005
+ "minimal": "none"
15006
+ }
15007
+ }
15008
+ },
14950
15009
  "kimi-k2.7-code": {
14951
15010
  "id": "kimi-k2.7-code",
14952
15011
  "name": "Kimi K2.7 Code",
@@ -14980,6 +15039,39 @@
14980
15039
  }
14981
15040
  }
14982
15041
  },
15042
+ "kimi-k2.7-code-fast": {
15043
+ "id": "kimi-k2.7-code-fast",
15044
+ "name": "Kimi K2.7 Code Fast",
15045
+ "api": "openai-completions",
15046
+ "provider": "fireworks",
15047
+ "baseUrl": "https://api.fireworks.ai/inference/v1",
15048
+ "reasoning": true,
15049
+ "input": [
15050
+ "text",
15051
+ "image"
15052
+ ],
15053
+ "cost": {
15054
+ "input": 1.9,
15055
+ "output": 8,
15056
+ "cacheRead": 0.38,
15057
+ "cacheWrite": 0
15058
+ },
15059
+ "contextWindow": 262144,
15060
+ "maxTokens": 32768,
15061
+ "thinking": {
15062
+ "mode": "effort",
15063
+ "efforts": [
15064
+ "minimal",
15065
+ "low",
15066
+ "medium",
15067
+ "high",
15068
+ "xhigh"
15069
+ ],
15070
+ "effortMap": {
15071
+ "minimal": "none"
15072
+ }
15073
+ }
15074
+ },
14983
15075
  "minimax-m2.5": {
14984
15076
  "id": "minimax-m2.5",
14985
15077
  "name": "MiniMax M2.5",
@@ -20276,11 +20368,9 @@
20276
20368
  "thinking": {
20277
20369
  "mode": "effort",
20278
20370
  "efforts": [
20279
- "minimal",
20280
20371
  "low",
20281
20372
  "medium",
20282
- "high",
20283
- "xhigh"
20373
+ "high"
20284
20374
  ]
20285
20375
  }
20286
20376
  },
@@ -30625,11 +30715,9 @@
30625
30715
  "thinking": {
30626
30716
  "mode": "effort",
30627
30717
  "efforts": [
30628
- "minimal",
30629
30718
  "low",
30630
30719
  "medium",
30631
- "high",
30632
- "xhigh"
30720
+ "high"
30633
30721
  ]
30634
30722
  }
30635
30723
  },
@@ -30655,11 +30743,9 @@
30655
30743
  "thinking": {
30656
30744
  "mode": "effort",
30657
30745
  "efforts": [
30658
- "minimal",
30659
30746
  "low",
30660
30747
  "medium",
30661
- "high",
30662
- "xhigh"
30748
+ "high"
30663
30749
  ]
30664
30750
  }
30665
30751
  },
@@ -30703,11 +30789,9 @@
30703
30789
  "thinking": {
30704
30790
  "mode": "effort",
30705
30791
  "efforts": [
30706
- "minimal",
30707
30792
  "low",
30708
30793
  "medium",
30709
- "high",
30710
- "xhigh"
30794
+ "high"
30711
30795
  ]
30712
30796
  }
30713
30797
  },
@@ -30752,11 +30836,9 @@
30752
30836
  "thinking": {
30753
30837
  "mode": "effort",
30754
30838
  "efforts": [
30755
- "minimal",
30756
30839
  "low",
30757
30840
  "medium",
30758
- "high",
30759
- "xhigh"
30841
+ "high"
30760
30842
  ]
30761
30843
  }
30762
30844
  },
@@ -30781,11 +30863,9 @@
30781
30863
  "thinking": {
30782
30864
  "mode": "effort",
30783
30865
  "efforts": [
30784
- "minimal",
30785
30866
  "low",
30786
30867
  "medium",
30787
- "high",
30788
- "xhigh"
30868
+ "high"
30789
30869
  ]
30790
30870
  }
30791
30871
  },
@@ -31172,7 +31252,7 @@
31172
31252
  "kimi-code": {
31173
31253
  "kimi-for-coding": {
31174
31254
  "id": "kimi-for-coding",
31175
- "name": "Kimi For Coding",
31255
+ "name": "K2.7 Code",
31176
31256
  "api": "openai-completions",
31177
31257
  "provider": "kimi-code",
31178
31258
  "baseUrl": "https://api.kimi.com/coding/v1",
@@ -49119,11 +49199,9 @@
49119
49199
  "thinking": {
49120
49200
  "mode": "effort",
49121
49201
  "efforts": [
49122
- "minimal",
49123
49202
  "low",
49124
49203
  "medium",
49125
- "high",
49126
- "xhigh"
49204
+ "high"
49127
49205
  ],
49128
49206
  "effortRouting": {
49129
49207
  "off": "xiaomi/mimo-v2-flash",
@@ -49183,11 +49261,9 @@
49183
49261
  "thinking": {
49184
49262
  "mode": "effort",
49185
49263
  "efforts": [
49186
- "minimal",
49187
49264
  "low",
49188
49265
  "medium",
49189
- "high",
49190
- "xhigh"
49266
+ "high"
49191
49267
  ],
49192
49268
  "effortRouting": {
49193
49269
  "off": "xiaomi/mimo-v2-flash-original",
@@ -49248,11 +49324,9 @@
49248
49324
  "thinking": {
49249
49325
  "mode": "effort",
49250
49326
  "efforts": [
49251
- "minimal",
49252
49327
  "low",
49253
49328
  "medium",
49254
- "high",
49255
- "xhigh"
49329
+ "high"
49256
49330
  ]
49257
49331
  }
49258
49332
  },
@@ -49277,11 +49351,9 @@
49277
49351
  "thinking": {
49278
49352
  "mode": "effort",
49279
49353
  "efforts": [
49280
- "minimal",
49281
49354
  "low",
49282
49355
  "medium",
49283
- "high",
49284
- "xhigh"
49356
+ "high"
49285
49357
  ]
49286
49358
  }
49287
49359
  },
@@ -49307,11 +49379,9 @@
49307
49379
  "thinking": {
49308
49380
  "mode": "effort",
49309
49381
  "efforts": [
49310
- "minimal",
49311
49382
  "low",
49312
49383
  "medium",
49313
- "high",
49314
- "xhigh"
49384
+ "high"
49315
49385
  ]
49316
49386
  }
49317
49387
  },
@@ -49336,11 +49406,9 @@
49336
49406
  "thinking": {
49337
49407
  "mode": "effort",
49338
49408
  "efforts": [
49339
- "minimal",
49340
49409
  "low",
49341
49410
  "medium",
49342
- "high",
49343
- "xhigh"
49411
+ "high"
49344
49412
  ]
49345
49413
  }
49346
49414
  },
@@ -56842,11 +56910,9 @@
56842
56910
  "thinking": {
56843
56911
  "mode": "effort",
56844
56912
  "efforts": [
56845
- "minimal",
56846
56913
  "low",
56847
56914
  "medium",
56848
- "high",
56849
- "xhigh"
56915
+ "high"
56850
56916
  ]
56851
56917
  },
56852
56918
  "compat": {
@@ -56874,11 +56940,9 @@
56874
56940
  "thinking": {
56875
56941
  "mode": "effort",
56876
56942
  "efforts": [
56877
- "minimal",
56878
56943
  "low",
56879
56944
  "medium",
56880
- "high",
56881
- "xhigh"
56945
+ "high"
56882
56946
  ]
56883
56947
  },
56884
56948
  "compat": {
@@ -56910,11 +56974,9 @@
56910
56974
  "thinking": {
56911
56975
  "mode": "effort",
56912
56976
  "efforts": [
56913
- "minimal",
56914
56977
  "low",
56915
56978
  "medium",
56916
- "high",
56917
- "xhigh"
56979
+ "high"
56918
56980
  ]
56919
56981
  }
56920
56982
  },
@@ -56942,11 +57004,9 @@
56942
57004
  "thinking": {
56943
57005
  "mode": "effort",
56944
57006
  "efforts": [
56945
- "minimal",
56946
57007
  "low",
56947
57008
  "medium",
56948
- "high",
56949
- "xhigh"
57009
+ "high"
56950
57010
  ]
56951
57011
  }
56952
57012
  },
@@ -58575,11 +58635,9 @@
58575
58635
  "thinking": {
58576
58636
  "mode": "effort",
58577
58637
  "efforts": [
58578
- "minimal",
58579
58638
  "low",
58580
58639
  "medium",
58581
- "high",
58582
- "xhigh"
58640
+ "high"
58583
58641
  ]
58584
58642
  }
58585
58643
  },
@@ -58605,11 +58663,9 @@
58605
58663
  "thinking": {
58606
58664
  "mode": "effort",
58607
58665
  "efforts": [
58608
- "minimal",
58609
58666
  "low",
58610
58667
  "medium",
58611
- "high",
58612
- "xhigh"
58668
+ "high"
58613
58669
  ]
58614
58670
  }
58615
58671
  },
@@ -58634,11 +58690,9 @@
58634
58690
  "thinking": {
58635
58691
  "mode": "effort",
58636
58692
  "efforts": [
58637
- "minimal",
58638
58693
  "low",
58639
58694
  "medium",
58640
- "high",
58641
- "xhigh"
58695
+ "high"
58642
58696
  ]
58643
58697
  }
58644
58698
  },
@@ -58664,11 +58718,9 @@
58664
58718
  "thinking": {
58665
58719
  "mode": "effort",
58666
58720
  "efforts": [
58667
- "minimal",
58668
58721
  "low",
58669
58722
  "medium",
58670
- "high",
58671
- "xhigh"
58723
+ "high"
58672
58724
  ]
58673
58725
  }
58674
58726
  },
@@ -67414,7 +67466,6 @@
67414
67466
  "thinking": {
67415
67467
  "mode": "effort",
67416
67468
  "efforts": [
67417
- "minimal",
67418
67469
  "low",
67419
67470
  "medium",
67420
67471
  "high"
@@ -67443,7 +67494,6 @@
67443
67494
  "thinking": {
67444
67495
  "mode": "effort",
67445
67496
  "efforts": [
67446
- "minimal",
67447
67497
  "low",
67448
67498
  "medium",
67449
67499
  "high"
@@ -67471,7 +67521,6 @@
67471
67521
  "thinking": {
67472
67522
  "mode": "effort",
67473
67523
  "efforts": [
67474
- "minimal",
67475
67524
  "low",
67476
67525
  "medium",
67477
67526
  "high"
@@ -67500,7 +67549,6 @@
67500
67549
  "thinking": {
67501
67550
  "mode": "effort",
67502
67551
  "efforts": [
67503
- "minimal",
67504
67552
  "low",
67505
67553
  "medium",
67506
67554
  "high"
@@ -67528,7 +67576,6 @@
67528
67576
  "thinking": {
67529
67577
  "mode": "effort",
67530
67578
  "efforts": [
67531
- "minimal",
67532
67579
  "low",
67533
67580
  "medium",
67534
67581
  "high"
@@ -72124,11 +72171,9 @@
72124
72171
  "thinking": {
72125
72172
  "mode": "effort",
72126
72173
  "efforts": [
72127
- "minimal",
72128
72174
  "low",
72129
72175
  "medium",
72130
- "high",
72131
- "xhigh"
72176
+ "high"
72132
72177
  ]
72133
72178
  }
72134
72179
  },
@@ -77545,64 +77590,6 @@
77545
77590
  }
77546
77591
  }
77547
77592
  },
77548
- "wafer-pass": {
77549
- "GLM-5.1": {
77550
- "id": "GLM-5.1",
77551
- "name": "GLM-5.1",
77552
- "api": "openai-completions",
77553
- "provider": "wafer-pass",
77554
- "baseUrl": "https://pass.wafer.ai/v1",
77555
- "reasoning": true,
77556
- "input": [
77557
- "text"
77558
- ],
77559
- "cost": {
77560
- "input": 0,
77561
- "output": 0,
77562
- "cacheRead": 0,
77563
- "cacheWrite": 0
77564
- },
77565
- "contextWindow": 202752,
77566
- "maxTokens": 65536,
77567
- "compat": {
77568
- "supportsDeveloperRole": false,
77569
- "thinkingFormat": "zai",
77570
- "reasoningContentField": "reasoning_content"
77571
- },
77572
- "thinking": {
77573
- "mode": "effort",
77574
- "efforts": [
77575
- "minimal",
77576
- "low",
77577
- "medium",
77578
- "high"
77579
- ]
77580
- }
77581
- },
77582
- "Qwen3.5-397B-A17B": {
77583
- "id": "Qwen3.5-397B-A17B",
77584
- "name": "Qwen3.5-397B-A17B",
77585
- "api": "openai-completions",
77586
- "provider": "wafer-pass",
77587
- "baseUrl": "https://pass.wafer.ai/v1",
77588
- "reasoning": false,
77589
- "input": [
77590
- "text",
77591
- "image"
77592
- ],
77593
- "cost": {
77594
- "input": 0,
77595
- "output": 0,
77596
- "cacheRead": 0,
77597
- "cacheWrite": 0
77598
- },
77599
- "contextWindow": 262144,
77600
- "maxTokens": 65536,
77601
- "compat": {
77602
- "supportsDeveloperRole": false
77603
- }
77604
- }
77605
- },
77606
77593
  "wafer-serverless": {
77607
77594
  "deepseek-v4-flash": {
77608
77595
  "id": "deepseek-v4-flash",
@@ -84061,11 +84048,9 @@
84061
84048
  "thinking": {
84062
84049
  "mode": "effort",
84063
84050
  "efforts": [
84064
- "minimal",
84065
84051
  "low",
84066
84052
  "medium",
84067
- "high",
84068
- "xhigh"
84053
+ "high"
84069
84054
  ]
84070
84055
  }
84071
84056
  },
@@ -84090,11 +84075,9 @@
84090
84075
  "thinking": {
84091
84076
  "mode": "effort",
84092
84077
  "efforts": [
84093
- "minimal",
84094
84078
  "low",
84095
84079
  "medium",
84096
- "high",
84097
- "xhigh"
84080
+ "high"
84098
84081
  ]
84099
84082
  }
84100
84083
  },
@@ -84120,11 +84103,9 @@
84120
84103
  "thinking": {
84121
84104
  "mode": "effort",
84122
84105
  "efforts": [
84123
- "minimal",
84124
84106
  "low",
84125
84107
  "medium",
84126
- "high",
84127
- "xhigh"
84108
+ "high"
84128
84109
  ]
84129
84110
  }
84130
84111
  },
@@ -84149,11 +84130,9 @@
84149
84130
  "thinking": {
84150
84131
  "mode": "effort",
84151
84132
  "efforts": [
84152
- "minimal",
84153
84133
  "low",
84154
84134
  "medium",
84155
- "high",
84156
- "xhigh"
84135
+ "high"
84157
84136
  ]
84158
84137
  }
84159
84138
  },
@@ -84179,11 +84158,9 @@
84179
84158
  "thinking": {
84180
84159
  "mode": "effort",
84181
84160
  "efforts": [
84182
- "minimal",
84183
84161
  "low",
84184
84162
  "medium",
84185
- "high",
84186
- "xhigh"
84163
+ "high"
84187
84164
  ]
84188
84165
  }
84189
84166
  },
@@ -84208,11 +84185,9 @@
84208
84185
  "thinking": {
84209
84186
  "mode": "effort",
84210
84187
  "efforts": [
84211
- "minimal",
84212
84188
  "low",
84213
84189
  "medium",
84214
- "high",
84215
- "xhigh"
84190
+ "high"
84216
84191
  ]
84217
84192
  }
84218
84193
  },
@@ -84637,6 +84612,39 @@
84637
84612
  }
84638
84613
  },
84639
84614
  "zhipu-coding-plan": {
84615
+ "glm-4.5": {
84616
+ "id": "glm-4.5",
84617
+ "name": "glm-4.5",
84618
+ "api": "openai-completions",
84619
+ "provider": "zhipu-coding-plan",
84620
+ "baseUrl": "https://open.bigmodel.cn/api/coding/paas/v4",
84621
+ "reasoning": true,
84622
+ "input": [
84623
+ "text"
84624
+ ],
84625
+ "cost": {
84626
+ "input": 0,
84627
+ "output": 0,
84628
+ "cacheRead": 0,
84629
+ "cacheWrite": 0
84630
+ },
84631
+ "contextWindow": 131072,
84632
+ "maxTokens": 98304,
84633
+ "thinking": {
84634
+ "mode": "effort",
84635
+ "efforts": [
84636
+ "minimal",
84637
+ "low",
84638
+ "medium",
84639
+ "high"
84640
+ ]
84641
+ },
84642
+ "compat": {
84643
+ "thinkingFormat": "zai",
84644
+ "reasoningContentField": "reasoning_content",
84645
+ "supportsDeveloperRole": false
84646
+ }
84647
+ },
84640
84648
  "glm-4.5-air": {
84641
84649
  "id": "glm-4.5-air",
84642
84650
  "name": "GLM-4.5-Air",
@@ -84670,6 +84678,39 @@
84670
84678
  ]
84671
84679
  }
84672
84680
  },
84681
+ "glm-4.6": {
84682
+ "id": "glm-4.6",
84683
+ "name": "glm-4.6",
84684
+ "api": "openai-completions",
84685
+ "provider": "zhipu-coding-plan",
84686
+ "baseUrl": "https://open.bigmodel.cn/api/coding/paas/v4",
84687
+ "reasoning": true,
84688
+ "input": [
84689
+ "text"
84690
+ ],
84691
+ "cost": {
84692
+ "input": 0,
84693
+ "output": 0,
84694
+ "cacheRead": 0,
84695
+ "cacheWrite": 0
84696
+ },
84697
+ "contextWindow": 202752,
84698
+ "maxTokens": 131072,
84699
+ "thinking": {
84700
+ "mode": "effort",
84701
+ "efforts": [
84702
+ "minimal",
84703
+ "low",
84704
+ "medium",
84705
+ "high"
84706
+ ]
84707
+ },
84708
+ "compat": {
84709
+ "thinkingFormat": "zai",
84710
+ "reasoningContentField": "reasoning_content",
84711
+ "supportsDeveloperRole": false
84712
+ }
84713
+ },
84673
84714
  "glm-4.6v": {
84674
84715
  "id": "glm-4.6v",
84675
84716
  "name": "GLM-4.6V",
@@ -84737,6 +84778,39 @@
84737
84778
  ]
84738
84779
  }
84739
84780
  },
84781
+ "glm-5": {
84782
+ "id": "glm-5",
84783
+ "name": "GLM-5",
84784
+ "api": "openai-completions",
84785
+ "provider": "zhipu-coding-plan",
84786
+ "baseUrl": "https://open.bigmodel.cn/api/coding/paas/v4",
84787
+ "reasoning": true,
84788
+ "input": [
84789
+ "text"
84790
+ ],
84791
+ "cost": {
84792
+ "input": 0,
84793
+ "output": 0,
84794
+ "cacheRead": 0,
84795
+ "cacheWrite": 0
84796
+ },
84797
+ "contextWindow": 204800,
84798
+ "maxTokens": 131072,
84799
+ "thinking": {
84800
+ "mode": "effort",
84801
+ "efforts": [
84802
+ "minimal",
84803
+ "low",
84804
+ "medium",
84805
+ "high"
84806
+ ]
84807
+ },
84808
+ "compat": {
84809
+ "thinkingFormat": "zai",
84810
+ "reasoningContentField": "reasoning_content",
84811
+ "supportsDeveloperRole": false
84812
+ }
84813
+ },
84740
84814
  "glm-5-turbo": {
84741
84815
  "id": "glm-5-turbo",
84742
84816
  "name": "GLM-5-Turbo",
@@ -41,7 +41,6 @@ import {
41
41
  veniceModelManagerOptions,
42
42
  vercelAiGatewayModelManagerOptions,
43
43
  vllmModelManagerOptions,
44
- waferPassModelManagerOptions,
45
44
  waferServerlessModelManagerOptions,
46
45
  xaiModelManagerOptions,
47
46
  xaiOAuthModelManagerOptions,
@@ -219,7 +218,9 @@ export const CATALOG_PROVIDERS = [
219
218
  {
220
219
  id: "moonshot",
221
220
  defaultModel: "kimi-k2.7-code",
222
- envVars: ["MOONSHOT_API_KEY"],
221
+ // KIMI_API_KEY is the most intuitive name for a Kimi/Moonshot key; accept it
222
+ // as a fallback so China users need not learn MOONSHOT_API_KEY. (#2883)
223
+ envVars: ["MOONSHOT_API_KEY", "KIMI_API_KEY"],
223
224
  createModelManagerOptions: (config: ModelManagerConfig) => moonshotModelManagerOptions(config),
224
225
  catalogDiscovery: { label: "Moonshot" },
225
226
  },
@@ -347,13 +348,6 @@ export const CATALOG_PROVIDERS = [
347
348
  createModelManagerOptions: (config: ModelManagerConfig) => vllmModelManagerOptions(config),
348
349
  catalogDiscovery: { label: "vLLM", allowUnauthenticated: true },
349
350
  },
350
- {
351
- id: "wafer-pass",
352
- defaultModel: "GLM-5.1",
353
- envVars: ["WAFER_PASS_API_KEY"],
354
- createModelManagerOptions: (config: ModelManagerConfig) => waferPassModelManagerOptions(config),
355
- catalogDiscovery: { label: "Wafer Pass", oauthProvider: "wafer-pass" },
356
- },
357
351
  {
358
352
  id: "wafer-serverless",
359
353
  defaultModel: "GLM-5.1",
@@ -4,7 +4,7 @@ import {
4
4
  type OpenAICompatibleModelRecord,
5
5
  } from "../discovery/openai-compatible";
6
6
  import { Effort } from "../effort";
7
- import { toFireworksPublicModelId } from "../fireworks-model-id";
7
+ import { FIREWORKS_FAST_SUFFIX, toFireworksPublicModelId } from "../fireworks-model-id";
8
8
  import { isGlmVisionModelId, isGrokReasoningEffortCapable, isReasoningGlmModelId } from "../identity/family";
9
9
  import type { ModelManagerOptions } from "../model-manager";
10
10
  import { getBundledModels } from "../models";
@@ -197,6 +197,8 @@ function mapWithBundledReference<TApi extends Api>(
197
197
  ...reference,
198
198
  id: defaults.id,
199
199
  name,
200
+ api: defaults.api,
201
+ provider: defaults.provider,
200
202
  baseUrl: defaults.baseUrl,
201
203
  contextWindow: toPositiveNumber(entry.context_length, reference.contextWindow),
202
204
  maxTokens: toPositiveNumber(entry.max_completion_tokens, reference.maxTokens),
@@ -1258,6 +1260,51 @@ export function clampKimiK27CodeMaxTokens(modelId: string, candidate: number | n
1258
1260
  return isKimiK27CodeModelId(modelId) ? Math.min(candidate, KIMI_K27_CODE_RECOMMENDED_MAX_TOKENS) : candidate;
1259
1261
  }
1260
1262
 
1263
+ /**
1264
+ * Fireworks Fast variants we surface. Each inherits the base model's
1265
+ * limits/modalities/thinking and overrides only the cost with the Standard-column
1266
+ * Fast prices from the Serverless pricing table; `cacheWrite` stays 0 (Fireworks
1267
+ * bills no cache-write). Derived from the bundled base entries so metadata stays
1268
+ * in lockstep, and the runtime auto-falls back to the base id on a failed fast
1269
+ * request. See https://docs.fireworks.ai/serverless/pricing.
1270
+ */
1271
+ const FIREWORKS_FAST_VARIANT_SPECS: ReadonlyArray<{
1272
+ base: string;
1273
+ name: string;
1274
+ cost: { input: number; output: number; cacheRead: number };
1275
+ }> = [
1276
+ { base: "kimi-k2.7-code", name: "Kimi K2.7 Code Fast", cost: { input: 1.9, output: 8, cacheRead: 0.38 } },
1277
+ { base: "kimi-k2.6", name: "Kimi K2.6 Fast", cost: { input: 2, output: 8, cacheRead: 0.3 } },
1278
+ { base: "glm-5.1", name: "GLM-5.1 Fast", cost: { input: 2.8, output: 8.8, cacheRead: 0.52 } },
1279
+ ];
1280
+
1281
+ /**
1282
+ * Build the Fireworks Fast seed by projecting each base bundled spec into a
1283
+ * `<id>-fast` variant. Pushed into the generated catalog (Fast routers never
1284
+ * appear in the serverless control-plane list, so discovery cannot surface
1285
+ * them) and deduped behind any identical previous-snapshot entry.
1286
+ */
1287
+ export function buildFireworksFastSeed(): ModelSpec<"openai-completions">[] {
1288
+ const bundled = createBundledReferenceMap<"openai-completions">("fireworks");
1289
+ const seeds: ModelSpec<"openai-completions">[] = [];
1290
+ for (const variant of FIREWORKS_FAST_VARIANT_SPECS) {
1291
+ const base = bundled.get(variant.base);
1292
+ if (!base) continue;
1293
+ seeds.push({
1294
+ ...base,
1295
+ id: `${variant.base}${FIREWORKS_FAST_SUFFIX}`,
1296
+ name: variant.name,
1297
+ cost: {
1298
+ input: variant.cost.input,
1299
+ output: variant.cost.output,
1300
+ cacheRead: variant.cost.cacheRead,
1301
+ cacheWrite: 0,
1302
+ },
1303
+ });
1304
+ }
1305
+ return seeds;
1306
+ }
1307
+
1261
1308
  /**
1262
1309
  * Fireworks DeepSeek V4 accepts effort via `reasoning_effort` but rejects the
1263
1310
  * DeepSeek-native binary `thinking` toggle when both are present.
@@ -1523,7 +1570,7 @@ export function firepassModelManagerOptions(
1523
1570
  }
1524
1571
 
1525
1572
  // ---------------------------------------------------------------------------
1526
- // 7.7 Wafer (Pass + Serverless)
1573
+ // 7.7 Wafer Serverless
1527
1574
  // ---------------------------------------------------------------------------
1528
1575
 
1529
1576
  export interface WaferModelManagerConfig {
@@ -1536,13 +1583,14 @@ const WAFER_DEFAULT_BASE_URL = "https://pass.wafer.ai/v1";
1536
1583
  const WAFER_MAX_TOKENS_CAP = 65536;
1537
1584
 
1538
1585
  /**
1539
- * Shared mapper for Wafer's `/v1/models` records.
1586
+ * Mapper for Wafer Serverless `/v1/models` records.
1540
1587
  *
1541
- * Wafer wraps each entry with a `wafer` envelope describing tier, capabilities,
1542
- * and cents-per-million pricing. The mapper folds that metadata into the
1543
- * canonical `ModelSpec<"openai-completions">` shape and applies zai-family thinking
1544
- * compat when the entry advertises reasoning support (GLM-family on the Pass
1545
- * SKU). Cents-per-million → dollars-per-million via /100.
1588
+ * Wafer wraps each entry with a `wafer` envelope describing capabilities and
1589
+ * pricing. The mapper folds that metadata into the canonical
1590
+ * `ModelSpec<"openai-completions">` shape and applies upstream-specific thinking
1591
+ * compat when the entry advertises reasoning support. Wafer pricing is exposed
1592
+ * through internal wholesale units; the public Serverless rate equals
1593
+ * `cents × 125 / 10000`.
1546
1594
  */
1547
1595
  interface WaferRecord {
1548
1596
  context_length?: unknown;
@@ -1563,7 +1611,7 @@ function readWaferRecord(entry: OpenAICompatibleModelRecord): WaferRecord | unde
1563
1611
  }
1564
1612
 
1565
1613
  function mapWaferModel(
1566
- providerId: "wafer-pass" | "wafer-serverless",
1614
+ providerId: "wafer-serverless",
1567
1615
  baseUrl: string,
1568
1616
  entry: OpenAICompatibleModelRecord,
1569
1617
  defaults: ModelSpec<"openai-completions">,
@@ -1579,25 +1627,12 @@ function mapWaferModel(
1579
1627
  );
1580
1628
  const maxTokens = contextWindow !== null ? Math.min(contextWindow, WAFER_MAX_TOKENS_CAP) : null;
1581
1629
  const pricing = wafer?.pricing ?? {};
1582
- // Wafer's `/v1/models` exposes pricing through `*_cents_per_million` fields,
1583
- // but the values are an internal wholesale unit, not literal cents — across
1584
- // every published Serverless model on wafer.ai the user-facing rate equals
1585
- // `cents × 125 / 10000` (i.e. wholesale × 1.25 / 100; GLM-5.1's `120` →
1586
- // $1.50/M, Kimi-K2.6's `88` → $1.10/M, etc.). The multiply-first form keeps
1587
- // the result a finite dyadic for every observed value.
1588
- // For the Pass SKU the per-token rate is bundled in the flat-rate
1589
- // subscription, so we follow the convention shared with
1590
- // `kimi-code`/`firepass`/`alibaba-coding-plan` and seed every Pass model with
1591
- // `cost: 0` regardless of what the upstream envelope says.
1592
- const isPassSku = providerId === "wafer-pass";
1593
- const cost = isPassSku
1594
- ? { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 }
1595
- : {
1596
- input: (toPositiveNumber(pricing.input_cents_per_million, 0) * 125) / 10000,
1597
- output: (toPositiveNumber(pricing.output_cents_per_million, 0) * 125) / 10000,
1598
- cacheRead: (toPositiveNumber(pricing.cache_read_cents_per_million, 0) * 125) / 10000,
1599
- cacheWrite: 0,
1600
- };
1630
+ const cost = {
1631
+ input: (toPositiveNumber(pricing.input_cents_per_million, 0) * 125) / 10000,
1632
+ output: (toPositiveNumber(pricing.output_cents_per_million, 0) * 125) / 10000,
1633
+ cacheRead: (toPositiveNumber(pricing.cache_read_cents_per_million, 0) * 125) / 10000,
1634
+ cacheWrite: 0,
1635
+ };
1601
1636
  const name = toModelName(wafer?.display_name, defaults.name);
1602
1637
  const base: ModelSpec<"openai-completions"> = {
1603
1638
  ...defaults,
@@ -1643,13 +1678,12 @@ function mapWaferModel(
1643
1678
  };
1644
1679
  }
1645
1680
 
1646
- function createWaferOptions(
1647
- providerId: "wafer-pass" | "wafer-serverless",
1648
- config: WaferModelManagerConfig | undefined,
1681
+ export function waferServerlessModelManagerOptions(
1682
+ config?: WaferModelManagerConfig,
1649
1683
  ): ModelManagerOptions<"openai-completions"> {
1650
1684
  const apiKey = config?.apiKey;
1651
1685
  const baseUrl = config?.baseUrl ?? WAFER_DEFAULT_BASE_URL;
1652
- const passOnly = providerId === "wafer-pass";
1686
+ const providerId = "wafer-serverless" as const;
1653
1687
  return {
1654
1688
  providerId,
1655
1689
  ...(apiKey && {
@@ -1659,11 +1693,6 @@ function createWaferOptions(
1659
1693
  provider: providerId,
1660
1694
  baseUrl,
1661
1695
  apiKey,
1662
- filterModel: entry => {
1663
- if (!passOnly) return true;
1664
- const wafer = readWaferRecord(entry);
1665
- return wafer?.tier === "pass_included";
1666
- },
1667
1696
  mapModel: (entry, defaults) => mapWaferModel(providerId, baseUrl, entry, defaults),
1668
1697
  fetch: config?.fetch,
1669
1698
  }),
@@ -1671,18 +1700,6 @@ function createWaferOptions(
1671
1700
  };
1672
1701
  }
1673
1702
 
1674
- export function waferPassModelManagerOptions(
1675
- config?: WaferModelManagerConfig,
1676
- ): ModelManagerOptions<"openai-completions"> {
1677
- return createWaferOptions("wafer-pass", config);
1678
- }
1679
-
1680
- export function waferServerlessModelManagerOptions(
1681
- config?: WaferModelManagerConfig,
1682
- ): ModelManagerOptions<"openai-completions"> {
1683
- return createWaferOptions("wafer-serverless", config);
1684
- }
1685
-
1686
1703
  // ---------------------------------------------------------------------------
1687
1704
  // 7. Mistral
1688
1705
  // ---------------------------------------------------------------------------
@@ -2448,7 +2465,10 @@ export function moonshotModelManagerOptions(
2448
2465
  config?: MoonshotModelManagerConfig,
2449
2466
  ): ModelManagerOptions<"openai-completions"> {
2450
2467
  const apiKey = config?.apiKey;
2451
- const baseUrl = config?.baseUrl ?? "https://api.moonshot.ai/v1";
2468
+ // `MOONSHOT_BASE_URL` overrides the discovery base URL (and the streaming
2469
+ // request that inherits it), e.g. to target the Kimi China platform at
2470
+ // `api.moonshot.cn`; an explicit `config.baseUrl` still wins. Mirrors LITELLM_BASE_URL/LM_STUDIO_BASE_URL. (#2883)
2471
+ const baseUrl = config?.baseUrl ?? Bun.env.MOONSHOT_BASE_URL ?? "https://api.moonshot.ai/v1";
2452
2472
  const references = createBundledReferenceMap<"openai-completions">("moonshot");
2453
2473
  return {
2454
2474
  providerId: "moonshot",