@khanglvm/llm-router 2.6.0 → 2.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -7,19 +7,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
- ## [2.6.0] - 2026-04-23
11
-
12
- ### Added
13
- - Local `llama.cpp` variants can now persist a per-model runtime profile, including auto-tuned presets and custom launch overrides, so each GGUF variant can run with settings that match its own size and context shape instead of sharing one global `llama-server` startup profile.
14
- - The Web UI now exposes managed `llama.cpp` runtime health for Local Models, including tracked instance counts, healthy/stale summaries, and persisted runtime-profile data for each saved variant.
15
-
16
- ### Changed
17
- - Local variant requests are now resolved through a managed per-variant `llama.cpp` runtime layer that can reuse compatible instances, allocate fallback ports safely, and start the right runtime configuration for the specific model variant without exposing multi-process lifecycle management to the user.
18
- - Hugging Face GGUF search/download flows now surface file size plus estimated runtime memory guidance directly in the Local Models workflow, making it easier to choose a viable quantization before download.
19
-
20
- ### Fixed
21
- - Managed `llama.cpp` runtimes now reconcile stale tracked instances before reuse, avoid reserving dead immediate-exit servers, and drain pending shutdown/startup edges more reliably so local per-model routing does not leave behind stale `llama-server` processes.
22
-
23
10
  ## [2.5.2] - 2026-04-23
24
11
 
25
12
  ### Fixed
package/README.md CHANGED
@@ -44,9 +44,6 @@ Open `llr` and use the **Local Models** tab to manage local inference sources al
44
44
  - **Native macOS browsing** — use the built-in file picker to choose a single GGUF file, scan a folder recursively for GGUF models, or browse directly to a local `llama-server` binary
45
45
  - **Managed + attached model library** — stale or moved files stay visible instead of crashing the app, and can be repaired by locating the file again or removed cleanly
46
46
  - **Router-visible local variants** — create friendly model variants with bounded presets, context-window metadata, preload toggles, and Mac unified-memory fit guidance with clearer safe/tight recommendations
47
- - **Per-variant llama.cpp tuning** — each local variant can store its own runtime profile so balanced, throughput, long-context, low-memory, or custom launch overrides do not fight over one shared global `llama-server` config
48
- - **Managed per-model runtimes** — the router automatically starts, reuses, and stops the right `llama.cpp` instance for the requested local variant, with stale-runtime cleanup handled internally instead of asking the user to manage separate servers
49
- - **GGUF size + memory guidance** — Hugging Face search results now show model file size plus estimated runtime memory fit guidance before download, helping choose viable quantizations faster
50
47
  - **Alias-ready local routing** — once saved, local variants behave like normal router models and can be used in aliases, capability flags, and fallback chains
51
48
 
52
49
  For v1, the managed download flow only searches public Hugging Face GGUF files and the fit guidance is tuned for Macs with unified memory.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@khanglvm/llm-router",
3
- "version": "2.6.0",
3
+ "version": "2.6.2",
4
4
  "description": "LLM Router: single gateway endpoint for multi-provider LLMs with unified OpenAI+Anthropic format and seamless fallback",
5
5
  "keywords": [
6
6
  "llm-router",
@@ -66,6 +66,7 @@ import {
66
66
  sanitizeConfigForDisplay,
67
67
  validateRuntimeConfig
68
68
  } from "../runtime/config.js";
69
+ import { normalizeQuotaProbeConfig } from "../runtime/quota-probe.js";
69
70
  import {
70
71
  CODEX_SUBSCRIPTION_MODELS,
71
72
  CLAUDE_CODE_SUBSCRIPTION_MODELS
@@ -8041,6 +8042,234 @@ async function doSetProviderRateLimits(context) {
8041
8042
  };
8042
8043
  }
8043
8044
 
8045
/**
 * Parse the raw `--probe-headers` CLI value into a list of header entries.
 *
 * Two JSON shapes are accepted and normalized to the same output:
 *   - an array of `{ key, value }` objects
 *   - an object of `{ headerName: value }` pairs
 *
 * Values are coerced to strings in both shapes (a JSON `null` becomes an empty
 * string rather than the literal text "null"). An array containing anything
 * other than objects with a non-blank `key` is treated as unparseable.
 *
 * @param {unknown} raw - Raw argument value; falsy or blank input means "not provided".
 * @returns {Array<{key: string, value: string}>|undefined} Normalized entries,
 *   or `undefined` when nothing usable was supplied (missing, not JSON, or wrong shape).
 */
function parseProbeHeaders(raw) {
  if (!raw) return undefined;
  const text = String(raw).trim();
  if (!text) return undefined;

  let parsed;
  try {
    parsed = JSON.parse(text);
  } catch {
    // Not JSON: signalled to the caller as "no usable headers".
    return undefined;
  }

  const toHeaderValue = (value) => (value === null || value === undefined ? "" : String(value));

  if (Array.isArray(parsed)) {
    const entries = [];
    for (const item of parsed) {
      if (typeof item !== "object" || item === null || Array.isArray(item)) return undefined;
      const key = item.key === null || item.key === undefined ? "" : String(item.key);
      // A header without a name can never be sent; reject the whole list
      // instead of passing a malformed entry downstream.
      if (!key.trim()) return undefined;
      entries.push({ key, value: toHeaderValue(item.value) });
    }
    return entries;
  }

  if (typeof parsed === "object" && parsed !== null) {
    return Object.entries(parsed).map(([key, value]) => ({ key, value: toHeaderValue(value) }));
  }

  // Primitive JSON (number, string, boolean, null) is not a header list.
  return undefined;
}
8060
+
8061
/**
 * Parse the raw `--probe-mapping` CLI value into a mapping object.
 *
 * Only a JSON object is a valid mapping. Other JSON values (numbers, strings,
 * booleans, arrays, null) are rejected so that a truthy non-object can never
 * be stored as the probe mapping.
 *
 * @param {unknown} raw - Raw argument value; falsy or blank input means "not provided".
 * @returns {object|undefined} The parsed mapping object, or `undefined` when the
 *   input is missing, is not valid JSON, or is not a JSON object.
 */
function parseProbeMapping(raw) {
  if (!raw) return undefined;
  const text = String(raw).trim();
  if (!text) return undefined;

  let parsed;
  try {
    parsed = JSON.parse(text);
  } catch {
    return undefined;
  }

  const isPlainObject = typeof parsed === "object" && parsed !== null && !Array.isArray(parsed);
  return isPlainObject ? parsed : undefined;
}
8071
+
8072
/**
 * Build a single mapping field entry (`{ path, as }`) from a CLI-provided path.
 *
 * The path is trimmed before the emptiness check so that a whitespace-only
 * value is treated as "not provided" instead of producing an entry with an
 * empty path.
 *
 * @param {unknown} pathStr - Path into the probe response; falsy or blank means "not provided".
 * @param {string} [coerceAs] - Coercion type for the field; defaults to "number".
 * @returns {{path: string, as: string}|undefined} The entry, or `undefined` when no path was given.
 */
function buildMappingFieldEntry(pathStr, coerceAs) {
  if (!pathStr) return undefined;
  const path = String(pathStr).trim();
  if (!path) return undefined;
  return { path, as: coerceAs || "number" };
}
8076
+
8077
/**
 * Return a copy of `config` with the quota probe of one provider replaced.
 *
 * The input config is never mutated; the update is applied to a
 * `structuredClone` of it. Passing `null` (or `undefined`) as `quotaProbe`
 * removes the `quotaProbe` key from the provider instead of storing a null
 * placeholder, which matches the "disable and remove" wording of the CLI help.
 *
 * @param {object} config - Router configuration containing a `providers` array.
 * @param {{providerId: string, quotaProbe: object|null|undefined}} update
 *   Provider to update and the probe config to store (nullish removes it).
 * @returns {{config: object, changed: boolean, reason: string, providerId?: string, quotaProbe?: object|null}}
 *   On failure `changed` is false and `reason` explains why. When the update
 *   introduces validation errors (as reported by
 *   `findIntroducedConfigValidationErrors`) the original config is returned.
 */
export function setProviderQuotaProbeInConfig(config, { providerId, quotaProbe }) {
  const next = structuredClone(config);
  const normalizedProviderId = String(providerId || "").trim();
  if (!normalizedProviderId) {
    return { config: next, changed: false, reason: "provider-id is required." };
  }

  // Optional chaining guards against null/garbage entries in the providers list.
  const provider = (next.providers || []).find((item) => item?.id === normalizedProviderId);
  if (!provider) {
    return { config: next, changed: false, reason: `Provider '${normalizedProviderId}' not found.` };
  }

  const previous = provider.quotaProbe || null;
  // Treat undefined like null so that "no probe -> no probe" is never reported as a change.
  const nextProbe = quotaProbe ?? null;
  if (nextProbe === null) {
    delete provider.quotaProbe;
  } else {
    provider.quotaProbe = nextProbe;
  }

  const validationErrors = findIntroducedConfigValidationErrors(config, next);
  if (validationErrors.length > 0) {
    return { config, changed: false, reason: formatConfigValidationError(validationErrors) };
  }

  return {
    config: next,
    changed: serializeStable(previous) !== serializeStable(nextProbe),
    reason: "",
    providerId: normalizedProviderId,
    quotaProbe: nextProbe
  };
}
8101
+
8102
/**
 * Render a human-readable, multi-line summary of a provider's quota probe.
 *
 * Tolerates partially populated probe objects: missing safety-margin fields
 * are shown as 0 (the CLI default), a missing timeout line is omitted rather
 * than printed as "undefinedms", and a missing custom source is reported as
 * 0 chars instead of throwing.
 *
 * @param {string} providerId - Provider id used in the headline.
 * @param {object|null|undefined} probe - Probe config; falsy means the probe is disabled.
 * @returns {string} Report text with lines joined by "\n".
 */
function buildQuotaProbeReport(providerId, probe) {
  if (!probe) return `Provider '${providerId}': quota probe disabled.`;

  const lines = [
    `Provider '${providerId}': quota probe configured.`,
    ` enabled: ${probe.enabled}`,
    ` mode: ${probe.mode}`,
    ` capKind: ${probe.capKind}`,
    ` combinator: ${probe.combinator}`,
    ` enforce: ${probe.enforce}`
  ];

  if (probe.safetyMargin) {
    const dollars = probe.safetyMargin.dollars ?? 0;
    const percent = probe.safetyMargin.percent ?? 0;
    lines.push(` margin: $${dollars} or ${percent}%`);
  }

  if (probe.mode === "http" && probe.http) {
    lines.push(` url: ${probe.http.method} ${probe.http.url}`);
    if (probe.http.timeoutMs !== undefined && probe.http.timeoutMs !== null) {
      lines.push(` timeout: ${probe.http.timeoutMs}ms`);
    }
    if (probe.http.headers?.length) {
      // Only header names are listed; values may contain secrets.
      lines.push(` headers: ${probe.http.headers.map((h) => h.key).join(", ")}`);
    }
  }

  if (probe.mode === "custom" && probe.custom) {
    if (probe.custom.timeoutMs !== undefined && probe.custom.timeoutMs !== null) {
      lines.push(` timeout: ${probe.custom.timeoutMs}ms`);
    }
    lines.push(` source: ${String(probe.custom.source ?? "").length} chars`);
  }

  return lines.join("\n");
}
8126
+
8127
/**
 * CLI handler for `set-quota-probe` / `set-provider-quota-probe`.
 *
 * Reads the config file, builds a quota probe config for one provider from
 * either `--quota-probe-json` or the individual `--probe-*` flags (falling
 * back to the provider's existing probe for anything not supplied), checks it
 * with `normalizeQuotaProbeConfig`, and writes the config back when it changed.
 * `--disable-quota-probe=true` clears the probe instead.
 *
 * Note: the un-normalized `probeConfig` is what gets stored; the normalized
 * form is only used for validation and for the printed report.
 *
 * @param {{args?: object, mode: string}} context - CLI invocation context.
 * @returns {Promise<{ok: boolean, mode: string, exitCode: number, data?: string, errorMessage?: string}>}
 *   Result object; validation problems are reported with `EXIT_VALIDATION`.
 */
async function doSetQuotaProbe(context) {
  const args = context.args || {};
  const fail = (errorMessage) => ({ ok: false, mode: context.mode, exitCode: EXIT_VALIDATION, errorMessage });
  const succeed = (data) => ({ ok: true, mode: context.mode, exitCode: EXIT_SUCCESS, data });

  const configPath = readArg(args, ["config", "configPath"], getDefaultConfigPath());
  const config = await readConfigFile(configPath);
  const providerId = String(readArg(args, ["provider-id", "providerId"], "") || "").trim();

  if (!providerId) {
    return fail("provider-id is required.");
  }

  // A config without a providers array yields "not found" instead of a TypeError.
  const providers = Array.isArray(config?.providers) ? config.providers : [];
  const provider = providers.find((item) => item?.id === providerId);
  if (!provider) {
    return fail(`Provider '${providerId}' not found.`);
  }

  const disableProbe = toBoolean(readArg(args, ["disable-quota-probe", "disableQuotaProbe"], false), false);
  if (disableProbe) {
    const result = setProviderQuotaProbeInConfig(config, { providerId, quotaProbe: null });
    if (!result.changed && result.reason) {
      return fail(result.reason);
    }
    // Same rule as the update path below: do not rewrite the file when nothing changed.
    if (result.changed) {
      await writeConfigFile(result.config, configPath);
    }
    return succeed(`Provider '${providerId}': quota probe disabled.`);
  }

  const quotaProbeJsonRaw = readArg(args, ["quota-probe-json", "quotaProbeJson"], undefined);
  let probeConfig;

  if (quotaProbeJsonRaw) {
    try {
      probeConfig = JSON.parse(String(quotaProbeJsonRaw));
    } catch {
      return fail("quota-probe-json must be valid JSON.");
    }
    // Arrays are typeof "object" too, but are not a valid probe config.
    if (typeof probeConfig !== "object" || probeConfig === null || Array.isArray(probeConfig)) {
      return fail("quota-probe-json must be a JSON object.");
    }
    if (!("enabled" in probeConfig)) probeConfig.enabled = true;
  } else {
    const existing = provider.quotaProbe || {};
    const mode = String(readArg(args, ["probe-mode", "probeMode"], existing.mode || "http") || "http").trim();
    const capKind = String(readArg(args, ["cap-kind", "capKind"], existing.capKind || "") || "").trim();
    const combinator = String(readArg(args, ["combinator"], existing.combinator || "AND") || "AND").trim().toUpperCase();
    const enforce = String(readArg(args, ["enforce"], existing.enforce || "gate") || "gate").trim();
    const marginDollars = toNumber(readArg(args, ["safety-margin-dollars", "safetyMarginDollars"], existing.safetyMargin?.dollars), 0);
    const marginPercent = toNumber(readArg(args, ["safety-margin-percent", "safetyMarginPercent"], existing.safetyMargin?.percent), 0);

    if (!capKind) {
      return fail("cap-kind is required (dollars | tokens | requests).");
    }

    probeConfig = {
      enabled: true,
      capKind,
      combinator,
      enforce,
      mode,
      safetyMargin: { dollars: marginDollars, percent: marginPercent }
    };

    if (mode === "http") {
      const existingHttp = existing.http || {};
      const url = String(readArg(args, ["probe-url", "probeUrl"], existingHttp.url || "") || "").trim();
      const method = String(readArg(args, ["probe-method", "probeMethod"], existingHttp.method || "GET") || "GET").trim().toUpperCase();
      const timeoutMs = toNumber(readArg(args, ["probe-timeout", "probeTimeout"], existingHttp.timeoutMs), undefined);
      const headersRaw = readArg(args, ["probe-headers", "probeHeaders"], undefined);
      const bodyRaw = readArg(args, ["probe-body", "probeBody"], undefined);

      if (!url) {
        return fail("probe-url is required for HTTP mode.");
      }

      const parsedHeaders = parseProbeHeaders(headersRaw);
      const headersWereSupplied = headersRaw !== undefined && headersRaw !== null && String(headersRaw).trim() !== "";
      if (headersWereSupplied && parsedHeaders === undefined) {
        // Silently falling back to the old headers would hide a typo in an auth header.
        return fail("probe-headers must be a JSON array of {key,value} objects or a JSON object.");
      }
      const headers = parsedHeaders ?? existingHttp.headers ?? [];
      const body = bodyRaw !== undefined ? String(bodyRaw) : existingHttp.body;

      const mappingJsonRaw = readArg(args, ["probe-mapping", "probeMapping"], undefined);
      let mapping;
      if (mappingJsonRaw) {
        mapping = parseProbeMapping(mappingJsonRaw);
        // Valid JSON that is not an object (number, string, array) is not a usable mapping.
        if (!mapping || typeof mapping !== "object" || Array.isArray(mapping)) {
          return fail("probe-mapping must be a valid JSON object.");
        }
      } else {
        const existingMapping = existingHttp.mapping || {};
        const usedPath = readArg(args, ["probe-mapping-used", "probeMappingUsed"], undefined);
        const limitPath = readArg(args, ["probe-mapping-limit", "probeMappingLimit"], undefined);
        const remainingPath = readArg(args, ["probe-mapping-remaining", "probeMappingRemaining"], undefined);
        const resetAtPath = readArg(args, ["probe-mapping-reset-at", "probeMappingResetAt"], undefined);
        const isUnlimitedPath = readArg(args, ["probe-mapping-is-unlimited", "probeMappingIsUnlimited"], undefined);
        mapping = { ...existingMapping };
        if (usedPath) mapping.used = buildMappingFieldEntry(usedPath, "number");
        if (limitPath) mapping.limit = buildMappingFieldEntry(limitPath, "number");
        if (remainingPath) mapping.remaining = buildMappingFieldEntry(remainingPath, "number");
        if (resetAtPath) mapping.resetAt = buildMappingFieldEntry(resetAtPath, "datetime");
        if (isUnlimitedPath) mapping.isUnlimited = buildMappingFieldEntry(isUnlimitedPath, "boolean");
      }

      probeConfig.http = { method, url, headers, timeoutMs, mapping };
      if (body !== undefined) probeConfig.http.body = body;
    } else if (mode === "custom") {
      const existingCustom = existing.custom || {};
      const source = readArg(args, ["custom-source", "customSource"], existingCustom.source || "");
      const timeoutMs = toNumber(readArg(args, ["probe-timeout", "probeTimeout"], existingCustom.timeoutMs), undefined);
      if (!source) {
        return fail("custom-source is required for custom mode.");
      }
      probeConfig.custom = { source: String(source), timeoutMs };
    }

    const refreshOnUiOpen = toBoolean(readArg(args, ["refresh-on-ui-open", "refreshOnUiOpen"], undefined), undefined);
    const refreshOnResetAt = toBoolean(readArg(args, ["refresh-on-reset-at", "refreshOnResetAt"], undefined), undefined);
    const refreshOnErrorRaw = readArg(args, ["refresh-on-upstream-error", "refreshOnUpstreamError"], undefined);
    const anyRefreshFlag = refreshOnUiOpen !== undefined || refreshOnResetAt !== undefined || refreshOnErrorRaw !== undefined;

    if (anyRefreshFlag) {
      const existingTriggers = existing.refreshTriggers || {};
      probeConfig.refreshTriggers = {
        onUiOpen: refreshOnUiOpen !== undefined ? refreshOnUiOpen : !!existingTriggers.onUiOpen,
        onManual: true,
        onResetAt: refreshOnResetAt !== undefined ? refreshOnResetAt : !!existingTriggers.onResetAt,
        onUpstreamError: null
      };
      if (refreshOnErrorRaw) {
        // Drop blank segments first: Number("") is 0, which would otherwise be
        // accepted as a status code for input such as "429,".
        const codes = String(refreshOnErrorRaw)
          .split(",")
          .map((part) => part.trim())
          .filter((part) => part !== "")
          .map((part) => Number(part))
          .filter((code) => Number.isInteger(code) && code >= 100 && code <= 599);
        if (codes.length > 0) probeConfig.refreshTriggers.onUpstreamError = { statusCodes: codes };
      } else if (existingTriggers.onUpstreamError) {
        probeConfig.refreshTriggers.onUpstreamError = existingTriggers.onUpstreamError;
      }
    } else if (existing.refreshTriggers) {
      // Every other field falls back to the existing probe; without this the
      // stored triggers would be lost whenever an unrelated flag is updated.
      probeConfig.refreshTriggers = existing.refreshTriggers;
    }
  }

  const normalized = normalizeQuotaProbeConfig(probeConfig);
  if (!normalized) {
    return fail("Invalid quota probe config. Ensure enabled=true and capKind is one of: dollars, tokens, requests.");
  }

  const result = setProviderQuotaProbeInConfig(config, { providerId, quotaProbe: probeConfig });
  if (!result.changed && result.reason) {
    return fail(result.reason);
  }
  if (!result.changed) {
    return succeed(buildQuotaProbeReport(providerId, normalized) + "\n(no changes)");
  }
  await writeConfigFile(result.config, configPath);
  return succeed(buildQuotaProbeReport(providerId, normalized));
}
8272
+
8044
8273
  async function doSetMasterKey(context) {
8045
8274
  const args = context.args || {};
8046
8275
  const configPath = readArg(args, ["config", "configPath"], getDefaultConfigPath());
@@ -8554,6 +8783,9 @@ async function runConfigAction(context) {
8554
8783
  case "set-model-fallbacks":
8555
8784
  case "set-model-fallback":
8556
8785
  return doSetModelFallbacks(context);
8786
+ case "set-quota-probe":
8787
+ case "set-provider-quota-probe":
8788
+ return doSetQuotaProbe(context);
8557
8789
  case "set-master-key":
8558
8790
  return doSetMasterKey(context);
8559
8791
  case "set-amp-config":
@@ -10575,7 +10807,7 @@ const routerModule = {
10575
10807
  },
10576
10808
  {
10577
10809
  actionId: "config",
10578
- description: "Config manager for providers, diagnostics, coding-tool routing, AMP, and startup service.",
10810
+ description: "Config manager for providers, diagnostics, coding-tool routing, AMP, quota probes, and startup service.",
10579
10811
  tui: { steps: ["cli-only"] },
10580
10812
  commandline: {
10581
10813
  requiredArgs: [],
@@ -10610,6 +10842,29 @@ const routerModule = {
10610
10842
  "rate-limits",
10611
10843
  "remove-bucket",
10612
10844
  "replace-rate-limits",
10845
+ "probe-mode",
10846
+ "probe-url",
10847
+ "probe-method",
10848
+ "probe-headers",
10849
+ "probe-body",
10850
+ "probe-timeout",
10851
+ "probe-mapping",
10852
+ "probe-mapping-used",
10853
+ "probe-mapping-limit",
10854
+ "probe-mapping-remaining",
10855
+ "probe-mapping-reset-at",
10856
+ "probe-mapping-is-unlimited",
10857
+ "cap-kind",
10858
+ "combinator",
10859
+ "enforce",
10860
+ "safety-margin-dollars",
10861
+ "safety-margin-percent",
10862
+ "custom-source",
10863
+ "quota-probe-json",
10864
+ "disable-quota-probe",
10865
+ "refresh-on-ui-open",
10866
+ "refresh-on-reset-at",
10867
+ "refresh-on-upstream-error",
10613
10868
  "alias-id",
10614
10869
  "alias",
10615
10870
  "targets",
@@ -10681,7 +10936,7 @@ const routerModule = {
10681
10936
  ]
10682
10937
  },
10683
10938
  help: {
10684
- summary: `Manage providers, diagnostics, config validation, coding-tool routing, model aliases, rate-limit buckets, AMP proxy settings, master key, and OS startup. \`${CLI_COMMAND} config\` opens the web console by default; use \`--operation\` for direct CLI actions.`,
10939
+ summary: `Manage providers, diagnostics, config validation, coding-tool routing, model aliases, rate-limit buckets, quota probes (external provider budget monitoring), AMP proxy settings, master key, and OS startup. \`${CLI_COMMAND} config\` opens the web console by default; use \`--operation\` for direct CLI actions.`,
10685
10940
  args: [
10686
10941
  { name: "operation", required: false, description: "Config operation (optional; defaults to a config summary when omitted in direct CLI mode).", example: "--operation=upsert-provider" },
10687
10942
  { name: "provider-id", required: false, description: "Provider id (lowercase letters/numbers/dashes).", example: "--provider-id=openrouter-primary" },
@@ -10718,6 +10973,29 @@ const routerModule = {
10718
10973
  { name: "remove-bucket", required: false, description: "Remove bucket by --bucket-id in set-provider-rate-limits.", example: "--remove-bucket=true" },
10719
10974
  { name: "replace-rate-limits", required: false, description: "Replace all provider buckets with provided entries.", example: "--replace-rate-limits=true" },
10720
10975
  { name: "rate-limits", required: false, description: "Rate-limit bucket JSON object/array for bulk update.", example: "--rate-limits='[{\"id\":\"or-month\",\"models\":[\"all\"],\"requests\":20000,\"window\":{\"unit\":\"month\",\"size\":1}}]'" },
10976
+ { name: "probe-mode", required: false, description: "For set-quota-probe: probe execution mode. 'http' sends an HTTP request to a provider quota/usage endpoint and maps the JSON response to a normalized snapshot. 'custom' runs a sandboxed JS function. Default: http.", example: "--probe-mode=http" },
10977
+ { name: "probe-url", required: false, description: "For set-quota-probe (HTTP mode): the full URL of the provider's quota/usage/subscription API endpoint that returns JSON with usage data. Supports {{providerApiKey}}, {{providerBaseUrl}}, {{providerId}}, and {{env.VAR_NAME}} shortcodes for secret interpolation.", example: "--probe-url=https://ramclouds.me/api/subscription/self" },
10978
+ { name: "probe-method", required: false, description: "For set-quota-probe (HTTP mode): HTTP method. Default: GET.", example: "--probe-method=GET" },
10979
+ { name: "probe-headers", required: false, description: "For set-quota-probe (HTTP mode): request headers as a JSON array of {key,value} objects or a JSON object {key:value}. Use {{providerApiKey}} to interpolate the provider's API key, or {{env.VAR_NAME}} to interpolate environment variables for secrets that differ from the provider API key (e.g. a separate system token).", example: "--probe-headers='[{\"key\":\"Authorization\",\"value\":\"Bearer {{env.RC_TOKEN}}\"},{\"key\":\"New-Api-User\",\"value\":\"{{env.RC_USER}}\"}]'" },
10980
+ { name: "probe-body", required: false, description: "For set-quota-probe (HTTP mode, POST only): request body string. Supports the same {{shortcode}} interpolation as probe-headers.", example: "--probe-body='{\"action\":\"get_usage\"}'" },
10981
+ { name: "probe-timeout", required: false, description: "For set-quota-probe: request timeout in milliseconds. HTTP mode default: 5000 (max 15000). Custom mode default: 2000 (max 10000).", example: "--probe-timeout=10000" },
10982
+ { name: "probe-mapping", required: false, description: "For set-quota-probe (HTTP mode): full JSON mapping object that maps provider API response JSON paths to normalized snapshot fields. Each field has {path, as} where 'path' is a dot-path like '$.data.used_quota' and 'as' is the coercion type. Coercion types: 'number' (numeric), 'dollars-from-cents' (divides by 100), 'boolean', 'datetime' (ISO-8601/epoch/duration), 'raw'. Alternative to individual --probe-mapping-* flags.", example: "--probe-mapping='{\"used\":{\"path\":\"$.data.used\",\"as\":\"number\"},\"limit\":{\"path\":\"$.data.limit\",\"as\":\"number\"}}'" },
10983
+ { name: "probe-mapping-used", required: false, description: "For set-quota-probe (HTTP mode): JSON dot-path in the provider API response to the 'used' quota value (how much has been consumed). Coerced as number. At least 2 of {used, limit, remaining} are required; the third is auto-derived.", example: "--probe-mapping-used=$.data.used_quota" },
10984
+ { name: "probe-mapping-limit", required: false, description: "For set-quota-probe (HTTP mode): JSON dot-path to the 'limit' value (total quota cap). Coerced as number.", example: "--probe-mapping-limit=$.data.quota_limit" },
10985
+ { name: "probe-mapping-remaining", required: false, description: "For set-quota-probe (HTTP mode): JSON dot-path to the 'remaining' value (quota left). Coerced as number.", example: "--probe-mapping-remaining=$.data.remaining_quota" },
10986
+ { name: "probe-mapping-reset-at", required: false, description: "For set-quota-probe (HTTP mode): JSON dot-path to the reset timestamp (when quota resets). Coerced as datetime (auto-detects ISO-8601, epoch seconds, epoch milliseconds, or duration strings like '2h' or 'PT30M').", example: "--probe-mapping-reset-at=$.data.reset_at" },
10987
+ { name: "probe-mapping-is-unlimited", required: false, description: "For set-quota-probe (HTTP mode): JSON dot-path to a boolean indicating unlimited quota. When true, the probe always reports 'available' regardless of used/limit values.", example: "--probe-mapping-is-unlimited=$.data.is_unlimited" },
10988
+ { name: "cap-kind", required: false, description: "For set-quota-probe: the unit of the quota cap reported by the provider. Determines how used/limit/remaining values are interpreted. Values: 'dollars' (monetary budget), 'tokens' (token count), 'requests' (request count).", example: "--cap-kind=dollars" },
10989
+ { name: "combinator", required: false, description: "For set-quota-probe: how the quota probe verdict combines with local rate-limit verdict to decide if a provider is eligible. 'AND' = both must pass (default, safest). 'OR' = either can pass (lenient). 'REPLACE' = probe verdict replaces rate-limit entirely.", example: "--combinator=AND" },
10990
+ { name: "enforce", required: false, description: "For set-quota-probe: enforcement mode. 'gate' = blocks routing when quota exhausted (production use). 'observe' = logs verdict but never blocks (dry-run/testing). Default: gate.", example: "--enforce=gate" },
10991
+ { name: "safety-margin-dollars", required: false, description: "For set-quota-probe: dollar-based safety margin. Provider is considered exhausted when remaining ≤ this value. Applied as max(dollars, limit×percent/100). Default: 0.", example: "--safety-margin-dollars=1" },
10992
+ { name: "safety-margin-percent", required: false, description: "For set-quota-probe: percentage-based safety margin. Provider is considered exhausted when remaining ≤ limit×percent/100. Applied as max(dollars, limit×percent/100). Default: 0.", example: "--safety-margin-percent=2" },
10993
+ { name: "custom-source", required: false, description: "For set-quota-probe (custom mode): JavaScript async function source that runs in a sandboxed VM. Receives ctx object with {fetch, providerApiKey, providerBaseUrl, providerId}. Must return {capKind, used, limit} or {capKind, remaining, limit}. No access to process, require, or globalThis.", example: "--custom-source='export default async function(ctx) { const r = await ctx.fetch(\"https://api.example.com/usage\", {headers:{\"Authorization\":\"Bearer \"+ctx.providerApiKey}}); const d = await r.json(); return {capKind:\"dollars\",used:d.used,limit:d.limit}; }'" },
10994
+ { name: "quota-probe-json", required: false, description: "For set-quota-probe: provide the full quotaProbe config as a single JSON object. Overrides all other probe flags. Useful when the config is complex or pre-built. The object is written directly to the provider's quotaProbe field.", example: "--quota-probe-json='{\"enabled\":true,\"capKind\":\"dollars\",\"mode\":\"http\",\"combinator\":\"AND\",\"enforce\":\"gate\",\"http\":{\"method\":\"GET\",\"url\":\"https://example.com/api/usage\",\"headers\":[{\"key\":\"Authorization\",\"value\":\"Bearer {{providerApiKey}}\"}],\"mapping\":{\"used\":{\"path\":\"$.used\",\"as\":\"number\"},\"limit\":{\"path\":\"$.limit\",\"as\":\"number\"}}}}'" },
10995
+ { name: "disable-quota-probe", required: false, description: "For set-quota-probe: set to true to disable and remove the quota probe config from the provider. The provider will no longer be gated by external quota checks.", example: "--disable-quota-probe=true" },
10996
+ { name: "refresh-on-ui-open", required: false, description: "For set-quota-probe: auto-refresh the quota snapshot when the web console UI is opened. Default: false.", example: "--refresh-on-ui-open=true" },
10997
+ { name: "refresh-on-reset-at", required: false, description: "For set-quota-probe: schedule an automatic refresh at the resetAt timestamp returned by the probe. Useful when the provider reports when the quota window rolls over. Default: false.", example: "--refresh-on-reset-at=true" },
10998
+ { name: "refresh-on-upstream-error", required: false, description: "For set-quota-probe: comma-separated HTTP status codes from upstream provider errors that should trigger a quota probe refresh. Common: 429 (rate limited), 402 (payment required).", example: "--refresh-on-upstream-error=429,402" },
10721
10999
  { name: "format", required: false, description: "Manual format if probe is skipped.", example: "--format=openai" },
10722
11000
  { name: "headers", required: false, description: "Custom provider headers as JSON object (default User-Agent applied when omitted).", example: "--headers={\"User-Agent\":\"Mozilla/5.0\"}" },
10723
11001
  { name: "skip-probe", required: false, description: "Skip live endpoint/model probe.", example: "--skip-probe=true" },
@@ -10795,6 +11073,10 @@ const routerModule = {
10795
11073
  `${CLI_COMMAND} config --operation=set-provider-rate-limits --provider-id=openrouter --bucket-id=openrouter-all-month --bucket-models=all --bucket-requests=20000 --bucket-window=month:1`,
10796
11074
  `${CLI_COMMAND} config --operation=set-provider-rate-limits --provider-id=openrouter --bucket-name="6-hours cap" --bucket-models=all --bucket-requests=600 --bucket-window=hour:6`,
10797
11075
  `${CLI_COMMAND} config --operation=migrate-config --target-version=2 --create-backup=true`,
11076
+ `${CLI_COMMAND} config --operation=set-quota-probe --provider-id=ramclouds --cap-kind=dollars --probe-url=https://ramclouds.me/api/subscription/self --probe-headers='[{"key":"Authorization","value":"Bearer {{env.RC_TOKEN}}"},{"key":"New-Api-User","value":"{{env.RC_USER}}"}]' --probe-mapping-used=$.data.used_quota --probe-mapping-limit=$.data.quota_limit --safety-margin-dollars=1 --combinator=AND --enforce=gate`,
11077
+ `${CLI_COMMAND} config --operation=set-quota-probe --provider-id=openrouter --cap-kind=dollars --probe-url=https://openrouter.ai/api/v1/auth/key --probe-headers='{"Authorization":"Bearer {{providerApiKey}}"}' --probe-mapping-used=$.data.usage --probe-mapping-limit=$.data.limit --refresh-on-upstream-error=429,402`,
11078
+ `${CLI_COMMAND} config --operation=set-quota-probe --provider-id=myapi --cap-kind=tokens --probe-url=https://api.example.com/usage --probe-method=GET --probe-headers='{"Authorization":"Bearer {{providerApiKey}}"}' --probe-mapping-remaining=$.remaining --probe-mapping-limit=$.total --enforce=observe`,
11079
+ `${CLI_COMMAND} config --operation=set-quota-probe --provider-id=ramclouds --disable-quota-probe=true`,
10798
11080
  `${CLI_COMMAND} config --operation=set-model-fallbacks --provider-id=openrouter --model=gpt-4o --fallback-models=anthropic/claude-3-7-sonnet,openrouter/gpt-4.1-mini`,
10799
11081
  `${CLI_COMMAND} config --operation=remove-model --provider-id=openrouter --model=gpt-4o`,
10800
11082
  `${CLI_COMMAND} config --operation=set-amp-config --patch-amp-client-config=true --amp-client-settings-scope=workspace --amp-client-url=${LOCAL_ROUTER_ORIGIN}`,
@@ -17,6 +17,7 @@ import {
17
17
  normalizeFactoryDroidReasoningEffort,
18
18
  resolveFactoryDroidRouterModelRef
19
19
  } from "../shared/coding-tool-bindings.js";
20
+ import { LOCAL_RUNTIME_PROVIDER_TYPE } from "../runtime/local-models.js";
20
21
 
21
22
  const BACKUP_SUFFIX = ".llm_router_backup";
22
23
  const CODEX_PROVIDER_ID = "llm-router";
@@ -972,9 +973,11 @@ export async function patchClaudeCodeEffortLevel({
972
973
  const FACTORY_DROID_ROUTER_MARKER = "_llmRouterManaged";
973
974
  const FACTORY_DROID_OPENAI_PROVIDER = "openai";
974
975
  const FACTORY_DROID_ANTHROPIC_PROVIDER = "anthropic";
976
+ const FACTORY_DROID_GENERIC_CHAT_COMPLETIONS_PROVIDER = "generic-chat-completion-api";
975
977
  const FACTORY_DROID_ROUTER_PROVIDERS = Object.freeze([
976
978
  FACTORY_DROID_OPENAI_PROVIDER,
977
- FACTORY_DROID_ANTHROPIC_PROVIDER
979
+ FACTORY_DROID_ANTHROPIC_PROVIDER,
980
+ FACTORY_DROID_GENERIC_CHAT_COMPLETIONS_PROVIDER
978
981
  ]);
979
982
 
980
983
  function dedupeStrings(values = []) {
@@ -1116,6 +1119,17 @@ function resolveFactoryDroidRouteFormat(modelRef, config = {}, seen = new Set())
1116
1119
  }
1117
1120
 
1118
1121
  function resolveFactoryDroidCustomModelProvider(modelRef, config = {}) {
1122
+ const normalizedModelRef = String(modelRef || "").trim();
1123
+ if (normalizedModelRef.includes("/")) {
1124
+ const separatorIndex = normalizedModelRef.indexOf("/");
1125
+ const providerId = normalizedModelRef.slice(0, separatorIndex).trim();
1126
+ const provider = (Array.isArray(config?.providers) ? config.providers : [])
1127
+ .find((entry) => String(entry?.id || "").trim() === providerId);
1128
+ if (String(provider?.type || "").trim().toLowerCase() === LOCAL_RUNTIME_PROVIDER_TYPE) {
1129
+ return FACTORY_DROID_GENERIC_CHAT_COMPLETIONS_PROVIDER;
1130
+ }
1131
+ }
1132
+
1119
1133
  return mapFactoryDroidFormatToProvider(resolveFactoryDroidRouteFormat(modelRef, config))
1120
1134
  || FACTORY_DROID_OPENAI_PROVIDER;
1121
1135
  }
@@ -1,6 +1,5 @@
1
1
  import path from "node:path";
2
2
  import { promises as fs } from "node:fs";
3
- import { estimateLlamacppRuntimeBytes } from "./llamacpp-runtime-profile.js";
4
3
 
5
4
  const HUGGING_FACE_API_URL = "https://huggingface.co/api/models";
6
5
  const HUGGING_FACE_BASE_URL = "https://huggingface.co";
@@ -155,13 +154,6 @@ export function shapeHuggingFaceGgufResults(files, systemInfo = {}) {
155
154
  expectedContextWindow: systemInfo?.expectedContextWindow
156
155
  }, systemInfo);
157
156
  const quantization = parseQuantizationFromFileName(file);
158
- const estimatedRuntimeBytes = sizeBytes
159
- ? estimateLlamacppRuntimeBytes({
160
- sizeBytes,
161
- contextWindow: systemInfo?.expectedContextWindow,
162
- preset: status.fit === "tight" ? "memory-safe" : "balanced"
163
- })
164
- : undefined;
165
157
  const fitScore = status.fit === "safe" ? 30 : status.fit === "tight" ? 15 : status.fit === "unknown" ? 8 : -20;
166
158
  const rankingScore = fitScore
167
159
  + (status.disabled ? -100 : 0)
@@ -174,10 +166,6 @@ export function shapeHuggingFaceGgufResults(files, systemInfo = {}) {
174
166
  file,
175
167
  quantization,
176
168
  sizeBytes,
177
- estimatedRuntimeBytes,
178
- memoryLabel: estimatedRuntimeBytes
179
- ? `${(estimatedRuntimeBytes / (1024 ** 3)).toFixed(1)} GB runtime est.`
180
- : "Runtime estimate unavailable",
181
169
  disabled: status.disabled,
182
170
  disabledReason: status.reason,
183
171
  fit: status.fit,