@khanglvm/llm-router 2.6.0 → 2.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +0 -13
- package/README.md +0 -3
- package/package.json +1 -1
- package/src/cli/router-module.js +284 -2
- package/src/node/coding-tool-config.js +15 -1
- package/src/node/huggingface-gguf.js +0 -12
- package/src/node/llamacpp-runtime.js +78 -256
- package/src/node/local-models-service.js +2 -25
- package/src/node/local-server.js +2 -60
- package/src/node/provider-probe.js +18 -0
- package/src/node/quota-probe-mapping.js +215 -0
- package/src/node/quota-probe-runner.js +234 -0
- package/src/node/web-console-client.js +33 -27
- package/src/node/web-console-server.js +107 -64
- package/src/node/web-console-styles.generated.js +1 -1
- package/src/node/web-console-ui/api-client.js +27 -0
- package/src/node/web-console-ui/local-models-utils.js +0 -33
- package/src/runtime/balancer.js +47 -4
- package/src/runtime/config.js +9 -4
- package/src/runtime/handler/fallback.js +7 -0
- package/src/runtime/handler/provider-call.js +18 -36
- package/src/runtime/handler/runtime-policy.js +1 -4
- package/src/runtime/local-models.js +0 -36
- package/src/runtime/quota-probe.js +179 -0
- package/src/translator/request/claude-to-openai.js +28 -0
- package/src/node/llamacpp-managed-runtime.js +0 -202
- package/src/node/llamacpp-runtime-profile.js +0 -133
package/CHANGELOG.md
CHANGED
|
@@ -7,19 +7,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
-
## [2.6.0] - 2026-04-23
|
|
11
|
-
|
|
12
|
-
### Added
|
|
13
|
-
- Local `llama.cpp` variants can now persist a per-model runtime profile, including auto-tuned presets and custom launch overrides, so each GGUF variant can run with settings that match its own size and context shape instead of sharing one global `llama-server` startup profile.
|
|
14
|
-
- The Web UI now exposes managed `llama.cpp` runtime health for Local Models, including tracked instance counts, healthy/stale summaries, and persisted runtime-profile data for each saved variant.
|
|
15
|
-
|
|
16
|
-
### Changed
|
|
17
|
-
- Local variant requests are now resolved through a managed per-variant `llama.cpp` runtime layer that can reuse compatible instances, allocate fallback ports safely, and start the right runtime configuration for the specific model variant without exposing multi-process lifecycle management to the user.
|
|
18
|
-
- Hugging Face GGUF search/download flows now surface file size plus estimated runtime memory guidance directly in the Local Models workflow, making it easier to choose a viable quantization before download.
|
|
19
|
-
|
|
20
|
-
### Fixed
|
|
21
|
-
- Managed `llama.cpp` runtimes now reconcile stale tracked instances before reuse, avoid reserving dead immediate-exit servers, and drain pending shutdown/startup edges more reliably so local per-model routing does not leave behind stale `llama-server` processes.
|
|
22
|
-
|
|
23
10
|
## [2.5.2] - 2026-04-23
|
|
24
11
|
|
|
25
12
|
### Fixed
|
package/README.md
CHANGED
|
@@ -44,9 +44,6 @@ Open `llr` and use the **Local Models** tab to manage local inference sources al
|
|
|
44
44
|
- **Native macOS browsing** — use the built-in file picker to choose a single GGUF file, scan a folder recursively for GGUF models, or browse directly to a local `llama-server` binary
|
|
45
45
|
- **Managed + attached model library** — stale or moved files stay visible instead of crashing the app, and can be repaired by locating the file again or removed cleanly
|
|
46
46
|
- **Router-visible local variants** — create friendly model variants with bounded presets, context-window metadata, preload toggles, and Mac unified-memory fit guidance with clearer safe/tight recommendations
|
|
47
|
-
- **Per-variant llama.cpp tuning** — each local variant can store its own runtime profile so balanced, throughput, long-context, low-memory, or custom launch overrides do not fight over one shared global `llama-server` config
|
|
48
|
-
- **Managed per-model runtimes** — the router automatically starts, reuses, and stops the right `llama.cpp` instance for the requested local variant, with stale-runtime cleanup handled internally instead of asking the user to manage separate servers
|
|
49
|
-
- **GGUF size + memory guidance** — Hugging Face search results now show model file size plus estimated runtime memory fit guidance before download, helping choose viable quantizations faster
|
|
50
47
|
- **Alias-ready local routing** — once saved, local variants behave like normal router models and can be used in aliases, capability flags, and fallback chains
|
|
51
48
|
|
|
52
49
|
For v1, the managed download flow only searches public Hugging Face GGUF files and the fit guidance is tuned for Macs with unified memory.
|
package/package.json
CHANGED
package/src/cli/router-module.js
CHANGED
|
@@ -66,6 +66,7 @@ import {
|
|
|
66
66
|
sanitizeConfigForDisplay,
|
|
67
67
|
validateRuntimeConfig
|
|
68
68
|
} from "../runtime/config.js";
|
|
69
|
+
import { normalizeQuotaProbeConfig } from "../runtime/quota-probe.js";
|
|
69
70
|
import {
|
|
70
71
|
CODEX_SUBSCRIPTION_MODELS,
|
|
71
72
|
CLAUDE_CODE_SUBSCRIPTION_MODELS
|
|
@@ -8041,6 +8042,234 @@ async function doSetProviderRateLimits(context) {
|
|
|
8041
8042
|
};
|
|
8042
8043
|
}
|
|
8043
8044
|
|
|
8045
|
+
/**
 * Parse the raw `--probe-headers` CLI value into a list of header entries.
 *
 * Accepts either a JSON array of `{key, value}` objects or a JSON object of
 * `{headerName: headerValue}` pairs. Both shapes are normalised to the same
 * output so malformed array entries (non-objects, entries without a key) do
 * not leak into the saved config.
 *
 * @param {unknown} raw - Raw CLI value (normally a JSON string).
 * @returns {Array<{key: string, value: string}>|undefined} Normalised header
 *   entries, or `undefined` when the input is empty, is not valid JSON, or is
 *   valid JSON that is neither an array nor an object.
 */
function parseProbeHeaders(raw) {
  if (!raw) return undefined;
  const str = String(raw).trim();
  if (!str) return undefined;

  let parsed;
  try {
    parsed = JSON.parse(str);
  } catch {
    // Not JSON: the caller decides how to report an unusable value.
    return undefined;
  }
  if (typeof parsed !== "object" || parsed === null) return undefined;

  const pairs = Array.isArray(parsed)
    ? parsed
        .filter((entry) => typeof entry === "object" && entry !== null && !Array.isArray(entry))
        .map((entry) => [entry.key, entry.value])
    : Object.entries(parsed);

  return pairs
    .map(([key, value]) => ({
      key: typeof key === "string" ? key.trim() : "",
      // A null/undefined value must not become the literal header "null"/"undefined".
      value: value === null || value === undefined ? "" : String(value)
    }))
    .filter((entry) => entry.key !== "");
}
|
|
8060
|
+
|
|
8061
|
+
/**
 * Parse the raw `--probe-mapping` CLI value into a mapping object.
 *
 * Only a plain JSON object is accepted; JSON primitives and arrays are
 * rejected so the caller's "must be valid JSON" check also catches values
 * that parse but cannot be a field mapping.
 *
 * @param {unknown} raw - Raw CLI value (normally a JSON string).
 * @returns {object|undefined} The parsed mapping object, or `undefined` when
 *   the input is empty, unparseable, or not a plain object.
 */
function parseProbeMapping(raw) {
  if (!raw) return undefined;
  const str = String(raw).trim();
  if (!str) return undefined;
  try {
    const parsed = JSON.parse(str);
    const isPlainObject = typeof parsed === "object" && parsed !== null && !Array.isArray(parsed);
    return isPlainObject ? parsed : undefined;
  } catch {
    return undefined;
  }
}
|
|
8071
|
+
|
|
8072
|
+
/**
 * Build one `{path, as}` mapping entry from a CLI-supplied response path.
 *
 * @param {unknown} pathStr - Path into the provider response (e.g. `$.data.used`).
 * @param {string} [coerceAs] - Coercion type; falls back to `"number"` when omitted.
 * @returns {{path: string, as: string}|undefined} The entry, or `undefined`
 *   when no usable path was given (missing, empty, or whitespace only).
 */
function buildMappingFieldEntry(pathStr, coerceAs) {
  if (!pathStr) return undefined;
  // Trim before the emptiness check so "   " is not stored as an empty path.
  const path = String(pathStr).trim();
  if (!path) return undefined;
  return { path, as: coerceAs || "number" };
}
|
|
8076
|
+
|
|
8077
|
+
/**
 * Return a copy of `config` in which the given provider's `quotaProbe` is
 * replaced by `quotaProbe`.
 *
 * The input config is never mutated; all edits happen on a structured clone.
 *
 * @param {object} config - Current router config.
 * @param {{providerId: string, quotaProbe: object|null}} options - Target
 *   provider id and the probe value to store on it.
 * @returns {{config: object, changed: boolean, reason: string, providerId?: string, quotaProbe?: object|null}}
 *   On success: the updated clone, whether the stored probe differs from the
 *   previous one, and an empty `reason`. On failure: `changed: false` plus a
 *   human-readable `reason`.
 */
export function setProviderQuotaProbeInConfig(config, { providerId, quotaProbe }) {
  const draft = structuredClone(config);

  const targetId = String(providerId || "").trim();
  if (targetId === "") {
    return { config: draft, changed: false, reason: "provider-id is required." };
  }

  const providers = draft.providers || [];
  const target = providers.find((entry) => entry.id === targetId);
  if (!target) {
    return { config: draft, changed: false, reason: `Provider '${targetId}' not found.` };
  }

  // Remember the old probe so the caller can tell a real change from a no-op.
  const before = target.quotaProbe || null;
  target.quotaProbe = quotaProbe;

  // Only errors introduced by this edit count; the untouched original config
  // is handed back when the new probe makes the config invalid.
  const introduced = findIntroducedConfigValidationErrors(config, draft);
  if (introduced.length > 0) {
    return { config, changed: false, reason: formatConfigValidationError(introduced) };
  }

  const changed = serializeStable(before) !== serializeStable(target.quotaProbe);
  return {
    config: draft,
    changed,
    reason: "",
    providerId: targetId,
    quotaProbe: target.quotaProbe
  };
}
|
|
8101
|
+
|
|
8102
|
+
/**
 * Render a multi-line, human-readable summary of a provider's quota probe.
 *
 * The report lists header names only, never header values, and reports the
 * custom source by length only. Partially filled probes are tolerated: a
 * missing safety-margin field prints as 0, a missing custom source prints as
 * 0 chars, and header entries without a key are skipped instead of throwing.
 *
 * @param {string} providerId - Provider id shown in the report heading.
 * @param {object|null|undefined} probe - Quota probe config; falsy means disabled.
 * @returns {string} Report text with lines joined by "\n".
 */
function buildQuotaProbeReport(providerId, probe) {
  if (!probe) return `Provider '${providerId}': quota probe disabled.`;

  const lines = [
    `Provider '${providerId}': quota probe configured.`,
    ` enabled: ${probe.enabled}`,
    ` mode: ${probe.mode}`,
    ` capKind: ${probe.capKind}`,
    ` combinator: ${probe.combinator}`,
    ` enforce: ${probe.enforce}`
  ];

  if (probe.safetyMargin) {
    const dollars = probe.safetyMargin.dollars ?? 0;
    const percent = probe.safetyMargin.percent ?? 0;
    lines.push(` margin: $${dollars} or ${percent}%`);
  }

  if (probe.mode === "http" && probe.http) {
    lines.push(` url: ${probe.http.method} ${probe.http.url}`);
    lines.push(` timeout: ${probe.http.timeoutMs}ms`);
    const headerKeys = (Array.isArray(probe.http.headers) ? probe.http.headers : [])
      .map((header) => header?.key)
      .filter((key) => key !== undefined && key !== null);
    if (headerKeys.length > 0) {
      lines.push(` headers: ${headerKeys.join(", ")}`);
    }
  }

  if (probe.mode === "custom" && probe.custom) {
    lines.push(` timeout: ${probe.custom.timeoutMs}ms`);
    lines.push(` source: ${String(probe.custom.source ?? "").length} chars`);
  }

  return lines.join("\n");
}
|
|
8126
|
+
|
|
8127
|
+
/**
 * CLI handler for `set-quota-probe` / `set-provider-quota-probe`.
 *
 * Three input styles are handled:
 *  1. `--disable-quota-probe=true` stores a null probe on the provider.
 *  2. `--quota-probe-json` supplies the whole probe object (defaults
 *     `enabled` to true when the key is absent).
 *  3. Individual flags, each falling back to the provider's existing probe
 *     value when the flag is not given.
 *
 * The assembled probe must pass `normalizeQuotaProbeConfig`; the
 * un-normalised object is what gets stored, and the normalised form is used
 * for the printed report.
 *
 * @param {object} context - Action context; reads `context.args` and `context.mode`.
 * @returns {Promise<object>} Result object with `ok`, `mode`, `exitCode`, and
 *   either `data` (report text) or `errorMessage`.
 */
async function doSetQuotaProbe(context) {
  const args = context.args || {};
  const fail = (errorMessage) => ({ ok: false, mode: context.mode, exitCode: EXIT_VALIDATION, errorMessage });
  const succeed = (data) => ({ ok: true, mode: context.mode, exitCode: EXIT_SUCCESS, data });

  const configPath = readArg(args, ["config", "configPath"], getDefaultConfigPath());
  const config = await readConfigFile(configPath);
  const providerId = String(readArg(args, ["provider-id", "providerId"], "") || "").trim();

  if (!providerId) {
    return fail("provider-id is required.");
  }

  // Tolerate a config without a providers array, matching setProviderQuotaProbeInConfig.
  const provider = (config.providers || []).find((item) => item.id === providerId);
  if (!provider) {
    return fail(`Provider '${providerId}' not found.`);
  }

  const disableProbe = toBoolean(readArg(args, ["disable-quota-probe", "disableQuotaProbe"], false), false);
  if (disableProbe) {
    const result = setProviderQuotaProbeInConfig(config, { providerId, quotaProbe: null });
    if (!result.changed && result.reason) {
      return fail(result.reason);
    }
    await writeConfigFile(result.config, configPath);
    return succeed(`Provider '${providerId}': quota probe disabled.`);
  }

  const quotaProbeJsonRaw = readArg(args, ["quota-probe-json", "quotaProbeJson"], undefined);
  let probeConfig;

  if (quotaProbeJsonRaw) {
    try {
      probeConfig = JSON.parse(String(quotaProbeJsonRaw));
    } catch {
      return fail("quota-probe-json must be valid JSON.");
    }
    if (typeof probeConfig !== "object" || probeConfig === null) {
      return fail("quota-probe-json must be a JSON object.");
    }
    if (!("enabled" in probeConfig)) probeConfig.enabled = true;
  } else {
    const existing = provider.quotaProbe || {};
    const mode = String(readArg(args, ["probe-mode", "probeMode"], existing.mode || "http") || "http").trim();
    const capKind = String(readArg(args, ["cap-kind", "capKind"], existing.capKind || "") || "").trim();
    const combinator = String(readArg(args, ["combinator"], existing.combinator || "AND") || "AND").trim().toUpperCase();
    const enforce = String(readArg(args, ["enforce"], existing.enforce || "gate") || "gate").trim();
    const marginDollars = toNumber(readArg(args, ["safety-margin-dollars", "safetyMarginDollars"], existing.safetyMargin?.dollars), 0);
    const marginPercent = toNumber(readArg(args, ["safety-margin-percent", "safetyMarginPercent"], existing.safetyMargin?.percent), 0);

    if (!capKind) {
      return fail("cap-kind is required (dollars | tokens | requests).");
    }

    probeConfig = {
      enabled: true,
      capKind,
      combinator,
      enforce,
      mode,
      safetyMargin: { dollars: marginDollars, percent: marginPercent }
    };

    if (mode === "http") {
      const existingHttp = existing.http || {};
      const url = String(readArg(args, ["probe-url", "probeUrl"], existingHttp.url || "") || "").trim();
      const method = String(readArg(args, ["probe-method", "probeMethod"], existingHttp.method || "GET") || "GET").trim().toUpperCase();
      const timeoutMs = toNumber(readArg(args, ["probe-timeout", "probeTimeout"], existingHttp.timeoutMs), undefined);
      const headersRaw = readArg(args, ["probe-headers", "probeHeaders"], undefined);
      const bodyRaw = readArg(args, ["probe-body", "probeBody"], undefined);

      if (!url) {
        return fail("probe-url is required for HTTP mode.");
      }

      // An explicitly supplied but unusable header value is an error; silently
      // keeping the previous headers would hide the mistake from the user.
      const parsedHeaders = parseProbeHeaders(headersRaw);
      const headersProvided = headersRaw !== undefined && headersRaw !== null && String(headersRaw).trim() !== "";
      if (headersProvided && parsedHeaders === undefined) {
        return fail("probe-headers must be a JSON array of {key,value} objects or a JSON object.");
      }
      const headers = parsedHeaders ?? existingHttp.headers ?? [];
      const body = bodyRaw !== undefined ? String(bodyRaw) : existingHttp.body;

      const mappingJsonRaw = readArg(args, ["probe-mapping", "probeMapping"], undefined);
      let mapping;
      if (mappingJsonRaw) {
        mapping = parseProbeMapping(mappingJsonRaw);
        if (!mapping) {
          return fail("probe-mapping must be valid JSON.");
        }
      } else {
        // Individual --probe-mapping-* flags overlay the existing mapping.
        const existingMapping = existingHttp.mapping || {};
        const usedPath = readArg(args, ["probe-mapping-used", "probeMappingUsed"], undefined);
        const limitPath = readArg(args, ["probe-mapping-limit", "probeMappingLimit"], undefined);
        const remainingPath = readArg(args, ["probe-mapping-remaining", "probeMappingRemaining"], undefined);
        const resetAtPath = readArg(args, ["probe-mapping-reset-at", "probeMappingResetAt"], undefined);
        const isUnlimitedPath = readArg(args, ["probe-mapping-is-unlimited", "probeMappingIsUnlimited"], undefined);
        mapping = { ...existingMapping };
        if (usedPath) mapping.used = buildMappingFieldEntry(usedPath, "number");
        if (limitPath) mapping.limit = buildMappingFieldEntry(limitPath, "number");
        if (remainingPath) mapping.remaining = buildMappingFieldEntry(remainingPath, "number");
        if (resetAtPath) mapping.resetAt = buildMappingFieldEntry(resetAtPath, "datetime");
        if (isUnlimitedPath) mapping.isUnlimited = buildMappingFieldEntry(isUnlimitedPath, "boolean");
      }

      probeConfig.http = { method, url, headers, timeoutMs, mapping };
      if (body !== undefined) probeConfig.http.body = body;
    } else if (mode === "custom") {
      const existingCustom = existing.custom || {};
      const source = readArg(args, ["custom-source", "customSource"], existingCustom.source || "");
      const timeoutMs = toNumber(readArg(args, ["probe-timeout", "probeTimeout"], existingCustom.timeoutMs), undefined);
      if (!source) {
        return fail("custom-source is required for custom mode.");
      }
      probeConfig.custom = { source: String(source), timeoutMs };
    }

    const refreshOnUiOpen = toBoolean(readArg(args, ["refresh-on-ui-open", "refreshOnUiOpen"], undefined), undefined);
    const refreshOnResetAt = toBoolean(readArg(args, ["refresh-on-reset-at", "refreshOnResetAt"], undefined), undefined);
    const refreshOnErrorRaw = readArg(args, ["refresh-on-upstream-error", "refreshOnUpstreamError"], undefined);

    if (refreshOnUiOpen !== undefined || refreshOnResetAt !== undefined || refreshOnErrorRaw !== undefined) {
      const existingTriggers = existing.refreshTriggers || {};
      probeConfig.refreshTriggers = {
        onUiOpen: refreshOnUiOpen !== undefined ? refreshOnUiOpen : !!existingTriggers.onUiOpen,
        onManual: true,
        onResetAt: refreshOnResetAt !== undefined ? refreshOnResetAt : !!existingTriggers.onResetAt,
        onUpstreamError: null
      };
      if (refreshOnErrorRaw) {
        // Skip empty segments first: Number("") is 0, so "429," would
        // otherwise register a bogus status code 0.
        const codes = String(refreshOnErrorRaw)
          .split(",")
          .map((segment) => segment.trim())
          .filter((segment) => segment !== "")
          .map((segment) => Number(segment))
          .filter((code) => Number.isFinite(code));
        if (codes.length > 0) probeConfig.refreshTriggers.onUpstreamError = { statusCodes: codes };
      } else if (existingTriggers.onUpstreamError) {
        probeConfig.refreshTriggers.onUpstreamError = existingTriggers.onUpstreamError;
      }
    } else if (existing.refreshTriggers) {
      // No refresh flag given: keep the stored triggers instead of dropping
      // them when an unrelated field is updated.
      probeConfig.refreshTriggers = existing.refreshTriggers;
    }
  }

  const normalized = normalizeQuotaProbeConfig(probeConfig);
  if (!normalized) {
    return fail("Invalid quota probe config. Ensure enabled=true and capKind is one of: dollars, tokens, requests.");
  }

  const result = setProviderQuotaProbeInConfig(config, { providerId, quotaProbe: probeConfig });
  if (!result.changed && result.reason) {
    return fail(result.reason);
  }
  if (!result.changed) {
    return succeed(`${buildQuotaProbeReport(providerId, normalized)}\n(no changes)`);
  }
  await writeConfigFile(result.config, configPath);
  return succeed(buildQuotaProbeReport(providerId, normalized));
}
|
|
8272
|
+
|
|
8044
8273
|
async function doSetMasterKey(context) {
|
|
8045
8274
|
const args = context.args || {};
|
|
8046
8275
|
const configPath = readArg(args, ["config", "configPath"], getDefaultConfigPath());
|
|
@@ -8554,6 +8783,9 @@ async function runConfigAction(context) {
|
|
|
8554
8783
|
case "set-model-fallbacks":
|
|
8555
8784
|
case "set-model-fallback":
|
|
8556
8785
|
return doSetModelFallbacks(context);
|
|
8786
|
+
case "set-quota-probe":
|
|
8787
|
+
case "set-provider-quota-probe":
|
|
8788
|
+
return doSetQuotaProbe(context);
|
|
8557
8789
|
case "set-master-key":
|
|
8558
8790
|
return doSetMasterKey(context);
|
|
8559
8791
|
case "set-amp-config":
|
|
@@ -10575,7 +10807,7 @@ const routerModule = {
|
|
|
10575
10807
|
},
|
|
10576
10808
|
{
|
|
10577
10809
|
actionId: "config",
|
|
10578
|
-
description: "Config manager for providers, diagnostics, coding-tool routing, AMP, and startup service.",
|
|
10810
|
+
description: "Config manager for providers, diagnostics, coding-tool routing, AMP, quota probes, and startup service.",
|
|
10579
10811
|
tui: { steps: ["cli-only"] },
|
|
10580
10812
|
commandline: {
|
|
10581
10813
|
requiredArgs: [],
|
|
@@ -10610,6 +10842,29 @@ const routerModule = {
|
|
|
10610
10842
|
"rate-limits",
|
|
10611
10843
|
"remove-bucket",
|
|
10612
10844
|
"replace-rate-limits",
|
|
10845
|
+
"probe-mode",
|
|
10846
|
+
"probe-url",
|
|
10847
|
+
"probe-method",
|
|
10848
|
+
"probe-headers",
|
|
10849
|
+
"probe-body",
|
|
10850
|
+
"probe-timeout",
|
|
10851
|
+
"probe-mapping",
|
|
10852
|
+
"probe-mapping-used",
|
|
10853
|
+
"probe-mapping-limit",
|
|
10854
|
+
"probe-mapping-remaining",
|
|
10855
|
+
"probe-mapping-reset-at",
|
|
10856
|
+
"probe-mapping-is-unlimited",
|
|
10857
|
+
"cap-kind",
|
|
10858
|
+
"combinator",
|
|
10859
|
+
"enforce",
|
|
10860
|
+
"safety-margin-dollars",
|
|
10861
|
+
"safety-margin-percent",
|
|
10862
|
+
"custom-source",
|
|
10863
|
+
"quota-probe-json",
|
|
10864
|
+
"disable-quota-probe",
|
|
10865
|
+
"refresh-on-ui-open",
|
|
10866
|
+
"refresh-on-reset-at",
|
|
10867
|
+
"refresh-on-upstream-error",
|
|
10613
10868
|
"alias-id",
|
|
10614
10869
|
"alias",
|
|
10615
10870
|
"targets",
|
|
@@ -10681,7 +10936,7 @@ const routerModule = {
|
|
|
10681
10936
|
]
|
|
10682
10937
|
},
|
|
10683
10938
|
help: {
|
|
10684
|
-
summary: `Manage providers, diagnostics, config validation, coding-tool routing, model aliases, rate-limit buckets, AMP proxy settings, master key, and OS startup. \`${CLI_COMMAND} config\` opens the web console by default; use \`--operation\` for direct CLI actions.`,
|
|
10939
|
+
summary: `Manage providers, diagnostics, config validation, coding-tool routing, model aliases, rate-limit buckets, quota probes (external provider budget monitoring), AMP proxy settings, master key, and OS startup. \`${CLI_COMMAND} config\` opens the web console by default; use \`--operation\` for direct CLI actions.`,
|
|
10685
10940
|
args: [
|
|
10686
10941
|
{ name: "operation", required: false, description: "Config operation (optional; defaults to a config summary when omitted in direct CLI mode).", example: "--operation=upsert-provider" },
|
|
10687
10942
|
{ name: "provider-id", required: false, description: "Provider id (lowercase letters/numbers/dashes).", example: "--provider-id=openrouter-primary" },
|
|
@@ -10718,6 +10973,29 @@ const routerModule = {
|
|
|
10718
10973
|
{ name: "remove-bucket", required: false, description: "Remove bucket by --bucket-id in set-provider-rate-limits.", example: "--remove-bucket=true" },
|
|
10719
10974
|
{ name: "replace-rate-limits", required: false, description: "Replace all provider buckets with provided entries.", example: "--replace-rate-limits=true" },
|
|
10720
10975
|
{ name: "rate-limits", required: false, description: "Rate-limit bucket JSON object/array for bulk update.", example: "--rate-limits='[{\"id\":\"or-month\",\"models\":[\"all\"],\"requests\":20000,\"window\":{\"unit\":\"month\",\"size\":1}}]'" },
|
|
10976
|
+
{ name: "probe-mode", required: false, description: "For set-quota-probe: probe execution mode. 'http' sends an HTTP request to a provider quota/usage endpoint and maps the JSON response to a normalized snapshot. 'custom' runs a sandboxed JS function. Default: http.", example: "--probe-mode=http" },
|
|
10977
|
+
{ name: "probe-url", required: false, description: "For set-quota-probe (HTTP mode): the full URL of the provider's quota/usage/subscription API endpoint that returns JSON with usage data. Supports {{providerApiKey}}, {{providerBaseUrl}}, {{providerId}}, and {{env.VAR_NAME}} shortcodes for secret interpolation.", example: "--probe-url=https://ramclouds.me/api/subscription/self" },
|
|
10978
|
+
{ name: "probe-method", required: false, description: "For set-quota-probe (HTTP mode): HTTP method. Default: GET.", example: "--probe-method=GET" },
|
|
10979
|
+
{ name: "probe-headers", required: false, description: "For set-quota-probe (HTTP mode): request headers as a JSON array of {key,value} objects or a JSON object {key:value}. Use {{providerApiKey}} to interpolate the provider's API key, or {{env.VAR_NAME}} to interpolate environment variables for secrets that differ from the provider API key (e.g. a separate system token).", example: "--probe-headers='[{\"key\":\"Authorization\",\"value\":\"Bearer {{env.RC_TOKEN}}\"},{\"key\":\"New-Api-User\",\"value\":\"{{env.RC_USER}}\"}]'" },
|
|
10980
|
+
{ name: "probe-body", required: false, description: "For set-quota-probe (HTTP mode, POST only): request body string. Supports the same {{shortcode}} interpolation as probe-headers.", example: "--probe-body='{\"action\":\"get_usage\"}'" },
|
|
10981
|
+
{ name: "probe-timeout", required: false, description: "For set-quota-probe: request timeout in milliseconds. HTTP mode default: 5000 (max 15000). Custom mode default: 2000 (max 10000).", example: "--probe-timeout=10000" },
|
|
10982
|
+
{ name: "probe-mapping", required: false, description: "For set-quota-probe (HTTP mode): full JSON mapping object that maps provider API response JSON paths to normalized snapshot fields. Each field has {path, as} where 'path' is a dot-path like '$.data.used_quota' and 'as' is the coercion type. Coercion types: 'number' (numeric), 'dollars-from-cents' (divides by 100), 'boolean', 'datetime' (ISO-8601/epoch/duration), 'raw'. Alternative to individual --probe-mapping-* flags.", example: "--probe-mapping='{\"used\":{\"path\":\"$.data.used\",\"as\":\"number\"},\"limit\":{\"path\":\"$.data.limit\",\"as\":\"number\"}}'" },
|
|
10983
|
+
{ name: "probe-mapping-used", required: false, description: "For set-quota-probe (HTTP mode): JSON dot-path in the provider API response to the 'used' quota value (how much has been consumed). Coerced as number. At least 2 of {used, limit, remaining} are required; the third is auto-derived.", example: "--probe-mapping-used=$.data.used_quota" },
|
|
10984
|
+
{ name: "probe-mapping-limit", required: false, description: "For set-quota-probe (HTTP mode): JSON dot-path to the 'limit' value (total quota cap). Coerced as number.", example: "--probe-mapping-limit=$.data.quota_limit" },
|
|
10985
|
+
{ name: "probe-mapping-remaining", required: false, description: "For set-quota-probe (HTTP mode): JSON dot-path to the 'remaining' value (quota left). Coerced as number.", example: "--probe-mapping-remaining=$.data.remaining_quota" },
|
|
10986
|
+
{ name: "probe-mapping-reset-at", required: false, description: "For set-quota-probe (HTTP mode): JSON dot-path to the reset timestamp (when quota resets). Coerced as datetime (auto-detects ISO-8601, epoch seconds, epoch milliseconds, or duration strings like '2h' or 'PT30M').", example: "--probe-mapping-reset-at=$.data.reset_at" },
|
|
10987
|
+
{ name: "probe-mapping-is-unlimited", required: false, description: "For set-quota-probe (HTTP mode): JSON dot-path to a boolean indicating unlimited quota. When true, the probe always reports 'available' regardless of used/limit values.", example: "--probe-mapping-is-unlimited=$.data.is_unlimited" },
|
|
10988
|
+
{ name: "cap-kind", required: false, description: "For set-quota-probe: the unit of the quota cap reported by the provider. Determines how used/limit/remaining values are interpreted. Values: 'dollars' (monetary budget), 'tokens' (token count), 'requests' (request count).", example: "--cap-kind=dollars" },
|
|
10989
|
+
{ name: "combinator", required: false, description: "For set-quota-probe: how the quota probe verdict combines with local rate-limit verdict to decide if a provider is eligible. 'AND' = both must pass (default, safest). 'OR' = either can pass (lenient). 'REPLACE' = probe verdict replaces rate-limit entirely.", example: "--combinator=AND" },
|
|
10990
|
+
{ name: "enforce", required: false, description: "For set-quota-probe: enforcement mode. 'gate' = blocks routing when quota exhausted (production use). 'observe' = logs verdict but never blocks (dry-run/testing). Default: gate.", example: "--enforce=gate" },
|
|
10991
|
+
{ name: "safety-margin-dollars", required: false, description: "For set-quota-probe: dollar-based safety margin. Provider is considered exhausted when remaining ≤ this value. Applied as max(dollars, limit×percent/100). Default: 0.", example: "--safety-margin-dollars=1" },
|
|
10992
|
+
{ name: "safety-margin-percent", required: false, description: "For set-quota-probe: percentage-based safety margin. Provider is considered exhausted when remaining ≤ limit×percent/100. Applied as max(dollars, limit×percent/100). Default: 0.", example: "--safety-margin-percent=2" },
|
|
10993
|
+
{ name: "custom-source", required: false, description: "For set-quota-probe (custom mode): JavaScript async function source that runs in a sandboxed VM. Receives ctx object with {fetch, providerApiKey, providerBaseUrl, providerId}. Must return {capKind, used, limit} or {capKind, remaining, limit}. No access to process, require, or globalThis.", example: "--custom-source='export default async function(ctx) { const r = await ctx.fetch(\"https://api.example.com/usage\", {headers:{\"Authorization\":\"Bearer \"+ctx.providerApiKey}}); const d = await r.json(); return {capKind:\"dollars\",used:d.used,limit:d.limit}; }'" },
|
|
10994
|
+
{ name: "quota-probe-json", required: false, description: "For set-quota-probe: provide the full quotaProbe config as a single JSON object. Overrides all other probe flags. Useful when the config is complex or pre-built. The object is written directly to the provider's quotaProbe field.", example: "--quota-probe-json='{\"enabled\":true,\"capKind\":\"dollars\",\"mode\":\"http\",\"combinator\":\"AND\",\"enforce\":\"gate\",\"http\":{\"method\":\"GET\",\"url\":\"https://example.com/api/usage\",\"headers\":[{\"key\":\"Authorization\",\"value\":\"Bearer {{providerApiKey}}\"}],\"mapping\":{\"used\":{\"path\":\"$.used\",\"as\":\"number\"},\"limit\":{\"path\":\"$.limit\",\"as\":\"number\"}}}}'" },
|
|
10995
|
+
{ name: "disable-quota-probe", required: false, description: "For set-quota-probe: set to true to disable and remove the quota probe config from the provider. The provider will no longer be gated by external quota checks.", example: "--disable-quota-probe=true" },
|
|
10996
|
+
{ name: "refresh-on-ui-open", required: false, description: "For set-quota-probe: auto-refresh the quota snapshot when the web console UI is opened. Default: false.", example: "--refresh-on-ui-open=true" },
|
|
10997
|
+
{ name: "refresh-on-reset-at", required: false, description: "For set-quota-probe: schedule an automatic refresh at the resetAt timestamp returned by the probe. Useful when the provider reports when the quota window rolls over. Default: false.", example: "--refresh-on-reset-at=true" },
|
|
10998
|
+
{ name: "refresh-on-upstream-error", required: false, description: "For set-quota-probe: comma-separated HTTP status codes from upstream provider errors that should trigger a quota probe refresh. Common: 429 (rate limited), 402 (payment required).", example: "--refresh-on-upstream-error=429,402" },
|
|
10721
10999
|
{ name: "format", required: false, description: "Manual format if probe is skipped.", example: "--format=openai" },
|
|
10722
11000
|
{ name: "headers", required: false, description: "Custom provider headers as JSON object (default User-Agent applied when omitted).", example: "--headers={\"User-Agent\":\"Mozilla/5.0\"}" },
|
|
10723
11001
|
{ name: "skip-probe", required: false, description: "Skip live endpoint/model probe.", example: "--skip-probe=true" },
|
|
@@ -10795,6 +11073,10 @@ const routerModule = {
|
|
|
10795
11073
|
`${CLI_COMMAND} config --operation=set-provider-rate-limits --provider-id=openrouter --bucket-id=openrouter-all-month --bucket-models=all --bucket-requests=20000 --bucket-window=month:1`,
|
|
10796
11074
|
`${CLI_COMMAND} config --operation=set-provider-rate-limits --provider-id=openrouter --bucket-name="6-hours cap" --bucket-models=all --bucket-requests=600 --bucket-window=hour:6`,
|
|
10797
11075
|
`${CLI_COMMAND} config --operation=migrate-config --target-version=2 --create-backup=true`,
|
|
11076
|
+
`${CLI_COMMAND} config --operation=set-quota-probe --provider-id=ramclouds --cap-kind=dollars --probe-url=https://ramclouds.me/api/subscription/self --probe-headers='[{"key":"Authorization","value":"Bearer {{env.RC_TOKEN}}"},{"key":"New-Api-User","value":"{{env.RC_USER}}"}]' --probe-mapping-used=$.data.used_quota --probe-mapping-limit=$.data.quota_limit --safety-margin-dollars=1 --combinator=AND --enforce=gate`,
|
|
11077
|
+
`${CLI_COMMAND} config --operation=set-quota-probe --provider-id=openrouter --cap-kind=dollars --probe-url=https://openrouter.ai/api/v1/auth/key --probe-headers='{"Authorization":"Bearer {{providerApiKey}}"}' --probe-mapping-used=$.data.usage --probe-mapping-limit=$.data.limit --refresh-on-upstream-error=429,402`,
|
|
11078
|
+
`${CLI_COMMAND} config --operation=set-quota-probe --provider-id=myapi --cap-kind=tokens --probe-url=https://api.example.com/usage --probe-method=GET --probe-headers='{"Authorization":"Bearer {{providerApiKey}}"}' --probe-mapping-remaining=$.remaining --probe-mapping-limit=$.total --enforce=observe`,
|
|
11079
|
+
`${CLI_COMMAND} config --operation=set-quota-probe --provider-id=ramclouds --disable-quota-probe=true`,
|
|
10798
11080
|
`${CLI_COMMAND} config --operation=set-model-fallbacks --provider-id=openrouter --model=gpt-4o --fallback-models=anthropic/claude-3-7-sonnet,openrouter/gpt-4.1-mini`,
|
|
10799
11081
|
`${CLI_COMMAND} config --operation=remove-model --provider-id=openrouter --model=gpt-4o`,
|
|
10800
11082
|
`${CLI_COMMAND} config --operation=set-amp-config --patch-amp-client-config=true --amp-client-settings-scope=workspace --amp-client-url=${LOCAL_ROUTER_ORIGIN}`,
|
|
@@ -17,6 +17,7 @@ import {
|
|
|
17
17
|
normalizeFactoryDroidReasoningEffort,
|
|
18
18
|
resolveFactoryDroidRouterModelRef
|
|
19
19
|
} from "../shared/coding-tool-bindings.js";
|
|
20
|
+
import { LOCAL_RUNTIME_PROVIDER_TYPE } from "../runtime/local-models.js";
|
|
20
21
|
|
|
21
22
|
const BACKUP_SUFFIX = ".llm_router_backup";
|
|
22
23
|
const CODEX_PROVIDER_ID = "llm-router";
|
|
@@ -972,9 +973,11 @@ export async function patchClaudeCodeEffortLevel({
|
|
|
972
973
|
const FACTORY_DROID_ROUTER_MARKER = "_llmRouterManaged";
|
|
973
974
|
const FACTORY_DROID_OPENAI_PROVIDER = "openai";
|
|
974
975
|
const FACTORY_DROID_ANTHROPIC_PROVIDER = "anthropic";
|
|
976
|
+
const FACTORY_DROID_GENERIC_CHAT_COMPLETIONS_PROVIDER = "generic-chat-completion-api";
|
|
975
977
|
const FACTORY_DROID_ROUTER_PROVIDERS = Object.freeze([
|
|
976
978
|
FACTORY_DROID_OPENAI_PROVIDER,
|
|
977
|
-
FACTORY_DROID_ANTHROPIC_PROVIDER
|
|
979
|
+
FACTORY_DROID_ANTHROPIC_PROVIDER,
|
|
980
|
+
FACTORY_DROID_GENERIC_CHAT_COMPLETIONS_PROVIDER
|
|
978
981
|
]);
|
|
979
982
|
|
|
980
983
|
function dedupeStrings(values = []) {
|
|
@@ -1116,6 +1119,17 @@ function resolveFactoryDroidRouteFormat(modelRef, config = {}, seen = new Set())
|
|
|
1116
1119
|
}
|
|
1117
1120
|
|
|
1118
1121
|
/**
 * Picks the Factory Droid `provider` value for a custom model entry.
 *
 * A `<providerId>/<model>` reference whose provider entry in
 * `config.providers` has the local runtime type is routed through the
 * generic chat-completions provider. Every other reference is mapped from
 * its resolved route format, falling back to the OpenAI provider when that
 * mapping yields a falsy value.
 *
 * @param {string} modelRef - Model reference; may be a bare name or `<providerId>/<model>`.
 * @param {object} [config={}] - Router config; only `config.providers` is read directly here.
 * @returns {string} The Factory Droid provider identifier to use for the model.
 */
function resolveFactoryDroidCustomModelProvider(modelRef, config = {}) {
  const normalizedModelRef = String(modelRef || "").trim();
  const separatorIndex = normalizedModelRef.indexOf("/");
  const providerId = separatorIndex > 0
    ? normalizedModelRef.slice(0, separatorIndex).trim()
    : "";

  // Only look up a provider when the prefix is non-empty: an empty prefix
  // (e.g. "/model") must not match provider entries that have no id.
  if (providerId) {
    const providers = Array.isArray(config?.providers) ? config.providers : [];
    const provider = providers.find((entry) => String(entry?.id || "").trim() === providerId);
    if (String(provider?.type || "").trim().toLowerCase() === LOCAL_RUNTIME_PROVIDER_TYPE) {
      return FACTORY_DROID_GENERIC_CHAT_COMPLETIONS_PROVIDER;
    }
  }

  // The original (untrimmed) reference is forwarded, as before.
  return mapFactoryDroidFormatToProvider(resolveFactoryDroidRouteFormat(modelRef, config))
    || FACTORY_DROID_OPENAI_PROVIDER;
}
|
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import path from "node:path";
|
|
2
2
|
import { promises as fs } from "node:fs";
|
|
3
|
-
import { estimateLlamacppRuntimeBytes } from "./llamacpp-runtime-profile.js";
|
|
4
3
|
|
|
5
4
|
const HUGGING_FACE_API_URL = "https://huggingface.co/api/models";
|
|
6
5
|
const HUGGING_FACE_BASE_URL = "https://huggingface.co";
|
|
@@ -155,13 +154,6 @@ export function shapeHuggingFaceGgufResults(files, systemInfo = {}) {
|
|
|
155
154
|
expectedContextWindow: systemInfo?.expectedContextWindow
|
|
156
155
|
}, systemInfo);
|
|
157
156
|
const quantization = parseQuantizationFromFileName(file);
|
|
158
|
-
const estimatedRuntimeBytes = sizeBytes
|
|
159
|
-
? estimateLlamacppRuntimeBytes({
|
|
160
|
-
sizeBytes,
|
|
161
|
-
contextWindow: systemInfo?.expectedContextWindow,
|
|
162
|
-
preset: status.fit === "tight" ? "memory-safe" : "balanced"
|
|
163
|
-
})
|
|
164
|
-
: undefined;
|
|
165
157
|
const fitScore = status.fit === "safe" ? 30 : status.fit === "tight" ? 15 : status.fit === "unknown" ? 8 : -20;
|
|
166
158
|
const rankingScore = fitScore
|
|
167
159
|
+ (status.disabled ? -100 : 0)
|
|
@@ -174,10 +166,6 @@ export function shapeHuggingFaceGgufResults(files, systemInfo = {}) {
|
|
|
174
166
|
file,
|
|
175
167
|
quantization,
|
|
176
168
|
sizeBytes,
|
|
177
|
-
estimatedRuntimeBytes,
|
|
178
|
-
memoryLabel: estimatedRuntimeBytes
|
|
179
|
-
? `${(estimatedRuntimeBytes / (1024 ** 3)).toFixed(1)} GB runtime est.`
|
|
180
|
-
: "Runtime estimate unavailable",
|
|
181
169
|
disabled: status.disabled,
|
|
182
170
|
disabledReason: status.reason,
|
|
183
171
|
fit: status.fit,
|