@khanglvm/llm-router 2.6.0 → 2.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -135,21 +135,6 @@ function queueLargeRequestEvent(onLargeRequestLog, payload) {
135
135
  }
136
136
  }
137
137
 
138
- async function resolveRequestProviderUrl(provider, plan, candidate, runtimeFlags) {
139
- if (provider?.type === "local-runtime" && typeof runtimeFlags?.resolveLocalRuntimeBaseUrl === "function") {
140
- const dynamicBaseUrl = await runtimeFlags.resolveLocalRuntimeBaseUrl({
141
- candidate,
142
- targetFormat: plan.targetFormat,
143
- requestKind: plan.requestKind
144
- });
145
- if (dynamicBaseUrl) {
146
- return resolveProviderUrl({ ...provider, baseUrl: dynamicBaseUrl }, plan.targetFormat, plan.requestKind);
147
- }
148
- }
149
-
150
- return resolveProviderUrl(provider, plan.targetFormat, plan.requestKind);
151
- }
152
-
153
138
  function maybeQueueLargeRequestLog({
154
139
  env,
155
140
  onLargeRequestLog,
@@ -1089,27 +1074,8 @@ export async function makeProviderCall({
1089
1074
  }
1090
1075
 
1091
1076
  const executeHttpProviderRequest = async (plan) => {
1092
- let providerUrl;
1093
- try {
1094
- providerUrl = await resolveRequestProviderUrl(provider, plan, candidate, runtimeFlags);
1095
- } catch (error) {
1096
- return jsonResponse({
1097
- type: "error",
1098
- error: {
1099
- type: "api_error",
1100
- message: error instanceof Error ? error.message : String(error)
1101
- }
1102
- }, 503);
1103
- }
1104
- if (!providerUrl) {
1105
- return jsonResponse({
1106
- type: "error",
1107
- error: {
1108
- type: "configuration_error",
1109
- message: `Provider ${provider.id} has invalid baseUrl.`
1110
- }
1111
- }, 500);
1112
- }
1077
+ const providerUrl = resolveProviderUrl(provider, plan.targetFormat, plan.requestKind);
1078
+ if (!providerUrl) return null;
1113
1079
  const headers = mergeCachingHeaders(
1114
1080
  buildProviderHeaders(provider, env, plan.targetFormat),
1115
1081
  requestHeaders,
@@ -1148,6 +1114,22 @@ export async function makeProviderCall({
1148
1114
  }
1149
1115
  };
1150
1116
 
1117
+ if (!resolveProviderUrl(provider, activePlan.targetFormat, activePlan.requestKind)) {
1118
+ return {
1119
+ ok: false,
1120
+ status: 500,
1121
+ retryable: false,
1122
+ errorKind: "configuration_error",
1123
+ response: jsonResponse({
1124
+ type: "error",
1125
+ error: {
1126
+ type: "configuration_error",
1127
+ message: `Provider ${provider.id} has invalid baseUrl.`
1128
+ }
1129
+ }, 500)
1130
+ };
1131
+ }
1132
+
1151
1133
  let response;
1152
1134
  try {
1153
1135
  response = await executeHttpProviderRequest(activePlan);
@@ -27,10 +27,7 @@ export function resolveRuntimeFlags(options = {}, env = {}) {
27
27
  workerRuntime,
28
28
  workerSafeMode,
29
29
  allowBestEffortStatefulRouting,
30
- statefulRoutingEnabled: !workerSafeMode || allowBestEffortStatefulRouting,
31
- ...(typeof options.resolveLocalRuntimeBaseUrl === "function"
32
- ? { resolveLocalRuntimeBaseUrl: options.resolveLocalRuntimeBaseUrl }
33
- : {})
30
+ statefulRoutingEnabled: !workerSafeMode || allowBestEffortStatefulRouting
34
31
  };
35
32
  }
36
33
 
@@ -20,34 +20,6 @@ function normalizePositiveNumber(value) {
20
20
  return Math.floor(parsed);
21
21
  }
22
22
 
23
- function normalizeRuntimeProfile(raw = {}) {
24
- const source = isPlainObject(raw) ? raw : {};
25
- const overrides = isPlainObject(source.overrides) ? { ...source.overrides } : {};
26
- const extraArgs = Array.isArray(source.extraArgs)
27
- ? source.extraArgs.map((value) => normalizeString(value)).filter(Boolean)
28
- : [];
29
-
30
- return {
31
- mode: normalizeString(source.mode) === "custom" ? "custom" : "auto",
32
- preset: normalizeString(source.preset) || "balanced",
33
- overrides,
34
- extraArgs,
35
- lastKnownGood: isPlainObject(source.lastKnownGood) ? { ...source.lastKnownGood } : null,
36
- lastFailure: isPlainObject(source.lastFailure) ? { ...source.lastFailure } : null
37
- };
38
- }
39
-
40
- function normalizeRuntimeStatus(raw = {}) {
41
- const source = isPlainObject(raw) ? raw : {};
42
-
43
- return {
44
- activeInstanceId: normalizeString(source.activeInstanceId),
45
- lastFailure: isPlainObject(source.lastFailure) ? { ...source.lastFailure } : null,
46
- lastStartedAt: normalizeString(source.lastStartedAt),
47
- lastHealthyAt: normalizeString(source.lastHealthyAt)
48
- };
49
- }
50
-
51
23
  function normalizeLocalModelLibraryEntry(key, entry) {
52
24
  if (!isPlainObject(entry)) return null;
53
25
 
@@ -104,14 +76,6 @@ function normalizeLocalModelVariantEntry(key, entry) {
104
76
  else delete normalized.availability;
105
77
  }
106
78
 
107
- if (normalized.runtime === "llamacpp") {
108
- normalized.runtimeProfile = normalizeRuntimeProfile(entry.runtimeProfile);
109
- normalized.runtimeStatus = normalizeRuntimeStatus(entry.runtimeStatus);
110
- } else {
111
- delete normalized.runtimeProfile;
112
- delete normalized.runtimeStatus;
113
- }
114
-
115
79
  return normalized;
116
80
  }
117
81
 
@@ -0,0 +1,179 @@
1
+ /**
2
+ * Pure logic for provider quota-probe snapshots, verdicts, and config normalization.
3
+ * Zero IO — all functions are deterministic and side-effect free.
4
+ */
5
+
6
+ const VALID_CAP_KINDS = new Set(["dollars", "tokens", "requests"]);
7
+ const VALID_COMBINATORS = new Set(["AND", "OR", "REPLACE"]);
8
+ const VALID_ENFORCE_MODES = new Set(["gate", "observe"]);
9
+ const VALID_PROBE_MODES = new Set(["http", "custom"]);
10
+ const VALID_HTTP_METHODS = new Set(["GET", "POST"]);
11
+
12
+ const HTTP_TIMEOUT_DEFAULT = 5000;
13
+ const HTTP_TIMEOUT_CAP = 15000;
14
+ const CUSTOM_TIMEOUT_DEFAULT = 2000;
15
+ const CUSTOM_TIMEOUT_CAP = 10000;
16
+
17
+ function isFiniteNonNeg(v) {
18
+ return typeof v === "number" && Number.isFinite(v) && v >= 0;
19
+ }
20
+
21
+ export function validateSnapshot(raw) {
22
+ if (!raw || typeof raw !== "object") {
23
+ return { valid: false, error: "snapshot must be an object" };
24
+ }
25
+ if (!VALID_CAP_KINDS.has(raw.capKind)) {
26
+ return { valid: false, error: `invalid capKind: ${raw.capKind}` };
27
+ }
28
+ for (const field of ["used", "limit", "remaining"]) {
29
+ if (field in raw && raw[field] !== undefined && raw[field] !== null) {
30
+ if (!isFiniteNonNeg(raw[field])) {
31
+ return { valid: false, error: `${field} must be a non-negative finite number` };
32
+ }
33
+ }
34
+ }
35
+ if (!raw.isUnlimited) {
36
+ const present = ["used", "limit", "remaining"].filter(
37
+ (f) => f in raw && isFiniteNonNeg(raw[f])
38
+ );
39
+ if (present.length < 2) {
40
+ return { valid: false, error: "at least two of {used, limit, remaining} required" };
41
+ }
42
+ }
43
+ return { valid: true, error: null };
44
+ }
45
+
46
+ export function deriveSnapshot(raw) {
47
+ const out = { ...raw };
48
+ const hasUsed = isFiniteNonNeg(out.used);
49
+ const hasLimit = isFiniteNonNeg(out.limit);
50
+ const hasRemaining = isFiniteNonNeg(out.remaining);
51
+
52
+ if (hasUsed && hasLimit && !hasRemaining) {
53
+ out.remaining = out.limit - out.used;
54
+ } else if (hasLimit && hasRemaining && !hasUsed) {
55
+ out.used = out.limit - out.remaining;
56
+ } else if (hasUsed && hasRemaining && !hasLimit) {
57
+ out.limit = out.used + out.remaining;
58
+ }
59
+ return out;
60
+ }
61
+
62
+ export function isExhausted(snapshot, safetyMargin) {
63
+ if (snapshot.isUnlimited) return false;
64
+ if (!isFiniteNonNeg(snapshot.remaining)) return false;
65
+
66
+ const dollarMargin = safetyMargin?.dollars ?? 0;
67
+ const percentMargin = safetyMargin?.percent ?? 0;
68
+ const limitBased = isFiniteNonNeg(snapshot.limit)
69
+ ? (snapshot.limit * percentMargin) / 100
70
+ : 0;
71
+ const effectiveMargin = Math.max(dollarMargin, limitBased);
72
+ return snapshot.remaining <= effectiveMargin;
73
+ }
74
+
75
+ export function resolveProbeVerdict(snapshot, probeConfig, _now) {
76
+ if (!snapshot || !probeConfig?.enabled) return null;
77
+ if (probeConfig.enforce !== "gate") return null;
78
+ if (snapshot.state === "unknown" || snapshot.state === "errored") return null;
79
+ if (snapshot.isUnlimited) return { available: true, reason: "unlimited" };
80
+
81
+ const derived = deriveSnapshot(snapshot);
82
+ const margin = probeConfig.safetyMargin ?? { dollars: 0, percent: 0 };
83
+ if (isExhausted(derived, margin)) {
84
+ return { available: false, reason: "quota exhausted" };
85
+ }
86
+ return { available: true, reason: "within budget" };
87
+ }
88
+
89
+ export function applyQuotaProbeGate({ combinator, probeAvailable, rateLimitEligible }) {
90
+ const probeOk = probeAvailable === null || probeAvailable === undefined ? true : probeAvailable;
91
+ const rlOk = !!rateLimitEligible;
92
+
93
+ switch (combinator) {
94
+ case "OR":
95
+ return probeOk || rlOk
96
+ ? { eligible: true, skipReason: null }
97
+ : { eligible: false, skipReason: "probe and rate-limit both unavailable" };
98
+ case "REPLACE":
99
+ return probeOk
100
+ ? { eligible: true, skipReason: null }
101
+ : { eligible: false, skipReason: "probe unavailable" };
102
+ case "AND":
103
+ default:
104
+ if (!probeOk && !rlOk) return { eligible: false, skipReason: "probe and rate-limit both unavailable" };
105
+ if (!probeOk) return { eligible: false, skipReason: "probe unavailable" };
106
+ if (!rlOk) return { eligible: false, skipReason: "rate-limit exceeded" };
107
+ return { eligible: true, skipReason: null };
108
+ }
109
+ }
110
+
111
+ function clampTimeout(value, defaultVal, cap) {
112
+ const n = Number(value);
113
+ if (!Number.isFinite(n) || n <= 0) return defaultVal;
114
+ return Math.min(n, cap);
115
+ }
116
+
117
+ function normalizeHttpBlock(raw) {
118
+ if (!raw || typeof raw !== "object") return null;
119
+ const method = VALID_HTTP_METHODS.has(raw.method) ? raw.method : "GET";
120
+ const url = typeof raw.url === "string" ? raw.url : "";
121
+ const headers = Array.isArray(raw.headers) ? raw.headers : [];
122
+ const body = raw.body !== undefined ? raw.body : undefined;
123
+ const timeoutMs = clampTimeout(raw.timeoutMs, HTTP_TIMEOUT_DEFAULT, HTTP_TIMEOUT_CAP);
124
+ const mapping = raw.mapping && typeof raw.mapping === "object" ? raw.mapping : {};
125
+ return { method, url, headers, body, timeoutMs, mapping };
126
+ }
127
+
128
+ function normalizeCustomBlock(raw) {
129
+ if (!raw || typeof raw !== "object") return null;
130
+ const source = typeof raw.source === "string" ? raw.source : "";
131
+ const timeoutMs = clampTimeout(raw.timeoutMs, CUSTOM_TIMEOUT_DEFAULT, CUSTOM_TIMEOUT_CAP);
132
+ return { source, timeoutMs };
133
+ }
134
+
135
+ function normalizeMargin(raw) {
136
+ if (!raw || typeof raw !== "object") return { dollars: 0, percent: 0 };
137
+ const dollars = isFiniteNonNeg(raw.dollars) ? raw.dollars : 0;
138
+ const percent = isFiniteNonNeg(raw.percent) ? raw.percent : 0;
139
+ return { dollars, percent };
140
+ }
141
+
142
+ function normalizeRefreshTriggers(raw) {
143
+ const defaults = { onUiOpen: false, onManual: true, onResetAt: false, onUpstreamError: null };
144
+ if (!raw || typeof raw !== "object") return defaults;
145
+ const out = {
146
+ onUiOpen: !!raw.onUiOpen,
147
+ onManual: true,
148
+ onResetAt: !!raw.onResetAt,
149
+ onUpstreamError: null
150
+ };
151
+ if (raw.onUpstreamError && typeof raw.onUpstreamError === "object") {
152
+ out.onUpstreamError = {
153
+ statusCodes: Array.isArray(raw.onUpstreamError.statusCodes)
154
+ ? raw.onUpstreamError.statusCodes.filter((c) => Number.isFinite(c))
155
+ : [],
156
+ bodyRegex: typeof raw.onUpstreamError.bodyRegex === "string"
157
+ ? raw.onUpstreamError.bodyRegex
158
+ : null
159
+ };
160
+ }
161
+ return out;
162
+ }
163
+
164
+ export function normalizeQuotaProbeConfig(raw) {
165
+ if (!raw || typeof raw !== "object" || raw.enabled !== true) return null;
166
+
167
+ const capKind = VALID_CAP_KINDS.has(raw.capKind) ? raw.capKind : null;
168
+ if (!capKind) return null;
169
+
170
+ const combinator = VALID_COMBINATORS.has(raw.combinator) ? raw.combinator : "AND";
171
+ const enforce = VALID_ENFORCE_MODES.has(raw.enforce) ? raw.enforce : "gate";
172
+ const mode = VALID_PROBE_MODES.has(raw.mode) ? raw.mode : "http";
173
+ const safetyMargin = normalizeMargin(raw.safetyMargin);
174
+ const http = normalizeHttpBlock(raw.http);
175
+ const custom = normalizeCustomBlock(raw.custom);
176
+ const refreshTriggers = normalizeRefreshTriggers(raw.refreshTriggers);
177
+
178
+ return { enabled: true, capKind, combinator, enforce, mode, safetyMargin, http, custom, refreshTriggers };
179
+ }
@@ -1,202 +0,0 @@
1
- export function createLlamacppManagedRuntimeRegistry(deps = {}) {
2
- const instances = new Map();
3
- const inFlightStarts = new Map();
4
- let nextPort = 39391;
5
- const MIN_PORT = 1;
6
- const MAX_PORT = 65535;
7
-
8
- function resolveSpawnRuntime(overrides = {}) {
9
- if (typeof overrides.spawnRuntime === "function") return overrides.spawnRuntime;
10
- if (typeof deps.spawnRuntime === "function") return deps.spawnRuntime;
11
- return async ({ host = "127.0.0.1", port } = {}) => ({
12
- pid: undefined,
13
- host,
14
- port,
15
- baseUrl: `http://${host}:${port}/v1`
16
- });
17
- }
18
-
19
- function resolveWaitForHealthy(overrides = {}) {
20
- if (typeof overrides.waitForHealthy === "function") return overrides.waitForHealthy;
21
- if (typeof deps.waitForHealthy === "function") return deps.waitForHealthy;
22
- return async (instance) => ({ ...instance, healthy: true });
23
- }
24
-
25
- function resolveListListeningPids(overrides = {}) {
26
- if (typeof overrides.listListeningPids === "function") return overrides.listListeningPids;
27
- if (typeof deps.listListeningPids === "function") return deps.listListeningPids;
28
- return async () => [];
29
- }
30
-
31
- function resolveStopProcessByPid(overrides = {}) {
32
- if (typeof overrides.stopProcessByPid === "function") return overrides.stopProcessByPid;
33
- if (typeof deps.stopProcessByPid === "function") return deps.stopProcessByPid;
34
- return async () => {};
35
- }
36
-
37
- function isTrackedInstanceReusable(instance) {
38
- if (instance?.healthy !== true) return false;
39
- const child = instance?.child;
40
- if (child) {
41
- return child.exitCode === null && child.killed !== true;
42
- }
43
- return true;
44
- }
45
-
46
- function isChildAlive(child) {
47
- if (!child) return true;
48
- return child.exitCode === null && child.killed !== true;
49
- }
50
-
51
- function normalizeRuntimePort(value, fallback = null) {
52
- const parsed = Number(value);
53
- if (!Number.isInteger(parsed) || parsed < MIN_PORT || parsed > MAX_PORT) return fallback;
54
- return parsed;
55
- }
56
-
57
- function buildCompatibilityKey(variantKey, profileHash) {
58
- return `${String(variantKey || "")}::${String(profileHash || "")}`;
59
- }
60
-
61
- function buildReservedPorts() {
62
- const reserved = new Set();
63
- for (const instance of instances.values()) {
64
- if (!isChildAlive(instance?.child)) continue;
65
- const port = normalizeRuntimePort(instance?.port);
66
- if (port !== null) reserved.add(port);
67
- }
68
- for (const start of inFlightStarts.values()) {
69
- const port = normalizeRuntimePort(start?.reservedPort);
70
- if (port !== null) reserved.add(port);
71
- }
72
- return reserved;
73
- }
74
-
75
- function pruneDeadInstances() {
76
- for (const [instanceId, instance] of instances.entries()) {
77
- if (!isChildAlive(instance?.child)) {
78
- instances.delete(instanceId);
79
- }
80
- }
81
- }
82
-
83
- function allocatePort(preferredPort) {
84
- const reservedPorts = buildReservedPorts();
85
- const preferred = normalizeRuntimePort(preferredPort);
86
- if (preferred !== null && !reservedPorts.has(preferred)) {
87
- if (preferred >= nextPort) nextPort = preferred + 1;
88
- return preferred;
89
- }
90
-
91
- let port = Math.max(39391, nextPort);
92
- if (port > MAX_PORT) {
93
- port = 39391;
94
- }
95
- const startPort = port;
96
- while (reservedPorts.has(port)) {
97
- port += 1;
98
- if (port > MAX_PORT) {
99
- port = 39391;
100
- }
101
- if (port === startPort) {
102
- throw new Error("No available managed runtime port.");
103
- }
104
- }
105
-
106
- nextPort = port + 1;
107
- return port;
108
- }
109
-
110
- async function ensureRuntimeForVariant({ variantKey, profileHash, launchArgs, preferredPort } = {}, runtimeDeps = {}) {
111
- const spawnRuntime = resolveSpawnRuntime(runtimeDeps);
112
- const waitForHealthy = resolveWaitForHealthy(runtimeDeps);
113
- const compatibilityKey = buildCompatibilityKey(variantKey, profileHash);
114
- pruneDeadInstances();
115
-
116
- for (const instance of instances.values()) {
117
- if (
118
- instance.profileHash === profileHash
119
- && instance.variantKey === variantKey
120
- && isTrackedInstanceReusable(instance)
121
- ) {
122
- return instance;
123
- }
124
- }
125
-
126
- const inFlight = inFlightStarts.get(compatibilityKey);
127
- if (inFlight?.promise) {
128
- return inFlight.promise;
129
- }
130
-
131
- const port = allocatePort(preferredPort);
132
- const startPromise = (async () => {
133
- const spawned = await spawnRuntime({ variantKey, profileHash, launchArgs, port });
134
- const healthy = await waitForHealthy(spawned);
135
- const assignedPort = normalizeRuntimePort(healthy?.port, port);
136
- if (!isChildAlive(healthy?.child)) {
137
- throw new Error("Managed runtime exited before becoming healthy.");
138
- }
139
- const instance = {
140
- instanceId: `${variantKey}:${profileHash}:${assignedPort}`,
141
- owner: "llm-router",
142
- variantKey,
143
- profileHash,
144
- healthy: true,
145
- ...healthy,
146
- port: assignedPort
147
- };
148
- instances.set(instance.instanceId, instance);
149
- return instance;
150
- })().finally(() => {
151
- inFlightStarts.delete(compatibilityKey);
152
- });
153
-
154
- inFlightStarts.set(compatibilityKey, { promise: startPromise, reservedPort: port });
155
- return startPromise;
156
- }
157
-
158
- async function reconcile(runtimeDeps = {}) {
159
- const listListeningPids = resolveListListeningPids(runtimeDeps);
160
- const stopProcessByPid = resolveStopProcessByPid(runtimeDeps);
161
- for (const [instanceId, instance] of instances.entries()) {
162
- const probe = await listListeningPids(instance.port).catch(() => null);
163
- const livePids = Array.isArray(probe)
164
- ? probe
165
- : (probe && typeof probe === "object" && Array.isArray(probe.pids) ? probe.pids : null);
166
- const probeFailed = Boolean(probe && typeof probe === "object" && probe.ok === false);
167
- if (probeFailed || !Array.isArray(livePids)) continue;
168
- if (livePids.includes(instance.pid)) continue;
169
- if (instance.owner === "llm-router") {
170
- await stopProcessByPid(instance.pid).catch(() => {});
171
- }
172
- instances.delete(instanceId);
173
- }
174
- }
175
-
176
- async function waitForInFlightStarts() {
177
- while (inFlightStarts.size > 0) {
178
- const pending = [...inFlightStarts.values()]
179
- .map((entry) => entry?.promise)
180
- .filter(Boolean)
181
- .map((promise) => promise.catch(() => null));
182
- if (pending.length === 0) return;
183
- await Promise.all(pending);
184
- }
185
- }
186
-
187
- return {
188
- ensureRuntimeForVariant,
189
- reconcile,
190
- waitForInFlightStarts,
191
- trackInstance: async (instance) => {
192
- instances.set(instance.instanceId, { ...instance });
193
- },
194
- untrackInstance: async (instanceId) => {
195
- instances.delete(instanceId);
196
- },
197
- clear: async () => {
198
- instances.clear();
199
- },
200
- snapshot: () => [...instances.values()]
201
- };
202
- }
@@ -1,133 +0,0 @@
1
- function isPlainObject(value) {
2
- return Boolean(value) && typeof value === "object" && !Array.isArray(value);
3
- }
4
-
5
- function normalizeString(value) {
6
- return typeof value === "string" ? value.trim() : "";
7
- }
8
-
9
- function toGiB(bytes) {
10
- return Math.round((Number(bytes || 0) / (1024 ** 3)) * 10) / 10;
11
- }
12
-
13
- function normalizePositiveInteger(value, fallback) {
14
- const parsed = Number(value);
15
- if (!Number.isFinite(parsed) || parsed <= 0) return fallback;
16
- return Math.floor(parsed);
17
- }
18
-
19
- const LLAMACPP_PRESET_TUNING = Object.freeze({
20
- balanced: Object.freeze({
21
- canonicalPreset: "balanced",
22
- batchSize: 64,
23
- ubatchSize: 16,
24
- gpuLayers: { darwin: 99, other: 0 },
25
- penaltyRatio: 0.10,
26
- noContBatching: false
27
- }),
28
- "long-context": Object.freeze({
29
- canonicalPreset: "long-context",
30
- batchSize: 32,
31
- ubatchSize: 8,
32
- gpuLayers: { darwin: 80, other: 0 },
33
- penaltyRatio: 0.16,
34
- noContBatching: false
35
- }),
36
- "low-memory": Object.freeze({
37
- canonicalPreset: "low-memory",
38
- batchSize: 32,
39
- ubatchSize: 8,
40
- gpuLayers: { darwin: 0, other: 0 },
41
- penaltyRatio: 0.04,
42
- noContBatching: true
43
- }),
44
- "fast-response": Object.freeze({
45
- canonicalPreset: "fast-response",
46
- batchSize: 16,
47
- ubatchSize: 8,
48
- gpuLayers: { darwin: 40, other: 0 },
49
- penaltyRatio: 0.07,
50
- noContBatching: false
51
- }),
52
- "cpu-safe": Object.freeze({
53
- canonicalPreset: "cpu-safe",
54
- batchSize: 32,
55
- ubatchSize: 8,
56
- gpuLayers: { darwin: 0, other: 0 },
57
- penaltyRatio: 0.04,
58
- noContBatching: true
59
- })
60
- });
61
-
62
- function resolveCanonicalPreset(requestedPreset) {
63
- const normalizedPreset = normalizeString(requestedPreset).toLowerCase();
64
- if (normalizedPreset === "throughput") return LLAMACPP_PRESET_TUNING["fast-response"];
65
- if (normalizedPreset === "memory-safe") return LLAMACPP_PRESET_TUNING["low-memory"];
66
- return LLAMACPP_PRESET_TUNING[normalizedPreset] || LLAMACPP_PRESET_TUNING.balanced;
67
- }
68
-
69
- export function estimateLlamacppRuntimeBytes({
70
- sizeBytes = 0,
71
- contextWindow = 0,
72
- preset = "balanced"
73
- } = {}) {
74
- const base = Number(sizeBytes || 0);
75
- const contextBytes = Number(contextWindow || 0) * 163840;
76
- const tuning = resolveCanonicalPreset(preset);
77
- const presetPenalty = Math.floor(base * tuning.penaltyRatio);
78
- return base + contextBytes + presetPenalty;
79
- }
80
-
81
- export function deriveLlamacppLaunchProfile({
82
- variant,
83
- baseModel,
84
- system
85
- } = {}) {
86
- const requestedPreset = normalizeString(variant?.preset)
87
- || normalizeString(variant?.runtimeProfile?.preset)
88
- || "balanced";
89
- const failureCategory = normalizeString(
90
- variant?.runtimeProfile?.lastFailure?.category || variant?.runtimeStatus?.lastFailure?.category
91
- );
92
- const tuning = resolveCanonicalPreset(failureCategory === "metal-oom" ? "cpu-safe" : requestedPreset);
93
- const preset = tuning.canonicalPreset;
94
- const contextWindow = normalizePositiveInteger(variant?.contextWindow, 2048);
95
- const overrides = isPlainObject(variant?.runtimeProfile?.overrides) ? variant.runtimeProfile.overrides : {};
96
- const extraArgs = Array.isArray(variant?.runtimeProfile?.extraArgs)
97
- ? variant.runtimeProfile.extraArgs.map((value) => normalizeString(value)).filter(Boolean)
98
- : [];
99
- const gpuLayers = Number.isFinite(Number(overrides.gpuLayers))
100
- ? Math.floor(Number(overrides.gpuLayers))
101
- : (system?.platform === "darwin" ? tuning.gpuLayers.darwin : tuning.gpuLayers.other);
102
- const batchSize = Number.isFinite(Number(overrides.batchSize))
103
- ? Math.floor(Number(overrides.batchSize))
104
- : tuning.batchSize;
105
- const ubatchSize = Number.isFinite(Number(overrides.ubatchSize))
106
- ? Math.floor(Number(overrides.ubatchSize))
107
- : tuning.ubatchSize;
108
- const estimatedRuntimeBytes = estimateLlamacppRuntimeBytes({
109
- sizeBytes: baseModel?.metadata?.sizeBytes,
110
- contextWindow,
111
- preset
112
- });
113
- const args = [
114
- "-m", normalizeString(baseModel?.path),
115
- "-a", normalizeString(variant?.id),
116
- "-c", String(contextWindow),
117
- "-np", "1",
118
- "-b", String(batchSize),
119
- "-ub", String(ubatchSize),
120
- "--cache-ram", "0",
121
- "--no-warmup"
122
- ];
123
-
124
- if (tuning.noContBatching) args.push("--no-cont-batching");
125
- args.push("-ngl", String(gpuLayers), ...extraArgs);
126
-
127
- return {
128
- preset,
129
- args: args.filter(Boolean),
130
- estimatedRuntimeBytes,
131
- memoryLabel: `${toGiB(estimatedRuntimeBytes)} GB`
132
- };
133
- }