@khanglvm/llm-router 2.3.1 → 2.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. package/CHANGELOG.md +15 -0
  2. package/README.md +2 -2
  3. package/package.json +1 -1
  4. package/src/cli/router-module.js +32 -5
  5. package/src/node/coding-tool-config.js +138 -25
  6. package/src/node/large-request-log.js +54 -0
  7. package/src/node/litellm-context-catalog.js +13 -1
  8. package/src/node/local-server.js +10 -0
  9. package/src/node/ollama-client.js +195 -0
  10. package/src/node/ollama-hardware.js +94 -0
  11. package/src/node/ollama-install.js +230 -0
  12. package/src/node/provider-probe.js +69 -5
  13. package/src/node/web-console-client.js +36 -36
  14. package/src/node/web-console-server.js +478 -8
  15. package/src/node/web-console-styles.generated.js +1 -1
  16. package/src/node/web-console-ui/amp-utils.js +272 -0
  17. package/src/node/web-console-ui/api-client.js +128 -0
  18. package/src/node/web-console-ui/capability-utils.js +36 -0
  19. package/src/node/web-console-ui/config-editor-utils.js +20 -5
  20. package/src/node/web-console-ui/constants.js +140 -0
  21. package/src/node/web-console-ui/context-window-utils.js +262 -0
  22. package/src/node/web-console-ui/hooks/use-reorder-layout-animation.js +65 -0
  23. package/src/node/web-console-ui/provider-presets.js +211 -0
  24. package/src/node/web-console-ui/quick-start-utils.js +790 -0
  25. package/src/node/web-console-ui/utils.js +353 -0
  26. package/src/node/web-console-ui/web-search-utils.js +460 -0
  27. package/src/runtime/config.js +96 -9
  28. package/src/runtime/handler/fallback.js +71 -0
  29. package/src/runtime/handler/field-filter.js +39 -0
  30. package/src/runtime/handler/large-request-log.js +211 -0
  31. package/src/runtime/handler/provider-call.js +276 -15
  32. package/src/runtime/handler/reasoning-effort.js +11 -1
  33. package/src/runtime/handler/tool-name-sanitizer.js +258 -0
  34. package/src/runtime/handler.js +16 -3
  35. package/src/shared/coding-tool-bindings.js +3 -0
package/src/node/ollama-client.js
@@ -0,0 +1,195 @@
+ /** Ollama REST API client. All exports return { ok, error?, ... } — never throw. */
+
+ const DEFAULT_TIMEOUT_MS = 5_000;
+ const LOAD_TIMEOUT_MS = 120_000;
+ const PULL_TIMEOUT_MS = 600_000;
+
+ /**
+  * @param {string} baseUrl e.g. "http://localhost:11434"
+  * @param {string} path must start with "/"
+  * @param {RequestInit & { timeoutMs?: number }} options
+  * @returns {Promise<{ ok: boolean, status: number, json: unknown, error: string | null }>}
+  */
+ async function ollamaFetch(baseUrl, path, options = {}) {
+   const { timeoutMs = DEFAULT_TIMEOUT_MS, ...init } = options;
+   const url = baseUrl.replace(/\/+$/, "") + path;
+   let response;
+   let json = null;
+   let error = null;
+
+   try {
+     response = await fetch(url, {
+       ...init,
+       signal: init.signal ?? AbortSignal.timeout(timeoutMs)
+     });
+     const text = await response.text();
+     if (text) {
+       try {
+         json = JSON.parse(text);
+       } catch {
+         // non-JSON body — leave json null
+       }
+     }
+   } catch (err) {
+     error = err instanceof Error ? err.message : String(err);
+   }
+
+   return {
+     ok: Boolean(response?.ok),
+     status: response?.status ?? 0,
+     json,
+     error
+   };
+ }
+
+ /** Check whether the Ollama server is reachable. */
+ export async function ollamaCheckConnection(baseUrl) {
+   const result = await ollamaFetch(baseUrl, "/");
+   if (!result.ok) {
+     return { ok: false, error: result.error ?? `HTTP ${result.status}` };
+   }
+   return { ok: true };
+ }
+
+ /** List all locally available models. */
+ export async function ollamaListModels(baseUrl) {
+   const result = await ollamaFetch(baseUrl, "/api/tags");
+   if (!result.ok) {
+     return { ok: false, error: result.error ?? `HTTP ${result.status}` };
+   }
+   const raw = result.json?.models ?? [];
+   const models = raw.map((m) => ({
+     name: m.name,
+     parameterSize: m.details?.parameter_size ?? null,
+     quantizationLevel: m.details?.quantization_level ?? null,
+     sizeBytes: m.size ?? null,
+     family: m.details?.family ?? null,
+     modifiedAt: m.modified_at ?? null,
+     contextLength: null
+   }));
+   // Enrich with contextLength from /api/show (max 5 concurrent)
+   const BATCH = 5;
+   for (let i = 0; i < models.length; i += BATCH) {
+     const batch = models.slice(i, i + BATCH);
+     const details = await Promise.all(
+       batch.map((m) => ollamaShowModel(baseUrl, m.name).catch(() => ({ ok: false })))
+     );
+     for (let j = 0; j < batch.length; j++) {
+       if (details[j]?.ok && details[j].details?.contextLength) {
+         batch[j].contextLength = details[j].details.contextLength;
+       }
+     }
+   }
+   return { ok: true, models };
+ }
+
+ /** Show detailed info for a specific model. */
+ export async function ollamaShowModel(baseUrl, modelName) {
+   const result = await ollamaFetch(baseUrl, "/api/show", {
+     method: "POST",
+     headers: { "Content-Type": "application/json" },
+     body: JSON.stringify({ model: modelName, verbose: false })
+   });
+   if (!result.ok) {
+     return { ok: false, error: result.error ?? `HTTP ${result.status}` };
+   }
+   const body = result.json ?? {};
+   // model_info keys are architecture-prefixed (e.g. "llama.context_length",
+   // "qwen2.context_length"), so scan for the first key ending in ".context_length".
+   const modelInfo = body.model_info ?? {};
+   const contextKey = Object.keys(modelInfo).find((k) => k.endsWith(".context_length"));
+   const details = {
+     contextLength: contextKey ? modelInfo[contextKey] : null,
+     parameterSize: body.details?.parameter_size ?? null,
+     quantizationLevel: body.details?.quantization_level ?? null,
+     family: body.details?.family ?? null,
+     format: body.details?.format ?? null
+   };
+   return { ok: true, details };
+ }
+
+ /** List currently running (loaded) models. */
+ export async function ollamaListRunning(baseUrl) {
+   const result = await ollamaFetch(baseUrl, "/api/ps");
+   if (!result.ok) {
+     return { ok: false, error: result.error ?? `HTTP ${result.status}` };
+   }
+   const raw = result.json?.models ?? [];
+   const PINNED_SENTINEL = "0001-01-01T00:00:00Z";
+   const models = raw.map((m) => ({
+     name: m.name,
+     sizeVram: m.size_vram ?? null,
+     expiresAt: m.expires_at ?? null,
+     isPinned: m.expires_at === PINNED_SENTINEL,
+     // Mirror `ollama ps`: fully in VRAM => "gpu", none => "cpu", otherwise a split.
+     processor: (m.size_vram ?? 0) === 0 ? "cpu" : (m.size_vram ?? 0) >= (m.size ?? 0) ? "gpu" : "cpu/gpu"
+   }));
+   return { ok: true, models };
+ }
+
+ /**
+  * Load a model into memory. keepAlive: "24h" | "10m" | -1 (pin) | 0 (unload).
+  * Uses a 120 s timeout to accommodate large models.
+  */
+ export async function ollamaLoadModel(baseUrl, modelName, keepAlive = "24h") {
+   const result = await ollamaFetch(baseUrl, "/api/generate", {
+     method: "POST",
+     headers: { "Content-Type": "application/json" },
+     body: JSON.stringify({ model: modelName, prompt: "", keep_alive: keepAlive, stream: false }),
+     timeoutMs: LOAD_TIMEOUT_MS
+   });
+   if (!result.ok) {
+     return { ok: false, error: result.error ?? `HTTP ${result.status}` };
+   }
+   // load_duration is reported in nanoseconds; convert to milliseconds.
+   const loadDurationMs =
+     typeof result.json?.load_duration === "number"
+       ? result.json.load_duration / 1_000_000
+       : null;
+   return { ok: true, loadDurationMs };
+ }
+
+ /** Unload a model from memory immediately. */
+ export async function ollamaUnloadModel(baseUrl, modelName) {
+   const result = await ollamaFetch(baseUrl, "/api/generate", {
+     method: "POST",
+     headers: { "Content-Type": "application/json" },
+     body: JSON.stringify({ model: modelName, prompt: "", keep_alive: 0, stream: false }),
+     timeoutMs: LOAD_TIMEOUT_MS
+   });
+   if (!result.ok) {
+     return { ok: false, error: result.error ?? `HTTP ${result.status}` };
+   }
+   return { ok: true, unloaded: true };
+ }
+
+ /** Pin a model in memory indefinitely (keep_alive = -1). */
+ export async function ollamaPinModel(baseUrl, modelName) {
+   return ollamaLoadModel(baseUrl, modelName, -1);
+ }
+
+ /** Set a custom keep-alive duration for a loaded model (e.g. "10m", "1h"). */
+ export async function ollamaSetKeepAlive(baseUrl, modelName, duration) {
+   return ollamaLoadModel(baseUrl, modelName, duration);
+ }
+
+ /** Pull (download) a model from the Ollama registry. 600 s timeout. */
+ export async function ollamaPullModel(baseUrl, modelName) {
+   const result = await ollamaFetch(baseUrl, "/api/pull", {
+     method: "POST",
+     headers: { "Content-Type": "application/json" },
+     body: JSON.stringify({ name: modelName, stream: false }),
+     timeoutMs: PULL_TIMEOUT_MS
+   });
+   if (!result.ok) {
+     return { ok: false, error: result.error ?? `HTTP ${result.status}` };
+   }
+   return { ok: true };
+ }
+
+ /** Delete a locally stored model. */
+ export async function ollamaDeleteModel(baseUrl, modelName) {
+   const result = await ollamaFetch(baseUrl, "/api/delete", {
+     method: "DELETE",
+     headers: { "Content-Type": "application/json" },
+     body: JSON.stringify({ name: modelName })
+   });
+   if (!result.ok) {
+     return { ok: false, error: result.error ?? `HTTP ${result.status}` };
+   }
+   return { ok: true };
+ }
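Every export above follows the `{ ok, error?, ... }` result convention declared at the top of the file, so callers branch on `ok` instead of wrapping calls in try/catch. A minimal consumer sketch (hypothetical caller code, not part of this package):

```js
import { ollamaCheckConnection, ollamaListModels } from "./ollama-client.js";

const baseUrl = "http://localhost:11434";

// No try/catch needed anywhere: failures come back as { ok: false, error }.
const conn = await ollamaCheckConnection(baseUrl);
if (!conn.ok) {
  console.error(`Ollama unreachable: ${conn.error}`);
} else {
  const listed = await ollamaListModels(baseUrl);
  if (listed.ok) {
    for (const m of listed.models) {
      console.log(`${m.name}  ${m.parameterSize ?? "?"}  ctx=${m.contextLength ?? "unknown"}`);
    }
  }
}
```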
package/src/node/ollama-hardware.js
@@ -0,0 +1,94 @@
+ /**
+  * VRAM/memory estimation utilities for Ollama models.
+  * Pure functions, no side effects, no external dependencies.
+  */
+
+ const QUANT_BITS = {
+   q2_k: 2.5,
+   q3_k_s: 3.0, q3_k_m: 3.5, q3_k_l: 3.5,
+   q4_0: 4.0, q4_k_s: 4.5, q4_k_m: 4.5,
+   q5_0: 5.0, q5_k_s: 5.5, q5_k_m: 5.5,
+   q6_k: 6.5, q8_0: 8.0, f16: 16.0, f32: 32.0
+ };
+
+ const DEFAULT_BITS = 4.5; // Q4_K_M
+ const OVERHEAD_BYTES = 512 * 1024 * 1024; // 512 MB
+
+ /**
+  * Parse a parameter-size string like "4.3B" or "70B" into a numeric count.
+  * @param {string} parameterSize
+  * @returns {number|null} Parameter count as an integer, or null if unparseable
+  */
+ export function parseParameterSize(parameterSize) {
+   const match = String(parameterSize || '').trim().match(/^([\d.]+)([BKMGT]?)$/i);
+   if (!match) return null;
+
+   const value = parseFloat(match[1]);
+   if (!Number.isFinite(value) || value <= 0) return null;
+
+   const suffix = match[2].toUpperCase();
+   const multipliers = { B: 1e9, M: 1e6, K: 1e3, G: 1e9, T: 1e12, '': 1 };
+   const multiplier = multipliers[suffix] ?? 1;
+
+   return Math.round(value * multiplier);
+ }
+
+ /**
+  * Estimate VRAM required for a model at a given quantization and context length.
+  * @param {string} parameterSize - e.g. "7B", "70B"
+  * @param {string} quantLevel - e.g. "Q4_K_M", "F16"
+  * @param {number} contextLength - number of tokens in the context window
+  * @returns {{ baseModelBytes: number, kvCacheBytes: number, totalBytes: number }|null}
+  */
+ export function estimateModelVram(parameterSize, quantLevel, contextLength) {
+   const params = parseParameterSize(parameterSize);
+   if (params === null) return null;
+
+   const bitsPerWeight = QUANT_BITS[String(quantLevel || '').toLowerCase()] ?? DEFAULT_BITS;
+   const baseModelBytes = params * bitsPerWeight / 8;
+
+   // Empirical KV cache estimate: bytes per token scales with model size
+   const kvBytesPerToken = params * 0.00000025;
+   const kvCacheBytes = (Number(contextLength) || 0) * kvBytesPerToken;
+
+   const totalBytes = baseModelBytes + kvCacheBytes + OVERHEAD_BYTES;
+   return { baseModelBytes, kvCacheBytes, totalBytes };
+ }
+
+ /**
+  * Calculate the maximum practical context given available memory.
+  * @param {string} parameterSize - e.g. "7B"
+  * @param {string} quantLevel - e.g. "Q4_K_M"
+  * @param {number} availableMemoryBytes
+  * @returns {{ maxContext: number, warningThreshold: number }}
+  */
+ export function estimateMaxContext(parameterSize, quantLevel, availableMemoryBytes) {
+   const params = parseParameterSize(parameterSize);
+   if (params === null) return { maxContext: 0, warningThreshold: 0 };
+
+   const bitsPerWeight = QUANT_BITS[String(quantLevel || '').toLowerCase()] ?? DEFAULT_BITS;
+   const baseModelBytes = params * bitsPerWeight / 8;
+
+   const remaining = availableMemoryBytes - baseModelBytes - OVERHEAD_BYTES;
+   if (remaining <= 0) return { maxContext: 0, warningThreshold: 0 };
+
+   const kvBytesPerToken = params * 0.00000025;
+   // Round down to a multiple of 1024 tokens.
+   const maxContext = Math.floor(remaining / kvBytesPerToken / 1024) * 1024;
+   const warningThreshold = Math.floor(maxContext * 0.85);
+
+   return { maxContext, warningThreshold };
+ }
+
+ /**
+  * Format a byte count as a human-readable string.
+  * @param {number} bytes
+  * @returns {string} e.g. "4.5 GB", "512.0 MB"
+  */
+ export function formatBytes(bytes) {
+   const n = Number(bytes) || 0;
+   if (n >= 1024 ** 4) return `${(n / 1024 ** 4).toFixed(1)} TB`;
+   if (n >= 1024 ** 3) return `${(n / 1024 ** 3).toFixed(1)} GB`;
+   if (n >= 1024 ** 2) return `${(n / 1024 ** 2).toFixed(1)} MB`;
+   if (n >= 1024) return `${(n / 1024).toFixed(1)} KB`;
+   return `${n} B`;
+ }
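To make the arithmetic concrete: a 7B model at Q4_K_M with an 8,192-token context works out to about 3.94 GB of weights, ~14.3 MB of KV cache, and 512 MB of fixed overhead. A quick check derived from this file's constants (usage sketch, not package code):

```js
import { estimateModelVram, formatBytes } from "./ollama-hardware.js";

// base : 7e9 params * 4.5 bits / 8   = 3.9375e9 bytes (~3.94 GB decimal)
// kv   : 8192 tokens * 1750 B/token  = 14,336,000 bytes (~14.3 MB)
// total: base + kv + 512 MiB overhead ≈ 4.49e9 bytes
const est = estimateModelVram("7B", "Q4_K_M", 8192);
console.log(formatBytes(est.totalBytes)); // "4.2 GB" (formatBytes uses 1024-based units)
```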
package/src/node/ollama-install.js
@@ -0,0 +1,230 @@
+ /**
+  * Ollama detection, installation, and server lifecycle management.
+  * All public functions return structured results — never throw.
+  */
+
+ import { spawnSync, spawn } from "node:child_process";
+ import { existsSync } from "node:fs";
+
+ const OLLAMA_PORT_URL = "http://localhost:11434/";
+ const STARTUP_WAIT_MS = 2_000;
+ const HEALTH_TIMEOUT_MS = 3_000;
+
+ /** @type {Record<string, string[]>} */
+ const FALLBACK_PATHS = {
+   darwin: ["/usr/local/bin/ollama", "/opt/homebrew/bin/ollama"],
+   linux: ["/usr/local/bin/ollama", "/usr/bin/ollama"],
+   win32: ["C:\\Program Files\\Ollama\\ollama.exe"]
+ };
+
+ /**
+  * Detect whether Ollama is installed on the system.
+  * @returns {{ installed: boolean, path: string, version: string }}
+  */
+ export function detectOllamaInstallation() {
+   try {
+     const platform = process.platform;
+     const whichCmd = platform === "win32" ? "where" : "which";
+     const which = spawnSync(whichCmd, ["ollama"], { encoding: "utf8" });
+     // `where` can return multiple matches; keep only the first line.
+     let ollamaPath = which.stdout?.trim().split(/\r?\n/)[0] ?? "";
+
+     if (!ollamaPath) {
+       const candidates = FALLBACK_PATHS[platform] ?? [];
+       ollamaPath = candidates.find((p) => existsSync(p)) ?? "";
+     }
+
+     if (!ollamaPath) {
+       return { installed: false, path: "", version: "" };
+     }
+
+     // Use the resolved path: a fallback location may not be on PATH.
+     const ver = spawnSync(ollamaPath, ["--version"], { encoding: "utf8" });
+     const version = ver.stdout?.trim() ?? "";
+     return { installed: true, path: ollamaPath, version };
+   } catch {
+     return { installed: false, path: "", version: "" };
+   }
+ }
+
+ /**
+  * Install Ollama silently per platform.
+  * @param {{ onProgress?: (event: { phase: string, message: string }) => void }} opts
+  * @returns {Promise<{ ok: boolean, version?: string, error?: string, alreadyInstalled?: boolean }>}
+  */
+ export async function installOllama({ onProgress } = {}) {
+   const progress = (phase, message) => onProgress?.({ phase, message });
+
+   try {
+     progress("detecting", "Checking for existing Ollama installation...");
+     const existing = detectOllamaInstallation();
+     if (existing.installed) {
+       progress("done", "Ollama is already installed.");
+       return { ok: true, alreadyInstalled: true, version: existing.version };
+     }
+
+     const platform = process.platform;
+
+     if (platform === "win32") {
+       const msg = "Automatic install not supported on Windows. Please install from https://ollama.com/download";
+       progress("error", msg);
+       return { ok: false, error: msg };
+     }
+
+     if (platform === "darwin") {
+       return await installViaBrew({ progress });
+     }
+
+     if (platform === "linux") {
+       return await installViaScript({ progress });
+     }
+
+     const msg = `Unsupported platform: ${platform}`;
+     progress("error", msg);
+     return { ok: false, error: msg };
+   } catch (err) {
+     const error = err instanceof Error ? err.message : String(err);
+     progress("error", error);
+     return { ok: false, error };
+   }
+ }
+
+ /**
+  * Start the Ollama server as a detached background process.
+  * @returns {Promise<{ ok: boolean, pid?: number, error?: string }>}
+  */
+ export async function startOllamaServer() {
+   try {
+     const child = spawn("ollama", ["serve"], {
+       detached: true,
+       stdio: "ignore"
+     });
+     child.unref();
+     const pid = child.pid;
+
+     await new Promise((resolve) => setTimeout(resolve, STARTUP_WAIT_MS));
+
+     const running = await isOllamaRunning();
+     if (!running) {
+       return { ok: false, error: "Server did not respond after startup" };
+     }
+
+     return { ok: true, pid };
+   } catch (err) {
+     return { ok: false, error: err instanceof Error ? err.message : String(err) };
+   }
+ }
+
+ /**
+  * Stop the Ollama server process.
+  * @returns {{ ok: boolean }}
+  */
+ export function stopOllamaServer() {
+   try {
+     if (process.platform === "win32") {
+       spawnSync("taskkill", ["/IM", "ollama.exe", "/F"]);
+     } else {
+       spawnSync("pkill", ["-x", "ollama"], { timeout: 5000 });
+     }
+     return { ok: true };
+   } catch {
+     return { ok: false };
+   }
+ }
+
+ /**
+  * Check whether the Ollama server is responding.
+  * @returns {Promise<boolean>}
+  */
+ export async function isOllamaRunning() {
+   try {
+     const res = await fetch(OLLAMA_PORT_URL, {
+       signal: AbortSignal.timeout(HEALTH_TIMEOUT_MS)
+     });
+     // Any HTTP response at all means the server process is up.
+     return res.ok || res.status > 0;
+   } catch {
+     return false;
+   }
+ }
+
+ // -- Private helpers --
+
+ async function installViaBrew({ progress }) {
+   const brew = spawnSync("which", ["brew"], { encoding: "utf8" });
+   if (!brew.stdout?.trim()) {
+     const msg =
+       "Homebrew not found. Please install Ollama manually from https://ollama.com/download";
+     progress("error", msg);
+     return { ok: false, error: msg };
+   }
+
+   progress("downloading", "Installing Ollama via Homebrew...");
+   return new Promise((resolve) => {
+     const child = spawn("brew", ["install", "ollama"], { stdio: "pipe" });
+
+     child.stdout.on("data", (d) => progress("installing", d.toString().trim()));
+     child.stderr.on("data", (d) => progress("installing", d.toString().trim()));
+
+     child.on("close", (code) => {
+       if (code !== 0) {
+         const error = `brew install ollama exited with code ${code}`;
+         progress("error", error);
+         return resolve({ ok: false, error });
+       }
+       progress("verifying", "Verifying installation...");
+       const result = detectOllamaInstallation();
+       if (!result.installed) {
+         const error = "Installation succeeded but ollama binary not found";
+         progress("error", error);
+         return resolve({ ok: false, error });
+       }
+       progress("done", "Ollama installed successfully.");
+       resolve({ ok: true, version: result.version });
+     });
+
+     child.on("error", (err) => {
+       progress("error", err.message);
+       resolve({ ok: false, error: err.message });
+     });
+   });
+ }
+
+ async function installViaScript({ progress }) {
+   progress("downloading", "Downloading Ollama install script...");
+   return new Promise((resolve) => {
+     const child = spawn("sh", ["-c", "curl -fsSL https://ollama.com/install.sh | sh"], {
+       stdio: "pipe"
+     });
+
+     child.stdout.on("data", (d) => progress("installing", d.toString().trim()));
+     child.stderr.on("data", (d) => progress("installing", d.toString().trim()));
+
+     child.on("close", (code) => {
+       if (code !== 0) {
+         const error = `Install script exited with code ${code}`;
+         progress("error", error);
+         return resolve({ ok: false, error });
+       }
+       progress("verifying", "Verifying installation...");
+       const result = detectOllamaInstallation();
+       if (!result.installed) {
+         const error = "Installation succeeded but ollama binary not found";
+         progress("error", error);
+         return resolve({ ok: false, error });
+       }
+       progress("done", "Ollama installed successfully.");
+       resolve({ ok: true, version: result.version });
+     });
+
+     child.on("error", (err) => {
+       progress("error", err.message);
+       resolve({ ok: false, error: err.message });
+     });
+   });
+ }
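Taken together, these exports support a detect → install → start flow. A sketch of such a flow (illustrative wiring; the package's real call sites live in web-console-server.js, which this diff does not show in full):

```js
import {
  detectOllamaInstallation,
  installOllama,
  isOllamaRunning,
  startOllamaServer
} from "./ollama-install.js";

// Detect -> install -> start, surfacing progress events as log lines.
if (!detectOllamaInstallation().installed) {
  const res = await installOllama({
    onProgress: ({ phase, message }) => console.log(`[${phase}] ${message}`)
  });
  if (!res.ok) throw new Error(res.error);
}
if (!(await isOllamaRunning())) {
  const started = await startOllamaServer();
  if (!started.ok) throw new Error(started.error);
}
```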
package/src/node/provider-probe.js
@@ -84,6 +84,9 @@ function resolveModelsUrl(baseUrl, format) {
     return clean.replace(/\/chat\/completions$/, "/models");
   }
   if (clean.endsWith("/v1") || isVersionedApiRoot) return `${clean}/models`;
+  // Handle base URLs with a versioned segment followed by a sub-path,
+  // e.g. https://generativelanguage.googleapis.com/v1beta/openai
+  if (/\/v\d+[a-z]*\/(?!chat\b)\w+$/i.test(clean)) return `${clean}/models`;
   return `${clean}/v1/models`;
 }
@@ -191,8 +194,12 @@ function extractModelIds(result) {
   const ids = [];
   for (const item of body.data) {
     if (!item || typeof item !== "object") continue;
-    const id = typeof item.id === "string" ? item.id : (typeof item.name === "string" ? item.name : null);
-    if (id) ids.push(id);
+    let id = typeof item.id === "string" ? item.id : (typeof item.name === "string" ? item.name : null);
+    if (id) {
+      // Strip provider-specific prefixes (e.g., Gemini "models/gemini-*")
+      if (id.startsWith("models/")) id = id.slice(7);
+      ids.push(id);
+    }
   }
   return [...new Set(ids)];
 }
@@ -553,12 +560,14 @@ async function probeOpenAI(baseUrl, apiKey, timeoutMs, extraHeaders = {}) {
   }, timeoutMs);
   details.checks.push({ step: "chat", auth: variant.type, status: chatResult.status, error: chatResult.error || null });

-  if (looksOpenAI(chatResult)) {
+  const modelsLooksValid = looksOpenAI(modelsResult) && authLooksValid(modelsResult);
+
+  if (looksOpenAI(chatResult) || modelsLooksValid) {
     details.supported = true;
-    if (authLooksValid(chatResult)) {
+    if (looksOpenAI(chatResult) ? authLooksValid(chatResult) : modelsLooksValid) {
       details.working = true;
       details.auth = { type: variant.type === "x-api-key" ? "x-api-key" : "bearer" };
-      if (looksOpenAI(modelsResult) && authLooksValid(modelsResult)) {
+      if (modelsLooksValid) {
         details.models = extractModelIds(modelsResult);
       }
       return details;
@@ -799,6 +808,61 @@ function pickPreferredFormatForModel(modelId, formats, { providerPreferredFormat
   return supported[0];
 }

+ /**
+  * Probes a list of models against an OpenAI-compatible endpoint to detect
+  * free-tier availability. Returns a map of modelId -> { freeTier }.
+  * A model is considered not-free-tier if the error body reports "limit: 0"
+  * on a free-tier quota metric.
+  */
+ export async function probeFreeTierModels(options) {
+   const baseUrl = String(options?.baseUrl || "").trim().replace(/\/+$/, "");
+   const apiKey = String(options?.apiKey || "").trim();
+   const modelIds = (options?.modelIds || []).map((id) => String(id || "").trim()).filter(Boolean);
+   const timeoutMs = Number.isFinite(options?.timeoutMs) ? options.timeoutMs : 6000;
+
+   if (!baseUrl || !apiKey || modelIds.length === 0) return {};
+
+   const chatUrl = `${baseUrl}/chat/completions`;
+   const headers = {
+     "Content-Type": "application/json",
+     Authorization: `Bearer ${apiKey}`
+   };
+   const result = {};
+
+   for (const modelId of modelIds) {
+     try {
+       const response = await safeFetchJson(chatUrl, {
+         method: "POST",
+         headers,
+         body: JSON.stringify({
+           model: modelId,
+           messages: [{ role: "user", content: "hi" }],
+           max_tokens: 1,
+           stream: false
+         })
+       }, timeoutMs);
+
+       const text = response.text || "";
+       const isZeroQuota = /limit:\s*0[,\s]/i.test(text) || text.includes('"limit": 0') || text.includes('"limit":0');
+       const isFreeTierQuota = text.includes("free_tier");
+
+       if (isZeroQuota && isFreeTierQuota) {
+         result[modelId] = { freeTier: false };
+       } else if (response.ok || response.status === 400 || response.status === 404) {
+         result[modelId] = { freeTier: true };
+       } else if (response.status === 429 && !isZeroQuota) {
+         result[modelId] = { freeTier: true };
+       } else {
+         result[modelId] = { freeTier: false };
+       }
+     } catch {
+       result[modelId] = { freeTier: null };
+     }
+   }
+
+   return result;
+ }
+
 export async function probeProviderEndpointMatrix(options) {
   const emitProgress = makeProgressEmitter(options?.onProgress);
   const apiKey = String(options?.apiKey || "").trim();