@khanglvm/llm-router 2.3.1 → 2.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +5 -0
- package/README.md +2 -2
- package/package.json +1 -1
- package/src/cli/router-module.js +32 -5
- package/src/node/coding-tool-config.js +138 -25
- package/src/node/large-request-log.js +54 -0
- package/src/node/litellm-context-catalog.js +13 -1
- package/src/node/local-server.js +10 -0
- package/src/node/ollama-client.js +195 -0
- package/src/node/ollama-hardware.js +94 -0
- package/src/node/ollama-install.js +230 -0
- package/src/node/provider-probe.js +69 -5
- package/src/node/web-console-client.js +36 -36
- package/src/node/web-console-server.js +478 -8
- package/src/node/web-console-styles.generated.js +1 -1
- package/src/node/web-console-ui/amp-utils.js +272 -0
- package/src/node/web-console-ui/api-client.js +128 -0
- package/src/node/web-console-ui/capability-utils.js +36 -0
- package/src/node/web-console-ui/config-editor-utils.js +20 -5
- package/src/node/web-console-ui/constants.js +140 -0
- package/src/node/web-console-ui/context-window-utils.js +262 -0
- package/src/node/web-console-ui/hooks/use-reorder-layout-animation.js +65 -0
- package/src/node/web-console-ui/provider-presets.js +211 -0
- package/src/node/web-console-ui/quick-start-utils.js +790 -0
- package/src/node/web-console-ui/utils.js +353 -0
- package/src/node/web-console-ui/web-search-utils.js +460 -0
- package/src/runtime/config.js +96 -9
- package/src/runtime/handler/fallback.js +71 -0
- package/src/runtime/handler/field-filter.js +39 -0
- package/src/runtime/handler/large-request-log.js +211 -0
- package/src/runtime/handler/provider-call.js +185 -15
- package/src/runtime/handler/reasoning-effort.js +11 -1
- package/src/runtime/handler/tool-name-sanitizer.js +258 -0
- package/src/runtime/handler.js +16 -3
- package/src/shared/coding-tool-bindings.js +3 -0
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
/** Ollama REST API client. All exports return { ok, error?, ... } — never throw. */
|
|
2
|
+
|
|
3
|
+
// Default per-request timeout for quick API calls (tags, ps, show, ...).
const DEFAULT_TIMEOUT_MS = 5_000;
// Extended timeout for model load/unload via /api/generate — large models can
// take well over a minute to page into memory.
const LOAD_TIMEOUT_MS = 120_000;
// Timeout for /api/pull, which may download multi-gigabyte model blobs.
const PULL_TIMEOUT_MS = 600_000;
|
|
6
|
+
|
|
7
|
+
/**
 * Perform one HTTP request against the Ollama REST API and normalize the
 * outcome into a result object — this helper never throws.
 *
 * Error semantics: if fetch() itself rejects (network failure, timeout),
 * `error` is set, `ok` is false and `status` is 0. If the response arrives but
 * reading the body throws, `error` is set while `ok`/`status` still reflect
 * the received response. A non-JSON body leaves `json` null with no error.
 *
 * @param {string} baseUrl e.g. "http://localhost:11434"
 * @param {string} path must start with "/"
 * @param {RequestInit & { timeoutMs?: number }} options a caller-supplied
 *   `signal` takes precedence over the default AbortSignal.timeout
 * @returns {Promise<{ ok: boolean, status: number, json: unknown, error: string | null }>}
 */
async function ollamaFetch(baseUrl, path, options = {}) {
  const { timeoutMs = DEFAULT_TIMEOUT_MS, ...init } = options;
  // Strip trailing slashes so "http://host:11434/" + "/api/tags" stays valid.
  const url = baseUrl.replace(/\/+$/, "") + path;
  let response;
  let json = null;
  let error = null;

  try {
    response = await fetch(url, {
      ...init,
      signal: init.signal ?? AbortSignal.timeout(timeoutMs)
    });
    const text = await response.text();
    if (text) {
      try {
        json = JSON.parse(text);
      } catch {
        // non-JSON body — leave json null
      }
    }
  } catch (err) {
    error = err instanceof Error ? err.message : String(err);
  }

  return {
    ok: Boolean(response?.ok),
    status: response?.status ?? 0,
    json,
    error
  };
}
|
|
44
|
+
|
|
45
|
+
/** Check whether the Ollama server is reachable at all. */
export async function ollamaCheckConnection(baseUrl) {
  const { ok, status, error } = await ollamaFetch(baseUrl, "/");
  return ok ? { ok: true } : { ok: false, error: error ?? `HTTP ${status}` };
}
|
|
53
|
+
|
|
54
|
+
/**
 * List all locally available models, enriched with each model's context
 * length via /api/show (at most 5 enrichment requests in flight at a time).
 * @param {string} baseUrl
 * @returns {Promise<{ ok: boolean, models?: object[], error?: string }>}
 */
export async function ollamaListModels(baseUrl) {
  const result = await ollamaFetch(baseUrl, "/api/tags");
  if (!result.ok) {
    return { ok: false, error: result.error ?? `HTTP ${result.status}` };
  }
  // Guard against malformed bodies: a non-array `models` would make .map
  // throw, breaking the module's never-throw contract.
  const raw = Array.isArray(result.json?.models) ? result.json.models : [];
  const models = raw.map((m) => ({
    name: m.name,
    parameterSize: m.details?.parameter_size ?? null,
    quantizationLevel: m.details?.quantization_level ?? null,
    sizeBytes: m.size ?? null,
    family: m.details?.family ?? null,
    modifiedAt: m.modified_at ?? null,
    contextLength: null
  }));
  // Enrich with contextLength from /api/show (max 5 concurrent)
  const BATCH = 5;
  for (let i = 0; i < models.length; i += BATCH) {
    const batch = models.slice(i, i + BATCH);
    const details = await Promise.all(
      batch.map((m) => ollamaShowModel(baseUrl, m.name).catch(() => ({ ok: false })))
    );
    for (let j = 0; j < batch.length; j++) {
      if (details[j]?.ok && details[j].details?.contextLength) {
        batch[j].contextLength = details[j].details.contextLength;
      }
    }
  }
  return { ok: true, models };
}
|
|
85
|
+
|
|
86
|
+
/**
 * Show detailed info for a specific model.
 * @param {string} baseUrl
 * @param {string} modelName
 * @returns {Promise<{ ok: boolean, details?: object, error?: string }>}
 */
export async function ollamaShowModel(baseUrl, modelName) {
  const result = await ollamaFetch(baseUrl, "/api/show", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ model: modelName, verbose: false })
  });
  if (!result.ok) {
    return { ok: false, error: result.error ?? `HTTP ${result.status}` };
  }
  const body = result.json ?? {};
  // Ollama prefixes model_info keys with the model architecture
  // ("llama.context_length", "qwen2.context_length", ...), so a fixed
  // "llm.context_length" lookup misses real models. Keep the original key as
  // the first choice for compatibility, then scan for any architecture's
  // ".context_length" entry.
  const modelInfo = body.model_info ?? {};
  let contextLength = modelInfo["llm.context_length"] ?? null;
  if (contextLength === null) {
    const ctxKey = Object.keys(modelInfo).find((k) => k.endsWith(".context_length"));
    contextLength = ctxKey !== undefined ? modelInfo[ctxKey] ?? null : null;
  }
  const details = {
    contextLength,
    parameterSize: body.details?.parameter_size ?? null,
    quantizationLevel: body.details?.quantization_level ?? null,
    family: body.details?.family ?? null,
    format: body.details?.format ?? null
  };
  return { ok: true, details };
}
|
|
106
|
+
|
|
107
|
+
/** List currently running (loaded) models. */
export async function ollamaListRunning(baseUrl) {
  const result = await ollamaFetch(baseUrl, "/api/ps");
  if (!result.ok) {
    return { ok: false, error: result.error ?? `HTTP ${result.status}` };
  }
  // Ollama reports Go's zero-value timestamp for models pinned indefinitely.
  const PINNED_SENTINEL = "0001-01-01T00:00:00Z";
  const models = (result.json?.models ?? []).map((entry) => ({
    name: entry.name,
    sizeVram: entry.size_vram ?? null,
    expiresAt: entry.expires_at ?? null,
    isPinned: entry.expires_at === PINNED_SENTINEL,
    processor: entry.details?.families?.[0] ?? null
  }));
  return { ok: true, models };
}
|
|
124
|
+
|
|
125
|
+
/**
 * Load a model into memory. keepAlive: "24h" | "10m" | -1 (pin) | 0 (unload).
 * Uses the extended 120 s timeout to accommodate large models.
 */
export async function ollamaLoadModel(baseUrl, modelName, keepAlive = "24h") {
  const payload = { model: modelName, prompt: "", keep_alive: keepAlive, stream: false };
  const result = await ollamaFetch(baseUrl, "/api/generate", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(payload),
    timeoutMs: LOAD_TIMEOUT_MS
  });
  if (!result.ok) {
    return { ok: false, error: result.error ?? `HTTP ${result.status}` };
  }
  // load_duration is reported in nanoseconds; expose milliseconds.
  const rawDuration = result.json?.load_duration;
  const loadDurationMs = typeof rawDuration === "number" ? rawDuration / 1_000_000 : null;
  return { ok: true, loadDurationMs };
}
|
|
145
|
+
|
|
146
|
+
/** Unload a model from memory immediately (keep_alive = 0). */
export async function ollamaUnloadModel(baseUrl, modelName) {
  const payload = { model: modelName, prompt: "", keep_alive: 0, stream: false };
  const result = await ollamaFetch(baseUrl, "/api/generate", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(payload),
    timeoutMs: LOAD_TIMEOUT_MS
  });
  return result.ok
    ? { ok: true, unloaded: true }
    : { ok: false, error: result.error ?? `HTTP ${result.status}` };
}
|
|
159
|
+
|
|
160
|
+
/**
 * Pin a model in memory indefinitely by loading it with keep_alive = -1.
 * Thin delegation to ollamaLoadModel, so the result shares its shape
 * ({ ok, loadDurationMs } on success, { ok: false, error } on failure).
 */
export async function ollamaPinModel(baseUrl, modelName) {
  return ollamaLoadModel(baseUrl, modelName, -1);
}
|
|
164
|
+
|
|
165
|
+
/**
 * Set a custom keep-alive duration for a loaded model (e.g. "10m", "1h").
 * Thin delegation to ollamaLoadModel; note that 0 unloads and -1 pins,
 * matching that function's keep_alive semantics.
 */
export async function ollamaSetKeepAlive(baseUrl, modelName, duration) {
  return ollamaLoadModel(baseUrl, modelName, duration);
}
|
|
169
|
+
|
|
170
|
+
/** Pull (download) a model from the Ollama registry. 600 s timeout. */
export async function ollamaPullModel(baseUrl, modelName) {
  const result = await ollamaFetch(baseUrl, "/api/pull", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ name: modelName, stream: false }),
    timeoutMs: PULL_TIMEOUT_MS
  });
  return result.ok
    ? { ok: true }
    : { ok: false, error: result.error ?? `HTTP ${result.status}` };
}
|
|
183
|
+
|
|
184
|
+
/** Delete a locally stored model from disk. */
export async function ollamaDeleteModel(baseUrl, modelName) {
  const result = await ollamaFetch(baseUrl, "/api/delete", {
    method: "DELETE",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ name: modelName })
  });
  return result.ok
    ? { ok: true }
    : { ok: false, error: result.error ?? `HTTP ${result.status}` };
}
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
/**
 * VRAM/memory estimation utilities for Ollama models.
 * Pure functions, no side effects, no external dependencies.
 */

// Effective bits-per-weight for each known quantization level.
const QUANT_BITS = {
  q2_k: 2.5,
  q3_k_s: 3.0,
  q3_k_m: 3.5,
  q3_k_l: 3.5,
  q4_0: 4.0,
  q4_k_s: 4.5,
  q4_k_m: 4.5,
  q5_0: 5.0,
  q5_k_s: 5.5,
  q5_k_m: 5.5,
  q6_k: 6.5,
  q8_0: 8.0,
  f16: 16.0,
  f32: 32.0
};

const DEFAULT_BITS = 4.5; // assume Q4_K_M when the quant level is unrecognized
const OVERHEAD_BYTES = 512 * 1024 * 1024; // fixed 512 MB runtime overhead
const KV_BYTES_PER_PARAM_PER_TOKEN = 0.00000025; // empirical KV-cache ratio

/** Resolve bits-per-weight for a quant level string, defaulting to Q4_K_M. */
function bitsForQuant(quantLevel) {
  return QUANT_BITS[String(quantLevel || '').toLowerCase()] ?? DEFAULT_BITS;
}

/**
 * Parse a parameter size string like "4.3B", "70B" into numeric count.
 * @param {string} parameterSize
 * @returns {number|null} Parameter count as integer, or null if unparseable
 */
export function parseParameterSize(parameterSize) {
  const normalized = String(parameterSize || '').trim();
  const match = normalized.match(/^([\d.]+)([BKMGT]?)$/i);
  if (!match) return null;

  const [, digits, rawSuffix] = match;
  const value = parseFloat(digits);
  if (!Number.isFinite(value) || value <= 0) return null;

  let multiplier;
  switch (rawSuffix.toUpperCase()) {
    case 'K': multiplier = 1e3; break;
    case 'M': multiplier = 1e6; break;
    case 'B': // fall through — "B" (billions) and "G" (giga) are synonyms here
    case 'G': multiplier = 1e9; break;
    case 'T': multiplier = 1e12; break;
    default:  multiplier = 1;
  }
  return Math.round(value * multiplier);
}

/**
 * Estimate VRAM required for a model at a given quantization and context length.
 * @param {string} parameterSize - e.g. "7B", "70B"
 * @param {string} quantLevel - e.g. "Q4_K_M", "F16"
 * @param {number} contextLength - number of tokens in context window
 * @returns {{ baseModelBytes: number, kvCacheBytes: number, totalBytes: number }|null}
 */
export function estimateModelVram(parameterSize, quantLevel, contextLength) {
  const params = parseParameterSize(parameterSize);
  if (params === null) return null;

  const baseModelBytes = params * bitsForQuant(quantLevel) / 8;
  // Empirical KV cache estimate: bytes per token scales with model size.
  const kvBytesPerToken = params * KV_BYTES_PER_PARAM_PER_TOKEN;
  const kvCacheBytes = (Number(contextLength) || 0) * kvBytesPerToken;

  return {
    baseModelBytes,
    kvCacheBytes,
    totalBytes: baseModelBytes + kvCacheBytes + OVERHEAD_BYTES
  };
}

/**
 * Calculate max practical context given available memory.
 * @param {string} parameterSize - e.g. "7B"
 * @param {string} quantLevel - e.g. "Q4_K_M"
 * @param {number} availableMemoryBytes
 * @returns {{ maxContext: number, warningThreshold: number }}
 */
export function estimateMaxContext(parameterSize, quantLevel, availableMemoryBytes) {
  const params = parseParameterSize(parameterSize);
  if (params === null) return { maxContext: 0, warningThreshold: 0 };

  const baseModelBytes = params * bitsForQuant(quantLevel) / 8;
  const remaining = availableMemoryBytes - baseModelBytes - OVERHEAD_BYTES;
  if (remaining <= 0) return { maxContext: 0, warningThreshold: 0 };

  const kvBytesPerToken = params * KV_BYTES_PER_PARAM_PER_TOKEN;
  // Round down to a 1024-token multiple; warn once past 85% of the maximum.
  const maxContext = Math.floor(remaining / kvBytesPerToken / 1024) * 1024;
  return { maxContext, warningThreshold: Math.floor(maxContext * 0.85) };
}

/**
 * Format a byte count as a human-readable string.
 * @param {number} bytes
 * @returns {string} e.g. "4.5 GB", "512.0 MB"
 */
export function formatBytes(bytes) {
  const n = Number(bytes) || 0;
  const scales = [
    [1024 ** 4, 'TB'],
    [1024 ** 3, 'GB'],
    [1024 ** 2, 'MB'],
    [1024, 'KB']
  ];
  for (const [scale, unit] of scales) {
    if (n >= scale) return `${(n / scale).toFixed(1)} ${unit}`;
  }
  return `${n} B`;
}
|
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Ollama detection, installation, and server lifecycle management.
|
|
3
|
+
* All public functions return structured results — never throw.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import { spawnSync, spawn } from "node:child_process";
|
|
7
|
+
import { existsSync } from "node:fs";
|
|
8
|
+
|
|
9
|
+
// Default Ollama server endpoint probed by isOllamaRunning().
const OLLAMA_PORT_URL = "http://localhost:11434/";
// How long startOllamaServer() waits before probing the freshly spawned server.
const STARTUP_WAIT_MS = 2_000;
// Per-request timeout for the isOllamaRunning() health probe.
const HEALTH_TIMEOUT_MS = 3_000;

// Well-known install locations checked when `which`/`where` finds nothing,
// keyed by process.platform.
/** @type {Record<string, string[]>} */
const FALLBACK_PATHS = {
  darwin: ["/usr/local/bin/ollama", "/opt/homebrew/bin/ollama"],
  linux: ["/usr/local/bin/ollama", "/usr/bin/ollama"],
  win32: ["C:\\Program Files\\Ollama\\ollama.exe"]
};
|
|
19
|
+
|
|
20
|
+
/**
 * Detect if Ollama is installed on the system.
 * Tries `which`/`where` first, then the platform's well-known install paths.
 * @returns {{ installed: boolean, path: string, version: string }}
 */
export function detectOllamaInstallation() {
  try {
    const platform = process.platform;
    const whichCmd = platform === "win32" ? "where" : "which";
    const which = spawnSync(whichCmd, ["ollama"], { encoding: "utf8" });
    // `where` (and `which -a`) can print several matches — keep only the first.
    let ollamaPath = (which.stdout ?? "").split(/\r?\n/)[0].trim();

    if (!ollamaPath) {
      const candidates = FALLBACK_PATHS[platform] ?? [];
      ollamaPath = candidates.find((p) => existsSync(p)) ?? "";
    }

    if (!ollamaPath) {
      return { installed: false, path: "", version: "" };
    }

    // Query the discovered binary directly: when it was found via
    // FALLBACK_PATHS it may not be on PATH, so running plain "ollama" here
    // would fail and always report an empty version.
    const ver = spawnSync(ollamaPath, ["--version"], { encoding: "utf8" });
    const version = ver.stdout?.trim() ?? "";
    return { installed: true, path: ollamaPath, version };
  } catch {
    return { installed: false, path: "", version: "" };
  }
}
|
|
47
|
+
|
|
48
|
+
/**
 * Install Ollama silently per platform.
 * @param {{ onProgress?: (event: { phase: string, message: string }) => void }} opts
 * @returns {Promise<{ ok: boolean, version?: string, error?: string, alreadyInstalled?: boolean }>}
 */
export async function installOllama({ onProgress } = {}) {
  const progress = (phase, message) => onProgress?.({ phase, message });
  const fail = (msg) => {
    progress("error", msg);
    return { ok: false, error: msg };
  };

  try {
    progress("detecting", "Checking for existing Ollama installation...");
    const existing = detectOllamaInstallation();
    if (existing.installed) {
      progress("done", "Ollama is already installed.");
      return { ok: true, alreadyInstalled: true, version: existing.version };
    }

    switch (process.platform) {
      case "darwin":
        return await installViaBrew({ progress });
      case "linux":
        return await installViaScript({ progress });
      case "win32":
        return fail("Automatic install not supported on Windows. Please install from https://ollama.com/download");
      default:
        return fail(`Unsupported platform: ${process.platform}`);
    }
  } catch (err) {
    return fail(err instanceof Error ? err.message : String(err));
  }
}
|
|
89
|
+
|
|
90
|
+
/**
 * Start Ollama server as a detached background process, then poll once after
 * a short grace period to confirm it is responding.
 * @returns {Promise<{ ok: boolean, pid?: number, error?: string }>}
 */
export async function startOllamaServer() {
  try {
    const child = spawn("ollama", ["serve"], {
      detached: true,
      stdio: "ignore"
    });
    // spawn() failures (e.g. ENOENT when the binary is missing) are emitted
    // asynchronously on the "error" event; without a listener they become an
    // uncaught exception that the surrounding try/catch cannot intercept.
    let spawnError = null;
    child.on("error", (err) => {
      spawnError = err instanceof Error ? err.message : String(err);
    });
    child.unref();
    const pid = child.pid;

    // Give the server a moment to bind its port before probing.
    await new Promise((resolve) => setTimeout(resolve, STARTUP_WAIT_MS));

    if (spawnError) {
      return { ok: false, error: spawnError };
    }

    const running = await isOllamaRunning();
    if (!running) {
      return { ok: false, error: "Server did not respond after startup" };
    }

    return { ok: true, pid };
  } catch (err) {
    return { ok: false, error: err instanceof Error ? err.message : String(err) };
  }
}
|
|
115
|
+
|
|
116
|
+
/**
 * Stop the Ollama server process (best effort, platform-specific kill).
 * @returns {{ ok: boolean }}
 */
export function stopOllamaServer() {
  try {
    const onWindows = process.platform === "win32";
    if (onWindows) {
      spawnSync("taskkill", ["/IM", "ollama.exe", "/F"]);
    } else {
      spawnSync("pkill", ["-x", "ollama"], { timeout: 5000 });
    }
    return { ok: true };
  } catch {
    return { ok: false };
  }
}
|
|
132
|
+
|
|
133
|
+
/**
 * Check if Ollama server is responding.
 * Any HTTP response at all (even an error status) counts as "running" — it
 * proves a process is listening on the port; only a network failure or the
 * 3 s timeout yields false.
 * @returns {Promise<boolean>}
 */
export async function isOllamaRunning() {
  try {
    const res = await fetch(OLLAMA_PORT_URL, {
      signal: AbortSignal.timeout(HEALTH_TIMEOUT_MS)
    });
    return res.ok || res.status > 0;
  } catch {
    return false;
  }
}
|
|
147
|
+
|
|
148
|
+
// -- Private helpers --
|
|
149
|
+
|
|
150
|
+
// Installs Ollama through Homebrew on macOS, streaming brew output through
// the progress callback and verifying the binary afterwards.
async function installViaBrew({ progress }) {
  const brew = spawnSync("which", ["brew"], { encoding: "utf8" });
  if (!brew.stdout?.trim()) {
    const msg =
      "Homebrew not found. Please install Ollama manually from https://ollama.com/download";
    progress("error", msg);
    return { ok: false, error: msg };
  }

  progress("downloading", "Installing Ollama via Homebrew...");
  return new Promise((resolve) => {
    const proc = spawn("brew", ["install", "ollama"], { stdio: "pipe" });

    const forward = (chunk) => progress("installing", chunk.toString().trim());
    proc.stdout.on("data", forward);
    proc.stderr.on("data", forward);

    proc.on("close", (code) => {
      if (code !== 0) {
        const error = `brew install ollama exited with code ${code}`;
        progress("error", error);
        resolve({ ok: false, error });
        return;
      }
      progress("verifying", "Verifying installation...");
      const result = detectOllamaInstallation();
      if (!result.installed) {
        const error = "Installation succeeded but ollama binary not found";
        progress("error", error);
        resolve({ ok: false, error });
        return;
      }
      progress("done", "Ollama installed successfully.");
      resolve({ ok: true, version: result.version });
    });

    proc.on("error", (err) => {
      progress("error", err.message);
      resolve({ ok: false, error: err.message });
    });
  });
}
|
|
193
|
+
|
|
194
|
+
// Installs Ollama on Linux via the official curl | sh script, streaming
// script output through the progress callback and verifying afterwards.
async function installViaScript({ progress }) {
  progress("downloading", "Downloading Ollama install script...");
  return new Promise((resolve) => {
    const proc = spawn("sh", ["-c", "curl -fsSL https://ollama.com/install.sh | sh"], {
      stdio: "pipe"
    });

    const forward = (chunk) => progress("installing", chunk.toString().trim());
    proc.stdout.on("data", forward);
    proc.stderr.on("data", forward);

    proc.on("close", (code) => {
      if (code !== 0) {
        const error = `Install script exited with code ${code}`;
        progress("error", error);
        resolve({ ok: false, error });
        return;
      }
      progress("verifying", "Verifying installation...");
      const result = detectOllamaInstallation();
      if (!result.installed) {
        const error = "Installation succeeded but ollama binary not found";
        progress("error", error);
        resolve({ ok: false, error });
        return;
      }
      progress("done", "Ollama installed successfully.");
      resolve({ ok: true, version: result.version });
    });

    proc.on("error", (err) => {
      progress("error", err.message);
      resolve({ ok: false, error: err.message });
    });
  });
}
|
|
@@ -84,6 +84,9 @@ function resolveModelsUrl(baseUrl, format) {
|
|
|
84
84
|
return clean.replace(/\/chat\/completions$/, "/models");
|
|
85
85
|
}
|
|
86
86
|
if (clean.endsWith("/v1") || isVersionedApiRoot) return `${clean}/models`;
|
|
87
|
+
// Handle base URLs with a versioned segment followed by a sub-path,
|
|
88
|
+
// e.g. https://generativelanguage.googleapis.com/v1beta/openai
|
|
89
|
+
if (/\/v\d+[a-z]*\/(?!chat\b)\w+$/i.test(clean)) return `${clean}/models`;
|
|
87
90
|
return `${clean}/v1/models`;
|
|
88
91
|
}
|
|
89
92
|
|
|
@@ -191,8 +194,12 @@ function extractModelIds(result) {
|
|
|
191
194
|
const ids = [];
|
|
192
195
|
for (const item of body.data) {
|
|
193
196
|
if (!item || typeof item !== "object") continue;
|
|
194
|
-
|
|
195
|
-
if (id)
|
|
197
|
+
let id = typeof item.id === "string" ? item.id : (typeof item.name === "string" ? item.name : null);
|
|
198
|
+
if (id) {
|
|
199
|
+
// Strip provider-specific prefixes (e.g., Gemini "models/gemini-*")
|
|
200
|
+
if (id.startsWith("models/")) id = id.slice(7);
|
|
201
|
+
ids.push(id);
|
|
202
|
+
}
|
|
196
203
|
}
|
|
197
204
|
return [...new Set(ids)];
|
|
198
205
|
}
|
|
@@ -553,12 +560,14 @@ async function probeOpenAI(baseUrl, apiKey, timeoutMs, extraHeaders = {}) {
|
|
|
553
560
|
}, timeoutMs);
|
|
554
561
|
details.checks.push({ step: "chat", auth: variant.type, status: chatResult.status, error: chatResult.error || null });
|
|
555
562
|
|
|
556
|
-
|
|
563
|
+
const modelsLooksValid = looksOpenAI(modelsResult) && authLooksValid(modelsResult);
|
|
564
|
+
|
|
565
|
+
if (looksOpenAI(chatResult) || modelsLooksValid) {
|
|
557
566
|
details.supported = true;
|
|
558
|
-
if (authLooksValid(chatResult)) {
|
|
567
|
+
if (looksOpenAI(chatResult) ? authLooksValid(chatResult) : modelsLooksValid) {
|
|
559
568
|
details.working = true;
|
|
560
569
|
details.auth = { type: variant.type === "x-api-key" ? "x-api-key" : "bearer" };
|
|
561
|
-
if (
|
|
570
|
+
if (modelsLooksValid) {
|
|
562
571
|
details.models = extractModelIds(modelsResult);
|
|
563
572
|
}
|
|
564
573
|
return details;
|
|
@@ -799,6 +808,61 @@ function pickPreferredFormatForModel(modelId, formats, { providerPreferredFormat
|
|
|
799
808
|
return supported[0];
|
|
800
809
|
}
|
|
801
810
|
|
|
811
|
+
/**
 * Probes a list of models against an OpenAI-compatible endpoint to detect
 * free-tier availability. Returns a map of modelId -> { freeTier }, where
 * freeTier is true/false from the heuristics below, or null when the probe
 * itself failed (network error, etc.).
 *
 * Heuristic: a model is considered not-free-tier when the response body
 * mentions a "free_tier" quota metric together with a zero limit. Otherwise
 * any plausible response — success, 400/404 (model exists but request
 * rejected), or a plain 429 rate limit — is treated as free-tier access.
 * Probes run sequentially, one model at a time (presumably to avoid
 * tripping provider rate limits — TODO confirm).
 */
export async function probeFreeTierModels(options) {
  const baseUrl = String(options?.baseUrl || "").trim().replace(/\/+$/, "");
  const apiKey = String(options?.apiKey || "").trim();
  const modelIds = (options?.modelIds || []).map((id) => String(id || "").trim()).filter(Boolean);
  const timeoutMs = Number.isFinite(options?.timeoutMs) ? options.timeoutMs : 6000;

  if (!baseUrl || !apiKey || modelIds.length === 0) return {};

  const chatUrl = `${baseUrl}/chat/completions`;
  const headers = {
    "Content-Type": "application/json",
    Authorization: `Bearer ${apiKey}`
  };
  const result = {};

  for (const modelId of modelIds) {
    try {
      // Minimal 1-token request: we only care about the error/quota text.
      const response = await safeFetchJson(chatUrl, {
        method: "POST",
        headers,
        body: JSON.stringify({
          model: modelId,
          messages: [{ role: "user", content: "hi" }],
          max_tokens: 1,
          stream: false
        })
      }, timeoutMs);

      const text = response.text || "";
      // Matches "limit: 0" (prose form) or "limit": 0 (JSON form).
      const isZeroQuota = /limit:\s*0[,\s]/i.test(text) || text.includes('"limit": 0') || text.includes('"limit":0');
      const isFreeTierQuota = text.includes("free_tier");

      if (isZeroQuota && isFreeTierQuota) {
        // Explicit zero free-tier quota → the model is paid-only.
        result[modelId] = { freeTier: false };
      } else if (response.ok || response.status === 400 || response.status === 404) {
        result[modelId] = { freeTier: true };
      } else if (response.status === 429 && !isZeroQuota) {
        // Rate-limited but not a zero quota → access exists, just throttled.
        result[modelId] = { freeTier: true };
      } else {
        result[modelId] = { freeTier: false };
      }
    } catch {
      // Probe failure is inconclusive — surface null rather than guessing.
      result[modelId] = { freeTier: null };
    }
  }

  return result;
}
|
|
865
|
+
|
|
802
866
|
export async function probeProviderEndpointMatrix(options) {
|
|
803
867
|
const emitProgress = makeProgressEmitter(options?.onProgress);
|
|
804
868
|
const apiKey = String(options?.apiKey || "").trim();
|