npm - openmates - Versions diffs - 0.12.0-alpha.10 → 0.12.0-alpha.12 - Mend

openmates 0.12.0-alpha.10 → 0.12.0-alpha.12

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.

Files changed (7)

package/dist/{chunk-D7RIGVLZ.js → chunk-R5Z4FBJJ.js} +1113 -70
package/dist/cli.js +1 -1
package/dist/index.d.ts +23 -0
package/dist/index.js +1 -1
package/fixtures/brandenburger-tor.png +0 -0
package/fixtures/brandenburger-tor.svg +25 -0
package/package.json +5 -3

package/dist/{chunk-D7RIGVLZ.js → chunk-R5Z4FBJJ.js} RENAMED

@@ -986,14 +986,14 @@ var OpenMatesWsClient = class {
     });
   }
   async open(timeoutMs = 1e4) {
-    await new Promise((resolve5, reject) => {
+    await new Promise((resolve6, reject) => {
       const timeout = setTimeout(
         () => reject(new Error("WebSocket open timeout")),
         timeoutMs
       );
       this.socket.once("open", () => {
         clearTimeout(timeout);
-        resolve5();
+        resolve6();
       });
       this.socket.once("error", (error) => {
         clearTimeout(timeout);
@@ -1022,15 +1022,15 @@ var OpenMatesWsClient = class {
     this.socket.send(JSON.stringify({ type, payload }));
   }
   sendAsync(type, payload) {
-    return new Promise((resolve5, reject) => {
+    return new Promise((resolve6, reject) => {
       this.socket.send(JSON.stringify({ type, payload }), (error) => {
         if (error) reject(error);
-        else resolve5();
+        else resolve6();
       });
     });
   }
   waitForMessage(expectedType, predicate, timeoutMs = 2e4) {
-    return new Promise((resolve5, reject) => {
+    return new Promise((resolve6, reject) => {
       const onMessage = (rawData) => {
         try {
           const parsed = JSON.parse(rawData.toString());
@@ -1041,7 +1041,7 @@ var OpenMatesWsClient = class {
             return;
           }
           cleanup();
-          resolve5(parsed);
+          resolve6(parsed);
         } catch {
         }
       };
@@ -1074,14 +1074,14 @@ var OpenMatesWsClient = class {
    * Used by ensureSynced to consume the full phased-sync event stream.
    */
   collectMessages(terminatorType, timeoutMs = 9e4) {
-    return new Promise((resolve5, reject) => {
+    return new Promise((resolve6, reject) => {
       const collected = [];
       const onMessage = (rawData) => {
         try {
           const parsed = JSON.parse(rawData.toString());
           if (parsed.type === terminatorType) {
             cleanup();
-            resolve5(collected);
+            resolve6(collected);
             return;
           }
           collected.push(parsed);
@@ -1094,7 +1094,7 @@ var OpenMatesWsClient = class {
       };
       const onClose = () => {
         cleanup();
-        resolve5(collected);
+        resolve6(collected);
       };
       const timeout = setTimeout(() => {
         cleanup();
@@ -1132,7 +1132,7 @@ var OpenMatesWsClient = class {
     const timeoutMs = options?.timeoutMs ?? 9e4;
     const onStream = options?.onStream;
     const asyncEmbedWaitMs = options?.asyncEmbedWaitMs ?? 12e4;
-    return new Promise((resolve5, reject) => {
+    return new Promise((resolve6, reject) => {
       let latestContent = "";
       let messageId = null;
       let taskId = null;
@@ -1189,7 +1189,7 @@ var OpenMatesWsClient = class {
         if (waitingForUserPayload) {
           if (pendingSubChatHandlers.size > 0) return;
           cleanup();
-          resolve5({
+          resolve6({
             status: "waiting_for_user",
             messageId,
             taskId,
@@ -1209,7 +1209,7 @@ var OpenMatesWsClient = class {
         if (processingEmbedIds.size > 0 && !asyncEmbedTimer) {
           asyncEmbedTimer = setTimeout(() => {
             cleanup();
-            resolve5({
+            resolve6({
               status: "completed",
               messageId,
               taskId,
@@ -1226,7 +1226,7 @@ var OpenMatesWsClient = class {
         }
         if (processingEmbedIds.size > 0) return;
         cleanup();
-        resolve5({
+        resolve6({
           status: "completed",
           messageId,
           taskId,
@@ -1440,7 +1440,7 @@ var OpenMatesWsClient = class {
       const onClose = () => {
         if (aiResponseDone) {
           cleanup();
-          resolve5({
+          resolve6({
             status: "completed",
             messageId,
             taskId,
@@ -3677,6 +3677,23 @@ var OpenMatesClient = class _OpenMatesClient {
     if (connectedAccountTokenRefs.length > 0) {
       messagePayload.connected_account_token_refs = connectedAccountTokenRefs;
     }
+    if (params.benchmarkMetadata) {
+      messagePayload.benchmark_metadata = params.benchmarkMetadata;
+    }
+    if (params.incognito) {
+      const providedHistory = (params.messageHistory ?? []).map((historyMessage) => ({
+        ...historyMessage,
+        chat_id: historyMessage.chat_id ?? chatId
+      }));
+      messagePayload.message_history = [...providedHistory, {
+        message_id: messageId,
+        chat_id: chatId,
+        role: "user",
+        sender_name: "User",
+        content: params.message,
+        created_at: createdAt
+      }];
+    }
     let chatKeyBytes = null;
     let encryptedChatKey = null;
     let baselineMessagesV = 0;
@@ -3735,6 +3752,7 @@ var OpenMatesClient = class _OpenMatesClient {
     if (encryptedEmbeds.length > 0) {
       messagePayload.encrypted_embeds = encryptedEmbeds;
     }
+    const precollectedResponse = params.precollectResponse ? ws.collectAiResponse(messageId, chatId, { onStream: params.onStream }) : null;
     const confirmed = ws.waitForMessage(
       "chat_message_confirmed",
       (payload) => {
@@ -3949,7 +3967,7 @@ var OpenMatesClient = class _OpenMatesClient {
     };
     if (params.incognito) {
       try {
-        const resp = await ws.collectAiResponse(messageId, chatId, streamOpts);
+        const resp = await (precollectedResponse ?? ws.collectAiResponse(messageId, chatId, streamOpts));
         assistantMessageId = resp.messageId;
         assistant = resp.content;
         category = resp.category;
@@ -4301,7 +4319,7 @@ var OpenMatesClient = class _OpenMatesClient {
       if (response.data.status === "failed") {
         throw new Error(response.data.error ?? "Task failed");
       }
-      await new Promise((resolve5) => setTimeout(resolve5, SKILL_TASK_POLL_INTERVAL_MS));
+      await new Promise((resolve6) => setTimeout(resolve6, SKILL_TASK_POLL_INTERVAL_MS));
     }
     throw new Error(`Task ${taskId} did not complete within ${SKILL_TASK_POLL_TIMEOUT_MS / 1e3}s`);
   }
@@ -4522,7 +4540,7 @@ var OpenMatesClient = class _OpenMatesClient {
         `Rate limited by settings API; retrying in ${Math.ceil(SETTINGS_GET_RATE_LIMIT_RETRY_MS / 1e3)}s...
 `
       );
-      await new Promise((resolve5) => setTimeout(resolve5, SETTINGS_GET_RATE_LIMIT_RETRY_MS));
+      await new Promise((resolve6) => setTimeout(resolve6, SETTINGS_GET_RATE_LIMIT_RETRY_MS));
       response = await this.http.get(normalizedPath, this.getCliRequestHeaders());
     }
     if (!response.ok) {
@@ -6023,7 +6041,7 @@ function filenameFromContentDisposition(header2) {
   return plain?.trim() ?? null;
 }
 function sleep(ms) {
-  return new Promise((resolve5) => setTimeout(resolve5, ms));
+  return new Promise((resolve6) => setTimeout(resolve6, ms));
 }
 function printLogo() {
   const W = "\x1B[1;37m";
@@ -6039,9 +6057,9 @@ function printLogo() {
 // src/cli.ts
 import { createInterface as createInterface3 } from "readline/promises";
-import { realpathSync, writeFileSync as writeFileSync4 } from "fs";
-import { fileURLToPath } from "url";
-import { basename as basename3, dirname } from "path";
+import { realpathSync, writeFileSync as writeFileSync5 } from "fs";
+import { fileURLToPath as fileURLToPath2 } from "url";
+import { basename as basename3, dirname as dirname2 } from "path";
 import WebSocket2 from "ws";
 // ../secret-scanner/src/registry.ts
@@ -7741,8 +7759,8 @@ async function renderRemotionShareLink(embedId, client, ln) {
   }
 }
 function generateQr(value) {
-  return new Promise((resolve5) => {
-    qrcode2.generate(value, { small: true }, (qr) => resolve5(qr));
+  return new Promise((resolve6) => {
+    qrcode2.generate(value, { small: true }, (qr) => resolve6(qr));
   });
 }
 function remotionMeta(c) {
@@ -8597,9 +8615,9 @@ function exec(cmd, cwd) {
   return execSync(cmd, { cwd, encoding: "utf-8", stdio: ["pipe", "pipe", "pipe"] }).trim();
 }
 function runInteractive(cmd, args, cwd) {
-  return new Promise((resolve5, reject) => {
+  return new Promise((resolve6, reject) => {
     const child = nodeSpawn(cmd, args, { cwd, stdio: "inherit", shell: false });
-    child.on("close", (code) => resolve5(code ?? 1));
+    child.on("close", (code) => resolve6(code ?? 1));
     child.on("error", reject);
   });
 }
@@ -8860,10 +8878,10 @@ function warnIfMissingLlmCredentials(installPath) {
 }
 async function confirmDestructive(phrase) {
   const rl = createInterface2({ input: process.stdin, output: process.stderr });
-  return new Promise((resolve5) => {
+  return new Promise((resolve6) => {
     rl.question(`Type "${phrase}" to confirm: `, (answer) => {
       rl.close();
-      resolve5(answer.trim() === phrase);
+      resolve6(answer.trim() === phrase);
     });
   });
 }
@@ -27900,6 +27918,12 @@ Only output the final Markdown table. Do NOT include explanations, notes, or any
     account_created: {
       text: "Account created"
     },
+    account_created_second_login_title: {
+      text: "Add a second login method"
+    },
+    account_created_second_login_info: {
+      text: "If you signed up with a passkey, add password plus 2FA as a backup. If you signed up with password plus 2FA, add a passkey for faster secure login."
+    },
     password_security_reminder: {
       subject: {
         text: "Action needed to secure your OpenMates account"
@@ -28129,10 +28153,7 @@ Only output the final Markdown table. Do NOT include explanations, notes, or any
       text: "Welcome to OpenMates!"
     },
     complete_signup_info: {
-      text: "Once you completed the signup process by purchasing usage credits or redeeming a gift card, you can start using OpenMates!"
-    },
-    auto_delete_warning: {
-      text: "Please note: Accounts that haven't completed the signup process will be automatically deleted after 7 days."
+      text: "Your account is ready. Here are a few helpful next steps to protect your access and keep a copy of your data."
     },
     want_to_delete_account: {
       text: "Want to delete your account?"
@@ -29262,6 +29283,15 @@ Only output the final Markdown table. Do NOT include explanations, notes, or any
     copy_failed: {
       text: "Failed to copy to clipboard"
     },
+    code_file_downloaded: {
+      text: "Code file downloaded successfully"
+    },
+    code_file_download_failed: {
+      text: "Failed to download code file"
+    },
+    action_failed: {
+      text: "Failed to perform action"
+    },
     download_itinerary: {
       text: "Download itinerary"
     },
@@ -30004,6 +30034,15 @@ Only output the final Markdown table. Do NOT include explanations, notes, or any
     anonymous_terms_reminder: {
       text: "By sending a message you accept the terms & privacy policy of OpenMates."
     },
+    anonymous_terms_reminder_prefix: {
+      text: "By sending a message you accept the "
+    },
+    anonymous_terms_reminder_connector: {
+      text: " & "
+    },
+    anonymous_terms_reminder_suffix: {
+      text: " of OpenMates."
+    },
     send: {
       text: "Send"
     },
@@ -33661,10 +33700,10 @@ As of mid-2026, the severe supply shocks from the 2024\u20132025 avian flu have
           text: "Account: Email address, username/display name, profile image, locale, and security settings (e.g., 2FA enabled). Email and username are encrypted with your key before storage. We also keep a separate server-side Vault-encrypted copy of your verified email address for mandatory account lifecycle notices, such as account verification, security alerts, and deletion reminders. Passwords are stored as salted hashes."
         },
         usage: {
-          text: "Usage: Server logs, event timestamps, feature usage, error logs, and device recognition identifiers (hashed). IP addresses may be temporarily processed for security and rate limiting."
+          text: "Usage: Server logs, event timestamps, feature usage, error logs, and device recognition identifiers (hashed). IP addresses may be temporarily processed for security, rate limiting, and anonymous free-usage budget checks. Anonymous free usage uses a first-party random local identifier and sends only a server-side HMAC-hashed form for per-identity abuse limits."
         },
         content: {
-          text: "Content: Chat messages, prompts, attachments, and uploaded images/videos necessary to deliver the service (subject to moderation where applicable)."
+          text: "Content: Chat messages, prompts, attachments, and uploaded images/videos necessary to deliver the service (subject to moderation where applicable). Anonymous free-usage chats stay local-only and encrypted with per-chat keys before signup; they are uploaded only if you sign up and promote them into account sync."
         },
         payments: {
           text: "Payments: Payment method tokens, transaction IDs, billing address and VAT information as required for invoicing (processed primarily by Stripe). We do not store full card numbers."
@@ -34202,7 +34241,7 @@ As of mid-2026, the severe supply shocks from the 2024\u20132025 avian flu have
           text: "Credits and Payments"
         },
         description: {
-          text: "OpenMates uses a credit-based payment system. You purchase credit packs which are consumed when using AI services. Credits do not expire and remain in your account until used. Payment processing is handled by Stripe - see our Privacy Policy for details about payment data handling."
+          text: "OpenMates uses a credit-based payment system. You purchase credit packs which are consumed when using AI services. Credits do not expire and remain in your account until used. Official-cloud anonymous free usage, when available, is a limited fair-use trial with shared daily/weekly caps and per-identity abuse limits; it is not an account credit balance and may be unavailable when the budget is exhausted. Payment processing is handled by Stripe - see our Privacy Policy for details about payment data handling."
         },
         refund: {
           text: "Refund Policy: You may request a refund for unused credits within 14 days after purchase. Your right of withdrawal expires once credits are used. See our signup process for the full refund consent details."
@@ -38936,6 +38975,9 @@ As of mid-2026, the severe supply shocks from the 2024\u20132025 avian flu have
         weekly_remaining: {
           text: "Weekly remaining"
         },
+        monthly_remaining: {
+          text: "Monthly remaining"
+        },
         reset_at: {
           text: "Daily reset"
         },
@@ -38968,6 +39010,9 @@ As of mid-2026, the severe supply shocks from the 2024\u20132025 avian flu have
         },
         validation_percent: {
           text: "Percent values must be between 0 and 100."
+        },
+        validation_per_identity_cap: {
+          text: "Per-identity daily cap must be at least 1 credit when the monthly budget is above 0."
         }
       },
       tests: {
@@ -41487,6 +41532,995 @@ function buildAssistantFeedbackDecision(rating) {
   };
 }
+// src/benchmark.ts
+import { randomUUID as randomUUID3 } from "crypto";
+import { existsSync as existsSync6, mkdtempSync, readFileSync as readFileSync6, readdirSync, writeFileSync as writeFileSync4 } from "fs";
+import { tmpdir } from "os";
+import { dirname, join as join4, resolve as resolve5 } from "path";
+import { fileURLToPath } from "url";
+var DEFAULT_JUDGE_MODEL = "google/gemini-3-flash-preview"; // judge model used by handleBenchmark when --judge-model is not a string
+var DEFAULT_EXTENSIVE_SIZE = 10; // returned by parseExtensiveSize when --extensive-size is omitted
+var DEFAULT_PARALLEL = 4; // returned by parseParallel when --parallel is omitted; handleBenchmark passes it to runPool
+var FIXTURE_IMAGE_SVG = `<?xml version="1.0" encoding="UTF-8"?>
+<svg xmlns="http://www.w3.org/2000/svg" width="1200" height="800" viewBox="0 0 1200 800">
+  <rect width="1200" height="800" fill="#d8ecff"/>
+  <rect y="560" width="1200" height="240" fill="#d7c39a"/>
+  <text x="600" y="88" text-anchor="middle" font-family="Arial, sans-serif" font-size="44" font-weight="700" fill="#23344d">Brandenburger Tor, Berlin</text>
+  <g transform="translate(160 170)" fill="#c9aa6a" stroke="#5d4522" stroke-width="8">
+    <rect x="80" y="160" width="800" height="58"/>
+    <rect x="120" y="218" width="720" height="48"/>
+    <rect x="150" y="266" width="660" height="42"/>
+    <g fill="#d9bd7d">
+      <rect x="170" y="308" width="54" height="250"/>
+      <rect x="285" y="308" width="54" height="250"/>
+      <rect x="400" y="308" width="54" height="250"/>
+      <rect x="515" y="308" width="54" height="250"/>
+      <rect x="630" y="308" width="54" height="250"/>
+      <rect x="745" y="308" width="54" height="250"/>
+    </g>
+    <rect x="130" y="558" width="700" height="50"/>
+    <path d="M480 30 C530 72 620 88 682 48 L720 84 C652 142 530 124 456 78 Z" fill="#3e6f5f"/>
+    <circle cx="510" cy="92" r="22" fill="#3e6f5f"/>
+    <circle cx="625" cy="92" r="22" fill="#3e6f5f"/>
+    <path d="M565 38 l26 78 h-52 z" fill="#3e6f5f"/>
+  </g>
+  <text x="600" y="740" text-anchor="middle" font-family="Arial, sans-serif" font-size="32" fill="#23344d">Neoclassical gate with Quadriga on top</text>
+</svg>
+`; // inline SVG captioned "Brandenburger Tor, Berlin"; where it is consumed is not visible in this hunk — TODO confirm
+var QUICK_CASES = [ // built-in cases tagged suite "quick"; expandCases adds them when the "quick" suite is selected
+  {
+    id: "quick-exact-token", // ids are the de-duplication key used by dedupeCases
+    suite: "quick",
+    title: "Exact token smoke test",
+    prompt: "Reply with exactly this token and no extra text: BENCHMARK_SMOKE_OK",
+    complexity: "basic",
+    category: "smoke",
+    expectedIncludes: "BENCHMARK_SMOKE_OK", // presumably a substring the answer must contain; the checker is not visible here — TODO confirm
+    judge: true,
+    estimatedInputTokens: 12e3, // 12e3 is 12000; the estimate fields are presumably read by estimateCredits — verify
+    estimatedOutputTokens: 64
+  },
+  {
+    id: "quick-arithmetic",
+    suite: "quick",
+    title: "Arithmetic direct answer",
+    prompt: "Compute 19 * 23. Reply with only the integer result.",
+    complexity: "basic",
+    category: "math",
+    expectedIncludes: "437",
+    judge: true,
+    estimatedInputTokens: 12e3,
+    estimatedOutputTokens: 64
+  },
+  {
+    id: "quick-code",
+    suite: "quick",
+    title: "Small code generation",
+    prompt: "Write a TypeScript function isPalindrome(input: string): boolean that ignores spaces, punctuation, and case. Include only the function and one short usage example.",
+    complexity: "medium",
+    category: "coding", // the "coding" category is what selectExtensiveCases counts toward its coding quota
+    judge: true,
+    estimatedInputTokens: 12200,
+    estimatedOutputTokens: 650
+  },
+  {
+    id: "quick-image-brandenburger-tor",
+    suite: "quick",
+    title: "Default image understanding",
+    prompt: "Look at the attached image. What landmark is shown, when was it built, and who designed it? Answer in three concise bullet points.",
+    complexity: "medium",
+    category: "image",
+    image: "default", // presumably selects the image path resolved in handleBenchmark (--image or the default fixture) — confirm in runCaseJob
+    expectedIncludes: "Brandenburg",
+    judge: true,
+    estimatedInputTokens: 13500,
+    estimatedOutputTokens: 350
+  },
+  {
+    id: "quick-followup-continuity",
+    suite: "quick",
+    title: "Short multi-turn continuity",
+    prompt: "Create a three-step plan for evaluating whether a new AI model is ready for production use.",
+    complexity: "medium",
+    category: "multi_turn",
+    judge: true,
+    estimatedInputTokens: 14e3, // 14e3 is 14000
+    estimatedOutputTokens: 900,
+    followUps: [ // presumably sent as further user turns after the main prompt, in this order — TODO confirm
+      { prompt: "Now make step 2 more concrete with two measurable checks." },
+      { prompt: "Summarize the final plan in one sentence." }
+    ]
+  }
+];
+var EXTENSIVE_CASES = [ // order matters: selectExtensiveCases de-duplicates by id and keeps only the first N entries
+  ...QUICK_CASES, // quick cases lead the list, so the first-N slice in selectExtensiveCases starts with them
+  {
+    id: "extensive-coding-debug",
+    suite: "extensive",
+    title: "Debug a JavaScript bug",
+    prompt: "A JavaScript function returns NaN when summing prices from [{price: '12.50'}, {price: undefined}]. Explain the bug and write a corrected function.",
+    complexity: "medium",
+    category: "coding",
+    judge: true,
+    estimatedInputTokens: 12300,
+    estimatedOutputTokens: 850
+  },
+  {
+    id: "extensive-coding-api-design",
+    suite: "extensive",
+    title: "Design a small API contract",
+    prompt: "Design a minimal JSON API for creating and listing benchmark runs. Include request/response examples and one validation error.",
+    complexity: "advanced",
+    category: "coding",
+    judge: true,
+    estimatedInputTokens: 12300,
+    estimatedOutputTokens: 1e3 // 1e3 is 1000
+  },
+  {
+    id: "extensive-reasoning-tradeoffs",
+    suite: "extensive",
+    title: "Reason about benchmark tradeoffs",
+    prompt: "Compare deterministic assertions and LLM-as-judge evaluation for model benchmarks. Give two strengths and two risks for each.",
+    complexity: "medium",
+    category: "reasoning",
+    judge: true,
+    estimatedInputTokens: 12200,
+    estimatedOutputTokens: 800
+  },
+  {
+    id: "extensive-planning",
+    suite: "extensive",
+    title: "Operational rollout plan",
+    prompt: "Create a rollout checklist for switching a production chatbot from one model to another. Include monitoring, rollback, and user-visible risk checks.",
+    complexity: "advanced",
+    category: "synthesis",
+    judge: true,
+    estimatedInputTokens: 12300,
+    estimatedOutputTokens: 950
+  },
+  {
+    id: "extensive-long-context-followup",
+    suite: "extensive",
+    title: "Prebuilt 20-message long chat follow-up",
+    prompt: "Based on the earlier discussion, choose the best launch strategy and explain why in five bullets.",
+    complexity: "advanced",
+    category: "long_context",
+    longContext: true, // presumably makes the runner supply a prebuilt chat history (the title says 20 messages); handled outside this chunk — TODO confirm
+    judge: true,
+    estimatedInputTokens: 18500,
+    estimatedOutputTokens: 900
+  },
+  {
+    id: "extensive-policy-summary",
+    suite: "extensive",
+    title: "Policy summarization",
+    prompt: "Summarize why privacy-preserving benchmark logs should avoid raw user prompts. Include a concrete safer alternative.",
+    complexity: "medium",
+    category: "reasoning",
+    judge: true,
+    estimatedInputTokens: 12200,
+    estimatedOutputTokens: 650
+  },
+  {
+    id: "extensive-structured-output",
+    suite: "extensive",
+    title: "Structured JSON output",
+    prompt: "Return only JSON with keys risk, mitigation, and confidence for the risk: benchmark results are biased by prompt wording.",
+    complexity: "medium",
+    category: "synthesis",
+    judge: true,
+    estimatedInputTokens: 12200,
+    estimatedOutputTokens: 350
+  },
+  {
+    id: "extensive-creative-constraint",
+    suite: "extensive",
+    title: "Creative constrained response",
+    prompt: "Write a six-line product note announcing model comparisons. Each line must be under 70 characters and avoid hype words like revolutionary or magical.",
+    complexity: "medium",
+    category: "synthesis",
+    judge: true,
+    estimatedInputTokens: 12200,
+    estimatedOutputTokens: 500
+  },
+  {
+    id: "extensive-data-reasoning",
+    suite: "extensive",
+    title: "Interpret metrics",
+    prompt: "A benchmark has pass rates 8/10, 7/10, and 9/10 across three runs. Explain what you can and cannot conclude from this sample.",
+    complexity: "medium",
+    category: "reasoning",
+    judge: true,
+    estimatedInputTokens: 12200,
+    estimatedOutputTokens: 600
+  },
+  {
+    id: "extensive-security-review",
+    suite: "extensive",
+    title: "Security review",
+    prompt: "Review this benchmark design for security risks: it logs prompts, outputs, model ids, and usage costs to a shared file. List risks and safer defaults.",
+    complexity: "advanced",
+    category: "reasoning",
+    judge: true,
+    estimatedInputTokens: 12300,
+    estimatedOutputTokens: 850
+  },
+  {
+    id: "extensive-followup-requirements",
+    suite: "extensive",
+    title: "Three-turn requirements refinement",
+    prompt: "Draft acceptance criteria for a CLI benchmark comparison feature.",
+    complexity: "advanced",
+    category: "multi_turn",
+    judge: true,
+    estimatedInputTokens: 14500,
+    estimatedOutputTokens: 1100,
+    followUps: [
+      { prompt: "Add one criterion about cost estimation before live runs." },
+      { prompt: "Add one criterion about partial results after interruption." },
+      { prompt: "Now compress the criteria to five bullets total." }
+    ]
+  },
+  {
+    id: "extensive-coding-tests",
+    suite: "extensive",
+    title: "Write tests for parser behavior",
+    prompt: "Write Node.js test cases for a function parseSuites(value) that accepts quick, extensive, all, and comma-separated lists, and rejects unknown suites.",
+    complexity: "medium",
+    category: "coding",
+    judge: true,
+    estimatedInputTokens: 12300,
+    estimatedOutputTokens: 950
+  },
+  {
+    id: "extensive-coding-refactor",
+    suite: "extensive",
+    title: "Refactor duplicated code",
+    prompt: "Given two duplicated TypeScript loops that build arrays of result objects, explain when to extract a helper and write the helper signature.",
+    complexity: "medium",
+    category: "coding",
+    judge: true,
+    estimatedInputTokens: 12300,
+    estimatedOutputTokens: 750
+  },
+  {
+    id: "extensive-comparison-analysis",
+    suite: "extensive",
+    title: "Compare two model outputs",
+    prompt: "Explain how you would compare two model outputs when one is concise but misses caveats and the other is verbose but complete.",
+    complexity: "medium",
+    category: "reasoning",
+    judge: true,
+    estimatedInputTokens: 12200,
+    estimatedOutputTokens: 650
+  },
+  {
+    id: "extensive-failure-mode",
+    suite: "extensive",
+    title: "Failure-mode analysis",
+    prompt: "List five failure modes for image-understanding benchmarks and one mitigation for each.",
+    complexity: "advanced",
+    category: "image", // NOTE(review): category is "image" but this case has no 'image' field and the prompt is text-only — confirm this is intended
+    judge: true,
+    estimatedInputTokens: 12300,
+    estimatedOutputTokens: 900
+  }
+];
+async function handleBenchmark(client, subcommand, rest, flags) { // CLI entry for 'openmates benchmark': validates arguments, builds the plan, then either dry-runs it or executes it
+  if (!subcommand || subcommand === "help" || flags.help === true) { // no subcommand, 'help', or --help: print usage and stop
+    printBenchmarkHelp();
+    return;
+  }
+  if (subcommand !== "model") { // 'model' is the only subcommand handled here
+    throw new Error(`Unknown benchmark command '${subcommand}'. Run 'openmates benchmark --help'.`);
+  }
+  const targetModels = rest.filter((arg) => !arg.startsWith("--")); // positional args are the target model ids; anything starting with "--" is dropped
+  if (targetModels.length === 0) {
+    throw new Error("Missing target model. Usage: openmates benchmark model <provider/model> [model-b] --confirm-spend-credits");
+  }
+  const compare = flags.compare === true; // boolean flags are tested with === true, so only a literal true counts
+  if (targetModels.length > 1 && !compare) {
+    throw new Error("Multiple target models require --compare.");
+  }
+  if (compare && targetModels.length < 2) {
+    throw new Error("--compare requires at least two target models.");
+  }
+  const judgeModel = typeof flags["judge-model"] === "string" ? flags["judge-model"] : DEFAULT_JUDGE_MODEL; // any non-string value falls back to the default judge
+  const suites = parseSuites(flags.suite); // the parse* helpers throw on invalid values, aborting before any spend
+  const runs = parseRuns(flags.runs);
+  const extensiveSize = parseExtensiveSize(flags["extensive-size"]);
+  const parallel = parseParallel(flags.parallel);
+  const dryRun = flags["dry-run"] === true;
+  const output = typeof flags.output === "string" ? flags.output : void 0;
+  const runId = typeof flags["run-id"] === "string" ? flags["run-id"] : randomUUID3(); // a fresh UUID unless the caller supplies a run id
+  const imagePath = typeof flags.image === "string" ? resolve5(flags.image) : defaultImageFixturePath(); // resolve5 is path.resolve: a user path is resolved first, otherwise the default fixture path is used
+  if (!dryRun && flags["confirm-spend-credits"] !== true) { // live runs need explicit confirmation; dry runs skip this gate
+    throw new Error(
+      "Benchmark runs spend real credits from the logged-in account. Rerun with --confirm-spend-credits, or use --dry-run to preview the plan."
+    );
+  }
+  const cases = expandCases(suites, runs, extensiveSize);
+  const pricing = loadPricingForModels([...targetModels, judgeModel]); // the judge model is priced alongside the targets
+  const estimate = estimateCredits(cases, targetModels, judgeModel, pricing);
+  const result = makeBaseResult({
+    runId,
+    targetModels,
+    judgeModel,
+    suites,
+    runs,
+    compare,
+    parallel,
+    extensiveSize,
+    dryRun,
+    estimate,
+    totalJobs: cases.length * targetModels.length // one job per (case, model) pair
+  });
+  if (dryRun) { // dry run: write the plan and estimate, then return before the login check or any request
+    writeBenchmarkResult(result, flags, output);
+    return;
+  }
+  if (!client.hasSession()) {
+    throw new Error("Benchmark runs require login. Run 'openmates login' first.");
+  }
+  let interrupted = false; // set by the SIGINT handler and checked at the start of each job
+  const onInterrupt = () => {
+    interrupted = true;
+  };
+  process.once("SIGINT", onInterrupt); // 'once': the listener is removed after the first SIGINT it handles
+  try {
+    const jobs = cases.flatMap((benchmarkCase) => targetModels.map((model) => ({ model, benchmarkCase }))); // case-major order: every model for case 1, then case 2, ...
+    await runPool(jobs, parallel, async (job) => {
+      if (interrupted) return; // jobs not yet started are skipped; nothing here cancels a job already in flight
+      const caseResult = await runCaseJob({ client, job, judgeModel, runId, imagePath });
+      result.cases.push(caseResult); // results are appended in completion order
+      recomputeResult(result, jobs.length, interrupted); // recomputed after every job
+    });
+  } finally {
+    process.off("SIGINT", onInterrupt); // always detach the handler, including when runPool rejects
+  }
+  recomputeResult(result, cases.length * targetModels.length, interrupted); // NOTE(review): this and the write below are not reached if runPool rejects, so partial results are then not written
+  writeBenchmarkResult(result, flags, output);
+}
+function printBenchmarkHelp() { // Prints the benchmark usage text with console.log; the defaults shown are interpolated from the DEFAULT_* constants
+  console.log(`Benchmark commands:
+  openmates benchmark model <provider/model> [provider/model...] --confirm-spend-credits [--compare] [--suite quick|extensive|all] [--json]
+Runs real incognito chat requests through the OpenMates product path. Live runs
+spend the logged-in user's credits and usage entries are grouped as benchmark spend.
+Options:
+  --confirm-spend-credits       Required for live benchmark runs
+  --dry-run                     Preview the benchmark plan without inference or spend
+  --compare                     Compare two or more target models
+  --suite <list>                Comma-separated suites: quick, extensive, all (default: quick)
+  --extensive-size <n>          Extensive cases to run: 5, 10, or 20 (default: ${DEFAULT_EXTENSIVE_SIZE})
+  --runs <n>                    Repeat each selected case (default: 1)
+  --parallel <n>                Concurrent target case requests (default: ${DEFAULT_PARALLEL})
+  --judge-model <provider/model> Judge for evaluated cases (default: ${DEFAULT_JUDGE_MODEL})
+  --image <path>                Override default Brandenburger Tor image fixture
+  --run-id <id>                 Reuse a benchmark run id for grouping
+  --output <path>               Save JSON result to a file
+  --json                        Print JSON result`);
+}
+function parseSuites(value) { // Normalises the --suite flag into a de-duplicated array of suite names; throws Error on invalid input
+  if (value === void 0 || value === false) return ["quick"]; // flag absent: default to the quick suite
+  if (value === true) throw new Error("--suite requires a value"); // bare --suite with no argument
+  const suites = value.split(",").map((suite) => suite.trim()).filter(Boolean); // assumes value is a string at this point — TODO confirm the flag parser only yields string/boolean
+  if (suites.includes("all")) return ["quick", "extensive"]; // NOTE(review): "all" wins before validation, so e.g. "all,bogus" is accepted
+  const allowed = /* @__PURE__ */ new Set(["quick", "extensive"]);
+  const invalid = suites.filter((suite) => !allowed.has(suite));
+  if (invalid.length > 0 || suites.length === 0) { // the length check also rejects input such as "," that leaves no names after trimming
+    throw new Error("Invalid --suite. Use quick, extensive, or all.");
+  }
+  return [...new Set(suites)]; // drop repeats, keeping first-seen order
+}
+function parseRuns(value) { // Parses the --runs flag into an integer from 1 to 20 (default 1); throws Error otherwise
+  if (value === void 0 || value === false) return 1; // flag absent: run each case once
+  if (value === true) throw new Error("--runs requires a value"); // bare --runs with no argument
+  const parsed = Number.parseInt(value, 10); // NOTE(review): parseInt ignores trailing non-digits, so "3abc" or "2.5" parse as 3 and 2
+  if (!Number.isInteger(parsed) || parsed < 1 || parsed > 20) { // NaN fails Number.isInteger, so non-numeric input is rejected here
+    throw new Error("--runs must be an integer from 1 to 20");
+  }
+  return parsed;
+}
+function parseExtensiveSize(value) { // Parses the --extensive-size flag; only 5, 10 or 20 are accepted (default DEFAULT_EXTENSIVE_SIZE); throws Error otherwise
+  if (value === void 0 || value === false) return DEFAULT_EXTENSIVE_SIZE; // flag absent
+  if (value === true) throw new Error("--extensive-size requires a value"); // bare flag with no argument
+  const parsed = Number.parseInt(value, 10); // NOTE(review): parseInt ignores trailing non-digits, so "10x" parses as 10
+  if (![5, 10, 20].includes(parsed)) { // NaN is not in the list, so non-numeric input is rejected here too
+    throw new Error("--extensive-size must be 5, 10, or 20");
+  }
+  return parsed;
+}
+function parseParallel(value) { // Parses the --parallel flag into an integer from 1 to 20 (default DEFAULT_PARALLEL); throws Error otherwise
+  if (value === void 0 || value === false) return DEFAULT_PARALLEL; // flag absent
+  if (value === true) throw new Error("--parallel requires a value"); // bare --parallel with no argument
+  const parsed = Number.parseInt(value, 10); // NOTE(review): parseInt ignores trailing non-digits, so "4x" or "4.9" parse as 4
+  if (!Number.isInteger(parsed) || parsed < 1 || parsed > 20) { // NaN fails Number.isInteger, so non-numeric input is rejected here
+    throw new Error("--parallel must be an integer from 1 to 20");
+  }
+  return parsed;
+}
+/**
+ * Build the flat list of case instances to execute.
+ *
+ * @param {string[]} suites - Selected suite names (`quick` and/or `extensive`).
+ * @param {number} runs - Number of repetitions; each repetition gets its own `run` number starting at 1.
+ * @param {number} extensiveSize - Forwarded to selectExtensiveCases when `extensive` is selected.
+ * @returns {object[]} Copies of the de-duplicated cases, one per run, each tagged with `run`.
+ */
+function expandCases(suites, runs, extensiveSize) {
+  const pool = [
+    ...(suites.includes("quick") ? QUICK_CASES : []),
+    ...(suites.includes("extensive") ? selectExtensiveCases(extensiveSize) : [])
+  ];
+  const uniqueCases = dedupeCases(pool);
+  const expanded = [];
+  let run = 1;
+  while (run <= runs) {
+    for (const benchmarkCase of uniqueCases) {
+      expanded.push({ ...benchmarkCase, run });
+    }
+    run += 1;
+  }
+  return expanded;
+}
+/**
+ * Pick the first `size` extensive cases, then swap in coding cases from the
+ * remainder until at least 15% (rounded up) of the selection is coding.
+ *
+ * Replacement works from the end of the selection backwards, overwriting the
+ * last non-coding case each time, and stops when the quota is met or no
+ * backfill candidates / replaceable slots remain.
+ *
+ * @param {number} size - Number of cases to select.
+ * @returns {object[]} Selected cases (at most `size`).
+ */
+function selectExtensiveCases(size) {
+  const isCoding = (benchmarkCase) => benchmarkCase.category === "coding";
+  // De-duplicate once and reuse the result for the backfill pool, so a coding
+  // case that is listed twice can never be inserted twice.
+  const uniqueCases = dedupeCases(EXTENSIVE_CASES);
+  const result = uniqueCases.slice(0, size);
+  const minimumCoding = Math.ceil(size * 0.15);
+  let codingCount = result.filter(isCoding).length;
+  if (codingCount >= minimumCoding) return result;
+  const selectedIds = new Set(result.map((benchmarkCase) => benchmarkCase.id));
+  const codingBackfill = uniqueCases.filter(
+    (benchmarkCase) => isCoding(benchmarkCase) && !selectedIds.has(benchmarkCase.id)
+  );
+  for (const codingCase of codingBackfill) {
+    let replaceIndex = -1;
+    for (let index = result.length - 1; index >= 0; index -= 1) {
+      if (!isCoding(result[index])) {
+        replaceIndex = index;
+        break;
+      }
+    }
+    if (replaceIndex === -1) break;
+    result[replaceIndex] = codingCase;
+    // Each replacement turns exactly one non-coding slot into a coding one.
+    codingCount += 1;
+    if (codingCount >= minimumCoding) break;
+  }
+  return result;
+}
+/**
+ * Drop cases whose `id` was already seen, keeping the first occurrence.
+ *
+ * @param {object[]} cases - Cases, each carrying an `id`.
+ * @returns {object[]} New array in the original order without repeated ids.
+ */
+function dedupeCases(cases) {
+  const seenIds = /* @__PURE__ */ new Set();
+  return cases.filter(({ id }) => {
+    if (seenIds.has(id)) return false;
+    seenIds.add(id);
+    return true;
+  });
+}
+/**
+ * Run one benchmark case against one target model.
+ *
+ * Sends the initial prompt, then each follow-up in the same chat, recording
+ * every turn. A case passes when the final answer contains `expectedIncludes`
+ * (if set) and, for judged cases, the judge score is at least 4.
+ * Errors raised while sending turns or judging are caught and reported through
+ * the `error` field of the returned result, with `passed: false`.
+ *
+ * @param {object} params - `{ client, job: { model, benchmarkCase }, judgeModel, runId, imagePath }`.
+ * @returns {Promise<object>} The case result, including all recorded turns.
+ */
+async function runCaseJob(params) {
+  const { client, job, judgeModel, runId, imagePath } = params;
+  const { model, benchmarkCase } = job;
+  const startedAt = Date.now();
+  const turns = [];
+  const history = benchmarkCase.longContext ? buildLongContextHistory() : [];
+  const sharedTurnParams = { client, model, judgeModel, runId, benchmarkCase };
+  // Builds the result record from the turns collected so far; used by both
+  // the success and the failure path so the field order stays the same.
+  const snapshot = (passed) => ({
+    id: benchmarkCase.id,
+    suite: benchmarkCase.suite,
+    title: benchmarkCase.title,
+    model,
+    run: benchmarkCase.run,
+    complexity: benchmarkCase.complexity,
+    category: benchmarkCase.category,
+    prompt: benchmarkCase.prompt,
+    assistant: turns.at(-1)?.assistant ?? "",
+    modelName: turns.at(-1)?.modelName ?? null,
+    passed,
+    durationMs: Date.now() - startedAt,
+    expectedIncludes: benchmarkCase.expectedIncludes,
+    turns
+  });
+  let chatId;
+  try {
+    const initialPrompt = await buildPromptWithAttachments(client, benchmarkCase, model, imagePath);
+    const opening = await sendBenchmarkTurn({
+      ...sharedTurnParams,
+      prompt: initialPrompt.message,
+      chatId,
+      history,
+      preparedEmbeds: initialPrompt.embeds,
+      caseId: benchmarkCase.id
+    });
+    chatId = opening.chatId;
+    turns.push(opening.turn);
+    appendHistory(history, "user", initialPrompt.message);
+    appendHistory(history, "assistant", opening.turn.assistant);
+    const followUps = benchmarkCase.followUps ?? [];
+    for (let position = 0; position < followUps.length; position += 1) {
+      const reply = await sendBenchmarkTurn({
+        ...sharedTurnParams,
+        prompt: `${modelMention(model)} ${followUps[position].prompt}`,
+        chatId,
+        history,
+        caseId: `${benchmarkCase.id}:followup-${position + 1}`
+      });
+      chatId = reply.chatId;
+      turns.push(reply.turn);
+      appendHistory(history, "user", reply.rawPrompt);
+      appendHistory(history, "assistant", reply.turn.assistant);
+    }
+    const finalAnswer = turns.at(-1)?.assistant ?? "";
+    const expected = benchmarkCase.expectedIncludes;
+    const caseResult = snapshot(expected ? finalAnswer.includes(expected) : true);
+    if (benchmarkCase.judge) {
+      caseResult.judge = await judgeCase({ client, judgeModel, targetModel: model, benchmarkCase, caseResult, runId });
+      const { score } = caseResult.judge;
+      // An unparsed (null) score fails the case even if the substring check passed.
+      caseResult.passed = score !== null && score >= 4 && caseResult.passed;
+    }
+    return caseResult;
+  } catch (error) {
+    const message = error instanceof Error ? error.message : String(error);
+    return { ...snapshot(false), error: message };
+  }
+}
+/**
+ * Send a single benchmark prompt through `client.sendMessage` and time it.
+ *
+ * @param {object} params - `{ client, prompt, chatId, history, preparedEmbeds, benchmarkCase, runId, caseId, model, judgeModel }`.
+ * @returns {Promise<{chatId: *, rawPrompt: string, turn: object}>} The chat id from the response,
+ *   the prompt as sent, and the recorded turn (prompt, assistant, modelName, durationMs).
+ */
+async function sendBenchmarkTurn(params) {
+  const startedAt = Date.now();
+  const { client, prompt, chatId, history, preparedEmbeds, benchmarkCase, runId, caseId, model, judgeModel } = params;
+  const metadata = benchmarkMetadata({
+    runId,
+    suite: benchmarkCase.suite,
+    caseId,
+    targetModel: model,
+    judgeModel
+  });
+  const reply = await client.sendMessage({
+    message: prompt,
+    chatId,
+    incognito: true,
+    autoApproveSubChats: true,
+    benchmarkMetadata: metadata,
+    messageHistory: history,
+    preparedEmbeds,
+    precollectResponse: true
+  });
+  const turn = {
+    prompt,
+    assistant: reply.assistant,
+    modelName: reply.modelName,
+    durationMs: Date.now() - startedAt
+  };
+  return { chatId: reply.chatId, rawPrompt: prompt, turn };
+}
+/**
+ * Compose the first prompt of a case: the model mention plus the case prompt,
+ * and, when the case's `image` is `"default"`, the prepared image attachment.
+ *
+ * @returns {Promise<{message: string, embeds?: object[]}>} `embeds` is only present for image cases.
+ */
+async function buildPromptWithAttachments(client, benchmarkCase, model, imagePath) {
+  const mention = modelMention(model);
+  const baseMessage = `${mention} ${benchmarkCase.prompt}`;
+  if (benchmarkCase.image === "default") {
+    const attachment = await prepareImageAttachment(client, imagePath);
+    // The attachment's reference block goes on its own line after the prompt.
+    return { message: `${baseMessage}\n${attachment.messageSuffix}`, embeds: attachment.embeds };
+  }
+  return { message: baseMessage };
+}
+/**
+ * Turn the benchmark image on disk into a message suffix and embed list,
+ * uploading it first when the processed embed asks for an upload.
+ *
+ * @returns {Promise<{messageSuffix: *, embeds: object[]}>} Reference block and the single embed.
+ * @throws {Error} If the image is missing or file processing blocked/failed/produced no embed.
+ */
+async function prepareImageAttachment(client, imagePath) {
+  if (!existsSync6(imagePath)) throw new Error(`Benchmark image not found: ${imagePath}`);
+  const processed = processFiles([imagePath], null);
+  const unusable = processed.blocked.length > 0 || processed.errors.length > 0 || processed.embeds.length === 0;
+  if (unusable) {
+    const problems = [...processed.blocked, ...processed.errors].map((entry) => entry.error);
+    const reason = problems.join("; ") || "no image embed produced";
+    throw new Error(`Failed to prepare benchmark image: ${reason}`);
+  }
+  const fileEmbed = processed.embeds[0];
+  if (fileEmbed.requiresUpload && fileEmbed.localPath) {
+    // The upload rewrites fileEmbed.referenceBlock, so it must finish before the return value is built.
+    await uploadBenchmarkImage(client, fileEmbed);
+  }
+  return { messageSuffix: fileEmbed.referenceBlock, embeds: [fileEmbed.embed] };
+}
+/**
+ * Upload the image behind `fileEmbed` and update the embed in place with the
+ * upload result (encoded content, status, hash, id) and a fresh reference block.
+ * Does nothing when the embed has no `localPath`.
+ */
+async function uploadBenchmarkImage(client, fileEmbed) {
+  const { localPath, embed } = fileEmbed;
+  if (!localPath) return;
+  const uploaded = await uploadFile(localPath, client.getSession());
+  // Keep an existing reference; otherwise derive one from the uploaded embed id.
+  const embedRef = embed.embedRef ?? `benchmark-image-${uploaded.embed_id.slice(0, 8)}`;
+  embed.embedRef = embedRef;
+  const content = {
+    type: "image",
+    app_id: "images",
+    skill_id: "upload",
+    status: "finished",
+    filename: fileEmbed.displayName,
+    embed_ref: embedRef,
+    content_hash: uploaded.content_hash,
+    s3_base_url: uploaded.s3_base_url,
+    files: uploaded.files,
+    aes_key: uploaded.aes_key,
+    aes_nonce: uploaded.aes_nonce,
+    vault_wrapped_aes_key: uploaded.vault_wrapped_aes_key,
+    ai_detection: uploaded.ai_detection
+  };
+  embed.content = toonEncodeContent(content);
+  embed.status = "finished";
+  embed.contentHash = uploaded.content_hash;
+  embed.embedId = uploaded.embed_id;
+  fileEmbed.referenceBlock = createEmbedReferenceBlock(embedRef);
+}
+/**
+ * Ask the judge model to score a finished case and parse its verdict.
+ *
+ * @param {object} params - `{ client, judgeModel, targetModel, benchmarkCase, caseResult, runId }`.
+ * @returns {Promise<{model: string, score: (number|null), reason: (string|null), raw: *, durationMs: number}>}
+ *   Parsed judgment plus the judge's raw answer and the elapsed time.
+ */
+async function judgeCase(params) {
+  const startedAt = Date.now();
+  const { client, judgeModel, targetModel, benchmarkCase, caseResult, runId } = params;
+  const metadata = benchmarkMetadata({
+    runId,
+    suite: benchmarkCase.suite,
+    caseId: `${benchmarkCase.id}:judge:${targetModel}`,
+    targetModel,
+    judgeModel
+  });
+  const question = judgePrompt(targetModel, benchmarkCase, caseResult);
+  const judgeResponse = await client.sendMessage({
+    message: `${modelMention(judgeModel)} ${question}`,
+    incognito: true,
+    autoApproveSubChats: true,
+    benchmarkMetadata: metadata,
+    precollectResponse: true
+  });
+  const raw = judgeResponse.assistant;
+  const { score, reason } = parseJudgment(raw);
+  return { model: judgeModel, score, reason, raw, durationMs: Date.now() - startedAt };
+}
+/**
+ * Run `worker` over `items` with at most `parallel` calls in flight.
+ *
+ * Each lane repeatedly claims the next unclaimed item from a shared cursor
+ * until the list is exhausted. Rejects as soon as any worker call rejects.
+ *
+ * @param {Array} items - Work items, claimed in order.
+ * @param {number} parallel - Maximum number of concurrent lanes.
+ * @param {function(*): Promise<*>} worker - Called once per item.
+ * @returns {Promise<void>}
+ */
+async function runPool(items, parallel, worker) {
+  let cursor = 0;
+  const drain = async () => {
+    while (cursor < items.length) {
+      // Claim the item and advance before awaiting, so no two lanes take the same one.
+      const claimed = items[cursor];
+      cursor += 1;
+      await worker(claimed);
+    }
+  };
+  const laneCount = Math.min(parallel, items.length);
+  const lanes = Array.from({ length: laneCount }, () => drain());
+  await Promise.all(lanes);
+}
+/**
+ * Build the predefined 20-message conversation used by long-context cases.
+ *
+ * Messages alternate user/assistant, starting with the user. Timestamps are
+ * in seconds, start 2000 seconds before now, and are spaced 30 seconds apart.
+ *
+ * @returns {Array<{message_id: string, role: string, sender_name: string, content: string, created_at: number}>}
+ */
+function buildLongContextHistory() {
+  const firstTimestamp = Math.floor(Date.now() / 1e3) - 2e3;
+  // Even positions are user messages, odd positions are assistant replies.
+  const script = [
+    "We need to launch a CLI benchmark for model comparisons.",
+    "The first goal should be a quick suite with deterministic checks.",
+    "The benchmark also needs image inference.",
+    "Use a public fixture image and ask a factual visual question.",
+    "We should avoid wasting credits.",
+    "Run a pricing preflight and require explicit spend confirmation.",
+    "What about longer conversations?",
+    "Add a 20-message predefined history and a dependent follow-up.",
+    "The extensive suite should not be too small.",
+    "Default to 10 cases and allow 5 or 20 as alternatives.",
+    "Coding quality matters.",
+    "Reserve at least 15 percent of extensive cases for coding prompts.",
+    "We also need comparison mode.",
+    "Accept multiple models with --compare and run target jobs in parallel.",
+    "How should judging work?",
+    "Judge each completed case immediately with Gemini so partial results remain useful.",
+    "What if the process is interrupted?",
+    "Print or write a partial summary with completed judgments and skipped counts.",
+    "What is the best launch strategy?",
+    "Ship quick and comparison first, then use extensive for slower releases."
+  ];
+  return script.map((content, position) => {
+    const fromUser = position % 2 === 0;
+    return {
+      message_id: `benchmark-history-${position + 1}`,
+      role: fromUser ? "user" : "assistant",
+      sender_name: fromUser ? "User" : "Assistant",
+      content,
+      created_at: firstTimestamp + position * 30
+    };
+  });
+}
+/**
+ * Append one message to `history` in place, with a fresh UUID and the current
+ * time in whole seconds.
+ *
+ * @param {object[]} history - Message list to extend.
+ * @param {string} role - `"user"` yields sender name "User"; anything else yields "Assistant".
+ * @param {*} content - Message content, stored as given.
+ */
+function appendHistory(history, role, content) {
+  const entry = {
+    message_id: randomUUID3(),
+    role,
+    sender_name: role === "user" ? "User" : "Assistant",
+    content,
+    created_at: Math.floor(Date.now() / 1e3)
+  };
+  history.push(entry);
+}
+/**
+ * Format a model id as an `@ai-model:` mention.
+ *
+ * `provider/model` (split at the first slash) becomes `@ai-model:model:provider`.
+ * Ids without a slash, or with an empty side, are appended unchanged.
+ *
+ * @param {string} model - Model id, optionally prefixed with `provider/`.
+ * @returns {string} The mention text.
+ */
+function modelMention(model) {
+  const slashAt = model.indexOf("/");
+  const hasSlash = slashAt !== -1;
+  const provider = hasSlash ? model.slice(0, slashAt) : "";
+  const modelId = hasSlash ? model.slice(slashAt + 1) : "";
+  return provider && modelId ? `@ai-model:${modelId}:${provider}` : `@ai-model:${model}`;
+}
+/**
+ * Map camelCase benchmark identifiers onto the snake_case metadata record
+ * attached to benchmark messages.
+ *
+ * @param {{runId: *, suite: *, caseId: *, targetModel: *, judgeModel: *}} params
+ * @returns {object} Metadata with `source: "benchmark"` and the five `benchmark_*` fields.
+ */
+function benchmarkMetadata(params) {
+  const { runId, suite, caseId, targetModel, judgeModel } = params;
+  return {
+    source: "benchmark",
+    benchmark_run_id: runId,
+    benchmark_suite: suite,
+    benchmark_case: caseId,
+    benchmark_target_model: targetModel,
+    benchmark_judge_model: judgeModel
+  };
+}
+/**
+ * Build the newline-separated instruction text sent to the judge model.
+ *
+ * The text fixes the two-line answer format (BENCHMARK_SCORE / BENCHMARK_REASON)
+ * and then lists the target model, the case, the initial prompt, and every
+ * turn's prompt and answer as JSON.
+ *
+ * @param {string} targetModel - Model whose answers are being judged.
+ * @param {object} benchmarkCase - Provides `id`, `category`, `complexity`, and `prompt`.
+ * @param {object} result - Case result; only `turns` is read.
+ * @returns {string} The judge prompt.
+ */
+function judgePrompt(targetModel, benchmarkCase, result) {
+  // Only prompt and answer are forwarded; other per-turn fields are left out.
+  const transcript = result.turns.map(({ prompt, assistant }) => ({ prompt, assistant }));
+  const instructions = [
+    "You are judging a real OpenMates model benchmark response.",
+    "Return exactly two plain-text lines, with no markdown, no code block, and no tool use.",
+    "Line 1 format: BENCHMARK_SCORE=<integer from 1 to 5>",
+    "Line 2 format: BENCHMARK_REASON=<one short sentence>",
+    "Score for correctness, instruction-following, usefulness, and continuity where relevant."
+  ];
+  const context = [
+    `Target model: ${targetModel}`,
+    `Benchmark case: ${benchmarkCase.id} (${benchmarkCase.category}, ${benchmarkCase.complexity})`,
+    `Initial prompt: ${JSON.stringify(benchmarkCase.prompt)}`,
+    `Turns: ${JSON.stringify(transcript)}`
+  ];
+  return instructions.concat(context).join("\n");
+}
+/**
+ * Extract the judge's score and reason from its answer.
+ *
+ * The marker format (`BENCHMARK_SCORE=<1-5>` / `BENCHMARK_REASON=<text>`) is
+ * tried first. If no valid marker score is present, the answer is searched
+ * for a JSON object with `score` and `reason` fields.
+ *
+ * @param {string} answer - Raw judge answer.
+ * @returns {{score: (number|null), reason: (string|null)}} `score` is null when
+ *   nothing parseable in the range 1..5 was found.
+ */
+function parseJudgment(answer) {
+  // `(?!\d)` keeps "BENCHMARK_SCORE=10" from being read as a score of 1.
+  const markerScore = answer.match(/BENCHMARK_SCORE\s*=\s*([1-5])(?!\d)/i);
+  if (markerScore) {
+    const reasonMatch = answer.match(/BENCHMARK_REASON\s*=\s*(.+)/i);
+    return {
+      score: Number.parseInt(markerScore[1], 10),
+      reason: reasonMatch?.[1]?.trim() ?? null
+    };
+  }
+  const jsonText = extractJsonObject(answer);
+  if (!jsonText) return { score: null, reason: null };
+  try {
+    const parsed = JSON.parse(jsonText);
+    // Apply the same 1..5 contract as the marker format, so an off-scale
+    // value cannot pass the case or skew the average judge score.
+    const scoreIsValid = typeof parsed.score === "number" && Number.isFinite(parsed.score) && parsed.score >= 1 && parsed.score <= 5;
+    const score = scoreIsValid ? parsed.score : null;
+    const reason = typeof parsed.reason === "string" ? parsed.reason : null;
+    return { score, reason };
+  } catch {
+    // Malformed JSON (or a non-object payload) counts as an unparsed judgment.
+    return { score: null, reason: null };
+  }
+}
+/**
+ * Locate the text of a JSON object inside free-form text.
+ *
+ * Prefers the contents of the first fenced code block; otherwise returns the
+ * span from the first `{` to the last `}`.
+ *
+ * @param {string} text - Text to search.
+ * @returns {string|null} Candidate JSON text (not validated), or null if none was found.
+ */
+function extractJsonObject(text) {
+  const fenced = text.match(/```(?:json)?\s*([\s\S]*?)\s*```/i);
+  if (fenced) return fenced[1];
+  const start = text.indexOf("{");
+  const end = text.lastIndexOf("}");
+  if (start === -1 || end === -1 || end <= start) return null;
+  return text.slice(start, end + 1);
+}
+/**
+ * Look up pricing for every distinct requested model.
+ *
+ * @param {string[]} models - Model ids; duplicates are ignored.
+ * @returns {Map<string, object>} Pricing keyed by the model id as given by the caller.
+ * @throws {Error} If pricing metadata is unavailable for any requested model; the message lists all of them.
+ */
+function loadPricingForModels(models) {
+  const catalog = loadProviderPricing();
+  const requested = [...new Set(models)];
+  const lookups = requested.map((model) => [model, catalog.get(normalizeModelKey(model))]);
+  const missing = lookups.filter(([, modelPricing]) => !modelPricing).map(([model]) => model);
+  if (missing.length > 0) {
+    throw new Error(
+      `Cannot estimate benchmark cost because pricing metadata is unavailable for: ${missing.join(", ")}. Use provider/model ids with backend provider pricing metadata.`
+    );
+  }
+  return /* @__PURE__ */ new Map(lookups);
+}
+/**
+ * Read every `.yml` file in the providers directory and index the model
+ * pricing found there.
+ *
+ * Each model is registered under both `provider/modelId` and the bare
+ * `modelId`; a later entry with the same key replaces an earlier one.
+ *
+ * @returns {Map<string, object>} Pricing index; empty when no providers directory is found.
+ */
+function loadProviderPricing() {
+  const pricing = /* @__PURE__ */ new Map();
+  const providersDir = findProvidersDir();
+  if (!providersDir) return pricing;
+  const ymlFiles = readdirSync(providersDir).filter((fileName) => fileName.endsWith(".yml"));
+  for (const fileName of ymlFiles) {
+    const text = readFileSync6(join4(providersDir, fileName), "utf-8");
+    // Fall back to the file name when the file declares no provider_id.
+    const provider = parseProviderId(text) ?? fileName.replace(/\.yml$/, "");
+    for (const modelPricing of parseModelPricing(text, provider)) {
+      const qualifiedKey = `${modelPricing.provider}/${modelPricing.modelId}`;
+      pricing.set(qualifiedKey, modelPricing);
+      pricing.set(modelPricing.modelId, modelPricing);
+    }
+  }
+  return pricing;
+}
+/**
+ * Read the top-level `provider_id:` value from provider YAML text.
+ *
+ * Optional quotes are stripped and a trailing `# comment` is ignored, which
+ * matches how model ids are read in parseModelPricing.
+ *
+ * @param {string} text - Provider YAML file content.
+ * @returns {string|null} The provider id, or null when the key is absent or has no value.
+ */
+function parseProviderId(text) {
+  // `[ \t]*` (not `\s*`) keeps the match on one line, so an empty
+  // `provider_id:` cannot swallow the following line as its value.
+  const match = text.match(/^provider_id:[ \t]*["']?([^"'\r\n#]+)["']?/m);
+  const providerId = match?.[1]?.trim();
+  return providerId ? providerId : null;
+}
+/**
+ * Scan provider YAML text line by line for model ids and their token pricing.
+ *
+ * A model starts at a `  - id:` line. Within a model, the first
+ * `per_credit_unit:` line at 10 columns of indentation is taken as the input
+ * rate and the second as the output rate; further ones are ignored. Models
+ * missing an id or either rate (or with a rate of 0) are dropped.
+ *
+ * @param {string} text - Provider YAML file content.
+ * @param {string} provider - Provider id stored on each result.
+ * @returns {Array<{provider: string, modelId: string, inputTokensPerCredit: number, outputTokensPerCredit: number}>}
+ */
+function parseModelPricing(text, provider) {
+  const results = [];
+  // Model currently being collected: its id and the rates seen so far, in order.
+  let current = null;
+  const flush = () => {
+    if (!current) return;
+    const { modelId } = current;
+    const [inputTokensPerCredit = null, outputTokensPerCredit = null] = current.rates;
+    if (modelId && inputTokensPerCredit && outputTokensPerCredit) {
+      results.push({ provider, modelId, inputTokensPerCredit, outputTokensPerCredit });
+    }
+  };
+  for (const line of text.split("\n")) {
+    const idMatch = line.match(/^\s{2}-\s+id:\s*["']?([^"'\n#]+)["']?/);
+    if (idMatch) {
+      // A new model begins: emit the previous one before resetting.
+      flush();
+      current = { modelId: idMatch[1].trim(), rates: [] };
+      continue;
+    }
+    if (!current) continue;
+    const rateMatch = line.match(/^\s{10}per_credit_unit:\s*(\d+)/);
+    if (rateMatch && current.rates.length < 2) {
+      current.rates.push(Number.parseInt(rateMatch[1], 10));
+    }
+  }
+  flush();
+  return results;
+}
+/**
+ * Map a requested model id to the key used in the pricing index.
+ *
+ * The pricing index registers both `provider/modelId` and bare `modelId`
+ * keys, so the id is used as-is. (The earlier conditional returned `model`
+ * from both branches and has been removed as dead code.)
+ *
+ * @param {string} model - Model id as given by the caller.
+ * @returns {string} The same id, unchanged.
+ */
+function normalizeModelKey(model) {
+  return model;
+}
+/**
+ * Search for a `backend/providers` directory, starting at this module's
+ * directory and walking up at most 8 levels.
+ *
+ * At each level both `<dir>/backend/providers` and
+ * `<dir>/../../backend/providers` are checked.
+ *
+ * @returns {string|null} Path of the first existing candidate, or null when none exists.
+ */
+function findProvidersDir() {
+  let current = dirname(fileURLToPath(import.meta.url));
+  let level = 0;
+  while (level < 8) {
+    const direct = join4(current, "backend", "providers");
+    if (existsSync6(direct)) return direct;
+    const twoLevelsUp = join4(current, "..", "..", "backend", "providers");
+    if (existsSync6(twoLevelsUp)) return resolve5(twoLevelsUp);
+    const parent = dirname(current);
+    // dirname() of the filesystem root is the root itself: nothing left to climb.
+    if (parent === current) return null;
+    current = parent;
+    level += 1;
+  }
+  return null;
+}
+/**
+ * Estimate the credits a benchmark run will consume.
+ *
+ * For each case and target model, the case's estimated input/output tokens are
+ * multiplied by its turn count (1 + follow-ups). Judged cases add one judge
+ * call per target model, assumed to read max(2000, 1.5 x target output) tokens
+ * and write 350. Models without an entry in `pricing` are skipped.
+ *
+ * @param {object[]} cases - Case instances to be run.
+ * @param {string[]} targetModels - Models under test.
+ * @param {string} judgeModel - Model used for judging.
+ * @param {Map<string, object>} pricing - Pricing keyed by model id.
+ * @returns {{targetCredits: number, judgeCredits: number, totalCredits: number, assumptions: object}}
+ */
+function estimateCredits(cases, targetModels, judgeModel, pricing) {
+  const assumptions = { targetInputTokens: 0, targetOutputTokens: 0, judgeInputTokens: 0, judgeOutputTokens: 0 };
+  let targetCredits = 0;
+  let judgeCredits = 0;
+  for (const benchmarkCase of cases) {
+    const turnCount = 1 + (benchmarkCase.followUps?.length ?? 0);
+    const input = benchmarkCase.estimatedInputTokens * turnCount;
+    const output = benchmarkCase.estimatedOutputTokens * turnCount;
+    for (const model of targetModels) {
+      const modelPricing = pricing.get(model);
+      if (!modelPricing) continue;
+      assumptions.targetInputTokens += input;
+      assumptions.targetOutputTokens += output;
+      targetCredits += creditsFor(modelPricing, input, output);
+      if (!benchmarkCase.judge) continue;
+      const judgePricing = pricing.get(judgeModel);
+      if (!judgePricing) continue;
+      const judgeInput = Math.max(2e3, Math.ceil(output * 1.5));
+      const judgeOutput = 350;
+      assumptions.judgeInputTokens += judgeInput;
+      assumptions.judgeOutputTokens += judgeOutput;
+      judgeCredits += creditsFor(judgePricing, judgeInput, judgeOutput);
+    }
+  }
+  return { targetCredits, judgeCredits, totalCredits: targetCredits + judgeCredits, assumptions };
+}
+/**
+ * Credits for one request: input and output tokens are each divided by their
+ * tokens-per-credit rate and rounded up separately, then added.
+ *
+ * @param {{inputTokensPerCredit: number, outputTokensPerCredit: number}} pricing
+ * @param {number} inputTokens
+ * @param {number} outputTokens
+ * @returns {number} Total credits.
+ */
+function creditsFor(pricing, inputTokens, outputTokens) {
+  const { inputTokensPerCredit, outputTokensPerCredit } = pricing;
+  const inputCredits = Math.ceil(inputTokens / inputTokensPerCredit);
+  const outputCredits = Math.ceil(outputTokens / outputTokensPerCredit);
+  return inputCredits + outputCredits;
+}
+/**
+ * Create the initial benchmark result record, before any case has run.
+ *
+ * A dry run is reported as `planned`, spends no credits, and counts every job
+ * as skipped; otherwise the record starts as `completed` with zero counts.
+ *
+ * @param {object} params - `{ dryRun, runId, targetModels, judgeModel, suites, runs, compare, parallel, extensiveSize, estimate, totalJobs }`.
+ * @returns {object} Result record with empty `cases` and one zeroed summary per target model.
+ */
+function makeBaseResult(params) {
+  const { runId, targetModels, judgeModel, suites, runs, compare, parallel, extensiveSize, estimate, totalJobs } = params;
+  const isDryRun = Boolean(params.dryRun);
+  const emptySummaryFor = (model) => ({
+    model,
+    total: 0,
+    passed: 0,
+    failed: 0,
+    averageJudgeScore: null,
+    averageDurationMs: null
+  });
+  const summary = {
+    total: totalJobs,
+    completed: 0,
+    passed: 0,
+    failed: 0,
+    skipped: isDryRun ? totalJobs : 0,
+    interrupted: false
+  };
+  return {
+    command: "benchmark model",
+    status: isDryRun ? "planned" : "completed",
+    runId,
+    targetModel: targetModels[0],
+    targetModels,
+    judgeModel,
+    suites,
+    runs,
+    compare,
+    parallel,
+    extensiveSize,
+    spendsCredits: !isDryRun,
+    estimatedCredits: estimate,
+    cases: [],
+    modelSummaries: targetModels.map((model) => emptySummaryFor(model)),
+    summary
+  };
+}
+/**
+ * Refresh the derived fields of `result` in place from its current `cases`:
+ * the overall summary, the status, the per-model summaries, and (in compare
+ * mode) the comparison.
+ *
+ * @param {object} result - Benchmark result record to update.
+ * @param {number} totalJobs - Number of jobs that were planned.
+ * @param {boolean} interrupted - Whether the run was interrupted.
+ */
+function recomputeResult(result, totalJobs, interrupted) {
+  const { cases } = result;
+  const completed = cases.length;
+  const passed = cases.filter((caseResult) => caseResult.passed).length;
+  result.summary = {
+    total: totalJobs,
+    completed,
+    passed,
+    // Every completed case either passed or failed.
+    failed: completed - passed,
+    skipped: Math.max(0, totalJobs - completed),
+    interrupted
+  };
+  const isPartial = interrupted || completed < totalJobs;
+  result.status = isPartial ? "partial" : "completed";
+  result.modelSummaries = result.targetModels.map((model) => summarizeModel(model, cases));
+  if (result.compare) {
+    result.comparison = buildComparison(result.modelSummaries);
+  }
+}
+/**
+ * Aggregate the case results that belong to one model.
+ *
+ * The judge average covers only finite numeric scores and the duration average
+ * only positive durations; each is null when it has no contributing values.
+ *
+ * @param {string} model - Model id to summarize.
+ * @param {object[]} cases - All case results.
+ * @returns {{model: string, total: number, passed: number, failed: number, averageJudgeScore: (number|null), averageDurationMs: (number|null)}}
+ */
+function summarizeModel(model, cases) {
+  let total = 0;
+  let passed = 0;
+  let failed = 0;
+  let scoreSum = 0;
+  let scoreCount = 0;
+  let durationSum = 0;
+  let durationCount = 0;
+  for (const caseResult of cases) {
+    if (caseResult.model !== model) continue;
+    total += 1;
+    if (caseResult.passed) passed += 1;
+    else failed += 1;
+    const score = caseResult.judge?.score;
+    if (typeof score === "number" && Number.isFinite(score)) {
+      scoreSum += score;
+      scoreCount += 1;
+    }
+    const duration = caseResult.durationMs;
+    if (duration > 0) {
+      durationSum += duration;
+      durationCount += 1;
+    }
+  }
+  return {
+    model,
+    total,
+    passed,
+    failed,
+    averageJudgeScore: scoreCount > 0 ? round2(scoreSum / scoreCount) : null,
+    averageDurationMs: durationCount > 0 ? Math.round(durationSum / durationCount) : null
+  };
+}
+/**
+ * Rank model summaries for compare mode.
+ *
+ * Models are ordered by average judge score, highest first (a missing score
+ * counts as -1); ties are broken by the number of passed cases. The input
+ * array is not modified.
+ *
+ * @param {object[]} summaries - Per-model summaries.
+ * @returns {{ranking: object[], notes: string[]}} Ranking entries and a one-line note naming the leader.
+ */
+function buildComparison(summaries) {
+  const scoreOf = (summary) => summary.averageJudgeScore ?? -1;
+  const byScoreThenPassed = (a, b) => scoreOf(b) - scoreOf(a) || b.passed - a.passed;
+  const ranking = summaries.slice().sort(byScoreThenPassed).map(({ model, averageJudgeScore, passed, total }) => ({
+    model,
+    averageJudgeScore,
+    passed,
+    total
+  }));
+  const notes = [];
+  if (ranking.length > 0) {
+    const [leader] = ranking;
+    notes.push(`Top model so far: ${leader.model} (${leader.passed}/${leader.total} passed).`);
+  }
+  return { ranking, notes };
+}
+/**
+ * Round a number to two decimal places.
+ *
+ * @param {number} value
+ * @returns {number}
+ */
+function round2(value) {
+  const hundredths = Math.round(value * 100);
+  return hundredths / 100;
+}
+/**
+ * Resolve the default benchmark image.
+ *
+ * Uses `../fixtures/brandenburger-tor.png` relative to this module when that
+ * file exists; otherwise writes the embedded SVG fixture into a newly created
+ * temporary directory and returns that file's path.
+ *
+ * @returns {string} Path to the PNG fixture or to the generated SVG file.
+ */
+function defaultImageFixturePath() {
+  const moduleDir = dirname(fileURLToPath(import.meta.url));
+  const bundledPng = join4(join4(moduleDir, "..", "fixtures"), "brandenburger-tor.png");
+  if (existsSync6(bundledPng)) {
+    return bundledPng;
+  }
+  const scratchDir = mkdtempSync(join4(tmpdir(), "openmates-benchmark-"));
+  const fallbackSvg = join4(scratchDir, "brandenburger-tor.svg");
+  writeFileSync4(fallbackSvg, FIXTURE_IMAGE_SVG, "utf-8");
+  return fallbackSvg;
+}
+/**
+ * Emit the benchmark result.
+ *
+ * When `output` is set the JSON is written to that file. When `output` is set
+ * or `flags.json` is true, the JSON is also written to stdout and nothing else
+ * is printed. Otherwise a human-readable summary is logged, followed (unless
+ * the run was only planned) by the pass count and one line per case.
+ *
+ * @param {object} result - Benchmark result record.
+ * @param {object} flags - Parsed CLI flags; only `json` is read.
+ * @param {string} [output] - Optional file path for the JSON result.
+ */
+function writeBenchmarkResult(result, flags, output) {
+  const json = `${JSON.stringify(result, null, 2)}\n`;
+  if (output) writeFileSync4(output, json, "utf-8");
+  const wantsJsonOnStdout = flags.json === true || output;
+  if (wantsJsonOnStdout) {
+    process.stdout.write(json);
+    return;
+  }
+  console.log(`Benchmark ${result.status}: ${result.targetModels.join(", ")}`);
+  console.log(`Run ID: ${result.runId}`);
+  console.log(`Suites: ${result.suites.join(", ")}`);
+  console.log(`Judge: ${result.judgeModel}`);
+  console.log(`Estimated credits: ${result.estimatedCredits.totalCredits}`);
+  console.log(`Spend credits: ${result.spendsCredits ? "yes" : "no"}`);
+  if (result.status === "planned") return;
+  const { summary } = result;
+  console.log(`Passed: ${summary.passed}/${summary.completed} completed (${summary.skipped} skipped)`);
+  for (const benchmarkCase of result.cases) {
+    const mark = benchmarkCase.passed ? "PASS" : "FAIL";
+    // Optional suffixes: the judge score ("unparsed" when null) and any error text.
+    const judge = benchmarkCase.judge ? ` judge=${benchmarkCase.judge.score ?? "unparsed"}` : "";
+    const error = benchmarkCase.error ? ` error=${benchmarkCase.error}` : "";
+    const location = `${benchmarkCase.suite}/${benchmarkCase.id}`;
+    console.log(`${mark} ${benchmarkCase.model} ${location} (${benchmarkCase.durationMs}ms)${judge}${error}`);
+  }
+}
 // src/cli.ts
 async function main() {
   const parsed = parseArgs(process.argv.slice(2));
@@ -41557,6 +42591,10 @@ async function main() {
       printDocsHelp();
       return;
     }
+    if (command === "benchmark") {
+      printBenchmarkHelp();
+      return;
+    }
     printHelp();
     return;
   }
@@ -41627,6 +42665,10 @@ async function main() {
     handleFeedback(subcommand, rest, parsed.flags);
     return;
   }
+  if (command === "benchmark") {
+    await handleBenchmark(client, subcommand, rest, parsed.flags);
+    return;
+  }
   throw new Error(`Unknown command '${command}'. Run 'openmates help'.`);
 }
 function shouldInitializeRedactor(command, subcommand) {
@@ -41863,10 +42905,10 @@ Run 'openmates chats show ` + chatId + "' to check if suggestions have been save
         input: process.stdin,
         output: process.stdout
       });
-      const answer = await new Promise((resolve5) => {
+      const answer = await new Promise((resolve6) => {
         iface.question(
           `Delete ${resolved.length} chat(s)? This cannot be undone. [y/N] `,
-          resolve5
+          resolve6
         );
       });
       iface.close();
@@ -42026,16 +43068,16 @@ ${deleted}/${resolved.length} chat(s) deleted.`);
       }
     }
     const { mkdir, writeFile } = await import("fs/promises");
-    const { join: join4 } = await import("path");
+    const { join: join5 } = await import("path");
     if (useZip) {
-      const tmpDir = join4(outputDir, `.${filenameBase}_tmp`);
+      const tmpDir = join5(outputDir, `.${filenameBase}_tmp`);
       await mkdir(tmpDir, { recursive: true });
-      await writeFile(join4(tmpDir, `${filenameBase}.yml`), yamlContent);
-      await writeFile(join4(tmpDir, `${filenameBase}.md`), mdContent);
+      await writeFile(join5(tmpDir, `${filenameBase}.yml`), yamlContent);
+      await writeFile(join5(tmpDir, `${filenameBase}.md`), mdContent);
       if (codeEmbeds.length > 0) {
         for (const ce of codeEmbeds) {
           const fpath = ce.filePath ?? ce.filename ?? `${ce.embedId.slice(0, 8)}.${getExtForLang(ce.language)}`;
-          const fullPath = join4(tmpDir, "code", fpath);
+          const fullPath = join5(tmpDir, "code", fpath);
           await mkdir(fullPath.substring(0, fullPath.lastIndexOf("/")), {
             recursive: true
           });
@@ -42043,13 +43085,13 @@ ${deleted}/${resolved.length} chat(s) deleted.`);
         }
       }
       if (transcriptEmbeds.length > 0) {
-        const tDir = join4(tmpDir, "transcripts");
+        const tDir = join5(tmpDir, "transcripts");
         await mkdir(tDir, { recursive: true });
         for (const te of transcriptEmbeds) {
-          await writeFile(join4(tDir, te.filename), te.content);
+          await writeFile(join5(tDir, te.filename), te.content);
         }
       }
-      const zipPath = join4(outputDir, `${filenameBase}.zip`);
+      const zipPath = join5(outputDir, `${filenameBase}.zip`);
       const { execSync: execSync2 } = await import("child_process");
       try {
         execSync2(`cd "${tmpDir}" && zip -r "${zipPath}" .`, { stdio: "pipe" });
@@ -42064,17 +43106,17 @@ ${deleted}/${resolved.length} chat(s) deleted.`);
         );
       }
     } else {
-      const chatDir = join4(outputDir, filenameBase);
+      const chatDir = join5(outputDir, filenameBase);
       await mkdir(chatDir, { recursive: true });
       const written = [];
-      await writeFile(join4(chatDir, `${filenameBase}.yml`), yamlContent);
+      await writeFile(join5(chatDir, `${filenameBase}.yml`), yamlContent);
       written.push(`${filenameBase}.yml`);
-      await writeFile(join4(chatDir, `${filenameBase}.md`), mdContent);
+      await writeFile(join5(chatDir, `${filenameBase}.md`), mdContent);
       written.push(`${filenameBase}.md`);
       if (codeEmbeds.length > 0) {
         for (const ce of codeEmbeds) {
           const fpath = ce.filePath ?? ce.filename ?? `${ce.embedId.slice(0, 8)}.${getExtForLang(ce.language)}`;
-          const fullPath = join4(chatDir, "code", fpath);
+          const fullPath = join5(chatDir, "code", fpath);
           await mkdir(fullPath.substring(0, fullPath.lastIndexOf("/")), {
             recursive: true
           });
@@ -42083,10 +43125,10 @@ ${deleted}/${resolved.length} chat(s) deleted.`);
         }
       }
       if (transcriptEmbeds.length > 0) {
-        const tDir = join4(chatDir, "transcripts");
+        const tDir = join5(chatDir, "transcripts");
         await mkdir(tDir, { recursive: true });
         for (const te of transcriptEmbeds) {
-          await writeFile(join4(tDir, te.filename), te.content);
+          await writeFile(join5(tDir, te.filename), te.content);
           written.push(`transcripts/${te.filename}`);
         }
       }
@@ -42122,7 +43164,7 @@ ${deleted}/${resolved.length} chat(s) deleted.`);
       printJson2({
         chat_id: chat.id,
         title: chat.title,
-        output_dir: useZip ? join4(outputDir, `${filenameBase}.zip`) : join4(outputDir, filenameBase),
+        output_dir: useZip ? join5(outputDir, `${filenameBase}.zip`) : join5(outputDir, filenameBase),
         files,
         code_embeds: codeEmbeds.length,
         transcript_embeds: transcriptEmbeds.length
@@ -42643,7 +43685,7 @@ async function handleCodeRun(client, flags, apiKey) {
   }
 }
 async function streamCodeRunToTerminal(url, jsonMode) {
-  return await new Promise((resolve5, reject) => {
+  return await new Promise((resolve6, reject) => {
     const ws = new WebSocket2(url);
     let lastStatus = {};
     ws.on("message", (data) => {
@@ -42662,7 +43704,7 @@ async function streamCodeRunToTerminal(url, jsonMode) {
           const status = String(payload.status ?? "");
           if (["finished", "failed", "timeout", "cancelled"].includes(status)) {
             ws.close();
-            resolve5(lastStatus);
+            resolve6(lastStatus);
           }
         }
       } catch (err) {
@@ -42672,7 +43714,7 @@ async function streamCodeRunToTerminal(url, jsonMode) {
     });
     ws.on("error", () => reject(new Error("Code Run stream failed.")));
     ws.on("close", () => {
-      if (Object.keys(lastStatus).length > 0) resolve5(lastStatus);
+      if (Object.keys(lastStatus).length > 0) resolve6(lastStatus);
     });
   });
 }
@@ -42683,7 +43725,7 @@ async function pollCodeRunStatus(client, statusPath, apiKey, jsonMode) {
     if (!jsonMode && value) process.stderr.write(`Code Run status: ${value}
 `);
     if (["finished", "failed", "timeout", "cancelled"].includes(value)) return status;
-    await new Promise((resolve5) => setTimeout(resolve5, 1e3));
+    await new Promise((resolve6) => setTimeout(resolve6, 1e3));
   }
 }
 function buildSkillInput(flags, inlineTokens, schemaParams) {
@@ -42883,7 +43925,7 @@ async function handleEmbeds(client, subcommand, rest, flags) {
         throw new Error("Embed version content was not available after local reconstruction.");
       }
       if (typeof flags.output === "string") {
-        writeFileSync4(flags.output, result.content, "utf-8");
+        writeFileSync5(flags.output, result.content, "utf-8");
         if (flags.json === true) {
           printJson2({ ...result, output: flags.output });
         } else {
@@ -43167,11 +44209,11 @@ function parseYamlScalar(value) {
 }
 async function saveDownloadedDocument(document, output) {
   const { mkdir, writeFile } = await import("fs/promises");
-  const { join: join4, basename: basename4, dirname: dirname2 } = await import("path");
+  const { join: join5, basename: basename4, dirname: dirname3 } = await import("path");
   const target = typeof output === "string" ? output : ".";
   const filename = basename4(document.filename || "document.pdf");
-  const filePath = target.endsWith(".pdf") ? target : join4(target, filename);
-  await mkdir(dirname2(filePath), { recursive: true });
+  const filePath = target.endsWith(".pdf") ? target : join5(target, filename);
+  await mkdir(dirname3(filePath), { recursive: true });
   await writeFile(filePath, document.data);
   return filePath;
 }
@@ -43199,7 +44241,7 @@ function printMateInfo(mateId, json) {
 async function confirmOrExit(question) {
   const rl = await import("readline");
   const iface = rl.createInterface({ input: process.stdin, output: process.stdout });
-  const answer = await new Promise((resolve5) => iface.question(question, resolve5));
+  const answer = await new Promise((resolve6) => iface.question(question, resolve6));
   iface.close();
   if (answer.trim().toLowerCase() !== "y") {
     console.log("Aborted.");
@@ -43209,7 +44251,7 @@ async function confirmOrExit(question) {
 async function promptLine(question) {
   const rl = await import("readline");
   const iface = rl.createInterface({ input: process.stdin, output: process.stdout });
-  const answer = await new Promise((resolve5) => iface.question(question, resolve5));
+  const answer = await new Promise((resolve6) => iface.question(question, resolve6));
   iface.close();
   return answer.trim();
 }
@@ -43217,7 +44259,7 @@ async function promptSecret(question) {
   if (!process.stdin.isTTY) {
     return promptLine(question);
   }
-  return new Promise((resolve5) => {
+  return new Promise((resolve6) => {
     const stdin2 = process.stdin;
     const wasRaw = stdin2.isRaw;
     let value = "";
@@ -43230,7 +44272,7 @@ async function promptSecret(question) {
         stdin2.off("data", onData);
         stdin2.setRawMode(wasRaw);
         process.stdout.write("\n");
-        resolve5(value);
+        resolve6(value);
         return;
       }
       if (char === "") {
@@ -43250,7 +44292,7 @@ async function promptSecret(question) {
 }
 async function writeSecretFile(filePath, content, force = false) {
   const { mkdir, writeFile, stat: stat2 } = await import("fs/promises");
-  const { dirname: dirname2 } = await import("path");
+  const { dirname: dirname3 } = await import("path");
   try {
     await stat2(filePath);
     if (!force) throw new Error(`${filePath} already exists. Use --force to overwrite.`);
@@ -43260,7 +44302,7 @@ async function writeSecretFile(filePath, content, force = false) {
     }
     if (error instanceof Error && !("code" in error)) throw error;
   }
-  await mkdir(dirname2(filePath), { recursive: true });
+  await mkdir(dirname3(filePath), { recursive: true });
   await writeFile(filePath, content, { mode: 384 });
   return filePath;
 }
@@ -45887,6 +46929,7 @@ Commands:
   openmates inspirations [--lang <code>] [--json]   Daily inspirations
   openmates newchatsuggestions [--limit <n>] [--json]   Personalized new chat suggestions
   openmates feedback [--help]                Assistant response feedback helpers
+  openmates benchmark [--help]               Run real model benchmarks with usage tagged as benchmark spend
   openmates server [--help]                   Server management (install, start, stop, ...)
   openmates docs [--help]                     Browse, search, and download documentation
   openmates e2e provision-auth-accounts       Provision local E2E auth-account artifacts
@@ -46217,7 +47260,7 @@ async function handleDocs(client, subcommand, rest, flags) {
   }
   if (subcommand === "download") {
     const { writeFile, mkdir } = await import("fs/promises");
-    const { join: join4, dirname: dirname2 } = await import("path");
+    const { join: join5, dirname: dirname3 } = await import("path");
     if (flags.all === true) {
       const outputDir = typeof flags.output === "string" ? flags.output : "./openmates-docs";
       const tree = await client.listDocs();
@@ -46226,8 +47269,8 @@ async function handleDocs(client, subcommand, rest, flags) {
       let count = 0;
       for (const slug2 of slugs) {
         const content2 = await client.getDoc(slug2);
-        const filePath = join4(outputDir, `${slug2}.md`);
-        await mkdir(dirname2(filePath), { recursive: true });
+        const filePath = join5(outputDir, `${slug2}.md`);
+        await mkdir(dirname3(filePath), { recursive: true });
         await writeFile(filePath, content2, "utf-8");
         count++;
         process.stderr.write(`\r  Downloaded ${count}/${slugs.length}`);
@@ -46299,8 +47342,8 @@ function isCliEntrypoint() {
   if (!entrypoint) return false;
   try {
     const invokedPath = realpathSync(entrypoint);
-    const modulePath = realpathSync(fileURLToPath(import.meta.url));
-    return invokedPath === modulePath || basename3(invokedPath) === "cli.js" && dirname(invokedPath) === dirname(modulePath);
+    const modulePath = realpathSync(fileURLToPath2(import.meta.url));
+    return invokedPath === modulePath || basename3(invokedPath) === "cli.js" && dirname2(invokedPath) === dirname2(modulePath);
   } catch {
     return false;
   }