openmates 0.12.0-alpha.11 → 0.12.0-alpha.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -986,14 +986,14 @@ var OpenMatesWsClient = class {
986
986
  });
987
987
  }
988
988
  async open(timeoutMs = 1e4) {
989
- await new Promise((resolve5, reject) => {
989
+ await new Promise((resolve6, reject) => {
990
990
  const timeout = setTimeout(
991
991
  () => reject(new Error("WebSocket open timeout")),
992
992
  timeoutMs
993
993
  );
994
994
  this.socket.once("open", () => {
995
995
  clearTimeout(timeout);
996
- resolve5();
996
+ resolve6();
997
997
  });
998
998
  this.socket.once("error", (error) => {
999
999
  clearTimeout(timeout);
@@ -1022,15 +1022,15 @@ var OpenMatesWsClient = class {
1022
1022
  this.socket.send(JSON.stringify({ type, payload }));
1023
1023
  }
1024
1024
  sendAsync(type, payload) {
1025
- return new Promise((resolve5, reject) => {
1025
+ return new Promise((resolve6, reject) => {
1026
1026
  this.socket.send(JSON.stringify({ type, payload }), (error) => {
1027
1027
  if (error) reject(error);
1028
- else resolve5();
1028
+ else resolve6();
1029
1029
  });
1030
1030
  });
1031
1031
  }
1032
1032
  waitForMessage(expectedType, predicate, timeoutMs = 2e4) {
1033
- return new Promise((resolve5, reject) => {
1033
+ return new Promise((resolve6, reject) => {
1034
1034
  const onMessage = (rawData) => {
1035
1035
  try {
1036
1036
  const parsed = JSON.parse(rawData.toString());
@@ -1041,7 +1041,7 @@ var OpenMatesWsClient = class {
1041
1041
  return;
1042
1042
  }
1043
1043
  cleanup();
1044
- resolve5(parsed);
1044
+ resolve6(parsed);
1045
1045
  } catch {
1046
1046
  }
1047
1047
  };
@@ -1074,14 +1074,14 @@ var OpenMatesWsClient = class {
1074
1074
  * Used by ensureSynced to consume the full phased-sync event stream.
1075
1075
  */
1076
1076
  collectMessages(terminatorType, timeoutMs = 9e4) {
1077
- return new Promise((resolve5, reject) => {
1077
+ return new Promise((resolve6, reject) => {
1078
1078
  const collected = [];
1079
1079
  const onMessage = (rawData) => {
1080
1080
  try {
1081
1081
  const parsed = JSON.parse(rawData.toString());
1082
1082
  if (parsed.type === terminatorType) {
1083
1083
  cleanup();
1084
- resolve5(collected);
1084
+ resolve6(collected);
1085
1085
  return;
1086
1086
  }
1087
1087
  collected.push(parsed);
@@ -1094,7 +1094,7 @@ var OpenMatesWsClient = class {
1094
1094
  };
1095
1095
  const onClose = () => {
1096
1096
  cleanup();
1097
- resolve5(collected);
1097
+ resolve6(collected);
1098
1098
  };
1099
1099
  const timeout = setTimeout(() => {
1100
1100
  cleanup();
@@ -1132,7 +1132,7 @@ var OpenMatesWsClient = class {
1132
1132
  const timeoutMs = options?.timeoutMs ?? 9e4;
1133
1133
  const onStream = options?.onStream;
1134
1134
  const asyncEmbedWaitMs = options?.asyncEmbedWaitMs ?? 12e4;
1135
- return new Promise((resolve5, reject) => {
1135
+ return new Promise((resolve6, reject) => {
1136
1136
  let latestContent = "";
1137
1137
  let messageId = null;
1138
1138
  let taskId = null;
@@ -1189,7 +1189,7 @@ var OpenMatesWsClient = class {
1189
1189
  if (waitingForUserPayload) {
1190
1190
  if (pendingSubChatHandlers.size > 0) return;
1191
1191
  cleanup();
1192
- resolve5({
1192
+ resolve6({
1193
1193
  status: "waiting_for_user",
1194
1194
  messageId,
1195
1195
  taskId,
@@ -1209,7 +1209,7 @@ var OpenMatesWsClient = class {
1209
1209
  if (processingEmbedIds.size > 0 && !asyncEmbedTimer) {
1210
1210
  asyncEmbedTimer = setTimeout(() => {
1211
1211
  cleanup();
1212
- resolve5({
1212
+ resolve6({
1213
1213
  status: "completed",
1214
1214
  messageId,
1215
1215
  taskId,
@@ -1226,7 +1226,7 @@ var OpenMatesWsClient = class {
1226
1226
  }
1227
1227
  if (processingEmbedIds.size > 0) return;
1228
1228
  cleanup();
1229
- resolve5({
1229
+ resolve6({
1230
1230
  status: "completed",
1231
1231
  messageId,
1232
1232
  taskId,
@@ -1440,7 +1440,7 @@ var OpenMatesWsClient = class {
1440
1440
  const onClose = () => {
1441
1441
  if (aiResponseDone) {
1442
1442
  cleanup();
1443
- resolve5({
1443
+ resolve6({
1444
1444
  status: "completed",
1445
1445
  messageId,
1446
1446
  taskId,
@@ -3681,7 +3681,11 @@ var OpenMatesClient = class _OpenMatesClient {
3681
3681
  messagePayload.benchmark_metadata = params.benchmarkMetadata;
3682
3682
  }
3683
3683
  if (params.incognito) {
3684
- messagePayload.message_history = [{
3684
+ const providedHistory = (params.messageHistory ?? []).map((historyMessage) => ({
3685
+ ...historyMessage,
3686
+ chat_id: historyMessage.chat_id ?? chatId
3687
+ }));
3688
+ messagePayload.message_history = [...providedHistory, {
3685
3689
  message_id: messageId,
3686
3690
  chat_id: chatId,
3687
3691
  role: "user",
@@ -4315,7 +4319,7 @@ var OpenMatesClient = class _OpenMatesClient {
4315
4319
  if (response.data.status === "failed") {
4316
4320
  throw new Error(response.data.error ?? "Task failed");
4317
4321
  }
4318
- await new Promise((resolve5) => setTimeout(resolve5, SKILL_TASK_POLL_INTERVAL_MS));
4322
+ await new Promise((resolve6) => setTimeout(resolve6, SKILL_TASK_POLL_INTERVAL_MS));
4319
4323
  }
4320
4324
  throw new Error(`Task ${taskId} did not complete within ${SKILL_TASK_POLL_TIMEOUT_MS / 1e3}s`);
4321
4325
  }
@@ -4536,7 +4540,7 @@ var OpenMatesClient = class _OpenMatesClient {
4536
4540
  `Rate limited by settings API; retrying in ${Math.ceil(SETTINGS_GET_RATE_LIMIT_RETRY_MS / 1e3)}s...
4537
4541
  `
4538
4542
  );
4539
- await new Promise((resolve5) => setTimeout(resolve5, SETTINGS_GET_RATE_LIMIT_RETRY_MS));
4543
+ await new Promise((resolve6) => setTimeout(resolve6, SETTINGS_GET_RATE_LIMIT_RETRY_MS));
4540
4544
  response = await this.http.get(normalizedPath, this.getCliRequestHeaders());
4541
4545
  }
4542
4546
  if (!response.ok) {
@@ -6037,7 +6041,7 @@ function filenameFromContentDisposition(header2) {
6037
6041
  return plain?.trim() ?? null;
6038
6042
  }
6039
6043
  function sleep(ms) {
6040
- return new Promise((resolve5) => setTimeout(resolve5, ms));
6044
+ return new Promise((resolve6) => setTimeout(resolve6, ms));
6041
6045
  }
6042
6046
  function printLogo() {
6043
6047
  const W = "\x1B[1;37m";
@@ -6054,8 +6058,8 @@ function printLogo() {
6054
6058
  // src/cli.ts
6055
6059
  import { createInterface as createInterface3 } from "readline/promises";
6056
6060
  import { realpathSync, writeFileSync as writeFileSync5 } from "fs";
6057
- import { fileURLToPath } from "url";
6058
- import { basename as basename3, dirname } from "path";
6061
+ import { fileURLToPath as fileURLToPath2 } from "url";
6062
+ import { basename as basename3, dirname as dirname2 } from "path";
6059
6063
  import WebSocket2 from "ws";
6060
6064
 
6061
6065
  // ../secret-scanner/src/registry.ts
@@ -7755,8 +7759,8 @@ async function renderRemotionShareLink(embedId, client, ln) {
7755
7759
  }
7756
7760
  }
7757
7761
  function generateQr(value) {
7758
- return new Promise((resolve5) => {
7759
- qrcode2.generate(value, { small: true }, (qr) => resolve5(qr));
7762
+ return new Promise((resolve6) => {
7763
+ qrcode2.generate(value, { small: true }, (qr) => resolve6(qr));
7760
7764
  });
7761
7765
  }
7762
7766
  function remotionMeta(c) {
@@ -8611,9 +8615,9 @@ function exec(cmd, cwd) {
8611
8615
  return execSync(cmd, { cwd, encoding: "utf-8", stdio: ["pipe", "pipe", "pipe"] }).trim();
8612
8616
  }
8613
8617
  function runInteractive(cmd, args, cwd) {
8614
- return new Promise((resolve5, reject) => {
8618
+ return new Promise((resolve6, reject) => {
8615
8619
  const child = nodeSpawn(cmd, args, { cwd, stdio: "inherit", shell: false });
8616
- child.on("close", (code) => resolve5(code ?? 1));
8620
+ child.on("close", (code) => resolve6(code ?? 1));
8617
8621
  child.on("error", reject);
8618
8622
  });
8619
8623
  }
@@ -8874,10 +8878,10 @@ function warnIfMissingLlmCredentials(installPath) {
8874
8878
  }
8875
8879
  async function confirmDestructive(phrase) {
8876
8880
  const rl = createInterface2({ input: process.stdin, output: process.stderr });
8877
- return new Promise((resolve5) => {
8881
+ return new Promise((resolve6) => {
8878
8882
  rl.question(`Type "${phrase}" to confirm: `, (answer) => {
8879
8883
  rl.close();
8880
- resolve5(answer.trim() === phrase);
8884
+ resolve6(answer.trim() === phrase);
8881
8885
  });
8882
8886
  });
8883
8887
  }
@@ -29279,6 +29283,15 @@ Only output the final Markdown table. Do NOT include explanations, notes, or any
29279
29283
  copy_failed: {
29280
29284
  text: "Failed to copy to clipboard"
29281
29285
  },
29286
+ code_file_downloaded: {
29287
+ text: "Code file downloaded successfully"
29288
+ },
29289
+ code_file_download_failed: {
29290
+ text: "Failed to download code file"
29291
+ },
29292
+ action_failed: {
29293
+ text: "Failed to perform action"
29294
+ },
29282
29295
  download_itinerary: {
29283
29296
  text: "Download itinerary"
29284
29297
  },
@@ -41521,26 +41534,276 @@ function buildAssistantFeedbackDecision(rating) {
41521
41534
 
41522
41535
  // src/benchmark.ts
41523
41536
  import { randomUUID as randomUUID3 } from "crypto";
41524
- import { writeFileSync as writeFileSync4 } from "fs";
41537
+ import { existsSync as existsSync6, mkdtempSync, readFileSync as readFileSync6, readdirSync, writeFileSync as writeFileSync4 } from "fs";
41538
+ import { tmpdir } from "os";
41539
+ import { dirname, join as join4, resolve as resolve5 } from "path";
41540
+ import { fileURLToPath } from "url";
41525
41541
  var DEFAULT_JUDGE_MODEL = "google/gemini-3-flash-preview";
41526
- var BENCHMARK_CASES = [
41542
+ var DEFAULT_EXTENSIVE_SIZE = 10;
41543
+ var DEFAULT_PARALLEL = 4;
41544
+ var FIXTURE_IMAGE_SVG = `<?xml version="1.0" encoding="UTF-8"?>
41545
+ <svg xmlns="http://www.w3.org/2000/svg" width="1200" height="800" viewBox="0 0 1200 800">
41546
+ <rect width="1200" height="800" fill="#d8ecff"/>
41547
+ <rect y="560" width="1200" height="240" fill="#d7c39a"/>
41548
+ <text x="600" y="88" text-anchor="middle" font-family="Arial, sans-serif" font-size="44" font-weight="700" fill="#23344d">Brandenburger Tor, Berlin</text>
41549
+ <g transform="translate(160 170)" fill="#c9aa6a" stroke="#5d4522" stroke-width="8">
41550
+ <rect x="80" y="160" width="800" height="58"/>
41551
+ <rect x="120" y="218" width="720" height="48"/>
41552
+ <rect x="150" y="266" width="660" height="42"/>
41553
+ <g fill="#d9bd7d">
41554
+ <rect x="170" y="308" width="54" height="250"/>
41555
+ <rect x="285" y="308" width="54" height="250"/>
41556
+ <rect x="400" y="308" width="54" height="250"/>
41557
+ <rect x="515" y="308" width="54" height="250"/>
41558
+ <rect x="630" y="308" width="54" height="250"/>
41559
+ <rect x="745" y="308" width="54" height="250"/>
41560
+ </g>
41561
+ <rect x="130" y="558" width="700" height="50"/>
41562
+ <path d="M480 30 C530 72 620 88 682 48 L720 84 C652 142 530 124 456 78 Z" fill="#3e6f5f"/>
41563
+ <circle cx="510" cy="92" r="22" fill="#3e6f5f"/>
41564
+ <circle cx="625" cy="92" r="22" fill="#3e6f5f"/>
41565
+ <path d="M565 38 l26 78 h-52 z" fill="#3e6f5f"/>
41566
+ </g>
41567
+ <text x="600" y="740" text-anchor="middle" font-family="Arial, sans-serif" font-size="32" fill="#23344d">Neoclassical gate with Quadriga on top</text>
41568
+ </svg>
41569
+ `;
41570
+ var QUICK_CASES = [
41527
41571
  {
41528
- id: "smoke-exact-token",
41529
- suite: "smoke",
41572
+ id: "quick-exact-token",
41573
+ suite: "quick",
41574
+ title: "Exact token smoke test",
41530
41575
  prompt: "Reply with exactly this token and no extra text: BENCHMARK_SMOKE_OK",
41531
- expectedIncludes: "BENCHMARK_SMOKE_OK"
41576
+ complexity: "basic",
41577
+ category: "smoke",
41578
+ expectedIncludes: "BENCHMARK_SMOKE_OK",
41579
+ judge: true,
41580
+ estimatedInputTokens: 12e3,
41581
+ estimatedOutputTokens: 64
41532
41582
  },
41533
41583
  {
41534
- id: "arithmetic-direct",
41535
- suite: "tools",
41584
+ id: "quick-arithmetic",
41585
+ suite: "quick",
41586
+ title: "Arithmetic direct answer",
41536
41587
  prompt: "Compute 19 * 23. Reply with only the integer result.",
41537
- expectedIncludes: "437"
41588
+ complexity: "basic",
41589
+ category: "math",
41590
+ expectedIncludes: "437",
41591
+ judge: true,
41592
+ estimatedInputTokens: 12e3,
41593
+ estimatedOutputTokens: 64
41538
41594
  },
41539
41595
  {
41540
- id: "quality-concise-explanation",
41541
- suite: "quality",
41542
- prompt: "In four concise sentences, explain why deterministic benchmarks still need human-readable evaluation notes.",
41543
- needsJudge: true
41596
+ id: "quick-code",
41597
+ suite: "quick",
41598
+ title: "Small code generation",
41599
+ prompt: "Write a TypeScript function isPalindrome(input: string): boolean that ignores spaces, punctuation, and case. Include only the function and one short usage example.",
41600
+ complexity: "medium",
41601
+ category: "coding",
41602
+ judge: true,
41603
+ estimatedInputTokens: 12200,
41604
+ estimatedOutputTokens: 650
41605
+ },
41606
+ {
41607
+ id: "quick-image-brandenburger-tor",
41608
+ suite: "quick",
41609
+ title: "Default image understanding",
41610
+ prompt: "Look at the attached image. What landmark is shown, when was it built, and who designed it? Answer in three concise bullet points.",
41611
+ complexity: "medium",
41612
+ category: "image",
41613
+ image: "default",
41614
+ expectedIncludes: "Brandenburg",
41615
+ judge: true,
41616
+ estimatedInputTokens: 13500,
41617
+ estimatedOutputTokens: 350
41618
+ },
41619
+ {
41620
+ id: "quick-followup-continuity",
41621
+ suite: "quick",
41622
+ title: "Short multi-turn continuity",
41623
+ prompt: "Create a three-step plan for evaluating whether a new AI model is ready for production use.",
41624
+ complexity: "medium",
41625
+ category: "multi_turn",
41626
+ judge: true,
41627
+ estimatedInputTokens: 14e3,
41628
+ estimatedOutputTokens: 900,
41629
+ followUps: [
41630
+ { prompt: "Now make step 2 more concrete with two measurable checks." },
41631
+ { prompt: "Summarize the final plan in one sentence." }
41632
+ ]
41633
+ }
41634
+ ];
41635
+ var EXTENSIVE_CASES = [
41636
+ ...QUICK_CASES,
41637
+ {
41638
+ id: "extensive-coding-debug",
41639
+ suite: "extensive",
41640
+ title: "Debug a JavaScript bug",
41641
+ prompt: "A JavaScript function returns NaN when summing prices from [{price: '12.50'}, {price: undefined}]. Explain the bug and write a corrected function.",
41642
+ complexity: "medium",
41643
+ category: "coding",
41644
+ judge: true,
41645
+ estimatedInputTokens: 12300,
41646
+ estimatedOutputTokens: 850
41647
+ },
41648
+ {
41649
+ id: "extensive-coding-api-design",
41650
+ suite: "extensive",
41651
+ title: "Design a small API contract",
41652
+ prompt: "Design a minimal JSON API for creating and listing benchmark runs. Include request/response examples and one validation error.",
41653
+ complexity: "advanced",
41654
+ category: "coding",
41655
+ judge: true,
41656
+ estimatedInputTokens: 12300,
41657
+ estimatedOutputTokens: 1e3
41658
+ },
41659
+ {
41660
+ id: "extensive-reasoning-tradeoffs",
41661
+ suite: "extensive",
41662
+ title: "Reason about benchmark tradeoffs",
41663
+ prompt: "Compare deterministic assertions and LLM-as-judge evaluation for model benchmarks. Give two strengths and two risks for each.",
41664
+ complexity: "medium",
41665
+ category: "reasoning",
41666
+ judge: true,
41667
+ estimatedInputTokens: 12200,
41668
+ estimatedOutputTokens: 800
41669
+ },
41670
+ {
41671
+ id: "extensive-planning",
41672
+ suite: "extensive",
41673
+ title: "Operational rollout plan",
41674
+ prompt: "Create a rollout checklist for switching a production chatbot from one model to another. Include monitoring, rollback, and user-visible risk checks.",
41675
+ complexity: "advanced",
41676
+ category: "synthesis",
41677
+ judge: true,
41678
+ estimatedInputTokens: 12300,
41679
+ estimatedOutputTokens: 950
41680
+ },
41681
+ {
41682
+ id: "extensive-long-context-followup",
41683
+ suite: "extensive",
41684
+ title: "Prebuilt 20-message long chat follow-up",
41685
+ prompt: "Based on the earlier discussion, choose the best launch strategy and explain why in five bullets.",
41686
+ complexity: "advanced",
41687
+ category: "long_context",
41688
+ longContext: true,
41689
+ judge: true,
41690
+ estimatedInputTokens: 18500,
41691
+ estimatedOutputTokens: 900
41692
+ },
41693
+ {
41694
+ id: "extensive-policy-summary",
41695
+ suite: "extensive",
41696
+ title: "Policy summarization",
41697
+ prompt: "Summarize why privacy-preserving benchmark logs should avoid raw user prompts. Include a concrete safer alternative.",
41698
+ complexity: "medium",
41699
+ category: "reasoning",
41700
+ judge: true,
41701
+ estimatedInputTokens: 12200,
41702
+ estimatedOutputTokens: 650
41703
+ },
41704
+ {
41705
+ id: "extensive-structured-output",
41706
+ suite: "extensive",
41707
+ title: "Structured JSON output",
41708
+ prompt: "Return only JSON with keys risk, mitigation, and confidence for the risk: benchmark results are biased by prompt wording.",
41709
+ complexity: "medium",
41710
+ category: "synthesis",
41711
+ judge: true,
41712
+ estimatedInputTokens: 12200,
41713
+ estimatedOutputTokens: 350
41714
+ },
41715
+ {
41716
+ id: "extensive-creative-constraint",
41717
+ suite: "extensive",
41718
+ title: "Creative constrained response",
41719
+ prompt: "Write a six-line product note announcing model comparisons. Each line must be under 70 characters and avoid hype words like revolutionary or magical.",
41720
+ complexity: "medium",
41721
+ category: "synthesis",
41722
+ judge: true,
41723
+ estimatedInputTokens: 12200,
41724
+ estimatedOutputTokens: 500
41725
+ },
41726
+ {
41727
+ id: "extensive-data-reasoning",
41728
+ suite: "extensive",
41729
+ title: "Interpret metrics",
41730
+ prompt: "A benchmark has pass rates 8/10, 7/10, and 9/10 across three runs. Explain what you can and cannot conclude from this sample.",
41731
+ complexity: "medium",
41732
+ category: "reasoning",
41733
+ judge: true,
41734
+ estimatedInputTokens: 12200,
41735
+ estimatedOutputTokens: 600
41736
+ },
41737
+ {
41738
+ id: "extensive-security-review",
41739
+ suite: "extensive",
41740
+ title: "Security review",
41741
+ prompt: "Review this benchmark design for security risks: it logs prompts, outputs, model ids, and usage costs to a shared file. List risks and safer defaults.",
41742
+ complexity: "advanced",
41743
+ category: "reasoning",
41744
+ judge: true,
41745
+ estimatedInputTokens: 12300,
41746
+ estimatedOutputTokens: 850
41747
+ },
41748
+ {
41749
+ id: "extensive-followup-requirements",
41750
+ suite: "extensive",
41751
+ title: "Three-turn requirements refinement",
41752
+ prompt: "Draft acceptance criteria for a CLI benchmark comparison feature.",
41753
+ complexity: "advanced",
41754
+ category: "multi_turn",
41755
+ judge: true,
41756
+ estimatedInputTokens: 14500,
41757
+ estimatedOutputTokens: 1100,
41758
+ followUps: [
41759
+ { prompt: "Add one criterion about cost estimation before live runs." },
41760
+ { prompt: "Add one criterion about partial results after interruption." },
41761
+ { prompt: "Now compress the criteria to five bullets total." }
41762
+ ]
41763
+ },
41764
+ {
41765
+ id: "extensive-coding-tests",
41766
+ suite: "extensive",
41767
+ title: "Write tests for parser behavior",
41768
+ prompt: "Write Node.js test cases for a function parseSuites(value) that accepts quick, extensive, all, and comma-separated lists, and rejects unknown suites.",
41769
+ complexity: "medium",
41770
+ category: "coding",
41771
+ judge: true,
41772
+ estimatedInputTokens: 12300,
41773
+ estimatedOutputTokens: 950
41774
+ },
41775
+ {
41776
+ id: "extensive-coding-refactor",
41777
+ suite: "extensive",
41778
+ title: "Refactor duplicated code",
41779
+ prompt: "Given two duplicated TypeScript loops that build arrays of result objects, explain when to extract a helper and write the helper signature.",
41780
+ complexity: "medium",
41781
+ category: "coding",
41782
+ judge: true,
41783
+ estimatedInputTokens: 12300,
41784
+ estimatedOutputTokens: 750
41785
+ },
41786
+ {
41787
+ id: "extensive-comparison-analysis",
41788
+ suite: "extensive",
41789
+ title: "Compare two model outputs",
41790
+ prompt: "Explain how you would compare two model outputs when one is concise but misses caveats and the other is verbose but complete.",
41791
+ complexity: "medium",
41792
+ category: "reasoning",
41793
+ judge: true,
41794
+ estimatedInputTokens: 12200,
41795
+ estimatedOutputTokens: 650
41796
+ },
41797
+ {
41798
+ id: "extensive-failure-mode",
41799
+ suite: "extensive",
41800
+ title: "Failure-mode analysis",
41801
+ prompt: "List five failure modes for image-understanding benchmarks and one mitigation for each.",
41802
+ complexity: "advanced",
41803
+ category: "image",
41804
+ judge: true,
41805
+ estimatedInputTokens: 12300,
41806
+ estimatedOutputTokens: 900
41544
41807
  }
41545
41808
  ];
41546
41809
  async function handleBenchmark(client, subcommand, rest, flags) {
@@ -41551,122 +41814,105 @@ async function handleBenchmark(client, subcommand, rest, flags) {
41551
41814
  if (subcommand !== "model") {
41552
41815
  throw new Error(`Unknown benchmark command '${subcommand}'. Run 'openmates benchmark --help'.`);
41553
41816
  }
41554
- const targetModel = rest[0];
41555
- if (!targetModel) {
41556
- throw new Error("Missing target model. Usage: openmates benchmark model <provider/model> --confirm-spend-credits");
41817
+ const targetModels = rest.filter((arg) => !arg.startsWith("--"));
41818
+ if (targetModels.length === 0) {
41819
+ throw new Error("Missing target model. Usage: openmates benchmark model <provider/model> [model-b] --confirm-spend-credits");
41820
+ }
41821
+ const compare = flags.compare === true;
41822
+ if (targetModels.length > 1 && !compare) {
41823
+ throw new Error("Multiple target models require --compare.");
41824
+ }
41825
+ if (compare && targetModels.length < 2) {
41826
+ throw new Error("--compare requires at least two target models.");
41557
41827
  }
41558
41828
  const judgeModel = typeof flags["judge-model"] === "string" ? flags["judge-model"] : DEFAULT_JUDGE_MODEL;
41559
41829
  const suites = parseSuites(flags.suite);
41560
41830
  const runs = parseRuns(flags.runs);
41831
+ const extensiveSize = parseExtensiveSize(flags["extensive-size"]);
41832
+ const parallel = parseParallel(flags.parallel);
41833
+ const caseIds = parseCaseIds(flags.case);
41561
41834
  const dryRun = flags["dry-run"] === true;
41562
41835
  const output = typeof flags.output === "string" ? flags.output : void 0;
41563
41836
  const runId = typeof flags["run-id"] === "string" ? flags["run-id"] : randomUUID3();
41837
+ const imagePath = typeof flags.image === "string" ? resolve5(flags.image) : defaultImageFixturePath();
41564
41838
  if (!dryRun && flags["confirm-spend-credits"] !== true) {
41565
41839
  throw new Error(
41566
41840
  "Benchmark runs spend real credits from the logged-in account. Rerun with --confirm-spend-credits, or use --dry-run to preview the plan."
41567
41841
  );
41568
41842
  }
41569
- const cases = expandCases(suites, runs);
41570
- const baseResult = {
41571
- command: "benchmark model",
41572
- status: dryRun ? "planned" : "completed",
41843
+ const cases = filterCases(expandCases(suites, runs, extensiveSize), caseIds);
41844
+ const pricing = loadPricingForModels([...targetModels, judgeModel]);
41845
+ const estimate = estimateCredits(cases, targetModels, judgeModel, pricing);
41846
+ const result = makeBaseResult({
41573
41847
  runId,
41574
- targetModel,
41848
+ targetModels,
41575
41849
  judgeModel,
41576
41850
  suites,
41577
41851
  runs,
41578
- spendsCredits: !dryRun,
41579
- cases: [],
41580
- summary: { total: cases.length, passed: 0, failed: 0 }
41581
- };
41852
+ compare,
41853
+ parallel,
41854
+ extensiveSize,
41855
+ dryRun,
41856
+ estimate,
41857
+ totalJobs: cases.length * targetModels.length
41858
+ });
41582
41859
  if (dryRun) {
41583
- writeBenchmarkResult(baseResult, flags, output);
41860
+ writeBenchmarkResult(result, flags, output);
41584
41861
  return;
41585
41862
  }
41586
41863
  if (!client.hasSession()) {
41587
41864
  throw new Error("Benchmark runs require login. Run 'openmates login' first.");
41588
41865
  }
41589
- for (const benchmarkCase of cases) {
41590
- const startedAt = Date.now();
41591
- const targetResponse = await client.sendMessage({
41592
- message: `${modelMention(targetModel)} ${benchmarkCase.prompt}`,
41593
- incognito: true,
41594
- autoApproveSubChats: true,
41595
- benchmarkMetadata: benchmarkMetadata({
41596
- runId,
41597
- suite: benchmarkCase.suite,
41598
- caseId: benchmarkCase.id,
41599
- targetModel,
41600
- judgeModel
41601
- }),
41602
- precollectResponse: true
41866
+ let interrupted = false;
41867
+ const onInterrupt = () => {
41868
+ interrupted = true;
41869
+ };
41870
+ process.once("SIGINT", onInterrupt);
41871
+ try {
41872
+ const jobs = cases.flatMap((benchmarkCase) => targetModels.map((model) => ({ model, benchmarkCase })));
41873
+ await runPool(jobs, parallel, async (job) => {
41874
+ if (interrupted) return;
41875
+ const caseResult = await runCaseJob({ client, job, judgeModel, runId, imagePath });
41876
+ result.cases.push(caseResult);
41877
+ recomputeResult(result, jobs.length, interrupted);
41603
41878
  });
41604
- const caseResult = {
41605
- id: benchmarkCase.id,
41606
- suite: benchmarkCase.suite,
41607
- run: benchmarkCase.run,
41608
- prompt: benchmarkCase.prompt,
41609
- assistant: targetResponse.assistant,
41610
- modelName: targetResponse.modelName,
41611
- passed: benchmarkCase.expectedIncludes ? targetResponse.assistant.includes(benchmarkCase.expectedIncludes) : true,
41612
- durationMs: Date.now() - startedAt,
41613
- expectedIncludes: benchmarkCase.expectedIncludes
41614
- };
41615
- if (benchmarkCase.needsJudge) {
41616
- const judgeResponse = await client.sendMessage({
41617
- message: `${modelMention(judgeModel)} ${judgePrompt(benchmarkCase.prompt, targetResponse.assistant)}`,
41618
- incognito: true,
41619
- autoApproveSubChats: true,
41620
- benchmarkMetadata: benchmarkMetadata({
41621
- runId,
41622
- suite: benchmarkCase.suite,
41623
- caseId: `${benchmarkCase.id}:judge`,
41624
- targetModel,
41625
- judgeModel
41626
- }),
41627
- precollectResponse: true
41628
- });
41629
- const judgment = parseJudgment(judgeResponse.assistant);
41630
- caseResult.judge = {
41631
- model: judgeModel,
41632
- score: judgment.score,
41633
- reason: judgment.reason,
41634
- raw: judgeResponse.assistant
41635
- };
41636
- caseResult.passed = judgment.score !== null && judgment.score >= 4;
41637
- }
41638
- baseResult.cases.push(caseResult);
41879
+ } finally {
41880
+ process.off("SIGINT", onInterrupt);
41639
41881
  }
41640
- baseResult.summary.passed = baseResult.cases.filter((result) => result.passed).length;
41641
- baseResult.summary.failed = baseResult.cases.length - baseResult.summary.passed;
41642
- writeBenchmarkResult(baseResult, flags, output);
41882
+ recomputeResult(result, cases.length * targetModels.length, interrupted);
41883
+ writeBenchmarkResult(result, flags, output);
41643
41884
  }
41644
41885
  function printBenchmarkHelp() {
41645
41886
  console.log(`Benchmark commands:
41646
- openmates benchmark model <provider/model> --confirm-spend-credits [--suite smoke|tools|quality|all] [--runs <n>] [--json]
41887
+ openmates benchmark model <provider/model> [provider/model...] --confirm-spend-credits [--compare] [--suite quick|extensive|all] [--json]
41647
41888
 
41648
41889
  Runs real incognito chat requests through the OpenMates product path. Live runs
41649
41890
  spend the logged-in user's credits and usage entries are grouped as benchmark spend.
41650
41891
 
41651
41892
  Options:
41652
41893
  --confirm-spend-credits Required for live benchmark runs
41653
- --dry-run Preview the benchmark plan without login or spend
41654
- --suite <list> Comma-separated suites: smoke, tools, quality, all (default: smoke)
41894
+ --dry-run Preview the benchmark plan without inference or spend
41895
+ --compare Compare two or more target models
41896
+ --suite <list> Comma-separated suites: quick, extensive, all (default: quick)
41897
+ --case <id[,id...]> Run only specific case id(s) from the selected suites
41898
+ --extensive-size <n> Extensive cases to run: 5, 10, or 20 (default: ${DEFAULT_EXTENSIVE_SIZE})
41655
41899
  --runs <n> Repeat each selected case (default: 1)
41656
- --judge-model <provider/model> Judge for quality cases (default: ${DEFAULT_JUDGE_MODEL})
41900
+ --parallel <n> Concurrent target case requests (default: ${DEFAULT_PARALLEL})
41901
+ --judge-model <provider/model> Judge for evaluated cases (default: ${DEFAULT_JUDGE_MODEL})
41902
+ --image <path> Override default Brandenburger Tor image fixture
41657
41903
  --run-id <id> Reuse a benchmark run id for grouping
41658
41904
  --output <path> Save JSON result to a file
41659
41905
  --json Print JSON result`);
41660
41906
  }
41661
41907
  function parseSuites(value) {
41662
- if (value === void 0 || value === false) return ["smoke"];
41908
+ if (value === void 0 || value === false) return ["quick"];
41663
41909
  if (value === true) throw new Error("--suite requires a value");
41664
41910
  const suites = value.split(",").map((suite) => suite.trim()).filter(Boolean);
41665
- if (suites.includes("all")) return ["smoke", "tools", "quality"];
41666
- const allowed = /* @__PURE__ */ new Set(["smoke", "tools", "quality"]);
41911
+ if (suites.includes("all")) return ["quick", "extensive"];
41912
+ const allowed = /* @__PURE__ */ new Set(["quick", "extensive"]);
41667
41913
  const invalid = suites.filter((suite) => !allowed.has(suite));
41668
41914
  if (invalid.length > 0 || suites.length === 0) {
41669
- throw new Error("Invalid --suite. Use smoke, tools, quality, or all.");
41915
+ throw new Error("Invalid --suite. Use quick, extensive, or all.");
41670
41916
  }
41671
41917
  return [...new Set(suites)];
41672
41918
  }
@@ -41679,14 +41925,331 @@ function parseRuns(value) {
41679
41925
  }
41680
41926
  return parsed;
41681
41927
  }
41682
- function expandCases(suites, runs) {
41683
- const selected = BENCHMARK_CASES.filter((benchmarkCase) => suites.includes(benchmarkCase.suite));
41928
+ function parseExtensiveSize(value) {
41929
+ if (value === void 0 || value === false) return DEFAULT_EXTENSIVE_SIZE;
41930
+ if (value === true) throw new Error("--extensive-size requires a value");
41931
+ const parsed = Number.parseInt(value, 10);
41932
+ if (![5, 10, 20].includes(parsed)) {
41933
+ throw new Error("--extensive-size must be 5, 10, or 20");
41934
+ }
41935
+ return parsed;
41936
+ }
41937
+ function parseParallel(value) {
41938
+ if (value === void 0 || value === false) return DEFAULT_PARALLEL;
41939
+ if (value === true) throw new Error("--parallel requires a value");
41940
+ const parsed = Number.parseInt(value, 10);
41941
+ if (!Number.isInteger(parsed) || parsed < 1 || parsed > 20) {
41942
+ throw new Error("--parallel must be an integer from 1 to 20");
41943
+ }
41944
+ return parsed;
41945
+ }
41946
+ function parseCaseIds(value) {
41947
+ if (value === void 0 || value === false) return [];
41948
+ if (value === true) throw new Error("--case requires a case id");
41949
+ const caseIds = value.split(",").map((caseId) => caseId.trim()).filter(Boolean);
41950
+ if (caseIds.length === 0) throw new Error("--case requires at least one case id");
41951
+ return [...new Set(caseIds)];
41952
+ }
41953
+ function filterCases(cases, caseIds) {
41954
+ if (caseIds.length === 0) return cases;
41955
+ const availableIds = new Set(cases.map((benchmarkCase) => benchmarkCase.id));
41956
+ const missing = caseIds.filter((caseId) => !availableIds.has(caseId));
41957
+ if (missing.length > 0) {
41958
+ throw new Error(
41959
+ `Unknown benchmark case id(s): ${missing.join(", ")}. Available in selected suite(s): ${[...availableIds].sort().join(", ")}`
41960
+ );
41961
+ }
41962
+ return cases.filter((benchmarkCase) => caseIds.includes(benchmarkCase.id));
41963
+ }
41964
+ function expandCases(suites, runs, extensiveSize) {
41965
+ const selected = [];
41966
+ if (suites.includes("quick")) selected.push(...QUICK_CASES);
41967
+ if (suites.includes("extensive")) selected.push(...selectExtensiveCases(extensiveSize));
41968
+ const uniqueSelected = dedupeCases(selected);
41684
41969
  const expanded = [];
41685
41970
  for (let run = 1; run <= runs; run += 1) {
41686
- for (const benchmarkCase of selected) expanded.push({ ...benchmarkCase, run });
41971
+ for (const benchmarkCase of uniqueSelected) expanded.push({ ...benchmarkCase, run });
41687
41972
  }
41688
41973
  return expanded;
41689
41974
  }
41975
+ function selectExtensiveCases(size) {
41976
+ const cases = dedupeCases(EXTENSIVE_CASES).slice(0, size);
41977
+ const minimumCoding = Math.ceil(size * 0.15);
41978
+ const codingCount = cases.filter((benchmarkCase) => benchmarkCase.category === "coding").length;
41979
+ if (codingCount >= minimumCoding) return cases;
41980
+ const selectedIds = new Set(cases.map((benchmarkCase) => benchmarkCase.id));
41981
+ const codingBackfill = EXTENSIVE_CASES.filter(
41982
+ (benchmarkCase) => benchmarkCase.category === "coding" && !selectedIds.has(benchmarkCase.id)
41983
+ );
41984
+ const result = [...cases];
41985
+ for (const codingCase of codingBackfill) {
41986
+ let replaceIndex = -1;
41987
+ for (let index = result.length - 1; index >= 0; index -= 1) {
41988
+ if (result[index]?.category !== "coding") {
41989
+ replaceIndex = index;
41990
+ break;
41991
+ }
41992
+ }
41993
+ if (replaceIndex === -1) break;
41994
+ result[replaceIndex] = codingCase;
41995
+ if (result.filter((benchmarkCase) => benchmarkCase.category === "coding").length >= minimumCoding) break;
41996
+ }
41997
+ return result;
41998
+ }
41999
+ function dedupeCases(cases) {
42000
+ const seen = /* @__PURE__ */ new Set();
42001
+ const result = [];
42002
+ for (const benchmarkCase of cases) {
42003
+ if (seen.has(benchmarkCase.id)) continue;
42004
+ seen.add(benchmarkCase.id);
42005
+ result.push(benchmarkCase);
42006
+ }
42007
+ return result;
42008
+ }
42009
+ async function runCaseJob(params) {
42010
+ const { client, job, judgeModel, runId, imagePath } = params;
42011
+ const { model, benchmarkCase } = job;
42012
+ const startedAt = Date.now();
42013
+ const turns = [];
42014
+ const history = benchmarkCase.longContext ? buildLongContextHistory() : [];
42015
+ let chatId;
42016
+ try {
42017
+ const initialPrompt = await buildPromptWithAttachments(client, benchmarkCase, model, imagePath);
42018
+ const targetResponse = await sendBenchmarkTurn({
42019
+ client,
42020
+ model,
42021
+ judgeModel,
42022
+ runId,
42023
+ benchmarkCase,
42024
+ prompt: initialPrompt.message,
42025
+ chatId,
42026
+ history,
42027
+ preparedEmbeds: initialPrompt.embeds,
42028
+ caseId: benchmarkCase.id
42029
+ });
42030
+ chatId = targetResponse.chatId;
42031
+ turns.push(targetResponse.turn);
42032
+ appendHistory(history, "user", initialPrompt.message);
42033
+ appendHistory(history, "assistant", targetResponse.turn.assistant);
42034
+ for (const [index, followUp] of (benchmarkCase.followUps ?? []).entries()) {
42035
+ const response = await sendBenchmarkTurn({
42036
+ client,
42037
+ model,
42038
+ judgeModel,
42039
+ runId,
42040
+ benchmarkCase,
42041
+ prompt: `${modelMention(model)} ${followUp.prompt}`,
42042
+ chatId,
42043
+ history,
42044
+ caseId: `${benchmarkCase.id}:followup-${index + 1}`
42045
+ });
42046
+ chatId = response.chatId;
42047
+ turns.push(response.turn);
42048
+ appendHistory(history, "user", response.rawPrompt);
42049
+ appendHistory(history, "assistant", response.turn.assistant);
42050
+ }
42051
+ const assistant = turns.at(-1)?.assistant ?? "";
42052
+ const caseResult = {
42053
+ id: benchmarkCase.id,
42054
+ suite: benchmarkCase.suite,
42055
+ title: benchmarkCase.title,
42056
+ model,
42057
+ run: benchmarkCase.run,
42058
+ complexity: benchmarkCase.complexity,
42059
+ category: benchmarkCase.category,
42060
+ prompt: benchmarkCase.prompt,
42061
+ assistant,
42062
+ modelName: turns.at(-1)?.modelName ?? null,
42063
+ passed: benchmarkCase.expectedIncludes ? assistant.includes(benchmarkCase.expectedIncludes) : true,
42064
+ durationMs: Date.now() - startedAt,
42065
+ expectedIncludes: benchmarkCase.expectedIncludes,
42066
+ turns
42067
+ };
42068
+ if (benchmarkCase.judge) {
42069
+ caseResult.judge = await judgeCase({ client, judgeModel, targetModel: model, benchmarkCase, caseResult, runId });
42070
+ caseResult.passed = caseResult.judge.score !== null && caseResult.judge.score >= 4 && caseResult.passed;
42071
+ }
42072
+ return caseResult;
42073
+ } catch (error) {
42074
+ const message = error instanceof Error ? error.message : String(error);
42075
+ return {
42076
+ id: benchmarkCase.id,
42077
+ suite: benchmarkCase.suite,
42078
+ title: benchmarkCase.title,
42079
+ model,
42080
+ run: benchmarkCase.run,
42081
+ complexity: benchmarkCase.complexity,
42082
+ category: benchmarkCase.category,
42083
+ prompt: benchmarkCase.prompt,
42084
+ assistant: turns.at(-1)?.assistant ?? "",
42085
+ modelName: turns.at(-1)?.modelName ?? null,
42086
+ passed: false,
42087
+ durationMs: Date.now() - startedAt,
42088
+ expectedIncludes: benchmarkCase.expectedIncludes,
42089
+ turns,
42090
+ error: message
42091
+ };
42092
+ }
42093
+ }
42094
+ async function sendBenchmarkTurn(params) {
42095
+ const startedAt = Date.now();
42096
+ const response = await params.client.sendMessage({
42097
+ message: params.prompt,
42098
+ chatId: params.chatId,
42099
+ incognito: true,
42100
+ autoApproveSubChats: true,
42101
+ benchmarkMetadata: benchmarkMetadata({
42102
+ runId: params.runId,
42103
+ suite: params.benchmarkCase.suite,
42104
+ caseId: params.caseId,
42105
+ targetModel: params.model,
42106
+ judgeModel: params.judgeModel
42107
+ }),
42108
+ messageHistory: params.history,
42109
+ preparedEmbeds: params.preparedEmbeds,
42110
+ precollectResponse: true
42111
+ });
42112
+ return {
42113
+ chatId: response.chatId,
42114
+ rawPrompt: params.prompt,
42115
+ turn: {
42116
+ prompt: params.prompt,
42117
+ assistant: response.assistant,
42118
+ modelName: response.modelName,
42119
+ durationMs: Date.now() - startedAt
42120
+ }
42121
+ };
42122
+ }
42123
+ async function buildPromptWithAttachments(client, benchmarkCase, model, imagePath) {
42124
+ const baseMessage = `${modelMention(model)} ${benchmarkCase.prompt}`;
42125
+ if (benchmarkCase.image !== "default") return { message: baseMessage };
42126
+ const attachment = await prepareImageAttachment(client, imagePath);
42127
+ return { message: `${baseMessage}
42128
+
42129
+ ${attachment.messageSuffix}`, embeds: attachment.embeds };
42130
+ }
42131
+ async function prepareImageAttachment(client, imagePath) {
42132
+ if (!existsSync6(imagePath)) throw new Error(`Benchmark image not found: ${imagePath}`);
42133
+ const processed = processFiles([imagePath], null);
42134
+ if (processed.blocked.length > 0 || processed.errors.length > 0 || processed.embeds.length === 0) {
42135
+ const reason = [...processed.blocked, ...processed.errors].map((entry) => entry.error).join("; ") || "no image embed produced";
42136
+ throw new Error(`Failed to prepare benchmark image: ${reason}`);
42137
+ }
42138
+ const fileEmbed = processed.embeds[0];
42139
+ if (!fileEmbed.requiresUpload || !fileEmbed.localPath) {
42140
+ return { messageSuffix: fileEmbed.referenceBlock, embeds: [fileEmbed.embed] };
42141
+ }
42142
+ await uploadBenchmarkImage(client, fileEmbed);
42143
+ return { messageSuffix: fileEmbed.referenceBlock, embeds: [fileEmbed.embed] };
42144
+ }
42145
+ async function uploadBenchmarkImage(client, fileEmbed) {
42146
+ if (!fileEmbed.localPath) return;
42147
+ const uploadResult = await uploadFile(fileEmbed.localPath, client.getSession());
42148
+ const embedRef = fileEmbed.embed.embedRef ?? `benchmark-image-${uploadResult.embed_id.slice(0, 8)}`;
42149
+ fileEmbed.embed.embedRef = embedRef;
42150
+ fileEmbed.embed.content = toonEncodeContent({
42151
+ type: "image",
42152
+ app_id: "images",
42153
+ skill_id: "upload",
42154
+ status: "finished",
42155
+ filename: fileEmbed.displayName,
42156
+ embed_ref: embedRef,
42157
+ content_hash: uploadResult.content_hash,
42158
+ s3_base_url: uploadResult.s3_base_url,
42159
+ files: uploadResult.files,
42160
+ aes_key: uploadResult.aes_key,
42161
+ aes_nonce: uploadResult.aes_nonce,
42162
+ vault_wrapped_aes_key: uploadResult.vault_wrapped_aes_key,
42163
+ ai_detection: uploadResult.ai_detection
42164
+ });
42165
+ fileEmbed.embed.status = "finished";
42166
+ fileEmbed.embed.contentHash = uploadResult.content_hash;
42167
+ fileEmbed.embed.embedId = uploadResult.embed_id;
42168
+ fileEmbed.referenceBlock = createBenchmarkEmbedReferenceBlock(fileEmbed.embed.embedId, fileEmbed.embed.type);
42169
+ }
42170
+ function createBenchmarkEmbedReferenceBlock(embedId, embedType) {
42171
+ return `
42172
+
42173
+ \`\`\`json
42174
+ ${JSON.stringify({ type: embedType, embed_id: embedId })}
42175
+ \`\`\``;
42176
+ }
42177
+ async function judgeCase(params) {
42178
+ const startedAt = Date.now();
42179
+ const judgeResponse = await params.client.sendMessage({
42180
+ message: `${modelMention(params.judgeModel)} ${judgePrompt(params.targetModel, params.benchmarkCase, params.caseResult)}`,
42181
+ incognito: true,
42182
+ autoApproveSubChats: true,
42183
+ benchmarkMetadata: benchmarkMetadata({
42184
+ runId: params.runId,
42185
+ suite: params.benchmarkCase.suite,
42186
+ caseId: `${params.benchmarkCase.id}:judge:${params.targetModel}`,
42187
+ targetModel: params.targetModel,
42188
+ judgeModel: params.judgeModel
42189
+ }),
42190
+ precollectResponse: true
42191
+ });
42192
+ const judgment = parseJudgment(judgeResponse.assistant);
42193
+ return {
42194
+ model: params.judgeModel,
42195
+ score: judgment.score,
42196
+ reason: judgment.reason,
42197
+ raw: judgeResponse.assistant,
42198
+ durationMs: Date.now() - startedAt
42199
+ };
42200
+ }
42201
+ async function runPool(items, parallel, worker) {
42202
+ let index = 0;
42203
+ const workers = Array.from({ length: Math.min(parallel, items.length) }, async () => {
42204
+ while (index < items.length) {
42205
+ const item = items[index];
42206
+ index += 1;
42207
+ await worker(item);
42208
+ }
42209
+ });
42210
+ await Promise.all(workers);
42211
+ }
42212
+ function buildLongContextHistory() {
42213
+ const now = Math.floor(Date.now() / 1e3) - 2e3;
42214
+ const topics = [
42215
+ ["user", "We need to launch a CLI benchmark for model comparisons."],
42216
+ ["assistant", "The first goal should be a quick suite with deterministic checks."],
42217
+ ["user", "The benchmark also needs image inference."],
42218
+ ["assistant", "Use a public fixture image and ask a factual visual question."],
42219
+ ["user", "We should avoid wasting credits."],
42220
+ ["assistant", "Run a pricing preflight and require explicit spend confirmation."],
42221
+ ["user", "What about longer conversations?"],
42222
+ ["assistant", "Add a 20-message predefined history and a dependent follow-up."],
42223
+ ["user", "The extensive suite should not be too small."],
42224
+ ["assistant", "Default to 10 cases and allow 5 or 20 as alternatives."],
42225
+ ["user", "Coding quality matters."],
42226
+ ["assistant", "Reserve at least 15 percent of extensive cases for coding prompts."],
42227
+ ["user", "We also need comparison mode."],
42228
+ ["assistant", "Accept multiple models with --compare and run target jobs in parallel."],
42229
+ ["user", "How should judging work?"],
42230
+ ["assistant", "Judge each completed case immediately with Gemini so partial results remain useful."],
42231
+ ["user", "What if the process is interrupted?"],
42232
+ ["assistant", "Print or write a partial summary with completed judgments and skipped counts."],
42233
+ ["user", "What is the best launch strategy?"],
42234
+ ["assistant", "Ship quick and comparison first, then use extensive for slower releases."]
42235
+ ];
42236
+ return topics.map(([role, content], index) => ({
42237
+ message_id: `benchmark-history-${index + 1}`,
42238
+ role,
42239
+ sender_name: role === "user" ? "User" : "Assistant",
42240
+ content,
42241
+ created_at: now + index * 30
42242
+ }));
42243
+ }
42244
+ function appendHistory(history, role, content) {
42245
+ history.push({
42246
+ message_id: randomUUID3(),
42247
+ role,
42248
+ sender_name: role === "user" ? "User" : "Assistant",
42249
+ content,
42250
+ created_at: Math.floor(Date.now() / 1e3)
42251
+ });
42252
+ }
41690
42253
  function modelMention(model) {
41691
42254
  const separator = model.indexOf("/");
41692
42255
  if (separator === -1) return `@ai-model:${model}`;
@@ -41705,16 +42268,28 @@ function benchmarkMetadata(params) {
41705
42268
  benchmark_judge_model: params.judgeModel
41706
42269
  };
41707
42270
  }
41708
- function judgePrompt(prompt, answer) {
42271
+ function judgePrompt(targetModel, benchmarkCase, result) {
41709
42272
  return [
41710
- "You are judging a model benchmark response.",
41711
- 'Return only JSON with shape {"score": number, "reason": string}.',
41712
- "Score from 1 to 5 for correctness, clarity, and following instructions.",
41713
- `Benchmark prompt: ${JSON.stringify(prompt)}`,
41714
- `Candidate answer: ${JSON.stringify(answer)}`
42273
+ "You are judging a real OpenMates model benchmark response.",
42274
+ "Return exactly two plain-text lines, with no markdown, no code block, and no tool use.",
42275
+ "Line 1 format: BENCHMARK_SCORE=<integer from 1 to 5>",
42276
+ "Line 2 format: BENCHMARK_REASON=<one short sentence>",
42277
+ "Score for correctness, instruction-following, usefulness, and continuity where relevant.",
42278
+ `Target model: ${targetModel}`,
42279
+ `Benchmark case: ${benchmarkCase.id} (${benchmarkCase.category}, ${benchmarkCase.complexity})`,
42280
+ `Initial prompt: ${JSON.stringify(benchmarkCase.prompt)}`,
42281
+ `Turns: ${JSON.stringify(result.turns.map((turn) => ({ prompt: turn.prompt, assistant: turn.assistant })))}`
41715
42282
  ].join("\n");
41716
42283
  }
41717
42284
  function parseJudgment(answer) {
42285
+ const markerScore = answer.match(/BENCHMARK_SCORE\s*=\s*([1-5])/i);
42286
+ if (markerScore) {
42287
+ const reasonMatch = answer.match(/BENCHMARK_REASON\s*=\s*(.+)/i);
42288
+ return {
42289
+ score: Number.parseInt(markerScore[1], 10),
42290
+ reason: reasonMatch?.[1]?.trim() ?? null
42291
+ };
42292
+ }
41718
42293
  const jsonText = extractJsonObject(answer);
41719
42294
  if (!jsonText) return { score: null, reason: null };
41720
42295
  try {
@@ -41734,6 +42309,220 @@ function extractJsonObject(text) {
41734
42309
  if (start === -1 || end === -1 || end <= start) return null;
41735
42310
  return text.slice(start, end + 1);
41736
42311
  }
42312
+ function loadPricingForModels(models) {
42313
+ const availablePricing = loadProviderPricing();
42314
+ const pricing = /* @__PURE__ */ new Map();
42315
+ const missing = [];
42316
+ for (const model of [...new Set(models)]) {
42317
+ const key = normalizeModelKey(model);
42318
+ const modelPricing = availablePricing.get(key);
42319
+ if (!modelPricing) {
42320
+ missing.push(model);
42321
+ continue;
42322
+ }
42323
+ pricing.set(model, modelPricing);
42324
+ }
42325
+ if (missing.length > 0) {
42326
+ throw new Error(
42327
+ `Cannot estimate benchmark cost because pricing metadata is unavailable for: ${missing.join(", ")}. Use provider/model ids with backend provider pricing metadata.`
42328
+ );
42329
+ }
42330
+ return pricing;
42331
+ }
42332
+ function loadProviderPricing() {
42333
+ const providersDir = findProvidersDir();
42334
+ const pricing = /* @__PURE__ */ new Map();
42335
+ if (!providersDir) return pricing;
42336
+ for (const fileName of readdirSync(providersDir)) {
42337
+ if (!fileName.endsWith(".yml")) continue;
42338
+ const filePath = join4(providersDir, fileName);
42339
+ const text = readFileSync6(filePath, "utf-8");
42340
+ const provider = parseProviderId(text) ?? fileName.replace(/\.yml$/, "");
42341
+ for (const modelPricing of parseModelPricing(text, provider)) {
42342
+ pricing.set(`${modelPricing.provider}/${modelPricing.modelId}`, modelPricing);
42343
+ pricing.set(modelPricing.modelId, modelPricing);
42344
+ }
42345
+ }
42346
+ return pricing;
42347
+ }
42348
+ function parseProviderId(text) {
42349
+ const match = text.match(/^provider_id:\s*["']?([^"'\n]+)["']?/m);
42350
+ return match?.[1]?.trim() ?? null;
42351
+ }
42352
+ function parseModelPricing(text, provider) {
42353
+ const lines = text.split("\n");
42354
+ const results = [];
42355
+ let modelId = null;
42356
+ let inModel = false;
42357
+ let inputTokensPerCredit = null;
42358
+ let outputTokensPerCredit = null;
42359
+ for (const line of lines) {
42360
+ const modelMatch = line.match(/^\s{2}-\s+id:\s*["']?([^"'\n#]+)["']?/);
42361
+ if (modelMatch) {
42362
+ if (inModel && modelId && inputTokensPerCredit && outputTokensPerCredit) {
42363
+ results.push({ provider, modelId, inputTokensPerCredit, outputTokensPerCredit });
42364
+ }
42365
+ inModel = true;
42366
+ modelId = modelMatch[1].trim();
42367
+ inputTokensPerCredit = null;
42368
+ outputTokensPerCredit = null;
42369
+ continue;
42370
+ }
42371
+ if (!inModel) continue;
42372
+ const inputMatch = line.match(/^\s{10}per_credit_unit:\s*(\d+)/);
42373
+ if (inputMatch && inputTokensPerCredit === null) {
42374
+ inputTokensPerCredit = Number.parseInt(inputMatch[1], 10);
42375
+ continue;
42376
+ }
42377
+ if (inputMatch && inputTokensPerCredit !== null && outputTokensPerCredit === null) {
42378
+ outputTokensPerCredit = Number.parseInt(inputMatch[1], 10);
42379
+ }
42380
+ }
42381
+ if (inModel && modelId && inputTokensPerCredit && outputTokensPerCredit) {
42382
+ results.push({ provider, modelId, inputTokensPerCredit, outputTokensPerCredit });
42383
+ }
42384
+ return results;
42385
+ }
42386
+ function normalizeModelKey(model) {
42387
+ return model.includes("/") ? model : model;
42388
+ }
42389
+ function findProvidersDir() {
42390
+ const currentFile = fileURLToPath(import.meta.url);
42391
+ let current = dirname(currentFile);
42392
+ for (let index = 0; index < 8; index += 1) {
42393
+ const candidate = join4(current, "backend", "providers");
42394
+ if (existsSync6(candidate)) return candidate;
42395
+ const parentCandidate = join4(current, "..", "..", "backend", "providers");
42396
+ if (existsSync6(parentCandidate)) return resolve5(parentCandidate);
42397
+ const next = dirname(current);
42398
+ if (next === current) break;
42399
+ current = next;
42400
+ }
42401
+ return null;
42402
+ }
42403
+ function estimateCredits(cases, targetModels, judgeModel, pricing) {
42404
+ let targetCredits = 0;
42405
+ let judgeCredits = 0;
42406
+ let targetInputTokens = 0;
42407
+ let targetOutputTokens = 0;
42408
+ let judgeInputTokens = 0;
42409
+ let judgeOutputTokens = 0;
42410
+ for (const benchmarkCase of cases) {
42411
+ const turnCount = 1 + (benchmarkCase.followUps?.length ?? 0);
42412
+ for (const model of targetModels) {
42413
+ const modelPricing = pricing.get(model);
42414
+ if (!modelPricing) continue;
42415
+ const input = benchmarkCase.estimatedInputTokens * turnCount;
42416
+ const output = benchmarkCase.estimatedOutputTokens * turnCount;
42417
+ targetInputTokens += input;
42418
+ targetOutputTokens += output;
42419
+ targetCredits += creditsFor(modelPricing, input, output);
42420
+ if (benchmarkCase.judge) {
42421
+ const judgePricing = pricing.get(judgeModel);
42422
+ if (!judgePricing) continue;
42423
+ const judgeInput = Math.max(2e3, Math.ceil(output * 1.5));
42424
+ const judgeOutput = 350;
42425
+ judgeInputTokens += judgeInput;
42426
+ judgeOutputTokens += judgeOutput;
42427
+ judgeCredits += creditsFor(judgePricing, judgeInput, judgeOutput);
42428
+ }
42429
+ }
42430
+ }
42431
+ return {
42432
+ targetCredits,
42433
+ judgeCredits,
42434
+ totalCredits: targetCredits + judgeCredits,
42435
+ assumptions: { targetInputTokens, targetOutputTokens, judgeInputTokens, judgeOutputTokens }
42436
+ };
42437
+ }
42438
+ function creditsFor(pricing, inputTokens, outputTokens) {
42439
+ return Math.ceil(inputTokens / pricing.inputTokensPerCredit) + Math.ceil(outputTokens / pricing.outputTokensPerCredit);
42440
+ }
42441
+ function makeBaseResult(params) {
42442
+ return {
42443
+ command: "benchmark model",
42444
+ status: params.dryRun ? "planned" : "completed",
42445
+ runId: params.runId,
42446
+ targetModel: params.targetModels[0],
42447
+ targetModels: params.targetModels,
42448
+ judgeModel: params.judgeModel,
42449
+ suites: params.suites,
42450
+ runs: params.runs,
42451
+ compare: params.compare,
42452
+ parallel: params.parallel,
42453
+ extensiveSize: params.extensiveSize,
42454
+ spendsCredits: !params.dryRun,
42455
+ estimatedCredits: params.estimate,
42456
+ cases: [],
42457
+ modelSummaries: params.targetModels.map((model) => ({
42458
+ model,
42459
+ total: 0,
42460
+ passed: 0,
42461
+ failed: 0,
42462
+ averageJudgeScore: null,
42463
+ averageDurationMs: null
42464
+ })),
42465
+ summary: {
42466
+ total: params.totalJobs,
42467
+ completed: 0,
42468
+ passed: 0,
42469
+ failed: 0,
42470
+ skipped: params.dryRun ? params.totalJobs : 0,
42471
+ interrupted: false
42472
+ }
42473
+ };
42474
+ }
42475
+ function recomputeResult(result, totalJobs, interrupted) {
42476
+ const completed = result.cases.length;
42477
+ const passed = result.cases.filter((caseResult) => caseResult.passed).length;
42478
+ const failed = result.cases.filter((caseResult) => !caseResult.passed).length;
42479
+ result.summary = {
42480
+ total: totalJobs,
42481
+ completed,
42482
+ passed,
42483
+ failed,
42484
+ skipped: Math.max(0, totalJobs - completed),
42485
+ interrupted
42486
+ };
42487
+ result.status = interrupted || completed < totalJobs ? "partial" : "completed";
42488
+ result.modelSummaries = result.targetModels.map((model) => summarizeModel(model, result.cases));
42489
+ if (result.compare) result.comparison = buildComparison(result.modelSummaries);
42490
+ }
42491
+ function summarizeModel(model, cases) {
42492
+ const modelCases = cases.filter((caseResult) => caseResult.model === model);
42493
+ const scores = modelCases.map((caseResult) => caseResult.judge?.score).filter((score) => typeof score === "number" && Number.isFinite(score));
42494
+ const durations = modelCases.map((caseResult) => caseResult.durationMs).filter((value) => value > 0);
42495
+ return {
42496
+ model,
42497
+ total: modelCases.length,
42498
+ passed: modelCases.filter((caseResult) => caseResult.passed).length,
42499
+ failed: modelCases.filter((caseResult) => !caseResult.passed).length,
42500
+ averageJudgeScore: scores.length > 0 ? round2(scores.reduce((sum, score) => sum + score, 0) / scores.length) : null,
42501
+ averageDurationMs: durations.length > 0 ? Math.round(durations.reduce((sum, value) => sum + value, 0) / durations.length) : null
42502
+ };
42503
+ }
42504
+ function buildComparison(summaries) {
42505
+ const ranking = [...summaries].sort((a, b) => (b.averageJudgeScore ?? -1) - (a.averageJudgeScore ?? -1) || b.passed - a.passed).map((summary) => ({
42506
+ model: summary.model,
42507
+ averageJudgeScore: summary.averageJudgeScore,
42508
+ passed: summary.passed,
42509
+ total: summary.total
42510
+ }));
42511
+ const notes = ranking.length > 0 ? [`Top model so far: ${ranking[0].model} (${ranking[0].passed}/${ranking[0].total} passed).`] : [];
42512
+ return { ranking, notes };
42513
+ }
42514
+ function round2(value) {
42515
+ return Math.round(value * 100) / 100;
42516
+ }
42517
+ function defaultImageFixturePath() {
42518
+ const fixtureDir = join4(dirname(fileURLToPath(import.meta.url)), "..", "fixtures");
42519
+ const fixturePath = join4(fixtureDir, "brandenburger-tor.png");
42520
+ if (existsSync6(fixturePath)) return fixturePath;
42521
+ const tempDir = mkdtempSync(join4(tmpdir(), "openmates-benchmark-"));
42522
+ const tempPath = join4(tempDir, "brandenburger-tor.svg");
42523
+ writeFileSync4(tempPath, FIXTURE_IMAGE_SVG, "utf-8");
42524
+ return tempPath;
42525
+ }
41737
42526
  function writeBenchmarkResult(result, flags, output) {
41738
42527
  const json = `${JSON.stringify(result, null, 2)}
41739
42528
  `;
@@ -41742,17 +42531,19 @@ function writeBenchmarkResult(result, flags, output) {
41742
42531
  process.stdout.write(json);
41743
42532
  return;
41744
42533
  }
41745
- console.log(`Benchmark ${result.status}: ${result.targetModel}`);
42534
+ console.log(`Benchmark ${result.status}: ${result.targetModels.join(", ")}`);
41746
42535
  console.log(`Run ID: ${result.runId}`);
41747
42536
  console.log(`Suites: ${result.suites.join(", ")}`);
41748
42537
  console.log(`Judge: ${result.judgeModel}`);
42538
+ console.log(`Estimated credits: ${result.estimatedCredits.totalCredits}`);
41749
42539
  console.log(`Spend credits: ${result.spendsCredits ? "yes" : "no"}`);
41750
- if (result.status === "completed") {
41751
- console.log(`Passed: ${result.summary.passed}/${result.summary.total}`);
42540
+ if (result.status !== "planned") {
42541
+ console.log(`Passed: ${result.summary.passed}/${result.summary.completed} completed (${result.summary.skipped} skipped)`);
41752
42542
  for (const benchmarkCase of result.cases) {
41753
42543
  const mark = benchmarkCase.passed ? "PASS" : "FAIL";
41754
- const judge = benchmarkCase.judge?.score !== void 0 ? ` judge=${benchmarkCase.judge.score ?? "unparsed"}` : "";
41755
- console.log(`${mark} ${benchmarkCase.suite}/${benchmarkCase.id} (${benchmarkCase.durationMs}ms)${judge}`);
42544
+ const judge = benchmarkCase.judge ? ` judge=${benchmarkCase.judge.score ?? "unparsed"}` : "";
42545
+ const error = benchmarkCase.error ? ` error=${benchmarkCase.error}` : "";
42546
+ console.log(`${mark} ${benchmarkCase.model} ${benchmarkCase.suite}/${benchmarkCase.id} (${benchmarkCase.durationMs}ms)${judge}${error}`);
41756
42547
  }
41757
42548
  }
41758
42549
  }
@@ -42141,10 +42932,10 @@ Run 'openmates chats show ` + chatId + "' to check if suggestions have been save
42141
42932
  input: process.stdin,
42142
42933
  output: process.stdout
42143
42934
  });
42144
- const answer = await new Promise((resolve5) => {
42935
+ const answer = await new Promise((resolve6) => {
42145
42936
  iface.question(
42146
42937
  `Delete ${resolved.length} chat(s)? This cannot be undone. [y/N] `,
42147
- resolve5
42938
+ resolve6
42148
42939
  );
42149
42940
  });
42150
42941
  iface.close();
@@ -42304,16 +43095,16 @@ ${deleted}/${resolved.length} chat(s) deleted.`);
42304
43095
  }
42305
43096
  }
42306
43097
  const { mkdir, writeFile } = await import("fs/promises");
42307
- const { join: join4 } = await import("path");
43098
+ const { join: join5 } = await import("path");
42308
43099
  if (useZip) {
42309
- const tmpDir = join4(outputDir, `.${filenameBase}_tmp`);
43100
+ const tmpDir = join5(outputDir, `.${filenameBase}_tmp`);
42310
43101
  await mkdir(tmpDir, { recursive: true });
42311
- await writeFile(join4(tmpDir, `${filenameBase}.yml`), yamlContent);
42312
- await writeFile(join4(tmpDir, `${filenameBase}.md`), mdContent);
43102
+ await writeFile(join5(tmpDir, `${filenameBase}.yml`), yamlContent);
43103
+ await writeFile(join5(tmpDir, `${filenameBase}.md`), mdContent);
42313
43104
  if (codeEmbeds.length > 0) {
42314
43105
  for (const ce of codeEmbeds) {
42315
43106
  const fpath = ce.filePath ?? ce.filename ?? `${ce.embedId.slice(0, 8)}.${getExtForLang(ce.language)}`;
42316
- const fullPath = join4(tmpDir, "code", fpath);
43107
+ const fullPath = join5(tmpDir, "code", fpath);
42317
43108
  await mkdir(fullPath.substring(0, fullPath.lastIndexOf("/")), {
42318
43109
  recursive: true
42319
43110
  });
@@ -42321,13 +43112,13 @@ ${deleted}/${resolved.length} chat(s) deleted.`);
42321
43112
  }
42322
43113
  }
42323
43114
  if (transcriptEmbeds.length > 0) {
42324
- const tDir = join4(tmpDir, "transcripts");
43115
+ const tDir = join5(tmpDir, "transcripts");
42325
43116
  await mkdir(tDir, { recursive: true });
42326
43117
  for (const te of transcriptEmbeds) {
42327
- await writeFile(join4(tDir, te.filename), te.content);
43118
+ await writeFile(join5(tDir, te.filename), te.content);
42328
43119
  }
42329
43120
  }
42330
- const zipPath = join4(outputDir, `${filenameBase}.zip`);
43121
+ const zipPath = join5(outputDir, `${filenameBase}.zip`);
42331
43122
  const { execSync: execSync2 } = await import("child_process");
42332
43123
  try {
42333
43124
  execSync2(`cd "${tmpDir}" && zip -r "${zipPath}" .`, { stdio: "pipe" });
@@ -42342,17 +43133,17 @@ ${deleted}/${resolved.length} chat(s) deleted.`);
42342
43133
  );
42343
43134
  }
42344
43135
  } else {
42345
- const chatDir = join4(outputDir, filenameBase);
43136
+ const chatDir = join5(outputDir, filenameBase);
42346
43137
  await mkdir(chatDir, { recursive: true });
42347
43138
  const written = [];
42348
- await writeFile(join4(chatDir, `${filenameBase}.yml`), yamlContent);
43139
+ await writeFile(join5(chatDir, `${filenameBase}.yml`), yamlContent);
42349
43140
  written.push(`${filenameBase}.yml`);
42350
- await writeFile(join4(chatDir, `${filenameBase}.md`), mdContent);
43141
+ await writeFile(join5(chatDir, `${filenameBase}.md`), mdContent);
42351
43142
  written.push(`${filenameBase}.md`);
42352
43143
  if (codeEmbeds.length > 0) {
42353
43144
  for (const ce of codeEmbeds) {
42354
43145
  const fpath = ce.filePath ?? ce.filename ?? `${ce.embedId.slice(0, 8)}.${getExtForLang(ce.language)}`;
42355
- const fullPath = join4(chatDir, "code", fpath);
43146
+ const fullPath = join5(chatDir, "code", fpath);
42356
43147
  await mkdir(fullPath.substring(0, fullPath.lastIndexOf("/")), {
42357
43148
  recursive: true
42358
43149
  });
@@ -42361,10 +43152,10 @@ ${deleted}/${resolved.length} chat(s) deleted.`);
42361
43152
  }
42362
43153
  }
42363
43154
  if (transcriptEmbeds.length > 0) {
42364
- const tDir = join4(chatDir, "transcripts");
43155
+ const tDir = join5(chatDir, "transcripts");
42365
43156
  await mkdir(tDir, { recursive: true });
42366
43157
  for (const te of transcriptEmbeds) {
42367
- await writeFile(join4(tDir, te.filename), te.content);
43158
+ await writeFile(join5(tDir, te.filename), te.content);
42368
43159
  written.push(`transcripts/${te.filename}`);
42369
43160
  }
42370
43161
  }
@@ -42400,7 +43191,7 @@ ${deleted}/${resolved.length} chat(s) deleted.`);
42400
43191
  printJson2({
42401
43192
  chat_id: chat.id,
42402
43193
  title: chat.title,
42403
- output_dir: useZip ? join4(outputDir, `${filenameBase}.zip`) : join4(outputDir, filenameBase),
43194
+ output_dir: useZip ? join5(outputDir, `${filenameBase}.zip`) : join5(outputDir, filenameBase),
42404
43195
  files,
42405
43196
  code_embeds: codeEmbeds.length,
42406
43197
  transcript_embeds: transcriptEmbeds.length
@@ -42921,7 +43712,7 @@ async function handleCodeRun(client, flags, apiKey) {
42921
43712
  }
42922
43713
  }
42923
43714
  async function streamCodeRunToTerminal(url, jsonMode) {
42924
- return await new Promise((resolve5, reject) => {
43715
+ return await new Promise((resolve6, reject) => {
42925
43716
  const ws = new WebSocket2(url);
42926
43717
  let lastStatus = {};
42927
43718
  ws.on("message", (data) => {
@@ -42940,7 +43731,7 @@ async function streamCodeRunToTerminal(url, jsonMode) {
42940
43731
  const status = String(payload.status ?? "");
42941
43732
  if (["finished", "failed", "timeout", "cancelled"].includes(status)) {
42942
43733
  ws.close();
42943
- resolve5(lastStatus);
43734
+ resolve6(lastStatus);
42944
43735
  }
42945
43736
  }
42946
43737
  } catch (err) {
@@ -42950,7 +43741,7 @@ async function streamCodeRunToTerminal(url, jsonMode) {
42950
43741
  });
42951
43742
  ws.on("error", () => reject(new Error("Code Run stream failed.")));
42952
43743
  ws.on("close", () => {
42953
- if (Object.keys(lastStatus).length > 0) resolve5(lastStatus);
43744
+ if (Object.keys(lastStatus).length > 0) resolve6(lastStatus);
42954
43745
  });
42955
43746
  });
42956
43747
  }
@@ -42961,7 +43752,7 @@ async function pollCodeRunStatus(client, statusPath, apiKey, jsonMode) {
42961
43752
  if (!jsonMode && value) process.stderr.write(`Code Run status: ${value}
42962
43753
  `);
42963
43754
  if (["finished", "failed", "timeout", "cancelled"].includes(value)) return status;
42964
- await new Promise((resolve5) => setTimeout(resolve5, 1e3));
43755
+ await new Promise((resolve6) => setTimeout(resolve6, 1e3));
42965
43756
  }
42966
43757
  }
42967
43758
  function buildSkillInput(flags, inlineTokens, schemaParams) {
@@ -43445,11 +44236,11 @@ function parseYamlScalar(value) {
43445
44236
  }
43446
44237
  async function saveDownloadedDocument(document, output) {
43447
44238
  const { mkdir, writeFile } = await import("fs/promises");
43448
- const { join: join4, basename: basename4, dirname: dirname2 } = await import("path");
44239
+ const { join: join5, basename: basename4, dirname: dirname3 } = await import("path");
43449
44240
  const target = typeof output === "string" ? output : ".";
43450
44241
  const filename = basename4(document.filename || "document.pdf");
43451
- const filePath = target.endsWith(".pdf") ? target : join4(target, filename);
43452
- await mkdir(dirname2(filePath), { recursive: true });
44242
+ const filePath = target.endsWith(".pdf") ? target : join5(target, filename);
44243
+ await mkdir(dirname3(filePath), { recursive: true });
43453
44244
  await writeFile(filePath, document.data);
43454
44245
  return filePath;
43455
44246
  }
@@ -43477,7 +44268,7 @@ function printMateInfo(mateId, json) {
43477
44268
  async function confirmOrExit(question) {
43478
44269
  const rl = await import("readline");
43479
44270
  const iface = rl.createInterface({ input: process.stdin, output: process.stdout });
43480
- const answer = await new Promise((resolve5) => iface.question(question, resolve5));
44271
+ const answer = await new Promise((resolve6) => iface.question(question, resolve6));
43481
44272
  iface.close();
43482
44273
  if (answer.trim().toLowerCase() !== "y") {
43483
44274
  console.log("Aborted.");
@@ -43487,7 +44278,7 @@ async function confirmOrExit(question) {
43487
44278
  async function promptLine(question) {
43488
44279
  const rl = await import("readline");
43489
44280
  const iface = rl.createInterface({ input: process.stdin, output: process.stdout });
43490
- const answer = await new Promise((resolve5) => iface.question(question, resolve5));
44281
+ const answer = await new Promise((resolve6) => iface.question(question, resolve6));
43491
44282
  iface.close();
43492
44283
  return answer.trim();
43493
44284
  }
@@ -43495,7 +44286,7 @@ async function promptSecret(question) {
43495
44286
  if (!process.stdin.isTTY) {
43496
44287
  return promptLine(question);
43497
44288
  }
43498
- return new Promise((resolve5) => {
44289
+ return new Promise((resolve6) => {
43499
44290
  const stdin2 = process.stdin;
43500
44291
  const wasRaw = stdin2.isRaw;
43501
44292
  let value = "";
@@ -43508,7 +44299,7 @@ async function promptSecret(question) {
43508
44299
  stdin2.off("data", onData);
43509
44300
  stdin2.setRawMode(wasRaw);
43510
44301
  process.stdout.write("\n");
43511
- resolve5(value);
44302
+ resolve6(value);
43512
44303
  return;
43513
44304
  }
43514
44305
  if (char === "") {
@@ -43528,7 +44319,7 @@ async function promptSecret(question) {
43528
44319
  }
43529
44320
  async function writeSecretFile(filePath, content, force = false) {
43530
44321
  const { mkdir, writeFile, stat: stat2 } = await import("fs/promises");
43531
- const { dirname: dirname2 } = await import("path");
44322
+ const { dirname: dirname3 } = await import("path");
43532
44323
  try {
43533
44324
  await stat2(filePath);
43534
44325
  if (!force) throw new Error(`${filePath} already exists. Use --force to overwrite.`);
@@ -43538,7 +44329,7 @@ async function writeSecretFile(filePath, content, force = false) {
43538
44329
  }
43539
44330
  if (error instanceof Error && !("code" in error)) throw error;
43540
44331
  }
43541
- await mkdir(dirname2(filePath), { recursive: true });
44332
+ await mkdir(dirname3(filePath), { recursive: true });
43542
44333
  await writeFile(filePath, content, { mode: 384 });
43543
44334
  return filePath;
43544
44335
  }
@@ -46496,7 +47287,7 @@ async function handleDocs(client, subcommand, rest, flags) {
46496
47287
  }
46497
47288
  if (subcommand === "download") {
46498
47289
  const { writeFile, mkdir } = await import("fs/promises");
46499
- const { join: join4, dirname: dirname2 } = await import("path");
47290
+ const { join: join5, dirname: dirname3 } = await import("path");
46500
47291
  if (flags.all === true) {
46501
47292
  const outputDir = typeof flags.output === "string" ? flags.output : "./openmates-docs";
46502
47293
  const tree = await client.listDocs();
@@ -46505,8 +47296,8 @@ async function handleDocs(client, subcommand, rest, flags) {
46505
47296
  let count = 0;
46506
47297
  for (const slug2 of slugs) {
46507
47298
  const content2 = await client.getDoc(slug2);
46508
- const filePath = join4(outputDir, `${slug2}.md`);
46509
- await mkdir(dirname2(filePath), { recursive: true });
47299
+ const filePath = join5(outputDir, `${slug2}.md`);
47300
+ await mkdir(dirname3(filePath), { recursive: true });
46510
47301
  await writeFile(filePath, content2, "utf-8");
46511
47302
  count++;
46512
47303
  process.stderr.write(`\r Downloaded ${count}/${slugs.length}`);
@@ -46578,8 +47369,8 @@ function isCliEntrypoint() {
46578
47369
  if (!entrypoint) return false;
46579
47370
  try {
46580
47371
  const invokedPath = realpathSync(entrypoint);
46581
- const modulePath = realpathSync(fileURLToPath(import.meta.url));
46582
- return invokedPath === modulePath || basename3(invokedPath) === "cli.js" && dirname(invokedPath) === dirname(modulePath);
47372
+ const modulePath = realpathSync(fileURLToPath2(import.meta.url));
47373
+ return invokedPath === modulePath || basename3(invokedPath) === "cli.js" && dirname2(invokedPath) === dirname2(modulePath);
46583
47374
  } catch {
46584
47375
  return false;
46585
47376
  }