openmates 0.12.0-alpha.11 → 0.12.0-alpha.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-2QG4XPEB.js → chunk-R5Z4FBJJ.js} +924 -160
- package/dist/cli.js +1 -1
- package/dist/index.d.ts +11 -0
- package/dist/index.js +1 -1
- package/fixtures/brandenburger-tor.png +0 -0
- package/fixtures/brandenburger-tor.svg +25 -0
- package/package.json +5 -3
|
@@ -986,14 +986,14 @@ var OpenMatesWsClient = class {
|
|
|
986
986
|
});
|
|
987
987
|
}
|
|
988
988
|
async open(timeoutMs = 1e4) {
|
|
989
|
-
await new Promise((
|
|
989
|
+
await new Promise((resolve6, reject) => {
|
|
990
990
|
const timeout = setTimeout(
|
|
991
991
|
() => reject(new Error("WebSocket open timeout")),
|
|
992
992
|
timeoutMs
|
|
993
993
|
);
|
|
994
994
|
this.socket.once("open", () => {
|
|
995
995
|
clearTimeout(timeout);
|
|
996
|
-
|
|
996
|
+
resolve6();
|
|
997
997
|
});
|
|
998
998
|
this.socket.once("error", (error) => {
|
|
999
999
|
clearTimeout(timeout);
|
|
@@ -1022,15 +1022,15 @@ var OpenMatesWsClient = class {
|
|
|
1022
1022
|
this.socket.send(JSON.stringify({ type, payload }));
|
|
1023
1023
|
}
|
|
1024
1024
|
sendAsync(type, payload) {
|
|
1025
|
-
return new Promise((
|
|
1025
|
+
return new Promise((resolve6, reject) => {
|
|
1026
1026
|
this.socket.send(JSON.stringify({ type, payload }), (error) => {
|
|
1027
1027
|
if (error) reject(error);
|
|
1028
|
-
else
|
|
1028
|
+
else resolve6();
|
|
1029
1029
|
});
|
|
1030
1030
|
});
|
|
1031
1031
|
}
|
|
1032
1032
|
waitForMessage(expectedType, predicate, timeoutMs = 2e4) {
|
|
1033
|
-
return new Promise((
|
|
1033
|
+
return new Promise((resolve6, reject) => {
|
|
1034
1034
|
const onMessage = (rawData) => {
|
|
1035
1035
|
try {
|
|
1036
1036
|
const parsed = JSON.parse(rawData.toString());
|
|
@@ -1041,7 +1041,7 @@ var OpenMatesWsClient = class {
|
|
|
1041
1041
|
return;
|
|
1042
1042
|
}
|
|
1043
1043
|
cleanup();
|
|
1044
|
-
|
|
1044
|
+
resolve6(parsed);
|
|
1045
1045
|
} catch {
|
|
1046
1046
|
}
|
|
1047
1047
|
};
|
|
@@ -1074,14 +1074,14 @@ var OpenMatesWsClient = class {
|
|
|
1074
1074
|
* Used by ensureSynced to consume the full phased-sync event stream.
|
|
1075
1075
|
*/
|
|
1076
1076
|
collectMessages(terminatorType, timeoutMs = 9e4) {
|
|
1077
|
-
return new Promise((
|
|
1077
|
+
return new Promise((resolve6, reject) => {
|
|
1078
1078
|
const collected = [];
|
|
1079
1079
|
const onMessage = (rawData) => {
|
|
1080
1080
|
try {
|
|
1081
1081
|
const parsed = JSON.parse(rawData.toString());
|
|
1082
1082
|
if (parsed.type === terminatorType) {
|
|
1083
1083
|
cleanup();
|
|
1084
|
-
|
|
1084
|
+
resolve6(collected);
|
|
1085
1085
|
return;
|
|
1086
1086
|
}
|
|
1087
1087
|
collected.push(parsed);
|
|
@@ -1094,7 +1094,7 @@ var OpenMatesWsClient = class {
|
|
|
1094
1094
|
};
|
|
1095
1095
|
const onClose = () => {
|
|
1096
1096
|
cleanup();
|
|
1097
|
-
|
|
1097
|
+
resolve6(collected);
|
|
1098
1098
|
};
|
|
1099
1099
|
const timeout = setTimeout(() => {
|
|
1100
1100
|
cleanup();
|
|
@@ -1132,7 +1132,7 @@ var OpenMatesWsClient = class {
|
|
|
1132
1132
|
const timeoutMs = options?.timeoutMs ?? 9e4;
|
|
1133
1133
|
const onStream = options?.onStream;
|
|
1134
1134
|
const asyncEmbedWaitMs = options?.asyncEmbedWaitMs ?? 12e4;
|
|
1135
|
-
return new Promise((
|
|
1135
|
+
return new Promise((resolve6, reject) => {
|
|
1136
1136
|
let latestContent = "";
|
|
1137
1137
|
let messageId = null;
|
|
1138
1138
|
let taskId = null;
|
|
@@ -1189,7 +1189,7 @@ var OpenMatesWsClient = class {
|
|
|
1189
1189
|
if (waitingForUserPayload) {
|
|
1190
1190
|
if (pendingSubChatHandlers.size > 0) return;
|
|
1191
1191
|
cleanup();
|
|
1192
|
-
|
|
1192
|
+
resolve6({
|
|
1193
1193
|
status: "waiting_for_user",
|
|
1194
1194
|
messageId,
|
|
1195
1195
|
taskId,
|
|
@@ -1209,7 +1209,7 @@ var OpenMatesWsClient = class {
|
|
|
1209
1209
|
if (processingEmbedIds.size > 0 && !asyncEmbedTimer) {
|
|
1210
1210
|
asyncEmbedTimer = setTimeout(() => {
|
|
1211
1211
|
cleanup();
|
|
1212
|
-
|
|
1212
|
+
resolve6({
|
|
1213
1213
|
status: "completed",
|
|
1214
1214
|
messageId,
|
|
1215
1215
|
taskId,
|
|
@@ -1226,7 +1226,7 @@ var OpenMatesWsClient = class {
|
|
|
1226
1226
|
}
|
|
1227
1227
|
if (processingEmbedIds.size > 0) return;
|
|
1228
1228
|
cleanup();
|
|
1229
|
-
|
|
1229
|
+
resolve6({
|
|
1230
1230
|
status: "completed",
|
|
1231
1231
|
messageId,
|
|
1232
1232
|
taskId,
|
|
@@ -1440,7 +1440,7 @@ var OpenMatesWsClient = class {
|
|
|
1440
1440
|
const onClose = () => {
|
|
1441
1441
|
if (aiResponseDone) {
|
|
1442
1442
|
cleanup();
|
|
1443
|
-
|
|
1443
|
+
resolve6({
|
|
1444
1444
|
status: "completed",
|
|
1445
1445
|
messageId,
|
|
1446
1446
|
taskId,
|
|
@@ -3681,7 +3681,11 @@ var OpenMatesClient = class _OpenMatesClient {
|
|
|
3681
3681
|
messagePayload.benchmark_metadata = params.benchmarkMetadata;
|
|
3682
3682
|
}
|
|
3683
3683
|
if (params.incognito) {
|
|
3684
|
-
|
|
3684
|
+
const providedHistory = (params.messageHistory ?? []).map((historyMessage) => ({
|
|
3685
|
+
...historyMessage,
|
|
3686
|
+
chat_id: historyMessage.chat_id ?? chatId
|
|
3687
|
+
}));
|
|
3688
|
+
messagePayload.message_history = [...providedHistory, {
|
|
3685
3689
|
message_id: messageId,
|
|
3686
3690
|
chat_id: chatId,
|
|
3687
3691
|
role: "user",
|
|
@@ -4315,7 +4319,7 @@ var OpenMatesClient = class _OpenMatesClient {
|
|
|
4315
4319
|
if (response.data.status === "failed") {
|
|
4316
4320
|
throw new Error(response.data.error ?? "Task failed");
|
|
4317
4321
|
}
|
|
4318
|
-
await new Promise((
|
|
4322
|
+
await new Promise((resolve6) => setTimeout(resolve6, SKILL_TASK_POLL_INTERVAL_MS));
|
|
4319
4323
|
}
|
|
4320
4324
|
throw new Error(`Task ${taskId} did not complete within ${SKILL_TASK_POLL_TIMEOUT_MS / 1e3}s`);
|
|
4321
4325
|
}
|
|
@@ -4536,7 +4540,7 @@ var OpenMatesClient = class _OpenMatesClient {
|
|
|
4536
4540
|
`Rate limited by settings API; retrying in ${Math.ceil(SETTINGS_GET_RATE_LIMIT_RETRY_MS / 1e3)}s...
|
|
4537
4541
|
`
|
|
4538
4542
|
);
|
|
4539
|
-
await new Promise((
|
|
4543
|
+
await new Promise((resolve6) => setTimeout(resolve6, SETTINGS_GET_RATE_LIMIT_RETRY_MS));
|
|
4540
4544
|
response = await this.http.get(normalizedPath, this.getCliRequestHeaders());
|
|
4541
4545
|
}
|
|
4542
4546
|
if (!response.ok) {
|
|
@@ -6037,7 +6041,7 @@ function filenameFromContentDisposition(header2) {
|
|
|
6037
6041
|
return plain?.trim() ?? null;
|
|
6038
6042
|
}
|
|
6039
6043
|
function sleep(ms) {
|
|
6040
|
-
return new Promise((
|
|
6044
|
+
return new Promise((resolve6) => setTimeout(resolve6, ms));
|
|
6041
6045
|
}
|
|
6042
6046
|
function printLogo() {
|
|
6043
6047
|
const W = "\x1B[1;37m";
|
|
@@ -6054,8 +6058,8 @@ function printLogo() {
|
|
|
6054
6058
|
// src/cli.ts
|
|
6055
6059
|
import { createInterface as createInterface3 } from "readline/promises";
|
|
6056
6060
|
import { realpathSync, writeFileSync as writeFileSync5 } from "fs";
|
|
6057
|
-
import { fileURLToPath } from "url";
|
|
6058
|
-
import { basename as basename3, dirname } from "path";
|
|
6061
|
+
import { fileURLToPath as fileURLToPath2 } from "url";
|
|
6062
|
+
import { basename as basename3, dirname as dirname2 } from "path";
|
|
6059
6063
|
import WebSocket2 from "ws";
|
|
6060
6064
|
|
|
6061
6065
|
// ../secret-scanner/src/registry.ts
|
|
@@ -7755,8 +7759,8 @@ async function renderRemotionShareLink(embedId, client, ln) {
|
|
|
7755
7759
|
}
|
|
7756
7760
|
}
|
|
7757
7761
|
function generateQr(value) {
|
|
7758
|
-
return new Promise((
|
|
7759
|
-
qrcode2.generate(value, { small: true }, (qr) =>
|
|
7762
|
+
return new Promise((resolve6) => {
|
|
7763
|
+
qrcode2.generate(value, { small: true }, (qr) => resolve6(qr));
|
|
7760
7764
|
});
|
|
7761
7765
|
}
|
|
7762
7766
|
function remotionMeta(c) {
|
|
@@ -8611,9 +8615,9 @@ function exec(cmd, cwd) {
|
|
|
8611
8615
|
return execSync(cmd, { cwd, encoding: "utf-8", stdio: ["pipe", "pipe", "pipe"] }).trim();
|
|
8612
8616
|
}
|
|
8613
8617
|
function runInteractive(cmd, args, cwd) {
|
|
8614
|
-
return new Promise((
|
|
8618
|
+
return new Promise((resolve6, reject) => {
|
|
8615
8619
|
const child = nodeSpawn(cmd, args, { cwd, stdio: "inherit", shell: false });
|
|
8616
|
-
child.on("close", (code) =>
|
|
8620
|
+
child.on("close", (code) => resolve6(code ?? 1));
|
|
8617
8621
|
child.on("error", reject);
|
|
8618
8622
|
});
|
|
8619
8623
|
}
|
|
@@ -8874,10 +8878,10 @@ function warnIfMissingLlmCredentials(installPath) {
|
|
|
8874
8878
|
}
|
|
8875
8879
|
async function confirmDestructive(phrase) {
|
|
8876
8880
|
const rl = createInterface2({ input: process.stdin, output: process.stderr });
|
|
8877
|
-
return new Promise((
|
|
8881
|
+
return new Promise((resolve6) => {
|
|
8878
8882
|
rl.question(`Type "${phrase}" to confirm: `, (answer) => {
|
|
8879
8883
|
rl.close();
|
|
8880
|
-
|
|
8884
|
+
resolve6(answer.trim() === phrase);
|
|
8881
8885
|
});
|
|
8882
8886
|
});
|
|
8883
8887
|
}
|
|
@@ -29279,6 +29283,15 @@ Only output the final Markdown table. Do NOT include explanations, notes, or any
|
|
|
29279
29283
|
copy_failed: {
|
|
29280
29284
|
text: "Failed to copy to clipboard"
|
|
29281
29285
|
},
|
|
29286
|
+
code_file_downloaded: {
|
|
29287
|
+
text: "Code file downloaded successfully"
|
|
29288
|
+
},
|
|
29289
|
+
code_file_download_failed: {
|
|
29290
|
+
text: "Failed to download code file"
|
|
29291
|
+
},
|
|
29292
|
+
action_failed: {
|
|
29293
|
+
text: "Failed to perform action"
|
|
29294
|
+
},
|
|
29282
29295
|
download_itinerary: {
|
|
29283
29296
|
text: "Download itinerary"
|
|
29284
29297
|
},
|
|
@@ -41521,26 +41534,276 @@ function buildAssistantFeedbackDecision(rating) {
|
|
|
41521
41534
|
|
|
41522
41535
|
// src/benchmark.ts
|
|
41523
41536
|
import { randomUUID as randomUUID3 } from "crypto";
|
|
41524
|
-
import { writeFileSync as writeFileSync4 } from "fs";
|
|
41537
|
+
import { existsSync as existsSync6, mkdtempSync, readFileSync as readFileSync6, readdirSync, writeFileSync as writeFileSync4 } from "fs";
|
|
41538
|
+
import { tmpdir } from "os";
|
|
41539
|
+
import { dirname, join as join4, resolve as resolve5 } from "path";
|
|
41540
|
+
import { fileURLToPath } from "url";
|
|
41525
41541
|
var DEFAULT_JUDGE_MODEL = "google/gemini-3-flash-preview";
|
|
41526
|
-
var
|
|
41542
|
+
var DEFAULT_EXTENSIVE_SIZE = 10;
|
|
41543
|
+
var DEFAULT_PARALLEL = 4;
|
|
41544
|
+
var FIXTURE_IMAGE_SVG = `<?xml version="1.0" encoding="UTF-8"?>
|
|
41545
|
+
<svg xmlns="http://www.w3.org/2000/svg" width="1200" height="800" viewBox="0 0 1200 800">
|
|
41546
|
+
<rect width="1200" height="800" fill="#d8ecff"/>
|
|
41547
|
+
<rect y="560" width="1200" height="240" fill="#d7c39a"/>
|
|
41548
|
+
<text x="600" y="88" text-anchor="middle" font-family="Arial, sans-serif" font-size="44" font-weight="700" fill="#23344d">Brandenburger Tor, Berlin</text>
|
|
41549
|
+
<g transform="translate(160 170)" fill="#c9aa6a" stroke="#5d4522" stroke-width="8">
|
|
41550
|
+
<rect x="80" y="160" width="800" height="58"/>
|
|
41551
|
+
<rect x="120" y="218" width="720" height="48"/>
|
|
41552
|
+
<rect x="150" y="266" width="660" height="42"/>
|
|
41553
|
+
<g fill="#d9bd7d">
|
|
41554
|
+
<rect x="170" y="308" width="54" height="250"/>
|
|
41555
|
+
<rect x="285" y="308" width="54" height="250"/>
|
|
41556
|
+
<rect x="400" y="308" width="54" height="250"/>
|
|
41557
|
+
<rect x="515" y="308" width="54" height="250"/>
|
|
41558
|
+
<rect x="630" y="308" width="54" height="250"/>
|
|
41559
|
+
<rect x="745" y="308" width="54" height="250"/>
|
|
41560
|
+
</g>
|
|
41561
|
+
<rect x="130" y="558" width="700" height="50"/>
|
|
41562
|
+
<path d="M480 30 C530 72 620 88 682 48 L720 84 C652 142 530 124 456 78 Z" fill="#3e6f5f"/>
|
|
41563
|
+
<circle cx="510" cy="92" r="22" fill="#3e6f5f"/>
|
|
41564
|
+
<circle cx="625" cy="92" r="22" fill="#3e6f5f"/>
|
|
41565
|
+
<path d="M565 38 l26 78 h-52 z" fill="#3e6f5f"/>
|
|
41566
|
+
</g>
|
|
41567
|
+
<text x="600" y="740" text-anchor="middle" font-family="Arial, sans-serif" font-size="32" fill="#23344d">Neoclassical gate with Quadriga on top</text>
|
|
41568
|
+
</svg>
|
|
41569
|
+
`;
|
|
41570
|
+
var QUICK_CASES = [
|
|
41527
41571
|
{
|
|
41528
|
-
id: "
|
|
41529
|
-
suite: "
|
|
41572
|
+
id: "quick-exact-token",
|
|
41573
|
+
suite: "quick",
|
|
41574
|
+
title: "Exact token smoke test",
|
|
41530
41575
|
prompt: "Reply with exactly this token and no extra text: BENCHMARK_SMOKE_OK",
|
|
41531
|
-
|
|
41576
|
+
complexity: "basic",
|
|
41577
|
+
category: "smoke",
|
|
41578
|
+
expectedIncludes: "BENCHMARK_SMOKE_OK",
|
|
41579
|
+
judge: true,
|
|
41580
|
+
estimatedInputTokens: 12e3,
|
|
41581
|
+
estimatedOutputTokens: 64
|
|
41532
41582
|
},
|
|
41533
41583
|
{
|
|
41534
|
-
id: "arithmetic
|
|
41535
|
-
suite: "
|
|
41584
|
+
id: "quick-arithmetic",
|
|
41585
|
+
suite: "quick",
|
|
41586
|
+
title: "Arithmetic direct answer",
|
|
41536
41587
|
prompt: "Compute 19 * 23. Reply with only the integer result.",
|
|
41537
|
-
|
|
41588
|
+
complexity: "basic",
|
|
41589
|
+
category: "math",
|
|
41590
|
+
expectedIncludes: "437",
|
|
41591
|
+
judge: true,
|
|
41592
|
+
estimatedInputTokens: 12e3,
|
|
41593
|
+
estimatedOutputTokens: 64
|
|
41594
|
+
},
|
|
41595
|
+
{
|
|
41596
|
+
id: "quick-code",
|
|
41597
|
+
suite: "quick",
|
|
41598
|
+
title: "Small code generation",
|
|
41599
|
+
prompt: "Write a TypeScript function isPalindrome(input: string): boolean that ignores spaces, punctuation, and case. Include only the function and one short usage example.",
|
|
41600
|
+
complexity: "medium",
|
|
41601
|
+
category: "coding",
|
|
41602
|
+
judge: true,
|
|
41603
|
+
estimatedInputTokens: 12200,
|
|
41604
|
+
estimatedOutputTokens: 650
|
|
41538
41605
|
},
|
|
41539
41606
|
{
|
|
41540
|
-
id: "
|
|
41541
|
-
suite: "
|
|
41542
|
-
|
|
41543
|
-
|
|
41607
|
+
id: "quick-image-brandenburger-tor",
|
|
41608
|
+
suite: "quick",
|
|
41609
|
+
title: "Default image understanding",
|
|
41610
|
+
prompt: "Look at the attached image. What landmark is shown, when was it built, and who designed it? Answer in three concise bullet points.",
|
|
41611
|
+
complexity: "medium",
|
|
41612
|
+
category: "image",
|
|
41613
|
+
image: "default",
|
|
41614
|
+
expectedIncludes: "Brandenburg",
|
|
41615
|
+
judge: true,
|
|
41616
|
+
estimatedInputTokens: 13500,
|
|
41617
|
+
estimatedOutputTokens: 350
|
|
41618
|
+
},
|
|
41619
|
+
{
|
|
41620
|
+
id: "quick-followup-continuity",
|
|
41621
|
+
suite: "quick",
|
|
41622
|
+
title: "Short multi-turn continuity",
|
|
41623
|
+
prompt: "Create a three-step plan for evaluating whether a new AI model is ready for production use.",
|
|
41624
|
+
complexity: "medium",
|
|
41625
|
+
category: "multi_turn",
|
|
41626
|
+
judge: true,
|
|
41627
|
+
estimatedInputTokens: 14e3,
|
|
41628
|
+
estimatedOutputTokens: 900,
|
|
41629
|
+
followUps: [
|
|
41630
|
+
{ prompt: "Now make step 2 more concrete with two measurable checks." },
|
|
41631
|
+
{ prompt: "Summarize the final plan in one sentence." }
|
|
41632
|
+
]
|
|
41633
|
+
}
|
|
41634
|
+
];
|
|
41635
|
+
var EXTENSIVE_CASES = [
|
|
41636
|
+
...QUICK_CASES,
|
|
41637
|
+
{
|
|
41638
|
+
id: "extensive-coding-debug",
|
|
41639
|
+
suite: "extensive",
|
|
41640
|
+
title: "Debug a JavaScript bug",
|
|
41641
|
+
prompt: "A JavaScript function returns NaN when summing prices from [{price: '12.50'}, {price: undefined}]. Explain the bug and write a corrected function.",
|
|
41642
|
+
complexity: "medium",
|
|
41643
|
+
category: "coding",
|
|
41644
|
+
judge: true,
|
|
41645
|
+
estimatedInputTokens: 12300,
|
|
41646
|
+
estimatedOutputTokens: 850
|
|
41647
|
+
},
|
|
41648
|
+
{
|
|
41649
|
+
id: "extensive-coding-api-design",
|
|
41650
|
+
suite: "extensive",
|
|
41651
|
+
title: "Design a small API contract",
|
|
41652
|
+
prompt: "Design a minimal JSON API for creating and listing benchmark runs. Include request/response examples and one validation error.",
|
|
41653
|
+
complexity: "advanced",
|
|
41654
|
+
category: "coding",
|
|
41655
|
+
judge: true,
|
|
41656
|
+
estimatedInputTokens: 12300,
|
|
41657
|
+
estimatedOutputTokens: 1e3
|
|
41658
|
+
},
|
|
41659
|
+
{
|
|
41660
|
+
id: "extensive-reasoning-tradeoffs",
|
|
41661
|
+
suite: "extensive",
|
|
41662
|
+
title: "Reason about benchmark tradeoffs",
|
|
41663
|
+
prompt: "Compare deterministic assertions and LLM-as-judge evaluation for model benchmarks. Give two strengths and two risks for each.",
|
|
41664
|
+
complexity: "medium",
|
|
41665
|
+
category: "reasoning",
|
|
41666
|
+
judge: true,
|
|
41667
|
+
estimatedInputTokens: 12200,
|
|
41668
|
+
estimatedOutputTokens: 800
|
|
41669
|
+
},
|
|
41670
|
+
{
|
|
41671
|
+
id: "extensive-planning",
|
|
41672
|
+
suite: "extensive",
|
|
41673
|
+
title: "Operational rollout plan",
|
|
41674
|
+
prompt: "Create a rollout checklist for switching a production chatbot from one model to another. Include monitoring, rollback, and user-visible risk checks.",
|
|
41675
|
+
complexity: "advanced",
|
|
41676
|
+
category: "synthesis",
|
|
41677
|
+
judge: true,
|
|
41678
|
+
estimatedInputTokens: 12300,
|
|
41679
|
+
estimatedOutputTokens: 950
|
|
41680
|
+
},
|
|
41681
|
+
{
|
|
41682
|
+
id: "extensive-long-context-followup",
|
|
41683
|
+
suite: "extensive",
|
|
41684
|
+
title: "Prebuilt 20-message long chat follow-up",
|
|
41685
|
+
prompt: "Based on the earlier discussion, choose the best launch strategy and explain why in five bullets.",
|
|
41686
|
+
complexity: "advanced",
|
|
41687
|
+
category: "long_context",
|
|
41688
|
+
longContext: true,
|
|
41689
|
+
judge: true,
|
|
41690
|
+
estimatedInputTokens: 18500,
|
|
41691
|
+
estimatedOutputTokens: 900
|
|
41692
|
+
},
|
|
41693
|
+
{
|
|
41694
|
+
id: "extensive-policy-summary",
|
|
41695
|
+
suite: "extensive",
|
|
41696
|
+
title: "Policy summarization",
|
|
41697
|
+
prompt: "Summarize why privacy-preserving benchmark logs should avoid raw user prompts. Include a concrete safer alternative.",
|
|
41698
|
+
complexity: "medium",
|
|
41699
|
+
category: "reasoning",
|
|
41700
|
+
judge: true,
|
|
41701
|
+
estimatedInputTokens: 12200,
|
|
41702
|
+
estimatedOutputTokens: 650
|
|
41703
|
+
},
|
|
41704
|
+
{
|
|
41705
|
+
id: "extensive-structured-output",
|
|
41706
|
+
suite: "extensive",
|
|
41707
|
+
title: "Structured JSON output",
|
|
41708
|
+
prompt: "Return only JSON with keys risk, mitigation, and confidence for the risk: benchmark results are biased by prompt wording.",
|
|
41709
|
+
complexity: "medium",
|
|
41710
|
+
category: "synthesis",
|
|
41711
|
+
judge: true,
|
|
41712
|
+
estimatedInputTokens: 12200,
|
|
41713
|
+
estimatedOutputTokens: 350
|
|
41714
|
+
},
|
|
41715
|
+
{
|
|
41716
|
+
id: "extensive-creative-constraint",
|
|
41717
|
+
suite: "extensive",
|
|
41718
|
+
title: "Creative constrained response",
|
|
41719
|
+
prompt: "Write a six-line product note announcing model comparisons. Each line must be under 70 characters and avoid hype words like revolutionary or magical.",
|
|
41720
|
+
complexity: "medium",
|
|
41721
|
+
category: "synthesis",
|
|
41722
|
+
judge: true,
|
|
41723
|
+
estimatedInputTokens: 12200,
|
|
41724
|
+
estimatedOutputTokens: 500
|
|
41725
|
+
},
|
|
41726
|
+
{
|
|
41727
|
+
id: "extensive-data-reasoning",
|
|
41728
|
+
suite: "extensive",
|
|
41729
|
+
title: "Interpret metrics",
|
|
41730
|
+
prompt: "A benchmark has pass rates 8/10, 7/10, and 9/10 across three runs. Explain what you can and cannot conclude from this sample.",
|
|
41731
|
+
complexity: "medium",
|
|
41732
|
+
category: "reasoning",
|
|
41733
|
+
judge: true,
|
|
41734
|
+
estimatedInputTokens: 12200,
|
|
41735
|
+
estimatedOutputTokens: 600
|
|
41736
|
+
},
|
|
41737
|
+
{
|
|
41738
|
+
id: "extensive-security-review",
|
|
41739
|
+
suite: "extensive",
|
|
41740
|
+
title: "Security review",
|
|
41741
|
+
prompt: "Review this benchmark design for security risks: it logs prompts, outputs, model ids, and usage costs to a shared file. List risks and safer defaults.",
|
|
41742
|
+
complexity: "advanced",
|
|
41743
|
+
category: "reasoning",
|
|
41744
|
+
judge: true,
|
|
41745
|
+
estimatedInputTokens: 12300,
|
|
41746
|
+
estimatedOutputTokens: 850
|
|
41747
|
+
},
|
|
41748
|
+
{
|
|
41749
|
+
id: "extensive-followup-requirements",
|
|
41750
|
+
suite: "extensive",
|
|
41751
|
+
title: "Three-turn requirements refinement",
|
|
41752
|
+
prompt: "Draft acceptance criteria for a CLI benchmark comparison feature.",
|
|
41753
|
+
complexity: "advanced",
|
|
41754
|
+
category: "multi_turn",
|
|
41755
|
+
judge: true,
|
|
41756
|
+
estimatedInputTokens: 14500,
|
|
41757
|
+
estimatedOutputTokens: 1100,
|
|
41758
|
+
followUps: [
|
|
41759
|
+
{ prompt: "Add one criterion about cost estimation before live runs." },
|
|
41760
|
+
{ prompt: "Add one criterion about partial results after interruption." },
|
|
41761
|
+
{ prompt: "Now compress the criteria to five bullets total." }
|
|
41762
|
+
]
|
|
41763
|
+
},
|
|
41764
|
+
{
|
|
41765
|
+
id: "extensive-coding-tests",
|
|
41766
|
+
suite: "extensive",
|
|
41767
|
+
title: "Write tests for parser behavior",
|
|
41768
|
+
prompt: "Write Node.js test cases for a function parseSuites(value) that accepts quick, extensive, all, and comma-separated lists, and rejects unknown suites.",
|
|
41769
|
+
complexity: "medium",
|
|
41770
|
+
category: "coding",
|
|
41771
|
+
judge: true,
|
|
41772
|
+
estimatedInputTokens: 12300,
|
|
41773
|
+
estimatedOutputTokens: 950
|
|
41774
|
+
},
|
|
41775
|
+
{
|
|
41776
|
+
id: "extensive-coding-refactor",
|
|
41777
|
+
suite: "extensive",
|
|
41778
|
+
title: "Refactor duplicated code",
|
|
41779
|
+
prompt: "Given two duplicated TypeScript loops that build arrays of result objects, explain when to extract a helper and write the helper signature.",
|
|
41780
|
+
complexity: "medium",
|
|
41781
|
+
category: "coding",
|
|
41782
|
+
judge: true,
|
|
41783
|
+
estimatedInputTokens: 12300,
|
|
41784
|
+
estimatedOutputTokens: 750
|
|
41785
|
+
},
|
|
41786
|
+
{
|
|
41787
|
+
id: "extensive-comparison-analysis",
|
|
41788
|
+
suite: "extensive",
|
|
41789
|
+
title: "Compare two model outputs",
|
|
41790
|
+
prompt: "Explain how you would compare two model outputs when one is concise but misses caveats and the other is verbose but complete.",
|
|
41791
|
+
complexity: "medium",
|
|
41792
|
+
category: "reasoning",
|
|
41793
|
+
judge: true,
|
|
41794
|
+
estimatedInputTokens: 12200,
|
|
41795
|
+
estimatedOutputTokens: 650
|
|
41796
|
+
},
|
|
41797
|
+
{
|
|
41798
|
+
id: "extensive-failure-mode",
|
|
41799
|
+
suite: "extensive",
|
|
41800
|
+
title: "Failure-mode analysis",
|
|
41801
|
+
prompt: "List five failure modes for image-understanding benchmarks and one mitigation for each.",
|
|
41802
|
+
complexity: "advanced",
|
|
41803
|
+
category: "image",
|
|
41804
|
+
judge: true,
|
|
41805
|
+
estimatedInputTokens: 12300,
|
|
41806
|
+
estimatedOutputTokens: 900
|
|
41544
41807
|
}
|
|
41545
41808
|
];
|
|
41546
41809
|
async function handleBenchmark(client, subcommand, rest, flags) {
|
|
@@ -41551,122 +41814,103 @@ async function handleBenchmark(client, subcommand, rest, flags) {
|
|
|
41551
41814
|
if (subcommand !== "model") {
|
|
41552
41815
|
throw new Error(`Unknown benchmark command '${subcommand}'. Run 'openmates benchmark --help'.`);
|
|
41553
41816
|
}
|
|
41554
|
-
const
|
|
41555
|
-
if (
|
|
41556
|
-
throw new Error("Missing target model. Usage: openmates benchmark model <provider/model> --confirm-spend-credits");
|
|
41817
|
+
const targetModels = rest.filter((arg) => !arg.startsWith("--"));
|
|
41818
|
+
if (targetModels.length === 0) {
|
|
41819
|
+
throw new Error("Missing target model. Usage: openmates benchmark model <provider/model> [model-b] --confirm-spend-credits");
|
|
41820
|
+
}
|
|
41821
|
+
const compare = flags.compare === true;
|
|
41822
|
+
if (targetModels.length > 1 && !compare) {
|
|
41823
|
+
throw new Error("Multiple target models require --compare.");
|
|
41824
|
+
}
|
|
41825
|
+
if (compare && targetModels.length < 2) {
|
|
41826
|
+
throw new Error("--compare requires at least two target models.");
|
|
41557
41827
|
}
|
|
41558
41828
|
const judgeModel = typeof flags["judge-model"] === "string" ? flags["judge-model"] : DEFAULT_JUDGE_MODEL;
|
|
41559
41829
|
const suites = parseSuites(flags.suite);
|
|
41560
41830
|
const runs = parseRuns(flags.runs);
|
|
41831
|
+
const extensiveSize = parseExtensiveSize(flags["extensive-size"]);
|
|
41832
|
+
const parallel = parseParallel(flags.parallel);
|
|
41561
41833
|
const dryRun = flags["dry-run"] === true;
|
|
41562
41834
|
const output = typeof flags.output === "string" ? flags.output : void 0;
|
|
41563
41835
|
const runId = typeof flags["run-id"] === "string" ? flags["run-id"] : randomUUID3();
|
|
41836
|
+
const imagePath = typeof flags.image === "string" ? resolve5(flags.image) : defaultImageFixturePath();
|
|
41564
41837
|
if (!dryRun && flags["confirm-spend-credits"] !== true) {
|
|
41565
41838
|
throw new Error(
|
|
41566
41839
|
"Benchmark runs spend real credits from the logged-in account. Rerun with --confirm-spend-credits, or use --dry-run to preview the plan."
|
|
41567
41840
|
);
|
|
41568
41841
|
}
|
|
41569
|
-
const cases = expandCases(suites, runs);
|
|
41570
|
-
const
|
|
41571
|
-
|
|
41572
|
-
|
|
41842
|
+
const cases = expandCases(suites, runs, extensiveSize);
|
|
41843
|
+
const pricing = loadPricingForModels([...targetModels, judgeModel]);
|
|
41844
|
+
const estimate = estimateCredits(cases, targetModels, judgeModel, pricing);
|
|
41845
|
+
const result = makeBaseResult({
|
|
41573
41846
|
runId,
|
|
41574
|
-
|
|
41847
|
+
targetModels,
|
|
41575
41848
|
judgeModel,
|
|
41576
41849
|
suites,
|
|
41577
41850
|
runs,
|
|
41578
|
-
|
|
41579
|
-
|
|
41580
|
-
|
|
41581
|
-
|
|
41851
|
+
compare,
|
|
41852
|
+
parallel,
|
|
41853
|
+
extensiveSize,
|
|
41854
|
+
dryRun,
|
|
41855
|
+
estimate,
|
|
41856
|
+
totalJobs: cases.length * targetModels.length
|
|
41857
|
+
});
|
|
41582
41858
|
if (dryRun) {
|
|
41583
|
-
writeBenchmarkResult(
|
|
41859
|
+
writeBenchmarkResult(result, flags, output);
|
|
41584
41860
|
return;
|
|
41585
41861
|
}
|
|
41586
41862
|
if (!client.hasSession()) {
|
|
41587
41863
|
throw new Error("Benchmark runs require login. Run 'openmates login' first.");
|
|
41588
41864
|
}
|
|
41589
|
-
|
|
41590
|
-
|
|
41591
|
-
|
|
41592
|
-
|
|
41593
|
-
|
|
41594
|
-
|
|
41595
|
-
|
|
41596
|
-
|
|
41597
|
-
|
|
41598
|
-
|
|
41599
|
-
|
|
41600
|
-
|
|
41601
|
-
}),
|
|
41602
|
-
precollectResponse: true
|
|
41865
|
+
let interrupted = false;
|
|
41866
|
+
const onInterrupt = () => {
|
|
41867
|
+
interrupted = true;
|
|
41868
|
+
};
|
|
41869
|
+
process.once("SIGINT", onInterrupt);
|
|
41870
|
+
try {
|
|
41871
|
+
const jobs = cases.flatMap((benchmarkCase) => targetModels.map((model) => ({ model, benchmarkCase })));
|
|
41872
|
+
await runPool(jobs, parallel, async (job) => {
|
|
41873
|
+
if (interrupted) return;
|
|
41874
|
+
const caseResult = await runCaseJob({ client, job, judgeModel, runId, imagePath });
|
|
41875
|
+
result.cases.push(caseResult);
|
|
41876
|
+
recomputeResult(result, jobs.length, interrupted);
|
|
41603
41877
|
});
|
|
41604
|
-
|
|
41605
|
-
|
|
41606
|
-
suite: benchmarkCase.suite,
|
|
41607
|
-
run: benchmarkCase.run,
|
|
41608
|
-
prompt: benchmarkCase.prompt,
|
|
41609
|
-
assistant: targetResponse.assistant,
|
|
41610
|
-
modelName: targetResponse.modelName,
|
|
41611
|
-
passed: benchmarkCase.expectedIncludes ? targetResponse.assistant.includes(benchmarkCase.expectedIncludes) : true,
|
|
41612
|
-
durationMs: Date.now() - startedAt,
|
|
41613
|
-
expectedIncludes: benchmarkCase.expectedIncludes
|
|
41614
|
-
};
|
|
41615
|
-
if (benchmarkCase.needsJudge) {
|
|
41616
|
-
const judgeResponse = await client.sendMessage({
|
|
41617
|
-
message: `${modelMention(judgeModel)} ${judgePrompt(benchmarkCase.prompt, targetResponse.assistant)}`,
|
|
41618
|
-
incognito: true,
|
|
41619
|
-
autoApproveSubChats: true,
|
|
41620
|
-
benchmarkMetadata: benchmarkMetadata({
|
|
41621
|
-
runId,
|
|
41622
|
-
suite: benchmarkCase.suite,
|
|
41623
|
-
caseId: `${benchmarkCase.id}:judge`,
|
|
41624
|
-
targetModel,
|
|
41625
|
-
judgeModel
|
|
41626
|
-
}),
|
|
41627
|
-
precollectResponse: true
|
|
41628
|
-
});
|
|
41629
|
-
const judgment = parseJudgment(judgeResponse.assistant);
|
|
41630
|
-
caseResult.judge = {
|
|
41631
|
-
model: judgeModel,
|
|
41632
|
-
score: judgment.score,
|
|
41633
|
-
reason: judgment.reason,
|
|
41634
|
-
raw: judgeResponse.assistant
|
|
41635
|
-
};
|
|
41636
|
-
caseResult.passed = judgment.score !== null && judgment.score >= 4;
|
|
41637
|
-
}
|
|
41638
|
-
baseResult.cases.push(caseResult);
|
|
41878
|
+
} finally {
|
|
41879
|
+
process.off("SIGINT", onInterrupt);
|
|
41639
41880
|
}
|
|
41640
|
-
|
|
41641
|
-
|
|
41642
|
-
writeBenchmarkResult(baseResult, flags, output);
|
|
41881
|
+
recomputeResult(result, cases.length * targetModels.length, interrupted);
|
|
41882
|
+
writeBenchmarkResult(result, flags, output);
|
|
41643
41883
|
}
|
|
41644
41884
|
function printBenchmarkHelp() {
|
|
41645
41885
|
console.log(`Benchmark commands:
|
|
41646
|
-
openmates benchmark model <provider/model> --confirm-spend-credits [--suite
|
|
41886
|
+
openmates benchmark model <provider/model> [provider/model...] --confirm-spend-credits [--compare] [--suite quick|extensive|all] [--json]
|
|
41647
41887
|
|
|
41648
41888
|
Runs real incognito chat requests through the OpenMates product path. Live runs
|
|
41649
41889
|
spend the logged-in user's credits and usage entries are grouped as benchmark spend.
|
|
41650
41890
|
|
|
41651
41891
|
Options:
|
|
41652
41892
|
--confirm-spend-credits Required for live benchmark runs
|
|
41653
|
-
--dry-run Preview the benchmark plan without
|
|
41654
|
-
--
|
|
41893
|
+
--dry-run Preview the benchmark plan without inference or spend
|
|
41894
|
+
--compare Compare two or more target models
|
|
41895
|
+
--suite <list> Comma-separated suites: quick, extensive, all (default: quick)
|
|
41896
|
+
--extensive-size <n> Extensive cases to run: 5, 10, or 20 (default: ${DEFAULT_EXTENSIVE_SIZE})
|
|
41655
41897
|
--runs <n> Repeat each selected case (default: 1)
|
|
41656
|
-
--
|
|
41898
|
+
--parallel <n> Concurrent target case requests (default: ${DEFAULT_PARALLEL})
|
|
41899
|
+
--judge-model <provider/model> Judge for evaluated cases (default: ${DEFAULT_JUDGE_MODEL})
|
|
41900
|
+
--image <path> Override default Brandenburger Tor image fixture
|
|
41657
41901
|
--run-id <id> Reuse a benchmark run id for grouping
|
|
41658
41902
|
--output <path> Save JSON result to a file
|
|
41659
41903
|
--json Print JSON result`);
|
|
41660
41904
|
}
|
|
41661
41905
|
function parseSuites(value) {
|
|
41662
|
-
if (value === void 0 || value === false) return ["
|
|
41906
|
+
if (value === void 0 || value === false) return ["quick"];
|
|
41663
41907
|
if (value === true) throw new Error("--suite requires a value");
|
|
41664
41908
|
const suites = value.split(",").map((suite) => suite.trim()).filter(Boolean);
|
|
41665
|
-
if (suites.includes("all")) return ["
|
|
41666
|
-
const allowed = /* @__PURE__ */ new Set(["
|
|
41909
|
+
if (suites.includes("all")) return ["quick", "extensive"];
|
|
41910
|
+
const allowed = /* @__PURE__ */ new Set(["quick", "extensive"]);
|
|
41667
41911
|
const invalid = suites.filter((suite) => !allowed.has(suite));
|
|
41668
41912
|
if (invalid.length > 0 || suites.length === 0) {
|
|
41669
|
-
throw new Error("Invalid --suite. Use
|
|
41913
|
+
throw new Error("Invalid --suite. Use quick, extensive, or all.");
|
|
41670
41914
|
}
|
|
41671
41915
|
return [...new Set(suites)];
|
|
41672
41916
|
}
|
|
@@ -41679,14 +41923,306 @@ function parseRuns(value) {
|
|
|
41679
41923
|
}
|
|
41680
41924
|
return parsed;
|
|
41681
41925
|
}
|
|
41682
|
-
function
|
|
41683
|
-
|
|
41926
|
+
/**
 * Parse the `--extensive-size` flag.
 *
 * @param {string|number|boolean|undefined} value Raw flag value; `undefined`/`false`
 *   means the flag was not given, `true` means it was given without a value.
 * @returns {number} 5, 10, or 20, or `DEFAULT_EXTENSIVE_SIZE` when the flag is absent.
 * @throws {Error} When the value is missing or is not exactly 5, 10, or 20.
 */
function parseExtensiveSize(value) {
  if (value === void 0 || value === false) return DEFAULT_EXTENSIVE_SIZE;
  if (value === true) throw new Error("--extensive-size requires a value");
  const text = String(value).trim();
  // Number.parseInt stops at the first non-digit, so "5abc" or "10.9" would
  // pass as 5 / 10. Require the whole token to be digits first.
  const parsed = /^\d+$/.test(text) ? Number.parseInt(text, 10) : Number.NaN;
  if (![5, 10, 20].includes(parsed)) {
    throw new Error("--extensive-size must be 5, 10, or 20");
  }
  return parsed;
}
|
|
41935
|
+
/**
 * Parse the `--parallel` flag.
 *
 * @param {string|number|boolean|undefined} value Raw flag value; `undefined`/`false`
 *   means the flag was not given, `true` means it was given without a value.
 * @returns {number} An integer from 1 to 20, or `DEFAULT_PARALLEL` when the flag is absent.
 * @throws {Error} When the value is missing or is not a whole number in 1..20.
 */
function parseParallel(value) {
  if (value === void 0 || value === false) return DEFAULT_PARALLEL;
  if (value === true) throw new Error("--parallel requires a value");
  const text = String(value).trim();
  // Reject partially numeric input ("3x", "2.7") that Number.parseInt would
  // otherwise truncate to a valid-looking integer.
  const parsed = /^\d+$/.test(text) ? Number.parseInt(text, 10) : Number.NaN;
  if (!Number.isInteger(parsed) || parsed < 1 || parsed > 20) {
    throw new Error("--parallel must be an integer from 1 to 20");
  }
  return parsed;
}
|
|
41944
|
+
/**
 * Build the list of case executions for the selected suites.
 *
 * Cases from the quick suite and the (size-limited) extensive suite are merged,
 * de-duplicated, and then repeated once per run with a 1-based `run` number.
 *
 * @param {string[]} suites Selected suite names ("quick" and/or "extensive").
 * @param {number} runs How many times each selected case is repeated.
 * @param {number} extensiveSize Size passed to `selectExtensiveCases`.
 * @returns {object[]} Copies of the selected cases, each with a `run` field,
 *   grouped by run.
 */
function expandCases(suites, runs, extensiveSize) {
  const pool = [];
  if (suites.includes("quick")) pool.push(...QUICK_CASES);
  if (suites.includes("extensive")) pool.push(...selectExtensiveCases(extensiveSize));
  const uniqueCases = dedupeCases(pool);
  const expanded = [];
  let run = 1;
  while (run <= runs) {
    const currentRun = run;
    expanded.push(...uniqueCases.map((benchmarkCase) => ({ ...benchmarkCase, run: currentRun })));
    run += 1;
  }
  return expanded;
}
|
|
41955
|
+
/**
 * Pick the first `size` extensive cases while guaranteeing a minimum share of
 * coding cases.
 *
 * At least `ceil(size * 0.15)` of the returned cases have `category === "coding"`
 * when enough coding cases exist. If the leading slice has too few, coding cases
 * from the remainder replace non-coding cases starting from the end of the slice.
 *
 * @param {number} size Number of extensive cases requested.
 * @returns {object[]} Selected cases with unique ids.
 */
function selectExtensiveCases(size) {
  const isCoding = (benchmarkCase) => benchmarkCase?.category === "coding";
  const uniqueCases = dedupeCases(EXTENSIVE_CASES);
  const result = uniqueCases.slice(0, size);
  const minimumCoding = Math.ceil(size * 0.15);
  let codingCount = result.filter(isCoding).length;
  if (codingCount >= minimumCoding) return result;
  const selectedIds = new Set(result.map((benchmarkCase) => benchmarkCase.id));
  // Draw backfill candidates from the de-duplicated list: taking them from the
  // raw list could insert the same coding case twice when it is listed twice.
  const codingBackfill = uniqueCases.filter(
    (benchmarkCase) => isCoding(benchmarkCase) && !selectedIds.has(benchmarkCase.id)
  );
  // Everything to the right of the cursor is already a coding case, so the
  // next replacement target is always at or to the left of it.
  let cursor = result.length - 1;
  for (const codingCase of codingBackfill) {
    while (cursor >= 0 && isCoding(result[cursor])) cursor -= 1;
    if (cursor < 0) break;
    result[cursor] = codingCase;
    codingCount += 1;
    if (codingCount >= minimumCoding) break;
  }
  return result;
}
|
|
41979
|
+
/**
 * Remove cases whose `id` was already seen, keeping the first occurrence and
 * the original ordering.
 *
 * @param {Iterable<{id: *}>} cases Cases to filter.
 * @returns {object[]} New array holding the first case for each distinct id.
 */
function dedupeCases(cases) {
  // A Map iterates in insertion order, so the first case per id is kept in place.
  const firstById = new Map();
  for (const candidate of cases) {
    if (!firstById.has(candidate.id)) firstById.set(candidate.id, candidate);
  }
  return [...firstById.values()];
}
|
|
41989
|
+
/**
 * Run one benchmark case against one target model and return its result record.
 *
 * Sends the initial prompt, then each follow-up in the same chat, recording a
 * turn per request and extending the message history after every exchange.
 * When the case requests judging, the judge verdict is attached and the case
 * only passes if the judge score is at least 4 and the `expectedIncludes`
 * check (if any) also passed. Any thrown error is converted into a failed
 * result carrying the error message and the turns collected so far.
 *
 * @param {{client: object, job: {model: string, benchmarkCase: object},
 *   judgeModel: string, runId: string, imagePath: string}} params
 * @returns {Promise<object>} The case result; never rejects.
 */
async function runCaseJob(params) {
  const { client, job, judgeModel, runId, imagePath } = params;
  const { model, benchmarkCase } = job;
  const startedAt = Date.now();
  const turns = [];
  const history = benchmarkCase.longContext ? buildLongContextHistory() : [];
  const latestTurn = () => turns.at(-1);
  // Fields shared by the success and failure records, in output order.
  const snapshot = (passed) => ({
    id: benchmarkCase.id,
    suite: benchmarkCase.suite,
    title: benchmarkCase.title,
    model,
    run: benchmarkCase.run,
    complexity: benchmarkCase.complexity,
    category: benchmarkCase.category,
    prompt: benchmarkCase.prompt,
    assistant: latestTurn()?.assistant ?? "",
    modelName: latestTurn()?.modelName ?? null,
    passed,
    durationMs: Date.now() - startedAt,
    expectedIncludes: benchmarkCase.expectedIncludes,
    turns
  });
  const sharedTurnParams = { client, model, judgeModel, runId, benchmarkCase, history };
  let chatId;
  try {
    const opening = await buildPromptWithAttachments(client, benchmarkCase, model, imagePath);
    const firstReply = await sendBenchmarkTurn({
      ...sharedTurnParams,
      prompt: opening.message,
      chatId,
      preparedEmbeds: opening.embeds,
      caseId: benchmarkCase.id
    });
    chatId = firstReply.chatId;
    turns.push(firstReply.turn);
    appendHistory(history, "user", opening.message);
    appendHistory(history, "assistant", firstReply.turn.assistant);
    let followUpNumber = 0;
    for (const followUp of benchmarkCase.followUps ?? []) {
      followUpNumber += 1;
      const reply = await sendBenchmarkTurn({
        ...sharedTurnParams,
        prompt: `${modelMention(model)} ${followUp.prompt}`,
        chatId,
        caseId: `${benchmarkCase.id}:followup-${followUpNumber}`
      });
      chatId = reply.chatId;
      turns.push(reply.turn);
      appendHistory(history, "user", reply.rawPrompt);
      appendHistory(history, "assistant", reply.turn.assistant);
    }
    // The deterministic check only looks at the final assistant message.
    const finalAnswer = latestTurn()?.assistant ?? "";
    const expected = benchmarkCase.expectedIncludes;
    const caseResult = snapshot(expected ? finalAnswer.includes(expected) : true);
    if (benchmarkCase.judge) {
      const verdict = await judgeCase({ client, judgeModel, targetModel: model, benchmarkCase, caseResult, runId });
      caseResult.judge = verdict;
      caseResult.passed = verdict.score !== null && verdict.score >= 4 && caseResult.passed;
    }
    return caseResult;
  } catch (error) {
    const failure = snapshot(false);
    failure.error = error instanceof Error ? error.message : String(error);
    return failure;
  }
}
|
|
42074
|
+
/**
 * Send a single benchmark prompt through the client and time the round trip.
 *
 * The message is sent incognito with sub-chat auto-approval and response
 * pre-collection enabled, tagged with benchmark metadata for the run and case.
 *
 * @param {{client: object, prompt: string, chatId: (string|undefined),
 *   runId: string, benchmarkCase: {suite: string}, caseId: string,
 *   model: string, judgeModel: string, history: object[],
 *   preparedEmbeds: (object[]|undefined)}} params
 * @returns {Promise<{chatId: *, rawPrompt: string, turn: {prompt: string,
 *   assistant: *, modelName: *, durationMs: number}}>}
 */
async function sendBenchmarkTurn(params) {
  const { client, prompt, chatId, runId, benchmarkCase, caseId, model, judgeModel, history, preparedEmbeds } = params;
  const startedAt = Date.now();
  const metadata = benchmarkMetadata({
    runId,
    suite: benchmarkCase.suite,
    caseId,
    targetModel: model,
    judgeModel
  });
  const request = {
    message: prompt,
    chatId,
    incognito: true,
    autoApproveSubChats: true,
    benchmarkMetadata: metadata,
    messageHistory: history,
    preparedEmbeds,
    precollectResponse: true
  };
  const reply = await client.sendMessage(request);
  const turn = {
    prompt,
    assistant: reply.assistant,
    modelName: reply.modelName,
    durationMs: Date.now() - startedAt
  };
  return { chatId: reply.chatId, rawPrompt: prompt, turn };
}
|
|
42103
|
+
/**
 * Build the initial message for a case, attaching the benchmark image when the
 * case asks for the default image.
 *
 * @param {object} client Client forwarded to `prepareImageAttachment`.
 * @param {{prompt: string, image: (string|undefined)}} benchmarkCase
 * @param {string} model Target model, rendered via `modelMention`.
 * @param {string} imagePath Path of the image to attach.
 * @returns {Promise<{message: string, embeds: (object[]|undefined)}>} The
 *   message text; `embeds` is present only for image cases.
 */
async function buildPromptWithAttachments(client, benchmarkCase, model, imagePath) {
  const textOnly = `${modelMention(model)} ${benchmarkCase.prompt}`;
  if (benchmarkCase.image === "default") {
    const { messageSuffix, embeds } = await prepareImageAttachment(client, imagePath);
    // The attachment reference goes after the prompt, separated by a blank line.
    return { message: `${textOnly}\n\n${messageSuffix}`, embeds };
  }
  return { message: textOnly };
}
|
|
42111
|
+
/**
 * Turn the benchmark image file into a message suffix plus embed.
 *
 * The file is run through `processFiles`; the first produced embed is used and
 * is uploaded first when it is flagged as requiring upload and has a local path.
 *
 * @param {object} client Client forwarded to `uploadBenchmarkImage`.
 * @param {string} imagePath Path of the image file.
 * @returns {Promise<{messageSuffix: string, embeds: object[]}>}
 * @throws {Error} When the file does not exist or processing yields a blocked
 *   entry, an error entry, or no embed.
 */
async function prepareImageAttachment(client, imagePath) {
  if (!existsSync6(imagePath)) throw new Error(`Benchmark image not found: ${imagePath}`);
  const processed = processFiles([imagePath], null);
  const unusable = processed.blocked.length > 0 || processed.errors.length > 0 || processed.embeds.length === 0;
  if (unusable) {
    const problems = [...processed.blocked, ...processed.errors].map((entry) => entry.error);
    const reason = problems.join("; ") || "no image embed produced";
    throw new Error(`Failed to prepare benchmark image: ${reason}`);
  }
  const fileEmbed = processed.embeds[0];
  if (fileEmbed.requiresUpload && fileEmbed.localPath) {
    await uploadBenchmarkImage(client, fileEmbed);
  }
  // Read referenceBlock only after the optional upload, which rewrites it.
  return { messageSuffix: fileEmbed.referenceBlock, embeds: [fileEmbed.embed] };
}
|
|
42125
|
+
/**
 * Upload the image behind `fileEmbed` and rewrite the embed in place so it
 * describes the finished upload.
 *
 * Does nothing when `fileEmbed.localPath` is empty. Otherwise the embed gets an
 * `embedRef` (kept if already set), encoded `content`, `status`, `contentHash`
 * and `embedId`, and `fileEmbed.referenceBlock` is regenerated from the ref.
 *
 * @param {object} client Provides the session via `client.getSession()`.
 * @param {object} fileEmbed File embed descriptor; mutated.
 * @returns {Promise<void>}
 */
async function uploadBenchmarkImage(client, fileEmbed) {
  if (!fileEmbed.localPath) return;
  const uploaded = await uploadFile(fileEmbed.localPath, client.getSession());
  const target = fileEmbed.embed;
  const embedRef = target.embedRef ?? `benchmark-image-${uploaded.embed_id.slice(0, 8)}`;
  target.embedRef = embedRef;
  const uploadDescription = {
    type: "image",
    app_id: "images",
    skill_id: "upload",
    status: "finished",
    filename: fileEmbed.displayName,
    embed_ref: embedRef,
    content_hash: uploaded.content_hash,
    s3_base_url: uploaded.s3_base_url,
    files: uploaded.files,
    aes_key: uploaded.aes_key,
    aes_nonce: uploaded.aes_nonce,
    vault_wrapped_aes_key: uploaded.vault_wrapped_aes_key,
    ai_detection: uploaded.ai_detection
  };
  target.content = toonEncodeContent(uploadDescription);
  target.status = "finished";
  target.contentHash = uploaded.content_hash;
  target.embedId = uploaded.embed_id;
  fileEmbed.referenceBlock = createEmbedReferenceBlock(embedRef);
}
|
|
42150
|
+
/**
 * Ask the judge model to score a finished case.
 *
 * Sends the judge prompt in a separate incognito message tagged with benchmark
 * metadata (case id suffixed with `:judge:<targetModel>`), then parses the
 * reply into a score and reason.
 *
 * @param {{client: object, judgeModel: string, targetModel: string,
 *   benchmarkCase: object, caseResult: object, runId: string}} params
 * @returns {Promise<{model: string, score: *, reason: *, raw: *, durationMs: number}>}
 *   `raw` is the judge's unparsed reply.
 */
async function judgeCase(params) {
  const { client, judgeModel, targetModel, benchmarkCase, caseResult, runId } = params;
  const startedAt = Date.now();
  const message = `${modelMention(judgeModel)} ${judgePrompt(targetModel, benchmarkCase, caseResult)}`;
  const metadata = benchmarkMetadata({
    runId,
    suite: benchmarkCase.suite,
    caseId: `${benchmarkCase.id}:judge:${targetModel}`,
    targetModel,
    judgeModel
  });
  const reply = await client.sendMessage({
    message,
    incognito: true,
    autoApproveSubChats: true,
    benchmarkMetadata: metadata,
    precollectResponse: true
  });
  const { score, reason } = parseJudgment(reply.assistant);
  return {
    model: judgeModel,
    score,
    reason,
    raw: reply.assistant,
    durationMs: Date.now() - startedAt
  };
}
|
|
42174
|
+
/**
 * Run `worker` over every item with at most `parallel` invocations in flight.
 *
 * Items are started in array order; each pool slot pulls the next unstarted
 * item once its current one settles. The returned promise rejects with the
 * first worker error (other slots are not cancelled).
 *
 * @param {Array} items Items to process.
 * @param {number} parallel Maximum concurrent workers; values below 1 or NaN
 *   are treated as 1 so items are never silently skipped.
 * @param {(item: *) => Promise<*>} worker Async handler called once per item.
 * @returns {Promise<void>}
 */
async function runPool(items, parallel, worker) {
  // With parallel of 0 or NaN, Array.from would create no workers and the
  // pool would resolve without processing anything.
  const limit = parallel >= 1 ? parallel : 1;
  const workerCount = Math.min(limit, items.length);
  let nextIndex = 0;
  const drain = async () => {
    // Claiming an index is synchronous, so no two slots take the same item.
    while (nextIndex < items.length) {
      const item = items[nextIndex];
      nextIndex += 1;
      await worker(item);
    }
  };
  await Promise.all(Array.from({ length: workerCount }, () => drain()));
}
|
|
42185
|
+
/**
 * Build the predefined 20-message conversation used by long-context cases.
 *
 * Messages alternate user/assistant starting with the user. Timestamps are in
 * seconds, start 2000 seconds before now, and advance 30 seconds per message.
 *
 * @returns {{message_id: string, role: string, sender_name: string,
 *   content: string, created_at: number}[]}
 */
function buildLongContextHistory() {
  // Even positions are user messages, odd positions are assistant replies.
  const script = [
    "We need to launch a CLI benchmark for model comparisons.",
    "The first goal should be a quick suite with deterministic checks.",
    "The benchmark also needs image inference.",
    "Use a public fixture image and ask a factual visual question.",
    "We should avoid wasting credits.",
    "Run a pricing preflight and require explicit spend confirmation.",
    "What about longer conversations?",
    "Add a 20-message predefined history and a dependent follow-up.",
    "The extensive suite should not be too small.",
    "Default to 10 cases and allow 5 or 20 as alternatives.",
    "Coding quality matters.",
    "Reserve at least 15 percent of extensive cases for coding prompts.",
    "We also need comparison mode.",
    "Accept multiple models with --compare and run target jobs in parallel.",
    "How should judging work?",
    "Judge each completed case immediately with Gemini so partial results remain useful.",
    "What if the process is interrupted?",
    "Print or write a partial summary with completed judgments and skipped counts.",
    "What is the best launch strategy?",
    "Ship quick and comparison first, then use extensive for slower releases."
  ];
  const firstTimestamp = Math.floor(Date.now() / 1e3) - 2e3;
  return script.map((content, position) => {
    const fromUser = position % 2 === 0;
    return {
      message_id: `benchmark-history-${position + 1}`,
      role: fromUser ? "user" : "assistant",
      sender_name: fromUser ? "User" : "Assistant",
      content,
      created_at: firstTimestamp + position * 30
    };
  });
}
|
|
42217
|
+
/**
 * Append one message to a history array in place.
 *
 * @param {object[]} history History to extend; mutated.
 * @param {string} role "user" maps to sender name "User"; any other role is
 *   labelled "Assistant".
 * @param {*} content Message content.
 * @returns {void}
 */
function appendHistory(history, role, content) {
  const senderName = role === "user" ? "User" : "Assistant";
  const entry = {
    message_id: randomUUID3(),
    role,
    sender_name: senderName,
    content,
    // Timestamps in the history are in whole seconds.
    created_at: Math.floor(Date.now() / 1e3)
  };
  history.push(entry);
}
|
|
41690
42226
|
function modelMention(model) {
|
|
41691
42227
|
const separator = model.indexOf("/");
|
|
41692
42228
|
if (separator === -1) return `@ai-model:${model}`;
|
|
@@ -41705,16 +42241,28 @@ function benchmarkMetadata(params) {
|
|
|
41705
42241
|
benchmark_judge_model: params.judgeModel
|
|
41706
42242
|
};
|
|
41707
42243
|
}
|
|
41708
|
-
function judgePrompt(
|
|
42244
|
+
/**
 * Compose the instruction text sent to the judge model.
 *
 * The prompt asks for exactly two lines (`BENCHMARK_SCORE=` and
 * `BENCHMARK_REASON=`) and then lists the target model, the case identity, the
 * initial prompt, and every turn's prompt/assistant pair as JSON.
 *
 * @param {string} targetModel Model whose answer is being judged.
 * @param {{id: string, category: string, complexity: string, prompt: string}} benchmarkCase
 * @param {{turns: {prompt: string, assistant: string}[]}} result
 * @returns {string} Newline-joined judge prompt.
 */
function judgePrompt(targetModel, benchmarkCase, result) {
  // Only the prompt and answer of each turn are shown to the judge.
  const transcript = result.turns.map((turn) => ({ prompt: turn.prompt, assistant: turn.assistant }));
  const instructions = [
    "You are judging a real OpenMates model benchmark response.",
    "Return exactly two plain-text lines, with no markdown, no code block, and no tool use.",
    "Line 1 format: BENCHMARK_SCORE=<integer from 1 to 5>",
    "Line 2 format: BENCHMARK_REASON=<one short sentence>",
    "Score for correctness, instruction-following, usefulness, and continuity where relevant."
  ];
  const context = [
    `Target model: ${targetModel}`,
    `Benchmark case: ${benchmarkCase.id} (${benchmarkCase.category}, ${benchmarkCase.complexity})`,
    `Initial prompt: ${JSON.stringify(benchmarkCase.prompt)}`,
    `Turns: ${JSON.stringify(transcript)}`
  ];
  return instructions.concat(context).join("\n");
}
|
|
41717
42257
|
function parseJudgment(answer) {
|
|
42258
|
+
const markerScore = answer.match(/BENCHMARK_SCORE\s*=\s*([1-5])/i);
|
|
42259
|
+
if (markerScore) {
|
|
42260
|
+
const reasonMatch = answer.match(/BENCHMARK_REASON\s*=\s*(.+)/i);
|
|
42261
|
+
return {
|
|
42262
|
+
score: Number.parseInt(markerScore[1], 10),
|
|
42263
|
+
reason: reasonMatch?.[1]?.trim() ?? null
|
|
42264
|
+
};
|
|
42265
|
+
}
|
|
41718
42266
|
const jsonText = extractJsonObject(answer);
|
|
41719
42267
|
if (!jsonText) return { score: null, reason: null };
|
|
41720
42268
|
try {
|
|
@@ -41734,6 +42282,220 @@ function extractJsonObject(text) {
|
|
|
41734
42282
|
if (start === -1 || end === -1 || end <= start) return null;
|
|
41735
42283
|
return text.slice(start, end + 1);
|
|
41736
42284
|
}
|
|
42285
|
+
/**
 * Look up pricing metadata for each distinct requested model.
 *
 * @param {string[]} models Model ids as given by the caller; duplicates are ignored.
 * @returns {Map<string, object>} Pricing keyed by the model id exactly as requested.
 * @throws {Error} Listing every model for which no pricing entry was found.
 */
function loadPricingForModels(models) {
  const catalog = loadProviderPricing();
  const resolved = /* @__PURE__ */ new Map();
  const unknown = [];
  new Set(models).forEach((model) => {
    const entry = catalog.get(normalizeModelKey(model));
    if (entry) {
      resolved.set(model, entry);
    } else {
      unknown.push(model);
    }
  });
  if (unknown.length > 0) {
    throw new Error(
      `Cannot estimate benchmark cost because pricing metadata is unavailable for: ${unknown.join(", ")}. Use provider/model ids with backend provider pricing metadata.`
    );
  }
  return resolved;
}
|
|
42305
|
+
/**
 * Read per-model pricing from every `.yml` file in the providers directory.
 *
 * Each model is registered twice: under `provider/modelId` and under the bare
 * `modelId`. When two files define the same key, the one read later wins.
 *
 * @returns {Map<string, object>} Pricing entries; empty when no providers
 *   directory is found.
 */
function loadProviderPricing() {
  const pricing = /* @__PURE__ */ new Map();
  const providersDir = findProvidersDir();
  if (!providersDir) return pricing;
  const providerFiles = readdirSync(providersDir).filter((name) => name.endsWith(".yml"));
  for (const fileName of providerFiles) {
    const text = readFileSync6(join4(providersDir, fileName), "utf-8");
    // Fall back to the file name when the file declares no provider_id.
    const provider = parseProviderId(text) ?? fileName.replace(/\.yml$/, "");
    parseModelPricing(text, provider).forEach((entry) => {
      pricing.set(`${entry.provider}/${entry.modelId}`, entry);
      pricing.set(entry.modelId, entry);
    });
  }
  return pricing;
}
|
|
42321
|
+
/**
 * Extract the top-level `provider_id` value from provider YAML text.
 *
 * This is a line-based scan, not a YAML parser: it reads an optionally quoted
 * scalar on the same line as the key and ignores a trailing `# comment`.
 *
 * @param {string} text Contents of a provider file.
 * @returns {string|null} The trimmed provider id, or `null` when the key is
 *   absent or has no value on its line.
 */
function parseProviderId(text) {
  // Use [ \t]* rather than \s* so an empty `provider_id:` cannot swallow the
  // newline and capture the following line. Exclude `#` so trailing comments
  // are dropped, matching how model ids are parsed.
  const match = text.match(/^provider_id:[ \t]*["']?([^"'\r\n#]+)/m);
  const providerId = match?.[1]?.trim();
  return providerId ? providerId : null;
}
|
|
42325
|
+
/**
 * Scan provider YAML text line by line and collect token pricing per model.
 *
 * A model starts at a line of the form `  - id: <modelId>` (two leading
 * whitespace characters). Within a model, the first `per_credit_unit` line
 * indented by ten whitespace characters is taken as the input rate and the
 * second as the output rate; later ones are ignored. Models missing either
 * rate (or with a rate of 0) are skipped.
 * NOTE(review): input-before-output ordering is assumed by position only —
 * confirm against the provider file schema.
 *
 * @param {string} text Contents of a provider file.
 * @param {string} provider Provider id recorded on every entry.
 * @returns {{provider: string, modelId: string, inputTokensPerCredit: number,
 *   outputTokensPerCredit: number}[]}
 */
function parseModelPricing(text, provider) {
  const modelIdPattern = /^\s{2}-\s+id:\s*["']?([^"'\n#]+)["']?/;
  const perCreditPattern = /^\s{10}per_credit_unit:\s*(\d+)/;
  const collected = [];
  let current = null;
  // Emit the model being accumulated if both rates were found.
  const flush = () => {
    if (current && current.modelId && current.inputTokensPerCredit && current.outputTokensPerCredit) {
      collected.push({
        provider,
        modelId: current.modelId,
        inputTokensPerCredit: current.inputTokensPerCredit,
        outputTokensPerCredit: current.outputTokensPerCredit
      });
    }
  };
  text.split("\n").forEach((line) => {
    const idMatch = line.match(modelIdPattern);
    if (idMatch) {
      flush();
      current = { modelId: idMatch[1].trim(), inputTokensPerCredit: null, outputTokensPerCredit: null };
      return;
    }
    if (current === null) return;
    const rateMatch = line.match(perCreditPattern);
    if (!rateMatch) return;
    if (current.inputTokensPerCredit === null) {
      current.inputTokensPerCredit = Number.parseInt(rateMatch[1], 10);
    } else if (current.outputTokensPerCredit === null) {
      current.outputTokensPerCredit = Number.parseInt(rateMatch[1], 10);
    }
  });
  flush();
  return collected;
}
|
|
42359
|
+
/**
 * Map a requested model id to the key used for the pricing lookup.
 *
 * The id is returned unchanged. The previous conditional returned `model` from
 * both branches, so the `/` check had no effect and is removed.
 *
 * @param {string} model Model id, either `provider/model` or a bare model id.
 * @returns {string} The same model id.
 */
function normalizeModelKey(model) {
  return model;
}
|
|
42362
|
+
/**
 * Locate a `backend/providers` directory near this module.
 *
 * Starting from the module's own directory and walking up at most eight
 * levels, each level is checked for `backend/providers` directly beneath it
 * and beneath the directory two levels above it.
 *
 * @returns {string|null} Path of the first match, or `null` when none exists.
 */
function findProvidersDir() {
  let dir = dirname(fileURLToPath(import.meta.url));
  let levelsLeft = 8;
  while (levelsLeft > 0) {
    levelsLeft -= 1;
    const direct = join4(dir, "backend", "providers");
    if (existsSync6(direct)) return direct;
    const twoLevelsUp = join4(dir, "..", "..", "backend", "providers");
    if (existsSync6(twoLevelsUp)) return resolve5(twoLevelsUp);
    const parent = dirname(dir);
    // dirname() of the filesystem root returns the root itself.
    if (parent === dir) break;
    dir = parent;
  }
  return null;
}
|
|
42376
|
+
/**
 * Estimate the credits a benchmark plan will spend.
 *
 * For every case and every target model with known pricing, the case's
 * estimated input/output tokens are multiplied by its turn count (initial
 * prompt plus follow-ups). Judged cases add one judge call per target model,
 * assumed to read `max(2000, 1.5 x target output)` tokens and write 350.
 * Models without a pricing entry contribute nothing.
 *
 * @param {object[]} cases Cases with `estimatedInputTokens`,
 *   `estimatedOutputTokens`, optional `followUps` and `judge`.
 * @param {string[]} targetModels Models being benchmarked.
 * @param {string} judgeModel Judge model id.
 * @param {Map<string, object>} pricing Pricing keyed by model id.
 * @returns {{targetCredits: number, judgeCredits: number, totalCredits: number,
 *   assumptions: {targetInputTokens: number, targetOutputTokens: number,
 *   judgeInputTokens: number, judgeOutputTokens: number}}}
 */
function estimateCredits(cases, targetModels, judgeModel, pricing) {
  const JUDGE_OUTPUT_TOKENS = 350;
  const credits = { target: 0, judge: 0 };
  const tokens = { targetInput: 0, targetOutput: 0, judgeInput: 0, judgeOutput: 0 };
  for (const benchmarkCase of cases) {
    const turnCount = 1 + (benchmarkCase.followUps?.length ?? 0);
    const input = benchmarkCase.estimatedInputTokens * turnCount;
    const output = benchmarkCase.estimatedOutputTokens * turnCount;
    for (const model of targetModels) {
      const modelPricing = pricing.get(model);
      if (!modelPricing) continue;
      tokens.targetInput += input;
      tokens.targetOutput += output;
      credits.target += creditsFor(modelPricing, input, output);
      if (!benchmarkCase.judge) continue;
      const judgePricing = pricing.get(judgeModel);
      if (!judgePricing) continue;
      const judgeInput = Math.max(2e3, Math.ceil(output * 1.5));
      tokens.judgeInput += judgeInput;
      tokens.judgeOutput += JUDGE_OUTPUT_TOKENS;
      credits.judge += creditsFor(judgePricing, judgeInput, JUDGE_OUTPUT_TOKENS);
    }
  }
  return {
    targetCredits: credits.target,
    judgeCredits: credits.judge,
    totalCredits: credits.target + credits.judge,
    assumptions: {
      targetInputTokens: tokens.targetInput,
      targetOutputTokens: tokens.targetOutput,
      judgeInputTokens: tokens.judgeInput,
      judgeOutputTokens: tokens.judgeOutput
    }
  };
}
|
|
42411
|
+
/**
 * Convert token counts into credits, rounding input and output up separately.
 *
 * @param {{inputTokensPerCredit: number, outputTokensPerCredit: number}} pricing
 * @param {number} inputTokens
 * @param {number} outputTokens
 * @returns {number} Whole credits for input plus whole credits for output.
 */
function creditsFor(pricing, inputTokens, outputTokens) {
  const inputCredits = Math.ceil(inputTokens / pricing.inputTokensPerCredit);
  const outputCredits = Math.ceil(outputTokens / pricing.outputTokensPerCredit);
  return inputCredits + outputCredits;
}
|
|
42414
|
+
/**
 * Create the initial benchmark result record before any case has run.
 *
 * For a dry run the status is "planned", no credits are spent, and every job
 * counts as skipped; otherwise the status starts as "completed" with zero
 * skipped. Each target model gets an empty summary.
 *
 * @param {{dryRun: boolean, runId: string, targetModels: string[],
 *   judgeModel: string, suites: string[], runs: number, compare: boolean,
 *   parallel: number, extensiveSize: number, estimate: object,
 *   totalJobs: number}} params
 * @returns {object} Result skeleton with empty `cases`.
 */
function makeBaseResult(params) {
  const isDryRun = Boolean(params.dryRun);
  const emptyModelSummary = (model) => ({
    model,
    total: 0,
    passed: 0,
    failed: 0,
    averageJudgeScore: null,
    averageDurationMs: null
  });
  const summary = {
    total: params.totalJobs,
    completed: 0,
    passed: 0,
    failed: 0,
    skipped: isDryRun ? params.totalJobs : 0,
    interrupted: false
  };
  return {
    command: "benchmark model",
    status: isDryRun ? "planned" : "completed",
    runId: params.runId,
    // First model is exposed separately alongside the full list.
    targetModel: params.targetModels[0],
    targetModels: params.targetModels,
    judgeModel: params.judgeModel,
    suites: params.suites,
    runs: params.runs,
    compare: params.compare,
    parallel: params.parallel,
    extensiveSize: params.extensiveSize,
    spendsCredits: !isDryRun,
    estimatedCredits: params.estimate,
    cases: [],
    modelSummaries: params.targetModels.map(emptyModelSummary),
    summary
  };
}
|
|
42448
|
+
/**
 * Refresh the aggregate fields of a result from its recorded cases.
 *
 * Rewrites `summary`, `status` ("partial" when interrupted or when fewer cases
 * than jobs are recorded), per-model summaries, and — in compare mode — the
 * comparison block.
 *
 * @param {object} result Result record; mutated in place.
 * @param {number} totalJobs Number of jobs that were planned.
 * @param {boolean} interrupted Whether the run was interrupted.
 * @returns {void}
 */
function recomputeResult(result, totalJobs, interrupted) {
  const completed = result.cases.length;
  let passed = 0;
  for (const caseResult of result.cases) {
    if (caseResult.passed) passed += 1;
  }
  result.summary = {
    total: totalJobs,
    completed,
    passed,
    // Every recorded case is either passed or failed.
    failed: completed - passed,
    skipped: Math.max(0, totalJobs - completed),
    interrupted
  };
  const isPartial = interrupted || completed < totalJobs;
  result.status = isPartial ? "partial" : "completed";
  result.modelSummaries = result.targetModels.map((model) => summarizeModel(model, result.cases));
  if (result.compare) {
    result.comparison = buildComparison(result.modelSummaries);
  }
}
|
|
42464
|
+
/**
 * Aggregate the case results that belong to one model.
 *
 * The average judge score covers only cases with a finite numeric score and is
 * rounded to two decimals; the average duration covers only positive durations
 * and is rounded to whole milliseconds. Either is `null` with no data.
 *
 * @param {string} model Model id to summarize.
 * @param {object[]} cases All case results of the run.
 * @returns {{model: string, total: number, passed: number, failed: number,
 *   averageJudgeScore: (number|null), averageDurationMs: (number|null)}}
 */
function summarizeModel(model, cases) {
  const mean = (values) => values.reduce((sum, value) => sum + value, 0) / values.length;
  const ownCases = cases.filter((caseResult) => caseResult.model === model);
  const judgeScores = ownCases
    .map((caseResult) => caseResult.judge?.score)
    .filter((score) => typeof score === "number" && Number.isFinite(score));
  const positiveDurations = ownCases
    .map((caseResult) => caseResult.durationMs)
    .filter((value) => value > 0);
  const passedCount = ownCases.filter((caseResult) => caseResult.passed).length;
  return {
    model,
    total: ownCases.length,
    passed: passedCount,
    failed: ownCases.length - passedCount,
    averageJudgeScore: judgeScores.length > 0 ? round2(mean(judgeScores)) : null,
    averageDurationMs: positiveDurations.length > 0 ? Math.round(mean(positiveDurations)) : null
  };
}
|
|
42477
|
+
/**
 * Rank model summaries for compare mode.
 *
 * Models are ordered by average judge score (highest first, missing scores
 * last), with ties broken by passed count. The input array is not modified.
 *
 * @param {object[]} summaries Per-model summaries.
 * @returns {{ranking: {model: string, averageJudgeScore: (number|null),
 *   passed: number, total: number}[], notes: string[]}} `notes` names the
 *   top-ranked model, or is empty when there are no summaries.
 */
function buildComparison(summaries) {
  // A missing score sorts below every real score.
  const scoreOf = (summary) => summary.averageJudgeScore ?? -1;
  const ordered = Array.from(summaries);
  ordered.sort((left, right) => scoreOf(right) - scoreOf(left) || right.passed - left.passed);
  const ranking = ordered.map(({ model, averageJudgeScore, passed, total }) => ({
    model,
    averageJudgeScore,
    passed,
    total
  }));
  const notes = [];
  if (ranking.length > 0) {
    const leader = ranking[0];
    notes.push(`Top model so far: ${leader.model} (${leader.passed}/${leader.total} passed).`);
  }
  return { ranking, notes };
}
|
|
42487
|
+
/**
 * Round a number to two decimal places.
 *
 * @param {number} value
 * @returns {number}
 */
function round2(value) {
  const hundredths = Math.round(value * 100);
  return hundredths / 100;
}
|
|
42490
|
+
/**
 * Resolve the default benchmark image.
 *
 * Returns the bundled `fixtures/brandenburger-tor.png` (one directory above
 * this module) when it exists. Otherwise the embedded SVG fixture is written
 * to a newly created temporary directory and that file's path is returned.
 *
 * @returns {string} Path to an image file on disk.
 */
function defaultImageFixturePath() {
  const moduleDir = dirname(fileURLToPath(import.meta.url));
  const bundledPng = join4(join4(moduleDir, "..", "fixtures"), "brandenburger-tor.png");
  if (existsSync6(bundledPng)) return bundledPng;
  // Fallback: materialize the inline SVG in a fresh temp directory.
  const scratchDir = mkdtempSync(join4(tmpdir(), "openmates-benchmark-"));
  const fallbackSvg = join4(scratchDir, "brandenburger-tor.svg");
  writeFileSync4(fallbackSvg, FIXTURE_IMAGE_SVG, "utf-8");
  return fallbackSvg;
}
|
|
41737
42499
|
function writeBenchmarkResult(result, flags, output) {
|
|
41738
42500
|
const json = `${JSON.stringify(result, null, 2)}
|
|
41739
42501
|
`;
|
|
@@ -41742,17 +42504,19 @@ function writeBenchmarkResult(result, flags, output) {
|
|
|
41742
42504
|
process.stdout.write(json);
|
|
41743
42505
|
return;
|
|
41744
42506
|
}
|
|
41745
|
-
console.log(`Benchmark ${result.status}: ${result.
|
|
42507
|
+
console.log(`Benchmark ${result.status}: ${result.targetModels.join(", ")}`);
|
|
41746
42508
|
console.log(`Run ID: ${result.runId}`);
|
|
41747
42509
|
console.log(`Suites: ${result.suites.join(", ")}`);
|
|
41748
42510
|
console.log(`Judge: ${result.judgeModel}`);
|
|
42511
|
+
console.log(`Estimated credits: ${result.estimatedCredits.totalCredits}`);
|
|
41749
42512
|
console.log(`Spend credits: ${result.spendsCredits ? "yes" : "no"}`);
|
|
41750
|
-
if (result.status
|
|
41751
|
-
console.log(`Passed: ${result.summary.passed}/${result.summary.
|
|
42513
|
+
if (result.status !== "planned") {
|
|
42514
|
+
console.log(`Passed: ${result.summary.passed}/${result.summary.completed} completed (${result.summary.skipped} skipped)`);
|
|
41752
42515
|
for (const benchmarkCase of result.cases) {
|
|
41753
42516
|
const mark = benchmarkCase.passed ? "PASS" : "FAIL";
|
|
41754
|
-
const judge = benchmarkCase.judge
|
|
41755
|
-
|
|
42517
|
+
const judge = benchmarkCase.judge ? ` judge=${benchmarkCase.judge.score ?? "unparsed"}` : "";
|
|
42518
|
+
const error = benchmarkCase.error ? ` error=${benchmarkCase.error}` : "";
|
|
42519
|
+
console.log(`${mark} ${benchmarkCase.model} ${benchmarkCase.suite}/${benchmarkCase.id} (${benchmarkCase.durationMs}ms)${judge}${error}`);
|
|
41756
42520
|
}
|
|
41757
42521
|
}
|
|
41758
42522
|
}
|
|
@@ -42141,10 +42905,10 @@ Run 'openmates chats show ` + chatId + "' to check if suggestions have been save
|
|
|
42141
42905
|
input: process.stdin,
|
|
42142
42906
|
output: process.stdout
|
|
42143
42907
|
});
|
|
42144
|
-
const answer = await new Promise((
|
|
42908
|
+
const answer = await new Promise((resolve6) => {
|
|
42145
42909
|
iface.question(
|
|
42146
42910
|
`Delete ${resolved.length} chat(s)? This cannot be undone. [y/N] `,
|
|
42147
|
-
|
|
42911
|
+
resolve6
|
|
42148
42912
|
);
|
|
42149
42913
|
});
|
|
42150
42914
|
iface.close();
|
|
@@ -42304,16 +43068,16 @@ ${deleted}/${resolved.length} chat(s) deleted.`);
|
|
|
42304
43068
|
}
|
|
42305
43069
|
}
|
|
42306
43070
|
const { mkdir, writeFile } = await import("fs/promises");
|
|
42307
|
-
const { join:
|
|
43071
|
+
const { join: join5 } = await import("path");
|
|
42308
43072
|
if (useZip) {
|
|
42309
|
-
const tmpDir =
|
|
43073
|
+
const tmpDir = join5(outputDir, `.${filenameBase}_tmp`);
|
|
42310
43074
|
await mkdir(tmpDir, { recursive: true });
|
|
42311
|
-
await writeFile(
|
|
42312
|
-
await writeFile(
|
|
43075
|
+
await writeFile(join5(tmpDir, `${filenameBase}.yml`), yamlContent);
|
|
43076
|
+
await writeFile(join5(tmpDir, `${filenameBase}.md`), mdContent);
|
|
42313
43077
|
if (codeEmbeds.length > 0) {
|
|
42314
43078
|
for (const ce of codeEmbeds) {
|
|
42315
43079
|
const fpath = ce.filePath ?? ce.filename ?? `${ce.embedId.slice(0, 8)}.${getExtForLang(ce.language)}`;
|
|
42316
|
-
const fullPath =
|
|
43080
|
+
const fullPath = join5(tmpDir, "code", fpath);
|
|
42317
43081
|
await mkdir(fullPath.substring(0, fullPath.lastIndexOf("/")), {
|
|
42318
43082
|
recursive: true
|
|
42319
43083
|
});
|
|
@@ -42321,13 +43085,13 @@ ${deleted}/${resolved.length} chat(s) deleted.`);
|
|
|
42321
43085
|
}
|
|
42322
43086
|
}
|
|
42323
43087
|
if (transcriptEmbeds.length > 0) {
|
|
42324
|
-
const tDir =
|
|
43088
|
+
const tDir = join5(tmpDir, "transcripts");
|
|
42325
43089
|
await mkdir(tDir, { recursive: true });
|
|
42326
43090
|
for (const te of transcriptEmbeds) {
|
|
42327
|
-
await writeFile(
|
|
43091
|
+
await writeFile(join5(tDir, te.filename), te.content);
|
|
42328
43092
|
}
|
|
42329
43093
|
}
|
|
42330
|
-
const zipPath =
|
|
43094
|
+
const zipPath = join5(outputDir, `${filenameBase}.zip`);
|
|
42331
43095
|
const { execSync: execSync2 } = await import("child_process");
|
|
42332
43096
|
try {
|
|
42333
43097
|
execSync2(`cd "${tmpDir}" && zip -r "${zipPath}" .`, { stdio: "pipe" });
|
|
@@ -42342,17 +43106,17 @@ ${deleted}/${resolved.length} chat(s) deleted.`);
|
|
|
42342
43106
|
);
|
|
42343
43107
|
}
|
|
42344
43108
|
} else {
|
|
42345
|
-
const chatDir =
|
|
43109
|
+
const chatDir = join5(outputDir, filenameBase);
|
|
42346
43110
|
await mkdir(chatDir, { recursive: true });
|
|
42347
43111
|
const written = [];
|
|
42348
|
-
await writeFile(
|
|
43112
|
+
await writeFile(join5(chatDir, `${filenameBase}.yml`), yamlContent);
|
|
42349
43113
|
written.push(`${filenameBase}.yml`);
|
|
42350
|
-
await writeFile(
|
|
43114
|
+
await writeFile(join5(chatDir, `${filenameBase}.md`), mdContent);
|
|
42351
43115
|
written.push(`${filenameBase}.md`);
|
|
42352
43116
|
if (codeEmbeds.length > 0) {
|
|
42353
43117
|
for (const ce of codeEmbeds) {
|
|
42354
43118
|
const fpath = ce.filePath ?? ce.filename ?? `${ce.embedId.slice(0, 8)}.${getExtForLang(ce.language)}`;
|
|
42355
|
-
const fullPath =
|
|
43119
|
+
const fullPath = join5(chatDir, "code", fpath);
|
|
42356
43120
|
await mkdir(fullPath.substring(0, fullPath.lastIndexOf("/")), {
|
|
42357
43121
|
recursive: true
|
|
42358
43122
|
});
|
|
@@ -42361,10 +43125,10 @@ ${deleted}/${resolved.length} chat(s) deleted.`);
|
|
|
42361
43125
|
}
|
|
42362
43126
|
}
|
|
42363
43127
|
if (transcriptEmbeds.length > 0) {
|
|
42364
|
-
const tDir =
|
|
43128
|
+
const tDir = join5(chatDir, "transcripts");
|
|
42365
43129
|
await mkdir(tDir, { recursive: true });
|
|
42366
43130
|
for (const te of transcriptEmbeds) {
|
|
42367
|
-
await writeFile(
|
|
43131
|
+
await writeFile(join5(tDir, te.filename), te.content);
|
|
42368
43132
|
written.push(`transcripts/${te.filename}`);
|
|
42369
43133
|
}
|
|
42370
43134
|
}
|
|
@@ -42400,7 +43164,7 @@ ${deleted}/${resolved.length} chat(s) deleted.`);
|
|
|
42400
43164
|
printJson2({
|
|
42401
43165
|
chat_id: chat.id,
|
|
42402
43166
|
title: chat.title,
|
|
42403
|
-
output_dir: useZip ?
|
|
43167
|
+
output_dir: useZip ? join5(outputDir, `${filenameBase}.zip`) : join5(outputDir, filenameBase),
|
|
42404
43168
|
files,
|
|
42405
43169
|
code_embeds: codeEmbeds.length,
|
|
42406
43170
|
transcript_embeds: transcriptEmbeds.length
|
|
@@ -42921,7 +43685,7 @@ async function handleCodeRun(client, flags, apiKey) {
|
|
|
42921
43685
|
}
|
|
42922
43686
|
}
|
|
42923
43687
|
async function streamCodeRunToTerminal(url, jsonMode) {
|
|
42924
|
-
return await new Promise((
|
|
43688
|
+
return await new Promise((resolve6, reject) => {
|
|
42925
43689
|
const ws = new WebSocket2(url);
|
|
42926
43690
|
let lastStatus = {};
|
|
42927
43691
|
ws.on("message", (data) => {
|
|
@@ -42940,7 +43704,7 @@ async function streamCodeRunToTerminal(url, jsonMode) {
|
|
|
42940
43704
|
const status = String(payload.status ?? "");
|
|
42941
43705
|
if (["finished", "failed", "timeout", "cancelled"].includes(status)) {
|
|
42942
43706
|
ws.close();
|
|
42943
|
-
|
|
43707
|
+
resolve6(lastStatus);
|
|
42944
43708
|
}
|
|
42945
43709
|
}
|
|
42946
43710
|
} catch (err) {
|
|
@@ -42950,7 +43714,7 @@ async function streamCodeRunToTerminal(url, jsonMode) {
|
|
|
42950
43714
|
});
|
|
42951
43715
|
ws.on("error", () => reject(new Error("Code Run stream failed.")));
|
|
42952
43716
|
ws.on("close", () => {
|
|
42953
|
-
if (Object.keys(lastStatus).length > 0)
|
|
43717
|
+
if (Object.keys(lastStatus).length > 0) resolve6(lastStatus);
|
|
42954
43718
|
});
|
|
42955
43719
|
});
|
|
42956
43720
|
}
|
|
@@ -42961,7 +43725,7 @@ async function pollCodeRunStatus(client, statusPath, apiKey, jsonMode) {
|
|
|
42961
43725
|
if (!jsonMode && value) process.stderr.write(`Code Run status: ${value}
|
|
42962
43726
|
`);
|
|
42963
43727
|
if (["finished", "failed", "timeout", "cancelled"].includes(value)) return status;
|
|
42964
|
-
await new Promise((
|
|
43728
|
+
await new Promise((resolve6) => setTimeout(resolve6, 1e3));
|
|
42965
43729
|
}
|
|
42966
43730
|
}
|
|
42967
43731
|
function buildSkillInput(flags, inlineTokens, schemaParams) {
|
|
@@ -43445,11 +44209,11 @@ function parseYamlScalar(value) {
|
|
|
43445
44209
|
}
|
|
43446
44210
|
async function saveDownloadedDocument(document, output) {
|
|
43447
44211
|
const { mkdir, writeFile } = await import("fs/promises");
|
|
43448
|
-
const { join:
|
|
44212
|
+
const { join: join5, basename: basename4, dirname: dirname3 } = await import("path");
|
|
43449
44213
|
const target = typeof output === "string" ? output : ".";
|
|
43450
44214
|
const filename = basename4(document.filename || "document.pdf");
|
|
43451
|
-
const filePath = target.endsWith(".pdf") ? target :
|
|
43452
|
-
await mkdir(
|
|
44215
|
+
const filePath = target.endsWith(".pdf") ? target : join5(target, filename);
|
|
44216
|
+
await mkdir(dirname3(filePath), { recursive: true });
|
|
43453
44217
|
await writeFile(filePath, document.data);
|
|
43454
44218
|
return filePath;
|
|
43455
44219
|
}
|
|
@@ -43477,7 +44241,7 @@ function printMateInfo(mateId, json) {
|
|
|
43477
44241
|
async function confirmOrExit(question) {
|
|
43478
44242
|
const rl = await import("readline");
|
|
43479
44243
|
const iface = rl.createInterface({ input: process.stdin, output: process.stdout });
|
|
43480
|
-
const answer = await new Promise((
|
|
44244
|
+
const answer = await new Promise((resolve6) => iface.question(question, resolve6));
|
|
43481
44245
|
iface.close();
|
|
43482
44246
|
if (answer.trim().toLowerCase() !== "y") {
|
|
43483
44247
|
console.log("Aborted.");
|
|
@@ -43487,7 +44251,7 @@ async function confirmOrExit(question) {
|
|
|
43487
44251
|
async function promptLine(question) {
|
|
43488
44252
|
const rl = await import("readline");
|
|
43489
44253
|
const iface = rl.createInterface({ input: process.stdin, output: process.stdout });
|
|
43490
|
-
const answer = await new Promise((
|
|
44254
|
+
const answer = await new Promise((resolve6) => iface.question(question, resolve6));
|
|
43491
44255
|
iface.close();
|
|
43492
44256
|
return answer.trim();
|
|
43493
44257
|
}
|
|
@@ -43495,7 +44259,7 @@ async function promptSecret(question) {
|
|
|
43495
44259
|
if (!process.stdin.isTTY) {
|
|
43496
44260
|
return promptLine(question);
|
|
43497
44261
|
}
|
|
43498
|
-
return new Promise((
|
|
44262
|
+
return new Promise((resolve6) => {
|
|
43499
44263
|
const stdin2 = process.stdin;
|
|
43500
44264
|
const wasRaw = stdin2.isRaw;
|
|
43501
44265
|
let value = "";
|
|
@@ -43508,7 +44272,7 @@ async function promptSecret(question) {
|
|
|
43508
44272
|
stdin2.off("data", onData);
|
|
43509
44273
|
stdin2.setRawMode(wasRaw);
|
|
43510
44274
|
process.stdout.write("\n");
|
|
43511
|
-
|
|
44275
|
+
resolve6(value);
|
|
43512
44276
|
return;
|
|
43513
44277
|
}
|
|
43514
44278
|
if (char === "") {
|
|
@@ -43528,7 +44292,7 @@ async function promptSecret(question) {
|
|
|
43528
44292
|
}
|
|
43529
44293
|
async function writeSecretFile(filePath, content, force = false) {
|
|
43530
44294
|
const { mkdir, writeFile, stat: stat2 } = await import("fs/promises");
|
|
43531
|
-
const { dirname:
|
|
44295
|
+
const { dirname: dirname3 } = await import("path");
|
|
43532
44296
|
try {
|
|
43533
44297
|
await stat2(filePath);
|
|
43534
44298
|
if (!force) throw new Error(`${filePath} already exists. Use --force to overwrite.`);
|
|
@@ -43538,7 +44302,7 @@ async function writeSecretFile(filePath, content, force = false) {
|
|
|
43538
44302
|
}
|
|
43539
44303
|
if (error instanceof Error && !("code" in error)) throw error;
|
|
43540
44304
|
}
|
|
43541
|
-
await mkdir(
|
|
44305
|
+
await mkdir(dirname3(filePath), { recursive: true });
|
|
43542
44306
|
await writeFile(filePath, content, { mode: 384 });
|
|
43543
44307
|
return filePath;
|
|
43544
44308
|
}
|
|
@@ -46496,7 +47260,7 @@ async function handleDocs(client, subcommand, rest, flags) {
|
|
|
46496
47260
|
}
|
|
46497
47261
|
if (subcommand === "download") {
|
|
46498
47262
|
const { writeFile, mkdir } = await import("fs/promises");
|
|
46499
|
-
const { join:
|
|
47263
|
+
const { join: join5, dirname: dirname3 } = await import("path");
|
|
46500
47264
|
if (flags.all === true) {
|
|
46501
47265
|
const outputDir = typeof flags.output === "string" ? flags.output : "./openmates-docs";
|
|
46502
47266
|
const tree = await client.listDocs();
|
|
@@ -46505,8 +47269,8 @@ async function handleDocs(client, subcommand, rest, flags) {
|
|
|
46505
47269
|
let count = 0;
|
|
46506
47270
|
for (const slug2 of slugs) {
|
|
46507
47271
|
const content2 = await client.getDoc(slug2);
|
|
46508
|
-
const filePath =
|
|
46509
|
-
await mkdir(
|
|
47272
|
+
const filePath = join5(outputDir, `${slug2}.md`);
|
|
47273
|
+
await mkdir(dirname3(filePath), { recursive: true });
|
|
46510
47274
|
await writeFile(filePath, content2, "utf-8");
|
|
46511
47275
|
count++;
|
|
46512
47276
|
process.stderr.write(`\r Downloaded ${count}/${slugs.length}`);
|
|
@@ -46578,8 +47342,8 @@ function isCliEntrypoint() {
|
|
|
46578
47342
|
if (!entrypoint) return false;
|
|
46579
47343
|
try {
|
|
46580
47344
|
const invokedPath = realpathSync(entrypoint);
|
|
46581
|
-
const modulePath = realpathSync(
|
|
46582
|
-
return invokedPath === modulePath || basename3(invokedPath) === "cli.js" &&
|
|
47345
|
+
const modulePath = realpathSync(fileURLToPath2(import.meta.url));
|
|
47346
|
+
return invokedPath === modulePath || basename3(invokedPath) === "cli.js" && dirname2(invokedPath) === dirname2(modulePath);
|
|
46583
47347
|
} catch {
|
|
46584
47348
|
return false;
|
|
46585
47349
|
}
|