openmates 0.12.0-alpha.10 → 0.12.0-alpha.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-D7RIGVLZ.js → chunk-R5Z4FBJJ.js} +1113 -70
- package/dist/cli.js +1 -1
- package/dist/index.d.ts +23 -0
- package/dist/index.js +1 -1
- package/fixtures/brandenburger-tor.png +0 -0
- package/fixtures/brandenburger-tor.svg +25 -0
- package/package.json +5 -3
|
@@ -986,14 +986,14 @@ var OpenMatesWsClient = class {
|
|
|
986
986
|
});
|
|
987
987
|
}
|
|
988
988
|
async open(timeoutMs = 1e4) {
|
|
989
|
-
await new Promise((
|
|
989
|
+
await new Promise((resolve6, reject) => {
|
|
990
990
|
const timeout = setTimeout(
|
|
991
991
|
() => reject(new Error("WebSocket open timeout")),
|
|
992
992
|
timeoutMs
|
|
993
993
|
);
|
|
994
994
|
this.socket.once("open", () => {
|
|
995
995
|
clearTimeout(timeout);
|
|
996
|
-
|
|
996
|
+
resolve6();
|
|
997
997
|
});
|
|
998
998
|
this.socket.once("error", (error) => {
|
|
999
999
|
clearTimeout(timeout);
|
|
@@ -1022,15 +1022,15 @@ var OpenMatesWsClient = class {
|
|
|
1022
1022
|
this.socket.send(JSON.stringify({ type, payload }));
|
|
1023
1023
|
}
|
|
1024
1024
|
sendAsync(type, payload) {
|
|
1025
|
-
return new Promise((
|
|
1025
|
+
return new Promise((resolve6, reject) => {
|
|
1026
1026
|
this.socket.send(JSON.stringify({ type, payload }), (error) => {
|
|
1027
1027
|
if (error) reject(error);
|
|
1028
|
-
else
|
|
1028
|
+
else resolve6();
|
|
1029
1029
|
});
|
|
1030
1030
|
});
|
|
1031
1031
|
}
|
|
1032
1032
|
waitForMessage(expectedType, predicate, timeoutMs = 2e4) {
|
|
1033
|
-
return new Promise((
|
|
1033
|
+
return new Promise((resolve6, reject) => {
|
|
1034
1034
|
const onMessage = (rawData) => {
|
|
1035
1035
|
try {
|
|
1036
1036
|
const parsed = JSON.parse(rawData.toString());
|
|
@@ -1041,7 +1041,7 @@ var OpenMatesWsClient = class {
|
|
|
1041
1041
|
return;
|
|
1042
1042
|
}
|
|
1043
1043
|
cleanup();
|
|
1044
|
-
|
|
1044
|
+
resolve6(parsed);
|
|
1045
1045
|
} catch {
|
|
1046
1046
|
}
|
|
1047
1047
|
};
|
|
@@ -1074,14 +1074,14 @@ var OpenMatesWsClient = class {
|
|
|
1074
1074
|
* Used by ensureSynced to consume the full phased-sync event stream.
|
|
1075
1075
|
*/
|
|
1076
1076
|
collectMessages(terminatorType, timeoutMs = 9e4) {
|
|
1077
|
-
return new Promise((
|
|
1077
|
+
return new Promise((resolve6, reject) => {
|
|
1078
1078
|
const collected = [];
|
|
1079
1079
|
const onMessage = (rawData) => {
|
|
1080
1080
|
try {
|
|
1081
1081
|
const parsed = JSON.parse(rawData.toString());
|
|
1082
1082
|
if (parsed.type === terminatorType) {
|
|
1083
1083
|
cleanup();
|
|
1084
|
-
|
|
1084
|
+
resolve6(collected);
|
|
1085
1085
|
return;
|
|
1086
1086
|
}
|
|
1087
1087
|
collected.push(parsed);
|
|
@@ -1094,7 +1094,7 @@ var OpenMatesWsClient = class {
|
|
|
1094
1094
|
};
|
|
1095
1095
|
const onClose = () => {
|
|
1096
1096
|
cleanup();
|
|
1097
|
-
|
|
1097
|
+
resolve6(collected);
|
|
1098
1098
|
};
|
|
1099
1099
|
const timeout = setTimeout(() => {
|
|
1100
1100
|
cleanup();
|
|
@@ -1132,7 +1132,7 @@ var OpenMatesWsClient = class {
|
|
|
1132
1132
|
const timeoutMs = options?.timeoutMs ?? 9e4;
|
|
1133
1133
|
const onStream = options?.onStream;
|
|
1134
1134
|
const asyncEmbedWaitMs = options?.asyncEmbedWaitMs ?? 12e4;
|
|
1135
|
-
return new Promise((
|
|
1135
|
+
return new Promise((resolve6, reject) => {
|
|
1136
1136
|
let latestContent = "";
|
|
1137
1137
|
let messageId = null;
|
|
1138
1138
|
let taskId = null;
|
|
@@ -1189,7 +1189,7 @@ var OpenMatesWsClient = class {
|
|
|
1189
1189
|
if (waitingForUserPayload) {
|
|
1190
1190
|
if (pendingSubChatHandlers.size > 0) return;
|
|
1191
1191
|
cleanup();
|
|
1192
|
-
|
|
1192
|
+
resolve6({
|
|
1193
1193
|
status: "waiting_for_user",
|
|
1194
1194
|
messageId,
|
|
1195
1195
|
taskId,
|
|
@@ -1209,7 +1209,7 @@ var OpenMatesWsClient = class {
|
|
|
1209
1209
|
if (processingEmbedIds.size > 0 && !asyncEmbedTimer) {
|
|
1210
1210
|
asyncEmbedTimer = setTimeout(() => {
|
|
1211
1211
|
cleanup();
|
|
1212
|
-
|
|
1212
|
+
resolve6({
|
|
1213
1213
|
status: "completed",
|
|
1214
1214
|
messageId,
|
|
1215
1215
|
taskId,
|
|
@@ -1226,7 +1226,7 @@ var OpenMatesWsClient = class {
|
|
|
1226
1226
|
}
|
|
1227
1227
|
if (processingEmbedIds.size > 0) return;
|
|
1228
1228
|
cleanup();
|
|
1229
|
-
|
|
1229
|
+
resolve6({
|
|
1230
1230
|
status: "completed",
|
|
1231
1231
|
messageId,
|
|
1232
1232
|
taskId,
|
|
@@ -1440,7 +1440,7 @@ var OpenMatesWsClient = class {
|
|
|
1440
1440
|
const onClose = () => {
|
|
1441
1441
|
if (aiResponseDone) {
|
|
1442
1442
|
cleanup();
|
|
1443
|
-
|
|
1443
|
+
resolve6({
|
|
1444
1444
|
status: "completed",
|
|
1445
1445
|
messageId,
|
|
1446
1446
|
taskId,
|
|
@@ -3677,6 +3677,23 @@ var OpenMatesClient = class _OpenMatesClient {
|
|
|
3677
3677
|
if (connectedAccountTokenRefs.length > 0) {
|
|
3678
3678
|
messagePayload.connected_account_token_refs = connectedAccountTokenRefs;
|
|
3679
3679
|
}
|
|
3680
|
+
if (params.benchmarkMetadata) {
|
|
3681
|
+
messagePayload.benchmark_metadata = params.benchmarkMetadata;
|
|
3682
|
+
}
|
|
3683
|
+
if (params.incognito) {
|
|
3684
|
+
const providedHistory = (params.messageHistory ?? []).map((historyMessage) => ({
|
|
3685
|
+
...historyMessage,
|
|
3686
|
+
chat_id: historyMessage.chat_id ?? chatId
|
|
3687
|
+
}));
|
|
3688
|
+
messagePayload.message_history = [...providedHistory, {
|
|
3689
|
+
message_id: messageId,
|
|
3690
|
+
chat_id: chatId,
|
|
3691
|
+
role: "user",
|
|
3692
|
+
sender_name: "User",
|
|
3693
|
+
content: params.message,
|
|
3694
|
+
created_at: createdAt
|
|
3695
|
+
}];
|
|
3696
|
+
}
|
|
3680
3697
|
let chatKeyBytes = null;
|
|
3681
3698
|
let encryptedChatKey = null;
|
|
3682
3699
|
let baselineMessagesV = 0;
|
|
@@ -3735,6 +3752,7 @@ var OpenMatesClient = class _OpenMatesClient {
|
|
|
3735
3752
|
if (encryptedEmbeds.length > 0) {
|
|
3736
3753
|
messagePayload.encrypted_embeds = encryptedEmbeds;
|
|
3737
3754
|
}
|
|
3755
|
+
const precollectedResponse = params.precollectResponse ? ws.collectAiResponse(messageId, chatId, { onStream: params.onStream }) : null;
|
|
3738
3756
|
const confirmed = ws.waitForMessage(
|
|
3739
3757
|
"chat_message_confirmed",
|
|
3740
3758
|
(payload) => {
|
|
@@ -3949,7 +3967,7 @@ var OpenMatesClient = class _OpenMatesClient {
|
|
|
3949
3967
|
};
|
|
3950
3968
|
if (params.incognito) {
|
|
3951
3969
|
try {
|
|
3952
|
-
const resp = await ws.collectAiResponse(messageId, chatId, streamOpts);
|
|
3970
|
+
const resp = await (precollectedResponse ?? ws.collectAiResponse(messageId, chatId, streamOpts));
|
|
3953
3971
|
assistantMessageId = resp.messageId;
|
|
3954
3972
|
assistant = resp.content;
|
|
3955
3973
|
category = resp.category;
|
|
@@ -4301,7 +4319,7 @@ var OpenMatesClient = class _OpenMatesClient {
|
|
|
4301
4319
|
if (response.data.status === "failed") {
|
|
4302
4320
|
throw new Error(response.data.error ?? "Task failed");
|
|
4303
4321
|
}
|
|
4304
|
-
await new Promise((
|
|
4322
|
+
await new Promise((resolve6) => setTimeout(resolve6, SKILL_TASK_POLL_INTERVAL_MS));
|
|
4305
4323
|
}
|
|
4306
4324
|
throw new Error(`Task ${taskId} did not complete within ${SKILL_TASK_POLL_TIMEOUT_MS / 1e3}s`);
|
|
4307
4325
|
}
|
|
@@ -4522,7 +4540,7 @@ var OpenMatesClient = class _OpenMatesClient {
|
|
|
4522
4540
|
`Rate limited by settings API; retrying in ${Math.ceil(SETTINGS_GET_RATE_LIMIT_RETRY_MS / 1e3)}s...
|
|
4523
4541
|
`
|
|
4524
4542
|
);
|
|
4525
|
-
await new Promise((
|
|
4543
|
+
await new Promise((resolve6) => setTimeout(resolve6, SETTINGS_GET_RATE_LIMIT_RETRY_MS));
|
|
4526
4544
|
response = await this.http.get(normalizedPath, this.getCliRequestHeaders());
|
|
4527
4545
|
}
|
|
4528
4546
|
if (!response.ok) {
|
|
@@ -6023,7 +6041,7 @@ function filenameFromContentDisposition(header2) {
|
|
|
6023
6041
|
return plain?.trim() ?? null;
|
|
6024
6042
|
}
|
|
6025
6043
|
function sleep(ms) {
|
|
6026
|
-
return new Promise((
|
|
6044
|
+
return new Promise((resolve6) => setTimeout(resolve6, ms));
|
|
6027
6045
|
}
|
|
6028
6046
|
function printLogo() {
|
|
6029
6047
|
const W = "\x1B[1;37m";
|
|
@@ -6039,9 +6057,9 @@ function printLogo() {
|
|
|
6039
6057
|
|
|
6040
6058
|
// src/cli.ts
|
|
6041
6059
|
import { createInterface as createInterface3 } from "readline/promises";
|
|
6042
|
-
import { realpathSync, writeFileSync as
|
|
6043
|
-
import { fileURLToPath } from "url";
|
|
6044
|
-
import { basename as basename3, dirname } from "path";
|
|
6060
|
+
import { realpathSync, writeFileSync as writeFileSync5 } from "fs";
|
|
6061
|
+
import { fileURLToPath as fileURLToPath2 } from "url";
|
|
6062
|
+
import { basename as basename3, dirname as dirname2 } from "path";
|
|
6045
6063
|
import WebSocket2 from "ws";
|
|
6046
6064
|
|
|
6047
6065
|
// ../secret-scanner/src/registry.ts
|
|
@@ -7741,8 +7759,8 @@ async function renderRemotionShareLink(embedId, client, ln) {
|
|
|
7741
7759
|
}
|
|
7742
7760
|
}
|
|
7743
7761
|
function generateQr(value) {
|
|
7744
|
-
return new Promise((
|
|
7745
|
-
qrcode2.generate(value, { small: true }, (qr) =>
|
|
7762
|
+
return new Promise((resolve6) => {
|
|
7763
|
+
qrcode2.generate(value, { small: true }, (qr) => resolve6(qr));
|
|
7746
7764
|
});
|
|
7747
7765
|
}
|
|
7748
7766
|
function remotionMeta(c) {
|
|
@@ -8597,9 +8615,9 @@ function exec(cmd, cwd) {
|
|
|
8597
8615
|
return execSync(cmd, { cwd, encoding: "utf-8", stdio: ["pipe", "pipe", "pipe"] }).trim();
|
|
8598
8616
|
}
|
|
8599
8617
|
function runInteractive(cmd, args, cwd) {
|
|
8600
|
-
return new Promise((
|
|
8618
|
+
return new Promise((resolve6, reject) => {
|
|
8601
8619
|
const child = nodeSpawn(cmd, args, { cwd, stdio: "inherit", shell: false });
|
|
8602
|
-
child.on("close", (code) =>
|
|
8620
|
+
child.on("close", (code) => resolve6(code ?? 1));
|
|
8603
8621
|
child.on("error", reject);
|
|
8604
8622
|
});
|
|
8605
8623
|
}
|
|
@@ -8860,10 +8878,10 @@ function warnIfMissingLlmCredentials(installPath) {
|
|
|
8860
8878
|
}
|
|
8861
8879
|
async function confirmDestructive(phrase) {
|
|
8862
8880
|
const rl = createInterface2({ input: process.stdin, output: process.stderr });
|
|
8863
|
-
return new Promise((
|
|
8881
|
+
return new Promise((resolve6) => {
|
|
8864
8882
|
rl.question(`Type "${phrase}" to confirm: `, (answer) => {
|
|
8865
8883
|
rl.close();
|
|
8866
|
-
|
|
8884
|
+
resolve6(answer.trim() === phrase);
|
|
8867
8885
|
});
|
|
8868
8886
|
});
|
|
8869
8887
|
}
|
|
@@ -27900,6 +27918,12 @@ Only output the final Markdown table. Do NOT include explanations, notes, or any
|
|
|
27900
27918
|
account_created: {
|
|
27901
27919
|
text: "Account created"
|
|
27902
27920
|
},
|
|
27921
|
+
account_created_second_login_title: {
|
|
27922
|
+
text: "Add a second login method"
|
|
27923
|
+
},
|
|
27924
|
+
account_created_second_login_info: {
|
|
27925
|
+
text: "If you signed up with a passkey, add password plus 2FA as a backup. If you signed up with password plus 2FA, add a passkey for faster secure login."
|
|
27926
|
+
},
|
|
27903
27927
|
password_security_reminder: {
|
|
27904
27928
|
subject: {
|
|
27905
27929
|
text: "Action needed to secure your OpenMates account"
|
|
@@ -28129,10 +28153,7 @@ Only output the final Markdown table. Do NOT include explanations, notes, or any
|
|
|
28129
28153
|
text: "Welcome to OpenMates!"
|
|
28130
28154
|
},
|
|
28131
28155
|
complete_signup_info: {
|
|
28132
|
-
text: "
|
|
28133
|
-
},
|
|
28134
|
-
auto_delete_warning: {
|
|
28135
|
-
text: "Please note: Accounts that haven't completed the signup process will be automatically deleted after 7 days."
|
|
28156
|
+
text: "Your account is ready. Here are a few helpful next steps to protect your access and keep a copy of your data."
|
|
28136
28157
|
},
|
|
28137
28158
|
want_to_delete_account: {
|
|
28138
28159
|
text: "Want to delete your account?"
|
|
@@ -29262,6 +29283,15 @@ Only output the final Markdown table. Do NOT include explanations, notes, or any
|
|
|
29262
29283
|
copy_failed: {
|
|
29263
29284
|
text: "Failed to copy to clipboard"
|
|
29264
29285
|
},
|
|
29286
|
+
code_file_downloaded: {
|
|
29287
|
+
text: "Code file downloaded successfully"
|
|
29288
|
+
},
|
|
29289
|
+
code_file_download_failed: {
|
|
29290
|
+
text: "Failed to download code file"
|
|
29291
|
+
},
|
|
29292
|
+
action_failed: {
|
|
29293
|
+
text: "Failed to perform action"
|
|
29294
|
+
},
|
|
29265
29295
|
download_itinerary: {
|
|
29266
29296
|
text: "Download itinerary"
|
|
29267
29297
|
},
|
|
@@ -30004,6 +30034,15 @@ Only output the final Markdown table. Do NOT include explanations, notes, or any
|
|
|
30004
30034
|
anonymous_terms_reminder: {
|
|
30005
30035
|
text: "By sending a message you accept the terms & privacy policy of OpenMates."
|
|
30006
30036
|
},
|
|
30037
|
+
anonymous_terms_reminder_prefix: {
|
|
30038
|
+
text: "By sending a message you accept the "
|
|
30039
|
+
},
|
|
30040
|
+
anonymous_terms_reminder_connector: {
|
|
30041
|
+
text: " & "
|
|
30042
|
+
},
|
|
30043
|
+
anonymous_terms_reminder_suffix: {
|
|
30044
|
+
text: " of OpenMates."
|
|
30045
|
+
},
|
|
30007
30046
|
send: {
|
|
30008
30047
|
text: "Send"
|
|
30009
30048
|
},
|
|
@@ -33661,10 +33700,10 @@ As of mid-2026, the severe supply shocks from the 2024\u20132025 avian flu have
|
|
|
33661
33700
|
text: "Account: Email address, username/display name, profile image, locale, and security settings (e.g., 2FA enabled). Email and username are encrypted with your key before storage. We also keep a separate server-side Vault-encrypted copy of your verified email address for mandatory account lifecycle notices, such as account verification, security alerts, and deletion reminders. Passwords are stored as salted hashes."
|
|
33662
33701
|
},
|
|
33663
33702
|
usage: {
|
|
33664
|
-
text: "Usage: Server logs, event timestamps, feature usage, error logs, and device recognition identifiers (hashed). IP addresses may be temporarily processed for security and
|
|
33703
|
+
text: "Usage: Server logs, event timestamps, feature usage, error logs, and device recognition identifiers (hashed). IP addresses may be temporarily processed for security, rate limiting, and anonymous free-usage budget checks. Anonymous free usage uses a first-party random local identifier and sends only a server-side HMAC-hashed form for per-identity abuse limits."
|
|
33665
33704
|
},
|
|
33666
33705
|
content: {
|
|
33667
|
-
text: "Content: Chat messages, prompts, attachments, and uploaded images/videos necessary to deliver the service (subject to moderation where applicable)."
|
|
33706
|
+
text: "Content: Chat messages, prompts, attachments, and uploaded images/videos necessary to deliver the service (subject to moderation where applicable). Anonymous free-usage chats stay local-only and encrypted with per-chat keys before signup; they are uploaded only if you sign up and promote them into account sync."
|
|
33668
33707
|
},
|
|
33669
33708
|
payments: {
|
|
33670
33709
|
text: "Payments: Payment method tokens, transaction IDs, billing address and VAT information as required for invoicing (processed primarily by Stripe). We do not store full card numbers."
|
|
@@ -34202,7 +34241,7 @@ As of mid-2026, the severe supply shocks from the 2024\u20132025 avian flu have
|
|
|
34202
34241
|
text: "Credits and Payments"
|
|
34203
34242
|
},
|
|
34204
34243
|
description: {
|
|
34205
|
-
text: "OpenMates uses a credit-based payment system. You purchase credit packs which are consumed when using AI services. Credits do not expire and remain in your account until used. Payment processing is handled by Stripe - see our Privacy Policy for details about payment data handling."
|
|
34244
|
+
text: "OpenMates uses a credit-based payment system. You purchase credit packs which are consumed when using AI services. Credits do not expire and remain in your account until used. Official-cloud anonymous free usage, when available, is a limited fair-use trial with shared daily/weekly caps and per-identity abuse limits; it is not an account credit balance and may be unavailable when the budget is exhausted. Payment processing is handled by Stripe - see our Privacy Policy for details about payment data handling."
|
|
34206
34245
|
},
|
|
34207
34246
|
refund: {
|
|
34208
34247
|
text: "Refund Policy: You may request a refund for unused credits within 14 days after purchase. Your right of withdrawal expires once credits are used. See our signup process for the full refund consent details."
|
|
@@ -38936,6 +38975,9 @@ As of mid-2026, the severe supply shocks from the 2024\u20132025 avian flu have
|
|
|
38936
38975
|
weekly_remaining: {
|
|
38937
38976
|
text: "Weekly remaining"
|
|
38938
38977
|
},
|
|
38978
|
+
monthly_remaining: {
|
|
38979
|
+
text: "Monthly remaining"
|
|
38980
|
+
},
|
|
38939
38981
|
reset_at: {
|
|
38940
38982
|
text: "Daily reset"
|
|
38941
38983
|
},
|
|
@@ -38968,6 +39010,9 @@ As of mid-2026, the severe supply shocks from the 2024\u20132025 avian flu have
|
|
|
38968
39010
|
},
|
|
38969
39011
|
validation_percent: {
|
|
38970
39012
|
text: "Percent values must be between 0 and 100."
|
|
39013
|
+
},
|
|
39014
|
+
validation_per_identity_cap: {
|
|
39015
|
+
text: "Per-identity daily cap must be at least 1 credit when the monthly budget is above 0."
|
|
38971
39016
|
}
|
|
38972
39017
|
},
|
|
38973
39018
|
tests: {
|
|
@@ -41487,6 +41532,995 @@ function buildAssistantFeedbackDecision(rating) {
|
|
|
41487
41532
|
};
|
|
41488
41533
|
}
|
|
41489
41534
|
|
|
41535
|
+
// src/benchmark.ts
|
|
41536
|
+
import { randomUUID as randomUUID3 } from "crypto";
|
|
41537
|
+
import { existsSync as existsSync6, mkdtempSync, readFileSync as readFileSync6, readdirSync, writeFileSync as writeFileSync4 } from "fs";
|
|
41538
|
+
import { tmpdir } from "os";
|
|
41539
|
+
import { dirname, join as join4, resolve as resolve5 } from "path";
|
|
41540
|
+
import { fileURLToPath } from "url";
|
|
41541
|
+
var DEFAULT_JUDGE_MODEL = "google/gemini-3-flash-preview";
|
|
41542
|
+
var DEFAULT_EXTENSIVE_SIZE = 10;
|
|
41543
|
+
var DEFAULT_PARALLEL = 4;
|
|
41544
|
+
var FIXTURE_IMAGE_SVG = `<?xml version="1.0" encoding="UTF-8"?>
|
|
41545
|
+
<svg xmlns="http://www.w3.org/2000/svg" width="1200" height="800" viewBox="0 0 1200 800">
|
|
41546
|
+
<rect width="1200" height="800" fill="#d8ecff"/>
|
|
41547
|
+
<rect y="560" width="1200" height="240" fill="#d7c39a"/>
|
|
41548
|
+
<text x="600" y="88" text-anchor="middle" font-family="Arial, sans-serif" font-size="44" font-weight="700" fill="#23344d">Brandenburger Tor, Berlin</text>
|
|
41549
|
+
<g transform="translate(160 170)" fill="#c9aa6a" stroke="#5d4522" stroke-width="8">
|
|
41550
|
+
<rect x="80" y="160" width="800" height="58"/>
|
|
41551
|
+
<rect x="120" y="218" width="720" height="48"/>
|
|
41552
|
+
<rect x="150" y="266" width="660" height="42"/>
|
|
41553
|
+
<g fill="#d9bd7d">
|
|
41554
|
+
<rect x="170" y="308" width="54" height="250"/>
|
|
41555
|
+
<rect x="285" y="308" width="54" height="250"/>
|
|
41556
|
+
<rect x="400" y="308" width="54" height="250"/>
|
|
41557
|
+
<rect x="515" y="308" width="54" height="250"/>
|
|
41558
|
+
<rect x="630" y="308" width="54" height="250"/>
|
|
41559
|
+
<rect x="745" y="308" width="54" height="250"/>
|
|
41560
|
+
</g>
|
|
41561
|
+
<rect x="130" y="558" width="700" height="50"/>
|
|
41562
|
+
<path d="M480 30 C530 72 620 88 682 48 L720 84 C652 142 530 124 456 78 Z" fill="#3e6f5f"/>
|
|
41563
|
+
<circle cx="510" cy="92" r="22" fill="#3e6f5f"/>
|
|
41564
|
+
<circle cx="625" cy="92" r="22" fill="#3e6f5f"/>
|
|
41565
|
+
<path d="M565 38 l26 78 h-52 z" fill="#3e6f5f"/>
|
|
41566
|
+
</g>
|
|
41567
|
+
<text x="600" y="740" text-anchor="middle" font-family="Arial, sans-serif" font-size="32" fill="#23344d">Neoclassical gate with Quadriga on top</text>
|
|
41568
|
+
</svg>
|
|
41569
|
+
`;
|
|
41570
|
+
var QUICK_CASES = [
|
|
41571
|
+
{
|
|
41572
|
+
id: "quick-exact-token",
|
|
41573
|
+
suite: "quick",
|
|
41574
|
+
title: "Exact token smoke test",
|
|
41575
|
+
prompt: "Reply with exactly this token and no extra text: BENCHMARK_SMOKE_OK",
|
|
41576
|
+
complexity: "basic",
|
|
41577
|
+
category: "smoke",
|
|
41578
|
+
expectedIncludes: "BENCHMARK_SMOKE_OK",
|
|
41579
|
+
judge: true,
|
|
41580
|
+
estimatedInputTokens: 12e3,
|
|
41581
|
+
estimatedOutputTokens: 64
|
|
41582
|
+
},
|
|
41583
|
+
{
|
|
41584
|
+
id: "quick-arithmetic",
|
|
41585
|
+
suite: "quick",
|
|
41586
|
+
title: "Arithmetic direct answer",
|
|
41587
|
+
prompt: "Compute 19 * 23. Reply with only the integer result.",
|
|
41588
|
+
complexity: "basic",
|
|
41589
|
+
category: "math",
|
|
41590
|
+
expectedIncludes: "437",
|
|
41591
|
+
judge: true,
|
|
41592
|
+
estimatedInputTokens: 12e3,
|
|
41593
|
+
estimatedOutputTokens: 64
|
|
41594
|
+
},
|
|
41595
|
+
{
|
|
41596
|
+
id: "quick-code",
|
|
41597
|
+
suite: "quick",
|
|
41598
|
+
title: "Small code generation",
|
|
41599
|
+
prompt: "Write a TypeScript function isPalindrome(input: string): boolean that ignores spaces, punctuation, and case. Include only the function and one short usage example.",
|
|
41600
|
+
complexity: "medium",
|
|
41601
|
+
category: "coding",
|
|
41602
|
+
judge: true,
|
|
41603
|
+
estimatedInputTokens: 12200,
|
|
41604
|
+
estimatedOutputTokens: 650
|
|
41605
|
+
},
|
|
41606
|
+
{
|
|
41607
|
+
id: "quick-image-brandenburger-tor",
|
|
41608
|
+
suite: "quick",
|
|
41609
|
+
title: "Default image understanding",
|
|
41610
|
+
prompt: "Look at the attached image. What landmark is shown, when was it built, and who designed it? Answer in three concise bullet points.",
|
|
41611
|
+
complexity: "medium",
|
|
41612
|
+
category: "image",
|
|
41613
|
+
image: "default",
|
|
41614
|
+
expectedIncludes: "Brandenburg",
|
|
41615
|
+
judge: true,
|
|
41616
|
+
estimatedInputTokens: 13500,
|
|
41617
|
+
estimatedOutputTokens: 350
|
|
41618
|
+
},
|
|
41619
|
+
{
|
|
41620
|
+
id: "quick-followup-continuity",
|
|
41621
|
+
suite: "quick",
|
|
41622
|
+
title: "Short multi-turn continuity",
|
|
41623
|
+
prompt: "Create a three-step plan for evaluating whether a new AI model is ready for production use.",
|
|
41624
|
+
complexity: "medium",
|
|
41625
|
+
category: "multi_turn",
|
|
41626
|
+
judge: true,
|
|
41627
|
+
estimatedInputTokens: 14e3,
|
|
41628
|
+
estimatedOutputTokens: 900,
|
|
41629
|
+
followUps: [
|
|
41630
|
+
{ prompt: "Now make step 2 more concrete with two measurable checks." },
|
|
41631
|
+
{ prompt: "Summarize the final plan in one sentence." }
|
|
41632
|
+
]
|
|
41633
|
+
}
|
|
41634
|
+
];
|
|
41635
|
+
var EXTENSIVE_CASES = [
|
|
41636
|
+
...QUICK_CASES,
|
|
41637
|
+
{
|
|
41638
|
+
id: "extensive-coding-debug",
|
|
41639
|
+
suite: "extensive",
|
|
41640
|
+
title: "Debug a JavaScript bug",
|
|
41641
|
+
prompt: "A JavaScript function returns NaN when summing prices from [{price: '12.50'}, {price: undefined}]. Explain the bug and write a corrected function.",
|
|
41642
|
+
complexity: "medium",
|
|
41643
|
+
category: "coding",
|
|
41644
|
+
judge: true,
|
|
41645
|
+
estimatedInputTokens: 12300,
|
|
41646
|
+
estimatedOutputTokens: 850
|
|
41647
|
+
},
|
|
41648
|
+
{
|
|
41649
|
+
id: "extensive-coding-api-design",
|
|
41650
|
+
suite: "extensive",
|
|
41651
|
+
title: "Design a small API contract",
|
|
41652
|
+
prompt: "Design a minimal JSON API for creating and listing benchmark runs. Include request/response examples and one validation error.",
|
|
41653
|
+
complexity: "advanced",
|
|
41654
|
+
category: "coding",
|
|
41655
|
+
judge: true,
|
|
41656
|
+
estimatedInputTokens: 12300,
|
|
41657
|
+
estimatedOutputTokens: 1e3
|
|
41658
|
+
},
|
|
41659
|
+
{
|
|
41660
|
+
id: "extensive-reasoning-tradeoffs",
|
|
41661
|
+
suite: "extensive",
|
|
41662
|
+
title: "Reason about benchmark tradeoffs",
|
|
41663
|
+
prompt: "Compare deterministic assertions and LLM-as-judge evaluation for model benchmarks. Give two strengths and two risks for each.",
|
|
41664
|
+
complexity: "medium",
|
|
41665
|
+
category: "reasoning",
|
|
41666
|
+
judge: true,
|
|
41667
|
+
estimatedInputTokens: 12200,
|
|
41668
|
+
estimatedOutputTokens: 800
|
|
41669
|
+
},
|
|
41670
|
+
{
|
|
41671
|
+
id: "extensive-planning",
|
|
41672
|
+
suite: "extensive",
|
|
41673
|
+
title: "Operational rollout plan",
|
|
41674
|
+
prompt: "Create a rollout checklist for switching a production chatbot from one model to another. Include monitoring, rollback, and user-visible risk checks.",
|
|
41675
|
+
complexity: "advanced",
|
|
41676
|
+
category: "synthesis",
|
|
41677
|
+
judge: true,
|
|
41678
|
+
estimatedInputTokens: 12300,
|
|
41679
|
+
estimatedOutputTokens: 950
|
|
41680
|
+
},
|
|
41681
|
+
{
|
|
41682
|
+
id: "extensive-long-context-followup",
|
|
41683
|
+
suite: "extensive",
|
|
41684
|
+
title: "Prebuilt 20-message long chat follow-up",
|
|
41685
|
+
prompt: "Based on the earlier discussion, choose the best launch strategy and explain why in five bullets.",
|
|
41686
|
+
complexity: "advanced",
|
|
41687
|
+
category: "long_context",
|
|
41688
|
+
longContext: true,
|
|
41689
|
+
judge: true,
|
|
41690
|
+
estimatedInputTokens: 18500,
|
|
41691
|
+
estimatedOutputTokens: 900
|
|
41692
|
+
},
|
|
41693
|
+
{
|
|
41694
|
+
id: "extensive-policy-summary",
|
|
41695
|
+
suite: "extensive",
|
|
41696
|
+
title: "Policy summarization",
|
|
41697
|
+
prompt: "Summarize why privacy-preserving benchmark logs should avoid raw user prompts. Include a concrete safer alternative.",
|
|
41698
|
+
complexity: "medium",
|
|
41699
|
+
category: "reasoning",
|
|
41700
|
+
judge: true,
|
|
41701
|
+
estimatedInputTokens: 12200,
|
|
41702
|
+
estimatedOutputTokens: 650
|
|
41703
|
+
},
|
|
41704
|
+
{
|
|
41705
|
+
id: "extensive-structured-output",
|
|
41706
|
+
suite: "extensive",
|
|
41707
|
+
title: "Structured JSON output",
|
|
41708
|
+
prompt: "Return only JSON with keys risk, mitigation, and confidence for the risk: benchmark results are biased by prompt wording.",
|
|
41709
|
+
complexity: "medium",
|
|
41710
|
+
category: "synthesis",
|
|
41711
|
+
judge: true,
|
|
41712
|
+
estimatedInputTokens: 12200,
|
|
41713
|
+
estimatedOutputTokens: 350
|
|
41714
|
+
},
|
|
41715
|
+
{
|
|
41716
|
+
id: "extensive-creative-constraint",
|
|
41717
|
+
suite: "extensive",
|
|
41718
|
+
title: "Creative constrained response",
|
|
41719
|
+
prompt: "Write a six-line product note announcing model comparisons. Each line must be under 70 characters and avoid hype words like revolutionary or magical.",
|
|
41720
|
+
complexity: "medium",
|
|
41721
|
+
category: "synthesis",
|
|
41722
|
+
judge: true,
|
|
41723
|
+
estimatedInputTokens: 12200,
|
|
41724
|
+
estimatedOutputTokens: 500
|
|
41725
|
+
},
|
|
41726
|
+
{
|
|
41727
|
+
id: "extensive-data-reasoning",
|
|
41728
|
+
suite: "extensive",
|
|
41729
|
+
title: "Interpret metrics",
|
|
41730
|
+
prompt: "A benchmark has pass rates 8/10, 7/10, and 9/10 across three runs. Explain what you can and cannot conclude from this sample.",
|
|
41731
|
+
complexity: "medium",
|
|
41732
|
+
category: "reasoning",
|
|
41733
|
+
judge: true,
|
|
41734
|
+
estimatedInputTokens: 12200,
|
|
41735
|
+
estimatedOutputTokens: 600
|
|
41736
|
+
},
|
|
41737
|
+
{
|
|
41738
|
+
id: "extensive-security-review",
|
|
41739
|
+
suite: "extensive",
|
|
41740
|
+
title: "Security review",
|
|
41741
|
+
prompt: "Review this benchmark design for security risks: it logs prompts, outputs, model ids, and usage costs to a shared file. List risks and safer defaults.",
|
|
41742
|
+
complexity: "advanced",
|
|
41743
|
+
category: "reasoning",
|
|
41744
|
+
judge: true,
|
|
41745
|
+
estimatedInputTokens: 12300,
|
|
41746
|
+
estimatedOutputTokens: 850
|
|
41747
|
+
},
|
|
41748
|
+
{
|
|
41749
|
+
id: "extensive-followup-requirements",
|
|
41750
|
+
suite: "extensive",
|
|
41751
|
+
title: "Three-turn requirements refinement",
|
|
41752
|
+
prompt: "Draft acceptance criteria for a CLI benchmark comparison feature.",
|
|
41753
|
+
complexity: "advanced",
|
|
41754
|
+
category: "multi_turn",
|
|
41755
|
+
judge: true,
|
|
41756
|
+
estimatedInputTokens: 14500,
|
|
41757
|
+
estimatedOutputTokens: 1100,
|
|
41758
|
+
followUps: [
|
|
41759
|
+
{ prompt: "Add one criterion about cost estimation before live runs." },
|
|
41760
|
+
{ prompt: "Add one criterion about partial results after interruption." },
|
|
41761
|
+
{ prompt: "Now compress the criteria to five bullets total." }
|
|
41762
|
+
]
|
|
41763
|
+
},
|
|
41764
|
+
{
|
|
41765
|
+
id: "extensive-coding-tests",
|
|
41766
|
+
suite: "extensive",
|
|
41767
|
+
title: "Write tests for parser behavior",
|
|
41768
|
+
prompt: "Write Node.js test cases for a function parseSuites(value) that accepts quick, extensive, all, and comma-separated lists, and rejects unknown suites.",
|
|
41769
|
+
complexity: "medium",
|
|
41770
|
+
category: "coding",
|
|
41771
|
+
judge: true,
|
|
41772
|
+
estimatedInputTokens: 12300,
|
|
41773
|
+
estimatedOutputTokens: 950
|
|
41774
|
+
},
|
|
41775
|
+
{
|
|
41776
|
+
id: "extensive-coding-refactor",
|
|
41777
|
+
suite: "extensive",
|
|
41778
|
+
title: "Refactor duplicated code",
|
|
41779
|
+
prompt: "Given two duplicated TypeScript loops that build arrays of result objects, explain when to extract a helper and write the helper signature.",
|
|
41780
|
+
complexity: "medium",
|
|
41781
|
+
category: "coding",
|
|
41782
|
+
judge: true,
|
|
41783
|
+
estimatedInputTokens: 12300,
|
|
41784
|
+
estimatedOutputTokens: 750
|
|
41785
|
+
},
|
|
41786
|
+
{
|
|
41787
|
+
id: "extensive-comparison-analysis",
|
|
41788
|
+
suite: "extensive",
|
|
41789
|
+
title: "Compare two model outputs",
|
|
41790
|
+
prompt: "Explain how you would compare two model outputs when one is concise but misses caveats and the other is verbose but complete.",
|
|
41791
|
+
complexity: "medium",
|
|
41792
|
+
category: "reasoning",
|
|
41793
|
+
judge: true,
|
|
41794
|
+
estimatedInputTokens: 12200,
|
|
41795
|
+
estimatedOutputTokens: 650
|
|
41796
|
+
},
|
|
41797
|
+
{
|
|
41798
|
+
id: "extensive-failure-mode",
|
|
41799
|
+
suite: "extensive",
|
|
41800
|
+
title: "Failure-mode analysis",
|
|
41801
|
+
prompt: "List five failure modes for image-understanding benchmarks and one mitigation for each.",
|
|
41802
|
+
complexity: "advanced",
|
|
41803
|
+
category: "image",
|
|
41804
|
+
judge: true,
|
|
41805
|
+
estimatedInputTokens: 12300,
|
|
41806
|
+
estimatedOutputTokens: 900
|
|
41807
|
+
}
|
|
41808
|
+
];
|
|
41809
|
+
/**
 * Entry point for the `benchmark` CLI command.
 *
 * Prints help when no subcommand is given (or `help` / `--help` is requested);
 * otherwise only the `model` subcommand is accepted. Validates the arguments,
 * builds the case plan and credit estimate, then either writes the plan
 * (`--dry-run`) or executes every (case, model) job through `runPool`.
 *
 * @param {object} client - Must expose `hasSession()`; forwarded to `runCaseJob`.
 * @param {string|undefined} subcommand - Benchmark subcommand.
 * @param {string[]} rest - Remaining arguments; entries not starting with `--` are target models.
 * @param {object} flags - Parsed flag map.
 * @returns {Promise<void>}
 * @throws {Error} On unknown subcommand, invalid arguments, missing spend
 *   confirmation, or a missing session for a live run.
 */
async function handleBenchmark(client, subcommand, rest, flags) {
  const wantsHelp = !subcommand || subcommand === "help" || flags.help === true;
  if (wantsHelp) {
    printBenchmarkHelp();
    return;
  }
  if (subcommand !== "model") {
    throw new Error(`Unknown benchmark command '${subcommand}'. Run 'openmates benchmark --help'.`);
  }

  const targetModels = rest.filter((arg) => !arg.startsWith("--"));
  if (targetModels.length === 0) {
    throw new Error("Missing target model. Usage: openmates benchmark model <provider/model> [model-b] --confirm-spend-credits");
  }
  const compare = flags.compare === true;
  if (!compare && targetModels.length > 1) {
    throw new Error("Multiple target models require --compare.");
  }
  if (compare && targetModels.length < 2) {
    throw new Error("--compare requires at least two target models.");
  }

  // Only string-valued flags count as "provided"; anything else falls back to a default.
  const stringFlag = (name) => (typeof flags[name] === "string" ? flags[name] : void 0);

  const judgeModel = stringFlag("judge-model") ?? DEFAULT_JUDGE_MODEL;
  const suites = parseSuites(flags.suite);
  const runs = parseRuns(flags.runs);
  const extensiveSize = parseExtensiveSize(flags["extensive-size"]);
  const parallel = parseParallel(flags.parallel);
  const dryRun = flags["dry-run"] === true;
  const output = stringFlag("output");
  const runId = stringFlag("run-id") ?? randomUUID3();
  const imageFlag = stringFlag("image");
  const imagePath = imageFlag === void 0 ? defaultImageFixturePath() : resolve5(imageFlag);

  // Live runs cost credits, so they need an explicit opt-in.
  if (!dryRun && flags["confirm-spend-credits"] !== true) {
    throw new Error(
      "Benchmark runs spend real credits from the logged-in account. Rerun with --confirm-spend-credits, or use --dry-run to preview the plan."
    );
  }

  const cases = expandCases(suites, runs, extensiveSize);
  const pricing = loadPricingForModels([...targetModels, judgeModel]);
  const estimate = estimateCredits(cases, targetModels, judgeModel, pricing);
  const totalJobs = cases.length * targetModels.length;
  const result = makeBaseResult({
    runId,
    targetModels,
    judgeModel,
    suites,
    runs,
    compare,
    parallel,
    extensiveSize,
    dryRun,
    estimate,
    totalJobs
  });

  if (dryRun) {
    writeBenchmarkResult(result, flags, output);
    return;
  }
  if (!client.hasSession()) {
    throw new Error("Benchmark runs require login. Run 'openmates login' first.");
  }

  // The first SIGINT only raises a flag: jobs not yet started are skipped and
  // a partial result is still written below.
  let interrupted = false;
  const onInterrupt = () => {
    interrupted = true;
  };
  process.once("SIGINT", onInterrupt);
  try {
    const jobs = cases.flatMap((benchmarkCase) => targetModels.map((model) => ({ model, benchmarkCase })));
    await runPool(jobs, parallel, async (job) => {
      if (interrupted) return;
      const caseResult = await runCaseJob({ client, job, judgeModel, runId, imagePath });
      result.cases.push(caseResult);
      recomputeResult(result, jobs.length, interrupted);
    });
  } finally {
    process.off("SIGINT", onInterrupt);
  }

  recomputeResult(result, totalJobs, interrupted);
  writeBenchmarkResult(result, flags, output);
}
|
|
41884
|
+
/**
 * Prints usage information for the `benchmark` command to stdout.
 * Default values shown in the text come from the module-level DEFAULT_* constants.
 */
function printBenchmarkHelp() {
  const helpLines = [
    "Benchmark commands:",
    " openmates benchmark model <provider/model> [provider/model...] --confirm-spend-credits [--compare] [--suite quick|extensive|all] [--json]",
    "",
    "Runs real incognito chat requests through the OpenMates product path. Live runs",
    "spend the logged-in user's credits and usage entries are grouped as benchmark spend.",
    "",
    "Options:",
    " --confirm-spend-credits Required for live benchmark runs",
    " --dry-run Preview the benchmark plan without inference or spend",
    " --compare Compare two or more target models",
    " --suite <list> Comma-separated suites: quick, extensive, all (default: quick)",
    ` --extensive-size <n> Extensive cases to run: 5, 10, or 20 (default: ${DEFAULT_EXTENSIVE_SIZE})`,
    " --runs <n> Repeat each selected case (default: 1)",
    ` --parallel <n> Concurrent target case requests (default: ${DEFAULT_PARALLEL})`,
    ` --judge-model <provider/model> Judge for evaluated cases (default: ${DEFAULT_JUDGE_MODEL})`,
    " --image <path> Override default Brandenburger Tor image fixture",
    " --run-id <id> Reuse a benchmark run id for grouping",
    " --output <path> Save JSON result to a file",
    " --json Print JSON result"
  ];
  console.log(helpLines.join("\n"));
}
|
|
41905
|
+
/**
 * Resolves the `--suite` flag into a de-duplicated list of suite names.
 *
 * @param {string|boolean|undefined} value - Raw flag value.
 * @returns {string[]} `["quick"]` when the flag is absent, both suites when
 *   `all` is listed, otherwise the requested suites in first-seen order.
 * @throws {Error} When the flag has no value, lists an unknown suite, or is empty.
 */
function parseSuites(value) {
  if (value === void 0 || value === false) return ["quick"];
  if (value === true) throw new Error("--suite requires a value");

  const requested = [];
  for (const part of value.split(",")) {
    const name = part.trim();
    if (name) requested.push(name);
  }
  // "all" wins over anything else that was listed alongside it.
  if (requested.includes("all")) return ["quick", "extensive"];

  const known = ["quick", "extensive"];
  const allKnown = requested.every((name) => known.includes(name));
  if (!allKnown || requested.length === 0) {
    throw new Error("Invalid --suite. Use quick, extensive, or all.");
  }
  return Array.from(new Set(requested));
}
|
|
41917
|
+
/**
 * Parses the `--runs` flag (how many times each selected case is repeated).
 *
 * The value must be a plain decimal integer. `Number.parseInt` on its own
 * accepts trailing garbage ("3abc" -> 3) and truncates fractions ("2.5" -> 2),
 * so the text is validated before it is converted.
 *
 * @param {string|number|boolean|undefined} value - Raw flag value.
 * @returns {number} Integer in [1, 20]; 1 when the flag is absent.
 * @throws {Error} When the flag has no value or is not an integer from 1 to 20.
 */
function parseRuns(value) {
  if (value === void 0 || value === false) return 1;
  if (value === true) throw new Error("--runs requires a value");
  const text = String(value).trim();
  const parsed = /^\+?\d+$/.test(text) ? Number.parseInt(text, 10) : Number.NaN;
  if (!Number.isInteger(parsed) || parsed < 1 || parsed > 20) {
    throw new Error("--runs must be an integer from 1 to 20");
  }
  return parsed;
}
|
|
41926
|
+
/**
 * Parses the `--extensive-size` flag (number of extensive cases to run).
 *
 * The value must be a plain decimal integer; `Number.parseInt` alone would
 * accept inputs such as "10abc" or "5.9", so the text is validated first.
 *
 * @param {string|number|boolean|undefined} value - Raw flag value.
 * @returns {number} 5, 10 or 20; DEFAULT_EXTENSIVE_SIZE when the flag is absent.
 * @throws {Error} When the flag has no value or is not one of 5, 10, 20.
 */
function parseExtensiveSize(value) {
  if (value === void 0 || value === false) return DEFAULT_EXTENSIVE_SIZE;
  if (value === true) throw new Error("--extensive-size requires a value");
  const text = String(value).trim();
  const parsed = /^\+?\d+$/.test(text) ? Number.parseInt(text, 10) : Number.NaN;
  if (![5, 10, 20].includes(parsed)) {
    throw new Error("--extensive-size must be 5, 10, or 20");
  }
  return parsed;
}
|
|
41935
|
+
/**
 * Parses the `--parallel` flag (number of concurrent target case requests).
 *
 * The value must be a plain decimal integer; `Number.parseInt` alone would
 * accept inputs such as "4x" or "1.5", so the text is validated first.
 *
 * @param {string|number|boolean|undefined} value - Raw flag value.
 * @returns {number} Integer in [1, 20]; DEFAULT_PARALLEL when the flag is absent.
 * @throws {Error} When the flag has no value or is not an integer from 1 to 20.
 */
function parseParallel(value) {
  if (value === void 0 || value === false) return DEFAULT_PARALLEL;
  if (value === true) throw new Error("--parallel requires a value");
  const text = String(value).trim();
  const parsed = /^\+?\d+$/.test(text) ? Number.parseInt(text, 10) : Number.NaN;
  if (!Number.isInteger(parsed) || parsed < 1 || parsed > 20) {
    throw new Error("--parallel must be an integer from 1 to 20");
  }
  return parsed;
}
|
|
41944
|
+
/**
 * Builds the full list of benchmark cases to execute.
 *
 * Collects the cases of every requested suite, removes duplicate ids, then
 * repeats the unique list once per run, tagging each copy with a 1-based `run`.
 *
 * @param {string[]} suites - Suite names ("quick" and/or "extensive").
 * @param {number} runs - Number of repetitions.
 * @param {number} extensiveSize - Number of extensive cases to select.
 * @returns {object[]} Copies of the cases, ordered run by run.
 */
function expandCases(suites, runs, extensiveSize) {
  const pool = [
    ...(suites.includes("quick") ? QUICK_CASES : []),
    ...(suites.includes("extensive") ? selectExtensiveCases(extensiveSize) : [])
  ];
  const unique = dedupeCases(pool);
  return Array.from({ length: runs }, (_, runIndex) =>
    unique.map((benchmarkCase) => ({ ...benchmarkCase, run: runIndex + 1 }))
  ).flat();
}
|
|
41955
|
+
/**
 * Picks the first `size` unique extensive cases, ensuring that at least 15%
 * of them (rounded up) are coding cases.
 *
 * When the initial slice has too few coding cases, coding cases that were not
 * selected replace non-coding cases starting from the end of the selection,
 * until the quota is met or no replacement is possible.
 *
 * @param {number} size - Number of extensive cases to select.
 * @returns {object[]} Selected cases.
 */
function selectExtensiveCases(size) {
  const isCoding = (benchmarkCase) => benchmarkCase.category === "coding";
  const countCoding = (list) => list.filter(isCoding).length;

  const picked = dedupeCases(EXTENSIVE_CASES).slice(0, size);
  const requiredCoding = Math.ceil(size * 0.15);
  if (countCoding(picked) >= requiredCoding) return picked;

  const pickedIds = new Set(picked.map((benchmarkCase) => benchmarkCase.id));
  const spares = EXTENSIVE_CASES.filter(
    (benchmarkCase) => isCoding(benchmarkCase) && !pickedIds.has(benchmarkCase.id)
  );

  const result = picked.slice();
  for (const spare of spares) {
    // Walk backwards to the last slot that does not already hold a coding case.
    let slot = result.length - 1;
    while (slot >= 0 && result[slot]?.category === "coding") slot -= 1;
    if (slot < 0) break;
    result[slot] = spare;
    if (countCoding(result) >= requiredCoding) break;
  }
  return result;
}
|
|
41979
|
+
/**
 * Removes cases whose `id` has already been seen, keeping the first occurrence
 * and the original order.
 *
 * @param {Iterable<{id: *}>} cases - Cases to filter.
 * @returns {object[]} New array with one entry per id.
 */
function dedupeCases(cases) {
  const firstById = /* @__PURE__ */ new Map();
  for (const benchmarkCase of cases) {
    if (!firstById.has(benchmarkCase.id)) firstById.set(benchmarkCase.id, benchmarkCase);
  }
  return [...firstById.values()];
}
|
|
41989
|
+
/**
 * Executes one benchmark case against one target model.
 *
 * Sends the initial prompt, then each follow-up in order within the same chat,
 * recording every turn and appending it to the running message history. When
 * the case is marked `judge`, the finished conversation is scored and the case
 * only passes if the score is at least 4 and the `expectedIncludes` check held.
 *
 * Any error is caught and reported as a failed case result carrying the turns
 * collected so far and an `error` message; this function does not reject.
 *
 * @param {object} params - `{ client, job: { model, benchmarkCase }, judgeModel, runId, imagePath }`.
 * @returns {Promise<object>} The case result.
 */
async function runCaseJob(params) {
  const { client, job, judgeModel, runId, imagePath } = params;
  const { model, benchmarkCase } = job;
  const startedAt = Date.now();
  const turns = [];
  const history = benchmarkCase.longContext ? buildLongContextHistory() : [];
  let chatId;

  // Fields shared by the success and the failure result, in output order.
  const identity = {
    id: benchmarkCase.id,
    suite: benchmarkCase.suite,
    title: benchmarkCase.title,
    model,
    run: benchmarkCase.run,
    complexity: benchmarkCase.complexity,
    category: benchmarkCase.category,
    prompt: benchmarkCase.prompt
  };
  const turnContext = { client, model, judgeModel, runId, benchmarkCase, history };

  // Records a finished turn and returns the chat id to reuse for the next one.
  const recordTurn = (response, userText) => {
    const nextChatId = response.chatId;
    turns.push(response.turn);
    appendHistory(history, "user", userText);
    appendHistory(history, "assistant", response.turn.assistant);
    return nextChatId;
  };

  try {
    const initialPrompt = await buildPromptWithAttachments(client, benchmarkCase, model, imagePath);
    const firstResponse = await sendBenchmarkTurn({
      ...turnContext,
      prompt: initialPrompt.message,
      chatId,
      preparedEmbeds: initialPrompt.embeds,
      caseId: benchmarkCase.id
    });
    chatId = recordTurn(firstResponse, initialPrompt.message);

    const followUps = benchmarkCase.followUps ?? [];
    for (let position = 0; position < followUps.length; position += 1) {
      const followUpResponse = await sendBenchmarkTurn({
        ...turnContext,
        prompt: `${modelMention(model)} ${followUps[position].prompt}`,
        chatId,
        caseId: `${benchmarkCase.id}:followup-${position + 1}`
      });
      chatId = recordTurn(followUpResponse, followUpResponse.rawPrompt);
    }

    const lastTurn = turns.at(-1);
    const assistant = lastTurn?.assistant ?? "";
    const caseResult = {
      ...identity,
      assistant,
      modelName: lastTurn?.modelName ?? null,
      passed: benchmarkCase.expectedIncludes ? assistant.includes(benchmarkCase.expectedIncludes) : true,
      durationMs: Date.now() - startedAt,
      expectedIncludes: benchmarkCase.expectedIncludes,
      turns
    };
    if (benchmarkCase.judge) {
      caseResult.judge = await judgeCase({ client, judgeModel, targetModel: model, benchmarkCase, caseResult, runId });
      const { score } = caseResult.judge;
      caseResult.passed = score !== null && score >= 4 && caseResult.passed;
    }
    return caseResult;
  } catch (error) {
    const lastTurn = turns.at(-1);
    return {
      ...identity,
      assistant: lastTurn?.assistant ?? "",
      modelName: lastTurn?.modelName ?? null,
      passed: false,
      durationMs: Date.now() - startedAt,
      expectedIncludes: benchmarkCase.expectedIncludes,
      turns,
      error: error instanceof Error ? error.message : String(error)
    };
  }
}
|
|
42074
|
+
/**
 * Sends a single benchmark message through `client.sendMessage` as an
 * incognito request tagged with benchmark metadata, and times it.
 *
 * @param {object} params - `{ client, model, judgeModel, runId, benchmarkCase,
 *   prompt, chatId, history, preparedEmbeds, caseId }`.
 * @returns {Promise<{chatId: *, rawPrompt: string, turn: object}>} The chat id
 *   reported by the client, the prompt as sent, and the recorded turn.
 */
async function sendBenchmarkTurn(params) {
  const { client, model, judgeModel, runId, benchmarkCase, prompt, chatId, history, preparedEmbeds, caseId } = params;
  const startedAt = Date.now();
  const request = {
    message: prompt,
    chatId,
    incognito: true,
    autoApproveSubChats: true,
    benchmarkMetadata: benchmarkMetadata({
      runId,
      suite: benchmarkCase.suite,
      caseId,
      targetModel: model,
      judgeModel
    }),
    messageHistory: history,
    preparedEmbeds,
    precollectResponse: true
  };
  const response = await client.sendMessage(request);
  const turn = {
    prompt,
    assistant: response.assistant,
    modelName: response.modelName,
    durationMs: Date.now() - startedAt
  };
  return { chatId: response.chatId, rawPrompt: prompt, turn };
}
|
|
42103
|
+
/**
 * Builds the initial message for a case: the model mention followed by the
 * case prompt. For cases with `image === "default"`, the prepared image's
 * reference block is appended after a blank line and its embeds are returned.
 *
 * @param {object} client - Forwarded to `prepareImageAttachment`.
 * @param {object} benchmarkCase - Case definition (`prompt`, optional `image`).
 * @param {string} model - Target model id.
 * @param {string} imagePath - Path of the image to attach.
 * @returns {Promise<{message: string, embeds?: object[]}>}
 */
async function buildPromptWithAttachments(client, benchmarkCase, model, imagePath) {
  const baseMessage = `${modelMention(model)} ${benchmarkCase.prompt}`;
  const needsImage = benchmarkCase.image === "default";
  if (!needsImage) return { message: baseMessage };

  const { messageSuffix, embeds } = await prepareImageAttachment(client, imagePath);
  return { message: `${baseMessage}\n\n${messageSuffix}`, embeds };
}
|
|
42111
|
+
/**
 * Turns a local image file into a message attachment.
 *
 * Runs the file through `processFiles`, uploads it when the resulting embed
 * requires an upload and has a local path, and returns the embed together
 * with its reference block.
 *
 * @param {object} client - Forwarded to `uploadBenchmarkImage`.
 * @param {string} imagePath - Path of the image file.
 * @returns {Promise<{messageSuffix: string, embeds: object[]}>}
 * @throws {Error} When the file does not exist or processing yields no usable embed.
 */
async function prepareImageAttachment(client, imagePath) {
  if (!existsSync6(imagePath)) throw new Error(`Benchmark image not found: ${imagePath}`);

  const processed = processFiles([imagePath], null);
  const unusable = processed.blocked.length > 0 || processed.errors.length > 0 || processed.embeds.length === 0;
  if (unusable) {
    const reason = [...processed.blocked, ...processed.errors].map((entry) => entry.error).join("; ") || "no image embed produced";
    throw new Error(`Failed to prepare benchmark image: ${reason}`);
  }

  const fileEmbed = processed.embeds[0];
  if (fileEmbed.requiresUpload && fileEmbed.localPath) {
    // The upload rewrites the embed and its reference block in place.
    await uploadBenchmarkImage(client, fileEmbed);
  }
  return { messageSuffix: fileEmbed.referenceBlock, embeds: [fileEmbed.embed] };
}
|
|
42125
|
+
/**
 * Uploads the local file behind `fileEmbed` and rewrites the embed in place
 * with the upload result: encoded content, status, content hash, embed id,
 * embed reference and a fresh reference block.
 *
 * Does nothing when the embed has no local path.
 *
 * @param {object} client - Must expose `getSession()`.
 * @param {object} fileEmbed - Processed file embed; mutated.
 * @returns {Promise<void>}
 */
async function uploadBenchmarkImage(client, fileEmbed) {
  if (!fileEmbed.localPath) return;

  const uploadResult = await uploadFile(fileEmbed.localPath, client.getSession());
  const { embed } = fileEmbed;
  // Keep an existing reference; otherwise derive one from the uploaded embed id.
  const embedRef = embed.embedRef ?? `benchmark-image-${uploadResult.embed_id.slice(0, 8)}`;

  embed.embedRef = embedRef;
  embed.content = toonEncodeContent({
    type: "image",
    app_id: "images",
    skill_id: "upload",
    status: "finished",
    filename: fileEmbed.displayName,
    embed_ref: embedRef,
    content_hash: uploadResult.content_hash,
    s3_base_url: uploadResult.s3_base_url,
    files: uploadResult.files,
    aes_key: uploadResult.aes_key,
    aes_nonce: uploadResult.aes_nonce,
    vault_wrapped_aes_key: uploadResult.vault_wrapped_aes_key,
    ai_detection: uploadResult.ai_detection
  });
  embed.status = "finished";
  embed.contentHash = uploadResult.content_hash;
  embed.embedId = uploadResult.embed_id;
  fileEmbed.referenceBlock = createEmbedReferenceBlock(embedRef);
}
|
|
42150
|
+
/**
 * Asks the judge model to score a finished case and parses its verdict.
 *
 * @param {object} params - `{ client, judgeModel, targetModel, benchmarkCase, caseResult, runId }`.
 * @returns {Promise<{model: string, score: number|null, reason: string|null, raw: string, durationMs: number}>}
 */
async function judgeCase(params) {
  const { client, judgeModel, targetModel, benchmarkCase, caseResult, runId } = params;
  const startedAt = Date.now();
  const message = `${modelMention(judgeModel)} ${judgePrompt(targetModel, benchmarkCase, caseResult)}`;
  const metadata = benchmarkMetadata({
    runId,
    suite: benchmarkCase.suite,
    caseId: `${benchmarkCase.id}:judge:${targetModel}`,
    targetModel,
    judgeModel
  });
  const judgeResponse = await client.sendMessage({
    message,
    incognito: true,
    autoApproveSubChats: true,
    benchmarkMetadata: metadata,
    precollectResponse: true
  });
  const { score, reason } = parseJudgment(judgeResponse.assistant);
  return {
    model: judgeModel,
    score,
    reason,
    raw: judgeResponse.assistant,
    durationMs: Date.now() - startedAt
  };
}
|
|
42174
|
+
/**
 * Runs `worker` over every item with at most `parallel` calls in flight.
 *
 * Each lane repeatedly claims the next unprocessed index and awaits the worker
 * for it; the returned promise settles via `Promise.all` over the lanes.
 *
 * @param {Array} items - Items to process.
 * @param {number} parallel - Maximum number of concurrent lanes.
 * @param {(item: *) => Promise<*>} worker - Async handler invoked once per item.
 * @returns {Promise<void>}
 */
async function runPool(items, parallel, worker) {
  let cursor = 0;
  const drain = async () => {
    while (cursor < items.length) {
      // Claim the index before awaiting so no two lanes take the same item.
      const claimed = cursor;
      cursor = claimed + 1;
      await worker(items[claimed]);
    }
  };
  const laneCount = Math.min(parallel, items.length);
  const lanes = Array.from({ length: laneCount }, () => drain());
  await Promise.all(lanes);
}
|
|
42185
|
+
/**
 * Builds the predefined 20-message conversation used as prior history for
 * long-context cases.
 *
 * Messages alternate user/assistant, get sequential `benchmark-history-N` ids
 * and timestamps (in seconds) spaced 30 seconds apart, starting 2000 seconds
 * before now.
 *
 * @returns {Array<{message_id: string, role: string, sender_name: string, content: string, created_at: number}>}
 */
function buildLongContextHistory() {
  const firstTimestamp = Math.floor(Date.now() / 1e3) - 2e3;
  // Each entry is one exchange: [user message, assistant reply].
  const exchanges = [
    ["We need to launch a CLI benchmark for model comparisons.", "The first goal should be a quick suite with deterministic checks."],
    ["The benchmark also needs image inference.", "Use a public fixture image and ask a factual visual question."],
    ["We should avoid wasting credits.", "Run a pricing preflight and require explicit spend confirmation."],
    ["What about longer conversations?", "Add a 20-message predefined history and a dependent follow-up."],
    ["The extensive suite should not be too small.", "Default to 10 cases and allow 5 or 20 as alternatives."],
    ["Coding quality matters.", "Reserve at least 15 percent of extensive cases for coding prompts."],
    ["We also need comparison mode.", "Accept multiple models with --compare and run target jobs in parallel."],
    ["How should judging work?", "Judge each completed case immediately with Gemini so partial results remain useful."],
    ["What if the process is interrupted?", "Print or write a partial summary with completed judgments and skipped counts."],
    ["What is the best launch strategy?", "Ship quick and comparison first, then use extensive for slower releases."]
  ];
  return exchanges.flatMap((exchange, exchangeIndex) =>
    exchange.map((content, offset) => {
      const position = exchangeIndex * 2 + offset;
      const role = offset === 0 ? "user" : "assistant";
      return {
        message_id: `benchmark-history-${position + 1}`,
        role,
        sender_name: role === "user" ? "User" : "Assistant",
        content,
        created_at: firstTimestamp + position * 30
      };
    })
  );
}
|
|
42217
|
+
/**
 * Appends one message to `history` with a random id and the current time in
 * whole seconds.
 *
 * @param {object[]} history - Message list; mutated.
 * @param {string} role - "user" yields sender "User"; any other role yields "Assistant".
 * @param {string} content - Message text.
 */
function appendHistory(history, role, content) {
  const senderName = role === "user" ? "User" : "Assistant";
  const entry = {
    message_id: randomUUID3(),
    role,
    sender_name: senderName,
    content,
    created_at: Math.floor(Date.now() / 1e3)
  };
  history.push(entry);
}
|
|
42226
|
+
/**
 * Formats a model id as an `@ai-model:` mention.
 *
 * `provider/model` becomes `@ai-model:model:provider`, splitting on the first
 * slash. Ids without a slash, or with an empty side, are used unchanged.
 *
 * @param {string} model - Model id, optionally prefixed with `provider/`.
 * @returns {string} The mention string.
 */
function modelMention(model) {
  const slashAt = model.indexOf("/");
  const provider = slashAt === -1 ? "" : model.slice(0, slashAt);
  const modelId = slashAt === -1 ? "" : model.slice(slashAt + 1);
  return provider && modelId ? `@ai-model:${modelId}:${provider}` : `@ai-model:${model}`;
}
|
|
42234
|
+
/**
 * Builds the metadata object attached to every benchmark request.
 *
 * @param {{runId: *, suite: *, caseId: *, targetModel: *, judgeModel: *}} params
 * @returns {object} Metadata with `source: "benchmark"` and `benchmark_*` keys.
 */
function benchmarkMetadata(params) {
  const { runId, suite, caseId, targetModel, judgeModel } = params;
  return {
    source: "benchmark",
    benchmark_run_id: runId,
    benchmark_suite: suite,
    benchmark_case: caseId,
    benchmark_target_model: targetModel,
    benchmark_judge_model: judgeModel
  };
}
|
|
42244
|
+
/**
 * Builds the instruction text sent to the judge model: fixed scoring
 * instructions followed by the target model, the case description, and the
 * JSON-encoded prompt and conversation turns.
 *
 * @param {string} targetModel - Model whose answer is being judged.
 * @param {object} benchmarkCase - Case definition (`id`, `category`, `complexity`, `prompt`).
 * @param {{turns: Array<{prompt: string, assistant: string}>}} result - Case result so far.
 * @returns {string} Newline-joined prompt.
 */
function judgePrompt(targetModel, benchmarkCase, result) {
  const instructions = [
    "You are judging a real OpenMates model benchmark response.",
    "Return exactly two plain-text lines, with no markdown, no code block, and no tool use.",
    "Line 1 format: BENCHMARK_SCORE=<integer from 1 to 5>",
    "Line 2 format: BENCHMARK_REASON=<one short sentence>",
    "Score for correctness, instruction-following, usefulness, and continuity where relevant."
  ];
  // Only the prompt/answer pair of each turn is shown to the judge.
  const transcript = result.turns.map(({ prompt, assistant }) => ({ prompt, assistant }));
  const context = [
    `Target model: ${targetModel}`,
    `Benchmark case: ${benchmarkCase.id} (${benchmarkCase.category}, ${benchmarkCase.complexity})`,
    `Initial prompt: ${JSON.stringify(benchmarkCase.prompt)}`,
    `Turns: ${JSON.stringify(transcript)}`
  ];
  return instructions.concat(context).join("\n");
}
|
|
42257
|
+
/**
 * Extracts a score and a reason from the judge's reply.
 *
 * The `BENCHMARK_SCORE=` / `BENCHMARK_REASON=` markers (case-insensitive) are
 * preferred. Without a score marker, the reply is searched for a JSON object
 * carrying a numeric `score` and a string `reason`.
 *
 * @param {string} answer - Raw judge reply.
 * @returns {{score: number|null, reason: string|null}} Nulls when nothing usable is found.
 */
function parseJudgment(answer) {
  const nothing = () => ({ score: null, reason: null });

  const scoreMarker = /BENCHMARK_SCORE\s*=\s*([1-5])/i.exec(answer);
  if (scoreMarker !== null) {
    const reasonMarker = /BENCHMARK_REASON\s*=\s*(.+)/i.exec(answer);
    return {
      score: Number.parseInt(scoreMarker[1], 10),
      reason: reasonMarker?.[1]?.trim() ?? null
    };
  }

  const jsonText = extractJsonObject(answer);
  if (!jsonText) return nothing();

  let parsed;
  try {
    parsed = JSON.parse(jsonText);
    const hasScore = typeof parsed.score === "number" && Number.isFinite(parsed.score);
    return {
      score: hasScore ? parsed.score : null,
      reason: typeof parsed.reason === "string" ? parsed.reason : null
    };
  } catch {
    // Unparseable or unusable JSON counts as "no judgment".
    return nothing();
  }
}
|
|
42277
|
+
/**
 * Pulls the text of a JSON object out of a free-form reply.
 *
 * The content of the first fenced code block (optionally tagged `json`) is
 * preferred; otherwise the span from the first `{` to the last `}` is used.
 *
 * @param {string} text - Reply text.
 * @returns {string|null} Candidate JSON text, or null when none is found.
 */
function extractJsonObject(text) {
  const fence = /```(?:json)?\s*([\s\S]*?)\s*```/i.exec(text);
  if (fence !== null) return fence[1];

  const open = text.indexOf("{");
  const close = text.lastIndexOf("}");
  return open !== -1 && close > open ? text.slice(open, close + 1) : null;
}
|
|
42285
|
+
/**
 * Looks up pricing for each distinct model id.
 *
 * @param {string[]} models - Model ids (duplicates are ignored).
 * @returns {Map<string, object>} Pricing keyed by the model id as given.
 * @throws {Error} Listing every model for which no pricing metadata was found.
 */
function loadPricingForModels(models) {
  const catalog = loadProviderPricing();
  const resolved = /* @__PURE__ */ new Map();
  const unknown = [];
  new Set(models).forEach((model) => {
    const entry = catalog.get(normalizeModelKey(model));
    if (entry) {
      resolved.set(model, entry);
    } else {
      unknown.push(model);
    }
  });
  if (unknown.length > 0) {
    throw new Error(
      `Cannot estimate benchmark cost because pricing metadata is unavailable for: ${unknown.join(", ")}. Use provider/model ids with backend provider pricing metadata.`
    );
  }
  return resolved;
}
|
|
42305
|
+
/**
 * Reads every `.yml` file in the providers directory and indexes the model
 * pricing found in them.
 *
 * Each model is registered under both `provider/modelId` and the bare
 * `modelId`; a later file overwrites an earlier entry with the same key.
 *
 * @returns {Map<string, object>} Pricing index; empty when no providers directory is found.
 */
function loadProviderPricing() {
  const pricing = /* @__PURE__ */ new Map();
  const providersDir = findProvidersDir();
  if (!providersDir) return pricing;

  const providerFiles = readdirSync(providersDir).filter((fileName) => fileName.endsWith(".yml"));
  for (const fileName of providerFiles) {
    const text = readFileSync6(join4(providersDir, fileName), "utf-8");
    // Fall back to the file name when the file declares no provider id.
    const provider = parseProviderId(text) ?? fileName.replace(/\.yml$/, "");
    for (const entry of parseModelPricing(text, provider)) {
      pricing.set(`${entry.provider}/${entry.modelId}`, entry);
      pricing.set(entry.modelId, entry);
    }
  }
  return pricing;
}
|
|
42321
|
+
/**
 * Extracts the top-level `provider_id` value from provider YAML text.
 *
 * The value must sit on the same line as the key. Surrounding quotes and a
 * trailing `# comment` are stripped, matching how model ids are read in
 * `parseModelPricing`.
 *
 * @param {string} text - YAML file content.
 * @returns {string|null} Provider id, or null when the key is absent or empty.
 */
function parseProviderId(text) {
  // [ \t]* instead of \s* so an empty `provider_id:` cannot capture the next line.
  const match = text.match(/^provider_id:[ \t]*["']?([^"'\n#]+)["']?/m);
  return match?.[1]?.trim() || null;
}
|
|
42325
|
+
/**
 * Scans provider YAML text line by line and collects per-model token pricing.
 *
 * A model starts at a list entry of the form `  - id: <modelId>`. Within a
 * model, the first `per_credit_unit` line (indented by exactly ten whitespace
 * characters) is taken as the input rate and the second as the output rate;
 * further ones are ignored. Models missing either rate, or with a rate of 0,
 * are dropped.
 *
 * @param {string} text - YAML file content.
 * @param {string} provider - Provider id recorded on every entry.
 * @returns {Array<{provider: string, modelId: string, inputTokensPerCredit: number, outputTokensPerCredit: number}>}
 */
function parseModelPricing(text, provider) {
  const results = [];
  // Model currently being read: its id and the rates seen so far (at most two).
  let current = null;

  const flush = () => {
    if (current === null) return;
    const [inputTokensPerCredit = null, outputTokensPerCredit = null] = current.rates;
    if (current.modelId && inputTokensPerCredit && outputTokensPerCredit) {
      results.push({ provider, modelId: current.modelId, inputTokensPerCredit, outputTokensPerCredit });
    }
  };

  for (const line of text.split("\n")) {
    const idMatch = line.match(/^\s{2}-\s+id:\s*["']?([^"'\n#]+)["']?/);
    if (idMatch) {
      flush();
      current = { modelId: idMatch[1].trim(), rates: [] };
      continue;
    }
    if (current === null || current.rates.length >= 2) continue;
    const rateMatch = line.match(/^\s{10}per_credit_unit:\s*(\d+)/);
    if (rateMatch) current.rates.push(Number.parseInt(rateMatch[1], 10));
  }
  flush();
  return results;
}
|
|
42359
|
+
/**
 * Returns the key under which a model is looked up in the pricing index.
 *
 * The key is the model id unchanged. The previous conditional returned `model`
 * from both branches, so the dead branch has been removed.
 *
 * @param {string} model - Model id, with or without a `provider/` prefix.
 * @returns {string} Lookup key (identical to `model`).
 */
function normalizeModelKey(model) {
  return model;
}
|
|
42362
|
+
/**
 * Searches for a `backend/providers` directory, starting at this module's
 * directory and walking up through at most eight directory levels.
 *
 * At each level both `<dir>/backend/providers` and
 * `<dir>/../../backend/providers` are checked.
 *
 * @returns {string|null} Path of the first match, or null when none exists.
 */
function findProvidersDir() {
  let searchDir = dirname(fileURLToPath(import.meta.url));
  for (let level = 0; level < 8; level += 1) {
    const direct = join4(searchDir, "backend", "providers");
    if (existsSync6(direct)) return direct;

    const twoLevelsUp = join4(searchDir, "..", "..", "backend", "providers");
    if (existsSync6(twoLevelsUp)) return resolve5(twoLevelsUp);

    const parentDir = dirname(searchDir);
    // dirname() of the filesystem root is the root itself: nothing left to climb.
    if (parentDir === searchDir) break;
    searchDir = parentDir;
  }
  return null;
}
|
|
42376
|
+
/**
 * Estimates the credits a benchmark plan will cost.
 *
 * For every (case, target model) pair with known pricing, the case's estimated
 * tokens are multiplied by its turn count (initial prompt plus follow-ups).
 * Judged cases add one judge call per pair, assuming an input of
 * max(2000, 1.5 x target output) tokens and 350 output tokens.
 *
 * @param {object[]} cases - Expanded cases.
 * @param {string[]} targetModels - Target model ids.
 * @param {string} judgeModel - Judge model id.
 * @param {Map<string, object>} pricing - Pricing keyed by model id.
 * @returns {{targetCredits: number, judgeCredits: number, totalCredits: number, assumptions: object}}
 */
function estimateCredits(cases, targetModels, judgeModel, pricing) {
  const assumptions = { targetInputTokens: 0, targetOutputTokens: 0, judgeInputTokens: 0, judgeOutputTokens: 0 };
  let targetCredits = 0;
  let judgeCredits = 0;
  const judgePricing = pricing.get(judgeModel);

  for (const benchmarkCase of cases) {
    const turnCount = 1 + (benchmarkCase.followUps?.length ?? 0);
    const input = benchmarkCase.estimatedInputTokens * turnCount;
    const output = benchmarkCase.estimatedOutputTokens * turnCount;

    for (const model of targetModels) {
      const modelPricing = pricing.get(model);
      if (!modelPricing) continue;
      assumptions.targetInputTokens += input;
      assumptions.targetOutputTokens += output;
      targetCredits += creditsFor(modelPricing, input, output);

      if (!benchmarkCase.judge || !judgePricing) continue;
      const judgeInput = Math.max(2e3, Math.ceil(output * 1.5));
      const judgeOutput = 350;
      assumptions.judgeInputTokens += judgeInput;
      assumptions.judgeOutputTokens += judgeOutput;
      judgeCredits += creditsFor(judgePricing, judgeInput, judgeOutput);
    }
  }

  return {
    targetCredits,
    judgeCredits,
    totalCredits: targetCredits + judgeCredits,
    assumptions
  };
}
|
|
42411
|
+
/**
 * Converts token counts into credits, rounding the input and output parts up
 * separately.
 *
 * @param {{inputTokensPerCredit: number, outputTokensPerCredit: number}} pricing
 * @param {number} inputTokens
 * @param {number} outputTokens
 * @returns {number} Credits for input plus credits for output.
 */
function creditsFor(pricing, inputTokens, outputTokens) {
  const inputCredits = Math.ceil(inputTokens / pricing.inputTokensPerCredit);
  const outputCredits = Math.ceil(outputTokens / pricing.outputTokensPerCredit);
  return inputCredits + outputCredits;
}
|
|
42414
|
+
/**
 * Creates the initial benchmark result object, before any case has run.
 *
 * A dry run is marked "planned" with every job counted as skipped; a live run
 * starts as "completed" with zeroed counters.
 *
 * @param {object} params - `{ runId, targetModels, judgeModel, suites, runs,
 *   compare, parallel, extensiveSize, dryRun, estimate, totalJobs }`.
 * @returns {object} Result skeleton with empty `cases` and per-model summaries.
 */
function makeBaseResult(params) {
  const { runId, targetModels, judgeModel, suites, runs, compare, parallel, extensiveSize, dryRun, estimate, totalJobs } = params;

  const emptySummaryFor = (model) => ({
    model,
    total: 0,
    passed: 0,
    failed: 0,
    averageJudgeScore: null,
    averageDurationMs: null
  });

  return {
    command: "benchmark model",
    status: dryRun ? "planned" : "completed",
    runId,
    targetModel: targetModels[0],
    targetModels,
    judgeModel,
    suites,
    runs,
    compare,
    parallel,
    extensiveSize,
    spendsCredits: !dryRun,
    estimatedCredits: estimate,
    cases: [],
    modelSummaries: targetModels.map(emptySummaryFor),
    summary: {
      total: totalJobs,
      completed: 0,
      passed: 0,
      failed: 0,
      skipped: dryRun ? totalJobs : 0,
      interrupted: false
    }
  };
}
|
|
42448
|
+
/**
 * Refreshes the derived parts of a benchmark result from `result.cases`:
 * overall summary, status, per-model summaries and, in compare mode, the
 * model comparison.
 *
 * @param {object} result - Benchmark result; mutated.
 * @param {number} totalJobs - Number of planned (case, model) jobs.
 * @param {boolean} interrupted - Whether the run was interrupted.
 */
function recomputeResult(result, totalJobs, interrupted) {
  const completed = result.cases.length;
  const passed = result.cases.reduce((count, caseResult) => (caseResult.passed ? count + 1 : count), 0);

  result.summary = {
    total: totalJobs,
    completed,
    passed,
    failed: completed - passed,
    skipped: Math.max(0, totalJobs - completed),
    interrupted
  };

  const unfinished = interrupted || completed < totalJobs;
  result.status = unfinished ? "partial" : "completed";
  result.modelSummaries = result.targetModels.map((model) => summarizeModel(model, result.cases));
  if (result.compare) {
    result.comparison = buildComparison(result.modelSummaries);
  }
}
|
|
42464
|
+
/**
 * Aggregates the case results of one model.
 *
 * The average judge score covers only cases with a finite numeric score; the
 * average duration covers only cases with a positive duration. Either average
 * is null when no case qualifies.
 *
 * @param {string} model - Model id to summarize.
 * @param {object[]} cases - All case results.
 * @returns {{model: string, total: number, passed: number, failed: number, averageJudgeScore: number|null, averageDurationMs: number|null}}
 */
function summarizeModel(model, cases) {
  let total = 0;
  let passed = 0;
  let scoreSum = 0;
  let scoreCount = 0;
  let durationSum = 0;
  let durationCount = 0;

  for (const caseResult of cases) {
    if (caseResult.model !== model) continue;
    total += 1;
    if (caseResult.passed) passed += 1;

    const score = caseResult.judge?.score;
    if (typeof score === "number" && Number.isFinite(score)) {
      scoreSum += score;
      scoreCount += 1;
    }
    if (caseResult.durationMs > 0) {
      durationSum += caseResult.durationMs;
      durationCount += 1;
    }
  }

  return {
    model,
    total,
    passed,
    failed: total - passed,
    averageJudgeScore: scoreCount > 0 ? round2(scoreSum / scoreCount) : null,
    averageDurationMs: durationCount > 0 ? Math.round(durationSum / durationCount) : null
  };
}
|
|
42477
|
+
/**
 * Ranks model summaries and produces a short note about the leader.
 *
 * Ordering is by `averageJudgeScore` descending (a nullish score is treated
 * as -1), with ties broken by `passed` descending. The input array is copied
 * before sorting, so it is not reordered.
 *
 * @param {Array<{model: string, averageJudgeScore: (number|null), passed: number, total: number}>} summaries
 * @returns {{ranking: Array<{model: string, averageJudgeScore: (number|null), passed: number, total: number}>, notes: string[]}}
 *   `notes` is empty when there is nothing to rank.
 */
function buildComparison(summaries) {
  const scoreOf = (entry) => entry.averageJudgeScore ?? -1;

  const byScoreThenPassed = (left, right) => {
    const scoreDelta = scoreOf(right) - scoreOf(left);
    if (scoreDelta) return scoreDelta;
    return right.passed - left.passed;
  };

  const ordered = Array.from(summaries);
  ordered.sort(byScoreThenPassed);

  const ranking = ordered.map(({ model, averageJudgeScore, passed, total }) => ({
    model,
    averageJudgeScore,
    passed,
    total
  }));

  const notes = [];
  if (ranking.length > 0) {
    const leader = ranking[0];
    notes.push(`Top model so far: ${leader.model} (${leader.passed}/${leader.total} passed).`);
  }

  return { ranking, notes };
}
|
|
42487
|
+
/**
 * Rounds a number to two decimal places using `Math.round`.
 *
 * @param {number} value - Number to round.
 * @returns {number} `value` scaled by 100, rounded, and scaled back.
 */
function round2(value) {
  const hundredths = Math.round(value * 100);
  return hundredths / 100;
}
|
|
42490
|
+
/**
 * Resolves the path of the default benchmark image.
 *
 * Looks for `brandenburger-tor.png` in the `fixtures` directory that sits one
 * level above this module's directory. When that file does not exist, a fresh
 * temporary directory is created and `FIXTURE_IMAGE_SVG` is written there as
 * `brandenburger-tor.svg`.
 *
 * @returns {string} Path to the PNG fixture, or to the newly written SVG fallback.
 */
function defaultImageFixturePath() {
  const moduleDir = dirname(fileURLToPath(import.meta.url));
  const bundledPng = join4(moduleDir, "..", "fixtures", "brandenburger-tor.png");

  if (!existsSync6(bundledPng)) {
    // No PNG next to the module: materialize the inline SVG in a new temp directory.
    const scratchDir = mkdtempSync(join4(tmpdir(), "openmates-benchmark-"));
    const fallbackSvg = join4(scratchDir, "brandenburger-tor.svg");
    writeFileSync4(fallbackSvg, FIXTURE_IMAGE_SVG, "utf-8");
    return fallbackSvg;
  }

  return bundledPng;
}
|
|
42499
|
+
/**
 * Emits a benchmark result as JSON and/or a human-readable report.
 *
 * - When `output` is truthy, the pretty-printed JSON (with a trailing newline)
 *   is written to that path.
 * - When `flags.json === true` or `output` is truthy, the same JSON is written
 *   to stdout and nothing else is printed.
 * - Otherwise a text report is printed with `console.log`; per-case lines are
 *   included unless `result.status` is "planned".
 *
 * @param {object} result - Benchmark result; the text report reads `status`, `targetModels`, `runId`,
 *   `suites`, `judgeModel`, `estimatedCredits.totalCredits`, `spendsCredits`, `summary` and `cases`.
 * @param {object} flags - Parsed CLI flags; only `flags.json` is read.
 * @param {string} [output] - Optional file path for the JSON document.
 * @returns {void}
 */
function writeBenchmarkResult(result, flags, output) {
  const json = `${JSON.stringify(result, null, 2)}\n`;

  if (output) {
    writeFileSync4(output, json, "utf-8");
  }

  const jsonOnly = flags.json === true || output;
  if (jsonOnly) {
    process.stdout.write(json);
    return;
  }

  // Header lines are logged one at a time so output order matches evaluation order.
  console.log(`Benchmark ${result.status}: ${result.targetModels.join(", ")}`);
  console.log(`Run ID: ${result.runId}`);
  console.log(`Suites: ${result.suites.join(", ")}`);
  console.log(`Judge: ${result.judgeModel}`);
  console.log(`Estimated credits: ${result.estimatedCredits.totalCredits}`);
  const spends = result.spendsCredits ? "yes" : "no";
  console.log(`Spend credits: ${spends}`);

  if (result.status === "planned") return;

  console.log(`Passed: ${result.summary.passed}/${result.summary.completed} completed (${result.summary.skipped} skipped)`);

  for (const entry of result.cases) {
    let verdict = "FAIL";
    if (entry.passed) verdict = "PASS";

    let judgePart = "";
    if (entry.judge) judgePart = ` judge=${entry.judge.score ?? "unparsed"}`;

    let errorPart = "";
    if (entry.error) errorPart = ` error=${entry.error}`;

    console.log(`${verdict} ${entry.model} ${entry.suite}/${entry.id} (${entry.durationMs}ms)${judgePart}${errorPart}`);
  }
}
|
|
42523
|
+
|
|
41490
42524
|
// src/cli.ts
|
|
41491
42525
|
async function main() {
|
|
41492
42526
|
const parsed = parseArgs(process.argv.slice(2));
|
|
@@ -41557,6 +42591,10 @@ async function main() {
|
|
|
41557
42591
|
printDocsHelp();
|
|
41558
42592
|
return;
|
|
41559
42593
|
}
|
|
42594
|
+
if (command === "benchmark") {
|
|
42595
|
+
printBenchmarkHelp();
|
|
42596
|
+
return;
|
|
42597
|
+
}
|
|
41560
42598
|
printHelp();
|
|
41561
42599
|
return;
|
|
41562
42600
|
}
|
|
@@ -41627,6 +42665,10 @@ async function main() {
|
|
|
41627
42665
|
handleFeedback(subcommand, rest, parsed.flags);
|
|
41628
42666
|
return;
|
|
41629
42667
|
}
|
|
42668
|
+
if (command === "benchmark") {
|
|
42669
|
+
await handleBenchmark(client, subcommand, rest, parsed.flags);
|
|
42670
|
+
return;
|
|
42671
|
+
}
|
|
41630
42672
|
throw new Error(`Unknown command '${command}'. Run 'openmates help'.`);
|
|
41631
42673
|
}
|
|
41632
42674
|
function shouldInitializeRedactor(command, subcommand) {
|
|
@@ -41863,10 +42905,10 @@ Run 'openmates chats show ` + chatId + "' to check if suggestions have been save
|
|
|
41863
42905
|
input: process.stdin,
|
|
41864
42906
|
output: process.stdout
|
|
41865
42907
|
});
|
|
41866
|
-
const answer = await new Promise((
|
|
42908
|
+
const answer = await new Promise((resolve6) => {
|
|
41867
42909
|
iface.question(
|
|
41868
42910
|
`Delete ${resolved.length} chat(s)? This cannot be undone. [y/N] `,
|
|
41869
|
-
|
|
42911
|
+
resolve6
|
|
41870
42912
|
);
|
|
41871
42913
|
});
|
|
41872
42914
|
iface.close();
|
|
@@ -42026,16 +43068,16 @@ ${deleted}/${resolved.length} chat(s) deleted.`);
|
|
|
42026
43068
|
}
|
|
42027
43069
|
}
|
|
42028
43070
|
const { mkdir, writeFile } = await import("fs/promises");
|
|
42029
|
-
const { join:
|
|
43071
|
+
const { join: join5 } = await import("path");
|
|
42030
43072
|
if (useZip) {
|
|
42031
|
-
const tmpDir =
|
|
43073
|
+
const tmpDir = join5(outputDir, `.${filenameBase}_tmp`);
|
|
42032
43074
|
await mkdir(tmpDir, { recursive: true });
|
|
42033
|
-
await writeFile(
|
|
42034
|
-
await writeFile(
|
|
43075
|
+
await writeFile(join5(tmpDir, `${filenameBase}.yml`), yamlContent);
|
|
43076
|
+
await writeFile(join5(tmpDir, `${filenameBase}.md`), mdContent);
|
|
42035
43077
|
if (codeEmbeds.length > 0) {
|
|
42036
43078
|
for (const ce of codeEmbeds) {
|
|
42037
43079
|
const fpath = ce.filePath ?? ce.filename ?? `${ce.embedId.slice(0, 8)}.${getExtForLang(ce.language)}`;
|
|
42038
|
-
const fullPath =
|
|
43080
|
+
const fullPath = join5(tmpDir, "code", fpath);
|
|
42039
43081
|
await mkdir(fullPath.substring(0, fullPath.lastIndexOf("/")), {
|
|
42040
43082
|
recursive: true
|
|
42041
43083
|
});
|
|
@@ -42043,13 +43085,13 @@ ${deleted}/${resolved.length} chat(s) deleted.`);
|
|
|
42043
43085
|
}
|
|
42044
43086
|
}
|
|
42045
43087
|
if (transcriptEmbeds.length > 0) {
|
|
42046
|
-
const tDir =
|
|
43088
|
+
const tDir = join5(tmpDir, "transcripts");
|
|
42047
43089
|
await mkdir(tDir, { recursive: true });
|
|
42048
43090
|
for (const te of transcriptEmbeds) {
|
|
42049
|
-
await writeFile(
|
|
43091
|
+
await writeFile(join5(tDir, te.filename), te.content);
|
|
42050
43092
|
}
|
|
42051
43093
|
}
|
|
42052
|
-
const zipPath =
|
|
43094
|
+
const zipPath = join5(outputDir, `${filenameBase}.zip`);
|
|
42053
43095
|
const { execSync: execSync2 } = await import("child_process");
|
|
42054
43096
|
try {
|
|
42055
43097
|
execSync2(`cd "${tmpDir}" && zip -r "${zipPath}" .`, { stdio: "pipe" });
|
|
@@ -42064,17 +43106,17 @@ ${deleted}/${resolved.length} chat(s) deleted.`);
|
|
|
42064
43106
|
);
|
|
42065
43107
|
}
|
|
42066
43108
|
} else {
|
|
42067
|
-
const chatDir =
|
|
43109
|
+
const chatDir = join5(outputDir, filenameBase);
|
|
42068
43110
|
await mkdir(chatDir, { recursive: true });
|
|
42069
43111
|
const written = [];
|
|
42070
|
-
await writeFile(
|
|
43112
|
+
await writeFile(join5(chatDir, `${filenameBase}.yml`), yamlContent);
|
|
42071
43113
|
written.push(`${filenameBase}.yml`);
|
|
42072
|
-
await writeFile(
|
|
43114
|
+
await writeFile(join5(chatDir, `${filenameBase}.md`), mdContent);
|
|
42073
43115
|
written.push(`${filenameBase}.md`);
|
|
42074
43116
|
if (codeEmbeds.length > 0) {
|
|
42075
43117
|
for (const ce of codeEmbeds) {
|
|
42076
43118
|
const fpath = ce.filePath ?? ce.filename ?? `${ce.embedId.slice(0, 8)}.${getExtForLang(ce.language)}`;
|
|
42077
|
-
const fullPath =
|
|
43119
|
+
const fullPath = join5(chatDir, "code", fpath);
|
|
42078
43120
|
await mkdir(fullPath.substring(0, fullPath.lastIndexOf("/")), {
|
|
42079
43121
|
recursive: true
|
|
42080
43122
|
});
|
|
@@ -42083,10 +43125,10 @@ ${deleted}/${resolved.length} chat(s) deleted.`);
|
|
|
42083
43125
|
}
|
|
42084
43126
|
}
|
|
42085
43127
|
if (transcriptEmbeds.length > 0) {
|
|
42086
|
-
const tDir =
|
|
43128
|
+
const tDir = join5(chatDir, "transcripts");
|
|
42087
43129
|
await mkdir(tDir, { recursive: true });
|
|
42088
43130
|
for (const te of transcriptEmbeds) {
|
|
42089
|
-
await writeFile(
|
|
43131
|
+
await writeFile(join5(tDir, te.filename), te.content);
|
|
42090
43132
|
written.push(`transcripts/${te.filename}`);
|
|
42091
43133
|
}
|
|
42092
43134
|
}
|
|
@@ -42122,7 +43164,7 @@ ${deleted}/${resolved.length} chat(s) deleted.`);
|
|
|
42122
43164
|
printJson2({
|
|
42123
43165
|
chat_id: chat.id,
|
|
42124
43166
|
title: chat.title,
|
|
42125
|
-
output_dir: useZip ?
|
|
43167
|
+
output_dir: useZip ? join5(outputDir, `${filenameBase}.zip`) : join5(outputDir, filenameBase),
|
|
42126
43168
|
files,
|
|
42127
43169
|
code_embeds: codeEmbeds.length,
|
|
42128
43170
|
transcript_embeds: transcriptEmbeds.length
|
|
@@ -42643,7 +43685,7 @@ async function handleCodeRun(client, flags, apiKey) {
|
|
|
42643
43685
|
}
|
|
42644
43686
|
}
|
|
42645
43687
|
async function streamCodeRunToTerminal(url, jsonMode) {
|
|
42646
|
-
return await new Promise((
|
|
43688
|
+
return await new Promise((resolve6, reject) => {
|
|
42647
43689
|
const ws = new WebSocket2(url);
|
|
42648
43690
|
let lastStatus = {};
|
|
42649
43691
|
ws.on("message", (data) => {
|
|
@@ -42662,7 +43704,7 @@ async function streamCodeRunToTerminal(url, jsonMode) {
|
|
|
42662
43704
|
const status = String(payload.status ?? "");
|
|
42663
43705
|
if (["finished", "failed", "timeout", "cancelled"].includes(status)) {
|
|
42664
43706
|
ws.close();
|
|
42665
|
-
|
|
43707
|
+
resolve6(lastStatus);
|
|
42666
43708
|
}
|
|
42667
43709
|
}
|
|
42668
43710
|
} catch (err) {
|
|
@@ -42672,7 +43714,7 @@ async function streamCodeRunToTerminal(url, jsonMode) {
|
|
|
42672
43714
|
});
|
|
42673
43715
|
ws.on("error", () => reject(new Error("Code Run stream failed.")));
|
|
42674
43716
|
ws.on("close", () => {
|
|
42675
|
-
if (Object.keys(lastStatus).length > 0)
|
|
43717
|
+
if (Object.keys(lastStatus).length > 0) resolve6(lastStatus);
|
|
42676
43718
|
});
|
|
42677
43719
|
});
|
|
42678
43720
|
}
|
|
@@ -42683,7 +43725,7 @@ async function pollCodeRunStatus(client, statusPath, apiKey, jsonMode) {
|
|
|
42683
43725
|
if (!jsonMode && value) process.stderr.write(`Code Run status: ${value}
|
|
42684
43726
|
`);
|
|
42685
43727
|
if (["finished", "failed", "timeout", "cancelled"].includes(value)) return status;
|
|
42686
|
-
await new Promise((
|
|
43728
|
+
await new Promise((resolve6) => setTimeout(resolve6, 1e3));
|
|
42687
43729
|
}
|
|
42688
43730
|
}
|
|
42689
43731
|
function buildSkillInput(flags, inlineTokens, schemaParams) {
|
|
@@ -42883,7 +43925,7 @@ async function handleEmbeds(client, subcommand, rest, flags) {
|
|
|
42883
43925
|
throw new Error("Embed version content was not available after local reconstruction.");
|
|
42884
43926
|
}
|
|
42885
43927
|
if (typeof flags.output === "string") {
|
|
42886
|
-
|
|
43928
|
+
writeFileSync5(flags.output, result.content, "utf-8");
|
|
42887
43929
|
if (flags.json === true) {
|
|
42888
43930
|
printJson2({ ...result, output: flags.output });
|
|
42889
43931
|
} else {
|
|
@@ -43167,11 +44209,11 @@ function parseYamlScalar(value) {
|
|
|
43167
44209
|
}
|
|
43168
44210
|
/**
 * Writes a downloaded document to disk and returns where it was saved.
 *
 * When `output` is a string ending in ".pdf" it is used as the file path
 * itself; any other string is treated as a directory, and a non-string
 * `output` falls back to ".". Only the base name of `document.filename` is
 * used (defaulting to "document.pdf"), and the parent directory is created
 * recursively before writing.
 *
 * @param {{filename?: string, data: *}} document - Document to persist; `data` is passed to `writeFile` as-is.
 * @param {string} [output] - Target file path (".pdf") or directory.
 * @returns {Promise<string>} The path the document was written to.
 */
async function saveDownloadedDocument(document, output) {
  const fsPromises = await import("fs/promises");
  const pathModule = await import("path");

  let target = ".";
  if (typeof output === "string") target = output;

  // basename() drops any directory components carried by the supplied filename.
  const safeName = pathModule.basename(document.filename || "document.pdf");

  let destination;
  if (target.endsWith(".pdf")) {
    destination = target;
  } else {
    destination = pathModule.join(target, safeName);
  }

  await fsPromises.mkdir(pathModule.dirname(destination), { recursive: true });
  await fsPromises.writeFile(destination, document.data);
  return destination;
}
|
|
@@ -43199,7 +44241,7 @@ function printMateInfo(mateId, json) {
|
|
|
43199
44241
|
async function confirmOrExit(question) {
|
|
43200
44242
|
const rl = await import("readline");
|
|
43201
44243
|
const iface = rl.createInterface({ input: process.stdin, output: process.stdout });
|
|
43202
|
-
const answer = await new Promise((
|
|
44244
|
+
const answer = await new Promise((resolve6) => iface.question(question, resolve6));
|
|
43203
44245
|
iface.close();
|
|
43204
44246
|
if (answer.trim().toLowerCase() !== "y") {
|
|
43205
44247
|
console.log("Aborted.");
|
|
@@ -43209,7 +44251,7 @@ async function confirmOrExit(question) {
|
|
|
43209
44251
|
/**
 * Asks a question on the terminal and returns the trimmed reply.
 *
 * Creates a readline interface over `process.stdin` / `process.stdout`, waits
 * for the answer callback, then closes the interface.
 *
 * @param {string} question - Prompt text passed to `question()`.
 * @returns {Promise<string>} The answer with surrounding whitespace removed.
 */
async function promptLine(question) {
  const { createInterface } = await import("readline");
  const prompt = createInterface({ input: process.stdin, output: process.stdout });

  const reply = await new Promise((resolve) => {
    prompt.question(question, resolve);
  });

  prompt.close();
  return reply.trim();
}
|
|
@@ -43217,7 +44259,7 @@ async function promptSecret(question) {
|
|
|
43217
44259
|
if (!process.stdin.isTTY) {
|
|
43218
44260
|
return promptLine(question);
|
|
43219
44261
|
}
|
|
43220
|
-
return new Promise((
|
|
44262
|
+
return new Promise((resolve6) => {
|
|
43221
44263
|
const stdin2 = process.stdin;
|
|
43222
44264
|
const wasRaw = stdin2.isRaw;
|
|
43223
44265
|
let value = "";
|
|
@@ -43230,7 +44272,7 @@ async function promptSecret(question) {
|
|
|
43230
44272
|
stdin2.off("data", onData);
|
|
43231
44273
|
stdin2.setRawMode(wasRaw);
|
|
43232
44274
|
process.stdout.write("\n");
|
|
43233
|
-
|
|
44275
|
+
resolve6(value);
|
|
43234
44276
|
return;
|
|
43235
44277
|
}
|
|
43236
44278
|
if (char === "") {
|
|
@@ -43250,7 +44292,7 @@ async function promptSecret(question) {
|
|
|
43250
44292
|
}
|
|
43251
44293
|
async function writeSecretFile(filePath, content, force = false) {
|
|
43252
44294
|
const { mkdir, writeFile, stat: stat2 } = await import("fs/promises");
|
|
43253
|
-
const { dirname:
|
|
44295
|
+
const { dirname: dirname3 } = await import("path");
|
|
43254
44296
|
try {
|
|
43255
44297
|
await stat2(filePath);
|
|
43256
44298
|
if (!force) throw new Error(`${filePath} already exists. Use --force to overwrite.`);
|
|
@@ -43260,7 +44302,7 @@ async function writeSecretFile(filePath, content, force = false) {
|
|
|
43260
44302
|
}
|
|
43261
44303
|
if (error instanceof Error && !("code" in error)) throw error;
|
|
43262
44304
|
}
|
|
43263
|
-
await mkdir(
|
|
44305
|
+
await mkdir(dirname3(filePath), { recursive: true });
|
|
43264
44306
|
await writeFile(filePath, content, { mode: 384 });
|
|
43265
44307
|
return filePath;
|
|
43266
44308
|
}
|
|
@@ -45887,6 +46929,7 @@ Commands:
|
|
|
45887
46929
|
openmates inspirations [--lang <code>] [--json] Daily inspirations
|
|
45888
46930
|
openmates newchatsuggestions [--limit <n>] [--json] Personalized new chat suggestions
|
|
45889
46931
|
openmates feedback [--help] Assistant response feedback helpers
|
|
46932
|
+
openmates benchmark [--help] Run real model benchmarks with usage tagged as benchmark spend
|
|
45890
46933
|
openmates server [--help] Server management (install, start, stop, ...)
|
|
45891
46934
|
openmates docs [--help] Browse, search, and download documentation
|
|
45892
46935
|
openmates e2e provision-auth-accounts Provision local E2E auth-account artifacts
|
|
@@ -46217,7 +47260,7 @@ async function handleDocs(client, subcommand, rest, flags) {
|
|
|
46217
47260
|
}
|
|
46218
47261
|
if (subcommand === "download") {
|
|
46219
47262
|
const { writeFile, mkdir } = await import("fs/promises");
|
|
46220
|
-
const { join:
|
|
47263
|
+
const { join: join5, dirname: dirname3 } = await import("path");
|
|
46221
47264
|
if (flags.all === true) {
|
|
46222
47265
|
const outputDir = typeof flags.output === "string" ? flags.output : "./openmates-docs";
|
|
46223
47266
|
const tree = await client.listDocs();
|
|
@@ -46226,8 +47269,8 @@ async function handleDocs(client, subcommand, rest, flags) {
|
|
|
46226
47269
|
let count = 0;
|
|
46227
47270
|
for (const slug2 of slugs) {
|
|
46228
47271
|
const content2 = await client.getDoc(slug2);
|
|
46229
|
-
const filePath =
|
|
46230
|
-
await mkdir(
|
|
47272
|
+
const filePath = join5(outputDir, `${slug2}.md`);
|
|
47273
|
+
await mkdir(dirname3(filePath), { recursive: true });
|
|
46231
47274
|
await writeFile(filePath, content2, "utf-8");
|
|
46232
47275
|
count++;
|
|
46233
47276
|
process.stderr.write(`\r Downloaded ${count}/${slugs.length}`);
|
|
@@ -46299,8 +47342,8 @@ function isCliEntrypoint() {
|
|
|
46299
47342
|
if (!entrypoint) return false;
|
|
46300
47343
|
try {
|
|
46301
47344
|
const invokedPath = realpathSync(entrypoint);
|
|
46302
|
-
const modulePath = realpathSync(
|
|
46303
|
-
return invokedPath === modulePath || basename3(invokedPath) === "cli.js" &&
|
|
47345
|
+
const modulePath = realpathSync(fileURLToPath2(import.meta.url));
|
|
47346
|
+
return invokedPath === modulePath || basename3(invokedPath) === "cli.js" && dirname2(invokedPath) === dirname2(modulePath);
|
|
46304
47347
|
} catch {
|
|
46305
47348
|
return false;
|
|
46306
47349
|
}
|