openmates 0.12.0-alpha.10 → 0.12.0-alpha.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-D7RIGVLZ.js → chunk-2QG4XPEB.js} +289 -10
- package/dist/cli.js +1 -1
- package/dist/index.d.ts +12 -0
- package/dist/index.js +1 -1
- package/package.json +1 -1
|
@@ -3677,6 +3677,19 @@ var OpenMatesClient = class _OpenMatesClient {
|
|
|
3677
3677
|
if (connectedAccountTokenRefs.length > 0) {
|
|
3678
3678
|
messagePayload.connected_account_token_refs = connectedAccountTokenRefs;
|
|
3679
3679
|
}
|
|
3680
|
+
if (params.benchmarkMetadata) {
|
|
3681
|
+
messagePayload.benchmark_metadata = params.benchmarkMetadata;
|
|
3682
|
+
}
|
|
3683
|
+
if (params.incognito) {
|
|
3684
|
+
messagePayload.message_history = [{
|
|
3685
|
+
message_id: messageId,
|
|
3686
|
+
chat_id: chatId,
|
|
3687
|
+
role: "user",
|
|
3688
|
+
sender_name: "User",
|
|
3689
|
+
content: params.message,
|
|
3690
|
+
created_at: createdAt
|
|
3691
|
+
}];
|
|
3692
|
+
}
|
|
3680
3693
|
let chatKeyBytes = null;
|
|
3681
3694
|
let encryptedChatKey = null;
|
|
3682
3695
|
let baselineMessagesV = 0;
|
|
@@ -3735,6 +3748,7 @@ var OpenMatesClient = class _OpenMatesClient {
|
|
|
3735
3748
|
if (encryptedEmbeds.length > 0) {
|
|
3736
3749
|
messagePayload.encrypted_embeds = encryptedEmbeds;
|
|
3737
3750
|
}
|
|
3751
|
+
const precollectedResponse = params.precollectResponse ? ws.collectAiResponse(messageId, chatId, { onStream: params.onStream }) : null;
|
|
3738
3752
|
const confirmed = ws.waitForMessage(
|
|
3739
3753
|
"chat_message_confirmed",
|
|
3740
3754
|
(payload) => {
|
|
@@ -3949,7 +3963,7 @@ var OpenMatesClient = class _OpenMatesClient {
|
|
|
3949
3963
|
};
|
|
3950
3964
|
if (params.incognito) {
|
|
3951
3965
|
try {
|
|
3952
|
-
const resp = await ws.collectAiResponse(messageId, chatId, streamOpts);
|
|
3966
|
+
const resp = await (precollectedResponse ?? ws.collectAiResponse(messageId, chatId, streamOpts));
|
|
3953
3967
|
assistantMessageId = resp.messageId;
|
|
3954
3968
|
assistant = resp.content;
|
|
3955
3969
|
category = resp.category;
|
|
@@ -6039,7 +6053,7 @@ function printLogo() {
|
|
|
6039
6053
|
|
|
6040
6054
|
// src/cli.ts
|
|
6041
6055
|
import { createInterface as createInterface3 } from "readline/promises";
|
|
6042
|
-
import { realpathSync, writeFileSync as
|
|
6056
|
+
import { realpathSync, writeFileSync as writeFileSync5 } from "fs";
|
|
6043
6057
|
import { fileURLToPath } from "url";
|
|
6044
6058
|
import { basename as basename3, dirname } from "path";
|
|
6045
6059
|
import WebSocket2 from "ws";
|
|
@@ -27900,6 +27914,12 @@ Only output the final Markdown table. Do NOT include explanations, notes, or any
|
|
|
27900
27914
|
account_created: {
|
|
27901
27915
|
text: "Account created"
|
|
27902
27916
|
},
|
|
27917
|
+
account_created_second_login_title: {
|
|
27918
|
+
text: "Add a second login method"
|
|
27919
|
+
},
|
|
27920
|
+
account_created_second_login_info: {
|
|
27921
|
+
text: "If you signed up with a passkey, add password plus 2FA as a backup. If you signed up with password plus 2FA, add a passkey for faster secure login."
|
|
27922
|
+
},
|
|
27903
27923
|
password_security_reminder: {
|
|
27904
27924
|
subject: {
|
|
27905
27925
|
text: "Action needed to secure your OpenMates account"
|
|
@@ -28129,10 +28149,7 @@ Only output the final Markdown table. Do NOT include explanations, notes, or any
|
|
|
28129
28149
|
text: "Welcome to OpenMates!"
|
|
28130
28150
|
},
|
|
28131
28151
|
complete_signup_info: {
|
|
28132
|
-
text: "
|
|
28133
|
-
},
|
|
28134
|
-
auto_delete_warning: {
|
|
28135
|
-
text: "Please note: Accounts that haven't completed the signup process will be automatically deleted after 7 days."
|
|
28152
|
+
text: "Your account is ready. Here are a few helpful next steps to protect your access and keep a copy of your data."
|
|
28136
28153
|
},
|
|
28137
28154
|
want_to_delete_account: {
|
|
28138
28155
|
text: "Want to delete your account?"
|
|
@@ -30004,6 +30021,15 @@ Only output the final Markdown table. Do NOT include explanations, notes, or any
|
|
|
30004
30021
|
anonymous_terms_reminder: {
|
|
30005
30022
|
text: "By sending a message you accept the terms & privacy policy of OpenMates."
|
|
30006
30023
|
},
|
|
30024
|
+
anonymous_terms_reminder_prefix: {
|
|
30025
|
+
text: "By sending a message you accept the "
|
|
30026
|
+
},
|
|
30027
|
+
anonymous_terms_reminder_connector: {
|
|
30028
|
+
text: " & "
|
|
30029
|
+
},
|
|
30030
|
+
anonymous_terms_reminder_suffix: {
|
|
30031
|
+
text: " of OpenMates."
|
|
30032
|
+
},
|
|
30007
30033
|
send: {
|
|
30008
30034
|
text: "Send"
|
|
30009
30035
|
},
|
|
@@ -33661,10 +33687,10 @@ As of mid-2026, the severe supply shocks from the 2024\u20132025 avian flu have
|
|
|
33661
33687
|
text: "Account: Email address, username/display name, profile image, locale, and security settings (e.g., 2FA enabled). Email and username are encrypted with your key before storage. We also keep a separate server-side Vault-encrypted copy of your verified email address for mandatory account lifecycle notices, such as account verification, security alerts, and deletion reminders. Passwords are stored as salted hashes."
|
|
33662
33688
|
},
|
|
33663
33689
|
usage: {
|
|
33664
|
-
text: "Usage: Server logs, event timestamps, feature usage, error logs, and device recognition identifiers (hashed). IP addresses may be temporarily processed for security and
|
|
33690
|
+
text: "Usage: Server logs, event timestamps, feature usage, error logs, and device recognition identifiers (hashed). IP addresses may be temporarily processed for security, rate limiting, and anonymous free-usage budget checks. Anonymous free usage uses a first-party random local identifier and sends only a server-side HMAC-hashed form for per-identity abuse limits."
|
|
33665
33691
|
},
|
|
33666
33692
|
content: {
|
|
33667
|
-
text: "Content: Chat messages, prompts, attachments, and uploaded images/videos necessary to deliver the service (subject to moderation where applicable)."
|
|
33693
|
+
text: "Content: Chat messages, prompts, attachments, and uploaded images/videos necessary to deliver the service (subject to moderation where applicable). Anonymous free-usage chats stay local-only and encrypted with per-chat keys before signup; they are uploaded only if you sign up and promote them into account sync."
|
|
33668
33694
|
},
|
|
33669
33695
|
payments: {
|
|
33670
33696
|
text: "Payments: Payment method tokens, transaction IDs, billing address and VAT information as required for invoicing (processed primarily by Stripe). We do not store full card numbers."
|
|
@@ -34202,7 +34228,7 @@ As of mid-2026, the severe supply shocks from the 2024\u20132025 avian flu have
|
|
|
34202
34228
|
text: "Credits and Payments"
|
|
34203
34229
|
},
|
|
34204
34230
|
description: {
|
|
34205
|
-
text: "OpenMates uses a credit-based payment system. You purchase credit packs which are consumed when using AI services. Credits do not expire and remain in your account until used. Payment processing is handled by Stripe - see our Privacy Policy for details about payment data handling."
|
|
34231
|
+
text: "OpenMates uses a credit-based payment system. You purchase credit packs which are consumed when using AI services. Credits do not expire and remain in your account until used. Official-cloud anonymous free usage, when available, is a limited fair-use trial with shared daily/weekly caps and per-identity abuse limits; it is not an account credit balance and may be unavailable when the budget is exhausted. Payment processing is handled by Stripe - see our Privacy Policy for details about payment data handling."
|
|
34206
34232
|
},
|
|
34207
34233
|
refund: {
|
|
34208
34234
|
text: "Refund Policy: You may request a refund for unused credits within 14 days after purchase. Your right of withdrawal expires once credits are used. See our signup process for the full refund consent details."
|
|
@@ -38936,6 +38962,9 @@ As of mid-2026, the severe supply shocks from the 2024\u20132025 avian flu have
|
|
|
38936
38962
|
weekly_remaining: {
|
|
38937
38963
|
text: "Weekly remaining"
|
|
38938
38964
|
},
|
|
38965
|
+
monthly_remaining: {
|
|
38966
|
+
text: "Monthly remaining"
|
|
38967
|
+
},
|
|
38939
38968
|
reset_at: {
|
|
38940
38969
|
text: "Daily reset"
|
|
38941
38970
|
},
|
|
@@ -38968,6 +38997,9 @@ As of mid-2026, the severe supply shocks from the 2024\u20132025 avian flu have
|
|
|
38968
38997
|
},
|
|
38969
38998
|
validation_percent: {
|
|
38970
38999
|
text: "Percent values must be between 0 and 100."
|
|
39000
|
+
},
|
|
39001
|
+
validation_per_identity_cap: {
|
|
39002
|
+
text: "Per-identity daily cap must be at least 1 credit when the monthly budget is above 0."
|
|
38971
39003
|
}
|
|
38972
39004
|
},
|
|
38973
39005
|
tests: {
|
|
@@ -41487,6 +41519,244 @@ function buildAssistantFeedbackDecision(rating) {
|
|
|
41487
41519
|
};
|
|
41488
41520
|
}
|
|
41489
41521
|
|
|
41522
|
+
// src/benchmark.ts
|
|
41523
|
+
import { randomUUID as randomUUID3 } from "crypto";
|
|
41524
|
+
import { writeFileSync as writeFileSync4 } from "fs";
|
|
41525
|
+
// Judge model used to grade cases flagged `needsJudge` when the caller does
// not pass --judge-model.
var DEFAULT_JUDGE_MODEL = "google/gemini-3-flash-preview";

// Built-in benchmark catalogue. Each entry belongs to exactly one suite and is
// either checked by substring (`expectedIncludes`) or graded by the judge
// model (`needsJudge`).
var BENCHMARK_CASES = [
  // Substring check: the reply must contain the literal token.
  {
    id: "smoke-exact-token",
    suite: "smoke",
    prompt: "Reply with exactly this token and no extra text: BENCHMARK_SMOKE_OK",
    expectedIncludes: "BENCHMARK_SMOKE_OK"
  },
  // Substring check: 19 * 23 = 437.
  {
    id: "arithmetic-direct",
    suite: "tools",
    prompt: "Compute 19 * 23. Reply with only the integer result.",
    expectedIncludes: "437"
  },
  // No fixed expected text; scored by the judge model instead.
  {
    id: "quality-concise-explanation",
    suite: "quality",
    prompt: "In four concise sentences, explain why deterministic benchmarks still need human-readable evaluation notes.",
    needsJudge: true
  }
];
|
|
41546
|
+
/**
 * Entry point for `openmates benchmark <subcommand>`.
 *
 * Only the `model` subcommand exists. It expands the selected suites into a
 * list of cases, sends each case prompt as an incognito chat turn addressed to
 * the target model, optionally asks a judge model to grade the answer, and
 * finally hands the collected result to `writeBenchmarkResult`.
 *
 * @param client     OpenMates client; `hasSession()` and `sendMessage()` are used.
 * @param subcommand Benchmark subcommand; missing or "help" prints the help text.
 * @param rest       Positional arguments; `rest[0]` is the target `<provider/model>`.
 * @param flags      Parsed CLI flags (`judge-model`, `suite`, `runs`, `dry-run`,
 *                   `output`, `run-id`, `confirm-spend-credits`, `json`, `help`).
 * @throws Error for an unknown subcommand, a missing target model, invalid
 *         `--suite` / `--runs`, a live run without `--confirm-spend-credits`,
 *         or a live run without a logged-in session. Errors raised by
 *         `client.sendMessage` propagate unchanged.
 */
async function handleBenchmark(client, subcommand, rest, flags) {
  if (!subcommand || subcommand === "help" || flags.help === true) {
    printBenchmarkHelp();
    return;
  }
  if (subcommand !== "model") {
    throw new Error(`Unknown benchmark command '${subcommand}'. Run 'openmates benchmark --help'.`);
  }
  const targetModel = rest[0];
  if (!targetModel) {
    throw new Error("Missing target model. Usage: openmates benchmark model <provider/model> --confirm-spend-credits");
  }
  const judgeModel = typeof flags["judge-model"] === "string" ? flags["judge-model"] : DEFAULT_JUDGE_MODEL;
  const suites = parseSuites(flags.suite);
  const runs = parseRuns(flags.runs);
  const dryRun = flags["dry-run"] === true;
  const output = typeof flags.output === "string" ? flags.output : void 0;
  const runId = typeof flags["run-id"] === "string" ? flags["run-id"] : randomUUID3();
  // Live runs must be explicitly acknowledged; a dry run only prints the plan.
  if (!dryRun && flags["confirm-spend-credits"] !== true) {
    throw new Error(
      "Benchmark runs spend real credits from the logged-in account. Rerun with --confirm-spend-credits, or use --dry-run to preview the plan."
    );
  }
  const cases = expandCases(suites, runs);
  const baseResult = {
    command: "benchmark model",
    status: dryRun ? "planned" : "completed",
    runId,
    targetModel,
    judgeModel,
    suites,
    runs,
    spendsCredits: !dryRun,
    cases: [],
    summary: { total: cases.length, passed: 0, failed: 0 }
  };
  if (dryRun) {
    // The plan is written before the session check, so no login is needed.
    writeBenchmarkResult(baseResult, flags, output);
    return;
  }
  if (!client.hasSession()) {
    throw new Error("Benchmark runs require login. Run 'openmates login' first.");
  }
  // One incognito turn tagged with benchmark metadata. Target and judge turns
  // share every option except the addressed model, the prompt and the case id.
  const sendBenchmarkTurn = (model, prompt, suite, caseId) => client.sendMessage({
    message: `${modelMention(model)} ${prompt}`,
    incognito: true,
    autoApproveSubChats: true,
    benchmarkMetadata: benchmarkMetadata({
      runId,
      suite,
      caseId,
      targetModel,
      judgeModel
    }),
    precollectResponse: true
  });
  for (const benchmarkCase of cases) {
    const startedAt = Date.now();
    const targetResponse = await sendBenchmarkTurn(
      targetModel,
      benchmarkCase.prompt,
      benchmarkCase.suite,
      benchmarkCase.id
    );
    const answer = targetResponse.assistant;
    // NOTE(review): a turn may come back without a string answer (the declared
    // result status includes "waiting_for_user" - confirm). Treat that as a
    // failed substring check instead of crashing on `.includes`.
    const containsExpected = typeof answer === "string" && answer.includes(benchmarkCase.expectedIncludes);
    const caseResult = {
      id: benchmarkCase.id,
      suite: benchmarkCase.suite,
      run: benchmarkCase.run,
      prompt: benchmarkCase.prompt,
      assistant: answer,
      modelName: targetResponse.modelName,
      passed: benchmarkCase.expectedIncludes ? containsExpected : true,
      // Measured before the judge turn, so it covers the target turn only.
      durationMs: Date.now() - startedAt,
      expectedIncludes: benchmarkCase.expectedIncludes
    };
    if (benchmarkCase.needsJudge) {
      const judgeResponse = await sendBenchmarkTurn(
        judgeModel,
        judgePrompt(benchmarkCase.prompt, answer),
        benchmarkCase.suite,
        `${benchmarkCase.id}:judge`
      );
      const judgeAnswer = judgeResponse.assistant;
      // A non-string judge reply is recorded as an unparsed (null) score.
      const judgment = parseJudgment(typeof judgeAnswer === "string" ? judgeAnswer : "");
      caseResult.judge = {
        model: judgeModel,
        score: judgment.score,
        reason: judgment.reason,
        raw: judgeAnswer
      };
      // Judged cases pass only with a parsed score of at least 4.
      caseResult.passed = judgment.score !== null && judgment.score >= 4;
    }
    baseResult.cases.push(caseResult);
  }
  baseResult.summary.passed = baseResult.cases.filter((result) => result.passed).length;
  baseResult.summary.failed = baseResult.cases.length - baseResult.summary.passed;
  writeBenchmarkResult(baseResult, flags, output);
}
|
|
41644
|
+
/**
 * Prints the usage text for `openmates benchmark` with a single console.log call.
 * The default judge model shown in the text is interpolated from
 * DEFAULT_JUDGE_MODEL.
 */
function printBenchmarkHelp() {
|
|
41645
|
+
console.log(`Benchmark commands:
|
|
41646
|
+
openmates benchmark model <provider/model> --confirm-spend-credits [--suite smoke|tools|quality|all] [--runs <n>] [--json]
|
|
41647
|
+
|
|
41648
|
+
Runs real incognito chat requests through the OpenMates product path. Live runs
|
|
41649
|
+
spend the logged-in user's credits and usage entries are grouped as benchmark spend.
|
|
41650
|
+
|
|
41651
|
+
Options:
|
|
41652
|
+
--confirm-spend-credits Required for live benchmark runs
|
|
41653
|
+
--dry-run Preview the benchmark plan without login or spend
|
|
41654
|
+
--suite <list> Comma-separated suites: smoke, tools, quality, all (default: smoke)
|
|
41655
|
+
--runs <n> Repeat each selected case (default: 1)
|
|
41656
|
+
--judge-model <provider/model> Judge for quality cases (default: ${DEFAULT_JUDGE_MODEL})
|
|
41657
|
+
--run-id <id> Reuse a benchmark run id for grouping
|
|
41658
|
+
--output <path> Save JSON result to a file
|
|
41659
|
+
--json Print JSON result`);
|
|
41660
|
+
}
|
|
41661
|
+
/**
 * Normalises the `--suite` flag into a de-duplicated list of suite names.
 *
 * @param value Raw flag value: `undefined`/`false` when absent, `true` when the
 *              flag was given without a value, otherwise a comma-separated string.
 * @returns `["smoke"]` when the flag is absent; every suite when `all` is
 *          listed; otherwise the requested suites in first-seen order.
 * @throws Error when the flag has no string value, or when the list is empty
 *         or contains a name other than smoke, tools, quality or all.
 */
function parseSuites(value) {
  if (value === void 0 || value === false) return ["smoke"];
  // Covers the bare `--suite` case (true) and any other non-string value,
  // which would otherwise fail on `.split` with an unhelpful TypeError.
  if (typeof value !== "string") throw new Error("--suite requires a value");
  const allowed = ["smoke", "tools", "quality"];
  const suites = value.split(",").map((suite) => suite.trim()).filter(Boolean);
  // Validate every entry before expanding `all`, so a typo next to `all`
  // (e.g. "all,qualty") is reported instead of being silently ignored.
  const invalid = suites.filter((suite) => suite !== "all" && !allowed.includes(suite));
  if (invalid.length > 0 || suites.length === 0) {
    throw new Error("Invalid --suite. Use smoke, tools, quality, or all.");
  }
  if (suites.includes("all")) return [...allowed];
  return [...new Set(suites)];
}
|
|
41673
|
+
/**
 * Parses the `--runs` flag (how many times each selected case is repeated).
 *
 * @param value Raw flag value: `undefined`/`false` when absent, `true` when the
 *              flag was given without a value, otherwise the user's text.
 * @returns 1 when the flag is absent, otherwise the parsed integer (1-20).
 * @throws Error when the flag has no value or is not a plain decimal integer
 *         between 1 and 20.
 */
function parseRuns(value) {
  if (value === void 0 || value === false) return 1;
  if (value === true) throw new Error("--runs requires a value");
  const text = String(value).trim();
  // Number.parseInt stops at the first non-digit, so "3abc", "2.9" and "1e3"
  // would be accepted as 3, 2 and 1. Require the whole value to be digits.
  const parsed = /^\+?\d+$/.test(text) ? Number(text) : Number.NaN;
  if (!Number.isInteger(parsed) || parsed < 1 || parsed > 20) {
    throw new Error("--runs must be an integer from 1 to 20");
  }
  return parsed;
}
|
|
41682
|
+
/**
 * Builds the ordered execution plan: every catalogue case whose suite was
 * requested, repeated once per run. Runs are the outer ordering, so all cases
 * of run 1 precede all cases of run 2.
 *
 * @param suites Suite names to include.
 * @param runs   Number of repetitions; run numbers start at 1.
 * @returns Shallow copies of the matching cases, each tagged with its `run`.
 */
function expandCases(suites, runs) {
  const matching = [];
  for (const candidate of BENCHMARK_CASES) {
    if (suites.includes(candidate.suite)) matching.push(candidate);
  }
  const plan = [];
  let run = 1;
  while (run <= runs) {
    const currentRun = run;
    plan.push(...matching.map((candidate) => ({ ...candidate, run: currentRun })));
    run += 1;
  }
  return plan;
}
|
|
41690
|
+
/**
 * Converts a `<provider>/<model>` identifier into the chat mention
 * `@ai-model:<model>:<provider>`. The split happens at the first slash. When
 * there is no slash, or either side of it is empty, the identifier is
 * embedded unchanged as `@ai-model:<identifier>`.
 *
 * @param model Model identifier as typed on the command line.
 * @returns The mention string to prefix a prompt with.
 */
function modelMention(model) {
  const slashAt = model.indexOf("/");
  const hasSlash = slashAt !== -1;
  const provider = hasSlash ? model.slice(0, slashAt) : "";
  const modelId = hasSlash ? model.slice(slashAt + 1) : "";
  const hasBothParts = provider !== "" && modelId !== "";
  return hasBothParts ? `@ai-model:${modelId}:${provider}` : `@ai-model:${model}`;
}
|
|
41698
|
+
/**
 * Maps the CLI's camelCase benchmark identifiers onto the snake_case label
 * object attached to a message, with `source` fixed to "benchmark".
 *
 * @param params Object with `runId`, `suite`, `caseId`, `targetModel` and
 *               `judgeModel`.
 * @returns The metadata object passed as `benchmarkMetadata` to sendMessage.
 */
function benchmarkMetadata(params) {
  const { runId, suite, caseId, targetModel, judgeModel } = params;
  return {
    source: "benchmark",
    benchmark_run_id: runId,
    benchmark_suite: suite,
    benchmark_case: caseId,
    benchmark_target_model: targetModel,
    benchmark_judge_model: judgeModel
  };
}
|
|
41708
|
+
/**
 * Builds the five-line instruction sent to the judge model. The benchmark
 * prompt and the candidate answer are embedded as JSON-encoded strings so
 * quotes and newlines inside them cannot break the line structure.
 *
 * @param prompt The prompt originally given to the target model.
 * @param answer The target model's reply.
 * @returns Newline-joined judge instructions.
 */
function judgePrompt(prompt, answer) {
  const instructions =
    "You are judging a model benchmark response.\n" +
    'Return only JSON with shape {"score": number, "reason": string}.\n' +
    "Score from 1 to 5 for correctness, clarity, and following instructions.\n";
  const promptLine = `Benchmark prompt: ${JSON.stringify(prompt)}`;
  const answerLine = `Candidate answer: ${JSON.stringify(answer)}`;
  return `${instructions}${promptLine}\n${answerLine}`;
}
|
|
41717
|
+
/**
 * Reads the judge model's reply and pulls out its verdict.
 *
 * @param answer Raw text returned by the judge model.
 * @returns `{ score, reason }`. `score` is a finite number or null, and
 *          `reason` is a string or null. Both are null when no JSON could be
 *          located or parsed.
 */
function parseJudgment(answer) {
  const verdict = { score: null, reason: null };
  const candidate = extractJsonObject(answer);
  if (candidate) {
    try {
      // Destructuring stays inside the try: a payload that parses to null
      // throws here and must fall through to the all-null verdict.
      const { score, reason } = JSON.parse(candidate);
      if (typeof score === "number" && Number.isFinite(score)) verdict.score = score;
      if (typeof reason === "string") verdict.reason = reason;
    } catch {
      // Malformed JSON is reported as an unparsed verdict, not an error.
    }
  }
  return verdict;
}
|
|
41729
|
+
/**
 * Locates the text of a JSON object inside a free-form model reply.
 *
 * The first Markdown code fence is preferred; within it (or, failing that,
 * within the whole reply) the span from the first "{" to the last "}" is
 * returned. The result is a candidate only and is not validated as JSON.
 *
 * @param text Model reply; anything that is not a string yields null.
 * @returns The candidate object text, or null when no "{...}" span exists.
 */
function extractJsonObject(text) {
  // A missing reply would otherwise throw on `.match`.
  if (typeof text !== "string") return null;
  const sliceObject = (source) => {
    const start = source.indexOf("{");
    const end = source.lastIndexOf("}");
    if (start === -1 || end === -1 || end <= start) return null;
    return source.slice(start, end + 1);
  };
  const fenced = text.match(/```(?:json)?\s*([\s\S]*?)\s*```/i);
  if (fenced) {
    // Only trust the fence when it actually holds an object. A fence with
    // other content must not hide a valid object elsewhere in the reply.
    const inside = sliceObject(fenced[1]);
    if (inside) return inside;
  }
  return sliceObject(text);
}
|
|
41737
|
+
/**
 * Emits a benchmark result.
 *
 * The result is serialised as pretty-printed JSON with a trailing newline.
 * When `output` is set the JSON is written to that path. When `output` is set
 * or `--json` was passed, the JSON is also written to stdout and nothing else
 * is printed. Otherwise a human-readable summary is logged, followed by one
 * PASS/FAIL line per case for completed runs.
 *
 * @param result Benchmark result object built by the benchmark command.
 * @param flags  Parsed CLI flags; only `flags.json` is read.
 * @param output Optional file path for the JSON copy.
 */
function writeBenchmarkResult(result, flags, output) {
  const serialized = `${JSON.stringify(result, null, 2)}\n`;
  if (output) {
    writeFileSync4(output, serialized, "utf-8");
  }
  const machineReadable = flags.json === true || Boolean(output);
  if (machineReadable) {
    process.stdout.write(serialized);
    return;
  }
  console.log(`Benchmark ${result.status}: ${result.targetModel}`);
  console.log(`Run ID: ${result.runId}`);
  console.log(`Suites: ${result.suites.join(", ")}`);
  console.log(`Judge: ${result.judgeModel}`);
  console.log(`Spend credits: ${result.spendsCredits ? "yes" : "no"}`);
  if (result.status !== "completed") {
    return;
  }
  console.log(`Passed: ${result.summary.passed}/${result.summary.total}`);
  for (const entry of result.cases) {
    const verdict = entry.passed ? "PASS" : "FAIL";
    // Judged cases get a suffix; a null score is shown as "unparsed".
    let judgeSuffix = "";
    if (entry.judge?.score !== void 0) {
      judgeSuffix = ` judge=${entry.judge.score ?? "unparsed"}`;
    }
    console.log(`${verdict} ${entry.suite}/${entry.id} (${entry.durationMs}ms)${judgeSuffix}`);
  }
}
|
|
41759
|
+
|
|
41490
41760
|
// src/cli.ts
|
|
41491
41761
|
async function main() {
|
|
41492
41762
|
const parsed = parseArgs(process.argv.slice(2));
|
|
@@ -41557,6 +41827,10 @@ async function main() {
|
|
|
41557
41827
|
printDocsHelp();
|
|
41558
41828
|
return;
|
|
41559
41829
|
}
|
|
41830
|
+
if (command === "benchmark") {
|
|
41831
|
+
printBenchmarkHelp();
|
|
41832
|
+
return;
|
|
41833
|
+
}
|
|
41560
41834
|
printHelp();
|
|
41561
41835
|
return;
|
|
41562
41836
|
}
|
|
@@ -41627,6 +41901,10 @@ async function main() {
|
|
|
41627
41901
|
handleFeedback(subcommand, rest, parsed.flags);
|
|
41628
41902
|
return;
|
|
41629
41903
|
}
|
|
41904
|
+
if (command === "benchmark") {
|
|
41905
|
+
await handleBenchmark(client, subcommand, rest, parsed.flags);
|
|
41906
|
+
return;
|
|
41907
|
+
}
|
|
41630
41908
|
throw new Error(`Unknown command '${command}'. Run 'openmates help'.`);
|
|
41631
41909
|
}
|
|
41632
41910
|
function shouldInitializeRedactor(command, subcommand) {
|
|
@@ -42883,7 +43161,7 @@ async function handleEmbeds(client, subcommand, rest, flags) {
|
|
|
42883
43161
|
throw new Error("Embed version content was not available after local reconstruction.");
|
|
42884
43162
|
}
|
|
42885
43163
|
if (typeof flags.output === "string") {
|
|
42886
|
-
|
|
43164
|
+
writeFileSync5(flags.output, result.content, "utf-8");
|
|
42887
43165
|
if (flags.json === true) {
|
|
42888
43166
|
printJson2({ ...result, output: flags.output });
|
|
42889
43167
|
} else {
|
|
@@ -45887,6 +46165,7 @@ Commands:
|
|
|
45887
46165
|
openmates inspirations [--lang <code>] [--json] Daily inspirations
|
|
45888
46166
|
openmates newchatsuggestions [--limit <n>] [--json] Personalized new chat suggestions
|
|
45889
46167
|
openmates feedback [--help] Assistant response feedback helpers
|
|
46168
|
+
openmates benchmark [--help] Run real model benchmarks with usage tagged as benchmark spend
|
|
45890
46169
|
openmates server [--help] Server management (install, start, stop, ...)
|
|
45891
46170
|
openmates docs [--help] Browse, search, and download documentation
|
|
45892
46171
|
openmates e2e provision-auth-accounts Provision local E2E auth-account artifacts
|
package/dist/cli.js
CHANGED
package/dist/index.d.ts
CHANGED
|
@@ -298,6 +298,14 @@ interface ChatListPage {
|
|
|
298
298
|
limit: number;
|
|
299
299
|
hasMore: boolean;
|
|
300
300
|
}
|
|
301
|
+
/**
 * Labels attached to a message sent by the CLI benchmark command so that the
 * resulting usage can be grouped by source.
 */
interface BenchmarkMetadata {
    /** Fixed discriminator marking the message as benchmark traffic. */
    source: "benchmark";
    /** Identifier shared by every message of one benchmark run. */
    benchmark_run_id: string;
    /** Suite the case belongs to. */
    benchmark_suite: string;
    /** Case identifier. */
    benchmark_case: string;
    /** Model under test, as passed to the benchmark command. */
    benchmark_target_model: string;
    /** Model used for grading, when one applies. */
    benchmark_judge_model?: string;
}
|
|
301
309
|
/** Decrypted message for display */
|
|
302
310
|
interface DecryptedMessage {
|
|
303
311
|
id: string;
|
|
@@ -688,6 +696,10 @@ declare class OpenMatesClient {
|
|
|
688
696
|
connectedAccountDirectory?: ConnectedAccountDirectoryEntry[];
|
|
689
697
|
/** Refresh-token envelopes to convert into short-lived token refs before send. */
|
|
690
698
|
connectedAccountTokenRefInputs?: ConnectedAccountTurnTokenRefInput[];
|
|
699
|
+
/** Non-sensitive CLI benchmark labels for usage-source grouping. */
|
|
700
|
+
benchmarkMetadata?: BenchmarkMetadata;
|
|
701
|
+
/** Start collecting before send for latency-sensitive benchmark turns. */
|
|
702
|
+
precollectResponse?: boolean;
|
|
691
703
|
}): Promise<{
|
|
692
704
|
status: "completed" | "waiting_for_user";
|
|
693
705
|
chatId: string;
|
package/dist/index.js
CHANGED