openmates 0.12.0-alpha.12 → 0.12.0-alpha.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -1
- package/dist/{chunk-R5Z4FBJJ.js → chunk-YHOUQRWZ.js} +29 -2
- package/dist/cli.js +1 -1
- package/dist/index.js +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -26,10 +26,11 @@ openmates apps code run --language python --code 'print("Hello")'
|
|
|
26
26
|
openmates settings account export data --json
|
|
27
27
|
openmates settings memories list --json
|
|
28
28
|
openmates docs list
|
|
29
|
+
openmates benchmark model google/gemini-3.5-flash --dry-run --json
|
|
29
30
|
openmates server install
|
|
30
31
|
```
|
|
31
32
|
|
|
32
|
-
Run `openmates --help` or `openmates <command> --help` for the full command surface.
|
|
33
|
+
Run `openmates --help` or `openmates <command> --help` for the full command surface. Benchmark commands also support `openmates benchmark --help` for suite, comparison, judge, and spend-confirmation options.
|
|
33
34
|
|
|
34
35
|
## Environments
|
|
35
36
|
|
|
@@ -41830,6 +41830,7 @@ async function handleBenchmark(client, subcommand, rest, flags) {
|
|
|
41830
41830
|
const runs = parseRuns(flags.runs);
|
|
41831
41831
|
const extensiveSize = parseExtensiveSize(flags["extensive-size"]);
|
|
41832
41832
|
const parallel = parseParallel(flags.parallel);
|
|
41833
|
+
const caseIds = parseCaseIds(flags.case);
|
|
41833
41834
|
const dryRun = flags["dry-run"] === true;
|
|
41834
41835
|
const output = typeof flags.output === "string" ? flags.output : void 0;
|
|
41835
41836
|
const runId = typeof flags["run-id"] === "string" ? flags["run-id"] : randomUUID3();
|
|
@@ -41839,7 +41840,7 @@ async function handleBenchmark(client, subcommand, rest, flags) {
|
|
|
41839
41840
|
"Benchmark runs spend real credits from the logged-in account. Rerun with --confirm-spend-credits, or use --dry-run to preview the plan."
|
|
41840
41841
|
);
|
|
41841
41842
|
}
|
|
41842
|
-
const cases = expandCases(suites, runs, extensiveSize);
|
|
41843
|
+
const cases = filterCases(expandCases(suites, runs, extensiveSize), caseIds);
|
|
41843
41844
|
const pricing = loadPricingForModels([...targetModels, judgeModel]);
|
|
41844
41845
|
const estimate = estimateCredits(cases, targetModels, judgeModel, pricing);
|
|
41845
41846
|
const result = makeBaseResult({
|
|
@@ -41893,6 +41894,7 @@ Options:
|
|
|
41893
41894
|
--dry-run Preview the benchmark plan without inference or spend
|
|
41894
41895
|
--compare Compare two or more target models
|
|
41895
41896
|
--suite <list> Comma-separated suites: quick, extensive, all (default: quick)
|
|
41897
|
+
--case <id[,id...]> Run only specific case id(s) from the selected suites
|
|
41896
41898
|
--extensive-size <n> Extensive cases to run: 5, 10, or 20 (default: ${DEFAULT_EXTENSIVE_SIZE})
|
|
41897
41899
|
--runs <n> Repeat each selected case (default: 1)
|
|
41898
41900
|
--parallel <n> Concurrent target case requests (default: ${DEFAULT_PARALLEL})
|
|
@@ -41941,6 +41943,24 @@ function parseParallel(value) {
|
|
|
41941
41943
|
}
|
|
41942
41944
|
return parsed;
|
|
41943
41945
|
}
|
|
41946
|
+
function parseCaseIds(value) {
|
|
41947
|
+
if (value === void 0 || value === false) return [];
|
|
41948
|
+
if (value === true) throw new Error("--case requires a case id");
|
|
41949
|
+
const caseIds = value.split(",").map((caseId) => caseId.trim()).filter(Boolean);
|
|
41950
|
+
if (caseIds.length === 0) throw new Error("--case requires at least one case id");
|
|
41951
|
+
return [...new Set(caseIds)];
|
|
41952
|
+
}
|
|
41953
|
+
function filterCases(cases, caseIds) {
|
|
41954
|
+
if (caseIds.length === 0) return cases;
|
|
41955
|
+
const availableIds = new Set(cases.map((benchmarkCase) => benchmarkCase.id));
|
|
41956
|
+
const missing = caseIds.filter((caseId) => !availableIds.has(caseId));
|
|
41957
|
+
if (missing.length > 0) {
|
|
41958
|
+
throw new Error(
|
|
41959
|
+
`Unknown benchmark case id(s): ${missing.join(", ")}. Available in selected suite(s): ${[...availableIds].sort().join(", ")}`
|
|
41960
|
+
);
|
|
41961
|
+
}
|
|
41962
|
+
return cases.filter((benchmarkCase) => caseIds.includes(benchmarkCase.id));
|
|
41963
|
+
}
|
|
41944
41964
|
function expandCases(suites, runs, extensiveSize) {
|
|
41945
41965
|
const selected = [];
|
|
41946
41966
|
if (suites.includes("quick")) selected.push(...QUICK_CASES);
|
|
@@ -42145,7 +42165,14 @@ async function uploadBenchmarkImage(client, fileEmbed) {
|
|
|
42145
42165
|
fileEmbed.embed.status = "finished";
|
|
42146
42166
|
fileEmbed.embed.contentHash = uploadResult.content_hash;
|
|
42147
42167
|
fileEmbed.embed.embedId = uploadResult.embed_id;
|
|
42148
|
-
fileEmbed.referenceBlock =
|
|
42168
|
+
fileEmbed.referenceBlock = createBenchmarkEmbedReferenceBlock(fileEmbed.embed.embedId, fileEmbed.embed.type);
|
|
42169
|
+
}
|
|
42170
|
+
function createBenchmarkEmbedReferenceBlock(embedId, embedType) {
|
|
42171
|
+
return `
|
|
42172
|
+
|
|
42173
|
+
\`\`\`json
|
|
42174
|
+
${JSON.stringify({ type: embedType, embed_id: embedId })}
|
|
42175
|
+
\`\`\``;
|
|
42149
42176
|
}
|
|
42150
42177
|
async function judgeCase(params) {
|
|
42151
42178
|
const startedAt = Date.now();
|
package/dist/cli.js
CHANGED
package/dist/index.js
CHANGED