npm - openmates - Versions diffs - 0.12.0-alpha.12 → 0.12.0-alpha.14 - Mend

openmates 0.12.0-alpha.12 → 0.12.0-alpha.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

package/README.md +2 -1
package/dist/{chunk-R5Z4FBJJ.js → chunk-YHOUQRWZ.js} +29 -2
package/dist/cli.js +1 -1
package/dist/index.js +1 -1
package/package.json +1 -1

package/README.md CHANGED

@@ -26,10 +26,11 @@ openmates apps code run --language python --code 'print("Hello")'
 openmates settings account export data --json
 openmates settings memories list --json
 openmates docs list
+openmates benchmark model google/gemini-3.5-flash --dry-run --json
 openmates server install
 ```
-Run `openmates --help` or `openmates <command> --help` for the full command surface.
+Run `openmates --help` or `openmates <command> --help` for the full command surface. Benchmark commands also support `openmates benchmark --help` for suite, comparison, judge, and spend-confirmation options.
 ## Environments

package/dist/{chunk-R5Z4FBJJ.js → chunk-YHOUQRWZ.js} RENAMED

@@ -41830,6 +41830,7 @@ async function handleBenchmark(client, subcommand, rest, flags) {
   const runs = parseRuns(flags.runs);
   const extensiveSize = parseExtensiveSize(flags["extensive-size"]);
   const parallel = parseParallel(flags.parallel);
+  const caseIds = parseCaseIds(flags.case);
   const dryRun = flags["dry-run"] === true;
   const output = typeof flags.output === "string" ? flags.output : void 0;
   const runId = typeof flags["run-id"] === "string" ? flags["run-id"] : randomUUID3();
@@ -41839,7 +41840,7 @@ async function handleBenchmark(client, subcommand, rest, flags) {
       "Benchmark runs spend real credits from the logged-in account. Rerun with --confirm-spend-credits, or use --dry-run to preview the plan."
     );
   }
-  const cases = expandCases(suites, runs, extensiveSize);
+  const cases = filterCases(expandCases(suites, runs, extensiveSize), caseIds);
   const pricing = loadPricingForModels([...targetModels, judgeModel]);
   const estimate = estimateCredits(cases, targetModels, judgeModel, pricing);
   const result = makeBaseResult({
@@ -41893,6 +41894,7 @@ Options:
   --dry-run                     Preview the benchmark plan without inference or spend
   --compare                     Compare two or more target models
   --suite <list>                Comma-separated suites: quick, extensive, all (default: quick)
+  --case <id[,id...]>           Run only specific case id(s) from the selected suites
   --extensive-size <n>          Extensive cases to run: 5, 10, or 20 (default: ${DEFAULT_EXTENSIVE_SIZE})
   --runs <n>                    Repeat each selected case (default: 1)
   --parallel <n>                Concurrent target case requests (default: ${DEFAULT_PARALLEL})
@@ -41941,6 +41943,24 @@ function parseParallel(value) {
   }
   return parsed;
 }
+function parseCaseIds(value) {
+  if (value === void 0 || value === false) return [];
+  if (value === true) throw new Error("--case requires a case id");
+  const caseIds = value.split(",").map((caseId) => caseId.trim()).filter(Boolean);
+  if (caseIds.length === 0) throw new Error("--case requires at least one case id");
+  return [...new Set(caseIds)];
+}
+function filterCases(cases, caseIds) {
+  if (caseIds.length === 0) return cases;
+  const availableIds = new Set(cases.map((benchmarkCase) => benchmarkCase.id));
+  const missing = caseIds.filter((caseId) => !availableIds.has(caseId));
+  if (missing.length > 0) {
+    throw new Error(
+      `Unknown benchmark case id(s): ${missing.join(", ")}. Available in selected suite(s): ${[...availableIds].sort().join(", ")}`
+    );
+  }
+  return cases.filter((benchmarkCase) => caseIds.includes(benchmarkCase.id));
+}
 function expandCases(suites, runs, extensiveSize) {
   const selected = [];
   if (suites.includes("quick")) selected.push(...QUICK_CASES);
@@ -42145,7 +42165,14 @@ async function uploadBenchmarkImage(client, fileEmbed) {
   fileEmbed.embed.status = "finished";
   fileEmbed.embed.contentHash = uploadResult.content_hash;
   fileEmbed.embed.embedId = uploadResult.embed_id;
-  fileEmbed.referenceBlock = createEmbedReferenceBlock(embedRef);
+  fileEmbed.referenceBlock = createBenchmarkEmbedReferenceBlock(fileEmbed.embed.embedId, fileEmbed.embed.type);
+}
+function createBenchmarkEmbedReferenceBlock(embedId, embedType) {
+  return `
+\`\`\`json
+${JSON.stringify({ type: embedType, embed_id: embedId })}
+\`\`\``;
 }
 async function judgeCase(params) {
   const startedAt = Date.now();

package/dist/cli.js CHANGED

@@ -2,7 +2,7 @@
 import {
   getExtForLang,
   serializeToYaml
-} from "./chunk-R5Z4FBJJ.js";
+} from "./chunk-YHOUQRWZ.js";
 import "./chunk-AXNRPVLE.js";
 export {
   getExtForLang,

package/dist/index.js CHANGED

@@ -9,7 +9,7 @@ import {
   deriveAppUrl,
   getExtForLang,
   serializeToYaml
-} from "./chunk-R5Z4FBJJ.js";
+} from "./chunk-YHOUQRWZ.js";
 import "./chunk-AXNRPVLE.js";
 export {
   ASSISTANT_FEEDBACK_REPORT_TITLE,

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "openmates",
-  "version": "0.12.0-alpha.12",
+  "version": "0.12.0-alpha.14",
   "description": "OpenMates CLI and SDK",
   "type": "module",
   "main": "dist/index.js",