@smithers-orchestrator/cli 0.20.3 → 0.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/index.js CHANGED
@@ -1,8 +1,10 @@
1
1
  #!/usr/bin/env bun
2
2
  import { setJsonMode } from "./util/logger.ts";
3
+ import { findFirstPositionalIndex, parseMcpSurfaceArgv, rewriteBareResumeFlagArgv } from "./argv-utils.js";
4
+ import { CLI_JSON_ARGUMENT_MAX_BYTES, parseJsonArgument, parseJsonInput } from "./json-args.js";
3
5
  import { resolve, dirname, basename } from "node:path";
4
6
  import { pathToFileURL } from "node:url";
5
- import { readFileSync, existsSync, openSync, statSync, mkdirSync, writeFileSync } from "node:fs";
7
+ import { readFileSync, existsSync, openSync, statSync } from "node:fs";
6
8
  import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
7
9
  import { Effect, Fiber } from "effect";
8
10
  import { Cli, Mcp as IncurMcp, z } from "incur";
@@ -39,7 +41,18 @@ import { listAccounts, removeAccount } from "@smithers-orchestrator/accounts";
39
41
  import { runAgentAdd, pingAccount } from "./agent-commands/runAgentAdd.js";
40
42
  import { agentAddWizard } from "./agent-commands/agentAddWizard.js";
41
43
  import { initWorkflowPack, getWorkflowFollowUpCtas } from "./workflow-pack.js";
42
- import { discoverWorkflows, resolveWorkflow, createWorkflowFile } from "./workflows.js";
44
+ import { discoverWorkflows, resolveWorkflow, createWorkflowFile, renderWorkflowSkill, writeWorkflowSkillFiles } from "./workflows.js";
45
+ import {
46
+ assertEvalRunIdsAvailable,
47
+ assertEvalReportWritable,
48
+ buildEvalPlan,
49
+ buildEvalReport,
50
+ evaluateEvalCaseResult,
51
+ loadEvalCases,
52
+ renderEvalPlan,
53
+ renderEvalReport,
54
+ writeEvalReport,
55
+ } from "./eval-suite.js";
43
56
  import { ask } from "./ask.js";
44
57
  import { runScheduler } from "./scheduler.js";
45
58
  import { resumeRunDetached } from "./resume-detached.js";
@@ -47,6 +60,7 @@ import { formatCliAgentCapabilityDoctorReport, getCliAgentCapabilityDoctorReport
47
60
  import { parseDurationMs, supervisorLoopEffect, } from "./supervisor.js";
48
61
  import { WATCH_MIN_INTERVAL_MS, runWatchLoop, watchIntervalSecondsToMs, } from "./watch.js";
49
62
  import { createSemanticMcpServer } from "./mcp/semantic-server.js";
63
+ import { parseTokenScopes, readSmithersTokenStore, smithersTokenStorePath, writeSmithersTokenStore, } from "./token-store.js";
50
64
  import pc from "picocolors";
51
65
  import crypto from "node:crypto";
52
66
  import React from "react";
@@ -105,43 +119,9 @@ function readPackageVersion() {
105
119
  return "unknown";
106
120
  }
107
121
  }
108
- function smithersTokenStorePath() {
109
- return process.env.SMITHERS_TOKEN_STORE ?? resolve(process.env.HOME ?? process.cwd(), ".smithers", "tokens.json");
110
- }
111
- function readSmithersTokenStore() {
112
- const path = smithersTokenStorePath();
113
- if (!existsSync(path)) {
114
- return { tokens: {} };
115
- }
116
- try {
117
- const parsed = JSON.parse(readFileSync(path, "utf8"));
118
- if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) {
119
- return { tokens: {} };
120
- }
121
- const tokens = parsed.tokens && typeof parsed.tokens === "object" && !Array.isArray(parsed.tokens)
122
- ? parsed.tokens
123
- : {};
124
- return { tokens };
125
- }
126
- catch {
127
- return { tokens: {} };
128
- }
129
- }
130
- function writeSmithersTokenStore(store) {
131
- const path = smithersTokenStorePath();
132
- mkdirSync(dirname(path), { recursive: true });
133
- writeFileSync(path, `${JSON.stringify(store, null, 2)}\n`, { mode: 0o600 });
134
- }
135
- function parseTokenScopes(raw) {
136
- return raw
137
- .split(/[,\s]+/)
138
- .map((scope) => scope.trim())
139
- .filter(Boolean);
140
- }
141
122
  const CLI_ARGUMENT_MAX_LENGTH = 4096;
142
123
  const CLI_IDENTIFIER_MAX_LENGTH = 256;
143
124
  const CLI_TEXT_ARGUMENT_MAX_LENGTH = 64 * 1024;
144
- const CLI_JSON_ARGUMENT_MAX_BYTES = 1024 * 1024;
145
125
  const CLI_HANDLER_BOUNDS_WRAPPED = Symbol("smithers.cliHandlerBoundsWrapped");
146
126
  /**
147
127
  * @param {string} path
@@ -240,55 +220,6 @@ function wrapCliCommandHandlersWithInputBounds(commands) {
240
220
  entry[CLI_HANDLER_BOUNDS_WRAPPED] = true;
241
221
  }
242
222
  }
243
- /**
244
- * @param {string | undefined} raw
245
- * @param {string} label
246
- * @param {FailFn} fail
247
- */
248
- function parseJsonInput(raw, label, fail) {
249
- if (!raw)
250
- return undefined;
251
- try {
252
- return JSON.parse(raw);
253
- }
254
- catch (err) {
255
- return fail({
256
- code: "INVALID_JSON",
257
- message: `Invalid JSON for ${label}: ${err?.message ?? String(err)}`,
258
- exitCode: 4,
259
- });
260
- }
261
- }
262
- /**
263
- * @param {string | undefined} raw
264
- * @param {FailFn} fail
265
- * @returns {Record<string, string | number | boolean> | undefined}
266
- */
267
- function parseAnnotations(raw, fail) {
268
- const parsed = parseJsonInput(raw, "annotations", fail);
269
- if (parsed === undefined)
270
- return undefined;
271
- if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) {
272
- return fail({
273
- code: "INVALID_ANNOTATIONS",
274
- message: "Run annotations must be a flat JSON object of string/number/boolean values",
275
- exitCode: 4,
276
- });
277
- }
278
- /** @type {Record<string, string | number | boolean>} */
279
- const annotations = {};
280
- for (const [key, value] of Object.entries(parsed)) {
281
- if (!["string", "number", "boolean"].includes(typeof value)) {
282
- return fail({
283
- code: "INVALID_ANNOTATIONS",
284
- message: `Run annotation ${key} must be a string, number, or boolean`,
285
- exitCode: 4,
286
- });
287
- }
288
- annotations[key] = /** @type {string | number | boolean} */ (value);
289
- }
290
- return annotations;
291
- }
292
223
  /**
293
224
  * @param {string | undefined} status
294
225
  */
@@ -1308,6 +1239,24 @@ const upOptions = z.object({
1308
1239
  authToken: z.string().optional().describe("Bearer token for HTTP auth (or set SMITHERS_API_KEY)"),
1309
1240
  metrics: z.boolean().default(true).describe("Expose /metrics endpoint (with --serve)"),
1310
1241
  });
1242
+ const evalOptions = z.object({
1243
+ cases: z.string().describe("JSON or JSONL eval case file"),
1244
+ suite: z.string().optional().describe("Stable suite ID used in run IDs and report paths"),
1245
+ runLabel: z.string().optional().describe("Run label appended to eval run IDs; defaults to current UTC timestamp plus a nonce"),
1246
+ dryRun: z.boolean().default(false).describe("Plan the suite without launching runs"),
1247
+ concurrency: z.number().int().min(1).max(16).default(1).describe("Number of eval cases to run at once"),
1248
+ maxCases: z.number().int().min(1).optional().describe("Run only the first N cases"),
1249
+ report: z.string().optional().describe("Write report JSON to this path"),
1250
+ force: z.boolean().default(false).describe("Overwrite an existing eval report"),
1251
+ includeOutput: z.boolean().default(true).describe("Include workflow outputs in the report"),
1252
+ maxConcurrency: z.number().int().min(1).optional().describe("Per-workflow max task concurrency"),
1253
+ root: z.string().optional().describe("Tool sandbox root directory"),
1254
+ log: z.boolean().default(true).describe("Enable NDJSON event log file output"),
1255
+ logDir: z.string().optional().describe("NDJSON event logs directory"),
1256
+ allowNetwork: z.boolean().default(false).describe("Allow bash tool network requests"),
1257
+ maxOutputBytes: z.number().int().min(1).optional().describe("Max bytes a single tool call can return"),
1258
+ toolTimeoutMs: z.number().int().min(1).optional().describe("Max wall-clock time per tool call in ms"),
1259
+ });
1311
1260
  const superviseOptions = z.object({
1312
1261
  dryRun: z.boolean().default(false).describe("Show which stale runs would be resumed, without acting"),
1313
1262
  interval: z.string().default("10s").describe("Poll interval (e.g. 10s, 30s, 1m)"),
@@ -1347,7 +1296,7 @@ const chatOptions = z.object({
1347
1296
  stderr: z.boolean().default(true).describe("Include agent stderr output"),
1348
1297
  });
1349
1298
  const chatCreateOptions = z.object({
1350
- agent: z.enum(["claude-code", "codex", "gemini"]).describe("CLI agent engine to launch"),
1299
+ agent: z.enum(["claude-code", "codex", "antigravity", "gemini"]).describe("CLI agent engine to launch"),
1351
1300
  cwd: z.string().optional().describe("Working directory for the chat session (default: current directory)"),
1352
1301
  });
1353
1302
  const inspectArgs = z.object({
@@ -1438,6 +1387,13 @@ const workflowPathArgs = z.object({
1438
1387
  const workflowDoctorArgs = z.object({
1439
1388
  name: z.string().optional().describe("Workflow ID"),
1440
1389
  });
1390
+ const workflowSkillArgs = z.object({
1391
+ name: z.string().optional().describe("Workflow ID, or omit to generate skills for all workflows"),
1392
+ });
1393
+ const workflowSkillOptions = z.object({
1394
+ output: z.string().optional().describe("Output file for one workflow, or output directory for all workflows"),
1395
+ force: z.boolean().default(false).describe("Overwrite existing skill files"),
1396
+ });
1441
1397
  const workflowRunOptions = upOptions.extend({
1442
1398
  prompt: z.string().optional().describe("Prompt text mapped to input.prompt when --input is omitted"),
1443
1399
  });
@@ -1455,6 +1411,54 @@ function normalizeWorkflowRunOptions(options) {
1455
1411
  root: options.root ?? ".",
1456
1412
  };
1457
1413
  }
1414
+ function formatRequestedJsonOutput() {
1415
+ for (let index = 0; index < process.argv.length; index += 1) {
1416
+ const arg = process.argv[index];
1417
+ if (arg === "--format") {
1418
+ const value = process.argv[index + 1];
1419
+ return value === "json" || value === "jsonl";
1420
+ }
1421
+ if (arg === "--format=json" || arg === "--format=jsonl") {
1422
+ return true;
1423
+ }
1424
+ }
1425
+ return false;
1426
+ }
1427
+ function defaultEvalRunLabel() {
1428
+ const timestamp = new Date().toISOString().replace(/[-:TZ.]/g, "").slice(0, 14);
1429
+ return `${timestamp}-${crypto.randomUUID().slice(0, 8)}`;
1430
+ }
1431
+ /**
1432
+ * @param {string} workflowInput
1433
+ */
1434
+ function resolveWorkflowPathForEval(workflowInput) {
1435
+ const asPath = resolve(process.cwd(), workflowInput);
1436
+ if (existsSync(asPath)) {
1437
+ return workflowInput;
1438
+ }
1439
+ return resolveWorkflow(workflowInput, process.cwd()).entryFile;
1440
+ }
1441
+ /**
1442
+ * @template T
1443
+ * @template R
1444
+ * @param {T[]} items
1445
+ * @param {number} limit
1446
+ * @param {(item: T, index: number) => Promise<R>} worker
1447
+ * @returns {Promise<R[]>}
1448
+ */
1449
+ async function runWithLimit(items, limit, worker) {
1450
+ const results = new Array(items.length);
1451
+ let cursor = 0;
1452
+ const workerCount = Math.min(limit, items.length);
1453
+ await Promise.all(Array.from({ length: workerCount }, async () => {
1454
+ while (cursor < items.length) {
1455
+ const index = cursor;
1456
+ cursor += 1;
1457
+ results[index] = await worker(items[index], index);
1458
+ }
1459
+ }));
1460
+ return results;
1461
+ }
1458
1462
  /**
1459
1463
  * @param {string} intervalRaw
1460
1464
  * @param {string} staleThresholdRaw
@@ -1515,8 +1519,42 @@ function normalizeEventsQuery(options) {
1515
1519
  async function executeUpCommand(c, workflowPath, options, fail) {
1516
1520
  try {
1517
1521
  const resolvedWorkflowPath = resolve(process.cwd(), workflowPath);
1518
- const input = parseJsonInput(options.input, "input", fail) ?? {};
1519
- const annotations = parseAnnotations(options.annotations, fail);
1522
+ let input;
1523
+ let annotations;
1524
+ try {
1525
+ input = parseJsonArgument(options.input, "input") ?? {};
1526
+ const parsedAnnotations = parseJsonArgument(options.annotations, "annotations");
1527
+ if (parsedAnnotations === undefined) {
1528
+ annotations = undefined;
1529
+ }
1530
+ else if (!parsedAnnotations || typeof parsedAnnotations !== "object" || Array.isArray(parsedAnnotations)) {
1531
+ return fail({
1532
+ code: "INVALID_ANNOTATIONS",
1533
+ message: "Run annotations must be a flat JSON object of string/number/boolean values",
1534
+ exitCode: 4,
1535
+ });
1536
+ }
1537
+ else {
1538
+ annotations = {};
1539
+ for (const [key, value] of Object.entries(parsedAnnotations)) {
1540
+ if (!["string", "number", "boolean"].includes(typeof value)) {
1541
+ return fail({
1542
+ code: "INVALID_ANNOTATIONS",
1543
+ message: `Run annotation ${key} must be a string, number, or boolean`,
1544
+ exitCode: 4,
1545
+ });
1546
+ }
1547
+ annotations[key] = /** @type {string | number | boolean} */ (value);
1548
+ }
1549
+ }
1550
+ }
1551
+ catch (err) {
1552
+ return fail({
1553
+ code: err instanceof SmithersError ? err.code : "INVALID_JSON",
1554
+ message: err?.message ?? String(err),
1555
+ exitCode: 4,
1556
+ });
1557
+ }
1520
1558
  const { resume, resumeRunId } = normalizeResumeOption(options.resume);
1521
1559
  const runId = options.runId ?? resumeRunId;
1522
1560
  // Detached mode: spawn ourselves as a background process
@@ -1526,9 +1564,9 @@ async function executeUpCommand(c, workflowPath, options, fail) {
1526
1564
  if (runId)
1527
1565
  childArgs.push("--run-id", runId);
1528
1566
  if (options.input)
1529
- childArgs.push("--input", options.input);
1567
+ childArgs.push("--input", options.input === "-" ? JSON.stringify(input) : options.input);
1530
1568
  if (options.annotations)
1531
- childArgs.push("--annotations", options.annotations);
1569
+ childArgs.push("--annotations", options.annotations === "-" ? JSON.stringify(annotations ?? {}) : options.annotations);
1532
1570
  if (options.maxConcurrency)
1533
1571
  childArgs.push("--max-concurrency", String(options.maxConcurrency));
1534
1572
  if (options.root)
@@ -1611,6 +1649,13 @@ async function executeUpCommand(c, workflowPath, options, fail) {
1611
1649
  exitCode: 4,
1612
1650
  });
1613
1651
  }
1652
+ if (Boolean(options.resumeClaimOwner) !== Boolean(options.resumeClaimHeartbeat)) {
1653
+ return fail({
1654
+ code: "INVALID_RESUME_CLAIM",
1655
+ message: "--resume-claim-owner and --resume-claim-heartbeat must be provided together.",
1656
+ exitCode: 4,
1657
+ });
1658
+ }
1614
1659
  const workflow = await loadWorkflow(workflowPath);
1615
1660
  ensureSmithersTables(workflow.db);
1616
1661
  if (options.hot) {
@@ -1644,13 +1689,6 @@ async function executeUpCommand(c, workflowPath, options, fail) {
1644
1689
  const logDir = options.log ? options.logDir : null;
1645
1690
  const onProgress = buildProgressReporter();
1646
1691
  const abort = setupAbortSignal();
1647
- if (Boolean(options.resumeClaimOwner) !== Boolean(options.resumeClaimHeartbeat)) {
1648
- return fail({
1649
- code: "INVALID_RESUME_CLAIM",
1650
- message: "--resume-claim-owner and --resume-claim-heartbeat must be provided together.",
1651
- exitCode: 4,
1652
- });
1653
- }
1654
1692
  const resumeClaim = options.resumeClaimOwner && options.resumeClaimHeartbeat
1655
1693
  ? {
1656
1694
  claimOwnerId: options.resumeClaimOwner,
@@ -1870,6 +1908,49 @@ const workflowCli = Cli.create({
1870
1908
  });
1871
1909
  }
1872
1910
  },
1911
+ })
1912
+ .command("inspect", {
1913
+ description: "Show workflow metadata and an agent-facing skill preview.",
1914
+ args: workflowPathArgs,
1915
+ run(c) {
1916
+ const workflow = resolveWorkflow(c.args.name, process.cwd());
1917
+ return c.ok({
1918
+ workflow,
1919
+ skillPreview: renderWorkflowSkill(workflow, { root: process.cwd() }),
1920
+ });
1921
+ },
1922
+ })
1923
+ .command("skills", {
1924
+ description: "Generate agent-facing skill docs for local workflows.",
1925
+ args: workflowSkillArgs,
1926
+ options: workflowSkillOptions,
1927
+ run(c) {
1928
+ const fail = (opts) => {
1929
+ commandExitOverride = opts.exitCode ?? 1;
1930
+ return c.error(opts);
1931
+ };
1932
+ try {
1933
+ return c.ok(writeWorkflowSkillFiles(process.cwd(), {
1934
+ workflowId: c.args.name ?? "all",
1935
+ output: c.options.output,
1936
+ force: c.options.force,
1937
+ }));
1938
+ }
1939
+ catch (err) {
1940
+ if (err instanceof SmithersError) {
1941
+ return fail({
1942
+ code: err.code,
1943
+ message: err.message,
1944
+ exitCode: 4,
1945
+ });
1946
+ }
1947
+ return fail({
1948
+ code: "WORKFLOW_SKILLS_FAILED",
1949
+ message: err?.message ?? String(err),
1950
+ exitCode: 1,
1951
+ });
1952
+ }
1953
+ },
1873
1954
  })
1874
1955
  .command("doctor", {
1875
1956
  description: "Inspect workflow discovery, preload files, and detected agents.",
@@ -2039,7 +2120,7 @@ const agentsCli = Cli.create({
2039
2120
  description: "Register a Smithers agent account (interactive wizard, or non-interactive via flags).",
2040
2121
  options: z.object({
2041
2122
  provider: z.enum([
2042
- "claude-code", "codex", "gemini", "kimi",
2123
+ "claude-code", "antigravity", "codex", "gemini", "kimi",
2043
2124
  "anthropic-api", "openai-api", "gemini-api",
2044
2125
  ]).optional().describe("Provider id; omit to launch the interactive wizard"),
2045
2126
  label: z.string().optional().describe("Unique label, e.g. 'claude-work'"),
@@ -2301,10 +2382,10 @@ let lastDevtoolsCommandOutcome;
2301
2382
  * friendly typed error the helper already wrote to stderr (finding #2).
2302
2383
  *
2303
2384
  * @param {"tree"|"diff"|"output"|"rewind"} cmd
2304
- * @param {{ args: any; options: any; ok: (d?: unknown) => unknown }} c
2385
+ * @param {{ args: any; options: any }} c
2305
2386
  * @param {() => Promise<number>} handler
2306
2387
  */
2307
- async function runDevtoolsCommandWithTelemetry(cmd, c, handler) {
2388
+ async function* runDevtoolsCommandWithTelemetry(cmd, c, handler) {
2308
2389
  const startedAt = Date.now();
2309
2390
  let exitCode = 0;
2310
2391
  try {
@@ -2362,9 +2443,8 @@ async function runDevtoolsCommandWithTelemetry(cmd, c, handler) {
2362
2443
  // best-effort metrics.
2363
2444
  }
2364
2445
  }
2365
- // Return c.ok(undefined) so incur does not emit an additional
2366
- // envelope on stdout (finding #2).
2367
- return c.ok(undefined);
2446
+ // This is an empty stream so Incur does not emit an additional envelope
2447
+ // or framework CTA on stdout after the helper has already written output.
2368
2448
  }
2369
2449
 
2370
2450
  /**
@@ -2623,6 +2703,141 @@ const cli = Cli.create({
2623
2703
  };
2624
2704
  return executeUpCommand(c, c.args.workflow, c.options, fail);
2625
2705
  },
2706
+ })
2707
+ // =========================================================================
2708
+ // smithers eval <workflow>
2709
+ // =========================================================================
2710
+ .command("eval", {
2711
+ description: "Run a workflow over a JSON/JSONL eval suite and write a regression report.",
2712
+ args: workflowArgs,
2713
+ options: evalOptions,
2714
+ alias: { cases: "c", suite: "s", dryRun: "n", concurrency: "j", report: "r" },
2715
+ async run(c) {
2716
+ const fail = (opts) => {
2717
+ commandExitOverride = opts.exitCode ?? 1;
2718
+ return c.error(opts);
2719
+ };
2720
+ try {
2721
+ const workflowPath = resolveWorkflowPathForEval(c.args.workflow);
2722
+ const loadedCases = loadEvalCases(process.cwd(), c.options.cases, {
2723
+ maxCases: c.options.maxCases,
2724
+ });
2725
+ const plan = buildEvalPlan({
2726
+ suiteId: c.options.suite,
2727
+ runLabel: c.options.runLabel ?? defaultEvalRunLabel(),
2728
+ workflowPath,
2729
+ casesPath: c.options.cases,
2730
+ loadedCases,
2731
+ });
2732
+ const wantsStructured = c.format === "json" || c.format === "jsonl" || formatRequestedJsonOutput();
2733
+ if (c.options.dryRun) {
2734
+ if (wantsStructured) {
2735
+ return c.ok({ suite: plan });
2736
+ }
2737
+ process.stdout.write(`${renderEvalPlan(plan)}\n`);
2738
+ return c.ok(undefined);
2739
+ }
2740
+ assertEvalReportWritable(process.cwd(), plan.suiteId, {
2741
+ path: c.options.report,
2742
+ force: c.options.force,
2743
+ });
2744
+ const workflow = await loadWorkflow(workflowPath);
2745
+ ensureSmithersTables(workflow.db);
2746
+ await assertEvalRunIdsAvailable(new SmithersDb(workflow.db), plan.cases);
2747
+ setupSqliteCleanup(workflow);
2748
+ const schema = resolveSchema(workflow.db);
2749
+ const resolvedWorkflowPath = resolve(process.cwd(), workflowPath);
2750
+ const rootDir = c.options.root ? resolve(process.cwd(), c.options.root) : dirname(resolvedWorkflowPath);
2751
+ const logDir = c.options.log ? c.options.logDir : null;
2752
+ const abort = setupAbortSignal();
2753
+ const startedAtMs = Date.now();
2754
+ const results = await runWithLimit(plan.cases, c.options.concurrency, async (testCase) => {
2755
+ const caseStartedAtMs = Date.now();
2756
+ process.stderr.write(`[eval:${plan.suiteId}] ${testCase.id} -> ${testCase.runId}\n`);
2757
+ try {
2758
+ const result = await Effect.runPromise(runWorkflow(workflow, {
2759
+ input: testCase.input,
2760
+ runId: testCase.runId,
2761
+ workflowPath: resolvedWorkflowPath,
2762
+ maxConcurrency: c.options.maxConcurrency,
2763
+ rootDir,
2764
+ logDir,
2765
+ allowNetwork: c.options.allowNetwork,
2766
+ maxOutputBytes: c.options.maxOutputBytes,
2767
+ toolTimeoutMs: c.options.toolTimeoutMs,
2768
+ annotations: {
2769
+ suiteId: plan.suiteId,
2770
+ caseId: testCase.id,
2771
+ ...testCase.annotations,
2772
+ },
2773
+ signal: abort.signal,
2774
+ }));
2775
+ const output = await loadOutputs(workflow.db, schema, testCase.runId);
2776
+ const durationMs = Date.now() - caseStartedAtMs;
2777
+ const evaluation = evaluateEvalCaseResult(testCase, {
2778
+ ...result,
2779
+ output,
2780
+ });
2781
+ return {
2782
+ caseId: testCase.id,
2783
+ runId: testCase.runId,
2784
+ expectedStatus: testCase.expected.status,
2785
+ status: result.status,
2786
+ passed: evaluation.passed,
2787
+ assertions: evaluation.assertions,
2788
+ durationMs,
2789
+ input: testCase.input,
2790
+ ...(c.options.includeOutput ? { output } : {}),
2791
+ metadata: testCase.metadata,
2792
+ };
2793
+ }
2794
+ catch (err) {
2795
+ const errorMessage = err?.message ?? String(err);
2796
+ const durationMs = Date.now() - caseStartedAtMs;
2797
+ const evaluation = evaluateEvalCaseResult(testCase, {
2798
+ status: "error",
2799
+ error: err,
2800
+ });
2801
+ return {
2802
+ caseId: testCase.id,
2803
+ runId: testCase.runId,
2804
+ expectedStatus: testCase.expected.status,
2805
+ status: "error",
2806
+ passed: evaluation.passed,
2807
+ assertions: evaluation.assertions,
2808
+ durationMs,
2809
+ input: testCase.input,
2810
+ error: errorMessage,
2811
+ metadata: testCase.metadata,
2812
+ };
2813
+ }
2814
+ });
2815
+ const finishedAtMs = Date.now();
2816
+ let report = buildEvalReport({
2817
+ plan,
2818
+ results,
2819
+ startedAtMs,
2820
+ finishedAtMs,
2821
+ });
2822
+ const reportPath = writeEvalReport(process.cwd(), report, {
2823
+ path: c.options.report,
2824
+ force: c.options.force,
2825
+ });
2826
+ report = { ...report, reportPath };
2827
+ process.exitCode = report.summary.failed > 0 ? 1 : 0;
2828
+ if (wantsStructured) {
2829
+ return c.ok({ eval: report });
2830
+ }
2831
+ process.stdout.write(`${renderEvalReport(report)}\n`);
2832
+ return c.ok(undefined);
2833
+ }
2834
+ catch (err) {
2835
+ if (err instanceof SmithersError) {
2836
+ return fail({ code: err.code, message: err.message, exitCode: 4 });
2837
+ }
2838
+ return fail({ code: "EVAL_FAILED", message: err?.message ?? String(err), exitCode: 1 });
2839
+ }
2840
+ },
2626
2841
  })
2627
2842
  // =========================================================================
2628
2843
  // smithers supervise
@@ -3429,8 +3644,8 @@ const cli = Cli.create({
3429
3644
  // =========================================================================
3430
3645
  // smithers inspect <run_id>
3431
3646
  // =========================================================================
3432
- .command("inspect", {
3433
- description: "Output detailed state of a run: steps, agents, approvals, and outputs.",
3647
+ .command("inspect", {
3648
+ description: "Output detailed run state, including steps, agents, approvals, and outputs.",
3434
3649
  args: inspectArgs,
3435
3650
  options: inspectOptions,
3436
3651
  alias: { watch: "w", interval: "i" },
@@ -4356,8 +4571,8 @@ const cli = Cli.create({
4356
4571
  // =========================================================================
4357
4572
  // smithers timetravel <workflow>
4358
4573
  // =========================================================================
4359
- .command("timetravel", {
4360
- description: "Time-travel to a previous task state: revert filesystem, reset DB, and optionally resume.",
4574
+ .command("timetravel", {
4575
+ description: "Time-travel to a previous task state by reverting filesystem state, resetting DB state, and optionally resuming.",
4361
4576
  args: workflowArgs,
4362
4577
  options: z.object({
4363
4578
  runId: z.string().describe("Run ID"),
@@ -4488,7 +4703,7 @@ const cli = Cli.create({
4488
4703
  question: z.string().optional().describe("The question to ask"),
4489
4704
  }),
4490
4705
  options: z.object({
4491
- agent: z.enum(["claude", "codex", "gemini", "kimi", "pi"]).optional().describe("Explicitly select which agent CLI to use"),
4706
+ agent: z.enum(["claude", "codex", "antigravity", "gemini", "kimi", "pi"]).optional().describe("Explicitly select which agent CLI to use"),
4492
4707
  listAgents: z.boolean().default(false).describe("List detected agents plus their bootstrap mode and exit"),
4493
4708
  dumpPrompt: z.boolean().default(false).describe("Print the generated system prompt and exit"),
4494
4709
  toolSurface: z.enum(["semantic", "raw"]).default("semantic").describe("Choose which Smithers MCP tool surface to expose"),
@@ -4641,7 +4856,7 @@ const cli = Cli.create({
4641
4856
  // rewrites raw `--json` → `-j` for these commands so it lands as a
4642
4857
  // command option, not a format directive.
4643
4858
  alias: { json: "j" },
4644
- async run(c) {
4859
+ run(c) {
4645
4860
  return runDevtoolsCommandWithTelemetry("tree", c, async () => {
4646
4861
  const { runTreeOnce, runTreeWatch } = await import("./tree.js");
4647
4862
  const { adapter, cleanup } = await findAndOpenDb();
@@ -4707,7 +4922,7 @@ const cli = Cli.create({
4707
4922
  color: z.enum(["auto", "always", "never"]).default("auto").describe("Colorize output"),
4708
4923
  }),
4709
4924
  alias: { json: "j" },
4710
- async run(c) {
4925
+ run(c) {
4711
4926
  return runDevtoolsCommandWithTelemetry("diff", c, async () => {
4712
4927
  const { runDiffOnce } = await import("./diff.js");
4713
4928
  const { adapter, cleanup } = await findAndOpenDb();
@@ -4746,7 +4961,7 @@ const cli = Cli.create({
4746
4961
  pretty: z.boolean().default(false).describe("Schema-ordered render"),
4747
4962
  }),
4748
4963
  alias: { json: "j" },
4749
- async run(c) {
4964
+ run(c) {
4750
4965
  return runDevtoolsCommandWithTelemetry("output", c, async () => {
4751
4966
  const { runOutputOnce } = await import("./output.js");
4752
4967
  const { adapter, cleanup } = await findAndOpenDb();
@@ -4782,7 +4997,7 @@ const cli = Cli.create({
4782
4997
  json: z.boolean().default(false).describe("Emit JumpResult JSON"),
4783
4998
  }),
4784
4999
  alias: { json: "j" },
4785
- async run(c) {
5000
+ run(c) {
4786
5001
  return runDevtoolsCommandWithTelemetry("rewind", c, async () => {
4787
5002
  const { runRewindOnce } = await import("./rewind.js");
4788
5003
  const { adapter, cleanup } = await findAndOpenDb();
@@ -4990,10 +5205,10 @@ wrapCliCommandHandlersWithInputBounds(cliCommands);
4990
5205
  // Main
4991
5206
  // ---------------------------------------------------------------------------
4992
5207
  const KNOWN_COMMANDS = new Set([
4993
- "init", "up", "supervise", "down", "ps", "logs", "events", "chat", "inspect", "node", "why", "approve", "deny",
4994
- "cancel", "graph", "revert", "scores", "observability", "workflow", "ask", "cron", "chat-create",
4995
- "replay", "diff", "fork", "timeline", "memory", "openapi", "token", "agents", "alerts",
4996
- "tree", "output", "rewind", "gui",
5208
+ ...cliCommands.keys(),
5209
+ "completions",
5210
+ "mcp",
5211
+ "skills",
4997
5212
  ]);
4998
5213
  /**
4999
5214
  * Rewrite `smithers .` or `smithers <path>` (when path looks like a directory) to `smithers gui <path>`.
@@ -5032,54 +5247,15 @@ function resolveCliColor(mode, stream) {
5032
5247
  if (process.env.NO_COLOR !== undefined && process.env.NO_COLOR.length > 0) return false;
5033
5248
  return Boolean(stream.isTTY);
5034
5249
  }
5035
- const BUILTIN_FLAGS_WITH_VALUES = new Set([
5036
- "--format",
5037
- "--filter-output",
5038
- "--token-limit",
5039
- "--token-offset",
5040
- ]);
5041
5250
  const WORKFLOW_UTILITY_COMMANDS = new Set([
5042
5251
  "run",
5043
5252
  "list",
5044
5253
  "path",
5045
5254
  "create",
5255
+ "inspect",
5256
+ "skills",
5046
5257
  "doctor",
5047
5258
  ]);
5048
- /**
5049
- * @param {string | undefined} value
5050
- * @returns {McpSurface}
5051
- */
5052
- function normalizeMcpSurface(value) {
5053
- const surface = value?.trim().toLowerCase();
5054
- if (surface === undefined || surface.length === 0) {
5055
- throw new Error("Missing value for --surface. Expected semantic, raw, or both.");
5056
- }
5057
- if (surface === "semantic" || surface === "raw" || surface === "both") {
5058
- return surface;
5059
- }
5060
- throw new Error(`Invalid --surface value: ${value}. Expected semantic, raw, or both.`);
5061
- }
5062
- /**
5063
- * @param {string[]} argv
5064
- */
5065
- function parseMcpSurfaceArgv(argv) {
5066
- let surface = "semantic";
5067
- const filtered = [];
5068
- for (let index = 0; index < argv.length; index++) {
5069
- const arg = argv[index];
5070
- if (arg === "--surface") {
5071
- surface = normalizeMcpSurface(argv[index + 1]);
5072
- index += 1;
5073
- continue;
5074
- }
5075
- if (arg.startsWith("--surface=")) {
5076
- surface = normalizeMcpSurface(arg.slice("--surface=".length));
5077
- continue;
5078
- }
5079
- filtered.push(arg);
5080
- }
5081
- return { surface, argv: filtered };
5082
- }
5083
5259
  /**
5084
5260
  * @param {ReturnType<typeof createSemanticMcpServer>} server
5085
5261
  */
@@ -5104,22 +5280,6 @@ function registerRawToolsOnMcpServer(server) {
5104
5280
  });
5105
5281
  }
5106
5282
  }
5107
- /**
5108
- * @param {string[]} argv
5109
- * @returns {number}
5110
- */
5111
- function findFirstPositionalIndex(argv, startIndex = 0) {
5112
- for (let index = startIndex; index < argv.length; index++) {
5113
- const arg = argv[index];
5114
- if (!arg.startsWith("-")) {
5115
- return index;
5116
- }
5117
- if (BUILTIN_FLAGS_WITH_VALUES.has(arg)) {
5118
- index++;
5119
- }
5120
- }
5121
- return -1;
5122
- }
5123
5283
  /**
5124
5284
  * @param {string[]} argv
5125
5285
  */
@@ -5193,6 +5353,54 @@ function argvRequestsJsonMode(argv) {
5193
5353
  }
5194
5354
  return false;
5195
5355
  }
5356
+ /**
5357
+ * Some commands own stdout completely and promise a raw JSON document even
5358
+ * without `--format json`. Run those before Incur can append framework CTAs
5359
+ * such as the stale-skills reminder, which would make stdout unparsable.
5360
+ *
5361
+ * @param {string[]} argv
5362
+ * @returns {boolean}
5363
+ */
5364
+ function runRawJsonAgentCommandIfMatched(argv) {
5365
+ const positionals = [];
5366
+ let jsonOutput = false;
5367
+ for (let index = 0; index < argv.length; index++) {
5368
+ const arg = argv[index];
5369
+ if (arg === "--json") {
5370
+ jsonOutput = true;
5371
+ continue;
5372
+ }
5373
+ if (arg === "--format") {
5374
+ if (argv[index + 1] !== "json") {
5375
+ return false;
5376
+ }
5377
+ jsonOutput = true;
5378
+ index += 1;
5379
+ continue;
5380
+ }
5381
+ if (arg === "--format=json") {
5382
+ jsonOutput = true;
5383
+ continue;
5384
+ }
5385
+ if (arg.startsWith("-")) {
5386
+ return false;
5387
+ }
5388
+ positionals.push(arg);
5389
+ }
5390
+ if (positionals.length !== 2 || positionals[0] !== "agents") {
5391
+ return false;
5392
+ }
5393
+ if (positionals[1] === "capabilities") {
5394
+ process.stdout.write(`${JSON.stringify(getCliAgentCapabilityReport(), null, 2)}\n`);
5395
+ process.exit(0);
5396
+ }
5397
+ if (positionals[1] === "doctor" && jsonOutput) {
5398
+ const report = getCliAgentCapabilityDoctorReport();
5399
+ process.stdout.write(`${JSON.stringify(report, null, 2)}\n`);
5400
+ process.exit(report.ok ? 0 : 1);
5401
+ }
5402
+ return false;
5403
+ }
5196
5404
  /**
5197
5405
  * @param {string[]} argv
5198
5406
  */
@@ -5240,17 +5448,6 @@ function rewriteEventsJsonFlagArgv(argv) {
5240
5448
  }
5241
5449
  return argv.map((arg) => (arg === "--json" ? "-j" : arg));
5242
5450
  }
5243
- /**
5244
- * Incur treats union-typed options as value-bearing flags, so a bare
5245
- * `--resume --run-id value` would consume `--run-id` as the resume value.
5246
- *
5247
- * @param {string[]} argv
5248
- */
5249
- function rewriteBareResumeFlagArgv(argv) {
5250
- return argv.map((arg, index) => arg === "--resume" && (argv[index + 1] === undefined || argv[index + 1]?.startsWith("-"))
5251
- ? "--resume=true"
5252
- : arg);
5253
- }
5254
5451
  /**
5255
5452
  * @param {unknown} value
5256
5453
  */
@@ -5276,10 +5473,10 @@ function normalizeResumeOption(value) {
5276
5473
  const CHAT_CREATE_PROMPT = [
5277
5474
  "Start an interactive chat session with the user and help them directly.",
5278
5475
  "Stay in this conversation until the user is done.",
5279
- 'When you are completely finished and want to hand control back to Smithers, end your final response with an empty JSON object in a ```json fence: {}.',
5476
+ 'When you are completely finished and want to hand control back to Smithers, return ONLY this raw JSON object with no prose, markdown, or code fence: {}.',
5280
5477
  ].join("\n\n");
5281
5478
  /**
5282
- * @param {"claude-code" | "codex" | "gemini"} agentId
5479
+ * @param {"claude-code" | "codex" | "antigravity" | "gemini"} agentId
5283
5480
  * @param {string} cwd
5284
5481
  */
5285
5482
  async function createChatAgent(agentId, cwd) {
@@ -5288,7 +5485,7 @@ async function createChatAgent(agentId, cwd) {
5288
5485
  const { ClaudeCodeAgent } = await import("@smithers-orchestrator/agents/ClaudeCodeAgent");
5289
5486
  return new ClaudeCodeAgent({
5290
5487
  cwd,
5291
- model: "claude-opus-4-6",
5488
+ model: "claude-opus-4-7",
5292
5489
  });
5293
5490
  }
5294
5491
  case "codex": {
@@ -5299,6 +5496,12 @@ async function createChatAgent(agentId, cwd) {
5299
5496
  skipGitRepoCheck: true,
5300
5497
  });
5301
5498
  }
5499
+ case "antigravity": {
5500
+ const { AntigravityAgent } = await import("@smithers-orchestrator/agents/AntigravityAgent");
5501
+ return new AntigravityAgent({
5502
+ cwd,
5503
+ });
5504
+ }
5302
5505
  case "gemini": {
5303
5506
  const { GeminiAgent } = await import("@smithers-orchestrator/agents/GeminiAgent");
5304
5507
  return new GeminiAgent({
@@ -5309,7 +5512,7 @@ async function createChatAgent(agentId, cwd) {
5309
5512
  }
5310
5513
  }
5311
5514
  /**
5312
- * @param {"claude-code" | "codex" | "gemini"} agentId
5515
+ * @param {"claude-code" | "codex" | "antigravity" | "gemini"} agentId
5313
5516
  * @param {string} cwd
5314
5517
  * @returns {Promise<import("@smithers-orchestrator/components/SmithersWorkflow").SmithersWorkflow<any>>}
5315
5518
  */
@@ -5395,6 +5598,9 @@ async function main() {
5395
5598
  if (argvRequestsJsonMode(argv)) {
5396
5599
  setJsonMode(true);
5397
5600
  }
5601
+ if (runRawJsonAgentCommandIfMatched(argv)) {
5602
+ return;
5603
+ }
5398
5604
  // Finding #1: pre-validate argv for devtools commands so missing-args
5399
5605
  // / invalid-flag errors go to stderr with exit 1 (not incur's
5400
5606
  // remap-to-4 VALIDATION_ERROR envelope on stdout).
@@ -5411,6 +5617,12 @@ async function main() {
5411
5617
  ...argv.slice(firstPositionalIndex),
5412
5618
  ];
5413
5619
  }
5620
+ const commandIndex = findFirstPositionalIndex(argv);
5621
+ const command = commandIndex >= 0 ? argv[commandIndex] : undefined;
5622
+ if (command && !KNOWN_COMMANDS.has(command)) {
5623
+ console.error(`Unknown command: ${command}`);
5624
+ process.exit(4);
5625
+ }
5414
5626
  argv = rewriteBareResumeFlagArgv(argv);
5415
5627
  // --mcp mode: the MCP server needs to stay alive listening on stdin.
5416
5628
  if (argv.includes("--mcp")) {