cowork-harness 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. package/.env.example +16 -0
  2. package/CHANGELOG.md +190 -0
  3. package/LICENSE +21 -0
  4. package/README.md +470 -0
  5. package/baselines/desktop-1.11847.5.json +78 -0
  6. package/baselines/desktop-1.12603.1.json +140 -0
  7. package/baselines/prompts/desktop-1.12603.1/host-loop-append.md +8 -0
  8. package/baselines/prompts/desktop-1.12603.1/subagent-append-vm.md +3 -0
  9. package/baselines/prompts/desktop-1.12603.1/system-prompt-append.md +18 -0
  10. package/dist/agent/session.js +465 -0
  11. package/dist/assert.js +159 -0
  12. package/dist/baseline.js +87 -0
  13. package/dist/boundary.js +114 -0
  14. package/dist/canary/grants.js +37 -0
  15. package/dist/cli.js +1107 -0
  16. package/dist/decide/decider.js +521 -0
  17. package/dist/decide/external-channel.js +262 -0
  18. package/dist/decide/llm-transport.js +52 -0
  19. package/dist/dotenv.js +52 -0
  20. package/dist/egress/proxy.js +138 -0
  21. package/dist/egress/sidecar.js +125 -0
  22. package/dist/hostloop/provenance.js +110 -0
  23. package/dist/hostloop/workspace-handler.js +226 -0
  24. package/dist/loop-decision.js +62 -0
  25. package/dist/prompt.js +43 -0
  26. package/dist/run/cassette.js +420 -0
  27. package/dist/run/chat.js +194 -0
  28. package/dist/run/envelope.js +31 -0
  29. package/dist/run/execute.js +533 -0
  30. package/dist/run/renderer.js +179 -0
  31. package/dist/run/run.js +347 -0
  32. package/dist/run/trace-view.js +227 -0
  33. package/dist/runtime/argv.js +126 -0
  34. package/dist/runtime/container.js +76 -0
  35. package/dist/runtime/host-env.js +28 -0
  36. package/dist/runtime/hostloop.js +129 -0
  37. package/dist/runtime/lima.js +177 -0
  38. package/dist/runtime/microvm.js +151 -0
  39. package/dist/runtime/protocol.js +79 -0
  40. package/dist/runtime/stage.js +52 -0
  41. package/dist/secrets.js +42 -0
  42. package/dist/session.js +315 -0
  43. package/dist/sync/cowork-sync.js +215 -0
  44. package/dist/types.js +127 -0
  45. package/docker/Dockerfile.agent +31 -0
  46. package/docker/Dockerfile.proxy +12 -0
  47. package/docker/compose.yml +31 -0
  48. package/fixtures/subagent-grants.json +5 -0
  49. package/package.json +70 -0
package/dist/cli.js ADDED
@@ -0,0 +1,1107 @@
1
+ #!/usr/bin/env node
2
+ import { readFileSync, writeFileSync, mkdirSync, readdirSync, statSync, writeSync, existsSync } from "node:fs";
3
+ import { join, basename, resolve } from "node:path";
4
+ import { fileURLToPath } from "node:url";
5
+ import { parse as parseYaml } from "yaml";
6
+ import { Scenario, AnswerRule } from "./types.js";
7
+ import { loadBaseline, BASELINES_DIR } from "./baseline.js";
8
+ import { loadSession, resolveSessionPaths } from "./session.js";
9
+ import { executeScenario, parseScenarioFile, UnansweredError, BoundaryError } from "./run/execute.js";
10
+ import { ScriptedDecider, ExternalDecider, LlmDecider, ABSTAIN } from "./decide/decider.js";
11
+ import { claudeCliComplete } from "./decide/llm-transport.js";
12
+ import { vmInit, vmDelete, vmStatus, vmPrune, instanceName } from "./runtime/lima.js";
13
+ import { sync } from "./sync/cowork-sync.js";
14
+ import { runBoundaryChecks, formatBoundary } from "./boundary.js";
15
+ import { cmdChat } from "./run/chat.js";
16
+ import { cmdRecord, cmdReplay } from "./run/cassette.js";
17
+ import { loadDotenv } from "./dotenv.js";
18
+ import { makeRenderer, renderStart, renderFooter, startHeartbeat } from "./run/renderer.js";
19
+ import { resolveEventsFile, buildTrace, formatTrace, buildGateTrace, formatGateTrace } from "./run/trace-view.js";
20
+ import { pkgVersion, jsonEnvelope, jsonError } from "./run/envelope.js";
21
+ import { spawnChannel, fileChannel, streamGates, answerGate, readGate } from "./decide/external-channel.js";
22
// Both writers go straight to the file descriptor with a synchronous write instead of using
// `process.stdout.write`. Per the original note, the stream write followed by `process.exit()` can
// truncate output when we are piped, which would drop the json envelope an agent/CI is reading.
const writeLineTo = (fd) => (s) => writeSync(fd, s + "\n");
const out = writeLineTo(1); // machine-readable channel (stdout)
const log = writeLineTo(2); // human-readable channel (stderr)
26
+ const HELP = `cowork-harness <command> (v${"$VERSION"})
27
+
28
+ skill <folder> "<prompt>" test a LOCAL skill folder directly (copied fresh each run)
29
+ [--prompt-file <path>] read the prompt verbatim from a file (bypasses the shell — no $-expansion)
30
+ [--fidelity protocol|container|microvm|hostloop|cowork] (default container)
31
+ [--plugin <dir>]… [--marketplace <dir> --enable name@mkt] extra plugin/marketplace sources
32
+ [--answer "<question-regex>=<choice>"] scripted AskUserQuestion answer (repeatable)
33
+ [--on-unanswered fail|prompt|first] policy for unscripted questions (default: adaptive — prompt on a TTY, fail when piped/CI)
34
+ [--decider-llm [--intent "…"]] answer LIVE questions with a model (state test intent in one line)
35
+ [--decider-cmd '<helper>'] …or via a spawned helper (custom logic)
36
+ [--decider-dir <dir>] …or in-band from the driving agent (arm a Monitor; see 'skill --help')
37
+ [--upload <file>]… [--folder <dir>]… attach files / connect folders (mnt/uploads, mnt/.projects)
38
+ [--session-id <id> [--resume]] pin + resume a session (for gated, checkpoint-and-resume skills)
39
+ [--output-format text|json] [--model <id>] [--keep] [--dry-run]
40
+ (run 'skill --help' for the full flag reference)
41
+
42
+ run <scenario.yaml | dir/> run one scenario or every *.yaml in a dir
43
+ [--on-unanswered fail|first] (run rejects 'prompt' — would break determinism)
44
+ [--output-format text|json] [--quiet|-q] [--verbose|-V]
45
+ (run 'run --help' for the full flag reference)
46
+ chat <folder> interactive multi-turn REPL against a skill (TTY); --raw for native
47
+ record <scenario.yaml> run + save a control-protocol cassette
48
+ [--out <file>] cassette path (default: cassettes/<scenario-name>.cassette.json)
49
+ replay --cassette <file> deterministic protocol-replay of a cassette (no token) [--output-format json]
50
+ trace <run-id | dir | path> digest a run's events.jsonl (tools+result status, dispatches, decisions)
51
+ [--tools] tool/dispatch rows only [--gates] gate lifecycle (question→answer→delivered)
52
+ [--output-format json] structured rows
53
+ decide VALIDATE a decider against a sample question in ~2s (no run)
54
+ (--decider-cmd '<helper>' | --decider-llm [--intent …] | --answer "rx=c" | --answer-policy <yaml>)
55
+ [--question "<text>"] [--option <label>]… override the sample question
56
+ gates <dir> [--follow] in-band gate stream (for --decider-dir): one JSON line per pending gate
57
+ + a terminal {"done":true}. Point a single Monitor at this.
58
+ answer <dir> --gate <N> answer an in-band gate (atomic write): --choose <label> | --answer "q=c"
59
+ sync [--diff] derive/refresh a platform baseline from the live Desktop install
60
+ list list available platform baselines
61
+ boundary-check [baseline] prove the sandbox enforces Cowork's limitations
62
+ vm <init|status|delete|prune> manage the L2 Apple-VZ microVM (fidelity: microvm); prune drops orphaned VMs
63
+
64
+ Global: --dotenv <path> load a .env before the command (host-side creds; never mounted).
65
+ Auth resolves from process.env > --dotenv > ./.env > <install>/.env.
66
+ --version, -v print version --help, -h print this help`;
67
+ const SKILL_HELP = `cowork-harness skill <plugin-folder> "<prompt>"
68
+
69
+ Run a LOCAL skill/plugin folder against the staged Cowork agent. The folder is copied fresh into the
70
+ session on every run — no install, marketplace registration, or version bump.
71
+
72
+ Source (at least one):
73
+ <plugin-folder> dir containing .claude-plugin/plugin.json
74
+ --plugin <dir> extra plugin source (repeatable)
75
+ --marketplace <dir> --enable name@mkt load skills via a marketplace.json
76
+
77
+ Files (for skills that need an attached file, e.g. deck-review):
78
+ --upload <path> mount a file at mnt/uploads/<name> — the "attach a file" path (repeatable)
79
+ --folder <dir> mount a folder at mnt/.projects/<id> — a connected repo/space (repeatable)
80
+
81
+ Session persistence (for gated skills that checkpoint + resume):
82
+ --session-id <id> pin a stable session (persists the work dir + the agent's session)
83
+ --resume continue a prior --session-id session (reuses its work dir, so any
84
+ skill-written checkpoint state + outputs survive; passes the agent's
85
+ native --resume so it reloads the conversation)
86
+
87
+ Prompt (one of):
88
+ "<prompt>" inline — MIND SHELL EXPANSION: a literal $ in double quotes is
89
+ eaten by the shell. Single-quote it, or use --prompt-file.
90
+ --prompt-file <path> read the prompt verbatim from a file (raw bytes; no shell parsing)
91
+
92
+ Fidelity --fidelity <tier> (default: container)
93
+ protocol L0 — no sandbox, control protocol only
94
+ container L1 — Docker + per-run default-deny egress proxy (CI-native; fast)
95
+ microvm L2 — Apple-VZ Lima microVM + guest firewall
96
+ hostloop Cowork's production split-execution (file tools on host, shell/web via the workspace MCP)
97
+ cowork auto-pick host-loop vs container the way real Cowork does (via the synced gate)
98
+
99
+ Questions:
100
+ --answer "<q-regex>=<choice>" pre-answer a matching AskUserQuestion (repeatable)
101
+ --answer-policy <yaml> a reusable file of the same regex→choice rules (a bare list, or an
102
+ {answers: [...]} doc) — for skills with several known gates
103
+ --on-unanswered <policy> what to do with an UNscripted question (default: adaptive)
104
+ fail error + print the exact --answer to add (default when piped / CI)
105
+ prompt ask at the TTY (default when a human is attached)
106
+ first pick option 1, loudly warn — then the footer prints the --answer to lock it in
107
+ (the footer always echoes auto-answered questions as copy-pasteable --answer lines)
108
+ (to answer LIVE questions, use --decider-llm / --decider-cmd / --decider-dir below)
109
+ --decider-llm [--intent "<one line>"] answer LIVE questions with a small model (the ergonomic
110
+ default for agent-driven runs: state the test's intent once instead of
111
+ writing a helper). Picks an option by label per question; an out-of-set
112
+ answer FAILS LOUD. NON-deterministic — the footer flags the run so a
113
+ green isn't mistaken for a scripted pass; pin with --answer for CI.
114
+ (Uses the host 'claude -p' on a small model — COWORK_HARNESS_DECIDER_MODEL.)
115
+ --decider-cmd '<helper>' answer the LIVE question via a spawned helper (for custom logic). The
116
+ helper reads a {"type":"decision_request",…} line on stdin and writes
117
+ back {"answers":{"<q>":"<label or 1-based index>"}} (MUST flush per
118
+ line). Carries a reply_with template + a scrubbed transcript context.
119
+ The helper owns its own pipes → the CLI's stdout stays free, so this
120
+ composes with --output-format json.
121
+ --decider-dir <dir> answer LIVE questions IN-BAND from the DRIVING agent (run the harness
122
+ in the background; arm a Monitor on <dir>). Each gate is written to
123
+ <dir>/req-N.json; write the answer to <dir>/resp-N.json (temp+rename).
124
+ stdout stays free → composes with --output-format json. The run is marked
125
+ non-deterministic. Use a FRESH empty dir per run. (See docs/decider-dir.md.)
126
+
127
+ Output:
128
+ --output-format text|json text = live stream + footer (default); json = one stdout envelope
129
+ --quiet, -q verdict footer only --verbose, -V + thinking/tool inputs/sub-agent tree
130
+ --keep print the run dir + deliverable path (runs are always kept on disk)
131
+ --model <id> override the session model
132
+ --dry-run resolve + print the plan, don't run NO_COLOR=1 disable ANSI
133
+
134
+ Long runs: an idle "still running" heartbeat prints on stderr after ~30s of silence.
135
+ COWORK_HARNESS_NO_HEARTBEAT=1 disables it; COWORK_HARNESS_HEARTBEAT_MS tunes the interval.
136
+
137
+ Auth: CLAUDE_CODE_OAUTH_TOKEN (or ANTHROPIC_API_KEY) from process.env > --dotenv <path> > ./.env >
138
+ <install>/.env. So you can run from any directory and still pick up the install's credentials.
139
+
140
+ Exit codes: 0 pass · 1 assertion/agent failure · 2 usage / unanswered-under-fail / boundary / runtime.`;
141
+ const RUN_HELP = `cowork-harness run <scenario.yaml | dir/>
142
+
143
+ Run one authored scenario, or every *.yaml/*.yml in a directory, with assertions and a CI-ready exit
144
+ code. Verdict-first: on FAIL the failing transcript is printed inline (no spelunking runs/…).
145
+
146
+ Input policy:
147
+ --on-unanswered fail|first policy for an unscripted question (default: fail — deterministic).
148
+ 'prompt' is rejected (it would break reproducibility).
149
+ fail error + the exact --answer to add (the CI default)
150
+ first pick option 1, loudly warn; the footer echoes it as a --answer line to lock in
151
+ --decider-cmd '<helper>' answer live questions via a spawned helper (see 'skill --help')
152
+ --decider-dir <dir> answer live questions in-band from the driving agent (see 'skill --help')
153
+ (run omits --decider-llm by design — scenarios pin answers for reproducibility; a scenario may still
154
+ opt into the model with 'on_unanswered: llm' in its YAML, which flags the run non-deterministic)
155
+ (per-scenario answers/on_unanswered in the YAML take precedence where set)
156
+
157
+ Output:
158
+ --output-format text|json text = verdict + failing transcript (default); json = stdout envelope
159
+ --quiet, -q verdict only --verbose, -V live stream + per-tool markers
160
+ NO_COLOR=1 disable ANSI on stderr
161
+
162
+ Long runs: an idle "still running" heartbeat prints on stderr after ~30s of silence
163
+ (COWORK_HARNESS_NO_HEARTBEAT=1 / COWORK_HARNESS_HEARTBEAT_MS to disable/tune).
164
+
165
+ Exit codes: 0 all pass · 1 any assertion/agent failure · 2 usage / unanswered-under-fail / boundary.`;
166
/**
 * Print the top-level usage text through `log`, with the `$VERSION` placeholder in `HELP` replaced
 * by the value of `pkgVersion()`.
 *
 * The version is supplied through a replacer FUNCTION, not a replacement string: with a string,
 * `String.prototype.replace` expands `$&`, `$1`, `$$`, … inside the replacement, so a version that
 * happened to contain `$` would be mangled. A function's return value is inserted verbatim.
 */
function printHelp() {
    log(HELP.replace("$VERSION", () => pkgVersion()));
}
169
/** True when the argument list contains the help flag in either spelling (`--help` or `-h`). */
function hasHelp(args) {
    return args.some((arg) => arg === "--help" || arg === "-h");
}
172
/**
 * CLI entry point: consume the global `--dotenv <path>` flag, load the `.env` sources, then dispatch
 * on the command word. Usage errors are written with `log` and exit with code 2.
 */
async function main() {
    // Command word → handler. The same table backs the "--dotenv swallowed a command" check below.
    const handlers = new Map([
        ["skill", (rest) => cmdSkill(rest)],
        ["run", (rest) => cmdRun(rest)],
        ["chat", (rest) => cmdChat(rest)],
        ["record", (rest) => cmdRecord(rest)],
        ["replay", (rest) => cmdReplay(rest)],
        ["trace", (rest) => cmdTrace(rest)],
        ["decide", (rest) => cmdDecide(rest)],
        ["gates", (rest) => cmdGates(rest)],
        ["answer", (rest) => cmdAnswer(rest)],
        ["sync", (rest) => cmdSync(rest)],
        ["list", () => cmdList()],
        ["boundary-check", (rest) => cmdBoundary(rest)],
        ["vm", (rest) => cmdVm(rest)],
    ]);
    const argv = process.argv.slice(2);
    // `--dotenv` is GLOBAL: it is parsed and stripped before dispatch so a run from any directory
    // can point at the install's credentials. (Not `--env-file`: per the original note, Node
    // reserves that name and consumes it before this code runs.)
    const dotenvAt = argv.indexOf("--dotenv");
    const dotenvGiven = dotenvAt >= 0;
    const explicitEnvFile = dotenvGiven ? argv[dotenvAt + 1] : undefined;
    if (dotenvGiven) {
        // #4: a missing value, a command word in the value slot (`--dotenv run x.yaml`), and a
        // named-but-absent file are all hard errors; an explicitly requested credential file must
        // never be silently ignored.
        if (explicitEnvFile === undefined) {
            log("--dotenv requires a path (none provided)");
            process.exit(2);
        }
        if (handlers.has(explicitEnvFile)) {
            log(`--dotenv requires a path but got the command "${explicitEnvFile}" — write \`--dotenv <path> ${explicitEnvFile} …\``);
            process.exit(2);
        }
        argv.splice(dotenvAt, 2);
        if (!existsSync(explicitEnvFile)) {
            log(`--dotenv file not found: ${explicitEnvFile}`);
            process.exit(2);
        }
    }
    // Sources in priority order: --dotenv, then ./.env (cwd), then <install>/.env (dist/cli.js → ../.env).
    // Per the original note, loadDotenv only fills keys that are still undefined, so loading in
    // this order gives process.env > --dotenv > ./.env > <install>/.env.
    const packageRootEnv = fileURLToPath(new URL("../.env", import.meta.url));
    const candidates = [resolve(process.cwd(), ".env"), packageRootEnv];
    if (explicitEnvFile) {
        candidates.unshift(explicitEnvFile);
    }
    const visited = new Set();
    const loadedEnv = [];
    for (const file of candidates) {
        const absolute = resolve(file);
        if (visited.has(absolute)) {
            continue; // same file reached twice (e.g. cwd === install dir): load it once
        }
        visited.add(absolute);
        loadedEnv.push(...loadDotenv(file));
    }
    // Report what was loaded only for an explicit --dotenv or under debug; the implicit auto-load
    // stays silent.
    const announce = explicitEnvFile || process.env.COWORK_HARNESS_DEBUG;
    if (loadedEnv.length && announce) {
        log(`[env] loaded ${loadedEnv.length} var(s): ${loadedEnv.join(", ")}`);
    }
    const [cmd, ...rest] = argv;
    if (cmd === "--version" || cmd === "-v") {
        out(pkgVersion());
        return;
    }
    if (cmd === undefined || cmd === "--help" || cmd === "-h" || cmd === "help") {
        printHelp();
        return;
    }
    const handler = handlers.get(cmd);
    if (handler) {
        return handler(rest);
    }
    log(`unknown command: ${cmd}\n`);
    printHelp();
    process.exit(2);
}
268
/**
 * Shared json-output predicate, so code that has not gone through the flag parser can still tell
 * whether the caller asked for `--output-format json`.
 *
 * Accepts both spellings (`--output-format json` and `--output-format=json`). When the flag is
 * repeated the LAST occurrence wins, which is how `takeCommonFlags` resolves it; returning true
 * for "json appears anywhere" would disagree with the parser on
 * `--output-format json --output-format text`.
 *
 * @param {string[]} args raw command-line arguments
 * @returns {boolean} true when the effective output format is json
 */
function isJsonOutput(args) {
    let json = false;
    for (let i = 0; i < args.length; i++) {
        const arg = args[i];
        if (arg === "--output-format") {
            json = args[i + 1] === "json";
            i++; // the next token is this flag's value, not a flag of its own
        }
        else if (arg.startsWith("--output-format=")) {
            json = arg === "--output-format=json";
        }
    }
    return json;
}
278
/**
 * #58: bounds-checked read of the token that follows a value-taking flag.
 *
 * A bare `args[++i]` quietly produces `undefined` when the flag is the last token (for example a
 * trailing `--decider-cmd`), and that `undefined` then travels on as a broken flag value. Here a
 * missing value is a usage error: a message on stderr and exit code 2. The message always goes to
 * stderr because this can run before the output format has been resolved.
 *
 * @param {string[]} args argument list being parsed
 * @param {number} i index of the flag itself; the value is read from `i + 1`
 * @param {string} flag flag name, used in the error message
 * @returns {string} the token following the flag
 */
function flagValue(args, i, flag) {
    const value = args[i + 1];
    if (value !== undefined) {
        return value;
    }
    log(`${flag} requires a value (none provided)`); // stderr usage error
    process.exit(2);
    return value;
}
293
/**
 * Extract the true positionals: arguments that are neither a flag nor the value consumed by a
 * known value-taking flag. This replaces the `args.find((a) => !a.startsWith("--"))` idiom
 * (#15/#16), which mistook a flag's value (the `1` in `--gate 1`, the `json` in
 * `--output-format json`) for the positional.
 *
 * @param {string[]} args argument list to scan
 * @param {Iterable<string>} [valueFlags] flags whose following token is a value and must be
 *   skipped; an array or a Set both work (defaults to none)
 * @returns {string[]} the positionals, in their original order
 */
function positionals(args, valueFlags = []) {
    // A Set gives constant-time membership checks and lets callers pass any iterable.
    const takesValue = valueFlags instanceof Set ? valueFlags : new Set(valueFlags);
    // Deliberately not named `out`: that would shadow the module-level stdout writer.
    const found = [];
    for (let i = 0; i < args.length; i++) {
        const arg = args[i];
        if (takesValue.has(arg)) {
            i++; // skip the flag AND its value
            continue;
        }
        if (arg.startsWith("-")) {
            continue; // any other (boolean) flag: skip just the flag
        }
        found.push(arg);
    }
    return found;
}
313
/**
 * Pull the flags shared by several commands out of an argument list.
 *
 * Recognized: `--on-unanswered <v>`, `--output-format <v>` / `--output-format=<v>`, `--quiet`/`-q`,
 * `--verbose`/`-V`, `--decider-cmd <v>`, `--decider-dir <v>`. Everything else is passed through
 * untouched, in order, as `rest`.
 *
 * `--output-format` is validated against text|json in BOTH spellings (#2). Without that, the
 * equals form with a bad value (`--output-format=xml`) would match neither literal and leak into
 * `rest`, where the command would misread it as a positional. An invalid value is a usage error
 * (stderr message, exit 2).
 *
 * @param {string[]} args raw arguments for one command
 * @returns {{rest: string[], flags: object}} `flags` always has `output`, `quiet` and `verbose`;
 *   `onUnanswered`, `deciderCmd` and `deciderDir` are present only when given
 */
function takeCommonFlags(args) {
    const OUTPUT_EQ = "--output-format=";
    const rest = [];
    const flags = { output: "text", quiet: false, verbose: false };
    const setOutput = (v) => {
        if (v !== "text" && v !== "json") {
            log(`--output-format must be "text" or "json" (got "${v}")`);
            process.exit(2);
        }
        flags.output = v;
    };
    for (let i = 0; i < args.length; i++) {
        const a = args[i];
        // `flagValue(args, i++, a)` reads the value at i + 1, then steps over it.
        if (a === "--on-unanswered")
            flags.onUnanswered = flagValue(args, i++, a);
        else if (a === "--output-format")
            setOutput(flagValue(args, i++, a));
        else if (a.startsWith(OUTPUT_EQ))
            setOutput(a.slice(OUTPUT_EQ.length));
        else if (a === "--quiet" || a === "-q")
            flags.quiet = true;
        else if (a === "--verbose" || a === "-V")
            flags.verbose = true;
        else if (a === "--decider-cmd")
            flags.deciderCmd = flagValue(args, i++, a);
        else if (a === "--decider-dir")
            flags.deciderDir = flagValue(args, i++, a);
        else
            rest.push(a);
    }
    return { rest, flags };
}
347
/**
 * Turn the parsed common flags into the output/render plan for a command.
 *
 * - json: nothing is rendered and there is no footer; colour is off.
 * - quiet: footer only.
 * - otherwise: render plus footer. `skill` streams live with progress; any other command does so
 *   only under `--verbose`.
 *
 * Colour is on only when stderr is a TTY and NO_COLOR is unset or empty.
 */
function resolveOutput(command, flags) {
    const color = process.stderr.isTTY === true && !process.env.NO_COLOR;
    const silent = { live: false, progress: false, verbose: false };
    if (flags.output === "json") {
        return { json: true, render: false, footer: false, plan: { ...silent, color: false } };
    }
    if (flags.quiet) {
        return { json: false, render: false, footer: true, plan: { ...silent, color } };
    }
    const verbose = flags.verbose;
    // skill always streams; everything else only when verbose was requested.
    const streaming = command === "skill" || verbose;
    return {
        json: false,
        render: true,
        footer: true,
        plan: { live: streaming, progress: streaming, verbose, color },
    };
}
361
/**
 * Resolve the on_unanswered policy for a command. Both `run` and `skill` pass through here, so
 * removed/internal policy values are rejected at this one point; they cannot silently degrade to
 * `fail`.
 *
 * Every rejection goes through `fail(...)`, including `run` + `prompt`, so a caller using
 * `--output-format json` always receives the error envelope on stdout instead of a bare stderr line.
 *
 * @param {string} command the command being run ("run", "skill", …)
 * @param {{output: string, onUnanswered?: string}} flags parsed common flags
 * @returns {string} the explicit policy when one was given and is valid; otherwise "fail" for
 *   `run`, and for other commands "prompt" when stdin is a TTY and CI is unset, else "fail"
 */
function resolvePolicy(command, flags) {
    const json = flags.output === "json";
    const requested = flags.onUnanswered;
    // `external` (the removed stdio channel) is subsumed by --decider-dir / --decider-cmd.
    if (requested === "external")
        fail(command, "usage", "--on-unanswered external was removed. Use --decider-dir <dir> (the in-band file channel for a driving agent) or --decider-cmd '<helper>'.", undefined, json);
    // The LLM decider is spelled --decider-llm on the CLI; the raw policy value is YAML-only.
    if (requested === "llm")
        fail(command, "usage", '--on-unanswered llm is not a user flag. Use --decider-llm [--intent "<one line>"] to answer live questions with a model.', undefined, json);
    if (requested) {
        // #3: anything outside the accepted set (e.g. "banana") is rejected rather than passed on.
        if (requested !== "fail" && requested !== "prompt" && requested !== "first")
            fail(command, "usage", `--on-unanswered must be fail|prompt|first (got "${requested}")`, "for a model/external decider use --decider-llm, --decider-dir, or --decider-cmd", json);
        if (command === "run" && requested === "prompt")
            fail(command, "usage", "run rejects --on-unanswered prompt (would break determinism). Use fail|first.", undefined, json);
        return requested;
    }
    if (command === "run")
        return "fail"; // scenarios are reproducible regression tests
    // Adaptive default: ask a human when one is attached, otherwise fail (CI / agent).
    return process.stdin.isTTY && !process.env.CI ? "prompt" : "fail";
}
389
/**
 * Build the external decider channel when one was requested.
 *
 * `--decider-dir` gives a file channel (`fileChannel`), `--decider-cmd` a spawned helper
 * (`spawnChannel`). Asking for both is a usage error, and an exception thrown while creating the
 * file channel is reported as a usage error carrying its message.
 *
 * @returns the channel, or undefined when neither flag was given
 */
function resolveExternal(command, flags) {
    const { deciderDir, deciderCmd } = flags;
    const json = flags.output === "json";
    const wantsDir = deciderDir != null;
    const wantsCmd = deciderCmd != null;
    if (wantsDir && wantsCmd) {
        fail(command, "usage", "--decider-dir conflicts with --decider-cmd (one terminal channel).", undefined, json);
    }
    if (!wantsDir) {
        return wantsCmd ? spawnChannel(deciderCmd) : undefined;
    }
    try {
        return fileChannel(deciderDir);
    }
    catch (err) {
        return fail(command, "usage", String(err.message), undefined, json);
    }
}
406
/**
 * The one error exit shared by the commands and the top-level catch. Every category exits with 2.
 *
 * In json mode the error is emitted as a `jsonError` envelope through `out`; otherwise the message,
 * followed by the hint when there is one, goes through `log`.
 */
function fail(command, category, message, hint, json) {
    if (!json) {
        log(message);
        if (hint) {
            log(hint);
        }
    }
    else {
        out(jsonError(command, category, message, hint));
    }
    process.exit(2);
}
417
/**
 * Split a `--answer "<key>=<value>"` argument at its FIRST "=", so the value may itself contain "=".
 * A string with no "=" yields an empty value; a null/undefined argument yields two empty strings.
 */
function splitEq(s) {
    const text = s ?? "";
    const cut = text.indexOf("=");
    if (cut < 0) {
        return [text, ""];
    }
    return [text.slice(0, cut), text.slice(cut + 1)];
}
422
/**
 * Load an `--answer-policy <yaml>` file into scripted answer rules. The document is either a bare
 * list of rules or an `{answers: [...]}` mapping (the same shape as a scenario `answers:` block).
 *
 * Fails LOUD, as a usage error through `fail`, when the file is missing, cannot be read or parsed,
 * is not one of the two accepted shapes, or contains a rule that does not satisfy `AnswerRule`.
 * In particular a document with no usable list (an empty file, a scalar, or a mapping whose key
 * is mistyped, e.g. `answer:`) is rejected rather than accepted as "0 rules"; otherwise the
 * mistake would surface only when a gate goes unanswered mid-run. An explicit empty list is valid.
 *
 * @param {string} command command name, for error reporting
 * @param {string} path path of the policy file
 * @param {boolean} json whether errors are emitted as a json envelope
 * @returns {object[]} the validated rules (`safeParse(...).data` for each entry)
 */
function loadAnswerPolicy(command, path, json) {
    if (!existsSync(path))
        return fail(command, "usage", `--answer-policy file not found: ${path}`, undefined, json);
    let parsed;
    try {
        parsed = parseYaml(readFileSync(path, "utf8"));
    }
    catch (e) {
        return fail(command, "usage", `cannot parse --answer-policy ${path}: ${String(e.message)}`, undefined, json);
    }
    // No `?? []` fallback: a missing `answers` key must reach the shape check below.
    const rules = Array.isArray(parsed) ? parsed : parsed?.answers;
    if (!Array.isArray(rules))
        return fail(command, "usage", `--answer-policy must be a list of rules (or an {answers: [...]} doc)`, undefined, json);
    // #7: validate EACH rule against the AnswerRule schema instead of trusting its shape.
    // (The accumulator is not named `out`, which would shadow the module-level stdout writer.)
    const validated = [];
    for (const [idx, raw] of rules.entries()) {
        const r = AnswerRule.safeParse(raw);
        if (!r.success)
            return fail(command, "usage", `--answer-policy rule #${idx + 1} is malformed: ${r.error.issues.map((i) => `${i.path.join(".") || "(root)"} ${i.message}`).join("; ")}`, undefined, json);
        validated.push(r.data);
    }
    return validated;
}
451
/**
 * The per-scenario lifecycle shared by `cmdRun` and `cmdSkill`.
 *
 * Owns only the common spine: build the renderer and print the start line, start the idle
 * heartbeat (skipped under json output or when an external channel is in use), call
 * `executeScenario`, map `UnansweredError` to `fail`, and print the footer. The caller keeps what
 * differs between commands: creating/closing the external channel, the json envelope, and the
 * exit code.
 *
 * @param {object} p `{ command, scenario, label, flags, policy, externalChannel, o, keep, extra }`
 *   where `o` is the plan from `resolveOutput` and `extra` is spread into the execute options
 * @returns the value resolved by `executeScenario`
 */
async function runOneScenario(p) {
    const { command, scenario, label, flags, policy, externalChannel, o, keep, extra } = p;
    const renderer = o.render ? makeRenderer(o.plan) : undefined;
    if (!(o.json || flags.quiet)) {
        renderStart(label, scenario.fidelity, o.plan);
    }
    const startedAt = Date.now();
    const heartbeatOff = o.json || externalChannel;
    const stopHeartbeat = heartbeatOff ? () => { } : startHeartbeat(renderer, o.plan, startedAt);
    const hooks = renderer ? [renderer] : [];
    let result;
    try {
        result = await executeScenario(scenario, { ...extra, onUnanswered: policy, externalChannel, hooks });
    }
    catch (e) {
        if (e instanceof UnansweredError) {
            // Report the channel that was supposed to answer, falling back to the policy name.
            let chan = policy;
            if (flags.deciderDir) {
                chan = "decider-dir";
            }
            else if (flags.deciderCmd) {
                chan = "decider-cmd";
            }
            // `run` names the scenario; `skill` runs a single one, so it needs no prefix.
            const prefix = command === "run" ? `${scenario.name}: ` : "";
            fail(command, "unanswered", `${prefix}unanswered question (on_unanswered=${chan})`, e.hint, o.json);
        }
        throw e; // BoundaryError and anything else: left for the top-level catch to categorize
    }
    finally {
        stopHeartbeat();
    }
    // The footer (stderr) and the caller's json envelope (stdout) never both appear: resolveOutput
    // sets `footer` to false under json output.
    if (o.footer) {
        renderFooter(result, o.plan, { durationMs: Date.now() - startedAt, renderer, keep });
    }
    return result;
}
486
/**
 * `run <scenario.yaml | dir/>`: run one scenario file, or every `*.yaml` / `*.yml` file in a
 * directory (sorted by name), then exit 0 when all passed or 1 when any scenario had a failing
 * assertion or an "error" result.
 *
 * Usage errors go through `fail` (exit 2): a missing target, extra arguments, a path that does not
 * exist, a directory that contains no scenario files, or a scenario whose YAML sets
 * `on_unanswered: prompt`. The empty-directory case is rejected because running zero scenarios
 * would otherwise exit 0, so a mistyped directory would read as a green run.
 *
 * @param {string[]} rawArgs arguments after the `run` command word
 */
async function cmdRun(rawArgs) {
    if (hasHelp(rawArgs))
        return void log(RUN_HELP);
    const { rest: args, flags } = takeCommonFlags(rawArgs);
    const json = flags.output === "json";
    const target = args[0];
    if (!target)
        fail("run", "usage", "usage: run <scenario.yaml | dir/>", undefined, json);
    // `takeCommonFlags` strips the known flags and `run` takes exactly one positional, so anything
    // left over is unexpected. Reject it loudly rather than dropping it (e.g. `--fidelity microvm`
    // would otherwise be a silent no-op). Checked before existsSync so the message stays precise
    // even for a bogus path.
    const extra = args.slice(1);
    if (extra.length)
        fail("run", "usage", `unexpected argument(s): ${extra.join(" ")} — \`run\` takes one <scenario.yaml | dir/> plus common flags. Fidelity is set by the scenario's \`fidelity:\` field, not a flag.`, undefined, json);
    // A clean usage message instead of a raw ENOENT from statSync.
    if (!existsSync(target))
        fail("run", "usage", `scenario path not found: ${target}`, undefined, json);
    const files = statSync(target).isDirectory()
        ? readdirSync(target)
            .filter((f) => f.endsWith(".yaml") || f.endsWith(".yml"))
            .sort() // deterministic batch order: readdirSync order is FS/OS-dependent
            .map((f) => join(target, f))
        : [target];
    // Zero scenarios must not pass: an empty batch has no failures and would exit 0.
    if (files.length === 0)
        fail("run", "usage", `no scenario files (*.yaml / *.yml) found in: ${target}`, undefined, json);
    const externalChannel = resolveExternal("run", flags); // created once; reused across scenarios
    const policy = externalChannel ? "fail" : resolvePolicy("run", flags);
    const o = resolveOutput("run", flags);
    const results = [];
    try {
        for (let i = 0; i < files.length; i++) {
            const scenario = parseScenarioFile(files[i]);
            // resolvePolicy rejects `--on-unanswered prompt` on the CLI, but a committed scenario
            // could still set it in YAML and block in non-TTY CI. Reject it here too.
            if (scenario.on_unanswered === "prompt")
                fail("run", "usage", `scenario "${scenario.name}" sets on_unanswered: prompt — rejected on \`run\` (breaks determinism / hangs in CI). Use fail|first, or --decider-dir/--decider-cmd.`, undefined, o.json);
            const label = files.length > 1 ? `[${i + 1}/${files.length}] ${scenario.name}` : scenario.name;
            results.push(await runOneScenario({ command: "run", scenario, label, flags, policy, externalChannel, o }));
        }
    }
    finally {
        externalChannel?.close?.(); // ONE channel for the whole loop: close after ALL scenarios
    }
    // The external channels do not use stdout, so the envelope can be written as usual.
    if (o.json)
        out(jsonEnvelope("run", results));
    const failed = results.filter((r) => r.assertions.some((a) => !a.pass) || r.result === "error");
    process.exit(failed.length > 0 ? 1 : 0);
}
534
/**
 * `skill` — build a one-off scenario from CLI flags and run it once.
 *
 * Flow: parse the skill-specific flags, validate positionals / prompt source, assemble an inline
 * session + scenario (asserting `result: success`), then hand it to `runOneScenario`. The process
 * exits 1 when any assertion fails or the run result is "error", otherwise 0. `--dry-run` prints
 * the resolved inputs as JSON and returns without running anything.
 *
 * NOTE(review): the validation steps call `fail(...)` and then carry on as if it never returns;
 * this body keeps that assumption unchanged.
 *
 * @param {string[]} rawArgs argv after the `skill` subcommand.
 */
async function cmdSkill(rawArgs) {
    if (hasHelp(rawArgs)) {
        log(SKILL_HELP);
        return;
    }
    const { rest: args, flags } = takeCommonFlags(rawArgs);
    const isJson = flags.output === "json";
    const FIDELITIES = ["protocol", "container", "microvm", "hostloop", "cowork"];

    // Repeatable flags accumulate into lists; single-value flags keep the last occurrence.
    const positional = [];
    const answers = [];
    const extraPlugins = [];
    const marketplaces = [];
    const enables = [];
    const uploads = [];
    const folders = [];
    let fidelity = "container";
    let model;
    let promptFile;
    let sessionId;
    let answerPolicy;
    let intent;
    let deciderLlm = false;
    let resume = false;
    let dryRun = false;
    let keep = false;

    for (let i = 0; i < args.length; i++) {
        const arg = args[i];
        switch (arg) {
            case "--fidelity":
                fidelity = flagValue(args, i++, arg); // #58: bounds-checked
                // #6: validate at parse time so a bad value is a `usage` error rather than a later
                // Scenario.parse (Zod) throw that the top-level catch would report as `internal`.
                if (!FIDELITIES.includes(fidelity))
                    fail("skill", "usage", `--fidelity must be one of ${FIDELITIES.join("|")} (got "${fidelity}")`, undefined, isJson);
                break;
            case "--model":
                model = flagValue(args, i++, arg);
                break;
            case "--prompt-file":
                promptFile = flagValue(args, i++, arg);
                break;
            case "--upload":
                uploads.push(flagValue(args, i++, arg));
                break;
            case "--folder":
                folders.push(flagValue(args, i++, arg));
                break;
            case "--session-id":
                sessionId = flagValue(args, i++, arg);
                break;
            case "--resume":
                resume = true;
                break;
            case "--decider-llm":
                deciderLlm = true;
                break;
            case "--intent":
                intent = flagValue(args, i++, arg);
                break;
            case "--dry-run":
                dryRun = true;
                break;
            case "--keep":
                keep = true;
                break;
            case "--plugin":
                extraPlugins.push(flagValue(args, i++, arg));
                break;
            case "--marketplace":
                marketplaces.push(flagValue(args, i++, arg));
                break;
            case "--enable":
                enables.push(flagValue(args, i++, arg));
                break;
            case "--answer": {
                const [q, choose] = splitEq(flagValue(args, i++, arg));
                answers.push({ when_question: q, choose });
                break;
            }
            case "--answer-policy":
                answerPolicy = flagValue(args, i++, arg);
                break;
            default:
                // Anything that is not a recognised flag is collected as a positional.
                positional.push(arg);
        }
    }

    if (resume && !sessionId)
        fail("skill", "usage", "--resume requires --session-id <id> (the session to resume)", undefined, isJson);

    // #5: reject extra positionals so a shell-quoting slip (an unquoted multi-word prompt) cannot
    // silently drop part of the prompt. With --prompt-file only the plugin folder is positional (1);
    // without it the shape is <plugin-folder> "<prompt>" (2).
    const usingPromptFile = promptFile !== undefined;
    const maxPositional = usingPromptFile ? 1 : 2;
    if (positional.length > maxPositional) {
        const expectation = usingPromptFile
            ? "with --prompt-file, skill takes at most one positional (the plugin folder)"
            : 'skill takes <plugin-folder> "<prompt>" — quote a prompt that contains spaces';
        fail("skill", "usage", `unexpected extra argument(s): ${positional.slice(maxPositional).join(" ")} — ${expectation}`, undefined, isJson);
    }

    // --answer-policy <yaml>: rules loaded from the file are appended after any inline --answer rules.
    if (answerPolicy)
        answers.push(...loadAnswerPolicy("skill", answerPolicy, isJson));

    // --prompt-file: the prompt is read from disk as UTF-8 instead of being taken from a positional.
    let filePrompt;
    if (usingPromptFile) {
        if (!existsSync(promptFile))
            fail("skill", "usage", `--prompt-file not found: ${promptFile}`, undefined, isJson);
        try {
            filePrompt = readFileSync(promptFile, "utf8");
        }
        catch (e) {
            fail("skill", "usage", `cannot read --prompt-file ${promptFile}: ${String(e.message)}`, undefined, isJson);
        }
        if (!filePrompt.trim())
            fail("skill", "usage", `--prompt-file is empty: ${promptFile}`, undefined, isJson);
    }

    // With a file prompt, positional[0] is the folder. Without one, the folder is only present
    // when there are two positionals (folder + prompt); a lone positional is the prompt itself.
    const hasFilePrompt = filePrompt !== undefined;
    const twoPositionals = positional.length >= 2;
    const positionalSource = hasFilePrompt ? positional.length >= 1 : twoPositionals;
    const haveSource = positionalSource || marketplaces.length || extraPlugins.length;
    const folder = hasFilePrompt || twoPositionals ? positional[0] : undefined;
    const prompt = filePrompt ?? positional[twoPositionals ? 1 : 0];
    if (!haveSource || !prompt) {
        fail("skill", "usage", 'usage: cowork-harness skill <plugin-folder> "<prompt>" [--prompt-file <path>] [--marketplace <dir> --enable name@mkt] [--plugin <dir>]… [--fidelity …] [--answer "q=choice"] (skill --help for all flags)', undefined, isJson);
    }

    const localPlugins = [...(folder ? [folder] : []), ...extraPlugins];
    // The inline session's relative paths are resolved against the current working directory.
    const inlineSession = loadSession({
        model,
        permission_parity: "cowork",
        plugins: { local_plugins: localPlugins, local_marketplaces: marketplaces, enabled: enables },
        uploads, // --upload <file>
        folders: folders.map((from) => ({ from, mode: "rw" })), // --folder <dir>, always mounted "rw"
    });
    const session = resolveSessionPaths(inlineSession, process.cwd());

    // Name the run after the BASENAME of the first available source (trailing slashes stripped),
    // reduced to [a-zA-Z0-9-] and capped at 40 characters.
    const rawSource = folder ?? marketplaces[0] ?? extraPlugins[0] ?? "test";
    const sourceName = basename(rawSource.replace(/\/+$/, "")) || "test";
    const slug = sourceName
        .replace(/[^a-zA-Z0-9]+/g, "-")
        .replace(/^-+|-+$/g, "")
        .slice(0, 40);
    const scenario = Scenario.parse({
        name: `skill-${slug}`,
        baseline: "latest",
        session: "(inline)",
        fidelity,
        prompt,
        answers,
        assert: [{ result: "success" }],
    });

    if (dryRun) {
        out(JSON.stringify({ fidelity, prompt, localPlugins, marketplaces, enabled: enables, answers }, null, 2));
        return;
    }

    const externalChannel = resolveExternal("skill", flags);
    // `--decider-llm` selects the LLM terminal; it cannot be combined with an external channel.
    if (deciderLlm && externalChannel)
        fail("skill", "usage", "--decider-llm conflicts with --decider-cmd/--decider-dir (two terminals).", undefined, isJson);
    // Base policy: an external channel forces "fail", the LLM decider forces "llm", otherwise the
    // policy comes from the common flags.
    let policy;
    if (externalChannel)
        policy = "fail";
    else if (deciderLlm)
        policy = "llm";
    else
        policy = resolvePolicy("skill", flags);
    const o = resolveOutput("skill", flags);

    let result;
    try {
        result = await runOneScenario({
            command: "skill",
            scenario,
            label: scenario.name,
            flags,
            policy,
            externalChannel,
            o,
            keep,
            extra: {
                session,
                sessionId,
                resume,
                llmIntent: intent,
                // Set when a decider dir/cmd drives the answers (M4; #48).
                nonDeterministicHint: flags.deciderDir != null || flags.deciderCmd != null,
            },
        });
    }
    finally {
        // Always release the external channel, even when the run throws.
        externalChannel?.close?.();
    }

    const anyAssertionFailed = result.assertions.some((a) => !a.pass);
    // Under json output the envelope is what this function writes via `out`.
    if (o.json)
        out(jsonEnvelope("skill", [result]));
    process.exit(anyAssertionFailed || result.result === "error" ? 1 : 0);
}
/**
 * `vm <init|status|delete|prune> [baseline]` — manage the VM instance tied to a baseline.
 *
 * The subcommand is validated BEFORE the baseline is loaded, so a typo or a missing subcommand
 * always produces the usage line and exit code 2, even when no baseline can be loaded.
 *
 * @param {string[]} args argv after `vm`: `[subcommand, baselineName?]`; the baseline defaults to "latest".
 */
function cmdVm(args) {
    const sub = args[0];
    const SUBCOMMANDS = ["init", "status", "delete", "prune"];
    if (!SUBCOMMANDS.includes(sub)) {
        // #11: an invalid/absent subcommand must exit non-zero — a bare `log` exits 0, so a CI script
        // running `vm typo` would read it as success.
        log("usage: vm <init|status|delete|prune>");
        process.exit(2);
        return; // only reached if process.exit is stubbed
    }
    const baseline = loadBaseline(args[1] ?? "latest");
    // #62/#63: the instance name is derived from the config hash (see lima.ts instanceName) — a config
    // change yields a new name, so a stale VM is never silently reused.
    const instance = instanceName(baseline);
    if (sub === "status") {
        log(`${instance}: ${vmStatus(instance)}`);
    }
    else if (sub === "init") {
        const { status } = vmInit(baseline);
        log(`${instance}: ${status}`);
    }
    else if (sub === "delete") {
        vmDelete(instance);
        log(`${instance} deleted`);
    }
    else {
        // sub === "prune": vmPrune is given the current instance name and returns the pruned names.
        const pruned = vmPrune(instance);
        log(pruned.length ? `pruned ${pruned.length} orphaned VM(s): ${pruned.join(", ")}` : `no orphaned VMs (current: ${instance})`);
    }
}
/**
 * `boundary [baseline] [--session <file>]` — run the boundary self-checks and exit 0 only when
 * every check passes (1 otherwise).
 *
 * With `--session <file>`, that session's egress settings (`extra_allow`, `unrestricted`) are
 * passed to the checks alongside the baseline.
 *
 * @param {string[]} args argv after `boundary`.
 */
function cmdBoundary(args) {
    const SESSION_FLAG = "--session";
    const flagAt = args.indexOf(SESSION_FLAG);
    const hasSessionFlag = flagAt >= 0;
    // #12: a trailing `--session` with no value used to run the checks WITHOUT the session's
    // egress additions; treat it as a usage error instead.
    if (hasSessionFlag && args[flagAt + 1] === undefined) {
        log("--session requires a value (path to a session YAML)");
        process.exit(2);
    }
    const sessionPath = hasSessionFlag ? args[flagAt + 1] : undefined;
    // Positionals are everything except the flag itself and the token right after it.
    const positional = args.filter((a, i) => !(a === SESSION_FLAG || args[i - 1] === SESSION_FLAG));
    const baseline = loadBaseline(positional[0] ?? "latest");

    let sessionEgress;
    if (sessionPath) {
        const { egress } = loadSession(parseYaml(readFileSync(sessionPath, "utf8")));
        sessionEgress = { extraAllow: egress.extra_allow, unrestricted: egress.unrestricted };
    }

    const results = runBoundaryChecks(baseline, sessionEgress);
    log(formatBoundary(results));
    const anyFailed = results.some((r) => !r.pass);
    process.exit(anyFailed ? 1 : 0);
}
/**
 * `sync [--diff] [--allow-empty]` — merge freshly synced values (from `sync()`) onto the latest
 * committed baseline and write `desktop-<appVersion>.json` into BASELINES_DIR.
 *
 * - Refuses (exit 1) when appVersion/agentVersion are empty, or when allowDomains is empty and
 *   `--allow-empty` was not passed.
 * - Re-derives `agentBinary.stagedPath` for the new agentVersion and warns if it is absent on disk.
 * - Rebuilds `provenance.gates` from `res.gates`, warning on on/off drift and carrying forward
 *   gates that are missing from this sync.
 * - With `--diff`, prints a shallow diff against the existing baseline file instead of writing.
 *
 * @param {string[]} args argv after `sync`.
 * @throws {Error} when no base baseline can be loaded to merge onto.
 */
function cmdSync(args) {
    const allowEmpty = args.includes("--allow-empty");
    const diffOnly = args.includes("--diff");
    const res = sync();
    // #37 — refuse to write a baseline with empty version fields. An empty appVersion would produce
    // `desktop-.json` (invalid filename); an empty agentVersion means resolveAgentBinary will fail.
    const versionErrors = [];
    if (!res.appVersion)
        versionErrors.push("appVersion (Desktop not found or Info.plist unreadable — install/open Claude Desktop)");
    if (!res.agentVersion)
        versionErrors.push("agentVersion (.sdk-version missing — open Cowork once to stage the agent binary)");
    if (versionErrors.length) {
        log("ERROR: sync could not resolve required version fields — refusing to write baseline:");
        for (const e of versionErrors)
            log(`  - ${e}`);
        log("Fix the above, then re-run `cowork-harness sync`.");
        process.exit(1);
    }
    // #41 — refuse to write a baseline with an empty allowlist unless --allow-empty is passed.
    // An empty allowDomains = default-deny on ALL egress, which silently breaks every scenario.
    if (res.allowDomains.length === 0) {
        log("WARNING: sync produced an empty allowDomains list (asar domain regex matched nothing — asar layout moved).");
        if (!allowEmpty) {
            log("Refusing to write baseline with allowDomains: []. Fix the regex in cowork-sync.ts,");
            log("or hand-edit network.allowDomains in an existing baseline, then re-run.");
            log("Pass --allow-empty to force-write anyway (use only if you understand the egress impact).");
            process.exit(1);
        }
        log("--allow-empty passed: proceeding with empty allowDomains (egress will be default-deny for ALL domains).");
    }
    const baselinePath = join(BASELINES_DIR, `desktop-${res.appVersion}.json`);
    let base;
    try {
        // JSON round-trip: work on a detached copy of the loaded baseline.
        base = JSON.parse(JSON.stringify(loadBaseline("latest")));
    }
    catch {
        throw new Error("No base baseline in baselines/. Commit one (e.g. desktop-<ver>.json) before sync can merge onto it.");
    }
    // #38 — recompute agentBinary.stagedPath when agentVersion changes.
    // Strategy: derive the path by convention (same layout as in the committed baselines:
    //   ~/Library/Application Support/Claude/claude-code-vm/<agentVersion>/claude)
    // then VERIFY the derived path exists, because resolveAgentBinary (baseline.ts:16) will fail
    // on a stale path. We warn loudly rather than blocking — the file may not be staged yet on this
    // machine, but the path is the correct convention for the new version.
    const baseAgentBinary = (base.agentBinary ?? {});
    const oldStagedPath = baseAgentBinary.stagedPath ?? "";
    // Replace the version segment in the staged path with the new agentVersion. Gate on whether the regex
    // actually MATCHED (not result==input) — an unchanged-version re-sync produces result==input and must NOT
    // warn; an empty/non-standard layout falls back to the canonical Desktop path so the pointer isn't stale.
    const versionRe = /claude-code-vm\/[^/]+\/claude$/;
    let derivedStagedPath;
    if (versionRe.test(oldStagedPath)) {
        derivedStagedPath = oldStagedPath.replace(versionRe, `claude-code-vm/${res.agentVersion}/claude`);
    }
    else {
        derivedStagedPath = `~/Library/Application Support/Claude/claude-code-vm/${res.agentVersion}/claude`;
        if (oldStagedPath)
            log(`WARNING: agentBinary.stagedPath layout was unexpected ("${oldStagedPath}") — rewrote to the canonical path for ${res.agentVersion}.`);
    }
    // Expand a leading `~` using $HOME for the existence check only; the stored path keeps the `~`.
    const resolvedDerived = derivedStagedPath.replace(/^~(?=$|\/)/, join(process.env.HOME ?? "~"));
    if (!existsSync(resolvedDerived)) {
        log(`WARNING: derived agentBinary.stagedPath does not exist on this machine: ${derivedStagedPath}`);
        log(`  (The new agentVersion is ${res.agentVersion}. Open Cowork once to stage the binary, then re-run sync.)`);
        log(`  resolveAgentBinary will fail until the file is present or COWORK_AGENT_BINARY is set.`);
    }
    const nextAgentBinary = { ...baseAgentBinary, stagedPath: derivedStagedPath };
    // #39 — re-sync GrowthBook gate states from the decoded fcache (was: stale-carry + blanket warning).
    // Gates drive the cowork loop decision (decideLoopFromBaseline) and the dispatch cap; decoding the
    // fcache here makes a re-sync refresh them and surfaces real drift instead of silently carrying stale.
    const baseProvenance = (base.provenance ?? {});
    const baseGates = (baseProvenance.gates ?? {});
    // A prose gate entry starts with its state token ("on(force) …" / "off(…) …"). Only that LEADING
    // token decides the previous state: an unanchored /on|true|force/ also matched "off(force)" and
    // any note containing "on" (e.g. "session"), which produced spurious drift warnings and hid
    // real off→on drift.
    const proseIsOn = /^\s*(?:on|true|force)\b/i;
    let nextGates = baseGates;
    if (res.gates) {
        nextGates = {};
        // Preserve authored $comment / any non-pinned keys from the base.
        for (const [k, v] of Object.entries(baseGates))
            if (k.startsWith("$"))
                nextGates[k] = v;
        for (const g of Object.values(res.gates)) {
            const key = `${g.name}:${g.id}`;
            const prev = baseGates[key];
            const prevOn = typeof prev === "string" ? proseIsOn.test(prev) : !!prev?.on;
            // Preserve the human annotation: from a prose string, drop the leading "on(force) " token; from
            // a structured entry, keep its `note`.
            const prevNote = typeof prev === "string"
                ? prev.replace(/^(on|off)\([^)]*\)\s*/i, "").trim()
                : (prev?.note ?? "").trim();
            nextGates[key] = { on: g.on, source: g.source, value: g.value, ...(prevNote ? { note: prevNote } : {}) };
            if (prev !== undefined && prevOn !== g.on) {
                log(`WARNING: gate ${key} DRIFTED: ${prevOn ? "on" : "off"} → ${g.on ? "on" : "off"} (source=${g.source}). Loop/dispatch behavior may change — review carefully.`);
            }
        }
        // A pinned gate absent from THIS fcache (partial cache) would otherwise vanish from provenance,
        // silently dropping a loop/dispatch-driving gate. Carry it forward from the base and flag it.
        for (const [k, v] of Object.entries(baseGates)) {
            if (k.startsWith("$") || k in nextGates)
                continue;
            nextGates[k] = v;
            log(`WARNING: gate ${k} not present in fcache this sync — carried forward from base (may be stale).`);
        }
        log(`gates: re-synced ${Object.values(res.gates).length} pinned gate states from fcache.`);
    }
    else {
        log("WARNING: fcache unreadable — provenance.gates carried over from base (may be stale).");
    }
    const prevFingerprint = baseProvenance.asarFingerprint;
    if (prevFingerprint && prevFingerprint !== res.asarFingerprint) {
        log(`note: asarFingerprint changed (${prevFingerprint} → ${res.asarFingerprint}); gates re-synced above.`);
    }
    // Later keys override the spread base: only the fields listed here are refreshed by a sync.
    const next = {
        ...base,
        baselineVersion: 1,
        appVersion: res.appVersion,
        capturedAt: new Date().toISOString().slice(0, 10), // YYYY-MM-DD (UTC)
        agentVersion: res.agentVersion,
        agentBinary: nextAgentBinary,
        network: { ...base.network, mode: res.networkMode ?? "gvisor", allowKind: "allowlist", allowDomains: res.allowDomains },
        requireFullVmSandbox: res.requireFullVmSandbox,
        provenance: { ...baseProvenance, gates: nextGates, asarFingerprint: res.asarFingerprint },
    };
    if (diffOnly) {
        try {
            const prev = JSON.parse(readFileSync(baselinePath, "utf8"));
            log("=== diff vs committed baseline ===");
            diff(prev, next, "");
        }
        catch {
            // Reading/parsing the existing file failed — treated as "no baseline for this version yet".
            log(`(no committed ${baselinePath} yet — this would be the first)`);
        }
    }
    if (res.unknownDeltas.length) {
        log("\n⚠ unknown deltas (extend src/sync/cowork-sync.ts):");
        for (const d of res.unknownDeltas)
            log("  - " + d);
    }
    if (!diffOnly) {
        mkdirSync(BASELINES_DIR, { recursive: true });
        writeFileSync(baselinePath, JSON.stringify(next, null, 2));
        log(`wrote ${baselinePath}`);
    }
}
/**
 * `list` — print the file name of every `.json` baseline in BASELINES_DIR, one per line.
 *
 * Names are sorted so the listing is stable across machines: `readdirSync` order depends on the
 * file system / OS.
 */
function cmdList() {
    const baselineFiles = readdirSync(BASELINES_DIR)
        .filter((f) => f.endsWith(".json"))
        .sort();
    for (const f of baselineFiles)
        out(f);
}
/** `decide` — validate a decider (helper OR policy) against a sample question in ~2s, so you don't
 *  discover a wire-protocol bug 12 minutes into a live run. Shows the exact request a `--decider-cmd`
 *  helper receives and the answer it produced (or the protocol error); for `--answer`/`--answer-policy`
 *  it shows which rule matched.
 *
 *  The request written to a `--decider-cmd` helper is logged on the failure path too (when one was
 *  written), so a protocol error can be read next to the request that triggered it.
 *
 *  Exit code: 1 when no rule matches or the decider throws; otherwise the function returns normally.
 *
 *  @param {string[]} args argv after `decide`. Unrecognised tokens are ignored by the flag loop. */
async function cmdDecide(args) {
    const json = isJsonOutput(args);
    let question = "Confirm the detected stage before proceeding?";
    const options = [];
    let deciderCmd;
    let policy;
    let deciderLlm = false;
    let intent;
    const rules = [];
    for (let i = 0; i < args.length; i++) {
        const a = args[i];
        if (a === "--question")
            question = flagValue(args, i++, a); // #58: bounds-checked
        else if (a === "--option")
            options.push(flagValue(args, i++, a));
        else if (a === "--decider-cmd")
            deciderCmd = flagValue(args, i++, a);
        else if (a === "--decider-llm")
            deciderLlm = true;
        else if (a === "--intent")
            intent = flagValue(args, i++, a);
        else if (a === "--answer-policy")
            policy = flagValue(args, i++, a);
        else if (a === "--answer") {
            const [q, choose] = splitEq(flagValue(args, i++, a));
            rules.push({ when_question: q, choose });
        }
    }
    // #13: `decide` does not implement the file-rendezvous channel — reject `--decider-dir` loudly
    // instead of silently ignoring a first-class runtime path.
    if (args.includes("--decider-dir"))
        fail("decide", "usage", "decide does not support --decider-dir (the file-rendezvous channel); validate that path by running a scenario with --decider-dir. Use --decider-cmd '<helper>' to check a spawned helper here.", undefined, json);
    // #14: reject conflicting terminal deciders — both set, the LLM branch would silently win and the
    // helper would never be exercised. Mirrors cmdSkill/resolveExternal's conflict guards.
    if (deciderLlm && deciderCmd)
        fail("decide", "usage", "--decider-llm conflicts with --decider-cmd (one terminal decider).", undefined, json);
    if (policy)
        rules.push(...loadAnswerPolicy("decide", policy, json));
    // Fall back to three sample options when none were supplied.
    const opts = options.length ? options : ["Looks right", "Change it", "Correct or add data"];
    const req = { id: "check", kind: "question", questions: [{ question, options: opts.map((label) => ({ label })) }] };
    const ctx = { task: "", transcript: () => "(sample transcript context)", toolLog: () => [], runId: "decide-check" };
    log(`sample question: "${question}"  options: [${opts.join(" | ")}]`);
    try {
        if (deciderLlm) {
            const d = await new LlmDecider(claudeCliComplete, intent).decide(req, ctx);
            const answer = d.response.answers?.[question];
            if (json)
                out(JSON.stringify({ tool: "cowork-harness", command: "decide", ok: true, answer, by: "llm" }));
            else
                log(`✓ LLM decider answered: "${question}" → "${answer}" (non-deterministic)`);
        }
        else if (deciderCmd) {
            const inner = spawnChannel(deciderCmd);
            // Wrap the channel so the last line written to the helper can be echoed back to the user.
            let sent = "";
            const channel = {
                write: (l) => ((sent = l), inner.write(l)),
                readLine: () => inner.readLine(),
                close: () => inner.close?.(),
            };
            try {
                let d;
                try {
                    d = await new ExternalDecider(channel).decide(req, ctx);
                }
                catch (e) {
                    // Show what the helper was sent before the outer catch reports the error.
                    if (sent)
                        log(`helper received: ${sent}`);
                    throw e;
                }
                const answer = d.response.answers?.[question];
                log(`helper received: ${sent}`);
                if (json)
                    out(JSON.stringify({ tool: "cowork-harness", command: "decide", ok: true, answer }));
                else
                    log(`✓ helper answered: "${question}" → "${answer}"`);
            }
            finally {
                channel.close();
            }
        }
        else {
            const d = await new ScriptedDecider(rules).decide(req, ctx);
            if (d === ABSTAIN) {
                if (json)
                    out(JSON.stringify({ tool: "cowork-harness", command: "decide", ok: false, matched: false }));
                else
                    log(`✗ no rule matched — this question would fall to --on-unanswered (add an --answer/--answer-policy rule)`);
                process.exit(1);
            }
            const answer = d.response.answers?.[question];
            if (json)
                out(JSON.stringify({ tool: "cowork-harness", command: "decide", ok: true, matched: true, answer }));
            else
                log(`✓ rule matched: "${question}" → "${answer}"`);
        }
    }
    catch (e) {
        if (json)
            out(jsonError("decide", "runtime", String(e.message)));
        else
            log(`✗ decider error: ${String(e.message)}`);
        process.exit(1);
    }
}
/** `gates <dir> [--follow]` — the gate stream for the in-band `--decider-dir` path. Emits one clean
 *  JSON line per pending gate (`{seq, …decision_request}`) + a terminal `{"done":true}`. Point ONE
 *  Monitor at this (no hand-written zsh/find/seen-set loop).
 *
 *  The directory is resolved with `positionals`, as `answer` and `trace` do, so the value of
 *  `--output-format` (e.g. `json`) is never mistaken for `<dir>`.
 *
 *  @param {string[]} args argv after `gates`. */
async function cmdGates(args) {
    const follow = args.includes("--follow");
    const dir = positionals(args, ["--output-format"])[0];
    if (!dir)
        return void fail("gates", "usage", "usage: gates <dir> [--follow]", undefined, isJsonOutput(args));
    // Without --follow the stream is run in `once` mode.
    await streamGates(dir, (line) => out(line), { once: !follow });
}
/** `answer <dir> --gate <N> (--choose <label> | --answer "<q>=<label>"…)` — write a gate answer
 *  atomically with the right wire shape (hides the temp+rename + `{id, answers}` the driver had to build).
 *
 *  `--gate` must be a positive integer; anything else (missing, `abc`, `0`, `-1`, `1.5`) is a usage
 *  error. When `--answer` pairs are given they take precedence over `--choose`.
 *
 *  @param {string[]} args argv after `answer`. */
function cmdAnswer(args) {
    const json = isJsonOutput(args);
    // #15: skip flag values so `answer --gate 1 --choose Yes <dir>` doesn't read `1` as the directory.
    const dir = positionals(args, ["--gate", "--choose", "--answer", "--output-format"])[0];
    let seq;
    let choose;
    const pairs = [];
    for (let i = 0; i < args.length; i++) {
        const a = args[i];
        if (a === "--gate")
            seq = Number(flagValue(args, i++, a)); // #58: bounds-checked
        else if (a === "--choose")
            choose = flagValue(args, i++, a);
        else if (a === "--answer") {
            const [q, label] = splitEq(flagValue(args, i++, a));
            pairs.push({ q, label });
        }
    }
    // A bare truthiness check let fractional and negative numbers through to the gate helpers.
    const validSeq = Number.isInteger(seq) && seq > 0;
    if (!dir || !validSeq)
        return void fail("answer", "usage", 'usage: answer <dir> --gate <N> (--choose <label> | --answer "<q>=<label>")', undefined, json);
    const answers = {};
    if (pairs.length) {
        for (const p of pairs)
            answers[p.q] = p.label;
    }
    else if (choose) {
        try {
            // --choose answers the gate's FIRST question: keyed by its `question`, else `header`, else "".
            const g = readGate(dir, seq);
            answers[g.questions?.[0]?.question ?? g.questions?.[0]?.header ?? ""] = choose;
        }
        catch (e) {
            return void fail("answer", "usage", `cannot read gate ${seq} in ${dir}: ${String(e.message)}`, undefined, json);
        }
    }
    else
        return void fail("answer", "usage", 'answer needs --choose <label> or --answer "<q>=<label>"', undefined, json);
    answerGate(dir, seq, answers);
    if (json)
        out(JSON.stringify({ tool: "cowork-harness", command: "answer", ok: true, gate: seq, answers }));
    else
        log(`✓ answered gate ${seq}: ${JSON.stringify(answers)}`);
}
/**
 * `trace <run-id | run-dir | events.jsonl> [--tools | --gates] [--output-format json]` — print a
 * trace built from a run's events file.
 *
 * `--gates` switches to the gate trace (built by `buildGateTrace`); otherwise the regular trace is
 * built, with `--tools` forwarded as an option. Under json output the rows are wrapped in a
 * `{tool, command, file, …}` object; otherwise the matching formatter renders them.
 *
 * @param {string[]} args argv after `trace`.
 */
function cmdTrace(args) {
    const json = isJsonOutput(args);
    const wantTools = args.includes("--tools");
    const wantGates = args.includes("--gates");
    // #16: skip the `--output-format` value so `trace --output-format json` reports the missing
    // target instead of trying to trace a run named `json`.
    const [target] = positionals(args, ["--output-format"]);
    if (!target)
        fail("trace", "usage", "usage: trace <run-id | run-dir | events.jsonl> [--tools | --gates] [--output-format json]", undefined, json);

    let file;
    try {
        file = resolveEventsFile(target);
    }
    catch (e) {
        return fail("trace", "usage", String(e.message), undefined, json);
    }

    // --gates: question → injected answer → delivered result, the full gate lifecycle (Part 4).
    const rows = wantGates ? buildGateTrace(file) : buildTrace(file, { tools: wantTools });
    if (json) {
        const rowsKey = wantGates ? "gates" : "rows";
        out(JSON.stringify({ tool: "cowork-harness", command: "trace", file, [rowsKey]: rows }));
    }
    else {
        out(wantGates ? formatGateTrace(rows) : formatTrace(rows));
    }
}
/**
 * Log a shallow, one-level diff between two values.
 *
 * Every key present on either side is compared by its JSON serialisation; each differing key
 * produces one `  <path><key>: <before> -> <after>` line. Nested objects are not descended into.
 *
 * @param {object|null|undefined} a previous value (nullish is treated as empty).
 * @param {object|null|undefined} b next value (nullish is treated as empty).
 * @param {string} path prefix prepended to each reported key.
 */
function diff(a, b, path) {
    const before = a ?? {};
    const after = b ?? {};
    // The Set dedupes shared keys while keeping first-seen order (a's keys, then b's new ones).
    const allKeys = new Set(Object.keys(before).concat(Object.keys(after)));
    allKeys.forEach((key) => {
        const left = JSON.stringify(before[key]);
        const right = JSON.stringify(after[key]);
        if (left === right)
            return;
        log(`  ${path}${key}: ${left} -> ${right}`);
    });
}
// Entry point: run the CLI and map any rejection to an error category, then exit 2.
// UnansweredError → "unanswered" and BoundaryError → "boundary" are handed to `fail`; anything
// else is "internal".
main().catch((err) => {
    const cliArgs = process.argv.slice(2);
    const command = cliArgs[0] ?? "";
    const json = isJsonOutput(cliArgs);
    if (err instanceof UnansweredError)
        fail(command, "unanswered", err.message, err.hint, json);
    if (err instanceof BoundaryError)
        fail(command, "boundary", err.message, undefined, json);
    // runtime/unexpected: a structured envelope for json output; otherwise the stack for humans.
    if (!json)
        log(String(err?.stack ?? err));
    else
        out(jsonError(command, "internal", String(err?.message ?? err)));
    process.exit(2);
});