cowork-harness 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +16 -0
- package/CHANGELOG.md +190 -0
- package/LICENSE +21 -0
- package/README.md +470 -0
- package/baselines/desktop-1.11847.5.json +78 -0
- package/baselines/desktop-1.12603.1.json +140 -0
- package/baselines/prompts/desktop-1.12603.1/host-loop-append.md +8 -0
- package/baselines/prompts/desktop-1.12603.1/subagent-append-vm.md +3 -0
- package/baselines/prompts/desktop-1.12603.1/system-prompt-append.md +18 -0
- package/dist/agent/session.js +465 -0
- package/dist/assert.js +159 -0
- package/dist/baseline.js +87 -0
- package/dist/boundary.js +114 -0
- package/dist/canary/grants.js +37 -0
- package/dist/cli.js +1107 -0
- package/dist/decide/decider.js +521 -0
- package/dist/decide/external-channel.js +262 -0
- package/dist/decide/llm-transport.js +52 -0
- package/dist/dotenv.js +52 -0
- package/dist/egress/proxy.js +138 -0
- package/dist/egress/sidecar.js +125 -0
- package/dist/hostloop/provenance.js +110 -0
- package/dist/hostloop/workspace-handler.js +226 -0
- package/dist/loop-decision.js +62 -0
- package/dist/prompt.js +43 -0
- package/dist/run/cassette.js +420 -0
- package/dist/run/chat.js +194 -0
- package/dist/run/envelope.js +31 -0
- package/dist/run/execute.js +533 -0
- package/dist/run/renderer.js +179 -0
- package/dist/run/run.js +347 -0
- package/dist/run/trace-view.js +227 -0
- package/dist/runtime/argv.js +126 -0
- package/dist/runtime/container.js +76 -0
- package/dist/runtime/host-env.js +28 -0
- package/dist/runtime/hostloop.js +129 -0
- package/dist/runtime/lima.js +177 -0
- package/dist/runtime/microvm.js +151 -0
- package/dist/runtime/protocol.js +79 -0
- package/dist/runtime/stage.js +52 -0
- package/dist/secrets.js +42 -0
- package/dist/session.js +315 -0
- package/dist/sync/cowork-sync.js +215 -0
- package/dist/types.js +127 -0
- package/docker/Dockerfile.agent +31 -0
- package/docker/Dockerfile.proxy +12 -0
- package/docker/compose.yml +31 -0
- package/fixtures/subagent-grants.json +5 -0
- package/package.json +70 -0
package/dist/cli.js
ADDED
|
@@ -0,0 +1,1107 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import { readFileSync, writeFileSync, mkdirSync, readdirSync, statSync, writeSync, existsSync } from "node:fs";
|
|
3
|
+
import { join, basename, resolve } from "node:path";
|
|
4
|
+
import { fileURLToPath } from "node:url";
|
|
5
|
+
import { parse as parseYaml } from "yaml";
|
|
6
|
+
import { Scenario, AnswerRule } from "./types.js";
|
|
7
|
+
import { loadBaseline, BASELINES_DIR } from "./baseline.js";
|
|
8
|
+
import { loadSession, resolveSessionPaths } from "./session.js";
|
|
9
|
+
import { executeScenario, parseScenarioFile, UnansweredError, BoundaryError } from "./run/execute.js";
|
|
10
|
+
import { ScriptedDecider, ExternalDecider, LlmDecider, ABSTAIN } from "./decide/decider.js";
|
|
11
|
+
import { claudeCliComplete } from "./decide/llm-transport.js";
|
|
12
|
+
import { vmInit, vmDelete, vmStatus, vmPrune, instanceName } from "./runtime/lima.js";
|
|
13
|
+
import { sync } from "./sync/cowork-sync.js";
|
|
14
|
+
import { runBoundaryChecks, formatBoundary } from "./boundary.js";
|
|
15
|
+
import { cmdChat } from "./run/chat.js";
|
|
16
|
+
import { cmdRecord, cmdReplay } from "./run/cassette.js";
|
|
17
|
+
import { loadDotenv } from "./dotenv.js";
|
|
18
|
+
import { makeRenderer, renderStart, renderFooter, startHeartbeat } from "./run/renderer.js";
|
|
19
|
+
import { resolveEventsFile, buildTrace, formatTrace, buildGateTrace, formatGateTrace } from "./run/trace-view.js";
|
|
20
|
+
import { pkgVersion, jsonEnvelope, jsonError } from "./run/envelope.js";
|
|
21
|
+
import { spawnChannel, fileChannel, streamGates, answerGate, readGate } from "./decide/external-channel.js";
|
|
22
|
+
// Both writers go through the synchronous `writeSync` on the raw descriptors rather than
// `process.stdout.write`: an async stream write followed by `process.exit()` truncates on a PIPE,
// which would lose the json envelope for any agent/CI that pipes us. writeSync flushes before exit.

/** Machine channel: write one newline-terminated line to stdout (fd 1). */
const out = (s) => {
    const line = s + "\n";
    return writeSync(1, line);
};

/** Human channel: write one newline-terminated line to stderr (fd 2). */
const log = (s) => {
    const line = s + "\n";
    return writeSync(2, line);
};
|
|
26
|
+
const HELP = `cowork-harness <command> (v${"$VERSION"})
|
|
27
|
+
|
|
28
|
+
skill <folder> "<prompt>" test a LOCAL skill folder directly (copied fresh each run)
|
|
29
|
+
[--prompt-file <path>] read the prompt verbatim from a file (bypasses the shell — no $-expansion)
|
|
30
|
+
[--fidelity protocol|container|microvm|hostloop|cowork] (default container)
|
|
31
|
+
[--plugin <dir>]… [--marketplace <dir> --enable name@mkt] extra plugin/marketplace sources
|
|
32
|
+
[--answer "<question-regex>=<choice>"] scripted AskUserQuestion answer (repeatable)
|
|
33
|
+
[--on-unanswered fail|prompt|first] policy for unscripted questions (default: adaptive — prompt on a TTY, fail when piped/CI)
|
|
34
|
+
[--decider-llm [--intent "…"]] answer LIVE questions with a model (state test intent in one line)
|
|
35
|
+
[--decider-cmd '<helper>'] …or via a spawned helper (custom logic)
|
|
36
|
+
[--decider-dir <dir>] …or in-band from the driving agent (arm a Monitor; see 'skill --help')
|
|
37
|
+
[--upload <file>]… [--folder <dir>]… attach files / connect folders (mnt/uploads, mnt/.projects)
|
|
38
|
+
[--session-id <id> [--resume]] pin + resume a session (for gated, checkpoint-and-resume skills)
|
|
39
|
+
[--output-format text|json] [--model <id>] [--keep] [--dry-run]
|
|
40
|
+
(run 'skill --help' for the full flag reference)
|
|
41
|
+
|
|
42
|
+
run <scenario.yaml | dir/> run one scenario or every *.yaml in a dir
|
|
43
|
+
[--on-unanswered fail|first] (run rejects 'prompt' — would break determinism)
|
|
44
|
+
[--output-format text|json] [--quiet|-q] [--verbose|-V]
|
|
45
|
+
(run 'run --help' for the full flag reference)
|
|
46
|
+
chat <folder> interactive multi-turn REPL against a skill (TTY); --raw for native
|
|
47
|
+
record <scenario.yaml> run + save a control-protocol cassette
|
|
48
|
+
[--out <file>] cassette path (default: cassettes/<scenario-name>.cassette.json)
|
|
49
|
+
replay --cassette <file> deterministic protocol-replay of a cassette (no token) [--output-format json]
|
|
50
|
+
trace <run-id | dir | path> digest a run's events.jsonl (tools+result status, dispatches, decisions)
|
|
51
|
+
[--tools] tool/dispatch rows only [--gates] gate lifecycle (question→answer→delivered)
|
|
52
|
+
[--output-format json] structured rows
|
|
53
|
+
decide VALIDATE a decider against a sample question in ~2s (no run)
|
|
54
|
+
(--decider-cmd '<helper>' | --decider-llm [--intent …] | --answer "rx=c" | --answer-policy <yaml>)
|
|
55
|
+
[--question "<text>"] [--option <label>]… override the sample question
|
|
56
|
+
gates <dir> [--follow] in-band gate stream (for --decider-dir): one JSON line per pending gate
|
|
57
|
+
+ a terminal {"done":true}. Point a single Monitor at this.
|
|
58
|
+
answer <dir> --gate <N> answer an in-band gate (atomic write): --choose <label> | --answer "q=c"
|
|
59
|
+
sync [--diff] derive/refresh a platform baseline from the live Desktop install
|
|
60
|
+
list list available platform baselines
|
|
61
|
+
boundary-check [baseline] prove the sandbox enforces Cowork's limitations
|
|
62
|
+
vm <init|status|delete|prune> manage the L2 Apple-VZ microVM (fidelity: microvm); prune drops orphaned VMs
|
|
63
|
+
|
|
64
|
+
Global: --dotenv <path> load a .env before the command (host-side creds; never mounted).
|
|
65
|
+
Auth resolves from process.env > --dotenv > ./.env > <install>/.env.
|
|
66
|
+
--version, -v print version --help, -h print this help`;
|
|
67
|
+
const SKILL_HELP = `cowork-harness skill <plugin-folder> "<prompt>"
|
|
68
|
+
|
|
69
|
+
Run a LOCAL skill/plugin folder against the staged Cowork agent. The folder is copied fresh into the
|
|
70
|
+
session on every run — no install, marketplace registration, or version bump.
|
|
71
|
+
|
|
72
|
+
Source (at least one):
|
|
73
|
+
<plugin-folder> dir containing .claude-plugin/plugin.json
|
|
74
|
+
--plugin <dir> extra plugin source (repeatable)
|
|
75
|
+
--marketplace <dir> --enable name@mkt load skills via a marketplace.json
|
|
76
|
+
|
|
77
|
+
Files (for skills that need an attached file, e.g. deck-review):
|
|
78
|
+
--upload <path> mount a file at mnt/uploads/<name> — the "attach a file" path (repeatable)
|
|
79
|
+
--folder <dir> mount a folder at mnt/.projects/<id> — a connected repo/space (repeatable)
|
|
80
|
+
|
|
81
|
+
Session persistence (for gated skills that checkpoint + resume):
|
|
82
|
+
--session-id <id> pin a stable session (persists the work dir + the agent's session)
|
|
83
|
+
--resume continue a prior --session-id session (reuses its work dir, so any
|
|
84
|
+
skill-written checkpoint state + outputs survive; passes the agent's
|
|
85
|
+
native --resume so it reloads the conversation)
|
|
86
|
+
|
|
87
|
+
Prompt (one of):
|
|
88
|
+
"<prompt>" inline — MIND SHELL EXPANSION: a literal $ in double quotes is
|
|
89
|
+
eaten by the shell. Single-quote it, or use --prompt-file.
|
|
90
|
+
--prompt-file <path> read the prompt verbatim from a file (raw bytes; no shell parsing)
|
|
91
|
+
|
|
92
|
+
Fidelity --fidelity <tier> (default: container)
|
|
93
|
+
protocol L0 — no sandbox, control protocol only
|
|
94
|
+
container L1 — Docker + per-run default-deny egress proxy (CI-native; fast)
|
|
95
|
+
microvm L2 — Apple-VZ Lima microVM + guest firewall
|
|
96
|
+
hostloop Cowork's production split-execution (file tools on host, shell/web via the workspace MCP)
|
|
97
|
+
cowork auto-pick host-loop vs container the way real Cowork does (via the synced gate)
|
|
98
|
+
|
|
99
|
+
Questions:
|
|
100
|
+
--answer "<q-regex>=<choice>" pre-answer a matching AskUserQuestion (repeatable)
|
|
101
|
+
--answer-policy <yaml> a reusable file of the same regex→choice rules (a bare list, or an
|
|
102
|
+
{answers: [...]} doc) — for skills with several known gates
|
|
103
|
+
--on-unanswered <policy> what to do with an UNscripted question (default: adaptive)
|
|
104
|
+
fail error + print the exact --answer to add (default when piped / CI)
|
|
105
|
+
prompt ask at the TTY (default when a human is attached)
|
|
106
|
+
first pick option 1, loudly warn — then the footer prints the --answer to lock it in
|
|
107
|
+
(the footer always echoes auto-answered questions as copy-pasteable --answer lines)
|
|
108
|
+
(to answer LIVE questions, use --decider-llm / --decider-cmd / --decider-dir below)
|
|
109
|
+
--decider-llm [--intent "<one line>"] answer LIVE questions with a small model (the ergonomic
|
|
110
|
+
default for agent-driven runs: state the test's intent once instead of
|
|
111
|
+
writing a helper). Picks an option by label per question; an out-of-set
|
|
112
|
+
answer FAILS LOUD. NON-deterministic — the footer flags the run so a
|
|
113
|
+
green isn't mistaken for a scripted pass; pin with --answer for CI.
|
|
114
|
+
(Uses the host 'claude -p' on a small model — COWORK_HARNESS_DECIDER_MODEL.)
|
|
115
|
+
--decider-cmd '<helper>' answer the LIVE question via a spawned helper (for custom logic). The
|
|
116
|
+
helper reads a {"type":"decision_request",…} line on stdin and writes
|
|
117
|
+
back {"answers":{"<q>":"<label or 1-based index>"}} (MUST flush per
|
|
118
|
+
line). Carries a reply_with template + a scrubbed transcript context.
|
|
119
|
+
The helper owns its own pipes → the CLI's stdout stays free, so this
|
|
120
|
+
composes with --output-format json.
|
|
121
|
+
--decider-dir <dir> answer LIVE questions IN-BAND from the DRIVING agent (run the harness
|
|
122
|
+
in the background; arm a Monitor on <dir>). Each gate is written to
|
|
123
|
+
<dir>/req-N.json; write the answer to <dir>/resp-N.json (temp+rename).
|
|
124
|
+
stdout stays free → composes with --output-format json. The run is marked
|
|
125
|
+
non-deterministic. Use a FRESH empty dir per run. (See docs/decider-dir.md.)
|
|
126
|
+
|
|
127
|
+
Output:
|
|
128
|
+
--output-format text|json text = live stream + footer (default); json = one stdout envelope
|
|
129
|
+
--quiet, -q verdict footer only --verbose, -V + thinking/tool inputs/sub-agent tree
|
|
130
|
+
--keep print the run dir + deliverable path (runs are always kept on disk)
|
|
131
|
+
--model <id> override the session model
|
|
132
|
+
--dry-run resolve + print the plan, don't run NO_COLOR=1 disable ANSI
|
|
133
|
+
|
|
134
|
+
Long runs: an idle "still running" heartbeat prints on stderr after ~30s of silence.
|
|
135
|
+
COWORK_HARNESS_NO_HEARTBEAT=1 disables it; COWORK_HARNESS_HEARTBEAT_MS tunes the interval.
|
|
136
|
+
|
|
137
|
+
Auth: CLAUDE_CODE_OAUTH_TOKEN (or ANTHROPIC_API_KEY) from process.env > --dotenv <path> > ./.env >
|
|
138
|
+
<install>/.env. So you can run from any directory and still pick up the install's credentials.
|
|
139
|
+
|
|
140
|
+
Exit codes: 0 pass · 1 assertion/agent failure · 2 usage / unanswered-under-fail / boundary / runtime.`;
|
|
141
|
+
const RUN_HELP = `cowork-harness run <scenario.yaml | dir/>
|
|
142
|
+
|
|
143
|
+
Run one authored scenario, or every *.yaml/*.yml in a directory, with assertions and a CI-ready exit
|
|
144
|
+
code. Verdict-first: on FAIL the failing transcript is printed inline (no spelunking runs/…).
|
|
145
|
+
|
|
146
|
+
Input policy:
|
|
147
|
+
--on-unanswered fail|first policy for an unscripted question (default: fail — deterministic).
|
|
148
|
+
'prompt' is rejected (it would break reproducibility).
|
|
149
|
+
fail error + the exact --answer to add (the CI default)
|
|
150
|
+
first pick option 1, loudly warn; the footer echoes it as a --answer line to lock in
|
|
151
|
+
--decider-cmd '<helper>' answer live questions via a spawned helper (see 'skill --help')
|
|
152
|
+
--decider-dir <dir> answer live questions in-band from the driving agent (see 'skill --help')
|
|
153
|
+
(run omits --decider-llm by design — scenarios pin answers for reproducibility; a scenario may still
|
|
154
|
+
opt into the model with 'on_unanswered: llm' in its YAML, which flags the run non-deterministic)
|
|
155
|
+
(per-scenario answers/on_unanswered in the YAML take precedence where set)
|
|
156
|
+
|
|
157
|
+
Output:
|
|
158
|
+
--output-format text|json text = verdict + failing transcript (default); json = stdout envelope
|
|
159
|
+
--quiet, -q verdict only --verbose, -V live stream + per-tool markers
|
|
160
|
+
NO_COLOR=1 disable ANSI on stderr
|
|
161
|
+
|
|
162
|
+
Long runs: an idle "still running" heartbeat prints on stderr after ~30s of silence
|
|
163
|
+
(COWORK_HARNESS_NO_HEARTBEAT=1 / COWORK_HARNESS_HEARTBEAT_MS to disable/tune).
|
|
164
|
+
|
|
165
|
+
Exit codes: 0 all pass · 1 any assertion/agent failure · 2 usage / unanswered-under-fail / boundary.`;
|
|
166
|
+
/**
 * Print the top-level help text on stderr (via `log`), with the literal `$VERSION` placeholder in
 * HELP swapped for the value returned by `pkgVersion()`.
 */
function printHelp() {
    // Use a replacer FUNCTION: with a string replacement, `String.prototype.replace` interprets
    // `$&`, `$1`, `$$`, … inside the version text as substitution patterns. A callback's return
    // value is always inserted verbatim, whatever the version string contains.
    log(HELP.replace("$VERSION", () => pkgVersion()));
}
|
|
169
|
+
/**
 * Report whether the argument list asks for help.
 * @param {string[]} args - command arguments
 * @returns {boolean} true when `--help` or `-h` appears anywhere in `args`
 */
function hasHelp(args) {
    return args.some((arg) => arg === "--help" || arg === "-h");
}
|
|
172
|
+
/**
 * CLI entry point: strip the global `--dotenv <path>` flag, load the .env sources, then dispatch
 * the first remaining argument to its command handler. Usage errors exit with code 2.
 */
async function main() {
    const argv = process.argv.slice(2);
    // `--dotenv <path>` is GLOBAL: it is parsed and removed before dispatch so a command run from any
    // directory can point at the install's credentials. Precedence is
    //   process.env (exported wins) > --dotenv > ./.env (cwd) > <install>/.env (package root);
    // loadDotenv only fills UNDEFINED keys, so loading in that order yields exactly that precedence.
    // (The flag is NOT called `--env-file`: Node reserves that name and consumes it before this runs.)
    const dotenvAt = argv.indexOf("--dotenv");
    const dotenvGiven = dotenvAt >= 0;
    const dotenvPath = dotenvGiven ? argv[dotenvAt + 1] : undefined;
    if (dotenvGiven) {
        // The value must exist…
        if (dotenvPath === undefined) {
            log("--dotenv requires a path (none provided)");
            process.exit(2);
        }
        // …must not be a command name (`--dotenv run x.yaml` would otherwise treat `run` as the
        // path and dispatch `x.yaml`)…
        const commandNames = new Set([
            "skill",
            "run",
            "chat",
            "record",
            "replay",
            "trace",
            "decide",
            "gates",
            "answer",
            "sync",
            "list",
            "boundary-check",
            "vm",
        ]);
        if (commandNames.has(dotenvPath)) {
            log(`--dotenv requires a path but got the command "${dotenvPath}" — write \`--dotenv <path> ${dotenvPath} …\``);
            process.exit(2);
        }
        argv.splice(dotenvAt, 2);
        // …and must name an existing file: silently ignoring an explicitly requested credential
        // file is a footgun.
        if (!existsSync(dotenvPath)) {
            log(`--dotenv file not found: ${dotenvPath}`);
            process.exit(2);
        }
    }
    const installEnv = fileURLToPath(new URL("../.env", import.meta.url)); // dist/cli.js → <install>/.env
    const candidates = [];
    if (dotenvPath)
        candidates.push(dotenvPath);
    candidates.push(resolve(process.cwd(), ".env"), installEnv);
    const loadedVars = [];
    const visited = new Set();
    for (const file of candidates) {
        const absolute = resolve(file);
        if (!visited.has(absolute)) {
            // first sighting only — don't double-load when cwd === install dir
            visited.add(absolute);
            loadedVars.push(...loadDotenv(file));
        }
    }
    // Env loading is only reported when it is non-obvious (an explicit --dotenv, or debug); the
    // common auto-load (./.env / install .env) stays silent to avoid repetitive noise.
    const announce = dotenvPath || process.env.COWORK_HARNESS_DEBUG;
    if (loadedVars.length > 0 && announce)
        log(`[env] loaded ${loadedVars.length} var(s): ${loadedVars.join(", ")}`);
    const [cmd, ...rest] = argv;
    if (cmd === "--version" || cmd === "-v") {
        out(pkgVersion());
        return;
    }
    if (cmd === undefined || cmd === "--help" || cmd === "-h" || cmd === "help") {
        printHelp();
        return;
    }
    // Thunks keep each handler lookup lazy: only the selected command's handler is touched.
    const dispatch = new Map([
        ["run", () => cmdRun(rest)],
        ["sync", () => cmdSync(rest)],
        ["list", () => cmdList()],
        ["boundary-check", () => cmdBoundary(rest)],
        ["vm", () => cmdVm(rest)],
        ["skill", () => cmdSkill(rest)],
        ["chat", () => cmdChat(rest)],
        ["record", () => cmdRecord(rest)],
        ["replay", () => cmdReplay(rest)],
        ["trace", () => cmdTrace(rest)],
        ["decide", () => cmdDecide(rest)],
        ["gates", () => cmdGates(rest)],
        ["answer", () => cmdAnswer(rest)],
    ]);
    const handler = dispatch.get(cmd);
    if (handler)
        return handler();
    log(`unknown command: ${cmd}\n`);
    printHelp();
    process.exit(2);
}
|
|
268
|
+
/**
 * Shared predicate for "was JSON output requested?", kept in one place so the flag parser and the
 * top-level catch cannot drift apart. Recognizes both `--output-format json` and
 * `--output-format=json`.
 * @param {string[]} args - raw command arguments
 * @returns {boolean}
 */
function isJsonOutput(args) {
    return args.some((arg, idx) => {
        if (arg === "--output-format=json")
            return true;
        return arg === "--output-format" && args[idx + 1] === "json";
    });
}
|
|
278
|
+
/**
 * Bounds-checked reader for a value-taking flag. A bare `args[++i]` silently yields `undefined`
 * when the flag is the last token (e.g. a trailing `--decider-cmd`), which then becomes a broken
 * flag value; this reads the following token explicitly instead.
 *
 * When the value is absent, a usage error is written to stderr and the process exits with code 2.
 * The message goes to stderr unconditionally because takeCommonFlags can run before
 * `--output-format json` has been resolved (machine callers still see the non-zero exit).
 *
 * @param {string[]} args - the argument list being parsed
 * @param {number} i - index of the flag itself; the value is read from `i + 1`
 * @param {string} flag - flag name, used only in the error message
 * @returns {string} the token following the flag
 */
function flagValue(args, i, flag) {
    const next = args[i + 1];
    if (next !== undefined)
        return next;
    log(`${flag} requires a value (none provided)`); // stderr usage error
    process.exit(2);
    return next;
}
|
|
293
|
+
/**
 * Collect the true positionals of an argument list: tokens that are neither a flag nor the value
 * consumed by a known value-taking flag. This replaces the `args.find((a) => !a.startsWith("--"))`
 * idiom, which mistook a flag's value (the `1` in `--gate 1`, the `json` in `--output-format json`)
 * for the positional.
 *
 * @param {string[]} args - argument list to scan
 * @param {string[]} valueFlags - flags whose FOLLOWING token is their value and must be skipped
 * @returns {string[]} positionals in their original order
 */
function positionals(args, valueFlags) {
    const found = [];
    let idx = 0;
    while (idx < args.length) {
        const token = args[idx];
        if (valueFlags.includes(token)) {
            idx += 2; // step over the flag AND its value
        }
        else {
            // anything else starting with "-" is a (boolean) flag: step over just that token
            if (!token.startsWith("-"))
                found.push(token);
            idx += 1;
        }
    }
    return found;
}
|
|
313
|
+
/**
 * Split the flags shared by several commands out of an argument list.
 *
 * Recognized: `--on-unanswered <v>`, `--output-format <v>` / `--output-format=<v>`,
 * `--quiet`/`-q`, `--verbose`/`-V`, `--decider-cmd <v>`, `--decider-dir <v>`. Every other token is
 * passed through, in order, in `rest`.
 *
 * The output format is validated in BOTH spellings: an unknown value (`--output-format xml`,
 * `--output-format=xml`) is a usage error (stderr message, exit 2). Previously only the
 * space-separated spelling was validated and `--output-format=xml` leaked into `rest` as a stray
 * argument.
 *
 * @param {string[]} args - raw command arguments
 * @returns {{rest: string[], flags: {output: string, quiet: boolean, verbose: boolean,
 *   onUnanswered?: string, deciderCmd?: string, deciderDir?: string}}}
 */
function takeCommonFlags(args) {
    const OUTPUT_EQ = "--output-format=";
    const rest = [];
    const flags = { output: "text", quiet: false, verbose: false };
    // Single validation point for the output-format enum, shared by both spellings.
    const setOutput = (v) => {
        if (v !== "text" && v !== "json") {
            log(`--output-format must be "text" or "json" (got "${v}")`);
            process.exit(2);
        }
        flags.output = v;
    };
    for (let i = 0; i < args.length; i++) {
        const a = args[i];
        // `flagValue(args, i++, a)` reads args[i + 1] (bounds-checked) and then steps over it.
        if (a === "--on-unanswered")
            flags.onUnanswered = flagValue(args, i++, a);
        else if (a === "--output-format")
            setOutput(flagValue(args, i++, a));
        else if (a.startsWith(OUTPUT_EQ))
            setOutput(a.slice(OUTPUT_EQ.length));
        else if (a === "--quiet" || a === "-q")
            flags.quiet = true;
        else if (a === "--verbose" || a === "-V")
            flags.verbose = true;
        else if (a === "--decider-cmd")
            flags.deciderCmd = flagValue(args, i++, a);
        else if (a === "--decider-dir")
            flags.deciderDir = flagValue(args, i++, a);
        else
            rest.push(a);
    }
    return { rest, flags };
}
|
|
347
|
+
/**
 * Resolve the output/render plan for a command (unified output model).
 *
 * - `--output-format json`: json envelope only — no renderer, no footer, no color.
 * - `--quiet`: footer only.
 * - otherwise: renderer + footer. `skill` always streams live with progress ("show me what it
 *   did"); any other command is verdict-first and only goes live/per-tool under `--verbose`.
 *
 * Color is on only when stderr is a TTY and NO_COLOR is unset.
 *
 * @param {string} command - the command name
 * @param {{output: string, quiet: boolean, verbose: boolean}} flags - parsed common flags
 * @returns {{json: boolean, render: boolean, footer: boolean,
 *   plan: {live: boolean, progress: boolean, verbose: boolean, color: boolean}}}
 */
function resolveOutput(command, flags) {
    const color = process.stderr.isTTY === true && !process.env.NO_COLOR;
    const silentPlan = (withColor) => ({ live: false, progress: false, verbose: false, color: withColor });
    if (flags.output === "json") {
        return { json: true, render: false, footer: false, plan: silentPlan(false) };
    }
    if (flags.quiet) {
        return { json: false, render: false, footer: true, plan: silentPlan(color) };
    }
    const { verbose } = flags;
    const streaming = command === "skill" || verbose;
    return {
        json: false,
        render: true,
        footer: true,
        plan: { live: streaming, progress: streaming, verbose, color },
    };
}
|
|
361
|
+
/**
 * Resolve the effective on_unanswered policy for a command. This is the choke point BOTH `run` and
 * `skill` pass through, so removed/internal policy values are rejected here — they can't silently
 * degrade to `fail` (which would pass a no-gate run green under a bogus policy).
 *
 * Every rejection goes through `fail`, so under `--output-format json` the caller gets a JSON error
 * envelope. That includes `run --on-unanswered prompt`, which previously printed a stderr-only
 * message even in json mode.
 *
 * @param {string} command - the command name ("run" gets the deterministic defaults)
 * @param {{output: string, onUnanswered?: string}} flags - parsed common flags
 * @returns {string} "fail" | "prompt" | "first"
 */
function resolvePolicy(command, flags) {
    const json = flags.output === "json";
    const requested = flags.onUnanswered;
    // `external` (the removed stdio channel) → `--decider-dir`/`--decider-cmd` subsume it.
    if (requested === "external")
        fail(command, "usage", "--on-unanswered external was removed. Use --decider-dir <dir> (the in-band file channel for a driving agent) or --decider-cmd '<helper>'.", undefined, json);
    // The LLM decider's CLI spelling is --decider-llm; the raw policy value is rejected on the CLI to
    // keep deciders in the --decider-* family (the scenario-YAML spelling is on_unanswered: llm).
    if (requested === "llm")
        fail(command, "usage", '--on-unanswered llm is not a user flag. Use --decider-llm [--intent "<one line>"] to answer live questions with a model.', undefined, json);
    if (requested) {
        // Any OTHER unknown value (e.g. "banana") is rejected loudly rather than passed through.
        if (requested !== "fail" && requested !== "prompt" && requested !== "first")
            fail(command, "usage", `--on-unanswered must be fail|prompt|first (got "${requested}")`, "for a model/external decider use --decider-llm, --decider-dir, or --decider-cmd", json);
        // Same message as before, now routed through `fail` so json callers get an envelope.
        if (command === "run" && requested === "prompt")
            fail(command, "usage", "run rejects --on-unanswered prompt (would break determinism). Use fail|first.", undefined, json);
        return requested;
    }
    if (command === "run")
        return "fail"; // scenarios are reproducible regression tests
    // skill: adaptive — prompt if a human is at the TTY, else fail (CI/agent)
    return process.stdin.isTTY && !process.env.CI ? "prompt" : "fail";
}
|
|
389
|
+
/**
 * Resolve the external decider channel, if one was requested:
 *   `--decider-dir` → `fileChannel(dir)` (a file rendezvous; the driving agent answers in-band)
 *   `--decider-cmd` → `spawnChannel(cmd)` (a spawned helper)
 * Both keep the CLI's stdout FREE (the protocol is on the helper's pipes / on disk), so they
 * compose with `--output-format json`. Passing both flags is a usage error, and an exception
 * thrown by `fileChannel` is reported as a usage error carrying its message.
 *
 * @param {string} command - the command name, for error reporting
 * @param {{deciderDir?: string, deciderCmd?: string, output: string}} flags - parsed common flags
 * @returns the channel, or undefined when neither flag is set
 */
function resolveExternal(command, flags) {
    const wantsDir = flags.deciderDir != null;
    const wantsCmd = flags.deciderCmd != null;
    if (wantsDir && wantsCmd)
        fail(command, "usage", "--decider-dir conflicts with --decider-cmd (one terminal channel).", undefined, flags.output === "json");
    if (!wantsDir)
        return wantsCmd ? spawnChannel(flags.deciderCmd) : undefined;
    try {
        return fileChannel(flags.deciderDir);
    }
    catch (err) {
        return fail(command, "usage", String(err.message), undefined, flags.output === "json");
    }
}
|
|
406
|
+
/**
 * The single error exit used by commands and the top-level catch. Every category exits with code 2.
 *
 * @param {string} command - command name, forwarded to the JSON error envelope
 * @param {string} category - error category, forwarded to the JSON error envelope
 * @param {string} message - the error message
 * @param {string} [hint] - optional follow-up line
 * @param {boolean} json - true → one `jsonError(...)` envelope on stdout;
 *   false → the message (and the hint, if any) on stderr
 */
function fail(command, category, message, hint, json) {
    if (!json) {
        log(message);
        if (hint)
            log(hint);
    }
    else {
        out(jsonError(command, category, message, hint));
    }
    process.exit(2);
}
|
|
417
|
+
/**
 * Split a `--answer "<key>=<value>"` argument at its FIRST "=". Everything after it is the value,
 * so a choice may itself contain "=". A missing (null/undefined) argument is treated as "", and an
 * argument with no "=" yields an empty value.
 *
 * @param {string | null | undefined} s - the raw argument
 * @returns {[string, string]} `[key, value]`
 */
function splitEq(s) {
    const text = s ?? "";
    const eqAt = text.indexOf("=");
    if (eqAt < 0)
        return [text, ""];
    return [text.slice(0, eqAt), text.slice(eqAt + 1)];
}
|
|
422
|
+
/**
 * Load an `--answer-policy <yaml>` file into scripted answer rules. The file has the same shape as
 * a scenario `answers:` block: a bare list, or an `{answers: [...]}` document.
 *
 * Fails LOUD (via `fail`, a usage error) on a missing file, an unparseable file, a document that is
 * neither of the two accepted shapes, or any rule that does not satisfy the AnswerRule schema. A
 * malformed policy must NOT validate as "0 rules": the user would discover it only when a gate goes
 * unanswered mid-run.
 *
 * @param {string} command - command name, for error reporting
 * @param {string} path - path of the policy file
 * @param {boolean} json - whether errors are emitted as a JSON envelope
 * @returns the validated rules (`AnswerRule.safeParse(...).data` for each entry)
 */
function loadAnswerPolicy(command, path, json) {
    if (!existsSync(path))
        fail(command, "usage", `--answer-policy file not found: ${path}`, undefined, json);
    let parsed;
    try {
        parsed = parseYaml(readFileSync(path, "utf8"));
    }
    catch (e) {
        return fail(command, "usage", `cannot parse --answer-policy ${path}: ${String(e.message)}`, undefined, json);
    }
    // No `?? []` fallback here: it made an empty file, a scalar, or a document with a misspelled key
    // (e.g. `answer:`) load silently as zero rules. Only a real list — bare, or under `answers` —
    // is accepted; an explicit `answers: []` is still a valid (empty) policy.
    const rules = Array.isArray(parsed) ? parsed : parsed?.answers;
    if (!Array.isArray(rules))
        fail(command, "usage", `--answer-policy must be a list of rules (or an {answers: [...]} doc)`, undefined, json);
    // Validate EACH rule against the AnswerRule schema instead of a blind cast. A malformed rule
    // (non-object, wrong field types) must fail loud here, not pass as a rule that never matches.
    const validated = [];
    for (const [idx, raw] of rules.entries()) {
        const r = AnswerRule.safeParse(raw);
        if (!r.success)
            fail(command, "usage", `--answer-policy rule #${idx + 1} is malformed: ${r.error.issues.map((i) => `${i.path.join(".") || "(root)"} ${i.message}`).join("; ")}`, undefined, json);
        validated.push(r.data);
    }
    return validated;
}
|
|
451
|
+
/**
 * The per-scenario run lifecycle shared by `cmdRun` and `cmdSkill`. It owns ONLY the per-scenario
 * spine:
 *   1. build the renderer (when rendering) and print the start line (unless json or quiet),
 *   2. start the idle heartbeat (disabled under --output-format json OR an external channel),
 *   3. `executeScenario`,
 *   4. map an `UnansweredError` to `fail(..., "unanswered", ...)`; anything else is rethrown,
 *   5. stop the heartbeat and print the footer (when enabled).
 * The CALLER keeps everything that differs between commands: the external channel's create/close,
 * the `--output-format json` envelope, and the exit code.
 *
 * @param {object} p - `{ command, scenario, label, flags, policy, externalChannel, o, keep, extra }`,
 *   where `o` is the resolved output plan and `extra` is spread into the executeScenario options
 * @returns the value resolved by `executeScenario`
 */
async function runOneScenario(p) {
    const { command, scenario, label, flags, policy, externalChannel, o, keep, extra } = p;
    const renderer = o.render ? makeRenderer(o.plan) : undefined;
    if (!(o.json || flags.quiet))
        renderStart(label, scenario.fidelity, o.plan);
    const start = Date.now();
    let stopHeartbeat = () => { };
    if (!o.json && !externalChannel)
        stopHeartbeat = startHeartbeat(renderer, o.plan, start);
    const hooks = renderer ? [renderer] : [];
    let result;
    try {
        result = await executeScenario(scenario, { ...extra, onUnanswered: policy, externalChannel, hooks });
    }
    catch (err) {
        if (err instanceof UnansweredError) {
            // Report the channel that was supposed to answer; fall back to the policy name.
            let chan = policy;
            if (flags.deciderDir)
                chan = "decider-dir";
            else if (flags.deciderCmd)
                chan = "decider-cmd";
            // run names the scenario; skill is single
            const prefix = command === "run" ? `${scenario.name}: ` : "";
            fail(command, "unanswered", `${prefix}unanswered question (on_unanswered=${chan})`, err.hint, o.json);
        }
        throw err; // BoundaryError + generic → top-level catch (categorized there)
    }
    finally {
        stopHeartbeat();
    }
    // The footer (stderr) and the json envelope (stdout, emitted by the caller) are mutually
    // exclusive — resolveOutput makes `footer` false under --output-format json — so their relative
    // order never matters.
    if (o.footer)
        renderFooter(result, o.plan, { durationMs: Date.now() - start, renderer, keep });
    return result;
}
|
|
486
|
+
/**
 * `run <scenario.yaml | dir/>` — execute one scenario file, or every `*.yaml` / `*.yml` file directly
 * inside a directory (in sorted order), then exit 1 if any result has a failing assertion or
 * `result === "error"`, else 0.
 *
 * Usage problems (missing target, extra arguments, missing path, a directory with no scenario
 * files, a scenario that sets `on_unanswered: prompt`) are reported through `fail(..., "usage", ...)`.
 *
 * @param {string[]} rawArgs Arguments following the `run` command word.
 * @returns {Promise<void>}
 */
async function cmdRun(rawArgs) {
    if (hasHelp(rawArgs))
        return void log(RUN_HELP);
    const { rest: args, flags } = takeCommonFlags(rawArgs);
    const isJson = flags.output === "json";
    const target = args[0];
    if (!target)
        fail("run", "usage", "usage: run <scenario.yaml | dir/>", undefined, isJson);
    // `takeCommonFlags` strips known flags; `run` takes exactly one positional (a scenario file or a
    // dir), so anything left over is unexpected. Reject it LOUDLY instead of silently dropping it —
    // e.g. `--fidelity microvm` was a silent no-op (fidelity comes from the scenario's `fidelity:`
    // field, not a flag). Runs before existsSync so the message is precise even for a bogus path.
    const extra = args.slice(1);
    if (extra.length)
        fail("run", "usage", `unexpected argument(s): ${extra.join(" ")} — \`run\` takes one <scenario.yaml | dir/> plus common flags. Fidelity is set by the scenario's \`fidelity:\` field, not a flag.`, undefined, isJson);
    // A non-existent path would otherwise surface as a raw ENOENT instead of a clean usage message.
    if (!existsSync(target))
        fail("run", "usage", `scenario path not found: ${target}`, undefined, isJson);
    const files = statSync(target).isDirectory()
        ? readdirSync(target)
            .filter((f) => f.endsWith(".yaml") || f.endsWith(".yml"))
            .sort() // deterministic batch order — readdirSync is FS/OS-dependent
            .map((f) => join(target, f))
        : [target];
    // An empty batch would skip the loop and exit 0, so an empty or mistyped scenario directory
    // would read as a passing run. Fail loudly instead (only a directory can produce an empty list).
    if (files.length === 0)
        fail("run", "usage", `no scenario files (*.yaml | *.yml) found in directory: ${target}`, undefined, isJson);
    const externalChannel = resolveExternal("run", flags); // created once; reused across scenarios
    const policy = externalChannel ? "fail" : resolvePolicy("run", flags);
    const o = resolveOutput("run", flags);
    const results = [];
    try {
        for (let i = 0; i < files.length; i++) {
            const scenario = parseScenarioFile(files[i]);
            // The CLI flag guard (resolvePolicy) rejects --on-unanswered prompt on `run`, but a committed
            // scenario could smuggle it via its YAML and silently block/hang in non-TTY CI. Reject it here too.
            if (scenario.on_unanswered === "prompt")
                fail("run", "usage", `scenario "${scenario.name}" sets on_unanswered: prompt — rejected on \`run\` (breaks determinism / hangs in CI). Use fail|first, or --decider-dir/--decider-cmd.`, undefined, o.json);
            const label = files.length > 1 ? `[${i + 1}/${files.length}] ${scenario.name}` : scenario.name;
            results.push(await runOneScenario({ command: "run", scenario, label, flags, policy, externalChannel, o }));
        }
    }
    finally {
        externalChannel?.close?.(); // ONE channel reused across the loop — close after ALL scenarios (not per-run)
    }
    // All channels keep stdout free → the normal output path (envelope under --output-format json, nothing
    // otherwise). No terminal {type:"result"} line — `--decider-cmd`/`--decider-dir` compose with json.
    if (o.json)
        out(jsonEnvelope("run", results));
    const failed = results.filter((r) => r.assertions.some((a) => !a.pass) || r.result === "error");
    process.exit(failed.length > 0 ? 1 : 0);
}
|
|
534
|
+
/**
 * `skill <plugin-folder> "<prompt>" [flags]` — build an inline scenario from command-line flags and
 * run it once through `runOneScenario`.
 *
 * Parses the skill-specific flags, validates positionals, optionally reads the prompt from
 * `--prompt-file` and answer rules from `--answer-policy`, builds the session and scenario, and then
 * either prints the resolved inputs (`--dry-run`) or executes the scenario. On execution the process
 * exits 1 when any assertion failed or `result.result === "error"`, else 0.
 *
 * @param {string[]} rawArgs Arguments following the `skill` command word.
 * @returns {Promise<void>}
 */
async function cmdSkill(rawArgs) {
    if (hasHelp(rawArgs)) {
        log(SKILL_HELP);
        return;
    }
    const { rest: args, flags } = takeCommonFlags(rawArgs);
    const isJson = flags.output === "json";
    const FIDELITIES = ["protocol", "container", "microvm", "hostloop", "cowork"];

    // Repeatable flags accumulate into lists; everything else is a scalar.
    const positionalArgs = [];
    const answers = [];
    const extraPlugins = [];
    const marketplaces = [];
    const enables = [];
    const uploads = [];
    const folders = [];
    let fidelity = "container";
    let model;
    let promptFile;
    let sessionId;
    let answerPolicy;
    let intent;
    let deciderLlm = false;
    let resume = false;
    let dryRun = false;
    let keep = false;

    for (let i = 0; i < args.length; i++) {
        const a = args[i];
        switch (a) {
            case "--fidelity":
                fidelity = flagValue(args, i++, a); // #58: bounds-checked
                // #6: validated at parse time so a bad value is a `usage` error rather than a later
                // Scenario.parse throw that the top-level catch would report as `internal`.
                if (!FIDELITIES.includes(fidelity)) {
                    fail("skill", "usage", `--fidelity must be one of ${FIDELITIES.join("|")} (got "${fidelity}")`, undefined, isJson);
                }
                break;
            case "--model":
                model = flagValue(args, i++, a);
                break;
            case "--prompt-file":
                promptFile = flagValue(args, i++, a);
                break;
            case "--upload":
                uploads.push(flagValue(args, i++, a));
                break;
            case "--folder":
                folders.push(flagValue(args, i++, a));
                break;
            case "--session-id":
                sessionId = flagValue(args, i++, a);
                break;
            case "--resume":
                resume = true;
                break;
            case "--decider-llm":
                deciderLlm = true;
                break;
            case "--intent":
                intent = flagValue(args, i++, a);
                break;
            case "--dry-run":
                dryRun = true;
                break;
            case "--keep":
                keep = true;
                break;
            case "--plugin":
                extraPlugins.push(flagValue(args, i++, a));
                break;
            case "--marketplace":
                marketplaces.push(flagValue(args, i++, a));
                break;
            case "--enable":
                enables.push(flagValue(args, i++, a));
                break;
            case "--answer": {
                const [q, choose] = splitEq(flagValue(args, i++, a));
                answers.push({ when_question: q, choose });
                break;
            }
            case "--answer-policy":
                answerPolicy = flagValue(args, i++, a);
                break;
            default:
                positionalArgs.push(a);
        }
    }

    if (resume && !sessionId) {
        fail("skill", "usage", "--resume requires --session-id <id> (the session to resume)", undefined, isJson);
    }

    // #5: extra positionals are rejected so an unquoted multi-word prompt cannot silently lose words.
    // With --prompt-file only the plugin folder is positional (1); otherwise folder + prompt (2).
    const usingPromptFile = promptFile !== undefined;
    const maxPositional = usingPromptFile ? 1 : 2;
    if (positionalArgs.length > maxPositional) {
        const surplus = positionalArgs.slice(maxPositional).join(" ");
        const explanation = usingPromptFile
            ? "with --prompt-file, skill takes at most one positional (the plugin folder)"
            : 'skill takes <plugin-folder> "<prompt>" — quote a prompt that contains spaces';
        fail("skill", "usage", `unexpected extra argument(s): ${surplus} — ${explanation}`, undefined, isJson);
    }

    // --answer-policy <yaml>: rules loaded from the file are appended after any --answer rules.
    if (answerPolicy) {
        answers.push(...loadAnswerPolicy("skill", answerPolicy, isJson));
    }

    // --prompt-file supplies the prompt verbatim from disk (no shell parsing of $, backticks, newlines).
    let filePrompt;
    if (usingPromptFile) {
        if (!existsSync(promptFile)) {
            fail("skill", "usage", `--prompt-file not found: ${promptFile}`, undefined, isJson);
        }
        try {
            filePrompt = readFileSync(promptFile, "utf8");
        }
        catch (e) {
            fail("skill", "usage", `cannot read --prompt-file ${promptFile}: ${String(e.message)}`, undefined, isJson);
        }
        if (!filePrompt.trim()) {
            fail("skill", "usage", `--prompt-file is empty: ${promptFile}`, undefined, isJson);
        }
    }

    // With a file prompt, positional[0] is the folder. Without one, the folder is only present when
    // two positionals were given; a single positional is the inline prompt.
    const hasFilePrompt = filePrompt !== undefined;
    const twoPositionals = positionalArgs.length >= 2;
    const positionalSource = hasFilePrompt ? positionalArgs.length >= 1 : twoPositionals;
    const haveSource = positionalSource || marketplaces.length || extraPlugins.length;
    const folder = hasFilePrompt || twoPositionals ? positionalArgs[0] : undefined;
    const prompt = filePrompt ?? positionalArgs[twoPositionals ? 1 : 0];
    if (!haveSource || !prompt) {
        fail("skill", "usage", 'usage: cowork-harness skill <plugin-folder> "<prompt>" [--prompt-file <path>] [--marketplace <dir> --enable name@mkt] [--plugin <dir>]… [--fidelity …] [--answer "q=choice"] (skill --help for all flags)', undefined, isJson);
    }

    const localPlugins = folder ? [folder, ...extraPlugins] : [...extraPlugins];
    // Relative paths of the inline session are resolved against cwd via resolveSessionPaths.
    const inlineSession = loadSession({
        model,
        permission_parity: "cowork",
        plugins: { local_plugins: localPlugins, local_marketplaces: marketplaces, enabled: enables },
        uploads, // --upload <file>
        folders: folders.map((from) => ({ from, mode: "rw" })), // --folder <dir>, mounted rw
    });
    const session = resolveSessionPaths(inlineSession, process.cwd());

    // The run is named after the BASENAME of the first available source, sanitised and capped at 40 chars.
    const rawSource = folder ?? marketplaces[0] ?? extraPlugins[0] ?? "test";
    const sourceName = basename(rawSource.replace(/\/+$/, "")) || "test";
    const slug = sourceName
        .replace(/[^a-zA-Z0-9]+/g, "-")
        .replace(/^-+|-+$/g, "")
        .slice(0, 40);
    const scenario = Scenario.parse({
        name: `skill-${slug}`,
        baseline: "latest",
        session: "(inline)",
        fidelity,
        prompt,
        answers,
        assert: [{ result: "success" }],
    });

    if (dryRun) {
        out(JSON.stringify({ fidelity, prompt, localPlugins, marketplaces, enabled: enables, answers }, null, 2));
        return;
    }

    const externalChannel = resolveExternal("skill", flags);
    // `--decider-llm` is the only user-facing switch for the LLM terminal; it maps to the `llm` policy.
    const useLlm = deciderLlm;
    if (useLlm && externalChannel) {
        fail("skill", "usage", "--decider-llm conflicts with --decider-cmd/--decider-dir (two terminals).", undefined, isJson);
    }
    // Base policy: an external channel forces "fail", the LLM decider forces "llm".
    let policy;
    if (externalChannel) {
        policy = "fail";
    }
    else if (useLlm) {
        policy = "llm";
    }
    else {
        policy = resolvePolicy("skill", flags);
    }
    const o = resolveOutput("skill", flags);

    let result;
    try {
        const extra = {
            session,
            sessionId,
            resume,
            llmIntent: intent,
            // An external helper / driving agent answers the gates → flagged as not reproducible (M4; #48).
            nonDeterministicHint: flags.deciderDir != null || flags.deciderCmd != null,
        };
        result = await runOneScenario({
            command: "skill",
            scenario,
            label: scenario.name,
            flags,
            policy,
            externalChannel,
            o,
            keep,
            extra,
        });
    }
    finally {
        externalChannel?.close?.();
    }

    const anyAssertionFailed = result.assertions.some((a) => !a.pass);
    // The json envelope is the only stdout output; the footer was already emitted by runOneScenario.
    if (o.json) {
        out(jsonEnvelope("skill", [result]));
    }
    process.exit(anyAssertionFailed || result.result === "error" ? 1 : 0);
}
|
|
708
|
+
/**
 * `vm <init|status|delete|prune> [baseline]` — manage the VM instance derived from a baseline.
 *
 * The subcommand is validated BEFORE the baseline is loaded, so an invalid or missing subcommand
 * always prints the usage line and exits 2, even when the baseline cannot be loaded.
 *
 * @param {string[]} args `[subcommand, baselineName?]`; the baseline defaults to "latest".
 * @returns {void}
 */
function cmdVm(args) {
    const sub = args[0];
    const SUBCOMMANDS = ["init", "status", "delete", "prune"];
    if (!SUBCOMMANDS.includes(sub)) {
        // #11: an invalid/absent subcommand must exit non-zero — a bare `log` exits 0, so a CI script
        // running `vm typo` would read it as success.
        log("usage: vm <init|status|delete|prune>");
        process.exit(2);
        return; // only reached if process.exit is stubbed; never fall through to the baseline load
    }
    const baseline = loadBaseline(args[1] ?? "latest");
    // #62/#63: the instance name is derived from the config hash (see lima.ts instanceName) — a config
    // change yields a new name, so a stale VM is never silently reused.
    const instance = instanceName(baseline);
    switch (sub) {
        case "status":
            log(`${instance}: ${vmStatus(instance)}`);
            break;
        case "init": {
            const { status } = vmInit(baseline);
            log(`${instance}: ${status}`);
            break;
        }
        case "delete":
            vmDelete(instance);
            log(`${instance} deleted`);
            break;
        case "prune": {
            const pruned = vmPrune(instance);
            log(pruned.length ? `pruned ${pruned.length} orphaned VM(s): ${pruned.join(", ")}` : `no orphaned VMs (current: ${instance})`);
            break;
        }
    }
}
|
|
735
|
+
/**
 * `boundary [baseline] [--session <file>]` — run the boundary checks for a baseline, print the
 * formatted results, and exit 0 when every check passed, else 1.
 *
 * With `--session <file>`, that session's `egress.extra_allow` / `egress.unrestricted` are passed
 * to `runBoundaryChecks` alongside the baseline. A `--session` flag with no value, or one whose
 * file does not exist, prints a message and exits 2.
 *
 * @param {string[]} args Arguments following the `boundary` command word.
 * @returns {void}
 */
function cmdBoundary(args) {
    const si = args.indexOf("--session");
    // #12: a trailing `--session` with no value silently ran the boundary check WITHOUT the session's
    // egress additions. Bounds-check it.
    if (si >= 0 && args[si + 1] === undefined) {
        log("--session requires a value (path to a session YAML)");
        process.exit(2);
        return; // only reached if process.exit is stubbed
    }
    const sessionPath = si >= 0 ? args[si + 1] : undefined;
    // A missing file would otherwise surface as a raw ENOENT from readFileSync, and an empty-string
    // path would silently skip the session. Both are argument mistakes: report them cleanly.
    if (sessionPath !== undefined && !existsSync(sessionPath)) {
        log(`--session file not found: ${sessionPath}`);
        process.exit(2);
        return; // only reached if process.exit is stubbed
    }
    // Everything that is neither the flag nor its value is positional; the first one names the baseline.
    const positional = args.filter((a, i) => a !== "--session" && args[i - 1] !== "--session");
    const baseline = loadBaseline(positional[0] ?? "latest");
    let sessionEgress;
    if (sessionPath !== undefined) {
        const s = loadSession(parseYaml(readFileSync(sessionPath, "utf8")));
        sessionEgress = { extraAllow: s.egress.extra_allow, unrestricted: s.egress.unrestricted };
    }
    const results = runBoundaryChecks(baseline, sessionEgress);
    log(formatBoundary(results));
    process.exit(results.every((r) => r.pass) ? 0 : 1);
}
|
|
757
|
+
/**
 * `sync [--diff] [--allow-empty]` — merge the values returned by `sync()` onto the "latest"
 * baseline and write the result to `desktop-<appVersion>.json` in BASELINES_DIR.
 *
 * Refuses (exit 1) when `appVersion`/`agentVersion` are empty, or when `allowDomains` is empty and
 * `--allow-empty` was not passed. With `--diff`, nothing is written: the merged baseline is diffed
 * against the existing file for that version instead.
 *
 * @param {string[]} args Arguments following the `sync` command word.
 * @returns {void}
 * @throws {Error} when no base baseline can be loaded (the underlying error is attached as `cause`).
 */
function cmdSync(args) {
    const allowEmpty = args.includes("--allow-empty");
    const diffOnly = args.includes("--diff");
    const res = sync();
    // #37 — refuse to write a baseline with empty version fields. An empty appVersion would produce
    // `desktop-.json` (invalid filename); an empty agentVersion means resolveAgentBinary will fail.
    const versionErrors = [];
    if (!res.appVersion)
        versionErrors.push("appVersion (Desktop not found or Info.plist unreadable — install/open Claude Desktop)");
    if (!res.agentVersion)
        versionErrors.push("agentVersion (.sdk-version missing — open Cowork once to stage the agent binary)");
    if (versionErrors.length) {
        log("ERROR: sync could not resolve required version fields — refusing to write baseline:");
        for (const e of versionErrors)
            log(`  - ${e}`);
        log("Fix the above, then re-run `cowork-harness sync`.");
        process.exit(1);
    }
    // #41 — refuse to write a baseline with an empty allowlist unless --allow-empty is passed.
    // An empty allowDomains = default-deny on ALL egress, which silently breaks every scenario.
    if (res.allowDomains.length === 0) {
        log("WARNING: sync produced an empty allowDomains list (asar domain regex matched nothing — asar layout moved).");
        if (!allowEmpty) {
            log("Refusing to write baseline with allowDomains: []. Fix the regex in cowork-sync.ts,");
            log("or hand-edit network.allowDomains in an existing baseline, then re-run.");
            log("Pass --allow-empty to force-write anyway (use only if you understand the egress impact).");
            process.exit(1);
        }
        log("--allow-empty passed: proceeding with empty allowDomains (egress will be default-deny for ALL domains).");
    }
    const baselinePath = join(BASELINES_DIR, `desktop-${res.appVersion}.json`);
    let base;
    try {
        // JSON round-trip gives a detached deep copy of the loaded baseline to merge onto.
        base = JSON.parse(JSON.stringify(loadBaseline("latest")));
    }
    catch (e) {
        // Keep the original failure reachable: an unreadable/invalid baseline is not the same as a missing one.
        throw new Error("No base baseline in baselines/. Commit one (e.g. desktop-<ver>.json) before sync can merge onto it.", { cause: e });
    }
    // #38 — recompute agentBinary.stagedPath when agentVersion changes: swap the version segment of the
    // existing path, or fall back to the canonical Desktop path when the old layout is not recognised.
    // The derived path is only VERIFIED (warning), not enforced — the binary may not be staged yet.
    const baseAgentBinary = (base.agentBinary ?? {});
    const oldStagedPath = baseAgentBinary.stagedPath ?? "";
    // Gate on whether the regex MATCHED (not result==input): an unchanged-version re-sync yields
    // result==input and must NOT warn.
    const versionRe = /claude-code-vm\/[^/]+\/claude$/;
    let derivedStagedPath;
    if (versionRe.test(oldStagedPath)) {
        derivedStagedPath = oldStagedPath.replace(versionRe, `claude-code-vm/${res.agentVersion}/claude`);
    }
    else {
        derivedStagedPath = `~/Library/Application Support/Claude/claude-code-vm/${res.agentVersion}/claude`;
        if (oldStagedPath)
            log(`WARNING: agentBinary.stagedPath layout was unexpected ("${oldStagedPath}") — rewrote to the canonical path for ${res.agentVersion}.`);
    }
    // Expand a leading `~` with $HOME for the existence check only; the stored path keeps the `~`.
    const resolvedDerived = derivedStagedPath.replace(/^~(?=$|\/)/, join(process.env.HOME ?? "~"));
    if (!existsSync(resolvedDerived)) {
        log(`WARNING: derived agentBinary.stagedPath does not exist on this machine: ${derivedStagedPath}`);
        log(`  (The new agentVersion is ${res.agentVersion}. Open Cowork once to stage the binary, then re-run sync.)`);
        log(`  resolveAgentBinary will fail until the file is present or COWORK_AGENT_BINARY is set.`);
    }
    const nextAgentBinary = { ...baseAgentBinary, stagedPath: derivedStagedPath };
    // #39 — re-sync gate states from `res.gates` when available; otherwise carry the base gates over.
    const baseProvenance = (base.provenance ?? {});
    const baseGates = (baseProvenance.gates ?? {});
    // A legacy prose gate string starts with its state token ("on(force) …" / "off(…) …"). Only that
    // LEADING token decides on/off: an unanchored search would read "off(force)" — or any note that
    // merely contains "on", e.g. "concurrency" — as ON and corrupt drift detection.
    const legacyOnRe = /^\s*(?:on|true|force)\b/i;
    let nextGates = baseGates;
    if (res.gates) {
        nextGates = {};
        // Preserve authored $comment / any other `$`-prefixed keys from the base.
        for (const [k, v] of Object.entries(baseGates))
            if (k.startsWith("$"))
                nextGates[k] = v;
        for (const g of Object.values(res.gates)) {
            const key = `${g.name}:${g.id}`;
            const prev = baseGates[key];
            const prevOn = typeof prev === "string" ? legacyOnRe.test(prev) : !!prev?.on;
            // Preserve the human annotation: from a prose string, drop the leading "on(force) " token; from
            // a structured entry, keep its `note`.
            const prevNote = typeof prev === "string"
                ? prev.replace(/^(on|off)\([^)]*\)\s*/i, "").trim()
                : (prev?.note ?? "").trim();
            nextGates[key] = { on: g.on, source: g.source, value: g.value, ...(prevNote ? { note: prevNote } : {}) };
            if (prev !== undefined && prevOn !== g.on) {
                log(`WARNING: gate ${key} DRIFTED: ${prevOn ? "on" : "off"} → ${g.on ? "on" : "off"} (source=${g.source}). Loop/dispatch behavior may change — review carefully.`);
            }
        }
        // A gate present in the base but absent from this sync would otherwise vanish from provenance.
        // Carry it forward from the base and flag it.
        for (const [k, v] of Object.entries(baseGates)) {
            if (k.startsWith("$") || k in nextGates)
                continue;
            nextGates[k] = v;
            log(`WARNING: gate ${k} not present in fcache this sync — carried forward from base (may be stale).`);
        }
        log(`gates: re-synced ${Object.values(res.gates).length} pinned gate states from fcache.`);
    }
    else {
        log("WARNING: fcache unreadable — provenance.gates carried over from base (may be stale).");
    }
    const prevFingerprint = baseProvenance.asarFingerprint;
    if (prevFingerprint && prevFingerprint !== res.asarFingerprint) {
        log(`note: asarFingerprint changed (${prevFingerprint} → ${res.asarFingerprint}); gates re-synced above.`);
    }
    const next = {
        ...base,
        baselineVersion: 1,
        appVersion: res.appVersion,
        capturedAt: new Date().toISOString().slice(0, 10),
        agentVersion: res.agentVersion,
        agentBinary: nextAgentBinary,
        network: { ...base.network, mode: res.networkMode ?? "gvisor", allowKind: "allowlist", allowDomains: res.allowDomains },
        requireFullVmSandbox: res.requireFullVmSandbox,
        provenance: { ...baseProvenance, gates: nextGates, asarFingerprint: res.asarFingerprint },
    };
    if (diffOnly) {
        try {
            const prev = JSON.parse(readFileSync(baselinePath, "utf8"));
            log("=== diff vs committed baseline ===");
            diff(prev, next, "");
        }
        catch {
            log(`(no committed ${baselinePath} yet — this would be the first)`);
        }
    }
    if (res.unknownDeltas.length) {
        log("\n⚠ unknown deltas (extend src/sync/cowork-sync.ts):");
        for (const d of res.unknownDeltas)
            log("  - " + d);
    }
    if (!diffOnly) {
        mkdirSync(BASELINES_DIR, { recursive: true });
        writeFileSync(baselinePath, JSON.stringify(next, null, 2));
        log(`wrote ${baselinePath}`);
    }
}
|
|
897
|
+
/**
 * `list` — print the name of every `*.json` file in BASELINES_DIR, one per line, in sorted order.
 *
 * @returns {void}
 */
function cmdList() {
    const baselineFiles = readdirSync(BASELINES_DIR)
        .filter((f) => f.endsWith(".json"))
        .sort(); // deterministic listing — readdirSync order is FS/OS-dependent
    for (const f of baselineFiles)
        out(f);
}
|
|
901
|
+
/**
 * `decide` — dry-run a decider against ONE sample question so a wire-protocol or rule mistake shows
 * up immediately instead of partway through a live run.
 *
 * Exactly one path is exercised:
 *  - `--decider-llm`: ask the LLM decider (optionally steered by `--intent`);
 *  - `--decider-cmd <helper>`: spawn the helper, log the request line it was sent and its answer;
 *  - otherwise: evaluate the `--answer` / `--answer-policy` rules and report whether one matched
 *    (exit 1 when none did).
 * Any error thrown by a decider is reported and the process exits 1.
 *
 * @param {string[]} args Arguments following the `decide` command word.
 * @returns {Promise<void>}
 */
async function cmdDecide(args) {
    const json = isJsonOutput(args);
    let question = "Confirm the detected stage before proceeding?";
    let deciderCmd;
    let policy;
    let intent;
    let deciderLlm = false;
    const options = [];
    const rules = [];

    for (let i = 0; i < args.length; i++) {
        const a = args[i];
        switch (a) {
            case "--question":
                question = flagValue(args, i++, a); // #58: bounds-checked
                break;
            case "--option":
                options.push(flagValue(args, i++, a));
                break;
            case "--decider-cmd":
                deciderCmd = flagValue(args, i++, a);
                break;
            case "--decider-llm":
                deciderLlm = true;
                break;
            case "--intent":
                intent = flagValue(args, i++, a);
                break;
            case "--answer-policy":
                policy = flagValue(args, i++, a);
                break;
            case "--answer": {
                const [q, choose] = splitEq(flagValue(args, i++, a));
                rules.push({ when_question: q, choose });
                break;
            }
            default:
                // anything else is ignored by this parser
                break;
        }
    }

    // #13: the file-rendezvous channel is not implemented here — say so instead of ignoring the flag.
    if (args.includes("--decider-dir")) {
        fail("decide", "usage", "decide does not support --decider-dir (the file-rendezvous channel); validate that path by running a scenario with --decider-dir. Use --decider-cmd '<helper>' to check a spawned helper here.", undefined, json);
    }
    // #14: with both terminals set the LLM branch below would win and the helper would never run.
    if (deciderLlm && deciderCmd) {
        fail("decide", "usage", "--decider-llm conflicts with --decider-cmd (one terminal decider).", undefined, json);
    }
    if (policy) {
        rules.push(...loadAnswerPolicy("decide", policy, json));
    }

    const opts = options.length ? options : ["Looks right", "Change it", "Correct or add data"];
    const sampleQuestion = { question, options: opts.map((label) => ({ label })) };
    const req = { id: "check", kind: "question", questions: [sampleQuestion] };
    const ctx = { task: "", transcript: () => "(sample transcript context)", toolLog: () => [], runId: "decide-check" };
    log(`sample question: "${question}"  options: [${opts.join(" | ")}]`);

    // One reporter for every outcome: a json line on stdout, or a human-readable line via `log`.
    const report = (payload, text) => {
        if (json) {
            out(JSON.stringify({ tool: "cowork-harness", command: "decide", ...payload }));
        }
        else {
            log(text);
        }
    };

    try {
        if (deciderLlm) {
            const decision = await new LlmDecider(claudeCliComplete, intent).decide(req, ctx);
            const answer = decision.response.answers?.[question];
            report({ ok: true, answer, by: "llm" }, `✓ LLM decider answered: "${question}" → "${answer}" (non-deterministic)`);
            return;
        }
        if (deciderCmd) {
            const inner = spawnChannel(deciderCmd);
            // Wrap the spawned channel so the last line written to the helper can be shown afterwards.
            let sent = "";
            const channel = {
                write: (l) => {
                    sent = l;
                    return inner.write(l);
                },
                readLine: () => inner.readLine(),
                close: () => inner.close?.(),
            };
            try {
                const decision = await new ExternalDecider(channel).decide(req, ctx);
                const answer = decision.response.answers?.[question];
                log(`helper received: ${sent}`);
                report({ ok: true, answer }, `✓ helper answered: "${question}" → "${answer}"`);
            }
            finally {
                channel.close();
            }
            return;
        }
        const decision = await new ScriptedDecider(rules).decide(req, ctx);
        if (decision === ABSTAIN) {
            report({ ok: false, matched: false }, `✗ no rule matched — this question would fall to --on-unanswered (add an --answer/--answer-policy rule)`);
            process.exit(1);
        }
        const answer = decision.response.answers?.[question];
        report({ ok: true, matched: true, answer }, `✓ rule matched: "${question}" → "${answer}"`);
    }
    catch (e) {
        if (json) {
            out(jsonError("decide", "runtime", String(e.message)));
        }
        else {
            log(`✗ decider error: ${String(e.message)}`);
        }
        process.exit(1);
    }
}
|
|
1001
|
+
/**
 * `gates <dir> [--follow]` — the gate stream for the in-band `--decider-dir` path. Every line that
 * `streamGates` produces for `<dir>` is written to stdout; without `--follow` the stream is
 * requested in one-shot mode (`once: true`).
 *
 * The directory is the first positional argument. The value of `--output-format` is skipped, so
 * `gates --output-format json` reports the missing directory instead of streaming a directory
 * named `json`.
 *
 * @param {string[]} args Arguments following the `gates` command word.
 * @returns {Promise<void>}
 */
async function cmdGates(args) {
    const follow = args.includes("--follow");
    const dir = positionals(args, ["--output-format"])[0];
    if (!dir)
        return void fail("gates", "usage", "usage: gates <dir> [--follow]", undefined, isJsonOutput(args));
    await streamGates(dir, (line) => out(line), { once: !follow });
}
|
|
1011
|
+
/**
 * `answer <dir> --gate <N> (--choose <label> | --answer "<q>=<label>"…)` — write the answer for
 * gate N in `<dir>` via `answerGate`.
 *
 * `--answer` pairs are used verbatim as `{question: label}`; they take precedence over `--choose`.
 * With `--choose` alone, the gate is read back and the label is keyed by its first question's
 * `question` (falling back to `header`, then to the empty string).
 *
 * `--gate` must be a positive integer: values such as `0`, `-2`, `1.5` or `abc` are rejected as a
 * usage error before any gate file is read or written.
 *
 * @param {string[]} args Arguments following the `answer` command word.
 * @returns {void}
 */
function cmdAnswer(args) {
    const json = isJsonOutput(args);
    // #15: skip flag values so `answer --gate 1 --choose Yes <dir>` doesn't read `1` as the directory.
    const dir = positionals(args, ["--gate", "--choose", "--answer", "--output-format"])[0];
    let gateRaw;
    let choose;
    const pairs = [];
    for (let i = 0; i < args.length; i++) {
        const a = args[i];
        if (a === "--gate")
            gateRaw = flagValue(args, i++, a); // #58: bounds-checked
        else if (a === "--choose")
            choose = flagValue(args, i++, a);
        else if (a === "--answer") {
            const [q, label] = splitEq(flagValue(args, i++, a));
            pairs.push({ q, label });
        }
    }
    if (!dir || gateRaw === undefined)
        return void fail("answer", "usage", 'usage: answer <dir> --gate <N> (--choose <label> | --answer "<q>=<label>")', undefined, json);
    const seq = Number(gateRaw);
    // Number() alone accepts "-2", "1.5", "Infinity"; none of those can name a gate.
    if (!Number.isInteger(seq) || seq < 1)
        return void fail("answer", "usage", `--gate must be a positive integer (got "${gateRaw}")`, undefined, json);
    const answers = {};
    if (pairs.length)
        for (const p of pairs)
            answers[p.q] = p.label;
    else if (choose) {
        try {
            const g = readGate(dir, seq);
            answers[g.questions?.[0]?.question ?? g.questions?.[0]?.header ?? ""] = choose;
        }
        catch (e) {
            return void fail("answer", "usage", `cannot read gate ${seq} in ${dir}: ${String(e.message)}`, undefined, json);
        }
    }
    else
        return void fail("answer", "usage", 'answer needs --choose <label> or --answer "<q>=<label>"', undefined, json);
    answerGate(dir, seq, answers);
    if (json)
        out(JSON.stringify({ tool: "cowork-harness", command: "answer", ok: true, gate: seq, answers }));
    else
        log(`✓ answered gate ${seq}: ${JSON.stringify(answers)}`);
}
|
|
1054
|
+
/**
 * `trace <run-id | run-dir | events.jsonl> [--tools | --gates] [--output-format json]` — print a
 * trace built from the resolved events file.
 *
 * With `--gates` the rows come from `buildGateTrace`; otherwise from `buildTrace` (which receives
 * the `--tools` flag). JSON mode prints one envelope object; text mode prints the formatted rows.
 *
 * @param {string[]} args CLI arguments following the `trace` command name.
 * @returns {void}
 */
function cmdTrace(args) {
    const json = isJsonOutput(args);
    const tools = args.includes("--tools");
    const gates = args.includes("--gates");
    // #16: skip the `--output-format` value so `trace --output-format json` doesn't try to trace a run
    // named `json` instead of reporting the missing target.
    const target = positionals(args, ["--output-format"])[0];
    // Return explicitly (like the other command guards) so a missing target can never fall through
    // to `resolveEventsFile(undefined)`, even if `fail` were to return.
    if (!target)
        return void fail("trace", "usage", "usage: trace <run-id | run-dir | events.jsonl> [--tools | --gates] [--output-format json]", undefined, json);
    let file;
    try {
        file = resolveEventsFile(target);
    }
    catch (e) {
        return void fail("trace", "usage", String(e.message), undefined, json);
    }
    if (gates) {
        // --gates: question → injected answer → delivered result, the full gate lifecycle in one command (Part 4).
        const rows = buildGateTrace(file);
        if (json)
            out(JSON.stringify({ tool: "cowork-harness", command: "trace", file, gates: rows }));
        else
            out(formatGateTrace(rows));
        return;
    }
    const rows = buildTrace(file, { tools });
    if (json)
        out(JSON.stringify({ tool: "cowork-harness", command: "trace", file, rows }));
    else
        out(formatTrace(rows));
}
|
|
1085
|
+
/**
 * Log each top-level key whose JSON serialization differs between `a` and `b`.
 *
 * The comparison is one level deep: values are compared as `JSON.stringify` strings, and a key
 * missing on one side is rendered as `undefined`. Either argument may be null/undefined.
 *
 * @param {object | null | undefined} a Left-hand value.
 * @param {object | null | undefined} b Right-hand value.
 * @param {string} path Prefix placed in front of each reported key.
 * @returns {void}
 */
function diff(a, b, path) {
    const leftKeys = Object.keys(a ?? {});
    const rightKeys = Object.keys(b ?? {});
    // A Set keeps first-seen order: keys of `a` first, then keys only `b` has.
    const names = new Set(leftKeys.concat(rightKeys));
    names.forEach((name) => {
        const before = JSON.stringify(a?.[name]);
        const after = JSON.stringify(b?.[name]);
        if (before === after) {
            return;
        }
        log(` ${path}${name}: ${before} -> ${after}`);
    });
}
|
|
1094
|
+
// Last-resort handler for anything `main()` rejects with: known error classes are reported through
// `fail`, everything else is reported as an internal error and the process exits with code 2.
main().catch((err) => {
    const argv = process.argv.slice(2);
    const command = process.argv[2] ?? "";
    const json = isJsonOutput(argv);
    // NOTE(review): `fail` presumably terminates the process, which is why these checks are not
    // chained with `else` — confirm against its definition.
    if (err instanceof UnansweredError) {
        fail(command, "unanswered", err.message, err.hint, json);
    }
    if (err instanceof BoundaryError) {
        fail(command, "boundary", err.message, undefined, json);
    }
    // runtime/unexpected: the stack goes to `log` for humans; json mode gets a structured envelope via `out`.
    if (!json) {
        log(String(err?.stack ?? err));
    }
    else {
        out(jsonError(command, "internal", String(err?.message ?? err)));
    }
    process.exit(2);
});
|