cowork-harness 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +87 -177
- package/README.md +62 -32
- package/dist/agent/session.js +47 -14
- package/dist/assert.js +143 -45
- package/dist/boundary.js +65 -59
- package/dist/cli.js +114 -12
- package/dist/decide/decider.js +92 -39
- package/dist/decide/external-channel.js +2 -1
- package/dist/egress/proxy.js +26 -1
- package/dist/egress/sidecar.js +21 -4
- package/dist/hostloop/workspace-handler.js +5 -1
- package/dist/io.js +11 -0
- package/dist/prompt.js +2 -1
- package/dist/regex.js +14 -0
- package/dist/run/cassette.js +200 -26
- package/dist/run/chat.js +2 -1
- package/dist/run/envelope.js +24 -2
- package/dist/run/execute.js +92 -25
- package/dist/run/renderer.js +7 -4
- package/dist/run/run.js +26 -27
- package/dist/run/scaffold.js +54 -0
- package/dist/run/trace-view.js +54 -6
- package/dist/run/verdict.js +62 -0
- package/dist/runtime/argv.js +29 -10
- package/dist/runtime/container.js +2 -1
- package/dist/runtime/hostloop.js +4 -2
- package/dist/runtime/lima.js +13 -4
- package/dist/runtime/microvm.js +12 -42
- package/dist/runtime/protocol.js +2 -1
- package/dist/runtime/stage.js +17 -0
- package/dist/session.js +83 -41
- package/dist/staging/resolve.js +42 -0
- package/dist/types.js +88 -28
- package/docs/README.md +29 -0
- package/docs/assets/banner.png +0 -0
- package/docs/boundary.md +81 -0
- package/docs/cassette.md +226 -0
- package/docs/cowork-spawn-contract-1.12603.1.md +78 -0
- package/docs/decider-dir.md +134 -0
- package/docs/discovery.md +43 -0
- package/docs/maintenance.md +151 -0
- package/docs/scenario.md +392 -0
- package/docs/session.md +106 -0
- package/package.json +8 -2
- package/python/README.md +146 -0
- package/python/conftest.py +18 -0
- package/python/cowork_harness.py +387 -0
- package/python/pyproject.toml +28 -0
- package/python/test_cowork_lane.py +206 -0
- package/python/test_csv_fx_lane.py +48 -0
- package/python/test_csv_metrics_lane.py +49 -0
- package/schema/scenario.schema.json +263 -0
- package/schema/session.schema.json +240 -0
- package/scripts/gen-schema.ts +60 -0
package/CHANGELOG.md
CHANGED
|
@@ -4,187 +4,97 @@ All notable changes to this project are documented here. The format is based on
|
|
|
4
4
|
[Keep a Changelog](https://keepachangelog.com/en/1.1.0/). The project uses
|
|
5
5
|
[Semantic Versioning](https://semver.org/); pre-1.0 minor versions may include breaking changes.
|
|
6
6
|
|
|
7
|
-
## [
|
|
7
|
+
## [Unreleased]
|
|
8
8
|
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
9
|
+
## [0.2.0] — 2026-06-17
|
|
10
|
+
|
|
11
|
+
Binary-verified the AskUserQuestion answer wire shape (agent ELF 2.1.170), implemented the
|
|
12
|
+
harness-improvements plan, and resolved a 39-finding code-review pass behind two centralizing seams.
|
|
12
13
|
|
|
13
14
|
### Added
|
|
14
15
|
|
|
15
|
-
- **
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
- **
|
|
31
|
-
`
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
agent-driven runs where writing a `--decider-cmd` helper is overkill: a small model (host `claude -p`,
|
|
55
|
-
`COWORK_HARNESS_DECIDER_MODEL`) picks an option **by label** per live question, optionally steered by a
|
|
56
|
-
one-line intent (e.g. "test the not_ai branch" → picks `not_ai`). Scripted `--answer`/`--answer-policy`
|
|
57
|
-
still resolve first; an out-of-set answer **fails loud** (never a silent default). Because it's
|
|
58
|
-
non-deterministic, the run is flagged `nonDeterministic` and the footer prints `⚠ non-deterministic
|
|
59
|
-
(LLM-decided)` so a green can't be mistaken for a reproducible/scripted pass. Validate it in ~2s with
|
|
60
|
-
`cowork-harness decide --decider-llm --intent …`. (`--decider-llm` is the only user-facing spelling;
|
|
61
|
-
the internal `agent` policy it rides is not a CLI flag — `--on-unanswered agent` is rejected with a redirect.)
|
|
62
|
-
- **Safety fix — a question is never silently answered with option 1.** A question that reaches the
|
|
63
|
-
decider chain's terminal unanswered now **fails loud** (`UnansweredError`) instead of the prior
|
|
64
|
-
silent option-1 fallback in the run loop — closing the worst failure mode (a wrong-branch run printing
|
|
65
|
-
`✓ success`). Permissions/dialogs still fail *closed* (deny/cancel), which is correct.
|
|
66
|
-
- **External decider — answer the LIVE question (the stochastic-question fix).** Because a skill's
|
|
67
|
-
AskUserQuestions are LLM-generated and vary run to run, a pre-written `--answer` regex is brittle.
|
|
68
|
-
**`--decider-cmd '<helper>'`** spawns a helper once and pipes each *actual* live question (with options
|
|
69
|
-
+ a scrubbed transcript `context` + a literal `reply_with` template) to it, reading the answer back —
|
|
70
|
-
the agent-usable, one-shot path for custom logic (even an LLM call). The Python package's
|
|
71
|
-
**`serve_decider(fn)`** pre-builds the wire loop so a helper writes only the decision function (the
|
|
72
|
-
spawn-helper analogue of the `gates`/`answer` commands). Replies are lenient (label OR 1-based index,
|
|
73
|
-
`id` optional); scripted `--answer` + permission parity still apply first; the request is
|
|
74
|
-
secret-scrubbed before it leaves the process. The helper owns its own pipes, so **`--decider-cmd`
|
|
75
|
-
keeps the CLI's stdout free and composes with `--output-format json`** — as does `--decider-dir`; both
|
|
76
|
-
external channels are stdout-free and orthogonal (pick one terminal).
|
|
77
|
-
- **Egress sandbox.** Default-deny outbound, enforced against the **synced** Cowork domain allowlist
|
|
78
|
-
(plus per-scenario `extra_allow`); `egress_*` / `expect_denied` assertions; `web_fetch` modeled
|
|
79
|
-
host/API-routed (gated by a web-fetch allowlist) as in real Cowork, distinct from container-sandboxed
|
|
80
|
-
`bash`.
|
|
81
|
-
- **Assertions** (`assert:`): transcript, files, user-visible artifacts, tool / sub-agent usage,
|
|
82
|
-
`subagent_dispatched` / `subagent_declared_but_unused` / `dispatch_count_max`, egress, no-delete-in-
|
|
83
|
-
outputs, self-heal, host-path-leak, question count, `gate_answers_delivered`, result status, and
|
|
84
|
-
**`transcript_matches`/`transcript_not_matches`** (case-insensitive regex over the transcript — the
|
|
85
|
-
drift-tolerant content check for stochastic prose; replay-safe, so it runs on the token-free PR gate).
|
|
86
|
-
- **AskUserQuestion answer delivery (correctness fix).** The answer to an AskUserQuestion gate is now
|
|
87
|
-
injected as the binary's COMPLETE tool input — `updatedInput:{questions, answers}`, not `{answers}` —
|
|
88
|
-
matching the ELF's built-in handler, which does `questions.map(…)` over the input (verified against
|
|
89
|
-
`claude-code-vm` 2.1.170). Dropping `questions` threw `undefined is not an object (evaluating 'q.map')`,
|
|
90
|
-
so the answer never reached the model and gate-steering silently no-oped. (The earlier golden snapshot
|
|
91
|
-
had blessed the `{answers}`-only shape as "faithful"; it was the bug — corrected, with a regression test
|
|
92
|
-
asserting `questions` is preserved.) **New verification surfaces:** `tool_result` blocks are now captured;
|
|
93
|
-
`RunResult.gateDeliveries[]` + the `gate_answers_delivered` assertion confirm each answer actually
|
|
94
|
-
reached the model (a `::warning:: [gate] DELIVERY FAILED` fires in real time on an errored result);
|
|
95
|
-
`cowork-harness trace <id> --tools` shows each tool's result status; `trace <id> --gates` shows the gate
|
|
96
|
-
lifecycle (question → injected answer → delivered result); the gate rendezvous wire shapes are
|
|
97
|
-
written into `<run>/gates/` on every run (so the forensic evidence survives the channel's cleanup, even
|
|
98
|
-
without `--keep`).
|
|
99
|
-
- **Truthful tool counts (`RunResult.toolCounts`).** Per-tool call counts from the actual tool_use stream
|
|
100
|
-
(top-level only). On the cowork path `usage.server_tool_use.web_search_requests` is 0 (it counts the
|
|
101
|
-
Anthropic *server* tool; WebSearch is a host-routed *client* tool) — `toolCounts.WebSearch` is the real
|
|
102
|
-
count to assert on.
|
|
103
|
-
- **Run-once-then-script.** Every question the agent asks that wasn't pre-scripted (auto-answered by
|
|
104
|
-
`first`, or answered interactively) is echoed on the footer as a copy-pasteable `--answer "<q>=<choice>"`
|
|
105
|
-
line — turning an exploratory run into a deterministic one. An **idle heartbeat** (`… still running
|
|
106
|
-
(Xs · N tools)` on stderr after ~30s of silence) keeps long 5–20 min runs legible; disable with
|
|
107
|
-
`COWORK_HARNESS_NO_HEARTBEAT`, tune with `COWORK_HARNESS_HEARTBEAT_MS`.
|
|
108
|
-
- **File provision.** `--upload <file>` attaches a file at `mnt/uploads/<name>` (the "attach a file"
|
|
109
|
-
path skills like deck-review require) and `--project <dir>` connects a folder at `mnt/.projects/<id>`,
|
|
110
|
-
ad-hoc on the `skill` command (parity with the scenario `session.uploads`/`folders`).
|
|
111
|
-
- **Resume-after-failure hardening.** Ephemeral Docker resources (egress networks/proxy + the host-loop
|
|
112
|
-
container) are named by a unique per-invocation token (not the session id), and the agent container is
|
|
113
|
-
reaped on teardown — so a `--resume` after a failed/interrupted run no longer collides with a leftover
|
|
114
|
-
orphaned container/network.
|
|
115
|
-
- **Session persistence & resume.** `--session-id <id>` pins a stable run dir + the agent's native
|
|
116
|
-
session UUID (persisted in a `session.json` manifest); `--resume` reuses that work dir — preserving
|
|
117
|
-
`mnt/.claude/projects/<uuid>.jsonl`, `gate_state.json`, and `mnt/outputs` — and passes the agent's
|
|
118
|
-
own `--resume` so it reloads the conversation. This is how checkpoint-and-resume skills (a gate that
|
|
119
|
-
writes state, ends, and is re-invoked later with the prior RUN_ID) are tested. The harness leans on
|
|
120
|
-
the agent's native resume rather than reimplementing it (binary-verified).
|
|
121
|
-
- **Interactive `chat`** — multi-turn REPL keeping the full harness (egress sandbox + control protocol);
|
|
122
|
-
`chat --raw` drops to the agent's native interactive cowork mode via `docker run -it`.
|
|
123
|
-
- **Cassettes + full-fidelity replay.** `record` saves a control-protocol cassette; `replay --cassette`
|
|
124
|
-
plays it back deterministically (no token, no Docker) and re-evaluates content assertions.
|
|
125
|
-
- *The cassette captures both protocol directions:* `events` (child→driver, the assistant turn stream)
|
|
126
|
-
AND `controlOut` (driver→child decision responses). Both are recorded; `replay` now **consumes** both.
|
|
127
|
-
- *Full-fidelity replay (C1 false-green fix).* Consuming `controlOut` re-runs the decision pipeline on
|
|
128
|
-
replay, populating `rec.questions`/`rec.gateAnswers`/`rec.gateDeliveries`. Previously,
|
|
129
|
-
`question_asked` silently false-failed (questions invisible), `questions_count_max` passed vacuously
|
|
130
|
-
(0 ≤ max), and `gate_answers_delivered: true` passed vacuously (no deliveries recorded) — a silent
|
|
131
|
-
false-green violating the project's core principle. All three now genuinely evaluate when `controlOut`
|
|
132
|
-
is present.
|
|
133
|
-
- *The O7 guard on the token-free lane (`replay_protocol_fidelity`).* `replay` re-serializes each
|
|
134
|
-
decision response via `serializeDecision` and compares to the frozen `controlOut` envelope. A mismatch
|
|
135
|
-
(e.g. `serializeDecision` dropping `questions` from the AskUserQuestion `updatedInput`) appends a
|
|
136
|
-
`{ assertion: { replay_protocol_fidelity: true }, pass: false, message }` entry and exits 1 — catching
|
|
137
|
-
the O7 answer-shape regression without a live model.
|
|
138
|
-
- *Backward compatibility.* Old cassettes without `controlOut` get a loud `::warning::` on stderr;
|
|
139
|
-
`question_asked`, `questions_count_max`, and `gate_answers_delivered` are excluded from evaluation
|
|
140
|
-
(not vacuously passed). Re-record to enable full-fidelity mode.
|
|
141
|
-
- *Committed synthetic fixture + CI replay gate.* `examples/replays/example-pdf-skill.cassette.json`
|
|
142
|
-
is a hand-authored fixture (permission gate + AskUserQuestion gate + `tool_result`) committed to the
|
|
143
|
-
repo and replayed in the token-free CI job — dogfooding the documented PR-gate pattern and pinning
|
|
144
|
-
the fixture against `parseMessage`/assertion/`Run` regressions on every push.
|
|
145
|
-
- **pytest `cowork` lane** (`python/`) — `@pytest.mark.cowork` + a `cowork` fixture over the
|
|
146
|
-
`--output-format json` surface, selectable with `-m cowork` beside your fast tests.
|
|
147
|
-
- **Faithful sub-agent aggregation.** Recognizes the real cowork dispatch tool — **`Agent`**
|
|
148
|
-
(`{description, subagent_type, prompt}`; binary-verified primary name, with `Task` as its legacy
|
|
149
|
-
alias) and any tool carrying `subagent_type` — so `subagents[]` and the `subagent_dispatched` /
|
|
150
|
-
`dispatch_count_max` / `subagent_tool_*` assertions fire under `--fidelity cowork`. The cowork
|
|
151
|
-
`TaskCreate`/`TaskUpdate` **todo list** and `Monitor` are correctly excluded (no over-counting). Each
|
|
152
|
-
dispatch also captures its **`description`** — so a dispatch the skill made with no `subagent_type`
|
|
153
|
-
(`agentType:"unknown"`) is still self-explaining in `trace` and assertable: `subagent_dispatched`
|
|
154
|
-
matches the agentType **OR** the description.
|
|
155
|
-
- **`trace <run-id | dir | events.jsonl> [--tools]`** — digests a run's `events.jsonl` into tool calls,
|
|
156
|
-
sub-agent dispatches (deduped), decisions, and questions (reuses the live `parseMessage` so it tracks
|
|
157
|
-
the schema); `--output-format json` for structured rows. Plus `result.json`/the json envelope now expose
|
|
158
|
-
`workDir`/`outputsDir`, and `--keep` prints the deep `mnt/outputs` deliverable path.
|
|
159
|
-
- **Per-run artifacts** under `runs/<scenario>/<id>/`: `events.jsonl` + `control-out.jsonl` (the cassette
|
|
160
|
-
source), `run.jsonl` (harness-observability log), `trace.json` (structured trace), `egress.log`,
|
|
161
|
-
`result.json`, `agent.stderr.log`. Injected secrets (OAuth token / API key) are scrubbed from every
|
|
162
|
-
persisted log by value.
|
|
163
|
-
- **Auth via env or `.env`.** `CLAUDE_CODE_OAUTH_TOKEN` (preferred) or `ANTHROPIC_API_KEY`, resolved in
|
|
164
|
-
priority order: exported env > `--dotenv <path>` > `./.env` (cwd) > `<install>/.env` (package root),
|
|
165
|
-
so a run from any directory still finds the install's credentials. Host-side, gitignored, exported
|
|
166
|
-
vars win, never mounted into the sandbox; passed via argv/env, never written to a runtime path. (The
|
|
167
|
-
flag is `--dotenv`, not `--env-file` — Node reserves the latter.)
|
|
168
|
-
- **Platform baselines** (`cowork-harness sync`) derive the release-specific facts (agent version, domain
|
|
169
|
-
allowlist, mount layout, the production GrowthBook gate states) from your installed Claude Desktop, so
|
|
170
|
-
the code rides the stable protocol while data tracks each release. `--diff` previews changes; an
|
|
171
|
-
`asarFingerprint` tripwire flags unrecognized deltas.
|
|
172
|
-
- **Sandbox self-verification** — `cowork-harness boundary-check` proves the sandbox enforces Cowork's
|
|
173
|
-
limitations; `cowork-harness vm <init|status|delete|prune>` manages the L2 microVM (`prune` drops orphaned VMs).
|
|
174
|
-
- **CI** (`.github/workflows/ci.yml`): typecheck · format · unit + golden snapshots · build · boundary
|
|
175
|
-
parity (Docker) · live-contract guards · scenario suite (gated on a key) · the pytest lane.
|
|
16
|
+
- **AskUserQuestion answer shapes.** `multiSelect` gates (answer with a list of labels → the verified
|
|
17
|
+
comma-joined wire shape); free-text **"Other"** via `answer:` (distinct from the label-validated
|
|
18
|
+
`choose:`); `choose` tolerates the `(Recommended)` suffix + `recommended`/`first` keywords. A partial
|
|
19
|
+
match on a batched gate now **names the unmatched sub-questions**.
|
|
20
|
+
- **`artifact_json` assertion** — assert a JSON artifact's contents via a dotted path
|
|
21
|
+
(`equals`/`gt`/`exists`/`absent`/`is_null`); `absent`, `is_null`, and an unresolved intermediate are
|
|
22
|
+
distinct (the last fails loud, never a vacuous pass).
|
|
23
|
+
- **Artifact manifest in cassettes** — `record` snapshots `outputs/`/`.projects/` (paths + hashes + small
|
|
24
|
+
JSON bodies) so `file_exists`/`user_visible_artifact`/`artifact_json` run on token-free `replay`. A
|
|
25
|
+
cassette→skill/baseline **staleness fingerprint** warns on drift; `replay --strict` fails on it. Cassettes
|
|
26
|
+
now carry a `cassetteVersion` (forward-compat guard).
|
|
27
|
+
- **`RunResult.artifacts`** (ENV-MANIFEST) — observed user-visible files (path + bytes); also surfaced as
|
|
28
|
+
`Result.artifacts` in the Python helper.
|
|
29
|
+
- **`allow_permissive_auto_allow` assertion + `RunResult.scan`** — a security-scan surface for the
|
|
30
|
+
Cowork-parity verdict (below); the assertion opts a scenario into a permissive auto-allow on purpose.
|
|
31
|
+
- **CLI:** `trace --dispatches` (sub-agent dispatch tree + real total), `assert --list` (schema-generated),
|
|
32
|
+
`scaffold --from-run <id>` (kept run → starter scenario YAML).
|
|
33
|
+
- **Python:** `run_scenario()` — run an authored scenario YAML and get the typed `Result`.
|
|
34
|
+
|
|
35
|
+
### Changed
|
|
36
|
+
|
|
37
|
+
- **Single verdict source (`computeVerdict()`)** wired into all five pass/fail sites (run/skill exit, footer,
|
|
38
|
+
replay exit, JSON-envelope `ok`) plus the Python `assert_success`. A Cowork-parity violation — a permissive
|
|
39
|
+
auto-allow, a recorded `outputs/` delete, or a host-path leak — now **default-fails** the run unless the
|
|
40
|
+
scenario explicitly asserts about it.
|
|
41
|
+
- **Single fail-loud staging policy (`src/staging/resolve.ts`)** for every declared input (marketplace
|
|
42
|
+
manifest, enabled-plugin resolution, local skills, `mcp.config`, uploads, folders), with a Docker-safe
|
|
43
|
+
marketplace charset.
|
|
44
|
+
- The run root honors `COWORK_HARNESS_RUNS_DIR`.
|
|
45
|
+
|
|
46
|
+
### Fixed
|
|
47
|
+
|
|
48
|
+
- **Egress / runtime hardening:** per-hop redirect egress logging, allowlist validation, a per-run proxy
|
|
49
|
+
port, proxy/sidecar readiness handshakes, fail-loud Lima provisioning, and boundary teardown in
|
|
50
|
+
`try/finally`.
|
|
51
|
+
- **Protocol / decider hardening:** oversized control-frame hard-fail, a nonzero child-exit error event,
|
|
52
|
+
provenance untruncation, TTY-elicit cancel, and a JSON-safe `reply_with` key.
|
|
53
|
+
- **Detection / packaging:** `%2F`/backslash decode in the outputs-delete detector; the npm package now
|
|
54
|
+
ships `schema/`, `docs/`, `python/`, and `scripts/`; assertion path containment; resume empty-tree warning.
|
|
176
55
|
|
|
177
56
|
### Notes
|
|
178
57
|
|
|
179
|
-
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
-
|
|
189
|
-
|
|
190
|
-
|
|
58
|
+
- Held/deferred per the plan's gating: composed partial-gate answering, `decider_intent:` in scenario YAML,
|
|
59
|
+
a whole-gate `response:` freeform, and `artifacts_share_field`. All additive/opt-in when built.
|
|
60
|
+
|
|
61
|
+
## [0.1.1] — 2026-06-16
|
|
62
|
+
|
|
63
|
+
Docs, distribution, and packaging. No CLI behavior change.
|
|
64
|
+
|
|
65
|
+
### Added
|
|
66
|
+
|
|
67
|
+
- **Companion Claude Code skill, installable.** A `.claude-plugin/marketplace.json` + skills-directory
|
|
68
|
+
plugin make the bundled skill installable via `/plugin marketplace add yaniv-golan/cowork-harness`;
|
|
69
|
+
the skill self-bootstraps the CLI (`npx cowork-harness@latest`) and fails loud on missing tier deps.
|
|
70
|
+
- **`AGENTS.md`** — canonical, cross-tool agent instructions — and **`llms.txt`** doc index.
|
|
71
|
+
- **JSON Schema for scenario & session YAML** (`schema/*.schema.json`, generated via `npm run schema`,
|
|
72
|
+
pinned by a token-free drift-guard); `# yaml-language-server: $schema=` hints in the example scenarios.
|
|
73
|
+
- README banner, badges, a "For AI agents" section, and `npm install` instructions.
|
|
74
|
+
|
|
75
|
+
### Changed
|
|
76
|
+
|
|
77
|
+
- Release pipeline publishes via npm **Trusted Publishing (OIDC)** with provenance (no stored token).
|
|
78
|
+
- GitHub Actions bumped off the deprecated Node 20 runtime; CI live-scenario job skips cleanly without a key.
|
|
79
|
+
|
|
80
|
+
## [0.1.0] — 2026-06-16
|
|
81
|
+
|
|
82
|
+
Initial public release. A faithful, headless, scriptable harness for Claude Cowork's runtime — for
|
|
83
|
+
testing Claude Code **skills** outside the Desktop app with the same staged agent, spawn/control-protocol
|
|
84
|
+
contract, egress allowlist, permission protocol, and sandbox limitations. Binary-grounded against
|
|
85
|
+
`app.asar` 1.12603.1 / agent ELF 2.1.170.
|
|
86
|
+
|
|
87
|
+
### Added
|
|
88
|
+
|
|
89
|
+
- Commands: `skill`, `run`, `chat`, `record`, `replay`, `trace`, and `decide`, plus `sync`,
|
|
90
|
+
`boundary-check`, and `vm` management. Stable `--output-format json` envelope and CI-ready exit codes.
|
|
91
|
+
- Five fidelity tiers (`fidelity:`): `protocol`, `container`, `microvm`, `hostloop`, and `cowork`
|
|
92
|
+
(auto-picks host-loop vs container the way Cowork does).
|
|
93
|
+
- Scenario YAML — prompt + scripted answers + `assert:` (transcript, files, artifacts, tool / sub-agent
|
|
94
|
+
usage, egress, and more) for authored, asserted regression runs.
|
|
95
|
+
- Input policy with no silent false-greens: scripted, LLM, and in-band (`--decider-dir`) answering for
|
|
96
|
+
AskUserQuestion / tool-permission gates; an unanswered gate fails loud.
|
|
97
|
+
- Default-deny egress sandbox enforced against the synced Cowork domain allowlist.
|
|
98
|
+
- Token-free, Docker-free cassette `record` / `replay` for the PR gate.
|
|
99
|
+
- Platform baselines synced from a local Claude Desktop install — nothing Anthropic-owned is bundled
|
|
100
|
+
or distributed.
|
package/README.md
CHANGED
|
@@ -1,8 +1,15 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<img src="docs/assets/banner.png" alt="cowork-harness — headless, scriptable, CI-ready test harness for Claude Cowork skills" width="100%">
|
|
3
|
+
</p>
|
|
4
|
+
|
|
1
5
|
# cowork-harness
|
|
2
6
|
|
|
3
7
|
[](https://github.com/yaniv-golan/cowork-harness/actions/workflows/ci.yml)
|
|
4
8
|
[](./LICENSE)
|
|
5
9
|
[](#quick-start)
|
|
10
|
+
[](#drive-it-from-claude-code-companion-skill)
|
|
11
|
+
[](https://github.com/yaniv-golan/skill-creator-plus)
|
|
12
|
+
[](https://agentskills.io)
|
|
6
13
|
|
|
7
14
|
Scriptable, CI-friendly test harness that reproduces **Claude Cowork's observable runtime contract** closely enough to test the skills you write — across many scenarios, headless, in CI — without the (locked) Desktop app. It reproduces not just Cowork's *behavior* but its *limitations*: sealed filesystem, default-deny egress, MCP-only cross-boundary — so a green test means green in real Cowork.
|
|
8
15
|
|
|
@@ -67,8 +74,10 @@ Skill testing is the headline use, but the tool is a general harness over the Co
|
|
|
67
74
|
| `skill <folder> "<prompt>"` | Run a local skill/plugin folder once against the staged agent | ad-hoc "is the skill alive / does it do X?" — the fast inner loop |
|
|
68
75
|
| `run <scenario.yaml \| dir/>` | Run authored scenarios with `assert:` + a CI-ready exit code | you want a repeatable, **asserted regression test** |
|
|
69
76
|
| `chat <folder>` | Interactive multi-turn REPL against a skill (TTY) | debugging a multi-turn flow by hand |
|
|
70
|
-
| `record` / `replay` | Save a control-protocol cassette, then replay it deterministically | **token-free, Docker-free CI** from a once-recorded run |
|
|
71
|
-
| `trace <run-id>` | Digest a run's `events.jsonl` (tools
|
|
77
|
+
| `record` / `replay` | Save a control-protocol cassette, then replay it deterministically (`replay --strict` fails on a stale cassette) | **token-free, Docker-free CI** from a once-recorded run |
|
|
78
|
+
| `trace <run-id>` | Digest a run's `events.jsonl` (`--tools`, `--gates`, `--dispatches` for the sub-agent dispatch tree + total) | "how many sub-agents *actually* dispatched, and which?" |
|
|
79
|
+
| `scaffold --from-run <id>` | Turn a kept run into a starter scenario YAML (gates→answers, artifacts→`file_exists`) | authoring a scenario from a real run instead of guessing |
|
|
80
|
+
| `assert --list` | List the available scenario assertions (generated from the schema) | "what can I assert?" without grepping the source |
|
|
72
81
|
| `decide` | Validate a decider against a sample question in ~2 s (no run) | sanity-check a `--decider-*` / `--answer` wiring before a long run |
|
|
73
82
|
| `gates` / `answer` | Stream / answer in-band gates for `--decider-dir` | a **driving agent** answers live questions via a Monitor |
|
|
74
83
|
| `boundary-check [baseline]` | Prove the sandbox enforces Cowork's limitations | verifying the harness's own fidelity |
|
|
@@ -141,7 +150,13 @@ It mounts the folder(s) at the Cowork plugin path, runs the staged agent in cowo
|
|
|
141
150
|
|
|
142
151
|
## Quick start
|
|
143
152
|
|
|
144
|
-
**Install from
|
|
153
|
+
**Install from npm:**
|
|
154
|
+
|
|
155
|
+
```bash
|
|
156
|
+
npm install -g cowork-harness # puts the `cowork-harness` command on your PATH
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
**Or build from source:**
|
|
145
160
|
|
|
146
161
|
```bash
|
|
147
162
|
git clone https://github.com/yaniv-golan/cowork-harness && cd cowork-harness
|
|
@@ -149,8 +164,18 @@ npm install && npm run build && npm link # puts the `cowork-harness` command
|
|
|
149
164
|
# …or skip the link and call it directly: node dist/cli.js <cmd>
|
|
150
165
|
```
|
|
151
166
|
|
|
152
|
-
|
|
153
|
-
|
|
167
|
+
### Drive it from Claude Code (companion skill)
|
|
168
|
+
|
|
169
|
+
This repo ships a **companion skill** (`.claude/skills/cowork-harness/`) that teaches an agent how to drive the harness — author scenarios, pick a fidelity tier, script answers, place assertions in the right CI lane, and avoid the "✓ passed ≠ correct" traps. Install it into Claude Code via the bundled marketplace:
|
|
170
|
+
|
|
171
|
+
```bash
|
|
172
|
+
/plugin marketplace add yaniv-golan/cowork-harness
|
|
173
|
+
/plugin install cowork-harness@cowork-harness
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
The skill **self-bootstraps the CLI**: if `cowork-harness` isn't on your PATH it falls back to `npx cowork-harness@>=0.2.0` (a version floor that fails loud rather than silently fetching a too-old CLI; Node ≥ 20). Tiers above `protocol` still need Docker/Lima and a Claude Desktop agent binary — see the prerequisites below.
|
|
177
|
+
|
|
178
|
+
It also follows the open [Agent Skills](https://github.com/vercel-labs/skills) spec, so it installs cross-editor (Cursor, Codex, OpenCode, …) by pointing the `npx skills` CLI at `.claude/skills/cowork-harness` in this repo. (Working *inside* this repo, the skill auto-loads as a project skill — no install needed.)
|
|
154
179
|
|
|
155
180
|
**Prerequisites for anything above `protocol` fidelity** (the `protocol` tier needs none of these — it's pure logic iteration):
|
|
156
181
|
1. **Claude Desktop, opened once.** The Cowork agent binary is **bind-mounted from your own install** at run time — nothing Anthropic-owned is bundled. Open Cowork once so the agent ELF is staged (`…/claude-code-vm/<ver>/claude`); the harness auto-detects it, or set `COWORK_AGENT_BINARY=<path>` to point at it. Without a staged agent, container/cowork runs fail with "Open Cowork once to stage it…".
|
|
@@ -309,32 +334,26 @@ Secrets (the injected OAuth token / API key) are scrubbed from every persisted l
|
|
|
309
334
|
## Architecture
|
|
310
335
|
|
|
311
336
|
```
|
|
312
|
-
|
|
313
|
-
scenario.yaml
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
┌─────────────────────▼──────────────┐ ┌───────────────────────┐
|
|
333
|
-
│ AgentSession → Decider → Run │ │ Egress proxy │
|
|
334
|
-
│ (protocol seam · policy seam · │ │ default-deny, │
|
|
335
|
-
│ turn loop + RunRecord) │ │ allowlist = synced │
|
|
336
|
-
└─────────────────────────────────────┘ │ vmAllowedDomains() │
|
|
337
|
-
└───────────────────────┘
|
|
337
|
+
┌────────────────────────────────────────────────┐
|
|
338
|
+
scenario.yaml ────► │ cowork-harness (TypeScript CLI) │
|
|
339
|
+
│ baseline loader ◄── baselines/desktop-*.json│
|
|
340
|
+
│ runtime selector ──► L0 / L1 / L2 │
|
|
341
|
+
└───────────────────────┬────────────────────────┘
|
|
342
|
+
│ spawns + speaks stream-json
|
|
343
|
+
┌───────────────────────▼────────────────────────┐
|
|
344
|
+
│ Agent: claude -p (CLAUDE_CODE_IS_COWORK=1) │
|
|
345
|
+
│ --input-format / --output-format stream-json│
|
|
346
|
+
│ cwd = /sessions/<id>/mnt │
|
|
347
|
+
│ mnt/uploads · mnt/.projects/* · plugins │
|
|
348
|
+
└───────────────────────┬────────────────────────┘
|
|
349
|
+
decision control request │ outbound network (egress)
|
|
350
|
+
(tool · question · dialog) │ default-deny → allowlist
|
|
351
|
+
┌───────────────────────▼────────────┐ ┌────────────────────────┐
|
|
352
|
+
│ AgentSession ──► Decider ──► Run │ │ Egress proxy │
|
|
353
|
+
│ protocol · policy · turn loop │ │ default-deny; │
|
|
354
|
+
│ + RunRecord │ │ allowlist = synced │
|
|
355
|
+
│ │ │ vmAllowedDomains() │
|
|
356
|
+
└────────────────────────────────────┘ └────────────────────────┘
|
|
338
357
|
```
|
|
339
358
|
|
|
340
359
|
- **AgentSession** speaks the Agent SDK control protocol over stream-json, emitting a typed event
|
|
@@ -424,7 +443,7 @@ The diff shows exactly what moved (agent bump, allowlist change, new mount). You
|
|
|
424
443
|
|
|
425
444
|
---
|
|
426
445
|
|
|
427
|
-
##
|
|
446
|
+
## Limitations
|
|
428
447
|
|
|
429
448
|
- **Not the full Desktop network transport.** L1 is a container, not a VM; L2 *is* a real Apple-VZ microVM but still does not reproduce Cowork's gVisor netstack — its egress is the same allowlist proxy as L1 (with a guest iptables firewall in front). If your skill depends on VM-kernel specifics, validate at L2; if it depends on packet-level gVisor behavior, no tier reproduces it.
|
|
430
449
|
- **Cowork in-guest context is partial.** Desktop supplies host-loop staging, runtime `mountPath` RPC, and the bridge. We reproduce the *filesystem and cowork mode*, not those host-side services. Skills that call Desktop-only host RPCs won't run here (they wouldn't be portable anyway).
|
|
@@ -435,6 +454,17 @@ These are documented per-tier in [DESIGN.md](./DESIGN.md) so a green test means
|
|
|
435
454
|
|
|
436
455
|
---
|
|
437
456
|
|
|
457
|
+
## For AI agents
|
|
458
|
+
|
|
459
|
+
This repo is built to be driven by agents, not just read by humans:
|
|
460
|
+
|
|
461
|
+
- **[AGENTS.md](./AGENTS.md)** — the canonical agent-instructions file (architecture seams, the build gate, invariants, ethos). Read it before changing code. Also indexed in **[llms.txt](./llms.txt)**.
|
|
462
|
+
- **Companion skill** — [`.claude/skills/cowork-harness/`](./.claude/skills/cowork-harness/SKILL.md) teaches an agent to drive the harness; install it via the marketplace (see [above](#drive-it-from-claude-code-companion-skill)).
|
|
463
|
+
- **Machine-readable interfaces** — stable `--output-format json` envelope on stdout, deterministic exit codes (`0`/`1`/`2`), and `--help` on every command.
|
|
464
|
+
- **JSON Schemas** — [`schema/scenario.schema.json`](./schema/scenario.schema.json) and [`schema/session.schema.json`](./schema/session.schema.json) describe every field of the YAML you author (generated from the source schemas; `npm run schema`).
|
|
465
|
+
|
|
466
|
+
---
|
|
467
|
+
|
|
438
468
|
## Documentation
|
|
439
469
|
|
|
440
470
|
| Doc | Read it for |
|
package/dist/agent/session.js
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { warn } from "../io.js";
|
|
1
2
|
import { createWriteStream } from "node:fs";
|
|
2
3
|
import { join } from "node:path";
|
|
3
4
|
import readline from "node:readline";
|
|
@@ -159,6 +160,8 @@ export class LiveAgentSession {
|
|
|
159
160
|
/** Reject function set when proc emits an error — bridges the callback into the async generator.
|
|
160
161
|
* Set before the generator loop starts; called at most once (the Promise settles once). */
|
|
161
162
|
rejectError;
|
|
163
|
+
/** Bounded tail of the child's stderr, for the nonzero-exit error message. */
|
|
164
|
+
stderrTail = "";
|
|
162
165
|
constructor(proc, outDir) {
|
|
163
166
|
this.proc = proc;
|
|
164
167
|
this.outDir = outDir;
|
|
@@ -166,6 +169,11 @@ export class LiveAgentSession {
|
|
|
166
169
|
this.controlOut = createWriteStream(join(outDir, "control-out.jsonl"), { flags: "a" });
|
|
167
170
|
const errLog = createWriteStream(join(outDir, "agent.stderr.log"), { flags: "a" });
|
|
168
171
|
this.proc.stderr.pipe(errLog);
|
|
172
|
+
// keep a bounded stderr tail and capture the exit code/signal so a child that dies nonzero
|
|
173
|
+
// (with no structured {type:"result"} error) is surfaced as a typed error event, not a silent stop.
|
|
174
|
+
this.proc.stderr.on("data", (d) => {
|
|
175
|
+
this.stderrTail = (this.stderrTail + d.toString()).slice(-2000);
|
|
176
|
+
});
|
|
169
177
|
// #15: attach stdin error listener once at construction so dead-child writes don't produce
|
|
170
178
|
// unhandled process errors. Routes to the same error path as spawn errors when possible.
|
|
171
179
|
this.proc.stdin.on("error", (e) => {
|
|
@@ -241,6 +249,20 @@ export class LiveAgentSession {
|
|
|
241
249
|
}
|
|
242
250
|
yield* this.translate(msg);
|
|
243
251
|
}
|
|
252
|
+
// stdout closed. Give a pending 'exit' one tick to land (NOT a blocking wait on 'close' — a
|
|
253
|
+
// mock/fake child may never emit it), then surface a nonzero/signal exit as a typed error — a
|
|
254
|
+
// crashed child that emitted no {type:"result"} error line would otherwise be a silent stop.
|
|
255
|
+
await new Promise((res) => setImmediate(res));
|
|
256
|
+
const code = this.proc.exitCode;
|
|
257
|
+
const signal = this.proc.signalCode;
|
|
258
|
+
if (signal || (code !== null && code !== 0)) {
|
|
259
|
+
const tail = this.stderrTail.trim();
|
|
260
|
+
yield {
|
|
261
|
+
type: "error",
|
|
262
|
+
source: "exit",
|
|
263
|
+
message: `agent process exited ${signal ? `on signal ${signal}` : `with code ${code}`}${tail ? ` — stderr tail: ${tail}` : ""}`,
|
|
264
|
+
};
|
|
265
|
+
}
|
|
244
266
|
}
|
|
245
267
|
finally {
|
|
246
268
|
this.rejectError = undefined; // generator is done; stop routing errors here
|
|
@@ -274,7 +296,7 @@ export class LiveAgentSession {
|
|
|
274
296
|
// reply — an unanswered mcp_message blocks the in-VM agent on the round-trip forever (deadlock).
|
|
275
297
|
// Reply with a JSON-RPC error instead, mirroring the no-handler defense below.
|
|
276
298
|
const message = e?.message ?? String(e);
|
|
277
|
-
|
|
299
|
+
warn(`::warning:: sdkMcp.handle threw for "${server}" — replying with a JSON-RPC error: ${message}\n`);
|
|
278
300
|
out = { error: { code: -32603, message: `handler error: ${message}` } };
|
|
279
301
|
}
|
|
280
302
|
this.write(mcpResponseEnvelope(msg.request_id, out, jr.id));
|
|
@@ -294,7 +316,7 @@ export class LiveAgentSession {
|
|
|
294
316
|
// #10: an mcp_message arrived but no sdkMcp handler is configured. Reply with a JSON-RPC error
|
|
295
317
|
// (well-formed via mcpResponseEnvelope) instead of silently dropping it — a dropped request
|
|
296
318
|
// leaves the in-VM agent waiting on the round-trip forever (protocol deadlock in host-loop mode).
|
|
297
|
-
|
|
319
|
+
warn(`::warning:: mcp_message for server "${server}" arrived but no sdkMcp handler is configured — replying with a JSON-RPC error (would otherwise deadlock)\n`);
|
|
298
320
|
this.write(mcpResponseEnvelope(msg.request_id, { error: { code: -32601, message: "no sdkMcp handler configured" } }, jr.id));
|
|
299
321
|
return;
|
|
300
322
|
}
|
|
@@ -312,7 +334,7 @@ export class LiveAgentSession {
|
|
|
312
334
|
if (!req) {
|
|
313
335
|
// #13: an id with no matching request_id is a protocol drift. Writing a guessed envelope would
|
|
314
336
|
// be worse, but a silent return leaves the agent blocked until timeout (looks like a hang).
|
|
315
|
-
|
|
337
|
+
warn(`::warning:: respond() for unknown decision id "${decisionId}" — no matching request_id was seen; the agent may block until timeout (protocol drift)\n`);
|
|
316
338
|
return;
|
|
317
339
|
}
|
|
318
340
|
// #14: serializeDecision returns a safe deny envelope on a kind mismatch (defense in depth). That
|
|
@@ -320,7 +342,7 @@ export class LiveAgentSession {
|
|
|
320
342
|
// while the agent actually received a deny. (serializeDecision stays a pure declared inverse of
|
|
321
343
|
// deserializeDecision; the warning lives here in the caller, not in the pure function.)
|
|
322
344
|
if (req.kind !== r.kind)
|
|
323
|
-
|
|
345
|
+
warn(`::warning:: decider returned kind "${r.kind}" for a "${req.kind}" request (id ${decisionId}) → sending a safe deny/cancel; the agent did NOT receive an answer\n`);
|
|
324
346
|
this.write(serializeDecision(req, r));
|
|
325
347
|
}
|
|
326
348
|
close() {
|
|
@@ -333,13 +355,12 @@ export class LiveAgentSession {
|
|
|
333
355
|
}
|
|
334
356
|
write(obj) {
|
|
335
357
|
const line = JSON.stringify(obj);
|
|
336
|
-
//
|
|
337
|
-
//
|
|
338
|
-
//
|
|
339
|
-
//
|
|
340
|
-
// fires instead of silently risking partial buffering on a frame far larger than expected.
|
|
358
|
+
// The control protocol writes small single-line JSON frames, so stdin backpressure effectively never
|
|
359
|
+
// engages; we ignore the write() return / drain here. A frame past the safe threshold is anomalous —
|
|
360
|
+
// hard-FAIL rather than risk a partially-buffered write that silently corrupts the protocol stream.
|
|
361
|
+
// (If large control frames ever become legitimate, switch to a drain-aware queue, making writes async.)
|
|
341
362
|
if (line.length > 256 * 1024)
|
|
342
|
-
|
|
363
|
+
throw new Error(`control frame is ${line.length} bytes (> 256 KiB safe limit) — refusing to write to avoid partial stdin buffering; this indicates an unexpectedly large control payload`);
|
|
343
364
|
this.controlOut.write(line + "\n");
|
|
344
365
|
this.proc.stdin.write(line + "\n");
|
|
345
366
|
}
|
|
@@ -393,6 +414,7 @@ export function parseMessage(msg) {
|
|
|
393
414
|
ev.push({
|
|
394
415
|
type: "subagent_dispatch",
|
|
395
416
|
toolUseId: String(block.id ?? ""),
|
|
417
|
+
parentToolUseId,
|
|
396
418
|
// Skills often dispatch with only {description, prompt} (no subagent_type) → agentType is
|
|
397
419
|
// "unknown" but the description still identifies the dispatch (e.g. "TOP_DOWN market sizing").
|
|
398
420
|
agentType: String(inp.subagent_type ?? inp.subagentType ?? "unknown"),
|
|
@@ -415,6 +437,7 @@ export function parseMessage(msg) {
|
|
|
415
437
|
toolUseId: block.tool_use_id ? String(block.tool_use_id) : undefined,
|
|
416
438
|
isError: !!block.is_error,
|
|
417
439
|
text: toolResultText(block.content),
|
|
440
|
+
provenanceText: toolResultRaw(block.content),
|
|
418
441
|
});
|
|
419
442
|
}
|
|
420
443
|
break;
|
|
@@ -424,17 +447,27 @@ export function parseMessage(msg) {
|
|
|
424
447
|
}
|
|
425
448
|
return ev;
|
|
426
449
|
}
|
|
427
|
-
/** Flatten a tool_result `content` (a string, or an array of `{type:"text",text}` blocks)
|
|
428
|
-
|
|
450
|
+
/** Flatten a tool_result `content` (a string, or an array of `{type:"text",text}` blocks), capped at
|
|
451
|
+
* `max` chars. The 500-char DISPLAY value (toolResultText) keeps the recorder/trace compact; the larger
|
|
452
|
+
* PROVENANCE value (toolResultRaw) is what seeds web_fetch provenance, so a URL past char 500 isn't lost. */
|
|
453
|
+
function flattenToolResult(content, max) {
|
|
429
454
|
if (typeof content === "string")
|
|
430
|
-
return content.slice(0,
|
|
455
|
+
return content.slice(0, max);
|
|
431
456
|
if (Array.isArray(content))
|
|
432
457
|
return content
|
|
433
458
|
.map((b) => (b && typeof b === "object" && "text" in b ? String(b.text) : ""))
|
|
434
459
|
.join(" ")
|
|
435
|
-
.slice(0,
|
|
460
|
+
.slice(0, max);
|
|
436
461
|
return "";
|
|
437
462
|
}
|
|
463
|
+
function toolResultText(content) {
|
|
464
|
+
return flattenToolResult(content, 500);
|
|
465
|
+
}
|
|
466
|
+
/** Larger cap for provenance (URL extraction) — matches the web_fetch body cap so any URL the agent
|
|
467
|
+
* could realistically act on is seeded; still bounded so a pathological result can't blow up memory. */
|
|
468
|
+
function toolResultRaw(content) {
|
|
469
|
+
return flattenToolResult(content, 200_000);
|
|
470
|
+
}
|
|
438
471
|
export function toDecisionRequest(msg) {
|
|
439
472
|
const sub = msg.request?.subtype;
|
|
440
473
|
const id = msg.request_id;
|