cowork-harness 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. package/.env.example +16 -0
  2. package/CHANGELOG.md +190 -0
  3. package/LICENSE +21 -0
  4. package/README.md +470 -0
  5. package/baselines/desktop-1.11847.5.json +78 -0
  6. package/baselines/desktop-1.12603.1.json +140 -0
  7. package/baselines/prompts/desktop-1.12603.1/host-loop-append.md +8 -0
  8. package/baselines/prompts/desktop-1.12603.1/subagent-append-vm.md +3 -0
  9. package/baselines/prompts/desktop-1.12603.1/system-prompt-append.md +18 -0
  10. package/dist/agent/session.js +465 -0
  11. package/dist/assert.js +159 -0
  12. package/dist/baseline.js +87 -0
  13. package/dist/boundary.js +114 -0
  14. package/dist/canary/grants.js +37 -0
  15. package/dist/cli.js +1107 -0
  16. package/dist/decide/decider.js +521 -0
  17. package/dist/decide/external-channel.js +262 -0
  18. package/dist/decide/llm-transport.js +52 -0
  19. package/dist/dotenv.js +52 -0
  20. package/dist/egress/proxy.js +138 -0
  21. package/dist/egress/sidecar.js +125 -0
  22. package/dist/hostloop/provenance.js +110 -0
  23. package/dist/hostloop/workspace-handler.js +226 -0
  24. package/dist/loop-decision.js +62 -0
  25. package/dist/prompt.js +43 -0
  26. package/dist/run/cassette.js +420 -0
  27. package/dist/run/chat.js +194 -0
  28. package/dist/run/envelope.js +31 -0
  29. package/dist/run/execute.js +533 -0
  30. package/dist/run/renderer.js +179 -0
  31. package/dist/run/run.js +347 -0
  32. package/dist/run/trace-view.js +227 -0
  33. package/dist/runtime/argv.js +126 -0
  34. package/dist/runtime/container.js +76 -0
  35. package/dist/runtime/host-env.js +28 -0
  36. package/dist/runtime/hostloop.js +129 -0
  37. package/dist/runtime/lima.js +177 -0
  38. package/dist/runtime/microvm.js +151 -0
  39. package/dist/runtime/protocol.js +79 -0
  40. package/dist/runtime/stage.js +52 -0
  41. package/dist/secrets.js +42 -0
  42. package/dist/session.js +315 -0
  43. package/dist/sync/cowork-sync.js +215 -0
  44. package/dist/types.js +127 -0
  45. package/docker/Dockerfile.agent +31 -0
  46. package/docker/Dockerfile.proxy +12 -0
  47. package/docker/compose.yml +31 -0
  48. package/fixtures/subagent-grants.json +5 -0
  49. package/package.json +70 -0
package/README.md ADDED
@@ -0,0 +1,470 @@
1
+ # cowork-harness
2
+
3
+ [![ci](https://github.com/yaniv-golan/cowork-harness/actions/workflows/ci.yml/badge.svg)](https://github.com/yaniv-golan/cowork-harness/actions/workflows/ci.yml)
4
+ [![license: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](./LICENSE)
5
+ [![node: >=20](https://img.shields.io/badge/node-%3E%3D20-339933.svg)](#quick-start)
6
+
7
+ Scriptable, CI-friendly test harness that reproduces **Claude Cowork's observable runtime contract** closely enough to test the skills you write — across many scenarios, headless, in CI — without the (locked) Desktop app. It reproduces not just Cowork's *behavior* but its *limitations*: sealed filesystem, default-deny egress, MCP-only cross-boundary access — so a green test means green in real Cowork.
8
+
9
+ **Contents:** [Why it works](#why-this-works-for-skill-testing) · [Fidelity tiers](#fidelity-tiers-pick-per-scenario--per-ci-job) · [Quick start](#quick-start) · [Session + scenario](#two-files-session--scenario) · [Boundary](#sandboxing-container-vs-the-real-vm) · [Discovery](#discovery-marketplaces-plugins-skills-mcp) · [Testing & CI/CD](#testing--cicd) · [Maintenance](#maintenance-parity-between-releases) · [Docs](#documentation)
10
+
11
+ > **New here?** Read [docs/boundary.md](./docs/boundary.md) (the limitations model) and [docs/session.md](./docs/session.md) (the file you'll author).
12
+
13
+ > **What this is and isn't.** This is an *emulator of the contract*, not the Desktop runtime. Cowork's real session control plane lives behind the Desktop renderer's IPC (per-build UUID + `senderFrame` origin checks) and the app ships with remote debugging disabled (verified: `--remote-debugging-port` opens no listener; Electron `EnableNodeCliInspectArguments` fuse is OFF). So you **cannot** drive the real Apple Virtualization.framework microVM from a script. What you *can* faithfully reproduce is everything that actually changes how a **skill** behaves: the same agent binary in **cowork mode** (`CLAUDE_CODE_IS_COWORK=1` — there is no `--cowork` flag), the same mount layout, the same egress allowlist, and the same permission/question protocol. That's what this project does.
14
+
15
+ ---
16
+
17
+ ## Why this works for skill testing
18
+
19
+ A skill's behavior under Cowork is determined by four things, all reproducible outside the VM:
20
+
21
+ | Dimension | What Cowork does | How we reproduce it | Fidelity |
22
+ |---|---|---|---|
23
+ | **Agent** | Spawns the staged in-VM agent `claude-code-vm/<ver>/claude` in cowork mode (`CLAUDE_CODE_IS_COWORK=1` env — there is no `--cowork` flag) | Run the **same pinned agent**, **bind-mounted** from your Claude Desktop install's staged Linux/arm64 ELF (no npm path; override with `COWORK_AGENT_BINARY`) | **High** — same binary contract |
24
+ | **Mounts** | `/sessions/<id>/mnt/{uploads,.projects/<id>,.local-plugins,.remote-plugins}` | Recreate the same paths as bind mounts; skill-under-test discovered at the plugin mount, same as Cowork | **High** — same discovery path |
25
+ | **Egress** | gVisor netstack with a compiled domain allowlist (`vmAllowedDomains()` + `coworkEgressAllowedHosts`) | Default-deny egress proxy enforcing the **synced** allowlist | **Med-High** — allowlist-exact, transport-approximate |
26
+ | **Permissions / questions** | `onToolPermissionRequest` → `respondToToolPermission`; AskUserQuestion answered by the UI | The **Agent SDK `can_use_tool` control protocol** — the exact same channel — answered by your scenario script | **High** — same protocol Desktop uses |
27
+
28
+ The permission/question protocol is the backbone, and it's the *most stable* surface — it's the documented Agent SDK control protocol (`can_use_tool`, `hook_callback`, `mcp_message`, …). Everything fragile (agent version, mount paths, allowlist contents) is pushed into a **versioned baseline** that you re-sync per release. See [Maintenance](#maintenance-parity-between-releases).
29
+
30
+ ---
31
+
32
+ ## Fidelity tiers (pick per scenario / per CI job)
33
+
34
+ ```
35
+ L0 protocol-only claude -p stream-json on the host. No sandbox, no egress control.
36
+ Fastest. Tests skill logic + scripted answers. CI default for unit-style.
37
+
38
+ L1 container parity Pinned agent in cowork mode inside an arm64 Linux container with the real
39
+ (recommended) mount layout and a default-deny egress proxy enforcing the synced allowlist.
40
+ Reproducible, CI-native (Docker/Podman). The faithful-yet-maintainable sweet spot.
41
+
42
+ L2 microvm parity Optional. Agent inside a real Linux microVM (Lima/Apple-VZ) with a guest
43
+ (opt-in, heavy) default-deny iptables firewall funnelling to the same allowlist proxy as L1.
44
+ VM-grade escape isolation; egress transport equals L1's HTTP-CONNECT proxy —
45
+ no gVisor netstack reproduced. Not for CI; periodic high-fidelity checks only.
46
+
47
+ ─── loop-mode overlays (orthogonal to L0/L1/L2: they pick WHERE the loop runs, not isolation) ───
48
+
49
+ hostloop Cowork's PRODUCTION split-execution: the agent loop runs host-side, while the
50
+ shell/web tools run in the container via the workspace SDK-MCP server
51
+ (mcp__workspace__bash). Reproduces the real host-loop boundary.
52
+
53
+ cowork Auto-picks hostloop vs container the way Cowork itself does, from GrowthBook
54
+ gate 1143815894 decoded in the synced baseline. "Do what real Cowork does."
55
+ ```
56
+
57
+ Most skill testing runs **L1 (`container`)**. Use **L0 (`protocol`)** for fast inner-loop and pure-logic assertions; **L2 (`microvm`)** for VM-grade escape isolation of untrusted code (rare — it does **not** improve network-transport fidelity over L1); **`hostloop`/`cowork`** to reproduce Cowork's production split-execution model. Set the tier with `fidelity:` in a scenario or `--fidelity` on `skill`.
58
+
59
+ ---
60
+
61
+ ## Commands at a glance
62
+
63
+ Skill testing is the headline use, but the tool is a general harness over the Cowork runtime. Run any command with `--help` for its full flag reference.
64
+
65
+ | Command | What it does | Reach for it when… |
66
+ |---|---|---|
67
+ | `skill <folder> "<prompt>"` | Run a local skill/plugin folder once against the staged agent | ad-hoc "is the skill alive / does it do X?" — the fast inner loop |
68
+ | `run <scenario.yaml \| dir/>` | Run authored scenarios with `assert:` + a CI-ready exit code | you want a repeatable, **asserted regression test** |
69
+ | `chat <folder>` | Interactive multi-turn REPL against a skill (TTY) | debugging a multi-turn flow by hand |
70
+ | `record` / `replay` | Save a control-protocol cassette, then replay it deterministically | **token-free, Docker-free CI** from a once-recorded run |
71
+ | `trace <run-id>` | Digest a run's `events.jsonl` (tools, sub-agent dispatches, decisions) | "how many sub-agents *actually* dispatched, and which?" |
72
+ | `decide` | Validate a decider against a sample question in ~2 s (no run) | sanity-check a `--decider-*` / `--answer` wiring before a long run |
73
+ | `gates` / `answer` | Stream / answer in-band gates for `--decider-dir` | a **driving agent** answers live questions via a Monitor |
74
+ | `boundary-check [baseline]` | Prove the sandbox enforces Cowork's limitations | verifying the harness's own fidelity |
75
+ | `sync` / `list` | Derive/refresh & list platform baselines from the Desktop install | after Claude Desktop updates (baselines ship, so it's optional otherwise) |
76
+ | `vm <init\|status\|delete\|prune>` | Manage the L2 Apple-VZ / Lima microVM (`prune` removes orphaned VMs left by config/agent-version changes) | running `--fidelity microvm` |
77
+
78
+ There's also a **Python `cowork` pytest lane** (`python/`) for driving any of this from `pytest` beside your normal tests — see [`python/README.md`](./python/README.md).
79
+
80
+ ---
81
+
82
+ ## Test a local skill in one command
83
+
84
+ The fastest path — point at a **local folder**, no repo, no `claude plugin install`, no marketplace, no version bump, no cache layers. The folder is copied **fresh into the session on every run**, so you edit and re-run and your changes are live immediately:
85
+
86
+ ```bash
87
+ # Auth once — export it, OR put it in a .env file (resolved: env > --dotenv > ./.env > <install>/.env):
88
+ # cp .env.example .env && echo "CLAUDE_CODE_OAUTH_TOKEN=$(claude setup-token)" >> .env
89
+ export CLAUDE_CODE_OAUTH_TOKEN=$(claude setup-token)
90
+
91
+ cowork-harness skill ~/my-plugin 'Use my-skill to do X' # single-quote: no $ expansion
92
+ cowork-harness skill ~/my-plugin --prompt-file ./prompt.txt # prompt verbatim (raw bytes)
93
+ cowork-harness skill ~/my-plugin "..." --answer "which format=Markdown" # script AskUserQuestion
94
+ cowork-harness skill ~/my-plugin "..." --fidelity protocol # fast, no sandbox
95
+ cowork-harness skill ~/my-plugin "..." --dry-run # resolve & print the launch plan, don't run
96
+ cowork-harness skill ~/my-plugin "..." --keep # print the run dir to inspect
97
+ cowork-harness skill ~/my-plugin "..." --output-format json # machine-readable result on stdout
98
+ cowork-harness skill ~/my-plugin "..." --on-unanswered fail # never fabricate an answer (CI/agents)
99
+ cowork-harness skill ~/my-plugin "..." --decider-cmd 'node answer.js' # answer LIVE stochastic questions via a helper
100
+ cowork-harness skill ~/my-plugin 'review this deck' --upload deck.pdf # attach a file → mnt/uploads (deck-review etc.)
101
+ cowork-harness skill ~/my-plugin "..." --session-id s1 # pin a session…
102
+ cowork-harness skill ~/my-plugin '<next turn>' --session-id s1 --resume # …then resume it (gated/checkpoint skills)
103
+ cowork-harness skill ~/my-plugin "..." --keep # then: trace the run
104
+ cowork-harness trace <run-id> --tools # tool calls + sub-agent dispatches from events.jsonl
105
+ cowork-harness skill --help # full per-command flag reference
106
+
107
+ cowork-harness chat ~/my-plugin # interactive multi-turn REPL (full harness: egress sandbox + control protocol)
108
+ # chat --raw → native interactive cowork mode via `docker run -it` (needs Docker + the arm64
109
+ # cowork-agent-base:1 image; the egress sandbox is NOT applied in --raw)
110
+ ```
111
+
112
+ **Input policy — no silent false-greens.** When an AskUserQuestion arrives with no scripted
113
+ `--answer`, the policy is explicit: `fail` (error + the exact `--answer` to add — the default for
114
+ `run`/CI), `prompt` (ask at the TTY — the default for `skill` when interactive), or `first` (pick
115
+ option 1, loudly warn). Pick with `--on-unanswered`; left unset, `skill` is **adaptive** (`prompt` on
116
+ a TTY, `fail` when piped/CI) and `run` is always `fail`. Exit codes: `0`
117
+ pass · `1` assertion/agent failure · `2` usage /
118
+ unanswered-under-`fail` / boundary / runtime. After a run, the footer **echoes every auto-answered
119
+ question as a copy-pasteable `--answer "<q>=<choice>"` line** — run once exploratorily, then paste them
120
+ back to lock in a deterministic re-run.
121
+
122
+ **Output.** `skill` **renders the agent's work** (assistant text + tool calls) and a metered footer —
123
+ you see *what it did*, not just a green. `run` is verdict-first but **prints the failing transcript
124
+ inline** on a `FAIL` (no spelunking `runs/…`). Tune with `--quiet` (verdict only) / `--verbose`/`-V`
125
+ (+ thinking, tool inputs, sub-agent tree). `--output-format json` emits a stable machine envelope on stdout
126
+ (`{tool, version, command, ok, results[], error}`; errors are `{ok:false, error:{category,message,hint}}`)
127
+ — see [SPEC §11](./SPEC.md). Human output goes to stderr and machine output to stdout, so `--output-format json` pipes
128
+ cleanly. Honors `NO_COLOR`.
129
+
130
+ **Test a specific local plugin version** — just point at the folder at that version (it's copied fresh; no install, no version bump). Add more with `--plugin`:
131
+ ```bash
132
+ cowork-harness skill ~/my-plugin "..." --plugin ~/other-plugin
133
+ ```
134
+
135
+ **Test a specific local marketplace version** — point at the marketplace dir (the one with `.claude-plugin/marketplace.json`); it's registered fresh each run via `claude plugin marketplace add`, no clone/cache:
136
+ ```bash
137
+ cowork-harness skill --marketplace ~/my-marketplace --enable my-skill@my-marketplace "Use my-skill"
138
+ ```
139
+
140
+ It mounts the folder(s) at the Cowork plugin path, runs the staged agent in cowork mode, and prints PASS/RESULT (add `--keep` to print the run dir, or `--output-format json` for the machine-readable result). No YAML to author. (Author `scenarios/*.yaml` only for repeatable, asserted regression tests.)
141
+
142
+ ## Quick start
143
+
144
+ **Install from source** (not yet published to npm):
145
+
146
+ ```bash
147
+ git clone https://github.com/yaniv-golan/cowork-harness && cd cowork-harness
148
+ npm install && npm run build && npm link # puts the `cowork-harness` command on your PATH
149
+ # …or skip the link and call it directly: node dist/cli.js <cmd>
150
+ ```
151
+
152
+ > A global `npm install -g cowork-harness` will work once the package is published; for now, build from source.
153
+ > (Heads-up: the repo folder is `claude-cowork-headless-emulator`, the package/CLI is `cowork-harness`, and the GitHub repo is `yaniv-golan/cowork-harness`.)
154
+
155
+ **Prerequisites for anything above `protocol` fidelity** (the `protocol` tier needs none of these — it's pure logic iteration):
156
+ 1. **Claude Desktop, opened once.** The Cowork agent binary is **bind-mounted from your own install** at run time — nothing Anthropic-owned is bundled. Open Cowork once so the agent ELF is staged (`…/claude-code-vm/<ver>/claude`); the harness auto-detects it, or set `COWORK_AGENT_BINARY=<path>` to point at it. Without a staged agent, container/cowork runs fail with "Open Cowork once to stage it…".
157
+ 2. **Docker (arm64)** + the agent image: `docker build --platform linux/arm64 -t cowork-agent-base:1 -f docker/Dockerfile.agent .` (override the tag with `COWORK_AGENT_IMAGE`).
158
+ 3. **An auth token** — either `export CLAUDE_CODE_OAUTH_TOKEN=$(claude setup-token)` or a **`.env`** file (copy `.env.example` → `.env`; gitignored). The token resolves in priority order: exported env > `--dotenv <path>` > `./.env` (cwd) > `<install>/.env` (the package root), so an `npm link`ed install works from any directory. Keep `.env` in a working directory or the install root, never inside a mounted skill/project folder. (Use `--dotenv`, not `--env-file` — Node reserves the latter.)
159
+
160
+ > `sync` (below) is **optional for a first run** — the repo ships `baselines/desktop-*.json`, so `baseline: latest` already resolves. Run `sync` only to refresh the platform baseline after Claude Desktop updates. (`sync` is **macOS-only** today; on Linux/Windows use the committed baselines — they work cross-platform.)
161
+
162
+ ```bash
163
+ # 1. Sync a platform baseline from your installed Claude Desktop (one-time + per release)
164
+ cowork-harness sync # writes baselines/desktop-<appVersion>.json
165
+ cowork-harness sync --diff # show what changed vs the committed baseline
166
+
167
+ # 2. Run a scenario (L1 container by default)
168
+ cowork-harness run examples/scenarios/example-pdf-skill.yaml # minimal: plumbing only
169
+ cowork-harness run examples/scenarios/csv-metrics.yaml # worked example: a real skill runs a bundled producer end-to-end
170
+ cowork-harness run examples/scenarios/csv-fx-normalize.yaml # graceful degradation: the skill's network step is blocked, it falls back
171
+
172
+ # 3. Run a whole suite in CI (machine-readable results, CI-ready exit code)
173
+ cowork-harness run examples/scenarios/ --output-format json
174
+
175
+ # 4. Record a cassette once, then replay it deterministically (no token, no Docker)
176
+ # (without --out, the cassette is named after the scenario — its `name:`, or the filename)
177
+ cowork-harness record examples/scenarios/example-pdf-skill.yaml --out cassettes/example-pdf-skill.cassette.json
178
+ cowork-harness replay --cassette cassettes/example-pdf-skill.cassette.json
179
+
180
+ # A committed synthetic fixture is ready to replay on a fresh clone (no record step needed):
181
+ cowork-harness replay --cassette examples/replays/example-pdf-skill.cassette.json
182
+ ```
183
+
184
+ > **What replay checks.** A cassette bundles BOTH recorded protocol directions: the child→driver
185
+ > `events` stream AND the driver→child `controlOut` decision responses. `replay` re-runs the
186
+ > orchestration from both, re-evaluates the **content** assertions, and re-exercises
187
+ > `serializeDecision` as a token-free O7 guard (the AskUserQuestion `{questions,answers}` answer-shape
188
+ > invariant). Evaluated on replay: `transcript_*`, `tool_*`, `subagent_*`, `dispatch_count_max`,
189
+ > `result`. **`question_asked`, `questions_count_max`, and `gate_answers_delivered` are also evaluated —
190
+ > but only when the cassette carries `controlOut` (full-fidelity)**; old cassettes without it get a
191
+ > loud warning and those three keys are excluded (not vacuously passed). Silently skipped (no
192
+ > filesystem/network in a replay): `file_exists`, `user_visible_artifact`, `egress_*`,
193
+ > `expect_denied`, `no_delete_in_outputs`, `self_heal_ran`, `transcript_no_host_path` — keep those
194
+ > in a periodic live `run`. See [docs/cassette.md](./docs/cassette.md) for the full guide.
195
+
196
+ **Drive it from pytest** — the `cowork` lane (see [`python/README.md`](./python/README.md)):
197
+ `@pytest.mark.cowork` + a `cowork` fixture over the `--output-format json` surface, selectable with
198
+ `-m cowork` (opt-in, beside your fast tests).
199
+
200
+ ---
201
+
202
+ ## Two files: session + scenario
203
+
204
+ Configuration splits the way Cowork itself splits — *what you set up before the first prompt* vs. *what you ask*:
205
+
206
+ - **Session setup** (`sessions/*.yaml`) — everything you'd configure in Cowork's pre-prompt setup: model, effort, extended thinking, permission mode, **mounted work folders / projects**, uploaded files, and **discovery** (marketplaces, plugins, skills, MCP servers). Hand-authored, one per project, reused across scenarios.
207
+ - **Scenario** (`scenarios/*.yaml`) — the prompt, the **scripted answers**, and the assertions. References a session.
208
+
209
+ > **Worked examples to copy** live under [`examples/`](./examples/) (see [examples/README.md](./examples/README.md)). `examples/skills/csv-metrics/` + `examples/sessions/csv-metrics.yaml` + `examples/scenarios/csv-metrics.yaml` is a complete, non-trivial skill running end-to-end: the agent loads the skill, runs its **bundled producer** (`scripts/metrics.py`, stdlib-only so it works under default-deny egress), and writes a structured `outputs/metrics.json` and an `outputs/summary.md`. The scenario asserts the structure (skill loaded, producer ran, artifacts exist); the paired [`python/test_csv_metrics_lane.py`](./python/test_csv_metrics_lane.py) adds a predicate over the JSON content (`assert_artifact_json`). Read those files to see the whole loop — discovery → run → deliverable → assert — that every real skill follows. (`examples/scenarios/example-pdf-skill.yaml` is the minimal counterpart: harness plumbing, placeholder skill.)
210
+ >
211
+ > **Worked example #2 — graceful degradation under the sealed network.** `examples/skills/csv-fx-normalize/` + `examples/scenarios/csv-fx-normalize.yaml` shows the property you can *only* test by running against the real boundary: the skill's job needs the network (fetch an FX rate to convert EUR→USD), Cowork's default-deny egress blocks it, and the skill **falls back to source currency instead of crashing or hanging**. Its `egress_denied: api.frankfurter.app` assertion is backed by a *real* fetch the skill makes — not a synthetic probe — and `result: success` + the delivered artifact prove the fallback. This is the right way to assert egress: cause a genuine denial through real behavior.
212
+
213
+ ```yaml
214
+ # scenarios/pdf.yaml ← the filename is the test's identity (name: is an optional override)
215
+ baseline: latest # platform baseline (auto-synced from Desktop)
216
+ session: ../sessions/default.yaml # pre-prompt setup, resolved relative to THIS file
217
+ fidelity: container # protocol | container | microvm
218
+
219
+ prompt: |
220
+ Summarize report.pdf and write the action items to outputs/actions.md
221
+
222
+ # Scripted answers — the can_use_tool control channel, same as Desktop's question UI
223
+ answers:
224
+ - when_question: "Which output format" # substring/regex on AskUserQuestion
225
+ choose: "Markdown"
226
+ - when_tool: Bash # tool-permission decisions
227
+ allow_if: "!command.includes('rm -rf')"
228
+ else: deny
229
+ - when_tool: Write
230
+ decide: allow
231
+
232
+ expect_denied: ["evil.example.com"] # assert this host is denied egress
233
+
234
+ assert:
235
+ - transcript_contains: "action items"
236
+ - file_exists: outputs/actions.md
237
+ - tool_called: Write
238
+ - egress_denied: evil.example.com
239
+ - result: success
240
+ ```
241
+
242
+ ```yaml
243
+ # sessions/default.yaml (abridged — see the file for every field)
244
+ # Relative paths below resolve from THIS file's dir (absolute and ~ are used as-is).
245
+ model: claude-opus-4-8
246
+ effort: high
247
+ max_thinking_tokens: 8000
248
+ permission_mode: default
249
+ permission_parity: cowork # cowork (allow unscripted tool calls, the default) | strict (deny unscripted)
250
+ folders:
251
+ - { from: ~/code/myproject, to: proj1 } # a work folder / Space -> mnt/.projects/proj1
252
+ uploads:
253
+ - ~/Downloads/report.pdf # -> mnt/uploads
254
+ plugins:
255
+ marketplaces: ["https://github.com/anthropics/claude-code.git"]
256
+ # local_marketplaces: ["../my-marketplace"] # LOCAL marketplace dirs (each with a marketplace.json)
257
+ local_plugins: ["../skills/my-pdf-skill"] # mounted at mnt/.local-plugins/cache under the synthetic "local" marketplace
258
+ enabled: ["my-pdf-skill@local"] # name@marketplace: a local_plugins entry is referenced as <plugin>@local
259
+ mcp:
260
+ config: ../data/mcp.json # standard mcpServers map (--mcp-config) — the way to attach an MCP server
261
+ egress:
262
+ extra_allow: ["api.github.com"]
263
+ ```
264
+
265
+ Multiple scenarios × sessions × platform baselines = your regression matrix. Drop YAML in `scenarios/` and CI runs them all.
266
+
267
+ ## Sandboxing: container vs. the real VM
268
+
269
+ Cowork runs the agent in an **Apple Virtualization.framework microVM** (separate kernel). The harness's default `container` tier uses an OS container (shared kernel, namespaces/cgroups). For **testing skills you wrote**, that's faithful where it counts — same agent binary, same cowork mode, same mount layout, same egress allowlist, same permission protocol — because skill behavior is agent-loop + tool behavior, all kernel-invisible. The container is the right default precisely because it's CI-native; a VM needs nested virtualization, which most shared CI runners don't have.
270
+
271
+ A real VM only *matters* when you're testing **isolation of untrusted skills** (container escape is easier than VM escape), or a skill that probes kernel internals. For that, the `microvm` tier runs the same agent in a real Linux microVM via **Lima with `vmType: vz` — the same Apple Virtualization.framework Cowork uses** (highest off-app fidelity). This tier is **macOS arm64 only** (it needs Apple's hypervisor); there is no Linux/Firecracker path. The launch contract is identical to the container tier; only the isolation boundary differs — egress is the same allowlist proxy as the container tier (no gVisor netstack at any harness tier). See [DESIGN.md §1](./DESIGN.md).
272
+
273
+ ## Discovery: marketplaces, plugins, skills, MCP
274
+
275
+ The agent we run **is** `claude-code` (the same binary Cowork stages in `claude-code-vm/<ver>`), so it discovers extensions from the same roots — verified against the staged binary:
276
+
277
+ | Kind | Real root | How the harness populates it | Override |
278
+ |---|---|---|---|
279
+ | Plugins / marketplaces | `CLAUDE_CONFIG_DIR/plugins`, `plugin_marketplaces` + Cowork mounts `.local-plugins/cache`, `.remote-plugins` | session `plugins.local_plugins`/`remote_plugins` → mounted at those paths; `marketplaces`/`local_marketplaces`/`enabled` → `settings.json` | point `plugins.config_dir` at a test dir |
280
+ | Skills | `CLAUDE_CONFIG_DIR/skills` + skills inside plugins | session `skills.local` staged into the config dir; plugin skills discovered at the mount | swap the config dir or local dirs |
281
+ | MCP servers | `.mcp.json` / `--mcp-config`, `enabledMcpjsonServers` | session `mcp.config` → `--mcp-config`; `mcp.enabled` → `settings.json` | use a test `mcp.json` |
282
+
283
+ The harness builds a **clean managed `CLAUDE_CONFIG_DIR` per run** (with a generated `settings.json`) so discovery is hermetic and reproducible — nothing leaks from your real `~/.claude`. Pin `plugins.config_dir` to a fixed dir if you want to reproduce a specific real setup instead.
284
+
285
+ > Fidelity note: in real Cowork, stdio MCP servers run **host-side** (split execution — the VM shell is sealed, host MCP servers get full env). At the `container` tier the harness runs them alongside the agent for simplicity; if your skill depends on that host/VM split, document it as an unreproduced gap — `microvm` runs MCP inside the guest too, so no harness tier reproduces the split.
286
+
287
+ ---
288
+
289
+ ## What you get out (inspectable output)
290
+
291
+ Every run writes `runs/<scenario>/<sessionId>/`:
292
+
293
+ ```
294
+ events.jsonl full stream-json event log (child→driver; the cassette source)
295
+ control-out.jsonl driver→child control_responses (the other cassette half)
296
+ run.jsonl harness-observability log: decisions (+who decided), sub-agent dispatch
297
+ tree, egress, transcript, cost (replaces transcript.json/decisions.jsonl)
298
+ trace.json structured run trace: steps, questions, sub-agents, egress, decisions, cost
299
+ egress.log raw allow/deny per outbound connection (microvm: at top level; container: under
300
+ proxy/ — the allow/deny decisions are also folded into run.jsonl/result.json)
301
+ result.json assertion results + decisions + sub-agents + cost/usage + exit status
302
+ agent.stderr.log the agent process's stderr (auth errors, flag rejects)
303
+ ```
304
+
305
+ Secrets (the injected OAuth token / API key) are scrubbed from every persisted log by value.
306
+
307
+ ---
308
+
309
+ ## Architecture
310
+
311
+ ```
312
+ ┌──────────────────────────────────────────────┐
313
+ scenario.yaml ─────► │ cowork-harness (TS CLI) │
314
+ │ │
315
+ │ baseline loader ── baselines/desktop-*.json │ ◄── cowork-sync
316
+ │ (agent ver, mounts, │ (reads live
317
+ │ egress allowlist) │ Desktop install
318
+ │ │ + app.asar)
319
+ │ scenario → runtime selector (L0/L1/L2) │
320
+ └───────────────┬──────────────────────────────┘
321
+ │ spawns + speaks stream-json
322
+ ┌───────────────▼──────────────────────────────┐
323
+ │ Agent: claude -p (CLAUDE_CODE_IS_COWORK=1) │
324
+ │ --input-format stream-json │
325
+ │ --output-format stream-json │
326
+ │ │
327
+ │ cwd = /sessions/<id>/mnt │
328
+ │ mnt/uploads, mnt/.projects/*, plugin mounts │
329
+ └───────────────┬──────────────────────────────┘
330
+ decision control req (tool/ │ egress
331
+ question/dialog/elicit) │
332
+ ┌─────────────────────▼──────────────┐ ┌───────────────────────┐
333
+ │ AgentSession → Decider → Run │ │ Egress proxy │
334
+ │ (protocol seam · policy seam · │ │ default-deny, │
335
+ │ turn loop + RunRecord) │ │ allowlist = synced │
336
+ └─────────────────────────────────────┘ │ vmAllowedDomains() │
337
+ └───────────────────────┘
338
+ ```
339
+
340
+ - **AgentSession** speaks the Agent SDK control protocol over stream-json, emitting a typed event
341
+ stream. When the agent emits a decision request (a tool permission, an `AskUserQuestion`, or a
342
+ `request_user_dialog`/`elicitation`), the **Decider** resolves it — scripted `answers:` first, then
343
+ the cowork/strict permission default, then the `on_unanswered` policy (fail/prompt/first) — and
344
+ **Run** drives the turn loop and builds the `RunRecord` (decisions, the sub-agent dispatch tree,
345
+ egress, cost).
346
+ - **Egress proxy** (L1/L2) enforces the synced allowlist; default-deny. Domains come from the baseline, plus per-scenario `extra_allow`.
347
+ - **The platform baseline** is the single source of release-specific truth. Code rides the stable protocol; data tracks the release.
348
+
349
+ See [DESIGN.md](./DESIGN.md) for the full parity matrix, the known deltas vs. real Cowork, and the threat-model notes on egress.
350
+
351
+ ---
352
+
353
+ ## Testing & CI/CD
354
+
355
+ The harness is built to *be* your skills' test suite, and it ships with its own. Two layers:
356
+
357
+ ### Your skills' suite
358
+
359
+ Author scenarios in your own `scenarios/` dir, run the lot, get a non-zero exit on any failure:
360
+
361
+ ```bash
362
+ cowork-harness run scenarios/ # your repo's scenarios; runs every *.yaml, CI-ready exit code
363
+ ```
364
+
365
+ The provided [GitHub Actions workflow](.github/workflows/ci.yml) runs a **four-stage pipeline** you can copy into your skill repo:
366
+
367
+ | Stage | Runs | Needs | Gates |
368
+ |---|---|---|---|
369
+ | **unit** | format check · typecheck · unit tests · build · CLI smoke · token-free `replay` gate | nothing | every push/PR |
370
+ | **boundary** | builds the pinned agent image, brings up the default-deny network, runs `boundary-check` | Docker, arm64 runner | proves the sandbox enforces Cowork's limits — **no API key** |
371
+ | **scenarios** | the live scenario suite at `container` fidelity, uploads transcripts/egress logs as artifacts | `ANTHROPIC_API_KEY` (or `CLAUDE_CODE_OAUTH_TOKEN`) | fork PRs: the whole job is skipped (`if:` guard); same-repo without a key: warns and exits 0 |
372
+ | **parity-drift** | reminder to re-`sync` when Desktop updates | nothing | informational, never blocks |
373
+
374
+ This ordering means cheap checks fail fast, the **boundary parity gate runs without secrets** (so forks get it too), and expensive live runs only happen when a key is present.
375
+
376
+ ### The harness's own suite
377
+
378
+ ```bash
379
+ npm run ci # typecheck + build + test (Stage 1 locally; run format:check separately)
380
+ npm test # vitest: decider, egress allowlist, launch plan, example validation
381
+ cowork-harness boundary-check # Stage 2: self-verify the sandbox (needs Docker)
382
+ ```
383
+
384
+ Unit tests cover the scripted-answer logic, the egress allowlist matcher, the session→launch-plan materialization (mounts + discovery settings + env-strip), and a **schema guard** that fails if any shipped baseline/session/scenario stops validating. Add a test alongside any new schema field or `Decider` rule — see [CONTRIBUTING.md](./CONTRIBUTING.md).
385
+
386
+ > Copy your starting scenarios/sessions from **`examples/`**. The **`e2e/`** directory is the harness's *own* fidelity self-tests (smoke scenarios per tier) — not a template to copy.
387
+
388
+ ### Reproducibility knobs
389
+
390
+ - `COWORK_LOCKDOWN=off` — relax container hardening for debugging (default `on`). With it `on`, an L2 microVM whose guest egress firewall fails to apply **aborts loudly** rather than running un-isolated.
391
+ - `COWORK_CONTAINER_RUNTIME=podman` — use Podman instead of Docker.
392
+ - `COWORK_AGENT_IMAGE=<tag>` — override the agent image name (default `cowork-agent-base:1`); `COWORK_AGENT_BINARY=<path>` — override the auto-detected staged agent ELF.
393
+ - `COWORK_HARNESS_DECIDER_DIR_POLL_MS` / `_TIMEOUT_MS` — tune the `--decider-dir` rendezvous poll/backstop; `COWORK_HARNESS_DECIDER_CMD_TIMEOUT_MS` / `COWORK_HARNESS_LLM_TIMEOUT_MS` — backstop a hung `--decider-cmd` helper / `--decider-llm` model call (default 600 s, fail loud); `COWORK_HARNESS_DIALOG_TIMEOUT_MS` — override the 6 s dialog auto-cancel.
394
+ - `COWORK_HARNESS_RUNS_DIR` — relocate the `runs/` output root (so `trace` resolves runs from any directory).
395
+ - **Strictness escape hatches** (the harness fails loud by default): `COWORK_HARNESS_SOFT_MISSING=1` downgrades a missing mount source from a hard error to warn-and-exclude; `COWORK_HARNESS_ALLOW_CONFIG_DIR_WRITE=1` permits writing into an existing pinned `plugins.config_dir` (otherwise refused, to avoid clobbering a real Claude config).
396
+ - **Secret scrubbing:** `COWORK_HARNESS_SCRUB_KEYS=<KEY1,KEY2>` adds extra env-var names whose values are redacted from logs (beyond the known auth tokens + `ANTHROPIC_CUSTOM_HEADERS`); `COWORK_HARNESS_SCRUB_VALUES=<v1,v2>` redacts literal values regardless of env.
397
+ - L2 microVM: `COWORK_VM_GATEWAY` overrides the Lima host-proxy gateway IP (default `192.168.5.2`); `COWORK_VM_PROXY_PORT` overrides the proxy port. The Lima instance is named `cowork-vm-<config-hash>` (a config change → a fresh VM); `COWORK_LIMA_INSTANCE` pins a fixed name, and `vm prune` removes orphaned ones.
398
+ - Pin `baseline: desktop-<ver>` and `model:` in a session for byte-stable runs; use `latest` to track.
399
+
400
+ ## Maintenance: parity between releases
401
+
402
+ This is the part built for longevity. The fragile, release-specific facts live in **one JSON baseline**; the orchestration code rides the stable stream-json protocol.
403
+
404
+ When a new Claude Desktop ships:
405
+
406
+ ```bash
407
+ cowork-harness sync --diff
408
+ ```
409
+
410
+ `cowork-harness sync` reads your **live install** and the **app.asar** and re-derives the baseline:
411
+
412
+ | Baseline field | Source (auto-detected) |
413
+ |---|---|
414
+ | `agentVersion` | `~/Library/Application Support/Claude/claude-code-vm/.sdk-version` |
415
+ | env-strip list | `app.asar` main bundle (BG env-strip) |
416
+ | `mountLayout` | `app.asar` (`{uuid,name,mountPath,hostPath}` model) |
417
+ | `egress.allowDomains` | `app.asar` `vmAllowedDomains()` + `firewallAlso` + `config.json:coworkEgressAllowedHosts` |
418
+ | `networkMode` | `config.json:coworkNetworkMode`, asar `vm_network_mode` |
419
+ | `requireFullVmSandbox` | `config.json:lastSeenRequireCoworkFullVmSandbox` |
420
+
421
+ The diff shows exactly what moved (agent bump, allowlist change, new mount). You review, commit the new `baselines/desktop-<ver>.json`, and the container pin updates automatically from the baseline. Parity drift then surfaces as **test diffs**, not silent rot.
422
+
423
+ > The sync script is the maintenance contract. If an Anthropic release changes something the sync script doesn't yet read, `sync --diff` flags an `unknown delta` from the asar fingerprint so you know to extend it — rather than parity quietly degrading.
424
+
425
+ ---
426
+
427
+ ## Honest limitations
428
+
429
+ - **Not the full Desktop network transport.** L1 is a container, not a VM; L2 *is* a real Apple-VZ microVM but still does not reproduce Cowork's gVisor netstack — its egress is the same allowlist proxy as L1 (with a guest iptables firewall in front). If your skill depends on VM-kernel specifics, validate at L2; if it depends on packet-level gVisor behavior, no tier reproduces it.
430
+ - **Cowork in-guest context is partial.** Desktop supplies host-loop staging, runtime `mountPath` RPC, and the bridge. We reproduce the *filesystem and cowork mode*, not those host-side services. Skills that call Desktop-only host RPCs won't run here (they wouldn't be portable anyway).
431
+ - **The agent binary is the staged ELF** (`claude-code-vm/<ver>/claude`), **bind-mounted** from your own Claude Desktop install — nothing Anthropic-owned is bundled or installed. There is **no npm path**; override the path with `COWORK_AGENT_BINARY`. Check licensing/ToS for your use.
432
+ - **Egress fidelity is allowlist-exact, transport-approximate** at L1 and L2. Domain allow/deny matches Cowork; the packet-level gVisor netstack is reproduced at neither — both use a default-deny allowlist proxy (L2 adds a guest iptables firewall).
433
+
434
+ These are documented per-tier in [DESIGN.md](./DESIGN.md) so a green test means what you think it means.
435
+
436
+ ---
437
+
438
+ ## Documentation
439
+
440
+ | Doc | Read it for |
441
+ |---|---|
442
+ | [docs/boundary.md](./docs/boundary.md) | The limitations model — sealed FS, default-deny egress, MCP-only crossing; how each tier enforces it; how to verify. |
443
+ | [docs/session.md](./docs/session.md) | Every `sessions/*.yaml` field and its Cowork mapping. |
444
+ | [docs/scenario.md](./docs/scenario.md) | `scenarios/*.yaml` — prompt, scripted answers, assertions. |
445
+ | [docs/cassette.md](./docs/cassette.md) | `record`/`replay` cassettes — what replay checks, which assertions are skipped. |
446
+ | [docs/decider-dir.md](./docs/decider-dir.md) | The `--decider-dir` recipe — a driving agent answers live gates in-band via `gates`/`answer` + a Monitor. |
447
+ | [docs/discovery.md](./docs/discovery.md) | Where plugins/skills/MCP are found + overrides. |
448
+ | [docs/maintenance.md](./docs/maintenance.md) | Parity across Desktop releases via `sync`. |
449
+ | [DESIGN.md](./DESIGN.md) | Architecture deep-dive + full parity matrix. |
450
+ | [SPEC.md](./SPEC.md) | The authoritative testable contract (scenario/session schema, `RunResult`, exit codes). |
451
+ | [CHANGELOG.md](./CHANGELOG.md) | Release history. |
452
+ | [python/README.md](./python/README.md) | The `cowork` pytest lane for driving the harness from Python. |
453
+ | [SECURITY.md](./SECURITY.md) | Threat model — the sandbox is a fidelity fixture, not a security boundary. |
454
+
455
+ ## Status
456
+
457
+ **Verified end-to-end against the live staged agent (2.1.170 / asar 1.12603.1).**
458
+
459
+ - ✅ **Three isolation tiers (L0/L1/L2) + two loop-mode overlays** — `protocol` (L0 control loop), `container` (L1 sandboxed arm64 + per-run default-deny egress sidecar), `microvm` (L2 real Apple-VZ Linux microVM + guest firewall); plus the loop-mode overlays `hostloop` (production split-execution: agent loop on host, shell/web via the workspace SDK-MCP server) and `cowork` (auto-picks host-loop vs container the way Cowork does — gate `1143815894`). Egress enforced at container/microvm/hostloop; `boundary-check` reports **ALL CONSTRAINTS ENFORCED**.
460
+ - ✅ **Three-seam driver** — `AgentSession` (control protocol) → `Decider` (scripted + `on_unanswered` policy, no silent false-greens) → `Run` (turn loop, sub-agent dispatch tree, `RunRecord`). Multi-turn `chat`, deterministic cassette `record`/`replay` (no token), `run.jsonl`/`trace.json` logging with secret-scrub.
461
+ - ✅ **Answering live questions, every way you'd need** — `--decider-llm --intent "<one line>"` (a small model picks per question, steered by your test intent — the ergonomic default; non-determinism is flagged so a green isn't mistaken for a scripted pass), `--answer-policy <yaml>`/`--answer "rx=c"` (declarative regex→label, deterministic CI), `--decider-cmd '<helper>'` (custom logic — the Python `serve_decider(fn)` adapter pre-builds the wire loop so the helper writes only the decision), and **`--decider-dir <dir>`** (the *driving agent* answers each gate **in-band** with full context — it arms a Monitor on `cowork-harness gates <dir> --follow` and replies with `cowork-harness answer <dir> --gate <N> --choose <label>`; the session-under-test stays live, no resume/re-worded question — binary-verified). Every channel keeps stdout free, so all compose with `--output-format json`. A question is **never silently answered with option 1** — unhandled fails loud. Validate any decider in ~2s with `cowork-harness decide`.
462
+ - ✅ **Sub-agent aggregation + `trace`** — recognizes the real `Agent` dispatch tool (binary-verified; `Task` is its alias) so `subagent_dispatched`/`dispatch_count_max` fire under `--fidelity cowork`, excluding the `TaskCreate` todo list; `cowork-harness trace <id> --tools` digests `events.jsonl` (each tool row now shows its **result status** `ok`/`error`), and `trace <id> --gates` shows the gate lifecycle (**question → injected answer → delivered result**); `result.json`/`--keep` surface the deep `mnt/outputs` deliverable path.
463
+ - ✅ **Answer delivery is verified, not assumed** — an AskUserQuestion answer is injected as the binary's full tool input (`{questions, answers}`, ELF-verified), so the answer actually reaches the model; `RunResult.gateDeliveries[]` + the `gate_answers_delivered` assertion catch any delivery failure, and `RunResult.toolCounts` gives the **truthful** per-tool call count (host-routed `WebSearch` shows here, not the always-0 `usage.server_tool_use`).
464
+ - ✅ **Binary-grounded fidelity** — cwd `/sessions/<id>`, the three-channel MCP model (`--mcp-config` honored in plain cowork mode; host/API-routed `web_fetch`; SDK-server delivery), host-loop tool partition, auth-env token-only drop, and production GrowthBook gates pinned per release.
465
+ - ✅ The agent binary is **bind-mounted from your own install** at run time — nothing Anthropic-owned is in any image or distributed.
466
+ - ✅ **File provision & session resume** — `--upload <file>` / `--folder <dir>` attach files & connect folders (`mnt/uploads`, `mnt/.projects`); `--session-id <id>` + `--resume` persist and continue a session via the agent's *native* resume (binary-verified), so checkpoint-and-resume gated skills are testable. Demonstrated end-to-end in the live-contract suite (`test/live-contract.test.ts`: codeword established → resumed → recalled).
467
+ - ✅ **The full unit suite + the live-contract suite green**; a `cowork` pytest lane (`python/`) for skill authors.
468
+ - ℹ️ **Auth:** a `claude setup-token` OAuth token (or `ANTHROPIC_API_KEY`), provided via the env, `--dotenv <path>`, `./.env`, or the install's own `.env` (gitignored; keep it out of mounted folders). It's passed into the sandbox **off the process argv** (Docker: `-e KEY` inherit-by-name; microVM: a stdin prologue) so the token isn't visible via `ps`/`/proc`; it is also scrubbed from logs and never persisted in a runtime path. The token-only path mirrors the desktop. A fresh `CLAUDE_CONFIG_DIR` alone breaks local OAuth.
469
+
470
+ See [SPEC.md](./SPEC.md) (the testable contract), [DESIGN.md](./DESIGN.md) (architecture + parity), and [CHANGELOG.md](./CHANGELOG.md).
@@ -0,0 +1,78 @@
1
+ {
2
+ "$comment": "Platform baseline auto-derived by `cowork-harness sync` from a live Claude Desktop install + app.asar. VOLATILE per-release facts only. Regenerate per release; review the diff. Captured 2026-06-12 on macOS arm64.",
3
+ "baselineVersion": 1,
4
+ "appVersion": "1.11847.5",
5
+ "capturedAt": "2026-06-12",
6
+ "platform": "darwin-arm64",
7
+
8
+ "agentVersion": "2.1.170",
9
+ "agentBinary": {
10
+ "stagedPath": "~/Library/Application Support/Claude/claude-code-vm/2.1.170/claude",
11
+ "format": "elf-aarch64",
12
+ "$comment": "There is NO npm path — the Linux/arm64 ELF is bind-mounted from this staged Desktop install (or COWORK_AGENT_BINARY). npmPackage/preferReuseStaged removed (Q1)."
13
+ },
14
+
15
+ "guest": { "os": "linux", "arch": "arm64" },
16
+
17
+ "mountLayout": {
18
+ "sessionRoot": "/sessions/{sessionId}/mnt",
19
+ "cwd": "/sessions/{sessionId}/mnt",
20
+ "mounts": [
21
+ { "name": "uploads", "mountPath": "uploads", "mode": "r", "purpose": "user-uploaded files (read-only — asar 'ro')" },
22
+ { "name": "projects", "mountPath": ".projects/{projectId}", "mode": "rw", "purpose": "selected work folders (a Space) — delete denied by default (asar IX)" },
23
+ { "name": "local-plugins", "mountPath": ".local-plugins/cache", "mode": "r", "purpose": "marketplace skills/plugins, runtime-discovered" },
24
+ { "name": "remote-plugins", "mountPath": ".remote-plugins", "mode": "r", "purpose": "org-remote plugins, runtime-discovered" },
25
+ { "name": "outputs", "mountPath": "outputs", "mode": "rw", "purpose": "session outputs/artifacts — delete denied by default (asar IX); rwd only when approved" }
26
+ ]
27
+ },
28
+
29
+ "network": {
30
+ "mode": "gvisor",
31
+ "configKey_networkMode": "coworkNetworkMode",
32
+ "configKey_requireFullVmSandbox": "lastSeenRequireCoworkFullVmSandbox",
33
+ "allowKind": "allowlist",
34
+ "allowDomains": [
35
+ "api.anthropic.com",
36
+ "a-api.anthropic.com",
37
+ "a-cdn.anthropic.com",
38
+ "api-staging.anthropic.com",
39
+ "console.anthropic.com",
40
+ "docs.anthropic.com",
41
+ "mcp-proxy.anthropic.com",
42
+ "support.anthropic.com",
43
+ "assets.claude.ai",
44
+ "downloads.claude.ai",
45
+ "pivot.claude.ai",
46
+ "preview.claude.ai",
47
+ "sentry.io",
48
+ "statsig.anthropic.com",
49
+ "statsigapi.net"
50
+ ],
51
+ "userOverrideKey": "coworkEgressAllowedHosts",
52
+ "$comment_allow": "Derived from asar vmAllowedDomains() + firewallAlso + config.json:coworkEgressAllowedHosts. statsig* host normalized; verify exact statsig endpoint per release."
53
+ },
54
+
55
+ "settings": {
56
+ "autoMountFolders": { "key": "autoMountFolders", "default": false },
57
+ "localAgentModeTrustedFolders": { "key": "localAgentModeTrustedFolders", "default": [] }
58
+ },
59
+
60
+ "bgEnvStrip": {
61
+ "$comment": "Vars Cowork strips before spawning the in-VM agent (≥2.1.160 = 12 vars incl CLAUDE_CODE_OAUTH_TOKEN). `count` is the total; `knownVars` names only 5 of the 12. The harness mirrors this strip so skills can't rely on host env that Cowork removes.",
62
+ "count": 12,
63
+ "knownVars": [
64
+ "CLAUDE_CODE_OAUTH_TOKEN",
65
+ "CLAUDE_CODE_SESSION_KIND",
66
+ "CLAUDE_CODE_SESSION_ID",
67
+ "CLAUDE_CODE_SESSION_NAME",
68
+ "CLAUDE_CODE_SESSION_LOG"
69
+ ]
70
+ },
71
+
72
+ "provenance": {
73
+ "asarPath": "/Applications/Claude.app/Contents/Resources/app.asar",
74
+ "asarFingerprint": "TODO:computed-by-sync",
75
+ "eipcChannelUuid": "4f426349-8d6f-45f3-ae22-280fef323564",
76
+ "$comment": "eipcChannelUuid is per-build; recorded for provenance only — the harness does not use Desktop IPC."
77
+ }
78
+ }