cowork-harness 0.4.1 → 0.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/skills/cowork-harness/scripts/assertion-keys.json +30 -0
- package/.claude/skills/cowork-harness/scripts/scenario.py +89 -21
- package/CHANGELOG.md +24 -0
- package/baselines/desktop-1.13576.1.json +220 -0
- package/dist/types.js +2 -1
- package/package.json +3 -1
- package/python/test_scenario_lint.py +75 -0
- package/scripts/gen-schema.ts +23 -1
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$comment": "GENERATED from the Zod Assertion schema (src/types.ts) by scripts/gen-schema.ts — do not edit; run `npm run schema`.",
|
|
3
|
+
"keys": [
|
|
4
|
+
"allow_permissive_auto_allow",
|
|
5
|
+
"artifact_json",
|
|
6
|
+
"dispatch_count_max",
|
|
7
|
+
"egress_allowed",
|
|
8
|
+
"egress_denied",
|
|
9
|
+
"file_exists",
|
|
10
|
+
"gate_answers_delivered",
|
|
11
|
+
"no_delete_in_outputs",
|
|
12
|
+
"question_asked",
|
|
13
|
+
"questions_count_max",
|
|
14
|
+
"replay_protocol_fidelity",
|
|
15
|
+
"result",
|
|
16
|
+
"self_heal_ran",
|
|
17
|
+
"subagent_declared_but_unused",
|
|
18
|
+
"subagent_dispatched",
|
|
19
|
+
"subagent_tool_absent",
|
|
20
|
+
"subagent_tool_used",
|
|
21
|
+
"tool_called",
|
|
22
|
+
"tool_not_called",
|
|
23
|
+
"transcript_contains",
|
|
24
|
+
"transcript_matches",
|
|
25
|
+
"transcript_no_host_path",
|
|
26
|
+
"transcript_not_contains",
|
|
27
|
+
"transcript_not_matches",
|
|
28
|
+
"user_visible_artifact"
|
|
29
|
+
]
|
|
30
|
+
}
|
|
@@ -55,19 +55,55 @@ CONTENT_KEYS = {
|
|
|
55
55
|
}
|
|
56
56
|
# content keys, but only evaluated on replay when the cassette carries controlOut
|
|
57
57
|
GATE_KEYS = {"question_asked", "questions_count_max", "gate_answers_delivered"}
|
|
58
|
-
#
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
58
|
+
# manifest-backed: replay-checkable when the cassette carries an `artifacts` manifest (record snapshots one);
|
|
59
|
+
# a manifest-less cassette skips them. Since the 0.3.0 artifact-manifest, these are no longer always live-only.
|
|
60
|
+
MANIFEST_KEYS = {"file_exists", "user_visible_artifact", "artifact_json"}
|
|
61
|
+
# live-only: ALWAYS silently skipped on replay (no filesystem, no network on the token-free lane)
|
|
62
|
+
LIVE_ONLY_KEYS = {
|
|
63
|
+
"egress_denied",
|
|
64
|
+
"egress_allowed",
|
|
62
65
|
"no_delete_in_outputs",
|
|
63
66
|
"self_heal_ran",
|
|
64
67
|
"transcript_no_host_path",
|
|
65
|
-
"egress_denied",
|
|
66
|
-
"egress_allowed",
|
|
67
68
|
}
|
|
68
69
|
EGRESS_KEYS = {"egress_denied", "egress_allowed"}
|
|
69
|
-
#
|
|
70
|
-
|
|
70
|
+
# verdict modifiers — don't verify anything themselves (e.g. suppress a default-fail)
|
|
71
|
+
VERDICT_MODIFIER_KEYS = {"allow_permissive_auto_allow"}
|
|
72
|
+
|
|
73
|
+
# Every key the replay-class logic knows how to handle. `replay_protocol_fidelity` is valid-but-not-authorable
|
|
74
|
+
# (errored separately below). This is also the embedded fallback for ASSERT_KEYS — kept EQUAL to the generated
|
|
75
|
+
# list (test-enforced) so a missing assertion-keys.json can't silently reintroduce key drift.
|
|
76
|
+
_CLASSIFIED_KEYS = CONTENT_KEYS | GATE_KEYS | MANIFEST_KEYS | LIVE_ONLY_KEYS | VERDICT_MODIFIER_KEYS | {"replay_protocol_fidelity"}
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _load_assert_keys():
|
|
80
|
+
"""The authoritative `assert:` key set, generated from the Zod Assertion schema into a sibling
|
|
81
|
+
`assertion-keys.json` (so the unknown-key check can't drift). Falls back to the embedded
|
|
82
|
+
`_CLASSIFIED_KEYS` (kept equal to the generated list) with a loud warning if the file is missing, unreadable, or malformed (any exception while loading it)."""
|
|
83
|
+
p = Path(__file__).resolve().parent / "assertion-keys.json"
|
|
84
|
+
try:
|
|
85
|
+
return set(json.loads(p.read_text(encoding="utf-8"))["keys"])
|
|
86
|
+
except Exception:
|
|
87
|
+
print(
|
|
88
|
+
f"::warning:: assertion-keys.json not found next to scenario.py ({p}) — "
|
|
89
|
+
"using a built-in key list that may be stale (run `npm run schema`).",
|
|
90
|
+
file=sys.stderr,
|
|
91
|
+
)
|
|
92
|
+
return set(_CLASSIFIED_KEYS)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
# every valid key inside an `assert:` list item (generated from the zod schema; see _load_assert_keys)
|
|
96
|
+
ASSERT_KEYS = _load_assert_keys()
|
|
97
|
+
|
|
98
|
+
# Self-check: every valid assertion key must be classified, else the replay-class lint logic mishandles it.
|
|
99
|
+
# Surfaced loudly at load AND as a lint ERROR in cmd_lint (so it fails the gate and is reflected in --strict / the exit code). Never sys.exit here.
|
|
100
|
+
UNCLASSIFIED_KEYS = sorted(ASSERT_KEYS - _CLASSIFIED_KEYS)
|
|
101
|
+
if UNCLASSIFIED_KEYS:
|
|
102
|
+
print(
|
|
103
|
+
f"::warning:: scenario.py: assertion key(s) {UNCLASSIFIED_KEYS} are in the schema but not classified "
|
|
104
|
+
"— add them to the linter's CONTENT/GATE/MANIFEST/LIVE_ONLY/VERDICT_MODIFIER sets.",
|
|
105
|
+
file=sys.stderr,
|
|
106
|
+
)
|
|
71
107
|
# every valid top-level scenario key
|
|
72
108
|
TOP_LEVEL_KEYS = {
|
|
73
109
|
"name",
|
|
@@ -232,40 +268,59 @@ def lint_doc(doc, path, raw_lines):
|
|
|
232
268
|
)
|
|
233
269
|
)
|
|
234
270
|
|
|
235
|
-
# W:
|
|
271
|
+
# W: nothing replay-checkable → a replay PR gate verifies nothing. Content/gate are replay-checkable, and
|
|
272
|
+
# manifest-backed keys are too WHEN the cassette carries an artifacts manifest — so only an all-live-only
|
|
273
|
+
# (egress / no_delete / self_heal / host-path) assert set genuinely no-ops on replay.
|
|
236
274
|
if items:
|
|
237
|
-
|
|
238
|
-
if not
|
|
275
|
+
replay_checkable = bool(assert_keys & (CONTENT_KEYS | GATE_KEYS | MANIFEST_KEYS))
|
|
276
|
+
if not replay_checkable:
|
|
239
277
|
findings.append(
|
|
240
278
|
Finding(
|
|
241
279
|
"WARN",
|
|
242
280
|
"replay-noop",
|
|
243
|
-
"every assertion is
|
|
244
|
-
"
|
|
245
|
-
"
|
|
281
|
+
"every assertion is live-only (egress / no_delete_in_outputs / self_heal_ran / "
|
|
282
|
+
"transcript_no_host_path) — on the token-free `replay` lane they are ALL silently "
|
|
283
|
+
"skipped, so a replay PR gate would verify nothing.",
|
|
284
|
+
"Add a content assertion (result / transcript_* / tool_* / subagent_*) or a "
|
|
285
|
+
"manifest-backed one (file_exists / user_visible_artifact / artifact_json), or run this "
|
|
246
286
|
"scenario only on the live (run/record) lane.",
|
|
247
287
|
path,
|
|
248
288
|
)
|
|
249
289
|
)
|
|
250
290
|
|
|
251
|
-
# W: mixed-class assert item →
|
|
291
|
+
# W: mixed-class assert item → the live-only half is dropped on replay (manifest-backed keys are NOT)
|
|
252
292
|
for idx, item in enumerate(items):
|
|
253
293
|
ks = set(item.keys())
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
if
|
|
294
|
+
kept_half = ks & (CONTENT_KEYS | GATE_KEYS | MANIFEST_KEYS)
|
|
295
|
+
live_half = ks & LIVE_ONLY_KEYS
|
|
296
|
+
if kept_half and live_half:
|
|
257
297
|
findings.append(
|
|
258
298
|
Finding(
|
|
259
299
|
"WARN",
|
|
260
300
|
"mixed-assert-item",
|
|
261
|
-
f"assert item #{idx} mixes
|
|
262
|
-
f"
|
|
263
|
-
"
|
|
301
|
+
f"assert item #{idx} mixes replay-checkable {sorted(kept_half)} with "
|
|
302
|
+
f"live-only {sorted(live_half)} — on replay the live-only half is dropped "
|
|
303
|
+
"(only the replay-checkable half is evaluated).",
|
|
264
304
|
"Split into separate list items: one per concern.",
|
|
265
305
|
path,
|
|
266
306
|
)
|
|
267
307
|
)
|
|
268
308
|
|
|
309
|
+
# I: manifest-backed keys need an artifacts manifest on replay
|
|
310
|
+
manifest_present = sorted(assert_keys & MANIFEST_KEYS)
|
|
311
|
+
if manifest_present:
|
|
312
|
+
findings.append(
|
|
313
|
+
Finding(
|
|
314
|
+
"INFO",
|
|
315
|
+
"manifest-needs-snapshot",
|
|
316
|
+
f"assertion(s) {manifest_present} evaluate on replay only when the cassette carries an "
|
|
317
|
+
"`artifacts` manifest (`record` snapshots one). A manifest-less cassette skips them "
|
|
318
|
+
"(with a loud warning).",
|
|
319
|
+
"Record with a current harness so the cassette carries the artifacts manifest.",
|
|
320
|
+
path,
|
|
321
|
+
)
|
|
322
|
+
)
|
|
323
|
+
|
|
269
324
|
# I: gate keys need a controlOut cassette on replay
|
|
270
325
|
gate_present = sorted(assert_keys & GATE_KEYS)
|
|
271
326
|
if gate_present:
|
|
@@ -361,6 +416,19 @@ def _print_findings(findings, n_files):
|
|
|
361
416
|
|
|
362
417
|
def cmd_lint(args):
|
|
363
418
|
all_findings = []
|
|
419
|
+
# Linter self-check (B3): a valid schema key the replay-class sets don't classify can't be linted
|
|
420
|
+
# correctly — surface it as a hard ERROR so it fails the gate (and --strict) until someone classifies it.
|
|
421
|
+
if UNCLASSIFIED_KEYS:
|
|
422
|
+
all_findings.append(
|
|
423
|
+
Finding(
|
|
424
|
+
"ERROR",
|
|
425
|
+
"linter-unclassified-key",
|
|
426
|
+
f"linter is out of date: assertion key(s) {UNCLASSIFIED_KEYS} are valid (in the schema) but "
|
|
427
|
+
"scenario.py doesn't classify their replay behavior, so they can't be linted.",
|
|
428
|
+
"Add them to the linter's CONTENT/GATE/MANIFEST/LIVE_ONLY/VERDICT_MODIFIER sets.",
|
|
429
|
+
"(scenario.py)",
|
|
430
|
+
)
|
|
431
|
+
)
|
|
364
432
|
for f in args.files:
|
|
365
433
|
all_findings.extend(lint_file(f))
|
|
366
434
|
if args.json:
|
package/CHANGELOG.md
CHANGED
|
@@ -6,6 +6,30 @@ All notable changes to this project are documented here. The format is based on
|
|
|
6
6
|
|
|
7
7
|
## [Unreleased]
|
|
8
8
|
|
|
9
|
+
## [0.4.3] — 2026-06-18
|
|
10
|
+
|
|
11
|
+
### Fixed
|
|
12
|
+
|
|
13
|
+
- **`cowork-harness lint` no longer flags `artifact_json` / `allow_permissive_auto_allow` as unknown keys.**
|
|
14
|
+
The linter's assertion-key list is now **generated from the Zod `Assertion` schema** (the same source
|
|
15
|
+
`assert --list` uses) into a file shipped next to `scenario.py`, with a CI drift-guard — so it can't lag the
|
|
16
|
+
schema again. Its replay-class warnings were also reconciled with the 0.3.0 artifact-manifest: `file_exists`,
|
|
17
|
+
`user_visible_artifact`, and `artifact_json` are now treated as **manifest-backed** (replay-checkable when the
|
|
18
|
+
cassette carries an `artifacts` manifest) rather than always-skipped, so a scenario asserting only those is no
|
|
19
|
+
longer a false `replay-noop`. A self-check fails the linter if a future schema key isn't classified.
|
|
20
|
+
|
|
21
|
+
### Internal
|
|
22
|
+
|
|
23
|
+
- The npm tarball no longer ships `docs/internal/` (internal planning docs were being published).
|
|
24
|
+
|
|
25
|
+
### Added
|
|
26
|
+
|
|
27
|
+
- **Platform baseline `desktop-1.13576.1`** — synced from the updated Claude Desktop (the app moved
|
|
28
|
+
`1.12603.1` → `1.13576.1`). `loadBaseline("latest")` now resolves to it. The embedded agent binary is
|
|
29
|
+
unchanged at `2.1.177` (the update changed the app shell + gate states, not the agent ELF); this baseline
|
|
30
|
+
also corrects the prior baselines' stale `2.1.170` agent pin to the actually-staged `2.1.177`. Egress
|
|
31
|
+
allowlist unchanged.
|
|
32
|
+
|
|
9
33
|
## [0.4.1] — 2026-06-18
|
|
10
34
|
|
|
11
35
|
### Fixed
|
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
{
|
|
2
|
+
"baselineVersion": 1,
|
|
3
|
+
"appVersion": "1.13576.1",
|
|
4
|
+
"agentVersion": "2.1.177",
|
|
5
|
+
"agentBinary": {
|
|
6
|
+
"stagedPath": "~/Library/Application Support/Claude/claude-code-vm/2.1.177/claude",
|
|
7
|
+
"format": "elf-aarch64"
|
|
8
|
+
},
|
|
9
|
+
"guest": {
|
|
10
|
+
"os": "linux",
|
|
11
|
+
"arch": "arm64",
|
|
12
|
+
"baseImage": "ubuntu:22.04"
|
|
13
|
+
},
|
|
14
|
+
"spawn": {
|
|
15
|
+
"configDirInGuest": "mnt/.claude",
|
|
16
|
+
"settingSources": [
|
|
17
|
+
"user"
|
|
18
|
+
],
|
|
19
|
+
"permissionMode": "default",
|
|
20
|
+
"maxThinkingTokens": 31999,
|
|
21
|
+
"effortDefault": "medium",
|
|
22
|
+
"tools": [
|
|
23
|
+
"Task",
|
|
24
|
+
"Bash",
|
|
25
|
+
"Glob",
|
|
26
|
+
"Grep",
|
|
27
|
+
"Read",
|
|
28
|
+
"Edit",
|
|
29
|
+
"Write",
|
|
30
|
+
"NotebookEdit",
|
|
31
|
+
"WebFetch",
|
|
32
|
+
"TaskCreate",
|
|
33
|
+
"TaskUpdate",
|
|
34
|
+
"TaskGet",
|
|
35
|
+
"TaskList",
|
|
36
|
+
"TaskStop",
|
|
37
|
+
"WebSearch",
|
|
38
|
+
"Skill",
|
|
39
|
+
"REPL",
|
|
40
|
+
"JavaScript",
|
|
41
|
+
"AskUserQuestion",
|
|
42
|
+
"ToolSearch"
|
|
43
|
+
],
|
|
44
|
+
"allowedTools": [
|
|
45
|
+
"Task",
|
|
46
|
+
"Bash",
|
|
47
|
+
"Glob",
|
|
48
|
+
"Grep",
|
|
49
|
+
"Read",
|
|
50
|
+
"Edit",
|
|
51
|
+
"Write",
|
|
52
|
+
"NotebookEdit",
|
|
53
|
+
"WebFetch",
|
|
54
|
+
"TaskCreate",
|
|
55
|
+
"TaskUpdate",
|
|
56
|
+
"TaskGet",
|
|
57
|
+
"TaskList",
|
|
58
|
+
"TaskStop",
|
|
59
|
+
"WebSearch",
|
|
60
|
+
"Skill",
|
|
61
|
+
"REPL",
|
|
62
|
+
"JavaScript",
|
|
63
|
+
"ToolSearch"
|
|
64
|
+
],
|
|
65
|
+
"env": {
|
|
66
|
+
"CLAUDE_CODE_IS_COWORK": "1",
|
|
67
|
+
"CLAUDE_CODE_ENTRYPOINT": "local-agent",
|
|
68
|
+
"CLAUDE_CODE_TAGS": "lam_session_type:chat",
|
|
69
|
+
"CLAUDE_CODE_PROVIDER_MANAGED_BY_HOST": "1",
|
|
70
|
+
"CLAUDE_CODE_ENABLE_ASK_USER_QUESTION_TOOL": "true",
|
|
71
|
+
"CLAUDE_CODE_DISABLE_CRON": "1",
|
|
72
|
+
"CLAUDE_CODE_DISABLE_BACKGROUND_TASKS": "1",
|
|
73
|
+
"CLAUDE_CODE_DISABLE_AGENTS_FLEET": "1",
|
|
74
|
+
"CLAUDE_CODE_ENABLE_APPEND_SUBAGENT_PROMPT": "1",
|
|
75
|
+
"CLAUDE_CODE_ENABLE_TASKS": "true",
|
|
76
|
+
"CLAUDE_CODE_DISABLE_TERMINAL_TITLE": "1",
|
|
77
|
+
"ENABLE_PROMPT_CACHING_1H": "1",
|
|
78
|
+
"DISABLE_MICROCOMPACT": "1",
|
|
79
|
+
"MCP_CONNECTION_NONBLOCKING": "true"
|
|
80
|
+
},
|
|
81
|
+
"promptTemplate": "prompts/desktop-1.12603.1/system-prompt-append.md",
|
|
82
|
+
"subagentAppend": "prompts/desktop-1.12603.1/subagent-append-vm.md",
|
|
83
|
+
"$comment": "Binary-verified Desktop->agent spawn contract (asar 1.12603.1). See docs/cowork-spawn-contract-1.12603.1.md.",
|
|
84
|
+
"$comment_notSet": "Deliberately NOT set: CLAUDE_CODE_USE_COWORK_PLUGINS (Desktop never sets it; would flip the agent to cowork_settings.json/cowork_plugins). Host-derived (TZ, account UUIDs, WORKSPACE_HOST_PATHS, OTEL) injected at runtime, not pinned.",
|
|
85
|
+
"$comment_prompts": "Reconstructed cowork-specific sections (not the full base prompt — not cleanly extractable). The main append is delivered via the --append-system-prompt CLI flag (layered on the agent's built-in base prompt), NOT the initialize handshake; only the subagent append goes over initialize (appendSubagentSystemPrompt), gated on CLAUDE_CODE_ENABLE_APPEND_SUBAGENT_PROMPT."
|
|
86
|
+
},
|
|
87
|
+
"mountLayout": {
|
|
88
|
+
"sessionRoot": "/sessions/{sessionId}",
|
|
89
|
+
"cwd": "/sessions/{sessionId}",
|
|
90
|
+
"mntRoot": "/sessions/{sessionId}/mnt",
|
|
91
|
+
"mounts": [
|
|
92
|
+
{
|
|
93
|
+
"name": "uploads",
|
|
94
|
+
"mountPath": "uploads",
|
|
95
|
+
"mode": "r",
|
|
96
|
+
"purpose": "user-uploaded files (read-only — asar 'ro')"
|
|
97
|
+
},
|
|
98
|
+
{
|
|
99
|
+
"name": "projects",
|
|
100
|
+
"mountPath": ".projects/{projectId}",
|
|
101
|
+
"mode": "rw",
|
|
102
|
+
"purpose": "selected work folders (a Space) — delete denied by default (asar IX)"
|
|
103
|
+
},
|
|
104
|
+
{
|
|
105
|
+
"name": "local-plugins",
|
|
106
|
+
"mountPath": ".local-plugins/cache",
|
|
107
|
+
"mode": "r",
|
|
108
|
+
"purpose": "marketplace skills/plugins, runtime-discovered"
|
|
109
|
+
},
|
|
110
|
+
{
|
|
111
|
+
"name": "remote-plugins",
|
|
112
|
+
"mountPath": ".remote-plugins",
|
|
113
|
+
"mode": "r",
|
|
114
|
+
"purpose": "org-remote plugins, runtime-discovered"
|
|
115
|
+
},
|
|
116
|
+
{
|
|
117
|
+
"name": "outputs",
|
|
118
|
+
"mountPath": "outputs",
|
|
119
|
+
"mode": "rw",
|
|
120
|
+
"purpose": "session outputs/artifacts — delete denied by default (asar IX); rwd only when approved"
|
|
121
|
+
}
|
|
122
|
+
]
|
|
123
|
+
},
|
|
124
|
+
"network": {
|
|
125
|
+
"mode": "gvisor",
|
|
126
|
+
"allowKind": "allowlist",
|
|
127
|
+
"allowDomains": [
|
|
128
|
+
"sentry.io",
|
|
129
|
+
"preview.claude.ai",
|
|
130
|
+
"downloads.claude.ai",
|
|
131
|
+
"api.anthropic.com",
|
|
132
|
+
"a-cdn.anthropic.com",
|
|
133
|
+
"a-api.anthropic.com",
|
|
134
|
+
"console.anthropic.com",
|
|
135
|
+
"api-staging.anthropic.com",
|
|
136
|
+
"www.anthropic.com",
|
|
137
|
+
"docs.anthropic.com",
|
|
138
|
+
"mcp-proxy.anthropic.com",
|
|
139
|
+
"pivot.claude.ai",
|
|
140
|
+
"support.anthropic.com",
|
|
141
|
+
"assets.claude.ai"
|
|
142
|
+
]
|
|
143
|
+
},
|
|
144
|
+
"bgEnvStrip": {
|
|
145
|
+
"knownVars": [
|
|
146
|
+
"CLAUDE_CODE_OAUTH_TOKEN",
|
|
147
|
+
"CLAUDE_CODE_SESSION_KIND",
|
|
148
|
+
"CLAUDE_CODE_SESSION_ID",
|
|
149
|
+
"CLAUDE_CODE_SESSION_NAME",
|
|
150
|
+
"CLAUDE_CODE_SESSION_LOG"
|
|
151
|
+
]
|
|
152
|
+
},
|
|
153
|
+
"$comment": "Platform baseline auto-derived by `cowork-harness sync` from a live Claude Desktop install + app.asar. VOLATILE per-release facts only. Regenerate per release; review the diff. Captured 2026-06-12 on macOS arm64.",
|
|
154
|
+
"capturedAt": "2026-06-17",
|
|
155
|
+
"platform": "darwin-arm64",
|
|
156
|
+
"settings": {
|
|
157
|
+
"autoMountFolders": {
|
|
158
|
+
"key": "autoMountFolders",
|
|
159
|
+
"default": false
|
|
160
|
+
},
|
|
161
|
+
"localAgentModeTrustedFolders": {
|
|
162
|
+
"key": "localAgentModeTrustedFolders",
|
|
163
|
+
"default": []
|
|
164
|
+
}
|
|
165
|
+
},
|
|
166
|
+
"provenance": {
|
|
167
|
+
"asarPath": "/Applications/Claude.app/Contents/Resources/app.asar",
|
|
168
|
+
"asarFingerprint": "88b6968a8a249dbf",
|
|
169
|
+
"gates": {
|
|
170
|
+
"$comment": "Production GrowthBook gate states decoded from ~/Library/Application Support/Claude/fcache (standard interactive Anthropic account, 2026-06-13; binary-verified app.asar 1.12603.1). Pin per release. Behavior-affecting gates the harness models: 1143815894 (loop), 1648655587 (dispatch cap), 1978029737 (web_fetch routing). Telemetry/auth-internal gates omitted.",
|
|
171
|
+
"bridgeSdkTransport:583857784": {
|
|
172
|
+
"on": true,
|
|
173
|
+
"source": "force",
|
|
174
|
+
"value": true,
|
|
175
|
+
"note": "— Cowork uses the SDK-based transport (control protocol), confirming the harness's sdkMcpServers/mcp_message path is the production transport."
|
|
176
|
+
},
|
|
177
|
+
"hostLoop:1143815894": {
|
|
178
|
+
"on": true,
|
|
179
|
+
"source": "force",
|
|
180
|
+
"value": true
|
|
181
|
+
},
|
|
182
|
+
"taskDispatchLimiter:1648655587": {
|
|
183
|
+
"on": true,
|
|
184
|
+
"source": "force",
|
|
185
|
+
"value": {
|
|
186
|
+
"perTask": 1,
|
|
187
|
+
"global": 3
|
|
188
|
+
},
|
|
189
|
+
"note": "perTask=1 global=3 (host-side SKIP: recordSkipAndEmit/GCA.PerTaskLimit — NOT queue/deny). A dispatch session launches <=1 sub-task; <=3 concurrent globally."
|
|
190
|
+
},
|
|
191
|
+
"coworkRuntimeConfig:1978029737": {
|
|
192
|
+
"on": true,
|
|
193
|
+
"source": "force",
|
|
194
|
+
"value": {
|
|
195
|
+
"sessionsBridgePollBlockMs": 30,
|
|
196
|
+
"coworkNativeFilePreview": true,
|
|
197
|
+
"coworkWebFetchViaApi": true,
|
|
198
|
+
"coworkWebFetchPrompt": true,
|
|
199
|
+
"workspaceBashWaitLonger": true
|
|
200
|
+
},
|
|
201
|
+
"note": "coworkWebFetchViaApi=true coworkWebFetchPrompt=true workspaceBashWaitLonger=true sessionsBridgePollBlockMs=30 — web_fetch is host/API-routed (POST /api/organizations/<org>/cowork/web_fetch), NOT container egress; gated by a separate web-fetch hostname allowlist + URL provenance."
|
|
202
|
+
},
|
|
203
|
+
"cliPlugin:2307090146": {
|
|
204
|
+
"on": false,
|
|
205
|
+
"source": "defaultValue",
|
|
206
|
+
"value": false,
|
|
207
|
+
"note": "— the CLI-plugin credential broker is dark-launched off for standard interactive accounts (Ch23/L106)."
|
|
208
|
+
},
|
|
209
|
+
"pluginSyncSparkplug:2340532315": {
|
|
210
|
+
"on": true,
|
|
211
|
+
"source": "force",
|
|
212
|
+
"value": true,
|
|
213
|
+
"note": "— startup syncPlugins(); plugins load via --plugin-dir (registry inert in-VM)."
|
|
214
|
+
}
|
|
215
|
+
},
|
|
216
|
+
"eipcChannelUuid": "4f426349-8d6f-45f3-ae22-280fef323564",
|
|
217
|
+
"$comment": "eipcChannelUuid is per-build; recorded for provenance only — the harness does not use Desktop IPC."
|
|
218
|
+
},
|
|
219
|
+
"requireFullVmSandbox": null
|
|
220
|
+
}
|
package/dist/types.js
CHANGED
|
@@ -142,7 +142,8 @@ export const Assertion = z.object({
|
|
|
142
142
|
.describe("(replay-only, NOT authorable) serializeDecision output matched the frozen recording — the token-free O7 guard; synthesized by the replay lane and rejected if written in a scenario"),
|
|
143
143
|
// #5: assert over the CONTENTS of a JSON artifact via a dotted path. `absent` and `is_null` are DISTINCT
|
|
144
144
|
// (key-missing vs present-null); an unresolved INTERMEDIATE segment fails loud (malformed artifact),
|
|
145
|
-
// never a vacuous pass.
|
|
145
|
+
// never a vacuous pass. Manifest-backed: evaluated on replay when the cassette carries an `artifacts`
|
|
146
|
+
// manifest (`record` snapshots one); a manifest-less cassette skips it (with a loud warning).
|
|
146
147
|
artifact_json: z
|
|
147
148
|
.object({
|
|
148
149
|
artifact: z.string().describe("relative path to a JSON artifact under the work root (e.g. outputs/cap_state.json)"),
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "cowork-harness",
|
|
3
|
-
"version": "0.4.
|
|
3
|
+
"version": "0.4.3",
|
|
4
4
|
"description": "Scriptable, CI-friendly harness for Claude Cowork's runtime contract for testing skills across scenarios — same agent, mounts, egress allowlist, permission protocol, and sandbox limitations.",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"type": "module",
|
|
@@ -34,9 +34,11 @@
|
|
|
34
34
|
"docker",
|
|
35
35
|
"schema",
|
|
36
36
|
"docs",
|
|
37
|
+
"!docs/internal",
|
|
37
38
|
"python",
|
|
38
39
|
"scripts",
|
|
39
40
|
".claude/skills/cowork-harness/scripts/scenario.py",
|
|
41
|
+
".claude/skills/cowork-harness/scripts/assertion-keys.json",
|
|
40
42
|
".env.example",
|
|
41
43
|
"README.md",
|
|
42
44
|
"CHANGELOG.md",
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
"""Tests for the bundled linter (scenario.py): its assertion-key list is generated from the Zod schema
|
|
2
|
+
(no drift), and its replay-class warnings account for manifest-backed assertions.
|
|
3
|
+
|
|
4
|
+
Run via the repo's pytest lane: `pytest -m 'not cowork'` from python/.
|
|
5
|
+
"""
|
|
6
|
+
import importlib.util
|
|
7
|
+
import json
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
REPO = Path(__file__).resolve().parents[1]
|
|
11
|
+
SCENARIO_PY = REPO / ".claude/skills/cowork-harness/scripts/scenario.py"
|
|
12
|
+
KEYS_JSON = REPO / ".claude/skills/cowork-harness/scripts/assertion-keys.json"
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _load_scenario_module():
|
|
16
|
+
spec = importlib.util.spec_from_file_location("scenario_lint_under_test", SCENARIO_PY)
|
|
17
|
+
mod = importlib.util.module_from_spec(spec)
|
|
18
|
+
spec.loader.exec_module(mod)
|
|
19
|
+
return mod
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
scenario = _load_scenario_module()
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _rules(yaml_body, tmp_path):
|
|
26
|
+
f = tmp_path / "sc.yaml"
|
|
27
|
+
f.write_text(
|
|
28
|
+
"name: t\nbaseline: latest\nsession: (inline)\nfidelity: container\nprompt: hi\n" + yaml_body,
|
|
29
|
+
encoding="utf-8",
|
|
30
|
+
)
|
|
31
|
+
return {fnd.rule for fnd in scenario.lint_file(str(f))}
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def test_assert_keys_loaded_from_generated_file():
|
|
35
|
+
generated = set(json.loads(KEYS_JSON.read_text(encoding="utf-8"))["keys"])
|
|
36
|
+
assert scenario.ASSERT_KEYS == generated
|
|
37
|
+
# the two keys that used to drift are present
|
|
38
|
+
assert {"artifact_json", "allow_permissive_auto_allow"} <= scenario.ASSERT_KEYS
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def test_embedded_fallback_equals_generated_list():
|
|
42
|
+
# the in-code fallback must equal the generated list, else a missing file silently reintroduces drift
|
|
43
|
+
generated = set(json.loads(KEYS_JSON.read_text(encoding="utf-8"))["keys"])
|
|
44
|
+
assert scenario._CLASSIFIED_KEYS == generated
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def test_every_key_is_classified_self_check():
|
|
48
|
+
assert scenario.UNCLASSIFIED_KEYS == []
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def test_artifact_json_is_not_unknown(tmp_path):
|
|
52
|
+
rules = _rules("assert:\n - artifact_json: {artifact: outputs/x.json, path: a, equals: 1}\n", tmp_path)
|
|
53
|
+
assert "unknown-assert-key" not in rules
|
|
54
|
+
assert "manifest-needs-snapshot" in rules # it IS manifest-backed on replay
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def test_allow_permissive_auto_allow_is_not_unknown(tmp_path):
|
|
58
|
+
rules = _rules("assert:\n - allow_permissive_auto_allow: true\n", tmp_path)
|
|
59
|
+
assert "unknown-assert-key" not in rules
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def test_file_exists_only_is_not_replay_noop(tmp_path):
|
|
63
|
+
rules = _rules("assert:\n - file_exists: outputs/x.md\n", tmp_path)
|
|
64
|
+
assert "replay-noop" not in rules # manifest-backed → replay-checkable with a manifest
|
|
65
|
+
assert "manifest-needs-snapshot" in rules
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def test_egress_only_is_replay_noop(tmp_path):
|
|
69
|
+
rules = _rules("assert:\n - egress_denied: evil.com\n", tmp_path)
|
|
70
|
+
assert "replay-noop" in rules # truly live-only → skipped on replay
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def test_invented_key_still_flagged(tmp_path):
|
|
74
|
+
rules = _rules("assert:\n - file_not_empty: outputs/x\n", tmp_path)
|
|
75
|
+
assert "unknown-assert-key" in rules
|
package/scripts/gen-schema.ts
CHANGED
|
@@ -9,11 +9,31 @@ import { mkdirSync, writeFileSync } from "node:fs";
|
|
|
9
9
|
import { dirname, join } from "node:path";
|
|
10
10
|
import { fileURLToPath, pathToFileURL } from "node:url";
|
|
11
11
|
import { zodToJsonSchema } from "zod-to-json-schema";
|
|
12
|
-
import { ScenarioObject } from "../src/types.js";
|
|
12
|
+
import { ScenarioObject, Assertion } from "../src/types.js";
|
|
13
13
|
import { SessionConfig } from "../src/session.js";
|
|
14
14
|
|
|
15
15
|
const REPO_ROOT = join(dirname(fileURLToPath(import.meta.url)), "..");
|
|
16
16
|
export const SCHEMA_DIR = join(REPO_ROOT, "schema");
|
|
17
|
+
/** The bundled linter (`scenario.py`) reads this for its assertion-key list. It lives NEXT TO scenario.py
|
|
18
|
+
* (not under schema/) because schema/ is not shipped inside the plugin tree — only the skill's scripts dir
|
|
19
|
+
* is. Writer + the drift-guard test both reference this one constant. */
|
|
20
|
+
export const ASSERTION_KEYS_PATH = join(REPO_ROOT, ".claude/skills/cowork-harness/scripts/assertion-keys.json");
|
|
21
|
+
|
|
22
|
+
/** The authoritative assertion-key list, derived from the Zod `Assertion` schema (the same source
|
|
23
|
+
* `assert --list` reads). Generating it keeps `scenario.py`'s unknown-key check from drifting. */
|
|
24
|
+
export function buildAssertionKeys(): string {
|
|
25
|
+
return (
|
|
26
|
+
JSON.stringify(
|
|
27
|
+
{
|
|
28
|
+
$comment:
|
|
29
|
+
"GENERATED from the Zod Assertion schema (src/types.ts) by scripts/gen-schema.ts — do not edit; run `npm run schema`.",
|
|
30
|
+
keys: Object.keys(Assertion.shape).sort(),
|
|
31
|
+
},
|
|
32
|
+
null,
|
|
33
|
+
2,
|
|
34
|
+
) + "\n"
|
|
35
|
+
);
|
|
36
|
+
}
|
|
17
37
|
|
|
18
38
|
const TARGETS = [
|
|
19
39
|
{
|
|
@@ -54,6 +74,8 @@ function main(): void {
|
|
|
54
74
|
writeFileSync(join(SCHEMA_DIR, file), body);
|
|
55
75
|
process.stdout.write(`wrote schema/${file}\n`);
|
|
56
76
|
}
|
|
77
|
+
writeFileSync(ASSERTION_KEYS_PATH, buildAssertionKeys());
|
|
78
|
+
process.stdout.write(`wrote ${ASSERTION_KEYS_PATH}\n`);
|
|
57
79
|
}
|
|
58
80
|
|
|
59
81
|
// Run only when invoked directly (so the test can import buildSchemas without side effects).
|