cowork-harness 0.4.2 → 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,30 @@
1
+ {
2
+ "$comment": "GENERATED from the Zod Assertion schema (src/types.ts) by scripts/gen-schema.ts — do not edit; run `npm run schema`.",
3
+ "keys": [
4
+ "allow_permissive_auto_allow",
5
+ "artifact_json",
6
+ "dispatch_count_max",
7
+ "egress_allowed",
8
+ "egress_denied",
9
+ "file_exists",
10
+ "gate_answers_delivered",
11
+ "no_delete_in_outputs",
12
+ "question_asked",
13
+ "questions_count_max",
14
+ "replay_protocol_fidelity",
15
+ "result",
16
+ "self_heal_ran",
17
+ "subagent_declared_but_unused",
18
+ "subagent_dispatched",
19
+ "subagent_tool_absent",
20
+ "subagent_tool_used",
21
+ "tool_called",
22
+ "tool_not_called",
23
+ "transcript_contains",
24
+ "transcript_matches",
25
+ "transcript_no_host_path",
26
+ "transcript_not_contains",
27
+ "transcript_not_matches",
28
+ "user_visible_artifact"
29
+ ]
30
+ }
@@ -55,19 +55,55 @@ CONTENT_KEYS = {
55
55
  }
56
56
  # content keys, but only evaluated on replay when the cassette carries controlOut
57
57
  GATE_KEYS = {"question_asked", "questions_count_max", "gate_answers_delivered"}
58
- # live-only: silently skipped on replay (no filesystem, no network)
59
- FS_EGRESS_KEYS = {
60
- "file_exists",
61
- "user_visible_artifact",
58
+ # manifest-backed: replay-checkable when the cassette carries an `artifacts` manifest (record snapshots one);
59
+ # a manifest-less cassette skips them. Since the 0.3.0 artifact-manifest these are NOT always live-only.
60
+ MANIFEST_KEYS = {"file_exists", "user_visible_artifact", "artifact_json"}
61
+ # live-only: ALWAYS silently skipped on replay (no filesystem, no network on the token-free lane)
62
+ LIVE_ONLY_KEYS = {
63
+ "egress_denied",
64
+ "egress_allowed",
62
65
  "no_delete_in_outputs",
63
66
  "self_heal_ran",
64
67
  "transcript_no_host_path",
65
- "egress_denied",
66
- "egress_allowed",
67
68
  }
68
69
  EGRESS_KEYS = {"egress_denied", "egress_allowed"}
69
- # every valid key inside an `assert:` list item
70
- ASSERT_KEYS = CONTENT_KEYS | GATE_KEYS | FS_EGRESS_KEYS | {"replay_protocol_fidelity"}
70
+ # verdict modifiers don't verify anything themselves (e.g. suppress a default-fail)
71
+ VERDICT_MODIFIER_KEYS = {"allow_permissive_auto_allow"}
72
+
73
+ # Every key the replay-class logic knows how to handle. `replay_protocol_fidelity` is valid-but-not-authorable
74
+ # (errored separately below). This is also the embedded fallback for ASSERT_KEYS — kept EQUAL to the generated
75
+ # list (test-enforced) so a missing assertion-keys.json can't silently reintroduce key drift.
76
+ _CLASSIFIED_KEYS = CONTENT_KEYS | GATE_KEYS | MANIFEST_KEYS | LIVE_ONLY_KEYS | VERDICT_MODIFIER_KEYS | {"replay_protocol_fidelity"}
77
+
78
+
79
def _load_assert_keys():
    """Return the authoritative set of valid `assert:` keys.

    The keys are read from the `"keys"` list of `assertion-keys.json`, a sibling of this file that is
    generated from the Zod Assertion schema (`npm run schema`) so the unknown-key check can't drift.

    Loading is best-effort and never raises: on any failure a `::warning::` line is printed to stderr
    and a copy of the embedded `_CLASSIFIED_KEYS` fallback is returned. A missing file and an unusable
    file (unreadable, invalid JSON, or no list under `"keys"`) produce different warnings, so a corrupt
    file is not misreported as "not found".
    """
    p = Path(__file__).resolve().parent / "assertion-keys.json"
    try:
        keys = json.loads(p.read_text(encoding="utf-8"))["keys"]
        # A bare string would otherwise be exploded into a set of single characters.
        if not isinstance(keys, list):
            raise TypeError(f'"keys" must be a list, got {type(keys).__name__}')
        return set(keys)
    except FileNotFoundError:
        print(
            f"::warning:: assertion-keys.json not found next to scenario.py ({p}) — "
            "using a built-in key list that may be stale (run `npm run schema`).",
            file=sys.stderr,
        )
    except Exception as exc:  # deliberately broad: a bad data file must not crash the linter at import
        print(
            f"::warning:: assertion-keys.json next to scenario.py ({p}) could not be loaded "
            f"({type(exc).__name__}: {exc}) — "
            "using a built-in key list that may be stale (run `npm run schema`).",
            file=sys.stderr,
        )
    return set(_CLASSIFIED_KEYS)
93
+
94
+
95
+ # every valid key inside an `assert:` list item (generated from the zod schema; see _load_assert_keys)
96
+ ASSERT_KEYS = _load_assert_keys()
97
+
98
+ # Self-check: every valid assertion key must be classified, else the replay-class lint logic mishandles it.
99
+ # Surfaced loudly at load AND as a lint ERROR in cmd_lint (so --strict / exit codes flow). Never sys.exit here.
100
+ UNCLASSIFIED_KEYS = sorted(ASSERT_KEYS - _CLASSIFIED_KEYS)
101
+ if UNCLASSIFIED_KEYS:
102
+ print(
103
+ f"::warning:: scenario.py: assertion key(s) {UNCLASSIFIED_KEYS} are in the schema but not classified "
104
+ "— add them to the linter's CONTENT/GATE/MANIFEST/LIVE_ONLY/VERDICT_MODIFIER sets.",
105
+ file=sys.stderr,
106
+ )
71
107
  # every valid top-level scenario key
72
108
  TOP_LEVEL_KEYS = {
73
109
  "name",
@@ -232,40 +268,59 @@ def lint_doc(doc, path, raw_lines):
232
268
  )
233
269
  )
234
270
 
235
- # W: no content assertion → a replay PR gate verifies nothing
271
+ # W: nothing replay-checkable → a replay PR gate verifies nothing. Content/gate are replay-checkable, and
272
+ # manifest-backed keys are too WHEN the cassette carries an artifacts manifest — so only an all-live-only
273
+ # (egress / no_delete / self_heal / host-path) assert set genuinely no-ops on replay.
236
274
  if items:
237
- content_present = bool(assert_keys & (CONTENT_KEYS | GATE_KEYS))
238
- if not content_present:
275
+ replay_checkable = bool(assert_keys & (CONTENT_KEYS | GATE_KEYS | MANIFEST_KEYS))
276
+ if not replay_checkable:
239
277
  findings.append(
240
278
  Finding(
241
279
  "WARN",
242
280
  "replay-noop",
243
- "every assertion is filesystem/egress on the token-free `replay` lane they are "
244
- "ALL silently skipped, so a replay PR gate would verify nothing.",
245
- "Add a content assertion (result / transcript_* / tool_* / subagent_*) or run this "
281
+ "every assertion is live-only (egress / no_delete_in_outputs / self_heal_ran / "
282
+ "transcript_no_host_path) on the token-free `replay` lane they are ALL silently "
283
+ "skipped, so a replay PR gate would verify nothing.",
284
+ "Add a content assertion (result / transcript_* / tool_* / subagent_*) or a "
285
+ "manifest-backed one (file_exists / user_visible_artifact / artifact_json), or run this "
246
286
  "scenario only on the live (run/record) lane.",
247
287
  path,
248
288
  )
249
289
  )
250
290
 
251
- # W: mixed-class assert item → fs/egress half dropped on replay
291
+ # W: mixed-class assert item → the live-only half is dropped on replay (manifest-backed keys are NOT)
252
292
  for idx, item in enumerate(items):
253
293
  ks = set(item.keys())
254
- content_half = ks & (CONTENT_KEYS | GATE_KEYS)
255
- fs_half = ks & FS_EGRESS_KEYS
256
- if content_half and fs_half:
294
+ kept_half = ks & (CONTENT_KEYS | GATE_KEYS | MANIFEST_KEYS)
295
+ live_half = ks & LIVE_ONLY_KEYS
296
+ if kept_half and live_half:
257
297
  findings.append(
258
298
  Finding(
259
299
  "WARN",
260
300
  "mixed-assert-item",
261
- f"assert item #{idx} mixes content {sorted(content_half)} with "
262
- f"filesystem/egress {sorted(fs_half)} — on replay the filesystem/egress half is "
263
- "dropped (only the content half is evaluated).",
301
+ f"assert item #{idx} mixes replay-checkable {sorted(kept_half)} with "
302
+ f"live-only {sorted(live_half)} — on replay the live-only half is dropped "
303
+ "(only the replay-checkable half is evaluated).",
264
304
  "Split into separate list items: one per concern.",
265
305
  path,
266
306
  )
267
307
  )
268
308
 
309
+ # I: manifest-backed keys need an artifacts manifest on replay
310
+ manifest_present = sorted(assert_keys & MANIFEST_KEYS)
311
+ if manifest_present:
312
+ findings.append(
313
+ Finding(
314
+ "INFO",
315
+ "manifest-needs-snapshot",
316
+ f"assertion(s) {manifest_present} evaluate on replay only when the cassette carries an "
317
+ "`artifacts` manifest (`record` snapshots one). A manifest-less cassette skips them "
318
+ "(with a loud warning).",
319
+ "Record with a current harness so the cassette carries the artifacts manifest.",
320
+ path,
321
+ )
322
+ )
323
+
269
324
  # I: gate keys need a controlOut cassette on replay
270
325
  gate_present = sorted(assert_keys & GATE_KEYS)
271
326
  if gate_present:
@@ -361,6 +416,19 @@ def _print_findings(findings, n_files):
361
416
 
362
417
  def cmd_lint(args):
363
418
  all_findings = []
419
+ # Linter self-check (B3): a valid schema key the replay-class sets don't classify can't be linted
420
+ # correctly — surface it as a hard ERROR so it fails the gate (and --strict) until someone classifies it.
421
+ if UNCLASSIFIED_KEYS:
422
+ all_findings.append(
423
+ Finding(
424
+ "ERROR",
425
+ "linter-unclassified-key",
426
+ f"linter is out of date: assertion key(s) {UNCLASSIFIED_KEYS} are valid (in the schema) but "
427
+ "scenario.py doesn't classify their replay behavior, so they can't be linted.",
428
+ "Add them to the linter's CONTENT/GATE/MANIFEST/LIVE_ONLY/VERDICT_MODIFIER sets.",
429
+ "(scenario.py)",
430
+ )
431
+ )
364
432
  for f in args.files:
365
433
  all_findings.extend(lint_file(f))
366
434
  if args.json:
package/CHANGELOG.md CHANGED
@@ -6,7 +6,21 @@ All notable changes to this project are documented here. The format is based on
6
6
 
7
7
  ## [Unreleased]
8
8
 
9
- ## [0.4.2] — 2026-06-18
9
+ ## [0.4.3] — 2026-06-18
10
+
11
+ ### Fixed
12
+
13
+ - **`cowork-harness lint` no longer flags `artifact_json` / `allow_permissive_auto_allow` as unknown keys.**
14
+ The linter's assertion-key list is now **generated from the Zod `Assertion` schema** (the same source
15
+ `assert --list` uses) into a file shipped next to `scenario.py`, with a CI drift-guard — so it can't lag the
16
+ schema again. Its replay-class warnings were also reconciled with the 0.3.0 artifact-manifest: `file_exists`,
17
+ `user_visible_artifact`, and `artifact_json` are now treated as **manifest-backed** (replay-checkable when the
18
+ cassette carries an `artifacts` manifest) rather than always-skipped, so a scenario asserting only those is no
19
+ longer a false `replay-noop`. A self-check fails the linter if a future schema key isn't classified.
20
+
21
+ ### Internal
22
+
23
+ - The npm tarball no longer ships `docs/internal/` (internal planning docs were being published).
10
24
 
11
25
  ### Added
12
26
 
package/dist/types.js CHANGED
@@ -142,7 +142,8 @@ export const Assertion = z.object({
142
142
  .describe("(replay-only, NOT authorable) serializeDecision output matched the frozen recording — the token-free O7 guard; synthesized by the replay lane and rejected if written in a scenario"),
143
143
  // #5: assert over the CONTENTS of a JSON artifact via a dotted path. `absent` and `is_null` are DISTINCT
144
144
  // (key-missing vs present-null); an unresolved INTERMEDIATE segment fails loud (malformed artifact),
145
- // never a vacuous pass. Live-only until the cassette artifact-manifest (#1) lands stripped on replay.
145
+ // never a vacuous pass. Manifest-backed: evaluated on replay when the cassette carries an `artifacts`
146
+ // manifest (`record` snapshots one); a manifest-less cassette skips it (with a loud warning).
146
147
  artifact_json: z
147
148
  .object({
148
149
  artifact: z.string().describe("relative path to a JSON artifact under the work root (e.g. outputs/cap_state.json)"),
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "cowork-harness",
3
- "version": "0.4.2",
3
+ "version": "0.4.3",
4
4
  "description": "Scriptable, CI-friendly harness for Claude Cowork's runtime contract for testing skills across scenarios — same agent, mounts, egress allowlist, permission protocol, and sandbox limitations.",
5
5
  "license": "MIT",
6
6
  "type": "module",
@@ -34,9 +34,11 @@
34
34
  "docker",
35
35
  "schema",
36
36
  "docs",
37
+ "!docs/internal",
37
38
  "python",
38
39
  "scripts",
39
40
  ".claude/skills/cowork-harness/scripts/scenario.py",
41
+ ".claude/skills/cowork-harness/scripts/assertion-keys.json",
40
42
  ".env.example",
41
43
  "README.md",
42
44
  "CHANGELOG.md",
@@ -0,0 +1,75 @@
1
+ """Tests for the bundled linter (scenario.py): its assertion-key list is generated from the Zod schema
2
+ (no drift), and its replay-class warnings account for manifest-backed assertions.
3
+
4
+ Run via the repo's pytest lane: `pytest -m 'not cowork'` from python/.
5
+ """
6
+ import importlib.util
7
+ import json
8
+ from pathlib import Path
9
+
10
+ REPO = Path(__file__).resolve().parents[1]
11
+ SCENARIO_PY = REPO / ".claude/skills/cowork-harness/scripts/scenario.py"
12
+ KEYS_JSON = REPO / ".claude/skills/cowork-harness/scripts/assertion-keys.json"
13
+
14
+
15
def _load_scenario_module():
    """Execute scenario.py from its file path (SCENARIO_PY) and return the resulting module object."""
    module_spec = importlib.util.spec_from_file_location("scenario_lint_under_test", SCENARIO_PY)
    loaded = importlib.util.module_from_spec(module_spec)
    module_spec.loader.exec_module(loaded)
    return loaded
20
+
21
+
22
+ scenario = _load_scenario_module()
23
+
24
+
25
def _rules(yaml_body, tmp_path):
    """Write a minimal scenario with `yaml_body` appended, lint it, and return the set of rule ids found."""
    header = "name: t\nbaseline: latest\nsession: (inline)\nfidelity: container\nprompt: hi\n"
    scenario_file = tmp_path / "sc.yaml"
    scenario_file.write_text(header + yaml_body, encoding="utf-8")
    fired = set()
    for finding in scenario.lint_file(str(scenario_file)):
        fired.add(finding.rule)
    return fired
32
+
33
+
34
def test_assert_keys_loaded_from_generated_file():
    """ASSERT_KEYS must be exactly the key list stored in the generated assertion-keys.json."""
    payload = json.loads(KEYS_JSON.read_text(encoding="utf-8"))
    expected = set(payload["keys"])
    assert scenario.ASSERT_KEYS == expected
    # both keys that previously drifted out of the linter's list must be recognised
    previously_drifted = {"artifact_json", "allow_permissive_auto_allow"}
    assert previously_drifted <= scenario.ASSERT_KEYS
39
+
40
+
41
def test_embedded_fallback_equals_generated_list():
    """The in-code fallback set must match the generated list, or a missing file would reintroduce drift."""
    payload = json.loads(KEYS_JSON.read_text(encoding="utf-8"))
    expected = set(payload["keys"])
    assert scenario._CLASSIFIED_KEYS == expected
45
+
46
+
47
def test_every_key_is_classified_self_check():
    """The linter's load-time self-check must report no unclassified assertion keys."""
    unclassified = scenario.UNCLASSIFIED_KEYS
    assert unclassified == []
49
+
50
+
51
def test_artifact_json_is_not_unknown(tmp_path):
    """`artifact_json` is a known key and triggers the manifest-backed informational rule."""
    body = "assert:\n - artifact_json: {artifact: outputs/x.json, path: a, equals: 1}\n"
    fired = _rules(body, tmp_path)
    assert "unknown-assert-key" not in fired
    # manifest-backed on replay, so the snapshot hint must be emitted
    assert "manifest-needs-snapshot" in fired
55
+
56
+
57
def test_allow_permissive_auto_allow_is_not_unknown(tmp_path):
    """`allow_permissive_auto_allow` must not be reported as an unknown assert key."""
    body = "assert:\n - allow_permissive_auto_allow: true\n"
    fired = _rules(body, tmp_path)
    assert "unknown-assert-key" not in fired
60
+
61
+
62
def test_file_exists_only_is_not_replay_noop(tmp_path):
    """A scenario asserting only `file_exists` is not a replay no-op, but does get the manifest hint."""
    body = "assert:\n - file_exists: outputs/x.md\n"
    fired = _rules(body, tmp_path)
    # manifest-backed keys count as replay-checkable when the cassette carries a manifest
    assert "replay-noop" not in fired
    assert "manifest-needs-snapshot" in fired
66
+
67
+
68
def test_egress_only_is_replay_noop(tmp_path):
    """A scenario asserting only an egress key is live-only, so `replay-noop` must be reported."""
    body = "assert:\n - egress_denied: evil.com\n"
    fired = _rules(body, tmp_path)
    assert "replay-noop" in fired
71
+
72
+
73
def test_invented_key_still_flagged(tmp_path):
    """A key that is not in the generated list is still reported as `unknown-assert-key`."""
    body = "assert:\n - file_not_empty: outputs/x\n"
    fired = _rules(body, tmp_path)
    assert "unknown-assert-key" in fired
@@ -9,11 +9,31 @@ import { mkdirSync, writeFileSync } from "node:fs";
9
9
  import { dirname, join } from "node:path";
10
10
  import { fileURLToPath, pathToFileURL } from "node:url";
11
11
  import { zodToJsonSchema } from "zod-to-json-schema";
12
- import { ScenarioObject } from "../src/types.js";
12
+ import { ScenarioObject, Assertion } from "../src/types.js";
13
13
  import { SessionConfig } from "../src/session.js";
14
14
 
15
15
  const REPO_ROOT = join(dirname(fileURLToPath(import.meta.url)), "..");
16
16
  export const SCHEMA_DIR = join(REPO_ROOT, "schema");
17
+ /** The bundled linter (`scenario.py`) reads this for its assertion-key list. It lives NEXT TO scenario.py
18
+ * (not under schema/) because schema/ is not shipped inside the plugin tree — only the skill's scripts dir
19
+ * is. Writer + the drift-guard test both reference this one constant. */
20
+ export const ASSERTION_KEYS_PATH = join(REPO_ROOT, ".claude/skills/cowork-harness/scripts/assertion-keys.json");
21
+
22
/**
 * Build the contents of `assertion-keys.json`: the sorted key names of the Zod `Assertion` schema
 * (the same source `assert --list` reads), under a `$comment` marking the file as generated.
 * Generating the list keeps `scenario.py`'s unknown-key check from drifting.
 *
 * @returns pretty-printed JSON (2-space indent) terminated by a single newline
 */
export function buildAssertionKeys(): string {
  const payload = {
    $comment:
      "GENERATED from the Zod Assertion schema (src/types.ts) by scripts/gen-schema.ts — do not edit; run `npm run schema`.",
    keys: Object.keys(Assertion.shape).sort(),
  };
  return `${JSON.stringify(payload, null, 2)}\n`;
}
17
37
 
18
38
  const TARGETS = [
19
39
  {
@@ -54,6 +74,8 @@ function main(): void {
54
74
  writeFileSync(join(SCHEMA_DIR, file), body);
55
75
  process.stdout.write(`wrote schema/${file}\n`);
56
76
  }
77
+ writeFileSync(ASSERTION_KEYS_PATH, buildAssertionKeys());
78
+ process.stdout.write(`wrote ${ASSERTION_KEYS_PATH}\n`);
57
79
  }
58
80
 
59
81
  // Run only when invoked directly (so the test can import buildSchemas without side effects).