cowork-harness 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,560 @@
1
+ #!/usr/bin/env python3
2
+ """scenario.py — author and check cowork-harness scenarios without hallucinating the schema.
3
+
4
+ Two subcommands:
5
+
6
+ scenario.py scaffold ... emit a VALID scenario skeleton (real keys, correct tier,
7
+ content vs live-only assertions split one-per-item). The
8
+ generator self-lints its own output and refuses to emit a
9
+ scenario its own linter would reject.
10
+
11
+ scenario.py lint FILE... catch silent false-greens in existing scenarios. The
12
+ cowork-harness has several ways to make a check *silently
13
+ do nothing*; this encodes those invariants so they fail at
14
+ author time / in CI instead of rotting as a green-but-empty
15
+ assertion.
16
+
17
+ lint flags (see references/scenario-schema.md for the why of each):
18
+ E egress assertion on `fidelity: protocol` (the harness rejects this run)
19
+ E on_unanswered: agent / invalid value (schema rejects `agent`)
20
+ E authored `replay_protocol_fidelity` assertion (replay-synthesized only)
21
+ E `assertions:` instead of `assert:` (block ignored → every check no-ops)
22
+ W no content assertion → no-op on a replay gate (every assertion is fs/egress)
23
+ W mixed-class assert item → fs/egress half dropped on replay
24
+ W unknown top-level / assertion key (typo or hallucinated schema)
25
+ W double-quoted regex with a backslash (YAML eats the backslash)
26
+ I gate key present → needs a controlOut cassette on replay
27
+
28
+ Designed for agents and CI: non-interactive, --help, --json, meaningful exit codes,
29
+ idempotent. `lint` exits 1 on any ERROR (or any finding with --strict); else 0.
30
+
31
+ Requires PyYAML (`pip install pyyaml`).
32
+ """
33
+ from __future__ import annotations
34
+
35
+ import argparse
36
+ import json
37
+ import re
38
+ import sys
39
+ from pathlib import Path
40
+
41
# --- the replay-class taxonomy (mirrors `contentKeys` in src/run/cassette.ts) ---
# Assertion keys lint_doc counts as "content": having at least one of these (or a gate
# key) in `assert:` suppresses the `replay-noop` warning.
CONTENT_KEYS = {
    "result",
    "transcript_contains",
    "transcript_not_contains",
    "transcript_matches",
    "transcript_not_matches",
    "tool_called",
    "tool_not_called",
    "subagent_tool_used",
    "subagent_tool_absent",
    "subagent_dispatched",
    "subagent_declared_but_unused",
    "dispatch_count_max",
}
# content keys, but only evaluated on replay when the cassette carries controlOut
GATE_KEYS = {"question_asked", "questions_count_max", "gate_answers_delivered"}
# live-only: silently skipped on replay (no filesystem, no network)
FS_EGRESS_KEYS = {
    "file_exists",
    "user_visible_artifact",
    "no_delete_in_outputs",
    "self_heal_ran",
    "transcript_no_host_path",
    "egress_denied",
    "egress_allowed",
}
# the egress subset of FS_EGRESS_KEYS; lint_doc rejects these on `fidelity: protocol`
EGRESS_KEYS = {"egress_denied", "egress_allowed"}
# every valid key inside an `assert:` list item
ASSERT_KEYS = CONTENT_KEYS | GATE_KEYS | FS_EGRESS_KEYS | {"replay_protocol_fidelity"}
# every valid top-level scenario key
TOP_LEVEL_KEYS = {
    "name",
    "baseline",
    "session",
    "fidelity",
    "on_unanswered",
    "prompt",
    "answers",
    "expect_denied",
    "assert",
}
# keys whose value is a regex; joined into the _DQ_REGEX_LINE pattern used by the
# raw-text double-quote scan
REGEX_KEYS = {
    "transcript_matches",
    "transcript_not_matches",
    "when_question",
    "subagent_dispatched",
    "question_asked",
}
# accepted `on_unanswered:` values; lint_doc reports any other value as an ERROR
VALID_ON_UNANSWERED = {"fail", "prompt", "first", "llm"}
# the `choices` offered by `scaffold --tier`
VALID_TIERS = ("protocol", "container", "microvm", "hostloop", "cowork")
92
+
93
+
94
class Finding:
    """A single lint result.

    Attributes (all plain values, stored in slots):
        severity: "ERROR" | "WARN" | "INFO".
        rule: short identifier of the rule that fired.
        message: human-readable description of the problem.
        fix: suggested remedy.
        file: path (or label) of the scenario the finding belongs to.
        line: 1-based line number when known, else None.
    """

    __slots__ = ("severity", "rule", "message", "fix", "file", "line")

    def __init__(self, severity, rule, message, fix, file, line=None):
        # Assign positionally in slot order so the field list lives in one place.
        values = (severity, rule, message, fix, file, line)
        for slot, value in zip(Finding.__slots__, values):
            setattr(self, slot, value)

    def as_dict(self):
        """Return the finding as a dict keyed by slot name, in slot order."""
        return {slot: getattr(self, slot) for slot in Finding.__slots__}
114
+
115
+
116
def _assert_items(doc):
    """Extract the `assert` entries of a scenario mapping as a list of dicts.

    A lone mapping is wrapped in a one-element list; in a list, entries that are
    not mappings are left out; a missing key or any other shape yields [].
    """
    raw = doc.get("assert")
    if isinstance(raw, dict):
        # a single mapping was written where a list was expected
        return [raw]
    if isinstance(raw, list):
        return list(filter(lambda entry: isinstance(entry, dict), raw))
    return []
126
+
127
+
128
def _all_assert_keys(items):
    """Return the set of every key used across the given assert items."""
    return set().union(*(entry.keys() for entry in items))
133
+
134
+
135
def lint_doc(doc, path, raw_lines):
    """Lint one parsed scenario document and return its findings.

    Args:
        doc: the value produced by parsing the scenario YAML (expected: a dict).
        path: file path or label recorded on every finding.
        raw_lines: the scenario's raw text lines, for the regex-quoting scan.

    Returns:
        A list of Finding objects in rule order. A non-mapping `doc` yields a
        single `parse` ERROR and nothing else.
    """
    findings = []

    def report(severity, rule, message, fix):
        findings.append(Finding(severity, rule, message, fix, path))

    if not isinstance(doc, dict):
        report(
            "ERROR",
            "parse",
            "scenario is not a YAML mapping (expected top-level keys like prompt/assert)",
            "Check the file is a single scenario document.",
        )
        return findings

    fidelity = doc.get("fidelity") or "container"
    items = _assert_items(doc)
    assert_keys = _all_assert_keys(items)
    has_expect_denied = bool(doc.get("expect_denied"))

    # E: `assertions:` instead of `assert:` — a common hallucination. The block is
    # silently ignored by the harness, so every "assertion" is a no-op (false-green).
    # Reported even when a real `assert:` block is also present: the `assertions:`
    # half is still ignored, and this key is exempt from the unknown-key warning
    # below, so it would otherwise pass with no finding at all.
    if "assertions" in doc:
        if "assert" in doc:
            assertions_fix = (
                "Move these items into the existing `assert:` block (flat keys, e.g. "
                "`- file_exists: outputs/x.md`) and delete `assertions:`."
            )
        else:
            assertions_fix = (
                "Rename the block to `assert:` and use flat keys (e.g. `- file_exists: outputs/x.md`)."
            )
        report(
            "ERROR",
            "assertions-key",
            "scenario uses `assertions:` — the real key is `assert:`. The harness ignores "
            "`assertions:`, so NONE of these checks run (a guaranteed silent false-green).",
            assertions_fix,
        )

    # W: unknown top-level keys (typo or hallucinated schema)
    for k in doc:
        if k not in TOP_LEVEL_KEYS and k != "assertions" and k != "profile":
            report(
                "WARN",
                "unknown-top-key",
                f"unknown scenario key `{k}` — not part of the schema (typo or hallucination?).",
                f"Valid top-level keys: {', '.join(sorted(TOP_LEVEL_KEYS))}.",
            )

    # W: unknown assertion keys inside assert items (e.g. invented file_not_empty, kind, path).
    # YAML keys are not always strings (`1:`, `true:`), so order by their text form —
    # a plain sorted() raises TypeError on a mixed int/str set.
    for k in sorted(assert_keys - ASSERT_KEYS, key=str):
        report(
            "WARN",
            "unknown-assert-key",
            f"unknown assertion key `{k}` — not in the assertion catalog (the harness would "
            "ignore it, so it silently does nothing).",
            "Use a real assertion key — see references/scenario-schema.md for the full catalog.",
        )

    # E: egress assertion on protocol fidelity (the harness rejects the run)
    egress_used = bool(assert_keys & EGRESS_KEYS) or has_expect_denied
    if fidelity == "protocol" and egress_used:
        report(
            "ERROR",
            "egress-on-protocol",
            "egress assertion (egress_*/expect_denied) on `fidelity: protocol` — the harness "
            "rejects this run because protocol has no egress enforcement (it would false-pass).",
            "Use fidelity: container (or microvm/hostloop) for any egress/expect_denied check.",
        )

    # E: retired/invalid on_unanswered. The isinstance guard comes first because a list
    # or mapping value is unhashable and would raise on the set-membership test.
    ou = doc.get("on_unanswered")
    if ou is not None and not (isinstance(ou, str) and ou in VALID_ON_UNANSWERED):
        extra = " (`agent` was renamed to `llm`)" if ou == "agent" else ""
        report(
            "ERROR",
            "on-unanswered-invalid",
            f"on_unanswered: {ou} is not a valid value{extra}.",
            "Use one of: fail | prompt | first | llm (YAML). For a live model use on_unanswered: llm.",
        )

    # E: authored replay_protocol_fidelity
    if "replay_protocol_fidelity" in assert_keys:
        report(
            "ERROR",
            "authored-replay-fidelity",
            "`replay_protocol_fidelity` is synthesized by the replay lane only and cannot be authored.",
            "Remove it — on a live run it evaluates as an empty assertion.",
        )

    # W: no content assertion → a replay PR gate verifies nothing
    if items and not (assert_keys & (CONTENT_KEYS | GATE_KEYS)):
        report(
            "WARN",
            "replay-noop",
            "every assertion is filesystem/egress — on the token-free `replay` lane they are "
            "ALL silently skipped, so a replay PR gate would verify nothing.",
            "Add a content assertion (result / transcript_* / tool_* / subagent_*) or run this "
            "scenario only on the live (run/record) lane.",
        )

    # W: mixed-class assert item → fs/egress half dropped on replay
    for idx, item in enumerate(items):
        ks = set(item.keys())
        content_half = ks & (CONTENT_KEYS | GATE_KEYS)
        fs_half = ks & FS_EGRESS_KEYS
        if content_half and fs_half:
            report(
                "WARN",
                "mixed-assert-item",
                f"assert item #{idx} mixes content {sorted(content_half)} with "
                f"filesystem/egress {sorted(fs_half)} — on replay the filesystem/egress half is "
                "dropped (only the content half is evaluated).",
                "Split into separate list items: one per concern.",
            )

    # I: gate keys need a controlOut cassette on replay
    gate_present = sorted(assert_keys & GATE_KEYS)
    if gate_present:
        report(
            "INFO",
            "gate-needs-controlout",
            f"gate assertion(s) {gate_present} only evaluate on replay when the cassette has "
            "controlOut (full-fidelity). An old cassette excludes them (with a loud warning).",
            "Re-record with a current harness so the cassette carries controlOut.",
        )

    # W: double-quoted regex with a backslash (raw-text scan — the parser already ate it)
    findings.extend(_lint_regex_quoting(path, raw_lines))

    return findings
287
+
288
+
289
# Matches a raw scenario line shaped like `[- ]<regex key>: "<text with a backslash>"`.
# Group 1 is the key (one of REGEX_KEYS); group 2 is the text between the double
# quotes, which must contain at least one backslash for the line to match.
_DQ_REGEX_LINE = re.compile(
    r'^\s*-?\s*(' + "|".join(sorted(REGEX_KEYS)) + r')\s*:\s*"([^"]*\\[^"]*)"'
)
292
+
293
+
294
def _lint_regex_quoting(path, raw_lines):
    """Scan raw scenario lines for double-quoted regex values with a stray backslash.

    Works on raw text rather than the parsed document, so it can run even when the
    YAML does not parse.

    Args:
        path: file path or label recorded on each finding.
        raw_lines: the scenario's text lines, without line terminators.

    Returns:
        A list of WARN findings (rule `regex-double-quoted`), each carrying the
        1-based line number of the offending line.
    """
    out = []
    for lineno, line in enumerate(raw_lines, start=1):
        m = _DQ_REGEX_LINE.match(line)
        if m is None:
            continue
        key, quoted = m.group(1), m.group(2)
        # In a YAML double-quoted scalar `\\` is the escape for ONE literal backslash,
        # so a value whose backslashes all come in pairs (e.g. "\\d+") already yields
        # the intended regex. Only an unpaired backslash is the real gotcha.
        if "\\" not in quoted.replace("\\\\", ""):
            continue
        out.append(
            Finding(
                "WARN",
                "regex-double-quoted",
                f"`{key}` uses a DOUBLE-quoted regex containing a backslash "
                f'("{quoted}") — YAML strips the backslash, so the regex is wrong.',
                "Single-quote the regex (e.g. '\\d+') or use a block scalar. Use [\\s\\S] not . to span turns.",
                path,
                lineno,
            )
        )
    return out
311
+
312
+
313
def _require_yaml():
    """Import PyYAML on demand and return the module.

    If the import fails, print an install hint to stderr and exit with status 2.
    """
    try:
        import yaml  # type: ignore
    except ImportError:
        print("scenario.py requires PyYAML. Install it: pip install pyyaml", file=sys.stderr)
        sys.exit(2)
    return yaml
321
+
322
+
323
def lint_file(path):
    """Read, parse and lint one scenario file.

    Args:
        path: path of the scenario YAML file.

    Returns:
        A list of Finding objects. A missing file, an unreadable/non-UTF-8 file, or a
        YAML parse failure each produce an ERROR finding instead of raising, so one
        bad file cannot abort a multi-file lint run.
    """
    yaml = _require_yaml()
    p = Path(path)
    if not p.is_file():
        return [Finding("ERROR", "not-found", f"file not found: {path}", "Check the path.", path)]
    try:
        text = p.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as e:
        # UnicodeDecodeError is a ValueError, not an OSError, so both are listed.
        return [
            Finding(
                "ERROR",
                "unreadable",
                f"cannot read file as UTF-8 text: {e}",
                "Check the file permissions and that it is saved as UTF-8.",
                path,
            )
        ]
    raw_lines = text.splitlines()
    try:
        doc = yaml.safe_load(text)
    except yaml.YAMLError as e:  # noqa
        # The regex-quoting scan runs on raw text, so it still works when parsing fails —
        # and a bad double-quoted regex (e.g. "\d") is exactly a case that can fail to
        # parse. lint_doc performs the same scan itself on the success path.
        msg = (str(e).splitlines() or ["invalid YAML"])[0]
        return _lint_regex_quoting(path, raw_lines) + [
            Finding("ERROR", "parse", f"YAML parse error: {msg}", "Fix the YAML syntax.", path)
        ]
    return lint_doc(doc, path, raw_lines)
341
+
342
+
343
# Sort rank per severity (lower prints first); _print_findings uses it as the
# secondary sort key after the file name.
SEV_ORDER = {"ERROR": 0, "WARN": 1, "INFO": 2}
344
+
345
+
346
def _print_findings(findings, n_files):
    """Print findings as text: one three-line entry each, then a totals line.

    Entries are ordered by file, then by severity rank. With no findings, a single
    "clean" line is printed instead.
    """
    if not findings:
        print(f"✓ {n_files} scenario(s) clean — no silent-false-green findings.")
        return

    glyphs = {"ERROR": "✗", "WARN": "⚠", "INFO": "ℹ"}
    ordered = sorted(findings, key=lambda f: (str(f.file), SEV_ORDER[f.severity]))
    for entry in ordered:
        # Append the line number only when the finding carries one.
        where = entry.file if not entry.line else f"{entry.file}:{entry.line}"
        print(f"{glyphs[entry.severity]} {entry.severity} [{entry.rule}] {where}")
        print(f" {entry.message}")
        print(f" fix: {entry.fix}")

    tally = {"ERROR": 0, "WARN": 0, "INFO": 0}
    for entry in findings:
        if entry.severity in tally:
            tally[entry.severity] += 1
    n_err, n_warn, n_info = tally["ERROR"], tally["WARN"], tally["INFO"]
    print(f"\n{n_err} error(s), {n_warn} warning(s), {n_info} info across {n_files} file(s).")
360
+
361
+
362
def cmd_lint(args):
    """Run the `lint` subcommand over `args.files` and return its exit code.

    Findings are printed as JSON when `args.json` is set, otherwise as text.
    Returns 1 if any finding is an ERROR, or if `args.strict` is set and there is
    any finding at all; otherwise 0.
    """
    collected = [finding for target in args.files for finding in lint_file(target)]

    if args.json:
        print(json.dumps([finding.as_dict() for finding in collected], indent=2))
    else:
        _print_findings(collected, len(args.files))

    if any(finding.severity == "ERROR" for finding in collected):
        return 1
    return 1 if (args.strict and collected) else 0
374
+
375
+
376
+ # --------------------------------------------------------------------------- #
377
+ # scaffold
378
+ # --------------------------------------------------------------------------- #
379
+
380
def _sq(s):
    """Render `s` as a single-quoted YAML scalar, doubling any embedded single quote.

    Single quotes leave regex backslashes literal; double quotes would consume them
    (the regex-quoting gotcha).
    """
    escaped = "''".join(str(s).split("'"))
    return f"'{escaped}'"
384
+
385
+
386
def _split_kv(spec, flag):
    """Split a `<regex>=<choice>` spec at its first `=` into a stripped (key, value) pair.

    `flag` only names the option in the error message. A spec without `=` prints a
    usage error to stderr and exits with status 2.
    """
    key, separator, value = spec.partition("=")
    if not separator:
        print(f"{flag} expects '<regex>=<choice>', got: {spec}", file=sys.stderr)
        sys.exit(2)
    return key.strip(), value.strip()
392
+
393
+
394
# A value matching this is emitted as a plain (unquoted) YAML scalar; anything else is
# single-quoted so characters such as `: `, `#`, or a leading digit cannot change the
# document's structure or the value's type.
_PLAIN_SCALAR = re.compile(r"[A-Za-z_/][A-Za-z0-9_./@+-]*\Z")
# Plain words that YAML loaders may resolve to a boolean/null rather than a string.
_YAML_RESERVED_WORDS = frozenset({"true", "false", "yes", "no", "on", "off", "null", "y", "n"})


def _yaml_scalar(value):
    """Return `value` as YAML scalar text: plain when clearly safe, else single-quoted."""
    text = str(value)
    if _PLAIN_SCALAR.match(text) and text.lower() not in _YAML_RESERVED_WORDS:
        return text
    # Same quoting rule as a single-quoted YAML scalar: double any embedded quote.
    return "'" + text.replace("'", "''") + "'"


def build_scenario(args):
    """Return (yaml_text, notes[]) for the scaffold described by the parsed CLI `args`.

    Encodes the convergent skeleton: container by default, scripted answers +
    on_unanswered: fail, content-class assertions first then live-only, one concern
    per item.

    Args:
        args: namespace with the scaffold options (name, prompt, tier, session, skill,
            content, tool, subagent, gate, web_fetch, file, artifact, no_delete,
            egress_allowed, egress_denied).

    Returns:
        A tuple of the scenario text (newline-terminated) and a list of advisory
        note strings for the author.
    """
    notes = []
    tier = args.tier
    egress_asserted = bool(args.egress_denied or args.egress_allowed)

    # Never emit a scenario the linter would reject: protocol + egress is rejected by the harness.
    if tier == "protocol" and egress_asserted:
        tier = "container"
        notes.append(
            "tier auto-upgraded protocol → container: egress assertions need a sandboxed tier "
            "(protocol is rejected by the harness)."
        )

    gates = [_split_kv(g, "--gate") for g in (args.gate or [])]

    if tier == "container":
        tier_blurb = "sandbox + real default-deny egress."
    else:
        tier_blurb = "see references/fidelity-and-answers.md."

    out = [
        f"# {args.name} — cowork-harness scenario (scaffolded; edit the TODOs).",
        f"# Tier '{tier}': " + tier_blurb + " on_unanswered: fail keeps this deterministic for CI.",
    ]
    if args.skill:
        out.append(f"# Mount the skill under test ({args.skill}) via a session: e.g.")
        out.append("#   plugins:")
        out.append(f"#     local_plugins: [{args.skill}]")
        out.append("#     enabled: [<plugin-name>@local]")
    out.append("")
    out.append(f"name: {_yaml_scalar(args.name)}")
    out.append("baseline: latest")
    if args.session:
        out.append(f"session: {_yaml_scalar(args.session)}")
    out.append(f"fidelity: {tier}")
    out.append("on_unanswered: fail")
    out.append("")
    out.append("prompt: |")
    prompt_text = args.prompt or "TODO: the user turn that drives the skill."
    for line in prompt_text.splitlines() or [""]:
        # block-scalar content must be indented deeper than its `prompt:` key
        out.append(f"  {line}")

    # answers (scripted gates + web_fetch approvals) — the only deterministic path.
    # Keys after the first one in each item are indented to line up under it, so they
    # stay inside the same list item.
    if gates or args.web_fetch:
        out.append("")
        out.append("answers:")
        for rx, choice in gates:
            out.append(f"  - when_question: {_sq(rx)}")
            out.append(f"    choose: {_sq(choice)}")
        for dom in (args.web_fetch or []):
            out.append(f'  - when_tool: "webfetch:{dom}"  # web_fetch approval (provenance-miss gate)')
            out.append("    decide: allow")
            out.append("    grant: domain")

    # assertions: content/structure first (replay PR gate), then live-only (filesystem/egress)
    content_lines = ["  - result: success"]
    content_lines.extend(f"  - transcript_matches: {_sq(rx)}" for rx in (args.content or []))
    content_lines.extend(f"  - tool_called: {_yaml_scalar(tool)}" for tool in (args.tool or []))
    content_lines.extend(
        f"  - subagent_dispatched: {_sq(rx)}  # matches agentType OR dispatch description"
        for rx in (args.subagent or [])
    )
    if gates:
        content_lines.extend(
            f"  - question_asked: {_sq(rx)}  # gate key: replay only with a controlOut cassette"
            for rx, _ in gates
        )
        content_lines.append(f"  - questions_count_max: {len(gates)}")
        content_lines.append(
            "  - gate_answers_delivered: true  # the steered answers actually reached the model"
        )

    live_lines = [f"  - file_exists: {_yaml_scalar(p)}" for p in (args.file or [])]
    live_lines.extend(f"  - user_visible_artifact: {_yaml_scalar(p)}" for p in (args.artifact or []))
    if args.no_delete:
        live_lines.append("  - no_delete_in_outputs: true")
    live_lines.extend(f"  - egress_allowed: {_yaml_scalar(h)}" for h in (args.egress_allowed or []))
    live_lines.extend(f"  - egress_denied: {_yaml_scalar(h)}" for h in (args.egress_denied or []))

    out.append("")
    out.append("assert:")
    out.append("  # --- content / structure: evaluate on the token-free replay PR gate AND live ---")
    out.extend(content_lines)
    if live_lines:
        out.append("  # --- filesystem / egress: LIVE-only (silently skipped on replay) ---")
        out.extend(live_lines)
    else:
        out.append("  # TODO add filesystem/egress checks (file_exists / user_visible_artifact /")
        out.append("  # egress_denied / no_delete_in_outputs) — they run on the LIVE lane only.")

    if args.web_fetch:
        notes.append(
            "web_fetch: put the URL in the prompt so it is provenanced (the deterministic way to make a "
            "fetch succeed). egress.extra_allow is a NO-OP on the provenanced path — provenance is the gate."
        )
    if not (args.content or args.tool or args.subagent or gates):
        notes.append(
            "only `result: success` is a content assertion — add a transcript_matches / tool_called "
            "so the replay PR gate verifies something real."
        )

    return "\n".join(out) + "\n", notes
494
+
495
+
496
def cmd_scaffold(args):
    """Run the `scaffold` subcommand and return its exit code.

    Builds the scenario text, self-lints it unless `args.no_validate` is set, then
    writes it to `args.out` (or stdout) and prints the advisory notes to stderr.

    Returns:
        0 on success; 2 when the generated scenario does not parse as YAML or its
        own linter reports an ERROR (nothing is emitted in that case).
    """
    yaml = _require_yaml()
    text, notes = build_scenario(args)

    # Dogfood: self-lint the generated scenario; refuse to emit something the linter rejects.
    if not args.no_validate:
        try:
            doc = yaml.safe_load(text)
        except yaml.YAMLError as e:
            # Refuse cleanly rather than crash with a traceback when the generated
            # text is not even parseable.
            first_line = (str(e).splitlines() or ["invalid YAML"])[0]
            print(
                f"scaffold produced a scenario that does not parse as YAML: {first_line}",
                file=sys.stderr,
            )
            return 2
        findings = lint_doc(doc, "<scaffold>", text.splitlines())
        errors = [f for f in findings if f.severity == "ERROR"]
        if errors:
            print("scaffold produced a scenario its own linter rejects (this is a bug):", file=sys.stderr)
            for e in errors:
                print(f" ✗ [{e.rule}] {e.message}", file=sys.stderr)
            return 2

    if args.out:
        Path(args.out).write_text(text, encoding="utf-8")
        print(f"✓ wrote {args.out}", file=sys.stderr)
    else:
        sys.stdout.write(text)

    for n in notes:
        print(f"note: {n}", file=sys.stderr)
    return 0
520
+
521
+
522
def main(argv=None):
    """Parse the command line and dispatch to the selected subcommand.

    Args:
        argv: argument list to parse; None lets argparse read sys.argv.

    Returns:
        The exit code returned by the subcommand handler (cmd_lint or cmd_scaffold).
    """
    parser = argparse.ArgumentParser(
        prog="scenario.py",
        description="Author (scaffold) and check (lint) cowork-harness scenarios.",
    )
    commands = parser.add_subparsers(dest="command", required=True)

    # --- lint ---
    lint_cmd = commands.add_parser("lint", help="lint scenario(s) for silent-false-green invariants")
    lint_cmd.add_argument("files", nargs="+", help="scenario YAML file(s) to lint")
    lint_cmd.add_argument("--json", action="store_true", help="emit findings as JSON")
    lint_cmd.add_argument(
        "--strict", action="store_true", help="exit non-zero on WARN/INFO too, not just ERROR"
    )
    lint_cmd.set_defaults(func=cmd_lint)

    # --- scaffold ---
    scaffold_cmd = commands.add_parser("scaffold", help="emit a valid scenario skeleton (self-linted)")
    scaffold_cmd.add_argument("--name", default="my-scenario", help="scenario name (default: my-scenario)")
    scaffold_cmd.add_argument("--prompt", help="the user turn (the prompt: block)")
    scaffold_cmd.add_argument(
        "--tier", choices=VALID_TIERS, default="container", help="fidelity tier (default: container)"
    )
    scaffold_cmd.add_argument("--session", help="path for the session: field (discovery/setup file)")
    scaffold_cmd.add_argument("--skill", help="skill folder under test — adds a session-mount comment")

    def repeatable(flag, metavar, help_text):
        # argparse derives the dest from the flag (`--web-fetch` -> `web_fetch`).
        scaffold_cmd.add_argument(flag, action="append", metavar=metavar, help=help_text)

    for spec in (
        ("--content", "REGEX", "transcript_matches assertion (repeatable)"),
        ("--tool", "TOOL", "tool_called assertion (repeatable)"),
        ("--subagent", "REGEX", "subagent_dispatched assertion (repeatable)"),
        ("--gate", "REGEX=CHOICE", "scripted AskUserQuestion answer (repeatable)"),
        ("--web-fetch", "DOMAIN", "web_fetch approval rule (repeatable)"),
        ("--file", "PATH", "file_exists assertion (repeatable)"),
        ("--artifact", "PATH", "user_visible_artifact assertion (repeatable)"),
    ):
        repeatable(*spec)
    scaffold_cmd.add_argument("--no-delete", action="store_true", help="add no_delete_in_outputs: true")
    for spec in (
        ("--egress-allowed", "HOST", "egress_allowed assertion (repeatable)"),
        ("--egress-denied", "HOST", "egress_denied assertion (repeatable)"),
    ):
        repeatable(*spec)
    scaffold_cmd.add_argument("--out", help="write to this file (default: stdout)")
    scaffold_cmd.add_argument(
        "--no-validate", action="store_true", help="skip the self-lint of the generated scenario"
    )
    scaffold_cmd.set_defaults(func=cmd_scaffold)

    parsed = parser.parse_args(argv)
    return parsed.func(parsed)
557
+
558
+
559
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())
package/CHANGELOG.md CHANGED
@@ -6,6 +6,106 @@ All notable changes to this project are documented here. The format is based on
6
6
 
7
7
  ## [Unreleased]
8
8
 
9
+ ## [0.4.0] — 2026-06-18
10
+
11
+ The parsing/validation hardening + safety release: a current-tree code-review sweep plus fidelity and
12
+ robustness findings from real skill-testing sessions — uniform fail-loud CLI parsing (enforced by a
13
+ structural test + CI guard), a centralized staging-source resolver, cassette replay/manifest safety
14
+ (base64 + containment + hash-verify), egress SSRF/DNS-rebind hardening, `replay <dir>`, and `cowork-harness lint`.
15
+
16
+ ### Added
17
+
18
+ - **`cowork-harness lint <scenario.yaml>…`** — the bundled scenario linter/scaffolder (`scenario.py`) is now
19
+ shipped in the npm package and reachable as a first-class subcommand, so a consumer who `npm i`s the harness
20
+ (with no skill checkout) can run the no-silent-false-green checks in CI. Needs `python3` + PyYAML; a missing
21
+ interpreter fails with a clear, actionable message.
22
+ - **`replay <dir>`** — `replay` now accepts a directory and replays every `*.cassette.json` in it (sorted,
23
+ non-recursive), exiting on the worst per-cassette verdict, in addition to the existing `--cassette <file>`
24
+ form. An unreadable cassette is reported per-file and forces the JSON envelope's `ok:false` (never a vacuous
25
+ pass), and never aborts the batch.
26
+ - **A shipped `protocol`-tier example** (`examples/scenarios/protocol-smoke.yaml` + its session) — the first
27
+ zero-Docker/zero-agent worked example for the L0 tier (a scripted answer reaches the model, a tool runs, a
28
+ file is written), with the host-path leak owned via `transcript_no_host_path: false` to illustrate exactly
29
+ what protocol fidelity does and does not seal.
30
+ - **Documentation for previously-undocumented surfaces:** `sync --allow-empty`, `boundary-check --session`,
31
+ `decide`'s `--decider-dir` rejection, `verify-cassettes`'s non-recursive scan, `replay` (one file vs
32
+ `record` batching), `gates` raw-output (no envelope), `gate_answers_delivered: false`, python
33
+ `run_scenario()`, six public reproducibility env vars, and HELP text for `chat --fidelity/--model` and
34
+ `sync --allow-empty`. Plus a zero-dependency "try it in 10s" `replay` lead in the README quick start.
35
+
36
+ ### Changed
37
+
38
+ - **Uniform CLI argument validation.** A shared declarative argument parser backs the cassette commands
39
+ (`record`/`replay`/`verify-cassettes`) + `boundary-check`, and **every** command now rejects unknown flags,
40
+ extra positionals, and flag-looking values for path/id flags instead of silently ignoring them — closing a
41
+ class of silent-accept parsing footguns. This is enforced going forward by a structural test (every command
42
+ must reject an unknown flag) and a CI grep-ban on the legacy first-non-dash-token idiom. Error paths only;
43
+ valid invocations are unchanged.
44
+ - **The npm package ships `scenario.py`** (the linter/scaffolder) and publishes with provenance attestation so
45
+ CI consumers can lint without a skill checkout.
46
+ - **Agent-binary discovery falls back to the newest staged build.** When the baseline's exact
47
+ `claude-code-vm/<ver>/claude` is absent (e.g. Cowork staged a newer build), the harness now uses the newest
48
+ staged sibling with a warning instead of hard-failing; `COWORK_AGENT_BINARY` still takes precedence.
49
+ - **`chat --fidelity` now validates its argument** — a value other than `container`/`hostloop` is rejected
50
+ (exit 2) instead of being silently coerced to `container` (a fidelity footgun).
51
+ - **`assert --list`** now describes `replay_protocol_fidelity` as replay-only and **not authorable** (it is
52
+ synthesized by the replay lane and rejected if written in a scenario).
53
+
54
+ ### Fixed
55
+
56
+ - **CLI parsing hygiene across commands.** `run` now treats an empty scenario directory as a loud non-zero
57
+ (was a vacuous exit-0 pass); `record`/`verify-cassettes`/`gates` no longer mistake a `--output-format`/
58
+ `--allow` value for the positional target; `trace` rejects mutually-exclusive view flags and extra targets;
59
+ `scaffold`/`assert --list` validate `--output-format` and reject stray arguments; `decide` rejects unknown
60
+ flags, stray positionals, `--intent` without `--decider-llm`, an `--decider-llm`+`--answer` conflict, and a
61
+ flag-looking `--decider-cmd` value; `vm` validates its subcommand before loading a baseline; `boundary-check`
62
+ rejects unknown flags; the global `--dotenv=<path>` equals form is accepted; and `--output-format=<x>`
63
+ validates the value rather than silently degrading to text.
64
+ - **Cassette replay safety.** `replay` routes reads through the safe cassette reader (a malformed cassette is a
65
+ clean error, not an internal crash); a lenient schema guards the dereferenced `scenario`/`events` fields and
66
+ a missing optional `assert` is normalized so it can't crash a batch; manifest bodies are stored with an
67
+ encoding marker (binary as base64) so non-text artifacts round-trip byte-exactly; materialized entries are
68
+ path-contained (no `..`/absolute escape) and verified against their recorded sha256.
69
+ - **Skill-staleness hash no longer self-invalidates.** The `skillHash` fingerprint now excludes recorded
70
+ cassettes (`*.cassette.json`, by extension) and VCS/cache dirs (`.git`/`node_modules`/`__pycache__`/…), so
71
+ writing a cassette under the hashed skill tree no longer changes the fingerprint it just recorded (and a
72
+ repo that co-locates committed cassettes with the skill stops falsely tripping the staleness gate). Real
73
+ skill-source edits — including under a `tests/` dir — still change the hash (kept conservative: no
74
+ false-negative).
75
+ - **Staging source validation.** Every declared session source now resolves through one central choke point
76
+ (`resolveDeclaredSource`, guarded by a structural test): `mcp.config` must be a file; connected folders,
77
+ local/remote plugin roots, and local skills must be directories; a nameless marketplace manifest now
78
+ resolves and qualifier-matches by its derived name; and a corrupt `plugin.json` errors instead of silently
79
+ defaulting to version `0.0.0`. The soft-missing reconciliation path is preserved (a missing source still
80
+ reconciles; only a wrong-kind existing source fails loud).
81
+ - **Artifact collection no longer follows symlinks** (`lstat` + symlink-skip + a realpath cycle guard), and the
82
+ egress sidecar/proxy are acquired inside the protected block so a prompt-render throw can't leak them.
83
+ - **Egress/web-fetch guards.** The private-address guard recognizes IPv4-mapped IPv6 and numeric/hex/octal IPv4
84
+ loopback forms; a host-side `web_fetch` to a hostname that **resolves** to a private/loopback address is now
85
+ denied (DNS-rebind/SSRF, fail-closed — a name that won't resolve is also denied), checked on every redirect
86
+ hop; the proxy parses bracketed IPv6 `Host` headers; and an `allow` egress decision is recorded only once the
87
+ upstream actually connects (so `egress_allowed` can't pass when nothing reached the host).
88
+ - **Verdict/assertion correctness.** A nonzero child exit after a success result is now fatal (with the stderr
89
+ tail); `artifact_json` `equals`/`in` compare JSON with key-order-insensitive deep equality (arrays stay
90
+ order-significant); the external decider rejects an invalid permission `behavior` loudly instead of silently
91
+ denying; `no_delete_in_outputs` accepts only `true` (authoring `false` was a silent no-op footgun); and the
92
+ outputs-delete detector parses `mv` direction (a move *into* `outputs/` is no longer a false delete) with an
93
+ opt-in safe-staging-prefix suppression for scratch cleanups (`COWORK_HARNESS_SAFE_STAGING_PREFIX`).
94
+ - **Python wrapper drift.** `run_scenario()` no longer passes `--fidelity`/`--answer` flags the `run` command
95
+ rejects; fidelity and answers are scenario-authored (the YAML's `fidelity:`/`answers:` fields).
96
+ - **Docs reconciled with the 0.3.0 artifact-manifest replay behavior.** README, SPEC, `docs/scenario.md`,
97
+ the companion `SKILL.md`, and the skill references previously claimed `file_exists`/`user_visible_artifact`/
98
+ `artifact_json` were "always skipped" on replay; they now correctly state these are evaluated **when the
99
+ cassette carries an `artifacts` manifest** (only the live-only egress keys are always skipped), with
100
+ `docs/cassette.md` flagged as canonical and `allow_permissive_auto_allow` added to its table.
101
+ - **Corrected the claim that the `protocol` tier needs no token** — L0 spawns the host `claude` and calls a
102
+ real model, so it needs the auth token (Docker-free/agent-free, not token-free).
103
+ - **Aligned stale references:** npx floor `>=0.2.0` → `>=0.3.0`; skill reference headers `0.1.0` → `0.3.0`;
104
+ stale `cassette.ts` line-cites → the `contentKeys` symbol; and the broken `DESIGN.md §1` anchor.
105
+ - **Doc accuracy:** all five fidelity values (vs "L0/L1/L2"), `max_thinking_tokens` over "extended thinking",
106
+ the `config_dir` write-guard caveat, the `boundary-check` (exit 1) vs `BoundaryError` (exit 2) exit-code
107
+ distinction, and the `npm run ci` vs CI-Stage-1 gate framing.
108
+
9
109
  ## [0.3.0] — 2026-06-17
10
110
 
11
111
  The CI-operate + privacy layer for committed cassettes: record-time redaction, an always-on