cowork-harness 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/skills/cowork-harness/scripts/scenario.py +560 -0
- package/CHANGELOG.md +100 -0
- package/README.md +27 -13
- package/dist/assert.js +40 -1
- package/dist/baseline.js +48 -5
- package/dist/cli-args.js +61 -0
- package/dist/cli.js +163 -37
- package/dist/decide/decider.js +6 -0
- package/dist/egress/proxy.js +15 -7
- package/dist/hostloop/workspace-handler.js +135 -21
- package/dist/run/cassette.js +213 -118
- package/dist/run/chat.js +8 -2
- package/dist/run/envelope.js +7 -4
- package/dist/run/execute.js +145 -33
- package/dist/run/inputs.js +23 -0
- package/dist/run/run.js +9 -1
- package/dist/run/scenario-tool.js +37 -0
- package/dist/run/skill-hash.js +60 -0
- package/dist/runtime/stage.js +14 -6
- package/dist/session.js +43 -15
- package/dist/staging/resolve.js +59 -4
- package/dist/types.js +6 -3
- package/docs/README.md +1 -1
- package/docs/boundary.md +10 -2
- package/docs/cassette.md +10 -0
- package/docs/decider-dir.md +3 -0
- package/docs/discovery.md +5 -1
- package/docs/maintenance.md +8 -0
- package/docs/scenario.md +10 -5
- package/package.json +2 -1
- package/python/README.md +14 -0
- package/python/cowork_harness.py +12 -11
- package/python/test_cowork_lane.py +31 -0
- package/schema/scenario.schema.json +3 -2
|
@@ -0,0 +1,560 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""scenario.py — author and check cowork-harness scenarios without hallucinating the schema.
|
|
3
|
+
|
|
4
|
+
Two subcommands:
|
|
5
|
+
|
|
6
|
+
scenario.py scaffold ... emit a VALID scenario skeleton (real keys, correct tier,
|
|
7
|
+
content vs live-only assertions split one-per-item). The
|
|
8
|
+
generator self-lints its own output and refuses to emit a
|
|
9
|
+
scenario its own linter would reject.
|
|
10
|
+
|
|
11
|
+
scenario.py lint FILE... catch silent false-greens in existing scenarios. The
|
|
12
|
+
cowork-harness has several ways to make a check *silently
|
|
13
|
+
do nothing*; this encodes those invariants so they fail at
|
|
14
|
+
author time / in CI instead of rotting as a green-but-empty
|
|
15
|
+
assertion.
|
|
16
|
+
|
|
17
|
+
lint flags (see references/scenario-schema.md for the why of each):
|
|
18
|
+
E egress assertion on `fidelity: protocol` (the harness rejects this run)
|
|
19
|
+
E on_unanswered: agent / invalid value (schema rejects `agent`)
|
|
20
|
+
E authored `replay_protocol_fidelity` assertion (replay-synthesized only)
|
|
21
|
+
E `assertions:` instead of `assert:` (block ignored → every check no-ops)
|
|
22
|
+
W no content assertion → no-op on a replay gate (every assertion is fs/egress)
|
|
23
|
+
W mixed-class assert item → fs/egress half dropped on replay
|
|
24
|
+
W unknown top-level / assertion key (typo or hallucinated schema)
|
|
25
|
+
W double-quoted regex with a backslash (YAML eats the backslash)
|
|
26
|
+
I gate key present → needs a controlOut cassette on replay
|
|
27
|
+
|
|
28
|
+
Designed for agents and CI: non-interactive, --help, --json, meaningful exit codes,
|
|
29
|
+
idempotent. `lint` exits 1 on any ERROR (or any finding with --strict); else 0.
|
|
30
|
+
|
|
31
|
+
Requires PyYAML (`pip install pyyaml`).
|
|
32
|
+
"""
|
|
33
|
+
from __future__ import annotations
|
|
34
|
+
|
|
35
|
+
import argparse
|
|
36
|
+
import json
|
|
37
|
+
import re
|
|
38
|
+
import sys
|
|
39
|
+
from pathlib import Path
|
|
40
|
+
|
|
41
|
+
# --- the replay-class taxonomy (mirrors `contentKeys` in src/run/cassette.ts) ---
|
|
42
|
+
# Assertion keys grouped by how the replay lane treats them.
CONTENT_KEYS = {
    "result",
    "transcript_contains", "transcript_not_contains",
    "transcript_matches", "transcript_not_matches",
    "tool_called", "tool_not_called",
    "subagent_tool_used", "subagent_tool_absent",
    "subagent_dispatched", "subagent_declared_but_unused",
    "dispatch_count_max",
}
# content keys, but only evaluated on replay when the cassette carries controlOut
GATE_KEYS = {"question_asked", "questions_count_max", "gate_answers_delivered"}
# the network half of the live-only keys
EGRESS_KEYS = {"egress_denied", "egress_allowed"}
# live-only: silently skipped on replay (no filesystem, no network)
FS_EGRESS_KEYS = {
    "file_exists",
    "user_visible_artifact",
    "no_delete_in_outputs",
    "self_heal_ran",
    "transcript_no_host_path",
} | EGRESS_KEYS
# every valid key inside an `assert:` list item
ASSERT_KEYS = set().union(
    CONTENT_KEYS, GATE_KEYS, FS_EGRESS_KEYS, {"replay_protocol_fidelity"}
)
# every valid top-level scenario key
TOP_LEVEL_KEYS = {
    "name", "baseline", "session", "fidelity", "on_unanswered",
    "prompt", "answers", "expect_denied", "assert",
}
# keys whose value is a regex (scanned by the double-quote/backslash rule)
REGEX_KEYS = {
    "transcript_matches", "transcript_not_matches",
    "when_question", "subagent_dispatched", "question_asked",
}
VALID_ON_UNANSWERED = {"fail", "prompt", "first", "llm"}
# a tuple, so the order is stable wherever the tiers are listed
VALID_TIERS = ("protocol", "container", "microvm", "hostloop", "cowork")
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
class Finding:
    """A single lint result.

    Fields: ``severity`` ("ERROR" | "WARN" | "INFO"), ``rule`` (rule id),
    ``message``, ``fix`` (suggested remedy), ``file`` (path label) and
    ``line`` (1-based line number, or ``None`` when not line-specific).
    """

    __slots__ = ("severity", "rule", "message", "fix", "file", "line")

    def __init__(self, severity, rule, message, fix, file, line=None):
        # The constructor arguments are declared in the same order as __slots__.
        values = (severity, rule, message, fix, file, line)
        for slot, value in zip(self.__slots__, values):
            setattr(self, slot, value)

    def as_dict(self):
        """Return the finding as a plain dict keyed by field name (slot order)."""
        return {slot: getattr(self, slot) for slot in self.__slots__}
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def _assert_items(doc):
    """Return the scenario's assert items as a list of dicts.

    Tolerates loose shapes: a single mapping is wrapped in a list, non-dict
    list entries are dropped, and a missing or otherwise-typed ``assert``
    value yields an empty list.
    """
    raw = doc.get("assert")
    if isinstance(raw, dict):  # a single mapping written instead of a list
        return [raw]
    if not isinstance(raw, list):  # covers None and scalars
        return []
    return [entry for entry in raw if isinstance(entry, dict)]
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _all_assert_keys(items):
    """Return the set of every key used across all assert items."""
    return set().union(*(item.keys() for item in items))
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def lint_doc(doc, path, raw_lines):
    """Lint one parsed scenario document and return its findings.

    Args:
        doc: The object obtained by parsing the scenario YAML. Anything that
            is not a mapping yields a single ``parse`` ERROR.
        path: Label stored in every finding's ``file`` field.
        raw_lines: The scenario's raw text lines; scanned by the
            double-quoted-regex rule, which works on text rather than on
            the parsed document.

    Returns:
        A list of ``Finding`` objects, in rule order.
    """
    findings = []

    def report(severity, rule, message, fix):
        findings.append(Finding(severity, rule, message, fix, path))

    if not isinstance(doc, dict):
        report(
            "ERROR",
            "parse",
            "scenario is not a YAML mapping (expected top-level keys like prompt/assert)",
            "Check the file is a single scenario document.",
        )
        return findings

    fidelity = doc.get("fidelity") or "container"
    items = _assert_items(doc)
    assert_keys = _all_assert_keys(items)
    has_expect_denied = bool(doc.get("expect_denied"))

    # E: `assertions:` instead of `assert:` — a common hallucination. The block is
    # silently ignored by the harness, so every "assertion" is a no-op (false-green).
    assertions_reported = "assertions" in doc and "assert" not in doc
    if assertions_reported:
        report(
            "ERROR",
            "assertions-key",
            "scenario uses `assertions:` — the real key is `assert:`. The harness ignores "
            "`assertions:`, so NONE of these checks run (a guaranteed silent false-green).",
            "Rename the block to `assert:` and use flat keys (e.g. `- file_exists: outputs/x.md`).",
        )

    # W: unknown top-level keys (typo or hallucinated schema). `assertions` is skipped
    # only when the ERROR above already covered it; when it sits next to a real
    # `assert:` block it is still an ignored key and must not pass unmentioned.
    for k in doc:
        if k in TOP_LEVEL_KEYS or k == "profile":
            continue
        if k == "assertions" and assertions_reported:
            continue
        report(
            "WARN",
            "unknown-top-key",
            f"unknown scenario key `{k}` — not part of the schema (typo or hallucination?).",
            f"Valid top-level keys: {', '.join(sorted(TOP_LEVEL_KEYS))}.",
        )

    # W: unknown assertion keys inside assert items (e.g. invented file_not_empty, kind, path).
    # Sort by str(): YAML keys may be ints/bools, and mixed types are not orderable.
    for k in sorted(assert_keys - ASSERT_KEYS, key=str):
        report(
            "WARN",
            "unknown-assert-key",
            f"unknown assertion key `{k}` — not in the assertion catalog (the harness would "
            "ignore it, so it silently does nothing).",
            "Use a real assertion key — see references/scenario-schema.md for the full catalog.",
        )

    # E: egress assertion on protocol fidelity (the harness rejects the run)
    egress_used = bool(assert_keys & EGRESS_KEYS) or has_expect_denied
    if fidelity == "protocol" and egress_used:
        report(
            "ERROR",
            "egress-on-protocol",
            "egress assertion (egress_*/expect_denied) on `fidelity: protocol` — the harness "
            "rejects this run because protocol has no egress enforcement (it would false-pass).",
            "Use fidelity: container (or microvm/hostloop) for any egress/expect_denied check.",
        )

    # E: retired/invalid on_unanswered. The isinstance guard keeps an unhashable
    # value (a YAML list or mapping) from raising TypeError in the set lookup.
    ou = doc.get("on_unanswered")
    ou_valid = isinstance(ou, str) and ou in VALID_ON_UNANSWERED
    if ou is not None and not ou_valid:
        extra = " (`agent` was renamed to `llm`)" if ou == "agent" else ""
        report(
            "ERROR",
            "on-unanswered-invalid",
            f"on_unanswered: {ou} is not a valid value{extra}.",
            "Use one of: fail | prompt | first | llm (YAML). For a live model use on_unanswered: llm.",
        )

    # E: authored replay_protocol_fidelity
    if "replay_protocol_fidelity" in assert_keys:
        report(
            "ERROR",
            "authored-replay-fidelity",
            "`replay_protocol_fidelity` is synthesized by the replay lane only and cannot be authored.",
            "Remove it — on a live run it evaluates as an empty assertion.",
        )

    # W: no content assertion → a replay PR gate verifies nothing
    if items and not (assert_keys & (CONTENT_KEYS | GATE_KEYS)):
        report(
            "WARN",
            "replay-noop",
            "every assertion is filesystem/egress — on the token-free `replay` lane they are "
            "ALL silently skipped, so a replay PR gate would verify nothing.",
            "Add a content assertion (result / transcript_* / tool_* / subagent_*) or run this "
            "scenario only on the live (run/record) lane.",
        )

    # W: mixed-class assert item → fs/egress half dropped on replay
    for idx, item in enumerate(items):
        ks = set(item.keys())
        content_half = ks & (CONTENT_KEYS | GATE_KEYS)
        fs_half = ks & FS_EGRESS_KEYS
        if content_half and fs_half:
            report(
                "WARN",
                "mixed-assert-item",
                f"assert item #{idx} mixes content {sorted(content_half)} with "
                f"filesystem/egress {sorted(fs_half)} — on replay the filesystem/egress half is "
                "dropped (only the content half is evaluated).",
                "Split into separate list items: one per concern.",
            )

    # I: gate keys need a controlOut cassette on replay
    gate_present = sorted(assert_keys & GATE_KEYS)
    if gate_present:
        report(
            "INFO",
            "gate-needs-controlout",
            f"gate assertion(s) {gate_present} only evaluate on replay when the cassette has "
            "controlOut (full-fidelity). An old cassette excludes them (with a loud warning).",
            "Re-record with a current harness so the cassette carries controlOut.",
        )

    # W: double-quoted regex with a backslash (raw-text scan — the parser already ate it)
    findings.extend(_lint_regex_quoting(path, raw_lines))

    return findings
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
# Matches `[- ]<regex key>: "<double-quoted value containing a backslash>"`.
# Group 1 is the key, group 2 the quoted value.
_DQ_REGEX_LINE = re.compile(
    r'^\s*-?\s*(%s)\s*:\s*"([^"]*\\[^"]*)"' % "|".join(sorted(REGEX_KEYS))
)
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
def _lint_regex_quoting(path, raw_lines):
    """Return a WARN finding for each raw line matching ``_DQ_REGEX_LINE``.

    That pattern targets a regex-valued key whose value is double-quoted and
    contains a backslash. Line numbers in the findings are 1-based.
    """
    numbered_hits = (
        (lineno, _DQ_REGEX_LINE.match(text))
        for lineno, text in enumerate(raw_lines, start=1)
    )
    return [
        Finding(
            "WARN",
            "regex-double-quoted",
            f"`{hit.group(1)}` uses a DOUBLE-quoted regex containing a backslash "
            f'("{hit.group(2)}") — YAML strips the backslash, so the regex is wrong.',
            "Single-quote the regex (e.g. '\\d+') or use a block scalar. Use [\\s\\S] not . to span turns.",
            path,
            lineno,
        )
        for lineno, hit in numbered_hits
        if hit
    ]
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
def _require_yaml():
    """Import and return the ``yaml`` module, or exit with status 2 and an
    install hint on stderr when PyYAML is not available."""
    try:
        import yaml  # type: ignore
    except ImportError:
        print("scenario.py requires PyYAML. Install it: pip install pyyaml", file=sys.stderr)
        sys.exit(2)
    return yaml
|
|
321
|
+
|
|
322
|
+
|
|
323
|
+
def lint_file(path):
    """Lint the scenario file at ``path`` and return a list of ``Finding``.

    Every failure mode is reported as an ERROR finding rather than raised, so
    one bad file never aborts a multi-file lint run:

    * ``not-found``  — ``path`` is not a regular file.
    * ``unreadable`` — the file cannot be read or is not valid UTF-8.
    * ``parse``      — the YAML does not parse. The raw-text regex-quoting
      findings are still returned in this case, because a bad double-quoted
      regex (e.g. "\\d") is exactly a case that can fail to parse.
    """
    yaml = _require_yaml()
    p = Path(path)
    if not p.is_file():
        return [Finding("ERROR", "not-found", f"file not found: {path}", "Check the path.", path)]
    try:
        text = p.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as e:
        return [
            Finding(
                "ERROR",
                "unreadable",
                f"cannot read file: {e}",
                "Check the file is readable UTF-8 text.",
                path,
            )
        ]
    raw_lines = text.splitlines()
    try:
        doc = yaml.safe_load(text)
    except yaml.YAMLError as e:
        # An exception with an empty message must not turn into an IndexError.
        msg = (str(e).splitlines() or ["unknown error"])[0]
        # lint_doc never runs on this path, so do the raw-text scan here.
        return _lint_regex_quoting(path, raw_lines) + [
            Finding("ERROR", "parse", f"YAML parse error: {msg}", "Fix the YAML syntax.", path)
        ]
    # lint_doc performs the regex-quoting scan itself on the success path.
    return lint_doc(doc, path, raw_lines)
|
|
341
|
+
|
|
342
|
+
|
|
343
|
+
# Sort rank per severity: most severe first.
SEV_ORDER = {severity: rank for rank, severity in enumerate(("ERROR", "WARN", "INFO"))}
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
def _print_findings(findings, n_files):
    """Print a human-readable report of ``findings`` to stdout.

    With no findings a single "clean" line is printed. Otherwise findings are
    listed grouped by file and ordered by severity within each file, followed
    by a per-severity summary across ``n_files`` files.
    """
    if not findings:
        print(f"✓ {n_files} scenario(s) clean — no silent-false-green findings.")
        return

    glyphs = {"ERROR": "✗", "WARN": "⚠", "INFO": "ℹ"}
    ordered = sorted(findings, key=lambda f: (str(f.file), SEV_ORDER[f.severity]))
    for finding in ordered:
        if finding.line:
            loc = f"{finding.file}:{finding.line}"
        else:
            loc = finding.file
        print(f"{glyphs[finding.severity]} {finding.severity} [{finding.rule}] {loc}")
        print(f" {finding.message}")
        print(f" fix: {finding.fix}")

    tally = dict.fromkeys(("ERROR", "WARN", "INFO"), 0)
    for finding in findings:
        tally[finding.severity] += 1
    n_err, n_warn, n_info = tally["ERROR"], tally["WARN"], tally["INFO"]
    print(f"\n{n_err} error(s), {n_warn} warning(s), {n_info} info across {n_files} file(s).")
|
|
360
|
+
|
|
361
|
+
|
|
362
|
+
def cmd_lint(args):
    """Run the ``lint`` subcommand and return its exit status.

    Lints every path in ``args.files`` and prints the findings as JSON
    (``args.json``) or as a text report. Returns 1 when any finding is an
    ERROR, or when ``args.strict`` is set and there is any finding at all;
    otherwise 0.
    """
    collected = [finding for target in args.files for finding in lint_file(target)]

    if args.json:
        print(json.dumps([finding.as_dict() for finding in collected], indent=2))
    else:
        _print_findings(collected, len(args.files))

    if any(finding.severity == "ERROR" for finding in collected):
        return 1
    return 1 if (args.strict and collected) else 0
|
|
374
|
+
|
|
375
|
+
|
|
376
|
+
# --------------------------------------------------------------------------- #
|
|
377
|
+
# scaffold
|
|
378
|
+
# --------------------------------------------------------------------------- #
|
|
379
|
+
|
|
380
|
+
def _sq(s):
    """Render ``s`` as a single-quoted YAML scalar.

    Internal single quotes are doubled. Single quotes keep regex backslashes
    literal — double quotes would eat them (the regex-quoting gotcha).
    """
    escaped = "''".join(str(s).split("'"))
    return f"'{escaped}'"
|
|
384
|
+
|
|
385
|
+
|
|
386
|
+
def _split_kv(spec, flag):
    """Split ``spec`` at its first ``=`` into a stripped ``(key, value)`` pair.

    When ``spec`` contains no ``=``, a usage message naming ``flag`` is
    printed to stderr and the process exits with status 2.
    """
    key, separator, value = spec.partition("=")
    if not separator:
        print(f"{flag} expects '<regex>=<choice>', got: {spec}", file=sys.stderr)
        sys.exit(2)
    return key.strip(), value.strip()
|
|
392
|
+
|
|
393
|
+
|
|
394
|
+
# A value matching this pattern (and not a YAML keyword) is safe to emit unquoted.
_PLAIN_YAML_SCALAR = re.compile(r"[A-Za-z_/][A-Za-z0-9_./-]*\Z")
# Words a YAML 1.1 loader (PyYAML) resolves to bool/null when unquoted.
_YAML_KEYWORDS = {"true", "false", "null", "yes", "no", "on", "off", "y", "n"}


def _yaml_scalar(value):
    """Return ``value`` as a YAML scalar: unquoted when unambiguous, else single-quoted."""
    text = str(value)
    if _PLAIN_YAML_SCALAR.match(text) and text.lower() not in _YAML_KEYWORDS:
        return text
    return _sq(text)


def build_scenario(args):
    """Return ``(yaml_text, notes)`` for the scaffolded scenario.

    Encodes the convergent skeleton: container by default, scripted answers +
    ``on_unanswered: fail``, content-class assertions first then live-only,
    one concern per item.

    Args:
        args: Parsed ``scaffold`` arguments. Reads ``name``, ``tier``,
            ``prompt``, ``session``, ``skill``, ``gate``, ``web_fetch``,
            ``content``, ``tool``, ``subagent``, ``file``, ``artifact``,
            ``no_delete``, ``egress_allowed`` and ``egress_denied``.

    Returns:
        The scenario YAML (newline-terminated) and a list of advisory notes.

    Free-form values (name, session, tool names, paths, hosts) are written
    unquoted only when that is unambiguous; anything else is single-quoted so
    characters such as ``*``, ``#`` or ``: `` cannot break or silently
    truncate the emitted YAML.
    """
    notes = []
    tier = args.tier
    egress_asserted = bool(args.egress_denied or args.egress_allowed)

    # Never emit a scenario the linter would reject: protocol + egress is rejected by the harness.
    if tier == "protocol" and egress_asserted:
        tier = "container"
        notes.append(
            "tier auto-upgraded protocol → container: egress assertions need a sandboxed tier "
            "(protocol is rejected by the harness)."
        )

    gates = [_split_kv(g, "--gate") for g in (args.gate or [])]

    lines = []
    lines.append(f"# {args.name} — cowork-harness scenario (scaffolded; edit the TODOs).")
    lines.append(
        f"# Tier '{tier}': "
        + ("sandbox + real default-deny egress." if tier == "container"
           else "see references/fidelity-and-answers.md.")
        + " on_unanswered: fail keeps this deterministic for CI."
    )
    if args.skill:
        lines.append(f"# Mount the skill under test ({args.skill}) via a session: e.g.")
        lines.append("#   plugins:")
        lines.append(f"#     local_plugins: [{args.skill}]")
        lines.append("#     enabled: [<plugin-name>@local]")
    lines.append("")
    lines.append(f"name: {_yaml_scalar(args.name)}")
    lines.append("baseline: latest")
    if args.session:
        lines.append(f"session: {_yaml_scalar(args.session)}")
    lines.append(f"fidelity: {tier}")
    lines.append("on_unanswered: fail")
    lines.append("")
    lines.append("prompt: |")
    for line in (args.prompt or "TODO: the user turn that drives the skill.").splitlines() or [""]:
        lines.append(f"  {line}")

    # answers (scripted gates + web_fetch approvals) — the only deterministic path.
    # Continuation keys are indented to line up with the key that follows "- ".
    if gates or args.web_fetch:
        lines.append("")
        lines.append("answers:")
        for rx, choice in gates:
            lines.append(f"  - when_question: {_sq(rx)}")
            lines.append(f"    choose: {_sq(choice)}")
        for dom in (args.web_fetch or []):
            rule = f"webfetch:{dom}"
            # Double quotes are kept for ordinary domains; a quote or backslash
            # in the value would corrupt that form, so fall back to single quotes.
            quoted = _sq(rule) if ('"' in rule or "\\" in rule) else f'"{rule}"'
            lines.append(f"  - when_tool: {quoted}  # web_fetch approval (provenance-miss gate)")
            lines.append("    decide: allow")
            lines.append("    grant: domain")

    # assertions: content/structure first (replay PR gate), then live-only (filesystem/egress)
    content_lines = ["  - result: success"]
    for rx in (args.content or []):
        content_lines.append(f"  - transcript_matches: {_sq(rx)}")
    for tool in (args.tool or []):
        content_lines.append(f"  - tool_called: {_yaml_scalar(tool)}")
    for rx in (args.subagent or []):
        content_lines.append(
            f"  - subagent_dispatched: {_sq(rx)}  # matches agentType OR dispatch description"
        )
    if gates:
        for rx, _ in gates:
            content_lines.append(
                f"  - question_asked: {_sq(rx)}  # gate key: replay only with a controlOut cassette"
            )
        content_lines.append(f"  - questions_count_max: {len(gates)}")
        content_lines.append(
            "  - gate_answers_delivered: true  # the steered answers actually reached the model"
        )

    live_lines = []
    for p in (args.file or []):
        live_lines.append(f"  - file_exists: {_yaml_scalar(p)}")
    for p in (args.artifact or []):
        live_lines.append(f"  - user_visible_artifact: {_yaml_scalar(p)}")
    if args.no_delete:
        live_lines.append("  - no_delete_in_outputs: true")
    for h in (args.egress_allowed or []):
        live_lines.append(f"  - egress_allowed: {_yaml_scalar(h)}")
    for h in (args.egress_denied or []):
        live_lines.append(f"  - egress_denied: {_yaml_scalar(h)}")

    lines.append("")
    lines.append("assert:")
    lines.append("  # --- content / structure: evaluate on the token-free replay PR gate AND live ---")
    lines.extend(content_lines)
    if live_lines:
        lines.append("  # --- filesystem / egress: LIVE-only (silently skipped on replay) ---")
        lines.extend(live_lines)
    else:
        lines.append("  # TODO add filesystem/egress checks (file_exists / user_visible_artifact /")
        lines.append("  #      egress_denied / no_delete_in_outputs) — they run on the LIVE lane only.")

    if args.web_fetch:
        notes.append(
            "web_fetch: put the URL in the prompt so it is provenanced (the deterministic way to make a "
            "fetch succeed). egress.extra_allow is a NO-OP on the provenanced path — provenance is the gate."
        )
    if not (args.content or args.tool or args.subagent or gates):
        notes.append("only `result: success` is a content assertion — add a transcript_matches / tool_called "
                     "so the replay PR gate verifies something real.")

    return "\n".join(lines) + "\n", notes
|
|
494
|
+
|
|
495
|
+
|
|
496
|
+
def cmd_scaffold(args):
    """Run the ``scaffold`` subcommand and return its exit status.

    Builds the scenario, self-lints it unless ``args.no_validate`` is set,
    then writes it to ``args.out`` or to stdout. Advisory notes go to stderr.

    Returns:
        0 on success; 2 when the generated scenario fails to parse or its own
        linter reports an ERROR (either one is a scaffolder bug).
    """
    text, notes = build_scenario(args)

    # Dogfood: self-lint the generated scenario; refuse to emit something the linter rejects.
    if not args.no_validate:
        # PyYAML is needed only for validation, so --no-validate works without it.
        yaml = _require_yaml()
        try:
            doc = yaml.safe_load(text)
        except yaml.YAMLError as exc:
            detail = (str(exc).splitlines() or ["unknown error"])[0]
            print(
                f"scaffold produced YAML that does not parse (this is a bug): {detail}",
                file=sys.stderr,
            )
            return 2
        findings = lint_doc(doc, "<scaffold>", text.splitlines())
        errors = [f for f in findings if f.severity == "ERROR"]
        if errors:
            print("scaffold produced a scenario its own linter rejects (this is a bug):", file=sys.stderr)
            for e in errors:
                print(f" ✗ [{e.rule}] {e.message}", file=sys.stderr)
            return 2

    if args.out:
        Path(args.out).write_text(text, encoding="utf-8")
        print(f"✓ wrote {args.out}", file=sys.stderr)
    else:
        sys.stdout.write(text)

    for n in notes:
        print(f"note: {n}", file=sys.stderr)
    return 0
|
|
520
|
+
|
|
521
|
+
|
|
522
|
+
def main(argv=None):
    """Parse ``argv`` (``sys.argv[1:]`` when ``None``) and dispatch to the
    chosen subcommand (``lint`` or ``scaffold``), returning its exit status."""
    parser = argparse.ArgumentParser(
        prog="scenario.py",
        description="Author (scaffold) and check (lint) cowork-harness scenarios.",
    )
    commands = parser.add_subparsers(dest="command", required=True)

    # --- lint ---
    lint = commands.add_parser("lint", help="lint scenario(s) for silent-false-green invariants")
    lint.add_argument("files", nargs="+", help="scenario YAML file(s) to lint")
    for flag, text in (
        ("--json", "emit findings as JSON"),
        ("--strict", "exit non-zero on WARN/INFO too, not just ERROR"),
    ):
        lint.add_argument(flag, action="store_true", help=text)
    lint.set_defaults(func=cmd_lint)

    # --- scaffold ---
    scaffold = commands.add_parser("scaffold", help="emit a valid scenario skeleton (self-linted)")

    def repeatable(flag, metavar, text):
        # argparse derives dest from the flag (e.g. --web-fetch -> web_fetch).
        scaffold.add_argument(flag, action="append", metavar=metavar, help=text)

    scaffold.add_argument("--name", default="my-scenario", help="scenario name (default: my-scenario)")
    scaffold.add_argument("--prompt", help="the user turn (the prompt: block)")
    scaffold.add_argument(
        "--tier", choices=VALID_TIERS, default="container", help="fidelity tier (default: container)"
    )
    scaffold.add_argument("--session", help="path for the session: field (discovery/setup file)")
    scaffold.add_argument("--skill", help="skill folder under test — adds a session-mount comment")
    for flag, metavar, text in (
        ("--content", "REGEX", "transcript_matches assertion (repeatable)"),
        ("--tool", "TOOL", "tool_called assertion (repeatable)"),
        ("--subagent", "REGEX", "subagent_dispatched assertion (repeatable)"),
        ("--gate", "REGEX=CHOICE", "scripted AskUserQuestion answer (repeatable)"),
        ("--web-fetch", "DOMAIN", "web_fetch approval rule (repeatable)"),
        ("--file", "PATH", "file_exists assertion (repeatable)"),
        ("--artifact", "PATH", "user_visible_artifact assertion (repeatable)"),
    ):
        repeatable(flag, metavar, text)
    scaffold.add_argument("--no-delete", action="store_true", help="add no_delete_in_outputs: true")
    for flag, metavar, text in (
        ("--egress-allowed", "HOST", "egress_allowed assertion (repeatable)"),
        ("--egress-denied", "HOST", "egress_denied assertion (repeatable)"),
    ):
        repeatable(flag, metavar, text)
    scaffold.add_argument("--out", help="write to this file (default: stdout)")
    scaffold.add_argument(
        "--no-validate", action="store_true", help="skip the self-lint of the generated scenario"
    )
    scaffold.set_defaults(func=cmd_scaffold)

    parsed = parser.parse_args(argv)
    return parsed.func(parsed)
|
|
557
|
+
|
|
558
|
+
|
|
559
|
+
# Script entry point: the subcommand's return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
package/CHANGELOG.md
CHANGED
|
@@ -6,6 +6,106 @@ All notable changes to this project are documented here. The format is based on
|
|
|
6
6
|
|
|
7
7
|
## [Unreleased]
|
|
8
8
|
|
|
9
|
+
## [0.4.0] — 2026-06-18
|
|
10
|
+
|
|
11
|
+
The parsing/validation hardening + safety release: a current-tree code-review sweep plus fidelity and
|
|
12
|
+
robustness findings from real skill-testing sessions — uniform fail-loud CLI parsing (enforced by a
|
|
13
|
+
structural test + CI guard), a centralized staging-source resolver, cassette replay/manifest safety
|
|
14
|
+
(base64 + containment + hash-verify), egress SSRF/DNS-rebind hardening, `replay <dir>`, and `cowork-harness lint`.
|
|
15
|
+
|
|
16
|
+
### Added
|
|
17
|
+
|
|
18
|
+
- **`cowork-harness lint <scenario.yaml>…`** — the bundled scenario linter/scaffolder (`scenario.py`) is now
|
|
19
|
+
shipped in the npm package and reachable as a first-class subcommand, so a consumer who `npm i`s the harness
|
|
20
|
+
(with no skill checkout) can run the no-silent-false-green checks in CI. Needs `python3` + PyYAML; a missing
|
|
21
|
+
interpreter fails with a clear, actionable message.
|
|
22
|
+
- **`replay <dir>`** — `replay` now accepts a directory and replays every `*.cassette.json` in it (sorted,
|
|
23
|
+
non-recursive), exiting on the worst per-cassette verdict, in addition to the existing `--cassette <file>`
|
|
24
|
+
form. An unreadable cassette is reported per-file and forces the JSON envelope's `ok:false` (never a vacuous
|
|
25
|
+
pass), and never aborts the batch.
|
|
26
|
+
- **A shipped `protocol`-tier example** (`examples/scenarios/protocol-smoke.yaml` + its session) — the first
|
|
27
|
+
zero-Docker/zero-agent worked example for the L0 tier (a scripted answer reaches the model, a tool runs, a
|
|
28
|
+
file is written), with the host-path leak owned via `transcript_no_host_path: false` to illustrate exactly
|
|
29
|
+
what protocol fidelity does and does not seal.
|
|
30
|
+
- **Documentation for previously-undocumented surfaces:** `sync --allow-empty`, `boundary-check --session`,
|
|
31
|
+
`decide`'s `--decider-dir` rejection, `verify-cassettes`'s non-recursive scan, `replay` (one file vs
|
|
32
|
+
`record` batching), `gates` raw-output (no envelope), `gate_answers_delivered: false`, Python
|
|
33
|
+
`run_scenario()`, six public reproducibility env vars, and HELP text for `chat --fidelity/--model` and
|
|
34
|
+
`sync --allow-empty`. Plus a zero-dependency "try it in 10s" `replay` lead in the README quick start.
|
|
35
|
+
|
|
36
|
+
### Changed
|
|
37
|
+
|
|
38
|
+
- **Uniform CLI argument validation.** A shared declarative argument parser backs the cassette commands
|
|
39
|
+
(`record`/`replay`/`verify-cassettes`) + `boundary-check`, and **every** command now rejects unknown flags,
|
|
40
|
+
extra positionals, and flag-looking values for path/id flags instead of silently ignoring them — closing a
|
|
41
|
+
class of silent-accept parsing footguns. This is enforced going forward by a structural test (every command
|
|
42
|
+
must reject an unknown flag) and a CI grep-ban on the legacy first-non-dash-token idiom. Error paths only;
|
|
43
|
+
valid invocations are unchanged.
|
|
44
|
+
- **The npm package ships `scenario.py`** (the linter/scaffolder) and publishes with provenance attestation so
|
|
45
|
+
CI consumers can lint without a skill checkout.
|
|
46
|
+
- **Agent-binary discovery falls back to the newest staged build.** When the baseline's exact
|
|
47
|
+
`claude-code-vm/<ver>/claude` is absent (e.g. Cowork staged a newer build), the harness now uses the newest
|
|
48
|
+
staged sibling with a warning instead of hard-failing; `COWORK_AGENT_BINARY` still takes precedence.
|
|
49
|
+
- **`chat --fidelity` now validates its argument** — a value other than `container`/`hostloop` is rejected
|
|
50
|
+
(exit 2) instead of being silently coerced to `container` (a fidelity footgun).
|
|
51
|
+
- **`assert --list`** now describes `replay_protocol_fidelity` as replay-only and **not authorable** (it is
|
|
52
|
+
synthesized by the replay lane and rejected if written in a scenario).
|
|
53
|
+
|
|
54
|
+
### Fixed
|
|
55
|
+
|
|
56
|
+
- **CLI parsing hygiene across commands.** `run` now treats an empty scenario directory as a loud non-zero exit
|
|
57
|
+
(was a vacuous exit-0 pass); `record`/`verify-cassettes`/`gates` no longer mistake a `--output-format`/
|
|
58
|
+
`--allow` value for the positional target; `trace` rejects mutually-exclusive view flags and extra targets;
|
|
59
|
+
`scaffold`/`assert --list` validate `--output-format` and reject stray arguments; `decide` rejects unknown
|
|
60
|
+
flags, stray positionals, `--intent` without `--decider-llm`, a `--decider-llm`+`--answer` conflict, and a
|
|
61
|
+
flag-looking `--decider-cmd` value; `vm` validates its subcommand before loading a baseline; `boundary-check`
|
|
62
|
+
rejects unknown flags; the global `--dotenv=<path>` equals form is accepted; and `--output-format=<x>`
|
|
63
|
+
validates the value rather than silently degrading to text.
|
|
64
|
+
- **Cassette replay safety.** `replay` routes reads through the safe cassette reader (a malformed cassette is a
|
|
65
|
+
clean error, not an internal crash); a lenient schema guards the dereferenced `scenario`/`events` fields and
|
|
66
|
+
a missing optional `assert` is normalized so it can't crash a batch; manifest bodies are stored with an
|
|
67
|
+
encoding marker (binary as base64) so non-text artifacts round-trip byte-exactly; materialized entries are
|
|
68
|
+
path-contained (no `..`/absolute escape) and verified against their recorded sha256.
|
|
69
|
+
- **Skill-staleness hash no longer self-invalidates.** The `skillHash` fingerprint now excludes recorded
|
|
70
|
+
cassettes (`*.cassette.json`, by extension) and VCS/cache dirs (`.git`/`node_modules`/`__pycache__`/…), so
|
|
71
|
+
writing a cassette under the hashed skill tree no longer changes the fingerprint it just recorded (and a
|
|
72
|
+
repo that co-locates committed cassettes with the skill stops falsely tripping the staleness gate). Real
|
|
73
|
+
skill-source edits — including under a `tests/` dir — still change the hash (kept conservative: no
|
|
74
|
+
false-negative).
|
|
75
|
+
- **Staging source validation.** Every declared session source now resolves through one central choke point
|
|
76
|
+
(`resolveDeclaredSource`, guarded by a structural test): `mcp.config` must be a file; connected folders,
|
|
77
|
+
local/remote plugin roots, and local skills must be directories; a nameless marketplace manifest now
|
|
78
|
+
resolves and qualifier-matches by its derived name; and a corrupt `plugin.json` errors instead of silently
|
|
79
|
+
defaulting to version `0.0.0`. The soft-missing reconciliation path is preserved (a missing source still
|
|
80
|
+
reconciles; only a wrong-kind existing source fails loud).
|
|
81
|
+
- **Artifact collection no longer follows symlinks** (`lstat` + symlink-skip + a realpath cycle guard), and the
|
|
82
|
+
egress sidecar/proxy are acquired inside the protected block so a prompt-render throw can't leak them.
|
|
83
|
+
- **Egress/web-fetch guards.** The private-address guard recognizes IPv4-mapped IPv6 and numeric/hex/octal IPv4
|
|
84
|
+
loopback forms; a host-side `web_fetch` to a hostname that **resolves** to a private/loopback address is now
|
|
85
|
+
denied (DNS-rebind/SSRF, fail-closed — a name that won't resolve is also denied), checked on every redirect
|
|
86
|
+
hop; the proxy parses bracketed IPv6 `Host` headers; and an `allow` egress decision is recorded only once the
|
|
87
|
+
upstream actually connects (so `egress_allowed` can't pass when nothing reached the host).
|
|
88
|
+
- **Verdict/assertion correctness.** A nonzero child exit after a success result is now fatal (with the stderr
|
|
89
|
+
tail); `artifact_json` `equals`/`in` compare JSON with key-order-insensitive deep equality (arrays stay
|
|
90
|
+
order-significant); the external decider rejects an invalid permission `behavior` loudly instead of silently
|
|
91
|
+
denying; `no_delete_in_outputs` accepts only `true` (authoring `false` was a silent no-op footgun); and the
|
|
92
|
+
outputs-delete detector parses `mv` direction (a move *into* `outputs/` is no longer a false delete) with an
|
|
93
|
+
opt-in safe-staging-prefix suppression for scratch cleanups (`COWORK_HARNESS_SAFE_STAGING_PREFIX`).
|
|
94
|
+
- **Python wrapper drift.** `run_scenario()` no longer passes `--fidelity`/`--answer` flags the `run` command
|
|
95
|
+
rejects; fidelity and answers are scenario-authored (the YAML's `fidelity:`/`answers:` fields).
|
|
96
|
+
- **Docs reconciled with the 0.3.0 artifact-manifest replay behavior.** README, SPEC, `docs/scenario.md`,
|
|
97
|
+
the companion `SKILL.md`, and the skill references previously claimed `file_exists`/`user_visible_artifact`/
|
|
98
|
+
`artifact_json` were "always skipped" on replay; they now correctly state these are evaluated **when the
|
|
99
|
+
cassette carries an `artifacts` manifest** (only the live-only egress keys are always skipped), with
|
|
100
|
+
`docs/cassette.md` flagged as canonical and `allow_permissive_auto_allow` added to its table.
|
|
101
|
+
- **Corrected the claim that the `protocol` tier needs no token** — L0 spawns the host `claude` and calls a
|
|
102
|
+
real model, so it needs the auth token (Docker-free/agent-free, not token-free).
|
|
103
|
+
- **Aligned stale references:** npx floor `>=0.2.0` → `>=0.3.0`; skill reference headers `0.1.0` → `0.3.0`;
|
|
104
|
+
stale `cassette.ts` line-cites → the `contentKeys` symbol; and repaired the broken `DESIGN.md §1` anchor.
|
|
105
|
+
- **Doc accuracy:** all five fidelity values (vs "L0/L1/L2"), `max_thinking_tokens` over "extended thinking",
|
|
106
|
+
the `config_dir` write-guard caveat, the `boundary-check` (exit 1) vs `BoundaryError` (exit 2) exit-code
|
|
107
|
+
distinction, and the `npm run ci` vs CI-Stage-1 gate framing.
|
|
108
|
+
|
|
9
109
|
## [0.3.0] — 2026-06-17
|
|
10
110
|
|
|
11
111
|
The CI-operate + privacy layer for committed cassettes: record-time redaction, an always-on
|