npm - cowork-harness - Versions diffs - 0.4.2 → 0.4.3 - Mend

cowork-harness 0.4.2 → 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

package/.claude/skills/cowork-harness/scripts/assertion-keys.json +30 -0
package/.claude/skills/cowork-harness/scripts/scenario.py +89 -21
package/CHANGELOG.md +15 -1
package/dist/types.js +2 -1
package/package.json +3 -1
package/python/test_scenario_lint.py +75 -0
package/scripts/gen-schema.ts +23 -1

package/.claude/skills/cowork-harness/scripts/assertion-keys.json ADDED

@@ -0,0 +1,30 @@
+{
+  "$comment": "GENERATED from the Zod Assertion schema (src/types.ts) by scripts/gen-schema.ts — do not edit; run `npm run schema`.",
+  "keys": [
+    "allow_permissive_auto_allow",
+    "artifact_json",
+    "dispatch_count_max",
+    "egress_allowed",
+    "egress_denied",
+    "file_exists",
+    "gate_answers_delivered",
+    "no_delete_in_outputs",
+    "question_asked",
+    "questions_count_max",
+    "replay_protocol_fidelity",
+    "result",
+    "self_heal_ran",
+    "subagent_declared_but_unused",
+    "subagent_dispatched",
+    "subagent_tool_absent",
+    "subagent_tool_used",
+    "tool_called",
+    "tool_not_called",
+    "transcript_contains",
+    "transcript_matches",
+    "transcript_no_host_path",
+    "transcript_not_contains",
+    "transcript_not_matches",
+    "user_visible_artifact"
+  ]
+}

package/.claude/skills/cowork-harness/scripts/scenario.py CHANGED

@@ -55,19 +55,55 @@ CONTENT_KEYS = {
 }
 # content keys, but only evaluated on replay when the cassette carries controlOut
 GATE_KEYS = {"question_asked", "questions_count_max", "gate_answers_delivered"}
-# live-only: silently skipped on replay (no filesystem, no network)
-FS_EGRESS_KEYS = {
-    "file_exists",
-    "user_visible_artifact",
+# manifest-backed: replay-checkable when the cassette carries an `artifacts` manifest (record snapshots one);
+# a manifest-less cassette skips them. Since the 0.3.0 artifact-manifest these are NOT always live-only.
+MANIFEST_KEYS = {"file_exists", "user_visible_artifact", "artifact_json"}
+# live-only: ALWAYS silently skipped on replay (no filesystem, no network on the token-free lane)
+LIVE_ONLY_KEYS = {
+    "egress_denied",
+    "egress_allowed",
     "no_delete_in_outputs",
     "self_heal_ran",
     "transcript_no_host_path",
-    "egress_denied",
-    "egress_allowed",
 }
 EGRESS_KEYS = {"egress_denied", "egress_allowed"}
-# every valid key inside an `assert:` list item
-ASSERT_KEYS = CONTENT_KEYS | GATE_KEYS | FS_EGRESS_KEYS | {"replay_protocol_fidelity"}
+# verdict modifiers — don't verify anything themselves (e.g. suppress a default-fail)
+VERDICT_MODIFIER_KEYS = {"allow_permissive_auto_allow"}
+# Every key the replay-class logic knows how to handle. `replay_protocol_fidelity` is valid-but-not-authorable
+# (errored separately below). This is also the embedded fallback for ASSERT_KEYS — kept EQUAL to the generated
+# list (test-enforced) so a missing assertion-keys.json can't silently reintroduce key drift.
+_CLASSIFIED_KEYS = CONTENT_KEYS | GATE_KEYS | MANIFEST_KEYS | LIVE_ONLY_KEYS | VERDICT_MODIFIER_KEYS | {"replay_protocol_fidelity"}
+def _load_assert_keys():
+    """The authoritative `assert:` key set, generated from the Zod Assertion schema into a sibling
+    `assertion-keys.json` (so the unknown-key check can't drift). Falls back to the embedded
+    `_CLASSIFIED_KEYS` (kept equal to the generated list) with a loud warning if the file is missing."""
+    p = Path(__file__).resolve().parent / "assertion-keys.json"
+    try:
+        return set(json.loads(p.read_text(encoding="utf-8"))["keys"])
+    except Exception:
+        print(
+            f"::warning:: assertion-keys.json not found next to scenario.py ({p}) — "
+            "using a built-in key list that may be stale (run `npm run schema`).",
+            file=sys.stderr,
+        )
+        return set(_CLASSIFIED_KEYS)
+# every valid key inside an `assert:` list item (generated from the zod schema; see _load_assert_keys)
+ASSERT_KEYS = _load_assert_keys()
+# Self-check: every valid assertion key must be classified, else the replay-class lint logic mishandles it.
+# Surfaced loudly at load AND as a lint ERROR in cmd_lint (so --strict / exit codes flow). Never sys.exit here.
+UNCLASSIFIED_KEYS = sorted(ASSERT_KEYS - _CLASSIFIED_KEYS)
+if UNCLASSIFIED_KEYS:
+    print(
+        f"::warning:: scenario.py: assertion key(s) {UNCLASSIFIED_KEYS} are in the schema but not classified "
+        "— add them to the linter's CONTENT/GATE/MANIFEST/LIVE_ONLY/VERDICT_MODIFIER sets.",
+        file=sys.stderr,
+    )
 # every valid top-level scenario key
 TOP_LEVEL_KEYS = {
     "name",
@@ -232,40 +268,59 @@ def lint_doc(doc, path, raw_lines):
             )
         )
-    # W: no content assertion → a replay PR gate verifies nothing
+    # W: nothing replay-checkable → a replay PR gate verifies nothing. Content/gate are replay-checkable, and
+    # manifest-backed keys are too WHEN the cassette carries an artifacts manifest — so only an all-live-only
+    # (egress / no_delete / self_heal / host-path) assert set genuinely no-ops on replay.
     if items:
-        content_present = bool(assert_keys & (CONTENT_KEYS | GATE_KEYS))
-        if not content_present:
+        replay_checkable = bool(assert_keys & (CONTENT_KEYS | GATE_KEYS | MANIFEST_KEYS))
+        if not replay_checkable:
             findings.append(
                 Finding(
                     "WARN",
                     "replay-noop",
-                    "every assertion is filesystem/egress — on the token-free `replay` lane they are "
-                    "ALL silently skipped, so a replay PR gate would verify nothing.",
-                    "Add a content assertion (result / transcript_* / tool_* / subagent_*) or run this "
+                    "every assertion is live-only (egress / no_delete_in_outputs / self_heal_ran / "
+                    "transcript_no_host_path) — on the token-free `replay` lane they are ALL silently "
+                    "skipped, so a replay PR gate would verify nothing.",
+                    "Add a content assertion (result / transcript_* / tool_* / subagent_*) or a "
+                    "manifest-backed one (file_exists / user_visible_artifact / artifact_json), or run this "
                     "scenario only on the live (run/record) lane.",
                     path,
                 )
             )
-    # W: mixed-class assert item → fs/egress half dropped on replay
+    # W: mixed-class assert item → the live-only half is dropped on replay (manifest-backed keys are NOT)
     for idx, item in enumerate(items):
         ks = set(item.keys())
-        content_half = ks & (CONTENT_KEYS | GATE_KEYS)
-        fs_half = ks & FS_EGRESS_KEYS
-        if content_half and fs_half:
+        kept_half = ks & (CONTENT_KEYS | GATE_KEYS | MANIFEST_KEYS)
+        live_half = ks & LIVE_ONLY_KEYS
+        if kept_half and live_half:
             findings.append(
                 Finding(
                     "WARN",
                     "mixed-assert-item",
-                    f"assert item #{idx} mixes content {sorted(content_half)} with "
-                    f"filesystem/egress {sorted(fs_half)} — on replay the filesystem/egress half is "
-                    "dropped (only the content half is evaluated).",
+                    f"assert item #{idx} mixes replay-checkable {sorted(kept_half)} with "
+                    f"live-only {sorted(live_half)} — on replay the live-only half is dropped "
+                    "(only the replay-checkable half is evaluated).",
                     "Split into separate list items: one per concern.",
                     path,
                 )
             )
+    # I: manifest-backed keys need an artifacts manifest on replay
+    manifest_present = sorted(assert_keys & MANIFEST_KEYS)
+    if manifest_present:
+        findings.append(
+            Finding(
+                "INFO",
+                "manifest-needs-snapshot",
+                f"assertion(s) {manifest_present} evaluate on replay only when the cassette carries an "
+                "`artifacts` manifest (`record` snapshots one). A manifest-less cassette skips them "
+                "(with a loud warning).",
+                "Record with a current harness so the cassette carries the artifacts manifest.",
+                path,
+            )
+        )
     # I: gate keys need a controlOut cassette on replay
     gate_present = sorted(assert_keys & GATE_KEYS)
     if gate_present:
@@ -361,6 +416,19 @@ def _print_findings(findings, n_files):
 def cmd_lint(args):
     all_findings = []
+    # Linter self-check (B3): a valid schema key the replay-class sets don't classify can't be linted
+    # correctly — surface it as a hard ERROR so it fails the gate (and --strict) until someone classifies it.
+    if UNCLASSIFIED_KEYS:
+        all_findings.append(
+            Finding(
+                "ERROR",
+                "linter-unclassified-key",
+                f"linter is out of date: assertion key(s) {UNCLASSIFIED_KEYS} are valid (in the schema) but "
+                "scenario.py doesn't classify their replay behavior, so they can't be linted.",
+                "Add them to the linter's CONTENT/GATE/MANIFEST/LIVE_ONLY/VERDICT_MODIFIER sets.",
+                "(scenario.py)",
+            )
+        )
     for f in args.files:
         all_findings.extend(lint_file(f))
     if args.json:

package/CHANGELOG.md CHANGED

@@ -6,7 +6,21 @@ All notable changes to this project are documented here. The format is based on
 ## [Unreleased]
-## [0.4.2] — 2026-06-18
+## [0.4.3] — 2026-06-18
+### Fixed
+- **`cowork-harness lint` no longer flags `artifact_json` / `allow_permissive_auto_allow` as unknown keys.**
+  The linter's assertion-key list is now **generated from the Zod `Assertion` schema** (the same source
+  `assert --list` uses) into a file shipped next to `scenario.py`, with a CI drift-guard — so it can't lag the
+  schema again. Its replay-class warnings were also reconciled with the 0.3.0 artifact-manifest: `file_exists`,
+  `user_visible_artifact`, and `artifact_json` are now treated as **manifest-backed** (replay-checkable when the
+  cassette carries an `artifacts` manifest) rather than always-skipped, so a scenario asserting only those is no
+  longer a false `replay-noop`. A self-check fails the linter if a future schema key isn't classified.
+### Internal
+- The npm tarball no longer ships `docs/internal/` (internal planning docs were being published).
 ### Added

package/dist/types.js CHANGED

@@ -142,7 +142,8 @@ export const Assertion = z.object({
         .describe("(replay-only, NOT authorable) serializeDecision output matched the frozen recording — the token-free O7 guard; synthesized by the replay lane and rejected if written in a scenario"),
     // #5: assert over the CONTENTS of a JSON artifact via a dotted path. `absent` and `is_null` are DISTINCT
     // (key-missing vs present-null); an unresolved INTERMEDIATE segment fails loud (malformed artifact),
-    // never a vacuous pass. Live-only until the cassette artifact-manifest (#1) lands — stripped on replay.
+    // never a vacuous pass. Manifest-backed: evaluated on replay when the cassette carries an `artifacts`
+    // manifest (`record` snapshots one); a manifest-less cassette skips it (with a loud warning).
     artifact_json: z
         .object({
         artifact: z.string().describe("relative path to a JSON artifact under the work root (e.g. outputs/cap_state.json)"),

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "cowork-harness",
-  "version": "0.4.2",
+  "version": "0.4.3",
   "description": "Scriptable, CI-friendly harness for Claude Cowork's runtime contract for testing skills across scenarios — same agent, mounts, egress allowlist, permission protocol, and sandbox limitations.",
   "license": "MIT",
   "type": "module",
@@ -34,9 +34,11 @@
     "docker",
     "schema",
     "docs",
+    "!docs/internal",
     "python",
     "scripts",
     ".claude/skills/cowork-harness/scripts/scenario.py",
+    ".claude/skills/cowork-harness/scripts/assertion-keys.json",
     ".env.example",
     "README.md",
     "CHANGELOG.md",

package/python/test_scenario_lint.py ADDED

@@ -0,0 +1,75 @@
+"""Tests for the bundled linter (scenario.py): its assertion-key list is generated from the Zod schema
+(no drift), and its replay-class warnings account for manifest-backed assertions.
+Run via the repo's pytest lane: `pytest -m 'not cowork'` from python/.
+"""
+import importlib.util
+import json
+from pathlib import Path
+REPO = Path(__file__).resolve().parents[1]
+SCENARIO_PY = REPO / ".claude/skills/cowork-harness/scripts/scenario.py"
+KEYS_JSON = REPO / ".claude/skills/cowork-harness/scripts/assertion-keys.json"
+def _load_scenario_module():
+    spec = importlib.util.spec_from_file_location("scenario_lint_under_test", SCENARIO_PY)
+    mod = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(mod)
+    return mod
+scenario = _load_scenario_module()
+def _rules(yaml_body, tmp_path):
+    f = tmp_path / "sc.yaml"
+    f.write_text(
+        "name: t\nbaseline: latest\nsession: (inline)\nfidelity: container\nprompt: hi\n" + yaml_body,
+        encoding="utf-8",
+    )
+    return {fnd.rule for fnd in scenario.lint_file(str(f))}
+def test_assert_keys_loaded_from_generated_file():
+    generated = set(json.loads(KEYS_JSON.read_text(encoding="utf-8"))["keys"])
+    assert scenario.ASSERT_KEYS == generated
+    # the two keys that used to drift are present
+    assert {"artifact_json", "allow_permissive_auto_allow"} <= scenario.ASSERT_KEYS
+def test_embedded_fallback_equals_generated_list():
+    # the in-code fallback must equal the generated list, else a missing file silently reintroduces drift
+    generated = set(json.loads(KEYS_JSON.read_text(encoding="utf-8"))["keys"])
+    assert scenario._CLASSIFIED_KEYS == generated
+def test_every_key_is_classified_self_check():
+    assert scenario.UNCLASSIFIED_KEYS == []
+def test_artifact_json_is_not_unknown(tmp_path):
+    rules = _rules("assert:\n  - artifact_json: {artifact: outputs/x.json, path: a, equals: 1}\n", tmp_path)
+    assert "unknown-assert-key" not in rules
+    assert "manifest-needs-snapshot" in rules  # it IS manifest-backed on replay
+def test_allow_permissive_auto_allow_is_not_unknown(tmp_path):
+    rules = _rules("assert:\n  - allow_permissive_auto_allow: true\n", tmp_path)
+    assert "unknown-assert-key" not in rules
+def test_file_exists_only_is_not_replay_noop(tmp_path):
+    rules = _rules("assert:\n  - file_exists: outputs/x.md\n", tmp_path)
+    assert "replay-noop" not in rules  # manifest-backed → replay-checkable with a manifest
+    assert "manifest-needs-snapshot" in rules
+def test_egress_only_is_replay_noop(tmp_path):
+    rules = _rules("assert:\n  - egress_denied: evil.com\n", tmp_path)
+    assert "replay-noop" in rules  # truly live-only → skipped on replay
+def test_invented_key_still_flagged(tmp_path):
+    rules = _rules("assert:\n  - file_not_empty: outputs/x\n", tmp_path)
+    assert "unknown-assert-key" in rules

package/scripts/gen-schema.ts CHANGED

@@ -9,11 +9,31 @@ import { mkdirSync, writeFileSync } from "node:fs";
 import { dirname, join } from "node:path";
 import { fileURLToPath, pathToFileURL } from "node:url";
 import { zodToJsonSchema } from "zod-to-json-schema";
-import { ScenarioObject } from "../src/types.js";
+import { ScenarioObject, Assertion } from "../src/types.js";
 import { SessionConfig } from "../src/session.js";
 const REPO_ROOT = join(dirname(fileURLToPath(import.meta.url)), "..");
 export const SCHEMA_DIR = join(REPO_ROOT, "schema");
+/** The bundled linter (`scenario.py`) reads this for its assertion-key list. It lives NEXT TO scenario.py
+ *  (not under schema/) because schema/ is not shipped inside the plugin tree — only the skill's scripts dir
+ *  is. Writer + the drift-guard test both reference this one constant. */
+export const ASSERTION_KEYS_PATH = join(REPO_ROOT, ".claude/skills/cowork-harness/scripts/assertion-keys.json");
+/** The authoritative assertion-key list, derived from the Zod `Assertion` schema (the same source
+ *  `assert --list` reads). Generating it keeps `scenario.py`'s unknown-key check from drifting. */
+export function buildAssertionKeys(): string {
+  return (
+    JSON.stringify(
+      {
+        $comment:
+          "GENERATED from the Zod Assertion schema (src/types.ts) by scripts/gen-schema.ts — do not edit; run `npm run schema`.",
+        keys: Object.keys(Assertion.shape).sort(),
+      },
+      null,
+      2,
+    ) + "\n"
+  );
+}
 const TARGETS = [
   {
@@ -54,6 +74,8 @@ function main(): void {
     writeFileSync(join(SCHEMA_DIR, file), body);
     process.stdout.write(`wrote schema/${file}\n`);
   }
+  writeFileSync(ASSERTION_KEYS_PATH, buildAssertionKeys());
+  process.stdout.write(`wrote ${ASSERTION_KEYS_PATH}\n`);
 }
 // Run only when invoked directly (so the test can import buildSchemas without side effects).