@lannguyensi/harness 0.17.4 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. package/CHANGELOG.md +65 -0
  2. package/README.md +86 -201
  3. package/dist/cli/approve/understanding.js +51 -35
  4. package/dist/cli/approve/understanding.js.map +1 -1
  5. package/dist/cli/doctor/format.js +20 -2
  6. package/dist/cli/doctor/format.js.map +1 -1
  7. package/dist/cli/doctor/index.d.ts +8 -0
  8. package/dist/cli/doctor/index.js +27 -1
  9. package/dist/cli/doctor/index.js.map +1 -1
  10. package/dist/cli/doctor/npm-bin-path.d.ts +23 -0
  11. package/dist/cli/doctor/npm-bin-path.js +82 -0
  12. package/dist/cli/doctor/npm-bin-path.js.map +1 -0
  13. package/dist/cli/doctor/types.d.ts +20 -4
  14. package/dist/cli/doctor/types.js.map +1 -1
  15. package/dist/cli/index.js +19 -2
  16. package/dist/cli/index.js.map +1 -1
  17. package/dist/cli/init/agent-tasks-auth.d.ts +32 -0
  18. package/dist/cli/init/agent-tasks-auth.js +75 -0
  19. package/dist/cli/init/agent-tasks-auth.js.map +1 -0
  20. package/dist/cli/init/composer.js +11 -0
  21. package/dist/cli/init/composer.js.map +1 -1
  22. package/dist/cli/init/dependencies.js +7 -3
  23. package/dist/cli/init/dependencies.js.map +1 -1
  24. package/dist/cli/init/interactive.d.ts +5 -0
  25. package/dist/cli/init/interactive.js +162 -4
  26. package/dist/cli/init/interactive.js.map +1 -1
  27. package/dist/cli/init/profiles.d.ts +2 -2
  28. package/dist/cli/init/profiles.js +30 -0
  29. package/dist/cli/init/profiles.js.map +1 -1
  30. package/dist/cli/init/templates.d.ts +1 -1
  31. package/dist/cli/init/templates.js +37 -1
  32. package/dist/cli/init/templates.js.map +1 -1
  33. package/dist/cli/pack/hook-post-tool-use.d.ts +19 -0
  34. package/dist/cli/pack/hook-post-tool-use.js +168 -0
  35. package/dist/cli/pack/hook-post-tool-use.js.map +1 -0
  36. package/dist/cli/pack/hook-pre-tool-use.js +5 -2
  37. package/dist/cli/pack/hook-pre-tool-use.js.map +1 -1
  38. package/dist/cli/session-start/index.js +8 -1
  39. package/dist/cli/session-start/index.js.map +1 -1
  40. package/dist/policy-packs/builtin/understanding-before-execution-runtime.d.ts +47 -1
  41. package/dist/policy-packs/builtin/understanding-before-execution-runtime.js +98 -1
  42. package/dist/policy-packs/builtin/understanding-before-execution-runtime.js.map +1 -1
  43. package/dist/policy-packs/builtin/understanding-before-execution.js +87 -2
  44. package/dist/policy-packs/builtin/understanding-before-execution.js.map +1 -1
  45. package/package.json +1 -1
package/CHANGELOG.md CHANGED
@@ -7,6 +7,71 @@ and this project adheres to [Semantic Versioning](https://semver.org/).
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.19.0] - 2026-05-17
11
+
12
+ **Headline: setup UX gap closed for non-agent-tasks operators.** Through v0.18.x several pieces of the harness experience silently degraded for operators picking Solo (or Team without an agent-tasks account): the new per-task understanding-gate marker expiry never fired because the configured boundary list was agent-tasks MCP names, `harness init --interactive` left the bridge wired but unauthenticated, `harness doctor` flagged the deliberately operator-driven `dogfood-before-release` policy as a missing-producer false positive, and an nvm-drift class of bug went undiagnosed. This release closes those four gaps: profile-aware reset defaults plus a new `expire_on_bash_match` regex list for gh-CLI workflows, a post-install auth probe with login / skip / abort dialog, doctor respect for the policy's own `producers:` array, and a doctor warning that catches when `npm prefix -g`'s bin dir is not on PATH. Doc cleanup made the external-account assumptions of each profile explicit up-front.
13
+
14
+ ### Added
15
+
16
+ - **understanding-gate: `approval_lifecycle.expire_on_bash_match`** (harness/f54e0ecb). New optional schema field on the `understanding-before-execution` pack config: a string array of regex patterns matched against the `Bash` tool's `tool_input.command`. When a Bash command matches, the per-session approval marker is deleted on PostToolUse, the same semantics the existing `expire_on_tool_match` has for MCP tool names. Enables gh-CLI / pure-Bash workflows to declare task boundaries (e.g. `^gh pr (merge|close)\b`, `^git push origin (master|main)\b`) so the gate's per-task re-prompt works for them too. Profile defaults updated: Solo drops the agent-tasks tool list (dead weight there) and ships only the Bash list with `max_age: 1h`; Team and Full keep the tool list and add the Bash list for hybrid coverage. Patterns are pre-compiled at parse time, and invalid regexes are dropped with stderr warnings. Round-trip regression tests in `tests/cli/init-full-template-pins.test.ts` parse each template through `yaml.parse + new RegExp + .test()` to pin the escape-pipeline correctness, since the unit-level tests bypass that surface.
17
+
18
+ - **`harness doctor`: warn when `npm prefix -g`'s bin dir is not on PATH** (harness/4ddd78ed). Surfaces the nvm-drift footgun where `harness init --interactive` runs `npm i -g` against the active Node's prefix but the operator's shell PATH points at a different one, so installed binaries are silently invisible to subsequent doctor probes. Doctor now resolves the bin dir via `npm prefix -g` (the modern replacement for the removed `npm bin -g`) and renders an `Environment` section with the actionable PATH-patch suggestion when the bin dir is not in `process.env.PATH`. The section stays absent on ok and on the unknown branch (npm missing); skipped under `--shallow` so the 100ms timing budget stays intact.
19
+
20
+ - **`harness init --interactive`: post-install auth probe for the agent-tasks bridge** (harness/3f775180). After a successful `npm i -g @agent-tasks/mcp-bridge`, the wizard runs `agent-tasks-mcp-bridge status` to detect whether a token is configured. Three branches:
21
+ - **ok**: prints `✓ agent-tasks token validated against the backend.` and continues.
22
+ - **token present but validation fails** (backend unreachable, expired token, wrong base URL): prints an informational warning naming the bridge's reason and continues. The wizard does not block on this because the recovery is not actionable from inside it.
23
+ - **no token stored**: opens a three-option dialog: (a) run `agent-tasks-mcp-bridge login` interactively now via stdio pass-through, (b) skip with a reminder, (c) abort the wizard with a pointer to the signup URL and the re-run command. After a successful login the wizard re-probes to confirm.
24
+
25
+ Closes the silent footgun where a fresh operator could finish the wizard with `harness doctor` reporting all-green but every `mcp__agent-tasks__*` call returning an auth error.
26
+
27
+ - **FULL_TEMPLATE `git-preflight` hook pin: `min_version: "0.1.1"` + `version_command: ["preflight", "--version"]`** (agent-preflight/cb5a1770). Same pattern as the existing pins for `agent-tasks-mcp-bridge`, `grounding-mcp`, `memory-router-user-prompt-submit`. Floor at agent-preflight 0.1.1, the release that distinguishes "tool not installed" (e.g. an npm script invoking eslint that is not in devDependencies) from real lint/test/typecheck failures. Stale 0.1.0 installs silently emit false-positive blockers that keep the `preflight-before-*` policies closed forever; with the floor wired, `harness doctor` now warns operators to upgrade.
28
+
29
+ ### Changed
30
+
31
+ - **`harness doctor`: producer-gap warning now respects the policy's own `producers:` array** (harness/f97e152f). A `block` policy with a `within:` window used to be flagged with `⚠ ... no manifest hook produces it` whenever no automatic SessionStart hook wrote the required tag, even when the policy itself declared a `producers:` entry pointing the agent at the manual recovery (`mcp__agent-grounding__ledger_add`). For `dogfood-before-release` in the Full template that was a false positive: the gate is deliberately operator-driven (an automatic SessionStart producer would defeat its purpose), and the `producers:` array IS the schema-blessed manual recovery path the agent sees in the deny envelope. Doctor now treats a non-empty `producers:` array as a documented producer and suppresses the warning. The warning still fires when both kinds are absent. Visible effect on the Full template: one fewer false-positive warning (dogfood-before-release flips from `⚠` to `✓`); the two preflight policies were already satisfied by the `git-preflight` SessionStart hook and stay green.
32
+
33
+ - **Profile dependency clarity in README + wizard** (harness/75de11c4). README, `docs/init-interactive.md`, `docs/for-humans.md`, the wizard's profile-choice descriptions, and the Team-profile confirm prompt now state the external-account assumptions of each profile up-front: Solo is standalone, Team requires an agent-tasks account, Full additionally requires `@lannguyensi/agent-preflight` and `gh` on PATH. The wizard also prints a post-init reminder for Team/Full operators naming `agent-tasks-mcp-bridge login` as the auth recovery path and `--template solo` as the fallback for non-agent-tasks workflows.
34
+
35
+ ## [0.18.0] - 2026-05-17
36
+
37
+ **Headline: per-task understanding-gate marker expiry.** Through v0.17.x the approval marker had no lifetime: one `harness approve understanding` covered every subsequent Edit / Write / Bash for the whole session. That contract was correct when the gate was about "agent starts a session, picks ONE interpretation, runs", but no longer matches multi-task sessions, where a stale interpretation can silently drive the next task's edits. Live failure mode from the v0.17.4 dogfood: three sequential tasks in one session, marker stayed valid across all three, the third task started implementing the wrong fix surface before the operator caught the misdiagnosis. v0.18 expires the marker on configurable task-boundary tools and (optionally) on a TTL safety net, so a fresh task gets a fresh Understanding Report. Backing task: agent-tasks/d8ee60ca.
38
+
39
+ **Operator action required (sort of):** the new behaviour is default-on for every install via `harness init --template solo / team / full` and via `init --interactive` Custom. Existing manifests that already use the pack will see the stricter behaviour on the next `harness apply` if they re-render from the template. Operators who prefer the legacy "one approval per session, no expiry" contract opt out by setting `policy_packs[].config.approval_lifecycle: { mode: "session" }`. Manifests that copy the pack config verbatim from the README / docs and pin it inline keep working unchanged until they explicitly add the new block.
40
+
41
+ ### Added
42
+
43
+ - **`config.approval_lifecycle` on the understanding-before-execution pack** (agent-tasks/d8ee60ca). New schema-shape under the pack's `config:`:
44
+
45
+ ```yaml
46
+ policy_packs:
47
+ - name: understanding-before-execution
48
+ config:
49
+ approval_lifecycle:
50
+ expire_on_tool_match:
51
+ - mcp__agent-tasks__task_finish
52
+ - mcp__agent-tasks__task_abandon
53
+ - mcp__agent-tasks__pull_requests_merge
54
+ max_age: 4h
55
+ ```
56
+
57
+ `expire_on_tool_match` is a list of tool name patterns whose successful PostToolUse fires marker expiry. `max_age` is a duration (`24h` / `30m` / `PT1H` / ...) that the PreToolUse blocker enforces against the marker's `approvedAt` field. Both are optional. `{ mode: "session" }` opts out of both and restores the legacy behaviour. Coupling note: the default tool list names `mcp__agent-tasks__*` verbs because that is what every wizard-defaulted install uses, but the field is purely string-based, so operators on Linear / JIRA / GitHub Projects override with their own task-system verbs.
58
+
59
+ - **PostToolUse marker-expiry hook** (`harness pack hook post-tool-use`, new subcommand). Reads the PostToolUse event JSON from stdin and, when the just-completed tool matches the pack's `expire_on_tool_match` list, deletes the per-session approval marker. Fails closed-to-noop: any error path is logged and the hook exits 0, so a bug in this code never escalates into a session-wide tool block. Worst case the marker persists past the intended boundary, which degrades to the legacy per-session contract.
60
+
61
+ - **`checkApprovalMarker` honours `opts.maxAgeMs`** (extended). When set, a marker whose `approvedAt` is older than `now - maxAgeMs` is treated as expired and returns `matched:false` with an "expired" detail, so the agent sees the same "no approval" UX as a never-approved session and must re-approve. A marker with no readable `approvedAt` (body corrupted, missing field) skips the freshness check, so the existence-only DoS-resistance contract from v0.13.0 still wins.
62
+
63
+ ### Changed
64
+
65
+ - **`init --template solo / team / full` + Custom-composer all ship `approval_lifecycle` defaults.** Re-running `harness init --force` on an existing install picks them up; an existing operator-edited manifest keeps the legacy behaviour unchanged until the operator manually adds the block or re-renders from a template.
66
+
67
+ - **`policy_packs[].config.approval_lifecycle` flows into the pack-expand surface.** `expandPolicyPacks` now contributes 4 Claude hooks instead of 3 (UserPromptSubmit + Stop + PreToolUse + the new PostToolUse). Operators who pinned the v0.17 3-hook shape in custom infrastructure should expect the new hook in their generated `settings.json` after the next `harness apply`.
68
+
69
+ ### Verification
70
+
71
+ - `npm test`: 1361/1361 (was 1344, +17 new tests across `tests/cli/pack-hook-post-tool-use.test.ts`, `tests/policy-packs/marker-max-age.test.ts`, and additions to `tests/policy-packs/expand.test.ts`).
72
+ - `npm run typecheck`: clean.
73
+ - Golden fixture: `docs/examples/full-manifest.expected.yaml` updated for the new pack config block.
74
+
10
75
  ## [0.17.4] - 2026-05-17
11
76
 
12
77
  **Headline: `harness init --interactive` wire-now actually wires settings.json now.** Closes a silent-no-op bug surfaced during the v0.17.2 dogfood (operator picked Full, picked claude-code in wire-now, but branch-protection's hooks never reached `~/.claude/settings.json`). Root cause: wireRuntime called `apply({ target, merge: true })` without `overwriteDrift`. A pre-existing stale or missing `~/.claude/harness.generated/.last-apply` snapshot made the freshly-rendered `harness.generated/settings.json` look like full-file drift, so apply returned `outcome: "drift-refuse"` without throwing. wireRuntime only checked `targetWritten` and printed nothing when it was false — leaving the operator with a "restart hint" line that implied success while settings.json was never updated. Fix: init's wire-now passes `overwriteDrift: true` with an auto-confirm prompt. Drift safeguards remain in place for ad-hoc `harness apply`; init's canonical "start from scratch" intent now always lands. Backing task: agent-tasks/df68b3e6.
package/README.md CHANGED
@@ -11,18 +11,13 @@ applies, audits, and *enforces*.
11
11
  > exact context, and why.
12
12
 
13
13
  A coding agent like Claude Code is configured across half a dozen
14
- files: `settings.json`, `CLAUDE.md`, memory notes, MCP registrations,
15
- hook scripts, per-project overrides. No single file answers *"what can
16
- this agent do right now, and why is it set up that way?"*, so
17
- configuration drifts between sessions, rules you wrote down in one
18
- place quietly stop firing, and a broken tool is discovered only by
19
- tripping over it.
20
-
21
- `harness` puts all of that in one YAML file you can read, validate,
22
- and diff. From that file it generates the config the agent actually
23
- loads, and at runtime it enforces the rules you declared: it blocks a
24
- tool call that violates one, and records every decision so you can
25
- see what fired and why.
14
+ files (`settings.json`, `CLAUDE.md`, memory notes, MCP registrations,
15
+ hook scripts, per-project overrides), and no single file answers
16
+ *"what can this agent do right now, and why is it set up that way?"*.
17
+ `harness` puts all of it in one YAML you read, validate, and diff;
18
+ generates the config the agent loads from it; and at runtime blocks
19
+ tool calls that violate the declared rules while recording every
20
+ decision.
26
21
 
27
22
  ## See it work
28
23
 
@@ -31,13 +26,20 @@ until it has logged a review.*
31
26
 
32
27
  Claude Code goes to merge PR 42. Before the tool call runs, the
33
28
  runtime hands the event to `harness`, which checks it against the
34
- manifest:
29
+ manifest. The hook protocol wire shape is the legacy engine-vocabulary
30
+ envelope (operators see this on stderr; agents read it via
31
+ `permissionDecisionReason` when the policy declares no `ux:` block):
35
32
 
36
33
  ```console
37
34
  $ harness policy intercept # Claude Code runs this before each tool call
38
35
  {"decision":"block","reason":"review-before-merge: no matching ledger entry for tag `review:42`","hookSpecificOutput":{"hookEventName":"PreToolUse","permissionDecision":"deny","permissionDecisionReason":"review-before-merge: no matching ledger entry for tag `review:42`"}}
39
36
  ```
40
37
 
38
+ Built-in block-enforcement policies ship a `ux:` block since v0.17.0,
39
+ so the agent sees a plain-language three-section form
40
+ ([`docs/for-agents.md`](docs/for-agents.md#agent-facing-block-messages-ux-block));
41
+ the engine-vocabulary text above stays in the audit ledger.
42
+
41
43
  Blocked. `harness explain` says exactly why:
42
44
 
43
45
  ```console
@@ -78,52 +80,6 @@ timestamp policy outcome reason
78
80
  Declare the rule once; every session is held to it, with a paper
79
81
  trail of every decision.
80
82
 
81
- ## What the agent sees vs what the engine records
82
-
83
- A policy has two readers: the audit ledger (which wants every internal
84
- detail) and the agent (which only needs to know what is blocked, what
85
- condition is missing, and which command satisfies it). Declaring a
86
- policy's `ux:` block splits those readers cleanly.
87
-
88
- Engine-internal model (unchanged): session IDs, ledger entries,
89
- attestations, provenance chains, policy DAGs. All of it still feeds
90
- `audit`, `explain --trace`, and the evidence-ledger writes that
91
- `session-export` replays.
92
-
93
- Agent-facing model (new, opt-in per policy): `cannot` (what is
94
- blocked), `required` (the missing precondition, in plain words), and
95
- `run` (the exact command to satisfy it). When `ux:` is declared, the
96
- agent sees only this shape, with `${VAR}` references substituted
97
- against the same context the `ledger_tag` resolved against.
98
-
99
- ```yaml
100
- policies:
101
- - name: preflight-before-investigation
102
- requires: { ledger_tag: "preflight:${REPO}", within: "1h" }
103
- enforcement: block
104
- ux:
105
- cannot: "You cannot investigate this repository yet."
106
- required: ["verified repository preflight"]
107
- run: ["harness preflight"]
108
- ```
109
-
110
- On block, the agent sees:
111
-
112
- ```
113
- You cannot investigate this repository yet.
114
-
115
- Required:
116
- - verified repository preflight
117
-
118
- Run:
119
- harness preflight
120
- ```
121
-
122
- Not `no matching ledger entry for tag preflight:harness`. The
123
- internal failure (tag, hint, matched count) is still written to the
124
- ledger for `audit` and `explain --trace`. Policies without `ux:` keep
125
- the legacy deny envelope unchanged.
126
-
127
83
  ## Concepts in six lines
128
84
 
129
85
  | Term | What it is |
@@ -152,25 +108,16 @@ flowchart LR
152
108
  observe -. refine .-> declare
153
109
  ```
154
110
 
155
- One manifest declares grounding, tools, memory, hooks, policies, and
156
- workflows. `apply` materialises that into the files Claude Code
157
- actually reads. At runtime, hooks and policies enforce the contract
158
- and write decision rows to the evidence ledger. The read-side
159
- surfaces (`audit`, `explain --trace`, `session-export`) replay those
160
- rows so you can see what fired, why, and across which session.
161
- Whatever you learn from observing flows back into the manifest. That
162
- loop is the whole product.
111
+ Observe → refine → declare is the whole loop. The read-side surfaces
112
+ (`audit`, `explain --trace`, `session-export`) replay rows the runtime
113
+ already recorded, so what flows back into the manifest is grounded in
114
+ what actually happened.
163
115
 
164
116
  ## Pick your audience
165
117
 
166
- - **Operator?** Read [`docs/for-humans.md`](docs/for-humans.md). It
167
- walks from `npm i -g @lannguyensi/harness` through your first
168
- `apply`, your first real policy, and the diagnostics cheat sheet.
169
- - **Agent (or onboarding one)?** Read
170
- [`docs/for-agents.md`](docs/for-agents.md). It defines the
171
- workflow lifecycle, the policy / ledger sequence, the CLI cheat
172
- sheet split by side-effect class, and the audit triumvirate
173
- (`audit` vs `explain --trace` vs `session-export`).
118
+ - **Operator?** [`docs/for-humans.md`](docs/for-humans.md): install through first `apply`, first real policy, diagnostics cheat sheet.
119
+ - **Agent (or onboarding one)?** [`docs/for-agents.md`](docs/for-agents.md): workflow lifecycle, policy / ledger sequence, CLI cheat sheet by side-effect class, the audit triumvirate.
120
+ - **Writing your own policy?** [`docs/writing-custom-policies.md`](docs/writing-custom-policies.md): three tripwires, four worked recipes (each validated in CI), author loop, field reference.
174
121
 
175
122
  ## Install
176
123
 
@@ -189,11 +136,21 @@ command path, install to wired-in, no prose.
189
136
  harness init --interactive
190
137
  ```
191
138
 
192
- Guided wizard that detects your environment (existing `~/.claude/` and
193
- `~/.codex/`, MCP servers already wired in `settings.json`, harness
194
- binary version), picks a profile (`solo` / `team` / `custom`), and
195
- writes a starting `harness.yaml`. Ctrl-C at any prompt aborts with no
196
- partial write. Walkthrough + limitations: `docs/init-interactive.md`.
139
+ Guided wizard. Detects `~/.claude/` and `~/.codex/`, MCP servers
140
+ already wired in `settings.json`, harness binary version. Picks a
141
+ profile (`solo` / `team` / `custom`) and writes a starting
142
+ `harness.yaml`. Ctrl-C aborts cleanly. Walkthrough +
143
+ limitations: [`docs/init-interactive.md`](docs/init-interactive.md).
144
+
145
+ ### Profiles at a glance
146
+
147
+ | Profile | External accounts / tools required | Best for |
148
+ |---------|------------------------------------|----------|
149
+ | `solo` | None. `npm` + Claude Code is enough. | Single operators who want the Understanding Gate without committing to a tasking system. |
150
+ | `team` | An **agent-tasks** account ([hosted](https://agent-tasks.opentriologue.ai) or [self-hosted](https://github.com/LanNguyenSi/agent-tasks)). | Teams that already use `agent-tasks` for PR review tracking. The merge gate (`review:<pr-number>` ledger tag) wires against the agent-tasks MCP. |
151
+ | `full` | Same as `team` plus `@lannguyensi/agent-preflight` and `gh` on PATH. | Operators who want every reference policy enforced (dogfood gate, preflight gates, review-subagent gate, merge gate). |
152
+
153
+ **Not using agent-tasks?** Pick `solo`. The `team` and `full` review gates currently match only the agent-tasks MCP tool names, so a `gh pr create` workflow stays unprotected by them today. Tool-agnostic gates that also match `gh pr` are tracked in the backlog.
197
154
 
198
155
  If you prefer non-interactive (CI, fresh-VM provisioning), pick a
199
156
  template directly:
@@ -204,17 +161,14 @@ harness init --template team # solo + agent-tasks MCP + review-before-merge po
204
161
  harness init --template full # everything from the Appendix A reference manifest
205
162
  ```
206
163
 
207
- Debug what the harness sees in your env without writing anything:
164
+ Use `harness init --probe` for a JSON snapshot of detected runtimes
165
+ and MCPs without writing anything.
208
166
 
209
- ```bash
210
- harness init --probe # JSON snapshot of detected runtimes + MCPs + manifest
211
- ```
212
-
213
- ## Try it yourself
167
+ ## Try it without installing
214
168
 
215
- The demo above shows the runtime path. To see policy matching without
216
- installing anything or touching the ledger, run `dry-run` against the
217
- reference manifest:
169
+ `harness dry-run` reports which hooks fire and which policies match
170
+ for a given tool call, against the reference manifest, before any
171
+ ledger I/O:
218
172
 
219
173
  ```bash
220
174
  git clone https://github.com/LanNguyenSi/harness && cd harness
@@ -225,44 +179,23 @@ node dist/cli/main.js dry-run "merge PR 42" \
225
179
  --config docs/examples/full-manifest.yaml
226
180
  ```
227
181
 
228
- `dry-run` reads the reference manifest, runs the trigger matcher,
229
- substitutes `${PR_NUMBER}=42` through the JSONPath-restricted extract
230
- DSL, and tells you exactly which hooks would fire and which policies
231
- would match, before any ledger I/O.
232
-
233
- The reference manifest is a schema-coverage example, not a runnable
234
- config. `harness validate --config docs/examples/full-manifest.yaml`
235
- will report errors for install-specific hook script paths it
236
- references (and warnings for binaries like `git-batch` that only exist
237
- in a real install). That is expected; the file header spells out the
238
- contract. Use `harness init --template full` to get a manifest
239
- tailored to your machine.
240
-
241
- Convinced? Install globally and set up your own:
242
- `npm i -g @lannguyensi/harness && harness init --interactive`.
182
+ `docs/examples/full-manifest.yaml` is a schema-coverage example, not a
183
+ runnable config (the file header spells out the contract). For a
184
+ manifest tailored to your machine, install globally and run
185
+ `harness init --interactive`.
243
186
 
244
187
  ## Uninstall
245
188
 
246
189
  `harness uninstall` is the single-command teardown: dry-run by default,
247
- `--apply` to mutate. It inventories what harness planted under
248
- `~/.claude/` (manifest, lock, `harness.generated/`, harness-owned hook
249
- groups and `mcpServers` entries in `settings.json`, any leftover
250
- `settings.json.pre-harness-<TS>` backups), then removes them after
251
- writing a reversible backup + JSON snapshot next to `settings.json`.
252
-
253
- ```bash
254
- harness uninstall # list, exit 0
255
- harness uninstall --apply # tear down
256
- harness uninstall --restore-from <pre-harness-backup> # atomic restore
257
- npm uninstall -g @lannguyensi/harness # drop the CLI itself
258
- ```
190
+ `--apply` to mutate, `--restore-from <backup>` to roll back. Full
191
+ inventory + recommended order in [`docs/uninstall.md`](docs/uninstall.md).
259
192
 
260
193
  ## Status
261
194
 
262
195
  harness ships in phases. Phases 1 through 6 are released: read-only
263
196
  inventory → managed edits → declarative truth → policy layer → polish
264
197
  and dogfood lessons → the Understanding Gate Policy Pack. Phase 7, the
265
- Risk Gate, is next. The current release is `v0.16.0`.
198
+ Risk Gate, is next. The current release is `v0.19.0`.
266
199
 
267
200
  The phase-by-phase plan with acceptance criteria lives in
268
201
  [`docs/ROADMAP.md`](docs/ROADMAP.md); what shipped in each version is
@@ -270,109 +203,61 @@ in [`CHANGELOG.md`](CHANGELOG.md).
270
203
 
271
204
  ## Policy Packs
272
205
 
273
- A *Policy Pack* is a reusable bundle of instruction template, hooks,
274
- policies, and permission profiles that ships under one name and is
275
- referenced from `harness.yaml` with a single key. The first pack,
276
- `understanding-before-execution` (shipped in `v0.9.0`), forces agents
277
- to expose and confirm their task interpretation before any
278
- write-capable tool fires.
206
+ A *Policy Pack* is a reusable bundle of hooks, policies, instruction
207
+ template, and permission profiles shipped under one name and enabled
208
+ from `harness.yaml` with a single key:
279
209
 
280
210
  ```yaml
281
211
  policy_packs:
282
212
  - name: understanding-before-execution
283
213
  config:
284
- mode: grill_me # fast_confirm | grill_me | strict
285
- permission_profile: safe-start # safe-start | implementation-after-approval | high-risk-grill-me
286
- ```
287
-
288
- Manage packs with `harness pack add / remove / list`. Apply against
289
- either runtime:
290
-
291
- ```sh
292
- harness apply --runtime claude-code # default; writes harness.generated/settings.json
293
- harness apply --runtime codex # writes harness.generated/codex/config.toml
214
+ mode: grill_me # fast_confirm | grill_me | strict
215
+ permission_profile: safe-start # safe-start | implementation-after-approval | high-risk-grill-me
294
216
  ```
295
217
 
296
- Approve a session's Understanding Report via
297
- `harness approve understanding --session <id>` (round-trips both the
298
- evidence-ledger tag and the persisted JSON report). Verify the
299
- adapter wiring with `harness doctor --target codex` (`--json` for
300
- machine-readable). The full reference lives in
301
- [`docs/policy-packs/understanding-before-execution.md`](docs/policy-packs/understanding-before-execution.md);
302
- synthetic-stdin dogfood under
303
- [`dogfood/phase6-6/`](dogfood/phase6-6/run-smoke.sh) exercises the
304
- block / allow / capture / approve round-trip without a real Codex
305
- binary.
218
+ Manage packs with `harness pack add / remove / list`. Two packs ship
219
+ today: [`understanding-before-execution`](docs/policy-packs/understanding-before-execution.md)
220
+ (forces an Understanding Report before any write-capable tool fires)
221
+ and [`branch-protection`](docs/policy-packs/branch-protection.md)
222
+ (blocks source mutations on protected branches without an explicit
223
+ override). Custom packs from `path:`, `npm:`, or `git:` sources are
224
+ out of scope for v1 (see the pack docs for the future-vocabulary
225
+ contract).
306
226
 
307
227
  ## What's next
308
228
 
309
- **Phase 7, Risk Gate.** Today's policy model evaluates a rule per
310
- matching trigger and returns a binary block/allow. Phase 7 makes
311
- harness reason about *the action itself*: an Action Envelope (tool +
312
- raw input + session + runtime context) is enriched by a Context
313
- Resolver (production / staging / dev / unknown), classified by a Risk
314
- Classifier (severity + categories + reversibility), then matched
315
- against policies whose `when:` clauses can reference
316
- `risk.severity_at_least`, `environment.name`, and similar. The
317
- decision space extends to `allow / warn / require_approval / deny`.
318
- Motivating use case: prevent `DROP TABLE users`, `kubectl delete
319
- namespace prod`, `terraform destroy` against an unverified production
320
- target, even if the model would have happily run them.
321
-
322
- Phase 7 builds on Phase 4's `policy intercept` runtime backbone and
323
- Phase 6's Policy Pack distribution surface; neither is replaced.
229
+ **Phase 7, Risk Gate.** Today's policy model returns a binary
230
+ block/allow per matching trigger. Phase 7 lets harness reason about
231
+ the action itself (Action Envelope → Context Resolver → Risk
232
+ Classifier) and extends the decision space to `allow / warn /
233
+ require_approval / deny`. Motivating use case: block `DROP TABLE
234
+ users`, `kubectl delete namespace prod`, `terraform destroy` against
235
+ unverified production targets. Full plan in
236
+ [`docs/ROADMAP.md#phase-7--risk-gate`](docs/ROADMAP.md#phase-7--risk-gate).
324
237
 
325
238
  > Bring your favorite agent harness. Add governance.
326
239
 
327
240
  ## Why this exists
328
241
 
329
- A working agent harness today has six to eight configuration
330
- surfaces, each with its own schema and lifecycle: `~/.claude/settings.json`,
331
- `CLAUDE.md` (per repo + root), `~/.claude/projects/*/memory/*.md`
332
- with frontmatter, `~/.claude/keybindings.json`, MCP server
333
- registrations in `~/.claude.json`, skill directories, per-project
334
- overrides, and external CLIs that behave differently per project.
335
-
336
- There is no single place that answers *"what can this agent do right
337
- now, and why is that configured that way?"*. Drift between sessions
338
- is invisible until it breaks something. Humans editing one surface
339
- do not know which other surfaces they need to touch. A fresh agent
340
- instance has no way to audit its own setup.
341
-
342
- Our entry point into this problem: on 2026-04-23, an
343
- `agent-grounding` checkout that was 16 commits behind origin led two
344
- tasks to be incorrectly called "stale". The check that would have
345
- caught it already exists,
242
+ On 2026-04-23, an `agent-grounding` checkout that was 16 commits
243
+ behind origin led two tasks to be incorrectly called "stale". The
244
+ check that would have caught it already existed:
346
245
  [`agent-preflight`](https://github.com/LanNguyenSi/agent-preflight)
347
- runs `git fetch` + `git status` (alongside lint, typecheck, test,
348
- audit) and emits a structured `ready` + confidence-score result. The
349
- missing piece was not the check itself, it was the deterministic
350
- *trigger*: a `SessionStart` hook that invokes `preflight run` and a
351
- policy that gates further work on the result. Building that wiring
352
- needs an agreed-upon place for harness config to live first. That
353
- conversation is the origin of this repo.
246
+ runs `git fetch` + `git status` and emits a structured `ready` +
247
+ confidence-score result. The missing piece was not the check, it was
248
+ the deterministic *trigger*: a `SessionStart` hook that invokes
249
+ `preflight run` and a policy that gates further work on the result.
250
+ Building that wiring needs an agreed-upon place for harness config to
251
+ live first. That conversation is the origin of this repo.
354
252
 
355
253
  ## Related
356
254
 
357
- - [`agent-grounding`](https://github.com/LanNguyenSi/agent-grounding):
358
- grounding primitives (evidence-ledger, claim-gate,
359
- review-claim-gate); `grounding-mcp` is the canonical client surface
360
- harness queries through `queryLedgerByTag`.
361
- - [`agent-memory`](https://github.com/LanNguyenSi/agent-memory):
362
- memory surfaces the control plane inventories.
363
- - [`agent-tasks`](https://github.com/LanNguyenSi/agent-tasks): the
364
- MCP-registered task platform whose registration + health appear in
365
- `harness describe`.
366
- - [`agent-preflight`](https://github.com/LanNguyenSi/agent-preflight):
367
- local preflight validator; the canonical implementation of
368
- preflight-hook content harness wires.
369
- - [`codebase-oracle`](https://github.com/LanNguyenSi/codebase-oracle):
370
- an opt-in MCP surface for multi-repo RAG search. Not in the Full
371
- default; operators wire it via `harness add mcp codebase-oracle
372
- --command codebase-oracle,mcp`.
373
- - [`agent-dx`](https://github.com/LanNguyenSi/agent-dx): ships
374
- `git-batch-cli`, a day-to-day tool whose inventory appears in
375
- `harness describe`.
255
+ - [`agent-grounding`](https://github.com/LanNguyenSi/agent-grounding): evidence-ledger, claim-gate, review-claim-gate; `grounding-mcp` is the canonical client surface harness queries through `queryLedgerByTag`.
256
+ - [`agent-memory`](https://github.com/LanNguyenSi/agent-memory): the memory surfaces that the control plane inventories.
257
+ - [`agent-tasks`](https://github.com/LanNguyenSi/agent-tasks): MCP-registered task platform whose registration + health appear in `harness describe`.
258
+ - [`agent-preflight`](https://github.com/LanNguyenSi/agent-preflight): local preflight validator; the canonical implementation of the preflight-hook content that harness wires.
259
+ - [`codebase-oracle`](https://github.com/LanNguyenSi/codebase-oracle): opt-in MCP for multi-repo RAG search; not in Full, wire via `harness add mcp codebase-oracle --command codebase-oracle,mcp`.
260
+ - [`agent-dx`](https://github.com/LanNguyenSi/agent-dx): ships `git-batch-cli`, a day-to-day tool whose inventory appears in `harness describe`.
376
261
 
377
262
  ## License
378
263
 
@@ -60,8 +60,18 @@ async function writeLedgerTag(manifest, sessionId, content, opts) {
60
60
  * from a silent dead end to a "hook fired but parse failed because X"
61
61
  * pointer. Best-effort: any I/O error is swallowed and we report no
62
62
  * parse-error, mirroring the listPersistedReports contract.
63
+ *
64
+ * `sessionId` filter (agent-tasks/b13205b2): each parse-error log's JSON
65
+ * header carries the `sessionId` of the session that produced it. The
66
+ * lookup used to return the directory-newest log regardless of whose
67
+ * session wrote it, so a stale parse-error from a previous session would
68
+ * surface in the current operator's approve output and read like a
69
+ * failure of THEIR session. Logs whose header sessionId does not match
70
+ * `sessionId` are now skipped entirely. Logs missing a `sessionId` field
71
+ * (or whose header is not JSON) are also skipped, since we cannot
72
+ * attribute them.
63
73
  */
64
- function findLatestParseError(dir) {
74
+ function findLatestParseError(dir, sessionId) {
65
75
  let names;
66
76
  try {
67
77
  names = fs.readdirSync(dir);
@@ -69,7 +79,7 @@ function findLatestParseError(dir) {
69
79
  catch {
70
80
  return null;
71
81
  }
72
- let newest = null;
82
+ const candidates = [];
73
83
  for (const name of names) {
74
84
  if (!name.endsWith(".log"))
75
85
  continue;
@@ -83,42 +93,48 @@ function findLatestParseError(dir) {
83
93
  }
84
94
  if (!stat.isFile())
85
95
  continue;
86
- if (!newest || stat.mtimeMs > newest.mtimeMs) {
87
- newest = { filePath: full, mtimeMs: stat.mtimeMs };
88
- }
89
- }
90
- if (!newest)
91
- return null;
92
- let raw;
93
- try {
94
- raw = fs.readFileSync(newest.filePath, "utf8");
95
- }
96
- catch {
97
- return { filePath: newest.filePath, summary: "<unreadable>" };
96
+ candidates.push({ filePath: full, mtimeMs: stat.mtimeMs });
98
97
  }
99
- // The standalone package writes a JSON header followed by `--- raw ---`
100
- // and the original assistant text. Read the header for a `message`,
101
- // `reason`, or `missing` field; fall back to the first line if the
102
- // schema is unfamiliar so a future format change still surfaces
103
- // *something* rather than going silent.
104
- const header = raw.split("\n--- raw ---")[0] ?? raw;
105
- let summary = (header.split("\n")[0] ?? "").trim();
106
- try {
107
- const parsed = JSON.parse(header);
108
- if (typeof parsed["message"] === "string" && parsed["message"].length > 0) {
109
- summary = parsed["message"];
98
+ candidates.sort((a, b) => b.mtimeMs - a.mtimeMs);
99
+ for (const cand of candidates) {
100
+ let raw;
101
+ try {
102
+ raw = fs.readFileSync(cand.filePath, "utf8");
103
+ }
104
+ catch {
105
+ continue;
106
+ }
107
+ // The standalone package writes a JSON header followed by `--- raw ---`
108
+ // and the original assistant text. Read the header for a `message`,
109
+ // `reason`, or `missing` field; fall back to the first line if the
110
+ // schema is unfamiliar so a future format change still surfaces
111
+ // *something* rather than going silent.
112
+ const header = raw.split("\n--- raw ---")[0] ?? raw;
113
+ let summary = (header.split("\n")[0] ?? "").trim();
114
+ let headerSessionId = null;
115
+ try {
116
+ const parsed = JSON.parse(header);
117
+ if (typeof parsed["sessionId"] === "string") {
118
+ headerSessionId = parsed["sessionId"];
119
+ }
120
+ if (typeof parsed["message"] === "string" && parsed["message"].length > 0) {
121
+ summary = parsed["message"];
122
+ }
123
+ else if (typeof parsed["reason"] === "string") {
124
+ const missing = Array.isArray(parsed["missing"])
125
+ ? ` (missing: ${parsed["missing"].filter((m) => typeof m === "string").join(", ")})`
126
+ : "";
127
+ summary = `${parsed["reason"]}${missing}`;
128
+ }
110
129
  }
111
- else if (typeof parsed["reason"] === "string") {
112
- const missing = Array.isArray(parsed["missing"])
113
- ? ` (missing: ${parsed["missing"].filter((m) => typeof m === "string").join(", ")})`
114
- : "";
115
- summary = `${parsed["reason"]}${missing}`;
130
+ catch {
131
+ /* keep the first-line fallback; headerSessionId stays null */
116
132
  }
133
+ if (headerSessionId !== sessionId)
134
+ continue;
135
+ return { filePath: cand.filePath, summary };
117
136
  }
118
- catch {
119
- /* keep the first-line fallback */
120
- }
121
- return { filePath: newest.filePath, summary };
137
+ return null;
122
138
  }
123
139
  function rewriteReportApproved(filePath, approvedAt, approvedBy) {
124
140
  const raw = fs.readFileSync(filePath, "utf8");
@@ -247,7 +263,7 @@ export async function approveUnderstanding(opts = {}) {
247
263
  // fired but the parser rejected the report — here is why", rather
248
264
  // than a silent dead end.
249
265
  const parseErrorsDir = path.join(path.dirname(reportsDir), "parse-errors");
250
- const latestParseError = findLatestParseError(parseErrorsDir);
266
+ const latestParseError = findLatestParseError(parseErrorsDir, sessionId);
251
267
  let reason;
252
268
  if (reports.length === 0) {
253
269
  reason = `no reports found at ${reportsDir}`;