@lannguyensi/harness 0.17.4 → 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +65 -0
- package/README.md +86 -201
- package/dist/cli/approve/understanding.js +51 -35
- package/dist/cli/approve/understanding.js.map +1 -1
- package/dist/cli/doctor/format.js +20 -2
- package/dist/cli/doctor/format.js.map +1 -1
- package/dist/cli/doctor/index.d.ts +8 -0
- package/dist/cli/doctor/index.js +27 -1
- package/dist/cli/doctor/index.js.map +1 -1
- package/dist/cli/doctor/npm-bin-path.d.ts +23 -0
- package/dist/cli/doctor/npm-bin-path.js +82 -0
- package/dist/cli/doctor/npm-bin-path.js.map +1 -0
- package/dist/cli/doctor/types.d.ts +20 -4
- package/dist/cli/doctor/types.js.map +1 -1
- package/dist/cli/index.js +19 -2
- package/dist/cli/index.js.map +1 -1
- package/dist/cli/init/agent-tasks-auth.d.ts +32 -0
- package/dist/cli/init/agent-tasks-auth.js +75 -0
- package/dist/cli/init/agent-tasks-auth.js.map +1 -0
- package/dist/cli/init/composer.js +11 -0
- package/dist/cli/init/composer.js.map +1 -1
- package/dist/cli/init/dependencies.js +7 -3
- package/dist/cli/init/dependencies.js.map +1 -1
- package/dist/cli/init/interactive.d.ts +5 -0
- package/dist/cli/init/interactive.js +162 -4
- package/dist/cli/init/interactive.js.map +1 -1
- package/dist/cli/init/profiles.d.ts +2 -2
- package/dist/cli/init/profiles.js +30 -0
- package/dist/cli/init/profiles.js.map +1 -1
- package/dist/cli/init/templates.d.ts +1 -1
- package/dist/cli/init/templates.js +37 -1
- package/dist/cli/init/templates.js.map +1 -1
- package/dist/cli/pack/hook-post-tool-use.d.ts +19 -0
- package/dist/cli/pack/hook-post-tool-use.js +168 -0
- package/dist/cli/pack/hook-post-tool-use.js.map +1 -0
- package/dist/cli/pack/hook-pre-tool-use.js +5 -2
- package/dist/cli/pack/hook-pre-tool-use.js.map +1 -1
- package/dist/cli/session-start/index.js +8 -1
- package/dist/cli/session-start/index.js.map +1 -1
- package/dist/policy-packs/builtin/understanding-before-execution-runtime.d.ts +47 -1
- package/dist/policy-packs/builtin/understanding-before-execution-runtime.js +98 -1
- package/dist/policy-packs/builtin/understanding-before-execution-runtime.js.map +1 -1
- package/dist/policy-packs/builtin/understanding-before-execution.js +87 -2
- package/dist/policy-packs/builtin/understanding-before-execution.js.map +1 -1
- package/package.json +1 -1
package/CHANGELOG.md
CHANGED
|
@@ -7,6 +7,71 @@ and this project adheres to [Semantic Versioning](https://semver.org/).
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [0.19.0] - 2026-05-17
|
|
11
|
+
|
|
12
|
+
**Headline: setup UX gap closed for non-agent-tasks operators.** Through v0.18.x several pieces of the harness experience silently degraded for operators picking Solo (or Team without an agent-tasks account): the new per-task understanding-gate marker expiry never fired because the configured boundary list was agent-tasks MCP names, `harness init --interactive` left the bridge wired but unauthenticated, `harness doctor` flagged the deliberately operator-driven `dogfood-before-release` policy as a missing-producer false positive, and an nvm-drift class of bug went undiagnosed. This release closes those four gaps: profile-aware reset defaults plus a new `expire_on_bash_match` regex list for gh-CLI workflows, a post-install auth probe with login / skip / abort dialog, doctor respect for the policy's own `producers:` array, and a doctor warning that catches when `npm prefix -g`'s bin dir is not on PATH. Doc cleanup made the external-account assumptions of each profile explicit up-front.
|
|
13
|
+
|
|
14
|
+
### Added
|
|
15
|
+
|
|
16
|
+
- **understanding-gate: `approval_lifecycle.expire_on_bash_match`** (harness/f54e0ecb). New optional schema field on the `understanding-before-execution` pack config: a string array of regex patterns matched against the `Bash` tool's `tool_input.command`. When a Bash command matches, the per-session approval marker is deleted on PostToolUse, with the same semantics that the existing `expire_on_tool_match` applies to MCP tool names. Enables gh-CLI / pure-Bash workflows to declare task boundaries (e.g. `^gh pr (merge|close)\b`, `^git push origin (master|main)\b`) so the gate's per-task re-prompt works for them too. Profile defaults updated: Solo drops the agent-tasks tool list (dead weight there) and ships only the Bash list with `max_age: 1h`; Team and Full keep the tool list and add the Bash list for hybrid coverage. Patterns are pre-compiled at parse time; invalid regexes are dropped with stderr warnings. Round-trip regression tests in `tests/cli/init-full-template-pins.test.ts` parse each template through `yaml.parse + new RegExp + .test()` to pin the escape-pipeline correctness, since the unit-level tests bypass that surface.
|
|
17
|
+
|
|
18
|
+
- **`harness doctor`: warn when `npm prefix -g`'s bin dir is not on PATH** (harness/4ddd78ed). Surfaces the nvm-drift footgun where `harness init --interactive` runs `npm i -g` against the active Node's prefix but the operator's shell PATH points at a different one, so installed binaries are silently invisible to subsequent doctor probes. Doctor now resolves the bin dir via `npm prefix -g` (the modern replacement for the removed `npm bin -g`) and renders an `Environment` section with the actionable PATH-patch suggestion when the bin dir is not in `process.env.PATH`. The section stays absent on ok and on the unknown branch (npm missing); skipped under `--shallow` so the 100ms timing budget stays intact.
|
|
19
|
+
|
|
20
|
+
- **`harness init --interactive`: post-install auth probe for the agent-tasks bridge** (harness/3f775180). After a successful `npm i -g @agent-tasks/mcp-bridge`, the wizard runs `agent-tasks-mcp-bridge status` to detect whether a token is configured. Three branches:
|
|
21
|
+
- **ok**: prints `✓ agent-tasks token validated against the backend.` and continues.
|
|
22
|
+
- **token present but validation fails** (backend unreachable, expired token, wrong base URL): prints an informational warning naming the bridge's reason and continues. The wizard does not block on this because the recovery is not actionable from inside it.
|
|
23
|
+
- **no token stored**: opens a three-option dialog: (a) run `agent-tasks-mcp-bridge login` interactively now via stdio pass-through, (b) skip with a reminder, (c) abort the wizard with a pointer to the signup URL and the re-run command. After a successful login the wizard re-probes to confirm.
|
|
24
|
+
|
|
25
|
+
Closes the silent footgun where a fresh operator could finish the wizard with `harness doctor` reporting all-green but every `mcp__agent-tasks__*` call returning an auth error.
|
|
26
|
+
|
|
27
|
+
- **FULL_TEMPLATE `git-preflight` hook pin: `min_version: "0.1.1"` + `version_command: ["preflight", "--version"]`** (agent-preflight/cb5a1770). Same pattern as the existing pins for `agent-tasks-mcp-bridge`, `grounding-mcp`, `memory-router-user-prompt-submit`. Floor at agent-preflight 0.1.1, the release that distinguishes "tool not installed" (e.g. an npm script invoking eslint that is not in devDependencies) from real lint/test/typecheck failures. Stale 0.1.0 installs silently emit false-positive blockers that keep the `preflight-before-*` policies closed forever; with the floor wired, `harness doctor` now warns operators to upgrade.
|
|
28
|
+
|
|
29
|
+
### Changed
|
|
30
|
+
|
|
31
|
+
- **`harness doctor`: producer-gap warning now respects the policy's own `producers:` array** (harness/f97e152f). A `block` policy with a `within:` window used to be flagged with `⚠ ... no manifest hook produces it` whenever no automatic SessionStart hook wrote the required tag, even when the policy itself declared a `producers:` entry pointing the agent at the manual recovery (`mcp__agent-grounding__ledger_add`). For `dogfood-before-release` in the Full template that was a false positive: the gate is deliberately operator-driven (an automatic SessionStart producer would defeat its purpose), and the `producers:` array IS the schema-blessed manual recovery path the agent sees in the deny envelope. Doctor now treats a non-empty `producers:` array as a documented producer and suppresses the warning. The warning still fires when both kinds are absent. Visible effect on the Full template: one fewer false-positive warning (dogfood-before-release flips from `⚠` to `✓`); the two preflight policies were already satisfied by the `git-preflight` SessionStart hook and stay green.
|
|
32
|
+
|
|
33
|
+
- **Profile dependency clarity in README + wizard** (harness/75de11c4). README, `docs/init-interactive.md`, `docs/for-humans.md`, the wizard's profile-choice descriptions, and the Team-profile confirm prompt now state the external-account assumptions of each profile up-front: Solo is standalone, Team requires an agent-tasks account, Full additionally requires `@lannguyensi/agent-preflight` and `gh` on PATH. The wizard also prints a post-init reminder for Team/Full operators naming `agent-tasks-mcp-bridge login` as the auth recovery path and `--template solo` as the fallback for non-agent-tasks workflows.
|
|
34
|
+
|
|
35
|
+
## [0.18.0] - 2026-05-17
|
|
36
|
+
|
|
37
|
+
**Headline: per-task understanding-gate marker expiry.** Through v0.17.x the approval marker had no lifetime: one `harness approve understanding` covered every subsequent Edit / Write / Bash for the whole session. That contract was correct when the gate was about "agent starts a session, picks ONE interpretation, runs", but no longer matches multi-task sessions, where a stale interpretation can silently drive the next task's edits. Live failure mode from the v0.17.4 dogfood: three sequential tasks in one session, marker stayed valid across all three, the third task started implementing the wrong fix surface before the operator caught the misdiagnosis. v0.18 expires the marker on configurable task-boundary tools and (optionally) on a TTL safety net, so a fresh task gets a fresh Understanding Report. Backing task: agent-tasks/d8ee60ca.
|
|
38
|
+
|
|
39
|
+
**Operator action required (sort of):** the new behaviour is default-on for every install via `harness init --template solo / team / full` and via `init --interactive` Custom. Existing manifests that already use the pack will see the stricter behaviour on the next `harness apply` if they re-render from the template. Operators who prefer the legacy "one approval per session, no expiry" contract opt out by setting `policy_packs[].config.approval_lifecycle: { mode: "session" }`. Manifests that copy the pack config verbatim from the README / docs and pin it inline keep working unchanged until they explicitly add the new block.
|
|
40
|
+
|
|
41
|
+
### Added
|
|
42
|
+
|
|
43
|
+
- **`config.approval_lifecycle` on the understanding-before-execution pack** (agent-tasks/d8ee60ca). New schema-shape under the pack's `config:`:
|
|
44
|
+
|
|
45
|
+
```yaml
|
|
46
|
+
policy_packs:
|
|
47
|
+
- name: understanding-before-execution
|
|
48
|
+
config:
|
|
49
|
+
approval_lifecycle:
|
|
50
|
+
expire_on_tool_match:
|
|
51
|
+
- mcp__agent-tasks__task_finish
|
|
52
|
+
- mcp__agent-tasks__task_abandon
|
|
53
|
+
- mcp__agent-tasks__pull_requests_merge
|
|
54
|
+
max_age: 4h
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
`expire_on_tool_match` is a list of tool name patterns whose successful PostToolUse fires marker expiry. `max_age` is a duration (`24h` / `30m` / `PT1H` / ...) that the PreToolUse blocker enforces against the marker's `approvedAt` field. Both are optional. `{ mode: "session" }` opts out of both and restores the legacy behaviour. Coupling note: the default tool list names `mcp__agent-tasks__*` verbs because that is what every wizard-defaulted install uses, but the field is purely string-based, so operators on Linear / JIRA / GitHub Projects override with their own task-system verbs.
|
|
58
|
+
|
|
59
|
+
- **PostToolUse marker-expiry hook** (`harness pack hook post-tool-use`, new subcommand). Reads the PostToolUse event JSON from stdin and, when the just-completed tool matches the pack's `expire_on_tool_match` list, deletes the per-session approval marker. Fails open, degrading to a no-op: any error path is logged and the hook exits 0, so a bug in this code never escalates into a session-wide tool block. Worst case the marker persists past the intended boundary, which degrades to the legacy per-session contract.
|
|
60
|
+
|
|
61
|
+
- **`checkApprovalMarker` honours `opts.maxAgeMs`** (extended). When set, a marker whose `approvedAt` is older than `now - maxAgeMs` is treated as expired and returns `matched:false` with an "expired" detail, so the agent sees the same "no approval" UX as a never-approved session and must re-approve. A marker with no readable `approvedAt` (body corrupted, missing field) skips the freshness check, so the existence-only DoS-resistance contract from v0.13.0 still wins.
|
|
62
|
+
|
|
63
|
+
### Changed
|
|
64
|
+
|
|
65
|
+
- **`init --template solo / team / full` + Custom-composer all ship `approval_lifecycle` defaults.** Re-running `harness init --force` on an existing install picks them up; an existing operator-edited manifest keeps the legacy behaviour unchanged until the operator manually adds the block or re-renders from a template.
|
|
66
|
+
|
|
67
|
+
- **`policy_packs[].config.approval_lifecycle` flows into the pack-expand surface.** `expandPolicyPacks` now contributes 4 Claude hooks instead of 3 (UserPromptSubmit + Stop + PreToolUse + the new PostToolUse). Operators who pinned the v0.17 3-hook shape in custom infrastructure should expect the new hook in their generated `settings.json` after the next `harness apply`.
|
|
68
|
+
|
|
69
|
+
### Verification
|
|
70
|
+
|
|
71
|
+
- `npm test`: 1361/1361 (was 1344, +17 new tests across `tests/cli/pack-hook-post-tool-use.test.ts`, `tests/policy-packs/marker-max-age.test.ts`, and additions to `tests/policy-packs/expand.test.ts`).
|
|
72
|
+
- `npm run typecheck`: clean.
|
|
73
|
+
- Golden fixture: `docs/examples/full-manifest.expected.yaml` updated for the new pack config block.
|
|
74
|
+
|
|
10
75
|
## [0.17.4] - 2026-05-17
|
|
11
76
|
|
|
12
77
|
**Headline: `harness init --interactive` wire-now actually wires settings.json now.** Closes a silent-no-op bug surfaced during the v0.17.2 dogfood (operator picked Full, picked claude-code in wire-now, but branch-protection's hooks never reached `~/.claude/settings.json`). Root cause: wireRuntime called `apply({ target, merge: true })` without `overwriteDrift`. A pre-existing stale or missing `~/.claude/harness.generated/.last-apply` snapshot made the freshly-rendered `harness.generated/settings.json` look like full-file drift, so apply returned `outcome: "drift-refuse"` without throwing. wireRuntime only checked `targetWritten` and printed nothing when it was false — leaving the operator with a "restart hint" line that implied success while settings.json was never updated. Fix: init's wire-now passes `overwriteDrift: true` with an auto-confirm prompt. Drift safeguards remain in place for ad-hoc `harness apply`; init's canonical "start from scratch" intent now always lands. Backing task: agent-tasks/df68b3e6.
|
package/README.md
CHANGED
|
@@ -11,18 +11,13 @@ applies, audits, and *enforces*.
|
|
|
11
11
|
> exact context, and why.
|
|
12
12
|
|
|
13
13
|
A coding agent like Claude Code is configured across half a dozen
|
|
14
|
-
files
|
|
15
|
-
hook scripts, per-project overrides
|
|
16
|
-
this agent do right now, and why is it set up that way?"
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
`harness` puts all of that in one YAML file you can read, validate,
|
|
22
|
-
and diff. From that file it generates the config the agent actually
|
|
23
|
-
loads, and at runtime it enforces the rules you declared: it blocks a
|
|
24
|
-
tool call that violates one, and records every decision so you can
|
|
25
|
-
see what fired and why.
|
|
14
|
+
files (`settings.json`, `CLAUDE.md`, memory notes, MCP registrations,
|
|
15
|
+
hook scripts, per-project overrides), and no single file answers
|
|
16
|
+
*"what can this agent do right now, and why is it set up that way?"*.
|
|
17
|
+
`harness` puts all of it in one YAML you read, validate, and diff;
|
|
18
|
+
generates the config the agent loads from it; and at runtime blocks
|
|
19
|
+
tool calls that violate the declared rules while recording every
|
|
20
|
+
decision.
|
|
26
21
|
|
|
27
22
|
## See it work
|
|
28
23
|
|
|
@@ -31,13 +26,20 @@ until it has logged a review.*
|
|
|
31
26
|
|
|
32
27
|
Claude Code goes to merge PR 42. Before the tool call runs, the
|
|
33
28
|
runtime hands the event to `harness`, which checks it against the
|
|
34
|
-
manifest
|
|
29
|
+
manifest. The hook protocol wire shape is the legacy engine-vocabulary
|
|
30
|
+
envelope (operators see this on stderr; agents read it via
|
|
31
|
+
`permissionDecisionReason` when the policy declares no `ux:` block):
|
|
35
32
|
|
|
36
33
|
```console
|
|
37
34
|
$ harness policy intercept # Claude Code runs this before each tool call
|
|
38
35
|
{"decision":"block","reason":"review-before-merge: no matching ledger entry for tag `review:42`","hookSpecificOutput":{"hookEventName":"PreToolUse","permissionDecision":"deny","permissionDecisionReason":"review-before-merge: no matching ledger entry for tag `review:42`"}}
|
|
39
36
|
```
|
|
40
37
|
|
|
38
|
+
Built-in block-enforcement policies ship a `ux:` block since v0.17.0,
|
|
39
|
+
so the agent sees a plain-language three-section form
|
|
40
|
+
([`docs/for-agents.md`](docs/for-agents.md#agent-facing-block-messages-ux-block));
|
|
41
|
+
the engine-vocabulary text above stays in the audit ledger.
|
|
42
|
+
|
|
41
43
|
Blocked. `harness explain` says exactly why:
|
|
42
44
|
|
|
43
45
|
```console
|
|
@@ -78,52 +80,6 @@ timestamp policy outcome reason
|
|
|
78
80
|
Declare the rule once; every session is held to it, with a paper
|
|
79
81
|
trail of every decision.
|
|
80
82
|
|
|
81
|
-
## What the agent sees vs what the engine records
|
|
82
|
-
|
|
83
|
-
A policy has two readers: the audit ledger (which wants every internal
|
|
84
|
-
detail) and the agent (which only needs to know what is blocked, what
|
|
85
|
-
condition is missing, and which command satisfies it). Declaring a
|
|
86
|
-
policy's `ux:` block splits those readers cleanly.
|
|
87
|
-
|
|
88
|
-
Engine-internal model (unchanged): session IDs, ledger entries,
|
|
89
|
-
attestations, provenance chains, policy DAGs. All of it still feeds
|
|
90
|
-
`audit`, `explain --trace`, and the evidence-ledger writes that
|
|
91
|
-
`session-export` replays.
|
|
92
|
-
|
|
93
|
-
Agent-facing model (new, opt-in per policy): `cannot` (what is
|
|
94
|
-
blocked), `required` (the missing precondition, in plain words), and
|
|
95
|
-
`run` (the exact command to satisfy it). When `ux:` is declared, the
|
|
96
|
-
agent sees only this shape, with `${VAR}` references substituted
|
|
97
|
-
against the same context the `ledger_tag` resolved against.
|
|
98
|
-
|
|
99
|
-
```yaml
|
|
100
|
-
policies:
|
|
101
|
-
- name: preflight-before-investigation
|
|
102
|
-
requires: { ledger_tag: "preflight:${REPO}", within: "1h" }
|
|
103
|
-
enforcement: block
|
|
104
|
-
ux:
|
|
105
|
-
cannot: "You cannot investigate this repository yet."
|
|
106
|
-
required: ["verified repository preflight"]
|
|
107
|
-
run: ["harness preflight"]
|
|
108
|
-
```
|
|
109
|
-
|
|
110
|
-
On block, the agent sees:
|
|
111
|
-
|
|
112
|
-
```
|
|
113
|
-
You cannot investigate this repository yet.
|
|
114
|
-
|
|
115
|
-
Required:
|
|
116
|
-
- verified repository preflight
|
|
117
|
-
|
|
118
|
-
Run:
|
|
119
|
-
harness preflight
|
|
120
|
-
```
|
|
121
|
-
|
|
122
|
-
Not `no matching ledger entry for tag preflight:harness`. The
|
|
123
|
-
internal failure (tag, hint, matched count) is still written to the
|
|
124
|
-
ledger for `audit` and `explain --trace`. Policies without `ux:` keep
|
|
125
|
-
the legacy deny envelope unchanged.
|
|
126
|
-
|
|
127
83
|
## Concepts in six lines
|
|
128
84
|
|
|
129
85
|
| Term | What it is |
|
|
@@ -152,25 +108,16 @@ flowchart LR
|
|
|
152
108
|
observe -. refine .-> declare
|
|
153
109
|
```
|
|
154
110
|
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
surfaces (`audit`, `explain --trace`, `session-export`) replay those
|
|
160
|
-
rows so you can see what fired, why, and across which session.
|
|
161
|
-
Whatever you learn from observing flows back into the manifest. That
|
|
162
|
-
loop is the whole product.
|
|
111
|
+
Observe → refine → declare is the whole loop. The read-side surfaces
|
|
112
|
+
(`audit`, `explain --trace`, `session-export`) replay rows the runtime
|
|
113
|
+
already recorded, so what flows back into the manifest is grounded in
|
|
114
|
+
what actually happened.
|
|
163
115
|
|
|
164
116
|
## Pick your audience
|
|
165
117
|
|
|
166
|
-
- **Operator?**
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
- **Agent (or onboarding one)?** Read
|
|
170
|
-
[`docs/for-agents.md`](docs/for-agents.md). It defines the
|
|
171
|
-
workflow lifecycle, the policy / ledger sequence, the CLI cheat
|
|
172
|
-
sheet split by side-effect class, and the audit triumvirate
|
|
173
|
-
(`audit` vs `explain --trace` vs `session-export`).
|
|
118
|
+
- **Operator?** [`docs/for-humans.md`](docs/for-humans.md): install through first `apply`, first real policy, diagnostics cheat sheet.
|
|
119
|
+
- **Agent (or onboarding one)?** [`docs/for-agents.md`](docs/for-agents.md): workflow lifecycle, policy / ledger sequence, CLI cheat sheet by side-effect class, the audit triumvirate.
|
|
120
|
+
- **Writing your own policy?** [`docs/writing-custom-policies.md`](docs/writing-custom-policies.md): three tripwires, four worked recipes (each validated in CI), author loop, field reference.
|
|
174
121
|
|
|
175
122
|
## Install
|
|
176
123
|
|
|
@@ -189,11 +136,21 @@ command path, install to wired-in, no prose.
|
|
|
189
136
|
harness init --interactive
|
|
190
137
|
```
|
|
191
138
|
|
|
192
|
-
Guided wizard
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
139
|
+
Guided wizard. Detects `~/.claude/` and `~/.codex/`, MCP servers
|
|
140
|
+
already wired in `settings.json`, harness binary version. Picks a
|
|
141
|
+
profile (`solo` / `team` / `full` / `custom`) and writes a starting
|
|
142
|
+
`harness.yaml`. Ctrl-C aborts cleanly. Walkthrough +
|
|
143
|
+
limitations: [`docs/init-interactive.md`](docs/init-interactive.md).
|
|
144
|
+
|
|
145
|
+
### Profiles at a glance
|
|
146
|
+
|
|
147
|
+
| Profile | External accounts / tools required | Best for |
|
|
148
|
+
|---------|------------------------------------|----------|
|
|
149
|
+
| `solo` | None. `npm` + Claude Code is enough. | Single operators who want the Understanding Gate without committing to a tasking system. |
|
|
150
|
+
| `team` | An **agent-tasks** account ([hosted](https://agent-tasks.opentriologue.ai) or [self-hosted](https://github.com/LanNguyenSi/agent-tasks)). | Teams that already use `agent-tasks` for PR review tracking. The merge gate (`review:<pr-number>` ledger tag) wires against the agent-tasks MCP. |
|
|
151
|
+
| `full` | Same as `team` plus `@lannguyensi/agent-preflight` and `gh` on PATH. | Operators who want every reference policy enforced (dogfood gate, preflight gates, review-subagent gate, merge gate). |
|
|
152
|
+
|
|
153
|
+
**Not using agent-tasks?** Pick `solo`. The `team` and `full` review gates currently match only the agent-tasks MCP tool names, so a `gh pr create` workflow stays unprotected by them today. Tool-agnostic gates that also match `gh pr` are tracked in the backlog.
|
|
197
154
|
|
|
198
155
|
If you prefer non-interactive (CI, fresh-VM provisioning), pick a
|
|
199
156
|
template directly:
|
|
@@ -204,17 +161,14 @@ harness init --template team # solo + agent-tasks MCP + review-before-merge po
|
|
|
204
161
|
harness init --template full # everything from the Appendix A reference manifest
|
|
205
162
|
```
|
|
206
163
|
|
|
207
|
-
|
|
164
|
+
Use `harness init --probe` for a JSON snapshot of detected runtimes
|
|
165
|
+
and MCPs without writing anything.
|
|
208
166
|
|
|
209
|
-
|
|
210
|
-
harness init --probe # JSON snapshot of detected runtimes + MCPs + manifest
|
|
211
|
-
```
|
|
212
|
-
|
|
213
|
-
## Try it yourself
|
|
167
|
+
## Try it without installing
|
|
214
168
|
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
169
|
+
`harness dry-run` reports which hooks fire and which policies match
|
|
170
|
+
for a given tool call, against the reference manifest, before any
|
|
171
|
+
ledger I/O:
|
|
218
172
|
|
|
219
173
|
```bash
|
|
220
174
|
git clone https://github.com/LanNguyenSi/harness && cd harness
|
|
@@ -225,44 +179,23 @@ node dist/cli/main.js dry-run "merge PR 42" \
|
|
|
225
179
|
--config docs/examples/full-manifest.yaml
|
|
226
180
|
```
|
|
227
181
|
|
|
228
|
-
`
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
The reference manifest is a schema-coverage example, not a runnable
|
|
234
|
-
config. `harness validate --config docs/examples/full-manifest.yaml`
|
|
235
|
-
will report errors for install-specific hook script paths it
|
|
236
|
-
references (and warnings for binaries like `git-batch` that only exist
|
|
237
|
-
in a real install). That is expected; the file header spells out the
|
|
238
|
-
contract. Use `harness init --template full` to get a manifest
|
|
239
|
-
tailored to your machine.
|
|
240
|
-
|
|
241
|
-
Convinced? Install globally and set up your own:
|
|
242
|
-
`npm i -g @lannguyensi/harness && harness init --interactive`.
|
|
182
|
+
`docs/examples/full-manifest.yaml` is a schema-coverage example, not a
|
|
183
|
+
runnable config (the file header spells out the contract). For a
|
|
184
|
+
manifest tailored to your machine, install globally and run
|
|
185
|
+
`harness init --interactive`.
|
|
243
186
|
|
|
244
187
|
## Uninstall
|
|
245
188
|
|
|
246
189
|
`harness uninstall` is the single-command teardown: dry-run by default,
|
|
247
|
-
`--apply` to mutate
|
|
248
|
-
|
|
249
|
-
groups and `mcpServers` entries in `settings.json`, any leftover
|
|
250
|
-
`settings.json.pre-harness-<TS>` backups), then removes them after
|
|
251
|
-
writing a reversible backup + JSON snapshot next to `settings.json`.
|
|
252
|
-
|
|
253
|
-
```bash
|
|
254
|
-
harness uninstall # list, exit 0
|
|
255
|
-
harness uninstall --apply # tear down
|
|
256
|
-
harness uninstall --restore-from <pre-harness-backup> # atomic restore
|
|
257
|
-
npm uninstall -g @lannguyensi/harness # drop the CLI itself
|
|
258
|
-
```
|
|
190
|
+
`--apply` to mutate, `--restore-from <backup>` to roll back. Full
|
|
191
|
+
inventory + recommended order in [`docs/uninstall.md`](docs/uninstall.md).
|
|
259
192
|
|
|
260
193
|
## Status
|
|
261
194
|
|
|
262
195
|
harness ships in phases. Phases 1 through 6 are released: read-only
|
|
263
196
|
inventory → managed edits → declarative truth → policy layer → polish
|
|
264
197
|
and dogfood lessons → the Understanding Gate Policy Pack. Phase 7, the
|
|
265
|
-
Risk Gate, is next. The current release is `v0.
|
|
198
|
+
Risk Gate, is next. The current release is `v0.19.0`.
|
|
266
199
|
|
|
267
200
|
The phase-by-phase plan with acceptance criteria lives in
|
|
268
201
|
[`docs/ROADMAP.md`](docs/ROADMAP.md); what shipped in each version is
|
|
@@ -270,109 +203,61 @@ in [`CHANGELOG.md`](CHANGELOG.md).
|
|
|
270
203
|
|
|
271
204
|
## Policy Packs
|
|
272
205
|
|
|
273
|
-
A *Policy Pack* is a reusable bundle of
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
`understanding-before-execution` (shipped in `v0.9.0`), forces agents
|
|
277
|
-
to expose and confirm their task interpretation before any
|
|
278
|
-
write-capable tool fires.
|
|
206
|
+
A *Policy Pack* is a reusable bundle of hooks, policies, instruction
|
|
207
|
+
template, and permission profiles shipped under one name and enabled
|
|
208
|
+
from `harness.yaml` with a single key:
|
|
279
209
|
|
|
280
210
|
```yaml
|
|
281
211
|
policy_packs:
|
|
282
212
|
- name: understanding-before-execution
|
|
283
213
|
config:
|
|
284
|
-
mode: grill_me
|
|
285
|
-
permission_profile: safe-start
|
|
286
|
-
```
|
|
287
|
-
|
|
288
|
-
Manage packs with `harness pack add / remove / list`. Apply against
|
|
289
|
-
either runtime:
|
|
290
|
-
|
|
291
|
-
```sh
|
|
292
|
-
harness apply --runtime claude-code # default; writes harness.generated/settings.json
|
|
293
|
-
harness apply --runtime codex # writes harness.generated/codex/config.toml
|
|
214
|
+
mode: grill_me # fast_confirm | grill_me | strict
|
|
215
|
+
permission_profile: safe-start # safe-start | implementation-after-approval | high-risk-grill-me
|
|
294
216
|
```
|
|
295
217
|
|
|
296
|
-
|
|
297
|
-
`
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
block / allow / capture / approve round-trip without a real Codex
|
|
305
|
-
binary.
|
|
218
|
+
Manage packs with `harness pack add / remove / list`. Two packs ship
|
|
219
|
+
today: [`understanding-before-execution`](docs/policy-packs/understanding-before-execution.md)
|
|
220
|
+
(forces an Understanding Report before any write-capable tool fires)
|
|
221
|
+
and [`branch-protection`](docs/policy-packs/branch-protection.md)
|
|
222
|
+
(blocks source mutations on protected branches without an explicit
|
|
223
|
+
override). Custom packs from `path:`, `npm:`, or `git:` sources are
|
|
224
|
+
out of scope for v1 (see the pack docs for the future-vocabulary
|
|
225
|
+
contract).
|
|
306
226
|
|
|
307
227
|
## What's next
|
|
308
228
|
|
|
309
|
-
**Phase 7, Risk Gate.** Today's policy model
|
|
310
|
-
matching trigger
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
`risk
|
|
317
|
-
decision space extends to `allow / warn / require_approval / deny`.
|
|
318
|
-
Motivating use case: prevent `DROP TABLE users`, `kubectl delete
|
|
319
|
-
namespace prod`, `terraform destroy` against an unverified production
|
|
320
|
-
target, even if the model would have happily run them.
|
|
321
|
-
|
|
322
|
-
Phase 7 builds on Phase 4's `policy intercept` runtime backbone and
|
|
323
|
-
Phase 6's Policy Pack distribution surface; neither is replaced.
|
|
229
|
+
**Phase 7, Risk Gate.** Today's policy model returns a binary
|
|
230
|
+
block/allow per matching trigger. Phase 7 lets harness reason about
|
|
231
|
+
the action itself (Action Envelope → Context Resolver → Risk
|
|
232
|
+
Classifier) and extends the decision space to `allow / warn /
|
|
233
|
+
require_approval / deny`. Motivating use case: block `DROP TABLE
|
|
234
|
+
users`, `kubectl delete namespace prod`, `terraform destroy` against
|
|
235
|
+
unverified production targets. Full plan in
|
|
236
|
+
[`docs/ROADMAP.md#phase-7--risk-gate`](docs/ROADMAP.md#phase-7--risk-gate).
|
|
324
237
|
|
|
325
238
|
> Bring your favorite agent harness. Add governance.
|
|
326
239
|
|
|
327
240
|
## Why this exists
|
|
328
241
|
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
with frontmatter, `~/.claude/keybindings.json`, MCP server
|
|
333
|
-
registrations in `~/.claude.json`, skill directories, per-project
|
|
334
|
-
overrides, and external CLIs that behave differently per project.
|
|
335
|
-
|
|
336
|
-
There is no single place that answers *"what can this agent do right
|
|
337
|
-
now, and why is that configured that way?"*. Drift between sessions
|
|
338
|
-
is invisible until it breaks something. Humans editing one surface
|
|
339
|
-
do not know which other surfaces they need to touch. A fresh agent
|
|
340
|
-
instance has no way to audit its own setup.
|
|
341
|
-
|
|
342
|
-
Our entry point into this problem: on 2026-04-23, an
|
|
343
|
-
`agent-grounding` checkout that was 16 commits behind origin led two
|
|
344
|
-
tasks to be incorrectly called "stale". The check that would have
|
|
345
|
-
caught it already exists,
|
|
242
|
+
On 2026-04-23, an `agent-grounding` checkout that was 16 commits
|
|
243
|
+
behind origin led two tasks to be incorrectly called "stale". The
|
|
244
|
+
check that would have caught it already existed:
|
|
346
245
|
[`agent-preflight`](https://github.com/LanNguyenSi/agent-preflight)
|
|
347
|
-
runs `git fetch` + `git status`
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
conversation is the origin of this repo.
|
|
246
|
+
runs `git fetch` + `git status` and emits a structured `ready` +
|
|
247
|
+
confidence-score result. The missing piece was not the check, it was
|
|
248
|
+
the deterministic *trigger*: a `SessionStart` hook that invokes
|
|
249
|
+
`preflight run` and a policy that gates further work on the result.
|
|
250
|
+
Building that wiring needs an agreed-upon place for harness config to
|
|
251
|
+
live first. That conversation is the origin of this repo.
|
|
354
252
|
|
|
355
253
|
## Related
|
|
356
254
|
|
|
357
|
-
- [`agent-grounding`](https://github.com/LanNguyenSi/agent-grounding):
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
- [`
|
|
362
|
-
|
|
363
|
-
- [`agent-tasks`](https://github.com/LanNguyenSi/agent-tasks): the
|
|
364
|
-
MCP-registered task platform whose registration + health appear in
|
|
365
|
-
`harness describe`.
|
|
366
|
-
- [`agent-preflight`](https://github.com/LanNguyenSi/agent-preflight):
|
|
367
|
-
local preflight validator; the canonical implementation of
|
|
368
|
-
preflight-hook content harness wires.
|
|
369
|
-
- [`codebase-oracle`](https://github.com/LanNguyenSi/codebase-oracle):
|
|
370
|
-
an opt-in MCP surface for multi-repo RAG search. Not in the Full
|
|
371
|
-
default; operators wire it via `harness add mcp codebase-oracle
|
|
372
|
-
--command codebase-oracle,mcp`.
|
|
373
|
-
- [`agent-dx`](https://github.com/LanNguyenSi/agent-dx): ships
|
|
374
|
-
`git-batch-cli`, a day-to-day tool whose inventory appears in
|
|
375
|
-
`harness describe`.
|
|
255
|
+
- [`agent-grounding`](https://github.com/LanNguyenSi/agent-grounding): evidence-ledger, claim-gate, review-claim-gate; `grounding-mcp` is the canonical client surface harness queries through `queryLedgerByTag`.
|
|
256
|
+
- [`agent-memory`](https://github.com/LanNguyenSi/agent-memory): the memory surfaces the control plane inventories.
|
|
257
|
+
- [`agent-tasks`](https://github.com/LanNguyenSi/agent-tasks): MCP-registered task platform whose registration + health appear in `harness describe`.
|
|
258
|
+
- [`agent-preflight`](https://github.com/LanNguyenSi/agent-preflight): local preflight validator; the canonical implementation of preflight-hook content harness wires.
|
|
259
|
+
- [`codebase-oracle`](https://github.com/LanNguyenSi/codebase-oracle): opt-in MCP for multi-repo RAG search; not in Full, wire via `harness add mcp codebase-oracle --command codebase-oracle,mcp`.
|
|
260
|
+
- [`agent-dx`](https://github.com/LanNguyenSi/agent-dx): ships `git-batch-cli`, a day-to-day tool whose inventory appears in `harness describe`.
|
|
376
261
|
|
|
377
262
|
## License
|
|
378
263
|
|
|
@@ -60,8 +60,18 @@ async function writeLedgerTag(manifest, sessionId, content, opts) {
|
|
|
60
60
|
* from a silent dead end to a "hook fired but parse failed because X"
|
|
61
61
|
* pointer. Best-effort: any I/O error is swallowed and we report no
|
|
62
62
|
* parse-error, mirroring the listPersistedReports contract.
|
|
63
|
+
*
|
|
64
|
+
* `sessionId` filter (agent-tasks/b13205b2): each parse-error log's JSON
|
|
65
|
+
* header carries the `sessionId` of the session that produced it. The
|
|
66
|
+
* lookup used to return the directory-newest log regardless of whose
|
|
67
|
+
* session wrote it, so a stale parse-error from a previous session would
|
|
68
|
+
* surface in the current operator's approve output and read like a
|
|
69
|
+
* failure of THEIR session. Logs whose header sessionId does not match
|
|
70
|
+
* `sessionId` are now skipped entirely. Logs missing a `sessionId` field
|
|
71
|
+
* (or whose header is not JSON) are also skipped, since we cannot
|
|
72
|
+
* attribute them.
|
|
63
73
|
*/
|
|
64
|
-
function findLatestParseError(dir) {
|
|
74
|
+
function findLatestParseError(dir, sessionId) {
|
|
65
75
|
let names;
|
|
66
76
|
try {
|
|
67
77
|
names = fs.readdirSync(dir);
|
|
@@ -69,7 +79,7 @@ function findLatestParseError(dir) {
|
|
|
69
79
|
catch {
|
|
70
80
|
return null;
|
|
71
81
|
}
|
|
72
|
-
|
|
82
|
+
const candidates = [];
|
|
73
83
|
for (const name of names) {
|
|
74
84
|
if (!name.endsWith(".log"))
|
|
75
85
|
continue;
|
|
@@ -83,42 +93,48 @@ function findLatestParseError(dir) {
|
|
|
83
93
|
}
|
|
84
94
|
if (!stat.isFile())
|
|
85
95
|
continue;
|
|
86
|
-
|
|
87
|
-
newest = { filePath: full, mtimeMs: stat.mtimeMs };
|
|
88
|
-
}
|
|
89
|
-
}
|
|
90
|
-
if (!newest)
|
|
91
|
-
return null;
|
|
92
|
-
let raw;
|
|
93
|
-
try {
|
|
94
|
-
raw = fs.readFileSync(newest.filePath, "utf8");
|
|
95
|
-
}
|
|
96
|
-
catch {
|
|
97
|
-
return { filePath: newest.filePath, summary: "<unreadable>" };
|
|
96
|
+
candidates.push({ filePath: full, mtimeMs: stat.mtimeMs });
|
|
98
97
|
}
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
98
|
+
candidates.sort((a, b) => b.mtimeMs - a.mtimeMs);
|
|
99
|
+
for (const cand of candidates) {
|
|
100
|
+
let raw;
|
|
101
|
+
try {
|
|
102
|
+
raw = fs.readFileSync(cand.filePath, "utf8");
|
|
103
|
+
}
|
|
104
|
+
catch {
|
|
105
|
+
continue;
|
|
106
|
+
}
|
|
107
|
+
// The standalone package writes a JSON header followed by `--- raw ---`
|
|
108
|
+
// and the original assistant text. Read the header for a `message`,
|
|
109
|
+
// `reason`, or `missing` field; fall back to the first line if the
|
|
110
|
+
// schema is unfamiliar so a future format change still surfaces
|
|
111
|
+
// *something* rather than going silent.
|
|
112
|
+
const header = raw.split("\n--- raw ---")[0] ?? raw;
|
|
113
|
+
let summary = (header.split("\n")[0] ?? "").trim();
|
|
114
|
+
let headerSessionId = null;
|
|
115
|
+
try {
|
|
116
|
+
const parsed = JSON.parse(header);
|
|
117
|
+
if (typeof parsed["sessionId"] === "string") {
|
|
118
|
+
headerSessionId = parsed["sessionId"];
|
|
119
|
+
}
|
|
120
|
+
if (typeof parsed["message"] === "string" && parsed["message"].length > 0) {
|
|
121
|
+
summary = parsed["message"];
|
|
122
|
+
}
|
|
123
|
+
else if (typeof parsed["reason"] === "string") {
|
|
124
|
+
const missing = Array.isArray(parsed["missing"])
|
|
125
|
+
? ` (missing: ${parsed["missing"].filter((m) => typeof m === "string").join(", ")})`
|
|
126
|
+
: "";
|
|
127
|
+
summary = `${parsed["reason"]}${missing}`;
|
|
128
|
+
}
|
|
110
129
|
}
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
? ` (missing: ${parsed["missing"].filter((m) => typeof m === "string").join(", ")})`
|
|
114
|
-
: "";
|
|
115
|
-
summary = `${parsed["reason"]}${missing}`;
|
|
130
|
+
catch {
|
|
131
|
+
/* keep the first-line fallback; headerSessionId stays null */
|
|
116
132
|
}
|
|
133
|
+
if (headerSessionId !== sessionId)
|
|
134
|
+
continue;
|
|
135
|
+
return { filePath: cand.filePath, summary };
|
|
117
136
|
}
|
|
118
|
-
|
|
119
|
-
/* keep the first-line fallback */
|
|
120
|
-
}
|
|
121
|
-
return { filePath: newest.filePath, summary };
|
|
137
|
+
return null;
|
|
122
138
|
}
|
|
123
139
|
function rewriteReportApproved(filePath, approvedAt, approvedBy) {
|
|
124
140
|
const raw = fs.readFileSync(filePath, "utf8");
|
|
@@ -247,7 +263,7 @@ export async function approveUnderstanding(opts = {}) {
|
|
|
247
263
|
// fired but the parser rejected the report — here is why", rather
|
|
248
264
|
// than a silent dead end.
|
|
249
265
|
const parseErrorsDir = path.join(path.dirname(reportsDir), "parse-errors");
|
|
250
|
-
const latestParseError = findLatestParseError(parseErrorsDir);
|
|
266
|
+
const latestParseError = findLatestParseError(parseErrorsDir, sessionId);
|
|
251
267
|
let reason;
|
|
252
268
|
if (reports.length === 0) {
|
|
253
269
|
reason = `no reports found at ${reportsDir}`;
|