@onlooker-community/ecosystem 0.24.0 → 0.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. package/.claude-plugin/marketplace.json +13 -0
  2. package/.claude-plugin/plugin.json +1 -1
  3. package/.release-please-manifest.json +3 -2
  4. package/CHANGELOG.md +7 -0
  5. package/CLAUDE.md +1 -0
  6. package/package.json +3 -3
  7. package/plugins/assayer/.claude-plugin/plugin.json +14 -0
  8. package/plugins/assayer/CHANGELOG.md +10 -0
  9. package/plugins/assayer/README.md +114 -0
  10. package/plugins/assayer/config.json +14 -0
  11. package/plugins/assayer/docs/adr/001-verify-claims-against-transcript-evidence.md +57 -0
  12. package/plugins/assayer/docs/design.md +72 -0
  13. package/plugins/assayer/hooks/hooks.json +15 -0
  14. package/plugins/assayer/scripts/hooks/assayer-stop.sh +249 -0
  15. package/plugins/assayer/scripts/lib/assayer-config.sh +88 -0
  16. package/plugins/assayer/scripts/lib/assayer-events.sh +85 -0
  17. package/plugins/assayer/scripts/lib/assayer-extract.sh +87 -0
  18. package/plugins/assayer/scripts/lib/assayer-project-key.sh +69 -0
  19. package/plugins/assayer/scripts/lib/assayer-transcript.sh +99 -0
  20. package/plugins/assayer/scripts/lib/assayer-ulid.sh +46 -0
  21. package/plugins/assayer/scripts/lib/assayer-verify.sh +95 -0
  22. package/release-please-config.json +16 -0
  23. package/test/bats/assayer-config.bats +60 -0
  24. package/test/bats/assayer-events.bats +99 -0
  25. package/test/bats/assayer-extract.bats +76 -0
  26. package/test/bats/assayer-project-key.bats +58 -0
  27. package/test/bats/assayer-stop-hook.bats +81 -0
  28. package/test/bats/assayer-transcript.bats +72 -0
  29. package/test/bats/assayer-ulid.bats +31 -0
  30. package/test/bats/assayer-verify.bats +89 -0
@@ -163,6 +163,19 @@
163
163
  "license": "MIT",
164
164
  "keywords": ["memory", "episodic", "transcript", "indexing", "session", "retrieval"],
165
165
  "tags": ["memory", "context-engineering"]
166
+ },
167
+ {
168
+ "name": "assayer",
169
+ "source": "./plugins/assayer",
170
+ "description": "Claim verification. At session end, parses the agent's final message for testable success claims (\"I ran the tests, they pass\", \"the build is green\") and cross-checks each against the actual command results in the session transcript, classifying it corroborated, contradicted, or unverifiable. Catches lying-without-malice. Advisory by default. Requires the ecosystem plugin.",
171
+ "author": {
172
+ "name": "Onlooker Community"
173
+ },
174
+ "homepage": "https://onlooker.dev",
175
+ "repository": "https://github.com/onlooker-community/ecosystem",
176
+ "license": "MIT",
177
+ "keywords": ["verification", "claims", "exit-codes", "honesty", "testing", "transcript"],
178
+ "tags": ["verification", "testing"]
166
179
  }
167
180
  ]
168
181
  }
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "ecosystem",
3
- "version": "0.24.0",
3
+ "version": "0.25.0",
4
4
  "description": "Observability substrate for Claude Code. Provides the shared ~/.onlooker/ storage root, canonical schema-validated event emission, session and tool tracking hooks, and prompt rules. Required by all other Onlooker plugins.",
5
5
  "author": {
6
6
  "name": "Onlooker Community",
@@ -1,5 +1,5 @@
1
1
  {
2
- ".": "0.24.0",
2
+ ".": "0.25.0",
3
3
  "plugins/archivist": "0.1.0",
4
4
  "plugins/tribunal": "1.0.1",
5
5
  "plugins/echo": "0.2.0",
@@ -11,5 +11,6 @@
11
11
  "plugins/warden": "0.2.0",
12
12
  "plugins/librarian": "0.2.0",
13
13
  "plugins/curator": "0.1.0",
14
- "plugins/historian": "0.2.0"
14
+ "plugins/historian": "0.2.0",
15
+ "plugins/assayer": "1.0.0"
15
16
  }
package/CHANGELOG.md CHANGED
@@ -1,5 +1,12 @@
1
1
  # Changelog
2
2
 
3
+ ## [0.25.0](https://github.com/onlooker-community/ecosystem/compare/ecosystem-v0.24.0...ecosystem-v0.25.0) (2026-06-04)
4
+
5
+
6
+ ### Features
7
+
8
+ * **assayer:** introduce claim-verification plugin ([#70](https://github.com/onlooker-community/ecosystem/issues/70)) ([1d0500b](https://github.com/onlooker-community/ecosystem/commit/1d0500b64f8cd670d1cfa1ac070182d72696bdfd))
9
+
3
10
  ## [0.24.0](https://github.com/onlooker-community/ecosystem/compare/ecosystem-v0.23.1...ecosystem-v0.24.0) (2026-06-04)
4
11
 
5
12
 
package/CLAUDE.md CHANGED
@@ -37,6 +37,7 @@ scripts/lib/onlooker-event.mjs ← canonical event builder; all plugins route t
37
37
  | governor | SessionStart, PreToolUse (Task), PostToolUse (Task), Stop | Budget gates on subagent spawns; tracks spend per session |
38
38
  | tribunal | Stop + skill invocation | Post-task quality gate; also invokable via `/tribunal` |
39
39
  | warden | PostToolUse (WebFetch, Read), PreToolUse (Write, Edit, MultiEdit, Bash), SessionStart + skill invocation | Scans ingested content for injection; closes a content gate that blocks write-class tools until cleared via `/warden` |
40
+ | assayer | Stop | Verifies the agent's final-message claims against actual command results in the transcript; advisory |
40
41
 
41
42
  Plugins communicate by emitting events to the JSONL log — they do not call each other directly. All plugins depend on the ecosystem substrate; no plugin depends on another plugin directly.
42
43
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@onlooker-community/ecosystem",
3
- "version": "0.24.0",
3
+ "version": "0.25.0",
4
4
  "description": "Agents, skills, hooks, commands, rules, and MCP configurations that power [Onlooker](https://onlooker.dev)",
5
5
  "author": {
6
6
  "name": "Onlooker Community",
@@ -19,14 +19,14 @@
19
19
  "onlooker-install": "install.sh"
20
20
  },
21
21
  "dependencies": {
22
- "@onlooker-community/schema": "^2.5.0"
22
+ "@onlooker-community/schema": "^2.6.0"
23
23
  },
24
24
  "scripts": {
25
25
  "postinstall": "echo '\\n onlooker-ecosystem installed!\\n Run: npx onlooker-install typescript\\n Docs: https://github.com/onlooker-community/ecosystem\\n'",
26
26
  "test": "npm run test:bats && npm run test:schema",
27
27
  "test:bats": "bats test/bats",
28
28
  "test:schema": "node --test test/node/*.test.mjs",
29
- "test:shellcheck": "shellcheck -S error -x install.sh scripts/common.sh scripts/hooks/*.sh scripts/lib/*.sh plugins/archivist/scripts/hooks/*.sh plugins/archivist/scripts/lib/*.sh plugins/tribunal/scripts/hooks/*.sh plugins/tribunal/scripts/lib/*.sh plugins/echo/scripts/hooks/*.sh plugins/echo/scripts/lib/*.sh plugins/governor/scripts/hooks/*.sh plugins/governor/scripts/lib/*.sh plugins/compass/scripts/hooks/*.sh plugins/compass/scripts/lib/*.sh plugins/scribe/scripts/hooks/*.sh plugins/scribe/scripts/lib/*.sh plugins/counsel/scripts/hooks/*.sh plugins/counsel/scripts/lib/*.sh plugins/warden/scripts/hooks/*.sh plugins/warden/scripts/lib/*.sh plugins/librarian/scripts/hooks/*.sh plugins/librarian/scripts/lib/*.sh plugins/curator/scripts/hooks/*.sh plugins/curator/scripts/lib/*.sh plugins/historian/scripts/hooks/*.sh plugins/historian/scripts/lib/*.sh",
29
+ "test:shellcheck": "shellcheck -S error -x install.sh scripts/common.sh scripts/hooks/*.sh scripts/lib/*.sh plugins/archivist/scripts/hooks/*.sh plugins/archivist/scripts/lib/*.sh plugins/tribunal/scripts/hooks/*.sh plugins/tribunal/scripts/lib/*.sh plugins/echo/scripts/hooks/*.sh plugins/echo/scripts/lib/*.sh plugins/governor/scripts/hooks/*.sh plugins/governor/scripts/lib/*.sh plugins/compass/scripts/hooks/*.sh plugins/compass/scripts/lib/*.sh plugins/scribe/scripts/hooks/*.sh plugins/scribe/scripts/lib/*.sh plugins/counsel/scripts/hooks/*.sh plugins/counsel/scripts/lib/*.sh plugins/warden/scripts/hooks/*.sh plugins/warden/scripts/lib/*.sh plugins/librarian/scripts/hooks/*.sh plugins/librarian/scripts/lib/*.sh plugins/curator/scripts/hooks/*.sh plugins/curator/scripts/lib/*.sh plugins/historian/scripts/hooks/*.sh plugins/historian/scripts/lib/*.sh plugins/assayer/scripts/hooks/*.sh plugins/assayer/scripts/lib/*.sh",
30
30
  "lint:references": "node scripts/lint/check-references.mjs",
31
31
  "lint:manifests": "node scripts/lint/check-manifests.mjs",
32
32
  "coverage:node": "node scripts/coverage/run-coverage.mjs",
@@ -0,0 +1,14 @@
1
+ {
2
+ "name": "assayer",
3
+ "version": "1.0.0",
4
+ "description": "Claim verification. At session end, parses the agent's final message for testable claims (\"I ran the tests, they pass\", \"the build is green\") and checks each against the actual command results in the session transcript, classifying it corroborated, contradicted, or unverifiable. Catches lying-without-malice. Advisory by default. Builds on the Onlooker ecosystem plugin.",
5
+ "author": {
6
+ "name": "Onlooker Community",
7
+ "url": "https://onlooker.dev"
8
+ },
9
+ "homepage": "https://onlooker.dev",
10
+ "repository": "https://github.com/onlooker-community/ecosystem",
11
+ "license": "MIT",
12
+ "skills": [],
13
+ "agents": []
14
+ }
@@ -0,0 +1,10 @@
1
+ # Changelog
2
+
3
+ ## 1.0.0 (2026-06-04)
4
+
5
+
6
+ ### Features
7
+
8
+ * **assayer:** introduce claim-verification plugin ([#70](https://github.com/onlooker-community/ecosystem/issues/70)) ([1d0500b](https://github.com/onlooker-community/ecosystem/commit/1d0500b64f8cd670d1cfa1ac070182d72696bdfd))
9
+
10
+ ## Changelog
@@ -0,0 +1,114 @@
1
+ # Assayer
2
+
3
+ Claim verification — does the agent's story match the session's receipts?
4
+
5
+ When an agent finishes, it tells you what it did: "I ran the tests, they pass," "the build is green," "lint is clean." Assayer treats those as **testable claims** and checks each against what actually happened in the session — the Bash commands that ran and whether they errored. A claim that a failing run contradicts is surfaced, not silently trusted. This catches lying-without-malice: the agent isn't deceiving you, it just misremembered, or assumed, or never re-ran after a change.
6
+
7
+ Assayer is a sibling plugin to [`ecosystem`](../../) and assumes the Onlooker observability substrate (`~/.onlooker/`) is present.
8
+
9
+ ## How it works
10
+
11
+ | Hook | What Assayer does |
12
+ |------|-------------------|
13
+ | `Stop` | Reads the just-finished session's transcript (`transcript_path`). Extracts the agent's testable success claims from its final message with a single `claude -p` pass, cross-checks each against the actual Bash command results in the same transcript, and emits an event for each contradicted or unverified claim plus an audit summary (corroborated claims are only counted in the summary). Advisory only — always exits 0, never blocks Stop. |
14
+
15
+ The pipeline:
16
+
17
+ ```
18
+ Stop → Transcript Reader → Claim Extractor (claude -p) → Deterministic Verifier → Events
19
+ ```
20
+
21
+ - **Transcript reader** pulls two things from the JSONL transcript: the **final assistant message** (where claims live) and every **Bash command** paired with its result status. Claude Code records a command as a `tool_use` block and its outcome as a `tool_result` carrying an `is_error` flag — there is no per-call numeric exit code, so `is_error` is the success/failure signal.
22
+ - **Claim extractor** (LLM) reads only the final message and identifies success claims, tagging each with a `type` (`tests_pass`, `build_succeeds`, `lint_clean`, `types_check`, `command_succeeds`, `generic`) and a `command_keyword` — the substring it expects in the verifying command. The LLM does **not** judge truth; it only identifies claims and what would settle them.
23
+ - **Verifier** (deterministic bash) is the factual half: for each claim it finds the most recent command matching the claim's keywords and reads its `is_error`. Same inputs always produce the same verdict.
24
+
25
+ ## Verdicts
26
+
27
+ | Verdict | Meaning |
28
+ |---------|---------|
29
+ | **corroborated** | A matching command ran and succeeded. |
30
+ | **contradicted** | A matching command ran and **failed** — the claim is not backed by the evidence. |
31
+ | **unverified** | No matching command (`no_matching_command`), or the claim implies no checkable command (`ambiguous`). |
32
+
33
+ The most **recent** matching command wins: an agent may fail, fix, and re-run, and the last run reflects the state the final message describes.
34
+
35
+ ## Activation
36
+
37
+ Assayer is **off by default** — it calls `claude -p` on every Stop, so it is opt-in. Enable per-project in `.claude/settings.json`:
38
+
39
+ ```json
40
+ {
41
+ "assayer": {
42
+ "enabled": true
43
+ }
44
+ }
45
+ ```
46
+
47
+ Or globally in `~/.claude/settings.json`.
48
+
49
+ ## Configuration
50
+
51
+ All keys are optional. Unset keys fall back to the plugin's `config.json` defaults.
52
+
53
+ ```json
54
+ {
55
+ "assayer": {
56
+ "enabled": false,
57
+ "evaluation": {
58
+ "model": "claude-haiku-4-5-20251001",
59
+ "timeout_seconds": 60
60
+ },
61
+ "max_claims": 12,
62
+ "min_confidence": 0.5,
63
+ "final_message_chars": 6000
64
+ }
65
+ }
66
+ ```
67
+
68
+ | Key | Default | Description |
69
+ |-----|---------|-------------|
70
+ | `enabled` | `false` | Must be `true` for any audit to run. |
71
+ | `evaluation.model` | `claude-haiku-4-5-20251001` | Model used for claim extraction. Haiku is fast and cheap; the task is structured and shallow. |
72
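+ | `evaluation.timeout_seconds` | `60` | Per-pass wall-clock timeout passed to the `timeout` command (or `gtimeout` when only that is installed). If neither is on `PATH`, the extraction pass runs without a timeout. |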
73
+ | `max_claims` | `12` | Maximum number of claims to extract from a final message. |
74
+ | `min_confidence` | `0.5` | Claims the extractor scores below this are dropped before verification. |
75
+ | `final_message_chars` | `6000` | How many characters of the final assistant message to feed into extraction. |
76
+
77
+ ## Storage layout
78
+
79
+ ```text
80
+ ~/.onlooker/assayer/<project-key>/
81
+ └── audit-<session-id>.json # advisory summary written at end of each audit
82
+ ```
83
+
84
+ Each audit file records the claim count, the corroborated / contradicted / unverified tallies, the overall verdict, and the per-claim list for review in the next session.
85
+
86
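+ Project key: first 12 hex chars of SHA256 of `git remote get-url origin`, falling back to a hash of the repo root realpath when no `origin` remote exists. The remote-based key is stable across directory moves, clones, and worktrees of the same repo; the realpath fallback is tied to the directory's location.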
87
+
88
+ ## Events emitted
89
+
90
+ Assayer emits the canonical `assayer.*` event surface from [`@onlooker-community/schema`](https://github.com/onlooker-community/schema). All events land in `~/.onlooker/logs/onlooker-events.jsonl` and are validated against the schema before write.
91
+
92
+ | Event | When |
93
+ |-------|------|
94
+ | `assayer.audit.started` | Before verification begins. Includes `claim_count` and `command_count`. |
95
+ | `assayer.claim.contradicted` | A claim is contradicted by a failing command. Includes the `claim`, the `evidence_command`, and a `result_excerpt`. |
96
+ | `assayer.claim.unverified` | A claim has no supporting evidence (`reason`: `no_matching_command` or `ambiguous`). |
97
+ | `assayer.audit.complete` | After all claims are checked. Includes the tallies, the `verdict` (`clean`, `contradictions_found`, `nothing_to_verify`), and `duration_ms`. |
98
+
99
+ Corroborated claims are counted in the summary rather than emitted individually — the happy path is the quiet path.
100
+
101
+ ## Requirements
102
+
103
+ - The `ecosystem` plugin installed (for the `~/.onlooker/` substrate and canonical event emission).
104
+ - A release of `@onlooker-community/schema` that includes the `assayer.*` event types (the emitter validates every envelope against the installed schema; older versions reject `assayer.*`).
105
+ - `claude` CLI on `PATH` (the hook shells out to `claude -p` for the extraction pass).
106
+ - `jq` for JSON manipulation.
107
+ - `node` for canonical-event emission.
108
+ - `python3` for millisecond timestamps (standard on macOS and most Linux distributions).
109
+
110
+ ## Architecture decisions
111
+
112
+ Key decisions made during initial design are recorded in [`docs/adr/`](docs/adr/):
113
+
114
+ - [ADR-001](docs/adr/001-verify-claims-against-transcript-evidence.md) — Verify claims at Stop against transcript evidence (and why `is_error`, not exit codes, and why advisory)
@@ -0,0 +1,14 @@
1
+ {
2
+ "plugin_name": "assayer",
3
+ "storage_path": "~/.onlooker",
4
+ "assayer": {
5
+ "enabled": false,
6
+ "evaluation": {
7
+ "model": "claude-haiku-4-5-20251001",
8
+ "timeout_seconds": 60
9
+ },
10
+ "max_claims": 12,
11
+ "min_confidence": 0.5,
12
+ "final_message_chars": 6000
13
+ }
14
+ }
@@ -0,0 +1,57 @@
1
+ # ADR-001: Verify Claims at Stop Against Transcript Evidence
2
+
3
+ - Status: Accepted
4
+ - Date: 2026-06-04
5
+ - Deciders: Meagan
6
+ - Tags: assayer, verification, stop-hook, transcript, honesty
7
+
8
+ ## Context and Problem Statement
9
+
10
+ Every other plugin in the ecosystem assumes the agent's account of its own work is true. Tribunal judges the output, echo scores the prompt, governor counts the spend — but none of them check the most basic thing: when the agent says "I ran the tests and they pass," did the tests actually pass?
11
+
12
+ This failure mode is not malice. An agent claims success because it intended to verify, or it verified an earlier revision, or it ran the command, saw red, fixed something, and never re-ran. The final message reflects a belief, and the belief can be stale. The session transcript already holds the ground truth — the commands that ran and whether they errored — but nothing reconciles the two.
13
+
14
+ The question: **how do we check the agent's claims against what actually happened, cheaply and without false alarms?**
15
+
16
+ ## Decision Drivers
17
+
18
+ - The evidence (commands + results) must already exist and be trustworthy — not reconstructed or re-run.
19
+ - Verification must be deterministic: the same session must always produce the same verdict.
20
+ - False positives are expensive. Flagging a true claim as a lie destroys trust in the plugin faster than missing a false one.
21
+ - It must not interrupt the user's flow for an advisory signal.
22
+
23
+ ## Decision
24
+
25
+ **Assayer runs at `Stop`, reads the session transcript, and reconciles the agent's final-message claims against the Bash command results recorded in that same transcript.**
26
+
27
+ Four sub-decisions follow:
28
+
29
+ ### 1. Stop, reading the committed transcript
30
+
31
+ `Stop` fires once the turn is over and the transcript is fully written to disk — the same `transcript_path` tribunal and compass read. There is no timing-skew risk: every command the agent ran, and every result, is already on disk before assayer looks. Running earlier (e.g. `PostToolUse`) would mean verifying claims that have not been made yet.
32
+
33
+ ### 2. `is_error`, not exit codes
34
+
35
+ Claude Code's transcript represents a command as a `tool_use` block and its outcome as a `tool_result` carrying an `is_error` boolean. It does **not** expose a per-call numeric exit code. So `is_error` is the success/failure signal: a claim of success is contradicted when the matching command's `is_error` is true. The schema's `assayer.claim.contradicted` payload reflects this honestly — `evidence_command` is required, `exit_code` is optional (populated only when a code is recoverable from output), and a `result_excerpt` captures the failing output for the reader.
36
+
37
+ ### 3. Split the work: LLM identifies, bash verifies
38
+
39
+ Claim extraction is a language problem — what counts as a testable success claim, and what command would settle it — so an LLM (`claude -p`, Haiku) does it. The factual cross-check is not a language problem; it is a lookup. So a deterministic bash/jq verifier matches each claim to the most recent command containing its keywords and reads `is_error`. The LLM never judges truth; the verifier never interprets language. This keeps the verdict reproducible and unit-testable, and confines the non-determinism to the one step that genuinely needs it.
40
+
41
+ ### 4. Advisory, not blocking
42
+
43
+ Assayer always exits 0. A contradicted claim is emitted as an event and written to an advisory file, not used to block `Stop`. The turn is already over; the high-value action is a durable, queryable signal ("the agent claimed X; the evidence says otherwise"), not interrupting a finished session. A blocking/enforce mode that re-prompts the agent on contradiction is a plausible future opt-in, but the safe default is advisory.
44
+
45
+ ## Consequences
46
+
47
+ **Positive**
48
+
49
+ - Closes the "did it actually work?" gap with zero new infrastructure — the evidence is already in the transcript.
50
+ - Deterministic and testable: the verifier is pure bash/jq with no LLM in the factual path.
51
+ - Off by default and advisory, so it can never block or surprise a session it was not invited to.
52
+
53
+ **Negative / accepted trade-offs**
54
+
55
+ - Keyword matching is heuristic. A claim whose verifying command uses unexpected wording falls to `unverified` rather than being checked — a miss, not a false alarm, which is the safer direction to err.
56
+ - `is_error` is coarser than an exit code. A command that exits 0 but prints failures (a test runner that swallows its own status) reads as success. Documented; acceptable for the initial release (1.0.0).
57
+ - One `claude -p` call per Stop. This is why the plugin is off by default.
@@ -0,0 +1,72 @@
1
+ # Assayer — Design
2
+
3
+ Assayer is the verification layer of the Onlooker ecosystem. Where tribunal judges *quality* and echo tracks *prompt drift*, assayer answers a narrower, more literal question: **did the things the agent said it did actually happen?**
4
+
5
+ ## The problem: lying-without-malice
6
+
7
+ An agent's final message is a self-report. It says "tests pass," "build is green," "lint is clean." These read as facts but are really *beliefs*, and beliefs drift from reality in ordinary ways:
8
+
9
+ - The agent ran the check against an earlier revision, then changed code, and never re-ran.
10
+ - It intended to run the check and narrated as if it had.
11
+ - It misread a noisy command's output.
12
+
13
+ None of this is deception. But a user who trusts the self-report ships on a false premise. The session transcript already contains the ground truth — every command and its result — so the gap is purely one of reconciliation.
14
+
15
+ ## Pipeline
16
+
17
+ ```
18
+ Stop
19
+ → Transcript Reader (final message + commands-with-status)
20
+ → Claim Extractor (claude -p, Haiku — language understanding)
21
+ → Deterministic Verifier (bash/jq — factual lookup)
22
+ → Events + advisory file
23
+ ```
24
+
25
+ ### Transcript reader (`assayer-transcript.sh`)
26
+
27
+ Two extractions from the JSONL transcript at `transcript_path`:
28
+
29
+ - `assayer_final_assistant_message` — the text blocks of the last assistant turn that contains any, truncated to `final_message_chars`. This is where claims live.
30
+ - `assayer_collect_commands` — every `Bash` `tool_use` joined to its `tool_result` by `tool_use_id`, yielding `{ command, is_error, excerpt }`. Claude Code does not record a numeric exit code per call; `is_error` is the success/failure signal. See ADR-001.
31
+
32
+ ### Claim extractor (`assayer-extract.sh`)
33
+
34
+ A single `claude -p` pass reads only the final message and returns a JSON array of claims, each `{ text, type, command_keyword, confidence }`. `type` is one of `tests_pass | build_succeeds | lint_clean | types_check | command_succeeds | generic`. The model identifies claims and what command would settle them; it does not judge whether they are true. `assayer_parse_claims` strips fences, validates the array, coerces unknown types to `generic`, and drops entries without text.
35
+
36
+ ### Deterministic verifier (`assayer-verify.sh`)
37
+
38
+ `assayer_classify_claim` derives keywords from the claim's `type` (e.g. `tests_pass → ["test"]`) plus the LLM-supplied `command_keyword`, finds the **most recent** command containing any keyword, and classifies on its `is_error`:
39
+
40
+ - matched + not errored → **corroborated**
41
+ - matched + errored → **contradicted**
42
+ - no match → **unverified** (`no_matching_command`, or `ambiguous` when the claim implies no checkable command)
43
+
44
+ "Most recent wins" handles the fail-fix-rerun pattern: the last run reflects the final state. The function is pure — no LLM, no filesystem — so it is fully unit-tested.
45
+
46
+ `assayer_audit_verdict` rolls the per-claim counts into `clean`, `contradictions_found`, or `nothing_to_verify`.
47
+
48
+ ## Events
49
+
50
+ | Event | Payload highlights |
51
+ |-------|--------------------|
52
+ | `assayer.audit.started` | `audit_id`, `claim_count`, `command_count`, `trigger` |
53
+ | `assayer.claim.contradicted` | `claim`, `claim_type`, `evidence_command`, `result_excerpt`, `confidence` |
54
+ | `assayer.claim.unverified` | `claim`, `claim_type`, `reason` |
55
+ | `assayer.audit.complete` | `corroborated`, `contradicted`, `unverified`, `verdict`, `duration_ms` |
56
+
57
+ Corroborated claims are counted in the summary, not emitted individually — the happy path stays quiet.
58
+
59
+ ## Non-goals (initial release, 1.0.0)
60
+
61
+ - **Blocking.** Assayer is advisory; it never blocks `Stop`. An enforce mode is a future opt-in.
62
+ - **Re-running commands.** Assayer reconciles against what already ran; it does not execute anything.
63
+ - **Parsing exit codes from output.** It relies on `is_error`. A command that exits 0 while printing failures reads as success.
64
+ - **Non-Bash evidence.** Only `Bash` results are treated as evidence today.
65
+
66
+ ## Relationship to other plugins
67
+
68
+ Assayer occupies the verification/execution layer, empty until now. It is complementary to:
69
+
70
+ - **tribunal** — judges whether the work is *good*; assayer checks whether the *claims about it* are true.
71
+ - **scribe** — writes the narrative of *why*; assayer checks the factual assertions in that narrative's neighbor, the final message.
72
+ - **counsel** — can consume `assayer.*` events to surface recurring honesty gaps over time.
@@ -0,0 +1,15 @@
1
+ {
2
+ "hooks": {
3
+ "Stop": [
4
+ {
5
+ "matcher": "*",
6
+ "hooks": [
7
+ {
8
+ "type": "command",
9
+ "command": "\"$CLAUDE_PLUGIN_ROOT\"/scripts/hooks/assayer-stop.sh"
10
+ }
11
+ ]
12
+ }
13
+ ]
14
+ }
15
+ }
@@ -0,0 +1,249 @@
1
+ #!/usr/bin/env bash
2
+ # Assayer Stop hook.
3
+ #
4
+ # Triggered by Stop. Off by default — gated on assayer.enabled in config.
5
+ # When enabled, it reads the just-finished session's transcript, extracts the
6
+ # agent's testable success claims from its final message, and cross-checks each
7
+ # against the actual Bash command results in the same transcript. Each claim is
8
+ # classified corroborated / contradicted / unverified and emitted as an event.
9
+ #
10
+ # Hook contract:
11
+ # - Always exits 0. Advisory only — never blocks Stop.
12
+ # - Skips silently if disabled, no git context, no transcript, or no claims.
13
+ # - Recursion guard: exits immediately if ASSAYER_NESTED=1 to prevent a
14
+ # claude -p subprocess from re-triggering this hook on its own Stop.
15
+ # - Errors from `claude -p` are swallowed; worst case is no audit written.
16
+
17
+ set -uo pipefail
18
+
19
+ # Recursion guard — must be first.
20
+ [[ "${ASSAYER_NESTED:-}" == "1" ]] && exit 0
21
+ export ASSAYER_NESTED=1
22
+
23
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
24
+ PLUGIN_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
25
+
26
+ # Resolve the ecosystem root (sibling to this plugin's parent).
27
+ _ECOSYSTEM_ROOT="${ONLOOKER_ECOSYSTEM_ROOT:-}"
28
+ if [[ -z "$_ECOSYSTEM_ROOT" ]]; then
29
+ _candidate="$(cd "${PLUGIN_ROOT}/../.." 2>/dev/null && pwd)"
30
+ if [[ -f "${_candidate}/scripts/lib/validate-path.sh" ]]; then
31
+ _ECOSYSTEM_ROOT="$_candidate"
32
+ fi
33
+ fi
34
+
35
+ if [[ -n "$_ECOSYSTEM_ROOT" && -f "${_ECOSYSTEM_ROOT}/scripts/lib/validate-path.sh" ]]; then
36
+ # shellcheck disable=SC1091
37
+ CLAUDE_PLUGIN_ROOT="$_ECOSYSTEM_ROOT" source "${_ECOSYSTEM_ROOT}/scripts/lib/validate-path.sh"
38
+ # shellcheck disable=SC1091
39
+ CLAUDE_PLUGIN_ROOT="$_ECOSYSTEM_ROOT" source "${_ECOSYSTEM_ROOT}/scripts/lib/onlooker-schema.sh"
40
+ fi
41
+
42
+ # shellcheck source=../lib/assayer-config.sh
43
+ source "${PLUGIN_ROOT}/scripts/lib/assayer-config.sh"
44
+ # shellcheck source=../lib/assayer-project-key.sh
45
+ source "${PLUGIN_ROOT}/scripts/lib/assayer-project-key.sh"
46
+ # shellcheck source=../lib/assayer-ulid.sh
47
+ source "${PLUGIN_ROOT}/scripts/lib/assayer-ulid.sh"
48
+ # shellcheck source=../lib/assayer-transcript.sh
49
+ source "${PLUGIN_ROOT}/scripts/lib/assayer-transcript.sh"
50
+ # shellcheck source=../lib/assayer-extract.sh
51
+ source "${PLUGIN_ROOT}/scripts/lib/assayer-extract.sh"
52
+ # shellcheck source=../lib/assayer-verify.sh
53
+ source "${PLUGIN_ROOT}/scripts/lib/assayer-verify.sh"
54
+ # shellcheck source=../lib/assayer-events.sh
55
+ CLAUDE_PLUGIN_ROOT="$PLUGIN_ROOT" source "${PLUGIN_ROOT}/scripts/lib/assayer-events.sh"
56
+
57
+ INPUT=$(cat)
58
+ CWD=$(printf '%s' "$INPUT" | jq -r '.cwd // ""' 2>/dev/null) || CWD=""
59
+ SESSION_ID=$(printf '%s' "$INPUT" | jq -r '.session_id // ""' 2>/dev/null) || SESSION_ID=""
60
+ TRANSCRIPT_PATH=$(printf '%s' "$INPUT" | jq -r '.transcript_path // ""' 2>/dev/null) || TRANSCRIPT_PATH=""
61
+ [[ -z "$TRANSCRIPT_PATH" ]] && TRANSCRIPT_PATH="${CLAUDE_TRANSCRIPT_PATH:-}"
62
+
63
+ export _HOOK_SESSION_ID="${SESSION_ID:-unknown}"
64
+
65
+ _done() { exit 0; }
66
+
67
+ # ---------------------------------------------------------------------------
68
+ # Config + prerequisites
69
+ # ---------------------------------------------------------------------------
70
+
71
+ REPO_ROOT=$(assayer_project_repo_root "$CWD")
72
+ [[ -z "$REPO_ROOT" ]] && _done
73
+
74
+ CLAUDE_PLUGIN_ROOT="$PLUGIN_ROOT" assayer_config_load "$REPO_ROOT"
75
+ assayer_config_enabled || _done
76
+
77
+ PROJECT_KEY=$(assayer_project_key "$CWD")
78
+ [[ -z "$PROJECT_KEY" ]] && _done
79
+
80
+ command -v claude >/dev/null 2>&1 || _done
81
+ command -v jq >/dev/null 2>&1 || _done
82
+
83
+ [[ -f "$TRANSCRIPT_PATH" ]] || _done
84
+
85
+ # ---------------------------------------------------------------------------
86
+ # Read transcript: final message + command evidence
87
+ # ---------------------------------------------------------------------------
88
+
89
+ FINAL_MESSAGE_CHARS=$(CLAUDE_PLUGIN_ROOT="$PLUGIN_ROOT" assayer_config_final_message_chars)
90
+ FINAL_MESSAGE=$(assayer_final_assistant_message "$TRANSCRIPT_PATH" "$FINAL_MESSAGE_CHARS")
91
+ [[ -z "$FINAL_MESSAGE" ]] && _done
92
+
93
+ COMMANDS=$(assayer_collect_commands "$TRANSCRIPT_PATH")
94
+ COMMAND_COUNT=$(printf '%s' "$COMMANDS" | jq 'length' 2>/dev/null) || COMMAND_COUNT=0
95
+
# ---------------------------------------------------------------------------
# Extract claims via claude -p
# ---------------------------------------------------------------------------

MAX_CLAIMS=$(CLAUDE_PLUGIN_ROOT="$PLUGIN_ROOT" assayer_config_max_claims)
MIN_CONFIDENCE=$(CLAUDE_PLUGIN_ROOT="$PLUGIN_ROOT" assayer_config_min_confidence)
EVAL_MODEL=$(CLAUDE_PLUGIN_ROOT="$PLUGIN_ROOT" assayer_config_model)
TIMEOUT_SECS=$(CLAUDE_PLUGIN_ROOT="$PLUGIN_ROOT" assayer_config_timeout)

# The prompt is written to a temp file and fed to claude on stdin; the EXIT
# trap removes the file however the script ends.
PROMPT_FILE=$(mktemp -t assayer-prompt.XXXXXX 2>/dev/null) || PROMPT_FILE="/tmp/assayer-prompt.$$"
trap 'rm -f "$PROMPT_FILE"' EXIT
assayer_build_extraction_prompt "$FINAL_MESSAGE" "$MAX_CLAIMS" >"$PROMPT_FILE"

CLAUDE_ARGS=(-p --max-turns 1)
[[ -n "$EVAL_MODEL" ]] && CLAUDE_ARGS+=(--model "$EVAL_MODEL")

# Pick whichever timeout binary is on PATH (`timeout`, else `gtimeout`). A
# plain string is used instead of an array so that the "no wrapper" case never
# expands an empty array, which is an error under `set -u` on older bash.
TIMEOUT_BIN=""
if command -v timeout >/dev/null 2>&1; then
  TIMEOUT_BIN="timeout"
elif command -v gtimeout >/dev/null 2>&1; then
  TIMEOUT_BIN="gtimeout"
fi

# Any failure (non-zero exit, timeout) is treated the same as "no response".
RESPONSE=""
if [[ -n "$TIMEOUT_BIN" ]]; then
  RESPONSE=$("$TIMEOUT_BIN" "$TIMEOUT_SECS" claude "${CLAUDE_ARGS[@]}" <"$PROMPT_FILE" 2>/dev/null) || RESPONSE=""
else
  RESPONSE=$(claude "${CLAUDE_ARGS[@]}" <"$PROMPT_FILE" 2>/dev/null) || RESPONSE=""
fi
[[ -z "$RESPONSE" ]] && _done

CLAIMS=$(assayer_parse_claims "$RESPONSE")
CLAIM_COUNT=$(printf '%s' "$CLAIMS" | jq 'length' 2>/dev/null) || CLAIM_COUNT=0
# jq exits 0 and prints nothing when its input is empty, so the `||` fallback
# above does not fire in that case. Force a plain non-negative integer so the
# value is always safe to pass to `jq --argjson` later on.
[[ "$CLAIM_COUNT" =~ ^[0-9]+$ ]] || CLAIM_COUNT=0

# ---------------------------------------------------------------------------
# Audit
# ---------------------------------------------------------------------------

AUDIT_ID=$(assayer_ulid)
# Start time in epoch milliseconds; 0 when python3 is missing or fails.
AUDIT_START=$(python3 -c 'import time; print(int(time.time()*1000))' 2>/dev/null || echo 0)

# Both counts are passed with --argjson, which rejects an empty string. Default
# each to 0 so an empty count cannot blank out the whole payload.
started_payload=$(jq -n \
  --arg audit_id "$AUDIT_ID" \
  --argjson claim_count "${CLAIM_COUNT:-0}" \
  --arg trigger "stop" \
  --argjson command_count "${COMMAND_COUNT:-0}" \
  '{audit_id: $audit_id, claim_count: $claim_count, trigger: $trigger, command_count: $command_count}')
# Event emission is best-effort: a failure here must not abort the audit.
assayer_emit_event "assayer.audit.started" "$started_payload" || true

# Per-project storage directory, rooted at $ONLOOKER_DIR or ~/.onlooker.
ONLOOKER_BASE="${ONLOOKER_DIR:-$HOME/.onlooker}"
ASSAYER_DIR="${ONLOOKER_BASE}/assayer/${PROJECT_KEY}"
mkdir -p "$ASSAYER_DIR" 2>/dev/null || true

count_corroborated=0
count_contradicted=0
count_unverified=0
checked_claims="[]"

# Strict JSON number grammar. Used to validate the extracted confidence before
# it is compared numerically or handed to `jq --argjson`.
json_number_re='^-?(0|[1-9][0-9]*)(\.[0-9]+)?([eE][-+]?[0-9]+)?$'

# One compact JSON claim per line; each is classified against the collected
# command evidence and tallied by verdict.
while IFS= read -r claim; do
  [[ -z "$claim" ]] && continue

  # Confidence floor — skip low-confidence extractions. A missing confidence
  # defaults to 0.6. A present but non-numeric value (e.g. "high") degrades
  # to 0: without this check awk would fall back to a *string* comparison, in
  # which "high" >= "0.6" is true, and the value would later break --argjson.
  conf=$(printf '%s' "$claim" | jq -r '.confidence // 0.6' 2>/dev/null) || conf="0.6"
  [[ "$conf" =~ $json_number_re ]] || conf=0

  # Values are bound with -v (never interpolated into the awk program) and
  # coerced with +0 so the comparison is always numeric.
  if ! awk -v a="$conf" -v b="$MIN_CONFIDENCE" 'BEGIN { exit !((a + 0) >= (b + 0)) }' 2>/dev/null; then
    continue
  fi

  claim_text=$(printf '%s' "$claim" | jq -r '.text // ""' 2>/dev/null) || claim_text=""
  claim_type=$(printf '%s' "$claim" | jq -r '.type // "generic"' 2>/dev/null) || claim_type="generic"
  [[ -z "$claim_text" ]] && continue

  verdict_obj=$(assayer_classify_claim "$claim" "$COMMANDS")
  verdict=$(printf '%s' "$verdict_obj" | jq -r '.verdict // "unverified"' 2>/dev/null) || verdict="unverified"

  case "$verdict" in
    contradicted)
      count_contradicted=$((count_contradicted + 1))
      evidence_command=$(printf '%s' "$verdict_obj" | jq -r '.evidence_command // ""' 2>/dev/null) || evidence_command=""
      excerpt=$(printf '%s' "$verdict_obj" | jq -r '.excerpt // ""' 2>/dev/null) || excerpt=""
      contradicted_payload=$(jq -n \
        --arg audit_id "$AUDIT_ID" \
        --arg claim "$claim_text" \
        --arg claim_type "$claim_type" \
        --arg evidence_command "$evidence_command" \
        --arg result_excerpt "$excerpt" \
        --argjson confidence "$conf" \
        '{audit_id: $audit_id, claim: $claim, claim_type: $claim_type,
          evidence_command: $evidence_command, result_excerpt: $result_excerpt,
          confidence: $confidence}')
      assayer_emit_event "assayer.claim.contradicted" "$contradicted_payload" || true
      ;;
    corroborated)
      count_corroborated=$((count_corroborated + 1))
      ;;
    *)
      # Any verdict other than the two above is counted as unverified.
      count_unverified=$((count_unverified + 1))
      reason=$(printf '%s' "$verdict_obj" | jq -r '.reason // "no_evidence"' 2>/dev/null) || reason="no_evidence"
      unverified_payload=$(jq -n \
        --arg audit_id "$AUDIT_ID" \
        --arg claim "$claim_text" \
        --arg claim_type "$claim_type" \
        --arg reason "$reason" \
        '{audit_id: $audit_id, claim: $claim, claim_type: $claim_type, reason: $reason}')
      assayer_emit_event "assayer.claim.unverified" "$unverified_payload" || true
      ;;
  esac

  # Append to the running list only when jq succeeded and produced output, so
  # one failed append cannot wipe the claims accumulated so far.
  if appended_claims=$(printf '%s' "$checked_claims" | jq -c \
    --arg text "$claim_text" \
    --arg verdict "$verdict" \
    '. + [{text: $text, verdict: $verdict}]' 2>/dev/null) && [[ -n "$appended_claims" ]]; then
    checked_claims=$appended_claims
  fi
done < <(printf '%s' "$CLAIMS" | jq -c '.[]' 2>/dev/null)

# ---------------------------------------------------------------------------
# Audit summary
# ---------------------------------------------------------------------------

# End time in epoch milliseconds (0 when python3 is missing or fails); the
# duration is clamped so it is never negative.
AUDIT_END=$(python3 -c 'import time; print(int(time.time()*1000))' 2>/dev/null || echo 0)
DURATION_MS=$((AUDIT_END - AUDIT_START))
[[ "$DURATION_MS" -lt 0 ]] && DURATION_MS=0

VERDICT=$(assayer_audit_verdict "$count_contradicted" "$count_corroborated" "$count_unverified")

# claim_count is defaulted because --argjson rejects an empty string, which
# would otherwise blank out the whole payload.
complete_payload=$(jq -n \
  --arg audit_id "$AUDIT_ID" \
  --argjson claim_count "${CLAIM_COUNT:-0}" \
  --argjson corroborated "$count_corroborated" \
  --argjson contradicted "$count_contradicted" \
  --argjson unverified "$count_unverified" \
  --arg verdict "$VERDICT" \
  --argjson duration_ms "$DURATION_MS" \
  '{audit_id: $audit_id, claim_count: $claim_count,
    corroborated: $corroborated, contradicted: $contradicted,
    unverified: $unverified, verdict: $verdict, duration_ms: $duration_ms}')
assayer_emit_event "assayer.audit.complete" "$complete_payload" || true

# Advisory file for review in the next session. The session id is reduced to
# [a-zA-Z0-9-] (everything else becomes "_") before it is used in a file name.
SAFE_SESSION_ID=$(printf '%s' "${SESSION_ID:-unknown}" | tr -c 'a-zA-Z0-9-' '_')

# Render the document first and write it only if jq succeeded, so a jq failure
# never leaves an empty or truncated advisory file behind.
advisory_json=$(jq -n \
  --arg audit_id "$AUDIT_ID" \
  --arg session_id "${SESSION_ID:-unknown}" \
  --argjson claim_count "${CLAIM_COUNT:-0}" \
  --argjson corroborated "$count_corroborated" \
  --argjson contradicted "$count_contradicted" \
  --argjson unverified "$count_unverified" \
  --arg verdict "$VERDICT" \
  --argjson claims "${checked_claims:-[]}" \
  '{audit_id: $audit_id, session_id: $session_id, claim_count: $claim_count,
    corroborated: $corroborated, contradicted: $contradicted,
    unverified: $unverified, verdict: $verdict, claims: $claims}' 2>/dev/null) || advisory_json=""

if [[ -n "$advisory_json" ]]; then
  # The braces put the redirection failure itself (e.g. missing directory)
  # under 2>/dev/null as well; writing the file stays best-effort.
  { printf '%s\n' "$advisory_json" >"${ASSAYER_DIR}/audit-${SAFE_SESSION_ID}.json"; } 2>/dev/null || true
fi

_done