@onlooker-community/ecosystem 0.24.0 → 0.25.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/.claude-plugin/marketplace.json +39 -13
  2. package/.claude-plugin/plugin.json +2 -2
  3. package/.release-please-manifest.json +5 -4
  4. package/CHANGELOG.md +14 -0
  5. package/CLAUDE.md +1 -0
  6. package/package.json +3 -3
  7. package/plugins/assayer/.claude-plugin/plugin.json +14 -0
  8. package/plugins/assayer/CHANGELOG.md +10 -0
  9. package/plugins/assayer/README.md +114 -0
  10. package/plugins/assayer/config.json +14 -0
  11. package/plugins/assayer/docs/adr/001-verify-claims-against-transcript-evidence.md +57 -0
  12. package/plugins/assayer/docs/design.md +72 -0
  13. package/plugins/assayer/hooks/hooks.json +15 -0
  14. package/plugins/assayer/scripts/hooks/assayer-stop.sh +249 -0
  15. package/plugins/assayer/scripts/lib/assayer-config.sh +88 -0
  16. package/plugins/assayer/scripts/lib/assayer-events.sh +85 -0
  17. package/plugins/assayer/scripts/lib/assayer-extract.sh +87 -0
  18. package/plugins/assayer/scripts/lib/assayer-project-key.sh +69 -0
  19. package/plugins/assayer/scripts/lib/assayer-transcript.sh +99 -0
  20. package/plugins/assayer/scripts/lib/assayer-ulid.sh +46 -0
  21. package/plugins/assayer/scripts/lib/assayer-verify.sh +95 -0
  22. package/plugins/cartographer/.claude-plugin/plugin.json +1 -1
  23. package/plugins/cartographer/CHANGELOG.md +7 -0
  24. package/plugins/cartographer/scripts/lib/cartographer-lock.sh +17 -7
  25. package/plugins/cartographer/scripts/lib/portable-lock.sh +57 -0
  26. package/plugins/governor/.claude-plugin/plugin.json +1 -1
  27. package/plugins/governor/CHANGELOG.md +7 -0
  28. package/plugins/governor/scripts/hooks/governor-post-tool-use.sh +6 -2
  29. package/plugins/governor/scripts/hooks/governor-pre-tool-use.sh +6 -2
  30. package/plugins/governor/scripts/hooks/governor-session-start.sh +6 -2
  31. package/plugins/governor/scripts/hooks/governor-stop.sh +6 -2
  32. package/plugins/governor/scripts/lib/portable-lock.sh +59 -0
  33. package/release-please-config.json +16 -0
  34. package/scripts/lib/portable-lock.sh +1 -1
  35. package/test/bats/assayer-config.bats +60 -0
  36. package/test/bats/assayer-events.bats +99 -0
  37. package/test/bats/assayer-extract.bats +76 -0
  38. package/test/bats/assayer-project-key.bats +58 -0
  39. package/test/bats/assayer-stop-hook.bats +81 -0
  40. package/test/bats/assayer-transcript.bats +72 -0
  41. package/test/bats/assayer-ulid.bats +31 -0
  42. package/test/bats/assayer-verify.bats +89 -0
  43. package/test/bats/cartographer-lock.bats +19 -0
@@ -5,26 +5,26 @@
5
5
  "email": "community@onlooker.dev"
6
6
  },
7
7
  "metadata": {
8
- "description": "TODO Fill this out"
8
+ "description": "Composable observability, memory, and quality-gate plugins for Claude Code — all built on the Onlooker ecosystem event substrate."
9
9
  },
10
10
  "plugins": [
11
11
  {
12
12
  "name": "ecosystem",
13
13
  "source": "./",
14
- "description": "Fill this out",
14
+ "description": "Observability substrate for Claude Code. Provides the shared $ONLOOKER_DIR storage root (default $HOME/.onlooker), canonical schema-validated event emission, session and tool tracking hooks, and prompt rules. Required by all other Onlooker plugins.",
15
15
  "author": {
16
16
  "name": "Onlooker Community"
17
17
  },
18
18
  "homepage": "https://onlooker.dev",
19
19
  "repository": "https://github.com/onlooker-community/ecosystem",
20
20
  "license": "MIT",
21
- "keywords": [],
22
- "tags": []
21
+ "keywords": ["observability", "substrate", "events", "hooks", "telemetry"],
22
+ "tags": ["observability", "substrate"]
23
23
  },
24
24
  {
25
25
  "name": "archivist",
26
26
  "source": "./plugins/archivist",
27
- "description": "Structured session memory across context truncation. Extracts decisions, dead ends, and open questions on PreCompact and reinjects the most important items at SessionStart. Requires the ecosystem plugin.",
27
+ "description": "Structured session memory across context truncation: extracts decisions, dead ends, and open questions on PreCompact and reinjects the most important items at SessionStart. Requires the ecosystem plugin.",
28
28
  "author": {
29
29
  "name": "Onlooker Community"
30
30
  },
@@ -34,10 +34,23 @@
34
34
  "keywords": ["memory", "compaction", "context", "session"],
35
35
  "tags": ["memory", "context-engineering"]
36
36
  },
37
+ {
38
+ "name": "cartographer",
39
+ "source": "./plugins/cartographer",
40
+ "description": "Proactive periodic auditor of the persistent instruction layer (CLAUDE.md, AGENTS.md, .claude/rules/). Discovers all instruction files in the repo, extracts semantic maps, and surfaces contradictions, shadowing, gaps, and drift before they cause expensive agent misbehavior. Requires the ecosystem plugin.",
41
+ "author": {
42
+ "name": "Onlooker Community"
43
+ },
44
+ "homepage": "https://onlooker.dev",
45
+ "repository": "https://github.com/onlooker-community/ecosystem",
46
+ "license": "MIT",
47
+ "keywords": ["instructions", "audit", "claude-md", "agents-md", "drift", "consistency"],
48
+ "tags": ["instructions", "context-engineering"]
49
+ },
37
50
  {
38
51
  "name": "tribunal",
39
52
  "source": "./plugins/tribunal",
40
- "description": "Multi-agent execution with LLM-as-a-Judge quality gates. An Actor performs work; a jury of typed Judges scores it against a project-overridable rubric; a Meta-Judge reviews the jury for bias; the gate decides accept, retry, or exhaust. Requires the ecosystem plugin.",
53
+ "description": "Multi-agent execution with LLM-as-a-Judge quality gates. An Actor performs work; a jury of typed Judges scores it against a project-overridable rubric; a Meta-Judge reviews the jury for bias; the gate decides accept, retry, or exhaust. Grounded in LLM-as-a-Judge (Zheng et al. 2023) and LLM-as-a-Meta-Judge (Wu et al. 2024). Requires the ecosystem plugin.",
41
54
  "author": {
42
55
  "name": "Onlooker Community"
43
56
  },
@@ -76,7 +89,7 @@
76
89
  {
77
90
  "name": "compass",
78
91
  "source": "./plugins/compass",
79
- "description": "Pre-write intent clarity gate. Intercepts write-class tool calls and samples N=5 parallel evaluators to score intent clarity before allowing writes to proceed. Blocks when confidence is low or evaluators disagree, surfacing a structured clarification prompt. Requires the ecosystem plugin.",
92
+ "description": "Pre-write intent clarity gate. Intercepts write-class tool calls and requires a confidence threshold before allowing them to proceed. Evaluates the pending write against the prior assistant turn as context to avoid false positives on question-answer turns. Requires the ecosystem plugin.",
80
93
  "author": {
81
94
  "name": "Onlooker Community"
82
95
  },
@@ -89,7 +102,7 @@
89
102
  {
90
103
  "name": "scribe",
91
104
  "source": "./plugins/scribe",
92
- "description": "Intent documentation from agent activity. Captures why changes were made — problem context, decisions, tradeoffs, and constraints — and distills them into readable Markdown artifacts at session end. Git logs record what changed; scribe records why. Requires the ecosystem plugin.",
105
+ "description": "Intent documentation from agent activity. Captures why changes were made — problem context, decisions, tradeoffs — and distills them into readable artifacts at session end. Requires the ecosystem plugin.",
93
106
  "author": {
94
107
  "name": "Onlooker Community"
95
108
  },
@@ -102,7 +115,7 @@
102
115
  {
103
116
  "name": "counsel",
104
117
  "source": "./plugins/counsel",
105
- "description": "Weekly synthesis and recommendations from your full observability stack. Reads all plugin event logs, identifies patterns, surfaces improvement opportunities, and injects a structured brief at session start when the last brief is stale. Turns disparate logs into a coaching signal. Requires the ecosystem plugin.",
118
+ "description": "Weekly synthesis and recommendations from the full observability stack. Reads all plugin event logs, produces a structured improvement brief, and injects it at session start when the last brief is stale. Requires the ecosystem plugin.",
106
119
  "author": {
107
120
  "name": "Onlooker Community"
108
121
  },
@@ -115,7 +128,7 @@
115
128
  {
116
129
  "name": "warden",
117
130
  "source": "./plugins/warden",
118
- "description": "Untrusted-content gate. Scans content flowing in through WebFetch and Read for prompt-injection patterns, and when a threat is detected closes a session-scoped gate that blocks Write, Edit, and Bash until the user explicitly clears it. Grounded in Meta's Agents Rule of Two — warden removes the agent's external-actions property while untrusted content is in play. Requires the ecosystem plugin.",
131
+ "description": "Untrusted-content gate. Scans content flowing in through WebFetch and Read for prompt-injection patterns, and when a threat is detected closes a session-scoped gate that blocks Write, Edit, and Bash until the user explicitly clears it. Grounded in Meta's Agents Rule of Two: an agent should hold no more than two of {private data, external actions, untrusted content} at once — warden removes the external-actions property while untrusted content is in play. Requires the ecosystem plugin.",
119
132
  "author": {
120
133
  "name": "Onlooker Community"
121
134
  },
@@ -128,7 +141,7 @@
128
141
  {
129
142
  "name": "librarian",
130
143
  "source": "./plugins/librarian",
131
- "description": "Consolidation layer between archivist's per-session artifacts and the user's durable typed memory store. Reads archivist artifacts at SessionEnd, applies a durability filter, classifies survivors via Haiku into the four memory types (user, feedback, project, reference), and queues proposals for explicit confirmation via /librarian review. Auto-promotion is opt-in. Requires the ecosystem plugin.",
144
+ "description": "Consolidation layer between archivist's per-session artifacts and the user's durable typed memory store. Detects which session decisions, dead-ends, and open questions deserve to live across sessions, classifies them into the user/feedback/project/reference types, and queues them as proposals for explicit confirmation. Auto-promotion is opt-in. Requires the ecosystem plugin.",
132
145
  "author": {
133
146
  "name": "Onlooker Community"
134
147
  },
@@ -141,7 +154,7 @@
141
154
  {
142
155
  "name": "curator",
143
156
  "source": "./plugins/curator",
144
- "description": "Maintenance layer for the typed auto-memory store. At every SessionStart, runs four cheap heuristic checks against the memories at ~/.claude/projects/<encoded>/memory/ date_decayed (ISO-8601 dates past the grace period), path_broken (path-shaped references that don't resolve under the repo root), broken_index (MEMORY.md pointing at missing files), and orphaned_memory (files in the dir not referenced from MEMORY.md). Surfaces findings via /curator review; never edits the memory store directly. Parallel to cartographer (which audits hand-maintained instruction files), curator audits the auto-memory substrate. Requires the ecosystem plugin.",
157
+ "description": "Maintenance layer for the user's typed auto-memory store. At every SessionStart, runs four cheap heuristic checks (date_decayed, path_broken, broken_index, orphaned_memory) against the memories at ~/.claude/projects/<encoded>/memory/ inside a wall-clock budget. Surfaces findings as a one-line pointer to /curator review; never edits the memory store directly. Requires the ecosystem plugin.",
145
158
  "author": {
146
159
  "name": "Onlooker Community"
147
160
  },
@@ -154,7 +167,7 @@
154
167
  {
155
168
  "name": "historian",
156
169
  "source": "./plugins/historian",
157
- "description": "Episodic memory layer for past Claude Code sessions. At SessionEnd, reads the session transcript, drops tool calls and tool results, chunks the remaining user + assistant turns at turn boundaries with overlap, redacts secret-shaped substrings (AWS keys, GitHub PATs, Anthropic API keys, KEY=value env assignments), and appends one JSONL line per surviving chunk to ~/.onlooker/historian/<project-key>/sessions/<session-id>.jsonl. Future-tense retrieval (vector embeddings + UserPromptSubmit similarity surfacer) lands in a follow-up; this version ships the indexing pipeline only. Requires the ecosystem plugin.",
170
+ "description": "Episodic memory layer. At SessionEnd, chunks and sanitizes the session transcript and stores chunks under $ONLOOKER_DIR/historian/<project-key>/sessions/ (default $HOME/.onlooker). On UserPromptSubmit, embeds the prompt and performs similarity retrieval over stored chunks to surface relevant past context. Requires the ecosystem plugin.",
158
171
  "author": {
159
172
  "name": "Onlooker Community"
160
173
  },
@@ -163,6 +176,19 @@
163
176
  "license": "MIT",
164
177
  "keywords": ["memory", "episodic", "transcript", "indexing", "session", "retrieval"],
165
178
  "tags": ["memory", "context-engineering"]
179
+ },
180
+ {
181
+ "name": "assayer",
182
+ "source": "./plugins/assayer",
183
+ "description": "Claim verification. At session end, parses the agent's final message for testable claims (\"I ran the tests, they pass\", \"the build is green\") and checks each against the actual command results in the session transcript, classifying it corroborated, contradicted, or unverified. Catches lying-without-malice. Advisory by default. Requires the ecosystem plugin.",
184
+ "author": {
185
+ "name": "Onlooker Community"
186
+ },
187
+ "homepage": "https://onlooker.dev",
188
+ "repository": "https://github.com/onlooker-community/ecosystem",
189
+ "license": "MIT",
190
+ "keywords": ["verification", "claims", "exit-codes", "honesty", "testing", "transcript"],
191
+ "tags": ["verification", "testing"]
166
192
  }
167
193
  ]
168
194
  }
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "ecosystem",
3
- "version": "0.24.0",
4
- "description": "Observability substrate for Claude Code. Provides the shared ~/.onlooker/ storage root, canonical schema-validated event emission, session and tool tracking hooks, and prompt rules. Required by all other Onlooker plugins.",
3
+ "version": "0.25.1",
4
+ "description": "Observability substrate for Claude Code. Provides the shared $ONLOOKER_DIR storage root (default $HOME/.onlooker), canonical schema-validated event emission, session and tool tracking hooks, and prompt rules. Required by all other Onlooker plugins.",
5
5
  "author": {
6
6
  "name": "Onlooker Community",
7
7
  "url": "https://onlooker.dev"
@@ -1,15 +1,16 @@
1
1
  {
2
- ".": "0.24.0",
2
+ ".": "0.25.1",
3
3
  "plugins/archivist": "0.1.0",
4
4
  "plugins/tribunal": "1.0.1",
5
5
  "plugins/echo": "0.2.0",
6
- "plugins/cartographer": "0.2.0",
7
- "plugins/governor": "0.2.0",
6
+ "plugins/cartographer": "0.2.1",
7
+ "plugins/governor": "0.2.1",
8
8
  "plugins/compass": "0.2.0",
9
9
  "plugins/scribe": "0.2.1",
10
10
  "plugins/counsel": "0.2.0",
11
11
  "plugins/warden": "0.2.0",
12
12
  "plugins/librarian": "0.2.0",
13
13
  "plugins/curator": "0.1.0",
14
- "plugins/historian": "0.2.0"
14
+ "plugins/historian": "0.2.0",
15
+ "plugins/assayer": "1.0.0"
15
16
  }
package/CHANGELOG.md CHANGED
@@ -1,5 +1,19 @@
1
1
  # Changelog
2
2
 
3
+ ## [0.25.1](https://github.com/onlooker-community/ecosystem/compare/ecosystem-v0.25.0...ecosystem-v0.25.1) (2026-06-10)
4
+
5
+
6
+ ### Bug Fixes
7
+
8
+ * vendor portable-lock.sh into cartographer and governor ([#73](https://github.com/onlooker-community/ecosystem/issues/73)) ([ab2c354](https://github.com/onlooker-community/ecosystem/commit/ab2c354b131c26cc642ebb51e84a043dc43cbaa1))
9
+
10
+ ## [0.25.0](https://github.com/onlooker-community/ecosystem/compare/ecosystem-v0.24.0...ecosystem-v0.25.0) (2026-06-04)
11
+
12
+
13
+ ### Features
14
+
15
+ * **assayer:** introduce claim-verification plugin ([#70](https://github.com/onlooker-community/ecosystem/issues/70)) ([1d0500b](https://github.com/onlooker-community/ecosystem/commit/1d0500b64f8cd670d1cfa1ac070182d72696bdfd))
16
+
3
17
  ## [0.24.0](https://github.com/onlooker-community/ecosystem/compare/ecosystem-v0.23.1...ecosystem-v0.24.0) (2026-06-04)
4
18
 
5
19
 
package/CLAUDE.md CHANGED
@@ -37,6 +37,7 @@ scripts/lib/onlooker-event.mjs ← canonical event builder; all plugins route t
37
37
  | governor | SessionStart, PreToolUse (Task), PostToolUse (Task), Stop | Budget gates on subagent spawns; tracks spend per session |
38
38
  | tribunal | Stop + skill invocation | Post-task quality gate; also invokable via `/tribunal` |
39
39
  | warden | PostToolUse (WebFetch, Read), PreToolUse (Write, Edit, MultiEdit, Bash), SessionStart + skill invocation | Scans ingested content for injection; closes a content gate that blocks write-class tools until cleared via `/warden` |
40
+ | assayer | Stop | Verifies the agent's final-message claims against actual command results in the transcript; advisory |
40
41
 
41
42
  Plugins communicate by emitting events to the JSONL log — they do not call each other directly. All plugins depend on the ecosystem substrate; no plugin depends on another plugin directly.
42
43
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@onlooker-community/ecosystem",
3
- "version": "0.24.0",
3
+ "version": "0.25.1",
4
4
  "description": "Agents, skills, hooks, commands, rules, and MCP configurations that power [Onlooker](https://onlooker.dev)",
5
5
  "author": {
6
6
  "name": "Onlooker Community",
@@ -19,14 +19,14 @@
19
19
  "onlooker-install": "install.sh"
20
20
  },
21
21
  "dependencies": {
22
- "@onlooker-community/schema": "^2.5.0"
22
+ "@onlooker-community/schema": "^2.6.0"
23
23
  },
24
24
  "scripts": {
25
25
  "postinstall": "echo '\\n onlooker-ecosystem installed!\\n Run: npx onlooker-install typescript\\n Docs: https://github.com/onlooker-community/ecosystem\\n'",
26
26
  "test": "npm run test:bats && npm run test:schema",
27
27
  "test:bats": "bats test/bats",
28
28
  "test:schema": "node --test test/node/*.test.mjs",
29
- "test:shellcheck": "shellcheck -S error -x install.sh scripts/common.sh scripts/hooks/*.sh scripts/lib/*.sh plugins/archivist/scripts/hooks/*.sh plugins/archivist/scripts/lib/*.sh plugins/tribunal/scripts/hooks/*.sh plugins/tribunal/scripts/lib/*.sh plugins/echo/scripts/hooks/*.sh plugins/echo/scripts/lib/*.sh plugins/governor/scripts/hooks/*.sh plugins/governor/scripts/lib/*.sh plugins/compass/scripts/hooks/*.sh plugins/compass/scripts/lib/*.sh plugins/scribe/scripts/hooks/*.sh plugins/scribe/scripts/lib/*.sh plugins/counsel/scripts/hooks/*.sh plugins/counsel/scripts/lib/*.sh plugins/warden/scripts/hooks/*.sh plugins/warden/scripts/lib/*.sh plugins/librarian/scripts/hooks/*.sh plugins/librarian/scripts/lib/*.sh plugins/curator/scripts/hooks/*.sh plugins/curator/scripts/lib/*.sh plugins/historian/scripts/hooks/*.sh plugins/historian/scripts/lib/*.sh",
29
+ "test:shellcheck": "shellcheck -S error -x install.sh scripts/common.sh scripts/hooks/*.sh scripts/lib/*.sh plugins/archivist/scripts/hooks/*.sh plugins/archivist/scripts/lib/*.sh plugins/tribunal/scripts/hooks/*.sh plugins/tribunal/scripts/lib/*.sh plugins/echo/scripts/hooks/*.sh plugins/echo/scripts/lib/*.sh plugins/governor/scripts/hooks/*.sh plugins/governor/scripts/lib/*.sh plugins/compass/scripts/hooks/*.sh plugins/compass/scripts/lib/*.sh plugins/scribe/scripts/hooks/*.sh plugins/scribe/scripts/lib/*.sh plugins/counsel/scripts/hooks/*.sh plugins/counsel/scripts/lib/*.sh plugins/warden/scripts/hooks/*.sh plugins/warden/scripts/lib/*.sh plugins/librarian/scripts/hooks/*.sh plugins/librarian/scripts/lib/*.sh plugins/curator/scripts/hooks/*.sh plugins/curator/scripts/lib/*.sh plugins/historian/scripts/hooks/*.sh plugins/historian/scripts/lib/*.sh plugins/assayer/scripts/hooks/*.sh plugins/assayer/scripts/lib/*.sh plugins/cartographer/scripts/hooks/*.sh plugins/cartographer/scripts/lib/*.sh",
30
30
  "lint:references": "node scripts/lint/check-references.mjs",
31
31
  "lint:manifests": "node scripts/lint/check-manifests.mjs",
32
32
  "coverage:node": "node scripts/coverage/run-coverage.mjs",
@@ -0,0 +1,14 @@
1
+ {
2
+ "name": "assayer",
3
+ "version": "1.0.0",
4
+ "description": "Claim verification. At session end, parses the agent's final message for testable claims (\"I ran the tests, they pass\", \"the build is green\") and checks each against the actual command results in the session transcript, classifying it corroborated, contradicted, or unverified. Catches lying-without-malice. Advisory by default. Builds on the Onlooker ecosystem plugin.",
5
+ "author": {
6
+ "name": "Onlooker Community",
7
+ "url": "https://onlooker.dev"
8
+ },
9
+ "homepage": "https://onlooker.dev",
10
+ "repository": "https://github.com/onlooker-community/ecosystem",
11
+ "license": "MIT",
12
+ "skills": [],
13
+ "agents": []
14
+ }
@@ -0,0 +1,10 @@
1
+ # Changelog
2
+
3
+ ## 1.0.0 (2026-06-04)
4
+
5
+
6
+ ### Features
7
+
8
+ * **assayer:** introduce claim-verification plugin ([#70](https://github.com/onlooker-community/ecosystem/issues/70)) ([1d0500b](https://github.com/onlooker-community/ecosystem/commit/1d0500b64f8cd670d1cfa1ac070182d72696bdfd))
9
+
10
+ ## Changelog
@@ -0,0 +1,114 @@
1
+ # Assayer
2
+
3
+ Claim verification — does the agent's story match the session's receipts?
4
+
5
+ When an agent finishes, it tells you what it did: "I ran the tests, they pass," "the build is green," "lint is clean." Assayer treats those as **testable claims** and checks each against what actually happened in the session — the Bash commands that ran and whether they errored. A claim that a failing run contradicts is surfaced, not silently trusted. This catches lying-without-malice: the agent isn't deceiving you, it just misremembered, or assumed, or never re-ran after a change.
6
+
7
+ Assayer is a sibling plugin to [`ecosystem`](../../) and assumes the Onlooker observability substrate (`~/.onlooker/`) is present.
8
+
9
+ ## How it works
10
+
11
+ | Hook | What Assayer does |
12
+ |------|-------------------|
13
+ | `Stop` | Reads the just-finished session's transcript (`transcript_path`). Extracts the agent's testable success claims from its final message with a single `claude -p` pass, cross-checks each against the actual Bash command results in the same transcript, and emits a verdict per claim plus an audit summary. Advisory only — always exits 0, never blocks Stop. |
14
+
15
+ The pipeline:
16
+
17
+ ```
18
+ Stop → Transcript Reader → Claim Extractor (claude -p) → Deterministic Verifier → Events
19
+ ```
20
+
21
+ - **Transcript reader** pulls two things from the JSONL transcript: the **final assistant message** (where claims live) and every **Bash command** paired with its result status. Claude Code records a command as a `tool_use` block and its outcome as a `tool_result` carrying an `is_error` flag — there is no per-call numeric exit code, so `is_error` is the success/failure signal.
22
+ - **Claim extractor** (LLM) reads only the final message and identifies success claims, tagging each with a `type` (`tests_pass`, `build_succeeds`, `lint_clean`, `types_check`, `command_succeeds`, `generic`) and a `command_keyword` — the substring it expects in the verifying command. The LLM does **not** judge truth; it only identifies claims and what would settle them.
23
+ - **Verifier** (deterministic bash) is the factual half: for each claim it finds the most recent command matching the claim's keywords and reads its `is_error`. Same inputs always produce the same verdict.
24
+
25
+ ## Verdicts
26
+
27
+ | Verdict | Meaning |
28
+ |---------|---------|
29
+ | **corroborated** | A matching command ran and succeeded. |
30
+ | **contradicted** | A matching command ran and **failed** — the claim is not backed by the evidence. |
31
+ | **unverified** | No matching command (`no_matching_command`), or the claim implies no checkable command (`ambiguous`). |
32
+
33
+ The most **recent** matching command wins: an agent may fail, fix, and re-run, and the last run reflects the state the final message describes.
34
+
35
+ ## Activation
36
+
37
+ Assayer is **off by default** — it calls `claude -p` on every Stop, so it is opt-in. Enable per-project in `.claude/settings.json`:
38
+
39
+ ```json
40
+ {
41
+ "assayer": {
42
+ "enabled": true
43
+ }
44
+ }
45
+ ```
46
+
47
+ Or globally in `~/.claude/settings.json`.
48
+
49
+ ## Configuration
50
+
51
+ All keys are optional. Unset keys fall back to the plugin's `config.json` defaults.
52
+
53
+ ```json
54
+ {
55
+ "assayer": {
56
+ "enabled": false,
57
+ "evaluation": {
58
+ "model": "claude-haiku-4-5-20251001",
59
+ "timeout_seconds": 60
60
+ },
61
+ "max_claims": 12,
62
+ "min_confidence": 0.5,
63
+ "final_message_chars": 6000
64
+ }
65
+ }
66
+ ```
67
+
68
+ | Key | Default | Description |
69
+ |-----|---------|-------------|
70
+ | `enabled` | `false` | Must be `true` for any audit to run. |
71
+ | `evaluation.model` | `claude-haiku-4-5-20251001` | Model used for claim extraction. Haiku is fast and cheap; the task is structured and shallow. |
72
+ | `evaluation.timeout_seconds` | `60` | Per-pass wall-clock timeout passed to the `timeout` command. |
73
+ | `max_claims` | `12` | Maximum number of claims to extract from a final message. |
74
+ | `min_confidence` | `0.5` | Claims the extractor scores below this are dropped before verification. |
75
+ | `final_message_chars` | `6000` | How many characters of the final assistant message to feed into extraction. |
76
+
77
+ ## Storage layout
78
+
79
+ ```text
80
+ ~/.onlooker/assayer/<project-key>/
81
+ └── audit-<session-id>.json # advisory summary written at end of each audit
82
+ ```
83
+
84
+ Each audit file records the claim count, the corroborated / contradicted / unverified tallies, the overall verdict, and the per-claim list for review in the next session.
85
+
86
+ Project key: first 12 hex chars of SHA256 of `git remote get-url origin` — stable across directory moves, clones, and worktrees of the same repo — falling back to a hash of the repo root realpath.
87
+
88
+ ## Events emitted
89
+
90
+ Assayer emits the canonical `assayer.*` event surface from [`@onlooker-community/schema`](https://github.com/onlooker-community/schema). All events land in `~/.onlooker/logs/onlooker-events.jsonl` and are validated against the schema before write.
91
+
92
+ | Event | When |
93
+ |-------|------|
94
+ | `assayer.audit.started` | Before verification begins. Includes `claim_count` and `command_count`. |
95
+ | `assayer.claim.contradicted` | A claim is contradicted by a failing command. Includes the `claim`, the `evidence_command`, and a `result_excerpt`. |
96
+ | `assayer.claim.unverified` | A claim has no supporting evidence (`reason`: `no_matching_command` or `ambiguous`). |
97
+ | `assayer.audit.complete` | After all claims are checked. Includes the tallies, the `verdict` (`clean`, `contradictions_found`, `nothing_to_verify`), and `duration_ms`. |
98
+
99
+ Corroborated claims are counted in the summary rather than emitted individually — the happy path is the quiet path.
100
+
101
+ ## Requirements
102
+
103
+ - The `ecosystem` plugin installed (for the `~/.onlooker/` substrate and canonical event emission).
104
+ - A release of `@onlooker-community/schema` that includes the `assayer.*` event types (the emitter validates every envelope against the installed schema; older versions reject `assayer.*`).
105
+ - `claude` CLI on `PATH` (the hook shells out to `claude -p` for the extraction pass).
106
+ - `jq` for JSON manipulation.
107
+ - `node` for canonical-event emission.
108
+ - `python3` for millisecond timestamps (standard on macOS and most Linux distributions).
109
+
110
+ ## Architecture decisions
111
+
112
+ Key decisions made during initial design are recorded in [`docs/adr/`](docs/adr/):
113
+
114
+ - [ADR-001](docs/adr/001-verify-claims-against-transcript-evidence.md) — Verify claims at Stop against transcript evidence (and why `is_error`, not exit codes, and why advisory)
@@ -0,0 +1,14 @@
1
+ {
2
+ "plugin_name": "assayer",
3
+ "storage_path": "~/.onlooker",
4
+ "assayer": {
5
+ "enabled": false,
6
+ "evaluation": {
7
+ "model": "claude-haiku-4-5-20251001",
8
+ "timeout_seconds": 60
9
+ },
10
+ "max_claims": 12,
11
+ "min_confidence": 0.5,
12
+ "final_message_chars": 6000
13
+ }
14
+ }
@@ -0,0 +1,57 @@
1
+ # ADR-001: Verify Claims at Stop Against Transcript Evidence
2
+
3
+ - Status: Accepted
4
+ - Date: 2026-06-04
5
+ - Deciders: Meagan
6
+ - Tags: assayer, verification, stop-hook, transcript, honesty
7
+
8
+ ## Context and Problem Statement
9
+
10
+ Every other plugin in the ecosystem assumes the agent's account of its own work is true. Tribunal judges the output, echo scores the prompt, governor counts the spend — but none of them check the most basic thing: when the agent says "I ran the tests and they pass," did the tests actually pass?
11
+
12
+ This failure mode is not malice. An agent claims success because it intended to verify, or it verified an earlier revision, or it ran the command, saw red, fixed something, and never re-ran. The final message reflects a belief, and the belief can be stale. The session transcript already holds the ground truth — the commands that ran and whether they errored — but nothing reconciles the two.
13
+
14
+ The question: **how do we check the agent's claims against what actually happened, cheaply and without false alarms?**
15
+
16
+ ## Decision Drivers
17
+
18
+ - The evidence (commands + results) must already exist and be trustworthy — not reconstructed or re-run.
19
+ - Verification must be deterministic: the same session must always produce the same verdict.
20
+ - False positives are expensive. Flagging a true claim as a lie destroys trust in the plugin faster than missing a false one.
21
+ - It must not interrupt the user's flow for an advisory signal.
22
+
23
+ ## Decision
24
+
25
+ **Assayer runs at `Stop`, reads the session transcript, and reconciles the agent's final-message claims against the Bash command results recorded in that same transcript.**
26
+
27
+ Four sub-decisions follow:
28
+
29
+ ### 1. Stop, reading the committed transcript
30
+
31
+ `Stop` fires once the turn is over and the transcript is fully written to disk — the same `transcript_path` tribunal and compass read. There is no timing-skew risk: every command the agent ran, and every result, is already on disk before assayer looks. Running earlier (e.g. `PostToolUse`) would mean verifying claims that have not been made yet.
32
+
33
+ ### 2. `is_error`, not exit codes
34
+
35
+ Claude Code's transcript represents a command as a `tool_use` block and its outcome as a `tool_result` carrying an `is_error` boolean. It does **not** expose a per-call numeric exit code. So `is_error` is the success/failure signal: a claim of success is contradicted by a matching command whose `is_error` is true. The schema's `assayer.claim.contradicted` payload reflects this honestly — `evidence_command` is required, `exit_code` is optional (populated only when a code is recoverable from output), and a `result_excerpt` captures the failing output for the reader.
36
+
37
+ ### 3. Split the work: LLM identifies, bash verifies
38
+
39
+ Claim extraction is a language problem — what counts as a testable success claim, and what command would settle it — so an LLM (`claude -p`, Haiku) does it. The factual cross-check is not a language problem; it is a lookup. So a deterministic bash/jq verifier matches each claim to the most recent command containing its keywords and reads `is_error`. The LLM never judges truth; the verifier never interprets language. This keeps the verdict reproducible and unit-testable, and confines the non-determinism to the one step that genuinely needs it.
40
+
41
+ ### 4. Advisory, not blocking
42
+
43
+ Assayer always exits 0. A contradicted claim is emitted as an event and written to an advisory file, not used to block `Stop`. The turn is already over; the high-value action is a durable, queryable signal ("the agent claimed X; the evidence says otherwise"), not interrupting a finished session. A blocking/enforce mode that re-prompts the agent on contradiction is a plausible future opt-in, but the safe default is advisory.
44
+
45
+ ## Consequences
46
+
47
+ **Positive**
48
+
49
+ - Closes the "did it actually work?" gap with zero new infrastructure — the evidence is already in the transcript.
50
+ - Deterministic and testable: the verifier is pure bash/jq with no LLM in the factual path.
51
+ - Off by default and advisory, so it can never block or surprise a session it was not invited to.
52
+
53
+ **Negative / accepted trade-offs**
54
+
55
+ - Keyword matching is heuristic. A claim whose verifying command uses unexpected wording falls to `unverified` rather than being checked — a miss, not a false alarm, which is the safer direction to err.
56
+ - `is_error` is coarser than an exit code. A command that exits 0 but prints failures (a test runner that swallows its own status) reads as success. Documented; acceptable for v1.0.
57
+ - One `claude -p` call per Stop. This is why the plugin is off by default.
@@ -0,0 +1,72 @@
1
+ # Assayer — Design
2
+
3
+ Assayer is the verification layer of the Onlooker ecosystem. Where tribunal judges *quality* and echo tracks *prompt drift*, assayer answers a narrower, more literal question: **did the things the agent said it did actually happen?**
4
+
5
+ ## The problem: lying-without-malice
6
+
7
+ An agent's final message is a self-report. It says "tests pass," "build is green," "lint is clean." These read as facts but are really *beliefs*, and beliefs drift from reality in ordinary ways:
8
+
9
+ - The agent ran the check against an earlier revision, then changed code, and never re-ran.
10
+ - It intended to run the check and narrated as if it had.
11
+ - It misread a noisy command's output.
12
+
13
+ None of this is deception. But a user who trusts the self-report ships on a false premise. The session transcript already contains the ground truth — every command and its result — so the gap is purely one of reconciliation.
14
+
15
+ ## Pipeline
16
+
17
+ ```
18
+ Stop
19
+ → Transcript Reader (final message + commands-with-status)
20
+ → Claim Extractor (claude -p, Haiku — language understanding)
21
+ → Deterministic Verifier (bash/jq — factual lookup)
22
+ → Events + advisory file
23
+ ```
24
+
25
+ ### Transcript reader (`assayer-transcript.sh`)
26
+
27
+ Two extractions from the JSONL transcript at `transcript_path`:
28
+
29
+ - `assayer_final_assistant_message` — the text blocks of the last assistant turn that contains any text, truncated to `final_message_chars`. This is where claims live.
30
+ - `assayer_collect_commands` — every `Bash` `tool_use` joined to its `tool_result` by `tool_use_id`, yielding `{ command, is_error, excerpt }`. Claude Code does not record a numeric exit code per call; `is_error` is the success/failure signal. See ADR-001.
31
+
32
+ ### Claim extractor (`assayer-extract.sh`)
33
+
34
+ A single `claude -p` pass reads only the final message and returns a JSON array of claims, each `{ text, type, command_keyword, confidence }`. `type` is one of `tests_pass | build_succeeds | lint_clean | types_check | command_succeeds | generic`. The model identifies claims and what command would settle them; it does not judge whether they are true. `assayer_parse_claims` strips fences, validates the array, coerces unknown types to `generic`, and drops entries without text.
35
+
36
+ ### Deterministic verifier (`assayer-verify.sh`)
37
+
38
+ `assayer_classify_claim` derives keywords from the claim's `type` (e.g. `tests_pass → ["test"]`) plus the LLM-supplied `command_keyword`, finds the **most recent** command containing any keyword, and classifies on its `is_error`:
39
+
40
+ - matched + not errored → **corroborated**
41
+ - matched + errored → **contradicted**
42
+ - no match → **unverified** (`no_matching_command`, or `ambiguous` when the claim implies no checkable command)
43
+
44
+ "Most recent wins" handles the fail-fix-rerun pattern: the last run reflects the final state. The function is pure — no LLM, no filesystem — so it is fully unit-tested.
45
+
46
+ `assayer_audit_verdict` rolls the per-claim counts into `clean`, `contradictions_found`, or `nothing_to_verify`.
47
+
48
+ ## Events
49
+
50
+ | Event | Payload highlights |
51
+ |-------|--------------------|
52
+ | `assayer.audit.started` | `audit_id`, `claim_count`, `command_count`, `trigger` |
53
+ | `assayer.claim.contradicted` | `claim`, `claim_type`, `evidence_command`, `result_excerpt`, `confidence` |
54
+ | `assayer.claim.unverified` | `claim`, `claim_type`, `reason` |
55
+ | `assayer.audit.complete` | `corroborated`, `contradicted`, `unverified`, `verdict`, `duration_ms` |
56
+
57
+ Corroborated claims are counted in the summary, not emitted individually — the happy path stays quiet.
58
+
59
+ ## Non-goals (v0.1)
60
+
61
+ - **Blocking.** Assayer is advisory; it never blocks `Stop`. An enforce mode is a future opt-in.
62
+ - **Re-running commands.** Assayer reconciles against what already ran; it does not execute anything.
63
+ - **Parsing exit codes from output.** It relies on `is_error`. A command that exits 0 while printing failures reads as success.
64
+ - **Non-Bash evidence.** Only `Bash` results are treated as evidence today.
65
+
66
+ ## Relationship to other plugins
67
+
68
+ Assayer occupies the verification/execution layer, empty until now. It is complementary to:
69
+
70
+ - **tribunal** — judges whether the work is *good*; assayer checks whether the *claims about it* are true.
71
+ - **scribe** — writes the narrative of *why*; assayer checks the factual assertions in the final message that accompanies that narrative.
72
+ - **counsel** — can consume `assayer.*` events to surface recurring honesty gaps over time.
@@ -0,0 +1,15 @@
1
+ {
2
+ "hooks": {
3
+ "Stop": [
4
+ {
5
+ "matcher": "*",
6
+ "hooks": [
7
+ {
8
+ "type": "command",
9
+ "command": "\"$CLAUDE_PLUGIN_ROOT\"/scripts/hooks/assayer-stop.sh"
10
+ }
11
+ ]
12
+ }
13
+ ]
14
+ }
15
+ }