npm - @onlooker-community/ecosystem - Versions diffs - 0.10.0 → 0.14.0 - Mend

@onlooker-community/ecosystem 0.10.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (106) hide show

package/.claude-plugin/marketplace.json +39 -1
package/.claude-plugin/plugin.json +2 -2
package/.github/copilot-instructions.md +46 -0
package/.github/workflows/coverage.yml +78 -0
package/.github/workflows/release.yml +24 -8
package/.github/workflows/test.yml +3 -0
package/.markdownlintignore +3 -0
package/.release-please-manifest.json +4 -1
package/CHANGELOG.md +37 -0
package/README.md +57 -13
package/config.json +6 -1
package/docs/adr/001-claude-code-hooks-as-integration-surface.md +43 -0
package/docs/adr/002-centralized-jsonl-event-log.md +39 -0
package/docs/adr/003-ulid-over-uuid.md +40 -0
package/docs/adr/004-plugin-config-with-settings-overlay.md +34 -0
package/docs/architecture.md +117 -0
package/hooks/hooks.json +4 -0
package/package.json +13 -7
package/plugins/archivist/.claude-plugin/plugin.json +14 -0
package/plugins/archivist/CHANGELOG.md +8 -0
package/plugins/archivist/README.md +105 -0
package/plugins/archivist/config.json +18 -0
package/plugins/archivist/hooks/hooks.json +35 -0
package/plugins/archivist/scripts/hooks/archivist-extract.sh +238 -0
package/plugins/archivist/scripts/hooks/archivist-inject.sh +159 -0
package/plugins/archivist/scripts/lib/archivist-config.sh +66 -0
package/plugins/archivist/scripts/lib/archivist-project-key.sh +91 -0
package/plugins/archivist/scripts/lib/archivist-storage.sh +215 -0
package/plugins/archivist/scripts/lib/archivist-ulid.sh +52 -0
package/plugins/echo/.claude-plugin/plugin.json +14 -0
package/plugins/echo/CHANGELOG.md +24 -0
package/plugins/echo/README.md +110 -0
package/plugins/echo/config.json +15 -0
package/plugins/echo/docs/adr/001-echo-as-separate-plugin.md +33 -0
package/plugins/echo/docs/adr/002-direct-evaluation-vs-tribunal-pipeline.md +35 -0
package/plugins/echo/docs/adr/003-stop-hook-trigger.md +40 -0
package/plugins/echo/hooks/hooks.json +15 -0
package/plugins/echo/scripts/hooks/echo-stop-gate.sh +366 -0
package/plugins/echo/scripts/lib/echo-config.sh +108 -0
package/plugins/echo/scripts/lib/echo-events.sh +74 -0
package/plugins/echo/scripts/lib/echo-project-key.sh +81 -0
package/plugins/echo/scripts/lib/echo-ulid.sh +46 -0
package/plugins/tribunal/.claude-plugin/plugin.json +20 -0
package/plugins/tribunal/CHANGELOG.md +10 -0
package/plugins/tribunal/README.md +134 -0
package/plugins/tribunal/agents/tribunal-actor.md +35 -0
package/plugins/tribunal/agents/tribunal-judge-adversarial.md +51 -0
package/plugins/tribunal/agents/tribunal-judge-security.md +47 -0
package/plugins/tribunal/agents/tribunal-judge-standard.md +47 -0
package/plugins/tribunal/agents/tribunal-meta-judge.md +61 -0
package/plugins/tribunal/config.json +50 -0
package/plugins/tribunal/docs/adr/001-actor-jury-meta-gate-loop.md +40 -0
package/plugins/tribunal/docs/adr/002-majority-gate-policy.md +48 -0
package/plugins/tribunal/hooks/hooks.json +15 -0
package/plugins/tribunal/scripts/hooks/tribunal-stop-gate.sh +267 -0
package/plugins/tribunal/scripts/lib/tribunal-aggregate.sh +65 -0
package/plugins/tribunal/scripts/lib/tribunal-config.sh +101 -0
package/plugins/tribunal/scripts/lib/tribunal-events.sh +97 -0
package/plugins/tribunal/scripts/lib/tribunal-gate.sh +111 -0
package/plugins/tribunal/scripts/lib/tribunal-jury.sh +102 -0
package/plugins/tribunal/scripts/lib/tribunal-project-key.sh +84 -0
package/plugins/tribunal/scripts/lib/tribunal-rubric.sh +153 -0
package/plugins/tribunal/scripts/lib/tribunal-ulid.sh +50 -0
package/plugins/tribunal/scripts/lib/tribunal-verdict.sh +127 -0
package/plugins/tribunal/skills/tribunal/SKILL.md +129 -0
package/release-please-config.json +43 -5
package/scripts/coverage/bash-coverage.mjs +169 -0
package/scripts/coverage/format-comment.mjs +120 -0
package/scripts/coverage/run-coverage.mjs +151 -0
package/scripts/hooks/agent-spawn-tracker.sh +4 -4
package/scripts/hooks/prompt-rule-injector.sh +122 -0
package/scripts/lib/portable-lock.sh +48 -0
package/scripts/lib/prompt-rules.sh +207 -0
package/scripts/lib/tool-history.sh +7 -8
package/scripts/lib/validate-path.sh +4 -0
package/scripts/lint/check-manifests.mjs +314 -0
package/scripts/lint/check-references.mjs +311 -0
package/skills/list-prompt-rules/SKILL.md +15 -0
package/test/bats/archivist-config-files.bats +60 -0
package/test/bats/archivist-config.bats +54 -0
package/test/bats/archivist-inject.bats +73 -0
package/test/bats/archivist-project-key.bats +75 -0
package/test/bats/archivist-storage.bats +119 -0
package/test/bats/archivist-ulid.bats +36 -0
package/test/bats/config.bats +10 -10
package/test/bats/echo-config.bats +90 -0
package/test/bats/echo-events.bats +121 -0
package/test/bats/echo-project-key.bats +115 -0
package/test/bats/echo-stop-hook.bats +101 -0
package/test/bats/echo-ulid.bats +38 -0
package/test/bats/portable-lock.bats +62 -0
package/test/bats/prompt-rules.bats +269 -0
package/test/bats/tribunal-aggregate.bats +77 -0
package/test/bats/tribunal-config.bats +86 -0
package/test/bats/tribunal-events.bats +209 -0
package/test/bats/tribunal-gate.bats +95 -0
package/test/bats/tribunal-jury.bats +80 -0
package/test/bats/tribunal-rubric.bats +119 -0
package/test/bats/tribunal-stop-hook.bats +73 -0
package/test/bats/tribunal-verdict.bats +71 -0
package/test/fixtures/hook-inputs/user-prompt-submit-rule-match.json +8 -0
package/test/fixtures/hook-inputs/user-prompt-submit-rule-nomatch.json +8 -0
package/test/helpers/setup.bash +9 -0
package/test/node/check-manifests.test.mjs +173 -0
package/test/node/check-references.test.mjs +279 -0
package/test/node/coverage.test.mjs +143 -0

package/plugins/echo/README.md ADDED Viewed

@@ -0,0 +1,110 @@
+# Echo
+Prompt-change regression detection for the Onlooker ecosystem.
+When a watched agent file is modified, Echo runs a single-judge quality pass on the file via `claude -p` and compares the score against a stored baseline. It reports whether the change **improved**, **degraded**, or had **no measurable effect** on prompt quality — giving every prompt edit a before/after signal instead of relying on intuition.
+Echo is a sibling plugin to [`ecosystem`](../../) and assumes the Onlooker observability substrate (`~/.onlooker/`) is present.
+## How it works
+Echo registers a **Stop hook** that fires at the end of every Claude Code session. When triggered:
+1. Detects which watched files changed (unstaged, staged, or untracked).
+2. Filters against configured `watch_paths` and `exclude_paths` patterns.
+3. For each matching file, builds a rubric prompt and calls `claude -p --max-turns 1` to score it on four criteria: role clarity, output format, criterion coverage, and internal consistency.
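+4. Compares the score to a stored baseline (if one exists) and emits `echo.improvement.detected` or `echo.regression.detected` when the delta exceeds `drift_threshold`. The new score is then stored as the baseline for the next run.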
+5. Emits `echo.suite.complete` with aggregate drift, a `merge_recommended` flag, and duration.
+The hook always exits 0 — it never blocks a session from ending.
+## Activation
+Echo is **off by default**. Enable it per-project in `.claude/settings.json`:
+```json
+{
+  "echo": {
+    "enabled": true
+  }
+}
+```
+## Configuration
+All keys are optional. Unset keys fall back to the plugin's `config.json` defaults.
+```json
+{
+  "echo": {
+    "enabled": true,
+    "watch_paths": ["plugins/*/agents/*.md"],
+    "exclude_paths": [],
+    "drift_threshold": 0.05,
+    "evaluation": {
+      "model": "claude-haiku-4-5-20251001",
+      "timeout_seconds": 60
+    }
+  }
+}
+```
+| Key | Default | Description |
+|-----|---------|-------------|
+| `enabled` | `false` | Must be `true` for any evaluation to run. |
+| `watch_paths` | `["plugins/*/agents/*.md"]` | Glob patterns (relative to repo root) of files to watch. Bash extended glob syntax; each pattern is matched against the whole relative path, so `*` also matches `/`. |
+| `exclude_paths` | `[]` | Patterns to exclude. `plugins/echo/**` is always excluded regardless of this setting. |
+| `drift_threshold` | `0.05` | Minimum absolute score delta to classify a change as improvement or regression. Deltas below this are reported as neutral. |
+| `evaluation.model` | `claude-haiku-4-5-20251001` | Model used for the quality pass. Haiku is fast and cheap; upgrade to Sonnet for higher-stakes repos. |
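+| `evaluation.timeout_seconds` | `60` | Per-file wall-clock timeout passed to the `timeout` command (or `gtimeout` where only that is installed). If neither command is available, the evaluation runs without a timeout. |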
+## Scoring rubric
+Each watched file is scored 0.0–1.0 on four equally-weighted criteria:
+| Criterion | What it checks |
+|-----------|---------------|
+| **Role clarity** | Does the file clearly define what the agent is and what it must do? |
+| **Output format** | Are output format and schema requirements unambiguous? |
+| **Criterion coverage** | Are all evaluation dimensions specified with enough detail to apply consistently? |
+| **Internal consistency** | No contradictory instructions; no undefined terms. |
+A score ≥ 0.7 is considered "passed". A delta beyond `drift_threshold` in either direction is classified as improvement or regression.
+## Storage layout
+```text
+~/.onlooker/echo/<project-key>/
+├── baselines/
+│   └── <test-id>.json          # one per watched file (test-id = first 16 hex of SHA256 of path)
+└── run-<session-id>.json       # advisory summary written at end of each suite
+```
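+Project key: first 12 hex chars of SHA256 of `git remote get-url origin`, falling back to a hash of the repo root realpath when no `origin` remote is available. With an `origin` remote, the key is stable across directory moves and clones of the same repo; the realpath fallback is tied to the directory's location.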
+## Events emitted
+Echo emits the canonical `echo.*` event surface from [`@onlooker-community/schema`](https://github.com/onlooker-community/schema) v2.2.0+. All events land in `~/.onlooker/logs/onlooker-events.jsonl` and are validated against the schema before write.
+| Event | When |
+|-------|------|
+| `echo.suite.started` | Before the evaluation loop begins. Includes `test_count` and `changed_file`. |
+| `echo.improvement.detected` | A file's score increased beyond `drift_threshold`. |
+| `echo.regression.detected` | A file's score decreased beyond `drift_threshold`. |
+| `echo.suite.complete` | After all files are evaluated. Includes aggregate drift fields when a prior baseline exists. |
+## Requirements
+- The `ecosystem` plugin installed (for the `~/.onlooker/` substrate and canonical event emission).
+- `claude` CLI on `PATH` (the hook shells out to `claude -p` for evaluation passes).
+- `jq` for JSON manipulation.
+- `node` for canonical-event emission.
+- `python3` for millisecond timestamps and floating-point score arithmetic (standard on macOS and most Linux distributions).
+## Architecture decisions
+Key decisions made during initial design are recorded in [`docs/adr/`](docs/adr/):
+- [ADR-001](docs/adr/001-echo-as-separate-plugin.md) — Echo as a separate plugin, not an extension of Tribunal
+- [ADR-002](docs/adr/002-direct-evaluation-vs-tribunal-pipeline.md) — Direct `claude -p` evaluation vs. routing through Tribunal's full pipeline
+- [ADR-003](docs/adr/003-stop-hook-trigger.md) — Stop hook as the trigger mechanism

package/plugins/echo/config.json ADDED Viewed

@@ -0,0 +1,15 @@
+{
+  "plugin_name": "echo",
+  "storage_path": "~/.onlooker",
+  "echo": {
+    "enabled": false,
+    "watch_paths": ["plugins/*/agents/*.md"],
+    "exclude_paths": [],
+    "drift_threshold": 0.05,
+    "evaluation": {
+      "model": "claude-haiku-4-5-20251001",
+      "max_output_tokens": 512,
+      "timeout_seconds": 60
+    }
+  }
+}

package/plugins/echo/docs/adr/001-echo-as-separate-plugin.md ADDED Viewed

@@ -0,0 +1,33 @@
+# ADR-001: Echo as a Separate Plugin, Not an Extension of Tribunal
+**Status:** Accepted
+**Date:** 2026-05-24
+## Context
+Tribunal already exists as an evaluation engine in this ecosystem. When designing Echo's prompt-change regression detection, the first question was whether Echo should live inside Tribunal (as a sub-feature or mode) or stand alone as its own plugin.
+The Tribunal team ran a formal evaluation of this question using Tribunal itself. The final score across three iterations was 0.79 (above the 0.75 acceptance threshold), with outcome `exhausted_iterations` — the adversarial judge never passed, which is expected behavior for that judge type. The substantive conclusion from all three iterations was: Echo is architecturally sound as a standalone plugin.
+## Decision
+Echo is a separate, independent plugin under `plugins/echo/`.
+## Rationale
+**Separate concerns.** Tribunal is an orchestrator for arbitrary tasks. Echo is a specialized harness for prompt quality regression testing. Bundling Echo into Tribunal would couple two distinct concerns: general task evaluation and change-detection/baselining.
+**Different lifecycle.** Tribunal is always on for `/tribunal` skill invocations, with an opt-in Stop hook. Echo is off by default (`"enabled": false`) and has no interactive skill surface — it only runs as a Stop hook. These are different activation patterns that would conflict if forced into the same plugin config namespace.
+**Independent versioning.** Echo and Tribunal can release, iterate, and break/fix independently. Echo v0.2 does not need to drag along a Tribunal major bump.
+**Composability.** Echo today calls `claude -p` directly. A future version could delegate to Tribunal for richer multi-judge evaluation (see ADR-002). That migration is easier when Echo is its own entry point.
+**Self-exclusion.** Echo must never trigger on its own files changing. This is simpler to enforce as a first-class concern in a standalone plugin (`plugins/echo/**` is always in `exclude_paths`) than as a special case inside Tribunal.
+## Consequences
+- Echo requires the ecosystem plugin but does **not** require Tribunal to be installed.
+- Echo gets its own `config.json`, `hooks.json`, `.claude-plugin/plugin.json`, CHANGELOG, and release-please track.
+- Any future Tribunal integration (e.g., Echo delegating multi-judge eval to Tribunal) will be an opt-in config option, not a hard dependency.
+- Marketplace listing and docs must be careful not to imply Tribunal is a prerequisite (an early draft of the description made this mistake; corrected before merge).

package/plugins/echo/docs/adr/002-direct-evaluation-vs-tribunal-pipeline.md ADDED Viewed

@@ -0,0 +1,35 @@
+# ADR-002: Direct `claude -p` Evaluation vs. Routing Through Tribunal's Pipeline
+**Status:** Accepted (with planned future extension)
+**Date:** 2026-05-24
+## Context
+Echo needs to evaluate prompt file quality before and after a change. Two approaches were available:
+**Option A — Direct `claude -p`**: Build an inline rubric prompt, call `claude -p --max-turns 1` for each file, and parse the JSON score from the response.
+**Option B — Tribunal pipeline**: Invoke Tribunal's multi-judge Actor → Jury → Meta-Judge → Gate loop for each file and use the aggregated score as the quality signal.
+## Decision
+Echo v0.1 uses **Option A** — direct `claude -p` with an inline rubric.
+## Rationale
+**Stop hook latency budget.** A Stop hook fires synchronously at the end of every session. Tribunal's full loop (Actor + two judges + Meta-Judge + Gate, with potential retries) takes 30–120 seconds per task. Multiplied across several watched files, this would make sessions feel like they hang after every edit. A single `claude -p` call with a 60-second timeout keeps the overhead acceptable.
+**Echo evaluates prompts, not outputs.** Tribunal's loop is designed to evaluate an Agent's *work product* against a rubric. Echo evaluates *the prompt file itself* — a simpler, single-document task. A full jury is architecturally overweight for this use case.
+**Baseline stability.** Tribunal's multi-judge scores have meaningful variance across runs (different judge models, adversarial judge behavior, Meta-Judge overrides). Echo's baseline comparison depends on stable, reproducible scores — a single `claude -p` pass with a fixed model and rubric is more consistent as a yardstick.
+**Haiku is cheap enough.** Evaluating a prompt file with Haiku costs a fraction of a cent. Running a full Tribunal loop (Opus-class models for judges) would cost 10–50× more per file per session. With a default model of `claude-haiku-4-5-20251001`, Echo can run automatically without raising cost concerns.
+**Independent of Tribunal installation.** Option A requires only the `claude` CLI. Option B would make Tribunal a hard runtime dependency of Echo, coupling two plugins that have separate versioning and installation paths (see ADR-001).
+## Consequences
+- Echo's evaluation quality is bounded by a single-model, single-pass rubric. It will miss issues that a diverse jury would catch, but it is consistent enough to detect regressions.
+- The scoring rubric (role clarity, output format, criterion coverage, internal consistency) is hardcoded in the hook rather than being user-overridable in v0.1. A future version should expose this as config.
+- A future `echo.mode: "tribunal"` config option could delegate to Tribunal's jury for higher-confidence evaluation when cost and latency are acceptable. The current design leaves room for this — Echo's event schema (`echo.suite.started`, etc.) is agnostic to the underlying evaluator.
+- The `claude -p` response parsing includes a `sed` strip for accidental markdown fences, which Tribunal's pipeline avoids by using structured judge output. This is a fragility to watch.

package/plugins/echo/docs/adr/003-stop-hook-trigger.md ADDED Viewed

@@ -0,0 +1,40 @@
+# ADR-003: Stop Hook as the Trigger Mechanism
+**Status:** Accepted
+**Date:** 2026-05-24
+## Context
+Echo needs to know when an agent file has changed and run an evaluation. Several trigger points were considered:
+- **Stop hook** — fires when a Claude Code session ends.
+- **Pre-commit hook** — fires when the developer runs `git commit`.
+- **PostToolUse hook** — fires after every tool call that writes a file.
+- **CI step** — fires on push to a remote branch.
+- **Manual `/echo` skill** — user-invoked on demand.
+## Decision
+Echo v0.1 uses the **Stop hook**.
+## Rationale
+**Correct granularity.** A session is the natural unit of prompt engineering work. A developer edits `tribunal-judge-standard.md`, tests it through several turns, and ends the session. That's the moment Echo should fire — after the work is done, not after each intermediate save.
+**Claude Code already provides it.** The Stop hook is a first-class Claude Code hook type with a well-defined contract: the hook receives `{cwd, session_id}` on stdin and must exit 0 (or the session stop is blocked, which is why Echo always exits 0). No additional tooling or git hooks needed.
+**Consistent with Tribunal's pattern.** Tribunal's Stop hook (when enabled) follows the same pattern — an advisory pass that fires at session end without blocking the stop. Echo mirrors this, which keeps the plugin model coherent across the ecosystem.
+**No commit discipline required.** A pre-commit hook would only fire when the developer commits. Many prompt engineering workflows involve many experimental edits before any commit. Echo should capture signal on *any* session where a watched file changed, not only committed ones. Untracked and unstaged files are explicitly included in Echo's change detection.
+**Low friction.** PostToolUse fires on every file write, which would run evaluations continuously mid-session — expensive, noisy, and disruptive. The Stop hook batches all changes from a session into a single suite run.
+**Not CI.** CI integration has value but is a separate concern. A CI step can't write to `~/.onlooker/` on the developer's machine, and the baseline comparison is inherently local. Echo is a local development feedback tool; CI integration (e.g., posting drift to a PR comment) is a future feature.
+## Consequences
+- The recursion guard (`ECHO_NESTED=1`) is mandatory. `claude -p` spawns a subprocess that also triggers Stop, which would re-enter the hook infinitely. The guard must be set before any work begins and is checked as the very first statement.
+- Echo cannot fire mid-session, so rapid iteration on a prompt file produces one signal per session, not one per edit. This is a feature for reducing noise, but means a long session with many edits only records the final state of each file.
+- If a session ends abnormally (e.g., the terminal is closed abruptly), the Stop hook may not fire and no evaluation runs for that session. This is consistent with how all Stop hooks in Claude Code behave.
+- Users who want on-demand evaluation can invoke Echo's logic manually by calling the hook directly. A future `/echo` skill could wrap this.
+- The hook must be registered in `hooks.json` with `"matcher": "*"` so it fires on all sessions. Projects that want to opt out can set `echo.enabled: false` rather than removing the hook registration.

package/plugins/echo/hooks/hooks.json ADDED Viewed

@@ -0,0 +1,15 @@
+{
+  "hooks": {
+    "Stop": [
+      {
+        "matcher": "*",
+        "hooks": [
+          {
+            "type": "command",
+            "command": "\"$CLAUDE_PLUGIN_ROOT\"/scripts/hooks/echo-stop-gate.sh"
+          }
+        ]
+      }
+    ]
+  }
+}

package/plugins/echo/scripts/hooks/echo-stop-gate.sh ADDED Viewed

@@ -0,0 +1,366 @@
+#!/usr/bin/env bash
+# Echo Stop-gate hook.
+#
+# Triggered by Stop. Off by default — gated on echo.enabled in config.
+# When enabled, detects which watched agent files changed in this session,
+# runs a single-judge advisory pass on each, and compares the score against a
+# stored baseline to report improved / degraded / neutral.
+#
+# Hook contract:
+#   - Always exits 0. Never blocks Stop.
+#   - Skips silently if disabled, no git context, or no watched files changed.
+#   - Recursion guard: exits immediately if ECHO_NESTED=1 so that the
+#     `claude -p` subprocess spawned for evaluation does not re-enter this
+#     hook when its own session stops.
+#   - Errors from `claude -p` are swallowed; worst case is no verdict written.
+#
+# -e is deliberately not set: individual failures are handled inline so the
+# hook can keep its always-exit-0 contract.
+set -uo pipefail
+# Recursion guard — must be first. The exported flag is inherited by every
+# child process, including the `claude -p` evaluation subprocess.
+[[ "${ECHO_NESTED:-}" == "1" ]] && exit 0
+export ECHO_NESTED=1
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PLUGIN_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
+# Resolve the ecosystem root: an explicit ONLOOKER_ECOSYSTEM_ROOT wins,
+# otherwise probe two levels above this plugin for the ecosystem's libraries.
+_ECOSYSTEM_ROOT="${ONLOOKER_ECOSYSTEM_ROOT:-}"
+if [[ -z "$_ECOSYSTEM_ROOT" ]]; then
+	_candidate="$(cd "${PLUGIN_ROOT}/../.." 2>/dev/null && pwd)"
+	# An empty candidate (cd failed) must not be probed: it would test
+	# /scripts/lib/... at the filesystem root.
+	if [[ -n "$_candidate" && -f "${_candidate}/scripts/lib/validate-path.sh" ]]; then
+		_ECOSYSTEM_ROOT="$_candidate"
+	fi
+fi
+# Source each ecosystem library only if it is actually present, so a partial
+# install does not print a `source` error from the hook.
+if [[ -n "$_ECOSYSTEM_ROOT" ]]; then
+	for _eco_lib in validate-path.sh onlooker-schema.sh; do
+		if [[ -f "${_ECOSYSTEM_ROOT}/scripts/lib/${_eco_lib}" ]]; then
+			# shellcheck disable=SC1090,SC1091
+			CLAUDE_PLUGIN_ROOT="$_ECOSYSTEM_ROOT" source "${_ECOSYSTEM_ROOT}/scripts/lib/${_eco_lib}"
+		fi
+	done
+	unset _eco_lib
+fi
+# Echo's own libraries (config, project key, ULID, event emission).
+# shellcheck source=../lib/echo-config.sh
+source "${PLUGIN_ROOT}/scripts/lib/echo-config.sh"
+# shellcheck source=../lib/echo-project-key.sh
+source "${PLUGIN_ROOT}/scripts/lib/echo-project-key.sh"
+# shellcheck source=../lib/echo-ulid.sh
+source "${PLUGIN_ROOT}/scripts/lib/echo-ulid.sh"
+# shellcheck source=../lib/echo-events.sh
+CLAUDE_PLUGIN_ROOT="$PLUGIN_ROOT" source "${PLUGIN_ROOT}/scripts/lib/echo-events.sh"
+# Single exit point: every bail-out goes through here so the hook always
+# exits 0 and never blocks Stop.
+_done() { exit 0; }
+# jq parses the hook payload just below, so verify it before its first use
+# instead of after.
+command -v jq >/dev/null 2>&1 || _done
+# Hook payload arrives as JSON on stdin; missing or unparsable fields become
+# empty strings.
+INPUT=$(cat)
+CWD=$(printf '%s' "$INPUT" | jq -r '.cwd // ""' 2>/dev/null) || CWD=""
+SESSION_ID=$(printf '%s' "$INPUT" | jq -r '.session_id // ""' 2>/dev/null) || SESSION_ID=""
+# ---------------------------------------------------------------------------
+# Config + prerequisites
+# ---------------------------------------------------------------------------
+# No repo root means no git context to diff against — skip silently.
+REPO_ROOT=$(echo_project_repo_root "$CWD")
+[[ -z "$REPO_ROOT" ]] && _done
+CLAUDE_PLUGIN_ROOT="$PLUGIN_ROOT" echo_config_load "$REPO_ROOT"
+echo_config_enabled || _done
+PROJECT_KEY=$(echo_project_key "$CWD")
+[[ -z "$PROJECT_KEY" ]] && _done
+# The evaluation pass shells out to the claude CLI.
+command -v claude >/dev/null 2>&1 || _done
+# ---------------------------------------------------------------------------
+# Identify changed agent files
+# ---------------------------------------------------------------------------
+# Collect all changed paths: unstaged, staged, and untracked. Each git call
+# degrades to an empty list on failure.
+CHANGED_FILES=$(git -C "$REPO_ROOT" diff --name-only HEAD 2>/dev/null) || CHANGED_FILES=""
+STAGED_FILES=$(git -C "$REPO_ROOT" diff --name-only --cached 2>/dev/null) || STAGED_FILES=""
+UNTRACKED_FILES=$(git -C "$REPO_ROOT" ls-files --others --exclude-standard 2>/dev/null) || UNTRACKED_FILES=""
+# Merge, de-duplicate and drop blank lines; grep exits non-zero when nothing
+# is left, which maps to "no changes".
+ALL_CHANGED=$(printf '%s\n%s\n%s' "$CHANGED_FILES" "$STAGED_FILES" "$UNTRACKED_FILES" | sort -u | grep -v '^$') || ALL_CHANGED=""
+[[ -z "$ALL_CHANGED" ]] && _done
+# Load watch and exclude patterns (bash 3 compatible — no mapfile).
+WATCH_PATTERNS=()
+while IFS= read -r _pat; do
+	[[ -n "$_pat" ]] && WATCH_PATTERNS+=("$_pat")
+done < <(CLAUDE_PLUGIN_ROOT="$PLUGIN_ROOT" echo_config_watch_paths)
+EXCLUDE_PATTERNS=()
+while IFS= read -r _pat; do
+	[[ -n "$_pat" ]] && EXCLUDE_PATTERNS+=("$_pat")
+done < <(CLAUDE_PLUGIN_ROOT="$PLUGIN_ROOT" echo_config_exclude_paths)
+# With no watch pattern nothing can match. Bailing out here also keeps the
+# "${WATCH_PATTERNS[@]}" expansion below safe: under `set -u`, bash older than
+# 4.4 aborts with "unbound variable" when an empty array is expanded.
+[[ "${#WATCH_PATTERNS[@]}" -eq 0 ]] && _done
+# _echo_matches_any PATH [PATTERN...]
+# Returns 0 if PATH matches at least one PATTERN, 1 otherwise (including when
+# no patterns are given). Patterns are matched with [[ == ]], i.e. against the
+# whole string.
+_echo_matches_any() {
+	local _path="$1" _glob
+	shift
+	# `for x; do` iterates the positional parameters and is safe with zero args.
+	for _glob; do
+		# shellcheck disable=SC2053
+		[[ "$_path" == $_glob ]] && return 0
+	done
+	return 1
+}
+# Filter changed files: must match at least one watch pattern AND no exclude pattern.
+WATCHED_CHANGED=()
+while IFS= read -r f; do
+	[[ -z "$f" ]] && continue
+	_echo_matches_any "$f" "${WATCH_PATTERNS[@]}" || continue
+	# ${arr[@]+"${arr[@]}"} expands to nothing for an empty array, which avoids
+	# the nounset abort on bash < 4.4 when no exclude patterns are configured.
+	_echo_matches_any "$f" ${EXCLUDE_PATTERNS[@]+"${EXCLUDE_PATTERNS[@]}"} && continue
+	WATCHED_CHANGED+=("$f")
+done <<< "$ALL_CHANGED"
+[[ "${#WATCHED_CHANGED[@]}" -eq 0 ]] && _done
+# ---------------------------------------------------------------------------
+# Storage paths
+# ---------------------------------------------------------------------------
+ONLOOKER_BASE="${ONLOOKER_DIR:-$HOME/.onlooker}"
+ECHO_DIR="${ONLOOKER_BASE}/echo/${PROJECT_KEY}"
+BASELINE_DIR="${ECHO_DIR}/baselines"
+mkdir -p "$BASELINE_DIR" 2>/dev/null || _done
+# ---------------------------------------------------------------------------
+# Evaluation loop
+# ---------------------------------------------------------------------------
+EVAL_MODEL=$(CLAUDE_PLUGIN_ROOT="$PLUGIN_ROOT" echo_config_model)
+TIMEOUT_SECS=$(CLAUDE_PLUGIN_ROOT="$PLUGIN_ROOT" echo_config_timeout)
+DRIFT_THRESHOLD=$(CLAUDE_PLUGIN_ROOT="$PLUGIN_ROOT" echo_config_drift_threshold)
+SUITE_ID=$(echo_ulid)
+SUITE_START=$(python3 -c 'import time; print(int(time.time()*1000))' 2>/dev/null || echo 0)
+FIRST_CHANGED="${WATCHED_CHANGED[0]}"
+suite_started_payload=$(jq -n \
+	--arg suite_id "$SUITE_ID" \
+	--argjson test_count "${#WATCHED_CHANGED[@]}" \
+	--arg trigger "file_change" \
+	--arg changed_file "$FIRST_CHANGED" \
+	'{suite_id: $suite_id, test_count: $test_count, trigger: $trigger, changed_file: $changed_file}')
+echo_emit_event "echo.suite.started" "$suite_started_payload" || true
+PROMPT_FILE=$(mktemp -t echo-prompt.XXXXXX 2>/dev/null) || PROMPT_FILE="/tmp/echo-prompt.$$"
+trap 'rm -f "$PROMPT_FILE"' EXIT
+count_improved=0
+count_degraded=0
+count_neutral=0
+sum_before=0
+sum_after=0
+file_count=0
+for rel_path in "${WATCHED_CHANGED[@]}"; do
+	abs_path="${REPO_ROOT}/${rel_path}"
+	[[ ! -f "$abs_path" ]] && continue
+	FILE_CONTENT=$(cat "$abs_path" 2>/dev/null) || continue
+	[[ -z "$FILE_CONTENT" ]] && continue
+	TEST_ID=$(echo_test_id_for_path "$rel_path")
+	BASELINE_FILE="${BASELINE_DIR}/${TEST_ID}.json"
+	# Build the evaluation prompt.
+	{
+		printf '%s\n' 'You are evaluating an agent prompt file for quality. Return JSON only — no prose, no markdown fences.'
+		printf '\n'
+		printf '%s\n' 'Output schema (exactly these keys):'
+		printf '%s\n' '{'
+		printf '%s\n' '  "score": 0.0..1.0,'
+		printf '%s\n' '  "passed": true|false,'
+		printf '%s\n' '  "confidence": 0.0..1.0,'
+		printf '%s\n' '  "feedback": "1-2 sentences on the highest-leverage issue, if any."'
+		printf '%s\n' '}'
+		printf '\n'
+		printf '%s\n' 'Score on these criteria (equal weight):'
+		printf '%s\n' '  - Role clarity: does the file clearly define what the agent is and what it must do?'
+		printf '%s\n' '  - Output format: are output format and schema requirements unambiguous?'
+		printf '%s\n' '  - Criterion coverage: are all evaluation dimensions specified with enough detail to apply consistently?'
+		printf '%s\n' '  - Internal consistency: no contradictory instructions, no undefined terms.'
+		printf '\n'
+		printf '%s\n' "A score >= 0.7 is \"passed\". Be concise."
+		printf '\n'
+		printf '%s\n' "---FILE: ${rel_path}---"
+		printf '%s\n' "$FILE_CONTENT"
+		printf '%s\n' '---END FILE---'
+	} > "$PROMPT_FILE"
+	CLAUDE_ARGS=(-p --max-turns 1)
+	[[ -n "$EVAL_MODEL" ]] && CLAUDE_ARGS+=(--model "$EVAL_MODEL")
+	RESPONSE=""
+	if command -v timeout >/dev/null 2>&1; then
+		RESPONSE=$(timeout "$TIMEOUT_SECS" claude "${CLAUDE_ARGS[@]}" < "$PROMPT_FILE" 2>/dev/null) || RESPONSE=""
+	elif command -v gtimeout >/dev/null 2>&1; then
+		RESPONSE=$(gtimeout "$TIMEOUT_SECS" claude "${CLAUDE_ARGS[@]}" < "$PROMPT_FILE" 2>/dev/null) || RESPONSE=""
+	else
+		RESPONSE=$(claude "${CLAUDE_ARGS[@]}" < "$PROMPT_FILE" 2>/dev/null) || RESPONSE=""
+	fi
+	[[ -z "$RESPONSE" ]] && continue
+	CLEAN=$(printf '%s' "$RESPONSE" | sed -e 's/^```json//' -e 's/^```//' -e 's/```$//')
+	SCORE_AFTER=$(printf '%s' "$CLEAN" | jq -r '.score // empty' 2>/dev/null) || SCORE_AFTER=""
+	CONFIDENCE=$(printf '%s' "$CLEAN" | jq -r '.confidence // "0.6"' 2>/dev/null) || CONFIDENCE="0.6"
+	[[ -z "$SCORE_AFTER" ]] && continue
+	SCORE_BEFORE=""
+	if [[ -f "$BASELINE_FILE" ]]; then
+		SCORE_BEFORE=$(jq -r '.score // empty' "$BASELINE_FILE" 2>/dev/null) || SCORE_BEFORE=""
+	fi
+	# Persist new baseline.
+	jq -n \
+		--arg path "$rel_path" \
+		--arg test_id "$TEST_ID" \
+		--argjson score "$SCORE_AFTER" \
+		--arg ts "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" \
+		'{path: $path, test_id: $test_id, score: $score, recorded_at: $ts}' \
+		> "$BASELINE_FILE" 2>/dev/null || true
+	file_count=$((file_count + 1))
+	sum_after=$(python3 -c "print($sum_after + $SCORE_AFTER)" 2>/dev/null) || sum_after=$sum_after
+	if [[ -n "$SCORE_BEFORE" ]]; then
+		DELTA=$(python3 -c "print(round($SCORE_AFTER - $SCORE_BEFORE, 4))" 2>/dev/null) || DELTA="0"
+		sum_before=$(python3 -c "print($sum_before + $SCORE_BEFORE)" 2>/dev/null) || sum_before=$sum_before
+		ABS_DELTA=$(python3 -c "print(abs($DELTA))" 2>/dev/null) || ABS_DELTA="0"
+		IS_IMPROVED=$(python3 -c "print('true' if $DELTA > $DRIFT_THRESHOLD else 'false')" 2>/dev/null) || IS_IMPROVED="false"
+		IS_DEGRADED=$(python3 -c "print('true' if $DELTA < -$DRIFT_THRESHOLD else 'false')" 2>/dev/null) || IS_DEGRADED="false"
+		FILE_NAME=$(basename "$rel_path")
+		if [[ "$IS_IMPROVED" == "true" ]]; then
+			count_improved=$((count_improved + 1))
+			improvement_payload=$(jq -n \
+				--arg suite_id "$SUITE_ID" \
+				--arg test_id "$TEST_ID" \
+				--arg test_name "$FILE_NAME" \
+				--argjson score_before "$SCORE_BEFORE" \
+				--argjson score_after "$SCORE_AFTER" \
+				--argjson delta "$DELTA" \
+				--argjson confidence "$CONFIDENCE" \
+				'{suite_id: $suite_id, test_id: $test_id, test_name: $test_name,
+				  score_before: $score_before, score_after: $score_after,
+				  delta: $delta, confidence: $confidence}')
+			echo_emit_event "echo.improvement.detected" "$improvement_payload" || true
+		elif [[ "$IS_DEGRADED" == "true" ]]; then
+			count_degraded=$((count_degraded + 1))
+			regression_payload=$(jq -n \
+				--arg suite_id "$SUITE_ID" \
+				--arg test_id "$TEST_ID" \
+				--arg test_name "$FILE_NAME" \
+				--argjson score_before "$SCORE_BEFORE" \
+				--argjson score_after "$SCORE_AFTER" \
+				--argjson delta "$DELTA" \
+				--argjson confidence "$CONFIDENCE" \
+				'{suite_id: $suite_id, test_id: $test_id, test_name: $test_name,
+				  score_before: $score_before, score_after: $score_after,
+				  delta: $delta, confidence: $confidence}')
+			echo_emit_event "echo.regression.detected" "$regression_payload" || true
+		else
+			count_neutral=$((count_neutral + 1))
+		fi
+	else
+		# First evaluation for this file — no baseline to compare against yet.
+		count_neutral=$((count_neutral + 1))
+	fi
+done
+[[ "$file_count" -eq 0 ]] && _done
+# ---------------------------------------------------------------------------
+# Emit suite events
+# ---------------------------------------------------------------------------
+SUITE_END=$(python3 -c 'import time; print(int(time.time()*1000))' 2>/dev/null || echo 0)
+DURATION_MS=$(( SUITE_END - SUITE_START ))
+MERGE_RECOMMENDED="false"
+[[ "$count_degraded" -eq 0 && "$count_improved" -gt 0 ]] && MERGE_RECOMMENDED="true"
+[[ "$count_degraded" -eq 0 && "$count_improved" -eq 0 ]] && MERGE_RECOMMENDED="true"
+if [[ "$file_count" -gt 0 && -n "$sum_before" ]] && python3 -c "exit(0 if $sum_before > 0 else 1)" 2>/dev/null; then
+	BASELINE_AVG=$(python3 -c "print(round($sum_before / $file_count, 4))" 2>/dev/null) || BASELINE_AVG=""
+	AFTER_AVG=$(python3 -c "print(round($sum_after / $file_count, 4))" 2>/dev/null) || AFTER_AVG=""
+	DRIFT=$(python3 -c "print(round($sum_after / $file_count - $sum_before / $file_count, 4))" 2>/dev/null) || DRIFT=""
+	if [[ -n "$BASELINE_AVG" && -n "$AFTER_AVG" && -n "$DRIFT" ]]; then
+		suite_complete_payload=$(jq -n \
+			--arg suite_id "$SUITE_ID" \
+			--argjson test_count "$file_count" \
+			--argjson improved "$count_improved" \
+			--argjson degraded "$count_degraded" \
+			--argjson neutral "$count_neutral" \
+			--argjson merge_recommended "$MERGE_RECOMMENDED" \
+			--argjson duration_ms "$DURATION_MS" \
+			--argjson baseline_score "$BASELINE_AVG" \
+			--argjson score_after "$AFTER_AVG" \
+			--argjson drift "$DRIFT" \
+			--argjson drift_threshold "$DRIFT_THRESHOLD" \
+			'{suite_id: $suite_id, test_count: $test_count,
+			  improved: $improved, degraded: $degraded, neutral: $neutral,
+			  merge_recommended: $merge_recommended, duration_ms: $duration_ms,
+			  baseline_score: $baseline_score, score_after: $score_after,
+			  drift: $drift, drift_threshold: $drift_threshold}')
+	else
+		suite_complete_payload=$(jq -n \
+			--arg suite_id "$SUITE_ID" \
+			--argjson test_count "$file_count" \
+			--argjson improved "$count_improved" \
+			--argjson degraded "$count_degraded" \
+			--argjson neutral "$count_neutral" \
+			--argjson merge_recommended "$MERGE_RECOMMENDED" \
+			--argjson duration_ms "$DURATION_MS" \
+			'{suite_id: $suite_id, test_count: $test_count,
+			  improved: $improved, degraded: $degraded, neutral: $neutral,
+			  merge_recommended: $merge_recommended, duration_ms: $duration_ms}')
+	fi
+else
+	suite_complete_payload=$(jq -n \
+		--arg suite_id "$SUITE_ID" \
+		--argjson test_count "$file_count" \
+		--argjson improved "$count_improved" \
+		--argjson degraded "$count_degraded" \
+		--argjson neutral "$count_neutral" \
+		--argjson merge_recommended "$MERGE_RECOMMENDED" \
+		--argjson duration_ms "$DURATION_MS" \
+		'{suite_id: $suite_id, test_count: $test_count,
+		  improved: $improved, degraded: $degraded, neutral: $neutral,
+		  merge_recommended: $merge_recommended, duration_ms: $duration_ms}')
+fi
+echo_emit_event "echo.suite.complete" "$suite_complete_payload" || true
+# ---------------------------------------------------------------------------
+# Write advisory file for review in next session.
+# ---------------------------------------------------------------------------
+SAFE_SESSION_ID=$(printf '%s' "${SESSION_ID:-unknown}" | tr -c 'a-zA-Z0-9-' '_')
+jq -n \
+	--arg suite_id "$SUITE_ID" \
+	--arg session_id "${SESSION_ID:-unknown}" \
+	--argjson test_count "$file_count" \
+	--argjson improved "$count_improved" \
+	--argjson degraded "$count_degraded" \
+	--argjson neutral "$count_neutral" \
+	--argjson merge_recommended "$MERGE_RECOMMENDED" \
+	--argjson files "$(printf '%s\n' "${WATCHED_CHANGED[@]}" | jq -R . | jq -s .)" \
+	'{suite_id: $suite_id, session_id: $session_id, test_count: $test_count,
+	  improved: $improved, degraded: $degraded, neutral: $neutral,
+	  merge_recommended: $merge_recommended, files: $files}' \
+	> "${ECHO_DIR}/run-${SAFE_SESSION_ID}.json" 2>/dev/null || true
+_done