@onlooker-community/ecosystem 0.10.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106) hide show
  1. package/.claude-plugin/marketplace.json +39 -1
  2. package/.claude-plugin/plugin.json +2 -2
  3. package/.github/copilot-instructions.md +46 -0
  4. package/.github/workflows/coverage.yml +78 -0
  5. package/.github/workflows/release.yml +24 -8
  6. package/.github/workflows/test.yml +3 -0
  7. package/.markdownlintignore +3 -0
  8. package/.release-please-manifest.json +4 -1
  9. package/CHANGELOG.md +37 -0
  10. package/README.md +57 -13
  11. package/config.json +6 -1
  12. package/docs/adr/001-claude-code-hooks-as-integration-surface.md +43 -0
  13. package/docs/adr/002-centralized-jsonl-event-log.md +39 -0
  14. package/docs/adr/003-ulid-over-uuid.md +40 -0
  15. package/docs/adr/004-plugin-config-with-settings-overlay.md +34 -0
  16. package/docs/architecture.md +117 -0
  17. package/hooks/hooks.json +4 -0
  18. package/package.json +13 -7
  19. package/plugins/archivist/.claude-plugin/plugin.json +14 -0
  20. package/plugins/archivist/CHANGELOG.md +8 -0
  21. package/plugins/archivist/README.md +105 -0
  22. package/plugins/archivist/config.json +18 -0
  23. package/plugins/archivist/hooks/hooks.json +35 -0
  24. package/plugins/archivist/scripts/hooks/archivist-extract.sh +238 -0
  25. package/plugins/archivist/scripts/hooks/archivist-inject.sh +159 -0
  26. package/plugins/archivist/scripts/lib/archivist-config.sh +66 -0
  27. package/plugins/archivist/scripts/lib/archivist-project-key.sh +91 -0
  28. package/plugins/archivist/scripts/lib/archivist-storage.sh +215 -0
  29. package/plugins/archivist/scripts/lib/archivist-ulid.sh +52 -0
  30. package/plugins/echo/.claude-plugin/plugin.json +14 -0
  31. package/plugins/echo/CHANGELOG.md +24 -0
  32. package/plugins/echo/README.md +110 -0
  33. package/plugins/echo/config.json +15 -0
  34. package/plugins/echo/docs/adr/001-echo-as-separate-plugin.md +33 -0
  35. package/plugins/echo/docs/adr/002-direct-evaluation-vs-tribunal-pipeline.md +35 -0
  36. package/plugins/echo/docs/adr/003-stop-hook-trigger.md +40 -0
  37. package/plugins/echo/hooks/hooks.json +15 -0
  38. package/plugins/echo/scripts/hooks/echo-stop-gate.sh +366 -0
  39. package/plugins/echo/scripts/lib/echo-config.sh +108 -0
  40. package/plugins/echo/scripts/lib/echo-events.sh +74 -0
  41. package/plugins/echo/scripts/lib/echo-project-key.sh +81 -0
  42. package/plugins/echo/scripts/lib/echo-ulid.sh +46 -0
  43. package/plugins/tribunal/.claude-plugin/plugin.json +20 -0
  44. package/plugins/tribunal/CHANGELOG.md +10 -0
  45. package/plugins/tribunal/README.md +134 -0
  46. package/plugins/tribunal/agents/tribunal-actor.md +35 -0
  47. package/plugins/tribunal/agents/tribunal-judge-adversarial.md +51 -0
  48. package/plugins/tribunal/agents/tribunal-judge-security.md +47 -0
  49. package/plugins/tribunal/agents/tribunal-judge-standard.md +47 -0
  50. package/plugins/tribunal/agents/tribunal-meta-judge.md +61 -0
  51. package/plugins/tribunal/config.json +50 -0
  52. package/plugins/tribunal/docs/adr/001-actor-jury-meta-gate-loop.md +40 -0
  53. package/plugins/tribunal/docs/adr/002-majority-gate-policy.md +48 -0
  54. package/plugins/tribunal/hooks/hooks.json +15 -0
  55. package/plugins/tribunal/scripts/hooks/tribunal-stop-gate.sh +267 -0
  56. package/plugins/tribunal/scripts/lib/tribunal-aggregate.sh +65 -0
  57. package/plugins/tribunal/scripts/lib/tribunal-config.sh +101 -0
  58. package/plugins/tribunal/scripts/lib/tribunal-events.sh +97 -0
  59. package/plugins/tribunal/scripts/lib/tribunal-gate.sh +111 -0
  60. package/plugins/tribunal/scripts/lib/tribunal-jury.sh +102 -0
  61. package/plugins/tribunal/scripts/lib/tribunal-project-key.sh +84 -0
  62. package/plugins/tribunal/scripts/lib/tribunal-rubric.sh +153 -0
  63. package/plugins/tribunal/scripts/lib/tribunal-ulid.sh +50 -0
  64. package/plugins/tribunal/scripts/lib/tribunal-verdict.sh +127 -0
  65. package/plugins/tribunal/skills/tribunal/SKILL.md +129 -0
  66. package/release-please-config.json +43 -5
  67. package/scripts/coverage/bash-coverage.mjs +169 -0
  68. package/scripts/coverage/format-comment.mjs +120 -0
  69. package/scripts/coverage/run-coverage.mjs +151 -0
  70. package/scripts/hooks/agent-spawn-tracker.sh +4 -4
  71. package/scripts/hooks/prompt-rule-injector.sh +122 -0
  72. package/scripts/lib/portable-lock.sh +48 -0
  73. package/scripts/lib/prompt-rules.sh +207 -0
  74. package/scripts/lib/tool-history.sh +7 -8
  75. package/scripts/lib/validate-path.sh +4 -0
  76. package/scripts/lint/check-manifests.mjs +314 -0
  77. package/scripts/lint/check-references.mjs +311 -0
  78. package/skills/list-prompt-rules/SKILL.md +15 -0
  79. package/test/bats/archivist-config-files.bats +60 -0
  80. package/test/bats/archivist-config.bats +54 -0
  81. package/test/bats/archivist-inject.bats +73 -0
  82. package/test/bats/archivist-project-key.bats +75 -0
  83. package/test/bats/archivist-storage.bats +119 -0
  84. package/test/bats/archivist-ulid.bats +36 -0
  85. package/test/bats/config.bats +10 -10
  86. package/test/bats/echo-config.bats +90 -0
  87. package/test/bats/echo-events.bats +121 -0
  88. package/test/bats/echo-project-key.bats +115 -0
  89. package/test/bats/echo-stop-hook.bats +101 -0
  90. package/test/bats/echo-ulid.bats +38 -0
  91. package/test/bats/portable-lock.bats +62 -0
  92. package/test/bats/prompt-rules.bats +269 -0
  93. package/test/bats/tribunal-aggregate.bats +77 -0
  94. package/test/bats/tribunal-config.bats +86 -0
  95. package/test/bats/tribunal-events.bats +209 -0
  96. package/test/bats/tribunal-gate.bats +95 -0
  97. package/test/bats/tribunal-jury.bats +80 -0
  98. package/test/bats/tribunal-rubric.bats +119 -0
  99. package/test/bats/tribunal-stop-hook.bats +73 -0
  100. package/test/bats/tribunal-verdict.bats +71 -0
  101. package/test/fixtures/hook-inputs/user-prompt-submit-rule-match.json +8 -0
  102. package/test/fixtures/hook-inputs/user-prompt-submit-rule-nomatch.json +8 -0
  103. package/test/helpers/setup.bash +9 -0
  104. package/test/node/check-manifests.test.mjs +173 -0
  105. package/test/node/check-references.test.mjs +279 -0
  106. package/test/node/coverage.test.mjs +143 -0
@@ -0,0 +1,110 @@
1
+ # Echo
2
+
3
+ Prompt-change regression detection for the Onlooker ecosystem.
4
+
5
+ When a watched agent file is modified, Echo runs a single-judge quality pass on the file via `claude -p` and compares the score against a stored baseline. It reports whether the change **improved**, **degraded**, or had **no measurable effect** on prompt quality — giving every prompt edit a before/after signal instead of relying on intuition.
6
+
7
+ Echo is a sibling plugin to [`ecosystem`](../../) and assumes the Onlooker observability substrate (`~/.onlooker/`) is present.
8
+
9
+ ## How it works
10
+
11
+ Echo registers a **Stop hook** that fires at the end of every Claude Code session. When triggered:
12
+
13
+ 1. Detects which watched files changed (unstaged, staged, or untracked).
14
+ 2. Filters against configured `watch_paths` and `exclude_paths` patterns.
15
+ 3. For each matching file, builds a rubric prompt and calls `claude -p --max-turns 1` to score it on four criteria: role clarity, output format, criterion coverage, and internal consistency.
16
+ 4. Compares the score to a stored baseline (if one exists) and emits `echo.improvement.detected` or `echo.regression.detected`.
17
+ 5. Emits `echo.suite.complete` with aggregate drift, a `merge_recommended` flag, and duration.
18
+
19
+ The hook always exits 0 — it never blocks a session from ending.
20
+
21
+ ## Activation
22
+
23
+ Echo is **off by default**. Enable it per-project in `.claude/settings.json`:
24
+
25
+ ```json
26
+ {
27
+ "echo": {
28
+ "enabled": true
29
+ }
30
+ }
31
+ ```
32
+
33
+ ## Configuration
34
+
35
+ All keys are optional. Unset keys fall back to the plugin's `config.json` defaults.
36
+
37
+ ```json
38
+ {
39
+ "echo": {
40
+ "enabled": true,
41
+ "watch_paths": ["plugins/*/agents/*.md"],
42
+ "exclude_paths": [],
43
+ "drift_threshold": 0.05,
44
+ "evaluation": {
45
+ "model": "claude-haiku-4-5-20251001",
46
+ "timeout_seconds": 60
47
+ }
48
+ }
49
+ }
50
+ ```
51
+
52
+ | Key | Default | Description |
53
+ |-----|---------|-------------|
54
+ | `enabled` | `false` | Must be `true` for any evaluation to run. |
55
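+ | `watch_paths` | `["plugins/*/agents/*.md"]` | Glob patterns (relative to repo root) of files to watch. Bash extended glob syntax. Each pattern is matched against the whole relative path with Bash `[[ … == … ]]`, so `*` also matches `/`. |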
56
+ | `exclude_paths` | `[]` | Patterns to exclude. `plugins/echo/**` is always excluded regardless of this setting. |
57
+ | `drift_threshold` | `0.05` | Minimum absolute score delta to classify a change as improvement or regression. Deltas below this are reported as neutral. |
58
+ | `evaluation.model` | `claude-haiku-4-5-20251001` | Model used for the quality pass. Haiku is fast and cheap; upgrade to Sonnet for higher-stakes repos. |
59
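+ | `evaluation.timeout_seconds` | `60` | Per-file wall-clock timeout passed to the `timeout` command (or `gtimeout` when only that is installed). If neither command is available, the evaluation runs without a timeout. |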
60
+
61
+ ## Scoring rubric
62
+
63
+ Each watched file is scored 0.0–1.0 on four equally-weighted criteria:
64
+
65
+ | Criterion | What it checks |
66
+ |-----------|---------------|
67
+ | **Role clarity** | Does the file clearly define what the agent is and what it must do? |
68
+ | **Output format** | Are output format and schema requirements unambiguous? |
69
+ | **Criterion coverage** | Are all evaluation dimensions specified with enough detail to apply consistently? |
70
+ | **Internal consistency** | No contradictory instructions; no undefined terms. |
71
+
72
+ A score ≥ 0.7 is considered "passed". A delta beyond `drift_threshold` in either direction is classified as improvement or regression.
73
+
74
+ ## Storage layout
75
+
76
+ ```text
77
+ ~/.onlooker/echo/<project-key>/
78
+ ├── baselines/
79
+ │ └── <test-id>.json # one per watched file (test-id = first 16 hex of SHA256 of path)
80
+ └── run-<session-id>.json # advisory summary written at end of each suite
81
+ ```
82
+
83
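+ Project key: first 12 hex chars of SHA256 of `git remote get-url origin`, falling back to a hash of the repo root realpath. When an `origin` remote exists, this makes the key stable across directory moves and clones of the same repo; with the realpath fallback, the key changes if the repository directory is moved.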
84
+
85
+ ## Events emitted
86
+
87
+ Echo emits the canonical `echo.*` event surface from [`@onlooker-community/schema`](https://github.com/onlooker-community/schema) v2.2.0+. All events land in `~/.onlooker/logs/onlooker-events.jsonl` and are validated against the schema before write.
88
+
89
+ | Event | When |
90
+ |-------|------|
91
+ | `echo.suite.started` | Before the evaluation loop begins. Includes `test_count` and `changed_file`. |
92
+ | `echo.improvement.detected` | A file's score increased beyond `drift_threshold`. |
93
+ | `echo.regression.detected` | A file's score decreased beyond `drift_threshold`. |
94
+ | `echo.suite.complete` | After all files are evaluated. Includes aggregate drift fields when a prior baseline exists. |
95
+
96
+ ## Requirements
97
+
98
+ - The `ecosystem` plugin installed (for the `~/.onlooker/` substrate and canonical event emission).
99
+ - `claude` CLI on `PATH` (the hook shells out to `claude -p` for evaluation passes).
100
+ - `jq` for JSON manipulation.
101
+ - `node` for canonical-event emission.
102
+ - `python3` for millisecond timestamps (standard on macOS and most Linux distributions).
103
+
104
+ ## Architecture decisions
105
+
106
+ Key decisions made during initial design are recorded in [`docs/adr/`](docs/adr/):
107
+
108
+ - [ADR-001](docs/adr/001-echo-as-separate-plugin.md) — Echo as a separate plugin, not an extension of Tribunal
109
+ - [ADR-002](docs/adr/002-direct-evaluation-vs-tribunal-pipeline.md) — Direct `claude -p` evaluation vs. routing through Tribunal's full pipeline
110
+ - [ADR-003](docs/adr/003-stop-hook-trigger.md) — Stop hook as the trigger mechanism
@@ -0,0 +1,15 @@
1
+ {
2
+ "plugin_name": "echo",
3
+ "storage_path": "~/.onlooker",
4
+ "echo": {
5
+ "enabled": false,
6
+ "watch_paths": ["plugins/*/agents/*.md"],
7
+ "exclude_paths": [],
8
+ "drift_threshold": 0.05,
9
+ "evaluation": {
10
+ "model": "claude-haiku-4-5-20251001",
11
+ "max_output_tokens": 512,
12
+ "timeout_seconds": 60
13
+ }
14
+ }
15
+ }
@@ -0,0 +1,33 @@
1
+ # ADR-001: Echo as a Separate Plugin, Not an Extension of Tribunal
2
+
3
+ **Status:** Accepted
4
+ **Date:** 2026-05-24
5
+
6
+ ## Context
7
+
8
+ Tribunal already exists as an evaluation engine in this ecosystem. When designing Echo's prompt-change regression detection, the first question was whether Echo should live inside Tribunal (as a sub-feature or mode) or stand alone as its own plugin.
9
+
10
+ The Tribunal team ran a formal evaluation of this question using Tribunal itself. The final score across three iterations was 0.79 (above the 0.75 acceptance threshold), with outcome `exhausted_iterations` — the adversarial judge never passed, which is expected behavior for that judge type. The substantive conclusion from all three iterations was: Echo is architecturally sound as a standalone plugin.
11
+
12
+ ## Decision
13
+
14
+ Echo is a separate, independent plugin under `plugins/echo/`.
15
+
16
+ ## Rationale
17
+
18
+ **Separate concerns.** Tribunal is an orchestrator for arbitrary tasks. Echo is a specialized harness for prompt quality regression testing. Bundling Echo into Tribunal would couple two distinct concerns: general task evaluation and change-detection/baselining.
19
+
20
+ **Different lifecycle.** Tribunal's `/tribunal` skill is always available, and only its Stop hook is opt-in. Echo is off by default (`"enabled": false`) — it is entirely opt-in — and has no interactive skill surface; it only runs as a Stop hook. These are different activation patterns that would conflict if forced into the same plugin config namespace.
21
+
22
+ **Independent versioning.** Echo and Tribunal can release, iterate, and break/fix independently. Echo v0.2 does not need to drag along a Tribunal major bump.
23
+
24
+ **Composability.** Echo today calls `claude -p` directly. A future version could delegate to Tribunal for richer multi-judge evaluation (see ADR-002). That migration is easier when Echo is its own entry point.
25
+
26
+ **Self-exclusion.** Echo must never trigger on its own files changing. This is simpler to enforce as a first-class concern in a standalone plugin (`plugins/echo/**` is always in `exclude_paths`) than as a special case inside Tribunal.
27
+
28
+ ## Consequences
29
+
30
+ - Echo requires the ecosystem plugin but does **not** require Tribunal to be installed.
31
+ - Echo gets its own `config.json`, `hooks.json`, `.claude-plugin/plugin.json`, CHANGELOG, and release-please track.
32
+ - Any future Tribunal integration (e.g., Echo delegating multi-judge eval to Tribunal) will be an opt-in config option, not a hard dependency.
33
+ - Marketplace listing and docs must be careful not to imply Tribunal is a prerequisite (an early draft of the description made this mistake; corrected before merge).
@@ -0,0 +1,35 @@
1
+ # ADR-002: Direct `claude -p` Evaluation vs. Routing Through Tribunal's Pipeline
2
+
3
+ **Status:** Accepted (with planned future extension)
4
+ **Date:** 2026-05-24
5
+
6
+ ## Context
7
+
8
+ Echo needs to evaluate prompt file quality before and after a change. Two approaches were available:
9
+
10
+ **Option A — Direct `claude -p`**: Build an inline rubric prompt, call `claude -p --max-turns 1` for each file, and parse the JSON score from the response.
11
+
12
+ **Option B — Tribunal pipeline**: Invoke Tribunal's multi-judge Actor → Jury → Meta-Judge → Gate loop for each file and use the aggregated score as the quality signal.
13
+
14
+ ## Decision
15
+
16
+ Echo v0.1 uses **Option A** — direct `claude -p` with an inline rubric.
17
+
18
+ ## Rationale
19
+
20
+ **Stop hook latency budget.** A Stop hook fires synchronously at the end of every session. Tribunal's full loop (Actor + two judges + Meta-Judge + Gate, with potential retries) takes 30–120 seconds per task. Multiplied across several watched files, this would make sessions feel like they hang after every edit. A single `claude -p` call with a 60-second timeout keeps the overhead acceptable.
21
+
22
+ **Echo evaluates prompts, not outputs.** Tribunal's loop is designed to evaluate an Agent's *work product* against a rubric. Echo evaluates *the prompt file itself* — a simpler, single-document task. A full jury is architecturally overweight for this use case.
23
+
24
+ **Baseline stability.** Tribunal's multi-judge scores have meaningful variance across runs (different judge models, adversarial judge behavior, Meta-Judge overrides). Echo's baseline comparison depends on stable, reproducible scores — a single `claude -p` pass with a fixed model and rubric is more consistent as a yardstick.
25
+
26
+ **Haiku is cheap enough.** Evaluating a prompt file with Haiku costs a fraction of a cent. Running a full Tribunal loop (Opus-class models for judges) would cost 10–50× more per file per session. With a default model of `claude-haiku-4-5-20251001`, Echo can run automatically without raising cost concerns.
27
+
28
+ **Independent of Tribunal installation.** Option A requires only the `claude` CLI. Option B would make Tribunal a hard runtime dependency of Echo, coupling two plugins that have separate versioning and installation paths (see ADR-001).
29
+
30
+ ## Consequences
31
+
32
+ - Echo's evaluation quality is bounded by a single-model, single-pass rubric. It will miss issues that a diverse jury would catch, but it is consistent enough to detect regressions.
33
+ - The scoring rubric (role clarity, output format, criterion coverage, internal consistency) is hardcoded in the hook rather than being user-overridable in v0.1. A future version should expose this as config.
34
+ - A future `echo.mode: "tribunal"` config option could delegate to Tribunal's jury for higher-confidence evaluation when cost and latency are acceptable. The current design leaves room for this — Echo's event schema (`echo.suite.started`, etc.) is agnostic to the underlying evaluator.
35
+ - The `claude -p` response parsing includes a `sed` strip for accidental markdown fences, which Tribunal's pipeline avoids by using structured judge output. This is a fragility to watch.
@@ -0,0 +1,40 @@
1
+ # ADR-003: Stop Hook as the Trigger Mechanism
2
+
3
+ **Status:** Accepted
4
+ **Date:** 2026-05-24
5
+
6
+ ## Context
7
+
8
+ Echo needs to know when an agent file has changed and run an evaluation. Several trigger points were considered:
9
+
10
+ - **Stop hook** — fires when a Claude Code session ends.
11
+ - **Pre-commit hook** — fires when the developer runs `git commit`.
12
+ - **PostToolUse hook** — fires after every tool call that writes a file.
13
+ - **CI step** — fires on push to a remote branch.
14
+ - **Manual `/echo` skill** — user-invoked on demand.
15
+
16
+ ## Decision
17
+
18
+ Echo v0.1 uses the **Stop hook**.
19
+
20
+ ## Rationale
21
+
22
+ **Correct granularity.** A session is the natural unit of prompt engineering work. A developer edits `tribunal-judge-standard.md`, tests it through several turns, and ends the session. That's the moment Echo should fire — after the work is done, not after each intermediate save.
23
+
24
+ **Claude Code already provides it.** The Stop hook is a first-class Claude Code hook type with a well-defined contract: the hook receives `{cwd, session_id}` on stdin and must exit 0 (or the session stop is blocked, which is why Echo always exits 0). No additional tooling or git hooks needed.
25
+
26
+ **Consistent with Tribunal's pattern.** Tribunal's Stop hook (when enabled) follows the same pattern — an advisory pass that fires at session end without blocking the stop. Echo mirrors this, which keeps the plugin model coherent across the ecosystem.
27
+
28
+ **No commit discipline required.** A pre-commit hook would only fire when the developer commits. Many prompt engineering workflows involve many experimental edits before any commit. Echo should capture signal on *any* session where a watched file changed, not only committed ones. Untracked and unstaged files are explicitly included in Echo's change detection.
29
+
30
+ **Low friction.** PostToolUse fires on every file write, which would run evaluations continuously mid-session — expensive, noisy, and disruptive. The Stop hook batches all changes from a session into a single suite run.
31
+
32
+ **Not CI.** CI integration has value but is a separate concern. A CI step can't write to `~/.onlooker/` on the developer's machine, and the baseline comparison is inherently local. Echo is a local development feedback tool; CI integration (e.g., posting drift to a PR comment) is a future feature.
33
+
34
+ ## Consequences
35
+
36
+ - The recursion guard (`ECHO_NESTED=1`) is mandatory. `claude -p` spawns a subprocess that also triggers Stop, which would re-enter the hook infinitely. The guard must be set before any work begins and is checked as the very first statement.
37
+ - Echo cannot fire mid-session, so rapid iteration on a prompt file produces one signal per session, not one per edit. This is a feature for reducing noise, but means a long session with many edits only records the final state of each file.
38
+ - If a session ends abnormally (e.g., the developer closes the terminal abruptly), the Stop hook may not fire and no evaluation is recorded for that session. This is consistent with how all Stop hooks in Claude Code behave.
39
+ - Users who want on-demand evaluation can invoke Echo's logic manually by calling the hook directly. A future `/echo` skill could wrap this.
40
+ - The hook must be registered in `hooks.json` with `"matcher": "*"` so it fires on all sessions. Projects that want to opt out can set `echo.enabled: false` rather than removing the hook registration.
@@ -0,0 +1,15 @@
1
+ {
2
+ "hooks": {
3
+ "Stop": [
4
+ {
5
+ "matcher": "*",
6
+ "hooks": [
7
+ {
8
+ "type": "command",
9
+ "command": "\"$CLAUDE_PLUGIN_ROOT\"/scripts/hooks/echo-stop-gate.sh"
10
+ }
11
+ ]
12
+ }
13
+ ]
14
+ }
15
+ }
@@ -0,0 +1,366 @@
1
+ #!/usr/bin/env bash
2
+ # Echo Stop-gate hook.
3
+ #
4
+ # Triggered by Stop. Off by default — gated on echo.enabled in config.
5
+ # When enabled, detects which watched agent files changed in this session,
6
+ # runs a single-judge advisory pass on each, and compares the score against a
7
+ # stored baseline to report improved / degraded / neutral.
8
+ #
9
+ # Hook contract:
10
+ # - Always exits 0. Never blocks Stop.
11
+ # - Skips silently if disabled, no git context, or no watched files changed.
12
+ # - Recursion guard: exits immediately if ECHO_NESTED=1 to prevent a claude -p
13
+ # subprocess from re-triggering this hook when its own session stops.
14
+ # - Errors from `claude -p` are swallowed; worst case is no verdict written.
15
+
16
+ set -uo pipefail
17
+
18
+ # Recursion guard — must be first.
19
+ [[ "${ECHO_NESTED:-}" == "1" ]] && exit 0
20
+ export ECHO_NESTED=1
21
+
22
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
23
+ PLUGIN_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
24
+
25
+ # Resolve the ecosystem root: ONLOOKER_ECOSYSTEM_ROOT if set, else two levels above PLUGIN_ROOT.
26
+ _ECOSYSTEM_ROOT="${ONLOOKER_ECOSYSTEM_ROOT:-}"
27
+ if [[ -z "$_ECOSYSTEM_ROOT" ]]; then
28
+ _candidate="$(cd "${PLUGIN_ROOT}/../.." 2>/dev/null && pwd)"
29
+ if [[ -f "${_candidate}/scripts/lib/validate-path.sh" ]]; then
30
+ _ECOSYSTEM_ROOT="$_candidate"
31
+ fi
32
+ fi
33
+
34
+ if [[ -n "$_ECOSYSTEM_ROOT" && -f "${_ECOSYSTEM_ROOT}/scripts/lib/validate-path.sh" ]]; then
35
+ # shellcheck disable=SC1091
36
+ CLAUDE_PLUGIN_ROOT="$_ECOSYSTEM_ROOT" source "${_ECOSYSTEM_ROOT}/scripts/lib/validate-path.sh"
37
+ # shellcheck disable=SC1091
38
+ CLAUDE_PLUGIN_ROOT="$_ECOSYSTEM_ROOT" source "${_ECOSYSTEM_ROOT}/scripts/lib/onlooker-schema.sh"
39
+ fi
40
+
41
+ # shellcheck source=../lib/echo-config.sh
42
+ source "${PLUGIN_ROOT}/scripts/lib/echo-config.sh"
43
+ # shellcheck source=../lib/echo-project-key.sh
44
+ source "${PLUGIN_ROOT}/scripts/lib/echo-project-key.sh"
45
+ # shellcheck source=../lib/echo-ulid.sh
46
+ source "${PLUGIN_ROOT}/scripts/lib/echo-ulid.sh"
47
+ # shellcheck source=../lib/echo-events.sh
48
+ CLAUDE_PLUGIN_ROOT="$PLUGIN_ROOT" source "${PLUGIN_ROOT}/scripts/lib/echo-events.sh"
49
+
50
+ INPUT=$(cat)
51
+ CWD=$(printf '%s' "$INPUT" | jq -r '.cwd // ""' 2>/dev/null) || CWD=""
52
+ SESSION_ID=$(printf '%s' "$INPUT" | jq -r '.session_id // ""' 2>/dev/null) || SESSION_ID=""
53
+
54
+ _done() { exit 0; }
55
+
56
+ # ---------------------------------------------------------------------------
57
+ # Config + prerequisites
58
+ # ---------------------------------------------------------------------------
59
+
60
+ REPO_ROOT=$(echo_project_repo_root "$CWD")
61
+ [[ -z "$REPO_ROOT" ]] && _done
62
+
63
+ CLAUDE_PLUGIN_ROOT="$PLUGIN_ROOT" echo_config_load "$REPO_ROOT"
64
+ echo_config_enabled || _done
65
+
66
+ PROJECT_KEY=$(echo_project_key "$CWD")
67
+ [[ -z "$PROJECT_KEY" ]] && _done
68
+
69
+ command -v claude >/dev/null 2>&1 || _done
70
+ command -v jq >/dev/null 2>&1 || _done
71
+
72
+ # ---------------------------------------------------------------------------
73
+ # Identify changed agent files
74
+ # ---------------------------------------------------------------------------
75
+
76
+ # Collect all changed paths: unstaged, staged, and untracked.
77
+ CHANGED_FILES=$(git -C "$REPO_ROOT" diff --name-only HEAD 2>/dev/null) || CHANGED_FILES=""
78
+ STAGED_FILES=$(git -C "$REPO_ROOT" diff --name-only --cached 2>/dev/null) || STAGED_FILES=""
79
+ UNTRACKED_FILES=$(git -C "$REPO_ROOT" ls-files --others --exclude-standard 2>/dev/null) || UNTRACKED_FILES=""
80
+ ALL_CHANGED=$(printf '%s\n%s\n%s' "$CHANGED_FILES" "$STAGED_FILES" "$UNTRACKED_FILES" | sort -u | grep -v '^$') || ALL_CHANGED=""
81
+ [[ -z "$ALL_CHANGED" ]] && _done
82
+
83
+ # Load watch and exclude patterns (bash 3 compatible — no mapfile).
84
+ WATCH_PATTERNS=()
85
+ while IFS= read -r _pat; do
86
+ [[ -n "$_pat" ]] && WATCH_PATTERNS+=("$_pat")
87
+ done < <(CLAUDE_PLUGIN_ROOT="$PLUGIN_ROOT" echo_config_watch_paths)
88
+
89
+ EXCLUDE_PATTERNS=()
90
+ while IFS= read -r _pat; do
91
+ [[ -n "$_pat" ]] && EXCLUDE_PATTERNS+=("$_pat")
92
+ done < <(CLAUDE_PLUGIN_ROOT="$PLUGIN_ROOT" echo_config_exclude_paths)
93
+
94
+ # Filter changed files: must match at least one watch pattern AND no exclude pattern.
95
+ WATCHED_CHANGED=()
96
+ while IFS= read -r f; do  # one changed path (relative to repo root) per line
97
+ [[ -z "$f" ]] && continue
98
+ # Keep the file only if it matches at least one watch pattern.
99
+ local_match=0
100
+ for pat in ${WATCH_PATTERNS[@]+"${WATCH_PATTERNS[@]}"}; do  # +-guard: an empty array is "unbound" under set -u on bash < 4.4 and would abort the hook non-zero
101
+ # shellcheck disable=SC2053
102
+ if [[ "$f" == $pat ]]; then
103
+ local_match=1
104
+ break
105
+ fi
106
+ done
107
+ [[ "$local_match" -eq 0 ]] && continue
108
+ # Drop the file if any exclude pattern matches it.
109
+ excluded=0
110
+ for pat in ${EXCLUDE_PATTERNS[@]+"${EXCLUDE_PATTERNS[@]}"}; do  # same empty-array guard as above
111
+ # shellcheck disable=SC2053
112
+ if [[ "$f" == $pat ]]; then
113
+ excluded=1
114
+ break
115
+ fi
116
+ done
117
+ [[ "$excluded" -eq 1 ]] && continue
118
+
119
+ WATCHED_CHANGED+=("$f")
120
+ done <<< "$ALL_CHANGED"
121
+
122
+ [[ "${#WATCHED_CHANGED[@]}" -eq 0 ]] && _done
123
+
124
+ # ---------------------------------------------------------------------------
125
+ # Storage paths
126
+ # ---------------------------------------------------------------------------
127
+
128
+ ONLOOKER_BASE="${ONLOOKER_DIR:-$HOME/.onlooker}"
129
+ ECHO_DIR="${ONLOOKER_BASE}/echo/${PROJECT_KEY}"
130
+ BASELINE_DIR="${ECHO_DIR}/baselines"
131
+ mkdir -p "$BASELINE_DIR" 2>/dev/null || _done
132
+
133
+ # ---------------------------------------------------------------------------
134
+ # Evaluation loop
135
+ # ---------------------------------------------------------------------------
136
+
137
+ EVAL_MODEL=$(CLAUDE_PLUGIN_ROOT="$PLUGIN_ROOT" echo_config_model)
138
+ TIMEOUT_SECS=$(CLAUDE_PLUGIN_ROOT="$PLUGIN_ROOT" echo_config_timeout)
139
+ DRIFT_THRESHOLD=$(CLAUDE_PLUGIN_ROOT="$PLUGIN_ROOT" echo_config_drift_threshold)
140
+
141
+ SUITE_ID=$(echo_ulid)
142
+ SUITE_START=$(python3 -c 'import time; print(int(time.time()*1000))' 2>/dev/null || echo 0)
143
+ FIRST_CHANGED="${WATCHED_CHANGED[0]}"
144
+
145
+ suite_started_payload=$(jq -n \
146
+ --arg suite_id "$SUITE_ID" \
147
+ --argjson test_count "${#WATCHED_CHANGED[@]}" \
148
+ --arg trigger "file_change" \
149
+ --arg changed_file "$FIRST_CHANGED" \
150
+ '{suite_id: $suite_id, test_count: $test_count, trigger: $trigger, changed_file: $changed_file}')
151
+ echo_emit_event "echo.suite.started" "$suite_started_payload" || true
152
+
153
+ PROMPT_FILE=$(mktemp -t echo-prompt.XXXXXX 2>/dev/null) || PROMPT_FILE=$(mktemp "${TMPDIR:-/tmp}/echo-prompt.XXXXXX" 2>/dev/null) || _done  # never fall back to a predictable /tmp name (symlink/clobber risk); skip the run if no safe temp file can be created
154
+ trap 'rm -f -- "$PROMPT_FILE"' EXIT  # remove the prompt scratch file on every exit path
155
+
156
+ count_improved=0
157
+ count_degraded=0
158
+ count_neutral=0
159
+ sum_before=0
160
+ sum_after=0
161
+ file_count=0
162
+
163
+ for rel_path in "${WATCHED_CHANGED[@]}"; do
164
+ abs_path="${REPO_ROOT}/${rel_path}"
165
+ [[ ! -f "$abs_path" ]] && continue
166
+
167
+ FILE_CONTENT=$(cat "$abs_path" 2>/dev/null) || continue
168
+ [[ -z "$FILE_CONTENT" ]] && continue
169
+
170
+ TEST_ID=$(echo_test_id_for_path "$rel_path")
171
+ BASELINE_FILE="${BASELINE_DIR}/${TEST_ID}.json"
172
+
173
+ # Build the evaluation prompt.
174
+ {
175
+ printf '%s\n' 'You are evaluating an agent prompt file for quality. Return JSON only — no prose, no markdown fences.'
176
+ printf '\n'
177
+ printf '%s\n' 'Output schema (exactly these keys):'
178
+ printf '%s\n' '{'
179
+ printf '%s\n' ' "score": 0.0..1.0,'
180
+ printf '%s\n' ' "passed": true|false,'
181
+ printf '%s\n' ' "confidence": 0.0..1.0,'
182
+ printf '%s\n' ' "feedback": "1-2 sentences on the highest-leverage issue, if any."'
183
+ printf '%s\n' '}'
184
+ printf '\n'
185
+ printf '%s\n' 'Score on these criteria (equal weight):'
186
+ printf '%s\n' ' - Role clarity: does the file clearly define what the agent is and what it must do?'
187
+ printf '%s\n' ' - Output format: are output format and schema requirements unambiguous?'
188
+ printf '%s\n' ' - Criterion coverage: are all evaluation dimensions specified with enough detail to apply consistently?'
189
+ printf '%s\n' ' - Internal consistency: no contradictory instructions, no undefined terms.'
190
+ printf '\n'
191
+ printf '%s\n' "A score >= 0.7 is \"passed\". Be concise."
192
+ printf '\n'
193
+ printf '%s\n' "---FILE: ${rel_path}---"
194
+ printf '%s\n' "$FILE_CONTENT"
195
+ printf '%s\n' '---END FILE---'
196
+ } > "$PROMPT_FILE"
197
+
198
+ CLAUDE_ARGS=(-p --max-turns 1)
199
+ [[ -n "$EVAL_MODEL" ]] && CLAUDE_ARGS+=(--model "$EVAL_MODEL")
200
+
201
+ RESPONSE=""
202
+ if command -v timeout >/dev/null 2>&1; then
203
+ RESPONSE=$(timeout "$TIMEOUT_SECS" claude "${CLAUDE_ARGS[@]}" < "$PROMPT_FILE" 2>/dev/null) || RESPONSE=""
204
+ elif command -v gtimeout >/dev/null 2>&1; then
205
+ RESPONSE=$(gtimeout "$TIMEOUT_SECS" claude "${CLAUDE_ARGS[@]}" < "$PROMPT_FILE" 2>/dev/null) || RESPONSE=""
206
+ else
207
+ RESPONSE=$(claude "${CLAUDE_ARGS[@]}" < "$PROMPT_FILE" 2>/dev/null) || RESPONSE=""
208
+ fi
209
+
210
+ [[ -z "$RESPONSE" ]] && continue
211
+
212
+ CLEAN=$(printf '%s' "$RESPONSE" | sed -e 's/^```json//' -e 's/^```//' -e 's/```$//')  # strip accidental markdown fences around the JSON
213
+ SCORE_AFTER=$(printf '%s' "$CLEAN" | jq -r '.score | numbers' 2>/dev/null) || SCORE_AFTER=""  # accept JSON numbers only: the value is later interpolated into python3 -c, so a string here would run as code
214
+ CONFIDENCE=$(printf '%s' "$CLEAN" | jq -r '(.confidence | numbers) // 0.6' 2>/dev/null) || CONFIDENCE="0.6"  # non-numeric or missing confidence falls back to 0.6
215
+ [[ -z "$SCORE_AFTER" ]] && continue
216
+
217
+ SCORE_BEFORE=""
218
+ if [[ -f "$BASELINE_FILE" ]]; then
219
+ SCORE_BEFORE=$(jq -r '.score | numbers' "$BASELINE_FILE" 2>/dev/null) || SCORE_BEFORE=""  # a baseline with a non-numeric score is treated as absent
220
+ fi
221
+
222
+ # Persist new baseline.
223
+ jq -n \
224
+ --arg path "$rel_path" \
225
+ --arg test_id "$TEST_ID" \
226
+ --argjson score "$SCORE_AFTER" \
227
+ --arg ts "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" \
228
+ '{path: $path, test_id: $test_id, score: $score, recorded_at: $ts}' \
229
+ > "$BASELINE_FILE" 2>/dev/null || true
230
+
231
file_count=$((file_count + 1))

# Add this file's score to the running total.
# - The result goes into a scratch variable and is committed only on success.
#   A fallback of the form `x=$(cmd) || x=$x` cannot restore anything, because
#   the assignment has already replaced x with the failed command's output.
# - Numbers are passed as argv and parsed with float() instead of being
#   spliced into the Python source, so a malformed value fails cleanly rather
#   than being executed as code.
if _next_sum=$(python3 -c 'import sys; print(float(sys.argv[1]) + float(sys.argv[2]))' \
  "${sum_after:-0}" "$SCORE_AFTER" 2>/dev/null); then
  sum_after=$_next_sum
fi

if [[ -n "$SCORE_BEFORE" ]]; then
  # Signed change against the stored baseline, rounded to 4 decimals.
  DELTA=$(python3 -c 'import sys; print(round(float(sys.argv[1]) - float(sys.argv[2]), 4))' \
    "$SCORE_AFTER" "$SCORE_BEFORE" 2>/dev/null) || DELTA="0"

  if _next_sum=$(python3 -c 'import sys; print(float(sys.argv[1]) + float(sys.argv[2]))' \
    "${sum_before:-0}" "$SCORE_BEFORE" 2>/dev/null); then
    sum_before=$_next_sum
  fi

  # Magnitude of DELTA: dropping a leading '-' avoids a python3 fork.
  # NOTE(review): ABS_DELTA is not read anywhere in the visible code; it is
  # kept in case something outside this section uses it.
  ABS_DELTA=${DELTA#-}

  # A change counts only when it exceeds DRIFT_THRESHOLD in either direction.
  IS_IMPROVED=$(python3 -c 'import sys; print("true" if float(sys.argv[1]) > float(sys.argv[2]) else "false")' \
    "$DELTA" "$DRIFT_THRESHOLD" 2>/dev/null) || IS_IMPROVED="false"
  IS_DEGRADED=$(python3 -c 'import sys; print("true" if float(sys.argv[1]) < -float(sys.argv[2]) else "false")' \
    "$DELTA" "$DRIFT_THRESHOLD" 2>/dev/null) || IS_DEGRADED="false"

  FILE_NAME=$(basename "$rel_path")

  # Improvement and regression events share one payload shape; only the
  # counter and the event name differ, so pick those first.
  _drift_event=""
  if [[ "$IS_IMPROVED" == "true" ]]; then
    count_improved=$((count_improved + 1))
    _drift_event="echo.improvement.detected"
  elif [[ "$IS_DEGRADED" == "true" ]]; then
    count_degraded=$((count_degraded + 1))
    _drift_event="echo.regression.detected"
  else
    count_neutral=$((count_neutral + 1))
  fi

  if [[ -n "$_drift_event" ]]; then
    # Emit only when the payload was built successfully; a jq failure here
    # must not abort the remaining files.
    if _drift_payload=$(jq -n \
      --arg suite_id "$SUITE_ID" \
      --arg test_id "$TEST_ID" \
      --arg test_name "$FILE_NAME" \
      --argjson score_before "$SCORE_BEFORE" \
      --argjson score_after "$SCORE_AFTER" \
      --argjson delta "$DELTA" \
      --argjson confidence "$CONFIDENCE" \
      '{suite_id: $suite_id, test_id: $test_id, test_name: $test_name,
        score_before: $score_before, score_after: $score_after,
        delta: $delta, confidence: $confidence}'); then
      echo_emit_event "$_drift_event" "$_drift_payload" || true
    fi
  fi
else
  # First evaluation for this file — no baseline to compare against yet.
  count_neutral=$((count_neutral + 1))
fi
280
+ done
281
+
282
# Nothing was scored in this run: hand off to _done straight away.
if [[ "$file_count" -eq 0 ]]; then
  _done
fi

# ---------------------------------------------------------------------------
# Emit suite events
# ---------------------------------------------------------------------------

# Wall-clock duration in milliseconds. SUITE_END falls back to 0 when python3
# is unavailable, which would otherwise produce a negative duration; report 0
# whenever either timestamp is unusable.
# NOTE(review): assumes SUITE_START is an epoch-milliseconds value set earlier
# in the script with the same 0 fallback — confirm.
SUITE_END=$(python3 -c 'import time; print(int(time.time()*1000))' 2>/dev/null || echo 0)
DURATION_MS=$(( SUITE_END - SUITE_START ))
if (( SUITE_END <= 0 || SUITE_START <= 0 || DURATION_MS < 0 )); then
  DURATION_MS=0
fi

# Merging is recommended whenever no watched file degraded, regardless of
# whether any file improved.
MERGE_RECOMMENDED="false"
if [[ "$count_degraded" -eq 0 ]]; then
  MERGE_RECOMMENDED="true"
fi

# Suite-level averages, computed in a single python3 call. Values are passed
# as argv (never interpolated into the program text). The call exits non-zero,
# leaving all three fields empty, when there is no positive baseline total.
# NOTE(review): sum_before only accumulates files that already had a baseline,
# yet it is divided by file_count (all scored files). When a run includes
# first-time files the baseline average is understated and drift overstated.
# A correct fix needs a separate "files with baseline" counter in the loop.
BASELINE_AVG=""
AFTER_AVG=""
DRIFT=""
if [[ -n "$sum_before" ]]; then
  _score_stats=$(python3 -c 'import sys; b, a, n = float(sys.argv[1]), float(sys.argv[2]), int(sys.argv[3]); sys.exit(1) if (n <= 0 or not b > 0) else print(round(b / n, 4), round(a / n, 4), round(a / n - b / n, 4))' \
    "$sum_before" "$sum_after" "$file_count" 2>/dev/null) || _score_stats=""
  if [[ -n "$_score_stats" ]]; then
    # Explicit IFS so the split works even if the script changed the default.
    IFS=' ' read -r BASELINE_AVG AFTER_AVG DRIFT <<< "$_score_stats" || true
  fi
fi

# Fields common to every echo.suite.complete payload.
_suite_args=(
  --arg suite_id "$SUITE_ID"
  --argjson test_count "$file_count"
  --argjson improved "$count_improved"
  --argjson degraded "$count_degraded"
  --argjson neutral "$count_neutral"
  --argjson merge_recommended "$MERGE_RECOMMENDED"
  --argjson duration_ms "$DURATION_MS"
)
_suite_filter='{suite_id: $suite_id, test_count: $test_count,
  improved: $improved, degraded: $degraded, neutral: $neutral,
  merge_recommended: $merge_recommended, duration_ms: $duration_ms}'

# Prefer the payload with score fields. If the scores are unavailable, or
# building the extended payload fails, fall back to the common fields alone.
suite_complete_payload=""
if [[ -n "$BASELINE_AVG" && -n "$AFTER_AVG" && -n "$DRIFT" ]]; then
  _scored_filter="${_suite_filter}"' + {baseline_score: $baseline_score, score_after: $score_after,
  drift: $drift, drift_threshold: $drift_threshold}'
  suite_complete_payload=$(jq -n "${_suite_args[@]}" \
    --argjson baseline_score "$BASELINE_AVG" \
    --argjson score_after "$AFTER_AVG" \
    --argjson drift "$DRIFT" \
    --argjson drift_threshold "$DRIFT_THRESHOLD" \
    "$_scored_filter" 2>/dev/null) || suite_complete_payload=""
fi
if [[ -z "$suite_complete_payload" ]]; then
  suite_complete_payload=$(jq -n "${_suite_args[@]}" "$_suite_filter") || suite_complete_payload=""
fi
echo_emit_event "echo.suite.complete" "$suite_complete_payload" || true

# ---------------------------------------------------------------------------
# Write advisory file for review in next session.
# ---------------------------------------------------------------------------

# Replace every character outside [a-zA-Z0-9-] with '_' so the session id is
# safe to use as part of a file name.
SAFE_SESSION_ID=$(printf '%s' "${SESSION_ID:-unknown}" | tr -c 'a-zA-Z0-9-' '_')

# Best-effort write: failures are deliberately ignored so the advisory file
# can never prevent the script from finishing.
jq -n \
  --arg suite_id "$SUITE_ID" \
  --arg session_id "${SESSION_ID:-unknown}" \
  --argjson test_count "$file_count" \
  --argjson improved "$count_improved" \
  --argjson degraded "$count_degraded" \
  --argjson neutral "$count_neutral" \
  --argjson merge_recommended "$MERGE_RECOMMENDED" \
  --argjson files "$(printf '%s\n' "${WATCHED_CHANGED[@]}" | jq -R . | jq -s .)" \
  '{suite_id: $suite_id, session_id: $session_id, test_count: $test_count,
    improved: $improved, degraded: $degraded, neutral: $neutral,
    merge_recommended: $merge_recommended, files: $files}' \
  > "${ECHO_DIR}/run-${SAFE_SESSION_ID}.json" 2>/dev/null || true

_done