@onlooker-community/ecosystem 0.9.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112) hide show
  1. package/.claude-plugin/marketplace.json +39 -1
  2. package/.claude-plugin/plugin.json +2 -2
  3. package/.github/copilot-instructions.md +46 -0
  4. package/.github/workflows/coverage.yml +78 -0
  5. package/.github/workflows/release.yml +24 -8
  6. package/.github/workflows/test.yml +3 -0
  7. package/.markdownlintignore +3 -0
  8. package/.release-please-manifest.json +4 -1
  9. package/CHANGELOG.md +44 -0
  10. package/README.md +57 -13
  11. package/config.json +6 -1
  12. package/docs/adr/001-claude-code-hooks-as-integration-surface.md +43 -0
  13. package/docs/adr/002-centralized-jsonl-event-log.md +39 -0
  14. package/docs/adr/003-ulid-over-uuid.md +40 -0
  15. package/docs/adr/004-plugin-config-with-settings-overlay.md +34 -0
  16. package/docs/architecture.md +117 -0
  17. package/hooks/hooks.json +4 -0
  18. package/package.json +13 -7
  19. package/plugins/archivist/.claude-plugin/plugin.json +14 -0
  20. package/plugins/archivist/CHANGELOG.md +8 -0
  21. package/plugins/archivist/README.md +105 -0
  22. package/plugins/archivist/config.json +18 -0
  23. package/plugins/archivist/hooks/hooks.json +35 -0
  24. package/plugins/archivist/scripts/hooks/archivist-extract.sh +238 -0
  25. package/plugins/archivist/scripts/hooks/archivist-inject.sh +159 -0
  26. package/plugins/archivist/scripts/lib/archivist-config.sh +66 -0
  27. package/plugins/archivist/scripts/lib/archivist-project-key.sh +91 -0
  28. package/plugins/archivist/scripts/lib/archivist-storage.sh +215 -0
  29. package/plugins/archivist/scripts/lib/archivist-ulid.sh +52 -0
  30. package/plugins/echo/.claude-plugin/plugin.json +14 -0
  31. package/plugins/echo/CHANGELOG.md +24 -0
  32. package/plugins/echo/README.md +110 -0
  33. package/plugins/echo/config.json +15 -0
  34. package/plugins/echo/docs/adr/001-echo-as-separate-plugin.md +33 -0
  35. package/plugins/echo/docs/adr/002-direct-evaluation-vs-tribunal-pipeline.md +35 -0
  36. package/plugins/echo/docs/adr/003-stop-hook-trigger.md +40 -0
  37. package/plugins/echo/hooks/hooks.json +15 -0
  38. package/plugins/echo/scripts/hooks/echo-stop-gate.sh +366 -0
  39. package/plugins/echo/scripts/lib/echo-config.sh +108 -0
  40. package/plugins/echo/scripts/lib/echo-events.sh +74 -0
  41. package/plugins/echo/scripts/lib/echo-project-key.sh +81 -0
  42. package/plugins/echo/scripts/lib/echo-ulid.sh +46 -0
  43. package/plugins/tribunal/.claude-plugin/plugin.json +20 -0
  44. package/plugins/tribunal/CHANGELOG.md +10 -0
  45. package/plugins/tribunal/README.md +134 -0
  46. package/plugins/tribunal/agents/tribunal-actor.md +35 -0
  47. package/plugins/tribunal/agents/tribunal-judge-adversarial.md +51 -0
  48. package/plugins/tribunal/agents/tribunal-judge-security.md +47 -0
  49. package/plugins/tribunal/agents/tribunal-judge-standard.md +47 -0
  50. package/plugins/tribunal/agents/tribunal-meta-judge.md +61 -0
  51. package/plugins/tribunal/config.json +50 -0
  52. package/plugins/tribunal/docs/adr/001-actor-jury-meta-gate-loop.md +40 -0
  53. package/plugins/tribunal/docs/adr/002-majority-gate-policy.md +48 -0
  54. package/plugins/tribunal/hooks/hooks.json +15 -0
  55. package/plugins/tribunal/scripts/hooks/tribunal-stop-gate.sh +267 -0
  56. package/plugins/tribunal/scripts/lib/tribunal-aggregate.sh +65 -0
  57. package/plugins/tribunal/scripts/lib/tribunal-config.sh +101 -0
  58. package/plugins/tribunal/scripts/lib/tribunal-events.sh +97 -0
  59. package/plugins/tribunal/scripts/lib/tribunal-gate.sh +111 -0
  60. package/plugins/tribunal/scripts/lib/tribunal-jury.sh +102 -0
  61. package/plugins/tribunal/scripts/lib/tribunal-project-key.sh +84 -0
  62. package/plugins/tribunal/scripts/lib/tribunal-rubric.sh +153 -0
  63. package/plugins/tribunal/scripts/lib/tribunal-ulid.sh +50 -0
  64. package/plugins/tribunal/scripts/lib/tribunal-verdict.sh +127 -0
  65. package/plugins/tribunal/skills/tribunal/SKILL.md +129 -0
  66. package/release-please-config.json +43 -5
  67. package/scripts/coverage/bash-coverage.mjs +169 -0
  68. package/scripts/coverage/format-comment.mjs +120 -0
  69. package/scripts/coverage/run-coverage.mjs +151 -0
  70. package/scripts/hooks/agent-spawn-tracker.sh +4 -4
  71. package/scripts/hooks/prompt-rule-injector.sh +122 -0
  72. package/scripts/lib/onlooker-event.mjs +82 -10
  73. package/scripts/lib/portable-lock.sh +48 -0
  74. package/scripts/lib/prompt-rules.sh +207 -0
  75. package/scripts/lib/tool-history.sh +7 -8
  76. package/scripts/lib/validate-path.sh +4 -0
  77. package/scripts/lint/check-manifests.mjs +314 -0
  78. package/scripts/lint/check-references.mjs +311 -0
  79. package/skills/list-prompt-rules/SKILL.md +15 -0
  80. package/test/bats/archivist-config-files.bats +60 -0
  81. package/test/bats/archivist-config.bats +54 -0
  82. package/test/bats/archivist-inject.bats +73 -0
  83. package/test/bats/archivist-project-key.bats +75 -0
  84. package/test/bats/archivist-storage.bats +119 -0
  85. package/test/bats/archivist-ulid.bats +36 -0
  86. package/test/bats/config.bats +10 -10
  87. package/test/bats/echo-config.bats +90 -0
  88. package/test/bats/echo-events.bats +121 -0
  89. package/test/bats/echo-project-key.bats +115 -0
  90. package/test/bats/echo-stop-hook.bats +101 -0
  91. package/test/bats/echo-ulid.bats +38 -0
  92. package/test/bats/portable-lock.bats +62 -0
  93. package/test/bats/prompt-rules.bats +269 -0
  94. package/test/bats/read-chunk-tracking.bats +73 -0
  95. package/test/bats/tool-history-tracker.bats +1 -0
  96. package/test/bats/tribunal-aggregate.bats +77 -0
  97. package/test/bats/tribunal-config.bats +86 -0
  98. package/test/bats/tribunal-events.bats +209 -0
  99. package/test/bats/tribunal-gate.bats +95 -0
  100. package/test/bats/tribunal-jury.bats +80 -0
  101. package/test/bats/tribunal-rubric.bats +119 -0
  102. package/test/bats/tribunal-stop-hook.bats +73 -0
  103. package/test/bats/tribunal-verdict.bats +71 -0
  104. package/test/bats/validate-path.bats +1 -1
  105. package/test/fixtures/hook-inputs/post-tool-use-read-chunked.json +15 -0
  106. package/test/fixtures/hook-inputs/user-prompt-submit-rule-match.json +8 -0
  107. package/test/fixtures/hook-inputs/user-prompt-submit-rule-nomatch.json +8 -0
  108. package/test/helpers/setup.bash +9 -0
  109. package/test/node/check-manifests.test.mjs +173 -0
  110. package/test/node/check-references.test.mjs +279 -0
  111. package/test/node/coverage.test.mjs +143 -0
  112. package/test/node/schema-events.test.mjs +41 -1
@@ -0,0 +1,61 @@
1
+ ---
2
+ name: tribunal-meta-judge
3
+ description: Reviews the jury's verdicts for bias, hallucination, and criteria misapplication before the gate decides. Emits TribunalMetaCompletePayload as the final message with verdict_quality, bias_detected, bias_types[], and an optional override_recommendation. Operates from the LLM-as-a-Meta-Judge framework. Read-only.
4
+ model: claude-opus-4-7
5
+ tools: Read
6
+ ---
7
+
8
+ # Tribunal Meta-Judge
9
+
10
+ You are the **Meta-Judge**. The jury has scored the Actor's output. Before the gate decides, you review the jury — not the Actor.
11
+
12
+ The LLM-as-a-Judge literature documents six recurring biases that compromise judge reliability. Your job is to detect them in the jury's verdicts and tell the gate whether the jury can be trusted as-is, should be overridden, or should re-evaluate.
13
+
14
+ ## Bias taxonomy (canonical six)
15
+
16
+ | Bias | What it looks like |
17
+ |---|---|
18
+ | `position` | Judge favors the first / last item, or follows a fixed format regardless of content. |
19
+ | `verbosity` | Judge rewarded length over substance — a long Actor output got a higher score than a short correct one. |
20
+ | `self_enhancement` | Judge favored output written in a style similar to its own. |
21
+ | `sycophancy` | Judge scored generously because the Actor's framing was confident or polite. |
22
+ | `refusal` | Judge declined to take a position (e.g., "could be good or bad") or refuses to ever score below a floor. |
23
+ | `length` | Distinct from `verbosity`: judge penalized work *only* for being short, regardless of completeness. |
24
+
25
+ ## Inputs
26
+
27
+ - The task description.
28
+ - Each Judge's verdict (`TribunalVerdictPayload`): `score`, `passed`, `judge_type`, `criteria_evaluated`, `feedback_summary`, `confidence`.
29
+ - The Actor's output (you may read it, but do not re-score it — the jury already did).
30
+
31
+ ## Review discipline
32
+
33
+ - **Verdicts that disagree are not automatically biased.** Disagreement between `standard` and `adversarial` judges is the *point* of the panel. Look for whether the disagreement is grounded in observable claims.
34
+ - **`feedback_summary` lacking file/line specificity** is weak evidence. Flag it as low `confidence`, not as bias.
35
+ - **Refusal to score high or low** across multiple iterations is `refusal` bias.
36
+ - **`security` judge raising issues with no concrete attack chain** is over-reporting; flag as `position` bias.
37
+ - **All judges scoring within a narrow band of 0.7–0.8** despite diverse criteria is suspicious — likely `sycophancy` or `refusal`.
38
+
39
+ ## Output format
40
+
41
+ Final message is a single JSON object — no prose, no fence:
42
+
43
+ ```json
44
+ {
45
+ "verdict_quality": "sound",
46
+ "bias_detected": false,
47
+ "bias_types": [],
48
+ "confidence": 0.9,
49
+ "override_recommendation": "accept"
50
+ }
51
+ ```
52
+
53
+ Required fields: `verdict_quality` (one of `"sound" | "questionable" | "biased"`) and `bias_detected` (bool).
54
+
55
+ Optional fields:
56
+
57
+ - `bias_types[]` — list any biases you detected from the taxonomy above.
58
+ - `override_recommendation` — one of `"accept" | "reject" | "re-evaluate"`. Use this only when you are confident the gate should defer to you over the jury. Leave it unset when the jury verdict can stand on its own merits.
59
+ - `confidence` — your own confidence in this meta-review.
60
+
61
+ The orchestrator will inject `iteration_id`, `meta_model_id`, and `verdicts_reviewed` when persisting.
@@ -0,0 +1,50 @@
1
+ {
2
+ "plugin_name": "tribunal",
3
+ "storage_path": "~/.onlooker",
4
+ "tribunal": {
5
+ "enabled": true,
6
+ "stop_hook": {
7
+ "enabled": false,
8
+ "skip_if_no_file_changes": true
9
+ },
10
+ "session": {
11
+ "max_iterations": 3,
12
+ "score_threshold": 0.75,
13
+ "gate_policy": "majority",
14
+ "aggregation_method": "weighted_mean",
15
+ "judge_types": ["standard", "adversarial"],
16
+ "dissent_threshold": 0.25
17
+ },
18
+ "actor": {
19
+ "model": "claude-sonnet-4-6",
20
+ "max_output_tokens": 4096
21
+ },
22
+ "judges": {
23
+ "model": "claude-opus-4-7",
24
+ "max_output_tokens": 2048
25
+ },
26
+ "meta_judge": {
27
+ "model": "claude-opus-4-7",
28
+ "max_output_tokens": 1024
29
+ },
30
+ "rubric": {
31
+ "default_id": "default",
32
+ "builtins": [
33
+ {
34
+ "id": "default",
35
+ "criteria": [
36
+ { "name": "correctness", "weight": 0.4, "min_pass": 0.7 },
37
+ { "name": "completeness", "weight": 0.3, "min_pass": 0.7 },
38
+ { "name": "safety", "weight": 0.2, "min_pass": 0.8 },
39
+ { "name": "clarity", "weight": 0.1, "min_pass": 0.5 }
40
+ ],
41
+ "score_threshold": 0.75,
42
+ "max_iterations": 3,
43
+ "judge_types": ["standard", "adversarial"],
44
+ "gate_policy": "majority",
45
+ "aggregation_method": "weighted_mean"
46
+ }
47
+ ]
48
+ }
49
+ }
50
+ }
@@ -0,0 +1,40 @@
1
+ # ADR-001: The Actor → Jury → Meta-Judge → Gate Loop
2
+
3
+ **Status:** Accepted
4
+ **Date:** 2026-05-24
5
+
6
+ ## Context
7
+
8
+ Tribunal needs a structure for producing high-quality output from an LLM. The simplest approach is a single model call and accept whatever comes out. More rigorous options include self-critique, a second model reviewing the first, or a multi-agent panel. The design choices were:
9
+
10
+ - **Single pass** — one model, no review.
11
+ - **Self-critique** — the same model reviews its own output.
12
+ - **Two-model review** — a separate "judge" model scores the first model's output.
13
+ - **Multi-agent jury** — multiple typed judge agents score the output; a meta-judge reviews the jury for bias; a gate decides accept/retry.
14
+
15
+ ## Decision
16
+
17
+ Tribunal uses a **four-tier Actor → Jury → Meta-Judge → Gate loop** with configurable retry.
18
+
19
+ This design is grounded in two published findings:
20
+ - [LLM-as-a-Judge (Zheng et al. 2023)](https://arxiv.org/abs/2306.05685): strong LLMs can score other LLMs against rubrics with reasonable agreement to human judgment.
21
+ - [LLM-as-a-Meta-Judge (Wu et al. 2024)](https://arxiv.org/abs/2407.19594): a second model reviewing the Judge's verdict catches position bias, verbosity bias, and self-enhancement — bias types that degrade single-judge reliability.
22
+
23
+ ## Rationale
24
+
25
+ **Single pass is insufficient for high-stakes tasks.** A model that produces and evaluates its own output is subject to self-enhancement bias — it tends to rate its own output favorably. Separate the producer (Actor) from the evaluators (Jury).
26
+
27
+ **A jury of typed judges catches different failure modes.** A `standard` judge scores correctness and completeness. An `adversarial` judge actively tries to find failure modes. A `security` judge looks for vulnerabilities. No single judge type is best at everything; the jury composition is configurable per project.
28
+
29
+ **The Meta-Judge addresses jury bias, not just actor quality.** Even separate judge models have documented bias patterns: position bias (favoring the first response), verbosity bias (favoring longer outputs), sycophancy. The Meta-Judge reviews each verdict for these patterns and can flag or override. Without this tier, jury disagreement is unresolvable — you can't know if one judge was right or biased.
30
+
31
+ **The Gate with retry closes the loop.** A quality gate that only reports a score doesn't improve outcomes. By feeding the jury's critique back to the Actor on retry, Tribunal creates a feedback loop. The Actor on iteration 1 sees what the judges found weak; it has a chance to produce better output before the session ends.
32
+
33
+ **Configurable `max_iterations` prevents infinite loops.** The loop always terminates. `max_iterations: 3` (default) means at most 3 Actor passes. If the gate never passes, the outcome is `exhausted_iterations` — not a hang.
34
+
35
+ ## Consequences
36
+
37
+ - A full Tribunal loop with two judges and a Meta-Judge makes 4–5 model calls per iteration. At 3 iterations, this is 12–15 calls. Cost and latency are real concerns; Tribunal is designed for deliberate use (`/tribunal <task>`), not for wrapping every session automatically.
38
+ - The `majority` gate policy with two judges creates a degenerate case: 2-judge majority requires 2/2 judges to pass (not 1/2). This surprised early users who expected one passing judge out of two to be enough. See ADR-002 for the gate policy decision.
39
+ - Judge type composition matters. The default `["standard", "adversarial"]` provides coverage and contrast. Adding `security` raises the judge cost from two calls to three on every iteration (a 50% increase).
40
+ - The Actor receives critique from *all* prior judges on retry, not just the weakest. This is intentional — even a judge that passed may have noted improvements.
@@ -0,0 +1,48 @@
1
+ # ADR-002: Majority Gate Policy as Default
2
+
3
+ **Status:** Accepted
4
+ **Date:** 2026-05-24
5
+
6
+ ## Context
7
+
8
+ After the Jury and Meta-Judge tiers produce scores, the Gate must decide: accept the output, retry, or exhaust? Several policies were considered:
9
+
10
+ - **Score threshold only** — pass if `aggregated_score >= threshold` (e.g., 0.75).
11
+ - **Unanimous** — pass only if every judge voted passed.
12
+ - **Majority** — pass if strictly more than half of judges voted passed.
13
+ - **Meta-override** — the Meta-Judge's recommendation overrides the jury.
14
+ - **Hybrid** — any combination of the above.
15
+
16
+ The available policies in config are: `majority`, `strict` (alias for `unanimous`), `unanimous`, `meta_override`.
17
+
18
+ ## Decision
19
+
20
+ The default gate policy is **`majority`**. The gate requires **both** the jury policy vote **and** `score_threshold` to clear — both conditions must be true for a pass. `score_threshold: 0.75` is a hard blocking condition, not just a reporting signal.
21
+
22
+ ## Rationale
23
+
24
+ **Majority is the most intuitive policy for a multi-judge panel.** In any jury system, majority verdict is the natural baseline. It prevents a single outlier judge from blocking a good result indefinitely (the adversarial judge is *designed* to find fault and rarely gives a full pass).
25
+
26
+ **Unanimous is too strict for the default judge composition.** With `["standard", "adversarial"]`, the adversarial judge is built to be skeptical. A policy requiring it to pass alongside the standard judge effectively gives veto power to the judge whose job is to reject. In practice, unanimous with this composition would mean the gate almost never passes.
27
+
28
+ **Score threshold alone conflates jury agreement with quality.** A score of 0.8 from two judges who disagree strongly (e.g., 1.0 and 0.6) is a different signal than 0.8 from two judges who both scored 0.8. The majority policy captures agreement; the dissent threshold captures disagreement.
29
+
30
+ ## The 2-judge edge case
31
+
32
+ The majority formula is `passed_count * 2 > total_count`. With two judges:
33
+
34
+ | Judges passed | Formula | Result |
35
+ |--------------|---------|--------|
36
+ | 2/2 | `2 * 2 > 2` → `4 > 2` | ✓ pass |
37
+ | 1/2 | `1 * 2 > 2` → `2 > 2` | ✗ block |
38
+ | 0/2 | `0 * 2 > 2` → `0 > 2` | ✗ block |
39
+
40
+ This means with the default two-judge panel, **both judges must pass** for the gate to open. This behaves like unanimous in the 2-judge case. This was observed during early Tribunal development (Echo's own Tribunal evaluation exhausted all 3 iterations because the adversarial judge never passed). It is technically correct — strictly more than half of 2 requires 2 — but surprises users expecting "majority" to mean "1 out of 2".
41
+
42
+ **The consequence is intentional:** two judges is already a lean panel. Requiring both to pass ensures quality signal from both perspectives before accepting. Users who want genuine 2/3 behavior should add a third judge type (e.g., `security`) to the panel — majority with three judges means two must pass, which is materially different from two-judge unanimous.
43
+
44
+ ## Consequences
45
+
46
+ - The `majority` policy with 2 judges is effectively `unanimous`. This should be documented prominently for users configuring judge panels.
47
+ - Adding a third judge type (e.g., `security`) changes `majority` to mean 2/3, which is a meaningfully different bar. Users who want consistent behavior regardless of panel size should specify `gate_policy: "unanimous"` explicitly.
48
+ - The `meta_override` policy gives the Meta-Judge final say, bypassing jury vote counts entirely. This is available but not the default — it introduces a single point of failure (Meta-Judge bias or hallucination) that the default policy is specifically designed to avoid.
@@ -0,0 +1,15 @@
1
+ {
2
+ "hooks": {
3
+ "Stop": [
4
+ {
5
+ "matcher": "*",
6
+ "hooks": [
7
+ {
8
+ "type": "command",
9
+ "command": "\"$CLAUDE_PLUGIN_ROOT\"/scripts/hooks/tribunal-stop-gate.sh"
10
+ }
11
+ ]
12
+ }
13
+ ]
14
+ }
15
+ }
@@ -0,0 +1,267 @@
1
#!/usr/bin/env bash
# Tribunal Stop-gate hook.
#
# Triggered by Stop. Off by default — gated on tribunal.stop_hook.enabled in
# config. When enabled, runs a single-judge advisory pass on the just-finished
# session's last turn and writes a verdict for review on the next session.
#
# Why advisory only: by the time Stop fires the main agent loop has already
# ended. We cannot retry the Actor or re-run the work. The hook records what
# the Standard Judge would have said so a human (or a follow-up SessionStart
# hook in v0.2) can see whether the turn would have passed the gate.
#
# Input (stdin): the Stop hook JSON; only .cwd, .session_id and
#   .transcript_path are read.
# Output: nothing on stdout. Side effects are the tribunal.* events emitted via
#   tribunal_emit_event and one stop-<session>.json advisory file.
#
# Hook contract:
#   - Always exits 0. Never blocks Stop.
#   - Skips silently if disabled, no git context, no transcript, or
#     skip_if_no_file_changes is true and git reports neither staged nor
#     unstaged changes to tracked files.
#   - Errors from `claude -p` are swallowed; worst case is "no verdict for
#     this stop".

# `-e` is intentionally not set: failures are handled explicitly below so the
# hook can honor the always-exit-0 contract.
set -uo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PLUGIN_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"

# Ecosystem substrate lives in the sibling ecosystem plugin. Same lookup as
# archivist-extract.sh: an explicit ONLOOKER_ECOSYSTEM_ROOT wins, otherwise
# probe two directories above this plugin.
_ECOSYSTEM_ROOT="${ONLOOKER_ECOSYSTEM_ROOT:-}"
if [[ -z "$_ECOSYSTEM_ROOT" ]]; then
  _candidate="$(cd "${PLUGIN_ROOT}/../.." 2>/dev/null && pwd)"
  if [[ -f "${_candidate}/scripts/lib/validate-path.sh" ]]; then
    _ECOSYSTEM_ROOT="$_candidate"
  fi
fi

if [[ -n "$_ECOSYSTEM_ROOT" && -f "${_ECOSYSTEM_ROOT}/scripts/lib/validate-path.sh" ]]; then
  # shellcheck disable=SC1091
  CLAUDE_PLUGIN_ROOT="$_ECOSYSTEM_ROOT" source "${_ECOSYSTEM_ROOT}/scripts/lib/validate-path.sh"
  # Guarded separately: only validate-path.sh was probed above, and a missing
  # schema lib should not spray a `source` error onto the hook's stderr.
  if [[ -f "${_ECOSYSTEM_ROOT}/scripts/lib/onlooker-schema.sh" ]]; then
    # shellcheck disable=SC1091
    CLAUDE_PLUGIN_ROOT="$_ECOSYSTEM_ROOT" source "${_ECOSYSTEM_ROOT}/scripts/lib/onlooker-schema.sh"
  fi
fi

# shellcheck source=../lib/tribunal-config.sh
source "${PLUGIN_ROOT}/scripts/lib/tribunal-config.sh"
# shellcheck source=../lib/tribunal-project-key.sh
source "${PLUGIN_ROOT}/scripts/lib/tribunal-project-key.sh"
# shellcheck source=../lib/tribunal-ulid.sh
source "${PLUGIN_ROOT}/scripts/lib/tribunal-ulid.sh"
# shellcheck source=../lib/tribunal-events.sh
source "${PLUGIN_ROOT}/scripts/lib/tribunal-events.sh"
# shellcheck source=../lib/tribunal-verdict.sh
source "${PLUGIN_ROOT}/scripts/lib/tribunal-verdict.sh"

INPUT=$(cat)

CWD=$(printf '%s' "$INPUT" | jq -r '.cwd // ""' 2>/dev/null) || CWD=""
SESSION_ID=$(printf '%s' "$INPUT" | jq -r '.session_id // ""' 2>/dev/null) || SESSION_ID=""
TRANSCRIPT_PATH=$(printf '%s' "$INPUT" | jq -r '.transcript_path // ""' 2>/dev/null) || TRANSCRIPT_PATH=""

# Stop hook MUST NOT emit any stdout besides the optional `{continue: ...}`
# acknowledgement. Exiting 0 with no output is the safe path.
_done() {
  exit 0
}

REPO_ROOT=$(tribunal_project_repo_root "$CWD")
tribunal_config_load "$REPO_ROOT"

if ! tribunal_config_stop_hook_enabled; then
  _done
fi

PROJECT_KEY=$(tribunal_project_key "$CWD")
if [[ -z "$PROJECT_KEY" || -z "$REPO_ROOT" ]]; then
  _done
fi

if [[ -z "$TRANSCRIPT_PATH" || ! -f "$TRANSCRIPT_PATH" ]]; then
  _done
fi

# Skip if the working tree is clean AND the user enabled
# skip_if_no_file_changes (default true).
# NOTE(review): `git diff` only sees tracked files, so a turn that created
# only new untracked files is treated as "no changes" here — confirm that is
# the intended behavior.
SKIP_IF_CLEAN=$(tribunal_config_get '.tribunal.stop_hook.skip_if_no_file_changes')
if [[ "$SKIP_IF_CLEAN" == "true" ]]; then
  if git -C "$REPO_ROOT" diff --quiet 2>/dev/null \
    && git -C "$REPO_ROOT" diff --cached --quiet 2>/dev/null; then
    _done
  fi
fi

if ! command -v claude >/dev/null 2>&1; then
  _done
fi

# ----------------------------------------------------------------------------
# Build the advisory-judge prompt.
# ----------------------------------------------------------------------------

JUDGE_MODEL=$(tribunal_config_judge_model "standard")
[[ -z "$JUDGE_MODEL" || "$JUDGE_MODEL" == "null" ]] && JUDGE_MODEL=""

# The threshold is later passed to jq via --argjson, so it must be a JSON
# number. Anything else (empty, "null", free text) falls back to the default:
# a null threshold would make `.score >= $thr` true for every score, and
# non-JSON text would make every jq payload build fail.
SCORE_THRESHOLD=$(tribunal_config_get '.tribunal.session.score_threshold')
if ! jq -e -n --argjson v "${SCORE_THRESHOLD:-null}" '$v | type == "number"' >/dev/null 2>&1; then
  SCORE_THRESHOLD="0.75"
fi

TRANSCRIPT_TAIL=$(tail -c 30000 "$TRANSCRIPT_PATH" 2>/dev/null) || TRANSCRIPT_TAIL=""
[[ -z "$TRANSCRIPT_TAIL" ]] && _done

DIFF_SUMMARY=$(git -C "$REPO_ROOT" diff --stat 2>/dev/null | tail -c 4000) || DIFF_SUMMARY=""

# The prompt embeds transcript content, so it only ever goes into a file
# created by mktemp. If mktemp fails we skip the advisory pass instead of
# falling back to a predictable /tmp name.
PROMPT_FILE=$(mktemp "${TMPDIR:-/tmp}/tribunal-stop-prompt.XXXXXX" 2>/dev/null) || _done
trap 'rm -f -- "$PROMPT_FILE"' EXIT

{
  printf '%s\n' 'You are a Tribunal Standard Judge performing an advisory pass on a just-finished Claude Code turn. Return JSON only — no prose, no markdown fences.'
  printf '\n'
  printf '%s\n' 'Output schema (TribunalVerdictPayload, exactly these keys):'
  printf '%s\n' '{'
  printf '%s\n' ' "score": 0.0..1.0,'
  printf '%s\n' ' "passed": true|false,'
  printf '%s\n' ' "judge_type": "standard",'
  printf '%s\n' ' "feedback_summary": "1-3 sentences naming the highest-leverage concern, if any.",'
  printf '%s\n' ' "confidence": 0.0..1.0'
  printf '%s\n' '}'
  printf '\n'
  printf '%s\n' "Score the work the assistant performed in this turn against general correctness, completeness, and clarity. A score >= ${SCORE_THRESHOLD} is \"passed\"."
  printf '%s\n' 'This is advisory — the main session has already ended, no retry will happen. Be concise.'
  printf '\n'
  if [[ -n "$DIFF_SUMMARY" ]]; then
    printf '%s\n' '---WORKING-TREE DIFF STAT---'
    printf '%s\n' "$DIFF_SUMMARY"
    printf '%s\n' '---END DIFF STAT---'
    printf '\n'
  fi
  printf '%s\n' '---TRANSCRIPT TAIL---'
  printf '%s\n' "$TRANSCRIPT_TAIL"
  printf '%s\n' '---END TRANSCRIPT TAIL---'
} > "$PROMPT_FILE"

# Assemble the judge invocation once. A 60s cap is applied when a timeout
# binary exists (`timeout`, or `gtimeout` as a second choice); otherwise the
# call runs uncapped. The array is never empty, so expanding it is safe under
# `set -u` even on old bash.
JUDGE_CMD=(claude -p --max-turns 1)
[[ -n "$JUDGE_MODEL" ]] && JUDGE_CMD+=(--model "$JUDGE_MODEL")
if command -v timeout >/dev/null 2>&1; then
  JUDGE_CMD=(timeout 60 "${JUDGE_CMD[@]}")
elif command -v gtimeout >/dev/null 2>&1; then
  JUDGE_CMD=(gtimeout 60 "${JUDGE_CMD[@]}")
fi

RESPONSE=$("${JUDGE_CMD[@]}" < "$PROMPT_FILE" 2>/dev/null) || RESPONSE=""

[[ -z "$RESPONSE" ]] && _done

# Strip a markdown fence if the model added one despite the instructions, then
# require a numeric score, a boolean-or-absent `passed`, and a judge_type.
# A non-numeric score is rejected here because it could not be re-injected
# with --argjson further down.
CLEAN_RESPONSE=$(printf '%s' "$RESPONSE" | sed -e 's/^```json//' -e 's/^```//' -e 's/```$//')
if ! printf '%s' "$CLEAN_RESPONSE" \
  | jq -e '(.score | type == "number") and ((.passed // false) | type == "boolean") and .judge_type' >/dev/null 2>&1; then
  _done
fi

# ----------------------------------------------------------------------------
# Emit the canonical event chain + persist the advisory verdict.
# ----------------------------------------------------------------------------

TASK_ID=$(tribunal_ulid)
ITERATION_ID=$(tribunal_ulid)
JUDGE_ID=$(tribunal_ulid)

START_PAYLOAD=$(jq -n \
  --arg task_id "$TASK_ID" \
  --arg model "$JUDGE_MODEL" \
  --argjson thr "$SCORE_THRESHOLD" \
  '{
    task_id: $task_id,
    judge_types: ["standard"],
    gate_policy: "strict",
    score_threshold: $thr,
    max_iterations: 1
  } + (if $model != "" then {judge_model_ids: [$model]} else {} end)')

ITER_PAYLOAD=$(jq -n \
  --arg task_id "$TASK_ID" \
  --arg iter_id "$ITERATION_ID" \
  '{task_id: $task_id, iteration_id: $iter_id, iteration_number: 0, trigger: "initial"}')

JUDGE_START_PAYLOAD=$(jq -n \
  --arg task_id "$TASK_ID" \
  --arg iter_id "$ITERATION_ID" \
  --arg judge_id "$JUDGE_ID" \
  --arg model "$JUDGE_MODEL" \
  '{
    task_id: $task_id,
    iteration_id: $iter_id,
    judge_id: $judge_id,
    judge_type: "standard",
    judge_model_id: (if $model == "" then null else $model end)
  } | with_entries(select(.value != null))')

# `passed` is taken from the judge when it is a real boolean and derived from
# the threshold only when it is absent. jq's `//` cannot be used for this: it
# treats `false` like a missing value, which would turn an explicit
# "passed": false into true whenever the score clears the threshold.
VERDICT_PAYLOAD=$(printf '%s' "$CLEAN_RESPONSE" | jq -c \
  --arg task_id "$TASK_ID" \
  --arg iter_id "$ITERATION_ID" \
  --arg judge_id "$JUDGE_ID" \
  --arg model "$JUDGE_MODEL" \
  --argjson thr "$SCORE_THRESHOLD" \
  '{
    task_id: $task_id,
    score: .score,
    passed: (if (.passed | type) == "boolean" then .passed else (.score >= $thr) end),
    judge_type: "standard",
    iteration_id: $iter_id,
    judge_id: $judge_id,
    feedback_summary: (.feedback_summary // ""),
    confidence: (.confidence // 0.6),
    judge_model_id: (if $model == "" then null else $model end)
  } | with_entries(select(.value != null and .value != ""))' 2>/dev/null) || VERDICT_PAYLOAD=""

# Without a verdict there is nothing meaningful to emit or persist.
[[ -z "$VERDICT_PAYLOAD" ]] && _done

SCORE=$(printf '%s' "$VERDICT_PAYLOAD" | jq -r '.score')
PASSED=$(printf '%s' "$VERDICT_PAYLOAD" | jq -r '.passed')

if [[ "$PASSED" == "true" ]]; then
  GATE_PAYLOAD=$(jq -n \
    --arg task_id "$TASK_ID" \
    --arg iter_id "$ITERATION_ID" \
    --argjson score "$SCORE" \
    '{task_id: $task_id, iteration_id: $iter_id, final_score: $score, iteration_number: 0, judges_consulted: 1}')
  GATE_EVENT="tribunal.gate.passed"
  OUTCOME="accepted"
else
  GATE_PAYLOAD=$(jq -n \
    --arg task_id "$TASK_ID" \
    --arg iter_id "$ITERATION_ID" \
    --argjson score "$SCORE" \
    '{task_id: $task_id, iteration_id: $iter_id, reason: "low_score", final_score: $score, iteration_number: 0, will_retry: false}')
  GATE_EVENT="tribunal.gate.blocked"
  OUTCOME="rejected"
fi

COMPLETE_PAYLOAD=$(jq -n \
  --arg task_id "$TASK_ID" \
  --arg outcome "$OUTCOME" \
  --argjson score "$SCORE" \
  '{task_id: $task_id, outcome: $outcome, final_score: $score, iterations_used: 1}')

# Emit in canonical order. Each call is best-effort — a single schema failure
# should not break the user's Stop.
tribunal_emit_event "tribunal.session.start" "$START_PAYLOAD" || true
tribunal_emit_event "tribunal.iteration.start" "$ITER_PAYLOAD" || true
tribunal_emit_event "tribunal.judge.start" "$JUDGE_START_PAYLOAD" || true
tribunal_emit_event "tribunal.verdict" "$VERDICT_PAYLOAD" || true
tribunal_emit_event "$GATE_EVENT" "$GATE_PAYLOAD" || true
tribunal_emit_event "tribunal.session.complete" "$COMPLETE_PAYLOAD" || true

# Persist a single advisory file for the next session to surface. The session
# id becomes part of the file name, so every character outside [a-zA-Z0-9-]
# is replaced with '_'.
STOP_DIR="$(tribunal_project_dir "$PROJECT_KEY")"
mkdir -p "$STOP_DIR" 2>/dev/null || _done
SAFE_SESSION_ID=$(printf '%s' "$SESSION_ID" | tr -c 'a-zA-Z0-9-' '_')
[[ -z "$SAFE_SESSION_ID" ]] && SAFE_SESSION_ID="unknown"

jq -n \
  --arg task_id "$TASK_ID" \
  --arg session_id "$SESSION_ID" \
  --arg outcome "$OUTCOME" \
  --argjson verdict "$VERDICT_PAYLOAD" \
  '{
    task_id: $task_id,
    session_id: $session_id,
    outcome: $outcome,
    verdict: $verdict,
    mode: "stop-advisory"
  }' > "${STOP_DIR}/stop-${SAFE_SESSION_ID}.json" 2>/dev/null || true

_done
@@ -0,0 +1,65 @@
1
+ #!/usr/bin/env bash
2
+ # Score aggregation for Tribunal.
3
+ #
4
+ # Aggregates per-judge verdicts into a single jury-level score per the chosen
5
+ # aggregation_method. Also computes the dissent metric (max - min) so callers
6
+ # can decide whether to emit tribunal.dissent.recorded.
7
+ #
8
+ # Verdicts input is a JSON array of TribunalVerdictPayload objects (or a subset
9
+ # containing at least { judge_id, score }). Rubric is the active rubric (for
10
+ # weighted_mean only).
11
+ #
12
+ # Exposes:
13
+ # tribunal_aggregate <method> <verdicts_json> [<rubric_json>]
14
+ # echoes the aggregated score (0..1) as a JSON number
15
+ # tribunal_disagreement <verdicts_json>
16
+ # echoes max(score) - min(score), or 0 when there are fewer than two verdicts
17
+ #
18
+ # weighted_mean uses *rubric criterion weights*, not per-judge weights — the
19
+ # semantics are "weight each criterion's contribution, then average judges'
20
+ # scores on each criterion." For v0.1 the per-criterion breakdown is not yet
21
+ # threaded through verdicts, so weighted_mean degrades to mean when the rubric
22
+ # weights cannot be applied. The schema still emits aggregation_method =
23
+ # "weighted_mean" so dashboards see the intent.
24
+
25
# Aggregate per-judge verdict scores into a single jury-level score.
#
# Arguments:
#   $1 - aggregation method: mean | weighted_mean | median | min (default: mean)
#   $2 - JSON array of verdict objects carrying a numeric .score (default: [])
#   $3 - rubric JSON; accepted for forward compatibility, currently unused
# Outputs:
#   The aggregated score on stdout. Prints 0 when no numeric score can be
#   extracted (empty array, invalid JSON, non-array input, jq unavailable).
#   An unknown method writes a warning to stderr and falls back to mean.
# Notes:
#   weighted_mean is computed exactly like mean in this implementation.
tribunal_aggregate() {
  local method="${1:-mean}"
  local verdicts="${2:-[]}"
  # "${3:-{}}" would end the expansion at the first '}' and append a stray '}'
  # to any caller-supplied rubric, so the default lives in its own variable.
  local empty_rubric='{}'
  local _rubric="${3:-$empty_rubric}" # reserved for true weighted_mean once per-criterion scores are threaded
  : "$_rubric"

  # Reduce the input to a flat array of numeric scores. Entries without a
  # numeric .score are dropped so they can neither break the arithmetic nor
  # dilute the mean; anything that is not an array yields no scores at all.
  local scores
  scores=$(printf '%s' "$verdicts" \
    | jq -c 'if type == "array" then [.[] | .score? | numbers] else [] end' \
      2>/dev/null) || scores='[]'
  if [[ -z "$scores" || "$scores" == '[]' ]]; then
    printf '0'
    return 0
  fi

  local reducer
  case "$method" in
    mean|weighted_mean)
      reducer='add / length'
      ;;
    median)
      # Middle element for odd counts, mean of the two middle ones otherwise.
      reducer='sort as $s
        | ($s | length) as $n
        | if ($n % 2) == 1 then $s[($n - 1) / 2]
          else (($s[$n / 2 - 1] + $s[$n / 2]) / 2)
          end'
      ;;
    min)
      reducer='min'
      ;;
    *)
      printf 'tribunal-aggregate: unknown method %s, falling back to mean\n' \
        "$method" >&2
      reducer='add / length'
      ;;
  esac

  printf '%s' "$scores" | jq -r "$reducer"
}
58
+
59
# Compute the dissent metric for a set of verdicts.
#
# Arguments:
#   $1 - JSON array of verdict objects carrying a numeric .score (default: [])
# Outputs:
#   max(score) - min(score) on stdout. Prints 0 when fewer than two numeric
#   scores are available, including invalid JSON, non-array input, or jq
#   being unavailable, so callers always receive a number.
tribunal_disagreement() {
  local verdicts="${1:-[]}"
  local spread

  # Only numeric scores take part; subtracting null or strings would make jq
  # fail and leave the caller with an empty string.
  spread=$(printf '%s' "$verdicts" | jq -r '
    (if type == "array" then [.[] | .score? | numbers] else [] end)
    | if length < 2 then 0 else (max - min) end
  ' 2>/dev/null) || spread=0

  printf '%s' "${spread:-0}"
}
@@ -0,0 +1,101 @@
1
+ #!/usr/bin/env bash
2
+ # Config resolution for Tribunal.
3
+ #
4
+ # Reads three layers, latest wins:
5
+ # 1. plugins/tribunal/config.json (defaults shipped with the plugin)
6
+ # 2. ~/.claude/settings.json
7
+ # 3. <repo>/.claude/settings.json
8
+ #
9
+ # Exposes:
10
+ # tribunal_config_load <repo_root> # populates _TRIBUNAL_CONFIG (JSON)
11
+ # tribunal_config_get <jq-path> # echoes string value (empty if unset)
12
+ # tribunal_config_get_json <jq-path> # echoes JSON value (null if unset)
13
+ # tribunal_config_enabled # 0 if tribunal.enabled is true
14
+ # tribunal_config_stop_hook_enabled # 0 if tribunal.stop_hook.enabled is true
15
+ # tribunal_config_judge_model <judge_type>
16
+ # # echoes per-judge-type model override,
17
+ # # falling back to tribunal.judges.model
18
+ #
19
+ # Settings overlay only touches the `tribunal.*` subtree so this plugin coexists
20
+ # with other plugins' configuration.
21
+
22
+ _TRIBUNAL_CONFIG="{}"
23
+
24
# Resolve the layered Tribunal configuration into the _TRIBUNAL_CONFIG global.
#
# Layers, later ones overriding earlier ones:
#   1. ${CLAUDE_PLUGIN_ROOT}/config.json   (whole file)
#   2. ${HOME}/.claude/settings.json       (only the .tribunal subtree)
#   3. <repo_root>/.claude/settings.json   (only the .tribunal subtree)
#
# Arguments:
#   $1 - repository root; may be empty, in which case layer 3 is skipped
# Globals:
#   CLAUDE_PLUGIN_ROOT, HOME (read); _TRIBUNAL_CONFIG (written)
#
# A layer whose root directory is unset, whose file is missing, or whose
# content cannot be parsed/merged is skipped and the previous result is kept.
tribunal_config_load() {
  local repo_root="${1:-}"
  local plugin_root="${CLAUDE_PLUGIN_ROOT:-}"
  local home_dir="${HOME:-}"

  local merged="{}"
  local file root defaults overlay attempt

  # Layer 1: plugin defaults. The root must be non-empty, otherwise the path
  # would silently resolve to /config.json.
  file="${plugin_root}/config.json"
  if [[ -n "$plugin_root" && -f "$file" ]]; then
    # Accept the defaults only if they merge into an object; an empty or
    # non-object file must not poison $merged for the overlay passes below.
    if defaults=$(jq '.' "$file" 2>/dev/null) \
      && attempt=$(jq -n --argjson a "$merged" --argjson b "$defaults" \
        '$a * $b' 2>/dev/null) \
      && [[ -n "$attempt" ]]; then
      merged="$attempt"
    fi
  fi

  # Layers 2 and 3: settings overlays, user-level first, then repo-level.
  for root in "$home_dir" "$repo_root"; do
    # Test the root itself: the joined path is never empty.
    [[ -n "$root" ]] || continue
    file="${root}/.claude/settings.json"
    [[ -f "$file" ]] || continue

    # Keep only the tribunal subtree so unrelated settings never leak in.
    overlay=$(jq '{ tribunal: (.tribunal // {}) }' "$file" 2>/dev/null) || continue
    [[ -z "$overlay" ]] && continue

    # Recursive merge: objects merge key by key, a null on the overlay side
    # keeps the existing value, anything else replaces it.
    if attempt=$(jq -n --argjson a "$merged" --argjson b "$overlay" '
      def deepmerge($a; $b):
        if ($a|type) == "object" and ($b|type) == "object" then
          reduce (($a|keys) + ($b|keys) | unique)[] as $k
            ({}; .[$k] = deepmerge($a[$k]; $b[$k]))
        elif $b == null then $a
        else $b end;
      deepmerge($a; $b)
    ' 2>/dev/null) && [[ -n "$attempt" ]]; then
      merged="$attempt"
    fi
  done

  _TRIBUNAL_CONFIG="$merged"
}
61
+
62
# Print one value from the loaded config as raw text (no JSON quoting).
# Nothing is printed when the path is missing or resolves to null or false.
# Usage: tribunal_config_get '.tribunal.session.gate_policy'
tribunal_config_get() {
  local jq_path="$1"
  jq -r "${jq_path} // empty" 2>/dev/null <<<"$_TRIBUNAL_CONFIG"
}
68
+
69
# Print one value from the loaded config as compact JSON, so arrays, objects,
# numbers and booleans keep their type. A missing path prints null.
# Usage: tribunal_config_get_json '.tribunal.session.judge_types'
tribunal_config_get_json() {
  local jq_path="$1"
  jq -c "${jq_path}" 2>/dev/null <<<"$_TRIBUNAL_CONFIG"
}
75
+
76
# Succeeds (status 0) only when tribunal.enabled resolves to the text "true";
# any other value, or no value at all, yields a non-zero status.
tribunal_config_enabled() {
  [[ "$(tribunal_config_get '.tribunal.enabled')" == "true" ]]
}
82
+
83
# Succeeds (status 0) only when tribunal.stop_hook.enabled resolves to the
# text "true". An absent setting counts as disabled.
tribunal_config_stop_hook_enabled() {
  [[ "$(tribunal_config_get '.tribunal.stop_hook.enabled')" == "true" ]]
}
89
+
90
# Resolve the model id for a given judge_type.
# Precedence: tribunal.judges.<type>.model > tribunal.judges.model
#
# Arguments:
#   $1 - judge type name; treated as an opaque object key
# Globals:
#   _TRIBUNAL_CONFIG (read)
# Outputs:
#   The resolved model id on stdout, or nothing when neither level sets one.
tribunal_config_judge_model() {
  local judge_type="$1"
  local override

  # Hand the judge type to jq as data (--arg) rather than splicing it into
  # the filter text: a name containing quotes or backslashes would otherwise
  # break the filter or change what it selects.
  override=$(printf '%s' "$_TRIBUNAL_CONFIG" \
    | jq -r --arg jt "$judge_type" '.tribunal.judges[$jt].model // empty' \
      2>/dev/null)
  if [[ -n "$override" ]]; then
    printf '%s' "$override"
    return 0
  fi

  # No per-type override: fall back to the model shared by all judges.
  tribunal_config_get '.tribunal.judges.model'
}