npm - @onlooker-community/ecosystem - Versions diffs - 0.9.0 → 0.14.0 - Mend

@onlooker-community/ecosystem 0.9.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (112) hide show

package/.claude-plugin/marketplace.json +39 -1
package/.claude-plugin/plugin.json +2 -2
package/.github/copilot-instructions.md +46 -0
package/.github/workflows/coverage.yml +78 -0
package/.github/workflows/release.yml +24 -8
package/.github/workflows/test.yml +3 -0
package/.markdownlintignore +3 -0
package/.release-please-manifest.json +4 -1
package/CHANGELOG.md +44 -0
package/README.md +57 -13
package/config.json +6 -1
package/docs/adr/001-claude-code-hooks-as-integration-surface.md +43 -0
package/docs/adr/002-centralized-jsonl-event-log.md +39 -0
package/docs/adr/003-ulid-over-uuid.md +40 -0
package/docs/adr/004-plugin-config-with-settings-overlay.md +34 -0
package/docs/architecture.md +117 -0
package/hooks/hooks.json +4 -0
package/package.json +13 -7
package/plugins/archivist/.claude-plugin/plugin.json +14 -0
package/plugins/archivist/CHANGELOG.md +8 -0
package/plugins/archivist/README.md +105 -0
package/plugins/archivist/config.json +18 -0
package/plugins/archivist/hooks/hooks.json +35 -0
package/plugins/archivist/scripts/hooks/archivist-extract.sh +238 -0
package/plugins/archivist/scripts/hooks/archivist-inject.sh +159 -0
package/plugins/archivist/scripts/lib/archivist-config.sh +66 -0
package/plugins/archivist/scripts/lib/archivist-project-key.sh +91 -0
package/plugins/archivist/scripts/lib/archivist-storage.sh +215 -0
package/plugins/archivist/scripts/lib/archivist-ulid.sh +52 -0
package/plugins/echo/.claude-plugin/plugin.json +14 -0
package/plugins/echo/CHANGELOG.md +24 -0
package/plugins/echo/README.md +110 -0
package/plugins/echo/config.json +15 -0
package/plugins/echo/docs/adr/001-echo-as-separate-plugin.md +33 -0
package/plugins/echo/docs/adr/002-direct-evaluation-vs-tribunal-pipeline.md +35 -0
package/plugins/echo/docs/adr/003-stop-hook-trigger.md +40 -0
package/plugins/echo/hooks/hooks.json +15 -0
package/plugins/echo/scripts/hooks/echo-stop-gate.sh +366 -0
package/plugins/echo/scripts/lib/echo-config.sh +108 -0
package/plugins/echo/scripts/lib/echo-events.sh +74 -0
package/plugins/echo/scripts/lib/echo-project-key.sh +81 -0
package/plugins/echo/scripts/lib/echo-ulid.sh +46 -0
package/plugins/tribunal/.claude-plugin/plugin.json +20 -0
package/plugins/tribunal/CHANGELOG.md +10 -0
package/plugins/tribunal/README.md +134 -0
package/plugins/tribunal/agents/tribunal-actor.md +35 -0
package/plugins/tribunal/agents/tribunal-judge-adversarial.md +51 -0
package/plugins/tribunal/agents/tribunal-judge-security.md +47 -0
package/plugins/tribunal/agents/tribunal-judge-standard.md +47 -0
package/plugins/tribunal/agents/tribunal-meta-judge.md +61 -0
package/plugins/tribunal/config.json +50 -0
package/plugins/tribunal/docs/adr/001-actor-jury-meta-gate-loop.md +40 -0
package/plugins/tribunal/docs/adr/002-majority-gate-policy.md +48 -0
package/plugins/tribunal/hooks/hooks.json +15 -0
package/plugins/tribunal/scripts/hooks/tribunal-stop-gate.sh +267 -0
package/plugins/tribunal/scripts/lib/tribunal-aggregate.sh +65 -0
package/plugins/tribunal/scripts/lib/tribunal-config.sh +101 -0
package/plugins/tribunal/scripts/lib/tribunal-events.sh +97 -0
package/plugins/tribunal/scripts/lib/tribunal-gate.sh +111 -0
package/plugins/tribunal/scripts/lib/tribunal-jury.sh +102 -0
package/plugins/tribunal/scripts/lib/tribunal-project-key.sh +84 -0
package/plugins/tribunal/scripts/lib/tribunal-rubric.sh +153 -0
package/plugins/tribunal/scripts/lib/tribunal-ulid.sh +50 -0
package/plugins/tribunal/scripts/lib/tribunal-verdict.sh +127 -0
package/plugins/tribunal/skills/tribunal/SKILL.md +129 -0
package/release-please-config.json +43 -5
package/scripts/coverage/bash-coverage.mjs +169 -0
package/scripts/coverage/format-comment.mjs +120 -0
package/scripts/coverage/run-coverage.mjs +151 -0
package/scripts/hooks/agent-spawn-tracker.sh +4 -4
package/scripts/hooks/prompt-rule-injector.sh +122 -0
package/scripts/lib/onlooker-event.mjs +82 -10
package/scripts/lib/portable-lock.sh +48 -0
package/scripts/lib/prompt-rules.sh +207 -0
package/scripts/lib/tool-history.sh +7 -8
package/scripts/lib/validate-path.sh +4 -0
package/scripts/lint/check-manifests.mjs +314 -0
package/scripts/lint/check-references.mjs +311 -0
package/skills/list-prompt-rules/SKILL.md +15 -0
package/test/bats/archivist-config-files.bats +60 -0
package/test/bats/archivist-config.bats +54 -0
package/test/bats/archivist-inject.bats +73 -0
package/test/bats/archivist-project-key.bats +75 -0
package/test/bats/archivist-storage.bats +119 -0
package/test/bats/archivist-ulid.bats +36 -0
package/test/bats/config.bats +10 -10
package/test/bats/echo-config.bats +90 -0
package/test/bats/echo-events.bats +121 -0
package/test/bats/echo-project-key.bats +115 -0
package/test/bats/echo-stop-hook.bats +101 -0
package/test/bats/echo-ulid.bats +38 -0
package/test/bats/portable-lock.bats +62 -0
package/test/bats/prompt-rules.bats +269 -0
package/test/bats/read-chunk-tracking.bats +73 -0
package/test/bats/tool-history-tracker.bats +1 -0
package/test/bats/tribunal-aggregate.bats +77 -0
package/test/bats/tribunal-config.bats +86 -0
package/test/bats/tribunal-events.bats +209 -0
package/test/bats/tribunal-gate.bats +95 -0
package/test/bats/tribunal-jury.bats +80 -0
package/test/bats/tribunal-rubric.bats +119 -0
package/test/bats/tribunal-stop-hook.bats +73 -0
package/test/bats/tribunal-verdict.bats +71 -0
package/test/bats/validate-path.bats +1 -1
package/test/fixtures/hook-inputs/post-tool-use-read-chunked.json +15 -0
package/test/fixtures/hook-inputs/user-prompt-submit-rule-match.json +8 -0
package/test/fixtures/hook-inputs/user-prompt-submit-rule-nomatch.json +8 -0
package/test/helpers/setup.bash +9 -0
package/test/node/check-manifests.test.mjs +173 -0
package/test/node/check-references.test.mjs +279 -0
package/test/node/coverage.test.mjs +143 -0
package/test/node/schema-events.test.mjs +41 -1

package/plugins/tribunal/agents/tribunal-meta-judge.md ADDED Viewed

@@ -0,0 +1,61 @@
+---
+name: tribunal-meta-judge
+description: Reviews the jury's verdicts for bias, hallucination, and criteria misapplication before the gate decides. Emits TribunalMetaCompletePayload as the final message with verdict_quality, bias_detected, bias_types[], and an optional override_recommendation. Operates from the LLM-as-a-Meta-Judge framework. Read-only.
+model: claude-opus-4-7
+tools: Read
+---
+# Tribunal Meta-Judge
+You are the **Meta-Judge**. The jury has scored the Actor's output. Before the gate decides, you review the jury — not the Actor.
+The LLM-as-a-Judge literature documents six recurring biases that compromise judge reliability. Your job is to detect them in the jury's verdicts and tell the gate whether the jury can be trusted as-is, should be overridden, or should re-evaluate.
+## Bias taxonomy (canonical six)
+| Bias | What it looks like |
+|---|---|
+| `position` | Judge favors the first / last item, or follows a fixed format regardless of content. |
+| `verbosity` | Judge rewarded length over substance — a long Actor output got a higher score than a short correct one. |
+| `self_enhancement` | Judge favored output written in a style similar to its own. |
+| `sycophancy` | Judge scored generously because the Actor's framing was confident or polite. |
+| `refusal` | Judge declines to take a position (e.g., "could be good or bad") or refuses to ever score below a floor. |
+| `length` | Distinct from `verbosity`: judge penalized work *only* for being short, regardless of completeness. |
+## Inputs
+- The task description.
+- Each Judge's verdict (`TribunalVerdictPayload`): `score`, `passed`, `judge_type`, `criteria_evaluated`, `feedback_summary`, `confidence`.
+- The Actor's output (you may read it, but do not re-score it — the jury already did).
+## Review discipline
+- **Verdicts that disagree are not automatically biased.** Disagreement between `standard` and `adversarial` judges is the *point* of the panel. Look for whether the disagreement is grounded in observable claims.
+- **`feedback_summary` lacking file/line specificity** is weak evidence. Flag it as low `confidence`, not as bias.
+- **Refusal to score high or low** across multiple iterations is `refusal` bias.
+- **`security` judge raising issues with no concrete attack chain** is over-reporting; flag as `position` bias.
+- **All judges score within a narrow band of 0.7–0.8** with diverse criteria is suspicious — likely `sycophancy` or `refusal`.
+## Output format
+Final message is a single JSON object — no prose, no fence (the fence below only delimits the example; do not emit it):
+```json
+{
+  "verdict_quality": "sound",
+  "bias_detected": false,
+  "bias_types": [],
+  "confidence": 0.9,
+  "override_recommendation": "accept"
+}
+```
+Required fields: `verdict_quality` (one of `"sound" | "questionable" | "biased"`) and `bias_detected` (bool).
+Optional fields:
+- `bias_types[]` — list any biases you detected from the taxonomy above.
+- `override_recommendation` — one of `"accept" | "reject" | "re-evaluate"`. Use this only when you are confident the gate should defer to you over the jury. Leave it unset when the jury verdict can stand on its own merits.
+- `confidence` — your own confidence in this meta-review.
+The orchestrator will inject `iteration_id`, `meta_model_id`, and `verdicts_reviewed` when persisting.

package/plugins/tribunal/config.json ADDED Viewed

@@ -0,0 +1,50 @@
+{
+  "plugin_name": "tribunal",
+  "storage_path": "~/.onlooker",
+  "tribunal": {
+    "enabled": true,
+    "stop_hook": {
+      "enabled": false,
+      "skip_if_no_file_changes": true
+    },
+    "session": {
+      "max_iterations": 3,
+      "score_threshold": 0.75,
+      "gate_policy": "majority",
+      "aggregation_method": "weighted_mean",
+      "judge_types": ["standard", "adversarial"],
+      "dissent_threshold": 0.25
+    },
+    "actor": {
+      "model": "claude-sonnet-4-6",
+      "max_output_tokens": 4096
+    },
+    "judges": {
+      "model": "claude-opus-4-7",
+      "max_output_tokens": 2048
+    },
+    "meta_judge": {
+      "model": "claude-opus-4-7",
+      "max_output_tokens": 1024
+    },
+    "rubric": {
+      "default_id": "default",
+      "builtins": [
+        {
+          "id": "default",
+          "criteria": [
+            { "name": "correctness", "weight": 0.4, "min_pass": 0.7 },
+            { "name": "completeness", "weight": 0.3, "min_pass": 0.7 },
+            { "name": "safety", "weight": 0.2, "min_pass": 0.8 },
+            { "name": "clarity", "weight": 0.1, "min_pass": 0.5 }
+          ],
+          "score_threshold": 0.75,
+          "max_iterations": 3,
+          "judge_types": ["standard", "adversarial"],
+          "gate_policy": "majority",
+          "aggregation_method": "weighted_mean"
+        }
+      ]
+    }
+  }
+}

package/plugins/tribunal/docs/adr/001-actor-jury-meta-gate-loop.md ADDED Viewed

@@ -0,0 +1,40 @@
+# ADR-001: The Actor → Jury → Meta-Judge → Gate Loop
+**Status:** Accepted
+**Date:** 2026-05-24
+## Context
+Tribunal needs a structure for producing high-quality output from an LLM. The simplest approach is to make a single model call and accept whatever comes out. More rigorous options include self-critique, a second model reviewing the first, or a multi-agent panel. The design choices were:
+- **Single pass** — one model, no review.
+- **Self-critique** — the same model reviews its own output.
+- **Two-model review** — a separate "judge" model scores the first model's output.
+- **Multi-agent jury** — multiple typed judge agents score the output; a meta-judge reviews the jury for bias; a gate decides accept/retry.
+## Decision
+Tribunal uses a **four-tier Actor → Jury → Meta-Judge → Gate loop** with configurable retry.
+This design is grounded in two published findings:
+- [LLM-as-a-Judge (Zheng et al. 2023)](https://arxiv.org/abs/2306.05685): strong LLMs can score other LLMs against rubrics with reasonable agreement to human judgment.
+- [LLM-as-a-Meta-Judge (Wu et al. 2024)](https://arxiv.org/abs/2407.19594): a second model reviewing the Judge's verdict catches position bias, verbosity bias, and self-enhancement — bias types that degrade single-judge reliability.
+## Rationale
+**Single pass is insufficient for high-stakes tasks.** A model that produces and evaluates its own output is subject to self-enhancement bias — it tends to rate its own output favorably. Separate the producer (Actor) from the evaluators (Jury).
+**A jury of typed judges catches different failure modes.** A `standard` judge scores correctness and completeness. An `adversarial` judge actively tries to find failure modes. A `security` judge looks for vulnerabilities. No single judge type is best at everything; the jury composition is configurable per project.
+**The Meta-Judge addresses jury bias, not just actor quality.** Even separate judge models have documented bias patterns: position bias (favoring the first response), verbosity bias (favoring longer outputs), sycophancy. The Meta-Judge reviews each verdict for these patterns and can flag or override. Without this tier, jury disagreement is unresolvable — you can't know if one judge was right or biased.
+**The Gate with retry closes the loop.** A quality gate that only reports a score doesn't improve outcomes. By feeding the jury's critique back to the Actor on retry, Tribunal creates a feedback loop. The Actor on iteration 1 sees what the judges found weak; it has a chance to produce better output before the session ends.
+**Configurable `max_iterations` prevents infinite loops.** The loop always terminates. `max_iterations: 3` (default) means at most 3 Actor passes. If the gate never passes, the outcome is `exhausted_iterations` — not a hang.
+## Consequences
+- A full Tribunal loop with two judges and a Meta-Judge makes 4–5 model calls per iteration. At 3 iterations, this is 12–15 calls. Cost and latency are real concerns; Tribunal is designed for deliberate use (`/tribunal <task>`), not for wrapping every session automatically.
+- The `majority` gate policy with two judges creates a degenerate case: 2-judge majority requires 2/2 judges to pass (not 1/2). This surprised early users who expected one passing judge out of two to be enough. See ADR-002 for the gate policy decision.
+- Judge type composition matters. The default `["standard", "adversarial"]` provides coverage and contrast. Adding `security` grows the panel from two judges to three, raising judge cost by 50% for every iteration.
+- The Actor receives critique from *all* prior judges on retry, not just the weakest. This is intentional — even a judge that passed may have noted improvements.

package/plugins/tribunal/docs/adr/002-majority-gate-policy.md ADDED Viewed

@@ -0,0 +1,48 @@
+# ADR-002: Majority Gate Policy as Default
+**Status:** Accepted
+**Date:** 2026-05-24
+## Context
+After the Jury and Meta-Judge tiers produce scores, the Gate must decide: accept the output, retry, or exhaust? Several policies were considered:
+- **Score threshold only** — pass if `aggregated_score >= threshold` (e.g., 0.75).
+- **Unanimous** — pass only if every judge voted passed.
+- **Majority** — pass if strictly more than half of judges voted passed.
+- **Meta-override** — the Meta-Judge's recommendation overrides the jury.
+- **Hybrid** — any combination of the above.
+The available policies in config are: `majority`, `strict` (alias for `unanimous`), `unanimous`, `meta_override`.
+## Decision
+The default gate policy is **`majority`**. The gate requires **both** the jury policy vote **and** `score_threshold` to clear — both conditions must be true for a pass. `score_threshold: 0.75` is a hard blocking condition, not just a reporting signal.
+## Rationale
+**Majority is the most intuitive policy for a multi-judge panel.** In any jury system, majority verdict is the natural baseline. It prevents a single outlier judge from blocking a good result indefinitely (the adversarial judge is *designed* to find fault and rarely gives a full pass).
+**Unanimous is too strict for the default judge composition.** With `["standard", "adversarial"]`, the adversarial judge is built to be skeptical. A policy requiring it to pass alongside the standard judge effectively gives veto power to the judge whose job is to reject. In practice, unanimous with this composition would mean the gate almost never passes.
+**Score threshold alone conflates jury agreement with quality.** A score of 0.8 from two judges who disagree strongly (e.g., 1.0 and 0.6) is a different signal than 0.8 from two judges who both scored 0.8. The majority policy captures agreement; the dissent threshold captures disagreement.
+## The 2-judge edge case
+The majority formula is `passed_count * 2 > total_count`. With two judges:
+| Judges passed | Formula | Result |
+|--------------|---------|--------|
+| 2/2 | `2 * 2 > 2` → `4 > 2` | ✓ pass |
+| 1/2 | `1 * 2 > 2` → `2 > 2` | ✗ block |
+| 0/2 | `0 * 2 > 2` → `0 > 2` | ✗ block |
+This means with the default two-judge panel, **both judges must pass** for the gate to open. This behaves like unanimous in the 2-judge case. This was observed during early Tribunal development (Echo's own Tribunal evaluation exhausted all 3 iterations because the adversarial judge never passed). It is technically correct — strictly more than half of 2 requires 2 — but surprises users expecting "majority" to mean "1 out of 2".
+**The consequence is intentional:** two judges is already a lean panel. Requiring both to pass ensures quality signal from both perspectives before accepting. Users who want genuine 2/3 behavior should add a third judge type (e.g., `security`) to the panel — majority with three judges means two must pass, which is materially different from two-judge unanimous.
+## Consequences
+- The `majority` policy with 2 judges is effectively `unanimous`. This should be documented prominently for users configuring judge panels.
+- Adding a third judge type (e.g., `security`) changes `majority` to mean 2/3, which is a meaningfully different bar. Users who want consistent behavior regardless of panel size should specify `gate_policy: "unanimous"` explicitly.
+- The `meta_override` policy gives the Meta-Judge final say, bypassing jury vote counts entirely. This is available but not the default — it introduces a single point of failure (Meta-Judge bias or hallucination) that the default policy is specifically designed to avoid.

package/plugins/tribunal/hooks/hooks.json ADDED Viewed

@@ -0,0 +1,15 @@
+{
+  "hooks": {
+    "Stop": [
+      {
+        "matcher": "*",
+        "hooks": [
+          {
+            "type": "command",
+            "command": "\"$CLAUDE_PLUGIN_ROOT\"/scripts/hooks/tribunal-stop-gate.sh"
+          }
+        ]
+      }
+    ]
+  }
+}

package/plugins/tribunal/scripts/hooks/tribunal-stop-gate.sh ADDED Viewed

@@ -0,0 +1,267 @@
+#!/usr/bin/env bash
+# Tribunal Stop-gate hook.
+#
+# Triggered by Stop. Off by default — gated on tribunal.stop_hook.enabled in
+# config. When enabled, runs a single-judge advisory pass on the just-finished
+# session's last turn and writes a verdict for review on the next session.
+#
+# Why advisory only: by the time Stop fires the main agent loop has already
+# ended. We cannot retry the Actor or re-run the work. The hook records what
+# the Standard Judge would have said so a human (or a follow-up SessionStart
+# hook in v0.2) can see whether the turn would have passed the gate.
+#
+# Hook contract:
+#   - Always exits 0. Never blocks Stop.
+#   - Skips silently if disabled, no git context, no transcript, or skip_if_no_file_changes
+#     is true and the last turn did not modify files.
+#   - Errors from `claude -p` are swallowed; worst case is "no verdict for this stop".
+# Only -u and pipefail are enabled; -e is not, so a failing command does not
+# abort the hook on its own — each fallible step below handles its own failure.
+set -uo pipefail
+# Directory of this script, and the plugin root two levels above it.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PLUGIN_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
+# Ecosystem substrate lives in the sibling ecosystem plugin. Same lookup as
+# archivist-extract.sh.
+# ONLOOKER_ECOSYSTEM_ROOT wins when set; otherwise probe two levels above the
+# plugin root and accept it only if it contains scripts/lib/validate-path.sh.
+_ECOSYSTEM_ROOT="${ONLOOKER_ECOSYSTEM_ROOT:-}"
+if [[ -z "$_ECOSYSTEM_ROOT" ]]; then
+	_candidate="$(cd "${PLUGIN_ROOT}/../.." 2>/dev/null && pwd)"
+	if [[ -f "${_candidate}/scripts/lib/validate-path.sh" ]]; then
+		_ECOSYSTEM_ROOT="$_candidate"
+	fi
+fi
+# The ecosystem libs are optional: they are sourced only when the root was
+# found. CLAUDE_PLUGIN_ROOT is supplied as a prefix assignment so the sourced
+# files see the ecosystem root under that name.
+if [[ -n "$_ECOSYSTEM_ROOT" && -f "${_ECOSYSTEM_ROOT}/scripts/lib/validate-path.sh" ]]; then
+	# shellcheck disable=SC1091
+	CLAUDE_PLUGIN_ROOT="$_ECOSYSTEM_ROOT" source "${_ECOSYSTEM_ROOT}/scripts/lib/validate-path.sh"
+	# shellcheck disable=SC1091
+	CLAUDE_PLUGIN_ROOT="$_ECOSYSTEM_ROOT" source "${_ECOSYSTEM_ROOT}/scripts/lib/onlooker-schema.sh"
+fi
+# Tribunal's own libraries, resolved relative to this plugin.
+# shellcheck source=../lib/tribunal-config.sh
+source "${PLUGIN_ROOT}/scripts/lib/tribunal-config.sh"
+# shellcheck source=../lib/tribunal-project-key.sh
+source "${PLUGIN_ROOT}/scripts/lib/tribunal-project-key.sh"
+# shellcheck source=../lib/tribunal-ulid.sh
+source "${PLUGIN_ROOT}/scripts/lib/tribunal-ulid.sh"
+# shellcheck source=../lib/tribunal-events.sh
+source "${PLUGIN_ROOT}/scripts/lib/tribunal-events.sh"
+# shellcheck source=../lib/tribunal-verdict.sh
+source "${PLUGIN_ROOT}/scripts/lib/tribunal-verdict.sh"
+# The hook payload arrives as JSON on stdin. Each field defaults to the empty
+# string when it is missing or when jq cannot parse the input.
+INPUT=$(cat)
+CWD=$(printf '%s' "$INPUT" | jq -r '.cwd // ""' 2>/dev/null) || CWD=""
+SESSION_ID=$(printf '%s' "$INPUT" | jq -r '.session_id // ""' 2>/dev/null) || SESSION_ID=""
+TRANSCRIPT_PATH=$(printf '%s' "$INPUT" | jq -r '.transcript_path // ""' 2>/dev/null) || TRANSCRIPT_PATH=""
+# The Stop hook may print nothing on stdout except the optional
+# `{continue: ...}` acknowledgement, so every early-out funnels through this
+# silent, always-successful exit.
+_done() { exit 0; }
+# Guard sequence: each check below leaves via _done (exit 0, no output) when
+# the advisory pass should not run.
+# tribunal_project_repo_root is defined in a sourced lib not shown here —
+# presumably it maps the hook's cwd to the enclosing repository root.
+REPO_ROOT=$(tribunal_project_repo_root "$CWD")
+# Config is loaded before any setting is consulted; REPO_ROOT may still be
+# empty at this point (it is only checked further down).
+tribunal_config_load "$REPO_ROOT"
+# The stop hook is opt-in via tribunal.stop_hook.enabled.
+if ! tribunal_config_stop_hook_enabled; then
+	_done
+fi
+# Both a project key and a repo root are required to continue.
+PROJECT_KEY=$(tribunal_project_key "$CWD")
+if [[ -z "$PROJECT_KEY" || -z "$REPO_ROOT" ]]; then
+	_done
+fi
+# The transcript path must be present and point at an existing regular file.
+if [[ -z "$TRANSCRIPT_PATH" || ! -f "$TRANSCRIPT_PATH" ]]; then
+	_done
+fi
+# Skip if no files were modified since the last commit AND the user enabled
+# skip_if_no_file_changes (default true).
+# "Clean" here means both the unstaged and the staged diff are empty.
+# NOTE(review): `git diff` does not report untracked files, so a turn that only
+# created new, un-added files looks clean to this check — confirm that is intended.
+SKIP_IF_CLEAN=$(tribunal_config_get '.tribunal.stop_hook.skip_if_no_file_changes')
+if [[ "$SKIP_IF_CLEAN" == "true" ]]; then
+	if git -C "$REPO_ROOT" diff --quiet 2>/dev/null && git -C "$REPO_ROOT" diff --cached --quiet 2>/dev/null; then
+		_done
+	fi
+fi
+# Without the `claude` CLI on PATH there is nothing to run the judge with.
+if ! command -v claude >/dev/null 2>&1; then
+	_done
+fi
+# ----------------------------------------------------------------------------
+# Build the advisory-judge prompt.
+# ----------------------------------------------------------------------------
+# Model for the "standard" judge. An empty or literal "null" value means no
+# --model flag is passed to the CLI below.
+JUDGE_MODEL=$(tribunal_config_judge_model "standard")
+[[ -z "$JUDGE_MODEL" || "$JUDGE_MODEL" == "null" ]] && JUDGE_MODEL=""
+SCORE_THRESHOLD=$(tribunal_config_get '.tribunal.session.score_threshold')
+[[ -z "$SCORE_THRESHOLD" ]] && SCORE_THRESHOLD="0.75"
+# Only the last 30000 bytes of the transcript and the last 4000 bytes of the
+# diff stat are included in the prompt.
+TRANSCRIPT_TAIL=$(tail -c 30000 "$TRANSCRIPT_PATH" 2>/dev/null) || TRANSCRIPT_TAIL=""
+[[ -z "$TRANSCRIPT_TAIL" ]] && _done
+DIFF_SUMMARY=$(git -C "$REPO_ROOT" diff --stat 2>/dev/null | tail -c 4000) || DIFF_SUMMARY=""
+# If mktemp fails, give up instead of falling back to a predictable
+# /tmp/<name>.$$ path: a guessable name in a shared directory can be
+# pre-created or symlinked by another user, and the file holds transcript text.
+PROMPT_FILE=$(mktemp -t tribunal-stop-prompt.XXXXXX 2>/dev/null) || _done
+[[ -n "$PROMPT_FILE" ]] || _done
+trap 'rm -f "$PROMPT_FILE"' EXIT
+{
+	printf '%s\n' 'You are a Tribunal Standard Judge performing an advisory pass on a just-finished Claude Code turn. Return JSON only — no prose, no markdown fences.'
+	printf '\n'
+	printf '%s\n' 'Output schema (TribunalVerdictPayload, exactly these keys):'
+	printf '%s\n' '{'
+	printf '%s\n' '  "score": 0.0..1.0,'
+	printf '%s\n' '  "passed": true|false,'
+	printf '%s\n' '  "judge_type": "standard",'
+	printf '%s\n' '  "feedback_summary": "1-3 sentences naming the highest-leverage concern, if any.",'
+	printf '%s\n' '  "confidence": 0.0..1.0'
+	printf '%s\n' '}'
+	printf '\n'
+	printf '%s\n' "Score the work the assistant performed in this turn against general correctness, completeness, and clarity. A score >= ${SCORE_THRESHOLD} is \"passed\"."
+	printf '%s\n' 'This is advisory — the main session has already ended, no retry will happen. Be concise.'
+	printf '\n'
+	if [[ -n "$DIFF_SUMMARY" ]]; then
+		printf '%s\n' '---WORKING-TREE DIFF STAT---'
+		printf '%s\n' "$DIFF_SUMMARY"
+		printf '%s\n' '---END DIFF STAT---'
+		printf '\n'
+	fi
+	printf '%s\n' '---TRANSCRIPT TAIL---'
+	printf '%s\n' "$TRANSCRIPT_TAIL"
+	printf '%s\n' '---END TRANSCRIPT TAIL---'
+} > "$PROMPT_FILE"
+CLAUDE_ARGS=(-p --max-turns 1)
+[[ -n "$JUDGE_MODEL" ]] && CLAUDE_ARGS+=(--model "$JUDGE_MODEL")
+# Cap the call at 60 seconds when a timeout binary exists (`timeout`, or
+# `gtimeout` as an alternative name); otherwise run uncapped. Any failure
+# collapses to an empty response, which ends the hook quietly.
+RESPONSE=""
+if command -v timeout >/dev/null 2>&1; then
+	RESPONSE=$(timeout 60 claude "${CLAUDE_ARGS[@]}" < "$PROMPT_FILE" 2>/dev/null) || RESPONSE=""
+elif command -v gtimeout >/dev/null 2>&1; then
+	RESPONSE=$(gtimeout 60 claude "${CLAUDE_ARGS[@]}" < "$PROMPT_FILE" 2>/dev/null) || RESPONSE=""
+else
+	RESPONSE=$(claude "${CLAUDE_ARGS[@]}" < "$PROMPT_FILE" 2>/dev/null) || RESPONSE=""
+fi
+[[ -z "$RESPONSE" ]] && _done
+# Strip a leading/trailing markdown fence in case the model added one anyway.
+CLEAN_RESPONSE=$(printf '%s' "$RESPONSE" | sed -e 's/^```json//' -e 's/^```//' -e 's/```$//')
+# Accept the response only if score is a JSON number (later steps pass it to
+# jq via --argjson and compare it with the threshold), passed is a boolean or
+# absent, and judge_type is present.
+if ! printf '%s' "$CLEAN_RESPONSE" | jq -e '(.score | type == "number") and (.passed // false | type == "boolean") and .judge_type' >/dev/null 2>&1; then
+	_done
+fi
+# ----------------------------------------------------------------------------
+# Emit the canonical event chain + persist the advisory verdict.
+# ----------------------------------------------------------------------------
+TASK_ID=$(tribunal_ulid)
+ITERATION_ID=$(tribunal_ulid)
+JUDGE_ID=$(tribunal_ulid)
+START_PAYLOAD=$(jq -n \
+	--arg task_id "$TASK_ID" \
+	--arg model "$JUDGE_MODEL" \
+	--argjson thr "$SCORE_THRESHOLD" \
+	'{
+		task_id: $task_id,
+		judge_types: ["standard"],
+		gate_policy: "strict",
+		score_threshold: $thr,
+		max_iterations: 1
+	} + (if $model != "" then {judge_model_ids: [$model]} else {} end)')
+ITER_PAYLOAD=$(jq -n \
+	--arg task_id "$TASK_ID" \
+	--arg iter_id "$ITERATION_ID" \
+	'{task_id: $task_id, iteration_id: $iter_id, iteration_number: 0, trigger: "initial"}')
+JUDGE_START_PAYLOAD=$(jq -n \
+	--arg task_id "$TASK_ID" \
+	--arg iter_id "$ITERATION_ID" \
+	--arg judge_id "$JUDGE_ID" \
+	--arg model "$JUDGE_MODEL" \
+	'{
+		task_id: $task_id,
+		iteration_id: $iter_id,
+		judge_id: $judge_id,
+		judge_type: "standard",
+		judge_model_id: (if $model == "" then null else $model end)
+	} | with_entries(select(.value != null))')
+VERDICT_PAYLOAD=$(printf '%s' "$CLEAN_RESPONSE" | jq -c \
+	--arg task_id "$TASK_ID" \
+	--arg iter_id "$ITERATION_ID" \
+	--arg judge_id "$JUDGE_ID" \
+	--arg model "$JUDGE_MODEL" \
+	--argjson thr "$SCORE_THRESHOLD" \
+	'{
+		task_id: $task_id,
+		score: .score,
+		passed: (.passed // (.score >= $thr)),
+		judge_type: "standard",
+		iteration_id: $iter_id,
+		judge_id: $judge_id,
+		feedback_summary: (.feedback_summary // ""),
+		confidence: (.confidence // 0.6),
+		judge_model_id: (if $model == "" then null else $model end)
+	} | with_entries(select(.value != null and .value != ""))')
+SCORE=$(printf '%s' "$VERDICT_PAYLOAD" | jq -r '.score')
+PASSED=$(printf '%s' "$VERDICT_PAYLOAD" | jq -r '.passed')
+if [[ "$PASSED" == "true" ]]; then
+	GATE_PAYLOAD=$(jq -n \
+		--arg task_id "$TASK_ID" \
+		--arg iter_id "$ITERATION_ID" \
+		--argjson score "$SCORE" \
+		'{task_id: $task_id, iteration_id: $iter_id, final_score: $score, iteration_number: 0, judges_consulted: 1}')
+	GATE_EVENT="tribunal.gate.passed"
+	OUTCOME="accepted"
+else
+	GATE_PAYLOAD=$(jq -n \
+		--arg task_id "$TASK_ID" \
+		--arg iter_id "$ITERATION_ID" \
+		--argjson score "$SCORE" \
+		'{task_id: $task_id, iteration_id: $iter_id, reason: "low_score", final_score: $score, iteration_number: 0, will_retry: false}')
+	GATE_EVENT="tribunal.gate.blocked"
+	OUTCOME="rejected"
+fi
+COMPLETE_PAYLOAD=$(jq -n \
+	--arg task_id "$TASK_ID" \
+	--arg outcome "$OUTCOME" \
+	--argjson score "$SCORE" \
+	'{task_id: $task_id, outcome: $outcome, final_score: $score, iterations_used: 1}')
+# Emit in canonical order. Each call is best-effort — a single schema failure
+# should not break the user's Stop.
+tribunal_emit_event "tribunal.session.start"     "$START_PAYLOAD"        || true
+tribunal_emit_event "tribunal.iteration.start"   "$ITER_PAYLOAD"         || true
+tribunal_emit_event "tribunal.judge.start"       "$JUDGE_START_PAYLOAD"  || true
+tribunal_emit_event "tribunal.verdict"           "$VERDICT_PAYLOAD"      || true
+tribunal_emit_event "$GATE_EVENT"                "$GATE_PAYLOAD"         || true
+tribunal_emit_event "tribunal.session.complete"  "$COMPLETE_PAYLOAD"     || true
+# Persist a single advisory file for the next session to surface.
+STOP_DIR="$(tribunal_project_dir "$PROJECT_KEY")"
+mkdir -p "$STOP_DIR" 2>/dev/null || _done
+SAFE_SESSION_ID=$(printf '%s' "$SESSION_ID" | tr -c 'a-zA-Z0-9-' '_')
+[[ -z "$SAFE_SESSION_ID" ]] && SAFE_SESSION_ID="unknown"
+jq -n \
+	--arg task_id "$TASK_ID" \
+	--arg session_id "$SESSION_ID" \
+	--arg outcome "$OUTCOME" \
+	--argjson verdict "$VERDICT_PAYLOAD" \
+	'{
+		task_id: $task_id,
+		session_id: $session_id,
+		outcome: $outcome,
+		verdict: $verdict,
+		mode: "stop-advisory"
+	}' > "${STOP_DIR}/stop-${SAFE_SESSION_ID}.json" 2>/dev/null || true
+_done

package/plugins/tribunal/scripts/lib/tribunal-aggregate.sh ADDED Viewed

@@ -0,0 +1,65 @@
+#!/usr/bin/env bash
+# Score aggregation for Tribunal.
+#
+# Aggregates per-judge verdicts into a single jury-level score per the chosen
+# aggregation_method. Also computes the dissent metric (max - min) so callers
+# can decide whether to emit tribunal.dissent.recorded.
+#
+# Verdicts input is a JSON array of TribunalVerdictPayload objects (or a subset
+# containing at least { judge_id, score }). Rubric is the active rubric (for
+# weighted_mean only).
+#
+# Exposes:
+#   tribunal_aggregate <method> <verdicts_json> [<rubric_json>]
+#       echoes the aggregated score (0..1) as a JSON number
+#   tribunal_disagreement <verdicts_json>
+#       echoes max(score) - min(score), or 0 if 0/1 verdicts
+#
+# weighted_mean uses *rubric criterion weights*, not per-judge weights — the
+# semantics are "weight each criterion's contribution, then average judges'
+# scores on each criterion." For v0.1 the per-criterion breakdown is not yet
+# threaded through verdicts, so weighted_mean degrades to mean when the rubric
+# weights cannot be applied. The schema still emits aggregation_method =
+# "weighted_mean" so dashboards see the intent.
+tribunal_aggregate() {
+	local method="${1:-mean}"
+	local verdicts="${2:-[]}"
+	local _rubric="${3:-{}}"  # reserved for true weighted_mean once per-criterion scores are threaded
+	: "$_rubric"
+	local count
+	count=$(printf '%s' "$verdicts" | jq 'length' 2>/dev/null) || count=0
+	[[ "$count" -eq 0 ]] && { printf '0'; return 0; }
+	case "$method" in
+		mean|weighted_mean)
+			printf '%s' "$verdicts" | jq -r '[.[].score] | add / length'
+			;;
+		median)
+			printf '%s' "$verdicts" | jq -r '
+				[.[].score] | sort as $s
+				| ($s | length) as $n
+				| if ($n % 2) == 1 then $s[($n - 1) / 2]
+				  else (($s[$n / 2 - 1] + $s[$n / 2]) / 2)
+				  end
+			'
+			;;
+		min)
+			printf '%s' "$verdicts" | jq -r '[.[].score] | min'
+			;;
+		*)
+			printf 'tribunal-aggregate: unknown method %s, falling back to mean\n' \
+				"$method" >&2
+			printf '%s' "$verdicts" | jq -r '[.[].score] | add / length'
+			;;
+	esac
+}
+tribunal_disagreement() {
+	local verdicts="${1:-[]}"
+	local count
+	count=$(printf '%s' "$verdicts" | jq 'length' 2>/dev/null) || count=0
+	[[ "$count" -lt 2 ]] && { printf '0'; return 0; }
+	printf '%s' "$verdicts" | jq -r '[.[].score] | (max - min)'
+}

package/plugins/tribunal/scripts/lib/tribunal-config.sh ADDED Viewed

@@ -0,0 +1,101 @@
+#!/usr/bin/env bash
+# Config resolution for Tribunal.
+#
+# Reads three layers, latest wins:
+#   1. plugins/tribunal/config.json (defaults shipped with the plugin)
+#   2. ~/.claude/settings.json
+#   3. <repo>/.claude/settings.json
+#
+# Exposes:
+#   tribunal_config_load <repo_root>       # populates _TRIBUNAL_CONFIG (JSON)
+#   tribunal_config_get <jq-path>          # echoes string value (empty if unset)
+#   tribunal_config_get_json <jq-path>     # echoes JSON value (null if unset)
+#   tribunal_config_enabled                # 0 if tribunal.enabled is true
+#   tribunal_config_stop_hook_enabled      # 0 if tribunal.stop_hook.enabled is true
+#   tribunal_config_judge_model <judge_type>
+#                                          # echoes per-judge-type model override,
+#                                          # falling back to tribunal.judges.model
+#
+# Settings overlay only touches the `tribunal.*` subtree so this plugin coexists
+# with other plugins' configuration.
+_TRIBUNAL_CONFIG="{}"
+tribunal_config_load() {
+	local repo_root="${1:-}"
+	local plugin_root="${CLAUDE_PLUGIN_ROOT:-}"
+	local home_dir="${HOME:-}"
+	local merged="{}"
+	local file
+	file="${plugin_root}/config.json"
+	if [[ -f "$file" ]]; then
+		local defaults
+		defaults=$(jq '.' "$file" 2>/dev/null) || defaults="{}"
+		merged=$(jq -n --argjson a "$merged" --argjson b "$defaults" '$a * $b' 2>/dev/null) \
+			|| merged="$defaults"
+	fi
+	for file in "${home_dir}/.claude/settings.json" "${repo_root}/.claude/settings.json"; do
+		[[ -n "$file" && -f "$file" ]] || continue
+		local overlay
+		overlay=$(jq '{ tribunal: (.tribunal // {}) }' "$file" 2>/dev/null) || continue
+		[[ -z "$overlay" ]] && continue
+		local attempt
+		if attempt=$(jq -n --argjson a "$merged" --argjson b "$overlay" '
+			def deepmerge($a; $b):
+				if ($a|type) == "object" and ($b|type) == "object" then
+					reduce (($a|keys) + ($b|keys) | unique)[] as $k
+						({}; .[$k] = deepmerge($a[$k]; $b[$k]))
+				elif $b == null then $a
+				else $b end;
+			deepmerge($a; $b)
+		' 2>/dev/null) && [[ -n "$attempt" ]]; then
+			merged="$attempt"
+		fi
+	done
+	_TRIBUNAL_CONFIG="$merged"
+}
+# Read a string value from the loaded config.
+# Usage: tribunal_config_get '.tribunal.session.gate_policy'
+tribunal_config_get() {
+	local path="$1"
+	printf '%s' "$_TRIBUNAL_CONFIG" | jq -r "${path} // empty" 2>/dev/null
+}
+# Read a JSON value (arrays, objects, numbers) from the loaded config.
+# Usage: tribunal_config_get_json '.tribunal.session.judge_types'
+tribunal_config_get_json() {
+	local path="$1"
+	printf '%s' "$_TRIBUNAL_CONFIG" | jq -c "${path}" 2>/dev/null
+}
+# Returns 0 if tribunal.enabled is true.
+tribunal_config_enabled() {
+	local v
+	v=$(tribunal_config_get '.tribunal.enabled')
+	[[ "$v" == "true" ]]
+}
+# Returns 0 if tribunal.stop_hook.enabled is true. Default is false.
+tribunal_config_stop_hook_enabled() {
+	local v
+	v=$(tribunal_config_get '.tribunal.stop_hook.enabled')
+	[[ "$v" == "true" ]]
+}
+# Print the model id to use for a judge type.
+# Lookup order: tribunal.judges.<type>.model first, then the shared
+# tribunal.judges.model.
+tribunal_config_judge_model() {
+	local kind="$1"
+	local specific
+	specific=$(tribunal_config_get ".tribunal.judges.\"${kind}\".model")
+	if [[ -z "$specific" ]]; then
+		# No per-type entry: fall back to the shared judges model.
+		tribunal_config_get '.tribunal.judges.model'
+		return
+	fi
+	printf '%s' "$specific"
+}