npm - @balpal4495/quorum - Versions diffs - 0.2.0 → 0.4.0 - Mend

@balpal4495/quorum 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

package/README.md +223 -11
package/SETUP.md +30 -0
package/bin/commands/check.js +122 -0
package/bin/commands/commit.js +210 -0
package/bin/commands/init.js +236 -0
package/bin/commands/sentinel.js +160 -0
package/bin/commands/status.js +117 -0
package/bin/quorum.js +103 -0
package/bin/shared/chronicle.js +129 -0
package/bin/shared/colors.js +22 -0
package/bin/shared/patterns.js +83 -0
package/evals/__tests__/eval.test.ts +31 -0
package/evals/cases/auth_hs256_rejected.json +46 -0
package/evals/cases/auth_rs256_valid.json +30 -0
package/evals/cases/cache_missing_lock.json +31 -0
package/evals/cases/db_naive_not_null.json +32 -0
package/evals/cases/logging_pii_leak.json +32 -0
package/evals/cases/migration_with_rollback.json +43 -0
package/evals/cases/no_evidence_novel_design.json +16 -0
package/evals/cases/payment_no_idempotency.json +33 -0
package/evals/cases/redis_session_rejected.json +32 -0
package/evals/cases/safe_refactor.json +17 -0
package/evals/runner.ts +226 -0
package/modules/AGENTS.md +9 -5
package/modules/CLAUDE.md +25 -2
package/modules/README.md +153 -6
package/modules/council/chairman.ts +84 -14
package/modules/council/deliberate.ts +24 -4
package/modules/council/index.ts +6 -1
package/modules/council/risk.ts +89 -0
package/modules/council/types.ts +63 -1
package/modules/jury/evaluate.ts +32 -8
package/modules/jury/index.ts +3 -1
package/modules/jury/preflight.ts +101 -0
package/modules/jury/schema.ts +9 -0
package/modules/jury/types.ts +20 -1
package/modules/shared/types.ts +8 -0
package/package.json +3 -3

package/evals/runner.ts ADDED Viewed

@@ -0,0 +1,226 @@
+/**
+ * Eval runner for Quorum Jury + Council.
+ *
+ * Each case in evals/cases/ defines a proposal and what the system should produce.
+ * The runner validates:
+ *   - Jury confidence is within expected bounds
+ *   - Preflight detects the expected signals
+ *   - Risk classifier assigns the expected level
+ *   - Council recommendation matches (when an LLM provider is available)
+ *
+ * Preflight + risk classification run without any LLM (deterministic).
+ * Jury and Council assertions are skipped if no LLM provider is injected.
+ *
+ * Usage:
+ *   npx vitest run evals/
+ *
+ * Or run against a real LLM:
+ *   EVAL_LLM=openai npx vitest run evals/
+ */
+import { promises as fs } from "fs"
+import path from "path"
+import type { OracleResult, LLMProvider } from "../modules/shared/types"
+import { runPreflight } from "../modules/jury/preflight"
+import { classifyRisk } from "../modules/council/risk"
+/**
+ * Preflight signals a case expects the deterministic preflight to report.
+ * Every field is optional; an omitted field is simply not asserted.
+ */
+interface EvalPreflightExpectations {
+  touches_sensitive_area?: boolean
+  /** Areas that must each be present in `preflight.sensitive_areas`. */
+  sensitive_areas_include?: string[]
+  rollback_mentioned?: boolean
+  test_strategy_mentioned?: boolean
+  /** IDs that must each be present in `preflight.chronicle_conflicts`. */
+  chronicle_conflicts?: string[]
+}
+/**
+ * Everything a case may assert about the system's output.
+ * All fields are optional; only the ones present are checked by `runCase`.
+ */
+interface EvalExpectations {
+  /** Inclusive lower bound for the Jury's `confidence`. */
+  jury_min_confidence?: number
+  /** Inclusive upper bound for the Jury's `confidence`. */
+  jury_max_confidence?: number
+  /** Expected Council recommendation; the Council is only run when this is set. */
+  council_recommendation?: "proceed" | "redesign" | "investigate-more"
+  /** Terms that must appear (case-insensitively) in the Council verdict or blocker issues. */
+  must_flag?: string[]
+  /** Terms that must not appear (case-insensitively) in the Council verdict or blocker issues. */
+  must_not_flag?: string[]
+  /** Entry IDs that must be present in the Council's `evidence_cited`. */
+  must_cite?: string[]
+  /** Expected `level` from the risk classifier. */
+  risk_level?: string
+  preflight_expects?: EvalPreflightExpectations
+}
+/**
+ * One eval fixture: a proposal (outcome + design + evidence pack) together
+ * with the expectations the runner asserts against it.
+ */
+export interface EvalCase {
+  id: string
+  description: string
+  outcome: string
+  design: string
+  oracle_evidence: OracleResult[]
+  expected: EvalExpectations
+}
+/** Report shape produced by the deterministic preflight (`runPreflight`). */
+type PreflightReport = ReturnType<typeof runPreflight>
+/** Classification shape produced by the deterministic risk classifier (`classifyRisk`). */
+type RiskReport = ReturnType<typeof classifyRisk>
+/**
+ * Outcome of running one eval case.
+ * `passed` is true exactly when `failures` is empty; `juryOutput` and
+ * `councilOutput` stay undefined when the corresponding stage did not run.
+ */
+export interface EvalResult {
+  caseId: string
+  description: string
+  passed: boolean
+  /** One human-readable message per failed expectation. */
+  failures: string[]
+  preflight: PreflightReport
+  risk: RiskReport
+  juryOutput?: unknown
+  councilOutput?: unknown
+  /** Wall-clock time spent in `runCase`, in milliseconds. */
+  durationMs: number
+}
+/**
+ * Load every `*.json` eval case from a directory.
+ *
+ * File names are sorted before reading so the returned order is stable
+ * across platforms (`readdir` makes no ordering guarantee). Each file is
+ * parsed as JSON and cast to `EvalCase`; its shape is not validated here.
+ *
+ * @param casesDir Directory to scan. Defaults to `cases/` next to this file.
+ * @returns The parsed cases, ordered by file name.
+ * @throws SyntaxError naming the offending file when a file is not valid JSON.
+ *   Errors from `readdir` / `readFile` propagate unchanged.
+ */
+export async function loadCases(casesDir?: string): Promise<EvalCase[]> {
+  const dir = casesDir ?? path.join(__dirname, "cases")
+  const files = (await fs.readdir(dir)).filter(f => f.endsWith(".json")).sort()
+  return Promise.all(
+    files.map(async f => {
+      const file = path.join(dir, f)
+      const raw = await fs.readFile(file, "utf8")
+      try {
+        return JSON.parse(raw) as EvalCase
+      } catch (err) {
+        // Re-throw as the same error type, but say which case file is broken.
+        const reason = err instanceof Error ? err.message : String(err)
+        throw new SyntaxError(`Invalid JSON in eval case ${file}: ${reason}`)
+      }
+    }),
+  )
+}
+/**
+ * Run a single eval case and collect every failed expectation.
+ *
+ * The risk-classifier and preflight assertions always run and need no LLM.
+ * When `llm` is supplied, the Jury is evaluated and its confidence is checked
+ * against the optional min/max bounds. The Council is deliberated only when
+ * the case sets `expected.council_recommendation` and the Jury produced an
+ * output; `must_flag`, `must_not_flag` and `must_cite` are checked against
+ * that Council output only.
+ *
+ * Errors thrown while evaluating the Jury or deliberating the Council are
+ * recorded as failures instead of being rethrown.
+ *
+ * @param evalCase The case to run.
+ * @param llm Optional provider; without it all LLM-dependent checks are skipped.
+ * @returns The case outcome; `passed` is true only when no failure was recorded.
+ */
+export async function runCase(
+  evalCase: EvalCase,
+  llm?: LLMProvider,
+): Promise<EvalResult> {
+  const startedAt = Date.now()
+  const failures: string[] = []
+  const { outcome, design, oracle_evidence: evidence, expected } = evalCase
+  // ── Deterministic stage: no LLM involved ───────────────────────────────────
+  const preflight = runPreflight(outcome, design, evidence)
+  const risk = classifyRisk(outcome, design, evidence)
+  const wantedRisk = expected.risk_level
+  if (wantedRisk && wantedRisk !== risk.level) {
+    failures.push(
+      `risk_level: expected "${wantedRisk}", got "${risk.level}" (reasons: ${risk.reasons.join(", ")})`,
+    )
+  }
+  const wanted = expected.preflight_expects
+  if (wanted) {
+    // The three boolean signals share one comparison; order matches the report order.
+    const booleanSignals = ["touches_sensitive_area", "rollback_mentioned", "test_strategy_mentioned"] as const
+    for (const signal of booleanSignals) {
+      const want = wanted[signal]
+      if (want !== undefined && preflight[signal] !== want) {
+        failures.push(`preflight.${signal}: expected ${want}, got ${preflight[signal]}`)
+      }
+    }
+    for (const id of wanted.chronicle_conflicts ?? []) {
+      if (!preflight.chronicle_conflicts.includes(id)) {
+        failures.push(`preflight.chronicle_conflicts: expected "${id}" to be flagged`)
+      }
+    }
+    for (const area of wanted.sensitive_areas_include ?? []) {
+      if (!preflight.sensitive_areas.includes(area)) {
+        failures.push(`preflight.sensitive_areas: expected "${area}" to be detected`)
+      }
+    }
+  }
+  // ── LLM stage: only entered when a provider was injected ───────────────────
+  let juryOutput: unknown
+  let councilOutput: unknown
+  if (llm) {
+    const { evaluate } = await import("../modules/jury/evaluate")
+    try {
+      juryOutput = await evaluate({ outcome, design, evidence }, { llm })
+      const jury = juryOutput as { confidence: number; recommendation: string; assessment: string; gaps: string[] }
+      const { jury_min_confidence: floor, jury_max_confidence: ceiling } = expected
+      if (floor !== undefined && jury.confidence < floor) {
+        failures.push(`jury.confidence: expected ≥ ${floor}, got ${jury.confidence}`)
+      }
+      if (ceiling !== undefined && jury.confidence > ceiling) {
+        failures.push(`jury.confidence: expected ≤ ${ceiling}, got ${jury.confidence}`)
+      }
+    } catch (err) {
+      failures.push(`jury threw: ${String(err)}`)
+    }
+    if (expected.council_recommendation && juryOutput) {
+      const { deliberate } = await import("../modules/council/deliberate")
+      // Stub Oracle: nothing to query, proposals get a fixed ID, commit is refused.
+      const mockOracle = {
+        query: async () => [],
+        propose: async () => ({ proposalId: "eval-proposal" }),
+        commit: async () => { throw new Error("commit not available in eval") },
+      }
+      try {
+        councilOutput = await deliberate(
+          { outcome, design, evidence, jury_output: juryOutput as never },
+          { llm, oracle: mockOracle, advisorCount: 2, reviewerCount: 2 },
+        )
+        const council = councilOutput as { recommendation: string; verdict: string; blockers: Array<{ issue: string }>; evidence_cited: string[] }
+        if (council.recommendation !== expected.council_recommendation) {
+          failures.push(
+            `council.recommendation: expected "${expected.council_recommendation}", got "${council.recommendation}"`,
+          )
+        }
+        // Term checks search the verdict plus every blocker issue, case-insensitively.
+        const blockerIssues = council.blockers.map(b => b.issue)
+        const verdictText = [council.verdict, ...blockerIssues].join(" ").toLowerCase()
+        for (const term of expected.must_flag ?? []) {
+          if (!verdictText.includes(term.toLowerCase())) {
+            failures.push(`council must_flag: "${term}" not mentioned in verdict or blockers`)
+          }
+        }
+        for (const term of expected.must_not_flag ?? []) {
+          if (verdictText.includes(term.toLowerCase())) {
+            failures.push(`council must_not_flag: "${term}" was mentioned but should not be`)
+          }
+        }
+        for (const id of expected.must_cite ?? []) {
+          if (!council.evidence_cited.includes(id)) {
+            failures.push(`council must_cite: entry ID "${id}" not in evidence_cited`)
+          }
+        }
+      } catch (err) {
+        failures.push(`council threw: ${String(err)}`)
+      }
+    }
+  }
+  return {
+    caseId: evalCase.id,
+    description: evalCase.description,
+    passed: failures.length === 0,
+    failures,
+    preflight,
+    risk,
+    juryOutput,
+    councilOutput,
+    durationMs: Date.now() - startedAt,
+  }
+}
+/**
+ * Print a pass/fail report for a batch of eval results to the console.
+ *
+ * Output is a header with the pass count, then one line per case (icon,
+ * case ID, duration) followed by its failure messages when the case failed,
+ * all framed by 60-character horizontal rules.
+ *
+ * @param results Results to report, printed in the order given.
+ */
+export function printEvalSummary(results: EvalResult[]): void {
+  const rule = "─".repeat(60)
+  const passCount = results.reduce((count, result) => (result.passed ? count + 1 : count), 0)
+  console.log(`\n${rule}`)
+  console.log(`Eval results: ${passCount}/${results.length} passed`)
+  console.log(rule)
+  results.forEach(result => {
+    const mark = result.passed ? "✓" : "✗"
+    console.log(`${mark} ${result.caseId} (${result.durationMs}ms)`)
+    if (result.passed) return
+    result.failures.forEach(failure => {
+      console.log(`    → ${failure}`)
+    })
+  })
+  console.log(rule)
+}

package/modules/AGENTS.md CHANGED Viewed

@@ -20,8 +20,9 @@ When working inside this folder, follow these rules in addition to the root guid
 ### Jury
 | File | Owns |
 |---|---|
-| `jury/schema.ts` | Zod schema for structured LLM output. Source of truth for `JuryOutput` shape. |
-| `jury/evaluate.ts` | Four-dimension evaluation. **`council_brief` is always overridden from confidence here — do not remove this enforcement.** |
+| `jury/schema.ts` | Zod schema for structured LLM output. Source of truth for `JuryOutput` shape including `confidence_breakdown` and `blocking_gaps`. |
+| `jury/evaluate.ts` | Four-dimension evaluation. **Confidence is always recomputed from the breakdown average here — do not remove this. `council_brief` is also overridden from confidence.** |
+| `jury/preflight.ts` | Deterministic preflight — no LLM. Detects sensitive areas, rollback mention, and Chronicle conflicts before the LLM runs. Safe to extend with new patterns. |
 ### Council
 | File | Owns |
@@ -30,8 +31,9 @@ When working inside this folder, follow these rules in addition to the root guid
 | `council/frame.ts` | Sets deliberation tone from `council_brief`. Challenge vs pressure-test framing lives here. |
 | `council/advisors.ts` | Parallel advisor fan-out. Advisors must cite Oracle entry IDs — enforced in the prompt. |
 | `council/reviewers.ts` | Anonymisation of advisor responses + parallel reviewer fan-out. Anonymisation must happen before reviewers see responses. |
-| `council/chairman.ts` | Verdict synthesis + Zod validation. Throws on bad output — do not add fallbacks. |
-| `council/deliberate.ts` | Full pipeline orchestration. Calls `oracle.propose()` at the end — never `oracle.commit()`. |
+| `council/chairman.ts` | Verdict synthesis + Zod validation. Produces structured `blockers`/`warnings`, validates citations, tracks `advisor_split`. Throws on bad output — do not add fallbacks. |
+| `council/risk.ts` | Deterministic risk classifier — no LLM. Assigns `low/medium/high/critical` and `council_mode` from design text and refuted evidence. Drives advisor/reviewer fan-out counts. |
+| `council/deliberate.ts` | Full pipeline orchestration. Calls `oracle.propose()` at the end — never `oracle.commit()`. Risk classifier runs first to set fan-out counts. |
 ---
@@ -50,8 +52,10 @@ When working inside this folder, follow these rules in addition to the root guid
 ## Invariants — do not break these
 - `oracle.commit()` is never called without explicit human input. `deliberate()` calls `propose()` only.
-- `jury/evaluate.ts` always computes `council_brief` from `confidence` after parsing — never trusts the LLM value.
+- `jury/evaluate.ts` recomputes `confidence` as the exact average of `confidence_breakdown` dimensions — the LLM value is discarded.
+- `jury/evaluate.ts` derives `council_brief` from the recomputed confidence — never trusts the LLM value.
 - `chairman.ts` and `jury/evaluate.ts` throw on schema validation failure. Do not add try/catch that swallows these errors.
+- `deliberate.ts` passes `citation_validation.valid_ids` (not raw `evidence_cited`) to `oracle.propose()` — hallucinated IDs are stripped.
 - Query logging in `oracle/log.ts` is always best-effort — callers must not fail because of a log write error.
 - `VectorStore` and `embedder` are always injected — never imported directly inside Oracle logic.

package/modules/CLAUDE.md CHANGED Viewed

@@ -17,8 +17,8 @@ The entry point for a host application is `setup.ts`. Everything else is interna
 ### Dependency injection throughout
 No module imports a specific LLM provider, vector store, or embedder. All external dependencies are passed in as function arguments or via a deps object. If you add a new capability, follow this pattern — do not hardcode providers.
-### council_brief is computed, not trusted
-In `jury/evaluate.ts`, the `council_brief` field in the LLM response is **always overridden** based on the numeric `confidence` value after parsing. The LLM is not trusted to compute this correctly. Do not remove this override.
+### Confidence is recomputed from the breakdown — never trusted from the LLM
+In `jury/evaluate.ts`, after parsing the LLM response, `confidence` is recomputed as the exact average of the four `confidence_breakdown` dimensions. The LLM's stated `confidence` value is discarded. `council_brief` is then derived from this recomputed value. Do not remove either override.
 ### Throw on bad LLM output — never default to passing
 Both `jury/evaluate.ts` and `council/chairman.ts` throw if the LLM returns non-JSON or output that fails Zod validation. This is intentional. A silently passing Jury score is worse than an error. Do not add fallbacks or defaults.
@@ -26,6 +26,15 @@ Both `jury/evaluate.ts` and `council/chairman.ts` throw if the LLM returns non-J
 ### oracle.commit() is a human gate
 `council/deliberate.ts` calls `oracle.propose()` at the end of every deliberation. It never calls `oracle.commit()`. If you see a code path that calls `oracle.commit()` without explicit human input, that is a bug.
+### Oracle proposals use only validated citation IDs
+`deliberate.ts` passes `verdict.citation_validation.valid_ids` as `evidence_cited` when calling `oracle.propose()` — not the raw `evidence_cited` array from the Chairman. Hallucinated IDs (cited but not in the evidence pack) are stripped before the proposal is written.
+### Preflight runs before every Jury LLM call — do not remove it
+`jury/evaluate.ts` calls `runPreflight()` before building the user prompt. The preflight result is injected as the `## Deterministic Preflight` section. This gives the LLM hard facts to reason over rather than discovering them itself. Do not move this call after the LLM invocation.
+### Risk classifier determines fan-out counts — do not hardcode them
+`deliberate.ts` reads `risk.council_mode` from `classifyRisk()` to set advisor and reviewer counts. Do not hardcode `advisorCount` or `reviewerCount` defaults inside `deliberate.ts` — the risk classifier owns these defaults.
 ### Query logging is best-effort
 `oracle/log.ts` writes to a JSONL file. The `query()` function wraps this in a try/catch that swallows errors silently. This is correct behaviour — a log write failure must never fail a query.
@@ -47,14 +56,27 @@ The pipeline order is fixed: `frameQuestion → fanOutAdvisors → fanOutReviewe
 Anonymisation of advisor responses happens inside `fanOutReviewers()` before any reviewer sees them. It must stay there.
+The risk classifier runs at the start of `deliberate()` before any LLM calls. It sets advisor/reviewer counts and is logged in the Chronicle proposal's `scope` field. Do not move it.
+---
+## When modifying jury/preflight.ts
+`SENSITIVE_PATTERNS` and the risk rules in `council/risk.ts` are separate but related. Preflight detects patterns for the Jury prompt; the risk classifier uses its own pattern set to determine Council mode. They are intentionally independent — changing one does not update the other. Keep them in sync when adding new sensitive area categories.
+The eval suite in `evals/cases/` has `preflight_expects` and `risk_level` assertions. When changing patterns, run `npx vitest run evals/` to verify existing cases still pass.
 ---
 ## Safe to change
 - `council/personas.ts` — add or adjust personas freely
+- `jury/preflight.ts` `SENSITIVE_PATTERNS` — extend with new categories; run evals after
+- `council/risk.ts` `RISK_RULES` — add new risk patterns; run evals after
 - `models` defaults in `setup.ts` — adjust model names as providers evolve
 - BM25 constants (`K1`, `B`) in `oracle/bm25.ts` — tunable, well-commented
 - `CANDIDATE_MULTIPLIER` and `RRF_K` in `oracle/query.ts` — tunable retrieval parameters
+- `evals/cases/` — add new eval cases freely; they run in CI automatically
 ## Do not change without strong reason
@@ -62,3 +84,4 @@ Anonymisation of advisor responses happens inside `fanOutReviewers()` before any
 - The `ChronicleEntry` type in `shared/types.ts` — changing it breaks stored data
 - The Zod schemas in `jury/schema.ts` and `council/chairman.ts` — these are the output contracts
 - The `OracleClient` interface in `shared/types.ts` — Jury and Council depend on it
+- The confidence recomputation in `jury/evaluate.ts` — it makes confidence calibrated and deterministic

package/modules/README.md CHANGED Viewed

@@ -34,6 +34,42 @@ Chronicle is the data that underpins the system. It is not a module — it lives
 Every entry goes through `oracle.propose()` → human approval → `oracle.commit()`. There are no auto-commits.
+### Chronicle entry schema (v2)
+```typescript
+type ChronicleEntry = {
+  // Always present (v1 + v2)
+  id: string
+  key_insight: string        // v1: primary text; v2: copy of decision for compat
+  affected_areas: string[]   // file paths — used by Sentinel for coverage matching
+  status: "validated" | "refuted" | "open"
+  confidence: number         // 0–1
+  source_module: string
+  evidence_cited: string[]
+  timestamp: string
+  // v2 fields (optional — absent on legacy entries)
+  schema_version?: 2
+  topic?: string                    // short label: "auth/session strategy"
+  decision?: string                 // the decision — primary text in v2
+  scope?: string[]                  // domain tags: ["auth", "sessions"] — additive
+  alternatives_considered?: string[]
+  rejected_reason?: string[]
+  supersedes?: string | null        // ID of the entry this replaces
+  superseded_by?: string | null     // ID of the entry that replaced this
+  // Outcome tracking fields (optional — filled in post-execution)
+  outcome?: string                  // what actually happened when acted on
+  validation_plan?: string[]        // steps that confirm the decision was correct
+  review_after?: string             // ISO date to re-evaluate for drift
+  post_merge_result?: "successful" | "bug" | "partial" | "rolled-back"
+}
+```
+Use `entryText(entry)` from `shared/types` whenever you need to read the primary text — it returns `entry.decision ?? entry.key_insight` and works across both schema versions.
+New entries created by Council automatically include `decision`, `topic`, `alternatives_considered`, `rejected_reason`, and `scope` (from the risk classifier) from the deliberation output.
 ---
 ## Dependencies
@@ -155,23 +191,127 @@ const anthropicProvider: LLMProvider = async (messages, model = "claude-3-5-sonn
 ---
-## Output routing
+## Jury output
+```typescript
+interface JuryOutput {
+  confidence: number              // exact average of the four breakdown scores
+  confidence_breakdown: {
+    evidence_support: number      // do validated entries confirm this approach?
+    feasibility: number           // is this achievable given what Chronicle knows?
+    risk: number                  // how well does the design address failure modes?
+    completeness: number          // does it cover the full outcome?
+  }
+  assessment: string
+  gaps: string[]                  // all missing evidence
+  blocking_gaps: string[]         // subset of gaps that are hard blockers
+  council_brief: "challenge" | "pressure-test"
+  recommendation: "proceed" | "investigate-more" | "redesign"
+}
+```
+`confidence` is always recomputed from the breakdown average — the LLM's stated value is discarded. `council_brief` is derived from `confidence` (< 0.6 → challenge, ≥ 0.6 → pressure-test).
+### Preflight (no LLM)
+Before the LLM runs, Jury executes a deterministic preflight:
+```typescript
+import { runPreflight } from "./modules/jury"
+const preflight = runPreflight(outcome, design, evidence)
+// preflight.touches_sensitive_area
+// preflight.sensitive_areas      — ["auth", "database", ...]
+// preflight.rollback_mentioned
+// preflight.test_strategy_mentioned
+// preflight.chronicle_conflicts  — refuted entry IDs that overlap with the design
+```
+Results are injected into the Jury prompt as hard facts. Auth, database migrations, crypto, payments, PII, and secrets are the detected sensitive areas.
-### Jury
+### Jury output routing
 | `recommendation` | Next step |
 |---|---|
 | `proceed` | Pass to Council |
-| `investigate-more` | Return to Detective with `gaps` |
+| `investigate-more` | Return to Detective with `blocking_gaps` |
 | `redesign` | Return to Designer |
-### Council
+---
+## Council output
+```typescript
+interface CouncilOutput {
+  satisfied: boolean
+  verdict: string
+  blockers: Array<{              // must be resolved before proceeding
+    issue: string
+    evidence: string[]           // Oracle entry IDs that evidence this blocker
+    required_fix: string
+  }>
+  warnings: Array<{              // should be addressed, does not block
+    issue: string
+    suggested_fix?: string
+  }>
+  challenges: string[]           // flat list of all issues — backwards compatible
+  evidence_cited: string[]
+  citation_validation: {
+    valid_ids: string[]          // cited IDs that were in the evidence pack
+    hallucinated_ids: string[]   // cited IDs that were NOT — hallucinated
+  }
+  advisor_split: {               // how advisors split on recommendation
+    proceed: number
+    redesign: number
+    "investigate-more": number
+  }
+  recommendation: "proceed" | "redesign" | "investigate-more"
+}
+```
+Only `citation_validation.valid_ids` are written to the Chronicle proposal — hallucinated IDs are stripped automatically.
+### Risk classifier (no LLM)
+Before running the panel, Council classifies risk and scales fan-out accordingly:
+```typescript
+import { classifyRisk } from "./modules/council"
+const risk = classifyRisk(outcome, design, evidence)
+// risk.level          — "low" | "medium" | "high" | "critical"
+// risk.reasons        — ["authentication or authorisation logic", ...]
+// risk.council_mode   — "jury-only" | "lite" | "full"
+```
+| Risk | Triggers | Advisor + Reviewer count |
+|---|---|---|
+| Low | Nothing sensitive detected | 1 + 1 |
+| Medium | Cache, queues, deployments, rate limiting | 1 + 2 |
+| High | DB migrations, permissions, PII, secrets | 5 + 5 |
+| Critical | Auth, payments, crypto, data deletion | 5 + 5 |
+Refuted entries in the evidence pack always elevate risk by at least one level.
+### Council output routing
 | `satisfied` | `recommendation` | Next step |
 |---|---|---|
 | `true` | `proceed` | Human gate → Executor |
-| `false` | `redesign` | Return to Designer with `verdict` |
-| `false` | `investigate-more` | Return to Detective with `juryOutput.gaps` |
+| `false` | `redesign` | Return to Designer with `blockers` |
+| `false` | `investigate-more` | Return to Detective with `juryOutput.blocking_gaps` |
+---
+## Eval suite
+`evals/` contains canonical test cases — known-bad proposals that should block and known-good ones that should pass. Deterministic assertions run on every CI pass:
+```bash
+npx vitest run evals/
+```
+Each case defines the proposal, expected risk level, expected preflight signals, and (optionally) expected Council recommendation for LLM-gated assertions. See `evals/cases/` for the full set and `evals/runner.ts` for the runner API.
 ---
@@ -234,7 +374,14 @@ describe("sentinel", () => { assertions.forEach(a => a()) })
 Tests use [Vitest](https://vitest.dev/). Add to your project's test config or run directly:
 ```bash
+# Module unit tests
 npx vitest run modules/
+# Eval suite (deterministic assertions — no LLM required)
+npx vitest run evals/
+# Eval suite with LLM-gated assertions (jury confidence + council recommendation)
+EVAL_LLM=1 OPENAI_API_KEY=sk-... npx vitest run evals/
 ```
 ---

package/modules/council/chairman.ts CHANGED Viewed

@@ -3,13 +3,32 @@ import type { LLMProvider, OracleResult } from "../shared/types"
 import { entryText } from "../shared/types"
 import type { AdvisorResponse } from "./advisors"
 import type { ReviewerResponse } from "./reviewers"
-import type { CouncilOutput } from "./types"
+import type { CouncilOutput, CitationValidation } from "./types"
+// Primitive building blocks reused by the sub-schemas below.
+const nonEmptyString = z.string().min(1)
+const nonNegativeInt = z.number().int().min(0)
+/** A finding that must be resolved before the design can proceed. */
+const BlockerSchema = z.object({
+  issue: nonEmptyString,
+  // Oracle entry IDs cited as evidence for this blocker; may be empty.
+  evidence: z.array(z.string()),
+  required_fix: nonEmptyString,
+})
+/** A finding that should be addressed but does not block; the fix is optional. */
+const WarningSchema = z.object({
+  issue: nonEmptyString,
+  suggested_fix: z.string().optional(),
+})
+/** Count of advisors per recommendation option. */
+const AdvisorSplitSchema = z.object({
+  proceed: nonNegativeInt,
+  redesign: nonNegativeInt,
+  "investigate-more": nonNegativeInt,
+})
 const ChairmanOutputSchema = z.object({
   satisfied: z.boolean(),
   verdict: z.string().min(1),
-  challenges: z.array(z.string()),
+  blockers: z.array(BlockerSchema),
+  warnings: z.array(WarningSchema),
   evidence_cited: z.array(z.string()),
+  advisor_split: AdvisorSplitSchema,
   recommendation: z.enum(["proceed", "redesign", "investigate-more"]),
 })
@@ -35,31 +54,67 @@ function formatEvidence(evidence: OracleResult[]): string {
     .join("\n")
 }
+/**
+ * Split the cited IDs into those present in the evidence pack and those that are not.
+ * An ID absent from `evidence` is reported as hallucinated. Input order and
+ * duplicates are preserved in both output lists.
+ */
+function validateCitations(
+  citedIds: string[],
+  evidence: OracleResult[],
+): CitationValidation {
+  const knownIds = new Set(evidence.map(entry => entry.id))
+  const valid_ids = citedIds.filter(id => knownIds.has(id))
+  const hallucinated_ids = citedIds.filter(id => !knownIds.has(id))
+  return { valid_ids, hallucinated_ids }
+}
 const CHAIRMAN_SYSTEM_PROMPT = [
   "You are the Council Chairman. You synthesise the final verdict from all advisor and reviewer inputs.",
   "",
-  "Your verdict must:",
-  "1. Be grounded in Oracle evidence — cite specific entry IDs for every material conclusion",
-  "2. Summarise what was challenged and what held up under scrutiny",
-  "3. State a clear recommendation",
-  "4. List every Oracle entry ID that materially influenced the verdict in evidence_cited",
+  "Your output must classify findings by severity:",
+  "  blockers — issues that MUST be resolved before the design can proceed",
+  "    (e.g. no rollback plan for a destructive migration, repeated a documented failure mode)",
+  "  warnings — issues that SHOULD be addressed but do not block execution",
+  "    (e.g. no test coverage for an edge case, a preferred pattern not followed)",
+  "",
+  "For each blocker, cite the Oracle entry IDs that evidence it and state the required fix precisely.",
+  "For each warning, a suggested_fix is optional but preferred.",
+  "",
+  "advisor_split: count how many advisors recommended each option from their responses.",
+  "  High split (no clear majority) is a signal of genuine uncertainty — reflect this in your verdict.",
   "",
-  "satisfied = true  → design holds up, can proceed to the human gate",
-  "satisfied = false → fundamental flaw, unresolved gap, or design needs rework",
+  "satisfied = true  → no blockers, design can proceed to the human gate",
+  "satisfied = false → at least one blocker exists, or the design needs rework",
+  "",
+  "evidence_cited: list every Oracle entry ID that materially influenced the verdict.",
+  "  Only cite IDs that appeared in the Oracle Evidence section below.",
+  "  Do not cite IDs from memory or general knowledge.",
   "",
   "Return ONLY valid JSON — no markdown fences, no explanation:",
   JSON.stringify({
     satisfied: "<boolean>",
-    verdict: "<string ≤400 words — clear synthesis>",
-    challenges: ["<string — each challenge raised>"],
-    evidence_cited: ["<Oracle entry ID>"],
+    verdict: "<string ≤400 words — clear synthesis citing entry IDs>",
+    blockers: [{ issue: "<string>", evidence: ["<Oracle entry ID>"], required_fix: "<string>" }],
+    warnings: [{ issue: "<string>", suggested_fix: "<string — optional>" }],
+    evidence_cited: ["<Oracle entry ID — only IDs present in the evidence pack>"],
+    advisor_split: { proceed: "<int>", redesign: "<int>", "investigate-more": "<int>" },
     recommendation: "proceed | redesign | investigate-more",
   }),
 ].join("\n")
 /**
  * Chairman synthesises the verdict from all advisor and reviewer inputs.
- * Every material conclusion must cite specific Oracle entry IDs.
+ * Classifies findings into blockers and warnings, validates citations,
+ * and tracks advisor split to surface genuine disagreement.
  *
  * Throws if the LLM returns non-JSON or output fails schema validation.
  */
@@ -109,5 +164,20 @@ export async function chairman(
     )
   }
-  return result.data
+  const data = result.data
+  // Validate citations — flag any IDs cited that weren't in the evidence pack
+  const citation_validation = validateCitations(data.evidence_cited, evidence)
+  // Derive flat challenges array for backwards compatibility
+  const challenges = [
+    ...data.blockers.map(b => `[BLOCKER] ${b.issue}`),
+    ...data.warnings.map(w => w.issue),
+  ]
+  return {
+    ...data,
+    challenges,
+    citation_validation,
+  }
 }