@balpal4495/quorum 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. package/README.md +223 -11
  2. package/SETUP.md +30 -0
  3. package/bin/commands/check.js +122 -0
  4. package/bin/commands/commit.js +210 -0
  5. package/bin/commands/init.js +236 -0
  6. package/bin/commands/sentinel.js +160 -0
  7. package/bin/commands/status.js +117 -0
  8. package/bin/quorum.js +103 -0
  9. package/bin/shared/chronicle.js +129 -0
  10. package/bin/shared/colors.js +22 -0
  11. package/bin/shared/patterns.js +83 -0
  12. package/evals/__tests__/eval.test.ts +31 -0
  13. package/evals/cases/auth_hs256_rejected.json +46 -0
  14. package/evals/cases/auth_rs256_valid.json +30 -0
  15. package/evals/cases/cache_missing_lock.json +31 -0
  16. package/evals/cases/db_naive_not_null.json +32 -0
  17. package/evals/cases/logging_pii_leak.json +32 -0
  18. package/evals/cases/migration_with_rollback.json +43 -0
  19. package/evals/cases/no_evidence_novel_design.json +16 -0
  20. package/evals/cases/payment_no_idempotency.json +33 -0
  21. package/evals/cases/redis_session_rejected.json +32 -0
  22. package/evals/cases/safe_refactor.json +17 -0
  23. package/evals/runner.ts +226 -0
  24. package/modules/AGENTS.md +9 -5
  25. package/modules/CLAUDE.md +25 -2
  26. package/modules/README.md +153 -6
  27. package/modules/council/chairman.ts +84 -14
  28. package/modules/council/deliberate.ts +24 -4
  29. package/modules/council/index.ts +6 -1
  30. package/modules/council/risk.ts +89 -0
  31. package/modules/council/types.ts +63 -1
  32. package/modules/jury/evaluate.ts +32 -8
  33. package/modules/jury/index.ts +3 -1
  34. package/modules/jury/preflight.ts +101 -0
  35. package/modules/jury/schema.ts +9 -0
  36. package/modules/jury/types.ts +20 -1
  37. package/modules/shared/types.ts +8 -0
  38. package/package.json +3 -3
@@ -0,0 +1,226 @@
1
+ /**
2
+ * Eval runner for Quorum Jury + Council.
3
+ *
4
+ * Each case in evals/cases/ defines a proposal and what the system should produce.
5
+ * The runner validates:
6
+ * - Jury confidence is within expected bounds
7
+ * - Preflight detects the expected signals
8
+ * - Risk classifier assigns the expected level
9
+ * - Council recommendation matches (when an LLM provider is available)
10
+ *
11
+ * Jury + preflight run without any LLM (deterministic).
12
+ * Council assertions are skipped if no LLM provider is injected.
13
+ *
14
+ * Usage:
15
+ * npx vitest run evals/
16
+ *
17
+ * Or run against a real LLM:
18
+ * EVAL_LLM=openai npx vitest run evals/
19
+ */
20
+
21
+ import { promises as fs } from "fs"
22
+ import path from "path"
23
+ import type { OracleResult, LLMProvider } from "../modules/shared/types"
24
+ import { runPreflight } from "../modules/jury/preflight"
25
+ import { classifyRisk } from "../modules/council/risk"
26
+
27
/** Council recommendation values an eval case may assert on. */
type ExpectedRecommendation = "proceed" | "redesign" | "investigate-more"

/**
 * Expected deterministic preflight signals. Every field is optional;
 * an omitted field is simply not asserted.
 */
interface PreflightExpectations {
  /** Expected value of `preflight.touches_sensitive_area`. */
  touches_sensitive_area?: boolean
  /** Areas that must each appear in `preflight.sensitive_areas`. */
  sensitive_areas_include?: string[]
  /** Expected value of `preflight.rollback_mentioned`. */
  rollback_mentioned?: boolean
  /** Expected value of `preflight.test_strategy_mentioned`. */
  test_strategy_mentioned?: boolean
  /** Entry IDs that must each appear in `preflight.chronicle_conflicts`. */
  chronicle_conflicts?: string[]
}

/** Everything a case can assert about the pipeline's output. */
interface EvalExpectations {
  /** Inclusive lower bound on Jury confidence (LLM runs only). */
  jury_min_confidence?: number
  /** Inclusive upper bound on Jury confidence (LLM runs only). */
  jury_max_confidence?: number
  /** Exact Council recommendation; also gates the Council run itself. */
  council_recommendation?: ExpectedRecommendation
  /** Terms that must occur (case-insensitively) in the Council verdict or a blocker issue. */
  must_flag?: string[]
  /** Terms that must NOT occur (case-insensitively) in the Council verdict or a blocker issue. */
  must_not_flag?: string[]
  /** Entry IDs that must be present in the Council's `evidence_cited`. */
  must_cite?: string[]
  /** Exact level the risk classifier must assign. */
  risk_level?: string
  /** Deterministic preflight assertions. */
  preflight_expects?: PreflightExpectations
}

/**
 * One eval case as stored in `evals/cases/*.json`: a proposal
 * (outcome + design + evidence pack) and the results it should produce.
 */
export interface EvalCase {
  /** Case identifier, echoed as `caseId` in the result. */
  id: string
  /** Human-readable summary of what the case exercises. */
  description: string
  /** Outcome text handed to preflight, risk classifier, Jury and Council. */
  outcome: string
  /** Design text handed to preflight, risk classifier, Jury and Council. */
  design: string
  /** Evidence pack passed alongside the proposal. */
  oracle_evidence: OracleResult[]
  /** Assertions to run against the produced output. */
  expected: EvalExpectations
}
50
+
51
/** Report produced by the deterministic preflight. */
type PreflightReport = ReturnType<typeof runPreflight>

/** Classification produced by the deterministic risk classifier. */
type RiskReport = ReturnType<typeof classifyRisk>

/**
 * Outcome of running a single eval case through `runCase`.
 */
export interface EvalResult {
  /** `id` of the case that was run. */
  caseId: string
  /** `description` of the case that was run. */
  description: string
  /** True exactly when `failures` is empty. */
  passed: boolean
  /** One message per failed assertion, in the order they were checked. */
  failures: string[]
  /** Preflight report computed for the case. */
  preflight: PreflightReport
  /** Risk classification computed for the case. */
  risk: RiskReport
  /** Raw Jury output; left unset when no LLM provider was supplied or the Jury threw. */
  juryOutput?: unknown
  /** Raw Council output; left unset when the Council was not run or threw. */
  councilOutput?: unknown
  /** Wall-clock time spent in `runCase`, in milliseconds. */
  durationMs: number
}
62
+
63
/**
 * Load every eval case (`*.json` file) from a directory.
 *
 * Filenames are sorted before reading so the returned order — and therefore
 * the order of any printed results — is reproducible regardless of the order
 * in which the filesystem lists directory entries.
 *
 * The parsed JSON is cast to `EvalCase` without structural validation; a
 * malformed-but-valid-JSON case surfaces later, when its fields are used.
 *
 * @param casesDir Directory to scan. Defaults to `cases/` next to this module.
 * @returns The parsed cases, one per JSON file, in filename order.
 * @throws If the directory or a file cannot be read, or if a file is not
 *         valid JSON (the error message names the offending file).
 */
export async function loadCases(casesDir?: string): Promise<EvalCase[]> {
  const dir = casesDir ?? path.join(__dirname, "cases")
  const names = (await fs.readdir(dir))
    .filter(name => name.endsWith(".json"))
    .sort()

  return Promise.all(
    names.map(async (name): Promise<EvalCase> => {
      const file = path.join(dir, name)
      const raw = await fs.readFile(file, "utf8")
      try {
        return JSON.parse(raw) as EvalCase
      } catch (err) {
        // A bare SyntaxError gives no hint which of the case files is broken.
        const reason = err instanceof Error ? err.message : String(err)
        throw new Error(`Failed to parse eval case ${file}: ${reason}`)
      }
    }),
  )
}
74
+
75
/**
 * Run one eval case and collect every failed assertion.
 *
 * The preflight and risk classifier always run and need no LLM. The Jury is
 * evaluated only when `llm` is supplied, and the Council only when, in
 * addition, the case sets `expected.council_recommendation` and the Jury
 * produced output. The `must_flag`, `must_not_flag` and `must_cite`
 * assertions are part of the Council step, so they are checked only when it runs.
 *
 * Errors thrown by the Jury or Council are recorded as failures rather than
 * propagated; a failure to import either module does propagate.
 *
 * @param evalCase The case to run.
 * @param llm Optional LLM provider; when omitted, only deterministic checks run.
 * @returns The case result, including raw outputs and elapsed time.
 */
export async function runCase(
  evalCase: EvalCase,
  llm?: LLMProvider,
): Promise<EvalResult> {
  const startedAt = Date.now()
  const failures: string[] = []

  const expected = evalCase.expected
  const outcome = evalCase.outcome
  const design = evalCase.design
  const evidence = evalCase.oracle_evidence

  // Deterministic stage: no LLM involved.
  const preflight = runPreflight(outcome, design, evidence)
  const risk = classifyRisk(outcome, design, evidence)

  const wantedLevel = expected.risk_level
  if (wantedLevel && wantedLevel !== risk.level) {
    failures.push(
      `risk_level: expected "${wantedLevel}", got "${risk.level}" (reasons: ${risk.reasons.join(", ")})`,
    )
  }

  const wantedPreflight = expected.preflight_expects
  if (wantedPreflight) {
    // Boolean signals are compared only when the case states an expectation.
    const booleanSignals = [
      "touches_sensitive_area",
      "rollback_mentioned",
      "test_strategy_mentioned",
    ] as const
    for (const signal of booleanSignals) {
      const want = wantedPreflight[signal]
      if (want === undefined) continue
      const got = preflight[signal]
      if (got !== want) {
        failures.push(`preflight.${signal}: expected ${want}, got ${got}`)
      }
    }

    for (const id of wantedPreflight.chronicle_conflicts ?? []) {
      if (preflight.chronicle_conflicts.includes(id)) continue
      failures.push(`preflight.chronicle_conflicts: expected "${id}" to be flagged`)
    }

    for (const area of wantedPreflight.sensitive_areas_include ?? []) {
      if (preflight.sensitive_areas.includes(area)) continue
      failures.push(`preflight.sensitive_areas: expected "${area}" to be detected`)
    }
  }

  let juryOutput: unknown
  let councilOutput: unknown

  // LLM stage: skipped entirely without a provider.
  if (llm) {
    const { evaluate } = await import("../modules/jury/evaluate")
    try {
      juryOutput = await evaluate({ outcome, design, evidence }, { llm })
      const jury = juryOutput as { confidence: number; recommendation: string; assessment: string; gaps: string[] }

      const floor = expected.jury_min_confidence
      if (floor !== undefined && jury.confidence < floor) {
        failures.push(`jury.confidence: expected ≥ ${floor}, got ${jury.confidence}`)
      }
      const ceiling = expected.jury_max_confidence
      if (ceiling !== undefined && jury.confidence > ceiling) {
        failures.push(`jury.confidence: expected ≤ ${ceiling}, got ${jury.confidence}`)
      }
    } catch (err) {
      failures.push(`jury threw: ${String(err)}`)
    }

    if (expected.council_recommendation && juryOutput) {
      const { deliberate } = await import("../modules/council/deliberate")

      // Stand-in Oracle: nothing to query, proposals get a fixed ID, commits are refused.
      const stubOracle = {
        query: async () => [],
        propose: async () => ({ proposalId: "eval-proposal" }),
        commit: async () => { throw new Error("commit not available in eval") },
      }

      try {
        councilOutput = await deliberate(
          { outcome, design, evidence, jury_output: juryOutput as never },
          { llm, oracle: stubOracle, advisorCount: 2, reviewerCount: 2 },
        )
        const council = councilOutput as {
          recommendation: string
          verdict: string
          blockers: Array<{ issue: string }>
          evidence_cited: string[]
        }

        if (council.recommendation !== expected.council_recommendation) {
          failures.push(
            `council.recommendation: expected "${expected.council_recommendation}", got "${council.recommendation}"`,
          )
        }

        // Term checks search the verdict plus every blocker issue, case-insensitively.
        const searchable = [council.verdict]
          .concat(council.blockers.map(blocker => blocker.issue))
          .join(" ")
          .toLowerCase()

        for (const term of expected.must_flag ?? []) {
          if (searchable.includes(term.toLowerCase())) continue
          failures.push(`council must_flag: "${term}" not mentioned in verdict or blockers`)
        }

        for (const term of expected.must_not_flag ?? []) {
          if (!searchable.includes(term.toLowerCase())) continue
          failures.push(`council must_not_flag: "${term}" was mentioned but should not be`)
        }

        for (const id of expected.must_cite ?? []) {
          if (council.evidence_cited.includes(id)) continue
          failures.push(`council must_cite: entry ID "${id}" not in evidence_cited`)
        }
      } catch (err) {
        failures.push(`council threw: ${String(err)}`)
      }
    }
  }

  const result: EvalResult = {
    caseId: evalCase.id,
    description: evalCase.description,
    passed: failures.length === 0,
    failures,
    preflight,
    risk,
    juryOutput,
    councilOutput,
    durationMs: Date.now() - startedAt,
  }
  return result
}
209
+
210
/**
 * Print a pass/fail summary of eval results to stdout.
 *
 * Output is a ruled header with the pass count, then one line per case
 * (marker, case ID, duration), each failed case followed by its failure
 * messages, and a closing rule.
 *
 * @param results Results to summarise, printed in the given order.
 */
export function printEvalSummary(results: EvalResult[]): void {
  const rule = "─".repeat(60)
  const passCount = results.reduce((count, result) => (result.passed ? count + 1 : count), 0)

  console.log(`\n${rule}`)
  console.log(`Eval results: ${passCount}/${results.length} passed`)
  console.log(rule)

  results.forEach(result => {
    const marker = result.passed ? "✓" : "✗"
    console.log(`${marker} ${result.caseId} (${result.durationMs}ms)`)
    if (result.passed) return
    result.failures.forEach(failure => {
      console.log(` → ${failure}`)
    })
  })

  console.log(rule)
}
package/modules/AGENTS.md CHANGED
@@ -20,8 +20,9 @@ When working inside this folder, follow these rules in addition to the root guid
20
20
  ### Jury
21
21
  | File | Owns |
22
22
  |---|---|
23
- | `jury/schema.ts` | Zod schema for structured LLM output. Source of truth for `JuryOutput` shape. |
24
- | `jury/evaluate.ts` | Four-dimension evaluation. **`council_brief` is always overridden from confidence here — do not remove this enforcement.** |
23
+ | `jury/schema.ts` | Zod schema for structured LLM output. Source of truth for `JuryOutput` shape including `confidence_breakdown` and `blocking_gaps`. |
24
+ | `jury/evaluate.ts` | Four-dimension evaluation. **Confidence is always recomputed from the breakdown average here — do not remove this. `council_brief` is also overridden from confidence.** |
25
+ | `jury/preflight.ts` | Deterministic preflight — no LLM. Detects sensitive areas, rollback mention, and Chronicle conflicts before the LLM runs. Safe to extend with new patterns. |
25
26
 
26
27
  ### Council
27
28
  | File | Owns |
@@ -30,8 +31,9 @@ When working inside this folder, follow these rules in addition to the root guid
30
31
  | `council/frame.ts` | Sets deliberation tone from `council_brief`. Challenge vs pressure-test framing lives here. |
31
32
  | `council/advisors.ts` | Parallel advisor fan-out. Advisors must cite Oracle entry IDs — enforced in the prompt. |
32
33
  | `council/reviewers.ts` | Anonymisation of advisor responses + parallel reviewer fan-out. Anonymisation must happen before reviewers see responses. |
33
- | `council/chairman.ts` | Verdict synthesis + Zod validation. Throws on bad output — do not add fallbacks. |
34
- | `council/deliberate.ts` | Full pipeline orchestration. Calls `oracle.propose()` at the end — never `oracle.commit()`. |
34
+ | `council/chairman.ts` | Verdict synthesis + Zod validation. Produces structured `blockers`/`warnings`, validates citations, tracks `advisor_split`. Throws on bad output — do not add fallbacks. |
35
+ | `council/risk.ts` | Deterministic risk classifier — no LLM. Assigns `low/medium/high/critical` and `council_mode` from design text and refuted evidence. Drives advisor/reviewer fan-out counts. |
36
+ | `council/deliberate.ts` | Full pipeline orchestration. Calls `oracle.propose()` at the end — never `oracle.commit()`. Risk classifier runs first to set fan-out counts. |
35
37
 
36
38
  ---
37
39
 
@@ -50,8 +52,10 @@ When working inside this folder, follow these rules in addition to the root guid
50
52
  ## Invariants — do not break these
51
53
 
52
54
  - `oracle.commit()` is never called without explicit human input. `deliberate()` calls `propose()` only.
53
- - `jury/evaluate.ts` always computes `council_brief` from `confidence` after parsing — never trusts the LLM value.
55
+ - `jury/evaluate.ts` recomputes `confidence` as the exact average of `confidence_breakdown` dimensions — the LLM value is discarded.
56
+ - `jury/evaluate.ts` derives `council_brief` from the recomputed confidence — never trusts the LLM value.
54
57
  - `chairman.ts` and `jury/evaluate.ts` throw on schema validation failure. Do not add try/catch that swallows these errors.
58
+ - `deliberate.ts` passes `citation_validation.valid_ids` (not raw `evidence_cited`) to `oracle.propose()` — hallucinated IDs are stripped.
55
59
  - Query logging in `oracle/log.ts` is always best-effort — callers must not fail because of a log write error.
56
60
  - `VectorStore` and `embedder` are always injected — never imported directly inside Oracle logic.
57
61
 
package/modules/CLAUDE.md CHANGED
@@ -17,8 +17,8 @@ The entry point for a host application is `setup.ts`. Everything else is interna
17
17
  ### Dependency injection throughout
18
18
  No module imports a specific LLM provider, vector store, or embedder. All external dependencies are passed in as function arguments or via a deps object. If you add a new capability, follow this pattern — do not hardcode providers.
19
19
 
20
- ### council_brief is computed, not trusted
21
- In `jury/evaluate.ts`, the `council_brief` field in the LLM response is **always overridden** based on the numeric `confidence` value after parsing. The LLM is not trusted to compute this correctly. Do not remove this override.
20
+ ### Confidence is recomputed from the breakdown — never trusted from the LLM
21
+ In `jury/evaluate.ts`, after parsing the LLM response, `confidence` is recomputed as the exact average of the four `confidence_breakdown` dimensions. The LLM's stated `confidence` value is discarded. `council_brief` is then derived from this recomputed value. Do not remove either override.
22
22
 
23
23
  ### Throw on bad LLM output — never default to passing
24
24
  Both `jury/evaluate.ts` and `council/chairman.ts` throw if the LLM returns non-JSON or output that fails Zod validation. This is intentional. A silently passing Jury score is worse than an error. Do not add fallbacks or defaults.
@@ -26,6 +26,15 @@ Both `jury/evaluate.ts` and `council/chairman.ts` throw if the LLM returns non-J
26
26
  ### oracle.commit() is a human gate
27
27
  `council/deliberate.ts` calls `oracle.propose()` at the end of every deliberation. It never calls `oracle.commit()`. If you see a code path that calls `oracle.commit()` without explicit human input, that is a bug.
28
28
 
29
+ ### Oracle proposals use only validated citation IDs
30
+ `deliberate.ts` passes `verdict.citation_validation.valid_ids` as `evidence_cited` when calling `oracle.propose()` — not the raw `evidence_cited` array from the Chairman. Hallucinated IDs (cited but not in the evidence pack) are stripped before the proposal is written.
31
+
32
+ ### Preflight runs before every Jury LLM call — do not remove it
33
+ `jury/evaluate.ts` calls `runPreflight()` before building the user prompt. The preflight result is injected as the `## Deterministic Preflight` section. This gives the LLM hard facts to reason over rather than discovering them itself. Do not move this call after the LLM invocation.
34
+
35
+ ### Risk classifier determines fan-out counts — do not hardcode them
36
+ `deliberate.ts` reads `risk.council_mode` from `classifyRisk()` to set advisor and reviewer counts. Do not hardcode `advisorCount` or `reviewerCount` defaults inside `deliberate.ts` — the risk classifier owns these defaults.
37
+
29
38
  ### Query logging is best-effort
30
39
  `oracle/log.ts` writes to a JSONL file. The `query()` function wraps this in a try/catch that swallows errors silently. This is correct behaviour — a log write failure must never fail a query.
31
40
 
@@ -47,14 +56,27 @@ The pipeline order is fixed: `frameQuestion → fanOutAdvisors → fanOutReviewe
47
56
 
48
57
  Anonymisation of advisor responses happens inside `fanOutReviewers()` before any reviewer sees them. It must stay there.
49
58
 
59
+ The risk classifier runs at the start of `deliberate()` before any LLM calls. It sets advisor/reviewer counts and is logged in the Chronicle proposal's `scope` field. Do not move it.
60
+
61
+ ---
62
+
63
+ ## When modifying jury/preflight.ts
64
+
65
+ `SENSITIVE_PATTERNS` and the risk rules in `council/risk.ts` are separate but related. Preflight detects patterns for the Jury prompt; the risk classifier uses its own pattern set to determine Council mode. They are intentionally independent — changing one does not update the other. Keep them in sync when adding new sensitive area categories.
66
+
67
+ The eval suite in `evals/cases/` has `preflight_expects` and `risk_level` assertions. When changing patterns, run `npx vitest run evals/` to verify existing cases still pass.
68
+
50
69
  ---
51
70
 
52
71
  ## Safe to change
53
72
 
54
73
  - `council/personas.ts` — add or adjust personas freely
74
+ - `jury/preflight.ts` `SENSITIVE_PATTERNS` — extend with new categories; run evals after
75
+ - `council/risk.ts` `RISK_RULES` — add new risk patterns; run evals after
55
76
  - `models` defaults in `setup.ts` — adjust model names as providers evolve
56
77
  - BM25 constants (`K1`, `B`) in `oracle/bm25.ts` — tunable, well-commented
57
78
  - `CANDIDATE_MULTIPLIER` and `RRF_K` in `oracle/query.ts` — tunable retrieval parameters
79
+ - `evals/cases/` — add new eval cases freely; they run in CI automatically
58
80
 
59
81
  ## Do not change without strong reason
60
82
 
@@ -62,3 +84,4 @@ Anonymisation of advisor responses happens inside `fanOutReviewers()` before any
62
84
  - The `ChronicleEntry` type in `shared/types.ts` — changing it breaks stored data
63
85
  - The Zod schemas in `jury/schema.ts` and `council/chairman.ts` — these are the output contracts
64
86
  - The `OracleClient` interface in `shared/types.ts` — Jury and Council depend on it
87
+ - The confidence recomputation in `jury/evaluate.ts` — it makes confidence calibrated and deterministic
package/modules/README.md CHANGED
@@ -34,6 +34,42 @@ Chronicle is the data that underpins the system. It is not a module — it lives
34
34
 
35
35
  Every entry goes through `oracle.propose()` → human approval → `oracle.commit()`. There are no auto-commits.
36
36
 
37
+ ### Chronicle entry schema (v2)
38
+
39
+ ```typescript
40
+ type ChronicleEntry = {
41
+ // Always present (v1 + v2)
42
+ id: string
43
+ key_insight: string // v1: primary text; v2: copy of decision for compat
44
+ affected_areas: string[] // file paths — used by Sentinel for coverage matching
45
+ status: "validated" | "refuted" | "open"
46
+ confidence: number // 0–1
47
+ source_module: string
48
+ evidence_cited: string[]
49
+ timestamp: string
50
+
51
+ // v2 fields (optional — absent on legacy entries)
52
+ schema_version?: 2
53
+ topic?: string // short label: "auth/session strategy"
54
+ decision?: string // the decision — primary text in v2
55
+ scope?: string[] // domain tags: ["auth", "sessions"] — additive
56
+ alternatives_considered?: string[]
57
+ rejected_reason?: string[]
58
+ supersedes?: string | null // ID of the entry this replaces
59
+ superseded_by?: string | null // ID of the entry that replaced this
60
+
61
+ // Outcome tracking fields (optional — filled in post-execution)
62
+ outcome?: string // what actually happened when acted on
63
+ validation_plan?: string[] // steps that confirm the decision was correct
64
+ review_after?: string // ISO date to re-evaluate for drift
65
+ post_merge_result?: "successful" | "bug" | "partial" | "rolled-back"
66
+ }
67
+ ```
68
+
69
+ Use `entryText(entry)` from `shared/types` whenever you need to read the primary text — it returns `entry.decision ?? entry.key_insight` and works across both schema versions.
70
+
71
+ New entries created by Council are populated automatically from the deliberation output: `decision`, `topic`, `alternatives_considered`, and `rejected_reason` come from the verdict, and `scope` comes from the risk classifier.
72
+
37
73
  ---
38
74
 
39
75
  ## Dependencies
@@ -155,23 +191,127 @@ const anthropicProvider: LLMProvider = async (messages, model = "claude-3-5-sonn
155
191
 
156
192
  ---
157
193
 
158
- ## Output routing
194
+ ## Jury output
195
+
196
+ ```typescript
197
+ interface JuryOutput {
198
+ confidence: number // exact average of the four breakdown scores
199
+ confidence_breakdown: {
200
+ evidence_support: number // do validated entries confirm this approach?
201
+ feasibility: number // is this achievable given what Chronicle knows?
202
+ risk: number // how well does the design address failure modes?
203
+ completeness: number // does it cover the full outcome?
204
+ }
205
+ assessment: string
206
+ gaps: string[] // all missing evidence
207
+ blocking_gaps: string[] // subset of gaps that are hard blockers
208
+ council_brief: "challenge" | "pressure-test"
209
+ recommendation: "proceed" | "investigate-more" | "redesign"
210
+ }
211
+ ```
212
+
213
+ `confidence` is always recomputed from the breakdown average — the LLM's stated value is discarded. `council_brief` is derived from `confidence` (< 0.6 → challenge, ≥ 0.6 → pressure-test).
214
+
215
+ ### Preflight (no LLM)
216
+
217
+ Before the LLM runs, Jury executes a deterministic preflight:
218
+
219
+ ```typescript
220
+ import { runPreflight } from "./modules/jury"
221
+
222
+ const preflight = runPreflight(outcome, design, evidence)
223
+ // preflight.touches_sensitive_area
224
+ // preflight.sensitive_areas — ["auth", "database", ...]
225
+ // preflight.rollback_mentioned
226
+ // preflight.test_strategy_mentioned
227
+ // preflight.chronicle_conflicts — refuted entry IDs that overlap with the design
228
+ ```
229
+
230
+ Results are injected into the Jury prompt as hard facts. Auth, database migrations, crypto, payments, PII, and secrets are the detected sensitive areas.
159
231
 
160
- ### Jury
232
+ ### Jury output routing
161
233
 
162
234
  | `recommendation` | Next step |
163
235
  |---|---|
164
236
  | `proceed` | Pass to Council |
165
- | `investigate-more` | Return to Detective with `gaps` |
237
+ | `investigate-more` | Return to Detective with `blocking_gaps` |
166
238
  | `redesign` | Return to Designer |
167
239
 
168
- ### Council
240
+ ---
241
+
242
+ ## Council output
243
+
244
+ ```typescript
245
+ interface CouncilOutput {
246
+ satisfied: boolean
247
+ verdict: string
248
+ blockers: Array<{ // must be resolved before proceeding
249
+ issue: string
250
+ evidence: string[] // Oracle entry IDs that evidence this blocker
251
+ required_fix: string
252
+ }>
253
+ warnings: Array<{ // should be addressed, does not block
254
+ issue: string
255
+ suggested_fix?: string
256
+ }>
257
+ challenges: string[] // flat list of all issues — backwards compatible
258
+ evidence_cited: string[]
259
+ citation_validation: {
260
+ valid_ids: string[] // cited IDs that were in the evidence pack
261
+ hallucinated_ids: string[] // cited IDs that were NOT — hallucinated
262
+ }
263
+ advisor_split: { // how advisors split on recommendation
264
+ proceed: number
265
+ redesign: number
266
+ "investigate-more": number
267
+ }
268
+ recommendation: "proceed" | "redesign" | "investigate-more"
269
+ }
270
+ ```
271
+
272
+ Only `citation_validation.valid_ids` are written to the Chronicle proposal — hallucinated IDs are stripped automatically.
273
+
274
+ ### Risk classifier (no LLM)
275
+
276
+ Before running the panel, Council classifies risk and scales fan-out accordingly:
277
+
278
+ ```typescript
279
+ import { classifyRisk } from "./modules/council"
280
+
281
+ const risk = classifyRisk(outcome, design, evidence)
282
+ // risk.level — "low" | "medium" | "high" | "critical"
283
+ // risk.reasons — ["authentication or authorisation logic", ...]
284
+ // risk.council_mode — "jury-only" | "lite" | "full"
285
+ ```
286
+
287
+ | Risk | Triggers | Advisor + Reviewer count |
288
+ |---|---|---|
289
+ | Low | Nothing sensitive detected | 1 + 1 |
290
+ | Medium | Cache, queues, deployments, rate limiting | 1 + 2 |
291
+ | High | DB migrations, permissions, PII, secrets | 5 + 5 |
292
+ | Critical | Auth, payments, crypto, data deletion | 5 + 5 |
293
+
294
+ Refuted entries in the evidence pack always elevate risk by at least one level.
295
+
296
+ ### Council output routing
169
297
 
170
298
  | `satisfied` | `recommendation` | Next step |
171
299
  |---|---|---|
172
300
  | `true` | `proceed` | Human gate → Executor |
173
- | `false` | `redesign` | Return to Designer with `verdict` |
174
- | `false` | `investigate-more` | Return to Detective with `juryOutput.gaps` |
301
+ | `false` | `redesign` | Return to Designer with `blockers` |
302
+ | `false` | `investigate-more` | Return to Detective with `juryOutput.blocking_gaps` |
303
+
304
+ ---
305
+
306
+ ## Eval suite
307
+
308
+ `evals/` contains canonical test cases — known-bad proposals that should block and known-good ones that should pass. Deterministic assertions run on every CI pass:
309
+
310
+ ```bash
311
+ npx vitest run evals/
312
+ ```
313
+
314
+ Each case defines the proposal, expected risk level, expected preflight signals, and (optionally) expected Council recommendation for LLM-gated assertions. See `evals/cases/` for the full set and `evals/runner.ts` for the runner API.
175
315
 
176
316
  ---
177
317
 
@@ -234,7 +374,14 @@ describe("sentinel", () => { assertions.forEach(a => a()) })
234
374
  Tests use [Vitest](https://vitest.dev/). Add to your project's test config or run directly:
235
375
 
236
376
  ```bash
377
+ # Module unit tests
237
378
  npx vitest run modules/
379
+
380
+ # Eval suite (deterministic assertions — no LLM required)
381
+ npx vitest run evals/
382
+
383
+ # Eval suite with LLM-gated assertions (jury confidence + council recommendation)
384
+ EVAL_LLM=1 OPENAI_API_KEY=sk-... npx vitest run evals/
238
385
  ```
239
386
 
240
387
  ---
@@ -3,13 +3,32 @@ import type { LLMProvider, OracleResult } from "../shared/types"
3
3
  import { entryText } from "../shared/types"
4
4
  import type { AdvisorResponse } from "./advisors"
5
5
  import type { ReviewerResponse } from "./reviewers"
6
- import type { CouncilOutput } from "./types"
6
+ import type { CouncilOutput, CitationValidation } from "./types"
7
+
8
// Primitive validators shared by the finding schemas below.
const nonEmptyText = z.string().min(1)
const nonNegativeCount = z.number().int().min(0)

/** A finding that must be resolved before the design can proceed. */
const BlockerSchema = z.object({
  issue: nonEmptyText,
  // Oracle entry IDs backing the blocker; an empty list is accepted.
  evidence: z.array(z.string()),
  required_fix: nonEmptyText,
})

/** A finding that should be addressed but does not block; the fix is optional. */
const WarningSchema = z.object({
  issue: nonEmptyText,
  suggested_fix: z.string().optional(),
})

/** Number of advisors behind each recommendation; every count is an integer ≥ 0. */
const AdvisorSplitSchema = z.object({
  proceed: nonNegativeCount,
  redesign: nonNegativeCount,
  "investigate-more": nonNegativeCount,
})
7
24
 
8
25
  const ChairmanOutputSchema = z.object({
9
26
  satisfied: z.boolean(),
10
27
  verdict: z.string().min(1),
11
- challenges: z.array(z.string()),
28
+ blockers: z.array(BlockerSchema),
29
+ warnings: z.array(WarningSchema),
12
30
  evidence_cited: z.array(z.string()),
31
+ advisor_split: AdvisorSplitSchema,
13
32
  recommendation: z.enum(["proceed", "redesign", "investigate-more"]),
14
33
  })
15
34
 
@@ -35,31 +54,67 @@ function formatEvidence(evidence: OracleResult[]): string {
35
54
  .join("\n")
36
55
  }
37
56
 
57
/**
 * Split cited entry IDs by whether they were actually in the evidence pack.
 *
 * An ID counts as hallucinated when it was cited but matches no `id` in
 * `evidence`. Input order is kept within each list, and repeated IDs are
 * kept as repeats.
 *
 * @param citedIds IDs listed in `evidence_cited`.
 * @param evidence The evidence pack the citations are checked against.
 * @returns `valid_ids` (found in the pack) and `hallucinated_ids` (not found).
 */
function validateCitations(
  citedIds: string[],
  evidence: OracleResult[],
): CitationValidation {
  const known = new Set<string>()
  for (const entry of evidence) {
    known.add(entry.id)
  }

  return {
    valid_ids: citedIds.filter(id => known.has(id)),
    hallucinated_ids: citedIds.filter(id => !known.has(id)),
  }
}
79
+
38
80
  const CHAIRMAN_SYSTEM_PROMPT = [
39
81
  "You are the Council Chairman. You synthesise the final verdict from all advisor and reviewer inputs.",
40
82
  "",
41
- "Your verdict must:",
42
- "1. Be grounded in Oracle evidence cite specific entry IDs for every material conclusion",
43
- "2. Summarise what was challenged and what held up under scrutiny",
44
- "3. State a clear recommendation",
45
- "4. List every Oracle entry ID that materially influenced the verdict in evidence_cited",
83
+ "Your output must classify findings by severity:",
84
+ " blockers issues that MUST be resolved before the design can proceed",
85
+ " (e.g. no rollback plan for a destructive migration, repeated a documented failure mode)",
86
+ " warnings issues that SHOULD be addressed but do not block execution",
87
+ " (e.g. no test coverage for an edge case, a preferred pattern not followed)",
88
+ "",
89
+ "For each blocker, cite the Oracle entry IDs that evidence it and state the required fix precisely.",
90
+ "For each warning, a suggested_fix is optional but preferred.",
91
+ "",
92
+ "advisor_split: count how many advisors recommended each option from their responses.",
93
+ " High split (no clear majority) is a signal of genuine uncertainty — reflect this in your verdict.",
46
94
  "",
47
- "satisfied = true → design holds up, can proceed to the human gate",
48
- "satisfied = false → fundamental flaw, unresolved gap, or design needs rework",
95
+ "satisfied = true → no blockers, design can proceed to the human gate",
96
+ "satisfied = false → at least one blocker exists, or the design needs rework",
97
+ "",
98
+ "evidence_cited: list every Oracle entry ID that materially influenced the verdict.",
99
+ " Only cite IDs that appeared in the Oracle Evidence section below.",
100
+ " Do not cite IDs from memory or general knowledge.",
49
101
  "",
50
102
  "Return ONLY valid JSON — no markdown fences, no explanation:",
51
103
  JSON.stringify({
52
104
  satisfied: "<boolean>",
53
- verdict: "<string ≤400 words — clear synthesis>",
54
- challenges: ["<string each challenge raised>"],
55
- evidence_cited: ["<Oracle entry ID>"],
105
+ verdict: "<string ≤400 words — clear synthesis citing entry IDs>",
106
+ blockers: [{ issue: "<string>", evidence: ["<Oracle entry ID>"], required_fix: "<string>" }],
107
+ warnings: [{ issue: "<string>", suggested_fix: "<string — optional>" }],
108
+ evidence_cited: ["<Oracle entry ID — only IDs present in the evidence pack>"],
109
+ advisor_split: { proceed: "<int>", redesign: "<int>", "investigate-more": "<int>" },
56
110
  recommendation: "proceed | redesign | investigate-more",
57
111
  }),
58
112
  ].join("\n")
59
113
 
60
114
  /**
61
115
  * Chairman synthesises the verdict from all advisor and reviewer inputs.
62
- * Every material conclusion must cite specific Oracle entry IDs.
116
+ * Classifies findings into blockers and warnings, validates citations,
117
+ * and tracks advisor split to surface genuine disagreement.
63
118
  *
64
119
  * Throws if the LLM returns non-JSON or output fails schema validation.
65
120
  */
@@ -109,5 +164,20 @@ export async function chairman(
109
164
  )
110
165
  }
111
166
 
112
- return result.data
167
+ const data = result.data
168
+
169
+ // Validate citations — flag any IDs cited that weren't in the evidence pack
170
+ const citation_validation = validateCitations(data.evidence_cited, evidence)
171
+
172
+ // Derive flat challenges array for backwards compatibility
173
+ const challenges = [
174
+ ...data.blockers.map(b => `[BLOCKER] ${b.issue}`),
175
+ ...data.warnings.map(w => w.issue),
176
+ ]
177
+
178
+ return {
179
+ ...data,
180
+ challenges,
181
+ citation_validation,
182
+ }
113
183
  }