@balpal4495/quorum 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +223 -11
- package/SETUP.md +30 -0
- package/bin/commands/check.js +122 -0
- package/bin/commands/commit.js +210 -0
- package/bin/commands/init.js +236 -0
- package/bin/commands/sentinel.js +160 -0
- package/bin/commands/status.js +117 -0
- package/bin/quorum.js +103 -0
- package/bin/shared/chronicle.js +129 -0
- package/bin/shared/colors.js +22 -0
- package/bin/shared/patterns.js +83 -0
- package/evals/__tests__/eval.test.ts +31 -0
- package/evals/cases/auth_hs256_rejected.json +46 -0
- package/evals/cases/auth_rs256_valid.json +30 -0
- package/evals/cases/cache_missing_lock.json +31 -0
- package/evals/cases/db_naive_not_null.json +32 -0
- package/evals/cases/logging_pii_leak.json +32 -0
- package/evals/cases/migration_with_rollback.json +43 -0
- package/evals/cases/no_evidence_novel_design.json +16 -0
- package/evals/cases/payment_no_idempotency.json +33 -0
- package/evals/cases/redis_session_rejected.json +32 -0
- package/evals/cases/safe_refactor.json +17 -0
- package/evals/runner.ts +226 -0
- package/modules/AGENTS.md +9 -5
- package/modules/CLAUDE.md +25 -2
- package/modules/README.md +153 -6
- package/modules/council/chairman.ts +84 -14
- package/modules/council/deliberate.ts +24 -4
- package/modules/council/index.ts +6 -1
- package/modules/council/risk.ts +89 -0
- package/modules/council/types.ts +63 -1
- package/modules/jury/evaluate.ts +32 -8
- package/modules/jury/index.ts +3 -1
- package/modules/jury/preflight.ts +101 -0
- package/modules/jury/schema.ts +9 -0
- package/modules/jury/types.ts +20 -1
- package/modules/shared/types.ts +8 -0
- package/package.json +3 -3
|
@@ -4,9 +4,14 @@ import { frameQuestion } from "./frame"
|
|
|
4
4
|
import { fanOutAdvisors } from "./advisors"
|
|
5
5
|
import { fanOutReviewers } from "./reviewers"
|
|
6
6
|
import { chairman } from "./chairman"
|
|
7
|
+
import { classifyRisk } from "./risk"
|
|
7
8
|
|
|
8
9
|
const DEFAULT_ADVISOR_COUNT = 5
|
|
9
10
|
const DEFAULT_REVIEWER_COUNT = 5
|
|
11
|
+
const LITE_ADVISOR_COUNT = 1
|
|
12
|
+
const LITE_REVIEWER_COUNT = 2
|
|
13
|
+
const JURY_ONLY_ADVISOR_COUNT = 1
|
|
14
|
+
const JURY_ONLY_REVIEWER_COUNT = 1
|
|
10
15
|
|
|
11
16
|
/**
|
|
12
17
|
* Run the Council deliberation pipeline.
|
|
@@ -34,11 +39,23 @@ export async function deliberate(
|
|
|
34
39
|
const {
|
|
35
40
|
llm,
|
|
36
41
|
oracle,
|
|
37
|
-
advisorCount = DEFAULT_ADVISOR_COUNT,
|
|
38
|
-
reviewerCount = DEFAULT_REVIEWER_COUNT,
|
|
39
42
|
models = {},
|
|
40
43
|
} = deps
|
|
41
44
|
|
|
45
|
+
// Classify risk to determine Council mode and advisor/reviewer counts
|
|
46
|
+
const risk = classifyRisk(input.outcome, input.design, input.evidence)
|
|
47
|
+
let defaultAdvisors = DEFAULT_ADVISOR_COUNT
|
|
48
|
+
let defaultReviewers = DEFAULT_REVIEWER_COUNT
|
|
49
|
+
if (risk.council_mode === "lite") {
|
|
50
|
+
defaultAdvisors = LITE_ADVISOR_COUNT
|
|
51
|
+
defaultReviewers = LITE_REVIEWER_COUNT
|
|
52
|
+
} else if (risk.council_mode === "jury-only") {
|
|
53
|
+
defaultAdvisors = JURY_ONLY_ADVISOR_COUNT
|
|
54
|
+
defaultReviewers = JURY_ONLY_REVIEWER_COUNT
|
|
55
|
+
}
|
|
56
|
+
const advisorCount = deps.advisorCount ?? defaultAdvisors
|
|
57
|
+
const reviewerCount = deps.reviewerCount ?? defaultReviewers
|
|
58
|
+
|
|
42
59
|
// Select personas — cycle DEFAULT_PERSONAS if advisorCount > 5
|
|
43
60
|
const personas = Array.from(
|
|
44
61
|
{ length: advisorCount },
|
|
@@ -88,11 +105,14 @@ export async function deliberate(
|
|
|
88
105
|
key_insight: keyInsight,
|
|
89
106
|
affected_areas: extractAffectedAreas(input.outcome, input.design),
|
|
90
107
|
alternatives_considered: verdict.challenges,
|
|
91
|
-
rejected_reason: verdict.satisfied
|
|
108
|
+
rejected_reason: verdict.satisfied
|
|
109
|
+
? []
|
|
110
|
+
: verdict.blockers.map(b => b.issue).slice(0, 3),
|
|
92
111
|
status: "open",
|
|
93
112
|
confidence: input.jury_output.confidence,
|
|
94
113
|
source_module: "council",
|
|
95
|
-
evidence_cited: verdict.
|
|
114
|
+
evidence_cited: verdict.citation_validation.valid_ids,
|
|
115
|
+
scope: risk.reasons.slice(0, 3),
|
|
96
116
|
})
|
|
97
117
|
|
|
98
118
|
return verdict
|
package/modules/council/index.ts
CHANGED
|
@@ -1,4 +1,9 @@
|
|
|
1
1
|
export { deliberate } from "./deliberate"
|
|
2
|
-
export type {
|
|
2
|
+
export type {
|
|
3
|
+
CouncilInput, CouncilOutput, CouncilDeps, CouncilModels,
|
|
4
|
+
BlockerItem, WarningItem, CitationValidation, AdvisorSplit,
|
|
5
|
+
RiskLevel, CouncilMode, RiskAssessment,
|
|
6
|
+
} from "./types"
|
|
3
7
|
export { DEFAULT_PERSONAS } from "./personas"
|
|
4
8
|
export type { AdvisorPersona } from "./personas"
|
|
9
|
+
export { classifyRisk } from "./risk"
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
import type { OracleResult } from "../shared/types"
|
|
2
|
+
import type { RiskLevel, CouncilMode, RiskAssessment } from "./types"
|
|
3
|
+
|
|
4
|
+
/**
 * Patterns that trigger risk escalation.
 * Each entry has a level (the minimum risk level it triggers) and a reason label.
 *
 * Every keyword is wrapped in word boundaries, so an inflected form only matches
 * when it is spelled out in the alternation. Plurals, common verb forms and the
 * "-isation" spelling are therefore listed explicitly; otherwise text such as
 * "rotate passwords" or "encryption at rest" would match nothing and be treated
 * as low risk. Each pattern is a superset of its earlier, singular-only form.
 */
const RISK_RULES: Array<{ pattern: RegExp; level: RiskLevel; reason: string }> = [
  // Critical — always run full Council + flag for human architecture review
  { pattern: /\b(auth(?:entication|ori[sz]ation)?|jwts?|tokens?|sessions?|passwords?|oauth|credentials?|bearer)\b/i, level: "critical", reason: "authentication or authorisation logic" },
  { pattern: /\b(payments?|stripe|charges?|billing|checkouts?|refunds?|subscriptions?)\b/i, level: "critical", reason: "payment or billing logic" },
  { pattern: /\b(encrypt(?:s|ed|ion|ing)?|decrypt(?:s|ed|ion|ing)?|private\s+keys?|certificates?|tls|ssl|hmac|ciphers?)\b/i, level: "critical", reason: "cryptography or key management" },
  { pattern: /\b(delete\s+all|drop\s+table|truncate[sd]?|wipe[sd]?|destroy.*data|hard\s+deletes?)\b/i, level: "critical", reason: "irreversible data deletion" },

  // High — full Council
  { pattern: /\b(migrat(?:ions?|e[sd]?|ing)|alter\s+table|schema\s+changes?|not\s+null|backfill(?:s|ed|ing)?|pg_repack|shadow\s+columns?)\b/i, level: "high", reason: "database schema migration" },
  { pattern: /\b(permissions?|roles?|acls?|rbac|access\s+control|entitlements?)\b/i, level: "high", reason: "permissions or access control" },
  { pattern: /\b(pii|personal\s+data|gdpr|ccpa|emails?(?:\s+address(?:es)?)?|phones?(?:\s+numbers?)?|ssns?|passports?)\b/i, level: "high", reason: "PII or compliance-regulated data" },
  { pattern: /\b(api\s+keys?|secrets?|private\s+keys?|credentials?)\b/i, level: "high", reason: "secrets or credentials handling" },

  // Medium — Jury + lite Council
  { pattern: /\b(caches?|cach(?:ed|ing)|redis|memcached|invalidat(?:e[sd]?|ion|ing))\b/i, level: "medium", reason: "cache strategy" },
  { pattern: /\b(rate\s*limit(?:s|ed|er|ing)?|throttl(?:e[sd]?|ing)|quotas?)\b/i, level: "medium", reason: "rate limiting or throttling" },
  { pattern: /\b(webhooks?|events?|queues?|pubsub|kafka|rabbitmq|sns|sqs)\b/i, level: "medium", reason: "async event or messaging" },
  { pattern: /\b(deploy(?:s|ed|ing|ments?)?|ci(?:\/cd)?|docker|kubernetes|infra(?:structure)?)\b/i, level: "medium", reason: "deployment or infrastructure" },
]
|
|
27
|
+
|
|
28
|
+
/** Risk levels in ascending order of severity; a level's index is its rank. */
const RISK_ORDER: RiskLevel[] = ["low", "medium", "high", "critical"]

/**
 * Returns whichever of the two levels ranks higher in RISK_ORDER.
 * When both rank the same, `a` is returned.
 */
function maxLevel(a: RiskLevel, b: RiskLevel): RiskLevel {
  const rankOfA = RISK_ORDER.indexOf(a)
  const rankOfB = RISK_ORDER.indexOf(b)
  if (rankOfA < rankOfB) {
    return b
  }
  return a
}
|
|
33
|
+
|
|
34
|
+
/**
 * Maps a risk level to the Council mode used for it:
 * "low" → "jury-only", "medium" → "lite", "high" and "critical" → "full".
 */
function councilModeForLevel(level: RiskLevel): CouncilMode {
  switch (level) {
    case "critical":
    case "high":
      return "full"
    case "medium":
      return "lite"
    case "low":
      return "jury-only"
  }
}
|
|
42
|
+
|
|
43
|
+
/**
 * Classify the risk level of a proposed change from its text and evidence.
 *
 * The outcome and design are joined into one string and tested against every
 * entry of RISK_RULES. The result level is the highest level among the matching
 * rules, starting from "low". Each matching rule contributes its reason label.
 *
 * Refuted entries in the evidence pack are a risk signal of their own: one
 * refuted entry raises the level to at least "medium", two or more to at least
 * "high", and a reason stating the count is appended.
 *
 * @param outcome  Text of the desired outcome.
 * @param design   Text of the proposed design.
 * @param evidence Oracle results; only entries with status "refuted" are counted.
 * @returns The level, the reasons (or a single "no sensitive patterns detected"
 *          entry when nothing fired), and the Council mode derived from the level.
 */
export function classifyRisk(
  outcome: string,
  design: string,
  evidence: OracleResult[],
): RiskAssessment {
  const haystack = `${outcome} ${design}`
  const reasons: string[] = []
  let level: RiskLevel = "low"

  for (const { pattern, level: ruleLevel, reason } of RISK_RULES) {
    if (!pattern.test(haystack)) continue

    const escalated = maxLevel(level, ruleLevel)
    // Skip only when the rule neither raises the level nor adds a new reason.
    if (escalated === level && reasons.includes(reason)) continue

    level = escalated
    reasons.push(reason)
  }

  // Refuted entries in the evidence pack are a direct risk signal
  let refutedCount = 0
  for (const entry of evidence) {
    if (entry.status === "refuted") refutedCount += 1
  }

  if (refutedCount > 0) {
    const refutedRisk: RiskLevel = refutedCount >= 2 ? "high" : "medium"
    // maxLevel keeps the current level unless the refuted signal outranks it.
    level = maxLevel(level, refutedRisk)
    const noun = refutedCount === 1 ? "entry" : "entries"
    reasons.push(`${refutedCount} refuted Chronicle ${noun} in evidence pack`)
  }

  const reportedReasons = reasons.length > 0 ? reasons : ["no sensitive patterns detected"]

  return {
    level,
    reasons: reportedReasons,
    council_mode: councilModeForLevel(level),
  }
}
|
package/modules/council/types.ts
CHANGED
|
@@ -12,14 +12,57 @@ export interface CouncilInput {
|
|
|
12
12
|
jury_output: JuryOutput
|
|
13
13
|
}
|
|
14
14
|
|
|
15
|
+
/**
 * A blocking finding: the design cannot proceed until it is resolved.
 */
export interface BlockerItem {
  /** The problem that was found. */
  issue: string
  /** IDs of the Oracle entries that back this blocker. */
  evidence: string[]
  /** The change the design needs in order to clear this blocker. */
  required_fix: string
}
|
|
23
|
+
|
|
24
|
+
/**
 * A non-blocking finding: worth addressing, but the design may proceed without it.
 */
export interface WarningItem {
  /** The problem that was found. */
  issue: string
  /** Optional suggestion for how to address it. */
  suggested_fix?: string
}
|
|
29
|
+
|
|
30
|
+
/**
 * Result of checking cited Oracle IDs against the evidence pack.
 */
export interface CitationValidation {
  /** Cited IDs that are present in the evidence pack. */
  valid_ids: string[]
  /** Cited IDs that are absent from the evidence pack — likely hallucinated. */
  hallucinated_ids: string[]
}
|
|
37
|
+
|
|
38
|
+
/**
 * Tally of advisor recommendations, one counter per recommendation value.
 * An uneven or spread-out tally signals disagreement among advisors.
 */
export interface AdvisorSplit {
  /** Number of advisors recommending "proceed". */
  proceed: number
  /** Number of advisors recommending "redesign". */
  redesign: number
  /** Number of advisors recommending "investigate-more". */
  "investigate-more": number
}
|
|
44
|
+
|
|
15
45
|
/** Structured result of a Council deliberation. */
export interface CouncilOutput {
  satisfied: boolean
  /** Chairman synthesis — every material conclusion cites Oracle entry IDs. */
  verdict: string
  /**
   * Findings that MUST be resolved before the design proceeds.
   * Each one carries the issue, the Oracle evidence behind it, and the required fix.
   */
  blockers: BlockerItem[]
  /** Findings that SHOULD be addressed but do not block execution. */
  warnings: WarningItem[]
  /** Flat list of all issues raised — kept for backwards compatibility with existing consumers. */
  challenges: string[]
  /** Oracle entry IDs referenced in the verdict. */
  evidence_cited: string[]
  /** Whether the cited IDs actually exist in the evidence pack. */
  citation_validation: CitationValidation
  /** How advisors split on the recommendation — high disagreement means escalate. */
  advisor_split: AdvisorSplit
  recommendation: "proceed" | "redesign" | "investigate-more"
}
|
|
25
68
|
|
|
@@ -43,3 +86,22 @@ export interface CouncilDeps {
|
|
|
43
86
|
reviewerCount?: number
|
|
44
87
|
models?: CouncilModels
|
|
45
88
|
}
|
|
89
|
+
|
|
90
|
+
// ── Risk classifier types ─────────────────────────────────────────────────────
|
|
91
|
+
|
|
92
|
+
/** Severity of a proposed change, listed from least to most severe. */
export type RiskLevel =
  | "low"
  | "medium"
  | "high"
  | "critical"
|
|
93
|
+
|
|
94
|
+
/**
 * How much of the Council pipeline to run.
 *   skip      → Oracle query only, no LLM validation
 *   jury-only → Jury scores, no Council fan-out
 *   lite      → Jury + 1–2 reviewers (no full advisor fan-out)
 *   full      → Full Council (default 5 advisors + 5 reviewers + Chairman)
 */
export type CouncilMode =
  | "skip"
  | "jury-only"
  | "lite"
  | "full"
|
|
102
|
+
|
|
103
|
+
/** Outcome of risk classification for a proposed change. */
export interface RiskAssessment {
  /** The assessed risk level. */
  level: RiskLevel
  /** Human-readable labels explaining why this level was assigned. */
  reasons: string[]
  /** The Council mode selected for this level. */
  council_mode: CouncilMode
}
|
package/modules/jury/evaluate.ts
CHANGED
|
@@ -2,6 +2,7 @@ import type { JuryInput, JuryOutput, JuryDeps } from "./types"
|
|
|
2
2
|
import type { OracleResult } from "../shared/types"
|
|
3
3
|
import { entryText } from "../shared/types"
|
|
4
4
|
import { JuryOutputSchema } from "./schema"
|
|
5
|
+
import { runPreflight, formatPreflight } from "./preflight"
|
|
5
6
|
|
|
6
7
|
const CONFIDENCE_THRESHOLD = 0.6
|
|
7
8
|
|
|
@@ -25,14 +26,21 @@ function formatEvidence(evidence: OracleResult[]): string {
|
|
|
25
26
|
|
|
26
27
|
const SYSTEM_PROMPT = `You are the Jury — an evidence-based evaluator for agentic development workflows.
|
|
27
28
|
|
|
28
|
-
Your job is to evaluate a proposed design against Oracle evidence and produce a
|
|
29
|
+
Your job is to evaluate a proposed design against Oracle evidence and produce a calibrated confidence score.
|
|
29
30
|
You do NOT make decisions. You assess and score. Your output determines the Council's brief.
|
|
30
31
|
|
|
31
|
-
Score the design across
|
|
32
|
-
1.
|
|
33
|
-
2.
|
|
34
|
-
3.
|
|
35
|
-
4.
|
|
32
|
+
Score the design across four dimensions, each 0–1:
|
|
33
|
+
1. evidence_support — do validated Oracle entries confirm this approach works in this codebase?
|
|
34
|
+
2. feasibility — do Oracle entries (or their absence) suggest this is achievable?
|
|
35
|
+
3. risk — how well does the design address known failure modes? (1 = fully addressed, 0 = ignored)
|
|
36
|
+
4. completeness — does the design cover the full outcome, or only part of it?
|
|
37
|
+
|
|
38
|
+
confidence = average of the four scores (you must compute this yourself — do not round or adjust it).
|
|
39
|
+
|
|
40
|
+
Gaps fall into two categories:
|
|
41
|
+
- gaps: any missing evidence that would improve confidence
|
|
42
|
+
- blocking_gaps: a SUBSET of gaps that are hard blockers — must be resolved before proceeding
|
|
43
|
+
(examples: no rollback plan for a destructive change, no auth strategy for a security-sensitive feature)
|
|
36
44
|
|
|
37
45
|
council_brief is determined by confidence only (do not invent a value):
|
|
38
46
|
confidence < 0.6 → council_brief = "challenge"
|
|
@@ -41,8 +49,15 @@ council_brief is determined by confidence only (do not invent a value):
|
|
|
41
49
|
Return ONLY valid JSON that matches this schema exactly — no markdown fences, no explanation:
|
|
42
50
|
{
|
|
43
51
|
"confidence": <number 0–1>,
|
|
52
|
+
"confidence_breakdown": {
|
|
53
|
+
"evidence_support": <number 0–1>,
|
|
54
|
+
"feasibility": <number 0–1>,
|
|
55
|
+
"risk": <number 0–1>,
|
|
56
|
+
"completeness": <number 0–1>
|
|
57
|
+
},
|
|
44
58
|
"assessment": <string — what the evidence supports or contradicts>,
|
|
45
|
-
"gaps": [<string — each missing piece of evidence
|
|
59
|
+
"gaps": [<string — each missing piece of evidence>],
|
|
60
|
+
"blocking_gaps": [<string — gaps that are hard blockers only>],
|
|
46
61
|
"council_brief": "challenge" | "pressure-test",
|
|
47
62
|
"recommendation": "proceed" | "investigate-more" | "redesign"
|
|
48
63
|
}`
|
|
@@ -63,6 +78,8 @@ export async function evaluate(
|
|
|
63
78
|
): Promise<JuryOutput> {
|
|
64
79
|
const { llm, model } = deps
|
|
65
80
|
const evidenceText = formatEvidence(input.evidence)
|
|
81
|
+
const preflight = runPreflight(input.outcome, input.design, input.evidence)
|
|
82
|
+
const preflightText = formatPreflight(preflight)
|
|
66
83
|
|
|
67
84
|
const userPrompt = [
|
|
68
85
|
"## Outcome",
|
|
@@ -71,6 +88,8 @@ export async function evaluate(
|
|
|
71
88
|
"## Proposed Design",
|
|
72
89
|
input.design,
|
|
73
90
|
"",
|
|
91
|
+
preflightText,
|
|
92
|
+
"",
|
|
74
93
|
"## Oracle Evidence",
|
|
75
94
|
evidenceText,
|
|
76
95
|
].join("\n")
|
|
@@ -105,7 +124,12 @@ export async function evaluate(
|
|
|
105
124
|
|
|
106
125
|
const output = result.data
|
|
107
126
|
|
|
108
|
-
//
|
|
127
|
+
// Recompute confidence as the exact average of breakdown dimensions
|
|
128
|
+
// This makes confidence deterministic and calibrated regardless of what the LLM returned
|
|
129
|
+
const { evidence_support, feasibility, risk, completeness } = output.confidence_breakdown
|
|
130
|
+
output.confidence = Math.round(((evidence_support + feasibility + risk + completeness) / 4) * 100) / 100
|
|
131
|
+
|
|
132
|
+
// Enforce council_brief from recomputed confidence — do not trust the LLM to compute this correctly
|
|
109
133
|
output.council_brief =
|
|
110
134
|
output.confidence < CONFIDENCE_THRESHOLD ? "challenge" : "pressure-test"
|
|
111
135
|
|
package/modules/jury/index.ts
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
1
|
export { evaluate } from "./evaluate"
|
|
2
|
-
export type { JuryInput, JuryOutput, JuryDeps } from "./types"
|
|
2
|
+
export type { JuryInput, JuryOutput, JuryDeps, ConfidenceBreakdown } from "./types"
|
|
3
3
|
export { JuryOutputSchema } from "./schema"
|
|
4
|
+
export { runPreflight, formatPreflight } from "./preflight"
|
|
5
|
+
export type { PreflightResult } from "./preflight"
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
import type { OracleResult } from "../shared/types"
|
|
2
|
+
import { entryText } from "../shared/types"
|
|
3
|
+
|
|
4
|
+
/**
 * Areas that warrant elevated scrutiny, keyed by category name.
 *
 * Keywords are wrapped in word boundaries, so plurals and verb forms only match
 * when listed explicitly; they are spelled out here so that text such as
 * "passwords" or "environment variables" is not missed.
 *
 * The `.env` alternative sits outside the leading `\b`: a word boundary cannot
 * occur between whitespace (or start of text) and a dot, so `\b\.env` could
 * never match a bare ".env" file reference.
 */
const SENSITIVE_PATTERNS: Record<string, RegExp> = {
  auth: /\b(auth(?:entication|ori[sz]ation)?|jwts?|tokens?|sessions?|passwords?|oauth|logins?|logouts?|credentials?|bearer)\b/i,
  database: /\b(migrat(?:ions?|e[sd]?|ing)|alter\s+table|schema\s+changes?|postgres(?:ql)?|mysql|sqlite|prisma|drizzle|knex|sequelize)\b/i,
  crypto: /\b(encrypt(?:s|ed|ion|ing)?|decrypt(?:s|ed|ion|ing)?|ciphers?|hash(?:es|ed|ing)?|hmac|sign(?:ing)?|verify|private\s+keys?|certificates?|tls|ssl)\b/i,
  payments: /\b(payments?|stripe|charges?|billing|invoices?|subscriptions?|prices?|checkouts?|refunds?)\b/i,
  permissions: /\b(permissions?|roles?|acls?|access\s+control|rbac|authori[sz]ation|entitlements?)\b/i,
  pii: /\b(pii|personal\s+data|gdpr|ccpa|emails?(?:\s+address(?:es)?)?|phones?(?:\s+numbers?)?|postal\s+address(?:es)?|ssns?|passports?)\b/i,
  data_deletion: /\b(delet(?:e[sd]?|ing|ions?)(?:\s+all)?|drop\s+table|truncate[sd]?|purge[sd]?|wipe[sd]?|destroy.*data|hard\s+deletes?)\b/i,
  secrets: /\b(api\s+keys?|secrets?|env(?:ironment)?\s+var(?:iable)?s?|private\s+keys?|credentials?)\b|\.env\b/i,
}
|
|
15
|
+
|
|
16
|
+
/**
 * Matches mentions of a rollback or recovery strategy.
 * Inflected forms are listed explicitly because the trailing `\b` rejects any
 * keyword followed by more letters — "backward-compat\b", for example, can
 * never match "backward-compatible".
 */
const ROLLBACK_PATTERNS = /\b(rollbacks?|roll(?:s|ed|ing)?\s+back|revert(?:s|ed|ing)?|undo|restor(?:e[sd]?|ing)|recovery|fallbacks?|backwards?[- ]compat(?:ible|ibility)?)\b/i
|
|
17
|
+
/**
 * Matches mentions of a testing strategy.
 * "unit test" and "integration test" need no alternative of their own: the bare
 * `test` alternative already matches inside them. "tested", "specs" and
 * "specifications" are listed because the trailing `\b` would otherwise reject them.
 */
const TEST_PATTERNS = /\b(test(?:ing|ed|s)?|specs?|specifications?|coverage|vitest|jest|mocha)\b/i
|
|
18
|
+
|
|
19
|
+
/** Signals produced by the static preflight checks. */
export interface PreflightResult {
  /** True when at least one sensitive-area category was detected. */
  touches_sensitive_area: boolean
  /** Names of the sensitive-area categories that were detected. */
  sensitive_areas: string[]
  /** True when the text mentions a rollback or recovery strategy. */
  rollback_mentioned: boolean
  /** True when the text mentions testing. */
  test_strategy_mentioned: boolean
  /**
   * IDs of refuted Chronicle entries that overlap with the design text.
   * They are potential conflicts that the Jury should surface.
   */
  chronicle_conflicts: string[]
}
|
|
33
|
+
|
|
34
|
+
/**
 * Static preflight analysis — no LLM required.
 *
 * Joins the outcome and design into one string and runs deterministic checks
 * on it and on the evidence pack:
 *   - which SENSITIVE_PATTERNS categories match the text,
 *   - whether ROLLBACK_PATTERNS and TEST_PATTERNS match the text,
 *   - which refuted evidence entries share at least one word longer than four
 *     characters with the text (compared case-insensitively).
 *
 * @param outcome  Text of the desired outcome.
 * @param design   Text of the proposed design.
 * @param evidence Oracle results; only entries with status "refuted" are
 *                 considered for conflicts.
 * @returns The collected preflight signals.
 */
export function runPreflight(
  outcome: string,
  design: string,
  evidence: OracleResult[],
): PreflightResult {
  const combined = `${outcome} ${design}`

  // Lower-cased words of more than four characters, split on non-word characters.
  const significantWords = (raw: string): string[] =>
    raw
      .toLowerCase()
      .split(/\W+/)
      .filter(word => word.length > 4)

  const sensitive_areas: string[] = []
  for (const [area, pattern] of Object.entries(SENSITIVE_PATTERNS)) {
    if (pattern.test(combined)) {
      sensitive_areas.push(area)
    }
  }

  const designVocabulary = new Set(significantWords(combined))

  // Refuted entries whose text shares at least one significant word with the design
  const chronicle_conflicts: string[] = []
  for (const entry of evidence) {
    if (entry.status !== "refuted") continue
    const overlaps = significantWords(entryText(entry)).some(word => designVocabulary.has(word))
    if (overlaps) {
      chronicle_conflicts.push(entry.id)
    }
  }

  const rollback_mentioned = ROLLBACK_PATTERNS.test(combined)
  const test_strategy_mentioned = TEST_PATTERNS.test(combined)

  return {
    touches_sensitive_area: sensitive_areas.length > 0,
    sensitive_areas,
    rollback_mentioned,
    test_strategy_mentioned,
    chronicle_conflicts,
  }
}
|
|
79
|
+
|
|
80
|
+
/**
 * Renders a preflight result as the text section injected into the Jury prompt.
 *
 * The output is a heading followed by one line each for sensitive areas,
 * rollback strategy and test strategy, then either one line stating there are
 * no conflicting refuted entries or two lines listing the conflicting IDs.
 * Lines are joined with "\n".
 */
export function formatPreflight(preflight: PreflightResult): string {
  const {
    touches_sensitive_area,
    sensitive_areas,
    rollback_mentioned,
    test_strategy_mentioned,
    chronicle_conflicts,
  } = preflight

  const heading = "## Deterministic Preflight (machine-checked, not LLM-inferred)"

  const sensitiveLine = touches_sensitive_area
    ? `⚠ Sensitive areas detected: ${sensitive_areas.join(", ")}`
    : "✓ No sensitive areas detected"

  const rollbackLine = rollback_mentioned
    ? "✓ Rollback strategy mentioned"
    : "✗ No rollback strategy mentioned"

  const testLine = test_strategy_mentioned
    ? "✓ Test strategy mentioned"
    : "✗ No test strategy mentioned"

  const conflictLines =
    chronicle_conflicts.length > 0
      ? [
          `⚠ Refuted Chronicle entries potentially conflicting: ${chronicle_conflicts.join(", ")}`,
          " These entries were previously tried and failed — verify the design addresses the documented failure reason.",
        ]
      : ["✓ No conflicting refuted Chronicle entries"]

  return [heading, sensitiveLine, rollbackLine, testLine, ...conflictLines].join("\n")
}
|
package/modules/jury/schema.ts
CHANGED
|
@@ -1,13 +1,22 @@
|
|
|
1
1
|
import { z } from "zod"
|
|
2
2
|
|
|
3
|
+
/**
 * Zod schema for the per-dimension confidence breakdown.
 * Each of the four dimensions must be a number between 0 and 1 inclusive.
 */
const ConfidenceBreakdownSchema = z.object({
  evidence_support: z.number().min(0).max(1),
  feasibility: z.number().min(0).max(1),
  risk: z.number().min(0).max(1),
  completeness: z.number().min(0).max(1),
})
|
|
9
|
+
|
|
3
10
|
/**
 * Zod schema describing the structured output expected from the Jury LLM.
 * evaluate() checks every LLM response against this schema before returning.
 */
export const JuryOutputSchema = z.object({
  // Overall score, constrained to the range 0–1.
  confidence: z.number().min(0).max(1),
  // Per-dimension scores that make up the overall score.
  confidence_breakdown: ConfidenceBreakdownSchema,
  // Must be a non-empty string.
  assessment: z.string().min(1),
  gaps: z.array(z.string()),
  blocking_gaps: z.array(z.string()),
  council_brief: z.enum(["challenge", "pressure-test"]),
  recommendation: z.enum(["proceed", "investigate-more", "redesign"]),
})
|
package/modules/jury/types.ts
CHANGED
|
@@ -9,13 +9,32 @@ export interface JuryInput {
|
|
|
9
9
|
evidence: OracleResult[]
|
|
10
10
|
}
|
|
11
11
|
|
|
12
|
+
/**
 * The four dimensions that make up the 0–1 confidence score.
 */
export interface ConfidenceBreakdown {
  /** Whether validated Oracle entries confirm this approach works here. */
  evidence_support: number
  /** Whether Oracle entries suggest this is achievable in this codebase. */
  feasibility: number
  /** How well the design addresses known failure modes (1 = fully addressed). */
  risk: number
  /** Whether the design covers the full outcome or only part of it. */
  completeness: number
}
|
|
23
|
+
|
|
12
24
|
export interface JuryOutput {
|
|
13
|
-
/** 0–1 confidence score.
|
|
25
|
+
/** 0–1 confidence score. Average of the four breakdown dimensions. */
|
|
14
26
|
confidence: number
|
|
27
|
+
/** Per-dimension breakdown of the confidence score. */
|
|
28
|
+
confidence_breakdown: ConfidenceBreakdown
|
|
15
29
|
/** What the evidence supports or contradicts. */
|
|
16
30
|
assessment: string
|
|
17
31
|
/** Evidence missing from Oracle that would improve confidence. */
|
|
18
32
|
gaps: string[]
|
|
33
|
+
/**
|
|
34
|
+
* Gaps that are hard blockers — must be resolved before Council should proceed.
|
|
35
|
+
* Subset of gaps where the missing information is critical (auth, rollback, data safety).
|
|
36
|
+
*/
|
|
37
|
+
blocking_gaps: string[]
|
|
19
38
|
/**
|
|
20
39
|
* Council brief derived from confidence:
|
|
21
40
|
* < 0.6 → "challenge" (find what is wrong — broader scope)
|
package/modules/shared/types.ts
CHANGED
|
@@ -56,6 +56,14 @@ export type ChronicleEntry = {
|
|
|
56
56
|
work_ref?: WorkRef
|
|
57
57
|
timestamp: string
|
|
58
58
|
|
|
59
|
+
// ── outcome tracking fields (optional — filled in post-execution) ────────────
|
|
60
|
+
/** Steps that must pass to confirm this decision was correct. */
|
|
61
|
+
validation_plan?: string[]
|
|
62
|
+
/** ISO date after which this entry should be re-evaluated for drift. */
|
|
63
|
+
review_after?: string
|
|
64
|
+
/** What actually happened after the decision was acted on in production. */
|
|
65
|
+
post_merge_result?: "successful" | "bug" | "partial" | "rolled-back"
|
|
66
|
+
|
|
59
67
|
// ── v2 fields (optional — absent on legacy entries) ──────────────────────
|
|
60
68
|
/** 2 = decision record format. Absent = v1 legacy entry. */
|
|
61
69
|
schema_version?: 2
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@balpal4495/quorum",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.4.0",
|
|
4
4
|
"description": "Portable reasoning layer for agentic codebases — Oracle, Jury, Council, Sentinel",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"license": "MIT",
|
|
@@ -20,11 +20,11 @@
|
|
|
20
20
|
"llm"
|
|
21
21
|
],
|
|
22
22
|
"bin": {
|
|
23
|
-
"quorum": "bin/
|
|
23
|
+
"quorum": "bin/quorum.js"
|
|
24
24
|
},
|
|
25
25
|
"scripts": {
|
|
26
26
|
"init": "node bin/init.js",
|
|
27
|
-
"test": "vitest run modules/",
|
|
27
|
+
"test": "vitest run modules/ evals/",
|
|
28
28
|
"test:watch": "vitest modules/",
|
|
29
29
|
"typecheck": "tsc --noEmit"
|
|
30
30
|
},
|