@kevinrabun/judges 3.115.4 → 3.117.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/agents/accessibility.judge.md +7 -0
- package/agents/agent-instructions.judge.md +7 -0
- package/agents/ai-code-safety.judge.md +7 -0
- package/agents/api-contract.judge.md +7 -0
- package/agents/api-design.judge.md +7 -0
- package/agents/authentication.judge.md +7 -0
- package/agents/backwards-compatibility.judge.md +7 -0
- package/agents/caching.judge.md +7 -0
- package/agents/ci-cd.judge.md +7 -0
- package/agents/cloud-readiness.judge.md +7 -0
- package/agents/concurrency.judge.md +7 -0
- package/agents/configuration-management.judge.md +7 -0
- package/agents/cybersecurity.judge.md +7 -0
- package/agents/data-security.judge.md +7 -0
- package/agents/dependency-health.judge.md +7 -0
- package/agents/documentation.judge.md +7 -0
- package/agents/error-handling.judge.md +7 -0
- package/agents/ethics-bias.judge.md +7 -0
- package/agents/false-positive-review.judge.md +12 -0
- package/agents/framework-safety.judge.md +7 -0
- package/agents/hallucination-detection.judge.md +13 -0
- package/agents/iac-security.judge.md +7 -0
- package/agents/intent-alignment.judge.md +13 -0
- package/agents/logging-privacy.judge.md +7 -0
- package/agents/maintainability.judge.md +7 -0
- package/agents/multi-turn-coherence.judge.md +7 -0
- package/agents/observability.judge.md +7 -0
- package/agents/portability.judge.md +7 -0
- package/agents/rate-limiting.judge.md +7 -0
- package/agents/reliability.judge.md +7 -0
- package/agents/security.judge.md +13 -0
- package/agents/testing.judge.md +7 -0
- package/agents/ux.judge.md +7 -0
- package/dist/a2a-protocol.d.ts +136 -0
- package/dist/a2a-protocol.js +218 -0
- package/dist/api.d.ts +21 -3
- package/dist/api.js +21 -1
- package/dist/audit-trail.d.ts +245 -0
- package/dist/audit-trail.js +257 -0
- package/dist/commands/benchmark-advanced.js +51 -51
- package/dist/commands/benchmark-ai-agents.js +16 -16
- package/dist/commands/benchmark-compliance-ethics.js +12 -12
- package/dist/commands/benchmark-expanded-2.js +2 -2
- package/dist/commands/benchmark-expanded.js +2 -2
- package/dist/commands/benchmark-infrastructure.js +12 -12
- package/dist/commands/benchmark-languages.js +11 -11
- package/dist/commands/benchmark-quality-ops.js +7 -7
- package/dist/commands/benchmark-security-deep.js +9 -9
- package/dist/commands/benchmark.js +1 -1
- package/dist/commands/llm-benchmark-optimizer.d.ts +78 -0
- package/dist/commands/llm-benchmark-optimizer.js +241 -0
- package/dist/commands/llm-benchmark.d.ts +4 -2
- package/dist/commands/llm-benchmark.js +40 -12
- package/dist/escalation.d.ts +100 -0
- package/dist/escalation.js +292 -0
- package/dist/evaluation-session.d.ts +74 -0
- package/dist/evaluation-session.js +152 -0
- package/dist/evaluators/index.d.ts +23 -1
- package/dist/evaluators/index.js +192 -3
- package/dist/evaluators/judge-selector.d.ts +19 -0
- package/dist/evaluators/judge-selector.js +141 -0
- package/dist/evaluators/recall-boost.d.ts +27 -0
- package/dist/evaluators/recall-boost.js +409 -0
- package/dist/feedback-loop.d.ts +62 -0
- package/dist/feedback-loop.js +179 -0
- package/dist/index.js +2 -0
- package/dist/judges/accessibility.js +7 -0
- package/dist/judges/agent-instructions.js +7 -0
- package/dist/judges/ai-code-safety.js +7 -0
- package/dist/judges/api-contract.js +7 -0
- package/dist/judges/api-design.js +7 -0
- package/dist/judges/authentication.js +7 -0
- package/dist/judges/backwards-compatibility.js +7 -0
- package/dist/judges/caching.js +7 -0
- package/dist/judges/ci-cd.js +7 -0
- package/dist/judges/cloud-readiness.js +7 -0
- package/dist/judges/concurrency.js +7 -0
- package/dist/judges/configuration-management.js +7 -0
- package/dist/judges/cybersecurity.js +7 -0
- package/dist/judges/data-security.js +7 -0
- package/dist/judges/dependency-health.js +7 -0
- package/dist/judges/documentation.js +7 -0
- package/dist/judges/error-handling.js +7 -0
- package/dist/judges/ethics-bias.js +7 -0
- package/dist/judges/false-positive-review.js +13 -1
- package/dist/judges/framework-safety.js +7 -0
- package/dist/judges/hallucination-detection.js +14 -1
- package/dist/judges/iac-security.js +7 -0
- package/dist/judges/intent-alignment.js +14 -1
- package/dist/judges/logging-privacy.js +7 -0
- package/dist/judges/maintainability.js +7 -0
- package/dist/judges/multi-turn-coherence.js +7 -0
- package/dist/judges/observability.js +7 -0
- package/dist/judges/portability.js +7 -0
- package/dist/judges/rate-limiting.js +7 -0
- package/dist/judges/reliability.js +7 -0
- package/dist/judges/security.js +14 -1
- package/dist/judges/testing.js +7 -0
- package/dist/judges/ux.js +7 -0
- package/dist/review-conversation.d.ts +87 -0
- package/dist/review-conversation.js +307 -0
- package/dist/sast-integration.d.ts +112 -0
- package/dist/sast-integration.js +215 -0
- package/dist/tools/register-evaluation.js +208 -8
- package/dist/tools/register-fix.js +24 -1
- package/dist/tools/register-resources.d.ts +6 -0
- package/dist/tools/register-resources.js +177 -0
- package/dist/tools/register-review.js +26 -1
- package/dist/tools/register-workflow.js +384 -11
- package/dist/tools/validation.d.ts +13 -0
- package/dist/tools/validation.js +77 -0
- package/dist/types.d.ts +122 -0
- package/package.json +25 -12
- package/server.json +2 -2
|
@@ -0,0 +1,292 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Human Escalation Protocol
|
|
3
|
+
*
|
|
4
|
+
* Routes low-confidence findings to human reviewers instead of auto-actioning.
|
|
5
|
+
* Provides a structured escalation workflow with reasons, routing suggestions,
|
|
6
|
+
* and a persistent escalation queue.
|
|
7
|
+
*
|
|
8
|
+
* Data stored in .judges-escalations.json
|
|
9
|
+
*/
|
|
10
|
+
import { readFileSync, writeFileSync, existsSync } from "fs";
|
|
11
|
+
import { resolve } from "path";
|
|
12
|
+
import { getDataAdapter } from "./data-adapter.js";
|
|
13
|
+
// ─── Constants ───────────────────────────────────────────────────────────────
|
|
14
|
+
/** File name of the persisted escalation queue; the store I/O helpers resolve it against a caller-supplied directory. */
const ESCALATION_FILE = ".judges-escalations.json";
/** Confidence cut-off applied when the caller's policy does not provide one. */
const DEFAULT_CONFIDENCE_THRESHOLD = 0.5;
/** Rule-ID prefixes whose findings are routed to the security team. */
const SECURITY_PREFIXES = new Set([
    "SEC-",
    "CYBER-",
    "AUTH-",
    "DATA-",
    "AICS-",
    "LOGPRIV-",
]);
/** Rule-ID prefixes whose findings are routed to the compliance officer. */
const COMPLIANCE_PREFIXES = new Set([
    "COMP-",
    "DSOV-",
    "ETH-",
]);
|
|
20
|
+
// ─── Escalation Store I/O ────────────────────────────────────────────────────
|
|
21
|
+
export function loadEscalationStore(dir = ".") {
|
|
22
|
+
const filePath = resolve(dir, ESCALATION_FILE);
|
|
23
|
+
if (!existsSync(filePath)) {
|
|
24
|
+
return { version: "1.0.0", escalations: [], lastUpdated: new Date().toISOString() };
|
|
25
|
+
}
|
|
26
|
+
try {
|
|
27
|
+
return JSON.parse(readFileSync(filePath, "utf-8"));
|
|
28
|
+
}
|
|
29
|
+
catch {
|
|
30
|
+
return { version: "1.0.0", escalations: [], lastUpdated: new Date().toISOString() };
|
|
31
|
+
}
|
|
32
|
+
}
|
|
33
|
+
export function saveEscalationStore(store, dir = ".") {
|
|
34
|
+
store.lastUpdated = new Date().toISOString();
|
|
35
|
+
const filePath = resolve(dir, ESCALATION_FILE);
|
|
36
|
+
writeFileSync(filePath, JSON.stringify(store, null, 2) + "\n", "utf-8");
|
|
37
|
+
}
|
|
38
|
+
export async function loadEscalationsViaAdapter(projectDir, adapter) {
|
|
39
|
+
const da = adapter ?? getDataAdapter();
|
|
40
|
+
return ((await da.loadJson("escalations", projectDir)) ?? {
|
|
41
|
+
version: "1.0.0",
|
|
42
|
+
escalations: [],
|
|
43
|
+
lastUpdated: new Date().toISOString(),
|
|
44
|
+
});
|
|
45
|
+
}
|
|
46
|
+
export async function saveEscalationsViaAdapter(store, projectDir, adapter) {
|
|
47
|
+
const da = adapter ?? getDataAdapter();
|
|
48
|
+
return da.saveJson("escalations", store, projectDir);
|
|
49
|
+
}
|
|
50
|
+
// ─── Escalation Logic ────────────────────────────────────────────────────────
|
|
51
|
+
/**
|
|
52
|
+
* Determine why a finding should be escalated.
|
|
53
|
+
*/
|
|
54
|
+
function classifyEscalationReasons(finding, verdict) {
|
|
55
|
+
const reasons = [];
|
|
56
|
+
const conf = finding.confidence ?? 0.5;
|
|
57
|
+
if (conf < DEFAULT_CONFIDENCE_THRESHOLD) {
|
|
58
|
+
reasons.push("low-confidence");
|
|
59
|
+
}
|
|
60
|
+
// Check for conflicting judge signals
|
|
61
|
+
if (verdict) {
|
|
62
|
+
const judgeVerdicts = verdict.evaluations.map((e) => e.verdict);
|
|
63
|
+
const hasPass = judgeVerdicts.includes("pass");
|
|
64
|
+
const hasFail = judgeVerdicts.includes("fail");
|
|
65
|
+
if (hasPass && hasFail) {
|
|
66
|
+
reasons.push("conflicting-judges");
|
|
67
|
+
}
|
|
68
|
+
}
|
|
69
|
+
// AI-generated code detection
|
|
70
|
+
if (finding.ruleId.startsWith("MFPR")) {
|
|
71
|
+
reasons.push("ai-generated-code");
|
|
72
|
+
}
|
|
73
|
+
// Novel pattern — absence-based with no prior feedback data
|
|
74
|
+
if (finding.isAbsenceBased && finding.provenance === "absence-of-pattern") {
|
|
75
|
+
reasons.push("cross-file-uncertainty");
|
|
76
|
+
}
|
|
77
|
+
// Compliance-sensitive rules
|
|
78
|
+
if (finding.ruleId.startsWith("COMP-") || finding.ruleId.startsWith("DSOV-") || finding.ruleId.startsWith("ETH-")) {
|
|
79
|
+
reasons.push("compliance-sensitive");
|
|
80
|
+
}
|
|
81
|
+
// High-severity with low evidence
|
|
82
|
+
if ((finding.severity === "critical" || finding.severity === "high") && conf < 0.7) {
|
|
83
|
+
reasons.push("security-critical-low-evidence");
|
|
84
|
+
}
|
|
85
|
+
return reasons.length > 0 ? reasons : ["low-confidence"];
|
|
86
|
+
}
|
|
87
|
+
/**
|
|
88
|
+
* Determine which team/role should review this escalation.
|
|
89
|
+
*/
|
|
90
|
+
function determineRouting(finding) {
|
|
91
|
+
const ruleId = finding.ruleId;
|
|
92
|
+
for (const prefix of SECURITY_PREFIXES) {
|
|
93
|
+
if (ruleId.startsWith(prefix))
|
|
94
|
+
return "security-team";
|
|
95
|
+
}
|
|
96
|
+
for (const prefix of COMPLIANCE_PREFIXES) {
|
|
97
|
+
if (ruleId.startsWith(prefix))
|
|
98
|
+
return "compliance-officer";
|
|
99
|
+
}
|
|
100
|
+
if (finding.severity === "critical")
|
|
101
|
+
return "senior-developer";
|
|
102
|
+
if (finding.severity === "high")
|
|
103
|
+
return "tech-lead";
|
|
104
|
+
return "any-human";
|
|
105
|
+
}
|
|
106
|
+
/**
 * Generate a human-readable explanation for why this finding was escalated.
 *
 * The sentence fragments always appear in a fixed order (confidence, judge
 * conflict, AI-generated code, cross-file, compliance, high-severity, novel
 * pattern), regardless of the order of `reasons`.
 *
 * @param finding - Escalated finding; reads `ruleId`, `title`, `confidence`
 *   and `severity`.
 * @param reasons - Reason codes assigned to the finding.
 * @returns A single line of the form `[ruleId] title: part; part.`
 */
function buildEscalationExplanation(finding, reasons) {
    const parts = [];
    if (reasons.includes("low-confidence")) {
        // Use the same 0.5 default the escalation logic applies to findings
        // without a confidence value; defaulting to 0 here reported "0%".
        const conf = finding.confidence ?? 0.5;
        parts.push(`Confidence is ${Math.round(conf * 100)}%, below the escalation threshold`);
    }
    if (reasons.includes("conflicting-judges")) {
        parts.push("Judges disagree on the verdict for this file");
    }
    if (reasons.includes("ai-generated-code")) {
        parts.push("AI-generated code detected — requires human verification of correctness");
    }
    if (reasons.includes("cross-file-uncertainty")) {
        parts.push("Finding depends on cross-file context that could not be verified");
    }
    if (reasons.includes("compliance-sensitive")) {
        parts.push("Compliance-sensitive finding requires human sign-off");
    }
    if (reasons.includes("security-critical-low-evidence")) {
        parts.push(`High-severity security finding (${finding.severity}) with insufficient evidence — needs expert analysis`);
    }
    if (reasons.includes("novel-pattern")) {
        parts.push("Pattern not seen before — no historical data to calibrate confidence");
    }
    return `[${finding.ruleId}] ${finding.title}: ${parts.join("; ")}.`;
}
|
|
135
|
+
/** Module-level sequence number, incremented once per generated escalation ID. */
let escalationCounter = 0;
/**
 * Build an escalation ID of the form `ESC-<timestamp>-<sequence>`.
 *
 * Both parts are base-36: the timestamp is the current epoch time in
 * milliseconds, and the sequence is the incremented counter left-padded with
 * zeros to at least four characters.
 *
 * @returns The new escalation ID.
 */
function generateEscalationId() {
    escalationCounter += 1;
    const stamp = Date.now().toString(36);
    const sequence = escalationCounter.toString(36).padStart(4, "0");
    return ["ESC", stamp, sequence].join("-");
}
|
|
145
|
+
/**
 * Decide which findings of a tribunal verdict need human escalation.
 *
 * A finding is escalated when its confidence (0.5 if absent) is below the
 * threshold, when its severity is listed in `policy.alwaysEscalateSeverities`,
 * or when its rule ID starts with one of `policy.alwaysEscalatePrefixes`.
 * Escalated findings are mutated: `needsHumanReview` is set to `true`.
 *
 * @param verdict - Tribunal verdict whose `findings` are inspected.
 * @param filePath - Path recorded on every escalation record.
 * @param policy - Optional escalation policy; without a `confidenceThreshold`
 *   the module default is used.
 * @returns One "pending" escalation record per escalated finding, all sharing
 *   the same `createdAt` timestamp.
 */
export function evaluateEscalations(verdict, filePath, policy) {
    const threshold = policy?.confidenceThreshold ?? DEFAULT_CONFIDENCE_THRESHOLD;
    const alwaysSeverities = new Set(policy?.alwaysEscalateSeverities ?? []);
    const alwaysPrefixes = policy?.alwaysEscalatePrefixes ?? [];
    const createdAt = new Date().toISOString();
    const records = [];
    for (const finding of verdict.findings) {
        // All three triggers are evaluated for every finding.
        const belowThreshold = (finding.confidence ?? 0.5) < threshold;
        const severityForced = alwaysSeverities.has(finding.severity);
        const prefixForced = alwaysPrefixes.some((prefix) => finding.ruleId.startsWith(prefix));
        if (!(belowThreshold || severityForced || prefixForced)) {
            continue;
        }
        finding.needsHumanReview = true;
        const reasons = classifyEscalationReasons(finding, verdict);
        const routing = determineRouting(finding);
        const explanation = buildEscalationExplanation(finding, reasons);
        const escalationId = generateEscalationId();
        records.push({
            escalationId,
            finding,
            filePath,
            reasons,
            routing,
            explanation,
            status: "pending",
            createdAt,
        });
    }
    return records;
}
|
|
186
|
+
/**
|
|
187
|
+
* Resolve an escalation — mark it as resolved or dismissed.
|
|
188
|
+
*/
|
|
189
|
+
export function resolveEscalation(store, escalationId, resolution) {
|
|
190
|
+
const esc = store.escalations.find((e) => e.escalationId === escalationId);
|
|
191
|
+
if (!esc || esc.status === "resolved" || esc.status === "dismissed")
|
|
192
|
+
return false;
|
|
193
|
+
esc.status = resolution.status;
|
|
194
|
+
esc.resolvedAt = new Date().toISOString();
|
|
195
|
+
esc.resolvedBy = resolution.resolvedBy;
|
|
196
|
+
esc.resolutionNotes = resolution.notes;
|
|
197
|
+
return true;
|
|
198
|
+
}
|
|
199
|
+
/**
|
|
200
|
+
* Compute summary statistics for the escalation queue.
|
|
201
|
+
*/
|
|
202
|
+
export function computeEscalationSummary(store) {
|
|
203
|
+
const byRouting = {};
|
|
204
|
+
const byReason = {};
|
|
205
|
+
let pending = 0;
|
|
206
|
+
let acknowledged = 0;
|
|
207
|
+
let resolved = 0;
|
|
208
|
+
let dismissed = 0;
|
|
209
|
+
let oldestPendingMs = 0;
|
|
210
|
+
const now = Date.now();
|
|
211
|
+
for (const esc of store.escalations) {
|
|
212
|
+
switch (esc.status) {
|
|
213
|
+
case "pending":
|
|
214
|
+
pending++;
|
|
215
|
+
break;
|
|
216
|
+
case "acknowledged":
|
|
217
|
+
acknowledged++;
|
|
218
|
+
break;
|
|
219
|
+
case "resolved":
|
|
220
|
+
resolved++;
|
|
221
|
+
break;
|
|
222
|
+
case "dismissed":
|
|
223
|
+
dismissed++;
|
|
224
|
+
break;
|
|
225
|
+
}
|
|
226
|
+
if (esc.status === "pending") {
|
|
227
|
+
const age = now - new Date(esc.createdAt).getTime();
|
|
228
|
+
if (age > oldestPendingMs)
|
|
229
|
+
oldestPendingMs = age;
|
|
230
|
+
}
|
|
231
|
+
byRouting[esc.routing] = (byRouting[esc.routing] ?? 0) + 1;
|
|
232
|
+
for (const reason of esc.reasons) {
|
|
233
|
+
byReason[reason] = (byReason[reason] ?? 0) + 1;
|
|
234
|
+
}
|
|
235
|
+
}
|
|
236
|
+
return {
|
|
237
|
+
total: store.escalations.length,
|
|
238
|
+
pending,
|
|
239
|
+
acknowledged,
|
|
240
|
+
resolved,
|
|
241
|
+
dismissed,
|
|
242
|
+
byRouting,
|
|
243
|
+
byReason,
|
|
244
|
+
oldestPendingHours: Math.round((oldestPendingMs / (1000 * 60 * 60)) * 10) / 10,
|
|
245
|
+
};
|
|
246
|
+
}
|
|
247
|
+
/**
|
|
248
|
+
* Check whether the escalation queue should block a merge.
|
|
249
|
+
* Blocks when pending escalations exceed the policy limit.
|
|
250
|
+
*/
|
|
251
|
+
export function shouldBlockOnEscalations(store, policy) {
|
|
252
|
+
const maxPending = policy?.maxPendingBeforeBlock ?? 0;
|
|
253
|
+
if (maxPending <= 0)
|
|
254
|
+
return false;
|
|
255
|
+
const pending = store.escalations.filter((e) => e.status === "pending").length;
|
|
256
|
+
return pending >= maxPending;
|
|
257
|
+
}
|
|
258
|
+
/**
|
|
259
|
+
* Enhance a ReviewDecision with escalation information.
|
|
260
|
+
* When escalations exist, the review action may be upgraded to "request-changes"
|
|
261
|
+
* to ensure a human signs off.
|
|
262
|
+
*/
|
|
263
|
+
export function enhanceReviewWithEscalations(decision, escalations) {
|
|
264
|
+
if (escalations.length === 0)
|
|
265
|
+
return decision;
|
|
266
|
+
const pendingCount = escalations.filter((e) => e.status === "pending").length;
|
|
267
|
+
// If there are pending escalations, upgrade to at least "comment"
|
|
268
|
+
let action = decision.action;
|
|
269
|
+
if (pendingCount > 0 && action === "approve") {
|
|
270
|
+
action = "comment";
|
|
271
|
+
}
|
|
272
|
+
// Critical escalations force request-changes
|
|
273
|
+
const hasCriticalEscalation = escalations.some((e) => e.status === "pending" &&
|
|
274
|
+
(e.reasons.includes("security-critical-low-evidence") || e.reasons.includes("compliance-sensitive")));
|
|
275
|
+
if (hasCriticalEscalation) {
|
|
276
|
+
action = "request-changes";
|
|
277
|
+
}
|
|
278
|
+
const escalationSummary = `\n\n**Escalation Notice**: ${pendingCount} finding(s) flagged for human review. ` +
|
|
279
|
+
`Routing: ${[...new Set(escalations.map((e) => e.routing))].join(", ")}.`;
|
|
280
|
+
return {
|
|
281
|
+
...decision,
|
|
282
|
+
action,
|
|
283
|
+
summary: decision.summary + escalationSummary,
|
|
284
|
+
blockingIssues: [
|
|
285
|
+
...decision.blockingIssues,
|
|
286
|
+
...escalations
|
|
287
|
+
.filter((e) => e.status === "pending")
|
|
288
|
+
.slice(0, 3)
|
|
289
|
+
.map((e) => `[ESCALATED] ${e.explanation}`),
|
|
290
|
+
],
|
|
291
|
+
};
|
|
292
|
+
}
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Evaluation session — persistent context that survives across multiple
|
|
3
|
+
* evaluation calls within the same session (MCP connection, VS Code
|
|
4
|
+
* extension lifetime, or CLI watch mode).
|
|
5
|
+
*
|
|
6
|
+
* Avoids redundant framework detection, capability scanning, and feedback
|
|
7
|
+
* loading. Tracks verdict evolution per file for stability detection.
|
|
8
|
+
*/
|
|
9
|
+
import type { SessionContext, TribunalVerdict } from "./types.js";
|
|
10
|
+
/**
|
|
11
|
+
* An evaluation session that accumulates project knowledge across calls.
|
|
12
|
+
*/
|
|
13
|
+
export declare class EvaluationSession {
|
|
14
|
+
private ctx;
|
|
15
|
+
constructor();
|
|
16
|
+
/** Get the current session context. Typed as read-only, but this is the live session object rather than a copied snapshot. */
|
|
17
|
+
getContext(): Readonly<SessionContext>;
|
|
18
|
+
/** Number of evaluations performed. */
|
|
19
|
+
get evaluationCount(): number;
|
|
20
|
+
/** Record detected frameworks (deduplicated). */
|
|
21
|
+
addFrameworks(frameworks: string[]): void;
|
|
22
|
+
/** Record detected project capabilities (e.g. "rate-limiting", "auth"). */
|
|
23
|
+
addCapabilities(caps: Iterable<string>): void;
|
|
24
|
+
/** Get accumulated capabilities for absence-based finding suppression. */
|
|
25
|
+
getCapabilities(): Set<string>;
|
|
26
|
+
/**
|
|
27
|
+
* Record an evaluation result for a file. Tracks verdict history
|
|
28
|
+
* so repeated evaluations can detect stability (converging scores).
|
|
29
|
+
*/
|
|
30
|
+
recordEvaluation(filePath: string, code: string, verdict: TribunalVerdict): void;
|
|
31
|
+
/**
|
|
32
|
+
* Check if a file's verdict is stable — same score and finding count
|
|
33
|
+
* across the last N evaluations. Returns true if stable (skip re-eval).
|
|
34
|
+
*/
|
|
35
|
+
isVerdictStable(filePath: string, minRuns?: number): boolean;
|
|
36
|
+
/**
|
|
37
|
+
* Check if a file has already been evaluated with the same content.
|
|
38
|
+
*/
|
|
39
|
+
hasEvaluated(filePath: string, code: string): boolean;
|
|
40
|
+
/**
|
|
41
|
+
* Get verdict history for a file — most recent first.
|
|
42
|
+
*/
|
|
43
|
+
getVerdictHistory(filePath: string): Array<{
|
|
44
|
+
score: number;
|
|
45
|
+
findingCount: number;
|
|
46
|
+
timestamp: string;
|
|
47
|
+
}>;
|
|
48
|
+
/** Reset the session (clear all accumulated context). */
|
|
49
|
+
reset(): void;
|
|
50
|
+
/**
|
|
51
|
+
* Record user feedback for a finding rule.
|
|
52
|
+
* tp = true positive, fp = false positive, wontfix = acknowledged but skipped.
|
|
53
|
+
*/
|
|
54
|
+
recordFeedback(ruleId: string, verdict: "tp" | "fp" | "wontfix"): void;
|
|
55
|
+
/**
|
|
56
|
+
* Get a confidence penalty for a rule based on accumulated FP feedback.
|
|
57
|
+
* Returns a multiplier in (0, 1] — 1.0 means no penalty, lower means
|
|
58
|
+
* the rule has been flagged as FP frequently and confidence should be reduced.
|
|
59
|
+
*
|
|
60
|
+
* Formula: 1 / (1 + fpCount) — degrades smoothly as FP reports accumulate.
|
|
61
|
+
* A single FP report halves confidence; two reports reduce it to 1/3, etc.
|
|
62
|
+
*/
|
|
63
|
+
getConfidencePenalty(ruleId: string): number;
|
|
64
|
+
/** Get the raw feedback tally for all rules. */
|
|
65
|
+
getFeedbackTally(): ReadonlyMap<string, {
|
|
66
|
+
tp: number;
|
|
67
|
+
fp: number;
|
|
68
|
+
wontfix: number;
|
|
69
|
+
}>;
|
|
70
|
+
}
|
|
71
|
+
/** Get or create the global evaluation session (shared across MCP calls). */
|
|
72
|
+
export declare function getGlobalSession(): EvaluationSession;
|
|
73
|
+
/** Reset the global session (for testing or explicit reset). */
|
|
74
|
+
export declare function resetGlobalSession(): void;
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Evaluation session — persistent context that survives across multiple
|
|
3
|
+
* evaluation calls within the same session (MCP connection, VS Code
|
|
4
|
+
* extension lifetime, or CLI watch mode).
|
|
5
|
+
*
|
|
6
|
+
* Avoids redundant framework detection, capability scanning, and feedback
|
|
7
|
+
* loading. Tracks verdict evolution per file for stability detection.
|
|
8
|
+
*/
|
|
9
|
+
import { contentHash } from "./cache.js";
|
|
10
|
+
/**
|
|
11
|
+
* An evaluation session that accumulates project knowledge across calls.
|
|
12
|
+
*/
|
|
13
|
+
export class EvaluationSession {
|
|
14
|
+
ctx;
|
|
15
|
+
constructor() {
|
|
16
|
+
this.ctx = {
|
|
17
|
+
frameworks: [],
|
|
18
|
+
capabilities: new Set(),
|
|
19
|
+
verdictHistory: new Map(),
|
|
20
|
+
evaluatedFiles: new Map(),
|
|
21
|
+
startedAt: new Date().toISOString(),
|
|
22
|
+
evaluationCount: 0,
|
|
23
|
+
feedbackTally: new Map(),
|
|
24
|
+
};
|
|
25
|
+
}
|
|
26
|
+
/** Get the current session context (read-only snapshot). */
|
|
27
|
+
getContext() {
|
|
28
|
+
return this.ctx;
|
|
29
|
+
}
|
|
30
|
+
/** Number of evaluations performed. */
|
|
31
|
+
get evaluationCount() {
|
|
32
|
+
return this.ctx.evaluationCount;
|
|
33
|
+
}
|
|
34
|
+
/** Record detected frameworks (deduplicated). */
|
|
35
|
+
addFrameworks(frameworks) {
|
|
36
|
+
const existing = new Set(this.ctx.frameworks);
|
|
37
|
+
for (const fw of frameworks) {
|
|
38
|
+
if (!existing.has(fw)) {
|
|
39
|
+
this.ctx.frameworks.push(fw);
|
|
40
|
+
existing.add(fw);
|
|
41
|
+
}
|
|
42
|
+
}
|
|
43
|
+
}
|
|
44
|
+
/** Record detected project capabilities (e.g. "rate-limiting", "auth"). */
|
|
45
|
+
addCapabilities(caps) {
|
|
46
|
+
for (const cap of caps) {
|
|
47
|
+
this.ctx.capabilities.add(cap);
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
/** Get accumulated capabilities for absence-based finding suppression. */
|
|
51
|
+
getCapabilities() {
|
|
52
|
+
return this.ctx.capabilities;
|
|
53
|
+
}
|
|
54
|
+
/**
|
|
55
|
+
* Record an evaluation result for a file. Tracks verdict history
|
|
56
|
+
* so repeated evaluations can detect stability (converging scores).
|
|
57
|
+
*/
|
|
58
|
+
recordEvaluation(filePath, code, verdict) {
|
|
59
|
+
this.ctx.evaluationCount++;
|
|
60
|
+
const hash = contentHash(code, filePath);
|
|
61
|
+
this.ctx.evaluatedFiles.set(hash, filePath);
|
|
62
|
+
const history = this.ctx.verdictHistory.get(filePath) ?? [];
|
|
63
|
+
history.push({
|
|
64
|
+
score: verdict.overallScore,
|
|
65
|
+
findingCount: verdict.findings.length,
|
|
66
|
+
timestamp: verdict.timestamp,
|
|
67
|
+
});
|
|
68
|
+
// Keep last 10 evaluations per file
|
|
69
|
+
if (history.length > 10)
|
|
70
|
+
history.shift();
|
|
71
|
+
this.ctx.verdictHistory.set(filePath, history);
|
|
72
|
+
}
|
|
73
|
+
/**
|
|
74
|
+
* Check if a file's verdict is stable — same score and finding count
|
|
75
|
+
* across the last N evaluations. Returns true if stable (skip re-eval).
|
|
76
|
+
*/
|
|
77
|
+
isVerdictStable(filePath, minRuns = 3) {
|
|
78
|
+
const history = this.ctx.verdictHistory.get(filePath);
|
|
79
|
+
if (!history || history.length < minRuns)
|
|
80
|
+
return false;
|
|
81
|
+
const recent = history.slice(-minRuns);
|
|
82
|
+
const firstScore = recent[0].score;
|
|
83
|
+
const firstCount = recent[0].findingCount;
|
|
84
|
+
return recent.every((h) => h.score === firstScore && h.findingCount === firstCount);
|
|
85
|
+
}
|
|
86
|
+
/**
|
|
87
|
+
* Check if a file has already been evaluated with the same content.
|
|
88
|
+
*/
|
|
89
|
+
hasEvaluated(filePath, code) {
|
|
90
|
+
const hash = contentHash(code, filePath);
|
|
91
|
+
return this.ctx.evaluatedFiles.has(hash);
|
|
92
|
+
}
|
|
93
|
+
/**
|
|
94
|
+
* Get verdict history for a file — most recent first.
|
|
95
|
+
*/
|
|
96
|
+
getVerdictHistory(filePath) {
|
|
97
|
+
return [...(this.ctx.verdictHistory.get(filePath) ?? [])].reverse();
|
|
98
|
+
}
|
|
99
|
+
/** Reset the session (clear all accumulated context). */
|
|
100
|
+
reset() {
|
|
101
|
+
this.ctx = {
|
|
102
|
+
frameworks: [],
|
|
103
|
+
capabilities: new Set(),
|
|
104
|
+
verdictHistory: new Map(),
|
|
105
|
+
evaluatedFiles: new Map(),
|
|
106
|
+
startedAt: new Date().toISOString(),
|
|
107
|
+
evaluationCount: 0,
|
|
108
|
+
feedbackTally: new Map(),
|
|
109
|
+
};
|
|
110
|
+
}
|
|
111
|
+
/**
|
|
112
|
+
* Record user feedback for a finding rule.
|
|
113
|
+
* tp = true positive, fp = false positive, wontfix = acknowledged but skipped.
|
|
114
|
+
*/
|
|
115
|
+
recordFeedback(ruleId, verdict) {
|
|
116
|
+
const existing = this.ctx.feedbackTally.get(ruleId) ?? { tp: 0, fp: 0, wontfix: 0 };
|
|
117
|
+
existing[verdict]++;
|
|
118
|
+
this.ctx.feedbackTally.set(ruleId, existing);
|
|
119
|
+
}
|
|
120
|
+
/**
|
|
121
|
+
* Get a confidence penalty for a rule based on accumulated FP feedback.
|
|
122
|
+
* Returns a multiplier in (0, 1] — 1.0 means no penalty, lower means
|
|
123
|
+
* the rule has been flagged as FP frequently and confidence should be reduced.
|
|
124
|
+
*
|
|
125
|
+
* Formula: 1 / (1 + fpCount) — degrades smoothly as FP reports accumulate.
|
|
126
|
+
* A single FP report halves confidence; two reports reduce it to 1/3, etc.
|
|
127
|
+
*/
|
|
128
|
+
getConfidencePenalty(ruleId) {
|
|
129
|
+
const tally = this.ctx.feedbackTally.get(ruleId);
|
|
130
|
+
if (!tally || tally.fp === 0)
|
|
131
|
+
return 1.0;
|
|
132
|
+
return 1 / (1 + tally.fp);
|
|
133
|
+
}
|
|
134
|
+
/** Get the raw feedback tally for all rules. */
|
|
135
|
+
getFeedbackTally() {
|
|
136
|
+
return this.ctx.feedbackTally;
|
|
137
|
+
}
|
|
138
|
+
}
|
|
139
|
+
// ─── Singleton for MCP Server / Extension lifetime ──────────────────────────
|
|
140
|
+
let _globalSession;
|
|
141
|
+
/** Get or create the global evaluation session (shared across MCP calls). */
|
|
142
|
+
export function getGlobalSession() {
|
|
143
|
+
if (!_globalSession) {
|
|
144
|
+
_globalSession = new EvaluationSession();
|
|
145
|
+
}
|
|
146
|
+
return _globalSession;
|
|
147
|
+
}
|
|
148
|
+
/** Reset the global session (for testing or explicit reset). */
|
|
149
|
+
export function resetGlobalSession() {
|
|
150
|
+
_globalSession?.reset();
|
|
151
|
+
_globalSession = undefined;
|
|
152
|
+
}
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import type { JudgeDefinition, JudgeEvaluation, TribunalVerdict, ProjectVerdict, DiffVerdict, Finding, MustFixGateOptions, JudgesConfig, SuppressionResult } from "../types.js";
|
|
1
|
+
import type { JudgeDefinition, JudgeEvaluation, TribunalVerdict, ProjectVerdict, DiffVerdict, Finding, MustFixGateOptions, JudgesConfig, SuppressionResult, StreamingBatch } from "../types.js";
|
|
2
2
|
import type { CodeStructure } from "../ast/types.js";
|
|
3
3
|
import type { TaintFlow } from "../ast/taint-tracker.js";
|
|
4
4
|
import { formatVerdictAsMarkdown, formatEvaluationAsMarkdown } from "./shared.js";
|
|
@@ -53,6 +53,12 @@ export interface EvaluationOptions {
|
|
|
53
53
|
* Generated by `scanProjectCapabilities()` from the project evaluator.
|
|
54
54
|
*/
|
|
55
55
|
projectCapabilities?: Set<string>;
|
|
56
|
+
/**
|
|
57
|
+
* Enable adaptive judge selection — automatically skip judges that are
|
|
58
|
+
* irrelevant to the file's language, framework, or role. Reduces noise
|
|
59
|
+
* and improves performance. Defaults to false (run all judges).
|
|
60
|
+
*/
|
|
61
|
+
adaptiveSelection?: boolean;
|
|
56
62
|
/** @internal — pre-computed AST structure for the file (set by evaluateWithTribunal) */
|
|
57
63
|
_astCache?: CodeStructure;
|
|
58
64
|
/** @internal — pre-computed taint flows for the file (set by evaluateWithTribunal) */
|
|
@@ -88,6 +94,22 @@ export declare function evaluateWithJudge(judge: JudgeDefinition, code: string,
|
|
|
88
94
|
* Run the full tribunal — all judges evaluate the code.
|
|
89
95
|
*/
|
|
90
96
|
export declare function evaluateWithTribunal(code: string, language: string, context?: string, options?: EvaluationOptions): TribunalVerdict;
|
|
97
|
+
/**
|
|
98
|
+
* Streaming tribunal evaluation — yields per-judge results as each judge
|
|
99
|
+
* completes, enabling progressive UI updates and early termination.
|
|
100
|
+
*
|
|
101
|
+
* Each yielded `StreamingBatch` contains the judge evaluation, execution
|
|
102
|
+
* trace, and running aggregate statistics.
|
|
103
|
+
*
|
|
104
|
+
* Usage:
|
|
105
|
+
* ```ts
|
|
106
|
+
* for await (const batch of evaluateWithTribunalStreaming(code, lang)) {
|
|
107
|
+
* console.log(`${batch.judgeName}: ${batch.evaluation.findings.length} findings`);
|
|
108
|
+
* if (batch.aggregate.criticalSoFar > 10) break; // early termination
|
|
109
|
+
* }
|
|
110
|
+
* ```
|
|
111
|
+
*/
|
|
112
|
+
export declare function evaluateWithTribunalStreaming(code: string, language: string, context?: string, options?: EvaluationOptions): AsyncGenerator<StreamingBatch>;
|
|
91
113
|
export { scanProjectWideSecurityPatterns } from "./project.js";
|
|
92
114
|
export declare function evaluateProject(files: Array<{
|
|
93
115
|
path: string;
|