@kevinrabun/judges 3.125.0 → 3.126.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/api.d.ts +2 -1
- package/dist/api.js +2 -0
- package/dist/cli-formatters.js +38 -0
- package/dist/cli.js +27 -1
- package/dist/evaluators/index.js +163 -1
- package/dist/evaluators/shared.js +33 -0
- package/dist/regulatory-scope.d.ts +27 -0
- package/dist/regulatory-scope.js +181 -0
- package/dist/tools/prompts.d.ts +1 -1
- package/dist/tools/prompts.js +3 -1
- package/dist/types.d.ts +87 -0
- package/judgesrc.schema.json +14 -0
- package/package.json +2 -2
- package/server.json +2 -2
package/dist/api.d.ts
CHANGED
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
* const result = evaluateCode("const x = eval(input);", "typescript");
|
|
9
9
|
* ```
|
|
10
10
|
*/
|
|
11
|
-
export type { Severity, Verdict, Finding, Patch, LangFamily, JudgesConfig, RuleOverride, ProjectFile, ProjectVerdict, DiffVerdict, DependencyEntry, DependencyVerdict, JudgeEvaluation, TribunalVerdict, JudgeDefinition, EvaluationContextV2, EvidenceBundleV2, SpecializedFindingV2, TribunalVerdictV2, MustFixGateOptions, MustFixGateResult, AppBuilderWorkflowResult, PlainLanguageFinding, WorkflowTask, PolicyProfile, SuppressionRecord, SuppressionResult, ExecutionTrace, RuleTrace, StreamingBatch, JudgeSelectionContext, JudgeSelectionResult, SessionContext, } from "./types.js";
|
|
11
|
+
export type { Severity, Verdict, Finding, Patch, LangFamily, JudgesConfig, RuleOverride, ProjectFile, ProjectVerdict, DiffVerdict, DependencyEntry, DependencyVerdict, JudgeEvaluation, TribunalVerdict, JudgeDefinition, EvaluationContextV2, EvidenceBundleV2, SpecializedFindingV2, TribunalVerdictV2, MustFixGateOptions, MustFixGateResult, AppBuilderWorkflowResult, PlainLanguageFinding, WorkflowTask, PolicyProfile, SuppressionRecord, SuppressionResult, ExecutionTrace, RuleTrace, StreamingBatch, JudgeSelectionContext, JudgeSelectionResult, SessionContext, HumanFocusGuide, FocusItem, BlindSpot, } from "./types.js";
|
|
12
12
|
export { JudgesError, ConfigError, EvaluationError, ParseError } from "./errors.js";
|
|
13
13
|
export { parseConfig, defaultConfig, mergeConfigs, discoverCascadingConfigs, loadCascadingConfig, loadConfigFile, expandEnvPlaceholders, loadPluginJudges, validatePluginSpecifiers, isValidJudgeDefinition, validateJudgeDefinition, applyOverridesForFile, applyLanguageProfile, resolveExtendsConfig, } from "./config.js";
|
|
14
14
|
export { EXT_TO_LANG, SUPPORTED_EXTENSIONS, detectLanguageFromPath } from "./ext-to-lang.js";
|
|
@@ -47,6 +47,7 @@ export { runFeedbackLoop, formatFeedbackLoopReport } from "./feedback-loop.js";
|
|
|
47
47
|
export type { FeedbackLoopResult, ConfidenceAdjustment, FeedbackLoopStats } from "./feedback-loop.js";
|
|
48
48
|
export { registerPlugin, unregisterPlugin, getRegisteredPlugins, getCustomRules, getPluginJudges, evaluateCustomRules, runBeforeHooks, runAfterHooks, clearPlugins, } from "./plugins.js";
|
|
49
49
|
export type { CustomRule, JudgesPlugin, PluginRegistration } from "./plugins.js";
|
|
50
|
+
export { filterByRegulatoryScope, getSupportedFrameworks } from "./regulatory-scope.js";
|
|
50
51
|
export { JudgeRegistry, defaultRegistry } from "./judge-registry.js";
|
|
51
52
|
export { parseFrontmatter, validateFrontmatter, parseAgentFile, resolveEvaluator, agentToJudgeDefinition, loadAgentDirectory, loadAndRegisterAgents, } from "./agent-loader.js";
|
|
52
53
|
export type { AgentFrontmatter, ParsedAgent } from "./agent-loader.js";
|
package/dist/api.js
CHANGED
|
@@ -56,6 +56,8 @@ export { getAgentCard, createTask, getTask, completeTask, failTask, listTasks, p
|
|
|
56
56
|
export { runFeedbackLoop, formatFeedbackLoopReport } from "./feedback-loop.js";
|
|
57
57
|
// ─── Plugin API ──────────────────────────────────────────────────────────────
|
|
58
58
|
export { registerPlugin, unregisterPlugin, getRegisteredPlugins, getCustomRules, getPluginJudges, evaluateCustomRules, runBeforeHooks, runAfterHooks, clearPlugins, } from "./plugins.js";
|
|
59
|
+
// ─── Regulatory Scope ────────────────────────────────────────────────────────
|
|
60
|
+
export { filterByRegulatoryScope, getSupportedFrameworks } from "./regulatory-scope.js";
|
|
59
61
|
// ─── Judge Registry ──────────────────────────────────────────────────────────
|
|
60
62
|
export { JudgeRegistry, defaultRegistry } from "./judge-registry.js";
|
|
61
63
|
// ─── Agent Markdown Loader ───────────────────────────────────────────────────
|
package/dist/cli-formatters.js
CHANGED
|
@@ -136,6 +136,44 @@ export function formatTextOutput(verdict) {
|
|
|
136
136
|
}
|
|
137
137
|
lines.push("");
|
|
138
138
|
}
|
|
139
|
+
// Human Focus Guide
|
|
140
|
+
if (verdict.humanFocusGuide) {
|
|
141
|
+
const guide = verdict.humanFocusGuide;
|
|
142
|
+
lines.push(" 👤 Human Reviewer Focus Guide");
|
|
143
|
+
lines.push(" " + "─".repeat(60));
|
|
144
|
+
lines.push(` ${guide.summary}`);
|
|
145
|
+
lines.push("");
|
|
146
|
+
if (guide.trust.length > 0) {
|
|
147
|
+
lines.push(" ✅ TRUST (act on these directly):");
|
|
148
|
+
for (const item of guide.trust.slice(0, 10)) {
|
|
149
|
+
const lineRef = item.lineNumbers?.[0] ? ` L${item.lineNumbers[0]}` : "";
|
|
150
|
+
lines.push(` [${item.severity.toUpperCase()}] ${item.ruleId}${lineRef}: ${item.title}`);
|
|
151
|
+
lines.push(` ${item.reason}`);
|
|
152
|
+
}
|
|
153
|
+
if (guide.trust.length > 10)
|
|
154
|
+
lines.push(` ... and ${guide.trust.length - 10} more`);
|
|
155
|
+
lines.push("");
|
|
156
|
+
}
|
|
157
|
+
if (guide.verify.length > 0) {
|
|
158
|
+
lines.push(" 🔍 VERIFY (use your judgment):");
|
|
159
|
+
for (const item of guide.verify.slice(0, 10)) {
|
|
160
|
+
const lineRef = item.lineNumbers?.[0] ? ` L${item.lineNumbers[0]}` : "";
|
|
161
|
+
lines.push(` [${item.severity.toUpperCase()}] ${item.ruleId}${lineRef}: ${item.title}`);
|
|
162
|
+
lines.push(` ${item.reason}`);
|
|
163
|
+
}
|
|
164
|
+
if (guide.verify.length > 10)
|
|
165
|
+
lines.push(` ... and ${guide.verify.length - 10} more`);
|
|
166
|
+
lines.push("");
|
|
167
|
+
}
|
|
168
|
+
if (guide.blindSpots.length > 0) {
|
|
169
|
+
lines.push(" 🔦 BLIND SPOTS (automated analysis cannot evaluate):");
|
|
170
|
+
for (const spot of guide.blindSpots) {
|
|
171
|
+
lines.push(` • ${spot.area}`);
|
|
172
|
+
lines.push(` ${spot.guidance.slice(0, 120)}${spot.guidance.length > 120 ? "…" : ""}`);
|
|
173
|
+
}
|
|
174
|
+
lines.push("");
|
|
175
|
+
}
|
|
176
|
+
}
|
|
139
177
|
// Exit guidance
|
|
140
178
|
if (verdict.overallVerdict === "fail") {
|
|
141
179
|
lines.push(" ⛔ FAIL — This code has issues that should be addressed before shipping.");
|
package/dist/cli.js
CHANGED
|
@@ -43,6 +43,7 @@ import { formatComparisonReport, formatFullComparisonMatrix, TOOL_PROFILES } fro
|
|
|
43
43
|
import { loadOverrideStore, applyOverrides } from "./commands/override.js";
|
|
44
44
|
import { runGit } from "./tools/command-safety.js";
|
|
45
45
|
import { detectLanguageFromPath, SUPPORTED_EXTENSIONS } from "./ext-to-lang.js";
|
|
46
|
+
import { getSupportedFrameworks } from "./regulatory-scope.js";
|
|
46
47
|
import { formatTribunalOutput, writeOutputIfSpecified, formatSingleJudgeTextOutput, } from "./cli-formatters.js";
|
|
47
48
|
import { COMMAND_TABLE } from "./cli-dispatch.js";
|
|
48
49
|
// ─── Language Detection ─────────────────────────────────────────────────────
|
|
@@ -226,6 +227,8 @@ function printHelp() {
|
|
|
226
227
|
* over-promising features that aren't wired yet.
|
|
227
228
|
*/
|
|
228
229
|
const coreCommands = [
|
|
230
|
+
["judges list", "List all available judges"],
|
|
231
|
+
["judges list --frameworks", "List supported regulatory frameworks"],
|
|
229
232
|
["judges eval [options] [file]", "Evaluate code with the full tribunal"],
|
|
230
233
|
["judges eval --judge <id> [file]", "Evaluate with a single judge"],
|
|
231
234
|
["judges init", "Interactive project setup wizard"],
|
|
@@ -485,6 +488,24 @@ function listJudges() {
|
|
|
485
488
|
console.log(` Total: ${judges.length} judges`);
|
|
486
489
|
console.log("");
|
|
487
490
|
}
|
|
491
|
+
// ─── List Regulatory Frameworks ─────────────────────────────────────────────
|
|
492
|
+
/**
 * Print the table of supported regulatory frameworks to stdout, followed by
 * the framework count and an example `.judgesrc` snippet.
 *
 * Output is emitted one `console.log` call per line, in a fixed order:
 * header, one row per framework (ID padded to 15 columns), then the footer.
 */
function listFrameworks() {
    const frameworks = getSupportedFrameworks();
    const header = [
        "",
        " Supported Regulatory Frameworks:",
        " " + "─".repeat(60),
        " Use these IDs in .judgesrc → regulatoryScope: [...]",
        "",
    ];
    const rows = frameworks.map((fw) => ` ${fw.id.padEnd(15)} ${fw.description}`);
    const footer = [
        "",
        ` Total: ${frameworks.length} frameworks`,
        "",
        " Example .judgesrc:",
        ' { "regulatoryScope": ["GDPR", "PCI-DSS", "SOC2"] }',
        "",
    ];
    for (const line of [...header, ...rows, ...footer]) {
        console.log(line);
    }
}
|
|
488
509
|
// ─── Version ────────────────────────────────────────────────────────────────
|
|
489
510
|
function getPackageVersion() {
|
|
490
511
|
try {
|
|
@@ -681,7 +702,12 @@ export async function runCli(argv) {
|
|
|
681
702
|
}
|
|
682
703
|
// ─── List Command ────────────────────────────────────────────────────
|
|
683
704
|
if (args.command === "list") {
|
|
684
|
-
|
|
705
|
+
if (argv.includes("--frameworks")) {
|
|
706
|
+
listFrameworks();
|
|
707
|
+
}
|
|
708
|
+
else {
|
|
709
|
+
listJudges();
|
|
710
|
+
}
|
|
685
711
|
process.exit(0);
|
|
686
712
|
}
|
|
687
713
|
// ─── Eval Command ────────────────────────────────────────────────────
|
package/dist/evaluators/index.js
CHANGED
|
@@ -6,6 +6,7 @@ import { analyzeStructure } from "../ast/index.js";
|
|
|
6
6
|
import { analyzeTaintFlows } from "../ast/index.js";
|
|
7
7
|
import { LRUCache, contentHash } from "../cache.js";
|
|
8
8
|
import { getSharedDiskCache } from "../disk-cache.js";
|
|
9
|
+
import { filterByRegulatoryScope } from "../regulatory-scope.js";
|
|
9
10
|
// ─── Shared Utilities ────────────────────────────────────────────────────────
|
|
10
11
|
import { calculateScore, deriveVerdict, buildSummary, buildTribunalSummary, formatVerdictAsMarkdown, formatEvaluationAsMarkdown, classifyFile, shouldRunAbsenceRules, applyConfig, applyFrameworkAwareness, } from "./shared.js";
|
|
11
12
|
// ─── Extracted Modules ───────────────────────────────────────────────────────
|
|
@@ -414,6 +415,137 @@ function synthesizeReviewDecision(findings) {
|
|
|
414
415
|
blockingIssues,
|
|
415
416
|
};
|
|
416
417
|
}
|
|
418
|
+
// ─── Human Focus Guide ────────────────────────────────────────────────────────
|
|
419
|
+
/**
 * Collect the "blind spot" advisories for a piece of code: areas a human
 * reviewer should look at because automated analysis cannot judge them.
 *
 * The business-logic advisory is always present. The remaining advisories are
 * keyword/regex heuristics over the raw source text and are only evaluated
 * when `code` is a non-empty string.
 *
 * @param {string | undefined} code - Source text that was evaluated.
 * @returns {Array<{ area: string, guidance: string }>} Advisories in display order.
 */
function collectBlindSpots(code) {
    const blindSpots = [
        {
            area: "Business Logic Correctness",
            guidance: "Verify that the code implements the intended requirements correctly. Automated analysis checks for patterns and vulnerabilities but cannot validate business rules, domain constraints, or functional correctness.",
        },
    ];
    if (!code) {
        return blindSpots;
    }
    const lineCount = code.split("\n").length;
    // Complex branching: flagged only when branch keywords are both numerous
    // (> 10) and dense (> 15% of the line count).
    const branchCount = (code.match(/\bif\b|\belse\b|\bswitch\b|\bcase\b|\?\s*:/g) || []).length;
    if (branchCount > lineCount * 0.15 && branchCount > 10) {
        blindSpots.push({
            area: "Complex Control Flow",
            guidance: `This code has dense branching logic (~${branchCount} branch points). Review edge cases, boundary conditions, and off-by-one errors that pattern matching cannot reliably detect.`,
        });
    }
    // External API/service calls
    if (/fetch\(|axios\.|http\.|https\.|\.request\(|urllib|requests\.|HttpClient|WebClient/i.test(code)) {
        blindSpots.push({
            area: "External Service Integration",
            guidance: "This code calls external services. Verify timeout behavior, retry logic, circuit breaking, and graceful degradation when services are unavailable. Automated analysis can detect missing patterns but cannot validate the integration logic.",
        });
    }
    // Financial/monetary operations
    if (/price|amount|balance|payment|invoice|refund|discount|tax|currency|decimal|money/i.test(code)) {
        blindSpots.push({
            area: "Financial/Monetary Calculations",
            guidance: "This code handles monetary values. Verify rounding behavior, currency precision, and that floating-point arithmetic is not used for financial calculations.",
        });
    }
    // Complex regex: any slash-delimited run of 30+ characters on one line.
    const complexRegex = (code.match(/\/[^/\n]{30,}\//g) || []).length;
    if (complexRegex > 0) {
        blindSpots.push({
            area: "Complex Regular Expressions",
            guidance: `Found ${complexRegex} complex regex pattern(s). Verify they match the intended inputs and don't have catastrophic backtracking on adversarial input.`,
        });
    }
    // State machines / workflow
    if (/state\s*[=:]\s*['"][^'"]+['"]|status\s*===?\s*['"]|transition|workflow|step.*next/i.test(code)) {
        blindSpots.push({
            area: "State Management / Workflow Logic",
            guidance: "This code manages state transitions or workflow steps. Verify that all valid state transitions are handled and invalid transitions are rejected. Automated analysis cannot validate state machine correctness.",
        });
    }
    // PII/sensitive data handling
    if (/\b(email|ssn|social.security|phone.number|address|birth.date|passport|national.id|credit.card)\b/i.test(code)) {
        blindSpots.push({
            area: "PII / Sensitive Data Handling",
            guidance: "This code handles personally identifiable information. Verify data minimization, consent tracking, retention policies, and that PII is not logged or transmitted unnecessarily.",
        });
    }
    // Architecture blind spot: only raised for files longer than 50 lines.
    if (lineCount > 50) {
        blindSpots.push({
            area: "Architectural Fit",
            guidance: "Verify this code fits the project's architectural patterns (service boundaries, dependency direction, naming conventions). Automated analysis evaluates code in isolation and cannot assess architectural context.",
        });
    }
    return blindSpots;
}
/**
 * Synthesize a Human Focus Guide from tribunal findings.
 *
 * Categorizes findings into three buckets:
 * - **Trust**: findings with confidence ≥ 0.8 that are not absence-based
 * - **Verify**: absence-based findings and findings with confidence < 0.8
 * - **Blind spots**: areas automated analysis cannot evaluate (see `collectBlindSpots`)
 *
 * A finding without a `confidence` value is treated as 0.7 and therefore lands
 * in "Verify".
 *
 * @param {Array<object>} findings - Findings to categorize; reads `ruleId`, `title`,
 *   `severity`, `confidence`, `lineNumbers`, `isAbsenceBased` and `provenance`.
 * @param {string | undefined} code - Source text, used only for blind-spot heuristics.
 * @param {string} language - Currently unused; kept so the call signature stays stable.
 * @returns {{ trust: object[], verify: object[], blindSpots: object[], summary: string }}
 */
function synthesizeHumanFocusGuide(findings, code, language) {
    const trust = [];
    const verify = [];
    for (const f of findings) {
        const conf = f.confidence ?? 0.7;
        const item = {
            ruleId: f.ruleId,
            title: f.title,
            severity: f.severity,
            confidence: conf,
            lineNumbers: f.lineNumbers,
            reason: "",
        };
        if (f.isAbsenceBased) {
            // Absence-based findings always need a human, whatever their confidence.
            item.reason = "Absence-based — the detected issue may be handled in another file";
            verify.push(item);
        }
        else if (conf >= 0.8) {
            const structurallyConfirmed = f.provenance === "ast-confirmed" || f.provenance === "taint-flow";
            item.reason = structurallyConfirmed
                ? "AST/taint-flow confirmed with high confidence"
                : "High confidence with concrete evidence";
            trust.push(item);
        }
        else if (conf >= 0.5) {
            item.reason = `Moderate confidence (${Math.round(conf * 100)}%) — verify manually`;
            verify.push(item);
        }
        else {
            item.reason = `Low confidence (${Math.round(conf * 100)}%) — may be a false positive`;
            verify.push(item);
        }
    }
    const blindSpots = collectBlindSpots(code);
    // ── Build summary ──
    const trustCount = trust.length;
    const verifyCount = verify.length;
    const blindCount = blindSpots.length;
    const parts = [];
    if (trustCount > 0) {
        parts.push(`${trustCount} high-confidence finding${trustCount > 1 ? "s" : ""} you can act on directly`);
    }
    if (verifyCount > 0) {
        parts.push(`${verifyCount} finding${verifyCount > 1 ? "s" : ""} that need your judgment`);
    }
    if (blindCount > 0) {
        parts.push(`${blindCount} area${blindCount > 1 ? "s" : ""} that automated analysis cannot evaluate`);
    }
    // The blind-spot list is never empty, so `parts` cannot be used to detect a
    // clean result; decide on the findings themselves so the "No findings"
    // message is actually reachable.
    const hasFindings = trustCount + verifyCount > 0;
    const summary = hasFindings
        ? `Human reviewer: ${parts.join(", ")}. Focus your review time on the "Verify" and "Blind Spots" sections — the "Trust" findings have strong automated evidence.`
        : "No findings — code looks clean. Focus your review on business logic correctness and architectural fit.";
    return { trust, verify, blindSpots, summary };
}
|
|
417
549
|
/**
|
|
418
550
|
* Cap the number of findings by priority-sorting and keeping only
|
|
419
551
|
* the top N. Ensures high-severity / high-confidence findings always survive.
|
|
@@ -571,6 +703,16 @@ export function evaluateWithTribunal(code, language, context, options) {
|
|
|
571
703
|
}
|
|
572
704
|
}
|
|
573
705
|
}
|
|
706
|
+
// ── Regulatory scope filtering ──
|
|
707
|
+
// When regulatoryScope is set in config, suppress findings that cite ONLY
|
|
708
|
+
// out-of-scope regulatory frameworks.
|
|
709
|
+
let regulatorySuppressed = 0;
|
|
710
|
+
if (options?.config?.regulatoryScope && options.config.regulatoryScope.length > 0) {
|
|
711
|
+
const scopeResult = filterByRegulatoryScope(configFiltered, options.config.regulatoryScope);
|
|
712
|
+
configFiltered.length = 0;
|
|
713
|
+
configFiltered.push(...scopeResult.findings);
|
|
714
|
+
regulatorySuppressed = scopeResult.suppressed;
|
|
715
|
+
}
|
|
574
716
|
// ── Feedback-driven confidence calibration & auto-tuning ──
|
|
575
717
|
// When options.calibrate is set, load the feedback store and apply:
|
|
576
718
|
// 1. Auto-suppression of rules with FP rate ≥ 80%
|
|
@@ -709,8 +851,27 @@ export function evaluateWithTribunal(code, language, context, options) {
|
|
|
709
851
|
...(owaspLlmTop10 ? { owaspLlmTop10 } : {}),
|
|
710
852
|
};
|
|
711
853
|
});
|
|
854
|
+
// ── Consensus-based suppression ──
|
|
855
|
+
// When consensusThreshold is set: if a supermajority of judges reported
|
|
856
|
+
// zero findings, suppress findings from the minority outlier judges.
|
|
857
|
+
// This catches cases where most judges agree code is clean but a few
|
|
858
|
+
// structurally over-flag (e.g. error-handling, testing).
|
|
859
|
+
let consensusSuppressed = 0;
|
|
860
|
+
let postConsensuFindings = allFindings;
|
|
861
|
+
const consensusThreshold = options?.config?.consensusThreshold;
|
|
862
|
+
if (consensusThreshold !== undefined && consensusThreshold > 0 && evaluations.length > 0) {
|
|
863
|
+
const zeroFindingJudges = evaluations.filter((e) => e.findings.length === 0).length;
|
|
864
|
+
const totalJudges = evaluations.length;
|
|
865
|
+
const cleanRatio = zeroFindingJudges / totalJudges;
|
|
866
|
+
if (cleanRatio >= consensusThreshold) {
|
|
867
|
+
// Majority says clean — suppress minority findings (keep critical severity)
|
|
868
|
+
const before = postConsensuFindings.length;
|
|
869
|
+
postConsensuFindings = postConsensuFindings.filter((f) => f.severity === "critical");
|
|
870
|
+
consensusSuppressed = before - postConsensuFindings.length;
|
|
871
|
+
}
|
|
872
|
+
}
|
|
712
873
|
// ── Structured CWE/OWASP IDs and Learn More URLs ──
|
|
713
|
-
const enrichedFindings = enrichWithSecurityIds(
|
|
874
|
+
const enrichedFindings = enrichWithSecurityIds(postConsensuFindings);
|
|
714
875
|
const mustFixGate = evaluateMustFixGate(enrichedFindings, options?.mustFixGate);
|
|
715
876
|
const criticalCount = enrichedFindings.filter((f) => f.severity === "critical").length;
|
|
716
877
|
const highCount = enrichedFindings.filter((f) => f.severity === "high").length;
|
|
@@ -741,6 +902,7 @@ export function evaluateWithTribunal(code, language, context, options) {
|
|
|
741
902
|
})),
|
|
742
903
|
},
|
|
743
904
|
reviewDecision: synthesizeReviewDecision(enrichedFindings),
|
|
905
|
+
humanFocusGuide: synthesizeHumanFocusGuide(enrichedFindings, code, language),
|
|
744
906
|
};
|
|
745
907
|
// ── Deep review prompt attachment (P0.1) ──
|
|
746
908
|
// When deepReview is enabled, build and attach a structured LLM prompt
|
|
@@ -1036,6 +1036,39 @@ export function formatVerdictAsMarkdown(verdict) {
|
|
|
1036
1036
|
md += `---\n\n`;
|
|
1037
1037
|
}
|
|
1038
1038
|
}
|
|
1039
|
+
// Human Focus Guide
|
|
1040
|
+
if (verdict.humanFocusGuide) {
|
|
1041
|
+
const guide = verdict.humanFocusGuide;
|
|
1042
|
+
md += `## 👤 Human Reviewer Focus Guide\n\n`;
|
|
1043
|
+
md += `${guide.summary}\n\n`;
|
|
1044
|
+
if (guide.trust.length > 0) {
|
|
1045
|
+
md += `### ✅ Trust (act on these directly)\n\n`;
|
|
1046
|
+
md += `| Severity | Rule | Finding | Reason |\n|---|---|---|---|\n`;
|
|
1047
|
+
for (const item of guide.trust.slice(0, 15)) {
|
|
1048
|
+
md += `| ${item.severity} | \`${item.ruleId}\` | ${item.title} | ${item.reason} |\n`;
|
|
1049
|
+
}
|
|
1050
|
+
if (guide.trust.length > 15)
|
|
1051
|
+
md += `\n*...and ${guide.trust.length - 15} more*\n`;
|
|
1052
|
+
md += `\n`;
|
|
1053
|
+
}
|
|
1054
|
+
if (guide.verify.length > 0) {
|
|
1055
|
+
md += `### 🔍 Verify (use your judgment)\n\n`;
|
|
1056
|
+
md += `| Severity | Rule | Finding | Reason |\n|---|---|---|---|\n`;
|
|
1057
|
+
for (const item of guide.verify.slice(0, 15)) {
|
|
1058
|
+
md += `| ${item.severity} | \`${item.ruleId}\` | ${item.title} | ${item.reason} |\n`;
|
|
1059
|
+
}
|
|
1060
|
+
if (guide.verify.length > 15)
|
|
1061
|
+
md += `\n*...and ${guide.verify.length - 15} more*\n`;
|
|
1062
|
+
md += `\n`;
|
|
1063
|
+
}
|
|
1064
|
+
if (guide.blindSpots.length > 0) {
|
|
1065
|
+
md += `### 🔦 Blind Spots (automated analysis cannot evaluate)\n\n`;
|
|
1066
|
+
for (const spot of guide.blindSpots) {
|
|
1067
|
+
md += `- **${spot.area}** — ${spot.guidance}\n`;
|
|
1068
|
+
}
|
|
1069
|
+
md += `\n`;
|
|
1070
|
+
}
|
|
1071
|
+
}
|
|
1039
1072
|
return md;
|
|
1040
1073
|
}
|
|
1041
1074
|
// ─── Shared Credential / Placeholder Detection ──────────────────────────────
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Regulatory Scope — Framework-aware finding filtering.
|
|
3
|
+
*
|
|
4
|
+
* When `regulatoryScope` is set in `.judgesrc`, findings whose `reference`
|
|
5
|
+
* or `description` text cites ONLY out-of-scope frameworks are suppressed. Findings that
|
|
6
|
+
* cite at least one in-scope framework (or have no regulatory reference)
|
|
7
|
+
* are kept.
|
|
8
|
+
*/
|
|
9
|
+
import type { Finding } from "./types.js";
|
|
10
|
+
/** Look up supported framework IDs for listing/validation. */
|
|
11
|
+
export declare function getSupportedFrameworks(): Array<{
|
|
12
|
+
id: string;
|
|
13
|
+
description: string;
|
|
14
|
+
}>;
|
|
15
|
+
/**
|
|
16
|
+
* Filter findings based on `regulatoryScope`. Findings that cite ONLY
|
|
17
|
+
* out-of-scope frameworks are suppressed. Findings with no regulatory
|
|
18
|
+
* reference or with at least one in-scope framework are kept.
|
|
19
|
+
*
|
|
20
|
+
* @param findings - All findings from the tribunal
|
|
21
|
+
* @param scope - Array of framework IDs (e.g. ["GDPR", "PCI-DSS"])
|
|
22
|
+
* @returns Object with kept findings and count of suppressed findings
|
|
23
|
+
*/
|
|
24
|
+
export declare function filterByRegulatoryScope(findings: Finding[], scope: string[]): {
|
|
25
|
+
findings: Finding[];
|
|
26
|
+
suppressed: number;
|
|
27
|
+
};
|
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Regulatory Scope — Framework-aware finding filtering.
|
|
3
|
+
*
|
|
4
|
+
* When `regulatoryScope` is set in `.judgesrc`, findings whose `reference`
|
|
5
|
+
* or `description` text cites ONLY out-of-scope frameworks are suppressed. Findings that
|
|
6
|
+
* cite at least one in-scope framework (or have no regulatory reference)
|
|
7
|
+
* are kept.
|
|
8
|
+
*/
|
|
9
|
+
/**
 * Supported regulatory frameworks.
 *
 * Each entry has a canonical `id` (the value users put in `regulatoryScope`),
 * a human-readable `description`, and `aliases`: lowercase phrases or regular
 * expression sources (e.g. "children.*online privacy") that indicate a finding
 * cites the framework.
 */
const FRAMEWORKS = [
    {
        id: "GDPR",
        aliases: [
            "gdpr",
            "general data protection",
            "article 5",
            "article 6",
            "article 8",
            "article 17",
            "article 22",
            "article 32",
            "chapter v",
            "data protection regulation",
        ],
        description: "EU General Data Protection Regulation",
    },
    {
        id: "CCPA",
        aliases: ["ccpa", "california consumer privacy", "cpra", "right to delete"],
        description: "California Consumer Privacy Act",
    },
    {
        id: "HIPAA",
        aliases: [
            "hipaa",
            "health insurance portability",
            "phi",
            "protected health information",
            "45 cfr",
            "security rule",
            "minimum necessary",
        ],
        description: "Health Insurance Portability and Accountability Act",
    },
    {
        id: "PCI-DSS",
        aliases: ["pci", "pci dss", "pci-dss", "payment card", "cardholder data", "requirement 3"],
        description: "Payment Card Industry Data Security Standard",
    },
    {
        id: "SOC2",
        aliases: ["soc 2", "soc2", "trust service", "cc6", "cc7"],
        description: "SOC 2 Trust Service Criteria",
    },
    {
        id: "SOX",
        aliases: ["sox", "sarbanes-oxley", "sarbanes oxley"],
        description: "Sarbanes-Oxley Act",
    },
    {
        id: "COPPA",
        aliases: ["coppa", "children.*online privacy", "age appropriate design"],
        description: "Children's Online Privacy Protection Act",
    },
    {
        id: "FERPA",
        aliases: ["ferpa", "family educational rights"],
        description: "Family Educational Rights and Privacy Act",
    },
    {
        id: "FedRAMP",
        aliases: ["fedramp", "fed ramp", "federal risk"],
        description: "Federal Risk and Authorization Management Program",
    },
    {
        id: "NIST",
        aliases: ["nist", "sp 800", "800-53", "800-63", "800-131", "800-122", "ssdf"],
        description: "NIST Cybersecurity Framework & Special Publications",
    },
    {
        id: "ISO27001",
        aliases: ["iso 27001", "iso27001", "iso/iec 27001"],
        description: "ISO/IEC 27001 Information Security Management",
    },
    {
        id: "ePrivacy",
        aliases: ["eprivacy", "e-privacy", "cookie.*directive", "eprivacy directive"],
        description: "EU ePrivacy Directive",
    },
    {
        id: "DORA",
        aliases: ["dora", "digital operational resilience"],
        description: "Digital Operational Resilience Act",
    },
    {
        id: "NIS2",
        aliases: ["nis2", "nis 2", "network.*information.*security"],
        description: "Network and Information Security Directive 2",
    },
    {
        id: "EU-AI-Act",
        aliases: ["eu ai act", "ai act", "artificial intelligence act"],
        description: "EU Artificial Intelligence Act",
    },
    {
        id: "LGPD",
        aliases: ["lgpd", "lei geral.*prote"],
        description: "Brazil General Data Protection Law",
    },
    {
        id: "PIPEDA",
        aliases: ["pipeda", "personal information protection.*electronic"],
        description: "Canada Personal Information Protection and Electronic Documents Act",
    },
];
/** Matches aliases that consist of a single alphanumeric token (e.g. "phi", "nist"). */
const SINGLE_TOKEN_ALIAS = /^[a-z0-9]+$/;
/**
 * Compile one alias into a matcher for lowercased finding text.
 *
 * - Single-token aliases are matched as whole words, so "phi" does not fire on
 *   "phishing" and "nist" does not fire on "administrator".
 * - Every other alias is used as a regular-expression source. Plain phrases
 *   behave exactly like a substring search, and aliases written as patterns
 *   (".*") are honoured instead of being compared literally.
 *
 * No `g`/`y` flag is set, so `.test()` on the result is stateless.
 *
 * @param {string} alias - Alias text or pattern source from `FRAMEWORKS`.
 * @returns {RegExp} Matcher to run against lowercased text.
 */
function compileAlias(alias) {
    const source = alias.toLowerCase();
    return new RegExp(SINGLE_TOKEN_ALIAS.test(source) ? `\\b${source}\\b` : source);
}
/** Alias matchers per framework, compiled once at module load. */
const FRAMEWORK_MATCHERS = FRAMEWORKS.map(({ id, aliases }) => ({
    id,
    patterns: aliases.map(compileAlias),
}));
/** Upper-cased framework ID → canonical ID, for case-insensitive scope lookup. */
const CANONICAL_ID_BY_UPPERCASE = new Map(FRAMEWORKS.map(({ id }) => [id.toUpperCase(), id]));
/**
 * Look up supported framework IDs for listing/validation.
 *
 * @returns {Array<{ id: string, description: string }>} One entry per framework, in table order.
 */
export function getSupportedFrameworks() {
    return FRAMEWORKS.map(({ id, description }) => ({ id, description }));
}
// ─── Framework Detection in Finding References ──────────────────────────────
/**
 * Detect which regulatory frameworks a finding references.
 * Checks the `reference` and `description` fields (joined and lowercased)
 * against every framework's alias matchers.
 *
 * @param {{ reference?: string, description?: string }} finding - Finding to inspect.
 * @returns {Set<string>} Canonical IDs of the frameworks cited; empty when none.
 */
function detectFrameworks(finding) {
    const detected = new Set();
    const text = `${finding.reference ?? ""} ${finding.description ?? ""}`.toLowerCase();
    if (!text.trim()) {
        return detected;
    }
    for (const { id, patterns } of FRAMEWORK_MATCHERS) {
        if (patterns.some((pattern) => pattern.test(text))) {
            detected.add(id);
        }
    }
    return detected;
}
// ─── Regulatory Scope Filter ────────────────────────────────────────────────
/**
 * Filter findings based on `regulatoryScope`. Findings that cite ONLY
 * out-of-scope frameworks are suppressed. Findings with no regulatory
 * reference or with at least one in-scope framework are kept.
 *
 * Scope IDs are matched case-insensitively against the supported framework
 * IDs; IDs that match no supported framework are ignored.
 *
 * @param findings - All findings from the tribunal
 * @param scope - Array of framework IDs (e.g. ["GDPR", "PCI-DSS"])
 * @returns Object with kept findings and count of suppressed findings
 */
export function filterByRegulatoryScope(findings, scope) {
    if (!scope || scope.length === 0) {
        return { findings, suppressed: 0 };
    }
    // Normalize framework IDs (e.g. "pci-dss" → "PCI-DSS")
    const normalizedScope = new Set();
    for (const requested of scope) {
        const canonical = CANONICAL_ID_BY_UPPERCASE.get(requested.toUpperCase());
        if (canonical !== undefined) {
            normalizedScope.add(canonical);
        }
    }
    const kept = findings.filter((finding) => {
        const cited = detectFrameworks(finding);
        // No regulatory reference — keep (it's a general code quality finding).
        // Otherwise keep only if at least one cited framework is in scope.
        return cited.size === 0 || [...cited].some((id) => normalizedScope.has(id));
    });
    return { findings: kept, suppressed: findings.length - kept.length };
}
|
package/dist/tools/prompts.d.ts
CHANGED
|
@@ -4,7 +4,7 @@ export declare const SHARED_ADVERSARIAL_MANDATE = "ADVERSARIAL MANDATE (applies
|
|
|
4
4
|
/** Precision override — ensures evidence-based findings. */
|
|
5
5
|
export declare const PRECISION_MANDATE = "PRECISION MANDATE (this section OVERRIDES the adversarial mandate whenever they conflict):\n- Every finding MUST cite specific code evidence: exact line numbers, API calls, variable names, or patterns. Findings without concrete evidence MUST be discarded \u2014 no exceptions.\n- Do NOT flag the absence of a feature or pattern unless you can identify the specific code location where it SHOULD have been implemented and explain WHY it is required for THIS code.\n- Speculative, hypothetical, or \"just in case\" findings erode developer trust. Only flag issues you are confident exist in the actual code.\n- Prefer fewer, high-confidence findings over many uncertain ones. Quality of findings matters more than quantity.\n- If the code is genuinely well-written with no real issues, reporting ZERO findings is the correct and expected behavior. Do not manufacture findings to avoid an empty report.\n- Clean, well-structured code exists. Acknowledge it by not forcing false issues.\n- RECOGNIZE SECURE PATTERNS: Code using established security libraries and patterns (e.g. helmet, bcrypt/argon2, parameterized queries, input validation, CSRF tokens, rate limiters, proper TLS) is correctly implementing security. Do NOT flag these as insufficient or suggest alternatives unless a concrete vulnerability exists.\n- SCOPE LIMITATION: Only evaluate code that is actually present. Do NOT flag missing features, tests, logging, documentation, error handling, or infrastructure that may exist in other files. Evaluate what IS provided, not what COULD be elsewhere.\n- CONFIDENCE THRESHOLD: Only report findings where you are highly confident (\u226580%) that a real, exploitable issue or concrete deficiency exists in the provided code. When in doubt, do NOT report.\n- FALSE POSITIVE COST: A false positive is MORE harmful than a missed finding. False positives erode developer trust and cause real issues to be ignored. 
When uncertain, silence is better than a questionable finding.\n\nCOMMON FALSE POSITIVE PATTERNS (do NOT report these):\n- ERR: Do not flag error handling as inadequate when try/catch blocks, validation, or error middleware are present. Missing error handling in a utility function that is clearly called within a guarded context is NOT a finding.\n- LOGIC: Do not flag logic issues for standard patterns (early returns, guard clauses, switch/case with default). Only flag logic errors when you can demonstrate a concrete input that produces an incorrect output.\n- MAINT: Do not flag maintainability concerns for code that follows the language's established idioms. Complexity or length alone is NOT a finding unless it introduces a concrete maintenance burden.\n- SEC: Do not flag security issues when established security libraries (helmet, cors, bcrypt, parameterized queries) are correctly used. \"Could be stronger\" is NOT a vulnerability.\n- STRUCT: Do not flag code structure preferences (file organization, naming conventions) unless they create a concrete deficiency like circular dependencies or unreachable code.";
|
|
6
6
|
/** Clean code gate — explicit instructions when code quality is high. */
|
|
7
|
-
export declare const CLEAN_CODE_GATE = "CLEAN CODE GATE (applies AFTER individual judge evaluation):\n- Before reporting findings, assess the OVERALL quality of the code. If the code follows established conventions, uses appropriate patterns, handles errors, and has no concrete vulnerabilities or deficiencies, the expected output is ZERO findings across ALL judges.\n- Do NOT report stylistic preferences, alternative approaches, or \"nice to have\" improvements as findings. These are opinions, not defects.\n- Do NOT report findings about missing functionality that is likely in other files (tests, configs, middleware, error handlers, logging setup).\n- Do NOT report theoretical risks that require assumptions about the runtime environment, deployment configuration, or code outside the provided snippet.\n- SELF-CHECK before finalizing: For each finding, ask \"Would a senior engineer reviewing this code in a PR agree this must be fixed before merging?\" If the answer is not a clear YES, discard the finding.\n- The goal is to match what a thoughtful, experienced human reviewer would flag \u2014 not to demonstrate comprehensive knowledge of every possible concern.";
|
|
7
|
+
export declare const CLEAN_CODE_GATE = "CLEAN CODE GATE (applies AFTER individual judge evaluation):\n- Before reporting findings, assess the OVERALL quality of the code. If the code follows established conventions, uses appropriate patterns, handles errors, and has no concrete vulnerabilities or deficiencies, the expected output is ZERO findings across ALL judges.\n- Do NOT report stylistic preferences, alternative approaches, or \"nice to have\" improvements as findings. These are opinions, not defects.\n- Do NOT report findings about missing functionality that is likely in other files (tests, configs, middleware, error handlers, logging setup).\n- Do NOT report theoretical risks that require assumptions about the runtime environment, deployment configuration, or code outside the provided snippet.\n- SELF-CHECK before finalizing: For each finding, ask \"Would a senior engineer reviewing this code in a PR agree this must be fixed before merging?\" If the answer is not a clear YES, discard the finding.\n- The goal is to match what a thoughtful, experienced human reviewer would flag \u2014 not to demonstrate comprehensive knowledge of every possible concern.\n- SINGLE-FILE LIMITATION: You are reviewing a code snippet, not a complete project. Missing tests, missing docs, missing middleware, missing configs, missing CI/CD, missing logging setup \u2014 these are EXPECTED in a single-file review. Only flag what is WRONG in the code present, not what is ABSENT from the project.\n- FINAL GATE: If your evaluation produces findings for a code snippet that uses established libraries correctly, has proper error handling, follows language idioms, and contains no security vulnerabilities \u2014 your findings are almost certainly false positives. Discard them and report ZERO findings.";
|
|
8
8
|
/**
|
|
9
9
|
* Extract only the unique evaluation criteria from a judge's systemPrompt,
|
|
10
10
|
* stripping the persona introduction line, the ADVERSARIAL MANDATE block,
|
package/dist/tools/prompts.js
CHANGED
|
@@ -44,7 +44,9 @@ export const CLEAN_CODE_GATE = `CLEAN CODE GATE (applies AFTER individual judge
|
|
|
44
44
|
- Do NOT report findings about missing functionality that is likely in other files (tests, configs, middleware, error handlers, logging setup).
|
|
45
45
|
- Do NOT report theoretical risks that require assumptions about the runtime environment, deployment configuration, or code outside the provided snippet.
|
|
46
46
|
- SELF-CHECK before finalizing: For each finding, ask "Would a senior engineer reviewing this code in a PR agree this must be fixed before merging?" If the answer is not a clear YES, discard the finding.
|
|
47
|
-
- The goal is to match what a thoughtful, experienced human reviewer would flag — not to demonstrate comprehensive knowledge of every possible concern.`;
|
|
47
|
+
- The goal is to match what a thoughtful, experienced human reviewer would flag — not to demonstrate comprehensive knowledge of every possible concern.
|
|
48
|
+
- SINGLE-FILE LIMITATION: You are reviewing a code snippet, not a complete project. Missing tests, missing docs, missing middleware, missing configs, missing CI/CD, missing logging setup — these are EXPECTED in a single-file review. Only flag what is WRONG in the code present, not what is ABSENT from the project.
|
|
49
|
+
- FINAL GATE: If your evaluation produces findings for a code snippet that uses established libraries correctly, has proper error handling, follows language idioms, and contains no security vulnerabilities — your findings are almost certainly false positives. Discard them and report ZERO findings.`;
|
|
48
50
|
// ─── Criteria Extraction ─────────────────────────────────────────────────────
|
|
49
51
|
/**
|
|
50
52
|
* Extract only the unique evaluation criteria from a judge's systemPrompt,
|
package/dist/types.d.ts
CHANGED
|
@@ -313,6 +313,45 @@ export interface JudgesConfig {
|
|
|
313
313
|
url?: string;
|
|
314
314
|
headers?: Record<string, string>;
|
|
315
315
|
};
|
|
316
|
+
/**
|
|
317
|
+
* Regulatory frameworks in scope for this project. When set, findings that
|
|
318
|
+
* cite ONLY out-of-scope frameworks are suppressed, and in-scope findings
|
|
319
|
+
* are elevated to ensure visibility.
|
|
320
|
+
*
|
|
321
|
+
* If not set, all regulatory findings are reported (no filtering).
|
|
322
|
+
*
|
|
323
|
+
* Supported values: "GDPR", "CCPA", "HIPAA", "PCI-DSS", "SOC2", "SOX",
|
|
324
|
+
* "COPPA", "FERPA", "FedRAMP", "NIST", "ISO27001", "ePrivacy", "DORA",
|
|
325
|
+
* "NIS2", "EU-AI-Act", "LGPD", "PIPEDA"
|
|
326
|
+
*
|
|
327
|
+
* Example:
|
|
328
|
+
* ```json
|
|
329
|
+
* { "regulatoryScope": ["GDPR", "PCI-DSS", "SOC2"] }
|
|
330
|
+
* ```
|
|
331
|
+
*/
|
|
332
|
+
regulatoryScope?: string[];
|
|
333
|
+
/**
|
|
334
|
+
* Consensus suppression threshold (0–1). When set, if at least this
|
|
335
|
+
* fraction of judges report zero findings for a file, findings from
|
|
336
|
+
* the remaining minority judges are suppressed as outliers.
|
|
337
|
+
*
|
|
338
|
+
* This reduces false positives from judges that are structurally prone
|
|
339
|
+
* to over-flagging clean code. A value of 0.7 means "if 70% of judges
|
|
340
|
+
* agree the code is clean, suppress the other 30%."
|
|
341
|
+
*
|
|
342
|
+
* Default: not set (no consensus suppression).
|
|
343
|
+
*
|
|
344
|
+
* Recommended values:
|
|
345
|
+
* - `0.7` — moderate: suppresses when most judges agree (good for CI)
|
|
346
|
+
* - `0.8` — conservative: only suppresses with strong consensus
|
|
347
|
+
* - `0.6` — aggressive: suppresses with slight majority
|
|
348
|
+
*
|
|
349
|
+
* Example:
|
|
350
|
+
* ```json
|
|
351
|
+
* { "consensusThreshold": 0.7 }
|
|
352
|
+
* ```
|
|
353
|
+
*/
|
|
354
|
+
consensusThreshold?: number;
|
|
316
355
|
}
|
|
317
356
|
/**
|
|
318
357
|
* A user-defined pattern-based rule for business logic validation.
|
|
@@ -613,6 +652,48 @@ export interface ReviewDecision {
|
|
|
613
652
|
/** Top blocking issues (up to 3 critical/high findings) */
|
|
614
653
|
blockingIssues: string[];
|
|
615
654
|
}
|
|
655
|
+
/**
 * One finding as presented in the human focus guide, together with the
 * reason it was sorted into its bucket.
 */
export interface FocusItem {
    /** Identifier of the rule, e.g. "SEC-001". */
    ruleId: string;
    /** Brief title of the finding. */
    title: string;
    /** Severity level of the finding. */
    severity: Severity;
    /** Confidence score, from 0 to 1. */
    confidence: number;
    /** Line numbers, when available. */
    lineNumbers?: Array<number>;
    /** Explanation of why the item landed in its bucket. */
    reason: string;
}
|
|
672
|
+
/**
 * Something the automated analysis was unable to evaluate and that therefore
 * needs a human reviewer's judgment.
 */
export interface BlindSpot {
    /** Category label, e.g. "Business Logic" or "Architectural Fit". */
    area: string;
    /** What the reviewer should look for in this area. */
    guidance: string;
    /** Specific lines or patterns that prompted this recommendation, if any. */
    triggers?: Array<string>;
}
|
|
683
|
+
/**
 * Human Focus Guide: points human reviewers at the places where their
 * attention adds the most value on top of what automated analysis provides.
 */
export interface HumanFocusGuide {
    /** Findings the reviewer can trust: high-confidence and backed by evidence. */
    trust: Array<FocusItem>;
    /** Findings that need human verification: lower-confidence or absence-based. */
    verify: Array<FocusItem>;
    /** Areas that require human judgment because the automated analysis cannot evaluate them. */
    blindSpots: Array<BlindSpot>;
    /** One-paragraph summary addressed to the reviewer. */
    summary: string;
}
|
|
616
697
|
/**
|
|
617
698
|
* The combined result from the full tribunal panel.
|
|
618
699
|
*/
|
|
@@ -651,6 +732,12 @@ export interface TribunalVerdict {
|
|
|
651
732
|
* act as a primary code reviewer rather than just a warning list.
|
|
652
733
|
*/
|
|
653
734
|
reviewDecision?: ReviewDecision;
|
|
735
|
+
/**
|
|
736
|
+
* Human Focus Guide — directs human reviewers to the areas where their
|
|
737
|
+
* attention adds the most value beyond what automated analysis provides.
|
|
738
|
+
* Categorizes findings into trust/verify/blind-spots buckets.
|
|
739
|
+
*/
|
|
740
|
+
humanFocusGuide?: HumanFocusGuide;
|
|
654
741
|
/**
|
|
655
742
|
* AI model detection escalation. Present when the model-fingerprint judge
|
|
656
743
|
* detects AI-generated code patterns (MFPR-* rules). Downstream consumers
|
package/judgesrc.schema.json
CHANGED
|
@@ -88,6 +88,20 @@
|
|
|
88
88
|
"type": "array",
|
|
89
89
|
"items": { "type": "string" },
|
|
90
90
|
"description": "Plugin module specifiers (npm packages or relative file paths) that export custom JudgeDefinition arrays. Each module must export { judges: JudgeDefinition[] } or a default export."
|
|
91
|
+
},
|
|
92
|
+
"regulatoryScope": {
|
|
93
|
+
"type": "array",
|
|
94
|
+
"items": {
|
|
95
|
+
"type": "string",
|
|
96
|
+
"enum": ["GDPR", "CCPA", "HIPAA", "PCI-DSS", "SOC2", "SOX", "COPPA", "FERPA", "FedRAMP", "NIST", "ISO27001", "ePrivacy", "DORA", "NIS2", "EU-AI-Act", "LGPD", "PIPEDA"]
|
|
97
|
+
},
|
|
98
|
+
"description": "Regulatory frameworks in scope for this project. Findings citing ONLY out-of-scope frameworks are suppressed. If omitted, all regulatory findings are reported."
|
|
99
|
+
},
|
|
100
|
+
"consensusThreshold": {
|
|
101
|
+
"type": "number",
|
|
102
|
+
"minimum": 0,
|
|
103
|
+
"maximum": 1,
|
|
104
|
+
"description": "Consensus suppression threshold (0-1). If this fraction of judges report zero findings, minority findings are suppressed. Recommended: 0.7 (moderate), 0.8 (conservative). If omitted, no consensus suppression is applied."
|
|
91
105
|
}
|
|
92
106
|
},
|
|
93
107
|
"additionalProperties": false
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@kevinrabun/judges",
|
|
3
|
-
"version": "3.125.0",
|
|
3
|
+
"version": "3.126.1",
|
|
4
4
|
"description": "45 specialized judges that evaluate AI-generated code for security, cost, and quality.",
|
|
5
5
|
"mcpName": "io.github.KevinRabun/judges",
|
|
6
6
|
"type": "module",
|
|
@@ -145,7 +145,7 @@
|
|
|
145
145
|
"zod": "^4.3.6"
|
|
146
146
|
},
|
|
147
147
|
"devDependencies": {
|
|
148
|
-
"@anthropic-ai/sdk": "^0.
|
|
148
|
+
"@anthropic-ai/sdk": "^0.81.0",
|
|
149
149
|
"@eslint/js": "^10.0.1",
|
|
150
150
|
"@types/node": "^25.3.0",
|
|
151
151
|
"@typescript-eslint/eslint-plugin": "^8.56.1",
|
package/server.json
CHANGED
|
@@ -16,12 +16,12 @@
|
|
|
16
16
|
"mimeType": "image/png"
|
|
17
17
|
}
|
|
18
18
|
],
|
|
19
|
-
"version": "3.125.0",
|
|
19
|
+
"version": "3.126.1",
|
|
20
20
|
"packages": [
|
|
21
21
|
{
|
|
22
22
|
"registryType": "npm",
|
|
23
23
|
"identifier": "@kevinrabun/judges",
|
|
24
|
-
"version": "3.125.0",
|
|
24
|
+
"version": "3.126.1",
|
|
25
25
|
"transport": {
|
|
26
26
|
"type": "stdio"
|
|
27
27
|
}
|