@kevinrabun/judges 3.115.4 → 3.117.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/agents/accessibility.judge.md +7 -0
- package/agents/agent-instructions.judge.md +7 -0
- package/agents/ai-code-safety.judge.md +7 -0
- package/agents/api-contract.judge.md +7 -0
- package/agents/api-design.judge.md +7 -0
- package/agents/authentication.judge.md +7 -0
- package/agents/backwards-compatibility.judge.md +7 -0
- package/agents/caching.judge.md +7 -0
- package/agents/ci-cd.judge.md +7 -0
- package/agents/cloud-readiness.judge.md +7 -0
- package/agents/concurrency.judge.md +7 -0
- package/agents/configuration-management.judge.md +7 -0
- package/agents/cybersecurity.judge.md +7 -0
- package/agents/data-security.judge.md +7 -0
- package/agents/dependency-health.judge.md +7 -0
- package/agents/documentation.judge.md +7 -0
- package/agents/error-handling.judge.md +7 -0
- package/agents/ethics-bias.judge.md +7 -0
- package/agents/false-positive-review.judge.md +12 -0
- package/agents/framework-safety.judge.md +7 -0
- package/agents/hallucination-detection.judge.md +13 -0
- package/agents/iac-security.judge.md +7 -0
- package/agents/intent-alignment.judge.md +13 -0
- package/agents/logging-privacy.judge.md +7 -0
- package/agents/maintainability.judge.md +7 -0
- package/agents/multi-turn-coherence.judge.md +7 -0
- package/agents/observability.judge.md +7 -0
- package/agents/portability.judge.md +7 -0
- package/agents/rate-limiting.judge.md +7 -0
- package/agents/reliability.judge.md +7 -0
- package/agents/security.judge.md +13 -0
- package/agents/testing.judge.md +7 -0
- package/agents/ux.judge.md +7 -0
- package/dist/a2a-protocol.d.ts +136 -0
- package/dist/a2a-protocol.js +218 -0
- package/dist/api.d.ts +21 -3
- package/dist/api.js +21 -1
- package/dist/audit-trail.d.ts +245 -0
- package/dist/audit-trail.js +257 -0
- package/dist/commands/benchmark-advanced.js +51 -51
- package/dist/commands/benchmark-ai-agents.js +16 -16
- package/dist/commands/benchmark-compliance-ethics.js +12 -12
- package/dist/commands/benchmark-expanded-2.js +2 -2
- package/dist/commands/benchmark-expanded.js +2 -2
- package/dist/commands/benchmark-infrastructure.js +12 -12
- package/dist/commands/benchmark-languages.js +11 -11
- package/dist/commands/benchmark-quality-ops.js +7 -7
- package/dist/commands/benchmark-security-deep.js +9 -9
- package/dist/commands/benchmark.js +1 -1
- package/dist/commands/llm-benchmark-optimizer.d.ts +78 -0
- package/dist/commands/llm-benchmark-optimizer.js +241 -0
- package/dist/commands/llm-benchmark.d.ts +4 -2
- package/dist/commands/llm-benchmark.js +40 -12
- package/dist/escalation.d.ts +100 -0
- package/dist/escalation.js +292 -0
- package/dist/evaluation-session.d.ts +74 -0
- package/dist/evaluation-session.js +152 -0
- package/dist/evaluators/index.d.ts +23 -1
- package/dist/evaluators/index.js +192 -3
- package/dist/evaluators/judge-selector.d.ts +19 -0
- package/dist/evaluators/judge-selector.js +141 -0
- package/dist/evaluators/recall-boost.d.ts +27 -0
- package/dist/evaluators/recall-boost.js +409 -0
- package/dist/feedback-loop.d.ts +62 -0
- package/dist/feedback-loop.js +179 -0
- package/dist/index.js +2 -0
- package/dist/judges/accessibility.js +7 -0
- package/dist/judges/agent-instructions.js +7 -0
- package/dist/judges/ai-code-safety.js +7 -0
- package/dist/judges/api-contract.js +7 -0
- package/dist/judges/api-design.js +7 -0
- package/dist/judges/authentication.js +7 -0
- package/dist/judges/backwards-compatibility.js +7 -0
- package/dist/judges/caching.js +7 -0
- package/dist/judges/ci-cd.js +7 -0
- package/dist/judges/cloud-readiness.js +7 -0
- package/dist/judges/concurrency.js +7 -0
- package/dist/judges/configuration-management.js +7 -0
- package/dist/judges/cybersecurity.js +7 -0
- package/dist/judges/data-security.js +7 -0
- package/dist/judges/dependency-health.js +7 -0
- package/dist/judges/documentation.js +7 -0
- package/dist/judges/error-handling.js +7 -0
- package/dist/judges/ethics-bias.js +7 -0
- package/dist/judges/false-positive-review.js +13 -1
- package/dist/judges/framework-safety.js +7 -0
- package/dist/judges/hallucination-detection.js +14 -1
- package/dist/judges/iac-security.js +7 -0
- package/dist/judges/intent-alignment.js +14 -1
- package/dist/judges/logging-privacy.js +7 -0
- package/dist/judges/maintainability.js +7 -0
- package/dist/judges/multi-turn-coherence.js +7 -0
- package/dist/judges/observability.js +7 -0
- package/dist/judges/portability.js +7 -0
- package/dist/judges/rate-limiting.js +7 -0
- package/dist/judges/reliability.js +7 -0
- package/dist/judges/security.js +14 -1
- package/dist/judges/testing.js +7 -0
- package/dist/judges/ux.js +7 -0
- package/dist/review-conversation.d.ts +87 -0
- package/dist/review-conversation.js +307 -0
- package/dist/sast-integration.d.ts +112 -0
- package/dist/sast-integration.js +215 -0
- package/dist/tools/register-evaluation.js +208 -8
- package/dist/tools/register-fix.js +24 -1
- package/dist/tools/register-resources.d.ts +6 -0
- package/dist/tools/register-resources.js +177 -0
- package/dist/tools/register-review.js +26 -1
- package/dist/tools/register-workflow.js +384 -11
- package/dist/tools/validation.d.ts +13 -0
- package/dist/tools/validation.js +77 -0
- package/dist/types.d.ts +122 -0
- package/package.json +25 -12
- package/server.json +2 -2
package/dist/evaluators/index.js
CHANGED
|
@@ -19,6 +19,10 @@ import { loadFeedbackStore } from "../commands/feedback.js";
|
|
|
19
19
|
import { CROSS_FILE_SECURITY_CATEGORIES } from "./project.js";
|
|
20
20
|
import { applyTriageFeedback, loadFindingStore } from "../finding-lifecycle.js";
|
|
21
21
|
import { enrichWithSecurityIds } from "../security-ids.js";
|
|
22
|
+
import { selectJudges } from "./judge-selector.js";
|
|
23
|
+
import { getGlobalSession } from "../evaluation-session.js";
|
|
24
|
+
import { evaluateEscalations, enhanceReviewWithEscalations } from "../escalation.js";
|
|
25
|
+
import { applyRecallBoost } from "./recall-boost.js";
|
|
22
26
|
// ── AST-aware post-processing ───────────────────────────────────────────────
|
|
23
27
|
// ── Module-level caches for AST/taint results ───────────────────────────────
|
|
24
28
|
const astStructureCache = new LRUCache(256);
|
|
@@ -390,7 +394,19 @@ function resolveJudgeSet(options) {
|
|
|
390
394
|
const disabled = new Set(options.config.disabledJudges);
|
|
391
395
|
judges = judges.filter((j) => !disabled.has(j.id));
|
|
392
396
|
}
|
|
393
|
-
|
|
397
|
+
// Adaptive judge selection — skip irrelevant judges based on file context
|
|
398
|
+
if (options?.adaptiveSelection && options.filePath) {
|
|
399
|
+
const fileCategory = classifyFile("", options.filePath.split(".").pop() ?? "", options.filePath);
|
|
400
|
+
const ctx = {
|
|
401
|
+
language: options.filePath.split(".").pop() ?? "unknown",
|
|
402
|
+
fileCategory,
|
|
403
|
+
filePath: options.filePath,
|
|
404
|
+
projectMode: options.projectMode,
|
|
405
|
+
};
|
|
406
|
+
const result = selectJudges(judges, ctx);
|
|
407
|
+
return { judges: result.selected, skipped: result.skipped };
|
|
408
|
+
}
|
|
409
|
+
return { judges };
|
|
394
410
|
}
|
|
395
411
|
/**
|
|
396
412
|
* Check whether an absence-based finding is mitigated by a pre-scanned
|
|
@@ -419,6 +435,16 @@ export function evaluateWithJudge(judge, code, language, context, options) {
|
|
|
419
435
|
: undefined;
|
|
420
436
|
findings.push(...judge.analyze(code, language, analyzeCtx));
|
|
421
437
|
}
|
|
438
|
+
// ── Recall boost: supplementary patterns for weak-recall categories ──
|
|
439
|
+
const boostResult = applyRecallBoost(code, language);
|
|
440
|
+
if (boostResult.findings.length > 0) {
|
|
441
|
+
// Deduplicate: only add boost findings whose ruleId isn't already present
|
|
442
|
+
for (const bf of boostResult.findings) {
|
|
443
|
+
if (!findings.some((f) => f.ruleId === bf.ruleId)) {
|
|
444
|
+
findings.push(bf);
|
|
445
|
+
}
|
|
446
|
+
}
|
|
447
|
+
}
|
|
422
448
|
// ── Absence gating ──
|
|
423
449
|
// Absence-based findings ("no rate limiting", "no monitoring", etc.) are
|
|
424
450
|
// project-level concerns that cannot be accurately assessed from a single
|
|
@@ -649,7 +675,7 @@ export function evaluateWithTribunal(code, language, context, options) {
|
|
|
649
675
|
...(astResult ? { _astCache: astResult } : {}),
|
|
650
676
|
...(taintResult ? { _taintFlows: taintResult } : {}),
|
|
651
677
|
};
|
|
652
|
-
const judges = resolveJudgeSet(enrichedOptions);
|
|
678
|
+
const { judges, skipped: skippedJudges } = resolveJudgeSet(enrichedOptions);
|
|
653
679
|
const tribunalStart = performance.now();
|
|
654
680
|
const evaluations = judges.map((judge) => {
|
|
655
681
|
const start = performance.now();
|
|
@@ -776,7 +802,30 @@ export function evaluateWithTribunal(code, language, context, options) {
|
|
|
776
802
|
// No triage data or error loading — continue without adjustment
|
|
777
803
|
}
|
|
778
804
|
const maxFindings = options?.maxFindingsPerFile ?? DEFAULT_MAX_FINDINGS_PER_FILE;
|
|
779
|
-
|
|
805
|
+
// ── Session feedback calibration ──
|
|
806
|
+
// Apply confidence penalties from accumulated FP feedback in the
|
|
807
|
+
// current evaluation session. This is the real-time agentic loop:
|
|
808
|
+
// user marks findings as FP → session records it → subsequent
|
|
809
|
+
// evaluations automatically reduce confidence on those rules.
|
|
810
|
+
let sessionAdjusted = triageAdjusted;
|
|
811
|
+
try {
|
|
812
|
+
const session = getGlobalSession();
|
|
813
|
+
const tally = session.getFeedbackTally();
|
|
814
|
+
if (tally.size > 0) {
|
|
815
|
+
sessionAdjusted = triageAdjusted.map((f) => {
|
|
816
|
+
const penalty = session.getConfidencePenalty(f.ruleId);
|
|
817
|
+
if (penalty < 1.0) {
|
|
818
|
+
const adjusted = clampConfidence((f.confidence ?? 0.5) * penalty);
|
|
819
|
+
return { ...f, confidence: adjusted };
|
|
820
|
+
}
|
|
821
|
+
return f;
|
|
822
|
+
});
|
|
823
|
+
}
|
|
824
|
+
}
|
|
825
|
+
catch {
|
|
826
|
+
// Session feedback calibration failure is non-fatal
|
|
827
|
+
}
|
|
828
|
+
const cappedFindings = applyPerFileFindingCap(sessionAdjusted, maxFindings);
|
|
780
829
|
// ── Confidence-based tiering for progressive disclosure ──
|
|
781
830
|
// Tag each finding with a disclosure tier so downstream consumers (CLI,
|
|
782
831
|
// formatters, VS Code extension) can show only high-confidence findings
|
|
@@ -852,6 +901,23 @@ export function evaluateWithTribunal(code, language, context, options) {
|
|
|
852
901
|
: "AI-generated code patterns detected — review for model-specific biases",
|
|
853
902
|
};
|
|
854
903
|
}
|
|
904
|
+
// ── Human escalation protocol ──
|
|
905
|
+
// Evaluate which findings need human review based on escalation policy.
|
|
906
|
+
// Enhances the review decision with escalation routing information.
|
|
907
|
+
if (options?.config?.escalationThreshold || options?.filePath) {
|
|
908
|
+
try {
|
|
909
|
+
const escalationPolicy = options?.config?.escalationThreshold
|
|
910
|
+
? { confidenceThreshold: options.config.escalationThreshold }
|
|
911
|
+
: undefined;
|
|
912
|
+
const escalations = evaluateEscalations(result, options?.filePath ?? "<unknown>", escalationPolicy);
|
|
913
|
+
if (escalations.length > 0 && result.reviewDecision) {
|
|
914
|
+
result.reviewDecision = enhanceReviewWithEscalations(result.reviewDecision, escalations);
|
|
915
|
+
}
|
|
916
|
+
}
|
|
917
|
+
catch {
|
|
918
|
+
// Escalation evaluation failure is non-fatal
|
|
919
|
+
}
|
|
920
|
+
}
|
|
855
921
|
// ── Disk cache: persist for future runs ──
|
|
856
922
|
if (diskCache) {
|
|
857
923
|
try {
|
|
@@ -863,6 +929,129 @@ export function evaluateWithTribunal(code, language, context, options) {
|
|
|
863
929
|
}
|
|
864
930
|
return result;
|
|
865
931
|
}
|
|
932
|
+
// ─── Streaming Evaluation ────────────────────────────────────────────────────
|
|
933
|
+
/**
 * Streaming tribunal evaluation — runs the resolved judges one at a time and
 * yields a batch after each judge completes, enabling progressive UI updates
 * and early termination by the consumer.
 *
 * Each yielded batch contains the judge evaluation, an execution trace for
 * that judge, and running aggregate statistics over the judges completed so
 * far. `done` is `true` on the batch of the last judge. If no judges are
 * resolved for the given options, the generator completes without yielding.
 *
 * AST structure and taint-flow results are taken from `options._astCache` /
 * `options._taintFlows` when supplied, otherwise from the module-level caches
 * (keyed by content hash), and computed only on a cache miss.
 *
 * Usage:
 * ```ts
 * for await (const batch of evaluateWithTribunalStreaming(code, lang)) {
 *   console.log(`${batch.judgeName}: ${batch.evaluation.findings.length} findings`);
 *   if (batch.aggregate.criticalSoFar > 10) break; // early termination
 * }
 * ```
 *
 * @param code - Source text to evaluate.
 * @param language - Language identifier forwarded to the analyzers and judges.
 * @param context - Optional context forwarded unchanged to each judge evaluation.
 * @param options - Optional evaluation options; also forwarded (enriched with
 *   the AST/taint results) to judge resolution and to each judge.
 */
export async function* evaluateWithTribunalStreaming(code, language, context, options) {
    const includeAst = options?.includeAstFindings ?? true;
    const hash = contentHash(code, language);
    // AST structure: caller-supplied result → module cache → fresh analysis.
    let astResult = options?._astCache;
    if (!astResult && includeAst) {
        astResult = astStructureCache.get(hash);
        if (!astResult) {
            astResult = analyzeStructure(code, language);
            astStructureCache.set(hash, astResult);
        }
    }
    // Taint flows: same lookup order as the AST result.
    let taintResult = options?._taintFlows;
    if (!taintResult) {
        taintResult = taintFlowCache.get(hash);
        if (!taintResult) {
            taintResult = analyzeTaintFlows(code, language);
            taintFlowCache.set(hash, taintResult);
        }
    }
    const enrichedOptions = {
        ...options,
        ...(astResult ? { _astCache: astResult } : {}),
        ...(taintResult ? { _taintFlows: taintResult } : {}),
    };
    const { judges } = resolveJudgeSet(enrichedOptions);
    const totalJudges = judges.length;
    // The AST summary does not depend on the judge, so compute it once.
    // reduce() is used instead of Math.max(...spread) so that very large
    // function lists cannot exceed the engine's argument-count limit.
    const astSummary = astResult
        ? {
            functionsAnalyzed: astResult.functions.length,
            maxComplexity: astResult.functions.reduce((peak, fn) => Math.max(peak, fn.cyclomaticComplexity), 0),
            taintFlowsDetected: taintResult?.length ?? 0,
        }
        : undefined;
    let completedJudges = 0;
    let findingsSoFar = 0;
    let criticalSoFar = 0;
    let highSoFar = 0;
    let scoreSum = 0;
    let hasFailure = false;
    let hasWarning = false;
    for (const judge of judges) {
        const start = performance.now();
        const evaluation = evaluateWithJudge(judge, code, language, context, enrichedOptions);
        const durationMs = Math.round(performance.now() - start);
        evaluation.durationMs = durationMs;
        completedJudges++;
        findingsSoFar += evaluation.findings.length;
        criticalSoFar += evaluation.findings.filter((f) => f.severity === "critical").length;
        highSoFar += evaluation.findings.filter((f) => f.severity === "high").length;
        scoreSum += evaluation.score;
        if (evaluation.verdict === "fail")
            hasFailure = true;
        if (evaluation.verdict === "warning")
            hasWarning = true;
        const trace = {
            judgeId: judge.id,
            judgeName: judge.name,
            durationMs,
            rules: buildRuleTraces(evaluation),
            rawFindingCount: evaluation.findings.length,
            finalFindingCount: evaluation.findings.length,
            // Fresh copy per trace so consumers never share one mutable object.
            ...(astSummary ? { astResolution: { ...astSummary } } : {}),
        };
        // Any failure dominates; otherwise any warning; otherwise pass.
        const currentVerdict = hasFailure ? "fail" : hasWarning ? "warning" : "pass";
        yield {
            judgeId: judge.id,
            judgeName: judge.name,
            evaluation,
            trace,
            aggregate: {
                completedJudges,
                totalJudges,
                findingsSoFar,
                criticalSoFar,
                highSoFar,
                currentScore: Math.round(scoreSum / completedJudges),
                currentVerdict,
            },
            done: completedJudges === totalJudges,
        };
        // Yield to the event loop between judges for responsiveness; there is
        // nothing left to interleave with after the final batch.
        if (completedJudges < totalJudges) {
            await new Promise((r) => setTimeout(r, 0));
        }
    }
}
|
|
1033
|
+
/**
 * Summarise a judge evaluation per rule for observability.
 *
 * Findings are grouped by `ruleId` in first-seen order. For each rule the
 * result carries the number of findings and the highest confidence among
 * them; a finding without a confidence counts as 0.5. Every returned entry
 * has `matched: true`, since only rules that produced findings appear.
 */
function buildRuleTraces(evaluation) {
    const statsByRule = new Map();
    for (const finding of evaluation.findings) {
        const confidence = finding.confidence ?? 0.5;
        const stats = statsByRule.get(finding.ruleId);
        if (!stats) {
            statsByRule.set(finding.ruleId, { count: 1, peakConf: confidence });
            continue;
        }
        stats.count += 1;
        stats.peakConf = Math.max(stats.peakConf, confidence);
    }
    return Array.from(statsByRule, ([ruleId, stats]) => ({
        ruleId,
        matched: true,
        findingCount: stats.count,
        peakConfidence: stats.peakConf,
    }));
}
|
|
866
1055
|
// ─── Project-level Multi-file Analysis (delegated to project.ts) ─────────────
|
|
867
1056
|
import { evaluateProject as _evaluateProject } from "./project.js";
|
|
868
1057
|
export { scanProjectWideSecurityPatterns } from "./project.js";
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Adaptive judge selection — picks only the judges relevant to a given file
|
|
3
|
+
* based on language, framework, file role, and project context.
|
|
4
|
+
*
|
|
5
|
+
* Eliminates wasted work (e.g. running "testing" judge on a Dockerfile,
|
|
6
|
+
* or "iac-security" on a React component) while keeping the full panel
|
|
7
|
+
* available for explicit requests.
|
|
8
|
+
*/
|
|
9
|
+
import type { JudgeDefinition, JudgeSelectionContext, JudgeSelectionResult } from "../types.js";
|
|
10
|
+
/**
|
|
11
|
+
* Select the most relevant judges for a given file context.
|
|
12
|
+
*
|
|
13
|
+
* Strategy:
|
|
14
|
+
* 1. Always include core judges (security, cybersecurity, false-positive-review)
|
|
15
|
+
* 2. Skip judges with language incompatibility
|
|
16
|
+
* 3. Skip judges irrelevant to the file category
|
|
17
|
+
* 4. Return selection with skip reasons for observability
|
|
18
|
+
*/
|
|
19
|
+
export declare function selectJudges(judges: JudgeDefinition[], ctx: JudgeSelectionContext): JudgeSelectionResult;
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Adaptive judge selection — picks only the judges relevant to a given file
|
|
3
|
+
* based on language, framework, file role, and project context.
|
|
4
|
+
*
|
|
5
|
+
* Eliminates wasted work (e.g. running "testing" judge on a Dockerfile,
|
|
6
|
+
* or "iac-security" on a React component) while keeping the full panel
|
|
7
|
+
* available for explicit requests.
|
|
8
|
+
*/
|
|
9
|
+
// ─── Language → judge relevance ──────────────────────────────────────────────
/**
 * Judges that are ONLY relevant for specific language families.
 * If the language isn't listed, the judge is skipped.
 * Most judges are language-agnostic and not listed here.
 */
const LANGUAGE_SPECIFIC = {
    // IaC judges only apply to infrastructure languages
    "iac-security": new Set(["terraform", "bicep", "arm", "dockerfile", "yaml"]),
};
/**
 * Judges to SKIP for specific languages — inverse of above.
 * E.g. testing patterns don't apply to SQL or Dockerfile.
 */
const LANGUAGE_SKIP = {
    testing: new Set(["sql", "dockerfile", "terraform", "bicep", "arm", "yaml"]),
    documentation: new Set(["sql", "dockerfile", "terraform", "bicep", "arm"]),
    "code-structure": new Set(["sql", "dockerfile", "yaml"]),
    ux: new Set(["sql", "dockerfile", "terraform", "bicep", "arm", "bash", "powershell"]),
    accessibility: new Set(["sql", "dockerfile", "terraform", "bicep", "arm", "bash", "powershell"]),
    internationalization: new Set(["sql", "dockerfile", "terraform", "bicep", "arm"]),
    concurrency: new Set(["sql", "dockerfile", "terraform", "bicep", "arm", "yaml"]),
    "over-engineering": new Set(["sql", "dockerfile", "terraform", "bicep", "arm", "yaml"]),
};
/**
 * File-extension tokens mapped to the canonical language names used by the
 * sets above. Callers may pass a raw extension (e.g. `tf`, `yml`) as
 * `ctx.language`; without this mapping those files would never match the
 * language gates.
 */
const LANGUAGE_ALIASES = new Map([
    ["tf", "terraform"],
    ["tfvars", "terraform"],
    ["yml", "yaml"],
    ["sh", "bash"],
    ["ps1", "powershell"],
]);
// ─── File category → judge relevance ────────────────────────────────────────
/**
 * Judges to skip when evaluating test files — noise reduction.
 */
const SKIP_FOR_TESTS = new Set([
    "documentation",
    "rate-limiting",
    "scalability",
    "cloud-readiness",
    "ci-cd",
    "configuration-management",
    "cost-effectiveness",
    "data-sovereignty",
    "compliance",
    "internationalization",
    "ux",
    "accessibility",
    "observability",
]);
/**
 * Judges to skip for config/manifest files.
 */
const SKIP_FOR_CONFIG = new Set([
    "testing",
    "documentation",
    "code-structure",
    "error-handling",
    "performance",
    "concurrency",
    "scalability",
    "ux",
    "accessibility",
    "internationalization",
    "over-engineering",
    "backwards-compatibility",
    "maintainability",
]);
/**
 * Judges to skip for IaC files (Terraform, Bicep, ARM, Dockerfile).
 */
const SKIP_FOR_IAC = new Set([
    "testing",
    "code-structure",
    "concurrency",
    "over-engineering",
    "ux",
    "accessibility",
    "internationalization",
    "api-design",
    "api-contract",
    "backwards-compatibility",
    "hallucination-detection",
    "multi-turn-coherence",
    "model-fingerprint",
]);
// ─── Core judges that always run ─────────────────────────────────────────────
/** These judges run unconditionally — they cover universally applicable concerns. */
const ALWAYS_RUN = new Set(["security", "cybersecurity", "false-positive-review"]);
// ─── Selection logic ─────────────────────────────────────────────────────────
/**
 * Normalise a language token to the canonical, lower-case name used by the
 * gating sets: drops any directory prefix (an extension-less path such as
 * `deploy/Dockerfile` becomes `dockerfile`) and maps known extension tokens
 * through LANGUAGE_ALIASES. A missing language becomes the empty string.
 */
function normalizeLanguage(language) {
    const token = String(language ?? "").toLowerCase().split(/[\\/]/).pop() ?? "";
    return LANGUAGE_ALIASES.get(token) ?? token;
}
/**
 * Select the most relevant judges for a given file context.
 *
 * Strategy:
 * 1. Always include core judges (security, cybersecurity, false-positive-review)
 * 2. Skip judges with language incompatibility
 * 3. Skip judges irrelevant to the file category
 * 4. Return selection with skip reasons for observability
 *
 * @param judges - Candidate judges; each is matched by its `id`.
 * @param ctx - Selection context. `language` may be a canonical name or a
 *   file-extension token; `fileCategory` (optional) is compared
 *   case-insensitively against "test", "config" and "iac".
 * @returns `{ selected, skipped }` — `selected` preserves the input order;
 *   `skipped` lists `{ judgeId, reason }` for every judge left out.
 */
export function selectJudges(judges, ctx) {
    const selected = [];
    const skipped = [];
    const lang = normalizeLanguage(ctx.language);
    const cat = ctx.fileCategory?.toLowerCase() ?? "";
    // Infrastructure code is recognised by category or by language.
    const isInfrastructure = cat === "iac" || lang === "terraform" || lang === "bicep" || lang === "arm" || lang === "dockerfile";
    for (const judge of judges) {
        // Core judges always run
        if (ALWAYS_RUN.has(judge.id)) {
            selected.push(judge);
            continue;
        }
        // Language-specific judge: skip if language not in its set
        const langOnly = LANGUAGE_SPECIFIC[judge.id];
        if (langOnly && !langOnly.has(lang)) {
            skipped.push({ judgeId: judge.id, reason: `not relevant for language: ${lang}` });
            continue;
        }
        // Language skip: judge not useful for this language
        const langSkip = LANGUAGE_SKIP[judge.id];
        if (langSkip && langSkip.has(lang)) {
            skipped.push({ judgeId: judge.id, reason: `skipped for language: ${lang}` });
            continue;
        }
        // File category gating
        if (cat === "test" && SKIP_FOR_TESTS.has(judge.id)) {
            skipped.push({ judgeId: judge.id, reason: "not relevant for test files" });
            continue;
        }
        if (cat === "config" && SKIP_FOR_CONFIG.has(judge.id)) {
            skipped.push({ judgeId: judge.id, reason: "not relevant for config files" });
            continue;
        }
        if (isInfrastructure && SKIP_FOR_IAC.has(judge.id)) {
            skipped.push({ judgeId: judge.id, reason: "not relevant for infrastructure code" });
            continue;
        }
        selected.push(judge);
    }
    return { selected, skipped };
}
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Recall Booster — Additional detection patterns for weak-recall categories
|
|
3
|
+
*
|
|
4
|
+
* This module provides supplementary pattern detection for judge categories
|
|
5
|
+
* where the deterministic evaluators have recall below 85%. It acts as
|
|
6
|
+
* a second-pass augmentation applied after the primary evaluator.
|
|
7
|
+
*
|
|
8
|
+
* Categories strengthened (by recall gap analysis):
|
|
9
|
+
* - hallucination-detection (46.2% → improved)
|
|
10
|
+
* - ci-cd (41.7% → improved)
|
|
11
|
+
* - internationalization (42.9% → improved)
|
|
12
|
+
* - cost-effectiveness (57.1% → improved)
|
|
13
|
+
* - documentation (63.6% → improved)
|
|
14
|
+
* - iac-security (66.7% → improved)
|
|
15
|
+
* - cloud/cloud-readiness (50-73% → improved)
|
|
16
|
+
*/
|
|
17
|
+
import type { Finding } from "../types.js";
|
|
18
|
+
interface BoostResult {
|
|
19
|
+
findings: Finding[];
|
|
20
|
+
boostedCategories: string[];
|
|
21
|
+
}
|
|
22
|
+
/**
|
|
23
|
+
* Apply recall-boosting patterns to detect issues that primary evaluators miss.
|
|
24
|
+
* Returns additional findings (does not modify existing ones).
|
|
25
|
+
*/
|
|
26
|
+
export declare function applyRecallBoost(code: string, language: string): BoostResult;
|
|
27
|
+
export {};
|