@kevinrabun/judges 3.115.3 → 3.116.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/api.d.ts +7 -3
- package/dist/api.js +7 -1
- package/dist/evaluation-session.d.ts +74 -0
- package/dist/evaluation-session.js +152 -0
- package/dist/evaluators/index.d.ts +23 -1
- package/dist/evaluators/index.js +163 -3
- package/dist/evaluators/judge-selector.d.ts +19 -0
- package/dist/evaluators/judge-selector.js +141 -0
- package/dist/index.js +2 -0
- package/dist/judges/index.d.ts +54 -9
- package/dist/judges/index.js +72 -14
- package/dist/tools/register-evaluation.js +208 -8
- package/dist/tools/register-fix.js +24 -1
- package/dist/tools/register-resources.d.ts +6 -0
- package/dist/tools/register-resources.js +177 -0
- package/dist/tools/register-review.js +26 -1
- package/dist/tools/register-workflow.js +384 -11
- package/dist/tools/validation.d.ts +13 -0
- package/dist/tools/validation.js +77 -0
- package/dist/types.d.ts +122 -0
- package/package.json +24 -12
- package/server.json +2 -2
package/dist/api.d.ts
CHANGED
|
@@ -8,12 +8,16 @@
|
|
|
8
8
|
* const result = evaluateCode("const x = eval(input);", "typescript");
|
|
9
9
|
* ```
|
|
10
10
|
*/
|
|
11
|
-
export type { Severity, Verdict, Finding, Patch, LangFamily, JudgesConfig, RuleOverride, ProjectFile, ProjectVerdict, DiffVerdict, DependencyEntry, DependencyVerdict, JudgeEvaluation, TribunalVerdict, JudgeDefinition, EvaluationContextV2, EvidenceBundleV2, SpecializedFindingV2, TribunalVerdictV2, MustFixGateOptions, MustFixGateResult, AppBuilderWorkflowResult, PlainLanguageFinding, WorkflowTask, PolicyProfile, SuppressionRecord, SuppressionResult, } from "./types.js";
|
|
11
|
+
export type { Severity, Verdict, Finding, Patch, LangFamily, JudgesConfig, RuleOverride, ProjectFile, ProjectVerdict, DiffVerdict, DependencyEntry, DependencyVerdict, JudgeEvaluation, TribunalVerdict, JudgeDefinition, EvaluationContextV2, EvidenceBundleV2, SpecializedFindingV2, TribunalVerdictV2, MustFixGateOptions, MustFixGateResult, AppBuilderWorkflowResult, PlainLanguageFinding, WorkflowTask, PolicyProfile, SuppressionRecord, SuppressionResult, ExecutionTrace, RuleTrace, StreamingBatch, JudgeSelectionContext, JudgeSelectionResult, SessionContext, } from "./types.js";
|
|
12
12
|
export { JudgesError, ConfigError, EvaluationError, ParseError } from "./errors.js";
|
|
13
13
|
export { parseConfig, defaultConfig, mergeConfigs, discoverCascadingConfigs, loadCascadingConfig, loadPluginJudges, validatePluginSpecifiers, isValidJudgeDefinition, applyOverridesForFile, applyLanguageProfile, resolveExtendsConfig, } from "./config.js";
|
|
14
14
|
export { JUDGES, getJudge, getJudgeSummaries } from "./judges/index.js";
|
|
15
|
-
export { evaluateWithJudge, evaluateWithTribunal, evaluateProject, evaluateDiff, analyzeDependencies, enrichWithPatches, crossEvaluatorDedup, diffFindings, formatFindingDiff, evaluateNetChangeGate, applyInlineSuppressions, applyInlineSuppressionsWithAudit, runAppBuilderWorkflow, formatVerdictAsMarkdown, formatEvaluationAsMarkdown, clearEvaluationCaches, scanProjectWideSecurityPatterns, } from "./evaluators/index.js";
|
|
16
|
-
export type { FindingDiff, NetChangeGateOptions, NetChangeGateResult } from "./evaluators/index.js";
|
|
15
|
+
export { evaluateWithJudge, evaluateWithTribunal, evaluateWithTribunalStreaming, evaluateProject, evaluateDiff, analyzeDependencies, enrichWithPatches, crossEvaluatorDedup, diffFindings, formatFindingDiff, evaluateNetChangeGate, applyInlineSuppressions, applyInlineSuppressionsWithAudit, runAppBuilderWorkflow, formatVerdictAsMarkdown, formatEvaluationAsMarkdown, clearEvaluationCaches, scanProjectWideSecurityPatterns, } from "./evaluators/index.js";
|
|
16
|
+
export type { FindingDiff, NetChangeGateOptions, NetChangeGateResult, EvaluationOptions } from "./evaluators/index.js";
|
|
17
|
+
export { selectJudges } from "./evaluators/judge-selector.js";
|
|
18
|
+
export { EvaluationSession, getGlobalSession, resetGlobalSession } from "./evaluation-session.js";
|
|
19
|
+
export { getPreset, composePresets, PRESETS } from "./presets.js";
|
|
20
|
+
export type { Preset } from "./presets.js";
|
|
17
21
|
export { evaluateCodeV2, evaluateProjectV2, getSupportedPolicyProfiles } from "./evaluators/v2.js";
|
|
18
22
|
export { analyzeCrossFileTaint } from "./ast/cross-file-taint.js";
|
|
19
23
|
export { buildSingleJudgeDeepReviewSection, buildTribunalDeepReviewSection, buildSimplifiedDeepReviewSection, isContentPolicyRefusal, DEEP_REVIEW_PROMPT_INTRO, DEEP_REVIEW_IDENTITY, } from "./tools/deep-review.js";
|
package/dist/api.js
CHANGED
|
@@ -15,7 +15,13 @@ export { parseConfig, defaultConfig, mergeConfigs, discoverCascadingConfigs, loa
|
|
|
15
15
|
// ─── Judge Registry ──────────────────────────────────────────────────────────
|
|
16
16
|
export { JUDGES, getJudge, getJudgeSummaries } from "./judges/index.js";
|
|
17
17
|
// ─── Core Evaluation Functions ───────────────────────────────────────────────
|
|
18
|
-
export { evaluateWithJudge, evaluateWithTribunal, evaluateProject, evaluateDiff, analyzeDependencies, enrichWithPatches, crossEvaluatorDedup, diffFindings, formatFindingDiff, evaluateNetChangeGate, applyInlineSuppressions, applyInlineSuppressionsWithAudit, runAppBuilderWorkflow, formatVerdictAsMarkdown, formatEvaluationAsMarkdown, clearEvaluationCaches, scanProjectWideSecurityPatterns, } from "./evaluators/index.js";
|
|
18
|
+
export { evaluateWithJudge, evaluateWithTribunal, evaluateWithTribunalStreaming, evaluateProject, evaluateDiff, analyzeDependencies, enrichWithPatches, crossEvaluatorDedup, diffFindings, formatFindingDiff, evaluateNetChangeGate, applyInlineSuppressions, applyInlineSuppressionsWithAudit, runAppBuilderWorkflow, formatVerdictAsMarkdown, formatEvaluationAsMarkdown, clearEvaluationCaches, scanProjectWideSecurityPatterns, } from "./evaluators/index.js";
|
|
19
|
+
// ─── Adaptive Judge Selection ────────────────────────────────────────────────
|
|
20
|
+
export { selectJudges } from "./evaluators/judge-selector.js";
|
|
21
|
+
// ─── Evaluation Session ─────────────────────────────────────────────────────
|
|
22
|
+
export { EvaluationSession, getGlobalSession, resetGlobalSession } from "./evaluation-session.js";
|
|
23
|
+
// ─── Presets ─────────────────────────────────────────────────────────────────
|
|
24
|
+
export { getPreset, composePresets, PRESETS } from "./presets.js";
|
|
19
25
|
// ─── V2 Policy-Aware API ────────────────────────────────────────────────────
|
|
20
26
|
export { evaluateCodeV2, evaluateProjectV2, getSupportedPolicyProfiles } from "./evaluators/v2.js";
|
|
21
27
|
// ─── Cross-File Taint Analysis ───────────────────────────────────────────────
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Evaluation session — persistent context that survives across multiple
|
|
3
|
+
* evaluation calls within the same session (MCP connection, VS Code
|
|
4
|
+
* extension lifetime, or CLI watch mode).
|
|
5
|
+
*
|
|
6
|
+
* Avoids redundant framework detection, capability scanning, and feedback
|
|
7
|
+
* loading. Tracks verdict evolution per file for stability detection.
|
|
8
|
+
*/
|
|
9
|
+
import type { SessionContext, TribunalVerdict } from "./types.js";
|
|
10
|
+
/**
|
|
11
|
+
* An evaluation session that accumulates project knowledge across calls.
|
|
12
|
+
*/
|
|
13
|
+
export declare class EvaluationSession {
|
|
14
|
+
private ctx;
|
|
15
|
+
constructor();
|
|
16
|
+
/** Get the current session context (read-only snapshot). */
|
|
17
|
+
getContext(): Readonly<SessionContext>;
|
|
18
|
+
/** Number of evaluations performed. */
|
|
19
|
+
get evaluationCount(): number;
|
|
20
|
+
/** Record detected frameworks (deduplicated). */
|
|
21
|
+
addFrameworks(frameworks: string[]): void;
|
|
22
|
+
/** Record detected project capabilities (e.g. "rate-limiting", "auth"). */
|
|
23
|
+
addCapabilities(caps: Iterable<string>): void;
|
|
24
|
+
/** Get accumulated capabilities for absence-based finding suppression. */
|
|
25
|
+
getCapabilities(): Set<string>;
|
|
26
|
+
/**
|
|
27
|
+
* Record an evaluation result for a file. Tracks verdict history
|
|
28
|
+
* so repeated evaluations can detect stability (converging scores).
|
|
29
|
+
*/
|
|
30
|
+
recordEvaluation(filePath: string, code: string, verdict: TribunalVerdict): void;
|
|
31
|
+
/**
|
|
32
|
+
* Check if a file's verdict is stable — same score and finding count
|
|
33
|
+
* across the last N evaluations. Returns true if stable (skip re-eval).
|
|
34
|
+
*/
|
|
35
|
+
isVerdictStable(filePath: string, minRuns?: number): boolean;
|
|
36
|
+
/**
|
|
37
|
+
* Check if a file has already been evaluated with the same content.
|
|
38
|
+
*/
|
|
39
|
+
hasEvaluated(filePath: string, code: string): boolean;
|
|
40
|
+
/**
|
|
41
|
+
* Get verdict history for a file — most recent first.
|
|
42
|
+
*/
|
|
43
|
+
getVerdictHistory(filePath: string): Array<{
|
|
44
|
+
score: number;
|
|
45
|
+
findingCount: number;
|
|
46
|
+
timestamp: string;
|
|
47
|
+
}>;
|
|
48
|
+
/** Reset the session (clear all accumulated context). */
|
|
49
|
+
reset(): void;
|
|
50
|
+
/**
|
|
51
|
+
* Record user feedback for a finding rule.
|
|
52
|
+
* tp = true positive, fp = false positive, wontfix = acknowledged but skipped.
|
|
53
|
+
*/
|
|
54
|
+
recordFeedback(ruleId: string, verdict: "tp" | "fp" | "wontfix"): void;
|
|
55
|
+
/**
|
|
56
|
+
* Get a confidence penalty for a rule based on accumulated FP feedback.
|
|
57
|
+
* Returns a multiplier in (0, 1] — 1.0 means no penalty, lower means
|
|
58
|
+
* the rule has been flagged as FP frequently and confidence should be reduced.
|
|
59
|
+
*
|
|
60
|
+
* Formula: 1 / (1 + fpCount) — degrades smoothly as FP reports accumulate.
|
|
61
|
+
* A single FP report halves confidence; two reports reduce it to 1/3, etc.
|
|
62
|
+
*/
|
|
63
|
+
getConfidencePenalty(ruleId: string): number;
|
|
64
|
+
/** Get the raw feedback tally for all rules. */
|
|
65
|
+
getFeedbackTally(): ReadonlyMap<string, {
|
|
66
|
+
tp: number;
|
|
67
|
+
fp: number;
|
|
68
|
+
wontfix: number;
|
|
69
|
+
}>;
|
|
70
|
+
}
|
|
71
|
+
/** Get or create the global evaluation session (shared across MCP calls). */
|
|
72
|
+
export declare function getGlobalSession(): EvaluationSession;
|
|
73
|
+
/** Reset the global session (for testing or explicit reset). */
|
|
74
|
+
export declare function resetGlobalSession(): void;
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Evaluation session — persistent context that survives across multiple
|
|
3
|
+
* evaluation calls within the same session (MCP connection, VS Code
|
|
4
|
+
* extension lifetime, or CLI watch mode).
|
|
5
|
+
*
|
|
6
|
+
* Avoids redundant framework detection, capability scanning, and feedback
|
|
7
|
+
* loading. Tracks verdict evolution per file for stability detection.
|
|
8
|
+
*/
|
|
9
|
+
import { contentHash } from "./cache.js";
|
|
10
|
+
/**
|
|
11
|
+
* An evaluation session that accumulates project knowledge across calls.
|
|
12
|
+
*/
|
|
13
|
+
export class EvaluationSession {
|
|
14
|
+
ctx;
|
|
15
|
+
constructor() {
|
|
16
|
+
this.ctx = {
|
|
17
|
+
frameworks: [],
|
|
18
|
+
capabilities: new Set(),
|
|
19
|
+
verdictHistory: new Map(),
|
|
20
|
+
evaluatedFiles: new Map(),
|
|
21
|
+
startedAt: new Date().toISOString(),
|
|
22
|
+
evaluationCount: 0,
|
|
23
|
+
feedbackTally: new Map(),
|
|
24
|
+
};
|
|
25
|
+
}
|
|
26
|
+
/** Get the current session context (read-only snapshot). */
|
|
27
|
+
getContext() {
|
|
28
|
+
return this.ctx;
|
|
29
|
+
}
|
|
30
|
+
/** Number of evaluations performed. */
|
|
31
|
+
get evaluationCount() {
|
|
32
|
+
return this.ctx.evaluationCount;
|
|
33
|
+
}
|
|
34
|
+
/** Record detected frameworks (deduplicated). */
|
|
35
|
+
addFrameworks(frameworks) {
|
|
36
|
+
const existing = new Set(this.ctx.frameworks);
|
|
37
|
+
for (const fw of frameworks) {
|
|
38
|
+
if (!existing.has(fw)) {
|
|
39
|
+
this.ctx.frameworks.push(fw);
|
|
40
|
+
existing.add(fw);
|
|
41
|
+
}
|
|
42
|
+
}
|
|
43
|
+
}
|
|
44
|
+
/** Record detected project capabilities (e.g. "rate-limiting", "auth"). */
|
|
45
|
+
addCapabilities(caps) {
|
|
46
|
+
for (const cap of caps) {
|
|
47
|
+
this.ctx.capabilities.add(cap);
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
/** Get accumulated capabilities for absence-based finding suppression. */
|
|
51
|
+
getCapabilities() {
|
|
52
|
+
return this.ctx.capabilities;
|
|
53
|
+
}
|
|
54
|
+
/**
|
|
55
|
+
* Record an evaluation result for a file. Tracks verdict history
|
|
56
|
+
* so repeated evaluations can detect stability (converging scores).
|
|
57
|
+
*/
|
|
58
|
+
recordEvaluation(filePath, code, verdict) {
|
|
59
|
+
this.ctx.evaluationCount++;
|
|
60
|
+
const hash = contentHash(code, filePath);
|
|
61
|
+
this.ctx.evaluatedFiles.set(hash, filePath);
|
|
62
|
+
const history = this.ctx.verdictHistory.get(filePath) ?? [];
|
|
63
|
+
history.push({
|
|
64
|
+
score: verdict.overallScore,
|
|
65
|
+
findingCount: verdict.findings.length,
|
|
66
|
+
timestamp: verdict.timestamp,
|
|
67
|
+
});
|
|
68
|
+
// Keep last 10 evaluations per file
|
|
69
|
+
if (history.length > 10)
|
|
70
|
+
history.shift();
|
|
71
|
+
this.ctx.verdictHistory.set(filePath, history);
|
|
72
|
+
}
|
|
73
|
+
/**
|
|
74
|
+
* Check if a file's verdict is stable — same score and finding count
|
|
75
|
+
* across the last N evaluations. Returns true if stable (skip re-eval).
|
|
76
|
+
*/
|
|
77
|
+
isVerdictStable(filePath, minRuns = 3) {
|
|
78
|
+
const history = this.ctx.verdictHistory.get(filePath);
|
|
79
|
+
if (!history || history.length < minRuns)
|
|
80
|
+
return false;
|
|
81
|
+
const recent = history.slice(-minRuns);
|
|
82
|
+
const firstScore = recent[0].score;
|
|
83
|
+
const firstCount = recent[0].findingCount;
|
|
84
|
+
return recent.every((h) => h.score === firstScore && h.findingCount === firstCount);
|
|
85
|
+
}
|
|
86
|
+
/**
|
|
87
|
+
* Check if a file has already been evaluated with the same content.
|
|
88
|
+
*/
|
|
89
|
+
hasEvaluated(filePath, code) {
|
|
90
|
+
const hash = contentHash(code, filePath);
|
|
91
|
+
return this.ctx.evaluatedFiles.has(hash);
|
|
92
|
+
}
|
|
93
|
+
/**
|
|
94
|
+
* Get verdict history for a file — most recent first.
|
|
95
|
+
*/
|
|
96
|
+
getVerdictHistory(filePath) {
|
|
97
|
+
return [...(this.ctx.verdictHistory.get(filePath) ?? [])].reverse();
|
|
98
|
+
}
|
|
99
|
+
/** Reset the session (clear all accumulated context). */
|
|
100
|
+
reset() {
|
|
101
|
+
this.ctx = {
|
|
102
|
+
frameworks: [],
|
|
103
|
+
capabilities: new Set(),
|
|
104
|
+
verdictHistory: new Map(),
|
|
105
|
+
evaluatedFiles: new Map(),
|
|
106
|
+
startedAt: new Date().toISOString(),
|
|
107
|
+
evaluationCount: 0,
|
|
108
|
+
feedbackTally: new Map(),
|
|
109
|
+
};
|
|
110
|
+
}
|
|
111
|
+
/**
|
|
112
|
+
* Record user feedback for a finding rule.
|
|
113
|
+
* tp = true positive, fp = false positive, wontfix = acknowledged but skipped.
|
|
114
|
+
*/
|
|
115
|
+
recordFeedback(ruleId, verdict) {
|
|
116
|
+
const existing = this.ctx.feedbackTally.get(ruleId) ?? { tp: 0, fp: 0, wontfix: 0 };
|
|
117
|
+
existing[verdict]++;
|
|
118
|
+
this.ctx.feedbackTally.set(ruleId, existing);
|
|
119
|
+
}
|
|
120
|
+
/**
|
|
121
|
+
* Get a confidence penalty for a rule based on accumulated FP feedback.
|
|
122
|
+
* Returns a multiplier in (0, 1] — 1.0 means no penalty, lower means
|
|
123
|
+
* the rule has been flagged as FP frequently and confidence should be reduced.
|
|
124
|
+
*
|
|
125
|
+
* Formula: 1 / (1 + fpCount) — degrades smoothly as FP reports accumulate.
|
|
126
|
+
* A single FP report halves confidence; two reports reduce it to 1/3, etc.
|
|
127
|
+
*/
|
|
128
|
+
getConfidencePenalty(ruleId) {
|
|
129
|
+
const tally = this.ctx.feedbackTally.get(ruleId);
|
|
130
|
+
if (!tally || tally.fp === 0)
|
|
131
|
+
return 1.0;
|
|
132
|
+
return 1 / (1 + tally.fp);
|
|
133
|
+
}
|
|
134
|
+
/** Get the raw feedback tally for all rules. */
|
|
135
|
+
getFeedbackTally() {
|
|
136
|
+
return this.ctx.feedbackTally;
|
|
137
|
+
}
|
|
138
|
+
}
|
|
139
|
+
// ─── Singleton for MCP Server / Extension lifetime ──────────────────────────
|
|
140
|
+
let _globalSession;
|
|
141
|
+
/** Get or create the global evaluation session (shared across MCP calls). */
|
|
142
|
+
export function getGlobalSession() {
|
|
143
|
+
if (!_globalSession) {
|
|
144
|
+
_globalSession = new EvaluationSession();
|
|
145
|
+
}
|
|
146
|
+
return _globalSession;
|
|
147
|
+
}
|
|
148
|
+
/** Reset the global session (for testing or explicit reset). */
|
|
149
|
+
export function resetGlobalSession() {
|
|
150
|
+
_globalSession?.reset();
|
|
151
|
+
_globalSession = undefined;
|
|
152
|
+
}
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import type { JudgeDefinition, JudgeEvaluation, TribunalVerdict, ProjectVerdict, DiffVerdict, Finding, MustFixGateOptions, JudgesConfig, SuppressionResult } from "../types.js";
|
|
1
|
+
import type { JudgeDefinition, JudgeEvaluation, TribunalVerdict, ProjectVerdict, DiffVerdict, Finding, MustFixGateOptions, JudgesConfig, SuppressionResult, StreamingBatch } from "../types.js";
|
|
2
2
|
import type { CodeStructure } from "../ast/types.js";
|
|
3
3
|
import type { TaintFlow } from "../ast/taint-tracker.js";
|
|
4
4
|
import { formatVerdictAsMarkdown, formatEvaluationAsMarkdown } from "./shared.js";
|
|
@@ -53,6 +53,12 @@ export interface EvaluationOptions {
|
|
|
53
53
|
* Generated by `scanProjectCapabilities()` from the project evaluator.
|
|
54
54
|
*/
|
|
55
55
|
projectCapabilities?: Set<string>;
|
|
56
|
+
/**
|
|
57
|
+
* Enable adaptive judge selection — automatically skip judges that are
|
|
58
|
+
* irrelevant to the file's language, framework, or role. Reduces noise
|
|
59
|
+
* and improves performance. Defaults to false (run all judges).
|
|
60
|
+
*/
|
|
61
|
+
adaptiveSelection?: boolean;
|
|
56
62
|
/** @internal — pre-computed AST structure for the file (set by evaluateWithTribunal) */
|
|
57
63
|
_astCache?: CodeStructure;
|
|
58
64
|
/** @internal — pre-computed taint flows for the file (set by evaluateWithTribunal) */
|
|
@@ -88,6 +94,22 @@ export declare function evaluateWithJudge(judge: JudgeDefinition, code: string,
|
|
|
88
94
|
* Run the full tribunal — all judges evaluate the code.
|
|
89
95
|
*/
|
|
90
96
|
export declare function evaluateWithTribunal(code: string, language: string, context?: string, options?: EvaluationOptions): TribunalVerdict;
|
|
97
|
+
/**
|
|
98
|
+
* Streaming tribunal evaluation — yields per-judge results as each judge
|
|
99
|
+
* completes, enabling progressive UI updates and early termination.
|
|
100
|
+
*
|
|
101
|
+
* Each yielded `StreamingBatch` contains the judge evaluation, execution
|
|
102
|
+
* trace, and running aggregate statistics.
|
|
103
|
+
*
|
|
104
|
+
* Usage:
|
|
105
|
+
* ```ts
|
|
106
|
+
* for await (const batch of evaluateWithTribunalStreaming(code, lang)) {
|
|
107
|
+
* console.log(`${batch.judgeName}: ${batch.evaluation.findings.length} findings`);
|
|
108
|
+
* if (batch.aggregate.criticalSoFar > 10) break; // early termination
|
|
109
|
+
* }
|
|
110
|
+
* ```
|
|
111
|
+
*/
|
|
112
|
+
export declare function evaluateWithTribunalStreaming(code: string, language: string, context?: string, options?: EvaluationOptions): AsyncGenerator<StreamingBatch>;
|
|
91
113
|
export { scanProjectWideSecurityPatterns } from "./project.js";
|
|
92
114
|
export declare function evaluateProject(files: Array<{
|
|
93
115
|
path: string;
|
package/dist/evaluators/index.js
CHANGED
|
@@ -19,6 +19,8 @@ import { loadFeedbackStore } from "../commands/feedback.js";
|
|
|
19
19
|
import { CROSS_FILE_SECURITY_CATEGORIES } from "./project.js";
|
|
20
20
|
import { applyTriageFeedback, loadFindingStore } from "../finding-lifecycle.js";
|
|
21
21
|
import { enrichWithSecurityIds } from "../security-ids.js";
|
|
22
|
+
import { selectJudges } from "./judge-selector.js";
|
|
23
|
+
import { getGlobalSession } from "../evaluation-session.js";
|
|
22
24
|
// ── AST-aware post-processing ───────────────────────────────────────────────
|
|
23
25
|
// ── Module-level caches for AST/taint results ───────────────────────────────
|
|
24
26
|
const astStructureCache = new LRUCache(256);
|
|
@@ -390,7 +392,19 @@ function resolveJudgeSet(options) {
|
|
|
390
392
|
const disabled = new Set(options.config.disabledJudges);
|
|
391
393
|
judges = judges.filter((j) => !disabled.has(j.id));
|
|
392
394
|
}
|
|
393
|
-
|
|
395
|
+
// Adaptive judge selection — skip irrelevant judges based on file context
|
|
396
|
+
if (options?.adaptiveSelection && options.filePath) {
|
|
397
|
+
const fileCategory = classifyFile("", options.filePath.split(".").pop() ?? "", options.filePath);
|
|
398
|
+
const ctx = {
|
|
399
|
+
language: options.filePath.split(".").pop() ?? "unknown",
|
|
400
|
+
fileCategory,
|
|
401
|
+
filePath: options.filePath,
|
|
402
|
+
projectMode: options.projectMode,
|
|
403
|
+
};
|
|
404
|
+
const result = selectJudges(judges, ctx);
|
|
405
|
+
return { judges: result.selected, skipped: result.skipped };
|
|
406
|
+
}
|
|
407
|
+
return { judges };
|
|
394
408
|
}
|
|
395
409
|
/**
|
|
396
410
|
* Check whether an absence-based finding is mitigated by a pre-scanned
|
|
@@ -649,7 +663,7 @@ export function evaluateWithTribunal(code, language, context, options) {
|
|
|
649
663
|
...(astResult ? { _astCache: astResult } : {}),
|
|
650
664
|
...(taintResult ? { _taintFlows: taintResult } : {}),
|
|
651
665
|
};
|
|
652
|
-
const judges = resolveJudgeSet(enrichedOptions);
|
|
666
|
+
const { judges, skipped: skippedJudges } = resolveJudgeSet(enrichedOptions);
|
|
653
667
|
const tribunalStart = performance.now();
|
|
654
668
|
const evaluations = judges.map((judge) => {
|
|
655
669
|
const start = performance.now();
|
|
@@ -776,7 +790,30 @@ export function evaluateWithTribunal(code, language, context, options) {
|
|
|
776
790
|
// No triage data or error loading — continue without adjustment
|
|
777
791
|
}
|
|
778
792
|
const maxFindings = options?.maxFindingsPerFile ?? DEFAULT_MAX_FINDINGS_PER_FILE;
|
|
779
|
-
|
|
793
|
+
// ── Session feedback calibration ──
|
|
794
|
+
// Apply confidence penalties from accumulated FP feedback in the
|
|
795
|
+
// current evaluation session. This is the real-time agentic loop:
|
|
796
|
+
// user marks findings as FP → session records it → subsequent
|
|
797
|
+
// evaluations automatically reduce confidence on those rules.
|
|
798
|
+
let sessionAdjusted = triageAdjusted;
|
|
799
|
+
try {
|
|
800
|
+
const session = getGlobalSession();
|
|
801
|
+
const tally = session.getFeedbackTally();
|
|
802
|
+
if (tally.size > 0) {
|
|
803
|
+
sessionAdjusted = triageAdjusted.map((f) => {
|
|
804
|
+
const penalty = session.getConfidencePenalty(f.ruleId);
|
|
805
|
+
if (penalty < 1.0) {
|
|
806
|
+
const adjusted = clampConfidence((f.confidence ?? 0.5) * penalty);
|
|
807
|
+
return { ...f, confidence: adjusted };
|
|
808
|
+
}
|
|
809
|
+
return f;
|
|
810
|
+
});
|
|
811
|
+
}
|
|
812
|
+
}
|
|
813
|
+
catch {
|
|
814
|
+
// Session feedback calibration failure is non-fatal
|
|
815
|
+
}
|
|
816
|
+
const cappedFindings = applyPerFileFindingCap(sessionAdjusted, maxFindings);
|
|
780
817
|
// ── Confidence-based tiering for progressive disclosure ──
|
|
781
818
|
// Tag each finding with a disclosure tier so downstream consumers (CLI,
|
|
782
819
|
// formatters, VS Code extension) can show only high-confidence findings
|
|
@@ -863,6 +900,129 @@ export function evaluateWithTribunal(code, language, context, options) {
|
|
|
863
900
|
}
|
|
864
901
|
return result;
|
|
865
902
|
}
|
|
903
|
+
// ─── Streaming Evaluation ────────────────────────────────────────────────────
|
|
904
|
+
/**
|
|
905
|
+
* Streaming tribunal evaluation — yields per-judge results as each judge
|
|
906
|
+
* completes, enabling progressive UI updates and early termination.
|
|
907
|
+
*
|
|
908
|
+
* Each yielded `StreamingBatch` contains the judge evaluation, execution
|
|
909
|
+
* trace, and running aggregate statistics.
|
|
910
|
+
*
|
|
911
|
+
* Usage:
|
|
912
|
+
* ```ts
|
|
913
|
+
* for await (const batch of evaluateWithTribunalStreaming(code, lang)) {
|
|
914
|
+
* console.log(`${batch.judgeName}: ${batch.evaluation.findings.length} findings`);
|
|
915
|
+
* if (batch.aggregate.criticalSoFar > 10) break; // early termination
|
|
916
|
+
* }
|
|
917
|
+
* ```
|
|
918
|
+
*/
|
|
919
|
+
export async function* evaluateWithTribunalStreaming(code, language, context, options) {
|
|
920
|
+
const includeAst = options?.includeAstFindings ?? true;
|
|
921
|
+
const hash = contentHash(code, language);
|
|
922
|
+
let astResult = options?._astCache;
|
|
923
|
+
if (!astResult && includeAst) {
|
|
924
|
+
astResult = astStructureCache.get(hash);
|
|
925
|
+
if (!astResult) {
|
|
926
|
+
astResult = analyzeStructure(code, language);
|
|
927
|
+
astStructureCache.set(hash, astResult);
|
|
928
|
+
}
|
|
929
|
+
}
|
|
930
|
+
let taintResult = options?._taintFlows;
|
|
931
|
+
if (!taintResult) {
|
|
932
|
+
taintResult = taintFlowCache.get(hash);
|
|
933
|
+
if (!taintResult) {
|
|
934
|
+
taintResult = analyzeTaintFlows(code, language);
|
|
935
|
+
taintFlowCache.set(hash, taintResult);
|
|
936
|
+
}
|
|
937
|
+
}
|
|
938
|
+
const enrichedOptions = {
|
|
939
|
+
...options,
|
|
940
|
+
...(astResult ? { _astCache: astResult } : {}),
|
|
941
|
+
...(taintResult ? { _taintFlows: taintResult } : {}),
|
|
942
|
+
};
|
|
943
|
+
const { judges, skipped: skippedJudges } = resolveJudgeSet(enrichedOptions);
|
|
944
|
+
const totalJudges = judges.length;
|
|
945
|
+
let completedJudges = 0;
|
|
946
|
+
let findingsSoFar = 0;
|
|
947
|
+
let criticalSoFar = 0;
|
|
948
|
+
let highSoFar = 0;
|
|
949
|
+
let scoreSum = 0;
|
|
950
|
+
let hasFailure = false;
|
|
951
|
+
let hasWarning = false;
|
|
952
|
+
for (const judge of judges) {
|
|
953
|
+
const start = performance.now();
|
|
954
|
+
const evaluation = evaluateWithJudge(judge, code, language, context, enrichedOptions);
|
|
955
|
+
const durationMs = Math.round(performance.now() - start);
|
|
956
|
+
evaluation.durationMs = durationMs;
|
|
957
|
+
completedJudges++;
|
|
958
|
+
findingsSoFar += evaluation.findings.length;
|
|
959
|
+
criticalSoFar += evaluation.findings.filter((f) => f.severity === "critical").length;
|
|
960
|
+
highSoFar += evaluation.findings.filter((f) => f.severity === "high").length;
|
|
961
|
+
scoreSum += evaluation.score;
|
|
962
|
+
if (evaluation.verdict === "fail")
|
|
963
|
+
hasFailure = true;
|
|
964
|
+
if (evaluation.verdict === "warning")
|
|
965
|
+
hasWarning = true;
|
|
966
|
+
const trace = {
|
|
967
|
+
judgeId: judge.id,
|
|
968
|
+
judgeName: judge.name,
|
|
969
|
+
durationMs,
|
|
970
|
+
rules: buildRuleTraces(evaluation),
|
|
971
|
+
rawFindingCount: evaluation.findings.length,
|
|
972
|
+
finalFindingCount: evaluation.findings.length,
|
|
973
|
+
...(astResult
|
|
974
|
+
? {
|
|
975
|
+
astResolution: {
|
|
976
|
+
functionsAnalyzed: astResult.functions.length,
|
|
977
|
+
maxComplexity: Math.max(0, ...astResult.functions.map((f) => f.cyclomaticComplexity)),
|
|
978
|
+
taintFlowsDetected: taintResult?.length ?? 0,
|
|
979
|
+
},
|
|
980
|
+
}
|
|
981
|
+
: {}),
|
|
982
|
+
};
|
|
983
|
+
const currentVerdict = hasFailure ? "fail" : hasWarning ? "warning" : "pass";
|
|
984
|
+
yield {
|
|
985
|
+
judgeId: judge.id,
|
|
986
|
+
judgeName: judge.name,
|
|
987
|
+
evaluation,
|
|
988
|
+
trace,
|
|
989
|
+
aggregate: {
|
|
990
|
+
completedJudges,
|
|
991
|
+
totalJudges,
|
|
992
|
+
findingsSoFar,
|
|
993
|
+
criticalSoFar,
|
|
994
|
+
highSoFar,
|
|
995
|
+
currentScore: Math.round(scoreSum / completedJudges),
|
|
996
|
+
currentVerdict,
|
|
997
|
+
},
|
|
998
|
+
done: completedJudges === totalJudges,
|
|
999
|
+
};
|
|
1000
|
+
// Yield to the event loop between judges for responsiveness
|
|
1001
|
+
await new Promise((r) => setTimeout(r, 0));
|
|
1002
|
+
}
|
|
1003
|
+
}
|
|
1004
|
+
/**
|
|
1005
|
+
* Build rule-level traces from a judge evaluation for observability.
|
|
1006
|
+
*/
|
|
1007
|
+
function buildRuleTraces(evaluation) {
|
|
1008
|
+
const ruleMap = new Map();
|
|
1009
|
+
for (const f of evaluation.findings) {
|
|
1010
|
+
const existing = ruleMap.get(f.ruleId);
|
|
1011
|
+
if (existing) {
|
|
1012
|
+
existing.count++;
|
|
1013
|
+
existing.peakConf = Math.max(existing.peakConf, f.confidence ?? 0.5);
|
|
1014
|
+
}
|
|
1015
|
+
else {
|
|
1016
|
+
ruleMap.set(f.ruleId, { count: 1, peakConf: f.confidence ?? 0.5 });
|
|
1017
|
+
}
|
|
1018
|
+
}
|
|
1019
|
+
return [...ruleMap.entries()].map(([ruleId, { count, peakConf }]) => ({
|
|
1020
|
+
ruleId,
|
|
1021
|
+
matched: true,
|
|
1022
|
+
findingCount: count,
|
|
1023
|
+
peakConfidence: peakConf,
|
|
1024
|
+
}));
|
|
1025
|
+
}
|
|
866
1026
|
// ─── Project-level Multi-file Analysis (delegated to project.ts) ─────────────
|
|
867
1027
|
import { evaluateProject as _evaluateProject } from "./project.js";
|
|
868
1028
|
export { scanProjectWideSecurityPatterns } from "./project.js";
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Adaptive judge selection — picks only the judges relevant to a given file
|
|
3
|
+
* based on language, framework, file role, and project context.
|
|
4
|
+
*
|
|
5
|
+
* Eliminates wasted work (e.g. running "testing" judge on a Dockerfile,
|
|
6
|
+
* or "iac-security" on a React component) while keeping the full panel
|
|
7
|
+
* available for explicit requests.
|
|
8
|
+
*/
|
|
9
|
+
import type { JudgeDefinition, JudgeSelectionContext, JudgeSelectionResult } from "../types.js";
|
|
10
|
+
/**
|
|
11
|
+
* Select the most relevant judges for a given file context.
|
|
12
|
+
*
|
|
13
|
+
* Strategy:
|
|
14
|
+
* 1. Always include core judges (security, false-positive-review)
|
|
15
|
+
* 2. Skip judges with language incompatibility
|
|
16
|
+
* 3. Skip judges irrelevant to the file category
|
|
17
|
+
* 4. Return selection with skip reasons for observability
|
|
18
|
+
*/
|
|
19
|
+
export declare function selectJudges(judges: JudgeDefinition[], ctx: JudgeSelectionContext): JudgeSelectionResult;
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Adaptive judge selection — picks only the judges relevant to a given file
|
|
3
|
+
* based on language, framework, file role, and project context.
|
|
4
|
+
*
|
|
5
|
+
* Eliminates wasted work (e.g. running "testing" judge on a Dockerfile,
|
|
6
|
+
* or "iac-security" on a React component) while keeping the full panel
|
|
7
|
+
* available for explicit requests.
|
|
8
|
+
*/
|
|
9
|
+
// ─── Language → judge relevance ──────────────────────────────────────────────
|
|
10
|
+
/**
|
|
11
|
+
* Judges that are ONLY relevant for specific language families.
|
|
12
|
+
* If the language isn't listed, the judge is skipped.
|
|
13
|
+
* Most judges are language-agnostic and not listed here.
|
|
14
|
+
*/
|
|
15
|
+
const LANGUAGE_SPECIFIC = {
|
|
16
|
+
// IaC judges only apply to infrastructure languages
|
|
17
|
+
"iac-security": new Set(["terraform", "bicep", "arm", "dockerfile", "yaml"]),
|
|
18
|
+
};
|
|
19
|
+
/**
|
|
20
|
+
* Judges to SKIP for specific languages — inverse of above.
|
|
21
|
+
* E.g. testing patterns don't apply to SQL or Dockerfile.
|
|
22
|
+
*/
|
|
23
|
+
const LANGUAGE_SKIP = {
|
|
24
|
+
testing: new Set(["sql", "dockerfile", "terraform", "bicep", "arm", "yaml"]),
|
|
25
|
+
documentation: new Set(["sql", "dockerfile", "terraform", "bicep", "arm"]),
|
|
26
|
+
"code-structure": new Set(["sql", "dockerfile", "yaml"]),
|
|
27
|
+
ux: new Set(["sql", "dockerfile", "terraform", "bicep", "arm", "bash", "powershell"]),
|
|
28
|
+
accessibility: new Set(["sql", "dockerfile", "terraform", "bicep", "arm", "bash", "powershell"]),
|
|
29
|
+
internationalization: new Set(["sql", "dockerfile", "terraform", "bicep", "arm"]),
|
|
30
|
+
concurrency: new Set(["sql", "dockerfile", "terraform", "bicep", "arm", "yaml"]),
|
|
31
|
+
"over-engineering": new Set(["sql", "dockerfile", "terraform", "bicep", "arm", "yaml"]),
|
|
32
|
+
};
|
|
33
|
+
// ─── File category → judge relevance ────────────────────────────────────────
/**
 * Judges skipped when evaluating test files — these concerns target
 * production code and only produce noise against test suites.
 */
const SKIP_FOR_TESTS = new Set([
    // project-hygiene / operational concerns
    "documentation", "rate-limiting", "scalability", "cloud-readiness",
    "ci-cd", "configuration-management", "cost-effectiveness",
    // regulatory / end-user concerns
    "data-sovereignty", "compliance", "internationalization",
    "ux", "accessibility", "observability",
]);
|
|
52
|
+
/**
 * Judges skipped for config/manifest files — structural and runtime-code
 * concerns that do not apply to declarative configuration.
 */
const SKIP_FOR_CONFIG = new Set([
    // code-quality concerns
    "testing", "documentation", "code-structure", "error-handling",
    "performance", "concurrency", "scalability",
    // end-user concerns
    "ux", "accessibility", "internationalization",
    // design concerns
    "over-engineering", "backwards-compatibility", "maintainability",
]);
|
|
70
|
+
/**
 * Judges skipped for IaC files (Terraform, Bicep, ARM, Dockerfile) —
 * application-code and LLM-output concerns with no IaC equivalent.
 */
const SKIP_FOR_IAC = new Set([
    // application-code concerns
    "testing", "code-structure", "concurrency", "over-engineering",
    // end-user concerns
    "ux", "accessibility", "internationalization",
    // API concerns
    "api-design", "api-contract", "backwards-compatibility",
    // LLM-output concerns
    "hallucination-detection", "multi-turn-coherence", "model-fingerprint",
]);
|
|
88
|
+
// ─── Core judges that always run ─────────────────────────────────────────────
/** Judges that run unconditionally; they cover universally applicable concerns. */
const ALWAYS_RUN = new Set([
    "security",
    "cybersecurity",
    "false-positive-review",
]);
|
|
91
|
+
// ─── Selection logic ─────────────────────────────────────────────────────────
/**
 * Select the most relevant judges for a given file context.
 *
 * Strategy:
 * 1. Always include core judges (security, cybersecurity, false-positive-review)
 * 2. Skip judges with language incompatibility
 * 3. Skip judges irrelevant to the file category (test / config / iac)
 * 4. Return the selection with skip reasons for observability
 *
 * @param {Array<{id: string}>} judges - Full judge panel (JudgeDefinition[]).
 * @param {{language: string, fileCategory?: string}} ctx - Selection context;
 *   `language` is required, `fileCategory` optional.
 * @returns {{selected: object[], skipped: Array<{judgeId: string, reason: string}>}}
 */
export function selectJudges(judges, ctx) {
    const selected = [];
    const skipped = [];
    const lang = ctx.language.toLowerCase();
    const cat = ctx.fileCategory?.toLowerCase() ?? "";
    // Own-property lookup guard: the tables are plain object literals, so a
    // bare `table[judge.id]` would also resolve Object.prototype members
    // (e.g. an id of "toString" yields a function, which is truthy and then
    // crashes on `.has`). Only own entries count.
    const ownEntry = (table, key) => (Object.prototype.hasOwnProperty.call(table, key) ? table[key] : undefined);
    // Loop-invariant: whether this file counts as infrastructure code.
    const isIac = cat === "iac" ||
        lang === "terraform" || lang === "bicep" || lang === "arm" || lang === "dockerfile";
    for (const judge of judges) {
        // Core judges always run
        if (ALWAYS_RUN.has(judge.id)) {
            selected.push(judge);
            continue;
        }
        // Language-specific judge: skip if language not in its set
        const langOnly = ownEntry(LANGUAGE_SPECIFIC, judge.id);
        if (langOnly && !langOnly.has(lang)) {
            skipped.push({ judgeId: judge.id, reason: `not relevant for language: ${lang}` });
            continue;
        }
        // Language skip: judge not useful for this language
        const langSkip = ownEntry(LANGUAGE_SKIP, judge.id);
        if (langSkip && langSkip.has(lang)) {
            skipped.push({ judgeId: judge.id, reason: `skipped for language: ${lang}` });
            continue;
        }
        // File category gating
        if (cat === "test" && SKIP_FOR_TESTS.has(judge.id)) {
            skipped.push({ judgeId: judge.id, reason: "not relevant for test files" });
            continue;
        }
        if (cat === "config" && SKIP_FOR_CONFIG.has(judge.id)) {
            skipped.push({ judgeId: judge.id, reason: "not relevant for config files" });
            continue;
        }
        if (isIac && SKIP_FOR_IAC.has(judge.id)) {
            skipped.push({ judgeId: judge.id, reason: "not relevant for infrastructure code" });
            continue;
        }
        selected.push(judge);
    }
    return { selected, skipped };
}
|