@kevinrabun/judges 3.38.0 → 3.41.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +68 -0
- package/README.md +5 -4
- package/dist/api.d.ts +5 -2
- package/dist/api.d.ts.map +1 -1
- package/dist/api.js +5 -1
- package/dist/api.js.map +1 -1
- package/dist/ast/structural-parser.js +3 -3
- package/dist/ast/structural-parser.js.map +1 -1
- package/dist/calibration.d.ts +35 -0
- package/dist/calibration.d.ts.map +1 -1
- package/dist/calibration.js +52 -0
- package/dist/calibration.js.map +1 -1
- package/dist/cli.d.ts.map +1 -1
- package/dist/cli.js +370 -16
- package/dist/cli.js.map +1 -1
- package/dist/commands/auto-calibrate.d.ts +15 -0
- package/dist/commands/auto-calibrate.d.ts.map +1 -0
- package/dist/commands/auto-calibrate.js +107 -0
- package/dist/commands/auto-calibrate.js.map +1 -0
- package/dist/commands/benchmark-languages.js +4 -4
- package/dist/commands/benchmark.d.ts +2 -1
- package/dist/commands/benchmark.d.ts.map +1 -1
- package/dist/commands/benchmark.js +67 -2
- package/dist/commands/benchmark.js.map +1 -1
- package/dist/commands/calibration-dashboard.d.ts.map +1 -1
- package/dist/commands/calibration-dashboard.js +198 -0
- package/dist/commands/calibration-dashboard.js.map +1 -1
- package/dist/commands/calibration-share.d.ts +31 -0
- package/dist/commands/calibration-share.d.ts.map +1 -0
- package/dist/commands/calibration-share.js +183 -0
- package/dist/commands/calibration-share.js.map +1 -0
- package/dist/commands/compliance-report.d.ts +35 -0
- package/dist/commands/compliance-report.d.ts.map +1 -0
- package/dist/commands/compliance-report.js +162 -0
- package/dist/commands/compliance-report.js.map +1 -0
- package/dist/commands/config-migrate.d.ts +44 -0
- package/dist/commands/config-migrate.d.ts.map +1 -0
- package/dist/commands/config-migrate.js +241 -0
- package/dist/commands/config-migrate.js.map +1 -0
- package/dist/commands/dedup-report.d.ts +13 -0
- package/dist/commands/dedup-report.d.ts.map +1 -0
- package/dist/commands/dedup-report.js +138 -0
- package/dist/commands/dedup-report.js.map +1 -0
- package/dist/commands/dep-audit.d.ts +53 -0
- package/dist/commands/dep-audit.d.ts.map +1 -0
- package/dist/commands/dep-audit.js +278 -0
- package/dist/commands/dep-audit.js.map +1 -0
- package/dist/commands/deprecated.d.ts +48 -0
- package/dist/commands/deprecated.d.ts.map +1 -0
- package/dist/commands/deprecated.js +202 -0
- package/dist/commands/deprecated.js.map +1 -0
- package/dist/commands/diff.d.ts.map +1 -1
- package/dist/commands/diff.js +8 -3
- package/dist/commands/diff.js.map +1 -1
- package/dist/commands/feedback-rules.d.ts +29 -0
- package/dist/commands/feedback-rules.d.ts.map +1 -0
- package/dist/commands/feedback-rules.js +174 -0
- package/dist/commands/feedback-rules.js.map +1 -0
- package/dist/commands/feedback.d.ts +12 -0
- package/dist/commands/feedback.d.ts.map +1 -1
- package/dist/commands/feedback.js +16 -0
- package/dist/commands/feedback.js.map +1 -1
- package/dist/commands/fix-pr.d.ts +23 -0
- package/dist/commands/fix-pr.d.ts.map +1 -0
- package/dist/commands/fix-pr.js +323 -0
- package/dist/commands/fix-pr.js.map +1 -0
- package/dist/commands/fix.d.ts.map +1 -1
- package/dist/commands/fix.js +33 -1
- package/dist/commands/fix.js.map +1 -1
- package/dist/commands/governance.d.ts +32 -0
- package/dist/commands/governance.d.ts.map +1 -0
- package/dist/commands/governance.js +203 -0
- package/dist/commands/governance.js.map +1 -0
- package/dist/commands/help.d.ts +8 -0
- package/dist/commands/help.d.ts.map +1 -0
- package/dist/commands/help.js +303 -0
- package/dist/commands/help.js.map +1 -0
- package/dist/commands/hook.d.ts.map +1 -1
- package/dist/commands/hook.js +17 -20
- package/dist/commands/hook.js.map +1 -1
- package/dist/commands/interactive-fix.d.ts +23 -0
- package/dist/commands/interactive-fix.d.ts.map +1 -0
- package/dist/commands/interactive-fix.js +140 -0
- package/dist/commands/interactive-fix.js.map +1 -0
- package/dist/commands/llm-benchmark.d.ts +119 -0
- package/dist/commands/llm-benchmark.d.ts.map +1 -0
- package/dist/commands/llm-benchmark.js +396 -0
- package/dist/commands/llm-benchmark.js.map +1 -0
- package/dist/commands/metrics-dashboard.d.ts +22 -0
- package/dist/commands/metrics-dashboard.d.ts.map +1 -0
- package/dist/commands/metrics-dashboard.js +335 -0
- package/dist/commands/metrics-dashboard.js.map +1 -0
- package/dist/commands/metrics.d.ts +58 -0
- package/dist/commands/metrics.d.ts.map +1 -0
- package/dist/commands/metrics.js +242 -0
- package/dist/commands/metrics.js.map +1 -0
- package/dist/commands/monorepo.d.ts +38 -0
- package/dist/commands/monorepo.d.ts.map +1 -0
- package/dist/commands/monorepo.js +233 -0
- package/dist/commands/monorepo.js.map +1 -0
- package/dist/commands/notify.d.ts +79 -0
- package/dist/commands/notify.d.ts.map +1 -0
- package/dist/commands/notify.js +325 -0
- package/dist/commands/notify.js.map +1 -0
- package/dist/commands/onboard.d.ts +13 -0
- package/dist/commands/onboard.d.ts.map +1 -0
- package/dist/commands/onboard.js +179 -0
- package/dist/commands/onboard.js.map +1 -0
- package/dist/commands/org-metrics.d.ts +24 -0
- package/dist/commands/org-metrics.d.ts.map +1 -0
- package/dist/commands/org-metrics.js +238 -0
- package/dist/commands/org-metrics.js.map +1 -0
- package/dist/commands/override.d.ts +62 -0
- package/dist/commands/override.d.ts.map +1 -0
- package/dist/commands/override.js +264 -0
- package/dist/commands/override.js.map +1 -0
- package/dist/commands/parity.d.ts +31 -0
- package/dist/commands/parity.d.ts.map +1 -0
- package/dist/commands/parity.js +213 -0
- package/dist/commands/parity.js.map +1 -0
- package/dist/commands/plugin-search.d.ts +40 -0
- package/dist/commands/plugin-search.d.ts.map +1 -0
- package/dist/commands/plugin-search.js +328 -0
- package/dist/commands/plugin-search.js.map +1 -0
- package/dist/commands/plugins.d.ts +13 -0
- package/dist/commands/plugins.d.ts.map +1 -0
- package/dist/commands/plugins.js +105 -0
- package/dist/commands/plugins.js.map +1 -0
- package/dist/commands/quality-gate.d.ts +70 -0
- package/dist/commands/quality-gate.d.ts.map +1 -0
- package/dist/commands/quality-gate.js +264 -0
- package/dist/commands/quality-gate.js.map +1 -0
- package/dist/commands/review.js +1 -1
- package/dist/commands/review.js.map +1 -1
- package/dist/commands/snapshot.d.ts +27 -0
- package/dist/commands/snapshot.d.ts.map +1 -1
- package/dist/commands/snapshot.js +99 -0
- package/dist/commands/snapshot.js.map +1 -1
- package/dist/commands/trace.d.ts +65 -0
- package/dist/commands/trace.d.ts.map +1 -0
- package/dist/commands/trace.js +246 -0
- package/dist/commands/trace.js.map +1 -0
- package/dist/commands/trust-ramp.d.ts +30 -0
- package/dist/commands/trust-ramp.d.ts.map +1 -0
- package/dist/commands/trust-ramp.js +190 -0
- package/dist/commands/trust-ramp.js.map +1 -0
- package/dist/config.d.ts +5 -0
- package/dist/config.d.ts.map +1 -1
- package/dist/config.js +65 -0
- package/dist/config.js.map +1 -1
- package/dist/data-adapter.d.ts +124 -0
- package/dist/data-adapter.d.ts.map +1 -0
- package/dist/data-adapter.js +213 -0
- package/dist/data-adapter.js.map +1 -0
- package/dist/evaluators/accessibility.js +1 -1
- package/dist/evaluators/accessibility.js.map +1 -1
- package/dist/evaluators/ai-code-safety.d.ts.map +1 -1
- package/dist/evaluators/ai-code-safety.js +1 -4
- package/dist/evaluators/ai-code-safety.js.map +1 -1
- package/dist/evaluators/cost-effectiveness.js +1 -1
- package/dist/evaluators/cost-effectiveness.js.map +1 -1
- package/dist/evaluators/false-positive-review.js +4 -4
- package/dist/evaluators/false-positive-review.js.map +1 -1
- package/dist/evaluators/framework-rules.d.ts +59 -0
- package/dist/evaluators/framework-rules.d.ts.map +1 -0
- package/dist/evaluators/framework-rules.js +292 -0
- package/dist/evaluators/framework-rules.js.map +1 -0
- package/dist/evaluators/iac-security.js +1 -1
- package/dist/evaluators/iac-security.js.map +1 -1
- package/dist/evaluators/index.d.ts.map +1 -1
- package/dist/evaluators/index.js +59 -10
- package/dist/evaluators/index.js.map +1 -1
- package/dist/evaluators/intent-alignment.d.ts +4 -0
- package/dist/evaluators/intent-alignment.d.ts.map +1 -1
- package/dist/evaluators/intent-alignment.js +163 -0
- package/dist/evaluators/intent-alignment.js.map +1 -1
- package/dist/evaluators/logic-review.js +1 -1
- package/dist/evaluators/logic-review.js.map +1 -1
- package/dist/evaluators/maintainability.js +1 -1
- package/dist/evaluators/maintainability.js.map +1 -1
- package/dist/evaluators/over-engineering.js +3 -3
- package/dist/evaluators/over-engineering.js.map +1 -1
- package/dist/evaluators/project.d.ts +12 -0
- package/dist/evaluators/project.d.ts.map +1 -1
- package/dist/evaluators/project.js +86 -0
- package/dist/evaluators/project.js.map +1 -1
- package/dist/evaluators/security.js +2 -2
- package/dist/evaluators/security.js.map +1 -1
- package/dist/evaluators/ux.js +1 -1
- package/dist/evaluators/ux.js.map +1 -1
- package/dist/finding-lifecycle.d.ts +9 -0
- package/dist/finding-lifecycle.d.ts.map +1 -1
- package/dist/finding-lifecycle.js +15 -0
- package/dist/finding-lifecycle.js.map +1 -1
- package/dist/fix-history.d.ts +9 -0
- package/dist/fix-history.d.ts.map +1 -1
- package/dist/fix-history.js +15 -0
- package/dist/fix-history.js.map +1 -1
- package/dist/formatters/sarif.d.ts +3 -0
- package/dist/formatters/sarif.d.ts.map +1 -1
- package/dist/formatters/sarif.js +36 -12
- package/dist/formatters/sarif.js.map +1 -1
- package/dist/github-app.d.ts +16 -1
- package/dist/github-app.d.ts.map +1 -1
- package/dist/github-app.js +85 -2
- package/dist/github-app.js.map +1 -1
- package/dist/index.js +5 -0
- package/dist/index.js.map +1 -1
- package/dist/judge-registry.d.ts +157 -0
- package/dist/judge-registry.d.ts.map +1 -0
- package/dist/judge-registry.js +273 -0
- package/dist/judge-registry.js.map +1 -0
- package/dist/judges/accessibility.d.ts.map +1 -1
- package/dist/judges/accessibility.js +4 -0
- package/dist/judges/accessibility.js.map +1 -1
- package/dist/judges/agent-instructions.d.ts.map +1 -1
- package/dist/judges/agent-instructions.js +4 -0
- package/dist/judges/agent-instructions.js.map +1 -1
- package/dist/judges/ai-code-safety.d.ts.map +1 -1
- package/dist/judges/ai-code-safety.js +4 -0
- package/dist/judges/ai-code-safety.js.map +1 -1
- package/dist/judges/api-contract.d.ts.map +1 -1
- package/dist/judges/api-contract.js +4 -0
- package/dist/judges/api-contract.js.map +1 -1
- package/dist/judges/api-design.d.ts.map +1 -1
- package/dist/judges/api-design.js +4 -0
- package/dist/judges/api-design.js.map +1 -1
- package/dist/judges/authentication.d.ts.map +1 -1
- package/dist/judges/authentication.js +4 -0
- package/dist/judges/authentication.js.map +1 -1
- package/dist/judges/backwards-compatibility.d.ts.map +1 -1
- package/dist/judges/backwards-compatibility.js +4 -0
- package/dist/judges/backwards-compatibility.js.map +1 -1
- package/dist/judges/caching.d.ts.map +1 -1
- package/dist/judges/caching.js +4 -0
- package/dist/judges/caching.js.map +1 -1
- package/dist/judges/ci-cd.d.ts.map +1 -1
- package/dist/judges/ci-cd.js +4 -0
- package/dist/judges/ci-cd.js.map +1 -1
- package/dist/judges/cloud-readiness.d.ts.map +1 -1
- package/dist/judges/cloud-readiness.js +4 -0
- package/dist/judges/cloud-readiness.js.map +1 -1
- package/dist/judges/code-structure.d.ts.map +1 -1
- package/dist/judges/code-structure.js +4 -0
- package/dist/judges/code-structure.js.map +1 -1
- package/dist/judges/compliance.d.ts.map +1 -1
- package/dist/judges/compliance.js +4 -0
- package/dist/judges/compliance.js.map +1 -1
- package/dist/judges/concurrency.d.ts.map +1 -1
- package/dist/judges/concurrency.js +4 -0
- package/dist/judges/concurrency.js.map +1 -1
- package/dist/judges/configuration-management.d.ts.map +1 -1
- package/dist/judges/configuration-management.js +4 -0
- package/dist/judges/configuration-management.js.map +1 -1
- package/dist/judges/cost-effectiveness.d.ts.map +1 -1
- package/dist/judges/cost-effectiveness.js +4 -0
- package/dist/judges/cost-effectiveness.js.map +1 -1
- package/dist/judges/cybersecurity.d.ts.map +1 -1
- package/dist/judges/cybersecurity.js +4 -0
- package/dist/judges/cybersecurity.js.map +1 -1
- package/dist/judges/data-security.d.ts.map +1 -1
- package/dist/judges/data-security.js +4 -0
- package/dist/judges/data-security.js.map +1 -1
- package/dist/judges/data-sovereignty.d.ts.map +1 -1
- package/dist/judges/data-sovereignty.js +4 -0
- package/dist/judges/data-sovereignty.js.map +1 -1
- package/dist/judges/database.d.ts.map +1 -1
- package/dist/judges/database.js +4 -0
- package/dist/judges/database.js.map +1 -1
- package/dist/judges/dependency-health.d.ts.map +1 -1
- package/dist/judges/dependency-health.js +4 -0
- package/dist/judges/dependency-health.js.map +1 -1
- package/dist/judges/documentation.d.ts.map +1 -1
- package/dist/judges/documentation.js +4 -0
- package/dist/judges/documentation.js.map +1 -1
- package/dist/judges/error-handling.d.ts.map +1 -1
- package/dist/judges/error-handling.js +4 -0
- package/dist/judges/error-handling.js.map +1 -1
- package/dist/judges/ethics-bias.d.ts.map +1 -1
- package/dist/judges/ethics-bias.js +4 -0
- package/dist/judges/ethics-bias.js.map +1 -1
- package/dist/judges/false-positive-review.d.ts.map +1 -1
- package/dist/judges/false-positive-review.js +2 -0
- package/dist/judges/false-positive-review.js.map +1 -1
- package/dist/judges/framework-safety.d.ts.map +1 -1
- package/dist/judges/framework-safety.js +4 -0
- package/dist/judges/framework-safety.js.map +1 -1
- package/dist/judges/hallucination-detection.d.ts.map +1 -1
- package/dist/judges/hallucination-detection.js +4 -0
- package/dist/judges/hallucination-detection.js.map +1 -1
- package/dist/judges/iac-security.d.ts.map +1 -1
- package/dist/judges/iac-security.js +4 -0
- package/dist/judges/iac-security.js.map +1 -1
- package/dist/judges/index.d.ts +59 -0
- package/dist/judges/index.d.ts.map +1 -1
- package/dist/judges/index.js +65 -189
- package/dist/judges/index.js.map +1 -1
- package/dist/judges/intent-alignment.d.ts.map +1 -1
- package/dist/judges/intent-alignment.js +4 -0
- package/dist/judges/intent-alignment.js.map +1 -1
- package/dist/judges/internationalization.d.ts.map +1 -1
- package/dist/judges/internationalization.js +4 -0
- package/dist/judges/internationalization.js.map +1 -1
- package/dist/judges/logging-privacy.d.ts.map +1 -1
- package/dist/judges/logging-privacy.js +4 -0
- package/dist/judges/logging-privacy.js.map +1 -1
- package/dist/judges/logic-review.d.ts.map +1 -1
- package/dist/judges/logic-review.js +4 -0
- package/dist/judges/logic-review.js.map +1 -1
- package/dist/judges/maintainability.d.ts.map +1 -1
- package/dist/judges/maintainability.js +4 -0
- package/dist/judges/maintainability.js.map +1 -1
- package/dist/judges/model-fingerprint.d.ts.map +1 -1
- package/dist/judges/model-fingerprint.js +4 -0
- package/dist/judges/model-fingerprint.js.map +1 -1
- package/dist/judges/multi-turn-coherence.d.ts.map +1 -1
- package/dist/judges/multi-turn-coherence.js +4 -0
- package/dist/judges/multi-turn-coherence.js.map +1 -1
- package/dist/judges/observability.d.ts.map +1 -1
- package/dist/judges/observability.js +4 -0
- package/dist/judges/observability.js.map +1 -1
- package/dist/judges/over-engineering.d.ts.map +1 -1
- package/dist/judges/over-engineering.js +4 -0
- package/dist/judges/over-engineering.js.map +1 -1
- package/dist/judges/performance.d.ts.map +1 -1
- package/dist/judges/performance.js +4 -0
- package/dist/judges/performance.js.map +1 -1
- package/dist/judges/portability.d.ts.map +1 -1
- package/dist/judges/portability.js +4 -0
- package/dist/judges/portability.js.map +1 -1
- package/dist/judges/rate-limiting.d.ts.map +1 -1
- package/dist/judges/rate-limiting.js +4 -0
- package/dist/judges/rate-limiting.js.map +1 -1
- package/dist/judges/reliability.d.ts.map +1 -1
- package/dist/judges/reliability.js +4 -0
- package/dist/judges/reliability.js.map +1 -1
- package/dist/judges/scalability.d.ts.map +1 -1
- package/dist/judges/scalability.js +4 -0
- package/dist/judges/scalability.js.map +1 -1
- package/dist/judges/security.d.ts.map +1 -1
- package/dist/judges/security.js +4 -0
- package/dist/judges/security.js.map +1 -1
- package/dist/judges/software-practices.d.ts.map +1 -1
- package/dist/judges/software-practices.js +4 -0
- package/dist/judges/software-practices.js.map +1 -1
- package/dist/judges/testing.d.ts.map +1 -1
- package/dist/judges/testing.js +4 -0
- package/dist/judges/testing.js.map +1 -1
- package/dist/judges/ux.d.ts.map +1 -1
- package/dist/judges/ux.js +4 -0
- package/dist/judges/ux.js.map +1 -1
- package/dist/parallel.d.ts +53 -0
- package/dist/parallel.d.ts.map +1 -0
- package/dist/parallel.js +170 -0
- package/dist/parallel.js.map +1 -0
- package/dist/plugins.d.ts +8 -51
- package/dist/plugins.d.ts.map +1 -1
- package/dist/plugins.js +16 -125
- package/dist/plugins.js.map +1 -1
- package/dist/security-ids.d.ts +24 -0
- package/dist/security-ids.d.ts.map +1 -0
- package/dist/security-ids.js +240 -0
- package/dist/security-ids.js.map +1 -0
- package/dist/tools/prompts.d.ts +4 -0
- package/dist/tools/prompts.d.ts.map +1 -1
- package/dist/tools/prompts.js +6 -4
- package/dist/tools/prompts.js.map +1 -1
- package/dist/tools/register-scaffold.d.ts +3 -0
- package/dist/tools/register-scaffold.d.ts.map +1 -0
- package/dist/tools/register-scaffold.js +399 -0
- package/dist/tools/register-scaffold.js.map +1 -0
- package/dist/tools/register.d.ts +1 -1
- package/dist/tools/register.d.ts.map +1 -1
- package/dist/tools/register.js +3 -1
- package/dist/tools/register.js.map +1 -1
- package/dist/types.d.ts +75 -0
- package/dist/types.d.ts.map +1 -1
- package/package.json +3 -2
- package/server.json +2 -2
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `judges fix --interactive` — Interactive fix mode.
|
|
3
|
+
*
|
|
4
|
+
* Presents each auto-fixable finding one by one, letting the developer
|
|
5
|
+
* accept or skip it (or accept all remaining / quit) after seeing the patch's diff.
|
|
6
|
+
* Reduces trust barrier by giving full control over what gets changed.
|
|
7
|
+
*
|
|
8
|
+
* Usage:
|
|
9
|
+
* judges fix src/app.ts --interactive # Interactive per-finding review
|
|
10
|
+
* judges fix src/app.ts -I # Short form
|
|
11
|
+
*/
|
|
12
|
+
import { readFileSync, writeFileSync } from "fs";
|
|
13
|
+
import { createInterface } from "readline";
|
|
14
|
+
import { applyPatches } from "./fix.js";
|
|
15
|
+
// ─── Diff Display ───────────────────────────────────────────────────────────
|
|
16
|
+
/**
 * Render a patch as a colored, diff-style text block for the terminal.
 *
 * The first line shows the patch's line range. Every line of `patch.oldText`
 * follows in red with a "- " prefix, then every line of `patch.newText` in
 * green with a "+ " prefix.
 *
 * @param {{startLine: number, endLine: number, oldText: string, newText: string}} patch
 *   Patch to display.
 * @returns {string} The rendered lines joined with "\n".
 */
function formatPatchDiff(patch) {
    // Split on LF or CRLF so carriage returns from Windows-style sources do
    // not end up in the output right before the color-reset sequence.
    const toLines = (text) => text.split(/\r?\n/);
    const removed = toLines(patch.oldText).map((line) => ` \x1b[31m- ${line}\x1b[0m`);
    const added = toLines(patch.newText).map((line) => ` \x1b[32m+ ${line}\x1b[0m`);
    return [` Line ${patch.startLine}–${patch.endLine}:`, ...removed, ...added].join("\n");
}
|
|
27
|
+
/**
 * Build the banner printed above a finding during interactive review.
 *
 * @param {object} finding - Finding to describe; its `severity`, `ruleId`,
 *   `title` and `description` are interpolated into the banner.
 * @param {number} index - Zero-based position of the finding (displayed 1-based).
 * @param {number} total - Total number of findings being reviewed.
 * @returns {string} Five lines joined with "\n": an empty line, a heavy rule,
 *   the "[n/total] SEVERITY ruleId — title" line, the description, and a
 *   light rule.
 */
function formatFindingHeader(finding, index, total) {
    const sevColors = {
        critical: "\x1b[31;1m",
        high: "\x1b[31m",
        medium: "\x1b[33m",
        low: "\x1b[36m",
        info: "\x1b[37m",
    };
    const fallbackColor = "\x1b[37m";
    const reset = "\x1b[0m";
    // A missing or non-string severity is labeled "UNKNOWN" instead of
    // throwing on `.toUpperCase()`.
    const severity = typeof finding.severity === "string" ? finding.severity : "unknown";
    // Own-property check so inherited keys (e.g. "constructor") are never
    // mistaken for a color entry.
    const color = Object.prototype.hasOwnProperty.call(sevColors, severity)
        ? sevColors[severity]
        : fallbackColor;
    return [
        "",
        `═══════════════════════════════════════════════════════════════`,
        ` [${index + 1}/${total}] ${color}${severity.toUpperCase()}${reset} ${finding.ruleId} — ${finding.title}`,
        ` ${finding.description}`,
        `───────────────────────────────────────────────────────────────`,
    ].join("\n");
}
|
|
45
|
+
// ─── Interactive Prompt ─────────────────────────────────────────────────────
|
|
46
|
+
/**
 * Ask a single question on the terminal and return the normalized answer.
 *
 * A fresh readline interface is created for the question and closed as soon
 * as an answer arrives. If the interface closes before any answer is given
 * (for example because the input stream ended), the promise resolves with
 * "q" instead of staying pending forever.
 *
 * @param {string} question - Prompt text to display.
 * @param {{input?: NodeJS.ReadableStream, output?: NodeJS.WritableStream}} [io]
 *   Optional stream overrides; defaults to `process.stdin` / `process.stdout`.
 * @returns {Promise<string>} The trimmed, lower-cased answer, or "q" when the
 *   input closed without an answer.
 */
async function promptUser(question, io = {}) {
    const rl = createInterface({
        input: io.input || process.stdin,
        output: io.output || process.stdout,
    });
    return new Promise((resolve) => {
        let answered = false;
        // `rl.close()` below also emits "close", so only treat the event as
        // end-of-input when no answer has been received yet.
        rl.once("close", () => {
            if (!answered) {
                resolve("q");
            }
        });
        rl.question(question, (answer) => {
            answered = true;
            rl.close();
            resolve(answer.trim().toLowerCase());
        });
    });
}
|
|
55
|
+
// ─── Interactive Fix Runner ─────────────────────────────────────────────────
|
|
56
|
+
/**
 * Copy the fields of a finding that are passed on to `applyPatches`.
 */
function toAcceptedFix(finding) {
    return {
        ruleId: finding.ruleId,
        title: finding.title,
        severity: finding.severity,
        patch: finding.patch,
        lineNumbers: finding.lineNumbers,
    };
}
/**
 * Run interactive fix mode — present each fixable finding and let the user
 * accept or skip it before applying.
 *
 * Only findings with a truthy `patch` are reviewed. For each one the header
 * and diff are printed and the user is prompted:
 *   - "y" / "yes": accept this fix
 *   - "a" / "all": accept this fix and every remaining one, stop prompting
 *   - "q" / "quit": count this and every remaining fix as skipped, stop prompting
 *   - anything else (including "n" / "no"): skip this fix
 * Accepted fixes are then applied to the file's current content in one
 * `applyPatches` call and the result is written back.
 *
 * @param {string} filePath - File to read, patch and write back (UTF-8).
 * @param {Array<object>} findings - Findings to consider.
 * @returns {Promise<{accepted: number, skipped: number, total: number}>}
 *   Number of accepted fixes, number of skipped fixes, and number of fixable
 *   findings (all zero when nothing is fixable).
 */
export async function runInteractiveFix(filePath, findings) {
    const fixable = findings.filter((f) => f.patch);
    if (fixable.length === 0) {
        console.log("\n No auto-fixable findings.\n");
        return { accepted: 0, skipped: 0, total: 0 };
    }
    console.log(`\n Found ${fixable.length} auto-fixable finding(s). Review each one:\n`);
    const accepted = [];
    let skipped = 0;
    review: for (let i = 0; i < fixable.length; i++) {
        const finding = fixable[i];
        // Number of findings not yet decided, including the current one.
        const remaining = fixable.length - i;
        console.log(formatFindingHeader(finding, i, fixable.length));
        console.log(formatPatchDiff(finding.patch));
        console.log("");
        const answer = await promptUser(" Apply this fix? [y]es / [n]o / [a]ll / [q]uit: ");
        switch (answer) {
            case "y":
            case "yes":
                accepted.push(toAcceptedFix(finding));
                console.log(" ✓ Accepted\n");
                break;
            case "a":
            case "all":
                // Accept this and all remaining
                for (const pendingFinding of fixable.slice(i)) {
                    accepted.push(toAcceptedFix(pendingFinding));
                }
                console.log(` ✓ Accepted all remaining ${remaining} fix(es)\n`);
                break review;
            case "q":
            case "quit":
                skipped += remaining;
                console.log(" Quit — remaining fixes skipped.\n");
                break review;
            case "n":
            case "no":
            default:
                skipped++;
                console.log(" ⏭ Skipped\n");
                break;
        }
    }
    // Apply accepted patches
    if (accepted.length > 0) {
        const code = readFileSync(filePath, "utf-8");
        const result = applyPatches(code, accepted);
        // Leave the file untouched when patching produced identical content.
        if (result.result !== code) {
            writeFileSync(filePath, result.result, "utf-8");
        }
        console.log(` ✅ Applied ${result.applied} fix(es) to ${filePath}`);
        if (result.skipped > 0) {
            console.log(` ⏭ ${result.skipped} fix(es) could not be applied (source changed)`);
        }
    }
    console.log(`\n Summary: ${accepted.length} accepted, ${skipped} skipped out of ${fixable.length}\n`);
    return {
        accepted: accepted.length,
        skipped,
        total: fixable.length,
    };
}
|
|
140
|
+
//# sourceMappingURL=interactive-fix.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"interactive-fix.js","sourceRoot":"","sources":["../../src/commands/interactive-fix.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,EAAE,YAAY,EAAE,aAAa,EAAE,MAAM,IAAI,CAAC;AACjD,OAAO,EAAE,eAAe,EAAE,MAAM,UAAU,CAAC;AAE3C,OAAO,EAAE,YAAY,EAAuB,MAAM,UAAU,CAAC;AAU7D,+EAA+E;AAE/E,SAAS,eAAe,CAAC,KAAY;IACnC,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,KAAK,CAAC,IAAI,CAAC,UAAU,KAAK,CAAC,SAAS,IAAI,KAAK,CAAC,OAAO,GAAG,CAAC,CAAC;IAC1D,KAAK,MAAM,IAAI,IAAI,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC;QAC7C,KAAK,CAAC,IAAI,CAAC,eAAe,IAAI,SAAS,CAAC,CAAC;IAC3C,CAAC;IACD,KAAK,MAAM,IAAI,IAAI,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC;QAC7C,KAAK,CAAC,IAAI,CAAC,eAAe,IAAI,SAAS,CAAC,CAAC;IAC3C,CAAC;IACD,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC;AAED,SAAS,mBAAmB,CAAC,OAAgB,EAAE,KAAa,EAAE,KAAa;IACzE,MAAM,SAAS,GAA2B;QACxC,QAAQ,EAAE,YAAY;QACtB,IAAI,EAAE,UAAU;QAChB,MAAM,EAAE,UAAU;QAClB,GAAG,EAAE,UAAU;QACf,IAAI,EAAE,UAAU;KACjB,CAAC;IACF,MAAM,KAAK,GAAG,SAAS,CAAC,OAAO,CAAC,QAAQ,CAAC,IAAI,UAAU,CAAC;IACxD,MAAM,KAAK,GAAG,SAAS,CAAC;IAExB,OAAO;QACL,EAAE;QACF,iEAAiE;QACjE,MAAM,KAAK,GAAG,CAAC,IAAI,KAAK,KAAK,KAAK,GAAG,OAAO,CAAC,QAAQ,CAAC,WAAW,EAAE,GAAG,KAAK,IAAI,OAAO,CAAC,MAAM,MAAM,OAAO,CAAC,KAAK,EAAE;QAClH,KAAK,OAAO,CAAC,WAAW,EAAE;QAC1B,iEAAiE;KAClE,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AACf,CAAC;AAED,+EAA+E;AAE/E,KAAK,UAAU,UAAU,CAAC,QAAgB;IACxC,MAAM,EAAE,GAAG,eAAe,CAAC,EAAE,KAAK,EAAE,OAAO,CAAC,KAAK,EAAE,MAAM,EAAE,OAAO,CAAC,MAAM,EAAE,CAAC,CAAC;IAC7E,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE;QAC7B,EAAE,CAAC,QAAQ,CAAC,QAAQ,EAAE,CAAC,MAAM,EAAE,EAAE;YAC/B,EAAE,CAAC,KAAK,EAAE,CAAC;YACX,OAAO,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC,CAAC;QACvC,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC;AAED,+EAA+E;AAE/E;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,iBAAiB,CAAC,QAAgB,EAAE,QAAmB;IAC3E,MAAM,OAAO,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC;IAEhD,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACzB,OAAO,CAAC,GAAG,CAAC,iCAAiC,CAAC,CAAC;QAC/C,OAAO,EAAE,QAAQ,EAAE,CAAC,EAAE,OAAO,
EAAE,CAAC,EAAE,KAAK,EAAE,CAAC,EAAE,CAAC;IAC/C,CAAC;IAED,OAAO,CAAC,GAAG,CAAC,aAAa,OAAO,CAAC,MAAM,8CAA8C,CAAC,CAAC;IAEvF,MAAM,QAAQ,GAAqB,EAAE,CAAC;IACtC,IAAI,OAAO,GAAG,CAAC,CAAC;IAEhB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACxC,MAAM,OAAO,GAAG,OAAO,CAAC,CAAC,CAAC,CAAC;QAE3B,OAAO,CAAC,GAAG,CAAC,mBAAmB,CAAC,OAAO,EAAE,CAAC,EAAE,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC;QAC7D,OAAO,CAAC,GAAG,CAAC,eAAe,CAAC,OAAO,CAAC,KAAM,CAAC,CAAC,CAAC;QAC7C,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;QAEhB,MAAM,MAAM,GAAG,MAAM,UAAU,CAAC,mDAAmD,CAAC,CAAC;QAErF,QAAQ,MAAM,EAAE,CAAC;YACf,KAAK,GAAG,CAAC;YACT,KAAK,KAAK;gBACR,QAAQ,CAAC,IAAI,CAAC;oBACZ,MAAM,EAAE,OAAO,CAAC,MAAM;oBACtB,KAAK,EAAE,OAAO,CAAC,KAAK;oBACpB,QAAQ,EAAE,OAAO,CAAC,QAAQ;oBAC1B,KAAK,EAAE,OAAO,CAAC,KAAM;oBACrB,WAAW,EAAE,OAAO,CAAC,WAAW;iBACjC,CAAC,CAAC;gBACH,OAAO,CAAC,GAAG,CAAC,gBAAgB,CAAC,CAAC;gBAC9B,MAAM;YAER,KAAK,GAAG,CAAC;YACT,KAAK,KAAK;gBACR,gCAAgC;gBAChC,QAAQ,CAAC,IAAI,CAAC;oBACZ,MAAM,EAAE,OAAO,CAAC,MAAM;oBACtB,KAAK,EAAE,OAAO,CAAC,KAAK;oBACpB,QAAQ,EAAE,OAAO,CAAC,QAAQ;oBAC1B,KAAK,EAAE,OAAO,CAAC,KAAM;oBACrB,WAAW,EAAE,OAAO,CAAC,WAAW;iBACjC,CAAC,CAAC;gBACH,KAAK,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;oBAC5C,QAAQ,CAAC,IAAI,CAAC;wBACZ,MAAM,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,MAAM;wBACzB,KAAK,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,KAAK;wBACvB,QAAQ,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,QAAQ;wBAC7B,KAAK,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,KAAM;wBACxB,WAAW,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,WAAW;qBACpC,CAAC,CAAC;gBACL,CAAC;gBACD,OAAO,CAAC,GAAG,CAAC,8BAA8B,OAAO,CAAC,MAAM,GAAG,CAAC,YAAY,CAAC,CAAC;gBAC1E,CAAC,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,oBAAoB;gBACxC,MAAM;YAER,KAAK,GAAG,CAAC;YACT,KAAK,MAAM;gBACT,OAAO,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC;gBAC9B,OAAO,CAAC,GAAG,CAAC,qCAAqC,CAAC,CAAC;gBACnD,CAAC,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,oBAAoB;gBACxC,MAAM;YAER,KAAK,GAAG,CAAC;YACT,KAAK,IAAI,CAAC;YACV;gBACE,OAAO,EAAE,CAAC;gBACV,OAAO,CAAC,GAAG,CAAC,eAAe,CAAC,CAAC;gBAC7B,MAAM;QACV,CAAC;IACH,CAAC;IAED,yBAAyB;IACzB,IAAI,QAAQ,CAA
C,MAAM,GAAG,CAAC,EAAE,CAAC;QACxB,MAAM,IAAI,GAAG,YAAY,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;QAC7C,MAAM,MAAM,GAAG,YAAY,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC;QAC5C,aAAa,CAAC,QAAQ,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;QAChD,OAAO,CAAC,GAAG,CAAC,eAAe,MAAM,CAAC,OAAO,eAAe,QAAQ,EAAE,CAAC,CAAC;QACpE,IAAI,MAAM,CAAC,OAAO,GAAG,CAAC,EAAE,CAAC;YACvB,OAAO,CAAC,GAAG,CAAC,OAAO,MAAM,CAAC,OAAO,gDAAgD,CAAC,CAAC;QACrF,CAAC;IACH,CAAC;IAED,OAAO,CAAC,GAAG,CAAC,gBAAgB,QAAQ,CAAC,MAAM,cAAc,OAAO,mBAAmB,OAAO,CAAC,MAAM,IAAI,CAAC,CAAC;IAEvG,OAAO;QACL,QAAQ,EAAE,QAAQ,CAAC,MAAM;QACzB,OAAO;QACP,KAAK,EAAE,OAAO,CAAC,MAAM;KACtB,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LLM Benchmark — types, scoring, and formatting for the probabilistic
|
|
3
|
+
* (LLM prompt) layer of the Judges Panel benchmark.
|
|
4
|
+
*
|
|
5
|
+
* This module does NOT make LLM API calls. It provides:
|
|
6
|
+
* - Types for LLM benchmark snapshots
|
|
7
|
+
* - Rule-ID parsing from LLM response text
|
|
8
|
+
* - Prompt construction (mirrors MCP-served prompts exactly)
|
|
9
|
+
* - Scoring logic (same methodology as L1 deterministic benchmark)
|
|
10
|
+
*
|
|
11
|
+
* The actual LLM API calls live in scripts/run-llm-benchmark.ts,
|
|
12
|
+
* keeping the npm package free of LLM API dependencies.
|
|
13
|
+
*/
|
|
14
|
+
import type { JudgeDefinition } from "../types.js";
|
|
15
|
+
import type { BenchmarkCase, CategoryResult, JudgeBenchmarkResult, DifficultyResult } from "./benchmark.js";
|
|
16
|
+
/**
 * Result of one LLM benchmark run: run metadata (timestamp, version, model,
 * provider, prompt mode), overall counts and scores, per-category /
 * per-judge / per-difficulty breakdowns, and the individual case results.
 */
export interface LlmBenchmarkSnapshot {
    /** Timestamp of this LLM benchmark run */
    timestamp: string;
    /** Version of judges used */
    version: string;
    /** LLM model used (e.g. "gpt-4o", "claude-sonnet-4-20250514") */
    model: string;
    /** API provider (e.g. "openai", "anthropic") */
    provider: string;
    /** Prompt mode used: full-tribunal or per-judge */
    promptMode: "tribunal" | "per-judge";
    /** Total test cases run */
    totalCases: number;
    /** Cases where at least one expected rule was detected */
    detected: number;
    /** Cases where no expected rule was detected */
    missed: number;
    /** Total expected findings across all cases */
    totalExpected: number;
    /** True positives (prefix-based matching) */
    truePositives: number;
    /** False negatives (prefix-based matching) */
    falseNegatives: number;
    /** False positives */
    falsePositives: number;
    /** Precision: TP / (TP + FP) */
    precision: number;
    /** Recall: TP / (TP + FN) */
    recall: number;
    /** F1 Score */
    f1Score: number;
    /** Detection rate: cases detected / total cases */
    detectionRate: number;
    /** Per-category breakdown */
    perCategory: Record<string, CategoryResult>;
    /** Per-judge breakdown */
    perJudge: Record<string, JudgeBenchmarkResult>;
    /** Per-difficulty breakdown */
    perDifficulty: Record<string, DifficultyResult>;
    /** Individual case results */
    cases: LlmCaseResult[];
    /** Total tokens used (if available from API) */
    totalTokensUsed?: number;
    /** Total run duration in seconds */
    durationSeconds: number;
}
|
|
62
|
+
/**
 * Outcome of scoring a single benchmark case against an LLM response.
 */
export interface LlmCaseResult {
    /** Identifier of the benchmark case (presumably taken from the BenchmarkCase — confirm in scoreLlmCase) */
    caseId: string;
    /** Category of the case; presumably the key used for the per-category breakdown — TODO confirm */
    category: string;
    /** Difficulty of the case; presumably the key used for the per-difficulty breakdown — TODO confirm */
    difficulty: string;
    /** Whether the case passed; the pass criterion is decided by the scorer and is not visible in this declaration */
    passed: boolean;
    /** Rule IDs the case expected to be reported */
    expectedRuleIds: string[];
    /** Rule IDs detected for this case */
    detectedRuleIds: string[];
    /** Presumably the expected rule IDs that were not detected — TODO confirm */
    missedRuleIds: string[];
    /** Presumably the detected rule IDs that matched no expectation — TODO confirm */
    falsePositiveRuleIds: string[];
    /** Raw LLM response text */
    rawResponse: string;
    /** Tokens used for this case */
    tokensUsed?: number;
}
|
|
76
|
+
/**
 * Extract unique rule IDs from LLM response text.
 * Matches patterns like CYBER-001, SEC-003, AUTH-001, etc.
 *
 * @param response - LLM response text to scan.
 * @returns The unique rule IDs found in the text.
 */
export declare function parseLlmRuleIds(response: string): string[];
|
|
81
|
+
/**
|
|
82
|
+
* Construct a per-judge prompt — identical to the MCP-served `judge-{id}` prompt.
* The prompt is the judge's system prompt, the precision mandate, the code in a fenced
* block tagged with `language`, and the output-format instructions.
* @param judge - Judge whose `systemPrompt` and `rulePrefix` are embedded in the prompt.
* @param code - Source code to evaluate.
* @param language - Language name used in the request text and as the code-fence tag.
* @returns The assembled prompt text.
|
|
83
|
+
*/
|
|
84
|
+
export declare function constructPerJudgePrompt(judge: JudgeDefinition, code: string, language: string): string;
|
|
85
|
+
/**
|
|
86
|
+
* Construct the full-tribunal prompt — identical to the MCP-served `full-tribunal` prompt.
* Places the adversarial and precision mandates first, then one section per judge in
* `JUDGES` (name, domain, rule prefix, condensed criteria), then the code in a fenced block.
* @param code - Source code to evaluate.
* @param language - Language name used in the instructions and as the code-fence tag.
* @returns The assembled prompt text.
|
|
87
|
+
*/
|
|
88
|
+
export declare function constructTribunalPrompt(code: string, language: string): string;
|
|
89
|
+
/**
|
|
90
|
+
* Select a stratified sample of benchmark cases, ensuring representation
|
|
91
|
+
* across categories, difficulties, and both clean/dirty cases.
* Returns a copy of `cases` when `targetSize` is at least `cases.length`. Otherwise each
* category contributes up to max(2, floor(targetSize / categoryCount)) cases taken from its
* easy, medium, then hard pools, and any shortfall is filled from `cases` in input order.
* Because of the two-per-category floor, the result can be longer than `targetSize`.
* NOTE(review): the implementation groups only by category and difficulty; no explicit
* clean/dirty balancing is visible there — confirm the claim above.
* @param cases - Candidate benchmark cases.
* @param targetSize - Desired number of cases in the sample.
* @returns The selected cases.
|
|
92
|
+
*/
|
|
93
|
+
export declare function selectStratifiedSample(cases: BenchmarkCase[], targetSize: number): BenchmarkCase[];
|
|
94
|
+
/**
|
|
95
|
+
* Score a single LLM benchmark case using prefix-based matching.
|
|
96
|
+
* Returns a fully populated LlmCaseResult.
* An expected rule counts as matched when any detected rule shares its prefix (the text
* before the first "-"). False positives are detected rules whose prefix matches one of
* the case's `unexpectedRuleIds`; the list is empty when the case defines none.
* A case with no expected rules passes when it has no false positives; any other case
* passes when at least one expected rule is matched.
* @param tc - Benchmark case being scored.
* @param detectedRuleIds - Rule IDs reported for this case.
* @param rawResponse - LLM response text, stored unchanged on the result.
* @param tokensUsed - Optional token count, stored unchanged on the result.
* @returns The scored case result.
|
|
97
|
+
*/
|
|
98
|
+
export declare function scoreLlmCase(tc: BenchmarkCase, detectedRuleIds: string[], rawResponse: string, tokensUsed?: number): LlmCaseResult;
|
|
99
|
+
/**
|
|
100
|
+
* Compute aggregate metrics for an LLM benchmark snapshot from raw case results.
|
|
101
|
+
* Uses the same prefix-based matching methodology as the L1 benchmark.
* Precision and recall default to 1 when their denominator is 0; F1 is 0 when both are 0.
* Per-judge entries are keyed by the prefix of each detected rule ID: a detection is a true
* positive when the case expects that prefix and a false positive only when the case expects
* no rules at all. Per-judge false negatives are never incremented by the implementation.
* `totalTokensUsed` is included in the snapshot only when it is truthy.
* @param rawCases - Scored case results; stored on the snapshot as `cases`.
* @param version - Version string recorded on the snapshot.
* @param model - Model name recorded on the snapshot.
* @param provider - Provider name recorded on the snapshot.
* @param promptMode - Prompt mode recorded on the snapshot.
* @param durationSeconds - Run duration recorded on the snapshot.
* @param totalTokensUsed - Optional total token count.
* @returns The snapshot, timestamped with the current time in ISO format.
|
|
102
|
+
*/
|
|
103
|
+
export declare function computeLlmMetrics(rawCases: LlmCaseResult[], version: string, model: string, provider: string, promptMode: "tribunal" | "per-judge", durationSeconds: number, totalTokensUsed?: number): LlmBenchmarkSnapshot;
|
|
104
|
+
/**
|
|
105
|
+
* Format an LLM benchmark snapshot as a markdown section for the benchmark report.
* Emits a summary table (grade from F1: A >= 0.9, B >= 0.8, C >= 0.7, D >= 0.6, else F),
* then per-difficulty, per-category and per-judge tables when those maps are non-empty,
* and a failed-cases table when there are between 1 and 50 failed cases.
* @param snapshot - Snapshot to render.
* @returns Markdown text with lines joined by "\n".
|
|
106
|
+
*/
|
|
107
|
+
export declare function formatLlmSnapshotMarkdown(snapshot: LlmBenchmarkSnapshot): string;
|
|
108
|
+
/**
|
|
109
|
+
* Format a layer comparison table showing L1 vs L2 side by side.
* @param l1 - Metrics shown in the "L1 (Deterministic)" column; rates are fractions rendered as percentages.
* @param l2 - LLM benchmark snapshot; its `model` name labels the L2 column.
* @returns Markdown text with lines joined by "\n".
|
|
110
|
+
*/
|
|
111
|
+
export declare function formatLayerComparisonMarkdown(l1: {
|
|
112
|
+
detectionRate: number;
|
|
113
|
+
precision: number;
|
|
114
|
+
recall: number;
|
|
115
|
+
f1Score: number;
|
|
116
|
+
falsePositives: number;
|
|
117
|
+
totalCases: number;
|
|
118
|
+
}, l2: LlmBenchmarkSnapshot): string;
|
|
119
|
+
//# sourceMappingURL=llm-benchmark.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"llm-benchmark.d.ts","sourceRoot":"","sources":["../../src/commands/llm-benchmark.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;GAYG;AAEH,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,aAAa,CAAC;AACnD,OAAO,KAAK,EAAE,aAAa,EAAE,cAAc,EAAE,oBAAoB,EAAE,gBAAgB,EAAE,MAAM,gBAAgB,CAAC;AAM5G,MAAM,WAAW,oBAAoB;IACnC,0CAA0C;IAC1C,SAAS,EAAE,MAAM,CAAC;IAClB,6BAA6B;IAC7B,OAAO,EAAE,MAAM,CAAC;IAChB,iEAAiE;IACjE,KAAK,EAAE,MAAM,CAAC;IACd,gDAAgD;IAChD,QAAQ,EAAE,MAAM,CAAC;IACjB,mDAAmD;IACnD,UAAU,EAAE,UAAU,GAAG,WAAW,CAAC;IACrC,2BAA2B;IAC3B,UAAU,EAAE,MAAM,CAAC;IACnB,0DAA0D;IAC1D,QAAQ,EAAE,MAAM,CAAC;IACjB,gDAAgD;IAChD,MAAM,EAAE,MAAM,CAAC;IACf,+CAA+C;IAC/C,aAAa,EAAE,MAAM,CAAC;IACtB,6CAA6C;IAC7C,aAAa,EAAE,MAAM,CAAC;IACtB,8CAA8C;IAC9C,cAAc,EAAE,MAAM,CAAC;IACvB,sBAAsB;IACtB,cAAc,EAAE,MAAM,CAAC;IACvB,gCAAgC;IAChC,SAAS,EAAE,MAAM,CAAC;IAClB,6BAA6B;IAC7B,MAAM,EAAE,MAAM,CAAC;IACf,eAAe;IACf,OAAO,EAAE,MAAM,CAAC;IAChB,mDAAmD;IACnD,aAAa,EAAE,MAAM,CAAC;IACtB,6BAA6B;IAC7B,WAAW,EAAE,MAAM,CAAC,MAAM,EAAE,cAAc,CAAC,CAAC;IAC5C,0BAA0B;IAC1B,QAAQ,EAAE,MAAM,CAAC,MAAM,EAAE,oBAAoB,CAAC,CAAC;IAC/C,+BAA+B;IAC/B,aAAa,EAAE,MAAM,CAAC,MAAM,EAAE,gBAAgB,CAAC,CAAC;IAChD,8BAA8B;IAC9B,KAAK,EAAE,aAAa,EAAE,CAAC;IACvB,gDAAgD;IAChD,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,oCAAoC;IACpC,eAAe,EAAE,MAAM,CAAC;CACzB;AAED,MAAM,WAAW,aAAa;IAC5B,MAAM,EAAE,MAAM,CAAC;IACf,QAAQ,EAAE,MAAM,CAAC;IACjB,UAAU,EAAE,MAAM,CAAC;IACnB,MAAM,EAAE,OAAO,CAAC;IAChB,eAAe,EAAE,MAAM,EAAE,CAAC;IAC1B,eAAe,EAAE,MAAM,EAAE,CAAC;IAC1B,aAAa,EAAE,MAAM,EAAE,CAAC;IACxB,oBAAoB,EAAE,MAAM,EAAE,CAAC;IAC/B,4BAA4B;IAC5B,WAAW,EAAE,MAAM,CAAC;IACpB,gCAAgC;IAChC,UAAU,CAAC,EAAE,MAAM,CAAC;CACrB;AAID;;;GAGG;AACH,wBAAgB,eAAe,CAAC,QAAQ,EAAE,MAAM,GAAG,MAAM,EAAE,CAW1D;AAOD;;GAEG;AACH,wBAAgB,uBAAuB,CAAC,KAAK,EAAE,eAAe,EAAE,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,MAAM,GAAG,MAAM,CAMtG;AAED;;GAEG;AACH,wBAAgB,uBAAuB,CAAC,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,MAAM,GAAG,MAAM,CAqB9E;AAID;;;GAGG;AACH,wBAAgB,sBAAsB,CAAC,KAAK,EAAE,aAAa,EAAE,EAAE,UAAU,EAAE,MAAM,GAAG,aAAa,EAAE,CAqDlG;AAID;;;GAGG;AACH,wBAAgB,YAAY,CAC1B,EAAE,EAA
E,aAAa,EACjB,eAAe,EAAE,MAAM,EAAE,EACzB,WAAW,EAAE,MAAM,EACnB,UAAU,CAAC,EAAE,MAAM,GAClB,aAAa,CAmCf;AAED;;;GAGG;AACH,wBAAgB,iBAAiB,CAC/B,QAAQ,EAAE,aAAa,EAAE,EACzB,OAAO,EAAE,MAAM,EACf,KAAK,EAAE,MAAM,EACb,QAAQ,EAAE,MAAM,EAChB,UAAU,EAAE,UAAU,GAAG,WAAW,EACpC,eAAe,EAAE,MAAM,EACvB,eAAe,CAAC,EAAE,MAAM,GACvB,oBAAoB,CA+HtB;AAID;;GAEG;AACH,wBAAgB,yBAAyB,CAAC,QAAQ,EAAE,oBAAoB,GAAG,MAAM,CAkGhF;AAED;;GAEG;AACH,wBAAgB,6BAA6B,CAC3C,EAAE,EAAE;IACF,aAAa,EAAE,MAAM,CAAC;IACtB,SAAS,EAAE,MAAM,CAAC;IAClB,MAAM,EAAE,MAAM,CAAC;IACf,OAAO,EAAE,MAAM,CAAC;IAChB,cAAc,EAAE,MAAM,CAAC;IACvB,UAAU,EAAE,MAAM,CAAC;CACpB,EACD,EAAE,EAAE,oBAAoB,GACvB,MAAM,CAsBR"}
|
|
@@ -0,0 +1,396 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LLM Benchmark — types, scoring, and formatting for the probabilistic
|
|
3
|
+
* (LLM prompt) layer of the Judges Panel benchmark.
|
|
4
|
+
*
|
|
5
|
+
* This module does NOT make LLM API calls. It provides:
|
|
6
|
+
* - Types for LLM benchmark snapshots
|
|
7
|
+
* - Rule-ID parsing from LLM response text
|
|
8
|
+
* - Prompt construction (mirrors MCP-served prompts exactly)
|
|
9
|
+
* - Scoring logic (same methodology as L1 deterministic benchmark)
|
|
10
|
+
*
|
|
11
|
+
* The actual LLM API calls live in scripts/run-llm-benchmark.ts,
|
|
12
|
+
* keeping the npm package free of LLM API dependencies.
|
|
13
|
+
*/
|
|
14
|
+
import { JUDGES } from "../judges/index.js";
|
|
15
|
+
import { getCondensedCriteria, SHARED_ADVERSARIAL_MANDATE, PRECISION_MANDATE } from "../tools/prompts.js";
|
|
16
|
+
// ─── Rule ID Parsing ────────────────────────────────────────────────────────
|
|
17
|
+
/**
 * Collect the distinct rule IDs mentioned in an LLM response.
 *
 * A candidate is two or more uppercase letters, a hyphen and exactly three
 * digits (e.g. CYBER-001, SEC-003). Candidates whose prefix is not the
 * `rulePrefix` of a judge in JUDGES are ignored.
 *
 * @param {string} response - LLM response text to scan.
 * @returns {string[]} Matching rule IDs, de-duplicated, in order of first occurrence.
 */
export function parseLlmRuleIds(response) {
    const knownPrefixes = new Set();
    JUDGES.forEach((judge) => {
        knownPrefixes.add(judge.rulePrefix);
    });
    // The /g regex is created per call, so its stateful lastIndex never leaks between calls.
    const ruleIdPattern = /\b([A-Z]{2,})-(\d{3})\b/g;
    const uniqueIds = new Set();
    for (let hit = ruleIdPattern.exec(response); hit !== null; hit = ruleIdPattern.exec(response)) {
        const [ruleId, prefix] = hit;
        if (knownPrefixes.has(prefix)) {
            uniqueIds.add(ruleId);
        }
    }
    return Array.from(uniqueIds);
}
|
|
33
|
+
// ─── Prompt Construction ────────────────────────────────────────────────────
|
|
34
|
+
// These construct the exact same prompts served via MCP, ensuring the
|
|
35
|
+
// benchmark tests the same prompts real users experience.
|
|
36
|
+
// ─────────────────────────────────────────────────────────────────────────────
|
|
37
|
+
/**
 * Build the prompt for a single judge — identical to the MCP-served `judge-{id}` prompt.
 *
 * Layout: the judge's system prompt, the precision mandate, the code inside a
 * fenced block tagged with `language`, then the output-format instructions.
 *
 * @param {object} judge - Judge definition; `systemPrompt` and `rulePrefix` are read.
 * @param {string} code - Source code to evaluate.
 * @param {string} language - Language name used in the request and as the fence tag.
 * @returns {string} The assembled prompt.
 */
export function constructPerJudgePrompt(judge, code, language) {
    const preamble = `${judge.systemPrompt}\n\n${PRECISION_MANDATE}\n\n`;
    const codeRequest = `Please evaluate the following ${language} code:\n\n\`\`\`${language}\n${code}\n\`\`\``;
    const outputFormat = `\n\nProvide your evaluation as structured findings with rule IDs (prefix: ${judge.rulePrefix}-), severity levels (critical/high/medium/low/info), descriptions, and actionable recommendations. End with an overall score (0-100) and verdict (pass/warning/fail).`;
    return [preamble, codeRequest, outputFormat].join("");
}
|
|
45
|
+
/**
 * Build the full-tribunal prompt — identical to the MCP-served `full-tribunal` prompt.
 *
 * Layout: panel introduction, the adversarial and precision mandates, the
 * evaluation instructions, one section per judge in JUDGES, then the code
 * inside a fenced block tagged with `language`.
 *
 * @param {string} code - Source code to evaluate.
 * @param {string} language - Language name used in the instructions and as the fence tag.
 * @returns {string} The assembled prompt.
 */
export function constructTribunalPrompt(code, language) {
    const judgeSections = [];
    JUDGES.forEach((judge) => {
        judgeSections.push(`### ${judge.name} — ${judge.domain}\n**Rule prefix:** \`${judge.rulePrefix}-\`\n\n${getCondensedCriteria(judge.systemPrompt)}`);
    });
    const judgeInstructions = judgeSections.join("\n\n---\n\n");
    const promptParts = [
        `You are the Judges Panel — a panel of ${JUDGES.length} expert judges who independently evaluate code for quality, security, and operational readiness.\n\n`,
        `## Universal Evaluation Directives\n\n`,
        `${SHARED_ADVERSARIAL_MANDATE}\n\n`,
        `${PRECISION_MANDATE}\n\n`,
        `## Evaluation Instructions\n\n`,
        `Evaluate the following ${language} code from the perspective of ALL ${JUDGES.length} judges below. For each judge, provide:\n`,
        `1. Judge name and domain\n`,
        `2. Verdict (PASS / WARNING / FAIL)\n`,
        `3. Score (0-100)\n`,
        `4. Specific findings with rule IDs (using each judge's rule prefix), severity, and recommendations\n\n`,
        `Then provide an OVERALL TRIBUNAL VERDICT that synthesizes all judges' input.\n\n`,
        `## The Judges\n\n${judgeInstructions}\n\n`,
        `## Code to Evaluate\n\n\`\`\`${language}\n${code}\n\`\`\``,
    ];
    return promptParts.join("");
}
|
|
64
|
+
// ─── Stratified Sampling ────────────────────────────────────────────────────
|
|
65
|
+
/**
 * Pick a sample of benchmark cases spread across categories and difficulties.
 *
 * When `targetSize` is at least `cases.length`, a copy of `cases` is returned.
 * Otherwise every category gets a quota of max(2, floor(targetSize / categoryCount))
 * cases, drawn from its "easy", then "medium", then "hard" pools. If the quotas
 * leave the sample short of `targetSize`, it is topped up from `cases` in input order.
 * Because of the two-per-category floor, the result may exceed `targetSize`.
 *
 * @param {Array<object>} cases - Candidate cases; `id`, `category` and `difficulty` are read.
 * @param {number} targetSize - Desired sample size.
 * @returns {Array<object>} The selected cases (no case id appears twice).
 */
export function selectStratifiedSample(cases, targetSize) {
    if (targetSize >= cases.length) {
        return Array.from(cases);
    }
    // Append `item` to the list stored under `key`, creating the list on first use.
    const groupInto = (buckets, key, item) => {
        (buckets[key] || (buckets[key] = [])).push(item);
    };
    const byCategory = {};
    for (const item of cases) {
        groupInto(byCategory, item.category, item);
    }
    const categoryNames = Object.keys(byCategory);
    const quota = Math.max(2, Math.floor(targetSize / categoryNames.length));
    const picked = [];
    const seenIds = new Set();
    const take = (item) => {
        picked.push(item);
        seenIds.add(item.id);
    };
    // Each difficulty takes ceil(remaining / divisor) cases, so "hard" may use whatever quota is left.
    const difficultyPlan = [
        ["easy", 3],
        ["medium", 2],
        ["hard", 1],
    ];
    for (const name of categoryNames) {
        const byDifficulty = {};
        for (const item of byCategory[name]) {
            groupInto(byDifficulty, item.difficulty, item);
        }
        let takenHere = 0;
        for (const [level, divisor] of difficultyPlan) {
            if (takenHere >= quota) {
                break;
            }
            const candidates = byDifficulty[level] || [];
            const share = Math.max(1, Math.ceil((quota - takenHere) / divisor));
            const limit = Math.min(share, candidates.length);
            for (let idx = 0; idx < limit && takenHere < quota; idx += 1) {
                const candidate = candidates[idx];
                if (seenIds.has(candidate.id)) {
                    continue;
                }
                take(candidate);
                takenHere += 1;
            }
        }
    }
    // Top up from the original ordering while the sample is still under target.
    for (const item of cases) {
        if (picked.length >= targetSize) {
            break;
        }
        if (!seenIds.has(item.id)) {
            take(item);
        }
    }
    return picked;
}
|
|
122
|
+
// ─── Scoring ────────────────────────────────────────────────────────────────
|
|
123
|
+
/**
 * Score a single LLM benchmark case using prefix-based matching.
 *
 * A rule ID's prefix is the text before its first "-". An expected rule is
 * matched when any detected rule shares its prefix. False positives are the
 * detected rules whose prefix matches one of the case's `unexpectedRuleIds`;
 * when the case defines none, the list is empty.
 *
 * A case with no expected rules passes when it has no false positives; any
 * other case passes when at least one expected rule is matched.
 *
 * @param {object} tc - Benchmark case; `id`, `category`, `difficulty`,
 *   `expectedRuleIds` and the optional `unexpectedRuleIds` are read.
 * @param {string[]} detectedRuleIds - Rule IDs reported for this case.
 * @param {string} rawResponse - LLM response text, stored unchanged on the result.
 * @param {number} [tokensUsed] - Token count, stored unchanged on the result.
 * @returns {object} A fully populated LlmCaseResult.
 */
export function scoreLlmCase(tc, detectedRuleIds, rawResponse, tokensUsed) {
    const prefixOf = (ruleId) => ruleId.split("-")[0];
    const detectedPrefixes = new Set(detectedRuleIds.map((ruleId) => prefixOf(ruleId)));
    // Every expected rule is either matched or missed, so one pass over the misses is enough.
    const missedExpected = tc.expectedRuleIds.filter((expected) => !detectedPrefixes.has(prefixOf(expected)));
    const matchedCount = tc.expectedRuleIds.length - missedExpected.length;
    let falsePositiveIds = [];
    if (tc.unexpectedRuleIds) {
        // Build the prefix set once instead of rescanning unexpectedRuleIds for every detection.
        const unexpectedPrefixes = new Set(tc.unexpectedRuleIds.map((ruleId) => prefixOf(ruleId)));
        falsePositiveIds = detectedRuleIds.filter((found) => unexpectedPrefixes.has(prefixOf(found)));
    }
    const casePassed = tc.expectedRuleIds.length === 0 ? falsePositiveIds.length === 0 : matchedCount > 0;
    return {
        caseId: tc.id,
        category: tc.category,
        difficulty: tc.difficulty,
        passed: casePassed,
        expectedRuleIds: tc.expectedRuleIds,
        detectedRuleIds,
        missedRuleIds: missedExpected,
        falsePositiveRuleIds: falsePositiveIds,
        rawResponse,
        tokensUsed,
    };
}
|
|
158
|
+
/**
 * Aggregate scored case results into an LLM benchmark snapshot.
 *
 * Per case: true positives = expected − missed, false negatives = missed,
 * false positives = the case's falsePositiveRuleIds. Precision and recall
 * default to 1 when their denominator is 0; F1 is 0 when both are 0.
 *
 * Per-judge buckets are keyed by the prefix of each detected rule ID. A
 * detection counts as a true positive when the case expects that prefix, and
 * as a false positive only when the case expects no rules at all. Per-judge
 * false negatives are never incremented here, so they stay 0.
 *
 * @param {Array<object>} rawCases - Scored case results; stored on the snapshot as `cases`.
 * @param {string} version - Version string recorded on the snapshot.
 * @param {string} model - Model name recorded on the snapshot.
 * @param {string} provider - Provider name recorded on the snapshot.
 * @param {"tribunal"|"per-judge"} promptMode - Prompt mode recorded on the snapshot.
 * @param {number} durationSeconds - Run duration recorded on the snapshot.
 * @param {number} [totalTokensUsed] - Total tokens; added to the snapshot only when truthy.
 * @returns {object} The snapshot, timestamped with the current time in ISO format.
 */
export function computeLlmMetrics(rawCases, version, model, provider, promptMode, durationSeconds, totalTokensUsed) {
    const prefixOf = (ruleId) => ruleId.split("-")[0];
    // hits / (hits + others), or 1 when there is nothing to divide by.
    const ratioOrOne = (hits, others) => (hits + others > 0 ? hits / (hits + others) : 1);
    // Fill precision / recall / f1Score on any bucket carrying TP / FN / FP counters.
    const applyScores = (bucket) => {
        bucket.precision = ratioOrOne(bucket.truePositives, bucket.falsePositives);
        bucket.recall = ratioOrOne(bucket.truePositives, bucket.falseNegatives);
        bucket.f1Score =
            bucket.precision + bucket.recall > 0
                ? (2 * bucket.precision * bucket.recall) / (bucket.precision + bucket.recall)
                : 0;
    };
    const zeroedScores = () => ({
        truePositives: 0,
        falseNegatives: 0,
        falsePositives: 0,
        precision: 0,
        recall: 0,
        f1Score: 0,
    });
    const perCategory = {};
    const perJudge = {};
    const perDifficulty = {};
    const overall = zeroedScores();
    let passedCases = 0;
    let expectedTotal = 0;
    for (const result of rawCases) {
        const hitCount = result.expectedRuleIds.length - result.missedRuleIds.length;
        const missCount = result.missedRuleIds.length;
        const spuriousCount = result.falsePositiveRuleIds.length;
        if (result.passed) {
            passedCases += 1;
        }
        expectedTotal += result.expectedRuleIds.length;
        overall.truePositives += hitCount;
        overall.falseNegatives += missCount;
        overall.falsePositives += spuriousCount;
        // Per-difficulty
        const difficultyBucket = perDifficulty[result.difficulty] ||
            (perDifficulty[result.difficulty] = { difficulty: result.difficulty, total: 0, detected: 0, detectionRate: 0 });
        difficultyBucket.total += 1;
        if (result.passed) {
            difficultyBucket.detected += 1;
        }
        // Per-category
        const categoryBucket = perCategory[result.category] ||
            (perCategory[result.category] = { category: result.category, total: 0, detected: 0, ...zeroedScores() });
        categoryBucket.total += 1;
        if (result.passed) {
            categoryBucket.detected += 1;
        }
        categoryBucket.truePositives += hitCount;
        categoryBucket.falseNegatives += missCount;
        categoryBucket.falsePositives += spuriousCount;
        // Per-judge
        const expectedPrefixes = new Set(result.expectedRuleIds.map((ruleId) => prefixOf(ruleId)));
        const expectsNothing = result.expectedRuleIds.length === 0;
        for (const ruleId of result.detectedRuleIds) {
            const prefix = prefixOf(ruleId);
            const judgeBucket = perJudge[prefix] || (perJudge[prefix] = { judgeId: prefix, total: 0, ...zeroedScores() });
            judgeBucket.total += 1;
            if (expectedPrefixes.has(prefix)) {
                judgeBucket.truePositives += 1;
            }
            else if (expectsNothing) {
                judgeBucket.falsePositives += 1;
            }
        }
    }
    applyScores(overall);
    Object.values(perDifficulty).forEach((bucket) => {
        bucket.detectionRate = bucket.total > 0 ? bucket.detected / bucket.total : 0;
    });
    Object.values(perCategory).forEach(applyScores);
    Object.values(perJudge).forEach(applyScores);
    const snapshot = {
        timestamp: new Date().toISOString(),
        version,
        model,
        provider,
        promptMode,
        totalCases: rawCases.length,
        detected: passedCases,
        missed: rawCases.length - passedCases,
        totalExpected: expectedTotal,
        truePositives: overall.truePositives,
        falseNegatives: overall.falseNegatives,
        falsePositives: overall.falsePositives,
        precision: overall.precision,
        recall: overall.recall,
        f1Score: overall.f1Score,
        detectionRate: rawCases.length > 0 ? passedCases / rawCases.length : 0,
        perCategory,
        perJudge,
        perDifficulty,
        cases: rawCases,
        durationSeconds,
    };
    // A missing or zero token count is treated as "not available" and omitted.
    if (totalTokensUsed) {
        snapshot.totalTokensUsed = totalTokensUsed;
    }
    return snapshot;
}
|
|
283
|
+
// ─── Markdown Formatting ────────────────────────────────────────────────────
|
|
284
|
+
/**
 * Render an LLM benchmark snapshot as the "Layer 2" markdown section of the report.
 *
 * Output: a heading and model/provider line, a summary table (grade derived
 * from F1: A >= 0.9, B >= 0.8, C >= 0.7, D >= 0.6, otherwise F), then
 * per-difficulty, per-category and per-judge tables when those maps are
 * non-empty, and a failed-cases table when there are 1 to 50 failed cases.
 *
 * @param {object} snapshot - LLM benchmark snapshot to render.
 * @returns {string} Markdown text with lines joined by "\n".
 */
export function formatLlmSnapshotMarkdown(snapshot) {
    const asPercent = (fraction) => `${(fraction * 100).toFixed(1)}%`;
    const tableRow = (...cells) => `| ${cells.map((cell) => `${cell}`).join(" | ")} |`;
    const byName = ([left], [right]) => left.localeCompare(right);
    const joinedOrDash = (ruleIds) => (ruleIds.length > 0 ? ruleIds.join(", ") : "—");
    const gradeBands = [
        [0.9, "A"],
        [0.8, "B"],
        [0.7, "C"],
        [0.6, "D"],
    ];
    const band = gradeBands.find(([floor]) => snapshot.f1Score >= floor);
    const grade = band ? band[1] : "F";
    const gradeEmoji = { A: "🟢", B: "🟡", C: "🟠" }[grade] || "🔴";
    const out = [
        "## Layer 2 — LLM Deep Review",
        "",
        `> Model: **${snapshot.model}** · Provider: ${snapshot.provider} · ${snapshot.totalCases} test cases · ${new Date(snapshot.timestamp).toLocaleDateString()}`,
        "",
        `| Metric | Value |`,
        `|--------|-------|`,
        tableRow("Overall Grade", `${gradeEmoji} **${grade}**`),
        tableRow("Test Cases", snapshot.totalCases),
        tableRow("Detection Rate", `${asPercent(snapshot.detectionRate)} (${snapshot.detected}/${snapshot.totalCases})`),
        tableRow("Precision", asPercent(snapshot.precision)),
        tableRow("Recall", asPercent(snapshot.recall)),
        tableRow("F1 Score", asPercent(snapshot.f1Score)),
        tableRow("True Positives", snapshot.truePositives),
        tableRow("False Negatives", snapshot.falseNegatives),
        tableRow("False Positives", snapshot.falsePositives),
    ];
    if (snapshot.totalTokensUsed) {
        out.push(tableRow("Total Tokens", snapshot.totalTokensUsed.toLocaleString()));
    }
    out.push(tableRow("Run Duration", `${snapshot.durationSeconds}s`), "");
    // Per-difficulty
    if (Object.keys(snapshot.perDifficulty).length > 0) {
        out.push("### L2 Detection by Difficulty", "", "| Difficulty | Detected | Total | Rate |", "|------------|----------|-------|------|");
        ["easy", "medium", "hard"].forEach((level) => {
            const bucket = snapshot.perDifficulty[level];
            if (bucket) {
                out.push(tableRow(level, bucket.detected, bucket.total, asPercent(bucket.detectionRate)));
            }
        });
        out.push("");
    }
    // Per-category
    if (Object.keys(snapshot.perCategory).length > 0) {
        out.push("### L2 Results by Category", "", "| Category | Detected | Total | Precision | Recall | F1 |", "|----------|----------|-------|-----------|--------|-----|");
        Object.entries(snapshot.perCategory)
            .sort(byName)
            .forEach(([name, stats]) => {
            out.push(tableRow(name, stats.detected, stats.total, asPercent(stats.precision), asPercent(stats.recall), asPercent(stats.f1Score)));
        });
        out.push("");
    }
    // Per-judge
    if (Object.keys(snapshot.perJudge).length > 0) {
        out.push("### L2 Results by Judge", "", "| Judge | Findings | TP | FP | Precision |", "|-------|----------|-----|-----|-----------|");
        Object.entries(snapshot.perJudge)
            .sort(byName)
            .forEach(([judgeId, stats]) => {
            out.push(tableRow(judgeId, stats.total, stats.truePositives, stats.falsePositives, asPercent(stats.precision)));
        });
        out.push("");
    }
    // Failed cases — the table is left out entirely once there are more than 50 failures.
    const failures = snapshot.cases.filter((entry) => !entry.passed);
    if (failures.length > 0 && failures.length <= 50) {
        out.push("### L2 Failed Cases", "", "| Case | Difficulty | Category | Missed Rules | False Positives |", "|------|------------|----------|--------------|-----------------|");
        failures.forEach((entry) => {
            out.push(tableRow(entry.caseId, entry.difficulty, entry.category, joinedOrDash(entry.missedRuleIds), joinedOrDash(entry.falsePositiveRuleIds)));
        });
        out.push("");
    }
    return out.join("\n");
}
|
|
372
|
+
/**
 * Render the "Layer Comparison" markdown section: one table with the L1
 * (deterministic) and L2 (LLM) metrics side by side, followed by a short
 * explanation of how the layers complement each other.
 *
 * @param {object} l1 - L1 metrics: `totalCases`, `detectionRate`, `precision`,
 *   `recall`, `f1Score` and `falsePositives` are read.
 * @param {object} l2 - LLM benchmark snapshot; its `model` name labels the L2 column.
 * @returns {string} Markdown text with lines joined by "\n".
 */
export function formatLayerComparisonMarkdown(l1, l2) {
    const asPercent = (fraction) => `${(fraction * 100).toFixed(1)}%`;
    // Size the L2 divider to the "L2 (<model>)" header, with a floor of five dashes.
    const l2Divider = "-".repeat(Math.max(l2.model.length + 4, 5));
    const rateRows = [
        ["Detection Rate", "detectionRate"],
        ["Precision", "precision"],
        ["Recall", "recall"],
        ["F1 Score", "f1Score"],
    ].map(([label, field]) => `| ${label} | ${asPercent(l1[field])} | ${asPercent(l2[field])} |`);
    return [
        "## Layer Comparison",
        "",
        `| Metric | L1 (Deterministic) | L2 (${l2.model}) |`,
        `|--------|-------------------|${l2Divider}|`,
        `| Test Cases | ${l1.totalCases} | ${l2.totalCases} |`,
        ...rateRows,
        `| False Positives | ${l1.falsePositives} | ${l2.falsePositives} |`,
        "",
        "The two layers are **complementary**:",
        "- **L1** provides fast, reliable baseline detection with high precision and zero cost",
        "- **L2** catches sophisticated issues that patterns miss, at the cost of API calls and latency",
        "- Together, they deliver defense-in-depth code analysis",
        "",
    ].join("\n");
}
|
|
396
|
+
//# sourceMappingURL=llm-benchmark.js.map
|