@kevinrabun/judges 3.121.0 → 3.123.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +10 -9
- package/agents/authentication.judge.md +17 -0
- package/agents/cybersecurity.judge.md +18 -0
- package/agents/error-handling.judge.md +9 -0
- package/agents/rate-limiting.judge.md +9 -0
- package/agents/security.judge.md +18 -0
- package/dist/api.d.ts +1 -2
- package/dist/api.js +1 -3
- package/dist/commands/benchmark-advanced.js +1 -1
- package/dist/commands/benchmark-ai-agents.js +3 -3
- package/dist/commands/benchmark-infrastructure.js +1 -1
- package/dist/commands/benchmark-quality-ops.js +1 -1
- package/dist/commands/benchmark.d.ts +7 -0
- package/dist/commands/benchmark.js +7 -7
- package/dist/commands/llm-benchmark.d.ts +22 -2
- package/dist/commands/llm-benchmark.js +184 -23
- package/dist/config.js +1 -1
- package/dist/judges/authentication.js +17 -0
- package/dist/judges/cybersecurity.js +18 -0
- package/dist/judges/error-handling.js +9 -0
- package/dist/judges/rate-limiting.js +9 -0
- package/dist/judges/security.js +18 -0
- package/dist/probabilistic/llm-response-validator.js +2 -2
- package/dist/reports/public-repo-report.js +9 -1
- package/dist/skill-loader.js +9 -6
- package/dist/tools/prompts.d.ts +3 -2
- package/dist/tools/prompts.js +25 -51
- package/package.json +2 -2
- package/server.json +13 -4
- package/src/skill-loader.ts +9 -6
package/README.md
CHANGED
|
@@ -15,7 +15,7 @@ An MCP (Model Context Protocol) server that provides a panel of **45 specialized
|
|
|
15
15
|
[](https://www.npmjs.com/package/@kevinrabun/judges)
|
|
16
16
|
[](https://www.npmjs.com/package/@kevinrabun/judges)
|
|
17
17
|
[](https://opensource.org/licenses/MIT)
|
|
18
|
-
[](https://github.com/KevinRabun/judges/actions)
|
|
19
19
|
|
|
20
20
|
> ๐ฐ **Packages**
|
|
21
21
|
> - **CLI**: `@kevinrabun/judges-cli` → binary `judges` (use `npx @kevinrabun/judges-cli eval --file app.ts`).
|
|
@@ -731,6 +731,8 @@ Use `--preset` to apply pre-configured evaluation settings:
|
|
|
731
731
|
| `healthtech` | Healthcare — HIPAA compliance, data sovereignty, encryption, audit trails |
|
|
732
732
|
| `saas` | Multi-tenant SaaS — tenant isolation, rate limiting, scalability |
|
|
733
733
|
| `government` | Government/public sector — compliance, sovereignty, authentication |
|
|
734
|
+
| `open-source` | Open-source projects — documentation, backwards compatibility, security, dependency health |
|
|
735
|
+
| `ai-review` | AI-generated code review — hallucination detection, security, authentication, correctness |
|
|
734
736
|
|
|
735
737
|
```bash
|
|
736
738
|
judges eval --preset security-only src/api.ts
|
|
@@ -833,7 +835,7 @@ The tribunal operates in three layers:
|
|
|
833
835
|
|
|
834
836
|
2. **AST-Based Structural Analysis** — The Code Structure judge (`STRUCT-*` rules) uses real Abstract Syntax Tree parsing to measure cyclomatic complexity, nesting depth, function length, parameter count, dead code, and type safety with precision that regex cannot achieve. All supported languages — **TypeScript, JavaScript, Python, Rust, Go, Java, C#, and C++** — are parsed via **tree-sitter WASM grammars** (real syntax trees compiled to WebAssembly, in-process, zero native dependencies). A scope-tracking structural parser is kept as a fallback when WASM grammars are unavailable. No external AST server required.
|
|
835
837
|
|
|
836
|
-
3. **LLM-Powered Deep Analysis (Prompts)** — The server exposes MCP prompts (e.g., `judge-data-security`, `
|
|
838
|
+
3. **LLM-Powered Deep Analysis (Prompts)** — The server exposes MCP prompts (e.g., `judge-data-security`, `judge-cybersecurity`) that provide each judge's expert persona as a system prompt. When used by an LLM-based client (Copilot, Claude, Cursor, etc.), the host LLM performs deeper, context-aware probabilistic analysis beyond what static patterns can detect. This is where the `systemPrompt` on each judge comes alive — Judges itself makes no LLM calls, but it provides the expert criteria so your AI assistant can act as 45 specialized reviewers.
|
|
837
839
|
|
|
838
840
|
---
|
|
839
841
|
|
|
@@ -877,7 +879,7 @@ When your AI coding assistant connects to multiple MCP servers, each one contrib
|
|
|
877
879
|
โ Judges โ โ CVE / โ โ Linter โ
|
|
878
880
|
โ Panel โ โ SBOM โ โ Server โ
|
|
879
881
|
โ โโโโโโโโโโโโโโ โโโโโโโโโโ โโโโโโโโโโ
|
|
880
|
-
โ
|
|
882
|
+
โ 44 Heuristic โ Vuln DB Style &
|
|
881
883
|
โ judges โ scanning correctness
|
|
882
884
|
โ + AST judge โ
|
|
883
885
|
โโโโโโโโโโโโโโโโ
|
|
@@ -1130,7 +1132,7 @@ Re-run the tribunal with **prior findings as context** for iterative refinement.
|
|
|
1130
1132
|
|
|
1131
1133
|
#### Judge IDs
|
|
1132
1134
|
|
|
1133
|
-
`data-security` · `cybersecurity` · `cost-effectiveness` · `scalability` · `cloud-readiness` · `software-practices` · `accessibility` · `api-design` · `reliability` · `observability` · `performance` · `compliance` · `data-sovereignty` · `testing` · `documentation` · `internationalization` · `dependency-health` · `concurrency` · `ethics-bias` · `maintainability` · `error-handling` · `authentication` · `database` · `caching` · `configuration-management` · `backwards-compatibility` · `portability` · `ux` · `logging-privacy` · `rate-limiting` · `ci-cd` · `code-structure` · `agent-instructions` · `ai-code-safety` · `framework-safety` · `iac-security` · `false-positive-review`
|
|
1135
|
+
`data-security` · `cybersecurity` · `security` · `cost-effectiveness` · `scalability` · `cloud-readiness` · `software-practices` · `accessibility` · `api-design` · `api-contract` · `reliability` · `observability` · `performance` · `compliance` · `data-sovereignty` · `testing` · `documentation` · `internationalization` · `dependency-health` · `concurrency` · `ethics-bias` · `maintainability` · `error-handling` · `authentication` · `database` · `caching` · `configuration-management` · `backwards-compatibility` · `portability` · `ux` · `logging-privacy` · `rate-limiting` · `ci-cd` · `code-structure` · `agent-instructions` · `ai-code-safety` · `framework-safety` · `iac-security` · `hallucination-detection` · `intent-alignment` · `multi-turn-coherence` · `model-fingerprint` · `over-engineering` · `logic-review` · `false-positive-review`
|
|
1134
1136
|
|
|
1135
1137
|
---
|
|
1136
1138
|
|
|
@@ -1186,7 +1188,6 @@ Each judge has a corresponding prompt for LLM-powered deep analysis:
|
|
|
1186
1188
|
| `judge-over-engineering` | Deep review of unnecessary abstractions, wrapper-mania, premature generalization |
|
|
1187
1189
|
| `judge-logic-review` | Deep review of logic correctness, semantic mismatches, and dead code in AI-generated code |
|
|
1188
1190
|
| `judge-false-positive-review` | Meta-judge review of pattern-based findings for false positive detection and accuracy |
|
|
1189
|
-
| `full-tribunal` | all 45 judges in a single prompt |
|
|
1190
1191
|
<!-- PROMPTS_TABLE_END -->
|
|
1191
1192
|
|
|
1192
1193
|
---
|
|
@@ -1216,7 +1217,7 @@ Create a `.judgesrc.json` (or `.judgesrc`) file in your project root to customiz
|
|
|
1216
1217
|
| Field | Type | Default | Description |
|
|
1217
1218
|
|-------|------|---------|-------------|
|
|
1218
1219
|
| `$schema` | `string` | — | JSON Schema URL for IDE validation |
|
|
1219
|
-
| `preset` | `string` | — | Named preset (see [Named Presets](#named-presets) for all
|
|
1220
|
+
| `preset` | `string` | — | Named preset (see [Named Presets](#named-presets) for all 22 options) |
|
|
1220
1221
|
| `minSeverity` | `string` | `"info"` | Minimum severity to report: `critical` · `high` · `medium` · `low` · `info` |
|
|
1221
1222
|
| `disabledRules` | `string[]` | `[]` | Rule IDs or prefix wildcards to suppress (e.g. `"COST-*"`, `"SEC-003"`) |
|
|
1222
1223
|
| `disabledJudges` | `string[]` | `[]` | Judge IDs to skip entirely (e.g. `"cost-effectiveness"`) |
|
|
@@ -1344,7 +1345,7 @@ judges/
|
|
|
1344
1345
|
โ โโโ evaluators/ # Analysis engine for each judge
|
|
1345
1346
|
โ โ โโโ index.ts # evaluateWithJudge(), evaluateWithTribunal(), evaluateProject(), etc.
|
|
1346
1347
|
โ โ โโโ shared.ts # Scoring, verdict logic, markdown formatters
|
|
1347
|
-
โ โ โโโ *.ts # One analyzer per judge (
|
|
1348
|
+
โ โ โโโ *.ts # One analyzer per judge (45 files)
|
|
1348
1349
|
โ โโโ formatters/ # Output formatters
|
|
1349
1350
|
โ โ โโโ sarif.ts # SARIF 2.1.0 output
|
|
1350
1351
|
โ โ โโโ html.ts # Self-contained HTML report (dark/light theme, filters)
|
|
@@ -1371,12 +1372,12 @@ judges/
|
|
|
1371
1372
|
โ โ โโโ config-share.ts # Shareable team/org configuration
|
|
1372
1373
|
โ โโโ presets.ts # Named evaluation presets (strict, lenient, security-only, โฆ)
|
|
1373
1374
|
โ โโโ patches/
|
|
1374
|
-
โ โ โโโ index.ts #
|
|
1375
|
+
โ โ โโโ index.ts # 201 deterministic auto-fix patch rules
|
|
1375
1376
|
โ โโโ tools/ # MCP tool registrations
|
|
1376
1377
|
โ โ โโโ register.ts # Tool registration orchestrator
|
|
1377
1378
|
โ โ โโโ register-evaluation.ts # Evaluation tools (evaluate_code, etc.)
|
|
1378
1379
|
โ โ โโโ register-workflow.ts # Workflow tools (app builder, reports, etc.)
|
|
1379
|
-
โ โ โโโ prompts.ts # MCP prompt registrations (per-judge
|
|
1380
|
+
โ โ โโโ prompts.ts # MCP prompt registrations (per-judge prompts)
|
|
1380
1381
|
โ โ โโโ schemas.ts # Zod schemas for tool parameters
|
|
1381
1382
|
โ โโโ reports/
|
|
1382
1383
|
โ โ โโโ public-repo-report.ts # Public repo clone + full tribunal report generation
|
|
@@ -30,6 +30,23 @@ RULES FOR YOUR EVALUATION:
|
|
|
30
30
|
- Flag any endpoint that accepts user input without verifying the caller's identity and permissions.
|
|
31
31
|
- Score from 0-100 where 100 means robust auth implementation.
|
|
32
32
|
|
|
33
|
+
CLEAN CODE RECOGNITION (if ALL of the following are true, report ZERO findings):
|
|
34
|
+
- Authentication middleware protects all routes that handle user data or state changes.
|
|
35
|
+
- Passwords are hashed with bcrypt, scrypt, or argon2 — not stored in plaintext or weak hashes.
|
|
36
|
+
- JWTs are verified with explicit algorithm restrictions, expiration, and issuer/audience checks.
|
|
37
|
+
- Sessions use secure, httpOnly, sameSite cookies with proper expiration and rotation.
|
|
38
|
+
- OAuth/OIDC flows use PKCE, validate state parameters, and allowlist redirect URIs.
|
|
39
|
+
- API keys are transmitted in headers (not query params) and scoped to minimum permissions.
|
|
40
|
+
If the code meets these criteria, authentication is implemented correctly. Do NOT manufacture findings.
|
|
41
|
+
|
|
42
|
+
DOMAIN BOUNDARY (defer these to other judges):
|
|
43
|
+
- Injection attacks and XSS exploit paths โ defer to CYBER judge.
|
|
44
|
+
- General security posture and cryptographic practices โ defer to SEC judge.
|
|
45
|
+
- Rate limiting on login endpoints โ defer to RATE judge (unless auth logic itself is broken).
|
|
46
|
+
- Error handling in auth flows โ defer to ERR judge.
|
|
47
|
+
- Data privacy in auth tokens/logs โ defer to DATA/LOGPRIV judges.
|
|
48
|
+
Only flag issues within YOUR domain: authentication middleware gaps, credential handling, token security, session management, authorization checks, OAuth/OIDC implementation, privilege escalation.
|
|
49
|
+
|
|
33
50
|
FALSE POSITIVE AVOIDANCE:
|
|
34
51
|
- Do NOT flag code that uses established authentication libraries (passport, next-auth, Spring Security, etc.) following their documented patterns.
|
|
35
52
|
- JWT verification with explicit algorithm restrictions and proper expiration checks is correct implementation, not a vulnerability.
|
|
@@ -29,6 +29,24 @@ RULES FOR YOUR EVALUATION:
|
|
|
29
29
|
- Reference OWASP, CWE IDs, and CVE IDs where applicable.
|
|
30
30
|
- Score from 0-100 where 100 means no exploitable vulnerabilities found.
|
|
31
31
|
|
|
32
|
+
CLEAN CODE RECOGNITION (if ALL of the following are true, report ZERO findings):
|
|
33
|
+
- Input validation and sanitization are applied to user-controlled data before use in queries, commands, or output.
|
|
34
|
+
- Passwords are hashed with bcrypt, scrypt, or argon2 — not MD5/SHA1.
|
|
35
|
+
- Database queries use parameterized statements or an ORM with proper escaping.
|
|
36
|
+
- Security middleware is present (helmet, CORS, CSRF tokens) for web applications.
|
|
37
|
+
- Secrets are loaded from environment variables or a secrets manager, not hardcoded.
|
|
38
|
+
- Dependencies are imported from standard registries with version pinning.
|
|
39
|
+
- Error responses do not leak stack traces or internal details to clients.
|
|
40
|
+
If the code meets these criteria, it is implementing security correctly. Do NOT manufacture findings.
|
|
41
|
+
|
|
42
|
+
DOMAIN BOUNDARY (defer these to other judges):
|
|
43
|
+
- Rate limiting, throttling, and abuse prevention โ defer to RATE judge.
|
|
44
|
+
- Authentication flows, session management, OAuth/OIDC โ defer to AUTH judge.
|
|
45
|
+
- General security posture, defense-in-depth patterns โ defer to SEC judge.
|
|
46
|
+
- Error handling completeness and error propagation โ defer to ERR judge.
|
|
47
|
+
- Data privacy, PII handling, logging of sensitive data โ defer to DATA/LOGPRIV judges.
|
|
48
|
+
Only flag issues within YOUR domain: injection attacks, XSS, CSRF/SSRF, dependency CVEs, cryptographic weaknesses, OWASP Top 10 violations with concrete exploit paths.
|
|
49
|
+
|
|
32
50
|
FALSE POSITIVE AVOIDANCE:
|
|
33
51
|
- Do NOT flag established security library usage (helmet, cors, bcrypt, argon2, parameterized queries) as security issues โ these ARE the correct patterns.
|
|
34
52
|
- Code that properly validates input, uses HTTPS, and parameterizes queries is implementing security correctly.
|
|
@@ -30,6 +30,15 @@ RULES FOR YOUR EVALUATION:
|
|
|
30
30
|
- Flag any code path that could throw without a handler in scope.
|
|
31
31
|
- Score from 0-100 where 100 means robust error handling.
|
|
32
32
|
|
|
33
|
+
CLEAN CODE RECOGNITION (if ALL of the following are true, report ZERO findings):
|
|
34
|
+
- Try-catch blocks wrap code paths that can throw, with meaningful handling (log, re-throw, or recover).
|
|
35
|
+
- Async operations use try-catch or .catch() to handle rejections.
|
|
36
|
+
- Error responses return consistent structures with appropriate HTTP status codes.
|
|
37
|
+
- Resources (connections, file handles, streams) are cleaned up in finally blocks or using disposal patterns.
|
|
38
|
+
- Framework error middleware or global handlers are present (Express error middleware, Spring @ExceptionHandler, etc.).
|
|
39
|
+
- Stack traces and internal details are not exposed to end users in error responses.
|
|
40
|
+
If the code meets these criteria, error handling is implemented correctly. Do NOT manufacture findings.
|
|
41
|
+
|
|
33
42
|
FALSE POSITIVE AVOIDANCE:
|
|
34
43
|
- Do NOT flag error handling in code that delegates error handling to a framework (Express middleware, Spring @ExceptionHandler, etc.).
|
|
35
44
|
- Try-catch with logging and re-throw is a valid error handling pattern, not a deficiency.
|
|
@@ -30,6 +30,15 @@ RULES FOR YOUR EVALUATION:
|
|
|
30
30
|
- Consider both inbound (protecting your service) and outbound (respecting others') rate limits.
|
|
31
31
|
- Score from 0-100 where 100 means comprehensive rate limiting.
|
|
32
32
|
|
|
33
|
+
CLEAN CODE RECOGNITION (if ALL of the following are true, report ZERO findings):
|
|
34
|
+
- Rate limiting middleware is applied to public-facing API endpoints (express-rate-limit, API gateway config, etc.).
|
|
35
|
+
- Request body size limits are configured (bodyParser limits, multer limits, etc.).
|
|
36
|
+
- List/query endpoints have pagination with enforced maximum page sizes.
|
|
37
|
+
- External API calls use bounded retries with exponential backoff and jitter.
|
|
38
|
+
- Connection pools and concurrent request limits are bounded.
|
|
39
|
+
If the code meets these criteria, rate limiting is implemented correctly. Do NOT manufacture findings.
|
|
40
|
+
IMPORTANT: CLI tools, data scripts, utility libraries, batch processors, and internal services do NOT need rate limiting. If the code is not a public-facing API or web server, report ZERO findings.
|
|
41
|
+
|
|
33
42
|
FALSE POSITIVE AVOIDANCE:
|
|
34
43
|
- Only flag rate-limiting issues in code that accepts external requests (APIs, WebSocket servers, public endpoints).
|
|
35
44
|
- Do NOT flag internal services, batch processors, CLI tools, or cron jobs for missing rate limiting.
|
package/agents/security.judge.md
CHANGED
|
@@ -30,6 +30,24 @@ RULES FOR YOUR EVALUATION:
|
|
|
30
30
|
- Reference CWE IDs where applicable.
|
|
31
31
|
- Score from 0-100 where 100 means excellent security posture.
|
|
32
32
|
|
|
33
|
+
CLEAN CODE RECOGNITION (if ALL of the following are true, report ZERO findings):
|
|
34
|
+
- Security middleware is configured (helmet, CORS, CSRF protection) for web applications.
|
|
35
|
+
- User input is validated before use in data flows (queries, file ops, HTTP requests).
|
|
36
|
+
- Cryptographic operations use modern algorithms (AES-256, SHA-256+, bcrypt/argon2).
|
|
37
|
+
- Secrets are sourced from environment variables or a secrets manager, not hardcoded.
|
|
38
|
+
- Deserialization of untrusted data uses safe mechanisms (JSON.parse, not pickle/eval).
|
|
39
|
+
- JWT/token verification includes algorithm restrictions and expiration checks.
|
|
40
|
+
- No user-controlled URLs are used in redirects without validation.
|
|
41
|
+
If the code meets these criteria, it has a strong security posture. Do NOT manufacture findings.
|
|
42
|
+
|
|
43
|
+
DOMAIN BOUNDARY (defer these to other judges):
|
|
44
|
+
- Injection attacks (SQL, XSS, command injection) with exploit paths โ defer to CYBER judge.
|
|
45
|
+
- Authentication flows, credential storage, session management โ defer to AUTH judge.
|
|
46
|
+
- Rate limiting and abuse prevention โ defer to RATE judge.
|
|
47
|
+
- Error handling patterns and error propagation โ defer to ERR judge.
|
|
48
|
+
- Infrastructure-as-code security โ defer to IAC judge.
|
|
49
|
+
Only flag issues within YOUR domain: insecure data flows, weak cryptography, missing security controls, unsafe deserialization, XML security, secret management, mass assignment, redirect validation.
|
|
50
|
+
|
|
33
51
|
FALSE POSITIVE AVOIDANCE:
|
|
34
52
|
- Do NOT flag code that uses established security libraries correctly (helmet, bcrypt, argon2, parameterized queries, CSRF tokens, rate limiters, proper TLS configuration).
|
|
35
53
|
- Do NOT flag security controls in non-application code (CI/CD configs, IaC templates, documentation examples) unless they contain actual secrets or credentials.
|
package/dist/api.d.ts
CHANGED
|
@@ -70,10 +70,9 @@ export { compareCapabilities, formatComparisonReport, formatFullComparisonMatrix
|
|
|
70
70
|
export type { ToolProfile, ToolCapability, ComparisonResult } from "./comparison.js";
|
|
71
71
|
export { runBenchmarkSuite, benchmarkGate, formatBenchmarkReport, formatBenchmarkMarkdown, analyzeL2Coverage, formatL2CoverageReport, ingestFindingsAsBenchmarkCases, deduplicateIngestCases, BENCHMARK_CASES, } from "./commands/benchmark.js";
|
|
72
72
|
export type { BenchmarkCase, BenchmarkResult, BenchmarkGateOptions, BenchmarkGateResult, L2CoverageAnalysis, L2JudgeCoverage, L2CategoryCoverage, } from "./commands/benchmark.js";
|
|
73
|
-
export { parseLlmRuleIds, constructPerJudgePrompt, constructTribunalPrompt, selectStratifiedSample, scoreLlmCase, computeLlmMetrics, formatLlmSnapshotMarkdown, formatLayerComparisonMarkdown, extractValidatedLlmFindings, getValidRulePrefixes, getTribunalValidPrefixes, } from "./commands/llm-benchmark.js";
|
|
73
|
+
export { parseLlmRuleIds, constructPerJudgePrompt, constructTribunalPrompt, selectStratifiedSample, selectJudgesForCategory, scoreLlmCase, computeLlmMetrics, formatLlmSnapshotMarkdown, formatLayerComparisonMarkdown, extractValidatedLlmFindings, getValidRulePrefixes, getTribunalValidPrefixes, } from "./commands/llm-benchmark.js";
|
|
74
74
|
export type { LlmBenchmarkSnapshot, LlmCaseResult } from "./commands/llm-benchmark.js";
|
|
75
75
|
export type { LlmFinding, ValidationResult } from "./probabilistic/llm-response-validator.js";
|
|
76
|
-
export { optimizeBenchmark, formatAmendmentSection, createEmptyStore, mergeAmendments, } from "./commands/llm-benchmark-optimizer.js";
|
|
77
76
|
export type { PromptAmendment, OptimizerInsight, OptimizationResult, AmendmentStore, } from "./commands/llm-benchmark-optimizer.js";
|
|
78
77
|
export { runReviewAutopilot, dedupeComments, filterAlreadyPostedComments } from "./commands/review.js";
|
|
79
78
|
export { buildContextSnippets } from "./context/context-snippets.js";
|
package/dist/api.js
CHANGED
|
@@ -80,9 +80,7 @@ export { compareCapabilities, formatComparisonReport, formatFullComparisonMatrix
|
|
|
80
80
|
// โโโ Benchmark Gate โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
|
81
81
|
export { runBenchmarkSuite, benchmarkGate, formatBenchmarkReport, formatBenchmarkMarkdown, analyzeL2Coverage, formatL2CoverageReport, ingestFindingsAsBenchmarkCases, deduplicateIngestCases, BENCHMARK_CASES, } from "./commands/benchmark.js";
|
|
82
82
|
// โโโ LLM Benchmark โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
|
83
|
-
export { parseLlmRuleIds, constructPerJudgePrompt, constructTribunalPrompt, selectStratifiedSample, scoreLlmCase, computeLlmMetrics, formatLlmSnapshotMarkdown, formatLayerComparisonMarkdown, extractValidatedLlmFindings, getValidRulePrefixes, getTribunalValidPrefixes, } from "./commands/llm-benchmark.js";
|
|
84
|
-
// โโโ LLM Benchmark Optimizer (Self-Teaching) โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
|
85
|
-
export { optimizeBenchmark, formatAmendmentSection, createEmptyStore, mergeAmendments, } from "./commands/llm-benchmark-optimizer.js";
|
|
83
|
+
export { parseLlmRuleIds, constructPerJudgePrompt, constructTribunalPrompt, selectStratifiedSample, selectJudgesForCategory, scoreLlmCase, computeLlmMetrics, formatLlmSnapshotMarkdown, formatLayerComparisonMarkdown, extractValidatedLlmFindings, getValidRulePrefixes, getTribunalValidPrefixes, } from "./commands/llm-benchmark.js";
|
|
86
84
|
// Review autopilot (GitHub App / scripts)
|
|
87
85
|
export { runReviewAutopilot, dedupeComments, filterAlreadyPostedComments } from "./commands/review.js";
|
|
88
86
|
export { buildContextSnippets } from "./context/context-snippets.js";
|
|
@@ -229,7 +229,7 @@ def load_config(path):
|
|
|
229
229
|
public void authenticateUser(String u, String p) { /* 40 lines */ }
|
|
230
230
|
public void scheduleTask(Task t) { /* 20 lines */ }
|
|
231
231
|
}`,
|
|
232
|
-
expectedRuleIds: ["
|
|
232
|
+
expectedRuleIds: ["STRUCT-001", "MAINT-001", "DOC-001"],
|
|
233
233
|
category: "code-structure",
|
|
234
234
|
difficulty: "medium",
|
|
235
235
|
},
|
|
@@ -27,7 +27,7 @@ app.post("/api/users", validateSchema(userSchema), async (req, res) => {
|
|
|
27
27
|
await user.save();
|
|
28
28
|
res.json({ id: user.id });
|
|
29
29
|
});`,
|
|
30
|
-
expectedRuleIds: ["
|
|
30
|
+
expectedRuleIds: ["HALLU-001"],
|
|
31
31
|
category: "hallucination",
|
|
32
32
|
difficulty: "medium",
|
|
33
33
|
},
|
|
@@ -57,7 +57,7 @@ app.post("/api/users", validateSchema(userSchema), async (req, res) => {
|
|
|
57
57
|
|
|
58
58
|
return { formatted, config, serialized };
|
|
59
59
|
}`,
|
|
60
|
-
expectedRuleIds: ["
|
|
60
|
+
expectedRuleIds: ["HALLU-001"],
|
|
61
61
|
category: "hallucination",
|
|
62
62
|
difficulty: "easy",
|
|
63
63
|
},
|
|
@@ -1018,7 +1018,7 @@ def delete_user(request):
|
|
|
1018
1018
|
async auditLog(action: string) { /* 30 lines */ }
|
|
1019
1019
|
// 2000+ lines, 50+ methods, handles everything
|
|
1020
1020
|
}`,
|
|
1021
|
-
expectedRuleIds: ["
|
|
1021
|
+
expectedRuleIds: ["MAINT-001"],
|
|
1022
1022
|
category: "software-development",
|
|
1023
1023
|
difficulty: "medium",
|
|
1024
1024
|
},
|
|
@@ -1201,7 +1201,7 @@ jobs:
|
|
|
1201
1201
|
-H "Authorization: Bearer \${{ secrets.DEPLOY_TOKEN }}" \\
|
|
1202
1202
|
-d '{"sha": "\${{ github.sha }}"}'
|
|
1203
1203
|
- run: echo "$\{{ secrets.AWS_SECRET_KEY }}" > /tmp/key`,
|
|
1204
|
-
expectedRuleIds: ["
|
|
1204
|
+
expectedRuleIds: ["SEC-001"],
|
|
1205
1205
|
category: "cicd",
|
|
1206
1206
|
difficulty: "medium",
|
|
1207
1207
|
},
|
|
@@ -25,6 +25,13 @@ export interface BenchmarkCase {
|
|
|
25
25
|
expectedRuleIds: string[];
|
|
26
26
|
/** Rule IDs that should NOT be detected (known false positives) */
|
|
27
27
|
unexpectedRuleIds?: string[];
|
|
28
|
+
/**
|
|
29
|
+
* Acceptable rule prefixes: findings from these judge domains are
|
|
30
|
+
* domain-relevant and should NOT count as false positives even if not
|
|
31
|
+
* in expectedRuleIds. For example, a SQL-injection case may acceptably
|
|
32
|
+
* also trigger AUTH or SEC findings.
|
|
33
|
+
*/
|
|
34
|
+
acceptablePrefixes?: string[];
|
|
28
35
|
/** Category of vulnerability (e.g. "injection", "auth", "xss") */
|
|
29
36
|
category: string;
|
|
30
37
|
/** Difficulty level */
|
|
@@ -844,7 +844,7 @@ function getErrorMessage(code: number): string {
|
|
|
844
844
|
"bower": "^1.8.0"
|
|
845
845
|
}
|
|
846
846
|
}`,
|
|
847
|
-
expectedRuleIds: ["DEPS-001"
|
|
847
|
+
expectedRuleIds: ["DEPS-001"],
|
|
848
848
|
category: "dependency-health",
|
|
849
849
|
difficulty: "easy",
|
|
850
850
|
},
|
|
@@ -2337,13 +2337,13 @@ export function runBenchmarkSuite(cases, judgeId) {
|
|
|
2337
2337
|
cat.truePositives += caseTP;
|
|
2338
2338
|
cat.falseNegatives += caseFN;
|
|
2339
2339
|
cat.falsePositives += caseFP;
|
|
2340
|
-
// Per-judge accumulators
|
|
2341
|
-
|
|
2342
|
-
// Dirty-case "extra" detections are legitimate secondary findings and
|
|
2343
|
-
// should not inflate per-judge false-positive rates.
|
|
2344
|
-
const isCleanCase = tc.expectedRuleIds.length === 0;
|
|
2340
|
+
// Per-judge accumulators (deduplicate by prefix per case to match case-level FP counting)
|
|
2341
|
+
const seenPrefixes = new Set();
|
|
2345
2342
|
for (const ruleId of foundRuleIds) {
|
|
2346
2343
|
const prefix = ruleId.split("-")[0];
|
|
2344
|
+
if (seenPrefixes.has(prefix))
|
|
2345
|
+
continue;
|
|
2346
|
+
seenPrefixes.add(prefix);
|
|
2347
2347
|
if (!perJudge[prefix]) {
|
|
2348
2348
|
perJudge[prefix] = {
|
|
2349
2349
|
judgeId: prefix,
|
|
@@ -2361,7 +2361,7 @@ export function runBenchmarkSuite(cases, judgeId) {
|
|
|
2361
2361
|
if (expectedPrefixes.has(prefix)) {
|
|
2362
2362
|
jb.truePositives++;
|
|
2363
2363
|
}
|
|
2364
|
-
else
|
|
2364
|
+
else {
|
|
2365
2365
|
jb.falsePositives++;
|
|
2366
2366
|
}
|
|
2367
2367
|
}
|
|
@@ -17,6 +17,19 @@ import type { JudgeDefinition } from "../types.js";
|
|
|
17
17
|
import type { BenchmarkCase, CategoryResult, JudgeBenchmarkResult, DifficultyResult } from "./benchmark.js";
|
|
18
18
|
import type { PromptAmendment } from "./llm-benchmark-optimizer.js";
|
|
19
19
|
export declare const TRIBUNAL_JUDGES: JudgeDefinition[];
|
|
20
|
+
/**
|
|
21
|
+
* Get acceptable prefixes for a benchmark case. Uses the case's explicit
|
|
22
|
+
* acceptablePrefixes if defined, otherwise falls back to the category map.
|
|
23
|
+
* Expected prefixes are always included (they're TPs, not FPs).
|
|
24
|
+
*/
|
|
25
|
+
export declare function getAcceptablePrefixes(tc: BenchmarkCase): Set<string>;
|
|
26
|
+
/**
|
|
27
|
+
* Select a focused subset of tribunal judges relevant to a benchmark case's
|
|
28
|
+
* category. Returns core judges + category-specific judges, typically 8-15
|
|
29
|
+
* instead of the full 35. Returns undefined if no routing is possible
|
|
30
|
+
* (unknown category), signalling the caller to use all tribunal judges.
|
|
31
|
+
*/
|
|
32
|
+
export declare function selectJudgesForCategory(category: string): JudgeDefinition[] | undefined;
|
|
20
33
|
export interface LlmBenchmarkSnapshot {
|
|
21
34
|
/** Timestamp of this LLM benchmark run */
|
|
22
35
|
timestamp: string;
|
|
@@ -48,6 +61,8 @@ export interface LlmBenchmarkSnapshot {
|
|
|
48
61
|
recall: number;
|
|
49
62
|
/** F1 Score */
|
|
50
63
|
f1Score: number;
|
|
64
|
+
/** Severity-weighted F1 — penalizes critical/high FPs more heavily */
|
|
65
|
+
weightedF1Score?: number;
|
|
51
66
|
/** Detection rate: cases detected / total cases */
|
|
52
67
|
detectionRate: number;
|
|
53
68
|
/** Per-category breakdown */
|
|
@@ -102,8 +117,9 @@ export declare function extractValidatedLlmFindings(response: string, prefixes?:
|
|
|
102
117
|
export declare function constructPerJudgePrompt(judge: JudgeDefinition, code: string, language: string, contextSnippets?: string[], amendments?: PromptAmendment[]): string;
|
|
103
118
|
/**
|
|
104
119
|
* Construct the full-tribunal prompt — identical to the MCP-served `full-tribunal` prompt.
|
|
120
|
+
* When `judges` is provided, uses that filtered list instead of all tribunal judges.
|
|
105
121
|
*/
|
|
106
|
-
export declare function constructTribunalPrompt(code: string, language: string, contextSnippets?: string[], amendments?: PromptAmendment[]): string;
|
|
122
|
+
export declare function constructTribunalPrompt(code: string, language: string, contextSnippets?: string[], amendments?: PromptAmendment[], judges?: JudgeDefinition[]): string;
|
|
107
123
|
/**
|
|
108
124
|
* Select a stratified sample of benchmark cases, ensuring representation
|
|
109
125
|
* across categories, difficulties, and both clean/dirty cases.
|
|
@@ -112,8 +128,12 @@ export declare function selectStratifiedSample(cases: BenchmarkCase[], targetSiz
|
|
|
112
128
|
/**
|
|
113
129
|
* Score a single LLM benchmark case using prefix-based matching.
|
|
114
130
|
* Returns a fully populated LlmCaseResult.
|
|
131
|
+
*
|
|
132
|
+
* @param topKPrefixes - If set, only keep the first `topKPrefixes` unique
|
|
133
|
+
* detected prefixes (in the order they appear in the LLM response).
|
|
134
|
+
* This prevents verbose tribunal output from inflating FP counts.
|
|
115
135
|
*/
|
|
116
|
-
export declare function scoreLlmCase(tc: BenchmarkCase, detectedRuleIds: string[], rawResponse: string, tokensUsed?: number): LlmCaseResult;
|
|
136
|
+
export declare function scoreLlmCase(tc: BenchmarkCase, detectedRuleIds: string[], rawResponse: string, tokensUsed?: number, topKPrefixes?: number): LlmCaseResult;
|
|
117
137
|
/**
|
|
118
138
|
* Compute aggregate metrics for an LLM benchmark snapshot from raw case results.
|
|
119
139
|
* Uses the same prefix-based matching methodology as the L1 benchmark.
|
|
@@ -14,7 +14,7 @@
|
|
|
14
14
|
* `scripts/run-llm-benchmark.ts` has been removed.
|
|
15
15
|
*/
|
|
16
16
|
import { JUDGES } from "../judges/index.js";
|
|
17
|
-
import { getCondensedCriteria, SHARED_ADVERSARIAL_MANDATE, PRECISION_MANDATE } from "../tools/prompts.js";
|
|
17
|
+
import { getCondensedCriteria, SHARED_ADVERSARIAL_MANDATE, PRECISION_MANDATE, CLEAN_CODE_GATE, } from "../tools/prompts.js";
|
|
18
18
|
import { extractAndValidateLlmFindings, mergeFindings } from "../probabilistic/llm-response-validator.js";
|
|
19
19
|
import { formatAmendmentSection } from "./llm-benchmark-optimizer.js";
|
|
20
20
|
// โโโ Tribunal Judge Filtering โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
|
@@ -22,6 +22,89 @@ import { formatAmendmentSection } from "./llm-benchmark-optimizer.js";
|
|
|
22
22
|
// near-100% false positives in single-pass tribunal mode and are excluded.
|
|
23
23
|
const TRIBUNAL_EXCLUDED_PREFIXES = new Set(["INTENT", "COH", "MFPR", "FPR", "OVER"]);
|
|
24
24
|
export const TRIBUNAL_JUDGES = JUDGES.filter((j) => !TRIBUNAL_EXCLUDED_PREFIXES.has(j.rulePrefix));
|
|
25
|
+
// โโโ Category โ Acceptable Prefixes Mapping โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
|
26
|
+
// For each benchmark case category, these judge prefixes are domain-relevant
|
|
27
|
+
// and findings from them should NOT count as false positives even when not
|
|
28
|
+
// in expectedRuleIds. This prevents legitimate cross-domain observations
|
|
29
|
+
// from inflating the FP metric.
|
|
30
|
+
// ────────────────────────────────────────────────────────────────────────────
|
|
31
|
+
const CATEGORY_ACCEPTABLE_PREFIXES = {
|
|
32
|
+
injection: ["SEC", "CYBER", "DB", "DATA", "ERR", "FW", "LOGIC"],
|
|
33
|
+
xss: ["SEC", "CYBER", "FW", "DATA", "ERR", "LOGIC"],
|
|
34
|
+
auth: ["AUTH", "SEC", "CYBER", "DATA", "CFG", "ERR", "LOGIC"],
|
|
35
|
+
"rate-limiting": ["RATE", "PERF", "SCALE", "REL", "SEC", "ERR"],
|
|
36
|
+
"error-handling": ["ERR", "REL", "OBS", "LOGIC", "MAINT", "STRUCT"],
|
|
37
|
+
"data-security": ["DATA", "SEC", "CYBER", "LOGPRIV", "SOV", "CFG", "ERR"],
|
|
38
|
+
security: ["SEC", "CYBER", "AUTH", "DATA", "FW", "CFG", "ERR", "LOGIC"],
|
|
39
|
+
concurrency: ["CONC", "PERF", "REL", "LOGIC", "ERR", "MAINT"],
|
|
40
|
+
performance: ["PERF", "SCALE", "CACHE", "DB", "CONC", "LOGIC", "MAINT"],
|
|
41
|
+
database: ["DB", "SEC", "DATA", "PERF", "ERR", "LOGIC"],
|
|
42
|
+
"api-design": ["API", "ERR", "AUTH", "SEC", "STRUCT", "LOGIC", "MAINT"],
|
|
43
|
+
observability: ["OBS", "LOGPRIV", "REL", "ERR", "CFG", "MAINT"],
|
|
44
|
+
reliability: ["REL", "ERR", "CONC", "PERF", "OBS", "LOGIC"],
|
|
45
|
+
scalability: ["SCALE", "PERF", "CACHE", "CLOUD", "CONC", "STRUCT"],
|
|
46
|
+
"cloud-readiness": ["CLOUD", "CFG", "CICD", "SCALE", "PORTA", "SEC"],
|
|
47
|
+
configuration: ["CFG", "SEC", "DATA", "CLOUD", "ERR"],
|
|
48
|
+
maintainability: ["MAINT", "STRUCT", "SWDEV", "DOC", "LOGIC", "ERR"],
|
|
49
|
+
"code-structure": ["STRUCT", "MAINT", "SWDEV", "LOGIC", "ERR"],
|
|
50
|
+
documentation: ["DOC", "MAINT", "SWDEV", "STRUCT"],
|
|
51
|
+
testing: ["TEST", "SWDEV", "LOGIC", "ERR", "MAINT"],
|
|
52
|
+
"cost-effectiveness": ["COST", "CLOUD", "SCALE", "PERF", "IAC"],
|
|
53
|
+
compliance: ["COMP", "DATA", "SOV", "LOGPRIV", "SEC", "CYBER"],
|
|
54
|
+
accessibility: ["A11Y", "UX", "I18N", "STRUCT", "LOGIC"],
|
|
55
|
+
internationalization: ["I18N", "A11Y", "UX", "STRUCT"],
|
|
56
|
+
"dependency-health": ["DEPS", "SEC", "COMPAT", "MAINT"],
|
|
57
|
+
"logging-privacy": ["LOGPRIV", "DATA", "OBS", "SEC", "ERR"],
|
|
58
|
+
"backwards-compatibility": ["COMPAT", "API", "STRUCT", "LOGIC"],
|
|
59
|
+
caching: ["CACHE", "PERF", "SCALE", "REL", "LOGIC"],
|
|
60
|
+
"ethics-bias": ["ETHICS", "DATA", "COMP", "SEC"],
|
|
61
|
+
portability: ["PORTA", "CLOUD", "STRUCT", "CFG"],
|
|
62
|
+
"ci-cd": ["CICD", "SEC", "CFG", "CLOUD", "TEST"],
|
|
63
|
+
"iac-security": ["IAC", "SEC", "CYBER", "CFG", "CLOUD"],
|
|
64
|
+
cloud: ["CLOUD", "IAC", "SEC", "CYBER", "CFG", "SCALE"],
|
|
65
|
+
ethics: ["ETHICS", "A11Y", "UX", "DATA", "COMP"],
|
|
66
|
+
"framework-safety": ["FW", "SEC", "CYBER", "ERR", "LOGIC"],
|
|
67
|
+
"framework-security": ["FW", "SEC", "CYBER", "AUTH", "ERR", "API", "COMP", "OBS", "COMPAT", "CONC", "DOC"],
|
|
68
|
+
"agent-instructions": ["AGENT", "SEC", "CYBER", "AICS", "ERR", "LOGIC"],
|
|
69
|
+
cicd: ["CICD", "SEC", "CFG", "CLOUD", "TEST", "PORTA"],
|
|
70
|
+
ux: ["UX", "ERR", "SEC", "A11Y", "I18N", "LOGIC"],
|
|
71
|
+
"software-practices": ["SWDEV", "MAINT", "STRUCT", "DOC", "LOGIC", "ERR"],
|
|
72
|
+
"software-development": ["SWDEV", "MAINT", "STRUCT", "DOC", "LOGIC", "ERR"],
|
|
73
|
+
"code-quality": ["MAINT", "API", "STRUCT", "SWDEV", "LOGIC", "ERR"],
|
|
74
|
+
"supply-chain": ["DEPS", "SEC", "COMPAT", "MAINT"],
|
|
75
|
+
"ai-security": ["AICS", "SEC", "CYBER", "DATA", "ERR", "LOGIC"],
|
|
76
|
+
clean: [], // Clean code — no acceptable prefixes, all findings are FPs
|
|
77
|
+
};
|
|
78
|
+
/**
|
|
79
|
+
* Get acceptable prefixes for a benchmark case. Uses the case's explicit
|
|
80
|
+
* acceptablePrefixes if defined, otherwise falls back to the category map.
|
|
81
|
+
* Expected prefixes are always included (they're TPs, not FPs).
|
|
82
|
+
*/
|
|
83
|
+
export function getAcceptablePrefixes(tc) {
|
|
84
|
+
const explicit = tc.acceptablePrefixes;
|
|
85
|
+
const fromCategory = CATEGORY_ACCEPTABLE_PREFIXES[tc.category] ?? [];
|
|
86
|
+
const combined = new Set([...tc.expectedRuleIds.map((r) => r.split("-")[0]), ...(explicit ?? fromCategory)]);
|
|
87
|
+
return combined;
|
|
88
|
+
}
|
|
89
|
+
// ─── Core Judges (always included in routed tribunals) ──────────────────────
|
|
90
|
+
// These judges provide universal code quality signals and should always be
|
|
91
|
+
// part of the tribunal regardless of category.
|
|
92
|
+
const CORE_JUDGE_PREFIXES = new Set(["SEC", "ERR", "LOGIC", "STRUCT", "MAINT"]);
|
|
93
|
+
/**
|
|
94
|
+
* Select a focused subset of tribunal judges relevant to a benchmark case's
|
|
95
|
+
* category. Returns core judges + category-specific judges, typically 8-15
|
|
96
|
+
* instead of the full 35. Returns undefined if no routing is possible
|
|
97
|
+
* (unknown category), signalling the caller to use all tribunal judges.
|
|
98
|
+
*/
|
|
99
|
+
export function selectJudgesForCategory(category) {
|
|
100
|
+
const acceptable = CATEGORY_ACCEPTABLE_PREFIXES[category];
|
|
101
|
+
if (!acceptable || acceptable.length === 0)
|
|
102
|
+
return undefined;
|
|
103
|
+
const targetPrefixes = new Set([...CORE_JUDGE_PREFIXES, ...acceptable]);
|
|
104
|
+
const selected = TRIBUNAL_JUDGES.filter((j) => targetPrefixes.has(j.rulePrefix));
|
|
105
|
+
// Only route if we meaningfully reduced the set (at least 40% fewer)
|
|
106
|
+
return selected.length < TRIBUNAL_JUDGES.length * 0.6 ? selected : undefined;
|
|
107
|
+
}
|
|
25
108
|
// ─── Rule ID Parsing ────────────────────────────────────────────────────────
|
|
26
109
|
/**
|
|
27
110
|
* Extract unique rule IDs from LLM response text.
|
|
@@ -41,7 +124,7 @@ export function getTribunalValidPrefixes() {
|
|
|
41
124
|
}
|
|
42
125
|
export function parseLlmRuleIds(response) {
|
|
43
126
|
const validPrefixes = getValidRulePrefixes();
|
|
44
|
-
const pattern = /\b([A-Z]
|
|
127
|
+
const pattern = /\b([A-Z][A-Z0-9]+)-(\d{1,3})\b/g;
|
|
45
128
|
const found = new Set();
|
|
46
129
|
let match;
|
|
47
130
|
while ((match = pattern.exec(response)) !== null) {
|
|
@@ -85,35 +168,55 @@ export function constructPerJudgePrompt(judge, code, language, contextSnippets =
|
|
|
85
168
|
(amendmentSection ? `${amendmentSection}\n` : "") +
|
|
86
169
|
contextSection +
|
|
87
170
|
`${criteria}\n\n` +
|
|
171
|
+
`${CLEAN_CODE_GATE}\n\n` +
|
|
88
172
|
`Please evaluate the following ${language} code:\n\n\`\`\`${language}\n${code}\n\`\`\`` +
|
|
89
173
|
`\n\nProvide your evaluation as structured findings with rule IDs (prefix: ${judge.rulePrefix}-), severity levels (critical/high/medium/low/info), descriptions, and actionable recommendations. If no issues meet the confidence threshold, report zero findings explicitly. End with an overall score (0-100) and verdict (pass/warning/fail).`);
|
|
90
174
|
}
|
|
91
175
|
/**
|
|
92
176
|
* Construct the full-tribunal prompt — identical to the MCP-served `full-tribunal` prompt.
|
|
177
|
+
* When `judges` is provided, uses that filtered list instead of all tribunal judges.
|
|
93
178
|
*/
|
|
94
|
-
export function constructTribunalPrompt(code, language, contextSnippets = [], amendments) {
|
|
95
|
-
const
|
|
179
|
+
export function constructTribunalPrompt(code, language, contextSnippets = [], amendments, judges) {
|
|
180
|
+
const activeJudges = judges ?? TRIBUNAL_JUDGES;
|
|
181
|
+
const judgeInstructions = activeJudges
|
|
182
|
+
.map((j) => `### ${j.name} — ${j.domain}\n**Rule prefix:** \`${j.rulePrefix}-\`\n\n${getCondensedCriteria(j.systemPrompt)}`)
|
|
183
|
+
.join("\n\n---\n\n");
|
|
96
184
|
const contextSection = contextSnippets.length
|
|
97
185
|
? `## Repository Context\n\n${contextSnippets.map((s) => `- ${s.replace(/\n/g, " ")}`).join("\n")}\n\n`
|
|
98
186
|
: "";
|
|
99
187
|
const amendmentSection = formatAmendmentSection(amendments ?? []);
|
|
100
|
-
return (`You are the Judges Panel — a panel of ${
|
|
188
|
+
return (`You are the Judges Panel — a panel of ${activeJudges.length} expert judges who independently evaluate code for quality, security, and operational readiness.\n\n` +
|
|
101
189
|
`## Universal Evaluation Directives\n\n` +
|
|
102
190
|
`${SHARED_ADVERSARIAL_MANDATE}\n\n` +
|
|
103
191
|
`${PRECISION_MANDATE}\n\n` +
|
|
192
|
+
`${CLEAN_CODE_GATE}\n\n` +
|
|
104
193
|
`DOMAIN SCOPE DIRECTIVE (applies to ALL judges):\n` +
|
|
105
194
|
`- Each judge MUST only report findings within their stated domain expertise.\n` +
|
|
106
195
|
`- A CI/CD judge should NOT report authentication findings. An ethics judge should NOT report performance findings.\n` +
|
|
107
196
|
`- If code falls entirely outside your domain (e.g., a YAML CI workflow being evaluated by the Database judge), report ZERO findings for that judge.\n` +
|
|
108
|
-
`- Cross-domain observations should ONLY be reported by the judge whose domain they fall under.\n
|
|
197
|
+
`- Cross-domain observations should ONLY be reported by the judge whose domain they fall under.\n` +
|
|
198
|
+
`- HARD LIMIT: Each judge may report AT MOST 2 findings. If a judge has more than 2 potential findings, keep only the 2 highest-severity, highest-confidence ones and discard the rest.\n\n` +
|
|
109
199
|
(amendmentSection ? `${amendmentSection}\n` : "") +
|
|
110
200
|
contextSection +
|
|
111
201
|
`## Evaluation Instructions\n\n` +
|
|
112
|
-
`Evaluate the following ${language} code from the perspective of ALL ${
|
|
113
|
-
|
|
114
|
-
`
|
|
115
|
-
|
|
116
|
-
`
|
|
202
|
+
`Evaluate the following ${language} code from the perspective of ALL ${activeJudges.length} judges below.\n\n` +
|
|
203
|
+
`### Output Format — Tiered Findings\n` +
|
|
204
|
+
`Organize ALL findings into three tiers:\n\n` +
|
|
205
|
+
`**🔴 MUST FIX** (critical/high severity — blocks merge):\n` +
|
|
206
|
+
`These are real bugs, security vulnerabilities, data loss risks, or correctness issues. ` +
|
|
207
|
+
`Report at most 5 findings here. Each must have concrete code evidence.\n\n` +
|
|
208
|
+
`**🟡 WORTH REVIEWING** (medium severity — warrants discussion):\n` +
|
|
209
|
+
`Design flaws, maintainability concerns, or reliability risks that a senior reviewer would flag. ` +
|
|
210
|
+
`Only include findings with specific code evidence.\n\n` +
|
|
211
|
+
`**🟢 INFORMATIONAL** (low/info severity — optional improvements):\n` +
|
|
212
|
+
`Minor style or optimization suggestions. Limit to the most impactful 3. Omit if none are genuinely useful.\n\n` +
|
|
213
|
+
`For each finding, provide:\n` +
|
|
214
|
+
`1. Rule ID (using the judge's prefix)\n` +
|
|
215
|
+
`2. Severity (critical/high/medium/low/info)\n` +
|
|
216
|
+
`3. Confidence (0-100%): How certain are you this is a real issue? Only include findings ≥80%.\n` +
|
|
217
|
+
`4. Judge name and domain\n` +
|
|
218
|
+
`5. Specific code evidence (line numbers, patterns)\n` +
|
|
219
|
+
`6. Description and recommendation\n\n` +
|
|
117
220
|
`For judges where no issues meet the confidence threshold, report a PASS verdict with zero findings.\n\n` +
|
|
118
221
|
`Then provide an OVERALL TRIBUNAL VERDICT that synthesizes all judges' input.\n\n` +
|
|
119
222
|
`## The Judges\n\n${judgeInstructions}\n\n` +
|
|
@@ -181,8 +284,24 @@ export function selectStratifiedSample(cases, targetSize) {
|
|
|
181
284
|
/**
|
|
182
285
|
* Score a single LLM benchmark case using prefix-based matching.
|
|
183
286
|
* Returns a fully populated LlmCaseResult.
|
|
287
|
+
*
|
|
288
|
+
* @param topKPrefixes - If set, only keep the first `topKPrefixes` unique
|
|
289
|
+
* detected prefixes (in the order they appear in the LLM response).
|
|
290
|
+
* This prevents verbose tribunal output from inflating FP counts.
|
|
184
291
|
*/
|
|
185
|
-
export function scoreLlmCase(tc, detectedRuleIds, rawResponse, tokensUsed) {
|
|
292
|
+
export function scoreLlmCase(tc, detectedRuleIds, rawResponse, tokensUsed, topKPrefixes) {
|
|
293
|
+
// ── Optional top-K prefix cap ─────────────────────────────────────────
|
|
294
|
+
let filteredDetected = detectedRuleIds;
|
|
295
|
+
if (topKPrefixes !== undefined && topKPrefixes > 0) {
|
|
296
|
+
const seenPrefixes = new Set();
|
|
297
|
+
filteredDetected = detectedRuleIds.filter((id) => {
|
|
298
|
+
const prefix = id.split("-")[0];
|
|
299
|
+
if (seenPrefixes.size >= topKPrefixes && !seenPrefixes.has(prefix))
|
|
300
|
+
return false;
|
|
301
|
+
seenPrefixes.add(prefix);
|
|
302
|
+
return true;
|
|
303
|
+
});
|
|
304
|
+
}
|
|
186
305
|
// ── Prefix-level FP deduplication ─────────────────────────────────────
|
|
187
306
|
// TPs are counted per-expected-rule using prefix matching: a single
|
|
188
307
|
// detected CYBER-xxx satisfies all expected CYBER-yyy rules.
|
|
@@ -191,7 +310,7 @@ export function scoreLlmCase(tc, detectedRuleIds, rawResponse, tokensUsed) {
|
|
|
191
310
|
// generates for that prefix. This prevents verbose LLM output from
|
|
192
311
|
// inflating the FP metric (e.g. CYBER-001…005 on clean code = 1 FP,
|
|
193
312
|
// not 5).
|
|
194
|
-
const detectedPrefixes = new Set(
|
|
313
|
+
const detectedPrefixes = new Set(filteredDetected.map((r) => r.split("-")[0]));
|
|
195
314
|
const matchedExpected = tc.expectedRuleIds.filter((expected) => {
|
|
196
315
|
const prefix = expected.split("-")[0];
|
|
197
316
|
return detectedPrefixes.has(prefix);
|
|
@@ -203,19 +322,19 @@ export function scoreLlmCase(tc, detectedRuleIds, rawResponse, tokensUsed) {
|
|
|
203
322
|
// For clean cases (no expected findings), ALL detections are false positives.
|
|
204
323
|
// For dirty cases with unexpectedRuleIds, FPs are detections matching those prefixes.
|
|
205
324
|
// For dirty cases WITHOUT unexpectedRuleIds, FPs are detections whose prefix
|
|
206
|
-
// doesn't match any expected
|
|
325
|
+
// doesn't match any expected or acceptable prefix.
|
|
207
326
|
const isCleanCase = tc.expectedRuleIds.length === 0;
|
|
208
|
-
const
|
|
327
|
+
const acceptablePrefixes = getAcceptablePrefixes(tc);
|
|
209
328
|
const falsePositiveIdsRaw = isCleanCase
|
|
210
|
-
?
|
|
329
|
+
? filteredDetected
|
|
211
330
|
: tc.unexpectedRuleIds
|
|
212
|
-
?
|
|
331
|
+
? filteredDetected.filter((found) => {
|
|
213
332
|
const prefix = found.split("-")[0];
|
|
214
333
|
return tc.unexpectedRuleIds.some((u) => u.split("-")[0] === prefix);
|
|
215
334
|
})
|
|
216
|
-
:
|
|
335
|
+
: filteredDetected.filter((found) => {
|
|
217
336
|
const prefix = found.split("-")[0];
|
|
218
|
-
return !
|
|
337
|
+
return !acceptablePrefixes.has(prefix);
|
|
219
338
|
});
|
|
220
339
|
// Deduplicate FPs by prefix — keep one representative rule ID per prefix
|
|
221
340
|
const fpPrefixSeen = new Set();
|
|
@@ -233,7 +352,7 @@ export function scoreLlmCase(tc, detectedRuleIds, rawResponse, tokensUsed) {
|
|
|
233
352
|
difficulty: tc.difficulty,
|
|
234
353
|
passed: casePassed,
|
|
235
354
|
expectedRuleIds: tc.expectedRuleIds,
|
|
236
|
-
detectedRuleIds,
|
|
355
|
+
detectedRuleIds: filteredDetected,
|
|
237
356
|
missedRuleIds: missedExpected,
|
|
238
357
|
falsePositiveRuleIds: falsePositiveIds,
|
|
239
358
|
rawResponse,
|
|
@@ -289,11 +408,16 @@ export function computeLlmMetrics(rawCases, version, model, provider, promptMode
|
|
|
289
408
|
cat.truePositives += caseTP;
|
|
290
409
|
cat.falseNegatives += caseFN;
|
|
291
410
|
cat.falsePositives += caseFP;
|
|
292
|
-
// Per-judge
|
|
411
|
+
// Per-judge (deduplicate by prefix per case to match case-level FP counting)
|
|
412
|
+
// Use pre-computed falsePositiveRuleIds to stay consistent with scoreLlmCase
|
|
413
|
+
const fpPrefixes = new Set(c.falsePositiveRuleIds.map((r) => r.split("-")[0]));
|
|
293
414
|
const expectedPrefixes = new Set(c.expectedRuleIds.map((r) => r.split("-")[0]));
|
|
294
|
-
const
|
|
415
|
+
const seenPrefixes = new Set();
|
|
295
416
|
for (const ruleId of c.detectedRuleIds) {
|
|
296
417
|
const prefix = ruleId.split("-")[0];
|
|
418
|
+
if (seenPrefixes.has(prefix))
|
|
419
|
+
continue;
|
|
420
|
+
seenPrefixes.add(prefix);
|
|
297
421
|
if (!perJudge[prefix]) {
|
|
298
422
|
perJudge[prefix] = {
|
|
299
423
|
judgeId: prefix,
|
|
@@ -311,15 +435,48 @@ export function computeLlmMetrics(rawCases, version, model, provider, promptMode
|
|
|
311
435
|
if (expectedPrefixes.has(prefix)) {
|
|
312
436
|
jb.truePositives++;
|
|
313
437
|
}
|
|
314
|
-
else if (
|
|
438
|
+
else if (fpPrefixes.has(prefix)) {
|
|
315
439
|
jb.falsePositives++;
|
|
316
440
|
}
|
|
441
|
+
// Acceptable (non-expected, non-FP) detections are silently ignored
|
|
317
442
|
}
|
|
318
443
|
}
|
|
319
444
|
// Compute aggregate metrics
|
|
320
445
|
const precision = totalTP + totalFP > 0 ? totalTP / (totalTP + totalFP) : 1;
|
|
321
446
|
const recall = totalTP + totalFN > 0 ? totalTP / (totalTP + totalFN) : 1;
|
|
322
447
|
const f1Score = precision + recall > 0 ? (2 * precision * recall) / (precision + recall) : 0;
|
|
448
|
+
// Severity-weighted F1 — re-extract findings from raw responses to get
|
|
449
|
+
// severity info, then weight FPs: critical/high=3x, medium=1x, low/info=0.3x
|
|
450
|
+
const SEVERITY_WEIGHTS = {
|
|
451
|
+
critical: 3,
|
|
452
|
+
high: 3,
|
|
453
|
+
medium: 1,
|
|
454
|
+
low: 0.3,
|
|
455
|
+
info: 0.3,
|
|
456
|
+
};
|
|
457
|
+
let weightedFP = 0;
|
|
458
|
+
const tribunalPrefixes = getTribunalValidPrefixes();
|
|
459
|
+
for (const c of rawCases) {
|
|
460
|
+
if (c.falsePositiveRuleIds.length === 0)
|
|
461
|
+
continue;
|
|
462
|
+
const fpSet = new Set(c.falsePositiveRuleIds.map((r) => r.split("-")[0]));
|
|
463
|
+
const validation = extractValidatedLlmFindings(c.rawResponse, tribunalPrefixes);
|
|
464
|
+
// Map finding ruleId prefix → max severity weight
|
|
465
|
+
const prefixMaxWeight = new Map();
|
|
466
|
+
for (const f of validation.findings) {
|
|
467
|
+
const prefix = f.ruleId.split("-")[0];
|
|
468
|
+
if (!fpSet.has(prefix))
|
|
469
|
+
continue;
|
|
470
|
+
const weight = SEVERITY_WEIGHTS[f.severity] ?? 1;
|
|
471
|
+
prefixMaxWeight.set(prefix, Math.max(prefixMaxWeight.get(prefix) ?? 0, weight));
|
|
472
|
+
}
|
|
473
|
+
// Sum weights for FP prefixes (use weight=1 default if severity unknown)
|
|
474
|
+
for (const prefix of fpSet) {
|
|
475
|
+
weightedFP += prefixMaxWeight.get(prefix) ?? 1;
|
|
476
|
+
}
|
|
477
|
+
}
|
|
478
|
+
const weightedPrecision = totalTP + weightedFP > 0 ? totalTP / (totalTP + weightedFP) : 1;
|
|
479
|
+
const weightedF1Score = weightedPrecision + recall > 0 ? (2 * weightedPrecision * recall) / (weightedPrecision + recall) : 0;
|
|
323
480
|
// Compute per-difficulty rates
|
|
324
481
|
for (const d of Object.values(perDifficulty)) {
|
|
325
482
|
d.detectionRate = d.total > 0 ? d.detected / d.total : 0;
|
|
@@ -356,6 +513,7 @@ export function computeLlmMetrics(rawCases, version, model, provider, promptMode
|
|
|
356
513
|
precision,
|
|
357
514
|
recall,
|
|
358
515
|
f1Score,
|
|
516
|
+
weightedF1Score,
|
|
359
517
|
detectionRate: rawCases.length > 0 ? totalDetected / rawCases.length : 0,
|
|
360
518
|
perCategory,
|
|
361
519
|
perJudge,
|
|
@@ -394,6 +552,9 @@ export function formatLlmSnapshotMarkdown(snapshot) {
|
|
|
394
552
|
lines.push(`| Precision | ${pct(snapshot.precision)} |`);
|
|
395
553
|
lines.push(`| Recall | ${pct(snapshot.recall)} |`);
|
|
396
554
|
lines.push(`| F1 Score | ${pct(snapshot.f1Score)} |`);
|
|
555
|
+
if (snapshot.weightedF1Score !== null && snapshot.weightedF1Score !== undefined) {
|
|
556
|
+
lines.push(`| Weighted F1 | ${pct(snapshot.weightedF1Score)} |`);
|
|
557
|
+
}
|
|
397
558
|
lines.push(`| True Positives | ${snapshot.truePositives} |`);
|
|
398
559
|
lines.push(`| False Negatives | ${snapshot.falseNegatives} |`);
|
|
399
560
|
lines.push(`| False Positives | ${snapshot.falsePositives} |`);
|
package/dist/config.js
CHANGED
|
@@ -12,7 +12,7 @@ import { matchGlobPath } from "./tools/command-safety.js";
|
|
|
12
12
|
export function expandEnvPlaceholders(content) {
|
|
13
13
|
if (!content)
|
|
14
14
|
return content;
|
|
15
|
-
return content.replace(/\$\{([^}]
|
|
15
|
+
return content.replace(/\$\{([^}]{1,100})\}/g, (_match, varName) => {
|
|
16
16
|
const envVal = process.env[varName];
|
|
17
17
|
return envVal !== undefined ? envVal : "";
|
|
18
18
|
});
|
|
@@ -29,6 +29,23 @@ RULES FOR YOUR EVALUATION:
|
|
|
29
29
|
- Flag any endpoint that accepts user input without verifying the caller's identity and permissions.
|
|
30
30
|
- Score from 0-100 where 100 means robust auth implementation.
|
|
31
31
|
|
|
32
|
+
CLEAN CODE RECOGNITION (if ALL of the following are true, report ZERO findings):
|
|
33
|
+
- Authentication middleware protects all routes that handle user data or state changes.
|
|
34
|
+
- Passwords are hashed with bcrypt, scrypt, or argon2 — not stored in plaintext or weak hashes.
|
|
35
|
+
- JWTs are verified with explicit algorithm restrictions, expiration, and issuer/audience checks.
|
|
36
|
+
- Sessions use secure, httpOnly, sameSite cookies with proper expiration and rotation.
|
|
37
|
+
- OAuth/OIDC flows use PKCE, validate state parameters, and allowlist redirect URIs.
|
|
38
|
+
- API keys are transmitted in headers (not query params) and scoped to minimum permissions.
|
|
39
|
+
If the code meets these criteria, authentication is implemented correctly. Do NOT manufacture findings.
|
|
40
|
+
|
|
41
|
+
DOMAIN BOUNDARY (defer these to other judges):
|
|
42
|
+
- Injection attacks and XSS exploit paths → defer to CYBER judge.
|
|
43
|
+
- General security posture and cryptographic practices → defer to SEC judge.
|
|
44
|
+
- Rate limiting on login endpoints → defer to RATE judge (unless auth logic itself is broken).
|
|
45
|
+
- Error handling in auth flows → defer to ERR judge.
|
|
46
|
+
- Data privacy in auth tokens/logs → defer to DATA/LOGPRIV judges.
|
|
47
|
+
Only flag issues within YOUR domain: authentication middleware gaps, credential handling, token security, session management, authorization checks, OAuth/OIDC implementation, privilege escalation.
|
|
48
|
+
|
|
32
49
|
FALSE POSITIVE AVOIDANCE:
|
|
33
50
|
- Do NOT flag code that uses established authentication libraries (passport, next-auth, Spring Security, etc.) following their documented patterns.
|
|
34
51
|
- JWT verification with explicit algorithm restrictions and proper expiration checks is correct implementation, not a vulnerability.
|
|
@@ -28,6 +28,24 @@ RULES FOR YOUR EVALUATION:
|
|
|
28
28
|
- Reference OWASP, CWE IDs, and CVE IDs where applicable.
|
|
29
29
|
- Score from 0-100 where 100 means no exploitable vulnerabilities found.
|
|
30
30
|
|
|
31
|
+
CLEAN CODE RECOGNITION (if ALL of the following are true, report ZERO findings):
|
|
32
|
+
- Input validation and sanitization are applied to user-controlled data before use in queries, commands, or output.
|
|
33
|
+
- Passwords are hashed with bcrypt, scrypt, or argon2 — not MD5/SHA1.
|
|
34
|
+
- Database queries use parameterized statements or an ORM with proper escaping.
|
|
35
|
+
- Security middleware is present (helmet, CORS, CSRF tokens) for web applications.
|
|
36
|
+
- Secrets are loaded from environment variables or a secrets manager, not hardcoded.
|
|
37
|
+
- Dependencies are imported from standard registries with version pinning.
|
|
38
|
+
- Error responses do not leak stack traces or internal details to clients.
|
|
39
|
+
If the code meets these criteria, it is implementing security correctly. Do NOT manufacture findings.
|
|
40
|
+
|
|
41
|
+
DOMAIN BOUNDARY (defer these to other judges):
|
|
42
|
+
- Rate limiting, throttling, and abuse prevention → defer to RATE judge.
|
|
43
|
+
- Authentication flows, session management, OAuth/OIDC → defer to AUTH judge.
|
|
44
|
+
- General security posture, defense-in-depth patterns → defer to SEC judge.
|
|
45
|
+
- Error handling completeness and error propagation → defer to ERR judge.
|
|
46
|
+
- Data privacy, PII handling, logging of sensitive data → defer to DATA/LOGPRIV judges.
|
|
47
|
+
Only flag issues within YOUR domain: injection attacks, XSS, CSRF/SSRF, dependency CVEs, cryptographic weaknesses, OWASP Top 10 violations with concrete exploit paths.
|
|
48
|
+
|
|
31
49
|
FALSE POSITIVE AVOIDANCE:
|
|
32
50
|
- Do NOT flag established security library usage (helmet, cors, bcrypt, argon2, parameterized queries) as security issues — these ARE the correct patterns.
|
|
33
51
|
- Code that properly validates input, uses HTTPS, and parameterizes queries is implementing security correctly.
|
|
@@ -29,6 +29,15 @@ RULES FOR YOUR EVALUATION:
|
|
|
29
29
|
- Flag any code path that could throw without a handler in scope.
|
|
30
30
|
- Score from 0-100 where 100 means robust error handling.
|
|
31
31
|
|
|
32
|
+
CLEAN CODE RECOGNITION (if ALL of the following are true, report ZERO findings):
|
|
33
|
+
- Try-catch blocks wrap code paths that can throw, with meaningful handling (log, re-throw, or recover).
|
|
34
|
+
- Async operations use try-catch or .catch() to handle rejections.
|
|
35
|
+
- Error responses return consistent structures with appropriate HTTP status codes.
|
|
36
|
+
- Resources (connections, file handles, streams) are cleaned up in finally blocks or using disposal patterns.
|
|
37
|
+
- Framework error middleware or global handlers are present (Express error middleware, Spring @ExceptionHandler, etc.).
|
|
38
|
+
- Stack traces and internal details are not exposed to end users in error responses.
|
|
39
|
+
If the code meets these criteria, error handling is implemented correctly. Do NOT manufacture findings.
|
|
40
|
+
|
|
32
41
|
FALSE POSITIVE AVOIDANCE:
|
|
33
42
|
- Do NOT flag error handling in code that delegates error handling to a framework (Express middleware, Spring @ExceptionHandler, etc.).
|
|
34
43
|
- Try-catch with logging and re-throw is a valid error handling pattern, not a deficiency.
|
|
@@ -29,6 +29,15 @@ RULES FOR YOUR EVALUATION:
|
|
|
29
29
|
- Consider both inbound (protecting your service) and outbound (respecting others') rate limits.
|
|
30
30
|
- Score from 0-100 where 100 means comprehensive rate limiting.
|
|
31
31
|
|
|
32
|
+
CLEAN CODE RECOGNITION (if ALL of the following are true, report ZERO findings):
|
|
33
|
+
- Rate limiting middleware is applied to public-facing API endpoints (express-rate-limit, API gateway config, etc.).
|
|
34
|
+
- Request body size limits are configured (bodyParser limits, multer limits, etc.).
|
|
35
|
+
- List/query endpoints have pagination with enforced maximum page sizes.
|
|
36
|
+
- External API calls use bounded retries with exponential backoff and jitter.
|
|
37
|
+
- Connection pools and concurrent request limits are bounded.
|
|
38
|
+
If the code meets these criteria, rate limiting is implemented correctly. Do NOT manufacture findings.
|
|
39
|
+
IMPORTANT: CLI tools, data scripts, utility libraries, batch processors, and internal services do NOT need rate limiting. If the code is not a public-facing API or web server, report ZERO findings.
|
|
40
|
+
|
|
32
41
|
FALSE POSITIVE AVOIDANCE:
|
|
33
42
|
- Only flag rate-limiting issues in code that accepts external requests (APIs, WebSocket servers, public endpoints).
|
|
34
43
|
- Do NOT flag internal services, batch processors, CLI tools, or cron jobs for missing rate limiting.
|
package/dist/judges/security.js
CHANGED
|
@@ -29,6 +29,24 @@ RULES FOR YOUR EVALUATION:
|
|
|
29
29
|
- Reference CWE IDs where applicable.
|
|
30
30
|
- Score from 0-100 where 100 means excellent security posture.
|
|
31
31
|
|
|
32
|
+
CLEAN CODE RECOGNITION (if ALL of the following are true, report ZERO findings):
|
|
33
|
+
- Security middleware is configured (helmet, CORS, CSRF protection) for web applications.
|
|
34
|
+
- User input is validated before use in data flows (queries, file ops, HTTP requests).
|
|
35
|
+
- Cryptographic operations use modern algorithms (AES-256, SHA-256+, bcrypt/argon2).
|
|
36
|
+
- Secrets are sourced from environment variables or a secrets manager, not hardcoded.
|
|
37
|
+
- Deserialization of untrusted data uses safe mechanisms (JSON.parse, not pickle/eval).
|
|
38
|
+
- JWT/token verification includes algorithm restrictions and expiration checks.
|
|
39
|
+
- No user-controlled URLs are used in redirects without validation.
|
|
40
|
+
If the code meets these criteria, it has a strong security posture. Do NOT manufacture findings.
|
|
41
|
+
|
|
42
|
+
DOMAIN BOUNDARY (defer these to other judges):
|
|
43
|
+
- Injection attacks (SQL, XSS, command injection) with exploit paths → defer to CYBER judge.
|
|
44
|
+
- Authentication flows, credential storage, session management → defer to AUTH judge.
|
|
45
|
+
- Rate limiting and abuse prevention → defer to RATE judge.
|
|
46
|
+
- Error handling patterns and error propagation → defer to ERR judge.
|
|
47
|
+
- Infrastructure-as-code security → defer to IAC judge.
|
|
48
|
+
Only flag issues within YOUR domain: insecure data flows, weak cryptography, missing security controls, unsafe deserialization, XML security, secret management, mass assignment, redirect validation.
|
|
49
|
+
|
|
32
50
|
FALSE POSITIVE AVOIDANCE:
|
|
33
51
|
- Do NOT flag code that uses established security libraries correctly (helmet, bcrypt, argon2, parameterized queries, CSRF tokens, rate limiters, proper TLS configuration).
|
|
34
52
|
- Do NOT flag security controls in non-application code (CI/CD configs, IaC templates, documentation examples) unless they contain actual secrets or credentials.
|
|
@@ -5,7 +5,7 @@ const SEVERITY_SET = new Set(["critical", "high", "medium", "low", "info"]);
|
|
|
5
5
|
* Attempt to parse a JSON payload embedded in LLM output. Supports fenced code blocks and raw JSON.
|
|
6
6
|
*/
|
|
7
7
|
function parseJsonBlock(text) {
|
|
8
|
-
const fenceMatch = text.match(/```(?:json)
|
|
8
|
+
const fenceMatch = text.match(/```(?:json)?[ \t]*\n([\s\S]*?)\n[ \t]*```/i) ?? text.match(/```(?:json)?[ \t]*([\s\S]*?)```/i);
|
|
9
9
|
if (fenceMatch) {
|
|
10
10
|
try {
|
|
11
11
|
return JSON.parse(fenceMatch[1]);
|
|
@@ -25,7 +25,7 @@ function normalizeRuleId(id) {
|
|
|
25
25
|
return id.trim().toUpperCase();
|
|
26
26
|
}
|
|
27
27
|
function isValidRuleId(id, validPrefixes) {
|
|
28
|
-
const match = id.match(/^([A-Z]
|
|
28
|
+
const match = id.match(/^([A-Z][A-Z0-9]+)-\d{1,3}$/);
|
|
29
29
|
if (!match)
|
|
30
30
|
return false;
|
|
31
31
|
return validPrefixes.has(match[1]);
|
|
@@ -215,7 +215,15 @@ function countBySeverity(findings) {
|
|
|
215
215
|
function compileExcludeRegexes(patterns) {
|
|
216
216
|
if (!patterns || patterns.length === 0)
|
|
217
217
|
return [];
|
|
218
|
-
return patterns.map((pattern) =>
|
|
218
|
+
return patterns.map((pattern) => {
|
|
219
|
+
try {
|
|
220
|
+
return new RegExp(pattern, "i");
|
|
221
|
+
}
|
|
222
|
+
catch {
|
|
223
|
+
// Invalid regex from user input — treat as literal string match
|
|
224
|
+
return new RegExp(pattern.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"), "i");
|
|
225
|
+
}
|
|
226
|
+
});
|
|
219
227
|
}
|
|
220
228
|
function isLikelyNonProductionPath(path) {
|
|
221
229
|
return /(^|\/)(test|tests|__tests__|spec|specs|e2e)(\/|\.|$)|\.(?:test|tests|spec|specs|e2e)\.[^/]+$|mock|fixture|fixtures|(^|\/)docs(-|\/)i18n(\/|$)|(^|\/)docs(\/|$)/i.test(path);
|
package/dist/skill-loader.js
CHANGED
|
@@ -25,7 +25,7 @@ export function parseSkillFrontmatter(raw) {
|
|
|
25
25
|
i++;
|
|
26
26
|
continue;
|
|
27
27
|
}
|
|
28
|
-
const kv = line.match(/^([a-zA-Z_][a-zA-Z0-9_-]*)\
|
|
28
|
+
const kv = line.match(/^([a-zA-Z_][a-zA-Z0-9_-]*)[ \t]*:[ \t]*(.*)$/);
|
|
29
29
|
if (!kv) {
|
|
30
30
|
i++;
|
|
31
31
|
continue;
|
|
@@ -64,9 +64,10 @@ export function parseSkillFrontmatter(raw) {
|
|
|
64
64
|
if (typeof value === "string" && ((value.startsWith("[") && value.endsWith("]")) || value.includes(","))) {
|
|
65
65
|
// simple array parsing: split on comma
|
|
66
66
|
const normalized = value
|
|
67
|
-
.replace(
|
|
68
|
-
.replace(/\]\
|
|
69
|
-
.split(
|
|
67
|
+
.replace(/^[ \t]*\[/, "")
|
|
68
|
+
.replace(/\][ \t]*$/, "")
|
|
69
|
+
.split(",")
|
|
70
|
+
.map((s) => s.trim())
|
|
70
71
|
.filter(Boolean);
|
|
71
72
|
value = normalized;
|
|
72
73
|
}
|
|
@@ -93,13 +94,15 @@ export function validateSkillFrontmatter(meta, sourcePath) {
|
|
|
93
94
|
agents: Array.isArray(meta.agents)
|
|
94
95
|
? meta.agents
|
|
95
96
|
: String(meta.agents ?? "")
|
|
96
|
-
.split(
|
|
97
|
+
.split(",")
|
|
98
|
+
.map((s) => s.trim())
|
|
97
99
|
.filter(Boolean),
|
|
98
100
|
tags: Array.isArray(meta.tags)
|
|
99
101
|
? meta.tags
|
|
100
102
|
: meta.tags
|
|
101
103
|
? String(meta.tags)
|
|
102
|
-
.split(
|
|
104
|
+
.split(",")
|
|
105
|
+
.map((s) => s.trim())
|
|
103
106
|
.filter(Boolean)
|
|
104
107
|
: undefined,
|
|
105
108
|
priority: meta.priority ? Number(meta.priority) : 10,
|
package/dist/tools/prompts.d.ts
CHANGED
|
@@ -2,7 +2,9 @@ import type { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
|
|
|
2
2
|
/** Adversarial evaluation stance — shared across all judges. */
|
|
3
3
|
export declare const SHARED_ADVERSARIAL_MANDATE = "ADVERSARIAL MANDATE (applies to ALL judges):\n- Examine the code critically and look for genuine issues. Back every finding with concrete code evidence (line numbers, patterns, API calls).\n- Report only real problems, risks, and deficiencies that exist in the actual code.\n- If you are uncertain whether something is an issue, flag it only when you can cite specific code evidence (line numbers, patterns, API calls). Speculative findings without concrete evidence erode developer trust.\n- If no concrete issues are found after thorough analysis, report ZERO findings. An empty findings list is the correct output for well-written code.";
|
|
4
4
|
/** Precision override — ensures evidence-based findings. */
|
|
5
|
-
export declare const PRECISION_MANDATE = "PRECISION MANDATE (this section OVERRIDES the adversarial mandate whenever they conflict):\n- Every finding MUST cite specific code evidence: exact line numbers, API calls, variable names, or patterns. Findings without concrete evidence MUST be discarded \u2014 no exceptions.\n- Do NOT flag the absence of a feature or pattern unless you can identify the specific code location where it SHOULD have been implemented and explain WHY it is required for THIS code.\n- Speculative, hypothetical, or \"just in case\" findings erode developer trust. Only flag issues you are confident exist in the actual code.\n- Prefer fewer, high-confidence findings over many uncertain ones. Quality of findings matters more than quantity.\n- If the code is genuinely well-written with no real issues, reporting ZERO findings is the correct and expected behavior. Do not manufacture findings to avoid an empty report.\n- Clean, well-structured code exists. Acknowledge it by not forcing false issues.\n- RECOGNIZE SECURE PATTERNS: Code using established security libraries and patterns (e.g. helmet, bcrypt/argon2, parameterized queries, input validation, CSRF tokens, rate limiters, proper TLS) is correctly implementing security. Do NOT flag these as insufficient or suggest alternatives unless a concrete vulnerability exists.\n- SCOPE LIMITATION: Only evaluate code that is actually present. Do NOT flag missing features, tests, logging, documentation, error handling, or infrastructure that may exist in other files. Evaluate what IS provided, not what COULD be elsewhere.\n- CONFIDENCE THRESHOLD: Only report findings where you are highly confident (\u226580%) that a real, exploitable issue or concrete deficiency exists in the provided code. When in doubt, do NOT report.\n- FALSE POSITIVE COST: A false positive is MORE harmful than a missed finding. False positives erode developer trust and cause real issues to be ignored. 
When uncertain, silence is better than a questionable finding.";
|
|
5
|
+
export declare const PRECISION_MANDATE = "PRECISION MANDATE (this section OVERRIDES the adversarial mandate whenever they conflict):\n- Every finding MUST cite specific code evidence: exact line numbers, API calls, variable names, or patterns. Findings without concrete evidence MUST be discarded \u2014 no exceptions.\n- Do NOT flag the absence of a feature or pattern unless you can identify the specific code location where it SHOULD have been implemented and explain WHY it is required for THIS code.\n- Speculative, hypothetical, or \"just in case\" findings erode developer trust. Only flag issues you are confident exist in the actual code.\n- Prefer fewer, high-confidence findings over many uncertain ones. Quality of findings matters more than quantity.\n- If the code is genuinely well-written with no real issues, reporting ZERO findings is the correct and expected behavior. Do not manufacture findings to avoid an empty report.\n- Clean, well-structured code exists. Acknowledge it by not forcing false issues.\n- RECOGNIZE SECURE PATTERNS: Code using established security libraries and patterns (e.g. helmet, bcrypt/argon2, parameterized queries, input validation, CSRF tokens, rate limiters, proper TLS) is correctly implementing security. Do NOT flag these as insufficient or suggest alternatives unless a concrete vulnerability exists.\n- SCOPE LIMITATION: Only evaluate code that is actually present. Do NOT flag missing features, tests, logging, documentation, error handling, or infrastructure that may exist in other files. Evaluate what IS provided, not what COULD be elsewhere.\n- CONFIDENCE THRESHOLD: Only report findings where you are highly confident (\u226580%) that a real, exploitable issue or concrete deficiency exists in the provided code. When in doubt, do NOT report.\n- FALSE POSITIVE COST: A false positive is MORE harmful than a missed finding. False positives erode developer trust and cause real issues to be ignored. 
When uncertain, silence is better than a questionable finding.\n\nCOMMON FALSE POSITIVE PATTERNS (do NOT report these):\n- ERR: Do not flag error handling as inadequate when try/catch blocks, validation, or error middleware are present. Missing error handling in a utility function that is clearly called within a guarded context is NOT a finding.\n- LOGIC: Do not flag logic issues for standard patterns (early returns, guard clauses, switch/case with default). Only flag logic errors when you can demonstrate a concrete input that produces an incorrect output.\n- MAINT: Do not flag maintainability concerns for code that follows the language's established idioms. Complexity or length alone is NOT a finding unless it introduces a concrete maintenance burden.\n- SEC: Do not flag security issues when established security libraries (helmet, cors, bcrypt, parameterized queries) are correctly used. \"Could be stronger\" is NOT a vulnerability.\n- STRUCT: Do not flag code structure preferences (file organization, naming conventions) unless they create a concrete deficiency like circular dependencies or unreachable code.";
|
|
6
|
+
/** Clean code gate — explicit instructions when code quality is high. */
|
|
7
|
+
export declare const CLEAN_CODE_GATE = "CLEAN CODE GATE (applies AFTER individual judge evaluation):\n- Before reporting findings, assess the OVERALL quality of the code. If the code follows established conventions, uses appropriate patterns, handles errors, and has no concrete vulnerabilities or deficiencies, the expected output is ZERO findings across ALL judges.\n- Do NOT report stylistic preferences, alternative approaches, or \"nice to have\" improvements as findings. These are opinions, not defects.\n- Do NOT report findings about missing functionality that is likely in other files (tests, configs, middleware, error handlers, logging setup).\n- Do NOT report theoretical risks that require assumptions about the runtime environment, deployment configuration, or code outside the provided snippet.\n- SELF-CHECK before finalizing: For each finding, ask \"Would a senior engineer reviewing this code in a PR agree this must be fixed before merging?\" If the answer is not a clear YES, discard the finding.\n- The goal is to match what a thoughtful, experienced human reviewer would flag \u2014 not to demonstrate comprehensive knowledge of every possible concern.";
|
|
6
8
|
/**
|
|
7
9
|
* Extract only the unique evaluation criteria from a judge's systemPrompt,
|
|
8
10
|
* stripping the persona introduction line, the ADVERSARIAL MANDATE block,
|
|
@@ -21,6 +23,5 @@ export declare function getCondensedCriteria(systemPrompt: string): string;
|
|
|
21
23
|
/**
|
|
22
24
|
* Register all MCP prompts on the given server:
|
|
23
25
|
* - One per-judge prompt (`judge-{id}`) for single-persona deep reviews
|
|
24
|
-
* - A `full-tribunal` prompt that convenes all judges at once
|
|
25
26
|
*/
|
|
26
27
|
export declare function registerPrompts(server: McpServer): void;
|
package/dist/tools/prompts.js
CHANGED
|
@@ -2,18 +2,15 @@
|
|
|
2
2
|
// Expose judge system prompts as MCP prompts so LLM-based clients can use
|
|
3
3
|
// them for deeper, AI-powered analysis beyond pattern matching.
|
|
4
4
|
//
|
|
5
|
-
//
|
|
6
|
-
// precision mandate
|
|
7
|
-
//
|
|
8
|
-
// the unique evaluation criteria, domain-specific rules, and FP-avoidance
|
|
9
|
-
// guidance. This reduces the tribunal prompt by ~40 000 chars (~10 000
|
|
10
|
-
// tokens) without removing any evaluation criteria.
|
|
5
|
+
// Each per-judge prompt includes shared behavioural directives (adversarial
|
|
6
|
+
// mandate, precision mandate, clean-code gate) plus the judge's unique
|
|
7
|
+
// evaluation criteria, domain-specific rules, and FP-avoidance guidance.
|
|
11
8
|
// โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
|
12
9
|
import { z } from "zod";
|
|
13
10
|
import { JUDGES } from "../judges/index.js";
|
|
14
|
-
// โโโ Shared Behavioural Directives
|
|
15
|
-
//
|
|
16
|
-
//
|
|
11
|
+
// โโโ Shared Behavioural Directives & Gates โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
|
12
|
+
// Included in every per-judge prompt to ensure consistent evaluation
|
|
13
|
+
// behaviour across all judges.
|
|
17
14
|
// โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
|
18
15
|
/** Adversarial evaluation stance — shared across all judges. */
|
|
19
16
|
export const SHARED_ADVERSARIAL_MANDATE = `ADVERSARIAL MANDATE (applies to ALL judges):
|
|
@@ -32,7 +29,22 @@ export const PRECISION_MANDATE = `PRECISION MANDATE (this section OVERRIDES the
|
|
|
32
29
|
- RECOGNIZE SECURE PATTERNS: Code using established security libraries and patterns (e.g. helmet, bcrypt/argon2, parameterized queries, input validation, CSRF tokens, rate limiters, proper TLS) is correctly implementing security. Do NOT flag these as insufficient or suggest alternatives unless a concrete vulnerability exists.
|
|
33
30
|
- SCOPE LIMITATION: Only evaluate code that is actually present. Do NOT flag missing features, tests, logging, documentation, error handling, or infrastructure that may exist in other files. Evaluate what IS provided, not what COULD be elsewhere.
|
|
34
31
|
- CONFIDENCE THRESHOLD: Only report findings where you are highly confident (≥80%) that a real, exploitable issue or concrete deficiency exists in the provided code. When in doubt, do NOT report.
|
|
35
|
-
- FALSE POSITIVE COST: A false positive is MORE harmful than a missed finding. False positives erode developer trust and cause real issues to be ignored. When uncertain, silence is better than a questionable finding
|
|
32
|
+
- FALSE POSITIVE COST: A false positive is MORE harmful than a missed finding. False positives erode developer trust and cause real issues to be ignored. When uncertain, silence is better than a questionable finding.
|
|
33
|
+
|
|
34
|
+
COMMON FALSE POSITIVE PATTERNS (do NOT report these):
|
|
35
|
+
- ERR: Do not flag error handling as inadequate when try/catch blocks, validation, or error middleware are present. Missing error handling in a utility function that is clearly called within a guarded context is NOT a finding.
|
|
36
|
+
- LOGIC: Do not flag logic issues for standard patterns (early returns, guard clauses, switch/case with default). Only flag logic errors when you can demonstrate a concrete input that produces an incorrect output.
|
|
37
|
+
- MAINT: Do not flag maintainability concerns for code that follows the language's established idioms. Complexity or length alone is NOT a finding unless it introduces a concrete maintenance burden.
|
|
38
|
+
- SEC: Do not flag security issues when established security libraries (helmet, cors, bcrypt, parameterized queries) are correctly used. "Could be stronger" is NOT a vulnerability.
|
|
39
|
+
- STRUCT: Do not flag code structure preferences (file organization, naming conventions) unless they create a concrete deficiency like circular dependencies or unreachable code.`;
|
|
40
|
+
/** Clean code gate — explicit instructions when code quality is high. */
|
|
41
|
+
export const CLEAN_CODE_GATE = `CLEAN CODE GATE (applies AFTER individual judge evaluation):
|
|
42
|
+
- Before reporting findings, assess the OVERALL quality of the code. If the code follows established conventions, uses appropriate patterns, handles errors, and has no concrete vulnerabilities or deficiencies, the expected output is ZERO findings across ALL judges.
|
|
43
|
+
- Do NOT report stylistic preferences, alternative approaches, or "nice to have" improvements as findings. These are opinions, not defects.
|
|
44
|
+
- Do NOT report findings about missing functionality that is likely in other files (tests, configs, middleware, error handlers, logging setup).
|
|
45
|
+
- Do NOT report theoretical risks that require assumptions about the runtime environment, deployment configuration, or code outside the provided snippet.
|
|
46
|
+
- SELF-CHECK before finalizing: For each finding, ask "Would a senior engineer reviewing this code in a PR agree this must be fixed before merging?" If the answer is not a clear YES, discard the finding.
|
|
47
|
+
- The goal is to match what a thoughtful, experienced human reviewer would flag — not to demonstrate comprehensive knowledge of every possible concern.`;
|
|
36
48
|
// โโโ Criteria Extraction โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
|
37
49
|
/**
|
|
38
50
|
* Extract only the unique evaluation criteria from a judge's systemPrompt,
|
|
@@ -73,13 +85,11 @@ export function getCondensedCriteria(systemPrompt) {
|
|
|
73
85
|
/**
|
|
74
86
|
* Register all MCP prompts on the given server:
|
|
75
87
|
* - One per-judge prompt (`judge-{id}`) for single-persona deep reviews
|
|
76
|
-
* - A `full-tribunal` prompt that convenes all judges at once
|
|
77
88
|
*/
|
|
78
89
|
export function registerPrompts(server) {
|
|
79
90
|
// โโ Per-judge prompts โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
|
80
|
-
// Each prompt uses condensed criteria
|
|
81
|
-
//
|
|
82
|
-
// and better precision on clean code.
|
|
91
|
+
// Each prompt uses condensed criteria plus the shared mandates for
|
|
92
|
+
// better precision on clean code.
|
|
83
93
|
for (const judge of JUDGES) {
|
|
84
94
|
server.prompt(`judge-${judge.id}`, `Use the ${judge.name} persona to perform a deep ${judge.domain} review of code. This prompt provides the judge's expert criteria for LLM-powered analysis that goes beyond pattern matching.`, {
|
|
85
95
|
code: z.string().describe("The source code to evaluate"),
|
|
@@ -92,6 +102,7 @@ export function registerPrompts(server) {
|
|
|
92
102
|
`${SHARED_ADVERSARIAL_MANDATE}\n\n` +
|
|
93
103
|
`${PRECISION_MANDATE}\n\n` +
|
|
94
104
|
`${criteria}\n\n` +
|
|
105
|
+
`${CLEAN_CODE_GATE}\n\n` +
|
|
95
106
|
`Please evaluate the following ${language} code:\n\n\`\`\`${language}\n${code}\n\`\`\`` +
|
|
96
107
|
(context ? `\n\nAdditional context: ${context}` : "") +
|
|
97
108
|
`\n\nProvide your evaluation as structured findings with rule IDs (prefix: ${judge.rulePrefix}-), severity levels (critical/high/medium/low/info), descriptions, and actionable recommendations. If no issues meet the confidence threshold, report zero findings explicitly. End with an overall score (0-100) and verdict (pass/warning/fail).`;
|
|
@@ -108,41 +119,4 @@ export function registerPrompts(server) {
|
|
|
108
119
|
};
|
|
109
120
|
});
|
|
110
121
|
}
|
|
111
|
-
// โโ Full tribunal prompt (token-optimised) โโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
|
112
|
-
// Shared directives (adversarial mandate, precision mandate) are stated
|
|
113
|
-
// ONCE in the preamble. Each judge section includes only its unique
|
|
114
|
-
// evaluation criteria, domain-specific rules, and FP-avoidance guidance.
|
|
115
|
-
server.prompt("full-tribunal", `Convene the full Judges Panel โ all ${JUDGES.length} judges evaluate the code in their respective domains and produce a combined verdict.`, {
|
|
116
|
-
code: z.string().describe("The source code to evaluate"),
|
|
117
|
-
language: z.string().describe("The programming language"),
|
|
118
|
-
context: z.string().optional().describe("Additional context about the code"),
|
|
119
|
-
}, async ({ code, language, context }) => {
|
|
120
|
-
const judgeInstructions = JUDGES.map((j) => `### ${j.name} โ ${j.domain}\n**Rule prefix:** \`${j.rulePrefix}-\`\n\n${getCondensedCriteria(j.systemPrompt)}`).join("\n\n---\n\n");
|
|
121
|
-
const userMessage = `You are the Judges Panel โ a panel of ${JUDGES.length} expert judges who independently evaluate code for quality, security, and operational readiness.\n\n` +
|
|
122
|
-
`## Universal Evaluation Directives\n\n` +
|
|
123
|
-
`${SHARED_ADVERSARIAL_MANDATE}\n\n` +
|
|
124
|
-
`${PRECISION_MANDATE}\n\n` +
|
|
125
|
-
`## Evaluation Instructions\n\n` +
|
|
126
|
-
`Evaluate the following ${language} code from the perspective of ALL ${JUDGES.length} judges below. For each judge, provide:\n` +
|
|
127
|
-
`1. Judge name and domain\n` +
|
|
128
|
-
`2. Verdict (PASS / WARNING / FAIL)\n` +
|
|
129
|
-
`3. Score (0-100)\n` +
|
|
130
|
-
`4. Specific findings with rule IDs (using each judge's rule prefix), severity, and recommendations\n\n` +
|
|
131
|
-
`For judges where no issues meet the confidence threshold, report a PASS verdict with zero findings.\n\n` +
|
|
132
|
-
`Then provide an OVERALL TRIBUNAL VERDICT that synthesizes all judges' input.\n\n` +
|
|
133
|
-
`## The Judges\n\n${judgeInstructions}\n\n` +
|
|
134
|
-
`## Code to Evaluate\n\n\`\`\`${language}\n${code}\n\`\`\`` +
|
|
135
|
-
(context ? `\n\n## Additional Context\n${context}` : "");
|
|
136
|
-
return {
|
|
137
|
-
messages: [
|
|
138
|
-
{
|
|
139
|
-
role: "user",
|
|
140
|
-
content: {
|
|
141
|
-
type: "text",
|
|
142
|
-
text: userMessage,
|
|
143
|
-
},
|
|
144
|
-
},
|
|
145
|
-
],
|
|
146
|
-
};
|
|
147
|
-
});
|
|
148
122
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@kevinrabun/judges",
|
|
3
|
-
"version": "3.
|
|
3
|
+
"version": "3.123.0",
|
|
4
4
|
"description": "45 specialized judges that evaluate AI-generated code for security, cost, and quality.",
|
|
5
5
|
"mcpName": "io.github.KevinRabun/judges",
|
|
6
6
|
"type": "module",
|
|
@@ -145,7 +145,7 @@
|
|
|
145
145
|
"zod": "^4.3.6"
|
|
146
146
|
},
|
|
147
147
|
"devDependencies": {
|
|
148
|
-
"@anthropic-ai/sdk": "^0.
|
|
148
|
+
"@anthropic-ai/sdk": "^0.80.0",
|
|
149
149
|
"@eslint/js": "^10.0.1",
|
|
150
150
|
"@types/node": "^25.3.0",
|
|
151
151
|
"@typescript-eslint/eslint-plugin": "^8.56.1",
|
package/server.json
CHANGED
|
@@ -3,16 +3,25 @@
|
|
|
3
3
|
"name": "io.github.KevinRabun/judges",
|
|
4
4
|
"title": "Judges Panel",
|
|
5
5
|
"description": "45 judges that evaluate AI-generated code for security, cost, and quality with built-in AST.",
|
|
6
|
+
"websiteUrl": "https://kevinrabun.github.io/judges/",
|
|
6
7
|
"repository": {
|
|
7
|
-
"url": "https://github.com/
|
|
8
|
-
"source": "github"
|
|
8
|
+
"url": "https://github.com/KevinRabun/judges",
|
|
9
|
+
"source": "github",
|
|
10
|
+
"id": "1161966307"
|
|
9
11
|
},
|
|
10
|
-
"
|
|
12
|
+
"icons": [
|
|
13
|
+
{
|
|
14
|
+
"src": "https://raw.githubusercontent.com/KevinRabun/judges/main/vscode-extension/icon.png",
|
|
15
|
+
"sizes": ["128x128"],
|
|
16
|
+
"mimeType": "image/png"
|
|
17
|
+
}
|
|
18
|
+
],
|
|
19
|
+
"version": "3.123.0",
|
|
11
20
|
"packages": [
|
|
12
21
|
{
|
|
13
22
|
"registryType": "npm",
|
|
14
23
|
"identifier": "@kevinrabun/judges",
|
|
15
|
-
"version": "3.
|
|
24
|
+
"version": "3.123.0",
|
|
16
25
|
"transport": {
|
|
17
26
|
"type": "stdio"
|
|
18
27
|
}
|
package/src/skill-loader.ts
CHANGED
|
@@ -44,7 +44,7 @@ export function parseSkillFrontmatter(raw: string): { meta: SkillMeta; body: str
|
|
|
44
44
|
i++;
|
|
45
45
|
continue;
|
|
46
46
|
}
|
|
47
|
-
const kv = line.match(/^([a-zA-Z_][a-zA-Z0-9_-]*)\
|
|
47
|
+
const kv = line.match(/^([a-zA-Z_][a-zA-Z0-9_-]*)[ \t]*:[ \t]*(.*)$/);
|
|
48
48
|
if (!kv) {
|
|
49
49
|
i++;
|
|
50
50
|
continue;
|
|
@@ -85,9 +85,10 @@ export function parseSkillFrontmatter(raw: string): { meta: SkillMeta; body: str
|
|
|
85
85
|
if (typeof value === "string" && ((value.startsWith("[") && value.endsWith("]")) || value.includes(","))) {
|
|
86
86
|
// simple array parsing: split on comma
|
|
87
87
|
const normalized = (value as string)
|
|
88
|
-
.replace(
|
|
89
|
-
.replace(/\]\
|
|
90
|
-
.split(
|
|
88
|
+
.replace(/^[ \t]*\[/, "")
|
|
89
|
+
.replace(/\][ \t]*$/, "")
|
|
90
|
+
.split(",")
|
|
91
|
+
.map((s) => s.trim())
|
|
91
92
|
.filter(Boolean);
|
|
92
93
|
value = normalized;
|
|
93
94
|
} else if (
|
|
@@ -117,13 +118,15 @@ export function validateSkillFrontmatter(meta: SkillMeta, sourcePath: string): S
|
|
|
117
118
|
agents: Array.isArray(meta.agents)
|
|
118
119
|
? (meta.agents as string[])
|
|
119
120
|
: String(meta.agents ?? "")
|
|
120
|
-
.split(
|
|
121
|
+
.split(",")
|
|
122
|
+
.map((s) => s.trim())
|
|
121
123
|
.filter(Boolean),
|
|
122
124
|
tags: Array.isArray(meta.tags)
|
|
123
125
|
? (meta.tags as string[])
|
|
124
126
|
: meta.tags
|
|
125
127
|
? String(meta.tags)
|
|
126
|
-
.split(
|
|
128
|
+
.split(",")
|
|
129
|
+
.map((s) => s.trim())
|
|
127
130
|
.filter(Boolean)
|
|
128
131
|
: undefined,
|
|
129
132
|
priority: meta.priority ? Number(meta.priority) : 10,
|