@kevinrabun/judges 3.121.0 → 3.123.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -15,7 +15,7 @@ An MCP (Model Context Protocol) server that provides a panel of **45 specialized
15
15
  [![npm](https://img.shields.io/npm/v/@kevinrabun/judges)](https://www.npmjs.com/package/@kevinrabun/judges)
16
16
  [![npm downloads](https://img.shields.io/npm/dw/@kevinrabun/judges)](https://www.npmjs.com/package/@kevinrabun/judges)
17
17
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
18
- [![Tests](https://img.shields.io/badge/tests-2481-brightgreen)](https://github.com/KevinRabun/judges/actions)
18
+ [![Tests](https://img.shields.io/badge/tests-2482-brightgreen)](https://github.com/KevinRabun/judges/actions)
19
19
 
20
20
  > 🔰 **Packages**
21
21
  > - **CLI**: `@kevinrabun/judges-cli` → binary `judges` (use `npx @kevinrabun/judges-cli eval --file app.ts`).
@@ -731,6 +731,8 @@ Use `--preset` to apply pre-configured evaluation settings:
731
731
  | `healthtech` | Healthcare — HIPAA compliance, data sovereignty, encryption, audit trails |
732
732
  | `saas` | Multi-tenant SaaS — tenant isolation, rate limiting, scalability |
733
733
  | `government` | Government/public sector — compliance, sovereignty, authentication |
734
+ | `open-source` | Open-source projects — documentation, backwards compatibility, security, dependency health |
735
+ | `ai-review` | AI-generated code review — hallucination detection, security, authentication, correctness |
734
736
 
735
737
  ```bash
736
738
  judges eval --preset security-only src/api.ts
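# The two presets added in this release use the same invocation pattern (paths are illustrative):
judges eval --preset open-source src/
judges eval --preset ai-review src/generated-feature.ts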
@@ -833,7 +835,7 @@ The tribunal operates in three layers:
833
835
 
834
836
  2. **AST-Based Structural Analysis** — The Code Structure judge (`STRUCT-*` rules) uses real Abstract Syntax Tree parsing to measure cyclomatic complexity, nesting depth, function length, parameter count, dead code, and type safety with precision that regex cannot achieve. All supported languages — **TypeScript, JavaScript, Python, Rust, Go, Java, C#, and C++** — are parsed via **tree-sitter WASM grammars** (real syntax trees compiled to WebAssembly, in-process, zero native dependencies). A scope-tracking structural parser is kept as a fallback when WASM grammars are unavailable. No external AST server required.
835
837
 
836
- 3. **LLM-Powered Deep Analysis (Prompts)** — The server exposes MCP prompts (e.g., `judge-data-security`, `full-tribunal`) that provide each judge's expert persona as a system prompt. When used by an LLM-based client (Copilot, Claude, Cursor, etc.), the host LLM performs deeper, context-aware probabilistic analysis beyond what static patterns can detect. This is where the `systemPrompt` on each judge comes alive — Judges itself makes no LLM calls, but it provides the expert criteria so your AI assistant can act as 45 specialized reviewers.
838
+ 3. **LLM-Powered Deep Analysis (Prompts)** — The server exposes MCP prompts (e.g., `judge-data-security`, `judge-cybersecurity`) that provide each judge's expert persona as a system prompt. When used by an LLM-based client (Copilot, Claude, Cursor, etc.), the host LLM performs deeper, context-aware probabilistic analysis beyond what static patterns can detect. This is where the `systemPrompt` on each judge comes alive — Judges itself makes no LLM calls, but it provides the expert criteria so your AI assistant can act as 45 specialized reviewers.
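As a sketch of how layer 3 is consumed: an MCP client lists the server's prompts and fetches one with a standard `prompts/get` request, then runs the returned judge persona as its system prompt. The argument names below are illustrative assumptions, not taken from this package:

```json
{
  "jsonrpc": "2.0",
  "id": 7,
  "method": "prompts/get",
  "params": {
    "name": "judge-data-security",
    "arguments": { "code": "export const token = 'abc123';", "language": "typescript" }
  }
}
```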
837
839
 
838
840
  ---
839
841
 
@@ -877,7 +879,7 @@ When your AI coding assistant connects to multiple MCP servers, each one contrib
877
879
  │ Judges │ │ CVE / │ │ Linter │
878
880
  │ Panel │ │ SBOM │ │ Server │
879
881
  │ ─────────────│ └────────┘ └────────┘
880
- │ 36 Heuristic │ Vuln DB Style &
882
+ │ 44 Heuristic │ Vuln DB Style &
881
883
  │ judges │ scanning correctness
882
884
  │ + AST judge │
883
885
  └──────────────┘
@@ -1130,7 +1132,7 @@ Re-run the tribunal with **prior findings as context** for iterative refinement.
1130
1132
 
1131
1133
  #### Judge IDs
1132
1134
 
1133
- `data-security` · `cybersecurity` · `cost-effectiveness` · `scalability` · `cloud-readiness` · `software-practices` · `accessibility` · `api-design` · `reliability` · `observability` · `performance` · `compliance` · `data-sovereignty` · `testing` · `documentation` · `internationalization` · `dependency-health` · `concurrency` · `ethics-bias` · `maintainability` · `error-handling` · `authentication` · `database` · `caching` · `configuration-management` · `backwards-compatibility` · `portability` · `ux` · `logging-privacy` · `rate-limiting` · `ci-cd` · `code-structure` · `agent-instructions` · `ai-code-safety` · `framework-safety` · `iac-security` · `false-positive-review`
1135
+ `data-security` · `cybersecurity` · `security` · `cost-effectiveness` · `scalability` · `cloud-readiness` · `software-practices` · `accessibility` · `api-design` · `api-contract` · `reliability` · `observability` · `performance` · `compliance` · `data-sovereignty` · `testing` · `documentation` · `internationalization` · `dependency-health` · `concurrency` · `ethics-bias` · `maintainability` · `error-handling` · `authentication` · `database` · `caching` · `configuration-management` · `backwards-compatibility` · `portability` · `ux` · `logging-privacy` · `rate-limiting` · `ci-cd` · `code-structure` · `agent-instructions` · `ai-code-safety` · `framework-safety` · `iac-security` · `hallucination-detection` · `intent-alignment` · `multi-turn-coherence` · `model-fingerprint` · `over-engineering` · `logic-review` · `false-positive-review`
1134
1136
 
1135
1137
  ---
1136
1138
 
@@ -1186,7 +1188,6 @@ Each judge has a corresponding prompt for LLM-powered deep analysis:
1186
1188
  | `judge-over-engineering` | Deep review of unnecessary abstractions, wrapper-mania, premature generalization |
1187
1189
  | `judge-logic-review` | Deep review of logic correctness, semantic mismatches, and dead code in AI-generated code |
1188
1190
  | `judge-false-positive-review` | Meta-judge review of pattern-based findings for false positive detection and accuracy |
1189
- | `full-tribunal` | all 45 judges in a single prompt |
1190
1191
  <!-- PROMPTS_TABLE_END -->
1191
1192
 
1192
1193
  ---
@@ -1216,7 +1217,7 @@ Create a `.judgesrc.json` (or `.judgesrc`) file in your project root to customiz
1216
1217
  | Field | Type | Default | Description |
1217
1218
  |-------|------|---------|-------------|
1218
1219
  | `$schema` | `string` | — | JSON Schema URL for IDE validation |
1219
- | `preset` | `string` | — | Named preset (see [Named Presets](#named-presets) for all 18 options) |
1220
+ | `preset` | `string` | — | Named preset (see [Named Presets](#named-presets) for all 22 options) |
1220
1221
  | `minSeverity` | `string` | `"info"` | Minimum severity to report: `critical` ยท `high` ยท `medium` ยท `low` ยท `info` |
1221
1222
  | `disabledRules` | `string[]` | `[]` | Rule IDs or prefix wildcards to suppress (e.g. `"COST-*"`, `"SEC-003"`) |
1222
1223
  | `disabledJudges` | `string[]` | `[]` | Judge IDs to skip entirely (e.g. `"cost-effectiveness"`) |
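Putting the fields above together, a minimal `.judgesrc.json` might look like the following sketch (values are illustrative; `$schema` is omitted here because its URL is not shown):

```json
{
  "preset": "ai-review",
  "minSeverity": "medium",
  "disabledRules": ["COST-*", "SEC-003"],
  "disabledJudges": ["cost-effectiveness"]
}
```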
@@ -1344,7 +1345,7 @@ judges/
1344
1345
  │ ├── evaluators/ # Analysis engine for each judge
1345
1346
  │ │ ├── index.ts # evaluateWithJudge(), evaluateWithTribunal(), evaluateProject(), etc.
1346
1347
  │ │ ├── shared.ts # Scoring, verdict logic, markdown formatters
1347
- │ │ └── *.ts # One analyzer per judge (39 files)
1348
+ │ │ └── *.ts # One analyzer per judge (45 files)
1348
1349
  │ ├── formatters/ # Output formatters
1349
1350
  │ │ ├── sarif.ts # SARIF 2.1.0 output
1350
1351
  │ │ ├── html.ts # Self-contained HTML report (dark/light theme, filters)
@@ -1371,12 +1372,12 @@ judges/
1371
1372
  │ │ └── config-share.ts # Shareable team/org configuration
1372
1373
  │ ├── presets.ts # Named evaluation presets (strict, lenient, security-only, …)
1373
1374
  │ ├── patches/
1374
- │ │ └── index.ts # 53 deterministic auto-fix patch rules
1375
+ │ │ └── index.ts # 201 deterministic auto-fix patch rules
1375
1376
  │ ├── tools/ # MCP tool registrations
1376
1377
  │ │ ├── register.ts # Tool registration orchestrator
1377
1378
  │ │ ├── register-evaluation.ts # Evaluation tools (evaluate_code, etc.)
1378
1379
  │ │ ├── register-workflow.ts # Workflow tools (app builder, reports, etc.)
1379
- │ │ ├── prompts.ts # MCP prompt registrations (per-judge + full-tribunal)
1380
+ │ │ ├── prompts.ts # MCP prompt registrations (per-judge prompts)
1380
1381
  │ │ └── schemas.ts # Zod schemas for tool parameters
1381
1382
  │ ├── reports/
1382
1383
  │ │ └── public-repo-report.ts # Public repo clone + full tribunal report generation
@@ -30,6 +30,23 @@ RULES FOR YOUR EVALUATION:
30
30
  - Flag any endpoint that accepts user input without verifying the caller's identity and permissions.
31
31
  - Score from 0-100 where 100 means robust auth implementation.
32
32
 
33
+ CLEAN CODE RECOGNITION (if ALL of the following are true, report ZERO findings):
34
+ - Authentication middleware protects all routes that handle user data or state changes.
35
+ - Passwords are hashed with bcrypt, scrypt, or argon2 — not stored in plaintext or weak hashes.
36
+ - JWTs are verified with explicit algorithm restrictions, expiration, and issuer/audience checks.
37
+ - Sessions use secure, httpOnly, sameSite cookies with proper expiration and rotation.
38
+ - OAuth/OIDC flows use PKCE, validate state parameters, and allowlist redirect URIs.
39
+ - API keys are transmitted in headers (not query params) and scoped to minimum permissions.
40
+ If the code meets these criteria, authentication is implemented correctly. Do NOT manufacture findings.
41
+
42
+ DOMAIN BOUNDARY (defer these to other judges):
43
+ - Injection attacks and XSS exploit paths → defer to CYBER judge.
44
+ - General security posture and cryptographic practices → defer to SEC judge.
45
+ - Rate limiting on login endpoints → defer to RATE judge (unless auth logic itself is broken).
46
+ - Error handling in auth flows → defer to ERR judge.
47
+ - Data privacy in auth tokens/logs → defer to DATA/LOGPRIV judges.
48
+ Only flag issues within YOUR domain: authentication middleware gaps, credential handling, token security, session management, authorization checks, OAuth/OIDC implementation, privilege escalation.
49
+
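For illustration, a token-verification helper that satisfies the JWT criterion above might look like this sketch (library choice and option values are assumptions, not code from this package):

```typescript
import jwt, { type JwtPayload } from "jsonwebtoken";

// Explicit algorithm allowlist plus issuer/audience checks; `exp` is enforced by default.
export function verifyAccessToken(token: string): JwtPayload {
  const decoded = jwt.verify(token, process.env.JWT_PUBLIC_KEY ?? "", {
    algorithms: ["RS256"],
    issuer: "https://auth.example.com",
    audience: "api.example.com",
  });
  if (typeof decoded === "string") throw new Error("Unexpected token payload");
  return decoded;
}
```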
33
50
  FALSE POSITIVE AVOIDANCE:
34
51
  - Do NOT flag code that uses established authentication libraries (passport, next-auth, Spring Security, etc.) following their documented patterns.
35
52
  - JWT verification with explicit algorithm restrictions and proper expiration checks is correct implementation, not a vulnerability.
@@ -29,6 +29,24 @@ RULES FOR YOUR EVALUATION:
29
29
  - Reference OWASP, CWE IDs, and CVE IDs where applicable.
30
30
  - Score from 0-100 where 100 means no exploitable vulnerabilities found.
31
31
 
32
+ CLEAN CODE RECOGNITION (if ALL of the following are true, report ZERO findings):
33
+ - Input validation and sanitization are applied to user-controlled data before use in queries, commands, or output.
34
+ - Passwords are hashed with bcrypt, scrypt, or argon2 — not MD5/SHA1.
35
+ - Database queries use parameterized statements or an ORM with proper escaping.
36
+ - Security middleware is present (helmet, CORS, CSRF tokens) for web applications.
37
+ - Secrets are loaded from environment variables or a secrets manager, not hardcoded.
38
+ - Dependencies are imported from standard registries with version pinning.
39
+ - Error responses do not leak stack traces or internal details to clients.
40
+ If the code meets these criteria, it is implementing security correctly. Do NOT manufacture findings.
41
+
42
+ DOMAIN BOUNDARY (defer these to other judges):
43
+ - Rate limiting, throttling, and abuse prevention → defer to RATE judge.
44
+ - Authentication flows, session management, OAuth/OIDC → defer to AUTH judge.
45
+ - General security posture, defense-in-depth patterns → defer to SEC judge.
46
+ - Error handling completeness and error propagation → defer to ERR judge.
47
+ - Data privacy, PII handling, logging of sensitive data → defer to DATA/LOGPRIV judges.
48
+ Only flag issues within YOUR domain: injection attacks, XSS, CSRF/SSRF, dependency CVEs, cryptographic weaknesses, OWASP Top 10 violations with concrete exploit paths.
49
+
32
50
  FALSE POSITIVE AVOIDANCE:
33
51
  - Do NOT flag established security library usage (helmet, cors, bcrypt, argon2, parameterized queries) as security issues โ€” these ARE the correct patterns.
34
52
  - Code that properly validates input, uses HTTPS, and parameterizes queries is implementing security correctly.
@@ -30,6 +30,15 @@ RULES FOR YOUR EVALUATION:
30
30
  - Flag any code path that could throw without a handler in scope.
31
31
  - Score from 0-100 where 100 means robust error handling.
32
32
 
33
+ CLEAN CODE RECOGNITION (if ALL of the following are true, report ZERO findings):
34
+ - Try-catch blocks wrap code paths that can throw, with meaningful handling (log, re-throw, or recover).
35
+ - Async operations use try-catch or .catch() to handle rejections.
36
+ - Error responses return consistent structures with appropriate HTTP status codes.
37
+ - Resources (connections, file handles, streams) are cleaned up in finally blocks or using disposal patterns.
38
+ - Framework error middleware or global handlers are present (Express error middleware, Spring @ExceptionHandler, etc.).
39
+ - Stack traces and internal details are not exposed to end users in error responses.
40
+ If the code meets these criteria, error handling is implemented correctly. Do NOT manufacture findings.
41
+
33
42
  FALSE POSITIVE AVOIDANCE:
34
43
  - Do NOT flag error handling in code that delegates error handling to a framework (Express middleware, Spring @ExceptionHandler, etc.).
35
44
  - Try-catch with logging and re-throw is a valid error handling pattern, not a deficiency.
@@ -30,6 +30,15 @@ RULES FOR YOUR EVALUATION:
30
30
  - Consider both inbound (protecting your service) and outbound (respecting others') rate limits.
31
31
  - Score from 0-100 where 100 means comprehensive rate limiting.
32
32
 
33
+ CLEAN CODE RECOGNITION (if ALL of the following are true, report ZERO findings):
34
+ - Rate limiting middleware is applied to public-facing API endpoints (express-rate-limit, API gateway config, etc.).
35
+ - Request body size limits are configured (bodyParser limits, multer limits, etc.).
36
+ - List/query endpoints have pagination with enforced maximum page sizes.
37
+ - External API calls use bounded retries with exponential backoff and jitter.
38
+ - Connection pools and concurrent request limits are bounded.
39
+ If the code meets these criteria, rate limiting is implemented correctly. Do NOT manufacture findings.
40
+ IMPORTANT: CLI tools, data scripts, utility libraries, batch processors, and internal services do NOT need rate limiting. If the code is not a public-facing API or web server, report ZERO findings.
41
+
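As a rough illustration of Express code that would pass this gate (the packages are those named in the criteria above; the limits are illustrative):

```typescript
import express from "express";
import rateLimit from "express-rate-limit";

const app = express();
app.use(express.json({ limit: "100kb" }));                  // bounded request bodies
app.use(rateLimit({ windowMs: 15 * 60 * 1000, max: 100 })); // per-IP request cap

app.get("/api/items", (req, res) => {
  // Pagination with an enforced maximum page size.
  const pageSize = Math.min(Number(req.query.pageSize) || 20, 100);
  res.json({ items: [], pageSize });
});
```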
33
42
  FALSE POSITIVE AVOIDANCE:
34
43
  - Only flag rate-limiting issues in code that accepts external requests (APIs, WebSocket servers, public endpoints).
35
44
  - Do NOT flag internal services, batch processors, CLI tools, or cron jobs for missing rate limiting.
@@ -30,6 +30,24 @@ RULES FOR YOUR EVALUATION:
30
30
  - Reference CWE IDs where applicable.
31
31
  - Score from 0-100 where 100 means excellent security posture.
32
32
 
33
+ CLEAN CODE RECOGNITION (if ALL of the following are true, report ZERO findings):
34
+ - Security middleware is configured (helmet, CORS, CSRF protection) for web applications.
35
+ - User input is validated before use in data flows (queries, file ops, HTTP requests).
36
+ - Cryptographic operations use modern algorithms (AES-256, SHA-256+, bcrypt/argon2).
37
+ - Secrets are sourced from environment variables or a secrets manager, not hardcoded.
38
+ - Deserialization of untrusted data uses safe mechanisms (JSON.parse, not pickle/eval).
39
+ - JWT/token verification includes algorithm restrictions and expiration checks.
40
+ - No user-controlled URLs are used in redirects without validation.
41
+ If the code meets these criteria, it has a strong security posture. Do NOT manufacture findings.
42
+
43
+ DOMAIN BOUNDARY (defer these to other judges):
44
+ - Injection attacks (SQL, XSS, command injection) with exploit paths → defer to CYBER judge.
44
+ - Authentication flows, credential storage, session management → defer to AUTH judge.
45
+ - Rate limiting and abuse prevention → defer to RATE judge.
46
+ - Error handling patterns and error propagation → defer to ERR judge.
47
+ - Infrastructure-as-code security → defer to IAC judge.
49
+ Only flag issues within YOUR domain: insecure data flows, weak cryptography, missing security controls, unsafe deserialization, XML security, secret management, mass assignment, redirect validation.
50
+
33
51
  FALSE POSITIVE AVOIDANCE:
34
52
  - Do NOT flag code that uses established security libraries correctly (helmet, bcrypt, argon2, parameterized queries, CSRF tokens, rate limiters, proper TLS configuration).
35
53
  - Do NOT flag security controls in non-application code (CI/CD configs, IaC templates, documentation examples) unless they contain actual secrets or credentials.
package/dist/api.d.ts CHANGED
@@ -70,10 +70,9 @@ export { compareCapabilities, formatComparisonReport, formatFullComparisonMatrix
70
70
  export type { ToolProfile, ToolCapability, ComparisonResult } from "./comparison.js";
71
71
  export { runBenchmarkSuite, benchmarkGate, formatBenchmarkReport, formatBenchmarkMarkdown, analyzeL2Coverage, formatL2CoverageReport, ingestFindingsAsBenchmarkCases, deduplicateIngestCases, BENCHMARK_CASES, } from "./commands/benchmark.js";
72
72
  export type { BenchmarkCase, BenchmarkResult, BenchmarkGateOptions, BenchmarkGateResult, L2CoverageAnalysis, L2JudgeCoverage, L2CategoryCoverage, } from "./commands/benchmark.js";
73
- export { parseLlmRuleIds, constructPerJudgePrompt, constructTribunalPrompt, selectStratifiedSample, scoreLlmCase, computeLlmMetrics, formatLlmSnapshotMarkdown, formatLayerComparisonMarkdown, extractValidatedLlmFindings, getValidRulePrefixes, getTribunalValidPrefixes, } from "./commands/llm-benchmark.js";
73
+ export { parseLlmRuleIds, constructPerJudgePrompt, constructTribunalPrompt, selectStratifiedSample, selectJudgesForCategory, scoreLlmCase, computeLlmMetrics, formatLlmSnapshotMarkdown, formatLayerComparisonMarkdown, extractValidatedLlmFindings, getValidRulePrefixes, getTribunalValidPrefixes, } from "./commands/llm-benchmark.js";
74
74
  export type { LlmBenchmarkSnapshot, LlmCaseResult } from "./commands/llm-benchmark.js";
75
75
  export type { LlmFinding, ValidationResult } from "./probabilistic/llm-response-validator.js";
76
- export { optimizeBenchmark, formatAmendmentSection, createEmptyStore, mergeAmendments, } from "./commands/llm-benchmark-optimizer.js";
77
76
  export type { PromptAmendment, OptimizerInsight, OptimizationResult, AmendmentStore, } from "./commands/llm-benchmark-optimizer.js";
78
77
  export { runReviewAutopilot, dedupeComments, filterAlreadyPostedComments } from "./commands/review.js";
79
78
  export { buildContextSnippets } from "./context/context-snippets.js";
package/dist/api.js CHANGED
@@ -80,9 +80,7 @@ export { compareCapabilities, formatComparisonReport, formatFullComparisonMatrix
80
80
  // ─── Benchmark Gate ──────────────────────────────────────────────
81
81
  export { runBenchmarkSuite, benchmarkGate, formatBenchmarkReport, formatBenchmarkMarkdown, analyzeL2Coverage, formatL2CoverageReport, ingestFindingsAsBenchmarkCases, deduplicateIngestCases, BENCHMARK_CASES, } from "./commands/benchmark.js";
82
82
  // ─── LLM Benchmark ───────────────────────────────────────────────
83
- export { parseLlmRuleIds, constructPerJudgePrompt, constructTribunalPrompt, selectStratifiedSample, scoreLlmCase, computeLlmMetrics, formatLlmSnapshotMarkdown, formatLayerComparisonMarkdown, extractValidatedLlmFindings, getValidRulePrefixes, getTribunalValidPrefixes, } from "./commands/llm-benchmark.js";
84
- // ─── LLM Benchmark Optimizer (Self-Teaching) ────────────────────
85
- export { optimizeBenchmark, formatAmendmentSection, createEmptyStore, mergeAmendments, } from "./commands/llm-benchmark-optimizer.js";
83
+ export { parseLlmRuleIds, constructPerJudgePrompt, constructTribunalPrompt, selectStratifiedSample, selectJudgesForCategory, scoreLlmCase, computeLlmMetrics, formatLlmSnapshotMarkdown, formatLayerComparisonMarkdown, extractValidatedLlmFindings, getValidRulePrefixes, getTribunalValidPrefixes, } from "./commands/llm-benchmark.js";
86
84
  // Review autopilot (GitHub App / scripts)
87
85
  export { runReviewAutopilot, dedupeComments, filterAlreadyPostedComments } from "./commands/review.js";
88
86
  export { buildContextSnippets } from "./context/context-snippets.js";
@@ -229,7 +229,7 @@ def load_config(path):
229
229
  public void authenticateUser(String u, String p) { /* 40 lines */ }
230
230
  public void scheduleTask(Task t) { /* 20 lines */ }
231
231
  }`,
232
- expectedRuleIds: ["DATA-001", "COMP-001", "SOV-001", "DOC-001"],
232
+ expectedRuleIds: ["STRUCT-001", "MAINT-001", "DOC-001"],
233
233
  category: "code-structure",
234
234
  difficulty: "medium",
235
235
  },
@@ -27,7 +27,7 @@ app.post("/api/users", validateSchema(userSchema), async (req, res) => {
27
27
  await user.save();
28
28
  res.json({ id: user.id });
29
29
  });`,
30
- expectedRuleIds: ["SCALE-001", "PERF-001", "COST-001", "API-001", "COMP-001"],
30
+ expectedRuleIds: ["HALLU-001"],
31
31
  category: "hallucination",
32
32
  difficulty: "medium",
33
33
  },
@@ -57,7 +57,7 @@ app.post("/api/users", validateSchema(userSchema), async (req, res) => {
57
57
 
58
58
  return { formatted, config, serialized };
59
59
  }`,
60
- expectedRuleIds: ["CYBER-001", "CONC-001", "CACHE-001", "SEC-001"],
60
+ expectedRuleIds: ["HALLU-001"],
61
61
  category: "hallucination",
62
62
  difficulty: "easy",
63
63
  },
@@ -1018,7 +1018,7 @@ def delete_user(request):
1018
1018
  async auditLog(action: string) { /* 30 lines */ }
1019
1019
  // 2000+ lines, 50+ methods, handles everything
1020
1020
  }`,
1021
- expectedRuleIds: ["SOV-001"],
1021
+ expectedRuleIds: ["MAINT-001"],
1022
1022
  category: "software-development",
1023
1023
  difficulty: "medium",
1024
1024
  },
@@ -205,7 +205,7 @@ spec:
205
205
  }
206
206
  ]
207
207
  }`,
208
- expectedRuleIds: ["DEPS-001"],
208
+ expectedRuleIds: ["IAC-001"],
209
209
  category: "cloud",
210
210
  difficulty: "easy",
211
211
  },
@@ -1201,7 +1201,7 @@ jobs:
1201
1201
  -H "Authorization: Bearer \${{ secrets.DEPLOY_TOKEN }}" \\
1202
1202
  -d '{"sha": "\${{ github.sha }}"}'
1203
1203
  - run: echo "$\{{ secrets.AWS_SECRET_KEY }}" > /tmp/key`,
1204
- expectedRuleIds: ["CLOUD-001", "PORTA-001"],
1204
+ expectedRuleIds: ["SEC-001"],
1205
1205
  category: "cicd",
1206
1206
  difficulty: "medium",
1207
1207
  },
@@ -25,6 +25,13 @@ export interface BenchmarkCase {
25
25
  expectedRuleIds: string[];
26
26
  /** Rule IDs that should NOT be detected (known false positives) */
27
27
  unexpectedRuleIds?: string[];
28
+ /**
29
+ * Acceptable rule prefixes: findings from these judge domains are
30
+ * domain-relevant and should NOT count as false positives even if not
31
+ * in expectedRuleIds. For example, a SQL-injection case may acceptably
32
+ * also trigger AUTH or SEC findings.
33
+ */
34
+ acceptablePrefixes?: string[];
28
35
  /** Category of vulnerability (e.g. "injection", "auth", "xss") */
29
36
  category: string;
30
37
  /** Difficulty level */
@@ -844,7 +844,7 @@ function getErrorMessage(code: number): string {
844
844
  "bower": "^1.8.0"
845
845
  }
846
846
  }`,
847
- expectedRuleIds: ["DEPS-001", "SUPPLY-001"],
847
+ expectedRuleIds: ["DEPS-001"],
848
848
  category: "dependency-health",
849
849
  difficulty: "easy",
850
850
  },
@@ -2337,13 +2337,13 @@ export function runBenchmarkSuite(cases, judgeId) {
2337
2337
  cat.truePositives += caseTP;
2338
2338
  cat.falseNegatives += caseFN;
2339
2339
  cat.falsePositives += caseFP;
2340
- // Per-judge accumulators
2341
- // Only count detections on clean cases (expectedRuleIds empty) as FP.
2342
- // Dirty-case "extra" detections are legitimate secondary findings and
2343
- // should not inflate per-judge false-positive rates.
2344
- const isCleanCase = tc.expectedRuleIds.length === 0;
2340
+ // Per-judge accumulators (deduplicate by prefix per case to match case-level FP counting)
2341
+ const seenPrefixes = new Set();
2345
2342
  for (const ruleId of foundRuleIds) {
2346
2343
  const prefix = ruleId.split("-")[0];
2344
+ if (seenPrefixes.has(prefix))
2345
+ continue;
2346
+ seenPrefixes.add(prefix);
2347
2347
  if (!perJudge[prefix]) {
2348
2348
  perJudge[prefix] = {
2349
2349
  judgeId: prefix,
@@ -2361,7 +2361,7 @@ export function runBenchmarkSuite(cases, judgeId) {
2361
2361
  if (expectedPrefixes.has(prefix)) {
2362
2362
  jb.truePositives++;
2363
2363
  }
2364
- else if (isCleanCase) {
2364
+ else {
2365
2365
  jb.falsePositives++;
2366
2366
  }
2367
2367
  }
@@ -17,6 +17,19 @@ import type { JudgeDefinition } from "../types.js";
17
17
  import type { BenchmarkCase, CategoryResult, JudgeBenchmarkResult, DifficultyResult } from "./benchmark.js";
18
18
  import type { PromptAmendment } from "./llm-benchmark-optimizer.js";
19
19
  export declare const TRIBUNAL_JUDGES: JudgeDefinition[];
20
+ /**
21
+ * Get acceptable prefixes for a benchmark case. Uses the case's explicit
22
+ * acceptablePrefixes if defined, otherwise falls back to the category map.
23
+ * Expected prefixes are always included (they're TPs, not FPs).
24
+ */
25
+ export declare function getAcceptablePrefixes(tc: BenchmarkCase): Set<string>;
26
+ /**
27
+ * Select a focused subset of tribunal judges relevant to a benchmark case's
28
+ * category. Returns core judges + category-specific judges, typically 8-15
29
+ * instead of the full 35. Returns undefined if no routing is possible
30
+ * (unknown category), signalling the caller to use all tribunal judges.
31
+ */
32
+ export declare function selectJudgesForCategory(category: string): JudgeDefinition[] | undefined;
20
33
  export interface LlmBenchmarkSnapshot {
21
34
  /** Timestamp of this LLM benchmark run */
22
35
  timestamp: string;
@@ -48,6 +61,8 @@ export interface LlmBenchmarkSnapshot {
48
61
  recall: number;
49
62
  /** F1 Score */
50
63
  f1Score: number;
64
+ /** Severity-weighted F1 โ€” penalizes critical/high FPs more heavily */
65
+ weightedF1Score?: number;
51
66
  /** Detection rate: cases detected / total cases */
52
67
  detectionRate: number;
53
68
  /** Per-category breakdown */
@@ -102,8 +117,9 @@ export declare function extractValidatedLlmFindings(response: string, prefixes?:
102
117
  export declare function constructPerJudgePrompt(judge: JudgeDefinition, code: string, language: string, contextSnippets?: string[], amendments?: PromptAmendment[]): string;
103
118
  /**
104
119
  * Construct the full-tribunal prompt — identical to the MCP-served `full-tribunal` prompt.
120
+ * When `judges` is provided, uses that filtered list instead of all tribunal judges.
105
121
  */
106
- export declare function constructTribunalPrompt(code: string, language: string, contextSnippets?: string[], amendments?: PromptAmendment[]): string;
122
+ export declare function constructTribunalPrompt(code: string, language: string, contextSnippets?: string[], amendments?: PromptAmendment[], judges?: JudgeDefinition[]): string;
107
123
  /**
108
124
  * Select a stratified sample of benchmark cases, ensuring representation
109
125
  * across categories, difficulties, and both clean/dirty cases.
@@ -112,8 +128,12 @@ export declare function selectStratifiedSample(cases: BenchmarkCase[], targetSiz
112
128
  /**
113
129
  * Score a single LLM benchmark case using prefix-based matching.
114
130
  * Returns a fully populated LlmCaseResult.
131
+ *
132
+ * @param topKPrefixes - If set, only keep the first `topKPrefixes` unique
133
+ * detected prefixes (in the order they appear in the LLM response).
134
+ * This prevents verbose tribunal output from inflating FP counts.
115
135
  */
116
- export declare function scoreLlmCase(tc: BenchmarkCase, detectedRuleIds: string[], rawResponse: string, tokensUsed?: number): LlmCaseResult;
136
+ export declare function scoreLlmCase(tc: BenchmarkCase, detectedRuleIds: string[], rawResponse: string, tokensUsed?: number, topKPrefixes?: number): LlmCaseResult;
117
137
  /**
118
138
  * Compute aggregate metrics for an LLM benchmark snapshot from raw case results.
119
139
  * Uses the same prefix-based matching methodology as the L1 benchmark.
@@ -14,7 +14,7 @@
14
14
  * `scripts/run-llm-benchmark.ts` has been removed.
15
15
  */
16
16
  import { JUDGES } from "../judges/index.js";
17
- import { getCondensedCriteria, SHARED_ADVERSARIAL_MANDATE, PRECISION_MANDATE } from "../tools/prompts.js";
17
+ import { getCondensedCriteria, SHARED_ADVERSARIAL_MANDATE, PRECISION_MANDATE, CLEAN_CODE_GATE, } from "../tools/prompts.js";
18
18
  import { extractAndValidateLlmFindings, mergeFindings } from "../probabilistic/llm-response-validator.js";
19
19
  import { formatAmendmentSection } from "./llm-benchmark-optimizer.js";
20
20
  // ─── Tribunal Judge Filtering ────────────────────────────────────
@@ -22,6 +22,89 @@ import { formatAmendmentSection } from "./llm-benchmark-optimizer.js";
22
22
  // near-100% false positives in single-pass tribunal mode and are excluded.
23
23
  const TRIBUNAL_EXCLUDED_PREFIXES = new Set(["INTENT", "COH", "MFPR", "FPR", "OVER"]);
24
24
  export const TRIBUNAL_JUDGES = JUDGES.filter((j) => !TRIBUNAL_EXCLUDED_PREFIXES.has(j.rulePrefix));
25
+ // ─── Category → Acceptable Prefixes Mapping ─────────────────────
26
+ // For each benchmark case category, these judge prefixes are domain-relevant
27
+ // and findings from them should NOT count as false positives even when not
28
+ // in expectedRuleIds. This prevents legitimate cross-domain observations
29
+ // from inflating the FP metric.
30
+ // ──────────────────────────────────────────────────────────────────
31
+ const CATEGORY_ACCEPTABLE_PREFIXES = {
32
+ injection: ["SEC", "CYBER", "DB", "DATA", "ERR", "FW", "LOGIC"],
33
+ xss: ["SEC", "CYBER", "FW", "DATA", "ERR", "LOGIC"],
34
+ auth: ["AUTH", "SEC", "CYBER", "DATA", "CFG", "ERR", "LOGIC"],
35
+ "rate-limiting": ["RATE", "PERF", "SCALE", "REL", "SEC", "ERR"],
36
+ "error-handling": ["ERR", "REL", "OBS", "LOGIC", "MAINT", "STRUCT"],
37
+ "data-security": ["DATA", "SEC", "CYBER", "LOGPRIV", "SOV", "CFG", "ERR"],
38
+ security: ["SEC", "CYBER", "AUTH", "DATA", "FW", "CFG", "ERR", "LOGIC"],
39
+ concurrency: ["CONC", "PERF", "REL", "LOGIC", "ERR", "MAINT"],
40
+ performance: ["PERF", "SCALE", "CACHE", "DB", "CONC", "LOGIC", "MAINT"],
41
+ database: ["DB", "SEC", "DATA", "PERF", "ERR", "LOGIC"],
42
+ "api-design": ["API", "ERR", "AUTH", "SEC", "STRUCT", "LOGIC", "MAINT"],
43
+ observability: ["OBS", "LOGPRIV", "REL", "ERR", "CFG", "MAINT"],
44
+ reliability: ["REL", "ERR", "CONC", "PERF", "OBS", "LOGIC"],
45
+ scalability: ["SCALE", "PERF", "CACHE", "CLOUD", "CONC", "STRUCT"],
46
+ "cloud-readiness": ["CLOUD", "CFG", "CICD", "SCALE", "PORTA", "SEC"],
47
+ configuration: ["CFG", "SEC", "DATA", "CLOUD", "ERR"],
48
+ maintainability: ["MAINT", "STRUCT", "SWDEV", "DOC", "LOGIC", "ERR"],
49
+ "code-structure": ["STRUCT", "MAINT", "SWDEV", "LOGIC", "ERR"],
50
+ documentation: ["DOC", "MAINT", "SWDEV", "STRUCT"],
51
+ testing: ["TEST", "SWDEV", "LOGIC", "ERR", "MAINT"],
52
+ "cost-effectiveness": ["COST", "CLOUD", "SCALE", "PERF", "IAC"],
53
+ compliance: ["COMP", "DATA", "SOV", "LOGPRIV", "SEC", "CYBER"],
54
+ accessibility: ["A11Y", "UX", "I18N", "STRUCT", "LOGIC"],
55
+ internationalization: ["I18N", "A11Y", "UX", "STRUCT"],
56
+ "dependency-health": ["DEPS", "SEC", "COMPAT", "MAINT"],
57
+ "logging-privacy": ["LOGPRIV", "DATA", "OBS", "SEC", "ERR"],
58
+ "backwards-compatibility": ["COMPAT", "API", "STRUCT", "LOGIC"],
59
+ caching: ["CACHE", "PERF", "SCALE", "REL", "LOGIC"],
60
+ "ethics-bias": ["ETHICS", "DATA", "COMP", "SEC"],
61
+ portability: ["PORTA", "CLOUD", "STRUCT", "CFG"],
62
+ "ci-cd": ["CICD", "SEC", "CFG", "CLOUD", "TEST"],
63
+ "iac-security": ["IAC", "SEC", "CYBER", "CFG", "CLOUD"],
64
+ cloud: ["CLOUD", "IAC", "SEC", "CYBER", "CFG", "SCALE"],
65
+ ethics: ["ETHICS", "A11Y", "UX", "DATA", "COMP"],
66
+ "framework-safety": ["FW", "SEC", "CYBER", "ERR", "LOGIC"],
67
+ "framework-security": ["FW", "SEC", "CYBER", "AUTH", "ERR", "API", "COMP", "OBS", "COMPAT", "CONC", "DOC"],
68
+ "agent-instructions": ["AGENT", "SEC", "CYBER", "AICS", "ERR", "LOGIC"],
69
+ cicd: ["CICD", "SEC", "CFG", "CLOUD", "TEST", "PORTA"],
70
+ ux: ["UX", "ERR", "SEC", "A11Y", "I18N", "LOGIC"],
71
+ "software-practices": ["SWDEV", "MAINT", "STRUCT", "DOC", "LOGIC", "ERR"],
72
+ "software-development": ["SWDEV", "MAINT", "STRUCT", "DOC", "LOGIC", "ERR"],
73
+ "code-quality": ["MAINT", "API", "STRUCT", "SWDEV", "LOGIC", "ERR"],
74
+ "supply-chain": ["DEPS", "SEC", "COMPAT", "MAINT"],
75
+ "ai-security": ["AICS", "SEC", "CYBER", "DATA", "ERR", "LOGIC"],
76
+ clean: [], // Clean code — no acceptable prefixes, all findings are FPs
77
+ };
78
+ /**
79
+ * Get acceptable prefixes for a benchmark case. Uses the case's explicit
80
+ * acceptablePrefixes if defined, otherwise falls back to the category map.
81
+ * Expected prefixes are always included (they're TPs, not FPs).
82
+ */
83
+ export function getAcceptablePrefixes(tc) {
84
+ const explicit = tc.acceptablePrefixes;
85
+ const fromCategory = CATEGORY_ACCEPTABLE_PREFIXES[tc.category] ?? [];
86
+ const combined = new Set([...tc.expectedRuleIds.map((r) => r.split("-")[0]), ...(explicit ?? fromCategory)]);
87
+ return combined;
88
+ }
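// Example: a case with expectedRuleIds ["CYBER-001"] and category "injection"
// resolves to the set { CYBER, SEC, DB, DATA, ERR, FW, LOGIC }.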
89
+ // ─── Core Judges (always included in routed tribunals) ──────────
90
+ // These judges provide universal code quality signals and should always be
91
+ // part of the tribunal regardless of category.
92
+ const CORE_JUDGE_PREFIXES = new Set(["SEC", "ERR", "LOGIC", "STRUCT", "MAINT"]);
93
+ /**
94
+ * Select a focused subset of tribunal judges relevant to a benchmark case's
95
+ * category. Returns core judges + category-specific judges, typically 8-15
96
+ * instead of the full 35. Returns undefined if no routing is possible
97
+ * (unknown category), signalling the caller to use all tribunal judges.
98
+ */
99
+ export function selectJudgesForCategory(category) {
100
+ const acceptable = CATEGORY_ACCEPTABLE_PREFIXES[category];
101
+ if (!acceptable || acceptable.length === 0)
102
+ return undefined;
103
+ const targetPrefixes = new Set([...CORE_JUDGE_PREFIXES, ...acceptable]);
104
+ const selected = TRIBUNAL_JUDGES.filter((j) => targetPrefixes.has(j.rulePrefix));
105
+ // Only route if we meaningfully reduced the set (at least 40% fewer)
106
+ return selected.length < TRIBUNAL_JUDGES.length * 0.6 ? selected : undefined;
107
+ }
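A sketch of how these two exports are meant to be combined by a caller (the import specifier and sample inputs are assumptions; the call shapes come from the signatures in this diff):

```typescript
import { selectJudgesForCategory, constructTribunalPrompt } from "@kevinrabun/judges";

const code = `db.query("SELECT * FROM users WHERE id = " + req.params.id);`;

// Known category: a focused judge subset; unknown category: undefined, meaning "use all judges".
const judges = selectJudgesForCategory("injection");
const prompt = constructTribunalPrompt(code, "javascript", [], undefined, judges);
```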
25
108
  // ─── Rule ID Parsing ─────────────────────────────────────────────
26
109
  /**
27
110
  * Extract unique rule IDs from LLM response text.
@@ -41,7 +124,7 @@ export function getTribunalValidPrefixes() {
41
124
  }
42
125
  export function parseLlmRuleIds(response) {
43
126
  const validPrefixes = getValidRulePrefixes();
44
- const pattern = /\b([A-Z]{2,})-(\d{3})\b/g;
127
+ const pattern = /\b([A-Z][A-Z0-9]+)-(\d{1,3})\b/g;
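// The widened pattern also matches digit-bearing prefixes (e.g. A11Y-001, I18N-002) and 1-3 digit rule numbers.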
45
128
  const found = new Set();
46
129
  let match;
47
130
  while ((match = pattern.exec(response)) !== null) {
@@ -85,35 +168,55 @@ export function constructPerJudgePrompt(judge, code, language, contextSnippets =
85
168
  (amendmentSection ? `${amendmentSection}\n` : "") +
86
169
  contextSection +
87
170
  `${criteria}\n\n` +
171
+ `${CLEAN_CODE_GATE}\n\n` +
88
172
  `Please evaluate the following ${language} code:\n\n\`\`\`${language}\n${code}\n\`\`\`` +
89
173
  `\n\nProvide your evaluation as structured findings with rule IDs (prefix: ${judge.rulePrefix}-), severity levels (critical/high/medium/low/info), descriptions, and actionable recommendations. If no issues meet the confidence threshold, report zero findings explicitly. End with an overall score (0-100) and verdict (pass/warning/fail).`);
90
174
  }
91
175
  /**
92
176
  * Construct the full-tribunal prompt — identical to the MCP-served `full-tribunal` prompt.
177
+ * When `judges` is provided, uses that filtered list instead of all tribunal judges.
93
178
  */
94
- export function constructTribunalPrompt(code, language, contextSnippets = [], amendments) {
95
- const judgeInstructions = TRIBUNAL_JUDGES.map((j) => `### ${j.name} โ€” ${j.domain}\n**Rule prefix:** \`${j.rulePrefix}-\`\n\n${getCondensedCriteria(j.systemPrompt)}`).join("\n\n---\n\n");
179
+ export function constructTribunalPrompt(code, language, contextSnippets = [], amendments, judges) {
180
+ const activeJudges = judges ?? TRIBUNAL_JUDGES;
181
+ const judgeInstructions = activeJudges
182
+ .map((j) => `### ${j.name} โ€” ${j.domain}\n**Rule prefix:** \`${j.rulePrefix}-\`\n\n${getCondensedCriteria(j.systemPrompt)}`)
183
+ .join("\n\n---\n\n");
96
184
  const contextSection = contextSnippets.length
97
185
  ? `## Repository Context\n\n${contextSnippets.map((s) => `- ${s.replace(/\n/g, " ")}`).join("\n")}\n\n`
98
186
  : "";
99
187
  const amendmentSection = formatAmendmentSection(amendments ?? []);
100
- return (`You are the Judges Panel — a panel of ${TRIBUNAL_JUDGES.length} expert judges who independently evaluate code for quality, security, and operational readiness.\n\n` +
188
+ return (`You are the Judges Panel — a panel of ${activeJudges.length} expert judges who independently evaluate code for quality, security, and operational readiness.\n\n` +
101
189
  `## Universal Evaluation Directives\n\n` +
102
190
  `${SHARED_ADVERSARIAL_MANDATE}\n\n` +
103
191
  `${PRECISION_MANDATE}\n\n` +
192
+ `${CLEAN_CODE_GATE}\n\n` +
104
193
  `DOMAIN SCOPE DIRECTIVE (applies to ALL judges):\n` +
105
194
  `- Each judge MUST only report findings within their stated domain expertise.\n` +
106
195
  `- A CI/CD judge should NOT report authentication findings. An ethics judge should NOT report performance findings.\n` +
107
196
  `- If code falls entirely outside your domain (e.g., a YAML CI workflow being evaluated by the Database judge), report ZERO findings for that judge.\n` +
108
- `- Cross-domain observations should ONLY be reported by the judge whose domain they fall under.\n\n` +
197
+ `- Cross-domain observations should ONLY be reported by the judge whose domain they fall under.\n` +
198
+ `- HARD LIMIT: Each judge may report AT MOST 2 findings. If a judge has more than 2 potential findings, keep only the 2 highest-severity, highest-confidence ones and discard the rest.\n\n` +
109
199
  (amendmentSection ? `${amendmentSection}\n` : "") +
110
200
  contextSection +
111
201
  `## Evaluation Instructions\n\n` +
112
- `Evaluate the following ${language} code from the perspective of ALL ${TRIBUNAL_JUDGES.length} judges below. For each judge, provide:\n` +
113
- `1. Judge name and domain\n` +
114
- `2. Verdict (PASS / WARNING / FAIL)\n` +
115
- `3. Score (0-100)\n` +
116
- `4. Specific findings with rule IDs (using each judge's rule prefix), severity, and recommendations\n\n` +
202
+ `Evaluate the following ${language} code from the perspective of ALL ${activeJudges.length} judges below.\n\n` +
203
+ `### Output Format โ€” Tiered Findings\n` +
204
+ `Organize ALL findings into three tiers:\n\n` +
205
+ `**🔴 MUST FIX** (critical/high severity — blocks merge):\n` +
206
+ `These are real bugs, security vulnerabilities, data loss risks, or correctness issues. ` +
207
+ `Report at most 5 findings here. Each must have concrete code evidence.\n\n` +
208
+ `**🟡 WORTH REVIEWING** (medium severity — warrants discussion):\n` +
209
+ `Design flaws, maintainability concerns, or reliability risks that a senior reviewer would flag. ` +
210
+ `Only include findings with specific code evidence.\n\n` +
211
+ `**🟢 INFORMATIONAL** (low/info severity — optional improvements):\n` +
212
+ `Minor style or optimization suggestions. Limit to the most impactful 3. Omit if none are genuinely useful.\n\n` +
213
+ `For each finding, provide:\n` +
214
+ `1. Rule ID (using the judge's prefix)\n` +
215
+ `2. Severity (critical/high/medium/low/info)\n` +
216
+ `3. Confidence (0-100%): How certain are you this is a real issue? Only include findings ≥80%.\n` +
217
+ `4. Judge name and domain\n` +
218
+ `5. Specific code evidence (line numbers, patterns)\n` +
219
+ `6. Description and recommendation\n\n` +
117
220
  `For judges where no issues meet the confidence threshold, report a PASS verdict with zero findings.\n\n` +
118
221
  `Then provide an OVERALL TRIBUNAL VERDICT that synthesizes all judges' input.\n\n` +
119
222
  `## The Judges\n\n${judgeInstructions}\n\n` +
@@ -181,8 +284,24 @@ export function selectStratifiedSample(cases, targetSize) {
181
284
  /**
182
285
  * Score a single LLM benchmark case using prefix-based matching.
183
286
  * Returns a fully populated LlmCaseResult.
287
+ *
288
+ * @param topKPrefixes - If set, only keep the first `topKPrefixes` unique
289
+ * detected prefixes (in the order they appear in the LLM response).
290
+ * This prevents verbose tribunal output from inflating FP counts.
184
291
  */
185
- export function scoreLlmCase(tc, detectedRuleIds, rawResponse, tokensUsed) {
292
+ export function scoreLlmCase(tc, detectedRuleIds, rawResponse, tokensUsed, topKPrefixes) {
293
+ // ── Optional top-K prefix cap ────────────────────────────────────
294
+ let filteredDetected = detectedRuleIds;
295
+ if (topKPrefixes !== undefined && topKPrefixes > 0) {
296
+ const seenPrefixes = new Set();
297
+ filteredDetected = detectedRuleIds.filter((id) => {
298
+ const prefix = id.split("-")[0];
299
+ if (seenPrefixes.size >= topKPrefixes && !seenPrefixes.has(prefix))
300
+ return false;
301
+ seenPrefixes.add(prefix);
302
+ return true;
303
+ });
304
+ }
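// Example: topKPrefixes = 3 with detections [SEC-001, SEC-002, ERR-001, CACHE-001, DB-001]
// keeps the first three unique prefixes (SEC, ERR, CACHE) and drops DB-001.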
186
305
  // ── Prefix-level FP deduplication ────────────────────────────────
187
306
  // TPs are counted per-expected-rule using prefix matching: a single
188
307
  // detected CYBER-xxx satisfies all expected CYBER-yyy rules.
@@ -191,7 +310,7 @@ export function scoreLlmCase(tc, detectedRuleIds, rawResponse, tokensUsed) {
191
310
  // generates for that prefix. This prevents verbose LLM output from
192
311
  // inflating the FP metric (e.g. CYBER-001…005 on clean code = 1 FP,
193
312
  // not 5).
194
- const detectedPrefixes = new Set(detectedRuleIds.map((r) => r.split("-")[0]));
313
+ const detectedPrefixes = new Set(filteredDetected.map((r) => r.split("-")[0]));
195
314
  const matchedExpected = tc.expectedRuleIds.filter((expected) => {
196
315
  const prefix = expected.split("-")[0];
197
316
  return detectedPrefixes.has(prefix);
@@ -203,19 +322,19 @@ export function scoreLlmCase(tc, detectedRuleIds, rawResponse, tokensUsed) {
203
322
  // For clean cases (no expected findings), ALL detections are false positives.
204
323
  // For dirty cases with unexpectedRuleIds, FPs are detections matching those prefixes.
205
324
  // For dirty cases WITHOUT unexpectedRuleIds, FPs are detections whose prefix
206
- // doesn't match any expected prefix (prevents silent over-reporting).
325
+ // doesn't match any expected or acceptable prefix.
207
326
  const isCleanCase = tc.expectedRuleIds.length === 0;
208
- const expectedPrefixes = new Set(tc.expectedRuleIds.map((r) => r.split("-")[0]));
327
+ const acceptablePrefixes = getAcceptablePrefixes(tc);
209
328
  const falsePositiveIdsRaw = isCleanCase
210
- ? detectedRuleIds
329
+ ? filteredDetected
211
330
  : tc.unexpectedRuleIds
212
- ? detectedRuleIds.filter((found) => {
331
+ ? filteredDetected.filter((found) => {
213
332
  const prefix = found.split("-")[0];
214
333
  return tc.unexpectedRuleIds.some((u) => u.split("-")[0] === prefix);
215
334
  })
216
- : detectedRuleIds.filter((found) => {
335
+ : filteredDetected.filter((found) => {
217
336
  const prefix = found.split("-")[0];
218
- return !expectedPrefixes.has(prefix);
337
+ return !acceptablePrefixes.has(prefix);
219
338
  });
220
339
  // Deduplicate FPs by prefix — keep one representative rule ID per prefix
221
340
  const fpPrefixSeen = new Set();
@@ -233,7 +352,7 @@ export function scoreLlmCase(tc, detectedRuleIds, rawResponse, tokensUsed) {
233
352
  difficulty: tc.difficulty,
234
353
  passed: casePassed,
235
354
  expectedRuleIds: tc.expectedRuleIds,
236
- detectedRuleIds,
355
+ detectedRuleIds: filteredDetected,
237
356
  missedRuleIds: missedExpected,
238
357
  falsePositiveRuleIds: falsePositiveIds,
239
358
  rawResponse,
@@ -289,11 +408,16 @@ export function computeLlmMetrics(rawCases, version, model, provider, promptMode
289
408
  cat.truePositives += caseTP;
290
409
  cat.falseNegatives += caseFN;
291
410
  cat.falsePositives += caseFP;
292
- // Per-judge
411
+ // Per-judge (deduplicate by prefix per case to match case-level FP counting)
412
+ // Use pre-computed falsePositiveRuleIds to stay consistent with scoreLlmCase
413
+ const fpPrefixes = new Set(c.falsePositiveRuleIds.map((r) => r.split("-")[0]));
293
414
  const expectedPrefixes = new Set(c.expectedRuleIds.map((r) => r.split("-")[0]));
294
- const isCleanCase = c.expectedRuleIds.length === 0;
415
+ const seenPrefixes = new Set();
295
416
  for (const ruleId of c.detectedRuleIds) {
296
417
  const prefix = ruleId.split("-")[0];
418
+ if (seenPrefixes.has(prefix))
419
+ continue;
420
+ seenPrefixes.add(prefix);
297
421
  if (!perJudge[prefix]) {
298
422
  perJudge[prefix] = {
299
423
  judgeId: prefix,
@@ -311,15 +435,48 @@ export function computeLlmMetrics(rawCases, version, model, provider, promptMode
311
435
  if (expectedPrefixes.has(prefix)) {
312
436
  jb.truePositives++;
313
437
  }
314
- else if (isCleanCase) {
438
+ else if (fpPrefixes.has(prefix)) {
315
439
  jb.falsePositives++;
316
440
  }
441
+ // Acceptable (non-expected, non-FP) detections are silently ignored
317
442
  }
318
443
  }
319
444
  // Compute aggregate metrics
320
445
  const precision = totalTP + totalFP > 0 ? totalTP / (totalTP + totalFP) : 1;
321
446
  const recall = totalTP + totalFN > 0 ? totalTP / (totalTP + totalFN) : 1;
322
447
  const f1Score = precision + recall > 0 ? (2 * precision * recall) / (precision + recall) : 0;
448
+ // Severity-weighted F1 — re-extract findings from raw responses to get
449
+ // severity info, then weight FPs: critical/high=3x, medium=1x, low/info=0.3x
450
+ const SEVERITY_WEIGHTS = {
451
+ critical: 3,
452
+ high: 3,
453
+ medium: 1,
454
+ low: 0.3,
455
+ info: 0.3,
456
+ };
457
+ let weightedFP = 0;
458
+ const tribunalPrefixes = getTribunalValidPrefixes();
459
+ for (const c of rawCases) {
460
+ if (c.falsePositiveRuleIds.length === 0)
461
+ continue;
462
+ const fpSet = new Set(c.falsePositiveRuleIds.map((r) => r.split("-")[0]));
463
+ const validation = extractValidatedLlmFindings(c.rawResponse, tribunalPrefixes);
464
+ // Map finding ruleId prefix → max severity weight
465
+ const prefixMaxWeight = new Map();
466
+ for (const f of validation.findings) {
467
+ const prefix = f.ruleId.split("-")[0];
468
+ if (!fpSet.has(prefix))
469
+ continue;
470
+ const weight = SEVERITY_WEIGHTS[f.severity] ?? 1;
471
+ prefixMaxWeight.set(prefix, Math.max(prefixMaxWeight.get(prefix) ?? 0, weight));
472
+ }
473
+ // Sum weights for FP prefixes (use weight=1 default if severity unknown)
474
+ for (const prefix of fpSet) {
475
+ weightedFP += prefixMaxWeight.get(prefix) ?? 1;
476
+ }
477
+ }
478
+ const weightedPrecision = totalTP + weightedFP > 0 ? totalTP / (totalTP + weightedFP) : 1;
479
+ const weightedF1Score = weightedPrecision + recall > 0 ? (2 * weightedPrecision * recall) / (weightedPrecision + recall) : 0;
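// Illustrative check: totalTP = 8 with one critical FP (weight 3) and one low FP (weight 0.3)
// gives weightedFP = 3.3 and weightedPrecision = 8 / 11.3 ≈ 0.708, versus unweighted 8 / 10 = 0.8.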
323
480
  // Compute per-difficulty rates
324
481
  for (const d of Object.values(perDifficulty)) {
325
482
  d.detectionRate = d.total > 0 ? d.detected / d.total : 0;
@@ -356,6 +513,7 @@ export function computeLlmMetrics(rawCases, version, model, provider, promptMode
356
513
  precision,
357
514
  recall,
358
515
  f1Score,
516
+ weightedF1Score,
359
517
  detectionRate: rawCases.length > 0 ? totalDetected / rawCases.length : 0,
360
518
  perCategory,
361
519
  perJudge,
@@ -394,6 +552,9 @@ export function formatLlmSnapshotMarkdown(snapshot) {
394
552
  lines.push(`| Precision | ${pct(snapshot.precision)} |`);
395
553
  lines.push(`| Recall | ${pct(snapshot.recall)} |`);
396
554
  lines.push(`| F1 Score | ${pct(snapshot.f1Score)} |`);
555
+ if (snapshot.weightedF1Score !== null && snapshot.weightedF1Score !== undefined) {
556
+ lines.push(`| Weighted F1 | ${pct(snapshot.weightedF1Score)} |`);
557
+ }
397
558
  lines.push(`| True Positives | ${snapshot.truePositives} |`);
398
559
  lines.push(`| False Negatives | ${snapshot.falseNegatives} |`);
399
560
  lines.push(`| False Positives | ${snapshot.falsePositives} |`);
package/dist/config.js CHANGED
@@ -12,7 +12,7 @@ import { matchGlobPath } from "./tools/command-safety.js";
12
12
  export function expandEnvPlaceholders(content) {
13
13
  if (!content)
14
14
  return content;
15
- return content.replace(/\$\{([^}]+)\}/g, (_match, varName) => {
15
+ return content.replace(/\$\{([^}]{1,100})\}/g, (_match, varName) => {
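// Note: placeholder names longer than 100 characters no longer match and are left unexpanded;
// the bound looks like a guard against unbounded matches on hostile input.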
16
16
  const envVal = process.env[varName];
17
17
  return envVal !== undefined ? envVal : "";
18
18
  });
@@ -29,6 +29,23 @@ RULES FOR YOUR EVALUATION:
29
29
  - Flag any endpoint that accepts user input without verifying the caller's identity and permissions.
30
30
  - Score from 0-100 where 100 means robust auth implementation.
31
31
 
32
+ CLEAN CODE RECOGNITION (if ALL of the following are true, report ZERO findings):
33
+ - Authentication middleware protects all routes that handle user data or state changes.
34
+ - Passwords are hashed with bcrypt, scrypt, or argon2 — not stored in plaintext or weak hashes.
35
+ - JWTs are verified with explicit algorithm restrictions, expiration, and issuer/audience checks.
36
+ - Sessions use secure, httpOnly, sameSite cookies with proper expiration and rotation.
37
+ - OAuth/OIDC flows use PKCE, validate state parameters, and allowlist redirect URIs.
38
+ - API keys are transmitted in headers (not query params) and scoped to minimum permissions.
39
+ If the code meets these criteria, authentication is implemented correctly. Do NOT manufacture findings.
40
+
41
+ DOMAIN BOUNDARY (defer these to other judges):
42
+ - Injection attacks and XSS exploit paths → defer to CYBER judge.
43
+ - General security posture and cryptographic practices → defer to SEC judge.
44
+ - Rate limiting on login endpoints → defer to RATE judge (unless auth logic itself is broken).
45
+ - Error handling in auth flows → defer to ERR judge.
46
+ - Data privacy in auth tokens/logs → defer to DATA/LOGPRIV judges.
47
+ Only flag issues within YOUR domain: authentication middleware gaps, credential handling, token security, session management, authorization checks, OAuth/OIDC implementation, privilege escalation.
48
+
32
49
  FALSE POSITIVE AVOIDANCE:
33
50
  - Do NOT flag code that uses established authentication libraries (passport, next-auth, Spring Security, etc.) following their documented patterns.
34
51
  - JWT verification with explicit algorithm restrictions and proper expiration checks is correct implementation, not a vulnerability.
@@ -28,6 +28,24 @@ RULES FOR YOUR EVALUATION:
28
28
  - Reference OWASP, CWE IDs, and CVE IDs where applicable.
29
29
  - Score from 0-100 where 100 means no exploitable vulnerabilities found.
30
30
 
31
+ CLEAN CODE RECOGNITION (if ALL of the following are true, report ZERO findings):
32
+ - Input validation and sanitization are applied to user-controlled data before use in queries, commands, or output.
33
+ - Passwords are hashed with bcrypt, scrypt, or argon2 — not MD5/SHA1.
34
+ - Database queries use parameterized statements or an ORM with proper escaping.
35
+ - Security middleware is present (helmet, CORS, CSRF tokens) for web applications.
36
+ - Secrets are loaded from environment variables or a secrets manager, not hardcoded.
37
+ - Dependencies are imported from standard registries with version pinning.
38
+ - Error responses do not leak stack traces or internal details to clients.
39
+ If the code meets these criteria, it is implementing security correctly. Do NOT manufacture findings.
40
+
41
+ DOMAIN BOUNDARY (defer these to other judges):
42
+ - Rate limiting, throttling, and abuse prevention → defer to RATE judge.
43
+ - Authentication flows, session management, OAuth/OIDC → defer to AUTH judge.
44
+ - General security posture, defense-in-depth patterns → defer to SEC judge.
45
+ - Error handling completeness and error propagation → defer to ERR judge.
46
+ - Data privacy, PII handling, logging of sensitive data → defer to DATA/LOGPRIV judges.
47
+ Only flag issues within YOUR domain: injection attacks, XSS, CSRF/SSRF, dependency CVEs, cryptographic weaknesses, OWASP Top 10 violations with concrete exploit paths.
48
+
31
49
  FALSE POSITIVE AVOIDANCE:
32
50
  - Do NOT flag established security library usage (helmet, cors, bcrypt, argon2, parameterized queries) as security issues โ€” these ARE the correct patterns.
33
51
  - Code that properly validates input, uses HTTPS, and parameterizes queries is implementing security correctly.
@@ -29,6 +29,15 @@ RULES FOR YOUR EVALUATION:
29
29
  - Flag any code path that could throw without a handler in scope.
30
30
  - Score from 0-100 where 100 means robust error handling.
31
31
 
32
+ CLEAN CODE RECOGNITION (if ALL of the following are true, report ZERO findings):
33
+ - Try-catch blocks wrap code paths that can throw, with meaningful handling (log, re-throw, or recover).
34
+ - Async operations use try-catch or .catch() to handle rejections.
35
+ - Error responses return consistent structures with appropriate HTTP status codes.
36
+ - Resources (connections, file handles, streams) are cleaned up in finally blocks or using disposal patterns.
37
+ - Framework error middleware or global handlers are present (Express error middleware, Spring @ExceptionHandler, etc.).
38
+ - Stack traces and internal details are not exposed to end users in error responses.
39
+ If the code meets these criteria, error handling is implemented correctly. Do NOT manufacture findings.
40
+
32
41
  FALSE POSITIVE AVOIDANCE:
33
42
  - Do NOT flag error handling in code that delegates error handling to a framework (Express middleware, Spring @ExceptionHandler, etc.).
34
43
  - Try-catch with logging and re-throw is a valid error handling pattern, not a deficiency.
@@ -29,6 +29,15 @@ RULES FOR YOUR EVALUATION:
29
29
  - Consider both inbound (protecting your service) and outbound (respecting others') rate limits.
30
30
  - Score from 0-100 where 100 means comprehensive rate limiting.
31
31
 
32
+ CLEAN CODE RECOGNITION (if ALL of the following are true, report ZERO findings):
33
+ - Rate limiting middleware is applied to public-facing API endpoints (express-rate-limit, API gateway config, etc.).
34
+ - Request body size limits are configured (bodyParser limits, multer limits, etc.).
35
+ - List/query endpoints have pagination with enforced maximum page sizes.
36
+ - External API calls use bounded retries with exponential backoff and jitter.
37
+ - Connection pools and concurrent request limits are bounded.
38
+ If the code meets these criteria, rate limiting is implemented correctly. Do NOT manufacture findings.
39
+ IMPORTANT: CLI tools, data scripts, utility libraries, batch processors, and internal services do NOT need rate limiting. If the code is not a public-facing API or web server, report ZERO findings.
40
+
32
41
  FALSE POSITIVE AVOIDANCE:
33
42
  - Only flag rate-limiting issues in code that accepts external requests (APIs, WebSocket servers, public endpoints).
34
43
  - Do NOT flag internal services, batch processors, CLI tools, or cron jobs for missing rate limiting.
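A compact sketch of what "comprehensive" could look like for a public-facing Express API, per the criteria above; the specific limits, the `express-rate-limit` usage, and the retry helper are illustrative assumptions.

```ts
import express from "express";
import rateLimit from "express-rate-limit";

const app = express();

// Inbound protection: per-IP request limit and a bounded JSON body size.
app.use(rateLimit({ windowMs: 60_000, max: 100 }));
app.use(express.json({ limit: "256kb" }));

// List endpoint with an enforced maximum page size.
app.get("/items", (req, res) => {
  const pageSize = Math.min(Number(req.query.pageSize) || 20, 100);
  res.json({ pageSize, items: [] }); // data access omitted for brevity
});

// Outbound protection: bounded retries with exponential backoff and jitter.
export async function fetchWithRetry(url: string, attempts = 3): Promise<Response> {
  for (let attempt = 0; attempt < attempts; attempt++) {
    const response = await fetch(url);
    if (response.ok) return response;
    if (attempt < attempts - 1) {
      const backoffMs = 2 ** attempt * 200 + Math.random() * 100;
      await new Promise((resolve) => setTimeout(resolve, backoffMs));
    }
  }
  throw new Error(`Request to ${url} failed after ${attempts} attempts`);
}
```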
@@ -29,6 +29,24 @@ RULES FOR YOUR EVALUATION:
29
29
  - Reference CWE IDs where applicable.
30
30
  - Score from 0-100 where 100 means excellent security posture.
31
31
 
32
+ CLEAN CODE RECOGNITION (if ALL of the following are true, report ZERO findings):
33
+ - Security middleware is configured (helmet, CORS, CSRF protection) for web applications.
34
+ - User input is validated before use in data flows (queries, file ops, HTTP requests).
35
+ - Cryptographic operations use modern algorithms (AES-256, SHA-256+, bcrypt/argon2).
36
+ - Secrets are sourced from environment variables or a secrets manager, not hardcoded.
37
+ - Deserialization of untrusted data uses safe mechanisms (JSON.parse, not pickle/eval).
38
+ - JWT/token verification includes algorithm restrictions and expiration checks.
39
+ - No user-controlled URLs are used in redirects without validation.
40
+ If the code meets these criteria, it has a strong security posture. Do NOT manufacture findings.
41
+
42
+ DOMAIN BOUNDARY (defer these to other judges):
43
+ - Injection attacks (SQL, XSS, command injection) with exploit paths → defer to CYBER judge.
44
+ - Authentication flows, credential storage, session management → defer to AUTH judge.
45
+ - Rate limiting and abuse prevention → defer to RATE judge.
46
+ - Error handling patterns and error propagation → defer to ERR judge.
47
+ - Infrastructure-as-code security → defer to IAC judge.
48
+ Only flag issues within YOUR domain: insecure data flows, weak cryptography, missing security controls, unsafe deserialization, XML security, secret management, mass assignment, redirect validation.
49
+
32
50
  FALSE POSITIVE AVOIDANCE:
33
51
  - Do NOT flag code that uses established security libraries correctly (helmet, bcrypt, argon2, parameterized queries, CSRF tokens, rate limiters, proper TLS configuration).
34
52
  - Do NOT flag security controls in non-application code (CI/CD configs, IaC templates, documentation examples) unless they contain actual secrets or credentials.
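For the JWT and redirect criteria in particular, a hedged sketch using `jsonwebtoken` and a host allow-list; the secret handling mirrors the criteria above, while the function names and hosts are invented.

```ts
import jwt, { JwtPayload } from "jsonwebtoken";

// Secret sourced from the environment, never hardcoded.
const JWT_SECRET = process.env.JWT_SECRET ?? "";

export function verifyAccessToken(token: string): JwtPayload {
  // Restrict accepted algorithms; expiration is enforced by the library by default.
  return jwt.verify(token, JWT_SECRET, { algorithms: ["HS256"] }) as JwtPayload;
}

// Redirect validation: only allow known hosts, never a raw user-supplied URL.
const ALLOWED_REDIRECT_HOSTS = new Set(["example.com", "app.example.com"]);

export function safeRedirectTarget(rawUrl: string): string | null {
  try {
    const url = new URL(rawUrl);
    return ALLOWED_REDIRECT_HOSTS.has(url.hostname) ? url.toString() : null;
  } catch {
    return null; // not a parseable absolute URL
  }
}
```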
@@ -5,7 +5,7 @@ const SEVERITY_SET = new Set(["critical", "high", "medium", "low", "info"]);
5
5
  * Attempt to parse a JSON payload embedded in LLM output. Supports fenced code blocks and raw JSON.
6
6
  */
7
7
  function parseJsonBlock(text) {
8
- const fenceMatch = text.match(/```(?:json)?\s*([\s\S]*?)```/i);
8
+ const fenceMatch = text.match(/```(?:json)?[ \t]*\n([\s\S]*?)\n[ \t]*```/i) ?? text.match(/```(?:json)?[ \t]*([\s\S]*?)```/i);
9
9
  if (fenceMatch) {
10
10
  try {
11
11
  return JSON.parse(fenceMatch[1]);
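The tightened fence regex now prefers a newline-delimited code fence (the JSON body must sit on its own lines) and only falls back to the older, permissive match when that fails. A standalone sketch of the two patterns; the sample payloads are invented, and the fence marker is built with `repeat()` purely so this example contains no literal triple-backtick sequences.

```ts
const FENCE = "`".repeat(3); // three backticks, built programmatically

const strictFence = new RegExp(`${FENCE}(?:json)?[ \\t]*\\n([\\s\\S]*?)\\n[ \\t]*${FENCE}`, "i");
const looseFence = new RegExp(`${FENCE}(?:json)?[ \\t]*([\\s\\S]*?)${FENCE}`, "i");

const fenced = `${FENCE}json\n{ "score": 92, "findings": [] }\n${FENCE}`;
const inline = `${FENCE}json{ "score": 92 }${FENCE}`; // no newlines, so only the loose pattern matches

console.log(JSON.parse(fenced.match(strictFence)![1])); // { score: 92, findings: [] }
console.log(JSON.parse(inline.match(looseFence)![1]));  // { score: 92 }
```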
@@ -25,7 +25,7 @@ function normalizeRuleId(id) {
25
25
  return id.trim().toUpperCase();
26
26
  }
27
27
  function isValidRuleId(id, validPrefixes) {
28
- const match = id.match(/^([A-Z]{2,})-\d{3}$/);
28
+ const match = id.match(/^([A-Z][A-Z0-9]+)-\d{1,3}$/);
29
29
  if (!match)
30
30
  return false;
31
31
  return validPrefixes.has(match[1]);
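The relaxed pattern keeps the PREFIX-NNN shape but now allows digits inside the prefix and one to three digits in the rule number. A quick illustration (the specific IDs are made up):

```ts
const oldPattern = /^([A-Z]{2,})-\d{3}$/;
const newPattern = /^([A-Z][A-Z0-9]+)-\d{1,3}$/;

for (const id of ["CYBER-001", "SEC-7", "A11Y-42"]) {
  console.log(id, { old: oldPattern.test(id), updated: newPattern.test(id) });
}
// CYBER-001 -> matched by both
// SEC-7     -> only the updated pattern (1-3 digits instead of exactly 3)
// A11Y-42   -> only the updated pattern (digits allowed after the first letter)
```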
@@ -215,7 +215,15 @@ function countBySeverity(findings) {
215
215
  function compileExcludeRegexes(patterns) {
216
216
  if (!patterns || patterns.length === 0)
217
217
  return [];
218
- return patterns.map((pattern) => new RegExp(pattern, "i"));
218
+ return patterns.map((pattern) => {
219
+ try {
220
+ return new RegExp(pattern, "i");
221
+ }
222
+ catch {
223
+ // Invalid regex from user input โ€” treat as literal string match
224
+ return new RegExp(pattern.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"), "i");
225
+ }
226
+ });
219
227
  }
220
228
  function isLikelyNonProductionPath(path) {
221
229
  return /(^|\/)(test|tests|__tests__|spec|specs|e2e)(\/|\.|$)|\.(?:test|tests|spec|specs|e2e)\.[^/]+$|mock|fixture|fixtures|(^|\/)docs(-|\/)i18n(\/|$)|(^|\/)docs(\/|$)/i.test(path);
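The new try/catch means a user-supplied exclude pattern that is not a valid regular expression now degrades to a literal, case-insensitive match instead of throwing. A standalone sketch of that behaviour (the sample patterns are illustrative):

```ts
function compileExcludePattern(pattern: string): RegExp {
  try {
    return new RegExp(pattern, "i");
  } catch {
    // Invalid regex (e.g. an unbalanced parenthesis): escape it and match literally.
    return new RegExp(pattern.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"), "i");
  }
}

console.log(compileExcludePattern("legacy/.*\\.js$").test("legacy/old.js"));  // true, used as a real regex
console.log(compileExcludePattern("utils(").test("src/utils(v2)/index.ts"));  // true, treated as the literal "utils("
```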
@@ -25,7 +25,7 @@ export function parseSkillFrontmatter(raw) {
25
25
  i++;
26
26
  continue;
27
27
  }
28
- const kv = line.match(/^([a-zA-Z_][a-zA-Z0-9_-]*)\s*:\s*(.*)$/);
28
+ const kv = line.match(/^([a-zA-Z_][a-zA-Z0-9_-]*)[ \t]*:[ \t]*(.*)$/);
29
29
  if (!kv) {
30
30
  i++;
31
31
  continue;
@@ -64,9 +64,10 @@ export function parseSkillFrontmatter(raw) {
64
64
  if (typeof value === "string" && ((value.startsWith("[") && value.endsWith("]")) || value.includes(","))) {
65
65
  // simple array parsing: split on comma
66
66
  const normalized = value
67
- .replace(/^\s*\[/, "")
68
- .replace(/\]\s*$/, "")
69
- .split(/\s*,\s*/)
67
+ .replace(/^[ \t]*\[/, "")
68
+ .replace(/\][ \t]*$/, "")
69
+ .split(",")
70
+ .map((s) => s.trim())
70
71
  .filter(Boolean);
71
72
  value = normalized;
72
73
  }
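Splitting on a plain comma and trimming each item afterwards behaves like the old `/\s*,\s*/` split for the middle of a list, and additionally trims whitespace at the very start and end of the value. A small before/after sketch with an invented tags value:

```ts
const value = " api , security ,pricing "; // e.g. from `tags: [ api , security ,pricing ]` after bracket stripping

const oldItems = value.split(/\s*,\s*/).filter(Boolean);
const newItems = value.split(",").map((s) => s.trim()).filter(Boolean);

console.log(oldItems); // [" api", "security", "pricing "] (outer whitespace survives)
console.log(newItems); // ["api", "security", "pricing"]
```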
@@ -93,13 +94,15 @@ export function validateSkillFrontmatter(meta, sourcePath) {
93
94
  agents: Array.isArray(meta.agents)
94
95
  ? meta.agents
95
96
  : String(meta.agents ?? "")
96
- .split(/\s*,\s*/)
97
+ .split(",")
98
+ .map((s) => s.trim())
97
99
  .filter(Boolean),
98
100
  tags: Array.isArray(meta.tags)
99
101
  ? meta.tags
100
102
  : meta.tags
101
103
  ? String(meta.tags)
102
- .split(/\s*,\s*/)
104
+ .split(",")
105
+ .map((s) => s.trim())
103
106
  .filter(Boolean)
104
107
  : undefined,
105
108
  priority: meta.priority ? Number(meta.priority) : 10,
@@ -2,7 +2,9 @@ import type { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
2
2
  /** Adversarial evaluation stance — shared across all judges. */
3
3
  export declare const SHARED_ADVERSARIAL_MANDATE = "ADVERSARIAL MANDATE (applies to ALL judges):\n- Examine the code critically and look for genuine issues. Back every finding with concrete code evidence (line numbers, patterns, API calls).\n- Report only real problems, risks, and deficiencies that exist in the actual code.\n- If you are uncertain whether something is an issue, flag it only when you can cite specific code evidence (line numbers, patterns, API calls). Speculative findings without concrete evidence erode developer trust.\n- If no concrete issues are found after thorough analysis, report ZERO findings. An empty findings list is the correct output for well-written code.";
4
4
  /** Precision override — ensures evidence-based findings. */
5
- export declare const PRECISION_MANDATE = "PRECISION MANDATE (this section OVERRIDES the adversarial mandate whenever they conflict):\n- Every finding MUST cite specific code evidence: exact line numbers, API calls, variable names, or patterns. Findings without concrete evidence MUST be discarded \u2014 no exceptions.\n- Do NOT flag the absence of a feature or pattern unless you can identify the specific code location where it SHOULD have been implemented and explain WHY it is required for THIS code.\n- Speculative, hypothetical, or \"just in case\" findings erode developer trust. Only flag issues you are confident exist in the actual code.\n- Prefer fewer, high-confidence findings over many uncertain ones. Quality of findings matters more than quantity.\n- If the code is genuinely well-written with no real issues, reporting ZERO findings is the correct and expected behavior. Do not manufacture findings to avoid an empty report.\n- Clean, well-structured code exists. Acknowledge it by not forcing false issues.\n- RECOGNIZE SECURE PATTERNS: Code using established security libraries and patterns (e.g. helmet, bcrypt/argon2, parameterized queries, input validation, CSRF tokens, rate limiters, proper TLS) is correctly implementing security. Do NOT flag these as insufficient or suggest alternatives unless a concrete vulnerability exists.\n- SCOPE LIMITATION: Only evaluate code that is actually present. Do NOT flag missing features, tests, logging, documentation, error handling, or infrastructure that may exist in other files. Evaluate what IS provided, not what COULD be elsewhere.\n- CONFIDENCE THRESHOLD: Only report findings where you are highly confident (\u226580%) that a real, exploitable issue or concrete deficiency exists in the provided code. When in doubt, do NOT report.\n- FALSE POSITIVE COST: A false positive is MORE harmful than a missed finding. False positives erode developer trust and cause real issues to be ignored. When uncertain, silence is better than a questionable finding.";
5
+ export declare const PRECISION_MANDATE = "PRECISION MANDATE (this section OVERRIDES the adversarial mandate whenever they conflict):\n- Every finding MUST cite specific code evidence: exact line numbers, API calls, variable names, or patterns. Findings without concrete evidence MUST be discarded \u2014 no exceptions.\n- Do NOT flag the absence of a feature or pattern unless you can identify the specific code location where it SHOULD have been implemented and explain WHY it is required for THIS code.\n- Speculative, hypothetical, or \"just in case\" findings erode developer trust. Only flag issues you are confident exist in the actual code.\n- Prefer fewer, high-confidence findings over many uncertain ones. Quality of findings matters more than quantity.\n- If the code is genuinely well-written with no real issues, reporting ZERO findings is the correct and expected behavior. Do not manufacture findings to avoid an empty report.\n- Clean, well-structured code exists. Acknowledge it by not forcing false issues.\n- RECOGNIZE SECURE PATTERNS: Code using established security libraries and patterns (e.g. helmet, bcrypt/argon2, parameterized queries, input validation, CSRF tokens, rate limiters, proper TLS) is correctly implementing security. Do NOT flag these as insufficient or suggest alternatives unless a concrete vulnerability exists.\n- SCOPE LIMITATION: Only evaluate code that is actually present. Do NOT flag missing features, tests, logging, documentation, error handling, or infrastructure that may exist in other files. Evaluate what IS provided, not what COULD be elsewhere.\n- CONFIDENCE THRESHOLD: Only report findings where you are highly confident (\u226580%) that a real, exploitable issue or concrete deficiency exists in the provided code. When in doubt, do NOT report.\n- FALSE POSITIVE COST: A false positive is MORE harmful than a missed finding. False positives erode developer trust and cause real issues to be ignored. When uncertain, silence is better than a questionable finding.\n\nCOMMON FALSE POSITIVE PATTERNS (do NOT report these):\n- ERR: Do not flag error handling as inadequate when try/catch blocks, validation, or error middleware are present. Missing error handling in a utility function that is clearly called within a guarded context is NOT a finding.\n- LOGIC: Do not flag logic issues for standard patterns (early returns, guard clauses, switch/case with default). Only flag logic errors when you can demonstrate a concrete input that produces an incorrect output.\n- MAINT: Do not flag maintainability concerns for code that follows the language's established idioms. Complexity or length alone is NOT a finding unless it introduces a concrete maintenance burden.\n- SEC: Do not flag security issues when established security libraries (helmet, cors, bcrypt, parameterized queries) are correctly used. \"Could be stronger\" is NOT a vulnerability.\n- STRUCT: Do not flag code structure preferences (file organization, naming conventions) unless they create a concrete deficiency like circular dependencies or unreachable code.";
6
+ /** Clean code gate — explicit instructions when code quality is high. */
7
+ export declare const CLEAN_CODE_GATE = "CLEAN CODE GATE (applies AFTER individual judge evaluation):\n- Before reporting findings, assess the OVERALL quality of the code. If the code follows established conventions, uses appropriate patterns, handles errors, and has no concrete vulnerabilities or deficiencies, the expected output is ZERO findings across ALL judges.\n- Do NOT report stylistic preferences, alternative approaches, or \"nice to have\" improvements as findings. These are opinions, not defects.\n- Do NOT report findings about missing functionality that is likely in other files (tests, configs, middleware, error handlers, logging setup).\n- Do NOT report theoretical risks that require assumptions about the runtime environment, deployment configuration, or code outside the provided snippet.\n- SELF-CHECK before finalizing: For each finding, ask \"Would a senior engineer reviewing this code in a PR agree this must be fixed before merging?\" If the answer is not a clear YES, discard the finding.\n- The goal is to match what a thoughtful, experienced human reviewer would flag \u2014 not to demonstrate comprehensive knowledge of every possible concern.";
6
8
  /**
7
9
  * Extract only the unique evaluation criteria from a judge's systemPrompt,
8
10
  * stripping the persona introduction line, the ADVERSARIAL MANDATE block,
@@ -21,6 +23,5 @@ export declare function getCondensedCriteria(systemPrompt: string): string;
21
23
  /**
22
24
  * Register all MCP prompts on the given server:
23
25
  * - One per-judge prompt (`judge-{id}`) for single-persona deep reviews
24
- * - A `full-tribunal` prompt that convenes all judges at once
25
26
  */
26
27
  export declare function registerPrompts(server: McpServer): void;
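With the `full-tribunal` prompt gone, the exported mandate strings and `getCondensedCriteria` still make it possible to assemble a combined prompt on the client side. A hedged sketch; the import specifiers and the shape of the judge list are assumptions, not documented entry points.

```ts
// Assumed imports: adjust the paths to wherever the package actually exposes these symbols.
import {
  SHARED_ADVERSARIAL_MANDATE,
  PRECISION_MANDATE,
  CLEAN_CODE_GATE,
  getCondensedCriteria,
} from "@kevinrabun/judges/dist/prompts/prompts.js";
import { JUDGES } from "@kevinrabun/judges/dist/judges/index.js";

const preamble = [SHARED_ADVERSARIAL_MANDATE, PRECISION_MANDATE, CLEAN_CODE_GATE].join("\n\n");
const sections = JUDGES.map(
  (j) => `### ${j.name} (${j.domain}) [${j.rulePrefix}-]\n\n${getCondensedCriteria(j.systemPrompt)}`
).join("\n\n---\n\n");

export const combinedPrompt = `${preamble}\n\n${sections}`;
```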
@@ -2,18 +2,15 @@
2
2
  // Expose judge system prompts as MCP prompts so LLM-based clients can use
3
3
  // them for deeper, AI-powered analysis beyond pattern matching.
4
4
  //
5
- // Token-optimised: shared behavioural directives (adversarial mandate,
6
- // precision mandate) are stated ONCE in the tribunal preamble instead of
7
- // being duplicated across all 44 judges. Per-judge sections include only
8
- // the unique evaluation criteria, domain-specific rules, and FP-avoidance
9
- // guidance. This reduces the tribunal prompt by ~40 000 chars (~10 000
10
- // tokens) without removing any evaluation criteria.
5
+ // Each per-judge prompt includes shared behavioural directives (adversarial
6
+ // mandate, precision mandate, clean-code gate) plus the judge's unique
7
+ // evaluation criteria, domain-specific rules, and FP-avoidance guidance.
11
8
  // ────────────────────────────────────────────────────────────────────────────
12
9
  import { z } from "zod";
13
10
  import { JUDGES } from "../judges/index.js";
14
- // ─── Shared Behavioural Directives ─────────────────────────────────────────
15
- // Stated ONCE in the tribunal preamble so every judge benefits without
16
- // repeating the text 39 times.
11
+ // ─── Shared Behavioural Directives & Gates ────────────────────────────────
12
+ // Included in every per-judge prompt to ensure consistent evaluation
13
+ // behaviour across all judges.
17
14
  // ────────────────────────────────────────────────────────────────────────────
18
15
  /** Adversarial evaluation stance — shared across all judges. */
19
16
  export const SHARED_ADVERSARIAL_MANDATE = `ADVERSARIAL MANDATE (applies to ALL judges):
@@ -32,7 +29,22 @@ export const PRECISION_MANDATE = `PRECISION MANDATE (this section OVERRIDES the
32
29
  - RECOGNIZE SECURE PATTERNS: Code using established security libraries and patterns (e.g. helmet, bcrypt/argon2, parameterized queries, input validation, CSRF tokens, rate limiters, proper TLS) is correctly implementing security. Do NOT flag these as insufficient or suggest alternatives unless a concrete vulnerability exists.
33
30
  - SCOPE LIMITATION: Only evaluate code that is actually present. Do NOT flag missing features, tests, logging, documentation, error handling, or infrastructure that may exist in other files. Evaluate what IS provided, not what COULD be elsewhere.
34
31
  - CONFIDENCE THRESHOLD: Only report findings where you are highly confident (≥80%) that a real, exploitable issue or concrete deficiency exists in the provided code. When in doubt, do NOT report.
35
- - FALSE POSITIVE COST: A false positive is MORE harmful than a missed finding. False positives erode developer trust and cause real issues to be ignored. When uncertain, silence is better than a questionable finding.`;
32
+ - FALSE POSITIVE COST: A false positive is MORE harmful than a missed finding. False positives erode developer trust and cause real issues to be ignored. When uncertain, silence is better than a questionable finding.
33
+
34
+ COMMON FALSE POSITIVE PATTERNS (do NOT report these):
35
+ - ERR: Do not flag error handling as inadequate when try/catch blocks, validation, or error middleware are present. Missing error handling in a utility function that is clearly called within a guarded context is NOT a finding.
36
+ - LOGIC: Do not flag logic issues for standard patterns (early returns, guard clauses, switch/case with default). Only flag logic errors when you can demonstrate a concrete input that produces an incorrect output.
37
+ - MAINT: Do not flag maintainability concerns for code that follows the language's established idioms. Complexity or length alone is NOT a finding unless it introduces a concrete maintenance burden.
38
+ - SEC: Do not flag security issues when established security libraries (helmet, cors, bcrypt, parameterized queries) are correctly used. "Could be stronger" is NOT a vulnerability.
39
+ - STRUCT: Do not flag code structure preferences (file organization, naming conventions) unless they create a concrete deficiency like circular dependencies or unreachable code.`;
40
+ /** Clean code gate — explicit instructions when code quality is high. */
41
+ export const CLEAN_CODE_GATE = `CLEAN CODE GATE (applies AFTER individual judge evaluation):
42
+ - Before reporting findings, assess the OVERALL quality of the code. If the code follows established conventions, uses appropriate patterns, handles errors, and has no concrete vulnerabilities or deficiencies, the expected output is ZERO findings across ALL judges.
43
+ - Do NOT report stylistic preferences, alternative approaches, or "nice to have" improvements as findings. These are opinions, not defects.
44
+ - Do NOT report findings about missing functionality that is likely in other files (tests, configs, middleware, error handlers, logging setup).
45
+ - Do NOT report theoretical risks that require assumptions about the runtime environment, deployment configuration, or code outside the provided snippet.
46
+ - SELF-CHECK before finalizing: For each finding, ask "Would a senior engineer reviewing this code in a PR agree this must be fixed before merging?" If the answer is not a clear YES, discard the finding.
47
+ - The goal is to match what a thoughtful, experienced human reviewer would flag — not to demonstrate comprehensive knowledge of every possible concern.`;
36
48
  // ─── Criteria Extraction ─────────────────────────────────────────────────────
37
49
  /**
38
50
  * Extract only the unique evaluation criteria from a judge's systemPrompt,
@@ -73,13 +85,11 @@ export function getCondensedCriteria(systemPrompt) {
73
85
  /**
74
86
  * Register all MCP prompts on the given server:
75
87
  * - One per-judge prompt (`judge-{id}`) for single-persona deep reviews
76
- * - A `full-tribunal` prompt that convenes all judges at once
77
88
  */
78
89
  export function registerPrompts(server) {
79
90
  // ── Per-judge prompts ──────────────────────────────────────────────────────
80
- // Each prompt uses condensed criteria (adversarial mandate stripped) plus
81
- // the shared mandates, mirroring the tribunal architecture for consistency
82
- // and better precision on clean code.
91
+ // Each prompt uses condensed criteria plus the shared mandates for
92
+ // better precision on clean code.
83
93
  for (const judge of JUDGES) {
84
94
  server.prompt(`judge-${judge.id}`, `Use the ${judge.name} persona to perform a deep ${judge.domain} review of code. This prompt provides the judge's expert criteria for LLM-powered analysis that goes beyond pattern matching.`, {
85
95
  code: z.string().describe("The source code to evaluate"),
@@ -92,6 +102,7 @@ export function registerPrompts(server) {
92
102
  `${SHARED_ADVERSARIAL_MANDATE}\n\n` +
93
103
  `${PRECISION_MANDATE}\n\n` +
94
104
  `${criteria}\n\n` +
105
+ `${CLEAN_CODE_GATE}\n\n` +
95
106
  `Please evaluate the following ${language} code:\n\n\`\`\`${language}\n${code}\n\`\`\`` +
96
107
  (context ? `\n\nAdditional context: ${context}` : "") +
97
108
  `\n\nProvide your evaluation as structured findings with rule IDs (prefix: ${judge.rulePrefix}-), severity levels (critical/high/medium/low/info), descriptions, and actionable recommendations. If no issues meet the confidence threshold, report zero findings explicitly. End with an overall score (0-100) and verdict (pass/warning/fail).`;
@@ -108,41 +119,4 @@ export function registerPrompts(server) {
108
119
  };
109
120
  });
110
121
  }
111
- // ── Full tribunal prompt (token-optimised) ─────────────────────────────────
112
- // Shared directives (adversarial mandate, precision mandate) are stated
113
- // ONCE in the preamble. Each judge section includes only its unique
114
- // evaluation criteria, domain-specific rules, and FP-avoidance guidance.
115
- server.prompt("full-tribunal", `Convene the full Judges Panel โ€” all ${JUDGES.length} judges evaluate the code in their respective domains and produce a combined verdict.`, {
116
- code: z.string().describe("The source code to evaluate"),
117
- language: z.string().describe("The programming language"),
118
- context: z.string().optional().describe("Additional context about the code"),
119
- }, async ({ code, language, context }) => {
120
- const judgeInstructions = JUDGES.map((j) => `### ${j.name} — ${j.domain}\n**Rule prefix:** \`${j.rulePrefix}-\`\n\n${getCondensedCriteria(j.systemPrompt)}`).join("\n\n---\n\n");
121
- const userMessage = `You are the Judges Panel — a panel of ${JUDGES.length} expert judges who independently evaluate code for quality, security, and operational readiness.\n\n` +
122
- `## Universal Evaluation Directives\n\n` +
123
- `${SHARED_ADVERSARIAL_MANDATE}\n\n` +
124
- `${PRECISION_MANDATE}\n\n` +
125
- `## Evaluation Instructions\n\n` +
126
- `Evaluate the following ${language} code from the perspective of ALL ${JUDGES.length} judges below. For each judge, provide:\n` +
127
- `1. Judge name and domain\n` +
128
- `2. Verdict (PASS / WARNING / FAIL)\n` +
129
- `3. Score (0-100)\n` +
130
- `4. Specific findings with rule IDs (using each judge's rule prefix), severity, and recommendations\n\n` +
131
- `For judges where no issues meet the confidence threshold, report a PASS verdict with zero findings.\n\n` +
132
- `Then provide an OVERALL TRIBUNAL VERDICT that synthesizes all judges' input.\n\n` +
133
- `## The Judges\n\n${judgeInstructions}\n\n` +
134
- `## Code to Evaluate\n\n\`\`\`${language}\n${code}\n\`\`\`` +
135
- (context ? `\n\n## Additional Context\n${context}` : "");
136
- return {
137
- messages: [
138
- {
139
- role: "user",
140
- content: {
141
- type: "text",
142
- text: userMessage,
143
- },
144
- },
145
- ],
146
- };
147
- });
148
122
  }
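Since the server no longer registers a combined prompt, an MCP client now requests individual `judge-{id}` prompts. A hedged sketch using the TypeScript MCP SDK client over stdio; the client name, launch command, and sample arguments are illustrative.

```ts
import { Client } from "@modelcontextprotocol/sdk/client/index.js";
import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js";

const transport = new StdioClientTransport({ command: "npx", args: ["-y", "@kevinrabun/judges"] });
const client = new Client({ name: "example-client", version: "1.0.0" });
await client.connect(transport);

const { prompts } = await client.listPrompts(); // judge-cybersecurity, judge-data-security, ...
console.log(prompts.map((p) => p.name));

const result = await client.getPrompt({
  name: "judge-cybersecurity",
  arguments: { code: "const query = `SELECT * FROM users WHERE id = ${id}`;", language: "typescript" },
});
console.log(result.messages[0].content); // the composed persona + mandates + code to review
```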
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@kevinrabun/judges",
3
- "version": "3.121.0",
3
+ "version": "3.123.0",
4
4
  "description": "45 specialized judges that evaluate AI-generated code for security, cost, and quality.",
5
5
  "mcpName": "io.github.KevinRabun/judges",
6
6
  "type": "module",
@@ -145,7 +145,7 @@
145
145
  "zod": "^4.3.6"
146
146
  },
147
147
  "devDependencies": {
148
- "@anthropic-ai/sdk": "^0.79.0",
148
+ "@anthropic-ai/sdk": "^0.80.0",
149
149
  "@eslint/js": "^10.0.1",
150
150
  "@types/node": "^25.3.0",
151
151
  "@typescript-eslint/eslint-plugin": "^8.56.1",
package/server.json CHANGED
@@ -3,16 +3,25 @@
3
3
  "name": "io.github.KevinRabun/judges",
4
4
  "title": "Judges Panel",
5
5
  "description": "45 judges that evaluate AI-generated code for security, cost, and quality with built-in AST.",
6
+ "websiteUrl": "https://kevinrabun.github.io/judges/",
6
7
  "repository": {
7
- "url": "https://github.com/kevinrabun/judges",
8
- "source": "github"
8
+ "url": "https://github.com/KevinRabun/judges",
9
+ "source": "github",
10
+ "id": "1161966307"
9
11
  },
10
- "version": "3.121.0",
12
+ "icons": [
13
+ {
14
+ "src": "https://raw.githubusercontent.com/KevinRabun/judges/main/vscode-extension/icon.png",
15
+ "sizes": ["128x128"],
16
+ "mimeType": "image/png"
17
+ }
18
+ ],
19
+ "version": "3.123.0",
11
20
  "packages": [
12
21
  {
13
22
  "registryType": "npm",
14
23
  "identifier": "@kevinrabun/judges",
15
- "version": "3.121.0",
24
+ "version": "3.123.0",
16
25
  "transport": {
17
26
  "type": "stdio"
18
27
  }
@@ -44,7 +44,7 @@ export function parseSkillFrontmatter(raw: string): { meta: SkillMeta; body: str
44
44
  i++;
45
45
  continue;
46
46
  }
47
- const kv = line.match(/^([a-zA-Z_][a-zA-Z0-9_-]*)\s*:\s*(.*)$/);
47
+ const kv = line.match(/^([a-zA-Z_][a-zA-Z0-9_-]*)[ \t]*:[ \t]*(.*)$/);
48
48
  if (!kv) {
49
49
  i++;
50
50
  continue;
@@ -85,9 +85,10 @@ export function parseSkillFrontmatter(raw: string): { meta: SkillMeta; body: str
85
85
  if (typeof value === "string" && ((value.startsWith("[") && value.endsWith("]")) || value.includes(","))) {
86
86
  // simple array parsing: split on comma
87
87
  const normalized = (value as string)
88
- .replace(/^\s*\[/, "")
89
- .replace(/\]\s*$/, "")
90
- .split(/\s*,\s*/)
88
+ .replace(/^[ \t]*\[/, "")
89
+ .replace(/\][ \t]*$/, "")
90
+ .split(",")
91
+ .map((s) => s.trim())
91
92
  .filter(Boolean);
92
93
  value = normalized;
93
94
  } else if (
@@ -117,13 +118,15 @@ export function validateSkillFrontmatter(meta: SkillMeta, sourcePath: string): S
117
118
  agents: Array.isArray(meta.agents)
118
119
  ? (meta.agents as string[])
119
120
  : String(meta.agents ?? "")
120
- .split(/\s*,\s*/)
121
+ .split(",")
122
+ .map((s) => s.trim())
121
123
  .filter(Boolean),
122
124
  tags: Array.isArray(meta.tags)
123
125
  ? (meta.tags as string[])
124
126
  : meta.tags
125
127
  ? String(meta.tags)
126
- .split(/\s*,\s*/)
128
+ .split(",")
129
+ .map((s) => s.trim())
127
130
  .filter(Boolean)
128
131
  : undefined,
129
132
  priority: meta.priority ? Number(meta.priority) : 10,