@kevinrabun/judges-cli 3.127.3 → 3.128.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,12 @@
1
+ /**
2
+ * Multi-file benchmark scenarios — cross-file vulnerability patterns.
3
+ *
4
+ * These test cases represent real-world patterns where security issues
5
+ * span multiple files (auth middleware in one file, unprotected routes
6
+ * in another; database config in one file, raw queries in another; etc.)
7
+ *
8
+ * Each case has a primary `code` file and additional `files` for context.
9
+ * The benchmark runner uses `evaluateProject()` for these cases when `files` is non-empty; its `evaluateWithJudge` branch still evaluates only `code`.
10
+ */
11
+ import type { BenchmarkCase } from "./benchmark.js";
12
+ export declare const BENCHMARK_MULTI_FILE: BenchmarkCase[];
@@ -0,0 +1,427 @@
1
+ /**
2
+ * Multi-file benchmark scenarios — cross-file vulnerability patterns.
3
+ *
4
+ * These test cases represent real-world patterns where security issues
5
+ * span multiple files (auth middleware in one file, unprotected routes
6
+ * in another; database config in one file, raw queries in another; etc.)
7
+ *
8
+ * Each case has a primary `code` file and additional `files` for context.
9
+ * The benchmark runner uses `evaluateProject()` for these cases when `files` is non-empty; its `evaluateWithJudge` branch still evaluates only `code`.
10
+ */
11
+ export const BENCHMARK_MULTI_FILE = [
12
+ // ─── Auth middleware defined but not applied to all routes ──────────────
13
+ {
14
+ id: "multi-auth-middleware-not-applied",
15
+ description: "Auth middleware exists in middleware.ts but sensitive routes in api.ts don't use it",
16
+ language: "typescript",
17
+ code: `// api.ts — routes without auth middleware
18
+ import express from "express";
19
+ import { db } from "./db";
20
+
21
+ const router = express.Router();
22
+
23
+ // Public endpoints (fine)
24
+ router.get("/health", (req, res) => res.json({ ok: true }));
25
+
26
+ // SENSITIVE: these should require auth but don't use requireAuth
27
+ router.get("/api/users", async (req, res) => {
28
+ const users = await db.query("SELECT id, name, email FROM users");
29
+ res.json(users);
30
+ });
31
+
32
+ router.delete("/api/users/:id", async (req, res) => {
33
+ await db.query("DELETE FROM users WHERE id = $1", [req.params.id]);
34
+ res.json({ deleted: true });
35
+ });
36
+
37
+ router.get("/api/admin/settings", async (req, res) => {
38
+ const settings = await db.query("SELECT * FROM settings");
39
+ res.json(settings);
40
+ });
41
+
42
+ export default router;`,
43
+ files: [
44
+ {
45
+ path: "middleware.ts",
46
+ content: `import jwt from "jsonwebtoken";
47
+
48
+ export function requireAuth(req, res, next) {
49
+ const token = req.headers.authorization?.replace("Bearer ", "");
50
+ if (!token) return res.status(401).json({ error: "Unauthorized" });
51
+ try {
52
+ req.user = jwt.verify(token, process.env.JWT_SECRET, { algorithms: ["HS256"] });
53
+ next();
54
+ } catch {
55
+ res.status(401).json({ error: "Invalid token" });
56
+ }
57
+ }
58
+
59
+ export function requireAdmin(req, res, next) {
60
+ if (req.user?.role !== "admin") return res.status(403).json({ error: "Forbidden" });
61
+ next();
62
+ }`,
63
+ language: "typescript",
64
+ },
65
+ {
66
+ path: "db.ts",
67
+ content: `import { Pool } from "pg";
68
+ export const db = new Pool({ connectionString: process.env.DATABASE_URL });`,
69
+ language: "typescript",
70
+ },
71
+ ],
72
+ expectedRuleIds: ["AUTH-001", "AUTH-002"],
73
+ acceptablePrefixes: ["SEC", "CYBER", "API"],
74
+ category: "auth",
75
+ difficulty: "hard",
76
+ },
77
+ // ─── Database connection without pooling, queries in separate file ─────
78
+ {
79
+ id: "multi-db-no-pool-separate-files",
80
+ description: "Database opens new connection per query; queries in separate service file",
81
+ language: "typescript",
82
+ code: `// user-service.ts — queries using per-request connections
83
+ import { getConnection } from "./database";
84
+
85
+ export async function getUsers(search?: string) {
86
+ const conn = await getConnection();
87
+ const query = search
88
+ ? \`SELECT * FROM users WHERE name LIKE '%\${search}%'\`
89
+ : "SELECT * FROM users";
90
+ const result = await conn.query(query);
91
+ // connection never closed
92
+ return result.rows;
93
+ }
94
+
95
+ export async function updateUser(id: string, data: any) {
96
+ const conn = await getConnection();
97
+ await conn.query(\`UPDATE users SET name = '\${data.name}' WHERE id = '\${id}'\`);
98
+ }`,
99
+ files: [
100
+ {
101
+ path: "database.ts",
102
+ content: `import { Client } from "pg";
103
+
104
+ // Creates a NEW connection every time — no pooling
105
+ export async function getConnection() {
106
+ const client = new Client({ connectionString: process.env.DATABASE_URL });
107
+ await client.connect();
108
+ return client;
109
+ }`,
110
+ language: "typescript",
111
+ },
112
+ ],
113
+ expectedRuleIds: ["DB-001", "CYBER-001", "SEC-001"],
114
+ acceptablePrefixes: ["PERF", "SCALE", "REL"],
115
+ category: "database",
116
+ difficulty: "medium",
117
+ },
118
+ // ─── Hardcoded secrets in config.ts imported by multiple modules ───────
119
+ {
120
+ id: "multi-config-secrets-spread",
121
+ description: "Secrets hardcoded in config, imported across service files",
122
+ language: "typescript",
123
+ code: `// payment-service.ts — imports secrets from config
124
+ import { config } from "./config";
125
+ import Stripe from "stripe";
126
+
127
+ const stripe = new Stripe(config.STRIPE_SECRET_KEY);
128
+
129
+ export async function chargeCustomer(customerId: string, amount: number) {
130
+ return stripe.charges.create({
131
+ amount,
132
+ currency: "usd",
133
+ customer: customerId,
134
+ });
135
+ }`,
136
+ files: [
137
+ {
138
+ path: "config.ts",
139
+ content: `// DANGER: hardcoded secrets
140
+ export const config = {
141
+ STRIPE_SECRET_KEY: "sk_test_FAKE_KEY_FOR_BENCHMARK_ONLY_000000",
142
+ JWT_SECRET: "super-secret-jwt-key-2024",
143
+ DATABASE_URL: "postgres://admin:password123@db.prod.internal:5432/maindb",
144
+ AWS_ACCESS_KEY: "AKIA_FAKE_KEY_FOR_BENCHMARK",
145
+ AWS_SECRET_KEY: "FAKE_SECRET_KEY_FOR_BENCHMARK_TESTING_ONLY",
146
+ SENDGRID_API_KEY: "SG.FAKE_KEY_FOR_BENCHMARK_ONLY",
147
+ };`,
148
+ language: "typescript",
149
+ },
150
+ {
151
+ path: "email-service.ts",
152
+ content: `import { config } from "./config";
153
+ import sgMail from "@sendgrid/mail";
154
+
155
+ sgMail.setApiKey(config.SENDGRID_API_KEY);
156
+
157
+ export function sendWelcome(email: string) {
158
+ return sgMail.send({ to: email, from: "noreply@app.com", subject: "Welcome", text: "Hello!" });
159
+ }`,
160
+ language: "typescript",
161
+ },
162
+ ],
163
+ expectedRuleIds: ["AUTH-001", "DATA-001", "CYBER-001"],
164
+ acceptablePrefixes: ["SEC", "CFG"],
165
+ category: "data-security",
166
+ difficulty: "easy",
167
+ },
168
+ // ─── Error handler defined but not wired into Express app ──────────────
169
+ {
170
+ id: "multi-error-handler-not-wired",
171
+ description: "Error handler middleware defined but never registered in the Express app",
172
+ language: "typescript",
173
+ code: `// app.ts — Express app without error handler
174
+ import express from "express";
175
+ import userRoutes from "./routes/users";
176
+ import orderRoutes from "./routes/orders";
177
+
178
+ const app = express();
179
+ app.use(express.json());
180
+
181
+ app.use("/api/users", userRoutes);
182
+ app.use("/api/orders", orderRoutes);
183
+ // Missing: app.use(errorHandler)
184
+
185
+ app.listen(3000);`,
186
+ files: [
187
+ {
188
+ path: "middleware/error-handler.ts",
189
+ content: `export function errorHandler(err, req, res, next) {
190
+ console.error(err.stack);
191
+ const status = err.statusCode || 500;
192
+ res.status(status).json({
193
+ error: { message: err.message, code: err.code || "INTERNAL_ERROR" },
194
+ });
195
+ }`,
196
+ language: "typescript",
197
+ },
198
+ {
199
+ path: "routes/users.ts",
200
+ content: `import { Router } from "express";
201
+ const router = Router();
202
+ router.get("/", async (req, res) => {
203
+ const users = await db.query("SELECT * FROM users");
204
+ res.json(users); // throws on DB error — no try/catch, no error middleware
205
+ });
206
+ export default router;`,
207
+ language: "typescript",
208
+ },
209
+ ],
210
+ expectedRuleIds: ["ERR-001"],
211
+ acceptablePrefixes: ["REL", "FW", "SWDEV"],
212
+ category: "error-handling",
213
+ difficulty: "hard",
214
+ },
215
+ // ─── Rate limiter on auth but not on API data endpoints ────────────────
216
+ {
217
+ id: "multi-rate-limit-partial",
218
+ description: "Rate limiting on login endpoint but missing on data-heavy API",
219
+ language: "typescript",
220
+ code: `// routes/data.ts — no rate limiting on expensive queries
221
+ import { Router } from "express";
222
+
223
+ const router = Router();
224
+
225
+ router.get("/api/reports", async (req, res) => {
226
+ // Expensive aggregation — no rate limit, no pagination
227
+ const report = await db.query(\`
228
+ SELECT u.name, COUNT(o.id) as order_count, SUM(o.total) as total_spent
229
+ FROM users u JOIN orders o ON u.id = o.user_id
230
+ GROUP BY u.name ORDER BY total_spent DESC
231
+ \`);
232
+ res.json(report.rows);
233
+ });
234
+
235
+ router.get("/api/export", async (req, res) => {
236
+ // Full table dump — no limit
237
+ const all = await db.query("SELECT * FROM transactions");
238
+ res.json(all.rows);
239
+ });
240
+
241
+ export default router;`,
242
+ files: [
243
+ {
244
+ path: "routes/auth.ts",
245
+ content: `import { Router } from "express";
246
+ import rateLimit from "express-rate-limit";
247
+
248
+ const loginLimiter = rateLimit({ windowMs: 15 * 60 * 1000, max: 5 });
249
+ const router = Router();
250
+
251
+ router.post("/login", loginLimiter, async (req, res) => {
252
+ const { email, password } = req.body;
253
+ const user = await authenticate(email, password);
254
+ res.json({ token: generateToken(user) });
255
+ });
256
+
257
+ export default router;`,
258
+ language: "typescript",
259
+ },
260
+ ],
261
+ expectedRuleIds: ["RATE-001", "DB-001"],
262
+ acceptablePrefixes: ["PERF", "SCALE", "SEC", "COST"],
263
+ category: "rate-limiting",
264
+ difficulty: "medium",
265
+ },
266
+ // ─── Logging PII in error handler, sanitized in normal flow ────────────
267
+ {
268
+ id: "multi-pii-leak-error-path",
269
+ description: "User data sanitized in normal responses but leaked in error logs",
270
+ language: "typescript",
271
+ code: `// services/user-service.ts — handles users correctly in normal flow
272
+ import { logger } from "./logger";
273
+
274
+ export async function processUserUpdate(userId: string, updates: any) {
275
+ try {
276
+ const user = await db.findUser(userId);
277
+ if (!user) throw new Error("User not found");
278
+ await db.updateUser(userId, sanitize(updates));
279
+ return { success: true };
280
+ } catch (error) {
281
+ // LEAK: full user object + updates (may contain SSN, password) in error log
282
+ logger.error("Update failed", { userId, user: await db.findUser(userId), updates, error });
283
+ throw error;
284
+ }
285
+ }
286
+
287
+ function sanitize(data: any) {
288
+ const { ssn, password, creditCard, ...safe } = data;
289
+ return safe;
290
+ }`,
291
+ files: [
292
+ {
293
+ path: "logger.ts",
294
+ content: `import winston from "winston";
295
+
296
+ export const logger = winston.createLogger({
297
+ level: "info",
298
+ format: winston.format.json(),
299
+ transports: [
300
+ new winston.transports.File({ filename: "error.log", level: "error" }),
301
+ new winston.transports.File({ filename: "combined.log" }),
302
+ new winston.transports.Console(),
303
+ ],
304
+ });`,
305
+ language: "typescript",
306
+ },
307
+ ],
308
+ expectedRuleIds: ["LOGPRIV-001", "DATA-001"],
309
+ acceptablePrefixes: ["SEC", "COMP", "ERR"],
310
+ category: "logging-privacy",
311
+ difficulty: "hard",
312
+ },
313
+ // ─── Clean multi-file: well-structured Express app ─────────────────────
314
+ {
315
+ id: "clean-multi-file-express-app",
316
+ description: "Well-structured Express app with auth, error handling, rate limiting, and proper logging",
317
+ language: "typescript",
318
+ code: `// app.ts — well-organized Express application
319
+ import express from "express";
320
+ import helmet from "helmet";
321
+ import cors from "cors";
322
+ import { authRouter } from "./routes/auth";
323
+ import { apiRouter } from "./routes/api";
324
+ import { requireAuth } from "./middleware/auth";
325
+ import { errorHandler } from "./middleware/error-handler";
326
+ import { requestLogger } from "./middleware/logger";
327
+
328
+ const app = express();
329
+
330
+ app.use(helmet());
331
+ app.use(cors({ origin: process.env.ALLOWED_ORIGINS?.split(",") }));
332
+ app.use(express.json({ limit: "10kb" }));
333
+ app.use(requestLogger);
334
+
335
+ app.use("/auth", authRouter);
336
+ app.use("/api", requireAuth, apiRouter);
337
+
338
+ app.use(errorHandler);
339
+
340
+ export default app;`,
341
+ files: [
342
+ {
343
+ path: "middleware/auth.ts",
344
+ content: `import jwt from "jsonwebtoken";
345
+ export function requireAuth(req, res, next) {
346
+ const token = req.headers.authorization?.replace("Bearer ", "");
347
+ if (!token) return res.status(401).json({ error: "Authentication required" });
348
+ try {
349
+ req.user = jwt.verify(token, process.env.JWT_SECRET, { algorithms: ["HS256"] });
350
+ next();
351
+ } catch { res.status(401).json({ error: "Invalid token" }); }
352
+ }`,
353
+ language: "typescript",
354
+ },
355
+ {
356
+ path: "middleware/error-handler.ts",
357
+ content: `export function errorHandler(err, req, res, next) {
358
+ const status = err.statusCode || 500;
359
+ res.status(status).json({ error: { message: status === 500 ? "Internal error" : err.message } });
360
+ }`,
361
+ language: "typescript",
362
+ },
363
+ ],
364
+ expectedRuleIds: [],
365
+ category: "clean",
366
+ difficulty: "hard",
367
+ },
368
+ // ─── Clean multi-file: Python FastAPI with dependency injection ────────
369
+ {
370
+ id: "clean-multi-file-fastapi",
371
+ description: "Well-structured FastAPI app with auth deps, connection pooling, and error handling",
372
+ language: "python",
373
+ code: `# main.py
374
+ from fastapi import FastAPI, Depends, HTTPException
375
+ from .auth import get_current_user
376
+ from .database import get_db
377
+ from .models import UserResponse
378
+
379
+ app = FastAPI()
380
+
381
+ @app.get("/api/users/me", response_model=UserResponse)
382
+ async def get_me(user = Depends(get_current_user)):
383
+ return user
384
+
385
+ @app.get("/api/users/{user_id}", response_model=UserResponse)
386
+ async def get_user(user_id: int, db = Depends(get_db), _ = Depends(get_current_user)):
387
+ user = await db.fetch_one("SELECT id, name, email FROM users WHERE id = $1", user_id)
388
+ if not user:
389
+ raise HTTPException(status_code=404, detail="User not found")
390
+ return user`,
391
+ files: [
392
+ {
393
+ path: "auth.py",
394
+ content: `from fastapi import Depends, HTTPException
395
+ from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
396
+ import jwt, os
397
+
398
+ security = HTTPBearer()
399
+
400
+ async def get_current_user(creds: HTTPAuthorizationCredentials = Depends(security)):
401
+ try:
402
+ payload = jwt.decode(creds.credentials, os.environ["JWT_SECRET"], algorithms=["HS256"])
403
+ return payload
404
+ except jwt.InvalidTokenError:
405
+ raise HTTPException(status_code=401, detail="Invalid token")`,
406
+ language: "python",
407
+ },
408
+ {
409
+ path: "database.py",
410
+ content: `import asyncpg, os
411
+
412
+ pool = None
413
+
414
+ async def get_db():
415
+ global pool
416
+ if pool is None:
417
+ pool = await asyncpg.create_pool(os.environ["DATABASE_URL"], min_size=5, max_size=20)
418
+ async with pool.acquire() as conn:
419
+ yield conn`,
420
+ language: "python",
421
+ },
422
+ ],
423
+ expectedRuleIds: [],
424
+ category: "clean",
425
+ difficulty: "hard",
426
+ },
427
+ ];
@@ -38,6 +38,17 @@ export interface BenchmarkCase {
38
38
  difficulty: "easy" | "medium" | "hard";
39
39
  /** AI model/tool that generated this code (e.g. "gpt-4", "claude", "copilot") */
40
40
  aiSource?: string;
41
+ /**
42
+ * Multi-file scenario. When present, the case represents a cross-file
43
+ * vulnerability pattern. The `code` field serves as the primary file,
44
+ * and `files` provides additional project files for context.
45
+ * When `files` is non-empty, the benchmark runner uses `evaluateProject()` instead of single-file eval; its `evaluateWithJudge` branch still evaluates only `code`.
46
+ */
47
+ files?: Array<{
48
+ path: string;
49
+ content: string;
50
+ language: string;
51
+ }>;
41
52
  }
42
53
  export interface BenchmarkResult {
43
54
  /** Timestamp of run */
@@ -12,7 +12,7 @@
12
12
  */
13
13
  import { existsSync, readFileSync, writeFileSync, mkdirSync } from "fs";
14
14
  import { resolve, dirname } from "path";
15
- import { evaluateWithTribunal, evaluateWithJudge } from "../evaluators/index.js";
15
+ import { evaluateWithTribunal, evaluateWithJudge, evaluateProject } from "../evaluators/index.js";
16
16
  import { getJudge, JUDGES } from "../judges/index.js";
17
17
  import { formatLlmSnapshotMarkdown, formatLayerComparisonMarkdown } from "./llm-benchmark.js";
18
18
  import { EXPANDED_BENCHMARK_CASES } from "./benchmark-expanded.js";
@@ -25,6 +25,8 @@ import { BENCHMARK_COMPLIANCE_ETHICS } from "./benchmark-compliance-ethics.js";
25
25
  import { BENCHMARK_AI_AGENTS } from "./benchmark-ai-agents.js";
26
26
  import { BENCHMARK_ADVANCED_CASES } from "./benchmark-advanced.js";
27
27
  import { BENCHMARK_AI_OUTPUT } from "./benchmark-ai-output.js";
28
+ import { BENCHMARK_COVERAGE_GAPS } from "./benchmark-coverage-gaps.js";
29
+ import { BENCHMARK_MULTI_FILE } from "./benchmark-multi-file.js";
28
30
  // ─── Built-in Benchmark Cases ───────────────────────────────────────────────
29
31
  export const BENCHMARK_CASES = [
30
32
  // ── SQL Injection ──
@@ -2203,6 +2205,8 @@ export function UserList({ users, onSelect, searchLabel = "Search users" }: User
2203
2205
  ...BENCHMARK_AI_AGENTS,
2204
2206
  ...BENCHMARK_ADVANCED_CASES,
2205
2207
  ...BENCHMARK_AI_OUTPUT,
2208
+ ...BENCHMARK_COVERAGE_GAPS,
2209
+ ...BENCHMARK_MULTI_FILE,
2206
2210
  ];
2207
2211
  // ─── Benchmark Runner ───────────────────────────────────────────────────────
2208
2212
  export function runBenchmarkSuite(cases, judgeId) {
@@ -2227,6 +2231,12 @@ export function runBenchmarkSuite(cases, judgeId) {
2227
2231
  const evaluation = evaluateWithJudge(judge, tc.code, tc.language);
2228
2232
  findings = evaluation.findings;
2229
2233
  }
2234
+ else if (tc.files && tc.files.length > 0) {
2235
+ // Multi-file scenario — use project evaluation with cross-file analysis
2236
+ const projectFiles = [{ path: tc.id + "." + tc.language, content: tc.code, language: tc.language }, ...tc.files];
2237
+ const projectVerdict = evaluateProject(projectFiles);
2238
+ findings = projectVerdict.fileResults.flatMap((f) => f.findings);
2239
+ }
2230
2240
  else {
2231
2241
  const verdict = evaluateWithTribunal(tc.code, tc.language);
2232
2242
  findings = verdict.findings;
@@ -376,11 +376,12 @@ function getFpReason(finding, lines, isIaC, fileCategory, filePath) {
376
376
  if (finding.isAbsenceBased && fileCategory === "types") {
377
377
  return "Absence-based rule does not apply to pure type-definition files — no runtime logic to evaluate.";
378
378
  }
379
- // ── 2d. Benchmark CLI files: SEC/HALLU on embedded code specimens ──
379
+ // ── 2d. Benchmark files: findings on embedded code specimens ──
380
380
  // Benchmark files in the commands/ directory contain intentional
381
381
  // vulnerable-code snippets embedded as template literal strings. These
382
- // are test data, not real vulnerabilities.
383
- if (fileCategory === "cli" && filePath && /benchmark/i.test(filePath) && /^(?:SEC|HALLU)-/.test(finding.ruleId)) {
382
+ // are test data, not real vulnerabilities. Suppress ALL findings when
383
+ // the file path matches /benchmark/i (any directory, any rule) and the file has >= 5 template literals of 50+ characters.
384
+ if (filePath && /benchmark/i.test(filePath)) {
384
385
  const codeText = lines.join("\n");
385
386
  const templateLiteralCount = (codeText.match(/`[^`]{50,}/g) || []).length;
386
387
  if (templateLiteralCount >= 5) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@kevinrabun/judges-cli",
3
- "version": "3.127.3",
3
+ "version": "3.128.1",
4
4
  "description": "CLI wrapper for the Judges code review toolkit.",
5
5
  "type": "module",
6
6
  "main": "dist/cli.js",