@kevinrabun/judges-cli 3.127.2 → 3.128.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/benchmark-coverage-gaps.d.ts +11 -0
- package/dist/commands/benchmark-coverage-gaps.js +910 -0
- package/dist/commands/benchmark-multi-file.d.ts +12 -0
- package/dist/commands/benchmark-multi-file.js +427 -0
- package/dist/commands/benchmark.d.ts +11 -0
- package/dist/commands/benchmark.js +11 -1
- package/dist/evaluators/false-positive-review.js +4 -3
- package/package.json +1 -1
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Multi-file benchmark scenarios — cross-file vulnerability patterns.
|
|
3
|
+
*
|
|
4
|
+
* These test cases represent real-world patterns where security issues
|
|
5
|
+
* span multiple files (auth middleware in one file, unprotected routes
|
|
6
|
+
* in another; database config in one file, raw queries in another; etc.)
|
|
7
|
+
*
|
|
8
|
+
* Each case has a primary `code` file and additional `files` for context.
|
|
9
|
+
* The benchmark runner uses `evaluateProject()` for these cases.
|
|
10
|
+
*/
|
|
11
|
+
import type { BenchmarkCase } from "./benchmark.js";
|
|
12
|
+
/**
 * Benchmark cases for cross-file vulnerability patterns.
 *
 * Each entry uses `code` as its primary file and lists the remaining project
 * files under `files`. Cases with a non-empty `files` array are evaluated by
 * the benchmark runner through `evaluateProject()`.
 */
export declare const BENCHMARK_MULTI_FILE: BenchmarkCase[];
|
|
@@ -0,0 +1,427 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Multi-file benchmark scenarios — cross-file vulnerability patterns.
|
|
3
|
+
*
|
|
4
|
+
* These test cases represent real-world patterns where security issues
|
|
5
|
+
* span multiple files (auth middleware in one file, unprotected routes
|
|
6
|
+
* in another; database config in one file, raw queries in another; etc.)
|
|
7
|
+
*
|
|
8
|
+
* Each case has a primary `code` file and additional `files` for context.
|
|
9
|
+
* The benchmark runner uses `evaluateProject()` for these cases.
|
|
10
|
+
*/
|
|
11
|
+
export const BENCHMARK_MULTI_FILE = [
|
|
12
|
+
// ─── Auth middleware defined but not applied to all routes ──────────────
|
|
13
|
+
{
|
|
14
|
+
id: "multi-auth-middleware-not-applied",
|
|
15
|
+
description: "Auth middleware exists in middleware.ts but sensitive routes in api.ts don't use it",
|
|
16
|
+
language: "typescript",
|
|
17
|
+
code: `// api.ts — routes without auth middleware
|
|
18
|
+
import express from "express";
|
|
19
|
+
import { db } from "./db";
|
|
20
|
+
|
|
21
|
+
const router = express.Router();
|
|
22
|
+
|
|
23
|
+
// Public endpoints (fine)
|
|
24
|
+
router.get("/health", (req, res) => res.json({ ok: true }));
|
|
25
|
+
|
|
26
|
+
// SENSITIVE: these should require auth but don't use requireAuth
|
|
27
|
+
router.get("/api/users", async (req, res) => {
|
|
28
|
+
const users = await db.query("SELECT id, name, email FROM users");
|
|
29
|
+
res.json(users);
|
|
30
|
+
});
|
|
31
|
+
|
|
32
|
+
router.delete("/api/users/:id", async (req, res) => {
|
|
33
|
+
await db.query("DELETE FROM users WHERE id = $1", [req.params.id]);
|
|
34
|
+
res.json({ deleted: true });
|
|
35
|
+
});
|
|
36
|
+
|
|
37
|
+
router.get("/api/admin/settings", async (req, res) => {
|
|
38
|
+
const settings = await db.query("SELECT * FROM settings");
|
|
39
|
+
res.json(settings);
|
|
40
|
+
});
|
|
41
|
+
|
|
42
|
+
export default router;`,
|
|
43
|
+
files: [
|
|
44
|
+
{
|
|
45
|
+
path: "middleware.ts",
|
|
46
|
+
content: `import jwt from "jsonwebtoken";
|
|
47
|
+
|
|
48
|
+
export function requireAuth(req, res, next) {
|
|
49
|
+
const token = req.headers.authorization?.replace("Bearer ", "");
|
|
50
|
+
if (!token) return res.status(401).json({ error: "Unauthorized" });
|
|
51
|
+
try {
|
|
52
|
+
req.user = jwt.verify(token, process.env.JWT_SECRET, { algorithms: ["HS256"] });
|
|
53
|
+
next();
|
|
54
|
+
} catch {
|
|
55
|
+
res.status(401).json({ error: "Invalid token" });
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
export function requireAdmin(req, res, next) {
|
|
60
|
+
if (req.user?.role !== "admin") return res.status(403).json({ error: "Forbidden" });
|
|
61
|
+
next();
|
|
62
|
+
}`,
|
|
63
|
+
language: "typescript",
|
|
64
|
+
},
|
|
65
|
+
{
|
|
66
|
+
path: "db.ts",
|
|
67
|
+
content: `import { Pool } from "pg";
|
|
68
|
+
export const db = new Pool({ connectionString: process.env.DATABASE_URL });`,
|
|
69
|
+
language: "typescript",
|
|
70
|
+
},
|
|
71
|
+
],
|
|
72
|
+
expectedRuleIds: ["AUTH-001", "AUTH-002"],
|
|
73
|
+
acceptablePrefixes: ["SEC", "CYBER", "API"],
|
|
74
|
+
category: "auth",
|
|
75
|
+
difficulty: "hard",
|
|
76
|
+
},
|
|
77
|
+
// ─── Database connection without pooling, queries in separate file ─────
|
|
78
|
+
{
|
|
79
|
+
id: "multi-db-no-pool-separate-files",
|
|
80
|
+
description: "Database opens new connection per query; queries in separate service file",
|
|
81
|
+
language: "typescript",
|
|
82
|
+
code: `// user-service.ts — queries using per-request connections
|
|
83
|
+
import { getConnection } from "./database";
|
|
84
|
+
|
|
85
|
+
export async function getUsers(search?: string) {
|
|
86
|
+
const conn = await getConnection();
|
|
87
|
+
const query = search
|
|
88
|
+
? \`SELECT * FROM users WHERE name LIKE '%\${search}%'\`
|
|
89
|
+
: "SELECT * FROM users";
|
|
90
|
+
const result = await conn.query(query);
|
|
91
|
+
// connection never closed
|
|
92
|
+
return result.rows;
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
export async function updateUser(id: string, data: any) {
|
|
96
|
+
const conn = await getConnection();
|
|
97
|
+
await conn.query(\`UPDATE users SET name = '\${data.name}' WHERE id = '\${id}'\`);
|
|
98
|
+
}`,
|
|
99
|
+
files: [
|
|
100
|
+
{
|
|
101
|
+
path: "database.ts",
|
|
102
|
+
content: `import { Client } from "pg";
|
|
103
|
+
|
|
104
|
+
// Creates a NEW connection every time — no pooling
|
|
105
|
+
export async function getConnection() {
|
|
106
|
+
const client = new Client({ connectionString: process.env.DATABASE_URL });
|
|
107
|
+
await client.connect();
|
|
108
|
+
return client;
|
|
109
|
+
}`,
|
|
110
|
+
language: "typescript",
|
|
111
|
+
},
|
|
112
|
+
],
|
|
113
|
+
expectedRuleIds: ["DB-001", "CYBER-001", "SEC-001"],
|
|
114
|
+
acceptablePrefixes: ["PERF", "SCALE", "REL"],
|
|
115
|
+
category: "database",
|
|
116
|
+
difficulty: "medium",
|
|
117
|
+
},
|
|
118
|
+
// ─── Hardcoded secrets in config.ts imported by multiple modules ────────
|
|
119
|
+
{
|
|
120
|
+
id: "multi-config-secrets-spread",
|
|
121
|
+
description: "Secrets hardcoded in config, imported across service files",
|
|
122
|
+
language: "typescript",
|
|
123
|
+
code: `// payment-service.ts — imports secrets from config
|
|
124
|
+
import { config } from "./config";
|
|
125
|
+
import Stripe from "stripe";
|
|
126
|
+
|
|
127
|
+
const stripe = new Stripe(config.STRIPE_SECRET_KEY);
|
|
128
|
+
|
|
129
|
+
export async function chargeCustomer(customerId: string, amount: number) {
|
|
130
|
+
return stripe.charges.create({
|
|
131
|
+
amount,
|
|
132
|
+
currency: "usd",
|
|
133
|
+
customer: customerId,
|
|
134
|
+
});
|
|
135
|
+
}`,
|
|
136
|
+
files: [
|
|
137
|
+
{
|
|
138
|
+
path: "config.ts",
|
|
139
|
+
content: `// DANGER: hardcoded secrets
|
|
140
|
+
export const config = {
|
|
141
|
+
STRIPE_SECRET_KEY: "sk_test_FAKE_KEY_FOR_BENCHMARK_ONLY_000000",
|
|
142
|
+
JWT_SECRET: "super-secret-jwt-key-2024",
|
|
143
|
+
DATABASE_URL: "postgres://admin:password123@db.prod.internal:5432/maindb",
|
|
144
|
+
AWS_ACCESS_KEY: "AKIA_FAKE_KEY_FOR_BENCHMARK",
|
|
145
|
+
AWS_SECRET_KEY: "FAKE_SECRET_KEY_FOR_BENCHMARK_TESTING_ONLY",
|
|
146
|
+
SENDGRID_API_KEY: "SG.FAKE_KEY_FOR_BENCHMARK_ONLY",
|
|
147
|
+
};`,
|
|
148
|
+
language: "typescript",
|
|
149
|
+
},
|
|
150
|
+
{
|
|
151
|
+
path: "email-service.ts",
|
|
152
|
+
content: `import { config } from "./config";
|
|
153
|
+
import sgMail from "@sendgrid/mail";
|
|
154
|
+
|
|
155
|
+
sgMail.setApiKey(config.SENDGRID_API_KEY);
|
|
156
|
+
|
|
157
|
+
export function sendWelcome(email: string) {
|
|
158
|
+
return sgMail.send({ to: email, from: "noreply@app.com", subject: "Welcome", text: "Hello!" });
|
|
159
|
+
}`,
|
|
160
|
+
language: "typescript",
|
|
161
|
+
},
|
|
162
|
+
],
|
|
163
|
+
expectedRuleIds: ["AUTH-001", "DATA-001", "CYBER-001"],
|
|
164
|
+
acceptablePrefixes: ["SEC", "CFG"],
|
|
165
|
+
category: "data-security",
|
|
166
|
+
difficulty: "easy",
|
|
167
|
+
},
|
|
168
|
+
// ─── Error handler defined but not wired into Express app ──────────────
|
|
169
|
+
{
|
|
170
|
+
id: "multi-error-handler-not-wired",
|
|
171
|
+
description: "Error handler middleware defined but never registered in the Express app",
|
|
172
|
+
language: "typescript",
|
|
173
|
+
code: `// app.ts — Express app without error handler
|
|
174
|
+
import express from "express";
|
|
175
|
+
import userRoutes from "./routes/users";
|
|
176
|
+
import orderRoutes from "./routes/orders";
|
|
177
|
+
|
|
178
|
+
const app = express();
|
|
179
|
+
app.use(express.json());
|
|
180
|
+
|
|
181
|
+
app.use("/api/users", userRoutes);
|
|
182
|
+
app.use("/api/orders", orderRoutes);
|
|
183
|
+
// Missing: app.use(errorHandler)
|
|
184
|
+
|
|
185
|
+
app.listen(3000);`,
|
|
186
|
+
files: [
|
|
187
|
+
{
|
|
188
|
+
path: "middleware/error-handler.ts",
|
|
189
|
+
content: `export function errorHandler(err, req, res, next) {
|
|
190
|
+
console.error(err.stack);
|
|
191
|
+
const status = err.statusCode || 500;
|
|
192
|
+
res.status(status).json({
|
|
193
|
+
error: { message: err.message, code: err.code || "INTERNAL_ERROR" },
|
|
194
|
+
});
|
|
195
|
+
}`,
|
|
196
|
+
language: "typescript",
|
|
197
|
+
},
|
|
198
|
+
{
|
|
199
|
+
path: "routes/users.ts",
|
|
200
|
+
content: `import { Router } from "express";
|
|
201
|
+
const router = Router();
|
|
202
|
+
router.get("/", async (req, res) => {
|
|
203
|
+
const users = await db.query("SELECT * FROM users");
|
|
204
|
+
res.json(users); // throws on DB error — no try/catch, no error middleware
|
|
205
|
+
});
|
|
206
|
+
export default router;`,
|
|
207
|
+
language: "typescript",
|
|
208
|
+
},
|
|
209
|
+
],
|
|
210
|
+
expectedRuleIds: ["ERR-001"],
|
|
211
|
+
acceptablePrefixes: ["REL", "FW", "SWDEV"],
|
|
212
|
+
category: "error-handling",
|
|
213
|
+
difficulty: "hard",
|
|
214
|
+
},
|
|
215
|
+
// ─── Rate limiter on auth but not on API data endpoints ────────────────
|
|
216
|
+
{
|
|
217
|
+
id: "multi-rate-limit-partial",
|
|
218
|
+
description: "Rate limiting on login endpoint but missing on data-heavy API",
|
|
219
|
+
language: "typescript",
|
|
220
|
+
code: `// routes/data.ts — no rate limiting on expensive queries
|
|
221
|
+
import { Router } from "express";
|
|
222
|
+
|
|
223
|
+
const router = Router();
|
|
224
|
+
|
|
225
|
+
router.get("/api/reports", async (req, res) => {
|
|
226
|
+
// Expensive aggregation — no rate limit, no pagination
|
|
227
|
+
const report = await db.query(\`
|
|
228
|
+
SELECT u.name, COUNT(o.id) as order_count, SUM(o.total) as total_spent
|
|
229
|
+
FROM users u JOIN orders o ON u.id = o.user_id
|
|
230
|
+
GROUP BY u.name ORDER BY total_spent DESC
|
|
231
|
+
\`);
|
|
232
|
+
res.json(report.rows);
|
|
233
|
+
});
|
|
234
|
+
|
|
235
|
+
router.get("/api/export", async (req, res) => {
|
|
236
|
+
// Full table dump — no limit
|
|
237
|
+
const all = await db.query("SELECT * FROM transactions");
|
|
238
|
+
res.json(all.rows);
|
|
239
|
+
});
|
|
240
|
+
|
|
241
|
+
export default router;`,
|
|
242
|
+
files: [
|
|
243
|
+
{
|
|
244
|
+
path: "routes/auth.ts",
|
|
245
|
+
content: `import { Router } from "express";
|
|
246
|
+
import rateLimit from "express-rate-limit";
|
|
247
|
+
|
|
248
|
+
const loginLimiter = rateLimit({ windowMs: 15 * 60 * 1000, max: 5 });
|
|
249
|
+
const router = Router();
|
|
250
|
+
|
|
251
|
+
router.post("/login", loginLimiter, async (req, res) => {
|
|
252
|
+
const { email, password } = req.body;
|
|
253
|
+
const user = await authenticate(email, password);
|
|
254
|
+
res.json({ token: generateToken(user) });
|
|
255
|
+
});
|
|
256
|
+
|
|
257
|
+
export default router;`,
|
|
258
|
+
language: "typescript",
|
|
259
|
+
},
|
|
260
|
+
],
|
|
261
|
+
expectedRuleIds: ["RATE-001", "DB-001"],
|
|
262
|
+
acceptablePrefixes: ["PERF", "SCALE", "SEC", "COST"],
|
|
263
|
+
category: "rate-limiting",
|
|
264
|
+
difficulty: "medium",
|
|
265
|
+
},
|
|
266
|
+
// ─── Logging PII in error handler, sanitized in normal flow ────────────
|
|
267
|
+
{
|
|
268
|
+
id: "multi-pii-leak-error-path",
|
|
269
|
+
description: "User data sanitized in normal responses but leaked in error logs",
|
|
270
|
+
language: "typescript",
|
|
271
|
+
code: `// services/user-service.ts — handles users correctly in normal flow
|
|
272
|
+
import { logger } from "./logger";
|
|
273
|
+
|
|
274
|
+
export async function processUserUpdate(userId: string, updates: any) {
|
|
275
|
+
try {
|
|
276
|
+
const user = await db.findUser(userId);
|
|
277
|
+
if (!user) throw new Error("User not found");
|
|
278
|
+
await db.updateUser(userId, sanitize(updates));
|
|
279
|
+
return { success: true };
|
|
280
|
+
} catch (error) {
|
|
281
|
+
// LEAK: full user object + updates (may contain SSN, password) in error log
|
|
282
|
+
logger.error("Update failed", { userId, user: await db.findUser(userId), updates, error });
|
|
283
|
+
throw error;
|
|
284
|
+
}
|
|
285
|
+
}
|
|
286
|
+
|
|
287
|
+
function sanitize(data: any) {
|
|
288
|
+
const { ssn, password, creditCard, ...safe } = data;
|
|
289
|
+
return safe;
|
|
290
|
+
}`,
|
|
291
|
+
files: [
|
|
292
|
+
{
|
|
293
|
+
path: "logger.ts",
|
|
294
|
+
content: `import winston from "winston";
|
|
295
|
+
|
|
296
|
+
export const logger = winston.createLogger({
|
|
297
|
+
level: "info",
|
|
298
|
+
format: winston.format.json(),
|
|
299
|
+
transports: [
|
|
300
|
+
new winston.transports.File({ filename: "error.log", level: "error" }),
|
|
301
|
+
new winston.transports.File({ filename: "combined.log" }),
|
|
302
|
+
new winston.transports.Console(),
|
|
303
|
+
],
|
|
304
|
+
});`,
|
|
305
|
+
language: "typescript",
|
|
306
|
+
},
|
|
307
|
+
],
|
|
308
|
+
expectedRuleIds: ["LOGPRIV-001", "DATA-001"],
|
|
309
|
+
acceptablePrefixes: ["SEC", "COMP", "ERR"],
|
|
310
|
+
category: "logging-privacy",
|
|
311
|
+
difficulty: "hard",
|
|
312
|
+
},
|
|
313
|
+
// ─── Clean multi-file: well-structured Express app ─────────────────────
|
|
314
|
+
{
|
|
315
|
+
id: "clean-multi-file-express-app",
|
|
316
|
+
description: "Well-structured Express app with auth, error handling, rate limiting, and proper logging",
|
|
317
|
+
language: "typescript",
|
|
318
|
+
code: `// app.ts — well-organized Express application
|
|
319
|
+
import express from "express";
|
|
320
|
+
import helmet from "helmet";
|
|
321
|
+
import cors from "cors";
|
|
322
|
+
import { authRouter } from "./routes/auth";
|
|
323
|
+
import { apiRouter } from "./routes/api";
|
|
324
|
+
import { requireAuth } from "./middleware/auth";
|
|
325
|
+
import { errorHandler } from "./middleware/error-handler";
|
|
326
|
+
import { requestLogger } from "./middleware/logger";
|
|
327
|
+
|
|
328
|
+
const app = express();
|
|
329
|
+
|
|
330
|
+
app.use(helmet());
|
|
331
|
+
app.use(cors({ origin: process.env.ALLOWED_ORIGINS?.split(",") }));
|
|
332
|
+
app.use(express.json({ limit: "10kb" }));
|
|
333
|
+
app.use(requestLogger);
|
|
334
|
+
|
|
335
|
+
app.use("/auth", authRouter);
|
|
336
|
+
app.use("/api", requireAuth, apiRouter);
|
|
337
|
+
|
|
338
|
+
app.use(errorHandler);
|
|
339
|
+
|
|
340
|
+
export default app;`,
|
|
341
|
+
files: [
|
|
342
|
+
{
|
|
343
|
+
path: "middleware/auth.ts",
|
|
344
|
+
content: `import jwt from "jsonwebtoken";
|
|
345
|
+
export function requireAuth(req, res, next) {
|
|
346
|
+
const token = req.headers.authorization?.replace("Bearer ", "");
|
|
347
|
+
if (!token) return res.status(401).json({ error: "Authentication required" });
|
|
348
|
+
try {
|
|
349
|
+
req.user = jwt.verify(token, process.env.JWT_SECRET, { algorithms: ["HS256"] });
|
|
350
|
+
next();
|
|
351
|
+
} catch { res.status(401).json({ error: "Invalid token" }); }
|
|
352
|
+
}`,
|
|
353
|
+
language: "typescript",
|
|
354
|
+
},
|
|
355
|
+
{
|
|
356
|
+
path: "middleware/error-handler.ts",
|
|
357
|
+
content: `export function errorHandler(err, req, res, next) {
|
|
358
|
+
const status = err.statusCode || 500;
|
|
359
|
+
res.status(status).json({ error: { message: status === 500 ? "Internal error" : err.message } });
|
|
360
|
+
}`,
|
|
361
|
+
language: "typescript",
|
|
362
|
+
},
|
|
363
|
+
],
|
|
364
|
+
expectedRuleIds: [],
|
|
365
|
+
category: "clean",
|
|
366
|
+
difficulty: "hard",
|
|
367
|
+
},
|
|
368
|
+
// ─── Clean multi-file: Python FastAPI with dependency injection ────────
|
|
369
|
+
{
|
|
370
|
+
id: "clean-multi-file-fastapi",
|
|
371
|
+
description: "Well-structured FastAPI app with auth deps, connection pooling, and error handling",
|
|
372
|
+
language: "python",
|
|
373
|
+
code: `# main.py
|
|
374
|
+
from fastapi import FastAPI, Depends, HTTPException
|
|
375
|
+
from .auth import get_current_user
|
|
376
|
+
from .database import get_db
|
|
377
|
+
from .models import UserResponse
|
|
378
|
+
|
|
379
|
+
app = FastAPI()
|
|
380
|
+
|
|
381
|
+
@app.get("/api/users/me", response_model=UserResponse)
|
|
382
|
+
async def get_me(user = Depends(get_current_user)):
|
|
383
|
+
return user
|
|
384
|
+
|
|
385
|
+
@app.get("/api/users/{user_id}", response_model=UserResponse)
|
|
386
|
+
async def get_user(user_id: int, db = Depends(get_db), _ = Depends(get_current_user)):
|
|
387
|
+
user = await db.fetch_one("SELECT id, name, email FROM users WHERE id = $1", user_id)
|
|
388
|
+
if not user:
|
|
389
|
+
raise HTTPException(status_code=404, detail="User not found")
|
|
390
|
+
return user`,
|
|
391
|
+
files: [
|
|
392
|
+
{
|
|
393
|
+
path: "auth.py",
|
|
394
|
+
content: `from fastapi import Depends, HTTPException
|
|
395
|
+
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
|
|
396
|
+
import jwt, os
|
|
397
|
+
|
|
398
|
+
security = HTTPBearer()
|
|
399
|
+
|
|
400
|
+
async def get_current_user(creds: HTTPAuthorizationCredentials = Depends(security)):
|
|
401
|
+
try:
|
|
402
|
+
payload = jwt.decode(creds.credentials, os.environ["JWT_SECRET"], algorithms=["HS256"])
|
|
403
|
+
return payload
|
|
404
|
+
except jwt.InvalidTokenError:
|
|
405
|
+
raise HTTPException(status_code=401, detail="Invalid token")`,
|
|
406
|
+
language: "python",
|
|
407
|
+
},
|
|
408
|
+
{
|
|
409
|
+
path: "database.py",
|
|
410
|
+
content: `import asyncpg, os
|
|
411
|
+
|
|
412
|
+
pool = None
|
|
413
|
+
|
|
414
|
+
async def get_db():
|
|
415
|
+
global pool
|
|
416
|
+
if pool is None:
|
|
417
|
+
pool = await asyncpg.create_pool(os.environ["DATABASE_URL"], min_size=5, max_size=20)
|
|
418
|
+
async with pool.acquire() as conn:
|
|
419
|
+
yield conn`,
|
|
420
|
+
language: "python",
|
|
421
|
+
},
|
|
422
|
+
],
|
|
423
|
+
expectedRuleIds: [],
|
|
424
|
+
category: "clean",
|
|
425
|
+
difficulty: "hard",
|
|
426
|
+
},
|
|
427
|
+
];
|
|
@@ -38,6 +38,17 @@ export interface BenchmarkCase {
|
|
|
38
38
|
difficulty: "easy" | "medium" | "hard";
|
|
39
39
|
/** AI model/tool that generated this code (e.g. "gpt-4", "claude", "copilot") */
|
|
40
40
|
aiSource?: string;
|
|
41
|
+
/**
|
|
42
|
+
* Multi-file scenario. When present, the case represents a cross-file
|
|
43
|
+
* vulnerability pattern. The `code` field serves as the primary file,
|
|
44
|
+
* and `files` provides additional project files for context.
|
|
45
|
+
* The benchmark runner uses `evaluateProject()` instead of single-file eval.
|
|
46
|
+
*/
|
|
47
|
+
files?: Array<{
|
|
48
|
+
path: string;
|
|
49
|
+
content: string;
|
|
50
|
+
language: string;
|
|
51
|
+
}>;
|
|
41
52
|
}
|
|
42
53
|
export interface BenchmarkResult {
|
|
43
54
|
/** Timestamp of run */
|
|
@@ -12,7 +12,7 @@
|
|
|
12
12
|
*/
|
|
13
13
|
import { existsSync, readFileSync, writeFileSync, mkdirSync } from "fs";
|
|
14
14
|
import { resolve, dirname } from "path";
|
|
15
|
-
import { evaluateWithTribunal, evaluateWithJudge } from "../evaluators/index.js";
|
|
15
|
+
import { evaluateWithTribunal, evaluateWithJudge, evaluateProject } from "../evaluators/index.js";
|
|
16
16
|
import { getJudge, JUDGES } from "../judges/index.js";
|
|
17
17
|
import { formatLlmSnapshotMarkdown, formatLayerComparisonMarkdown } from "./llm-benchmark.js";
|
|
18
18
|
import { EXPANDED_BENCHMARK_CASES } from "./benchmark-expanded.js";
|
|
@@ -25,6 +25,8 @@ import { BENCHMARK_COMPLIANCE_ETHICS } from "./benchmark-compliance-ethics.js";
|
|
|
25
25
|
import { BENCHMARK_AI_AGENTS } from "./benchmark-ai-agents.js";
|
|
26
26
|
import { BENCHMARK_ADVANCED_CASES } from "./benchmark-advanced.js";
|
|
27
27
|
import { BENCHMARK_AI_OUTPUT } from "./benchmark-ai-output.js";
|
|
28
|
+
import { BENCHMARK_COVERAGE_GAPS } from "./benchmark-coverage-gaps.js";
|
|
29
|
+
import { BENCHMARK_MULTI_FILE } from "./benchmark-multi-file.js";
|
|
28
30
|
// ─── Built-in Benchmark Cases ───────────────────────────────────────────────
|
|
29
31
|
export const BENCHMARK_CASES = [
|
|
30
32
|
// ── SQL Injection ──
|
|
@@ -2203,6 +2205,8 @@ export function UserList({ users, onSelect, searchLabel = "Search users" }: User
|
|
|
2203
2205
|
...BENCHMARK_AI_AGENTS,
|
|
2204
2206
|
...BENCHMARK_ADVANCED_CASES,
|
|
2205
2207
|
...BENCHMARK_AI_OUTPUT,
|
|
2208
|
+
...BENCHMARK_COVERAGE_GAPS,
|
|
2209
|
+
...BENCHMARK_MULTI_FILE,
|
|
2206
2210
|
];
|
|
2207
2211
|
// ─── Benchmark Runner ───────────────────────────────────────────────────────
|
|
2208
2212
|
export function runBenchmarkSuite(cases, judgeId) {
|
|
@@ -2227,6 +2231,12 @@ export function runBenchmarkSuite(cases, judgeId) {
|
|
|
2227
2231
|
const evaluation = evaluateWithJudge(judge, tc.code, tc.language);
|
|
2228
2232
|
findings = evaluation.findings;
|
|
2229
2233
|
}
|
|
2234
|
+
else if (tc.files && tc.files.length > 0) {
|
|
2235
|
+
// Multi-file scenario — use project evaluation with cross-file analysis
|
|
2236
|
+
const projectFiles = [{ path: tc.id + "." + tc.language, content: tc.code, language: tc.language }, ...tc.files];
|
|
2237
|
+
const projectVerdict = evaluateProject(projectFiles);
|
|
2238
|
+
findings = projectVerdict.fileResults.flatMap((f) => f.findings);
|
|
2239
|
+
}
|
|
2230
2240
|
else {
|
|
2231
2241
|
const verdict = evaluateWithTribunal(tc.code, tc.language);
|
|
2232
2242
|
findings = verdict.findings;
|
|
@@ -376,11 +376,12 @@ function getFpReason(finding, lines, isIaC, fileCategory, filePath) {
|
|
|
376
376
|
if (finding.isAbsenceBased && fileCategory === "types") {
|
|
377
377
|
return "Absence-based rule does not apply to pure type-definition files — no runtime logic to evaluate.";
|
|
378
378
|
}
|
|
379
|
-
// ── 2d. Benchmark
|
|
379
|
+
// ── 2d. Benchmark files: findings on embedded code specimens ──
|
|
380
380
|
// Benchmark files in the commands/ directory contain intentional
|
|
381
381
|
// vulnerable-code snippets embedded as template literal strings. These
|
|
382
|
-
// are test data, not real vulnerabilities.
|
|
383
|
-
|
|
382
|
+
// are test data, not real vulnerabilities. Suppress ALL findings when
|
|
383
|
+
// the file is a benchmark fixture with many template literals.
|
|
384
|
+
if (filePath && /benchmark/i.test(filePath)) {
|
|
384
385
|
const codeText = lines.join("\n");
|
|
385
386
|
const templateLiteralCount = (codeText.match(/`[^`]{50,}/g) || []).length;
|
|
386
387
|
if (templateLiteralCount >= 5) {
|