@reaatech/pi-bench-mcp-server 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js ADDED
@@ -0,0 +1,770 @@
1
+ // src/mcp-server.ts
2
+ import { readFile } from "fs/promises";
3
+ import { Server } from "@modelcontextprotocol/sdk/server/index.js";
4
+ import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
5
+ import { CallToolRequestSchema, ListToolsRequestSchema } from "@modelcontextprotocol/sdk/types.js";
6
+ import { createMockAdapter } from "@reaatech/pi-bench-adapters";
7
+ import { createRebuffAdapter } from "@reaatech/pi-bench-adapters";
8
+ import { createLakeraAdapter } from "@reaatech/pi-bench-adapters";
9
+ import { createLLMGuardAdapter } from "@reaatech/pi-bench-adapters";
10
+ import { createGarakAdapter } from "@reaatech/pi-bench-adapters";
11
+ import { createModerationAdapter } from "@reaatech/pi-bench-adapters";
12
+ import { getCategoryIds } from "@reaatech/pi-bench-core";
13
+ import { DefenseScoreSchema } from "@reaatech/pi-bench-core";
14
+ import { generateDefaultCorpus } from "@reaatech/pi-bench-corpus";
15
+ import { createLeaderboardManager } from "@reaatech/pi-bench-leaderboard";
16
+ import {
17
+ getDefaultLeaderboardPath,
18
+ loadLeaderboardEntries,
19
+ saveLeaderboardEntries
20
+ } from "@reaatech/pi-bench-leaderboard";
21
+ import { createLogger } from "@reaatech/pi-bench-observability";
22
+ import { createBenchmarkEngine } from "@reaatech/pi-bench-runner";
23
+ import { createDefenseEvaluator } from "@reaatech/pi-bench-runner";
24
+ import { generateBenignSamples } from "@reaatech/pi-bench-runner";
25
+
26
+ // src/report-data.ts
27
+ import { calculateDefenseScore } from "@reaatech/pi-bench-scoring";
28
/**
 * Report whether a value is a non-null object (arrays included).
 *
 * @param value Anything, typically the result of `JSON.parse`.
 * @returns `true` when `typeof value` is "object" and the value is not `null`.
 */
function isObject(value) {
  if (value === null) {
    return false;
  }
  return typeof value === "object";
}
31
/**
 * Check whether a parsed payload has the fields of a raw benchmark run:
 * `attackResults` and `benignResults` arrays plus string `defense` and
 * `defenseVersion` fields.
 *
 * @param value Parsed JSON payload to inspect.
 * @returns `true` only when every one of those fields has the expected type.
 */
function isBenchmarkResult(value) {
  if (!isObject(value)) {
    return false;
  }
  if (!Array.isArray(value.attackResults)) {
    return false;
  }
  if (!Array.isArray(value.benignResults)) {
    return false;
  }
  if (typeof value.defense !== "string") {
    return false;
  }
  return typeof value.defenseVersion === "string";
}
34
/**
 * Reduce a per-category score map to the two fields the reports display.
 *
 * @param categoryScores Map of category name to a score object; may be
 *   missing, in which case an empty breakdown is returned.
 * @returns A new object keyed by category, each value holding only
 *   `detectionRate` and `totalAttacks` copied from the input score.
 */
function toCategoryBreakdown(categoryScores) {
  const pairs = [];
  if (categoryScores) {
    for (const [name, stats] of Object.entries(categoryScores)) {
      const trimmed = {
        detectionRate: stats.detectionRate,
        totalAttacks: stats.totalAttacks
      };
      pairs.push([name, trimmed]);
    }
  }
  // Object.fromEntries([]) yields {}, which covers the missing-input case too.
  return Object.fromEntries(pairs);
}
48
/**
 * Convert a parsed results payload into the flat shape used by the report
 * generators: `defense`, `version`, `corpusVersion`, `generatedAt`,
 * `detectionRate`, `falsePositiveRate`, `totalAttacks`, `avgLatencyMs`
 * and `categoryBreakdown`.
 *
 * Three input layouts are recognized, tried in this order:
 *  1. an evaluated run carrying both `score` and `overallMetrics` objects;
 *  2. a raw benchmark result (see `isBenchmarkResult`), which is scored here
 *     with `calculateDefenseScore`;
 *  3. an already-flat report with numeric `detectionRate`,
 *     `falsePositiveRate`, `totalAttacks` and `avgLatencyMs`.
 *
 * @param results Parsed JSON payload.
 * @returns The normalized report data.
 * @throws Error when `results` is not an object or matches no known layout.
 */
function normalizeReportData(results) {
  if (!isObject(results)) {
    throw new Error("Results must be a JSON object");
  }

  // First candidate that is a string, or undefined when none is.
  const firstString = (...candidates) => candidates.find((candidate) => typeof candidate === "string");

  const score = isObject(results.score) ? results.score : void 0;
  const overallMetrics = isObject(results.overallMetrics) ? results.overallMetrics : void 0;
  const categoryBreakdown = isObject(results.categoryBreakdown) ? results.categoryBreakdown : void 0;
  const metadata = isObject(results.metadata) ? results.metadata : void 0;

  // Layout 1: evaluated run. Top-level / overallMetrics values win; the score
  // object supplies fallbacks.
  if (score && overallMetrics) {
    return {
      defense: typeof results.defense === "string" ? results.defense : score.defense,
      version: typeof results.defenseVersion === "string" ? results.defenseVersion : score.version,
      corpusVersion: firstString(results.corpusVersion, metadata?.corpusVersion),
      generatedAt: firstString(metadata?.generatedAt, results.timestamp),
      detectionRate: typeof overallMetrics.detectionRate === "number" ? overallMetrics.detectionRate : 1 - score.attackSuccessRate,
      falsePositiveRate: typeof overallMetrics.fpr === "number" ? overallMetrics.fpr : score.falsePositiveRate,
      totalAttacks: typeof overallMetrics.totalAttacks === "number" ? overallMetrics.totalAttacks : score.totalSamples,
      avgLatencyMs: typeof overallMetrics.avgLatencyMs === "number" ? overallMetrics.avgLatencyMs : score.avgLatencyMs,
      categoryBreakdown: categoryBreakdown || toCategoryBreakdown(score.categoryScores)
    };
  }

  // Layout 2: raw benchmark result, scored on the fly.
  if (isBenchmarkResult(results)) {
    const derivedScore = calculateDefenseScore(results);
    return {
      defense: results.defense,
      version: results.defenseVersion,
      corpusVersion: results.corpusVersion,
      generatedAt: results.timestamp,
      detectionRate: 1 - derivedScore.attackSuccessRate,
      falsePositiveRate: derivedScore.falsePositiveRate,
      totalAttacks: derivedScore.totalSamples,
      avgLatencyMs: derivedScore.avgLatencyMs,
      categoryBreakdown: toCategoryBreakdown(derivedScore.categoryScores)
    };
  }

  // Layout 3: flat report. This is also the shape this function returns, so
  // `generatedAt` is honoured first; otherwise a report that is saved and
  // loaded again would lose its original timestamp. `timestamp` remains the
  // fallback for older flat files.
  if (
    typeof results.detectionRate === "number" &&
    typeof results.falsePositiveRate === "number" &&
    typeof results.totalAttacks === "number" &&
    typeof results.avgLatencyMs === "number"
  ) {
    return {
      defense: firstString(results.defense),
      version: firstString(results.version),
      corpusVersion: firstString(results.corpusVersion),
      generatedAt: firstString(results.generatedAt, results.timestamp),
      detectionRate: results.detectionRate,
      falsePositiveRate: results.falsePositiveRate,
      totalAttacks: results.totalAttacks,
      avgLatencyMs: results.avgLatencyMs,
      categoryBreakdown: categoryBreakdown || {}
    };
  }

  throw new Error("Results file is not a recognized benchmark or report format");
}
97
+
98
+ // src/mcp-server.ts
99
// Module-wide logger; the tool handlers further down call logger.info / logger.error on it.
+ var logger = createLogger();
100
// Server identity used when the BenchmarkMCPServer constructor receives no overrides.
// The constructor spreads caller config on top of these values and passes
// name/version to the SDK Server.
+ var DEFAULT_CONFIG = {
101
+ name: "prompt-injection-bench",
102
+ version: "1.0.0"
103
+ };
104
+ function validateFilePath(filePath) {
105
+ if (!filePath || filePath.trim().length === 0) {
106
+ throw new Error("File path cannot be empty");
107
+ }
108
+ const normalized = filePath.replace(/\\/g, "/");
109
+ if (normalized.includes("../") || normalized.includes("..\\")) {
110
+ throw new Error("File path cannot contain path traversal sequences");
111
+ }
112
+ if (normalized.startsWith("/")) {
113
+ throw new Error("Absolute file paths are not allowed");
114
+ }
115
+ if (normalized.includes("~")) {
116
+ throw new Error("Home directory shortcuts are not allowed");
117
+ }
118
+ if (/^\.\./.test(normalized)) {
119
+ throw new Error("File path cannot start with parent directory reference");
120
+ }
121
+ if (normalized.includes("//")) {
122
+ throw new Error("File path cannot contain double slashes");
123
+ }
124
+ if (/^\s/.test(normalized) || /\s$/.test(normalized)) {
125
+ throw new Error("File path cannot have leading or trailing whitespace");
126
+ }
127
+ if (/\.\./.test(normalized)) {
128
+ const parts = normalized.split("/");
129
+ for (const part of parts) {
130
+ if (part === ".." || part.endsWith("..")) {
131
+ throw new Error("File path contains parent directory references");
132
+ }
133
+ }
134
+ }
135
+ const pathSegments = normalized.split("/").filter(Boolean);
136
+ for (const segment of pathSegments) {
137
+ if (segment === ".." || segment.includes("..")) {
138
+ throw new Error("File path contains invalid segments");
139
+ }
140
+ }
141
+ }
142
/**
 * Build the defense adapter registered under the given name.
 *
 * The lookup is case-insensitive. Only the factory for the requested adapter
 * is invoked.
 *
 * @param name Adapter identifier, e.g. "mock", "rebuff" or "moderation-openai".
 * @returns Whatever the matching adapter factory returns.
 * @throws Error "Unknown defense adapter: <name>" when no factory is registered.
 */
async function loadAdapter(name) {
  const factories = new Map([
    ["mock", () => createMockAdapter(0.85, 0.05)],
    ["rebuff", () => createRebuffAdapter()],
    ["lakera", () => createLakeraAdapter()],
    ["llm-guard", () => createLLMGuardAdapter()],
    ["garak", () => createGarakAdapter()],
    ["moderation-openai", () => createModerationAdapter({ provider: "openai" })],
    ["moderation-azure", () => createModerationAdapter({ provider: "azure" })],
    ["moderation-anthropic", () => createModerationAdapter({ provider: "anthropic" })],
    ["moderation-cohere", () => createModerationAdapter({ provider: "cohere" })]
  ]);
  const build = factories.get(name.toLowerCase());
  if (build === undefined) {
    // The message reports the name exactly as the caller supplied it.
    throw new Error(`Unknown defense adapter: ${name}`);
  }
  return build();
}
166
/**
 * Wrap a string in the single-text-item result shape every tool handler returns.
 */
function textToolResult(text) {
  return {
    content: [
      {
        type: "text",
        text
      }
    ]
  };
}

/**
 * Log a handler failure and turn it into a `{ status: "error" }` tool result.
 *
 * @param logMessage Message passed to `logger.error`.
 * @param error The caught value; non-Error values are reported as "Unknown error".
 */
function failedToolResult(logMessage, error) {
  logger.error(logMessage, {
    error: error instanceof Error ? error.message : String(error)
  });
  return textToolResult(
    JSON.stringify(
      {
        status: "error",
        error: error instanceof Error ? error.message : "Unknown error"
      },
      null,
      2
    )
  );
}

/**
 * MCP server exposing four tools: run_benchmark, compare_defenses,
 * generate_report and submit_results.
 *
 * Every handler reports failures as a JSON text result with
 * `status: "error"` rather than throwing; only an unknown tool name throws.
 */
var BenchmarkMCPServer = class {
  server;
  config;

  /**
   * @param config Optional overrides for the server `name` and `version`;
   *   merged on top of DEFAULT_CONFIG.
   */
  constructor(config = {}) {
    this.config = { ...DEFAULT_CONFIG, ...config };
    this.server = new Server(
      {
        name: this.config.name,
        version: this.config.version
      },
      {
        capabilities: {
          tools: {}
        }
      }
    );
    this.setupToolHandlers();
  }

  /**
   * Get the list of available tools (exposed for testing)
   */
  getToolDefinitions() {
    return [
      {
        name: "run_benchmark",
        description: "Execute a full benchmark against defenses",
        inputSchema: {
          type: "object",
          properties: {
            defense: { type: "string", description: "Defense adapter name" },
            corpus: { type: "string", description: "Corpus version to use", default: "2026.04" },
            categories: {
              type: "array",
              items: { type: "string" },
              description: "Attack categories to include"
            },
            parallel: { type: "number", description: "Number of parallel executions", default: 10 },
            timeout_ms: { type: "number", description: "Timeout per attack in ms", default: 30000 }
          },
          required: ["defense"]
        }
      },
      {
        name: "compare_defenses",
        description: "Compare multiple defense results",
        inputSchema: {
          type: "object",
          properties: {
            results: {
              type: "array",
              items: { type: "string" },
              description: "Paths to result files to compare"
            },
            significance_level: {
              type: "number",
              description: "Statistical significance level",
              default: 0.05
            }
          },
          required: ["results"]
        }
      },
      {
        name: "generate_report",
        description: "Generate HTML/JSON reports from benchmark results",
        inputSchema: {
          type: "object",
          properties: {
            results: { type: "string", description: "Path to results file" },
            format: {
              type: "string",
              enum: ["json", "html", "markdown"],
              description: "Output format",
              default: "json"
            },
            include_categories: {
              type: "boolean",
              description: "Include category breakdown",
              default: true
            },
            output: { type: "string", description: "Output file path" }
          },
          required: ["results"]
        }
      },
      {
        name: "submit_results",
        description: "Submit results to the public leaderboard",
        inputSchema: {
          type: "object",
          properties: {
            results: { type: "string", description: "Path to results file" },
            defense_name: { type: "string", description: "Name of the defense" },
            defense_version: { type: "string", description: "Version of the defense" },
            reproducibility_proof: {
              type: "object",
              properties: {
                seed: { type: "string", description: "Seed used for the run" },
                corpus_version: { type: "string", description: "Corpus version used" },
                adapter_versions: {
                  type: "object",
                  additionalProperties: { type: "string" },
                  description: "Versions of adapters used"
                }
              },
              description: "Reproducibility proof data"
            }
          },
          required: ["results", "defense_name"]
        }
      }
    ];
  }

  /**
   * Set up tool request handlers
   */
  setupToolHandlers() {
    this.server.setRequestHandler(ListToolsRequestSchema, async () => {
      return {
        tools: this.getToolDefinitions()
      };
    });
    this.server.setRequestHandler(CallToolRequestSchema, async (request) => {
      const { name, arguments: args } = request.params;
      switch (name) {
        case "run_benchmark":
          return await this.handleRunBenchmark(args);
        case "compare_defenses":
          return await this.handleCompareDefenses(args);
        case "generate_report":
          return await this.handleGenerateReport(args);
        case "submit_results":
          return await this.handleSubmitResults(args);
        default:
          throw new Error(`Unknown tool: ${name}`);
      }
    });
  }

  /**
   * Handle run_benchmark tool call
   *
   * Loads the named adapter, runs the default corpus (optionally filtered by
   * category) plus 100 generated benign samples through the benchmark engine,
   * evaluates the run and returns the evaluation as JSON text.
   *
   * @param args Tool arguments (`defense`, `corpus`, `categories`, `parallel`,
   *   `timeout_ms`); a missing object is treated as empty.
   */
  async handleRunBenchmark(args) {
    // Guard against a call with no arguments so the failure is reported as a
    // tool error below instead of a TypeError thrown before the try block.
    const runArgs = args ?? {};
    const defenseName = runArgs.defense;
    const corpusVersion = runArgs.corpus || "2026.04";
    const parallel = runArgs.parallel || 10;
    const timeoutMs = runArgs.timeout_ms || 3e4;
    const categories = runArgs.categories;
    logger.info("MCP: Starting benchmark", {
      defense: defenseName,
      corpus: corpusVersion,
      parallel,
      timeoutMs
    });
    let adapter;
    let cleanupAttempted = false;
    try {
      adapter = await loadAdapter(defenseName);
      await adapter.initialize?.();
      const engine = createBenchmarkEngine({ maxParallel: parallel, defaultTimeoutMs: timeoutMs });
      engine.setAdapter(adapter);
      // The corpus version only labels the run; generateDefaultCorpus() is
      // called without it.
      const corpus = generateDefaultCorpus();
      const attackSamples = categories ? corpus.filter((s) => categories.includes(s.category)) : corpus;
      const benignSamples = generateBenignSamples(100);
      const result = await engine.runBenchmark(
        {
          defense: defenseName,
          corpusVersion,
          categories: categories || getCategoryIds(),
          parallel,
          timeoutMs,
          seed: Date.now().toString(36)
        },
        attackSamples,
        benignSamples
      );
      const evaluator = createDefenseEvaluator();
      const evaluation = evaluator.evaluate(result);
      cleanupAttempted = true;
      await adapter.cleanup?.();
      const output = {
        status: "success",
        runId: result.runId,
        defense: defenseName,
        defenseVersion: adapter.version,
        corpusVersion,
        score: evaluation.score,
        overallMetrics: evaluation.overallMetrics,
        categoryBreakdown: evaluation.categoryBreakdown,
        timestamp: result.timestamp
      };
      return textToolResult(JSON.stringify(output, null, 2));
    } catch (error) {
      // Release the adapter when the run failed before the normal cleanup.
      // A cleanup failure here is only logged so it cannot hide the real error.
      if (adapter && !cleanupAttempted) {
        try {
          await adapter.cleanup?.();
        } catch (cleanupError) {
          logger.error("MCP: Adapter cleanup failed", {
            error: cleanupError instanceof Error ? cleanupError.message : String(cleanupError)
          });
        }
      }
      return failedToolResult("MCP: Benchmark failed", error);
    }
  }

  /**
   * Handle compare_defenses tool call
   *
   * Reads every listed result file, validates its score with
   * DefenseScoreSchema and compares each pair of defenses.
   *
   * @param args Tool arguments (`results` array of paths, optional
   *   `significance_level`); a missing object is treated as empty.
   */
  async handleCompareDefenses(args) {
    const compareArgs = args ?? {};
    const resultFiles = compareArgs.results;
    const significanceLevel = compareArgs.significance_level || 0.05;
    logger.info("MCP: Comparing defenses", { files: resultFiles, significance: significanceLevel });
    try {
      if (!Array.isArray(resultFiles) || resultFiles.length < 2) {
        throw new Error("At least 2 result files are required for comparison");
      }
      const results = await Promise.all(
        resultFiles.map(async (file) => {
          validateFilePath(file);
          const content = await readFile(file, "utf-8");
          const parsed = JSON.parse(content);
          // Accept both a wrapped `{ score: ... }` file and a bare score object.
          const score = parsed.score || parsed;
          const validated = DefenseScoreSchema.parse(score);
          return { ...parsed, score: validated };
        })
      );
      const evaluator = createDefenseEvaluator();
      const comparisons = [];
      for (let i = 0; i < results.length; i++) {
        for (let j = i + 1; j < results.length; j++) {
          const score1 = results[i].score;
          const score2 = results[j].score;
          const defense1Name = results[i].defense || results[i].defenseName || `Defense ${i + 1}`;
          const defense2Name = results[j].defense || results[j].defenseName || `Defense ${j + 1}`;
          const comparison = evaluator.compare(score1, score2);
          comparisons.push({
            defense1: defense1Name,
            defense2: defense2Name,
            winner: comparison.winner,
            scoreDifference: comparison.scoreDifference,
            asrDifference: comparison.asrDifference,
            fprDifference: comparison.fprDifference
          });
        }
      }
      return textToolResult(
        JSON.stringify(
          {
            status: "success",
            comparisons,
            significanceLevel,
            timestamp: new Date().toISOString()
          },
          null,
          2
        )
      );
    } catch (error) {
      return failedToolResult("MCP: Compare failed", error);
    }
  }

  /**
   * Handle generate_report tool call
   *
   * Reads a results file, normalizes it and renders it as JSON (default),
   * Markdown or HTML. The report is returned inline; the `output` argument
   * listed in the tool schema is not used by this handler.
   *
   * @param args Tool arguments (`results` path, optional `format` and
   *   `include_categories`); a missing object is treated as empty.
   */
  async handleGenerateReport(args) {
    const reportArgs = args ?? {};
    const resultsFile = reportArgs.results;
    const format = reportArgs.format || "json";
    const includeCategories = reportArgs.include_categories ?? true;
    logger.info("MCP: Generating report", { results: resultsFile, format });
    try {
      validateFilePath(resultsFile);
      const content = await readFile(resultsFile, "utf-8");
      const results = JSON.parse(content);
      const reportData = normalizeReportData(results);
      let report;
      if (format === "markdown") {
        report = generateMarkdownReport(reportData, includeCategories);
      } else if (format === "html") {
        report = generateHtmlReport(reportData, includeCategories);
      } else {
        report = JSON.stringify(reportData, null, 2);
      }
      return textToolResult(report);
    } catch (error) {
      return failedToolResult("MCP: Report generation failed", error);
    }
  }

  /**
   * Handle submit_results tool call
   *
   * Reads a results file, validates its `score` (when present), appends an
   * entry to the stored leaderboard and returns a summary of the submission.
   *
   * @param args Tool arguments (`results` path, `defense_name`, optional
   *   `defense_version` and `reproducibility_proof`); a missing object is
   *   treated as empty.
   */
  async handleSubmitResults(args) {
    const submitArgs = args ?? {};
    const resultsFile = submitArgs.results;
    const defenseName = submitArgs.defense_name;
    const defenseVersion = submitArgs.defense_version || "1.0.0";
    const proof = submitArgs.reproducibility_proof;
    logger.info("MCP: Submitting results", { defense: defenseName, version: defenseVersion });
    try {
      validateFilePath(resultsFile);
      const content = await readFile(resultsFile, "utf-8");
      const parsed = JSON.parse(content);
      if (parsed.score) {
        DefenseScoreSchema.parse(parsed.score);
      }
      const results = parsed;
      const proofHash = proof?.seed || `local-${Date.now().toString(36)}`;
      // Computed once so the response reports exactly what was stored.
      const overallScore = results.score?.overallScore || results.overallScore || 0;
      const submittedAt = new Date().toISOString();
      const corpusVersion = results.corpusVersion || "2026.04";
      const categoryScores = results.score?.categoryScores ? Object.fromEntries(
        Object.entries(results.score.categoryScores).map(([category, value]) => [
          category,
          value.detectionRate || 0
        ])
      ) : {};
      const manager = createLeaderboardManager();
      manager.replaceEntries(loadLeaderboardEntries());
      manager.addEntry({
        defense: defenseName,
        version: defenseVersion,
        overallScore,
        submittedAt,
        corpusVersion,
        categoryScores,
        proofHash,
        submitter: "mcp-user"
      });
      saveLeaderboardEntries(manager.getAllEntries());
      const entry = {
        status: "success",
        submission: {
          defense: defenseName,
          version: defenseVersion,
          overallScore,
          submittedAt,
          corpusVersion,
          proofHash,
          storagePath: getDefaultLeaderboardPath()
        }
      };
      return textToolResult(JSON.stringify(entry, null, 2));
    } catch (error) {
      return failedToolResult("MCP: Submission failed", error);
    }
  }

  /**
   * Start the MCP server
   */
  async start() {
    const transport = new StdioServerTransport();
    await this.server.connect(transport);
    // The stdio transport uses stdout for protocol messages, so the startup
    // notice goes to stderr to keep that stream clean.
    console.error("MCP server started");
  }
};
602
/**
 * Render normalized report data as a Markdown document.
 *
 * @param results Normalized report data (`generatedAt`, `defense`, `version`,
 *   `detectionRate`, `falsePositiveRate`, `totalAttacks`, `avgLatencyMs`,
 *   `categoryBreakdown`).
 * @param includeCategories When truthy and the breakdown is non-empty, a
 *   per-category table is appended.
 * @returns The Markdown text, ending with a newline.
 */
function generateMarkdownReport(results, includeCategories) {
  const asPercent = (ratio) => `${(ratio * 100).toFixed(1)}%`;
  const generated = results.generatedAt || new Date().toISOString();

  const lines = ["# Prompt Injection Benchmark Report", "", `Generated: ${generated}`, ""];

  if (results.defense) {
    const versionSuffix = results.version ? ` (${results.version})` : "";
    lines.push(`Defense: ${results.defense}${versionSuffix}`, "");
  }

  lines.push(
    "## Summary",
    "",
    "| Metric | Value |",
    "|--------|-------|",
    `| Detection Rate | ${asPercent(results.detectionRate)} |`,
    `| False Positive Rate | ${asPercent(results.falsePositiveRate)} |`,
    `| Total Attacks | ${results.totalAttacks || 0} |`,
    `| Avg Latency | ${results.avgLatencyMs.toFixed(1)}ms |`
  );

  // The breakdown is only inspected when the caller asked for it.
  if (includeCategories && Object.keys(results.categoryBreakdown).length > 0) {
    lines.push(
      "",
      "## Category Breakdown",
      "",
      "| Category | Detection Rate | Attacks |",
      "|----------|----------------|---------|"
    );
    Object.entries(results.categoryBreakdown).forEach(([category, data]) => {
      lines.push(`| ${category} | ${asPercent(data.detectionRate)} | ${data.totalAttacks} |`);
    });
  }

  return lines.join("\n") + "\n";
}
632
/**
 * Escape a value for safe inclusion in HTML text content.
 * The value is stringified first, so numbers and `undefined` are accepted.
 */
function escapeHtml(value) {
  return String(value)
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&#39;");
}

/**
 * Render normalized report data as a standalone HTML page.
 *
 * Every value that originates from the results file (timestamp, defense name
 * and version, category names, attack counts) is HTML-escaped, because result
 * files are caller-supplied and may contain markup.
 *
 * @param results Normalized report data (`generatedAt`, `defense`, `version`,
 *   `detectionRate`, `falsePositiveRate`, `totalAttacks`, `avgLatencyMs`,
 *   `categoryBreakdown`).
 * @param includeCategories When truthy and the breakdown is non-empty, a
 *   per-category table is included.
 * @returns The HTML document as a string.
 */
function generateHtmlReport(results, includeCategories) {
  const asPercent = (ratio) => `${(ratio * 100).toFixed(1)}%`;
  const generated = results.generatedAt || new Date().toISOString();

  let defenseLine = "";
  if (results.defense) {
    const versionSuffix = results.version ? ` (${escapeHtml(results.version)})` : "";
    defenseLine = `<p>Defense: ${escapeHtml(results.defense)}${versionSuffix}</p>`;
  }

  // The breakdown is only inspected when the caller asked for it.
  let categorySection = "";
  if (includeCategories && Object.keys(results.categoryBreakdown).length > 0) {
    const rows = Object.entries(results.categoryBreakdown)
      .map(
        ([category, data]) =>
          `<tr><td>${escapeHtml(category)}</td><td>${asPercent(data.detectionRate)}</td><td>${escapeHtml(data.totalAttacks)}</td></tr>`
      )
      .join("");
    categorySection = [
      "",
      "<h2>Category Breakdown</h2>",
      "<table>",
      "<tr><th>Category</th><th>Detection Rate</th><th>Attacks</th></tr>",
      rows,
      "</table>",
      ""
    ].join("\n");
  }

  return [
    "<!DOCTYPE html>",
    "<html>",
    "<head>",
    "<title>Prompt Injection Benchmark Report</title>",
    "<style>",
    "body { font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; max-width: 800px; margin: 40px auto; padding: 0 20px; }",
    "h1 { color: #333; }",
    "table { border-collapse: collapse; width: 100%; margin: 20px 0; }",
    "th, td { border: 1px solid #ddd; padding: 12px; text-align: left; }",
    "th { background: #f5f5f5; }",
    ".metric { display: inline-block; margin: 10px 20px 10px 0; padding: 15px 25px; background: #f8f9fa; border-radius: 8px; }",
    ".metric-value { font-size: 24px; font-weight: bold; color: #007bff; }",
    ".metric-label { font-size: 12px; color: #666; text-transform: uppercase; }",
    "</style>",
    "</head>",
    "<body>",
    "<h1>Prompt Injection Benchmark Report</h1>",
    `<p>Generated: ${escapeHtml(generated)}</p>`,
    defenseLine,
    "",
    "<h2>Summary</h2>",
    '<div class="metric">',
    `<div class="metric-value">${asPercent(results.detectionRate)}</div>`,
    '<div class="metric-label">Detection Rate</div>',
    "</div>",
    '<div class="metric">',
    `<div class="metric-value">${asPercent(results.falsePositiveRate)}</div>`,
    '<div class="metric-label">False Positive Rate</div>',
    "</div>",
    '<div class="metric">',
    `<div class="metric-value">${escapeHtml(results.totalAttacks || 0)}</div>`,
    '<div class="metric-label">Total Attacks</div>',
    "</div>",
    '<div class="metric">',
    `<div class="metric-value">${results.avgLatencyMs.toFixed(1)}ms</div>`,
    '<div class="metric-label">Avg Latency</div>',
    "</div>",
    categorySection,
    "</body>",
    "</html>"
  ].join("\n");
}
682
/**
 * Factory for BenchmarkMCPServer.
 *
 * @param config Optional server configuration, forwarded unchanged to the
 *   BenchmarkMCPServer constructor.
 * @returns The newly constructed server instance.
 */
function createMCPServer(config) {
  const server = new BenchmarkMCPServer(config);
  return server;
}
685
+
686
+ // src/seed-manager.ts
687
+ import { createHash } from "crypto";
688
+ var SeedManager = class {
689
+ seed;
690
+ config;
691
+ constructor(config = {}) {
692
+ this.config = {
693
+ baseSeed: Date.now(),
694
+ corpusVersion: "2026.04",
695
+ adapterVersions: {},
696
+ ...config
697
+ };
698
+ this.seed = this.config.baseSeed;
699
+ }
700
+ /**
701
+ * Get the current seed
702
+ */
703
+ getSeed() {
704
+ return this.seed;
705
+ }
706
+ /**
707
+ * Set a new seed
708
+ */
709
+ setSeed(seed) {
710
+ this.seed = seed;
711
+ }
712
+ /**
713
+ * Generate a deterministic random number
714
+ */
715
+ next() {
716
+ const a = 1664525;
717
+ const c = 1013904223;
718
+ const m = 2 ** 32;
719
+ this.seed = (a * this.seed + c) % m;
720
+ return this.seed / m;
721
+ }
722
+ /**
723
+ * Generate a deterministic random integer in range
724
+ */
725
+ nextInt(min, max) {
726
+ return Math.floor(this.next() * (max - min + 1)) + min;
727
+ }
728
+ /**
729
+ * Shuffle an array deterministically
730
+ */
731
+ shuffle(array) {
732
+ const shuffled = [...array];
733
+ for (let i = shuffled.length - 1; i > 0; i--) {
734
+ const j = this.nextInt(0, i);
735
+ const temp = shuffled[i];
736
+ const swapVal = shuffled[j];
737
+ if (temp == null || swapVal == null) continue;
738
+ shuffled[i] = swapVal;
739
+ shuffled[j] = temp;
740
+ }
741
+ return shuffled;
742
+ }
743
+ /**
744
+ * Create a hash from config for reproducibility proof
745
+ */
746
+ createReproducibilityHash() {
747
+ const data = JSON.stringify({
748
+ seed: this.seed,
749
+ corpusVersion: this.config.corpusVersion,
750
+ adapterVersions: this.config.adapterVersions
751
+ });
752
+ return createHash("sha256").update(data).digest("hex");
753
+ }
754
+ /**
755
+ * Get the full config for persistence
756
+ */
757
+ getConfig() {
758
+ return { ...this.config, baseSeed: this.seed };
759
+ }
760
+ };
761
/**
 * Factory for SeedManager.
 *
 * @param config Optional seed configuration, forwarded unchanged to the
 *   SeedManager constructor.
 * @returns The newly constructed seed manager.
 */
function createSeedManager(config) {
  const manager = new SeedManager(config);
  return manager;
}
764
+ export {
765
+ BenchmarkMCPServer,
766
+ SeedManager,
767
+ createMCPServer,
768
+ createSeedManager,
769
+ normalizeReportData
770
+ };