cipher-security 2.0.8 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. package/bin/cipher.js +11 -1
  2. package/lib/agent-runtime/handlers/architect.js +199 -0
  3. package/lib/agent-runtime/handlers/base.js +240 -0
  4. package/lib/agent-runtime/handlers/blue.js +220 -0
  5. package/lib/agent-runtime/handlers/incident.js +161 -0
  6. package/lib/agent-runtime/handlers/privacy.js +190 -0
  7. package/lib/agent-runtime/handlers/purple.js +209 -0
  8. package/lib/agent-runtime/handlers/recon.js +174 -0
  9. package/lib/agent-runtime/handlers/red.js +246 -0
  10. package/lib/agent-runtime/handlers/researcher.js +170 -0
  11. package/lib/agent-runtime/handlers.js +35 -0
  12. package/lib/agent-runtime/index.js +196 -0
  13. package/lib/agent-runtime/parser.js +316 -0
  14. package/lib/analyze/consistency.js +566 -0
  15. package/lib/analyze/constitution.js +110 -0
  16. package/lib/analyze/sharding.js +251 -0
  17. package/lib/autonomous/agent-tool.js +165 -0
  18. package/lib/autonomous/feedback-loop.js +13 -6
  19. package/lib/autonomous/framework.js +17 -0
  20. package/lib/autonomous/handoff.js +506 -0
  21. package/lib/autonomous/modes/blue.js +26 -0
  22. package/lib/autonomous/modes/red.js +585 -0
  23. package/lib/autonomous/modes/researcher.js +322 -0
  24. package/lib/autonomous/researcher.js +12 -45
  25. package/lib/autonomous/runner.js +9 -537
  26. package/lib/benchmark/agent.js +88 -26
  27. package/lib/benchmark/baselines.js +3 -0
  28. package/lib/benchmark/claude-code-solver.js +254 -0
  29. package/lib/benchmark/cognitive.js +283 -0
  30. package/lib/benchmark/index.js +12 -2
  31. package/lib/benchmark/knowledge.js +281 -0
  32. package/lib/benchmark/llm.js +156 -15
  33. package/lib/benchmark/models.js +5 -2
  34. package/lib/benchmark/nyu-ctf.js +192 -0
  35. package/lib/benchmark/overthewire.js +347 -0
  36. package/lib/benchmark/picoctf.js +281 -0
  37. package/lib/benchmark/prompts.js +280 -0
  38. package/lib/benchmark/registry.js +219 -0
  39. package/lib/benchmark/remote-solver.js +356 -0
  40. package/lib/benchmark/remote-target.js +263 -0
  41. package/lib/benchmark/reporter.js +35 -0
  42. package/lib/benchmark/runner.js +174 -10
  43. package/lib/benchmark/sandbox.js +35 -0
  44. package/lib/benchmark/scorer.js +22 -4
  45. package/lib/benchmark/solver.js +34 -1
  46. package/lib/benchmark/tools.js +262 -16
  47. package/lib/commands.js +9 -0
  48. package/lib/execution/council.js +434 -0
  49. package/lib/execution/parallel.js +292 -0
  50. package/lib/gates/circuit-breaker.js +135 -0
  51. package/lib/gates/confidence.js +302 -0
  52. package/lib/gates/corrections.js +219 -0
  53. package/lib/gates/self-check.js +245 -0
  54. package/lib/gateway/commands.js +727 -0
  55. package/lib/guardrails/engine.js +364 -0
  56. package/lib/mcp/server.js +349 -3
  57. package/lib/memory/compressor.js +94 -7
  58. package/lib/pipeline/hooks.js +288 -0
  59. package/lib/pipeline/index.js +11 -0
  60. package/lib/review/budget.js +210 -0
  61. package/lib/review/engine.js +526 -0
  62. package/lib/review/layers/acceptance-auditor.js +279 -0
  63. package/lib/review/layers/blind-hunter.js +500 -0
  64. package/lib/review/layers/defense-in-depth.js +209 -0
  65. package/lib/review/layers/edge-case-hunter.js +266 -0
  66. package/lib/review/panel.js +519 -0
  67. package/lib/review/two-stage.js +244 -0
  68. package/lib/session/cost-tracker.js +203 -0
  69. package/lib/session/logger.js +349 -0
  70. package/package.json +1 -1
@@ -1206,3 +1206,730 @@ export async function handleUpdate(args = {}) {
1206
1206
  return { error: true, message: `Update failed: ${err.message}` };
1207
1207
  }
1208
1208
  }
1209
+
1210
+ // ---------------------------------------------------------------------------
1211
+ // Review — Multi-Layer Code Review Engine
1212
+ // ---------------------------------------------------------------------------
1213
+
1214
/**
 * cipher review <target> [--format json|text] [--min-severity critical|high|medium|low|info]
 *
 * Run 3-layer parallel code review: Blind Hunter (pattern-based vuln detection),
 * Edge Case Hunter (boundary/failure analysis), Acceptance Auditor (security architecture).
 *
 * The target is the first token that is neither a flag nor the value of a
 * value-taking flag, so `--format json src/app.js` reviews `src/app.js`
 * instead of a target called `json`.
 *
 * @param {string[]|object} args - CLI tokens, or an object whose `target`
 *   property names the review target.
 * @returns {Promise<object>} `{ output }` holding the text report, the value of
 *   `result.toJSON()` when `--format json` is given, or `{ error: true, message }`
 *   for usage errors and review failures.
 */
export async function handleReview(args = {}) {
  const rawArgs = Array.isArray(args) ? args : [args?.target];
  // Drop empty entries and coerce to strings so `.startsWith` cannot throw.
  const argList = rawArgs.filter(Boolean).map(String);

  // Flags that consume the following token as their value.
  const valueFlags = new Set(['--format', '--min-severity']);
  const options = new Map();
  let target;

  for (let i = 0; i < argList.length; i += 1) {
    const arg = argList[i];
    if (valueFlags.has(arg)) {
      const value = argList[i + 1];
      if (value !== undefined && !value.startsWith('-')) {
        // First occurrence wins, matching the previous lookup behaviour.
        if (!options.has(arg)) options.set(arg, value);
        i += 1; // the value belongs to the flag, never to the target
      }
    } else if (target === undefined && !arg.startsWith('-')) {
      target = arg;
    }
  }

  if (!target) {
    return {
      error: true,
      message: [
        'Usage: cipher review <file|dir|code> [options]',
        '',
        'Options:',
        ' --format <text|json> Output format (default: text)',
        ' --min-severity <level> Filter: critical, high, medium, low, info',
        '',
        'Runs 3 parallel review layers:',
        ' • Blind Hunter — Pattern-based vulnerability detection',
        ' • Edge Case Hunter — Boundary condition & failure analysis',
        ' • Acceptance Auditor — Security architecture review',
      ].join('\n'),
    };
  }

  const format = options.get('--format') || 'text';
  const minSeverity = options.get('--min-severity');

  try {
    // Loaded lazily so the usage path does not pay for the review engine.
    const { createReviewEngine } = await import('../review/engine.js');
    const engine = await createReviewEngine();
    const result = await engine.review(target, { minSeverity });

    if (format === 'json') {
      return result.toJSON();
    }
    return { output: result.toReport() };
  } catch (err) {
    return { error: true, message: `Review failed: ${err.message}` };
  }
}
1261
+
1262
+ // ---------------------------------------------------------------------------
1263
+ // Panel — Expert Panel Security Assessment
1264
+ // ---------------------------------------------------------------------------
1265
+
1266
/**
 * cipher panel <target> [--format json|text]
 *
 * Run 3-persona expert panel assessment: Red Team, Blue Team, Architect.
 *
 * The target is the first token that is neither a flag nor the value of
 * `--format`, so `--format json src/` assesses `src/` and not `json`.
 *
 * @param {string[]|object} args - CLI tokens, or an object whose `target`
 *   property names the assessment target.
 * @returns {Promise<object>} `{ output }` holding the text report, the value of
 *   `result.toJSON()` when `--format json` is given, or `{ error: true, message }`
 *   for usage errors and panel failures.
 */
export async function handlePanel(args = {}) {
  const rawArgs = Array.isArray(args) ? args : [args?.target];
  // Drop empty entries and coerce to strings so `.startsWith` cannot throw.
  const argList = rawArgs.filter(Boolean).map(String);

  let format;
  let target;
  for (let i = 0; i < argList.length; i += 1) {
    const arg = argList[i];
    if (arg === '--format') {
      const value = argList[i + 1];
      if (value !== undefined && !value.startsWith('-')) {
        if (format === undefined) format = value; // first occurrence wins
        i += 1; // the value belongs to the flag, never to the target
      }
    } else if (target === undefined && !arg.startsWith('-')) {
      target = arg;
    }
  }
  format = format || 'text';

  if (!target) {
    return {
      error: true,
      message: [
        'Usage: cipher panel <file|dir|code> [options]',
        '',
        'Options:',
        ' --format <text|json> Output format (default: text)',
        '',
        'Runs 3 simulated expert persona reviews:',
        ' • Red Team Expert — Attack surface, exploitation chains',
        ' • Blue Team Expert — Detection gaps, logging, monitoring',
        ' • Architect Expert — Trust boundaries, auth design, OWASP',
      ].join('\n'),
    };
  }

  try {
    // Loaded lazily so the usage path does not pay for the panel module.
    const { panelReview } = await import('../review/panel.js');
    const result = await panelReview(target, { format });

    if (format === 'json') {
      return result.toJSON();
    }
    return { output: result.toReport() };
  } catch (err) {
    return { error: true, message: `Panel review failed: ${err.message}` };
  }
}
1309
+
1310
+ // ---------------------------------------------------------------------------
1311
+ // Analyze — Cross-Artifact Consistency Analyzer
1312
+ // ---------------------------------------------------------------------------
1313
+
1314
/**
 * cipher analyze [--format json|text] [--root <path>]
 *
 * Runs the cross-artifact consistency analyzer and returns its findings.
 * Only the array form of `args` is inspected; any other value is treated
 * as "no options".
 *
 * @param {string[]|object} args - CLI tokens.
 * @returns {Promise<object>} `{ output }` with the text report, the value of
 *   `toJSON()` for `--format json`, or `{ error: true, message }` on failure.
 */
export async function handleAnalyze(args = {}) {
  const tokens = Array.isArray(args) ? args : [];

  // Value that directly follows the first occurrence of `flag`, if any.
  const valueAfter = (flag) => {
    const at = tokens.indexOf(flag);
    return at === -1 ? undefined : tokens[at + 1];
  };

  const format = valueAfter('--format') || 'text';
  const root = valueAfter('--root');

  try {
    const { ConsistencyAnalyzer } = await import('../analyze/consistency.js');
    const report = new ConsistencyAnalyzer(root || undefined).analyze();
    return format === 'json' ? report.toJSON() : { output: report.toReport() };
  } catch (err) {
    return { error: true, message: `Analyze failed: ${err.message}` };
  }
}
1341
+
1342
+ // ---------------------------------------------------------------------------
1343
+ // Shard — Semantic Document Sharding
1344
+ // ---------------------------------------------------------------------------
1345
+
1346
/**
 * cipher shard <file> [--level N] [--output <dir>] [--dry-run]
 *
 * Split a large markdown document into semantically coherent chunks.
 *
 * The file is the first token that is neither a flag nor the value of a
 * value-taking flag, so `--level 3 doc.md` shards `doc.md` and not `3`.
 * `--level` must parse to a positive integer; anything else is rejected
 * before the sharding module is loaded.
 *
 * @param {string[]|object} args - CLI tokens, or an object whose `target`
 *   property names the file to shard.
 * @returns {Promise<object>} `{ output }` holding the text report, the value of
 *   `result.toJSON()` when `--format json` is given, or `{ error: true, message }`
 *   for usage errors, invalid options and sharding failures.
 */
export async function handleShard(args = {}) {
  const rawArgs = Array.isArray(args) ? args : [args?.target];
  // Drop empty entries and coerce to strings so `.startsWith` cannot throw.
  const argList = rawArgs.filter(Boolean).map(String);

  // Flags that consume the following token as their value.
  const valueFlags = new Set(['--level', '--output', '--format']);
  const options = new Map();
  let target;

  for (let i = 0; i < argList.length; i += 1) {
    const arg = argList[i];
    if (valueFlags.has(arg)) {
      const value = argList[i + 1];
      if (value !== undefined && !value.startsWith('-')) {
        if (!options.has(arg)) options.set(arg, value); // first occurrence wins
        i += 1; // the value belongs to the flag, never to the target
      }
    } else if (target === undefined && !arg.startsWith('-')) {
      target = arg;
    }
  }

  if (!target) {
    return {
      error: true,
      message: [
        'Usage: cipher shard <file.md> [options]',
        '',
        'Options:',
        ' --level <N> Heading level to split at (default: 2)',
        ' --output <dir> Output directory',
        ' --dry-run Preview shards without writing files',
        ' --format <json> Output as JSON',
      ].join('\n'),
    };
  }

  const rawLevel = options.get('--level') || '2';
  const level = Number.parseInt(rawLevel, 10);
  if (Number.isNaN(level) || level < 1) {
    return {
      error: true,
      message: `Invalid --level value: '${rawLevel}'. Expected a positive integer.`,
    };
  }

  const output = options.get('--output');
  const dryRun = argList.includes('--dry-run');
  const format = options.get('--format') || 'text';

  try {
    // Loaded lazily so usage and validation errors stay cheap.
    const { shardDocument } = await import('../analyze/sharding.js');
    const result = shardDocument(target, { level, output, dryRun });

    if (format === 'json') {
      return result.toJSON();
    }
    return { output: result.toReport() };
  } catch (err) {
    return { error: true, message: `Shard failed: ${err.message}` };
  }
}
1390
+
1391
+ // ---------------------------------------------------------------------------
1392
+ // Guardrail — Test Input/Output Guardrails
1393
+ // ---------------------------------------------------------------------------
1394
+
1395
/**
 * cipher guardrail <text> [--type input|output|both]
 *
 * Test text against CIPHER's guardrail tripwire system.
 *
 * The audited text is every non-flag token joined with spaces. Values of
 * `--type` and `--format` are excluded, so `hello --type input` audits
 * `hello` rather than `hello input`.
 *
 * @param {string[]|object} args - CLI tokens, or an object whose `text`
 *   property holds the text to audit.
 * @returns {Promise<object>} `{ output }` with a text summary, a
 *   `{ tripped, tripwires }` object when `--format json` is given, or
 *   `{ error: true, message }` for usage errors and audit failures.
 */
export async function handleGuardrail(args = {}) {
  const rawArgs = Array.isArray(args) ? args : [args?.text];
  // Drop empty entries and coerce to strings so `.startsWith` cannot throw.
  const argList = rawArgs.filter(Boolean).map(String);

  // `--type` is recognised so its value is not mistaken for audited text.
  // NOTE(review): the value was parsed but never forwarded to engine.audit()
  // before this change either — confirm whether the engine supports
  // restricting the run to input or output guardrails.
  const valueFlags = new Set(['--type', '--format']);
  const options = new Map();
  const words = [];

  for (let i = 0; i < argList.length; i += 1) {
    const arg = argList[i];
    if (valueFlags.has(arg)) {
      const value = argList[i + 1];
      if (value !== undefined && !value.startsWith('-')) {
        if (!options.has(arg)) options.set(arg, value); // first occurrence wins
        i += 1; // the value belongs to the flag, never to the text
      }
    } else if (!arg.startsWith('-')) {
      words.push(arg);
    }
  }

  const text = words.join(' ');

  if (!text) {
    return {
      error: true,
      message: [
        'Usage: cipher guardrail <text> [options]',
        '',
        'Options:',
        ' --type <input|output|both> Which guardrails to run (default: both)',
        ' --format <json> Output as JSON',
        '',
        'Tests text against guardrail tripwires:',
        ' • Prompt injection detection',
        ' • Scope compliance validation',
        ' • Dangerous command detection',
        ' • Data leak prevention',
      ].join('\n'),
    };
  }

  const format = options.get('--format') || 'text';

  try {
    // Loaded lazily so the usage path does not pay for the guardrail engine.
    const { createGuardrailEngine } = await import('../guardrails/engine.js');
    const engine = createGuardrailEngine();
    const results = await engine.audit(text);

    if (format === 'json') {
      return {
        tripped: results.length > 0,
        tripwires: results.map((r) => ({
          guardrail: r.guardrail,
          type: r.type,
          severity: r.severity,
          reason: r.reason,
          action: r.action,
        })),
      };
    }

    if (results.length === 0) {
      return { output: '✓ No guardrails tripped.' };
    }

    const lines = [`✗ ${results.length} guardrail${results.length !== 1 ? 's' : ''} tripped:`, ''];
    for (const r of results) {
      lines.push(` [${r.severity.toUpperCase()}] ${r.guardrail} (${r.type})`);
      lines.push(` Reason: ${r.reason}`);
      lines.push(` Action: ${r.action}`);
    }
    return { output: lines.join('\n') };
  } catch (err) {
    return { error: true, message: `Guardrail check failed: ${err.message}` };
  }
}
1462
+
1463
+ // ---------------------------------------------------------------------------
1464
+ // Chain — multi-mode agent chain
1465
+ // ---------------------------------------------------------------------------
1466
+
1467
/**
 * cipher chain <modes> --task "<task>" [--backend <backend>]
 *
 * Runs a comma-separated list of modes as a chain via `runChain` and formats
 * the per-mode output plus the handoff timeline.
 *
 * Argument handling:
 *  - the first token is the modes list, unless it is itself a flag, in which
 *    case the usage message is returned;
 *  - the task is every token after `--task` up to the next recognised flag,
 *    so `--task assess target --backend claude` yields the task
 *    `assess target` rather than `assess target --backend claude`.
 *
 * @param {string[]|object} args - CLI tokens, or an object whose `_` property
 *   holds them.
 * @returns {Promise<object>} `{ output }` with the formatted chain report, or
 *   `{ error: true, message }` for usage errors and chain failures.
 */
export async function handleChain(args = {}) {
  try {
    // Parse args — can be array or object
    const argList = (Array.isArray(args) ? args : (args?._ || [])).map(String);
    const knownFlags = new Set(['--task', '--backend']);

    const firstArg = argList[0] || '';
    const modesArg = firstArg.startsWith('-') ? '' : firstArg;

    const taskIdx = argList.indexOf('--task');
    const taskWords = [];
    if (taskIdx >= 0) {
      for (let i = taskIdx + 1; i < argList.length && !knownFlags.has(argList[i]); i += 1) {
        taskWords.push(argList[i]);
      }
    }
    const task = taskWords.join(' ');

    const backendIdx = argList.indexOf('--backend');
    const backendValue = backendIdx >= 0 ? argList[backendIdx + 1] : undefined;
    const backend = backendValue !== undefined && !knownFlags.has(backendValue) ? backendValue : null;

    const { initModes, availableModes } = await import('../autonomous/runner.js');
    const { runChain } = await import('../autonomous/handoff.js');

    await initModes();

    if (!modesArg) {
      const available = availableModes().join(', ');
      return {
        error: true,
        message: `Usage: cipher chain <modes> --task "<task>"\n\nModes (comma-separated): ${available}\nExample: cipher chain red,purple,blue --task "assess target"`,
      };
    }

    const modes = modesArg.split(',').map((m) => m.trim().toUpperCase()).filter(Boolean);
    const available = new Set(availableModes());

    for (const mode of modes) {
      if (!available.has(mode)) {
        return {
          error: true,
          message: `Unknown mode: '${mode}'. Available: ${[...available].sort().join(', ')}`,
        };
      }
    }

    if (!task) {
      return {
        error: true,
        message: 'Missing --task flag. Usage: cipher chain <modes> --task "<task description>"',
      };
    }

    const result = await runChain(modes, { task, user_message: task }, { backend });

    // Format output
    const lines = [];
    lines.push(`Chain: ${modes.join(' → ')}`);
    lines.push(`Duration: ${result.totalDurationS.toFixed(2)}s`);
    lines.push(`Tokens: ${result.totalTokensIn} in / ${result.totalTokensOut} out`);
    lines.push('');

    for (const r of result.results) {
      lines.push(`── ${r.mode} ──`);
      if (r.error) {
        lines.push(` Error: ${r.error}`);
      } else {
        // Only the first 500 characters of each mode's output are shown.
        lines.push(` ${(r.outputText || '').slice(0, 500)}`);
      }
      lines.push('');
    }

    if (result.events.length > 0) {
      lines.push('── Handoff Timeline ──');
      for (const e of result.events) {
        // The timestamp is multiplied by 1000 before being handed to Date;
        // only the HH:MM:SS part of the ISO string is kept.
        const ts = new Date(e.timestamp * 1000).toISOString().slice(11, 19);
        lines.push(` ${ts} ${e.sourceMode} → ${e.targetMode} [${e.status}]`);
      }
    }

    if (result.error) {
      lines.push(`\nChain error: ${result.error}`);
    }

    return { output: lines.join('\n') };
  } catch (err) {
    return { error: true, message: `Chain failed: ${err.message}` };
  }
}
1546
+
1547
+ // ---------------------------------------------------------------------------
1548
+ // Council — multi-model consensus (stub, fully implemented in M016/S04)
1549
+ // ---------------------------------------------------------------------------
1550
+
1551
/**
 * cipher council "<task>" [--dry-run] [--members N] [--backend <backend>]
 *
 * Runs a task through `runCouncil` and returns its result as pretty-printed JSON.
 *
 * Arguments are parsed positionally in a single pass: the token following
 * `--members` or `--backend` is consumed as that flag's value and can never
 * leak into the task text, even when the same token also appears in the
 * task. Parsing and validation happen before the council module is loaded.
 *
 * @param {string[]|object} args - CLI tokens, or an object whose `_` property
 *   holds them.
 * @returns {Promise<object>} `{ output }` with the JSON-serialised council
 *   result, or `{ error: true, message }` for usage errors, an invalid
 *   `--members` value, a missing council module, or a council failure.
 */
export async function handleCouncil(args = {}) {
  try {
    const argList = (Array.isArray(args) ? args : (args?._ || [])).map(String);

    const taskWords = [];
    let dryRun = false;
    let membersSeen = false;
    let membersRaw;
    let backendSeen = false;
    let backend = null;

    for (let i = 0; i < argList.length; i += 1) {
      const arg = argList[i];
      if (arg === '--dry-run') {
        dryRun = true;
      } else if (arg === '--members' || arg === '--backend') {
        const next = argList[i + 1];
        const hasValue = next !== undefined && !next.startsWith('--');
        if (arg === '--members' && !membersSeen) {
          membersSeen = true;
          membersRaw = hasValue ? next : undefined;
        } else if (arg === '--backend' && !backendSeen) {
          backendSeen = true;
          backend = hasValue ? next : null;
        }
        if (hasValue) i += 1; // the value belongs to the flag, not to the task
      } else if (!arg.startsWith('--')) {
        taskWords.push(arg);
      }
    }

    const task = taskWords.join(' ');
    if (!task) {
      return { error: true, message: 'Usage: cipher council "<task>" [--dry-run] [--members N] [--backend <backend>]' };
    }

    let members = 3;
    if (membersSeen) {
      members = Number.parseInt(membersRaw, 10);
      if (Number.isNaN(members) || members < 1) {
        return {
          error: true,
          message: `Invalid --members value: '${membersRaw ?? ''}'. Expected a positive integer.`,
        };
      }
    }

    const { runCouncil } = await import('../execution/council.js');
    const result = await runCouncil(task, { members, backend, dryRun });
    return { output: JSON.stringify(result, null, 2) };
  } catch (err) {
    if (err.code === 'ERR_MODULE_NOT_FOUND' || err.message?.includes('Cannot find module')) {
      return { error: true, message: 'Council module not yet available. Coming in M016/S04.' };
    }
    return { error: true, message: `Council failed: ${err.message}` };
  }
}
1576
+
1577
+ // ---------------------------------------------------------------------------
1578
+ // Resume — resume interrupted sessions
1579
+ // ---------------------------------------------------------------------------
1580
+
1581
/**
 * cipher resume --list | cipher resume <session-id> [--auto] [--backend <backend>]
 *
 * With `--list` (or no arguments) prints up to 20 recent sessions. With a
 * session id, prints a summary of that session; with `--auto` it also re-runs
 * the session's mode with a continuation prompt built from up to the last
 * five interaction summaries, logging the run under `<session-id>-cont`.
 *
 * The session id is the first token that is neither a `--flag` nor the value
 * of `--backend`, so `--backend claude abc123` resumes `abc123`.
 *
 * @param {string[]|object} args - CLI tokens, or an object whose `_` property
 *   holds them.
 * @returns {Promise<object>} `{ output }` with the listing or summary, or
 *   `{ error: true, message }` for usage errors, unknown sessions and failures.
 */
export async function handleResume(args = {}) {
  try {
    const { listSessions, loadSession, SessionLogger } = await import('../session/logger.js');
    const argList = (Array.isArray(args) ? args : (args?._ || [])).map(String);

    // cipher resume --list
    if (argList.includes('--list') || argList.length === 0) {
      const sessions = listSessions({ limit: 20 });
      if (sessions.length === 0) {
        return { output: 'No sessions found. Run `cipher <mode> --autonomous "task"` to create one.' };
      }

      const lines = ['Recent Sessions:', ''];
      for (const s of sessions) {
        const cost = s.totalCostUSD ? ` $${s.totalCostUSD.toFixed(4)}` : '';
        // Missing counters count as zero instead of rendering "NaN tokens".
        const tokens = (s.totalTokensIn || 0) + (s.totalTokensOut || 0);
        lines.push(` ${s.sessionId} ${(s.mode || '?').padEnd(12)} ${(s.status || '?').padEnd(12)} ${tokens} tokens${cost}`);
        if (s.task) lines.push(` Task: ${s.task.slice(0, 80)}`);
      }
      return { output: lines.join('\n') };
    }

    // cipher resume <session-id>
    const backendIdx = argList.indexOf('--backend');
    const backendValueIdx = backendIdx >= 0 ? backendIdx + 1 : -1;
    const sessionId = argList.find((a, i) => !a.startsWith('--') && i !== backendValueIdx);
    if (!sessionId) {
      return { error: true, message: 'Usage: cipher resume <session-id> | cipher resume --list' };
    }

    const session = loadSession(sessionId);
    if (!session) {
      return { error: true, message: `Session not found: ${sessionId}` };
    }

    const meta = session.metadata;
    const autoFlag = argList.includes('--auto');

    // Build resume summary
    const lines = [
      `Session: ${meta.sessionId}`,
      `Mode: ${meta.mode}`,
      `Task: ${meta.task}`,
      `Status: ${meta.status}`,
      `Interactions: ${meta.interactionCount}`,
      `Tokens: ${meta.totalTokensIn} in / ${meta.totalTokensOut} out`,
      `Cost: $${(meta.totalCostUSD || 0).toFixed(4)}`,
      '',
    ];

    // Show last few interactions
    const interactions = session.entries.filter((e) => e.type === 'interaction');
    if (interactions.length > 0) {
      lines.push('Last interactions:');
      for (const e of interactions.slice(-3)) {
        lines.push(` Turn ${e.turn}: ${e.toolCalls} tool calls, ${e.tokensIn}+${e.tokensOut} tokens`);
        if (e.outputSummary) lines.push(` ${e.outputSummary.slice(0, 100)}`);
      }
      lines.push('');
    }

    if (meta.status === 'completed') {
      lines.push('Session already completed. Nothing to resume.');
      return { output: lines.join('\n') };
    }

    if (!autoFlag) {
      lines.push('To auto-continue this session, run:');
      lines.push(` cipher resume ${sessionId} --auto`);
      return { output: lines.join('\n') };
    }

    // Auto-continue: re-run the mode with context from prior session
    const { runAutonomous, initModes } = await import('../autonomous/runner.js');
    const { CostTracker } = await import('../session/cost-tracker.js');

    await initModes();

    const backend = backendIdx >= 0 ? (argList[backendIdx + 1] ?? null) : null;

    // Build continuation task with prior context
    const priorSummaries = interactions
      .slice(-5)
      .map((e) => e.outputSummary)
      .filter(Boolean)
      .join('\n');
    const continuationTask = {
      task: meta.task,
      user_message: `[Resuming session ${meta.sessionId}]\n\nPrior progress (${meta.interactionCount} interactions):\n${priorSummaries || '(no summaries)'}\n\nContinue the task: ${meta.task}`,
    };

    // Create continuation session logger
    const logger = new SessionLogger({ sessionId: `${sessionId}-cont`, mode: meta.mode, task: meta.task });
    logger.start({ resumedFrom: sessionId });

    const tracker = new CostTracker({ model: 'default', sessionId: logger.sessionId, logger });

    try {
      const result = await runAutonomous(meta.mode, continuationTask, backend);
      tracker.track(result.tokensIn, result.tokensOut);
      logger.logInteraction({
        mode: result.mode,
        turn: 1,
        toolCalls: result.toolCalls,
        tokensIn: result.tokensIn,
        tokensOut: result.tokensOut,
        costUSD: tracker.totalCost,
        outputSummary: (result.outputText || '').slice(0, 200),
      });
      logger.end('completed');
      lines.push(`Resumed and completed. New session: ${logger.sessionId}`);
      lines.push(`Output: ${(result.outputText || '').slice(0, 500)}`);
    } catch (runErr) {
      // A failed continuation is recorded on the new session and reported in
      // the summary rather than discarding the summary built so far.
      logger.end('failed', { error: runErr.message });
      lines.push(`Resume failed: ${runErr.message}`);
    }

    return { output: lines.join('\n') };
  } catch (err) {
    return { error: true, message: `Resume failed: ${err.message}` };
  }
}
1699
+
1700
+ // ---------------------------------------------------------------------------
1701
+ // Benchmark — run XBOW benchmark suite
1702
+ // ---------------------------------------------------------------------------
1703
+
1704
/**
 * cipher benchmark <status|list|clone|run> [options]
 *
 * - `status` / `list` (default): summarises the XBOW, NYU CTF, PicoCTF and
 *   OverTheWire catalogs.
 * - `clone [--suite xbow|nyu|pico|otw]`: fetches or initialises a suite. For
 *   NYU CTF the git exit status is checked, so a failed clone or pull is
 *   reported as an error instead of "Cloned ... 0 challenges".
 * - `run`: executes benchmarks and writes JSON, Markdown and trace reports
 *   under `~/.cipher/benchmark-reports`.
 *
 * `--suite` is validated against the suites named in the usage text for
 * `clone` and `run`; it is also interpolated into report file names, so
 * arbitrary values must not reach the filesystem. `--concurrency`,
 * `--retries` and `--level` must be positive integers.
 *
 * @param {string[]|object} args - CLI tokens, or an object whose `_` property
 *   holds them.
 * @returns {Promise<object>} `{ output }` on success, or
 *   `{ error: true, message }` for usage errors and failures.
 */
export async function handleBenchmark(args = {}) {
  try {
    const argList = (Array.isArray(args) ? args : (args?._ || [])).map(String);
    const knownSuites = ['xbow', 'nyu', 'pico', 'otw'];

    // Value that directly follows the first occurrence of `flag`, if any.
    const optionValue = (flag) => {
      const at = argList.indexOf(flag);
      return at === -1 ? undefined : argList[at + 1];
    };

    // Parses a positive-integer option; throws so the outer catch reports it.
    const positiveIntOption = (flag, fallback) => {
      const raw = optionValue(flag);
      if (raw === undefined || raw === '') return fallback;
      const parsed = Number.parseInt(raw, 10);
      if (Number.isNaN(parsed) || parsed < 1) {
        throw new Error(`Invalid ${flag} value: '${raw}' (expected a positive integer)`);
      }
      return parsed;
    };

    const subCommand = argList[0] || 'status';
    const suite = optionValue('--suite') || 'xbow';

    // cipher benchmark status — show available benchmarks and baselines
    if (subCommand === 'status' || subCommand === 'list') {
      const { BenchmarkBuilder, ALL_BASELINES } = await import('../benchmark/index.js');
      const { enumerateNyuChallenges, NYU_BASELINES, NYU_CTF_CLONE_DIR } = await import('../benchmark/nyu-ctf.js');

      const lines = [];

      // XBOW
      const builder = new BenchmarkBuilder();
      let xbowBenchmarks = [];
      try { xbowBenchmarks = builder.listBenchmarks(); } catch { /* not cloned */ }
      if (xbowBenchmarks.length > 0) {
        const byLevel = {};
        for (const b of xbowBenchmarks) byLevel[b.level] = (byLevel[b.level] || 0) + 1;
        lines.push(`XBOW: ${xbowBenchmarks.length} benchmarks`);
        lines.push(` Level 1: ${byLevel[1] || 0}, Level 2: ${byLevel[2] || 0}, Level 3: ${byLevel[3] || 0}`);
        lines.push(' Baselines:');
        for (const bl of ALL_BASELINES) lines.push(` ${bl.name}: ${bl.overallPct.toFixed(1)}%`);
      } else {
        lines.push('XBOW: not cloned (cipher benchmark clone --suite xbow)');
      }

      // NYU CTF
      const nyuChallenges = enumerateNyuChallenges(NYU_CTF_CLONE_DIR);
      if (nyuChallenges.length > 0) {
        const byCat = {};
        for (const c of nyuChallenges) byCat[c.category] = (byCat[c.category] || 0) + 1;
        lines.push('');
        lines.push(`NYU CTF: ${nyuChallenges.length} challenges`);
        lines.push(` Categories: ${Object.entries(byCat).map(([c, n]) => `${c}(${n})`).join(', ')}`);
        lines.push(' Baselines:');
        for (const bl of NYU_BASELINES) lines.push(` ${bl.name}: ${bl.overallPct.toFixed(1)}%`);
      } else {
        lines.push('');
        lines.push('NYU CTF: not cloned (cipher benchmark clone --suite nyu)');
      }

      // PicoCTF
      const { getPicoCatalogStats } = await import('../benchmark/picoctf.js');
      const picoStats = getPicoCatalogStats();
      lines.push('');
      lines.push(`PicoCTF: ${picoStats.total} challenges (bundled catalog)`);
      lines.push(` Categories: ${Object.entries(picoStats.byCategory).map(([c, n]) => `${c}(${n})`).join(', ')}`);
      lines.push(` Target types: ${Object.entries(picoStats.byTargetType).map(([t, n]) => `${t}(${n})`).join(', ')}`);

      // OverTheWire
      const { getOtwCatalogStats, getProgressStats } = await import('../benchmark/overthewire.js');
      const otwStats = getOtwCatalogStats();
      const otwProgress = getProgressStats();
      lines.push('');
      lines.push(`OverTheWire: ${otwStats.total} levels across ${Object.keys(otwStats.byWargame).length} wargames`);
      for (const stats of Object.values(otwProgress)) {
        lines.push(` ${stats.name}: ${stats.solved}/${stats.total} solved (${stats.pct}%)`);
      }

      return { output: lines.join('\n') };
    }

    // `clone` and `run` act on a specific suite; reject typos and path-like
    // values before they select the wrong repo or end up in a file name.
    if ((subCommand === 'clone' || subCommand === 'run') && !knownSuites.includes(suite)) {
      return {
        error: true,
        message: `Unknown suite: '${suite}'. Available: ${knownSuites.join(', ')}`,
      };
    }

    // cipher benchmark clone — clone benchmark repos
    if (subCommand === 'clone') {
      if (suite === 'pico') {
        const { clonePicoCTF, enumeratePicoChallenges } = await import('../benchmark/picoctf.js');
        clonePicoCTF();
        const challenges = enumeratePicoChallenges();
        return { output: `Initialized PicoCTF catalog: ${challenges.length} challenges` };
      }

      if (suite === 'otw') {
        const { cloneOtw, enumerateOtwChallenges } = await import('../benchmark/overthewire.js');
        cloneOtw();
        const challenges = enumerateOtwChallenges();
        return { output: `Initialized OverTheWire wargames: ${challenges.length} levels` };
      }

      if (suite === 'nyu') {
        const { NYU_CTF_REPO_URL, NYU_CTF_CLONE_DIR, enumerateNyuChallenges } = await import('../benchmark/nyu-ctf.js');
        const { existsSync, mkdirSync } = await import('node:fs');
        const { spawnSync } = await import('node:child_process');
        const { join, dirname } = await import('node:path');

        let gitResult;
        let action;
        if (existsSync(join(NYU_CTF_CLONE_DIR, '.git'))) {
          action = 'update';
          gitResult = spawnSync('git', ['-C', NYU_CTF_CLONE_DIR, 'pull', '--ff-only'], { timeout: 120000, stdio: 'pipe' });
        } else {
          action = 'clone';
          mkdirSync(dirname(NYU_CTF_CLONE_DIR), { recursive: true });
          gitResult = spawnSync('git', ['clone', '--depth=1', NYU_CTF_REPO_URL, NYU_CTF_CLONE_DIR], { timeout: 300000, stdio: 'pipe' });
        }

        // spawnSync never throws for a failing child: `error` is set when the
        // process could not be spawned or timed out, `status` is non-zero
        // (or null) when git itself failed.
        if (gitResult.error || gitResult.status !== 0) {
          const detail = gitResult.error
            ? gitResult.error.message
            : String(gitResult.stderr || '').trim() || `git exited with status ${gitResult.status}`;
          return { error: true, message: `Benchmark failed: could not ${action} NYU CTF benchmarks: ${detail}` };
        }

        const challenges = enumerateNyuChallenges(NYU_CTF_CLONE_DIR);
        return { output: `Cloned NYU CTF benchmarks: ${challenges.length} challenges` };
      }

      const { BenchmarkBuilder } = await import('../benchmark/index.js');
      const builder = new BenchmarkBuilder();
      builder.clone();
      const benchmarks = builder.listBenchmarks(true);
      return { output: `Cloned XBOW benchmarks: ${benchmarks.length} available` };
    }

    // cipher benchmark run — execute benchmarks
    if (subCommand === 'run') {
      const all = argList.includes('--all');
      const specificNames = argList.filter((a) => /^(XBEN-|nyu-)/.test(a));

      if (!all && specificNames.length === 0) {
        return {
          error: true,
          message: [
            'Usage: cipher benchmark run [options]',
            '',
            'Options:',
            ' --all Run all benchmarks',
            ' --suite <name> Benchmark suite: xbow (default), nyu, pico, otw',
            ' --solver <name> Solver: autonomous (default), autonomous-multi, claude-code, remote, stub',
            ' --backend <name> LLM backend: ollama, claude',
            ' --concurrency <n> Parallel benchmarks (default 1)',
            ' --retries <n> Attempts per benchmark / pass@k (default 1)',
            ' --level <n> Filter by level (1-5)',
            ' --tag <tag> Filter by tag',
            ' --category <cat> Filter by category (NYU: web, pwn, rev, crypto, forensics, misc)',
            ' XBEN-001-24 ... Specific benchmark names',
          ].join('\n'),
        };
      }

      const solverName = optionValue('--solver') || 'autonomous';
      const backend = optionValue('--backend') || null;
      const concurrency = positiveIntOption('--concurrency', 1);
      const retries = positiveIntOption('--retries', 1);
      const levelFilter = positiveIntOption('--level', undefined);
      const tagFilter = optionValue('--tag');
      const categoryFilter = optionValue('--category');

      const { BenchmarkBuilder, getSolver, runBenchmarks, generateJsonReport, generateMarkdownReport } = await import('../benchmark/index.js');
      const { writeFileSync, mkdirSync } = await import('node:fs');
      const { join } = await import('node:path');
      const { homedir } = await import('node:os');

      const builder = new BenchmarkBuilder();
      const solver = getSolver(solverName, { backend });
      const traces = [];

      process.stderr.write(`Starting ${suite} benchmark run: solver=${solverName} backend=${backend || 'auto'} concurrency=${concurrency} retries=${retries}\n`);

      // Apply category filter for NYU
      let effectiveTagFilter = tagFilter || undefined;
      if (categoryFilter && suite === 'nyu') {
        effectiveTagFilter = categoryFilter;
      }

      const report = await runBenchmarks({
        builder,
        solver,
        benchmarkNames: specificNames.length > 0 ? specificNames : undefined,
        runAll: all,
        levelFilter,
        tagFilter: effectiveTagFilter,
        concurrency,
        retries,
        onResult: (name, result) => {
          const status = result.passed ? '✅' : result.solverResult.error ? '⚠️' : '❌';
          process.stderr.write(` ${status} ${name} (${result.solverResult.durationS.toFixed(1)}s)\n`);
        },
        onTrace: (event) => traces.push(event),
      });

      // Label only: the summary reports pass@k where k is the retry count.
      const passAtK = retries > 1 ? `pass@${retries}` : 'pass@1';

      // Save reports
      const reportDir = join(homedir(), '.cipher', 'benchmark-reports');
      mkdirSync(reportDir, { recursive: true });
      const timestamp = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19);

      const jsonPath = join(reportDir, `${suite}-${timestamp}.json`);
      writeFileSync(jsonPath, generateJsonReport(report, solverName));

      const mdPath = join(reportDir, `${suite}-${timestamp}.md`);
      writeFileSync(mdPath, generateMarkdownReport(report, solverName));

      const tracePath = join(reportDir, `${suite}-${timestamp}-traces.json`);
      writeFileSync(tracePath, JSON.stringify(traces, null, 2));

      // Category breakdown for NYU CTF
      const categoryLines = [];
      if (suite === 'nyu') {
        const byCat = {};
        for (const r of report.results) {
          const cat = r.config.category || 'misc';
          if (!byCat[cat]) byCat[cat] = { total: 0, passed: 0 };
          byCat[cat].total++;
          if (r.passed) byCat[cat].passed++;
        }
        categoryLines.push('', 'By Category:');
        for (const [cat, data] of Object.entries(byCat).sort(([a], [b]) => a.localeCompare(b))) {
          const pct = data.total > 0 ? (data.passed / data.total * 100).toFixed(1) : '0.0';
          categoryLines.push(` ${cat}: ${data.passed}/${data.total} (${pct}%)`);
        }
      }

      // Summary output
      const lines = [
        `${suite.toUpperCase()} Benchmark Run Complete (${passAtK})`,
        ` Total: ${report.total}`,
        ` Passed: ${report.passed} (${report.passRate.toFixed(1)}%)`,
        ` Failed: ${report.failed}`,
        ` Skipped: ${report.skipped}`,
        ` Duration: ${report.durationS.toFixed(1)}s`,
        ` Cost: $${report.totalCostUsd.toFixed(4)}`,
        ...categoryLines,
        '',
        `Reports saved to:`,
        ` JSON: ${jsonPath}`,
        ` Markdown: ${mdPath}`,
        ` Traces: ${tracePath}`,
      ];

      return { output: lines.join('\n') };
    }

    return {
      error: true,
      message: 'Usage: cipher benchmark <status|clone|run> [options]',
    };
  } catch (err) {
    return { error: true, message: `Benchmark failed: ${err.message}` };
  }
}