agentxchain 2.101.0 → 2.102.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,775 @@
1
+ import { mkdirSync, writeFileSync, rmSync } from 'fs';
2
+ import { join, resolve } from 'path';
3
+ import { tmpdir } from 'os';
4
+ import { randomBytes } from 'crypto';
5
+ import { execSync } from 'child_process';
6
+ import chalk from 'chalk';
7
+ import { runAdmissionControl } from '../lib/admission-control.js';
8
+ import { validateStagedTurnResult } from '../lib/turn-result-validator.js';
9
+ import { getTurnStagingResultPath } from '../lib/turn-paths.js';
10
+ import { verifyExportArtifact } from '../lib/export-verifier.js';
11
+ import { resolveBenchmarkWorkload } from './benchmark-workloads.js';
12
+
13
+ /**
14
+ * `agentxchain benchmark` — governance compliance proof.
15
+ *
16
+ * Runs a complete governed lifecycle in a temp dir using canned turn results,
17
+ * then measures governance metrics: admission control, retry handling,
18
+ * gate satisfaction, and export verification.
19
+ *
20
+ * No API keys required. Proves the governance engine is correct.
21
+ */
22
+
23
// Built-in phase specifications for the canonical three-phase benchmark
// workload (planning → implementation → qa). Each spec bundles:
//   - role:    the acting role and its write authority
//   - runtime: a 'manual' runtime (no API keys or model calls needed)
//   - prompt:  content written to .agentxchain/prompts/<role>.md
//   - allowed_next_roles / gate: routing and exit-gate data that
//     makeConfig() expands into the governed project config.
// Frozen at every level (each nested object is individually frozen), so
// workloads must deepClone() a spec before customizing it.
const BUILTIN_BENCHMARK_PHASES = Object.freeze({
  planning: Object.freeze({
    id: 'planning',
    handler: 'planning', // dispatched to the inline planning branch of benchmarkCommand
    role: Object.freeze({
      id: 'pm',
      title: 'Product Manager',
      mandate: 'Scope and accept.',
      write_authority: 'review_only',
    }),
    runtime: Object.freeze({
      id: 'manual-pm',
      type: 'manual',
    }),
    prompt: '# PM Prompt\nBenchmark PM.',
    allowed_next_roles: Object.freeze(['pm', 'human']),
    gate: Object.freeze({
      id: 'planning_signoff',
      requires_files: Object.freeze(['.planning/PM_SIGNOFF.md']),
      requires_human_approval: true,
    }),
  }),
  implementation: Object.freeze({
    id: 'implementation',
    handler: 'implementation', // dispatched to the implementation branch of benchmarkCommand
    role: Object.freeze({
      id: 'dev',
      title: 'Developer',
      mandate: 'Implement and verify.',
      write_authority: 'authoritative',
    }),
    runtime: Object.freeze({
      id: 'manual-dev',
      type: 'manual',
    }),
    prompt: '# Dev Prompt\nBenchmark Dev.',
    allowed_next_roles: Object.freeze(['dev', 'qa', 'human']),
    gate: Object.freeze({
      id: 'implementation_complete',
      requires_files: Object.freeze(['.planning/IMPLEMENTATION_NOTES.md']),
      requires_human_approval: true,
    }),
  }),
  qa: Object.freeze({
    id: 'qa',
    handler: 'qa', // dispatched to the qa/run-completion branch of benchmarkCommand
    role: Object.freeze({
      id: 'qa',
      title: 'QA Reviewer',
      mandate: 'Challenge and approve.',
      write_authority: 'review_only',
    }),
    runtime: Object.freeze({
      id: 'manual-qa',
      type: 'manual',
    }),
    prompt: '# QA Prompt\nBenchmark QA.',
    allowed_next_roles: Object.freeze(['dev', 'qa', 'human']),
    gate: Object.freeze({
      id: 'qa_ship_verdict',
      requires_files: Object.freeze(['.planning/acceptance-matrix.md', '.planning/ship-verdict.md']),
      requires_human_approval: true,
    }),
  }),
});
88
+
89
/**
 * Deep-copies a JSON-serializable value via a stringify/parse round trip.
 * The copy drops any Object.freeze() applied to the source, which is exactly
 * what callers need when cloning the frozen builtin phase specs.
 */
function deepClone(value) {
  const serialized = JSON.stringify(value);
  return JSON.parse(serialized);
}
92
+
93
/**
 * Expands a benchmark workload into an ordered list of mutable phase specs.
 * Custom phases in the workload shadow the builtins of the same id. Throws
 * when a phase is unknown or duplicated, or when the ordering breaks the
 * planning → … implementation … → qa contract.
 */
function buildBenchmarkPhaseSpecs(workload) {
  const workloadId = workload?.id || 'unknown';
  const requestedOrder = workload?.phase_order;
  const phaseOrder = Array.isArray(requestedOrder) && requestedOrder.length > 0
    ? requestedOrder
    : ['planning', 'implementation', 'qa'];
  const customPhases = workload?.custom_phases || {};

  const phaseSpecs = phaseOrder.map((phaseId) => {
    const source = customPhases[phaseId] || BUILTIN_BENCHMARK_PHASES[phaseId];
    if (!source) {
      throw new Error(`Benchmark workload "${workloadId}" references unknown phase "${phaseId}".`);
    }
    return deepClone(source);
  });

  const phaseIds = phaseSpecs.map((phase) => phase.id);
  if (new Set(phaseIds).size !== phaseIds.length) {
    throw new Error(`Benchmark workload "${workloadId}" defines duplicate phases: ${phaseIds.join(', ')}.`);
  }
  if (phaseIds[0] !== 'planning') {
    throw new Error(`Benchmark workload "${workloadId}" must start with the planning phase.`);
  }
  if (!phaseIds.includes('implementation')) {
    throw new Error(`Benchmark workload "${workloadId}" must include the implementation phase.`);
  }
  if (phaseIds.at(-1) !== 'qa') {
    throw new Error(`Benchmark workload "${workloadId}" must end with the qa phase.`);
  }
  if (phaseIds.indexOf('implementation') > phaseIds.indexOf('qa')) {
    throw new Error(`Benchmark workload "${workloadId}" must place implementation before qa.`);
  }

  return phaseSpecs;
}
125
+
126
/**
 * Builds a complete governed agentxchain config covering the given phase
 * specs. Roles, runtimes, routing entries, and gates are derived one per
 * phase; the per-run budget grows by $1 for every phase beyond the
 * canonical three.
 */
function makeConfig(phaseSpecs) {
  const config = {
    schema_version: 4,
    protocol_mode: 'governed',
    project: { id: 'agentxchain-benchmark', name: 'AgentXchain Benchmark', goal: 'Governance compliance proof workload', default_branch: 'main' },
    roles: {},
    runtimes: {},
    routing: {},
    gates: {},
    budget: { per_turn_max_usd: 1.0, per_run_max_usd: 5.0 },
    rules: { challenge_required: true, max_turn_retries: 2, max_deadlock_cycles: 1 },
    files: {
      talk: 'TALK.md',
      history: '.agentxchain/history.jsonl',
      state: '.agentxchain/state.json',
    },
    compat: {
      next_owner_source: 'state-json',
      lock_based_coordination: false,
      original_version: 4,
    },
  };

  for (const { id, role, runtime, allowed_next_roles: allowedNextRoles, gate } of phaseSpecs) {
    config.roles[role.id] = {
      title: role.title,
      mandate: role.mandate,
      write_authority: role.write_authority,
      runtime: runtime.id,
      runtime_class: runtime.type,
      runtime_id: runtime.id,
    };
    config.runtimes[runtime.id] = { type: runtime.type };
    config.routing[id] = {
      entry_role: role.id,
      allowed_next_roles: allowedNextRoles.slice(),
      exit_gate: gate.id,
    };
    config.gates[gate.id] = {
      requires_files: gate.requires_files.slice(),
      requires_human_approval: gate.requires_human_approval,
    };
  }

  // $1 of extra per-run budget for every phase beyond the default three.
  const extraPhases = phaseSpecs.length - 3;
  if (extraPhases > 0) {
    config.budget.per_run_max_usd = 5.0 + extraPhases;
  }

  return config;
}
176
+
177
/**
 * Produces a canned schema-1.0 turn-result payload for a benchmark turn.
 * Verification always passes and cost is zeroed; `opts` overrides routing,
 * artifact metadata, objections, and the decision number.
 */
function makeTurnResult(runId, turnId, role, runtimeId, phase, opts = {}) {
  const decisionId = `DEC-${String(opts.decisionNum || 1).padStart(3, '0')}`;
  const result = {
    schema_version: '1.0',
    run_id: runId,
    turn_id: turnId,
    role,
    runtime_id: runtimeId,
    status: 'completed',
    summary: `Benchmark turn: ${role} in ${phase}`,
    decisions: [
      {
        id: decisionId,
        category: 'scope',
        statement: `Benchmark decision by ${role}`,
        rationale: 'Governance compliance proof.',
      },
    ],
    objections: opts.objections || [],
    files_changed: opts.files_changed || [],
    artifacts_created: [],
    verification: { status: 'pass', commands: [], evidence_summary: 'Benchmark verification.', machine_evidence: [] },
    artifact: { type: opts.artifact_type || 'review', ref: null },
    proposed_next_role: opts.proposed_next_role || 'human',
    phase_transition_request: opts.phase_transition_request || null,
    run_completion_request: opts.run_completion_request || false,
    needs_human_reason: null,
    cost: { input_tokens: 0, output_tokens: 0, usd: 0 },
  };
  return result;
}
199
+
200
/**
 * Lays down a minimal governed project on disk at `root`: the config file,
 * an idle state.json, empty history/ledger journals, one prompt file per
 * unique role, and the planning artifacts the first gate will inspect.
 */
function scaffoldProject(root, config, phaseSpecs) {
  const at = (relativePath) => join(root, relativePath);

  writeFileSync(at('agentxchain.json'), JSON.stringify(config, null, 2));
  mkdirSync(at('.agentxchain/prompts'), { recursive: true });
  mkdirSync(at('.planning'), { recursive: true });

  const initialState = {
    schema_version: '1.1',
    project_id: config.project.id,
    status: 'idle',
    phase: 'planning',
    run_id: null,
    active_turns: {},
    next_role: null,
    pending_phase_transition: null,
    pending_run_completion: null,
    blocked_on: null,
    blocked_reason: null,
  };
  writeFileSync(at('.agentxchain/state.json'), JSON.stringify(initialState, null, 2));

  writeFileSync(at('.agentxchain/history.jsonl'), '');
  writeFileSync(at('.agentxchain/decision-ledger.jsonl'), '');

  // One prompt file per role; phases sharing a role keep the first prompt,
  // and phases with an empty prompt are skipped.
  const writtenPrompts = new Set();
  for (const { role, prompt } of phaseSpecs) {
    const promptPath = join(root, '.agentxchain/prompts', `${role.id}.md`);
    if (!prompt || writtenPrompts.has(promptPath)) continue;
    writeFileSync(promptPath, `${prompt}\n`);
    writtenPrompts.add(promptPath);
  }

  writeFileSync(at('TALK.md'), '# Benchmark Log\n');
  // Sign-off starts unapproved; the planning turn later flips it to YES.
  writeFileSync(at('.planning/PM_SIGNOFF.md'), '# PM Planning Sign-Off\n\nApproved: NO\n');
  writeFileSync(at('.planning/ROADMAP.md'), '# Roadmap\n\n## Wave 1\n\n### Phase: Planning\n');
}
232
+
233
/**
 * Initializes a throwaway git repo at `root` with a fixed benchmark
 * identity and commits the scaffolded files as the initial commit.
 */
function gitInit(root) {
  const commands = [
    'git init',
    'git config user.email "benchmark@agentxchain.dev"',
    'git config user.name "AgentXchain Benchmark"',
    'git add -A',
    'git commit -m "benchmark: scaffold"',
  ];
  for (const command of commands) {
    execSync(command, { cwd: root, stdio: 'ignore' });
  }
}
240
+
241
/**
 * Stages everything and creates a (possibly empty) commit with `message`.
 *
 * The message is passed to git on stdin via `-F -` instead of being
 * interpolated into the shell command: workload-supplied commit messages
 * (execution.commit_message etc.) may contain quotes or backticks, which
 * would previously break out of the double-quoted `-m "..."` argument.
 *
 * @param {string} root - Working directory of the benchmark git repo.
 * @param {string} message - Commit message (used verbatim).
 */
function gitCommit(root, message) {
  execSync('git add -A', { cwd: root, stdio: 'ignore' });
  execSync('git commit --allow-empty -F -', {
    cwd: root,
    input: message,
    stdio: ['pipe', 'ignore', 'ignore'],
  });
}
245
+
246
/**
 * Writes `result` to the turn's staging location
 * (.agentxchain/staging/<turnId>/turn-result.json), creating the directory
 * hierarchy as needed.
 */
function stageTurnResult(root, turnId, result) {
  const stagingDir = join(root, '.agentxchain/staging', turnId);
  mkdirSync(stagingDir, { recursive: true });
  const payload = JSON.stringify(result, null, 2);
  writeFileSync(join(stagingDir, 'turn-result.json'), payload);
}
251
+
252
/**
 * Tallies one turn in `metrics`: bumps the total and the per-phase count,
 * then the accepted/rejected counter matching `outcome`. Any other outcome
 * only affects the totals.
 */
function recordTurn(metrics, phase, outcome = 'accepted') {
  const { turns } = metrics;
  turns.total += 1;
  turns.per_phase[phase] = (turns.per_phase[phase] ?? 0) + 1;
  switch (outcome) {
    case 'accepted':
      turns.accepted += 1;
      break;
    case 'rejected':
      turns.rejected += 1;
      break;
    default:
      break;
  }
}
261
+
262
/**
 * Builds a deliberately invalid first-attempt payload for the retry stress
 * path: a normal implementation turn result with its schema_version field
 * removed, so staged-result validation must reject it.
 */
function makeInvalidRetryResult(runId, turnId, role, runtimeId, phase) {
  const { schema_version: _omitted, ...invalid } = makeTurnResult(runId, turnId, role, runtimeId, phase, {
    files_changed: ['benchmark-module.js'],
    artifact_type: 'commit',
    proposed_next_role: 'dev',
    phase_transition_request: null,
    decisionNum: 2,
  });
  return invalid;
}
273
+
274
/**
 * Tallies one gate evaluation in `metrics`; outcomes other than
 * 'passed'/'failed' only bump the evaluated count.
 */
function recordGateEvaluation(metrics, outcome) {
  const { gates } = metrics;
  gates.evaluated += 1;
  if (outcome === 'passed') {
    gates.passed += 1;
  } else if (outcome === 'failed') {
    gates.failed += 1;
  }
}
282
+
283
/** Returns the id of the phase after `index`, or null at the end of the list. */
function getNextPhaseId(phaseSpecs, index) {
  const next = phaseSpecs[index + 1];
  return (next && next.id) || null;
}
286
+
287
/** Counts governance artifacts produced by a turn: decisions plus files changed. */
function countExecutionArtifacts(phaseResult) {
  const { decisions, files_changed: filesChanged } = phaseResult;
  return decisions.length + filesChanged.length;
}
290
+
291
/**
 * Runs one workload-defined ("generic" handler) phase end-to-end: assign a
 * turn, stage a canned turn result, write/commit any declared files, accept
 * the turn, then approve the phase's exit gate. Mutates `metrics` (turns,
 * artifacts, gates, phases) and throws on the first failing step.
 *
 * assignTurn/acceptTurn/approvePhaseGate are injected by benchmarkCommand
 * from the dynamically imported ../lib/runner-interface.js module.
 */
async function executeGenericPhase({
  root,
  config,
  runId,
  phaseSpec,
  nextPhaseId,
  metrics,
  assignTurn,
  acceptTurn,
  approvePhaseGate,
}) {
  const assignResult = assignTurn(root, config, phaseSpec.role.id);
  if (!assignResult.ok) {
    throw new Error(`${phaseSpec.role.title} assign failed: ${assignResult.error}`);
  }
  const turnId = assignResult.turn.turn_id;
  const execution = phaseSpec.execution || {};
  // An explicit files_changed list wins; otherwise derive it from the paths
  // of files_to_write; otherwise report no file changes.
  const filesChanged = Array.isArray(execution.files_changed)
    ? [...execution.files_changed]
    : Array.isArray(execution.files_to_write)
      ? execution.files_to_write.map((file) => file.path)
      : [];
  const phaseResult = makeTurnResult(runId, turnId, phaseSpec.role.id, phaseSpec.runtime.id, phaseSpec.id, {
    files_changed: filesChanged,
    artifact_type: execution.artifact_type || 'commit',
    proposed_next_role: execution.proposed_next_role || 'human',
    phase_transition_request: nextPhaseId,
    decisionNum: execution.decision_num || 1,
    objections: execution.objections || [],
  });
  stageTurnResult(root, turnId, phaseResult);

  // Materialize declared files before committing so the gate's
  // requires_files checks can see them.
  for (const file of execution.files_to_write || []) {
    writeFileSync(join(root, file.path), file.content);
  }
  gitCommit(root, execution.commit_message || `benchmark: ${phaseSpec.id}`);

  const acceptResult = acceptTurn(root, config);
  if (!acceptResult.ok) {
    throw new Error(`${phaseSpec.role.title} accept failed: ${acceptResult.error}`);
  }
  gitCommit(root, execution.accept_commit_message || `benchmark: accept ${phaseSpec.role.id}`);

  recordTurn(metrics, phaseSpec.id, 'accepted');
  metrics.artifacts.total += countExecutionArtifacts(phaseResult);

  const gateResult = approvePhaseGate(root, config);
  if (!gateResult.ok) {
    // Count the failed evaluation inline (recordGateEvaluation is used only
    // on the success path here) before surfacing the error.
    metrics.gates.evaluated++;
    metrics.gates.failed++;
    throw new Error(`${phaseSpec.role.title} gate failed: ${gateResult.error}`);
  }
  recordGateEvaluation(metrics, 'passed');
  metrics.phases.completed++;
  metrics.phases.names.push(phaseSpec.id);
  gitCommit(root, execution.gate_commit_message || `benchmark: ${phaseSpec.id} gate`);
}
348
+
349
/**
 * Reports a pre-run benchmark failure (bad workload selection or bad phase
 * specs) and sets a non-zero exit code. JSON mode prints a machine-readable
 * payload to stdout; otherwise a red console message plus, when available,
 * the list of valid workload ids.
 */
function failEarlyBenchmark(jsonMode, error, validWorkloads = []) {
  if (jsonMode) {
    const payload = {
      version: '1.0',
      result: 'fail',
      error,
      valid_workloads: validWorkloads,
    };
    process.stdout.write(JSON.stringify(payload, null, 2) + '\n');
  } else {
    console.error(chalk.red(`\n Benchmark FAIL: ${error}\n`));
    if (validWorkloads.length > 0) {
      console.error(` Valid workloads: ${validWorkloads.join(', ')}\n`);
    }
  }
  process.exitCode = 1;
}
366
+
367
/**
 * Verifies the observed metrics match the workload's declared failure
 * signals: rejected turns and failed gate evaluations must occur if and
 * only if the workload expects them. Throws an Error on any mismatch.
 */
function assertExpectedWorkloadSignals(workload, metrics) {
  const rejectedTurns = metrics.turns.rejected;
  const failedGates = metrics.gates.failed;

  if (workload.rejected_turn_expected) {
    if (rejectedTurns < 1) {
      throw new Error(`Workload "${workload.id}" expected at least one rejected turn, but none were observed.`);
    }
  } else if (rejectedTurns > 0) {
    throw new Error(`Workload "${workload.id}" does not allow rejected turns, but ${rejectedTurns} were observed.`);
  }

  if (workload.gate_failure_expected) {
    if (failedGates < 1) {
      throw new Error(`Workload "${workload.id}" expected at least one failed gate evaluation, but none were observed.`);
    }
  } else if (failedGates > 0) {
    throw new Error(`Workload "${workload.id}" does not allow failed gate evaluations, but ${failedGates} were observed.`);
  }
}
381
+
382
/**
 * Builds the run export for `root` and verifies the resulting artifact.
 * Returns { ok, error?, exportArtifact?, verificationReport? }: a failed
 * build carries only the error; a failed verification also carries the
 * artifact and the (partial) report for persistence.
 */
async function buildAndVerifyRunExport(root) {
  const { buildRunExport } = await import('../lib/export.js');

  const exportResult = buildRunExport(root);
  if (!exportResult.ok) {
    return { ok: false, error: exportResult.error };
  }

  const exportArtifact = exportResult.export;
  const verification = verifyExportArtifact(exportArtifact);
  const verificationReport = verification.report;
  if (!verification.ok) {
    return {
      ok: false,
      error: verification.errors.join('; '),
      exportArtifact,
      verificationReport,
    };
  }

  return { ok: true, exportArtifact, verificationReport };
}
408
+
409
/**
 * Maps an output directory to the fixed set of proof-artifact file paths,
 * or returns null when no directory was requested.
 */
function buildProofArtifactPaths(outputDir) {
  if (!outputDir) {
    return null;
  }
  const inDir = (fileName) => join(outputDir, fileName);
  return {
    directory: outputDir,
    metrics: inDir('metrics.json'),
    export: inDir('run-export.json'),
    verify_export: inDir('verify-export.json'),
    workload: inDir('workload.json'),
  };
}
419
+
420
/**
 * Writes the benchmark proof artifacts (metrics, workload descriptor, and —
 * when available — the export artifact and its verification report) into
 * the proof directory. No-op when no proof paths were requested.
 */
function persistBenchmarkArtifacts(paths, metrics, workload, exportArtifact, verificationReport) {
  if (!paths) {
    return;
  }
  mkdirSync(paths.directory, { recursive: true });
  const writeJson = (filePath, value) => {
    writeFileSync(filePath, JSON.stringify(value, null, 2) + '\n');
  };
  writeJson(paths.metrics, metrics);
  writeJson(paths.workload, workload);
  if (exportArtifact) {
    writeJson(paths.export, exportArtifact);
  }
  if (verificationReport) {
    writeJson(paths.verify_export, verificationReport);
  }
}
432
+
433
/**
 * `agentxchain benchmark` CLI entry point: runs one governed delivery
 * lifecycle in a throwaway temp directory using canned turn results, then
 * reports governance metrics (admission control, turns, gates, artifacts,
 * export verification) and optionally persists them as proof artifacts.
 *
 * @param {object} [opts] - `json` for machine-readable output, `output` for
 *   a proof-artifact directory, plus workload selectors consumed by
 *   resolveBenchmarkWorkload().
 * @returns {Promise<void>} Never throws to the caller; failures set
 *   process.exitCode = 1 and are reported on stdout/stderr.
 */
export async function benchmarkCommand(opts = {}) {
  const jsonMode = opts.json || false;
  const workloadResolution = resolveBenchmarkWorkload(opts);
  if (!workloadResolution.ok) {
    failEarlyBenchmark(jsonMode, workloadResolution.error, workloadResolution.valid_workloads || []);
    return;
  }
  const benchmarkWorkload = workloadResolution.workload;
  let phaseSpecs;
  try {
    phaseSpecs = buildBenchmarkPhaseSpecs(benchmarkWorkload);
  } catch (error) {
    failEarlyBenchmark(jsonMode, error.message, workloadResolution.valid_workloads || []);
    return;
  }
  const startTime = Date.now();
  const outputDir = opts.output ? resolve(String(opts.output)) : null;
  const proofArtifactPaths = buildProofArtifactPaths(outputDir);

  // Isolated scratch project; removed in the finally block below.
  const root = join(tmpdir(), `agentxchain-benchmark-${randomBytes(6).toString('hex')}`);
  mkdirSync(root, { recursive: true });

  // Pessimistic defaults: every signal starts at 'fail' and is flipped to
  // 'pass' only after the corresponding step succeeds.
  const metrics = {
    version: '1.0',
    workload: benchmarkWorkload.id,
    mode: benchmarkWorkload.id,
    selected_via: workloadResolution.selected_via,
    result: 'fail',
    phases: { completed: 0, total: 3, names: [] }, // total corrected once specs are known
    turns: { total: 0, accepted: 0, rejected: 0, per_phase: {} },
    gates: { evaluated: 0, passed: 0, failed: 0 },
    artifacts: { total: 0 },
    admission_control: 'fail',
    export_verification: 'fail',
    proof_artifacts: proofArtifactPaths,
    elapsed_ms: 0,
    error: null,
  };
  // Workload descriptor persisted alongside the metrics as a proof artifact.
  const workload = {
    version: '1.0',
    workload: benchmarkWorkload.id,
    mode: benchmarkWorkload.id,
    label: benchmarkWorkload.label,
    description: benchmarkWorkload.description,
    selected_via: workloadResolution.selected_via,
    run_id: null,
    project_id: 'agentxchain-benchmark',
    expected_phase_order: null, // set after config is built
    rejected_turn_expected: benchmarkWorkload.rejected_turn_expected,
    gate_failure_expected: benchmarkWorkload.gate_failure_expected,
    recovery_branch: benchmarkWorkload.recovery_branch,
    proof_artifacts: proofArtifactPaths,
  };
  let exportArtifact = null;
  let verificationReport = null;

  try {
    // Fail fast when git is unavailable — every turn below is committed.
    execSync('git --version', { stdio: 'ignore' });

    const {
      loadState,
      initRun,
      assignTurn,
      acceptTurn,
      rejectTurn,
      approvePhaseGate,
      approveCompletionGate,
    } = await import('../lib/runner-interface.js');

    if (!jsonMode) {
      console.log('');
      console.log(chalk.bold(' AgentXchain Benchmark — Governed Delivery Compliance'));
      console.log(chalk.dim(' ' + '─'.repeat(54)));
      console.log('');
      console.log(` Workload ${benchmarkWorkload.id === 'baseline' ? chalk.green(benchmarkWorkload.label.toUpperCase()) : chalk.yellow(benchmarkWorkload.label.toUpperCase())}`);
      console.log('');
    }

    // ── Scaffold ──────────────────────────────────────────────────────────
    const config = makeConfig(phaseSpecs);
    scaffoldProject(root, config, phaseSpecs);
    gitInit(root);

    // Adjust metrics and workload metadata for actual phase count
    const phaseNames = phaseSpecs.map((phase) => phase.id);
    metrics.phases.total = phaseNames.length;
    workload.expected_phase_order = phaseNames;

    // ── Admission Control ────────────────────────────────────────────────
    // NOTE(review): config is passed as both arguments — presumably the
    // declared and effective configs per admission-control.js; confirm the
    // expected signature.
    const admission = runAdmissionControl(config, config);
    metrics.admission_control = admission.ok ? 'pass' : 'fail';
    if (!admission.ok) {
      throw new Error(`Admission control failed: ${admission.errors.join('; ')}`);
    }

    // ── Init run ──────────────────────────────────────────────────────────
    const runResult = initRun(root, config);
    if (!runResult.ok) throw new Error(`initRun failed: ${runResult.error}`);
    const runId = runResult.state.run_id;
    workload.run_id = runId;

    // ── Planning Phase ───────────────────────────────────────────────────
    // Planning is always handled inline (phaseSpecs[0] is guaranteed to be
    // planning by buildBenchmarkPhaseSpecs); later phases dispatch by handler.
    const pmAssign = assignTurn(root, config, 'pm');
    if (!pmAssign.ok) throw new Error(`PM assign failed: ${pmAssign.error}`);
    const pmTurnId = pmAssign.turn.turn_id;

    const phaseAfterPlanning = getNextPhaseId(phaseSpecs, 0) || 'implementation';
    const pmResult = makeTurnResult(runId, pmTurnId, 'pm', 'manual-pm', 'planning', {
      proposed_next_role: 'human',
      phase_transition_request: phaseAfterPlanning,
      decisionNum: 1,
      objections: [{ id: 'OBJ-001', severity: 'medium', statement: 'Benchmark scope challenge: verify edge case handling.', status: 'raised' }],
    });
    stageTurnResult(root, pmTurnId, pmResult);

    // Flip the sign-off to YES so the planning gate's requires_files check passes.
    writeFileSync(join(root, '.planning/PM_SIGNOFF.md'), '# PM Planning Sign-Off\n\nApproved: YES\n');
    gitCommit(root, 'benchmark: pm planning');

    const pmAccept = acceptTurn(root, config);
    if (!pmAccept.ok) throw new Error(`PM accept failed: ${pmAccept.error}`);
    gitCommit(root, 'benchmark: accept pm');

    recordTurn(metrics, 'planning', 'accepted');
    metrics.artifacts.total += pmResult.decisions.length;

    // Planning gate
    const planGate = approvePhaseGate(root, config);
    if (!planGate.ok) {
      metrics.gates.evaluated++;
      metrics.gates.failed++;
      throw new Error(`Planning gate failed: ${planGate.error}`);
    }
    recordGateEvaluation(metrics, 'passed');
    metrics.phases.completed++;
    metrics.phases.names.push('planning');
    gitCommit(root, 'benchmark: planning gate');

    // Remaining phases dispatch on their handler: generic (workload-defined),
    // implementation (with optional reject/retry stress), or qa (with
    // optional completion-gate failure and recovery).
    for (let phaseIndex = 1; phaseIndex < phaseSpecs.length; phaseIndex += 1) {
      const phaseSpec = phaseSpecs[phaseIndex];
      const nextPhaseId = getNextPhaseId(phaseSpecs, phaseIndex);

      if (phaseSpec.handler === 'generic') {
        await executeGenericPhase({
          root,
          config,
          runId,
          phaseSpec,
          nextPhaseId,
          metrics,
          assignTurn,
          acceptTurn,
          approvePhaseGate,
        });
        continue;
      }

      if (phaseSpec.handler === 'implementation') {
        const devAssign = assignTurn(root, config, phaseSpec.role.id);
        if (!devAssign.ok) throw new Error(`Dev assign failed: ${devAssign.error}`);
        const devTurnId = devAssign.turn.turn_id;

        // Stress path: stage a result missing schema_version, confirm the
        // validator rejects it, and record a governed retry.
        if (benchmarkWorkload.implementation.reject_invalid_first_attempt) {
          const invalidDevResult = makeInvalidRetryResult(runId, devTurnId, phaseSpec.role.id, phaseSpec.runtime.id, phaseSpec.id);
          stageTurnResult(root, devTurnId, invalidDevResult);

          const validation = validateStagedTurnResult(root, loadState(root, config), config, {
            stagingPath: getTurnStagingResultPath(devTurnId),
          });
          if (validation.ok) {
            throw new Error('Benchmark stress mode expected the first implementation attempt to fail validation.');
          }

          const rejectResult = rejectTurn(root, config, validation, 'Benchmark stress: reject invalid implementation attempt');
          if (!rejectResult.ok) {
            throw new Error(`Dev reject failed: ${rejectResult.error}`);
          }

          recordTurn(metrics, phaseSpec.id, 'rejected');
        }

        const devResult = makeTurnResult(runId, devTurnId, phaseSpec.role.id, phaseSpec.runtime.id, phaseSpec.id, {
          files_changed: ['benchmark-module.js', '.planning/IMPLEMENTATION_NOTES.md'],
          artifact_type: 'commit',
          proposed_next_role: nextPhaseId === 'qa' ? 'qa' : 'human',
          phase_transition_request: nextPhaseId,
          decisionNum: 2,
        });
        stageTurnResult(root, devTurnId, devResult);

        writeFileSync(join(root, 'benchmark-module.js'), '// benchmark implementation artifact\nmodule.exports = { ok: true };\n');
        writeFileSync(join(root, '.planning/IMPLEMENTATION_NOTES.md'), '# Implementation Notes\n\n## Changes\n\n- Benchmark implementation artifact created\n\n## Verification\n\n- All assertions pass\n');
        gitCommit(root, 'benchmark: dev implementation');

        const devAccept = acceptTurn(root, config);
        if (!devAccept.ok) throw new Error(`Dev accept failed: ${devAccept.error}`);
        gitCommit(root, 'benchmark: accept dev');

        recordTurn(metrics, phaseSpec.id, 'accepted');
        metrics.artifacts.total += countExecutionArtifacts(devResult);

        const implGate = approvePhaseGate(root, config);
        if (!implGate.ok) {
          metrics.gates.evaluated++;
          metrics.gates.failed++;
          throw new Error(`Implementation gate failed: ${implGate.error}`);
        }
        recordGateEvaluation(metrics, 'passed');
        metrics.phases.completed++;
        metrics.phases.names.push(phaseSpec.id);
        gitCommit(root, 'benchmark: implementation gate');
        continue;
      }

      if (phaseSpec.handler === 'qa') {
        const qaAssign = assignTurn(root, config, phaseSpec.role.id);
        if (!qaAssign.ok) throw new Error(`QA assign failed: ${qaAssign.error}`);
        const qaTurnId = qaAssign.turn.turn_id;

        const qaResult = makeTurnResult(runId, qaTurnId, phaseSpec.role.id, phaseSpec.runtime.id, phaseSpec.id, {
          proposed_next_role: 'human',
          run_completion_request: true,
          decisionNum: 3,
          objections: [{ id: 'OBJ-002', severity: 'low', statement: 'Benchmark QA challenge: verify compliance coverage.', status: 'raised' }],
        });
        stageTurnResult(root, qaTurnId, qaResult);

        writeFileSync(join(root, '.planning/acceptance-matrix.md'), '# Acceptance Matrix\n\n| Req # | Requirement | Status |\n|-------|-------------|--------|\n| 1 | Governance compliance | PASS |\n');
        // Stress workloads withhold the ship verdict here to force a
        // run-completion gate failure and exercise the recovery path below.
        if (!benchmarkWorkload.qa.missing_completion_files.includes('.planning/ship-verdict.md')) {
          writeFileSync(join(root, '.planning/ship-verdict.md'), '# Ship Verdict\n\n## Verdict: SHIP\n');
        }
        gitCommit(root, 'benchmark: qa review');

        const qaAccept = acceptTurn(root, config);
        if (!qaAccept.ok) throw new Error(`QA accept failed: ${qaAccept.error}`);
        gitCommit(root, 'benchmark: accept qa');

        recordTurn(metrics, phaseSpec.id, 'accepted');
        metrics.artifacts.total += qaResult.decisions.length;

        if (benchmarkWorkload.qa.fail_completion_once) {
          // The accept above should have recorded a run-completion gate
          // failure in state; verify it names every withheld artifact.
          const failedCompletionState = loadState(root, config);
          const gateFailure = failedCompletionState?.last_gate_failure;
          if (!gateFailure || gateFailure.gate_type !== 'run_completion') {
            throw new Error(`Workload "${benchmarkWorkload.id}" expected a run-completion gate failure after the first QA turn.`);
          }
          const missingFiles = Array.isArray(gateFailure.missing_files) ? gateFailure.missing_files : [];
          for (const requiredPath of benchmarkWorkload.qa.missing_completion_files) {
            if (!missingFiles.includes(requiredPath)) {
              throw new Error(`Workload "${benchmarkWorkload.id}" expected missing completion artifact "${requiredPath}", but observed: ${missingFiles.join(', ') || 'none'}.`);
            }
          }
          recordGateEvaluation(metrics, 'failed');

          // Recovery turn: restore the withheld files, then re-request completion.
          const qaRecoveryAssign = assignTurn(root, config, benchmarkWorkload.qa.recovery_role);
          if (!qaRecoveryAssign.ok) {
            throw new Error(`QA recovery assign failed: ${qaRecoveryAssign.error}`);
          }
          const qaRecoveryTurnId = qaRecoveryAssign.turn.turn_id;
          const qaRecoveryResult = makeTurnResult(runId, qaRecoveryTurnId, benchmarkWorkload.qa.recovery_role, phaseSpec.runtime.id, phaseSpec.id, {
            proposed_next_role: 'human',
            run_completion_request: true,
            decisionNum: 4,
            objections: [{ id: 'OBJ-003', severity: 'medium', statement: 'Benchmark QA recovery: restore the missing ship verdict before completion.', status: 'raised' }],
          });
          stageTurnResult(root, qaRecoveryTurnId, qaRecoveryResult);

          for (const requiredPath of benchmarkWorkload.qa.missing_completion_files) {
            writeFileSync(join(root, requiredPath), '# Ship Verdict\n\n## Verdict: SHIP\n');
          }
          gitCommit(root, 'benchmark: qa recovery');

          const qaRecoveryAccept = acceptTurn(root, config);
          if (!qaRecoveryAccept.ok) {
            throw new Error(`QA recovery accept failed: ${qaRecoveryAccept.error}`);
          }
          gitCommit(root, 'benchmark: accept qa recovery');

          recordTurn(metrics, phaseSpec.id, 'accepted');
          metrics.artifacts.total += qaRecoveryResult.decisions.length + benchmarkWorkload.qa.missing_completion_files.length;
        }

        const completionResult = approveCompletionGate(root, config);
        if (!completionResult.ok) throw new Error(`Completion failed: ${completionResult.error}`);

        recordGateEvaluation(metrics, 'passed');
        metrics.phases.completed++;
        metrics.phases.names.push(phaseSpec.id);
        continue;
      }
      throw new Error(`Benchmark workload "${benchmarkWorkload.id}" uses unsupported phase handler "${phaseSpec.handler}" for phase "${phaseSpec.id}".`);
    }

    // ── Export Verification ──────────────────────────────────────────────
    const exportVerification = await buildAndVerifyRunExport(root);
    if (!exportVerification.ok) {
      metrics.export_verification = 'fail';
      throw new Error(`Export verification failed: ${exportVerification.error}`);
    }
    exportArtifact = exportVerification.exportArtifact;
    verificationReport = exportVerification.verificationReport;
    metrics.export_verification = 'pass';

    assertExpectedWorkloadSignals(benchmarkWorkload, metrics);

    // ── Done ─────────────────────────────────────────────────────────────
    metrics.result = 'pass';
    metrics.elapsed_ms = Date.now() - startTime;
    persistBenchmarkArtifacts(proofArtifactPaths, metrics, workload, exportArtifact, verificationReport);

    if (jsonMode) {
      process.stdout.write(JSON.stringify(metrics, null, 2) + '\n');
    } else {
      console.log(` Phases completed ${chalk.green(`${metrics.phases.completed}/${metrics.phases.total}`)} (${metrics.phases.names.join(' → ')})`);
      console.log(` Turns executed ${chalk.bold(String(metrics.turns.total))} (${metrics.turns.accepted} accepted, ${metrics.turns.rejected} rejected; ${Object.entries(metrics.turns.per_phase).map(([p, n]) => `${n} ${p}`).join(', ')})`);
      console.log(` Gate evaluations ${chalk.green(`${metrics.gates.passed}/${metrics.gates.evaluated}`)} passed`);
      console.log(` Artifacts produced ${chalk.bold(String(metrics.artifacts.total))}`);
      console.log(` Admission control ${chalk.green('PASS')}`);
      console.log(` Export verification ${metrics.export_verification === 'pass' ? chalk.green('PASS') : chalk.red('FAIL')}`);
      if (proofArtifactPaths) {
        console.log(` Proof artifacts ${chalk.dim(proofArtifactPaths.directory)}`);
      }
      console.log(` Elapsed ${chalk.dim((metrics.elapsed_ms / 1000).toFixed(1) + 's')}`);
      console.log('');
      console.log(` Result: ${chalk.green.bold('PASS')} ${chalk.green('✓')}`);
      console.log('');
    }
  } catch (err) {
    // Any lifecycle failure lands here: record it, still persist whatever
    // proof artifacts exist, and signal failure via the exit code.
    metrics.result = 'fail';
    metrics.error = err.message;
    metrics.elapsed_ms = Date.now() - startTime;
    persistBenchmarkArtifacts(proofArtifactPaths, metrics, workload, exportArtifact, verificationReport);

    if (jsonMode) {
      process.stdout.write(JSON.stringify(metrics, null, 2) + '\n');
    } else {
      console.error(chalk.red(`\n Benchmark FAIL: ${err.message}\n`));
    }

    process.exitCode = 1;
  } finally {
    // Best-effort cleanup of the scratch project directory.
    try { rmSync(root, { recursive: true, force: true }); } catch {}
  }
}