@fusionkit/ensemble 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. package/dist/agent.d.ts +21 -0
  2. package/dist/agent.js +186 -0
  3. package/dist/artifacts.d.ts +21 -0
  4. package/dist/artifacts.js +36 -0
  5. package/dist/claude-code.d.ts +25 -0
  6. package/dist/claude-code.js +398 -0
  7. package/dist/codex.d.ts +69 -0
  8. package/dist/codex.js +467 -0
  9. package/dist/command.d.ts +15 -0
  10. package/dist/command.js +82 -0
  11. package/dist/dashboard.d.ts +62 -0
  12. package/dist/dashboard.js +788 -0
  13. package/dist/external-executor.d.ts +56 -0
  14. package/dist/external-executor.js +288 -0
  15. package/dist/harness.d.ts +337 -0
  16. package/dist/harness.js +1 -0
  17. package/dist/index.d.ts +30 -0
  18. package/dist/index.js +15 -0
  19. package/dist/isolation.d.ts +25 -0
  20. package/dist/isolation.js +509 -0
  21. package/dist/judge.d.ts +77 -0
  22. package/dist/judge.js +16 -0
  23. package/dist/mock.d.ts +20 -0
  24. package/dist/mock.js +56 -0
  25. package/dist/run.d.ts +5 -0
  26. package/dist/run.js +520 -0
  27. package/dist/synthesis.d.ts +25 -0
  28. package/dist/synthesis.js +221 -0
  29. package/dist/test/codex.test.d.ts +1 -0
  30. package/dist/test/codex.test.js +237 -0
  31. package/dist/test/dashboard.test.d.ts +1 -0
  32. package/dist/test/dashboard.test.js +214 -0
  33. package/dist/test/ensemble.test.d.ts +1 -0
  34. package/dist/test/ensemble.test.js +780 -0
  35. package/dist/test/external-executor.test.d.ts +1 -0
  36. package/dist/test/external-executor.test.js +273 -0
  37. package/dist/test/isolation.test.d.ts +1 -0
  38. package/dist/test/isolation.test.js +359 -0
  39. package/dist/test/tool-executor.test.d.ts +1 -0
  40. package/dist/test/tool-executor.test.js +113 -0
  41. package/dist/test/unified.test.d.ts +1 -0
  42. package/dist/test/unified.test.js +150 -0
  43. package/dist/tool-executor.d.ts +14 -0
  44. package/dist/tool-executor.js +156 -0
  45. package/dist/trace.d.ts +8 -0
  46. package/dist/trace.js +7 -0
  47. package/dist/unified.d.ts +101 -0
  48. package/dist/unified.js +422 -0
  49. package/dist/worktree.d.ts +25 -0
  50. package/dist/worktree.js +75 -0
  51. package/package.json +35 -0
@@ -0,0 +1,780 @@
1
+ import assert from "node:assert/strict";
2
+ import { existsSync, mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from "node:fs";
3
+ import { tmpdir } from "node:os";
4
+ import { join } from "node:path";
5
+ import { test } from "node:test";
6
+ import { assertJudgeSynthesisRecordV1, assertHarnessCandidateRecordV1, assertHarnessRunRequestV1, assertHarnessRunResultV1, MODEL_FUSION_SCHEMA_BUNDLE_HASH, requestHash, responseHash } from "@fusionkit/protocol";
7
+ import { gitText } from "@fusionkit/workspace";
8
+ import { claudeCodeHarness, claudeCodeHarnessCredentialSkipReason } from "../claude-code.js";
9
+ import { createCommandHarness } from "../command.js";
10
+ import { codexConfigToml, codexHarness, codexHarnessCredentialSkipReason } from "../codex.js";
11
+ import { createMockJudgeSynthesizer } from "../judge.js";
12
+ import { createMockHarness } from "../mock.js";
13
+ import { runEnsemble } from "../run.js";
14
/**
 * Descriptor fields shared by every test in this file.
 * `descriptor()` spreads these values and then applies a harness and any
 * per-test overrides on top.
 */
const BASE_DESCRIPTOR = {
  id: "ensemble_test",
  models: [
    { id: "fast", model: "fake-fast" },
    { id: "writer", model: "fake-writer" },
  ],
  runtime: { id: "local" },
  judge: { id: "judge", model: "fake-judge" },
  policy: {
    id: "policy",
    allowedTools: ["read_file"],
    sideEffects: "read_only",
    timeoutMs: 1000,
  },
  prompt: "Summarize model-fusion evidence.",
  sourceRepo: "handoffkit",
  // Placeholder commit id: forty "a" characters.
  baseGitSha: "a".repeat(40),
};
32
/**
 * Builds the descriptor object the tests pass to `runEnsemble`.
 *
 * A new mock harness is created on every call and used unless `overrides`
 * carries its own `harness`; every other key in `overrides` likewise wins
 * over the shared base value.
 *
 * @param {object} [overrides] - Fields that take precedence over the defaults.
 * @returns {object} A fresh merged descriptor.
 */
function descriptor(overrides = {}) {
  const defaults = { ...BASE_DESCRIPTOR, harness: createMockHarness() };
  return { ...defaults, ...overrides };
}
39
/**
 * Creates a `model-call-record.v1` fixture for a single succeeded call.
 *
 * @param {string} callId - Stored as `call_id`.
 * @param {string} [model="fake-fast"] - Stored as `model`.
 * @returns {object} The record fixture.
 */
function modelCallRecord(callId, model = "fake-fast") {
  // The call starts at the same instant the record is created and lasts 10 ms.
  const startedAt = "2026-06-16T00:00:00.000Z";
  const finishedAt = "2026-06-16T00:00:00.010Z";
  const record = {
    schema: "model-call-record.v1",
    schema_version: "v1",
    schema_bundle_hash: MODEL_FUSION_SCHEMA_BUNDLE_HASH,
    producer: "ensemble-test",
    producer_version: "0.1.0",
    producer_git_sha: "0".repeat(40),
    created_at: startedAt,
    call_id: callId,
    endpoint_id: "test-endpoint",
    model,
    request_hash: requestHash({ prompt: "test" }),
    response_hash: responseHash({ output: "ok" }),
    // The message content is a hash of the text, not the text itself.
    messages: [{ role: "user", content: requestHash("test") }],
    status: "succeeded",
    side_effects: "none",
    started_at: startedAt,
    finished_at: finishedAt,
    latency_ms: 10,
    metadata: { unknown_usage: true, unknown_cost: true },
  };
  return record;
}
62
/**
 * Creates a throwaway git repository containing one commit on `main`.
 *
 * The repository is placed at `<tmp>/ensemble-repo-XXXXXX/repo`. `outputRoot`
 * is a sibling `out` path under the same temporary root; it is only computed
 * here, not created.
 *
 * @returns {{ repo: string, outputRoot: string, head: string, cleanup: () => void }}
 *   The repository path, the output path, the HEAD commit id, and a callback
 *   that recursively removes the whole temporary root.
 * @throws Rethrows any error raised during setup, after removing the
 *   temporary root so a failed setup does not leave directories behind.
 */
function makeRepo() {
  const root = mkdtempSync(join(tmpdir(), "ensemble-repo-"));
  const cleanup = () => rmSync(root, { recursive: true, force: true });
  try {
    const repo = join(root, "repo");
    mkdirSync(repo);
    gitText(repo, ["init", "--quiet", "--initial-branch=main"]);
    // A local identity keeps `git commit` working on machines without a global one.
    gitText(repo, ["config", "user.email", "ensemble@warrant.local"]);
    gitText(repo, ["config", "user.name", "ensemble"]);
    writeFileSync(join(repo, "README.md"), "# ensemble\n");
    gitText(repo, ["add", "-A"]);
    gitText(repo, ["commit", "--quiet", "-m", "init"]);
    return {
      repo,
      outputRoot: join(root, "out"),
      head: gitText(repo, ["rev-parse", "HEAD"]).trim(),
      cleanup,
    };
  } catch (error) {
    // The caller only receives `cleanup` on success, so a failed setup must
    // remove the temporary root itself before propagating the error.
    cleanup();
    throw error;
  }
}
79
/**
 * Renders a git-style patch that adds one new file.
 *
 * @param {string} path - Repository-relative path of the file to create.
 * @param {string} content - Full text of the new file.
 * @returns {string} The patch text, terminated by a newline.
 *
 * Edge cases:
 * - Empty `content` yields a header-only patch (no `---`/`+++` lines and no
 *   hunk), rather than a hunk that would add a single blank line.
 * - `content` without a trailing newline gets the
 *   `\ No newline at end of file` marker after its last added line, so the
 *   patch describes the file exactly.
 */
function addFilePatch(path, content) {
  const header = [`diff --git a/${path} b/${path}`, "new file mode 100644"];
  if (content === "") {
    // "".split("\n") is [""], which would otherwise become a one-line hunk.
    return [...header, ""].join("\n");
  }
  const endsWithNewline = content.endsWith("\n");
  // Drop the final newline before splitting so it does not produce an extra empty line.
  const lines = (endsWithNewline ? content.slice(0, -1) : content).split("\n");
  return [
    ...header,
    "--- /dev/null",
    `+++ b/${path}`,
    `@@ -0,0 +1,${lines.length} @@`,
    ...lines.map((line) => `+${line}`),
    ...(endsWithNewline ? [] : ["\\ No newline at end of file"]),
    "",
  ].join("\n");
}
91
/**
 * Creates a new, empty temporary directory for use as `CODEX_HOME`.
 *
 * @returns {{ path: string, cleanup: () => void }} The directory path and a
 *   callback that removes it recursively.
 */
function emptyCodexHome() {
  const prefix = join(tmpdir(), "ensemble-codex-empty-home-");
  const path = mkdtempSync(prefix);
  const cleanup = () => rmSync(path, { recursive: true, force: true });
  return { path, cleanup };
}
95
// Runs the two base models through the mock harness and checks each emitted
// record against its protocol assertion.
test("mock adapter runs N candidates and emits valid model-fusion records", async () => {
  const harness = createMockHarness({
    candidates: {
      writer: { score: 0.8, transcript: "writer transcript" },
    },
  });
  const result = await runEnsemble(descriptor({ harness }));
  const { candidates, harnessRunRequest, harnessRunResult } = result;
  assert.equal(candidates.length, 2);
  assertHarnessRunRequestV1(harnessRunRequest);
  assertHarnessRunResultV1(harnessRunResult);
  for (const record of candidates) {
    assertHarnessCandidateRecordV1(record);
    assert.equal(record.status, "succeeded");
  }
  assert.equal(harnessRunResult.status, "succeeded");
  assert.ok(result.artifacts.length >= 4);
});
113
// A single shell command stands in for the model; the run must yield one
// candidate, one tool record, a log artifact, and a verification result.
test("command adapter records command output, artifact, tool record, and verification", async () => {
  const harness = createCommandHarness({ command: "printf command-ok" });
  const result = await runEnsemble(descriptor({
    models: [{ id: "command", model: "local-shell" }],
    harness,
  }));
  assert.equal(result.candidates.length, 1);
  assert.equal(result.harnessRunResult.status, "succeeded");
  assert.equal(result.toolRecords.length, 1);
  const firstToolRecord = result.toolRecords[0];
  assert.equal(firstToolRecord?.status, "succeeded");
  const firstArtifact = result.artifacts[0];
  assert.equal(firstArtifact?.kind, "log");
  const candidateMetadata = result.candidates[0]?.metadata;
  assert.equal(candidateMetadata?.verification?.status, "succeeded");
  const expectedExecutionIds = ["exec_ensemble_test_command_0"];
  assert.deepEqual(result.summary?.candidates[0]?.toolExecutionIds, expectedExecutionIds);
});
131
// The harness is given an empty env; the run is expected to be reported as
// skipped with a "capability_missing" error rather than failing.
test("claude-code adapter can replace mock and skip clearly without credentials", async () => {
  const harness = claudeCodeHarness({ env: {} });
  const result = await runEnsemble(descriptor({
    models: [{ id: "claude", model: "claude-sonnet-4-6" }],
    harness,
  }));
  assert.equal(result.candidates.length, 1);
  assert.equal(result.harnessRunResult.status, "skipped");
  const skipped = result.candidates[0];
  assert.equal(skipped?.status, "skipped");
  assert.equal(skipped?.error?.kind, "capability_missing");
  assert.match(skipped?.error?.message ?? "", /missing Claude Code credential/);
  const firstEvidence = result.summary?.candidates[0]?.verification?.evidence[0] ?? "";
  assert.match(firstEvidence, /missing Claude/);
});
143
// Injects a fake session backend and checks what the adapter hands to it:
// the agent kind, the env it receives, and a repo directory that is not the
// source repository itself.
test("claude-code adapter delegates through a session backend from a generic descriptor", async () => {
  const repo = makeRepo();
  // Filled in by the fake backend so the values can be asserted after the run.
  const seen = {};
  const execute = async (input) => {
    seen.agentKind = input.contract.agent.kind;
    seen.env = input.execution.env;
    seen.repoDir = input.repoDir;
    assert.equal(input.contract.isolation, "vercel-sandbox");
    assert.equal(input.contract.execution?.kind, "agent");
    assert.equal(input.secrets.length, 0);
    // Writing a file into the provided directory gives the run a diff to capture.
    writeFileSync(join(input.repoDir, "CLAUDE_RESULT.md"), "fake claude result\n");
    input.emit({
      type: "command.executed",
      argvHash: requestHash({ adapter: "claude-code" }),
      exitCode: 0,
    });
    return { exitCode: 0, log: Buffer.from("fake claude transcript") };
  };
  const backend = {
    isolation: "vercel-sandbox",
    supports: () => true,
    execute,
  };
  try {
    const harness = claudeCodeHarness({
      env: {
        ANTHROPIC_API_KEY: "sk-ant-test",
        VERCEL_TOKEN: "vercel-test",
      },
      backend,
    });
    const result = await runEnsemble(descriptor({
      models: [{ id: "claude", model: "claude-sonnet-4-6" }],
      harness,
      workspace: repo.repo,
      baseGitSha: repo.head,
      outputRoot: repo.outputRoot,
      cleanupWorktrees: true,
    }));
    assert.equal(result.harnessRunResult.status, "succeeded");
    assert.equal(result.candidates[0]?.status, "succeeded");
    assert.equal(seen.agentKind, "claude-code");
    assert.equal(seen.env?.ANTHROPIC_API_KEY, "sk-ant-test");
    // The Vercel token must not be present in the env the backend received.
    assert.equal(Object.hasOwn(seen.env ?? {}, "VERCEL_TOKEN"), false);
    assert.notEqual(seen.repoDir, repo.repo);
    const hasPatch = result.artifacts.some(({ kind }) => kind === "patch");
    assert.ok(hasPatch);
    assert.match(result.candidates[0]?.metadata?.adapter, /claude-code/);
  } finally {
    repo.cleanup();
  }
});
193
// Live smoke test. It is skipped unless `liveClaudeSmokeSkipReason()` returns
// `false`, which is evaluated once when this file is loaded.
test("smoke: claude-code adapter runs live when credentials are available", { skip: liveClaudeSmokeSkipReason() }, async () => {
  const repo = makeRepo();
  try {
    const runtime = {
      id: "vercel-sandbox",
      isolation: {
        kind: "microvm",
        networkPolicy: {
          defaultDeny: true,
          allowHosts: [
            "registry.npmjs.org",
            "api.anthropic.com",
            "ai-gateway.vercel.sh",
          ],
        },
      },
    };
    const policy = {
      id: "claude-smoke-policy",
      allowedTools: ["read_file"],
      sideEffects: "read_only",
      timeoutMs: 180_000,
    };
    const result = await runEnsemble(descriptor({
      id: "claude_smoke",
      models: [{ id: "claude", model: "claude-sonnet-4-6" }],
      harness: claudeCodeHarness(),
      runtime,
      policy,
      prompt: "Read README.md if present, then reply exactly CLAUDE_LIVE_SMOKE_OK. Do not modify files.",
      workspace: repo.repo,
      baseGitSha: repo.head,
      outputRoot: repo.outputRoot,
      cleanupWorktrees: true,
    }));
    const runResult = result.harnessRunResult;
    assertHarnessRunResultV1(runResult);
    assert.equal(runResult.status, "succeeded");
    assert.equal(result.candidates[0]?.status, "succeeded");
  } finally {
    repo.cleanup();
  }
});
234
// Renders a Codex config for a custom provider and checks the generated TOML.
// The expected `base_url` ends at `/v1` although the input URL ends at
// `/v1/responses`.
test("codex config declares a Responses provider without touching Cursor records", () => {
  const provider = {
    providerId: "warrant-local",
    name: "Warrant Local",
    baseUrl: "https://gateway.example.com/v1/responses",
    apiKeyEnvName: "WARRANT_CODEX_API_KEY",
    requiresOpenAiAuth: true,
  };
  const toml = codexConfigToml({
    model: "gpt-5.5-codex",
    sandboxMode: "workspace-write",
    approvalPolicy: "never",
    provider,
  });
  const expectedPatterns = [
    /model = "gpt-5\.5-codex"/,
    /model_provider = "warrant-local"/,
    /\[model_providers\.warrant-local\]/,
    /base_url = "https:\/\/gateway\.example\.com\/v1"/,
    /wire_api = "responses"/,
    /env_key = "WARRANT_CODEX_API_KEY"/,
  ];
  for (const pattern of expectedPatterns) {
    assert.match(toml, pattern);
  }
  assert.equal(toml.includes("cursor"), false);
});
255
// Points CODEX_HOME at a freshly created empty directory and expects the run
// to be reported as skipped while still producing a schema-valid result.
test("codex adapter emits schema-valid skipped output without credentials", async () => {
  const codexHome = emptyCodexHome();
  try {
    const harness = codexHarness({
      env: { CODEX_HOME: codexHome.path },
      provider: { kind: "ambient" },
    });
    const result = await runEnsemble(descriptor({
      models: [{ id: "codex", model: "gpt-5.5-codex" }],
      harness,
    }));
    assertHarnessRunResultV1(result.harnessRunResult);
    assert.equal(result.harnessRunResult.status, "skipped");
    const skipped = result.candidates[0];
    assert.equal(skipped?.status, "skipped");
    assert.equal(skipped?.error?.kind, "capability_missing");
    assert.match(skipped?.error?.message ?? "", /Codex credentials are absent/);
    const firstEvidence = result.summary?.candidates[0]?.verification?.evidence[0] ?? "";
    assert.match(firstEvidence, /Codex credentials/);
  } finally {
    codexHome.cleanup();
  }
});
276
// Replaces the real Codex process with an injected runner. The runner checks
// the config file written under CODEX_HOME and the env it was given, then
// returns canned output.
test("codex adapter runs through an injected Responses runner and records evidence", async () => {
  // Every invocation of the injected runner, in call order.
  const calls = [];
  const runner = (input) => {
    calls.push(input);
    const codexHome = input.env.CODEX_HOME;
    assert.ok(codexHome);
    const config = readFileSync(join(codexHome, "config.toml"), "utf8");
    assert.match(config, /model_provider = "local-responses"/);
    assert.match(config, /base_url = "http:\/\/127\.0\.0\.1:8787\/v1"/);
    assert.match(config, /wire_api = "responses"/);
    assert.match(config, /env_key = "WARRANT_CODEX_PROVIDER_API_KEY"/);
    // The inline key is expected under the env name the config refers to.
    assert.equal(input.env.WARRANT_CODEX_PROVIDER_API_KEY, "inline-test-key");
    return {
      stdout: '{"type":"message","content":"codex-ok"}\n',
      stderr: "",
      exitCode: 0,
    };
  };
  const harness = codexHarness({
    env: {},
    provider: {
      kind: "responses",
      baseUrl: "http://127.0.0.1:8787/v1/responses",
      apiKey: "inline-test-key",
      requiresOpenAiAuth: true,
      providerId: "local-responses",
    },
    runner,
  });
  const result = await runEnsemble(descriptor({
    models: [{ id: "codex", model: "gpt-5.5-codex" }],
    harness,
  }));
  assert.equal(calls.length, 1);
  const leadingArgs = calls[0]?.args.slice(0, 3);
  assert.deepEqual(leadingArgs, ["exec", "--json", "--skip-git-repo-check"]);
  assert.equal(result.harnessRunResult.status, "succeeded");
  assert.equal(result.candidates[0]?.status, "succeeded");
  assert.equal(result.toolRecords[0]?.status, "succeeded");
  assert.match(result.candidates[0]?.metadata?.adapter, /codex/);
});
318
/**
 * Computes the `skip` option for the live Claude Code smoke test.
 *
 * @returns {string | false} A reason string when the test should be skipped:
 *   either the opt-in message when `WARRANT_CLAUDE_SMOKE` is not exactly "1",
 *   or whatever `claudeCodeHarnessCredentialSkipReason()` reports. `false`
 *   when that helper returns `null`/`undefined`.
 */
function liveClaudeSmokeSkipReason() {
  const optedIn = process.env.WARRANT_CLAUDE_SMOKE === "1";
  if (!optedIn) {
    return "set WARRANT_CLAUDE_SMOKE=1 plus Claude Code credentials to run the live Claude Code smoke";
  }
  const credentialReason = claudeCodeHarnessCredentialSkipReason();
  return credentialReason ?? false;
}
324
/**
 * Computes the `skip` option for the live Codex smoke test.
 *
 * @returns {string | false} A reason string when the test should be skipped:
 *   either the opt-in message when `WARRANT_CODEX_SMOKE` is not exactly "1",
 *   or whatever `codexHarnessCredentialSkipReason()` reports. `false` when
 *   that helper returns `null`/`undefined`.
 */
function liveCodexSmokeSkipReason() {
  const optedIn = process.env.WARRANT_CODEX_SMOKE === "1";
  if (!optedIn) {
    return "set WARRANT_CODEX_SMOKE=1 plus Codex credentials to run the live Codex smoke";
  }
  const credentialReason = codexHarnessCredentialSkipReason();
  return credentialReason ?? false;
}
330
// Live smoke test. It is skipped unless `liveCodexSmokeSkipReason()` returns
// `false`. The model can be overridden through WARRANT_CODEX_SMOKE_MODEL.
test("codex adapter live smoke is credential-gated", { skip: liveCodexSmokeSkipReason() }, async () => {
  const repo = makeRepo();
  try {
    const smokeModel = process.env.WARRANT_CODEX_SMOKE_MODEL ?? "gpt-5.5-codex";
    const harness = codexHarness({
      timeoutMs: 60_000,
      sandboxMode: "read-only",
      approvalPolicy: "never",
    });
    const result = await runEnsemble(descriptor({
      prompt: "Read README.md if present, then reply exactly CODEX_LIVE_SMOKE_OK. Do not modify files.",
      models: [{ id: "codex", model: smokeModel }],
      harness,
      workspace: repo.repo,
      baseGitSha: repo.head,
      outputRoot: repo.outputRoot,
    }));
    const runResult = result.harnessRunResult;
    assertHarnessRunResultV1(runResult);
    // Any terminal status is accepted here except "skipped".
    assert.notEqual(runResult.status, "skipped");
  } finally {
    repo.cleanup();
  }
});
352
// Runs the command harness with a fake container driver and checks the
// hardening metadata on the candidate, the summary, the request, and the result.
test("command adapter records optional container hardening metadata", async () => {
  const driver = {
    id: "fake-ensemble-container",
    supportsNetworkPolicy: true,
    execute(input) {
      assert.equal(input.image, "node:22-hardening");
      return {
        stdout: "container-hardening",
        stderr: "",
        exitCode: 0,
        cleanup: { attempted: true, succeeded: true },
      };
    },
  };
  const isolation = {
    kind: "container",
    image: "node:22-hardening",
    driver,
    mountPolicy: { readOnlyCachePaths: ["/tmp/cache"] },
    networkPolicy: { defaultDeny: true, allowHosts: [], enforce: true },
    secretPolicy: {
      secretNames: ["API_TOKEN"],
      secretValueHashes: ["sha256:" + "b".repeat(64)],
      injectedEnvNames: ["API_TOKEN"],
    },
  };
  const result = await runEnsemble(descriptor({
    models: [{ id: "command", model: "local-shell" }],
    runtime: { id: "local", isolation },
    harness: createCommandHarness({
      command: "printf container-hardening",
    }),
  }));
  const candidateMetadata = result.candidates[0]?.metadata;
  assert.equal(candidateMetadata?.hardening?.requested_isolation, "container");
  assert.equal(candidateMetadata?.hardening?.runtime.image, "node:22-hardening");
  assert.equal(candidateMetadata?.hardening?.mount_policy.read_only_caches[0], "/tmp/cache");
  assert.equal(candidateMetadata?.hardening?.cleanup.status, "succeeded");
  assert.equal(candidateMetadata?.hardening?.secret_absence.secret_names[0], "API_TOKEN");
  assert.equal(result.summary?.candidates[0]?.hardening?.actual_isolation, "container");
  // These two reads are deliberately unguarded: a missing `hardening` object
  // fails the test with a TypeError.
  const requestHardening = result.harnessRunRequest.metadata?.hardening;
  assert.equal(requestHardening.requested_isolation, "container");
  const resultHardening = result.harnessRunResult.metadata?.hardening;
  assert.equal(resultHardening.cleanup_succeeded, 1);
});
399
// Runs the command harness with a fake microVM driver. The driver reports
// "vercel-sandbox" as the actual isolation, which is what the metadata must
// record even though "microvm" was requested.
test("command adapter records optional microVM hardening metadata", async () => {
  const driver = {
    id: "fake-ensemble-microvm",
    provider: "vercel-sandbox",
    supportsNetworkPolicy: true,
    execute(input) {
      assert.equal(input.provider, "vercel-sandbox");
      assert.equal(input.runtime, "node24");
      assert.equal(input.snapshotId, "snap_ensemble");
      const runtime = {
        provider: "vercel-sandbox",
        runtime: "node24",
        snapshotId: "snap_ensemble",
        sandboxId: "sbx_ensemble",
        runtimeDigest: "sha256:" + "d".repeat(64),
      };
      return {
        stdout: "microvm-hardening",
        stderr: "",
        exitCode: 0,
        actualIsolation: "vercel-sandbox",
        runtime,
        cleanup: { attempted: true, succeeded: true },
      };
    },
  };
  const isolation = {
    kind: "microvm",
    provider: "vercel-sandbox",
    runtime: "node24",
    snapshotId: "snap_ensemble",
    driver,
    networkPolicy: { defaultDeny: true, allowHosts: [], enforce: true },
    secretPolicy: {
      secretNames: ["VERCEL_TOKEN"],
      secretValueHashes: ["sha256:" + "e".repeat(64)],
      injectedEnvNames: ["VERCEL_TOKEN"],
    },
  };
  const result = await runEnsemble(descriptor({
    models: [{ id: "command", model: "local-shell" }],
    runtime: { id: "local", isolation },
    harness: createCommandHarness({
      command: "printf microvm-hardening",
    }),
  }));
  const candidateMetadata = result.candidates[0]?.metadata;
  assert.equal(candidateMetadata?.hardening?.requested_isolation, "microvm");
  assert.equal(candidateMetadata?.hardening?.actual_isolation, "vercel-sandbox");
  assert.equal(candidateMetadata?.hardening?.runtime.provider, "vercel-sandbox");
  assert.equal(candidateMetadata?.hardening?.runtime.snapshot_id, "snap_ensemble");
  assert.equal(candidateMetadata?.hardening?.runtime.sandbox_id, "sbx_ensemble");
  assert.equal(candidateMetadata?.hardening?.runtime.driver, "fake-ensemble-microvm");
  assert.equal(candidateMetadata?.hardening?.cleanup.status, "succeeded");
  assert.equal(result.summary?.candidates[0]?.hardening?.actual_isolation, "vercel-sandbox");
  // These two reads are deliberately unguarded: a missing `hardening` object
  // fails the test with a TypeError.
  const requestHardening = result.harnessRunRequest.metadata?.hardening;
  assert.equal(requestHardening.requested_isolation, "microvm");
  const resultHardening = result.harnessRunResult.metadata?.hardening;
  assert.equal(resultHardening.cleanup_succeeded, 1);
});
460
// Requests container isolation with the default mock harness and no driver.
// The metadata must keep the request but report "process" as what actually ran.
test("mock adapter preserves optional container request as process fallback metadata", async () => {
  const isolation = {
    kind: "container",
    image: "node:22-hardening",
    networkPolicy: { defaultDeny: true, allowHosts: [], enforce: true },
  };
  const result = await runEnsemble(descriptor({
    runtime: { id: "local", isolation },
  }));
  const candidateMetadata = result.candidates[0]?.metadata;
  assert.equal(candidateMetadata?.hardening?.requested_isolation, "container");
  assert.equal(candidateMetadata?.hardening?.actual_isolation, "process");
  assert.equal(candidateMetadata?.hardening?.cleanup.status, "not_required");
  assert.equal(candidateMetadata?.hardening?.network_policy.enforced, false);
  const summaryHardening = result.summary?.candidates[0]?.hardening;
  assert.equal(summaryHardening?.requested_isolation, "container");
});
478
// A command that exits with code 7 must be reported as "failed" on the run,
// the candidate, and the tool record.
test("command adapter maps non-zero exit to failed protocol status", async () => {
  const harness = createCommandHarness({ command: "exit 7" });
  const result = await runEnsemble(descriptor({
    models: [{ id: "command", model: "local-shell" }],
    harness,
  }));
  assert.equal(result.harnessRunResult.status, "failed");
  assert.equal(result.candidates[0]?.status, "failed");
  assert.equal(result.toolRecords[0]?.status, "failed");
});
489
// Two invalid descriptors: an empty model list, and an extra `checks` field.
// Both must make `runEnsemble` reject with a matching message.
test("descriptor rejects zero models and ad hoc checks", async () => {
  const runWithoutModels = () => runEnsemble(descriptor({ models: [] }));
  await assert.rejects(runWithoutModels, /at least one model/);
  const runWithAdHocChecks = () => runEnsemble({
    ...descriptor(),
    checks: ["npm test"],
  });
  await assert.rejects(runWithAdHocChecks, /ad hoc checks/);
});
496
// Checks that the run result and the records it exposes are frozen.
//
// `Object.isFrozen` returns `true` for any non-object, including `undefined`,
// and a bare `assert.throws` accepts the TypeError raised by calling `.push`
// on `undefined`. The existence checks below therefore come first, so a
// missing candidate or artifact fails the test instead of passing vacuously.
test("terminal candidate records and result arrays are immutable", async () => {
  const result = await runEnsemble(descriptor());
  const firstCandidate = result.candidates[0];
  assert.ok(firstCandidate, "expected at least one candidate record");
  const candidateArtifacts = firstCandidate.artifacts;
  assert.ok(Array.isArray(candidateArtifacts), "expected the first candidate to carry an artifacts array");
  const firstArtifact = result.artifacts[0];
  assert.ok(firstArtifact, "expected at least one result artifact");
  assert.equal(Object.isFrozen(result), true);
  assert.equal(Object.isFrozen(result.candidates), true);
  assert.equal(Object.isFrozen(firstCandidate), true);
  assert.equal(Object.isFrozen(candidateArtifacts), true);
  assert.equal(Object.isFrozen(firstArtifact), true);
  // Appending to a frozen array throws a TypeError.
  assert.throws(() => {
    result.candidates.push({});
  }, TypeError);
  assert.throws(() => {
    candidateArtifacts.push({});
  }, TypeError);
});
510
// Review evidence passed in the descriptor must come back unchanged on the
// result, and the result must not expose any "winner" fields.
test("review evidence is attached but never becomes final selection", async () => {
  const reviewEvidence = {
    strategy: "smallest-diff",
    scorecards: [{ candidate_id: "fast", diffBytes: 10 }],
    reason: "deterministic evidence only",
  };
  const result = await runEnsemble(descriptor({ reviewEvidence }));
  assert.deepEqual(result.reviewEvidence, reviewEvidence);
  const hasChosen = "chosen" in result;
  assert.equal(hasChosen, false);
  const hasSelectedCandidate = "selected_candidate_id" in result.harnessRunResult;
  assert.equal(hasSelectedCandidate, false);
});
521
// The mock harness supplies a model call record for the only candidate. The
// call id must show up on the candidate, in the result's record list, in the
// summary, and as an artifact.
test("adapter-provided model call records link candidates and summary metadata", async () => {
  const callId = "model_call_fast";
  const record = modelCallRecord(callId);
  const harness = createMockHarness({
    candidates: {
      fast: { modelCallRecord: record },
    },
  });
  const result = await runEnsemble(descriptor({
    models: [{ id: "fast", model: "fake-fast" }],
    harness,
  }));
  assert.equal(result.candidates[0]?.model_call_id, callId);
  assert.equal(result.modelCallRecords.length, 1);
  assert.equal(result.modelCallRecords[0]?.call_id, callId);
  assert.equal(result.summary?.modelCallRecords.length, 1);
  assert.equal(result.summary?.candidates[0]?.modelCallId, callId);
  const hasRecordArtifact = result.artifacts.some(
    (artifact) => artifact.artifact_id.includes("model_call_record"),
  );
  assert.ok(hasRecordArtifact);
});
538
// Runs two candidates against a real temporary repository with
// `cleanupWorktrees` enabled. Candidate records must still name their
// worktree after it has been removed from disk, and the summary file written
// to disk must agree with the in-memory summary.
test("candidate worktrees are created from one snapshot and summarized after cleanup", async () => {
  const repo = makeRepo();
  try {
    const harness = createMockHarness({
      candidates: {
        fast: { transcript: "fast transcript", summary: "fast summary" },
        writer: { transcript: "writer transcript", summary: "writer summary" },
      },
    });
    const result = await runEnsemble(descriptor({
      harness,
      workspace: repo.repo,
      baseGitSha: repo.head,
      outputRoot: repo.outputRoot,
      cleanupWorktrees: true,
    }));
    assert.equal(result.candidates.length, 2);
    assert.equal(result.summary?.snapshot?.baseGitSha, repo.head);
    assert.equal(result.summary?.candidates.length, 2);
    assert.ok(result.summaryPath);
    assert.equal(existsSync(result.summaryPath), true);
    const hasArtifactOfKind = (record, kind) =>
      record.artifacts?.some((artifact) => artifact.kind === kind);
    for (const record of result.candidates) {
      assert.ok(record.branch_name);
      assert.ok(record.worktree_path);
      // The path is recorded, but the directory itself is gone after cleanup.
      assert.equal(existsSync(record.worktree_path), false);
      assert.ok(hasArtifactOfKind(record, "worktree"));
      assert.ok(hasArtifactOfKind(record, "transcript"));
    }
    const summaryText = readFileSync(result.summaryPath, "utf8");
    const summary = JSON.parse(summaryText);
    assert.equal(summary.finalPatchPath, null);
    assert.equal(summary.candidates.length, 2);
    assert.ok(summary.candidates.every((entry) => entry.worktreePath));
  } finally {
    repo.cleanup();
  }
});
575
// Uses a hand-written harness whose `run` writes one file into the candidate
// worktree. Each candidate must then carry a patch artifact, and the summary
// must list exactly one diff artifact per candidate.
test("candidate worktree diffs become patch artifacts", async () => {
  const repo = makeRepo();
  try {
    const harnessId = "worktree-writer";
    const harness = {
      id: harnessId,
      prepare: () => undefined,
      capabilities: () => ({ workspace_write: "supported" }),
      verificationProfile: () => ({
        id: harnessId,
        requiredEvidence: ["patch", "worktree"],
      }),
      collectArtifacts: () => [],
      run: ({ model, worktree }) => {
        assert.ok(worktree);
        // File name and content are derived from the model so candidates differ.
        const target = join(worktree.path, `${model.id}.txt`);
        writeFileSync(target, `${model.model}\n`);
        return {
          model,
          status: "succeeded",
          transcript: `${model.id} wrote a file`,
          verification: { status: "succeeded", evidence: ["file written"], exitCode: 0 },
        };
      },
    };
    const result = await runEnsemble(descriptor({
      harness,
      workspace: repo.repo,
      baseGitSha: repo.head,
      outputRoot: repo.outputRoot,
    }));
    assert.equal(result.candidates.length, 2);
    const everyCandidateHasPatch = result.candidates.every(
      (record) => record.artifacts?.some((artifact) => artifact.kind === "patch"),
    );
    assert.ok(everyCandidateHasPatch);
    const everySummaryHasOneDiff = result.summary?.candidates.every(
      (entry) => entry.diffArtifacts.length === 1,
    );
    assert.ok(everySummaryHasOneDiff);
  } finally {
    repo.cleanup();
  }
});
612
// The harness's `collectArtifacts` throws. The run must reject with that
// error, and the harness's `cleanup` hook must still have been called.
test("adapter cleanup runs when collection fails", async () => {
  let cleanupCalled = false;
  const harness = {
    id: "cleanup",
    prepare: () => undefined,
    capabilities: () => ({ cleanup: "supported" }),
    verificationProfile: () => ({ id: "cleanup", requiredEvidence: [] }),
    run: ({ model }) => ({ model, status: "succeeded" }),
    collectArtifacts: () => {
      throw new Error("boom");
    },
    cleanup: () => {
      cleanupCalled = true;
    },
  };
  const run = () => runEnsemble(descriptor({ harness }));
  await assert.rejects(run, /boom/);
  assert.equal(cleanupCalled, true);
});
630
// The mock judge returns a patch that adds `final.txt`. The run must record a
// valid synthesis record and expose the patch as a final patch artifact whose
// content mentions `final.txt` and not `fast.txt`.
test("judge synthesis creates a final patch artifact from the original base", async () => {
  const repo = makeRepo();
  try {
    const synthesizer = createMockJudgeSynthesizer({
      output: {
        decision: "synthesize",
        finalOutput: "final patch",
        rationale: "combine candidate evidence",
        patch: {
          content: addFilePatch("final.txt", "final\n"),
          sourceCandidateIds: ["ensemble_test_fast_0"],
          author: "judge",
        },
        contributions: [{ candidateId: "ensemble_test_fast_0", reason: "used evidence" }],
        rejections: [{ candidateId: "ensemble_test_writer_1", reason: "less complete" }],
      },
      verificationResults: [
        { status: "succeeded", evidence: ["final tests passed"], exitCode: 0 },
      ],
    });
    const result = await runEnsemble(descriptor({
      workspace: repo.repo,
      baseGitSha: repo.head,
      outputRoot: repo.outputRoot,
      judge: { id: "judge", synthesizer },
    }));
    assert.ok(result.judgeSynthesisRecord);
    assertJudgeSynthesisRecordV1(result.judgeSynthesisRecord);
    assert.equal(result.judgeSynthesisRecord.decision, "synthesize");
    assert.ok(result.finalPatchPath);
    assert.equal(result.summary?.finalPatchPath, result.finalPatchPath);
    const finalPatchArtifact = result.artifacts.find(
      (artifact) => artifact.artifact_id.endsWith("_final_patch"),
    );
    assert.ok(finalPatchArtifact?.uri);
    // The artifact `uri` is passed through `new URL(...)` so it can be read as a file.
    const finalPatchUrl = new URL(finalPatchArtifact.uri);
    const finalPatch = readFileSync(finalPatchUrl, "utf8");
    assert.ok(finalPatch.includes("final.txt"));
    assert.ok(!finalPatch.includes("fast.txt"), "candidate worktree output is not the base");
  } finally {
    repo.cleanup();
  }
});
673
// The mock judge returns text that is not a patch. The synthesis record must
// be marked failed, the failure reason must be "patch_conflict", and a
// conflict artifact must be emitted.
test("judge synthesis patch conflicts produce conflict artifacts", async () => {
  const repo = makeRepo();
  try {
    const synthesizer = createMockJudgeSynthesizer({
      output: {
        decision: "synthesize",
        finalOutput: "bad patch",
        patch: {
          content: "this is not a patch",
          sourceCandidateIds: ["ensemble_test_fast_0"],
        },
      },
    });
    const result = await runEnsemble(descriptor({
      workspace: repo.repo,
      baseGitSha: repo.head,
      outputRoot: repo.outputRoot,
      judge: { id: "judge", synthesizer },
    }));
    const synthesisRecord = result.judgeSynthesisRecord;
    assert.equal(synthesisRecord?.status, "failed");
    assert.equal(synthesisRecord?.decision, "failed");
    assert.equal(result.failureSummary?.reason, "patch_conflict");
    const hasConflictArtifact = result.artifacts.some(
      (artifact) => artifact.artifact_id.includes("patch_conflict"),
    );
    assert.ok(hasConflictArtifact);
  } finally {
    repo.cleanup();
  }
});
703
// The mock judge is given a failed verification result followed by a
// succeeded one, plus a separate repair output. The run must record exactly
// one repair attempt, end as succeeded, and carry no failure summary.
test("judge synthesis performs one repair round and records success", async () => {
  const repo = makeRepo();
  try {
    const synthesizer = createMockJudgeSynthesizer({
      output: {
        decision: "synthesize",
        finalOutput: "needs repair",
        patch: { content: addFilePatch("initial.txt", "initial\n") },
      },
      repairOutput: {
        decision: "synthesize",
        finalOutput: "repaired",
        patch: { content: addFilePatch("repair.txt", "repair\n") },
      },
      verificationResults: [
        { status: "failed", evidence: ["initial failed"], exitCode: 1 },
        { status: "succeeded", evidence: ["repair passed"], exitCode: 0 },
      ],
    });
    const result = await runEnsemble(descriptor({
      workspace: repo.repo,
      baseGitSha: repo.head,
      outputRoot: repo.outputRoot,
      judge: { id: "judge", synthesizer },
    }));
    const attempts = result.repairAttempts;
    assert.equal(attempts?.length, 1);
    assert.equal(attempts?.[0]?.status, "succeeded");
    assert.equal(result.judgeSynthesisRecord?.status, "succeeded");
    assert.equal(result.failureSummary, undefined);
  } finally {
    repo.cleanup();
  }
});
739
// Both verification results are failures, so the repair does not help. The
// run must report "repair_failed" and must not name a winning candidate, even
// though review evidence for one candidate was supplied.
test("failed repair returns failure summary without deterministic fallback winner", async () => {
  const repo = makeRepo();
  try {
    const reviewEvidence = {
      strategy: "tests-pass-smallest-diff",
      scorecards: [{ candidate_id: "ensemble_test_fast_0", diffBytes: 1 }],
      reason: "evidence only",
    };
    const synthesizer = createMockJudgeSynthesizer({
      output: {
        decision: "synthesize",
        finalOutput: "needs repair",
        patch: { content: addFilePatch("initial.txt", "initial\n") },
      },
      repairOutput: {
        decision: "synthesize",
        finalOutput: "still broken",
        patch: { content: addFilePatch("repair.txt", "repair\n") },
      },
      verificationResults: [
        { status: "failed", evidence: ["initial failed"], exitCode: 1 },
        { status: "failed", evidence: ["repair failed"], exitCode: 1 },
      ],
    });
    const result = await runEnsemble(descriptor({
      workspace: repo.repo,
      baseGitSha: repo.head,
      outputRoot: repo.outputRoot,
      reviewEvidence,
      judge: { id: "judge", synthesizer },
    }));
    assert.equal(result.judgeSynthesisRecord?.status, "failed");
    assert.equal(result.judgeSynthesisRecord?.decision, "repair_required");
    assert.equal(result.failureSummary?.reason, "repair_failed");
    const hasChosen = "chosen" in result;
    assert.equal(hasChosen, false);
    // Unguarded on purpose: the record was already shown to exist by the
    // status and decision checks above.
    assert.equal(result.judgeSynthesisRecord.selected_candidate_id, undefined);
  } finally {
    repo.cleanup();
  }
});