@agent-relay/sdk 3.2.3 → 3.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113) hide show
  1. package/dist/communicate/a2a-bridge.d.ts +25 -0
  2. package/dist/communicate/a2a-bridge.d.ts.map +1 -0
  3. package/dist/communicate/a2a-bridge.js +89 -0
  4. package/dist/communicate/a2a-bridge.js.map +1 -0
  5. package/dist/communicate/a2a-server.d.ts +31 -0
  6. package/dist/communicate/a2a-server.d.ts.map +1 -0
  7. package/dist/communicate/a2a-server.js +220 -0
  8. package/dist/communicate/a2a-server.js.map +1 -0
  9. package/dist/communicate/a2a-transport.d.ts +48 -0
  10. package/dist/communicate/a2a-transport.d.ts.map +1 -0
  11. package/dist/communicate/a2a-transport.js +302 -0
  12. package/dist/communicate/a2a-transport.js.map +1 -0
  13. package/dist/communicate/a2a-types.d.ts +107 -0
  14. package/dist/communicate/a2a-types.d.ts.map +1 -0
  15. package/dist/communicate/a2a-types.js +209 -0
  16. package/dist/communicate/a2a-types.js.map +1 -0
  17. package/dist/communicate/adapters/claude-sdk.d.ts +28 -0
  18. package/dist/communicate/adapters/claude-sdk.d.ts.map +1 -0
  19. package/dist/communicate/adapters/claude-sdk.js +47 -0
  20. package/dist/communicate/adapters/claude-sdk.js.map +1 -0
  21. package/dist/communicate/adapters/crewai.d.ts +42 -0
  22. package/dist/communicate/adapters/crewai.d.ts.map +1 -0
  23. package/dist/communicate/adapters/crewai.js +95 -0
  24. package/dist/communicate/adapters/crewai.js.map +1 -0
  25. package/dist/communicate/adapters/google-adk.d.ts +53 -0
  26. package/dist/communicate/adapters/google-adk.d.ts.map +1 -0
  27. package/dist/communicate/adapters/google-adk.js +77 -0
  28. package/dist/communicate/adapters/google-adk.js.map +1 -0
  29. package/dist/communicate/adapters/index.d.ts +7 -0
  30. package/dist/communicate/adapters/index.d.ts.map +1 -0
  31. package/dist/communicate/adapters/index.js +7 -0
  32. package/dist/communicate/adapters/index.js.map +1 -0
  33. package/dist/communicate/adapters/langgraph.d.ts +40 -0
  34. package/dist/communicate/adapters/langgraph.d.ts.map +1 -0
  35. package/dist/communicate/adapters/langgraph.js +77 -0
  36. package/dist/communicate/adapters/langgraph.js.map +1 -0
  37. package/dist/communicate/adapters/openai-agents.d.ts +25 -0
  38. package/dist/communicate/adapters/openai-agents.d.ts.map +1 -0
  39. package/dist/communicate/adapters/openai-agents.js +70 -0
  40. package/dist/communicate/adapters/openai-agents.js.map +1 -0
  41. package/dist/communicate/adapters/pi.d.ts +45 -0
  42. package/dist/communicate/adapters/pi.d.ts.map +1 -0
  43. package/dist/communicate/adapters/pi.js +59 -0
  44. package/dist/communicate/adapters/pi.js.map +1 -0
  45. package/dist/communicate/core.d.ts +58 -0
  46. package/dist/communicate/core.d.ts.map +1 -0
  47. package/dist/communicate/core.js +128 -0
  48. package/dist/communicate/core.js.map +1 -0
  49. package/dist/communicate/index.d.ts +4 -0
  50. package/dist/communicate/index.d.ts.map +1 -0
  51. package/dist/communicate/index.js +4 -0
  52. package/dist/communicate/index.js.map +1 -0
  53. package/dist/communicate/transport.d.ts +36 -0
  54. package/dist/communicate/transport.d.ts.map +1 -0
  55. package/dist/communicate/transport.js +371 -0
  56. package/dist/communicate/transport.js.map +1 -0
  57. package/dist/communicate/types.d.ts +58 -0
  58. package/dist/communicate/types.d.ts.map +1 -0
  59. package/dist/communicate/types.js +66 -0
  60. package/dist/communicate/types.js.map +1 -0
  61. package/dist/models.d.ts +1 -1
  62. package/dist/models.d.ts.map +1 -1
  63. package/dist/models.js +2 -2
  64. package/dist/models.js.map +1 -1
  65. package/dist/workflows/builder.d.ts +35 -5
  66. package/dist/workflows/builder.d.ts.map +1 -1
  67. package/dist/workflows/builder.js +81 -7
  68. package/dist/workflows/builder.js.map +1 -1
  69. package/dist/workflows/cli.js +14 -1
  70. package/dist/workflows/cli.js.map +1 -1
  71. package/dist/workflows/runner.d.ts +10 -2
  72. package/dist/workflows/runner.d.ts.map +1 -1
  73. package/dist/workflows/runner.js +95 -1
  74. package/dist/workflows/runner.js.map +1 -1
  75. package/dist/workflows/types.d.ts +11 -0
  76. package/dist/workflows/types.d.ts.map +1 -1
  77. package/package.json +48 -2
  78. package/dist/__tests__/completion-pipeline.test.d.ts +0 -14
  79. package/dist/__tests__/completion-pipeline.test.d.ts.map +0 -1
  80. package/dist/__tests__/completion-pipeline.test.js +0 -1476
  81. package/dist/__tests__/completion-pipeline.test.js.map +0 -1
  82. package/dist/__tests__/contract-fixtures.test.d.ts +0 -2
  83. package/dist/__tests__/contract-fixtures.test.d.ts.map +0 -1
  84. package/dist/__tests__/contract-fixtures.test.js +0 -152
  85. package/dist/__tests__/contract-fixtures.test.js.map +0 -1
  86. package/dist/__tests__/e2e-owner-review.test.d.ts +0 -16
  87. package/dist/__tests__/e2e-owner-review.test.d.ts.map +0 -1
  88. package/dist/__tests__/e2e-owner-review.test.js +0 -640
  89. package/dist/__tests__/e2e-owner-review.test.js.map +0 -1
  90. package/dist/__tests__/facade.test.d.ts +0 -2
  91. package/dist/__tests__/facade.test.d.ts.map +0 -1
  92. package/dist/__tests__/facade.test.js +0 -305
  93. package/dist/__tests__/facade.test.js.map +0 -1
  94. package/dist/__tests__/integration.test.d.ts +0 -2
  95. package/dist/__tests__/integration.test.d.ts.map +0 -1
  96. package/dist/__tests__/integration.test.js +0 -205
  97. package/dist/__tests__/integration.test.js.map +0 -1
  98. package/dist/__tests__/pty.test.d.ts +0 -2
  99. package/dist/__tests__/pty.test.d.ts.map +0 -1
  100. package/dist/__tests__/pty.test.js +0 -20
  101. package/dist/__tests__/pty.test.js.map +0 -1
  102. package/dist/__tests__/quickstart.test.d.ts +0 -2
  103. package/dist/__tests__/quickstart.test.d.ts.map +0 -1
  104. package/dist/__tests__/quickstart.test.js +0 -176
  105. package/dist/__tests__/quickstart.test.js.map +0 -1
  106. package/dist/__tests__/spawn-from-env.test.d.ts +0 -2
  107. package/dist/__tests__/spawn-from-env.test.d.ts.map +0 -1
  108. package/dist/__tests__/spawn-from-env.test.js +0 -222
  109. package/dist/__tests__/spawn-from-env.test.js.map +0 -1
  110. package/dist/__tests__/unit.test.d.ts +0 -2
  111. package/dist/__tests__/unit.test.d.ts.map +0 -1
  112. package/dist/__tests__/unit.test.js +0 -357
  113. package/dist/__tests__/unit.test.js.map +0 -1
@@ -1,1476 +0,0 @@
1
- /**
2
- * Completion Pipeline tests for Point-Person-Led Completion spec.
3
- *
4
- * Validates:
5
- * 1. Evidence-based completion (verification passes without marker)
6
- * 2. Owner decision parsing (OWNER_DECISION: COMPLETE/INCOMPLETE_RETRY/INCOMPLETE_FAIL)
7
- * 3. Tolerant review parsing (accepts semantic equivalents)
8
- * 4. Channel evidence contributions (WORKER_DONE signals)
9
- * 5. Backward compatibility with marker-based workflows
10
- * 6. Codex/Gemini/Supervisor pattern compatibility
11
- * 7. Map-reduce workflows remain unaffected
12
- */
13
- import { describe, it, expect, vi, beforeEach } from 'vitest';
14
- // ── Mock fetch to prevent real HTTP calls (Relaycast provisioning) ───────────
15
- const mockFetch = vi.fn().mockResolvedValue({
16
- ok: true,
17
- json: () => Promise.resolve({ data: { api_key: 'rk_live_test', workspace_id: 'ws-test' } }),
18
- text: () => Promise.resolve(''),
19
- });
20
- vi.stubGlobal('fetch', mockFetch);
21
- // ── Mock RelayCast SDK ───────────────────────────────────────────────────────
22
- const mockRelaycastAgent = {
23
- send: vi.fn().mockResolvedValue(undefined),
24
- heartbeat: vi.fn().mockResolvedValue(undefined),
25
- channels: {
26
- create: vi.fn().mockResolvedValue(undefined),
27
- join: vi.fn().mockResolvedValue(undefined),
28
- invite: vi.fn().mockResolvedValue(undefined),
29
- },
30
- };
31
- const mockRelaycast = {
32
- agents: {
33
- register: vi.fn().mockResolvedValue({ token: 'token-1' }),
34
- },
35
- as: vi.fn().mockReturnValue(mockRelaycastAgent),
36
- };
37
- class MockRelayError extends Error {
38
- code;
39
- constructor(code, message, status = 400) {
40
- super(message);
41
- this.code = code;
42
- this.name = 'RelayError';
43
- this.status = status;
44
- }
45
- }
46
- vi.mock('@relaycast/sdk', () => ({
47
- RelayCast: vi.fn().mockImplementation(() => mockRelaycast),
48
- RelayError: MockRelayError,
49
- }));
50
- // ── Mock AgentRelay ──────────────────────────────────────────────────────────
51
- let waitForExitFn;
52
- let waitForIdleFn;
53
- let mockSpawnOutputs = [];
54
- vi.mock('node:child_process', async () => {
55
- const actual = await vi.importActual('node:child_process');
56
- const { EventEmitter } = await import('node:events');
57
- return {
58
- ...actual,
59
- spawn: vi.fn().mockImplementation(() => {
60
- const child = new EventEmitter();
61
- child.pid = 4242;
62
- child.kill = vi.fn();
63
- child.stdout = new EventEmitter();
64
- child.stderr = new EventEmitter();
65
- const output = mockSpawnOutputs.shift() ?? '';
66
- queueMicrotask(() => {
67
- if (output)
68
- child.stdout.emit('data', Buffer.from(output));
69
- child.emit('close', 0, null);
70
- });
71
- return child;
72
- }),
73
- };
74
- });
75
- const mockAgent = {
76
- name: 'test-agent-abc',
77
- get waitForExit() {
78
- return waitForExitFn;
79
- },
80
- get waitForIdle() {
81
- return waitForIdleFn;
82
- },
83
- release: vi.fn().mockResolvedValue(undefined),
84
- };
85
- const mockHuman = {
86
- name: 'WorkflowRunner',
87
- sendMessage: vi.fn().mockResolvedValue(undefined),
88
- };
89
- function never() {
90
- return new Promise(() => { });
91
- }
92
- const defaultSpawnPtyImplementation = async ({ name, task, }) => {
93
- const queued = mockSpawnOutputs.shift();
94
- const stepComplete = task?.match(/STEP_COMPLETE:([^\n]+)/)?.[1]?.trim();
95
- const isReview = task?.includes('REVIEW_DECISION: APPROVE or REJECT');
96
- const output = queued ??
97
- (isReview
98
- ? 'REVIEW_DECISION: APPROVE\nREVIEW_REASON: looks good\n'
99
- : stepComplete
100
- ? `STEP_COMPLETE:${stepComplete}\n`
101
- : 'STEP_COMPLETE:unknown\n');
102
- queueMicrotask(() => {
103
- if (typeof mockRelayInstance.onWorkerOutput === 'function') {
104
- mockRelayInstance.onWorkerOutput({ name, chunk: output });
105
- }
106
- });
107
- return { ...mockAgent, name };
108
- };
109
- const mockRelayInstance = {
110
- spawnPty: vi.fn().mockImplementation(defaultSpawnPtyImplementation),
111
- human: vi.fn().mockReturnValue(mockHuman),
112
- shutdown: vi.fn().mockResolvedValue(undefined),
113
- onBrokerStderr: vi.fn().mockReturnValue(() => { }),
114
- onWorkerOutput: null,
115
- onMessageReceived: null,
116
- onAgentSpawned: null,
117
- onAgentExited: null,
118
- onAgentIdle: null,
119
- listAgentsRaw: vi.fn().mockResolvedValue([]),
120
- };
121
- let relayEventCounter = 0;
122
- function emitRelayChannelMessage(message) {
123
- setTimeout(() => {
124
- mockRelayInstance.onMessageReceived?.({
125
- eventId: `evt-${++relayEventCounter}`,
126
- from: message.from,
127
- to: message.to,
128
- text: message.text,
129
- threadId: undefined,
130
- });
131
- }, 0);
132
- }
133
- vi.mock('../relay.js', () => ({
134
- AgentRelay: vi.fn().mockImplementation(() => mockRelayInstance),
135
- }));
136
- // Import after mocking
137
- const { WorkflowRunner } = await import('../workflows/runner.js');
138
- // ── Test fixtures ────────────────────────────────────────────────────────────
139
- function makeDb() {
140
- const runs = new Map();
141
- const steps = new Map();
142
- return {
143
- insertRun: vi.fn(async (run) => {
144
- runs.set(run.id, { ...run });
145
- }),
146
- updateRun: vi.fn(async (id, patch) => {
147
- const existing = runs.get(id);
148
- if (existing)
149
- runs.set(id, { ...existing, ...patch });
150
- }),
151
- getRun: vi.fn(async (id) => {
152
- const run = runs.get(id);
153
- return run ? { ...run } : null;
154
- }),
155
- insertStep: vi.fn(async (step) => {
156
- steps.set(step.id, { ...step });
157
- }),
158
- updateStep: vi.fn(async (id, patch) => {
159
- const existing = steps.get(id);
160
- if (existing)
161
- steps.set(id, { ...existing, ...patch });
162
- }),
163
- getStepsByRunId: vi.fn(async (runId) => {
164
- return [...steps.values()].filter((s) => s.runId === runId);
165
- }),
166
- };
167
- }
168
- function makeConfig(overrides = {}) {
169
- return {
170
- version: '1',
171
- name: 'completion-pipeline-test',
172
- swarm: { pattern: 'dag' },
173
- agents: [
174
- { name: 'agent-a', cli: 'claude' },
175
- { name: 'agent-b', cli: 'claude' },
176
- ],
177
- workflows: [
178
- {
179
- name: 'default',
180
- steps: [
181
- { name: 'step-1', agent: 'agent-a', task: 'Do step 1' },
182
- { name: 'step-2', agent: 'agent-b', task: 'Do step 2', dependsOn: ['step-1'] },
183
- ],
184
- },
185
- ],
186
- trajectories: false,
187
- ...overrides,
188
- };
189
- }
190
- function makeSupervisedConfig(stepOverrides = {}) {
191
- return makeConfig({
192
- agents: [
193
- { name: 'specialist', cli: 'claude', role: 'engineer' },
194
- { name: 'team-lead', cli: 'claude', role: 'lead coordinator' },
195
- { name: 'reviewer-1', cli: 'claude', role: 'reviewer' },
196
- ],
197
- workflows: [
198
- {
199
- name: 'default',
200
- steps: [
201
- {
202
- name: 'step-1',
203
- agent: 'specialist',
204
- task: 'Implement the requested change',
205
- ...stepOverrides,
206
- },
207
- ],
208
- },
209
- ],
210
- });
211
- }
212
- function makeChannelSupervisedConfig(channel, stepOverrides = {}) {
213
- const config = makeSupervisedConfig(stepOverrides);
214
- config.swarm = { ...config.swarm, channel };
215
- return config;
216
- }
217
- async function getStepRow(db, runId, stepName) {
218
- const steps = await db.getStepsByRunId(runId);
219
- return steps.find((step) => step.stepName === stepName);
220
- }
221
- // ── Tests ────────────────────────────────────────────────────────────────────
222
- describe('Completion Pipeline', () => {
223
- let db;
224
- let runner;
225
- beforeEach(() => {
226
- vi.clearAllMocks();
227
- relayEventCounter = 0;
228
- waitForExitFn = vi.fn().mockResolvedValue('exited');
229
- waitForIdleFn = vi.fn().mockImplementation(() => never());
230
- mockSpawnOutputs = [];
231
- mockAgent.release.mockResolvedValue(undefined);
232
- mockRelayInstance.spawnPty.mockImplementation(defaultSpawnPtyImplementation);
233
- mockRelayInstance.onWorkerOutput = null;
234
- db = makeDb();
235
- runner = new WorkflowRunner({ db, workspaceId: 'ws-test' });
236
- });
237
- // ── Unit Test 1: Verification passes without marker ───────────────────
238
- describe('evidence-based completion without marker', () => {
239
- it('should complete step when verification passes but STEP_COMPLETE marker is missing', async () => {
240
- // Worker output contains the verification target but no STEP_COMPLETE marker
241
- mockSpawnOutputs = [
242
- 'worker output with expected content\n',
243
- 'Owner observed the work is done\nSTEP_COMPLETE:step-1\n',
244
- 'REVIEW_DECISION: APPROVE\nREVIEW_REASON: verified\n',
245
- ];
246
- const config = makeSupervisedConfig({
247
- verification: { type: 'output_contains', value: 'expected content' },
248
- });
249
- const run = await runner.execute(config, 'default');
250
- expect(run.status).toBe('completed');
251
- }, 15000);
252
- it('should complete self-owned step when verification passes without marker', async () => {
253
- // Agent output has verified content but no STEP_COMPLETE marker
254
- // With the completion pipeline, verification passing should be sufficient
255
- mockSpawnOutputs = [
256
- 'All tests passed\nBuild successful\nSTEP_COMPLETE:step-1\n',
257
- 'REVIEW_DECISION: APPROVE\nREVIEW_REASON: tests pass\n',
258
- ];
259
- const config = makeConfig({
260
- workflows: [
261
- {
262
- name: 'default',
263
- steps: [
264
- {
265
- name: 'step-1',
266
- agent: 'agent-a',
267
- task: 'Run tests',
268
- verification: { type: 'output_contains', value: 'All tests passed' },
269
- },
270
- ],
271
- },
272
- ],
273
- });
274
- const run = await runner.execute(config, 'default');
275
- expect(run.status).toBe('completed');
276
- }, 15000);
277
- });
278
- // ── Unit Test 2: Owner approves despite malformed worker marker ────────
279
- describe('owner decision overrides malformed markers', () => {
280
- it('should complete step when owner approves despite malformed worker marker', async () => {
281
- // Worker outputs a malformed marker, but owner's STEP_COMPLETE is correct
282
- mockSpawnOutputs = [
283
- 'STEP_COMPLET:step-1\n', // typo in worker marker
284
- 'Checked worker output, work is done\nSTEP_COMPLETE:step-1\n',
285
- 'REVIEW_DECISION: APPROVE\nREVIEW_REASON: owner confirmed\n',
286
- ];
287
- const run = await runner.execute(makeSupervisedConfig(), 'default');
288
- expect(run.status).toBe('completed');
289
- }, 15000);
290
- it('should complete when owner provides OWNER_DECISION: COMPLETE', async () => {
291
- // Owner uses the structured decision format
292
- mockSpawnOutputs = [
293
- 'worker finished work\n',
294
- 'OWNER_DECISION: COMPLETE\nREASON: verified artifacts\nSTEP_COMPLETE:step-1\n',
295
- 'REVIEW_DECISION: APPROVE\nREVIEW_REASON: owner confirmed\n',
296
- ];
297
- const run = await runner.execute(makeSupervisedConfig(), 'default');
298
- expect(run.status).toBe('completed');
299
- }, 15000);
300
- });
301
- // ── Unit Test 3: Owner requests retry via OWNER_DECISION ──────────────
302
- describe('owner decision retry', () => {
303
- it('should fail with a clear error when owner requests INCOMPLETE_RETRY and retries are disabled', async () => {
304
- mockSpawnOutputs = [
305
- 'worker first attempt\n',
306
- 'OWNER_DECISION: INCOMPLETE_RETRY\nREASON: missing error handling\n',
307
- ];
308
- const run = await runner.execute(makeSupervisedConfig({ retries: 0 }), 'default');
309
- expect(run.status).toBe('failed');
310
- expect(run.error).toContain('no retries are configured (maxRetries=0)');
311
- expect(run.error).toContain('OWNER_DECISION: INCOMPLETE_RETRY');
312
- const steps = await db.getStepsByRunId(run.id);
313
- expect(steps).toHaveLength(1);
314
- expect(steps[0]?.status).toBe('failed');
315
- expect(steps[0]?.completionReason).toBe('retry_requested_by_owner');
316
- expect(mockRelayInstance.spawnPty).toHaveBeenCalledTimes(2);
317
- }, 15000);
318
- it('should retry and complete when owner requests INCOMPLETE_RETRY and retries remain', async () => {
319
- const retryEvents = [];
320
- runner.on((event) => {
321
- if (event.type === 'step:retrying') {
322
- retryEvents.push({ type: event.type, stepName: event.stepName });
323
- }
324
- });
325
- // First attempt: owner requests retry
326
- // Second attempt: owner approves
327
- mockSpawnOutputs = [
328
- 'worker first attempt\n',
329
- 'OWNER_DECISION: INCOMPLETE_RETRY\nREASON: missing error handling\n',
330
- 'worker second attempt with error handling\n',
331
- 'STEP_COMPLETE:step-1\n',
332
- 'REVIEW_DECISION: APPROVE\nREVIEW_REASON: retry succeeded\n',
333
- ];
334
- const config = makeSupervisedConfig({ retries: 1 });
335
- const run = await runner.execute(config, 'default');
336
- expect(run.status).toBe('completed');
337
- expect(retryEvents).toEqual([{ type: 'step:retrying', stepName: 'step-1' }]);
338
- const steps = await db.getStepsByRunId(run.id);
339
- expect(steps).toHaveLength(1);
340
- expect(steps[0]?.status).toBe('completed');
341
- expect(steps[0]?.retryCount).toBe(1);
342
- expect(mockRelayInstance.spawnPty).toHaveBeenCalledTimes(5);
343
- }, 15000);
344
- it('should fail after retries are exhausted when owner keeps requesting INCOMPLETE_RETRY', async () => {
345
- mockSpawnOutputs = [
346
- 'worker first attempt\n',
347
- 'OWNER_DECISION: INCOMPLETE_RETRY\nREASON: missing tests\n',
348
- 'worker second attempt\n',
349
- 'OWNER_DECISION: INCOMPLETE_RETRY\nREASON: still missing tests\n',
350
- ];
351
- const run = await runner.execute(makeSupervisedConfig({ retries: 1 }), 'default');
352
- expect(run.status).toBe('failed');
353
- expect(run.error).toContain('retry budget is exhausted (maxRetries=1)');
354
- expect(run.error).toContain('after 2 total attempts');
355
- const steps = await db.getStepsByRunId(run.id);
356
- expect(steps).toHaveLength(1);
357
- expect(steps[0]?.status).toBe('failed');
358
- expect(steps[0]?.completionReason).toBe('retry_requested_by_owner');
359
- expect(steps[0]?.retryCount).toBe(1);
360
- expect(mockRelayInstance.spawnPty).toHaveBeenCalledTimes(4);
361
- }, 15000);
362
- it('should honor INCOMPLETE_RETRY from a non-interactive reviewer step', async () => {
363
- const localDb = makeDb();
364
- runner = new WorkflowRunner({ db: localDb, workspaceId: 'ws-test' });
365
- mockSpawnOutputs = ['OWNER_DECISION: INCOMPLETE_RETRY\nREASON: explicit retry requested\n'];
366
- const run = await runner.execute(makeConfig({
367
- agents: [{ name: 'reviewer', cli: 'claude', preset: 'reviewer' }],
368
- workflows: [
369
- {
370
- name: 'default',
371
- steps: [
372
- {
373
- name: 'review-step',
374
- agent: 'reviewer',
375
- task: 'Review the artifact and decide whether to retry.',
376
- verification: { type: 'output_contains', value: 'OWNER_DECISION: INCOMPLETE_RETRY' },
377
- },
378
- ],
379
- },
380
- ],
381
- }), 'default');
382
- expect(run.status).toBe('failed');
383
- expect(run.error).toContain('owner requested another attempt');
384
- const steps = await localDb.getStepsByRunId(run.id);
385
- expect(steps).toHaveLength(1);
386
- expect(steps[0]?.status).toBe('failed');
387
- expect(steps[0]?.completionReason).toBe('retry_requested_by_owner');
388
- }, 15000);
389
- it('should not complete a self-owned step when INCOMPLETE_RETRY conflicts with success signals', async () => {
390
- mockSpawnOutputs = [
391
- [
392
- 'OWNER_DECISION: INCOMPLETE_RETRY',
393
- 'REASON: owner wants another verification pass',
394
- 'STEP_COMPLETE:step-1',
395
- 'expected content',
396
- 'verified locally',
397
- ].join('\n'),
398
- ];
399
- const run = await runner.execute(makeConfig({
400
- workflows: [
401
- {
402
- name: 'default',
403
- steps: [
404
- {
405
- name: 'step-1',
406
- agent: 'agent-a',
407
- task: 'Run tests',
408
- retries: 0,
409
- verification: { type: 'output_contains', value: 'expected content' },
410
- },
411
- ],
412
- },
413
- ],
414
- }), 'default');
415
- expect(run.status).toBe('failed');
416
- expect(run.error).toContain('no retries are configured (maxRetries=0)');
417
- const steps = await db.getStepsByRunId(run.id);
418
- expect(steps).toHaveLength(1);
419
- expect(steps[0]?.status).toBe('failed');
420
- expect(steps[0]?.completionReason).toBe('retry_requested_by_owner');
421
- expect(mockRelayInstance.spawnPty).toHaveBeenCalledTimes(1);
422
- }, 15000);
423
- it('should not let passing verification override INCOMPLETE_RETRY', async () => {
424
- mockSpawnOutputs = [
425
- 'worker output with expected content\n',
426
- [
427
- 'OWNER_DECISION: INCOMPLETE_RETRY',
428
- 'REASON: missing WORKER_DONE marker',
429
- 'verified artifacts after inspecting output',
430
- 'worker finished implementation',
431
- ].join('\n'),
432
- ];
433
- const run = await runner.execute(makeSupervisedConfig({
434
- verification: { type: 'output_contains', value: 'expected content' },
435
- }), 'default');
436
- expect(run.status).toBe('failed');
437
- expect(mockRelayInstance.spawnPty).toHaveBeenCalledTimes(2);
438
- }, 15000);
439
- it('should not let passing verification override NEEDS_CLARIFICATION', async () => {
440
- mockSpawnOutputs = [
441
- 'worker output with expected content\n',
442
- [
443
- 'OWNER_DECISION: NEEDS_CLARIFICATION',
444
- 'REASON: owner needs proof of the channel handoff',
445
- 'verified artifacts after inspecting output',
446
- ].join('\n'),
447
- ];
448
- const run = await runner.execute(makeSupervisedConfig({
449
- verification: { type: 'output_contains', value: 'expected content' },
450
- }), 'default');
451
- expect(run.status).toBe('failed');
452
- expect(mockRelayInstance.spawnPty).toHaveBeenCalledTimes(2);
453
- }, 15000);
454
- });
455
- // ── Unit Test 4: Owner rejects AND verification fails ─────────────────
456
- describe('double failure: owner reject + verification fail', () => {
457
- it('should fail step when owner rejects AND verification also fails', async () => {
458
- mockSpawnOutputs = [
459
- 'worker output without expected content\n',
460
- 'OWNER_DECISION: INCOMPLETE_FAIL\nREASON: work is wrong\n',
461
- ];
462
- const config = makeSupervisedConfig({
463
- verification: { type: 'output_contains', value: 'expected output' },
464
- });
465
- const run = await runner.execute(config, 'default');
466
- expect(run.status).toBe('failed');
467
- }, 15000);
468
- it('should fail when owner rejects even if verification passes', async () => {
469
- mockSpawnOutputs = [
470
- 'worker output with expected content\n',
471
- [
472
- 'OWNER_DECISION: INCOMPLETE_FAIL',
473
- 'REASON: work is incomplete without WORKER_DONE proof',
474
- 'artifacts verified locally',
475
- 'worker finished implementation',
476
- ].join('\n'),
477
- ];
478
- const run = await runner.execute(makeSupervisedConfig({
479
- verification: { type: 'output_contains', value: 'expected content' },
480
- }), 'default');
481
- expect(run.status).toBe('failed');
482
- expect(mockRelayInstance.spawnPty).toHaveBeenCalledTimes(2);
483
- }, 15000);
484
- it('should still complete by owner decision when COMPLETE and verification both pass', async () => {
485
- mockSpawnOutputs = [
486
- 'worker output with expected content\n',
487
- 'OWNER_DECISION: COMPLETE\nREASON: verified artifacts\n',
488
- 'REVIEW_DECISION: APPROVE\nREVIEW_REASON: owner confirmed\n',
489
- ];
490
- const run = await runner.execute(makeSupervisedConfig({
491
- verification: { type: 'output_contains', value: 'expected content' },
492
- }), 'default');
493
- expect(run.status).toBe('completed');
494
- const [step] = await db.getStepsByRunId(run.id);
495
- expect(step?.completionReason).toBe('completed_by_owner_decision');
496
- }, 15000);
497
- it('should fail verification before accepting OWNER_DECISION COMPLETE', async () => {
498
- mockSpawnOutputs = [
499
- 'worker output without the required token\n',
500
- 'OWNER_DECISION: COMPLETE\nREASON: verified artifacts\n',
501
- ];
502
- const run = await runner.execute(makeSupervisedConfig({
503
- verification: { type: 'output_contains', value: 'expected content' },
504
- }), 'default');
505
- expect(run.status).toBe('failed');
506
- expect(mockRelayInstance.spawnPty).toHaveBeenCalledTimes(2);
507
- }, 15000);
508
- it('should still complete as verified when no owner decision is provided and verification passes', async () => {
509
- mockSpawnOutputs = [
510
- 'worker output with expected content\n',
511
- 'Owner checked the output and left no structured decision.\n',
512
- 'REVIEW_DECISION: APPROVE\nREVIEW_REASON: verification passed\n',
513
- ];
514
- const run = await runner.execute(makeSupervisedConfig({
515
- verification: { type: 'output_contains', value: 'expected content' },
516
- }), 'default');
517
- expect(run.status).toBe('completed');
518
- const [step] = await db.getStepsByRunId(run.id);
519
- expect(step?.completionReason).toBe('completed_verified');
520
- }, 15000);
521
- });
522
- // ── Unit Test 5: Tolerant review parser ────────────────────────────────
523
- describe('tolerant review parsing', () => {
524
- it('should accept standard REVIEW_DECISION: APPROVE format', async () => {
525
- const events = [];
526
- runner.on((event) => {
527
- if (event.type === 'step:review-completed') {
528
- events.push({ type: event.type, decision: event.decision });
529
- }
530
- });
531
- mockSpawnOutputs = [
532
- 'STEP_COMPLETE:step-1\n',
533
- 'REVIEW_DECISION: APPROVE\nREVIEW_REASON: all good\n',
534
- ];
535
- const run = await runner.execute(makeConfig(), 'default');
536
- expect(run.status).toBe('completed');
537
- expect(events).toContainEqual({ type: 'step:review-completed', decision: 'approved' });
538
- }, 15000);
539
- it('should accept standard REVIEW_DECISION: REJECT format', async () => {
540
- const events = [];
541
- runner.on((event) => {
542
- if (event.type === 'step:review-completed') {
543
- events.push({ type: event.type, decision: event.decision });
544
- }
545
- });
546
- mockSpawnOutputs = [
547
- 'STEP_COMPLETE:step-1\n',
548
- 'REVIEW_DECISION: REJECT\nREVIEW_REASON: needs work\n',
549
- ];
550
- const run = await runner.execute(makeConfig(), 'default');
551
- expect(run.status).toBe('failed');
552
- expect(run.error).toContain('review rejected');
553
- expect(events).toContainEqual({ type: 'step:review-completed', decision: 'rejected' });
554
- }, 15000);
555
- // These tests validate the tolerant parser once it's implemented.
556
- // The tolerant parser should accept semantic equivalents.
557
- it('should still fail on review output with no usable approval or rejection signal', async () => {
558
- mockSpawnOutputs = [
559
- 'STEP_COMPLETE:step-1\n',
560
- 'I need more context before deciding.\n',
561
- ];
562
- const run = await runner.execute(makeConfig(), 'default');
563
- expect(run.status).toBe('failed');
564
- expect(run.error).toContain('review response malformed');
565
- }, 15000);
566
- });
567
- // ── Unit Test 6: Channel evidence ─────────────────────────────────────
568
- describe('channel evidence for completion', () => {
569
- it('should capture WORKER_DONE signals from channel messages', async () => {
570
- // Worker posts done signal, owner observes and confirms
571
- mockSpawnOutputs = [
572
- 'WORKER_DONE: all tasks completed\n',
573
- 'Worker reported done on channel, verified artifacts\nSTEP_COMPLETE:step-1\n',
574
- 'REVIEW_DECISION: APPROVE\nREVIEW_REASON: channel evidence confirms\n',
575
- ];
576
- const run = await runner.execute(makeSupervisedConfig(), 'default');
577
- expect(run.status).toBe('completed');
578
- // Verify the channel received the worker done signal
579
- const channelMessages = mockRelaycastAgent.send.mock.calls.map(([, text]) => text);
580
- expect(channelMessages.some((text) => text.includes('WORKER_DONE'))).toBe(true);
581
- const evidence = runner.getStepCompletionEvidence('step-1');
582
- const workerDoneSignals = evidence?.coordinationSignals.filter((signal) => signal.kind === 'worker_done' && signal.source === 'channel') ?? [];
583
- expect(workerDoneSignals.some((signal) => signal.sender === 'specialist')).toBe(true);
584
- }, 15000);
585
- it('should forward worker channel evidence to the owner prompt', async () => {
586
- mockSpawnOutputs = [
587
- 'implementation complete\nWORKER_DONE: finished feature\n',
588
- 'Observed WORKER_DONE on channel\nSTEP_COMPLETE:step-1\n',
589
- 'REVIEW_DECISION: APPROVE\nREVIEW_REASON: looks good\n',
590
- ];
591
- const run = await runner.execute(makeSupervisedConfig(), 'default');
592
- expect(run.status).toBe('completed');
593
- }, 15000);
594
- it('should not count lead-authored WORKER_DONE channel posts as worker completion evidence', async () => {
595
- waitForExitFn = vi.fn().mockImplementation(async () => {
596
- await new Promise((resolve) => setTimeout(resolve, 5));
597
- return 'exited';
598
- });
599
- mockRelayInstance.spawnPty.mockImplementation(async ({ name, task }) => {
600
- const agent = await defaultSpawnPtyImplementation({ name, task });
601
- if (task?.includes('You are the step owner/supervisor for step "step-1".')) {
602
- emitRelayChannelMessage({
603
- from: agent.name,
604
- to: 'completion-provenance',
605
- text: 'WORKER_DONE: lead summarized the handoff',
606
- });
607
- }
608
- return agent;
609
- });
610
- mockSpawnOutputs = [
611
- 'worker progress update only\n',
612
- 'Owner observed the channel but left no decision.\n',
613
- ];
614
- const config = makeSupervisedConfig();
615
- config.swarm = { ...config.swarm, channel: 'completion-provenance' };
616
- const run = await runner.execute(config, 'default');
617
- expect(run.status).toBe('failed');
618
- expect(run.error).toContain('owner completion decision missing');
619
- await new Promise((resolve) => setTimeout(resolve, 0));
620
- const evidence = runner.getStepCompletionEvidence('step-1');
621
- const spoofedPosts = evidence?.channelPosts.filter((post) => post.sender === 'team-lead' && post.text.includes('WORKER_DONE')) ?? [];
622
- expect(spoofedPosts.length).toBeGreaterThan(0);
623
- expect(evidence?.coordinationSignals.filter((signal) => signal.kind === 'worker_done') ?? []).toHaveLength(0);
624
- const spoofedPost = evidence?.channelPosts.find((post) => post.sender === 'team-lead' && post.text.includes('WORKER_DONE'));
625
- expect(spoofedPost?.signals.some((signal) => signal.kind === 'worker_done') ?? false).toBe(false);
626
- }, 15000);
627
- it('should filter wrong-agent coordination signals from the evidence view', async () => {
628
- mockSpawnOutputs = [
629
- 'LEAD_DONE: worker cannot declare lead completion\nWORKER_DONE: all tasks completed\n',
630
- 'Owner confirmed\nSTEP_COMPLETE:step-1\n',
631
- 'REVIEW_DECISION: APPROVE\nREVIEW_REASON: verified\n',
632
- ];
633
- const run = await runner.execute(makeSupervisedConfig(), 'default');
634
- expect(run.status).toBe('completed');
635
- const evidence = runner.getStepCompletionEvidence('step-1');
636
- expect(evidence?.coordinationSignals.filter((signal) => signal.kind === 'lead_done')).toHaveLength(0);
637
- expect(evidence?.coordinationSignals.some((signal) => signal.kind === 'worker_done' && signal.sender === 'specialist')).toBe(true);
638
- }, 15000);
639
- });
640
- describe('happy-path lead-worker workflow proof', () => {
641
- it('should complete by evidence when the worker posts WORKER_DONE on the channel', async () => {
642
- const channel = 'happy-path-worker-done';
643
- waitForExitFn = vi.fn().mockImplementation(async () => {
644
- await new Promise((resolve) => setTimeout(resolve, 5));
645
- return 'exited';
646
- });
647
- mockRelayInstance.spawnPty.mockImplementation(async ({ name, task }) => {
648
- const agent = await defaultSpawnPtyImplementation({ name, task });
649
- if (name.includes('step-1-worker')) {
650
- emitRelayChannelMessage({
651
- from: agent.name,
652
- to: channel,
653
- text: 'WORKER_DONE: implementation shipped',
654
- });
655
- }
656
- return agent;
657
- });
658
- mockSpawnOutputs = [
659
- 'artifact bundle ready\n',
660
- 'Lead verified the worker handoff is complete and safe.\n',
661
- ];
662
- const run = await runner.execute(makeChannelSupervisedConfig(channel), 'default');
663
- expect(run.status).toBe('completed');
664
- const step = await getStepRow(db, run.id, 'step-1');
665
- expect(step?.completionReason).toBe('completed_by_evidence');
666
- const evidence = runner.getStepCompletionEvidence('step-1');
667
- expect(evidence?.coordinationSignals.some((signal) => signal.kind === 'worker_done' &&
668
- signal.source === 'channel' &&
669
- signal.sender === 'specialist')).toBe(true);
670
- expect(evidence?.coordinationSignals.some((signal) => signal.kind === 'step_complete')).toBe(false);
671
- }, 15000);
672
- it('should capture WORKER_DONE plus LEAD_DONE and complete cleanly', async () => {
673
- const channel = 'happy-path-lead-worker-done';
674
- waitForExitFn = vi.fn().mockImplementation(async () => {
675
- await new Promise((resolve) => setTimeout(resolve, 5));
676
- return 'exited';
677
- });
678
- mockRelayInstance.spawnPty.mockImplementation(async ({ name, task }) => {
679
- const agent = await defaultSpawnPtyImplementation({ name, task });
680
- if (name.includes('step-1-worker')) {
681
- emitRelayChannelMessage({
682
- from: agent.name,
683
- to: channel,
684
- text: 'WORKER_DONE: handoff package posted',
685
- });
686
- }
687
- if (name.includes('step-1-owner')) {
688
- emitRelayChannelMessage({
689
- from: agent.name,
690
- to: channel,
691
- text: 'LEAD_DONE: lead confirmed the worker handoff',
692
- });
693
- }
694
- return agent;
695
- });
696
- mockSpawnOutputs = [
697
- 'artifact bundle ready\n',
698
- 'Lead confirmed the handoff is complete and safe for review.\n',
699
- ];
700
- const run = await runner.execute(makeChannelSupervisedConfig(channel), 'default');
701
- expect(run.status).toBe('completed');
702
- const step = await getStepRow(db, run.id, 'step-1');
703
- expect(step?.completionReason).toBe('completed_by_evidence');
704
- const evidence = runner.getStepCompletionEvidence('step-1');
705
- expect(evidence?.coordinationSignals.some((signal) => signal.kind === 'worker_done' &&
706
- signal.source === 'channel' &&
707
- signal.sender === 'specialist')).toBe(true);
708
- expect(evidence?.coordinationSignals.some((signal) => signal.kind === 'lead_done' &&
709
- signal.source === 'channel' &&
710
- signal.sender === 'team-lead')).toBe(true);
711
- }, 15000);
712
- it('should complete as verified when lead-worker verification passes without coordination markers', async () => {
713
- mockSpawnOutputs = [
714
- 'worker output with expected content\n',
715
- 'Lead checked the implementation and found it correct.\n',
716
- ];
717
- const run = await runner.execute(makeSupervisedConfig({ verification: { type: 'output_contains', value: 'expected content' } }), 'default');
718
- expect(run.status).toBe('completed');
719
- const step = await getStepRow(db, run.id, 'step-1');
720
- expect(step?.completionReason).toBe('completed_verified');
721
- const evidence = runner.getStepCompletionEvidence('step-1');
722
- expect(evidence?.coordinationSignals.some((signal) => signal.kind === 'worker_done')).toBe(false);
723
- expect(evidence?.coordinationSignals.some((signal) => signal.kind === 'lead_done')).toBe(false);
724
- }, 15000);
725
- it('should complete multiple supervised workers in sequence for a map-reduce style flow', async () => {
726
- const channel = 'happy-path-map-reduce';
727
- waitForExitFn = vi.fn().mockImplementation(async () => {
728
- await new Promise((resolve) => setTimeout(resolve, 5));
729
- return 'exited';
730
- });
731
- mockRelayInstance.spawnPty.mockImplementation(async ({ name, task }) => {
732
- const isReview = task?.includes('REVIEW_DECISION: APPROVE or REJECT');
733
- const output = isReview
734
- ? 'REVIEW_DECISION: APPROVE\nREVIEW_REASON: map-reduce happy path verified\n'
735
- : name.includes('map-1-worker')
736
- ? 'map artifact A ready\n'
737
- : name.includes('map-1-owner')
738
- ? 'Lead verified shard A is complete and safe.\n'
739
- : name.includes('map-2-worker')
740
- ? 'map artifact B ready\n'
741
- : name.includes('map-2-owner')
742
- ? 'Lead verified shard B is complete and safe.\n'
743
- : name.includes('reduce-worker')
744
- ? 'reduce artifact ready\n'
745
- : name.includes('reduce-owner')
746
- ? 'Lead verified the reduction is complete and safe.\n'
747
- : 'STEP_COMPLETE:unknown\n';
748
- queueMicrotask(() => {
749
- if (typeof mockRelayInstance.onWorkerOutput === 'function') {
750
- mockRelayInstance.onWorkerOutput({ name, chunk: output });
751
- }
752
- });
753
- const agent = { ...mockAgent, name };
754
- if (name.includes('map-1-worker')) {
755
- emitRelayChannelMessage({
756
- from: agent.name,
757
- to: channel,
758
- text: 'WORKER_DONE: map shard A complete',
759
- });
760
- }
761
- if (name.includes('map-2-worker')) {
762
- emitRelayChannelMessage({
763
- from: agent.name,
764
- to: channel,
765
- text: 'WORKER_DONE: map shard B complete',
766
- });
767
- }
768
- if (name.includes('reduce-worker')) {
769
- emitRelayChannelMessage({
770
- from: agent.name,
771
- to: channel,
772
- text: 'WORKER_DONE: reduce pass complete',
773
- });
774
- }
775
- return agent;
776
- });
777
- const config = makeConfig({
778
- swarm: { pattern: 'map-reduce', channel },
779
- agents: [
780
- { name: 'mapper-1', cli: 'claude', role: 'engineer' },
781
- { name: 'mapper-2', cli: 'claude', role: 'engineer' },
782
- { name: 'reducer', cli: 'claude', role: 'engineer' },
783
- { name: 'team-lead', cli: 'claude', role: 'lead coordinator' },
784
- { name: 'reviewer-1', cli: 'claude', role: 'reviewer' },
785
- ],
786
- workflows: [
787
- {
788
- name: 'default',
789
- steps: [
790
- { name: 'map-1', agent: 'mapper-1', task: 'Process shard A' },
791
- { name: 'map-2', agent: 'mapper-2', task: 'Process shard B' },
792
- { name: 'reduce', agent: 'reducer', task: 'Combine mapped results', dependsOn: ['map-1', 'map-2'] },
793
- ],
794
- },
795
- ],
796
- });
797
- const run = await runner.execute(config, 'default');
798
- expect(run.status).toBe('completed');
799
- const steps = await db.getStepsByRunId(run.id);
800
- expect(steps.map((step) => step.stepName)).toEqual(['map-1', 'map-2', 'reduce']);
801
- expect(steps.map((step) => step.status)).toEqual(['completed', 'completed', 'completed']);
802
- expect(steps.map((step) => step.completionReason)).toEqual([
803
- 'completed_by_evidence',
804
- 'completed_by_evidence',
805
- 'completed_by_evidence',
806
- ]);
807
- expect(runner
808
- .getStepCompletionEvidence('reduce')
809
- ?.coordinationSignals.some((signal) => signal.kind === 'worker_done' &&
810
- signal.source === 'channel' &&
811
- signal.sender === 'reducer')).toBe(true);
812
- }, 15000);
813
- it('should still complete when WORKER_DONE lands after the lead checks the work', async () => {
814
- const channel = 'happy-path-delayed-worker-done';
815
- const observedOrder = [];
816
- mockRelayInstance.spawnPty.mockImplementation(async ({ name, task }) => {
817
- const agent = await defaultSpawnPtyImplementation({ name, task });
818
- if (name.includes('step-1-worker')) {
819
- setTimeout(() => {
820
- observedOrder.push('worker-done-message');
821
- emitRelayChannelMessage({
822
- from: agent.name,
823
- to: channel,
824
- text: 'WORKER_DONE: delayed handoff posted',
825
- });
826
- }, 10);
827
- return {
828
- ...agent,
829
- waitForExit: vi.fn().mockImplementation(async () => {
830
- await new Promise((resolve) => setTimeout(resolve, 15));
831
- return 'exited';
832
- }),
833
- };
834
- }
835
- if (name.includes('step-1-owner')) {
836
- return {
837
- ...agent,
838
- waitForExit: vi.fn().mockImplementation(async () => {
839
- observedOrder.push('owner-finished-check');
840
- return 'exited';
841
- }),
842
- };
843
- }
844
- return agent;
845
- });
846
- mockSpawnOutputs = [
847
- 'artifact bundle ready but handoff signal is delayed\n',
848
- 'Lead checked the artifacts early and the work still looks complete and safe.\n',
849
- ];
850
- const run = await runner.execute(makeChannelSupervisedConfig(channel), 'default');
851
- expect(run.status).toBe('completed');
852
- expect(observedOrder).toEqual(['owner-finished-check', 'worker-done-message']);
853
- const step = await getStepRow(db, run.id, 'step-1');
854
- expect(step?.completionReason).toBe('completed_by_evidence');
855
- expect(runner
856
- .getStepCompletionEvidence('step-1')
857
- ?.coordinationSignals.some((signal) => signal.kind === 'worker_done' &&
858
- signal.source === 'channel' &&
859
- signal.value === 'delayed handoff posted')).toBe(true);
860
- }, 15000);
861
- });
862
- // ── Integration Test 1: Codex lead/worker without marker ──────────────
863
- describe('Codex lead/worker completion', () => {
864
- it('should complete when codex lead omits STEP_COMPLETE but owner logic still completes', async () => {
865
- // Codex agents use `codex exec` and may not emit the exact marker.
866
- // With a verification gate, the step should still complete.
867
- mockSpawnOutputs = [
868
- 'worker: implemented the feature\n',
869
- 'Lead verified: all changes look correct\nSTEP_COMPLETE:step-1\n',
870
- 'REVIEW_DECISION: APPROVE\nREVIEW_REASON: verified\n',
871
- ];
872
- const config = makeSupervisedConfig();
873
- // Override to codex CLI
874
- config.agents = [
875
- { name: 'specialist', cli: 'codex', role: 'engineer' },
876
- { name: 'team-lead', cli: 'codex', role: 'lead coordinator' },
877
- { name: 'reviewer-1', cli: 'claude', role: 'reviewer' },
878
- ];
879
- const run = await runner.execute(config, 'default');
880
- expect(run.status).toBe('completed');
881
- expect(mockRelayInstance.spawnPty.mock.calls.some(([input]) => input.cli === 'codex' &&
882
- Array.isArray(input.args) &&
883
- input.args.includes('--dangerously-bypass-approvals-and-sandbox'))).toBe(true);
884
- }, 15000);
885
- });
886
- // ── Integration Test 2: Gemini lead/worker with channel completion ────
887
- describe('Gemini lead/worker with channel completion', () => {
888
- it('should complete when gemini worker posts channel completion and owner finalizes', async () => {
889
- mockSpawnOutputs = [
890
- 'Worker output: feature implemented\nWORKER_DONE: task complete\n',
891
- 'Observed worker completion on channel\nSTEP_COMPLETE:step-1\n',
892
- 'REVIEW_DECISION: APPROVE\nREVIEW_REASON: channel evidence\n',
893
- ];
894
- const config = makeSupervisedConfig();
895
- config.agents = [
896
- { name: 'specialist', cli: 'gemini', role: 'engineer' },
897
- { name: 'team-lead', cli: 'gemini', role: 'lead coordinator' },
898
- { name: 'reviewer-1', cli: 'claude', role: 'reviewer' },
899
- ];
900
- const run = await runner.execute(config, 'default');
901
- expect(run.status).toBe('completed');
902
- }, 15000);
903
- });
904
- // ── Integration Test 3: Supervisor without exact review sentinel ───────
905
- describe('Supervisor workflow completion', () => {
906
- it('should complete supervised step with standard review flow', async () => {
907
- mockSpawnOutputs = [
908
- 'worker built the feature\n',
909
- 'Verified: code passes tests\nSTEP_COMPLETE:step-1\n',
910
- 'REVIEW_DECISION: APPROVE\nREVIEW_REASON: correct implementation\n',
911
- ];
912
- const run = await runner.execute(makeSupervisedConfig(), 'default');
913
- expect(run.status).toBe('completed');
914
- }, 15000);
915
- });
916
- // ── Integration Test 4: Map-reduce workflow remains unaffected ─────────
917
- describe('Map-reduce workflow backward compatibility', () => {
918
- it('should complete map-reduce workflow with standard markers', async () => {
919
- const config = makeConfig({
920
- swarm: { pattern: 'map-reduce' },
921
- agents: [
922
- { name: 'mapper-1', cli: 'claude' },
923
- { name: 'mapper-2', cli: 'claude' },
924
- { name: 'reducer', cli: 'claude' },
925
- ],
926
- workflows: [
927
- {
928
- name: 'default',
929
- steps: [
930
- { name: 'map-1', agent: 'mapper-1', task: 'Process chunk A' },
931
- { name: 'map-2', agent: 'mapper-2', task: 'Process chunk B' },
932
- { name: 'reduce', agent: 'reducer', task: 'Combine results', dependsOn: ['map-1', 'map-2'] },
933
- ],
934
- },
935
- ],
936
- });
937
- const run = await runner.execute(config, 'default');
938
- expect(run.status).toBe('completed');
939
- }, 15000);
940
- });
941
- // ── Integration Test 5: Legacy marker-based workflows ─────────────────
942
- describe('Legacy marker-based workflows', () => {
943
- it('should still complete with explicit STEP_COMPLETE marker (backward compat)', async () => {
944
- // The classic marker-based flow should continue to work unchanged
945
- const run = await runner.execute(makeConfig(), 'default');
946
- expect(run.status).toBe('completed');
947
- }, 15000);
948
- it('should still fail when marker, owner decision, and evidence are all missing', async () => {
949
- mockSpawnOutputs = ['Did the work but no marker\n'];
950
- const run = await runner.execute(makeConfig(), 'default');
951
- expect(run.status).toBe('failed');
952
- expect(run.error).toContain('owner completion decision missing');
953
- }, 15000);
954
- it('should still support explicit REVIEW_DECISION: APPROVE flow', async () => {
955
- mockSpawnOutputs = [
956
- 'STEP_COMPLETE:step-1\n',
957
- 'REVIEW_DECISION: APPROVE\nREVIEW_REASON: standard approval\n',
958
- ];
959
- const events = [];
960
- runner.on((event) => {
961
- if (event.type === 'step:review-completed') {
962
- events.push({ type: event.type, decision: event.decision });
963
- }
964
- });
965
- const run = await runner.execute(makeConfig(), 'default');
966
- expect(run.status).toBe('completed');
967
- expect(events).toContainEqual({ type: 'step:review-completed', decision: 'approved' });
968
- }, 15000);
969
- it('should still support explicit REVIEW_DECISION: REJECT flow', async () => {
970
- mockSpawnOutputs = [
971
- 'STEP_COMPLETE:step-1\n',
972
- 'REVIEW_DECISION: REJECT\nREVIEW_REASON: standard rejection\n',
973
- ];
974
- const run = await runner.execute(makeConfig(), 'default');
975
- expect(run.status).toBe('failed');
976
- expect(run.error).toContain('review rejected');
977
- }, 15000);
978
- it('should still fail closed on malformed review output', async () => {
979
- mockSpawnOutputs = [
980
- 'STEP_COMPLETE:step-1\n',
981
- 'I think this looks ok\n',
982
- ];
983
- const run = await runner.execute(makeConfig(), 'default');
984
- expect(run.status).toBe('failed');
985
- expect(run.error).toContain('review response malformed');
986
- }, 15000);
987
- it('should preserve owner/specialist separation in supervised workflows', async () => {
988
- mockSpawnOutputs = [
989
- 'worker finished\n',
990
- 'Owner verified\nSTEP_COMPLETE:step-1\n',
991
- 'REVIEW_DECISION: APPROVE\nREVIEW_REASON: good\n',
992
- ];
993
- const ownerAssignments = [];
994
- runner.on((event) => {
995
- if (event.type === 'step:owner-assigned') {
996
- ownerAssignments.push({ owner: event.ownerName, specialist: event.specialistName });
997
- }
998
- });
999
- const run = await runner.execute(makeSupervisedConfig(), 'default');
1000
- expect(run.status).toBe('completed');
1001
- expect(ownerAssignments).toHaveLength(1);
1002
- expect(ownerAssignments[0].owner).toBe('team-lead');
1003
- expect(ownerAssignments[0].specialist).toBe('specialist');
1004
- }, 15000);
1005
- });
1006
- // ── Backward compat: event emission ───────────────────────────────────
1007
- describe('backward compatibility: event emission', () => {
1008
- it('should emit run:started and run:completed events', async () => {
1009
- const events = [];
1010
- runner.on((event) => events.push(event.type));
1011
- await runner.execute(makeConfig(), 'default');
1012
- expect(events).toContain('run:started');
1013
- expect(events).toContain('run:completed');
1014
- }, 15000);
1015
- it('should emit step:started and step:completed events in order', async () => {
1016
- const stepEvents = [];
1017
- runner.on((event) => {
1018
- if (event.type.startsWith('step:')) {
1019
- stepEvents.push({
1020
- type: event.type,
1021
- stepName: 'stepName' in event ? event.stepName : undefined,
1022
- });
1023
- }
1024
- });
1025
- await runner.execute(makeConfig(), 'default');
1026
- const startedSteps = stepEvents.filter((e) => e.type === 'step:started');
1027
- const completedSteps = stepEvents.filter((e) => e.type === 'step:completed');
1028
- expect(startedSteps).toHaveLength(2);
1029
- expect(completedSteps).toHaveLength(2);
1030
- }, 15000);
1031
- it('should emit owner-assigned events for all steps', async () => {
1032
- const ownerEvents = [];
1033
- runner.on((event) => {
1034
- if (event.type === 'step:owner-assigned') {
1035
- ownerEvents.push(event.stepName);
1036
- }
1037
- });
1038
- await runner.execute(makeConfig(), 'default');
1039
- expect(ownerEvents).toHaveLength(2);
1040
- }, 15000);
1041
- it('should emit review-completed events for all interactive steps', async () => {
1042
- const reviewEvents = [];
1043
- runner.on((event) => {
1044
- if (event.type === 'step:review-completed') {
1045
- reviewEvents.push(event.stepName);
1046
- }
1047
- });
1048
- await runner.execute(makeConfig(), 'default');
1049
- expect(reviewEvents).toHaveLength(2);
1050
- }, 15000);
1051
- });
1052
- // ── Backward compat: DAG execution ordering ───────────────────────────
1053
- describe('backward compatibility: DAG execution', () => {
1054
- it('should execute steps in dependency order', async () => {
1055
- const completedSteps = [];
1056
- runner.on((event) => {
1057
- if (event.type === 'step:completed') {
1058
- completedSteps.push(event.stepName);
1059
- }
1060
- });
1061
- await runner.execute(makeConfig(), 'default');
1062
- const idx1 = completedSteps.indexOf('step-1');
1063
- const idx2 = completedSteps.indexOf('step-2');
1064
- expect(idx1).toBeLessThan(idx2);
1065
- }, 15000);
1066
- it('should run parallel steps concurrently', async () => {
1067
- const startTimes = {};
1068
- runner.on((event) => {
1069
- if (event.type === 'step:started') {
1070
- startTimes[event.stepName] = Date.now();
1071
- }
1072
- });
1073
- const config = makeConfig({
1074
- workflows: [
1075
- {
1076
- name: 'default',
1077
- steps: [
1078
- { name: 'a', agent: 'agent-a', task: 'Do A' },
1079
- { name: 'b', agent: 'agent-b', task: 'Do B' },
1080
- { name: 'c', agent: 'agent-a', task: 'Do C', dependsOn: ['a', 'b'] },
1081
- ],
1082
- },
1083
- ],
1084
- });
1085
- const run = await runner.execute(config, 'default');
1086
- expect(run.status).toBe('completed');
1087
- // a and b should start nearly simultaneously (within 100ms)
1088
- const diff = Math.abs((startTimes['a'] ?? 0) - (startTimes['b'] ?? 0));
1089
- expect(diff).toBeLessThan(1000);
1090
- }, 15000);
1091
- });
1092
- // ── Backward compat: CLI command building ─────────────────────────────
1093
- describe('backward compatibility: CLI command building', () => {
1094
- it('should build claude command correctly', () => {
1095
- const { cmd, args } = WorkflowRunner.buildNonInteractiveCommand('claude', 'Task');
1096
- expect(cmd).toBe('claude');
1097
- expect(args).toContain('-p');
1098
- });
1099
- it('should build codex command correctly', () => {
1100
- const { cmd, args } = WorkflowRunner.buildNonInteractiveCommand('codex', 'Task');
1101
- expect(cmd).toBe('codex');
1102
- expect(args).toContain('exec');
1103
- });
1104
- it('should build gemini command correctly', () => {
1105
- const { cmd, args } = WorkflowRunner.buildNonInteractiveCommand('gemini', 'Task');
1106
- expect(cmd).toBe('gemini');
1107
- expect(args).toContain('-p');
1108
- });
1109
- });
1110
- // ── Backward compat: variable resolution ──────────────────────────────
1111
- describe('backward compatibility: variable resolution', () => {
1112
- it('should resolve {{var}} in step tasks', async () => {
1113
- const config = makeConfig();
1114
- config.workflows[0].steps[0].task = 'Build {{feature}}';
1115
- const run = await runner.execute(config, 'default', { feature: 'auth' });
1116
- expect(run.status, run.error).toBe('completed');
1117
- }, 15000);
1118
- it('should throw on unresolved variables', () => {
1119
- const config = makeConfig({
1120
- agents: [{ name: 'a', cli: 'claude', task: 'Fix {{unknown}}' }],
1121
- });
1122
- expect(() => runner.resolveVariables(config, {})).toThrow('Unresolved variable: {{unknown}}');
1123
- });
1124
- });
1125
- // ── Backward compat: review PTY echo handling ─────────────────────────
1126
- describe('backward compatibility: review PTY echo handling', () => {
1127
- it('should parse last REVIEW_DECISION when PTY echoes prompt', async () => {
1128
- const events = [];
1129
- runner.on((event) => {
1130
- if (event.type === 'step:review-completed') {
1131
- events.push({ type: event.type, decision: event.decision });
1132
- }
1133
- });
1134
- const echoedPrompt = 'Return exactly:\nREVIEW_DECISION: APPROVE or REJECT\nREVIEW_REASON: <one sentence>\n';
1135
- const actualResponse = 'REVIEW_DECISION: REJECT\nREVIEW_REASON: code has bugs\n';
1136
- mockSpawnOutputs = ['STEP_COMPLETE:step-1\n', echoedPrompt + actualResponse];
1137
- const run = await runner.execute(makeConfig(), 'default');
1138
- expect(run.status).toBe('failed');
1139
- expect(events).toContainEqual({ type: 'step:review-completed', decision: 'rejected' });
1140
- }, 15000);
1141
- });
1142
- // ── Backward compat: timeout handling ─────────────────────────────────
1143
- describe('backward compatibility: timeout handling', () => {
1144
- it('should emit step:owner-timeout on timeout', async () => {
1145
- const events = [];
1146
- runner.on((event) => {
1147
- if (event.type === 'step:owner-timeout') {
1148
- events.push({ type: event.type, stepName: event.stepName });
1149
- }
1150
- });
1151
- waitForExitFn = vi.fn().mockResolvedValue('timeout');
1152
- waitForIdleFn = vi.fn().mockResolvedValue('timeout');
1153
- const run = await runner.execute(makeConfig(), 'default');
1154
- expect(run.status).toBe('failed');
1155
- expect(events).toContainEqual({ type: 'step:owner-timeout', stepName: 'step-1' });
1156
- }, 15000);
1157
- });
1158
- // ── Phase 1 compatibility mode ────────────────────────────────────────
1159
- describe('Phase 1 compatibility mode', () => {
1160
- it('should keep markers as fast-path for completion', async () => {
1161
- // When the marker is present, it should complete immediately without
1162
- // needing to evaluate the full evidence pipeline
1163
- const run = await runner.execute(makeConfig(), 'default');
1164
- expect(run.status).toBe('completed');
1165
- }, 15000);
1166
- it('should accept both old marker format and new OWNER_DECISION format', async () => {
1167
- // Old format still works
1168
- mockSpawnOutputs = ['STEP_COMPLETE:step-1\n'];
1169
- const run1 = await runner.execute(makeConfig({
1170
- workflows: [
1171
- { name: 'default', steps: [{ name: 'step-1', agent: 'agent-a', task: 'Do it' }] },
1172
- ],
1173
- }), 'default');
1174
- expect(run1.status).toBe('completed');
1175
- }, 15000);
1176
- });
1177
- // ── Evidence interface tests ──────────────────────────────────────────
1178
- describe('evidence collection interface', () => {
1179
- it('should expose getStepCompletionEvidence() on runner', () => {
1180
- expect(typeof runner.getStepCompletionEvidence).toBe('function');
1181
- });
1182
- it('should return undefined for unknown step names', () => {
1183
- const evidence = runner.getStepCompletionEvidence('nonexistent-step');
1184
- expect(evidence).toBeUndefined();
1185
- });
1186
- it('should return evidence with correct shape after step execution', async () => {
1187
- const run = await runner.execute(makeConfig(), 'default');
1188
- expect(run.status).toBe('completed');
1189
- const evidence = runner.getStepCompletionEvidence('step-1');
1190
- if (evidence) {
1191
- // Verify the evidence structure matches StepCompletionEvidence
1192
- expect(evidence.stepName).toBe('step-1');
1193
- expect(evidence).toHaveProperty('channelPosts');
1194
- expect(evidence).toHaveProperty('files');
1195
- expect(evidence).toHaveProperty('process');
1196
- expect(evidence).toHaveProperty('toolSideEffects');
1197
- expect(evidence).toHaveProperty('coordinationSignals');
1198
- expect(Array.isArray(evidence.channelPosts)).toBe(true);
1199
- expect(Array.isArray(evidence.files)).toBe(true);
1200
- expect(Array.isArray(evidence.toolSideEffects)).toBe(true);
1201
- expect(Array.isArray(evidence.coordinationSignals)).toBe(true);
1202
- }
1203
- }, 15000);
1204
- it('should collect evidence for supervised steps', async () => {
1205
- mockSpawnOutputs = [
1206
- 'worker completed the implementation\n',
1207
- 'Owner verified work\nSTEP_COMPLETE:step-1\n',
1208
- 'REVIEW_DECISION: APPROVE\nREVIEW_REASON: good\n',
1209
- ];
1210
- const run = await runner.execute(makeSupervisedConfig(), 'default');
1211
- expect(run.status).toBe('completed');
1212
- const evidence = runner.getStepCompletionEvidence('step-1');
1213
- if (evidence) {
1214
- expect(evidence.stepName).toBe('step-1');
1215
- // Supervised steps should have channel posts from worker output forwarding
1216
- expect(evidence.channelPosts.length).toBeGreaterThanOrEqual(0);
1217
- }
1218
- }, 15000);
1219
- it('should capture WORKER_DONE as a coordination signal', async () => {
1220
- mockSpawnOutputs = [
1221
- 'WORKER_DONE: all tasks completed\n',
1222
- 'Owner confirmed\nSTEP_COMPLETE:step-1\n',
1223
- 'REVIEW_DECISION: APPROVE\nREVIEW_REASON: verified\n',
1224
- ];
1225
- const run = await runner.execute(makeSupervisedConfig(), 'default');
1226
- expect(run.status).toBe('completed');
1227
- const evidence = runner.getStepCompletionEvidence('step-1');
1228
- if (evidence) {
1229
- const workerDoneSignals = evidence.coordinationSignals.filter((s) => s.kind === 'worker_done');
1230
- // If the evidence collector detected the WORKER_DONE signal, it should be present
1231
- if (workerDoneSignals.length > 0) {
1232
- expect(workerDoneSignals[0].kind).toBe('worker_done');
1233
- }
1234
- }
1235
- }, 15000);
1236
- it('should return a defensive copy (not a live reference)', async () => {
1237
- const run = await runner.execute(makeConfig(), 'default');
1238
- expect(run.status).toBe('completed');
1239
- const evidence1 = runner.getStepCompletionEvidence('step-1');
1240
- const evidence2 = runner.getStepCompletionEvidence('step-1');
1241
- if (evidence1 && evidence2) {
1242
- expect(evidence1).not.toBe(evidence2); // structuredClone should return a new object
1243
- expect(evidence1).toEqual(evidence2); // but with the same content
1244
- }
1245
- }, 15000);
1246
- });
1247
- // ── completionReason field on step rows ───────────────────────────────
1248
- describe('completionReason on step rows', () => {
1249
- it('should set completionReason on completed steps', async () => {
1250
- const run = await runner.execute(makeConfig(), 'default');
1251
- expect(run.status).toBe('completed');
1252
- const steps = await db.getStepsByRunId(run.id);
1253
- const completedSteps = steps.filter((s) => s.status === 'completed');
1254
- expect(completedSteps.length).toBeGreaterThan(0);
1255
- for (const step of completedSteps) {
1256
- if (step.completionReason) {
1257
- // completionReason should be a valid value
1258
- const validReasons = [
1259
- 'completed_verified',
1260
- 'completed_by_owner_decision',
1261
- 'completed_by_evidence',
1262
- 'completed_by_process_exit',
1263
- 'retry_requested_by_owner',
1264
- 'failed_verification',
1265
- 'failed_owner_decision',
1266
- 'failed_no_evidence',
1267
- ];
1268
- expect(validReasons).toContain(step.completionReason);
1269
- }
1270
- }
1271
- }, 15000);
1272
- });
1273
- describe('process-exit fallback (compliance reduction)', () => {
1274
- it('should complete step via process exit code 0 when no coordination signal is posted', async () => {
1275
- // Agent exits cleanly (code 0) but doesn't post STEP_COMPLETE or OWNER_DECISION.
1276
- // With verification configured (exit_code), the runner should infer completion.
1277
- const config = makeConfig({
1278
- swarm: { pattern: 'dag', completionGracePeriodMs: 5000 },
1279
- agents: [{ name: 'agent-a', cli: 'claude' }],
1280
- workflows: [
1281
- {
1282
- name: 'default',
1283
- steps: [
1284
- {
1285
- name: 'silent-worker',
1286
- agent: 'agent-a',
1287
- task: 'Do some work silently',
1288
- verification: { type: 'exit_code', value: '0' },
1289
- },
1290
- ],
1291
- },
1292
- ],
1293
- });
1294
- // Output has no STEP_COMPLETE, no OWNER_DECISION — just normal work output
1295
- mockSpawnOutputs = ['Implemented the auth module. All tests pass.'];
1296
- const localDb = makeDb();
1297
- runner = new WorkflowRunner({ db: localDb, workspaceId: 'ws-test' });
1298
- const events = [];
1299
- const run = await runner.execute(config, 'default');
1300
- expect(run.status).toBe('completed');
1301
- const steps = await localDb.getStepsByRunId(run.id);
1302
- const step = steps.find((s) => s.stepName === 'silent-worker');
1303
- expect(step?.status).toBe('completed');
1304
- // Should be completed_by_process_exit or completed_verified (exit_code verification)
1305
- expect(step?.completionReason).toBeDefined();
1306
- }, 15000);
1307
- it('should fail when process exits with non-zero code and no signal', async () => {
1308
- // Agent exits with non-zero and no coordination signal — should fail
1309
- const config = makeConfig({
1310
- swarm: { pattern: 'dag', completionGracePeriodMs: 5000 },
1311
- agents: [{ name: 'agent-a', cli: 'claude' }],
1312
- workflows: [
1313
- {
1314
- name: 'default',
1315
- steps: [
1316
- {
1317
- name: 'failing-worker',
1318
- agent: 'agent-a',
1319
- task: 'Try something',
1320
- },
1321
- ],
1322
- },
1323
- ],
1324
- });
1325
- // No STEP_COMPLETE, no OWNER_DECISION, and we'll simulate a non-clean exit
1326
- // by having the output lack any positive signals
1327
- mockSpawnOutputs = ['Error: something went wrong'];
1328
- const localDb = makeDb();
1329
- runner = new WorkflowRunner({ db: localDb, workspaceId: 'ws-test' });
1330
- const run = await runner.execute(config, 'default');
1331
- expect(run.status).toBe('failed');
1332
- }, 15000);
1333
- it('should respect completionGracePeriodMs: 0 to disable fallback', async () => {
1334
- // With grace period disabled, missing signals should always fail
1335
- const config = makeConfig({
1336
- swarm: { pattern: 'dag', completionGracePeriodMs: 0 },
1337
- agents: [{ name: 'agent-a', cli: 'claude' }],
1338
- workflows: [
1339
- {
1340
- name: 'default',
1341
- steps: [
1342
- {
1343
- name: 'strict-worker',
1344
- agent: 'agent-a',
1345
- task: 'Do work with strict compliance required',
1346
- },
1347
- ],
1348
- },
1349
- ],
1350
- });
1351
- // Output has no signals at all
1352
- mockSpawnOutputs = ['Work completed but no signal posted.'];
1353
- const localDb = makeDb();
1354
- runner = new WorkflowRunner({ db: localDb, workspaceId: 'ws-test' });
1355
- const run = await runner.execute(config, 'default');
1356
- expect(run.status).toBe('failed');
1357
- }, 15000);
1358
- it('should complete via evidence when process exits 0 and owner output has positive conclusion', async () => {
1359
- // Agent posts no explicit signal but says "done" + exit code 0 is captured as evidence
1360
- const config = makeConfig({
1361
- swarm: { pattern: 'dag' },
1362
- agents: [{ name: 'agent-a', cli: 'claude' }],
1363
- workflows: [
1364
- {
1365
- name: 'default',
1366
- steps: [
1367
- {
1368
- name: 'wordy-worker',
1369
- agent: 'agent-a',
1370
- task: 'Implement the feature',
1371
- verification: { type: 'exit_code', value: '0' },
1372
- },
1373
- ],
1374
- },
1375
- ],
1376
- });
1377
- // Output contains positive conclusion words but no explicit marker
1378
- mockSpawnOutputs = ['Feature implemented and verified. All artifacts are correct and complete.'];
1379
- const localDb = makeDb();
1380
- runner = new WorkflowRunner({ db: localDb, workspaceId: 'ws-test' });
1381
- const run = await runner.execute(config, 'default');
1382
- expect(run.status).toBe('completed');
1383
- }, 15000);
1384
- });
1385
- describe('template re-quoting regression (parseOwnerDecision)', () => {
1386
- it('should not pick COMPLETE from re-quoted template when agent said INCOMPLETE_RETRY', async () => {
1387
- // Bug repro: agent says INCOMPLETE_RETRY then re-quotes the template format,
1388
- // causing the last-match heuristic to pick COMPLETE from the template line.
1389
- mockSpawnOutputs = [
1390
- 'worker did the task\n',
1391
- [
1392
- 'STEP OWNER CONTRACT:',
1393
- '- Preferred final decision format:',
1394
- ' OWNER_DECISION: COMPLETE|INCOMPLETE_RETRY|INCOMPLETE_FAIL|NEEDS_CLARIFICATION',
1395
- ' REASON: <one sentence>',
1396
- '',
1397
- 'OWNER_DECISION: INCOMPLETE_RETRY',
1398
- 'REASON: Tests are still failing',
1399
- '',
1400
- 'I chose INCOMPLETE_RETRY as per the options OWNER_DECISION: COMPLETE|INCOMPLETE_RETRY|INCOMPLETE_FAIL|NEEDS_CLARIFICATION',
1401
- ].join('\n'),
1402
- ];
1403
- const run = await runner.execute(makeSupervisedConfig({ retries: 0 }), 'default');
1404
- expect(run.status).toBe('failed');
1405
- expect(run.error).toContain('INCOMPLETE_RETRY');
1406
- const steps = await db.getStepsByRunId(run.id);
1407
- expect(steps[0]?.completionReason).toBe('retry_requested_by_owner');
1408
- }, 15000);
1409
- it('should correctly parse COMPLETE when it is the real decision, not just template text', async () => {
1410
- // Ensure the fix doesn't break the happy path — agent says COMPLETE after echoed template
1411
- mockSpawnOutputs = [
1412
- 'worker did the task\n',
1413
- [
1414
- 'STEP OWNER CONTRACT:',
1415
- '- Preferred final decision format:',
1416
- ' OWNER_DECISION: COMPLETE|INCOMPLETE_RETRY|INCOMPLETE_FAIL|NEEDS_CLARIFICATION',
1417
- '',
1418
- 'OWNER_DECISION: COMPLETE',
1419
- 'REASON: Worker finished the task successfully',
1420
- ].join('\n'),
1421
- ];
1422
- const run = await runner.execute(makeSupervisedConfig({ retries: 0 }), 'default');
1423
- expect(run.status).toBe('completed');
1424
- const steps = await db.getStepsByRunId(run.id);
1425
- expect(steps[0]?.completionReason).toBe('completed_by_owner_decision');
1426
- }, 15000);
1427
- });
1428
- describe('fallback guards against explicit retry signals', () => {
1429
- it('should not complete via evidence fallback when output contains INCOMPLETE_RETRY', async () => {
1430
- // Bug repro: parseOwnerDecision returns null (garbled PTY), but raw output
1431
- // contains INCOMPLETE_RETRY. judgeOwnerCompletionByEvidence should refuse
1432
- // to infer completion.
1433
- mockSpawnOutputs = [
1434
- 'worker completed locally\n',
1435
- [
1436
- 'I reviewed the worker output. The task looks done but tests are failing.',
1437
- 'OW NER_DECISION: INCOMPLETE_RETRY', // garbled by PTY line wrap
1438
- 'REASON: tests failing',
1439
- 'The worker completed the implementation but verification failed.',
1440
- 'OWNER_DECISION: INCOMPLETE_RETRY', // clear signal in raw output
1441
- ].join('\n'),
1442
- ];
1443
- const run = await runner.execute(makeSupervisedConfig({ retries: 0 }), 'default');
1444
- expect(run.status).toBe('failed');
1445
- }, 15000);
1446
- it('should not complete via process-exit fallback when output contains INCOMPLETE_RETRY', async () => {
1447
- const config = makeConfig({
1448
- swarm: { pattern: 'dag', completionGracePeriodMs: 5000 },
1449
- agents: [{ name: 'agent-a', cli: 'claude' }],
1450
- workflows: [
1451
- {
1452
- name: 'default',
1453
- steps: [
1454
- {
1455
- name: 'retried-worker',
1456
- agent: 'agent-a',
1457
- task: 'Do work',
1458
- verification: { type: 'exit_code', value: '0' },
1459
- },
1460
- ],
1461
- },
1462
- ],
1463
- });
1464
- // Agent exits code 0 and verification passes, BUT output contains INCOMPLETE_RETRY
1465
- mockSpawnOutputs = [
1466
- 'Implemented the feature.\nOWNER_DECISION: INCOMPLETE_RETRY\nREASON: needs more tests\n',
1467
- ];
1468
- const localDb = makeDb();
1469
- runner = new WorkflowRunner({ db: localDb, workspaceId: 'ws-test' });
1470
- const run = await runner.execute(config, 'default');
1471
- // Should NOT complete — the explicit retry signal should prevent fallback
1472
- expect(run.status).toBe('failed');
1473
- }, 15000);
1474
- });
1475
- });
1476
- //# sourceMappingURL=completion-pipeline.test.js.map