@cleocode/playbooks 2026.4.91 → 2026.4.93

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,462 @@
1
+ /**
2
+ * T934 — Starter playbook E2E integration tests.
3
+ *
4
+ * Each starter `.cantbook` shipped under `packages/playbooks/starter/` is:
5
+ * 1. Loaded from disk (so the shipped file is what we test).
6
+ * 2. Parsed via the real {@link parsePlaybook}.
7
+ * 3. Executed end-to-end against a real in-memory `node:sqlite` DB with the
8
+ * T889 migration applied.
9
+ * 4. Driven by an in-process stub {@link AgentDispatcher} so every node is
10
+ * exercised without touching a real agent runtime.
11
+ *
12
+ * No `@cleocode/*` module is mocked. The only injected surface is the
13
+ * dispatcher, which matches what production code passes to
14
+ * {@link executePlaybook}. This proves the starter playbooks actually reach
15
+ * their documented terminal states via the T930 runtime state machine.
16
+ *
17
+ * @task T934 — Starter Playbooks
18
+ */
19
+ import { readFileSync } from 'node:fs';
20
+ import { createRequire } from 'node:module';
21
+ import { dirname, resolve } from 'node:path';
22
+ import type { DatabaseSync as _DatabaseSyncType } from 'node:sqlite';
23
+ import { fileURLToPath } from 'node:url';
24
+ import type { PlaybookDefinition } from '@cleocode/contracts';
25
+ import { afterEach, beforeEach, describe, expect, it } from 'vitest';
26
+ import { approveGate } from '../approval.js';
27
+ import { parsePlaybook } from '../parser.js';
28
+ import {
29
+ type AgentDispatcher,
30
+ type AgentDispatchInput,
31
+ type AgentDispatchResult,
32
+ executePlaybook,
33
+ resumePlaybook,
34
+ } from '../runtime.js';
35
+ import { getPlaybookRun, listPlaybookApprovals } from '../state.js';
36
+
37
+ const _require = createRequire(import.meta.url); // CommonJS-style require() anchored at this module's URL
38
+ type DatabaseSync = _DatabaseSyncType; // instance type; sourced from the type-only import, so nothing is loaded for it at runtime
39
+ const { DatabaseSync } = _require('node:sqlite') as { // runtime constructor; NOTE(review): presumably loaded through require() rather than a static import to sidestep tooling issues with `node:sqlite` — confirm
40
+ DatabaseSync: new (...args: ConstructorParameters<typeof _DatabaseSyncType>) => DatabaseSync; // accepts the same constructor arguments as the real class
41
+ };
42
+
43
+ const __dirname = dirname(fileURLToPath(import.meta.url));
44
+
45
+ /** Absolute path to the T889 playbook-tables migration SQL. */
46
+ const MIGRATION_SQL_PATH = resolve(
47
+ __dirname,
48
+ '../../../core/migrations/drizzle-tasks/20260417220000_t889-playbook-tables/migration.sql',
49
+ );
50
+
51
+ /** Absolute path to the `starter/` directory shipped with this package. */
52
+ const STARTER_DIR = resolve(__dirname, '../../starter');
53
+
54
+ // -- DB helpers --------------------------------------------------------------
55
+
56
+ /**
57
+ * Apply a multi-statement Drizzle migration file (split on the
58
+ * `--> statement-breakpoint` token emitted by `drizzle-kit generate`).
59
+ * Comment-only statements are skipped so the loop never feeds SQLite an
60
+ * empty block. Each remaining statement gets its own `db.exec` call, in file order; nothing here catches errors, so an `exec` failure propagates to the caller.
61
+ */
62
+ function applyMigration(db: DatabaseSync, sql: string): void {
63
+ const statements = sql
64
+ .split(/--> statement-breakpoint/) // one chunk per breakpoint-delimited statement
65
+ .map((s) => s.trim())
66
+ .filter((s) => s.length > 0); // drop chunks that are empty once trimmed
67
+ for (const stmt of statements) {
68
+ const lines = stmt.split('\n');
69
+ const hasSql = lines.some((l) => l.trim().length > 0 && !l.trim().startsWith('--')); // true when at least one non-blank line is not a `--` comment line
70
+ if (hasSql) db.exec(stmt); // the whole chunk is executed, including any comment lines it carries
71
+ }
72
+ }
73
+
74
+ // -- Stub dispatcher ---------------------------------------------------------
75
+
76
+ /**
77
+ * Recorded call shape used to assert that each node ran the expected number
78
+ * of times and observed the expected accumulated context. One entry is pushed per `dispatch` invocation of the recording dispatcher.
79
+ */
80
+ interface RecordedCall {
81
+ nodeId: string; // copied from the dispatch input's `nodeId`
82
+ agentId: string; // copied from the dispatch input's `agentId`
83
+ iteration: number; // copied from the dispatch input's `iteration`; the happy-path tests expect 1
84
+ contextSnapshot: Record<string, unknown>; // shallow copy of the dispatch input's `context` taken when the call was recorded
85
+ }
86
+
87
+ /**
88
+ * Build a stub {@link AgentDispatcher} that records every call and delegates
89
+ * the success/failure decision to a user-supplied handler. The handler gets
90
+ * the full {@link AgentDispatchInput} so it can react to node id, iteration,
91
+ * or accumulated context. The returned object also exposes the `calls` array, which is appended to in dispatch order.
92
+ */
93
+ function makeRecordingDispatcher(
94
+ handler: (input: AgentDispatchInput) => AgentDispatchResult | Promise<AgentDispatchResult>, // may answer synchronously or with a promise
95
+ ): AgentDispatcher & { calls: RecordedCall[] } {
96
+ const calls: RecordedCall[] = []; // shared with the returned object, so tests read the live log
97
+ return {
98
+ calls,
99
+ async dispatch(input: AgentDispatchInput): Promise<AgentDispatchResult> {
100
+ calls.push({ // recorded before the handler runs, so a throwing handler still leaves a log entry
101
+ nodeId: input.nodeId,
102
+ agentId: input.agentId,
103
+ iteration: input.iteration,
104
+ contextSnapshot: { ...input.context }, // shallow copy: top-level keys are frozen in time, nested values are still shared
105
+ });
106
+ return handler(input); // the async wrapper turns a plain result or a promise into the returned Promise
107
+ },
108
+ };
109
+ }
110
+
111
+ /**
112
+ * Default "always succeed" handler that echoes the node id into the context
113
+ * via `{<nodeId>_done: true}`. Enough for the happy-path assertions. The output also carries `lastNode` and `lastAgent`, set from the input's node id and agent id.
114
+ */
115
+ function alwaysSucceed(input: AgentDispatchInput): AgentDispatchResult {
116
+ return {
117
+ status: 'success',
118
+ output: {
119
+ [`${input.nodeId}_done`]: true, // computed key, e.g. `research_done` for node id `research`
120
+ lastNode: input.nodeId,
121
+ lastAgent: input.agentId,
122
+ },
123
+ };
124
+ }
125
+
126
+ // -- Shared loader -----------------------------------------------------------
127
+
128
+ /**
129
+ * Load and parse a starter `.cantbook` by filename stem. Returns both the
130
+ * validated definition and its SHA-256 source hash so callers can feed
131
+ * executePlaybook without duplicating the fs read. The file is read synchronously on every call; nothing is cached.
132
+ */
133
+ function loadStarter(stem: 'rcasd' | 'ivtr' | 'release'): {
134
+ definition: PlaybookDefinition;
135
+ sourceHash: string;
136
+ } {
137
+ const src = readFileSync(resolve(STARTER_DIR, `${stem}.cantbook`), 'utf8'); // reads `<STARTER_DIR>/<stem>.cantbook` as UTF-8 text
138
+ const { definition, sourceHash } = parsePlaybook(src); // both values come straight from the real parser
139
+ return { definition, sourceHash };
140
+ }
141
+
142
+ // ---------------------------------------------------------------------------
143
+
144
+ describe('T934: starter playbooks — E2E against stubbed dispatcher', () => {
145
+ let db: DatabaseSync;
146
+
147
+ beforeEach(() => {
148
+ db = new DatabaseSync(':memory:');
149
+ db.exec('PRAGMA foreign_keys=ON');
150
+ applyMigration(db, readFileSync(MIGRATION_SQL_PATH, 'utf8'));
151
+ });
152
+ afterEach(() => db.close());
153
+
154
+ // -------------------------------------------------------------------------
155
+ // rcasd — 5 linear agentic stages, all must run in declaration order.
156
+ // -------------------------------------------------------------------------
157
+ describe('rcasd.cantbook', () => {
158
+ it('parses cleanly and declares 5 agentic RCASD stages', () => {
159
+ const { definition } = loadStarter('rcasd');
160
+ expect(definition.name).toBe('rcasd');
161
+ expect(definition.nodes).toHaveLength(5);
162
+ expect(definition.edges).toHaveLength(4);
163
+ expect(definition.nodes.map((n) => n.id)).toEqual([
164
+ 'research',
165
+ 'consensus',
166
+ 'architecture',
167
+ 'specification',
168
+ 'decomposition',
169
+ ]);
170
+ for (const n of definition.nodes) {
171
+ expect(n.type).toBe('agentic');
172
+ }
173
+ });
174
+
175
+ it('executes all 5 stages in order and reaches `completed` terminal state', async () => {
176
+ const { definition, sourceHash } = loadStarter('rcasd');
177
+ const dispatcher = makeRecordingDispatcher(alwaysSucceed);
178
+
179
+ const result = await executePlaybook({
180
+ db,
181
+ playbook: definition,
182
+ playbookHash: sourceHash,
183
+ initialContext: { epicId: 'T999', scope: 'global', taskId: 'T999' },
184
+ dispatcher,
185
+ });
186
+
187
+ expect(result.terminalStatus).toBe('completed');
188
+ // Every RCASD stage must have fired exactly once, in declaration order.
189
+ expect(dispatcher.calls.map((c) => c.nodeId)).toEqual([
190
+ 'research',
191
+ 'consensus',
192
+ 'architecture',
193
+ 'specification',
194
+ 'decomposition',
195
+ ]);
196
+ // Every stage observed iteration=1 (no retries on happy path).
197
+ expect(dispatcher.calls.every((c) => c.iteration === 1)).toBe(true);
198
+ // Context carries the initial inputs plus per-stage success markers.
199
+ expect(result.finalContext).toMatchObject({
200
+ epicId: 'T999',
201
+ scope: 'global',
202
+ research_done: true,
203
+ consensus_done: true,
204
+ architecture_done: true,
205
+ specification_done: true,
206
+ decomposition_done: true,
207
+ lastNode: 'decomposition',
208
+ });
209
+ // Later stages see outputs of prior stages in their context snapshot.
210
+ const decompositionCall = dispatcher.calls.find((c) => c.nodeId === 'decomposition');
211
+ expect(decompositionCall?.contextSnapshot).toMatchObject({
212
+ research_done: true,
213
+ consensus_done: true,
214
+ architecture_done: true,
215
+ specification_done: true,
216
+ });
217
+
218
+ const run = getPlaybookRun(db, result.runId);
219
+ expect(run?.status).toBe('completed');
220
+ expect(run?.currentNode).toBeNull();
221
+ expect(run?.completedAt).toBeTruthy();
222
+ });
223
+ });
224
+
225
+ // -------------------------------------------------------------------------
226
+ // ivtr — implement → validate → test, with inject_into wiring for retries.
227
+ // -------------------------------------------------------------------------
228
+ describe('ivtr.cantbook', () => {
229
+ it('parses cleanly and declares implement/validate/test with iteration caps', () => {
230
+ const { definition } = loadStarter('ivtr');
231
+ expect(definition.name).toBe('ivtr');
232
+ expect(definition.nodes).toHaveLength(3);
233
+ expect(definition.nodes.map((n) => n.id)).toEqual(['implement', 'validate', 'test']);
234
+
235
+ const implementNode = definition.nodes.find((n) => n.id === 'implement');
236
+ const validateNode = definition.nodes.find((n) => n.id === 'validate');
237
+ const testNode = definition.nodes.find((n) => n.id === 'test');
238
+
239
+ // Iteration caps are populated (runtime needs them for loop bounds).
240
+ expect(implementNode?.on_failure?.max_iterations).toBe(3);
241
+ expect(validateNode?.on_failure?.max_iterations).toBe(2);
242
+ expect(testNode?.on_failure?.max_iterations).toBe(2);
243
+
244
+ // validate + test both bounce back to implement on sustained failure.
245
+ expect(validateNode?.on_failure?.inject_into).toBe('implement');
246
+ expect(testNode?.on_failure?.inject_into).toBe('implement');
247
+ });
248
+
249
+ it('happy path: implement → validate → test completes in one pass', async () => {
250
+ const { definition, sourceHash } = loadStarter('ivtr');
251
+ const dispatcher = makeRecordingDispatcher(alwaysSucceed);
252
+
253
+ const result = await executePlaybook({
254
+ db,
255
+ playbook: definition,
256
+ playbookHash: sourceHash,
257
+ initialContext: { taskId: 'T934', maxAttempts: 3 },
258
+ dispatcher,
259
+ });
260
+
261
+ expect(result.terminalStatus).toBe('completed');
262
+ expect(dispatcher.calls.map((c) => c.nodeId)).toEqual(['implement', 'validate', 'test']);
263
+ expect(result.finalContext).toMatchObject({
264
+ taskId: 'T934',
265
+ implement_done: true,
266
+ validate_done: true,
267
+ test_done: true,
268
+ });
269
+ });
270
+
271
+ it('loop behavior: validate failure bounces back to implement via inject_into', async () => {
272
+ const { definition, sourceHash } = loadStarter('ivtr');
273
+
274
+ // Validate fails on its first two attempts (exhausting its cap of 2), then
275
+ // succeeds once implement has re-run with the enriched context. Test always succeeds.
276
+ let validateCalls = 0;
277
+ const dispatcher = makeRecordingDispatcher((input) => {
278
+ if (input.nodeId === 'validate') {
279
+ validateCalls += 1;
280
+ // First two attempts fail → exhausts validate's max_iterations=2.
281
+ // That triggers inject_into: 'implement'. Implement then reruns,
282
+ // and on the next validate attempt we let it succeed.
283
+ if (validateCalls <= 2) {
284
+ return { status: 'failure', output: {}, error: `validate miss #${validateCalls}` };
285
+ }
286
+ return { status: 'success', output: { validate_done: true, passed: true } };
287
+ }
288
+ return alwaysSucceed(input);
289
+ });
290
+
291
+ const result = await executePlaybook({
292
+ db,
293
+ playbook: definition,
294
+ playbookHash: sourceHash,
295
+ initialContext: { taskId: 'T934-LOOP' },
296
+ dispatcher,
297
+ });
298
+
299
+ expect(result.terminalStatus).toBe('completed');
300
+ // implement ran at least twice (original + re-injected), validate three
301
+ // times (two misses + one pass), test once.
302
+ const byNode = dispatcher.calls.reduce<Record<string, number>>((acc, c) => {
303
+ acc[c.nodeId] = (acc[c.nodeId] ?? 0) + 1;
304
+ return acc;
305
+ }, {});
306
+ expect(byNode['implement']).toBeGreaterThanOrEqual(2);
307
+ expect(byNode['validate']).toBe(3);
308
+ expect(byNode['test']).toBe(1);
309
+ // inject_into enriches context with the last error/fail-node markers.
310
+ expect(result.finalContext).toMatchObject({
311
+ __lastError: 'validate miss #2',
312
+ __lastFailedNode: 'validate',
313
+ test_done: true,
314
+ });
315
+ });
316
+
317
+ it('iteration cap: sustained failure terminates with exceeded_iteration_cap', async () => {
318
+ const { definition, sourceHash } = loadStarter('ivtr');
319
+
320
+ // Force every implement attempt to fail. Implement has cap=3 and no
321
+ // inject_into, so it should retry in-place and then trip the cap.
322
+ const dispatcher = makeRecordingDispatcher((input) => {
323
+ if (input.nodeId === 'implement') {
324
+ return { status: 'failure', output: {}, error: 'impl bust' };
325
+ }
326
+ return alwaysSucceed(input);
327
+ });
328
+
329
+ const result = await executePlaybook({
330
+ db,
331
+ playbook: definition,
332
+ playbookHash: sourceHash,
333
+ initialContext: { taskId: 'T934-FAIL' },
334
+ dispatcher,
335
+ });
336
+
337
+ expect(result.terminalStatus).toBe('exceeded_iteration_cap');
338
+ expect(result.exceededNodeId).toBe('implement');
339
+ expect(result.errorContext).toBe('impl bust');
340
+ // implement fired exactly 3 times (its max_iterations), then terminated.
341
+ const implementCalls = dispatcher.calls.filter((c) => c.nodeId === 'implement');
342
+ expect(implementCalls).toHaveLength(3);
343
+ });
344
+ });
345
+
346
+ // -------------------------------------------------------------------------
347
+ // release — version_bump → changelog → APPROVAL → publish.
348
+ // -------------------------------------------------------------------------
349
+ describe('release.cantbook', () => {
350
+ it('parses cleanly with one approval node between changelog and publish', () => {
351
+ const { definition } = loadStarter('release');
352
+ expect(definition.name).toBe('release');
353
+ expect(definition.nodes).toHaveLength(4);
354
+ expect(definition.nodes.map((n) => n.id)).toEqual([
355
+ 'version_bump',
356
+ 'changelog',
357
+ 'approval',
358
+ 'publish',
359
+ ]);
360
+ const approvalNode = definition.nodes.find((n) => n.id === 'approval');
361
+ expect(approvalNode?.type).toBe('approval');
362
+ if (approvalNode?.type === 'approval') {
363
+ expect(approvalNode.policy).toBe('conservative');
364
+ expect(approvalNode.prompt).toMatch(/Approve release/);
365
+ }
366
+ });
367
+
368
+ it('pauses at approval gate with a signed HMAC resume token', async () => {
369
+ const { definition, sourceHash } = loadStarter('release');
370
+ const dispatcher = makeRecordingDispatcher(alwaysSucceed);
371
+
372
+ const result = await executePlaybook({
373
+ db,
374
+ playbook: definition,
375
+ playbookHash: sourceHash,
376
+ initialContext: { targetVersion: '2026.4.92', channel: 'latest', taskId: 'T934' },
377
+ dispatcher,
378
+ approvalSecret: 't934-test-secret',
379
+ });
380
+
381
+ expect(result.terminalStatus).toBe('pending_approval');
382
+ expect(result.approvalToken).toBeDefined();
383
+ // approval.ts truncates the HMAC to 32 hex chars.
384
+ expect(result.approvalToken).toMatch(/^[0-9a-f]{32}$/);
385
+ // version_bump + changelog ran; publish did not.
386
+ expect(dispatcher.calls.map((c) => c.nodeId)).toEqual(['version_bump', 'changelog']);
387
+
388
+ const approvals = listPlaybookApprovals(db, result.runId);
389
+ expect(approvals).toHaveLength(1);
390
+ expect(approvals[0]?.status).toBe('pending');
391
+ expect(approvals[0]?.nodeId).toBe('approval');
392
+ expect(approvals[0]?.token).toBe(result.approvalToken);
393
+
394
+ const run = getPlaybookRun(db, result.runId);
395
+ expect(run?.status).toBe('paused');
396
+ expect(run?.currentNode).toBe('approval');
397
+ });
398
+
399
+ it('resume after approval walks through publish to `completed`', async () => {
400
+ const { definition, sourceHash } = loadStarter('release');
401
+ const dispatcher = makeRecordingDispatcher(alwaysSucceed);
402
+
403
+ const first = await executePlaybook({
404
+ db,
405
+ playbook: definition,
406
+ playbookHash: sourceHash,
407
+ initialContext: { targetVersion: '2026.4.92', channel: 'latest', taskId: 'T934' },
408
+ dispatcher,
409
+ approvalSecret: 't934-test-secret',
410
+ });
411
+ expect(first.terminalStatus).toBe('pending_approval');
412
+ if (first.approvalToken === undefined) {
413
+ throw new Error('expected approval token from first execution');
414
+ }
415
+
416
+ // HITL approves.
417
+ approveGate(db, first.approvalToken, 'keaton@cleo', 'ship it');
418
+
419
+ const second = await resumePlaybook({
420
+ db,
421
+ playbook: definition,
422
+ approvalToken: first.approvalToken,
423
+ dispatcher,
424
+ approvalSecret: 't934-test-secret',
425
+ });
426
+
427
+ expect(second.terminalStatus).toBe('completed');
428
+ // publish must have fired exactly once after the gate released.
429
+ const publishCalls = dispatcher.calls.filter((c) => c.nodeId === 'publish');
430
+ expect(publishCalls).toHaveLength(1);
431
+ // The resumed run should record an approval trace on the context.
432
+ expect(second.finalContext['__lastApproval']).toMatchObject({
433
+ nodeId: 'approval',
434
+ approver: 'keaton@cleo',
435
+ reason: 'ship it',
436
+ });
437
+
438
+ const run = getPlaybookRun(db, first.runId);
439
+ expect(run?.status).toBe('completed');
440
+ expect(run?.currentNode).toBeNull();
441
+ });
442
+ });
443
+
444
+ // -------------------------------------------------------------------------
445
+ // Cross-playbook invariants
446
+ // -------------------------------------------------------------------------
447
+ describe('cross-playbook invariants', () => {
448
+ it('each starter has a unique source hash (independent definitions)', () => {
449
+ const hashes = new Set(
450
+ (['rcasd', 'ivtr', 'release'] as const).map((n) => loadStarter(n).sourceHash),
451
+ );
452
+ expect(hashes.size).toBe(3);
453
+ });
454
+
455
+ it('each starter uses schema version "1.0"', () => {
456
+ for (const stem of ['rcasd', 'ivtr', 'release'] as const) {
457
+ const { definition } = loadStarter(stem);
458
+ expect(definition.version).toBe('1.0');
459
+ }
460
+ });
461
+ });
462
+ });
package/src/index.ts CHANGED
@@ -49,6 +49,23 @@ export {
49
49
  evaluatePolicy,
50
50
  type PolicyRule,
51
51
  } from './policy.js';
52
+ // W4-10 / T930: playbook runtime state machine + HITL resume
53
+ export {
54
+ type AgentDispatcher,
55
+ type AgentDispatchInput,
56
+ type AgentDispatchResult,
57
+ type DeterministicRunInput,
58
+ type DeterministicRunner,
59
+ type DeterministicRunResult,
60
+ E_PLAYBOOK_RESUME_BLOCKED,
61
+ E_PLAYBOOK_RUNTIME_INVALID,
62
+ type ExecutePlaybookOptions,
63
+ type ExecutePlaybookResult,
64
+ executePlaybook,
65
+ type PlaybookTerminalStatus,
66
+ type ResumePlaybookOptions,
67
+ resumePlaybook,
68
+ } from './runtime.js';
52
69
  // W4-8: state layer CRUD for playbook_runs + playbook_approvals
53
70
  export {
54
71
  type CreatePlaybookApprovalInput,