@quinteroac/agents-coding-toolkit 0.1.0-preview

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. package/AGENTS.md +7 -0
  2. package/README.md +127 -0
  3. package/package.json +34 -0
  4. package/scaffold/.agents/flow/archived/tmpl_.gitkeep +0 -0
  5. package/scaffold/.agents/flow/tmpl_README.md +7 -0
  6. package/scaffold/.agents/flow/tmpl_iteration_close_checklist.example.md +11 -0
  7. package/scaffold/.agents/skills/automated-fix/tmpl_SKILL.md +67 -0
  8. package/scaffold/.agents/skills/create-issue/tmpl_SKILL.md +68 -0
  9. package/scaffold/.agents/skills/create-pr-document/tmpl_SKILL.md +125 -0
  10. package/scaffold/.agents/skills/create-project-context/tmpl_SKILL.md +168 -0
  11. package/scaffold/.agents/skills/create-test-plan/tmpl_SKILL.md +86 -0
  12. package/scaffold/.agents/skills/debug/tmpl_SKILL.md +19 -0
  13. package/scaffold/.agents/skills/evaluate/tmpl_SKILL.md +19 -0
  14. package/scaffold/.agents/skills/execute-test-batch/tmpl_SKILL.md +49 -0
  15. package/scaffold/.agents/skills/execute-test-case/tmpl_SKILL.md +47 -0
  16. package/scaffold/.agents/skills/implement-user-story/tmpl_SKILL.md +68 -0
  17. package/scaffold/.agents/skills/plan-refactor/tmpl_SKILL.md +19 -0
  18. package/scaffold/.agents/skills/refactor-prd/tmpl_SKILL.md +19 -0
  19. package/scaffold/.agents/skills/refine-pr-document/tmpl_SKILL.md +108 -0
  20. package/scaffold/.agents/skills/refine-project-context/tmpl_SKILL.md +157 -0
  21. package/scaffold/.agents/skills/refine-test-plan/tmpl_SKILL.md +76 -0
  22. package/scaffold/.agents/tmpl_PROJECT_CONTEXT.md +3 -0
  23. package/scaffold/.agents/tmpl_state.example.json +26 -0
  24. package/scaffold/.agents/tmpl_state_rules.md +29 -0
  25. package/scaffold/docs/nvst-flow/templates/tmpl_CHANGELOG.md +18 -0
  26. package/scaffold/docs/nvst-flow/templates/tmpl_TECHNICAL_DEBT.md +11 -0
  27. package/scaffold/docs/nvst-flow/templates/tmpl_it_000001_evaluation-report.md +19 -0
  28. package/scaffold/docs/nvst-flow/templates/tmpl_it_000001_product-requirement-document.md +19 -0
  29. package/scaffold/docs/nvst-flow/templates/tmpl_it_000001_refactor_plan.md +19 -0
  30. package/scaffold/docs/nvst-flow/templates/tmpl_it_000001_test-plan.md +19 -0
  31. package/scaffold/docs/nvst-flow/tmpl_COMMANDS.md +0 -0
  32. package/scaffold/docs/nvst-flow/tmpl_QUICK_USE.md +0 -0
  33. package/scaffold/docs/tmpl_PLACEHOLDER.md +0 -0
  34. package/scaffold/schemas/node-shims.d.ts +15 -0
  35. package/scaffold/schemas/tmpl_issues.ts +19 -0
  36. package/scaffold/schemas/tmpl_prd.ts +26 -0
  37. package/scaffold/schemas/tmpl_progress.ts +39 -0
  38. package/scaffold/schemas/tmpl_state.ts +81 -0
  39. package/scaffold/schemas/tmpl_test-plan.ts +20 -0
  40. package/scaffold/schemas/tmpl_validate-progress.ts +13 -0
  41. package/scaffold/schemas/tmpl_validate-state.ts +13 -0
  42. package/scaffold/tmpl_AGENTS.md +7 -0
  43. package/schemas/prd.ts +26 -0
  44. package/schemas/progress.ts +39 -0
  45. package/schemas/state.ts +81 -0
  46. package/schemas/test-plan.test.ts +53 -0
  47. package/schemas/test-plan.ts +20 -0
  48. package/schemas/validate-progress.ts +13 -0
  49. package/schemas/validate-state.ts +13 -0
  50. package/src/agent.test.ts +37 -0
  51. package/src/agent.ts +225 -0
  52. package/src/cli-path.ts +4 -0
  53. package/src/cli.ts +578 -0
  54. package/src/commands/approve-project-context.ts +37 -0
  55. package/src/commands/approve-requirement.ts +217 -0
  56. package/src/commands/approve-test-plan.test.ts +193 -0
  57. package/src/commands/approve-test-plan.ts +202 -0
  58. package/src/commands/create-issue.test.ts +484 -0
  59. package/src/commands/create-issue.ts +371 -0
  60. package/src/commands/create-project-context.ts +96 -0
  61. package/src/commands/create-prototype.test.ts +153 -0
  62. package/src/commands/create-prototype.ts +425 -0
  63. package/src/commands/create-test-plan.test.ts +381 -0
  64. package/src/commands/create-test-plan.ts +248 -0
  65. package/src/commands/define-requirement.ts +47 -0
  66. package/src/commands/destroy.ts +113 -0
  67. package/src/commands/execute-automated-fix.test.ts +580 -0
  68. package/src/commands/execute-automated-fix.ts +363 -0
  69. package/src/commands/execute-manual-fix.test.ts +343 -0
  70. package/src/commands/execute-manual-fix.ts +203 -0
  71. package/src/commands/execute-test-plan.test.ts +1891 -0
  72. package/src/commands/execute-test-plan.ts +722 -0
  73. package/src/commands/init.ts +85 -0
  74. package/src/commands/refine-project-context.ts +74 -0
  75. package/src/commands/refine-requirement.ts +60 -0
  76. package/src/commands/refine-test-plan.test.ts +200 -0
  77. package/src/commands/refine-test-plan.ts +93 -0
  78. package/src/commands/start-iteration.test.ts +144 -0
  79. package/src/commands/start-iteration.ts +101 -0
  80. package/src/commands/write-json.ts +136 -0
  81. package/src/install.test.ts +124 -0
  82. package/src/pack.test.ts +103 -0
  83. package/src/state.test.ts +66 -0
  84. package/src/state.ts +52 -0
  85. package/tsconfig.json +15 -0
@@ -0,0 +1,1891 @@
1
+ import { afterEach, describe, expect, test } from "bun:test";
2
+ import { mkdtemp, mkdir, readFile, readdir, rm, writeFile } from "node:fs/promises";
3
+ import { tmpdir } from "node:os";
4
+ import { join } from "node:path";
5
+
6
+ import { parseProvider, type AgentResult } from "../agent";
7
+ import { readState, writeState } from "../state";
8
+ import { runExecuteTestPlan, type ManualTestUserInput } from "./execute-test-plan";
9
+
10
/**
 * Creates a fresh, uniquely named temporary directory that a test uses as its project root.
 *
 * @returns The path of the new directory, created under the OS temp dir with the
 *          `nvst-execute-test-plan-` prefix.
 */
async function createProjectRoot(): Promise<string> {
  const prefix = join(tmpdir(), "nvst-execute-test-plan-");
  const projectRoot = await mkdtemp(prefix);
  return projectRoot;
}
13
+
14
/**
 * Runs `fn` with the process working directory temporarily switched to `cwd`.
 *
 * The previous working directory is restored once `fn` settles, whether it
 * resolves or throws.
 *
 * @param cwd Directory to switch into for the duration of `fn`.
 * @param fn  Async callback executed while `cwd` is the working directory.
 * @returns Whatever `fn` resolves to.
 */
async function withCwd<T>(cwd: string, fn: () => Promise<T>): Promise<T> {
  const originalDir = process.cwd();
  process.chdir(cwd);

  let outcome: T;
  try {
    outcome = await fn();
  } finally {
    // Always undo the chdir so later tests start from the original directory.
    process.chdir(originalDir);
  }
  return outcome;
}
23
+
24
/**
 * Seeds a workflow state for iteration 000005 in the prototype phase.
 *
 * Ensures `<projectRoot>/.agents/flow` exists, then hands a fixed state object to
 * `writeState`; only the `tp_generation` entry varies between callers.
 *
 * @param projectRoot Root directory of the temporary project.
 * @param tpStatus    Status to record for `prototype.tp_generation`.
 * @param tpFile      File name to record for `prototype.tp_generation`, or `null`.
 */
async function seedState(
  projectRoot: string,
  tpStatus: "pending" | "created",
  tpFile: string | null,
): Promise<void> {
  const flowDir = join(projectRoot, ".agents", "flow");
  await mkdir(flowDir, { recursive: true });

  // Typed against writeState's own parameter so the literal is checked exactly
  // as it would be when written inline in the call.
  const seededState: Parameters<typeof writeState>[1] = {
    current_iteration: "000005",
    current_phase: "prototype",
    phases: {
      define: {
        requirement_definition: {
          status: "approved",
          file: "it_000005_product-requirement-document.md",
        },
        prd_generation: { status: "completed", file: "it_000005_PRD.json" },
      },
      prototype: {
        project_context: { status: "created", file: ".agents/PROJECT_CONTEXT.md" },
        test_plan: { status: "created", file: "it_000005_test-plan.md" },
        // The only caller-controlled part of the seeded state.
        tp_generation: { status: tpStatus, file: tpFile },
        prototype_build: { status: "pending", file: null },
        test_execution: { status: "pending", file: null },
        prototype_approved: false,
      },
      refactor: {
        evaluation_report: { status: "pending", file: null },
        refactor_plan: { status: "pending", file: null },
        refactor_execution: { status: "pending", file: null },
        changelog: { status: "pending", file: null },
      },
    },
    last_updated: "2026-02-21T00:00:00.000Z",
    updated_by: "seed",
    history: [],
  };

  await writeState(projectRoot, seededState);
}
59
+
60
/**
 * Writes `.agents/PROJECT_CONTEXT.md` under the given project root.
 *
 * The `.agents` directory must already exist; this helper does not create it.
 *
 * @param projectRoot Root directory of the temporary project.
 * @param content     Markdown to write; defaults to a minimal context that mentions bun:test.
 */
async function writeProjectContext(
  projectRoot: string,
  content = "# Project Context\n- use bun:test\n",
): Promise<void> {
  const contextPath = join(projectRoot, ".agents", "PROJECT_CONTEXT.md");
  await writeFile(contextPath, content, "utf8");
}
63
+
64
/**
 * Writes a fixed test-plan JSON fixture to `<projectRoot>/.agents/flow/<fileName>`.
 *
 * The fixture holds two pending automated cases (TC-US001-01, TC-US001-02) and one
 * pending exploratory manual case (TC-US001-03). It is serialized with two-space
 * indentation and a trailing newline.
 *
 * @param projectRoot Root directory of the temporary project.
 * @param fileName    Name of the JSON file to create inside `.agents/flow`.
 */
async function writeApprovedTpJson(projectRoot: string, fileName: string): Promise<void> {
  // Every fixture case is pending and correlated with US-001 plus one functional requirement.
  const pendingCase = (id: string, description: string, requirement: string) => ({
    id,
    description,
    status: "pending",
    correlatedRequirements: ["US-001", requirement],
  });

  const testPlan = {
    overallStatus: "pending",
    scope: ["Scope A"],
    environmentData: ["Env A"],
    automatedTests: [
      pendingCase("TC-US001-01", "Automated case one", "FR-1"),
      pendingCase("TC-US001-02", "Automated case two", "FR-2"),
    ],
    exploratoryManualTests: [pendingCase("TC-US001-03", "Manual case", "FR-3")],
  };

  const serialized = `${JSON.stringify(testPlan, null, 2)}\n`;
  const destination = join(projectRoot, ".agents", "flow", fileName);
  await writeFile(destination, serialized, "utf8");
}
102
+
103
// Temporary project roots created by the current test; emptied and removed after each test.
const createdRoots: string[] = [];

afterEach(async () => {
  // splice(0) drains the registry so the next test starts with an empty list.
  const rootsToRemove = createdRoots.splice(0);
  const removals = rootsToRemove.map((root) => rm(root, { recursive: true, force: true }));
  await Promise.all(removals);
});
108
+
109
+ describe("execute test-plan command", () => {
110
// Guards the CLI wiring by checking that cli.ts contains the import, the
// `execute` / `test-plan` dispatch branches, the --agent parsing, the call into
// runExecuteTestPlan and the usage line.
test("registers execute test-plan command in CLI dispatch with --agent provider", async () => {
  // Resolve cli.ts relative to this test module (src/commands/ -> src/cli.ts) instead of
  // process.cwd(), so the check does not depend on the directory the runner was launched
  // from or on the chdir calls made by other tests in this file.
  const cliSourceUrl = new URL("../cli.ts", import.meta.url);
  const source = await readFile(cliSourceUrl, "utf8");

  expect(source).toContain('import { runExecuteTestPlan } from "./commands/execute-test-plan";');
  expect(source).toContain('if (command === "execute") {');
  expect(source).toContain('if (subcommand === "test-plan") {');
  expect(source).toContain("const { provider, remainingArgs: postAgentArgs } = parseAgentArg(args.slice(1));");
  expect(source).toContain("await runExecuteTestPlan({ provider });");
  expect(source).toContain("execute test-plan --agent <provider>");
});
120
+
121
+ test("fails when tp_generation.status is not created", async () => {
122
+ const projectRoot = await createProjectRoot();
123
+ createdRoots.push(projectRoot);
124
+
125
+ await seedState(projectRoot, "pending", "it_000005_TP.json");
126
+ await writeProjectContext(projectRoot);
127
+
128
+ await withCwd(projectRoot, async () => {
129
+ await expect(runExecuteTestPlan({ provider: "codex" })).rejects.toThrow(
130
+ "Cannot execute test plan: prototype.tp_generation.status must be created. Current status: 'pending'. Run `bun nvst approve test-plan` first.",
131
+ );
132
+ });
133
+ });
134
+
135
+ // AC01: automated tests with status != passed are collected and sent to a single agent invocation
136
+ // AC02: agent prompt includes full list of pending automated test cases as JSON array
137
+ // AC03: agent returns JSON array of results with {testCaseId, status, evidence, notes}
138
+ // AC04: each result recorded in progress file and as separate artifact
139
+ test("batches all pending automated tests into a single agent invocation with JSON array prompt and results", async () => {
140
+ const projectRoot = await createProjectRoot();
141
+ createdRoots.push(projectRoot);
142
+
143
+ const tpFileName = "it_000005_TP.json";
144
+ await seedState(projectRoot, "created", tpFileName);
145
+ await writeProjectContext(projectRoot, "# Project Context\nUse bun test and tsc checks.\n");
146
+ await writeApprovedTpJson(projectRoot, tpFileName);
147
+
148
+ let batchInvocationCount = 0;
149
+ let manualPromptCount = 0;
150
+ let capturedBatchPrompt = "";
151
+
152
+ const capturedLogs: string[] = [];
153
+ const originalConsoleLog = console.log;
154
+ console.log = (...args: unknown[]) => {
155
+ capturedLogs.push(args.map((arg) => String(arg)).join(" "));
156
+ };
157
+
158
+ try {
159
+ await withCwd(projectRoot, async () => {
160
+ await runExecuteTestPlan(
161
+ { provider: "gemini" },
162
+ {
163
+ loadSkillFn: async (_projectRoot, skillName) => {
164
+ if (skillName === "execute-test-batch") return "Run batch test cases and output strict JSON array.";
165
+ return "Run this test case and output strict JSON.";
166
+ },
167
+ invokeAgentFn: async (options): Promise<AgentResult> => {
168
+ expect(options.interactive).toBe(false);
169
+
170
+ // AC02: batch prompt includes test_cases context with JSON array
171
+ batchInvocationCount += 1;
172
+ capturedBatchPrompt = options.prompt;
173
+
174
+ // Verify prompt contains both automated test case IDs
175
+ expect(options.prompt).toContain("TC-US001-01");
176
+ expect(options.prompt).toContain("TC-US001-02");
177
+ // Should NOT contain manual test in batch prompt
178
+ expect(options.prompt).not.toContain("TC-US001-03");
179
+ expect(options.prompt).toContain("### project_context");
180
+ expect(options.prompt).toContain("Use bun test and tsc checks.");
181
+
182
+ // AC03: return JSON array of results
183
+ return {
184
+ exitCode: 0,
185
+ stdout: JSON.stringify([
186
+ {
187
+ testCaseId: "TC-US001-01",
188
+ status: "passed",
189
+ evidence: "Batch evidence for case one",
190
+ notes: "Batch executed successfully",
191
+ },
192
+ {
193
+ testCaseId: "TC-US001-02",
194
+ status: "passed",
195
+ evidence: "Batch evidence for case two",
196
+ notes: "Batch executed successfully",
197
+ },
198
+ ]),
199
+ stderr: "",
200
+ };
201
+ },
202
+ promptManualTestFn: async () => {
203
+ manualPromptCount += 1;
204
+ return { status: "passed", evidence: "Manual evidence", notes: "Manual executed successfully" };
205
+ },
206
+ },
207
+ );
208
+ });
209
+ } finally {
210
+ console.log = originalConsoleLog;
211
+ }
212
+
213
+ // AC01: single invocation for automated tests
214
+ expect(batchInvocationCount).toBe(1);
215
+ // Manual tests prompted individually to user
216
+ expect(manualPromptCount).toBe(1);
217
+
218
+ // AC02: batch prompt includes JSON array of test cases
219
+ expect(capturedBatchPrompt).toContain("### test_cases");
220
+ const testCasesMatch = capturedBatchPrompt.split("### test_cases")[1];
221
+ expect(testCasesMatch).toBeDefined();
222
+ // The test_cases context should contain a valid JSON array
223
+ const testCasesJson = testCasesMatch!.split("###")[0].trim();
224
+ const parsedTestCases = JSON.parse(testCasesJson) as Array<{ id: string }>;
225
+ expect(parsedTestCases).toHaveLength(2);
226
+ expect(parsedTestCases[0]?.id).toBe("TC-US001-01");
227
+ expect(parsedTestCases[1]?.id).toBe("TC-US001-02");
228
+
229
+ expect(capturedLogs.at(-1)).toContain("3/3 tests passed, 0 failed");
230
+
231
+ // AC04: each result recorded as separate artifact
232
+ const reportRaw = await readFile(
233
+ join(projectRoot, ".agents", "flow", "it_000005_test-execution-results.json"),
234
+ "utf8",
235
+ );
236
+ const report = JSON.parse(reportRaw) as {
237
+ executedTestIds: string[];
238
+ results: Array<{
239
+ testCaseId: string;
240
+ description: string;
241
+ correlatedRequirements: string[];
242
+ payload: { status: string; evidence: string; notes: string };
243
+ artifactReferences: string[];
244
+ }>;
245
+ };
246
+
247
+ expect(report.executedTestIds).toEqual(["TC-US001-01", "TC-US001-02", "TC-US001-03"]);
248
+ expect(report.results).toHaveLength(3);
249
+ expect(report.results[0]?.payload).toEqual({
250
+ status: "passed",
251
+ evidence: "Batch evidence for case one",
252
+ notes: "Batch executed successfully",
253
+ });
254
+ expect(report.results[0]?.description).toBe("Automated case one");
255
+ expect(report.results[0]?.correlatedRequirements).toEqual(["US-001", "FR-1"]);
256
+ expect(report.results[0]?.artifactReferences).toHaveLength(1);
257
+
258
+ // AC04: separate artifact per test case
259
+ const artifactsDirPath = join(projectRoot, ".agents", "flow", "it_000005_test-execution-artifacts");
260
+ const artifactFileNames = await readdir(artifactsDirPath);
261
+ expect(artifactFileNames.length).toBe(3);
262
+ for (const result of report.results) {
263
+ expect(result.artifactReferences.length).toBeGreaterThan(0);
264
+ for (const artifactReference of result.artifactReferences) {
265
+ const artifactRaw = await readFile(join(projectRoot, artifactReference), "utf8");
266
+ const artifact = JSON.parse(artifactRaw) as {
267
+ testCaseId: string;
268
+ attemptNumber: number;
269
+ prompt: string;
270
+ agentExitCode: number;
271
+ };
272
+ expect(artifact.testCaseId).toBe(result.testCaseId);
273
+ expect(artifact.attemptNumber).toBe(1);
274
+ expect(artifact.agentExitCode).toBe(0);
275
+ }
276
+ }
277
+
278
+ // AC04: progress file records each individual test
279
+ const progressRaw = await readFile(
280
+ join(projectRoot, ".agents", "flow", "it_000005_test-execution-progress.json"),
281
+ "utf8",
282
+ );
283
+ const progress = JSON.parse(progressRaw) as {
284
+ entries: Array<{
285
+ id: string;
286
+ type: "automated" | "exploratory_manual";
287
+ status: "pending" | "in_progress" | "passed" | "failed";
288
+ attempt_count: number;
289
+ last_agent_exit_code: number | null;
290
+ last_error_summary: string;
291
+ }>;
292
+ };
293
+
294
+ expect(progress.entries).toHaveLength(3);
295
+ expect(progress.entries[0]).toMatchObject({
296
+ id: "TC-US001-01",
297
+ type: "automated",
298
+ status: "passed",
299
+ attempt_count: 1,
300
+ last_agent_exit_code: 0,
301
+ last_error_summary: "",
302
+ });
303
+ expect(progress.entries[1]).toMatchObject({
304
+ id: "TC-US001-02",
305
+ type: "automated",
306
+ status: "passed",
307
+ attempt_count: 1,
308
+ });
309
+ expect(progress.entries[2]).toMatchObject({
310
+ id: "TC-US001-03",
311
+ type: "exploratory_manual",
312
+ status: "passed",
313
+ attempt_count: 1,
314
+ });
315
+
316
+ const markdownReportRaw = await readFile(
317
+ join(projectRoot, ".agents", "flow", "it_000005_test-execution-report.md"),
318
+ "utf8",
319
+ );
320
+ expect(markdownReportRaw).toContain("# Test Execution Report (Iteration 000005)");
321
+ expect(markdownReportRaw).toContain("- Total Tests: 3");
322
+ expect(markdownReportRaw).toContain("- Passed: 3");
323
+ expect(markdownReportRaw).toContain("- Failed: 0");
324
+
325
+ const state = await readState(projectRoot);
326
+ expect(state.phases.prototype.test_execution.status).toBe("completed");
327
+ expect(state.updated_by).toBe("nvst:execute-test-plan");
328
+ });
329
+
330
+ // AC05: if agent session fails (non-zero exit), all automated tests in batch marked as failed with invocation_failed
331
+ test("marks all automated tests as failed with invocation_failed when batch agent session fails", async () => {
332
+ const projectRoot = await createProjectRoot();
333
+ createdRoots.push(projectRoot);
334
+
335
+ await seedState(projectRoot, "created", "it_000005_TP.json");
336
+ await writeProjectContext(projectRoot);
337
+ await writeApprovedTpJson(projectRoot, "it_000005_TP.json");
338
+
339
+ await withCwd(projectRoot, async () => {
340
+ await runExecuteTestPlan(
341
+ { provider: "claude" },
342
+ {
343
+ loadSkillFn: async (_pr, name) => {
344
+ if (name === "execute-test-batch") return "batch skill";
345
+ return "single skill";
346
+ },
347
+ invokeAgentFn: async (options): Promise<AgentResult> => {
348
+ // Batch invocation fails
349
+ return { exitCode: 1, stdout: "", stderr: "agent crashed" };
350
+ },
351
+ promptManualTestFn: async () => {
352
+ return { status: "passed", evidence: "ok", notes: "ok" };
353
+ },
354
+ },
355
+ );
356
+ });
357
+
358
+ const reportRaw = await readFile(
359
+ join(projectRoot, ".agents", "flow", "it_000005_test-execution-results.json"),
360
+ "utf8",
361
+ );
362
+ const report = JSON.parse(reportRaw) as {
363
+ results: Array<{
364
+ testCaseId: string;
365
+ payload: { status: string; notes: string };
366
+ passFail: "pass" | "fail" | null;
367
+ agentExitCode: number;
368
+ }>;
369
+ };
370
+
371
+ // Both automated tests marked as invocation_failed
372
+ expect(report.results[0]?.payload.status).toBe("invocation_failed");
373
+ expect(report.results[0]?.payload.notes).toContain("Agent invocation failed with exit code 1");
374
+ expect(report.results[0]?.passFail).toBeNull();
375
+ expect(report.results[0]?.agentExitCode).toBe(1);
376
+
377
+ expect(report.results[1]?.payload.status).toBe("invocation_failed");
378
+ expect(report.results[1]?.passFail).toBeNull();
379
+ expect(report.results[1]?.agentExitCode).toBe(1);
380
+
381
+ // Manual test still passed (via user prompt)
382
+ expect(report.results[2]?.payload.status).toBe("passed");
383
+ expect(report.results[2]?.passFail).toBe("pass");
384
+
385
+ const state = await readState(projectRoot);
386
+ expect(state.phases.prototype.test_execution.status).toBe("failed");
387
+ });
388
+
389
+ // AC06: if agent returns partial results, unmatched tests marked as failed
390
+ test("marks unmatched automated tests as failed when agent returns partial batch results", async () => {
391
+ const projectRoot = await createProjectRoot();
392
+ createdRoots.push(projectRoot);
393
+
394
+ await seedState(projectRoot, "created", "it_000005_TP.json");
395
+ await writeProjectContext(projectRoot);
396
+ await writeApprovedTpJson(projectRoot, "it_000005_TP.json");
397
+
398
+ await withCwd(projectRoot, async () => {
399
+ await runExecuteTestPlan(
400
+ { provider: "codex" },
401
+ {
402
+ loadSkillFn: async (_pr, name) => {
403
+ if (name === "execute-test-batch") return "batch skill";
404
+ return "single skill";
405
+ },
406
+ invokeAgentFn: async (options): Promise<AgentResult> => {
407
+ // Return results for only the first test case (partial)
408
+ return {
409
+ exitCode: 0,
410
+ stdout: JSON.stringify([
411
+ {
412
+ testCaseId: "TC-US001-01",
413
+ status: "passed",
414
+ evidence: "First test ok",
415
+ notes: "Passed",
416
+ },
417
+ ]),
418
+ stderr: "",
419
+ };
420
+ },
421
+ promptManualTestFn: async () => {
422
+ return { status: "passed", evidence: "ok", notes: "ok" };
423
+ },
424
+ },
425
+ );
426
+ });
427
+
428
+ const reportRaw = await readFile(
429
+ join(projectRoot, ".agents", "flow", "it_000005_test-execution-results.json"),
430
+ "utf8",
431
+ );
432
+ const report = JSON.parse(reportRaw) as {
433
+ results: Array<{
434
+ testCaseId: string;
435
+ payload: { status: string; evidence: string; notes: string };
436
+ passFail: "pass" | "fail" | null;
437
+ }>;
438
+ };
439
+
440
+ // First automated test passed
441
+ expect(report.results[0]?.testCaseId).toBe("TC-US001-01");
442
+ expect(report.results[0]?.payload.status).toBe("passed");
443
+ expect(report.results[0]?.passFail).toBe("pass");
444
+
445
+ // Second automated test: no result from agent -> failed
446
+ expect(report.results[1]?.testCaseId).toBe("TC-US001-02");
447
+ expect(report.results[1]?.payload.status).toBe("failed");
448
+ expect(report.results[1]?.payload.notes).toContain("No result returned by agent");
449
+ expect(report.results[1]?.passFail).toBe("fail");
450
+
451
+ // Manual test still passed
452
+ expect(report.results[2]?.testCaseId).toBe("TC-US001-03");
453
+ expect(report.results[2]?.payload.status).toBe("passed");
454
+
455
+ const state = await readState(projectRoot);
456
+ expect(state.phases.prototype.test_execution.status).toBe("failed");
457
+ });
458
+
459
+ // AC07: resume behavior preserved - already-passed automated tests excluded from batch
460
+ test("excludes already-passed automated tests from the batch on resume", async () => {
461
+ const projectRoot = await createProjectRoot();
462
+ createdRoots.push(projectRoot);
463
+
464
+ const tpFileName = "it_000005_TP.json";
465
+ await seedState(projectRoot, "created", tpFileName);
466
+ await writeProjectContext(projectRoot);
467
+ await writeApprovedTpJson(projectRoot, tpFileName);
468
+
469
+ await withCwd(projectRoot, async () => {
470
+ // First run: first automated test passes, second fails
471
+ await runExecuteTestPlan(
472
+ { provider: "claude" },
473
+ {
474
+ loadSkillFn: async (_pr, name) => {
475
+ if (name === "execute-test-batch") return "batch skill";
476
+ return "single skill";
477
+ },
478
+ invokeAgentFn: async (): Promise<AgentResult> => {
479
+ return {
480
+ exitCode: 0,
481
+ stdout: JSON.stringify([
482
+ {
483
+ testCaseId: "TC-US001-01",
484
+ status: "passed",
485
+ evidence: "ok",
486
+ notes: "ok",
487
+ },
488
+ {
489
+ testCaseId: "TC-US001-02",
490
+ status: "failed",
491
+ evidence: "assertion mismatch",
492
+ notes: "failed on second case",
493
+ },
494
+ ]),
495
+ stderr: "",
496
+ };
497
+ },
498
+ promptManualTestFn: async () => {
499
+ return { status: "passed", evidence: "ok", notes: "ok" };
500
+ },
501
+ },
502
+ );
503
+
504
+ // Second run: only the failed test should be in the batch
505
+ let rerunBatchPrompt = "";
506
+ await runExecuteTestPlan(
507
+ { provider: "claude" },
508
+ {
509
+ loadSkillFn: async (_pr, name) => {
510
+ if (name === "execute-test-batch") return "batch skill";
511
+ return "single skill";
512
+ },
513
+ invokeAgentFn: async (options): Promise<AgentResult> => {
514
+ rerunBatchPrompt = options.prompt;
515
+ return {
516
+ exitCode: 0,
517
+ stdout: JSON.stringify([
518
+ {
519
+ testCaseId: "TC-US001-02",
520
+ status: "passed",
521
+ evidence: "retry ok",
522
+ notes: "retry succeeded",
523
+ },
524
+ ]),
525
+ stderr: "",
526
+ };
527
+ },
528
+ promptManualTestFn: async () => {
529
+ // Manual test already passed, should not be called
530
+ throw new Error("Should not prompt for already-passed manual test");
531
+ },
532
+ },
533
+ );
534
+
535
+ // AC07: already-passed TC-US001-01 excluded from batch
536
+ expect(rerunBatchPrompt).toContain("TC-US001-02");
537
+ expect(rerunBatchPrompt).not.toContain("TC-US001-01");
538
+ });
539
+
540
+ const progressRaw = await readFile(
541
+ join(projectRoot, ".agents", "flow", "it_000005_test-execution-progress.json"),
542
+ "utf8",
543
+ );
544
+ const progress = JSON.parse(progressRaw) as {
545
+ entries: Array<{ id: string; status: string; attempt_count: number }>;
546
+ };
547
+
548
+ expect(progress.entries.find((entry) => entry.id === "TC-US001-01")).toMatchObject({
549
+ status: "passed",
550
+ attempt_count: 1,
551
+ });
552
+ expect(progress.entries.find((entry) => entry.id === "TC-US001-02")).toMatchObject({
553
+ status: "passed",
554
+ attempt_count: 2,
555
+ });
556
+ expect(progress.entries.find((entry) => entry.id === "TC-US001-03")).toMatchObject({
557
+ status: "passed",
558
+ attempt_count: 1,
559
+ });
560
+ });
561
+
562
+ test("derives pass/fail from payload status for automated batch and manual user input", async () => {
563
+ const projectRoot = await createProjectRoot();
564
+ createdRoots.push(projectRoot);
565
+
566
+ await seedState(projectRoot, "created", "it_000005_TP.json");
567
+ await writeProjectContext(projectRoot);
568
+ await writeApprovedTpJson(projectRoot, "it_000005_TP.json");
569
+
570
+ await withCwd(projectRoot, async () => {
571
+ await runExecuteTestPlan(
572
+ { provider: "claude" },
573
+ {
574
+ loadSkillFn: async (_pr, name) => {
575
+ if (name === "execute-test-batch") return "batch skill";
576
+ return "single skill";
577
+ },
578
+ invokeAgentFn: async (): Promise<AgentResult> => {
579
+ return {
580
+ exitCode: 0,
581
+ stdout: JSON.stringify([
582
+ {
583
+ testCaseId: "TC-US001-01",
584
+ status: "failed",
585
+ evidence: "Assertion mismatch",
586
+ notes: "Expected error message not found",
587
+ },
588
+ {
589
+ testCaseId: "TC-US001-02",
590
+ status: "skipped",
591
+ evidence: "",
592
+ notes: "Blocked by missing credentials",
593
+ },
594
+ ]),
595
+ stderr: "",
596
+ };
597
+ },
598
+ promptManualTestFn: async () => {
599
+ return { status: "failed", evidence: "UI broken", notes: "Button not clickable" };
600
+ },
601
+ },
602
+ );
603
+ });
604
+
605
+ const reportRaw = await readFile(
606
+ join(projectRoot, ".agents", "flow", "it_000005_test-execution-results.json"),
607
+ "utf8",
608
+ );
609
+ const report = JSON.parse(reportRaw) as {
610
+ results: Array<{
611
+ testCaseId: string;
612
+ payload: { status: string; evidence: string; notes: string };
613
+ passFail: "pass" | "fail" | null;
614
+ agentExitCode: number;
615
+ }>;
616
+ };
617
+
618
+ expect(report.results[0]?.payload.status).toBe("failed");
619
+ expect(report.results[0]?.passFail).toBe("fail");
620
+
621
+ expect(report.results[1]?.payload.status).toBe("skipped");
622
+ expect(report.results[1]?.passFail).toBeNull();
623
+
624
+ // Manual test: user reported failed
625
+ expect(report.results[2]?.payload.status).toBe("failed");
626
+ expect(report.results[2]?.payload.evidence).toBe("UI broken");
627
+ expect(report.results[2]?.payload.notes).toBe("Button not clickable");
628
+ expect(report.results[2]?.passFail).toBe("fail");
629
+ expect(report.results[2]?.agentExitCode).toBe(0);
630
+
631
+ const state = await readState(projectRoot);
632
+ expect(state.phases.prototype.test_execution.status).toBe("failed");
633
+ });
634
+
635
+ test("supports claude, codex, gemini, and cursor providers", () => {
636
+ expect(parseProvider("claude")).toBe("claude");
637
+ expect(parseProvider("codex")).toBe("codex");
638
+ expect(parseProvider("gemini")).toBe("gemini");
639
+ expect(parseProvider("cursor")).toBe("cursor");
640
+ });
641
+
642
+ test("updates execution progress file after each test case result from batch", async () => {
643
+ const projectRoot = await createProjectRoot();
644
+ createdRoots.push(projectRoot);
645
+
646
+ const tpFileName = "it_000005_TP.json";
647
+ await seedState(projectRoot, "created", tpFileName);
648
+ await writeProjectContext(projectRoot);
649
+ await writeApprovedTpJson(projectRoot, tpFileName);
650
+
651
+ const progressSnapshots: string[] = [];
652
+
653
+ await withCwd(projectRoot, async () => {
654
+ await runExecuteTestPlan(
655
+ { provider: "codex" },
656
+ {
657
+ loadSkillFn: async (_pr, name) => {
658
+ if (name === "execute-test-batch") return "batch skill";
659
+ return "single skill";
660
+ },
661
+ invokeAgentFn: async (): Promise<AgentResult> => {
662
+ return {
663
+ exitCode: 0,
664
+ stdout: JSON.stringify([
665
+ { testCaseId: "TC-US001-01", status: "passed", evidence: "ok", notes: "ok" },
666
+ { testCaseId: "TC-US001-02", status: "passed", evidence: "ok", notes: "ok" },
667
+ ]),
668
+ stderr: "",
669
+ };
670
+ },
671
+ promptManualTestFn: async () => {
672
+ return { status: "passed", evidence: "ok", notes: "ok" };
673
+ },
674
+ writeFileFn: async (path, data) => {
675
+ const pathAsString = path.toString();
676
+ if (pathAsString.endsWith("it_000005_test-execution-progress.json")) {
677
+ progressSnapshots.push(data.toString());
678
+ }
679
+ await writeFile(pathAsString, data.toString(), "utf8");
680
+ return 0;
681
+ },
682
+ },
683
+ );
684
+ });
685
+
686
+ // Progress should be written: initial, in_progress for batch, result per automated test, manual in_progress, manual result
687
+ expect(progressSnapshots.length).toBeGreaterThanOrEqual(5);
688
+ expect(progressSnapshots.at(-1)).toContain('"attempt_count": 1');
689
+ expect(progressSnapshots.at(-1)).toContain('"status": "passed"');
690
+ });
691
+
692
+ test("fails with a descriptive error when execute-test-case skill is missing", async () => {
693
+ const projectRoot = await createProjectRoot();
694
+ createdRoots.push(projectRoot);
695
+
696
+ await seedState(projectRoot, "created", "it_000005_TP.json");
697
+ await writeProjectContext(projectRoot);
698
+ await writeApprovedTpJson(projectRoot, "it_000005_TP.json");
699
+
700
+ await withCwd(projectRoot, async () => {
701
+ await expect(
702
+ runExecuteTestPlan(
703
+ { provider: "codex" },
704
+ {
705
+ loadSkillFn: async (_pr, name) => {
706
+ if (name === "execute-test-case") throw new Error("missing");
707
+ return "batch skill";
708
+ },
709
+ },
710
+ ),
711
+ ).rejects.toThrow(
712
+ "Required skill missing: expected .agents/skills/execute-test-case/SKILL.md.",
713
+ );
714
+ });
715
+ });
716
+
717
// The skill loader stub throws for "execute-test-batch"; the command is expected
// to reject with the message naming the missing SKILL.md path.
test("fails with a descriptive error when execute-test-batch skill is missing", async () => {
  const planFile = "it_000005_TP.json";
  const root = await createProjectRoot();
  createdRoots.push(root);

  await seedState(root, "created", planFile);
  await writeProjectContext(root);
  await writeApprovedTpJson(root, planFile);

  await withCwd(root, async () => {
    const pendingRun = runExecuteTestPlan(
      { provider: "codex" },
      {
        loadSkillFn: async (_root, skillName) => {
          if (skillName !== "execute-test-batch") {
            return "single skill";
          }
          throw new Error("missing");
        },
      },
    );

    await expect(pendingRun).rejects.toThrow(
      "Required skill missing: expected .agents/skills/execute-test-batch/SKILL.md.",
    );
  });
});
741
+
742
// Resume scenario: a first run records every case as passed; the second run
// must not send a prompt mentioning "test_cases" and must not prompt manually.
test("handles batch with no pending automated tests (all already passed)", async () => {
  const planFile = "it_000005_TP.json";
  const root = await createProjectRoot();
  createdRoots.push(root);

  await seedState(root, "created", planFile);
  await writeProjectContext(root);
  await writeApprovedTpJson(root, planFile);

  await withCwd(root, async () => {
    // Run 1: the agent reports both automated cases as passed.
    const everythingPassed = JSON.stringify([
      { testCaseId: "TC-US001-01", status: "passed", evidence: "ok", notes: "ok" },
      { testCaseId: "TC-US001-02", status: "passed", evidence: "ok", notes: "ok" },
    ]);
    await runExecuteTestPlan(
      { provider: "claude" },
      {
        loadSkillFn: async (_root, skillName) =>
          skillName === "execute-test-batch" ? "batch skill" : "single skill",
        invokeAgentFn: async (): Promise<AgentResult> => ({
          exitCode: 0,
          stdout: everythingPassed,
          stderr: "",
        }),
        promptManualTestFn: async () => ({ status: "passed", evidence: "ok", notes: "ok" }),
      },
    );

    // Run 2: record whether any agent prompt mentions "test_cases".
    let sawBatchPrompt = false;
    await runExecuteTestPlan(
      { provider: "claude" },
      {
        loadSkillFn: async (_root, skillName) =>
          skillName === "execute-test-batch" ? "batch skill" : "single skill",
        invokeAgentFn: async (options): Promise<AgentResult> => {
          const mentionsTestCases = options.prompt.includes("test_cases");
          if (mentionsTestCases) {
            sawBatchPrompt = true;
          }
          return { exitCode: 0, stdout: JSON.stringify([]), stderr: "" };
        },
        promptManualTestFn: async () => {
          // The manual case already passed in run 1, so reaching this is a failure.
          throw new Error("Should not prompt for already-passed manual test");
        },
      },
    );

    expect(sawBatchPrompt).toBe(false);
  });
});
805
+ });
806
+
807
describe("US-003: execute-test-case skill batch mode", () => {
  /** Reads `.agents/skills/execute-test-case/SKILL.md` below the current working directory. */
  const readSkillSource = (): Promise<string> =>
    readFile(join(process.cwd(), ".agents", "skills", "execute-test-case", "SKILL.md"), "utf8");

  // US-003-AC01: the skill takes an array of test case definitions as input.
  test("AC01: skill instructs agent to accept an array of test case definitions", async () => {
    const skillText = await readSkillSource();

    expect(skillText.startsWith("---\n")).toBe(true);
    for (const fragment of [
      "name: execute-test-case",
      "description:",
      "user-invocable: false",
      "`test_cases`",
      "`project_context`",
      "JSON array of test case objects",
    ]) {
      expect(skillText).toContain(fragment);
    }
  });

  // US-003-AC02: the output contract is a JSON array of {testCaseId, status, evidence, notes}.
  test("AC02: skill instructs agent to return JSON array with testCaseId, status, evidence, notes", async () => {
    const skillText = await readSkillSource();

    for (const fragment of [
      '"testCaseId"',
      '"status": "passed|failed|skipped"',
      '"evidence": "string"',
      '"notes": "string"',
      "Do not output markdown or additional text outside the JSON array.",
    ]) {
      expect(skillText).toContain(fragment);
    }
  });

  // US-003-AC03: tests run in order and each one gets its own result.
  test("AC03: skill states agent must execute each test in order and report individual results", async () => {
    const skillText = await readSkillSource();

    for (const fragment of [
      "Execute each test case in order",
      "one result object per test case",
      "Every test case in the input must have a corresponding result in the output array.",
    ]) {
      expect(skillText).toContain(fragment);
    }
  });

  // US-003-AC04: wording from the former single-test-case mode must be gone from the skill.
  test("AC04: no backward references to single-test-case mode in skill", async () => {
    const skillText = await readSkillSource();

    for (const staleFragment of [
      "Execute exactly one test case",
      "`test_case_definition`",
      "single test case",
      "outside the JSON object",
    ]) {
      expect(skillText).not.toContain(staleFragment);
    }
  });

  // US-003-AC04: the command source must no longer contain the single-test-case helpers.
  test("AC04: unused single-test-case functions removed from production code", async () => {
    const commandSource = await readFile(
      join(process.cwd(), "src", "commands", "execute-test-plan.ts"),
      "utf8",
    );

    for (const staleFragment of [
      "function buildExecutionPrompt(",
      "function parseExecutionPayload(",
      "test_case_definition",
    ]) {
      expect(commandSource).not.toContain(staleFragment);
    }
  });
});
888
+
889
describe("US-002: manual tests with user interaction", () => {
  const planFile = "it_000005_TP.json";

  // Agent stdout reused by the tests that run against the approved fixture plan:
  // both automated cases are reported as passed.
  const automatedAllPassed = JSON.stringify([
    { testCaseId: "TC-US001-01", status: "passed", evidence: "ok", notes: "ok" },
    { testCaseId: "TC-US001-02", status: "passed", evidence: "ok", notes: "ok" },
  ]);

  /** Skill stub text: one value for the batch skill, another for any other skill name. */
  const skillTextFor = (skillName: string): string =>
    skillName === "execute-test-batch" ? "batch skill" : "single skill";

  /** Creates a project root, registers it in `createdRoots`, then seeds state and project context. */
  const prepareProject = async () => {
    const root = await createProjectRoot();
    createdRoots.push(root);
    await seedState(root, "created", planFile);
    await writeProjectContext(root);
    return root;
  };

  // US-002-AC01: once the automated batch is done, pending manual tests are prompted one by one.
  test("presents pending manual tests sequentially after automated tests complete", async () => {
    const root = await prepareProject();

    // Custom plan: one automated case followed by two manual cases.
    const plan = {
      overallStatus: "pending",
      scope: ["Scope A"],
      environmentData: ["Env A"],
      automatedTests: [
        { id: "TC-AUTO-01", description: "Auto test", status: "pending", correlatedRequirements: ["US-001"] },
      ],
      exploratoryManualTests: [
        { id: "TC-MAN-01", description: "First manual test", status: "pending", correlatedRequirements: ["US-002", "FR-1"] },
        { id: "TC-MAN-02", description: "Second manual test", status: "pending", correlatedRequirements: ["US-002", "FR-2"] },
      ],
    };
    await writeFile(
      join(root, ".agents", "flow", planFile),
      `${JSON.stringify(plan, null, 2)}\n`,
      "utf8",
    );

    const promptOrder: string[] = [];

    await withCwd(root, async () => {
      await runExecuteTestPlan(
        { provider: "claude" },
        {
          loadSkillFn: async (_root, skillName) => skillTextFor(skillName),
          invokeAgentFn: async (): Promise<AgentResult> => ({
            exitCode: 0,
            stdout: JSON.stringify([
              { testCaseId: "TC-AUTO-01", status: "passed", evidence: "ok", notes: "ok" },
            ]),
            stderr: "",
          }),
          promptManualTestFn: async (testCase) => {
            promptOrder.push(testCase.id);
            return { status: "passed", evidence: "Looks good", notes: "" };
          },
        },
      );
    });

    // AC01: both manual cases were prompted, in plan order.
    expect(promptOrder).toEqual(["TC-MAN-01", "TC-MAN-02"]);
  });

  // US-002-AC02: the prompt callback receives the test ID, description and correlated requirements.
  test("passes test ID, description, and correlated requirements to user prompt function", async () => {
    type SeenTestCase = {
      id: string;
      description: string;
      correlatedRequirements: string[];
    };

    const root = await prepareProject();
    await writeApprovedTpJson(root, planFile);

    const seen: SeenTestCase[] = [];

    await withCwd(root, async () => {
      await runExecuteTestPlan(
        { provider: "claude" },
        {
          loadSkillFn: async (_root, skillName) => skillTextFor(skillName),
          invokeAgentFn: async (): Promise<AgentResult> => ({
            exitCode: 0,
            stdout: automatedAllPassed,
            stderr: "",
          }),
          promptManualTestFn: async (testCase) => {
            const { id, description, correlatedRequirements } = testCase;
            seen.push({ id, description, correlatedRequirements });
            return { status: "passed", evidence: "ok", notes: "" };
          },
        },
      );
    });

    // AC02: exactly one manual case was prompted, carrying the fixture's details.
    expect(seen).toHaveLength(1);
    const [onlyCase] = seen;
    expect(onlyCase?.id).toBe("TC-US001-03");
    expect(onlyCase?.description).toBe("Manual case");
    expect(onlyCase?.correlatedRequirements).toEqual(["US-001", "FR-3"]);
  });

  // US-002-AC03: status, evidence and notes entered by the user end up in the results report.
  test("records user-provided status, evidence, and notes for manual tests", async () => {
    type ResultsReport = {
      results: Array<{
        testCaseId: string;
        payload: { status: string; evidence: string; notes: string };
        passFail: "pass" | "fail" | null;
      }>;
    };

    const root = await prepareProject();
    await writeApprovedTpJson(root, planFile);

    await withCwd(root, async () => {
      await runExecuteTestPlan(
        { provider: "claude" },
        {
          loadSkillFn: async (_root, skillName) => skillTextFor(skillName),
          invokeAgentFn: async (): Promise<AgentResult> => ({
            exitCode: 0,
            stdout: automatedAllPassed,
            stderr: "",
          }),
          promptManualTestFn: async () => ({
            status: "failed" as const,
            evidence: "Button did not respond to click",
            notes: "Tested on Chrome 120",
          }),
        },
      );
    });

    const reportPath = join(root, ".agents", "flow", "it_000005_test-execution-results.json");
    const report: ResultsReport = JSON.parse(await readFile(reportPath, "utf8"));

    const manualResult = report.results.find((entry) => entry.testCaseId === "TC-US001-03");
    expect(manualResult).toBeDefined();
    expect(manualResult?.payload.status).toBe("failed");
    expect(manualResult?.payload.evidence).toBe("Button did not respond to click");
    expect(manualResult?.payload.notes).toBe("Tested on Chrome 120");
    expect(manualResult?.passFail).toBe("fail");
  });

  // US-002-AC04: a manual result shows up both in the progress file and as its own artifact file.
  test("records each manual test in progress file and writes artifact", async () => {
    type ProgressFile = {
      entries: Array<{
        id: string;
        type: string;
        status: string;
        attempt_count: number;
        last_agent_exit_code: number | null;
      }>;
    };
    type ArtifactFile = {
      testCaseId: string;
      attemptNumber: number;
      prompt: string;
      agentExitCode: number;
      payload: { status: string; evidence: string; notes: string };
    };

    const root = await prepareProject();
    await writeApprovedTpJson(root, planFile);

    await withCwd(root, async () => {
      await runExecuteTestPlan(
        { provider: "claude" },
        {
          loadSkillFn: async (_root, skillName) => skillTextFor(skillName),
          invokeAgentFn: async (): Promise<AgentResult> => ({
            exitCode: 0,
            stdout: automatedAllPassed,
            stderr: "",
          }),
          promptManualTestFn: async () => ({
            status: "passed",
            evidence: "All good",
            notes: "Verified manually",
          }),
        },
      );
    });

    // Progress file entry for the manual case.
    const progressPath = join(root, ".agents", "flow", "it_000005_test-execution-progress.json");
    const progress: ProgressFile = JSON.parse(await readFile(progressPath, "utf8"));

    const manualEntry = progress.entries.find((entry) => entry.id === "TC-US001-03");
    expect(manualEntry).toBeDefined();
    expect(manualEntry?.type).toBe("exploratory_manual");
    expect(manualEntry?.status).toBe("passed");
    expect(manualEntry?.attempt_count).toBe(1);
    expect(manualEntry?.last_agent_exit_code).toBeNull();

    // Artifact file written for the manual case.
    const artifactsDir = join(root, ".agents", "flow", "it_000005_test-execution-artifacts");
    const artifactNames = await readdir(artifactsDir);
    const manualArtifactName = artifactNames.find((fileName) => fileName.includes("TC-US001-03"));
    expect(manualArtifactName).toBeDefined();

    const artifact: ArtifactFile = JSON.parse(
      await readFile(join(artifactsDir, manualArtifactName!), "utf8"),
    );
    expect(artifact.testCaseId).toBe("TC-US001-03");
    expect(artifact.attemptNumber).toBe(1);
    expect(artifact.prompt).toBe("manual-user-input");
    expect(artifact.agentExitCode).toBe(0);
    expect(artifact.payload.status).toBe("passed");
    expect(artifact.payload.evidence).toBe("All good");
    expect(artifact.payload.notes).toBe("Verified manually");
  });

  // US-002-AC05: on a second run only the manual test that did not pass is prompted again.
  test("skips already-passed manual tests on resume", async () => {
    type ProgressFile = {
      entries: Array<{ id: string; status: string; attempt_count: number }>;
    };

    const root = await prepareProject();

    // Plan without automated cases and with two manual cases.
    const plan = {
      overallStatus: "pending",
      scope: ["Scope A"],
      environmentData: ["Env A"],
      automatedTests: [],
      exploratoryManualTests: [
        { id: "TC-MAN-01", description: "First manual", status: "pending", correlatedRequirements: ["US-002"] },
        { id: "TC-MAN-02", description: "Second manual", status: "pending", correlatedRequirements: ["US-002"] },
      ],
    };
    await writeFile(
      join(root, ".agents", "flow", planFile),
      `${JSON.stringify(plan, null, 2)}\n`,
      "utf8",
    );

    await withCwd(root, async () => {
      // Run 1: TC-MAN-01 passes, every other manual case fails.
      let firstRunPrompts = 0;
      await runExecuteTestPlan(
        { provider: "claude" },
        {
          loadSkillFn: async () => "skill",
          invokeAgentFn: async (): Promise<AgentResult> => ({
            exitCode: 0,
            stdout: JSON.stringify([]),
            stderr: "",
          }),
          promptManualTestFn: async (testCase) => {
            firstRunPrompts += 1;
            return testCase.id === "TC-MAN-01"
              ? { status: "passed", evidence: "ok", notes: "" }
              : { status: "failed", evidence: "broken", notes: "error" };
          },
        },
      );
      expect(firstRunPrompts).toBe(2);

      // Run 2: collect which IDs are prompted.
      const secondRunIds: string[] = [];
      await runExecuteTestPlan(
        { provider: "claude" },
        {
          loadSkillFn: async () => "skill",
          invokeAgentFn: async (): Promise<AgentResult> => ({
            exitCode: 0,
            stdout: JSON.stringify([]),
            stderr: "",
          }),
          promptManualTestFn: async (testCase) => {
            secondRunIds.push(testCase.id);
            return { status: "passed", evidence: "fixed", notes: "" };
          },
        },
      );

      // AC05: TC-MAN-01 had already passed, so only TC-MAN-02 is prompted.
      expect(secondRunIds).toEqual(["TC-MAN-02"]);
    });

    const progressPath = join(root, ".agents", "flow", "it_000005_test-execution-progress.json");
    const progress: ProgressFile = JSON.parse(await readFile(progressPath, "utf8"));
    const entryFor = (id: string) => progress.entries.find((entry) => entry.id === id);

    expect(entryFor("TC-MAN-01")).toMatchObject({ status: "passed", attempt_count: 1 });
    expect(entryFor("TC-MAN-02")).toMatchObject({ status: "passed", attempt_count: 2 });
  });

  // US-002-AC03: "skipped" is an accepted manual status and yields a null passFail in the report.
  test("handles skipped status from manual user input", async () => {
    type ResultsReport = {
      results: Array<{
        testCaseId: string;
        payload: { status: string; notes: string };
        passFail: "pass" | "fail" | null;
      }>;
    };

    const root = await prepareProject();
    await writeApprovedTpJson(root, planFile);

    await withCwd(root, async () => {
      await runExecuteTestPlan(
        { provider: "claude" },
        {
          loadSkillFn: async (_root, skillName) => skillTextFor(skillName),
          invokeAgentFn: async (): Promise<AgentResult> => ({
            exitCode: 0,
            stdout: automatedAllPassed,
            stderr: "",
          }),
          promptManualTestFn: async () => ({
            status: "skipped",
            evidence: "",
            notes: "Not applicable for this environment",
          }),
        },
      );
    });

    const reportPath = join(root, ".agents", "flow", "it_000005_test-execution-results.json");
    const report: ResultsReport = JSON.parse(await readFile(reportPath, "utf8"));

    const manualResult = report.results.find((entry) => entry.testCaseId === "TC-US001-03");
    expect(manualResult).toBeDefined();
    expect(manualResult?.payload.status).toBe("skipped");
    expect(manualResult?.passFail).toBeNull();
  });
});
1280
+
1281
describe("execute-test-batch skill definition", () => {
  // The SKILL.md under the current working directory must start with a
  // front-matter fence and contain every fragment of the batch contract below.
  test("includes required batch execution guidance and JSON array contract", async () => {
    const skillFile = join(process.cwd(), ".agents", "skills", "execute-test-batch", "SKILL.md");
    const skillText = await readFile(skillFile, "utf8");

    expect(skillText.startsWith("---\n")).toBe(true);

    const requiredFragments = [
      "name: execute-test-batch",
      "description:",
      "user-invocable: false",
      "`test_cases`",
      "`project_context`",
      '"testCaseId"',
      '"status": "passed|failed|skipped"',
      '"evidence": "string"',
      '"notes": "string"',
      "Do not output markdown or additional text outside the JSON array.",
    ];
    for (const fragment of requiredFragments) {
      expect(skillText).toContain(fragment);
    }
  });
});
1305
+
1306
+ describe("US-004: preserve report and state tracking compatibility", () => {
1307
// US-004-AC01: after one run with a passed, a failed and a manual case, the
// progress file holds one entry per test with the expected status, attempt
// count, agent exit code and error summary.
test("AC01: progress file tracks all tests (automated + manual) with correct statuses and attempt counts", async () => {
  type ProgressEntry = {
    id: string;
    type: string;
    status: string;
    attempt_count: number;
    last_agent_exit_code: number | null;
    last_error_summary: string;
    updated_at: string;
  };

  const planFile = "it_000005_TP.json";
  const root = await createProjectRoot();
  createdRoots.push(root);

  await seedState(root, "created", planFile);
  await writeProjectContext(root);
  await writeApprovedTpJson(root, planFile);

  await withCwd(root, async () => {
    await runExecuteTestPlan(
      { provider: "claude" },
      {
        loadSkillFn: async (_root, skillName) =>
          skillName === "execute-test-batch" ? "batch skill" : "single skill",
        invokeAgentFn: async (): Promise<AgentResult> => ({
          exitCode: 0,
          stdout: JSON.stringify([
            { testCaseId: "TC-US001-01", status: "passed", evidence: "ok", notes: "" },
            { testCaseId: "TC-US001-02", status: "failed", evidence: "err", notes: "assertion fail" },
          ]),
          stderr: "",
        }),
        promptManualTestFn: async () => ({ status: "passed", evidence: "manual ok", notes: "" }),
      },
    );
  });

  const progressPath = join(root, ".agents", "flow", "it_000005_test-execution-progress.json");
  const progress: { entries: ProgressEntry[] } = JSON.parse(await readFile(progressPath, "utf8"));

  // One entry per test case in the plan.
  expect(progress.entries).toHaveLength(3);
  const [automatedPassed, automatedFailed, manualPassed] = progress.entries;

  // Automated case reported as passed.
  expect(automatedPassed).toMatchObject({
    id: "TC-US001-01",
    type: "automated",
    status: "passed",
    attempt_count: 1,
    last_agent_exit_code: 0,
    last_error_summary: "",
  });
  expect(automatedPassed?.updated_at).toBeTruthy();

  // Automated case reported as failed: the agent's notes become the error summary.
  expect(automatedFailed).toMatchObject({
    id: "TC-US001-02",
    type: "automated",
    status: "failed",
    attempt_count: 1,
    last_agent_exit_code: 0,
    last_error_summary: "assertion fail",
  });

  // Manual case: no agent was involved, so the exit code is null.
  expect(manualPassed).toMatchObject({
    id: "TC-US001-03",
    type: "exploratory_manual",
    status: "passed",
    attempt_count: 1,
    last_agent_exit_code: null,
    last_error_summary: "",
  });
});
1392
+
1393
// US-004-AC01: a second run retries only the failed automated case, so its
// attempt count reaches 2 while the other two entries stay at 1.
test("AC01: attempt counts increment correctly on retries", async () => {
  type ProgressFile = {
    entries: Array<{ id: string; attempt_count: number; status: string }>;
  };

  const planFile = "it_000005_TP.json";
  const root = await createProjectRoot();
  createdRoots.push(root);

  await seedState(root, "created", planFile);
  await writeProjectContext(root);
  await writeApprovedTpJson(root, planFile);

  await withCwd(root, async () => {
    // Run 1: TC-US001-02 is reported as failed.
    await runExecuteTestPlan(
      { provider: "claude" },
      {
        loadSkillFn: async (_root, skillName) =>
          skillName === "execute-test-batch" ? "batch skill" : "single skill",
        invokeAgentFn: async (): Promise<AgentResult> => ({
          exitCode: 0,
          stdout: JSON.stringify([
            { testCaseId: "TC-US001-01", status: "passed", evidence: "ok", notes: "" },
            { testCaseId: "TC-US001-02", status: "failed", evidence: "err", notes: "fail" },
          ]),
          stderr: "",
        }),
        promptManualTestFn: async () => ({ status: "passed", evidence: "ok", notes: "" }),
      },
    );

    // Run 2: the agent now reports TC-US001-02 as passed.
    await runExecuteTestPlan(
      { provider: "claude" },
      {
        loadSkillFn: async (_root, skillName) =>
          skillName === "execute-test-batch" ? "batch skill" : "single skill",
        invokeAgentFn: async (): Promise<AgentResult> => ({
          exitCode: 0,
          stdout: JSON.stringify([
            { testCaseId: "TC-US001-02", status: "passed", evidence: "retry ok", notes: "" },
          ]),
          stderr: "",
        }),
        promptManualTestFn: async () => {
          throw new Error("Should not prompt for already-passed manual test");
        },
      },
    );
  });

  const progressPath = join(root, ".agents", "flow", "it_000005_test-execution-progress.json");
  const progress: ProgressFile = JSON.parse(await readFile(progressPath, "utf8"));
  const entryFor = (id: string) => progress.entries.find((entry) => entry.id === id);

  expect(entryFor("TC-US001-01")).toMatchObject({ attempt_count: 1, status: "passed" });
  expect(entryFor("TC-US001-02")).toMatchObject({ attempt_count: 2, status: "passed" });
  expect(entryFor("TC-US001-03")).toMatchObject({ attempt_count: 1, status: "passed" });
});
1473
+
1474
// US-004-AC02: every test case attempt produces its own artifact file, named
// `<id>_attempt_<3-digit number>.json`, with the expected fields.
test("AC02: artifacts written per test case per attempt with correct directory structure and schema", async () => {
  const planFile = "it_000005_TP.json";
  const root = await createProjectRoot();
  createdRoots.push(root);

  await seedState(root, "created", planFile);
  await writeProjectContext(root);
  await writeApprovedTpJson(root, planFile);

  await withCwd(root, async () => {
    // Run 1: one automated case fails; the agent also emits stderr output.
    await runExecuteTestPlan(
      { provider: "claude" },
      {
        loadSkillFn: async (_root, skillName) =>
          skillName === "execute-test-batch" ? "batch skill" : "single skill",
        invokeAgentFn: async (): Promise<AgentResult> => ({
          exitCode: 0,
          stdout: JSON.stringify([
            { testCaseId: "TC-US001-01", status: "passed", evidence: "ok", notes: "fine" },
            { testCaseId: "TC-US001-02", status: "failed", evidence: "err", notes: "broken" },
          ]),
          stderr: "some stderr",
        }),
        promptManualTestFn: async () => ({
          status: "passed",
          evidence: "manual ok",
          notes: "verified",
        }),
      },
    );

    // Run 2: the previously failed case is reported as passed.
    await runExecuteTestPlan(
      { provider: "claude" },
      {
        loadSkillFn: async (_root, skillName) =>
          skillName === "execute-test-batch" ? "batch skill" : "single skill",
        invokeAgentFn: async (): Promise<AgentResult> => ({
          exitCode: 0,
          stdout: JSON.stringify([
            { testCaseId: "TC-US001-02", status: "passed", evidence: "fixed", notes: "ok now" },
          ]),
          stderr: "",
        }),
        promptManualTestFn: async () => {
          throw new Error("Should not prompt for already-passed manual test");
        },
      },
    );
  });

  const artifactsDir = join(root, ".agents", "flow", "it_000005_test-execution-artifacts");
  const loadArtifact = async (fileName: string): Promise<Record<string, unknown>> =>
    JSON.parse(await readFile(join(artifactsDir, fileName), "utf8"));

  const artifactNames = (await readdir(artifactsDir)).sort();

  // Three artifacts from run 1 plus one from the retry in run 2.
  expect(artifactNames).toHaveLength(4);

  // File naming: {sanitized_id}_attempt_{padded_number}.json
  for (const expectedName of [
    "TC-US001-01_attempt_001.json",
    "TC-US001-02_attempt_001.json",
    "TC-US001-02_attempt_002.json",
    "TC-US001-03_attempt_001.json",
  ]) {
    expect(artifactNames).toContain(expectedName);
  }

  // Schema of an artifact produced by the agent.
  const automated = await loadArtifact("TC-US001-01_attempt_001.json");
  expect(automated).toHaveProperty("testCaseId", "TC-US001-01");
  expect(automated).toHaveProperty("attemptNumber", 1);
  expect(automated).toHaveProperty("prompt");
  expect(typeof automated.prompt).toBe("string");
  expect(automated).toHaveProperty("agentExitCode", 0);
  expect(automated).toHaveProperty("stdout");
  expect(automated).toHaveProperty("stderr", "some stderr");
  expect(automated).toHaveProperty("payload");
  expect(automated.payload).toMatchObject({ status: "passed", evidence: "ok", notes: "fine" });

  // Schema of an artifact produced from manual user input.
  const manual = await loadArtifact("TC-US001-03_attempt_001.json");
  expect(manual).toHaveProperty("testCaseId", "TC-US001-03");
  expect(manual).toHaveProperty("attemptNumber", 1);
  expect(manual).toHaveProperty("prompt", "manual-user-input");
  expect(manual).toHaveProperty("agentExitCode", 0);
  expect(manual).toHaveProperty("stdout");
  expect(manual).toHaveProperty("stderr", "");
  expect(manual.payload).toMatchObject({ status: "passed", evidence: "manual ok", notes: "verified" });

  // The retry artifact carries the incremented attempt number.
  const retried = await loadArtifact("TC-US001-02_attempt_002.json");
  expect(retried).toHaveProperty("testCaseId", "TC-US001-02");
  expect(retried).toHaveProperty("attemptNumber", 2);
});
1586
+
1587
// US-004-AC03: the JSON results file and the markdown report are both written with the expected structure.
test("AC03: markdown report and JSON results files generated with correct structure", async () => {
  const root = await createProjectRoot();
  createdRoots.push(root);

  const planFile = "it_000005_TP.json";
  await seedState(root, "created", planFile);
  await writeProjectContext(root);
  await writeApprovedTpJson(root, planFile);

  // Resolves a file name inside the iteration flow directory of this project root.
  const flowFile = (name: string) => join(root, ".agents", "flow", name);

  // Mocked batch agent output: one passing and one failing automated test case.
  const batchOutput = JSON.stringify([
    { testCaseId: "TC-US001-01", status: "passed", evidence: "ev1", notes: "n1" },
    { testCaseId: "TC-US001-02", status: "failed", evidence: "ev2", notes: "n2" },
  ]);

  await withCwd(root, async () => {
    await runExecuteTestPlan(
      { provider: "claude" },
      {
        loadSkillFn: async (_projectRoot, name) =>
          name === "execute-test-batch" ? "batch skill" : "single skill",
        invokeAgentFn: async (): Promise<AgentResult> => ({
          exitCode: 0,
          stdout: batchOutput,
          stderr: "",
        }),
        // The manual prompt is answered with "skipped".
        promptManualTestFn: async () => ({ status: "skipped", evidence: "", notes: "N/A" }),
      },
    );
  });

  // --- JSON results file ---
  const parsedResults = JSON.parse(
    await readFile(flowFile("it_000005_test-execution-results.json"), "utf8"),
  ) as Record<string, unknown>;

  expect(parsedResults).toHaveProperty("iteration", "000005");
  expect(parsedResults).toHaveProperty("testPlanFile", planFile);
  expect(parsedResults).toHaveProperty("executedTestIds");
  expect(Array.isArray(parsedResults.executedTestIds)).toBe(true);
  expect(parsedResults).toHaveProperty("results");
  expect(Array.isArray(parsedResults.results)).toBe(true);

  const entries = parsedResults.results as Array<Record<string, unknown>>;
  expect(entries).toHaveLength(3);

  // Every result entry must carry the full set of required fields.
  entries.forEach((entry) => {
    expect(entry).toHaveProperty("testCaseId");
    expect(entry).toHaveProperty("description");
    expect(entry).toHaveProperty("correlatedRequirements");
    expect(Array.isArray(entry.correlatedRequirements)).toBe(true);
    expect(entry).toHaveProperty("mode");
    expect(["automated", "exploratory_manual"]).toContain(entry.mode as string);
    expect(entry).toHaveProperty("payload");
    const entryPayload = entry.payload as Record<string, unknown>;
    expect(entryPayload).toHaveProperty("status");
    expect(entryPayload).toHaveProperty("evidence");
    expect(entryPayload).toHaveProperty("notes");
    expect(entry).toHaveProperty("passFail");
    expect([null, "pass", "fail"]).toContain(entry.passFail as string | null);
    expect(entry).toHaveProperty("agentExitCode");
    expect(typeof entry.agentExitCode).toBe("number");
    expect(entry).toHaveProperty("artifactReferences");
    expect(Array.isArray(entry.artifactReferences)).toBe(true);
  });

  const [firstEntry, secondEntry, thirdEntry] = entries;

  // passFail is derived from the payload status: passed -> pass, failed -> fail, skipped -> null.
  expect(firstEntry!.passFail).toBe("pass");
  expect(secondEntry!.passFail).toBe("fail");
  expect(thirdEntry!.passFail).toBeNull();

  // The first two entries are automated; the third is the manual one.
  expect(firstEntry!.mode).toBe("automated");
  expect(secondEntry!.mode).toBe("automated");
  expect(thirdEntry!.mode).toBe("exploratory_manual");

  // --- Markdown report ---
  const report = await readFile(flowFile("it_000005_test-execution-report.md"), "utf8");

  const expectedFragments = [
    // Header and summary counters
    "# Test Execution Report (Iteration 000005)",
    "- Test Plan: `it_000005_TP.json`",
    "- Total Tests: 3",
    "- Passed: 1",
    "- Failed: 2",
    // Results table header and separator
    "| Test ID | Description | Status | Correlated Requirements | Artifacts |",
    "| --- | --- | --- | --- | --- |",
    // All three test cases appear in the table
    "TC-US001-01",
    "TC-US001-02",
    "TC-US001-03",
    // Artifact references are present in the table
    "_attempt_001.json",
  ];
  for (const fragment of expectedFragments) {
    expect(report).toContain(fragment);
  }
});
1690
+
1691
// US-004-AC04: test_execution state is in_progress while the agent runs and completed once every test passes.
test("AC04: state is in_progress during execution, completed when all pass", async () => {
  const root = await createProjectRoot();
  createdRoots.push(root);

  const planFile = "it_000005_TP.json";
  await seedState(root, "created", planFile);
  await writeProjectContext(root);
  await writeApprovedTpJson(root, planFile);

  // Snapshots of the test_execution phase taken from inside the mocked agent call.
  const observed: Array<{ status: string; file: string | null }> = [];

  const allPassedOutput = JSON.stringify([
    { testCaseId: "TC-US001-01", status: "passed", evidence: "ok", notes: "" },
    { testCaseId: "TC-US001-02", status: "passed", evidence: "ok", notes: "" },
  ]);

  await withCwd(root, async () => {
    await runExecuteTestPlan(
      { provider: "claude" },
      {
        loadSkillFn: async (_projectRoot, name) =>
          name === "execute-test-batch" ? "batch skill" : "single skill",
        invokeAgentFn: async (): Promise<AgentResult> => {
          // Read the persisted state mid-run, before returning the agent result.
          const { status, file } = (await readState(root)).phases.prototype.test_execution;
          observed.push({ status, file });
          return { exitCode: 0, stdout: allPassedOutput, stderr: "" };
        },
        promptManualTestFn: async () => ({ status: "passed", evidence: "ok", notes: "" }),
      },
    );
  });

  // Exactly one snapshot was captured, and it shows the in_progress state.
  expect(observed).toHaveLength(1);
  expect(observed[0]!.status).toBe("in_progress");
  expect(observed[0]!.file).toBe("it_000005_test-execution-progress.json");

  // After the run, with every test passed, the phase is completed.
  const stateAfterRun = await readState(root);
  expect(stateAfterRun.phases.prototype.test_execution.status).toBe("completed");
  expect(stateAfterRun.phases.prototype.test_execution.file).toBe("it_000005_test-execution-progress.json");
  expect(stateAfterRun.updated_by).toBe("nvst:execute-test-plan");
});
1745
+
1746
// A single failing automated test case leaves the test_execution phase in the "failed" state.
test("AC04: state is failed when any test fails", async () => {
  const root = await createProjectRoot();
  createdRoots.push(root);

  const planFile = "it_000005_TP.json";
  await seedState(root, "created", planFile);
  await writeProjectContext(root);
  await writeApprovedTpJson(root, planFile);

  // Mocked batch output: the second automated test case fails.
  const mixedOutput = JSON.stringify([
    { testCaseId: "TC-US001-01", status: "passed", evidence: "ok", notes: "" },
    { testCaseId: "TC-US001-02", status: "failed", evidence: "err", notes: "fail" },
  ]);

  await withCwd(root, async () => {
    await runExecuteTestPlan(
      { provider: "claude" },
      {
        loadSkillFn: async (_projectRoot, name) =>
          name === "execute-test-batch" ? "batch skill" : "single skill",
        invokeAgentFn: async (): Promise<AgentResult> => ({
          exitCode: 0,
          stdout: mixedOutput,
          stderr: "",
        }),
        promptManualTestFn: async () => ({ status: "passed", evidence: "ok", notes: "" }),
      },
    );
  });

  const stateAfterRun = await readState(root);
  expect(stateAfterRun.phases.prototype.test_execution.status).toBe("failed");
  expect(stateAfterRun.phases.prototype.test_execution.file).toBe("it_000005_test-execution-progress.json");
  expect(stateAfterRun.updated_by).toBe("nvst:execute-test-plan");
});
1785
+
1786
// A non-zero agent exit code with empty stdout leaves the test_execution phase in the "failed" state.
test("AC04: state is failed when agent invocation fails", async () => {
  const root = await createProjectRoot();
  createdRoots.push(root);

  const planFile = "it_000005_TP.json";
  await seedState(root, "created", planFile);
  await writeProjectContext(root);
  await writeApprovedTpJson(root, planFile);

  await withCwd(root, async () => {
    await runExecuteTestPlan(
      { provider: "claude" },
      {
        loadSkillFn: async (_projectRoot, name) =>
          name === "execute-test-batch" ? "batch skill" : "single skill",
        // Mocked agent crash: exit code 1, nothing on stdout.
        invokeAgentFn: async (): Promise<AgentResult> => ({
          exitCode: 1,
          stdout: "",
          stderr: "crashed",
        }),
        promptManualTestFn: async () => ({ status: "passed", evidence: "ok", notes: "" }),
      },
    );
  });

  const stateAfterRun = await readState(root);
  expect(stateAfterRun.phases.prototype.test_execution.status).toBe("failed");
});
1816
+
1817
// US-004-AC03: executedTestIds in the JSON results lists only the tests run by the latest execution.
test("AC03: executedTestIds tracks only tests executed in current run, not previously passed", async () => {
  const root = await createProjectRoot();
  createdRoots.push(root);

  const planFile = "it_000005_TP.json";
  await seedState(root, "created", planFile);
  await writeProjectContext(root);
  await writeApprovedTpJson(root, planFile);

  // Agent output for the first run: TC-US001-02 fails.
  const firstRunOutput = JSON.stringify([
    { testCaseId: "TC-US001-01", status: "passed", evidence: "ok", notes: "" },
    { testCaseId: "TC-US001-02", status: "failed", evidence: "err", notes: "fail" },
  ]);
  // Agent output for the second run: only TC-US001-02 is reported, now passing.
  const secondRunOutput = JSON.stringify([
    { testCaseId: "TC-US001-02", status: "passed", evidence: "fixed", notes: "" },
  ]);

  await withCwd(root, async () => {
    // Run 1: one automated test fails, the manual test passes.
    await runExecuteTestPlan(
      { provider: "claude" },
      {
        loadSkillFn: async (_projectRoot, name) =>
          name === "execute-test-batch" ? "batch skill" : "single skill",
        invokeAgentFn: async (): Promise<AgentResult> => ({
          exitCode: 0,
          stdout: firstRunOutput,
          stderr: "",
        }),
        promptManualTestFn: async () => ({ status: "passed", evidence: "ok", notes: "" }),
      },
    );

    // Run 2: only the previously failed test is retried; the manual prompt must not be invoked.
    await runExecuteTestPlan(
      { provider: "claude" },
      {
        loadSkillFn: async (_projectRoot, name) =>
          name === "execute-test-batch" ? "batch skill" : "single skill",
        invokeAgentFn: async (): Promise<AgentResult> => ({
          exitCode: 0,
          stdout: secondRunOutput,
          stderr: "",
        }),
        promptManualTestFn: async () => {
          throw new Error("Should not prompt");
        },
      },
    );
  });

  const { executedTestIds, results: allEntries } = JSON.parse(
    await readFile(
      join(root, ".agents", "flow", "it_000005_test-execution-results.json"),
      "utf8",
    ),
  ) as {
    executedTestIds: string[];
    results: Array<{ testCaseId: string }>;
  };

  // Only TC-US001-02 was executed in the second run...
  expect(executedTestIds).toEqual(["TC-US001-02"]);
  // ...yet the results list still contains all three test cases.
  expect(allEntries).toHaveLength(3);
  expect(allEntries.map(({ testCaseId }) => testCaseId)).toEqual(["TC-US001-01", "TC-US001-02", "TC-US001-03"]);
});
1891
+ });