ace-test-runner-e2e 0.29.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. checksums.yaml +7 -0
  2. data/.ace-defaults/e2e-runner/config.yml +70 -0
  3. data/.ace-defaults/nav/protocols/guide-sources/ace-test-runner-e2e.yml +11 -0
  4. data/.ace-defaults/nav/protocols/skill-sources/ace-test-runner-e2e.yml +19 -0
  5. data/.ace-defaults/nav/protocols/tmpl-sources/ace-test-runner-e2e.yml +12 -0
  6. data/.ace-defaults/nav/protocols/wfi-sources/ace-test-runner-e2e.yml +11 -0
  7. data/CHANGELOG.md +1166 -0
  8. data/LICENSE +21 -0
  9. data/README.md +42 -0
  10. data/Rakefile +15 -0
  11. data/exe/ace-test-e2e +15 -0
  12. data/exe/ace-test-e2e-sh +67 -0
  13. data/exe/ace-test-e2e-suite +13 -0
  14. data/handbook/guides/e2e-testing.g.md +124 -0
  15. data/handbook/guides/scenario-yml-reference.g.md +182 -0
  16. data/handbook/guides/tc-authoring.g.md +131 -0
  17. data/handbook/skills/as-e2e-create/SKILL.md +30 -0
  18. data/handbook/skills/as-e2e-fix/SKILL.md +35 -0
  19. data/handbook/skills/as-e2e-manage/SKILL.md +31 -0
  20. data/handbook/skills/as-e2e-plan-changes/SKILL.md +30 -0
  21. data/handbook/skills/as-e2e-review/SKILL.md +35 -0
  22. data/handbook/skills/as-e2e-rewrite/SKILL.md +31 -0
  23. data/handbook/skills/as-e2e-run/SKILL.md +48 -0
  24. data/handbook/skills/as-e2e-setup-sandbox/SKILL.md +34 -0
  25. data/handbook/templates/ace-taskflow-fixture.template.md +322 -0
  26. data/handbook/templates/agent-experience-report.template.md +89 -0
  27. data/handbook/templates/metadata.template.yml +49 -0
  28. data/handbook/templates/scenario.yml.template.yml +60 -0
  29. data/handbook/templates/tc-file.template.md +45 -0
  30. data/handbook/templates/test-report.template.md +94 -0
  31. data/handbook/workflow-instructions/e2e/analyze-failures.wf.md +126 -0
  32. data/handbook/workflow-instructions/e2e/create.wf.md +395 -0
  33. data/handbook/workflow-instructions/e2e/execute.wf.md +253 -0
  34. data/handbook/workflow-instructions/e2e/fix.wf.md +166 -0
  35. data/handbook/workflow-instructions/e2e/manage.wf.md +179 -0
  36. data/handbook/workflow-instructions/e2e/plan-changes.wf.md +255 -0
  37. data/handbook/workflow-instructions/e2e/review.wf.md +286 -0
  38. data/handbook/workflow-instructions/e2e/rewrite.wf.md +281 -0
  39. data/handbook/workflow-instructions/e2e/run.wf.md +355 -0
  40. data/handbook/workflow-instructions/e2e/setup-sandbox.wf.md +461 -0
  41. data/lib/ace/test/end_to_end_runner/atoms/display_helpers.rb +234 -0
  42. data/lib/ace/test/end_to_end_runner/atoms/prompt_builder.rb +199 -0
  43. data/lib/ace/test/end_to_end_runner/atoms/result_parser.rb +166 -0
  44. data/lib/ace/test/end_to_end_runner/atoms/skill_prompt_builder.rb +166 -0
  45. data/lib/ace/test/end_to_end_runner/atoms/skill_result_parser.rb +244 -0
  46. data/lib/ace/test/end_to_end_runner/atoms/suite_report_prompt_builder.rb +103 -0
  47. data/lib/ace/test/end_to_end_runner/atoms/tc_fidelity_validator.rb +39 -0
  48. data/lib/ace/test/end_to_end_runner/atoms/test_case_parser.rb +108 -0
  49. data/lib/ace/test/end_to_end_runner/cli/commands/run_suite.rb +130 -0
  50. data/lib/ace/test/end_to_end_runner/cli/commands/run_test.rb +156 -0
  51. data/lib/ace/test/end_to_end_runner/models/test_case.rb +47 -0
  52. data/lib/ace/test/end_to_end_runner/models/test_result.rb +115 -0
  53. data/lib/ace/test/end_to_end_runner/models/test_scenario.rb +90 -0
  54. data/lib/ace/test/end_to_end_runner/molecules/affected_detector.rb +92 -0
  55. data/lib/ace/test/end_to_end_runner/molecules/config_loader.rb +75 -0
  56. data/lib/ace/test/end_to_end_runner/molecules/failure_finder.rb +203 -0
  57. data/lib/ace/test/end_to_end_runner/molecules/fixture_copier.rb +35 -0
  58. data/lib/ace/test/end_to_end_runner/molecules/pipeline_executor.rb +121 -0
  59. data/lib/ace/test/end_to_end_runner/molecules/pipeline_prompt_bundler.rb +182 -0
  60. data/lib/ace/test/end_to_end_runner/molecules/pipeline_report_generator.rb +321 -0
  61. data/lib/ace/test/end_to_end_runner/molecules/pipeline_sandbox_builder.rb +131 -0
  62. data/lib/ace/test/end_to_end_runner/molecules/progress_display_manager.rb +172 -0
  63. data/lib/ace/test/end_to_end_runner/molecules/report_writer.rb +259 -0
  64. data/lib/ace/test/end_to_end_runner/molecules/scenario_loader.rb +254 -0
  65. data/lib/ace/test/end_to_end_runner/molecules/setup_executor.rb +181 -0
  66. data/lib/ace/test/end_to_end_runner/molecules/simple_display_manager.rb +72 -0
  67. data/lib/ace/test/end_to_end_runner/molecules/suite_progress_display_manager.rb +223 -0
  68. data/lib/ace/test/end_to_end_runner/molecules/suite_report_writer.rb +277 -0
  69. data/lib/ace/test/end_to_end_runner/molecules/suite_simple_display_manager.rb +116 -0
  70. data/lib/ace/test/end_to_end_runner/molecules/test_discoverer.rb +136 -0
  71. data/lib/ace/test/end_to_end_runner/molecules/test_executor.rb +332 -0
  72. data/lib/ace/test/end_to_end_runner/organisms/suite_orchestrator.rb +830 -0
  73. data/lib/ace/test/end_to_end_runner/organisms/test_orchestrator.rb +442 -0
  74. data/lib/ace/test/end_to_end_runner/version.rb +9 -0
  75. data/lib/ace/test/end_to_end_runner.rb +71 -0
  76. metadata +220 -0
@@ -0,0 +1,395 @@
1
+ ---
2
+ doc-type: workflow
3
+ title: Create E2E Test Workflow
4
+ purpose: Create a new E2E test scenario from template
5
+ ace-docs:
6
+ last-updated: 2026-03-12
7
+ last-checked: 2026-03-21
8
+ ---
9
+
10
+ # Create E2E Test Workflow
11
+
12
+ This workflow guides an agent through creating a new E2E test scenario.
13
+
14
+ ## Arguments
15
+
16
+ - `PACKAGE` (required) - The package for the test (e.g., `ace-lint`)
17
+ - `AREA` (required) - The test area code (e.g., `LINT`, `REVIEW`, `GIT`)
18
+ - `--format ts` (optional, default) - Test format. Creates a directory with `scenario.yml`, `runner.yml.md`, `verifier.yml.md`, and TC runner/verifier pairs (TS-format). This is the only supported format.
19
+ - `--context <description>` (optional) - Description of what the test should verify
20
+
21
+ ## Canonical Conventions
22
+
23
+ - Scenario ID format: `TS-<PACKAGE_SHORT>-<NNN>[-slug]`
24
+ - Standalone files: `TC-*.runner.md` and `TC-*.verify.md`
25
+ - TC artifact layout: `results/tc/{NN}/`
26
+ - Summary counters: `tcs-passed`, `tcs-failed`, `tcs-total`, `failed[].tc`
27
+ - CLI split reminder:
28
+ - `ace-test-e2e` for single-package execution
29
+ - `ace-test-e2e-suite` for suite-level execution
30
+
31
+ ## Authoring Contract
32
+
33
+ - Runner files (`runner.yml.md`, `TC-*.runner.md`) are execution-only.
34
+ - Verifier files (`verifier.yml.md`, `TC-*.verify.md`) are verdict-only with impact-first evidence order:
35
+ 1. sandbox/project state impact
36
+ 2. explicit artifacts
37
+ 3. debug captures as fallback
38
+ - Setup belongs to `scenario.yml` `setup:` and fixtures; do not duplicate setup in runner TC instructions.
39
+
40
+ ## Workflow Steps
41
+
42
+ ### 1. Validate Inputs
43
+
44
+ **Check package exists:**
45
+ ```bash
46
+ test -d "{PACKAGE}" && echo "Package exists" || echo "Package not found"
47
+ ```
48
+
49
+ If package doesn't exist, list available packages:
50
+ ```bash
51
+ ls -d */ | grep -E "^ace-" | sed 's/\/$//'
52
+ ```
53
+
54
+ **Normalize area code:**
55
+ - Convert to uppercase (e.g., `lint` -> `LINT`)
56
+ - Verify it's a valid area name (2-10 alphanumeric characters)
57
+
58
+ ### 2. Generate Test ID
59
+
60
+ Find the next available test ID:
61
+
62
+ ```bash
63
+ # Search TS-format directories
64
+ find {PACKAGE}/test/e2e -maxdepth 1 -type d -name "TS-{AREA}-*" 2>/dev/null | \
65
+ sed 's/.*TS-{AREA}-\([0-9]*\).*/\1/'
66
+ ```
67
+
68
+ Sort and take the highest number:
69
+ - If no existing tests: use `001`
70
+ - Otherwise: increment the highest number by 1
71
+ - Format as three digits (e.g., `001`, `002`, `015`)
72
+
73
+ Result: `TS-{AREA}-{NNN}` (e.g., `TS-LINT-003`)
74
+
75
+ ### 3. Create Directory
76
+
77
+ Ensure the E2E test directory exists:
78
+
79
+ ```bash
80
+ mkdir -p {PACKAGE}/test/e2e
81
+ ```
82
+
83
+ ### 4. Generate Test Slug
84
+
85
+ Create a kebab-case slug:
86
+
87
+ **If --context provided:**
88
+ - Extract key words from the context description
89
+ - Convert to lowercase
90
+ - Replace spaces with hyphens
91
+ - Limit to 5-6 words
92
+
93
+ **If no context:**
94
+ - Use a placeholder: `new-test-scenario`
95
+
96
+ Example: "Test config file validation" -> `config-file-validation`
97
+
98
+ The slug is the directory name suffix: `TS-LINT-003-config-file-validation/`
99
+
100
+ ### 5. Load Template
101
+
102
+ Load the test template:
103
+ ```bash
104
+ ace-bundle tmpl://test-e2e
105
+ ```
106
+
107
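+ Or read the templates directly (`scenario.yml.template.yml` for the scenario, `tc-file.template.md` for each TC):
108
+ ```
109
+ ace-test-runner-e2e/handbook/templates/{scenario.yml.template.yml,tc-file.template.md}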
110
+ ```
111
+
112
+ ### 6. Populate Template
113
+
114
+ Replace template placeholders with actual values:
115
+
116
+ | Placeholder | Value |
117
+ |-------------|-------|
118
+ | `{AREA}` | Area code (uppercase) |
119
+ | `{NNN}` | Sequential number (3 digits) |
120
+ | `{short-pkg}` | Package name without `ace-` prefix (e.g., `git-commit`) |
121
+ | `{short-id}` | Lowercase test number (e.g., `ts001`) |
122
+ | `{Descriptive Title}` | Generated from context or area |
123
+ | `{area-name}` | Area code (lowercase) |
124
+
125
+ Initial values for optional fields:
126
+ - `priority: medium`
127
+ - `duration: ~10min`
128
+ - `automation-candidate: false`
129
+ - `cost-tier: smoke`
130
+ - `tags: [{cost-tier}, "use-case:{area}"]`
131
+ - `e2e-justification:` (brief statement of why this cannot be unit-only)
132
+ - `unit-coverage-reviewed:` (list of unit test files checked during Value Gate)
133
+ - `last-verified:` (leave empty)
134
+ - `verified-by:` (leave empty)
135
+
136
+ ### 7. E2E Value Gate Check
137
+
138
+ Before generating test cases, verify the proposed test has genuine E2E value.
139
+
140
+ **Check unit test coverage:**
141
+ ```bash
142
+ # Search for existing unit tests covering this area
143
+ find {PACKAGE}/test/atoms {PACKAGE}/test/molecules {PACKAGE}/test/organisms \
144
+ -name "*_test.rb" 2>/dev/null | head -20
145
+ ```
146
+
147
+ Read the relevant test files and count assertions covering the behavior described in `--context`.
148
+
149
+ **Apply the gate per TC:**
150
+ For each proposed TC, answer: **"Does this require the full CLI binary + real external tools + real filesystem I/O?"**
151
+
152
+ - If **YES**: proceed to TC generation
153
+ - If **NO**: note that unit tests cover this behavior and skip the TC
154
+ - If **PARTIAL**: create the TC but scope it to only the E2E-exclusive aspects
155
+
156
+ **Example decisions:**
157
+ - "Test that invalid YAML config produces error" — check if `atoms/config_parser_test.rb` already asserts this. If so, **skip** (unit test covers it). If unit test checks parsing but not the full CLI exit code path, **create** a TC scoped to just the exit code.
158
+ - "Test that StandardRB subprocess executes and returns results" — unit tests stub the subprocess. **Create** this as E2E because it requires the real tool.
159
+
160
+ If all proposed TCs fail the gate, report to the user:
161
+ ```
162
+ All proposed behaviors are already covered by unit tests in {PACKAGE}/test/.
163
+ No E2E test needed. Consider adding unit tests instead if coverage gaps exist.
164
+ ```
165
+
166
+ ### 7a. E2E Decision Record (Required)
167
+
168
+ Before writing files, produce a decision record table for every candidate TC:
169
+
170
+ | TC ID | Decision (KEEP/ADD/SKIP) | E2E-only reason | Unit tests reviewed |
171
+ |-------|---------------------------|-----------------|---------------------|
172
+ | {tc-id} | {decision} | {why this needs real CLI/tools/fs} | {path1,path2} |
173
+
174
+ Rules:
175
+ - No TC may be created without a row in this table.
176
+ - If decision is `SKIP`, include the unit-test evidence that replaces it.
177
+ - At least one `unit tests reviewed` path is required for each row.
178
+ - The scenario-level `unit-coverage-reviewed` field must include the union of all referenced unit test files.
179
+
180
+ ### 8. Context-Based Generation (if --context)
181
+
182
+ If a context description was provided, enhance the test with:
183
+
184
+ **Research the package:**
185
+ 1. **Run unit tests first** (`ace-test` in the package) — they are the ground truth for implemented behavior
186
+ 2. Examine the relevant code in `{PACKAGE}/lib/`
187
+ 3. Check existing unit tests for expected behavior patterns
188
+ 4. Understand the feature being tested
189
+ 5. **Run the tool** to observe actual behavior, output format, file paths, and exit codes
190
+ 6. **Verify config/input formats** by reading the actual parsing code — never assume formats from design specs or task descriptions
191
+
192
+ **Generate test content:**
193
+ 1. Write a clear objective based on the context
194
+ 2. Identify prerequisites for the test
195
+ 3. Create appropriate test data setup
196
+ 4. Generate test cases following the rules below
197
+ 5. Define success criteria
198
+
199
+ #### Test Case Generation Rules
200
+
201
+ **MUST (required for all E2E tests):**
202
+ - **Verify the feature is implemented** before writing the test — read the actual implementation code, not just task specs or design documents
203
+ - **Verify config/input formats** by reading the parsing code — never assume formats from BDD specs, task descriptions, or documentation
204
+ - Include an error/negative TC only when it validates E2E-exclusive behavior (real CLI parser/runtime/tooling/filesystem) or when unit coverage has a documented gap
205
+ - Verify actual file paths by running the tool first — never hardcode paths from documentation or assumptions
206
+ - Use explicit `&& echo "PASS" || echo "FAIL"` patterns for every verification step
207
+ - Check specific exit codes for error commands (not just "non-zero")
208
+
209
+ **SHOULD (strongly recommended):**
210
+ - Test the real user journey — structure TCs as a sequential workflow, not isolated commands
211
+ - Verify exit codes for all commands, not just error cases
212
+ - Include negative assertions (files/directories that should NOT exist)
213
+ - Capture and check CLI output content, not just exit codes
214
+ - Verify that status values match actual implementation (e.g., `done` vs `completed`)
215
+
216
+ **COST-AWARE (reduce LLM invocations):**
217
+ - Consolidate assertions that share the same CLI invocation into a single TC. For example, after running `ace-lint file.rb`, check exit code, report.json structure, and ok.md existence in ONE TC — not three.
218
+ - Target 2-5 TCs per scenario. More than 5 suggests the scenario is too broad; split into focused scenarios. Fewer than 2 suggests merging with a related scenario.
219
+ - Never create a TC for a single assertion when that assertion could be appended to an existing TC that runs the same command.
220
+
221
+ #### Recommended TC Ordering
222
+
223
+ 1. **Error paths first** — wrong args, missing files, no prior state (run from clean state)
224
+ 2. **Happy path start** — create/init with correct args, verify output
225
+ 3. **Structure verification** — check actual on-disk file structure with negative assertions
226
+ 4. **Lifecycle operations** — status, advance, fail, retry in workflow order
227
+ 5. **End state** — verify completion message, all steps terminal
228
+
229
+ This ordering ensures error TCs run before any state is created (clean environment), and happy-path TCs build on each other sequentially.
230
+
231
+ See: **e2e-testing.g.md § "Avoiding False Positive Tests"** for the full list of anti-patterns and the reviewer checklist.
232
+
233
+ #### CLI-Based Testing Requirement
234
+
235
+ **E2E tests MUST test through the CLI interface, not library imports.**
236
+
237
+ **Valid approach:**
238
+ ```bash
239
+ OUTPUT=$(ace-review --preset code --subject "diff:HEAD~1" --auto-execute 2>&1)
240
+ EXIT_CODE=$?
241
+ [ "$EXIT_CODE" -eq 0 ] && echo "PASS" || echo "FAIL"
242
+ ```
243
+
244
+ **Invalid approach (this is integration/unit testing, not E2E):**
245
+ ```bash
246
+ bundle exec ruby -e '
247
+ require_relative "lib/ace/review"
248
+ result = Ace::Review::SomeClass.method(args)
249
+ '
250
+ ```
251
+
252
+ **For execution tests (LLM, API calls):**
253
+ - Use `--auto-execute` to make real API calls
254
+ - Using only `--dry-run` cannot verify actual execution behavior
255
+ - Keep costs minimal: cheap models, tiny prompts, small diffs
256
+
257
+ #### Common Anti-Patterns to Avoid
258
+
259
+ **Writing tests from design specs before implementation:**
260
+ - Task descriptions and BDD specs often describe *intended* behavior with *proposed* config formats
261
+ - The actual implementation may use different formats, different commands, or different workflows
262
+ - Example: A spec might describe `jobs:` with explicit `number:` and `parent:` fields, but implementation uses `steps:` with auto-generated numbers and dynamic hierarchy via `add --after --child`
263
+ - **Fix:** Always read the actual implementation code (especially config parsing) before writing test data
264
+
265
+ **Assuming static vs dynamic behavior:**
266
+ - Tests may assume features work at config-time (static) when they actually work at runtime (dynamic)
267
+ - Example: Assuming hierarchy is defined in config when it's actually built dynamically via commands
268
+ - **Fix:** Trace the actual code path for the feature being tested
269
+
270
+ **Splitting one command into many redundant TCs:**
271
+ - Multiple TCs each validate one assertion after the same CLI invocation, creating overlap with unit tests and increasing run cost
272
+ - Example: TC-A checks exit code, TC-B checks report file, TC-C checks summary text for the same command run
273
+ - **Fix:** Consolidate those assertions into one TC and move formatter/parser details to unit tests
274
+
275
+ **Example for "Test config file validation":**
276
+ ```markdown
277
+ ## Test Cases
278
+
279
+ ### TC-001: Error — Missing Config File
280
+ **Objective:** Verify that a nonexistent config file produces exit code 3 and a clear error
281
+
282
+ ### TC-002: Error — Malformed YAML Config
283
+ **Objective:** Verify malformed YAML is handled gracefully with actionable error message
284
+
285
+ ### TC-003: Valid Config File
286
+ **Objective:** Verify valid configuration files are accepted
287
+
288
+ ### TC-004: Verify On-Disk Structure
289
+ **Objective:** Check actual file paths created, with negative assertions for wrong paths
290
+ ```
291
+
292
+ ### 9. Write Test Files
293
+
294
+ Create the scenario directory with separate files:
295
+ ```bash
296
+ mkdir -p {PACKAGE}/test/e2e/TS-{AREA}-{NNN}-{slug}
297
+ ```
298
+
299
+ Write `scenario.yml` (metadata and setup):
300
+ ```
301
+ {PACKAGE}/test/e2e/TS-{AREA}-{NNN}-{slug}/scenario.yml
302
+ ```
303
+
304
+ Write scenario pair configs:
305
+ ```
306
+ {PACKAGE}/test/e2e/TS-{AREA}-{NNN}-{slug}/runner.yml.md
307
+ {PACKAGE}/test/e2e/TS-{AREA}-{NNN}-{slug}/verifier.yml.md
308
+ ```
309
+
310
+ Write individual TC runner/verifier files for each test case:
311
+ ```
312
+ {PACKAGE}/test/e2e/TS-{AREA}-{NNN}-{slug}/TC-001-{tc-slug}.runner.md
313
+ {PACKAGE}/test/e2e/TS-{AREA}-{NNN}-{slug}/TC-001-{tc-slug}.verify.md
314
+ ```
315
+
316
+ Optionally create a fixtures directory if test data is needed:
317
+ ```bash
318
+ mkdir -p {PACKAGE}/test/e2e/TS-{AREA}-{NNN}-{slug}/fixtures
319
+ ```
320
+
321
+ Example: `ace-lint/test/e2e/TS-LINT-003-config-file-validation/scenario.yml`
322
+
323
+ ### 10. Report Result
324
+
325
+ Output a summary:
326
+
327
+ ```markdown
328
+ ## E2E Test Created
329
+
330
+ **Test ID:** TS-{AREA}-{NNN}
331
+ **Format:** TS (directory-based)
332
+ **Package:** {package}
333
+ **Directory:** {PACKAGE}/test/e2e/TS-{AREA}-{NNN}-{slug}/
334
+ **Files:**
335
+ - scenario.yml
336
+ - runner.yml.md
337
+ - verifier.yml.md
338
+ - TC-001-{tc-slug}.runner.md
339
+ - TC-001-{tc-slug}.verify.md
340
+
341
+ ### Next Steps
342
+
343
+ 1. Review and customize `scenario.yml` and TC files
344
+ 2. Add fixtures to the `fixtures/` directory if needed
345
+ 3. Review the E2E Decision Record and ensure `unit-coverage-reviewed` is populated
346
+ 4. Run the test with `ace-test-e2e {PACKAGE} TS-{AREA}-{NNN}`
347
+ 5. Update `last-verified` after successful execution
348
+ ```
349
+
350
+ ## Example Invocations
351
+
352
+ **Create a test** (arguments: `PACKAGE=ace-lint`, `AREA=LINT`):
353
+ ```bash
354
+ ace-bundle wfi://e2e/create
355
+ ```
356
+
357
+ Creates: `ace-lint/test/e2e/TS-LINT-003-new-test-scenario/` with `scenario.yml` and TC files.
358
+
359
+ **Create a contextual test** (arguments: `PACKAGE=ace-lint`, `AREA=LINT`, `--context "Test config file validation"`):
360
+ ```bash
361
+ ace-bundle wfi://e2e/create
362
+ ```
363
+
364
+ Creates: `ace-lint/test/e2e/TS-LINT-003-config-file-validation/` with `scenario.yml` and TC files for config validation.
365
+
366
+ **Create a test for a new area** (arguments: `PACKAGE=ace-review`, `AREA=COMMENT`, `--context "PR comment threading"`):
367
+ ```bash
368
+ ace-bundle wfi://e2e/create
369
+ ```
370
+
371
+ Creates: `ace-review/test/e2e/TS-COMMENT-001-pr-comment-threading/` with `scenario.yml` and TC files.
372
+
373
+ ## Error Handling
374
+
375
+ ### Package Not Found
376
+
377
+ ```
378
+ Error: Package '{package}' not found.
379
+
380
+ Available packages:
381
+ - ace-lint
382
+ - ace-review
383
+ - ace-test-runner-e2e
384
+ ```
385
+
386
+ ### Invalid Area Code
387
+
388
+ ```
389
+ Error: Invalid area code '{area}'.
390
+
391
+ Area codes must be:
392
+ - 2-10 characters
393
+ - Alphanumeric only
394
+ - Case-insensitive (input is converted to uppercase)
395
+ ```
@@ -0,0 +1,253 @@
1
+ ---
2
+ doc-type: workflow
3
+ title: Execute E2E Test Workflow
4
+ purpose: Execute test cases in a pre-populated sandbox with reporting
5
+ ace-docs:
6
+ last-updated: 2026-03-04
7
+ last-checked: 2026-03-21
8
+ ---
9
+
10
+ # Execute E2E Test Workflow
11
+
12
+ This workflow guides an agent through executing test cases in a **pre-populated sandbox**. The sandbox was created by `SetupExecutor` — this workflow handles only execution and reporting.
13
+
14
+ ## SetupExecutor Contract
15
+
16
+ Before this workflow is invoked, `SetupExecutor` has already:
17
+ - Created an isolated sandbox directory under `.ace-local/test-e2e/`
18
+ - Initialized git (`git init`, user config, `.gitignore`)
19
+ - Installed `mise.toml` for tool version management
20
+ - Created `.ace` symlinks for configuration access
21
+ - Created `results/tc/{NN}/` directories for each TC
22
+ - Copied fixtures from the scenario's `fixtures/` directory
23
+ - Placed `TC-*.runner.md` and `TC-*.verify.md` files in the sandbox
24
+
25
+ Tag filtering happens at discovery time (before `SetupExecutor` runs). By the time this workflow executes, only matching scenarios are included.
26
+
27
+ ## Arguments
28
+
29
+ - `PACKAGE` (required) - Package containing the test (e.g., `ace-lint`)
30
+ - `TEST_ID` (required) - Test identifier (e.g., `TS-LINT-001`)
31
+ - `--sandbox SANDBOX_PATH` (required) - Path to pre-populated sandbox directory
32
+ - `--run-id RUN_ID` (optional) - Pre-generated timestamp ID for deterministic report paths
33
+ - `--env KEY=VALUE[,...]` (optional) - Comma-separated environment variables to set before execution
34
+ - `--verify` (optional) - Enable independent verifier mode (second agent pass with sandbox inspection)
35
+ - `TEST_CASES` (optional) - Comma-separated TC IDs to execute (e.g., `TC-001,tc-003,002`)
36
+
37
+ **TC ID normalization:** `TC-001` (unchanged), `tc-001` → `TC-001`, `001` → `TC-001`, `1` → `TC-001`, `TC-1` → `TC-001`
38
+
39
+ ## Canonical Conventions
40
+
41
+ - `ace-test-e2e` runs single-package scenarios; `ace-test-e2e-suite` runs suite-level execution
42
+ - Scenario IDs: `TS-<PACKAGE_SHORT>-<NNN>[-slug]`
43
+ - Standalone TC pairs: `TC-*.runner.md` + `TC-*.verify.md`
44
+ - TC artifacts: `results/tc/{NN}/`
45
+ - Summary counters: `tcs-passed`, `tcs-failed`, `tcs-total`, `failed[].tc`
46
+
47
+ ## Execution Contract
48
+
49
+ - Runner is execution-only: execute declared TC actions and capture evidence.
50
+ - Verifier is verification-only: determine PASS/FAIL using impact-first ordering:
51
+ 1. sandbox/project state impact
52
+ 2. explicit artifacts
53
+ 3. debug captures (`stdout`/`stderr`/exit) as fallback
54
+ - Do not treat runner TC files as owning setup; setup is owned by `scenario.yml` + fixtures and has already been applied to the sandbox.
55
+
56
+ ## Dual-Agent Verifier
57
+
58
+ When `--verify` is passed (verification is always on for CLI pipeline runs), execution follows a dual-agent pattern:
59
+
60
+ 1. **Runner agent** executes TC steps and produces artifacts in `results/tc/{NN}/`
61
+ 2. **Verifier agent** independently inspects the sandbox and artifacts against `TC-*.verify.md` expectations
62
+ 3. **Report generator** (`PipelineReportGenerator`) produces deterministic summary from verifier output
63
+
64
+ The verifier has no access to the runner's conversation — it evaluates purely from on-disk evidence. This prevents self-confirmation bias.
65
+
66
+ ## Subagent Mode
67
+
68
+ When invoked as a subagent (via Task tool from orchestrator):
69
+
70
+ **Return contract:**
71
+ ```markdown
72
+ - **Test ID**: {test-id}
73
+ - **Status**: pass | fail | partial
74
+ - **Passed**: {count}
75
+ - **Failed**: {count}
76
+ - **Total**: {count}
77
+ - **Report Paths**: {timestamp}-{short-pkg}-{short-id}.*
78
+ - **Issues**: Brief description or "None"
79
+ ```
80
+
81
+ Do NOT return full report contents — they are on disk.
82
+
83
+ ## TC-Level Execution Mode
84
+
85
+ When invoked with `--tc-mode`, only a single TC is executed.
86
+
87
+ **TC-Level Arguments:**
88
+ - `PACKAGE` (required), `TEST_ID` (required), `TC_ID` (required)
89
+ - `--tc-mode` (required), `--sandbox SANDBOX_PATH` (required)
90
+ - `--run-id RUN_ID` (optional)
91
+
92
+ **TC-Level Steps:**
93
+ 1. Verify `SANDBOX_PATH` exists
94
+ 2. `cd SANDBOX_PATH`
95
+ 3. Execute TC steps from the runner file
96
+ 4. Write per-TC reports to `{RUN_ID}-{pkg}-{scenario}-{tc}-reports/`
97
+ 5. Return TC-level contract
98
+
99
+ **TC-Level Rules:**
100
+ - Do NOT create or modify sandbox — `SetupExecutor` already prepared it
101
+ - Execute only the steps described in the TC content
102
+ - Report actual results even if they differ from expected
103
+
104
+ ---
105
+
106
+ ## Sandbox Rules
107
+
108
+ - Do NOT create or modify sandbox setup — it is already prepared
109
+ - Do NOT run environment setup, prerequisite checks, or test data creation
110
+ - Focus exclusively on TC execution and reporting
111
+
112
+ ## Workflow Steps
113
+
114
+ ### 1. Set Up Execution Environment
115
+
116
+ 1. Parse `--env` and export each `KEY=VALUE`
117
+ 2. `cd SANDBOX_PATH`
118
+ 3. Set `TIMESTAMP_ID` from `--run-id` or generate with `ace-b36ts encode`
119
+
120
+ **Expected variables:**
121
+ - `SANDBOX_PATH` — Pre-populated sandbox (cwd)
122
+ - `TIMESTAMP_ID` — Unique run identifier
123
+ - Any variables from `--env`
124
+
125
+ ### 2. Discover and Filter Test Cases
126
+
127
+ Find TC definitions in the sandbox:
128
+
129
+ ```bash
130
+ find "${SANDBOX_PATH}" -name "TC-*.runner.md" -o -name "TC-*.verify.md" 2>/dev/null | sort
131
+ ```
132
+
133
+ List all found TCs before proceeding:
134
+ ```
135
+ Found N test case files:
136
+ - TC-001: {filename}
137
+ - TC-002: {filename}
138
+ ```
139
+
140
+ > **TC FIDELITY RULE:** Execute ONLY discovered `TC-*.runner.md` + `TC-*.verify.md` pairs. Do NOT invent TCs. Every runner must have a matching verifier and vice versa. Missing pairs are errors — report them and skip the unmatched TC.
141
+
142
+ If `TEST_CASES` argument provided, normalize IDs to `TC-NNN` format and filter. Only execute matching TCs.
143
+
144
+ ### 3. Execute Test Cases
145
+
146
+ > **Use `ace-test-e2e-sh "$SANDBOX_PATH"` for ALL commands.**
147
+
148
+ For each TC (TC-NNN):
149
+
150
+ 1. **Check filter** — if the `TEST_CASES` argument was provided, skip any TC that is not in the normalized filter list from Step 2
151
+ 2. **Read** the runner file objective
152
+ 3. **Execute** runner steps, save artifacts to `results/tc/{NN}/`
153
+ 4. **Capture** exit codes, output, error messages
154
+ 5. **Evaluate** against verifier expectations
155
+ 6. **Record** Pass/Fail with per-TC evidence
156
+
157
+ **Self-check:** Before writing reports, verify your result table has exactly N rows matching discovered TCs (or filtered subset).
158
+
159
+ Track friction points for the experience report.
160
+
161
+ ### 4. Write Reports
162
+
163
+ Write three report files to `${SANDBOX_PATH}-reports/`.
164
+
165
+ ```bash
166
+ REPORT_DIR="${SANDBOX_PATH}-reports"
167
+ mkdir -p "$REPORT_DIR"
168
+ ```
169
+
170
+ Replace all `{placeholder}` values with actual data.
171
+
172
+ #### 4.1 summary.r.md
173
+
174
+ ```yaml
175
+ ---
176
+ test-id: {test-id}
177
+ package: {package}
178
+ agent: {agent-name}
179
+ executed: {timestamp}
180
+ status: pass|fail|partial|incomplete
181
+ tcs-passed: {count}
182
+ tcs-failed: {count}
183
+ tcs-total: {count}
184
+ score: "{passed}/{total}"
185
+ verdict: pass|fail|partial|incomplete
186
+ filtered: true|false
187
+ failed:
188
+ - tc: TC-NNN
189
+ category: tool-bug|runner-error|test-spec-error|infrastructure-error
190
+ evidence: "brief evidence"
191
+ ---
192
+ ```
193
+
194
+ Followed by test information table, results summary, and TC evaluation details.
195
+
196
+ #### 4.2 experience.r.md
197
+
198
+ Agent experience report with friction points, root cause analysis, improvement suggestions, and positive observations.
199
+
200
+ #### 4.3 metadata.yml
201
+
202
+ ```yaml
203
+ run-id: "{TIMESTAMP_ID}"
204
+ test-id: "{test-id}"
205
+ package: "{package}"
206
+ status: "{status}"
207
+ score: {0.0-1.0}
208
+ verdict: pass|partial|fail
209
+ tcs-passed: {count}
210
+ tcs-failed: {count}
211
+ tcs-total: {count}
212
+ failed:
213
+ - tc: TC-NNN
214
+ category: tool-bug|runner-error|test-spec-error|infrastructure-error
215
+ evidence: "brief evidence"
216
+ test_cases:
217
+ filtered: true|false
218
+ executed: [TC-001, TC-003]
219
+ git:
220
+ branch: "{branch}"
221
+ commit: "{short-sha}"
222
+ ```
223
+
224
+ #### 4.4 Report file paths
225
+
226
+ ```
227
+ Reports written:
228
+ - ${REPORT_DIR}/summary.r.md
229
+ - ${REPORT_DIR}/experience.r.md
230
+ - ${REPORT_DIR}/metadata.yml
231
+ ```
232
+
233
+ ### 5. Return Summary
234
+
235
+ ```markdown
236
+ ## E2E Test Execution Report
237
+ **Test ID:** {test-id} | **Package:** {package} | **Status:** {PASS/FAIL}
238
+
239
+ | Test Case | Description | Status |
240
+ |-----------|-------------|--------|
241
+ | TC-001 | ... | Pass |
242
+
243
+ Reports: `.ace-local/test-e2e/{timestamp}-{short-pkg}-{short-id}-reports/`
244
+ ```
245
+
246
+ ## Error Handling
247
+
248
+ | Failure | Action |
249
+ |---------|--------|
250
+ | TC fails | Record details, continue remaining TCs, include in report |
251
+ | Sandbox missing/corrupted | Report error, do NOT recreate, return error summary |
252
+ | TC filter mismatch | STOP, do not write reports, offer re-run |
253
+ | Missing TC pair file | Report error for that TC, skip it, continue others |