ace-test-runner-e2e 0.29.8 → 0.40.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. checksums.yaml +4 -4
  2. data/.ace-defaults/e2e-runner/config.yml +14 -2
  3. data/CHANGELOG.md +233 -0
  4. data/README.md +2 -2
  5. data/exe/ace-test-e2e-sh +9 -4
  6. data/handbook/guides/e2e-testing.g.md +75 -9
  7. data/handbook/guides/scenario-yml-reference.g.md +21 -8
  8. data/handbook/guides/tc-authoring.g.md +23 -5
  9. data/handbook/skills/as-e2e-fix/SKILL.md +2 -2
  10. data/handbook/skills/as-e2e-review/SKILL.md +2 -2
  11. data/handbook/templates/ace-taskflow-fixture.template.md +17 -17
  12. data/handbook/templates/agent-experience-report.template.md +3 -2
  13. data/handbook/templates/scenario.yml.template.yml +7 -2
  14. data/handbook/templates/tc-file.template.md +16 -4
  15. data/handbook/workflow-instructions/e2e/analyze-failures.wf.md +53 -6
  16. data/handbook/workflow-instructions/e2e/create.wf.md +128 -25
  17. data/handbook/workflow-instructions/e2e/execute.wf.md +11 -7
  18. data/handbook/workflow-instructions/e2e/fix.wf.md +84 -15
  19. data/handbook/workflow-instructions/e2e/plan-changes.wf.md +33 -1
  20. data/handbook/workflow-instructions/e2e/review.wf.md +40 -25
  21. data/handbook/workflow-instructions/e2e/rewrite.wf.md +22 -8
  22. data/handbook/workflow-instructions/e2e/run.wf.md +50 -26
  23. data/handbook/workflow-instructions/e2e/setup-sandbox.wf.md +4 -4
  24. data/lib/ace/test/end_to_end_runner/atoms/artifact_contract_validator.rb +138 -0
  25. data/lib/ace/test/end_to_end_runner/atoms/skill_prompt_builder.rb +7 -5
  26. data/lib/ace/test/end_to_end_runner/atoms/skill_result_parser.rb +73 -7
  27. data/lib/ace/test/end_to_end_runner/cli/commands/run_suite.rb +195 -5
  28. data/lib/ace/test/end_to_end_runner/cli/commands/run_test.rb +58 -9
  29. data/lib/ace/test/end_to_end_runner/models/test_case.rb +8 -2
  30. data/lib/ace/test/end_to_end_runner/models/test_result.rb +9 -3
  31. data/lib/ace/test/end_to_end_runner/models/test_scenario.rb +4 -2
  32. data/lib/ace/test/end_to_end_runner/molecules/affected_detector.rb +7 -2
  33. data/lib/ace/test/end_to_end_runner/molecules/artifact_pruner.rb +61 -0
  34. data/lib/ace/test/end_to_end_runner/molecules/bwrap_sandbox_backend.rb +271 -0
  35. data/lib/ace/test/end_to_end_runner/molecules/config_loader.rb +28 -1
  36. data/lib/ace/test/end_to_end_runner/molecules/integration_runner.rb +122 -0
  37. data/lib/ace/test/end_to_end_runner/molecules/pipeline_executor.rb +235 -18
  38. data/lib/ace/test/end_to_end_runner/molecules/pipeline_prompt_bundler.rb +164 -13
  39. data/lib/ace/test/end_to_end_runner/molecules/pipeline_report_generator.rb +91 -19
  40. data/lib/ace/test/end_to_end_runner/molecules/pipeline_sandbox_builder.rb +121 -18
  41. data/lib/ace/test/end_to_end_runner/molecules/report_writer.rb +15 -12
  42. data/lib/ace/test/end_to_end_runner/molecules/sandbox_runtime_builder.rb +374 -0
  43. data/lib/ace/test/end_to_end_runner/molecules/scenario_loader.rb +83 -5
  44. data/lib/ace/test/end_to_end_runner/molecules/setup_executor.rb +121 -16
  45. data/lib/ace/test/end_to_end_runner/molecules/suite_report_writer.rb +422 -97
  46. data/lib/ace/test/end_to_end_runner/molecules/test_discoverer.rb +38 -13
  47. data/lib/ace/test/end_to_end_runner/molecules/test_executor.rb +27 -5
  48. data/lib/ace/test/end_to_end_runner/organisms/suite_orchestrator.rb +98 -18
  49. data/lib/ace/test/end_to_end_runner/organisms/test_orchestrator.rb +159 -19
  50. data/lib/ace/test/end_to_end_runner/version.rb +1 -1
  51. data/lib/ace/test/end_to_end_runner.rb +4 -0
  52. metadata +21 -2
@@ -56,6 +56,15 @@ Build a change inventory:
56
56
  - **Removed features** — deleted files or deprecated modules
57
57
  - **Unchanged features** — stable code with no recent modifications
58
58
 
59
+ Before classifying TCs, also check whether the package change alters a public contract that downstream retained E2E tests commonly pin:
60
+ - status words
61
+ - JSON keys or output schema
62
+ - CLI command/flag shapes
63
+ - lifecycle semantics
64
+ - ownership/state semantics
65
+
66
+ If yes, add an explicit downstream retained-E2E sweep list to the plan instead of limiting scope to the package under edit.
67
+
59
68
  ### 3. Classify Each Existing TC
60
69
 
61
70
  For each TC listed in the coverage matrix, assign exactly one classification:
@@ -71,6 +80,7 @@ For REMOVE due to overlap, replacement evidence is mandatory:
71
80
 
72
81
  **KEEP** — The TC has genuine E2E value and needs no changes. Criteria (all must be true):
73
82
  - TC passes the E2E Value Gate (tests real CLI binary + external tools + filesystem I/O)
83
+ - TC passes the Public-Surface Gate (user can do the job from docs/usage/`--help` without hidden recipes or workarounds)
74
84
  - Related source code has no changes since `last-verified`
75
85
  - TC structure is valid and assertions are current
76
86
 
@@ -79,6 +89,8 @@ For REMOVE due to overlap, replacement evidence is mandatory:
79
89
  - TC scope is too broad (should be narrowed to only E2E-exclusive aspects)
80
90
  - TC scope is too narrow (missing assertions for related behavior in same CLI invocation)
81
91
  - TC has structure issues flagged in the review
92
+ - TC has undeclared or wildcard verifier-dependent artifact paths
93
+ - TC is hidden-recipe-driven or workaround-driven but the underlying user job should still be supported by the public surface after scenario/docs/help correction
82
94
 
83
95
  **CONSOLIDATE** — The TC should merge with another TC. Criteria (any one is sufficient):
84
96
  - Multiple TCs share the same CLI invocation and could be a single TC with multiple assertions
@@ -91,6 +103,7 @@ For each classification, document:
91
103
  - For REMOVE (overlap): replacement evidence (`existing unit tests` or `planned unit backfill`)
92
104
  - For MODIFY: what specifically needs to change
93
105
  - For CONSOLIDATE: the target TC and which assertions merge
106
+ - Whether the current TC is public-surface-valid, hidden-recipe-driven, workaround-driven, or checking an unsupported internal detail
94
107
 
95
108
  ### 4. Identify New TCs Needed
96
109
 
@@ -112,6 +125,12 @@ For each candidate, answer: "Does this require the full CLI binary + real extern
112
125
  - If NO: skip — unit tests cover this (or add explicit unit test action if coverage is missing)
113
126
  - If YES: include in the plan
114
127
 
128
+ **Filter through Public-Surface Gate:**
129
+ For each candidate, answer: "Can a user do this job through the public tool surface without hidden recipes or workarounds?"
130
+ - If NO, but the job should be supported: add a product/docs/help improvement action and do not encode the workaround into the TC
131
+ - If NO because the detail is not user-visible: skip or narrow the TC
132
+ - If YES: keep planning the TC
133
+
115
134
  ### 5. Propose Scenario Structure
116
135
 
117
136
  Group all planned TCs (KEEP + MODIFY + CONSOLIDATE targets + ADD) into scenarios:
@@ -176,6 +195,19 @@ Format the complete change plan:
176
195
  |----|---------------|
177
196
  | {tc-id} | Update assertions — {feature} behavior changed in {commit} |
178
197
  | {tc-id} | Narrow scope — remove assertions covered by unit tests |
198
+ | {tc-id} | Remove hidden recipe / workaround dependence — rewrite around public docs/help/CLI path |
199
+
200
+ ### Public-Surface Gaps ({n} actions)
201
+
202
+ | Action | Target | Why |
203
+ |--------|--------|-----|
204
+ | Update docs/help/CLI | {package/path} | {job is valid but current public surface is too weak for the E2E path} |
205
+
206
+ ### Downstream Retained-E2E Sweep ({n} actions)
207
+
208
+ | Scenario | Trigger | Change Needed |
209
+ |----------|---------|---------------|
210
+ | {scenario-id} | {renamed key / lifecycle shift / command-shape change} | {update retained verifier/runner contract} |
179
211
 
180
212
  ### CONSOLIDATE ({n} TCs → {n} TCs)
181
213
 
@@ -252,4 +284,4 @@ implementation code and at least one E2E test before planning changes.
252
284
  If the user rejects the plan:
253
285
  1. Ask which classifications they disagree with
254
286
  2. Adjust the plan based on feedback
255
- 3. Re-present the updated plan
287
+ 3. Re-present the updated plan
@@ -13,7 +13,10 @@ This workflow performs deep exploration of a package to produce a **coverage mat
13
13
 
14
14
  During review, treat the runner/verifier split as a first-class quality check:
15
15
  - Runner must be execution-only (no verdict language).
16
- - Verifier must be impact-first (sandbox impact before artifacts/debug).
16
+ - Verifier must be impact-first (sandbox impact before runner observations and debug).
17
+ - `results/tc/{NN}/` must contain only declared verifier-dependent evidence.
18
+ - Every verifier-dependent artifact path must be declared by runner/setup; verifier-only or wildcard references are contract drift.
19
+ - Goal-style TCs must also pass the public-surface check: the runner should be able to do the job from docs/usage/`--help` and the tool under test, without hidden recipes or workarounds.
17
20
 
18
21
  **Pipeline position:** Stage 1 of 3 (Explore)
19
22
 
@@ -86,9 +89,8 @@ Map what unit tests cover at each layer:
86
89
 
87
90
  **List all test files by layer:**
88
91
  ```bash
89
- find {PACKAGE}/test/atoms -name "*_test.rb" 2>/dev/null | sort
90
- find {PACKAGE}/test/molecules -name "*_test.rb" 2>/dev/null | sort
91
- find {PACKAGE}/test/organisms -name "*_test.rb" 2>/dev/null | sort
92
+ find {PACKAGE}/test/fast -name "*_test.rb" 2>/dev/null | sort
93
+ find {PACKAGE}/test/feat -name "*_test.rb" 2>/dev/null | sort
92
94
  ```
93
95
 
94
96
  **For each test file:**
@@ -100,7 +102,7 @@ Build a unit test map:
100
102
 
101
103
  | Test File | Layer | Feature Covered | Test Count | Assertion Count |
102
104
  |-----------|-------|-----------------|------------|-----------------|
103
- | {path} | atom | {feature} | {n} | {n} |
105
+ | {path} | fast/feat | {feature} | {n} | {n} |
104
106
 
105
107
  ### 4. Inventory Existing E2E Coverage
106
108
 
@@ -116,22 +118,35 @@ find {PACKAGE}/test/e2e -name "scenario.yml" -path "*/TS-*" 2>/dev/null | sort
116
118
  - `tags`, `cost-tier`, `e2e-justification`, `unit-coverage-reviewed`
117
119
  - `last-verified`, `verified-by`
118
120
  - Extract the objective (what the TC verifies)
121
+ - Record TC style:
122
+ - `public-surface`
123
+ - `retained-contract`
124
+ - Record the TC's primary oracle:
125
+ - final sandbox state / real product output
126
+ - runner observations as supporting context
127
+ - debug fallback only when necessary
128
+ - Record whether the job is achievable from the public surface:
129
+ - `valid`
130
+ - `hidden-recipe-driven`
131
+ - `workaround-driven`
132
+ - `unsupported-detail`
133
+ - Record qualitative friction:
134
+ - `low`, `medium`, `high`
119
135
  - Identify which CLI commands the TC runs
120
136
  - Record command fingerprint (`command + key flags`) for each command assertion
121
- - Count verification steps (PASS/FAIL checks)
122
137
  - Map to the feature it tests
123
138
  - Mark TC evidence status:
124
- - `complete` when `e2e-justification` is present, command artifacts are present, and `unit-coverage-reviewed` has at least one path
139
+ - `complete` when `e2e-justification` is present, the verifier is end-state-first, and `unit-coverage-reviewed` has at least one path
125
140
  - `missing` otherwise
126
- - `at-risk` when evidence is existence-only or duplicate command invocations are detected
141
+ - `at-risk` when evidence is existence-only or helper-artifact-driven, duplicate command invocations are detected, the TC is hidden-recipe/workaround-driven, or verifier-dependent artifacts are undeclared
127
142
 
128
143
  If `--scope` was provided, filter to only the specified scenario.
129
144
 
130
145
  Build an E2E test map:
131
146
 
132
- | TC ID | Title | Command Invocations | Feature Tested | Verifications | Tags | Cost Tier | E2E Justification | Unit Coverage Reviewed | Evidence | False-Positive Risk |
133
- |-------|-------|-------------|----------------|---------------|------|-----------|-------------------|------------------------|----------|
134
- | {id} | {title} | {command list} | {feature} | {n} | {tags} | {tier} | {reason or "(missing)"} | {files or "(missing)"} | {complete/missing/at-risk} | {low/medium/high} |
147
+ | TC ID | Style | Title | Command Invocations | Feature Tested | Primary Oracle | Public Surface Fit | Artifact Contract | Friction | Tags | Cost Tier | E2E Justification | Unit Coverage Reviewed | Evidence | False-Positive Risk |
148
+ |-------|-------|-------|-------------|----------------|----------------|--------------------|-------------------|----------|------|-----------|-------------------|------------------------|----------|---------------------|
149
+ | {id} | {public-surface/retained-contract} | {title} | {command list} | {feature} | {state / output / observations+fallback} | {valid/hidden-recipe/workaround/unsupported-detail} | {declared/undeclared/wildcard} | {low/medium/high} | {tags} | {tier} | {reason or "(missing)"} | {files or "(missing)"} | {complete/missing/at-risk} | {low/medium/high} |
135
150
 
136
151
  ### 5. Build Coverage Matrix
137
152
 
@@ -139,7 +154,7 @@ Combine the three inventories into a single coverage matrix:
139
154
 
140
155
  **Matrix structure:**
141
156
  - **Rows:** Features/behaviors from step 2
142
- - **Columns:** Unit Tests (atoms/molecules/organisms) | E2E Tests
157
+ - **Columns:** Unit Tests (`fast`/`feat`) | E2E Tests
143
158
  - **Cells:** Test file references + counts, or "none"
144
159
 
145
160
  ```markdown
@@ -147,10 +162,10 @@ Combine the three inventories into a single coverage matrix:
147
162
 
148
163
  | Feature | Unit Tests | E2E Tests | Evidence Strength | False-Positive Risk | Status |
149
164
  |---------|-----------|-----------|------------------|----------------------|--------|
150
- | {feature} | {test files} ({n} assertions) | {TC IDs} ({n} verifications) | command-output/state+content | low | Covered |
165
+ | {feature} | {test files} ({n} assertions) | {TC IDs} | state+content + observations | low | Covered |
151
166
  | {feature} | {test files} ({n} assertions) | none | none | n/a | Unit-only |
152
- | {feature} | none | {TC IDs} ({n} verifications) | command-output | low | E2E-only |
153
- | {feature} | {test files} ({n} assertions) | {TC IDs} ({n} verifications) | command-output or existence-only | medium/high | Overlap |
167
+ | {feature} | none | {TC IDs} | state+content | low | E2E-only |
168
+ | {feature} | {test files} ({n} assertions) | {TC IDs} | debug-heavy, helper-artifact-driven, or workaround-driven | medium/high | Overlap |
154
169
  | {feature} | none | none | none | high | Gap |
155
170
  ```
156
171
 
@@ -171,7 +186,7 @@ Produce the full review report with actionable findings:
171
186
 
172
187
  **Reviewed:** {timestamp}
173
188
  **Scope:** {package-wide or scenario-id}
174
- **Workflow version:** 2.1
189
+ **Workflow version:** 2.2
175
190
 
176
191
  ### Summary
177
192
 
@@ -182,8 +197,8 @@ Produce the full review report with actionable findings:
182
197
  | Unit assertions | {n} |
183
198
  | E2E scenarios | {n} |
184
199
  | E2E test cases | {n} |
185
- | TCs with decision evidence | {n}/{total} |
186
- | High-risk false-positive TCs | {n}/{total} |
200
+ | TCs with end-state-first evidence | {n}/{total} |
201
+ | High-risk helper-artifact TCs | {n}/{total} |
187
202
 
188
203
  ### Coverage Matrix
189
204
 
@@ -196,19 +211,19 @@ TCs that may fail the E2E Value Gate (unit tests cover the same behavior or high
196
211
  | TC ID | Feature | Overlapping Unit Tests | Recommendation |
197
212
  |-------|---------|----------------------|----------------|
198
213
  | {id} | {feature} | {test files} | Remove — unit tests cover this fully |
199
- | {id} | {feature} | {test files} | Keep — TC tests CLI pipeline, units test logic |
200
- | {id} | {feature} | {test files} | Strengthen — currently existence-only or duplicate command assertions |
214
+ | {id} | {feature} | {test files} | Keep — TC tests real CLI journey and final integrated outcome |
215
+ | {id} | {feature} | {test files} | Strengthen — currently helper-artifact-driven, workaround-driven, or debug-heavy |
201
216
 
202
217
  **Candidates for removal:** {n} TCs have full overlap with unit tests
203
218
 
204
219
  ### E2E Decision Record Coverage
205
220
 
206
- | TC ID | Evidence Status | Missing Fields |
207
- |-------|------------------|----------------|
208
- | {id} | complete | none |
209
- | {id} | missing | e2e-justification, unit-coverage-reviewed |
221
+ | TC ID | Style | Evidence Status | Public Surface Fit | Artifact Contract | Friction | Missing Fields / Contract Drift |
222
+ |-------|-------|------------------|--------------------|-------------------|----------|-------------------------------|
223
+ | {id} | public-surface | complete | valid | declared | low | none |
224
+ | {id} | retained-contract | missing | hidden-recipe-driven | undeclared | high | e2e-justification, unit-coverage-reviewed, end-state oracle |
210
225
 
211
- **Action:** Any TC with missing evidence should be updated in `scenario.yml` during the next rewrite cycle.
226
+ **Action:** Any TC with missing evidence, undeclared/wildcard artifact drift, hidden recipes, workaround dependence, or unsupported internal-detail checks should be updated during the next rewrite cycle.
212
227
 
213
228
  ### Gap Analysis
214
229
 
@@ -30,7 +30,8 @@ ace-bundle wfi://e2e/review → ace-bundle wfi://e2e/plan-changes → ace-bu
30
30
 
31
31
  - Keep scenario IDs in `TS-<PACKAGE_SHORT>-<NNN>[-slug]`
32
32
  - Keep standalone pairs as `TC-*.runner.md` + `TC-*.verify.md`
33
- - Keep TC artifact outputs under `results/tc/{NN}/`
33
+ - Keep TC outcome artifacts under `results/tc/{NN}/`
34
+ - Keep runner observations in harness reports, not sandbox helper files
34
35
  - Keep summary report fields as `tcs-passed`, `tcs-failed`, `tcs-total`, `failed[].tc`
35
36
  - CLI split reminder:
36
37
  - `ace-test-e2e` runs single-package tests
@@ -41,6 +42,10 @@ ace-bundle wfi://e2e/review → ace-bundle wfi://e2e/plan-changes → ace-bu
41
42
  - Normalize runner files to execution-only language.
42
43
  - Normalize verifier files to verdict-only, impact-first validation.
43
44
  - Keep setup concerns in `scenario.yml` and fixtures, not in TC runner setup sections.
45
+ - Keep only declared verifier-dependent evidence under `results/tc/{NN}/`.
46
+ - Move verifier-only artifact references into explicit runner/setup declarations.
47
+ - Replace wildcard artifact paths with exact declared files.
48
+ - Rewrite goal-style TCs around the public user path. Do not preserve hidden recipes, workaround branches, or supporting-tool probes as the way the runner reaches the goal.
44
49
 
45
50
  ## Workflow Steps
46
51
 
@@ -120,16 +125,20 @@ Follow the E2E test writing rules:
120
125
 
121
126
  - **Run the tool first** to verify actual behavior before writing assertions
122
127
  - Apply the E2E Value Gate — every TC must require real CLI binary + external tools + filesystem I/O
123
- - Use `&& echo "PASS" || echo "FAIL"` patterns for every verification step
124
128
  - Follow TC ordering: error paths first, happy path, structure verification, lifecycle, end state
125
129
  - Consolidate assertions sharing the same CLI invocation into a single TC
126
130
  - Target 2-5 TCs per scenario
127
131
  - Test through the CLI interface, not library imports
128
- - Add command-level evidence in every runner:
129
- - command output (`*.stdout`/`*.stderr`)
130
- - command exit status (`*.exit`)
131
- - Add at least one behavioral/content assertion per command assertion set
132
+ - Write runner goals as “do the job” outcomes, not “write a report for the verifier” chores
133
+ - Keep `results/tc/{NN}/` for declared verifier-dependent evidence only; avoid undeclared helper YAML, reflections, and verifier-facing manifests
134
+ - Small supporting captures under `results/tc/{NN}/` are acceptable only when they are declared, explicit, and necessary
135
+ - Use runner observations as the only non-filesystem secondary evidence source
136
+ - Make final sandbox state or real product output the primary oracle whenever possible
137
+ - Add behavioral/content assertions only when CLI output itself is part of the user-visible outcome
132
138
  - Remove duplicate command-only TCs; fold related assertions into one TC where possible
139
+ - Do not encode exact workaround procedures, hidden command recipes, or internal debugging tricks the user would not infer from docs/usage/`--help`
140
+ - For watch/live-output flows, rewrite to a bounded-session pattern with explicit shutdown evidence
141
+ - If the job is valid but the public surface is too weak, plan a product/docs/help fix instead of hardcoding the workaround into the TC
133
142
 
134
143
  **Load the TC template for reference:**
135
144
  ```bash
@@ -146,7 +155,11 @@ For each TC classified as MODIFY:
146
155
  - **Narrow scope** — remove assertions that unit tests cover, keep only E2E-exclusive checks
147
156
  - **Broaden scope** — add assertions for related behavior tested by the same CLI invocation
148
157
  - **Fix structure** — add missing sections, fix formatting issues
149
- - **Add evidence gates** — if the existing TC relies on existence-only or missing exit/status checks, add explicit command output assertions and `.exit` captures
158
+ - **Replace helper-artifact oracles** — if the existing TC relies on runner-written helper files, rewrite it around final sandbox state plus runner observations
159
+ - **Declare verifier-dependent artifacts** — if the verifier names a `results/tc/...` file, ensure the runner or setup declares the exact same path
160
+ - **Remove wildcard declarations** — replace `results/tc/.../*` or `results/tc/.../foo.*` with exact paths
161
+ - **Add evidence gates** — if the existing TC relies on existence-only checks or lacks end-state checks, strengthen the primary oracle before falling back to debug captures
162
+ - **Remove hidden recipes/workarounds** — if the existing TC teaches the runner how to bypass the public surface, rewrite it around the supported user path or narrow/remove the TC
150
163
  3. Update the `last-verified` field if the TC was re-run during modification
151
164
  4. Write the updated TC runner/verifier files
152
165
 
@@ -234,7 +247,8 @@ Present the execution summary:
234
247
  - [ ] TC count matches plan: {yes/no}
235
248
  - [ ] No stale references: {yes/no}
236
249
  - [ ] All scenarios have 2-5 TCs: {yes/no}
237
- - [ ] All modified/created TCs include command output + exit artifacts: {yes/no}
250
+ - [ ] Modified/created TCs avoid undeclared helper files in `results/tc/{NN}/`: {yes/no}
251
+ - [ ] Modified/created TCs declare every verifier-dependent artifact path: {yes/no}
238
252
 
239
253
  ### Next Steps
240
254
 
@@ -1,4 +1,12 @@
1
1
  ---
2
+ name: e2e-run
3
+ description: Execute an E2E test scenario with full agent guidance
4
+ allowed-tools:
5
+ - Bash(ace-bundle:*)
6
+ - Read
7
+ - Write
8
+ - Glob
9
+ - Grep
2
10
  doc-type: workflow
3
11
  title: Run E2E Test Workflow
4
12
  purpose: Execute an E2E test scenario with full agent guidance
@@ -13,7 +21,7 @@ This workflow guides an agent through executing an E2E test scenario. It support
13
21
 
14
22
  ## Arguments
15
23
 
16
- - `PACKAGE` (optional) - Package containing the test (e.g., `ace-lint`). If omitted, looks for `test/e2e/` in project root.
24
+ - `PACKAGE` (optional) - Package containing the test (e.g., `ace-lint`). If omitted, discovery uses `test/feat/` and `test/e2e/` in the project root.
17
25
  - `TEST_ID` (optional) - Test identifier (e.g., `TS-LINT-001`). If omitted, runs all tests.
18
26
  - `--run-id RUN_ID` (optional) - Pre-generated timestamp ID for deterministic report paths.
19
27
  - `--report-dir PATH` (optional) - Explicit report directory path (skips computed `${TEST_DIR}-reports`).
@@ -33,18 +41,25 @@ This workflow guides an agent through executing an E2E test scenario. It support
33
41
  - `ace-test-e2e` runs single-package scenarios; `ace-test-e2e-suite` runs suite-level execution
34
42
  - Scenario IDs: `TS-<PACKAGE_SHORT>-<NNN>[-slug]`
35
43
  - Standalone TC pairs: `TC-*.runner.md` + `TC-*.verify.md`
36
- - TC artifacts: `results/tc/{NN}/`
44
+ - TC outcome artifacts: `results/tc/{NN}/`
37
45
  - Summary counters: `tcs-passed`, `tcs-failed`, `tcs-total`, `failed[].tc`
38
46
  - Tag filtering happens at discovery time (before sandbox setup)
39
47
 
40
48
  ## Execution Contract
41
49
 
42
- - Runner instructions are execution-only: perform actions and write evidence.
50
+ - Runner instructions are execution-only: perform actions and return final observations.
51
+ - The runner should follow the public user path from docs/usage/`--help` and the tool under test itself. Do not encode or normalize hidden recipes and workarounds.
43
52
  - Verifier instructions are verification-only; assign verdicts using impact-first checks, in this priority order:
53
+
44
54
  1. sandbox/project state impact
45
- 2. explicit artifacts
46
- 3. debug captures as fallback
55
+ 2. runner observations
56
+ 3. explicit outcome artifacts
57
+ 4. debug captures as fallback
58
+
47
59
  - Do not place ad-hoc setup logic in TC runner files; sandbox setup belongs to `scenario.yml` and fixtures.
60
+ - Do not place helper inputs, reflections, or temp manifests under `results/tc/{NN}/`.
61
+ - Do not ask the runner to write verifier-facing summaries or audit files when final sandbox state can prove the goal directly.
62
+ - If the runner observations show a workaround was needed, treat that as a docs/help/product or scenario-design gap, not a successful steady-state contract.
48
63
 
49
64
  ## Execution Environment Guardrail
50
65
 
@@ -55,12 +70,12 @@ This workflow guides an agent through executing an E2E test scenario. It support
55
70
 
56
71
  For CLI providers (`ace-test-e2e`), the deterministic 6-phase pipeline handles execution automatically:
57
72
 
58
- 1. **Setup** `SetupExecutor` creates sandbox (git init, mise.toml, .ace symlinks, `results/tc/{NN}/` dirs)
59
- 2. **Runner prompt** `SkillPromptBuilder` assembles context from `runner.yml.md` + `TC-*.runner.md`
60
- 3. **Runner LLM** Agent executes TC steps in sandbox, produces artifacts
61
- 4. **Verifier prompt** `SkillPromptBuilder` assembles context from `verifier.yml.md` + `TC-*.verify.md`
62
- 5. **Verifier LLM** Independent agent evaluates artifacts against expectations
63
- 6. **Report** `PipelineReportGenerator` produces deterministic summary
73
+ 1. **Setup** -- `SetupExecutor` creates sandbox (git init, mise.toml, .ace symlinks, `results/tc/{NN}/` dirs)
74
+ 2. **Runner prompt** -- `SkillPromptBuilder` assembles context from `runner.yml.md` + `TC-*.runner.md`
75
+ 3. **Runner LLM** -- Agent executes TC steps in sandbox and returns final observations
76
+ 4. **Verifier prompt** -- `SkillPromptBuilder` assembles context from `verifier.yml.md` + `TC-*.verify.md` and includes runner observations
77
+ 5. **Verifier LLM** -- Independent agent evaluates sandbox state, runner observations, and outcome artifacts against expectations
78
+ 6. **Report** -- `PipelineReportGenerator` produces deterministic summary and persists runner observations in harness-managed reports
64
79
 
65
80
  When this workflow is invoked directly (not via CLI pipeline), the agent performs steps 1-6 manually using the workflow steps below.
66
81
 
@@ -83,7 +98,8 @@ When invoked as a subagent (via a batch orchestrator such as an assignment fan-o
83
98
  - **Failed**: {count}
84
99
  - **Total**: {count}
85
100
  - **Report Paths**: {timestamp}-{short-pkg}-{short-id}.*
86
- - **Issues**: Brief description or "None"
101
+ - **Observations**: Brief factual summary or "None"
102
+ - **Issues**: Brief description or "None" (legacy alias if `Observations` is unavailable)
87
103
  ```
88
104
 
89
105
  Do NOT return full report contents, detailed TC output, or setup logs.
@@ -95,6 +111,7 @@ Do NOT return full report contents, detailed TC output, or setup logs.
95
111
  When invoked with `--tc-mode`, the sandbox is pre-populated by `SetupExecutor` and only a single TC is executed. Steps 1-5 of standard mode are skipped.
96
112
 
97
113
  **TC-Level Arguments:**
114
+
98
115
  - `PACKAGE` (required), `TEST_ID` (required), `TC_ID` (required)
99
116
  - `--tc-mode` (required), `--sandbox SANDBOX_PATH` (required)
100
117
  - `--run-id RUN_ID` (optional), `--env KEY=VALUE,...` (optional)
@@ -108,7 +125,8 @@ When invoked with `--tc-mode`, the sandbox is pre-populated by `SetupExecutor` a
108
125
  6. Return TC-level contract
109
126
 
110
127
  **TC-Level Rules:**
111
- - Do NOT create or modify sandbox — `SetupExecutor` already prepared it
128
+
129
+ - Do NOT create or modify sandbox -- `SetupExecutor` already prepared it
112
130
  - Always export `--env` variables before executing test steps
113
131
  - Report actual results even if they differ from expected
114
132
 
@@ -138,6 +156,7 @@ If no tests found after filtering, report error and exit.
138
156
  ### 2. Read Test Scenario
139
157
 
140
158
  For each scenario file, read and parse:
159
+
141
160
  - `test-id`, `title`, `priority`, `duration`, `requires`, `tags`
142
161
 
143
162
  **Multiple tests:** Execute steps 2-7 for each scenario sequentially, then generate a combined summary.
@@ -172,22 +191,24 @@ Report missing prerequisites before proceeding.
172
191
  **Pre-generated Run ID:** If `--run-id` was provided, set `TIMESTAMP_ID=$RUN_ID` instead of generating a new one.
173
192
 
174
193
  **Directory naming convention:**
175
- - `{timestamp}` — 6-char base36 timestamp
176
- - `{short-pkg}` package without `ace-` prefix (e.g., `lint`)
177
- - `{short-id}` lowercase prefix + number (e.g., `ts001`)
194
+
195
+ - `{timestamp}` -- 6-char base36 timestamp
196
+ - `{short-pkg}` -- package without `ace-` prefix (e.g., `lint`)
197
+ - `{short-id}` -- lowercase prefix + number (e.g., `ts001`)
178
198
 
179
199
  ```
180
200
  .ace-local/test-e2e/
181
201
  ├── 8osvnh-lint-ts001/ # Sandbox
182
202
  ├── 8osvnh-lint-ts001-reports/ # Reports (summary.r.md, experience.r.md, metadata.yml)
183
- └── 8osynv-final-report.md # Suite report (sibling)
203
+ └── 8osynv-suite-report.md # Suite report (sibling)
184
204
  ```
185
205
 
186
206
  **Expected variables after setup:**
187
- - `PROJECT_ROOT` — Original project directory
188
- - `TEST_DIR` Sandbox directory (cwd after setup)
189
- - `REPORTS_DIR` Reports directory
190
- - `TIMESTAMP_ID` Unique run identifier
207
+
208
+ - `PROJECT_ROOT` -- Original project directory
209
+ - `TEST_DIR` -- Sandbox directory (cwd after setup)
210
+ - `REPORTS_DIR` -- Reports directory
211
+ - `TIMESTAMP_ID` -- Unique run identifier
191
212
 
192
213
  ### 4.1 Sandbox Isolation Checkpoint (MANDATORY)
193
214
 
@@ -198,7 +219,7 @@ echo "=== SANDBOX ISOLATION CHECK ==="
198
219
  CURRENT_DIR="$(pwd)"
199
220
  [[ "$CURRENT_DIR" == *".ace-local/test-e2e/"* ]] && echo "PASS: In sandbox" || echo "FAIL: NOT in sandbox"
200
221
  git rev-parse --git-dir >/dev/null 2>&1 && { [ -z "$(git remote -v 2>/dev/null)" ] && echo "PASS: No remotes" || echo "FAIL: Remotes found"; } || echo "PASS: No git"
201
- [ -f "CLAUDE.md" ] || [ -f "Gemfile" ] || [ -d ".ace-taskflow" ] && echo "FAIL: Project markers found" || echo "PASS: No markers"
222
+ [ -f "CLAUDE.md" ] || [ -f "Gemfile" ] || [ -d ".ace-task" ] && echo "FAIL: Project markers found" || echo "PASS: No markers"
202
223
  echo "=== END CHECK ==="
203
224
  ```
204
225
 
@@ -208,7 +229,7 @@ echo "=== END CHECK ==="
208
229
  ### 5. Create Test Data
209
230
 
210
231
  > **Use `ace-test-e2e-sh "$TEST_DIR"` for ALL commands after setup.**
211
- > Each bash block runs in a fresh shell — the wrapper ensures sandbox isolation.
232
+ > Each bash block runs in a fresh shell -- the wrapper ensures sandbox isolation.
212
233
 
213
234
  Execute test data creation commands from the scenario, writing files inside `$TEST_DIR/`.
214
235
 
@@ -219,9 +240,9 @@ Execute test data creation commands from the scenario, writing files inside `$TE
219
240
  If `FILTERED_CASES` is set, execute only matching TCs. Otherwise execute all.
220
241
 
221
242
  For each TC (TC-NNN):
222
- 1. **Check filter** — skip if not in `FILTERED_CASES`
243
+ 1. **Check filter** -- skip if not in `FILTERED_CASES`
223
244
  2. **Read** the runner file (`TC-NNN-*.runner.md`)
224
- 3. **Execute** runner steps, save artifacts to `results/tc/{NN}/`
245
+ 3. **Execute** runner steps and create only final outcome artifacts under `results/tc/{NN}/`
225
246
  4. **Verify** against paired `.verify.md` expectations
226
247
  5. **Record** status (Pass/Fail) with evidence
227
248
 
@@ -232,6 +253,7 @@ Track friction points during execution for the experience report.
232
253
  Write three report files to the reports directory.
233
254
 
234
255
  **Report path setup:**
256
+
235
257
  ```bash
236
258
  REPORT_DIR="${PROVIDED_REPORT_DIR:-${TEST_DIR}-reports}"
237
259
  mkdir -p "$REPORT_DIR"
@@ -302,6 +324,7 @@ Sandbox directories in `.ace-local/test-e2e/` are gitignored.
302
324
  Summarize execution in the response. Reports are persisted to disk.
303
325
 
304
326
  **Single test:**
327
+
305
328
  ```markdown
306
329
  ## E2E Test Execution Report
307
330
  **Test ID:** {test-id} | **Package:** {package} | **Status:** {PASS/FAIL}
@@ -316,6 +339,7 @@ Reports: `.ace-local/test-e2e/{timestamp}-{short-pkg}-{short-id}-reports/`
316
339
  ### 10. Update Test Scenario
317
340
 
318
341
  If all tests pass, update `scenario.yml`:
342
+
319
343
  ```yaml
320
344
  last-verified: {today's date}
321
345
  verified-by: claude-{model}
@@ -352,4 +376,4 @@ ace-test-e2e ace-lint --exclude-tags deep
352
376
 
353
377
  # All tests in project root
354
378
  ace-test-e2e
355
- ```
379
+ ```
@@ -132,7 +132,7 @@ else
132
132
  fi
133
133
 
134
134
  # Check 3: Project root markers should NOT exist
135
- if [ -f "CLAUDE.md" ] || [ -f "Gemfile" ] || [ -d ".ace-taskflow" ]; then
135
+ if [ -f "CLAUDE.md" ] || [ -f "Gemfile" ] || [ -d ".ace-task" ]; then
136
136
  echo "FAIL: Main project markers found - NOT an isolated repo!"
137
137
  echo " ACTION: STOP - You are in the main repository."
138
138
  else
@@ -321,7 +321,7 @@ Add setup directives to `scenario.yml`:
321
321
  # scenario.yml
322
322
  setup:
323
323
  - git-init
324
- - run: "cp $PROJECT_ROOT_PATH/mise.toml mise.toml && mise trust mise.toml"
324
+ - run: "cp ${ACE_E2E_SOURCE_ROOT:-$PROJECT_ROOT_PATH}/mise.toml mise.toml && mise trust mise.toml"
325
325
  - copy-fixtures
326
326
  - agent-env:
327
327
  PROJECT_ROOT_PATH: "."
@@ -405,7 +405,7 @@ else
405
405
  fi
406
406
 
407
407
  # Check 3: Project markers
408
- if [ -f "CLAUDE.md" ] || [ -f "Gemfile" ] || [ -d ".ace-taskflow" ]; then
408
+ if [ -f "CLAUDE.md" ] || [ -f "Gemfile" ] || [ -d ".ace-task" ]; then
409
409
  echo "FAIL: Main project markers found!"
410
410
  exit 1
411
411
  else
@@ -458,4 +458,4 @@ ace-test-e2e-sh "$REPO_DIR" git status
458
458
  ## See Also
459
459
 
460
460
  - [E2E Testing Guide](guide://e2e-testing)
461
- - [Test Suite Health](guide://test-suite-health)
461
+ - [Test Suite Health](guide://test-suite-health)
@@ -0,0 +1,138 @@
1
+ # frozen_string_literal: true
2
+
3
module Ace
  module Test
    module EndToEndRunner
      module Atoms
        # Validates that verifier-visible artifact paths are explicitly declared by
        # runner instructions or scenario setup, and normalizes grouped capture
        # shorthand such as `foo.stdout`, `.stderr`, `.exit`.
        #
        # Only paths that start with `results/tc/` are considered artifact paths.
        class ArtifactContractValidator
          # One artifact path mention.
          #
          # path     - normalized artifact path (trailing slashes removed)
          # optional - true when the mention was followed by "(optional)"
          # source   - caller-supplied label of the document the mention came from
          # line     - 1-based line number within that document, or nil when the
          #            reference was built from a plain list of paths
          Reference = Struct.new(:path, :optional, :source, :line, keyword_init: true)

          # Matches a `results/tc/NN/...` file path or a bare `results/tc/NN/`
          # directory, optionally wrapped in backticks or quotes and optionally
          # followed by "(optional)". Capture 1 is the path, capture 2 the marker.
          FULL_PATH_PATTERN = /
            (?:`|"|')?
            (results\/tc\/\d{2}\/[^\s`)"']+|results\/tc\/\d{2}\/)
            (?:`|"|')?
            (\s*\(optional\))?
          /ix
          # Matches one grouped shorthand entry (", .stderr") following a full path.
          SUFFIX_PATTERN = /,\s*(?:`|"|')?(\.[A-Za-z0-9*._-]+)(?:`|"|')?(\s*\(optional\))?/i
          # Glob metacharacters that are rejected in artifact paths.
          WILDCARD_PATTERN = /[*?\[]/.freeze
          # Sentence punctuation that an unquoted path mention may drag along.
          TRAILING_PUNCTUATION = /[.,;:]+\z/.freeze

          class << self
            # Collects every artifact reference mentioned in a markdown document.
            #
            # @param markdown [#to_s] document text; nil is treated as empty
            # @param source [Object] label stored on each reference (used in error messages)
            # @return [Array<Reference>] references in document order, with 1-based line numbers
            def extract(markdown, source:)
              markdown.to_s.each_line.with_index(1).flat_map do |line, line_number|
                extract_from_line(line, source: source, line_number: line_number)
              end
            end

            # Builds non-optional references from a list of plain paths.
            # Entries that are not under `results/tc/` are dropped.
            #
            # @param paths [Array<#to_s>, #to_s, nil] one path or a list of paths
            # @param source [Object] label stored on each reference
            # @return [Array<Reference>] references with `line` set to nil
            def references_from_paths(paths, source:)
              Array(paths).filter_map do |path|
                normalized = normalize(path)
                next if normalized.nil?

                Reference.new(path: normalized, optional: false, source: source, line: nil)
              end
            end

            # Checks the artifact contract for one test case.
            #
            # @param tc_id [Object] test case identifier, used in error messages only
            # @param scenario_dir [Object] scenario location, used in error messages only
            # @param runner_references [Array<Reference>] paths declared by the runner file
            # @param verifier_references [Array<Reference>] paths the verifier reads
            # @param scenario_references [Array<Reference>] paths declared by the scenario
            # @return [nil] when every verifier path is declared
            # @raise [ArgumentError] when any reference contains a wildcard, or when
            #   the verifier mentions a path neither the runner nor the scenario declares
            def validate!(tc_id:, scenario_dir:, runner_references:, verifier_references:, scenario_references:)
              invalid_wildcards = (runner_references + verifier_references + scenario_references).select do |reference|
                wildcard?(reference.path)
              end
              unless invalid_wildcards.empty?
                raise ArgumentError,
                  "Wildcard artifact path(s) are not supported for #{tc_id} in #{scenario_dir}: " \
                  "#{format_references(invalid_wildcards)}"
              end

              declared_paths = normalized_paths(scenario_references + runner_references)
              undeclared = verifier_references.reject do |reference|
                declared_paths.include?(normalize(reference.path))
              end
              return if undeclared.empty?

              raise ArgumentError,
                "Verifier references undeclared artifact(s) for #{tc_id} in #{scenario_dir}: " \
                "#{format_references(undeclared)}. " \
                "Declare exact artifact paths in the runner file or scenario.yml sandbox-layout."
            end

            private

            # Extracts the full-path mentions on one line and expands any grouped
            # suffix shorthand that follows each of them.
            def extract_from_line(line, source:, line_number:)
              matches = line.to_enum(:scan, FULL_PATH_PATTERN).map do
                found = Regexp.last_match
                raw_path = found[1]
                # An unquoted mention such as "results/tc/01/a.stdout, .stderr" or
                # "... a.json." captures the trailing punctuation; drop it so the
                # path is exact and the comma stays visible to the suffix scan.
                clean_path = raw_path.sub(TRAILING_PUNCTUATION, "")
                dropped = raw_path.length - clean_path.length

                {
                  start: found.begin(0),
                  tail: dropped.zero? ? found.end(0) : found.end(1) - dropped,
                  path: normalize(clean_path),
                  optional: !found[2].to_s.empty?
                }
              end

              matches.each_with_index.flat_map do |match, index|
                refs = [
                  Reference.new(
                    path: match[:path],
                    optional: match[:optional],
                    source: source,
                    line: line_number
                  )
                ]

                suffix_base = suffix_base_for(match[:path])
                next refs if suffix_base.nil?

                # Suffix shorthand belongs to the closest preceding full path, so
                # only scan up to the next full-path mention (or end of line).
                following = matches[index + 1]
                region_end = following ? following[:start] : line.length
                suffix_region = line[match[:tail]...region_end].to_s

                suffix_region.to_enum(:scan, SUFFIX_PATTERN).each do
                  found = Regexp.last_match
                  # A suffix that ends a sentence (".exit.") keeps its final dot.
                  suffix = found[1].sub(/\.+\z/, "")
                  next if suffix.empty?

                  refs << Reference.new(
                    path: "#{suffix_base}#{suffix}",
                    optional: !found[2].to_s.empty?,
                    source: source,
                    line: line_number
                  )
                end
                refs
              end
            end

            # Returns the path without its final extension, or nil when the path
            # is nil, is a bare test-case directory, or has no extension to swap.
            def suffix_base_for(path)
              return nil if path.nil?
              return nil if path.match?(%r{\Aresults/tc/\d{2}\z})

              base = path.sub(/\.[^.\/]+\z/, "")
              base == path ? nil : base
            end

            # Unique, non-nil normalized paths of the given references.
            def normalized_paths(references)
              references.map { |reference| normalize(reference.path) }.compact.uniq
            end

            # Strips surrounding whitespace and trailing slashes; returns nil for
            # anything that does not start with `results/tc/`.
            def normalize(path)
              value = path.to_s.strip
              return nil unless value.start_with?("results/tc/")

              value.sub(%r{/+\z}, "")
            end

            # True when the path contains a glob metacharacter.
            def wildcard?(path)
              path.to_s.match?(WILDCARD_PATTERN)
            end

            # Renders references as "path (source:line)" entries, de-duplicated
            # and joined with commas, for use in error messages.
            def format_references(references)
              references.uniq { |reference| [reference.path, reference.source, reference.line] }.map do |reference|
                if reference.line
                  "#{reference.path} (#{reference.source}:#{reference.line})"
                else
                  "#{reference.path} (#{reference.source})"
                end
              end.join(", ")
            end
          end
        end
      end
    end
  end
end