ace-test-runner-e2e 0.38.11 → 0.40.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +60 -0
  3. data/handbook/guides/e2e-testing.g.md +35 -3
  4. data/handbook/guides/scenario-yml-reference.g.md +8 -3
  5. data/handbook/guides/tc-authoring.g.md +15 -4
  6. data/handbook/templates/tc-file.template.md +4 -2
  7. data/handbook/workflow-instructions/e2e/create.wf.md +13 -3
  8. data/handbook/workflow-instructions/e2e/fix.wf.md +19 -0
  9. data/handbook/workflow-instructions/e2e/plan-changes.wf.md +16 -0
  10. data/handbook/workflow-instructions/e2e/review.wf.md +14 -10
  11. data/handbook/workflow-instructions/e2e/rewrite.wf.md +10 -3
  12. data/lib/ace/test/end_to_end_runner/atoms/artifact_contract_validator.rb +138 -0
  13. data/lib/ace/test/end_to_end_runner/cli/commands/run_suite.rb +195 -5
  14. data/lib/ace/test/end_to_end_runner/cli/commands/run_test.rb +37 -1
  15. data/lib/ace/test/end_to_end_runner/molecules/artifact_pruner.rb +61 -0
  16. data/lib/ace/test/end_to_end_runner/molecules/pipeline_executor.rb +90 -14
  17. data/lib/ace/test/end_to_end_runner/molecules/pipeline_prompt_bundler.rb +43 -5
  18. data/lib/ace/test/end_to_end_runner/molecules/pipeline_sandbox_builder.rb +7 -5
  19. data/lib/ace/test/end_to_end_runner/molecules/report_writer.rb +2 -0
  20. data/lib/ace/test/end_to_end_runner/molecules/sandbox_runtime_builder.rb +101 -9
  21. data/lib/ace/test/end_to_end_runner/molecules/scenario_loader.rb +28 -30
  22. data/lib/ace/test/end_to_end_runner/molecules/setup_executor.rb +24 -1
  23. data/lib/ace/test/end_to_end_runner/molecules/suite_report_writer.rb +182 -1
  24. data/lib/ace/test/end_to_end_runner/organisms/suite_orchestrator.rb +25 -3
  25. data/lib/ace/test/end_to_end_runner/organisms/test_orchestrator.rb +44 -5
  26. data/lib/ace/test/end_to_end_runner/version.rb +1 -1
  27. data/lib/ace/test/end_to_end_runner.rb +2 -0
  28. metadata +4 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1a1e81b2b077a6bca7e75e1572743a31f20abe1ef5ebcb69ea82c7f55e95fd4b
4
- data.tar.gz: e1791696e6cbbb58decab800387e005ee22f134734be2736d5c00f109273dd57
3
+ metadata.gz: d66fec48d22d05660c8851a50dca74a05d78314f0a0185e9fe71cbed378b624f
4
+ data.tar.gz: 974e1a357c134b270624df6c99de78d351da80c43b51623770a775ff96d10715
5
5
  SHA512:
6
- metadata.gz: 143efde4ad09db543ff0865da3de1a94343c278a64c12f55f1786718725846b5cec565e144687b8c9d16bbc15736bf73abb34b30b49fa746a56ed522978e6434
7
- data.tar.gz: b166bec29e9f10d0eff3b692d6526c22e0960252fb1a9d6d125c646b47a14137472775f73a9120365ebb608d7f2ccc38f88b4694e1b144165cf1a2994512ed2e
6
+ metadata.gz: '0866fe9a27cfb959f199a20f1057191ee806d201a8ae024b0bc0180dedea8c9b1984d759ae914f5f7ece9db0730d57a91d105272e8837eaca50a14e0fdafb4f8'
7
+ data.tar.gz: c3676f29eb9bcbcbcb343518f871a1636ff9fb2d57f2af09916ea9e991b1a944c58ed7f92b9f07f09280539843627afb90441843809250c493912857084c861a
data/CHANGELOG.md CHANGED
@@ -7,6 +7,66 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.40.2] - 2026-06-30
11
+
12
+ ### Technical
13
+ - Stabilized fast tests with sandbox-local fake backends and fixture setup executors so path handling remains independent of host temporary directories.
14
+
15
+ ## [0.40.1] - 2026-04-24
16
+
17
+ ### Fixed
18
+ - Removed suite-specific wording from the single-command `ace-test-e2e` help/output path so the `RunTest` CLI stays scoped to the single-command surface while preserving prune-artifact guidance.
19
+
20
+ ## [0.40.0] - 2026-04-24
21
+
22
+ ### Changed
23
+ - Added `--[no-]prune-artifacts` to `ace-test-e2e` and `ace-test-e2e-suite` so operators can clear stale `.ace-local/test-e2e` run artifacts before execution while preserving suite reports and the shared `runtime-cache/`.
24
+
25
+ ## [0.39.1] - 2026-04-24
26
+
27
+ ### Fixed
28
+ - Resolved suite-shared runtime reuse in child `ace-test-e2e` subprocesses by honoring inherited `ACE_E2E_SHARED_RUNTIME_ROOT` from the process environment instead of rebuilding sandbox-local runtimes after prewarming.
29
+
30
+ ## [0.39.0] - 2026-04-24
31
+
32
+ ### Changed
33
+ - Added `--[no-]retry-failures-once` for full-suite reruns, including flaky-recovery reporting when a failed first pass succeeds on the retry pass.
34
+ - Reused a suite-shared E2E runtime cache under `.ace-local/test-e2e/runtime-cache/` so parallel sandbox workers stop rebuilding the same Bundler environment and native extensions for every scenario.
35
+
36
+ ## [0.38.17] - 2026-04-24
37
+
38
+ ### Fixed
39
+ - Detected fixture-commit setup flows across the full setup sequence instead of only single-step `git add && git commit` commands, restoring support-path git excludes for split-step fixture repositories.
40
+
41
+ ## [0.38.16] - 2026-04-23
42
+
43
+ ### Fixed
44
+ - Enforced runner-owned verifier artifact contracts in scenario loading, expanded grouped `.stdout` / `.stderr` / `.exit` shorthand, and rejected verifier-only or wildcard artifact declarations that previously let retained E2E drift slip through.
45
+
46
+ ### Technical
47
+ - Updated E2E guides, templates, and create/review/plan/rewrite/fix workflows to distinguish `public-surface` versus `retained-contract` TCs and require explicit downstream retained-E2E sweeps after public contract changes.
48
+
49
+ ## [0.38.15] - 2026-04-23
50
+
51
+ ### Fixed
52
+ - Passed declared artifact contracts directly into runner prompts, added one bounded runner repair pass when required captures are still missing, and persisted repair metadata so missing-artifact E2E failures can recover before verifier judgment.
53
+
54
+ ## [0.38.14] - 2026-04-23
55
+
56
+ ### Fixed
57
+ - Limited deterministic sandbox git excludes to setup-commit scenarios so copied package trees remain visible to ignore-aware tools while fixture-repo support paths stay unstaged.
58
+
59
+ ## [0.38.13] - 2026-04-23
60
+
61
+ ### Fixed
62
+ - Enabled role-based verifier fallback in pipeline execution so successful runner phases still produce verifier results when the first verifier provider is unavailable.
63
+ - Seeded deterministic sandbox git excludes for copied package trees and fixture-commit support paths so setup-time `git add -A` no longer stages runner support files or copied package content into fixture repositories.
64
+
65
+ ## [0.38.12] - 2026-04-23
66
+
67
+ ### Changed
68
+ - Updated default ACE sandbox bootstrap to use `ace-config sync ace-llm-providers-cli` before `ace-handbook sync`, matching the renamed config sync command and minimal quick-start config requirement.
69
+
10
70
  ## [0.38.11] - 2026-04-20
11
71
 
12
72
  ### Fixed
@@ -14,6 +14,10 @@ ace-docs:
14
14
  E2E tests are executed by an AI agent and reserved for behaviors that require real CLI execution, real tools, and real filesystem side effects.
15
15
  They must also answer a user-journey question: can a user do the job from the tool's public surface, and how much friction does that journey have?
16
16
 
17
+ In practice, ACE uses two valid TC styles:
18
+ - **Public-surface TCs** — prove a user job from docs/usage/`--help` and the CLI itself.
19
+ - **Retained-contract TCs** — pin a previously fragile integrated behavior with deterministic, explicitly declared evidence.
20
+
17
21
  ## Canonical Conventions
18
22
 
19
23
  - CLI split:
@@ -33,7 +37,7 @@ They must also answer a user-journey question: can a user do the job from the to
33
37
 
34
38
  - Runner is **execution-only**:
35
39
  - perform user-like CLI actions in sandbox
36
- - produce only final outcome evidence under `results/tc/{NN}/`
40
+ - produce only declared outcome evidence under `results/tc/{NN}/`
37
41
  - return final runner observations through the harness contract
38
42
  - do not issue PASS/FAIL verdicts
39
43
  - do not perform verifier-style assertion/classification
@@ -46,6 +50,11 @@ They must also answer a user-journey question: can a user do the job from the to
46
50
  2. runner observations
47
51
  3. explicit TC artifacts that are true product outcomes
48
52
  4. debug captures (`stdout`, `stderr`, `*.exit`, metadata) only as fallback
53
+ - Artifact contract ownership:
54
+ - runner instructions and `scenario.yml` setup/layout declare verifier-visible artifact paths
55
+ - verifier consumes that contract; it does not create new required artifact paths
56
+ - grouped shorthand such as `results/tc/02/help.stdout`, `.stderr`, `.exit` counts as an exact declaration of all three files
57
+ - wildcard artifact paths such as `results/tc/02/output.*` are not valid declarations
49
58
  - Setup ownership:
50
59
  - sandbox preparation belongs to `scenario.yml` `setup:` + `fixtures/`
51
60
  - TC runner files must not define independent environment setup procedures
@@ -77,6 +86,21 @@ When an E2E failure shows that a valid user job is not discoverable from docs, u
77
86
  docs/help drift. Failure analysis must record the stale or missing public surface and the exact docs/help target to
78
87
  update instead of teaching the runner a workaround.
79
88
 
89
+ ## TC Style Selection
90
+
91
+ Use **public-surface** style when the goal is a real user journey and the primary oracle should stay on user-visible behavior.
92
+
93
+ Use **retained-contract** style when the integrated behavior matters but final sandbox state alone is not enough. In that case, small declared supporting captures are valid, for example:
94
+ - `.stdout`, `.stderr`, `.exit`
95
+ - `command.txt`
96
+ - `path-check.txt`
97
+ - `artifact-check.txt`
98
+
99
+ Even retained-contract TCs must not rely on:
100
+ - verifier-only artifact declarations
101
+ - wildcard artifact paths
102
+ - reflections, PASS/FAIL summaries, or verifier-facing manifests under `results/`
103
+
80
104
  ## Cost and Scope
81
105
 
82
106
  - Keep scenarios small and coherent.
@@ -128,12 +152,16 @@ This prevents duplicate assertions across test layers.
128
152
  - Keep runner goals aligned with the public user path; if the runner needs a workaround, surface that as friction rather than teaching the workaround.
129
153
  - Keep verifier expectations impact-first, then artifacts, then debug fallback.
130
154
  - Preserve strict TC pairing (`runner` + `verify`).
131
- - Keep `results/tc/{NN}/` for outcome artifacts only.
132
- - Do not instruct runners to create helper YAML, path files, command files, or reflections in `results/`.
155
+ - Keep `results/tc/{NN}/` for declared verifier-dependent evidence only.
156
+ - Declare every verifier-dependent file path in runner instructions or scenario setup. Do not rely on verifier-only path references.
157
+ - Allow small supporting captures only when they are explicitly declared and materially improve confidence.
158
+ - Do not use wildcard artifact paths.
159
+ - Do not instruct runners to create reflections, PASS/FAIL summaries, verifier-facing manifests, or ad hoc temp inputs in `results/`.
133
160
  - Do not judge success from runner-authored summaries when final sandbox state can prove the goal directly.
134
161
  - Use runner observations only to explain ambiguity or missing side effects, not to replace missing end-state evidence.
135
162
  - Treat any workaround noted in runner observations as a product/docs/help or scenario-design smell that must be fixed, not preserved.
136
163
  - Avoid hidden dependencies between TCs unless explicitly intended.
164
+ - For `--watch` or other live-output commands, use a bounded-session pattern with explicit termination behavior and captured exit codes.
137
165
 
138
166
  ## Execution Artifacts
139
167
 
@@ -150,9 +178,13 @@ Before approving new/updated E2E tests:
150
178
  - [ ] `runner.yml.md` and `verifier.yml.md` exist
151
179
  - [ ] Every TC has both `.runner.md` and `.verify.md`
152
180
  - [ ] Artifacts are scoped to `results/tc/{NN}/`
181
+ - [ ] Every verifier-dependent artifact path is declared by runner/setup
182
+ - [ ] No verifier depends on wildcard or verifier-only artifact paths
153
183
  - [ ] Verifier primary oracle is final sandbox state or real product output, not helper artifacts
154
184
  - [ ] Runner observations are the only non-filesystem secondary evidence source
185
+ - [ ] TC style is explicit in the review (`public-surface` or `retained-contract`)
155
186
  - [ ] Scenario can be completed from docs/usage/`--help` without hidden recipes or workaround instructions
187
+ - [ ] Any internal-detail assertion is part of the public contract or justified as retained-contract evidence
156
188
  - [ ] Any friction/workaround found during review is treated as a gap, not as a runner script opportunity
157
189
  - [ ] Failure analysis records docs/help drift from failed public user paths, or explicitly records `None`
158
190
  - [ ] Value-gate metadata is present (`e2e-justification`, `unit-coverage-reviewed`, `cost-tier`)
@@ -46,7 +46,7 @@ Example: `ace-lint/test/e2e/TS-LINT-001-lint-pipeline/scenario.yml`
46
46
  |-------|------|---------|-------------|
47
47
  | `priority` | string | `medium` | Test priority: `high`, `medium`, `low` |
48
48
  | `tool-under-test` | string | — | Primary command/tool validated |
49
- | `sandbox-layout` | object | `{}` | Outcome-path hints used to precreate directories and guide verification |
49
+ | `sandbox-layout` | object | `{}` | Directory-level outcome hints used to precreate `results/tc/*` paths and guide verification |
50
50
  | `duration` | string | — | Estimated duration (e.g., `~15min`) |
51
51
  | `timeout` | integer | — | Optional per-scenario execution timeout in seconds |
52
52
  | `automation-candidate` | boolean | `false` | Whether test is automatable |
@@ -73,7 +73,10 @@ Pairing rule:
73
73
  Artifact layout conventions:
74
74
  - canonical: `results/tc/{NN}/`
75
75
  - avoid non-TC-scoped result folders
76
- - keep only real outcome artifacts under `results/tc/{NN}/`; runner observations live in harness reports, not sandbox helper files
76
+ - keep only declared verifier-dependent evidence under `results/tc/{NN}/`; runner observations live in harness reports, not sandbox helper files
77
+ - file-level verifier checks must be declared by the runner; `sandbox-layout` does not replace exact file declarations
78
+ - grouped shorthand such as `results/tc/01/help.stdout`, `.stderr`, `.exit` is valid for exact sibling captures
79
+ - wildcard artifact paths are not supported
77
80
  - absence of a declared path is debug context, not a standalone failure reason
78
81
 
79
82
  Canonical summary report fields:
@@ -85,7 +88,8 @@ Canonical summary report fields:
85
88
  Role contract:
86
89
  - `runner.yml.md` + `TC-*.runner.md` are execution-only.
87
90
  - `verifier.yml.md` + `TC-*.verify.md` are verification-only with impact-first checks.
88
- - Goal-style scenarios should be solvable from the public surface (docs/usage/`--help` + tool under test) without hidden recipes or workaround instructions.
91
+ - Public-surface TCs should be solvable from the public surface (docs/usage/`--help` + tool under test) without hidden recipes or workaround instructions.
92
+ - Retained-contract TCs may keep small declared supporting captures when they materially improve confidence.
89
93
 
90
94
  ## `requires` Object
91
95
 
@@ -130,6 +134,7 @@ setup:
130
134
  Setup rules:
131
135
  - Setup is fail-fast. Do not hide setup failures with `|| true`.
132
136
  - Setup belongs in `scenario.yml` and fixtures, not in TC runner instructions.
137
+ - Use setup to create prerequisite state, not verifier-facing helper files under `results/`.
133
138
  - If setup fails (for example, missing `mise trust` support), stop scenario execution and report infrastructure failure.
134
139
 
135
140
  ## Complete Example
@@ -32,6 +32,11 @@ Inline `.tc.md` and frontmatter `mode` values are no longer supported.
32
32
  - TC outcome artifacts write to `results/tc/{NN}/`
33
33
  - Summary counters use `tcs-passed`, `tcs-failed`, and `tcs-total`
34
34
 
35
+ ## TC Styles
36
+
37
+ - **Public-surface**: prove a documented user job from docs/usage/`--help` and the CLI.
38
+ - **Retained-contract**: pin an integrated behavior with deterministic, explicitly declared supporting evidence when end-state checks alone are insufficient.
39
+
35
40
  ## File Naming
36
41
 
37
42
  - `TC-{NNN}` — test case number (e.g., TC-001)
@@ -82,8 +87,9 @@ Run `ace-lint` and produce report artifacts for a valid file.
82
87
  ## Constraints
83
88
 
84
89
  - Use only sandbox paths
85
- - Keep only final outcome evidence under `results/tc/01/`
86
- - Do not place helper inputs, manifests, command transcripts, or reflections under `results/tc/01/`
90
+ - Keep only declared verifier-dependent evidence under `results/tc/01/`
91
+ - Declare exact paths for any verifier-dependent captures, for example `results/tc/01/help.stdout`, `.stderr`, `.exit`
92
+ - Do not place helper inputs, manifests, PASS/FAIL summaries, or reflections under `results/tc/01/`
87
93
  - Execute actions only; do not assign PASS/FAIL or final verdicts
88
94
  ```
89
95
 
@@ -122,14 +128,19 @@ Pass only when all expectations are satisfied by on-disk evidence.
122
128
 
123
129
  - Keep each TC focused on one coherent behavior path.
124
130
  - Ensure goal numbers and TC numbers remain aligned (`TC-001` -> Goal 1).
131
+ - Choose the TC style up front: `public-surface` or `retained-contract`.
125
132
  - Keep runner files execution-only and verifier files verdict-only.
126
133
  - Make verifier expectations deterministic with impact-first ordering.
127
- - Keep `results/tc/{NN}/` for outcome artifacts only.
134
+ - Keep `results/tc/{NN}/` for declared verifier-dependent evidence only.
135
+ - Declare every verifier-dependent path in the runner or setup. Do not rely on verifier-only references.
136
+ - Grouped capture shorthand is valid only for exact sibling files, for example `foo.stdout`, `.stderr`, `.exit`.
137
+ - Do not use wildcard artifact paths.
128
138
  - Use harness-provided runner observations as the only non-filesystem secondary evidence source.
129
139
  - Prefer final sandbox state and real product output over raw debug captures.
130
- - Do not ask the runner to write setup inputs, audit manifests, or final reflections for the verifier.
140
+ - Do not ask the runner to write setup inputs, audit manifests, verifier-facing summaries, or final reflections for the verifier.
131
141
  - Do not teach the runner hidden recipes or workaround sequences; if the path is not discoverable from docs/usage/`--help`, the TC is wrong or the public surface needs improvement.
132
142
  - Use runner observations to record friction and workaround pressure, not to normalize it.
143
+ - For watch/live-output flows, use a bounded-session pattern with explicit shutdown and captured exit code.
133
144
  - Record why each scenario remains E2E via `e2e-justification` and `unit-coverage-reviewed` in `scenario.yml`.
134
145
 
135
146
  ## Related
@@ -22,7 +22,9 @@ ace-docs:
22
22
 
23
23
  - Use only declared scenario tools (`ace-*` and explicit exceptions)
24
24
  - Keep only product outcomes or essential command captures under `results/tc/{NN}/`
25
- - Do not write helper inputs, reflections, manifests, or temp files under `results/tc/{NN}/`
25
+ - Declare every verifier-dependent path explicitly in the runner or scenario setup
26
+ - Grouped capture shorthand such as `results/tc/{NN}/cmd.stdout`, `.stderr`, `.exit` is allowed for exact sibling files
27
+ - Do not write helper inputs, reflections, PASS/FAIL summaries, manifests, or temp files under `results/tc/{NN}/`
26
28
  - Do not write outside sandbox
27
29
  - Execute actions only; do not assign PASS/FAIL in runner file
28
30
  - Follow the public user path from docs/usage/`--help`; do not embed hidden recipes or workaround branches in the TC
@@ -51,5 +53,5 @@ Companion verifier file (`TC-{NNN}-{slug}.verify.md`) example:
51
53
 
52
54
  ## Verdict
53
55
 
54
- - Pass when the public path works from sandbox evidence. Missing helper artifacts alone should not fail the goal.
56
+ - Pass when the public path or retained contract is satisfied from sandbox evidence. The absence of undeclared helper artifacts alone should not fail the goal.
55
57
  -->
@@ -41,7 +41,10 @@ This workflow guides an agent through creating a new E2E test scenario.
41
41
  ## Authoring Contract
42
42
 
43
43
  - Runner files (`runner.yml.md`, `TC-*.runner.md`) are execution-only.
44
- - Goal-style TCs must prove two things:
44
+ - Every TC must be authored as one of:
45
+ - **public-surface** — a user job from docs/usage/`--help` and the CLI
46
+ - **retained-contract** — a deterministic integrated regression check with declared supporting evidence
47
+ - Goal-style/public-surface TCs must prove two things:
45
48
  - the tool works
46
49
  - a user can do the job from the public surface (`README`, usage docs, `--help`, and the CLI itself) without hidden recipes or workarounds
47
50
  - Verifier files (`verifier.yml.md`, `TC-*.verify.md`) are verdict-only with impact-first evidence order:
@@ -52,7 +55,10 @@ This workflow guides an agent through creating a new E2E test scenario.
52
55
  4. debug captures as fallback
53
56
 
54
57
  - Setup belongs to `scenario.yml` `setup:` and fixtures; do not duplicate setup in runner TC instructions.
55
- - Keep `results/tc/{NN}/` for real outcome artifacts only; do not ask the runner to write helper YAML, path files, command files, reflections, or verifier-facing manifests there.
58
+ - Keep `results/tc/{NN}/` for declared verifier-dependent evidence only.
59
+ - Declare every verifier-dependent path in the runner or setup. Grouped shorthand such as `foo.stdout`, `.stderr`, `.exit` is allowed for exact sibling captures.
60
+ - Do not use wildcard artifact paths.
61
+ - Do not ask the runner to write reflections, verifier-facing manifests, or undeclared helper files there.
56
62
  - Do not encode hidden command recipes, fallback detours, or workaround sequences in runner TC files. If the job cannot be done from the public surface, treat that as a product/docs/help gap or remove/narrow the TC.
57
63
 
58
64
  ## Workflow Steps
@@ -248,9 +254,12 @@ Rules:
248
254
  - `existence-only` is never valid for KEEP/ADD. Use it only for SKIP rows with explicit unit-test replacement.
249
255
  - `helper-artifact-driven` is never valid for KEEP/ADD when final sandbox state could prove the goal directly.
250
256
  - `hidden-recipe-driven` and `workaround-driven` are never valid for KEEP/ADD.
257
+ - Every verifier-dependent artifact must be declared by runner/setup; verifier-only references are invalid.
258
+ - Wildcard artifact paths are never valid for KEEP/ADD.
251
259
  - `SKIP` rows must include replacement unit-test evidence.
252
260
  - Non-skipped rows must identify the primary oracle for the TC: final sandbox state, real product output, or debug fallback.
253
261
  - Non-skipped rows must state why the job is achievable from the public surface without hidden recipes.
262
+ - Non-skipped rows must identify TC style: `public-surface` or `retained-contract`.
254
263
  - At least one `unit tests reviewed` path is required for every row.
255
264
  - The scenario-level `unit-coverage-reviewed` field must include the union of all referenced unit test files.
256
265
 
@@ -267,6 +276,7 @@ Rules:
267
276
  - No TC may be created without a row in this table.
268
277
  - If decision is `SKIP`, include the unit-test evidence that replaces it.
269
278
  - If the public-surface path is missing or workaround-driven, the TC must be `SKIP` or explicitly planned as a product/docs/help improvement before creation.
279
+ - If the TC uses live refresh or watch behavior, include a bounded-session capture plan with explicit shutdown behavior and exit-code expectations.
270
280
  - At least one `unit tests reviewed` path is required for each row.
271
281
  - The scenario-level `unit-coverage-reviewed` field must include the union of all referenced unit test files.
272
282
 
@@ -301,7 +311,7 @@ If a context description was provided, enhance the test with:
301
311
  - Write runner goals as user outcomes, not “create a report” chores for the verifier
302
312
  - Check specific exit codes for error commands (not just "non-zero")
303
313
  - Make final sandbox state or real product output the primary oracle whenever possible
304
- - Do not require runner-authored helper files under `results/tc/{NN}/`
314
+ - Do not require undeclared or verifier-facing helper files under `results/tc/{NN}/`
305
315
  - Add at least one behavioral/content assertion when CLI output itself is part of the outcome being tested
306
316
 
307
317
  **SHOULD (strongly recommended):**
@@ -110,6 +110,7 @@ Apply fixes in this order:
110
110
  - Preserve role split: runner is execution-only, verifier is impact-first verdict
111
111
  - Keep implementation unchanged unless analysis is revised
112
112
  - Remove hidden recipes, workaround branches, and unsupported internal-detail checks from goal-style TCs
113
+ - Repair undeclared or wildcard artifact contracts before weakening product assertions
113
114
 
114
115
  4. Rerun the selected failing scope after each fix
115
116
 
@@ -150,6 +151,12 @@ ace-test-e2e ace-bundle TS-BUNDLE-001
150
151
  - Keep one active scenario/TC at a time
151
152
  - Preserve cost-conscious rerun discipline
152
153
 
154
+ 6a. If the fix changes a public contract, run a downstream retained-E2E sweep
155
+
156
+ - Trigger this sweep when the fix changes status words, JSON keys, command shapes, lifecycle semantics, or ownership/state semantics
157
+ - Grep impacted scenarios and downstream consumers before concluding the fix
158
+ - Update retained runner/verifier contracts in the same change set whenever feasible
159
+
153
160
  7. Run a final explicit failing-scenario checkpoint before concluding the fix session
154
161
 
155
162
  After the currently targeted failures are addressed, require one final:
@@ -179,6 +186,18 @@ Analysis Source: reused existing analysis | generated via `wfi://e2e/analyze-fai
179
186
  | ... | ... | ... | ... | pass/fail |
180
187
  ```
181
188
 
189
+ Also include:
190
+
191
+ ```markdown
192
+ ## Fix Classification Totals
193
+
194
+ | Bucket | Count |
195
+ |---|---|
196
+ | Product bug | {n} |
197
+ | Harness bug | {n} |
198
+ | Retained test/spec drift | {n} |
199
+ ```
200
+
182
201
  If the analysis reported docs/help drift, include:
183
202
 
184
203
  ```markdown
@@ -56,6 +56,15 @@ Build a change inventory:
56
56
  - **Removed features** — deleted files or deprecated modules
57
57
  - **Unchanged features** — stable code with no recent modifications
58
58
 
59
+ Before classifying TCs, also check whether the package change alters a public contract that downstream retained E2E tests commonly pin:
60
+ - status words
61
+ - JSON keys or output schema
62
+ - CLI command/flag shapes
63
+ - lifecycle semantics
64
+ - ownership/state semantics
65
+
66
+ If yes, add an explicit downstream retained-E2E sweep list to the plan instead of limiting scope to the package under edit.
67
+
59
68
  ### 3. Classify Each Existing TC
60
69
 
61
70
  For each TC listed in the coverage matrix, assign exactly one classification:
@@ -80,6 +89,7 @@ For REMOVE due to overlap, replacement evidence is mandatory:
80
89
  - TC scope is too broad (should be narrowed to only E2E-exclusive aspects)
81
90
  - TC scope is too narrow (missing assertions for related behavior in same CLI invocation)
82
91
  - TC has structure issues flagged in the review
92
+ - TC has undeclared or wildcard verifier-dependent artifact paths
83
93
  - TC is hidden-recipe-driven or workaround-driven but the underlying user job should still be supported by the public surface after scenario/docs/help correction
84
94
 
85
95
  **CONSOLIDATE** — The TC should merge with another TC. Criteria (any one is sufficient):
@@ -193,6 +203,12 @@ Format the complete change plan:
193
203
  |--------|--------|-----|
194
204
  | Update docs/help/CLI | {package/path} | {job is valid but current public surface is too weak for the E2E path} |
195
205
 
206
+ ### Downstream Retained-E2E Sweep ({n} actions)
207
+
208
+ | Scenario | Trigger | Change Needed |
209
+ |----------|---------|---------------|
210
+ | {scenario-id} | {renamed key / lifecycle shift / command-shape change} | {update retained verifier/runner contract} |
211
+
196
212
  ### CONSOLIDATE ({n} TCs → {n} TCs)
197
213
 
198
214
  | Source TCs | Target TC | Merged Assertions |
@@ -14,7 +14,8 @@ This workflow performs deep exploration of a package to produce a **coverage mat
14
14
  During review, treat the runner/verifier split as a first-class quality check:
15
15
  - Runner must be execution-only (no verdict language).
16
16
  - Verifier must be impact-first (sandbox impact before runner observations and debug).
17
- - `results/tc/{NN}/` must not be used for helper inputs or verifier-feeding helper reports.
17
+ - `results/tc/{NN}/` must contain only declared verifier-dependent evidence.
18
+ - Every verifier-dependent artifact path must be declared by runner/setup; verifier-only or wildcard references are contract drift.
18
19
  - Goal-style TCs must also pass the public-surface check: the runner should be able to do the job from docs/usage/`--help` and the tool under test, without hidden recipes or workarounds.
19
20
 
20
21
  **Pipeline position:** Stage 1 of 3 (Explore)
@@ -117,6 +118,9 @@ find {PACKAGE}/test/e2e -name "scenario.yml" -path "*/TS-*" 2>/dev/null | sort
117
118
  - `tags`, `cost-tier`, `e2e-justification`, `unit-coverage-reviewed`
118
119
  - `last-verified`, `verified-by`
119
120
  - Extract the objective (what the TC verifies)
121
+ - Record TC style:
122
+ - `public-surface`
123
+ - `retained-contract`
120
124
  - Record the TC's primary oracle:
121
125
  - final sandbox state / real product output
122
126
  - runner observations as supporting context
@@ -134,15 +138,15 @@ find {PACKAGE}/test/e2e -name "scenario.yml" -path "*/TS-*" 2>/dev/null | sort
134
138
  - Mark TC evidence status:
135
139
  - `complete` when `e2e-justification` is present, the verifier is end-state-first, and `unit-coverage-reviewed` has at least one path
136
140
  - `missing` otherwise
137
- - `at-risk` when evidence is existence-only, helper-artifact-driven, duplicate command invocations are detected, or the TC is hidden-recipe/workaround-driven
141
+ - `at-risk` when evidence is existence-only, helper-artifact-driven, duplicate command invocations are detected, the TC is hidden-recipe/workaround-driven, or verifier-dependent artifacts are undeclared
138
142
 
139
143
  If `--scope` was provided, filter to only the specified scenario.
140
144
 
141
145
  Build an E2E test map:
142
146
 
143
- | TC ID | Title | Command Invocations | Feature Tested | Primary Oracle | Public Surface Fit | Friction | Tags | Cost Tier | E2E Justification | Unit Coverage Reviewed | Evidence | False-Positive Risk |
144
- |-------|-------|-------------|----------------|----------------|--------------------|----------|------|-----------|-------------------|------------------------|----------|---------------------|
145
- | {id} | {title} | {command list} | {feature} | {state / output / observations+fallback} | {valid/hidden-recipe/workaround/unsupported-detail} | {low/medium/high} | {tags} | {tier} | {reason or "(missing)"} | {files or "(missing)"} | {complete/missing/at-risk} | {low/medium/high} |
147
+ | TC ID | Style | Title | Command Invocations | Feature Tested | Primary Oracle | Public Surface Fit | Artifact Contract | Friction | Tags | Cost Tier | E2E Justification | Unit Coverage Reviewed | Evidence | False-Positive Risk |
148
+ |-------|-------|-------|-------------|----------------|----------------|--------------------|-------------------|----------|------|-----------|-------------------|------------------------|----------|---------------------|
149
+ | {id} | {public-surface/retained-contract} | {title} | {command list} | {feature} | {state / output / observations+fallback} | {valid/hidden-recipe/workaround/unsupported-detail} | {declared/undeclared/wildcard} | {low/medium/high} | {tags} | {tier} | {reason or "(missing)"} | {files or "(missing)"} | {complete/missing/at-risk} | {low/medium/high} |
146
150
 
147
151
  ### 5. Build Coverage Matrix
148
152
 
@@ -214,12 +218,12 @@ TCs that may fail the E2E Value Gate (unit tests cover the same behavior or high
214
218
 
215
219
  ### E2E Decision Record Coverage
216
220
 
217
- | TC ID | Evidence Status | Public Surface Fit | Friction | Missing Fields / Contract Drift |
218
- |-------|------------------|--------------------|----------|-------------------------------|
219
- | {id} | complete | valid | low | none |
220
- | {id} | missing | hidden-recipe-driven | high | e2e-justification, unit-coverage-reviewed, end-state oracle |
221
+ | TC ID | Style | Evidence Status | Public Surface Fit | Artifact Contract | Friction | Missing Fields / Contract Drift |
222
+ |-------|-------|------------------|--------------------|-------------------|----------|-------------------------------|
223
+ | {id} | public-surface | complete | valid | declared | low | none |
224
+ | {id} | retained-contract | missing | hidden-recipe | undeclared | high | e2e-justification, unit-coverage-reviewed, end-state oracle |
221
225
 
222
- **Action:** Any TC with missing evidence, helper-artifact drift, hidden recipes, workaround dependence, or unsupported internal-detail checks should be updated during the next rewrite cycle.
226
+ **Action:** Any TC with missing evidence, undeclared/wildcard artifact drift, hidden recipes, workaround dependence, or unsupported internal-detail checks should be updated during the next rewrite cycle.
223
227
 
224
228
  ### Gap Analysis
225
229
 
@@ -42,7 +42,9 @@ ace-bundle wfi://e2e/review → ace-bundle wfi://e2e/plan-changes → ace-bu
42
42
  - Normalize runner files to execution-only language.
43
43
  - Normalize verifier files to verdict-only, impact-first validation.
44
44
  - Keep setup concerns in `scenario.yml` and fixtures, not in TC runner setup sections.
45
- - Remove helper artifact requirements from `results/tc/{NN}/`; use runner observations instead.
45
+ - Keep only declared verifier-dependent evidence under `results/tc/{NN}/`.
46
+ - Move verifier-only artifact references into explicit runner/setup declarations.
47
+ - Replace wildcard artifact paths with exact declared files.
46
48
  - Rewrite goal-style TCs around the public user path. Do not preserve hidden recipes, workaround branches, or supporting-tool probes as the way the runner reaches the goal.
47
49
 
48
50
  ## Workflow Steps
@@ -128,12 +130,14 @@ Follow the E2E test writing rules:
128
130
  - Target 2-5 TCs per scenario
129
131
  - Test through the CLI interface, not library imports
130
132
  - Write runner goals as “do the job” outcomes, not “write a report for the verifier” chores
131
- - Keep `results/tc/{NN}/` for real outcomes only; avoid helper YAML, path files, command files, and reflections
133
+ - Keep `results/tc/{NN}/` for declared verifier-dependent evidence only; avoid undeclared helper YAML, reflections, and verifier-facing manifests
134
+ - Small supporting captures under `results/tc/{NN}/` are acceptable when they are explicit and necessary
132
135
  - Use runner observations as the only non-filesystem secondary evidence source
133
136
  - Make final sandbox state or real product output the primary oracle whenever possible
134
137
  - Add behavioral/content assertions only when CLI output itself is part of the user-visible outcome
135
138
  - Remove duplicate command-only TCs; fold related assertions into one TC where possible
136
139
  - Do not encode exact workaround procedures, hidden command recipes, or internal debugging tricks the user would not infer from docs/usage/`--help`
140
+ - For watch/live-output flows, rewrite to a bounded-session pattern with explicit shutdown evidence
137
141
  - If the job is valid but the public surface is too weak, plan a product/docs/help fix instead of hardcoding the workaround into the TC
138
142
 
139
143
  **Load the TC template for reference:**
@@ -152,6 +156,8 @@ For each TC classified as MODIFY:
152
156
  - **Broaden scope** — add assertions for related behavior tested by the same CLI invocation
153
157
  - **Fix structure** — add missing sections, fix formatting issues
154
158
  - **Replace helper-artifact oracles** — if the existing TC relies on runner-written helper files, rewrite it around final sandbox state plus runner observations
159
+ - **Declare verifier-dependent artifacts** — if the verifier names a `results/tc/...` file, ensure the runner or setup declares the exact same path
160
+ - **Remove wildcard declarations** — replace `results/tc/.../*` or `results/tc/.../foo.*` with exact paths
155
161
  - **Add evidence gates** — if the existing TC relies on existence-only or missing end-state checks, strengthen the primary oracle before falling back to debug captures
156
162
  - **Remove hidden recipes/workarounds** — if the existing TC teaches the runner how to bypass the public surface, rewrite it around the supported user path or narrow/remove the TC
157
163
  3. Update the `last-verified` field if the TC was re-run during modification
@@ -241,7 +247,8 @@ Present the execution summary:
241
247
  - [ ] TC count matches plan: {yes/no}
242
248
  - [ ] No stale references: {yes/no}
243
249
  - [ ] All scenarios have 2-5 TCs: {yes/no}
244
- - [ ] Modified/created TCs avoid helper files in `results/tc/{NN}/`: {yes/no}
250
+ - [ ] Modified/created TCs avoid undeclared helper files in `results/tc/{NN}/`: {yes/no}
251
+ - [ ] Modified/created TCs declare every verifier-dependent artifact path: {yes/no}
245
252
 
246
253
  ### Next Steps
247
254
 
@@ -0,0 +1,138 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ace
4
+ module Test
5
+ module EndToEndRunner
6
+ module Atoms
7
+ # Checks that every `results/tc/...` artifact path referenced by a verifier is declared by
8
+ # the runner instructions or scenario setup, rejects wildcard paths from any source, and expands
9
+ # grouped capture shorthand such as `foo.stdout`, `.stderr`, `.exit` into one path per suffix.
10
+ class ArtifactContractValidator
11
+ Reference = Struct.new(:path, :optional, :source, :line, keyword_init: true) # one artifact mention; line is nil for paths not taken from markdown
12
+
13
+ FULL_PATH_PATTERN = /
14
+ (?:`|"|')?
15
+ (results\/tc\/\d{2}\/[^\s`)"']+|results\/tc\/\d{2}\/)
16
+ (?:`|"|')?
17
+ (\s*\(optional\))?
18
+ /ix
19
+ SUFFIX_PATTERN = /,\s*(?:`|"|')?(\.[A-Za-z0-9*._-]+)(?:`|"|')?(\s*\(optional\))?/i
20
+ WILDCARD_PATTERN = /[*?\[]/.freeze # glob metacharacters *, ? and [
21
+
22
+ class << self
23
+ def extract(markdown, source:) # => Array<Reference>, scanning markdown line by line (line numbers start at 1)
24
+ markdown.to_s.each_line.with_index(1).flat_map do |line, line_number|
25
+ extract_from_line(line, source: source, line_number: line_number)
26
+ end
27
+ end
28
+
29
+ def references_from_paths(paths, source:) # wraps explicit paths as non-optional References; skips anything normalize rejects
30
+ Array(paths).filter_map do |path|
31
+ normalized = normalize(path)
32
+ next if normalized.nil?
33
+
34
+ Reference.new(path: normalized, optional: false, source: source, line: nil)
35
+ end
36
+ end
37
+
38
+ def validate!(tc_id:, scenario_dir:, runner_references:, verifier_references:, scenario_references:) # raises ArgumentError on wildcard paths or undeclared verifier paths; returns nil when clean
39
+ invalid_wildcards = (runner_references + verifier_references + scenario_references).select do |reference|
40
+ wildcard?(reference.path)
41
+ end
42
+ unless invalid_wildcards.empty?
43
+ raise ArgumentError,
44
+ "Wildcard artifact path(s) are not supported for #{tc_id} in #{scenario_dir}: " \
45
+ "#{format_references(invalid_wildcards)}"
46
+ end
47
+
48
+ declared_paths = normalized_paths(scenario_references + runner_references) # verifier paths are checked against, never added to, this set
49
+ undeclared = verifier_references.reject do |reference|
50
+ declared_paths.include?(normalize(reference.path))
51
+ end
52
+ return if undeclared.empty?
53
+
54
+ raise ArgumentError,
55
+ "Verifier references undeclared artifact(s) for #{tc_id} in #{scenario_dir}: " \
56
+ "#{format_references(undeclared)}. " \
57
+ "Declare exact artifact paths in the runner file or scenario.yml sandbox-layout."
58
+ end
59
+
60
+ private
61
+
62
+ def extract_from_line(line, source:, line_number:) # => Array<Reference> for one line: each full path plus any ", .ext" shorthand after it
63
+ matches = []
64
+ line.to_enum(:scan, FULL_PATH_PATTERN).each do
65
+ matches << {
66
+ start: Regexp.last_match.begin(0),
67
+ end: Regexp.last_match.end(0),
68
+ path: normalize(Regexp.last_match[1]),
69
+ optional: !Regexp.last_match[2].to_s.empty?
70
+ }
71
+ end
72
+
73
+ matches.each_with_index.flat_map do |match, index|
74
+ refs = [
75
+ Reference.new(
76
+ path: match[:path],
77
+ optional: match[:optional],
78
+ source: source,
79
+ line: line_number
80
+ )
81
+ ]
82
+
83
+ next_match = matches[index + 1]
84
+ suffix_region = line[match[:end]...(next_match ? next_match[:start] : line.length)].to_s # text after this path, up to the next full path or end of line
85
+ suffix_base = suffix_base_for(match[:path])
86
+ next refs if suffix_base.nil?
87
+
88
+ suffix_region.to_enum(:scan, SUFFIX_PATTERN).each do
89
+ refs << Reference.new(
90
+ path: "#{suffix_base}#{Regexp.last_match[1]}",
91
+ optional: !Regexp.last_match[2].to_s.empty?,
92
+ source: source,
93
+ line: line_number
94
+ )
95
+ end
96
+ refs
97
+ end
98
+ end
99
+
100
+ def suffix_base_for(path) # path minus its last ".ext"; nil for a bare results/tc/NN dir or an extensionless path
101
+ return nil if path.nil?
102
+ return nil if path.match?(%r{\Aresults/tc/\d{2}\z})
103
+
104
+ path.sub(/\.[^.\/]+\z/, "").tap do |value|
105
+ return nil if value == path
106
+ end
107
+ end
108
+
109
+ def normalized_paths(references) # unique normalized paths, with rejected (nil) entries removed
110
+ references.map { |reference| normalize(reference.path) }.compact.uniq
111
+ end
112
+
113
+ def normalize(path) # trims whitespace and trailing slashes; nil unless it starts with "results/tc/"
114
+ value = path.to_s.strip
115
+ return nil unless value.start_with?("results/tc/")
116
+
117
+ value.sub(%r{/+\z}, "")
118
+ end
119
+
120
+ def wildcard?(path)
121
+ path.to_s.match?(WILDCARD_PATTERN)
122
+ end
123
+
124
+ def format_references(references) # "path (source:line)" entries, de-duplicated and comma-joined
125
+ references.uniq { |reference| [reference.path, reference.source, reference.line] }.map do |reference|
126
+ if reference.line
127
+ "#{reference.path} (#{reference.source}:#{reference.line})"
128
+ else
129
+ "#{reference.path} (#{reference.source})"
130
+ end
131
+ end.join(", ")
132
+ end
133
+ end
134
+ end
135
+ end
136
+ end
137
+ end
138
+ end