ace-test-runner-e2e 0.29.8 → 0.40.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. checksums.yaml +4 -4
  2. data/.ace-defaults/e2e-runner/config.yml +14 -2
  3. data/CHANGELOG.md +233 -0
  4. data/README.md +2 -2
  5. data/exe/ace-test-e2e-sh +9 -4
  6. data/handbook/guides/e2e-testing.g.md +75 -9
  7. data/handbook/guides/scenario-yml-reference.g.md +21 -8
  8. data/handbook/guides/tc-authoring.g.md +23 -5
  9. data/handbook/skills/as-e2e-fix/SKILL.md +2 -2
  10. data/handbook/skills/as-e2e-review/SKILL.md +2 -2
  11. data/handbook/templates/ace-taskflow-fixture.template.md +17 -17
  12. data/handbook/templates/agent-experience-report.template.md +3 -2
  13. data/handbook/templates/scenario.yml.template.yml +7 -2
  14. data/handbook/templates/tc-file.template.md +16 -4
  15. data/handbook/workflow-instructions/e2e/analyze-failures.wf.md +53 -6
  16. data/handbook/workflow-instructions/e2e/create.wf.md +128 -25
  17. data/handbook/workflow-instructions/e2e/execute.wf.md +11 -7
  18. data/handbook/workflow-instructions/e2e/fix.wf.md +84 -15
  19. data/handbook/workflow-instructions/e2e/plan-changes.wf.md +33 -1
  20. data/handbook/workflow-instructions/e2e/review.wf.md +40 -25
  21. data/handbook/workflow-instructions/e2e/rewrite.wf.md +22 -8
  22. data/handbook/workflow-instructions/e2e/run.wf.md +50 -26
  23. data/handbook/workflow-instructions/e2e/setup-sandbox.wf.md +4 -4
  24. data/lib/ace/test/end_to_end_runner/atoms/artifact_contract_validator.rb +138 -0
  25. data/lib/ace/test/end_to_end_runner/atoms/skill_prompt_builder.rb +7 -5
  26. data/lib/ace/test/end_to_end_runner/atoms/skill_result_parser.rb +73 -7
  27. data/lib/ace/test/end_to_end_runner/cli/commands/run_suite.rb +195 -5
  28. data/lib/ace/test/end_to_end_runner/cli/commands/run_test.rb +58 -9
  29. data/lib/ace/test/end_to_end_runner/models/test_case.rb +8 -2
  30. data/lib/ace/test/end_to_end_runner/models/test_result.rb +9 -3
  31. data/lib/ace/test/end_to_end_runner/models/test_scenario.rb +4 -2
  32. data/lib/ace/test/end_to_end_runner/molecules/affected_detector.rb +7 -2
  33. data/lib/ace/test/end_to_end_runner/molecules/artifact_pruner.rb +61 -0
  34. data/lib/ace/test/end_to_end_runner/molecules/bwrap_sandbox_backend.rb +271 -0
  35. data/lib/ace/test/end_to_end_runner/molecules/config_loader.rb +28 -1
  36. data/lib/ace/test/end_to_end_runner/molecules/integration_runner.rb +122 -0
  37. data/lib/ace/test/end_to_end_runner/molecules/pipeline_executor.rb +235 -18
  38. data/lib/ace/test/end_to_end_runner/molecules/pipeline_prompt_bundler.rb +164 -13
  39. data/lib/ace/test/end_to_end_runner/molecules/pipeline_report_generator.rb +91 -19
  40. data/lib/ace/test/end_to_end_runner/molecules/pipeline_sandbox_builder.rb +121 -18
  41. data/lib/ace/test/end_to_end_runner/molecules/report_writer.rb +15 -12
  42. data/lib/ace/test/end_to_end_runner/molecules/sandbox_runtime_builder.rb +374 -0
  43. data/lib/ace/test/end_to_end_runner/molecules/scenario_loader.rb +83 -5
  44. data/lib/ace/test/end_to_end_runner/molecules/setup_executor.rb +121 -16
  45. data/lib/ace/test/end_to_end_runner/molecules/suite_report_writer.rb +422 -97
  46. data/lib/ace/test/end_to_end_runner/molecules/test_discoverer.rb +38 -13
  47. data/lib/ace/test/end_to_end_runner/molecules/test_executor.rb +27 -5
  48. data/lib/ace/test/end_to_end_runner/organisms/suite_orchestrator.rb +98 -18
  49. data/lib/ace/test/end_to_end_runner/organisms/test_orchestrator.rb +159 -19
  50. data/lib/ace/test/end_to_end_runner/version.rb +1 -1
  51. data/lib/ace/test/end_to_end_runner.rb +4 -0
  52. metadata +21 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6206e4d6f65fe1ab5c27d1b5479e37af079b3554bd6289786aa71ce62e4ecf50
4
- data.tar.gz: bedb5fa2830bc1f2818e2246acbbab15ec6a7227898bf4dba2b23b6521eb8d5b
3
+ metadata.gz: ae94a3ebd8b4ed697d8b3b5a705030236659ab9b8e34fa4e5baa421a22d1b781
4
+ data.tar.gz: 8d65447a2174a8fe614d0fce6a8ab2503d39b06e76584aa7af4ba9cedaf3f8bc
5
5
  SHA512:
6
- metadata.gz: 9d06bc8d9447debe2b48128b7c45ea0a357d01677b9b8b48fa508c5f8a078e8b4ed0396a4d4d38c06be25573701d4bd75b6f481438d116e3499b4b1890a9edd5
7
- data.tar.gz: b148663600b83ffde9821761a1ef4a7b43d11a2efc10d97433bb090d4ae2bfc8f7dd997756190cdafc894c497f1eada99faf3b78dcf7df38ed7eec9d57cedec3
6
+ metadata.gz: 3889a846fd3631330728fe5e144259328f08ebd7ee35d4d8f5358fb149f85ad1dc4071132d1db283bb402f1cbed16655f1e790a644e40508e1d104b4fb24e0f5
7
+ data.tar.gz: c68c945a12f8ab86c23ad3714584afbc9e507a7cc8d91b089c99048e43df579adae9f407e769e9669e17910c5853c0919b81846fb278b5d72722220b29187101
@@ -2,12 +2,16 @@
2
2
  # This file provides defaults for the ace-test-runner-e2e gem
3
3
 
4
4
  paths:
5
+ # Preferred location for deterministic preflight tests in packages.
6
+ preflight: "test/feat"
5
7
  # Where test scenarios are stored in packages
6
8
  scenarios: "test/e2e"
7
9
  # Directory for test execution artifacts (gitignored)
8
10
  cache_dir: ".ace-local/test-e2e"
9
11
 
10
12
  patterns:
13
+ # Glob pattern for deterministic preflight tests.
14
+ preflight: "test/feat/**/*_test.rb"
11
15
  # Glob pattern for finding test scenarios (TS-format directories)
12
16
  discovery: "test/e2e/TS-*/scenario.yml"
13
17
 
@@ -38,13 +42,21 @@ reporting:
38
42
 
39
43
  # Execution defaults
40
44
  execution:
41
- # Default LLM provider:model for test execution
42
- provider: "role:e2e-executor"
45
+ # Legacy provider fallback when runner/verifier are not explicitly split
46
+ provider: "role:e2e-runner"
47
+ # LLM provider:model for runner execution
48
+ runner_provider: "role:e2e-runner"
49
+ # LLM provider:model for verifier execution
50
+ verifier_provider: "role:e2e-verifier"
43
51
  # Timeout per test in seconds
44
52
  timeout: 600
45
53
  # Number of tests to run in parallel (1 = sequential)
46
54
  parallel: 3
47
55
 
56
+ sandbox:
57
+ profile: "ace-default"
58
+ ruby_version: "3.4.9"
59
+
48
60
  # Provider configuration
49
61
  providers:
50
62
  # CLI providers use deterministic pipeline execution (runner + verifier)
data/CHANGELOG.md CHANGED
@@ -7,6 +7,239 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.40.1] - 2026-04-24
11
+
12
+ ### Fixed
13
+ - Removed suite-specific wording from the single-command `ace-test-e2e` help/output path so the `RunTest` CLI stays scoped to the single-command surface while preserving prune-artifact guidance.
14
+
15
+ ## [0.40.0] - 2026-04-24
16
+
17
+ ### Changed
18
+ - Added `--[no-]prune-artifacts` to `ace-test-e2e` and `ace-test-e2e-suite` so operators can clear stale `.ace-local/test-e2e` run artifacts before execution while preserving suite reports and the shared `runtime-cache/`.
19
+
20
+ ## [0.39.1] - 2026-04-24
21
+
22
+ ### Fixed
23
+ - Resolved suite-shared runtime reuse in child `ace-test-e2e` subprocesses by honoring inherited `ACE_E2E_SHARED_RUNTIME_ROOT` from the process environment instead of rebuilding sandbox-local runtimes after prewarming.
24
+
25
+ ## [0.39.0] - 2026-04-24
26
+
27
+ ### Changed
28
+ - Added `--[no-]retry-failures-once` for full-suite reruns, including flaky-recovery reporting when a failed first pass succeeds on the retry pass.
29
+ - Reused a suite-shared E2E runtime cache under `.ace-local/test-e2e/runtime-cache/` so parallel sandbox workers stop rebuilding the same Bundler environment and native extensions for every scenario.
30
+
31
+ ## [0.38.17] - 2026-04-24
32
+
33
+ ### Fixed
34
+ - Detected fixture-commit setup flows across the full setup sequence instead of only single-step `git add && git commit` commands, restoring support-path git excludes for split-step fixture repositories.
35
+
36
+ ## [0.38.16] - 2026-04-23
37
+
38
+ ### Fixed
39
+ - Enforced runner-owned verifier artifact contracts in scenario loading, expanded grouped `.stdout` / `.stderr` / `.exit` shorthand, and rejected verifier-only or wildcard artifact declarations that previously let retained E2E drift slip through.
40
+
41
+ ### Technical
42
+ - Updated E2E guides, templates, and create/review/plan/rewrite/fix workflows to distinguish `public-surface` versus `retained-contract` TCs and require explicit downstream retained-E2E sweeps after public contract changes.
43
+
44
+ ## [0.38.15] - 2026-04-23
45
+
46
+ ### Fixed
47
+ - Passed declared artifact contracts directly into runner prompts, added one bounded runner repair pass when required captures are still missing, and persisted repair metadata so missing-artifact E2E failures can recover before verifier judgment.
48
+
49
+ ## [0.38.14] - 2026-04-23
50
+
51
+ ### Fixed
52
+ - Limited deterministic sandbox git excludes to setup-commit scenarios so copied package trees remain visible to ignore-aware tools while fixture-repo support paths stay unstaged.
53
+
54
+ ## [0.38.13] - 2026-04-23
55
+
56
+ ### Fixed
57
+ - Enabled role-based verifier fallback in pipeline execution so successful runner phases still produce verifier results when the first verifier provider is unavailable.
58
+ - Seeded deterministic sandbox git excludes for copied package trees and fixture-commit support paths so setup-time `git add -A` no longer stages runner support files or copied package content into fixture repositories.
59
+
60
+ ## [0.38.12] - 2026-04-23
61
+
62
+ ### Changed
63
+ - Updated default ACE sandbox bootstrap to use `ace-config sync ace-llm-providers-cli` before `ace-handbook sync`, matching the renamed config sync command and minimal quick-start config requirement.
64
+
65
+ ## [0.38.11] - 2026-04-20
66
+
67
+ ### Fixed
68
+ - Spaced batch run IDs by 100ms in `TestOrchestrator` so generated 50ms-format IDs remain unique under fast consecutive suite execution.
69
+
70
+ ## [0.38.10] - 2026-04-19
71
+
72
+ ### Fixed
73
+ - Added strict runner ordering guidance, verifier artifact mtimes, and direct goal-number-to-TC mapping so E2E reports classify out-of-order postcondition captures as runner errors instead of shifting failed TC IDs.
74
+
75
+ ## [0.38.9] - 2026-04-19
76
+
77
+ ### Changed
78
+ - Strengthened the E2E failure-analysis and fix workflows to require explicit docs/help drift reporting for every failed TC, so stale usage docs or CLI help surfaced by E2E failures become concrete fix targets instead of hidden runner workarounds.
79
+
80
+ ## [0.38.8] - 2026-04-16
81
+
82
+ ### Fixed
83
+ - Synced protocol-source package trees into prepared sandboxes before deterministic setup, preserved the sanitized setup environment for runner and verifier execution, and tightened the shared runner contract to require direct `ace-*` commands with immediate `.stdout` / `.stderr` / `.exit` persistence.
84
+
85
+ ## [0.38.7] - 2026-04-16
86
+
87
+ ### Fixed
88
+ - Reused already prepared CLI-provider sandboxes during pipeline execution so the runner no longer rewrites tracked sandbox state after deterministic setup, which prevents staged-path failures caused by post-setup provider-directory symlinks.
89
+
90
+ ## [0.38.6] - 2026-04-16
91
+
92
+ ### Fixed
93
+ - Scoped declared sandbox-layout artifacts to the active test case, recorded present-versus-missing required artifacts in harness snapshots and report metadata, and passed that contract into verifier prompts.
94
+ - Added canonical goal-verdict reporting so generated scenario reports keep the authoritative failed-TC mapping even when narrative evidence includes contradictory wording.
95
+
96
+ ## [0.38.5] - 2026-04-16
97
+
98
+ ### Fixed
99
+ - Synced package protocol-source manifests into copied E2E sandboxes so bundled workflow and skill resolution continues to work after sandbox setup.
100
+ - Hardened the shared runner prompt contract to preserve sandbox runtime `PATH`/environment and forbid wrapper patterns that break direct `ace-*` execution.
101
+
102
+ ## [0.38.4] - 2026-04-16
103
+
104
+ ### Fixed
105
+ - Built a dedicated sandbox runtime for E2E runs with sandbox-local Gemfile, Bundler state, gem home, bin shims, verifier sandbox context, preserved report-directory reuse, and wrapper-compatible launch behavior so sandboxed commands stop leaking back into the source worktree.
106
+
107
+ ## [0.38.3] - 2026-04-16
108
+
109
+ ### Fixed
110
+ - Stripped inherited Bundler and Ruby env leakage from sandboxed E2E subprocesses, created sandbox-local Bundler state, preserved failure-stub report directories in suite aggregation, and aligned shared setup templates/docs with the `ACE_E2E_SOURCE_ROOT` source-root contract.
111
+
112
+ ## [0.38.2] - 2026-04-16
113
+
114
+ ### Fixed
115
+ - Prepared setup steps with the sandbox runtime environment, hardened runtime directory permissions for tmux access, and kept sandbox support paths aligned with the active `bubblewrap` execution model.
116
+
117
+ ## [0.38.1] - 2026-04-15
118
+
119
+ ### Fixed
120
+ - Tightened the Linux `bubblewrap` sandbox mounts to preserve required device access such as `/dev/null` while keeping the host filesystem isolated.
121
+ - Moved sandbox support directories outside the copied repo workspace so E2E setup steps like `git add -A` no longer stage sandbox home, tmp, or runtime files.
122
+
123
+ ## [0.38.0] - 2026-04-15
124
+
125
+ ### Changed
126
+ - Rewrote `TS-RUNNER-001` to use public fixture-driven discovery (`copy-fixtures`) and expanded suite control-flow coverage beyond help-only output.
127
+ - Added `TS-RUNNER-002` to cover real non-dry run report generation, verifier-output evidence, and explicit `ace-test-e2e-sh` public shell-helper usage.
128
+ - Updated `docs/usage.md` with safe shell-helper workflows tied to deterministic `.ace-local/test-e2e/` report paths.
129
+
130
+ ### Fixed
131
+ - Routed setup/runner/verifier subprocesses through the new sandbox backend, kept user-facing verifier metadata in written reports, and taught the minimal verifier parser to accept standalone `Results: X/Y passed` summaries.
132
+
133
+ ## [0.37.2] - 2026-04-14
134
+
135
+ ### Changed
136
+ - Added a canonical public-surface gate across the E2E handbook so goal-based scenarios must prove both that the tool works and that a user can complete the job from docs, `--help`, and the public CLI without hidden recipes or workarounds.
137
+ - Updated the create/review/plan/rewrite/run/fix workflow guidance, shared guides, and templates to treat workaround-driven scenarios as invalid or at-risk and to record friction through runner observations instead of teaching fallback procedures.
138
+
139
+ ## [0.37.1] - 2026-04-13
140
+
141
+ ### Changed
142
+ - Updated the canonical E2E create/review/rewrite/run guidance, templates, and references so goal-based scenarios are written around final sandbox state plus runner observations instead of helper artifacts under `results/`.
143
+
144
+ ## [0.37.0] - 2026-04-13
145
+
146
+ ### Changed
147
+ - Made runner `Observations` the canonical non-filesystem evidence channel for goal-based E2E scenarios, passed them directly into verifier prompts, and persisted them through the harness-managed report surface.
148
+ - Updated the shared E2E template, authoring guides, and rewrite/run workflows to require goal achievement from sandbox end state first, using runner observations as the only secondary evidence source instead of helper artifacts under `results/`.
149
+
150
+ ## [0.36.1] - 2026-04-13
151
+
152
+ ### Fixed
153
+ - Preferred canonical per-scenario `report.md` metadata when building aggregate package and suite reports so failed TC mappings no longer drift from the underlying scenario reports.
154
+ - Added explicit dirty-worktree diagnostics to suite reporting so tracked repo mutations are surfaced as runner diagnostics instead of being inferred after the fact.
155
+
156
+ ### Changed
157
+ - Updated the canonical E2E failure-analysis and fix workflows plus usage guidance to treat aggregate reports as indexes and per-scenario reports as the source of truth for TC-level triage.
158
+
159
+ ## [0.36.0] - 2026-04-13
160
+
161
+ ### Fixed
162
+ - Renamed aggregated E2E outputs to scope-specific package and suite report filenames instead of the ambiguous shared `final-report` label.
163
+ - Stripped ambient `TMUX` and `TMUX_PANE` state from setup and pipeline subprocess environments so E2E runs do not accidentally attach to the operator's live tmux session.
164
+
165
+ ### Technical
166
+ - Updated suite orchestrator/report writer coverage and E2E workflow guidance around the explicit package-vs-suite report contract.
167
+
168
+ ## [0.35.0] - 2026-04-13
169
+ ### Changed
170
+ - **ace-test-runner-e2e v0.35.0**: Added optional scenario artifact declarations via `(optional)`, separated required and optional artifact tracking, and included optional outputs in manifests and snapshots without failing scenarios when they are absent.
171
+
172
+ ## [0.34.1] - 2026-04-13
173
+
174
+ ### Changed
175
+ - Completed the batch i05 migration follow-through for this package and aligned it with the restarted `fast` / `feat` / `e2e` verification model.
176
+
177
+ ### Technical
178
+ - Included in the coordinated assignment-driven patch release for batch i05 package updates.
179
+
180
+
181
+ ## [0.34.0] - 2026-04-12
182
+
183
+ ### Changed
184
+ - Migrated package deterministic tests to the restarted `fast`/`feat` layout by moving `test/atoms`, `test/commands`, `test/handbook`, `test/models`, `test/molecules`, and `test/organisms` under `test/fast/`, and moving legacy `test/integration` coverage into `test/feat/`.
185
+ - Updated package docs and CLI wording to teach `fast`/`feat` deterministic coverage plus scenario-only `test/e2e` execution via `ace-test-e2e`.
186
+ - Refreshed `TS-RUNNER-001` scenario metadata and decision-record unit coverage references to point at migrated `test/fast` paths.
187
+
188
+ ## [0.33.1] - 2026-04-12
189
+
190
+ ### Fixed
191
+ - Made suite final reports deterministic for canonical sections by deriving summary rows, failed-test details, reports tables, and the overall line from runtime results instead of model-authored prose.
192
+ - Added regression coverage so hallucinated scenario titles, failed TC IDs, and duplicate overall lines are ignored or replaced before report files are written.
193
+
194
+ ## [0.33.0] - 2026-04-11
195
+
196
+ ### Changed
197
+ - Made `wfi://e2e/fix` a self-bootstrapping workflow that reuses existing failure analysis when present and generates it via `wfi://e2e/analyze-failures` when missing or incomplete.
198
+ - Updated the canonical `as-e2e-fix` skill contract to state that missing analysis is generated automatically before fixes are applied.
199
+
200
+ ### Technical
201
+
202
+ - Refactored `ConfigLoader` molecule tests to use config mock mode, removing dependency on monorepo `.ace` overrides and making the test contract stable across environments.
203
+
204
+ ## [0.32.2] - 2026-04-11
205
+
206
+ ### Fixed
207
+ - Generated per-scenario CLI batch `run_id`s from explicit 50ms timestamp buckets so parallel package runs no longer occasionally reuse the same report-path ID and trip the unique-run-id orchestration contract.
208
+
209
+ ## [0.32.1] - 2026-04-11
210
+
211
+ ### Technical
212
+ - Synced the canonical `as-e2e-review` skill description with the package-targeted assign verification contract so shipped metadata no longer implies broader scenario-sweep execution.
213
+
214
+ ## [0.31.0] - 2026-04-10
215
+
216
+ ### Changed
217
+ - Restored the two-phase E2E harness to run deterministic `test/integration` coverage before agent scenarios from `test/e2e`, with integration failures short-circuiting scenario execution.
218
+ - Added deterministic integration execution, richer per-test-case manifests and artifact snapshotting, and refreshed CLI/docs/workflows/tests around the restarted layout and role-based runner/verifier contract.
219
+
220
+ ### Fixed
221
+ - Accepted minimal verifier evidence responses in the runner pipeline so successful scenario runs no longer fail when a verifier omits the full structured envelope.
222
+
223
+ ## [0.30.2] - 2026-04-10
224
+
225
+ ### Fixed
226
+ - Surfaced `git diff` stderr when affected-package detection fails so invalid refs and shallow-clone failures no longer look like empty affected sets.
227
+
228
+ ## [0.30.1] - 2026-04-10
229
+
230
+ ### Fixed
231
+ - Raised the `ace-support-test-helpers` runtime dependency floor to `~> 0.14` so released installs accept the shared sandbox package-copy helper line used by the restarted runner.
232
+ - Restored the `TS-RUNNER-001` smoke scenario fixture source path so the CLI smoke scenario resolves its canonical demo fixture again.
233
+
234
+ ## [0.30.0] - 2026-04-10
235
+
236
+ ### Changed
237
+ - Reworked `ace-test-runner-e2e` back into a two-phase contract, with deterministic integration from `test/integration` before agent scenarios from `test/e2e`.
238
+ - Switched sandbox orchestration to the shared package-copy helper and refreshed CLI/docs/workflows for the restarted E2E structure.
239
+
240
+ ### Fixed
241
+ - Hardened affected-file detection by capturing git diff stderr so provider-side affected checks fail with clearer diagnostics.
242
+
10
243
  ## [0.29.8] - 2026-04-01
11
244
 
12
245
  ### Fixed
data/README.md CHANGED
@@ -18,11 +18,11 @@
18
18
 
19
19
  ![ace-test-runner-e2e demo](docs/demo/ace-test-runner-e2e-getting-started.gif)
20
20
 
21
- `ace-test-runner-e2e` runs realistic workflow scenarios through coding agents so teams can validate behavior beyond unit and integration coverage while keeping execution reproducible and isolated from the working tree.
21
+ `ace-test-runner-e2e` runs realistic workflow scenarios through coding agents so teams can validate behavior beyond deterministic package tests while keeping execution reproducible and isolated from the working tree.
22
22
 
23
23
  ## How It Works
24
24
 
25
- 1. Discover E2E scenario definitions from package-local `test/e2e/` suites with metadata, tags, and command flows.
25
+ 1. Discover deterministic preflight tests from package-local `test/feat/` and agent scenarios from `test/e2e/`, preserving metadata, tags, and command flows.
26
26
  2. Execute scenarios inside reproducible sandboxes that isolate agent runs from the working tree.
27
27
  3. Produce structured reports that are easy to inspect, compare across runs, and feed back into triage workflows.
28
28
 
data/exe/ace-test-e2e-sh CHANGED
@@ -1,6 +1,8 @@
1
1
  #!/usr/bin/env ruby
2
2
  # frozen_string_literal: true
3
3
 
4
+ require_relative "../lib/ace/test/end_to_end_runner"
5
+
4
6
  # ace-test-e2e-sh - Execute commands within E2E test sandbox
5
7
  #
6
8
  # Usage:
@@ -57,11 +59,14 @@ unless Dir.exist?(test_dir)
57
59
  exit 1
58
60
  end
59
61
 
60
- Dir.chdir(test_dir)
61
- ENV["PROJECT_ROOT_PATH"] = test_dir
62
+ backend = Ace::Test::EndToEndRunner::Molecules::BwrapSandboxBackend.new(
63
+ sandbox_root: test_dir,
64
+ source_root: ENV["ACE_E2E_SOURCE_ROOT"]
65
+ )
66
+ env = backend.prepared_env("PROJECT_ROOT_PATH" => test_dir, "ACE_E2E_SOURCE_ROOT" => ENV["ACE_E2E_SOURCE_ROOT"])
62
67
 
63
68
  if ARGV.empty?
64
- exec "bash"
69
+ backend.exec(["bash"], chdir: test_dir, env: env)
65
70
  else
66
- exec(*ARGV)
71
+ backend.exec(ARGV, chdir: test_dir, env: env)
67
72
  end
@@ -3,8 +3,8 @@ doc-type: guide
3
3
  title: E2E Testing Guide
4
4
  purpose: Conventions and best practices for agent-executed end-to-end tests
5
5
  ace-docs:
6
- last-updated: 2026-03-12
7
- last-checked: 2026-03-21
6
+ last-updated: 2026-04-19
7
+ last-checked: 2026-04-19
8
8
  ---
9
9
 
10
10
  # E2E Testing Guide
@@ -12,6 +12,11 @@ ace-docs:
12
12
  ## Overview
13
13
 
14
14
  E2E tests are executed by an AI agent and reserved for behaviors that require real CLI execution, real tools, and real filesystem side effects.
15
+ They must also answer a user-journey question: can a user do the job from the tool's public surface, and how much friction does that journey have?
16
+
17
+ In practice, ACE uses two valid TC styles:
18
+ - **Public-surface TCs** — prove a user job from docs/usage/`--help` and the CLI itself.
19
+ - **Retained-contract TCs** — pin a previously fragile integrated behavior with deterministic, explicitly declared evidence.
15
20
 
16
21
  ## Canonical Conventions
17
22
 
@@ -24,7 +29,7 @@ E2E tests are executed by an AI agent and reserved for behaviors that require re
24
29
  - `TC-*.verify.md`
25
30
  - `runner.yml.md`
26
31
  - `verifier.yml.md`
27
- - TC artifacts use `results/tc/{NN}/`
32
+ - TC outcome artifacts use `results/tc/{NN}/`
28
33
  - Summary reports use `tcs-passed`, `tcs-failed`, `tcs-total`, and `failed[].tc`
29
34
  - Scenarios declare `tags` for discovery-time filtering via `--tags`/`--exclude-tags`
30
35
 
@@ -32,15 +37,24 @@ E2E tests are executed by an AI agent and reserved for behaviors that require re
32
37
 
33
38
  - Runner is **execution-only**:
34
39
  - perform user-like CLI actions in sandbox
35
- - produce evidence files under `results/tc/{NN}/`
40
+ - produce only declared outcome evidence under `results/tc/{NN}/`
41
+ - return final runner observations through the harness contract
36
42
  - do not issue PASS/FAIL verdicts
37
43
  - do not perform verifier-style assertion/classification
44
+ - do not invent workarounds or hidden command recipes to compensate for docs/help/CLI gaps
38
45
  - Verifier is **verification-only**:
39
46
  - evaluate TC outcome from sandbox evidence
47
+ - use runner observations as the only non-filesystem secondary evidence source
40
48
  - apply an **impact-first** evidence order:
41
49
  1. sandbox/project state impact
42
- 2. explicit TC artifacts
43
- 3. debug captures (`stdout`, `stderr`, `*.exit`, metadata) only as fallback
50
+ 2. runner observations
51
+ 3. explicit TC artifacts that are true product outcomes
52
+ 4. debug captures (`stdout`, `stderr`, `*.exit`, metadata) only as fallback
53
+ - Artifact contract ownership:
54
+ - runner instructions and `scenario.yml` setup/layout declare verifier-visible artifact paths
55
+ - verifier consumes that contract; it does not create new required artifact paths
56
+ - grouped shorthand such as ``results/tc/02/help.stdout`, `.stderr`, `.exit`` counts as an exact declaration of all three files
57
+ - wildcard artifact paths such as `results/tc/02/output.*` are not valid declarations
44
58
  - Setup ownership:
45
59
  - sandbox preparation belongs to `scenario.yml` `setup:` + `fixtures/`
46
60
  - TC runner files must not define independent environment setup procedures
@@ -52,7 +66,40 @@ Before adding a TC, confirm the behavior needs:
52
66
  - real external tools/processes
53
67
  - real filesystem I/O and environment state
54
68
 
55
- If not, keep coverage in unit/integration tests.
69
+ If not, keep coverage in `fast`/`feat` tests.
70
+
71
+ ## Public-Surface Gate
72
+
73
+ Before keeping or adding a goal-style TC, confirm the user job is achievable from:
74
+ - package README / usage docs
75
+ - `--help`
76
+ - declared fixtures/setup
77
+ - the tool under test itself
78
+
79
+ Reject or rewrite the TC if it depends on:
80
+ - hidden recipes embedded in runner instructions
81
+ - workaround branches for unsupported or undocumented behavior
82
+ - direct supporting-tool probes as the primary oracle
83
+ - internal details that are not necessary to prove the user job
84
+
85
+ When an E2E failure shows that a valid user job is not discoverable from docs, usage guides, or `--help`, treat that as
86
+ docs/help drift. Failure analysis must record the stale or missing public surface and the exact docs/help target to
87
+ update instead of teaching the runner a workaround.
88
+
89
+ ## TC Style Selection
90
+
91
+ Use **public-surface** style when the goal is a real user journey and the primary oracle should stay on user-visible behavior.
92
+
93
+ Use **retained-contract** style when the integrated behavior matters but final sandbox state alone is not enough. In that case, small declared supporting captures are valid, for example:
94
+ - `.stdout`, `.stderr`, `.exit`
95
+ - `command.txt`
96
+ - `path-check.txt`
97
+ - `artifact-check.txt`
98
+
99
+ Even retained-contract TCs must not rely on:
100
+ - verifier-only artifact declarations
101
+ - wildcard artifact paths
102
+ - reflections, PASS/FAIL summaries, or verifier-facing manifests under `results/`
56
103
 
57
104
  ## Cost and Scope
58
105
 
@@ -79,6 +126,7 @@ The verifier is always-on for standalone goal-mode TCs in the CLI pipeline. For
79
126
  ## Scenario Layout
80
127
 
81
128
  ```text
129
+ {package}/test/feat/**/*_test.rb
82
130
  {package}/test/e2e/TS-{AREA}-{NNN}-{slug}/
83
131
  scenario.yml
84
132
  runner.yml.md
@@ -101,10 +149,19 @@ This prevents duplicate assertions across test layers.
101
149
  ## Authoring Rules
102
150
 
103
151
  - Keep runner goals outcome-oriented and deterministic.
152
+ - Keep runner goals aligned with the public user path; if the runner needs a workaround, surface that as friction rather than teaching the workaround.
104
153
  - Keep verifier expectations impact-first, then runner observations, then artifacts, then debug fallback.
105
154
  - Preserve strict TC pairing (`runner` + `verify`).
106
- - Keep outputs inside `results/tc/{NN}/`.
155
+ - Keep `results/tc/{NN}/` for declared verifier-dependent evidence only.
156
+ - Declare every verifier-dependent file path in runner instructions or scenario setup. Do not rely on verifier-only path references.
157
+ - Allow small supporting captures only when they are explicitly declared and materially improve confidence.
158
+ - Do not use wildcard artifact paths.
159
+ - Do not instruct runners to create reflections, PASS/FAIL summaries, verifier-facing manifests, or ad hoc temp inputs in `results/`.
160
+ - Do not judge success from runner-authored summaries when final sandbox state can prove the goal directly.
161
+ - Use runner observations only to explain ambiguity or missing side effects, not to replace missing end-state evidence.
162
+ - Treat any workaround noted in runner observations as a product/docs/help or scenario-design smell that must be fixed, not preserved.
107
163
  - Avoid hidden dependencies between TCs unless explicitly intended.
164
+ - For `--watch` or other live-output commands, use a bounded-session pattern with explicit termination behavior and captured exit codes.
108
165
 
109
166
  ## Execution Artifacts
110
167
 
@@ -121,4 +178,13 @@ Before approving new/updated E2E tests:
121
178
  - [ ] `runner.yml.md` and `verifier.yml.md` exist
122
179
  - [ ] Every TC has both `.runner.md` and `.verify.md`
123
180
  - [ ] Artifacts are scoped to `results/tc/{NN}/`
124
- - [ ] Value-gate metadata is present (`e2e-justification`, `unit-coverage-reviewed`, `cost-tier`)
181
+ - [ ] Every verifier-dependent artifact path is declared by runner/setup
182
+ - [ ] No verifier depends on wildcard or verifier-only artifact paths
183
+ - [ ] Verifier primary oracle is final sandbox state or real product output, not helper artifacts
184
+ - [ ] Runner observations are the only non-filesystem secondary evidence source
185
+ - [ ] TC style is explicit in the review (`public-surface` or `retained-contract`)
186
+ - [ ] Scenario can be completed from docs/usage/`--help` without hidden recipes or workaround instructions
187
+ - [ ] Any internal-detail assertion is part of the public contract or justified as retained-contract evidence
188
+ - [ ] Any friction/workaround found during review is treated as a gap, not as a runner script opportunity
189
+ - [ ] Failure analysis records docs/help drift from failed public user paths, or explicitly records `None`
190
+ - [ ] Value-gate metadata is present (`e2e-justification`, `unit-coverage-reviewed`, `cost-tier`)
@@ -46,14 +46,14 @@ Example: `ace-lint/test/e2e/TS-LINT-001-lint-pipeline/scenario.yml`
46
46
  |-------|------|---------|-------------|
47
47
  | `priority` | string | `medium` | Test priority: `high`, `medium`, `low` |
48
48
  | `tool-under-test` | string | — | Primary command/tool validated |
49
- | `sandbox-layout` | object | `{}` | Declared artifact paths and expected outputs |
49
+ | `sandbox-layout` | object | `{}` | Directory-level outcome hints used to pre-create `results/tc/*` paths and guide verification |
50
50
  | `duration` | string | — | Estimated duration (e.g., `~15min`) |
51
51
  | `timeout` | integer | — | Optional per-scenario execution timeout in seconds |
52
52
  | `automation-candidate` | boolean | `false` | Whether test is automatable |
53
53
  | `tags` | array | `[]` | Scenario tags for filtering with `--tags`/`--exclude-tags` (OR semantics) |
54
54
  | `cost-tier` | string | `smoke` | Run profile: `smoke`, `happy-path`, `deep` |
55
55
  | `e2e-justification` | string | — | Why E2E is needed |
56
- | `unit-coverage-reviewed` | array | `[]` | Unit/integration files reviewed |
56
+ | `unit-coverage-reviewed` | array | `[]` | Deterministic test files reviewed (`test/fast` and/or `test/feat`) |
57
57
  | `requires` | object | — | Test prerequisites |
58
58
  | `setup` | array | `[]` | Setup directives before execution |
59
59
  | `last-verified` | string | — | Last successful verification date |
@@ -73,6 +73,11 @@ Pairing rule:
73
73
  Artifact layout conventions:
74
74
  - canonical: `results/tc/{NN}/`
75
75
  - avoid non-TC-scoped result folders
76
+ - keep only declared verifier-dependent evidence under `results/tc/{NN}/`; runner observations live in harness reports, not sandbox helper files
77
+ - file-level verifier checks must be declared by the runner; `sandbox-layout` does not replace exact file declarations
78
+ - grouped shorthand such as ``results/tc/01/help.stdout`, `.stderr`, `.exit`` is valid for exact sibling captures
79
+ - wildcard artifact paths are not supported
80
+ - absence of a declared path is debug context, not a standalone failure reason
76
81
 
77
82
  Canonical summary report fields:
78
83
  - `tcs-passed`
@@ -83,6 +88,8 @@ Canonical summary report fields:
83
88
  Role contract:
84
89
  - `runner.yml.md` + `TC-*.runner.md` are execution-only.
85
90
  - `verifier.yml.md` + `TC-*.verify.md` are verification-only with impact-first checks.
91
+ - Public-surface TCs should be solvable from the public surface (docs/usage/`--help` + tool under test) without hidden recipes or workaround instructions.
92
+ - Retained-contract TCs may keep small declared supporting captures when they materially improve confidence.
86
93
 
87
94
  ## `requires` Object
88
95
 
@@ -92,6 +99,11 @@ requires:
92
99
  ruby: ">= 3.0"
93
100
  ```
94
101
 
102
+ `requires.tools` rules:
103
+ - declare execution prerequisites and supporting environment dependencies
104
+ - do not use `requires.tools` as permission to make fallback probes the primary oracle
105
+ - for ACE CLI scenarios, support tools are setup/dependency context unless the scenario is explicitly about that support tool itself
106
+
95
107
  ## `setup` Directives
96
108
 
97
109
  Available directives:
@@ -112,7 +124,7 @@ setup:
112
124
  - git-init
113
125
  - tmux-session:
114
126
  name-source: run-id
115
- - run: "cp $PROJECT_ROOT_PATH/mise.toml mise.toml && mise trust mise.toml"
127
+ - run: "cp ${ACE_E2E_SOURCE_ROOT:-$PROJECT_ROOT_PATH}/mise.toml mise.toml && mise trust mise.toml"
116
128
  - copy-fixtures
117
129
  - run: git add -A && git commit -m "initial" --quiet
118
130
  - agent-env:
@@ -122,6 +134,7 @@ setup:
122
134
  Setup rules:
123
135
  - Setup is fail-fast. Do not hide setup failures with `|| true`.
124
136
  - Setup belongs in `scenario.yml` and fixtures, not in TC runner instructions.
137
+ - Use setup to create prerequisite state, not verifier-facing helper files under `results/`.
125
138
  - If setup fails (for example, missing `mise trust` support), stop scenario execution and report infrastructure failure.
126
139
 
127
140
  ## Complete Example
@@ -137,17 +150,17 @@ cost-tier: smoke
137
150
  tags: [smoke, "use-case:lint"]
138
151
  e2e-justification: "Validates real subprocess behavior and report file generation"
139
152
  unit-coverage-reviewed:
140
- - test/molecules/lint_runner_test.rb
141
- - test/organisms/lint_orchestrator_test.rb
153
+ - test/fast/molecules/lint_runner_test.rb
154
+ - test/fast/organisms/lint_orchestrator_test.rb
142
155
  tool-under-test: ace-lint
143
156
  sandbox-layout:
144
- results/tc/01/: "help artifacts"
157
+ results/tc/01/: "Goal 1 outcome artifacts"
145
158
  requires:
146
159
  tools: [ace-lint, standardrb, jq]
147
160
  ruby: ">= 3.0"
148
161
  setup:
149
162
  - git-init
150
- - run: "cp $PROJECT_ROOT_PATH/mise.toml mise.toml && mise trust mise.toml"
163
+ - run: "cp ${ACE_E2E_SOURCE_ROOT:-$PROJECT_ROOT_PATH}/mise.toml mise.toml && mise trust mise.toml"
151
164
  - copy-fixtures
152
165
  - agent-env:
153
166
  PROJECT_ROOT_PATH: "."
@@ -179,4 +192,4 @@ test/e2e/TS-LINT-001-lint-pipeline/
179
192
  ├── TC-001-help-survey.runner.md
180
193
  ├── TC-001-help-survey.verify.md
181
194
  └── fixtures/
182
- ```
195
+ ```
@@ -29,9 +29,14 @@ Inline `.tc.md` and frontmatter `mode` values are no longer supported.
29
29
  - Scenario-level config files:
30
30
  - `runner.yml.md`
31
31
  - `verifier.yml.md`
32
- - TC artifacts write to `results/tc/{NN}/`
32
+ - TC outcome artifacts write to `results/tc/{NN}/`
33
33
  - Summary counters use `tcs-passed`, `tcs-failed`, and `tcs-total`
34
34
 
35
+ ## TC Styles
36
+
37
+ - **Public-surface**: prove a documented user job from docs/usage/`--help` and the CLI.
38
+ - **Retained-contract**: pin an integrated behavior with deterministic, explicitly declared supporting evidence when end-state checks alone are insufficient.
39
+
35
40
  ## File Naming
36
41
 
37
42
  - `TC-{NNN}` — test case number (e.g., TC-001)
@@ -77,12 +82,14 @@ Run `ace-lint` and produce report artifacts for a valid file.
77
82
  ## Workspace
78
83
 
79
84
  - Root: sandbox directory
80
- - Output: `results/tc/01/`
85
+ - Outcome artifacts: `results/tc/01/`
81
86
 
82
87
  ## Constraints
83
88
 
84
89
  - Use only sandbox paths
85
- - Keep evidence under `results/tc/01/`
90
+ - Keep only declared verifier-dependent evidence under `results/tc/01/`
91
+ - Declare exact paths for any verifier-dependent captures, for example ``results/tc/01/help.stdout`, `.stderr`, `.exit``
92
+ - Do not place helper inputs, manifests, PASS/FAIL summaries, or reflections under `results/tc/01/`
86
93
  - Execute actions only; do not assign PASS/FAIL or final verdicts
87
94
  ```
88
95
 
@@ -102,6 +109,7 @@ Example:
102
109
 
103
110
  - **Impact Checks**: target sandbox/project state changed as expected
104
111
  - **Artifact Checks**: `results/tc/01/report.json` exists and is valid
112
+ - **Runner Observations**: use harness-provided end-of-run observations only as supporting context
105
113
  - **Debug Fallback**: inspect `stdout`/`stderr`/`*.exit` only when primary checks are inconclusive
106
114
 
107
115
  ## Verdict
@@ -120,12 +128,22 @@ Pass only when all expectations are satisfied by on-disk evidence.
120
128
 
121
129
  - Keep each TC focused on one coherent behavior path.
122
130
  - Ensure goal numbers and TC numbers remain aligned (`TC-001` -> Goal 1).
131
+ - Choose the TC style up front: `public-surface` or `retained-contract`.
123
132
  - Keep runner files execution-only and verifier files verdict-only.
124
133
  - Make verifier expectations deterministic with impact-first ordering.
125
- - Keep all artifacts under `results/tc/{NN}/` to avoid cross-goal contamination.
134
+ - Keep `results/tc/{NN}/` for declared verifier-dependent evidence only.
135
+ - Declare every verifier-dependent path in the runner or setup. Do not rely on verifier-only references.
136
+ - Grouped capture shorthand is valid only for exact sibling files, for example ``foo.stdout`, `.stderr`, `.exit``.
137
+ - Do not use wildcard artifact paths.
138
+ - Use harness-provided runner observations as the only non-filesystem secondary evidence source.
139
+ - Prefer final sandbox state and real product output over raw debug captures.
140
+ - Do not ask the runner to write setup inputs, audit manifests, verifier-facing summaries, or final reflections for the verifier.
141
+ - Do not teach the runner hidden recipes or workaround sequences; if the path is not discoverable from docs/usage/`--help`, the TC is wrong or the public surface needs improvement.
142
+ - Use runner observations to record friction and workaround pressure, not to normalize them.
143
+ - For watch/live-output flows, use a bounded-session pattern with explicit shutdown and captured exit code.
126
144
  - Record why each scenario remains E2E via `e2e-justification` and `unit-coverage-reviewed` in `scenario.yml`.
127
145
 
128
146
  ## Related
129
147
 
130
148
  - [scenario.yml Reference](scenario-yml-reference.g.md)
131
- - [E2E Testing Guide](e2e-testing.g.md)
149
+ - [E2E Testing Guide](e2e-testing.g.md)
@@ -1,6 +1,6 @@
1
1
  ---
2
2
  name: as-e2e-fix
3
- description: Diagnose, fix, and rerun failing E2E tests systematically
3
+ description: Diagnose, fix, and rerun failing E2E tests systematically, generating failure analysis when needed
4
4
  # context: no-fork
5
5
  # agent: general-purpose
6
6
  user-invocable: true
@@ -32,4 +32,4 @@ skill:
32
32
  workflow: wfi://e2e/fix
33
33
  ---
34
34
 
35
- Load and run `ace-bundle wfi://e2e/fix` in the current project, then follow the loaded workflow as the source of truth and execute it end-to-end instead of only summarizing it.
35
+ Load and run `ace-bundle wfi://e2e/fix` in the current project, then follow the loaded workflow as the source of truth and execute it end-to-end instead of only summarizing it. If E2E failure analysis is missing or incomplete, generate it via `wfi://e2e/analyze-failures` as part of the fix workflow before applying changes.