ace-test-runner-e2e 0.29.8 → 0.38.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. checksums.yaml +4 -4
  2. data/.ace-defaults/e2e-runner/config.yml +14 -2
  3. data/CHANGELOG.md +178 -0
  4. data/README.md +2 -2
  5. data/exe/ace-test-e2e-sh +9 -4
  6. data/handbook/guides/e2e-testing.g.md +43 -9
  7. data/handbook/guides/scenario-yml-reference.g.md +16 -8
  8. data/handbook/guides/tc-authoring.g.md +12 -5
  9. data/handbook/skills/as-e2e-fix/SKILL.md +2 -2
  10. data/handbook/skills/as-e2e-review/SKILL.md +2 -2
  11. data/handbook/templates/ace-taskflow-fixture.template.md +17 -17
  12. data/handbook/templates/agent-experience-report.template.md +3 -2
  13. data/handbook/templates/scenario.yml.template.yml +7 -2
  14. data/handbook/templates/tc-file.template.md +14 -4
  15. data/handbook/workflow-instructions/e2e/analyze-failures.wf.md +53 -6
  16. data/handbook/workflow-instructions/e2e/create.wf.md +118 -25
  17. data/handbook/workflow-instructions/e2e/execute.wf.md +11 -7
  18. data/handbook/workflow-instructions/e2e/fix.wf.md +65 -15
  19. data/handbook/workflow-instructions/e2e/plan-changes.wf.md +17 -1
  20. data/handbook/workflow-instructions/e2e/review.wf.md +36 -25
  21. data/handbook/workflow-instructions/e2e/rewrite.wf.md +15 -8
  22. data/handbook/workflow-instructions/e2e/run.wf.md +50 -26
  23. data/handbook/workflow-instructions/e2e/setup-sandbox.wf.md +4 -4
  24. data/lib/ace/test/end_to_end_runner/atoms/skill_prompt_builder.rb +7 -5
  25. data/lib/ace/test/end_to_end_runner/atoms/skill_result_parser.rb +73 -7
  26. data/lib/ace/test/end_to_end_runner/cli/commands/run_test.rb +21 -8
  27. data/lib/ace/test/end_to_end_runner/models/test_case.rb +8 -2
  28. data/lib/ace/test/end_to_end_runner/models/test_result.rb +9 -3
  29. data/lib/ace/test/end_to_end_runner/models/test_scenario.rb +4 -2
  30. data/lib/ace/test/end_to_end_runner/molecules/affected_detector.rb +7 -2
  31. data/lib/ace/test/end_to_end_runner/molecules/bwrap_sandbox_backend.rb +271 -0
  32. data/lib/ace/test/end_to_end_runner/molecules/config_loader.rb +28 -1
  33. data/lib/ace/test/end_to_end_runner/molecules/integration_runner.rb +122 -0
  34. data/lib/ace/test/end_to_end_runner/molecules/pipeline_executor.rb +157 -16
  35. data/lib/ace/test/end_to_end_runner/molecules/pipeline_prompt_bundler.rb +121 -8
  36. data/lib/ace/test/end_to_end_runner/molecules/pipeline_report_generator.rb +91 -19
  37. data/lib/ace/test/end_to_end_runner/molecules/pipeline_sandbox_builder.rb +119 -18
  38. data/lib/ace/test/end_to_end_runner/molecules/report_writer.rb +13 -12
  39. data/lib/ace/test/end_to_end_runner/molecules/sandbox_runtime_builder.rb +282 -0
  40. data/lib/ace/test/end_to_end_runner/molecules/scenario_loader.rb +85 -5
  41. data/lib/ace/test/end_to_end_runner/molecules/setup_executor.rb +98 -16
  42. data/lib/ace/test/end_to_end_runner/molecules/suite_report_writer.rb +241 -97
  43. data/lib/ace/test/end_to_end_runner/molecules/test_discoverer.rb +38 -13
  44. data/lib/ace/test/end_to_end_runner/molecules/test_executor.rb +27 -5
  45. data/lib/ace/test/end_to_end_runner/organisms/suite_orchestrator.rb +73 -15
  46. data/lib/ace/test/end_to_end_runner/organisms/test_orchestrator.rb +120 -19
  47. data/lib/ace/test/end_to_end_runner/version.rb +1 -1
  48. data/lib/ace/test/end_to_end_runner.rb +2 -0
  49. metadata +19 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6206e4d6f65fe1ab5c27d1b5479e37af079b3554bd6289786aa71ce62e4ecf50
4
- data.tar.gz: bedb5fa2830bc1f2818e2246acbbab15ec6a7227898bf4dba2b23b6521eb8d5b
3
+ metadata.gz: 1a1e81b2b077a6bca7e75e1572743a31f20abe1ef5ebcb69ea82c7f55e95fd4b
4
+ data.tar.gz: e1791696e6cbbb58decab800387e005ee22f134734be2736d5c00f109273dd57
5
5
  SHA512:
6
- metadata.gz: 9d06bc8d9447debe2b48128b7c45ea0a357d01677b9b8b48fa508c5f8a078e8b4ed0396a4d4d38c06be25573701d4bd75b6f481438d116e3499b4b1890a9edd5
7
- data.tar.gz: b148663600b83ffde9821761a1ef4a7b43d11a2efc10d97433bb090d4ae2bfc8f7dd997756190cdafc894c497f1eada99faf3b78dcf7df38ed7eec9d57cedec3
6
+ metadata.gz: 143efde4ad09db543ff0865da3de1a94343c278a64c12f55f1786718725846b5cec565e144687b8c9d16bbc15736bf73abb34b30b49fa746a56ed522978e6434
7
+ data.tar.gz: b166bec29e9f10d0eff3b692d6526c22e0960252fb1a9d6d125c646b47a14137472775f73a9120365ebb608d7f2ccc38f88b4694e1b144165cf1a2994512ed2e
data/.ace-defaults/e2e-runner/config.yml CHANGED
@@ -2,12 +2,16 @@
2
2
  # This file provides defaults for the ace-test-runner-e2e gem
3
3
 
4
4
  paths:
5
+ # Preferred location for deterministic preflight tests in packages.
6
+ preflight: "test/feat"
5
7
  # Where test scenarios are stored in packages
6
8
  scenarios: "test/e2e"
7
9
  # Directory for test execution artifacts (gitignored)
8
10
  cache_dir: ".ace-local/test-e2e"
9
11
 
10
12
  patterns:
13
+ # Glob pattern for deterministic preflight tests.
14
+ preflight: "test/feat/**/*_test.rb"
11
15
  # Glob pattern for finding test scenarios (TS-format directories)
12
16
  discovery: "test/e2e/TS-*/scenario.yml"
13
17
 
@@ -38,13 +42,21 @@ reporting:
38
42
 
39
43
  # Execution defaults
40
44
  execution:
41
- # Default LLM provider:model for test execution
42
- provider: "role:e2e-executor"
45
+ # Legacy provider fallback when runner/verifier are not explicitly split
46
+ provider: "role:e2e-runner"
47
+ # LLM provider:model for runner execution
48
+ runner_provider: "role:e2e-runner"
49
+ # LLM provider:model for verifier execution
50
+ verifier_provider: "role:e2e-verifier"
43
51
  # Timeout per test in seconds
44
52
  timeout: 600
45
53
  # Number of tests to run in parallel (1 = sequential)
46
54
  parallel: 3
47
55
 
56
+ sandbox:
57
+ profile: "ace-default"
58
+ ruby_version: "3.4.9"
59
+
48
60
  # Provider configuration
49
61
  providers:
50
62
  # CLI providers use deterministic pipeline execution (runner + verifier)
data/CHANGELOG.md CHANGED
@@ -7,6 +7,184 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.38.11] - 2026-04-20
11
+
12
+ ### Fixed
13
+ - Spaced batch run IDs by 100ms in `TestOrchestrator` so generated 50ms-format IDs remain unique under fast consecutive suite execution.
14
+
15
+ ## [0.38.10] - 2026-04-19
16
+
17
+ ### Fixed
18
+ - Added strict runner ordering guidance, verifier artifact mtimes, and direct goal-number-to-TC mapping so E2E reports classify out-of-order postcondition captures as runner errors instead of shifting failed TC IDs.
19
+
20
+ ## [0.38.9] - 2026-04-19
21
+
22
+ ### Changed
23
+ - Strengthened the E2E failure-analysis and fix workflows to require explicit docs/help drift reporting for every failed TC, so stale usage docs or CLI help surfaced by E2E failures become concrete fix targets instead of hidden runner workarounds.
24
+
25
+ ## [0.38.8] - 2026-04-16
26
+
27
+ ### Fixed
28
+ - Synced protocol-source package trees into prepared sandboxes before deterministic setup, preserved the sanitized setup environment for runner and verifier execution, and tightened the shared runner contract to require direct `ace-*` commands with immediate `.stdout` / `.stderr` / `.exit` persistence.
29
+
30
+ ## [0.38.7] - 2026-04-16
31
+
32
+ ### Fixed
33
+ - Reused already prepared CLI-provider sandboxes during pipeline execution so the runner no longer rewrites tracked sandbox state after deterministic setup, which prevents staged-path failures caused by post-setup provider-directory symlinks.
34
+
35
+ ## [0.38.6] - 2026-04-16
36
+
37
+ ### Fixed
38
+ - Scoped declared sandbox-layout artifacts to the active test case, recorded present-versus-missing required artifacts in harness snapshots and report metadata, and passed that contract into verifier prompts.
39
+ - Added canonical goal-verdict reporting so generated scenario reports keep the authoritative failed-TC mapping even when narrative evidence includes contradictory wording.
40
+
41
+ ## [0.38.5] - 2026-04-16
42
+
43
+ ### Fixed
44
+ - Synced package protocol-source manifests into copied E2E sandboxes so bundled workflow and skill resolution continues to work after sandbox setup.
45
+ - Hardened the shared runner prompt contract to preserve sandbox runtime `PATH`/environment and forbid wrapper patterns that break direct `ace-*` execution.
46
+
47
+ ## [0.38.4] - 2026-04-16
48
+
49
+ ### Fixed
50
+ - Built a dedicated sandbox runtime for E2E runs with sandbox-local Gemfile, Bundler state, gem home, bin shims, verifier sandbox context, preserved report-directory reuse, and wrapper-compatible launch behavior so sandboxed commands stop leaking back into the source worktree.
51
+
52
+ ## [0.38.3] - 2026-04-16
53
+
54
+ ### Fixed
55
+ - Stripped inherited Bundler and Ruby env leakage from sandboxed E2E subprocesses, created sandbox-local Bundler state, preserved failure-stub report directories in suite aggregation, and aligned shared setup templates/docs with the `ACE_E2E_SOURCE_ROOT` source-root contract.
56
+
57
+ ## [0.38.2] - 2026-04-16
58
+
59
+ ### Fixed
60
+ - Prepared setup steps with sandbox runtime environment, hardened runtime directory permissions for tmux access, and kept sandbox support paths aligned with the active `bubblewrap` execution model.
61
+
62
+ ## [0.38.1] - 2026-04-15
63
+
64
+ ### Fixed
65
+ - Tightened the Linux `bubblewrap` sandbox mounts to preserve required device access such as `/dev/null` while keeping the host filesystem isolated.
66
+ - Moved sandbox support directories outside the copied repo workspace so E2E setup steps like `git add -A` no longer stage sandbox home, tmp, or runtime files.
67
+
68
+ ## [0.38.0] - 2026-04-15
69
+
70
+ ### Changed
71
+ - Rewrote `TS-RUNNER-001` to use public fixture-driven discovery (`copy-fixtures`) and expanded suite control-flow coverage beyond help-only output.
72
+ - Added `TS-RUNNER-002` to cover real non-dry run report generation, verifier-output evidence, and explicit `ace-test-e2e-sh` public shell-helper usage.
73
+ - Updated `docs/usage.md` with safe shell-helper workflows tied to deterministic `.ace-local/test-e2e/` report paths.
74
+
75
+ ### Fixed
76
+ - Routed setup/runner/verifier subprocesses through the new sandbox backend, kept user-facing verifier metadata in written reports, and taught the minimal verifier parser to accept standalone `Results: X/Y passed` summaries.
77
+
78
+ ## [0.37.2] - 2026-04-14
79
+
80
+ ### Changed
81
+ - Added a canonical public-surface gate across the E2E handbook so goal-based scenarios must prove both that the tool works and that a user can complete the job from docs, `--help`, and the public CLI without hidden recipes or workarounds.
82
+ - Updated the create/review/plan/rewrite/run/fix workflow guidance, shared guides, and templates to treat workaround-driven scenarios as invalid or at-risk and to record friction through runner observations instead of teaching fallback procedures.
83
+
84
+ ## [0.37.1] - 2026-04-13
85
+
86
+ ### Changed
87
+ - Updated the canonical E2E create/review/rewrite/run guidance, templates, and references so goal-based scenarios are written around final sandbox state plus runner observations instead of helper artifacts under `results/`.
88
+
89
+ ## [0.37.0] - 2026-04-13
90
+
91
+ ### Changed
92
+ - Made runner `Observations` the canonical non-filesystem evidence channel for goal-based E2E scenarios, passed them directly into verifier prompts, and persisted them through the harness-managed report surface.
93
+ - Updated the shared E2E template, authoring guides, and rewrite/run workflows to require goal achievement from sandbox end state first, using runner observations as the only secondary evidence source instead of helper artifacts under `results/`.
94
+
95
+ ## [0.36.1] - 2026-04-13
96
+
97
+ ### Fixed
98
+ - Preferred canonical per-scenario `report.md` metadata when building aggregate package and suite reports so failed TC mappings no longer drift from the underlying scenario reports.
99
+ - Added explicit dirty-worktree diagnostics to suite reporting so tracked repo mutations are surfaced as runner diagnostics instead of being inferred after the fact.
100
+
101
+ ### Changed
102
+ - Updated the canonical E2E failure-analysis and fix workflows plus usage guidance to treat aggregate reports as indexes and per-scenario reports as the source of truth for TC-level triage.
103
+
104
+ ## [0.36.0] - 2026-04-13
105
+
106
+ ### Fixed
107
+ - Renamed aggregated E2E outputs to scope-specific package and suite report filenames instead of the ambiguous shared `final-report` label.
108
+ - Stripped ambient `TMUX` and `TMUX_PANE` state from setup and pipeline subprocess environments so E2E runs do not accidentally attach to the operator's live tmux session.
109
+
110
+ ### Technical
111
+ - Updated suite orchestrator/report writer coverage and E2E workflow guidance around the explicit package-vs-suite report contract.
112
+
113
+ ## [0.35.0] - 2026-04-13
114
+ ### Changed
115
+ - **ace-test-runner-e2e v0.35.0**: Added optional scenario artifact declarations via `(optional)`, separated required and optional artifact tracking, and included optional outputs in manifests and snapshots without failing scenarios when they are absent.
116
+
117
+ ## [0.34.1] - 2026-04-13
118
+
119
+ ### Changed
120
+ - Completed the batch i05 migration follow-through for this package and aligned it with the restarted `fast` / `feat` / `e2e` verification model.
121
+
122
+ ### Technical
123
+ - Included in the coordinated assignment-driven patch release for batch i05 package updates.
124
+
125
+
126
+ ## [0.34.0] - 2026-04-12
127
+
128
+ ### Changed
129
+ - Migrated package deterministic tests to the restarted `fast`/`feat` layout by moving `test/atoms`, `test/commands`, `test/handbook`, `test/models`, `test/molecules`, and `test/organisms` under `test/fast/`, and moving legacy `test/integration` coverage into `test/feat/`.
130
+ - Updated package docs and CLI wording to teach `fast`/`feat` deterministic coverage plus scenario-only `test/e2e` execution via `ace-test-e2e`.
131
+ - Refreshed `TS-RUNNER-001` scenario metadata and decision-record unit coverage references to point at migrated `test/fast` paths.
132
+
133
+ ## [0.33.1] - 2026-04-12
134
+
135
+ ### Fixed
136
+ - Made suite final reports deterministic for canonical sections by deriving summary rows, failed-test details, reports tables, and the overall line from runtime results instead of model-authored prose.
137
+ - Added regression coverage so hallucinated scenario titles, failed TC IDs, and duplicate overall lines are ignored or replaced before report files are written.
138
+
139
+ ## [0.33.0] - 2026-04-11
140
+
141
+ ### Changed
142
+ - Made `wfi://e2e/fix` a self-bootstrapping workflow that reuses existing failure analysis when present and generates it via `wfi://e2e/analyze-failures` when missing or incomplete.
143
+ - Updated the canonical `as-e2e-fix` skill contract to state that missing analysis is generated automatically before fixes are applied.
144
+
145
+ ### Technical
146
+
147
+ - Refactored `ConfigLoader` molecule tests to use config mock mode, removing dependency on monorepo `.ace` overrides and making the test contract stable across environments.
148
+
149
+ ## [0.32.2] - 2026-04-11
150
+
151
+ ### Fixed
152
+ - Generated per-scenario CLI batch `run_id`s from explicit 50ms timestamp buckets so parallel package runs no longer occasionally reuse the same report-path ID and trip the unique-run-id orchestration contract.
153
+
154
+ ## [0.32.1] - 2026-04-11
155
+
156
+ ### Technical
157
+ - Synced the canonical `as-e2e-review` skill description with the package-targeted assign verification contract so shipped metadata no longer implies broader scenario-sweep execution.
158
+
159
+ ## [0.31.0] - 2026-04-10
160
+
161
+ ### Changed
162
+ - Restored the two-phase E2E harness to run deterministic `test/integration` coverage before agent scenarios from `test/e2e`, with integration failures short-circuiting scenario execution.
163
+ - Added deterministic integration execution, richer per-test-case manifests and artifact snapshotting, and refreshed CLI/docs/workflows/tests around the restarted layout and role-based runner/verifier contract.
164
+
165
+ ### Fixed
166
+ - Accepted minimal verifier evidence responses in the runner pipeline so successful scenario runs no longer fail when a verifier omits the full structured envelope.
167
+
168
+ ## [0.30.2] - 2026-04-10
169
+
170
+ ### Fixed
171
+ - Surface `git diff` stderr when affected-package detection fails so invalid refs and shallow-clone failures no longer look like empty affected sets.
172
+
173
+ ## [0.30.1] - 2026-04-10
174
+
175
+ ### Fixed
176
+ - Raised the `ace-support-test-helpers` runtime dependency floor to `~> 0.14` so released installs accept the shared sandbox package-copy helper line used by the restarted runner.
177
+ - Restored the `TS-RUNNER-001` smoke scenario fixture source path so the CLI smoke scenario resolves its canonical demo fixture again.
178
+
179
+ ## [0.30.0] - 2026-04-10
180
+
181
+ ### Changed
182
+ - Reworked `ace-test-runner-e2e` back into a two-phase contract, with deterministic integration from `test/integration` before agent scenarios from `test/e2e`.
183
+ - Switched sandbox orchestration to the shared package-copy helper and refreshed CLI/docs/workflows for the restarted E2E structure.
184
+
185
+ ### Fixed
186
+ - Hardened affected-file detection by capturing git diff stderr so provider-side affected checks fail with clearer diagnostics.
187
+
10
188
  ## [0.29.8] - 2026-04-01
11
189
 
12
190
  ### Fixed
data/README.md CHANGED
@@ -18,11 +18,11 @@
18
18
 
19
19
  ![ace-test-runner-e2e demo](docs/demo/ace-test-runner-e2e-getting-started.gif)
20
20
 
21
- `ace-test-runner-e2e` runs realistic workflow scenarios through coding agents so teams can validate behavior beyond unit and integration coverage while keeping execution reproducible and isolated from the working tree.
21
+ `ace-test-runner-e2e` runs realistic workflow scenarios through coding agents so teams can validate behavior beyond deterministic package tests while keeping execution reproducible and isolated from the working tree.
22
22
 
23
23
  ## How It Works
24
24
 
25
- 1. Discover E2E scenario definitions from package-local `test/e2e/` suites with metadata, tags, and command flows.
25
+ 1. Discover deterministic preflight tests from package-local `test/feat/` and agent scenarios from `test/e2e/`, preserving metadata, tags, and command flows.
26
26
  2. Execute scenarios inside reproducible sandboxes that isolate agent runs from the working tree.
27
27
  3. Produce structured reports that are easy to inspect, compare across runs, and feed back into triage workflows.
28
28
 
data/exe/ace-test-e2e-sh CHANGED
@@ -1,6 +1,8 @@
1
1
  #!/usr/bin/env ruby
2
2
  # frozen_string_literal: true
3
3
 
4
+ require_relative "../lib/ace/test/end_to_end_runner"
5
+
4
6
  # ace-test-e2e-sh - Execute commands within E2E test sandbox
5
7
  #
6
8
  # Usage:
@@ -57,11 +59,14 @@ unless Dir.exist?(test_dir)
57
59
  exit 1
58
60
  end
59
61
 
60
- Dir.chdir(test_dir)
61
- ENV["PROJECT_ROOT_PATH"] = test_dir
62
+ backend = Ace::Test::EndToEndRunner::Molecules::BwrapSandboxBackend.new(
63
+ sandbox_root: test_dir,
64
+ source_root: ENV["ACE_E2E_SOURCE_ROOT"]
65
+ )
66
+ env = backend.prepared_env("PROJECT_ROOT_PATH" => test_dir, "ACE_E2E_SOURCE_ROOT" => ENV["ACE_E2E_SOURCE_ROOT"])
62
67
 
63
68
  if ARGV.empty?
64
- exec "bash"
69
+ backend.exec(["bash"], chdir: test_dir, env: env)
65
70
  else
66
- exec(*ARGV)
71
+ backend.exec(ARGV, chdir: test_dir, env: env)
67
72
  end
data/handbook/guides/e2e-testing.g.md CHANGED
@@ -3,8 +3,8 @@ doc-type: guide
3
3
  title: E2E Testing Guide
4
4
  purpose: Conventions and best practices for agent-executed end-to-end tests
5
5
  ace-docs:
6
- last-updated: 2026-03-12
7
- last-checked: 2026-03-21
6
+ last-updated: 2026-04-19
7
+ last-checked: 2026-04-19
8
8
  ---
9
9
 
10
10
  # E2E Testing Guide
@@ -12,6 +12,7 @@ ace-docs:
12
12
  ## Overview
13
13
 
14
14
  E2E tests are executed by an AI agent and reserved for behaviors that require real CLI execution, real tools, and real filesystem side effects.
15
+ They must also answer a user-journey question: can a user do the job from the tool's public surface, and how much friction does that journey have?
15
16
 
16
17
  ## Canonical Conventions
17
18
 
@@ -24,7 +25,7 @@ E2E tests are executed by an AI agent and reserved for behaviors that require re
24
25
  - `TC-*.verify.md`
25
26
  - `runner.yml.md`
26
27
  - `verifier.yml.md`
27
- - TC artifacts use `results/tc/{NN}/`
28
+ - TC outcome artifacts use `results/tc/{NN}/`
28
29
  - Summary reports use `tcs-passed`, `tcs-failed`, `tcs-total`, and `failed[].tc`
29
30
  - Scenarios declare `tags` for discovery-time filtering via `--tags`/`--exclude-tags`
30
31
 
@@ -32,15 +33,19 @@ E2E tests are executed by an AI agent and reserved for behaviors that require re
32
33
 
33
34
  - Runner is **execution-only**:
34
35
  - perform user-like CLI actions in sandbox
35
- - produce evidence files under `results/tc/{NN}/`
36
+ - produce only final outcome evidence under `results/tc/{NN}/`
37
+ - return final runner observations through the harness contract
36
38
  - do not issue PASS/FAIL verdicts
37
39
  - do not perform verifier-style assertion/classification
40
+ - do not invent workarounds or hidden command recipes to compensate for docs/help/CLI gaps
38
41
  - Verifier is **verification-only**:
39
42
  - evaluate TC outcome from sandbox evidence
43
+ - use runner observations as the only non-filesystem secondary evidence source
40
44
  - apply an **impact-first** evidence order:
41
45
  1. sandbox/project state impact
42
- 2. explicit TC artifacts
43
- 3. debug captures (`stdout`, `stderr`, `*.exit`, metadata) only as fallback
46
+ 2. runner observations
47
+ 3. explicit TC artifacts that are true product outcomes
48
+ 4. debug captures (`stdout`, `stderr`, `*.exit`, metadata) only as fallback
44
49
  - Setup ownership:
45
50
  - sandbox preparation belongs to `scenario.yml` `setup:` + `fixtures/`
46
51
  - TC runner files must not define independent environment setup procedures
@@ -52,7 +57,25 @@ Before adding a TC, confirm the behavior needs:
52
57
  - real external tools/processes
53
58
  - real filesystem I/O and environment state
54
59
 
55
- If not, keep coverage in unit/integration tests.
60
+ If not, keep coverage in `fast`/`feat` tests.
61
+
62
+ ## Public-Surface Gate
63
+
64
+ Before keeping or adding a goal-style TC, confirm the user job is achievable from:
65
+ - package README / usage docs
66
+ - `--help`
67
+ - declared fixtures/setup
68
+ - the tool under test itself
69
+
70
+ Reject or rewrite the TC if it depends on:
71
+ - hidden recipes embedded in runner instructions
72
+ - workaround branches for unsupported or undocumented behavior
73
+ - direct supporting-tool probes as the primary oracle
74
+ - internal details that are not necessary to prove the user job
75
+
76
+ When an E2E failure shows that a valid user job is not discoverable from docs, usage guides, or `--help`, treat that as
77
+ docs/help drift. Failure analysis must record the stale or missing public surface and the exact docs/help target to
78
+ update instead of teaching the runner a workaround.
56
79
 
57
80
  ## Cost and Scope
58
81
 
@@ -79,6 +102,7 @@ The verifier is always-on for standalone goal-mode TCs in the CLI pipeline. For
79
102
  ## Scenario Layout
80
103
 
81
104
  ```text
105
+ {package}/test/feat/**/*_test.rb
82
106
  {package}/test/e2e/TS-{AREA}-{NNN}-{slug}/
83
107
  scenario.yml
84
108
  runner.yml.md
@@ -101,9 +125,14 @@ This prevents duplicate assertions across test layers.
101
125
  ## Authoring Rules
102
126
 
103
127
  - Keep runner goals outcome-oriented and deterministic.
128
+ - Keep runner goals aligned with the public user path; if the runner needs a workaround, surface that as friction rather than teaching the workaround.
104
129
  - Keep verifier expectations impact-first, then artifacts, then debug fallback.
105
130
  - Preserve strict TC pairing (`runner` + `verify`).
106
- - Keep outputs inside `results/tc/{NN}/`.
131
+ - Keep `results/tc/{NN}/` for outcome artifacts only.
132
+ - Do not instruct runners to create helper YAML, path files, command files, or reflections in `results/`.
133
+ - Do not judge success from runner-authored summaries when final sandbox state can prove the goal directly.
134
+ - Use runner observations only to explain ambiguity or missing side effects, not to replace missing end-state evidence.
135
+ - Treat any workaround noted in runner observations as a product/docs/help or scenario-design smell that must be fixed, not preserved.
107
136
  - Avoid hidden dependencies between TCs unless explicitly intended.
108
137
 
109
138
  ## Execution Artifacts
@@ -121,4 +150,9 @@ Before approving new/updated E2E tests:
121
150
  - [ ] `runner.yml.md` and `verifier.yml.md` exist
122
151
  - [ ] Every TC has both `.runner.md` and `.verify.md`
123
152
  - [ ] Artifacts are scoped to `results/tc/{NN}/`
124
- - [ ] Value-gate metadata is present (`e2e-justification`, `unit-coverage-reviewed`, `cost-tier`)
153
+ - [ ] Verifier primary oracle is final sandbox state or real product output, not helper artifacts
154
+ - [ ] Runner observations are the only non-filesystem secondary evidence source
155
+ - [ ] Scenario can be completed from docs/usage/`--help` without hidden recipes or workaround instructions
156
+ - [ ] Any friction/workaround found during review is treated as a gap, not as a runner script opportunity
157
+ - [ ] Failure analysis records docs/help drift from failed public user paths, or explicitly records `None`
158
+ - [ ] Value-gate metadata is present (`e2e-justification`, `unit-coverage-reviewed`, `cost-tier`)
data/handbook/guides/scenario-yml-reference.g.md CHANGED
@@ -46,14 +46,14 @@ Example: `ace-lint/test/e2e/TS-LINT-001-lint-pipeline/scenario.yml`
46
46
  |-------|------|---------|-------------|
47
47
  | `priority` | string | `medium` | Test priority: `high`, `medium`, `low` |
48
48
  | `tool-under-test` | string | — | Primary command/tool validated |
49
- | `sandbox-layout` | object | `{}` | Declared artifact paths and expected outputs |
49
+ | `sandbox-layout` | object | `{}` | Outcome-path hints used to precreate directories and guide verification |
50
50
  | `duration` | string | — | Estimated duration (e.g., `~15min`) |
51
51
  | `timeout` | integer | — | Optional per-scenario execution timeout in seconds |
52
52
  | `automation-candidate` | boolean | `false` | Whether test is automatable |
53
53
  | `tags` | array | `[]` | Scenario tags for filtering with `--tags`/`--exclude-tags` (OR semantics) |
54
54
  | `cost-tier` | string | `smoke` | Run profile: `smoke`, `happy-path`, `deep` |
55
55
  | `e2e-justification` | string | — | Why E2E is needed |
56
- | `unit-coverage-reviewed` | array | `[]` | Unit/integration files reviewed |
56
+ | `unit-coverage-reviewed` | array | `[]` | Deterministic test files reviewed (`test/fast` and/or `test/feat`) |
57
57
  | `requires` | object | — | Test prerequisites |
58
58
  | `setup` | array | `[]` | Setup directives before execution |
59
59
  | `last-verified` | string | — | Last successful verification date |
@@ -73,6 +73,8 @@ Pairing rule:
73
73
  Artifact layout conventions:
74
74
  - canonical: `results/tc/{NN}/`
75
75
  - avoid non-TC-scoped result folders
76
+ - keep only real outcome artifacts under `results/tc/{NN}/`; runner observations live in harness reports, not sandbox helper files
77
+ - absence of a declared path is debug context, not a standalone failure reason
76
78
 
77
79
  Canonical summary report fields:
78
80
  - `tcs-passed`
@@ -83,6 +85,7 @@ Canonical summary report fields:
83
85
  Role contract:
84
86
  - `runner.yml.md` + `TC-*.runner.md` are execution-only.
85
87
  - `verifier.yml.md` + `TC-*.verify.md` are verification-only with impact-first checks.
88
+ - Goal-style scenarios should be solvable from the public surface (docs/usage/`--help` + tool under test) without hidden recipes or workaround instructions.
86
89
 
87
90
  ## `requires` Object
88
91
 
@@ -92,6 +95,11 @@ requires:
92
95
  ruby: ">= 3.0"
93
96
  ```
94
97
 
98
+ `requires.tools` rules:
99
+ - declare execution prerequisites and supporting environment dependencies
100
+ - do not use `requires.tools` as permission to make fallback probes the primary oracle
101
+ - for ACE CLI scenarios, support tools are setup/dependency context unless the scenario is explicitly about that support tool itself
102
+
95
103
  ## `setup` Directives
96
104
 
97
105
  Available directives:
@@ -112,7 +120,7 @@ setup:
112
120
  - git-init
113
121
  - tmux-session:
114
122
  name-source: run-id
115
- - run: "cp $PROJECT_ROOT_PATH/mise.toml mise.toml && mise trust mise.toml"
123
+ - run: "cp ${ACE_E2E_SOURCE_ROOT:-$PROJECT_ROOT_PATH}/mise.toml mise.toml && mise trust mise.toml"
116
124
  - copy-fixtures
117
125
  - run: git add -A && git commit -m "initial" --quiet
118
126
  - agent-env:
@@ -137,17 +145,17 @@ cost-tier: smoke
137
145
  tags: [smoke, "use-case:lint"]
138
146
  e2e-justification: "Validates real subprocess behavior and report file generation"
139
147
  unit-coverage-reviewed:
140
- - test/molecules/lint_runner_test.rb
141
- - test/organisms/lint_orchestrator_test.rb
148
+ - test/fast/molecules/lint_runner_test.rb
149
+ - test/fast/organisms/lint_orchestrator_test.rb
142
150
  tool-under-test: ace-lint
143
151
  sandbox-layout:
144
- results/tc/01/: "help artifacts"
152
+ results/tc/01/: "Goal 1 outcome artifacts"
145
153
  requires:
146
154
  tools: [ace-lint, standardrb, jq]
147
155
  ruby: ">= 3.0"
148
156
  setup:
149
157
  - git-init
150
- - run: "cp $PROJECT_ROOT_PATH/mise.toml mise.toml && mise trust mise.toml"
158
+ - run: "cp ${ACE_E2E_SOURCE_ROOT:-$PROJECT_ROOT_PATH}/mise.toml mise.toml && mise trust mise.toml"
151
159
  - copy-fixtures
152
160
  - agent-env:
153
161
  PROJECT_ROOT_PATH: "."
@@ -179,4 +187,4 @@ test/e2e/TS-LINT-001-lint-pipeline/
179
187
  ├── TC-001-help-survey.runner.md
180
188
  ├── TC-001-help-survey.verify.md
181
189
  └── fixtures/
182
- ```
190
+ ```
data/handbook/guides/tc-authoring.g.md CHANGED
@@ -29,7 +29,7 @@ Inline `.tc.md` and frontmatter `mode` values are no longer supported.
29
29
  - Scenario-level config files:
30
30
  - `runner.yml.md`
31
31
  - `verifier.yml.md`
32
- - TC artifacts write to `results/tc/{NN}/`
32
+ - TC outcome artifacts write to `results/tc/{NN}/`
33
33
  - Summary counters use `tcs-passed`, `tcs-failed`, and `tcs-total`
34
34
 
35
35
  ## File Naming
@@ -77,12 +77,13 @@ Run `ace-lint` and produce report artifacts for a valid file.
77
77
  ## Workspace
78
78
 
79
79
  - Root: sandbox directory
80
- - Output: `results/tc/01/`
80
+ - Outcome artifacts: `results/tc/01/`
81
81
 
82
82
  ## Constraints
83
83
 
84
84
  - Use only sandbox paths
85
- - Keep evidence under `results/tc/01/`
85
+ - Keep only final outcome evidence under `results/tc/01/`
86
+ - Do not place helper inputs, manifests, command transcripts, or reflections under `results/tc/01/`
86
87
  - Execute actions only; do not assign PASS/FAIL or final verdicts
87
88
  ```
88
89
 
@@ -102,6 +103,7 @@ Example:
102
103
 
103
104
  - **Impact Checks**: target sandbox/project state changed as expected
104
105
  - **Artifact Checks**: `results/tc/01/report.json` exists and is valid
106
+ - **Runner Observations**: use harness-provided end-of-run observations only as supporting context
105
107
  - **Debug Fallback**: inspect `stdout`/`stderr`/`*.exit` only when primary checks are inconclusive
106
108
 
107
109
  ## Verdict
@@ -122,10 +124,15 @@ Pass only when all expectations are satisfied by on-disk evidence.
122
124
  - Ensure goal numbers and TC numbers remain aligned (`TC-001` -> Goal 1).
123
125
  - Keep runner files execution-only and verifier files verdict-only.
124
126
  - Make verifier expectations deterministic with impact-first ordering.
125
- - Keep all artifacts under `results/tc/{NN}/` to avoid cross-goal contamination.
127
+ - Keep `results/tc/{NN}/` for outcome artifacts only.
128
+ - Use harness-provided runner observations as the only non-filesystem secondary evidence source.
129
+ - Prefer final sandbox state and real product output over raw debug captures.
130
+ - Do not ask the runner to write setup inputs, audit manifests, or final reflections for the verifier.
131
+ - Do not teach the runner hidden recipes or workaround sequences; if the path is not discoverable from docs/usage/`--help`, the TC is wrong or the public surface needs improvement.
132
+ - Use runner observations to record friction and workaround pressure, not to normalize it.
126
133
  - Record why each scenario remains E2E via `e2e-justification` and `unit-coverage-reviewed` in `scenario.yml`.
127
134
 
128
135
  ## Related
129
136
 
130
137
  - [scenario.yml Reference](scenario-yml-reference.g.md)
131
- - [E2E Testing Guide](e2e-testing.g.md)
138
+ - [E2E Testing Guide](e2e-testing.g.md)
data/handbook/skills/as-e2e-fix/SKILL.md CHANGED
@@ -1,6 +1,6 @@
1
1
  ---
2
2
  name: as-e2e-fix
3
- description: Diagnose, fix, and rerun failing E2E tests systematically
3
+ description: Diagnose, fix, and rerun failing E2E tests systematically, generating failure analysis when needed
4
4
  # context: no-fork
5
5
  # agent: general-purpose
6
6
  user-invocable: true
@@ -32,4 +32,4 @@ skill:
32
32
  workflow: wfi://e2e/fix
33
33
  ---
34
34
 
35
- Load and run `ace-bundle wfi://e2e/fix` in the current project, then follow the loaded workflow as the source of truth and execute it end-to-end instead of only summarizing it.
35
+ Load and run `ace-bundle wfi://e2e/fix` in the current project, then follow the loaded workflow as the source of truth and execute it end-to-end instead of only summarizing it. If E2E failure analysis is missing or incomplete, generate it via `wfi://e2e/analyze-failures` as part of the fix workflow before applying changes.
data/handbook/skills/as-e2e-review/SKILL.md CHANGED
@@ -1,6 +1,6 @@
1
1
  ---
2
2
  name: as-e2e-review
3
- description: Deep exploration producing a coverage matrix of functionality, unit tests, and E2E tests
3
+ description: Review E2E coverage for modified packages and run targeted package scenarios
4
4
  # bundle: wfi://e2e/review
5
5
  # agent: general-purpose
6
6
  user-invocable: true
@@ -24,7 +24,7 @@ assign:
24
24
  source: wfi://e2e/review
25
25
  steps:
26
26
  - name: verify-e2e
27
- description: Review E2E coverage for modified packages and run targeted scenarios
27
+ description: Review E2E coverage for modified packages and run targeted package scenarios
28
28
  tags: [testing, e2e, verification]
29
29
  skill:
30
30
  kind: workflow