ace-test-runner-e2e 0.29.8 → 0.38.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.ace-defaults/e2e-runner/config.yml +14 -2
- data/CHANGELOG.md +178 -0
- data/README.md +2 -2
- data/exe/ace-test-e2e-sh +9 -4
- data/handbook/guides/e2e-testing.g.md +43 -9
- data/handbook/guides/scenario-yml-reference.g.md +16 -8
- data/handbook/guides/tc-authoring.g.md +12 -5
- data/handbook/skills/as-e2e-fix/SKILL.md +2 -2
- data/handbook/skills/as-e2e-review/SKILL.md +2 -2
- data/handbook/templates/ace-taskflow-fixture.template.md +17 -17
- data/handbook/templates/agent-experience-report.template.md +3 -2
- data/handbook/templates/scenario.yml.template.yml +7 -2
- data/handbook/templates/tc-file.template.md +14 -4
- data/handbook/workflow-instructions/e2e/analyze-failures.wf.md +53 -6
- data/handbook/workflow-instructions/e2e/create.wf.md +118 -25
- data/handbook/workflow-instructions/e2e/execute.wf.md +11 -7
- data/handbook/workflow-instructions/e2e/fix.wf.md +65 -15
- data/handbook/workflow-instructions/e2e/plan-changes.wf.md +17 -1
- data/handbook/workflow-instructions/e2e/review.wf.md +36 -25
- data/handbook/workflow-instructions/e2e/rewrite.wf.md +15 -8
- data/handbook/workflow-instructions/e2e/run.wf.md +50 -26
- data/handbook/workflow-instructions/e2e/setup-sandbox.wf.md +4 -4
- data/lib/ace/test/end_to_end_runner/atoms/skill_prompt_builder.rb +7 -5
- data/lib/ace/test/end_to_end_runner/atoms/skill_result_parser.rb +73 -7
- data/lib/ace/test/end_to_end_runner/cli/commands/run_test.rb +21 -8
- data/lib/ace/test/end_to_end_runner/models/test_case.rb +8 -2
- data/lib/ace/test/end_to_end_runner/models/test_result.rb +9 -3
- data/lib/ace/test/end_to_end_runner/models/test_scenario.rb +4 -2
- data/lib/ace/test/end_to_end_runner/molecules/affected_detector.rb +7 -2
- data/lib/ace/test/end_to_end_runner/molecules/bwrap_sandbox_backend.rb +271 -0
- data/lib/ace/test/end_to_end_runner/molecules/config_loader.rb +28 -1
- data/lib/ace/test/end_to_end_runner/molecules/integration_runner.rb +122 -0
- data/lib/ace/test/end_to_end_runner/molecules/pipeline_executor.rb +157 -16
- data/lib/ace/test/end_to_end_runner/molecules/pipeline_prompt_bundler.rb +121 -8
- data/lib/ace/test/end_to_end_runner/molecules/pipeline_report_generator.rb +91 -19
- data/lib/ace/test/end_to_end_runner/molecules/pipeline_sandbox_builder.rb +119 -18
- data/lib/ace/test/end_to_end_runner/molecules/report_writer.rb +13 -12
- data/lib/ace/test/end_to_end_runner/molecules/sandbox_runtime_builder.rb +282 -0
- data/lib/ace/test/end_to_end_runner/molecules/scenario_loader.rb +85 -5
- data/lib/ace/test/end_to_end_runner/molecules/setup_executor.rb +98 -16
- data/lib/ace/test/end_to_end_runner/molecules/suite_report_writer.rb +241 -97
- data/lib/ace/test/end_to_end_runner/molecules/test_discoverer.rb +38 -13
- data/lib/ace/test/end_to_end_runner/molecules/test_executor.rb +27 -5
- data/lib/ace/test/end_to_end_runner/organisms/suite_orchestrator.rb +73 -15
- data/lib/ace/test/end_to_end_runner/organisms/test_orchestrator.rb +120 -19
- data/lib/ace/test/end_to_end_runner/version.rb +1 -1
- data/lib/ace/test/end_to_end_runner.rb +2 -0
- metadata +19 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 1a1e81b2b077a6bca7e75e1572743a31f20abe1ef5ebcb69ea82c7f55e95fd4b
|
|
4
|
+
data.tar.gz: e1791696e6cbbb58decab800387e005ee22f134734be2736d5c00f109273dd57
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 143efde4ad09db543ff0865da3de1a94343c278a64c12f55f1786718725846b5cec565e144687b8c9d16bbc15736bf73abb34b30b49fa746a56ed522978e6434
|
|
7
|
+
data.tar.gz: b166bec29e9f10d0eff3b692d6526c22e0960252fb1a9d6d125c646b47a14137472775f73a9120365ebb608d7f2ccc38f88b4694e1b144165cf1a2994512ed2e
|
|
@@ -2,12 +2,16 @@
|
|
|
2
2
|
# This file provides defaults for the ace-test-runner-e2e gem
|
|
3
3
|
|
|
4
4
|
paths:
|
|
5
|
+
# Preferred location for deterministic preflight tests in packages.
|
|
6
|
+
preflight: "test/feat"
|
|
5
7
|
# Where test scenarios are stored in packages
|
|
6
8
|
scenarios: "test/e2e"
|
|
7
9
|
# Directory for test execution artifacts (gitignored)
|
|
8
10
|
cache_dir: ".ace-local/test-e2e"
|
|
9
11
|
|
|
10
12
|
patterns:
|
|
13
|
+
# Glob pattern for deterministic preflight tests.
|
|
14
|
+
preflight: "test/feat/**/*_test.rb"
|
|
11
15
|
# Glob pattern for finding test scenarios (TS-format directories)
|
|
12
16
|
discovery: "test/e2e/TS-*/scenario.yml"
|
|
13
17
|
|
|
@@ -38,13 +42,21 @@ reporting:
|
|
|
38
42
|
|
|
39
43
|
# Execution defaults
|
|
40
44
|
execution:
|
|
41
|
-
#
|
|
42
|
-
provider: "role:e2e-
|
|
45
|
+
# Legacy provider fallback when runner/verifier are not explicitly split
|
|
46
|
+
provider: "role:e2e-runner"
|
|
47
|
+
# LLM provider:model for runner execution
|
|
48
|
+
runner_provider: "role:e2e-runner"
|
|
49
|
+
# LLM provider:model for verifier execution
|
|
50
|
+
verifier_provider: "role:e2e-verifier"
|
|
43
51
|
# Timeout per test in seconds
|
|
44
52
|
timeout: 600
|
|
45
53
|
# Number of tests to run in parallel (1 = sequential)
|
|
46
54
|
parallel: 3
|
|
47
55
|
|
|
56
|
+
sandbox:
|
|
57
|
+
profile: "ace-default"
|
|
58
|
+
ruby_version: "3.4.9"
|
|
59
|
+
|
|
48
60
|
# Provider configuration
|
|
49
61
|
providers:
|
|
50
62
|
# CLI providers use deterministic pipeline execution (runner + verifier)
|
data/CHANGELOG.md
CHANGED
|
@@ -7,6 +7,184 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [0.38.11] - 2026-04-20
|
|
11
|
+
|
|
12
|
+
### Fixed
|
|
13
|
+
- Spaced batch run IDs by 100ms in `TestOrchestrator` so generated 50ms-format IDs remain unique under fast consecutive suite execution.
|
|
14
|
+
|
|
15
|
+
## [0.38.10] - 2026-04-19
|
|
16
|
+
|
|
17
|
+
### Fixed
|
|
18
|
+
- Added strict runner ordering guidance, verifier artifact mtimes, and direct goal-number-to-TC mapping so E2E reports classify out-of-order postcondition captures as runner errors instead of shifting failed TC IDs.
|
|
19
|
+
|
|
20
|
+
## [0.38.9] - 2026-04-19
|
|
21
|
+
|
|
22
|
+
### Changed
|
|
23
|
+
- Strengthened the E2E failure-analysis and fix workflows to require explicit docs/help drift reporting for every failed TC, so stale usage docs or CLI help surfaced by E2E failures become concrete fix targets instead of hidden runner workarounds.
|
|
24
|
+
|
|
25
|
+
## [0.38.8] - 2026-04-16
|
|
26
|
+
|
|
27
|
+
### Fixed
|
|
28
|
+
- Synced protocol-source package trees into prepared sandboxes before deterministic setup, preserved the sanitized setup environment for runner and verifier execution, and tightened the shared runner contract to require direct `ace-*` commands with immediate `.stdout` / `.stderr` / `.exit` persistence.
|
|
29
|
+
|
|
30
|
+
## [0.38.7] - 2026-04-16
|
|
31
|
+
|
|
32
|
+
### Fixed
|
|
33
|
+
- Reused already prepared CLI-provider sandboxes during pipeline execution so the runner no longer rewrites tracked sandbox state after deterministic setup, which prevents staged-path failures caused by post-setup provider-directory symlinks.
|
|
34
|
+
|
|
35
|
+
## [0.38.6] - 2026-04-16
|
|
36
|
+
|
|
37
|
+
### Fixed
|
|
38
|
+
- Scoped declared sandbox-layout artifacts to the active test case, recorded present-versus-missing required artifacts in harness snapshots and report metadata, and passed that contract into verifier prompts.
|
|
39
|
+
- Added canonical goal-verdict reporting so generated scenario reports keep the authoritative failed-TC mapping even when narrative evidence includes contradictory wording.
|
|
40
|
+
|
|
41
|
+
## [0.38.5] - 2026-04-16
|
|
42
|
+
|
|
43
|
+
### Fixed
|
|
44
|
+
- Synced package protocol-source manifests into copied E2E sandboxes so bundled workflow and skill resolution continues to work after sandbox setup.
|
|
45
|
+
- Hardened the shared runner prompt contract to preserve sandbox runtime `PATH`/environment and forbid wrapper patterns that break direct `ace-*` execution.
|
|
46
|
+
|
|
47
|
+
## [0.38.4] - 2026-04-16
|
|
48
|
+
|
|
49
|
+
### Fixed
|
|
50
|
+
- Built a dedicated sandbox runtime for E2E runs with sandbox-local Gemfile, Bundler state, gem home, bin shims, verifier sandbox context, preserved report-directory reuse, and wrapper-compatible launch behavior so sandboxed commands stop leaking back into the source worktree.
|
|
51
|
+
|
|
52
|
+
## [0.38.3] - 2026-04-16
|
|
53
|
+
|
|
54
|
+
### Fixed
|
|
55
|
+
- Stripped inherited Bundler and Ruby env leakage from sandboxed E2E subprocesses, created sandbox-local Bundler state, preserved failure-stub report directories in suite aggregation, and aligned shared setup templates/docs with the `ACE_E2E_SOURCE_ROOT` source-root contract.
|
|
56
|
+
|
|
57
|
+
## [0.38.2] - 2026-04-16
|
|
58
|
+
|
|
59
|
+
### Fixed
|
|
60
|
+
- Prepared setup steps with sandbox runtime environment, hardened runtime directory permissions for tmux access, and kept sandbox support paths aligned with the active `bubblewrap` execution model.
|
|
61
|
+
|
|
62
|
+
## [0.38.1] - 2026-04-15
|
|
63
|
+
|
|
64
|
+
### Fixed
|
|
65
|
+
- Tightened the Linux `bubblewrap` sandbox mounts to preserve required device access such as `/dev/null` while keeping the host filesystem isolated.
|
|
66
|
+
- Moved sandbox support directories outside the copied repo workspace so E2E setup steps like `git add -A` no longer stage sandbox home, tmp, or runtime files.
|
|
67
|
+
|
|
68
|
+
## [0.38.0] - 2026-04-15
|
|
69
|
+
|
|
70
|
+
### Changed
|
|
71
|
+
- Rewrote `TS-RUNNER-001` to use public fixture-driven discovery (`copy-fixtures`) and expanded suite control-flow coverage beyond help-only output.
|
|
72
|
+
- Added `TS-RUNNER-002` to cover real non-dry run report generation, verifier-output evidence, and explicit `ace-test-e2e-sh` public shell-helper usage.
|
|
73
|
+
- Updated `docs/usage.md` with safe shell-helper workflows tied to deterministic `.ace-local/test-e2e/` report paths.
|
|
74
|
+
|
|
75
|
+
### Fixed
|
|
76
|
+
- Routed setup/runner/verifier subprocesses through the new sandbox backend, kept user-facing verifier metadata in written reports, and taught the minimal verifier parser to accept standalone `Results: X/Y passed` summaries.
|
|
77
|
+
|
|
78
|
+
## [0.37.2] - 2026-04-14
|
|
79
|
+
|
|
80
|
+
### Changed
|
|
81
|
+
- Added a canonical public-surface gate across the E2E handbook so goal-based scenarios must prove both that the tool works and that a user can complete the job from docs, `--help`, and the public CLI without hidden recipes or workarounds.
|
|
82
|
+
- Updated the create/review/plan/rewrite/run/fix workflow guidance, shared guides, and templates to treat workaround-driven scenarios as invalid or at-risk and to record friction through runner observations instead of teaching fallback procedures.
|
|
83
|
+
|
|
84
|
+
## [0.37.1] - 2026-04-13
|
|
85
|
+
|
|
86
|
+
### Changed
|
|
87
|
+
- Updated the canonical E2E create/review/rewrite/run guidance, templates, and references so goal-based scenarios are written around final sandbox state plus runner observations instead of helper artifacts under `results/`.
|
|
88
|
+
|
|
89
|
+
## [0.37.0] - 2026-04-13
|
|
90
|
+
|
|
91
|
+
### Changed
|
|
92
|
+
- Made runner `Observations` the canonical non-filesystem evidence channel for goal-based E2E scenarios, passed them directly into verifier prompts, and persisted them through the harness-managed report surface.
|
|
93
|
+
- Updated the shared E2E template, authoring guides, and rewrite/run workflows to require goal achievement from sandbox end state first, using runner observations as the only secondary evidence source instead of helper artifacts under `results/`.
|
|
94
|
+
|
|
95
|
+
## [0.36.1] - 2026-04-13
|
|
96
|
+
|
|
97
|
+
### Fixed
|
|
98
|
+
- Preferred canonical per-scenario `report.md` metadata when building aggregate package and suite reports so failed TC mappings no longer drift from the underlying scenario reports.
|
|
99
|
+
- Added explicit dirty-worktree diagnostics to suite reporting so tracked repo mutations are surfaced as runner diagnostics instead of being inferred after the fact.
|
|
100
|
+
|
|
101
|
+
### Changed
|
|
102
|
+
- Updated the canonical E2E failure-analysis and fix workflows plus usage guidance to treat aggregate reports as indexes and per-scenario reports as the source of truth for TC-level triage.
|
|
103
|
+
|
|
104
|
+
## [0.36.0] - 2026-04-13
|
|
105
|
+
|
|
106
|
+
### Fixed
|
|
107
|
+
- Renamed aggregated E2E outputs to scope-specific package and suite report filenames instead of the ambiguous shared `final-report` label.
|
|
108
|
+
- Stripped ambient `TMUX` and `TMUX_PANE` state from setup and pipeline subprocess environments so E2E runs do not accidentally attach to the operator's live tmux session.
|
|
109
|
+
|
|
110
|
+
### Technical
|
|
111
|
+
- Updated suite orchestrator/report writer coverage and E2E workflow guidance around the explicit package-vs-suite report contract.
|
|
112
|
+
|
|
113
|
+
## [0.35.0] - 2026-04-13
|
|
114
|
+
### Changed
|
|
115
|
+
- **ace-test-runner-e2e v0.35.0**: Added optional scenario artifact declarations via `(optional)`, separated required and optional artifact tracking, and included optional outputs in manifests and snapshots without failing scenarios when they are absent.
|
|
116
|
+
|
|
117
|
+
## [0.34.1] - 2026-04-13
|
|
118
|
+
|
|
119
|
+
### Changed
|
|
120
|
+
- Completed the batch i05 migration follow-through for this package and aligned it with the restarted `fast` / `feat` / `e2e` verification model.
|
|
121
|
+
|
|
122
|
+
### Technical
|
|
123
|
+
- Included in the coordinated assignment-driven patch release for batch i05 package updates.
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
## [0.34.0] - 2026-04-12
|
|
127
|
+
|
|
128
|
+
### Changed
|
|
129
|
+
- Migrated package deterministic tests to the restarted `fast`/`feat` layout by moving `test/atoms`, `test/commands`, `test/handbook`, `test/models`, `test/molecules`, and `test/organisms` under `test/fast/`, and moving legacy `test/integration` coverage into `test/feat/`.
|
|
130
|
+
- Updated package docs and CLI wording to teach `fast`/`feat` deterministic coverage plus scenario-only `test/e2e` execution via `ace-test-e2e`.
|
|
131
|
+
- Refreshed `TS-RUNNER-001` scenario metadata and decision-record unit coverage references to point at migrated `test/fast` paths.
|
|
132
|
+
|
|
133
|
+
## [0.33.1] - 2026-04-12
|
|
134
|
+
|
|
135
|
+
### Fixed
|
|
136
|
+
- Made suite final reports deterministic for canonical sections by deriving summary rows, failed-test details, reports tables, and the overall line from runtime results instead of model-authored prose.
|
|
137
|
+
- Added regression coverage so hallucinated scenario titles, failed TC IDs, and duplicate overall lines are ignored or replaced before report files are written.
|
|
138
|
+
|
|
139
|
+
## [0.33.0] - 2026-04-11
|
|
140
|
+
|
|
141
|
+
### Changed
|
|
142
|
+
- Made `wfi://e2e/fix` a self-bootstrapping workflow that reuses existing failure analysis when present and generates it via `wfi://e2e/analyze-failures` when missing or incomplete.
|
|
143
|
+
- Updated the canonical `as-e2e-fix` skill contract to state that missing analysis is generated automatically before fixes are applied.
|
|
144
|
+
|
|
145
|
+
### Technical
|
|
146
|
+
|
|
147
|
+
- Refactored `ConfigLoader` molecule tests to use config mock mode, removing dependency on monorepo `.ace` overrides and making the test contract stable across environments.
|
|
148
|
+
|
|
149
|
+
## [0.32.2] - 2026-04-11
|
|
150
|
+
|
|
151
|
+
### Fixed
|
|
152
|
+
- Generated per-scenario CLI batch `run_id`s from explicit 50ms timestamp buckets so parallel package runs no longer occasionally reuse the same report-path ID and trip the unique-run-id orchestration contract.
|
|
153
|
+
|
|
154
|
+
## [0.32.1] - 2026-04-11
|
|
155
|
+
|
|
156
|
+
### Technical
|
|
157
|
+
- Synced the canonical `as-e2e-review` skill description with the package-targeted assign verification contract so shipped metadata no longer implies broader scenario-sweep execution.
|
|
158
|
+
|
|
159
|
+
## [0.31.0] - 2026-04-10
|
|
160
|
+
|
|
161
|
+
### Changed
|
|
162
|
+
- Restored the two-phase E2E harness to run deterministic `test/integration` coverage before agent scenarios from `test/e2e`, with integration failures short-circuiting scenario execution.
|
|
163
|
+
- Added deterministic integration execution, richer per-test-case manifests and artifact snapshotting, and refreshed CLI/docs/workflows/tests around the restarted layout and role-based runner/verifier contract.
|
|
164
|
+
|
|
165
|
+
### Fixed
|
|
166
|
+
- Accepted minimal verifier evidence responses in the runner pipeline so successful scenario runs no longer fail when a verifier omits the full structured envelope.
|
|
167
|
+
|
|
168
|
+
## [0.30.2] - 2026-04-10
|
|
169
|
+
|
|
170
|
+
### Fixed
|
|
171
|
+
- Surface `git diff` stderr when affected-package detection fails so invalid refs and shallow-clone failures no longer look like empty affected sets.
|
|
172
|
+
|
|
173
|
+
## [0.30.1] - 2026-04-10
|
|
174
|
+
|
|
175
|
+
### Fixed
|
|
176
|
+
- Raised the `ace-support-test-helpers` runtime dependency floor to `~> 0.14` so released installs accept the shared sandbox package-copy helper line used by the restarted runner.
|
|
177
|
+
- Restored the `TS-RUNNER-001` smoke scenario fixture source path so the CLI smoke scenario resolves its canonical demo fixture again.
|
|
178
|
+
|
|
179
|
+
## [0.30.0] - 2026-04-10
|
|
180
|
+
|
|
181
|
+
### Changed
|
|
182
|
+
- Reworked `ace-test-runner-e2e` back into a two-phase contract, with deterministic integration from `test/integration` before agent scenarios from `test/e2e`.
|
|
183
|
+
- Switched sandbox orchestration to the shared package-copy helper and refreshed CLI/docs/workflows for the restarted E2E structure.
|
|
184
|
+
|
|
185
|
+
### Fixed
|
|
186
|
+
- Hardened affected-file detection by capturing git diff stderr so provider-side affected checks fail with clearer diagnostics.
|
|
187
|
+
|
|
10
188
|
## [0.29.8] - 2026-04-01
|
|
11
189
|
|
|
12
190
|
### Fixed
|
data/README.md
CHANGED
|
@@ -18,11 +18,11 @@
|
|
|
18
18
|
|
|
19
19
|

|
|
20
20
|
|
|
21
|
-
`ace-test-runner-e2e` runs realistic workflow scenarios through coding agents so teams can validate behavior beyond
|
|
21
|
+
`ace-test-runner-e2e` runs realistic workflow scenarios through coding agents so teams can validate behavior beyond deterministic package tests while keeping execution reproducible and isolated from the working tree.
|
|
22
22
|
|
|
23
23
|
## How It Works
|
|
24
24
|
|
|
25
|
-
1. Discover
|
|
25
|
+
1. Discover deterministic preflight tests from package-local `test/feat/` and agent scenarios from `test/e2e/`, preserving metadata, tags, and command flows.
|
|
26
26
|
2. Execute scenarios inside reproducible sandboxes that isolate agent runs from the working tree.
|
|
27
27
|
3. Produce structured reports that are easy to inspect, compare across runs, and feed back into triage workflows.
|
|
28
28
|
|
data/exe/ace-test-e2e-sh
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
#!/usr/bin/env ruby
|
|
2
2
|
# frozen_string_literal: true
|
|
3
3
|
|
|
4
|
+
require_relative "../lib/ace/test/end_to_end_runner"
|
|
5
|
+
|
|
4
6
|
# ace-test-e2e-sh - Execute commands within E2E test sandbox
|
|
5
7
|
#
|
|
6
8
|
# Usage:
|
|
@@ -57,11 +59,14 @@ unless Dir.exist?(test_dir)
|
|
|
57
59
|
exit 1
|
|
58
60
|
end
|
|
59
61
|
|
|
60
|
-
|
|
61
|
-
|
|
62
|
+
backend = Ace::Test::EndToEndRunner::Molecules::BwrapSandboxBackend.new(
|
|
63
|
+
sandbox_root: test_dir,
|
|
64
|
+
source_root: ENV["ACE_E2E_SOURCE_ROOT"]
|
|
65
|
+
)
|
|
66
|
+
env = backend.prepared_env("PROJECT_ROOT_PATH" => test_dir, "ACE_E2E_SOURCE_ROOT" => ENV["ACE_E2E_SOURCE_ROOT"])
|
|
62
67
|
|
|
63
68
|
if ARGV.empty?
|
|
64
|
-
exec
|
|
69
|
+
backend.exec(["bash"], chdir: test_dir, env: env)
|
|
65
70
|
else
|
|
66
|
-
exec(
|
|
71
|
+
backend.exec(ARGV, chdir: test_dir, env: env)
|
|
67
72
|
end
|
|
@@ -3,8 +3,8 @@ doc-type: guide
|
|
|
3
3
|
title: E2E Testing Guide
|
|
4
4
|
purpose: Conventions and best practices for agent-executed end-to-end tests
|
|
5
5
|
ace-docs:
|
|
6
|
-
last-updated: 2026-
|
|
7
|
-
last-checked: 2026-
|
|
6
|
+
last-updated: 2026-04-19
|
|
7
|
+
last-checked: 2026-04-19
|
|
8
8
|
---
|
|
9
9
|
|
|
10
10
|
# E2E Testing Guide
|
|
@@ -12,6 +12,7 @@ ace-docs:
|
|
|
12
12
|
## Overview
|
|
13
13
|
|
|
14
14
|
E2E tests are executed by an AI agent and reserved for behaviors that require real CLI execution, real tools, and real filesystem side effects.
|
|
15
|
+
They must also answer a user-journey question: can a user do the job from the tool's public surface, and how much friction does that journey have?
|
|
15
16
|
|
|
16
17
|
## Canonical Conventions
|
|
17
18
|
|
|
@@ -24,7 +25,7 @@ E2E tests are executed by an AI agent and reserved for behaviors that require re
|
|
|
24
25
|
- `TC-*.verify.md`
|
|
25
26
|
- `runner.yml.md`
|
|
26
27
|
- `verifier.yml.md`
|
|
27
|
-
- TC artifacts use `results/tc/{NN}/`
|
|
28
|
+
- TC outcome artifacts use `results/tc/{NN}/`
|
|
28
29
|
- Summary reports use `tcs-passed`, `tcs-failed`, `tcs-total`, and `failed[].tc`
|
|
29
30
|
- Scenarios declare `tags` for discovery-time filtering via `--tags`/`--exclude-tags`
|
|
30
31
|
|
|
@@ -32,15 +33,19 @@ E2E tests are executed by an AI agent and reserved for behaviors that require re
|
|
|
32
33
|
|
|
33
34
|
- Runner is **execution-only**:
|
|
34
35
|
- perform user-like CLI actions in sandbox
|
|
35
|
-
- produce evidence
|
|
36
|
+
- produce only final outcome evidence under `results/tc/{NN}/`
|
|
37
|
+
- return final runner observations through the harness contract
|
|
36
38
|
- do not issue PASS/FAIL verdicts
|
|
37
39
|
- do not perform verifier-style assertion/classification
|
|
40
|
+
- do not invent workarounds or hidden command recipes to compensate for docs/help/CLI gaps
|
|
38
41
|
- Verifier is **verification-only**:
|
|
39
42
|
- evaluate TC outcome from sandbox evidence
|
|
43
|
+
- use runner observations as the only non-filesystem secondary evidence source
|
|
40
44
|
- apply an **impact-first** evidence order:
|
|
41
45
|
1. sandbox/project state impact
|
|
42
|
-
2.
|
|
43
|
-
3.
|
|
46
|
+
2. runner observations
|
|
47
|
+
3. explicit TC artifacts that are true product outcomes
|
|
48
|
+
4. debug captures (`stdout`, `stderr`, `*.exit`, metadata) only as fallback
|
|
44
49
|
- Setup ownership:
|
|
45
50
|
- sandbox preparation belongs to `scenario.yml` `setup:` + `fixtures/`
|
|
46
51
|
- TC runner files must not define independent environment setup procedures
|
|
@@ -52,7 +57,25 @@ Before adding a TC, confirm the behavior needs:
|
|
|
52
57
|
- real external tools/processes
|
|
53
58
|
- real filesystem I/O and environment state
|
|
54
59
|
|
|
55
|
-
If not, keep coverage in
|
|
60
|
+
If not, keep coverage in `fast`/`feat` tests.
|
|
61
|
+
|
|
62
|
+
## Public-Surface Gate
|
|
63
|
+
|
|
64
|
+
Before keeping or adding a goal-style TC, confirm the user job is achievable from:
|
|
65
|
+
- package README / usage docs
|
|
66
|
+
- `--help`
|
|
67
|
+
- declared fixtures/setup
|
|
68
|
+
- the tool under test itself
|
|
69
|
+
|
|
70
|
+
Reject or rewrite the TC if it depends on:
|
|
71
|
+
- hidden recipes embedded in runner instructions
|
|
72
|
+
- workaround branches for unsupported or undocumented behavior
|
|
73
|
+
- direct supporting-tool probes as the primary oracle
|
|
74
|
+
- internal details that are not necessary to prove the user job
|
|
75
|
+
|
|
76
|
+
When an E2E failure shows that a valid user job is not discoverable from docs, usage guides, or `--help`, treat that as
|
|
77
|
+
docs/help drift. Failure analysis must record the stale or missing public surface and the exact docs/help target to
|
|
78
|
+
update instead of teaching the runner a workaround.
|
|
56
79
|
|
|
57
80
|
## Cost and Scope
|
|
58
81
|
|
|
@@ -79,6 +102,7 @@ The verifier is always-on for standalone goal-mode TCs in the CLI pipeline. For
|
|
|
79
102
|
## Scenario Layout
|
|
80
103
|
|
|
81
104
|
```text
|
|
105
|
+
{package}/test/feat/**/*_test.rb
|
|
82
106
|
{package}/test/e2e/TS-{AREA}-{NNN}-{slug}/
|
|
83
107
|
scenario.yml
|
|
84
108
|
runner.yml.md
|
|
@@ -101,9 +125,14 @@ This prevents duplicate assertions across test layers.
|
|
|
101
125
|
## Authoring Rules
|
|
102
126
|
|
|
103
127
|
- Keep runner goals outcome-oriented and deterministic.
|
|
128
|
+
- Keep runner goals aligned with the public user path; if the runner needs a workaround, surface that as friction rather than teaching the workaround.
|
|
104
129
|
- Keep verifier expectations impact-first, then artifacts, then debug fallback.
|
|
105
130
|
- Preserve strict TC pairing (`runner` + `verify`).
|
|
106
|
-
- Keep
|
|
131
|
+
- Keep `results/tc/{NN}/` for outcome artifacts only.
|
|
132
|
+
- Do not instruct runners to create helper YAML, path files, command files, or reflections in `results/`.
|
|
133
|
+
- Do not judge success from runner-authored summaries when final sandbox state can prove the goal directly.
|
|
134
|
+
- Use runner observations only to explain ambiguity or missing side effects, not to replace missing end-state evidence.
|
|
135
|
+
- Treat any workaround noted in runner observations as a product/docs/help or scenario-design smell that must be fixed, not preserved.
|
|
107
136
|
- Avoid hidden dependencies between TCs unless explicitly intended.
|
|
108
137
|
|
|
109
138
|
## Execution Artifacts
|
|
@@ -121,4 +150,9 @@ Before approving new/updated E2E tests:
|
|
|
121
150
|
- [ ] `runner.yml.md` and `verifier.yml.md` exist
|
|
122
151
|
- [ ] Every TC has both `.runner.md` and `.verify.md`
|
|
123
152
|
- [ ] Artifacts are scoped to `results/tc/{NN}/`
|
|
124
|
-
- [ ]
|
|
153
|
+
- [ ] Verifier primary oracle is final sandbox state or real product output, not helper artifacts
|
|
154
|
+
- [ ] Runner observations are the only non-filesystem secondary evidence source
|
|
155
|
+
- [ ] Scenario can be completed from docs/usage/`--help` without hidden recipes or workaround instructions
|
|
156
|
+
- [ ] Any friction/workaround found during review is treated as a gap, not as a runner script opportunity
|
|
157
|
+
- [ ] Failure analysis records docs/help drift from failed public user paths, or explicitly records `None`
|
|
158
|
+
- [ ] Value-gate metadata is present (`e2e-justification`, `unit-coverage-reviewed`, `cost-tier`)
|
|
@@ -46,14 +46,14 @@ Example: `ace-lint/test/e2e/TS-LINT-001-lint-pipeline/scenario.yml`
|
|
|
46
46
|
|-------|------|---------|-------------|
|
|
47
47
|
| `priority` | string | `medium` | Test priority: `high`, `medium`, `low` |
|
|
48
48
|
| `tool-under-test` | string | — | Primary command/tool validated |
|
|
49
|
-
| `sandbox-layout` | object | `{}` |
|
|
49
|
+
| `sandbox-layout` | object | `{}` | Outcome-path hints used to precreate directories and guide verification |
|
|
50
50
|
| `duration` | string | — | Estimated duration (e.g., `~15min`) |
|
|
51
51
|
| `timeout` | integer | — | Optional per-scenario execution timeout in seconds |
|
|
52
52
|
| `automation-candidate` | boolean | `false` | Whether test is automatable |
|
|
53
53
|
| `tags` | array | `[]` | Scenario tags for filtering with `--tags`/`--exclude-tags` (OR semantics) |
|
|
54
54
|
| `cost-tier` | string | `smoke` | Run profile: `smoke`, `happy-path`, `deep` |
|
|
55
55
|
| `e2e-justification` | string | — | Why E2E is needed |
|
|
56
|
-
| `unit-coverage-reviewed` | array | `[]` |
|
|
56
|
+
| `unit-coverage-reviewed` | array | `[]` | Deterministic test files reviewed (`test/fast` and/or `test/feat`) |
|
|
57
57
|
| `requires` | object | — | Test prerequisites |
|
|
58
58
|
| `setup` | array | `[]` | Setup directives before execution |
|
|
59
59
|
| `last-verified` | string | — | Last successful verification date |
|
|
@@ -73,6 +73,8 @@ Pairing rule:
|
|
|
73
73
|
Artifact layout conventions:
|
|
74
74
|
- canonical: `results/tc/{NN}/`
|
|
75
75
|
- avoid non-TC-scoped result folders
|
|
76
|
+
- keep only real outcome artifacts under `results/tc/{NN}/`; runner observations live in harness reports, not sandbox helper files
|
|
77
|
+
- absence of a declared path is debug context, not a standalone failure reason
|
|
76
78
|
|
|
77
79
|
Canonical summary report fields:
|
|
78
80
|
- `tcs-passed`
|
|
@@ -83,6 +85,7 @@ Canonical summary report fields:
|
|
|
83
85
|
Role contract:
|
|
84
86
|
- `runner.yml.md` + `TC-*.runner.md` are execution-only.
|
|
85
87
|
- `verifier.yml.md` + `TC-*.verify.md` are verification-only with impact-first checks.
|
|
88
|
+
- Goal-style scenarios should be solvable from the public surface (docs/usage/`--help` + tool under test) without hidden recipes or workaround instructions.
|
|
86
89
|
|
|
87
90
|
## `requires` Object
|
|
88
91
|
|
|
@@ -92,6 +95,11 @@ requires:
|
|
|
92
95
|
ruby: ">= 3.0"
|
|
93
96
|
```
|
|
94
97
|
|
|
98
|
+
`requires.tools` rules:
|
|
99
|
+
- declare execution prerequisites and supporting environment dependencies
|
|
100
|
+
- do not use `requires.tools` as permission to make fallback probes the primary oracle
|
|
101
|
+
- for ACE CLI scenarios, support tools are setup/dependency context unless the scenario is explicitly about that support tool itself
|
|
102
|
+
|
|
95
103
|
## `setup` Directives
|
|
96
104
|
|
|
97
105
|
Available directives:
|
|
@@ -112,7 +120,7 @@ setup:
|
|
|
112
120
|
- git-init
|
|
113
121
|
- tmux-session:
|
|
114
122
|
name-source: run-id
|
|
115
|
-
- run: "cp $PROJECT_ROOT_PATH/mise.toml mise.toml && mise trust mise.toml"
|
|
123
|
+
- run: "cp ${ACE_E2E_SOURCE_ROOT:-$PROJECT_ROOT_PATH}/mise.toml mise.toml && mise trust mise.toml"
|
|
116
124
|
- copy-fixtures
|
|
117
125
|
- run: git add -A && git commit -m "initial" --quiet
|
|
118
126
|
- agent-env:
|
|
@@ -137,17 +145,17 @@ cost-tier: smoke
|
|
|
137
145
|
tags: [smoke, "use-case:lint"]
|
|
138
146
|
e2e-justification: "Validates real subprocess behavior and report file generation"
|
|
139
147
|
unit-coverage-reviewed:
|
|
140
|
-
- test/molecules/lint_runner_test.rb
|
|
141
|
-
- test/organisms/lint_orchestrator_test.rb
|
|
148
|
+
- test/fast/molecules/lint_runner_test.rb
|
|
149
|
+
- test/fast/organisms/lint_orchestrator_test.rb
|
|
142
150
|
tool-under-test: ace-lint
|
|
143
151
|
sandbox-layout:
|
|
144
|
-
results/tc/01/: "
|
|
152
|
+
results/tc/01/: "Goal 1 outcome artifacts"
|
|
145
153
|
requires:
|
|
146
154
|
tools: [ace-lint, standardrb, jq]
|
|
147
155
|
ruby: ">= 3.0"
|
|
148
156
|
setup:
|
|
149
157
|
- git-init
|
|
150
|
-
- run: "cp $PROJECT_ROOT_PATH/mise.toml mise.toml && mise trust mise.toml"
|
|
158
|
+
- run: "cp ${ACE_E2E_SOURCE_ROOT:-$PROJECT_ROOT_PATH}/mise.toml mise.toml && mise trust mise.toml"
|
|
151
159
|
- copy-fixtures
|
|
152
160
|
- agent-env:
|
|
153
161
|
PROJECT_ROOT_PATH: "."
|
|
@@ -179,4 +187,4 @@ test/e2e/TS-LINT-001-lint-pipeline/
|
|
|
179
187
|
├── TC-001-help-survey.runner.md
|
|
180
188
|
├── TC-001-help-survey.verify.md
|
|
181
189
|
└── fixtures/
|
|
182
|
-
```
|
|
190
|
+
```
|
|
@@ -29,7 +29,7 @@ Inline `.tc.md` and frontmatter `mode` values are no longer supported.
|
|
|
29
29
|
- Scenario-level config files:
|
|
30
30
|
- `runner.yml.md`
|
|
31
31
|
- `verifier.yml.md`
|
|
32
|
-
- TC artifacts write to `results/tc/{NN}/`
|
|
32
|
+
- TC outcome artifacts write to `results/tc/{NN}/`
|
|
33
33
|
- Summary counters use `tcs-passed`, `tcs-failed`, and `tcs-total`
|
|
34
34
|
|
|
35
35
|
## File Naming
|
|
@@ -77,12 +77,13 @@ Run `ace-lint` and produce report artifacts for a valid file.
|
|
|
77
77
|
## Workspace
|
|
78
78
|
|
|
79
79
|
- Root: sandbox directory
|
|
80
|
-
-
|
|
80
|
+
- Outcome artifacts: `results/tc/01/`
|
|
81
81
|
|
|
82
82
|
## Constraints
|
|
83
83
|
|
|
84
84
|
- Use only sandbox paths
|
|
85
|
-
- Keep evidence under `results/tc/01/`
|
|
85
|
+
- Keep only final outcome evidence under `results/tc/01/`
|
|
86
|
+
- Do not place helper inputs, manifests, command transcripts, or reflections under `results/tc/01/`
|
|
86
87
|
- Execute actions only; do not assign PASS/FAIL or final verdicts
|
|
87
88
|
```
|
|
88
89
|
|
|
@@ -102,6 +103,7 @@ Example:
|
|
|
102
103
|
|
|
103
104
|
- **Impact Checks**: target sandbox/project state changed as expected
|
|
104
105
|
- **Artifact Checks**: `results/tc/01/report.json` exists and is valid
|
|
106
|
+
- **Runner Observations**: use harness-provided end-of-run observations only as supporting context
|
|
105
107
|
- **Debug Fallback**: inspect `stdout`/`stderr`/`*.exit` only when primary checks are inconclusive
|
|
106
108
|
|
|
107
109
|
## Verdict
|
|
@@ -122,10 +124,15 @@ Pass only when all expectations are satisfied by on-disk evidence.
|
|
|
122
124
|
- Ensure goal numbers and TC numbers remain aligned (`TC-001` -> Goal 1).
|
|
123
125
|
- Keep runner files execution-only and verifier files verdict-only.
|
|
124
126
|
- Make verifier expectations deterministic with impact-first ordering.
|
|
125
|
-
- Keep
|
|
127
|
+
- Keep `results/tc/{NN}/` for outcome artifacts only.
|
|
128
|
+
- Use harness-provided runner observations as the only non-filesystem secondary evidence source.
|
|
129
|
+
- Prefer final sandbox state and real product output over raw debug captures.
|
|
130
|
+
- Do not ask the runner to write setup inputs, audit manifests, or final reflections for the verifier.
|
|
131
|
+
- Do not teach the runner hidden recipes or workaround sequences; if the path is not discoverable from docs/usage/`--help`, the TC is wrong or the public surface needs improvement.
|
|
132
|
+
- Use runner observations to record friction and workaround pressure, not to normalize it.
|
|
126
133
|
- Record why each scenario remains E2E via `e2e-justification` and `unit-coverage-reviewed` in `scenario.yml`.
|
|
127
134
|
|
|
128
135
|
## Related
|
|
129
136
|
|
|
130
137
|
- [scenario.yml Reference](scenario-yml-reference.g.md)
|
|
131
|
-
- [E2E Testing Guide](e2e-testing.g.md)
|
|
138
|
+
- [E2E Testing Guide](e2e-testing.g.md)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: as-e2e-fix
|
|
3
|
-
description: Diagnose, fix, and rerun failing E2E tests systematically
|
|
3
|
+
description: Diagnose, fix, and rerun failing E2E tests systematically, generating failure analysis when needed
|
|
4
4
|
# context: no-fork
|
|
5
5
|
# agent: general-purpose
|
|
6
6
|
user-invocable: true
|
|
@@ -32,4 +32,4 @@ skill:
|
|
|
32
32
|
workflow: wfi://e2e/fix
|
|
33
33
|
---
|
|
34
34
|
|
|
35
|
-
Load and run `ace-bundle wfi://e2e/fix` in the current project, then follow the loaded workflow as the source of truth and execute it end-to-end instead of only summarizing it.
|
|
35
|
+
Load and run `ace-bundle wfi://e2e/fix` in the current project, then follow the loaded workflow as the source of truth and execute it end-to-end instead of only summarizing it. If E2E failure analysis is missing or incomplete, generate it via `wfi://e2e/analyze-failures` as part of the fix workflow before applying changes.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: as-e2e-review
|
|
3
|
-
description:
|
|
3
|
+
description: Review E2E coverage for modified packages and run targeted package scenarios
|
|
4
4
|
# bundle: wfi://e2e/review
|
|
5
5
|
# agent: general-purpose
|
|
6
6
|
user-invocable: true
|
|
@@ -24,7 +24,7 @@ assign:
|
|
|
24
24
|
source: wfi://e2e/review
|
|
25
25
|
steps:
|
|
26
26
|
- name: verify-e2e
|
|
27
|
-
description: Review E2E coverage for modified packages and run targeted scenarios
|
|
27
|
+
description: Review E2E coverage for modified packages and run targeted package scenarios
|
|
28
28
|
tags: [testing, e2e, verification]
|
|
29
29
|
skill:
|
|
30
30
|
kind: workflow
|