ace-test-runner-e2e 0.29.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.ace-defaults/e2e-runner/config.yml +70 -0
- data/.ace-defaults/nav/protocols/guide-sources/ace-test-runner-e2e.yml +11 -0
- data/.ace-defaults/nav/protocols/skill-sources/ace-test-runner-e2e.yml +19 -0
- data/.ace-defaults/nav/protocols/tmpl-sources/ace-test-runner-e2e.yml +12 -0
- data/.ace-defaults/nav/protocols/wfi-sources/ace-test-runner-e2e.yml +11 -0
- data/CHANGELOG.md +1166 -0
- data/LICENSE +21 -0
- data/README.md +42 -0
- data/Rakefile +15 -0
- data/exe/ace-test-e2e +15 -0
- data/exe/ace-test-e2e-sh +67 -0
- data/exe/ace-test-e2e-suite +13 -0
- data/handbook/guides/e2e-testing.g.md +124 -0
- data/handbook/guides/scenario-yml-reference.g.md +182 -0
- data/handbook/guides/tc-authoring.g.md +131 -0
- data/handbook/skills/as-e2e-create/SKILL.md +30 -0
- data/handbook/skills/as-e2e-fix/SKILL.md +35 -0
- data/handbook/skills/as-e2e-manage/SKILL.md +31 -0
- data/handbook/skills/as-e2e-plan-changes/SKILL.md +30 -0
- data/handbook/skills/as-e2e-review/SKILL.md +35 -0
- data/handbook/skills/as-e2e-rewrite/SKILL.md +31 -0
- data/handbook/skills/as-e2e-run/SKILL.md +48 -0
- data/handbook/skills/as-e2e-setup-sandbox/SKILL.md +34 -0
- data/handbook/templates/ace-taskflow-fixture.template.md +322 -0
- data/handbook/templates/agent-experience-report.template.md +89 -0
- data/handbook/templates/metadata.template.yml +49 -0
- data/handbook/templates/scenario.yml.template.yml +60 -0
- data/handbook/templates/tc-file.template.md +45 -0
- data/handbook/templates/test-report.template.md +94 -0
- data/handbook/workflow-instructions/e2e/analyze-failures.wf.md +126 -0
- data/handbook/workflow-instructions/e2e/create.wf.md +395 -0
- data/handbook/workflow-instructions/e2e/execute.wf.md +253 -0
- data/handbook/workflow-instructions/e2e/fix.wf.md +166 -0
- data/handbook/workflow-instructions/e2e/manage.wf.md +179 -0
- data/handbook/workflow-instructions/e2e/plan-changes.wf.md +255 -0
- data/handbook/workflow-instructions/e2e/review.wf.md +286 -0
- data/handbook/workflow-instructions/e2e/rewrite.wf.md +281 -0
- data/handbook/workflow-instructions/e2e/run.wf.md +355 -0
- data/handbook/workflow-instructions/e2e/setup-sandbox.wf.md +461 -0
- data/lib/ace/test/end_to_end_runner/atoms/display_helpers.rb +234 -0
- data/lib/ace/test/end_to_end_runner/atoms/prompt_builder.rb +199 -0
- data/lib/ace/test/end_to_end_runner/atoms/result_parser.rb +166 -0
- data/lib/ace/test/end_to_end_runner/atoms/skill_prompt_builder.rb +166 -0
- data/lib/ace/test/end_to_end_runner/atoms/skill_result_parser.rb +244 -0
- data/lib/ace/test/end_to_end_runner/atoms/suite_report_prompt_builder.rb +103 -0
- data/lib/ace/test/end_to_end_runner/atoms/tc_fidelity_validator.rb +39 -0
- data/lib/ace/test/end_to_end_runner/atoms/test_case_parser.rb +108 -0
- data/lib/ace/test/end_to_end_runner/cli/commands/run_suite.rb +130 -0
- data/lib/ace/test/end_to_end_runner/cli/commands/run_test.rb +156 -0
- data/lib/ace/test/end_to_end_runner/models/test_case.rb +47 -0
- data/lib/ace/test/end_to_end_runner/models/test_result.rb +115 -0
- data/lib/ace/test/end_to_end_runner/models/test_scenario.rb +90 -0
- data/lib/ace/test/end_to_end_runner/molecules/affected_detector.rb +92 -0
- data/lib/ace/test/end_to_end_runner/molecules/config_loader.rb +75 -0
- data/lib/ace/test/end_to_end_runner/molecules/failure_finder.rb +203 -0
- data/lib/ace/test/end_to_end_runner/molecules/fixture_copier.rb +35 -0
- data/lib/ace/test/end_to_end_runner/molecules/pipeline_executor.rb +121 -0
- data/lib/ace/test/end_to_end_runner/molecules/pipeline_prompt_bundler.rb +182 -0
- data/lib/ace/test/end_to_end_runner/molecules/pipeline_report_generator.rb +321 -0
- data/lib/ace/test/end_to_end_runner/molecules/pipeline_sandbox_builder.rb +131 -0
- data/lib/ace/test/end_to_end_runner/molecules/progress_display_manager.rb +172 -0
- data/lib/ace/test/end_to_end_runner/molecules/report_writer.rb +259 -0
- data/lib/ace/test/end_to_end_runner/molecules/scenario_loader.rb +254 -0
- data/lib/ace/test/end_to_end_runner/molecules/setup_executor.rb +181 -0
- data/lib/ace/test/end_to_end_runner/molecules/simple_display_manager.rb +72 -0
- data/lib/ace/test/end_to_end_runner/molecules/suite_progress_display_manager.rb +223 -0
- data/lib/ace/test/end_to_end_runner/molecules/suite_report_writer.rb +277 -0
- data/lib/ace/test/end_to_end_runner/molecules/suite_simple_display_manager.rb +116 -0
- data/lib/ace/test/end_to_end_runner/molecules/test_discoverer.rb +136 -0
- data/lib/ace/test/end_to_end_runner/molecules/test_executor.rb +332 -0
- data/lib/ace/test/end_to_end_runner/organisms/suite_orchestrator.rb +830 -0
- data/lib/ace/test/end_to_end_runner/organisms/test_orchestrator.rb +442 -0
- data/lib/ace/test/end_to_end_runner/version.rb +9 -0
- data/lib/ace/test/end_to_end_runner.rb +71 -0
- metadata +220 -0
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
# frozen_string_literal: true

require "date"
require "yaml"

module Ace
  module Test
    module EndToEndRunner
      module Molecules
        # Scans cache for failed test cases from previous E2E test runs
        #
        # Reads metadata.yml files from .ace-local/test-e2e/*-reports/ directories
        # and extracts failed_test_cases arrays. Used by --only-failures CLI flag
        # to re-run only tests that failed previously.
        #
        # Note: This is a Molecule (not an Atom) because it performs filesystem
        # I/O via Dir.glob and YAML file reading.
        class FailureFinder
          CACHE_DIR = ".ace-local/test-e2e"
          METADATA_FILE = "metadata.yml"
          REPORTS_SUFFIX = "-reports"

          # Find failed test cases for a specific package
          #
          # Scans cache directory for the most recent metadata.yml per test-id
          # within the specified package, returning aggregated failed test case IDs.
          #
          # @param package [String] Package name (e.g., "ace-lint")
          # @param base_dir [String] Base directory to search from (default: current dir)
          # @return [Array<String>] Failed test case IDs (e.g., ["TC-001", "TC-003"])
          def find_failures(package:, base_dir: Dir.pwd)
            metadata_files = discover_metadata_files(base_dir)
            return [] if metadata_files.empty?

            # Filter to package and get most recent per test-id
            package_metadata = filter_by_package(metadata_files, package)
            most_recent = most_recent_per_test(package_metadata)

            extract_all_failed_ids(most_recent)
          end

          # Find failed test cases across all packages
          #
          # Scans cache directory for the most recent metadata.yml per test-id
          # across all packages, returning aggregated failed test case IDs.
          #
          # @param base_dir [String] Base directory to search from (default: current dir)
          # @return [Array<String>] Failed test case IDs
          def find_all_failures(base_dir: Dir.pwd)
            metadata_files = discover_metadata_files(base_dir)
            return [] if metadata_files.empty?

            most_recent = most_recent_per_test(metadata_files)
            extract_all_failed_ids(most_recent)
          end

          # Find failed test cases grouped by package
          #
          # Scans cache directory for the most recent metadata.yml per test-id
          # within each package, returning a hash mapping package names to their
          # failed test case IDs. Packages with no failures are omitted.
          #
          # @param packages [Array<String>] Package names to scan
          # @param base_dir [String] Base directory to search from (default: current dir)
          # @return [Hash{String => Array<String>}] Package name to failed test case IDs
          def find_failures_by_package(packages:, base_dir: Dir.pwd)
            metadata_files = discover_metadata_files(base_dir)
            return {} if metadata_files.empty?

            result = {}
            packages.each do |package|
              package_metadata = filter_by_package(metadata_files, package)
              most_recent = most_recent_per_test(package_metadata)
              failed_ids = extract_all_failed_ids(most_recent)
              result[package] = failed_ids unless failed_ids.empty?
            end
            result
          end

          # Find failed test scenarios grouped by package and scenario (test-id)
          #
          # Like find_failures_by_package but preserves per-scenario granularity.
          # Callers can use this to re-run full failed scenarios.
          #
          # @param packages [Array<String>] Package names to scan
          # @param base_dir [String] Base directory to search from (default: current dir)
          # @return [Hash{String => Hash{String => Array<String>}}]
          #   Package name => { test-id => failed TC IDs }
          def find_failures_by_scenario(packages:, base_dir: Dir.pwd)
            metadata_files = discover_metadata_files(base_dir)
            return {} if metadata_files.empty?

            result = {}
            packages.each do |package|
              package_metadata = filter_by_package(metadata_files, package)
              most_recent = most_recent_per_test(package_metadata)

              scenario_failures = {}
              most_recent.each do |entry|
                test_id = entry[:data]["test-id"]
                failed_ids = extract_failed_test_cases(entry[:data])
                scenario_failures[test_id] = failed_ids unless failed_ids.empty?
              end

              result[package] = scenario_failures unless scenario_failures.empty?
            end
            result
          end

          private

          # Discover all metadata.yml files in the cache directory
          #
          # @param base_dir [String] Base directory
          # @return [Array<Hash>] Parsed metadata entries with :path and :data keys
          def discover_metadata_files(base_dir)
            cache_path = File.join(base_dir, CACHE_DIR)
            return [] unless Dir.exist?(cache_path)

            pattern = File.join(cache_path, "*#{REPORTS_SUFFIX}", METADATA_FILE)
            Dir.glob(pattern).filter_map { |path| load_metadata(path) }
          end

          # Safely load and parse a metadata.yml file
          #
          # Unparseable or non-Hash files are skipped (nil) so one corrupt
          # report cannot break failure discovery.
          #
          # @param path [String] Absolute path to metadata.yml
          # @return [Hash, nil] Hash with :path and :data keys, or nil on error
          def load_metadata(path)
            data = YAML.safe_load_file(path, permitted_classes: [Date])
            return nil unless data.is_a?(Hash)

            {path: path, data: data}
          rescue => e
            warn "Warning: Could not parse #{path}: #{e.message}" if ENV["DEBUG"]
            nil
          end

          # Filter metadata entries by package name
          #
          # @param entries [Array<Hash>] Metadata entries
          # @param package [String] Package name to filter by
          # @return [Array<Hash>] Filtered entries
          def filter_by_package(entries, package)
            entries.select { |entry| entry[:data]["package"] == package }
          end

          # Get the most recent metadata entry per test-id
          #
          # Uses the report directory name (which contains a timestamp prefix)
          # to determine recency. Later timestamps sort higher alphabetically.
          #
          # @param entries [Array<Hash>] Metadata entries
          # @return [Array<Hash>] Most recent entry per test-id
          def most_recent_per_test(entries)
            grouped = entries.group_by { |entry| entry[:data]["test-id"] }
            grouped.map do |_test_id, group|
              # Sort by directory name (timestamp prefix ensures chronological order)
              group.max_by { |entry| File.basename(File.dirname(entry[:path])) }
            end
          end

          # Extract failed test case IDs from metadata entries
          #
          # Checks both the `failed_test_cases` array (from task 259.03 ReportWriter)
          # and falls back to checking `status: "fail"` for older metadata formats.
          #
          # @param entries [Array<Hash>] Most recent metadata entries
          # @return [Array<String>] Aggregated failed test case IDs
          def extract_all_failed_ids(entries)
            entries.flat_map { |entry| extract_failed_test_cases(entry[:data]) }.uniq
          end

          # Extract failed test case IDs from a single metadata hash
          #
          # Returns specific TC IDs when available, or ["*"] as a wildcard
          # when metadata indicates failure but lacks granular test case data
          # (common in legacy/CLI-agent-written metadata).
          #
          # @param data [Hash] Parsed metadata.yml data
          # @return [Array<String>] Failed test case IDs, ["*"] for wildcard, or []
          def extract_failed_test_cases(data)
            # TC-first schema: failed: [{tc: "TC-001", ...}, ...]
            failed_entries = data["failed"]
            if failed_entries.is_a?(Array) && !failed_entries.empty?
              # filter_map already drops nils; no trailing .compact needed
              tc_ids = failed_entries.filter_map { |entry| entry["tc"] if entry.is_a?(Hash) }
              return tc_ids unless tc_ids.empty?
            end

            # Primary: use failed_test_cases array (written by ReportWriter or workflow template)
            failed_ids = data["failed_test_cases"]
            return failed_ids if failed_ids.is_a?(Array) && !failed_ids.empty?

            # Fallback: metadata has failures but no specific test case IDs.
            # Return wildcard to signal "re-run entire test scenario".
            status = data["status"]
            return ["*"] if %w[fail partial error incomplete].include?(status)

            []
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# frozen_string_literal: true

require "fileutils"

module Ace
  module Test
    module EndToEndRunner
      module Molecules
        # Copies fixture files from a scenario's fixtures/ directory into a sandbox
        #
        # Preserves the full directory tree structure. Used by SetupExecutor
        # to populate sandboxes with test data files.
        #
        # Note: This is a Molecule (not an Atom) because it performs filesystem
        # I/O via FileUtils.cp_r and Dir.glob.
        class FixtureCopier
          # Copy fixture tree into target directory
          #
          # @param source_dir [String] Path to the fixtures/ directory
          # @param target_dir [String] Path to the sandbox directory
          # @return [Array<String>] Sorted relative paths of files and directories
          #   present in target_dir after the copy, including dotfiles
          # @raise [ArgumentError] If source_dir does not exist
          def copy(source_dir:, target_dir:)
            raise ArgumentError, "Fixture source directory not found: #{source_dir}" unless Dir.exist?(source_dir)

            FileUtils.mkdir_p(target_dir)
            # "#{source_dir}/." copies the directory's *contents* (including
            # dotfiles) into target_dir rather than the directory itself.
            FileUtils.cp_r("#{source_dir}/.", target_dir)

            # FNM_DOTMATCH so hidden files -- which cp_r does copy -- appear in
            # the returned listing; "." / ".." entries are excluded defensively
            # (older Rubies include them under FNM_DOTMATCH).
            Dir.glob("**/*", File::FNM_DOTMATCH, base: target_dir)
               .reject { |path| %w[. ..].include?(File.basename(path)) }
               .sort
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
# frozen_string_literal: true

require "ace/llm"
require "ace/llm/query_interface"

module Ace
  module Test
    module EndToEndRunner
      module Molecules
        # Executes standalone scenarios using the deterministic pipeline.
        #
        # Orchestrates three phases: sandbox construction, a runner LLM pass
        # that carries out the scenario goals, and a verifier LLM pass whose
        # output is turned into a report by the report generator.
        class PipelineExecutor
          # @param provider [String]
          # @param timeout [Integer]
          # @param sandbox_builder [Molecules::PipelineSandboxBuilder]
          # @param prompt_bundler [Molecules::PipelinePromptBundler]
          # @param report_generator [Molecules::PipelineReportGenerator]
          def initialize(provider:, timeout:, sandbox_builder: nil, prompt_bundler: nil, report_generator: nil)
            @provider = provider
            @timeout = timeout
            @sandbox_builder = sandbox_builder || PipelineSandboxBuilder.new
            @prompt_bundler = prompt_bundler || PipelinePromptBundler.new
            @report_generator = report_generator || PipelineReportGenerator.new
          end

          # Run the full pipeline for one scenario.
          #
          # @param scenario [Models::TestScenario]
          # @param cli_args [String, nil]
          # @param sandbox_path [String]
          # @param report_dir [String]
          # @param env_vars [Hash, nil]
          # @param test_cases [Array<String>, nil]
          # @return [Models::TestResult]
          def execute(scenario:, cli_args:, sandbox_path:, report_dir:, env_vars: nil, test_cases: nil)
            started_at = Time.now

            sandbox_env = @sandbox_builder.build(
              scenario: scenario,
              sandbox_path: sandbox_path,
              test_cases: test_cases
            )
            subprocess_env = (env_vars || {}).merge(sandbox_env)

            # Runner pass: perform the scenario goals inside the sandbox.
            runner_files = @prompt_bundler.prepare_runner(
              scenario: scenario,
              sandbox_path: sandbox_path,
              test_cases: test_cases
            )
            run_llm(
              prompt_path: runner_files[:prompt_path],
              system_path: runner_files[:system_path],
              output_path: runner_files[:output_path],
              cli_args: cli_args,
              env_vars: subprocess_env
            )

            # Verifier pass: judge the artifacts the runner produced.
            verifier_files = @prompt_bundler.prepare_verifier(
              scenario: scenario,
              sandbox_path: sandbox_path,
              test_cases: test_cases
            )
            verdict = run_llm(
              prompt_path: verifier_files[:prompt_path],
              system_path: verifier_files[:system_path],
              output_path: verifier_files[:output_path],
              cli_args: cli_args,
              env_vars: subprocess_env
            )

            @report_generator.generate(
              scenario: scenario,
              verifier_output: verdict[:text],
              report_dir: report_dir,
              provider: @provider,
              started_at: started_at,
              completed_at: Time.now
            )
          rescue => e
            report_failure(
              scenario: scenario,
              report_dir: report_dir,
              started_at: started_at || Time.now,
              error: e
            )
          end

          private

          # Persist a failure report for the given pipeline error; if even the
          # report write fails, fall back to an in-memory error result that
          # records both failures.
          def report_failure(scenario:, report_dir:, started_at:, error:)
            @report_generator.write_failure_report(
              scenario: scenario,
              report_dir: report_dir,
              provider: @provider,
              started_at: started_at,
              completed_at: Time.now,
              error_message: "#{error.class}: #{error.message}"
            )
          rescue => write_error
            Models::TestResult.new(
              test_id: scenario.test_id,
              status: "error",
              summary: "Execution pipeline failed",
              error: "#{error.class}: #{error.message}; failed to write error report: #{write_error.class}: #{write_error.message}",
              started_at: started_at,
              completed_at: Time.now
            )
          end

          # Invoke the configured LLM provider with the prepared prompt files.
          def run_llm(prompt_path:, system_path:, output_path:, cli_args:, env_vars:)
            user_prompt = File.read(prompt_path)
            system_prompt = File.read(system_path)
            # PROJECT_ROOT_PATH may arrive with a String or Symbol key.
            project_root = env_vars["PROJECT_ROOT_PATH"] || env_vars[:PROJECT_ROOT_PATH]

            Ace::LLM::QueryInterface.query(
              @provider,
              user_prompt,
              system: system_prompt,
              cli_args: cli_args,
              timeout: @timeout,
              fallback: false,
              output: output_path,
              working_dir: project_root,
              subprocess_env: env_vars
            )
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
# frozen_string_literal: true

require "date"
require "fileutils"
require "set"
require "yaml"

module Ace
  module Test
    module EndToEndRunner
      module Molecules
        # Prepares deterministic runner/verifier prompt files for pipeline execution.
        class PipelinePromptBundler
          RUNNER_SYSTEM_PROMPT = <<~PROMPT
            You are an E2E test executor working in a sandbox directory.

            Rules:
            - Execute each goal in order
            - Save all artifacts to results/tc/{NN}/ directories as specified
            - Treat the initial working directory as SANDBOX_ROOT; if a goal needs commands in a created worktree, cd there for execution but keep artifact writes under SANDBOX_ROOT/results
            - Do not fabricate output - all artifacts must come from real tool execution
            - If a goal fails, note the failure and continue to the next goal
            - After all goals, output a brief summary of what you produced for each goal
          PROMPT

          VERIFIER_SYSTEM_PROMPT = <<~PROMPT
            You are an E2E test verifier. You inspect artifacts and render PASS/FAIL verdicts.

            Rules:
            - Evaluate each goal independently based solely on the artifacts provided
            - Do not speculate about what the runner did - only judge what exists
            - For each failed goal, include a category:
              test-spec-error | tool-bug | runner-error | infrastructure-error
            - For each goal, cite specific evidence (filenames, content snippets)
            - Follow the output format exactly
          PROMPT

          # Write the runner system prompt and the bundled runner prompt into
          # the sandbox cache directory.
          #
          # @param scenario [Models::TestScenario]
          # @param sandbox_path [String]
          # @param test_cases [Array<String>, nil]
          # @return [Hash] :system_path, :prompt_path and :output_path keys
          def prepare_runner(scenario:, sandbox_path:, test_cases: nil)
            cache_dir = ensure_cache_dir(sandbox_path)
            system_path = File.join(cache_dir, "runner-system.md")
            prompt_path = File.join(cache_dir, "runner-prompt.md")

            File.write(system_path, RUNNER_SYSTEM_PROMPT)

            bundled = bundle_markdown_file(File.join(scenario.dir_path, "runner.yml.md"), test_cases: test_cases)
            # Pin the placeholder workspace root to the concrete sandbox path.
            bundled = bundled.gsub("Workspace root: (current directory)", "Workspace root: #{File.expand_path(sandbox_path)}")
            File.write(prompt_path, bundled)

            {
              system_path: system_path,
              prompt_path: prompt_path,
              output_path: File.join(cache_dir, "runner-output.md")
            }
          end

          # Write the verifier system prompt and a prompt combining the sandbox
          # artifact dump with the bundled verification criteria.
          #
          # @param scenario [Models::TestScenario]
          # @param sandbox_path [String]
          # @param test_cases [Array<String>, nil]
          # @return [Hash] :system_path, :prompt_path and :output_path keys
          def prepare_verifier(scenario:, sandbox_path:, test_cases: nil)
            cache_dir = ensure_cache_dir(sandbox_path)
            system_path = File.join(cache_dir, "verifier-system.md")
            prompt_path = File.join(cache_dir, "verifier-prompt.md")

            File.write(system_path, VERIFIER_SYSTEM_PROMPT)

            artifacts = build_artifact_section(sandbox_path)
            criteria = bundle_markdown_file(File.join(scenario.dir_path, "verifier.yml.md"), test_cases: test_cases)
            File.write(prompt_path, [artifacts, criteria].join("\n\n---\n\n"))

            {
              system_path: system_path,
              prompt_path: prompt_path,
              output_path: File.join(cache_dir, "verifier-output.md")
            }
          end

          private

          # Create (if needed) and return the sandbox's e2e cache directory.
          def ensure_cache_dir(sandbox_path)
            cache_dir = File.join(File.expand_path(sandbox_path), ".ace-local", "e2e")
            FileUtils.mkdir_p(cache_dir)
            cache_dir
          end

          # Read a *.yml.md file and concatenate its body with the bundle files
          # listed in its frontmatter, optionally filtered to selected TC IDs.
          #
          # @raise [ArgumentError] If the frontmatter is not valid YAML
          def bundle_markdown_file(path, test_cases: nil)
            raw = File.read(path)
            begin
              frontmatter, body = split_frontmatter(raw)
            rescue Psych::SyntaxError => e
              # Wrap here (where we know the file) so the error names the
              # offending markdown file, not just the YAML snippet.
              raise ArgumentError, "Invalid YAML frontmatter in #{path}: #{e.message}"
            end
            bundle_files = parse_bundle_files(frontmatter)
            selected_ids = normalize_selected_ids(test_cases)

            included_entries = bundle_files.select do |entry|
              include_bundle_entry?(entry, selected_ids)
            end

            sections = [body.rstrip]
            included_entries.each do |entry|
              # Bundle entries are relative to the markdown file's directory.
              full_path = File.expand_path(entry, File.dirname(path))
              sections << File.read(full_path).rstrip
            end
            sections.reject(&:empty?).join("\n\n---\n\n")
          end

          # Split "---\n<yaml>\n---\n<body>" into [frontmatter Hash, body].
          # Returns [{}, raw] when no frontmatter is present.
          def split_frontmatter(raw)
            match = raw.match(/\A---\s*\r?\n(.*?)\r?\n---\s*\r?\n(.*)\z/m)
            return [{}, raw] unless match

            parsed = YAML.safe_load(match[1], permitted_classes: [Date]) || {}
            [parsed, match[2]]
          end

          # Extract the bundle.files list from parsed frontmatter.
          #
          # @return [Array<String>] File entries (empty when absent/malformed)
          def parse_bundle_files(frontmatter)
            files = frontmatter.dig("bundle", "files")
            return [] unless files.is_a?(Array)

            files.map(&:to_s).reject(&:empty?)
          end

          # Upcase the requested TC IDs into a Set, or nil for "no filter".
          def normalize_selected_ids(test_cases)
            return nil unless test_cases && !test_cases.empty?

            test_cases.map { |tc| tc.to_s.upcase }.to_set
          end

          # A bundle entry is included when no filter is active, when it has
          # no recognizable TC prefix, or when its TC ID was selected.
          def include_bundle_entry?(entry, selected_ids)
            return true unless selected_ids

            tc_id = extract_tc_id(entry)
            return true if tc_id.nil?

            selected_ids.include?(tc_id)
          end

          # Extract an upcased "TC-NNN" prefix from a filename, or nil.
          def extract_tc_id(path)
            match = File.basename(path).match(/\A(TC-\d+[a-z]*)/i)
            match ? match[1].upcase : nil
          end

          # Render every file under <sandbox>/results as a markdown section:
          # a directory-tree listing followed by each file's contents.
          def build_artifact_section(sandbox_path)
            sandbox_path = File.expand_path(sandbox_path)
            files = Dir.glob(File.join(sandbox_path, "results", "**", "*")).select { |f| File.file?(f) }.sort
            tree_entries = files.map { |f| relative_path(f, sandbox_path) }

            parts = []
            parts << "# Sandbox Artifacts"
            parts << ""
            parts << "## Directory tree"
            parts << "```"
            parts.concat(tree_entries)
            parts << "```"
            parts << ""
            parts << "## File contents"
            parts << ""

            files.each do |file|
              parts << "### `#{relative_path(file, sandbox_path)}`"
              parts << "```"
              parts << safe_read(file)
              parts << "```"
              parts << ""
            end

            parts.join("\n").rstrip
          end

          # Path relative to root (both expanded first).
          def relative_path(path, root)
            File.expand_path(path).sub("#{File.expand_path(root)}/", "")
          end

          # Read file bytes, replacing any non-UTF-8 sequences with "?" so the
          # prompt stays valid UTF-8 regardless of artifact encoding.
          def safe_read(path)
            File.binread(path).encode("UTF-8", invalid: :replace, undef: :replace, replace: "?")
          end
        end
      end
    end
  end
end
|