ace-test-runner-e2e 0.29.6 → 0.38.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.ace-defaults/e2e-runner/config.yml +14 -2
- data/CHANGELOG.md +187 -0
- data/README.md +2 -2
- data/exe/ace-test-e2e-sh +9 -4
- data/handbook/guides/e2e-testing.g.md +43 -9
- data/handbook/guides/scenario-yml-reference.g.md +16 -8
- data/handbook/guides/tc-authoring.g.md +12 -5
- data/handbook/skills/as-e2e-fix/SKILL.md +2 -2
- data/handbook/skills/as-e2e-review/SKILL.md +2 -2
- data/handbook/templates/ace-taskflow-fixture.template.md +17 -17
- data/handbook/templates/agent-experience-report.template.md +3 -2
- data/handbook/templates/scenario.yml.template.yml +13 -2
- data/handbook/templates/tc-file.template.md +14 -4
- data/handbook/workflow-instructions/e2e/analyze-failures.wf.md +53 -6
- data/handbook/workflow-instructions/e2e/create.wf.md +139 -23
- data/handbook/workflow-instructions/e2e/execute.wf.md +11 -7
- data/handbook/workflow-instructions/e2e/fix.wf.md +65 -15
- data/handbook/workflow-instructions/e2e/plan-changes.wf.md +17 -1
- data/handbook/workflow-instructions/e2e/review.wf.md +44 -28
- data/handbook/workflow-instructions/e2e/rewrite.wf.md +17 -3
- data/handbook/workflow-instructions/e2e/run.wf.md +50 -26
- data/handbook/workflow-instructions/e2e/setup-sandbox.wf.md +4 -4
- data/lib/ace/test/end_to_end_runner/atoms/skill_prompt_builder.rb +7 -5
- data/lib/ace/test/end_to_end_runner/atoms/skill_result_parser.rb +73 -7
- data/lib/ace/test/end_to_end_runner/cli/commands/run_test.rb +21 -8
- data/lib/ace/test/end_to_end_runner/models/test_case.rb +8 -2
- data/lib/ace/test/end_to_end_runner/models/test_result.rb +9 -3
- data/lib/ace/test/end_to_end_runner/models/test_scenario.rb +4 -2
- data/lib/ace/test/end_to_end_runner/molecules/affected_detector.rb +7 -2
- data/lib/ace/test/end_to_end_runner/molecules/bwrap_sandbox_backend.rb +271 -0
- data/lib/ace/test/end_to_end_runner/molecules/config_loader.rb +28 -1
- data/lib/ace/test/end_to_end_runner/molecules/integration_runner.rb +122 -0
- data/lib/ace/test/end_to_end_runner/molecules/pipeline_executor.rb +165 -25
- data/lib/ace/test/end_to_end_runner/molecules/pipeline_prompt_bundler.rb +121 -8
- data/lib/ace/test/end_to_end_runner/molecules/pipeline_report_generator.rb +91 -19
- data/lib/ace/test/end_to_end_runner/molecules/pipeline_sandbox_builder.rb +119 -18
- data/lib/ace/test/end_to_end_runner/molecules/report_writer.rb +13 -12
- data/lib/ace/test/end_to_end_runner/molecules/sandbox_runtime_builder.rb +282 -0
- data/lib/ace/test/end_to_end_runner/molecules/scenario_loader.rb +85 -5
- data/lib/ace/test/end_to_end_runner/molecules/setup_executor.rb +98 -16
- data/lib/ace/test/end_to_end_runner/molecules/suite_report_writer.rb +241 -97
- data/lib/ace/test/end_to_end_runner/molecules/test_discoverer.rb +38 -13
- data/lib/ace/test/end_to_end_runner/molecules/test_executor.rb +27 -5
- data/lib/ace/test/end_to_end_runner/organisms/suite_orchestrator.rb +73 -15
- data/lib/ace/test/end_to_end_runner/organisms/test_orchestrator.rb +120 -19
- data/lib/ace/test/end_to_end_runner/version.rb +1 -1
- data/lib/ace/test/end_to_end_runner.rb +2 -0
- metadata +19 -2
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "fileutils"
|
|
4
|
+
require "open3"
|
|
5
|
+
require "shellwords"
|
|
6
|
+
require "ace/test_support/sandbox_package_copy"
|
|
7
|
+
|
|
8
|
+
module Ace
|
|
9
|
+
module Test
|
|
10
|
+
module EndToEndRunner
|
|
11
|
+
module Molecules
|
|
12
|
+
# Runs deterministic preflight tests inside a sandboxed package copy.
#
# Every requested test file is executed with the `ace-test` command from the
# package directory inside the sandbox. The per-file outcomes are then folded
# into a single Models::TestResult whose test_id is "PREFLIGHT".
class IntegrationRunner
  # @param base_dir [String] directory used to locate the sandbox and to
  #   relativize test file paths; stored as an absolute path
  # @param package_copy [#prepare, nil] collaborator that populates the
  #   sandbox; when nil an Ace::TestSupport::SandboxPackageCopy rooted at
  #   base_dir is created
  def initialize(base_dir: Dir.pwd, package_copy: nil)
    @base_dir = File.expand_path(base_dir)
    @package_copy = package_copy || Ace::TestSupport::SandboxPackageCopy.new(source_root: @base_dir)
  end

  # Prepares a preflight sandbox and runs each file through `ace-test`.
  #
  # The sandbox lives at
  # `<base_dir>/.ace-local/test-e2e/<timestamp>-<package>-preflight`.
  #
  # @param package [String] package name, used for the sandbox directory name
  #   and passed to the package copy
  # @param files [Array<String>, nil] test files to run
  # @param timestamp [String] run identifier embedded in the sandbox name
  # @param output [#puts] receives one progress line per file
  # @return [Models::TestResult, nil] nil when files is nil or empty
  def run(package:, files:, timestamp:, output: $stdout)
    return nil if files.nil? || files.empty?

    started_at = Time.now
    sandbox_root = File.join(@base_dir, ".ace-local", "test-e2e", "#{timestamp}-#{package}-preflight")
    FileUtils.mkdir_p(sandbox_root)

    package_copy_result = @package_copy.prepare(package_name: package, sandbox_root: sandbox_root)
    package_root = resolve_package_root(sandbox_root, package)
    # NOTE(review): assumes `prepare` returns a Hash-like object; a nil
    # result would raise here -- confirm against SandboxPackageCopy.
    env = package_copy_result[:env] || {}

    test_cases = files.map { |file| run_file(package_root, file, env, output) }
    status = overall_status(test_cases)

    Models::TestResult.new(
      test_id: "PREFLIGHT",
      status: status,
      test_cases: test_cases,
      summary: preflight_summary(status, test_cases),
      started_at: started_at,
      completed_at: Time.now,
      metadata: {
        phase: "preflight",
        package: package,
        sandbox_root: sandbox_root
      }
    )
  end

  private

  # Runs one test file and converts the outcome into a result hash.
  #
  # A non-zero exit is reported as "fail"; any StandardError raised while
  # launching the command is reported as "error" with the exception message
  # in :notes.
  #
  # @return [Hash] keys :id, :description, :status, :actual, :notes, :metadata
  def run_file(package_root, file, env, output)
    package_relative = package_relative_path(file)

    stdout, stderr, status = Open3.capture3(
      env,
      "ace-test",
      package_relative,
      chdir: package_root
    )
    outcome = status.success? ? "pass" : "fail"

    output.puts "Preflight: #{package_relative} (#{outcome})"

    {
      id: package_relative,
      description: package_relative,
      status: outcome,
      actual: stdout,
      notes: stderr,
      metadata: {
        phase: "preflight",
        exit_status: status.exitstatus,
        command: preflight_command(package_relative)
      }
    }
  rescue StandardError => e
    output.puts "Preflight: #{package_relative} (error)"

    {
      id: package_relative,
      description: package_relative,
      status: "error",
      actual: "",
      notes: e.message,
      metadata: {
        phase: "preflight",
        command: preflight_command(package_relative)
      }
    }
  end

  # Strips the base directory and then the first remaining path component
  # from +file+.
  #
  # The base directory is only removed when it is followed by "/" or the end
  # of the string, so a sibling directory that merely shares the prefix
  # (e.g. "<base>-other/...") is left untouched instead of being mangled.
  def package_relative_path(file)
    relative = file.sub(%r{\A#{Regexp.escape(@base_dir)}(?:/|\z)}, "")
    relative.sub(%r{\A[^/]+/}, "")
  end

  # Shell-escaped command line recorded in the result metadata.
  def preflight_command(package_relative)
    Shellwords.join(["ace-test", package_relative])
  end

  # Uses `<sandbox_root>/<package>` when that directory exists, otherwise the
  # sandbox root itself.
  def resolve_package_root(sandbox_root, package)
    candidate = File.join(sandbox_root, package)
    return candidate if Dir.exist?(candidate)

    sandbox_root
  end

  # Aggregates per-file statuses: any "error" wins, then any "fail",
  # otherwise "pass".
  def overall_status(test_cases)
    statuses = test_cases.map { |tc| tc[:status] }
    return "error" if statuses.include?("error")
    return "fail" if statuses.include?("fail")

    "pass"
  end

  # Builds the one-line summary, e.g. "Preflight passed: 2/2 files passed".
  def preflight_summary(status, test_cases)
    passed = test_cases.count { |tc| tc[:status] == "pass" }
    total = test_cases.size
    prefix =
      case status
      when "pass" then "Preflight passed"
      when "fail" then "Preflight failed"
      else "Preflight errored"
      end
    "#{prefix}: #{passed}/#{total} files passed"
  end
end
|
|
119
|
+
end
|
|
120
|
+
end
|
|
121
|
+
end
|
|
122
|
+
end
|
|
@@ -1,5 +1,8 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require "fileutils"
|
|
4
|
+
require "json"
|
|
5
|
+
require "time"
|
|
3
6
|
require "ace/llm"
|
|
4
7
|
require "ace/llm/query_interface"
|
|
5
8
|
|
|
@@ -9,17 +12,25 @@ module Ace
|
|
|
9
12
|
module Molecules
|
|
10
13
|
# Executes standalone scenarios using the deterministic pipeline.
|
|
11
14
|
class PipelineExecutor
|
|
15
|
+
AMBIENT_TMUX_ENV_VARS = %w[TMUX TMUX_PANE].freeze
|
|
16
|
+
|
|
12
17
|
# @param provider [String]
|
|
18
|
+
# @param verifier_provider [String, nil]
|
|
13
19
|
# @param timeout [Integer]
|
|
14
20
|
# @param sandbox_builder [Molecules::PipelineSandboxBuilder]
|
|
15
21
|
# @param prompt_bundler [Molecules::PipelinePromptBundler]
|
|
16
22
|
# @param report_generator [Molecules::PipelineReportGenerator]
|
|
17
|
-
def initialize(provider:, timeout:, sandbox_builder: nil, prompt_bundler: nil,
|
|
23
|
+
def initialize(provider:, verifier_provider: nil, timeout:, sandbox_builder: nil, prompt_bundler: nil,
               report_generator: nil, sandbox_backend_factory: nil)
  # Default factory: wraps the sandbox in a bubblewrap backend.
  default_backend_factory = ->(sandbox_path, source_root: nil) {
    Molecules::BwrapSandboxBackend.new(sandbox_root: sandbox_path, source_root: source_root)
  }

  @provider = provider
  # The verifier reuses the runner provider unless one is given explicitly.
  @verifier_provider = verifier_provider || provider
  @timeout = timeout
  @sandbox_builder = sandbox_builder || PipelineSandboxBuilder.new
  @prompt_bundler = prompt_bundler || PipelinePromptBundler.new
  @report_generator = report_generator || PipelineReportGenerator.new
  # @param sandbox_backend_factory [#call, nil] called with the sandbox path
  #   and a source_root: keyword to build the sandbox backend
  @sandbox_backend_factory = sandbox_backend_factory || default_backend_factory
end
|
|
24
35
|
|
|
25
36
|
# @param scenario [Models::TestScenario]
|
|
@@ -31,57 +42,89 @@ module Ace
|
|
|
31
42
|
# @return [Models::TestResult]
|
|
32
43
|
def execute(scenario:, cli_args:, sandbox_path:, report_dir:, env_vars: nil, test_cases: nil)
|
|
33
44
|
started_at = Time.now
|
|
45
|
+
FileUtils.mkdir_p(report_dir)
|
|
46
|
+
write_command_record(report_dir, "runner", provider: @provider, cli_args: cli_args)
|
|
47
|
+
write_tc_manifests(report_dir, scenario, test_cases: test_cases)
|
|
34
48
|
|
|
35
|
-
build_env =
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
49
|
+
build_env = if prepared_sandbox?(sandbox_path, env_vars)
|
|
50
|
+
@sandbox_builder.prepare_existing_sandbox(
|
|
51
|
+
scenario: scenario,
|
|
52
|
+
sandbox_path: sandbox_path,
|
|
53
|
+
test_cases: test_cases
|
|
54
|
+
)
|
|
55
|
+
else
|
|
56
|
+
@sandbox_builder.build(
|
|
57
|
+
scenario: scenario,
|
|
58
|
+
sandbox_path: sandbox_path,
|
|
59
|
+
test_cases: test_cases
|
|
60
|
+
)
|
|
61
|
+
end
|
|
62
|
+
merged_env = sanitize_subprocess_env((env_vars || {}).merge(build_env))
|
|
63
|
+
sandbox_backend = @sandbox_backend_factory.call(
|
|
64
|
+
sandbox_path,
|
|
65
|
+
source_root: merged_env["ACE_E2E_SOURCE_ROOT"] || merged_env[:ACE_E2E_SOURCE_ROOT]
|
|
39
66
|
)
|
|
40
|
-
merged_env =
|
|
67
|
+
merged_env = sandbox_backend.prepared_env(merged_env)
|
|
41
68
|
|
|
42
69
|
runner = @prompt_bundler.prepare_runner(
|
|
43
70
|
scenario: scenario,
|
|
44
71
|
sandbox_path: sandbox_path,
|
|
45
72
|
test_cases: test_cases
|
|
46
73
|
)
|
|
47
|
-
run_llm(
|
|
74
|
+
runner_response = run_llm(
|
|
48
75
|
prompt_path: runner[:prompt_path],
|
|
49
76
|
system_path: runner[:system_path],
|
|
50
77
|
output_path: runner[:output_path],
|
|
51
78
|
cli_args: cli_args,
|
|
52
|
-
env_vars: merged_env
|
|
79
|
+
env_vars: merged_env,
|
|
80
|
+
subprocess_command_prefix: sandbox_backend.command_prefix(chdir: sandbox_path, env: merged_env),
|
|
81
|
+
provider: @provider
|
|
53
82
|
)
|
|
83
|
+
runner_observations = extract_runner_observations(runner_response[:text])
|
|
84
|
+
artifact_contract = snapshot_artifacts(report_dir, sandbox_path, scenario, test_cases: test_cases)
|
|
54
85
|
|
|
55
86
|
verifier = @prompt_bundler.prepare_verifier(
|
|
56
87
|
scenario: scenario,
|
|
57
88
|
sandbox_path: sandbox_path,
|
|
58
|
-
test_cases: test_cases
|
|
89
|
+
test_cases: test_cases,
|
|
90
|
+
runner_observations: runner_observations,
|
|
91
|
+
artifact_contract: artifact_contract
|
|
59
92
|
)
|
|
93
|
+
write_command_record(report_dir, "verifier", provider: @verifier_provider, cli_args: cli_args)
|
|
60
94
|
verifier_response = run_llm(
|
|
61
95
|
prompt_path: verifier[:prompt_path],
|
|
62
96
|
system_path: verifier[:system_path],
|
|
63
97
|
output_path: verifier[:output_path],
|
|
64
98
|
cli_args: cli_args,
|
|
65
|
-
env_vars: merged_env
|
|
99
|
+
env_vars: merged_env,
|
|
100
|
+
subprocess_command_prefix: sandbox_backend.command_prefix(chdir: sandbox_path, env: merged_env),
|
|
101
|
+
provider: @verifier_provider
|
|
66
102
|
)
|
|
67
103
|
|
|
68
104
|
@report_generator.generate(
|
|
69
105
|
scenario: scenario,
|
|
70
106
|
verifier_output: verifier_response[:text],
|
|
71
107
|
report_dir: report_dir,
|
|
72
|
-
provider: @
|
|
108
|
+
provider: @verifier_provider,
|
|
73
109
|
started_at: started_at,
|
|
74
|
-
completed_at: Time.now
|
|
110
|
+
completed_at: Time.now,
|
|
111
|
+
metadata: base_metadata(
|
|
112
|
+
report_dir,
|
|
113
|
+
runner_observations: runner_observations,
|
|
114
|
+
artifact_contract: artifact_contract
|
|
115
|
+
)
|
|
75
116
|
)
|
|
76
117
|
rescue => e
|
|
77
118
|
begin
|
|
78
119
|
@report_generator.write_failure_report(
|
|
79
120
|
scenario: scenario,
|
|
80
121
|
report_dir: report_dir,
|
|
81
|
-
provider: @
|
|
122
|
+
provider: @verifier_provider,
|
|
82
123
|
started_at: started_at || Time.now,
|
|
83
124
|
completed_at: Time.now,
|
|
84
|
-
error_message: "#{e.class}: #{e.message}"
|
|
125
|
+
error_message: "#{e.class}: #{e.message}",
|
|
126
|
+
failure_category: "runner-error",
|
|
127
|
+
metadata: base_metadata(report_dir)
|
|
85
128
|
)
|
|
86
129
|
rescue => write_error
|
|
87
130
|
Models::TestResult.new(
|
|
@@ -97,24 +140,121 @@ module Ace
|
|
|
97
140
|
|
|
98
141
|
private
|
|
99
142
|
|
|
100
|
-
def run_llm(prompt_path:, system_path:, output_path:, cli_args:, env_vars:)
|
|
143
|
+
# Sends one prompt/system pair to the given provider through
# Ace::LLM::QueryInterface and returns whatever that call returns.
#
# The working directory is taken from PROJECT_ROOT_PATH in env_vars
# (string key first, then symbol key). Provider fallback is disabled.
def run_llm(prompt_path:, system_path:, output_path:, cli_args:, env_vars:, subprocess_command_prefix:, provider:)
  prompt_text = File.read(prompt_path)
  system_text = File.read(system_path)
  working_dir = env_vars["PROJECT_ROOT_PATH"] || env_vars[:PROJECT_ROOT_PATH]

  query_options = {
    system: system_text,
    cli_args: cli_args,
    timeout: @timeout,
    fallback: false,
    output: output_path,
    subprocess_env: env_vars,
    subprocess_command_prefix: subprocess_command_prefix,
    working_dir: working_dir
  }

  Ace::LLM::QueryInterface.query(provider, prompt_text, **query_options)
end
|
|
161
|
+
|
|
162
|
+
# Writes one `<short_id>.manifest.json` file into report_dir for every
# selected test case, describing its id, title, artifacts and goal format.
def write_tc_manifests(report_dir, scenario, test_cases:)
  select_test_cases(scenario, test_cases).each do |tc|
    payload = {
      tc_id: tc.tc_id,
      title: tc.title,
      declared_artifacts: Array(tc.declared_artifacts),
      optional_artifacts: Array(tc.optional_artifacts),
      goal_format: tc.goal_format
    }
    manifest_path = File.join(report_dir, "#{tc.short_id}.manifest.json")
    File.write(manifest_path, JSON.pretty_generate(payload))
  end
end
|
|
178
|
+
|
|
179
|
+
# Records how a pipeline phase was invoked as `<phase>.command.json` inside
# report_dir: phase name, provider, CLI args, the executor timeout and a UTC
# ISO 8601 timestamp taken when the record is written.
def write_command_record(report_dir, phase, provider:, cli_args:)
  payload = JSON.pretty_generate(
    {
      phase: phase,
      provider: provider,
      cli_args: cli_args,
      timeout: @timeout,
      recorded_at: Time.now.utc.iso8601
    }
  )
  target = File.join(report_dir, "#{phase}.command.json")
  File.write(target, payload)
end
|
|
192
|
+
|
|
193
|
+
# Checks, for every selected test case, which declared (required) and
# optional artifacts currently exist under sandbox_path, writes the result to
# `artifact-snapshot.json` in report_dir and returns it.
#
# @return [Hash] tc_id => hash of sorted artifact lists (string keys)
def snapshot_artifacts(report_dir, sandbox_path, scenario, test_cases:)
  in_sandbox = ->(path) { File.exist?(File.join(sandbox_path, path)) }
  snapshot = {}

  select_test_cases(scenario, test_cases).each do |tc|
    required = Array(tc.declared_artifacts).sort
    optional = Array(tc.optional_artifacts).sort
    present_required, missing_required = required.partition(&in_sandbox)
    present_optional = optional.select(&in_sandbox)

    snapshot[tc.tc_id] = {
      "present_artifacts" => (present_required + present_optional).sort,
      "required_artifacts" => required,
      "present_required_artifacts" => present_required,
      "missing_required_artifacts" => missing_required,
      "optional_artifacts" => optional,
      "present_optional_artifacts" => present_optional
    }
  end

  File.write(File.join(report_dir, "artifact-snapshot.json"), JSON.pretty_generate(snapshot))
  snapshot
end
|
|
213
|
+
|
|
214
|
+
# Returns the scenario's test cases, optionally narrowed to the requested
# ids. A nil or empty filter selects everything; ids are compared
# case-insensitively after string conversion.
def select_test_cases(scenario, test_cases)
  if test_cases.nil? || test_cases.empty?
    Array(scenario.test_cases)
  else
    requested_ids = test_cases.map { |id| id.to_s.upcase }
    Array(scenario.test_cases).select do |candidate|
      requested_ids.include?(candidate.tc_id.to_s.upcase)
    end
  end
end
|
|
220
|
+
|
|
221
|
+
# Builds the metadata hash attached to generated reports.
#
# Always contains the runner provider, verifier provider and report dir.
# Runner observations are added only when non-empty. When an artifact
# contract is given, "missing_required_artifacts" maps each tc_id to its
# missing required paths, omitting test cases with nothing missing.
def base_metadata(report_dir, runner_observations: nil, artifact_contract: nil)
  details = {
    "runner_provider" => @provider,
    "verifier_provider" => @verifier_provider,
    "report_dir" => report_dir
  }

  has_observations = runner_observations && !runner_observations.empty?
  details["runner_observations"] = runner_observations if has_observations
  return details unless artifact_contract

  details["missing_required_artifacts"] =
    artifact_contract.to_h.each_with_object({}) do |(tc_id, entry), missing|
      paths = Array(entry["missing_required_artifacts"])
      missing[tc_id] = paths unless paths.empty?
    end
  details
end
|
|
237
|
+
|
|
238
|
+
# Returns a copy of env_vars in which every ambient tmux variable (matched by
# string or symbol name) is dropped and then re-added under its string name
# with a nil value.
def sanitize_subprocess_env(env_vars)
  tmux_overrides = AMBIENT_TMUX_ENV_VARS.to_h { |name| [name, nil] }
  env_vars
    .reject { |name, _value| AMBIENT_TMUX_ENV_VARS.include?(name.to_s) }
    .merge(tmux_overrides)
end
|
|
243
|
+
|
|
244
|
+
# True when env_vars is a non-empty Hash whose PROJECT_ROOT_PATH (string or
# symbol key) expands to the same absolute path as sandbox_path.
def prepared_sandbox?(sandbox_path, env_vars)
  return false unless env_vars.is_a?(Hash)
  return false if env_vars.empty?

  declared_root = env_vars["PROJECT_ROOT_PATH"] || env_vars[:PROJECT_ROOT_PATH]
  if declared_root.to_s.strip.empty?
    false
  else
    File.expand_path(declared_root) == File.expand_path(sandbox_path)
  end
end
|
|
252
|
+
|
|
253
|
+
# Pulls the :observations entry out of the parsed runner output as a String.
# Returns "" when the parser raises Atoms::ResultParser::ParseError.
def extract_runner_observations(text)
  begin
    parsed = Atoms::SkillResultParser.parse(text)
    parsed[:observations].to_s
  rescue Atoms::ResultParser::ParseError
    ""
  end
end
|
|
118
258
|
end
|
|
119
259
|
end
|
|
120
260
|
end
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
require "date"
|
|
4
4
|
require "fileutils"
|
|
5
|
+
require "time"
|
|
5
6
|
require "yaml"
|
|
6
7
|
|
|
7
8
|
module Ace
|
|
@@ -15,21 +16,35 @@ module Ace
|
|
|
15
16
|
|
|
16
17
|
Rules:
|
|
17
18
|
- Execute each goal in order
|
|
18
|
-
-
|
|
19
|
-
-
|
|
19
|
+
- Treat the initial working directory as SANDBOX_ROOT; if a goal needs commands in a created worktree, cd there for execution but keep any declared outcome artifacts under SANDBOX_ROOT/results
|
|
20
|
+
- Preserve the sandbox runtime environment; do not reset PATH, HOME, or other provided env vars
|
|
21
|
+
- If `ACE_E2E_SANDBOX_RUNTIME_ROOT` is set, make sure command execution uses `$ACE_E2E_SANDBOX_RUNTIME_ROOT/bin` on PATH in the shell where you run scenario commands
|
|
22
|
+
- Run `ace-*` commands directly; do not wrap them with `timeout`, `env -i`, or other execution wrappers that can change behavior or hide diagnostics
|
|
23
|
+
- Do not bypass the public CLI with repo-local executables such as `./exe/ace-*`, `bin/ace-*`, or `ruby .../exe/ace-*`
|
|
20
24
|
- Do not fabricate output - all artifacts must come from real tool execution
|
|
25
|
+
- Never background commands or start dependent verification captures before the command they verify has completed
|
|
26
|
+
- When a goal requires command captures, keep stdout and stderr separate; do not merge streams and do not use `2>&1`
|
|
27
|
+
- A command capture set is incomplete unless the matching `.stdout`, `.stderr`, and `.exit` files all exist
|
|
28
|
+
- Persist each command's `.stdout`, `.stderr`, and `.exit` files immediately after that command finishes, before starting the next command
|
|
29
|
+
- For commands that establish state, write that command's `.exit` file before running any list/status/fs-check/tmux verification for the same goal
|
|
30
|
+
- When a successful command prints a filesystem path to a generated artifact, copy that artifact into `results/` if the goal asks for supporting evidence from the generated file
|
|
21
31
|
- If a goal fails, note the failure and continue to the next goal
|
|
22
|
-
-
|
|
32
|
+
- Do not create synthetic helper reports or temp input files under results/ unless the scenario explicitly treats them as product outcomes
|
|
33
|
+
- After all goals, return concise runner observations describing what you did and what happened
|
|
23
34
|
PROMPT
|
|
24
35
|
|
|
25
36
|
VERIFIER_SYSTEM_PROMPT = <<~PROMPT
|
|
26
37
|
You are an E2E test verifier. You inspect artifacts and render PASS/FAIL verdicts.
|
|
27
38
|
|
|
28
39
|
Rules:
|
|
29
|
-
- Evaluate each goal independently based
|
|
30
|
-
-
|
|
40
|
+
- Evaluate each goal independently based on sandbox state first, then runner observations, then raw debug captures only when needed
|
|
41
|
+
- Treat declared artifacts and helper filenames as hints, not as the source of truth
|
|
42
|
+
- If a helper file is missing or stale, inspect the sandbox directly before failing the goal
|
|
43
|
+
- Use artifact mtimes to detect runner ordering mistakes; if postcondition captures are older than the primary command's stdout/stderr/exit, classify the goal as `runner-error` unless direct sandbox state proves a product failure after the command completed
|
|
44
|
+
- Use read-only commands in the sandbox when they materially improve confidence (for example: git log/status/show, ls/find/cat)
|
|
45
|
+
- Do not speculate beyond the provided sandbox evidence and runner observations
|
|
31
46
|
- For each failed goal, include a category:
|
|
32
|
-
test-spec-error | tool-bug | runner-error | infrastructure-error
|
|
47
|
+
test-spec-error | tool-bug | runner-error | infrastructure-error | missing-artifact
|
|
33
48
|
- For each goal, cite specific evidence (filenames, content snippets)
|
|
34
49
|
- Follow the output format exactly
|
|
35
50
|
PROMPT
|
|
@@ -60,16 +75,20 @@ module Ace
|
|
|
60
75
|
# @param sandbox_path [String]
|
|
61
76
|
# @param test_cases [Array<String>, nil]
|
|
62
77
|
# @return [Hash]
|
|
63
|
-
def prepare_verifier(scenario:, sandbox_path:, test_cases: nil)
|
|
78
|
+
def prepare_verifier(scenario:, sandbox_path:, test_cases: nil, runner_observations: nil, artifact_contract: nil)
|
|
64
79
|
cache_dir = ensure_cache_dir(sandbox_path)
|
|
65
80
|
system_path = File.join(cache_dir, "verifier-system.md")
|
|
66
81
|
prompt_path = File.join(cache_dir, "verifier-prompt.md")
|
|
67
82
|
|
|
68
83
|
File.write(system_path, VERIFIER_SYSTEM_PROMPT)
|
|
69
84
|
|
|
85
|
+
project_context = build_project_context_section(scenario)
|
|
86
|
+
sandbox_context = build_sandbox_context_section(sandbox_path)
|
|
70
87
|
artifacts = build_artifact_section(sandbox_path)
|
|
88
|
+
contract = build_artifact_contract_section(artifact_contract)
|
|
89
|
+
observations = build_runner_observation_section(runner_observations)
|
|
71
90
|
criteria = bundle_markdown_file(File.join(scenario.dir_path, "verifier.yml.md"), test_cases: test_cases)
|
|
72
|
-
File.write(prompt_path, [artifacts, criteria].join("\n\n---\n\n"))
|
|
91
|
+
File.write(prompt_path, [project_context, sandbox_context, artifacts, contract, observations, criteria].join("\n\n---\n\n"))
|
|
73
92
|
|
|
74
93
|
{
|
|
75
94
|
system_path: system_path,
|
|
@@ -154,6 +173,13 @@ module Ace
|
|
|
154
173
|
parts.concat(tree_entries)
|
|
155
174
|
parts << "```"
|
|
156
175
|
parts << ""
|
|
176
|
+
parts << "## File metadata"
|
|
177
|
+
parts << "```"
|
|
178
|
+
files.each do |file|
|
|
179
|
+
parts << "#{relative_path(file, sandbox_path)}\tmtime=#{File.mtime(file).utc.iso8601}"
|
|
180
|
+
end
|
|
181
|
+
parts << "```"
|
|
182
|
+
parts << ""
|
|
157
183
|
parts << "## File contents"
|
|
158
184
|
parts << ""
|
|
159
185
|
|
|
@@ -168,6 +194,93 @@ module Ace
|
|
|
168
194
|
parts.join("\n").rstrip
|
|
169
195
|
end
|
|
170
196
|
|
|
197
|
+
# Renders the "# Project Context" markdown section for the verifier prompt.
#
# The package root is taken to be three directories above the scenario
# directory and the source root one level above that. Up to three of the
# candidate docs that exist (package README.md, docs/usage.md,
# docs/getting-started.md, source-root CLAUDE.md, in that order) are embedded
# in fenced blocks.
def build_project_context_section(scenario)
  package_root = File.expand_path("../../..", scenario.dir_path)
  source_root = File.expand_path("..", package_root)

  candidates = [
    File.join(package_root, "README.md"),
    File.join(package_root, "docs", "usage.md"),
    File.join(package_root, "docs", "getting-started.md"),
    File.join(source_root, "CLAUDE.md")
  ]
  documents = candidates.select { |path| File.file?(path) }.first(3)

  header = [
    "# Project Context",
    "",
    "- Package: `#{scenario.package}`",
    "- Test ID: `#{scenario.test_id}`",
    "- Sandbox profile: `#{scenario.sandbox_profile}`",
    ""
  ]
  body = documents.flat_map do |document|
    ["## `#{File.basename(document)}`", "```", safe_read(document), "```", ""]
  end

  (header + body).join("\n").rstrip
end
|
|
225
|
+
|
|
226
|
+
# Renders the "# Sandbox Context" markdown section: the absolute sandbox root
# plus a sorted listing of its top-level entries (dotfiles included, "." and
# ".." excluded) in a fenced block.
def build_sandbox_context_section(sandbox_path)
  root = File.expand_path(sandbox_path)
  top_level = Dir.glob(File.join(root, "*"), File::FNM_DOTMATCH)
  top_level = top_level.reject { |entry| %w[. ..].include?(File.basename(entry)) }.sort
  listing = top_level.map { |entry| relative_path(entry, root) }

  [
    "# Sandbox Context",
    "",
    "- Sandbox root: `#{root}`",
    "- Inspect the sandbox directly when verifying source-of-truth state.",
    "",
    "## Top-level entries",
    "```",
    *listing,
    "```"
  ].join("\n").rstrip
end
|
|
245
|
+
|
|
246
|
+
# Renders the "# Runner Observations" markdown section. Blank or nil
# observations are replaced by the placeholder "(none provided)".
def build_runner_observation_section(runner_observations)
  observations = runner_observations.to_s.strip
  observations = "(none provided)" if observations.empty?

  "# Runner Observations\n\n#{observations}"
end
|
|
253
|
+
|
|
254
|
+
# Renders the "# Artifact Contract" markdown section from an artifact
# snapshot (tc_id => hash of artifact lists). Test cases are listed in sorted
# order with one bullet per artifact list; a nil or empty snapshot yields a
# short placeholder section.
def build_artifact_contract_section(artifact_contract)
  if artifact_contract.nil? || artifact_contract.empty?
    return "# Artifact Contract\n\n(no snapshot provided)"
  end

  bullet_fields = [
    ["Required artifacts", "required_artifacts"],
    ["Present required artifacts", "present_required_artifacts"],
    ["Missing required artifacts", "missing_required_artifacts"],
    ["Optional artifacts", "optional_artifacts"],
    ["Present optional artifacts", "present_optional_artifacts"]
  ]

  intro = [
    "# Artifact Contract",
    "",
    "Use this only as supporting context. Missing helper artifacts may be acceptable when sandbox state still proves the goal.",
    ""
  ]
  sections = artifact_contract.sort.flat_map do |tc_id, entry|
    bullets = bullet_fields.map { |label, key| "- #{label}: #{format_artifact_list(entry[key])}" }
    ["## #{tc_id}", "", *bullets, ""]
  end

  (intro + sections).join("\n").rstrip
end
|
|
276
|
+
|
|
277
|
+
# Formats artifact paths as a comma-separated list of backtick-quoted
# entries, or "(none)" when there are no paths (nil included).
def format_artifact_list(paths)
  quoted = Array(paths).map { |path| "`#{path}`" }
  quoted.empty? ? "(none)" : quoted.join(", ")
end
|
|
283
|
+
|
|
171
284
|
# Returns +path+ relative to +root+ (both expanded to absolute paths first).
#
# Only a leading "<root>/" is removed. A path outside of root, or equal to
# root, is returned as its absolute form. Using a prefix check rather than a
# substring replacement keeps an occurrence of "<root>/" in the middle of an
# unrelated path from being cut out.
def relative_path(path, root)
  File.expand_path(path).delete_prefix("#{File.expand_path(root)}/")
end
|