ace-test-runner-e2e 0.29.8 → 0.40.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.ace-defaults/e2e-runner/config.yml +14 -2
- data/CHANGELOG.md +233 -0
- data/README.md +2 -2
- data/exe/ace-test-e2e-sh +9 -4
- data/handbook/guides/e2e-testing.g.md +75 -9
- data/handbook/guides/scenario-yml-reference.g.md +21 -8
- data/handbook/guides/tc-authoring.g.md +23 -5
- data/handbook/skills/as-e2e-fix/SKILL.md +2 -2
- data/handbook/skills/as-e2e-review/SKILL.md +2 -2
- data/handbook/templates/ace-taskflow-fixture.template.md +17 -17
- data/handbook/templates/agent-experience-report.template.md +3 -2
- data/handbook/templates/scenario.yml.template.yml +7 -2
- data/handbook/templates/tc-file.template.md +16 -4
- data/handbook/workflow-instructions/e2e/analyze-failures.wf.md +53 -6
- data/handbook/workflow-instructions/e2e/create.wf.md +128 -25
- data/handbook/workflow-instructions/e2e/execute.wf.md +11 -7
- data/handbook/workflow-instructions/e2e/fix.wf.md +84 -15
- data/handbook/workflow-instructions/e2e/plan-changes.wf.md +33 -1
- data/handbook/workflow-instructions/e2e/review.wf.md +40 -25
- data/handbook/workflow-instructions/e2e/rewrite.wf.md +22 -8
- data/handbook/workflow-instructions/e2e/run.wf.md +50 -26
- data/handbook/workflow-instructions/e2e/setup-sandbox.wf.md +4 -4
- data/lib/ace/test/end_to_end_runner/atoms/artifact_contract_validator.rb +138 -0
- data/lib/ace/test/end_to_end_runner/atoms/skill_prompt_builder.rb +7 -5
- data/lib/ace/test/end_to_end_runner/atoms/skill_result_parser.rb +73 -7
- data/lib/ace/test/end_to_end_runner/cli/commands/run_suite.rb +195 -5
- data/lib/ace/test/end_to_end_runner/cli/commands/run_test.rb +58 -9
- data/lib/ace/test/end_to_end_runner/models/test_case.rb +8 -2
- data/lib/ace/test/end_to_end_runner/models/test_result.rb +9 -3
- data/lib/ace/test/end_to_end_runner/models/test_scenario.rb +4 -2
- data/lib/ace/test/end_to_end_runner/molecules/affected_detector.rb +7 -2
- data/lib/ace/test/end_to_end_runner/molecules/artifact_pruner.rb +61 -0
- data/lib/ace/test/end_to_end_runner/molecules/bwrap_sandbox_backend.rb +271 -0
- data/lib/ace/test/end_to_end_runner/molecules/config_loader.rb +28 -1
- data/lib/ace/test/end_to_end_runner/molecules/integration_runner.rb +122 -0
- data/lib/ace/test/end_to_end_runner/molecules/pipeline_executor.rb +235 -18
- data/lib/ace/test/end_to_end_runner/molecules/pipeline_prompt_bundler.rb +164 -13
- data/lib/ace/test/end_to_end_runner/molecules/pipeline_report_generator.rb +91 -19
- data/lib/ace/test/end_to_end_runner/molecules/pipeline_sandbox_builder.rb +121 -18
- data/lib/ace/test/end_to_end_runner/molecules/report_writer.rb +15 -12
- data/lib/ace/test/end_to_end_runner/molecules/sandbox_runtime_builder.rb +374 -0
- data/lib/ace/test/end_to_end_runner/molecules/scenario_loader.rb +83 -5
- data/lib/ace/test/end_to_end_runner/molecules/setup_executor.rb +121 -16
- data/lib/ace/test/end_to_end_runner/molecules/suite_report_writer.rb +422 -97
- data/lib/ace/test/end_to_end_runner/molecules/test_discoverer.rb +38 -13
- data/lib/ace/test/end_to_end_runner/molecules/test_executor.rb +27 -5
- data/lib/ace/test/end_to_end_runner/organisms/suite_orchestrator.rb +98 -18
- data/lib/ace/test/end_to_end_runner/organisms/test_orchestrator.rb +159 -19
- data/lib/ace/test/end_to_end_runner/version.rb +1 -1
- data/lib/ace/test/end_to_end_runner.rb +4 -0
- metadata +21 -2
|
@@ -1,5 +1,8 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require "fileutils"
|
|
4
|
+
require "json"
|
|
5
|
+
require "time"
|
|
3
6
|
require "ace/llm"
|
|
4
7
|
require "ace/llm/query_interface"
|
|
5
8
|
|
|
@@ -9,17 +12,25 @@ module Ace
|
|
|
9
12
|
module Molecules
|
|
10
13
|
# Executes standalone scenarios using the deterministic pipeline.
|
|
11
14
|
class PipelineExecutor
|
|
15
|
+
AMBIENT_TMUX_ENV_VARS = %w[TMUX TMUX_PANE].freeze
|
|
16
|
+
|
|
12
17
|
# @param provider [String]
|
|
18
|
+
# @param verifier_provider [String, nil]
|
|
13
19
|
# @param timeout [Integer]
|
|
14
20
|
# @param sandbox_builder [Molecules::PipelineSandboxBuilder]
|
|
15
21
|
# @param prompt_bundler [Molecules::PipelinePromptBundler]
|
|
16
22
|
# @param report_generator [Molecules::PipelineReportGenerator]
|
|
17
|
-
def initialize(provider:, timeout:, sandbox_builder: nil, prompt_bundler: nil,
|
|
23
|
+
def initialize(provider:, verifier_provider: nil, timeout:, sandbox_builder: nil, prompt_bundler: nil,
|
|
24
|
+
report_generator: nil, sandbox_backend_factory: nil)
|
|
18
25
|
@provider = provider
|
|
26
|
+
@verifier_provider = verifier_provider || provider
|
|
19
27
|
@timeout = timeout
|
|
20
28
|
@sandbox_builder = sandbox_builder || PipelineSandboxBuilder.new
|
|
21
29
|
@prompt_bundler = prompt_bundler || PipelinePromptBundler.new
|
|
22
30
|
@report_generator = report_generator || PipelineReportGenerator.new
|
|
31
|
+
@sandbox_backend_factory = sandbox_backend_factory || lambda { |sandbox_path, source_root: nil|
|
|
32
|
+
Molecules::BwrapSandboxBackend.new(sandbox_root: sandbox_path, source_root: source_root)
|
|
33
|
+
}
|
|
23
34
|
end
|
|
24
35
|
|
|
25
36
|
# @param scenario [Models::TestScenario]
|
|
@@ -31,57 +42,126 @@ module Ace
|
|
|
31
42
|
# @return [Models::TestResult]
|
|
32
43
|
def execute(scenario:, cli_args:, sandbox_path:, report_dir:, env_vars: nil, test_cases: nil)
|
|
33
44
|
started_at = Time.now
|
|
45
|
+
FileUtils.mkdir_p(report_dir)
|
|
46
|
+
write_command_record(report_dir, "runner", provider: @provider, cli_args: cli_args)
|
|
47
|
+
write_tc_manifests(report_dir, scenario, test_cases: test_cases)
|
|
34
48
|
|
|
35
|
-
build_env =
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
49
|
+
build_env = if prepared_sandbox?(sandbox_path, env_vars)
|
|
50
|
+
@sandbox_builder.prepare_existing_sandbox(
|
|
51
|
+
scenario: scenario,
|
|
52
|
+
sandbox_path: sandbox_path,
|
|
53
|
+
test_cases: test_cases
|
|
54
|
+
)
|
|
55
|
+
else
|
|
56
|
+
@sandbox_builder.build(
|
|
57
|
+
scenario: scenario,
|
|
58
|
+
sandbox_path: sandbox_path,
|
|
59
|
+
test_cases: test_cases
|
|
60
|
+
)
|
|
61
|
+
end
|
|
62
|
+
merged_env = sanitize_subprocess_env((env_vars || {}).merge(build_env))
|
|
63
|
+
sandbox_backend = @sandbox_backend_factory.call(
|
|
64
|
+
sandbox_path,
|
|
65
|
+
source_root: merged_env["ACE_E2E_SOURCE_ROOT"] || merged_env[:ACE_E2E_SOURCE_ROOT]
|
|
39
66
|
)
|
|
40
|
-
merged_env =
|
|
67
|
+
merged_env = sandbox_backend.prepared_env(merged_env)
|
|
41
68
|
|
|
42
69
|
runner = @prompt_bundler.prepare_runner(
|
|
43
70
|
scenario: scenario,
|
|
44
71
|
sandbox_path: sandbox_path,
|
|
45
|
-
test_cases: test_cases
|
|
72
|
+
test_cases: test_cases,
|
|
73
|
+
artifact_contract: declared_artifact_contract(scenario, test_cases: test_cases)
|
|
46
74
|
)
|
|
47
|
-
run_llm(
|
|
75
|
+
runner_response = run_llm(
|
|
48
76
|
prompt_path: runner[:prompt_path],
|
|
49
77
|
system_path: runner[:system_path],
|
|
50
78
|
output_path: runner[:output_path],
|
|
51
79
|
cli_args: cli_args,
|
|
52
|
-
env_vars: merged_env
|
|
80
|
+
env_vars: merged_env,
|
|
81
|
+
subprocess_command_prefix: sandbox_backend.command_prefix(chdir: sandbox_path, env: merged_env),
|
|
82
|
+
provider: @provider,
|
|
83
|
+
fallback: false
|
|
84
|
+
)
|
|
85
|
+
runner_observations = extract_runner_observations(runner_response[:text])
|
|
86
|
+
initial_artifact_contract = snapshot_artifacts(
|
|
87
|
+
report_dir,
|
|
88
|
+
sandbox_path,
|
|
89
|
+
scenario,
|
|
90
|
+
test_cases: test_cases,
|
|
91
|
+
snapshot_name: "artifact-snapshot.initial.json"
|
|
53
92
|
)
|
|
93
|
+
artifact_contract = initial_artifact_contract
|
|
94
|
+
|
|
95
|
+
if missing_required_artifacts?(artifact_contract)
|
|
96
|
+
write_command_record(report_dir, "runner-repair", provider: @provider, cli_args: cli_args)
|
|
97
|
+
repair_runner = @prompt_bundler.prepare_runner(
|
|
98
|
+
scenario: scenario,
|
|
99
|
+
sandbox_path: sandbox_path,
|
|
100
|
+
test_cases: test_cases,
|
|
101
|
+
artifact_contract: artifact_contract,
|
|
102
|
+
repair_mode: true
|
|
103
|
+
)
|
|
104
|
+
repair_response = run_llm(
|
|
105
|
+
prompt_path: repair_runner[:prompt_path],
|
|
106
|
+
system_path: repair_runner[:system_path],
|
|
107
|
+
output_path: repair_runner[:output_path],
|
|
108
|
+
cli_args: cli_args,
|
|
109
|
+
env_vars: merged_env,
|
|
110
|
+
subprocess_command_prefix: sandbox_backend.command_prefix(chdir: sandbox_path, env: merged_env),
|
|
111
|
+
provider: @provider,
|
|
112
|
+
fallback: false
|
|
113
|
+
)
|
|
114
|
+
repair_observations = extract_runner_observations(repair_response[:text])
|
|
115
|
+
runner_observations = merge_runner_observations(runner_observations, repair_observations)
|
|
116
|
+
artifact_contract = snapshot_artifacts(report_dir, sandbox_path, scenario, test_cases: test_cases)
|
|
117
|
+
else
|
|
118
|
+
write_artifact_snapshot(report_dir, "artifact-snapshot.json", artifact_contract)
|
|
119
|
+
end
|
|
54
120
|
|
|
55
121
|
verifier = @prompt_bundler.prepare_verifier(
|
|
56
122
|
scenario: scenario,
|
|
57
123
|
sandbox_path: sandbox_path,
|
|
58
|
-
test_cases: test_cases
|
|
124
|
+
test_cases: test_cases,
|
|
125
|
+
runner_observations: runner_observations,
|
|
126
|
+
artifact_contract: artifact_contract
|
|
59
127
|
)
|
|
128
|
+
write_command_record(report_dir, "verifier", provider: @verifier_provider, cli_args: cli_args)
|
|
60
129
|
verifier_response = run_llm(
|
|
61
130
|
prompt_path: verifier[:prompt_path],
|
|
62
131
|
system_path: verifier[:system_path],
|
|
63
132
|
output_path: verifier[:output_path],
|
|
64
133
|
cli_args: cli_args,
|
|
65
|
-
env_vars: merged_env
|
|
134
|
+
env_vars: merged_env,
|
|
135
|
+
subprocess_command_prefix: sandbox_backend.command_prefix(chdir: sandbox_path, env: merged_env),
|
|
136
|
+
provider: @verifier_provider,
|
|
137
|
+
fallback: query_fallback_for(@verifier_provider)
|
|
66
138
|
)
|
|
67
139
|
|
|
68
140
|
@report_generator.generate(
|
|
69
141
|
scenario: scenario,
|
|
70
142
|
verifier_output: verifier_response[:text],
|
|
71
143
|
report_dir: report_dir,
|
|
72
|
-
provider: @
|
|
144
|
+
provider: @verifier_provider,
|
|
73
145
|
started_at: started_at,
|
|
74
|
-
completed_at: Time.now
|
|
146
|
+
completed_at: Time.now,
|
|
147
|
+
metadata: base_metadata(
|
|
148
|
+
report_dir,
|
|
149
|
+
runner_observations: runner_observations,
|
|
150
|
+
artifact_contract: artifact_contract,
|
|
151
|
+
initial_artifact_contract: initial_artifact_contract
|
|
152
|
+
)
|
|
75
153
|
)
|
|
76
154
|
rescue => e
|
|
77
155
|
begin
|
|
78
156
|
@report_generator.write_failure_report(
|
|
79
157
|
scenario: scenario,
|
|
80
158
|
report_dir: report_dir,
|
|
81
|
-
provider: @
|
|
159
|
+
provider: @verifier_provider,
|
|
82
160
|
started_at: started_at || Time.now,
|
|
83
161
|
completed_at: Time.now,
|
|
84
|
-
error_message: "#{e.class}: #{e.message}"
|
|
162
|
+
error_message: "#{e.class}: #{e.message}",
|
|
163
|
+
failure_category: "runner-error",
|
|
164
|
+
metadata: base_metadata(report_dir)
|
|
85
165
|
)
|
|
86
166
|
rescue => write_error
|
|
87
167
|
Models::TestResult.new(
|
|
@@ -97,23 +177,160 @@ module Ace
|
|
|
97
177
|
|
|
98
178
|
private
|
|
99
179
|
|
|
100
|
-
def run_llm(prompt_path:, system_path:, output_path:, cli_args:, env_vars:)
|
|
180
|
+
def run_llm(prompt_path:, system_path:, output_path:, cli_args:, env_vars:, subprocess_command_prefix:, provider:, fallback:)
|
|
101
181
|
prompt = File.read(prompt_path)
|
|
102
182
|
system = File.read(system_path)
|
|
103
183
|
sandbox_dir = env_vars["PROJECT_ROOT_PATH"] || env_vars[:PROJECT_ROOT_PATH]
|
|
104
184
|
|
|
105
185
|
Ace::LLM::QueryInterface.query(
|
|
106
|
-
|
|
186
|
+
provider,
|
|
107
187
|
prompt,
|
|
108
188
|
system: system,
|
|
109
189
|
cli_args: cli_args,
|
|
110
190
|
timeout: @timeout,
|
|
111
|
-
fallback:
|
|
191
|
+
fallback: fallback,
|
|
112
192
|
output: output_path,
|
|
113
193
|
subprocess_env: env_vars,
|
|
194
|
+
subprocess_command_prefix: subprocess_command_prefix,
|
|
114
195
|
working_dir: sandbox_dir
|
|
115
196
|
)
|
|
116
197
|
end
|
|
198
|
+
|
|
199
|
+
# Computes the value handed to run_llm as its `fallback:` option.
#
# @param provider [#to_s] provider identifier
# @return [Boolean] true when the identifier's string form begins with "role:"
def query_fallback_for(provider)
  identifier = provider.to_s
  identifier.start_with?("role:")
end
|
|
202
|
+
|
|
203
|
+
# Writes one "<short_id>.manifest.json" file into report_dir for every
# selected test case.
#
# @param report_dir [String] directory receiving the manifest files
# @param scenario [Models::TestScenario]
# @param test_cases [Array<String>, nil] optional filter forwarded to select_test_cases
# @return [Array] the selected test cases
def write_tc_manifests(report_dir, scenario, test_cases:)
  select_test_cases(scenario, test_cases).each do |tc|
    fields = {
      tc_id: tc.tc_id,
      title: tc.title,
      declared_artifacts: Array(tc.declared_artifacts),
      optional_artifacts: Array(tc.optional_artifacts),
      goal_format: tc.goal_format
    }
    manifest_path = File.join(report_dir, "#{tc.short_id}.manifest.json")
    File.write(manifest_path, JSON.pretty_generate(fields))
  end
end
|
|
219
|
+
|
|
220
|
+
# Persists a JSON record describing one LLM invocation phase.
#
# The file is named "<phase>.command.json" and stores the phase, provider,
# CLI args, the executor's timeout and a UTC ISO 8601 timestamp.
#
# @param report_dir [String] directory receiving the record
# @param phase [String] phase label, also used as the file name prefix
# @param provider [String]
# @param cli_args [Object] stored as given
def write_command_record(report_dir, phase, provider:, cli_args:)
  payload = {
    phase: phase,
    provider: provider,
    cli_args: cli_args,
    timeout: @timeout,
    recorded_at: Time.now.utc.iso8601
  }
  destination = File.join(report_dir, "#{phase}.command.json")
  File.write(destination, JSON.pretty_generate(payload))
end
|
|
233
|
+
|
|
234
|
+
# Builds the artifact contract for the sandbox, writes it into report_dir
# under snapshot_name, and returns it.
#
# @param report_dir [String]
# @param sandbox_path [String]
# @param scenario [Models::TestScenario]
# @param test_cases [Array<String>, nil]
# @param snapshot_name [String] file name of the written snapshot
# @return [Hash] the contract that was written
def snapshot_artifacts(report_dir, sandbox_path, scenario, test_cases:, snapshot_name: "artifact-snapshot.json")
  declared_artifact_contract(scenario, test_cases: test_cases, sandbox_path: sandbox_path).tap do |contract|
    write_artifact_snapshot(report_dir, snapshot_name, contract)
  end
end
|
|
239
|
+
|
|
240
|
+
# Builds a per-test-case summary of declared artifacts and which of them
# currently exist in the sandbox.
#
# Without a sandbox_path nothing is reported as present, so every required
# artifact is listed as missing.
#
# @param scenario [Models::TestScenario]
# @param test_cases [Array<String>, nil]
# @param sandbox_path [String, nil]
# @return [Hash{Object => Hash{String => Array}}] keyed by tc_id
def declared_artifact_contract(scenario, test_cases:, sandbox_path: nil)
  select_test_cases(scenario, test_cases).each_with_object({}) do |tc, contract|
    required = Array(tc.declared_artifacts).sort
    optional = Array(tc.optional_artifacts).sort
    found_required = present_artifacts(required, sandbox_path)
    found_optional = present_artifacts(optional, sandbox_path)

    contract[tc.tc_id] = {
      "present_artifacts" => (found_required + found_optional).sort,
      "required_artifacts" => required,
      "present_required_artifacts" => found_required,
      "missing_required_artifacts" => required - found_required,
      "optional_artifacts" => optional,
      "present_optional_artifacts" => found_optional
    }
  end
end
|
|
258
|
+
|
|
259
|
+
# Serializes a snapshot as pretty-printed JSON into report_dir.
#
# @param report_dir [String]
# @param snapshot_name [String] file name inside report_dir
# @param snapshot [Hash] data to serialize
def write_artifact_snapshot(report_dir, snapshot_name, snapshot)
  destination = File.join(report_dir, snapshot_name)
  File.write(destination, JSON.pretty_generate(snapshot))
end
|
|
262
|
+
|
|
263
|
+
# Filters a list of sandbox-relative paths down to those that exist on disk.
#
# @param paths [Array<String>, String, nil] paths relative to sandbox_path
# @param sandbox_path [String, nil] sandbox root; when absent nothing is present
# @return [Array<String>] the subset of paths found under sandbox_path
def present_artifacts(paths, sandbox_path)
  if sandbox_path
    Array(paths).select do |relative|
      File.exist?(File.join(sandbox_path, relative))
    end
  else
    []
  end
end
|
|
268
|
+
|
|
269
|
+
# Reports whether any entry of the contract lists a missing required artifact.
#
# @param artifact_contract [Hash] tc_id => entry with a "missing_required_artifacts" list
# @return [Boolean]
def missing_required_artifacts?(artifact_contract)
  artifact_contract.each do |_tc_id, entry|
    return true if Array(entry["missing_required_artifacts"]).any?
  end
  false
end
|
|
274
|
+
|
|
275
|
+
# Combines observations from the initial runner pass and the repair pass.
#
# Both inputs are stringified and stripped. If only one is non-empty it is
# returned alone; otherwise the repair text follows the initial text under a
# "Repair pass:" heading.
#
# @param initial_observations [#to_s]
# @param repair_observations [#to_s]
# @return [String]
def merge_runner_observations(initial_observations, repair_observations)
  first_pass = initial_observations.to_s.strip
  second_pass = repair_observations.to_s.strip

  if second_pass.empty?
    first_pass
  elsif first_pass.empty?
    second_pass
  else
    "#{first_pass}\n\nRepair pass:\n#{second_pass}"
  end
end
|
|
283
|
+
|
|
284
|
+
# Picks the scenario's test cases that match the requested ids.
#
# A nil or empty filter selects every test case. Ids are compared
# case-insensitively after string conversion.
#
# @param scenario [Models::TestScenario]
# @param test_cases [Array<String>, nil] requested tc ids
# @return [Array] matching test cases in scenario order
def select_test_cases(scenario, test_cases)
  available = Array(scenario.test_cases)
  return available if test_cases.nil? || test_cases.empty?

  requested_ids = test_cases.map { |id| id.to_s.upcase }
  available.select { |candidate| requested_ids.include?(candidate.tc_id.to_s.upcase) }
end
|
|
290
|
+
|
|
291
|
+
# Assembles the metadata hash attached to generated reports.
#
# Always contains the runner provider, verifier provider and report_dir.
# Runner observations are added when non-empty. For each contract given,
# the per-test-case lists of missing required artifacts are added, with test
# cases that miss nothing left out. "artifact_repair_attempted" is set when
# the initial contract had missing required artifacts.
#
# @param report_dir [String]
# @param runner_observations [String, nil]
# @param artifact_contract [Hash, nil]
# @param initial_artifact_contract [Hash, nil]
# @return [Hash{String => Object}]
def base_metadata(report_dir, runner_observations: nil, artifact_contract: nil, initial_artifact_contract: nil)
  # Shared projection: tc_id => non-empty list of missing required artifacts.
  missing_by_tc = lambda do |contract|
    contract.to_h
      .transform_values { |entry| Array(entry["missing_required_artifacts"]) }
      .reject { |_tc_id, paths| paths.empty? }
  end

  metadata = {
    "runner_provider" => @provider,
    "verifier_provider" => @verifier_provider,
    "report_dir" => report_dir
  }

  has_observations = runner_observations && !runner_observations.empty?
  metadata["runner_observations"] = runner_observations if has_observations
  metadata["missing_required_artifacts"] = missing_by_tc.call(artifact_contract) if artifact_contract

  if initial_artifact_contract
    metadata["initial_missing_required_artifacts"] = missing_by_tc.call(initial_artifact_contract)
    metadata["artifact_repair_attempted"] = true if missing_required_artifacts?(initial_artifact_contract)
  end

  metadata
end
|
|
313
|
+
|
|
314
|
+
# Returns a copy of env_vars with the ambient tmux variables neutralized.
#
# Any key whose string form is listed in AMBIENT_TMUX_ENV_VARS is dropped,
# then each listed name is re-added as a String key mapped to nil.
# NOTE(review): the nil values presumably tell the subprocess launcher to
# unset these variables (as Process.spawn does) — confirm against
# Ace::LLM::QueryInterface.
#
# @param env_vars [Hash]
# @return [Hash] new hash; the argument is not modified
def sanitize_subprocess_env(env_vars)
  without_tmux = env_vars.reject { |name, _value| AMBIENT_TMUX_ENV_VARS.include?(name.to_s) }
  without_tmux.merge(AMBIENT_TMUX_ENV_VARS.to_h { |name| [name, nil] })
end
|
|
319
|
+
|
|
320
|
+
# Tells whether the caller-supplied environment already points at this sandbox.
#
# True only when env_vars is a non-empty Hash whose PROJECT_ROOT_PATH
# (String or Symbol key) is non-blank and expands to the same absolute path
# as sandbox_path.
#
# @param sandbox_path [String]
# @param env_vars [Hash, nil]
# @return [Boolean]
def prepared_sandbox?(sandbox_path, env_vars)
  return false if !env_vars.is_a?(Hash) || env_vars.empty?

  declared_root = env_vars["PROJECT_ROOT_PATH"] || env_vars[:PROJECT_ROOT_PATH]
  if declared_root.to_s.strip.empty?
    false
  else
    File.expand_path(sandbox_path) == File.expand_path(declared_root)
  end
end
|
|
328
|
+
|
|
329
|
+
# Pulls the :observations value out of the runner's response text.
#
# @param text [String] raw runner output handed to Atoms::SkillResultParser
# @return [String] the observations as a string, or "" when parsing raises
#   Atoms::ResultParser::ParseError
def extract_runner_observations(text)
  parsed = Atoms::SkillResultParser.parse(text)
  parsed[:observations].to_s
rescue Atoms::ResultParser::ParseError
  ""
end
|
|
117
334
|
end
|
|
118
335
|
end
|
|
119
336
|
end
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
require "date"
|
|
4
4
|
require "fileutils"
|
|
5
|
+
require "time"
|
|
5
6
|
require "yaml"
|
|
6
7
|
|
|
7
8
|
module Ace
|
|
@@ -15,21 +16,35 @@ module Ace
|
|
|
15
16
|
|
|
16
17
|
Rules:
|
|
17
18
|
- Execute each goal in order
|
|
18
|
-
-
|
|
19
|
-
-
|
|
19
|
+
- Treat the initial working directory as SANDBOX_ROOT; if a goal needs commands in a created worktree, cd there for execution but keep any declared outcome artifacts under SANDBOX_ROOT/results
|
|
20
|
+
- Preserve the sandbox runtime environment; do not reset PATH, HOME, or other provided env vars
|
|
21
|
+
- If `ACE_E2E_SANDBOX_RUNTIME_ROOT` is set, make sure command execution uses `$ACE_E2E_SANDBOX_RUNTIME_ROOT/bin` on PATH in the shell where you run scenario commands
|
|
22
|
+
- Run `ace-*` commands directly; do not wrap them with `timeout`, `env -i`, or other execution wrappers that can change behavior or hide diagnostics
|
|
23
|
+
- Do not bypass the public CLI with repo-local executables such as `./exe/ace-*`, `bin/ace-*`, or `ruby .../exe/ace-*`
|
|
20
24
|
- Do not fabricate output - all artifacts must come from real tool execution
|
|
25
|
+
- Never background commands or start dependent verification captures before the command they verify has completed
|
|
26
|
+
- When a goal requires command captures, keep stdout and stderr separate; do not merge streams and do not use `2>&1`
|
|
27
|
+
- A command capture set is incomplete unless the matching `.stdout`, `.stderr`, and `.exit` files all exist
|
|
28
|
+
- Persist each command's `.stdout`, `.stderr`, and `.exit` files immediately after that command finishes, before starting the next command
|
|
29
|
+
- For commands that establish state, write that command's `.exit` file before running any list/status/fs-check/tmux verification for the same goal
|
|
30
|
+
- When a successful command prints a filesystem path to a generated artifact, copy that artifact into `results/` if the goal asks for supporting evidence from the generated file
|
|
21
31
|
- If a goal fails, note the failure and continue to the next goal
|
|
22
|
-
-
|
|
32
|
+
- Do not create synthetic helper reports or temp input files under results/ unless the scenario explicitly treats them as product outcomes
|
|
33
|
+
- After all goals, return concise runner observations describing what you did and what happened
|
|
23
34
|
PROMPT
|
|
24
35
|
|
|
25
36
|
VERIFIER_SYSTEM_PROMPT = <<~PROMPT
|
|
26
37
|
You are an E2E test verifier. You inspect artifacts and render PASS/FAIL verdicts.
|
|
27
38
|
|
|
28
39
|
Rules:
|
|
29
|
-
- Evaluate each goal independently based
|
|
30
|
-
-
|
|
40
|
+
- Evaluate each goal independently based on sandbox state first, then runner observations, then raw debug captures only when needed
|
|
41
|
+
- Treat declared artifacts and helper filenames as hints, not as the source of truth
|
|
42
|
+
- If a helper file is missing or stale, inspect the sandbox directly before failing the goal
|
|
43
|
+
- Use artifact mtimes to detect runner ordering mistakes; if postcondition captures are older than the primary command's stdout/stderr/exit, classify the goal as `runner-error` unless direct sandbox state proves a product failure after the command completed
|
|
44
|
+
- Use read-only commands in the sandbox when they materially improve confidence (for example: git log/status/show, ls/find/cat)
|
|
45
|
+
- Do not speculate beyond the provided sandbox evidence and runner observations
|
|
31
46
|
- For each failed goal, include a category:
|
|
32
|
-
test-spec-error | tool-bug | runner-error | infrastructure-error
|
|
47
|
+
test-spec-error | tool-bug | runner-error | infrastructure-error | missing-artifact
|
|
33
48
|
- For each goal, cite specific evidence (filenames, content snippets)
|
|
34
49
|
- Follow the output format exactly
|
|
35
50
|
PROMPT
|
|
@@ -38,21 +53,23 @@ module Ace
|
|
|
38
53
|
# @param sandbox_path [String]
|
|
39
54
|
# @param test_cases [Array<String>, nil]
|
|
40
55
|
# @return [Hash]
|
|
41
|
-
def prepare_runner(scenario:, sandbox_path:, test_cases: nil)
|
|
56
|
+
def prepare_runner(scenario:, sandbox_path:, test_cases: nil, artifact_contract: nil, repair_mode: false)
|
|
42
57
|
cache_dir = ensure_cache_dir(sandbox_path)
|
|
43
|
-
|
|
44
|
-
|
|
58
|
+
file_prefix = repair_mode ? "runner-repair" : "runner"
|
|
59
|
+
system_path = File.join(cache_dir, "#{file_prefix}-system.md")
|
|
60
|
+
prompt_path = File.join(cache_dir, "#{file_prefix}-prompt.md")
|
|
45
61
|
|
|
46
62
|
File.write(system_path, RUNNER_SYSTEM_PROMPT)
|
|
47
63
|
|
|
48
64
|
bundled = bundle_markdown_file(File.join(scenario.dir_path, "runner.yml.md"), test_cases: test_cases)
|
|
49
65
|
bundled = bundled.gsub("Workspace root: (current directory)", "Workspace root: #{File.expand_path(sandbox_path)}")
|
|
50
|
-
|
|
66
|
+
contract = build_runner_artifact_contract_section(artifact_contract, repair_mode: repair_mode)
|
|
67
|
+
File.write(prompt_path, [bundled, contract].reject(&:empty?).join("\n\n---\n\n"))
|
|
51
68
|
|
|
52
69
|
{
|
|
53
70
|
system_path: system_path,
|
|
54
71
|
prompt_path: prompt_path,
|
|
55
|
-
output_path: File.join(cache_dir, "
|
|
72
|
+
output_path: File.join(cache_dir, "#{file_prefix}-output.md")
|
|
56
73
|
}
|
|
57
74
|
end
|
|
58
75
|
|
|
@@ -60,16 +77,20 @@ module Ace
|
|
|
60
77
|
# @param sandbox_path [String]
|
|
61
78
|
# @param test_cases [Array<String>, nil]
|
|
62
79
|
# @return [Hash]
|
|
63
|
-
def prepare_verifier(scenario:, sandbox_path:, test_cases: nil)
|
|
80
|
+
def prepare_verifier(scenario:, sandbox_path:, test_cases: nil, runner_observations: nil, artifact_contract: nil)
|
|
64
81
|
cache_dir = ensure_cache_dir(sandbox_path)
|
|
65
82
|
system_path = File.join(cache_dir, "verifier-system.md")
|
|
66
83
|
prompt_path = File.join(cache_dir, "verifier-prompt.md")
|
|
67
84
|
|
|
68
85
|
File.write(system_path, VERIFIER_SYSTEM_PROMPT)
|
|
69
86
|
|
|
87
|
+
project_context = build_project_context_section(scenario)
|
|
88
|
+
sandbox_context = build_sandbox_context_section(sandbox_path)
|
|
70
89
|
artifacts = build_artifact_section(sandbox_path)
|
|
90
|
+
contract = build_artifact_contract_section(artifact_contract)
|
|
91
|
+
observations = build_runner_observation_section(runner_observations)
|
|
71
92
|
criteria = bundle_markdown_file(File.join(scenario.dir_path, "verifier.yml.md"), test_cases: test_cases)
|
|
72
|
-
File.write(prompt_path, [artifacts, criteria].join("\n\n---\n\n"))
|
|
93
|
+
File.write(prompt_path, [project_context, sandbox_context, artifacts, contract, observations, criteria].join("\n\n---\n\n"))
|
|
73
94
|
|
|
74
95
|
{
|
|
75
96
|
system_path: system_path,
|
|
@@ -154,6 +175,13 @@ module Ace
|
|
|
154
175
|
parts.concat(tree_entries)
|
|
155
176
|
parts << "```"
|
|
156
177
|
parts << ""
|
|
178
|
+
parts << "## File metadata"
|
|
179
|
+
parts << "```"
|
|
180
|
+
files.each do |file|
|
|
181
|
+
parts << "#{relative_path(file, sandbox_path)}\tmtime=#{File.mtime(file).utc.iso8601}"
|
|
182
|
+
end
|
|
183
|
+
parts << "```"
|
|
184
|
+
parts << ""
|
|
157
185
|
parts << "## File contents"
|
|
158
186
|
parts << ""
|
|
159
187
|
|
|
@@ -168,6 +196,129 @@ module Ace
|
|
|
168
196
|
parts.join("\n").rstrip
|
|
169
197
|
end
|
|
170
198
|
|
|
199
|
+
# Renders the "# Project Context" markdown section of the verifier prompt.
#
# The package root is taken three directory levels above the scenario
# directory and the source root one level above that. Up to three existing
# files out of README.md, docs/usage.md, docs/getting-started.md (package
# root) and CLAUDE.md (source root) are embedded in fenced blocks.
#
# @param scenario [Models::TestScenario]
# @return [String] markdown without trailing whitespace
def build_project_context_section(scenario)
  package_root = File.expand_path("../../..", scenario.dir_path)
  source_root = File.expand_path("..", package_root)

  candidates = [
    File.join(package_root, "README.md"),
    File.join(package_root, "docs", "usage.md"),
    File.join(package_root, "docs", "getting-started.md"),
    File.join(source_root, "CLAUDE.md")
  ]
  context_files = candidates.select { |candidate| File.file?(candidate) }.first(3)

  lines = [
    "# Project Context",
    "",
    "- Package: `#{scenario.package}`",
    "- Test ID: `#{scenario.test_id}`",
    "- Sandbox profile: `#{scenario.sandbox_profile}`",
    ""
  ]

  context_files.each do |file|
    lines.push("## `#{File.basename(file)}`", "```", safe_read(file), "```", "")
  end

  lines.join("\n").rstrip
end
|
|
227
|
+
|
|
228
|
+
# Renders the "# Sandbox Context" markdown section of the verifier prompt:
# the absolute sandbox root and a sorted listing of its top-level entries,
# dotfiles included.
#
# @param sandbox_path [String]
# @return [String] markdown without trailing whitespace
def build_sandbox_context_section(sandbox_path)
  root = File.expand_path(sandbox_path)
  top_level = Dir.glob(File.join(root, "*"), File::FNM_DOTMATCH)
  # FNM_DOTMATCH can also yield the directory's own "." / ".." entries.
  top_level = top_level.reject { |entry| %w[. ..].include?(File.basename(entry)) }.sort

  lines = [
    "# Sandbox Context",
    "",
    "- Sandbox root: `#{root}`",
    "- Inspect the sandbox directly when verifying source-of-truth state.",
    "",
    "## Top-level entries",
    "```"
  ]
  top_level.each { |entry| lines << relative_path(entry, root) }
  lines << "```"

  lines.join("\n").rstrip
end
|
|
247
|
+
|
|
248
|
+
# Renders the "# Runner Observations" markdown section of the verifier prompt.
#
# @param runner_observations [#to_s] observations text; blank values render
#   as "(none provided)"
# @return [String]
def build_runner_observation_section(runner_observations)
  observed = runner_observations.to_s.strip
  body = observed.empty? ? "(none provided)" : observed
  "# Runner Observations\n\n#{body}\n".rstrip
end
|
|
255
|
+
|
|
256
|
+
# Renders the "# Artifact Contract" section appended to the runner prompt.
#
# In repair mode the section carries the bounded-repair rules; otherwise it
# carries the self-check rules. It then lists, per test case in sorted order,
# the required artifacts plus any missing required and optional ones.
#
# @param artifact_contract [Hash, nil] tc_id => contract entry
# @param repair_mode [Boolean]
# @return [String] markdown, or "" when no contract is given
def build_runner_artifact_contract_section(artifact_contract, repair_mode:)
  return "" if artifact_contract.nil? || artifact_contract.empty?

  guidance =
    if repair_mode
      [
        "This is a bounded repair pass.",
        "- Do not rerun goals whose required artifacts are already complete.",
        "- For each goal with missing required artifacts, produce only the missing files.",
        "- Prefer the minimal real public command needed to create the missing capture set.",
        "- If the missing file is supporting evidence copied from an already-generated real artifact, copy that real artifact into `results/`.",
        "- Do not invent content, fabricate captures, or rewrite unrelated artifacts."
      ]
    else
      [
        "A goal is not complete unless every required artifact for that goal exists on disk under `results/`.",
        "- After finishing each goal, self-check the required artifact list below.",
        "- If a required artifact is missing, fix it before moving on."
      ]
    end

  lines = ["# Artifact Contract", "", *guidance, ""]

  artifact_contract.sort.each do |tc_id, entry|
    missing = Array(entry["missing_required_artifacts"])
    optional = Array(entry["optional_artifacts"])

    lines << "## #{tc_id}" << ""
    lines << "- Required artifacts: #{format_artifact_list(entry["required_artifacts"])}"
    lines << "- Missing required artifacts: #{format_artifact_list(missing)}" unless missing.empty?
    lines << "- Optional artifacts: #{format_artifact_list(optional)}" unless optional.empty?
    lines << ""
  end

  lines.join("\n").rstrip
end
|
|
291
|
+
|
|
292
|
+
# Renders the "# Artifact Contract" section of the verifier prompt, listing
# required, present, missing and optional artifacts per test case in sorted
# order.
#
# @param artifact_contract [Hash, nil] tc_id => contract entry
# @return [String] markdown; a "(no snapshot provided)" stub when no contract is given
def build_artifact_contract_section(artifact_contract)
  if artifact_contract.nil? || artifact_contract.empty?
    return "# Artifact Contract\n\n(no snapshot provided)"
  end

  lines = [
    "# Artifact Contract",
    "",
    "Use this only as supporting context. Missing helper artifacts may be acceptable when sandbox state still proves the goal.",
    ""
  ]

  artifact_contract.sort.each do |tc_id, entry|
    lines.push(
      "## #{tc_id}",
      "",
      "- Required artifacts: #{format_artifact_list(entry["required_artifacts"])}",
      "- Present required artifacts: #{format_artifact_list(entry["present_required_artifacts"])}",
      "- Missing required artifacts: #{format_artifact_list(entry["missing_required_artifacts"])}",
      "- Optional artifacts: #{format_artifact_list(entry["optional_artifacts"])}",
      "- Present optional artifacts: #{format_artifact_list(entry["present_optional_artifacts"])}",
      ""
    )
  end

  lines.join("\n").rstrip
end
|
|
314
|
+
|
|
315
|
+
# Formats artifact paths as a comma-separated list of backtick-quoted names.
#
# @param paths [Array<String>, String, nil]
# @return [String] "(none)" when there are no paths
def format_artifact_list(paths)
  quoted = Array(paths).map { |artifact| "`#{artifact}`" }
  quoted.empty? ? "(none)" : quoted.join(", ")
end
|
|
321
|
+
|
|
171
322
|
# Expresses path relative to root.
#
# Both arguments are expanded to absolute paths first. The "<root>/" part is
# removed only when it is the leading prefix, so a path that lies outside
# root but happens to contain root's text further in is returned unchanged
# rather than having that inner segment cut out.
#
# @param path [String]
# @param root [String]
# @return [String] the path below root, or the expanded path when it is not under root
def relative_path(path, root)
  File.expand_path(path).delete_prefix("#{File.expand_path(root)}/")
end
|