ace-test-runner-e2e 0.29.8 → 0.40.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. checksums.yaml +4 -4
  2. data/.ace-defaults/e2e-runner/config.yml +14 -2
  3. data/CHANGELOG.md +233 -0
  4. data/README.md +2 -2
  5. data/exe/ace-test-e2e-sh +9 -4
  6. data/handbook/guides/e2e-testing.g.md +75 -9
  7. data/handbook/guides/scenario-yml-reference.g.md +21 -8
  8. data/handbook/guides/tc-authoring.g.md +23 -5
  9. data/handbook/skills/as-e2e-fix/SKILL.md +2 -2
  10. data/handbook/skills/as-e2e-review/SKILL.md +2 -2
  11. data/handbook/templates/ace-taskflow-fixture.template.md +17 -17
  12. data/handbook/templates/agent-experience-report.template.md +3 -2
  13. data/handbook/templates/scenario.yml.template.yml +7 -2
  14. data/handbook/templates/tc-file.template.md +16 -4
  15. data/handbook/workflow-instructions/e2e/analyze-failures.wf.md +53 -6
  16. data/handbook/workflow-instructions/e2e/create.wf.md +128 -25
  17. data/handbook/workflow-instructions/e2e/execute.wf.md +11 -7
  18. data/handbook/workflow-instructions/e2e/fix.wf.md +84 -15
  19. data/handbook/workflow-instructions/e2e/plan-changes.wf.md +33 -1
  20. data/handbook/workflow-instructions/e2e/review.wf.md +40 -25
  21. data/handbook/workflow-instructions/e2e/rewrite.wf.md +22 -8
  22. data/handbook/workflow-instructions/e2e/run.wf.md +50 -26
  23. data/handbook/workflow-instructions/e2e/setup-sandbox.wf.md +4 -4
  24. data/lib/ace/test/end_to_end_runner/atoms/artifact_contract_validator.rb +138 -0
  25. data/lib/ace/test/end_to_end_runner/atoms/skill_prompt_builder.rb +7 -5
  26. data/lib/ace/test/end_to_end_runner/atoms/skill_result_parser.rb +73 -7
  27. data/lib/ace/test/end_to_end_runner/cli/commands/run_suite.rb +195 -5
  28. data/lib/ace/test/end_to_end_runner/cli/commands/run_test.rb +58 -9
  29. data/lib/ace/test/end_to_end_runner/models/test_case.rb +8 -2
  30. data/lib/ace/test/end_to_end_runner/models/test_result.rb +9 -3
  31. data/lib/ace/test/end_to_end_runner/models/test_scenario.rb +4 -2
  32. data/lib/ace/test/end_to_end_runner/molecules/affected_detector.rb +7 -2
  33. data/lib/ace/test/end_to_end_runner/molecules/artifact_pruner.rb +61 -0
  34. data/lib/ace/test/end_to_end_runner/molecules/bwrap_sandbox_backend.rb +271 -0
  35. data/lib/ace/test/end_to_end_runner/molecules/config_loader.rb +28 -1
  36. data/lib/ace/test/end_to_end_runner/molecules/integration_runner.rb +122 -0
  37. data/lib/ace/test/end_to_end_runner/molecules/pipeline_executor.rb +235 -18
  38. data/lib/ace/test/end_to_end_runner/molecules/pipeline_prompt_bundler.rb +164 -13
  39. data/lib/ace/test/end_to_end_runner/molecules/pipeline_report_generator.rb +91 -19
  40. data/lib/ace/test/end_to_end_runner/molecules/pipeline_sandbox_builder.rb +121 -18
  41. data/lib/ace/test/end_to_end_runner/molecules/report_writer.rb +15 -12
  42. data/lib/ace/test/end_to_end_runner/molecules/sandbox_runtime_builder.rb +374 -0
  43. data/lib/ace/test/end_to_end_runner/molecules/scenario_loader.rb +83 -5
  44. data/lib/ace/test/end_to_end_runner/molecules/setup_executor.rb +121 -16
  45. data/lib/ace/test/end_to_end_runner/molecules/suite_report_writer.rb +422 -97
  46. data/lib/ace/test/end_to_end_runner/molecules/test_discoverer.rb +38 -13
  47. data/lib/ace/test/end_to_end_runner/molecules/test_executor.rb +27 -5
  48. data/lib/ace/test/end_to_end_runner/organisms/suite_orchestrator.rb +98 -18
  49. data/lib/ace/test/end_to_end_runner/organisms/test_orchestrator.rb +159 -19
  50. data/lib/ace/test/end_to_end_runner/version.rb +1 -1
  51. data/lib/ace/test/end_to_end_runner.rb +4 -0
  52. metadata +21 -2
@@ -1,5 +1,8 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "fileutils"
4
+ require "json"
5
+ require "time"
3
6
  require "ace/llm"
4
7
  require "ace/llm/query_interface"
5
8
 
@@ -9,17 +12,25 @@ module Ace
9
12
  module Molecules
10
13
  # Executes standalone scenarios using the deterministic pipeline.
11
14
  class PipelineExecutor
15
+ AMBIENT_TMUX_ENV_VARS = %w[TMUX TMUX_PANE].freeze
16
+
12
17
  # @param provider [String]
18
+ # @param verifier_provider [String, nil]
13
19
  # @param timeout [Integer]
14
20
  # @param sandbox_builder [Molecules::PipelineSandboxBuilder]
15
21
  # @param prompt_bundler [Molecules::PipelinePromptBundler]
16
22
  # @param report_generator [Molecules::PipelineReportGenerator]
17
- def initialize(provider:, timeout:, sandbox_builder: nil, prompt_bundler: nil, report_generator: nil)
23
+ def initialize(provider:, verifier_provider: nil, timeout:, sandbox_builder: nil, prompt_bundler: nil,
24
+ report_generator: nil, sandbox_backend_factory: nil)
18
25
  @provider = provider
26
+ @verifier_provider = verifier_provider || provider
19
27
  @timeout = timeout
20
28
  @sandbox_builder = sandbox_builder || PipelineSandboxBuilder.new
21
29
  @prompt_bundler = prompt_bundler || PipelinePromptBundler.new
22
30
  @report_generator = report_generator || PipelineReportGenerator.new
31
+ @sandbox_backend_factory = sandbox_backend_factory || lambda { |sandbox_path, source_root: nil|
32
+ Molecules::BwrapSandboxBackend.new(sandbox_root: sandbox_path, source_root: source_root)
33
+ }
23
34
  end
24
35
 
25
36
  # @param scenario [Models::TestScenario]
@@ -31,57 +42,126 @@ module Ace
31
42
  # @return [Models::TestResult]
32
43
  def execute(scenario:, cli_args:, sandbox_path:, report_dir:, env_vars: nil, test_cases: nil)
33
44
  started_at = Time.now
45
+ FileUtils.mkdir_p(report_dir)
46
+ write_command_record(report_dir, "runner", provider: @provider, cli_args: cli_args)
47
+ write_tc_manifests(report_dir, scenario, test_cases: test_cases)
34
48
 
35
- build_env = @sandbox_builder.build(
36
- scenario: scenario,
37
- sandbox_path: sandbox_path,
38
- test_cases: test_cases
49
+ build_env = if prepared_sandbox?(sandbox_path, env_vars)
50
+ @sandbox_builder.prepare_existing_sandbox(
51
+ scenario: scenario,
52
+ sandbox_path: sandbox_path,
53
+ test_cases: test_cases
54
+ )
55
+ else
56
+ @sandbox_builder.build(
57
+ scenario: scenario,
58
+ sandbox_path: sandbox_path,
59
+ test_cases: test_cases
60
+ )
61
+ end
62
+ merged_env = sanitize_subprocess_env((env_vars || {}).merge(build_env))
63
+ sandbox_backend = @sandbox_backend_factory.call(
64
+ sandbox_path,
65
+ source_root: merged_env["ACE_E2E_SOURCE_ROOT"] || merged_env[:ACE_E2E_SOURCE_ROOT]
39
66
  )
40
- merged_env = (env_vars || {}).merge(build_env)
67
+ merged_env = sandbox_backend.prepared_env(merged_env)
41
68
 
42
69
  runner = @prompt_bundler.prepare_runner(
43
70
  scenario: scenario,
44
71
  sandbox_path: sandbox_path,
45
- test_cases: test_cases
72
+ test_cases: test_cases,
73
+ artifact_contract: declared_artifact_contract(scenario, test_cases: test_cases)
46
74
  )
47
- run_llm(
75
+ runner_response = run_llm(
48
76
  prompt_path: runner[:prompt_path],
49
77
  system_path: runner[:system_path],
50
78
  output_path: runner[:output_path],
51
79
  cli_args: cli_args,
52
- env_vars: merged_env
80
+ env_vars: merged_env,
81
+ subprocess_command_prefix: sandbox_backend.command_prefix(chdir: sandbox_path, env: merged_env),
82
+ provider: @provider,
83
+ fallback: false
84
+ )
85
+ runner_observations = extract_runner_observations(runner_response[:text])
86
+ initial_artifact_contract = snapshot_artifacts(
87
+ report_dir,
88
+ sandbox_path,
89
+ scenario,
90
+ test_cases: test_cases,
91
+ snapshot_name: "artifact-snapshot.initial.json"
53
92
  )
93
+ artifact_contract = initial_artifact_contract
94
+
95
+ if missing_required_artifacts?(artifact_contract)
96
+ write_command_record(report_dir, "runner-repair", provider: @provider, cli_args: cli_args)
97
+ repair_runner = @prompt_bundler.prepare_runner(
98
+ scenario: scenario,
99
+ sandbox_path: sandbox_path,
100
+ test_cases: test_cases,
101
+ artifact_contract: artifact_contract,
102
+ repair_mode: true
103
+ )
104
+ repair_response = run_llm(
105
+ prompt_path: repair_runner[:prompt_path],
106
+ system_path: repair_runner[:system_path],
107
+ output_path: repair_runner[:output_path],
108
+ cli_args: cli_args,
109
+ env_vars: merged_env,
110
+ subprocess_command_prefix: sandbox_backend.command_prefix(chdir: sandbox_path, env: merged_env),
111
+ provider: @provider,
112
+ fallback: false
113
+ )
114
+ repair_observations = extract_runner_observations(repair_response[:text])
115
+ runner_observations = merge_runner_observations(runner_observations, repair_observations)
116
+ artifact_contract = snapshot_artifacts(report_dir, sandbox_path, scenario, test_cases: test_cases)
117
+ else
118
+ write_artifact_snapshot(report_dir, "artifact-snapshot.json", artifact_contract)
119
+ end
54
120
 
55
121
  verifier = @prompt_bundler.prepare_verifier(
56
122
  scenario: scenario,
57
123
  sandbox_path: sandbox_path,
58
- test_cases: test_cases
124
+ test_cases: test_cases,
125
+ runner_observations: runner_observations,
126
+ artifact_contract: artifact_contract
59
127
  )
128
+ write_command_record(report_dir, "verifier", provider: @verifier_provider, cli_args: cli_args)
60
129
  verifier_response = run_llm(
61
130
  prompt_path: verifier[:prompt_path],
62
131
  system_path: verifier[:system_path],
63
132
  output_path: verifier[:output_path],
64
133
  cli_args: cli_args,
65
- env_vars: merged_env
134
+ env_vars: merged_env,
135
+ subprocess_command_prefix: sandbox_backend.command_prefix(chdir: sandbox_path, env: merged_env),
136
+ provider: @verifier_provider,
137
+ fallback: query_fallback_for(@verifier_provider)
66
138
  )
67
139
 
68
140
  @report_generator.generate(
69
141
  scenario: scenario,
70
142
  verifier_output: verifier_response[:text],
71
143
  report_dir: report_dir,
72
- provider: @provider,
144
+ provider: @verifier_provider,
73
145
  started_at: started_at,
74
- completed_at: Time.now
146
+ completed_at: Time.now,
147
+ metadata: base_metadata(
148
+ report_dir,
149
+ runner_observations: runner_observations,
150
+ artifact_contract: artifact_contract,
151
+ initial_artifact_contract: initial_artifact_contract
152
+ )
75
153
  )
76
154
  rescue => e
77
155
  begin
78
156
  @report_generator.write_failure_report(
79
157
  scenario: scenario,
80
158
  report_dir: report_dir,
81
- provider: @provider,
159
+ provider: @verifier_provider,
82
160
  started_at: started_at || Time.now,
83
161
  completed_at: Time.now,
84
- error_message: "#{e.class}: #{e.message}"
162
+ error_message: "#{e.class}: #{e.message}",
163
+ failure_category: "runner-error",
164
+ metadata: base_metadata(report_dir)
85
165
  )
86
166
  rescue => write_error
87
167
  Models::TestResult.new(
@@ -97,23 +177,160 @@ module Ace
97
177
 
98
178
  private
99
179
 
100
- def run_llm(prompt_path:, system_path:, output_path:, cli_args:, env_vars:)
180
+ def run_llm(prompt_path:, system_path:, output_path:, cli_args:, env_vars:, subprocess_command_prefix:, provider:, fallback:)
101
181
  prompt = File.read(prompt_path)
102
182
  system = File.read(system_path)
103
183
  sandbox_dir = env_vars["PROJECT_ROOT_PATH"] || env_vars[:PROJECT_ROOT_PATH]
104
184
 
105
185
  Ace::LLM::QueryInterface.query(
106
- @provider,
186
+ provider,
107
187
  prompt,
108
188
  system: system,
109
189
  cli_args: cli_args,
110
190
  timeout: @timeout,
111
- fallback: false,
191
+ fallback: fallback,
112
192
  output: output_path,
113
193
  subprocess_env: env_vars,
194
+ subprocess_command_prefix: subprocess_command_prefix,
114
195
  working_dir: sandbox_dir
115
196
  )
116
197
  end
198
+
199
+ def query_fallback_for(provider)
200
+ provider.to_s.start_with?("role:")
201
+ end
202
+
203
+ def write_tc_manifests(report_dir, scenario, test_cases:)
204
+ selected = select_test_cases(scenario, test_cases)
205
+ selected.each do |test_case|
206
+ manifest = {
207
+ tc_id: test_case.tc_id,
208
+ title: test_case.title,
209
+ declared_artifacts: Array(test_case.declared_artifacts),
210
+ optional_artifacts: Array(test_case.optional_artifacts),
211
+ goal_format: test_case.goal_format
212
+ }
213
+ File.write(
214
+ File.join(report_dir, "#{test_case.short_id}.manifest.json"),
215
+ JSON.pretty_generate(manifest)
216
+ )
217
+ end
218
+ end
219
+
220
+ def write_command_record(report_dir, phase, provider:, cli_args:)
221
+ record = {
222
+ phase: phase,
223
+ provider: provider,
224
+ cli_args: cli_args,
225
+ timeout: @timeout,
226
+ recorded_at: Time.now.utc.iso8601
227
+ }
228
+ File.write(
229
+ File.join(report_dir, "#{phase}.command.json"),
230
+ JSON.pretty_generate(record)
231
+ )
232
+ end
233
+
234
+ def snapshot_artifacts(report_dir, sandbox_path, scenario, test_cases:, snapshot_name: "artifact-snapshot.json")
235
+ snapshot = declared_artifact_contract(scenario, test_cases: test_cases, sandbox_path: sandbox_path)
236
+ write_artifact_snapshot(report_dir, snapshot_name, snapshot)
237
+ snapshot
238
+ end
239
+
240
+ def declared_artifact_contract(scenario, test_cases:, sandbox_path: nil)
241
+ select_test_cases(scenario, test_cases).to_h do |test_case|
242
+ required = Array(test_case.declared_artifacts).sort
243
+ optional = Array(test_case.optional_artifacts).sort
244
+ present_required = present_artifacts(required, sandbox_path)
245
+ present_optional = present_artifacts(optional, sandbox_path)
246
+ missing_required = required - present_required
247
+
248
+ [test_case.tc_id, {
249
+ "present_artifacts" => (present_required + present_optional).sort,
250
+ "required_artifacts" => required,
251
+ "present_required_artifacts" => present_required,
252
+ "missing_required_artifacts" => missing_required,
253
+ "optional_artifacts" => optional,
254
+ "present_optional_artifacts" => present_optional
255
+ }]
256
+ end
257
+ end
258
+
259
+ def write_artifact_snapshot(report_dir, snapshot_name, snapshot)
260
+ File.write(File.join(report_dir, snapshot_name), JSON.pretty_generate(snapshot))
261
+ end
262
+
263
+ def present_artifacts(paths, sandbox_path)
264
+ return [] unless sandbox_path
265
+
266
+ Array(paths).select { |path| File.exist?(File.join(sandbox_path, path)) }
267
+ end
268
+
269
+ def missing_required_artifacts?(artifact_contract)
270
+ artifact_contract.any? do |_tc_id, entry|
271
+ Array(entry["missing_required_artifacts"]).any?
272
+ end
273
+ end
274
+
275
+ def merge_runner_observations(initial_observations, repair_observations)
276
+ initial = initial_observations.to_s.strip
277
+ repair = repair_observations.to_s.strip
278
+ return initial if repair.empty?
279
+ return repair if initial.empty?
280
+
281
+ "#{initial}\n\nRepair pass:\n#{repair}"
282
+ end
283
+
284
+ def select_test_cases(scenario, test_cases)
285
+ return Array(scenario.test_cases) if test_cases.nil? || test_cases.empty?
286
+
287
+ wanted = test_cases.map { |value| value.to_s.upcase }
288
+ Array(scenario.test_cases).select { |tc| wanted.include?(tc.tc_id.to_s.upcase) }
289
+ end
290
+
291
+ def base_metadata(report_dir, runner_observations: nil, artifact_contract: nil, initial_artifact_contract: nil)
292
+ metadata = {
293
+ "runner_provider" => @provider,
294
+ "verifier_provider" => @verifier_provider,
295
+ "report_dir" => report_dir
296
+ }
297
+ if runner_observations && !runner_observations.empty?
298
+ metadata["runner_observations"] = runner_observations
299
+ end
300
+ if artifact_contract
301
+ metadata["missing_required_artifacts"] = artifact_contract.to_h.transform_values do |entry|
302
+ Array(entry["missing_required_artifacts"])
303
+ end.reject { |_tc_id, paths| paths.empty? }
304
+ end
305
+ if initial_artifact_contract
306
+ metadata["initial_missing_required_artifacts"] = initial_artifact_contract.to_h.transform_values do |entry|
307
+ Array(entry["missing_required_artifacts"])
308
+ end.reject { |_tc_id, paths| paths.empty? }
309
+ metadata["artifact_repair_attempted"] = true if missing_required_artifacts?(initial_artifact_contract)
310
+ end
311
+ metadata
312
+ end
313
+
314
+ def sanitize_subprocess_env(env_vars)
315
+ sanitized = env_vars.reject { |key, _value| AMBIENT_TMUX_ENV_VARS.include?(key.to_s) }
316
+ AMBIENT_TMUX_ENV_VARS.each { |key| sanitized[key] = nil }
317
+ sanitized
318
+ end
319
+
320
+ def prepared_sandbox?(sandbox_path, env_vars)
321
+ return false unless env_vars.is_a?(Hash) && !env_vars.empty?
322
+
323
+ env_root = env_vars["PROJECT_ROOT_PATH"] || env_vars[:PROJECT_ROOT_PATH]
324
+ return false if env_root.to_s.strip.empty?
325
+
326
+ File.expand_path(env_root) == File.expand_path(sandbox_path)
327
+ end
328
+
329
+ def extract_runner_observations(text)
330
+ Atoms::SkillResultParser.parse(text)[:observations].to_s
331
+ rescue Atoms::ResultParser::ParseError
332
+ ""
333
+ end
117
334
  end
118
335
  end
119
336
  end
@@ -2,6 +2,7 @@
2
2
 
3
3
  require "date"
4
4
  require "fileutils"
5
+ require "time"
5
6
  require "yaml"
6
7
 
7
8
  module Ace
@@ -15,21 +16,35 @@ module Ace
15
16
 
16
17
  Rules:
17
18
  - Execute each goal in order
18
- - Save all artifacts to results/tc/{NN}/ directories as specified
19
- - Treat the initial working directory as SANDBOX_ROOT; if a goal needs commands in a created worktree, cd there for execution but keep artifact writes under SANDBOX_ROOT/results
19
+ - Treat the initial working directory as SANDBOX_ROOT; if a goal needs commands in a created worktree, cd there for execution but keep any declared outcome artifacts under SANDBOX_ROOT/results
20
+ - Preserve the sandbox runtime environment; do not reset PATH, HOME, or other provided env vars
21
+ - If `ACE_E2E_SANDBOX_RUNTIME_ROOT` is set, make sure command execution uses `$ACE_E2E_SANDBOX_RUNTIME_ROOT/bin` on PATH in the shell where you run scenario commands
22
+ - Run `ace-*` commands directly; do not wrap them with `timeout`, `env -i`, or other execution wrappers that can change behavior or hide diagnostics
23
+ - Do not bypass the public CLI with repo-local executables such as `./exe/ace-*`, `bin/ace-*`, or `ruby .../exe/ace-*`
20
24
  - Do not fabricate output - all artifacts must come from real tool execution
25
+ - Never background commands or start dependent verification captures before the command they verify has completed
26
+ - When a goal requires command captures, keep stdout and stderr separate; do not merge streams and do not use `2>&1`
27
+ - A command capture set is incomplete unless the matching `.stdout`, `.stderr`, and `.exit` files all exist
28
+ - Persist each command's `.stdout`, `.stderr`, and `.exit` files immediately after that command finishes, before starting the next command
29
+ - For commands that establish state, write that command's `.exit` file before running any list/status/fs-check/tmux verification for the same goal
30
+ - When a successful command prints a filesystem path to a generated artifact, copy that artifact into `results/` if the goal asks for supporting evidence from the generated file
21
31
  - If a goal fails, note the failure and continue to the next goal
22
- - After all goals, output a brief summary of what you produced for each goal
32
+ - Do not create synthetic helper reports or temp input files under results/ unless the scenario explicitly treats them as product outcomes
33
+ - After all goals, return concise runner observations describing what you did and what happened
23
34
  PROMPT
24
35
 
25
36
  VERIFIER_SYSTEM_PROMPT = <<~PROMPT
26
37
  You are an E2E test verifier. You inspect artifacts and render PASS/FAIL verdicts.
27
38
 
28
39
  Rules:
29
- - Evaluate each goal independently based solely on the artifacts provided
30
- - Do not speculate about what the runner did - only judge what exists
40
+ - Evaluate each goal independently based on sandbox state first, then runner observations, then raw debug captures only when needed
41
+ - Treat declared artifacts and helper filenames as hints, not as the source of truth
42
+ - If a helper file is missing or stale, inspect the sandbox directly before failing the goal
43
+ - Use artifact mtimes to detect runner ordering mistakes; if postcondition captures are older than the primary command's stdout/stderr/exit, classify the goal as `runner-error` unless direct sandbox state proves a product failure after the command completed
44
+ - Use read-only commands in the sandbox when they materially improve confidence (for example: git log/status/show, ls/find/cat)
45
+ - Do not speculate beyond the provided sandbox evidence and runner observations
31
46
  - For each failed goal, include a category:
32
- test-spec-error | tool-bug | runner-error | infrastructure-error
47
+ test-spec-error | tool-bug | runner-error | infrastructure-error | missing-artifact
33
48
  - For each goal, cite specific evidence (filenames, content snippets)
34
49
  - Follow the output format exactly
35
50
  PROMPT
@@ -38,21 +53,23 @@ module Ace
38
53
  # @param sandbox_path [String]
39
54
  # @param test_cases [Array<String>, nil]
40
55
  # @return [Hash]
41
- def prepare_runner(scenario:, sandbox_path:, test_cases: nil)
56
+ def prepare_runner(scenario:, sandbox_path:, test_cases: nil, artifact_contract: nil, repair_mode: false)
42
57
  cache_dir = ensure_cache_dir(sandbox_path)
43
- system_path = File.join(cache_dir, "runner-system.md")
44
- prompt_path = File.join(cache_dir, "runner-prompt.md")
58
+ file_prefix = repair_mode ? "runner-repair" : "runner"
59
+ system_path = File.join(cache_dir, "#{file_prefix}-system.md")
60
+ prompt_path = File.join(cache_dir, "#{file_prefix}-prompt.md")
45
61
 
46
62
  File.write(system_path, RUNNER_SYSTEM_PROMPT)
47
63
 
48
64
  bundled = bundle_markdown_file(File.join(scenario.dir_path, "runner.yml.md"), test_cases: test_cases)
49
65
  bundled = bundled.gsub("Workspace root: (current directory)", "Workspace root: #{File.expand_path(sandbox_path)}")
50
- File.write(prompt_path, bundled)
66
+ contract = build_runner_artifact_contract_section(artifact_contract, repair_mode: repair_mode)
67
+ File.write(prompt_path, [bundled, contract].reject(&:empty?).join("\n\n---\n\n"))
51
68
 
52
69
  {
53
70
  system_path: system_path,
54
71
  prompt_path: prompt_path,
55
- output_path: File.join(cache_dir, "runner-output.md")
72
+ output_path: File.join(cache_dir, "#{file_prefix}-output.md")
56
73
  }
57
74
  end
58
75
 
@@ -60,16 +77,20 @@ module Ace
60
77
  # @param sandbox_path [String]
61
78
  # @param test_cases [Array<String>, nil]
62
79
  # @return [Hash]
63
- def prepare_verifier(scenario:, sandbox_path:, test_cases: nil)
80
+ def prepare_verifier(scenario:, sandbox_path:, test_cases: nil, runner_observations: nil, artifact_contract: nil)
64
81
  cache_dir = ensure_cache_dir(sandbox_path)
65
82
  system_path = File.join(cache_dir, "verifier-system.md")
66
83
  prompt_path = File.join(cache_dir, "verifier-prompt.md")
67
84
 
68
85
  File.write(system_path, VERIFIER_SYSTEM_PROMPT)
69
86
 
87
+ project_context = build_project_context_section(scenario)
88
+ sandbox_context = build_sandbox_context_section(sandbox_path)
70
89
  artifacts = build_artifact_section(sandbox_path)
90
+ contract = build_artifact_contract_section(artifact_contract)
91
+ observations = build_runner_observation_section(runner_observations)
71
92
  criteria = bundle_markdown_file(File.join(scenario.dir_path, "verifier.yml.md"), test_cases: test_cases)
72
- File.write(prompt_path, [artifacts, criteria].join("\n\n---\n\n"))
93
+ File.write(prompt_path, [project_context, sandbox_context, artifacts, contract, observations, criteria].join("\n\n---\n\n"))
73
94
 
74
95
  {
75
96
  system_path: system_path,
@@ -154,6 +175,13 @@ module Ace
154
175
  parts.concat(tree_entries)
155
176
  parts << "```"
156
177
  parts << ""
178
+ parts << "## File metadata"
179
+ parts << "```"
180
+ files.each do |file|
181
+ parts << "#{relative_path(file, sandbox_path)}\tmtime=#{File.mtime(file).utc.iso8601}"
182
+ end
183
+ parts << "```"
184
+ parts << ""
157
185
  parts << "## File contents"
158
186
  parts << ""
159
187
 
@@ -168,6 +196,129 @@ module Ace
168
196
  parts.join("\n").rstrip
169
197
  end
170
198
 
199
+ def build_project_context_section(scenario)
200
+ package_root = File.expand_path("../../..", scenario.dir_path)
201
+ source_root = File.expand_path("..", package_root)
202
+ files = [
203
+ File.join(package_root, "README.md"),
204
+ File.join(package_root, "docs", "usage.md"),
205
+ File.join(package_root, "docs", "getting-started.md"),
206
+ File.join(source_root, "CLAUDE.md")
207
+ ].select { |path| File.file?(path) }.first(3)
208
+
209
+ parts = []
210
+ parts << "# Project Context"
211
+ parts << ""
212
+ parts << "- Package: `#{scenario.package}`"
213
+ parts << "- Test ID: `#{scenario.test_id}`"
214
+ parts << "- Sandbox profile: `#{scenario.sandbox_profile}`"
215
+ parts << ""
216
+
217
+ files.each do |file|
218
+ parts << "## `#{File.basename(file)}`"
219
+ parts << "```"
220
+ parts << safe_read(file)
221
+ parts << "```"
222
+ parts << ""
223
+ end
224
+
225
+ parts.join("\n").rstrip
226
+ end
227
+
228
+ def build_sandbox_context_section(sandbox_path)
229
+ sandbox_path = File.expand_path(sandbox_path)
230
+ entries = Dir.glob(File.join(sandbox_path, "*"), File::FNM_DOTMATCH)
231
+ .reject { |path| %w[. ..].include?(File.basename(path)) }
232
+ .sort
233
+
234
+ parts = []
235
+ parts << "# Sandbox Context"
236
+ parts << ""
237
+ parts << "- Sandbox root: `#{sandbox_path}`"
238
+ parts << "- Inspect the sandbox directly when verifying source-of-truth state."
239
+ parts << ""
240
+ parts << "## Top-level entries"
241
+ parts << "```"
242
+ parts.concat(entries.map { |path| relative_path(path, sandbox_path) })
243
+ parts << "```"
244
+
245
+ parts.join("\n").rstrip
246
+ end
247
+
248
+ def build_runner_observation_section(runner_observations)
249
+ <<~MARKDOWN.rstrip
250
+ # Runner Observations
251
+
252
+ #{runner_observations.to_s.strip.empty? ? "(none provided)" : runner_observations.to_s.strip}
253
+ MARKDOWN
254
+ end
255
+
256
+ def build_runner_artifact_contract_section(artifact_contract, repair_mode:)
257
+ return "" if artifact_contract.nil? || artifact_contract.empty?
258
+
259
+ parts = []
260
+ parts << "# Artifact Contract"
261
+ parts << ""
262
+ if repair_mode
263
+ parts << "This is a bounded repair pass."
264
+ parts << "- Do not rerun goals whose required artifacts are already complete."
265
+ parts << "- For each goal with missing required artifacts, produce only the missing files."
266
+ parts << "- Prefer the minimal real public command needed to create the missing capture set."
267
+ parts << "- If the missing file is supporting evidence copied from an already-generated real artifact, copy that real artifact into `results/`."
268
+ parts << "- Do not invent content, fabricate captures, or rewrite unrelated artifacts."
269
+ else
270
+ parts << "A goal is not complete unless every required artifact for that goal exists on disk under `results/`."
271
+ parts << "- After finishing each goal, self-check the required artifact list below."
272
+ parts << "- If a required artifact is missing, fix it before moving on."
273
+ end
274
+ parts << ""
275
+
276
+ artifact_contract.sort.each do |tc_id, entry|
277
+ parts << "## #{tc_id}"
278
+ parts << ""
279
+ parts << "- Required artifacts: #{format_artifact_list(entry["required_artifacts"])}"
280
+ missing = Array(entry["missing_required_artifacts"])
281
+ unless missing.empty?
282
+ parts << "- Missing required artifacts: #{format_artifact_list(missing)}"
283
+ end
284
+ optional = Array(entry["optional_artifacts"])
285
+ parts << "- Optional artifacts: #{format_artifact_list(optional)}" unless optional.empty?
286
+ parts << ""
287
+ end
288
+
289
+ parts.join("\n").rstrip
290
+ end
291
+
292
+ def build_artifact_contract_section(artifact_contract)
293
+ return "# Artifact Contract\n\n(no snapshot provided)" if artifact_contract.nil? || artifact_contract.empty?
294
+
295
+ parts = []
296
+ parts << "# Artifact Contract"
297
+ parts << ""
298
+ parts << "Use this only as supporting context. Missing helper artifacts may be acceptable when sandbox state still proves the goal."
299
+ parts << ""
300
+
301
+ artifact_contract.sort.each do |tc_id, entry|
302
+ parts << "## #{tc_id}"
303
+ parts << ""
304
+ parts << "- Required artifacts: #{format_artifact_list(entry["required_artifacts"])}"
305
+ parts << "- Present required artifacts: #{format_artifact_list(entry["present_required_artifacts"])}"
306
+ parts << "- Missing required artifacts: #{format_artifact_list(entry["missing_required_artifacts"])}"
307
+ parts << "- Optional artifacts: #{format_artifact_list(entry["optional_artifacts"])}"
308
+ parts << "- Present optional artifacts: #{format_artifact_list(entry["present_optional_artifacts"])}"
309
+ parts << ""
310
+ end
311
+
312
+ parts.join("\n").rstrip
313
+ end
314
+
315
+ def format_artifact_list(paths)
316
+ items = Array(paths)
317
+ return "(none)" if items.empty?
318
+
319
+ items.map { |path| "`#{path}`" }.join(", ")
320
+ end
321
+
171
322
  def relative_path(path, root)
172
323
  File.expand_path(path).sub("#{File.expand_path(root)}/", "")
173
324
  end