ace-test-runner-e2e 0.29.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. checksums.yaml +7 -0
  2. data/.ace-defaults/e2e-runner/config.yml +70 -0
  3. data/.ace-defaults/nav/protocols/guide-sources/ace-test-runner-e2e.yml +11 -0
  4. data/.ace-defaults/nav/protocols/skill-sources/ace-test-runner-e2e.yml +19 -0
  5. data/.ace-defaults/nav/protocols/tmpl-sources/ace-test-runner-e2e.yml +12 -0
  6. data/.ace-defaults/nav/protocols/wfi-sources/ace-test-runner-e2e.yml +11 -0
  7. data/CHANGELOG.md +1166 -0
  8. data/LICENSE +21 -0
  9. data/README.md +42 -0
  10. data/Rakefile +15 -0
  11. data/exe/ace-test-e2e +15 -0
  12. data/exe/ace-test-e2e-sh +67 -0
  13. data/exe/ace-test-e2e-suite +13 -0
  14. data/handbook/guides/e2e-testing.g.md +124 -0
  15. data/handbook/guides/scenario-yml-reference.g.md +182 -0
  16. data/handbook/guides/tc-authoring.g.md +131 -0
  17. data/handbook/skills/as-e2e-create/SKILL.md +30 -0
  18. data/handbook/skills/as-e2e-fix/SKILL.md +35 -0
  19. data/handbook/skills/as-e2e-manage/SKILL.md +31 -0
  20. data/handbook/skills/as-e2e-plan-changes/SKILL.md +30 -0
  21. data/handbook/skills/as-e2e-review/SKILL.md +35 -0
  22. data/handbook/skills/as-e2e-rewrite/SKILL.md +31 -0
  23. data/handbook/skills/as-e2e-run/SKILL.md +48 -0
  24. data/handbook/skills/as-e2e-setup-sandbox/SKILL.md +34 -0
  25. data/handbook/templates/ace-taskflow-fixture.template.md +322 -0
  26. data/handbook/templates/agent-experience-report.template.md +89 -0
  27. data/handbook/templates/metadata.template.yml +49 -0
  28. data/handbook/templates/scenario.yml.template.yml +60 -0
  29. data/handbook/templates/tc-file.template.md +45 -0
  30. data/handbook/templates/test-report.template.md +94 -0
  31. data/handbook/workflow-instructions/e2e/analyze-failures.wf.md +126 -0
  32. data/handbook/workflow-instructions/e2e/create.wf.md +395 -0
  33. data/handbook/workflow-instructions/e2e/execute.wf.md +253 -0
  34. data/handbook/workflow-instructions/e2e/fix.wf.md +166 -0
  35. data/handbook/workflow-instructions/e2e/manage.wf.md +179 -0
  36. data/handbook/workflow-instructions/e2e/plan-changes.wf.md +255 -0
  37. data/handbook/workflow-instructions/e2e/review.wf.md +286 -0
  38. data/handbook/workflow-instructions/e2e/rewrite.wf.md +281 -0
  39. data/handbook/workflow-instructions/e2e/run.wf.md +355 -0
  40. data/handbook/workflow-instructions/e2e/setup-sandbox.wf.md +461 -0
  41. data/lib/ace/test/end_to_end_runner/atoms/display_helpers.rb +234 -0
  42. data/lib/ace/test/end_to_end_runner/atoms/prompt_builder.rb +199 -0
  43. data/lib/ace/test/end_to_end_runner/atoms/result_parser.rb +166 -0
  44. data/lib/ace/test/end_to_end_runner/atoms/skill_prompt_builder.rb +166 -0
  45. data/lib/ace/test/end_to_end_runner/atoms/skill_result_parser.rb +244 -0
  46. data/lib/ace/test/end_to_end_runner/atoms/suite_report_prompt_builder.rb +103 -0
  47. data/lib/ace/test/end_to_end_runner/atoms/tc_fidelity_validator.rb +39 -0
  48. data/lib/ace/test/end_to_end_runner/atoms/test_case_parser.rb +108 -0
  49. data/lib/ace/test/end_to_end_runner/cli/commands/run_suite.rb +130 -0
  50. data/lib/ace/test/end_to_end_runner/cli/commands/run_test.rb +156 -0
  51. data/lib/ace/test/end_to_end_runner/models/test_case.rb +47 -0
  52. data/lib/ace/test/end_to_end_runner/models/test_result.rb +115 -0
  53. data/lib/ace/test/end_to_end_runner/models/test_scenario.rb +90 -0
  54. data/lib/ace/test/end_to_end_runner/molecules/affected_detector.rb +92 -0
  55. data/lib/ace/test/end_to_end_runner/molecules/config_loader.rb +75 -0
  56. data/lib/ace/test/end_to_end_runner/molecules/failure_finder.rb +203 -0
  57. data/lib/ace/test/end_to_end_runner/molecules/fixture_copier.rb +35 -0
  58. data/lib/ace/test/end_to_end_runner/molecules/pipeline_executor.rb +121 -0
  59. data/lib/ace/test/end_to_end_runner/molecules/pipeline_prompt_bundler.rb +182 -0
  60. data/lib/ace/test/end_to_end_runner/molecules/pipeline_report_generator.rb +321 -0
  61. data/lib/ace/test/end_to_end_runner/molecules/pipeline_sandbox_builder.rb +131 -0
  62. data/lib/ace/test/end_to_end_runner/molecules/progress_display_manager.rb +172 -0
  63. data/lib/ace/test/end_to_end_runner/molecules/report_writer.rb +259 -0
  64. data/lib/ace/test/end_to_end_runner/molecules/scenario_loader.rb +254 -0
  65. data/lib/ace/test/end_to_end_runner/molecules/setup_executor.rb +181 -0
  66. data/lib/ace/test/end_to_end_runner/molecules/simple_display_manager.rb +72 -0
  67. data/lib/ace/test/end_to_end_runner/molecules/suite_progress_display_manager.rb +223 -0
  68. data/lib/ace/test/end_to_end_runner/molecules/suite_report_writer.rb +277 -0
  69. data/lib/ace/test/end_to_end_runner/molecules/suite_simple_display_manager.rb +116 -0
  70. data/lib/ace/test/end_to_end_runner/molecules/test_discoverer.rb +136 -0
  71. data/lib/ace/test/end_to_end_runner/molecules/test_executor.rb +332 -0
  72. data/lib/ace/test/end_to_end_runner/organisms/suite_orchestrator.rb +830 -0
  73. data/lib/ace/test/end_to_end_runner/organisms/test_orchestrator.rb +442 -0
  74. data/lib/ace/test/end_to_end_runner/version.rb +9 -0
  75. data/lib/ace/test/end_to_end_runner.rb +71 -0
  76. metadata +220 -0
@@ -0,0 +1,203 @@
1
+ # frozen_string_literal: true
2
+
3
require "date"
require "yaml"
4
+
5
+ module Ace
6
+ module Test
7
+ module EndToEndRunner
8
+ module Molecules
9
+ # Scans cache for failed test cases from previous E2E test runs
10
+ #
11
+ # Reads metadata.yml files from .ace-local/test-e2e/*-reports/ directories
12
+ # and extracts failed_test_cases arrays. Used by --only-failures CLI flag
13
+ # to re-run only tests that failed previously.
14
+ #
15
+ # Note: This is a Molecule (not an Atom) because it performs filesystem
16
+ # I/O via Dir.glob and YAML file reading.
17
class FailureFinder
  # Cache directory, relative to the base directory, that holds E2E run output.
  CACHE_DIR = ".ace-local/test-e2e"
  # Name of the metadata document inside each report directory.
  METADATA_FILE = "metadata.yml"
  # Suffix carried by every report directory inside CACHE_DIR.
  REPORTS_SUFFIX = "-reports"
  # Status values treated as a failed run when no per-test-case data exists.
  FAILING_STATUSES = %w[fail partial error incomplete].freeze
  # Marker returned in place of TC IDs, meaning "re-run the entire scenario".
  WILDCARD = "*"

  # Find failed test cases for a specific package.
  #
  # Only the most recent metadata entry per test-id is considered.
  #
  # @param package [String] Package name (e.g., "ace-lint")
  # @param base_dir [String] Base directory to search from (default: current dir)
  # @return [Array<String>] Unique failed test case IDs (e.g., ["TC-001", "TC-003"])
  def find_failures(package:, base_dir: Dir.pwd)
    metadata_files = discover_metadata_files(base_dir)
    return [] if metadata_files.empty?

    extract_all_failed_ids(recent_entries_for(metadata_files, package))
  end

  # Find failed test cases across all packages.
  #
  # Entries are grouped by test-id only (not by package), and the most
  # recent entry of each group is used.
  #
  # @param base_dir [String] Base directory to search from (default: current dir)
  # @return [Array<String>] Unique failed test case IDs
  def find_all_failures(base_dir: Dir.pwd)
    metadata_files = discover_metadata_files(base_dir)
    return [] if metadata_files.empty?

    extract_all_failed_ids(most_recent_per_test(metadata_files))
  end

  # Find failed test cases grouped by package.
  #
  # Packages without any failure are omitted from the result.
  #
  # @param packages [Array<String>] Package names to scan
  # @param base_dir [String] Base directory to search from (default: current dir)
  # @return [Hash{String => Array<String>}] Package name to failed test case IDs
  def find_failures_by_package(packages:, base_dir: Dir.pwd)
    metadata_files = discover_metadata_files(base_dir)
    return {} if metadata_files.empty?

    packages.each_with_object({}) do |package, result|
      failed_ids = extract_all_failed_ids(recent_entries_for(metadata_files, package))
      result[package] = failed_ids unless failed_ids.empty?
    end
  end

  # Find failed test scenarios grouped by package and scenario (test-id).
  #
  # Like find_failures_by_package but keeps per-scenario granularity.
  # Scenarios and packages without failures are omitted.
  #
  # @param packages [Array<String>] Package names to scan
  # @param base_dir [String] Base directory to search from (default: current dir)
  # @return [Hash{String => Hash{String => Array<String>}}]
  #   Package name => { test-id => failed TC IDs }
  def find_failures_by_scenario(packages:, base_dir: Dir.pwd)
    metadata_files = discover_metadata_files(base_dir)
    return {} if metadata_files.empty?

    packages.each_with_object({}) do |package, result|
      scenario_failures = recent_entries_for(metadata_files, package).each_with_object({}) do |entry, by_scenario|
        failed_ids = extract_failed_test_cases(entry[:data])
        by_scenario[entry[:data]["test-id"]] = failed_ids unless failed_ids.empty?
      end

      result[package] = scenario_failures unless scenario_failures.empty?
    end
  end

  private

  # Discover and parse every metadata.yml below the cache directory.
  #
  # The glob pattern is kept relative and resolved through `base:` so that
  # glob metacharacters in base_dir (e.g. "proj[1]") are treated literally.
  #
  # @param base_dir [String] Base directory
  # @return [Array<Hash>] Parsed metadata entries with :path and :data keys
  def discover_metadata_files(base_dir)
    cache_path = File.join(base_dir, CACHE_DIR)
    return [] unless Dir.exist?(cache_path)

    relative_pattern = File.join("*#{REPORTS_SUFFIX}", METADATA_FILE)
    Dir.glob(relative_pattern, base: cache_path).sort.filter_map do |relative|
      load_metadata(File.join(cache_path, relative))
    end
  end

  # Load and parse one metadata.yml, tolerating unreadable or invalid files.
  #
  # Failures are deliberately non-fatal (a single corrupt report must not
  # block a re-run); they are only reported when ENV["DEBUG"] is set.
  #
  # @param path [String] Path to metadata.yml
  # @return [Hash, nil] Hash with :path and :data keys, or nil when the file
  #   cannot be parsed or does not contain a mapping
  def load_metadata(path)
    data = YAML.safe_load_file(path, permitted_classes: [Date])
    return nil unless data.is_a?(Hash)

    {path: path, data: data}
  rescue => e
    warn "Warning: Could not parse #{path}: #{e.message}" if ENV["DEBUG"]
    nil
  end

  # Most recent metadata entries belonging to one package.
  #
  # @param metadata_files [Array<Hash>] All discovered metadata entries
  # @param package [String] Package name to filter by
  # @return [Array<Hash>] Most recent entry per test-id within the package
  def recent_entries_for(metadata_files, package)
    most_recent_per_test(filter_by_package(metadata_files, package))
  end

  # Filter metadata entries by package name.
  #
  # @param entries [Array<Hash>] Metadata entries
  # @param package [String] Package name to filter by
  # @return [Array<Hash>] Entries whose "package" value equals package
  def filter_by_package(entries, package)
    entries.select { |entry| entry[:data]["package"] == package }
  end

  # Get the most recent metadata entry per test-id.
  #
  # Recency is decided by comparing report directory names as strings;
  # the directory name is expected to start with a sortable timestamp.
  #
  # @param entries [Array<Hash>] Metadata entries
  # @return [Array<Hash>] One entry per distinct test-id
  def most_recent_per_test(entries)
    entries
      .group_by { |entry| entry[:data]["test-id"] }
      .values
      .map { |group| group.max_by { |entry| File.basename(File.dirname(entry[:path])) } }
  end

  # Aggregate failed test case IDs over several metadata entries.
  #
  # @param entries [Array<Hash>] Most recent metadata entries
  # @return [Array<String>] De-duplicated failed test case IDs
  def extract_all_failed_ids(entries)
    entries.flat_map { |entry| extract_failed_test_cases(entry[:data]) }.uniq
  end

  # Extract failed test case IDs from a single metadata hash.
  #
  # Sources are tried in order:
  # 1. `failed` - array of hashes, each carrying the ID under "tc"
  # 2. `failed_test_cases` - array of IDs
  # 3. `status` - when it is one of FAILING_STATUSES, [WILDCARD] is returned
  #    to signal that the whole scenario should be re-run
  #
  # @param data [Hash] Parsed metadata.yml data
  # @return [Array<String>] Failed test case IDs, ["*"] for wildcard, or []
  def extract_failed_test_cases(data)
    failed_entries = data["failed"]
    if failed_entries.is_a?(Array)
      # Non-hash items and hashes without "tc" are skipped by filter_map.
      tc_ids = failed_entries.filter_map { |entry| entry["tc"] if entry.is_a?(Hash) }
      return tc_ids unless tc_ids.empty?
    end

    failed_ids = data["failed_test_cases"]
    return failed_ids if failed_ids.is_a?(Array) && !failed_ids.empty?

    FAILING_STATUSES.include?(data["status"]) ? [WILDCARD] : []
  end
end
200
+ end
201
+ end
202
+ end
203
+ end
@@ -0,0 +1,35 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "fileutils"
4
+
5
+ module Ace
6
+ module Test
7
+ module EndToEndRunner
8
+ module Molecules
9
+ # Copies fixture files from a scenario's fixtures/ directory into a sandbox
10
+ #
11
+ # Preserves the full directory tree structure. Used by SetupExecutor
12
+ # to populate sandboxes with test data files.
13
+ #
14
+ # Note: This is a Molecule (not an Atom) because it performs filesystem
15
+ # I/O via FileUtils.cp_r and Dir.glob.
16
class FixtureCopier
  # Copy a fixture tree into the target directory.
  #
  # The target directory is created when missing and the contents of
  # source_dir (including dotfiles) are copied into it recursively.
  #
  # The returned list describes what was copied: it is built from the
  # source tree, so hidden entries are included and files that already
  # existed in the target are not reported.
  #
  # @param source_dir [String] Path to the fixtures/ directory
  # @param target_dir [String] Path to the sandbox directory
  # @return [Array<String>] Sorted relative paths of copied files and directories
  # @raise [ArgumentError] If source_dir does not exist
  def copy(source_dir:, target_dir:)
    raise ArgumentError, "Fixture source directory not found: #{source_dir}" unless Dir.exist?(source_dir)

    FileUtils.mkdir_p(target_dir)
    FileUtils.cp_r("#{source_dir}/.", target_dir)

    # FNM_DOTMATCH makes hidden entries visible; depending on the Ruby
    # version it also yields "." / ".." style entries, which are dropped.
    Dir.glob("**/*", File::FNM_DOTMATCH, base: source_dir)
      .reject { |entry| %w[. ..].include?(File.basename(entry)) }
      .sort
  end
end
32
+ end
33
+ end
34
+ end
35
+ end
@@ -0,0 +1,121 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "ace/llm"
4
+ require "ace/llm/query_interface"
5
+
6
+ module Ace
7
+ module Test
8
+ module EndToEndRunner
9
+ module Molecules
10
+ # Executes standalone scenarios using the deterministic pipeline.
11
class PipelineExecutor
  # Collaborators may be injected; any that is omitted is instantiated
  # with its no-argument constructor.
  #
  # @param provider [String]
  # @param timeout [Integer]
  # @param sandbox_builder [Molecules::PipelineSandboxBuilder]
  # @param prompt_bundler [Molecules::PipelinePromptBundler]
  # @param report_generator [Molecules::PipelineReportGenerator]
  def initialize(provider:, timeout:, sandbox_builder: nil, prompt_bundler: nil, report_generator: nil)
    @provider = provider
    @timeout = timeout
    @sandbox_builder = sandbox_builder || PipelineSandboxBuilder.new
    @prompt_bundler = prompt_bundler || PipelinePromptBundler.new
    @report_generator = report_generator || PipelineReportGenerator.new
  end

  # Run one scenario: build the sandbox, query the LLM with the runner
  # prompt, query it again with the verifier prompt, then hand the
  # verifier's text to the report generator.
  #
  # Any StandardError raised along the way is converted into a failure
  # report instead of propagating.
  #
  # @param scenario [Models::TestScenario]
  # @param cli_args [String, nil]
  # @param sandbox_path [String]
  # @param report_dir [String]
  # @param env_vars [Hash, nil]
  # @param test_cases [Array<String>, nil]
  # @return [Models::TestResult]
  def execute(scenario:, cli_args:, sandbox_path:, report_dir:, env_vars: nil, test_cases: nil)
    started_at = Time.now
    stage = {scenario: scenario, sandbox_path: sandbox_path, test_cases: test_cases}

    # Values returned by the sandbox builder win over caller-supplied ones.
    subprocess_env = (env_vars || {}).merge(@sandbox_builder.build(**stage))

    # The runner's return value is discarded; only the verifier's reply
    # feeds the report. The verifier prompt is prepared after the runner
    # call has returned.
    query_with(@prompt_bundler.prepare_runner(**stage), cli_args, subprocess_env)
    verdict = query_with(@prompt_bundler.prepare_verifier(**stage), cli_args, subprocess_env)

    @report_generator.generate(
      scenario: scenario,
      verifier_output: verdict[:text],
      report_dir: report_dir,
      provider: @provider,
      started_at: started_at,
      completed_at: Time.now
    )
  rescue => e
    report_failure(e, scenario: scenario, report_dir: report_dir, started_at: started_at)
  end

  private

  # Ask the report generator to record a failed run; if that also fails,
  # fall back to an in-memory error result naming both problems.
  def report_failure(error, scenario:, report_dir:, started_at:)
    reason = "#{error.class}: #{error.message}"

    @report_generator.write_failure_report(
      scenario: scenario,
      report_dir: report_dir,
      provider: @provider,
      started_at: started_at || Time.now,
      completed_at: Time.now,
      error_message: reason
    )
  rescue => write_error
    Models::TestResult.new(
      test_id: scenario.test_id,
      status: "error",
      summary: "Execution pipeline failed",
      error: "#{reason}; failed to write error report: #{write_error.class}: #{write_error.message}",
      started_at: started_at || Time.now,
      completed_at: Time.now
    )
  end

  # Run one LLM query for a prepared stage (hash with :prompt_path,
  # :system_path and :output_path).
  def query_with(files, cli_args, subprocess_env)
    run_llm(
      prompt_path: files[:prompt_path],
      system_path: files[:system_path],
      output_path: files[:output_path],
      cli_args: cli_args,
      env_vars: subprocess_env
    )
  end

  # Read the prompt files and forward them to the LLM query interface.
  # The working directory is taken from PROJECT_ROOT_PATH in env_vars
  # (string key first, then symbol key).
  def run_llm(prompt_path:, system_path:, output_path:, cli_args:, env_vars:)
    user_prompt = File.read(prompt_path)
    system_prompt = File.read(system_path)
    project_root = env_vars["PROJECT_ROOT_PATH"] || env_vars[:PROJECT_ROOT_PATH]

    Ace::LLM::QueryInterface.query(
      @provider,
      user_prompt,
      system: system_prompt,
      cli_args: cli_args,
      timeout: @timeout,
      fallback: false,
      output: output_path,
      working_dir: project_root,
      subprocess_env: env_vars
    )
  end
end
118
+ end
119
+ end
120
+ end
121
+ end
@@ -0,0 +1,182 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "date"
4
+ require "fileutils"
5
+ require "yaml"
6
+
7
+ module Ace
8
+ module Test
9
+ module EndToEndRunner
10
+ module Molecules
11
+ # Prepares deterministic runner/verifier prompt files for pipeline execution.
12
class PipelinePromptBundler
  # System prompt written next to the runner prompt.
  RUNNER_SYSTEM_PROMPT = <<~PROMPT
    You are an E2E test executor working in a sandbox directory.

    Rules:
    - Execute each goal in order
    - Save all artifacts to results/tc/{NN}/ directories as specified
    - Treat the initial working directory as SANDBOX_ROOT; if a goal needs commands in a created worktree, cd there for execution but keep artifact writes under SANDBOX_ROOT/results
    - Do not fabricate output - all artifacts must come from real tool execution
    - If a goal fails, note the failure and continue to the next goal
    - After all goals, output a brief summary of what you produced for each goal
  PROMPT

  # System prompt written next to the verifier prompt.
  VERIFIER_SYSTEM_PROMPT = <<~PROMPT
    You are an E2E test verifier. You inspect artifacts and render PASS/FAIL verdicts.

    Rules:
    - Evaluate each goal independently based solely on the artifacts provided
    - Do not speculate about what the runner did - only judge what exists
    - For each failed goal, include a category:
    test-spec-error | tool-bug | runner-error | infrastructure-error
    - For each goal, cite specific evidence (filenames, content snippets)
    - Follow the output format exactly
  PROMPT

  # Placeholder in runner prompts that is replaced by the absolute sandbox path.
  WORKSPACE_PLACEHOLDER = "Workspace root: (current directory)"
  # Splits a document into YAML front matter (group 1) and body (group 2).
  FRONTMATTER_PATTERN = /\A---\s*\r?\n(.*?)\r?\n---\s*\r?\n(.*)\z/m
  # Separator placed between bundled sections.
  SECTION_SEPARATOR = "\n\n---\n\n"

  # Write the runner system prompt and the bundled runner prompt into the
  # sandbox cache directory.
  #
  # The prompt is built from `runner.yml.md` in the scenario directory plus
  # the files listed in its front matter, with WORKSPACE_PLACEHOLDER
  # replaced by the expanded sandbox path.
  #
  # @param scenario [Models::TestScenario]
  # @param sandbox_path [String]
  # @param test_cases [Array<String>, nil]
  # @return [Hash] :system_path, :prompt_path and :output_path
  # @raise [ArgumentError] If the front matter is not valid YAML
  def prepare_runner(scenario:, sandbox_path:, test_cases: nil)
    cache_dir = ensure_cache_dir(sandbox_path)
    system_path = File.join(cache_dir, "runner-system.md")
    prompt_path = File.join(cache_dir, "runner-prompt.md")

    File.write(system_path, RUNNER_SYSTEM_PROMPT)

    workspace_line = "Workspace root: #{File.expand_path(sandbox_path)}"
    bundled = bundle_markdown_file(File.join(scenario.dir_path, "runner.yml.md"), test_cases: test_cases)
    # Block form: backslashes in the path must not be read as back-references.
    File.write(prompt_path, bundled.gsub(WORKSPACE_PLACEHOLDER) { workspace_line })

    {
      system_path: system_path,
      prompt_path: prompt_path,
      output_path: File.join(cache_dir, "runner-output.md")
    }
  end

  # Write the verifier system prompt and the verifier prompt into the
  # sandbox cache directory.
  #
  # The prompt consists of a dump of every file below `results/` in the
  # sandbox, followed by the bundled `verifier.yml.md` criteria.
  #
  # @param scenario [Models::TestScenario]
  # @param sandbox_path [String]
  # @param test_cases [Array<String>, nil]
  # @return [Hash] :system_path, :prompt_path and :output_path
  # @raise [ArgumentError] If the front matter is not valid YAML
  def prepare_verifier(scenario:, sandbox_path:, test_cases: nil)
    cache_dir = ensure_cache_dir(sandbox_path)
    system_path = File.join(cache_dir, "verifier-system.md")
    prompt_path = File.join(cache_dir, "verifier-prompt.md")

    File.write(system_path, VERIFIER_SYSTEM_PROMPT)

    artifacts = build_artifact_section(sandbox_path)
    criteria = bundle_markdown_file(File.join(scenario.dir_path, "verifier.yml.md"), test_cases: test_cases)
    File.write(prompt_path, [artifacts, criteria].join(SECTION_SEPARATOR))

    {
      system_path: system_path,
      prompt_path: prompt_path,
      output_path: File.join(cache_dir, "verifier-output.md")
    }
  end

  private

  # Create (if needed) and return `<sandbox>/.ace-local/e2e`.
  def ensure_cache_dir(sandbox_path)
    cache_dir = File.join(File.expand_path(sandbox_path), ".ace-local", "e2e")
    FileUtils.mkdir_p(cache_dir)
    cache_dir
  end

  # Concatenate the body of a markdown file with the files named under
  # `bundle.files` in its front matter, resolved relative to that file.
  # When test cases are selected, TC-prefixed entries outside the
  # selection are left out; empty sections are dropped.
  def bundle_markdown_file(path, test_cases: nil)
    frontmatter, body = split_frontmatter(read_text(path), path)
    selected_ids = normalize_selected_ids(test_cases)

    sections = [body.rstrip]
    parse_bundle_files(frontmatter).each do |entry|
      next unless include_bundle_entry?(entry, selected_ids)

      sections << read_text(File.expand_path(entry, File.dirname(path))).rstrip
    end
    sections.reject(&:empty?).join(SECTION_SEPARATOR)
  end

  # Split raw text into parsed front matter and body. Text without a front
  # matter block is returned unchanged with an empty hash.
  #
  # @raise [ArgumentError] If the front matter is not valid YAML
  def split_frontmatter(raw, path)
    match = raw.match(FRONTMATTER_PATTERN)
    return [{}, raw] unless match

    parsed = YAML.safe_load(match[1], permitted_classes: [Date]) || {}
    [parsed, match[2]]
  rescue Psych::SyntaxError => e
    # Re-raised with the file path so the broken scenario file can be found.
    raise ArgumentError, "Invalid YAML frontmatter in #{path}: #{e.message}"
  end

  # Extract the non-empty `bundle.files` entries as strings. Front matter
  # of an unexpected shape yields an empty list.
  def parse_bundle_files(frontmatter)
    return [] unless frontmatter.is_a?(Hash)

    bundle = frontmatter["bundle"]
    files = bundle.is_a?(Hash) ? bundle["files"] : nil
    return [] unless files.is_a?(Array)

    files.map(&:to_s).reject(&:empty?)
  end

  # Upper-cased, de-duplicated selection, or nil when nothing is selected
  # (nil means "include every entry").
  def normalize_selected_ids(test_cases)
    ids = Array(test_cases).map { |tc| tc.to_s.upcase }.uniq
    ids.empty? ? nil : ids
  end

  # Entries without a TC prefix are always included; TC-prefixed entries
  # only when no selection exists or their ID is part of it.
  def include_bundle_entry?(entry, selected_ids)
    return true unless selected_ids

    tc_id = extract_tc_id(entry)
    tc_id.nil? || selected_ids.include?(tc_id)
  end

  # Upper-cased "TC-<digits><letters>" prefix of the file name, or nil.
  def extract_tc_id(path)
    match = File.basename(path).match(/\A(TC-\d+[a-z]*)/i)
    match ? match[1].upcase : nil
  end

  # Render a markdown section listing and quoting every regular file below
  # `<sandbox>/results`. The glob is resolved via `base:` so metacharacters
  # in the sandbox path are treated literally.
  def build_artifact_section(sandbox_path)
    sandbox_root = File.expand_path(sandbox_path)
    relative_files = Dir.glob(File.join("results", "**", "*"), base: sandbox_root)
      .select { |relative| File.file?(File.join(sandbox_root, relative)) }
      .sort

    parts = ["# Sandbox Artifacts", "", "## Directory tree", "```"]
    parts.concat(relative_files)
    parts.push("```", "", "## File contents", "")

    relative_files.each do |relative|
      parts.push("### `#{relative}`", "```", safe_read(File.join(sandbox_root, relative)), "```", "")
    end

    parts.join("\n").rstrip
  end

  # Read a text file as UTF-8 regardless of the process locale.
  def read_text(path)
    File.read(path, encoding: Encoding::UTF_8)
  end

  # Read an artifact as UTF-8, keeping valid multi-byte characters and
  # replacing only invalid byte sequences with "?".
  def safe_read(path)
    File.binread(path).force_encoding(Encoding::UTF_8).scrub("?")
  end
end
179
+ end
180
+ end
181
+ end
182
+ end