ace-test-runner-e2e 0.29.8 → 0.38.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. checksums.yaml +4 -4
  2. data/.ace-defaults/e2e-runner/config.yml +14 -2
  3. data/CHANGELOG.md +178 -0
  4. data/README.md +2 -2
  5. data/exe/ace-test-e2e-sh +9 -4
  6. data/handbook/guides/e2e-testing.g.md +43 -9
  7. data/handbook/guides/scenario-yml-reference.g.md +16 -8
  8. data/handbook/guides/tc-authoring.g.md +12 -5
  9. data/handbook/skills/as-e2e-fix/SKILL.md +2 -2
  10. data/handbook/skills/as-e2e-review/SKILL.md +2 -2
  11. data/handbook/templates/ace-taskflow-fixture.template.md +17 -17
  12. data/handbook/templates/agent-experience-report.template.md +3 -2
  13. data/handbook/templates/scenario.yml.template.yml +7 -2
  14. data/handbook/templates/tc-file.template.md +14 -4
  15. data/handbook/workflow-instructions/e2e/analyze-failures.wf.md +53 -6
  16. data/handbook/workflow-instructions/e2e/create.wf.md +118 -25
  17. data/handbook/workflow-instructions/e2e/execute.wf.md +11 -7
  18. data/handbook/workflow-instructions/e2e/fix.wf.md +65 -15
  19. data/handbook/workflow-instructions/e2e/plan-changes.wf.md +17 -1
  20. data/handbook/workflow-instructions/e2e/review.wf.md +36 -25
  21. data/handbook/workflow-instructions/e2e/rewrite.wf.md +15 -8
  22. data/handbook/workflow-instructions/e2e/run.wf.md +50 -26
  23. data/handbook/workflow-instructions/e2e/setup-sandbox.wf.md +4 -4
  24. data/lib/ace/test/end_to_end_runner/atoms/skill_prompt_builder.rb +7 -5
  25. data/lib/ace/test/end_to_end_runner/atoms/skill_result_parser.rb +73 -7
  26. data/lib/ace/test/end_to_end_runner/cli/commands/run_test.rb +21 -8
  27. data/lib/ace/test/end_to_end_runner/models/test_case.rb +8 -2
  28. data/lib/ace/test/end_to_end_runner/models/test_result.rb +9 -3
  29. data/lib/ace/test/end_to_end_runner/models/test_scenario.rb +4 -2
  30. data/lib/ace/test/end_to_end_runner/molecules/affected_detector.rb +7 -2
  31. data/lib/ace/test/end_to_end_runner/molecules/bwrap_sandbox_backend.rb +271 -0
  32. data/lib/ace/test/end_to_end_runner/molecules/config_loader.rb +28 -1
  33. data/lib/ace/test/end_to_end_runner/molecules/integration_runner.rb +122 -0
  34. data/lib/ace/test/end_to_end_runner/molecules/pipeline_executor.rb +157 -16
  35. data/lib/ace/test/end_to_end_runner/molecules/pipeline_prompt_bundler.rb +121 -8
  36. data/lib/ace/test/end_to_end_runner/molecules/pipeline_report_generator.rb +91 -19
  37. data/lib/ace/test/end_to_end_runner/molecules/pipeline_sandbox_builder.rb +119 -18
  38. data/lib/ace/test/end_to_end_runner/molecules/report_writer.rb +13 -12
  39. data/lib/ace/test/end_to_end_runner/molecules/sandbox_runtime_builder.rb +282 -0
  40. data/lib/ace/test/end_to_end_runner/molecules/scenario_loader.rb +85 -5
  41. data/lib/ace/test/end_to_end_runner/molecules/setup_executor.rb +98 -16
  42. data/lib/ace/test/end_to_end_runner/molecules/suite_report_writer.rb +241 -97
  43. data/lib/ace/test/end_to_end_runner/molecules/test_discoverer.rb +38 -13
  44. data/lib/ace/test/end_to_end_runner/molecules/test_executor.rb +27 -5
  45. data/lib/ace/test/end_to_end_runner/organisms/suite_orchestrator.rb +73 -15
  46. data/lib/ace/test/end_to_end_runner/organisms/test_orchestrator.rb +120 -19
  47. data/lib/ace/test/end_to_end_runner/version.rb +1 -1
  48. data/lib/ace/test/end_to_end_runner.rb +2 -0
  49. metadata +19 -2
@@ -1,7 +1,10 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require "fileutils"
4
+ require "ostruct"
4
5
  require "yaml"
6
+ require "set"
7
+ require "date"
5
8
  require "ace/llm"
6
9
  require "ace/llm/query_interface"
7
10
 
@@ -9,7 +12,7 @@ module Ace
9
12
  module Test
10
13
  module EndToEndRunner
11
14
  module Molecules
12
- # Writes a suite-level final report aggregating all test results
15
+ # Writes an aggregated package or suite report
13
16
  #
14
17
  # Uses LLM synthesis to generate rich reports with root cause analysis,
15
18
  # friction insights, and improvement suggestions. Falls back to a static
@@ -22,7 +25,12 @@ module Ace
22
25
  @timeout = reporting["timeout"] || 60
23
26
  end
24
27
 
25
- # Write a suite-level final report
28
+ REPORT_KINDS = {
29
+ package: ->(timestamp, package) { "#{timestamp}-#{package}-report.md" },
30
+ suite: ->(timestamp, _package) { "#{timestamp}-suite-report.md" }
31
+ }.freeze
32
+
33
+ # Write an aggregated report
26
34
  #
27
35
  # @param results [Array<Models::TestResult>] Test results (ordered)
28
36
  # @param scenarios [Array<Models::TestScenario>] Corresponding scenarios
@@ -30,22 +38,31 @@ module Ace
30
38
  # @param timestamp [String] Timestamp ID for this run
31
39
  # @param base_dir [String] Base directory for cache output
32
40
  # @return [String] Path to the written report file
33
- def write(results, scenarios, package:, timestamp:, base_dir:)
41
+ def write(results, scenarios, package:, timestamp:, base_dir:, report_kind: :package, diagnostics: nil)
34
42
  cache_dir = File.join(base_dir, ".ace-local", "test-e2e")
35
43
  FileUtils.mkdir_p(cache_dir)
36
44
 
37
- report_path = File.join(cache_dir, "#{timestamp}-final-report.md")
45
+ report_path = File.join(cache_dir, report_filename(report_kind, timestamp, package))
38
46
 
39
47
  overall_status = compute_status(results)
40
48
  executed_at = Time.now.utc.strftime("%Y-%m-%dT%H:%M:%SZ")
41
-
42
- content = synthesize_report(
43
- results, scenarios,
49
+ results_data = build_results_data(results, scenarios)
50
+ narrative_sections = synthesize_narrative_sections(
51
+ results_data,
44
52
  package: package,
45
53
  timestamp: timestamp,
46
54
  overall_status: overall_status,
47
55
  executed_at: executed_at
48
56
  )
57
+ content = build_report(
58
+ results_data,
59
+ package: package,
60
+ timestamp: timestamp,
61
+ overall_status: overall_status,
62
+ executed_at: executed_at,
63
+ narrative_sections: narrative_sections,
64
+ diagnostics: diagnostics
65
+ )
49
66
 
50
67
  File.write(report_path, content)
51
68
  report_path
@@ -53,10 +70,16 @@ module Ace
53
70
 
54
71
  private
55
72
 
56
- # Attempt LLM synthesis, falling back to static template
57
- def synthesize_report(results, scenarios, package:, timestamp:, overall_status:, executed_at:)
58
- results_data = build_results_data(results, scenarios)
73
+ def report_filename(report_kind, timestamp, package)
74
+ builder = REPORT_KINDS[report_kind.to_sym]
75
+ raise ArgumentError, "Unknown report kind: #{report_kind}" unless builder
59
76
 
77
+ builder.call(timestamp, package)
78
+ end
79
+
80
+ # Attempt LLM synthesis for narrative sections only, falling back to
81
+ # deterministic defaults when the model is unavailable or malformed.
82
+ def synthesize_narrative_sections(results_data, package:, timestamp:, overall_status:, executed_at:)
60
83
  prompt_builder = Atoms::SuiteReportPromptBuilder.new
61
84
  user_prompt = prompt_builder.build(
62
85
  results_data,
@@ -73,51 +96,35 @@ module Ace
73
96
  timeout: @timeout,
74
97
  temperature: 0.3
75
98
  )
76
-
77
- total_passed = results.sum(&:passed_count)
78
- total_tc = results.sum(&:total_count)
79
- validate_overall_line(response[:text], total_passed, total_tc)
99
+ extract_narrative_sections(response[:text])
80
100
  rescue => e
81
- # LLM failed fall back to static report
82
- warn "Warning: LLM synthesis failed (#{e.class}: #{e.message}), using static report" if ENV["DEBUG"]
83
- executed_date = Time.now.utc.strftime("%Y-%m-%d")
84
- total_passed = results.sum(&:passed_count)
85
- total_failed = results.sum(&:failed_count)
86
- total_tc = results.sum(&:total_count)
87
-
88
- build_static_report(
89
- results, scenarios,
90
- package: package,
91
- timestamp: timestamp,
92
- overall_status: overall_status,
93
- executed_at: executed_at,
94
- executed_date: executed_date,
95
- total_passed: total_passed,
96
- total_failed: total_failed,
97
- total_tc: total_tc
98
- )
101
+ warn "Warning: LLM synthesis failed (#{e.class}: #{e.message}), using deterministic narrative" if ENV["DEBUG"]
102
+ fallback_narrative_sections(results_data)
99
103
  end
100
104
 
101
105
  # Read summary and experience report content from each result's report dir
102
106
  def build_results_data(results, scenarios)
103
107
  results.each_with_index.map do |result, i|
104
- scenario = scenarios[i]
108
+ scenario = scenario_for_result(result, scenarios, i)
105
109
  report_dir = result.report_dir
106
110
 
107
111
  summary_content = read_report_file(report_dir, "summary.r.md")
108
112
  experience_content = read_report_file(report_dir, "experience.r.md")
109
113
 
114
+ report_metadata = read_report_frontmatter(report_dir)
115
+
110
116
  {
111
117
  test_id: result.test_id,
112
118
  title: scenario.title,
113
119
  status: result.status,
114
- passed: result.passed_count,
115
- failed: result.failed_count,
116
- total: result.total_count,
117
- test_cases: result.test_cases,
120
+ passed: reported_count(report_metadata, result, "passed"),
121
+ failed: reported_count(report_metadata, result, "failed"),
122
+ total: reported_count(report_metadata, result, "total"),
123
+ test_cases: canonical_test_cases(report_metadata, result),
118
124
  report_dir_name: report_dir ? File.basename(report_dir) : nil,
119
125
  summary_content: summary_content,
120
- experience_content: experience_content
126
+ experience_content: experience_content,
127
+ canonical_tc_source: !report_metadata.empty?
121
128
  }
122
129
  end
123
130
  end
@@ -132,21 +139,71 @@ module Ace
132
139
  File.read(path)
133
140
  end
134
141
 
135
- # Validate the LLM-generated Overall line against deterministic totals.
136
- # If the LLM hallucinated wrong numbers, replace the line with correct values.
137
- def validate_overall_line(report_text, expected_passed, expected_total)
138
- expected_pct = (expected_total > 0) ? (expected_passed * 100.0 / expected_total).round(0) : 0
139
- correct_line = "**Overall:** #{expected_passed}/#{expected_total} test cases passed (#{expected_pct}%)"
142
+ def read_report_frontmatter(report_dir)
143
+ return {} unless report_dir
140
144
 
141
- # Match patterns like "**Overall:** X/Y test cases passed (Z%)"
142
- overall_pattern = /\*\*Overall:\*\*\s*\d+\/\d+\s+test cases passed\s*\(\d+%\)/
145
+ path = File.join(report_dir, "report.md")
146
+ return {} unless File.exist?(path)
143
147
 
144
- if report_text.match?(overall_pattern)
145
- report_text.gsub(overall_pattern, correct_line)
146
- else
147
- # No Overall line found — append the correct one after the summary table
148
- "#{report_text.rstrip}\n\n#{correct_line}\n"
148
+ content = File.read(path)
149
+ match = content.match(/\A---\s*\n(.*?)\n---\s*\n/m)
150
+ return {} unless match
151
+
152
+ YAML.safe_load(match[1], permitted_classes: [Time, Date]) || {}
153
+ rescue
154
+ {}
155
+ end
156
+
157
# Resolve one per-scenario test-case count, preferring the value recorded in
# the scenario report frontmatter (`tcs-passed`, `tcs-failed`, `tcs-total`)
# over the count carried by the in-memory result.
#
# Frontmatter is parsed YAML, so a count can arrive as a String (or any other
# scalar). Callers sum these values and call `positive?` on them, so only
# non-negative integers (or strings that parse as such) are accepted; anything
# else falls back to the result's own count.
#
# @param report_metadata [Hash] Parsed report frontmatter (may be empty)
# @param result [#passed_count, #failed_count, #total_count] Test result
# @param kind [String] "passed", "failed"; any other value selects the total
# @return [Integer] Frontmatter count when usable, otherwise the result's count
def reported_count(report_metadata, result, kind)
  fallback =
    case kind
    when "passed" then result.passed_count
    when "failed" then result.failed_count
    else result.total_count
    end
  return fallback unless report_metadata.is_a?(Hash)

  raw = report_metadata["tcs-#{kind}"]
  # Integer(..., exception: false) yields nil for nil/blank/non-numeric input.
  count = raw.is_a?(Integer) ? raw : Integer(raw.to_s.strip, 10, exception: false)
  (count && count >= 0) ? count : fallback
end
167
+
168
# Build the per-test-case list for a scenario from its report frontmatter,
# falling back to the in-memory result when the frontmatter has nothing usable.
#
# Frontmatter keys read here:
#   "failed"               - list of hashes with "tc", optional "evidence"/"category"
#   "canonical-failed-tcs" - list of failed TC ids (used when no detail hash exists)
#   "passed"               - list of passed TC ids
#
# A TC id is reported at most once from "canonical-failed-tcs", and an id that
# is failed in either list is never also reported as passed.
#
# @param report_metadata [Hash] Parsed report frontmatter (may be empty)
# @param result [#test_cases] Test result used as the fallback source
# @return [Array<Hash>] Passed entries followed by failed entries
def canonical_test_cases(report_metadata, result)
  return result.test_cases unless report_metadata.is_a?(Hash) && !report_metadata.empty?

  failed_entries = Array(report_metadata["failed"]).filter_map do |entry|
    next unless entry.is_a?(Hash)

    id = entry["tc"] || entry[:tc]
    next unless id

    {
      id: id,
      description: "",
      status: "fail",
      notes: entry["evidence"] || entry[:evidence] || "See scenario report for details",
      category: entry["category"] || entry[:category] || "runner-error"
    }
  end

  failed_ids = failed_entries.map { |entry| entry[:id] }.to_set
  Array(report_metadata["canonical-failed-tcs"]).each do |tc_id|
    # Set#add? returns nil when the id is already known, so this both skips
    # duplicates and keeps failed_ids complete for the passed filter below.
    next unless failed_ids.add?(tc_id)

    failed_entries << {
      id: tc_id,
      description: "",
      status: "fail",
      notes: "See scenario report for details",
      category: "runner-error"
    }
  end

  passed_entries = Array(report_metadata["passed"]).filter_map do |tc_id|
    next if failed_ids.include?(tc_id)

    {id: tc_id, description: "", status: "pass", notes: ""}
  end

  canonical = passed_entries + failed_entries
  canonical.empty? ? result.test_cases : canonical
end
151
208
 
152
209
  def compute_status(results)
@@ -163,22 +220,26 @@ module Ace
163
220
  end
164
221
  end
165
222
 
166
- # Static fallback report (original template-based approach)
167
- def build_static_report(results, scenarios, package:, timestamp:, overall_status:,
168
- executed_at:, executed_date:, total_passed:, total_failed:, total_tc:)
169
- total_skipped = results.count(&:skipped?)
223
+ def build_report(results_data, package:, timestamp:, overall_status:, executed_at:, narrative_sections:, diagnostics:)
224
+ total_skipped = results_data.count { |r| r[:status] == "skip" }
225
+ total_passed = results_data.sum { |r| r[:passed] }
226
+ total_tc = results_data.sum { |r| r[:total] }
170
227
 
171
228
  parts = []
172
229
  parts << build_frontmatter(
173
230
  timestamp: timestamp, package: package, overall_status: overall_status,
174
- tests_run: results.size, executed_at: executed_at, skipped: total_skipped
231
+ tests_run: results_data.size, executed_at: executed_at, skipped: total_skipped
175
232
  )
176
- parts << build_header(package: package, tests_run: results.size, executed_date: executed_date, skipped: total_skipped)
177
- parts << build_summary_table(results, scenarios)
233
+ parts << build_header(package: package)
234
+ parts << build_summary_table(results_data)
178
235
  parts << build_overall_line(total_passed: total_passed, total_tc: total_tc)
179
- parts << build_failed_section(results, scenarios) if results.any?(&:failed?)
180
- parts << build_reports_section(results, scenarios)
181
- parts.join("\n")
236
+ parts << build_failed_section(results_data) if results_data.any? { |r| r[:failed].positive? }
237
+ parts << build_runner_diagnostics_section(diagnostics)
238
+ parts << build_narrative_section("Friction Analysis", narrative_sections[:friction])
239
+ parts << build_narrative_section("Improvement Suggestions", narrative_sections[:improvements])
240
+ parts << build_narrative_section("Positive Observations", narrative_sections[:positive])
241
+ parts << build_reports_section(results_data)
242
+ parts.compact.join("\n")
182
243
  end
183
244
 
184
245
  def build_frontmatter(timestamp:, package:, overall_status:, tests_run:, executed_at:, skipped: 0)
@@ -194,82 +255,165 @@ module Ace
194
255
  FRONTMATTER
195
256
  end
196
257
 
197
- def build_header(package:, tests_run:, executed_date:, skipped: 0)
198
- skipped_info = (skipped > 0) ? " (#{skipped} skipped)" : ""
258
+ def build_header(package:)
199
259
  <<~HEADER
200
- # E2E Test Suite Report
201
-
202
- **Package:** #{package}
203
- **Tests:** #{tests_run}#{skipped_info}
204
- **Executed:** #{executed_date}
260
+ # E2E Suite Report: `#{package}`
205
261
  HEADER
206
262
  end
207
263
 
208
- def build_summary_table(results, scenarios)
209
- rows = results.each_with_index.map do |result, i|
210
- scenario = scenarios[i]
211
- status_label = result.status.capitalize
212
- passed = result.skipped? ? "-" : result.passed_count.to_s
213
- failed = result.skipped? ? "-" : result.failed_count.to_s
214
- total = result.skipped? ? "-" : result.total_count.to_s
215
- "| #{result.test_id} | #{scenario.title} | #{status_label} | #{passed} | #{failed} | #{total} |"
264
+ def build_summary_table(results_data)
265
+ rows = results_data.map do |result|
266
+ status_label = result[:status].capitalize
267
+ passed = (result[:status] == "skip") ? "-" : result[:passed].to_s
268
+ failed = (result[:status] == "skip") ? "-" : result[:failed].to_s
269
+ total = (result[:status] == "skip") ? "-" : result[:total].to_s
270
+ "| #{result[:test_id]} | #{result[:title]} | #{status_label} | #{passed} | #{failed} | #{total} |"
216
271
  end
217
272
 
218
273
  <<~TABLE
219
- ## Summary
274
+ ## Summary Table
220
275
 
221
276
  | Test ID | Title | Status | Passed | Failed | Total |
222
- |---------|-------|--------|--------|--------|-------|
277
+ |---|---|---:|---:|---:|---:|
223
278
  #{rows.join("\n")}
224
279
  TABLE
225
280
  end
226
281
 
227
282
  def build_overall_line(total_passed:, total_tc:)
228
- pct = (total_tc > 0) ? (total_passed * 100.0 / total_tc).round(0) : 0
229
- "**Overall:** #{total_passed}/#{total_tc} test cases passed (#{pct}%)\n"
283
+ pct = (total_tc > 0) ? (total_passed * 100.0 / total_tc).round(1) : 0.0
284
+ formatted_pct = (pct % 1).zero? ? pct.to_i.to_s : format("%.1f", pct)
285
+ <<~OVERALL
286
+ ## Overall Line
287
+
288
+ **Overall:** #{total_passed}/#{total_tc} test cases passed (#{formatted_pct}%)
289
+ OVERALL
230
290
  end
231
291
 
232
- def build_failed_section(results, scenarios)
292
+ def build_failed_section(results_data)
233
293
  parts = ["\n## Failed Tests\n"]
234
294
 
235
- results.each_with_index do |result, i|
236
- next if result.success? || result.skipped?
295
+ results_data.each do |result|
296
+ next unless result[:failed].positive?
237
297
 
238
- scenario = scenarios[i]
239
- parts << "### #{result.test_id}: #{scenario.title} (#{result.passed_count}/#{result.total_count})\n"
298
+ parts << "### #{result[:test_id]}"
299
+ parts << ""
300
+ parts << "**Failed test case details**"
240
301
 
241
- failed_tcs = result.test_cases.select { |tc| tc[:status] == "fail" }
302
+ failed_tcs = result[:test_cases].select { |tc| tc[:status] == "fail" }
242
303
  if failed_tcs.any?
243
- parts << "**Failed Test Cases:**"
244
304
  failed_tcs.each do |tc|
245
- parts << "- #{tc[:id]}: #{tc[:description]}"
305
+ category = tc[:category] || "runner-error"
306
+ details = tc[:notes].to_s.strip
307
+ details = tc[:description].to_s if details.empty?
308
+ parts << "- `#{tc[:id]}` (#{category}) — #{details}"
246
309
  end
247
- parts << ""
310
+ else
311
+ parts << "- Exact failed TC mapping unavailable in aggregate view — see scenario report for canonical details."
248
312
  end
249
313
 
250
- if result.report_dir
251
- parts << "**Report:** #{result.report_dir}\n"
314
+ if result[:report_dir_name]
315
+ parts << ""
316
+ parts << "**Report directory:** `#{result[:report_dir_name]}`"
252
317
  end
318
+ parts << ""
253
319
  end
254
320
 
255
321
  parts.join("\n")
256
322
  end
257
323
 
258
- def build_reports_section(results, scenarios)
259
- rows = results.each_with_index.map do |result, i|
260
- dir = result.report_dir ? File.basename(result.report_dir) : "N/A"
261
- "| #{result.test_id} | #{dir} |"
324
# Render the "Runner Diagnostics" section when the run left the working tree dirty.
#
# Both symbol and string keys are honoured (`dirty_worktree`,
# `new_tracked_entries`), matching how other hashes in this writer are read.
# Blank entries are dropped so they do not render as empty bullets; entry text
# itself is emitted unchanged.
#
# @param diagnostics [Hash, nil] Runner diagnostics
# @return [String, nil] Markdown section, or nil when there is nothing to report
def build_runner_diagnostics_section(diagnostics)
  return nil unless diagnostics.is_a?(Hash)
  return nil unless diagnostics[:dirty_worktree] || diagnostics["dirty_worktree"]

  raw_entries = diagnostics[:new_tracked_entries] || diagnostics["new_tracked_entries"]
  entries = Array(raw_entries)
    .map(&:to_s)
    .reject { |line| line.strip.empty? }
    .map { |line| "- `#{line}`" }
    .join("\n")
  entries = "- No specific entries captured." if entries.empty?

  <<~SECTION
    ## Runner Diagnostics

    Suite execution introduced new tracked working-tree changes relative to the pre-run snapshot.

    #{entries}
  SECTION
end
338
+
339
+ def build_narrative_section(title, content)
340
+ return nil if content.to_s.strip.empty?
341
+
342
+ <<~SECTION
343
+ ## #{title}
344
+
345
+ #{content.to_s.strip}
346
+ SECTION
347
+ end
348
+
349
+ def build_reports_section(results_data)
350
+ rows = results_data.map do |result|
351
+ dir = result[:report_dir_name] || "N/A"
352
+ "| #{result[:test_id]} | `#{dir}` |"
262
353
  end
263
354
 
264
355
  <<~SECTION
265
356
 
266
- ## Reports
357
+ ## Reports Table
267
358
 
268
- | Test ID | Reports Folder |
269
- |---------|----------------|
359
+ | Test ID | Report Directory |
360
+ |---|---|
270
361
  #{rows.join("\n")}
271
362
  SECTION
272
363
  end
364
+
365
+ def extract_narrative_sections(report_text)
366
+ text = report_text.to_s
367
+ sections = {
368
+ friction: extract_markdown_section(text, "Friction Analysis"),
369
+ improvements: extract_markdown_section(text, "Improvement Suggestions"),
370
+ positive: extract_markdown_section(text, "Positive Observations")
371
+ }
372
+
373
+ fallback = strip_canonical_sections(text)
374
+ has_markdown_sections = text.match?(/^\#{2,3}\s+/)
375
+ sections[:positive] = fallback if sections.values.all? { |value| value.to_s.strip.empty? } &&
376
+ !fallback.empty? && !has_markdown_sections
377
+ sections
378
+ end
379
+
380
+ def extract_markdown_section(text, heading)
381
+ match = text.match(/^\#{2,3}\s+#{Regexp.escape(heading)}\s*$\n?(.*?)(?=^\#{1,3}\s|\z)/mi)
382
+ return "" unless match
383
+
384
+ match[1].to_s.strip
385
+ end
386
+
387
+ def strip_canonical_sections(text)
388
+ body = text.to_s.dup
389
+ body.sub!(/\A---.*?^---\s*/m, "")
390
+ body.gsub!(/^\#{1,3}\s+.*$/, "")
391
+ body.gsub!(/^\|.*\|\s*$/, "")
392
+ body.gsub!(/^\*\*Overall:\*\*.*$/, "")
393
+ body.lines.map(&:rstrip).reject(&:empty?).join("\n").strip
394
+ end
395
+
396
# Deterministic narrative used when LLM synthesis is unavailable.
#
# - friction / improvements: one bullet per scenario with failing test cases.
# - positive: one bullet per scenario that actually ran test cases and had no
#   failures. Skipped scenarios and scenarios with zero test cases are left
#   out, since "passed 0/0" is not a positive observation.
#
# A scenario without a report directory name gets a generic pointer instead of
# an empty pair of backticks.
#
# @param results_data [Array<Hash>] Entries with :test_id, :status, :passed,
#   :failed, :total and :report_dir_name
# @return [Hash{Symbol=>String}] :friction, :improvements, :positive (each may be "")
def fallback_narrative_sections(results_data)
  failed_results, clean_results = results_data.partition { |result| result[:failed].positive? }
  passing_results = clean_results.select do |result|
    result[:status].to_s != "skip" && result[:total].positive?
  end

  friction = failed_results.map do |result|
    pointer =
      if result[:report_dir_name]
        "inspect `#{result[:report_dir_name]}` for scenario details."
      else
        "inspect its scenario report for details."
      end
    "- #{result[:test_id]} had #{result[:failed]} failing test case(s); #{pointer}"
  end

  improvements = failed_results.map do |result|
    "- Re-run #{result[:test_id]} after the targeted fix and confirm the failing test case set is empty."
  end

  positive = passing_results.map do |result|
    "- #{result[:test_id]} passed #{result[:passed]}/#{result[:total]} test cases."
  end

  {
    friction: friction.join("\n"),
    improvements: improvements.join("\n"),
    positive: positive.join("\n")
  }
end
411
+
412
# Return the scenario paired with a result, or a minimal stand-in when the
# scenarios list has no entry at that index.
#
# The stand-in only exposes `title`: "Preflight" when the result's metadata
# marks the phase as "preflight" (symbol or string key), otherwise the test id.
# Results with nil or missing metadata fall through to the test id instead of
# raising.
#
# @param result [#test_id] Test result; `metadata` is read when available
# @param scenarios [Array] Scenarios in the same order as the results
# @param index [Integer] Position of the result
# @return [Object] The matching scenario or an OpenStruct responding to `title`
def scenario_for_result(result, scenarios, index)
  scenario = scenarios[index]
  return scenario if scenario

  metadata = result.respond_to?(:metadata) ? result.metadata : nil
  preflight = metadata.is_a?(Hash) &&
    (metadata[:phase] == "preflight" || metadata["phase"] == "preflight")

  OpenStruct.new(title: preflight ? "Preflight" : result.test_id)
end
273
417
  end
274
418
  end
275
419
  end
@@ -6,16 +6,18 @@ module Ace
6
6
  module Test
7
7
  module EndToEndRunner
8
8
  module Molecules
9
- # Discovers E2E test scenario directories (TS-*/scenario.yml) in packages
9
+ # Discovers deterministic preflight tests and agent E2E scenarios in packages
10
10
  #
11
11
  # Finds test scenarios in the TS-format directory structure:
12
+ # {package}/test/feat/**/*_test.rb
12
13
  # {package}/test/e2e/TS-*/scenario.yml
13
14
  #
14
15
  # Note: This is a Molecule (not an Atom) because it performs filesystem
15
16
  # I/O via Dir.glob.
16
17
  class TestDiscoverer
17
- TEST_DIR = "test/e2e"
18
+ TEST_DIRS = ["test/e2e"].freeze
18
19
  SCENARIO_FILE = "scenario.yml"
20
+ DEFAULT_PREFLIGHT_GLOBS = ["test/feat/**/*_test.rb"].freeze
19
21
  SCENARIO_DIR_PATTERN = "TS-*"
20
22
 
21
23
  # Find E2E test scenario files matching criteria
@@ -47,6 +49,17 @@ module Ace
47
49
  ).map(&:file_path).sort
48
50
  end
49
51
 
52
+ # @return [Array<String>] Sorted list of matching deterministic preflight test files
53
+ def find_integration_tests(package:, base_dir: Dir.pwd)
54
+ package_path = File.join(base_dir, package)
55
+ preflight_globs.each do |glob|
56
+ files = Dir.glob(File.join(package_path, glob)).sort
57
+ return files unless files.empty?
58
+ end
59
+
60
+ []
61
+ end
62
+
50
63
  # Find TS-format scenario directories and load them as TestScenario models
51
64
  #
52
65
  # @param package [String] Package name
@@ -56,9 +69,11 @@ module Ace
56
69
  # @param base_dir [String] Base directory to search from
57
70
  # @return [Array<Models::TestScenario>] Loaded scenario models with test_cases
58
71
  def find_scenarios(package:, test_id: nil, tags: nil, exclude_tags: nil, base_dir: Dir.pwd)
59
- test_dir = File.join(base_dir, package, TEST_DIR)
60
- pattern = File.join(test_dir, SCENARIO_DIR_PATTERN, SCENARIO_FILE)
61
- scenario_files = Dir.glob(pattern).sort
72
+ patterns = TEST_DIRS.map do |test_dir_name|
73
+ test_dir = File.join(base_dir, package, test_dir_name)
74
+ File.join(test_dir, SCENARIO_DIR_PATTERN, SCENARIO_FILE)
75
+ end
76
+ scenario_files = Dir.glob(patterns).sort
62
77
 
63
78
  loader = ScenarioLoader.new
64
79
  scenarios = scenario_files.map do |yml_path|
@@ -82,11 +97,13 @@ module Ace
82
97
  # @param base_dir [String] Base directory to search from
83
98
  # @return [Array<String>] Sorted list of package names
84
99
  def list_packages(base_dir: Dir.pwd)
85
- pattern = File.join(base_dir, "*/#{TEST_DIR}/#{SCENARIO_DIR_PATTERN}/#{SCENARIO_FILE}")
100
+ patterns = TEST_DIRS.map do |test_dir_name|
101
+ File.join(base_dir, "*/#{test_dir_name}/#{SCENARIO_DIR_PATTERN}/#{SCENARIO_FILE}")
102
+ end
86
103
 
87
104
  base = Pathname.new(base_dir)
88
105
 
89
- Dir.glob(pattern)
106
+ Dir.glob(patterns)
90
107
  .map { |f| Pathname.new(f).relative_path_from(base).each_filename.first }
91
108
  .uniq
92
109
  .sort
@@ -96,12 +113,14 @@ module Ace
96
113
 
97
114
  # Build glob pattern for finding TS-format scenario.yml files
98
115
  def build_scenario_pattern(package, test_id, base_dir)
99
- test_dir = File.join(base_dir, package, TEST_DIR)
100
-
101
- if test_id
102
- File.join(test_dir, "*#{test_id}*", SCENARIO_FILE)
103
- else
104
- File.join(test_dir, SCENARIO_DIR_PATTERN, SCENARIO_FILE)
116
+ TEST_DIRS.map do |test_dir_name|
117
+ test_dir = File.join(base_dir, package, test_dir_name)
118
+
119
+ if test_id
120
+ File.join(test_dir, "*#{test_id}*", SCENARIO_FILE)
121
+ else
122
+ File.join(test_dir, SCENARIO_DIR_PATTERN, SCENARIO_FILE)
123
+ end
105
124
  end
106
125
  end
107
126
 
@@ -129,6 +148,12 @@ module Ace
129
148
 
130
149
  filtered
131
150
  end
151
+
152
# Glob patterns used to locate preflight test files, in priority order:
# configured patterns (`patterns.preflight` from ConfigLoader) first, then
# DEFAULT_PREFLIGHT_GLOBS.
#
# The configured value may be a single string or a list. It is flattened so a
# list is not passed on as one nested element (File.join would merge a nested
# array into a single path). Non-string and blank values are ignored.
#
# @return [Array<String>] Unique, non-empty glob patterns
def preflight_globs
  configured = Molecules::ConfigLoader.load.dig("patterns", "preflight")

  [*Array(configured).flatten, *DEFAULT_PREFLIGHT_GLOBS]
    .grep(String)
    .reject { |glob| glob.strip.empty? }
    .uniq
end
132
157
  end
133
158
  end
134
159
  end
@@ -16,12 +16,18 @@ module Ace
16
16
  # @param provider [String] LLM provider:model string
17
17
  # @param timeout [Integer] Request timeout in seconds
18
18
  # @param config [Hash] Configuration hash (string keys) from ConfigLoader
19
- def initialize(provider: nil, timeout: nil, config: nil)
19
+ def initialize(provider: nil, timeout: nil, config: nil, sandbox_backend_factory: nil)
20
20
  config ||= Molecules::ConfigLoader.load
21
- @provider = provider || config.dig("execution", "provider") || "claude:sonnet"
21
+ @provider = provider || config.dig("execution", "runner_provider") ||
22
+ config.dig("execution", "provider") || "claude:sonnet"
23
+ @verifier_provider = config.dig("execution", "verifier_provider") ||
24
+ config.dig("execution", "provider") || @provider
22
25
  @timeout = timeout || config.dig("execution", "timeout") || 300
23
26
  @prompt_builder = Atoms::PromptBuilder.new
24
27
  @cli_provider_adapter = Atoms::CliProviderAdapter.new(config)
28
+ @sandbox_backend_factory = sandbox_backend_factory || lambda { |sandbox_path, source_root: nil|
29
+ Molecules::BwrapSandboxBackend.new(sandbox_root: sandbox_path, source_root: source_root)
30
+ }
25
31
  end
26
32
 
27
33
  # Execute a single test scenario via LLM
@@ -192,9 +198,10 @@ module Ace
192
198
  # Execute TC via skill invocation for CLI providers
193
199
  def execute_tc_via_skill(test_case, sandbox_path, scenario, cli_args: nil, run_id: nil, env_vars: nil)
194
200
  with_tc_error_handling(scenario) do |started_at|
201
+ sandbox_backend, prepared_env = prepared_env_for(sandbox_path, env_vars)
195
202
  prompt = @cli_provider_adapter.build_tc_skill_prompt(
196
203
  test_case: test_case, scenario: scenario,
197
- sandbox_path: sandbox_path, run_id: run_id, env_vars: env_vars
204
+ sandbox_path: sandbox_path, run_id: run_id, env_vars: prepared_env
198
205
  )
199
206
 
200
207
  response = Ace::LLM::QueryInterface.query(
@@ -202,7 +209,8 @@ module Ace
202
209
  system: nil, cli_args: cli_args,
203
210
  timeout: @timeout, fallback: false,
204
211
  working_dir: sandbox_path,
205
- subprocess_env: env_vars
212
+ subprocess_env: prepared_env,
213
+ subprocess_command_prefix: sandbox_backend.command_prefix(chdir: sandbox_path, env: prepared_env)
206
214
  )
207
215
 
208
216
  invocation_error = detect_skill_invocation_error(response[:text])
@@ -322,9 +330,23 @@ module Ace
322
330
  @pipeline_executors ||= {}
323
331
  @pipeline_executors[timeout] ||= Molecules::PipelineExecutor.new(
324
332
  provider: @provider,
325
- timeout: timeout
333
+ verifier_provider: @verifier_provider,
334
+ timeout: timeout,
335
+ sandbox_backend_factory: @sandbox_backend_factory
326
336
  )
327
337
  end
338
+
339
+ def build_sandbox_backend(sandbox_path, env_vars)
340
+ @sandbox_backend_factory.call(
341
+ sandbox_path,
342
+ source_root: env_vars&.dig("ACE_E2E_SOURCE_ROOT") || env_vars&.dig(:ACE_E2E_SOURCE_ROOT)
343
+ )
344
+ end
345
+
346
+ def prepared_env_for(sandbox_path, env_vars)
347
+ sandbox_backend = build_sandbox_backend(sandbox_path, env_vars || {})
348
+ [sandbox_backend, sandbox_backend.prepared_env(env_vars || {})]
349
+ end
328
350
  end
329
351
  end
330
352
  end