ace-test-runner-e2e 0.29.8 → 0.38.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.ace-defaults/e2e-runner/config.yml +14 -2
- data/CHANGELOG.md +178 -0
- data/README.md +2 -2
- data/exe/ace-test-e2e-sh +9 -4
- data/handbook/guides/e2e-testing.g.md +43 -9
- data/handbook/guides/scenario-yml-reference.g.md +16 -8
- data/handbook/guides/tc-authoring.g.md +12 -5
- data/handbook/skills/as-e2e-fix/SKILL.md +2 -2
- data/handbook/skills/as-e2e-review/SKILL.md +2 -2
- data/handbook/templates/ace-taskflow-fixture.template.md +17 -17
- data/handbook/templates/agent-experience-report.template.md +3 -2
- data/handbook/templates/scenario.yml.template.yml +7 -2
- data/handbook/templates/tc-file.template.md +14 -4
- data/handbook/workflow-instructions/e2e/analyze-failures.wf.md +53 -6
- data/handbook/workflow-instructions/e2e/create.wf.md +118 -25
- data/handbook/workflow-instructions/e2e/execute.wf.md +11 -7
- data/handbook/workflow-instructions/e2e/fix.wf.md +65 -15
- data/handbook/workflow-instructions/e2e/plan-changes.wf.md +17 -1
- data/handbook/workflow-instructions/e2e/review.wf.md +36 -25
- data/handbook/workflow-instructions/e2e/rewrite.wf.md +15 -8
- data/handbook/workflow-instructions/e2e/run.wf.md +50 -26
- data/handbook/workflow-instructions/e2e/setup-sandbox.wf.md +4 -4
- data/lib/ace/test/end_to_end_runner/atoms/skill_prompt_builder.rb +7 -5
- data/lib/ace/test/end_to_end_runner/atoms/skill_result_parser.rb +73 -7
- data/lib/ace/test/end_to_end_runner/cli/commands/run_test.rb +21 -8
- data/lib/ace/test/end_to_end_runner/models/test_case.rb +8 -2
- data/lib/ace/test/end_to_end_runner/models/test_result.rb +9 -3
- data/lib/ace/test/end_to_end_runner/models/test_scenario.rb +4 -2
- data/lib/ace/test/end_to_end_runner/molecules/affected_detector.rb +7 -2
- data/lib/ace/test/end_to_end_runner/molecules/bwrap_sandbox_backend.rb +271 -0
- data/lib/ace/test/end_to_end_runner/molecules/config_loader.rb +28 -1
- data/lib/ace/test/end_to_end_runner/molecules/integration_runner.rb +122 -0
- data/lib/ace/test/end_to_end_runner/molecules/pipeline_executor.rb +157 -16
- data/lib/ace/test/end_to_end_runner/molecules/pipeline_prompt_bundler.rb +121 -8
- data/lib/ace/test/end_to_end_runner/molecules/pipeline_report_generator.rb +91 -19
- data/lib/ace/test/end_to_end_runner/molecules/pipeline_sandbox_builder.rb +119 -18
- data/lib/ace/test/end_to_end_runner/molecules/report_writer.rb +13 -12
- data/lib/ace/test/end_to_end_runner/molecules/sandbox_runtime_builder.rb +282 -0
- data/lib/ace/test/end_to_end_runner/molecules/scenario_loader.rb +85 -5
- data/lib/ace/test/end_to_end_runner/molecules/setup_executor.rb +98 -16
- data/lib/ace/test/end_to_end_runner/molecules/suite_report_writer.rb +241 -97
- data/lib/ace/test/end_to_end_runner/molecules/test_discoverer.rb +38 -13
- data/lib/ace/test/end_to_end_runner/molecules/test_executor.rb +27 -5
- data/lib/ace/test/end_to_end_runner/organisms/suite_orchestrator.rb +73 -15
- data/lib/ace/test/end_to_end_runner/organisms/test_orchestrator.rb +120 -19
- data/lib/ace/test/end_to_end_runner/version.rb +1 -1
- data/lib/ace/test/end_to_end_runner.rb +2 -0
- metadata +19 -2
|
@@ -1,7 +1,10 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require "fileutils"
|
|
4
|
+
require "ostruct"
|
|
4
5
|
require "yaml"
|
|
6
|
+
require "set"
|
|
7
|
+
require "date"
|
|
5
8
|
require "ace/llm"
|
|
6
9
|
require "ace/llm/query_interface"
|
|
7
10
|
|
|
@@ -9,7 +12,7 @@ module Ace
|
|
|
9
12
|
module Test
|
|
10
13
|
module EndToEndRunner
|
|
11
14
|
module Molecules
|
|
12
|
-
# Writes
|
|
15
|
+
# Writes an aggregated package or suite report
|
|
13
16
|
#
|
|
14
17
|
# Uses LLM synthesis to generate rich reports with root cause analysis,
|
|
15
18
|
# friction insights, and improvement suggestions. Falls back to a static
|
|
@@ -22,7 +25,12 @@ module Ace
|
|
|
22
25
|
@timeout = reporting["timeout"] || 60
|
|
23
26
|
end
|
|
24
27
|
|
|
25
|
-
|
|
28
|
+
REPORT_KINDS = {
|
|
29
|
+
package: ->(timestamp, package) { "#{timestamp}-#{package}-report.md" },
|
|
30
|
+
suite: ->(timestamp, _package) { "#{timestamp}-suite-report.md" }
|
|
31
|
+
}.freeze
|
|
32
|
+
|
|
33
|
+
# Write an aggregated report
|
|
26
34
|
#
|
|
27
35
|
# @param results [Array<Models::TestResult>] Test results (ordered)
|
|
28
36
|
# @param scenarios [Array<Models::TestScenario>] Corresponding scenarios
|
|
@@ -30,22 +38,31 @@ module Ace
|
|
|
30
38
|
# @param timestamp [String] Timestamp ID for this run
|
|
31
39
|
# @param base_dir [String] Base directory for cache output
|
|
32
40
|
# @return [String] Path to the written report file
|
|
33
|
-
def write(results, scenarios, package:, timestamp:, base_dir:)
|
|
41
|
+
def write(results, scenarios, package:, timestamp:, base_dir:, report_kind: :package, diagnostics: nil)
|
|
34
42
|
cache_dir = File.join(base_dir, ".ace-local", "test-e2e")
|
|
35
43
|
FileUtils.mkdir_p(cache_dir)
|
|
36
44
|
|
|
37
|
-
report_path = File.join(cache_dir,
|
|
45
|
+
report_path = File.join(cache_dir, report_filename(report_kind, timestamp, package))
|
|
38
46
|
|
|
39
47
|
overall_status = compute_status(results)
|
|
40
48
|
executed_at = Time.now.utc.strftime("%Y-%m-%dT%H:%M:%SZ")
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
49
|
+
results_data = build_results_data(results, scenarios)
|
|
50
|
+
narrative_sections = synthesize_narrative_sections(
|
|
51
|
+
results_data,
|
|
44
52
|
package: package,
|
|
45
53
|
timestamp: timestamp,
|
|
46
54
|
overall_status: overall_status,
|
|
47
55
|
executed_at: executed_at
|
|
48
56
|
)
|
|
57
|
+
content = build_report(
|
|
58
|
+
results_data,
|
|
59
|
+
package: package,
|
|
60
|
+
timestamp: timestamp,
|
|
61
|
+
overall_status: overall_status,
|
|
62
|
+
executed_at: executed_at,
|
|
63
|
+
narrative_sections: narrative_sections,
|
|
64
|
+
diagnostics: diagnostics
|
|
65
|
+
)
|
|
49
66
|
|
|
50
67
|
File.write(report_path, content)
|
|
51
68
|
report_path
|
|
@@ -53,10 +70,16 @@ module Ace
|
|
|
53
70
|
|
|
54
71
|
private
|
|
55
72
|
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
73
|
+
# Resolve the output file name for the requested report kind.
#
# @param report_kind [Symbol, String] Key into REPORT_KINDS
# @param timestamp [String] Timestamp ID for this run
# @param package [String] Package name (some kinds ignore it)
# @return [String] File name for the report
# @raise [ArgumentError] When the kind has no registered builder
def report_filename(report_kind, timestamp, package)
  name_builder = REPORT_KINDS.fetch(report_kind.to_sym) do
    raise ArgumentError, "Unknown report kind: #{report_kind}"
  end

  name_builder.call(timestamp, package)
end
|
|
79
|
+
|
|
80
|
+
# Attempt LLM synthesis for narrative sections only, falling back to
|
|
81
|
+
# deterministic defaults when the model is unavailable or malformed.
|
|
82
|
+
def synthesize_narrative_sections(results_data, package:, timestamp:, overall_status:, executed_at:)
|
|
60
83
|
prompt_builder = Atoms::SuiteReportPromptBuilder.new
|
|
61
84
|
user_prompt = prompt_builder.build(
|
|
62
85
|
results_data,
|
|
@@ -73,51 +96,35 @@ module Ace
|
|
|
73
96
|
timeout: @timeout,
|
|
74
97
|
temperature: 0.3
|
|
75
98
|
)
|
|
76
|
-
|
|
77
|
-
total_passed = results.sum(&:passed_count)
|
|
78
|
-
total_tc = results.sum(&:total_count)
|
|
79
|
-
validate_overall_line(response[:text], total_passed, total_tc)
|
|
99
|
+
extract_narrative_sections(response[:text])
|
|
80
100
|
rescue => e
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
executed_date = Time.now.utc.strftime("%Y-%m-%d")
|
|
84
|
-
total_passed = results.sum(&:passed_count)
|
|
85
|
-
total_failed = results.sum(&:failed_count)
|
|
86
|
-
total_tc = results.sum(&:total_count)
|
|
87
|
-
|
|
88
|
-
build_static_report(
|
|
89
|
-
results, scenarios,
|
|
90
|
-
package: package,
|
|
91
|
-
timestamp: timestamp,
|
|
92
|
-
overall_status: overall_status,
|
|
93
|
-
executed_at: executed_at,
|
|
94
|
-
executed_date: executed_date,
|
|
95
|
-
total_passed: total_passed,
|
|
96
|
-
total_failed: total_failed,
|
|
97
|
-
total_tc: total_tc
|
|
98
|
-
)
|
|
101
|
+
warn "Warning: LLM synthesis failed (#{e.class}: #{e.message}), using deterministic narrative" if ENV["DEBUG"]
|
|
102
|
+
fallback_narrative_sections(results_data)
|
|
99
103
|
end
|
|
100
104
|
|
|
101
105
|
# Read summary and experience report content from each result's report dir
|
|
102
106
|
def build_results_data(results, scenarios)
|
|
103
107
|
results.each_with_index.map do |result, i|
|
|
104
|
-
scenario = scenarios
|
|
108
|
+
scenario = scenario_for_result(result, scenarios, i)
|
|
105
109
|
report_dir = result.report_dir
|
|
106
110
|
|
|
107
111
|
summary_content = read_report_file(report_dir, "summary.r.md")
|
|
108
112
|
experience_content = read_report_file(report_dir, "experience.r.md")
|
|
109
113
|
|
|
114
|
+
report_metadata = read_report_frontmatter(report_dir)
|
|
115
|
+
|
|
110
116
|
{
|
|
111
117
|
test_id: result.test_id,
|
|
112
118
|
title: scenario.title,
|
|
113
119
|
status: result.status,
|
|
114
|
-
passed: result
|
|
115
|
-
failed: result
|
|
116
|
-
total: result
|
|
117
|
-
test_cases: result
|
|
120
|
+
passed: reported_count(report_metadata, result, "passed"),
|
|
121
|
+
failed: reported_count(report_metadata, result, "failed"),
|
|
122
|
+
total: reported_count(report_metadata, result, "total"),
|
|
123
|
+
test_cases: canonical_test_cases(report_metadata, result),
|
|
118
124
|
report_dir_name: report_dir ? File.basename(report_dir) : nil,
|
|
119
125
|
summary_content: summary_content,
|
|
120
|
-
experience_content: experience_content
|
|
126
|
+
experience_content: experience_content,
|
|
127
|
+
canonical_tc_source: !report_metadata.empty?
|
|
121
128
|
}
|
|
122
129
|
end
|
|
123
130
|
end
|
|
@@ -132,21 +139,71 @@ module Ace
|
|
|
132
139
|
File.read(path)
|
|
133
140
|
end
|
|
134
141
|
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
def validate_overall_line(report_text, expected_passed, expected_total)
|
|
138
|
-
expected_pct = (expected_total > 0) ? (expected_passed * 100.0 / expected_total).round(0) : 0
|
|
139
|
-
correct_line = "**Overall:** #{expected_passed}/#{expected_total} test cases passed (#{expected_pct}%)"
|
|
142
|
+
# Read the YAML frontmatter block at the top of a scenario's report.md.
#
# @param report_dir [String, nil] Directory expected to contain report.md
# @return [Hash] Parsed frontmatter. Empty when the directory is nil, the
#   file is missing, there is no leading frontmatter block, the YAML cannot
#   be loaded, or the document is not a mapping.
def read_report_frontmatter(report_dir)
  return {} unless report_dir

  path = File.join(report_dir, "report.md")
  return {} unless File.exist?(path)

  match = File.read(path).match(/\A---\s*\n(.*?)\n---\s*\n/m)
  return {} unless match

  data = YAML.safe_load(match[1], permitted_classes: [Time, Date])
  # A sequence or scalar is valid YAML but unusable here: callers index the
  # result by string key and call Hash#empty? on it.
  data.is_a?(Hash) ? data : {}
rescue StandardError
  # Best-effort read: an unreadable or malformed report falls back to the
  # in-memory result counts instead of aborting the aggregate report.
  {}
end
|
|
156
|
+
|
|
157
|
+
# Pick a test-case count, preferring the scenario report's frontmatter
# ("tcs-passed" / "tcs-failed" / "tcs-total") over the in-memory result.
#
# @param report_metadata [Hash] Frontmatter read from the scenario report
# @param result [#passed_count, #failed_count, #total_count] Test result
# @param kind [String] "passed", "failed", or anything else for the total
# @return [Integer] The reported count, or the result's own count when the
#   frontmatter value is absent or not integer-like
def reported_count(report_metadata, result, kind)
  fallback =
    case kind
    when "passed" then result.passed_count
    when "failed" then result.failed_count
    else result.total_count
    end

  # Frontmatter is hand- or LLM-written, so a count may arrive as a quoted
  # string or a placeholder. Callers sum these values and call #positive?
  # on them, so only integer-like values are allowed through.
  Integer(report_metadata["tcs-#{kind}"], exception: false) || fallback
end
|
|
167
|
+
|
|
168
|
+
# Build the per-test-case list from the scenario report's frontmatter,
# falling back to the in-memory result when the frontmatter carries no
# usable pass/fail detail.
#
# Frontmatter keys read: "failed" (list of hashes with "tc", "evidence",
# "category"), "canonical-failed-tcs" (list of ids), "passed" (list of ids).
#
# @param report_metadata [Hash] Frontmatter read from the scenario report
# @param result [#test_cases] Test result used as the fallback source
# @return [Array<Hash>] Passed entries followed by failed entries, or
#   result.test_cases when nothing canonical is available
def canonical_test_cases(report_metadata, result)
  return result.test_cases if report_metadata.empty?

  failed_entries = Array(report_metadata["failed"]).filter_map do |entry|
    next unless entry.is_a?(Hash)

    id = entry["tc"] || entry[:tc]
    next unless id

    {
      id: id,
      description: "",
      status: "fail",
      notes: entry["evidence"] || entry[:evidence] || "See scenario report for details",
      category: entry["category"] || entry[:category] || "runner-error"
    }
  end

  failed_ids = failed_entries.map { |entry| entry[:id] }.to_set
  Array(report_metadata["canonical-failed-tcs"]).each do |tc_id|
    # Set#add? returns nil when the id is already known. Recording each id
    # here prevents duplicate rows and keeps a canonically failed TC out of
    # the passed list below.
    next unless failed_ids.add?(tc_id)

    failed_entries << {
      id: tc_id,
      description: "",
      status: "fail",
      notes: "See scenario report for details",
      category: "runner-error"
    }
  end

  passed_entries = Array(report_metadata["passed"]).filter_map do |tc_id|
    # A failure record wins over a pass record for the same TC.
    next if failed_ids.include?(tc_id)

    {id: tc_id, description: "", status: "pass", notes: ""}
  end

  canonical = passed_entries + failed_entries
  canonical.empty? ? result.test_cases : canonical
end
|
|
151
208
|
|
|
152
209
|
def compute_status(results)
|
|
@@ -163,22 +220,26 @@ module Ace
|
|
|
163
220
|
end
|
|
164
221
|
end
|
|
165
222
|
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
223
|
+
def build_report(results_data, package:, timestamp:, overall_status:, executed_at:, narrative_sections:, diagnostics:)
|
|
224
|
+
total_skipped = results_data.count { |r| r[:status] == "skip" }
|
|
225
|
+
total_passed = results_data.sum { |r| r[:passed] }
|
|
226
|
+
total_tc = results_data.sum { |r| r[:total] }
|
|
170
227
|
|
|
171
228
|
parts = []
|
|
172
229
|
parts << build_frontmatter(
|
|
173
230
|
timestamp: timestamp, package: package, overall_status: overall_status,
|
|
174
|
-
tests_run:
|
|
231
|
+
tests_run: results_data.size, executed_at: executed_at, skipped: total_skipped
|
|
175
232
|
)
|
|
176
|
-
parts << build_header(package: package
|
|
177
|
-
parts << build_summary_table(
|
|
233
|
+
parts << build_header(package: package)
|
|
234
|
+
parts << build_summary_table(results_data)
|
|
178
235
|
parts << build_overall_line(total_passed: total_passed, total_tc: total_tc)
|
|
179
|
-
parts << build_failed_section(
|
|
180
|
-
parts <<
|
|
181
|
-
parts
|
|
236
|
+
parts << build_failed_section(results_data) if results_data.any? { |r| r[:failed].positive? }
|
|
237
|
+
parts << build_runner_diagnostics_section(diagnostics)
|
|
238
|
+
parts << build_narrative_section("Friction Analysis", narrative_sections[:friction])
|
|
239
|
+
parts << build_narrative_section("Improvement Suggestions", narrative_sections[:improvements])
|
|
240
|
+
parts << build_narrative_section("Positive Observations", narrative_sections[:positive])
|
|
241
|
+
parts << build_reports_section(results_data)
|
|
242
|
+
parts.compact.join("\n")
|
|
182
243
|
end
|
|
183
244
|
|
|
184
245
|
def build_frontmatter(timestamp:, package:, overall_status:, tests_run:, executed_at:, skipped: 0)
|
|
@@ -194,82 +255,165 @@ module Ace
|
|
|
194
255
|
FRONTMATTER
|
|
195
256
|
end
|
|
196
257
|
|
|
197
|
-
def build_header(package
|
|
198
|
-
skipped_info = (skipped > 0) ? " (#{skipped} skipped)" : ""
|
|
258
|
+
def build_header(package:)
|
|
199
259
|
<<~HEADER
|
|
200
|
-
# E2E
|
|
201
|
-
|
|
202
|
-
**Package:** #{package}
|
|
203
|
-
**Tests:** #{tests_run}#{skipped_info}
|
|
204
|
-
**Executed:** #{executed_date}
|
|
260
|
+
# E2E Suite Report: `#{package}`
|
|
205
261
|
HEADER
|
|
206
262
|
end
|
|
207
263
|
|
|
208
|
-
def build_summary_table(
|
|
209
|
-
rows =
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
"| #{result.test_id} | #{scenario.title} | #{status_label} | #{passed} | #{failed} | #{total} |"
|
|
264
|
+
def build_summary_table(results_data)
|
|
265
|
+
rows = results_data.map do |result|
|
|
266
|
+
status_label = result[:status].capitalize
|
|
267
|
+
passed = (result[:status] == "skip") ? "-" : result[:passed].to_s
|
|
268
|
+
failed = (result[:status] == "skip") ? "-" : result[:failed].to_s
|
|
269
|
+
total = (result[:status] == "skip") ? "-" : result[:total].to_s
|
|
270
|
+
"| #{result[:test_id]} | #{result[:title]} | #{status_label} | #{passed} | #{failed} | #{total} |"
|
|
216
271
|
end
|
|
217
272
|
|
|
218
273
|
<<~TABLE
|
|
219
|
-
## Summary
|
|
274
|
+
## Summary Table
|
|
220
275
|
|
|
221
276
|
| Test ID | Title | Status | Passed | Failed | Total |
|
|
222
|
-
|
|
277
|
+
|---|---|---:|---:|---:|---:|
|
|
223
278
|
#{rows.join("\n")}
|
|
224
279
|
TABLE
|
|
225
280
|
end
|
|
226
281
|
|
|
227
282
|
def build_overall_line(total_passed:, total_tc:)
|
|
228
|
-
pct = (total_tc > 0) ? (total_passed * 100.0 / total_tc).round(
|
|
229
|
-
|
|
283
|
+
pct = (total_tc > 0) ? (total_passed * 100.0 / total_tc).round(1) : 0.0
|
|
284
|
+
formatted_pct = (pct % 1).zero? ? pct.to_i.to_s : format("%.1f", pct)
|
|
285
|
+
<<~OVERALL
|
|
286
|
+
## Overall Line
|
|
287
|
+
|
|
288
|
+
**Overall:** #{total_passed}/#{total_tc} test cases passed (#{formatted_pct}%)
|
|
289
|
+
OVERALL
|
|
230
290
|
end
|
|
231
291
|
|
|
232
|
-
def build_failed_section(
|
|
292
|
+
def build_failed_section(results_data)
|
|
233
293
|
parts = ["\n## Failed Tests\n"]
|
|
234
294
|
|
|
235
|
-
|
|
236
|
-
next
|
|
295
|
+
results_data.each do |result|
|
|
296
|
+
next unless result[:failed].positive?
|
|
237
297
|
|
|
238
|
-
|
|
239
|
-
parts << "
|
|
298
|
+
parts << "### #{result[:test_id]}"
|
|
299
|
+
parts << ""
|
|
300
|
+
parts << "**Failed test case details**"
|
|
240
301
|
|
|
241
|
-
failed_tcs = result
|
|
302
|
+
failed_tcs = result[:test_cases].select { |tc| tc[:status] == "fail" }
|
|
242
303
|
if failed_tcs.any?
|
|
243
|
-
parts << "**Failed Test Cases:**"
|
|
244
304
|
failed_tcs.each do |tc|
|
|
245
|
-
|
|
305
|
+
category = tc[:category] || "runner-error"
|
|
306
|
+
details = tc[:notes].to_s.strip
|
|
307
|
+
details = tc[:description].to_s if details.empty?
|
|
308
|
+
parts << "- `#{tc[:id]}` (#{category}) — #{details}"
|
|
246
309
|
end
|
|
247
|
-
|
|
310
|
+
else
|
|
311
|
+
parts << "- Exact failed TC mapping unavailable in aggregate view — see scenario report for canonical details."
|
|
248
312
|
end
|
|
249
313
|
|
|
250
|
-
if result
|
|
251
|
-
parts << "
|
|
314
|
+
if result[:report_dir_name]
|
|
315
|
+
parts << ""
|
|
316
|
+
parts << "**Report directory:** `#{result[:report_dir_name]}`"
|
|
252
317
|
end
|
|
318
|
+
parts << ""
|
|
253
319
|
end
|
|
254
320
|
|
|
255
321
|
parts.join("\n")
|
|
256
322
|
end
|
|
257
323
|
|
|
258
|
-
def
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
324
|
+
# Render the "Runner Diagnostics" section when the run left the working
# tree dirty.
#
# Keys are accepted as symbols or strings, matching how other helpers in
# this class read externally produced hashes.
#
# @param diagnostics [Hash, nil] Expected keys: dirty_worktree (truthy when
#   new tracked changes appeared) and new_tracked_entries (list of lines)
# @return [String, nil] Markdown section, or nil when there is nothing to
#   report
def build_runner_diagnostics_section(diagnostics)
  return nil unless diagnostics.is_a?(Hash)
  return nil unless diagnostics[:dirty_worktree] || diagnostics["dirty_worktree"]

  tracked = diagnostics[:new_tracked_entries] || diagnostics["new_tracked_entries"]
  entries = Array(tracked).map { |line| "- `#{line}`" }.join("\n")
  entries = "- No specific entries captured." if entries.empty?

  <<~SECTION
    ## Runner Diagnostics

    Suite execution introduced new tracked working-tree changes relative to the pre-run snapshot.

    #{entries}
  SECTION
end
|
|
338
|
+
|
|
339
|
+
# Render one narrative section as a level-2 markdown heading plus body.
#
# @param title [String] Section heading text
# @param content [#to_s, nil] Section body; surrounding whitespace is trimmed
# @return [String, nil] Markdown block, or nil when the body is blank
def build_narrative_section(title, content)
  body = content.to_s.strip
  return nil if body.empty?

  "## #{title}\n\n#{body}\n"
end
|
|
348
|
+
|
|
349
|
+
def build_reports_section(results_data)
|
|
350
|
+
rows = results_data.map do |result|
|
|
351
|
+
dir = result[:report_dir_name] || "N/A"
|
|
352
|
+
"| #{result[:test_id]} | `#{dir}` |"
|
|
262
353
|
end
|
|
263
354
|
|
|
264
355
|
<<~SECTION
|
|
265
356
|
|
|
266
|
-
## Reports
|
|
357
|
+
## Reports Table
|
|
267
358
|
|
|
268
|
-
| Test ID |
|
|
269
|
-
|
|
359
|
+
| Test ID | Report Directory |
|
|
360
|
+
|---|---|
|
|
270
361
|
#{rows.join("\n")}
|
|
271
362
|
SECTION
|
|
272
363
|
end
|
|
364
|
+
|
|
365
|
+
# Pull the three narrative sections out of the LLM response text.
#
# When none of the expected headings yields content and the text has no
# level-2/3 markdown headings at all, the remaining free text (after
# stripping canonical parts) is used as the positive section.
#
# @param report_text [#to_s, nil] Raw LLM response
# @return [Hash{Symbol => String}] Keys :friction, :improvements, :positive
def extract_narrative_sections(report_text)
  text = report_text.to_s
  headings = {
    friction: "Friction Analysis",
    improvements: "Improvement Suggestions",
    positive: "Positive Observations"
  }
  sections = headings.transform_values { |heading| extract_markdown_section(text, heading) }

  leftover = strip_canonical_sections(text)
  unstructured = !text.match?(/^\#{2,3}\s+/)
  nothing_extracted = sections.each_value.all? { |value| value.to_s.strip.empty? }

  sections[:positive] = leftover if nothing_extracted && !leftover.empty? && unstructured
  sections
end
|
|
379
|
+
|
|
380
|
+
# Return the body under a level-2 or level-3 markdown heading.
#
# The heading is matched case-insensitively, and the body runs up to the
# next heading of level 1-3 or the end of the text.
#
# @param text [String] Markdown to search
# @param heading [String] Heading text, matched literally
# @return [String] Trimmed section body, or "" when the heading is absent
def extract_markdown_section(text, heading)
  section_pattern = /^\#{2,3}\s+#{Regexp.escape(heading)}\s*$\n?(.*?)(?=^\#{1,3}\s|\z)/mi
  found = text.match(section_pattern)

  found ? found[1].to_s.strip : ""
end
|
|
386
|
+
|
|
387
|
+
# Reduce a response to its free-text lines by removing leading
# frontmatter, level 1-3 headings, table rows, and the "**Overall:**" line,
# then dropping blank lines.
#
# @param text [#to_s, nil] Raw response text
# @return [String] Remaining lines joined by newlines
def strip_canonical_sections(text)
  cleaned = text.to_s
    .sub(/\A---.*?^---\s*/m, "")
    .gsub(/^\#{1,3}\s+.*$/, "")
    .gsub(/^\|.*\|\s*$/, "")
    .gsub(/^\*\*Overall:\*\*.*$/, "")

  kept_lines = cleaned.each_line.map(&:rstrip).reject(&:empty?)
  kept_lines.join("\n").strip
end
|
|
395
|
+
|
|
396
|
+
# Build deterministic narrative sections from the result counts, for use
# when LLM synthesis is unavailable.
#
# @param results_data [Array<Hash>] Entries with :test_id, :passed,
#   :failed, :total and :report_dir_name (may be nil)
# @return [Hash{Symbol => String}] Keys :friction, :improvements, :positive.
#   Each value is a markdown bullet list, or "" when it has no entries.
def fallback_narrative_sections(results_data)
  failed_results = results_data.select { |result| result[:failed].positive? }
  clean_results = results_data.select { |result| result[:failed].zero? }

  friction = failed_results.map do |result|
    # A result without a report directory must not render an empty code span.
    pointer =
      if result[:report_dir_name]
        "`#{result[:report_dir_name]}` for scenario details"
      else
        "the scenario report for details"
      end
    "- #{result[:test_id]} had #{result[:failed]} failing test case(s); inspect #{pointer}."
  end

  improvements = failed_results.map do |result|
    "- Re-run #{result[:test_id]} after the targeted fix and confirm the failing test case set is empty."
  end

  positive = clean_results.map do |result|
    "- #{result[:test_id]} passed #{result[:passed]}/#{result[:total]} test cases."
  end

  # Joining an empty list yields "", so no explicit empty check is needed.
  {
    friction: friction.join("\n"),
    improvements: improvements.join("\n"),
    positive: positive.join("\n")
  }
end
|
|
411
|
+
|
|
412
|
+
# Find the scenario paired with a result by position, or build a stand-in
# when there is none (for example, a result with no scenario entry).
#
# @param result [#test_id, #metadata] Test result
# @param scenarios [Array] Scenarios in the same order as the results
# @param index [Integer] Position of the result
# @return [#title] The matching scenario, or an OpenStruct titled
#   "Preflight" for results whose metadata phase is "preflight" and
#   otherwise titled with the result's test ID
def scenario_for_result(result, scenarios, index)
  scenario = scenarios[index]
  return scenario if scenario

  # Metadata may be absent on some results; treat that as "no phase".
  metadata = result.metadata || {}
  preflight = metadata[:phase] == "preflight" || metadata["phase"] == "preflight"

  OpenStruct.new(title: preflight ? "Preflight" : result.test_id)
end
|
|
273
417
|
end
|
|
274
418
|
end
|
|
275
419
|
end
|
|
@@ -6,16 +6,18 @@ module Ace
|
|
|
6
6
|
module Test
|
|
7
7
|
module EndToEndRunner
|
|
8
8
|
module Molecules
|
|
9
|
-
# Discovers
|
|
9
|
+
# Discovers deterministic preflight tests and agent E2E scenarios in packages
|
|
10
10
|
#
|
|
11
11
|
# Finds test scenarios in the TS-format directory structure:
|
|
12
|
+
# {package}/test/feat/**/*_test.rb
|
|
12
13
|
# {package}/test/e2e/TS-*/scenario.yml
|
|
13
14
|
#
|
|
14
15
|
# Note: This is a Molecule (not an Atom) because it performs filesystem
|
|
15
16
|
# I/O via Dir.glob.
|
|
16
17
|
class TestDiscoverer
|
|
17
|
-
|
|
18
|
+
TEST_DIRS = ["test/e2e"].freeze
|
|
18
19
|
SCENARIO_FILE = "scenario.yml"
|
|
20
|
+
DEFAULT_PREFLIGHT_GLOBS = ["test/feat/**/*_test.rb"].freeze
|
|
19
21
|
SCENARIO_DIR_PATTERN = "TS-*"
|
|
20
22
|
|
|
21
23
|
# Find E2E test scenario files matching criteria
|
|
@@ -47,6 +49,17 @@ module Ace
|
|
|
47
49
|
).map(&:file_path).sort
|
|
48
50
|
end
|
|
49
51
|
|
|
52
|
+
# Find deterministic preflight test files for a package.
#
# Globs from preflight_globs are tried in order; the first one that matches
# any file supplies the result.
#
# @param package [String] Package name
# @param base_dir [String] Base directory to search from
# @return [Array<String>] Sorted list of matching deterministic preflight test files
def find_integration_tests(package:, base_dir: Dir.pwd)
  package_root = File.join(base_dir, package)

  first_hit = preflight_globs
    .lazy
    .map { |pattern| Dir.glob(File.join(package_root, pattern)).sort }
    .find { |matches| !matches.empty? }

  first_hit || []
end
|
|
62
|
+
|
|
50
63
|
# Find TS-format scenario directories and load them as TestScenario models
|
|
51
64
|
#
|
|
52
65
|
# @param package [String] Package name
|
|
@@ -56,9 +69,11 @@ module Ace
|
|
|
56
69
|
# @param base_dir [String] Base directory to search from
|
|
57
70
|
# @return [Array<Models::TestScenario>] Loaded scenario models with test_cases
|
|
58
71
|
def find_scenarios(package:, test_id: nil, tags: nil, exclude_tags: nil, base_dir: Dir.pwd)
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
72
|
+
patterns = TEST_DIRS.map do |test_dir_name|
|
|
73
|
+
test_dir = File.join(base_dir, package, test_dir_name)
|
|
74
|
+
File.join(test_dir, SCENARIO_DIR_PATTERN, SCENARIO_FILE)
|
|
75
|
+
end
|
|
76
|
+
scenario_files = Dir.glob(patterns).sort
|
|
62
77
|
|
|
63
78
|
loader = ScenarioLoader.new
|
|
64
79
|
scenarios = scenario_files.map do |yml_path|
|
|
@@ -82,11 +97,13 @@ module Ace
|
|
|
82
97
|
# @param base_dir [String] Base directory to search from
|
|
83
98
|
# @return [Array<String>] Sorted list of package names
|
|
84
99
|
def list_packages(base_dir: Dir.pwd)
|
|
85
|
-
|
|
100
|
+
patterns = TEST_DIRS.map do |test_dir_name|
|
|
101
|
+
File.join(base_dir, "*/#{test_dir_name}/#{SCENARIO_DIR_PATTERN}/#{SCENARIO_FILE}")
|
|
102
|
+
end
|
|
86
103
|
|
|
87
104
|
base = Pathname.new(base_dir)
|
|
88
105
|
|
|
89
|
-
Dir.glob(
|
|
106
|
+
Dir.glob(patterns)
|
|
90
107
|
.map { |f| Pathname.new(f).relative_path_from(base).each_filename.first }
|
|
91
108
|
.uniq
|
|
92
109
|
.sort
|
|
@@ -96,12 +113,14 @@ module Ace
|
|
|
96
113
|
|
|
97
114
|
# Build glob pattern for finding TS-format scenario.yml files
|
|
98
115
|
def build_scenario_pattern(package, test_id, base_dir)
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
116
|
+
TEST_DIRS.map do |test_dir_name|
|
|
117
|
+
test_dir = File.join(base_dir, package, test_dir_name)
|
|
118
|
+
|
|
119
|
+
if test_id
|
|
120
|
+
File.join(test_dir, "*#{test_id}*", SCENARIO_FILE)
|
|
121
|
+
else
|
|
122
|
+
File.join(test_dir, SCENARIO_DIR_PATTERN, SCENARIO_FILE)
|
|
123
|
+
end
|
|
105
124
|
end
|
|
106
125
|
end
|
|
107
126
|
|
|
@@ -129,6 +148,12 @@ module Ace
|
|
|
129
148
|
|
|
130
149
|
filtered
|
|
131
150
|
end
|
|
151
|
+
|
|
152
|
+
# Globs used to locate preflight tests: the configured pattern(s) first,
# then the built-in defaults.
#
# The configured value may be a single string or a list of strings. Blank
# and non-string entries are dropped and duplicates removed.
#
# @return [Array<String>] Glob patterns relative to a package directory
def preflight_globs
  configured = Molecules::ConfigLoader.load.dig("patterns", "preflight")

  # Array() plus flatten accepts a scalar or a (nested) list. Without it a
  # list-valued setting would be treated as one glob and joined into a
  # single bogus path.
  [*Array(configured), *DEFAULT_PREFLIGHT_GLOBS]
    .flatten
    .grep(String)
    .reject(&:empty?)
    .uniq
end
|
|
132
157
|
end
|
|
133
158
|
end
|
|
134
159
|
end
|
|
@@ -16,12 +16,18 @@ module Ace
|
|
|
16
16
|
# @param provider [String] LLM provider:model string
|
|
17
17
|
# @param timeout [Integer] Request timeout in seconds
|
|
18
18
|
# @param config [Hash] Configuration hash (string keys) from ConfigLoader
|
|
19
|
-
def initialize(provider: nil, timeout: nil, config: nil)
|
|
19
|
+
def initialize(provider: nil, timeout: nil, config: nil, sandbox_backend_factory: nil)
|
|
20
20
|
config ||= Molecules::ConfigLoader.load
|
|
21
|
-
@provider = provider || config.dig("execution", "
|
|
21
|
+
@provider = provider || config.dig("execution", "runner_provider") ||
|
|
22
|
+
config.dig("execution", "provider") || "claude:sonnet"
|
|
23
|
+
@verifier_provider = config.dig("execution", "verifier_provider") ||
|
|
24
|
+
config.dig("execution", "provider") || @provider
|
|
22
25
|
@timeout = timeout || config.dig("execution", "timeout") || 300
|
|
23
26
|
@prompt_builder = Atoms::PromptBuilder.new
|
|
24
27
|
@cli_provider_adapter = Atoms::CliProviderAdapter.new(config)
|
|
28
|
+
@sandbox_backend_factory = sandbox_backend_factory || lambda { |sandbox_path, source_root: nil|
|
|
29
|
+
Molecules::BwrapSandboxBackend.new(sandbox_root: sandbox_path, source_root: source_root)
|
|
30
|
+
}
|
|
25
31
|
end
|
|
26
32
|
|
|
27
33
|
# Execute a single test scenario via LLM
|
|
@@ -192,9 +198,10 @@ module Ace
|
|
|
192
198
|
# Execute TC via skill invocation for CLI providers
|
|
193
199
|
def execute_tc_via_skill(test_case, sandbox_path, scenario, cli_args: nil, run_id: nil, env_vars: nil)
|
|
194
200
|
with_tc_error_handling(scenario) do |started_at|
|
|
201
|
+
sandbox_backend, prepared_env = prepared_env_for(sandbox_path, env_vars)
|
|
195
202
|
prompt = @cli_provider_adapter.build_tc_skill_prompt(
|
|
196
203
|
test_case: test_case, scenario: scenario,
|
|
197
|
-
sandbox_path: sandbox_path, run_id: run_id, env_vars:
|
|
204
|
+
sandbox_path: sandbox_path, run_id: run_id, env_vars: prepared_env
|
|
198
205
|
)
|
|
199
206
|
|
|
200
207
|
response = Ace::LLM::QueryInterface.query(
|
|
@@ -202,7 +209,8 @@ module Ace
|
|
|
202
209
|
system: nil, cli_args: cli_args,
|
|
203
210
|
timeout: @timeout, fallback: false,
|
|
204
211
|
working_dir: sandbox_path,
|
|
205
|
-
subprocess_env:
|
|
212
|
+
subprocess_env: prepared_env,
|
|
213
|
+
subprocess_command_prefix: sandbox_backend.command_prefix(chdir: sandbox_path, env: prepared_env)
|
|
206
214
|
)
|
|
207
215
|
|
|
208
216
|
invocation_error = detect_skill_invocation_error(response[:text])
|
|
@@ -322,9 +330,23 @@ module Ace
|
|
|
322
330
|
@pipeline_executors ||= {}
|
|
323
331
|
@pipeline_executors[timeout] ||= Molecules::PipelineExecutor.new(
|
|
324
332
|
provider: @provider,
|
|
325
|
-
|
|
333
|
+
verifier_provider: @verifier_provider,
|
|
334
|
+
timeout: timeout,
|
|
335
|
+
sandbox_backend_factory: @sandbox_backend_factory
|
|
326
336
|
)
|
|
327
337
|
end
|
|
338
|
+
|
|
339
|
+
# Create a sandbox backend for a sandbox path using the injected factory.
#
# @param sandbox_path [String] Sandbox root passed to the factory
# @param env_vars [Hash, nil] Environment; ACE_E2E_SOURCE_ROOT is looked up
#   under the string key first, then the symbol key
# @return [Object] Whatever the factory returns
def build_sandbox_backend(sandbox_path, env_vars)
  source_root = env_vars&.dig("ACE_E2E_SOURCE_ROOT") || env_vars&.dig(:ACE_E2E_SOURCE_ROOT)

  @sandbox_backend_factory.call(sandbox_path, source_root: source_root)
end
|
|
345
|
+
|
|
346
|
+
# Build the sandbox backend for a path and the environment it prepares.
#
# @param sandbox_path [String] Sandbox root
# @param env_vars [Hash, nil] Caller-supplied environment; nil is treated
#   as an empty hash
# @return [Array] Two elements: the backend, then backend.prepared_env(env)
def prepared_env_for(sandbox_path, env_vars)
  env = env_vars || {}
  backend = build_sandbox_backend(sandbox_path, env)

  [backend, backend.prepared_env(env)]
end
|
|
328
350
|
end
|
|
329
351
|
end
|
|
330
352
|
end
|