ace-test-runner-e2e 0.29.6 → 0.38.11
This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.ace-defaults/e2e-runner/config.yml +14 -2
- data/CHANGELOG.md +187 -0
- data/README.md +2 -2
- data/exe/ace-test-e2e-sh +9 -4
- data/handbook/guides/e2e-testing.g.md +43 -9
- data/handbook/guides/scenario-yml-reference.g.md +16 -8
- data/handbook/guides/tc-authoring.g.md +12 -5
- data/handbook/skills/as-e2e-fix/SKILL.md +2 -2
- data/handbook/skills/as-e2e-review/SKILL.md +2 -2
- data/handbook/templates/ace-taskflow-fixture.template.md +17 -17
- data/handbook/templates/agent-experience-report.template.md +3 -2
- data/handbook/templates/scenario.yml.template.yml +13 -2
- data/handbook/templates/tc-file.template.md +14 -4
- data/handbook/workflow-instructions/e2e/analyze-failures.wf.md +53 -6
- data/handbook/workflow-instructions/e2e/create.wf.md +139 -23
- data/handbook/workflow-instructions/e2e/execute.wf.md +11 -7
- data/handbook/workflow-instructions/e2e/fix.wf.md +65 -15
- data/handbook/workflow-instructions/e2e/plan-changes.wf.md +17 -1
- data/handbook/workflow-instructions/e2e/review.wf.md +44 -28
- data/handbook/workflow-instructions/e2e/rewrite.wf.md +17 -3
- data/handbook/workflow-instructions/e2e/run.wf.md +50 -26
- data/handbook/workflow-instructions/e2e/setup-sandbox.wf.md +4 -4
- data/lib/ace/test/end_to_end_runner/atoms/skill_prompt_builder.rb +7 -5
- data/lib/ace/test/end_to_end_runner/atoms/skill_result_parser.rb +73 -7
- data/lib/ace/test/end_to_end_runner/cli/commands/run_test.rb +21 -8
- data/lib/ace/test/end_to_end_runner/models/test_case.rb +8 -2
- data/lib/ace/test/end_to_end_runner/models/test_result.rb +9 -3
- data/lib/ace/test/end_to_end_runner/models/test_scenario.rb +4 -2
- data/lib/ace/test/end_to_end_runner/molecules/affected_detector.rb +7 -2
- data/lib/ace/test/end_to_end_runner/molecules/bwrap_sandbox_backend.rb +271 -0
- data/lib/ace/test/end_to_end_runner/molecules/config_loader.rb +28 -1
- data/lib/ace/test/end_to_end_runner/molecules/integration_runner.rb +122 -0
- data/lib/ace/test/end_to_end_runner/molecules/pipeline_executor.rb +165 -25
- data/lib/ace/test/end_to_end_runner/molecules/pipeline_prompt_bundler.rb +121 -8
- data/lib/ace/test/end_to_end_runner/molecules/pipeline_report_generator.rb +91 -19
- data/lib/ace/test/end_to_end_runner/molecules/pipeline_sandbox_builder.rb +119 -18
- data/lib/ace/test/end_to_end_runner/molecules/report_writer.rb +13 -12
- data/lib/ace/test/end_to_end_runner/molecules/sandbox_runtime_builder.rb +282 -0
- data/lib/ace/test/end_to_end_runner/molecules/scenario_loader.rb +85 -5
- data/lib/ace/test/end_to_end_runner/molecules/setup_executor.rb +98 -16
- data/lib/ace/test/end_to_end_runner/molecules/suite_report_writer.rb +241 -97
- data/lib/ace/test/end_to_end_runner/molecules/test_discoverer.rb +38 -13
- data/lib/ace/test/end_to_end_runner/molecules/test_executor.rb +27 -5
- data/lib/ace/test/end_to_end_runner/organisms/suite_orchestrator.rb +73 -15
- data/lib/ace/test/end_to_end_runner/organisms/test_orchestrator.rb +120 -19
- data/lib/ace/test/end_to_end_runner/version.rb +1 -1
- data/lib/ace/test/end_to_end_runner.rb +2 -0
- metadata +19 -2
|
@@ -9,7 +9,16 @@ module Ace
|
|
|
9
9
|
module Molecules
|
|
10
10
|
# Generates TC-first reports from standalone verifier output.
|
|
11
11
|
class PipelineReportGenerator
|
|
12
|
-
FAILURE_CATEGORIES = %w[
|
|
12
|
+
FAILURE_CATEGORIES = %w[
|
|
13
|
+
test-spec-error
|
|
14
|
+
tool-bug
|
|
15
|
+
runner-error
|
|
16
|
+
infrastructure-error
|
|
17
|
+
missing-artifact
|
|
18
|
+
state-drift
|
|
19
|
+
behavior-regression
|
|
20
|
+
discoverability-gap
|
|
21
|
+
].freeze
|
|
13
22
|
|
|
14
23
|
# @param report_writer [Molecules::ReportWriter]
|
|
15
24
|
def initialize(report_writer: nil)
|
|
@@ -23,8 +32,9 @@ module Ace
|
|
|
23
32
|
# @param started_at [Time]
|
|
24
33
|
# @param completed_at [Time]
|
|
25
34
|
# @return [Models::TestResult]
|
|
26
|
-
def generate(scenario:, verifier_output:, report_dir:, provider:, started_at:, completed_at:)
|
|
35
|
+
def generate(scenario:, verifier_output:, report_dir:, provider:, started_at:, completed_at:, metadata: {})
|
|
27
36
|
parsed = parse_verifier_output(verifier_output, scenario)
|
|
37
|
+
merged_metadata = metadata.merge(parsed[:metadata] || {})
|
|
28
38
|
|
|
29
39
|
result = Models::TestResult.new(
|
|
30
40
|
test_id: scenario.test_id,
|
|
@@ -32,8 +42,10 @@ module Ace
|
|
|
32
42
|
test_cases: parsed[:test_cases],
|
|
33
43
|
summary: parsed[:summary],
|
|
34
44
|
error: parsed[:error],
|
|
45
|
+
observations: parsed[:observations].to_s,
|
|
35
46
|
started_at: started_at,
|
|
36
|
-
completed_at: completed_at
|
|
47
|
+
completed_at: completed_at,
|
|
48
|
+
metadata: merged_metadata
|
|
37
49
|
)
|
|
38
50
|
|
|
39
51
|
FileUtils.mkdir_p(report_dir)
|
|
@@ -57,15 +69,18 @@ module Ace
|
|
|
57
69
|
# @param completed_at [Time]
|
|
58
70
|
# @param error_message [String]
|
|
59
71
|
# @return [Models::TestResult]
|
|
60
|
-
def write_failure_report(scenario:, report_dir:, provider:, started_at:, completed_at:, error_message
|
|
72
|
+
def write_failure_report(scenario:, report_dir:, provider:, started_at:, completed_at:, error_message:,
|
|
73
|
+
failure_category: "runner-error", test_cases: [], metadata: {})
|
|
61
74
|
result = Models::TestResult.new(
|
|
62
75
|
test_id: scenario.test_id,
|
|
63
76
|
status: "error",
|
|
64
|
-
test_cases:
|
|
77
|
+
test_cases: test_cases,
|
|
65
78
|
summary: "Execution pipeline failed",
|
|
66
79
|
error: error_message,
|
|
80
|
+
observations: metadata["runner_observations"].to_s,
|
|
67
81
|
started_at: started_at,
|
|
68
|
-
completed_at: completed_at
|
|
82
|
+
completed_at: completed_at,
|
|
83
|
+
metadata: metadata.merge("failure_category" => failure_category)
|
|
69
84
|
)
|
|
70
85
|
|
|
71
86
|
FileUtils.mkdir_p(report_dir)
|
|
@@ -83,14 +98,16 @@ module Ace
|
|
|
83
98
|
|
|
84
99
|
def parse_verifier_output(text, scenario)
|
|
85
100
|
goals = parse_goal_sections(text, scenario)
|
|
86
|
-
return build_result_from_goals(goals) unless goals.empty?
|
|
101
|
+
return build_result_from_goals(goals, text) unless goals.empty?
|
|
87
102
|
|
|
88
103
|
parsed = Atoms::SkillResultParser.parse_verifier(text)
|
|
89
104
|
{
|
|
90
105
|
status: parsed[:status],
|
|
91
106
|
test_cases: parsed[:test_cases],
|
|
92
107
|
summary: parsed[:summary],
|
|
93
|
-
error: parsed[:observations]
|
|
108
|
+
error: parsed[:observations],
|
|
109
|
+
observations: parsed[:observations],
|
|
110
|
+
metadata: extract_overall_user_outcome(text)
|
|
94
111
|
}
|
|
95
112
|
rescue Atoms::ResultParser::ParseError => e
|
|
96
113
|
issue = summarize_unstructured_verifier_output(text)
|
|
@@ -98,7 +115,9 @@ module Ace
|
|
|
98
115
|
status: "error",
|
|
99
116
|
test_cases: [],
|
|
100
117
|
summary: "Verifier returned unstructured output",
|
|
101
|
-
error: issue || e.message
|
|
118
|
+
error: issue || e.message,
|
|
119
|
+
observations: "",
|
|
120
|
+
metadata: {}
|
|
102
121
|
}
|
|
103
122
|
end
|
|
104
123
|
|
|
@@ -121,7 +140,9 @@ module Ace
|
|
|
121
140
|
evidence = extract_evidence(block)
|
|
122
141
|
next if verdict.nil?
|
|
123
142
|
|
|
124
|
-
|
|
143
|
+
direct_goal_id = format("TC-%03d", goal_number)
|
|
144
|
+
direct_match = scenario_test_cases.find { |tc| tc.tc_id.to_s.upcase == direct_goal_id }
|
|
145
|
+
tc_id = direct_match&.tc_id || scenario_test_cases[goal_number - 1]&.tc_id || direct_goal_id
|
|
125
146
|
category = extract_category(block, evidence)
|
|
126
147
|
|
|
127
148
|
{
|
|
@@ -180,10 +201,14 @@ module Ace
|
|
|
180
201
|
explicit = extract_field_token(block, %w[Category])
|
|
181
202
|
return normalize_category(explicit) if explicit
|
|
182
203
|
|
|
183
|
-
inline = block.to_s.match(
|
|
204
|
+
inline = block.to_s.match(
|
|
205
|
+
/`(test-spec-error|tool-bug|runner-error|infrastructure-error|missing-artifact|state-drift|behavior-regression|discoverability-gap)`/i
|
|
206
|
+
)
|
|
184
207
|
return normalize_category(inline[1]) if inline
|
|
185
208
|
|
|
186
|
-
paren = block.to_s.match(
|
|
209
|
+
paren = block.to_s.match(
|
|
210
|
+
/\((test-spec-error|tool-bug|runner-error|infrastructure-error|missing-artifact|state-drift|behavior-regression|discoverability-gap)\)/i
|
|
211
|
+
)
|
|
187
212
|
return normalize_category(paren[1]) if paren
|
|
188
213
|
|
|
189
214
|
normalize_category("#{block}\n#{evidence}")
|
|
@@ -191,7 +216,9 @@ module Ace
|
|
|
191
216
|
|
|
192
217
|
def normalize_category(value)
|
|
193
218
|
category = value.to_s.strip.downcase
|
|
194
|
-
match = category.match(
|
|
219
|
+
match = category.match(
|
|
220
|
+
/\b(test-spec-error|tool-bug|runner-error|infrastructure-error|missing-artifact|state-drift|behavior-regression|discoverability-gap)\b/
|
|
221
|
+
)
|
|
195
222
|
return match[1] if match
|
|
196
223
|
|
|
197
224
|
"runner-error"
|
|
@@ -222,7 +249,7 @@ module Ace
|
|
|
222
249
|
nil
|
|
223
250
|
end
|
|
224
251
|
|
|
225
|
-
def build_result_from_goals(goals)
|
|
252
|
+
def build_result_from_goals(goals, text)
|
|
226
253
|
passed = goals.count { |goal| goal[:status] == "pass" }
|
|
227
254
|
total = goals.size
|
|
228
255
|
status = if passed == total
|
|
@@ -236,10 +263,25 @@ module Ace
|
|
|
236
263
|
{
|
|
237
264
|
status: status,
|
|
238
265
|
test_cases: goals,
|
|
239
|
-
summary: "#{passed}/#{total} passed"
|
|
266
|
+
summary: "#{passed}/#{total} passed",
|
|
267
|
+
observations: "",
|
|
268
|
+
error: nil,
|
|
269
|
+
metadata: extract_overall_user_outcome(text)
|
|
240
270
|
}
|
|
241
271
|
end
|
|
242
272
|
|
|
273
|
+
def extract_overall_user_outcome(text)
|
|
274
|
+
works = text.to_s.match(/\*\*Works for end user\*\*:\s*(yes|partial|no)/i)&.captures&.first
|
|
275
|
+
friction = text.to_s.match(/^\s*[-*]?\s*\*\*Friction\*\*:\s*(.+?)\s*$/im)&.captures&.first
|
|
276
|
+
feedback = text.to_s.match(/^\s*[-*]?\s*\*\*Feedback\*\*:\s*(.+?)\s*$/im)&.captures&.first
|
|
277
|
+
|
|
278
|
+
metadata = {}
|
|
279
|
+
metadata["works_for_end_user"] = works.to_s.downcase unless works.to_s.empty?
|
|
280
|
+
metadata["user_friction"] = friction.to_s.strip unless friction.to_s.strip.empty?
|
|
281
|
+
metadata["user_feedback"] = feedback.to_s.strip unless feedback.to_s.strip.empty?
|
|
282
|
+
metadata
|
|
283
|
+
end
|
|
284
|
+
|
|
243
285
|
def summarize_unstructured_verifier_output(text)
|
|
244
286
|
summary = text.to_s.lines.map(&:strip).reject(&:empty?).first(3).join(" ")
|
|
245
287
|
return nil if summary.empty?
|
|
@@ -266,8 +308,8 @@ module Ace
|
|
|
266
308
|
"test-id" => scenario.test_id,
|
|
267
309
|
"title" => scenario.title,
|
|
268
310
|
"package" => scenario.package,
|
|
269
|
-
"runner-provider" => provider,
|
|
270
|
-
"verifier-provider" => provider,
|
|
311
|
+
"runner-provider" => result.metadata["runner_provider"] || provider,
|
|
312
|
+
"verifier-provider" => result.metadata["verifier_provider"] || provider,
|
|
271
313
|
"timestamp" => result.completed_at.utc.strftime("%Y-%m-%dT%H:%M:%SZ"),
|
|
272
314
|
"tcs-passed" => passed,
|
|
273
315
|
"tcs-failed" => failed,
|
|
@@ -281,12 +323,21 @@ module Ace
|
|
|
281
323
|
"category" => tc[:category] || "runner-error",
|
|
282
324
|
"evidence" => tc[:notes].to_s
|
|
283
325
|
}
|
|
284
|
-
end
|
|
326
|
+
end,
|
|
327
|
+
"canonical-failed-tcs" => result.failed_test_case_ids
|
|
285
328
|
}
|
|
329
|
+
frontmatter["works-for-end-user"] = result.metadata["works_for_end_user"] if result.metadata["works_for_end_user"]
|
|
330
|
+
frontmatter["user-friction"] = result.metadata["user_friction"] if result.metadata["user_friction"]
|
|
331
|
+
frontmatter["user-feedback"] = result.metadata["user_feedback"] if result.metadata["user_feedback"]
|
|
332
|
+
frontmatter["missing-required-artifacts"] = result.metadata["missing_required_artifacts"] if result.metadata["missing_required_artifacts"]
|
|
286
333
|
frontmatter_yaml = YAML.dump(frontmatter).sub(/\A---\s*\n/, "").sub(/\.\.\.\s*\n\z/, "")
|
|
287
334
|
|
|
288
335
|
rows = result.test_cases.map do |tc|
|
|
289
|
-
"| #{tc[:id]} | #{tc[:status].upcase} | #{tc
|
|
336
|
+
"| #{tc[:id]} | #{tc[:status].upcase} | #{canonical_goal_evidence(tc)} |"
|
|
337
|
+
end.join("\n")
|
|
338
|
+
|
|
339
|
+
verdict_rows = result.test_cases.map do |tc|
|
|
340
|
+
"| #{tc[:id]} | #{tc[:status].upcase} |"
|
|
290
341
|
end.join("\n")
|
|
291
342
|
|
|
292
343
|
content = <<~REPORT
|
|
@@ -310,10 +361,31 @@ module Ace
|
|
|
310
361
|
| Failed | #{failed} |
|
|
311
362
|
| Total | #{total} |
|
|
312
363
|
| Score | #{(score * 100).round(1)}% |
|
|
364
|
+
|
|
365
|
+
## Canonical Goal Verdicts
|
|
366
|
+
|
|
367
|
+
| Goal | Canonical Verdict |
|
|
368
|
+
|------|-------------------|
|
|
369
|
+
#{verdict_rows}
|
|
370
|
+
|
|
371
|
+
## Overall User Outcome
|
|
372
|
+
|
|
373
|
+
| Field | Value |
|
|
374
|
+
|-------|-------|
|
|
375
|
+
| Works for end user | #{result.metadata["works_for_end_user"] || "unspecified"} |
|
|
376
|
+
| Friction | #{result.metadata["user_friction"] || "None"} |
|
|
377
|
+
| Feedback | #{result.metadata["user_feedback"] || "None"} |
|
|
313
378
|
REPORT
|
|
314
379
|
|
|
315
380
|
File.write(path, content)
|
|
316
381
|
end
|
|
382
|
+
|
|
383
|
+
def canonical_goal_evidence(test_case)
|
|
384
|
+
notes = test_case[:notes].to_s.strip
|
|
385
|
+
return notes unless notes.match?(/\bverdict\s+correction\b/i)
|
|
386
|
+
|
|
387
|
+
"Canonical verdict #{test_case[:status].to_s.upcase}. Preserved verifier note: #{notes}"
|
|
388
|
+
end
|
|
317
389
|
end
|
|
318
390
|
end
|
|
319
391
|
end
|
|
@@ -2,6 +2,8 @@
|
|
|
2
2
|
|
|
3
3
|
require "fileutils"
|
|
4
4
|
require "open3"
|
|
5
|
+
require "yaml"
|
|
6
|
+
require "ace/test_support/sandbox_package_copy"
|
|
5
7
|
|
|
6
8
|
module Ace
|
|
7
9
|
module Test
|
|
@@ -10,8 +12,14 @@ module Ace
|
|
|
10
12
|
# Builds deterministic sandbox state for standalone execution.
|
|
11
13
|
class PipelineSandboxBuilder
|
|
12
14
|
# @param config_root [String] Project root used for provider symlink/bin path
|
|
13
|
-
def initialize(config_root: Dir.pwd)
|
|
15
|
+
def initialize(config_root: Dir.pwd, package_copy: nil, runtime_builder: nil, config: nil)
|
|
14
16
|
@config_root = File.expand_path(config_root)
|
|
17
|
+
@config = config || Molecules::ConfigLoader.load
|
|
18
|
+
@package_copy = package_copy || Ace::TestSupport::SandboxPackageCopy.new(source_root: @config_root)
|
|
19
|
+
@runtime_builder = runtime_builder || Molecules::SandboxRuntimeBuilder.new(
|
|
20
|
+
source_root: @config_root,
|
|
21
|
+
ruby_version: @config.dig("sandbox", "ruby_version") || Molecules::ConfigLoader.default_sandbox_ruby_version
|
|
22
|
+
)
|
|
15
23
|
end
|
|
16
24
|
|
|
17
25
|
# @param scenario [Models::TestScenario]
|
|
@@ -25,32 +33,69 @@ module Ace
|
|
|
25
33
|
FileUtils.mkdir_p(File.join(sandbox_path, "reports"))
|
|
26
34
|
|
|
27
35
|
initialize_git_repo(sandbox_path)
|
|
28
|
-
ensure_package_available(scenario.package, sandbox_path)
|
|
36
|
+
package_copy_result = ensure_package_available(scenario.package, sandbox_path)
|
|
37
|
+
sync_protocol_sources(sandbox_path)
|
|
38
|
+
runtime_result = @runtime_builder.prepare(
|
|
39
|
+
sandbox_root: sandbox_path,
|
|
40
|
+
env: package_copy_result[:env],
|
|
41
|
+
tool_names: scenario.requires.fetch("tools", [])
|
|
42
|
+
)
|
|
29
43
|
link_provider_configs(sandbox_path)
|
|
30
44
|
create_result_directories(scenario, sandbox_path, test_cases: test_cases)
|
|
31
|
-
|
|
45
|
+
run_default_bootstrap(scenario, sandbox_path, runtime_result[:env])
|
|
46
|
+
verify_tool_access(scenario, sandbox_path, runtime_result[:env])
|
|
32
47
|
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
48
|
+
runtime_result[:env]
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
# Prepare only the runner/verifier layout for a sandbox that was
|
|
52
|
+
# already created by the deterministic setup path.
|
|
53
|
+
#
|
|
54
|
+
# This must not mutate tracked sandbox contents by copying packages,
|
|
55
|
+
# syncing protocol sources, or replacing config directories with
|
|
56
|
+
# symlinks after the scenario setup has already established git state.
|
|
57
|
+
#
|
|
58
|
+
# @param scenario [Models::TestScenario]
|
|
59
|
+
# @param sandbox_path [String]
|
|
60
|
+
# @param test_cases [Array<String>, nil] Optional TC filter
|
|
61
|
+
# @return [Hash] Additional environment variables (none required)
|
|
62
|
+
def prepare_existing_sandbox(scenario:, sandbox_path:, test_cases: nil)
|
|
63
|
+
sandbox_path = File.expand_path(sandbox_path)
|
|
64
|
+
FileUtils.mkdir_p(sandbox_path)
|
|
65
|
+
FileUtils.mkdir_p(File.join(sandbox_path, ".ace-local", "e2e"))
|
|
66
|
+
FileUtils.mkdir_p(File.join(sandbox_path, "reports"))
|
|
67
|
+
create_result_directories(scenario, sandbox_path, test_cases: test_cases)
|
|
68
|
+
{}
|
|
69
|
+
end
|
|
70
|
+
|
|
71
|
+
# Sync protocol source manifests and backing directories into a
|
|
72
|
+
# prepared sandbox before deterministic setup runs.
|
|
73
|
+
#
|
|
74
|
+
# This is safe before setup because no scenario-owned git baseline has
|
|
75
|
+
# been established yet. It is intentionally separate from
|
|
76
|
+
# prepare_existing_sandbox so the post-setup pipeline path remains
|
|
77
|
+
# non-mutating.
|
|
78
|
+
#
|
|
79
|
+
# @param sandbox_path [String]
|
|
80
|
+
# @return [void]
|
|
81
|
+
def sync_protocol_sources_into(sandbox_path)
|
|
82
|
+
sync_protocol_sources(File.expand_path(sandbox_path))
|
|
36
83
|
end
|
|
37
84
|
|
|
38
85
|
private
|
|
39
86
|
|
|
40
87
|
def ensure_package_available(package_name, sandbox_path)
|
|
41
88
|
package_name = package_name.to_s.strip
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
unless File.directory?(package_source)
|
|
50
|
-
raise "Scenario package not found: #{package_name} (expected #{package_source})"
|
|
89
|
+
if package_name.empty?
|
|
90
|
+
return {
|
|
91
|
+
env: {
|
|
92
|
+
"PROJECT_ROOT_PATH" => sandbox_path,
|
|
93
|
+
"ACE_E2E_SOURCE_ROOT" => @config_root
|
|
94
|
+
}
|
|
95
|
+
}
|
|
51
96
|
end
|
|
52
97
|
|
|
53
|
-
|
|
98
|
+
@package_copy.prepare(package_name: package_name, sandbox_root: sandbox_path)
|
|
54
99
|
end
|
|
55
100
|
|
|
56
101
|
def initialize_git_repo(sandbox_path)
|
|
@@ -62,6 +107,45 @@ module Ace
|
|
|
62
107
|
raise "Sandbox git init failed: #{stderr}".strip
|
|
63
108
|
end
|
|
64
109
|
|
|
110
|
+
def sync_protocol_sources(sandbox_path)
|
|
111
|
+
%w[skill wfi].each do |protocol|
|
|
112
|
+
Dir.glob(File.join(@config_root, "*", ".ace-defaults", "nav", "protocols",
|
|
113
|
+
"#{protocol}-sources", "*.yml")).sort.each do |manifest_path|
|
|
114
|
+
sync_protocol_source_manifest(protocol, manifest_path, sandbox_path)
|
|
115
|
+
end
|
|
116
|
+
end
|
|
117
|
+
end
|
|
118
|
+
|
|
119
|
+
def sync_protocol_source_manifest(protocol, manifest_path, sandbox_path)
|
|
120
|
+
source_data = YAML.safe_load_file(manifest_path) || {}
|
|
121
|
+
relative_path = source_data.dig("config", "relative_path").to_s.strip
|
|
122
|
+
return if relative_path.empty?
|
|
123
|
+
|
|
124
|
+
package_root = File.expand_path("../../../../..", manifest_path)
|
|
125
|
+
package_name = File.basename(package_root)
|
|
126
|
+
target_package_root = File.join(sandbox_path, package_name)
|
|
127
|
+
target_manifest_path = File.join(
|
|
128
|
+
target_package_root,
|
|
129
|
+
".ace-defaults",
|
|
130
|
+
"nav",
|
|
131
|
+
"protocols",
|
|
132
|
+
"#{protocol}-sources",
|
|
133
|
+
File.basename(manifest_path)
|
|
134
|
+
)
|
|
135
|
+
source_dir = File.join(package_root, relative_path)
|
|
136
|
+
target_dir = File.join(target_package_root, relative_path)
|
|
137
|
+
|
|
138
|
+
FileUtils.mkdir_p(File.dirname(target_manifest_path))
|
|
139
|
+
FileUtils.cp(manifest_path, target_manifest_path) unless File.exist?(target_manifest_path)
|
|
140
|
+
return unless File.directory?(source_dir)
|
|
141
|
+
return if File.exist?(target_dir)
|
|
142
|
+
|
|
143
|
+
FileUtils.mkdir_p(File.dirname(target_dir))
|
|
144
|
+
FileUtils.cp_r(source_dir, target_dir)
|
|
145
|
+
rescue Psych::SyntaxError
|
|
146
|
+
nil
|
|
147
|
+
end
|
|
148
|
+
|
|
65
149
|
def link_provider_configs(sandbox_path)
|
|
66
150
|
source = File.join(@config_root, ".ace", "llm", "providers")
|
|
67
151
|
target = File.join(sandbox_path, ".ace", "llm", "providers")
|
|
@@ -115,15 +199,32 @@ module Ace
|
|
|
115
199
|
match ? match[1].to_i : nil
|
|
116
200
|
end
|
|
117
201
|
|
|
118
|
-
def verify_tool_access(scenario, sandbox_path)
|
|
202
|
+
def verify_tool_access(scenario, sandbox_path, env)
|
|
119
203
|
tool = scenario.tool_under_test.to_s.strip
|
|
120
204
|
return if tool.empty?
|
|
121
205
|
|
|
122
|
-
_stdout, stderr, status = Open3.capture3(tool, "--help", chdir: sandbox_path)
|
|
206
|
+
_stdout, stderr, status = Open3.capture3(env, tool, "--help", chdir: sandbox_path)
|
|
123
207
|
return if status.success?
|
|
124
208
|
|
|
125
209
|
raise "Sandbox tool check failed for #{tool}: #{stderr}".strip
|
|
126
210
|
end
|
|
211
|
+
|
|
212
|
+
def run_default_bootstrap(scenario, sandbox_path, env)
|
|
213
|
+
return unless scenario.sandbox_profile == "ace-default"
|
|
214
|
+
|
|
215
|
+
stdout, stderr, status = Open3.capture3(
|
|
216
|
+
env,
|
|
217
|
+
"bash", "--noprofile", "--norc", "-c", "ace-config init && ace-handbook sync",
|
|
218
|
+
chdir: sandbox_path
|
|
219
|
+
)
|
|
220
|
+
return if status.success?
|
|
221
|
+
|
|
222
|
+
raise [
|
|
223
|
+
"Default sandbox bootstrap failed for #{scenario.test_id}",
|
|
224
|
+
stdout.to_s.strip,
|
|
225
|
+
stderr.to_s.strip
|
|
226
|
+
].reject(&:empty?).join("\n")
|
|
227
|
+
end
|
|
127
228
|
end
|
|
128
229
|
end
|
|
129
230
|
end
|
|
@@ -130,6 +130,8 @@ module Ace
|
|
|
130
130
|
# @return [String] Path to written file
|
|
131
131
|
def write_experience(result, scenario, report_dir, test_case = nil)
|
|
132
132
|
path = File.join(report_dir, "experience.r.md")
|
|
133
|
+
runner_observations = result.metadata["runner_observations"].to_s.strip
|
|
134
|
+
verifier_observations = result.observations.to_s.strip
|
|
133
135
|
|
|
134
136
|
tc_title_suffix = test_case ? " / #{test_case.tc_id}" : ""
|
|
135
137
|
|
|
@@ -154,22 +156,15 @@ module Ace
|
|
|
154
156
|
|
|
155
157
|
## Summary
|
|
156
158
|
|
|
157
|
-
|
|
158
|
-
#{(result.status == "pass") ? "No significant friction encountered." : "Test execution completed with issues noted below."}
|
|
159
|
+
Runner observations captured by the harness for this scenario.
|
|
159
160
|
|
|
160
|
-
##
|
|
161
|
+
## Runner Observations
|
|
161
162
|
|
|
162
|
-
|
|
163
|
+
#{runner_observations.empty? ? "- None provided by runner." : runner_observations}
|
|
163
164
|
|
|
164
|
-
|
|
165
|
+
## Verifier Notes
|
|
165
166
|
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
- #{result.error || "None observed"}
|
|
169
|
-
|
|
170
|
-
## Positive Observations
|
|
171
|
-
|
|
172
|
-
- Automated test execution completed successfully via LLM
|
|
167
|
+
- #{verifier_observations.empty? ? (result.error || "None recorded.") : verifier_observations}
|
|
173
168
|
REPORT
|
|
174
169
|
|
|
175
170
|
File.write(path, content)
|
|
@@ -219,6 +214,12 @@ module Ace
|
|
|
219
214
|
end,
|
|
220
215
|
"failed_test_cases" => result.failed_test_case_ids
|
|
221
216
|
}
|
|
217
|
+
metadata["runner_observations"] = result.metadata["runner_observations"] if result.metadata.key?("runner_observations")
|
|
218
|
+
metadata["verifier_observations"] = result.observations unless result.observations.to_s.empty?
|
|
219
|
+
metadata["missing_required_artifacts"] = result.metadata["missing_required_artifacts"] if result.metadata.key?("missing_required_artifacts")
|
|
220
|
+
metadata["works_for_end_user"] = result.metadata["works_for_end_user"] if result.metadata.key?("works_for_end_user")
|
|
221
|
+
metadata["user_friction"] = result.metadata["user_friction"] if result.metadata.key?("user_friction")
|
|
222
|
+
metadata["user_feedback"] = result.metadata["user_feedback"] if result.metadata.key?("user_feedback")
|
|
222
223
|
|
|
223
224
|
if test_case
|
|
224
225
|
metadata["scenario-id"] = scenario.test_id
|