ace-test-runner-e2e 0.29.6 → 0.38.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. checksums.yaml +4 -4
  2. data/.ace-defaults/e2e-runner/config.yml +14 -2
  3. data/CHANGELOG.md +187 -0
  4. data/README.md +2 -2
  5. data/exe/ace-test-e2e-sh +9 -4
  6. data/handbook/guides/e2e-testing.g.md +43 -9
  7. data/handbook/guides/scenario-yml-reference.g.md +16 -8
  8. data/handbook/guides/tc-authoring.g.md +12 -5
  9. data/handbook/skills/as-e2e-fix/SKILL.md +2 -2
  10. data/handbook/skills/as-e2e-review/SKILL.md +2 -2
  11. data/handbook/templates/ace-taskflow-fixture.template.md +17 -17
  12. data/handbook/templates/agent-experience-report.template.md +3 -2
  13. data/handbook/templates/scenario.yml.template.yml +13 -2
  14. data/handbook/templates/tc-file.template.md +14 -4
  15. data/handbook/workflow-instructions/e2e/analyze-failures.wf.md +53 -6
  16. data/handbook/workflow-instructions/e2e/create.wf.md +139 -23
  17. data/handbook/workflow-instructions/e2e/execute.wf.md +11 -7
  18. data/handbook/workflow-instructions/e2e/fix.wf.md +65 -15
  19. data/handbook/workflow-instructions/e2e/plan-changes.wf.md +17 -1
  20. data/handbook/workflow-instructions/e2e/review.wf.md +44 -28
  21. data/handbook/workflow-instructions/e2e/rewrite.wf.md +17 -3
  22. data/handbook/workflow-instructions/e2e/run.wf.md +50 -26
  23. data/handbook/workflow-instructions/e2e/setup-sandbox.wf.md +4 -4
  24. data/lib/ace/test/end_to_end_runner/atoms/skill_prompt_builder.rb +7 -5
  25. data/lib/ace/test/end_to_end_runner/atoms/skill_result_parser.rb +73 -7
  26. data/lib/ace/test/end_to_end_runner/cli/commands/run_test.rb +21 -8
  27. data/lib/ace/test/end_to_end_runner/models/test_case.rb +8 -2
  28. data/lib/ace/test/end_to_end_runner/models/test_result.rb +9 -3
  29. data/lib/ace/test/end_to_end_runner/models/test_scenario.rb +4 -2
  30. data/lib/ace/test/end_to_end_runner/molecules/affected_detector.rb +7 -2
  31. data/lib/ace/test/end_to_end_runner/molecules/bwrap_sandbox_backend.rb +271 -0
  32. data/lib/ace/test/end_to_end_runner/molecules/config_loader.rb +28 -1
  33. data/lib/ace/test/end_to_end_runner/molecules/integration_runner.rb +122 -0
  34. data/lib/ace/test/end_to_end_runner/molecules/pipeline_executor.rb +165 -25
  35. data/lib/ace/test/end_to_end_runner/molecules/pipeline_prompt_bundler.rb +121 -8
  36. data/lib/ace/test/end_to_end_runner/molecules/pipeline_report_generator.rb +91 -19
  37. data/lib/ace/test/end_to_end_runner/molecules/pipeline_sandbox_builder.rb +119 -18
  38. data/lib/ace/test/end_to_end_runner/molecules/report_writer.rb +13 -12
  39. data/lib/ace/test/end_to_end_runner/molecules/sandbox_runtime_builder.rb +282 -0
  40. data/lib/ace/test/end_to_end_runner/molecules/scenario_loader.rb +85 -5
  41. data/lib/ace/test/end_to_end_runner/molecules/setup_executor.rb +98 -16
  42. data/lib/ace/test/end_to_end_runner/molecules/suite_report_writer.rb +241 -97
  43. data/lib/ace/test/end_to_end_runner/molecules/test_discoverer.rb +38 -13
  44. data/lib/ace/test/end_to_end_runner/molecules/test_executor.rb +27 -5
  45. data/lib/ace/test/end_to_end_runner/organisms/suite_orchestrator.rb +73 -15
  46. data/lib/ace/test/end_to_end_runner/organisms/test_orchestrator.rb +120 -19
  47. data/lib/ace/test/end_to_end_runner/version.rb +1 -1
  48. data/lib/ace/test/end_to_end_runner.rb +2 -0
  49. metadata +19 -2
@@ -9,7 +9,16 @@ module Ace
9
9
  module Molecules
10
10
  # Generates TC-first reports from standalone verifier output.
11
11
  class PipelineReportGenerator
12
- FAILURE_CATEGORIES = %w[test-spec-error tool-bug runner-error infrastructure-error].freeze
12
+ FAILURE_CATEGORIES = %w[
13
+ test-spec-error
14
+ tool-bug
15
+ runner-error
16
+ infrastructure-error
17
+ missing-artifact
18
+ state-drift
19
+ behavior-regression
20
+ discoverability-gap
21
+ ].freeze
13
22
 
14
23
  # @param report_writer [Molecules::ReportWriter]
15
24
  def initialize(report_writer: nil)
@@ -23,8 +32,9 @@ module Ace
23
32
  # @param started_at [Time]
24
33
  # @param completed_at [Time]
25
34
  # @return [Models::TestResult]
26
- def generate(scenario:, verifier_output:, report_dir:, provider:, started_at:, completed_at:)
35
+ def generate(scenario:, verifier_output:, report_dir:, provider:, started_at:, completed_at:, metadata: {})
27
36
  parsed = parse_verifier_output(verifier_output, scenario)
37
+ merged_metadata = metadata.merge(parsed[:metadata] || {})
28
38
 
29
39
  result = Models::TestResult.new(
30
40
  test_id: scenario.test_id,
@@ -32,8 +42,10 @@ module Ace
32
42
  test_cases: parsed[:test_cases],
33
43
  summary: parsed[:summary],
34
44
  error: parsed[:error],
45
+ observations: parsed[:observations].to_s,
35
46
  started_at: started_at,
36
- completed_at: completed_at
47
+ completed_at: completed_at,
48
+ metadata: merged_metadata
37
49
  )
38
50
 
39
51
  FileUtils.mkdir_p(report_dir)
@@ -57,15 +69,18 @@ module Ace
57
69
  # @param completed_at [Time]
58
70
  # @param error_message [String]
59
71
  # @return [Models::TestResult]
60
- def write_failure_report(scenario:, report_dir:, provider:, started_at:, completed_at:, error_message:)
72
+ def write_failure_report(scenario:, report_dir:, provider:, started_at:, completed_at:, error_message:,
73
+ failure_category: "runner-error", test_cases: [], metadata: {})
61
74
  result = Models::TestResult.new(
62
75
  test_id: scenario.test_id,
63
76
  status: "error",
64
- test_cases: [],
77
+ test_cases: test_cases,
65
78
  summary: "Execution pipeline failed",
66
79
  error: error_message,
80
+ observations: metadata["runner_observations"].to_s,
67
81
  started_at: started_at,
68
- completed_at: completed_at
82
+ completed_at: completed_at,
83
+ metadata: metadata.merge("failure_category" => failure_category)
69
84
  )
70
85
 
71
86
  FileUtils.mkdir_p(report_dir)
@@ -83,14 +98,16 @@ module Ace
83
98
 
84
99
  def parse_verifier_output(text, scenario)
85
100
  goals = parse_goal_sections(text, scenario)
86
- return build_result_from_goals(goals) unless goals.empty?
101
+ return build_result_from_goals(goals, text) unless goals.empty?
87
102
 
88
103
  parsed = Atoms::SkillResultParser.parse_verifier(text)
89
104
  {
90
105
  status: parsed[:status],
91
106
  test_cases: parsed[:test_cases],
92
107
  summary: parsed[:summary],
93
- error: parsed[:observations]
108
+ error: parsed[:observations],
109
+ observations: parsed[:observations],
110
+ metadata: extract_overall_user_outcome(text)
94
111
  }
95
112
  rescue Atoms::ResultParser::ParseError => e
96
113
  issue = summarize_unstructured_verifier_output(text)
@@ -98,7 +115,9 @@ module Ace
98
115
  status: "error",
99
116
  test_cases: [],
100
117
  summary: "Verifier returned unstructured output",
101
- error: issue || e.message
118
+ error: issue || e.message,
119
+ observations: "",
120
+ metadata: {}
102
121
  }
103
122
  end
104
123
 
@@ -121,7 +140,9 @@ module Ace
121
140
  evidence = extract_evidence(block)
122
141
  next if verdict.nil?
123
142
 
124
- tc_id = scenario_test_cases[goal_number - 1]&.tc_id || format("TC-%03d", goal_number)
143
+ direct_goal_id = format("TC-%03d", goal_number)
144
+ direct_match = scenario_test_cases.find { |tc| tc.tc_id.to_s.upcase == direct_goal_id }
145
+ tc_id = direct_match&.tc_id || scenario_test_cases[goal_number - 1]&.tc_id || direct_goal_id
125
146
  category = extract_category(block, evidence)
126
147
 
127
148
  {
@@ -180,10 +201,14 @@ module Ace
180
201
  explicit = extract_field_token(block, %w[Category])
181
202
  return normalize_category(explicit) if explicit
182
203
 
183
- inline = block.to_s.match(/`(test-spec-error|tool-bug|runner-error|infrastructure-error)`/i)
204
+ inline = block.to_s.match(
205
+ /`(test-spec-error|tool-bug|runner-error|infrastructure-error|missing-artifact|state-drift|behavior-regression|discoverability-gap)`/i
206
+ )
184
207
  return normalize_category(inline[1]) if inline
185
208
 
186
- paren = block.to_s.match(/\((test-spec-error|tool-bug|runner-error|infrastructure-error)\)/i)
209
+ paren = block.to_s.match(
210
+ /\((test-spec-error|tool-bug|runner-error|infrastructure-error|missing-artifact|state-drift|behavior-regression|discoverability-gap)\)/i
211
+ )
187
212
  return normalize_category(paren[1]) if paren
188
213
 
189
214
  normalize_category("#{block}\n#{evidence}")
@@ -191,7 +216,9 @@ module Ace
191
216
 
192
217
  def normalize_category(value)
193
218
  category = value.to_s.strip.downcase
194
- match = category.match(/\b(test-spec-error|tool-bug|runner-error|infrastructure-error)\b/)
219
+ match = category.match(
220
+ /\b(test-spec-error|tool-bug|runner-error|infrastructure-error|missing-artifact|state-drift|behavior-regression|discoverability-gap)\b/
221
+ )
195
222
  return match[1] if match
196
223
 
197
224
  "runner-error"
@@ -222,7 +249,7 @@ module Ace
222
249
  nil
223
250
  end
224
251
 
225
- def build_result_from_goals(goals)
252
+ def build_result_from_goals(goals, text)
226
253
  passed = goals.count { |goal| goal[:status] == "pass" }
227
254
  total = goals.size
228
255
  status = if passed == total
@@ -236,10 +263,25 @@ module Ace
236
263
  {
237
264
  status: status,
238
265
  test_cases: goals,
239
- summary: "#{passed}/#{total} passed"
266
+ summary: "#{passed}/#{total} passed",
267
+ observations: "",
268
+ error: nil,
269
+ metadata: extract_overall_user_outcome(text)
240
270
  }
241
271
  end
242
272
 
273
+ def extract_overall_user_outcome(text)
274
+ works = text.to_s.match(/\*\*Works for end user\*\*:\s*(yes|partial|no)/i)&.captures&.first
275
+ friction = text.to_s.match(/^\s*[-*]?\s*\*\*Friction\*\*:\s*(.+?)\s*$/im)&.captures&.first
276
+ feedback = text.to_s.match(/^\s*[-*]?\s*\*\*Feedback\*\*:\s*(.+?)\s*$/im)&.captures&.first
277
+
278
+ metadata = {}
279
+ metadata["works_for_end_user"] = works.to_s.downcase unless works.to_s.empty?
280
+ metadata["user_friction"] = friction.to_s.strip unless friction.to_s.strip.empty?
281
+ metadata["user_feedback"] = feedback.to_s.strip unless feedback.to_s.strip.empty?
282
+ metadata
283
+ end
284
+
243
285
  def summarize_unstructured_verifier_output(text)
244
286
  summary = text.to_s.lines.map(&:strip).reject(&:empty?).first(3).join(" ")
245
287
  return nil if summary.empty?
@@ -266,8 +308,8 @@ module Ace
266
308
  "test-id" => scenario.test_id,
267
309
  "title" => scenario.title,
268
310
  "package" => scenario.package,
269
- "runner-provider" => provider,
270
- "verifier-provider" => provider,
311
+ "runner-provider" => result.metadata["runner_provider"] || provider,
312
+ "verifier-provider" => result.metadata["verifier_provider"] || provider,
271
313
  "timestamp" => result.completed_at.utc.strftime("%Y-%m-%dT%H:%M:%SZ"),
272
314
  "tcs-passed" => passed,
273
315
  "tcs-failed" => failed,
@@ -281,12 +323,21 @@ module Ace
281
323
  "category" => tc[:category] || "runner-error",
282
324
  "evidence" => tc[:notes].to_s
283
325
  }
284
- end
326
+ end,
327
+ "canonical-failed-tcs" => result.failed_test_case_ids
285
328
  }
329
+ frontmatter["works-for-end-user"] = result.metadata["works_for_end_user"] if result.metadata["works_for_end_user"]
330
+ frontmatter["user-friction"] = result.metadata["user_friction"] if result.metadata["user_friction"]
331
+ frontmatter["user-feedback"] = result.metadata["user_feedback"] if result.metadata["user_feedback"]
332
+ frontmatter["missing-required-artifacts"] = result.metadata["missing_required_artifacts"] if result.metadata["missing_required_artifacts"]
286
333
  frontmatter_yaml = YAML.dump(frontmatter).sub(/\A---\s*\n/, "").sub(/\.\.\.\s*\n\z/, "")
287
334
 
288
335
  rows = result.test_cases.map do |tc|
289
- "| #{tc[:id]} | #{tc[:status].upcase} | #{tc[:notes]} |"
336
+ "| #{tc[:id]} | #{tc[:status].upcase} | #{canonical_goal_evidence(tc)} |"
337
+ end.join("\n")
338
+
339
+ verdict_rows = result.test_cases.map do |tc|
340
+ "| #{tc[:id]} | #{tc[:status].upcase} |"
290
341
  end.join("\n")
291
342
 
292
343
  content = <<~REPORT
@@ -310,10 +361,31 @@ module Ace
310
361
  | Failed | #{failed} |
311
362
  | Total | #{total} |
312
363
  | Score | #{(score * 100).round(1)}% |
364
+
365
+ ## Canonical Goal Verdicts
366
+
367
+ | Goal | Canonical Verdict |
368
+ |------|-------------------|
369
+ #{verdict_rows}
370
+
371
+ ## Overall User Outcome
372
+
373
+ | Field | Value |
374
+ |-------|-------|
375
+ | Works for end user | #{result.metadata["works_for_end_user"] || "unspecified"} |
376
+ | Friction | #{result.metadata["user_friction"] || "None"} |
377
+ | Feedback | #{result.metadata["user_feedback"] || "None"} |
313
378
  REPORT
314
379
 
315
380
  File.write(path, content)
316
381
  end
382
+
383
+ def canonical_goal_evidence(test_case)
384
+ notes = test_case[:notes].to_s.strip
385
+ return notes unless notes.match?(/\bverdict\s+correction\b/i)
386
+
387
+ "Canonical verdict #{test_case[:status].to_s.upcase}. Preserved verifier note: #{notes}"
388
+ end
317
389
  end
318
390
  end
319
391
  end
@@ -2,6 +2,8 @@
2
2
 
3
3
  require "fileutils"
4
4
  require "open3"
5
+ require "yaml"
6
+ require "ace/test_support/sandbox_package_copy"
5
7
 
6
8
  module Ace
7
9
  module Test
@@ -10,8 +12,14 @@ module Ace
10
12
  # Builds deterministic sandbox state for standalone execution.
11
13
  class PipelineSandboxBuilder
12
14
  # @param config_root [String] Project root used for provider symlink/bin path
13
- def initialize(config_root: Dir.pwd)
15
+ def initialize(config_root: Dir.pwd, package_copy: nil, runtime_builder: nil, config: nil)
14
16
  @config_root = File.expand_path(config_root)
17
+ @config = config || Molecules::ConfigLoader.load
18
+ @package_copy = package_copy || Ace::TestSupport::SandboxPackageCopy.new(source_root: @config_root)
19
+ @runtime_builder = runtime_builder || Molecules::SandboxRuntimeBuilder.new(
20
+ source_root: @config_root,
21
+ ruby_version: @config.dig("sandbox", "ruby_version") || Molecules::ConfigLoader.default_sandbox_ruby_version
22
+ )
15
23
  end
16
24
 
17
25
  # @param scenario [Models::TestScenario]
@@ -25,32 +33,69 @@ module Ace
25
33
  FileUtils.mkdir_p(File.join(sandbox_path, "reports"))
26
34
 
27
35
  initialize_git_repo(sandbox_path)
28
- ensure_package_available(scenario.package, sandbox_path)
36
+ package_copy_result = ensure_package_available(scenario.package, sandbox_path)
37
+ sync_protocol_sources(sandbox_path)
38
+ runtime_result = @runtime_builder.prepare(
39
+ sandbox_root: sandbox_path,
40
+ env: package_copy_result[:env],
41
+ tool_names: scenario.requires.fetch("tools", [])
42
+ )
29
43
  link_provider_configs(sandbox_path)
30
44
  create_result_directories(scenario, sandbox_path, test_cases: test_cases)
31
- verify_tool_access(scenario, sandbox_path)
45
+ run_default_bootstrap(scenario, sandbox_path, runtime_result[:env])
46
+ verify_tool_access(scenario, sandbox_path, runtime_result[:env])
32
47
 
33
- {
34
- "PROJECT_ROOT_PATH" => sandbox_path
35
- }
48
+ runtime_result[:env]
49
+ end
50
+
51
+ # Prepare only the runner/verifier layout for a sandbox that was
52
+ # already created by the deterministic setup path.
53
+ #
54
+ # This must not mutate tracked sandbox contents by copying packages,
55
+ # syncing protocol sources, or replacing config directories with
56
+ # symlinks after the scenario setup has already established git state.
57
+ #
58
+ # @param scenario [Models::TestScenario]
59
+ # @param sandbox_path [String]
60
+ # @param test_cases [Array<String>, nil] Optional TC filter
61
+ # @return [Hash] Additional environment variables (none required)
62
+ def prepare_existing_sandbox(scenario:, sandbox_path:, test_cases: nil)
63
+ sandbox_path = File.expand_path(sandbox_path)
64
+ FileUtils.mkdir_p(sandbox_path)
65
+ FileUtils.mkdir_p(File.join(sandbox_path, ".ace-local", "e2e"))
66
+ FileUtils.mkdir_p(File.join(sandbox_path, "reports"))
67
+ create_result_directories(scenario, sandbox_path, test_cases: test_cases)
68
+ {}
69
+ end
70
+
71
+ # Sync protocol source manifests and backing directories into a
72
+ # prepared sandbox before deterministic setup runs.
73
+ #
74
+ # This is safe before setup because no scenario-owned git baseline has
75
+ # been established yet. It is intentionally separate from
76
+ # prepare_existing_sandbox so the post-setup pipeline path remains
77
+ # non-mutating.
78
+ #
79
+ # @param sandbox_path [String]
80
+ # @return [void]
81
+ def sync_protocol_sources_into(sandbox_path)
82
+ sync_protocol_sources(File.expand_path(sandbox_path))
36
83
  end
37
84
 
38
85
  private
39
86
 
40
87
  def ensure_package_available(package_name, sandbox_path)
41
88
  package_name = package_name.to_s.strip
42
- return if package_name.empty?
43
-
44
- package_source = File.join(@config_root, package_name)
45
- package_target = File.join(sandbox_path, package_name)
46
-
47
- return if File.exist?(package_target)
48
-
49
- unless File.directory?(package_source)
50
- raise "Scenario package not found: #{package_name} (expected #{package_source})"
89
+ if package_name.empty?
90
+ return {
91
+ env: {
92
+ "PROJECT_ROOT_PATH" => sandbox_path,
93
+ "ACE_E2E_SOURCE_ROOT" => @config_root
94
+ }
95
+ }
51
96
  end
52
97
 
53
- FileUtils.cp_r(package_source, package_target)
98
+ @package_copy.prepare(package_name: package_name, sandbox_root: sandbox_path)
54
99
  end
55
100
 
56
101
  def initialize_git_repo(sandbox_path)
@@ -62,6 +107,45 @@ module Ace
62
107
  raise "Sandbox git init failed: #{stderr}".strip
63
108
  end
64
109
 
110
+ def sync_protocol_sources(sandbox_path)
111
+ %w[skill wfi].each do |protocol|
112
+ Dir.glob(File.join(@config_root, "*", ".ace-defaults", "nav", "protocols",
113
+ "#{protocol}-sources", "*.yml")).sort.each do |manifest_path|
114
+ sync_protocol_source_manifest(protocol, manifest_path, sandbox_path)
115
+ end
116
+ end
117
+ end
118
+
119
+ def sync_protocol_source_manifest(protocol, manifest_path, sandbox_path)
120
+ source_data = YAML.safe_load_file(manifest_path) || {}
121
+ relative_path = source_data.dig("config", "relative_path").to_s.strip
122
+ return if relative_path.empty?
123
+
124
+ package_root = File.expand_path("../../../../..", manifest_path)
125
+ package_name = File.basename(package_root)
126
+ target_package_root = File.join(sandbox_path, package_name)
127
+ target_manifest_path = File.join(
128
+ target_package_root,
129
+ ".ace-defaults",
130
+ "nav",
131
+ "protocols",
132
+ "#{protocol}-sources",
133
+ File.basename(manifest_path)
134
+ )
135
+ source_dir = File.join(package_root, relative_path)
136
+ target_dir = File.join(target_package_root, relative_path)
137
+
138
+ FileUtils.mkdir_p(File.dirname(target_manifest_path))
139
+ FileUtils.cp(manifest_path, target_manifest_path) unless File.exist?(target_manifest_path)
140
+ return unless File.directory?(source_dir)
141
+ return if File.exist?(target_dir)
142
+
143
+ FileUtils.mkdir_p(File.dirname(target_dir))
144
+ FileUtils.cp_r(source_dir, target_dir)
145
+ rescue Psych::SyntaxError
146
+ nil
147
+ end
148
+
65
149
  def link_provider_configs(sandbox_path)
66
150
  source = File.join(@config_root, ".ace", "llm", "providers")
67
151
  target = File.join(sandbox_path, ".ace", "llm", "providers")
@@ -115,15 +199,32 @@ module Ace
115
199
  match ? match[1].to_i : nil
116
200
  end
117
201
 
118
- def verify_tool_access(scenario, sandbox_path)
202
+ def verify_tool_access(scenario, sandbox_path, env)
119
203
  tool = scenario.tool_under_test.to_s.strip
120
204
  return if tool.empty?
121
205
 
122
- _stdout, stderr, status = Open3.capture3(tool, "--help", chdir: sandbox_path)
206
+ _stdout, stderr, status = Open3.capture3(env, tool, "--help", chdir: sandbox_path)
123
207
  return if status.success?
124
208
 
125
209
  raise "Sandbox tool check failed for #{tool}: #{stderr}".strip
126
210
  end
211
+
212
+ def run_default_bootstrap(scenario, sandbox_path, env)
213
+ return unless scenario.sandbox_profile == "ace-default"
214
+
215
+ stdout, stderr, status = Open3.capture3(
216
+ env,
217
+ "bash", "--noprofile", "--norc", "-c", "ace-config init && ace-handbook sync",
218
+ chdir: sandbox_path
219
+ )
220
+ return if status.success?
221
+
222
+ raise [
223
+ "Default sandbox bootstrap failed for #{scenario.test_id}",
224
+ stdout.to_s.strip,
225
+ stderr.to_s.strip
226
+ ].reject(&:empty?).join("\n")
227
+ end
127
228
  end
128
229
  end
129
230
  end
@@ -130,6 +130,8 @@ module Ace
130
130
  # @return [String] Path to written file
131
131
  def write_experience(result, scenario, report_dir, test_case = nil)
132
132
  path = File.join(report_dir, "experience.r.md")
133
+ runner_observations = result.metadata["runner_observations"].to_s.strip
134
+ verifier_observations = result.observations.to_s.strip
133
135
 
134
136
  tc_title_suffix = test_case ? " / #{test_case.tc_id}" : ""
135
137
 
@@ -154,22 +156,15 @@ module Ace
154
156
 
155
157
  ## Summary
156
158
 
157
- Executed via ace-test-e2e CLI using LLM provider.
158
- #{(result.status == "pass") ? "No significant friction encountered." : "Test execution completed with issues noted below."}
159
+ Runner observations captured by the harness for this scenario.
159
160
 
160
- ## Friction Points
161
+ ## Runner Observations
161
162
 
162
- ### Documentation Gaps
163
+ #{runner_observations.empty? ? "- None provided by runner." : runner_observations}
163
164
 
164
- - Automated execution via LLM - no documentation gaps observed
165
+ ## Verifier Notes
165
166
 
166
- ### Tool Behavior Issues
167
-
168
- - #{result.error || "None observed"}
169
-
170
- ## Positive Observations
171
-
172
- - Automated test execution completed successfully via LLM
167
+ - #{verifier_observations.empty? ? (result.error || "None recorded.") : verifier_observations}
173
168
  REPORT
174
169
 
175
170
  File.write(path, content)
@@ -219,6 +214,12 @@ module Ace
219
214
  end,
220
215
  "failed_test_cases" => result.failed_test_case_ids
221
216
  }
217
+ metadata["runner_observations"] = result.metadata["runner_observations"] if result.metadata.key?("runner_observations")
218
+ metadata["verifier_observations"] = result.observations unless result.observations.to_s.empty?
219
+ metadata["missing_required_artifacts"] = result.metadata["missing_required_artifacts"] if result.metadata.key?("missing_required_artifacts")
220
+ metadata["works_for_end_user"] = result.metadata["works_for_end_user"] if result.metadata.key?("works_for_end_user")
221
+ metadata["user_friction"] = result.metadata["user_friction"] if result.metadata.key?("user_friction")
222
+ metadata["user_feedback"] = result.metadata["user_feedback"] if result.metadata.key?("user_feedback")
222
223
 
223
224
  if test_case
224
225
  metadata["scenario-id"] = scenario.test_id