ace-test-runner-e2e 0.29.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.ace-defaults/e2e-runner/config.yml +70 -0
- data/.ace-defaults/nav/protocols/guide-sources/ace-test-runner-e2e.yml +11 -0
- data/.ace-defaults/nav/protocols/skill-sources/ace-test-runner-e2e.yml +19 -0
- data/.ace-defaults/nav/protocols/tmpl-sources/ace-test-runner-e2e.yml +12 -0
- data/.ace-defaults/nav/protocols/wfi-sources/ace-test-runner-e2e.yml +11 -0
- data/CHANGELOG.md +1166 -0
- data/LICENSE +21 -0
- data/README.md +42 -0
- data/Rakefile +15 -0
- data/exe/ace-test-e2e +15 -0
- data/exe/ace-test-e2e-sh +67 -0
- data/exe/ace-test-e2e-suite +13 -0
- data/handbook/guides/e2e-testing.g.md +124 -0
- data/handbook/guides/scenario-yml-reference.g.md +182 -0
- data/handbook/guides/tc-authoring.g.md +131 -0
- data/handbook/skills/as-e2e-create/SKILL.md +30 -0
- data/handbook/skills/as-e2e-fix/SKILL.md +35 -0
- data/handbook/skills/as-e2e-manage/SKILL.md +31 -0
- data/handbook/skills/as-e2e-plan-changes/SKILL.md +30 -0
- data/handbook/skills/as-e2e-review/SKILL.md +35 -0
- data/handbook/skills/as-e2e-rewrite/SKILL.md +31 -0
- data/handbook/skills/as-e2e-run/SKILL.md +48 -0
- data/handbook/skills/as-e2e-setup-sandbox/SKILL.md +34 -0
- data/handbook/templates/ace-taskflow-fixture.template.md +322 -0
- data/handbook/templates/agent-experience-report.template.md +89 -0
- data/handbook/templates/metadata.template.yml +49 -0
- data/handbook/templates/scenario.yml.template.yml +60 -0
- data/handbook/templates/tc-file.template.md +45 -0
- data/handbook/templates/test-report.template.md +94 -0
- data/handbook/workflow-instructions/e2e/analyze-failures.wf.md +126 -0
- data/handbook/workflow-instructions/e2e/create.wf.md +395 -0
- data/handbook/workflow-instructions/e2e/execute.wf.md +253 -0
- data/handbook/workflow-instructions/e2e/fix.wf.md +166 -0
- data/handbook/workflow-instructions/e2e/manage.wf.md +179 -0
- data/handbook/workflow-instructions/e2e/plan-changes.wf.md +255 -0
- data/handbook/workflow-instructions/e2e/review.wf.md +286 -0
- data/handbook/workflow-instructions/e2e/rewrite.wf.md +281 -0
- data/handbook/workflow-instructions/e2e/run.wf.md +355 -0
- data/handbook/workflow-instructions/e2e/setup-sandbox.wf.md +461 -0
- data/lib/ace/test/end_to_end_runner/atoms/display_helpers.rb +234 -0
- data/lib/ace/test/end_to_end_runner/atoms/prompt_builder.rb +199 -0
- data/lib/ace/test/end_to_end_runner/atoms/result_parser.rb +166 -0
- data/lib/ace/test/end_to_end_runner/atoms/skill_prompt_builder.rb +166 -0
- data/lib/ace/test/end_to_end_runner/atoms/skill_result_parser.rb +244 -0
- data/lib/ace/test/end_to_end_runner/atoms/suite_report_prompt_builder.rb +103 -0
- data/lib/ace/test/end_to_end_runner/atoms/tc_fidelity_validator.rb +39 -0
- data/lib/ace/test/end_to_end_runner/atoms/test_case_parser.rb +108 -0
- data/lib/ace/test/end_to_end_runner/cli/commands/run_suite.rb +130 -0
- data/lib/ace/test/end_to_end_runner/cli/commands/run_test.rb +156 -0
- data/lib/ace/test/end_to_end_runner/models/test_case.rb +47 -0
- data/lib/ace/test/end_to_end_runner/models/test_result.rb +115 -0
- data/lib/ace/test/end_to_end_runner/models/test_scenario.rb +90 -0
- data/lib/ace/test/end_to_end_runner/molecules/affected_detector.rb +92 -0
- data/lib/ace/test/end_to_end_runner/molecules/config_loader.rb +75 -0
- data/lib/ace/test/end_to_end_runner/molecules/failure_finder.rb +203 -0
- data/lib/ace/test/end_to_end_runner/molecules/fixture_copier.rb +35 -0
- data/lib/ace/test/end_to_end_runner/molecules/pipeline_executor.rb +121 -0
- data/lib/ace/test/end_to_end_runner/molecules/pipeline_prompt_bundler.rb +182 -0
- data/lib/ace/test/end_to_end_runner/molecules/pipeline_report_generator.rb +321 -0
- data/lib/ace/test/end_to_end_runner/molecules/pipeline_sandbox_builder.rb +131 -0
- data/lib/ace/test/end_to_end_runner/molecules/progress_display_manager.rb +172 -0
- data/lib/ace/test/end_to_end_runner/molecules/report_writer.rb +259 -0
- data/lib/ace/test/end_to_end_runner/molecules/scenario_loader.rb +254 -0
- data/lib/ace/test/end_to_end_runner/molecules/setup_executor.rb +181 -0
- data/lib/ace/test/end_to_end_runner/molecules/simple_display_manager.rb +72 -0
- data/lib/ace/test/end_to_end_runner/molecules/suite_progress_display_manager.rb +223 -0
- data/lib/ace/test/end_to_end_runner/molecules/suite_report_writer.rb +277 -0
- data/lib/ace/test/end_to_end_runner/molecules/suite_simple_display_manager.rb +116 -0
- data/lib/ace/test/end_to_end_runner/molecules/test_discoverer.rb +136 -0
- data/lib/ace/test/end_to_end_runner/molecules/test_executor.rb +332 -0
- data/lib/ace/test/end_to_end_runner/organisms/suite_orchestrator.rb +830 -0
- data/lib/ace/test/end_to_end_runner/organisms/test_orchestrator.rb +442 -0
- data/lib/ace/test/end_to_end_runner/version.rb +9 -0
- data/lib/ace/test/end_to_end_runner.rb +71 -0
- metadata +220 -0
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Ace
  module Test
    module EndToEndRunner
      module Atoms
        # Builds LLM prompts for E2E test execution.
        #
        # Provides the system prompts (scenario-level and single-TC) that tell
        # the LLM how to run tests and what JSON shape to return, plus the user
        # prompts that carry the scenario/test-case content itself.
        class PromptBuilder
          # System prompt for TC-level (single test case) execution.
          TC_SYSTEM_PROMPT = <<~PROMPT
            You are an E2E test executor for the ACE (Agentic Coding Environment) toolkit.

            Your task is to execute a single test case in a pre-populated sandbox and return structured results.

            ## Instructions

            1. The test sandbox is pre-populated at the path provided — do NOT create or modify the sandbox setup
            2. Read the test case steps carefully
            3. Execute the test case steps in the sandbox
            4. Record pass/fail status
            5. Return results as JSON

            ## Output Format

            You MUST return a JSON block wrapped in ```json fences with these fields:

            ```json
            {
              "test_id": "TS-XXX-NNN",
              "tc_id": "TC-NNN",
              "status": "pass|fail",
              "actual": "What actually happened",
              "notes": "Any additional observations",
              "summary": "Brief result"
            }
            ```

            ## Rules

            - Execute ONLY the single test case provided
            - Execute in the pre-populated sandbox (do not modify setup files)
            - Record actual output/behavior, not just expected
            - If the test case cannot be executed (missing tool, permission error), mark as "fail" with explanation
          PROMPT

          # System prompt for full-scenario execution.
          SYSTEM_PROMPT = <<~PROMPT
            You are an E2E test executor for the ACE (Agentic Coding Environment) toolkit.

            Your task is to execute the provided test scenario step by step and return structured results.

            ## Instructions

            1. Read the test scenario carefully
            2. Execute the Environment Setup commands
            3. Create any Test Data as specified
            4. Execute each Test Case (TC-NNN) in order
            5. Record pass/fail status for each test case
            6. Return results as JSON

            ## Output Format

            You MUST return a JSON block wrapped in ```json fences with these fields:

            ```json
            {
              "test_id": "TS-XXX-NNN",
              "status": "pass|fail|partial",
              "test_cases": [
                {
                  "id": "TC-001",
                  "description": "Brief description",
                  "status": "pass|fail",
                  "actual": "What actually happened",
                  "notes": "Any additional observations"
                }
              ],
              "summary": "Brief execution summary",
              "observations": "Any friction points or issues discovered"
            }
            ```

            ## Rules

            - Execute ALL test cases, even if earlier ones fail
            - Record actual output/behavior, not just expected
            - Use "partial" status if some test cases pass and some fail
            - Include meaningful observations about tool behavior
            - If a test case cannot be executed (missing tool, permission error), mark as "fail" with explanation
          PROMPT

          # Build a TC-level user prompt for a single test case.
          #
          # Pending test cases get a dedicated "do not execute" prompt that
          # instructs the LLM to report a skip result verbatim.
          #
          # @param test_case [Models::TestCase] The single test case to execute
          # @param scenario [Models::TestScenario] The parent scenario for metadata
          # @param sandbox_path [String] Path to the pre-populated sandbox
          # @return [String] The TC-level user prompt
          def build_tc(test_case:, scenario:, sandbox_path:)
            return skipped_tc_prompt(test_case: test_case, scenario: scenario) if test_case.pending?

            <<~PROMPT
              # Execute Test Case: #{scenario.test_id} / #{test_case.tc_id}

              **Package:** #{scenario.package}
              **Scenario:** #{scenario.title}
              **Test Case:** #{test_case.title}
              **Sandbox Path:** #{sandbox_path}

              ## Test Case Content

              #{test_case.content}

              ---

              Execute the test case steps in the sandbox at `#{sandbox_path}` and return JSON results as specified in your instructions.
            PROMPT
          end

          # Build the user prompt for a full test scenario.
          #
          # @param scenario [Models::TestScenario] The test scenario to execute
          # @param test_cases [Array<String>, nil] Optional test case IDs to filter
          # @return [String] The user prompt containing the test scenario
          def build(scenario, test_cases: nil)
            # Comma-separated filter list, or nil when every TC should run.
            only = test_cases&.any? ? test_cases.join(", ") : nil

            filter_note =
              if only
                "\n**IMPORTANT:** Execute ONLY the following test cases: #{only}. Skip all other test cases.\n"
              else
                ""
              end

            closing_instruction =
              if only
                "Execute only the specified test cases (#{only}) and return the JSON results as specified in your instructions."
              else
                "Execute all test cases in this scenario and return the JSON results as specified in your instructions."
              end

            pending_note = build_pending_tc_skip_instruction(scenario)

            <<~PROMPT
              # Execute E2E Test: #{scenario.test_id}

              **Package:** #{scenario.package}
              **Title:** #{scenario.title}
              **Priority:** #{scenario.priority}
              #{filter_note}#{pending_note}
              ## Test Scenario

              #{scenario.content}

              ---

              #{closing_instruction}
            PROMPT
          end

          private

          # Prompt returned instead of execution when a TC is marked pending.
          # Embeds the exact skip JSON the LLM must echo back.
          #
          # @param test_case [Models::TestCase] The pending test case
          # @param scenario [Models::TestScenario] The parent scenario
          # @return [String] A "do not execute" prompt
          def skipped_tc_prompt(test_case:, scenario:)
            <<~PROMPT
              # SKIP Test Case: #{scenario.test_id} / #{test_case.tc_id}

              **Package:** #{scenario.package}
              **Scenario:** #{scenario.title}
              **Test Case:** #{test_case.title}
              **Status:** PENDING — #{test_case.pending}

              This test case is marked as pending and should NOT be executed.
              Return the following JSON result:

              ```json
              {
                "test_id": "#{scenario.test_id}",
                "tc_id": "#{test_case.tc_id}",
                "status": "skip",
                "actual": "Skipped — pending",
                "notes": "#{test_case.pending}",
                "summary": "Pending: #{test_case.pending}"
              }
              ```
            PROMPT
          end

          # Build the "skip these pending TCs" instruction for scenario prompts.
          #
          # @param scenario [Models::TestScenario] The test scenario
          # @return [String] Pending instruction text, or "" when nothing is pending
          def build_pending_tc_skip_instruction(scenario)
            pending = scenario.test_cases.select(&:pending?)
            return "" if pending.empty?

            bullets = pending.map { |tc| "- #{tc.tc_id}: #{tc.pending}" }.join("\n")
            "\n**SKIP these test cases (pending):**\n#{bullets}\nFor skipped test cases, report status as \"skip\" in your results.\n"
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
# frozen_string_literal: true

require "json"

module Ace
  module Test
    module EndToEndRunner
      module Atoms
        # Parses structured JSON results from LLM responses
        #
        # Extracts JSON from LLM text output, handling various formatting
        # patterns including fenced code blocks and raw JSON, then normalizes
        # the parsed payload into a consistent hash shape for downstream
        # report writers. Status strings are lowercased in every path so
        # callers can compare against "pass"/"fail"/"skip" literals.
        class ResultParser
          # Parse LLM response text into a structured result hash
          #
          # @param text [String] Raw LLM response text
          # @return [Hash] Parsed result with :test_id, :status, :test_cases, :summary
          # @raise [ParseError] If no valid JSON found in response
          def self.parse(text)
            json_str = extract_json(text)
            raise ParseError, "No JSON found in LLM response" if json_str.nil?

            parsed = JSON.parse(json_str, symbolize_names: true)
            validate_result(parsed)
            normalize_result(parsed)
          rescue JSON::ParserError => e
            raise ParseError, "Invalid JSON in LLM response: #{e.message}"
          end

          # Extract JSON from text, handling code fences and raw JSON
          #
          # @param text [String] Text potentially containing JSON
          # @return [String, nil] Extracted JSON string or nil
          def self.extract_json(text)
            return nil if text.nil? || text.to_s.strip.empty?

            stripped = text.to_s.strip

            # Try to find JSON in code fences first (non-greedy: first fenced block).
            # NOTE(review): requires a newline after the opening fence, so a
            # single-line ```json {...}``` block will not match — confirm callers
            # always emit multi-line fences before tightening this.
            match = stripped.match(/```(?:json)?\s*\n(.*?)\n\s*```/m)
            return match[1].strip if match

            # Treat unfenced content as JSON only when the whole payload is a JSON object.
            return stripped if stripped.start_with?("{") && stripped.end_with?("}")

            nil
          end

          # Validate that parsed result has required fields
          #
          # @param result [Hash] Parsed JSON result
          # @raise [ParseError] If required fields are missing
          def self.validate_result(result)
            required = %i[test_id status]
            missing = required.reject { |field| result.key?(field) }
            unless missing.empty?
              raise ParseError, "Missing required fields in result: #{missing.join(", ")}"
            end
          end

          # Normalize result to ensure consistent structure
          #
          # @param result [Hash] Parsed result
          # @return [Hash] Normalized result
          def self.normalize_result(result)
            {
              test_id: result[:test_id],
              status: result[:status].to_s.downcase,
              test_cases: normalize_test_cases(result[:test_cases] || []),
              summary: result[:summary] || "",
              observations: result[:observations] || ""
            }
          end

          # Normalize test case entries
          #
          # @param test_cases [Array<Hash>] Raw test case data
          # @return [Array<Hash>] Normalized test cases
          def self.normalize_test_cases(test_cases)
            test_cases.map do |tc|
              {
                id: tc[:id] || "unknown",
                description: tc[:description] || "",
                # Downcase for consistency with normalize_result/normalize_criteria,
                # so "PASS" and "pass" compare equal downstream.
                status: (tc[:status] || "fail").to_s.downcase,
                actual: tc[:actual] || "",
                notes: tc[:notes] || "",
                criteria: normalize_criteria(tc[:criteria] || [])
              }
            end
          end

          # Normalize optional criteria evaluations
          #
          # @param criteria [Array<Hash>] Raw criteria results
          # @return [Array<Hash>] Normalized criteria entries
          def self.normalize_criteria(criteria)
            criteria.map do |criterion|
              {
                id: criterion[:id] || "",
                description: criterion[:description] || criterion[:criterion] || "",
                status: (criterion[:status] || "fail").to_s.downcase,
                evidence: criterion[:evidence] || ""
              }
            end
          end

          # Parse TC-level LLM response into a structured result hash
          #
          # Handles single-TC JSON format with tc_id field. Falls back to
          # parse() if the response contains multi-TC format.
          #
          # @param text [String] Raw LLM response text
          # @return [Hash] Parsed result with single-entry :test_cases array
          # @raise [ParseError] If no valid JSON found in response
          def self.parse_tc(text)
            json_str = extract_json(text)
            raise ParseError, "No JSON found in LLM response" if json_str.nil?

            parsed = JSON.parse(json_str, symbolize_names: true)

            # If response has test_cases array, delegate to standard parse
            return parse(text) if parsed.key?(:test_cases)

            validate_tc_result(parsed)
            normalize_tc_result(parsed)
          rescue JSON::ParserError => e
            raise ParseError, "Invalid JSON in LLM response: #{e.message}"
          end

          # Validate TC-level result fields
          #
          # @param result [Hash] Parsed TC-level result
          # @raise [ParseError] If required fields are missing
          def self.validate_tc_result(result)
            required = %i[test_id tc_id status]
            missing = required.reject { |field| result.key?(field) }
            unless missing.empty?
              raise ParseError, "Missing required fields in TC result: #{missing.join(", ")}"
            end
          end

          # Normalize TC-level result to standard format with single-entry test_cases
          #
          # @param result [Hash] Parsed TC-level result
          # @return [Hash] Normalized result matching the multi-TC shape
          def self.normalize_tc_result(result)
            # Downcase once here so TC-level results match the casing guarantee
            # of normalize_result (previously this path passed status through raw,
            # so an uppercase "PASS" escaped normalization).
            status = result[:status].to_s.downcase
            {
              test_id: result[:test_id],
              status: status,
              test_cases: [{
                id: result[:tc_id],
                description: result[:summary] || "",
                status: status,
                actual: result[:actual] || "",
                notes: result[:notes] || "",
                criteria: normalize_criteria(result[:criteria] || [])
              }],
              summary: result[:summary] || "",
              observations: result[:notes] || ""
            }
          end

          private_class_method :validate_result, :normalize_result, :normalize_test_cases,
                               :normalize_criteria, :validate_tc_result, :normalize_tc_result

          # Error raised when parsing LLM response fails
          class ParseError < StandardError; end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Ace
  module Test
    module EndToEndRunner
      module Atoms
        # Holds CLI-provider detection and CLI-args helpers.
        #
        # Standalone scenario execution for CLI providers now runs through the
        # deterministic runner/verifier pipeline. Provider lists and CLI args
        # are configurable via config.yml ("providers" -> "cli").
        class CliProviderAdapter
          # Fallback provider names when config.yml supplies none.
          DEFAULT_CLI_PROVIDERS = %w[claude gemini codex codexoss opencode pi].freeze

          # @param config [Hash] Configuration hash (string keys) with providers section
          def initialize(config = {})
            @cli_providers = config.dig("providers", "cli") || DEFAULT_CLI_PROVIDERS
          end

          class << self
            # Check if a provider string refers to a CLI provider
            #
            # @param provider_string [String] Provider:model string (e.g., "claude:sonnet")
            # @return [Boolean]
            def cli_provider?(provider_string)
              default_instance.cli_provider?(provider_string)
            end

            # Extract provider name from provider:model string
            #
            # @param provider_string [String] e.g., "claude:sonnet"
            # @return [String] e.g., "claude"
            def provider_name(provider_string)
              provider_string.to_s.split(":").first.to_s
            end

            # Lazily-loaded default instance backed by ConfigLoader
            # (falls back to an empty config when ConfigLoader is not loaded).
            #
            # @return [CliProviderAdapter]
            def default_instance
              @default_instance ||= begin
                loaded = defined?(Molecules::ConfigLoader) ? Molecules::ConfigLoader.load : {}
                new(loaded)
              end
            end

            # Reset the default instance (for testing)
            def reset_default_instance!
              @default_instance = nil
            end
          end

          # Instance method: check if a provider string refers to a CLI provider
          #
          # @param provider_string [String] Provider:model string
          # @return [Boolean]
          def cli_provider?(provider_string)
            @cli_providers.include?(self.class.provider_name(provider_string))
          end

          # Build the execution wrapper prompt around a slash command.
          #
          # @param command [String] The slash command to run in the agent chat
          # @param tc_mode [Boolean] Whether the return contract is single-TC shaped
          # @return [String] Full execution prompt
          def build_execution_prompt(command:, tc_mode:)
            contract_lines =
              if tc_mode
                ["- **Test ID**: ...",
                 "- **TC ID**: ...",
                 "- **Status**: pass | fail",
                 "- **Report Paths**: ...",
                 "- **Issues**: ..."]
              else
                ["- **Test ID**: ...",
                 "- **Status**: pass | fail | partial",
                 "- **Passed**: ...",
                 "- **Failed**: ...",
                 "- **Total**: ...",
                 "- **Report Paths**: ...",
                 "- **Issues**: ..."]
              end
            return_contract = contract_lines.join("\n")

            <<~PROMPT.strip
              Run this as a slash command in the agent chat interface (not in bash):
              #{command}

              Execution requirements:
              - Do not run `/ace-...` inside a shell command.
              - If slash commands are unavailable, stop and report that limitation in `Issues`.
              - Write reports under `.ace-local/test-e2e/*-reports/`.
              - Return only this structured summary:
              #{return_contract}
            PROMPT
          end

          # Build a skill invocation prompt for scenario-level execution
          #
          # @param scenario [Models::TestScenario] The test scenario
          # @param run_id [String, nil] Pre-generated run ID for deterministic report paths
          # @param test_cases [Array<String>, nil] Optional test case IDs to filter
          # @param sandbox_path [String, nil] Path to pre-populated sandbox (skips setup steps)
          # @param env_vars [Hash, nil] Environment variables from setup execution
          #   (NOTE(review): values are joined unescaped — confirm they never contain commas)
          # @param report_dir [String, nil] Explicit report directory path (overrides computed path)
          # @return [String] Skill invocation prompt
          def build_skill_prompt(scenario, run_id: nil, test_cases: nil, sandbox_path: nil, env_vars: nil, report_dir: nil)
            parts = ["/as-e2e-run #{scenario.package} #{scenario.test_id}"]
            parts << test_cases.join(",") if test_cases&.any?
            parts << "--run-id #{run_id}" if run_id
            parts << "--sandbox #{sandbox_path}" if sandbox_path
            parts << "--env #{env_vars.map { |k, v| "#{k}=#{v}" }.join(",")}" if env_vars&.any?
            parts << "--report-dir #{report_dir}" if report_dir
            build_execution_prompt(command: parts.join(" "), tc_mode: false)
          end

          # Build a TC-level skill invocation prompt
          #
          # @param test_case [Models::TestCase] The single test case
          # @param scenario [Models::TestScenario] The parent scenario
          # @param sandbox_path [String] Path to the pre-populated sandbox
          # @param run_id [String, nil] Pre-generated run ID
          # @param env_vars [Hash, nil] Environment variables from setup execution
          # @return [String] Skill invocation prompt
          def build_tc_skill_prompt(test_case:, scenario:, sandbox_path:, run_id: nil, env_vars: nil)
            parts = ["/as-e2e-run #{scenario.package} #{scenario.test_id} #{test_case.tc_id} --tc-mode --sandbox #{sandbox_path}"]
            parts << "--run-id #{run_id}" if run_id
            parts << "--env #{env_vars.map { |k, v| "#{k}=#{v}" }.join(",")}" if env_vars&.any?
            build_execution_prompt(command: parts.join(" "), tc_mode: true)
          end

          # Build an independent verifier prompt.
          #
          # This is intentionally a second invocation to avoid sharing runner context.
          #
          # @param scenario [Models::TestScenario] The scenario to verify
          # @param run_id [String, nil] Run ID used to derive the report directory
          # @param sandbox_path [String, nil] Sandbox path, if known
          # @param test_cases [Array<String>, nil] Optional TC scope filter
          # @param report_dir [String, nil] Explicit report directory override
          # @return [String] Verifier prompt
          def build_verifier_prompt(scenario, run_id: nil, sandbox_path: nil, test_cases: nil, report_dir: nil)
            report_dir ||= run_id ? ".ace-local/test-e2e/#{scenario.dir_name(run_id)}-reports" : nil

            scope_description = test_cases&.any? ? test_cases.join(", ") : "all discovered test cases"
            sandbox_label = sandbox_path || "(unknown)"
            report_label = report_dir || "(unknown)"

            <<~PROMPT.strip
              You are the independent verifier for an E2E scenario.

              Verify this scenario in a new, isolated agent context:
              - Package: #{scenario.package}
              - Test ID: #{scenario.test_id}
              - Sandbox path: #{sandbox_label}
              - Report directory: #{report_label}
              - Scope: #{scope_description}

              Verification requirements:
              - Inspect sandbox artifacts and scenario files directly.
              - Evaluate each test case using `TC-*.verify.md` criteria when present.
              - Classify each failed test case with one category:
                `test-spec-error`, `tool-bug`, `runner-error`, or `infrastructure-error`.
              - Write/update report files under the report directory.
              - Use TC-first schema in report frontmatter and metadata.

              Return only this structured summary:
              - **Test ID**: ...
              - **Status**: pass | fail | partial | error
              - **TCs Passed**: ...
              - **TCs Failed**: ...
              - **TCs Total**: ...
              - **Score**: ...
              - **Verdict**: pass | partial | fail
              - **Failed TCs**: TC-001:tool-bug, TC-002:runner-error (or `None`)
              - **Issues**: ...
            PROMPT
          end
        end

        # Backward-compatible alias while callers migrate off the legacy name.
        SkillPromptBuilder = CliProviderAdapter
      end
    end
  end
end
|