ace-test-runner-e2e 0.29.8 → 0.40.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.ace-defaults/e2e-runner/config.yml +14 -2
- data/CHANGELOG.md +233 -0
- data/README.md +2 -2
- data/exe/ace-test-e2e-sh +9 -4
- data/handbook/guides/e2e-testing.g.md +75 -9
- data/handbook/guides/scenario-yml-reference.g.md +21 -8
- data/handbook/guides/tc-authoring.g.md +23 -5
- data/handbook/skills/as-e2e-fix/SKILL.md +2 -2
- data/handbook/skills/as-e2e-review/SKILL.md +2 -2
- data/handbook/templates/ace-taskflow-fixture.template.md +17 -17
- data/handbook/templates/agent-experience-report.template.md +3 -2
- data/handbook/templates/scenario.yml.template.yml +7 -2
- data/handbook/templates/tc-file.template.md +16 -4
- data/handbook/workflow-instructions/e2e/analyze-failures.wf.md +53 -6
- data/handbook/workflow-instructions/e2e/create.wf.md +128 -25
- data/handbook/workflow-instructions/e2e/execute.wf.md +11 -7
- data/handbook/workflow-instructions/e2e/fix.wf.md +84 -15
- data/handbook/workflow-instructions/e2e/plan-changes.wf.md +33 -1
- data/handbook/workflow-instructions/e2e/review.wf.md +40 -25
- data/handbook/workflow-instructions/e2e/rewrite.wf.md +22 -8
- data/handbook/workflow-instructions/e2e/run.wf.md +50 -26
- data/handbook/workflow-instructions/e2e/setup-sandbox.wf.md +4 -4
- data/lib/ace/test/end_to_end_runner/atoms/artifact_contract_validator.rb +138 -0
- data/lib/ace/test/end_to_end_runner/atoms/skill_prompt_builder.rb +7 -5
- data/lib/ace/test/end_to_end_runner/atoms/skill_result_parser.rb +73 -7
- data/lib/ace/test/end_to_end_runner/cli/commands/run_suite.rb +195 -5
- data/lib/ace/test/end_to_end_runner/cli/commands/run_test.rb +58 -9
- data/lib/ace/test/end_to_end_runner/models/test_case.rb +8 -2
- data/lib/ace/test/end_to_end_runner/models/test_result.rb +9 -3
- data/lib/ace/test/end_to_end_runner/models/test_scenario.rb +4 -2
- data/lib/ace/test/end_to_end_runner/molecules/affected_detector.rb +7 -2
- data/lib/ace/test/end_to_end_runner/molecules/artifact_pruner.rb +61 -0
- data/lib/ace/test/end_to_end_runner/molecules/bwrap_sandbox_backend.rb +271 -0
- data/lib/ace/test/end_to_end_runner/molecules/config_loader.rb +28 -1
- data/lib/ace/test/end_to_end_runner/molecules/integration_runner.rb +122 -0
- data/lib/ace/test/end_to_end_runner/molecules/pipeline_executor.rb +235 -18
- data/lib/ace/test/end_to_end_runner/molecules/pipeline_prompt_bundler.rb +164 -13
- data/lib/ace/test/end_to_end_runner/molecules/pipeline_report_generator.rb +91 -19
- data/lib/ace/test/end_to_end_runner/molecules/pipeline_sandbox_builder.rb +121 -18
- data/lib/ace/test/end_to_end_runner/molecules/report_writer.rb +15 -12
- data/lib/ace/test/end_to_end_runner/molecules/sandbox_runtime_builder.rb +374 -0
- data/lib/ace/test/end_to_end_runner/molecules/scenario_loader.rb +83 -5
- data/lib/ace/test/end_to_end_runner/molecules/setup_executor.rb +121 -16
- data/lib/ace/test/end_to_end_runner/molecules/suite_report_writer.rb +422 -97
- data/lib/ace/test/end_to_end_runner/molecules/test_discoverer.rb +38 -13
- data/lib/ace/test/end_to_end_runner/molecules/test_executor.rb +27 -5
- data/lib/ace/test/end_to_end_runner/organisms/suite_orchestrator.rb +98 -18
- data/lib/ace/test/end_to_end_runner/organisms/test_orchestrator.rb +159 -19
- data/lib/ace/test/end_to_end_runner/version.rb +1 -1
- data/lib/ace/test/end_to_end_runner.rb +4 -0
- metadata +21 -2
|
@@ -35,7 +35,7 @@ module Ace
|
|
|
35
35
|
#
|
|
36
36
|
# Resolves role: references to their concrete provider before checking.
|
|
37
37
|
#
|
|
38
|
-
# @param provider_string [String] Provider:model string (e.g., "claude:sonnet", "role:e2e-
|
|
38
|
+
# @param provider_string [String] Provider:model string (e.g., "claude:sonnet", "role:e2e-runner")
|
|
39
39
|
# @return [Boolean]
|
|
40
40
|
def cli_provider?(provider_string)
|
|
41
41
|
resolved = resolve_provider_name(provider_string)
|
|
@@ -44,9 +44,9 @@ module Ace
|
|
|
44
44
|
|
|
45
45
|
def build_execution_prompt(command:, tc_mode:)
|
|
46
46
|
return_contract = if tc_mode
|
|
47
|
-
"- **Test ID**: ...\n- **TC ID**: ...\n- **Status**: pass | fail\n- **Report Paths**: ...\n- **Issues**: ..."
|
|
47
|
+
"- **Test ID**: ...\n- **TC ID**: ...\n- **Status**: pass | fail\n- **Report Paths**: ...\n- **Observations**: ...\n- **Issues**: ... (optional legacy alias)"
|
|
48
48
|
else
|
|
49
|
-
"- **Test ID**: ...\n- **Status**: pass | fail | partial\n- **Passed**: ...\n- **Failed**: ...\n- **Total**: ...\n- **Report Paths**: ...\n- **Issues**: ..."
|
|
49
|
+
"- **Test ID**: ...\n- **Status**: pass | fail | partial\n- **Passed**: ...\n- **Failed**: ...\n- **Total**: ...\n- **Report Paths**: ...\n- **Observations**: ...\n- **Issues**: ... (optional legacy alias)"
|
|
50
50
|
end
|
|
51
51
|
|
|
52
52
|
<<~PROMPT.strip
|
|
@@ -55,8 +55,9 @@ module Ace
|
|
|
55
55
|
|
|
56
56
|
Execution requirements:
|
|
57
57
|
- Do not run `/ace-...` inside a shell command.
|
|
58
|
-
- If slash commands are unavailable, stop and report that limitation in `
|
|
58
|
+
- If slash commands are unavailable, stop and report that limitation in `Observations`.
|
|
59
59
|
- Write reports under `.ace-local/test-e2e/*-reports/`.
|
|
60
|
+
- `Observations` is required and must be a concise factual summary of actions, outcomes, and blockers without verdict language.
|
|
60
61
|
- Return only this structured summary:
|
|
61
62
|
#{return_contract}
|
|
62
63
|
PROMPT
|
|
@@ -122,6 +123,7 @@ module Ace
|
|
|
122
123
|
|
|
123
124
|
Verification requirements:
|
|
124
125
|
- Inspect sandbox artifacts and scenario files directly.
|
|
126
|
+
- Judge from sandbox state first, then runner observations, then raw debug captures only when needed.
|
|
125
127
|
- Evaluate each test case using `TC-*.verify.md` criteria when present.
|
|
126
128
|
- Classify each failed test case with one category:
|
|
127
129
|
`test-spec-error`, `tool-bug`, `runner-error`, or `infrastructure-error`.
|
|
@@ -145,7 +147,7 @@ module Ace
|
|
|
145
147
|
|
|
146
148
|
# Resolve the bare provider name from a provider string.
|
|
147
149
|
# For role: references, resolves via ProviderModelParser to find the
|
|
148
|
-
# concrete provider (e.g. "role:e2e-
|
|
150
|
+
# concrete provider (e.g. "role:e2e-runner" → "claude").
|
|
149
151
|
def resolve_provider_name(provider_string)
|
|
150
152
|
name = self.class.provider_name(provider_string)
|
|
151
153
|
return name unless name == "role"
|
|
@@ -13,6 +13,7 @@ module Ace
|
|
|
13
13
|
# - **Failed**: 0
|
|
14
14
|
# - **Total**: 8
|
|
15
15
|
# - **Report Paths**: 8p5jo2-lint-ts001-reports/*
|
|
16
|
+
# - **Observations**: None
|
|
16
17
|
# - **Issues**: None
|
|
17
18
|
#
|
|
18
19
|
# Falls back to ResultParser.parse() for JSON responses.
|
|
@@ -45,6 +46,7 @@ module Ace
|
|
|
45
46
|
fields[:failed] = extract_field(text, "Failed")
|
|
46
47
|
fields[:total] = extract_field(text, "Total")
|
|
47
48
|
fields[:report_paths] = extract_field(text, "Report Paths")
|
|
49
|
+
fields[:observations] = extract_field(text, "Observations")
|
|
48
50
|
fields[:issues] = extract_field(text, "Issues")
|
|
49
51
|
|
|
50
52
|
# Need at least test_id and status for a valid parse
|
|
@@ -69,8 +71,7 @@ module Ace
|
|
|
69
71
|
passed.times { |i| test_cases << {id: "TC-#{format("%03d", i + 1)}", description: "", status: "pass", actual: "", notes: ""} }
|
|
70
72
|
failed.times { |i| test_cases << {id: "TC-#{format("%03d", passed + i + 1)}", description: "", status: "fail", actual: "", notes: ""} }
|
|
71
73
|
|
|
72
|
-
|
|
73
|
-
observations = (issues && issues.downcase != "none") ? issues : ""
|
|
74
|
+
observations = normalize_observations(parsed[:observations], parsed[:issues])
|
|
74
75
|
|
|
75
76
|
{
|
|
76
77
|
test_id: parsed[:test_id],
|
|
@@ -131,8 +132,8 @@ module Ace
|
|
|
131
132
|
fields[:failed_tcs] = extract_field(text, "Failed TCs")
|
|
132
133
|
fields[:issues] = extract_field(text, "Issues")
|
|
133
134
|
|
|
134
|
-
return
|
|
135
|
-
|
|
135
|
+
return parse_minimal_verifier(text) unless fields[:test_id] && fields[:status]
|
|
136
|
+
return parse(text) unless fields[:tcs_passed] && fields[:tcs_failed] && fields[:tcs_total]
|
|
136
137
|
|
|
137
138
|
passed = fields[:tcs_passed].to_i
|
|
138
139
|
failed = fields[:tcs_failed].to_i
|
|
@@ -180,6 +181,58 @@ module Ace
|
|
|
180
181
|
}
|
|
181
182
|
end
|
|
182
183
|
|
|
184
|
+
def self.parse_minimal_verifier(text)
|
|
185
|
+
compact = text.to_s.strip
|
|
186
|
+
results_match = compact.match(/Results:\s*(\d+)\s*\/\s*(\d+)\s*passed/i)
|
|
187
|
+
if results_match
|
|
188
|
+
passed = results_match[1].to_i
|
|
189
|
+
total = results_match[2].to_i
|
|
190
|
+
status = if total.zero?
|
|
191
|
+
"fail"
|
|
192
|
+
elsif passed == total
|
|
193
|
+
"pass"
|
|
194
|
+
elsif passed.zero?
|
|
195
|
+
"fail"
|
|
196
|
+
else
|
|
197
|
+
"partial"
|
|
198
|
+
end
|
|
199
|
+
failed = [total - passed, 0].max
|
|
200
|
+
test_cases = []
|
|
201
|
+
passed.times { |i| test_cases << {id: "TC-#{format("%03d", i + 1)}", description: "", status: "pass", actual: "", notes: ""} }
|
|
202
|
+
failed.times { |i| test_cases << {id: "TC-#{format("%03d", passed + i + 1)}", description: "", status: "fail", actual: "", notes: "", category: "unknown"} }
|
|
203
|
+
|
|
204
|
+
return {
|
|
205
|
+
test_id: "",
|
|
206
|
+
status: status,
|
|
207
|
+
test_cases: test_cases,
|
|
208
|
+
summary: "#{passed}/#{total} passed",
|
|
209
|
+
observations: compact
|
|
210
|
+
}
|
|
211
|
+
end
|
|
212
|
+
|
|
213
|
+
status_match = compact.match(/\b(PASS|FAIL|PARTIAL|ERROR)\b/i)
|
|
214
|
+
return parse(text) unless status_match
|
|
215
|
+
|
|
216
|
+
status = normalize_status(status_match[1])
|
|
217
|
+
evidence = compact.sub(/^.*?\b#{Regexp.escape(status_match[1])}\b[:\-\s]*/i, "").strip
|
|
218
|
+
tc_status = (status == "pass") ? "pass" : "fail"
|
|
219
|
+
|
|
220
|
+
{
|
|
221
|
+
test_id: "",
|
|
222
|
+
status: status,
|
|
223
|
+
test_cases: [{
|
|
224
|
+
id: "TC-001",
|
|
225
|
+
description: "",
|
|
226
|
+
status: tc_status,
|
|
227
|
+
actual: "",
|
|
228
|
+
notes: evidence,
|
|
229
|
+
category: ((tc_status == "fail") ? "unknown" : nil)
|
|
230
|
+
}],
|
|
231
|
+
summary: evidence.empty? ? status : evidence,
|
|
232
|
+
observations: evidence
|
|
233
|
+
}
|
|
234
|
+
end
|
|
235
|
+
|
|
183
236
|
# Parse TC-level markdown return contract
|
|
184
237
|
def self.parse_tc_markdown(text)
|
|
185
238
|
fields = {}
|
|
@@ -188,6 +241,7 @@ module Ace
|
|
|
188
241
|
fields[:tc_id] = extract_field(text, "TC ID")
|
|
189
242
|
fields[:status] = extract_field(text, "Status")
|
|
190
243
|
fields[:report_paths] = extract_field(text, "Report Paths")
|
|
244
|
+
fields[:observations] = extract_field(text, "Observations")
|
|
191
245
|
fields[:issues] = extract_field(text, "Issues")
|
|
192
246
|
|
|
193
247
|
# Need test_id, tc_id, and status for a valid TC parse
|
|
@@ -200,8 +254,7 @@ module Ace
|
|
|
200
254
|
def self.to_tc_normalized(parsed)
|
|
201
255
|
parsed[:status] = normalize_status(parsed[:status])
|
|
202
256
|
|
|
203
|
-
|
|
204
|
-
observations = (issues && issues.downcase != "none") ? issues : ""
|
|
257
|
+
observations = normalize_observations(parsed[:observations], parsed[:issues])
|
|
205
258
|
|
|
206
259
|
{
|
|
207
260
|
test_id: parsed[:test_id],
|
|
@@ -234,9 +287,22 @@ module Ace
|
|
|
234
287
|
end
|
|
235
288
|
end
|
|
236
289
|
|
|
290
|
+
def self.normalize_observations(primary, fallback = nil)
|
|
291
|
+
[primary, fallback].each do |value|
|
|
292
|
+
next if value.nil?
|
|
293
|
+
|
|
294
|
+
normalized = value.to_s.strip
|
|
295
|
+
next if normalized.empty? || normalized.casecmp("none").zero?
|
|
296
|
+
|
|
297
|
+
return normalized
|
|
298
|
+
end
|
|
299
|
+
|
|
300
|
+
""
|
|
301
|
+
end
|
|
302
|
+
|
|
237
303
|
private_class_method :parse_markdown, :to_normalized, :extract_field,
|
|
238
304
|
:parse_tc_markdown, :to_tc_normalized, :normalize_status,
|
|
239
|
-
:parse_failed_tcs
|
|
305
|
+
:parse_failed_tcs, :parse_minimal_verifier, :normalize_observations
|
|
240
306
|
end
|
|
241
307
|
end
|
|
242
308
|
end
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require "ace/b36ts"
|
|
3
4
|
require "ace/support/cli"
|
|
4
5
|
require "stringio"
|
|
5
6
|
require "ace/support/cli"
|
|
@@ -23,6 +24,7 @@ module Ace
|
|
|
23
24
|
in the monorepo. Tests run sequentially by default or in parallel
|
|
24
25
|
with --parallel flag. Use --affected to only test changed packages.
|
|
25
26
|
Use --only-failures to re-run only previously failed scenarios.
|
|
27
|
+
Full unfiltered suite runs retry failed scenarios once by default.
|
|
26
28
|
Optionally filter to specific packages with a comma-separated list.
|
|
27
29
|
|
|
28
30
|
Output:
|
|
@@ -40,6 +42,8 @@ module Ace
|
|
|
40
42
|
"--affected --parallel 8 # Parallel affected tests only",
|
|
41
43
|
"--only-failures # Re-run failed scenarios from cache",
|
|
42
44
|
"--affected --only-failures # Re-run failed scenarios in affected packages",
|
|
45
|
+
"--no-retry-failures-once # Disable default retry for a full suite run",
|
|
46
|
+
"--prune-artifacts # Remove stale .ace-local/test-e2e artifacts before running",
|
|
43
47
|
"--tags smoke,happy-path # Include scenarios by tag",
|
|
44
48
|
"--exclude-tags deep # Exclude scenarios by tag",
|
|
45
49
|
"--cli-args dangerously-skip-permissions # Pass args to provider"
|
|
@@ -50,6 +54,8 @@ module Ace
|
|
|
50
54
|
option :affected, type: :boolean, desc: "Only test affected packages"
|
|
51
55
|
option :only_failures, type: :boolean,
|
|
52
56
|
desc: "Re-run only previously failed scenarios"
|
|
57
|
+
option :retry_failures_once, type: :boolean,
|
|
58
|
+
desc: "Retry failed scenarios once after a full unfiltered suite run"
|
|
53
59
|
option :cli_args, type: :string,
|
|
54
60
|
desc: "Extra args for CLI-based LLM providers"
|
|
55
61
|
option :provider, type: :string, default: Molecules::ConfigLoader.default_provider,
|
|
@@ -61,6 +67,8 @@ module Ace
|
|
|
61
67
|
option :progress, type: :boolean, desc: "Enable live animated display"
|
|
62
68
|
option :verify, type: :boolean,
|
|
63
69
|
desc: "Run independent verifier pass for each scenario"
|
|
70
|
+
option :prune_artifacts, type: :boolean,
|
|
71
|
+
desc: "Remove stale .ace-local/test-e2e artifacts before running (preserves suite reports and runtime-cache)"
|
|
64
72
|
option :quiet, type: :boolean, aliases: %w[-q], desc: "Suppress non-essential output"
|
|
65
73
|
option :verbose, type: :boolean, aliases: %w[-v], desc: "Show verbose output"
|
|
66
74
|
option :debug, type: :boolean, aliases: %w[-d], desc: "Show debug output"
|
|
@@ -69,21 +77,36 @@ module Ace
|
|
|
69
77
|
options = coerce_types(options, parallel: :integer, timeout: :integer)
|
|
70
78
|
|
|
71
79
|
parallel = options[:parallel]
|
|
72
|
-
affected = options[:affected]
|
|
73
|
-
only_failures = options[:only_failures]
|
|
80
|
+
affected = !!options[:affected]
|
|
81
|
+
only_failures = !!options[:only_failures]
|
|
82
|
+
prune_artifacts = !!options[:prune_artifacts]
|
|
74
83
|
tags = parse_csv_list(options[:tags])
|
|
75
84
|
exclude_tags = parse_csv_list(options[:exclude_tags])
|
|
85
|
+
if only_failures && prune_artifacts
|
|
86
|
+
raise Ace::Support::Cli::Error.new(
|
|
87
|
+
"--prune-artifacts cannot be used with --only-failures"
|
|
88
|
+
)
|
|
89
|
+
end
|
|
90
|
+
retry_failures_once = resolve_retry_failures_once(
|
|
91
|
+
requested: options[:retry_failures_once],
|
|
92
|
+
packages: packages,
|
|
93
|
+
affected: affected,
|
|
94
|
+
only_failures: only_failures,
|
|
95
|
+
tags: tags,
|
|
96
|
+
exclude_tags: exclude_tags
|
|
97
|
+
)
|
|
76
98
|
|
|
77
99
|
output = quiet?(options) ? StringIO.new : $stdout
|
|
78
100
|
progress = options[:progress] && !quiet?(options)
|
|
101
|
+
prune_artifacts_if_requested(output: output, prune_artifacts: prune_artifacts, quiet: quiet?(options))
|
|
79
102
|
|
|
80
|
-
orchestrator =
|
|
103
|
+
orchestrator = build_orchestrator(
|
|
81
104
|
max_parallel: [parallel, 1].max,
|
|
82
105
|
output: output,
|
|
83
106
|
progress: progress
|
|
84
107
|
)
|
|
85
108
|
|
|
86
|
-
|
|
109
|
+
run_options = {
|
|
87
110
|
parallel: parallel > 0,
|
|
88
111
|
affected: affected,
|
|
89
112
|
only_failures: only_failures,
|
|
@@ -94,6 +117,13 @@ module Ace
|
|
|
94
117
|
tags: tags,
|
|
95
118
|
exclude_tags: exclude_tags,
|
|
96
119
|
verify: options[:verify]
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
results = run_suite_with_retry(
|
|
123
|
+
orchestrator,
|
|
124
|
+
run_options: run_options,
|
|
125
|
+
output: output,
|
|
126
|
+
retry_failures_once: retry_failures_once
|
|
97
127
|
)
|
|
98
128
|
|
|
99
129
|
if results[:total].zero?
|
|
@@ -110,13 +140,173 @@ module Ace
|
|
|
110
140
|
if results[:failed] > 0 || results[:errors] > 0
|
|
111
141
|
failed_count = results[:failed] + results[:errors]
|
|
112
142
|
raise Ace::Support::Cli::Error.new(
|
|
113
|
-
"#{failed_count} test(s) failed or errored"
|
|
143
|
+
results[:retry_attempted] ? "#{failed_count} test(s) failed or errored after retry" : "#{failed_count} test(s) failed or errored"
|
|
114
144
|
)
|
|
115
145
|
end
|
|
146
|
+
|
|
147
|
+
results
|
|
116
148
|
end
|
|
117
149
|
|
|
118
150
|
private
|
|
119
151
|
|
|
152
|
+
def build_orchestrator(max_parallel:, output:, progress:)
|
|
153
|
+
Organisms::SuiteOrchestrator.new(
|
|
154
|
+
max_parallel: max_parallel,
|
|
155
|
+
output: output,
|
|
156
|
+
progress: progress
|
|
157
|
+
)
|
|
158
|
+
end
|
|
159
|
+
|
|
160
|
+
def build_retry_report_writer
|
|
161
|
+
Molecules::SuiteReportWriter.new(config: Molecules::ConfigLoader.load)
|
|
162
|
+
end
|
|
163
|
+
|
|
164
|
+
def build_artifact_pruner
|
|
165
|
+
Molecules::ArtifactPruner.new
|
|
166
|
+
end
|
|
167
|
+
|
|
168
|
+
def prune_artifacts_if_requested(output:, prune_artifacts:, quiet:)
|
|
169
|
+
return unless prune_artifacts
|
|
170
|
+
|
|
171
|
+
result = build_artifact_pruner.prune(base_dir: Dir.pwd)
|
|
172
|
+
return if quiet
|
|
173
|
+
|
|
174
|
+
output.puts(
|
|
175
|
+
"Pruned #{result[:deleted_count]} artifact(s) from #{result[:root_display]} (preserved suite reports and runtime-cache)"
|
|
176
|
+
)
|
|
177
|
+
end
|
|
178
|
+
|
|
179
|
+
def run_suite_with_retry(orchestrator, run_options:, output:, retry_failures_once:)
|
|
180
|
+
initial_results = orchestrator.run(run_options)
|
|
181
|
+
annotated = annotate_results(
|
|
182
|
+
initial_results,
|
|
183
|
+
retry_attempted: false,
|
|
184
|
+
attempts: 1,
|
|
185
|
+
flaky_scenarios: [],
|
|
186
|
+
remaining_failures: failure_scenarios(initial_results),
|
|
187
|
+
initial_report_path: initial_results[:report_path],
|
|
188
|
+
retry_report_path: nil,
|
|
189
|
+
report_path: initial_results[:report_path]
|
|
190
|
+
)
|
|
191
|
+
return annotated unless retry_failures_once && suite_failed?(initial_results)
|
|
192
|
+
|
|
193
|
+
output.puts "Retrying failed scenarios once..."
|
|
194
|
+
retry_results = orchestrator.run(run_options.merge(only_failures: true))
|
|
195
|
+
if retry_results[:total].zero?
|
|
196
|
+
raise Ace::Support::Cli::Error.new(
|
|
197
|
+
"Retry pass found no failed test scenarios from attempt 1; aborting instead of silently passing"
|
|
198
|
+
)
|
|
199
|
+
end
|
|
200
|
+
|
|
201
|
+
flaky_scenarios = recovered_flaky_scenarios(initial_results, retry_results)
|
|
202
|
+
remaining_failures = failure_scenarios(retry_results)
|
|
203
|
+
final_report_path = write_retry_summary_report(initial_results, retry_results)
|
|
204
|
+
output.puts "Final Report: #{final_report_path}" if final_report_path
|
|
205
|
+
|
|
206
|
+
if remaining_failures.empty?
|
|
207
|
+
output.puts "#{flaky_scenarios.length} scenario(s) recovered on retry and were marked flaky"
|
|
208
|
+
else
|
|
209
|
+
output.puts "#{remaining_failures.length} scenario(s) still failing after retry"
|
|
210
|
+
end
|
|
211
|
+
|
|
212
|
+
annotate_results(
|
|
213
|
+
retry_results,
|
|
214
|
+
retry_attempted: true,
|
|
215
|
+
attempts: 2,
|
|
216
|
+
flaky_scenarios: flaky_scenarios,
|
|
217
|
+
remaining_failures: remaining_failures,
|
|
218
|
+
initial_report_path: initial_results[:report_path],
|
|
219
|
+
retry_report_path: retry_results[:report_path],
|
|
220
|
+
report_path: final_report_path || retry_results[:report_path]
|
|
221
|
+
)
|
|
222
|
+
end
|
|
223
|
+
|
|
224
|
+
def write_retry_summary_report(initial_results, retry_results)
|
|
225
|
+
build_retry_report_writer.write_retry_summary(
|
|
226
|
+
initial_results: initial_results,
|
|
227
|
+
retry_results: retry_results,
|
|
228
|
+
timestamp: Ace::B36ts.encode(Time.now.utc, format: :"50ms"),
|
|
229
|
+
base_dir: Dir.pwd
|
|
230
|
+
)
|
|
231
|
+
rescue => e
|
|
232
|
+
warn "Warning: Failed to write retry summary report: #{e.message}" if ENV["DEBUG"]
|
|
233
|
+
nil
|
|
234
|
+
end
|
|
235
|
+
|
|
236
|
+
def annotate_results(results, **extra)
|
|
237
|
+
results.merge(extra)
|
|
238
|
+
end
|
|
239
|
+
|
|
240
|
+
def suite_failed?(results)
|
|
241
|
+
results[:failed].to_i > 0 || results[:errors].to_i > 0
|
|
242
|
+
end
|
|
243
|
+
|
|
244
|
+
def failure_scenarios(results)
|
|
245
|
+
scenario_result_index(results)
|
|
246
|
+
.values
|
|
247
|
+
.select { |result| result[:status] != "pass" }
|
|
248
|
+
.map { |result| result[:test_id] }
|
|
249
|
+
.sort
|
|
250
|
+
end
|
|
251
|
+
|
|
252
|
+
def recovered_flaky_scenarios(initial_results, retry_results)
|
|
253
|
+
initial_by_test = scenario_result_index(initial_results)
|
|
254
|
+
retry_by_test = scenario_result_index(retry_results)
|
|
255
|
+
|
|
256
|
+
initial_by_test.each_with_object([]) do |(test_id, initial), flaky|
|
|
257
|
+
next if initial[:status] == "pass"
|
|
258
|
+
|
|
259
|
+
retry_result = retry_by_test[test_id]
|
|
260
|
+
next unless retry_result && retry_result[:status] == "pass"
|
|
261
|
+
|
|
262
|
+
flaky << {
|
|
263
|
+
"test_id" => test_id,
|
|
264
|
+
"initial_status" => initial[:status],
|
|
265
|
+
"retry_status" => retry_result[:status]
|
|
266
|
+
}
|
|
267
|
+
end.sort_by { |entry| entry["test_id"] }
|
|
268
|
+
end
|
|
269
|
+
|
|
270
|
+
def scenario_result_index(results)
|
|
271
|
+
results.fetch(:packages, {}).values.flatten.each_with_object({}) do |result, index|
|
|
272
|
+
test_name = result[:test_name] || result[:test_id] || ""
|
|
273
|
+
test_id = test_name[/\A(TS-[A-Z0-9]+-\d+[a-z]*)/i, 1]&.upcase || test_name
|
|
274
|
+
next if test_id.empty?
|
|
275
|
+
|
|
276
|
+
index[test_id] = {
|
|
277
|
+
test_id: test_id,
|
|
278
|
+
status: result[:status],
|
|
279
|
+
summary: result[:summary],
|
|
280
|
+
error: result[:error]
|
|
281
|
+
}
|
|
282
|
+
end
|
|
283
|
+
end
|
|
284
|
+
|
|
285
|
+
def resolve_retry_failures_once(requested:, packages:, affected:, only_failures:, tags:, exclude_tags:)
|
|
286
|
+
scoped = scoped_suite_run?(
|
|
287
|
+
packages: packages,
|
|
288
|
+
affected: affected,
|
|
289
|
+
only_failures: only_failures,
|
|
290
|
+
tags: tags,
|
|
291
|
+
exclude_tags: exclude_tags
|
|
292
|
+
)
|
|
293
|
+
if requested == true && scoped
|
|
294
|
+
raise Ace::Support::Cli::Error.new(
|
|
295
|
+
"--retry-failures-once is only supported for full unfiltered suite runs"
|
|
296
|
+
)
|
|
297
|
+
end
|
|
298
|
+
|
|
299
|
+
return requested unless requested.nil?
|
|
300
|
+
|
|
301
|
+
!scoped
|
|
302
|
+
end
|
|
303
|
+
|
|
304
|
+
def scoped_suite_run?(packages:, affected:, only_failures:, tags:, exclude_tags:)
|
|
305
|
+
[packages, affected, only_failures].any? ||
|
|
306
|
+
!tags.empty? ||
|
|
307
|
+
!exclude_tags.empty?
|
|
308
|
+
end
|
|
309
|
+
|
|
120
310
|
def parse_csv_list(raw)
|
|
121
311
|
return [] if raw.nil? || raw.strip.empty?
|
|
122
312
|
|
|
@@ -20,9 +20,9 @@ module Ace
|
|
|
20
20
|
desc <<~DESC.strip
|
|
21
21
|
Run E2E tests via LLM execution
|
|
22
22
|
|
|
23
|
-
Discovers and executes
|
|
24
|
-
Tests are sent to an LLM
|
|
25
|
-
structured results.
|
|
23
|
+
Discovers and executes deterministic preflight tests from test/feat
|
|
24
|
+
before TS-* agent scenarios from test/e2e. Tests are sent to an LLM
|
|
25
|
+
provider which executes the scenario steps and returns structured results.
|
|
26
26
|
|
|
27
27
|
Output:
|
|
28
28
|
Exit codes: 0 (all pass), 1 (any fail/error)
|
|
@@ -35,7 +35,8 @@ module Ace
|
|
|
35
35
|
"ace-lint --provider gemini:flash # Use specific provider",
|
|
36
36
|
"ace-lint --provider glite # Use API provider (predict mode)",
|
|
37
37
|
"ace-lint --tags smoke # Run only smoke-tagged scenarios",
|
|
38
|
-
"ace-lint
|
|
38
|
+
"ace-lint --prune-artifacts # Remove stale .ace-local/test-e2e artifacts before running",
|
|
39
|
+
"ace-lint TS-LINT-003 --dry-run # Preview preflight and scenario phases"
|
|
39
40
|
]
|
|
40
41
|
|
|
41
42
|
argument :package, required: true, desc: "Package name (e.g., ace-lint)"
|
|
@@ -55,11 +56,13 @@ module Ace
|
|
|
55
56
|
option :report_dir, type: :string,
|
|
56
57
|
desc: "Explicit report directory path (overrides computed path)"
|
|
57
58
|
option :dry_run, type: :boolean,
|
|
58
|
-
desc: "Preview which scenarios would run without executing"
|
|
59
|
+
desc: "Preview which preflight tests and scenarios would run without executing"
|
|
59
60
|
option :tags, type: :string,
|
|
60
61
|
desc: "Comma-separated scenario tags to include"
|
|
61
62
|
option :verify, type: :boolean,
|
|
62
63
|
desc: "Run independent verifier pass after runner execution"
|
|
64
|
+
option :prune_artifacts, type: :boolean,
|
|
65
|
+
desc: "Remove stale .ace-local/test-e2e artifacts before running (preserves final reports and runtime-cache)"
|
|
63
66
|
option :quiet, type: :boolean, aliases: %w[-q], desc: "Suppress non-essential output"
|
|
64
67
|
option :verbose, type: :boolean, aliases: %w[-v], desc: "Show verbose output"
|
|
65
68
|
option :debug, type: :boolean, aliases: %w[-d], desc: "Show debug output"
|
|
@@ -67,13 +70,22 @@ module Ace
|
|
|
67
70
|
def call(package:, test_id: nil, **options)
|
|
68
71
|
options = coerce_types(options, timeout: :integer, parallel: :integer)
|
|
69
72
|
output = quiet?(options) ? StringIO.new : $stdout
|
|
73
|
+
prune_artifacts = !!options[:prune_artifacts]
|
|
74
|
+
|
|
75
|
+
if options[:dry_run] && prune_artifacts
|
|
76
|
+
raise Ace::Support::Cli::Error.new(
|
|
77
|
+
"--prune-artifacts cannot be used with --dry-run"
|
|
78
|
+
)
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
prune_artifacts_if_requested(output: output, prune_artifacts: prune_artifacts, quiet: quiet?(options))
|
|
70
82
|
|
|
71
83
|
# Handle dry-run mode
|
|
72
84
|
if options[:dry_run]
|
|
73
85
|
return handle_dry_run(package, test_id, output, tags: parse_tags(options[:tags]))
|
|
74
86
|
end
|
|
75
87
|
|
|
76
|
-
orchestrator =
|
|
88
|
+
orchestrator = build_orchestrator(
|
|
77
89
|
provider: options[:provider],
|
|
78
90
|
timeout: options[:timeout],
|
|
79
91
|
parallel: options[:parallel],
|
|
@@ -110,7 +122,31 @@ module Ace
|
|
|
110
122
|
|
|
111
123
|
private
|
|
112
124
|
|
|
113
|
-
|
|
125
|
+
def build_orchestrator(provider:, timeout:, parallel:, progress:)
|
|
126
|
+
Organisms::TestOrchestrator.new(
|
|
127
|
+
provider: provider,
|
|
128
|
+
timeout: timeout,
|
|
129
|
+
parallel: parallel,
|
|
130
|
+
progress: progress
|
|
131
|
+
)
|
|
132
|
+
end
|
|
133
|
+
|
|
134
|
+
def build_artifact_pruner
|
|
135
|
+
Molecules::ArtifactPruner.new
|
|
136
|
+
end
|
|
137
|
+
|
|
138
|
+
def prune_artifacts_if_requested(output:, prune_artifacts:, quiet:)
|
|
139
|
+
return unless prune_artifacts
|
|
140
|
+
|
|
141
|
+
result = build_artifact_pruner.prune(base_dir: Dir.pwd)
|
|
142
|
+
return if quiet
|
|
143
|
+
|
|
144
|
+
output.puts(
|
|
145
|
+
"Pruned #{result[:deleted_count]} artifact(s) from #{result[:root_display]} (preserved final reports and runtime-cache)"
|
|
146
|
+
)
|
|
147
|
+
end
|
|
148
|
+
|
|
149
|
+
# Handle dry-run mode: preview which preflight tests and scenarios would run
|
|
114
150
|
#
|
|
115
151
|
# @param package [String] Package name
|
|
116
152
|
# @param test_id [String, nil] Test ID
|
|
@@ -125,15 +161,28 @@ module Ace
|
|
|
125
161
|
tags: tags,
|
|
126
162
|
base_dir: Dir.pwd
|
|
127
163
|
)
|
|
128
|
-
|
|
164
|
+
preflight_files = discoverer.find_integration_tests(package: package, base_dir: Dir.pwd)
|
|
165
|
+
if files.empty? && preflight_files.empty?
|
|
129
166
|
raise Ace::Support::Cli::Error.new(
|
|
130
167
|
"No tests found for package '#{package}'" +
|
|
131
168
|
(test_id ? " with ID '#{test_id}'" : "")
|
|
132
169
|
)
|
|
133
170
|
end
|
|
134
171
|
|
|
135
|
-
output.puts "Dry run: preview of
|
|
172
|
+
output.puts "Dry run: preview of execution phases"
|
|
173
|
+
output.puts ""
|
|
174
|
+
output.puts "Phase 1: deterministic preflight"
|
|
175
|
+
if preflight_files.empty?
|
|
176
|
+
output.puts " (none)"
|
|
177
|
+
else
|
|
178
|
+
preflight_files.each do |file|
|
|
179
|
+
output.puts " [preflight] #{file}"
|
|
180
|
+
end
|
|
181
|
+
end
|
|
136
182
|
output.puts ""
|
|
183
|
+
output.puts "Phase 2: scenarios"
|
|
184
|
+
output.puts " (none)" if files.empty?
|
|
185
|
+
output.puts "" unless files.empty?
|
|
137
186
|
|
|
138
187
|
files.each do |file|
|
|
139
188
|
scenario = loader.load(File.dirname(file))
|
|
@@ -9,7 +9,8 @@ module Ace
|
|
|
9
9
|
# Contains parsed frontmatter metadata and the full markdown body
|
|
10
10
|
# from an independent test case file within a scenario directory.
|
|
11
11
|
class TestCase
|
|
12
|
-
attr_reader :tc_id, :title, :content, :file_path, :pending, :goal_format
|
|
12
|
+
attr_reader :tc_id, :title, :content, :file_path, :pending, :goal_format,
|
|
13
|
+
:declared_artifacts, :optional_artifacts
|
|
13
14
|
|
|
14
15
|
# @param tc_id [String] Test case identifier (e.g., "TC-001")
|
|
15
16
|
# @param title [String] Test case title from frontmatter
|
|
@@ -17,13 +18,18 @@ module Ace
|
|
|
17
18
|
# @param file_path [String] Absolute path to the source test file
|
|
18
19
|
# @param pending [String, nil] Pending reason (presence = pending, value = reason)
|
|
19
20
|
# @param goal_format [String, nil] Test case source format ("standalone")
|
|
20
|
-
|
|
21
|
+
# @param declared_artifacts [Array<String>] Required artifact paths under results/tc/*
|
|
22
|
+
# @param optional_artifacts [Array<String>] Optional artifact paths under results/tc/*
|
|
23
|
+
def initialize(tc_id:, title:, content:, file_path:, pending: nil, goal_format: nil,
|
|
24
|
+
declared_artifacts: [], optional_artifacts: [])
|
|
21
25
|
@tc_id = tc_id
|
|
22
26
|
@title = title
|
|
23
27
|
@content = content
|
|
24
28
|
@file_path = file_path
|
|
25
29
|
@pending = pending
|
|
26
30
|
@goal_format = goal_format
|
|
31
|
+
@declared_artifacts = declared_artifacts
|
|
32
|
+
@optional_artifacts = optional_artifacts
|
|
27
33
|
end
|
|
28
34
|
|
|
29
35
|
# Whether this test case is pending (should be skipped)
|
|
@@ -10,7 +10,7 @@ module Ace
|
|
|
10
10
|
# from executing a test scenario via LLM.
|
|
11
11
|
class TestResult
|
|
12
12
|
attr_reader :test_id, :status, :test_cases, :summary,
|
|
13
|
-
:started_at, :completed_at, :report_dir, :error
|
|
13
|
+
:started_at, :completed_at, :report_dir, :error, :metadata, :observations
|
|
14
14
|
|
|
15
15
|
# @param test_id [String] Test identifier
|
|
16
16
|
# @param status [String] Overall status: "pass", "fail", "partial", "error"
|
|
@@ -20,8 +20,10 @@ module Ace
|
|
|
20
20
|
# @param completed_at [Time] When execution completed
|
|
21
21
|
# @param report_dir [String, nil] Path to the reports directory
|
|
22
22
|
# @param error [String, nil] Error message if execution failed
|
|
23
|
+
# @param observations [String] Runner/verifier observations for report context
|
|
24
|
+
# @param metadata [Hash] Additional structured phase/report metadata
|
|
23
25
|
def initialize(test_id:, status:, test_cases: [], summary: "",
|
|
24
|
-
started_at: nil, completed_at: nil, report_dir: nil, error: nil)
|
|
26
|
+
started_at: nil, completed_at: nil, report_dir: nil, error: nil, observations: "", metadata: {})
|
|
25
27
|
@test_id = test_id
|
|
26
28
|
@status = status
|
|
27
29
|
@test_cases = test_cases
|
|
@@ -30,6 +32,8 @@ module Ace
|
|
|
30
32
|
@completed_at = completed_at || Time.now
|
|
31
33
|
@report_dir = report_dir
|
|
32
34
|
@error = error
|
|
35
|
+
@observations = observations.to_s
|
|
36
|
+
@metadata = metadata
|
|
33
37
|
end
|
|
34
38
|
|
|
35
39
|
# Check if the test passed
|
|
@@ -94,7 +98,9 @@ module Ace
|
|
|
94
98
|
started_at: started_at,
|
|
95
99
|
completed_at: completed_at,
|
|
96
100
|
report_dir: dir,
|
|
97
|
-
error: error
|
|
101
|
+
error: error,
|
|
102
|
+
observations: observations,
|
|
103
|
+
metadata: metadata
|
|
98
104
|
)
|
|
99
105
|
end
|
|
100
106
|
|