ace-test-runner-e2e 0.38.11 → 0.40.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +60 -0
- data/handbook/guides/e2e-testing.g.md +35 -3
- data/handbook/guides/scenario-yml-reference.g.md +8 -3
- data/handbook/guides/tc-authoring.g.md +15 -4
- data/handbook/templates/tc-file.template.md +4 -2
- data/handbook/workflow-instructions/e2e/create.wf.md +13 -3
- data/handbook/workflow-instructions/e2e/fix.wf.md +19 -0
- data/handbook/workflow-instructions/e2e/plan-changes.wf.md +16 -0
- data/handbook/workflow-instructions/e2e/review.wf.md +14 -10
- data/handbook/workflow-instructions/e2e/rewrite.wf.md +10 -3
- data/lib/ace/test/end_to_end_runner/atoms/artifact_contract_validator.rb +138 -0
- data/lib/ace/test/end_to_end_runner/cli/commands/run_suite.rb +195 -5
- data/lib/ace/test/end_to_end_runner/cli/commands/run_test.rb +37 -1
- data/lib/ace/test/end_to_end_runner/molecules/artifact_pruner.rb +61 -0
- data/lib/ace/test/end_to_end_runner/molecules/pipeline_executor.rb +90 -14
- data/lib/ace/test/end_to_end_runner/molecules/pipeline_prompt_bundler.rb +43 -5
- data/lib/ace/test/end_to_end_runner/molecules/pipeline_sandbox_builder.rb +7 -5
- data/lib/ace/test/end_to_end_runner/molecules/report_writer.rb +2 -0
- data/lib/ace/test/end_to_end_runner/molecules/sandbox_runtime_builder.rb +101 -9
- data/lib/ace/test/end_to_end_runner/molecules/scenario_loader.rb +28 -30
- data/lib/ace/test/end_to_end_runner/molecules/setup_executor.rb +24 -1
- data/lib/ace/test/end_to_end_runner/molecules/suite_report_writer.rb +182 -1
- data/lib/ace/test/end_to_end_runner/organisms/suite_orchestrator.rb +25 -3
- data/lib/ace/test/end_to_end_runner/organisms/test_orchestrator.rb +44 -5
- data/lib/ace/test/end_to_end_runner/version.rb +1 -1
- data/lib/ace/test/end_to_end_runner.rb +2 -0
- metadata +4 -2
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require "ace/b36ts"
|
|
3
4
|
require "ace/support/cli"
|
|
4
5
|
require "stringio"
|
|
5
6
|
require "ace/support/cli"
|
|
@@ -23,6 +24,7 @@ module Ace
|
|
|
23
24
|
in the monorepo. Tests run sequentially by default or in parallel
|
|
24
25
|
with --parallel flag. Use --affected to only test changed packages.
|
|
25
26
|
Use --only-failures to re-run only previously failed scenarios.
|
|
27
|
+
Full unfiltered suite runs retry failed scenarios once by default.
|
|
26
28
|
Optionally filter to specific packages with a comma-separated list.
|
|
27
29
|
|
|
28
30
|
Output:
|
|
@@ -40,6 +42,8 @@ module Ace
|
|
|
40
42
|
"--affected --parallel 8 # Parallel affected tests only",
|
|
41
43
|
"--only-failures # Re-run failed scenarios from cache",
|
|
42
44
|
"--affected --only-failures # Re-run failed scenarios in affected packages",
|
|
45
|
+
"--no-retry-failures-once # Disable default retry for a full suite run",
|
|
46
|
+
"--prune-artifacts # Remove stale .ace-local/test-e2e artifacts before running",
|
|
43
47
|
"--tags smoke,happy-path # Include scenarios by tag",
|
|
44
48
|
"--exclude-tags deep # Exclude scenarios by tag",
|
|
45
49
|
"--cli-args dangerously-skip-permissions # Pass args to provider"
|
|
@@ -50,6 +54,8 @@ module Ace
|
|
|
50
54
|
option :affected, type: :boolean, desc: "Only test affected packages"
|
|
51
55
|
option :only_failures, type: :boolean,
|
|
52
56
|
desc: "Re-run only previously failed scenarios"
|
|
57
|
+
option :retry_failures_once, type: :boolean,
|
|
58
|
+
desc: "Retry failed scenarios once after a full unfiltered suite run"
|
|
53
59
|
option :cli_args, type: :string,
|
|
54
60
|
desc: "Extra args for CLI-based LLM providers"
|
|
55
61
|
option :provider, type: :string, default: Molecules::ConfigLoader.default_provider,
|
|
@@ -61,6 +67,8 @@ module Ace
|
|
|
61
67
|
option :progress, type: :boolean, desc: "Enable live animated display"
|
|
62
68
|
option :verify, type: :boolean,
|
|
63
69
|
desc: "Run independent verifier pass for each scenario"
|
|
70
|
+
option :prune_artifacts, type: :boolean,
|
|
71
|
+
desc: "Remove stale .ace-local/test-e2e artifacts before running (preserves suite reports and runtime-cache)"
|
|
64
72
|
option :quiet, type: :boolean, aliases: %w[-q], desc: "Suppress non-essential output"
|
|
65
73
|
option :verbose, type: :boolean, aliases: %w[-v], desc: "Show verbose output"
|
|
66
74
|
option :debug, type: :boolean, aliases: %w[-d], desc: "Show debug output"
|
|
@@ -69,21 +77,36 @@ module Ace
|
|
|
69
77
|
options = coerce_types(options, parallel: :integer, timeout: :integer)
|
|
70
78
|
|
|
71
79
|
parallel = options[:parallel]
|
|
72
|
-
affected = options[:affected]
|
|
73
|
-
only_failures = options[:only_failures]
|
|
80
|
+
affected = !!options[:affected]
|
|
81
|
+
only_failures = !!options[:only_failures]
|
|
82
|
+
prune_artifacts = !!options[:prune_artifacts]
|
|
74
83
|
tags = parse_csv_list(options[:tags])
|
|
75
84
|
exclude_tags = parse_csv_list(options[:exclude_tags])
|
|
85
|
+
if only_failures && prune_artifacts
|
|
86
|
+
raise Ace::Support::Cli::Error.new(
|
|
87
|
+
"--prune-artifacts cannot be used with --only-failures"
|
|
88
|
+
)
|
|
89
|
+
end
|
|
90
|
+
retry_failures_once = resolve_retry_failures_once(
|
|
91
|
+
requested: options[:retry_failures_once],
|
|
92
|
+
packages: packages,
|
|
93
|
+
affected: affected,
|
|
94
|
+
only_failures: only_failures,
|
|
95
|
+
tags: tags,
|
|
96
|
+
exclude_tags: exclude_tags
|
|
97
|
+
)
|
|
76
98
|
|
|
77
99
|
output = quiet?(options) ? StringIO.new : $stdout
|
|
78
100
|
progress = options[:progress] && !quiet?(options)
|
|
101
|
+
prune_artifacts_if_requested(output: output, prune_artifacts: prune_artifacts, quiet: quiet?(options))
|
|
79
102
|
|
|
80
|
-
orchestrator =
|
|
103
|
+
orchestrator = build_orchestrator(
|
|
81
104
|
max_parallel: [parallel, 1].max,
|
|
82
105
|
output: output,
|
|
83
106
|
progress: progress
|
|
84
107
|
)
|
|
85
108
|
|
|
86
|
-
|
|
109
|
+
run_options = {
|
|
87
110
|
parallel: parallel > 0,
|
|
88
111
|
affected: affected,
|
|
89
112
|
only_failures: only_failures,
|
|
@@ -94,6 +117,13 @@ module Ace
|
|
|
94
117
|
tags: tags,
|
|
95
118
|
exclude_tags: exclude_tags,
|
|
96
119
|
verify: options[:verify]
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
results = run_suite_with_retry(
|
|
123
|
+
orchestrator,
|
|
124
|
+
run_options: run_options,
|
|
125
|
+
output: output,
|
|
126
|
+
retry_failures_once: retry_failures_once
|
|
97
127
|
)
|
|
98
128
|
|
|
99
129
|
if results[:total].zero?
|
|
@@ -110,13 +140,173 @@ module Ace
|
|
|
110
140
|
if results[:failed] > 0 || results[:errors] > 0
|
|
111
141
|
failed_count = results[:failed] + results[:errors]
|
|
112
142
|
raise Ace::Support::Cli::Error.new(
|
|
113
|
-
"#{failed_count} test(s) failed or errored"
|
|
143
|
+
results[:retry_attempted] ? "#{failed_count} test(s) failed or errored after retry" : "#{failed_count} test(s) failed or errored"
|
|
114
144
|
)
|
|
115
145
|
end
|
|
146
|
+
|
|
147
|
+
results
|
|
116
148
|
end
|
|
117
149
|
|
|
118
150
|
private
|
|
119
151
|
|
|
152
|
+
def build_orchestrator(max_parallel:, output:, progress:)
|
|
153
|
+
Organisms::SuiteOrchestrator.new(
|
|
154
|
+
max_parallel: max_parallel,
|
|
155
|
+
output: output,
|
|
156
|
+
progress: progress
|
|
157
|
+
)
|
|
158
|
+
end
|
|
159
|
+
|
|
160
|
+
def build_retry_report_writer
|
|
161
|
+
Molecules::SuiteReportWriter.new(config: Molecules::ConfigLoader.load)
|
|
162
|
+
end
|
|
163
|
+
|
|
164
|
+
def build_artifact_pruner
|
|
165
|
+
Molecules::ArtifactPruner.new
|
|
166
|
+
end
|
|
167
|
+
|
|
168
|
+
def prune_artifacts_if_requested(output:, prune_artifacts:, quiet:)
|
|
169
|
+
return unless prune_artifacts
|
|
170
|
+
|
|
171
|
+
result = build_artifact_pruner.prune(base_dir: Dir.pwd)
|
|
172
|
+
return if quiet
|
|
173
|
+
|
|
174
|
+
output.puts(
|
|
175
|
+
"Pruned #{result[:deleted_count]} artifact(s) from #{result[:root_display]} (preserved suite reports and runtime-cache)"
|
|
176
|
+
)
|
|
177
|
+
end
|
|
178
|
+
|
|
179
|
+
def run_suite_with_retry(orchestrator, run_options:, output:, retry_failures_once:)
|
|
180
|
+
initial_results = orchestrator.run(run_options)
|
|
181
|
+
annotated = annotate_results(
|
|
182
|
+
initial_results,
|
|
183
|
+
retry_attempted: false,
|
|
184
|
+
attempts: 1,
|
|
185
|
+
flaky_scenarios: [],
|
|
186
|
+
remaining_failures: failure_scenarios(initial_results),
|
|
187
|
+
initial_report_path: initial_results[:report_path],
|
|
188
|
+
retry_report_path: nil,
|
|
189
|
+
report_path: initial_results[:report_path]
|
|
190
|
+
)
|
|
191
|
+
return annotated unless retry_failures_once && suite_failed?(initial_results)
|
|
192
|
+
|
|
193
|
+
output.puts "Retrying failed scenarios once..."
|
|
194
|
+
retry_results = orchestrator.run(run_options.merge(only_failures: true))
|
|
195
|
+
if retry_results[:total].zero?
|
|
196
|
+
raise Ace::Support::Cli::Error.new(
|
|
197
|
+
"Retry pass found no failed test scenarios from attempt 1; aborting instead of silently passing"
|
|
198
|
+
)
|
|
199
|
+
end
|
|
200
|
+
|
|
201
|
+
flaky_scenarios = recovered_flaky_scenarios(initial_results, retry_results)
|
|
202
|
+
remaining_failures = failure_scenarios(retry_results)
|
|
203
|
+
final_report_path = write_retry_summary_report(initial_results, retry_results)
|
|
204
|
+
output.puts "Final Report: #{final_report_path}" if final_report_path
|
|
205
|
+
|
|
206
|
+
if remaining_failures.empty?
|
|
207
|
+
output.puts "#{flaky_scenarios.length} scenario(s) recovered on retry and were marked flaky"
|
|
208
|
+
else
|
|
209
|
+
output.puts "#{remaining_failures.length} scenario(s) still failing after retry"
|
|
210
|
+
end
|
|
211
|
+
|
|
212
|
+
annotate_results(
|
|
213
|
+
retry_results,
|
|
214
|
+
retry_attempted: true,
|
|
215
|
+
attempts: 2,
|
|
216
|
+
flaky_scenarios: flaky_scenarios,
|
|
217
|
+
remaining_failures: remaining_failures,
|
|
218
|
+
initial_report_path: initial_results[:report_path],
|
|
219
|
+
retry_report_path: retry_results[:report_path],
|
|
220
|
+
report_path: final_report_path || retry_results[:report_path]
|
|
221
|
+
)
|
|
222
|
+
end
|
|
223
|
+
|
|
224
|
+
def write_retry_summary_report(initial_results, retry_results)
|
|
225
|
+
build_retry_report_writer.write_retry_summary(
|
|
226
|
+
initial_results: initial_results,
|
|
227
|
+
retry_results: retry_results,
|
|
228
|
+
timestamp: Ace::B36ts.encode(Time.now.utc, format: :"50ms"),
|
|
229
|
+
base_dir: Dir.pwd
|
|
230
|
+
)
|
|
231
|
+
rescue => e
|
|
232
|
+
warn "Warning: Failed to write retry summary report: #{e.message}" if ENV["DEBUG"]
|
|
233
|
+
nil
|
|
234
|
+
end
|
|
235
|
+
|
|
236
|
+
def annotate_results(results, **extra)
|
|
237
|
+
results.merge(extra)
|
|
238
|
+
end
|
|
239
|
+
|
|
240
|
+
def suite_failed?(results)
|
|
241
|
+
results[:failed].to_i > 0 || results[:errors].to_i > 0
|
|
242
|
+
end
|
|
243
|
+
|
|
244
|
+
def failure_scenarios(results)
|
|
245
|
+
scenario_result_index(results)
|
|
246
|
+
.values
|
|
247
|
+
.select { |result| result[:status] != "pass" }
|
|
248
|
+
.map { |result| result[:test_id] }
|
|
249
|
+
.sort
|
|
250
|
+
end
|
|
251
|
+
|
|
252
|
+
def recovered_flaky_scenarios(initial_results, retry_results)
|
|
253
|
+
initial_by_test = scenario_result_index(initial_results)
|
|
254
|
+
retry_by_test = scenario_result_index(retry_results)
|
|
255
|
+
|
|
256
|
+
initial_by_test.each_with_object([]) do |(test_id, initial), flaky|
|
|
257
|
+
next if initial[:status] == "pass"
|
|
258
|
+
|
|
259
|
+
retry_result = retry_by_test[test_id]
|
|
260
|
+
next unless retry_result && retry_result[:status] == "pass"
|
|
261
|
+
|
|
262
|
+
flaky << {
|
|
263
|
+
"test_id" => test_id,
|
|
264
|
+
"initial_status" => initial[:status],
|
|
265
|
+
"retry_status" => retry_result[:status]
|
|
266
|
+
}
|
|
267
|
+
end.sort_by { |entry| entry["test_id"] }
|
|
268
|
+
end
|
|
269
|
+
|
|
270
|
+
def scenario_result_index(results)
|
|
271
|
+
results.fetch(:packages, {}).values.flatten.each_with_object({}) do |result, index|
|
|
272
|
+
test_name = result[:test_name] || result[:test_id] || ""
|
|
273
|
+
test_id = test_name[/\A(TS-[A-Z0-9]+-\d+[a-z]*)/i, 1]&.upcase || test_name
|
|
274
|
+
next if test_id.empty?
|
|
275
|
+
|
|
276
|
+
index[test_id] = {
|
|
277
|
+
test_id: test_id,
|
|
278
|
+
status: result[:status],
|
|
279
|
+
summary: result[:summary],
|
|
280
|
+
error: result[:error]
|
|
281
|
+
}
|
|
282
|
+
end
|
|
283
|
+
end
|
|
284
|
+
|
|
285
|
+
def resolve_retry_failures_once(requested:, packages:, affected:, only_failures:, tags:, exclude_tags:)
|
|
286
|
+
scoped = scoped_suite_run?(
|
|
287
|
+
packages: packages,
|
|
288
|
+
affected: affected,
|
|
289
|
+
only_failures: only_failures,
|
|
290
|
+
tags: tags,
|
|
291
|
+
exclude_tags: exclude_tags
|
|
292
|
+
)
|
|
293
|
+
if requested == true && scoped
|
|
294
|
+
raise Ace::Support::Cli::Error.new(
|
|
295
|
+
"--retry-failures-once is only supported for full unfiltered suite runs"
|
|
296
|
+
)
|
|
297
|
+
end
|
|
298
|
+
|
|
299
|
+
return requested unless requested.nil?
|
|
300
|
+
|
|
301
|
+
!scoped
|
|
302
|
+
end
|
|
303
|
+
|
|
304
|
+
def scoped_suite_run?(packages:, affected:, only_failures:, tags:, exclude_tags:)
|
|
305
|
+
[packages, affected, only_failures].any? ||
|
|
306
|
+
!tags.empty? ||
|
|
307
|
+
!exclude_tags.empty?
|
|
308
|
+
end
|
|
309
|
+
|
|
120
310
|
def parse_csv_list(raw)
|
|
121
311
|
return [] if raw.nil? || raw.strip.empty?
|
|
122
312
|
|
|
@@ -35,6 +35,7 @@ module Ace
|
|
|
35
35
|
"ace-lint --provider gemini:flash # Use specific provider",
|
|
36
36
|
"ace-lint --provider glite # Use API provider (predict mode)",
|
|
37
37
|
"ace-lint --tags smoke # Run only smoke-tagged scenarios",
|
|
38
|
+
"ace-lint --prune-artifacts # Remove stale .ace-local/test-e2e artifacts before running",
|
|
38
39
|
"ace-lint TS-LINT-003 --dry-run # Preview preflight and scenario phases"
|
|
39
40
|
]
|
|
40
41
|
|
|
@@ -60,6 +61,8 @@ module Ace
|
|
|
60
61
|
desc: "Comma-separated scenario tags to include"
|
|
61
62
|
option :verify, type: :boolean,
|
|
62
63
|
desc: "Run independent verifier pass after runner execution"
|
|
64
|
+
option :prune_artifacts, type: :boolean,
|
|
65
|
+
desc: "Remove stale .ace-local/test-e2e artifacts before running (preserves final reports and runtime-cache)"
|
|
63
66
|
option :quiet, type: :boolean, aliases: %w[-q], desc: "Suppress non-essential output"
|
|
64
67
|
option :verbose, type: :boolean, aliases: %w[-v], desc: "Show verbose output"
|
|
65
68
|
option :debug, type: :boolean, aliases: %w[-d], desc: "Show debug output"
|
|
@@ -67,13 +70,22 @@ module Ace
|
|
|
67
70
|
def call(package:, test_id: nil, **options)
|
|
68
71
|
options = coerce_types(options, timeout: :integer, parallel: :integer)
|
|
69
72
|
output = quiet?(options) ? StringIO.new : $stdout
|
|
73
|
+
prune_artifacts = !!options[:prune_artifacts]
|
|
74
|
+
|
|
75
|
+
if options[:dry_run] && prune_artifacts
|
|
76
|
+
raise Ace::Support::Cli::Error.new(
|
|
77
|
+
"--prune-artifacts cannot be used with --dry-run"
|
|
78
|
+
)
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
prune_artifacts_if_requested(output: output, prune_artifacts: prune_artifacts, quiet: quiet?(options))
|
|
70
82
|
|
|
71
83
|
# Handle dry-run mode
|
|
72
84
|
if options[:dry_run]
|
|
73
85
|
return handle_dry_run(package, test_id, output, tags: parse_tags(options[:tags]))
|
|
74
86
|
end
|
|
75
87
|
|
|
76
|
-
orchestrator =
|
|
88
|
+
orchestrator = build_orchestrator(
|
|
77
89
|
provider: options[:provider],
|
|
78
90
|
timeout: options[:timeout],
|
|
79
91
|
parallel: options[:parallel],
|
|
@@ -110,6 +122,30 @@ module Ace
|
|
|
110
122
|
|
|
111
123
|
private
|
|
112
124
|
|
|
125
|
+
def build_orchestrator(provider:, timeout:, parallel:, progress:)
|
|
126
|
+
Organisms::TestOrchestrator.new(
|
|
127
|
+
provider: provider,
|
|
128
|
+
timeout: timeout,
|
|
129
|
+
parallel: parallel,
|
|
130
|
+
progress: progress
|
|
131
|
+
)
|
|
132
|
+
end
|
|
133
|
+
|
|
134
|
+
def build_artifact_pruner
|
|
135
|
+
Molecules::ArtifactPruner.new
|
|
136
|
+
end
|
|
137
|
+
|
|
138
|
+
def prune_artifacts_if_requested(output:, prune_artifacts:, quiet:)
|
|
139
|
+
return unless prune_artifacts
|
|
140
|
+
|
|
141
|
+
result = build_artifact_pruner.prune(base_dir: Dir.pwd)
|
|
142
|
+
return if quiet
|
|
143
|
+
|
|
144
|
+
output.puts(
|
|
145
|
+
"Pruned #{result[:deleted_count]} artifact(s) from #{result[:root_display]} (preserved final reports and runtime-cache)"
|
|
146
|
+
)
|
|
147
|
+
end
|
|
148
|
+
|
|
113
149
|
# Handle dry-run mode: preview which preflight tests and scenarios would run
|
|
114
150
|
#
|
|
115
151
|
# @param package [String] Package name
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "fileutils"
|
|
4
|
+
|
|
5
|
+
module Ace
|
|
6
|
+
module Test
|
|
7
|
+
module EndToEndRunner
|
|
8
|
+
module Molecules
|
|
9
|
+
# Prunes stale E2E run artifacts while preserving suite reports and runtime cache.
|
|
10
|
+
class ArtifactPruner
|
|
11
|
+
ROOT_RELATIVE_PATH = File.join(".ace-local", "test-e2e")
|
|
12
|
+
PRESERVED_DIRECTORY_NAMES = %w[runtime-cache].freeze
|
|
13
|
+
PRESERVED_FILE_PATTERNS = [
|
|
14
|
+
/-suite-report\.md\z/,
|
|
15
|
+
/-suite-final-report\.md\z/
|
|
16
|
+
].freeze
|
|
17
|
+
|
|
18
|
+
def prune(base_dir: Dir.pwd)
|
|
19
|
+
root = File.join(File.expand_path(base_dir), ROOT_RELATIVE_PATH)
|
|
20
|
+
return summary(root, [], []) unless Dir.exist?(root)
|
|
21
|
+
|
|
22
|
+
removed_paths = []
|
|
23
|
+
preserved_paths = []
|
|
24
|
+
|
|
25
|
+
Dir.children(root).sort.each do |entry|
|
|
26
|
+
path = File.join(root, entry)
|
|
27
|
+
if preserve_entry?(entry, path)
|
|
28
|
+
preserved_paths << path
|
|
29
|
+
else
|
|
30
|
+
FileUtils.rm_rf(path)
|
|
31
|
+
removed_paths << path
|
|
32
|
+
end
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
summary(root, removed_paths, preserved_paths)
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
private
|
|
39
|
+
|
|
40
|
+
def preserve_entry?(entry, path)
|
|
41
|
+
return true if File.directory?(path) && PRESERVED_DIRECTORY_NAMES.include?(entry)
|
|
42
|
+
return false unless File.file?(path)
|
|
43
|
+
|
|
44
|
+
PRESERVED_FILE_PATTERNS.any? { |pattern| pattern.match?(entry) }
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
def summary(root, removed_paths, preserved_paths)
|
|
48
|
+
{
|
|
49
|
+
root: root,
|
|
50
|
+
root_display: ROOT_RELATIVE_PATH,
|
|
51
|
+
removed_paths: removed_paths,
|
|
52
|
+
preserved_paths: preserved_paths,
|
|
53
|
+
deleted_count: removed_paths.length,
|
|
54
|
+
preserved_count: preserved_paths.length
|
|
55
|
+
}
|
|
56
|
+
end
|
|
57
|
+
end
|
|
58
|
+
end
|
|
59
|
+
end
|
|
60
|
+
end
|
|
61
|
+
end
|
|
@@ -69,7 +69,8 @@ module Ace
|
|
|
69
69
|
runner = @prompt_bundler.prepare_runner(
|
|
70
70
|
scenario: scenario,
|
|
71
71
|
sandbox_path: sandbox_path,
|
|
72
|
-
test_cases: test_cases
|
|
72
|
+
test_cases: test_cases,
|
|
73
|
+
artifact_contract: declared_artifact_contract(scenario, test_cases: test_cases)
|
|
73
74
|
)
|
|
74
75
|
runner_response = run_llm(
|
|
75
76
|
prompt_path: runner[:prompt_path],
|
|
@@ -78,10 +79,44 @@ module Ace
|
|
|
78
79
|
cli_args: cli_args,
|
|
79
80
|
env_vars: merged_env,
|
|
80
81
|
subprocess_command_prefix: sandbox_backend.command_prefix(chdir: sandbox_path, env: merged_env),
|
|
81
|
-
provider: @provider
|
|
82
|
+
provider: @provider,
|
|
83
|
+
fallback: false
|
|
82
84
|
)
|
|
83
85
|
runner_observations = extract_runner_observations(runner_response[:text])
|
|
84
|
-
|
|
86
|
+
initial_artifact_contract = snapshot_artifacts(
|
|
87
|
+
report_dir,
|
|
88
|
+
sandbox_path,
|
|
89
|
+
scenario,
|
|
90
|
+
test_cases: test_cases,
|
|
91
|
+
snapshot_name: "artifact-snapshot.initial.json"
|
|
92
|
+
)
|
|
93
|
+
artifact_contract = initial_artifact_contract
|
|
94
|
+
|
|
95
|
+
if missing_required_artifacts?(artifact_contract)
|
|
96
|
+
write_command_record(report_dir, "runner-repair", provider: @provider, cli_args: cli_args)
|
|
97
|
+
repair_runner = @prompt_bundler.prepare_runner(
|
|
98
|
+
scenario: scenario,
|
|
99
|
+
sandbox_path: sandbox_path,
|
|
100
|
+
test_cases: test_cases,
|
|
101
|
+
artifact_contract: artifact_contract,
|
|
102
|
+
repair_mode: true
|
|
103
|
+
)
|
|
104
|
+
repair_response = run_llm(
|
|
105
|
+
prompt_path: repair_runner[:prompt_path],
|
|
106
|
+
system_path: repair_runner[:system_path],
|
|
107
|
+
output_path: repair_runner[:output_path],
|
|
108
|
+
cli_args: cli_args,
|
|
109
|
+
env_vars: merged_env,
|
|
110
|
+
subprocess_command_prefix: sandbox_backend.command_prefix(chdir: sandbox_path, env: merged_env),
|
|
111
|
+
provider: @provider,
|
|
112
|
+
fallback: false
|
|
113
|
+
)
|
|
114
|
+
repair_observations = extract_runner_observations(repair_response[:text])
|
|
115
|
+
runner_observations = merge_runner_observations(runner_observations, repair_observations)
|
|
116
|
+
artifact_contract = snapshot_artifacts(report_dir, sandbox_path, scenario, test_cases: test_cases)
|
|
117
|
+
else
|
|
118
|
+
write_artifact_snapshot(report_dir, "artifact-snapshot.json", artifact_contract)
|
|
119
|
+
end
|
|
85
120
|
|
|
86
121
|
verifier = @prompt_bundler.prepare_verifier(
|
|
87
122
|
scenario: scenario,
|
|
@@ -98,7 +133,8 @@ module Ace
|
|
|
98
133
|
cli_args: cli_args,
|
|
99
134
|
env_vars: merged_env,
|
|
100
135
|
subprocess_command_prefix: sandbox_backend.command_prefix(chdir: sandbox_path, env: merged_env),
|
|
101
|
-
provider: @verifier_provider
|
|
136
|
+
provider: @verifier_provider,
|
|
137
|
+
fallback: query_fallback_for(@verifier_provider)
|
|
102
138
|
)
|
|
103
139
|
|
|
104
140
|
@report_generator.generate(
|
|
@@ -111,7 +147,8 @@ module Ace
|
|
|
111
147
|
metadata: base_metadata(
|
|
112
148
|
report_dir,
|
|
113
149
|
runner_observations: runner_observations,
|
|
114
|
-
artifact_contract: artifact_contract
|
|
150
|
+
artifact_contract: artifact_contract,
|
|
151
|
+
initial_artifact_contract: initial_artifact_contract
|
|
115
152
|
)
|
|
116
153
|
)
|
|
117
154
|
rescue => e
|
|
@@ -140,7 +177,7 @@ module Ace
|
|
|
140
177
|
|
|
141
178
|
private
|
|
142
179
|
|
|
143
|
-
def run_llm(prompt_path:, system_path:, output_path:, cli_args:, env_vars:, subprocess_command_prefix:, provider:)
|
|
180
|
+
def run_llm(prompt_path:, system_path:, output_path:, cli_args:, env_vars:, subprocess_command_prefix:, provider:, fallback:)
|
|
144
181
|
prompt = File.read(prompt_path)
|
|
145
182
|
system = File.read(system_path)
|
|
146
183
|
sandbox_dir = env_vars["PROJECT_ROOT_PATH"] || env_vars[:PROJECT_ROOT_PATH]
|
|
@@ -151,7 +188,7 @@ module Ace
|
|
|
151
188
|
system: system,
|
|
152
189
|
cli_args: cli_args,
|
|
153
190
|
timeout: @timeout,
|
|
154
|
-
fallback:
|
|
191
|
+
fallback: fallback,
|
|
155
192
|
output: output_path,
|
|
156
193
|
subprocess_env: env_vars,
|
|
157
194
|
subprocess_command_prefix: subprocess_command_prefix,
|
|
@@ -159,6 +196,10 @@ module Ace
|
|
|
159
196
|
)
|
|
160
197
|
end
|
|
161
198
|
|
|
199
|
+
def query_fallback_for(provider)
|
|
200
|
+
provider.to_s.start_with?("role:")
|
|
201
|
+
end
|
|
202
|
+
|
|
162
203
|
def write_tc_manifests(report_dir, scenario, test_cases:)
|
|
163
204
|
selected = select_test_cases(scenario, test_cases)
|
|
164
205
|
selected.each do |test_case|
|
|
@@ -190,12 +231,18 @@ module Ace
|
|
|
190
231
|
)
|
|
191
232
|
end
|
|
192
233
|
|
|
193
|
-
def snapshot_artifacts(report_dir, sandbox_path, scenario, test_cases:)
|
|
194
|
-
snapshot =
|
|
234
|
+
def snapshot_artifacts(report_dir, sandbox_path, scenario, test_cases:, snapshot_name: "artifact-snapshot.json")
|
|
235
|
+
snapshot = declared_artifact_contract(scenario, test_cases: test_cases, sandbox_path: sandbox_path)
|
|
236
|
+
write_artifact_snapshot(report_dir, snapshot_name, snapshot)
|
|
237
|
+
snapshot
|
|
238
|
+
end
|
|
239
|
+
|
|
240
|
+
def declared_artifact_contract(scenario, test_cases:, sandbox_path: nil)
|
|
241
|
+
select_test_cases(scenario, test_cases).to_h do |test_case|
|
|
195
242
|
required = Array(test_case.declared_artifacts).sort
|
|
196
243
|
optional = Array(test_case.optional_artifacts).sort
|
|
197
|
-
present_required = required
|
|
198
|
-
present_optional = optional
|
|
244
|
+
present_required = present_artifacts(required, sandbox_path)
|
|
245
|
+
present_optional = present_artifacts(optional, sandbox_path)
|
|
199
246
|
missing_required = required - present_required
|
|
200
247
|
|
|
201
248
|
[test_case.tc_id, {
|
|
@@ -207,8 +254,31 @@ module Ace
|
|
|
207
254
|
"present_optional_artifacts" => present_optional
|
|
208
255
|
}]
|
|
209
256
|
end
|
|
210
|
-
|
|
211
|
-
|
|
257
|
+
end
|
|
258
|
+
|
|
259
|
+
def write_artifact_snapshot(report_dir, snapshot_name, snapshot)
|
|
260
|
+
File.write(File.join(report_dir, snapshot_name), JSON.pretty_generate(snapshot))
|
|
261
|
+
end
|
|
262
|
+
|
|
263
|
+
def present_artifacts(paths, sandbox_path)
|
|
264
|
+
return [] unless sandbox_path
|
|
265
|
+
|
|
266
|
+
Array(paths).select { |path| File.exist?(File.join(sandbox_path, path)) }
|
|
267
|
+
end
|
|
268
|
+
|
|
269
|
+
def missing_required_artifacts?(artifact_contract)
|
|
270
|
+
artifact_contract.any? do |_tc_id, entry|
|
|
271
|
+
Array(entry["missing_required_artifacts"]).any?
|
|
272
|
+
end
|
|
273
|
+
end
|
|
274
|
+
|
|
275
|
+
def merge_runner_observations(initial_observations, repair_observations)
|
|
276
|
+
initial = initial_observations.to_s.strip
|
|
277
|
+
repair = repair_observations.to_s.strip
|
|
278
|
+
return initial if repair.empty?
|
|
279
|
+
return repair if initial.empty?
|
|
280
|
+
|
|
281
|
+
"#{initial}\n\nRepair pass:\n#{repair}"
|
|
212
282
|
end
|
|
213
283
|
|
|
214
284
|
def select_test_cases(scenario, test_cases)
|
|
@@ -218,7 +288,7 @@ module Ace
|
|
|
218
288
|
Array(scenario.test_cases).select { |tc| wanted.include?(tc.tc_id.to_s.upcase) }
|
|
219
289
|
end
|
|
220
290
|
|
|
221
|
-
def base_metadata(report_dir, runner_observations: nil, artifact_contract: nil)
|
|
291
|
+
def base_metadata(report_dir, runner_observations: nil, artifact_contract: nil, initial_artifact_contract: nil)
|
|
222
292
|
metadata = {
|
|
223
293
|
"runner_provider" => @provider,
|
|
224
294
|
"verifier_provider" => @verifier_provider,
|
|
@@ -232,6 +302,12 @@ module Ace
|
|
|
232
302
|
Array(entry["missing_required_artifacts"])
|
|
233
303
|
end.reject { |_tc_id, paths| paths.empty? }
|
|
234
304
|
end
|
|
305
|
+
if initial_artifact_contract
|
|
306
|
+
metadata["initial_missing_required_artifacts"] = initial_artifact_contract.to_h.transform_values do |entry|
|
|
307
|
+
Array(entry["missing_required_artifacts"])
|
|
308
|
+
end.reject { |_tc_id, paths| paths.empty? }
|
|
309
|
+
metadata["artifact_repair_attempted"] = true if missing_required_artifacts?(initial_artifact_contract)
|
|
310
|
+
end
|
|
235
311
|
metadata
|
|
236
312
|
end
|
|
237
313
|
|
|
@@ -53,21 +53,23 @@ module Ace
|
|
|
53
53
|
# @param sandbox_path [String]
|
|
54
54
|
# @param test_cases [Array<String>, nil]
|
|
55
55
|
# @return [Hash]
|
|
56
|
-
def prepare_runner(scenario:, sandbox_path:, test_cases: nil)
|
|
56
|
+
def prepare_runner(scenario:, sandbox_path:, test_cases: nil, artifact_contract: nil, repair_mode: false)
|
|
57
57
|
cache_dir = ensure_cache_dir(sandbox_path)
|
|
58
|
-
|
|
59
|
-
|
|
58
|
+
file_prefix = repair_mode ? "runner-repair" : "runner"
|
|
59
|
+
system_path = File.join(cache_dir, "#{file_prefix}-system.md")
|
|
60
|
+
prompt_path = File.join(cache_dir, "#{file_prefix}-prompt.md")
|
|
60
61
|
|
|
61
62
|
File.write(system_path, RUNNER_SYSTEM_PROMPT)
|
|
62
63
|
|
|
63
64
|
bundled = bundle_markdown_file(File.join(scenario.dir_path, "runner.yml.md"), test_cases: test_cases)
|
|
64
65
|
bundled = bundled.gsub("Workspace root: (current directory)", "Workspace root: #{File.expand_path(sandbox_path)}")
|
|
65
|
-
|
|
66
|
+
contract = build_runner_artifact_contract_section(artifact_contract, repair_mode: repair_mode)
|
|
67
|
+
File.write(prompt_path, [bundled, contract].reject(&:empty?).join("\n\n---\n\n"))
|
|
66
68
|
|
|
67
69
|
{
|
|
68
70
|
system_path: system_path,
|
|
69
71
|
prompt_path: prompt_path,
|
|
70
|
-
output_path: File.join(cache_dir, "
|
|
72
|
+
output_path: File.join(cache_dir, "#{file_prefix}-output.md")
|
|
71
73
|
}
|
|
72
74
|
end
|
|
73
75
|
|
|
@@ -251,6 +253,42 @@ module Ace
|
|
|
251
253
|
MARKDOWN
|
|
252
254
|
end
|
|
253
255
|
|
|
256
|
+
def build_runner_artifact_contract_section(artifact_contract, repair_mode:)
|
|
257
|
+
return "" if artifact_contract.nil? || artifact_contract.empty?
|
|
258
|
+
|
|
259
|
+
parts = []
|
|
260
|
+
parts << "# Artifact Contract"
|
|
261
|
+
parts << ""
|
|
262
|
+
if repair_mode
|
|
263
|
+
parts << "This is a bounded repair pass."
|
|
264
|
+
parts << "- Do not rerun goals whose required artifacts are already complete."
|
|
265
|
+
parts << "- For each goal with missing required artifacts, produce only the missing files."
|
|
266
|
+
parts << "- Prefer the minimal real public command needed to create the missing capture set."
|
|
267
|
+
parts << "- If the missing file is supporting evidence copied from an already-generated real artifact, copy that real artifact into `results/`."
|
|
268
|
+
parts << "- Do not invent content, fabricate captures, or rewrite unrelated artifacts."
|
|
269
|
+
else
|
|
270
|
+
parts << "A goal is not complete unless every required artifact for that goal exists on disk under `results/`."
|
|
271
|
+
parts << "- After finishing each goal, self-check the required artifact list below."
|
|
272
|
+
parts << "- If a required artifact is missing, fix it before moving on."
|
|
273
|
+
end
|
|
274
|
+
parts << ""
|
|
275
|
+
|
|
276
|
+
artifact_contract.sort.each do |tc_id, entry|
|
|
277
|
+
parts << "## #{tc_id}"
|
|
278
|
+
parts << ""
|
|
279
|
+
parts << "- Required artifacts: #{format_artifact_list(entry["required_artifacts"])}"
|
|
280
|
+
missing = Array(entry["missing_required_artifacts"])
|
|
281
|
+
unless missing.empty?
|
|
282
|
+
parts << "- Missing required artifacts: #{format_artifact_list(missing)}"
|
|
283
|
+
end
|
|
284
|
+
optional = Array(entry["optional_artifacts"])
|
|
285
|
+
parts << "- Optional artifacts: #{format_artifact_list(optional)}" unless optional.empty?
|
|
286
|
+
parts << ""
|
|
287
|
+
end
|
|
288
|
+
|
|
289
|
+
parts.join("\n").rstrip
|
|
290
|
+
end
|
|
291
|
+
|
|
254
292
|
def build_artifact_contract_section(artifact_contract)
|
|
255
293
|
return "# Artifact Contract\n\n(no snapshot provided)" if artifact_contract.nil? || artifact_contract.empty?
|
|
256
294
|
|