agentf 0.4.6 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,771 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "fileutils"
4
+ require "json"
5
+ require "open3"
6
+ require "rbconfig"
7
+ require "securerandom"
8
+ require "shellwords"
9
+ require "tempfile"
10
+ require "tmpdir"
11
+ require "timeout"
12
+ require "time"
13
+ require_relative "scenario"
14
+ require_relative "../mcp/server"
15
+ require_relative "../installer"
16
+
17
+ module Agentf
18
+ module Evals
19
+ class Runner
20
+ DEFAULT_OUTPUT_ROOT = File.expand_path("../../../tmp/evals", __dir__)
21
+
22
# Builds a runner bound to a scenario directory, an agentf executable, a Ruby
# interpreter and an artifact output directory. All paths are expanded to
# absolute form.
#
# @param root [String, nil] scenario directory; nil means "evals" under the current working directory
# @param agentf_bin [String, nil] agentf executable; nil means bin/agentf three levels above this file
# @param ruby_bin [String] interpreter used to launch agentf
# @param output_root [String, nil] artifact directory; nil falls back to DEFAULT_OUTPUT_ROOT
def initialize(root: nil, agentf_bin: nil, ruby_bin: RbConfig.ruby, output_root: DEFAULT_OUTPUT_ROOT)
  scenario_dir = root || File.join(Dir.pwd, "evals")
  executable = agentf_bin || File.join(__dir__, "../../../bin/agentf")
  artifacts_dir = output_root || DEFAULT_OUTPUT_ROOT

  @root = File.expand_path(scenario_dir)
  @agentf_bin = File.expand_path(executable)
  @ruby_bin = ruby_bin
  @output_root = File.expand_path(artifacts_dir)
end
28
+
29
+ attr_reader :root, :agentf_bin, :ruby_bin, :output_root
30
+
31
# Returns whatever Scenario.discover finds under +root+.
def list
  discovered = Scenario.discover(root)
  discovered
end
34
+
35
# Runs the scenario called +name+ (or every discovered scenario when +name+
# is "all") and returns an aggregate report.
#
# @param name [String] scenario name, or "all"
# @param keep_workspace [Boolean] forwarded to run_scenario
# @param timeout_seconds [Numeric, nil] forwarded to run_scenario
# @return [Hash] string-keyed report with timestamps, pass/fail counts, the
#   provider/model matrix and the per-scenario results
# @raise [ArgumentError] propagated from resolve_scenarios
def run(name:, keep_workspace: false, timeout_seconds: nil)
  selected = resolve_scenarios(name)
  started_at = Time.now.utc
  FileUtils.mkdir_p(output_root)

  results = selected.map do |scenario|
    run_scenario(scenario, keep_workspace: keep_workspace, timeout_seconds: timeout_seconds)
  end
  count_with = ->(wanted) { results.count { |entry| entry["status"] == wanted } }

  report = {
    "root" => root,
    "output_root" => output_root,
    "started_at" => started_at.iso8601,
    "finished_at" => Time.now.utc.iso8601
  }
  report["count"] = results.length
  report["passed"] = count_with.call("passed")
  report["failed"] = count_with.call("failed")
  report["matrix"] = summarize_matrix(results)
  report["results"] = results
  report
end
56
+
57
+ private
58
+
59
# Maps a requested scenario name to the list of scenarios to execute.
#
# The name is compared as a String on both sides, so a Symbol works for
# individual scenarios as well as for "all".
#
# @param name [String, Symbol] scenario name, or "all" for every scenario
# @return [Array] every discovered scenario for "all", otherwise a
#   one-element array holding the matching scenario
# @raise [ArgumentError] when nothing is discovered under +root+ or no
#   scenario carries the requested name
def resolve_scenarios(name)
  available = list
  raise ArgumentError, "No eval scenarios found under #{root}" if available.empty?

  wanted = name.to_s
  return available if wanted == "all"

  match = available.find { |item| item.name.to_s == wanted }
  raise ArgumentError, "Unknown eval scenario: #{name}" unless match

  [match]
end
70
+
71
# Executes one scenario end to end: validate, prepare artifact and workspace
# directories, run the optional setup script, run the scenario's execution
# mode, run the verify script, then hand everything to finalize_result.
#
# A failing setup or execution step short-circuits with a "failed" result
# naming that step. Any StandardError raised along the way is converted into
# a "failed" result whose failure_step is "exception".
#
# @param scenario [Object] scenario to run (must respond to the accessors used below)
# @param keep_workspace [Boolean] forwarded to finalize_result
# @param timeout_seconds [Numeric, nil] overrides scenario.timeout_seconds when given
# @return [Hash] the result produced by finalize_result
def run_scenario(scenario, keep_workspace:, timeout_seconds: nil)
  # Declared up front so the rescue clause can report whatever progress was
  # made before the failure.
  artifact_dir = nil
  workspace = nil
  setup_result = nil
  execution_result = nil

  scenario.validate!
  artifact_dir = create_artifact_dir(scenario.name)
  workspace = Dir.mktmpdir("agentf-eval-#{scenario.name}-")
  copy_workspace_fixture(scenario, workspace)

  env = build_env(scenario, workspace, artifact_dir)
  effective_timeout = timeout_seconds || scenario.timeout_seconds
  step_args = { workspace: workspace, artifact_dir: artifact_dir, env: env, timeout_seconds: effective_timeout }
  outcome_args = { scenario: scenario, artifact_dir: artifact_dir, workspace: workspace, keep_workspace: keep_workspace }

  setup_result = run_optional_script(script_path: scenario.setup_script_path, step_name: "setup", **step_args)
  if setup_result["status"] == "failed"
    return finalize_result(
      **outcome_args,
      status: "failed",
      setup: setup_result,
      agent_run: nil,
      verify: nil,
      failure_step: "setup"
    )
  end

  execution_result = run_execution(scenario: scenario, **step_args)
  if execution_result["status"] == "failed"
    return finalize_result(
      **outcome_args,
      status: "failed",
      setup: setup_result,
      agent_run: execution_result,
      verify: nil,
      failure_step: scenario.execution_mode
    )
  end

  verify_result = run_required_script(script_path: scenario.verify_script_path, step_name: "verify", **step_args)
  verified = verify_result["status"] == "passed"

  finalize_result(
    **outcome_args,
    status: verified ? "passed" : "failed",
    setup: setup_result,
    agent_run: execution_result,
    verify: verify_result,
    failure_step: verified ? nil : "verify"
  )
rescue StandardError => e
  # The failure may have happened before the artifact directory existed
  # (e.g. validate! raised); finalize_result needs one to write summary.json.
  artifact_dir ||= create_artifact_dir(scenario.name)

  finalize_result(
    scenario: scenario,
    artifact_dir: artifact_dir,
    workspace: workspace,
    keep_workspace: keep_workspace,
    status: "failed",
    setup: setup_result,
    # Keep the execution result when the exception happened after that step.
    agent_run: execution_result,
    verify: nil,
    failure_step: "exception",
    error: {
      "class" => e.class.name,
      "message" => e.message,
      "backtrace" => Array(e.backtrace).first(8)
    }
  )
end
163
+
164
# Creates (if needed) and returns "<output_root>/<UTC timestamp>-<scenario_name>".
# The timestamp has one-second resolution (YYYYMMDDHHMMSS).
#
# @param scenario_name [String]
# @return [String] path of the artifact directory
def create_artifact_dir(scenario_name)
  stamp = Time.now.utc.strftime("%Y%m%d%H%M%S")
  File.join(output_root, "#{stamp}-#{scenario_name}").tap { |dir| FileUtils.mkdir_p(dir) }
end
170
+
171
# Copies the contents of the scenario's fixture directory into +workspace+.
# Does nothing when the scenario has no workspace_path.
#
# @param scenario [Object] responds to #workspace_path
# @param workspace [String] destination directory
def copy_workspace_fixture(scenario, workspace)
  fixture = scenario.workspace_path
  # The trailing "." copies the directory's contents rather than the directory itself.
  FileUtils.cp_r(File.join(fixture, "."), workspace) if fixture
end
176
+
177
# Assembles the environment variables handed to every scenario step.
# Entries from scenario.env are merged last, so they win on key collisions.
#
# @param scenario [Object] responds to #name, #path, #env and #auto_confirm_memories?
# @param workspace [String] scenario working directory
# @param artifact_dir [String] directory receiving logs and result files
# @return [Hash{String => String}]
def build_env(scenario, workspace, artifact_dir)
  # A random suffix gives each run its own project name.
  project_name = "agentf-eval-#{scenario.name}-#{SecureRandom.hex(4)}"
  artifact = ->(file_name) { File.join(artifact_dir, file_name) }

  defaults = {}
  defaults["AGENTF_PROJECT_NAME"] = project_name
  defaults["AGENTF_AUTO_CONFIRM_MEMORIES"] = scenario.auto_confirm_memories?.to_s
  defaults["AGENTF_GEM_PATH"] = File.expand_path("../../..", __dir__)
  defaults["AGENTF_EVAL_SCENARIO"] = scenario.name
  defaults["AGENTF_EVAL_SCENARIO_DIR"] = scenario.path
  defaults["AGENTF_EVAL_WORKDIR"] = workspace
  defaults["AGENTF_EVAL_ARTIFACT_DIR"] = artifact_dir
  defaults["AGENTF_EVAL_AGENTF_BIN"] = agentf_bin
  defaults["AGENTF_EVAL_RUBY"] = ruby_bin
  defaults["AGENTF_EVAL_RESULT_JSON"] = artifact.call("agent_result.json")
  defaults["AGENTF_EVAL_STDOUT"] = artifact.call("agent_stdout.log")
  defaults["AGENTF_EVAL_STDERR"] = artifact.call("agent_stderr.log")
  defaults["AGENTF_EVAL_HISTORY_PATH"] = File.join(output_root, "history.jsonl")

  defaults.merge(scenario.env)
end
196
+
197
# Runs +script_path+ through run_script, or reports a skipped, passing step
# when no script path is given.
#
# @return [Hash] step result
def run_optional_script(script_path:, step_name:, workspace:, artifact_dir:, env:, timeout_seconds:)
  if script_path
    run_script(
      script_path: script_path,
      step_name: step_name,
      workspace: workspace,
      artifact_dir: artifact_dir,
      env: env,
      timeout_seconds: timeout_seconds
    )
  else
    { "step" => step_name, "status" => "passed", "skipped" => true }
  end
end
209
+
210
# Runs +script_path+ through run_script unconditionally (no skip handling).
#
# @return [Hash] step result from run_script
def run_required_script(script_path:, step_name:, workspace:, artifact_dir:, env:, timeout_seconds:)
  forwarded = {
    script_path: script_path,
    step_name: step_name,
    workspace: workspace,
    artifact_dir: artifact_dir,
    env: env,
    timeout_seconds: timeout_seconds
  }
  run_script(**forwarded)
end
220
+
221
# Executes +script_path+ with `sh` via execute_command.
#
# @return [Hash] step result from execute_command
def run_script(script_path:, step_name:, workspace:, artifact_dir:, env:, timeout_seconds:)
  execute_command(
    command: ["sh", script_path],
    step_name: step_name,
    workspace: workspace,
    artifact_dir: artifact_dir,
    env: env,
    timeout_seconds: timeout_seconds
  )
end
232
+
233
# Dispatches to the runner that matches scenario.execution_mode.
# The "mcp" runner is called without a timeout; the others receive
# +timeout_seconds+.
#
# @return [Hash] step result from the selected runner
# @raise [ArgumentError] for an unrecognised execution mode
def run_execution(scenario:, workspace:, artifact_dir:, env:, timeout_seconds:)
  shared = { scenario: scenario, workspace: workspace, artifact_dir: artifact_dir, env: env }

  case scenario.execution_mode
  when "agent" then run_agent(**shared, timeout_seconds: timeout_seconds)
  when "mcp" then run_mcp(**shared)
  when "provider" then run_provider(**shared, timeout_seconds: timeout_seconds)
  when "provider_runtime" then run_provider_runtime(**shared, timeout_seconds: timeout_seconds)
  else raise ArgumentError, "Unknown execution mode: #{scenario.execution_mode}"
  end
end
247
+
248
# Invokes `agentf agent <agent> <prompt> --json`, stores the parsed JSON
# output (or the raw stdout when nothing parses) in the result-JSON file, and
# performs the confirmation retry when maybe_retry_agent_confirmation
# decides one is needed.
#
# @return [Hash] the retry result when a retry ran, otherwise the first result
def run_agent(scenario:, workspace:, artifact_dir:, env:, timeout_seconds:)
  first_attempt = execute_command(
    command: [ruby_bin, agentf_bin, "agent", scenario.agent, scenario.prompt, "--json"],
    step_name: "agent",
    workspace: workspace,
    artifact_dir: artifact_dir,
    env: env.merge("AGENTF_SUPPRESS_AGENT_LOGS" => "true"),
    timeout_seconds: timeout_seconds
  )

  payload = extract_json_payload(first_attempt["stdout"])
  persisted = payload || { "raw_stdout" => first_attempt["stdout"] }
  File.write(env.fetch("AGENTF_EVAL_RESULT_JSON"), JSON.pretty_generate(persisted))
  first_attempt["parsed_output"] = payload

  second_attempt = maybe_retry_agent_confirmation(
    scenario: scenario,
    initial_result: first_attempt,
    workspace: workspace,
    artifact_dir: artifact_dir,
    env: env,
    timeout_seconds: timeout_seconds
  )

  second_attempt || first_attempt
end
274
+
275
# Re-runs the agent with --confirmed-write when the scenario allows a retry
# and the first run's parsed output has "confirmation_required" set to true.
# The retry result is also attached to +initial_result+ under "retry", and
# the result-JSON file is overwritten with the retry's output.
#
# @return [Hash, nil] the retry result, or nil when no retry was performed
def maybe_retry_agent_confirmation(scenario:, initial_result:, workspace:, artifact_dir:, env:, timeout_seconds:)
  return nil unless scenario.retry_on_confirmation?

  first_payload = initial_result["parsed_output"]
  return nil unless first_payload.is_a?(Hash) && first_payload["confirmation_required"] == true

  confirmed_flag = "--confirmed-write=#{scenario.confirmed_write_token}"
  second_attempt = execute_command(
    command: [ruby_bin, agentf_bin, "agent", scenario.agent, scenario.prompt, "--json", confirmed_flag],
    step_name: "agent_retry",
    workspace: workspace,
    artifact_dir: artifact_dir,
    env: env.merge("AGENTF_SUPPRESS_AGENT_LOGS" => "true"),
    timeout_seconds: timeout_seconds
  )

  second_payload = extract_json_payload(second_attempt["stdout"])
  second_attempt["parsed_output"] = second_payload
  second_attempt["retry_count"] = 1
  second_attempt["flaky"] = second_attempt["status"] == "passed"
  initial_result["retry"] = second_attempt

  persisted = second_payload || { "raw_stdout" => second_attempt["stdout"] }
  File.write(env.fetch("AGENTF_EVAL_RESULT_JSON"), JSON.pretty_generate(persisted))
  second_attempt
end
298
+
299
# Calls the scenario's MCP tool on an in-process Agentf::MCP::Server and
# records the outcome. Any StandardError raised while building the server or
# calling the tool marks the step as failed and is written to the stderr log.
#
# @return [Hash] step result (status, logs, log paths, parsed output)
def run_mcp(scenario:, workspace:, artifact_dir:, env:)
  started_at = Time.now.utc
  stdout_path = File.join(artifact_dir, "mcp_stdout.log")
  stderr_path = File.join(artifact_dir, "mcp_stderr.log")
  stdout = ""
  stderr = ""
  parsed_output = nil
  status = "passed"

  begin
    project = env.fetch("AGENTF_PROJECT_NAME")
    # The reviewer and the server each receive their own memory instance.
    new_memory = -> { Agentf::Memory::RedisMemory.new(project: project) }
    server = Agentf::MCP::Server.new(
      explorer: Agentf::Commands::Explorer.new(base_path: workspace),
      reviewer: Agentf::Commands::MemoryReviewer.new(project: project, memory: new_memory.call),
      memory: new_memory.call,
      env: ENV.to_h.merge(env)
    )
    parsed_output = call_mcp_tool(server: server, tool_name: scenario.mcp_tool, payload: scenario.prompt_payload)
    stdout = JSON.generate(parsed_output)
  rescue StandardError => e
    status = "failed"
    stderr = "#{e.class}: #{e.message}\n#{Array(e.backtrace).first(8).join("\n")}"
  end

  File.write(stdout_path, stdout)
  File.write(stderr_path, stderr)
  fallback = { "raw_stdout" => stdout, "stderr" => stderr }
  File.write(env.fetch("AGENTF_EVAL_RESULT_JSON"), JSON.pretty_generate(parsed_output || fallback))

  outcome = { "step" => "mcp", "status" => status, "command" => "mcp:#{scenario.mcp_tool}" }
  outcome["exit_code"] = status == "passed" ? 0 : 1
  outcome["started_at"] = started_at.iso8601
  outcome["finished_at"] = Time.now.utc.iso8601
  outcome["stdout"] = stdout
  outcome["stderr"] = stderr
  outcome["stdout_path"] = stdout_path
  outcome["stderr_path"] = stderr_path
  outcome["parsed_output"] = parsed_output
  outcome
end
341
+
342
# Installs the provider manifests, then runs the agentf CLI with the
# arguments derived from the scenario prompt. Returns the install result
# unchanged when installation failed.
#
# @return [Hash] step result, with "parsed_output" and "install" attached
def run_provider(scenario:, workspace:, artifact_dir:, env:, timeout_seconds:)
  install_result = install_provider_manifests(scenario: scenario, workspace: workspace)
  return install_result if install_result["status"] == "failed"

  cli_args = parse_provider_command(scenario.prompt)
  outcome = execute_command(
    command: [ruby_bin, agentf_bin, *cli_args],
    step_name: "provider",
    workspace: workspace,
    artifact_dir: artifact_dir,
    env: env,
    timeout_seconds: timeout_seconds
  )

  payload = extract_json_payload(outcome["stdout"])
  outcome["parsed_output"] = payload
  outcome["install"] = install_result

  persisted = payload || { "raw_stdout" => outcome["stdout"] }
  File.write(env.fetch("AGENTF_EVAL_RESULT_JSON"), JSON.pretty_generate(persisted))
  outcome
end
363
+
364
# Installs the provider manifests, then exercises a provider runtime tool
# through the runner matching scenario.provider_name ("opencode" or
# "copilot"). Returns the install result unchanged when installation failed.
#
# @return [Hash] step result from the provider-specific runner
# @raise [ArgumentError] for any other provider name
def run_provider_runtime(scenario:, workspace:, artifact_dir:, env:, timeout_seconds:)
  install_result = install_provider_manifests(scenario: scenario, workspace: workspace)
  return install_result if install_result["status"] == "failed"

  runtime_args = {
    scenario: scenario,
    workspace: workspace,
    artifact_dir: artifact_dir,
    env: env,
    timeout_seconds: timeout_seconds,
    install_result: install_result
  }

  case scenario.provider_name
  when "opencode" then run_opencode_plugin_tool(**runtime_args)
  when "copilot" then run_copilot_runtime_tool(**runtime_args)
  else raise ArgumentError, "Unsupported provider runtime eval: #{scenario.provider_name}"
  end
end
391
+
392
# Runs +command+ inside +workspace+ with +env+, captures stdout/stderr into
# "<step_name>_stdout.log" / "<step_name>_stderr.log" under +artifact_dir+,
# and returns a step result.
#
# The child is spawned with Open3.popen3 so that a timeout can actually stop
# it: when it has not exited within +timeout_seconds+ it is sent TERM (then
# KILL), the output produced so far is kept, and
# "Command timed out after N seconds" is appended to stderr. Timeouts are
# tracked with a flag rather than by inspecting stderr text, so a command
# that merely prints that phrase is not misreported as failed.
#
# @param command [Array<String>] argv to execute (no shell involved)
# @param step_name [String] used for the "step" field and log file names
# @param workspace [String] working directory of the child
# @param artifact_dir [String] directory receiving the log files
# @param env [Hash{String => String}] extra environment for the child
# @param timeout_seconds [Numeric, nil] nil or a non-positive value means no limit
# @return [Hash] step result; "exit_code" is nil when the command timed out
def execute_command(command:, step_name:, workspace:, artifact_dir:, env:, timeout_seconds:)
  stdout_path = File.join(artifact_dir, "#{step_name}_stdout.log")
  stderr_path = File.join(artifact_dir, "#{step_name}_stderr.log")
  started_at = Time.now.utc

  limit = timeout_seconds.to_f.positive? ? timeout_seconds : nil
  stdout = ""
  stderr = ""
  status = nil
  timed_out = false

  Open3.popen3(env, *command, chdir: workspace) do |stdin, out, err, wait_thr|
    stdin.close
    # Drain both pipes concurrently so the child can never block on a full pipe.
    readers = [out, err].map do |io|
      Thread.new do
        begin
          io.read
        rescue IOError
          ""
        end
      end
    end

    if wait_thr.join(limit)
      status = wait_thr.value
    else
      timed_out = true
      terminate_child(wait_thr)
    end

    # After a timeout a surviving grandchild may still hold the pipes open,
    # so only wait briefly for the readers in that case.
    stdout, stderr = readers.map { |reader| collect_stream(reader, wait: timed_out ? 1 : nil) }
  end

  stderr = [stderr, "Command timed out after #{timeout_seconds} seconds"].compact.join("\n") if timed_out

  File.write(stdout_path, stdout)
  File.write(stderr_path, stderr)

  success = !timed_out && status&.success?

  {
    "step" => step_name,
    "status" => success ? "passed" : "failed",
    "command" => command.map { |part| Shellwords.escape(part.to_s) }.join(" "),
    "exit_code" => status&.exitstatus,
    "started_at" => started_at.iso8601,
    "finished_at" => Time.now.utc.iso8601,
    "stdout" => stdout,
    "stderr" => stderr,
    "stdout_path" => stdout_path,
    "stderr_path" => stderr_path
  }
end

# Stops a child that exceeded its time limit: TERM first, KILL if it is still
# running two seconds later. A child that has already gone is ignored.
def terminate_child(wait_thr)
  Process.kill("TERM", wait_thr.pid)
  return if wait_thr.join(2)

  Process.kill("KILL", wait_thr.pid)
  wait_thr.join
rescue Errno::ESRCH, Errno::ECHILD
  nil
end

# Returns the text gathered by a pipe-reader thread, waiting at most +wait+
# seconds (nil waits until the pipe reaches EOF). A reader that does not
# finish in time is killed and contributes an empty string.
def collect_stream(reader, wait:)
  reader.join(wait) ? reader.value.to_s : ""
ensure
  reader.kill if reader.alive?
end
428
+
429
# Extracts a JSON value from command output.
#
# The whole text is tried first, so pretty-printed (multi-line) JSON is
# returned intact. If that fails, lines are tried from last to first and the
# first one that parses wins, which skips log noise printed before a
# single-line JSON payload.
#
# @param stdout [String, nil] raw command output
# @return [Object, nil] the parsed JSON value, or nil when nothing parses
def extract_json_payload(stdout)
  text = stdout.to_s.strip
  return nil if text.empty?

  begin
    return JSON.parse(text)
  rescue JSON::ParserError
    # Not a single JSON document; fall back to scanning individual lines.
  end

  text.lines.reverse_each do |line|
    candidate = line.strip
    next if candidate.empty?

    begin
      return JSON.parse(candidate)
    rescue JSON::ParserError
      next
    end
  end

  nil
end
441
+
442
# Calls +tool_name+ on the wrapped server object, passing a Hash payload as
# symbol-keyed keyword arguments (any other payload means no arguments), and
# parses the response as JSON.
#
# @return [Object] the parsed response, or { "raw" => response } when a
#   JSON::ParserError is raised
def call_mcp_tool(server:, tool_name:, payload:)
  keyword_args = payload.is_a?(Hash) ? payload.to_h { |key, value| [key.to_sym, value] } : {}
  raw = server.server.call_tool(tool_name, **keyword_args)
  JSON.parse(raw)
rescue JSON::ParserError
  { "raw" => raw }
end
449
+
450
# Tallies total/passed/failed counts per provider and per model across
# +results+. Any status other than "passed" counts as failed.
#
# @param results [Array<Hash>] scenario results with "status", "providers" and "models"
# @return [Hash] { "providers" => {...}, "models" => {...} }
def summarize_matrix(results)
  new_tally = lambda do
    Hash.new { |store, label| store[label] = { "total" => 0, "passed" => 0, "failed" => 0 } }
  end
  by_provider = new_tally.call
  by_model = new_tally.call

  results.each do |result|
    outcome = result["status"] == "passed" ? "passed" : "failed"

    [[by_provider, result["providers"]], [by_model, result["models"]]].each do |tally, labels|
      Array(labels).each do |label|
        entry = tally[label]
        entry["total"] += 1
        entry[outcome] += 1
      end
    end
  end

  { "providers" => by_provider, "models" => by_model }
end
468
+
469
# Runs an OpenCode runtime tool by executing the generated Node driver with
# the tool name and the JSON-encoded "input" of the prompt payload.
#
# @return [Hash] step result, with "parsed_output" and "install" attached
def run_opencode_plugin_tool(scenario:, workspace:, artifact_dir:, env:, timeout_seconds:, install_result:)
  driver_path = ensure_opencode_eval_driver(workspace)
  payload = scenario.prompt_payload
  tool_name = scenario.provider_runtime_tool
  tool_input = payload.is_a?(Hash) ? payload.fetch("input", {}) : {}

  outcome = execute_command(
    command: ["node", driver_path, tool_name, JSON.generate(tool_input)],
    step_name: "provider_runtime",
    workspace: workspace,
    artifact_dir: artifact_dir,
    env: env,
    timeout_seconds: timeout_seconds
  )

  decoded = extract_json_payload(outcome["stdout"])
  outcome["parsed_output"] = decoded
  outcome["install"] = install_result

  persisted = decoded || { "raw_stdout" => outcome["stdout"] }
  File.write(env.fetch("AGENTF_EVAL_RESULT_JSON"), JSON.pretty_generate(persisted))
  outcome
end
491
+
492
# Writes the Node eval driver to
# "<workspace>/.opencode/plugins/agentf-eval-driver.cjs" unless a file is
# already there.
#
# @param workspace [String]
# @return [String] path of the driver script
def ensure_opencode_eval_driver(workspace)
  driver = File.join(workspace, ".opencode", "plugins", "agentf-eval-driver.cjs")

  unless File.exist?(driver)
    FileUtils.mkdir_p(File.dirname(driver))
    File.write(driver, render_opencode_eval_driver)
  end

  driver
end
500
+
501
# Exercises a Copilot runtime tool by calling it on an in-process
# Agentf::MCP::Server with the "input" of the prompt payload. Any
# StandardError raised while building the server or calling the tool marks
# the step as failed and is written to the stderr log.
#
# @return [Hash] step result (status, logs, log paths, parsed output, install result)
def run_copilot_runtime_tool(scenario:, workspace:, artifact_dir:, env:, timeout_seconds:, install_result:)
  payload = scenario.prompt_payload
  tool_name = scenario.provider_runtime_tool
  tool_input = payload.is_a?(Hash) ? payload.fetch("input", {}) : {}
  project = env.fetch("AGENTF_PROJECT_NAME")
  started_at = Time.now.utc
  stdout_path = File.join(artifact_dir, "provider_runtime_stdout.log")
  stderr_path = File.join(artifact_dir, "provider_runtime_stderr.log")

  stdout = ""
  stderr = ""
  parsed_output = nil
  status = "passed"

  begin
    # The reviewer and the server each receive their own memory instance.
    new_memory = -> { Agentf::Memory::RedisMemory.new(project: project) }
    server = Agentf::MCP::Server.new(
      explorer: Agentf::Commands::Explorer.new(base_path: workspace),
      reviewer: Agentf::Commands::MemoryReviewer.new(project: project, memory: new_memory.call),
      memory: new_memory.call,
      env: ENV.to_h.merge(env)
    )
    parsed_output = call_mcp_tool(server: server, tool_name: tool_name, payload: tool_input)
    stdout = JSON.generate(parsed_output)
  rescue StandardError => e
    status = "failed"
    stderr = "#{e.class}: #{e.message}\n#{Array(e.backtrace).first(8).join("\n")}"
  end

  parsed_output = extract_copilot_runtime_output(tool_name: tool_name, payload: tool_input, parsed_output: parsed_output)
  File.write(stdout_path, stdout)
  File.write(stderr_path, stderr)
  fallback = { "raw_stdout" => stdout, "stderr" => stderr }
  File.write(env.fetch("AGENTF_EVAL_RESULT_JSON"), JSON.pretty_generate(parsed_output || fallback))

  outcome = { "step" => "provider_runtime", "status" => status, "command" => "copilot-mcp:#{tool_name}" }
  outcome["exit_code"] = status == "passed" ? 0 : 1
  outcome["started_at"] = started_at.iso8601
  outcome["finished_at"] = Time.now.utc.iso8601
  outcome["stdout"] = stdout
  outcome["stderr"] = stderr
  outcome["stdout_path"] = stdout_path
  outcome["stderr_path"] = stderr_path
  outcome["parsed_output"] = parsed_output
  outcome["install"] = install_result
  outcome
end
549
+
550
# For the "agentf-memory-recent" tool, substitutes a placeholder describing
# the request when there is no parsed output; every other tool's output is
# returned untouched.
#
# @return [Object, nil]
def extract_copilot_runtime_output(tool_name:, payload:, parsed_output:)
  if tool_name == "agentf-memory-recent"
    parsed_output || { "requested_tool" => tool_name, "input" => payload }
  else
    parsed_output
  end
end
555
+
556
# Returns the source of the Node (CommonJS) driver used to exercise OpenCode
# runtime tools. The driver takes a tool name and a JSON input on its command
# line, resolves the tool among two built-in memory tools and the agent
# manifests found under .opencode/agents, runs the agentf CLI and prints the
# JSON result.
#
# The heredoc is non-interpolating so the JavaScript is emitted exactly as
# written. The front-matter pattern is a regex literal: building it from an
# escaped string previously turned the `\s*` after the colon into a literal
# `s*`, which left leading whitespace on every front-matter value and broke
# tool-name matching.
#
# @return [String] JavaScript source
def render_opencode_eval_driver
  <<~'JAVASCRIPT'
    const fs = require("fs");
    const path = require("path");
    const { execFile } = require("child_process");
    const { promisify } = require("util");
    const execFileAsync = promisify(execFile);

    async function main() {
      const toolName = process.argv[2];
      const rawInput = process.argv[3] || "{}";
      if (!toolName) {
        throw new Error("Missing tool name");
      }

      const workspaceDir = process.cwd();
      const absDir = path.join(workspaceDir, ".opencode", "agents");

      function parseFrontmatter(content) {
        const res = {};
        const fmStart = content.indexOf("---");
        if (fmStart === -1) return res;
        const rest = content.slice(fmStart + 3);
        const fmEndIdx = rest.indexOf("---");
        if (fmEndIdx === -1) return res;
        const block = rest.slice(0, fmEndIdx).trim();
        for (const line of block.split(String.fromCharCode(10))) {
          const m = line.match(/^\s*([A-Za-z0-9_\-]+)\s*:\s*(.+?)\s*$/);
          if (!m) continue;
          let value = m[2];
          if ((value.startsWith('"') && value.endsWith('"')) || (value.startsWith("'") && value.endsWith("'"))) {
            value = value.slice(1, -1);
          }
          res[m[1]] = value;
        }
        return res;
      }

      async function ensureAgentfPreflight(directory) {
        const projectBinary = path.join(path.resolve(directory), "bin", "agentf");
        if (fs.existsSync(projectBinary)) return projectBinary;
        const gemPath = process.env.AGENTF_GEM_PATH;
        if (gemPath) {
          const gemBinary = path.join(gemPath, "bin", "agentf");
          if (fs.existsSync(gemBinary)) return gemBinary;
        }
        const { stdout } = await execFileAsync("command", ["-v", "agentf"], { shell: true });
        const resolved = stdout.toString().trim();
        if (!resolved) throw new Error("Unable to resolve agentf binary");
        return resolved;
      }

      async function runAgentfCli(directory, subcommand, command, args) {
        const binaryPath = await ensureAgentfPreflight(directory);
        const commandArgs = [subcommand, command, ...args, "--json"];
        const { stdout } = await execFileAsync(binaryPath, commandArgs, {
          cwd: directory,
          env: process.env,
          maxBuffer: 1024 * 1024 * 5,
        });
        return JSON.parse(stdout.toString().trim() || "{}");
      }

      const staticTools = {
        "agentf-memory-recent": {
          async execute(_args, context) {
            const limit = _args.limit ?? 10;
            return runAgentfCli(context.directory, "memory", "recent", ["-n", String(limit)]);
          },
        },
        "agentf-memory-search": {
          async execute(_args, context) {
            const limit = _args.limit ?? 10;
            return runAgentfCli(context.directory, "memory", "search", [_args.query, "-n", String(limit)]);
          },
        },
      };

      const agentTools = {};
      if (fs.existsSync(absDir)) {
        for (const file of fs.readdirSync(absDir)) {
          const full = path.join(absDir, file);
          if (!fs.statSync(full).isFile()) continue;
          const content = fs.readFileSync(full, "utf8");
          const fm = parseFrontmatter(content);
          const manifestToolName = fm.name || path.basename(file, path.extname(file));
          if (staticTools[manifestToolName]) continue;
          const agentName = manifestToolName.replace(/^agentf-/, "");
          agentTools[manifestToolName] = {
            async execute(_args, context) {
              const cmdArgs = [];
              if (_args.input !== undefined) {
                cmdArgs.push(typeof _args.input === "object" ? JSON.stringify(_args.input) : String(_args.input));
              }
              if (_args.confirmedWrite) cmdArgs.push(`--confirmed-write=${_args.confirmedWrite}`);
              return runAgentfCli(context.directory, "agent", agentName, cmdArgs);
            },
          };
        }
      }

      const tools = { ...staticTools, ...agentTools };
      const tool = tools[toolName];
      if (!tool) throw new Error(`Unknown tool: ${toolName}`);

      const input = JSON.parse(rawInput);
      const result = await tool.execute(input, { directory: workspaceDir });
      process.stdout.write(JSON.stringify(result));
    }

    main().catch((error) => {
      process.stderr.write(String(error && error.stack ? error.stack : error));
      process.exit(1);
    });
  JAVASCRIPT
end
672
+
673
# Installs the scenario's provider manifests into the workspace through
# Agentf::Installer (the workspace serves as both global and local root).
# The step fails when any reported write has status "error" or when the
# installer raises a StandardError.
#
# @return [Hash] "provider_install" step result
def install_provider_manifests(scenario:, workspace:)
  started_at = Time.now.utc
  runtime = scenario.env.fetch("AGENTF_EVAL_OPENCODE_RUNTIME", "mcp")
  installer = Agentf::Installer.new(
    global_root: workspace,
    local_root: workspace,
    dry_run: false,
    install_deps: scenario.provider_install_deps?,
    opencode_runtime: runtime
  )

  # An empty selection means "install everything", which the installer expects as nil.
  writes = installer.install(
    providers: [scenario.provider_name],
    scope: scenario.provider_scope,
    only_agents: scenario.install_agents.empty? ? nil : scenario.install_agents,
    only_commands: scenario.install_commands.empty? ? nil : scenario.install_commands
  )
  any_error = writes.any? { |write| write["status"] == "error" }

  {
    "step" => "provider_install",
    "status" => any_error ? "failed" : "passed",
    "command" => "installer:#{scenario.provider_name}",
    "exit_code" => any_error ? 1 : 0,
    "started_at" => started_at.iso8601,
    "finished_at" => Time.now.utc.iso8601,
    "writes" => writes
  }
rescue StandardError => e
  {
    "step" => "provider_install",
    "status" => "failed",
    "command" => "installer:#{scenario.provider_name}",
    "exit_code" => 1,
    "started_at" => started_at.iso8601,
    "finished_at" => Time.now.utc.iso8601,
    "stderr" => "#{e.class}: #{e.message}"
  }
end
710
+
711
# Turns a provider prompt into CLI arguments: the "command" array of a JSON
# object prompt when present, otherwise the prompt split on whitespace.
#
# @param prompt [String, nil]
# @return [Array<String>]
def parse_provider_command(prompt)
  decoded = extract_json_payload(prompt)
  argv = decoded["command"] if decoded.is_a?(Hash)
  return Array(argv) if argv.is_a?(Array)

  prompt.to_s.split(" ")
end
717
+
718
# Builds the final result for a scenario, writes it to summary.json in the
# artifact directory, appends it to the history log and removes the
# workspace unless it should be kept.
#
# summary.json is skipped when there is no artifact directory, so a failure
# that happened before the directory was created can still be reported.
# "workspace_removed" is true only when a workspace directory was actually
# deleted; it is set after the summary and history are written, so it appears
# only in the returned hash.
#
# @param agent_run [Hash, nil] execution step result; its "retry_count"
#   (default 0) drives the "retry_count" and "flaky" fields
# @param error [Hash, nil] exception details for "exception" failures
# @return [Hash] string-keyed scenario result
def finalize_result(scenario:, artifact_dir:, workspace:, keep_workspace:, status:, setup:, agent_run:, verify:, failure_step:, error: nil)
  retry_count = agent_run.is_a?(Hash) ? agent_run.fetch("retry_count", 0).to_i : 0
  result = {
    "scenario" => scenario.name,
    "description" => scenario.description,
    "agent" => scenario.agent,
    "execution_mode" => scenario.execution_mode,
    "mcp_tool" => scenario.mcp_tool,
    "providers" => scenario.providers,
    "models" => scenario.models,
    "status" => status,
    "retry_count" => retry_count,
    "flaky" => retry_count.positive? && status == "passed",
    "artifact_dir" => artifact_dir,
    "workspace" => workspace,
    "setup" => setup,
    "agent_run" => agent_run,
    "verify" => verify,
    "failure_step" => failure_step,
    "error" => error,
    "memory_effectiveness" => build_memory_effectiveness(scenario: scenario, agent_run: agent_run)
  }

  File.write(File.join(artifact_dir, "summary.json"), JSON.pretty_generate(result)) if artifact_dir
  append_history(result)

  removable = !keep_workspace && !workspace.nil? && Dir.exist?(workspace)
  FileUtils.remove_entry(workspace) if removable
  result["workspace_removed"] = removable
  result
end
747
+
748
# Appends +result+, stamped with a "recorded_at" UTC timestamp, as one JSON
# line to history.jsonl under +output_root+ (created if missing).
#
# @param result [Hash]
def append_history(result)
  FileUtils.mkdir_p(output_root)
  entry = result.merge("recorded_at" => Time.now.utc.iso8601)
  File.open(File.join(output_root, "history.jsonl"), "a") { |log| log.puts(JSON.generate(entry)) }
end
754
+
755
# Reports which of the scenario's expected memory titles appear anywhere in
# the JSON serialisation of the agent run's parsed output.
#
# @param scenario [Object] responds to #expected_memory_titles
# @param agent_run [Hash, nil] execution step result
# @return [Hash, nil] nil when the scenario expects no titles
def build_memory_effectiveness(scenario:, agent_run:)
  wanted = scenario.expected_memory_titles
  return nil if wanted.empty?

  payload = agent_run["parsed_output"] if agent_run.is_a?(Hash)
  haystack = JSON.generate(payload || {})
  hits = wanted.select { |title| haystack.include?(title) }

  {
    "expected_titles" => wanted,
    "matched_titles" => hits,
    "retrieved_expected_memory" => hits.any?
  }
end
769
+ end
770
+ end
771
+ end