agentf 0.4.6 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,203 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "arg_parser"
4
+ require_relative "../evals/runner"
5
+
6
module Agentf
  module CLI
    # CLI front end for the `agentf eval` subcommand. It lists eval scenarios,
    # runs them through a runner, and summarizes recorded eval history.
    # Output is plain text by default and JSON when `--json` is passed.
    class Eval
      include ArgParser

      # @param runner [Object, nil] optional runner used instead of building an
      #   Agentf::Evals::Runner; this class calls #list, #run and #root on it.
      def initialize(runner: nil)
        @runner = runner
        @json_output = false
      end

      # Dispatches an eval subcommand.
      #
      # Unknown commands print an error and the help text, then terminate the
      # process with status 1.
      #
      # @param args [Array<String>] CLI arguments after `eval`; mutated in
      #   place (the `--json` flag and the command word are removed).
      # @return [void]
      def run(args)
        @json_output = !args.delete("--json").nil?
        command = args.shift || "help"

        case command
        when "list"
          list_scenarios(args)
        when "run"
          run_scenarios(args)
        when "report"
          report_results(args)
        when "help", "--help", "-h"
          show_help
        else
          $stderr.puts "Unknown eval command: #{command}"
          $stderr.puts
          show_help
          exit 1
        end
      end

      private

      # Prints the scenarios reported by the runner, as JSON or as a text list.
      #
      # @param args [Array<String>] remaining CLI arguments
      # @return [void]
      def list_scenarios(args)
        runner = build_runner(args)
        scenarios = runner.list

        if @json_output
          puts JSON.generate({ "count" => scenarios.length, "scenarios" => scenarios.map(&:to_h) })
          return
        end

        if scenarios.empty?
          puts "No eval scenarios found under #{runner.root}"
          return
        end

        puts "Eval scenarios (#{scenarios.length}):"
        scenarios.each do |scenario|
          suffix = scenario.description.empty? ? "" : " - #{scenario.description}"
          target = if scenario.execution_mode == "mcp"
                     "mcp: #{scenario.mcp_tool}"
                   elsif scenario.execution_mode == "provider"
                     "provider: #{scenario.provider_name}"
                   else
                     "agent: #{scenario.agent}"
                   end
          puts " - #{scenario.name} (#{target})#{suffix}"
        end
      end

      # Runs one scenario (or all of them) and prints the outcome.
      #
      # Terminates the process with status 1 when any scenario failed (text
      # mode only; JSON mode returns after printing the result).
      #
      # @param args [Array<String>] remaining CLI arguments
      # @return [void]
      def run_scenarios(args)
        name = extract_scenario_name(args)
        keep_workspace = !args.delete("--keep-workspace").nil?
        timeout_seconds = parse_integer_option(args, "--timeout=", default: 0)
        runner = build_runner(args)
        result = runner.run(
          name: name,
          keep_workspace: keep_workspace,
          # A non-positive timeout means "no override".
          timeout_seconds: timeout_seconds.positive? ? timeout_seconds : nil
        )

        if @json_output
          puts JSON.pretty_generate(result)
          return
        end

        puts "Evals complete: #{result['passed']}/#{result['count']} passed"
        result["results"].each do |scenario_result|
          status = scenario_result["status"] == "passed" ? "PASS" : "FAIL"
          detail = scenario_result["failure_step"] ? " (failed at #{scenario_result['failure_step']})" : ""
          puts " - [#{status}] #{scenario_result['scenario']}#{detail}"
          puts " artifacts: #{scenario_result['artifact_dir']}"
        end

        print_matrix_summary(result["matrix"])

        exit 1 if result["failed"].positive?
      end

      # Removes and returns the first argument that is not an option (does not
      # start with "-"). Falls back to "all" when only options were given, so
      # `agentf eval run --keep-workspace` no longer treats the flag as a
      # scenario name.
      #
      # @param args [Array<String>] remaining CLI arguments; mutated in place
      # @return [String] scenario name, or "all"
      def extract_scenario_name(args)
        index = args.index { |arg| !arg.start_with?("-") }
        index ? args.delete_at(index) : "all"
      end

      # Returns the injected runner, or builds an Agentf::Evals::Runner from
      # the `--root=` and `--output-dir=` options. The options are parsed even
      # when a runner was injected.
      #
      # @param args [Array<String>] remaining CLI arguments
      # @return [Object] the runner to use
      def build_runner(args)
        root = parse_single_option(args, "--root=")
        output_root = parse_single_option(args, "--output-dir=")
        @runner || Agentf::Evals::Runner.new(root: root, output_root: output_root)
      end

      # Prints usage information for `agentf eval`.
      #
      # @return [void]
      def show_help
        puts <<~HELP
          Usage: agentf eval <command> [options]

          Commands:
          list List available eval scenarios
          run <scenario|all> Run one scenario or all scenarios
          report Summarize eval history

          Options:
          --root=<path> Scenario root directory (default: ./evals)
          --output-dir=<path> Artifact output directory (default: tmp/evals)
          --timeout=<seconds> Override per-scenario timeout
          --keep-workspace Keep temp workspace after run
          --json Output structured JSON

          Examples:
          agentf eval list
          agentf eval run engineer_store_success
          agentf eval report
          agentf eval run all --json
        HELP
      end

      # Generates an eval history report and prints it as JSON or as text
      # tables.
      #
      # @param args [Array<String>] remaining CLI arguments; reads
      #   `--output-dir=`, `--limit=`, `--since=` and `--scenario=`
      # @return [void]
      def report_results(args)
        output_root = parse_single_option(args, "--output-dir=")
        limit = parse_integer_option(args, "--limit=", default: 0)
        since = parse_single_option(args, "--since=")
        scenario = parse_single_option(args, "--scenario=")
        report = Agentf::Evals::Report.new(output_root: output_root || Agentf::Evals::Runner::DEFAULT_OUTPUT_ROOT)
        # A non-positive limit means "no limit".
        result = report.generate(limit: limit.positive? ? limit : nil, since: since, scenario: scenario)

        if @json_output
          puts JSON.pretty_generate(result)
          return
        end

        puts "Eval history: #{result['passes']}/#{result['count']} passed"
        puts "Retries: #{result.dig('retry_summary', 'total_retries')} total, #{result.dig('retry_summary', 'flaky_runs')} flaky passes"
        if result["memory_effectiveness"]
          puts "Memory retrieval: #{result.dig('memory_effectiveness', 'retrieved_expected_memory')}/#{result.dig('memory_effectiveness', 'tracked_runs')} tracked runs retrieved expected memory"
        end
        print_comparison_table("Providers", result["providers"])
        print_comparison_table("Models", result["models"])
        print_scenario_trends(result["scenarios"])
        print_matrix_summary({ "providers" => result["providers"], "models" => result["models"] })
      end

      # Prints a pass/fail/retry/flaky table, sorted by name. Prints nothing
      # when rows is nil or empty.
      #
      # @param title [String] heading printed above the table
      # @param rows [Hash, nil] name => stats hash
      # @return [void]
      def print_comparison_table(title, rows)
        return if rows.to_h.empty?

        puts "#{title}:"
        puts " Name Pass Fail Retry Flaky"
        rows.sort.each do |name, stats|
          puts format(
            " %-20s %4d %4d %5d %5d",
            name,
            stats["passed"].to_i,
            stats["failed"].to_i,
            stats["retried"].to_i,
            stats["flaky"].to_i
          )
        end
      end

      # Prints per-scenario counters plus a yes/no memory-retrieval column,
      # sorted by scenario name. Prints nothing when rows is nil or empty.
      #
      # @param rows [Hash, nil] scenario name => stats hash
      # @return [void]
      def print_scenario_trends(rows)
        return if rows.to_h.empty?

        puts "Scenario trends:"
        puts " Scenario Pass Fail Retry Flaky Mem"
        rows.sort.each do |name, stats|
          puts format(
            " %-20s %4d %4d %5d %5d %3s",
            name,
            stats["passed"].to_i,
            stats["failed"].to_i,
            stats["retried"].to_i,
            stats["flaky"].to_i,
            stats.fetch("memory_retrieved", 0).to_i.positive? ? "yes" : "no"
          )
        end
      end

      # Prints "passed/total" lines per provider and per model.
      #
      # @param matrix [Hash, nil] hash with optional "providers" and "models"
      #   entries; anything that is not a Hash is ignored
      # @return [void]
      def print_matrix_summary(matrix)
        return unless matrix.is_a?(Hash)

        # `fetch(key, {})` would return a stored nil; report_results always
        # passes both keys, so fall back explicitly when the value is nil.
        providers = matrix["providers"] || {}
        models = matrix["models"] || {}

        unless providers.empty?
          puts "Provider matrix:"
          providers.each do |provider, stats|
            puts " - #{provider}: #{stats['passed']}/#{stats['total']} passed"
          end
        end

        unless models.empty?
          puts "Model matrix:"
          models.each do |model, stats|
            puts " - #{model}: #{stats['passed']}/#{stats['total']} passed"
          end
        end
      end
    end
  end
end
@@ -17,6 +17,7 @@ module Agentf
17
17
  local_root: Dir.pwd,
18
18
  dry_run: false,
19
19
  install_deps: true,
20
+ opencode_runtime: "mcp",
20
21
  only_agents: nil,
21
22
  only_commands: nil
22
23
  }
@@ -35,6 +36,7 @@ module Agentf
35
36
  local_root: @options[:local_root],
36
37
  dry_run: @options[:dry_run],
37
38
  install_deps: @options[:install_deps],
39
+ opencode_runtime: @options[:opencode_runtime],
38
40
  verbose: @options.fetch(:verbose, false)
39
41
  )
40
42
 
@@ -72,6 +74,9 @@ module Agentf
72
74
  # Extract --install-deps flag
73
75
  @options[:install_deps] = !args.delete("--install-deps").nil?
74
76
 
77
+ opencode_runtime = parse_single_option(args, "--opencode-runtime=")
78
+ @options[:opencode_runtime] = opencode_runtime if opencode_runtime
79
+
75
80
  # Extract --global-root and --local-root
76
81
  global_root = parse_single_option(args, "--global-root=")
77
82
  @options[:global_root] = File.expand_path(global_root) if global_root
@@ -107,6 +112,7 @@ module Agentf
107
112
  --local-root=PATH Root for local installs (default: current directory)
108
113
  --agent=LIST Only install specific agents (comma-separated)
109
114
  --command=LIST Only install specific commands (comma-separated)
115
+ --opencode-runtime=MODE Opencode runtime: mcp|plugin (default: mcp)
110
116
  --dry-run Show planned writes without writing files
111
117
 
112
118
  Examples:
@@ -114,6 +120,7 @@ module Agentf
114
120
  agentf install --provider=opencode,copilot --scope=local
115
121
  agentf install --provider=copilot --dry-run
116
122
  agentf install --agent=architect,specialist
123
+ agentf install --provider=opencode --opencode-runtime=plugin
117
124
  HELP
118
125
  end
119
126
  end
@@ -145,18 +145,30 @@ module Agentf
145
145
  constraints = parse_list_option(args, "--constraints=")
146
146
  priority = parse_integer_option(args, "--priority=", default: 1)
147
147
 
148
- intent_id = @memory.store_business_intent(
149
- title: title,
150
- description: description,
151
- tags: tags,
152
- constraints: constraints,
153
- priority: priority
154
- )
148
+ id = nil
149
+ res = safe_cli_memory_write(@memory, attempted: { command: "add-business-intent", args: { title: title, description: description, tags: tags, constraints: constraints, priority: priority } }) do
150
+ id = @memory.store_business_intent(
151
+ title: title,
152
+ description: description,
153
+ tags: tags,
154
+ constraints: constraints,
155
+ priority: priority
156
+ )
157
+ end
158
+
159
+ if res.is_a?(Hash) && res["confirmation_required"]
160
+ if @json_output
161
+ puts JSON.generate(res)
162
+ else
163
+ $stderr.puts "Confirmation required to store business intent: #{res['confirmation_details'].inspect}"
164
+ end
165
+ return
166
+ end
155
167
 
156
168
  if @json_output
157
- puts JSON.generate({ "id" => intent_id, "type" => "business_intent", "status" => "stored" })
169
+ puts JSON.generate({ "id" => id, "type" => "business_intent", "status" => "stored" })
158
170
  else
159
- puts "Stored business intent: #{intent_id}"
171
+ puts "Stored business intent: #{id}"
160
172
  end
161
173
  end
162
174
 
@@ -174,19 +186,31 @@ module Agentf
174
186
  non_goals = parse_list_option(args, "--non-goals=")
175
187
  related_task_id = parse_single_option(args, "--task=")
176
188
 
177
- intent_id = @memory.store_feature_intent(
178
- title: title,
179
- description: description,
180
- tags: tags,
181
- acceptance_criteria: acceptance_criteria,
182
- non_goals: non_goals,
183
- related_task_id: related_task_id
184
- )
189
+ id = nil
190
+ res = safe_cli_memory_write(@memory, attempted: { command: "add-feature-intent", args: { title: title, description: description, tags: tags, acceptance: acceptance_criteria, non_goals: non_goals, related_task_id: related_task_id } }) do
191
+ id = @memory.store_feature_intent(
192
+ title: title,
193
+ description: description,
194
+ tags: tags,
195
+ acceptance_criteria: acceptance_criteria,
196
+ non_goals: non_goals,
197
+ related_task_id: related_task_id
198
+ )
199
+ end
200
+
201
+ if res.is_a?(Hash) && res["confirmation_required"]
202
+ if @json_output
203
+ puts JSON.generate(res)
204
+ else
205
+ $stderr.puts "Confirmation required to store feature intent: #{res['confirmation_details'].inspect}"
206
+ end
207
+ return
208
+ end
185
209
 
186
210
  if @json_output
187
- puts JSON.generate({ "id" => intent_id, "type" => "feature_intent", "status" => "stored" })
211
+ puts JSON.generate({ "id" => id, "type" => "feature_intent", "status" => "stored" })
188
212
  else
189
- puts "Stored feature intent: #{intent_id}"
213
+ puts "Stored feature intent: #{id}"
190
214
  end
191
215
  end
192
216
 
@@ -204,20 +228,48 @@ module Agentf
204
228
  agent = parse_single_option(args, "--agent=") || Agentf::AgentRoles::ENGINEER
205
229
  code_snippet = parse_single_option(args, "--code=").to_s
206
230
 
207
- intent_id = @memory.store_episode(
208
- type: type,
209
- title: title,
210
- description: description,
211
- context: context,
212
- tags: tags,
213
- agent: agent,
214
- code_snippet: code_snippet
215
- )
231
+ id = nil
232
+ res = safe_cli_memory_write(@memory, attempted: { command: "add-#{type}", args: { title: title, description: description, tags: tags, context: context, agent: agent, code: code_snippet } }) do
233
+ id = @memory.store_episode(
234
+ type: type,
235
+ title: title,
236
+ description: description,
237
+ context: context,
238
+ tags: tags,
239
+ agent: agent,
240
+ code_snippet: code_snippet
241
+ )
242
+ end
243
+
244
+ if res.is_a?(Hash) && res["confirmation_required"]
245
+ if @json_output
246
+ puts JSON.generate(res)
247
+ else
248
+ $stderr.puts "Confirmation required to store #{type}: #{res['confirmation_details'].inspect}"
249
+ end
250
+ return
251
+ end
216
252
 
217
253
  if @json_output
218
- puts JSON.generate({ "id" => intent_id, "type" => type, "status" => "stored" })
254
+ puts JSON.generate({ "id" => id, "type" => type, "status" => "stored" })
219
255
  else
220
- puts "Stored #{type}: #{intent_id}"
256
+ puts "Stored #{type}: #{id}"
257
+ end
258
+ end
259
+
260
+ # Helper to standardize CLI memory write confirmation handling.
261
+ def safe_cli_memory_write(memory, attempted: {})
262
+ begin
263
+ yield
264
+ nil
265
+ rescue Agentf::Memory::RedisMemory::ConfirmationRequired => e
266
+ {
267
+ "confirmation_required" => true,
268
+ "confirmation_details" => e.details,
269
+ "attempted" => attempted,
270
+ "confirmed_write_token" => "confirmed",
271
+ "confirmation_prompt" => "Ask the user whether to save this memory. If they approve, rerun the same command with confirmation enabled. If they decline, do not retry."
272
+ }
221
273
  end
222
274
  end
223
275
 
@@ -7,6 +7,7 @@ require_relative "install"
7
7
  require_relative "update"
8
8
  require_relative "metrics"
9
9
  require_relative "architecture"
10
+ require_relative "eval"
10
11
 
11
12
  module Agentf
12
13
  module CLI
@@ -18,8 +19,8 @@ module Agentf
18
19
  # agentf install --provider opencode,copilot
19
20
  # agentf version
20
21
  # agentf help
21
- class Router
22
- SUBCOMMANDS = %w[memory code metrics architecture install update mcp-server version help].freeze
22
+ class Router
23
+ SUBCOMMANDS = %w[memory code metrics architecture install update eval agent mcp-server version help].freeze
23
24
 
24
25
  def run(args)
25
26
  subcommand = args.shift || "help"
@@ -42,8 +43,14 @@ module Agentf
42
43
  Architecture.new.run(args)
43
44
  when "update"
44
45
  Update.new.run(args)
46
+ when "eval"
47
+ Eval.new.run(args)
45
48
  when "mcp-server"
46
49
  start_mcp_server
50
+ when "agent"
51
+ # agent <AGENT_NAME> [payload]
52
+ require_relative "agent"
53
+ Agent.new.run(args)
47
54
  when "version", "--version", "-v"
48
55
  puts "agentf #{Agentf::VERSION}"
49
56
  when "help", "--help", "-h"
@@ -74,6 +81,8 @@ module Agentf
74
81
  architecture Analyze architecture layers and violations
75
82
  install Generate provider manifests (agents, commands, tools)
76
83
  update Regenerate manifests when gem version changes
84
+ eval Run black-box eval scenarios against `agentf agent`
85
+ agent Run a single agent directly
77
86
  mcp-server Start MCP server over stdio (for Copilot integration)
78
87
  version Show version
79
88
 
@@ -87,7 +96,7 @@ module Agentf
87
96
  AGENTF_WORKFLOW_CONTRACT_MODE=advisory|enforcing|off Contract behavior mode
88
97
  AGENTF_AGENT_CONTRACT_ENABLED=true|false Enable/disable per-agent contract checks
89
98
  AGENTF_AGENT_CONTRACT_MODE=advisory|enforcing|off Per-agent contract behavior mode
90
- AGENTF_DEFAULT_PACK=generic|rails_standard|rails_37signals|rails_feature_spec
99
+ (AGENTF_DEFAULT_PACK no longer used — orchestrator uses internal profiles)
91
100
  AGENTF_GEM_PATH=/path/to/gem Path to agentf gem (for OpenCode plugin binary resolution)
92
101
 
93
102
  Examples:
@@ -100,6 +109,9 @@ module Agentf
100
109
  agentf metrics parity --json
101
110
  agentf architecture analyze
102
111
  agentf architecture review --json
112
+ agentf eval list
113
+ agentf eval run all --json
114
+ agentf agent planner "Plan a refactor" --json
103
115
  agentf update
104
116
  agentf update --force --provider=opencode,copilot
105
117
  agentf mcp-server
@@ -34,7 +34,8 @@ module Agentf
34
34
  scope: "all",
35
35
  global_root: Dir.home,
36
36
  local_root: Dir.pwd,
37
- force: false
37
+ force: false,
38
+ opencode_runtime: "mcp"
38
39
  }
39
40
  end
40
41
 
@@ -78,6 +79,9 @@ module Agentf
78
79
 
79
80
  local_root = parse_single_option(args, "--local-root=")
80
81
  @options[:local_root] = File.expand_path(local_root) if local_root
82
+
83
+ opencode_runtime = parse_single_option(args, "--opencode-runtime=")
84
+ @options[:opencode_runtime] = opencode_runtime if opencode_runtime
81
85
  end
82
86
 
83
87
  def roots_for(scope)
@@ -113,7 +117,8 @@ module Agentf
113
117
 
114
118
  installer = @installer_class.new(
115
119
  global_root: root,
116
- local_root: root
120
+ local_root: root,
121
+ opencode_runtime: @options[:opencode_runtime]
117
122
  )
118
123
 
119
124
  results = installer.install(
@@ -191,12 +196,14 @@ module Agentf
191
196
  --scope=SCOPE Update scope: global|local|all (default: all)
192
197
  --global-root=PATH Root for global installs (default: $HOME)
193
198
  --local-root=PATH Root for local installs (default: current directory)
199
+ --opencode-runtime=MODE Opencode runtime: mcp|plugin (default: mcp)
194
200
  --force Regenerate even if version matches
195
201
 
196
202
  Examples:
197
203
  agentf update
198
204
  agentf update --force
199
205
  agentf update --provider=opencode,copilot --scope=local
206
+ agentf update --provider=opencode --opencode-runtime=plugin
200
207
  HELP
201
208
  end
202
209
  end
@@ -30,9 +30,10 @@ module Agentf
30
30
  }
31
31
  end
32
32
 
33
- def initialize(project: nil)
33
# @param project [String, nil] project name; falls back to
#   Agentf.config.project_name when not given
# @param memory [Object, nil] memory backend to use (e.g. a test double);
#   a RedisMemory for the project is created when not given
def initialize(project: nil, memory: nil)
  @project = project || Agentf.config.project_name
  @memory = memory
  # Only build the real Redis-backed store when nothing was injected.
  @memory ||= Agentf::Memory::RedisMemory.new(project: @project)
end
37
38
 
38
39
  # Get recent memories
@@ -81,6 +82,13 @@ module Agentf
81
82
  { "error" => e.message }
82
83
  end
83
84
 
85
# Fetches intents from the memory backend and passes them through
# format_memories.
#
# @param limit [Integer] maximum number of intents requested from the backend
# @return [Object] whatever format_memories returns for the fetched intents
# @return [Hash] `{ "error" => message }` when a StandardError is raised
def get_intents(limit: 10)
  format_memories(@memory.get_intents(limit: limit))
rescue StandardError => error
  { "error" => error.message }
end
91
+
84
92
  # Get all unique tags from memories
85
93
  def get_all_tags
86
94
  tags = @memory.get_all_tags
@@ -28,20 +28,22 @@ module Agentf
28
28
 
29
29
  def record_workflow(workflow_state)
30
30
  metrics = extract_metrics(workflow_state)
31
-
32
- @memory.store_episode(
33
- type: "success",
34
- title: metric_title(metrics),
35
- description: metric_description(metrics),
36
- context: metric_context(metrics),
37
- tags: metric_tags(metrics),
38
- agent: Agentf::AgentRoles::ORCHESTRATOR,
39
- code_snippet: ""
40
- )
41
-
42
- { "status" => "recorded", "metrics" => metrics }
43
- rescue StandardError => e
44
- { "status" => "error", "error" => e.message }
31
+ begin
32
+ @memory.store_episode(
33
+ type: "success",
34
+ title: metric_title(metrics),
35
+ description: metric_description(metrics),
36
+ context: metric_context(metrics),
37
+ tags: metric_tags(metrics),
38
+ agent: Agentf::AgentRoles::ORCHESTRATOR,
39
+ code_snippet: ""
40
+ )
41
+ { "status" => "recorded", "metrics" => metrics }
42
+ rescue Agentf::Memory::RedisMemory::ConfirmationRequired => e
43
+ { "status" => "confirmation_required", "confirmation_details" => e.details, "attempted" => { "action" => "record_workflow" } }
44
+ rescue StandardError => e
45
+ { "status" => "error", "error" => e.message }
46
+ end
45
47
  end
46
48
 
47
49
  def summary(limit: 100)
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
module Agentf
  module Commands
    # Maps command names to implementation objects and dispatches actions to
    # them. Names are stored as strings, so symbols and strings are
    # interchangeable everywhere a name is accepted.
    class Registry
      # @param map [Hash] initial name => implementation entries. Keys are
      #   normalized to strings (into a new Hash) so entries passed here can
      #   be found by #fetch, which also stringifies the name.
      def initialize(map = {})
        @map = map.to_h.transform_keys(&:to_s)
      end

      # Registers (or replaces) the implementation for a command name.
      #
      # @param name [String, Symbol] command name
      # @param impl [Object] object whose public methods are the actions
      # @return [Object] impl
      def register(name, impl)
        @map[name.to_s] = impl
      end

      # Looks up the implementation for a command name.
      #
      # @param name [String, Symbol] command name
      # @return [Object] the registered implementation
      # @raise [KeyError] when no implementation is registered under name
      def fetch(name)
        @map.fetch(name.to_s)
      end

      # Invokes a public action on a registered command.
      #
      # Keyword arguments and a block are forwarded as such; with only `*args`
      # Ruby 3 would hand keywords to the target as a positional Hash and
      # drop the block.
      #
      # @param command_name [String, Symbol] registered command name
      # @param action [String, Symbol] public method to call on the implementation
      # @param args [Array] positional arguments for the action
      # @param kwargs [Hash] keyword arguments for the action
      # @return [Object] whatever the action returns
      # @raise [KeyError] when the command is not registered
      # @raise [RuntimeError] when the implementation does not respond to action
      def call(command_name, action, *args, **kwargs, &block)
        impl = fetch(command_name)
        raise "Command #{command_name} does not implement #{action}" unless impl.respond_to?(action)

        impl.public_send(action, *args, **kwargs, &block)
      end
    end
  end
end