agentf 0.4.7 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,203 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "arg_parser"
4
+ require_relative "../evals/runner"
5
+
6
module Agentf
  module CLI
    # Command-line handler for `agentf eval <command>`.
    #
    # Dispatches the `list`, `run`, `report` and `help` subcommands and prints
    # the outcome as human-readable text, or as JSON when `--json` is passed.
    class Eval
      include ArgParser

      # @param runner [Object, nil] pre-built runner responding to `list`, `run`
      #   and `root`; when nil, an Agentf::Evals::Runner is created on demand
      #   from the `--root=` / `--output-dir=` options.
      def initialize(runner: nil)
        @runner = runner
        @json_output = false
      end

      # Entry point: consumes `--json` and the subcommand from +args+, then
      # dispatches. An unknown subcommand prints help and exits with status 1.
      #
      # @param args [Array<String>] remaining command-line arguments (mutated)
      # @return [void]
      def run(args)
        @json_output = !args.delete("--json").nil?
        command = args.shift || "help"

        case command
        when "list"
          list_scenarios(args)
        when "run"
          run_scenarios(args)
        when "report"
          report_results(args)
        when "help", "--help", "-h"
          show_help
        else
          $stderr.puts "Unknown eval command: #{command}"
          $stderr.puts
          show_help
          exit 1
        end
      end

      private

      # Prints every scenario known to the runner (JSON or text).
      def list_scenarios(args)
        runner = build_runner(args)
        scenarios = runner.list

        if @json_output
          puts JSON.generate({ "count" => scenarios.length, "scenarios" => scenarios.map(&:to_h) })
          return
        end

        if scenarios.empty?
          puts "No eval scenarios found under #{runner.root}"
          return
        end

        puts "Eval scenarios (#{scenarios.length}):"
        scenarios.each do |scenario|
          # to_s guards against a scenario without a description.
          description = scenario.description.to_s
          suffix = description.empty? ? "" : " - #{description}"
          puts " - #{scenario.name} (#{scenario_target(scenario)})#{suffix}"
        end
      end

      # Label describing what a scenario exercises, chosen by its execution mode.
      def scenario_target(scenario)
        case scenario.execution_mode
        when "mcp" then "mcp: #{scenario.mcp_tool}"
        when "provider" then "provider: #{scenario.provider_name}"
        else "agent: #{scenario.agent}"
        end
      end

      # Runs one scenario (or "all") and prints the results. Exits with status 1
      # in text mode when any scenario failed.
      def run_scenarios(args)
        name = extract_scenario_name(args)
        keep_workspace = !args.delete("--keep-workspace").nil?
        timeout_seconds = parse_integer_option(args, "--timeout=", default: 0)
        runner = build_runner(args)
        result = runner.run(
          name: name,
          keep_workspace: keep_workspace,
          # A non-positive timeout means "no override".
          timeout_seconds: timeout_seconds.positive? ? timeout_seconds : nil
        )

        if @json_output
          puts JSON.pretty_generate(result)
          return
        end

        puts "Evals complete: #{result['passed']}/#{result['count']} passed"
        result["results"].each do |scenario_result|
          status = scenario_result["status"] == "passed" ? "PASS" : "FAIL"
          detail = scenario_result["failure_step"] ? " (failed at #{scenario_result['failure_step']})" : ""
          puts " - [#{status}] #{scenario_result['scenario']}#{detail}"
          puts " artifacts: #{scenario_result['artifact_dir']}"
        end

        print_matrix_summary(result["matrix"])

        exit 1 if result["failed"].positive?
      end

      # Removes and returns the first positional (non-dash) argument, or "all"
      # when none is present. Flags are skipped so that
      # `agentf eval run --keep-workspace` is not mistaken for a scenario named
      # "--keep-workspace".
      def extract_scenario_name(args)
        position = args.index { |arg| !arg.start_with?("-") }
        position ? args.delete_at(position) : "all"
      end

      # Returns the injected runner, or a new Agentf::Evals::Runner configured
      # from `--root=` and `--output-dir=`. The options are parsed either way.
      def build_runner(args)
        root = parse_single_option(args, "--root=")
        output_root = parse_single_option(args, "--output-dir=")
        @runner || Agentf::Evals::Runner.new(root: root, output_root: output_root)
      end

      # NOTE(review): `report` also reads --limit=, --since= and --scenario=,
      # none of which are listed below — consider documenting them.
      def show_help
        puts <<~HELP
          Usage: agentf eval <command> [options]

          Commands:
          list List available eval scenarios
          run <scenario|all> Run one scenario or all scenarios
          report Summarize eval history

          Options:
          --root=<path> Scenario root directory (default: ./evals)
          --output-dir=<path> Artifact output directory (default: tmp/evals)
          --timeout=<seconds> Override per-scenario timeout
          --keep-workspace Keep temp workspace after run
          --json Output structured JSON

          Examples:
          agentf eval list
          agentf eval run engineer_episode_positive
          agentf eval report
          agentf eval run all --json
        HELP
      end

      # Builds an Agentf::Evals::Report over the output directory and prints a
      # summary (JSON or text).
      def report_results(args)
        output_root = parse_single_option(args, "--output-dir=")
        limit = parse_integer_option(args, "--limit=", default: 0)
        since = parse_single_option(args, "--since=")
        scenario = parse_single_option(args, "--scenario=")
        report = Agentf::Evals::Report.new(output_root: output_root || Agentf::Evals::Runner::DEFAULT_OUTPUT_ROOT)
        # A non-positive limit means "no limit".
        result = report.generate(limit: limit.positive? ? limit : nil, since: since, scenario: scenario)

        if @json_output
          puts JSON.pretty_generate(result)
          return
        end

        puts "Eval history: #{result['passes']}/#{result['count']} passed"
        puts "Retries: #{result.dig('retry_summary', 'total_retries')} total, #{result.dig('retry_summary', 'flaky_runs')} flaky passes"
        if result["memory_effectiveness"]
          puts "Memory retrieval: #{result.dig('memory_effectiveness', 'retrieved_expected_memory')}/#{result.dig('memory_effectiveness', 'tracked_runs')} tracked runs retrieved expected memory"
        end
        print_comparison_table("Providers", result["providers"])
        print_comparison_table("Models", result["models"])
        print_scenario_trends(result["scenarios"])
        print_matrix_summary({ "providers" => result["providers"], "models" => result["models"] })
      end

      # Prints a pass/fail/retry/flaky table, one row per name, sorted by name.
      # Prints nothing when +rows+ is nil or empty.
      def print_comparison_table(title, rows)
        return if rows.to_h.empty?

        puts "#{title}:"
        puts " Name Pass Fail Retry Flaky"
        rows.sort.each do |name, stats|
          puts format(
            " %-20s %4d %4d %5d %5d",
            name,
            stats["passed"].to_i,
            stats["failed"].to_i,
            stats["retried"].to_i,
            stats["flaky"].to_i
          )
        end
      end

      # Prints per-scenario counters plus a yes/no memory-retrieval column.
      # Prints nothing when +rows+ is nil or empty.
      def print_scenario_trends(rows)
        return if rows.to_h.empty?

        puts "Scenario trends:"
        puts " Scenario Pass Fail Retry Flaky Mem"
        rows.sort.each do |name, stats|
          puts format(
            " %-20s %4d %4d %5d %5d %3s",
            name,
            stats["passed"].to_i,
            stats["failed"].to_i,
            stats["retried"].to_i,
            stats["flaky"].to_i,
            stats.fetch("memory_retrieved", 0).to_i.positive? ? "yes" : "no"
          )
        end
      end

      # Prints passed/total lines for the "providers" and "models" entries of
      # +matrix+. Anything that is not a Hash is ignored.
      def print_matrix_summary(matrix)
        return unless matrix.is_a?(Hash)

        # Index + to_h rather than fetch(key, {}): fetch returns a stored nil,
        # which report_results can pass in, and nil.empty? would raise.
        print_matrix_section("Provider matrix:", matrix["providers"])
        print_matrix_section("Model matrix:", matrix["models"])
      end

      # Prints one matrix section; silent when +entries+ is nil or empty.
      def print_matrix_section(heading, entries)
        entries = entries.to_h
        return if entries.empty?

        puts heading
        entries.each do |label, stats|
          puts " - #{label}: #{stats['passed']}/#{stats['total']} passed"
        end
      end
    end
  end
end
@@ -17,6 +17,7 @@ module Agentf
17
17
  local_root: Dir.pwd,
18
18
  dry_run: false,
19
19
  install_deps: true,
20
+ opencode_runtime: "mcp",
20
21
  only_agents: nil,
21
22
  only_commands: nil
22
23
  }
@@ -35,6 +36,7 @@ module Agentf
35
36
  local_root: @options[:local_root],
36
37
  dry_run: @options[:dry_run],
37
38
  install_deps: @options[:install_deps],
39
+ opencode_runtime: @options[:opencode_runtime],
38
40
  verbose: @options.fetch(:verbose, false)
39
41
  )
40
42
 
@@ -72,6 +74,9 @@ module Agentf
72
74
  # Extract --install-deps flag
73
75
  @options[:install_deps] = !args.delete("--install-deps").nil?
74
76
 
77
+ opencode_runtime = parse_single_option(args, "--opencode-runtime=")
78
+ @options[:opencode_runtime] = opencode_runtime if opencode_runtime
79
+
75
80
  # Extract --global-root and --local-root
76
81
  global_root = parse_single_option(args, "--global-root=")
77
82
  @options[:global_root] = File.expand_path(global_root) if global_root
@@ -107,6 +112,7 @@ module Agentf
107
112
  --local-root=PATH Root for local installs (default: current directory)
108
113
  --agent=LIST Only install specific agents (comma-separated)
109
114
  --command=LIST Only install specific commands (comma-separated)
115
+ --opencode-runtime=MODE Opencode runtime: mcp|plugin (default: mcp)
110
116
  --dry-run Show planned writes without writing files
111
117
 
112
118
  Examples:
@@ -114,6 +120,7 @@ module Agentf
114
120
  agentf install --provider=opencode,copilot --scope=local
115
121
  agentf install --provider=copilot --dry-run
116
122
  agentf install --agent=architect,specialist
123
+ agentf install --provider=opencode --opencode-runtime=plugin
117
124
  HELP
118
125
  end
119
126
  end
@@ -13,7 +13,7 @@ module Agentf
13
13
  class Memory
14
14
  include ArgParser
15
15
 
16
- VALID_EPISODE_TYPES = %w[pitfall lesson success business_intent feature_intent].freeze
16
+ VALID_EPISODE_TYPES = %w[episode lesson playbook business_intent feature_intent incident].freeze
17
17
 
18
18
  def initialize(reviewer: nil, memory: nil)
19
19
  @reviewer = reviewer || Commands::MemoryReviewer.new
@@ -28,12 +28,10 @@ module Agentf
28
28
  case command
29
29
  when "recent", "list"
30
30
  list_memories(args)
31
- when "pitfalls"
32
- list_pitfalls(args)
31
+ when "episodes"
32
+ list_episodes(args)
33
33
  when "lessons"
34
34
  list_lessons(args)
35
- when "successes"
36
- list_successes(args)
37
35
  when "intents"
38
36
  list_intents(args)
39
37
  when "business-intents"
@@ -44,14 +42,10 @@ module Agentf
44
42
  add_business_intent(args)
45
43
  when "add-feature-intent"
46
44
  add_feature_intent(args)
45
+ when "add-playbook"
46
+ add_playbook(args)
47
47
  when "add-lesson"
48
48
  add_episode("lesson", args)
49
- when "add-success"
50
- add_episode("success", args)
51
- when "add-pitfall"
52
- add_episode("pitfall", args)
53
- when "tags"
54
- list_tags
55
49
  when "search"
56
50
  search_memories(args)
57
51
  when "delete"
@@ -62,8 +56,6 @@ module Agentf
62
56
  subgraph(args)
63
57
  when "summary", "stats"
64
58
  show_summary
65
- when "by-tag"
66
- by_tag(args)
67
59
  when "by-agent"
68
60
  by_agent(args)
69
61
  when "by-type"
@@ -86,9 +78,10 @@ module Agentf
86
78
  output(result)
87
79
  end
88
80
 
89
- def list_pitfalls(args)
81
+ def list_episodes(args)
90
82
  limit = extract_limit(args)
91
- result = @reviewer.get_pitfalls(limit: limit)
83
+ outcome = parse_single_option(args, "--outcome=")
84
+ result = @reviewer.get_episodes(limit: limit, outcome: outcome)
92
85
  output(result)
93
86
  end
94
87
 
@@ -98,12 +91,6 @@ module Agentf
98
91
  output(result)
99
92
  end
100
93
 
101
- def list_successes(args)
102
- limit = extract_limit(args)
103
- result = @reviewer.get_successes(limit: limit)
104
- output(result)
105
- end
106
-
107
94
  def list_intents(args)
108
95
  limit = extract_limit(args)
109
96
  kind = args.shift
@@ -141,22 +128,32 @@ module Agentf
141
128
  exit 1
142
129
  end
143
130
 
144
- tags = parse_list_option(args, "--tags=")
145
131
  constraints = parse_list_option(args, "--constraints=")
146
132
  priority = parse_integer_option(args, "--priority=", default: 1)
147
133
 
148
- intent_id = @memory.store_business_intent(
149
- title: title,
150
- description: description,
151
- tags: tags,
152
- constraints: constraints,
153
- priority: priority
154
- )
134
+ id = nil
135
+ res = safe_cli_memory_write(@memory, attempted: { command: "add-business-intent", args: { title: title, description: description, constraints: constraints, priority: priority } }) do
136
+ id = @memory.store_business_intent(
137
+ title: title,
138
+ description: description,
139
+ constraints: constraints,
140
+ priority: priority
141
+ )
142
+ end
143
+
144
+ if res.is_a?(Hash) && res["confirmation_required"]
145
+ if @json_output
146
+ puts JSON.generate(res)
147
+ else
148
+ $stderr.puts "Confirmation required to store business intent: #{res['confirmation_details'].inspect}"
149
+ end
150
+ return
151
+ end
155
152
 
156
153
  if @json_output
157
- puts JSON.generate({ "id" => intent_id, "type" => "business_intent", "status" => "stored" })
154
+ puts JSON.generate({ "id" => id, "type" => "business_intent", "status" => "stored" })
158
155
  else
159
- puts "Stored business intent: #{intent_id}"
156
+ puts "Stored business intent: #{id}"
160
157
  end
161
158
  end
162
159
 
@@ -169,24 +166,74 @@ module Agentf
169
166
  exit 1
170
167
  end
171
168
 
172
- tags = parse_list_option(args, "--tags=")
173
169
  acceptance_criteria = parse_list_option(args, "--acceptance=")
174
170
  non_goals = parse_list_option(args, "--non-goals=")
175
171
  related_task_id = parse_single_option(args, "--task=")
176
172
 
177
- intent_id = @memory.store_feature_intent(
178
- title: title,
179
- description: description,
180
- tags: tags,
181
- acceptance_criteria: acceptance_criteria,
182
- non_goals: non_goals,
183
- related_task_id: related_task_id
184
- )
173
+ id = nil
174
+ res = safe_cli_memory_write(@memory, attempted: { command: "add-feature-intent", args: { title: title, description: description, acceptance: acceptance_criteria, non_goals: non_goals, related_task_id: related_task_id } }) do
175
+ id = @memory.store_feature_intent(
176
+ title: title,
177
+ description: description,
178
+ acceptance_criteria: acceptance_criteria,
179
+ non_goals: non_goals,
180
+ related_task_id: related_task_id
181
+ )
182
+ end
183
+
184
+ if res.is_a?(Hash) && res["confirmation_required"]
185
+ if @json_output
186
+ puts JSON.generate(res)
187
+ else
188
+ $stderr.puts "Confirmation required to store feature intent: #{res['confirmation_details'].inspect}"
189
+ end
190
+ return
191
+ end
185
192
 
186
193
  if @json_output
187
- puts JSON.generate({ "id" => intent_id, "type" => "feature_intent", "status" => "stored" })
194
+ puts JSON.generate({ "id" => id, "type" => "feature_intent", "status" => "stored" })
188
195
  else
189
- puts "Stored feature intent: #{intent_id}"
196
+ puts "Stored feature intent: #{id}"
197
+ end
198
+ end
199
+
200
# Stores a playbook memory from `add-playbook <title> <description>`.
#
# Reads the optional --steps=, --feature-area= and --agent= options; the
# agent falls back to Agentf::AgentRoles::PLANNER. Exits with status 1 when
# the title or description is missing. When the write needs confirmation,
# the confirmation payload is reported instead of a stored id.
def add_playbook(args)
  title = args.shift
  description = args.shift

  if title.to_s.empty? || description.to_s.empty?
    $stderr.puts "Error: add-playbook requires <title> <description>"
    exit 1
  end

  # Same keys serve as the echoed "attempted" args and the store keywords.
  payload = {
    title: title,
    description: description,
    steps: parse_list_option(args, "--steps="),
    feature_area: parse_single_option(args, "--feature-area="),
    agent: parse_single_option(args, "--agent=") || Agentf::AgentRoles::PLANNER
  }

  id = nil
  res = safe_cli_memory_write(@memory, attempted: { command: "add-playbook", args: payload }) do
    id = @memory.store_playbook(**payload)
  end

  if res.is_a?(Hash) && res["confirmation_required"]
    if @json_output
      puts JSON.generate(res)
    else
      $stderr.puts "Confirmation required to store playbook: #{res['confirmation_details'].inspect}"
    end
    return
  end

  if @json_output
    puts JSON.generate({ "id" => id, "type" => "playbook", "status" => "stored" })
  else
    puts "Stored playbook: #{id}"
  end
end
192
239
 
@@ -199,40 +246,53 @@ module Agentf
199
246
  exit 1
200
247
  end
201
248
 
202
- tags = parse_list_option(args, "--tags=")
203
249
  context = parse_single_option(args, "--context=").to_s
204
250
  agent = parse_single_option(args, "--agent=") || Agentf::AgentRoles::ENGINEER
205
251
  code_snippet = parse_single_option(args, "--code=").to_s
252
+ outcome = parse_single_option(args, "--outcome=")
253
+
254
+ id = nil
255
+ res = safe_cli_memory_write(@memory, attempted: { command: "add-#{type}", args: { title: title, description: description, context: context, agent: agent, code: code_snippet, outcome: outcome } }) do
256
+ id = @memory.store_episode(
257
+ type: type,
258
+ title: title,
259
+ description: description,
260
+ context: context,
261
+ agent: agent,
262
+ code_snippet: code_snippet,
263
+ outcome: outcome
264
+ )
265
+ end
206
266
 
207
- intent_id = @memory.store_episode(
208
- type: type,
209
- title: title,
210
- description: description,
211
- context: context,
212
- tags: tags,
213
- agent: agent,
214
- code_snippet: code_snippet
215
- )
267
+ if res.is_a?(Hash) && res["confirmation_required"]
268
+ if @json_output
269
+ puts JSON.generate(res)
270
+ else
271
+ $stderr.puts "Confirmation required to store #{type}: #{res['confirmation_details'].inspect}"
272
+ end
273
+ return
274
+ end
216
275
 
217
276
  if @json_output
218
- puts JSON.generate({ "id" => intent_id, "type" => type, "status" => "stored" })
277
+ puts JSON.generate({ "id" => id, "type" => type, "status" => "stored" })
219
278
  else
220
- puts "Stored #{type}: #{intent_id}"
279
+ puts "Stored #{type}: #{id}"
221
280
  end
222
281
  end
223
282
 
224
- def list_tags
225
- result = @reviewer.get_all_tags
226
- if @json_output
227
- puts JSON.generate(result)
228
- return
229
- end
230
-
231
- if result["tags"].empty?
232
- puts "No tags found."
233
- else
234
- puts "Tags (#{result["count"]}):"
235
- result["tags"].each { |tag| puts " - #{tag}" }
283
# Helper to standardize CLI memory write confirmation handling.
#
# Yields to the block that performs the write. If the block raises
# Agentf::Memory::RedisMemory::ConfirmationRequired, the error is converted
# into a payload the caller can print; any other error propagates.
#
# @param memory [Object] accepted for call-site compatibility; not used here
# @param attempted [Hash] description of the attempted command, echoed back
#   under the "attempted" key
# @yield performs the memory write
# @return [nil] when the block completes without raising
# @return [Hash] confirmation payload when confirmation is required
def safe_cli_memory_write(memory, attempted: {})
  yield
  nil
rescue Agentf::Memory::RedisMemory::ConfirmationRequired => e
  {
    "confirmation_required" => true,
    "confirmation_details" => e.details,
    "attempted" => attempted,
    "confirmed_write_token" => "confirmed",
    "confirmation_prompt" => "Ask the user whether to save this memory. If they approve, rerun the same command with confirmation enabled. If they decline, do not retry."
  }
end
238
298
 
@@ -266,19 +326,12 @@ module Agentf
266
326
  puts ""
267
327
  puts "By agent:"
268
328
  result["by_agent"].each { |agent, count| puts " #{agent}: #{count}" }
269
- puts ""
270
- puts "Unique tags: #{result["unique_tags"]}"
271
- end
272
329
 
273
- def by_tag(args)
274
- tag = args.shift
275
- if tag.nil? || tag.empty?
276
- $stderr.puts "Error: by-tag requires a tag name"
277
- exit 1
330
+ if result["by_outcome"].is_a?(Hash)
331
+ puts ""
332
+ puts "By outcome:"
333
+ result["by_outcome"].each { |outcome, count| puts " #{outcome}: #{count}" }
278
334
  end
279
- limit = extract_limit(args)
280
- result = @reviewer.get_by_tag(tag, limit: limit)
281
- output(result)
282
335
  end
283
336
 
284
337
  def by_agent(args)
@@ -488,8 +541,8 @@ module Agentf
488
541
  [#{mem["type"]&.upcase}] #{mem["title"]}
489
542
  #{mem["created_at"]} by #{mem["agent"]}
490
543
  #{mem["description"]}
544
+ #{"Outcome: #{mem['outcome']}" unless mem["outcome"].to_s.empty?}
491
545
  #{format_code(mem["code_snippet"]) unless mem["code_snippet"].to_s.empty?}
492
- Tags: #{mem["tags"]&.join(", ") || "none"}
493
546
  OUTPUT
494
547
  end
495
548
 
@@ -505,26 +558,22 @@ module Agentf
505
558
 
506
559
  Commands:
507
560
  recent, list List recent memories (default: 10)
508
- pitfalls List pitfalls (things that went wrong)
561
+ episodes List episode memories
509
562
  lessons List lessons learned
510
- successes List successes
511
563
  intents [kind] List intents (kind: business|feature)
512
564
  business-intents List business intents
513
565
  feature-intents List feature intents
514
566
  add-business-intent Store business intent
515
567
  add-feature-intent Store feature intent
568
+ add-playbook Store playbook memory
516
569
  add-lesson Store lesson memory
517
- add-success Store success memory
518
- add-pitfall Store pitfall memory
519
- tags List all unique tags
520
- search <query> Search memories by keyword
570
+ search <query> Search memories semantically
521
571
  delete id <memory_id> Delete one memory and related edges
522
572
  delete last -n <count> Delete most recent memories
523
573
  delete all Delete memories and graph/task keys
524
574
  neighbors <id> Traverse graph edges from a memory id
525
575
  subgraph <ids> Build graph from comma-separated seed ids
526
576
  summary, stats Show summary statistics
527
- by-tag <tag> Get memories with specific tag
528
577
  by-agent <agent> Get memories from specific agent
529
578
  by-type <type> Get memories by type (#{VALID_EPISODE_TYPES.join("|")})
530
579
 
@@ -534,18 +583,17 @@ module Agentf
534
583
 
535
584
  Examples:
536
585
  agentf memory recent -n 5
537
- agentf memory pitfalls
586
+ agentf memory episodes --outcome=negative
538
587
  agentf memory intents business -n 5
539
- agentf memory add-business-intent "Reliability" "Prioritize uptime" --tags=ops,platform --constraints="No downtime;No vendor lock-in"
588
+ agentf memory add-business-intent "Reliability" "Prioritize uptime" --constraints="No downtime;No vendor lock-in"
540
589
  agentf memory add-feature-intent "Agent handoff" "Improve orchestrator continuity" --acceptance="Keeps context;Preserves task state"
541
- agentf memory add-lesson "Refactor strategy" "Extracted adapter seam" --agent=PLANNER --tags=architecture
542
- agentf memory add-success "Provider install works" "Installed copilot + opencode manifests" --agent=ENGINEER
590
+ agentf memory add-playbook "Release rollout" "Safe deploy sequence" --steps="deploy canary;monitor;promote"
591
+ agentf memory add-lesson "Refactor strategy" "Extracted adapter seam" --agent=PLANNER
543
592
  agentf memory search "react"
544
593
  agentf memory delete id episode_abcd
545
594
  agentf memory delete last -n 10 --scope=project
546
595
  agentf memory delete all --scope=all --yes
547
596
  agentf memory neighbors episode_abcd --depth=2
548
- agentf memory by-tag "performance"
549
597
  agentf memory summary
550
598
  HELP
551
599
  end
@@ -7,6 +7,7 @@ require_relative "install"
7
7
  require_relative "update"
8
8
  require_relative "metrics"
9
9
  require_relative "architecture"
10
+ require_relative "eval"
10
11
 
11
12
  module Agentf
12
13
  module CLI
@@ -18,8 +19,8 @@ module Agentf
18
19
  # agentf install --provider opencode,copilot
19
20
  # agentf version
20
21
  # agentf help
21
- class Router
22
- SUBCOMMANDS = %w[memory code metrics architecture install update mcp-server version help].freeze
22
+ class Router
23
+ SUBCOMMANDS = %w[memory code metrics architecture install update eval agent mcp-server version help].freeze
23
24
 
24
25
  def run(args)
25
26
  subcommand = args.shift || "help"
@@ -42,8 +43,14 @@ module Agentf
42
43
  Architecture.new.run(args)
43
44
  when "update"
44
45
  Update.new.run(args)
46
+ when "eval"
47
+ Eval.new.run(args)
45
48
  when "mcp-server"
46
49
  start_mcp_server
50
+ when "agent"
51
+ # agent <AGENT_NAME> [payload]
52
+ require_relative "agent"
53
+ Agent.new.run(args)
47
54
  when "version", "--version", "-v"
48
55
  puts "agentf #{Agentf::VERSION}"
49
56
  when "help", "--help", "-h"
@@ -68,12 +75,14 @@ module Agentf
68
75
  Usage: agentf <command> [subcommand] [options]
69
76
 
70
77
  Commands:
71
- memory Manage agent memory (lessons, pitfalls, successes, intents)
78
+ memory Manage agent memory (episodes, lessons, playbooks, intents)
72
79
  code Explore codebase (glob, grep, tree, related files)
73
80
  metrics Show workflow success and provider parity metrics
74
81
  architecture Analyze architecture layers and violations
75
82
  install Generate provider manifests (agents, commands, tools)
76
83
  update Regenerate manifests when gem version changes
84
+ eval Run black-box eval scenarios against `agentf agent`
85
+ agent Run a single agent directly
77
86
  mcp-server Start MCP server over stdio (for Copilot integration)
78
87
  version Show version
79
88
 
@@ -87,7 +96,7 @@ module Agentf
87
96
  AGENTF_WORKFLOW_CONTRACT_MODE=advisory|enforcing|off Contract behavior mode
88
97
  AGENTF_AGENT_CONTRACT_ENABLED=true|false Enable/disable per-agent contract checks
89
98
  AGENTF_AGENT_CONTRACT_MODE=advisory|enforcing|off Per-agent contract behavior mode
90
- AGENTF_DEFAULT_PACK=generic|rails_standard|rails_37signals|rails_feature_spec
99
+ (AGENTF_DEFAULT_PACK no longer used — orchestrator uses internal profiles)
91
100
  AGENTF_GEM_PATH=/path/to/gem Path to agentf gem (for OpenCode plugin binary resolution)
92
101
 
93
102
  Examples:
@@ -100,6 +109,9 @@ module Agentf
100
109
  agentf metrics parity --json
101
110
  agentf architecture analyze
102
111
  agentf architecture review --json
112
+ agentf eval list
113
+ agentf eval run all --json
114
+ agentf agent planner "Plan a refactor" --json
103
115
  agentf update
104
116
  agentf update --force --provider=opencode,copilot
105
117
  agentf mcp-server