agentf 0.4.6 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/agentf/agents/architect.rb +4 -0
- data/lib/agentf/agents/base.rb +29 -1
- data/lib/agentf/agents/debugger.rb +33 -10
- data/lib/agentf/agents/designer.rb +19 -8
- data/lib/agentf/agents/documenter.rb +6 -0
- data/lib/agentf/agents/explorer.rb +31 -12
- data/lib/agentf/agents/reviewer.rb +5 -0
- data/lib/agentf/agents/security.rb +26 -16
- data/lib/agentf/agents/specialist.rb +32 -18
- data/lib/agentf/agents/tester.rb +47 -8
- data/lib/agentf/cli/agent.rb +95 -0
- data/lib/agentf/cli/eval.rb +203 -0
- data/lib/agentf/cli/install.rb +7 -0
- data/lib/agentf/cli/memory.rb +82 -30
- data/lib/agentf/cli/router.rb +15 -3
- data/lib/agentf/cli/update.rb +9 -2
- data/lib/agentf/commands/memory_reviewer.rb +10 -2
- data/lib/agentf/commands/metrics.rb +16 -14
- data/lib/agentf/commands/registry.rb +28 -0
- data/lib/agentf/evals/report.rb +134 -0
- data/lib/agentf/evals/runner.rb +771 -0
- data/lib/agentf/evals/scenario.rb +211 -0
- data/lib/agentf/installer.rb +486 -348
- data/lib/agentf/mcp/server.rb +291 -49
- data/lib/agentf/memory.rb +97 -19
- data/lib/agentf/service/providers.rb +10 -62
- data/lib/agentf/version.rb +1 -1
- data/lib/agentf/workflow_engine.rb +204 -73
- data/lib/agentf.rb +9 -3
- metadata +8 -3
- data/lib/agentf/packs.rb +0 -74
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
require "time"
|
|
4
|
+
|
|
5
|
+
module Agentf
|
|
6
|
+
module Evals
|
|
7
|
+
# Builds an aggregate summary of eval runs from the JSON Lines history file
# ("history.jsonl") stored under an output directory.
#
# Each non-blank line of the history file is expected to be one JSON object
# describing a run. The keys read by this class are "status", "scenario",
# "recorded_at", "retry_count", "flaky", "providers", "models" and
# "memory_effectiveness".
class Report
  # @param output_root [String] directory that contains "history.jsonl";
  #   stored as an absolute path. Defaults to Runner::DEFAULT_OUTPUT_ROOT.
  def initialize(output_root: Runner::DEFAULT_OUTPUT_ROOT)
    @output_root = File.expand_path(output_root)
  end

  # @return [String] absolute path of the directory holding the history file
  attr_reader :output_root

  # Loads the history, applies the optional filters and summarizes the result.
  #
  # Filters are applied in this order: +since+, then +scenario+, then +limit+
  # (which keeps the LAST +limit+ records of whatever survived the filters).
  #
  # @param limit [Integer, nil] keep only the last N records; ignored when
  #   nil, zero or negative
  # @param since [Time, #to_s, nil] keep records whose "recorded_at" is at or
  #   after this instant; non-Time values are parsed with Time.parse
  # @param scenario [String, nil] keep records whose "scenario" equals this
  #   value; ignored when nil or blank
  # @return [Hash{String => Object}] summary with string keys
  # @raise [ArgumentError] when +since+ is given but cannot be parsed as a time
  def generate(limit: nil, since: nil, scenario: nil)
    records = load_history
    records = filter_since(records, since)
    records = filter_scenario(records, scenario)
    records = records.last(limit) if limit && limit.positive?

    {
      "output_root" => output_root,
      "history_path" => history_path,
      "count" => records.length,
      "passes" => records.count { |record| record["status"] == "passed" },
      "failures" => records.count { |record| record["status"] == "failed" },
      "retry_summary" => summarize_retries(records),
      "memory_effectiveness" => summarize_memory_effectiveness(records),
      "providers" => summarize_dimension(records, "providers"),
      "models" => summarize_dimension(records, "models"),
      "scenarios" => summarize_scenarios(records)
    }
  end

  private

  # @return [String] path of the JSON Lines history file inside output_root
  def history_path
    File.join(output_root, "history.jsonl")
  end

  # Reads every usable record from the history file.
  #
  # Blank lines, lines that are not valid JSON, and lines whose JSON value is
  # not an object are skipped, so one bad line cannot break the whole report.
  #
  # @return [Array<Hash>] parsed records in file order; empty when the file
  #   does not exist
  def load_history
    return [] unless File.exist?(history_path)

    File.readlines(history_path, chomp: true).filter_map do |line|
      next if line.to_s.strip.empty?

      parse_record(line)
    end
  end

  # Parses a single history line.
  #
  # @param line [String] raw line from the history file
  # @return [Hash, nil] the decoded object, or nil when the line is not valid
  #   JSON or does not decode to a Hash (the summaries index records by
  #   string key, so scalars and arrays cannot be used)
  def parse_record(line)
    parsed = JSON.parse(line)
    parsed if parsed.is_a?(Hash)
  rescue JSON::ParserError
    nil
  end

  # Keeps records recorded at or after +since+.
  #
  # Records with a missing, non-String or unparseable "recorded_at" are
  # dropped rather than raising.
  #
  # @param records [Array<Hash>]
  # @param since [Time, #to_s, nil] cutoff; nil disables the filter
  # @return [Array<Hash>]
  # @raise [ArgumentError] when +since+ itself cannot be parsed
  def filter_since(records, since)
    return records unless since

    cutoff = since.is_a?(Time) ? since : Time.parse(since.to_s)
    records.select do |record|
      recorded_at = parse_recorded_at(record["recorded_at"])
      recorded_at && recorded_at >= cutoff
    end
  end

  # Converts a record's "recorded_at" value into a Time.
  #
  # @param value [Object] raw value taken from the record
  # @return [Time, nil] nil when the value is not a String (Time.parse would
  #   raise TypeError) or is not a parseable timestamp
  def parse_recorded_at(value)
    return nil unless value.is_a?(String)

    Time.parse(value)
  rescue ArgumentError
    nil
  end

  # Keeps records whose "scenario" equals +scenario+.
  #
  # @param records [Array<Hash>]
  # @param scenario [String, nil] nil or blank disables the filter
  # @return [Array<Hash>]
  def filter_scenario(records, scenario)
    return records if scenario.to_s.strip.empty?

    records.select { |record| record["scenario"] == scenario }
  end

  # Counts retry-related figures across all records.
  #
  # "retry_count" is coerced with to_i, so nil or non-numeric values count as
  # zero. "flaky" is only counted when it is exactly true.
  #
  # @param records [Array<Hash>]
  # @return [Hash{String => Integer}]
  def summarize_retries(records)
    retried = records.count { |record| record["retry_count"].to_i.positive? }
    {
      "retried_runs" => retried,
      "total_retries" => records.sum { |record| record["retry_count"].to_i },
      "flaky_runs" => records.count { |record| record["flaky"] == true }
    }
  end

  # Builds per-value statistics for a record field such as "providers" or
  # "models". The field may hold a single value or an array; a record
  # contributes once to every value it lists.
  #
  # @param records [Array<Hash>]
  # @param key [String] record field to group by
  # @return [Hash{Object => Hash}] plain hash (no default proc), so looking
  #   up a value that never occurred returns nil instead of adding an entry
  def summarize_dimension(records, key)
    records.each_with_object({}) do |record, summary|
      Array(record[key]).each do |name|
        entry = (summary[name] ||= base_stats)
        update_stats(entry, record)
      end
    end
  end

  # Builds per-scenario statistics, including the status and timestamp of the
  # last record seen for each scenario (in history order) and, when a record
  # carries "memory_effectiveness", the memory counters.
  #
  # @param records [Array<Hash>]
  # @return [Hash{Object => Hash}] plain hash (no default proc) keyed by the
  #   record's "scenario" value
  def summarize_scenarios(records)
    records.each_with_object({}) do |record, summary|
      entry = (summary[record["scenario"]] ||=
        base_stats.merge("last_status" => nil, "last_recorded_at" => nil))
      update_stats(entry, record)
      update_memory_effectiveness(entry, record)
      entry["last_status"] = record["status"]
      entry["last_recorded_at"] = record["recorded_at"]
    end
  end

  # Summarizes memory retrieval across records that carry a
  # "memory_effectiveness" entry.
  #
  # @param records [Array<Hash>]
  # @return [Hash{String => Integer}]
  def summarize_memory_effectiveness(records)
    relevant = records.filter_map { |record| record["memory_effectiveness"] }
    {
      "tracked_runs" => relevant.length,
      "retrieved_expected_memory" => relevant.count { |item| item["retrieved_expected_memory"] == true }
    }
  end

  # @return [Hash{String => Integer}] a fresh zeroed counter set
  def base_stats
    { "total" => 0, "passed" => 0, "failed" => 0, "retried" => 0, "flaky" => 0 }
  end

  # Adds one record to a counter set produced by #base_stats.
  #
  # Any status other than "passed" is counted under "failed" here, so
  # "passed" + "failed" always equals "total". This differs from the
  # top-level "failures" figure in #generate, which counts only records
  # whose status is exactly "failed".
  #
  # @param entry [Hash{String => Object}] counters to mutate
  # @param record [Hash] history record
  # @return [void]
  def update_stats(entry, record)
    entry["total"] += 1
    entry[record["status"] == "passed" ? "passed" : "failed"] += 1
    entry["retried"] += 1 if record["retry_count"].to_i.positive?
    entry["flaky"] += 1 if record["flaky"] == true
  end

  # Adds a record's memory-effectiveness data to a scenario entry.
  #
  # The "memory_tracked" / "memory_retrieved" keys are created on first use,
  # so scenarios without any memory data never get them.
  #
  # @param entry [Hash{String => Object}] scenario counters to mutate
  # @param record [Hash] history record
  # @return [void]
  def update_memory_effectiveness(entry, record)
    effect = record["memory_effectiveness"]
    return unless effect

    entry["memory_tracked"] = entry.fetch("memory_tracked", 0) + 1
    entry["memory_retrieved"] = entry.fetch("memory_retrieved", 0) + (effect["retrieved_expected_memory"] == true ? 1 : 0)
  end
end
|
|
133
|
+
end
|
|
134
|
+
end
|