agentf 0.4.7 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,134 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
+ require "time"
4
+
5
module Agentf
  module Evals
    # Aggregates eval run records stored as JSON Lines in
    # "<output_root>/history.jsonl" into a summary hash.
    #
    # Every record is expected to be a JSON object. The keys read by this
    # class are "status", "scenario", "recorded_at", "retry_count", "flaky",
    # "providers", "models" and "memory_effectiveness".
    class Report
      attr_reader :output_root

      # @param output_root [String] directory that holds "history.jsonl";
      #   stored as an absolute path.
      def initialize(output_root: Runner::DEFAULT_OUTPUT_ROOT)
        @output_root = File.expand_path(output_root)
      end

      # Builds the summary for the recorded runs, after applying the filters
      # in this order: +since+, +scenario+, then +limit+.
      #
      # @param limit [Integer, nil] keep only the last +limit+ records (in
      #   file order); ignored when nil or not positive.
      # @param since [Time, #to_s, nil] keep only records whose "recorded_at"
      #   is at or after this instant; nil or blank disables the filter.
      # @param scenario [String, nil] keep only records of this scenario;
      #   nil or blank disables the filter.
      # @return [Hash{String => Object}] summary with string keys.
      # @raise [ArgumentError] when a non-blank +since+ cannot be parsed.
      def generate(limit: nil, since: nil, scenario: nil)
        records = load_history
        records = filter_since(records, since)
        records = filter_scenario(records, scenario)
        records = records.last(limit) if limit && limit.positive?

        {
          "output_root" => output_root,
          "history_path" => history_path,
          "count" => records.length,
          "passes" => records.count { |record| record["status"] == "passed" },
          "failures" => records.count { |record| record["status"] == "failed" },
          "retry_summary" => summarize_retries(records),
          "memory_effectiveness" => summarize_memory_effectiveness(records),
          "providers" => summarize_dimension(records, "providers"),
          "models" => summarize_dimension(records, "models"),
          "scenarios" => summarize_scenarios(records)
        }
      end

      private

      # @return [String] absolute path of the JSON Lines history file.
      def history_path
        File.join(output_root, "history.jsonl")
      end

      # Reads the history file line by line.
      #
      # Blank lines, lines that are not valid JSON, and lines holding valid
      # JSON that is not an object (e.g. `[1, 2]` or `42`) are skipped, so a
      # single corrupt line cannot abort the whole report.
      #
      # @return [Array<Hash>] parsed records in file order; empty when the
      #   history file does not exist.
      def load_history
        return [] unless File.exist?(history_path)

        File.foreach(history_path, chomp: true).filter_map do |line|
          next if line.strip.empty?

          record = JSON.parse(line)
          # Only JSON objects can be indexed by the string keys used below.
          record if record.is_a?(Hash)
        rescue JSON::ParserError
          nil
        end
      end

      # Keeps records whose "recorded_at" timestamp is at or after +since+.
      # Records with a missing, non-String or unparseable "recorded_at" are
      # dropped while the filter is active.
      #
      # A blank +since+ means "no filter", matching #filter_scenario.
      def filter_since(records, since)
        return records unless since
        return records if since.to_s.strip.empty?

        cutoff = since.is_a?(Time) ? since : Time.parse(since.to_s)
        records.select do |record|
          recorded_at = record["recorded_at"]
          # Time.parse raises TypeError (not ArgumentError) for non-Strings,
          # so rule those out before parsing.
          recorded_at.is_a?(String) && Time.parse(recorded_at) >= cutoff
        rescue ArgumentError
          false
        end
      end

      # Keeps records whose "scenario" equals +scenario+ exactly; a nil or
      # blank +scenario+ disables the filter.
      def filter_scenario(records, scenario)
        return records if scenario.to_s.strip.empty?

        records.select { |record| record["scenario"] == scenario }
      end

      # Counts retried and flaky runs. "retry_count" is coerced with #to_i,
      # so a missing value counts as zero retries.
      def summarize_retries(records)
        retry_counts = records.map { |record| record["retry_count"].to_i }
        {
          "retried_runs" => retry_counts.count(&:positive?),
          "total_retries" => retry_counts.sum,
          # Strict comparison: only a JSON `true` marks a run as flaky.
          "flaky_runs" => records.count { |record| record["flaky"] == true }
        }
      end

      # Groups run statistics by each value listed under +key+ (for example
      # "providers" or "models"). A record listing several values contributes
      # to every one of them; a record without the key contributes to none.
      #
      # @return [Hash{Object => Hash}] stats per distinct value.
      def summarize_dimension(records, key)
        summary = Hash.new { |hash, name| hash[name] = base_stats }

        records.each do |record|
          Array(record[key]).each { |name| update_stats(summary[name], record) }
        end

        summary
      end

      # Groups run statistics by scenario name and tracks the status and
      # timestamp of the last record seen (in input order) per scenario.
      #
      # @return [Hash{Object => Hash}] stats per scenario.
      def summarize_scenarios(records)
        summary = Hash.new do |hash, name|
          hash[name] = base_stats.merge("last_status" => nil, "last_recorded_at" => nil)
        end

        records.each do |record|
          entry = summary[record["scenario"]]
          update_stats(entry, record)
          update_memory_effectiveness(entry, record)
          entry["last_status"] = record["status"]
          entry["last_recorded_at"] = record["recorded_at"]
        end

        summary
      end

      # Summarizes memory retrieval across records that carry a
      # "memory_effectiveness" value.
      def summarize_memory_effectiveness(records)
        relevant = records.filter_map { |record| record["memory_effectiveness"] }
        {
          "tracked_runs" => relevant.length,
          "retrieved_expected_memory" => relevant.count { |item| item["retrieved_expected_memory"] == true }
        }
      end

      # @return [Hash{String => Integer}] a fresh zeroed counter set.
      def base_stats
        { "total" => 0, "passed" => 0, "failed" => 0, "retried" => 0, "flaky" => 0 }
      end

      # Adds one record to a counter set.
      #
      # NOTE: any status other than "passed" is counted under "failed" here,
      # whereas the top-level "failures" figure in #generate counts only
      # records whose status is exactly "failed".
      def update_stats(entry, record)
        entry["total"] += 1
        entry[record["status"] == "passed" ? "passed" : "failed"] += 1
        entry["retried"] += 1 if record["retry_count"].to_i.positive?
        entry["flaky"] += 1 if record["flaky"] == true
      end

      # Adds memory retrieval counters to +entry+. The "memory_tracked" and
      # "memory_retrieved" keys only appear once a record with a
      # "memory_effectiveness" value has been seen.
      def update_memory_effectiveness(entry, record)
        effect = record["memory_effectiveness"]
        return unless effect

        retrieved = effect["retrieved_expected_memory"] == true ? 1 : 0
        entry["memory_tracked"] = entry.fetch("memory_tracked", 0) + 1
        entry["memory_retrieved"] = entry.fetch("memory_retrieved", 0) + retrieved
      end
    end
  end
end