ruby_llm-agents 3.6.0 → 3.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +17 -0
- data/app/controllers/ruby_llm/agents/executions_controller.rb +1 -3
- data/app/helpers/ruby_llm/agents/application_helper.rb +0 -27
- data/app/views/ruby_llm/agents/dashboard/index.html.erb +11 -11
- data/app/views/ruby_llm/agents/system_config/show.html.erb +0 -13
- data/lib/generators/ruby_llm_agents/templates/initializer.rb.tt +0 -15
- data/lib/ruby_llm/agents/core/version.rb +1 -1
- data/lib/ruby_llm/agents/eval/eval_result.rb +73 -0
- data/lib/ruby_llm/agents/eval/eval_run.rb +124 -0
- data/lib/ruby_llm/agents/eval/eval_suite.rb +264 -0
- data/lib/ruby_llm/agents/eval.rb +5 -0
- data/lib/ruby_llm/agents.rb +3 -0
- metadata +5 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: ce728e318b0681df1f65dc93e4c264f35573e863b86841394ab218994dd3dd29
|
|
4
|
+
data.tar.gz: f036ab7df822277740a0f840afd52d3d68c36bbd37843ae16032da7f9406864e
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 6e63acc86ac7413983957abc46bbf5ba997230ae06480d6aa2eb77d6f25524e02cc58131112cda0b5eb413452078766d8afec001d98e56e6fb3ae2e8a0602dff
|
|
7
|
+
data.tar.gz: d01dba7531c3e503f7b00156a254c8a71ab582d20b194ad22d1f6bc1734cdd38b814e6581855274e08b49bb2c8d33e7655a02c2ed834771d9a990085ce743c84
|
data/README.md
CHANGED
|
@@ -162,6 +162,21 @@ result.url # => "https://..."
|
|
|
162
162
|
result.save("logo.png")
|
|
163
163
|
```
|
|
164
164
|
|
|
165
|
+
```ruby
|
|
166
|
+
# Evaluate agent quality with built-in scoring
|
|
167
|
+
class SupportRouter::Eval < RubyLLM::Agents::Eval::EvalSuite
|
|
168
|
+
agent SupportRouter
|
|
169
|
+
|
|
170
|
+
test_case "billing", input: { message: "charged twice" }, expected: "billing"
|
|
171
|
+
test_case "technical", input: { message: "500 error" }, expected: "technical"
|
|
172
|
+
test_case "greeting", input: { message: "hello" }, expected: "general"
|
|
173
|
+
end
|
|
174
|
+
|
|
175
|
+
run = SupportRouter::Eval.run!
|
|
176
|
+
puts run.summary
|
|
177
|
+
# SupportRouter eval: 3/3 passed (score: 1.0)
|
|
178
|
+
```
|
|
179
|
+
|
|
165
180
|
## Features
|
|
166
181
|
|
|
167
182
|
| Feature | Description | Docs |
|
|
@@ -184,6 +199,7 @@ result.save("logo.png")
|
|
|
184
199
|
| **Audio** | Text-to-speech (OpenAI, ElevenLabs), speech-to-text, dynamic pricing, 28+ output formats, dashboard audio playback | [Audio](https://github.com/adham90/ruby_llm-agents/wiki/Audio) |
|
|
185
200
|
| **Agent Composition** | Use agents as tools in other agents with automatic hierarchy tracking | [Tools](https://github.com/adham90/ruby_llm-agents/wiki/Tools) |
|
|
186
201
|
| **Queryable Agents** | Query execution history from agent classes with stats, replay, and cost breakdown | [Querying](https://github.com/adham90/ruby_llm-agents/wiki/Querying-Executions) |
|
|
202
|
+
| **Evaluation** | Test agent quality with exact match, contains, LLM judge, and custom scorers | [Evaluation](https://github.com/adham90/ruby_llm-agents/wiki/Evaluation) |
|
|
187
203
|
| **Alerts** | Slack, webhook, and custom notifications | [Alerts](https://github.com/adham90/ruby_llm-agents/wiki/Alerts) |
|
|
188
204
|
| **AS::Notifications** | 11 instrumentation events across execution, cache, budget, and reliability | [Events](https://github.com/adham90/ruby_llm-agents/wiki/ActiveSupport-Notifications) |
|
|
189
205
|
| **Custom Middleware** | Inject custom middleware globally or per-agent with positioning control | [Middleware](https://github.com/adham90/ruby_llm-agents/wiki/Custom-Middleware) |
|
|
@@ -267,6 +283,7 @@ mount RubyLLM::Agents::Engine => "/agents"
|
|
|
267
283
|
| [Multi-Tenancy](https://github.com/adham90/ruby_llm-agents/wiki/Multi-Tenancy) | Per-tenant budgets, isolation, configuration |
|
|
268
284
|
| [Async/Fiber](https://github.com/adham90/ruby_llm-agents/wiki/Async-Fiber) | Concurrent execution with Ruby fibers |
|
|
269
285
|
| [Testing Agents](https://github.com/adham90/ruby_llm-agents/wiki/Testing-Agents) | RSpec patterns, mocking, dry_run mode |
|
|
286
|
+
| [Evaluation](https://github.com/adham90/ruby_llm-agents/wiki/Evaluation) | Score agent quality with built-in and custom scorers |
|
|
270
287
|
| [Error Handling](https://github.com/adham90/ruby_llm-agents/wiki/Error-Handling) | Error types, recovery patterns |
|
|
271
288
|
| [Routing](https://github.com/adham90/ruby_llm-agents/wiki/Routing) | Message classification, routing DSL, inline classify |
|
|
272
289
|
| [Embeddings](https://github.com/adham90/ruby_llm-agents/wiki/Embeddings) | Vector embeddings, batching, caching, preprocessing |
|
data/app/controllers/ruby_llm/agents/executions_controller.rb
CHANGED
|
@@ -96,9 +96,7 @@ module RubyLLM
|
|
|
96
96
|
# @param execution [Execution] The execution record
|
|
97
97
|
# @return [String] CSV row string
|
|
98
98
|
def generate_csv_row(execution)
|
|
99
|
-
redacted_error_message =
|
|
100
|
-
Redactor.redact_string(execution.error_message)
|
|
101
|
-
end
|
|
99
|
+
redacted_error_message = execution.error_message
|
|
102
100
|
|
|
103
101
|
CSV.generate_line([
|
|
104
102
|
execution.id,
|
data/app/helpers/ruby_llm/agents/application_helper.rb
CHANGED
|
@@ -120,33 +120,6 @@ module RubyLLM
|
|
|
120
120
|
end
|
|
121
121
|
end
|
|
122
122
|
|
|
123
|
-
# Redacts sensitive data from an object for display
|
|
124
|
-
#
|
|
125
|
-
# Uses the configured redaction rules to mask sensitive fields
|
|
126
|
-
# and patterns in the data.
|
|
127
|
-
#
|
|
128
|
-
# @param obj [Object] The object to redact (Hash, Array, or primitive)
|
|
129
|
-
# @return [Object] The redacted object
|
|
130
|
-
# @example
|
|
131
|
-
# redact_for_display({ password: "secret", name: "John" })
|
|
132
|
-
# #=> { password: "[REDACTED]", name: "John" }
|
|
133
|
-
def redact_for_display(obj)
|
|
134
|
-
Redactor.redact(obj)
|
|
135
|
-
end
|
|
136
|
-
|
|
137
|
-
# Syntax-highlights a redacted Ruby object as pretty-printed JSON
|
|
138
|
-
#
|
|
139
|
-
# Combines redaction and highlighting in one call.
|
|
140
|
-
#
|
|
141
|
-
# @param obj [Object] Any JSON-serializable Ruby object
|
|
142
|
-
# @return [ActiveSupport::SafeBuffer] HTML-safe highlighted redacted JSON
|
|
143
|
-
def highlight_json_redacted(obj)
|
|
144
|
-
return "" if obj.nil?
|
|
145
|
-
|
|
146
|
-
redacted = redact_for_display(obj)
|
|
147
|
-
highlight_json(redacted)
|
|
148
|
-
end
|
|
149
|
-
|
|
150
123
|
# Syntax-highlights a Ruby object as pretty-printed JSON
|
|
151
124
|
#
|
|
152
125
|
# Converts the object to JSON and applies color highlighting
|
data/app/views/ruby_llm/agents/dashboard/index.html.erb
CHANGED
|
@@ -2,27 +2,27 @@
|
|
|
2
2
|
<%= render partial: "ruby_llm/agents/dashboard/action_center", locals: { critical_alerts: @critical_alerts } %>
|
|
3
3
|
|
|
4
4
|
<!-- Stats Strip + Range Selector -->
|
|
5
|
-
<div class="flex items-center justify-between mb-3">
|
|
6
|
-
<div class="flex items-center gap-4">
|
|
7
|
-
<h1 class="text-[10px] font-medium text-gray-400 dark:text-gray-500 uppercase tracking-widest font-mono">overview</h1>
|
|
8
|
-
<div class="flex items-center gap-1.5 font-mono text-xs text-gray-400 dark:text-gray-500">
|
|
5
|
+
<div class="flex flex-wrap items-center justify-between gap-2 mb-3">
|
|
6
|
+
<div class="flex items-center gap-4 min-w-0">
|
|
7
|
+
<h1 class="text-[10px] font-medium text-gray-400 dark:text-gray-500 uppercase tracking-widest font-mono shrink-0">overview</h1>
|
|
8
|
+
<div class="flex flex-wrap items-center gap-x-1.5 gap-y-0.5 font-mono text-xs text-gray-400 dark:text-gray-500">
|
|
9
9
|
<% total = @now_strip[:success_today] + @now_strip[:errors_today] %>
|
|
10
|
-
<span class="text-gray-800 dark:text-gray-200"><%= number_with_delimiter(total) %></span> runs
|
|
10
|
+
<span class="whitespace-nowrap"><span class="text-gray-800 dark:text-gray-200"><%= number_with_delimiter(total) %></span> runs</span>
|
|
11
11
|
<span class="text-gray-300 dark:text-gray-700">·</span>
|
|
12
|
-
<span class="<%= @now_strip[:errors_today] > 0 ? 'text-red-500' : 'text-gray-800 dark:text-gray-200' %>"><%= @now_strip[:errors_today] %></span> errors<% if total > 0 && @now_strip[:errors_today] > 0 %> <span class="text-gray-300 dark:text-gray-600">(<%= (@now_strip[:errors_today].to_f / total * 100).round(1) %>%)</span><% end %>
|
|
12
|
+
<span class="whitespace-nowrap"><span class="<%= @now_strip[:errors_today] > 0 ? 'text-red-500' : 'text-gray-800 dark:text-gray-200' %>"><%= @now_strip[:errors_today] %></span> errors<% if total > 0 && @now_strip[:errors_today] > 0 %> <span class="text-gray-300 dark:text-gray-600">(<%= (@now_strip[:errors_today].to_f / total * 100).round(1) %>%)</span><% end %></span>
|
|
13
13
|
<span class="text-gray-300 dark:text-gray-700">·</span>
|
|
14
|
-
<span class="text-gray-800 dark:text-gray-200">$<%= number_with_precision(@now_strip[:cost_today], precision: 2) %></span>
|
|
14
|
+
<span class="text-gray-800 dark:text-gray-200 whitespace-nowrap">$<%= number_with_precision(@now_strip[:cost_today], precision: 2) %></span>
|
|
15
15
|
<% if @cache_savings[:count] > 0 %>
|
|
16
16
|
<span class="text-gray-300 dark:text-gray-700">·</span>
|
|
17
|
-
<span class="text-green-500"><%= number_with_delimiter(@cache_savings[:count]) %></span> cache hits
|
|
17
|
+
<span class="whitespace-nowrap"><span class="text-green-500"><%= number_with_delimiter(@cache_savings[:count]) %></span> cache hits</span>
|
|
18
18
|
<span class="text-gray-300 dark:text-gray-700">·</span>
|
|
19
|
-
<span class="text-green-500">$<%= number_with_precision(@cache_savings[:estimated_savings], precision: 2) %></span> saved
|
|
19
|
+
<span class="whitespace-nowrap"><span class="text-green-500">$<%= number_with_precision(@cache_savings[:estimated_savings], precision: 2) %></span> saved</span>
|
|
20
20
|
<span class="text-gray-300 dark:text-gray-700">·</span>
|
|
21
|
-
<span class="text-gray-800 dark:text-gray-200"><%= @cache_savings[:hit_rate] %>%</span> hit rate
|
|
21
|
+
<span class="whitespace-nowrap"><span class="text-gray-800 dark:text-gray-200"><%= @cache_savings[:hit_rate] %>%</span> hit rate</span>
|
|
22
22
|
<% end %>
|
|
23
23
|
</div>
|
|
24
24
|
</div>
|
|
25
|
-
<div class="relative font-mono text-xs" x-data="{ open: false, showCustom: false }" @click.outside="open = false; showCustom = false">
|
|
25
|
+
<div class="relative font-mono text-xs shrink-0" x-data="{ open: false, showCustom: false }" @click.outside="open = false; showCustom = false">
|
|
26
26
|
<button @click="open = !open" class="flex items-center gap-1 px-2 py-0.5 text-gray-900 dark:text-gray-100 hover:text-gray-600 dark:hover:text-gray-300">
|
|
27
27
|
<% if @selected_range == "custom" && @custom_from && @custom_to %>
|
|
28
28
|
<%= @custom_from.strftime("%b %-d") %> – <%= @custom_to.strftime("%b %-d") %>
|
data/app/views/ruby_llm/agents/system_config/show.html.erb
CHANGED
|
@@ -6,8 +6,6 @@
|
|
|
6
6
|
budgets = @config.budgets || {}
|
|
7
7
|
budgets_enabled = @config.budgets_enabled?
|
|
8
8
|
alerts_enabled = @config.on_alert.respond_to?(:call)
|
|
9
|
-
redaction = @config.redaction || {}
|
|
10
|
-
redaction_enabled = redaction.present?
|
|
11
9
|
%>
|
|
12
10
|
|
|
13
11
|
<!-- ── system config ──────────────── -->
|
|
@@ -242,17 +240,6 @@
|
|
|
242
240
|
<span class="w-36 flex-shrink-0 text-gray-500 dark:text-gray-400">persist responses</span>
|
|
243
241
|
<span class="badge badge-sm <%= @config.persist_responses ? 'badge-success' : 'badge-timeout' %>"><%= @config.persist_responses ? 'on' : 'off' %></span>
|
|
244
242
|
</div>
|
|
245
|
-
<div class="flex items-center gap-3 py-0.5">
|
|
246
|
-
<span class="w-36 flex-shrink-0 text-gray-500 dark:text-gray-400">pii redaction</span>
|
|
247
|
-
<% if redaction_enabled %>
|
|
248
|
-
<span class="badge badge-sm badge-success">on</span>
|
|
249
|
-
<span class="text-gray-400 dark:text-gray-600"><%= @config.redaction_fields.count %> fields</span>
|
|
250
|
-
<span class="text-gray-300 dark:text-gray-700">·</span>
|
|
251
|
-
<span class="text-gray-400 dark:text-gray-600"><%= @config.redaction_patterns.count %> patterns</span>
|
|
252
|
-
<% else %>
|
|
253
|
-
<span class="text-gray-300 dark:text-gray-700">—</span>
|
|
254
|
-
<% end %>
|
|
255
|
-
</div>
|
|
256
243
|
</div>
|
|
257
244
|
|
|
258
245
|
<!-- Footer note -->
|
data/lib/generators/ruby_llm_agents/templates/initializer.rb.tt
CHANGED
|
@@ -179,19 +179,4 @@ RubyLLM::Agents.configure do |config|
|
|
|
179
179
|
|
|
180
180
|
# Whether to persist LLM responses in execution records
|
|
181
181
|
# config.persist_responses = true
|
|
182
|
-
|
|
183
|
-
# Redaction configuration for PII and sensitive data
|
|
184
|
-
# - fields: Parameter names to redact (extends defaults: password, token, api_key, secret, etc.)
|
|
185
|
-
# - patterns: Regex patterns to match and redact in string values
|
|
186
|
-
# - placeholder: String to replace redacted values with
|
|
187
|
-
# - max_value_length: Truncate values longer than this (nil = no limit)
|
|
188
|
-
# config.redaction = {
|
|
189
|
-
# fields: %w[ssn credit_card phone_number email],
|
|
190
|
-
# patterns: [
|
|
191
|
-
# /\b\d{3}-\d{2}-\d{4}\b/, # SSN
|
|
192
|
-
# /\b\d{4}[- ]?\d{4}[- ]?\d{4}[- ]?\d{4}\b/ # Credit card
|
|
193
|
-
# ],
|
|
194
|
-
# placeholder: "[REDACTED]",
|
|
195
|
-
# max_value_length: 5000
|
|
196
|
-
# }
|
|
197
182
|
end
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Agents
|
|
5
|
+
module Eval
|
|
6
|
+
# Holds the result of evaluating a single test case.
|
|
7
|
+
#
|
|
8
|
+
# Contains the test case definition, the agent's result, the score,
|
|
9
|
+
# and any error that occurred during execution.
|
|
10
|
+
class EvalResult
|
|
11
|
+
attr_reader :test_case, :agent_result, :score, :execution_id, :error
|
|
12
|
+
|
|
13
|
+
def initialize(test_case:, agent_result:, score:, execution_id: nil, error: nil)
|
|
14
|
+
@test_case = test_case
|
|
15
|
+
@agent_result = agent_result
|
|
16
|
+
@score = score
|
|
17
|
+
@execution_id = execution_id
|
|
18
|
+
@error = error
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
def test_case_name
|
|
22
|
+
test_case.name
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
def input
|
|
26
|
+
test_case.input
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
def expected
|
|
30
|
+
test_case.expected
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
def passed?(threshold = 0.5)
|
|
34
|
+
score.passed?(threshold)
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
def failed?(threshold = 0.5)
|
|
38
|
+
score.failed?(threshold)
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
def errored?
|
|
42
|
+
!error.nil?
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
def actual
|
|
46
|
+
return nil unless agent_result
|
|
47
|
+
|
|
48
|
+
if agent_result.respond_to?(:route)
|
|
49
|
+
{route: agent_result.route}
|
|
50
|
+
elsif agent_result.respond_to?(:content)
|
|
51
|
+
agent_result.content
|
|
52
|
+
else
|
|
53
|
+
agent_result
|
|
54
|
+
end
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
def to_h
|
|
58
|
+
{
|
|
59
|
+
name: test_case_name,
|
|
60
|
+
score: score.value,
|
|
61
|
+
reason: score.reason,
|
|
62
|
+
passed: passed?,
|
|
63
|
+
input: input,
|
|
64
|
+
expected: expected,
|
|
65
|
+
actual: actual,
|
|
66
|
+
execution_id: execution_id,
|
|
67
|
+
error: error&.message
|
|
68
|
+
}
|
|
69
|
+
end
|
|
70
|
+
end
|
|
71
|
+
end
|
|
72
|
+
end
|
|
73
|
+
end
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Agents
|
|
5
|
+
module Eval
|
|
6
|
+
# Aggregate results from running an eval suite.
|
|
7
|
+
#
|
|
8
|
+
# Provides score calculation, pass/fail counts, failure details,
|
|
9
|
+
# and a formatted summary string.
|
|
10
|
+
class EvalRun
|
|
11
|
+
attr_reader :suite, :results, :model, :pass_threshold,
|
|
12
|
+
:started_at, :completed_at
|
|
13
|
+
|
|
14
|
+
def initialize(suite:, results:, model:, pass_threshold:, started_at:, completed_at:)
|
|
15
|
+
@suite = suite
|
|
16
|
+
@results = results
|
|
17
|
+
@model = model
|
|
18
|
+
@pass_threshold = pass_threshold
|
|
19
|
+
@started_at = started_at
|
|
20
|
+
@completed_at = completed_at
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
def agent_class
|
|
24
|
+
suite.respond_to?(:agent_class) ? suite.agent_class : suite
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
# Average score across all test cases (0.0 to 1.0)
|
|
28
|
+
def score
|
|
29
|
+
return 0.0 if results.empty?
|
|
30
|
+
|
|
31
|
+
results.sum { |r| r.score.value } / results.size.to_f
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
def score_pct
|
|
35
|
+
(score * 100).round(1)
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
def total_cases
|
|
39
|
+
results.size
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
def passed
|
|
43
|
+
results.count { |r| r.passed?(pass_threshold) }
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
def failed
|
|
47
|
+
results.count { |r| r.failed?(pass_threshold) }
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
def failures
|
|
51
|
+
results.select { |r| r.failed?(pass_threshold) }
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
def errors
|
|
55
|
+
results.select(&:errored?)
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
def total_cost
|
|
59
|
+
results.sum do |r|
|
|
60
|
+
next 0 unless r.execution_id
|
|
61
|
+
|
|
62
|
+
if defined?(Execution)
|
|
63
|
+
Execution.find_by(id: r.execution_id)&.total_cost || 0
|
|
64
|
+
else
|
|
65
|
+
0
|
|
66
|
+
end
|
|
67
|
+
end
|
|
68
|
+
rescue
|
|
69
|
+
0
|
|
70
|
+
end
|
|
71
|
+
|
|
72
|
+
def duration_ms
|
|
73
|
+
return 0 unless started_at && completed_at
|
|
74
|
+
|
|
75
|
+
((completed_at - started_at) * 1000).to_i
|
|
76
|
+
end
|
|
77
|
+
|
|
78
|
+
def summary
|
|
79
|
+
agent_name = agent_class.respond_to?(:name) ? agent_class.name : agent_class.to_s
|
|
80
|
+
lines = ["#{agent_name} Eval — #{started_at.strftime("%Y-%m-%d %H:%M")}"]
|
|
81
|
+
lines << "Model: #{model} | Score: #{score_pct}% | #{passed}/#{total_cases} passed"
|
|
82
|
+
lines << "Cost: $#{"%.4f" % total_cost} | Duration: #{(duration_ms / 1000.0).round(1)}s"
|
|
83
|
+
|
|
84
|
+
if failures.any?
|
|
85
|
+
lines << ""
|
|
86
|
+
lines << "Failures:"
|
|
87
|
+
failures.each do |r|
|
|
88
|
+
lines << " - #{r.test_case_name}: expected #{r.expected.inspect}, got #{r.actual.inspect} (#{r.score.reason})"
|
|
89
|
+
end
|
|
90
|
+
end
|
|
91
|
+
|
|
92
|
+
if errors.any?
|
|
93
|
+
lines << ""
|
|
94
|
+
lines << "Errors:"
|
|
95
|
+
errors.each do |r|
|
|
96
|
+
lines << " - #{r.test_case_name}: #{r.error.message}"
|
|
97
|
+
end
|
|
98
|
+
end
|
|
99
|
+
|
|
100
|
+
lines.join("\n")
|
|
101
|
+
end
|
|
102
|
+
|
|
103
|
+
def to_h
|
|
104
|
+
{
|
|
105
|
+
agent: agent_class.respond_to?(:name) ? agent_class.name : agent_class.to_s,
|
|
106
|
+
model: model,
|
|
107
|
+
score: score,
|
|
108
|
+
score_pct: score_pct,
|
|
109
|
+
total_cases: total_cases,
|
|
110
|
+
passed: passed,
|
|
111
|
+
failed: failed,
|
|
112
|
+
total_cost: total_cost,
|
|
113
|
+
duration_ms: duration_ms,
|
|
114
|
+
results: results.map(&:to_h)
|
|
115
|
+
}
|
|
116
|
+
end
|
|
117
|
+
|
|
118
|
+
def to_json(*args)
|
|
119
|
+
to_h.to_json(*args)
|
|
120
|
+
end
|
|
121
|
+
end
|
|
122
|
+
end
|
|
123
|
+
end
|
|
124
|
+
end
|
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Agents
|
|
5
|
+
module Eval
|
|
6
|
+
# Score value object — returned by every scorer
|
|
7
|
+
Score = Struct.new(:value, :reason, keyword_init: true) do
|
|
8
|
+
def initialize(value:, reason: nil)
|
|
9
|
+
super(value: value.to_f.clamp(0.0, 1.0), reason: reason)
|
|
10
|
+
end
|
|
11
|
+
|
|
12
|
+
def passed?(threshold = 0.5)
|
|
13
|
+
value >= threshold
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
def failed?(threshold = 0.5)
|
|
17
|
+
!passed?(threshold)
|
|
18
|
+
end
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
# A single test case definition
|
|
22
|
+
TestCase = Struct.new(:name, :input, :expected, :scorer, :options, keyword_init: true) do
|
|
23
|
+
def resolve_input
|
|
24
|
+
input.is_a?(Proc) ? input.call : input
|
|
25
|
+
end
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
# Defines test cases for an agent, runs them, scores results.
|
|
29
|
+
#
|
|
30
|
+
# @example
|
|
31
|
+
# class SupportRouter::Eval < RubyLLM::Agents::Eval::EvalSuite
|
|
32
|
+
# agent SupportRouter
|
|
33
|
+
# test_case "billing", input: { message: "charged twice" }, expected: { route: :billing }
|
|
34
|
+
# end
|
|
35
|
+
#
|
|
36
|
+
# run = SupportRouter::Eval.run!
|
|
37
|
+
# puts run.summary
|
|
38
|
+
class EvalSuite
|
|
39
|
+
class << self
|
|
40
|
+
attr_reader :agent_class, :test_cases, :eval_options
|
|
41
|
+
|
|
42
|
+
def inherited(subclass)
|
|
43
|
+
super
|
|
44
|
+
subclass.instance_variable_set(:@test_cases, [])
|
|
45
|
+
subclass.instance_variable_set(:@eval_options, {})
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
# --- DSL ---
|
|
49
|
+
|
|
50
|
+
def agent(klass)
|
|
51
|
+
@agent_class = klass
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
def test_case(name, input:, expected: nil, score: nil, **options)
|
|
55
|
+
@test_cases << TestCase.new(
|
|
56
|
+
name: name,
|
|
57
|
+
input: input,
|
|
58
|
+
expected: expected,
|
|
59
|
+
scorer: score,
|
|
60
|
+
options: options
|
|
61
|
+
)
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
def dataset(path)
|
|
65
|
+
full_path = path.start_with?("/") ? path : Rails.root.join(path).to_s
|
|
66
|
+
cases = YAML.safe_load_file(full_path, permitted_classes: [Symbol], symbolize_names: true)
|
|
67
|
+
cases.each do |tc|
|
|
68
|
+
test_case(
|
|
69
|
+
tc[:name],
|
|
70
|
+
input: tc[:input],
|
|
71
|
+
expected: tc[:expected],
|
|
72
|
+
score: tc[:score]&.to_sym,
|
|
73
|
+
**tc.except(:name, :input, :expected, :score)
|
|
74
|
+
)
|
|
75
|
+
end
|
|
76
|
+
end
|
|
77
|
+
|
|
78
|
+
def eval_model(value)
|
|
79
|
+
@eval_options[:model] = value
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
def eval_temperature(value)
|
|
83
|
+
@eval_options[:temperature] = value
|
|
84
|
+
end
|
|
85
|
+
|
|
86
|
+
# --- Running ---
|
|
87
|
+
|
|
88
|
+
def run!(model: nil, only: nil, pass_threshold: 0.5, overrides: {}, **options)
|
|
89
|
+
validate!
|
|
90
|
+
cases = only ? @test_cases.select { |tc| Array(only).include?(tc.name) } : @test_cases
|
|
91
|
+
resolved_model = model || @eval_options[:model]
|
|
92
|
+
temperature = @eval_options[:temperature]
|
|
93
|
+
started_at = Time.current
|
|
94
|
+
|
|
95
|
+
results = cases.map do |tc|
|
|
96
|
+
run_single(tc, model: resolved_model, temperature: temperature, overrides: overrides)
|
|
97
|
+
end
|
|
98
|
+
|
|
99
|
+
EvalRun.new(
|
|
100
|
+
suite: self,
|
|
101
|
+
results: results,
|
|
102
|
+
model: resolved_model || (agent_class.respond_to?(:model) ? agent_class.model : nil),
|
|
103
|
+
pass_threshold: pass_threshold,
|
|
104
|
+
started_at: started_at,
|
|
105
|
+
completed_at: Time.current
|
|
106
|
+
)
|
|
107
|
+
end
|
|
108
|
+
|
|
109
|
+
def validate!
|
|
110
|
+
raise ConfigurationError, "No agent class set" unless @agent_class
|
|
111
|
+
raise ConfigurationError, "No test cases defined" if @test_cases.empty?
|
|
112
|
+
|
|
113
|
+
@test_cases.each do |tc|
|
|
114
|
+
next if tc.input.is_a?(Proc)
|
|
115
|
+
next unless @agent_class.respond_to?(:params)
|
|
116
|
+
|
|
117
|
+
agent_params = @agent_class.params
|
|
118
|
+
required = agent_params.select { |_, v| v[:required] }.keys
|
|
119
|
+
missing = required - tc.input.keys
|
|
120
|
+
if missing.any?
|
|
121
|
+
raise ConfigurationError,
|
|
122
|
+
"Test case '#{tc.name}' missing required params: #{missing.join(", ")}"
|
|
123
|
+
end
|
|
124
|
+
end
|
|
125
|
+
true
|
|
126
|
+
end
|
|
127
|
+
|
|
128
|
+
def for(agent_klass, &block)
|
|
129
|
+
suite = Class.new(self)
|
|
130
|
+
suite.agent(agent_klass)
|
|
131
|
+
suite.instance_eval(&block) if block
|
|
132
|
+
suite
|
|
133
|
+
end
|
|
134
|
+
|
|
135
|
+
private
|
|
136
|
+
|
|
137
|
+
def run_single(tc, model:, temperature:, overrides:)
|
|
138
|
+
input = tc.resolve_input
|
|
139
|
+
call_options = input.dup
|
|
140
|
+
call_options.merge!(overrides) if overrides.any?
|
|
141
|
+
call_options[:model] = model if model
|
|
142
|
+
call_options[:temperature] = temperature if temperature
|
|
143
|
+
|
|
144
|
+
agent_result = agent_class.call(**call_options)
|
|
145
|
+
score = evaluate(tc, agent_result)
|
|
146
|
+
|
|
147
|
+
EvalResult.new(
|
|
148
|
+
test_case: tc,
|
|
149
|
+
agent_result: agent_result,
|
|
150
|
+
score: score,
|
|
151
|
+
execution_id: agent_result.respond_to?(:execution_id) ? agent_result.execution_id : nil
|
|
152
|
+
)
|
|
153
|
+
rescue ArgumentError
|
|
154
|
+
raise
|
|
155
|
+
rescue => e
|
|
156
|
+
EvalResult.new(
|
|
157
|
+
test_case: tc,
|
|
158
|
+
agent_result: nil,
|
|
159
|
+
score: Score.new(value: 0.0, reason: "Error: #{e.class}: #{e.message}"),
|
|
160
|
+
error: e
|
|
161
|
+
)
|
|
162
|
+
end
|
|
163
|
+
|
|
164
|
+
def evaluate(tc, agent_result)
|
|
165
|
+
case tc.scorer
|
|
166
|
+
when Proc
|
|
167
|
+
coerce_score(tc.scorer.call(agent_result, tc.expected))
|
|
168
|
+
when :contains
|
|
169
|
+
score_contains(agent_result, tc.expected)
|
|
170
|
+
when :llm_judge
|
|
171
|
+
score_llm_judge(agent_result, tc)
|
|
172
|
+
when :exact_match, nil
|
|
173
|
+
score_exact_match(agent_result, tc.expected)
|
|
174
|
+
else
|
|
175
|
+
raise ArgumentError, "Unknown scorer: #{tc.scorer}"
|
|
176
|
+
end
|
|
177
|
+
end
|
|
178
|
+
|
|
179
|
+
def coerce_score(value)
|
|
180
|
+
case value
|
|
181
|
+
when Score then value
|
|
182
|
+
when Numeric then Score.new(value: value)
|
|
183
|
+
when true then Score.new(value: 1.0)
|
|
184
|
+
when false then Score.new(value: 0.0)
|
|
185
|
+
else Score.new(value: 0.0, reason: "Scorer returned #{value.class}")
|
|
186
|
+
end
|
|
187
|
+
end
|
|
188
|
+
|
|
189
|
+
# --- Built-in scorers ---
|
|
190
|
+
|
|
191
|
+
def score_exact_match(result, expected)
|
|
192
|
+
actual = extract_comparable(result)
|
|
193
|
+
expected_val = normalize_expected(expected)
|
|
194
|
+
|
|
195
|
+
if actual == expected_val
|
|
196
|
+
Score.new(value: 1.0)
|
|
197
|
+
else
|
|
198
|
+
Score.new(value: 0.0, reason: "Expected #{expected_val.inspect}, got #{actual.inspect}")
|
|
199
|
+
end
|
|
200
|
+
end
|
|
201
|
+
|
|
202
|
+
def score_contains(result, expected)
|
|
203
|
+
content = result.respond_to?(:content) ? result.content.to_s : result.to_s
|
|
204
|
+
targets = Array(expected)
|
|
205
|
+
missing = targets.reject { |e| content.downcase.include?(e.to_s.downcase) }
|
|
206
|
+
|
|
207
|
+
if missing.empty?
|
|
208
|
+
Score.new(value: 1.0)
|
|
209
|
+
else
|
|
210
|
+
Score.new(value: 0.0, reason: "Missing: #{missing.join(", ")}")
|
|
211
|
+
end
|
|
212
|
+
end
|
|
213
|
+
|
|
214
|
+
def score_llm_judge(result, tc)
|
|
215
|
+
content = result.respond_to?(:content) ? result.content.to_s : result.to_s
|
|
216
|
+
criteria = tc.options[:criteria]
|
|
217
|
+
judge_model = tc.options[:judge_model] || "gpt-4o-mini"
|
|
218
|
+
|
|
219
|
+
prompt = <<~PROMPT
|
|
220
|
+
You are evaluating an AI agent's response. Score it from 0 to 10.
|
|
221
|
+
|
|
222
|
+
## Input
|
|
223
|
+
#{tc.input.inspect}
|
|
224
|
+
|
|
225
|
+
## Agent Response
|
|
226
|
+
#{content}
|
|
227
|
+
|
|
228
|
+
## Criteria
|
|
229
|
+
#{criteria}
|
|
230
|
+
|
|
231
|
+
Respond with ONLY a JSON object:
|
|
232
|
+
{"score": <0-10>, "reason": "<brief explanation>"}
|
|
233
|
+
PROMPT
|
|
234
|
+
|
|
235
|
+
chat = RubyLLM.chat(model: judge_model)
|
|
236
|
+
parsed = JSON.parse(chat.ask(prompt).content)
|
|
237
|
+
Score.new(value: parsed["score"].to_f / 10.0, reason: parsed["reason"])
|
|
238
|
+
rescue => e
|
|
239
|
+
Score.new(value: 0.0, reason: "Judge error: #{e.message}")
|
|
240
|
+
end
|
|
241
|
+
|
|
242
|
+
def extract_comparable(result)
|
|
243
|
+
if result.respond_to?(:route)
|
|
244
|
+
{route: result.route}
|
|
245
|
+
elsif result.respond_to?(:content)
|
|
246
|
+
content = result.content
|
|
247
|
+
content.is_a?(Hash) ? content.transform_keys(&:to_sym) : content.to_s.strip
|
|
248
|
+
else
|
|
249
|
+
result
|
|
250
|
+
end
|
|
251
|
+
end
|
|
252
|
+
|
|
253
|
+
def normalize_expected(expected)
|
|
254
|
+
case expected
|
|
255
|
+
when Hash then expected.transform_keys(&:to_sym)
|
|
256
|
+
when String then expected.strip
|
|
257
|
+
else expected
|
|
258
|
+
end
|
|
259
|
+
end
|
|
260
|
+
end
|
|
261
|
+
end
|
|
262
|
+
end
|
|
263
|
+
end
|
|
264
|
+
end
|
data/lib/ruby_llm/agents.rb
CHANGED
|
@@ -75,6 +75,9 @@ require_relative "agents/image/analyzer"
|
|
|
75
75
|
require_relative "agents/image/background_remover"
|
|
76
76
|
require_relative "agents/image/pipeline"
|
|
77
77
|
|
|
78
|
+
# Evaluation framework
|
|
79
|
+
require_relative "agents/eval"
|
|
80
|
+
|
|
78
81
|
# Rails integration
|
|
79
82
|
if defined?(Rails)
|
|
80
83
|
require_relative "agents/core/inflections"
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: ruby_llm-agents
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 3.6.0
|
|
4
|
+
version: 3.7.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- adham90
|
|
@@ -237,6 +237,10 @@ files:
|
|
|
237
237
|
- lib/ruby_llm/agents/dsl/caching.rb
|
|
238
238
|
- lib/ruby_llm/agents/dsl/queryable.rb
|
|
239
239
|
- lib/ruby_llm/agents/dsl/reliability.rb
|
|
240
|
+
- lib/ruby_llm/agents/eval.rb
|
|
241
|
+
- lib/ruby_llm/agents/eval/eval_result.rb
|
|
242
|
+
- lib/ruby_llm/agents/eval/eval_run.rb
|
|
243
|
+
- lib/ruby_llm/agents/eval/eval_suite.rb
|
|
240
244
|
- lib/ruby_llm/agents/image/analyzer.rb
|
|
241
245
|
- lib/ruby_llm/agents/image/analyzer/dsl.rb
|
|
242
246
|
- lib/ruby_llm/agents/image/analyzer/execution.rb
|