ruby_llm-agents 3.5.5 → 3.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +21 -0
- data/app/controllers/ruby_llm/agents/dashboard_controller.rb +155 -10
- data/app/controllers/ruby_llm/agents/executions_controller.rb +1 -3
- data/app/helpers/ruby_llm/agents/application_helper.rb +15 -28
- data/app/models/ruby_llm/agents/execution/replayable.rb +124 -0
- data/app/models/ruby_llm/agents/execution/scopes.rb +42 -1
- data/app/models/ruby_llm/agents/execution.rb +50 -1
- data/app/models/ruby_llm/agents/tenant/budgetable.rb +28 -4
- data/app/views/layouts/ruby_llm/agents/application.html.erb +41 -28
- data/app/views/ruby_llm/agents/agents/show.html.erb +16 -1
- data/app/views/ruby_llm/agents/dashboard/_top_tenants.html.erb +47 -0
- data/app/views/ruby_llm/agents/dashboard/index.html.erb +404 -107
- data/app/views/ruby_llm/agents/system_config/show.html.erb +0 -13
- data/lib/generators/ruby_llm_agents/rename_agent_generator.rb +53 -0
- data/lib/generators/ruby_llm_agents/templates/initializer.rb.tt +0 -15
- data/lib/generators/ruby_llm_agents/templates/rename_agent_migration.rb.tt +19 -0
- data/lib/ruby_llm/agents/agent_tool.rb +125 -0
- data/lib/ruby_llm/agents/audio/speaker.rb +5 -3
- data/lib/ruby_llm/agents/audio/speech_pricing.rb +63 -187
- data/lib/ruby_llm/agents/audio/transcriber.rb +5 -3
- data/lib/ruby_llm/agents/audio/transcription_pricing.rb +5 -7
- data/lib/ruby_llm/agents/base_agent.rb +144 -5
- data/lib/ruby_llm/agents/core/configuration.rb +178 -53
- data/lib/ruby_llm/agents/core/errors.rb +3 -77
- data/lib/ruby_llm/agents/core/instrumentation.rb +0 -17
- data/lib/ruby_llm/agents/core/version.rb +1 -1
- data/lib/ruby_llm/agents/dsl/base.rb +0 -8
- data/lib/ruby_llm/agents/dsl/queryable.rb +124 -0
- data/lib/ruby_llm/agents/dsl.rb +1 -0
- data/lib/ruby_llm/agents/eval/eval_result.rb +73 -0
- data/lib/ruby_llm/agents/eval/eval_run.rb +124 -0
- data/lib/ruby_llm/agents/eval/eval_suite.rb +264 -0
- data/lib/ruby_llm/agents/eval.rb +5 -0
- data/lib/ruby_llm/agents/image/concerns/image_operation_execution.rb +2 -1
- data/lib/ruby_llm/agents/image/generator/pricing.rb +75 -217
- data/lib/ruby_llm/agents/image/generator.rb +5 -3
- data/lib/ruby_llm/agents/infrastructure/attempt_tracker.rb +8 -0
- data/lib/ruby_llm/agents/infrastructure/circuit_breaker.rb +4 -2
- data/lib/ruby_llm/agents/pipeline/builder.rb +43 -0
- data/lib/ruby_llm/agents/pipeline/context.rb +11 -1
- data/lib/ruby_llm/agents/pipeline/executor.rb +1 -25
- data/lib/ruby_llm/agents/pipeline/middleware/budget.rb +26 -1
- data/lib/ruby_llm/agents/pipeline/middleware/cache.rb +18 -0
- data/lib/ruby_llm/agents/pipeline/middleware/instrumentation.rb +90 -0
- data/lib/ruby_llm/agents/pipeline/middleware/reliability.rb +29 -0
- data/lib/ruby_llm/agents/pipeline/middleware/tenant.rb +11 -4
- data/lib/ruby_llm/agents/pipeline.rb +0 -92
- data/lib/ruby_llm/agents/results/background_removal_result.rb +11 -1
- data/lib/ruby_llm/agents/results/base.rb +23 -1
- data/lib/ruby_llm/agents/results/embedding_result.rb +14 -1
- data/lib/ruby_llm/agents/results/image_analysis_result.rb +11 -1
- data/lib/ruby_llm/agents/results/image_edit_result.rb +11 -1
- data/lib/ruby_llm/agents/results/image_generation_result.rb +12 -3
- data/lib/ruby_llm/agents/results/image_pipeline_result.rb +11 -1
- data/lib/ruby_llm/agents/results/image_transform_result.rb +11 -1
- data/lib/ruby_llm/agents/results/image_upscale_result.rb +11 -1
- data/lib/ruby_llm/agents/results/image_variation_result.rb +11 -1
- data/lib/ruby_llm/agents/results/speech_result.rb +20 -1
- data/lib/ruby_llm/agents/results/transcription_result.rb +20 -1
- data/lib/ruby_llm/agents/text/embedder.rb +23 -18
- data/lib/ruby_llm/agents.rb +73 -5
- data/lib/tasks/ruby_llm_agents.rake +21 -0
- metadata +11 -6
- data/lib/ruby_llm/agents/infrastructure/reliability/breaker_manager.rb +0 -80
- data/lib/ruby_llm/agents/infrastructure/reliability/execution_constraints.rb +0 -69
- data/lib/ruby_llm/agents/infrastructure/reliability/executor.rb +0 -125
- data/lib/ruby_llm/agents/infrastructure/reliability/fallback_routing.rb +0 -72
- data/lib/ruby_llm/agents/infrastructure/reliability/retry_strategy.rb +0 -82
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Agents
|
|
5
|
+
module DSL
|
|
6
|
+
# Adds execution querying capabilities to agent classes.
#
# Mixed into BaseAgent via `extend DSL::Queryable`, making all methods
# available as class methods on agent classes.
#
# @example Basic queries
#   SupportAgent.executions.successful.recent
#   SupportAgent.executions.today.expensive(0.50)
#
# @example Convenience methods
#   SupportAgent.last_run
#   SupportAgent.stats
#   SupportAgent.total_spent(since: 1.week)
#
module Queryable
  # Returns an ActiveRecord::Relation scoped to this agent's executions.
  # The lookup key is the extending class's `name`.
  #
  # @return [ActiveRecord::Relation]
  #
  # @example
  #   SupportAgent.executions.successful.last(5)
  #   SupportAgent.executions.where("total_cost > ?", 0.01)
  #
  def executions
    RubyLLM::Agents::Execution.by_agent(name)
  end

  # Returns the most recent execution for this agent (newest `created_at`).
  #
  # @return [RubyLLM::Agents::Execution, nil]
  #
  def last_run
    executions.order(created_at: :desc).first
  end

  # Returns recent failed executions.
  #
  # @param since [ActiveSupport::Duration] Time window (default: 24.hours)
  # @return [ActiveRecord::Relation]
  #
  def failures(since: 24.hours)
    executions.failed.where("created_at > ?", since.ago)
  end

  # Returns total cost spent by this agent.
  #
  # @param since [ActiveSupport::Duration, nil] Optional time window
  # @return [BigDecimal] Total cost in USD
  #
  def total_spent(since: nil)
    scope = executions
    scope = scope.where("created_at > ?", since.ago) if since
    scope.sum(:total_cost)
  end

  # Returns a stats summary hash for this agent.
  #
  # Keys: :total, :successful, :failed, :success_rate (percentage, one
  # decimal), :avg_duration_ms, :avg_cost, :total_cost, :total_tokens and
  # :avg_tokens. The two averages taken from the database are nil when
  # the database returns no average.
  #
  # @param since [ActiveSupport::Duration, nil] Time window
  # @return [Hash] Stats summary
  #
  def stats(since: nil)
    scope = executions
    scope = scope.where("created_at > ?", since.ago) if since

    total = scope.count
    successful = scope.successful.count
    # Summed once and reused for both :total_cost and :avg_cost so the
    # aggregate query is not issued twice.
    total_cost = scope.sum(:total_cost)

    {
      total: total,
      successful: successful,
      failed: scope.failed.count,
      success_rate: total.zero? ? 0.0 : (successful.to_f / total * 100).round(1),
      avg_duration_ms: scope.average(:duration_ms)&.round,
      avg_cost: total.zero? ? 0 : (total_cost.to_f / total).round(6),
      total_cost: total_cost,
      total_tokens: scope.sum(:total_tokens),
      avg_tokens: scope.average(:total_tokens)&.round
    }
  end

  # Returns cost breakdown by model for this agent.
  #
  # Each value is a hash with :count, :total_cost and :avg_cost; the two
  # cost figures are rounded to six decimals and fall back to 0 when the
  # database returns NULL.
  #
  # @param since [ActiveSupport::Duration, nil] Time window
  # @return [Hash{String => Hash}] Costs per model
  #
  def cost_by_model(since: nil)
    scope = executions
    scope = scope.where("created_at > ?", since.ago) if since

    rows = scope.group(:model_id).pluck(
      :model_id,
      Arel.sql("COUNT(*)"),
      Arel.sql("SUM(total_cost)"),
      Arel.sql("AVG(total_cost)")
    )

    rows.each_with_object({}) do |(model, count, total, avg), hash|
      hash[model] = {
        count: count,
        total_cost: total&.to_f&.round(6) || 0,
        avg_cost: avg&.to_f&.round(6) || 0
      }
    end
  end

  # Returns executions matching specific parameter values.
  #
  # Every key/value pair narrows the relation through `with_parameter`,
  # so all pairs must match.
  #
  # @param params [Hash] Parameter key-value pairs to match
  # @return [ActiveRecord::Relation]
  #
  def with_params(**params)
    params.reduce(executions) do |scope, (key, value)|
      scope.with_parameter(key, value)
    end
  end
end
|
|
122
|
+
end
|
|
123
|
+
end
|
|
124
|
+
end
|
data/lib/ruby_llm/agents/dsl.rb
CHANGED
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Agents
|
|
5
|
+
module Eval
|
|
6
|
+
# Outcome of evaluating a single test case.
#
# Bundles the test case definition, the agent's result, the score,
# and any error that occurred during execution.
class EvalResult
  attr_reader :test_case, :agent_result, :score, :execution_id, :error

  # @param test_case [Object] definition responding to name/input/expected
  # @param agent_result [Object, nil] whatever the agent returned
  # @param score [Object] score responding to value/reason/passed?/failed?
  # @param execution_id [Object, nil] identifier of the recorded execution
  # @param error [Exception, nil] error captured while running the case
  def initialize(test_case:, agent_result:, score:, execution_id: nil, error: nil)
    @test_case, @agent_result, @score = test_case, agent_result, score
    @execution_id = execution_id
    @error = error
  end

  # Name of the underlying test case.
  def test_case_name
    test_case.name
  end

  # Input declared on the test case.
  def input
    test_case.input
  end

  # Expected value declared on the test case.
  def expected
    test_case.expected
  end

  # Delegates to the score's `passed?` with the given threshold.
  def passed?(threshold = 0.5)
    score.passed?(threshold)
  end

  # Delegates to the score's `failed?` with the given threshold.
  def failed?(threshold = 0.5)
    score.failed?(threshold)
  end

  # True when an error was recorded for this case.
  def errored?
    error.nil? ? false : true
  end

  # The comparable value taken from the agent result:
  # `{route: ...}` when the result exposes `route`, its `content` when it
  # exposes that, otherwise the result itself. Nil without a result.
  def actual
    outcome = agent_result
    return nil unless outcome
    return {route: outcome.route} if outcome.respond_to?(:route)
    return outcome.content if outcome.respond_to?(:content)

    outcome
  end

  # Plain-hash representation; `passed` uses the default threshold and
  # `error` carries only the error message.
  def to_h
    summary = {}
    summary[:name] = test_case_name
    summary[:score] = score.value
    summary[:reason] = score.reason
    summary[:passed] = passed?
    summary[:input] = input
    summary[:expected] = expected
    summary[:actual] = actual
    summary[:execution_id] = execution_id
    summary[:error] = error.nil? ? nil : error.message
    summary
  end
end
|
|
71
|
+
end
|
|
72
|
+
end
|
|
73
|
+
end
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Agents
|
|
5
|
+
module Eval
|
|
6
|
+
# Aggregate results from running an eval suite.
#
# Provides score calculation, pass/fail counts, failure details,
# and a formatted summary string.
class EvalRun
  attr_reader :suite, :results, :model, :pass_threshold,
    :started_at, :completed_at

  # @param suite [Object] the suite that produced the run
  # @param results [Array] per-test-case results
  # @param model [Object, nil] model identifier shown in the summary
  # @param pass_threshold [Numeric] threshold handed to each result's
  #   `passed?` / `failed?`
  # @param started_at [Time, nil] when the run began
  # @param completed_at [Time, nil] when the run finished
  def initialize(suite:, results:, model:, pass_threshold:, started_at:, completed_at:)
    @suite = suite
    @results = results
    @model = model
    @pass_threshold = pass_threshold
    @started_at = started_at
    @completed_at = completed_at
  end

  # The suite's `agent_class` when it exposes one, otherwise the suite itself.
  def agent_class
    suite.respond_to?(:agent_class) ? suite.agent_class : suite
  end

  # Average score across all test cases (0.0 to 1.0); 0.0 for an empty run.
  def score
    return 0.0 if results.empty?

    results.sum { |r| r.score.value } / results.size.to_f
  end

  # Average score as a percentage rounded to one decimal.
  def score_pct
    (score * 100).round(1)
  end

  # Number of results in the run.
  def total_cases
    results.size
  end

  # Number of results that pass at `pass_threshold`.
  def passed
    results.count { |r| r.passed?(pass_threshold) }
  end

  # Number of results that fail at `pass_threshold`.
  def failed
    results.count { |r| r.failed?(pass_threshold) }
  end

  # Results that fail at `pass_threshold`.
  def failures
    results.select { |r| r.failed?(pass_threshold) }
  end

  # Results that recorded an error.
  def errors
    results.select(&:errored?)
  end

  # Summed `total_cost` of the executions referenced by the results.
  #
  # All execution ids are resolved with a single aggregate query instead
  # of one lookup per result. Best effort: returns 0 when no result has
  # an execution id, when `Execution` is not defined, or when the lookup
  # raises.
  def total_cost
    ids = results.filter_map(&:execution_id)
    return 0 if ids.empty?
    return 0 unless defined?(Execution)

    Execution.where(id: ids).sum(:total_cost)
  rescue
    0
  end

  # Elapsed time in whole milliseconds; 0 when either timestamp is missing.
  def duration_ms
    return 0 unless started_at && completed_at

    ((completed_at - started_at) * 1000).to_i
  end

  # Human-readable multi-line report: header, score line, cost line and,
  # when present, the failing and errored test cases.
  #
  # @return [String]
  def summary
    # duration_ms tolerates a missing start time, so the header does too.
    stamp = started_at ? started_at.strftime("%Y-%m-%d %H:%M") : "n/a"

    lines = [
      "#{agent_name} Eval — #{stamp}",
      "Model: #{model} | Score: #{score_pct}% | #{passed}/#{total_cases} passed",
      "Cost: $#{"%.4f" % total_cost} | Duration: #{(duration_ms / 1000.0).round(1)}s"
    ]

    failed_results = failures
    if failed_results.any?
      lines << ""
      lines << "Failures:"
      failed_results.each do |r|
        lines << " - #{r.test_case_name}: expected #{r.expected.inspect}, got #{r.actual.inspect} (#{r.score.reason})"
      end
    end

    errored_results = errors
    if errored_results.any?
      lines << ""
      lines << "Errors:"
      errored_results.each do |r|
        lines << " - #{r.test_case_name}: #{r.error.message}"
      end
    end

    lines.join("\n")
  end

  # Hash representation of the run.
  #
  # Each entry of :results is the result's own `to_h` with :passed
  # re-evaluated at this run's `pass_threshold`, so the per-case flags
  # agree with the :passed / :failed counts.
  #
  # @return [Hash]
  def to_h
    {
      agent: agent_name,
      model: model,
      score: score,
      score_pct: score_pct,
      total_cases: total_cases,
      passed: passed,
      failed: failed,
      total_cost: total_cost,
      duration_ms: duration_ms,
      results: results.map { |r| r.to_h.merge(passed: r.passed?(pass_threshold)) }
    }
  end

  # JSON encoding of `to_h`.
  def to_json(*args)
    to_h.to_json(*args)
  end

  private

  # Display name of the agent: its `name` when available, else `to_s`.
  def agent_name
    agent_class.respond_to?(:name) ? agent_class.name : agent_class.to_s
  end
end
|
|
122
|
+
end
|
|
123
|
+
end
|
|
124
|
+
end
|
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RubyLLM
|
|
4
|
+
module Agents
|
|
5
|
+
module Eval
|
|
6
|
+
# Score value object — returned by every scorer.
#
# `value` is converted with `to_f` and clamped into 0.0..1.0; `reason`
# is an optional explanation.
Score = Struct.new(:value, :reason, keyword_init: true) do
  # @param value [#to_f] raw score; out-of-range values are clamped
  # @param reason [String, nil] optional explanation
  def initialize(value:, reason: nil)
    numeric = value.to_f
    if numeric.nan?
      # NaN cannot be ordered, so `clamp` would raise ArgumentError.
      # Record it as the lowest score instead of raising.
      numeric = 0.0
      reason ||= "Score was NaN"
    end

    super(value: numeric.clamp(0.0, 1.0), reason: reason)
  end

  # True when the value is at or above the threshold.
  def passed?(threshold = 0.5)
    value >= threshold
  end

  # Inverse of `passed?` for the same threshold.
  def failed?(threshold = 0.5)
    !passed?(threshold)
  end
end
|
|
20
|
+
|
|
21
|
+
# Definition of a single test case: name, input, expected value,
# scorer and extra options.
TestCase = Struct.new(:name, :input, :expected, :scorer, :options, keyword_init: true) do
  # Returns the input to use for a run. A Proc input is called on each
  # invocation and its return value is used; any other input is returned
  # unchanged.
  def resolve_input
    return input.call if input.is_a?(Proc)

    input
  end
end
|
|
27
|
+
|
|
28
|
+
# Defines test cases for an agent, runs them, scores results.
#
# @example
#   class SupportRouter::Eval < RubyLLM::Agents::EvalSuite
#     agent SupportRouter
#     test_case "billing", input: { message: "charged twice" }, expected: { route: :billing }
#   end
#
#   run = SupportRouter::Eval.run!
#   puts run.summary
class EvalSuite
  class << self
    attr_reader :agent_class, :test_cases, :eval_options

    # Gives every subclass its own empty test-case list and options hash.
    def inherited(subclass)
      super
      subclass.instance_variable_set(:@test_cases, [])
      subclass.instance_variable_set(:@eval_options, {})
    end

    # --- DSL ---

    # Sets the agent class under evaluation.
    def agent(klass)
      @agent_class = klass
    end

    # Registers a test case.
    #
    # @param name [Object] identifier used by `run!(only:)` and in reports
    # @param input [Hash, Proc] keyword arguments for the agent, or a Proc
    #   returning them
    # @param expected [Object, nil] value the built-in scorers compare against
    # @param score [Proc, Symbol, nil] scorer: a Proc, :contains, :llm_judge,
    #   or :exact_match (also used when nil)
    # @param options [Hash] extra options stored on the test case
    #   (`score_llm_judge` reads :criteria and :judge_model)
    def test_case(name, input:, expected: nil, score: nil, **options)
      @test_cases << TestCase.new(
        name: name,
        input: input,
        expected: expected,
        scorer: score,
        options: options
      )
    end

    # Loads test cases from a YAML file containing a list of mappings with
    # :name, :input, :expected and optional :score keys; any remaining
    # keys become test-case options.
    #
    # @param path [String, Pathname] absolute path, or a path relative to
    #   `Rails.root`
    def dataset(path)
      # Pathname has no start_with?, so normalise to a String first.
      path = path.to_s
      full_path = path.start_with?("/") ? path : Rails.root.join(path).to_s
      # An empty file yields no document; treat it as having no cases.
      cases = YAML.safe_load_file(full_path, permitted_classes: [Symbol], symbolize_names: true) || []
      cases.each do |tc|
        test_case(
          tc[:name],
          input: tc[:input],
          expected: tc[:expected],
          score: tc[:score]&.to_sym,
          **tc.except(:name, :input, :expected, :score)
        )
      end
    end

    # Sets the model used for eval runs unless `run!` is given one.
    def eval_model(value)
      @eval_options[:model] = value
    end

    # Sets the temperature passed to the agent during eval runs.
    def eval_temperature(value)
      @eval_options[:temperature] = value
    end

    # --- Running ---

    # Runs the (optionally filtered) test cases and returns an EvalRun.
    #
    # @param model [Object, nil] overrides the `eval_model` setting
    # @param only [Object, Array, nil] name(s) of the test cases to run;
    #   names are compared as strings, so symbols and strings are
    #   interchangeable
    # @param pass_threshold [Numeric] threshold stored on the EvalRun
    # @param overrides [Hash] merged into every test case's input
    # @param options [Hash] accepted for compatibility; not used here
    # @return [EvalRun]
    # @raise [ConfigurationError] when `validate!` fails
    def run!(model: nil, only: nil, pass_threshold: 0.5, overrides: {}, **options)
      validate!

      cases = @test_cases
      if only
        wanted = Array(only).map(&:to_s)
        cases = cases.select { |tc| wanted.include?(tc.name.to_s) }
      end

      resolved_model = model || @eval_options[:model]
      temperature = @eval_options[:temperature]
      started_at = Time.current

      results = cases.map do |tc|
        run_single(tc, model: resolved_model, temperature: temperature, overrides: overrides)
      end

      EvalRun.new(
        suite: self,
        results: results,
        model: resolved_model || (agent_class.respond_to?(:model) ? agent_class.model : nil),
        pass_threshold: pass_threshold,
        started_at: started_at,
        completed_at: Time.current
      )
    end

    # Checks that an agent and at least one test case are configured and,
    # when the agent exposes `params`, that every non-Proc input supplies
    # the params marked :required.
    #
    # @return [true]
    # @raise [ConfigurationError]
    def validate!
      raise ConfigurationError, "No agent class set" unless @agent_class
      raise ConfigurationError, "No test cases defined" if @test_cases.empty?

      @test_cases.each do |tc|
        # Proc inputs are only known at run time, so they are not checked.
        next if tc.input.is_a?(Proc)
        next unless @agent_class.respond_to?(:params)

        agent_params = @agent_class.params
        required = agent_params.select { |_, v| v[:required] }.keys
        missing = required - tc.input.keys
        if missing.any?
          raise ConfigurationError,
            "Test case '#{tc.name}' missing required params: #{missing.join(", ")}"
        end
      end
      true
    end

    # Builds an anonymous suite for `agent_klass`; the block is evaluated
    # against the new suite, so DSL calls such as `test_case` apply to it.
    def for(agent_klass, &block)
      suite = Class.new(self)
      suite.agent(agent_klass)
      suite.instance_eval(&block) if block
      suite
    end

    private

    # Runs one test case and wraps the outcome in an EvalResult.
    # ArgumentError is re-raised; any other StandardError becomes a
    # zero-score result carrying the error.
    def run_single(tc, model:, temperature:, overrides:)
      input = tc.resolve_input
      call_options = input.dup
      call_options.merge!(overrides) if overrides.any?
      call_options[:model] = model if model
      call_options[:temperature] = temperature if temperature

      agent_result = agent_class.call(**call_options)
      score = evaluate(tc, agent_result, input)

      EvalResult.new(
        test_case: tc,
        agent_result: agent_result,
        score: score,
        execution_id: agent_result.respond_to?(:execution_id) ? agent_result.execution_id : nil
      )
    rescue ArgumentError
      raise
    rescue => e
      EvalResult.new(
        test_case: tc,
        agent_result: nil,
        score: Score.new(value: 0.0, reason: "Error: #{e.class}: #{e.message}"),
        error: e
      )
    end

    # Dispatches to the scorer configured on the test case.
    # `input` is the resolved input, forwarded to the LLM judge.
    #
    # @raise [ArgumentError] for an unknown scorer
    def evaluate(tc, agent_result, input = tc.input)
      case tc.scorer
      when Proc
        coerce_score(tc.scorer.call(agent_result, tc.expected))
      when :contains
        score_contains(agent_result, tc.expected)
      when :llm_judge
        score_llm_judge(agent_result, tc, input)
      when :exact_match, nil
        score_exact_match(agent_result, tc.expected)
      else
        raise ArgumentError, "Unknown scorer: #{tc.scorer}"
      end
    end

    # Converts a custom scorer's return value into a Score: a Score is
    # kept, a Numeric becomes the value, true/false map to 1.0/0.0, and
    # anything else scores 0.0 with an explanatory reason.
    def coerce_score(value)
      case value
      when Score then value
      when Numeric then Score.new(value: value)
      when true then Score.new(value: 1.0)
      when false then Score.new(value: 0.0)
      else Score.new(value: 0.0, reason: "Scorer returned #{value.class}")
      end
    end

    # --- Built-in scorers ---

    # 1.0 when the comparable form of the result equals the normalized
    # expected value, otherwise 0.0 with both values in the reason.
    def score_exact_match(result, expected)
      actual = extract_comparable(result)
      expected_val = normalize_expected(expected)

      if actual == expected_val
        Score.new(value: 1.0)
      else
        Score.new(value: 0.0, reason: "Expected #{expected_val.inspect}, got #{actual.inspect}")
      end
    end

    # 1.0 when the result text contains every expected fragment
    # (case-insensitive), otherwise 0.0 listing the missing fragments.
    def score_contains(result, expected)
      content = result.respond_to?(:content) ? result.content.to_s : result.to_s
      haystack = content.downcase
      missing = Array(expected).reject { |e| haystack.include?(e.to_s.downcase) }

      if missing.empty?
        Score.new(value: 1.0)
      else
        Score.new(value: 0.0, reason: "Missing: #{missing.join(", ")}")
      end
    end

    # Asks a judge model to grade the response from 0 to 10 against the
    # test case's :criteria option and maps the grade onto 0.0..1.0.
    # Any failure (request, parsing) yields 0.0 with a "Judge error" reason.
    #
    # @param input [Object] the resolved input shown to the judge
    def score_llm_judge(result, tc, input = tc.input)
      content = result.respond_to?(:content) ? result.content.to_s : result.to_s
      criteria = tc.options[:criteria]
      judge_model = tc.options[:judge_model] || "gpt-4o-mini"

      prompt = <<~PROMPT
        You are evaluating an AI agent's response. Score it from 0 to 10.

        ## Input
        #{input.inspect}

        ## Agent Response
        #{content}

        ## Criteria
        #{criteria}

        Respond with ONLY a JSON object:
        {"score": <0-10>, "reason": "<brief explanation>"}
      PROMPT

      chat = RubyLLM.chat(model: judge_model)
      reply = chat.ask(prompt).content.to_s
      # The judge may wrap the object in a Markdown fence or add prose;
      # parse only the outermost {...} span when one is present.
      parsed = JSON.parse(reply[/\{.*\}/m] || reply)
      Score.new(value: parsed["score"].to_f / 10.0, reason: parsed["reason"])
    rescue => e
      Score.new(value: 0.0, reason: "Judge error: #{e.message}")
    end

    # Reduces an agent result to the value used for exact matching:
    # `{route: ...}` for results exposing `route`; for results exposing
    # `content`, a symbol-keyed Hash or the stripped string form;
    # otherwise the result itself.
    def extract_comparable(result)
      if result.respond_to?(:route)
        {route: result.route}
      elsif result.respond_to?(:content)
        content = result.content
        content.is_a?(Hash) ? content.transform_keys(&:to_sym) : content.to_s.strip
      else
        result
      end
    end

    # Mirrors `extract_comparable` for the expected side: Hash keys are
    # symbolized and Strings stripped; other values pass through.
    def normalize_expected(expected)
      case expected
      when Hash then expected.transform_keys(&:to_sym)
      when String then expected.strip
      else expected
      end
    end
  end
end
|
|
262
|
+
end
|
|
263
|
+
end
|
|
264
|
+
end
|
|
@@ -85,7 +85,8 @@ module RubyLLM
|
|
|
85
85
|
if config.async_logging && defined?(ExecutionLoggerJob)
|
|
86
86
|
ExecutionLoggerJob.perform_later(execution_data)
|
|
87
87
|
else
|
|
88
|
-
RubyLLM::Agents::Execution.create!(execution_data)
|
|
88
|
+
execution = RubyLLM::Agents::Execution.create!(execution_data)
|
|
89
|
+
result.execution_id = execution.id if result.respond_to?(:execution_id=)
|
|
89
90
|
end
|
|
90
91
|
rescue => e
|
|
91
92
|
Rails.logger.error("[RubyLLM::Agents] Failed to record #{execution_type} execution: #{e.message}") if defined?(Rails)
|