eval-ruby 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/MILESTONES.md +13 -0
- data/lib/eval_ruby/comparison.rb +18 -1
- data/lib/eval_ruby/configuration.rb +25 -2
- data/lib/eval_ruby/dataset.rb +47 -1
- data/lib/eval_ruby/evaluator.rb +36 -0
- data/lib/eval_ruby/judges/anthropic.rb +8 -0
- data/lib/eval_ruby/judges/base.rb +11 -0
- data/lib/eval_ruby/judges/openai.rb +8 -0
- data/lib/eval_ruby/metrics/base.rb +8 -0
- data/lib/eval_ruby/metrics/context_precision.rb +10 -0
- data/lib/eval_ruby/metrics/context_recall.rb +10 -0
- data/lib/eval_ruby/metrics/correctness.rb +13 -0
- data/lib/eval_ruby/metrics/faithfulness.rb +10 -0
- data/lib/eval_ruby/metrics/mrr.rb +8 -0
- data/lib/eval_ruby/metrics/ndcg.rb +10 -0
- data/lib/eval_ruby/metrics/precision_at_k.rb +9 -0
- data/lib/eval_ruby/metrics/recall_at_k.rb +9 -0
- data/lib/eval_ruby/metrics/relevance.rb +10 -0
- data/lib/eval_ruby/report.rb +38 -1
- data/lib/eval_ruby/result.rb +29 -1
- data/lib/eval_ruby/rspec.rb +48 -6
- data/lib/eval_ruby/version.rb +1 -1
- data/lib/eval_ruby.rb +52 -0
- metadata +2 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 69f4642cd2505ab6b0b54f36cd76fea2043f76306c425fa69e29027e4d0dc901
|
|
4
|
+
data.tar.gz: 70125b286a374c01af966a0a044edb9b0cbca02fa9acd6b944f2b0825ad60981
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 7ac13b5d60996d964a948b92bec466d3594c614e5a0bc3e138bfba31dda0dcca19876d01e2826ed1522962e7117b2eb69c7e3ea46567e4a09920f77f2031d09c
|
|
7
|
+
data.tar.gz: ca13c6f768516a2f21188c59511f045bd12c9fd4fe51819af705016c1282256b6765805a007265dbb3e299430c19b8fe80e1a29c22038b930b5799ab7d3a8355
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
eval-ruby (0.1.1)
|
|
4
|
+
eval-ruby (0.2.0)
|
|
5
5
|
csv
|
|
6
6
|
|
|
7
7
|
GEM
|
|
@@ -39,7 +39,7 @@ CHECKSUMS
|
|
|
39
39
|
bigdecimal (4.0.1) sha256=8b07d3d065a9f921c80ceaea7c9d4ae596697295b584c296fe599dd0ad01c4a7
|
|
40
40
|
crack (1.0.1) sha256=ff4a10390cd31d66440b7524eb1841874db86201d5b70032028553130b6d4c7e
|
|
41
41
|
csv (3.3.5) sha256=6e5134ac3383ef728b7f02725d9872934f523cb40b961479f69cf3afa6c8e73f
|
|
42
|
-
eval-ruby (0.1.1)
|
|
42
|
+
eval-ruby (0.2.0)
|
|
43
43
|
hashdiff (1.2.1) sha256=9c079dbc513dfc8833ab59c0c2d8f230fa28499cc5efb4b8dd276cf931457cd1
|
|
44
44
|
minitest (5.27.0) sha256=2d3b17f8a36fe7801c1adcffdbc38233b938eb0b4966e97a6739055a45fa77d5
|
|
45
45
|
public_suffix (7.0.5) sha256=1a8bb08f1bbea19228d3bed6e5ed908d1cb4f7c2726d18bd9cadf60bc676f623
|
data/MILESTONES.md
ADDED
data/lib/eval_ruby/comparison.rb
CHANGED
|
@@ -1,14 +1,27 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module EvalRuby
|
|
4
|
+
# Statistical comparison of two evaluation reports using paired t-tests.
|
|
5
|
+
#
|
|
6
|
+
# @example
|
|
7
|
+
# comparison = EvalRuby.compare(report_a, report_b)
|
|
8
|
+
# puts comparison.summary
|
|
9
|
+
# comparison.significant_improvements # => [:faithfulness]
|
|
4
10
|
class Comparison
|
|
5
|
-
|
|
11
|
+
# @return [Report] baseline report
|
|
12
|
+
attr_reader :report_a
|
|
6
13
|
|
|
14
|
+
# @return [Report] comparison report
|
|
15
|
+
attr_reader :report_b
|
|
16
|
+
|
|
17
|
+
# @param report_a [Report] baseline
|
|
18
|
+
# @param report_b [Report] comparison
|
|
7
19
|
def initialize(report_a, report_b)
|
|
8
20
|
@report_a = report_a
|
|
9
21
|
@report_b = report_b
|
|
10
22
|
end
|
|
11
23
|
|
|
24
|
+
# @return [String] formatted comparison table with deltas and p-values
|
|
12
25
|
def summary
|
|
13
26
|
lines = [
|
|
14
27
|
format("%-20s | %-10s | %-10s | %-8s | %s", "Metric", "A", "B", "Delta", "p-value"),
|
|
@@ -35,6 +48,10 @@ module EvalRuby
|
|
|
35
48
|
lines.join("\n")
|
|
36
49
|
end
|
|
37
50
|
|
|
51
|
+
# Returns metrics where report_b is significantly better than report_a.
|
|
52
|
+
#
|
|
53
|
+
# @param alpha [Float] significance level (default 0.05)
|
|
54
|
+
# @return [Array<Symbol>] metric names with significant improvements
|
|
38
55
|
def significant_improvements(alpha: 0.05)
|
|
39
56
|
all_metrics.select do |metric|
|
|
40
57
|
scores_a = @report_a.results.filter_map { |r| r.scores[metric] }
|
|
@@ -1,9 +1,32 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module EvalRuby
|
|
4
|
+
# Global configuration for EvalRuby.
|
|
5
|
+
#
|
|
6
|
+
# @example
|
|
7
|
+
# EvalRuby.configure do |config|
|
|
8
|
+
# config.judge_llm = :openai
|
|
9
|
+
# config.api_key = ENV["OPENAI_API_KEY"]
|
|
10
|
+
# config.judge_model = "gpt-4o"
|
|
11
|
+
# end
|
|
4
12
|
class Configuration
|
|
5
|
-
|
|
6
|
-
|
|
13
|
+
# @return [Symbol] LLM provider for judge (:openai or :anthropic)
|
|
14
|
+
attr_accessor :judge_llm
|
|
15
|
+
|
|
16
|
+
# @return [String] model name for the judge LLM
|
|
17
|
+
attr_accessor :judge_model
|
|
18
|
+
|
|
19
|
+
# @return [String, nil] API key for the judge LLM provider
|
|
20
|
+
attr_accessor :api_key
|
|
21
|
+
|
|
22
|
+
# @return [Float] default threshold for pass/fail decisions
|
|
23
|
+
attr_accessor :default_threshold
|
|
24
|
+
|
|
25
|
+
# @return [Integer] HTTP request timeout in seconds
|
|
26
|
+
attr_accessor :timeout
|
|
27
|
+
|
|
28
|
+
# @return [Integer] maximum number of retries on transient failures
|
|
29
|
+
attr_accessor :max_retries
|
|
7
30
|
|
|
8
31
|
def initialize
|
|
9
32
|
@judge_llm = :openai
|
data/lib/eval_ruby/dataset.rb
CHANGED
|
@@ -4,16 +4,36 @@ require "csv"
|
|
|
4
4
|
require "json"
|
|
5
5
|
|
|
6
6
|
module EvalRuby
|
|
7
|
+
# Collection of evaluation samples with import/export support.
|
|
8
|
+
# Supports CSV, JSON, and programmatic construction.
|
|
9
|
+
#
|
|
10
|
+
# @example
|
|
11
|
+
# dataset = EvalRuby::Dataset.new("my_test_set")
|
|
12
|
+
# dataset.add(question: "What is Ruby?", answer: "A language", ground_truth: "A language")
|
|
13
|
+
# report = EvalRuby.evaluate_batch(dataset)
|
|
7
14
|
class Dataset
|
|
8
15
|
include Enumerable
|
|
9
16
|
|
|
10
|
-
|
|
17
|
+
# @return [String] dataset name
|
|
18
|
+
attr_reader :name
|
|
11
19
|
|
|
20
|
+
# @return [Array<Hash>] sample entries
|
|
21
|
+
attr_reader :samples
|
|
22
|
+
|
|
23
|
+
# @param name [String] dataset name
|
|
12
24
|
def initialize(name = "default")
|
|
13
25
|
@name = name
|
|
14
26
|
@samples = []
|
|
15
27
|
end
|
|
16
28
|
|
|
29
|
+
# Adds a sample to the dataset.
|
|
30
|
+
#
|
|
31
|
+
# @param question [String]
|
|
32
|
+
# @param ground_truth [String, nil]
|
|
33
|
+
# @param relevant_contexts [Array<String>] alias for context
|
|
34
|
+
# @param answer [String, nil]
|
|
35
|
+
# @param context [Array<String>]
|
|
36
|
+
# @return [self]
|
|
17
37
|
def add(question:, ground_truth: nil, relevant_contexts: [], answer: nil, context: [])
|
|
18
38
|
@samples << {
|
|
19
39
|
question: question,
|
|
@@ -24,18 +44,26 @@ module EvalRuby
|
|
|
24
44
|
self
|
|
25
45
|
end
|
|
26
46
|
|
|
47
|
+
# @yield [Hash] each sample
|
|
27
48
|
def each(&block)
|
|
28
49
|
@samples.each(&block)
|
|
29
50
|
end
|
|
30
51
|
|
|
52
|
+
# @return [Integer] number of samples
|
|
31
53
|
def size
|
|
32
54
|
@samples.size
|
|
33
55
|
end
|
|
34
56
|
|
|
57
|
+
# @param index [Integer]
|
|
58
|
+
# @return [Hash] sample at index
|
|
35
59
|
def [](index)
|
|
36
60
|
@samples[index]
|
|
37
61
|
end
|
|
38
62
|
|
|
63
|
+
# Loads a dataset from a CSV file.
|
|
64
|
+
#
|
|
65
|
+
# @param path [String] path to CSV file
|
|
66
|
+
# @return [Dataset]
|
|
39
67
|
def self.from_csv(path)
|
|
40
68
|
dataset = new(File.basename(path, ".*"))
|
|
41
69
|
CSV.foreach(path, headers: true) do |row|
|
|
@@ -49,6 +77,10 @@ module EvalRuby
|
|
|
49
77
|
dataset
|
|
50
78
|
end
|
|
51
79
|
|
|
80
|
+
# Loads a dataset from a JSON file.
|
|
81
|
+
#
|
|
82
|
+
# @param path [String] path to JSON file
|
|
83
|
+
# @return [Dataset]
|
|
52
84
|
def self.from_json(path)
|
|
53
85
|
dataset = new(File.basename(path, ".*"))
|
|
54
86
|
data = JSON.parse(File.read(path))
|
|
@@ -64,6 +96,10 @@ module EvalRuby
|
|
|
64
96
|
dataset
|
|
65
97
|
end
|
|
66
98
|
|
|
99
|
+
# Exports dataset to CSV.
|
|
100
|
+
#
|
|
101
|
+
# @param path [String] output file path
|
|
102
|
+
# @return [void]
|
|
67
103
|
def to_csv(path)
|
|
68
104
|
CSV.open(path, "w") do |csv|
|
|
69
105
|
csv << %w[question answer context ground_truth]
|
|
@@ -78,10 +114,20 @@ module EvalRuby
|
|
|
78
114
|
end
|
|
79
115
|
end
|
|
80
116
|
|
|
117
|
+
# Exports dataset to JSON.
|
|
118
|
+
#
|
|
119
|
+
# @param path [String] output file path
|
|
120
|
+
# @return [void]
|
|
81
121
|
def to_json(path)
|
|
82
122
|
File.write(path, JSON.pretty_generate({name: @name, samples: @samples}))
|
|
83
123
|
end
|
|
84
124
|
|
|
125
|
+
# Generates a dataset from documents using an LLM.
|
|
126
|
+
#
|
|
127
|
+
# @param documents [Array<String>] file paths to source documents
|
|
128
|
+
# @param questions_per_doc [Integer] number of QA pairs per document
|
|
129
|
+
# @param llm [Symbol] LLM provider (:openai or :anthropic)
|
|
130
|
+
# @return [Dataset]
|
|
85
131
|
def self.generate(documents:, questions_per_doc: 5, llm: :openai)
|
|
86
132
|
config = EvalRuby.configuration.dup
|
|
87
133
|
config.judge_llm = llm
|
data/lib/eval_ruby/evaluator.rb
CHANGED
|
@@ -1,12 +1,25 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module EvalRuby
|
|
4
|
+
# Runs all configured metrics on a given question/answer/context tuple.
|
|
5
|
+
#
|
|
6
|
+
# @example
|
|
7
|
+
# evaluator = EvalRuby::Evaluator.new
|
|
8
|
+
# result = evaluator.evaluate(question: "...", answer: "...", context: [...])
|
|
4
9
|
class Evaluator
|
|
10
|
+
# @param config [Configuration] configuration to use
|
|
5
11
|
def initialize(config = EvalRuby.configuration)
|
|
6
12
|
@config = config
|
|
7
13
|
@judge = build_judge(config)
|
|
8
14
|
end
|
|
9
15
|
|
|
16
|
+
# Evaluates an LLM response across quality metrics.
|
|
17
|
+
#
|
|
18
|
+
# @param question [String] the input question
|
|
19
|
+
# @param answer [String] the LLM-generated answer
|
|
20
|
+
# @param context [Array<String>] retrieved context chunks
|
|
21
|
+
# @param ground_truth [String, nil] expected correct answer
|
|
22
|
+
# @return [Result]
|
|
10
23
|
def evaluate(question:, answer:, context: [], ground_truth: nil)
|
|
11
24
|
scores = {}
|
|
12
25
|
details = {}
|
|
@@ -37,6 +50,12 @@ module EvalRuby
|
|
|
37
50
|
Result.new(scores: scores, details: details)
|
|
38
51
|
end
|
|
39
52
|
|
|
53
|
+
# Evaluates retrieval quality using IR metrics.
|
|
54
|
+
#
|
|
55
|
+
# @param question [String] the input question
|
|
56
|
+
# @param retrieved [Array<String>] retrieved document IDs
|
|
57
|
+
# @param relevant [Array<String>] ground-truth relevant document IDs
|
|
58
|
+
# @return [RetrievalResult]
|
|
40
59
|
def evaluate_retrieval(question:, retrieved:, relevant:)
|
|
41
60
|
RetrievalResult.new(retrieved: retrieved, relevant: relevant)
|
|
42
61
|
end
|
|
@@ -55,32 +74,49 @@ module EvalRuby
|
|
|
55
74
|
end
|
|
56
75
|
end
|
|
57
76
|
|
|
77
|
+
# Holds retrieval evaluation results with IR metric accessors.
|
|
78
|
+
#
|
|
79
|
+
# @example
|
|
80
|
+
# result = EvalRuby.evaluate_retrieval(question: "...", retrieved: [...], relevant: [...])
|
|
81
|
+
# result.precision_at_k(5) # => 0.6
|
|
82
|
+
# result.mrr # => 1.0
|
|
58
83
|
class RetrievalResult
|
|
84
|
+
# @param retrieved [Array<String>] retrieved document IDs in ranked order
|
|
85
|
+
# @param relevant [Array<String>] ground-truth relevant document IDs
|
|
59
86
|
def initialize(retrieved:, relevant:)
|
|
60
87
|
@retrieved = retrieved
|
|
61
88
|
@relevant = relevant
|
|
62
89
|
end
|
|
63
90
|
|
|
91
|
+
# @param k [Integer] number of top results to consider
|
|
92
|
+
# @return [Float] precision at k
|
|
64
93
|
def precision_at_k(k)
|
|
65
94
|
Metrics::PrecisionAtK.new.call(retrieved: @retrieved, relevant: @relevant, k: k)
|
|
66
95
|
end
|
|
67
96
|
|
|
97
|
+
# @param k [Integer] number of top results to consider
|
|
98
|
+
# @return [Float] recall at k
|
|
68
99
|
def recall_at_k(k)
|
|
69
100
|
Metrics::RecallAtK.new.call(retrieved: @retrieved, relevant: @relevant, k: k)
|
|
70
101
|
end
|
|
71
102
|
|
|
103
|
+
# @return [Float] mean reciprocal rank
|
|
72
104
|
def mrr
|
|
73
105
|
Metrics::MRR.new.call(retrieved: @retrieved, relevant: @relevant)
|
|
74
106
|
end
|
|
75
107
|
|
|
108
|
+
# @param k [Integer, nil] number of top results (nil for all)
|
|
109
|
+
# @return [Float] normalized discounted cumulative gain
|
|
76
110
|
def ndcg(k: nil)
|
|
77
111
|
Metrics::NDCG.new.call(retrieved: @retrieved, relevant: @relevant, k: k)
|
|
78
112
|
end
|
|
79
113
|
|
|
114
|
+
# @return [Float] 1.0 if any relevant doc is retrieved, 0.0 otherwise
|
|
80
115
|
def hit_rate
|
|
81
116
|
@retrieved.any? { |doc| @relevant.include?(doc) } ? 1.0 : 0.0
|
|
82
117
|
end
|
|
83
118
|
|
|
119
|
+
# @return [Hash{Symbol => Float}] all retrieval metrics
|
|
84
120
|
def to_h
|
|
85
121
|
{
|
|
86
122
|
precision_at_k: precision_at_k(@retrieved.length),
|
|
@@ -6,14 +6,22 @@ require "uri"
|
|
|
6
6
|
|
|
7
7
|
module EvalRuby
|
|
8
8
|
module Judges
|
|
9
|
+
# Anthropic-based LLM judge using the Messages API.
|
|
10
|
+
# Requires an API key set via {Configuration#api_key}.
|
|
9
11
|
class Anthropic < Base
|
|
10
12
|
API_URL = "https://api.anthropic.com/v1/messages"
|
|
11
13
|
|
|
14
|
+
# @param config [Configuration]
|
|
15
|
+
# @raise [EvalRuby::Error] if API key is missing
|
|
12
16
|
def initialize(config)
|
|
13
17
|
super
|
|
14
18
|
raise EvalRuby::Error, "API key is required. Set via EvalRuby.configure { |c| c.api_key = '...' }" if @config.api_key.nil? || @config.api_key.empty?
|
|
15
19
|
end
|
|
16
20
|
|
|
21
|
+
# @param prompt [String] the evaluation prompt
|
|
22
|
+
# @return [Hash, nil] parsed JSON response
|
|
23
|
+
# @raise [EvalRuby::Error] on API errors
|
|
24
|
+
# @raise [EvalRuby::TimeoutError] after max retries
|
|
17
25
|
def call(prompt)
|
|
18
26
|
retries = 0
|
|
19
27
|
begin
|
|
@@ -2,17 +2,28 @@
|
|
|
2
2
|
|
|
3
3
|
module EvalRuby
|
|
4
4
|
module Judges
|
|
5
|
+
# Abstract base class for LLM judges.
|
|
6
|
+
# Subclasses must implement {#call} to send prompts to an LLM and parse JSON responses.
|
|
5
7
|
class Base
|
|
8
|
+
# @param config [Configuration]
|
|
6
9
|
def initialize(config)
|
|
7
10
|
@config = config
|
|
8
11
|
end
|
|
9
12
|
|
|
13
|
+
# Sends a prompt to the LLM and returns parsed JSON.
|
|
14
|
+
#
|
|
15
|
+
# @param prompt [String] the evaluation prompt
|
|
16
|
+
# @return [Hash, nil] parsed JSON response
|
|
10
17
|
def call(prompt)
|
|
11
18
|
raise NotImplementedError, "#{self.class}#call must be implemented"
|
|
12
19
|
end
|
|
13
20
|
|
|
14
21
|
private
|
|
15
22
|
|
|
23
|
+
# Extracts and parses the first JSON object from text.
|
|
24
|
+
#
|
|
25
|
+
# @param text [String] raw LLM response text
|
|
26
|
+
# @return [Hash, nil] parsed JSON or nil if not found
|
|
16
27
|
def parse_json_response(text)
|
|
17
28
|
match = text.match(/\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}/m)
|
|
18
29
|
return nil unless match
|
|
@@ -6,14 +6,22 @@ require "uri"
|
|
|
6
6
|
|
|
7
7
|
module EvalRuby
|
|
8
8
|
module Judges
|
|
9
|
+
# OpenAI-based LLM judge using the Chat Completions API.
|
|
10
|
+
# Requires an API key set via {Configuration#api_key}.
|
|
9
11
|
class OpenAI < Base
|
|
10
12
|
API_URL = "https://api.openai.com/v1/chat/completions"
|
|
11
13
|
|
|
14
|
+
# @param config [Configuration]
|
|
15
|
+
# @raise [EvalRuby::Error] if API key is missing
|
|
12
16
|
def initialize(config)
|
|
13
17
|
super
|
|
14
18
|
raise EvalRuby::Error, "API key is required. Set via EvalRuby.configure { |c| c.api_key = '...' }" if @config.api_key.nil? || @config.api_key.empty?
|
|
15
19
|
end
|
|
16
20
|
|
|
21
|
+
# @param prompt [String] the evaluation prompt
|
|
22
|
+
# @return [Hash, nil] parsed JSON response
|
|
23
|
+
# @raise [EvalRuby::Error] on API errors
|
|
24
|
+
# @raise [EvalRuby::TimeoutError] after max retries
|
|
17
25
|
def call(prompt)
|
|
18
26
|
retries = 0
|
|
19
27
|
begin
|
|
@@ -2,13 +2,21 @@
|
|
|
2
2
|
|
|
3
3
|
module EvalRuby
|
|
4
4
|
module Metrics
|
|
5
|
+
# Abstract base class for all evaluation metrics.
|
|
6
|
+
# Subclasses must implement {#call}.
|
|
5
7
|
class Base
|
|
8
|
+
# @return [EvalRuby::Judges::Base, nil] the LLM judge instance
|
|
6
9
|
attr_reader :judge
|
|
7
10
|
|
|
11
|
+
# @param judge [EvalRuby::Judges::Base, nil] LLM judge for evaluation
|
|
8
12
|
def initialize(judge: nil)
|
|
9
13
|
@judge = judge
|
|
10
14
|
end
|
|
11
15
|
|
|
16
|
+
# Evaluates the metric.
|
|
17
|
+
#
|
|
18
|
+
# @param kwargs [Hash] metric-specific keyword arguments
|
|
19
|
+
# @return [Hash{Symbol => Object}] must include :score and :details keys
|
|
12
20
|
def call(**kwargs)
|
|
13
21
|
raise NotImplementedError, "#{self.class}#call must be implemented"
|
|
14
22
|
end
|
|
@@ -2,6 +2,13 @@
|
|
|
2
2
|
|
|
3
3
|
module EvalRuby
|
|
4
4
|
module Metrics
|
|
5
|
+
# Measures the proportion of retrieved contexts that are relevant to the question.
|
|
6
|
+
# Uses an LLM judge to evaluate each context's relevance.
|
|
7
|
+
#
|
|
8
|
+
# @example
|
|
9
|
+
# metric = ContextPrecision.new(judge: judge)
|
|
10
|
+
# result = metric.call(question: "What is Ruby?", context: ["Ruby is...", "Weather..."])
|
|
11
|
+
# result[:score] # => 0.5
|
|
5
12
|
class ContextPrecision < Base
|
|
6
13
|
PROMPT_TEMPLATE = <<~PROMPT
|
|
7
14
|
Given the following question and a list of retrieved contexts, evaluate
|
|
@@ -19,6 +26,9 @@ module EvalRuby
|
|
|
19
26
|
The score should be the proportion of relevant contexts (0.0 to 1.0).
|
|
20
27
|
PROMPT
|
|
21
28
|
|
|
29
|
+
# @param question [String] the input question
|
|
30
|
+
# @param context [Array<String>, String] retrieved context chunks
|
|
31
|
+
# @return [Hash] :score (Float 0.0-1.0) and :details (:evaluations Array)
|
|
22
32
|
def call(question:, context:, **_kwargs)
|
|
23
33
|
contexts = context.is_a?(Array) ? context : [context.to_s]
|
|
24
34
|
return {score: 0.0, details: {}} if contexts.empty?
|
|
@@ -2,6 +2,13 @@
|
|
|
2
2
|
|
|
3
3
|
module EvalRuby
|
|
4
4
|
module Metrics
|
|
5
|
+
# Measures whether retrieved contexts contain enough information to support the ground truth.
|
|
6
|
+
# Uses an LLM judge to check if each ground truth statement is attributable to context.
|
|
7
|
+
#
|
|
8
|
+
# @example
|
|
9
|
+
# metric = ContextRecall.new(judge: judge)
|
|
10
|
+
# result = metric.call(context: ["Ruby was created in 1995."], ground_truth: "Ruby was created in 1995.")
|
|
11
|
+
# result[:score] # => 1.0
|
|
5
12
|
class ContextRecall < Base
|
|
6
13
|
PROMPT_TEMPLATE = <<~PROMPT
|
|
7
14
|
Given the following ground truth answer and retrieved contexts, evaluate
|
|
@@ -20,6 +27,9 @@ module EvalRuby
|
|
|
20
27
|
The score should be the proportion of statements attributed to context (0.0 to 1.0).
|
|
21
28
|
PROMPT
|
|
22
29
|
|
|
30
|
+
# @param context [Array<String>, String] retrieved context chunks
|
|
31
|
+
# @param ground_truth [String] expected correct answer
|
|
32
|
+
# @return [Hash] :score (Float 0.0-1.0) and :details (:statements Array)
|
|
23
33
|
def call(context:, ground_truth:, **_kwargs)
|
|
24
34
|
contexts = context.is_a?(Array) ? context : [context.to_s]
|
|
25
35
|
return {score: 0.0, details: {}} if contexts.empty?
|
|
@@ -2,6 +2,16 @@
|
|
|
2
2
|
|
|
3
3
|
module EvalRuby
|
|
4
4
|
module Metrics
|
|
5
|
+
# Measures factual correctness of an answer against ground truth.
|
|
6
|
+
# Uses LLM judge when available, falls back to token overlap F1 score.
|
|
7
|
+
#
|
|
8
|
+
# @example With LLM judge
|
|
9
|
+
# metric = Correctness.new(judge: judge)
|
|
10
|
+
# result = metric.call(answer: "Paris", ground_truth: "Paris")
|
|
11
|
+
#
|
|
12
|
+
# @example Without judge (string similarity)
|
|
13
|
+
# metric = Correctness.new
|
|
14
|
+
# result = metric.call(answer: "The capital is Paris", ground_truth: "Paris is the capital")
|
|
5
15
|
class Correctness < Base
|
|
6
16
|
PROMPT_TEMPLATE = <<~PROMPT
|
|
7
17
|
Given the following answer and ground truth, evaluate whether the answer
|
|
@@ -23,6 +33,9 @@ module EvalRuby
|
|
|
23
33
|
Respond in JSON: {"reasoning": "...", "score": 0.0}
|
|
24
34
|
PROMPT
|
|
25
35
|
|
|
36
|
+
# @param answer [String] the LLM-generated answer
|
|
37
|
+
# @param ground_truth [String] the expected correct answer
|
|
38
|
+
# @return [Hash] :score (Float 0.0-1.0) and :details
|
|
26
39
|
def call(answer:, ground_truth:, **_kwargs)
|
|
27
40
|
if judge
|
|
28
41
|
llm_score(answer, ground_truth)
|
|
@@ -2,6 +2,13 @@
|
|
|
2
2
|
|
|
3
3
|
module EvalRuby
|
|
4
4
|
module Metrics
|
|
5
|
+
# Measures whether an answer is supported by the provided context.
|
|
6
|
+
# Uses an LLM judge to identify claims and check if each is supported.
|
|
7
|
+
#
|
|
8
|
+
# @example
|
|
9
|
+
# metric = Faithfulness.new(judge: judge)
|
|
10
|
+
# result = metric.call(answer: "Paris is in France", context: ["Paris is the capital of France."])
|
|
11
|
+
# result[:score] # => 1.0
|
|
5
12
|
class Faithfulness < Base
|
|
6
13
|
PROMPT_TEMPLATE = <<~PROMPT
|
|
7
14
|
Given the following context and answer, evaluate whether the answer
|
|
@@ -25,6 +32,9 @@ module EvalRuby
|
|
|
25
32
|
Respond in JSON: {"claims": [{"claim": "...", "supported": true}], "score": 0.0}
|
|
26
33
|
PROMPT
|
|
27
34
|
|
|
35
|
+
# @param answer [String] the LLM-generated answer
|
|
36
|
+
# @param context [Array<String>, String] retrieved context chunks
|
|
37
|
+
# @return [Hash] :score (Float 0.0-1.0) and :details (:claims Array)
|
|
28
38
|
def call(answer:, context:, **_kwargs)
|
|
29
39
|
context_text = context.is_a?(Array) ? context.join("\n\n") : context.to_s
|
|
30
40
|
prompt = format(PROMPT_TEMPLATE, context: context_text, answer: answer)
|
|
@@ -2,7 +2,15 @@
|
|
|
2
2
|
|
|
3
3
|
module EvalRuby
|
|
4
4
|
module Metrics
|
|
5
|
+
# Computes Mean Reciprocal Rank: 1/(position of first relevant document).
|
|
6
|
+
#
|
|
7
|
+
# @example
|
|
8
|
+
# MRR.new.call(retrieved: ["a", "b", "c"], relevant: ["b"])
|
|
9
|
+
# # => 0.5
|
|
5
10
|
class MRR < Base
|
|
11
|
+
# @param retrieved [Array<String>] retrieved document IDs in ranked order
|
|
12
|
+
# @param relevant [Array<String>] ground-truth relevant document IDs
|
|
13
|
+
# @return [Float] reciprocal rank (0.0-1.0)
|
|
6
14
|
def call(retrieved:, relevant:, **_kwargs)
|
|
7
15
|
retrieved.each_with_index do |doc, i|
|
|
8
16
|
return 1.0 / (i + 1) if relevant.include?(doc)
|
|
@@ -2,7 +2,17 @@
|
|
|
2
2
|
|
|
3
3
|
module EvalRuby
|
|
4
4
|
module Metrics
|
|
5
|
+
# Computes Normalized Discounted Cumulative Gain (NDCG).
|
|
6
|
+
# Measures ranking quality by comparing actual ranking to ideal ranking.
|
|
7
|
+
#
|
|
8
|
+
# @example
|
|
9
|
+
# NDCG.new.call(retrieved: ["a", "b", "c"], relevant: ["a", "c"])
|
|
10
|
+
# # => 0.863
|
|
5
11
|
class NDCG < Base
|
|
12
|
+
# @param retrieved [Array<String>] retrieved document IDs in ranked order
|
|
13
|
+
# @param relevant [Array<String>] ground-truth relevant document IDs
|
|
14
|
+
# @param k [Integer, nil] number of top results (nil for all)
|
|
15
|
+
# @return [Float] NDCG score (0.0-1.0)
|
|
6
16
|
def call(retrieved:, relevant:, k: nil, **_kwargs)
|
|
7
17
|
k ||= retrieved.length
|
|
8
18
|
top_k = retrieved.first(k)
|
|
@@ -2,7 +2,16 @@
|
|
|
2
2
|
|
|
3
3
|
module EvalRuby
|
|
4
4
|
module Metrics
|
|
5
|
+
# Computes Precision@K: the fraction of top-k retrieved documents that are relevant.
|
|
6
|
+
#
|
|
7
|
+
# @example
|
|
8
|
+
# PrecisionAtK.new.call(retrieved: ["a", "b", "c"], relevant: ["a", "c"], k: 3)
|
|
9
|
+
# # => 0.667
|
|
5
10
|
class PrecisionAtK < Base
|
|
11
|
+
# @param retrieved [Array<String>] retrieved document IDs in ranked order
|
|
12
|
+
# @param relevant [Array<String>] ground-truth relevant document IDs
|
|
13
|
+
# @param k [Integer, nil] number of top results (nil for all)
|
|
14
|
+
# @return [Float] precision score (0.0-1.0)
|
|
6
15
|
def call(retrieved:, relevant:, k: nil, **_kwargs)
|
|
7
16
|
k ||= retrieved.length
|
|
8
17
|
top_k = retrieved.first(k)
|
|
@@ -2,7 +2,16 @@
|
|
|
2
2
|
|
|
3
3
|
module EvalRuby
|
|
4
4
|
module Metrics
|
|
5
|
+
# Computes Recall@K: the fraction of relevant documents found in the top-k results.
|
|
6
|
+
#
|
|
7
|
+
# @example
|
|
8
|
+
# RecallAtK.new.call(retrieved: ["a", "b", "c"], relevant: ["a", "c"], k: 3)
|
|
9
|
+
# # => 1.0
|
|
5
10
|
class RecallAtK < Base
|
|
11
|
+
# @param retrieved [Array<String>] retrieved document IDs in ranked order
|
|
12
|
+
# @param relevant [Array<String>] ground-truth relevant document IDs
|
|
13
|
+
# @param k [Integer, nil] number of top results (nil for all)
|
|
14
|
+
# @return [Float] recall score (0.0-1.0)
|
|
6
15
|
def call(retrieved:, relevant:, k: nil, **_kwargs)
|
|
7
16
|
return 0.0 if relevant.empty?
|
|
8
17
|
|
|
@@ -2,6 +2,13 @@
|
|
|
2
2
|
|
|
3
3
|
module EvalRuby
|
|
4
4
|
module Metrics
|
|
5
|
+
# Measures whether an answer is relevant to the question.
|
|
6
|
+
# Uses an LLM judge to evaluate relevance on a 0.0-1.0 scale.
|
|
7
|
+
#
|
|
8
|
+
# @example
|
|
9
|
+
# metric = Relevance.new(judge: judge)
|
|
10
|
+
# result = metric.call(question: "What is Ruby?", answer: "Ruby is a language.")
|
|
11
|
+
# result[:score] # => 0.95
|
|
5
12
|
class Relevance < Base
|
|
6
13
|
PROMPT_TEMPLATE = <<~PROMPT
|
|
7
14
|
Given the following question and answer, evaluate whether the answer
|
|
@@ -21,6 +28,9 @@ module EvalRuby
|
|
|
21
28
|
Respond in JSON: {"reasoning": "...", "score": 0.0}
|
|
22
29
|
PROMPT
|
|
23
30
|
|
|
31
|
+
# @param question [String] the input question
|
|
32
|
+
# @param answer [String] the LLM-generated answer
|
|
33
|
+
# @return [Hash] :score (Float 0.0-1.0) and :details (:reasoning String)
|
|
24
34
|
def call(question:, answer:, **_kwargs)
|
|
25
35
|
prompt = format(PROMPT_TEMPLATE, question: question, answer: answer)
|
|
26
36
|
|
data/lib/eval_ruby/report.rb
CHANGED
|
@@ -4,15 +4,33 @@ require "csv"
|
|
|
4
4
|
require "json"
|
|
5
5
|
|
|
6
6
|
module EvalRuby
|
|
7
|
+
# Aggregated evaluation report across multiple samples.
|
|
8
|
+
# Provides statistical summaries, filtering, and export functionality.
|
|
9
|
+
#
|
|
10
|
+
# @example
|
|
11
|
+
# report = EvalRuby.evaluate_batch(dataset)
|
|
12
|
+
# puts report.summary
|
|
13
|
+
# report.to_csv("results.csv")
|
|
7
14
|
class Report
|
|
8
|
-
|
|
15
|
+
# @return [Array<Result>] individual evaluation results
|
|
16
|
+
attr_reader :results
|
|
9
17
|
|
|
18
|
+
# @return [Float, nil] total evaluation duration in seconds
|
|
19
|
+
attr_reader :duration
|
|
20
|
+
|
|
21
|
+
# @return [Array<Hash>] original sample data
|
|
22
|
+
attr_reader :samples
|
|
23
|
+
|
|
24
|
+
# @param results [Array<Result>]
|
|
25
|
+
# @param samples [Array<Hash>]
|
|
26
|
+
# @param duration [Float, nil]
|
|
10
27
|
def initialize(results:, samples: [], duration: nil)
|
|
11
28
|
@results = results
|
|
12
29
|
@samples = samples
|
|
13
30
|
@duration = duration
|
|
14
31
|
end
|
|
15
32
|
|
|
33
|
+
# @return [String] human-readable summary with mean and std for each metric
|
|
16
34
|
def summary
|
|
17
35
|
lines = []
|
|
18
36
|
metric_stats.each do |metric, stats|
|
|
@@ -23,6 +41,9 @@ module EvalRuby
|
|
|
23
41
|
lines.join("\n")
|
|
24
42
|
end
|
|
25
43
|
|
|
44
|
+
# Computes per-metric statistics (mean, std, min, max).
|
|
45
|
+
#
|
|
46
|
+
# @return [Hash{Symbol => Hash}] metric name to stats hash
|
|
26
47
|
def metric_stats
|
|
27
48
|
return {} if @results.empty?
|
|
28
49
|
|
|
@@ -39,15 +60,27 @@ module EvalRuby
|
|
|
39
60
|
end
|
|
40
61
|
end
|
|
41
62
|
|
|
63
|
+
# Returns the n worst-scoring results.
|
|
64
|
+
#
|
|
65
|
+
# @param n [Integer] number of results to return
|
|
66
|
+
# @return [Array<Result>]
|
|
42
67
|
def worst(n = 5)
|
|
43
68
|
@results.sort_by { |r| r.overall || 0.0 }.first(n)
|
|
44
69
|
end
|
|
45
70
|
|
|
71
|
+
# Returns results below the threshold.
|
|
72
|
+
#
|
|
73
|
+
# @param threshold [Float, nil] score threshold (defaults to config default_threshold)
|
|
74
|
+
# @return [Array<Result>]
|
|
46
75
|
def failures(threshold: nil)
|
|
47
76
|
threshold ||= EvalRuby.configuration.default_threshold
|
|
48
77
|
@results.select { |r| (r.overall || 0.0) < threshold }
|
|
49
78
|
end
|
|
50
79
|
|
|
80
|
+
# Exports results to CSV.
|
|
81
|
+
#
|
|
82
|
+
# @param path [String] output file path
|
|
83
|
+
# @return [void]
|
|
51
84
|
def to_csv(path)
|
|
52
85
|
return if @results.empty?
|
|
53
86
|
|
|
@@ -61,6 +94,10 @@ module EvalRuby
|
|
|
61
94
|
end
|
|
62
95
|
end
|
|
63
96
|
|
|
97
|
+
# Exports results to JSON.
|
|
98
|
+
#
|
|
99
|
+
# @param path [String] output file path
|
|
100
|
+
# @return [void]
|
|
64
101
|
def to_json(path)
|
|
65
102
|
data = @results.each_with_index.map do |result, i|
|
|
66
103
|
{index: i, scores: result.scores, overall: result.overall, sample: @samples[i]}
|
data/lib/eval_ruby/result.rb
CHANGED
|
@@ -1,20 +1,46 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module EvalRuby
|
|
4
|
+
# Holds evaluation scores and details for a single sample.
|
|
5
|
+
#
|
|
6
|
+
# @example
|
|
7
|
+
# result = EvalRuby.evaluate(question: "...", answer: "...", context: [...])
|
|
8
|
+
# result.faithfulness # => 0.95
|
|
9
|
+
# result.overall # => 0.87
|
|
4
10
|
class Result
|
|
5
11
|
METRICS = %i[faithfulness relevance correctness context_precision context_recall].freeze
|
|
6
12
|
|
|
7
|
-
|
|
13
|
+
# @return [Hash{Symbol => Float}] metric name to score mapping
|
|
14
|
+
attr_reader :scores
|
|
8
15
|
|
|
16
|
+
# @return [Hash{Symbol => Hash}] metric name to details mapping
|
|
17
|
+
attr_reader :details
|
|
18
|
+
|
|
19
|
+
# @param scores [Hash{Symbol => Float}]
|
|
20
|
+
# @param details [Hash{Symbol => Hash}]
|
|
9
21
|
def initialize(scores: {}, details: {})
|
|
10
22
|
@scores = scores
|
|
11
23
|
@details = details
|
|
12
24
|
end
|
|
13
25
|
|
|
14
26
|
METRICS.each do |metric|
|
|
27
|
+
# @!method faithfulness
|
|
28
|
+
# @return [Float, nil] faithfulness score
|
|
29
|
+
# @!method relevance
|
|
30
|
+
# @return [Float, nil] relevance score
|
|
31
|
+
# @!method correctness
|
|
32
|
+
# @return [Float, nil] correctness score
|
|
33
|
+
# @!method context_precision
|
|
34
|
+
# @return [Float, nil] context precision score
|
|
35
|
+
# @!method context_recall
|
|
36
|
+
# @return [Float, nil] context recall score
|
|
15
37
|
define_method(metric) { @scores[metric] }
|
|
16
38
|
end
|
|
17
39
|
|
|
40
|
+
# Computes a weighted average of all available scores.
|
|
41
|
+
#
|
|
42
|
+
# @param weights [Hash{Symbol => Float}, nil] custom weights per metric
|
|
43
|
+
# @return [Float, nil] weighted average score, or nil if no scores available
|
|
18
44
|
def overall(weights: nil)
|
|
19
45
|
weights ||= METRICS.each_with_object({}) { |m, h| h[m] = 1.0 }
|
|
20
46
|
available = @scores.select { |k, v| weights.key?(k) && v }
|
|
@@ -24,10 +50,12 @@ module EvalRuby
|
|
|
24
50
|
available.sum { |k, v| v * weights[k] } / total_weight
|
|
25
51
|
end
|
|
26
52
|
|
|
53
|
+
# @return [Hash] scores plus overall
|
|
27
54
|
def to_h
|
|
28
55
|
@scores.merge(overall: overall)
|
|
29
56
|
end
|
|
30
57
|
|
|
58
|
+
# @return [String] human-readable summary
|
|
31
59
|
def to_s
|
|
32
60
|
lines = @scores.map { |k, v| " #{k}: #{v&.round(4) || 'N/A'}" }
|
|
33
61
|
lines << " overall: #{overall&.round(4) || 'N/A'}"
|
data/lib/eval_ruby/rspec.rb
CHANGED
|
@@ -4,48 +4,67 @@ require "eval_ruby"
|
|
|
4
4
|
|
|
5
5
|
module EvalRuby
|
|
6
6
|
module RSpecMatchers
|
|
7
|
+
# RSpec matcher that checks if an answer is faithful to the given context.
|
|
8
|
+
#
|
|
9
|
+
# @example
|
|
10
|
+
# expect(answer).to be_faithful_to(context)
|
|
11
|
+
# expect(answer).to be_faithful_to(context).with_threshold(0.9)
|
|
7
12
|
class BeFaithfulTo
|
|
8
|
-
def initialize(context)
|
|
13
|
+
def initialize(context, judge: nil)
|
|
9
14
|
@context = Array(context)
|
|
10
15
|
@threshold = 0.8
|
|
16
|
+
@judge = judge
|
|
11
17
|
end
|
|
12
18
|
|
|
19
|
+
# @param threshold [Float] minimum faithfulness score (0.0 - 1.0)
|
|
20
|
+
# @return [self]
|
|
13
21
|
def with_threshold(threshold)
|
|
14
22
|
@threshold = threshold
|
|
15
23
|
self
|
|
16
24
|
end
|
|
17
25
|
|
|
26
|
+
# @param answer [String] the LLM-generated answer to evaluate
|
|
27
|
+
# @return [Boolean]
|
|
18
28
|
def matches?(answer)
|
|
19
29
|
@answer = answer
|
|
20
|
-
|
|
21
|
-
result = Metrics::Faithfulness.new(judge:
|
|
30
|
+
j = @judge || EvalRuby.send(:default_judge)
|
|
31
|
+
result = Metrics::Faithfulness.new(judge: j).call(answer: answer, context: @context)
|
|
22
32
|
@score = result[:score]
|
|
23
33
|
@score >= @threshold
|
|
24
34
|
end
|
|
25
35
|
|
|
36
|
+
# @return [String]
|
|
26
37
|
def failure_message
|
|
27
38
|
"expected answer to be faithful to context (threshold: #{@threshold}), but got score #{@score.round(4)}"
|
|
28
39
|
end
|
|
29
40
|
|
|
41
|
+
# @return [String]
|
|
30
42
|
def failure_message_when_negated
|
|
31
43
|
"expected answer not to be faithful to context, but got score #{@score.round(4)}"
|
|
32
44
|
end
|
|
33
45
|
end
|
|
34
46
|
|
|
47
|
+
# RSpec matcher that checks precision@k for retrieval results.
|
|
48
|
+
#
|
|
49
|
+
# @example
|
|
50
|
+
# expect(retrieval_result).to have_precision_at_k(5).above(0.8)
|
|
35
51
|
class HavePrecisionAtK
|
|
36
52
|
def initialize(k)
|
|
37
53
|
@k = k
|
|
38
54
|
@threshold = 0.5
|
|
39
55
|
end
|
|
40
56
|
|
|
57
|
+
# @param threshold [Float] minimum precision score (0.0 - 1.0)
|
|
58
|
+
# @return [self]
|
|
41
59
|
def above(threshold)
|
|
42
60
|
@threshold = threshold
|
|
43
61
|
self
|
|
44
62
|
end
|
|
45
63
|
|
|
64
|
+
# @param results [EvalRuby::RetrievalResult]
|
|
65
|
+
# @return [Boolean]
|
|
46
66
|
def matches?(results)
|
|
47
67
|
@results = results
|
|
48
|
-
# results should respond to retrieved and relevant, or be arrays
|
|
49
68
|
if results.is_a?(EvalRuby::RetrievalResult)
|
|
50
69
|
@score = results.precision_at_k(@k)
|
|
51
70
|
else
|
|
@@ -54,17 +73,40 @@ module EvalRuby
|
|
|
54
73
|
@score >= @threshold
|
|
55
74
|
end
|
|
56
75
|
|
|
76
|
+
# @return [String]
|
|
57
77
|
def failure_message
|
|
58
78
|
"expected precision@#{@k} >= #{@threshold}, but got #{@score.round(4)}"
|
|
59
79
|
end
|
|
60
80
|
end
|
|
61
81
|
|
|
62
|
-
|
|
63
|
-
|
|
82
|
+
# @param context [Array<String>, String] context to check faithfulness against
|
|
83
|
+
# @param judge [EvalRuby::Judges::Base, nil] optional judge (uses configured default if nil)
|
|
84
|
+
# @return [BeFaithfulTo]
|
|
85
|
+
def be_faithful_to(context, judge: nil)
|
|
86
|
+
BeFaithfulTo.new(context, judge: judge)
|
|
64
87
|
end
|
|
65
88
|
|
|
89
|
+
# @param k [Integer] number of top results to evaluate
|
|
90
|
+
# @return [HavePrecisionAtK]
|
|
66
91
|
def have_precision_at_k(k)
|
|
67
92
|
HavePrecisionAtK.new(k)
|
|
68
93
|
end
|
|
69
94
|
end
|
|
95
|
+
|
|
96
|
+
class << self
|
|
97
|
+
private
|
|
98
|
+
|
|
99
|
+
# Build a judge from the current configuration.
|
|
100
|
+
# @return [EvalRuby::Judges::Base]
|
|
101
|
+
def default_judge
|
|
102
|
+
case configuration.judge_llm
|
|
103
|
+
when :openai
|
|
104
|
+
Judges::OpenAI.new(configuration)
|
|
105
|
+
when :anthropic
|
|
106
|
+
Judges::Anthropic.new(configuration)
|
|
107
|
+
else
|
|
108
|
+
raise Error, "Unknown judge LLM: #{configuration.judge_llm}"
|
|
109
|
+
end
|
|
110
|
+
end
|
|
111
|
+
end
|
|
70
112
|
end
|
data/lib/eval_ruby/version.rb
CHANGED
data/lib/eval_ruby.rb
CHANGED
|
@@ -21,6 +21,27 @@ require_relative "eval_ruby/report"
|
|
|
21
21
|
require_relative "eval_ruby/dataset"
|
|
22
22
|
require_relative "eval_ruby/comparison"
|
|
23
23
|
|
|
24
|
+
# Evaluation framework for LLM and RAG applications.
|
|
25
|
+
# Measures quality metrics like faithfulness, relevance, context precision,
|
|
26
|
+
# and answer correctness. Think Ragas or DeepEval for Ruby.
|
|
27
|
+
#
|
|
28
|
+
# @example Quick evaluation
|
|
29
|
+
# result = EvalRuby.evaluate(
|
|
30
|
+
# question: "What is Ruby?",
|
|
31
|
+
# answer: "A programming language",
|
|
32
|
+
# context: ["Ruby is a dynamic, open source programming language."],
|
|
33
|
+
# ground_truth: "Ruby is a programming language created by Matz."
|
|
34
|
+
# )
|
|
35
|
+
# puts result.faithfulness # => 0.95
|
|
36
|
+
# puts result.overall # => 0.87
|
|
37
|
+
#
|
|
38
|
+
# @example Retrieval evaluation
|
|
39
|
+
# result = EvalRuby.evaluate_retrieval(
|
|
40
|
+
# question: "What is Ruby?",
|
|
41
|
+
# retrieved: ["doc_a", "doc_b", "doc_c"],
|
|
42
|
+
# relevant: ["doc_a", "doc_c"]
|
|
43
|
+
# )
|
|
44
|
+
# puts result.precision_at_k(3) # => 0.67
|
|
24
45
|
module EvalRuby
|
|
25
46
|
class Error < StandardError; end
|
|
26
47
|
class APIError < Error; end
|
|
@@ -28,18 +49,33 @@ module EvalRuby
|
|
|
28
49
|
class InvalidResponseError < Error; end
|
|
29
50
|
|
|
30
51
|
class << self
|
|
52
|
+
# @return [Configuration] the current configuration
|
|
31
53
|
def configuration
|
|
32
54
|
@configuration ||= Configuration.new
|
|
33
55
|
end
|
|
34
56
|
|
|
57
|
+
# Yields the configuration for modification.
|
|
58
|
+
#
|
|
59
|
+
# @yieldparam config [Configuration]
|
|
60
|
+
# @return [void]
|
|
35
61
|
def configure
|
|
36
62
|
yield(configuration)
|
|
37
63
|
end
|
|
38
64
|
|
|
65
|
+
# Resets configuration to defaults.
|
|
66
|
+
#
|
|
67
|
+
# @return [Configuration]
|
|
39
68
|
def reset_configuration!
|
|
40
69
|
@configuration = Configuration.new
|
|
41
70
|
end
|
|
42
71
|
|
|
72
|
+
# Evaluates an LLM response across multiple quality metrics.
|
|
73
|
+
#
|
|
74
|
+
# @param question [String] the input question
|
|
75
|
+
# @param answer [String] the LLM-generated answer
|
|
76
|
+
# @param context [Array<String>] retrieved context chunks
|
|
77
|
+
# @param ground_truth [String, nil] expected correct answer
|
|
78
|
+
# @return [Result]
|
|
43
79
|
def evaluate(question:, answer:, context: [], ground_truth: nil)
|
|
44
80
|
Evaluator.new.evaluate(
|
|
45
81
|
question: question,
|
|
@@ -49,6 +85,12 @@ module EvalRuby
|
|
|
49
85
|
)
|
|
50
86
|
end
|
|
51
87
|
|
|
88
|
+
# Evaluates retrieval quality using IR metrics.
|
|
89
|
+
#
|
|
90
|
+
# @param question [String] the input question
|
|
91
|
+
# @param retrieved [Array<String>] retrieved document IDs
|
|
92
|
+
# @param relevant [Array<String>] ground-truth relevant document IDs
|
|
93
|
+
# @return [RetrievalResult]
|
|
52
94
|
def evaluate_retrieval(question:, retrieved:, relevant:)
|
|
53
95
|
Evaluator.new.evaluate_retrieval(
|
|
54
96
|
question: question,
|
|
@@ -57,6 +99,11 @@ module EvalRuby
|
|
|
57
99
|
)
|
|
58
100
|
end
|
|
59
101
|
|
|
102
|
+
# Evaluates a batch of samples, optionally running them through a pipeline.
|
|
103
|
+
#
|
|
104
|
+
# @param dataset [Dataset, Array<Hash>] samples to evaluate
|
|
105
|
+
# @param pipeline [#query, nil] optional RAG pipeline to run queries through
|
|
106
|
+
# @return [Report]
|
|
60
107
|
def evaluate_batch(dataset, pipeline: nil)
|
|
61
108
|
samples = dataset.is_a?(Dataset) ? dataset.samples : dataset
|
|
62
109
|
evaluator = Evaluator.new
|
|
@@ -79,6 +126,11 @@ module EvalRuby
|
|
|
79
126
|
Report.new(results: results, samples: samples, duration: Time.now - start_time)
|
|
80
127
|
end
|
|
81
128
|
|
|
129
|
+
# Compares two evaluation reports with statistical significance testing.
|
|
130
|
+
#
|
|
131
|
+
# @param report_a [Report] baseline report
|
|
132
|
+
# @param report_b [Report] comparison report
|
|
133
|
+
# @return [Comparison]
|
|
82
134
|
def compare(report_a, report_b)
|
|
83
135
|
Comparison.new(report_a, report_b)
|
|
84
136
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: eval-ruby
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.2.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Johannes Dwi Cahyo
|
|
@@ -75,6 +75,7 @@ files:
|
|
|
75
75
|
- Gemfile
|
|
76
76
|
- Gemfile.lock
|
|
77
77
|
- LICENSE
|
|
78
|
+
- MILESTONES.md
|
|
78
79
|
- README.md
|
|
79
80
|
- Rakefile
|
|
80
81
|
- eval-ruby.gemspec
|