eval-ruby 0.1.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +60 -0
- data/Gemfile.lock +2 -2
- data/MILESTONES.md +13 -0
- data/README.md +39 -0
- data/lib/eval_ruby/comparison.rb +18 -1
- data/lib/eval_ruby/configuration.rb +37 -2
- data/lib/eval_ruby/dataset.rb +118 -13
- data/lib/eval_ruby/embedders/base.rb +29 -0
- data/lib/eval_ruby/embedders/openai.rb +83 -0
- data/lib/eval_ruby/evaluator.rb +36 -0
- data/lib/eval_ruby/judges/anthropic.rb +8 -0
- data/lib/eval_ruby/judges/base.rb +11 -0
- data/lib/eval_ruby/judges/openai.rb +8 -0
- data/lib/eval_ruby/metrics/base.rb +8 -0
- data/lib/eval_ruby/metrics/context_precision.rb +10 -0
- data/lib/eval_ruby/metrics/context_recall.rb +10 -0
- data/lib/eval_ruby/metrics/correctness.rb +13 -0
- data/lib/eval_ruby/metrics/faithfulness.rb +10 -0
- data/lib/eval_ruby/metrics/mrr.rb +8 -0
- data/lib/eval_ruby/metrics/ndcg.rb +10 -0
- data/lib/eval_ruby/metrics/precision_at_k.rb +9 -0
- data/lib/eval_ruby/metrics/recall_at_k.rb +9 -0
- data/lib/eval_ruby/metrics/relevance.rb +10 -0
- data/lib/eval_ruby/metrics/semantic_similarity.rb +72 -0
- data/lib/eval_ruby/report.rb +38 -1
- data/lib/eval_ruby/result.rb +29 -1
- data/lib/eval_ruby/rspec.rb +48 -6
- data/lib/eval_ruby/version.rb +1 -1
- data/lib/eval_ruby.rb +87 -3
- metadata +6 -1
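The headline additions in 0.3.0 are embedders, a semantic-similarity metric, batch progress reporting, and report comparison. For orientation, here is a quick-start sketch of the single-sample API, assembled from the @example docs added in lib/eval_ruby.rb below; it assumes a judge has already been configured via EvalRuby.configure, and the printed scores are illustrative:

require "eval_ruby"

# Judge a generated answer against its retrieved context and ground truth.
result = EvalRuby.evaluate(
  question: "What is Ruby?",
  answer: "A programming language",
  context: ["Ruby is a dynamic, open source programming language."],
  ground_truth: "Ruby is a programming language created by Matz."
)
puts result.faithfulness # => 0.95
puts result.overall      # => 0.87

# Score retrieval quality with classic IR metrics.
retrieval = EvalRuby.evaluate_retrieval(
  question: "What is Ruby?",
  retrieved: ["doc_a", "doc_b", "doc_c"],
  relevant: ["doc_a", "doc_c"]
)
puts retrieval.precision_at_k(3) # => 0.67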
data/lib/eval_ruby.rb
CHANGED

@@ -5,12 +5,15 @@ require_relative "eval_ruby/configuration"
 require_relative "eval_ruby/judges/base"
 require_relative "eval_ruby/judges/openai"
 require_relative "eval_ruby/judges/anthropic"
+require_relative "eval_ruby/embedders/base"
+require_relative "eval_ruby/embedders/openai"
 require_relative "eval_ruby/metrics/base"
 require_relative "eval_ruby/metrics/faithfulness"
 require_relative "eval_ruby/metrics/relevance"
 require_relative "eval_ruby/metrics/correctness"
 require_relative "eval_ruby/metrics/context_precision"
 require_relative "eval_ruby/metrics/context_recall"
+require_relative "eval_ruby/metrics/semantic_similarity"
 require_relative "eval_ruby/metrics/precision_at_k"
 require_relative "eval_ruby/metrics/recall_at_k"
 require_relative "eval_ruby/metrics/mrr"
@@ -21,25 +24,72 @@ require_relative "eval_ruby/report"
 require_relative "eval_ruby/dataset"
 require_relative "eval_ruby/comparison"
 
+# Evaluation framework for LLM and RAG applications.
+# Measures quality metrics like faithfulness, relevance, context precision,
+# and answer correctness. Think Ragas or DeepEval for Ruby.
+#
+# @example Quick evaluation
+#   result = EvalRuby.evaluate(
+#     question: "What is Ruby?",
+#     answer: "A programming language",
+#     context: ["Ruby is a dynamic, open source programming language."],
+#     ground_truth: "Ruby is a programming language created by Matz."
+#   )
+#   puts result.faithfulness # => 0.95
+#   puts result.overall # => 0.87
+#
+# @example Retrieval evaluation
+#   result = EvalRuby.evaluate_retrieval(
+#     question: "What is Ruby?",
+#     retrieved: ["doc_a", "doc_b", "doc_c"],
+#     relevant: ["doc_a", "doc_c"]
+#   )
+#   puts result.precision_at_k(3) # => 0.67
 module EvalRuby
   class Error < StandardError; end
   class APIError < Error; end
   class TimeoutError < Error; end
   class InvalidResponseError < Error; end
 
+  # Progress snapshot yielded to the block passed to {.evaluate_batch}.
+  # @!attribute current [Integer] number of samples completed (1-indexed)
+  # @!attribute total [Integer] total samples in the batch
+  # @!attribute elapsed [Float] seconds since batch started
+  Progress = Struct.new(:current, :total, :elapsed, keyword_init: true) do
+    # @return [Float] completion percentage, 0.0–100.0
+    def percent
+      total.zero? ? 0.0 : (current.to_f / total * 100).round(2)
+    end
+  end
+
   class << self
+    # @return [Configuration] the current configuration
     def configuration
       @configuration ||= Configuration.new
     end
 
+    # Yields the configuration for modification.
+    #
+    # @yieldparam config [Configuration]
+    # @return [void]
    def configure
      yield(configuration)
    end
 
+    # Resets configuration to defaults.
+    #
+    # @return [Configuration]
     def reset_configuration!
       @configuration = Configuration.new
     end
 
+    # Evaluates an LLM response across multiple quality metrics.
+    #
+    # @param question [String] the input question
+    # @param answer [String] the LLM-generated answer
+    # @param context [Array<String>] retrieved context chunks
+    # @param ground_truth [String, nil] expected correct answer
+    # @return [Result]
     def evaluate(question:, answer:, context: [], ground_truth: nil)
       Evaluator.new.evaluate(
         question: question,
@@ -49,6 +99,12 @@ module EvalRuby
       )
     end
 
+    # Evaluates retrieval quality using IR metrics.
+    #
+    # @param question [String] the input question
+    # @param retrieved [Array<String>] retrieved document IDs
+    # @param relevant [Array<String>] ground-truth relevant document IDs
+    # @return [RetrievalResult]
     def evaluate_retrieval(question:, retrieved:, relevant:)
       Evaluator.new.evaluate_retrieval(
         question: question,
@@ -57,13 +113,28 @@ module EvalRuby
       )
     end
 
-    def evaluate_batch(dataset, pipeline: nil)
+    # Evaluates a batch of samples, optionally running them through a pipeline.
+    #
+    # If a block is given, it is called after each sample with a {Progress}
+    # snapshot, useful for rendering progress bars or writing incremental logs.
+    #
+    # @param dataset [Dataset, Array<Hash>] samples to evaluate
+    # @param pipeline [#query, nil] optional RAG pipeline to run queries through
+    # @yieldparam progress [Progress] progress snapshot after each sample
+    # @return [Report]
+    #
+    # @example With progress callback
+    #   EvalRuby.evaluate_batch(dataset) do |progress|
+    #     puts "#{progress.current}/#{progress.total} (#{progress.percent}%)"
+    #   end
+    def evaluate_batch(dataset, pipeline: nil, &progress_block)
       samples = dataset.is_a?(Dataset) ? dataset.samples : dataset
       evaluator = Evaluator.new
       start_time = Time.now
+      total = samples.size
 
-      results = samples.map do |sample|
-        if pipeline
+      results = samples.each_with_index.map do |sample, i|
+        result = if pipeline
           response = pipeline.query(sample[:question])
           evaluator.evaluate(
             question: sample[:question],
@@ -74,11 +145,24 @@ module EvalRuby
         else
           evaluator.evaluate(**sample.slice(:question, :answer, :context, :ground_truth))
         end
+
+        progress_block&.call(Progress.new(
+          current: i + 1,
+          total: total,
+          elapsed: Time.now - start_time
+        ))
+
+        result
       end
 
       Report.new(results: results, samples: samples, duration: Time.now - start_time)
     end
 
+    # Compares two evaluation reports with statistical significance testing.
+    #
+    # @param report_a [Report] baseline report
+    # @param report_b [Report] comparison report
+    # @return [Comparison]
     def compare(report_a, report_b)
       Comparison.new(report_a, report_b)
     end
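The batch API now yields a Progress struct after each completed sample, and finished reports can be compared across runs. A usage sketch based on the doc comments above; the sample data is illustrative, and baseline_report is a placeholder for a Report produced by an earlier evaluate_batch call:

samples = [
  {
    question: "What is Ruby?",
    answer: "A programming language",
    context: ["Ruby is a dynamic, open source programming language."],
    ground_truth: "Ruby is a programming language created by Matz."
  }
]

# The block receives a Progress struct (current, total, elapsed, percent)
# after every completed sample.
report = EvalRuby.evaluate_batch(samples) do |progress|
  puts "#{progress.current}/#{progress.total} (#{progress.percent}%), #{progress.elapsed.round(1)}s elapsed"
end

# Compare against a previously saved run with significance testing.
comparison = EvalRuby.compare(baseline_report, report)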
metadata
CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: eval-ruby
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.3.0
 platform: ruby
 authors:
 - Johannes Dwi Cahyo
@@ -72,9 +72,11 @@ executables: []
 extensions: []
 extra_rdoc_files: []
 files:
+- CHANGELOG.md
 - Gemfile
 - Gemfile.lock
 - LICENSE
+- MILESTONES.md
 - README.md
 - Rakefile
 - eval-ruby.gemspec
@@ -82,6 +84,8 @@ files:
 - lib/eval_ruby/comparison.rb
 - lib/eval_ruby/configuration.rb
 - lib/eval_ruby/dataset.rb
+- lib/eval_ruby/embedders/base.rb
+- lib/eval_ruby/embedders/openai.rb
 - lib/eval_ruby/evaluator.rb
 - lib/eval_ruby/judges/anthropic.rb
 - lib/eval_ruby/judges/base.rb
@@ -96,6 +100,7 @@ files:
 - lib/eval_ruby/metrics/precision_at_k.rb
 - lib/eval_ruby/metrics/recall_at_k.rb
 - lib/eval_ruby/metrics/relevance.rb
+- lib/eval_ruby/metrics/semantic_similarity.rb
 - lib/eval_ruby/minitest.rb
 - lib/eval_ruby/report.rb
 - lib/eval_ruby/result.rb