eval-ruby 0.1.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/eval_ruby.rb CHANGED
@@ -5,12 +5,15 @@ require_relative "eval_ruby/configuration"
5
5
  require_relative "eval_ruby/judges/base"
6
6
  require_relative "eval_ruby/judges/openai"
7
7
  require_relative "eval_ruby/judges/anthropic"
8
+ require_relative "eval_ruby/embedders/base"
9
+ require_relative "eval_ruby/embedders/openai"
8
10
  require_relative "eval_ruby/metrics/base"
9
11
  require_relative "eval_ruby/metrics/faithfulness"
10
12
  require_relative "eval_ruby/metrics/relevance"
11
13
  require_relative "eval_ruby/metrics/correctness"
12
14
  require_relative "eval_ruby/metrics/context_precision"
13
15
  require_relative "eval_ruby/metrics/context_recall"
16
+ require_relative "eval_ruby/metrics/semantic_similarity"
14
17
  require_relative "eval_ruby/metrics/precision_at_k"
15
18
  require_relative "eval_ruby/metrics/recall_at_k"
16
19
  require_relative "eval_ruby/metrics/mrr"
@@ -21,25 +24,72 @@ require_relative "eval_ruby/report"
21
24
  require_relative "eval_ruby/dataset"
22
25
  require_relative "eval_ruby/comparison"
23
26
 
27
+ # Evaluation framework for LLM and RAG applications.
28
+ # Measures quality metrics like faithfulness, relevance, context precision,
29
+ # and answer correctness. Think Ragas or DeepEval for Ruby.
30
+ #
31
+ # @example Quick evaluation
32
+ # result = EvalRuby.evaluate(
33
+ # question: "What is Ruby?",
34
+ # answer: "A programming language",
35
+ # context: ["Ruby is a dynamic, open source programming language."],
36
+ # ground_truth: "Ruby is a programming language created by Matz."
37
+ # )
38
+ # puts result.faithfulness # => 0.95
39
+ # puts result.overall # => 0.87
40
+ #
41
+ # @example Retrieval evaluation
42
+ # result = EvalRuby.evaluate_retrieval(
43
+ # question: "What is Ruby?",
44
+ # retrieved: ["doc_a", "doc_b", "doc_c"],
45
+ # relevant: ["doc_a", "doc_c"]
46
+ # )
47
+ # puts result.precision_at_k(3) # => 0.67
24
48
  module EvalRuby
25
49
  class Error < StandardError; end
26
50
  class APIError < Error; end
27
51
  class TimeoutError < Error; end
28
52
  class InvalidResponseError < Error; end
29
53
 
54
+ # Progress snapshot yielded to the block passed to {.evaluate_batch}.
55
+ # @!attribute current [Integer] number of samples completed so far (starts at 1)
56
+ # @!attribute total [Integer] total samples in the batch
57
+ # @!attribute elapsed [Float] seconds since batch started
58
+ Progress = Struct.new(:current, :total, :elapsed, keyword_init: true) do
59
+ # @return [Float] completion percentage, 0.0–100.0
60
+ def percent
61
+ total.zero? ? 0.0 : (current.to_f / total * 100).round(2)
62
+ end
63
+ end
64
+
30
65
  class << self
66
+ # @return [Configuration] the current configuration
31
67
  def configuration
32
68
  @configuration ||= Configuration.new
33
69
  end
34
70
 
71
+ # Yields the configuration for modification.
72
+ #
73
+ # @yieldparam config [Configuration]
74
+ # @return [void]
35
75
  def configure
36
76
  yield(configuration)
37
77
  end
38
78
 
79
+ # Resets configuration to defaults.
80
+ #
81
+ # @return [Configuration]
39
82
  def reset_configuration!
40
83
  @configuration = Configuration.new
41
84
  end
42
85
 
86
+ # Evaluates an LLM response across multiple quality metrics.
87
+ #
88
+ # @param question [String] the input question
89
+ # @param answer [String] the LLM-generated answer
90
+ # @param context [Array<String>] retrieved context chunks
91
+ # @param ground_truth [String, nil] expected correct answer
92
+ # @return [Result]
43
93
  def evaluate(question:, answer:, context: [], ground_truth: nil)
44
94
  Evaluator.new.evaluate(
45
95
  question: question,
@@ -49,6 +99,12 @@ module EvalRuby
49
99
  )
50
100
  end
51
101
 
102
+ # Evaluates retrieval quality using information-retrieval (IR) metrics.
103
+ #
104
+ # @param question [String] the input question
105
+ # @param retrieved [Array<String>] retrieved document IDs
106
+ # @param relevant [Array<String>] ground-truth relevant document IDs
107
+ # @return [RetrievalResult]
52
108
  def evaluate_retrieval(question:, retrieved:, relevant:)
53
109
  Evaluator.new.evaluate_retrieval(
54
110
  question: question,
@@ -57,13 +113,28 @@ module EvalRuby
57
113
  )
58
114
  end
59
115
 
60
- def evaluate_batch(dataset, pipeline: nil)
116
+ # Evaluates a batch of samples, optionally running them through a pipeline.
117
+ #
118
+ # If a block is given, it is called after each sample with a {Progress}
119
+ # snapshot, useful for rendering progress bars or writing incremental logs.
120
+ #
121
+ # @param dataset [Dataset, Array<Hash>] samples to evaluate
122
+ # @param pipeline [#query, nil] optional RAG pipeline to run queries through
123
+ # @yieldparam progress [Progress] progress snapshot after each sample
124
+ # @return [Report]
125
+ #
126
+ # @example With progress callback
127
+ # EvalRuby.evaluate_batch(dataset) do |progress|
128
+ # puts "#{progress.current}/#{progress.total} (#{progress.percent}%)"
129
+ # end
130
+ def evaluate_batch(dataset, pipeline: nil, &progress_block)
61
131
  samples = dataset.is_a?(Dataset) ? dataset.samples : dataset
62
132
  evaluator = Evaluator.new
63
133
  start_time = Time.now
134
+ total = samples.size
64
135
 
65
- results = samples.map do |sample|
66
- if pipeline
136
+ results = samples.each_with_index.map do |sample, i|
137
+ result = if pipeline
67
138
  response = pipeline.query(sample[:question])
68
139
  evaluator.evaluate(
69
140
  question: sample[:question],
@@ -74,11 +145,24 @@ module EvalRuby
74
145
  else
75
146
  evaluator.evaluate(**sample.slice(:question, :answer, :context, :ground_truth))
76
147
  end
148
+
149
+ progress_block&.call(Progress.new(
150
+ current: i + 1,
151
+ total: total,
152
+ elapsed: Time.now - start_time
153
+ ))
154
+
155
+ result
77
156
  end
78
157
 
79
158
  Report.new(results: results, samples: samples, duration: Time.now - start_time)
80
159
  end
81
160
 
161
+ # Compares two evaluation reports with statistical significance testing.
162
+ #
163
+ # @param report_a [Report] baseline report
164
+ # @param report_b [Report] comparison report
165
+ # @return [Comparison]
82
166
  def compare(report_a, report_b)
83
167
  Comparison.new(report_a, report_b)
84
168
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: eval-ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Johannes Dwi Cahyo
@@ -72,9 +72,11 @@ executables: []
72
72
  extensions: []
73
73
  extra_rdoc_files: []
74
74
  files:
75
+ - CHANGELOG.md
75
76
  - Gemfile
76
77
  - Gemfile.lock
77
78
  - LICENSE
79
+ - MILESTONES.md
78
80
  - README.md
79
81
  - Rakefile
80
82
  - eval-ruby.gemspec
@@ -82,6 +84,8 @@ files:
82
84
  - lib/eval_ruby/comparison.rb
83
85
  - lib/eval_ruby/configuration.rb
84
86
  - lib/eval_ruby/dataset.rb
87
+ - lib/eval_ruby/embedders/base.rb
88
+ - lib/eval_ruby/embedders/openai.rb
85
89
  - lib/eval_ruby/evaluator.rb
86
90
  - lib/eval_ruby/judges/anthropic.rb
87
91
  - lib/eval_ruby/judges/base.rb
@@ -96,6 +100,7 @@ files:
96
100
  - lib/eval_ruby/metrics/precision_at_k.rb
97
101
  - lib/eval_ruby/metrics/recall_at_k.rb
98
102
  - lib/eval_ruby/metrics/relevance.rb
103
+ - lib/eval_ruby/metrics/semantic_similarity.rb
99
104
  - lib/eval_ruby/minitest.rb
100
105
  - lib/eval_ruby/report.rb
101
106
  - lib/eval_ruby/result.rb