eval-ruby 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 59b7bd64cf696d82a27cb6330ab948306904f444a8a64476f702ee1937bbccab
4
- data.tar.gz: f9c0c234f0712d37d309d460d1c3204d3e6f70bfbd8fe4dcff95ac508bcb7f34
3
+ metadata.gz: 69f4642cd2505ab6b0b54f36cd76fea2043f76306c425fa69e29027e4d0dc901
4
+ data.tar.gz: 70125b286a374c01af966a0a044edb9b0cbca02fa9acd6b944f2b0825ad60981
5
5
  SHA512:
6
- metadata.gz: 6a9f0c12a790b0098ba639bc236c6080c64c7fbb4ad9892c36810e93b88486bebaa42dca9245beaf828beca29e31793030d62ab232cad50107bc99905635a069
7
- data.tar.gz: 7c922b6fd8743d5241a254baf301aae2af6aba936fdeedbcb467f9549a8f16493c7571ec9724a5460a75228e415cbcaa852b915ef78f23aa8f1fcbddb85d9450
6
+ metadata.gz: 7ac13b5d60996d964a948b92bec466d3594c614e5a0bc3e138bfba31dda0dcca19876d01e2826ed1522962e7117b2eb69c7e3ea46567e4a09920f77f2031d09c
7
+ data.tar.gz: ca13c6f768516a2f21188c59511f045bd12c9fd4fe51819af705016c1282256b6765805a007265dbb3e299430c19b8fe80e1a29c22038b930b5799ab7d3a8355
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- eval-ruby (0.1.1)
4
+ eval-ruby (0.2.0)
5
5
  csv
6
6
 
7
7
  GEM
@@ -39,7 +39,7 @@ CHECKSUMS
39
39
  bigdecimal (4.0.1) sha256=8b07d3d065a9f921c80ceaea7c9d4ae596697295b584c296fe599dd0ad01c4a7
40
40
  crack (1.0.1) sha256=ff4a10390cd31d66440b7524eb1841874db86201d5b70032028553130b6d4c7e
41
41
  csv (3.3.5) sha256=6e5134ac3383ef728b7f02725d9872934f523cb40b961479f69cf3afa6c8e73f
42
- eval-ruby (0.1.1)
42
+ eval-ruby (0.2.0)
43
43
  hashdiff (1.2.1) sha256=9c079dbc513dfc8833ab59c0c2d8f230fa28499cc5efb4b8dd276cf931457cd1
44
44
  minitest (5.27.0) sha256=2d3b17f8a36fe7801c1adcffdbc38233b938eb0b4966e97a6739055a45fa77d5
45
45
  public_suffix (7.0.5) sha256=1a8bb08f1bbea19228d3bed6e5ed908d1cb4f7c2726d18bd9cadf60bc676f623
data/MILESTONES.md ADDED
@@ -0,0 +1,13 @@
1
+ # Milestones
2
+
3
+ ## v0.1.1 (2026-03-10)
4
+
5
+ ### Changes
6
+ - Retry logic
7
+ - API key validation
8
+ - String context fix
9
+ - Standard deviation fix
10
+ - Error subclasses
11
+
12
+ ## v0.1.0 (Initial release)
13
+ - Initial release
@@ -1,14 +1,27 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module EvalRuby
4
+ # Statistical comparison of two evaluation reports using paired t-tests.
5
+ #
6
+ # @example
7
+ # comparison = EvalRuby.compare(report_a, report_b)
8
+ # puts comparison.summary
9
+ # comparison.significant_improvements # => [:faithfulness]
4
10
  class Comparison
5
- attr_reader :report_a, :report_b
11
+ # @return [Report] baseline report
12
+ attr_reader :report_a
6
13
 
14
+ # @return [Report] comparison report
15
+ attr_reader :report_b
16
+
17
+ # @param report_a [Report] baseline
18
+ # @param report_b [Report] comparison
7
19
  def initialize(report_a, report_b)
8
20
  @report_a = report_a
9
21
  @report_b = report_b
10
22
  end
11
23
 
24
+ # @return [String] formatted comparison table with deltas and p-values
12
25
  def summary
13
26
  lines = [
14
27
  format("%-20s | %-10s | %-10s | %-8s | %s", "Metric", "A", "B", "Delta", "p-value"),
@@ -35,6 +48,10 @@ module EvalRuby
35
48
  lines.join("\n")
36
49
  end
37
50
 
51
+ # Returns metrics where report_b is significantly better than report_a.
52
+ #
53
+ # @param alpha [Float] significance level (default 0.05)
54
+ # @return [Array<Symbol>] metric names with significant improvements
38
55
  def significant_improvements(alpha: 0.05)
39
56
  all_metrics.select do |metric|
40
57
  scores_a = @report_a.results.filter_map { |r| r.scores[metric] }
@@ -1,9 +1,32 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module EvalRuby
4
+ # Global configuration for EvalRuby.
5
+ #
6
+ # @example
7
+ # EvalRuby.configure do |config|
8
+ # config.judge_llm = :openai
9
+ # config.api_key = ENV["OPENAI_API_KEY"]
10
+ # config.judge_model = "gpt-4o"
11
+ # end
4
12
  class Configuration
5
- attr_accessor :judge_llm, :judge_model, :api_key, :default_threshold,
6
- :timeout, :max_retries
13
+ # @return [Symbol] LLM provider for judge (:openai or :anthropic)
14
+ attr_accessor :judge_llm
15
+
16
+ # @return [String] model name for the judge LLM
17
+ attr_accessor :judge_model
18
+
19
+ # @return [String, nil] API key for the judge LLM provider
20
+ attr_accessor :api_key
21
+
22
+ # @return [Float] default threshold for pass/fail decisions
23
+ attr_accessor :default_threshold
24
+
25
+ # @return [Integer] HTTP request timeout in seconds
26
+ attr_accessor :timeout
27
+
28
+ # @return [Integer] maximum number of retries on transient failures
29
+ attr_accessor :max_retries
7
30
 
8
31
  def initialize
9
32
  @judge_llm = :openai
@@ -4,16 +4,36 @@ require "csv"
4
4
  require "json"
5
5
 
6
6
  module EvalRuby
7
+ # Collection of evaluation samples with import/export support.
8
+ # Supports CSV, JSON, and programmatic construction.
9
+ #
10
+ # @example
11
+ # dataset = EvalRuby::Dataset.new("my_test_set")
12
+ # dataset.add(question: "What is Ruby?", answer: "A language", ground_truth: "A language")
13
+ # report = EvalRuby.evaluate_batch(dataset)
7
14
  class Dataset
8
15
  include Enumerable
9
16
 
10
- attr_reader :name, :samples
17
+ # @return [String] dataset name
18
+ attr_reader :name
11
19
 
20
+ # @return [Array<Hash>] sample entries
21
+ attr_reader :samples
22
+
23
+ # @param name [String] dataset name
12
24
  def initialize(name = "default")
13
25
  @name = name
14
26
  @samples = []
15
27
  end
16
28
 
29
+ # Adds a sample to the dataset.
30
+ #
31
+ # @param question [String]
32
+ # @param ground_truth [String, nil]
33
+ # @param relevant_contexts [Array<String>] alias for context
34
+ # @param answer [String, nil]
35
+ # @param context [Array<String>]
36
+ # @return [self]
17
37
  def add(question:, ground_truth: nil, relevant_contexts: [], answer: nil, context: [])
18
38
  @samples << {
19
39
  question: question,
@@ -24,18 +44,26 @@ module EvalRuby
24
44
  self
25
45
  end
26
46
 
47
+ # @yield [Hash] each sample
27
48
  def each(&block)
28
49
  @samples.each(&block)
29
50
  end
30
51
 
52
+ # @return [Integer] number of samples
31
53
  def size
32
54
  @samples.size
33
55
  end
34
56
 
57
+ # @param index [Integer]
58
+ # @return [Hash] sample at index
35
59
  def [](index)
36
60
  @samples[index]
37
61
  end
38
62
 
63
+ # Loads a dataset from a CSV file.
64
+ #
65
+ # @param path [String] path to CSV file
66
+ # @return [Dataset]
39
67
  def self.from_csv(path)
40
68
  dataset = new(File.basename(path, ".*"))
41
69
  CSV.foreach(path, headers: true) do |row|
@@ -49,6 +77,10 @@ module EvalRuby
49
77
  dataset
50
78
  end
51
79
 
80
+ # Loads a dataset from a JSON file.
81
+ #
82
+ # @param path [String] path to JSON file
83
+ # @return [Dataset]
52
84
  def self.from_json(path)
53
85
  dataset = new(File.basename(path, ".*"))
54
86
  data = JSON.parse(File.read(path))
@@ -64,6 +96,10 @@ module EvalRuby
64
96
  dataset
65
97
  end
66
98
 
99
+ # Exports dataset to CSV.
100
+ #
101
+ # @param path [String] output file path
102
+ # @return [void]
67
103
  def to_csv(path)
68
104
  CSV.open(path, "w") do |csv|
69
105
  csv << %w[question answer context ground_truth]
@@ -78,10 +114,20 @@ module EvalRuby
78
114
  end
79
115
  end
80
116
 
117
+ # Exports dataset to JSON.
118
+ #
119
+ # @param path [String] output file path
120
+ # @return [void]
81
121
  def to_json(path)
82
122
  File.write(path, JSON.pretty_generate({name: @name, samples: @samples}))
83
123
  end
84
124
 
125
+ # Generates a dataset from documents using an LLM.
126
+ #
127
+ # @param documents [Array<String>] file paths to source documents
128
+ # @param questions_per_doc [Integer] number of QA pairs per document
129
+ # @param llm [Symbol] LLM provider (:openai or :anthropic)
130
+ # @return [Dataset]
85
131
  def self.generate(documents:, questions_per_doc: 5, llm: :openai)
86
132
  config = EvalRuby.configuration.dup
87
133
  config.judge_llm = llm
@@ -1,12 +1,25 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module EvalRuby
4
+ # Runs all configured metrics on a given question/answer/context tuple.
5
+ #
6
+ # @example
7
+ # evaluator = EvalRuby::Evaluator.new
8
+ # result = evaluator.evaluate(question: "...", answer: "...", context: [...])
4
9
  class Evaluator
10
+ # @param config [Configuration] configuration to use
5
11
  def initialize(config = EvalRuby.configuration)
6
12
  @config = config
7
13
  @judge = build_judge(config)
8
14
  end
9
15
 
16
+ # Evaluates an LLM response across quality metrics.
17
+ #
18
+ # @param question [String] the input question
19
+ # @param answer [String] the LLM-generated answer
20
+ # @param context [Array<String>] retrieved context chunks
21
+ # @param ground_truth [String, nil] expected correct answer
22
+ # @return [Result]
10
23
  def evaluate(question:, answer:, context: [], ground_truth: nil)
11
24
  scores = {}
12
25
  details = {}
@@ -37,6 +50,12 @@ module EvalRuby
37
50
  Result.new(scores: scores, details: details)
38
51
  end
39
52
 
53
+ # Evaluates retrieval quality using IR metrics.
54
+ #
55
+ # @param question [String] the input question
56
+ # @param retrieved [Array<String>] retrieved document IDs
57
+ # @param relevant [Array<String>] ground-truth relevant document IDs
58
+ # @return [RetrievalResult]
40
59
  def evaluate_retrieval(question:, retrieved:, relevant:)
41
60
  RetrievalResult.new(retrieved: retrieved, relevant: relevant)
42
61
  end
@@ -55,32 +74,49 @@ module EvalRuby
55
74
  end
56
75
  end
57
76
 
77
+ # Holds retrieval evaluation results with IR metric accessors.
78
+ #
79
+ # @example
80
+ # result = EvalRuby.evaluate_retrieval(question: "...", retrieved: [...], relevant: [...])
81
+ # result.precision_at_k(5) # => 0.6
82
+ # result.mrr # => 1.0
58
83
  class RetrievalResult
84
+ # @param retrieved [Array<String>] retrieved document IDs in ranked order
85
+ # @param relevant [Array<String>] ground-truth relevant document IDs
59
86
  def initialize(retrieved:, relevant:)
60
87
  @retrieved = retrieved
61
88
  @relevant = relevant
62
89
  end
63
90
 
91
+ # @param k [Integer] number of top results to consider
92
+ # @return [Float] precision at k
64
93
  def precision_at_k(k)
65
94
  Metrics::PrecisionAtK.new.call(retrieved: @retrieved, relevant: @relevant, k: k)
66
95
  end
67
96
 
97
+ # @param k [Integer] number of top results to consider
98
+ # @return [Float] recall at k
68
99
  def recall_at_k(k)
69
100
  Metrics::RecallAtK.new.call(retrieved: @retrieved, relevant: @relevant, k: k)
70
101
  end
71
102
 
103
+ # @return [Float] mean reciprocal rank
72
104
  def mrr
73
105
  Metrics::MRR.new.call(retrieved: @retrieved, relevant: @relevant)
74
106
  end
75
107
 
108
+ # @param k [Integer, nil] number of top results (nil for all)
109
+ # @return [Float] normalized discounted cumulative gain
76
110
  def ndcg(k: nil)
77
111
  Metrics::NDCG.new.call(retrieved: @retrieved, relevant: @relevant, k: k)
78
112
  end
79
113
 
114
+ # @return [Float] 1.0 if any relevant doc is retrieved, 0.0 otherwise
80
115
  def hit_rate
81
116
  @retrieved.any? { |doc| @relevant.include?(doc) } ? 1.0 : 0.0
82
117
  end
83
118
 
119
+ # @return [Hash{Symbol => Float}] all retrieval metrics
84
120
  def to_h
85
121
  {
86
122
  precision_at_k: precision_at_k(@retrieved.length),
@@ -6,14 +6,22 @@ require "uri"
6
6
 
7
7
  module EvalRuby
8
8
  module Judges
9
+ # Anthropic-based LLM judge using the Messages API.
10
+ # Requires an API key set via {Configuration#api_key}.
9
11
  class Anthropic < Base
10
12
  API_URL = "https://api.anthropic.com/v1/messages"
11
13
 
14
+ # @param config [Configuration]
15
+ # @raise [EvalRuby::Error] if API key is missing
12
16
  def initialize(config)
13
17
  super
14
18
  raise EvalRuby::Error, "API key is required. Set via EvalRuby.configure { |c| c.api_key = '...' }" if @config.api_key.nil? || @config.api_key.empty?
15
19
  end
16
20
 
21
+ # @param prompt [String] the evaluation prompt
22
+ # @return [Hash, nil] parsed JSON response
23
+ # @raise [EvalRuby::Error] on API errors
24
+ # @raise [EvalRuby::TimeoutError] after max retries
17
25
  def call(prompt)
18
26
  retries = 0
19
27
  begin
@@ -2,17 +2,28 @@
2
2
 
3
3
  module EvalRuby
4
4
  module Judges
5
+ # Abstract base class for LLM judges.
6
+ # Subclasses must implement {#call} to send prompts to an LLM and parse JSON responses.
5
7
  class Base
8
+ # @param config [Configuration]
6
9
  def initialize(config)
7
10
  @config = config
8
11
  end
9
12
 
13
+ # Sends a prompt to the LLM and returns parsed JSON.
14
+ #
15
+ # @param prompt [String] the evaluation prompt
16
+ # @return [Hash, nil] parsed JSON response
10
17
  def call(prompt)
11
18
  raise NotImplementedError, "#{self.class}#call must be implemented"
12
19
  end
13
20
 
14
21
  private
15
22
 
23
+ # Extracts and parses the first JSON object from text.
24
+ #
25
+ # @param text [String] raw LLM response text
26
+ # @return [Hash, nil] parsed JSON or nil if not found
16
27
  def parse_json_response(text)
17
28
  match = text.match(/\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}/m)
18
29
  return nil unless match
@@ -6,14 +6,22 @@ require "uri"
6
6
 
7
7
  module EvalRuby
8
8
  module Judges
9
+ # OpenAI-based LLM judge using the Chat Completions API.
10
+ # Requires an API key set via {Configuration#api_key}.
9
11
  class OpenAI < Base
10
12
  API_URL = "https://api.openai.com/v1/chat/completions"
11
13
 
14
+ # @param config [Configuration]
15
+ # @raise [EvalRuby::Error] if API key is missing
12
16
  def initialize(config)
13
17
  super
14
18
  raise EvalRuby::Error, "API key is required. Set via EvalRuby.configure { |c| c.api_key = '...' }" if @config.api_key.nil? || @config.api_key.empty?
15
19
  end
16
20
 
21
+ # @param prompt [String] the evaluation prompt
22
+ # @return [Hash, nil] parsed JSON response
23
+ # @raise [EvalRuby::Error] on API errors
24
+ # @raise [EvalRuby::TimeoutError] after max retries
17
25
  def call(prompt)
18
26
  retries = 0
19
27
  begin
@@ -2,13 +2,21 @@
2
2
 
3
3
  module EvalRuby
4
4
  module Metrics
5
+ # Abstract base class for all evaluation metrics.
6
+ # Subclasses must implement {#call}.
5
7
  class Base
8
+ # @return [EvalRuby::Judges::Base, nil] the LLM judge instance
6
9
  attr_reader :judge
7
10
 
11
+ # @param judge [EvalRuby::Judges::Base, nil] LLM judge for evaluation
8
12
  def initialize(judge: nil)
9
13
  @judge = judge
10
14
  end
11
15
 
16
+ # Evaluates the metric.
17
+ #
18
+ # @param kwargs [Hash] metric-specific keyword arguments
19
+ # @return [Hash{Symbol => Object}] must include :score and :details keys
12
20
  def call(**kwargs)
13
21
  raise NotImplementedError, "#{self.class}#call must be implemented"
14
22
  end
@@ -2,6 +2,13 @@
2
2
 
3
3
  module EvalRuby
4
4
  module Metrics
5
+ # Measures the proportion of retrieved contexts that are relevant to the question.
6
+ # Uses an LLM judge to evaluate each context's relevance.
7
+ #
8
+ # @example
9
+ # metric = ContextPrecision.new(judge: judge)
10
+ # result = metric.call(question: "What is Ruby?", context: ["Ruby is...", "Weather..."])
11
+ # result[:score] # => 0.5
5
12
  class ContextPrecision < Base
6
13
  PROMPT_TEMPLATE = <<~PROMPT
7
14
  Given the following question and a list of retrieved contexts, evaluate
@@ -19,6 +26,9 @@ module EvalRuby
19
26
  The score should be the proportion of relevant contexts (0.0 to 1.0).
20
27
  PROMPT
21
28
 
29
+ # @param question [String] the input question
30
+ # @param context [Array<String>, String] retrieved context chunks
31
+ # @return [Hash] :score (Float 0.0-1.0) and :details (:evaluations Array)
22
32
  def call(question:, context:, **_kwargs)
23
33
  contexts = context.is_a?(Array) ? context : [context.to_s]
24
34
  return {score: 0.0, details: {}} if contexts.empty?
@@ -2,6 +2,13 @@
2
2
 
3
3
  module EvalRuby
4
4
  module Metrics
5
+ # Measures whether retrieved contexts contain enough information to support the ground truth.
6
+ # Uses an LLM judge to check if each ground truth statement is attributable to context.
7
+ #
8
+ # @example
9
+ # metric = ContextRecall.new(judge: judge)
10
+ # result = metric.call(context: ["Ruby was created in 1995."], ground_truth: "Ruby was created in 1995.")
11
+ # result[:score] # => 1.0
5
12
  class ContextRecall < Base
6
13
  PROMPT_TEMPLATE = <<~PROMPT
7
14
  Given the following ground truth answer and retrieved contexts, evaluate
@@ -20,6 +27,9 @@ module EvalRuby
20
27
  The score should be the proportion of statements attributed to context (0.0 to 1.0).
21
28
  PROMPT
22
29
 
30
+ # @param context [Array<String>, String] retrieved context chunks
31
+ # @param ground_truth [String] expected correct answer
32
+ # @return [Hash] :score (Float 0.0-1.0) and :details (:statements Array)
23
33
  def call(context:, ground_truth:, **_kwargs)
24
34
  contexts = context.is_a?(Array) ? context : [context.to_s]
25
35
  return {score: 0.0, details: {}} if contexts.empty?
@@ -2,6 +2,16 @@
2
2
 
3
3
  module EvalRuby
4
4
  module Metrics
5
+ # Measures factual correctness of an answer against ground truth.
6
+ # Uses LLM judge when available, falls back to token overlap F1 score.
7
+ #
8
+ # @example With LLM judge
9
+ # metric = Correctness.new(judge: judge)
10
+ # result = metric.call(answer: "Paris", ground_truth: "Paris")
11
+ #
12
+ # @example Without judge (string similarity)
13
+ # metric = Correctness.new
14
+ # result = metric.call(answer: "The capital is Paris", ground_truth: "Paris is the capital")
5
15
  class Correctness < Base
6
16
  PROMPT_TEMPLATE = <<~PROMPT
7
17
  Given the following answer and ground truth, evaluate whether the answer
@@ -23,6 +33,9 @@ module EvalRuby
23
33
  Respond in JSON: {"reasoning": "...", "score": 0.0}
24
34
  PROMPT
25
35
 
36
+ # @param answer [String] the LLM-generated answer
37
+ # @param ground_truth [String] the expected correct answer
38
+ # @return [Hash] :score (Float 0.0-1.0) and :details
26
39
  def call(answer:, ground_truth:, **_kwargs)
27
40
  if judge
28
41
  llm_score(answer, ground_truth)
@@ -2,6 +2,13 @@
2
2
 
3
3
  module EvalRuby
4
4
  module Metrics
5
+ # Measures whether an answer is supported by the provided context.
6
+ # Uses an LLM judge to identify claims and check if each is supported.
7
+ #
8
+ # @example
9
+ # metric = Faithfulness.new(judge: judge)
10
+ # result = metric.call(answer: "Paris is in France", context: ["Paris is the capital of France."])
11
+ # result[:score] # => 1.0
5
12
  class Faithfulness < Base
6
13
  PROMPT_TEMPLATE = <<~PROMPT
7
14
  Given the following context and answer, evaluate whether the answer
@@ -25,6 +32,9 @@ module EvalRuby
25
32
  Respond in JSON: {"claims": [{"claim": "...", "supported": true}], "score": 0.0}
26
33
  PROMPT
27
34
 
35
+ # @param answer [String] the LLM-generated answer
36
+ # @param context [Array<String>, String] retrieved context chunks
37
+ # @return [Hash] :score (Float 0.0-1.0) and :details (:claims Array)
28
38
  def call(answer:, context:, **_kwargs)
29
39
  context_text = context.is_a?(Array) ? context.join("\n\n") : context.to_s
30
40
  prompt = format(PROMPT_TEMPLATE, context: context_text, answer: answer)
@@ -2,7 +2,15 @@
2
2
 
3
3
  module EvalRuby
4
4
  module Metrics
5
+ # Computes Mean Reciprocal Rank: 1/(position of first relevant document).
6
+ #
7
+ # @example
8
+ # MRR.new.call(retrieved: ["a", "b", "c"], relevant: ["b"])
9
+ # # => 0.5
5
10
  class MRR < Base
11
+ # @param retrieved [Array<String>] retrieved document IDs in ranked order
12
+ # @param relevant [Array<String>] ground-truth relevant document IDs
13
+ # @return [Float] reciprocal rank (0.0-1.0)
6
14
  def call(retrieved:, relevant:, **_kwargs)
7
15
  retrieved.each_with_index do |doc, i|
8
16
  return 1.0 / (i + 1) if relevant.include?(doc)
@@ -2,7 +2,17 @@
2
2
 
3
3
  module EvalRuby
4
4
  module Metrics
5
+ # Computes Normalized Discounted Cumulative Gain (NDCG).
6
+ # Measures ranking quality by comparing actual ranking to ideal ranking.
7
+ #
8
+ # @example
9
+ # NDCG.new.call(retrieved: ["a", "b", "c"], relevant: ["a", "c"])
10
+ # # => 0.863
5
11
  class NDCG < Base
12
+ # @param retrieved [Array<String>] retrieved document IDs in ranked order
13
+ # @param relevant [Array<String>] ground-truth relevant document IDs
14
+ # @param k [Integer, nil] number of top results (nil for all)
15
+ # @return [Float] NDCG score (0.0-1.0)
6
16
  def call(retrieved:, relevant:, k: nil, **_kwargs)
7
17
  k ||= retrieved.length
8
18
  top_k = retrieved.first(k)
@@ -2,7 +2,16 @@
2
2
 
3
3
  module EvalRuby
4
4
  module Metrics
5
+ # Computes Precision@K: the fraction of top-k retrieved documents that are relevant.
6
+ #
7
+ # @example
8
+ # PrecisionAtK.new.call(retrieved: ["a", "b", "c"], relevant: ["a", "c"], k: 3)
9
+ # # => 0.667
5
10
  class PrecisionAtK < Base
11
+ # @param retrieved [Array<String>] retrieved document IDs in ranked order
12
+ # @param relevant [Array<String>] ground-truth relevant document IDs
13
+ # @param k [Integer, nil] number of top results (nil for all)
14
+ # @return [Float] precision score (0.0-1.0)
6
15
  def call(retrieved:, relevant:, k: nil, **_kwargs)
7
16
  k ||= retrieved.length
8
17
  top_k = retrieved.first(k)
@@ -2,7 +2,16 @@
2
2
 
3
3
  module EvalRuby
4
4
  module Metrics
5
+ # Computes Recall@K: the fraction of relevant documents found in the top-k results.
6
+ #
7
+ # @example
8
+ # RecallAtK.new.call(retrieved: ["a", "b", "c"], relevant: ["a", "c"], k: 3)
9
+ # # => 1.0
5
10
  class RecallAtK < Base
11
+ # @param retrieved [Array<String>] retrieved document IDs in ranked order
12
+ # @param relevant [Array<String>] ground-truth relevant document IDs
13
+ # @param k [Integer, nil] number of top results (nil for all)
14
+ # @return [Float] recall score (0.0-1.0)
6
15
  def call(retrieved:, relevant:, k: nil, **_kwargs)
7
16
  return 0.0 if relevant.empty?
8
17
 
@@ -2,6 +2,13 @@
2
2
 
3
3
  module EvalRuby
4
4
  module Metrics
5
+ # Measures whether an answer is relevant to the question.
6
+ # Uses an LLM judge to evaluate relevance on a 0.0-1.0 scale.
7
+ #
8
+ # @example
9
+ # metric = Relevance.new(judge: judge)
10
+ # result = metric.call(question: "What is Ruby?", answer: "Ruby is a language.")
11
+ # result[:score] # => 0.95
5
12
  class Relevance < Base
6
13
  PROMPT_TEMPLATE = <<~PROMPT
7
14
  Given the following question and answer, evaluate whether the answer
@@ -21,6 +28,9 @@ module EvalRuby
21
28
  Respond in JSON: {"reasoning": "...", "score": 0.0}
22
29
  PROMPT
23
30
 
31
+ # @param question [String] the input question
32
+ # @param answer [String] the LLM-generated answer
33
+ # @return [Hash] :score (Float 0.0-1.0) and :details (:reasoning String)
24
34
  def call(question:, answer:, **_kwargs)
25
35
  prompt = format(PROMPT_TEMPLATE, question: question, answer: answer)
26
36
 
@@ -4,15 +4,33 @@ require "csv"
4
4
  require "json"
5
5
 
6
6
  module EvalRuby
7
+ # Aggregated evaluation report across multiple samples.
8
+ # Provides statistical summaries, filtering, and export functionality.
9
+ #
10
+ # @example
11
+ # report = EvalRuby.evaluate_batch(dataset)
12
+ # puts report.summary
13
+ # report.to_csv("results.csv")
7
14
  class Report
8
- attr_reader :results, :duration, :samples
15
+ # @return [Array<Result>] individual evaluation results
16
+ attr_reader :results
9
17
 
18
+ # @return [Float, nil] total evaluation duration in seconds
19
+ attr_reader :duration
20
+
21
+ # @return [Array<Hash>] original sample data
22
+ attr_reader :samples
23
+
24
+ # @param results [Array<Result>]
25
+ # @param samples [Array<Hash>]
26
+ # @param duration [Float, nil]
10
27
  def initialize(results:, samples: [], duration: nil)
11
28
  @results = results
12
29
  @samples = samples
13
30
  @duration = duration
14
31
  end
15
32
 
33
+ # @return [String] human-readable summary with mean and std for each metric
16
34
  def summary
17
35
  lines = []
18
36
  metric_stats.each do |metric, stats|
@@ -23,6 +41,9 @@ module EvalRuby
23
41
  lines.join("\n")
24
42
  end
25
43
 
44
+ # Computes per-metric statistics (mean, std, min, max).
45
+ #
46
+ # @return [Hash{Symbol => Hash}] metric name to stats hash
26
47
  def metric_stats
27
48
  return {} if @results.empty?
28
49
 
@@ -39,15 +60,27 @@ module EvalRuby
39
60
  end
40
61
  end
41
62
 
63
+ # Returns the n worst-scoring results.
64
+ #
65
+ # @param n [Integer] number of results to return
66
+ # @return [Array<Result>]
42
67
  def worst(n = 5)
43
68
  @results.sort_by { |r| r.overall || 0.0 }.first(n)
44
69
  end
45
70
 
71
+ # Returns results below the threshold.
72
+ #
73
+ # @param threshold [Float, nil] score threshold (defaults to config default_threshold)
74
+ # @return [Array<Result>]
46
75
  def failures(threshold: nil)
47
76
  threshold ||= EvalRuby.configuration.default_threshold
48
77
  @results.select { |r| (r.overall || 0.0) < threshold }
49
78
  end
50
79
 
80
+ # Exports results to CSV.
81
+ #
82
+ # @param path [String] output file path
83
+ # @return [void]
51
84
  def to_csv(path)
52
85
  return if @results.empty?
53
86
 
@@ -61,6 +94,10 @@ module EvalRuby
61
94
  end
62
95
  end
63
96
 
97
+ # Exports results to JSON.
98
+ #
99
+ # @param path [String] output file path
100
+ # @return [void]
64
101
  def to_json(path)
65
102
  data = @results.each_with_index.map do |result, i|
66
103
  {index: i, scores: result.scores, overall: result.overall, sample: @samples[i]}
@@ -1,20 +1,46 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module EvalRuby
4
+ # Holds evaluation scores and details for a single sample.
5
+ #
6
+ # @example
7
+ # result = EvalRuby.evaluate(question: "...", answer: "...", context: [...])
8
+ # result.faithfulness # => 0.95
9
+ # result.overall # => 0.87
4
10
  class Result
5
11
  METRICS = %i[faithfulness relevance correctness context_precision context_recall].freeze
6
12
 
7
- attr_reader :scores, :details
13
+ # @return [Hash{Symbol => Float}] metric name to score mapping
14
+ attr_reader :scores
8
15
 
16
+ # @return [Hash{Symbol => Hash}] metric name to details mapping
17
+ attr_reader :details
18
+
19
+ # @param scores [Hash{Symbol => Float}]
20
+ # @param details [Hash{Symbol => Hash}]
9
21
  def initialize(scores: {}, details: {})
10
22
  @scores = scores
11
23
  @details = details
12
24
  end
13
25
 
14
26
  METRICS.each do |metric|
27
+ # @!method faithfulness
28
+ # @return [Float, nil] faithfulness score
29
+ # @!method relevance
30
+ # @return [Float, nil] relevance score
31
+ # @!method correctness
32
+ # @return [Float, nil] correctness score
33
+ # @!method context_precision
34
+ # @return [Float, nil] context precision score
35
+ # @!method context_recall
36
+ # @return [Float, nil] context recall score
15
37
  define_method(metric) { @scores[metric] }
16
38
  end
17
39
 
40
+ # Computes a weighted average of all available scores.
41
+ #
42
+ # @param weights [Hash{Symbol => Float}, nil] custom weights per metric
43
+ # @return [Float, nil] weighted average score, or nil if no scores available
18
44
  def overall(weights: nil)
19
45
  weights ||= METRICS.each_with_object({}) { |m, h| h[m] = 1.0 }
20
46
  available = @scores.select { |k, v| weights.key?(k) && v }
@@ -24,10 +50,12 @@ module EvalRuby
24
50
  available.sum { |k, v| v * weights[k] } / total_weight
25
51
  end
26
52
 
53
+ # @return [Hash] scores plus overall
27
54
  def to_h
28
55
  @scores.merge(overall: overall)
29
56
  end
30
57
 
58
+ # @return [String] human-readable summary
31
59
  def to_s
32
60
  lines = @scores.map { |k, v| " #{k}: #{v&.round(4) || 'N/A'}" }
33
61
  lines << " overall: #{overall&.round(4) || 'N/A'}"
@@ -4,48 +4,67 @@ require "eval_ruby"
4
4
 
5
5
  module EvalRuby
6
6
  module RSpecMatchers
7
+ # RSpec matcher that checks if an answer is faithful to the given context.
8
+ #
9
+ # @example
10
+ # expect(answer).to be_faithful_to(context)
11
+ # expect(answer).to be_faithful_to(context).with_threshold(0.9)
7
12
  class BeFaithfulTo
8
- def initialize(context)
13
+ def initialize(context, judge: nil)
9
14
  @context = Array(context)
10
15
  @threshold = 0.8
16
+ @judge = judge
11
17
  end
12
18
 
19
+ # @param threshold [Float] minimum faithfulness score (0.0 - 1.0)
20
+ # @return [self]
13
21
  def with_threshold(threshold)
14
22
  @threshold = threshold
15
23
  self
16
24
  end
17
25
 
26
+ # @param answer [String] the LLM-generated answer to evaluate
27
+ # @return [Boolean]
18
28
  def matches?(answer)
19
29
  @answer = answer
20
- judge = EvalRuby.send(:build_judge)
21
- result = Metrics::Faithfulness.new(judge: judge).call(answer: answer, context: @context)
30
+ j = @judge || EvalRuby.send(:default_judge)
31
+ result = Metrics::Faithfulness.new(judge: j).call(answer: answer, context: @context)
22
32
  @score = result[:score]
23
33
  @score >= @threshold
24
34
  end
25
35
 
36
+ # @return [String]
26
37
  def failure_message
27
38
  "expected answer to be faithful to context (threshold: #{@threshold}), but got score #{@score.round(4)}"
28
39
  end
29
40
 
41
+ # @return [String]
30
42
  def failure_message_when_negated
31
43
  "expected answer not to be faithful to context, but got score #{@score.round(4)}"
32
44
  end
33
45
  end
34
46
 
47
+ # RSpec matcher that checks precision@k for retrieval results.
48
+ #
49
+ # @example
50
+ # expect(retrieval_result).to have_precision_at_k(5).above(0.8)
35
51
  class HavePrecisionAtK
36
52
  def initialize(k)
37
53
  @k = k
38
54
  @threshold = 0.5
39
55
  end
40
56
 
57
+ # @param threshold [Float] minimum precision score (0.0 - 1.0)
58
+ # @return [self]
41
59
  def above(threshold)
42
60
  @threshold = threshold
43
61
  self
44
62
  end
45
63
 
64
+ # @param results [EvalRuby::RetrievalResult]
65
+ # @return [Boolean]
46
66
  def matches?(results)
47
67
  @results = results
48
- # results should respond to retrieved and relevant, or be arrays
49
68
  if results.is_a?(EvalRuby::RetrievalResult)
50
69
  @score = results.precision_at_k(@k)
51
70
  else
@@ -54,17 +73,40 @@ module EvalRuby
54
73
  @score >= @threshold
55
74
  end
56
75
 
76
+ # @return [String]
57
77
  def failure_message
58
78
  "expected precision@#{@k} >= #{@threshold}, but got #{@score.round(4)}"
59
79
  end
60
80
  end
61
81
 
62
- def be_faithful_to(context)
63
- BeFaithfulTo.new(context)
82
+ # @param context [Array<String>, String] context to check faithfulness against
83
+ # @param judge [EvalRuby::Judges::Base, nil] optional judge (uses configured default if nil)
84
+ # @return [BeFaithfulTo]
85
+ def be_faithful_to(context, judge: nil)
86
+ BeFaithfulTo.new(context, judge: judge)
64
87
  end
65
88
 
89
+ # @param k [Integer] number of top results to evaluate
90
+ # @return [HavePrecisionAtK]
66
91
  def have_precision_at_k(k)
67
92
  HavePrecisionAtK.new(k)
68
93
  end
69
94
  end
95
+
96
+ class << self
97
+ private
98
+
99
+ # Build a judge from the current configuration.
100
+ # @return [EvalRuby::Judges::Base]
101
+ def default_judge
102
+ case configuration.judge_llm
103
+ when :openai
104
+ Judges::OpenAI.new(configuration)
105
+ when :anthropic
106
+ Judges::Anthropic.new(configuration)
107
+ else
108
+ raise Error, "Unknown judge LLM: #{configuration.judge_llm}"
109
+ end
110
+ end
111
+ end
70
112
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module EvalRuby
4
- VERSION = "0.1.1"
4
+ VERSION = "0.2.0"
5
5
  end
data/lib/eval_ruby.rb CHANGED
@@ -21,6 +21,27 @@ require_relative "eval_ruby/report"
21
21
  require_relative "eval_ruby/dataset"
22
22
  require_relative "eval_ruby/comparison"
23
23
 
24
+ # Evaluation framework for LLM and RAG applications.
25
+ # Measures quality metrics like faithfulness, relevance, context precision,
26
+ # and answer correctness. Think Ragas or DeepEval for Ruby.
27
+ #
28
+ # @example Quick evaluation
29
+ # result = EvalRuby.evaluate(
30
+ # question: "What is Ruby?",
31
+ # answer: "A programming language",
32
+ # context: ["Ruby is a dynamic, open source programming language."],
33
+ # ground_truth: "Ruby is a programming language created by Matz."
34
+ # )
35
+ # puts result.faithfulness # => 0.95
36
+ # puts result.overall # => 0.87
37
+ #
38
+ # @example Retrieval evaluation
39
+ # result = EvalRuby.evaluate_retrieval(
40
+ # question: "What is Ruby?",
41
+ # retrieved: ["doc_a", "doc_b", "doc_c"],
42
+ # relevant: ["doc_a", "doc_c"]
43
+ # )
44
+ # puts result.precision_at_k(3) # => 0.67
24
45
  module EvalRuby
25
46
  class Error < StandardError; end
26
47
  class APIError < Error; end
@@ -28,18 +49,33 @@ module EvalRuby
28
49
  class InvalidResponseError < Error; end
29
50
 
30
51
  class << self
52
+ # @return [Configuration] the current configuration
31
53
  def configuration
32
54
  @configuration ||= Configuration.new
33
55
  end
34
56
 
57
+ # Yields the configuration for modification.
58
+ #
59
+ # @yieldparam config [Configuration]
60
+ # @return [void]
35
61
  def configure
36
62
  yield(configuration)
37
63
  end
38
64
 
65
+ # Resets configuration to defaults.
66
+ #
67
+ # @return [Configuration]
39
68
  def reset_configuration!
40
69
  @configuration = Configuration.new
41
70
  end
42
71
 
72
+ # Evaluates an LLM response across multiple quality metrics.
73
+ #
74
+ # @param question [String] the input question
75
+ # @param answer [String] the LLM-generated answer
76
+ # @param context [Array<String>] retrieved context chunks
77
+ # @param ground_truth [String, nil] expected correct answer
78
+ # @return [Result]
43
79
  def evaluate(question:, answer:, context: [], ground_truth: nil)
44
80
  Evaluator.new.evaluate(
45
81
  question: question,
@@ -49,6 +85,12 @@ module EvalRuby
49
85
  )
50
86
  end
51
87
 
88
+ # Evaluates retrieval quality using IR metrics.
89
+ #
90
+ # @param question [String] the input question
91
+ # @param retrieved [Array<String>] retrieved document IDs
92
+ # @param relevant [Array<String>] ground-truth relevant document IDs
93
+ # @return [RetrievalResult]
52
94
  def evaluate_retrieval(question:, retrieved:, relevant:)
53
95
  Evaluator.new.evaluate_retrieval(
54
96
  question: question,
@@ -57,6 +99,11 @@ module EvalRuby
57
99
  )
58
100
  end
59
101
 
102
+ # Evaluates a batch of samples, optionally running them through a pipeline.
103
+ #
104
+ # @param dataset [Dataset, Array<Hash>] samples to evaluate
105
+ # @param pipeline [#query, nil] optional RAG pipeline to run queries through
106
+ # @return [Report]
60
107
  def evaluate_batch(dataset, pipeline: nil)
61
108
  samples = dataset.is_a?(Dataset) ? dataset.samples : dataset
62
109
  evaluator = Evaluator.new
@@ -79,6 +126,11 @@ module EvalRuby
79
126
  Report.new(results: results, samples: samples, duration: Time.now - start_time)
80
127
  end
81
128
 
129
+ # Compares two evaluation reports with statistical significance testing.
130
+ #
131
+ # @param report_a [Report] baseline report
132
+ # @param report_b [Report] comparison report
133
+ # @return [Comparison]
82
134
  def compare(report_a, report_b)
83
135
  Comparison.new(report_a, report_b)
84
136
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: eval-ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Johannes Dwi Cahyo
@@ -75,6 +75,7 @@ files:
75
75
  - Gemfile
76
76
  - Gemfile.lock
77
77
  - LICENSE
78
+ - MILESTONES.md
78
79
  - README.md
79
80
  - Rakefile
80
81
  - eval-ruby.gemspec