eval-ruby 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/MILESTONES.md +13 -0
- data/lib/eval_ruby/comparison.rb +18 -1
- data/lib/eval_ruby/configuration.rb +25 -2
- data/lib/eval_ruby/dataset.rb +47 -1
- data/lib/eval_ruby/evaluator.rb +36 -0
- data/lib/eval_ruby/judges/anthropic.rb +8 -0
- data/lib/eval_ruby/judges/base.rb +11 -0
- data/lib/eval_ruby/judges/openai.rb +8 -0
- data/lib/eval_ruby/metrics/base.rb +8 -0
- data/lib/eval_ruby/metrics/context_precision.rb +10 -0
- data/lib/eval_ruby/metrics/context_recall.rb +10 -0
- data/lib/eval_ruby/metrics/correctness.rb +13 -0
- data/lib/eval_ruby/metrics/faithfulness.rb +10 -0
- data/lib/eval_ruby/metrics/mrr.rb +8 -0
- data/lib/eval_ruby/metrics/ndcg.rb +10 -0
- data/lib/eval_ruby/metrics/precision_at_k.rb +9 -0
- data/lib/eval_ruby/metrics/recall_at_k.rb +9 -0
- data/lib/eval_ruby/metrics/relevance.rb +10 -0
- data/lib/eval_ruby/report.rb +38 -1
- data/lib/eval_ruby/result.rb +29 -1
- data/lib/eval_ruby/rspec.rb +48 -6
- data/lib/eval_ruby/version.rb +1 -1
- data/lib/eval_ruby.rb +52 -0
- metadata +2 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 69f4642cd2505ab6b0b54f36cd76fea2043f76306c425fa69e29027e4d0dc901
|
|
4
|
+
data.tar.gz: 70125b286a374c01af966a0a044edb9b0cbca02fa9acd6b944f2b0825ad60981
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 7ac13b5d60996d964a948b92bec466d3594c614e5a0bc3e138bfba31dda0dcca19876d01e2826ed1522962e7117b2eb69c7e3ea46567e4a09920f77f2031d09c
|
|
7
|
+
data.tar.gz: ca13c6f768516a2f21188c59511f045bd12c9fd4fe51819af705016c1282256b6765805a007265dbb3e299430c19b8fe80e1a29c22038b930b5799ab7d3a8355
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
eval-ruby (0.1.1)
|
|
4
|
+
eval-ruby (0.2.0)
|
|
5
5
|
csv
|
|
6
6
|
|
|
7
7
|
GEM
|
|
@@ -39,7 +39,7 @@ CHECKSUMS
|
|
|
39
39
|
bigdecimal (4.0.1) sha256=8b07d3d065a9f921c80ceaea7c9d4ae596697295b584c296fe599dd0ad01c4a7
|
|
40
40
|
crack (1.0.1) sha256=ff4a10390cd31d66440b7524eb1841874db86201d5b70032028553130b6d4c7e
|
|
41
41
|
csv (3.3.5) sha256=6e5134ac3383ef728b7f02725d9872934f523cb40b961479f69cf3afa6c8e73f
|
|
42
|
-
eval-ruby (0.1.1)
|
|
42
|
+
eval-ruby (0.2.0)
|
|
43
43
|
hashdiff (1.2.1) sha256=9c079dbc513dfc8833ab59c0c2d8f230fa28499cc5efb4b8dd276cf931457cd1
|
|
44
44
|
minitest (5.27.0) sha256=2d3b17f8a36fe7801c1adcffdbc38233b938eb0b4966e97a6739055a45fa77d5
|
|
45
45
|
public_suffix (7.0.5) sha256=1a8bb08f1bbea19228d3bed6e5ed908d1cb4f7c2726d18bd9cadf60bc676f623
|
data/MILESTONES.md
ADDED
data/lib/eval_ruby/comparison.rb
CHANGED
|
@@ -1,14 +1,27 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module EvalRuby
|
|
4
|
+
# Statistical comparison of two evaluation reports using paired t-tests.
|
|
5
|
+
#
|
|
6
|
+
# @example
|
|
7
|
+
# comparison = EvalRuby.compare(report_a, report_b)
|
|
8
|
+
# puts comparison.summary
|
|
9
|
+
# comparison.significant_improvements # => [:faithfulness]
|
|
4
10
|
class Comparison
|
|
5
|
-
|
|
11
|
+
# @return [Report] baseline report
|
|
12
|
+
attr_reader :report_a
|
|
6
13
|
|
|
14
|
+
# @return [Report] comparison report
|
|
15
|
+
attr_reader :report_b
|
|
16
|
+
|
|
17
|
+
# @param report_a [Report] baseline
|
|
18
|
+
# @param report_b [Report] comparison
|
|
7
19
|
def initialize(report_a, report_b)
|
|
8
20
|
@report_a = report_a
|
|
9
21
|
@report_b = report_b
|
|
10
22
|
end
|
|
11
23
|
|
|
24
|
+
# @return [String] formatted comparison table with deltas and p-values
|
|
12
25
|
def summary
|
|
13
26
|
lines = [
|
|
14
27
|
format("%-20s | %-10s | %-10s | %-8s | %s", "Metric", "A", "B", "Delta", "p-value"),
|
|
@@ -35,6 +48,10 @@ module EvalRuby
|
|
|
35
48
|
lines.join("\n")
|
|
36
49
|
end
|
|
37
50
|
|
|
51
|
+
# Returns metrics where report_b is significantly better than report_a.
|
|
52
|
+
#
|
|
53
|
+
# @param alpha [Float] significance level (default 0.05)
|
|
54
|
+
# @return [Array<Symbol>] metric names with significant improvements
|
|
38
55
|
def significant_improvements(alpha: 0.05)
|
|
39
56
|
all_metrics.select do |metric|
|
|
40
57
|
scores_a = @report_a.results.filter_map { |r| r.scores[metric] }
|
|
@@ -1,9 +1,32 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module EvalRuby
|
|
4
|
+
# Global configuration for EvalRuby.
|
|
5
|
+
#
|
|
6
|
+
# @example
|
|
7
|
+
# EvalRuby.configure do |config|
|
|
8
|
+
# config.judge_llm = :openai
|
|
9
|
+
# config.api_key = ENV["OPENAI_API_KEY"]
|
|
10
|
+
# config.judge_model = "gpt-4o"
|
|
11
|
+
# end
|
|
4
12
|
class Configuration
|
|
5
|
-
|
|
6
|
-
|
|
13
|
+
# @return [Symbol] LLM provider for judge (:openai or :anthropic)
|
|
14
|
+
attr_accessor :judge_llm
|
|
15
|
+
|
|
16
|
+
# @return [String] model name for the judge LLM
|
|
17
|
+
attr_accessor :judge_model
|
|
18
|
+
|
|
19
|
+
# @return [String, nil] API key for the judge LLM provider
|
|
20
|
+
attr_accessor :api_key
|
|
21
|
+
|
|
22
|
+
# @return [Float] default threshold for pass/fail decisions
|
|
23
|
+
attr_accessor :default_threshold
|
|
24
|
+
|
|
25
|
+
# @return [Integer] HTTP request timeout in seconds
|
|
26
|
+
attr_accessor :timeout
|
|
27
|
+
|
|
28
|
+
# @return [Integer] maximum number of retries on transient failures
|
|
29
|
+
attr_accessor :max_retries
|
|
7
30
|
|
|
8
31
|
def initialize
|
|
9
32
|
@judge_llm = :openai
|
data/lib/eval_ruby/dataset.rb
CHANGED
|
@@ -4,16 +4,36 @@ require "csv"
|
|
|
4
4
|
require "json"
|
|
5
5
|
|
|
6
6
|
module EvalRuby
|
|
7
|
+
# Collection of evaluation samples with import/export support.
|
|
8
|
+
# Supports CSV, JSON, and programmatic construction.
|
|
9
|
+
#
|
|
10
|
+
# @example
|
|
11
|
+
# dataset = EvalRuby::Dataset.new("my_test_set")
|
|
12
|
+
# dataset.add(question: "What is Ruby?", answer: "A language", ground_truth: "A language")
|
|
13
|
+
# report = EvalRuby.evaluate_batch(dataset)
|
|
7
14
|
class Dataset
|
|
8
15
|
include Enumerable
|
|
9
16
|
|
|
10
|
-
|
|
17
|
+
# @return [String] dataset name
|
|
18
|
+
attr_reader :name
|
|
11
19
|
|
|
20
|
+
# @return [Array<Hash>] sample entries
|
|
21
|
+
attr_reader :samples
|
|
22
|
+
|
|
23
|
+
# @param name [String] dataset name
|
|
12
24
|
def initialize(name = "default")
|
|
13
25
|
@name = name
|
|
14
26
|
@samples = []
|
|
15
27
|
end
|
|
16
28
|
|
|
29
|
+
# Adds a sample to the dataset.
|
|
30
|
+
#
|
|
31
|
+
# @param question [String]
|
|
32
|
+
# @param ground_truth [String, nil]
|
|
33
|
+
# @param relevant_contexts [Array<String>] alias for context
|
|
34
|
+
# @param answer [String, nil]
|
|
35
|
+
# @param context [Array<String>]
|
|
36
|
+
# @return [self]
|
|
17
37
|
def add(question:, ground_truth: nil, relevant_contexts: [], answer: nil, context: [])
|
|
18
38
|
@samples << {
|
|
19
39
|
question: question,
|
|
@@ -24,18 +44,26 @@ module EvalRuby
|
|
|
24
44
|
self
|
|
25
45
|
end
|
|
26
46
|
|
|
47
|
+
# @yield [Hash] each sample
|
|
27
48
|
def each(&block)
|
|
28
49
|
@samples.each(&block)
|
|
29
50
|
end
|
|
30
51
|
|
|
52
|
+
# @return [Integer] number of samples
|
|
31
53
|
def size
|
|
32
54
|
@samples.size
|
|
33
55
|
end
|
|
34
56
|
|
|
57
|
+
# @param index [Integer]
|
|
58
|
+
# @return [Hash] sample at index
|
|
35
59
|
def [](index)
|
|
36
60
|
@samples[index]
|
|
37
61
|
end
|
|
38
62
|
|
|
63
|
+
# Loads a dataset from a CSV file.
|
|
64
|
+
#
|
|
65
|
+
# @param path [String] path to CSV file
|
|
66
|
+
# @return [Dataset]
|
|
39
67
|
def self.from_csv(path)
|
|
40
68
|
dataset = new(File.basename(path, ".*"))
|
|
41
69
|
CSV.foreach(path, headers: true) do |row|
|
|
@@ -49,6 +77,10 @@ module EvalRuby
|
|
|
49
77
|
dataset
|
|
50
78
|
end
|
|
51
79
|
|
|
80
|
+
# Loads a dataset from a JSON file.
|
|
81
|
+
#
|
|
82
|
+
# @param path [String] path to JSON file
|
|
83
|
+
# @return [Dataset]
|
|
52
84
|
def self.from_json(path)
|
|
53
85
|
dataset = new(File.basename(path, ".*"))
|
|
54
86
|
data = JSON.parse(File.read(path))
|
|
@@ -64,6 +96,10 @@ module EvalRuby
|
|
|
64
96
|
dataset
|
|
65
97
|
end
|
|
66
98
|
|
|
99
|
+
# Exports dataset to CSV.
|
|
100
|
+
#
|
|
101
|
+
# @param path [String] output file path
|
|
102
|
+
# @return [void]
|
|
67
103
|
def to_csv(path)
|
|
68
104
|
CSV.open(path, "w") do |csv|
|
|
69
105
|
csv << %w[question answer context ground_truth]
|
|
@@ -78,10 +114,20 @@ module EvalRuby
|
|
|
78
114
|
end
|
|
79
115
|
end
|
|
80
116
|
|
|
117
|
+
# Exports dataset to JSON.
|
|
118
|
+
#
|
|
119
|
+
# @param path [String] output file path
|
|
120
|
+
# @return [void]
|
|
81
121
|
def to_json(path)
|
|
82
122
|
File.write(path, JSON.pretty_generate({name: @name, samples: @samples}))
|
|
83
123
|
end
|
|
84
124
|
|
|
125
|
+
# Generates a dataset from documents using an LLM.
|
|
126
|
+
#
|
|
127
|
+
# @param documents [Array<String>] file paths to source documents
|
|
128
|
+
# @param questions_per_doc [Integer] number of QA pairs per document
|
|
129
|
+
# @param llm [Symbol] LLM provider (:openai or :anthropic)
|
|
130
|
+
# @return [Dataset]
|
|
85
131
|
def self.generate(documents:, questions_per_doc: 5, llm: :openai)
|
|
86
132
|
config = EvalRuby.configuration.dup
|
|
87
133
|
config.judge_llm = llm
|
data/lib/eval_ruby/evaluator.rb
CHANGED
|
@@ -1,12 +1,25 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module EvalRuby
|
|
4
|
+
# Runs all configured metrics on a given question/answer/context tuple.
|
|
5
|
+
#
|
|
6
|
+
# @example
|
|
7
|
+
# evaluator = EvalRuby::Evaluator.new
|
|
8
|
+
# result = evaluator.evaluate(question: "...", answer: "...", context: [...])
|
|
4
9
|
class Evaluator
|
|
10
|
+
# @param config [Configuration] configuration to use
|
|
5
11
|
def initialize(config = EvalRuby.configuration)
|
|
6
12
|
@config = config
|
|
7
13
|
@judge = build_judge(config)
|
|
8
14
|
end
|
|
9
15
|
|
|
16
|
+
# Evaluates an LLM response across quality metrics.
|
|
17
|
+
#
|
|
18
|
+
# @param question [String] the input question
|
|
19
|
+
# @param answer [String] the LLM-generated answer
|
|
20
|
+
# @param context [Array<String>] retrieved context chunks
|
|
21
|
+
# @param ground_truth [String, nil] expected correct answer
|
|
22
|
+
# @return [Result]
|
|
10
23
|
def evaluate(question:, answer:, context: [], ground_truth: nil)
|
|
11
24
|
scores = {}
|
|
12
25
|
details = {}
|
|
@@ -37,6 +50,12 @@ module EvalRuby
|
|
|
37
50
|
Result.new(scores: scores, details: details)
|
|
38
51
|
end
|
|
39
52
|
|
|
53
|
+
# Evaluates retrieval quality using IR metrics.
|
|
54
|
+
#
|
|
55
|
+
# @param question [String] the input question
|
|
56
|
+
# @param retrieved [Array<String>] retrieved document IDs
|
|
57
|
+
# @param relevant [Array<String>] ground-truth relevant document IDs
|
|
58
|
+
# @return [RetrievalResult]
|
|
40
59
|
def evaluate_retrieval(question:, retrieved:, relevant:)
|
|
41
60
|
RetrievalResult.new(retrieved: retrieved, relevant: relevant)
|
|
42
61
|
end
|
|
@@ -55,32 +74,49 @@ module EvalRuby
|
|
|
55
74
|
end
|
|
56
75
|
end
|
|
57
76
|
|
|
77
|
+
# Holds retrieval evaluation results with IR metric accessors.
|
|
78
|
+
#
|
|
79
|
+
# @example
|
|
80
|
+
# result = EvalRuby.evaluate_retrieval(question: "...", retrieved: [...], relevant: [...])
|
|
81
|
+
# result.precision_at_k(5) # => 0.6
|
|
82
|
+
# result.mrr # => 1.0
|
|
58
83
|
class RetrievalResult
|
|
84
|
+
# @param retrieved [Array<String>] retrieved document IDs in ranked order
|
|
85
|
+
# @param relevant [Array<String>] ground-truth relevant document IDs
|
|
59
86
|
def initialize(retrieved:, relevant:)
|
|
60
87
|
@retrieved = retrieved
|
|
61
88
|
@relevant = relevant
|
|
62
89
|
end
|
|
63
90
|
|
|
91
|
+
# @param k [Integer] number of top results to consider
|
|
92
|
+
# @return [Float] precision at k
|
|
64
93
|
def precision_at_k(k)
|
|
65
94
|
Metrics::PrecisionAtK.new.call(retrieved: @retrieved, relevant: @relevant, k: k)
|
|
66
95
|
end
|
|
67
96
|
|
|
97
|
+
# @param k [Integer] number of top results to consider
|
|
98
|
+
# @return [Float] recall at k
|
|
68
99
|
def recall_at_k(k)
|
|
69
100
|
Metrics::RecallAtK.new.call(retrieved: @retrieved, relevant: @relevant, k: k)
|
|
70
101
|
end
|
|
71
102
|
|
|
103
|
+
# @return [Float] mean reciprocal rank
|
|
72
104
|
def mrr
|
|
73
105
|
Metrics::MRR.new.call(retrieved: @retrieved, relevant: @relevant)
|
|
74
106
|
end
|
|
75
107
|
|
|
108
|
+
# @param k [Integer, nil] number of top results (nil for all)
|
|
109
|
+
# @return [Float] normalized discounted cumulative gain
|
|
76
110
|
def ndcg(k: nil)
|
|
77
111
|
Metrics::NDCG.new.call(retrieved: @retrieved, relevant: @relevant, k: k)
|
|
78
112
|
end
|
|
79
113
|
|
|
114
|
+
# @return [Float] 1.0 if any relevant doc is retrieved, 0.0 otherwise
|
|
80
115
|
def hit_rate
|
|
81
116
|
@retrieved.any? { |doc| @relevant.include?(doc) } ? 1.0 : 0.0
|
|
82
117
|
end
|
|
83
118
|
|
|
119
|
+
# @return [Hash{Symbol => Float}] all retrieval metrics
|
|
84
120
|
def to_h
|
|
85
121
|
{
|
|
86
122
|
precision_at_k: precision_at_k(@retrieved.length),
|
|
@@ -6,14 +6,22 @@ require "uri"
|
|
|
6
6
|
|
|
7
7
|
module EvalRuby
|
|
8
8
|
module Judges
|
|
9
|
+
# Anthropic-based LLM judge using the Messages API.
|
|
10
|
+
# Requires an API key set via {Configuration#api_key}.
|
|
9
11
|
class Anthropic < Base
|
|
10
12
|
API_URL = "https://api.anthropic.com/v1/messages"
|
|
11
13
|
|
|
14
|
+
# @param config [Configuration]
|
|
15
|
+
# @raise [EvalRuby::Error] if API key is missing
|
|
12
16
|
def initialize(config)
|
|
13
17
|
super
|
|
14
18
|
raise EvalRuby::Error, "API key is required. Set via EvalRuby.configure { |c| c.api_key = '...' }" if @config.api_key.nil? || @config.api_key.empty?
|
|
15
19
|
end
|
|
16
20
|
|
|
21
|
+
# @param prompt [String] the evaluation prompt
|
|
22
|
+
# @return [Hash, nil] parsed JSON response
|
|
23
|
+
# @raise [EvalRuby::Error] on API errors
|
|
24
|
+
# @raise [EvalRuby::TimeoutError] after max retries
|
|
17
25
|
def call(prompt)
|
|
18
26
|
retries = 0
|
|
19
27
|
begin
|
|
@@ -2,17 +2,28 @@
|
|
|
2
2
|
|
|
3
3
|
module EvalRuby
|
|
4
4
|
module Judges
|
|
5
|
+
# Abstract base class for LLM judges.
|
|
6
|
+
# Subclasses must implement {#call} to send prompts to an LLM and parse JSON responses.
|
|
5
7
|
class Base
|
|
8
|
+
# @param config [Configuration]
|
|
6
9
|
def initialize(config)
|
|
7
10
|
@config = config
|
|
8
11
|
end
|
|
9
12
|
|
|
13
|
+
# Sends a prompt to the LLM and returns parsed JSON.
|
|
14
|
+
#
|
|
15
|
+
# @param prompt [String] the evaluation prompt
|
|
16
|
+
# @return [Hash, nil] parsed JSON response
|
|
10
17
|
def call(prompt)
|
|
11
18
|
raise NotImplementedError, "#{self.class}#call must be implemented"
|
|
12
19
|
end
|
|
13
20
|
|
|
14
21
|
private
|
|
15
22
|
|
|
23
|
+
# Extracts and parses the first JSON object from text.
|
|
24
|
+
#
|
|
25
|
+
# @param text [String] raw LLM response text
|
|
26
|
+
# @return [Hash, nil] parsed JSON or nil if not found
|
|
16
27
|
def parse_json_response(text)
|
|
17
28
|
match = text.match(/\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}/m)
|
|
18
29
|
return nil unless match
|
|
@@ -6,14 +6,22 @@ require "uri"
|
|
|
6
6
|
|
|
7
7
|
module EvalRuby
|
|
8
8
|
module Judges
|
|
9
|
+
# OpenAI-based LLM judge using the Chat Completions API.
|
|
10
|
+
# Requires an API key set via {Configuration#api_key}.
|
|
9
11
|
class OpenAI < Base
|
|
10
12
|
API_URL = "https://api.openai.com/v1/chat/completions"
|
|
11
13
|
|
|
14
|
+
# @param config [Configuration]
|
|
15
|
+
# @raise [EvalRuby::Error] if API key is missing
|
|
12
16
|
def initialize(config)
|
|
13
17
|
super
|
|
14
18
|
raise EvalRuby::Error, "API key is required. Set via EvalRuby.configure { |c| c.api_key = '...' }" if @config.api_key.nil? || @config.api_key.empty?
|
|
15
19
|
end
|
|
16
20
|
|
|
21
|
+
# @param prompt [String] the evaluation prompt
|
|
22
|
+
# @return [Hash, nil] parsed JSON response
|
|
23
|
+
# @raise [EvalRuby::Error] on API errors
|
|
24
|
+
# @raise [EvalRuby::TimeoutError] after max retries
|
|
17
25
|
def call(prompt)
|
|
18
26
|
retries = 0
|
|
19
27
|
begin
|
|
@@ -2,13 +2,21 @@
|
|
|
2
2
|
|
|
3
3
|
module EvalRuby
|
|
4
4
|
module Metrics
|
|
5
|
+
# Abstract base class for all evaluation metrics.
|
|
6
|
+
# Subclasses must implement {#call}.
|
|
5
7
|
class Base
|
|
8
|
+
# @return [EvalRuby::Judges::Base, nil] the LLM judge instance
|
|
6
9
|
attr_reader :judge
|
|
7
10
|
|
|
11
|
+
# @param judge [EvalRuby::Judges::Base, nil] LLM judge for evaluation
|
|
8
12
|
def initialize(judge: nil)
|
|
9
13
|
@judge = judge
|
|
10
14
|
end
|
|
11
15
|
|
|
16
|
+
# Evaluates the metric.
|
|
17
|
+
#
|
|
18
|
+
# @param kwargs [Hash] metric-specific keyword arguments
|
|
19
|
+
# @return [Hash{Symbol => Object}] must include :score and :details keys
|
|
12
20
|
def call(**kwargs)
|
|
13
21
|
raise NotImplementedError, "#{self.class}#call must be implemented"
|
|
14
22
|
end
|
|
@@ -2,6 +2,13 @@
|
|
|
2
2
|
|
|
3
3
|
module EvalRuby
|
|
4
4
|
module Metrics
|
|
5
|
+
# Measures the proportion of retrieved contexts that are relevant to the question.
|
|
6
|
+
# Uses an LLM judge to evaluate each context's relevance.
|
|
7
|
+
#
|
|
8
|
+
# @example
|
|
9
|
+
# metric = ContextPrecision.new(judge: judge)
|
|
10
|
+
# result = metric.call(question: "What is Ruby?", context: ["Ruby is...", "Weather..."])
|
|
11
|
+
# result[:score] # => 0.5
|
|
5
12
|
class ContextPrecision < Base
|
|
6
13
|
PROMPT_TEMPLATE = <<~PROMPT
|
|
7
14
|
Given the following question and a list of retrieved contexts, evaluate
|
|
@@ -19,6 +26,9 @@ module EvalRuby
|
|
|
19
26
|
The score should be the proportion of relevant contexts (0.0 to 1.0).
|
|
20
27
|
PROMPT
|
|
21
28
|
|
|
29
|
+
# @param question [String] the input question
|
|
30
|
+
# @param context [Array<String>, String] retrieved context chunks
|
|
31
|
+
# @return [Hash] :score (Float 0.0-1.0) and :details (:evaluations Array)
|
|
22
32
|
def call(question:, context:, **_kwargs)
|
|
23
33
|
contexts = context.is_a?(Array) ? context : [context.to_s]
|
|
24
34
|
return {score: 0.0, details: {}} if contexts.empty?
|
|
@@ -2,6 +2,13 @@
|
|
|
2
2
|
|
|
3
3
|
module EvalRuby
|
|
4
4
|
module Metrics
|
|
5
|
+
# Measures whether retrieved contexts contain enough information to support the ground truth.
|
|
6
|
+
# Uses an LLM judge to check if each ground truth statement is attributable to context.
|
|
7
|
+
#
|
|
8
|
+
# @example
|
|
9
|
+
# metric = ContextRecall.new(judge: judge)
|
|
10
|
+
# result = metric.call(context: ["Ruby was created in 1995."], ground_truth: "Ruby was created in 1995.")
|
|
11
|
+
# result[:score] # => 1.0
|
|
5
12
|
class ContextRecall < Base
|
|
6
13
|
PROMPT_TEMPLATE = <<~PROMPT
|
|
7
14
|
Given the following ground truth answer and retrieved contexts, evaluate
|
|
@@ -20,6 +27,9 @@ module EvalRuby
|
|
|
20
27
|
The score should be the proportion of statements attributed to context (0.0 to 1.0).
|
|
21
28
|
PROMPT
|
|
22
29
|
|
|
30
|
+
# @param context [Array<String>, String] retrieved context chunks
|
|
31
|
+
# @param ground_truth [String] expected correct answer
|
|
32
|
+
# @return [Hash] :score (Float 0.0-1.0) and :details (:statements Array)
|
|
23
33
|
def call(context:, ground_truth:, **_kwargs)
|
|
24
34
|
contexts = context.is_a?(Array) ? context : [context.to_s]
|
|
25
35
|
return {score: 0.0, details: {}} if contexts.empty?
|
|
@@ -2,6 +2,16 @@
|
|
|
2
2
|
|
|
3
3
|
module EvalRuby
|
|
4
4
|
module Metrics
|
|
5
|
+
# Measures factual correctness of an answer against ground truth.
|
|
6
|
+
# Uses LLM judge when available, falls back to token overlap F1 score.
|
|
7
|
+
#
|
|
8
|
+
# @example With LLM judge
|
|
9
|
+
# metric = Correctness.new(judge: judge)
|
|
10
|
+
# result = metric.call(answer: "Paris", ground_truth: "Paris")
|
|
11
|
+
#
|
|
12
|
+
# @example Without judge (string similarity)
|
|
13
|
+
# metric = Correctness.new
|
|
14
|
+
# result = metric.call(answer: "The capital is Paris", ground_truth: "Paris is the capital")
|
|
5
15
|
class Correctness < Base
|
|
6
16
|
PROMPT_TEMPLATE = <<~PROMPT
|
|
7
17
|
Given the following answer and ground truth, evaluate whether the answer
|
|
@@ -23,6 +33,9 @@ module EvalRuby
|
|
|
23
33
|
Respond in JSON: {"reasoning": "...", "score": 0.0}
|
|
24
34
|
PROMPT
|
|
25
35
|
|
|
36
|
+
# @param answer [String] the LLM-generated answer
|
|
37
|
+
# @param ground_truth [String] the expected correct answer
|
|
38
|
+
# @return [Hash] :score (Float 0.0-1.0) and :details
|
|
26
39
|
def call(answer:, ground_truth:, **_kwargs)
|
|
27
40
|
if judge
|
|
28
41
|
llm_score(answer, ground_truth)
|
|
@@ -2,6 +2,13 @@
|
|
|
2
2
|
|
|
3
3
|
module EvalRuby
|
|
4
4
|
module Metrics
|
|
5
|
+
# Measures whether an answer is supported by the provided context.
|
|
6
|
+
# Uses an LLM judge to identify claims and check if each is supported.
|
|
7
|
+
#
|
|
8
|
+
# @example
|
|
9
|
+
# metric = Faithfulness.new(judge: judge)
|
|
10
|
+
# result = metric.call(answer: "Paris is in France", context: ["Paris is the capital of France."])
|
|
11
|
+
# result[:score] # => 1.0
|
|
5
12
|
class Faithfulness < Base
|
|
6
13
|
PROMPT_TEMPLATE = <<~PROMPT
|
|
7
14
|
Given the following context and answer, evaluate whether the answer
|
|
@@ -25,6 +32,9 @@ module EvalRuby
|
|
|
25
32
|
Respond in JSON: {"claims": [{"claim": "...", "supported": true}], "score": 0.0}
|
|
26
33
|
PROMPT
|
|
27
34
|
|
|
35
|
+
# @param answer [String] the LLM-generated answer
|
|
36
|
+
# @param context [Array<String>, String] retrieved context chunks
|
|
37
|
+
# @return [Hash] :score (Float 0.0-1.0) and :details (:claims Array)
|
|
28
38
|
def call(answer:, context:, **_kwargs)
|
|
29
39
|
context_text = context.is_a?(Array) ? context.join("\n\n") : context.to_s
|
|
30
40
|
prompt = format(PROMPT_TEMPLATE, context: context_text, answer: answer)
|
|
@@ -2,7 +2,15 @@
|
|
|
2
2
|
|
|
3
3
|
module EvalRuby
|
|
4
4
|
module Metrics
|
|
5
|
+
# Computes Mean Reciprocal Rank: 1/(position of first relevant document).
|
|
6
|
+
#
|
|
7
|
+
# @example
|
|
8
|
+
# MRR.new.call(retrieved: ["a", "b", "c"], relevant: ["b"])
|
|
9
|
+
# # => 0.5
|
|
5
10
|
class MRR < Base
|
|
11
|
+
# @param retrieved [Array<String>] retrieved document IDs in ranked order
|
|
12
|
+
# @param relevant [Array<String>] ground-truth relevant document IDs
|
|
13
|
+
# @return [Float] reciprocal rank (0.0-1.0)
|
|
6
14
|
def call(retrieved:, relevant:, **_kwargs)
|
|
7
15
|
retrieved.each_with_index do |doc, i|
|
|
8
16
|
return 1.0 / (i + 1) if relevant.include?(doc)
|
|
@@ -2,7 +2,17 @@
|
|
|
2
2
|
|
|
3
3
|
module EvalRuby
|
|
4
4
|
module Metrics
|
|
5
|
+
# Computes Normalized Discounted Cumulative Gain (NDCG).
|
|
6
|
+
# Measures ranking quality by comparing actual ranking to ideal ranking.
|
|
7
|
+
#
|
|
8
|
+
# @example
|
|
9
|
+
# NDCG.new.call(retrieved: ["a", "b", "c"], relevant: ["a", "c"])
|
|
10
|
+
# # => 0.863
|
|
5
11
|
class NDCG < Base
|
|
12
|
+
# @param retrieved [Array<String>] retrieved document IDs in ranked order
|
|
13
|
+
# @param relevant [Array<String>] ground-truth relevant document IDs
|
|
14
|
+
# @param k [Integer, nil] number of top results (nil for all)
|
|
15
|
+
# @return [Float] NDCG score (0.0-1.0)
|
|
6
16
|
def call(retrieved:, relevant:, k: nil, **_kwargs)
|
|
7
17
|
k ||= retrieved.length
|
|
8
18
|
top_k = retrieved.first(k)
|
|
@@ -2,7 +2,16 @@
|
|
|
2
2
|
|
|
3
3
|
module EvalRuby
|
|
4
4
|
module Metrics
|
|
5
|
+
# Computes Precision@K: the fraction of top-k retrieved documents that are relevant.
|
|
6
|
+
#
|
|
7
|
+
# @example
|
|
8
|
+
# PrecisionAtK.new.call(retrieved: ["a", "b", "c"], relevant: ["a", "c"], k: 3)
|
|
9
|
+
# # => 0.667
|
|
5
10
|
class PrecisionAtK < Base
|
|
11
|
+
# @param retrieved [Array<String>] retrieved document IDs in ranked order
|
|
12
|
+
# @param relevant [Array<String>] ground-truth relevant document IDs
|
|
13
|
+
# @param k [Integer, nil] number of top results (nil for all)
|
|
14
|
+
# @return [Float] precision score (0.0-1.0)
|
|
6
15
|
def call(retrieved:, relevant:, k: nil, **_kwargs)
|
|
7
16
|
k ||= retrieved.length
|
|
8
17
|
top_k = retrieved.first(k)
|
|
@@ -2,7 +2,16 @@
|
|
|
2
2
|
|
|
3
3
|
module EvalRuby
|
|
4
4
|
module Metrics
|
|
5
|
+
# Computes Recall@K: the fraction of relevant documents found in the top-k results.
|
|
6
|
+
#
|
|
7
|
+
# @example
|
|
8
|
+
# RecallAtK.new.call(retrieved: ["a", "b", "c"], relevant: ["a", "c"], k: 3)
|
|
9
|
+
# # => 1.0
|
|
5
10
|
class RecallAtK < Base
|
|
11
|
+
# @param retrieved [Array<String>] retrieved document IDs in ranked order
|
|
12
|
+
# @param relevant [Array<String>] ground-truth relevant document IDs
|
|
13
|
+
# @param k [Integer, nil] number of top results (nil for all)
|
|
14
|
+
# @return [Float] recall score (0.0-1.0)
|
|
6
15
|
def call(retrieved:, relevant:, k: nil, **_kwargs)
|
|
7
16
|
return 0.0 if relevant.empty?
|
|
8
17
|
|
|
@@ -2,6 +2,13 @@
|
|
|
2
2
|
|
|
3
3
|
module EvalRuby
|
|
4
4
|
module Metrics
|
|
5
|
+
# Measures whether an answer is relevant to the question.
|
|
6
|
+
# Uses an LLM judge to evaluate relevance on a 0.0-1.0 scale.
|
|
7
|
+
#
|
|
8
|
+
# @example
|
|
9
|
+
# metric = Relevance.new(judge: judge)
|
|
10
|
+
# result = metric.call(question: "What is Ruby?", answer: "Ruby is a language.")
|
|
11
|
+
# result[:score] # => 0.95
|
|
5
12
|
class Relevance < Base
|
|
6
13
|
PROMPT_TEMPLATE = <<~PROMPT
|
|
7
14
|
Given the following question and answer, evaluate whether the answer
|
|
@@ -21,6 +28,9 @@ module EvalRuby
|
|
|
21
28
|
Respond in JSON: {"reasoning": "...", "score": 0.0}
|
|
22
29
|
PROMPT
|
|
23
30
|
|
|
31
|
+
# @param question [String] the input question
|
|
32
|
+
# @param answer [String] the LLM-generated answer
|
|
33
|
+
# @return [Hash] :score (Float 0.0-1.0) and :details (:reasoning String)
|
|
24
34
|
def call(question:, answer:, **_kwargs)
|
|
25
35
|
prompt = format(PROMPT_TEMPLATE, question: question, answer: answer)
|
|
26
36
|
|
data/lib/eval_ruby/report.rb
CHANGED
|
@@ -4,15 +4,33 @@ require "csv"
|
|
|
4
4
|
require "json"
|
|
5
5
|
|
|
6
6
|
module EvalRuby
|
|
7
|
+
# Aggregated evaluation report across multiple samples.
|
|
8
|
+
# Provides statistical summaries, filtering, and export functionality.
|
|
9
|
+
#
|
|
10
|
+
# @example
|
|
11
|
+
# report = EvalRuby.evaluate_batch(dataset)
|
|
12
|
+
# puts report.summary
|
|
13
|
+
# report.to_csv("results.csv")
|
|
7
14
|
class Report
|
|
8
|
-
|
|
15
|
+
# @return [Array<Result>] individual evaluation results
|
|
16
|
+
attr_reader :results
|
|
9
17
|
|
|
18
|
+
# @return [Float, nil] total evaluation duration in seconds
|
|
19
|
+
attr_reader :duration
|
|
20
|
+
|
|
21
|
+
# @return [Array<Hash>] original sample data
|
|
22
|
+
attr_reader :samples
|
|
23
|
+
|
|
24
|
+
# @param results [Array<Result>]
|
|
25
|
+
# @param samples [Array<Hash>]
|
|
26
|
+
# @param duration [Float, nil]
|
|
10
27
|
def initialize(results:, samples: [], duration: nil)
|
|
11
28
|
@results = results
|
|
12
29
|
@samples = samples
|
|
13
30
|
@duration = duration
|
|
14
31
|
end
|
|
15
32
|
|
|
33
|
+
# @return [String] human-readable summary with mean and std for each metric
|
|
16
34
|
def summary
|
|
17
35
|
lines = []
|
|
18
36
|
metric_stats.each do |metric, stats|
|
|
@@ -23,6 +41,9 @@ module EvalRuby
|
|
|
23
41
|
lines.join("\n")
|
|
24
42
|
end
|
|
25
43
|
|
|
44
|
+
# Computes per-metric statistics (mean, std, min, max).
|
|
45
|
+
#
|
|
46
|
+
# @return [Hash{Symbol => Hash}] metric name to stats hash
|
|
26
47
|
def metric_stats
|
|
27
48
|
return {} if @results.empty?
|
|
28
49
|
|
|
@@ -39,15 +60,27 @@ module EvalRuby
|
|
|
39
60
|
end
|
|
40
61
|
end
|
|
41
62
|
|
|
63
|
+
# Returns the n worst-scoring results.
|
|
64
|
+
#
|
|
65
|
+
# @param n [Integer] number of results to return
|
|
66
|
+
# @return [Array<Result>]
|
|
42
67
|
def worst(n = 5)
|
|
43
68
|
@results.sort_by { |r| r.overall || 0.0 }.first(n)
|
|
44
69
|
end
|
|
45
70
|
|
|
71
|
+
# Returns results below the threshold.
|
|
72
|
+
#
|
|
73
|
+
# @param threshold [Float, nil] score threshold (defaults to config default_threshold)
|
|
74
|
+
# @return [Array<Result>]
|
|
46
75
|
def failures(threshold: nil)
|
|
47
76
|
threshold ||= EvalRuby.configuration.default_threshold
|
|
48
77
|
@results.select { |r| (r.overall || 0.0) < threshold }
|
|
49
78
|
end
|
|
50
79
|
|
|
80
|
+
# Exports results to CSV.
|
|
81
|
+
#
|
|
82
|
+
# @param path [String] output file path
|
|
83
|
+
# @return [void]
|
|
51
84
|
def to_csv(path)
|
|
52
85
|
return if @results.empty?
|
|
53
86
|
|
|
@@ -61,6 +94,10 @@ module EvalRuby
|
|
|
61
94
|
end
|
|
62
95
|
end
|
|
63
96
|
|
|
97
|
+
# Exports results to JSON.
|
|
98
|
+
#
|
|
99
|
+
# @param path [String] output file path
|
|
100
|
+
# @return [void]
|
|
64
101
|
def to_json(path)
|
|
65
102
|
data = @results.each_with_index.map do |result, i|
|
|
66
103
|
{index: i, scores: result.scores, overall: result.overall, sample: @samples[i]}
|
data/lib/eval_ruby/result.rb
CHANGED
|
@@ -1,20 +1,46 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module EvalRuby
|
|
4
|
+
# Holds evaluation scores and details for a single sample.
|
|
5
|
+
#
|
|
6
|
+
# @example
|
|
7
|
+
# result = EvalRuby.evaluate(question: "...", answer: "...", context: [...])
|
|
8
|
+
# result.faithfulness # => 0.95
|
|
9
|
+
# result.overall # => 0.87
|
|
4
10
|
class Result
|
|
5
11
|
METRICS = %i[faithfulness relevance correctness context_precision context_recall].freeze
|
|
6
12
|
|
|
7
|
-
|
|
13
|
+
# @return [Hash{Symbol => Float}] metric name to score mapping
|
|
14
|
+
attr_reader :scores
|
|
8
15
|
|
|
16
|
+
# @return [Hash{Symbol => Hash}] metric name to details mapping
|
|
17
|
+
attr_reader :details
|
|
18
|
+
|
|
19
|
+
# @param scores [Hash{Symbol => Float}]
|
|
20
|
+
# @param details [Hash{Symbol => Hash}]
|
|
9
21
|
def initialize(scores: {}, details: {})
|
|
10
22
|
@scores = scores
|
|
11
23
|
@details = details
|
|
12
24
|
end
|
|
13
25
|
|
|
14
26
|
METRICS.each do |metric|
|
|
27
|
+
# @!method faithfulness
|
|
28
|
+
# @return [Float, nil] faithfulness score
|
|
29
|
+
# @!method relevance
|
|
30
|
+
# @return [Float, nil] relevance score
|
|
31
|
+
# @!method correctness
|
|
32
|
+
# @return [Float, nil] correctness score
|
|
33
|
+
# @!method context_precision
|
|
34
|
+
# @return [Float, nil] context precision score
|
|
35
|
+
# @!method context_recall
|
|
36
|
+
# @return [Float, nil] context recall score
|
|
15
37
|
define_method(metric) { @scores[metric] }
|
|
16
38
|
end
|
|
17
39
|
|
|
40
|
+
# Computes a weighted average of all available scores.
|
|
41
|
+
#
|
|
42
|
+
# @param weights [Hash{Symbol => Float}, nil] custom weights per metric
|
|
43
|
+
# @return [Float, nil] weighted average score, or nil if no scores available
|
|
18
44
|
def overall(weights: nil)
|
|
19
45
|
weights ||= METRICS.each_with_object({}) { |m, h| h[m] = 1.0 }
|
|
20
46
|
available = @scores.select { |k, v| weights.key?(k) && v }
|
|
@@ -24,10 +50,12 @@ module EvalRuby
|
|
|
24
50
|
available.sum { |k, v| v * weights[k] } / total_weight
|
|
25
51
|
end
|
|
26
52
|
|
|
53
|
+
# @return [Hash] scores plus overall
|
|
27
54
|
def to_h
|
|
28
55
|
@scores.merge(overall: overall)
|
|
29
56
|
end
|
|
30
57
|
|
|
58
|
+
# @return [String] human-readable summary
|
|
31
59
|
def to_s
|
|
32
60
|
lines = @scores.map { |k, v| " #{k}: #{v&.round(4) || 'N/A'}" }
|
|
33
61
|
lines << " overall: #{overall&.round(4) || 'N/A'}"
|
data/lib/eval_ruby/rspec.rb
CHANGED
|
@@ -4,48 +4,67 @@ require "eval_ruby"
|
|
|
4
4
|
|
|
5
5
|
module EvalRuby
|
|
6
6
|
module RSpecMatchers
|
|
7
|
+
# RSpec matcher that checks if an answer is faithful to the given context.
|
|
8
|
+
#
|
|
9
|
+
# @example
|
|
10
|
+
# expect(answer).to be_faithful_to(context)
|
|
11
|
+
# expect(answer).to be_faithful_to(context).with_threshold(0.9)
|
|
7
12
|
class BeFaithfulTo
|
|
8
|
-
def initialize(context)
|
|
13
|
+
def initialize(context, judge: nil)
|
|
9
14
|
@context = Array(context)
|
|
10
15
|
@threshold = 0.8
|
|
16
|
+
@judge = judge
|
|
11
17
|
end
|
|
12
18
|
|
|
19
|
+
# @param threshold [Float] minimum faithfulness score (0.0 - 1.0)
|
|
20
|
+
# @return [self]
|
|
13
21
|
def with_threshold(threshold)
|
|
14
22
|
@threshold = threshold
|
|
15
23
|
self
|
|
16
24
|
end
|
|
17
25
|
|
|
26
|
+
# @param answer [String] the LLM-generated answer to evaluate
|
|
27
|
+
# @return [Boolean]
|
|
18
28
|
def matches?(answer)
|
|
19
29
|
@answer = answer
|
|
20
|
-
|
|
21
|
-
result = Metrics::Faithfulness.new(judge:
|
|
30
|
+
j = @judge || EvalRuby.send(:default_judge)
|
|
31
|
+
result = Metrics::Faithfulness.new(judge: j).call(answer: answer, context: @context)
|
|
22
32
|
@score = result[:score]
|
|
23
33
|
@score >= @threshold
|
|
24
34
|
end
|
|
25
35
|
|
|
36
|
+
# @return [String]
|
|
26
37
|
def failure_message
|
|
27
38
|
"expected answer to be faithful to context (threshold: #{@threshold}), but got score #{@score.round(4)}"
|
|
28
39
|
end
|
|
29
40
|
|
|
41
|
+
# @return [String]
|
|
30
42
|
def failure_message_when_negated
|
|
31
43
|
"expected answer not to be faithful to context, but got score #{@score.round(4)}"
|
|
32
44
|
end
|
|
33
45
|
end
|
|
34
46
|
|
|
47
|
+
# RSpec matcher that checks precision@k for retrieval results.
|
|
48
|
+
#
|
|
49
|
+
# @example
|
|
50
|
+
# expect(retrieval_result).to have_precision_at_k(5).above(0.8)
|
|
35
51
|
class HavePrecisionAtK
|
|
36
52
|
def initialize(k)
|
|
37
53
|
@k = k
|
|
38
54
|
@threshold = 0.5
|
|
39
55
|
end
|
|
40
56
|
|
|
57
|
+
# @param threshold [Float] minimum precision score (0.0 - 1.0)
|
|
58
|
+
# @return [self]
|
|
41
59
|
def above(threshold)
|
|
42
60
|
@threshold = threshold
|
|
43
61
|
self
|
|
44
62
|
end
|
|
45
63
|
|
|
64
|
+
# @param results [EvalRuby::RetrievalResult]
|
|
65
|
+
# @return [Boolean]
|
|
46
66
|
def matches?(results)
|
|
47
67
|
@results = results
|
|
48
|
-
# results should respond to retrieved and relevant, or be arrays
|
|
49
68
|
if results.is_a?(EvalRuby::RetrievalResult)
|
|
50
69
|
@score = results.precision_at_k(@k)
|
|
51
70
|
else
|
|
@@ -54,17 +73,40 @@ module EvalRuby
|
|
|
54
73
|
@score >= @threshold
|
|
55
74
|
end
|
|
56
75
|
|
|
76
|
+
# @return [String]
|
|
57
77
|
def failure_message
|
|
58
78
|
"expected precision@#{@k} >= #{@threshold}, but got #{@score.round(4)}"
|
|
59
79
|
end
|
|
60
80
|
end
|
|
61
81
|
|
|
62
|
-
|
|
63
|
-
|
|
82
|
+
# @param context [Array<String>, String] context to check faithfulness against
|
|
83
|
+
# @param judge [EvalRuby::Judges::Base, nil] optional judge (uses configured default if nil)
|
|
84
|
+
# @return [BeFaithfulTo]
|
|
85
|
+
def be_faithful_to(context, judge: nil)
|
|
86
|
+
BeFaithfulTo.new(context, judge: judge)
|
|
64
87
|
end
|
|
65
88
|
|
|
89
|
+
# @param k [Integer] number of top results to evaluate
|
|
90
|
+
# @return [HavePrecisionAtK]
|
|
66
91
|
def have_precision_at_k(k)
|
|
67
92
|
HavePrecisionAtK.new(k)
|
|
68
93
|
end
|
|
69
94
|
end
|
|
95
|
+
|
|
96
|
+
class << self
|
|
97
|
+
private
|
|
98
|
+
|
|
99
|
+
# Build a judge from the current configuration.
|
|
100
|
+
# @return [EvalRuby::Judges::Base]
|
|
101
|
+
def default_judge
|
|
102
|
+
case configuration.judge_llm
|
|
103
|
+
when :openai
|
|
104
|
+
Judges::OpenAI.new(configuration)
|
|
105
|
+
when :anthropic
|
|
106
|
+
Judges::Anthropic.new(configuration)
|
|
107
|
+
else
|
|
108
|
+
raise Error, "Unknown judge LLM: #{configuration.judge_llm}"
|
|
109
|
+
end
|
|
110
|
+
end
|
|
111
|
+
end
|
|
70
112
|
end
|
data/lib/eval_ruby/version.rb
CHANGED
data/lib/eval_ruby.rb
CHANGED
|
@@ -21,6 +21,27 @@ require_relative "eval_ruby/report"
|
|
|
21
21
|
require_relative "eval_ruby/dataset"
|
|
22
22
|
require_relative "eval_ruby/comparison"
|
|
23
23
|
|
|
24
|
+
# Evaluation framework for LLM and RAG applications.
|
|
25
|
+
# Measures quality metrics like faithfulness, relevance, context precision,
|
|
26
|
+
# and answer correctness. Think Ragas or DeepEval for Ruby.
|
|
27
|
+
#
|
|
28
|
+
# @example Quick evaluation
|
|
29
|
+
# result = EvalRuby.evaluate(
|
|
30
|
+
# question: "What is Ruby?",
|
|
31
|
+
# answer: "A programming language",
|
|
32
|
+
# context: ["Ruby is a dynamic, open source programming language."],
|
|
33
|
+
# ground_truth: "Ruby is a programming language created by Matz."
|
|
34
|
+
# )
|
|
35
|
+
# puts result.faithfulness # => 0.95
|
|
36
|
+
# puts result.overall # => 0.87
|
|
37
|
+
#
|
|
38
|
+
# @example Retrieval evaluation
|
|
39
|
+
# result = EvalRuby.evaluate_retrieval(
|
|
40
|
+
# question: "What is Ruby?",
|
|
41
|
+
# retrieved: ["doc_a", "doc_b", "doc_c"],
|
|
42
|
+
# relevant: ["doc_a", "doc_c"]
|
|
43
|
+
# )
|
|
44
|
+
# puts result.precision_at_k(3) # => 0.67
|
|
24
45
|
module EvalRuby
|
|
25
46
|
class Error < StandardError; end
|
|
26
47
|
class APIError < Error; end
|
|
@@ -28,18 +49,33 @@ module EvalRuby
|
|
|
28
49
|
class InvalidResponseError < Error; end
|
|
29
50
|
|
|
30
51
|
class << self
|
|
52
|
+
# @return [Configuration] the current configuration
|
|
31
53
|
def configuration
|
|
32
54
|
@configuration ||= Configuration.new
|
|
33
55
|
end
|
|
34
56
|
|
|
57
|
+
# Yields the configuration for modification.
|
|
58
|
+
#
|
|
59
|
+
# @yieldparam config [Configuration]
|
|
60
|
+
# @return [void]
|
|
35
61
|
def configure
|
|
36
62
|
yield(configuration)
|
|
37
63
|
end
|
|
38
64
|
|
|
65
|
+
# Resets configuration to defaults.
|
|
66
|
+
#
|
|
67
|
+
# @return [Configuration]
|
|
39
68
|
def reset_configuration!
|
|
40
69
|
@configuration = Configuration.new
|
|
41
70
|
end
|
|
42
71
|
|
|
72
|
+
# Evaluates an LLM response across multiple quality metrics.
|
|
73
|
+
#
|
|
74
|
+
# @param question [String] the input question
|
|
75
|
+
# @param answer [String] the LLM-generated answer
|
|
76
|
+
# @param context [Array<String>] retrieved context chunks
|
|
77
|
+
# @param ground_truth [String, nil] expected correct answer
|
|
78
|
+
# @return [Result]
|
|
43
79
|
def evaluate(question:, answer:, context: [], ground_truth: nil)
|
|
44
80
|
Evaluator.new.evaluate(
|
|
45
81
|
question: question,
|
|
@@ -49,6 +85,12 @@ module EvalRuby
|
|
|
49
85
|
)
|
|
50
86
|
end
|
|
51
87
|
|
|
88
|
+
# Evaluates retrieval quality using IR metrics.
|
|
89
|
+
#
|
|
90
|
+
# @param question [String] the input question
|
|
91
|
+
# @param retrieved [Array<String>] retrieved document IDs
|
|
92
|
+
# @param relevant [Array<String>] ground-truth relevant document IDs
|
|
93
|
+
# @return [RetrievalResult]
|
|
52
94
|
def evaluate_retrieval(question:, retrieved:, relevant:)
|
|
53
95
|
Evaluator.new.evaluate_retrieval(
|
|
54
96
|
question: question,
|
|
@@ -57,6 +99,11 @@ module EvalRuby
|
|
|
57
99
|
)
|
|
58
100
|
end
|
|
59
101
|
|
|
102
|
+
# Evaluates a batch of samples, optionally running them through a pipeline.
|
|
103
|
+
#
|
|
104
|
+
# @param dataset [Dataset, Array<Hash>] samples to evaluate
|
|
105
|
+
# @param pipeline [#query, nil] optional RAG pipeline to run queries through
|
|
106
|
+
# @return [Report]
|
|
60
107
|
def evaluate_batch(dataset, pipeline: nil)
|
|
61
108
|
samples = dataset.is_a?(Dataset) ? dataset.samples : dataset
|
|
62
109
|
evaluator = Evaluator.new
|
|
@@ -79,6 +126,11 @@ module EvalRuby
|
|
|
79
126
|
Report.new(results: results, samples: samples, duration: Time.now - start_time)
|
|
80
127
|
end
|
|
81
128
|
|
|
129
|
+
# Compares two evaluation reports with statistical significance testing.
|
|
130
|
+
#
|
|
131
|
+
# @param report_a [Report] baseline report
|
|
132
|
+
# @param report_b [Report] comparison report
|
|
133
|
+
# @return [Comparison]
|
|
82
134
|
def compare(report_a, report_b)
|
|
83
135
|
Comparison.new(report_a, report_b)
|
|
84
136
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: eval-ruby
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.2.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Johannes Dwi Cahyo
|
|
@@ -75,6 +75,7 @@ files:
|
|
|
75
75
|
- Gemfile
|
|
76
76
|
- Gemfile.lock
|
|
77
77
|
- LICENSE
|
|
78
|
+
- MILESTONES.md
|
|
78
79
|
- README.md
|
|
79
80
|
- Rakefile
|
|
80
81
|
- eval-ruby.gemspec
|