eval-ruby 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
module EvalRuby
  module Metrics
    # Abstract superclass for all evaluation metrics.
    #
    # Holds an optional LLM judge; LLM-backed metrics use it to score
    # prompts, while purely statistical metrics simply ignore it.
    class Base
      # The judge object (nil for judge-free metrics).
      attr_reader :judge

      # @param judge [Object, nil] presumably responds to #call(prompt) —
      #   TODO confirm against Judges::Base
      def initialize(judge: nil)
        @judge = judge
      end

      # Subclasses override this with their scoring logic.
      #
      # @raise [NotImplementedError] always, on the base class
      def call(**_options)
        raise NotImplementedError, "#{self.class}#call must be implemented"
      end
    end
  end
end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
module EvalRuby
  module Metrics
    # LLM-judged proportion of retrieved contexts that are relevant to
    # the question (score in 0.0..1.0).
    class ContextPrecision < Base
      PROMPT_TEMPLATE = <<~PROMPT
        Given the following question and a list of retrieved contexts, evaluate
        whether each context is relevant to answering the question.

        Question:
        %{question}

        Contexts:
        %{contexts}

        For each context, determine if it is RELEVANT or NOT RELEVANT to answering the question.

        Respond in JSON: {"evaluations": [{"index": 0, "relevant": true}], "score": 0.0}
        The score should be the proportion of relevant contexts (0.0 to 1.0).
      PROMPT

      # @param question [String]
      # @param context [String, Array<String>] one or many retrieved chunks
      # @return [Hash] {score: Float (0..1), details: {evaluations: Array}}
      # @raise [Error] when the judge response has no "score" key
      def call(question:, context:, **_kwargs)
        context_list = Array(context)
        # No contexts means nothing could have been relevant.
        return {score: 0.0, details: {}} if context_list.empty?

        numbered = context_list.map.with_index { |chunk, idx| "[#{idx}] #{chunk}" }
        prompt = format(PROMPT_TEMPLATE, question: question, contexts: numbered.join("\n\n"))

        response = judge.call(prompt)
        unless response&.key?("score")
          raise Error, "Judge returned invalid response for context_precision"
        end

        clamped = response["score"].to_f.clamp(0.0, 1.0)
        {score: clamped, details: {evaluations: response["evaluations"] || []}}
      end
    end
  end
end
@@ -0,0 +1,40 @@
1
+ # frozen_string_literal: true
2
+
3
module EvalRuby
  module Metrics
    # LLM-judged proportion of ground-truth statements that can be
    # attributed to the retrieved contexts (score in 0.0..1.0).
    class ContextRecall < Base
      PROMPT_TEMPLATE = <<~PROMPT
        Given the following ground truth answer and retrieved contexts, evaluate
        whether the contexts contain enough information to support the ground truth.

        Ground Truth:
        %{ground_truth}

        Contexts:
        %{contexts}

        For each statement in the ground truth, determine if it can be attributed
        to the retrieved contexts.

        Respond in JSON: {"statements": [{"statement": "...", "attributed": true}], "score": 0.0}
        The score should be the proportion of statements attributed to context (0.0 to 1.0).
      PROMPT

      # @param context [String, Array<String>] one or many retrieved chunks
      # @param ground_truth [String] reference answer
      # @return [Hash] {score: Float (0..1), details: {statements: Array}}
      # @raise [Error] when the judge response has no "score" key
      def call(context:, ground_truth:, **_kwargs)
        context_list = Array(context)
        # No contexts means nothing in the ground truth can be supported.
        return {score: 0.0, details: {}} if context_list.empty?

        numbered = context_list.map.with_index { |chunk, idx| "[#{idx}] #{chunk}" }
        prompt = format(PROMPT_TEMPLATE, ground_truth: ground_truth, contexts: numbered.join("\n\n"))

        response = judge.call(prompt)
        unless response&.key?("score")
          raise Error, "Judge returned invalid response for context_recall"
        end

        clamped = response["score"].to_f.clamp(0.0, 1.0)
        {score: clamped, details: {statements: response["statements"] || []}}
      end
    end
  end
end
@@ -0,0 +1,68 @@
1
+ # frozen_string_literal: true
2
+
3
module EvalRuby
  module Metrics
    # Scores whether an answer matches a ground-truth reference.
    #
    # With an LLM judge the comparison is semantic; without one it falls
    # back to a token-overlap F1 score.
    class Correctness < Base
      PROMPT_TEMPLATE = <<~PROMPT
        Given the following answer and ground truth, evaluate whether the answer
        is factually correct.

        Answer:
        %{answer}

        Ground Truth:
        %{ground_truth}

        Evaluate correctness on a scale from 0.0 to 1.0 where:
        - 1.0 = the answer is completely correct and matches the ground truth
        - 0.5 = the answer is partially correct
        - 0.0 = the answer is completely wrong

        Consider both semantic meaning and factual accuracy, not just exact string matching.

        Respond in JSON: {"reasoning": "...", "score": 0.0}
      PROMPT

      # @param answer [String, nil] model answer under evaluation
      # @param ground_truth [String, nil] reference answer
      # @return [Hash] {score: Float (0..1), details: Hash}
      def call(answer:, ground_truth:, **_kwargs)
        if judge
          llm_score(answer, ground_truth)
        else
          string_similarity_score(answer, ground_truth)
        end
      end

      private

      # Asks the judge LLM to grade the answer.
      # @raise [Error] when the judge response has no "score" key
      def llm_score(answer, ground_truth)
        prompt = format(PROMPT_TEMPLATE, answer: answer, ground_truth: ground_truth)

        result = judge.call(prompt)
        raise Error, "Judge returned invalid response for correctness" unless result&.key?("score")

        {
          score: result["score"].to_f.clamp(0.0, 1.0),
          details: {reasoning: result["reasoning"]}
        }
      end

      # Judge-free fallback: token-overlap F1 between answer and truth.
      # Identical token sequences short-circuit to an exact match of 1.0.
      def string_similarity_score(answer, ground_truth)
        answer_tokens = tokenize(answer)
        truth_tokens = tokenize(ground_truth)

        return {score: 1.0, details: {method: :exact_match}} if answer_tokens == truth_tokens
        return {score: 0.0, details: {method: :token_overlap}} if answer_tokens.empty? || truth_tokens.empty?

        overlap = (answer_tokens & truth_tokens).size
        precision = overlap.to_f / answer_tokens.size
        recall = overlap.to_f / truth_tokens.size
        f1 = precision + recall > 0 ? 2 * precision * recall / (precision + recall) : 0.0

        {score: f1.clamp(0.0, 1.0), details: {method: :token_overlap, precision: precision, recall: recall}}
      end

      # Lowercased word tokens. Nil-safe via to_s so a sample with a
      # missing answer/ground_truth scores instead of raising
      # NoMethodError on nil.
      def tokenize(text)
        text.to_s.downcase.scan(/\w+/)
      end
    end
  end
end
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
module EvalRuby
  module Metrics
    # LLM-judged faithfulness: the fraction of claims in the answer that
    # are supported by the retrieved context (score in 0.0..1.0).
    class Faithfulness < Base
      PROMPT_TEMPLATE = <<~PROMPT
        Given the following context and answer, evaluate whether the answer
        is faithful to (supported by) the context.

        For each claim in the answer, determine if it is:
        1. SUPPORTED - directly supported by the context
        2. NOT SUPPORTED - contradicts or is not mentioned in the context

        Context:
        %{context}

        Answer:
        %{answer}

        List each claim and whether it is SUPPORTED or NOT SUPPORTED.
        Then give a faithfulness score from 0.0 to 1.0 where:
        - 1.0 = all claims are supported
        - 0.0 = no claims are supported

        Respond in JSON: {"claims": [{"claim": "...", "supported": true}], "score": 0.0}
      PROMPT

      # @param answer [String] model answer under evaluation
      # @param context [String, Array<String>] one or many retrieved chunks
      # @return [Hash] {score: Float (0..1), details: {claims: Array}}
      # @raise [Error] when the judge response has no "score" key
      def call(answer:, context:, **_kwargs)
        contexts = Array(context)
        # Consistent with ContextPrecision/ContextRecall: with no retrieved
        # context nothing can support the answer, so short-circuit to 0.0
        # instead of sending the judge an empty context block.
        return {score: 0.0, details: {}} if contexts.empty?

        prompt = format(PROMPT_TEMPLATE, context: contexts.join("\n\n"), answer: answer)

        result = judge.call(prompt)
        raise Error, "Judge returned invalid response for faithfulness" unless result&.key?("score")

        {
          score: result["score"].to_f.clamp(0.0, 1.0),
          details: {claims: result["claims"] || []}
        }
      end
    end
  end
end
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
module EvalRuby
  module Metrics
    # Reciprocal rank: 1/rank of the first relevant document in the
    # retrieved list (1-based), or 0.0 when no retrieved doc is relevant.
    class MRR < Base
      # @param retrieved [Array] ranked documents
      # @param relevant [Array] documents considered relevant
      # @return [Float]
      def call(retrieved:, relevant:, **_kwargs)
        first_hit = retrieved.index { |doc| relevant.include?(doc) }
        first_hit ? 1.0 / (first_hit + 1) : 0.0
      end
    end
  end
end
@@ -0,0 +1,22 @@
1
+ # frozen_string_literal: true
2
+
3
module EvalRuby
  module Metrics
    # Normalized Discounted Cumulative Gain with binary relevance
    # (gain 1.0 if the doc is in the relevant set, else 0.0).
    class NDCG < Base
      # @param retrieved [Array] ranked documents
      # @param relevant [Array] documents considered relevant
      # @param k [Integer, nil] cutoff rank; defaults to the full list
      # @return [Float] 0.0 when the ideal DCG is zero
      def call(retrieved:, relevant:, k: nil, **_kwargs)
        cutoff = k || retrieved.length

        dcg = 0.0
        retrieved.first(cutoff).each_with_index do |doc, rank|
          # log2(rank + 2): standard position discount (rank is 0-based).
          dcg += 1.0 / Math.log2(rank + 2) if relevant.include?(doc)
        end

        ideal_hits = [relevant.length, cutoff].min
        idcg = (0...ideal_hits).sum { |rank| 1.0 / Math.log2(rank + 2) }

        idcg.zero? ? 0.0 : dcg / idcg
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
+ # frozen_string_literal: true
2
+
3
module EvalRuby
  module Metrics
    # Precision@k: fraction of the top-k retrieved documents that are
    # in the relevant set.
    class PrecisionAtK < Base
      # @param retrieved [Array] ranked documents
      # @param relevant [Array] documents considered relevant
      # @param k [Integer, nil] cutoff rank; defaults to the full list
      # @return [Float] 0.0 when the top-k window is empty
      def call(retrieved:, relevant:, k: nil, **_kwargs)
        cutoff = k || retrieved.length
        window = retrieved.first(cutoff)
        return 0.0 if window.empty?

        hits = window.count { |doc| relevant.include?(doc) }
        hits.fdiv(window.size)
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
+ # frozen_string_literal: true
2
+
3
module EvalRuby
  module Metrics
    # Recall@k: fraction of the relevant documents that appear within
    # the top-k retrieved documents.
    class RecallAtK < Base
      # @param retrieved [Array] ranked documents
      # @param relevant [Array] documents considered relevant
      # @param k [Integer, nil] cutoff rank; defaults to the full list
      # @return [Float] 0.0 when there are no relevant documents
      def call(retrieved:, relevant:, k: nil, **_kwargs)
        return 0.0 if relevant.empty?

        cutoff = k || retrieved.length
        found = retrieved.first(cutoff).count { |doc| relevant.include?(doc) }
        found.fdiv(relevant.size)
      end
    end
  end
end
@@ -0,0 +1,37 @@
1
+ # frozen_string_literal: true
2
+
3
module EvalRuby
  module Metrics
    # LLM-judged relevance of an answer to its question (score 0.0..1.0).
    class Relevance < Base
      PROMPT_TEMPLATE = <<~PROMPT
        Given the following question and answer, evaluate whether the answer
        is relevant to and addresses the question.

        Question:
        %{question}

        Answer:
        %{answer}

        Evaluate relevance on a scale from 0.0 to 1.0 where:
        - 1.0 = the answer fully and directly addresses the question
        - 0.5 = the answer partially addresses the question
        - 0.0 = the answer is completely irrelevant to the question

        Respond in JSON: {"reasoning": "...", "score": 0.0}
      PROMPT

      # @param question [String]
      # @param answer [String]
      # @return [Hash] {score: Float (0..1), details: {reasoning: String}}
      # @raise [Error] when the judge response has no "score" key
      def call(question:, answer:, **_kwargs)
        prompt = format(PROMPT_TEMPLATE, question: question, answer: answer)

        response = judge.call(prompt)
        unless response&.key?("score")
          raise Error, "Judge returned invalid response for relevance"
        end

        {
          score: response["score"].to_f.clamp(0.0, 1.0),
          details: {reasoning: response["reasoning"]}
        }
      end
    end
  end
end
@@ -0,0 +1,51 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "eval_ruby"
4
+
5
module EvalRuby
  # Minitest-style assertion helpers. Mix into a test case class that
  # provides #assert(condition, message).
  module Assertions
    # Asserts the answer is faithful to the given context(s).
    def assert_faithful(answer, context, threshold: 0.8, message: nil)
      result = eval_metric(:faithfulness, answer: answer, context: Array(context))
      msg = message || "Expected faithfulness >= #{threshold}, got #{result[:score].round(4)}"
      assert result[:score] >= threshold, msg
    end

    # Asserts the answer is relevant to the question.
    def assert_relevant(question, answer, threshold: 0.8, message: nil)
      result = eval_metric(:relevance, question: question, answer: answer)
      msg = message || "Expected relevance >= #{threshold}, got #{result[:score].round(4)}"
      assert result[:score] >= threshold, msg
    end

    # Asserts the answer matches the ground truth.
    def assert_correct(answer, ground_truth:, threshold: 0.7, message: nil)
      result = eval_metric(:correctness, answer: answer, ground_truth: ground_truth)
      msg = message || "Expected correctness >= #{threshold}, got #{result[:score].round(4)}"
      assert result[:score] >= threshold, msg
    end

    # Asserts precision@k over retrieved/relevant document lists
    # (no judge needed — purely statistical).
    def assert_precision_at_k(retrieved, relevant, k:, threshold: 0.5, message: nil)
      score = Metrics::PrecisionAtK.new.call(retrieved: retrieved, relevant: relevant, k: k)
      msg = message || "Expected precision@#{k} >= #{threshold}, got #{score.round(4)}"
      assert score >= threshold, msg
    end

    # Faithfulness check phrased as a hallucination assertion.
    def refute_hallucination(answer, context, threshold: 0.8, message: nil)
      result = eval_metric(:faithfulness, answer: answer, context: Array(context))
      msg = message || "Expected no hallucination (faithfulness >= #{threshold}), got #{result[:score].round(4)}"
      assert result[:score] >= threshold, msg
    end

    private

    # Resolves a metric name, builds the configured judge, and runs the
    # metric. The name is validated first so an unknown metric fails fast
    # with a clear ArgumentError instead of a NoMethodError on nil — and
    # before paying the cost of constructing a judge.
    def eval_metric(metric_name, **kwargs)
      metric_class = case metric_name
                     when :faithfulness then Metrics::Faithfulness
                     when :relevance then Metrics::Relevance
                     when :correctness then Metrics::Correctness
                     when :context_precision then Metrics::ContextPrecision
                     when :context_recall then Metrics::ContextRecall
                     else raise ArgumentError, "Unknown metric: #{metric_name.inspect}"
                     end
      judge = EvalRuby.send(:build_judge)
      metric_class.new(judge: judge).call(**kwargs)
    end
  end
end
@@ -0,0 +1,82 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "csv"
4
+ require "json"
5
+
6
module EvalRuby
  # Aggregates per-sample evaluation results: summary statistics,
  # worst/failing samples, and CSV/JSON export.
  #
  # Each result is expected to respond to #scores (Hash of metric => Float)
  # and #overall (Float or nil).
  class Report
    attr_reader :results, :duration, :samples

    # @param results [Array] per-sample result objects
    # @param samples [Array] original input samples (parallel to results)
    # @param duration [Numeric, nil] wall-clock seconds for the run
    def initialize(results:, samples: [], duration: nil)
      @results = results
      @samples = samples
      @duration = duration
    end

    # Human-readable per-metric mean/std plus a totals line.
    def summary
      stat_lines = metric_stats.map do |metric, stats|
        format("%-20s %.4f (+/- %.4f)", "#{metric}:", stats[:mean], stats[:std])
      end
      stat_lines << ""
      stat_lines << "Total: #{results.size} samples | Duration: #{format_duration}"
      stat_lines.join("\n")
    end

    # Mean, population std, min, max and count per metric, skipping
    # metrics with no recorded values.
    def metric_stats
      return {} if results.empty?

      stats = {}
      metric_names.each do |metric|
        values = results.filter_map { |r| r.scores[metric] }
        next if values.empty?

        mean = values.sum.fdiv(values.size)
        std = Math.sqrt(values.sum { |v| (v - mean)**2 }.fdiv(values.size))
        stats[metric] = {mean: mean, std: std, min: values.min, max: values.max, count: values.size}
      end
      stats
    end

    # The n lowest-scoring results (missing overall treated as 0.0).
    def worst(n = 5)
      results.sort_by { |r| r.overall || 0.0 }.first(n)
    end

    # Results whose overall score falls below the threshold
    # (defaults to the configured threshold).
    def failures(threshold: nil)
      cutoff = threshold || EvalRuby.configuration.default_threshold
      results.select { |r| (r.overall || 0.0) < cutoff }
    end

    # Writes one row per sample with all metric columns plus "overall".
    def to_csv(path)
      return if results.empty?

      headers = metric_names
      CSV.open(path, "w") do |csv|
        csv << ["sample_index"] + headers.map(&:to_s) + ["overall"]
        results.each_with_index do |result, i|
          csv << [i] + headers.map { |m| result.scores[m]&.round(4) } + [result.overall&.round(4)]
        end
      end
    end

    # Writes pretty-printed JSON with per-sample rows and summary stats.
    def to_json(path)
      rows = results.each_with_index.map do |result, i|
        {index: i, scores: result.scores, overall: result.overall, sample: samples[i]}
      end
      File.write(path, JSON.pretty_generate({results: rows, summary: metric_stats}))
    end

    private

    # Union of all metric keys seen across results, in first-seen order.
    def metric_names
      results.flat_map { |r| r.scores.keys }.uniq
    end

    # "N/A" without a duration; seconds under a minute, else "Xm Y.Ys".
    def format_duration
      return "N/A" unless duration

      minutes, seconds = duration.divmod(60)
      minutes.zero? ? "#{seconds.round(1)}s" : "#{minutes.to_i}m #{seconds.round(1)}s"
    end
  end
end
@@ -0,0 +1,37 @@
1
+ # frozen_string_literal: true
2
+
3
module EvalRuby
  # Container for one sample's metric scores and per-metric details.
  class Result
    # Metrics that get generated reader methods and default overall weights.
    METRICS = %i[faithfulness relevance correctness context_precision context_recall].freeze

    attr_reader :scores, :details

    # @param scores [Hash{Symbol=>Float,nil}]
    # @param details [Hash] per-metric detail payloads
    def initialize(scores: {}, details: {})
      @scores = scores
      @details = details
    end

    # Per-metric readers, e.g. result.faithfulness => scores[:faithfulness].
    METRICS.each { |metric| define_method(metric) { @scores[metric] } }

    # Weighted average of the non-nil scores whose metric appears in
    # weights (defaults to equal weights over METRICS). Returns nil when
    # no score is available.
    def overall(weights: nil)
      weights ||= METRICS.to_h { |m| [m, 1.0] }
      scored = @scores.select { |metric, value| weights.key?(metric) && value }
      return nil if scored.empty?

      weight_sum = scored.keys.sum { |metric| weights[metric] }
      weighted = scored.sum { |metric, value| value * weights[metric] }
      weighted / weight_sum
    end

    # Scores plus the computed overall under the :overall key.
    def to_h
      @scores.merge(overall: overall)
    end

    # Multi-line human-readable rendering; nil scores shown as N/A.
    def to_s
      body = @scores.map { |k, v| " #{k}: #{v&.round(4) || 'N/A'}" }
      body << " overall: #{overall&.round(4) || 'N/A'}"
      "EvalRuby::Result\n#{body.join("\n")}"
    end
  end
end
@@ -0,0 +1,70 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "eval_ruby"
4
+
5
module EvalRuby
  # RSpec matcher DSL, e.g.:
  #   expect(answer).to be_faithful_to(context).with_threshold(0.9)
  module RSpecMatchers
    # Matcher that scores an answer's faithfulness against a context.
    class BeFaithfulTo
      def initialize(context)
        @context = Array(context)
        @threshold = 0.8
      end

      # Fluent threshold override; returns self for chaining.
      def with_threshold(threshold)
        @threshold = threshold
        self
      end

      def matches?(answer)
        @answer = answer
        metric = Metrics::Faithfulness.new(judge: EvalRuby.send(:build_judge))
        @score = metric.call(answer: answer, context: @context)[:score]
        @score >= @threshold
      end

      def failure_message
        "expected answer to be faithful to context (threshold: #{@threshold}), but got score #{@score.round(4)}"
      end

      def failure_message_when_negated
        "expected answer not to be faithful to context, but got score #{@score.round(4)}"
      end
    end

    # Matcher that checks precision@k of a RetrievalResult.
    class HavePrecisionAtK
      def initialize(k)
        @k = k
        @threshold = 0.5
      end

      # Fluent threshold override; returns self for chaining.
      def above(threshold)
        @threshold = threshold
        self
      end

      def matches?(results)
        @results = results
        # Only EvalRuby::RetrievalResult knows how to compute precision@k here.
        unless results.is_a?(EvalRuby::RetrievalResult)
          raise ArgumentError, "Expected EvalRuby::RetrievalResult or use assert_precision_at_k"
        end
        @score = results.precision_at_k(@k)
        @score >= @threshold
      end

      def failure_message
        "expected precision@#{@k} >= #{@threshold}, but got #{@score.round(4)}"
      end
    end

    # Matcher factory: expect(answer).to be_faithful_to(context)
    def be_faithful_to(context)
      BeFaithfulTo.new(context)
    end

    # Matcher factory: expect(results).to have_precision_at_k(5).above(0.8)
    def have_precision_at_k(k)
      HavePrecisionAtK.new(k)
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module EvalRuby
  # Gem version, following Semantic Versioning.
  VERSION = "0.1.0"
end
data/lib/eval_ruby.rb ADDED
@@ -0,0 +1,94 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "eval_ruby/version"
4
+ require_relative "eval_ruby/configuration"
5
+ require_relative "eval_ruby/judges/base"
6
+ require_relative "eval_ruby/judges/openai"
7
+ require_relative "eval_ruby/judges/anthropic"
8
+ require_relative "eval_ruby/metrics/base"
9
+ require_relative "eval_ruby/metrics/faithfulness"
10
+ require_relative "eval_ruby/metrics/relevance"
11
+ require_relative "eval_ruby/metrics/correctness"
12
+ require_relative "eval_ruby/metrics/context_precision"
13
+ require_relative "eval_ruby/metrics/context_recall"
14
+ require_relative "eval_ruby/metrics/precision_at_k"
15
+ require_relative "eval_ruby/metrics/recall_at_k"
16
+ require_relative "eval_ruby/metrics/mrr"
17
+ require_relative "eval_ruby/metrics/ndcg"
18
+ require_relative "eval_ruby/result"
19
+ require_relative "eval_ruby/evaluator"
20
+ require_relative "eval_ruby/report"
21
+ require_relative "eval_ruby/dataset"
22
+ require_relative "eval_ruby/comparison"
23
+
24
# Top-level entry points for the gem: configuration management and
# convenience wrappers around Evaluator, Report and Comparison.
module EvalRuby
  # Base error class for the gem; metrics raise this on bad judge output.
  class Error < StandardError; end

  class << self
    # Lazily-built, memoized global configuration.
    def configuration
      @configuration ||= Configuration.new
    end

    # Block-style setup: EvalRuby.configure { |c| ... }
    def configure
      yield(configuration)
    end

    # Discards the memoized configuration (useful in tests).
    def reset_configuration!
      @configuration = Configuration.new
    end

    # Evaluates a single question/answer pair via a fresh Evaluator.
    def evaluate(question:, answer:, context: [], ground_truth: nil)
      Evaluator.new.evaluate(
        question: question,
        answer: answer,
        context: context,
        ground_truth: ground_truth
      )
    end

    # Evaluates retrieval quality for one query via a fresh Evaluator.
    def evaluate_retrieval(question:, retrieved:, relevant:)
      Evaluator.new.evaluate_retrieval(
        question: question,
        retrieved: retrieved,
        relevant: relevant
      )
    end

    # Evaluates every sample in a Dataset or array of sample hashes
    # (assumes symbol keys :question/:answer/:context/:ground_truth —
    # TODO confirm against Dataset). With a pipeline, each question is
    # sent to pipeline.query and the response is duck-typed: #text and
    # #context are used when present, else to_s / the sample's context.
    # @return [Report] including wall-clock duration
    def evaluate_batch(dataset, pipeline: nil)
      samples = dataset.is_a?(Dataset) ? dataset.samples : dataset
      evaluator = Evaluator.new
      start_time = Time.now

      results = samples.map do |sample|
        if pipeline
          response = pipeline.query(sample[:question])
          evaluator.evaluate(
            question: sample[:question],
            answer: response.respond_to?(:text) ? response.text : response.to_s,
            context: response.respond_to?(:context) ? response.context : sample[:context],
            ground_truth: sample[:ground_truth]
          )
        else
          evaluator.evaluate(**sample.slice(:question, :answer, :context, :ground_truth))
        end
      end

      Report.new(results: results, samples: samples, duration: Time.now - start_time)
    end

    # Wraps two reports in a Comparison for A/B analysis.
    def compare(report_a, report_b)
      Comparison.new(report_a, report_b)
    end

    private

    # Instantiates the judge selected by configuration.judge_llm
    # (:openai or :anthropic); raises Error for anything else.
    # Called internally via send from Assertions and RSpecMatchers.
    def build_judge
      config = configuration
      case config.judge_llm
      when :openai then Judges::OpenAI.new(config)
      when :anthropic then Judges::Anthropic.new(config)
      else raise Error, "Unknown judge LLM: #{config.judge_llm}"
      end
    end
  end
end