eval-ruby 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +5 -0
- data/Gemfile.lock +51 -0
- data/LICENSE +21 -0
- data/README.md +121 -0
- data/Rakefile +11 -0
- data/eval-ruby.gemspec +34 -0
- data/lib/eval_ruby/comparison.rb +88 -0
- data/lib/eval_ruby/configuration.rb +17 -0
- data/lib/eval_ruby/dataset.rb +130 -0
- data/lib/eval_ruby/evaluator.rb +94 -0
- data/lib/eval_ruby/judges/anthropic.rb +40 -0
- data/lib/eval_ruby/judges/base.rb +26 -0
- data/lib/eval_ruby/judges/openai.rb +39 -0
- data/lib/eval_ruby/metrics/base.rb +17 -0
- data/lib/eval_ruby/metrics/context_precision.rb +39 -0
- data/lib/eval_ruby/metrics/context_recall.rb +40 -0
- data/lib/eval_ruby/metrics/correctness.rb +68 -0
- data/lib/eval_ruby/metrics/faithfulness.rb +42 -0
- data/lib/eval_ruby/metrics/mrr.rb +14 -0
- data/lib/eval_ruby/metrics/ndcg.rb +22 -0
- data/lib/eval_ruby/metrics/precision_at_k.rb +16 -0
- data/lib/eval_ruby/metrics/recall_at_k.rb +16 -0
- data/lib/eval_ruby/metrics/relevance.rb +37 -0
- data/lib/eval_ruby/minitest.rb +51 -0
- data/lib/eval_ruby/report.rb +82 -0
- data/lib/eval_ruby/result.rb +37 -0
- data/lib/eval_ruby/rspec.rb +70 -0
- data/lib/eval_ruby/version.rb +5 -0
- data/lib/eval_ruby.rb +94 -0
- metadata +128 -0
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module EvalRuby
  module Metrics
    # Abstract superclass for all metrics. Holds the optional LLM judge and
    # defines the #call contract that concrete metrics implement.
    class Base
      # The judge object (or nil) supplied at construction time.
      attr_reader :judge

      # @param judge [#call, nil] optional LLM judge used by LLM-based metrics
      def initialize(judge: nil)
        @judge = judge
      end

      # Abstract entry point; every concrete metric overrides this.
      #
      # @raise [NotImplementedError] always, on the base class
      def call(**_kwargs)
        raise NotImplementedError, "#{self.class}#call must be implemented"
      end
    end
  end
end
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module EvalRuby
  module Metrics
    # LLM-judged retrieval metric: what fraction of the retrieved contexts
    # are actually relevant to the question.
    class ContextPrecision < Base
      PROMPT_TEMPLATE = <<~PROMPT
        Given the following question and a list of retrieved contexts, evaluate
        whether each context is relevant to answering the question.

        Question:
        %{question}

        Contexts:
        %{contexts}

        For each context, determine if it is RELEVANT or NOT RELEVANT to answering the question.

        Respond in JSON: {"evaluations": [{"index": 0, "relevant": true}], "score": 0.0}
        The score should be the proportion of relevant contexts (0.0 to 1.0).
      PROMPT

      # @param question [String] the user question
      # @param context [String, Array<String>] retrieved context passage(s)
      # @return [Hash] {score: Float (0.0..1.0), details: Hash}
      # @raise [Error] when the judge response lacks a "score" key
      def call(question:, context:, **_kwargs)
        passages = Array(context)
        return {score: 0.0, details: {}} if passages.empty?

        numbered = passages.map.with_index { |passage, idx| "[#{idx}] #{passage}" }.join("\n\n")
        response = judge.call(format(PROMPT_TEMPLATE, question: question, contexts: numbered))
        raise Error, "Judge returned invalid response for context_precision" unless response&.key?("score")

        {
          score: response["score"].to_f.clamp(0.0, 1.0),
          details: {evaluations: response["evaluations"] || []}
        }
      end
    end
  end
end
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module EvalRuby
  module Metrics
    # LLM-judged retrieval metric: can each statement of the ground truth be
    # attributed to the retrieved contexts?
    class ContextRecall < Base
      PROMPT_TEMPLATE = <<~PROMPT
        Given the following ground truth answer and retrieved contexts, evaluate
        whether the contexts contain enough information to support the ground truth.

        Ground Truth:
        %{ground_truth}

        Contexts:
        %{contexts}

        For each statement in the ground truth, determine if it can be attributed
        to the retrieved contexts.

        Respond in JSON: {"statements": [{"statement": "...", "attributed": true}], "score": 0.0}
        The score should be the proportion of statements attributed to context (0.0 to 1.0).
      PROMPT

      # @param context [String, Array<String>] retrieved context passage(s)
      # @param ground_truth [String] the reference answer
      # @return [Hash] {score: Float (0.0..1.0), details: Hash}
      # @raise [Error] when the judge response lacks a "score" key
      def call(context:, ground_truth:, **_kwargs)
        passages = Array(context)
        return {score: 0.0, details: {}} if passages.empty?

        numbered = passages.map.with_index { |passage, idx| "[#{idx}] #{passage}" }.join("\n\n")
        response = judge.call(format(PROMPT_TEMPLATE, ground_truth: ground_truth, contexts: numbered))
        raise Error, "Judge returned invalid response for context_recall" unless response&.key?("score")

        {
          score: response["score"].to_f.clamp(0.0, 1.0),
          details: {statements: response["statements"] || []}
        }
      end
    end
  end
end
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module EvalRuby
  module Metrics
    # Correctness of an answer against a ground-truth reference. Uses the
    # LLM judge when one is configured; otherwise falls back to a local
    # token-overlap F1 score.
    class Correctness < Base
      PROMPT_TEMPLATE = <<~PROMPT
        Given the following answer and ground truth, evaluate whether the answer
        is factually correct.

        Answer:
        %{answer}

        Ground Truth:
        %{ground_truth}

        Evaluate correctness on a scale from 0.0 to 1.0 where:
        - 1.0 = the answer is completely correct and matches the ground truth
        - 0.5 = the answer is partially correct
        - 0.0 = the answer is completely wrong

        Consider both semantic meaning and factual accuracy, not just exact string matching.

        Respond in JSON: {"reasoning": "...", "score": 0.0}
      PROMPT

      # @param answer [String] the model answer to grade
      # @param ground_truth [String] the reference answer
      # @return [Hash] {score: Float (0.0..1.0), details: Hash}
      def call(answer:, ground_truth:, **_kwargs)
        if judge
          llm_score(answer, ground_truth)
        else
          string_similarity_score(answer, ground_truth)
        end
      end

      private

      # Ask the configured LLM judge for a graded correctness score.
      # @raise [Error] when the judge response lacks a "score" key
      def llm_score(answer, ground_truth)
        prompt = format(PROMPT_TEMPLATE, answer: answer, ground_truth: ground_truth)

        result = judge.call(prompt)
        raise Error, "Judge returned invalid response for correctness" unless result&.key?("score")

        {
          score: result["score"].to_f.clamp(0.0, 1.0),
          details: {reasoning: result["reasoning"]}
        }
      end

      # Judge-free fallback: token-overlap F1 between answer and ground truth.
      def string_similarity_score(answer, ground_truth)
        answer_tokens = tokenize(answer)
        truth_tokens = tokenize(ground_truth)

        # BUGFIX: the emptiness guard must run BEFORE the exact-match check.
        # Previously two token-less strings (e.g. "!!!" vs "???") both
        # tokenized to [] and were scored 1.0 as an "exact match". With the
        # reordered guards, a side with no word content scores 0.0.
        return {score: 0.0, details: {method: :token_overlap}} if answer_tokens.empty? || truth_tokens.empty?
        return {score: 1.0, details: {method: :exact_match}} if answer_tokens == truth_tokens

        overlap = (answer_tokens & truth_tokens).size
        precision = overlap.to_f / answer_tokens.size
        recall = overlap.to_f / truth_tokens.size
        # Harmonic mean of precision/recall; guard against 0/0.
        f1 = precision + recall > 0 ? 2 * precision * recall / (precision + recall) : 0.0

        {score: f1.clamp(0.0, 1.0), details: {method: :token_overlap, precision: precision, recall: recall}}
      end

      # Lowercased word tokens (runs of \w) of +text+.
      def tokenize(text)
        text.downcase.scan(/\w+/)
      end
    end
  end
end
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module EvalRuby
  module Metrics
    # LLM-judged metric: is every claim in the answer supported by the
    # retrieved context?
    class Faithfulness < Base
      PROMPT_TEMPLATE = <<~PROMPT
        Given the following context and answer, evaluate whether the answer
        is faithful to (supported by) the context.

        For each claim in the answer, determine if it is:
        1. SUPPORTED - directly supported by the context
        2. NOT SUPPORTED - contradicts or is not mentioned in the context

        Context:
        %{context}

        Answer:
        %{answer}

        List each claim and whether it is SUPPORTED or NOT SUPPORTED.
        Then give a faithfulness score from 0.0 to 1.0 where:
        - 1.0 = all claims are supported
        - 0.0 = no claims are supported

        Respond in JSON: {"claims": [{"claim": "...", "supported": true}], "score": 0.0}
      PROMPT

      # @param answer [String] the model answer to check
      # @param context [String, Array<String>] retrieved context passage(s)
      # @return [Hash] {score: Float (0.0..1.0), details: Hash}
      # @raise [Error] when the judge response lacks a "score" key
      def call(answer:, context:, **_kwargs)
        passages = Array(context)
        # With no context at all, nothing can support the answer's claims.
        # Short-circuit to 0.0 (consistent with ContextPrecision and
        # ContextRecall) instead of spending a judge call on an empty
        # context block.
        return {score: 0.0, details: {claims: []}} if passages.empty?

        prompt = format(PROMPT_TEMPLATE, context: passages.join("\n\n"), answer: answer)

        result = judge.call(prompt)
        raise Error, "Judge returned invalid response for faithfulness" unless result&.key?("score")

        {
          score: result["score"].to_f.clamp(0.0, 1.0),
          details: {claims: result["claims"] || []}
        }
      end
    end
  end
end
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module EvalRuby
  module Metrics
    # Mean Reciprocal Rank: 1/rank of the first relevant retrieved document,
    # or 0.0 when no retrieved document is relevant.
    class MRR < Base
      # @param retrieved [Array] documents in ranked order
      # @param relevant [Array] the relevant documents
      # @return [Float] reciprocal rank in (0, 1], or 0.0
      def call(retrieved:, relevant:, **_kwargs)
        first_hit = retrieved.index { |doc| relevant.include?(doc) }
        first_hit.nil? ? 0.0 : 1.0 / (first_hit + 1)
      end
    end
  end
end
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module EvalRuby
  module Metrics
    # Normalized Discounted Cumulative Gain with binary relevance.
    class NDCG < Base
      # @param retrieved [Array] documents in ranked order
      # @param relevant [Array] the relevant documents
      # @param k [Integer, nil] cutoff depth (defaults to all retrieved)
      # @return [Float] nDCG in [0, 1]; 0.0 when the ideal DCG is zero
      def call(retrieved:, relevant:, k: nil, **_kwargs)
        depth = k || retrieved.length
        gains = retrieved.first(depth).map { |doc| relevant.include?(doc) ? 1.0 : 0.0 }

        dcg = gains.each_with_index.sum { |gain, pos| gain / Math.log2(pos + 2) }
        # Ideal DCG: all relevant documents (capped at depth) ranked first.
        idcg = [relevant.length, depth].min.times.sum { |pos| 1.0 / Math.log2(pos + 2) }

        idcg.zero? ? 0.0 : dcg / idcg
      end
    end
  end
end
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module EvalRuby
  module Metrics
    # Precision@k: fraction of the top-k retrieved documents that are relevant.
    class PrecisionAtK < Base
      # @param retrieved [Array] documents in ranked order
      # @param relevant [Array] the relevant documents
      # @param k [Integer, nil] cutoff (defaults to all retrieved)
      # @return [Float] in [0, 1]; 0.0 when nothing was retrieved
      def call(retrieved:, relevant:, k: nil, **_kwargs)
        window = retrieved.first(k || retrieved.length)
        return 0.0 if window.empty?

        window.count { |doc| relevant.include?(doc) }.fdiv(window.size)
      end
    end
  end
end
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module EvalRuby
  module Metrics
    # Recall@k: fraction of all relevant documents found in the top-k retrieved.
    class RecallAtK < Base
      # @param retrieved [Array] documents in ranked order
      # @param relevant [Array] the relevant documents
      # @param k [Integer, nil] cutoff (defaults to all retrieved)
      # @return [Float] in [0, 1]; 0.0 when there are no relevant documents
      def call(retrieved:, relevant:, k: nil, **_kwargs)
        return 0.0 if relevant.empty?

        window = retrieved.first(k || retrieved.length)
        window.count { |doc| relevant.include?(doc) }.fdiv(relevant.size)
      end
    end
  end
end
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module EvalRuby
  module Metrics
    # LLM-judged metric: does the answer actually address the question?
    class Relevance < Base
      PROMPT_TEMPLATE = <<~PROMPT
        Given the following question and answer, evaluate whether the answer
        is relevant to and addresses the question.

        Question:
        %{question}

        Answer:
        %{answer}

        Evaluate relevance on a scale from 0.0 to 1.0 where:
        - 1.0 = the answer fully and directly addresses the question
        - 0.5 = the answer partially addresses the question
        - 0.0 = the answer is completely irrelevant to the question

        Respond in JSON: {"reasoning": "...", "score": 0.0}
      PROMPT

      # @param question [String] the user question
      # @param answer [String] the model answer
      # @return [Hash] {score: Float (0.0..1.0), details: Hash}
      # @raise [Error] when the judge response lacks a "score" key
      def call(question:, answer:, **_kwargs)
        response = judge.call(format(PROMPT_TEMPLATE, question: question, answer: answer))
        raise Error, "Judge returned invalid response for relevance" unless response&.key?("score")

        {
          score: response["score"].to_f.clamp(0.0, 1.0),
          details: {reasoning: response["reasoning"]}
        }
      end
    end
  end
end
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "eval_ruby"
|
|
4
|
+
|
|
5
|
+
module EvalRuby
  # Minitest-style assertion helpers for LLM evaluation. Include into a
  # Minitest::Test subclass; each assertion runs a metric against the
  # configured judge and asserts its score meets a threshold.
  module Assertions
    # Asserts the answer's faithfulness to +context+ is >= +threshold+.
    def assert_faithful(answer, context, threshold: 0.8, message: nil)
      result = eval_metric(:faithfulness, answer: answer, context: Array(context))
      msg = message || "Expected faithfulness >= #{threshold}, got #{result[:score].round(4)}"
      assert result[:score] >= threshold, msg
    end

    # Asserts the answer's relevance to +question+ is >= +threshold+.
    def assert_relevant(question, answer, threshold: 0.8, message: nil)
      result = eval_metric(:relevance, question: question, answer: answer)
      msg = message || "Expected relevance >= #{threshold}, got #{result[:score].round(4)}"
      assert result[:score] >= threshold, msg
    end

    # Asserts the answer's correctness against +ground_truth+ is >= +threshold+.
    def assert_correct(answer, ground_truth:, threshold: 0.7, message: nil)
      result = eval_metric(:correctness, answer: answer, ground_truth: ground_truth)
      msg = message || "Expected correctness >= #{threshold}, got #{result[:score].round(4)}"
      assert result[:score] >= threshold, msg
    end

    # Asserts precision@k of +retrieved+ against +relevant+ is >= +threshold+.
    # Judge-free: uses the local PrecisionAtK metric directly.
    def assert_precision_at_k(retrieved, relevant, k:, threshold: 0.5, message: nil)
      score = Metrics::PrecisionAtK.new.call(retrieved: retrieved, relevant: relevant, k: k)
      msg = message || "Expected precision@#{k} >= #{threshold}, got #{score.round(4)}"
      assert score >= threshold, msg
    end

    # Negative-phrased variant of assert_faithful for readability.
    def refute_hallucination(answer, context, threshold: 0.8, message: nil)
      result = eval_metric(:faithfulness, answer: answer, context: Array(context))
      msg = message || "Expected no hallucination (faithfulness >= #{threshold}), got #{result[:score].round(4)}"
      assert result[:score] >= threshold, msg
    end

    private

    # Resolves +metric_name+ to its metric class, builds the configured judge
    # and runs the metric with +kwargs+.
    #
    # The metric name is validated BEFORE the judge is built so a typo fails
    # fast with a clear error.
    # @raise [ArgumentError] for an unrecognized metric name (previously this
    #   fell through to a confusing NoMethodError on nil)
    def eval_metric(metric_name, **kwargs)
      metric_class = case metric_name
                     when :faithfulness then Metrics::Faithfulness
                     when :relevance then Metrics::Relevance
                     when :correctness then Metrics::Correctness
                     when :context_precision then Metrics::ContextPrecision
                     when :context_recall then Metrics::ContextRecall
                     else raise ArgumentError, "Unknown metric: #{metric_name}"
                     end
      judge = EvalRuby.send(:build_judge)
      metric_class.new(judge: judge).call(**kwargs)
    end
  end
end
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "csv"
|
|
4
|
+
require "json"
|
|
5
|
+
|
|
6
|
+
module EvalRuby
  # Aggregates per-sample evaluation results: summary statistics, worst
  # performers, failure filtering, and CSV/JSON export.
  class Report
    attr_reader :results, :duration, :samples

    # @param results [Array<#scores, #overall>] per-sample result objects
    # @param samples [Array<Hash>] the input samples, aligned by index
    # @param duration [Numeric, nil] wall-clock seconds for the whole run
    def initialize(results:, samples: [], duration: nil)
      @results = results
      @samples = samples
      @duration = duration
    end

    # Human-readable summary: one formatted line per metric plus a footer
    # with sample count and duration.
    def summary
      report_lines = metric_stats.map do |name, stats|
        format("%-20s %.4f (+/- %.4f)", "#{name}:", stats[:mean], stats[:std])
      end
      report_lines << ""
      report_lines << "Total: #{@results.size} samples | Duration: #{format_duration}"
      report_lines.join("\n")
    end

    # Per-metric {mean:, std:, min:, max:, count:} over every result that
    # reported a (non-nil) value for that metric.
    def metric_stats
      return {} if @results.empty?

      metric_names = @results.flat_map { |result| result.scores.keys }.uniq
      metric_names.filter_map do |name|
        values = @results.filter_map { |result| result.scores[name] }
        next if values.empty?

        avg = values.sum.fdiv(values.size)
        # Population standard deviation.
        spread = Math.sqrt(values.sum { |v| (v - avg)**2 }.fdiv(values.size))
        [name, {mean: avg, std: spread, min: values.min, max: values.max, count: values.size}]
      end.to_h
    end

    # The +n+ lowest-scoring results by overall score (nil treated as 0.0).
    def worst(n = 5)
      @results.sort_by { |result| result.overall || 0.0 }.first(n)
    end

    # Results whose overall score falls below +threshold+ (defaults to the
    # globally configured threshold).
    def failures(threshold: nil)
      cutoff = threshold || EvalRuby.configuration.default_threshold
      @results.select { |result| (result.overall || 0.0) < cutoff }
    end

    # Write one CSV row per result: index, each metric score, then overall.
    def to_csv(path)
      return if @results.empty?

      metric_names = @results.flat_map { |result| result.scores.keys }.uniq
      CSV.open(path, "w") do |csv|
        csv << (["sample_index"] + metric_names.map(&:to_s) + ["overall"])
        @results.each_with_index do |result, idx|
          csv << ([idx] + metric_names.map { |name| result.scores[name]&.round(4) } + [result.overall&.round(4)])
        end
      end
    end

    # Write all results (with aligned samples) and summary stats as
    # pretty-printed JSON to +path+.
    def to_json(path)
      payload = @results.each_with_index.map do |result, idx|
        {index: idx, scores: result.scores, overall: result.overall, sample: @samples[idx]}
      end
      File.write(path, JSON.pretty_generate({results: payload, summary: metric_stats}))
    end

    private

    # "N/A", "12.3s", or "2m 5.1s" depending on @duration.
    def format_duration
      return "N/A" unless @duration
      return "#{@duration.round(1)}s" if @duration < 60

      "#{(@duration / 60).floor}m #{(@duration % 60).round(1)}s"
    end
  end
end
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module EvalRuby
  # Value object holding per-metric scores and judge details for a single
  # evaluated sample.
  class Result
    # Known metric names; each one gets a reader method defined below.
    METRICS = %i[faithfulness relevance correctness context_precision context_recall].freeze

    attr_reader :scores, :details

    # @param scores [Hash{Symbol=>Float,nil}] metric name => score
    # @param details [Hash] metric name => judge details
    def initialize(scores: {}, details: {})
      @scores = scores
      @details = details
    end

    # Per-metric readers (e.g. #faithfulness); nil when the metric is absent.
    METRICS.each do |metric|
      define_method(metric) { @scores[metric] }
    end

    # Weighted average of the available (non-nil, weighted) scores.
    #
    # @param weights [Hash{Symbol=>Numeric}, nil] defaults to equal weights
    #   over METRICS
    # @return [Float, nil] nil when no weighted score is available
    def overall(weights: nil)
      weights ||= METRICS.to_h { |metric| [metric, 1.0] }
      scored = @scores.select { |metric, value| weights.key?(metric) && value }
      return nil if scored.empty?

      weight_total = scored.sum { |metric, _| weights[metric] }
      scored.sum { |metric, value| value * weights[metric] } / weight_total
    end

    # Scores plus the computed overall, as a plain hash.
    def to_h
      @scores.merge(overall: overall)
    end

    # Multi-line human-readable rendering of all scores.
    def to_s
      body = @scores.map { |k, v| " #{k}: #{v&.round(4) || 'N/A'}" }
      body << " overall: #{overall&.round(4) || 'N/A'}"
      "EvalRuby::Result\n#{body.join("\n")}"
    end
  end
end
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "eval_ruby"
|
|
4
|
+
|
|
5
|
+
module EvalRuby
  # RSpec matchers for LLM evaluation, e.g.
  #   expect(answer).to be_faithful_to(context).with_threshold(0.9)
  module RSpecMatchers
    # Matcher backing `be_faithful_to`: faithfulness score >= threshold.
    class BeFaithfulTo
      def initialize(context)
        @context = Array(context)
        @threshold = 0.8
      end

      # Chainable threshold override; returns self.
      def with_threshold(threshold)
        @threshold = threshold
        self
      end

      def matches?(answer)
        @answer = answer
        metric = Metrics::Faithfulness.new(judge: EvalRuby.send(:build_judge))
        @score = metric.call(answer: answer, context: @context)[:score]
        @score >= @threshold
      end

      def failure_message
        "expected answer to be faithful to context (threshold: #{@threshold}), but got score #{@score.round(4)}"
      end

      def failure_message_when_negated
        "expected answer not to be faithful to context, but got score #{@score.round(4)}"
      end
    end

    # Matcher backing `have_precision_at_k`: precision@k >= threshold.
    class HavePrecisionAtK
      def initialize(k)
        @k = k
        @threshold = 0.5
      end

      # Chainable threshold override; returns self.
      def above(threshold)
        @threshold = threshold
        self
      end

      def matches?(results)
        @results = results
        # Only RetrievalResult exposes precision_at_k; anything else is a
        # usage error.
        unless results.is_a?(EvalRuby::RetrievalResult)
          raise ArgumentError, "Expected EvalRuby::RetrievalResult or use assert_precision_at_k"
        end

        @score = results.precision_at_k(@k)
        @score >= @threshold
      end

      def failure_message
        "expected precision@#{@k} >= #{@threshold}, but got #{@score.round(4)}"
      end
    end

    # DSL entry point for the faithfulness matcher.
    def be_faithful_to(context)
      BeFaithfulTo.new(context)
    end

    # DSL entry point for the precision@k matcher.
    def have_precision_at_k(k)
      HavePrecisionAtK.new(k)
    end
  end
end
|
data/lib/eval_ruby.rb
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "eval_ruby/version"
|
|
4
|
+
require_relative "eval_ruby/configuration"
|
|
5
|
+
require_relative "eval_ruby/judges/base"
|
|
6
|
+
require_relative "eval_ruby/judges/openai"
|
|
7
|
+
require_relative "eval_ruby/judges/anthropic"
|
|
8
|
+
require_relative "eval_ruby/metrics/base"
|
|
9
|
+
require_relative "eval_ruby/metrics/faithfulness"
|
|
10
|
+
require_relative "eval_ruby/metrics/relevance"
|
|
11
|
+
require_relative "eval_ruby/metrics/correctness"
|
|
12
|
+
require_relative "eval_ruby/metrics/context_precision"
|
|
13
|
+
require_relative "eval_ruby/metrics/context_recall"
|
|
14
|
+
require_relative "eval_ruby/metrics/precision_at_k"
|
|
15
|
+
require_relative "eval_ruby/metrics/recall_at_k"
|
|
16
|
+
require_relative "eval_ruby/metrics/mrr"
|
|
17
|
+
require_relative "eval_ruby/metrics/ndcg"
|
|
18
|
+
require_relative "eval_ruby/result"
|
|
19
|
+
require_relative "eval_ruby/evaluator"
|
|
20
|
+
require_relative "eval_ruby/report"
|
|
21
|
+
require_relative "eval_ruby/dataset"
|
|
22
|
+
require_relative "eval_ruby/comparison"
|
|
23
|
+
|
|
24
|
+
module EvalRuby
  # Raised for configuration problems and invalid judge responses.
  class Error < StandardError; end

  class << self
    # The lazily-built global configuration object.
    def configuration
      @configuration ||= Configuration.new
    end

    # Yields the global configuration for block-style setup.
    def configure
      yield(configuration)
    end

    # Discards the current configuration, restoring defaults.
    def reset_configuration!
      @configuration = Configuration.new
    end

    # Evaluate a single question/answer pair (with optional context and
    # ground truth) through a fresh Evaluator.
    def evaluate(question:, answer:, context: [], ground_truth: nil)
      Evaluator.new.evaluate(
        question: question,
        answer: answer,
        context: context,
        ground_truth: ground_truth
      )
    end

    # Evaluate retrieval quality for one query.
    def evaluate_retrieval(question:, retrieved:, relevant:)
      Evaluator.new.evaluate_retrieval(
        question: question,
        retrieved: retrieved,
        relevant: relevant
      )
    end

    # Evaluate a whole dataset (Dataset instance or array of sample hashes).
    # When +pipeline+ is given, each sample's question is first sent through
    # `pipeline.query` and the response's answer/context are evaluated;
    # otherwise the sample's own answer/context are used.
    #
    # @return [Report] aggregated results with timing
    def evaluate_batch(dataset, pipeline: nil)
      rows = dataset.is_a?(Dataset) ? dataset.samples : dataset
      runner = Evaluator.new
      started_at = Time.now

      results = rows.map do |row|
        if pipeline
          reply = pipeline.query(row[:question])
          runner.evaluate(
            question: row[:question],
            # Duck-typed pipeline response: prefer #text/#context when present.
            answer: reply.respond_to?(:text) ? reply.text : reply.to_s,
            context: reply.respond_to?(:context) ? reply.context : row[:context],
            ground_truth: row[:ground_truth]
          )
        else
          runner.evaluate(**row.slice(:question, :answer, :context, :ground_truth))
        end
      end

      Report.new(results: results, samples: rows, duration: Time.now - started_at)
    end

    # Compare two reports (e.g. before/after a prompt change).
    def compare(report_a, report_b)
      Comparison.new(report_a, report_b)
    end

    private

    # Builds the judge selected by configuration.judge_llm.
    # @raise [Error] for an unrecognized judge identifier
    def build_judge
      case configuration.judge_llm
      when :openai then Judges::OpenAI.new(configuration)
      when :anthropic then Judges::Anthropic.new(configuration)
      else raise Error, "Unknown judge LLM: #{configuration.judge_llm}"
      end
    end
  end
end
|