eval-ruby 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 07e82b10ad871e882d8a5da7a3d85ef7436c3b1695840c2974b22a0df70ae0fc
4
+ data.tar.gz: f483be5c375db41ff367162257186dde52627aa4b9bba4d493dfa1455363d310
5
+ SHA512:
6
+ metadata.gz: b4938e44301b2440500d6506057588fbafa5ca91f6cf574f288690132748ba81f2010c61ba76ad2f536a5d3cb7610442f1db854b08f0a8bf65d4bef7cf3b607c
7
+ data.tar.gz: f76fbe015937d962fb9747ff90a0fc245ae32fc47834644a894d4b69595dd31c30cd621922dca94386e8e737c6be82f726e4b76d039ebd38b8f73d6f548b5ce7
data/Gemfile ADDED
@@ -0,0 +1,5 @@
1
# frozen_string_literal: true

source "https://rubygems.org"

# All runtime and development dependencies are declared in eval-ruby.gemspec.
gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,51 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ eval-ruby (0.1.0)
5
+ csv
6
+
7
+ GEM
8
+ remote: https://rubygems.org/
9
+ specs:
10
+ addressable (2.8.9)
11
+ public_suffix (>= 2.0.2, < 8.0)
12
+ bigdecimal (4.0.1)
13
+ crack (1.0.1)
14
+ bigdecimal
15
+ rexml
16
+ csv (3.3.5)
17
+ hashdiff (1.2.1)
18
+ minitest (5.27.0)
19
+ public_suffix (7.0.5)
20
+ rake (13.3.1)
21
+ rexml (3.4.4)
22
+ webmock (3.26.1)
23
+ addressable (>= 2.8.0)
24
+ crack (>= 0.3.2)
25
+ hashdiff (>= 0.4.0, < 2.0.0)
26
+
27
+ PLATFORMS
28
+ arm64-darwin-24
29
+ ruby
30
+
31
+ DEPENDENCIES
32
+ eval-ruby!
33
+ minitest (~> 5.0)
34
+ rake (~> 13.0)
35
+ webmock (~> 3.0)
36
+
37
+ CHECKSUMS
38
+ addressable (2.8.9) sha256=cc154fcbe689711808a43601dee7b980238ce54368d23e127421753e46895485
39
+ bigdecimal (4.0.1) sha256=8b07d3d065a9f921c80ceaea7c9d4ae596697295b584c296fe599dd0ad01c4a7
40
+ crack (1.0.1) sha256=ff4a10390cd31d66440b7524eb1841874db86201d5b70032028553130b6d4c7e
41
+ csv (3.3.5) sha256=6e5134ac3383ef728b7f02725d9872934f523cb40b961479f69cf3afa6c8e73f
42
+ eval-ruby (0.1.0)
43
+ hashdiff (1.2.1) sha256=9c079dbc513dfc8833ab59c0c2d8f230fa28499cc5efb4b8dd276cf931457cd1
44
+ minitest (5.27.0) sha256=2d3b17f8a36fe7801c1adcffdbc38233b938eb0b4966e97a6739055a45fa77d5
45
+ public_suffix (7.0.5) sha256=1a8bb08f1bbea19228d3bed6e5ed908d1cb4f7c2726d18bd9cadf60bc676f623
46
+ rake (13.3.1) sha256=8c9e89d09f66a26a01264e7e3480ec0607f0c497a861ef16063604b1b08eb19c
47
+ rexml (3.4.4) sha256=19e0a2c3425dfbf2d4fc1189747bdb2f849b6c5e74180401b15734bc97b5d142
48
+ webmock (3.26.1) sha256=4f696fb57c90a827c20aadb2d4f9058bbff10f7f043bd0d4c3f58791143b1cd7
49
+
50
+ BUNDLED WITH
51
+ 4.0.4
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Johannes Dwi Cahyo
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,121 @@
1
+ # eval-ruby
2
+
3
+ Evaluation framework for LLM and RAG applications in Ruby. Measures quality metrics like faithfulness, relevance, context precision, and answer correctness.
4
+
5
+ Think [Ragas](https://github.com/explodinggradients/ragas) or [DeepEval](https://github.com/confident-ai/deepeval) for Ruby.
6
+
7
+ ## Installation
8
+
9
+ ```ruby
10
+ gem "eval-ruby"
11
+ ```
12
+
13
+ ## Quick Start
14
+
15
+ ```ruby
16
+ require "eval_ruby"
17
+
18
+ EvalRuby.configure do |config|
19
+ config.judge_llm = :openai # or :anthropic
20
+ config.judge_model = "gpt-4o"
21
+ config.api_key = ENV["OPENAI_API_KEY"]
22
+ end
23
+
24
+ result = EvalRuby.evaluate(
25
+ question: "What is the capital of France?",
26
+ answer: "The capital of France is Paris.",
27
+ context: ["Paris is the capital of France."],
28
+ ground_truth: "Paris"
29
+ )
30
+
31
+ result.faithfulness # => 0.95
32
+ result.relevance # => 0.92
33
+ result.context_precision # => 0.85
34
+ result.correctness # => 0.98
35
+ result.overall # => 0.94
36
+ ```
37
+
38
+ ## Metrics
39
+
40
+ ### LLM-as-Judge
41
+ - **Faithfulness** — Is the answer supported by the context?
42
+ - **Relevance** — Does the answer address the question?
43
+ - **Correctness** — Does the answer match the ground truth?
44
+ - **Context Precision** — Are retrieved contexts relevant?
45
+ - **Context Recall** — Do contexts cover the ground truth?
46
+
47
+ ### Retrieval Metrics
48
+ - **Precision@K** / **Recall@K**
49
+ - **MRR** (Mean Reciprocal Rank)
50
+ - **NDCG** (Normalized Discounted Cumulative Gain)
51
+ - **Hit Rate**
52
+
53
+ ## Retrieval Evaluation
54
+
55
+ ```ruby
56
+ result = EvalRuby.evaluate_retrieval(
57
+ question: "What is Ruby?",
58
+ retrieved: ["Ruby is...", "Python is...", "Java is..."],
59
+ relevant: ["Ruby is..."]
60
+ )
61
+
62
+ result.precision_at_k(1) # => 1.0
63
+ result.mrr # => 1.0
64
+ result.ndcg # => 0.63
65
+ ```
66
+
67
+ ## Batch Evaluation
68
+
69
+ ```ruby
70
+ report = EvalRuby.evaluate_batch(dataset)
71
+ report.summary
72
+ report.worst(5)
73
+ report.failures(threshold: 0.8)
74
+ report.to_csv("results.csv")
75
+ ```
76
+
77
+ ## Test Integration
78
+
79
+ ### Minitest
80
+
81
+ ```ruby
82
+ require "eval_ruby/minitest"
83
+
84
+ class TestRAG < Minitest::Test
85
+ include EvalRuby::Assertions
86
+
87
+ def test_faithfulness
88
+ assert_faithful answer, context, threshold: 0.8
89
+ end
90
+
91
+ def test_no_hallucination
92
+ refute_hallucination answer, context
93
+ end
94
+ end
95
+ ```
96
+
97
+ ### RSpec
98
+
99
+ ```ruby
100
+ require "eval_ruby/rspec"
101
+
102
+ RSpec.describe "RAG" do
103
+ include EvalRuby::RSpecMatchers
104
+
105
+ it "produces faithful answers" do
106
+ expect(answer).to be_faithful_to(context).with_threshold(0.8)
107
+ end
108
+ end
109
+ ```
110
+
111
+ ## A/B Comparison
112
+
113
+ ```ruby
114
+ comparison = EvalRuby.compare(report_a, report_b)
115
+ comparison.summary
116
+ comparison.significant_improvements # => [:faithfulness, :context_precision]
117
+ ```
118
+
119
+ ## License
120
+
121
+ MIT
data/Rakefile ADDED
@@ -0,0 +1,11 @@
1
# frozen_string_literal: true

require "rake/testtask"

# Minitest run: load paths include test/ and lib/, and test files follow
# the test/**/test_*.rb naming convention.
Rake::TestTask.new(:test) do |t|
  t.libs << "test"
  t.libs << "lib"
  t.test_files = FileList["test/**/test_*.rb"]
end

# `rake` with no arguments runs the test suite.
task default: :test
data/eval-ruby.gemspec ADDED
@@ -0,0 +1,34 @@
1
# frozen_string_literal: true

require_relative "lib/eval_ruby/version"

Gem::Specification.new do |spec|
  spec.name = "eval-ruby"
  spec.version = EvalRuby::VERSION
  spec.authors = ["Johannes Dwi Cahyo"]
  spec.homepage = "https://github.com/johannesdwicahyo/eval-ruby"
  spec.summary = "Evaluation framework for LLM and RAG applications in Ruby"
  spec.description = "Measures quality metrics like faithfulness, relevance, context precision, " \
                     "and answer correctness for LLM and RAG applications. " \
                     "Think Ragas or DeepEval for Ruby."
  spec.license = "MIT"
  spec.required_ruby_version = ">= 3.1.0"

  spec.metadata["homepage_uri"] = spec.homepage
  spec.metadata["source_code_uri"] = spec.homepage
  spec.metadata["changelog_uri"] = "#{spec.homepage}/blob/main/CHANGELOG.md"

  # Package every git-tracked file except this gemspec itself and the
  # test/spec/example trees (development-only content).
  spec.files = Dir.chdir(__dir__) do
    `git ls-files -z`.split("\x0").reject do |f|
      (File.expand_path(f) == __FILE__) ||
        f.start_with?("test/", "spec/", "examples/", ".git")
    end
  end
  spec.require_paths = ["lib"]

  # csv is no longer a default gem from Ruby 3.4, so declare it explicitly.
  spec.add_dependency "csv"

  spec.add_development_dependency "minitest", "~> 5.0"
  spec.add_development_dependency "rake", "~> 13.0"
  spec.add_development_dependency "webmock", "~> 3.0"
end
@@ -0,0 +1,88 @@
1
# frozen_string_literal: true

module EvalRuby
  # Compares two evaluation reports metric-by-metric, attaching a paired
  # t-statistic and p-value to each mean delta so A/B differences can be
  # screened for statistical significance.
  class Comparison
    attr_reader :report_a, :report_b

    # report_a / report_b must respond to #metric_stats (Hash of
    # metric => {mean: Float, ...}) and #results (objects exposing #scores).
    def initialize(report_a, report_b)
      @report_a = report_a
      @report_b = report_b
    end

    # Renders a fixed-width text table with one row per metric present in
    # both reports: mean A, mean B, their delta, and the p-value followed
    # by a star marker at conventional significance levels.
    def summary
      header = format("%-20s | %-10s | %-10s | %-8s | %s", "Metric", "A", "B", "Delta", "p-value")
      rows = [header, "-" * 70]

      all_metrics.each do |metric|
        a_stats = @report_a.metric_stats[metric]
        b_stats = @report_b.metric_stats[metric]
        next unless a_stats && b_stats

        test = paired_t_test(scores_for(@report_a, metric), scores_for(@report_b, metric))
        rows << format(
          "%-20s | %-10.4f | %-10.4f | %+.4f | %.4f %s",
          metric, a_stats[:mean], b_stats[:mean],
          b_stats[:mean] - a_stats[:mean],
          test[:p_value], significance_marker(test[:p_value])
        )
      end

      rows.join("\n")
    end

    # Metrics where report B's mean beats report A's AND the paired t-test
    # p-value falls below alpha.
    def significant_improvements(alpha: 0.05)
      all_metrics.select do |metric|
        a_scores = scores_for(@report_a, metric)
        b_scores = scores_for(@report_b, metric)
        next false if a_scores.empty? || b_scores.empty?

        test = paired_t_test(a_scores, b_scores)
        improved = (b_scores.sum / b_scores.size.to_f) > (a_scores.sum / a_scores.size.to_f)
        test[:p_value] < alpha && improved
      end
    end

    private

    # Union of metric names from both reports, report A's ordering first.
    def all_metrics
      (@report_a.metric_stats.keys + @report_b.metric_stats.keys).uniq
    end

    # Per-sample scores for one metric, skipping samples that lack it.
    def scores_for(report, metric)
      report.results.filter_map { |r| r.scores[metric] }
    end

    # Paired t-test over the first min(length) score pairs.
    # NOTE(review): the p-value is computed from the normal CDF rather than
    # Student's t-distribution, so it is anti-conservative for small n —
    # confirm this approximation is acceptable for typical dataset sizes.
    def paired_t_test(scores_a, scores_b)
      n = [scores_a.length, scores_b.length].min
      return {t_stat: 0.0, p_value: 1.0, significant: false} if n < 2

      deltas = scores_a.first(n).zip(scores_b.first(n)).map { |a, b| a - b }
      mean = deltas.sum / n.to_f
      spread = Math.sqrt(deltas.sum { |d| (d - mean)**2 } / (n - 1).to_f)
      return {t_stat: 0.0, p_value: 1.0, significant: false} if spread.zero?

      t = mean / (spread / Math.sqrt(n))
      p = 2 * (1 - normal_cdf(t.abs))
      {t_stat: t, p_value: p, significant: p < 0.05}
    end

    # Standard normal cumulative distribution function.
    def normal_cdf(x)
      0.5 * (1 + Math.erf(x / Math.sqrt(2)))
    end

    # "***", "**", "*" or "" for p < 0.001 / 0.01 / 0.05 respectively.
    def significance_marker(p_value)
      case
      when p_value < 0.001 then "***"
      when p_value < 0.01 then "**"
      when p_value < 0.05 then "*"
      else ""
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
# frozen_string_literal: true

module EvalRuby
  # Runtime settings for the evaluation framework.
  #
  #   judge_llm         - provider symbol, :openai or :anthropic (default :openai)
  #   judge_model       - model identifier sent to the provider (default "gpt-4o")
  #   api_key           - provider API key; nil until configured
  #   default_threshold - 0.7; presumably the pass/fail cutoff for assertions — not referenced in code shown
  #   timeout           - HTTP read timeout in seconds (default 30)
  #   max_retries       - 3; presumably a retry budget for judge calls — not referenced in code shown
  class Configuration
    attr_accessor :judge_llm, :judge_model, :api_key, :default_threshold,
                  :timeout, :max_retries

    def initialize
      self.judge_llm = :openai
      self.judge_model = "gpt-4o"
      self.api_key = nil
      self.default_threshold = 0.7
      self.timeout = 30
      self.max_retries = 3
    end
  end
end
@@ -0,0 +1,130 @@
1
# frozen_string_literal: true

require "csv"
require "json"

module EvalRuby
  # An ordered collection of evaluation samples — hashes with :question,
  # :answer, :context and :ground_truth keys — with CSV/JSON import and
  # export plus LLM-backed synthetic generation.
  class Dataset
    include Enumerable

    attr_reader :name, :samples

    def initialize(name = "default")
      @name = name
      @samples = []
    end

    # Appends one sample and returns self so calls can be chained.
    # When context: is empty, relevant_contexts: is used in its place.
    def add(question:, ground_truth: nil, relevant_contexts: [], answer: nil, context: [])
      chosen_context = context.empty? ? relevant_contexts : context
      @samples << {
        question: question,
        answer: answer,
        context: chosen_context,
        ground_truth: ground_truth
      }
      self
    end

    def each(&block)
      @samples.each(&block)
    end

    def size
      @samples.size
    end

    def [](index)
      @samples[index]
    end

    # Loads samples from a headered CSV with question / answer / context /
    # ground_truth columns; a context cell may hold a JSON-encoded array.
    def self.from_csv(path)
      new(File.basename(path, ".*")).tap do |dataset|
        CSV.foreach(path, headers: true) do |row|
          dataset.add(
            question: row["question"],
            answer: row["answer"],
            context: parse_array_field(row["context"]),
            ground_truth: row["ground_truth"]
          )
        end
      end
    end

    # Loads samples from JSON: either a top-level array of sample objects
    # or an object carrying them under "samples" (or "data").
    def self.from_json(path)
      dataset = new(File.basename(path, ".*"))
      parsed = JSON.parse(File.read(path))
      rows = parsed.is_a?(Array) ? parsed : parsed["samples"] || parsed["data"] || []
      rows.each do |row|
        dataset.add(
          question: row["question"],
          answer: row["answer"],
          context: Array(row["context"]),
          ground_truth: row["ground_truth"]
        )
      end
      dataset
    end

    # Writes all samples to CSV; context arrays are serialized as JSON
    # strings so from_csv can round-trip them.
    def to_csv(path)
      CSV.open(path, "w") do |csv|
        csv << %w[question answer context ground_truth]
        each do |sample|
          csv << [
            sample[:question],
            sample[:answer],
            JSON.generate(sample[:context]),
            sample[:ground_truth]
          ]
        end
      end
    end

    # Writes the dataset as pretty-printed JSON to path.
    # NOTE(review): this shadows Object#to_json with a different signature.
    def to_json(path)
      File.write(path, JSON.pretty_generate({name: @name, samples: @samples}))
    end

    # Uses a judge LLM to synthesize question/answer pairs from each
    # document file. Makes network calls; ground_truth is set to the
    # generated answer, and documents whose reply lacks "pairs" are skipped.
    def self.generate(documents:, questions_per_doc: 5, llm: :openai)
      config = EvalRuby.configuration.dup
      config.judge_llm = llm
      judge =
        case llm
        when :openai then Judges::OpenAI.new(config)
        when :anthropic then Judges::Anthropic.new(config)
        else raise Error, "Unknown LLM: #{llm}"
        end

      dataset = new("generated")
      documents.each do |doc_path|
        content = File.read(doc_path)
        prompt = <<~PROMPT
          Given the following document, generate #{questions_per_doc} question-answer pairs
          that can be answered using the document content.

          Document:
          #{content}

          Respond in JSON: {"pairs": [{"question": "...", "answer": "...", "context": "relevant excerpt"}]}
        PROMPT

        reply = judge.call(prompt)
        next unless reply&.key?("pairs")

        reply["pairs"].each do |pair|
          dataset.add(
            question: pair["question"],
            answer: pair["answer"],
            context: [pair["context"] || content],
            ground_truth: pair["answer"]
          )
        end
      end
      dataset
    end

    # Parses a CSV cell that may contain a JSON array; a bare string falls
    # back to a one-element array, and nil/empty becomes [].
    private_class_method def self.parse_array_field(value)
      return [] if value.nil? || value.empty?

      JSON.parse(value)
    rescue JSON::ParserError
      [value]
    end
  end
end
@@ -0,0 +1,94 @@
1
# frozen_string_literal: true

module EvalRuby
  # Runs the LLM-as-judge metric suite over a single sample and bundles
  # the per-metric scores and details into a Result.
  class Evaluator
    def initialize(config = EvalRuby.configuration)
      @config = config
      @judge = build_judge(config)
    end

    # Scores one question/answer pair. The ground-truth-dependent metrics
    # (correctness, context recall) run only when ground_truth is given.
    def evaluate(question:, answer:, context: [], ground_truth: nil)
      scores = {}
      details = {}
      record = lambda do |name, outcome|
        scores[name] = outcome[:score]
        details[name] = outcome[:details]
      end

      record.call(:faithfulness,
                  Metrics::Faithfulness.new(judge: @judge).call(answer: answer, context: context))
      record.call(:relevance,
                  Metrics::Relevance.new(judge: @judge).call(question: question, answer: answer))
      record.call(:context_precision,
                  Metrics::ContextPrecision.new(judge: @judge).call(question: question, context: context))

      if ground_truth
        record.call(:correctness,
                    Metrics::Correctness.new(judge: @judge).call(answer: answer, ground_truth: ground_truth))
        record.call(:context_recall,
                    Metrics::ContextRecall.new(judge: @judge).call(context: context, ground_truth: ground_truth))
      end

      Result.new(scores: scores, details: details)
    end

    # question: is accepted for interface symmetry with #evaluate but is
    # not consumed by the retrieval metrics themselves.
    def evaluate_retrieval(question:, retrieved:, relevant:)
      RetrievalResult.new(retrieved: retrieved, relevant: relevant)
    end

    private

    # Instantiates the judge client for the configured provider.
    def build_judge(config)
      case config.judge_llm
      when :openai then Judges::OpenAI.new(config)
      when :anthropic then Judges::Anthropic.new(config)
      else raise Error, "Unknown judge LLM: #{config.judge_llm}"
      end
    end
  end

  # Lazily computes classic IR metrics over one retrieved/relevant pair;
  # each accessor delegates to the corresponding Metrics class on demand.
  class RetrievalResult
    def initialize(retrieved:, relevant:)
      @retrieved = retrieved
      @relevant = relevant
    end

    def precision_at_k(k)
      Metrics::PrecisionAtK.new.call(retrieved: @retrieved, relevant: @relevant, k: k)
    end

    def recall_at_k(k)
      Metrics::RecallAtK.new.call(retrieved: @retrieved, relevant: @relevant, k: k)
    end

    def mrr
      Metrics::MRR.new.call(retrieved: @retrieved, relevant: @relevant)
    end

    def ndcg(k: nil)
      Metrics::NDCG.new.call(retrieved: @retrieved, relevant: @relevant, k: k)
    end

    # 1.0 when any retrieved document appears in the relevant set, else 0.0.
    def hit_rate
      hit = @retrieved.any? { |doc| @relevant.include?(doc) }
      hit ? 1.0 : 0.0
    end

    # All metrics evaluated at K = number of retrieved documents.
    def to_h
      k = @retrieved.length
      {
        precision_at_k: precision_at_k(k),
        recall_at_k: recall_at_k(k),
        mrr: mrr,
        ndcg: ndcg,
        hit_rate: hit_rate
      }
    end
  end
end
@@ -0,0 +1,40 @@
1
# frozen_string_literal: true

require "net/http"
require "json"
require "uri"

module EvalRuby
  module Judges
    # Judge backed by the Anthropic Messages API. Sends the prompt as a
    # single user message at temperature 0 and parses the JSON object out
    # of the reply text.
    class Anthropic < Base
      API_URL = "https://api.anthropic.com/v1/messages"

      # Returns the Hash parsed from the model reply, or nil when no JSON
      # object could be extracted. Raises EvalRuby::Error on any non-2xx
      # HTTP response.
      def call(prompt)
        uri = URI(API_URL)
        payload = {
          model: @config.judge_model,
          max_tokens: 4096,
          messages: [{role: "user", content: prompt}],
          temperature: 0.0
        }

        request = Net::HTTP::Post.new(uri)
        request["x-api-key"] = @config.api_key
        request["anthropic-version"] = "2023-06-01"
        request["Content-Type"] = "application/json"
        request.body = JSON.generate(payload)

        response = Net::HTTP.start(uri.hostname, uri.port, use_ssl: true,
                                   read_timeout: @config.timeout) do |http|
          http.request(request)
        end

        unless response.is_a?(Net::HTTPSuccess)
          raise Error, "Anthropic API error: #{response.code} - #{response.body}"
        end

        reply = JSON.parse(response.body)
        parse_json_response(reply.dig("content", 0, "text"))
      end
    end
  end
end
@@ -0,0 +1,26 @@
1
# frozen_string_literal: true

require "json"

module EvalRuby
  module Judges
    # Abstract judge client. Concrete subclasses implement #call(prompt)
    # and return the Hash parsed from the model's reply.
    class Base
      def initialize(config)
        @config = config
      end

      # Subclass hook: send the prompt to the provider and return the
      # parsed reply Hash (or nil when no JSON could be extracted).
      def call(prompt)
        raise NotImplementedError, "#{self.class}#call must be implemented"
      end

      private

      # Extracts and parses the first complete JSON object embedded in the
      # model's reply text. Returns nil when text is nil, contains no
      # object, or the candidate fails to parse.
      #
      # Fix: the previous regex (\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}) only
      # matched objects nested at most two levels deep, so deeper replies
      # like {"a": {"b": {"c": 1}}} were silently dropped; it also raised
      # NoMethodError on a nil reply. This version scans with a string- and
      # escape-aware brace counter to find the full balanced object.
      def parse_json_response(text)
        return nil if text.nil?

        start = text.index("{")
        return nil unless start

        depth = 0
        in_string = false
        escaped = false

        (start...text.length).each do |i|
          ch = text[i]
          if escaped
            escaped = false
            next
          end

          case ch
          when "\\"
            escaped = true if in_string
          when '"'
            in_string = !in_string
          when "{"
            depth += 1 unless in_string
          when "}"
            unless in_string
              depth -= 1
              if depth.zero?
                begin
                  return JSON.parse(text[start..i])
                rescue JSON::ParserError
                  return nil
                end
              end
            end
          end
        end

        # Braces never balanced — no complete object present.
        nil
      end
    end
  end
end
@@ -0,0 +1,39 @@
1
# frozen_string_literal: true

require "net/http"
require "json"
require "uri"

module EvalRuby
  module Judges
    # Judge backed by the OpenAI Chat Completions API. Requests JSON-mode
    # output at temperature 0 and parses the JSON object from the reply.
    class OpenAI < Base
      API_URL = "https://api.openai.com/v1/chat/completions"

      # Returns the Hash parsed from the model reply, or nil when no JSON
      # object could be extracted. Raises EvalRuby::Error on any non-2xx
      # HTTP response.
      def call(prompt)
        uri = URI(API_URL)
        payload = {
          model: @config.judge_model,
          messages: [{role: "user", content: prompt}],
          temperature: 0.0,
          response_format: {type: "json_object"}
        }

        request = Net::HTTP::Post.new(uri)
        request["Authorization"] = "Bearer #{@config.api_key}"
        request["Content-Type"] = "application/json"
        request.body = JSON.generate(payload)

        response = Net::HTTP.start(uri.hostname, uri.port, use_ssl: true,
                                   read_timeout: @config.timeout) do |http|
          http.request(request)
        end

        unless response.is_a?(Net::HTTPSuccess)
          raise Error, "OpenAI API error: #{response.code} - #{response.body}"
        end

        reply = JSON.parse(response.body)
        parse_json_response(reply.dig("choices", 0, "message", "content"))
      end
    end
  end
end