eval-ruby 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 07e82b10ad871e882d8a5da7a3d85ef7436c3b1695840c2974b22a0df70ae0fc
4
- data.tar.gz: f483be5c375db41ff367162257186dde52627aa4b9bba4d493dfa1455363d310
3
+ metadata.gz: 59b7bd64cf696d82a27cb6330ab948306904f444a8a64476f702ee1937bbccab
4
+ data.tar.gz: f9c0c234f0712d37d309d460d1c3204d3e6f70bfbd8fe4dcff95ac508bcb7f34
5
5
  SHA512:
6
- metadata.gz: b4938e44301b2440500d6506057588fbafa5ca91f6cf574f288690132748ba81f2010c61ba76ad2f536a5d3cb7610442f1db854b08f0a8bf65d4bef7cf3b607c
7
- data.tar.gz: f76fbe015937d962fb9747ff90a0fc245ae32fc47834644a894d4b69595dd31c30cd621922dca94386e8e737c6be82f726e4b76d039ebd38b8f73d6f548b5ce7
6
+ metadata.gz: 6a9f0c12a790b0098ba639bc236c6080c64c7fbb4ad9892c36810e93b88486bebaa42dca9245beaf828beca29e31793030d62ab232cad50107bc99905635a069
7
+ data.tar.gz: 7c922b6fd8743d5241a254baf301aae2af6aba936fdeedbcb467f9549a8f16493c7571ec9724a5460a75228e415cbcaa852b915ef78f23aa8f1fcbddb85d9450
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- eval-ruby (0.1.0)
4
+ eval-ruby (0.1.1)
5
5
  csv
6
6
 
7
7
  GEM
@@ -39,7 +39,7 @@ CHECKSUMS
39
39
  bigdecimal (4.0.1) sha256=8b07d3d065a9f921c80ceaea7c9d4ae596697295b584c296fe599dd0ad01c4a7
40
40
  crack (1.0.1) sha256=ff4a10390cd31d66440b7524eb1841874db86201d5b70032028553130b6d4c7e
41
41
  csv (3.3.5) sha256=6e5134ac3383ef728b7f02725d9872934f523cb40b961479f69cf3afa6c8e73f
42
- eval-ruby (0.1.0)
42
+ eval-ruby (0.1.1)
43
43
  hashdiff (1.2.1) sha256=9c079dbc513dfc8833ab59c0c2d8f230fa28499cc5efb4b8dd276cf931457cd1
44
44
  minitest (5.27.0) sha256=2d3b17f8a36fe7801c1adcffdbc38233b938eb0b4966e97a6739055a45fa77d5
45
45
  public_suffix (7.0.5) sha256=1a8bb08f1bbea19228d3bed6e5ed908d1cb4f7c2726d18bd9cadf60bc676f623
@@ -105,7 +105,7 @@ module EvalRuby
105
105
  PROMPT
106
106
 
107
107
  result = judge.call(prompt)
108
- next unless result&.key?("pairs")
108
+ next unless result.is_a?(Hash) && result.key?("pairs")
109
109
 
110
110
  result["pairs"].each do |pair|
111
111
  dataset.add(
@@ -9,31 +9,46 @@ module EvalRuby
9
9
  class Anthropic < Base
10
10
  API_URL = "https://api.anthropic.com/v1/messages"
11
11
 
12
+ def initialize(config)
13
+ super
14
+ raise EvalRuby::Error, "API key is required. Set via EvalRuby.configure { |c| c.api_key = '...' }" if @config.api_key.nil? || @config.api_key.empty?
15
+ end
16
+
12
17
  def call(prompt)
13
- uri = URI(API_URL)
14
- request = Net::HTTP::Post.new(uri)
15
- request["x-api-key"] = @config.api_key
16
- request["anthropic-version"] = "2023-06-01"
17
- request["Content-Type"] = "application/json"
18
- request.body = JSON.generate({
19
- model: @config.judge_model,
20
- max_tokens: 4096,
21
- messages: [{role: "user", content: prompt}],
22
- temperature: 0.0
23
- })
18
+ retries = 0
19
+ begin
20
+ uri = URI(API_URL)
21
+ request = Net::HTTP::Post.new(uri)
22
+ request["x-api-key"] = @config.api_key
23
+ request["anthropic-version"] = "2023-06-01"
24
+ request["Content-Type"] = "application/json"
25
+ request.body = JSON.generate({
26
+ model: @config.judge_model,
27
+ max_tokens: 4096,
28
+ messages: [{role: "user", content: prompt}],
29
+ temperature: 0.0
30
+ })
24
31
 
25
- response = Net::HTTP.start(uri.hostname, uri.port, use_ssl: true,
26
- read_timeout: @config.timeout) do |http|
27
- http.request(request)
28
- end
32
+ response = Net::HTTP.start(uri.hostname, uri.port, use_ssl: true,
33
+ read_timeout: @config.timeout) do |http|
34
+ http.request(request)
35
+ end
29
36
 
30
- unless response.is_a?(Net::HTTPSuccess)
31
- raise Error, "Anthropic API error: #{response.code} - #{response.body}"
32
- end
37
+ unless response.is_a?(Net::HTTPSuccess)
38
+ raise Error, "Anthropic API error: #{response.code} - #{response.body}"
39
+ end
33
40
 
34
- body = JSON.parse(response.body)
35
- content = body.dig("content", 0, "text")
36
- parse_json_response(content)
41
+ body = JSON.parse(response.body)
42
+ content = body.dig("content", 0, "text")
43
+ parse_json_response(content)
44
+ rescue Net::OpenTimeout, Net::ReadTimeout, Errno::ECONNRESET => e
45
+ retries += 1
46
+ if retries <= @config.max_retries
47
+ sleep(2 ** (retries - 1))
48
+ retry
49
+ end
50
+ raise EvalRuby::TimeoutError, "Judge API failed after #{@config.max_retries} retries: #{e.message}"
51
+ end
37
52
  end
38
53
  end
39
54
  end
@@ -9,30 +9,45 @@ module EvalRuby
9
9
  class OpenAI < Base
10
10
  API_URL = "https://api.openai.com/v1/chat/completions"
11
11
 
12
+ def initialize(config)
13
+ super
14
+ raise EvalRuby::Error, "API key is required. Set via EvalRuby.configure { |c| c.api_key = '...' }" if @config.api_key.nil? || @config.api_key.empty?
15
+ end
16
+
12
17
  def call(prompt)
13
- uri = URI(API_URL)
14
- request = Net::HTTP::Post.new(uri)
15
- request["Authorization"] = "Bearer #{@config.api_key}"
16
- request["Content-Type"] = "application/json"
17
- request.body = JSON.generate({
18
- model: @config.judge_model,
19
- messages: [{role: "user", content: prompt}],
20
- temperature: 0.0,
21
- response_format: {type: "json_object"}
22
- })
18
+ retries = 0
19
+ begin
20
+ uri = URI(API_URL)
21
+ request = Net::HTTP::Post.new(uri)
22
+ request["Authorization"] = "Bearer #{@config.api_key}"
23
+ request["Content-Type"] = "application/json"
24
+ request.body = JSON.generate({
25
+ model: @config.judge_model,
26
+ messages: [{role: "user", content: prompt}],
27
+ temperature: 0.0,
28
+ response_format: {type: "json_object"}
29
+ })
23
30
 
24
- response = Net::HTTP.start(uri.hostname, uri.port, use_ssl: true,
25
- read_timeout: @config.timeout) do |http|
26
- http.request(request)
27
- end
31
+ response = Net::HTTP.start(uri.hostname, uri.port, use_ssl: true,
32
+ read_timeout: @config.timeout) do |http|
33
+ http.request(request)
34
+ end
28
35
 
29
- unless response.is_a?(Net::HTTPSuccess)
30
- raise Error, "OpenAI API error: #{response.code} - #{response.body}"
31
- end
36
+ unless response.is_a?(Net::HTTPSuccess)
37
+ raise Error, "OpenAI API error: #{response.code} - #{response.body}"
38
+ end
32
39
 
33
- body = JSON.parse(response.body)
34
- content = body.dig("choices", 0, "message", "content")
35
- parse_json_response(content)
40
+ body = JSON.parse(response.body)
41
+ content = body.dig("choices", 0, "message", "content")
42
+ parse_json_response(content)
43
+ rescue Net::OpenTimeout, Net::ReadTimeout, Errno::ECONNRESET => e
44
+ retries += 1
45
+ if retries <= @config.max_retries
46
+ sleep(2 ** (retries - 1))
47
+ retry
48
+ end
49
+ raise EvalRuby::TimeoutError, "Judge API failed after #{@config.max_retries} retries: #{e.message}"
50
+ end
36
51
  end
37
52
  end
38
53
  end
@@ -20,7 +20,7 @@ module EvalRuby
20
20
  PROMPT
21
21
 
22
22
  def call(question:, context:, **_kwargs)
23
- contexts = Array(context)
23
+ contexts = context.is_a?(Array) ? context : [context.to_s]
24
24
  return {score: 0.0, details: {}} if contexts.empty?
25
25
 
26
26
  contexts_text = contexts.each_with_index.map { |c, i| "[#{i}] #{c}" }.join("\n\n")
@@ -21,7 +21,7 @@ module EvalRuby
21
21
  PROMPT
22
22
 
23
23
  def call(context:, ground_truth:, **_kwargs)
24
- contexts = Array(context)
24
+ contexts = context.is_a?(Array) ? context : [context.to_s]
25
25
  return {score: 0.0, details: {}} if contexts.empty?
26
26
 
27
27
  contexts_text = contexts.each_with_index.map { |c, i| "[#{i}] #{c}" }.join("\n\n")
@@ -26,7 +26,7 @@ module EvalRuby
26
26
  PROMPT
27
27
 
28
28
  def call(answer:, context:, **_kwargs)
29
- context_text = Array(context).join("\n\n")
29
+ context_text = context.is_a?(Array) ? context.join("\n\n") : context.to_s
30
30
  prompt = format(PROMPT_TEMPLATE, context: context_text, answer: answer)
31
31
 
32
32
  result = judge.call(prompt)
@@ -32,7 +32,8 @@ module EvalRuby
32
32
  next if values.empty?
33
33
 
34
34
  mean = values.sum / values.size.to_f
35
- variance = values.sum { |v| (v - mean)**2 } / values.size.to_f
35
+ denominator = values.size > 1 ? (values.size - 1).to_f : 1.0
36
+ variance = values.sum { |v| (v - mean)**2 } / denominator
36
37
  std = Math.sqrt(variance)
37
38
  hash[metric] = {mean: mean, std: std, min: values.min, max: values.max, count: values.size}
38
39
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module EvalRuby
4
- VERSION = "0.1.0"
4
+ VERSION = "0.1.1"
5
5
  end
data/lib/eval_ruby.rb CHANGED
@@ -23,6 +23,9 @@ require_relative "eval_ruby/comparison"
23
23
 
24
24
  module EvalRuby
25
25
  class Error < StandardError; end
26
+ class APIError < Error; end
27
+ class TimeoutError < Error; end
28
+ class InvalidResponseError < Error; end
26
29
 
27
30
  class << self
28
31
  def configuration
@@ -79,16 +82,5 @@ module EvalRuby
79
82
  def compare(report_a, report_b)
80
83
  Comparison.new(report_a, report_b)
81
84
  end
82
-
83
- private
84
-
85
- def build_judge
86
- config = configuration
87
- case config.judge_llm
88
- when :openai then Judges::OpenAI.new(config)
89
- when :anthropic then Judges::Anthropic.new(config)
90
- else raise Error, "Unknown judge LLM: #{config.judge_llm}"
91
- end
92
- end
93
85
  end
94
86
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: eval-ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Johannes Dwi Cahyo