eval-ruby 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/lib/eval_ruby/dataset.rb +1 -1
- data/lib/eval_ruby/judges/anthropic.rb +36 -21
- data/lib/eval_ruby/judges/openai.rb +35 -20
- data/lib/eval_ruby/metrics/context_precision.rb +1 -1
- data/lib/eval_ruby/metrics/context_recall.rb +1 -1
- data/lib/eval_ruby/metrics/faithfulness.rb +1 -1
- data/lib/eval_ruby/report.rb +2 -1
- data/lib/eval_ruby/version.rb +1 -1
- data/lib/eval_ruby.rb +3 -11
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 59b7bd64cf696d82a27cb6330ab948306904f444a8a64476f702ee1937bbccab
|
|
4
|
+
data.tar.gz: f9c0c234f0712d37d309d460d1c3204d3e6f70bfbd8fe4dcff95ac508bcb7f34
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 6a9f0c12a790b0098ba639bc236c6080c64c7fbb4ad9892c36810e93b88486bebaa42dca9245beaf828beca29e31793030d62ab232cad50107bc99905635a069
|
|
7
|
+
data.tar.gz: 7c922b6fd8743d5241a254baf301aae2af6aba936fdeedbcb467f9549a8f16493c7571ec9724a5460a75228e415cbcaa852b915ef78f23aa8f1fcbddb85d9450
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
eval-ruby (0.1.0)
|
|
4
|
+
eval-ruby (0.1.1)
|
|
5
5
|
csv
|
|
6
6
|
|
|
7
7
|
GEM
|
|
@@ -39,7 +39,7 @@ CHECKSUMS
|
|
|
39
39
|
bigdecimal (4.0.1) sha256=8b07d3d065a9f921c80ceaea7c9d4ae596697295b584c296fe599dd0ad01c4a7
|
|
40
40
|
crack (1.0.1) sha256=ff4a10390cd31d66440b7524eb1841874db86201d5b70032028553130b6d4c7e
|
|
41
41
|
csv (3.3.5) sha256=6e5134ac3383ef728b7f02725d9872934f523cb40b961479f69cf3afa6c8e73f
|
|
42
|
-
eval-ruby (0.1.0)
|
|
42
|
+
eval-ruby (0.1.1)
|
|
43
43
|
hashdiff (1.2.1) sha256=9c079dbc513dfc8833ab59c0c2d8f230fa28499cc5efb4b8dd276cf931457cd1
|
|
44
44
|
minitest (5.27.0) sha256=2d3b17f8a36fe7801c1adcffdbc38233b938eb0b4966e97a6739055a45fa77d5
|
|
45
45
|
public_suffix (7.0.5) sha256=1a8bb08f1bbea19228d3bed6e5ed908d1cb4f7c2726d18bd9cadf60bc676f623
|
data/lib/eval_ruby/judges/anthropic.rb
CHANGED
|
@@ -9,31 +9,46 @@ module EvalRuby
|
|
|
9
9
|
class Anthropic < Base
|
|
10
10
|
API_URL = "https://api.anthropic.com/v1/messages"
|
|
11
11
|
|
|
12
|
+
def initialize(config)
|
|
13
|
+
super
|
|
14
|
+
raise EvalRuby::Error, "API key is required. Set via EvalRuby.configure { |c| c.api_key = '...' }" if @config.api_key.nil? || @config.api_key.empty?
|
|
15
|
+
end
|
|
16
|
+
|
|
12
17
|
def call(prompt)
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
18
|
+
retries = 0
|
|
19
|
+
begin
|
|
20
|
+
uri = URI(API_URL)
|
|
21
|
+
request = Net::HTTP::Post.new(uri)
|
|
22
|
+
request["x-api-key"] = @config.api_key
|
|
23
|
+
request["anthropic-version"] = "2023-06-01"
|
|
24
|
+
request["Content-Type"] = "application/json"
|
|
25
|
+
request.body = JSON.generate({
|
|
26
|
+
model: @config.judge_model,
|
|
27
|
+
max_tokens: 4096,
|
|
28
|
+
messages: [{role: "user", content: prompt}],
|
|
29
|
+
temperature: 0.0
|
|
30
|
+
})
|
|
24
31
|
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
32
|
+
response = Net::HTTP.start(uri.hostname, uri.port, use_ssl: true,
|
|
33
|
+
read_timeout: @config.timeout) do |http|
|
|
34
|
+
http.request(request)
|
|
35
|
+
end
|
|
29
36
|
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
37
|
+
unless response.is_a?(Net::HTTPSuccess)
|
|
38
|
+
raise Error, "Anthropic API error: #{response.code} - #{response.body}"
|
|
39
|
+
end
|
|
33
40
|
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
41
|
+
body = JSON.parse(response.body)
|
|
42
|
+
content = body.dig("content", 0, "text")
|
|
43
|
+
parse_json_response(content)
|
|
44
|
+
rescue Net::OpenTimeout, Net::ReadTimeout, Errno::ECONNRESET => e
|
|
45
|
+
retries += 1
|
|
46
|
+
if retries <= @config.max_retries
|
|
47
|
+
sleep(2 ** (retries - 1))
|
|
48
|
+
retry
|
|
49
|
+
end
|
|
50
|
+
raise EvalRuby::TimeoutError, "Judge API failed after #{@config.max_retries} retries: #{e.message}"
|
|
51
|
+
end
|
|
37
52
|
end
|
|
38
53
|
end
|
|
39
54
|
end
|
|
@@ -9,30 +9,45 @@ module EvalRuby
|
|
|
9
9
|
class OpenAI < Base
|
|
10
10
|
API_URL = "https://api.openai.com/v1/chat/completions"
|
|
11
11
|
|
|
12
|
+
def initialize(config)
|
|
13
|
+
super
|
|
14
|
+
raise EvalRuby::Error, "API key is required. Set via EvalRuby.configure { |c| c.api_key = '...' }" if @config.api_key.nil? || @config.api_key.empty?
|
|
15
|
+
end
|
|
16
|
+
|
|
12
17
|
def call(prompt)
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
18
|
+
retries = 0
|
|
19
|
+
begin
|
|
20
|
+
uri = URI(API_URL)
|
|
21
|
+
request = Net::HTTP::Post.new(uri)
|
|
22
|
+
request["Authorization"] = "Bearer #{@config.api_key}"
|
|
23
|
+
request["Content-Type"] = "application/json"
|
|
24
|
+
request.body = JSON.generate({
|
|
25
|
+
model: @config.judge_model,
|
|
26
|
+
messages: [{role: "user", content: prompt}],
|
|
27
|
+
temperature: 0.0,
|
|
28
|
+
response_format: {type: "json_object"}
|
|
29
|
+
})
|
|
23
30
|
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
31
|
+
response = Net::HTTP.start(uri.hostname, uri.port, use_ssl: true,
|
|
32
|
+
read_timeout: @config.timeout) do |http|
|
|
33
|
+
http.request(request)
|
|
34
|
+
end
|
|
28
35
|
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
36
|
+
unless response.is_a?(Net::HTTPSuccess)
|
|
37
|
+
raise Error, "OpenAI API error: #{response.code} - #{response.body}"
|
|
38
|
+
end
|
|
32
39
|
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
40
|
+
body = JSON.parse(response.body)
|
|
41
|
+
content = body.dig("choices", 0, "message", "content")
|
|
42
|
+
parse_json_response(content)
|
|
43
|
+
rescue Net::OpenTimeout, Net::ReadTimeout, Errno::ECONNRESET => e
|
|
44
|
+
retries += 1
|
|
45
|
+
if retries <= @config.max_retries
|
|
46
|
+
sleep(2 ** (retries - 1))
|
|
47
|
+
retry
|
|
48
|
+
end
|
|
49
|
+
raise EvalRuby::TimeoutError, "Judge API failed after #{@config.max_retries} retries: #{e.message}"
|
|
50
|
+
end
|
|
36
51
|
end
|
|
37
52
|
end
|
|
38
53
|
end
|
|
@@ -20,7 +20,7 @@ module EvalRuby
|
|
|
20
20
|
PROMPT
|
|
21
21
|
|
|
22
22
|
def call(question:, context:, **_kwargs)
|
|
23
|
-
contexts = Array(context)
|
|
23
|
+
contexts = context.is_a?(Array) ? context : [context.to_s]
|
|
24
24
|
return {score: 0.0, details: {}} if contexts.empty?
|
|
25
25
|
|
|
26
26
|
contexts_text = contexts.each_with_index.map { |c, i| "[#{i}] #{c}" }.join("\n\n")
|
|
@@ -21,7 +21,7 @@ module EvalRuby
|
|
|
21
21
|
PROMPT
|
|
22
22
|
|
|
23
23
|
def call(context:, ground_truth:, **_kwargs)
|
|
24
|
-
contexts = Array(context)
|
|
24
|
+
contexts = context.is_a?(Array) ? context : [context.to_s]
|
|
25
25
|
return {score: 0.0, details: {}} if contexts.empty?
|
|
26
26
|
|
|
27
27
|
contexts_text = contexts.each_with_index.map { |c, i| "[#{i}] #{c}" }.join("\n\n")
|
|
@@ -26,7 +26,7 @@ module EvalRuby
|
|
|
26
26
|
PROMPT
|
|
27
27
|
|
|
28
28
|
def call(answer:, context:, **_kwargs)
|
|
29
|
-
context_text = Array(context).join("\n\n")
|
|
29
|
+
context_text = context.is_a?(Array) ? context.join("\n\n") : context.to_s
|
|
30
30
|
prompt = format(PROMPT_TEMPLATE, context: context_text, answer: answer)
|
|
31
31
|
|
|
32
32
|
result = judge.call(prompt)
|
data/lib/eval_ruby/report.rb
CHANGED
|
@@ -32,7 +32,8 @@ module EvalRuby
|
|
|
32
32
|
next if values.empty?
|
|
33
33
|
|
|
34
34
|
mean = values.sum / values.size.to_f
|
|
35
|
-
|
|
35
|
+
denominator = values.size > 1 ? (values.size - 1).to_f : 1.0
|
|
36
|
+
variance = values.sum { |v| (v - mean)**2 } / denominator
|
|
36
37
|
std = Math.sqrt(variance)
|
|
37
38
|
hash[metric] = {mean: mean, std: std, min: values.min, max: values.max, count: values.size}
|
|
38
39
|
end
|
data/lib/eval_ruby/version.rb
CHANGED
data/lib/eval_ruby.rb
CHANGED
|
@@ -23,6 +23,9 @@ require_relative "eval_ruby/comparison"
|
|
|
23
23
|
|
|
24
24
|
module EvalRuby
|
|
25
25
|
class Error < StandardError; end
|
|
26
|
+
class APIError < Error; end
|
|
27
|
+
class TimeoutError < Error; end
|
|
28
|
+
class InvalidResponseError < Error; end
|
|
26
29
|
|
|
27
30
|
class << self
|
|
28
31
|
def configuration
|
|
@@ -79,16 +82,5 @@ module EvalRuby
|
|
|
79
82
|
def compare(report_a, report_b)
|
|
80
83
|
Comparison.new(report_a, report_b)
|
|
81
84
|
end
|
|
82
|
-
|
|
83
|
-
private
|
|
84
|
-
|
|
85
|
-
def build_judge
|
|
86
|
-
config = configuration
|
|
87
|
-
case config.judge_llm
|
|
88
|
-
when :openai then Judges::OpenAI.new(config)
|
|
89
|
-
when :anthropic then Judges::Anthropic.new(config)
|
|
90
|
-
else raise Error, "Unknown judge LLM: #{config.judge_llm}"
|
|
91
|
-
end
|
|
92
|
-
end
|
|
93
85
|
end
|
|
94
86
|
end
|