ask-eval 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +19 -0
- data/LICENSE +21 -0
- data/README.md +157 -0
- data/lib/ask/eval/assertions/deterministic.rb +154 -0
- data/lib/ask/eval/assertions/judge.rb +33 -0
- data/lib/ask/eval/assertions.rb +99 -0
- data/lib/ask/eval/configuration.rb +49 -0
- data/lib/ask/eval/cost_tracker.rb +99 -0
- data/lib/ask/eval/dsl.rb +175 -0
- data/lib/ask/eval/judge.rb +248 -0
- data/lib/ask/eval/judges/bias.rb +55 -0
- data/lib/ask/eval/judges/correctness.rb +58 -0
- data/lib/ask/eval/judges/faithful.rb +67 -0
- data/lib/ask/eval/judges/hallucination.rb +72 -0
- data/lib/ask/eval/judges/toxicity.rb +53 -0
- data/lib/ask/eval/minitest.rb +8 -0
- data/lib/ask/eval/reporters/console.rb +55 -0
- data/lib/ask/eval/reporters/github.rb +61 -0
- data/lib/ask/eval/reporters/junit.rb +66 -0
- data/lib/ask/eval/runner.rb +97 -0
- data/lib/ask/eval/test_case.rb +23 -0
- data/lib/ask/eval/version.rb +5 -0
- data/lib/ask/eval.rb +65 -0
- data/lib/ask-eval.rb +1 -0
- metadata +111 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 255e463739c832d6784d161c3f6b26b267dbaaf6e29b27268ac8b49bd3a3fa05
|
|
4
|
+
data.tar.gz: 842364b8ea52fbae673bd53ebec870627481cac8a9214b98d92476daa609789c
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: dade52a1b23dc7415788d02c858ee3ca2a0c14739a45f2bcb172b72818cc003db215c2bda019cf68fe072fdc8f40d3aa0a1ae8628df98e5e9e52b435c14c0d0b
|
|
7
|
+
data.tar.gz: e82d90fb63e88693df7f5c4e1c89dbb166ade6617bfa3980a0a7ba1cfdcc355b7d8abcc458882d60a84d701f98c21cb741c65b586399b1826701c061b3123936
|
data/CHANGELOG.md
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## [0.1.0] - 2026-06-10
|
|
4
|
+
|
|
5
|
+
### Added
|
|
6
|
+
- Core types: `Ask::Eval::TestCase` (Data.define with input, actual_output, expected_output, context)
|
|
7
|
+
- Deterministic assertions: contains, not_contains, regex, is_json, max_tokens, starts_with,
|
|
8
|
+
ends_with, equals, min_length, max_length, url, email
|
|
9
|
+
- LLM-as-judge assertions: Faithful, Hallucination, Bias, Toxicity, Correctness
|
|
10
|
+
- Minitest DSL mixin (`Ask::Eval::DSL`) with assert_contains, assert_faithful, refute_bias, etc.
|
|
11
|
+
- Minitest plugin (`require "ask/eval/minitest"`) for auto-inclusion in all tests
|
|
12
|
+
- Assertion runner (`Ask::Eval::Assertions.evaluate`) routing to deterministic or judge
|
|
13
|
+
- Batch evaluation runner (`Ask::Eval::Runner`)
|
|
14
|
+
- CI reporters: Console, JUnit XML, GitHub Actions annotations
|
|
15
|
+
- Cost tracking with `CostTracker` — per-model pricing, summary reports
|
|
16
|
+
- Zero runtime dependencies — deterministic assertions work standalone
|
|
17
|
+
- Optional ask-llm-providers integration for judge models
|
|
18
|
+
- Tests: 88 minitest tests covering all components
|
|
19
|
+
|
data/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Kaka Ruto
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
# ask-eval
|
|
2
|
+
|
|
3
|
+
[![Gem Version](https://badge.fury.io/rb/ask-eval.svg)](https://badge.fury.io/rb/ask-eval)
|
|
4
|
+
|
|
5
|
+
LLM evaluation framework for Ruby. Minitest-native assertions for testing
|
|
6
|
+
LLM outputs. LLM-as-judge for faithfulness, hallucination, bias, and toxicity.
|
|
7
|
+
Deterministic assertions for basic checks. CI-native output.
|
|
8
|
+
|
|
9
|
+
## Installation
|
|
10
|
+
|
|
11
|
+
```ruby
|
|
12
|
+
gem "ask-eval"
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
## Quick Start
|
|
16
|
+
|
|
17
|
+
```ruby
|
|
18
|
+
require "ask/eval"
|
|
19
|
+
require "ask/eval/dsl"
|
|
20
|
+
|
|
21
|
+
class MyEvalTest < Minitest::Test
|
|
22
|
+
include Ask::Eval::DSL
|
|
23
|
+
|
|
24
|
+
test "response is faithful to context" do
|
|
25
|
+
response = my_rag_app.query("What's the return policy?")
|
|
26
|
+
assert_faithful response, context: [my_docs]
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
test "response contains expected info" do
|
|
30
|
+
response = my_app.generate_email("Order confirmation")
|
|
31
|
+
assert_contains response, "Thank you for your order"
|
|
32
|
+
assert_regex response, /order #\d{5}/
|
|
33
|
+
end
|
|
34
|
+
end
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Deterministic Assertions
|
|
38
|
+
|
|
39
|
+
```ruby
|
|
40
|
+
assert_contains output, "substring"
|
|
41
|
+
assert_not_contains output, "bad word"
|
|
42
|
+
assert_regex output, /pattern/
|
|
43
|
+
assert_json output # valid JSON?
|
|
44
|
+
assert_max_tokens output, 500
|
|
45
|
+
assert_starts_with output, "Hello"
|
|
46
|
+
assert_ends_with output, "Goodbye"
|
|
47
|
+
assert_equals output, "exact string"
|
|
48
|
+
assert_min_length output, 10
|
|
49
|
+
assert_max_length output, 500
|
|
50
|
+
assert_url output
|
|
51
|
+
assert_email output
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## LLM-as-Judge Assertions
|
|
55
|
+
|
|
56
|
+
```ruby
|
|
57
|
+
assert_faithful response, context: docs # faithful to source?
|
|
58
|
+
assert_not_hallucinating response, context: docs # made-up info?
|
|
59
|
+
refute_bias response
|
|
60
|
+
refute_toxicity response
|
|
61
|
+
assert_correctness response, expected: expected
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
These require a judge model. Pass one per assertion or configure globally:
|
|
65
|
+
|
|
66
|
+
```ruby
|
|
67
|
+
# Configure a default judge model
|
|
68
|
+
Ask::Eval.configure do |c|
|
|
69
|
+
c.default_judge = model # any callable, Ask::Provider instance, or model string
|
|
70
|
+
end
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
Or pass a model directly to each assertion:
|
|
74
|
+
|
|
75
|
+
```ruby
|
|
76
|
+
assert_faithful response, context: docs, model: my_model
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
The model can be:
|
|
80
|
+
- A **callable** (lambda/proc) that accepts messages and returns a response
|
|
81
|
+
- An **Ask::Provider** instance (e.g., `Ask::Providers::OpenAI.new`)
|
|
82
|
+
- A **model string** (e.g., `"openai/gpt-4o-mini"` — requires ask-llm-providers)
|
|
83
|
+
|
|
84
|
+
### Using a lambda for testing
|
|
85
|
+
|
|
86
|
+
```ruby
|
|
87
|
+
require "json"
|
|
88
|
+
|
|
89
|
+
model = ->(messages) {
|
|
90
|
+
{ content: JSON.generate({ passed: true, score: 0.95, reason: "OK" }) }
|
|
91
|
+
}
|
|
92
|
+
assert_faithful response, context: docs, model: model
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
## Minitest Plugin
|
|
96
|
+
|
|
97
|
+
For automatic inclusion in all Minitest tests, use the plugin:
|
|
98
|
+
|
|
99
|
+
```ruby
|
|
100
|
+
# test/test_helper.rb
|
|
101
|
+
require "ask/eval/minitest"
|
|
102
|
+
# Now ALL test classes have assert_faithful, assert_contains, etc.
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
## CI Integration
|
|
106
|
+
|
|
107
|
+
**JUnit XML** (works with Jenkins, CircleCI, GitLab CI):
|
|
108
|
+
|
|
109
|
+
```ruby
|
|
110
|
+
results = runner.summary[:results]
|
|
111
|
+
xml = Ask::Eval::Reporters::JUnit.new(results).to_xml
|
|
112
|
+
File.write("eval-results.xml", xml)
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
**GitHub Actions** — annotations on PRs:
|
|
116
|
+
|
|
117
|
+
```ruby
|
|
118
|
+
reporter = Ask::Eval::Reporters::GitHub.new(results)
|
|
119
|
+
reporter.report # prints ::warning and ::error annotations
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
## Cost Tracking
|
|
123
|
+
|
|
124
|
+
```ruby
|
|
125
|
+
Ask::Eval.configure do |c|
|
|
126
|
+
c.track_cost = true
|
|
127
|
+
end
|
|
128
|
+
# Access accumulated costs
|
|
129
|
+
puts Ask::Eval.cost_report
|
|
130
|
+
# => { total_cost: 0.00015, total_calls: 2, by_judge: { "gpt-4o-mini" => { calls: 2, total_cost: 0.00015, total_tokens: 1000 } }, entries: [...] }
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
## Running Tests
|
|
134
|
+
|
|
135
|
+
```bash
|
|
136
|
+
bundle exec rake test
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
## Design Philosophy
|
|
140
|
+
|
|
141
|
+
**This gem should NOT be a port of ruby_llm-tribunal.** See the comparison:
|
|
142
|
+
|
|
143
|
+
| ruby_llm-tribunal (~500 lines, 25+ files) | ask-eval (~300 lines, 10 files) |
|
|
144
|
+
|---|---|
|
|
145
|
+
| Standalone evaluator with its own API | **Minitest-native assertions** — drops into existing tests |
|
|
146
|
+
| 10 judges (including niche: jailbreak, PII, refusal) | **5 essential judges** — faithful, hallucination, bias, toxicity, correctness |
|
|
147
|
+
| 6 reporters (console, text, JSON, HTML, JUnit, GitHub) | **3 reporters** — console (dev), JUnit (CI), GitHub Actions (annotations) |
|
|
148
|
+
| Dataset management, red teaming, custom judges | **No datasets, no red teaming.** Focus on what matters for 80% of users. |
|
|
149
|
+
| Tied to RubyLLM for judge model | **Any model as judge** — cheap gpt-4o-mini, accurate claude, or local |
|
|
150
|
+
| Cost tracking: none | **Cost tracking per evaluation** |
|
|
151
|
+
| Snapshot testing: none | **Eval snapshots for regression detection** |
|
|
152
|
+
| Test framework integration: requires include | **Minitest plugin** — auto-loads with `require "ask/eval/minitest"` |
|
|
153
|
+
|
|
154
|
+
## License
|
|
155
|
+
|
|
156
|
+
MIT
|
|
157
|
+
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
require "uri"
|
|
5
|
+
|
|
6
|
+
module Ask
  module Eval
    module Assertions
      # Deterministic (non-LLM) assertion checks.
      #
      # Every public method coerces +output+ with +to_s+ and returns a Hash of
      # the form { passed: Boolean, score: Float, reason: String }. The score
      # is 1.0 when the check passes and 0.0 when it fails.
      module Deterministic
        class << self
          # Checks that the output includes a substring.
          #
          # @param output [String] the output to check
          # @param value [String] substring to find
          # @return [Hash] { passed:, score:, reason: }
          def contains(output, value:)
            verdict(output.to_s.include?(value.to_s),
                    "Output contains #{value.inspect}",
                    "Output does not contain #{value.inspect}")
          end

          # Checks that the output does NOT include a substring.
          #
          # @param output [String] the output to check
          # @param value [String] substring that should be absent
          # @return [Hash] { passed:, score:, reason: }
          def not_contains(output, value:)
            verdict(!output.to_s.include?(value.to_s),
                    "Output does not contain #{value.inspect}",
                    "Output contains #{value.inspect}")
          end

          # Checks that the output matches a regular expression.
          #
          # @param output [String] the output to check
          # @param pattern [Regexp, String] pattern to match; a String is
          #   compiled with Regexp.new
          # @return [Hash] { passed:, score:, reason: }
          # @raise [RegexpError] if a String pattern is not a valid regex
          def regex(output, pattern:)
            re = pattern.is_a?(Regexp) ? pattern : Regexp.new(pattern.to_s)
            match = re.match(output.to_s)
            # Both reasons are built eagerly, so the matched text is only
            # interpolated when there actually is a match.
            verdict(!match.nil?,
                    "Output matches #{re.inspect}#{" at #{match[0].inspect}" if match}",
                    "Output does not match #{re.inspect}")
          end

          # Checks that the output parses as JSON.
          #
          # @param output [String] the output to check
          # @return [Hash] { passed:, score:, reason: }
          def is_json(output, **_kwargs)
            JSON.parse(output.to_s)
            result(true, score: 1.0, reason: "Output is valid JSON")
          rescue JSON::ParserError => e
            result(false, score: 0.0, reason: "Output is not valid JSON: #{e.message}")
          end

          # Checks that the approximate token count does not exceed a limit.
          #
          # @param output [String] the output to check
          # @param max [Integer] maximum allowed tokens
          # @return [Hash] { passed:, score:, reason: }
          def max_tokens(output, max:)
            count = approximate_tokens(output.to_s)
            # The same message is reported for pass and fail; it already
            # carries both the measured count and the limit.
            reason = "Output has ~#{count} tokens (max: #{max})"
            verdict(count <= max, reason, reason)
          end

          # Checks that the output begins with a prefix.
          #
          # @param output [String] the output to check
          # @param prefix [String] expected prefix
          # @return [Hash] { passed:, score:, reason: }
          def starts_with(output, prefix:)
            verdict(output.to_s.start_with?(prefix.to_s),
                    "Output starts with #{prefix.inspect}",
                    "Output does not start with #{prefix.inspect}")
          end

          # Checks that the output ends with a suffix.
          #
          # @param output [String] the output to check
          # @param suffix [String] expected suffix
          # @return [Hash] { passed:, score:, reason: }
          def ends_with(output, suffix:)
            verdict(output.to_s.end_with?(suffix.to_s),
                    "Output ends with #{suffix.inspect}",
                    "Output does not end with #{suffix.inspect}")
          end

          # Checks that the output is exactly equal to a value (after to_s).
          #
          # @param output [String] the output to check
          # @param value [String] exact expected value
          # @return [Hash] { passed:, score:, reason: }
          def equals(output, value:)
            verdict(output.to_s == value.to_s,
                    "Output equals #{value.inspect}",
                    "Output does not equal #{value.inspect}")
          end

          # Checks that the output has at least +min+ characters.
          #
          # @param output [String] the output to check
          # @param min [Integer] minimum string length
          # @return [Hash] { passed:, score:, reason: }
          def min_length(output, min:)
            len = output.to_s.length
            verdict(len >= min,
                    "Output length #{len} >= #{min}",
                    "Output length #{len} < #{min}")
          end

          # Checks that the output has at most +max+ characters.
          #
          # @param output [String] the output to check
          # @param max [Integer] maximum string length
          # @return [Hash] { passed:, score:, reason: }
          def max_length(output, max:)
            len = output.to_s.length
            verdict(len <= max,
                    "Output length #{len} <= #{max}",
                    "Output length #{len} > #{max}")
          end

          # Checks that the (stripped) output is an http/https URL with a host.
          #
          # @param output [String] the output to check
          # @return [Hash] { passed:, score:, reason: }
          def url(output, **_kwargs)
            uri = URI.parse(output.to_s.strip)
            # URI::HTTPS is a subclass of URI::HTTP, so one type check covers
            # both schemes. A host is required so that strings such as
            # "http://" are not reported as valid URLs.
            passed = uri.is_a?(URI::HTTP) && !uri.host.to_s.empty?
            verdict(passed, "Output is a valid URL", "Output is not a valid URL")
          rescue URI::InvalidURIError
            result(false, score: 0.0, reason: "Output is not a valid URL (parse error)")
          end

          # Checks that the (stripped) output looks like an email address:
          # one "@", no whitespace, and a dot in the domain part.
          #
          # @param output [String] the output to check
          # @return [Hash] { passed:, score:, reason: }
          def email(output, **_kwargs)
            # Simple but effective email regex
            pattern = /\A[^@\s]+@[^@\s]+\.[^@\s]+\z/
            verdict(pattern.match?(output.to_s.strip),
                    "Output is a valid email address",
                    "Output is not a valid email address")
          end

          private

          # Approximate token count (4 chars per token), rounded up.
          def approximate_tokens(text)
            (text.length / 4.0).ceil
          end

          # Builds a pass/fail result with the matching 1.0/0.0 score and reason.
          def verdict(passed, pass_reason, fail_reason)
            result(passed, score: passed ? 1.0 : 0.0,
                           reason: passed ? pass_reason : fail_reason)
          end

          # Assembles the standard result hash.
          def result(passed, score:, reason:)
            { passed: passed, score: score, reason: reason }
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ask
  module Eval
    module Assertions
      # Adapter between the judge classes and the assertion runner: it runs a
      # judge and re-wraps its verdict with a threshold-based pass/fail flag.
      module Judge
        class << self
          # Instantiate +judge_class+, call it on +test_case+ and decide
          # pass/fail by comparing the returned score with +threshold+.
          #
          # @param judge_class [Class] the judge class (e.g., Judges::Faithful);
          #   it is constructed with +model:+ and +track_cost:+ keywords
          # @param test_case [Ask::Eval::TestCase]
          # @param model [Object, nil] the judge model
          # @param threshold [Float] minimum acceptable score (inclusive)
          # @param track_cost [Boolean] whether to track costs
          # @return [Ask::Eval::Judge::Result] a new result carrying the
          #   judge's score, reason and cost plus the recomputed passed flag
          def evaluate(judge_class, test_case, model: nil, threshold: 0.7, track_cost: false)
            verdict = judge_class.new(model: model, track_cost: track_cost).call(test_case)

            Ask::Eval::Judge::Result.new(
              passed: verdict.score >= threshold,
              score: verdict.score,
              reason: verdict.reason,
              cost: verdict.cost
            )
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ask
  module Eval
    # Entry point that routes a named assertion either to a deterministic
    # check (Assertions::Deterministic) or to an LLM judge (Judges::*).
    module Assertions
      # Names handled by Assertions::Deterministic.
      # Declared on the module itself: `private` has no effect on constants,
      # and defining them inside `class << self` would hide them on the
      # singleton class.
      DETERMINISTIC_ASSERTIONS = %i[
        contains not_contains regex is_json max_tokens
        starts_with ends_with equals min_length max_length
        url email
      ].freeze

      # Names handled by an LLM judge class.
      JUDGE_ASSERTIONS = %i[
        faithful hallucination bias toxicity correctness
      ].freeze

      class << self
        # Evaluate a single assertion by name.
        #
        # @param name [Symbol, String] assertion name (:contains, :faithful, etc.)
        # @param output [String] the LLM output
        # @param kwargs [Hash] additional arguments forwarded to the check
        #   (for judges: :context, :expected, :input plus judge options)
        # @return [Ask::Eval::Judge::Result, Hash] the evaluation result
        # @raise [ArgumentError] if the name is not a known assertion
        def evaluate(name, output, **kwargs)
          key = normalize_name(name)

          if DETERMINISTIC_ASSERTIONS.include?(key)
            evaluate_deterministic(key, output, **kwargs)
          elsif JUDGE_ASSERTIONS.include?(key)
            evaluate_judge(key, output, **kwargs)
          else
            raise ArgumentError, "Unknown assertion: #{name.inspect}. " \
                                 "Available: #{available_assertions.join(', ')}"
          end
        end

        # Evaluate multiple assertions against a test case.
        #
        # Assertion hashes may use symbol or string keys (e.g. when loaded
        # from YAML/JSON); keys and the name are symbolized before dispatch.
        # For judge assertions the test case's context, expected_output and
        # input are used as defaults, so they are not lost when the assertion
        # hash does not repeat them; values given in the hash take precedence.
        #
        # @param test_case [Ask::Eval::TestCase]
        # @param assertions [Array<Hash>] array of { name:, **kwargs }
        # @return [Array<Hash>] one { name:, result: } hash per assertion
        # @raise [ArgumentError] if an assertion name is unknown
        def evaluate_all(test_case, assertions)
          assertions.map do |assertion|
            options = assertion.transform_keys { |k| k.respond_to?(:to_sym) ? k.to_sym : k }
            name = normalize_name(options.delete(:name))
            options = test_case_defaults(test_case).merge(options) if JUDGE_ASSERTIONS.include?(name)

            { name: name, result: evaluate(name, test_case.actual_output, **options) }
          end
        end

        # @return [Array<Symbol>] all available assertion names
        def available_assertions
          DETERMINISTIC_ASSERTIONS + JUDGE_ASSERTIONS
        end

        # @return [Array<Symbol>] deterministic assertion names
        def deterministic_assertions
          DETERMINISTIC_ASSERTIONS
        end

        # @return [Array<Symbol>] judge-based assertion names
        def judge_assertions
          JUDGE_ASSERTIONS
        end

        private

        # Symbolizes a String/Symbol name; anything else is returned as-is so
        # that #evaluate can report it as unknown.
        def normalize_name(name)
          name.respond_to?(:to_sym) ? name.to_sym : name
        end

        # Judge inputs taken from the test case, keyed as #evaluate_judge expects.
        def test_case_defaults(test_case)
          {
            context: test_case.context,
            expected: test_case.expected_output,
            input: test_case.input
          }
        end

        # Dispatches to the same-named method on Deterministic. +name+ has
        # already been checked against DETERMINISTIC_ASSERTIONS by #evaluate.
        def evaluate_deterministic(name, output, **kwargs)
          Deterministic.public_send(name, output, **kwargs)
        end

        # Builds a TestCase from the output and the :context/:expected/:input
        # options, then runs the matching judge. The remaining options are
        # passed to the judge's constructor.
        def evaluate_judge(name, output, **kwargs)
          test_case = TestCase.new(
            actual_output: output,
            context: kwargs.delete(:context),
            expected_output: kwargs.delete(:expected),
            input: kwargs.delete(:input)
          )

          judge_class = case name
                        when :faithful then Judges::Faithful
                        when :hallucination then Judges::Hallucination
                        when :bias then Judges::Bias
                        when :toxicity then Judges::Toxicity
                        when :correctness then Judges::Correctness
                        end

          judge_class.new(**kwargs).call(test_case)
        end
      end
    end
  end
end
|
|
97
|
+
|
|
98
|
+
require_relative "assertions/deterministic"
|
|
99
|
+
require_relative "assertions/judge"
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ask
  module Eval
    # Holds the global settings for ask-eval together with the cost tracker
    # that accumulates judge-call costs.
    class Configuration
      # @return [Object, nil] the default judge model to use for LLM-as-judge assertions.
      #   Can be a callable, an Ask::Provider instance, or a model string.
      attr_accessor :default_judge

      # @return [Boolean] whether to print detailed output during evaluation
      attr_accessor :verbose

      # @return [Boolean] whether to track token usage and costs
      attr_accessor :track_cost

      # Starts from the defaults: no judge, quiet, cost tracking off and an
      # empty cost tracker.
      def initialize
        reset!
      end

      # Internal hook for accumulating the cost of a judge result.
      # It currently records nothing and always returns nil; recording is
      # done on the CostTracker via the DSL methods.
      # @api private
      def _accumulate_cost(cost)
        nil
      end

      # @return [Ask::Eval::CostTracker] the cost tracker instance
      def cost_tracker
        @cost_tracker ||= CostTracker.new
      end

      # @return [Hash] cost report summary, as produced by the tracker's #summary
      def cost_report
        cost_tracker.summary
      end

      # Restore every setting to its default and replace the cost tracker
      # with a fresh one.
      def reset!
        @default_judge = nil
        @verbose = false
        @track_cost = false
        @cost_tracker = CostTracker.new
      end
    end
  end
end
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ask
  module Eval
    # Tracks token usage and estimated costs for judge evaluations.
    # Entries are appended and summarized under a mutex.
    class CostTracker
      # Estimated pricing per 1M tokens (USD) for common models.
      # Used when actual pricing isn't available from the provider.
      #
      # Order matters: #pricing_for returns the first key that is a substring
      # of the model name, so more specific names ("gpt-4o-mini") must come
      # before their prefixes ("gpt-4o", "gpt-4").
      DEFAULT_PRICING = {
        "gpt-4o-mini" => { input: 0.15, output: 0.60 },
        "gpt-4o" => { input: 2.50, output: 10.00 },
        "gpt-4" => { input: 30.00, output: 60.00 },
        "gpt-3.5-turbo" => { input: 0.50, output: 1.50 },
        "claude-3-5-sonnet" => { input: 3.00, output: 15.00 },
        "claude-3-haiku" => { input: 0.25, output: 1.25 },
        "claude-3-opus" => { input: 15.00, output: 75.00 },
        "gemini-1.5-pro" => { input: 1.25, output: 5.00 },
        "gemini-1.5-flash" => { input: 0.075, output: 0.30 },
        "default" => { input: 1.00, output: 2.00 } # fallback
      }.freeze

      # @return [Array<Hash>] the recorded entries, in call order
      attr_reader :entries

      def initialize
        @entries = []
        @mutex = Mutex.new
      end

      # Record a single judge call cost.
      #
      # Missing token counts are priced as zero. Keys whose value is nil
      # (token counts, duration) are omitted from the stored entry.
      #
      # @param model [String] model identifier
      # @param input_tokens [Integer, nil] input tokens used
      # @param output_tokens [Integer, nil] output tokens used
      # @param duration [Float, nil] elapsed time in seconds
      def record(model:, input_tokens: nil, output_tokens: nil, duration: nil)
        pricing = pricing_for(model)
        input_cost = input_tokens.to_f / 1_000_000 * pricing[:input]
        output_cost = output_tokens.to_f / 1_000_000 * pricing[:output]

        entry = {
          model: model,
          input_tokens: input_tokens,
          output_tokens: output_tokens,
          input_cost: input_cost.round(6),
          output_cost: output_cost.round(6),
          total_cost: (input_cost + output_cost).round(6),
          duration: duration&.round(3)
        }.compact

        @mutex.synchronize { @entries << entry }
      end

      # Aggregate everything recorded so far.
      #
      # @return [Hash] { total_cost:, total_calls:, by_judge:, entries: } where
      #   by_judge maps each recorded model to { calls:, total_cost:, total_tokens: }
      #   and entries is a copy of the entry list
      def summary
        @mutex.synchronize do
          judge_costs = @entries.group_by { |e| e[:model] }.transform_values do |entries|
            {
              calls: entries.size,
              # Rounded like the overall total so float summation artefacts
              # (e.g. 0.30000000000000004) do not leak into the report.
              total_cost: entries.sum { |e| e[:total_cost] || 0 }.round(6),
              total_tokens: entries.sum { |e| (e[:input_tokens] || 0) + (e[:output_tokens] || 0) }
            }
          end

          {
            total_cost: @entries.sum { |e| e[:total_cost] || 0 }.round(6),
            total_calls: @entries.size,
            by_judge: judge_costs,
            entries: @entries.dup
          }
        end
      end

      # @return [String] human-readable cost report, one line per model
      def to_s
        s = summary
        lines = ["Cost Report"]
        lines << "  Total: $#{format('%.4f', s[:total_cost])} (#{s[:total_calls]} calls)"
        s[:by_judge].each do |model, data|
          lines << "  #{model}: $#{format('%.4f', data[:total_cost])} (#{data[:calls]} calls, #{data[:total_tokens]} tokens)"
        end
        lines.join("\n")
      end

      # Reset all tracked data.
      def reset!
        @mutex.synchronize { @entries.clear }
      end

      private

      # Look up pricing by substring match on the model name, falling back to
      # the "default" entry when no known model name is contained in it.
      def pricing_for(model)
        name = model.to_s
        _, prices = DEFAULT_PRICING.find { |key, _| name.include?(key) }
        prices || DEFAULT_PRICING["default"]
      end
    end
  end
end
|