llm_conductor 1.7.1 → 1.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +10 -0
- data/README.md +63 -0
- data/examples/model_eval_usage.rb +86 -0
- data/lib/llm_conductor/eval/json_parser.rb +76 -0
- data/lib/llm_conductor/eval/judge.rb +188 -0
- data/lib/llm_conductor/eval/model_runner.rb +95 -0
- data/lib/llm_conductor/eval/report.rb +22 -0
- data/lib/llm_conductor/eval/report_builder.rb +258 -0
- data/lib/llm_conductor/eval/result.rb +30 -0
- data/lib/llm_conductor/eval/runner.rb +148 -0
- data/lib/llm_conductor/eval/spec.rb +78 -0
- data/lib/llm_conductor/eval/store/base.rb +58 -0
- data/lib/llm_conductor/eval/store/file_store.rb +94 -0
- data/lib/llm_conductor/eval/store/in_memory.rb +76 -0
- data/lib/llm_conductor/eval/verdict.rb +31 -0
- data/lib/llm_conductor/eval.rb +75 -0
- data/lib/llm_conductor/version.rb +1 -1
- metadata +30 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: ac4b318b4227e3f089f42d471d51a1d7fefc9c54d080c30bbb2e748039ee4ae0
|
|
4
|
+
data.tar.gz: c21eb439aed4fc671fde4dc30c06c12f0853a5f99ba29d47c3bd3b389f5b27c0
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 918613d702e3918ae5651ec8aefab34126a36a17c0be75549a2227fb9a3e6b7392c245a7d96a2d15df6e76996e871c66c1572cb1f061d5bef1c96fe1e31ec8e4
|
|
7
|
+
data.tar.gz: e4b356669ce32103a8a264c66574edf7f0819379240798d0d91b0cd29c623e512cd6776eb4d5bcf421d6fb8cb30bb2467cc188154910928a8aa6b16adc017d76
|
data/.rubocop.yml
CHANGED
|
@@ -31,6 +31,10 @@ Lint/ConstantDefinitionInBlock:
|
|
|
31
31
|
|
|
32
32
|
Metrics/ClassLength:
|
|
33
33
|
Max: 130
|
|
34
|
+
Exclude:
|
|
35
|
+
# Eval engine classes are faithful ports of the Rails prototype; their size
|
|
36
|
+
# mirrors the reference implementation.
|
|
37
|
+
- 'lib/llm_conductor/eval/**/*'
|
|
34
38
|
|
|
35
39
|
Metrics/MethodLength:
|
|
36
40
|
Max: 15
|
|
@@ -39,6 +43,7 @@ Metrics/MethodLength:
|
|
|
39
43
|
- 'lib/llm_conductor/clients/openrouter_client.rb'
|
|
40
44
|
- 'lib/llm_conductor/clients/zai_client.rb'
|
|
41
45
|
- 'lib/llm_conductor/client_factory.rb'
|
|
46
|
+
- 'lib/llm_conductor/eval/**/*'
|
|
42
47
|
- 'examples/*.rb'
|
|
43
48
|
|
|
44
49
|
RSpec/ExampleLength:
|
|
@@ -98,12 +103,15 @@ Metrics/AbcSize:
|
|
|
98
103
|
- 'lib/llm_conductor/prompts.rb'
|
|
99
104
|
- 'lib/llm_conductor/clients/openrouter_client.rb'
|
|
100
105
|
- 'lib/llm_conductor/clients/zai_client.rb'
|
|
106
|
+
- 'lib/llm_conductor/eval/**/*'
|
|
101
107
|
- 'examples/*.rb'
|
|
102
108
|
|
|
103
109
|
Metrics/ParameterLists:
|
|
104
110
|
Exclude:
|
|
105
111
|
- 'lib/llm_conductor.rb'
|
|
106
112
|
- 'lib/llm_conductor/configuration.rb'
|
|
113
|
+
- 'lib/llm_conductor/eval.rb'
|
|
114
|
+
- 'lib/llm_conductor/eval/**/*'
|
|
107
115
|
|
|
108
116
|
Metrics/CyclomaticComplexity:
|
|
109
117
|
Exclude:
|
|
@@ -111,6 +119,7 @@ Metrics/CyclomaticComplexity:
|
|
|
111
119
|
- 'lib/llm_conductor/prompts.rb'
|
|
112
120
|
- 'lib/llm_conductor/clients/openrouter_client.rb'
|
|
113
121
|
- 'lib/llm_conductor/clients/zai_client.rb'
|
|
122
|
+
- 'lib/llm_conductor/eval/**/*'
|
|
114
123
|
- 'examples/*.rb'
|
|
115
124
|
|
|
116
125
|
Metrics/PerceivedComplexity:
|
|
@@ -118,6 +127,7 @@ Metrics/PerceivedComplexity:
|
|
|
118
127
|
- 'lib/llm_conductor/prompts.rb'
|
|
119
128
|
- 'lib/llm_conductor/clients/openrouter_client.rb'
|
|
120
129
|
- 'lib/llm_conductor/clients/zai_client.rb'
|
|
130
|
+
- 'lib/llm_conductor/eval/**/*'
|
|
121
131
|
|
|
122
132
|
Layout/LineLength:
|
|
123
133
|
Max: 125
|
data/README.md
CHANGED
|
@@ -252,6 +252,68 @@ else
|
|
|
252
252
|
end
|
|
253
253
|
```
|
|
254
254
|
|
|
255
|
+
## Model Evaluation (opt-in)
|
|
256
|
+
|
|
257
|
+
Which model/vendor is best for *your* prompt? The eval harness runs the same
|
|
258
|
+
prompt across N `(model, vendor)` pairs over M inputs and compares them on
|
|
259
|
+
**cost, latency, tokens, and LLM-judged quality** — three of which `generate`
|
|
260
|
+
already produces for free.
|
|
261
|
+
|
|
262
|
+
It's behind a separate require so core users pay nothing:
|
|
263
|
+
|
|
264
|
+
```ruby
|
|
265
|
+
require 'llm_conductor/eval'
|
|
266
|
+
|
|
267
|
+
# 1. Describe how to evaluate your feature (the only feature-specific code).
|
|
268
|
+
class ArticleSummarySpec < LlmConductor::Eval::Spec
|
|
269
|
+
def prompt_type = :summarize_text # a registered prompt type
|
|
270
|
+
def input_id(article) = article[:id]
|
|
271
|
+
def input_label(article) = article[:title]
|
|
272
|
+
def build_data(article) = { content: article[:body] } # payload for generate(type:, data:)
|
|
273
|
+
|
|
274
|
+
# { score:, bucket: } — bucket is any discrete label; powers disagreement detection.
|
|
275
|
+
def output_summary(parsed) = { score: parsed['rating'], bucket: parsed['verdict'] }
|
|
276
|
+
|
|
277
|
+
def judge_rubric_excerpt = 'A good summary is faithful, concise, and covers the key points.'
|
|
278
|
+
def judge_dimensions
|
|
279
|
+
[{ key: 'faithfulness', description: 'no hallucinations vs. the source' },
|
|
280
|
+
{ key: 'coverage', description: 'captures the key points' }]
|
|
281
|
+
end
|
|
282
|
+
end
|
|
283
|
+
|
|
284
|
+
# 2. Run it. `inputs` is ANY enumerable — selection is YOUR job, never the gem's.
|
|
285
|
+
report = LlmConductor::Eval.run(
|
|
286
|
+
spec: ArticleSummarySpec.new,
|
|
287
|
+
inputs: my_articles,
|
|
288
|
+
models: [ # caller-owned; no baked-in defaults
|
|
289
|
+
{ model: 'phi4-mini', vendor: :ollama },
|
|
290
|
+
{ model: 'gemini-2.5-flash', vendor: :gemini },
|
|
291
|
+
{ model: 'gpt-4o-mini', vendor: :openai }
|
|
292
|
+
],
|
|
293
|
+
judge: { model: 'llama-3.3-70b-versatile', vendor: :groq }, # default; needs Groq creds
|
|
294
|
+
store: LlmConductor::Eval::Store::FileStore.new('tmp/llm_eval') # or in-memory (default)
|
|
295
|
+
)
|
|
296
|
+
|
|
297
|
+
report.summary # per-model aggregates: parse-OK%, mean quality, p50/p95 latency, cost
|
|
298
|
+
report.to_markdown # decision-aid report (you persist it)
|
|
299
|
+
report.to_csv # full per-row data
|
|
300
|
+
report.needs_review # rows flagged for a human (bucket disagreement / borderline / errors)
|
|
301
|
+
```
|
|
302
|
+
|
|
303
|
+
**Judge bias matters.** The judge defaults to Groq's `llama-3.3-70b-versatile`
|
|
304
|
+
precisely because it sits *outside* the Gemini/OpenAI/Ollama candidate families —
|
|
305
|
+
a model grading its own family scores it high. Any row where the judged model
|
|
306
|
+
equals the judge model is flagged `self_judge=true` so you can discount it.
|
|
307
|
+
|
|
308
|
+
Cheap re-runs reuse stored candidate outputs — no re-calling the candidates:
|
|
309
|
+
|
|
310
|
+
```ruby
|
|
311
|
+
LlmConductor::Eval.judge_only(run_id:, spec:, store:, judge: { model: 'gemini-2.5-pro', vendor: :gemini })
|
|
312
|
+
LlmConductor::Eval.report_only(run_id:, spec:, store:)
|
|
313
|
+
```
|
|
314
|
+
|
|
315
|
+
See [`examples/model_eval_usage.rb`](examples/model_eval_usage.rb) for a complete runnable example.
|
|
316
|
+
|
|
255
317
|
## Documentation
|
|
256
318
|
|
|
257
319
|
- **[Custom Parameters Guide](docs/custom-parameters.md)** - Temperature, top_p, and more
|
|
@@ -272,6 +334,7 @@ Check the [examples/](examples/) directory for comprehensive examples:
|
|
|
272
334
|
- `data_builder_usage.rb` - Data builder patterns
|
|
273
335
|
- `prompt_registration.rb` - Custom prompt classes
|
|
274
336
|
- `rag_usage.rb` - Retrieval-Augmented Generation
|
|
337
|
+
- `model_eval_usage.rb` - Model evaluation harness (cost/latency/quality comparison)
|
|
275
338
|
|
|
276
339
|
Run any example:
|
|
277
340
|
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Model Evaluation harness example.
|
|
4
|
+
#
|
|
5
|
+
# Runs the same prompt across several (model, vendor) pairs over a handful of
|
|
6
|
+
# inputs and compares them on cost, latency, tokens, and LLM-judged quality.
|
|
7
|
+
#
|
|
8
|
+
# Run with: ruby examples/model_eval_usage.rb
|
|
9
|
+
#
|
|
10
|
+
# Requires credentials for whichever vendors you list in CANDIDATES (and Groq
|
|
11
|
+
# for the default judge). Configure them via ENV (OPENAI_API_KEY, GEMINI_API_KEY,
|
|
12
|
+
# GROQ_API_KEY, OLLAMA_ADDRESS, ...) or LlmConductor.configure.
|
|
13
|
+
|
|
14
|
+
require 'llm_conductor/eval'
|
|
15
|
+
|
|
16
|
+
# 1. A Spec describes the ONE feature being evaluated. It is the only
|
|
17
|
+
# feature-specific code; the engine itself is generic.
|
|
18
|
+
class SentimentSpec < LlmConductor::Eval::Spec
  # Returning nil means there is no registered prompt type: the engine then
  # sends whatever build_data returns as `prompt:` rather than `type:` + `data:`.
  def prompt_type
    nil
  end

  def input_id(review) = review[:id]
  def input_label(review) = review[:product]

  # Builds the complete prompt text for one review.
  def build_data(review)
    review_text = review[:text]
    <<~PROMPT
      Classify the sentiment of this product review. Respond with ONLY a JSON
      object: {"sentiment": "positive|neutral|negative", "confidence": 0-100}

      Review: #{review_text}
    PROMPT
  end

  # The score and bucket feed the CSV and bucket-disagreement detection. The
  # sentiment label is the bucket, so models that disagree on it get flagged.
  def output_summary(parsed) = { score: parsed['confidence'], bucket: parsed['sentiment'] }

  # Rubric text handed to the judge.
  def judge_rubric_excerpt
    "A correct classification matches the review's actual sentiment and gives " \
      'a calibrated confidence (high only when the text is unambiguous).'
  end

  # Dimensions the judge scores, as an Array of { key:, description: } Hashes.
  def judge_dimensions
    {
      'correctness' => 'is the sentiment label correct',
      'calibration' => 'is the confidence well-calibrated'
    }.map { |key, description| { key:, description: } }
  end
end
|
|
51
|
+
|
|
52
|
+
# 2. Inputs are ANY enumerable of opaque objects — selecting them is YOUR job.
|
|
53
|
+
reviews = [
  [1, 'Widget', 'Absolutely love it, works perfectly!'],
  [2, 'Gadget', 'It broke after two days. Very disappointed.'],
  [3, 'Gizmo', 'It is fine. Does what it says, nothing special.']
].map { |id, product, text| { id:, product:, text: } }

# 3. The (model, vendor) candidates belong to the caller — the gem ships no
#    default list (which models you have pulled / hold keys for is your concern).
CANDIDATES = [
  { model: 'gpt-4o-mini', vendor: :openai },
  { model: 'gemini-2.5-flash', vendor: :gemini }
].freeze

# The judge defaults to llama-3.3-70b-versatile on Groq (outside the candidate
# families → no self-judge bias). Change it here if you have other quota.
judge_config = { model: 'llama-3.3-70b-versatile', vendor: :groq }

# The default store is InMemory; a FileStore persists outputs and enables
# report_only / judge_only re-runs.
result_store = LlmConductor::Eval::Store::FileStore.new('tmp/llm_eval')

report = LlmConductor::Eval.run(
  spec: SentimentSpec.new,
  inputs: reviews,
  models: CANDIDATES,
  judge: judge_config,
  store: result_store
)

puts report.to_markdown
puts "\n--- Rows needing human review ---"
report.needs_review.each do |flagged|
  reasons = flagged[:reasons].join(', ')
  puts "input=#{flagged[:input_id]} model=#{flagged[:model]} reasons=#{reasons}"
end

# Saving the CSV is up to you — the engine returns data, it doesn't impose a layout.
csv_path = 'tmp/llm_eval_results.csv'
File.write(csv_path, report.to_csv)
puts "\nWrote #{csv_path}"
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
|
|
5
|
+
module LlmConductor
  module Eval
    # Minimal, conservative JSON-from-LLM-text parser.
    #
    # Replaces the app-level LlmJsonCleaner the Rails prototype relied on. The
    # guiding principle (from docs/llm_eval_framework.md) is: NEVER "repair"
    # already-valid JSON — heavy cleaning corrupts numeric scores and the like.
    # We only strip markdown fences, drop any preamble before the first brace,
    # trim to the outermost balanced object/array, then parse once.
    module JsonParser
      module_function

      # Parse +text+ into a Hash or Array, or return nil on any failure
      # (no JSON-looking content, invalid JSON, a bare scalar, or text whose
      # bytes are not valid in its encoding).
      #
      # @param text [#to_s] raw model output; nil is treated as empty
      # @return [Hash, Array, nil]
      def parse(text)
        prepared = prepare_text(text)
        return nil if prepared.empty?

        obj = begin
          JSON.parse(prepared)
        rescue JSON::ParserError
          nil
        end
        obj.is_a?(Hash) || obj.is_a?(Array) ? obj : nil
      end

      # Strip ```json fences, drop preamble before the first [ or {, and trim
      # to the matching closing brace/bracket. Returns '' when there is no
      # JSON-looking content at all, or when the text has a broken byte
      # sequence.
      #
      # @param text [#to_s]
      # @return [String]
      def prepare_text(text)
        raw = text.to_s
        # Regexp matching raises ArgumentError on a string with an invalid
        # byte sequence; report "no JSON" instead so #parse returns nil.
        return '' unless raw.valid_encoding?

        # Both fence patterns are anchored, so each can match at most once.
        str = raw.strip
                 .sub(/\A```(?:json)?\s*/i, '')
                 .sub(/```\s*\z/, '')
                 .strip
        start = str.index(/[\[{]/)
        return '' if start.nil?

        balance(str[start..])
      end

      # Given a string that starts with '{' or '[', return the substring up to
      # and including its matching close. String contents (and escapes) are
      # skipped so braces inside string literals don't throw off the depth.
      # When no matching close is found the input is returned unchanged.
      #
      # @param str [String]
      # @return [String]
      def balance(str)
        open = str[0]
        close = open == '{' ? '}' : ']'
        depth = 0
        in_string = false
        escape = false

        str.each_char.with_index do |char, index|
          if in_string
            if escape then escape = false
            elsif char == '\\' then escape = true
            elsif char == '"' then in_string = false
            end
            next
          end

          case char
          when '"' then in_string = true
          when open then depth += 1
          when close
            depth -= 1
            return str[0..index] if depth.zero?
          end
        end

        str
      end
    end
  end
end
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
require_relative 'verdict'
|
|
5
|
+
require_relative 'json_parser'
|
|
6
|
+
require_relative 'model_runner'
|
|
7
|
+
|
|
8
|
+
module LlmConductor
  module Eval
    # LLM-as-judge for one candidate (input, model) output.
    #
    # Sends the judge model the original input data, the spec's rubric excerpt,
    # and the candidate's parsed output (or raw text on parse failure), and
    # expects strict JSON back with a quality_score + per-dimension scores.
    #
    # Judge defaults to Groq's llama-3.3-70b-versatile: it sits OUTSIDE the
    # Gemini/OpenAI/Ollama families that dominate most candidate lists (avoiding
    # self-judge bias — Gemini grades its own output ~10pts high) and Groq's
    # free tier offers far more throughput than Gemini Pro's ~2 RPM. Override
    # via the +judge:+ config. It needs Groq credentials configured; rows where
    # the judged model == the judge model are flagged +self_judge+ in the report.
    class Judge
      DEFAULT_MODEL = 'llama-3.3-70b-versatile'
      DEFAULT_VENDOR = :groq

      # Convenience delegate to Verdict.borderline?.
      def self.borderline?(score)
        Verdict.borderline?(score)
      end

      # @param spec supplies judge_rubric_excerpt and judge_dimensions
      # @param store read via read_parsed / read_raw for the candidate output
      # @param run_id identifier passed through to the store reads
      # @param logger receives warn (rate-limit retries) and error (exceptions)
      # @param judge_model [String] model name handed to LlmConductor.generate
      # @param judge_vendor [#to_sym] vendor handed to LlmConductor.generate
      # @param rate_limit_retries [Integer] maximum retries after a rate-limit error
      # @param rate_limit_backoff_seconds [Numeric] base wait, doubled on each retry
      def initialize(spec:, store:, run_id:, logger:, judge_model: DEFAULT_MODEL,
                     judge_vendor: DEFAULT_VENDOR, rate_limit_retries: 3,
                     rate_limit_backoff_seconds: 20)
        @spec = spec
        @store = store
        @run_id = run_id
        @logger = logger
        @judge_model = judge_model
        @judge_vendor = judge_vendor.to_sym
        @rate_limit_retries = rate_limit_retries
        @rate_limit_backoff_seconds = rate_limit_backoff_seconds
      end

      # +model_result+ is an Eval::Result. +input_data+ is the spec's data Hash
      # for the input being judged.
      #
      # Always returns a Verdict: a failed or unparseable judge call yields a
      # failure verdict, and any StandardError raised along the way is logged
      # and reported through +judge_error+.
      def judge(model_result:, input_data:)
        prompt = build_prompt(model_result:, input_data:)
        response, latency_ms = call_with_rate_limit_retry(prompt)

        unless response&.success?
          error = response&.metadata&.dig(:error) || 'judge LLM call failed'
          return failure_verdict(latency_ms:, response:, error:)
        end

        parsed = JsonParser.parse(response.output)
        if parsed.nil?
          return failure_verdict(latency_ms:, response:,
                                 error: "judge output not valid JSON: #{response.output.to_s[0, 200]}")
        end

        build_verdict(parsed:, latency_ms:, response:)
      rescue StandardError => e
        @logger.error("[Eval::Judge] #{@judge_model}: #{e.class}: #{e.message}")
        Verdict.new(judge_model: @judge_model, judge_error: "#{e.class}: #{e.message}")
      end

      private

      # Turns the judge's parsed JSON plus call metadata into a Verdict.
      def build_verdict(parsed:, latency_ms:, response:)
        Verdict.new(
          quality_score: clamp_score(parsed['quality_score']),
          dimensions: extract_dimensions(parsed['dimensions']),
          issues: Array(parsed['issues']).map(&:to_s),
          verdict_one_line: parsed['verdict_one_line'].to_s,
          judge_model: @judge_model,
          judge_latency_ms: latency_ms,
          judge_input_tokens: response.input_tokens,
          judge_output_tokens: response.output_tokens,
          judge_estimated_cost_usd: response.estimated_cost
        )
      end

      # Calls the judge model, retrying with exponential backoff while the
      # response looks rate-limited and retries remain.
      #
      # @return [Array] +[response, latency_ms]+; the latency spans every
      #   attempt, including the backoff sleeps between them
      def call_with_rate_limit_retry(prompt)
        attempt = 0
        started_at = monotonic_seconds
        loop do
          response = LlmConductor.generate(model: @judge_model, prompt:, vendor: @judge_vendor)
          if !response&.success? && rate_limited?(response) && attempt < @rate_limit_retries
            wait = @rate_limit_backoff_seconds * (2**attempt)
            @logger.warn("[Eval::Judge] 429 from #{@judge_model}; sleeping #{wait}s then retrying " \
                         "(attempt #{attempt + 1}/#{@rate_limit_retries})")
            sleep(wait)
            attempt += 1
            next
          end
          return [response, ((monotonic_seconds - started_at) * 1000).round]
        end
      end

      # Monotonic clock reading in seconds; unlike wall-clock time it cannot
      # jump when the system clock is adjusted, so differences are reliable.
      def monotonic_seconds
        Process.clock_gettime(Process::CLOCK_MONOTONIC)
      end

      # True when the response's error text mentions 429 or "rate limit".
      def rate_limited?(response)
        error = response&.metadata&.dig(:error).to_s
        error.include?('429') || error.match?(/rate.limit/i)
      end

      # Assembles the judge prompt from the rubric, the input data, the
      # candidate output and the requested scoring dimensions.
      def build_prompt(model_result:, input_data:)
        <<~PROMPT
          You are an impartial judge evaluating how well a candidate LLM performed a
          task against its rubric. Score the candidate's output on a 0-100 quality
          scale. Be strict but fair: a perfect rubric-adherent response grounded in
          the provided evidence is 90-100; obvious hallucinations or rubric violations
          should drop the score significantly.

          <rubric_excerpt>
          #{@spec.judge_rubric_excerpt}
          </rubric_excerpt>

          <original_input_data>
          #{JSON.pretty_generate(input_data)}
          </original_input_data>

          <candidate_output>
          #{candidate_block(model_result)}
          </candidate_output>

          <judging_dimensions>
          #{judging_dimensions_block}
          </judging_dimensions>

          Return ONE JSON object with no markdown fences and no commentary:

          {
            "quality_score": 0-100,
            "dimensions": {
          #{dimensions_json_template}
            },
            "issues": ["concrete one-line problem", "..."],
            "verdict_one_line": "one-line summary of overall judgment"
          }
        PROMPT
      end

      # Text shown to the judge as the candidate's output: the stored parsed
      # output when present, otherwise the stored raw output, otherwise a
      # note carrying the result's status and error.
      def candidate_block(model_result)
        slug = ModelRunner.slug(model_result.model)
        parsed = @store.read_parsed(@run_id, model_result.input_id, slug)
        return parsed.is_a?(String) ? parsed : JSON.pretty_generate(parsed) if parsed

        raw = @store.read_raw(@run_id, model_result.input_id, slug)
        if raw && !raw.empty?
          "PARSE FAILED. RAW OUTPUT:\n#{raw}"
        else
          "CANDIDATE PRODUCED NO USABLE OUTPUT. status=#{model_result.status} error=#{model_result.error}"
        end
      end

      # One bullet line per spec dimension for the prompt.
      def judging_dimensions_block
        @spec.judge_dimensions.map { |d| " - #{d[:key]} (0-100): #{d[:description]}" }.join("\n")
      end

      # One JSON member line per spec dimension for the response template.
      def dimensions_json_template
        @spec.judge_dimensions.map { |d| " \"#{d[:key]}\": 0-100" }.join(",\n")
      end

      # Keeps only the dimensions the spec declares, each clamped to 0..100
      # (nil when missing or not numeric). Returns {} unless +raw+ is a Hash.
      def extract_dimensions(raw)
        return {} unless raw.is_a?(Hash)

        @spec.judge_dimensions.each_with_object({}) do |d, acc|
          acc[d[:key]] = clamp_score(raw[d[:key]] || raw[d[:key].to_s])
        end
      end

      # Coerces +raw+ to an Integer within 0..100, or nil when it cannot be
      # coerced. FloatDomainError covers non-finite floats (Infinity / NaN),
      # which would otherwise abort the whole verdict over one bad number.
      def clamp_score(raw)
        return nil if raw.nil?

        Integer(raw).clamp(0, 100)
      rescue ArgumentError, TypeError, FloatDomainError
        nil
      end

      # Verdict for a judge call that failed or returned unparseable output:
      # score 0, empty details, and whatever call metadata is available.
      def failure_verdict(latency_ms:, response:, error:)
        Verdict.new(
          judge_model: @judge_model, judge_latency_ms: latency_ms,
          judge_input_tokens: response&.input_tokens, judge_output_tokens: response&.output_tokens,
          judge_estimated_cost_usd: response&.estimated_cost, judge_error: error,
          quality_score: 0, dimensions: {}, issues: [], verdict_one_line: ''
        )
      end
    end
  end
end
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'result'
|
|
4
|
+
|
|
5
|
+
module LlmConductor
  module Eval
    # Runs one (input, model) pair through LlmConductor.generate, capturing
    # latency / tokens / cost / parse status and writing raw + parsed outputs
    # through the Store. Side-effect free — never touches the caller's data.
    #
    # All feature-specific behavior (prompt type, payload, parsing,
    # score/bucket extraction) is delegated to the Spec.
    class ModelRunner
      # Filesystem-safe slug for a model name (e.g. "gemini-2.5-flash"): every
      # run of characters outside A-Z, a-z, 0-9, "_", "." and "-" becomes "_".
      def self.slug(model)
        model.to_s.gsub(/[^A-Za-z0-9_.-]+/, '_')
      end

      # @param input the caller's input object, only ever handed to the spec
      # @param model [String] model name passed to LlmConductor.generate
      # @param vendor [#to_sym] vendor passed to LlmConductor.generate
      # @param spec supplies ids, labels, payload, parsing and summaries
      # @param store receives write_raw / write_parsed
      # @param run_id identifier passed through to the store writes
      # @param logger receives error when the run raises
      # @param data optional pre-built payload; when nil, spec.build_data is used
      def initialize(input, model:, vendor:, spec:, store:, run_id:, logger:, data: nil)
        @input = input
        @model = model
        @vendor = vendor.to_sym
        @spec = spec
        @store = store
        @run_id = run_id
        @logger = logger
        @data = data
      end

      # Executes the pair and returns a Result whose status is 'ok',
      # 'llm_error', 'parse_error' or 'exception'. Any StandardError is
      # logged and converted into an 'exception' Result rather than raised.
      def run
        input_id = @spec.input_id(@input)
        data = @data || @spec.build_data(@input)

        # Monotonic clock: wall-clock time can jump and skew the latency.
        started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
        response = LlmConductor.generate(**generate_args(data, input_id))
        latency_ms = ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at) * 1000).round

        raw_ref = @store.write_raw(@run_id, input_id, slug, response&.output.to_s)

        if response.nil? || !response.success?
          error = response&.metadata&.dig(:error) || 'LLM returned no response'
          return build_result(input_id:, status: 'llm_error', latency_ms:, response:, raw_ref:, error:)
        end

        parsed = @spec.parse(response.output)
        if parsed.nil?
          return build_result(input_id:, status: 'parse_error', latency_ms:, response:, raw_ref:,
                              error: 'LLM output not valid structured data')
        end

        parsed_ref = @store.write_parsed(@run_id, input_id, slug, parsed)
        build_result(input_id:, status: 'ok', latency_ms:, response:, raw_ref:, parsed_ref:, parsed:)
      rescue StandardError => e
        # Reuse the id computed above (nil when @spec.input_id itself raised)
        # and guard the label lookup: calling the spec again here could raise
        # a second time and let the exception escape.
        @logger.error("[Eval::ModelRunner] #{@model}@#{input_id}: #{e.class}: #{e.message}")
        Result.new(input_id:, input_label: safe_input_label, model: @model,
                   vendor: @vendor, status: 'exception', error: "#{e.class}: #{e.message}")
      end

      # Slug of this runner's model, used as the store key.
      def slug
        self.class.slug(@model)
      end

      private

      # Keyword arguments for LlmConductor.generate: +type:+ and +data:+ when
      # the spec has a prompt type, otherwise the payload as +prompt:+; plus
      # +params:+ when the spec returns non-empty vendor params.
      def generate_args(data, input_id)
        args = { model: @model, vendor: @vendor }
        if @spec.prompt_type
          args[:type] = @spec.prompt_type
          args[:data] = data
        else
          args[:prompt] = data
        end
        params = @spec.vendor_params(vendor: @vendor, input_id:)
        args[:params] = params unless params.nil? || params.empty?
        args
      end

      # Builds the Result for a completed call. Score, bucket and extra
      # columns come from the spec only when parsed output is available.
      def build_result(input_id:, status:, latency_ms:, response:, raw_ref:, parsed_ref: nil, parsed: nil, error: nil)
        summary = parsed ? @spec.output_summary(parsed) : { score: nil, bucket: nil }
        Result.new(
          input_id:, input_label:, model: @model, vendor: @vendor, status:, latency_ms:,
          input_tokens: response&.input_tokens, output_tokens: response&.output_tokens,
          total_tokens: response&.total_tokens, estimated_cost_usd: response&.estimated_cost,
          parsed_score: summary[:score], parsed_bucket: summary[:bucket],
          extra_columns: parsed ? @spec.extra_columns(parsed) : {},
          raw_output_ref: raw_ref, parsed_output_ref: parsed_ref, error:
        )
      end

      # Label the spec assigns to this input.
      def input_label
        @spec.input_label(@input)
      end

      # Label lookup for the error path: nil instead of raising.
      def safe_input_label
        input_label
      rescue StandardError
        nil
      end
    end
  end
end
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LlmConductor
  module Eval
    # Value object handed back from a run. It carries the aggregated results
    # together with the CSV and markdown renderings; persisting any of it is
    # left to the caller, so no filesystem layout is imposed on consumers.
    #
    # - +rows+         : Array of { model_result: Result, judge_verdict: Verdict }
    # - +summary+      : Array of per-model aggregate Hashes, best-quality first
    # - +needs_review+ : Array of Hashes for rows flagged for human eyeball
    Report = Struct.new(:rows, :summary, :needs_review, :csv_string, :markdown_string, keyword_init: true) do
      # The CSV rendering held in +csv_string+.
      def to_csv = csv_string

      # The markdown rendering held in +markdown_string+.
      def to_markdown = markdown_string
    end
  end
end
|