llm_conductor 1.7.1 → 1.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2a6fce29f76d891a6c773b998ec21a2cc30b1c34f979640f86a71d20bfc6af8d
4
- data.tar.gz: 07ead7a9e05e819a6a644fc741ed5a118123d9130ea96d8725da6b4be09573ca
3
+ metadata.gz: ac4b318b4227e3f089f42d471d51a1d7fefc9c54d080c30bbb2e748039ee4ae0
4
+ data.tar.gz: c21eb439aed4fc671fde4dc30c06c12f0853a5f99ba29d47c3bd3b389f5b27c0
5
5
  SHA512:
6
- metadata.gz: 7cc94df2e2ed528d7fff9d688c9ac69cdad48282fc268da8aa78b8a0aa99db66bbffccd74661860b47f4df04a0f98826b60db27052eb55845282712bf514bfef
7
- data.tar.gz: '032148a55e18c3402765695baa053f9fed3e07fcbb0dd7d550fa6199ba29a1ef1a63d7df644d6d839c502952614e6ae08585a24459345b7b34e96989fe60c350'
6
+ metadata.gz: 918613d702e3918ae5651ec8aefab34126a36a17c0be75549a2227fb9a3e6b7392c245a7d96a2d15df6e76996e871c66c1572cb1f061d5bef1c96fe1e31ec8e4
7
+ data.tar.gz: e4b356669ce32103a8a264c66574edf7f0819379240798d0d91b0cd29c623e512cd6776eb4d5bcf421d6fb8cb30bb2467cc188154910928a8aa6b16adc017d76
data/.rubocop.yml CHANGED
@@ -31,6 +31,10 @@ Lint/ConstantDefinitionInBlock:
31
31
 
32
32
  Metrics/ClassLength:
33
33
  Max: 130
34
+ Exclude:
35
+ # Eval engine classes are faithful ports of the Rails prototype; their size
36
+ # mirrors the reference implementation.
37
+ - 'lib/llm_conductor/eval/**/*'
34
38
 
35
39
  Metrics/MethodLength:
36
40
  Max: 15
@@ -39,6 +43,7 @@ Metrics/MethodLength:
39
43
  - 'lib/llm_conductor/clients/openrouter_client.rb'
40
44
  - 'lib/llm_conductor/clients/zai_client.rb'
41
45
  - 'lib/llm_conductor/client_factory.rb'
46
+ - 'lib/llm_conductor/eval/**/*'
42
47
  - 'examples/*.rb'
43
48
 
44
49
  RSpec/ExampleLength:
@@ -98,12 +103,15 @@ Metrics/AbcSize:
98
103
  - 'lib/llm_conductor/prompts.rb'
99
104
  - 'lib/llm_conductor/clients/openrouter_client.rb'
100
105
  - 'lib/llm_conductor/clients/zai_client.rb'
106
+ - 'lib/llm_conductor/eval/**/*'
101
107
  - 'examples/*.rb'
102
108
 
103
109
  Metrics/ParameterLists:
104
110
  Exclude:
105
111
  - 'lib/llm_conductor.rb'
106
112
  - 'lib/llm_conductor/configuration.rb'
113
+ - 'lib/llm_conductor/eval.rb'
114
+ - 'lib/llm_conductor/eval/**/*'
107
115
 
108
116
  Metrics/CyclomaticComplexity:
109
117
  Exclude:
@@ -111,6 +119,7 @@ Metrics/CyclomaticComplexity:
111
119
  - 'lib/llm_conductor/prompts.rb'
112
120
  - 'lib/llm_conductor/clients/openrouter_client.rb'
113
121
  - 'lib/llm_conductor/clients/zai_client.rb'
122
+ - 'lib/llm_conductor/eval/**/*'
114
123
  - 'examples/*.rb'
115
124
 
116
125
  Metrics/PerceivedComplexity:
@@ -118,6 +127,7 @@ Metrics/PerceivedComplexity:
118
127
  - 'lib/llm_conductor/prompts.rb'
119
128
  - 'lib/llm_conductor/clients/openrouter_client.rb'
120
129
  - 'lib/llm_conductor/clients/zai_client.rb'
130
+ - 'lib/llm_conductor/eval/**/*'
121
131
 
122
132
  Layout/LineLength:
123
133
  Max: 125
data/README.md CHANGED
@@ -252,6 +252,68 @@ else
252
252
  end
253
253
  ```
254
254
 
255
+ ## Model Evaluation (opt-in)
256
+
257
+ Which model/vendor is best for *your* prompt? The eval harness runs the same
258
+ prompt across N `(model, vendor)` pairs over M inputs and compares them on
259
+ **cost, latency, tokens, and LLM-judged quality** — three of which `generate`
260
+ already produces for free.
261
+
262
+ It's behind a separate require so core users pay nothing:
263
+
264
+ ```ruby
265
+ require 'llm_conductor/eval'
266
+
267
+ # 1. Describe how to evaluate your feature (the only feature-specific code).
268
+ class ArticleSummarySpec < LlmConductor::Eval::Spec
269
+ def prompt_type = :summarize_text # a registered prompt type
270
+ def input_id(article) = article[:id]
271
+ def input_label(article) = article[:title]
272
+ def build_data(article) = { content: article[:body] } # payload for generate(type:, data:)
273
+
274
+ # { score:, bucket: } — bucket is any discrete label; powers disagreement detection.
275
+ def output_summary(parsed) = { score: parsed['rating'], bucket: parsed['verdict'] }
276
+
277
+ def judge_rubric_excerpt = 'A good summary is faithful, concise, and covers the key points.'
278
+ def judge_dimensions
279
+ [{ key: 'faithfulness', description: 'no hallucinations vs. the source' },
280
+ { key: 'coverage', description: 'captures the key points' }]
281
+ end
282
+ end
283
+
284
+ # 2. Run it. `inputs` is ANY enumerable — selection is YOUR job, never the gem's.
285
+ report = LlmConductor::Eval.run(
286
+ spec: ArticleSummarySpec.new,
287
+ inputs: my_articles,
288
+ models: [ # caller-owned; no baked-in defaults
289
+ { model: 'phi4-mini', vendor: :ollama },
290
+ { model: 'gemini-2.5-flash', vendor: :gemini },
291
+ { model: 'gpt-4o-mini', vendor: :openai }
292
+ ],
293
+ judge: { model: 'llama-3.3-70b-versatile', vendor: :groq }, # default; needs Groq creds
294
+ store: LlmConductor::Eval::Store::FileStore.new('tmp/llm_eval') # or in-memory (default)
295
+ )
296
+
297
+ report.summary # per-model aggregates: parse-OK%, mean quality, p50/p95 latency, cost
298
+ report.to_markdown # decision-aid report (you persist it)
299
+ report.to_csv # full per-row data
300
+ report.needs_review # rows flagged for a human (bucket disagreement / borderline / errors)
301
+ ```
302
+
303
+ **Judge bias matters.** The judge defaults to Groq's `llama-3.3-70b-versatile`
304
+ precisely because it sits *outside* the Gemini/OpenAI/Ollama candidate families —
305
+ a model grading output from its own family tends to score it too high. Any row where the judged model
306
+ equals the judge model is flagged `self_judge=true` so you can discount it.
307
+
308
+ Cheap re-runs reuse stored candidate outputs — no re-calling the candidates:
309
+
310
+ ```ruby
311
+ LlmConductor::Eval.judge_only(run_id:, spec:, store:, judge: { model: 'gemini-2.5-pro', vendor: :gemini })
312
+ LlmConductor::Eval.report_only(run_id:, spec:, store:)
313
+ ```
314
+
315
+ See [`examples/model_eval_usage.rb`](examples/model_eval_usage.rb) for a complete runnable example.
316
+
255
317
  ## Documentation
256
318
 
257
319
  - **[Custom Parameters Guide](docs/custom-parameters.md)** - Temperature, top_p, and more
@@ -272,6 +334,7 @@ Check the [examples/](examples/) directory for comprehensive examples:
272
334
  - `data_builder_usage.rb` - Data builder patterns
273
335
  - `prompt_registration.rb` - Custom prompt classes
274
336
  - `rag_usage.rb` - Retrieval-Augmented Generation
337
+ - `model_eval_usage.rb` - Model evaluation harness (cost/latency/quality comparison)
275
338
 
276
339
  Run any example:
277
340
 
@@ -0,0 +1,86 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Model Evaluation harness example.
4
+ #
5
+ # Runs the same prompt across several (model, vendor) pairs over a handful of
6
+ # inputs and compares them on cost, latency, tokens, and LLM-judged quality.
7
+ #
8
+ # Run with: ruby examples/model_eval_usage.rb
9
+ #
10
+ # Requires credentials for whichever vendors you list in CANDIDATES (and Groq
11
+ # for the default judge). Configure them via ENV (OPENAI_API_KEY, GEMINI_API_KEY,
12
+ # GROQ_API_KEY, OLLAMA_ADDRESS, ...) or LlmConductor.configure.
13
+
14
+ require 'llm_conductor/eval'
15
+
16
+ # 1. A Spec describes the ONE feature being evaluated. It is the only
17
+ # feature-specific code; the engine itself is generic.
18
class SentimentSpec < LlmConductor::Eval::Spec
  # No registered prompt type: build_data returns a complete prompt string,
  # which the engine then passes as `prompt:` (instead of `type:` + `data:`).
  def prompt_type
    nil
  end

  # Identifier for one review.
  def input_id(review)
    review[:id]
  end

  # Human-readable label for one review (the product name).
  def input_label(review)
    review[:product]
  end

  # Full prompt text for one review.
  def build_data(review)
    <<~PROMPT
      Classify the sentiment of this product review. Respond with ONLY a JSON
      object: {"sentiment": "positive|neutral|negative", "confidence": 0-100}

      Review: #{review[:text]}
    PROMPT
  end

  # score + bucket feed the CSV and bucket-disagreement detection. The bucket
  # is the sentiment label — when models disagree on it, the row is flagged.
  def output_summary(parsed)
    score = parsed['confidence']
    bucket = parsed['sentiment']
    { score:, bucket: }
  end

  # Rubric text shown to the judge model.
  def judge_rubric_excerpt
    "A correct classification matches the review's actual sentiment and gives " \
      'a calibrated confidence (high only when the text is unambiguous).'
  end

  # Dimensions the judge scores, as an Array of { key:, description: } Hashes.
  def judge_dimensions
    {
      'correctness' => 'is the sentiment label correct',
      'calibration' => 'is the confidence well-calibrated'
    }.map { |key, description| { key:, description: } }
  end
end
51
+
52
# 2. Inputs are ANY enumerable of opaque objects — selecting them is YOUR job.
reviews = [
  [1, 'Widget', 'Absolutely love it, works perfectly!'],
  [2, 'Gadget', 'It broke after two days. Very disappointed.'],
  [3, 'Gizmo', 'It is fine. Does what it says, nothing special.']
].map { |id, product, text| { id:, product:, text: } }

# 3. Candidate (model, vendor) pairs are caller-owned — there is no baked-in
#    default list (which models you have pulled / hold keys for is your concern).
CANDIDATES = [
  { model: 'gpt-4o-mini', vendor: :openai },
  { model: 'gemini-2.5-flash', vendor: :gemini }
].freeze

spec = SentimentSpec.new

# Judge defaults to llama-3.3-70b-versatile on Groq (outside the candidate
# families → no self-judge bias). Override here if you have other quota.
judge = { model: 'llama-3.3-70b-versatile', vendor: :groq }

# InMemory store is the default; swap in FileStore to persist + enable
# report_only / judge_only re-runs:
store = LlmConductor::Eval::Store::FileStore.new('tmp/llm_eval')

report = LlmConductor::Eval.run(spec:, inputs: reviews, models: CANDIDATES, judge:, store:)

puts report.to_markdown
puts "\n--- Rows needing human review ---"
report.needs_review.each do |flagged|
  reasons = flagged[:reasons].join(', ')
  puts "input=#{flagged[:input_id]} model=#{flagged[:model]} reasons=#{reasons}"
end

# Persist the CSV yourself — the engine returns data, it doesn't impose a layout.
csv_path = 'tmp/llm_eval_results.csv'
File.write(csv_path, report.to_csv)
puts "\nWrote tmp/llm_eval_results.csv"
@@ -0,0 +1,76 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+
5
module LlmConductor
  module Eval
    # Minimal, conservative JSON-from-LLM-text parser.
    #
    # Replaces the app-level LlmJsonCleaner the Rails prototype relied on. The
    # guiding principle (from docs/llm_eval_framework.md) is: NEVER "repair"
    # already-valid JSON — heavy cleaning corrupts numeric scores and the like.
    # We only strip markdown fences, drop any preamble before the first brace,
    # trim to the outermost balanced object/array, then parse once.
    module JsonParser
      module_function

      # Parse +text+ into a Hash or Array, or return nil on any failure.
      #
      # @param text [#to_s] raw model output; nil is treated as empty
      # @return [Hash, Array, nil] the decoded JSON container, or nil when no
      #   JSON-looking content is found or it does not parse
      def parse(text)
        prepared = prepare_text(text)
        return nil if prepared.empty?

        obj = begin
          JSON.parse(prepared)
        rescue JSON::ParserError
          nil
        end
        obj.is_a?(Hash) || obj.is_a?(Array) ? obj : nil
      end

      # Strip ```json fences, drop preamble before the first [ or {, and trim
      # to the matching closing brace/bracket. Returns '' when there is no
      # JSON-looking content at all.
      #
      # @param text [#to_s] raw model output
      # @return [String] candidate JSON text, or '' when none is present
      def prepare_text(text)
        # Scrub first: strip/regex matching raise ArgumentError on strings with
        # invalid byte sequences, which would break parse's nil-on-failure
        # contract. Valid input is returned unchanged by scrub.
        str = text.to_s.scrub.strip
                  .sub(/\A```(?:json)?\s*/i, '')
                  .sub(/```\s*\z/, '')
                  .strip
        start = str.index(/[\[{]/)
        return '' if start.nil?

        balance(str[start..])
      end

      # Given a string that starts with '{' or '[', return the substring up to
      # and including its matching close. String contents (and escapes) are
      # skipped so braces inside string literals don't throw off the depth.
      # Only the opening delimiter's own kind is depth-counted. When no matching
      # close is found the input is returned unchanged.
      #
      # @param str [String] text beginning at an opening brace/bracket
      # @return [String] the balanced prefix, or +str+ itself if unbalanced
      def balance(str)
        open = str[0]
        close = open == '{' ? '}' : ']'
        depth = 0
        in_string = false
        escape = false

        str.each_char.with_index do |char, index|
          if in_string
            if escape then escape = false
            elsif char == '\\' then escape = true
            elsif char == '"' then in_string = false
            end
            next
          end

          case char
          when '"' then in_string = true
          when open then depth += 1
          when close
            depth -= 1
            return str[0..index] if depth.zero?
          end
        end

        str
      end
    end
  end
end
@@ -0,0 +1,188 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+ require_relative 'verdict'
5
+ require_relative 'json_parser'
6
+ require_relative 'model_runner'
7
+
8
module LlmConductor
  module Eval
    # LLM-as-judge for one candidate (input, model) output.
    #
    # Sends the judge model the original input data, the spec's rubric excerpt,
    # and the candidate's parsed output (or raw text on parse failure), and
    # expects strict JSON back with a quality_score + per-dimension scores.
    #
    # Judge defaults to Groq's llama-3.3-70b-versatile: it sits OUTSIDE the
    # Gemini/OpenAI/Ollama families that dominate most candidate lists (avoiding
    # self-judge bias — Gemini grades its own output ~10pts high) and Groq's
    # free tier offers far more throughput than Gemini Pro's ~2 RPM. Override
    # via the +judge:+ config. It needs Groq credentials configured; rows where
    # the judged model == the judge model are flagged +self_judge+ in the report.
    class Judge
      DEFAULT_MODEL = 'llama-3.3-70b-versatile'
      DEFAULT_VENDOR = :groq

      # A standalone 429 status (not digits embedded in a longer number or id)
      # or a "rate limit"-style phrase in the error text.
      RATE_LIMIT_PATTERN = /(?<!\d)429(?!\d)|rate.limit/i
      private_constant :RATE_LIMIT_PATTERN

      # Delegates to Verdict.borderline?.
      def self.borderline?(score)
        Verdict.borderline?(score)
      end

      # @param spec [Object] supplies judge_rubric_excerpt and judge_dimensions
      # @param store [Object] supplies read_parsed / read_raw for candidate output
      # @param run_id [Object] run identifier passed through to the store
      # @param logger [Object] receives warn / error messages
      # @param judge_model [String] model used as the judge
      # @param judge_vendor [Symbol, String] vendor for the judge (stored as a Symbol)
      # @param rate_limit_retries [Integer] max retries after a rate-limited call
      # @param rate_limit_backoff_seconds [Numeric] base wait; doubles per retry
      def initialize(spec:, store:, run_id:, logger:, judge_model: DEFAULT_MODEL,
                     judge_vendor: DEFAULT_VENDOR, rate_limit_retries: 3,
                     rate_limit_backoff_seconds: 20)
        @spec = spec
        @store = store
        @run_id = run_id
        @logger = logger
        @judge_model = judge_model
        @judge_vendor = judge_vendor.to_sym
        @rate_limit_retries = rate_limit_retries
        @rate_limit_backoff_seconds = rate_limit_backoff_seconds
      end

      # +model_result+ is an Eval::Result. +input_data+ is the spec's data Hash
      # for the input being judged.
      #
      # Returns a Verdict. A failed judge call or unparseable judge output
      # yields a failure verdict carrying +judge_error+; any StandardError is
      # logged and also converted into a Verdict with +judge_error+ set.
      def judge(model_result:, input_data:)
        prompt = build_prompt(model_result:, input_data:)
        response, latency_ms = call_with_rate_limit_retry(prompt)

        unless response&.success?
          error = response&.metadata&.dig(:error) || 'judge LLM call failed'
          return failure_verdict(latency_ms:, response:, error:)
        end

        parsed = JsonParser.parse(response.output)
        if parsed.nil?
          return failure_verdict(latency_ms:, response:,
                                 error: "judge output not valid JSON: #{response.output.to_s[0, 200]}")
        end

        build_verdict(parsed:, latency_ms:, response:)
      rescue StandardError => e
        @logger.error("[Eval::Judge] #{@judge_model}: #{e.class}: #{e.message}")
        Verdict.new(judge_model: @judge_model, judge_error: "#{e.class}: #{e.message}")
      end

      private

      # Build the success Verdict from the judge's parsed JSON.
      def build_verdict(parsed:, latency_ms:, response:)
        Verdict.new(
          quality_score: clamp_score(parsed['quality_score']),
          dimensions: extract_dimensions(parsed['dimensions']),
          issues: Array(parsed['issues']).map(&:to_s),
          verdict_one_line: parsed['verdict_one_line'].to_s,
          judge_model: @judge_model,
          judge_latency_ms: latency_ms,
          judge_input_tokens: response.input_tokens,
          judge_output_tokens: response.output_tokens,
          judge_estimated_cost_usd: response.estimated_cost
        )
      end

      # Call the judge model, retrying with exponential backoff while the
      # response looks rate-limited and retries remain.
      #
      # Returns [response, elapsed_ms]. The elapsed time spans the whole call,
      # including any backoff sleeps, and uses the monotonic clock so wall-clock
      # adjustments cannot skew it.
      def call_with_rate_limit_retry(prompt)
        attempt = 0
        started_at = monotonic_seconds
        loop do
          response = LlmConductor.generate(model: @judge_model, prompt:, vendor: @judge_vendor)
          if !response&.success? && rate_limited?(response) && attempt < @rate_limit_retries
            wait = @rate_limit_backoff_seconds * (2**attempt)
            @logger.warn("[Eval::Judge] 429 from #{@judge_model}; sleeping #{wait}s then retrying " \
                         "(attempt #{attempt + 1}/#{@rate_limit_retries})")
            sleep(wait)
            attempt += 1
            next
          end
          return [response, ((monotonic_seconds - started_at) * 1000).round]
        end
      end

      # Current monotonic clock reading in (fractional) seconds.
      def monotonic_seconds
        Process.clock_gettime(Process::CLOCK_MONOTONIC)
      end

      # True when the response's :error metadata mentions a standalone 429 or a
      # "rate limit"-style phrase. A nil response or missing error is false.
      def rate_limited?(response)
        response&.metadata&.dig(:error).to_s.match?(RATE_LIMIT_PATTERN)
      end

      # Assemble the judge prompt: rubric, original input, candidate output,
      # scoring dimensions, and the JSON shape the judge must return.
      def build_prompt(model_result:, input_data:)
        <<~PROMPT
          You are an impartial judge evaluating how well a candidate LLM performed a
          task against its rubric. Score the candidate's output on a 0-100 quality
          scale. Be strict but fair: a perfect rubric-adherent response grounded in
          the provided evidence is 90-100; obvious hallucinations or rubric violations
          should drop the score significantly.

          <rubric_excerpt>
          #{@spec.judge_rubric_excerpt}
          </rubric_excerpt>

          <original_input_data>
          #{JSON.pretty_generate(input_data)}
          </original_input_data>

          <candidate_output>
          #{candidate_block(model_result)}
          </candidate_output>

          <judging_dimensions>
          #{judging_dimensions_block}
          </judging_dimensions>

          Return ONE JSON object with no markdown fences and no commentary:

          {
            "quality_score": 0-100,
            "dimensions": {
          #{dimensions_json_template}
            },
            "issues": ["concrete one-line problem", "..."],
            "verdict_one_line": "one-line summary of overall judgment"
          }
        PROMPT
      end

      # Text shown to the judge for the candidate: the stored parsed output if
      # present, else the stored raw output, else a note with status + error.
      def candidate_block(model_result)
        slug = ModelRunner.slug(model_result.model)
        parsed = @store.read_parsed(@run_id, model_result.input_id, slug)
        return parsed.is_a?(String) ? parsed : JSON.pretty_generate(parsed) if parsed

        raw = @store.read_raw(@run_id, model_result.input_id, slug)
        if raw && !raw.empty?
          "PARSE FAILED. RAW OUTPUT:\n#{raw}"
        else
          "CANDIDATE PRODUCED NO USABLE OUTPUT. status=#{model_result.status} error=#{model_result.error}"
        end
      end

      # One bullet line per spec dimension.
      def judging_dimensions_block
        @spec.judge_dimensions.map { |d| " - #{d[:key]} (0-100): #{d[:description]}" }.join("\n")
      end

      # One `"key": 0-100` line per spec dimension, for the JSON template.
      def dimensions_json_template
        @spec.judge_dimensions.map { |d| " \"#{d[:key]}\": 0-100" }.join(",\n")
      end

      # Pick the spec's dimensions out of the judge's "dimensions" object,
      # accepting the key as given or as a String. Non-Hash input yields {}.
      def extract_dimensions(raw)
        return {} unless raw.is_a?(Hash)

        @spec.judge_dimensions.each_with_object({}) do |d, acc|
          acc[d[:key]] = clamp_score(raw[d[:key]] || raw[d[:key].to_s])
        end
      end

      # Coerce a judge-supplied score to an Integer in 0..100, or nil when it
      # is missing or not numeric. Fractions are truncated, and decimal strings
      # such as "85.5" are accepted. NaN/Infinity return nil instead of raising.
      def clamp_score(raw)
        return nil if raw.nil?

        value = Integer(raw, exception: false) || Float(raw, exception: false)
        return nil if value.nil? || (value.is_a?(Float) && !value.finite?)

        value.to_i.clamp(0, 100)
      end

      # Verdict for a failed judge call or unparseable judge output: score 0,
      # empty details, whatever usage figures the response exposes, and the error.
      def failure_verdict(latency_ms:, response:, error:)
        Verdict.new(
          judge_model: @judge_model, judge_latency_ms: latency_ms,
          judge_input_tokens: response&.input_tokens, judge_output_tokens: response&.output_tokens,
          judge_estimated_cost_usd: response&.estimated_cost, judge_error: error,
          quality_score: 0, dimensions: {}, issues: [], verdict_one_line: ''
        )
      end
    end
  end
end
@@ -0,0 +1,95 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'result'
4
+
5
module LlmConductor
  module Eval
    # Runs one (input, model) pair through LlmConductor.generate, capturing
    # latency / tokens / cost / parse status and writing raw + parsed outputs
    # through the Store. Side-effect free — never touches the caller's data.
    #
    # All feature-specific behavior (prompt type, payload, parsing,
    # score/bucket extraction) is delegated to the Spec.
    class ModelRunner
      # Filesystem-safe slug for a model name (e.g. "gemini-2.5-flash").
      # Every run of characters outside A-Z, a-z, 0-9, '_', '.', '-' becomes '_'.
      def self.slug(model)
        model.to_s.gsub(/[^A-Za-z0-9_.-]+/, '_')
      end

      # @param input [Object] opaque input handed to the spec's hooks
      # @param model [String] candidate model name
      # @param vendor [Symbol, String] candidate vendor (stored as a Symbol)
      # @param spec [Object] feature spec providing the hooks used below
      # @param store [Object] provides write_raw / write_parsed
      # @param run_id [Object] run identifier passed through to the store
      # @param logger [Object] receives error messages
      # @param data [Object, nil] prebuilt payload; when nil, spec.build_data is used
      def initialize(input, model:, vendor:, spec:, store:, run_id:, logger:, data: nil)
        @input = input
        @model = model
        @vendor = vendor.to_sym
        @spec = spec
        @store = store
        @run_id = run_id
        @logger = logger
        @data = data
      end

      # Execute the candidate call and return a Result.
      #
      # Status is 'ok', 'llm_error' (no/unsuccessful response), 'parse_error'
      # (spec.parse returned nil) or 'exception' (any StandardError, including
      # one raised by a spec hook).
      def run
        input_id = @spec.input_id(@input)
        data = @data || @spec.build_data(@input)

        # Monotonic clock: wall-clock adjustments must not skew the latency.
        started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
        response = LlmConductor.generate(**generate_args(data, input_id))
        latency_ms = ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at) * 1000).round

        raw_ref = @store.write_raw(@run_id, input_id, slug, response&.output.to_s)

        if response.nil? || !response.success?
          error = response&.metadata&.dig(:error) || 'LLM returned no response'
          return build_result(input_id:, status: 'llm_error', latency_ms:, response:, raw_ref:, error:)
        end

        parsed = @spec.parse(response.output)
        if parsed.nil?
          return build_result(input_id:, status: 'parse_error', latency_ms:, response:, raw_ref:,
                              error: 'LLM output not valid structured data')
        end

        parsed_ref = @store.write_parsed(@run_id, input_id, slug, parsed)
        build_result(input_id:, status: 'ok', latency_ms:, response:, raw_ref:, parsed_ref:, parsed:)
      rescue StandardError => e
        # input_id is nil here when @spec.input_id itself raised. Do not call
        # the spec hook again — a second failure would escape this handler.
        exception_result(e, input_id)
      end

      # Slug of this runner's model (see ModelRunner.slug).
      def slug
        self.class.slug(@model)
      end

      private

      # Keyword arguments for LlmConductor.generate: type + data when the spec
      # declares a prompt type, otherwise the payload as prompt; vendor params
      # are added only when the spec returns a non-empty value.
      def generate_args(data, input_id)
        args = { model: @model, vendor: @vendor }
        if @spec.prompt_type
          args[:type] = @spec.prompt_type
          args[:data] = data
        else
          args[:prompt] = data
        end
        params = @spec.vendor_params(vendor: @vendor, input_id:)
        args[:params] = params unless params.nil? || params.empty?
        args
      end

      # Build the Result for a completed call; score, bucket and extra columns
      # come from the spec only when parsed output is available.
      def build_result(input_id:, status:, latency_ms:, response:, raw_ref:, parsed_ref: nil, parsed: nil, error: nil)
        summary = parsed ? @spec.output_summary(parsed) : { score: nil, bucket: nil }
        Result.new(
          input_id:, input_label:, model: @model, vendor: @vendor, status:, latency_ms:,
          input_tokens: response&.input_tokens, output_tokens: response&.output_tokens,
          total_tokens: response&.total_tokens, estimated_cost_usd: response&.estimated_cost,
          parsed_score: summary[:score], parsed_bucket: summary[:bucket],
          extra_columns: parsed ? @spec.extra_columns(parsed) : {},
          raw_output_ref: raw_ref, parsed_output_ref: parsed_ref, error:
        )
      end

      # Log +error+ and wrap it in a Result with status 'exception'.
      def exception_result(error, input_id)
        message = "#{error.class}: #{error.message}"
        @logger.error("[Eval::ModelRunner] #{@model}@#{input_id}: #{message}")
        Result.new(input_id:, input_label: safe_input_label, model: @model,
                   vendor: @vendor, status: 'exception', error: message)
      end

      # Label from the spec, or nil when the spec hook itself raises.
      def safe_input_label
        input_label
      rescue StandardError
        nil
      end

      def input_label
        @spec.input_label(@input)
      end
    end
  end
end
@@ -0,0 +1,22 @@
1
+ # frozen_string_literal: true
2
+
3
module LlmConductor
  module Eval
    # Value object returned by a run. Carries the aggregated results together
    # with the CSV / markdown strings it was built with. Persisting anything is
    # left to the caller — the engine never forces a filesystem layout on
    # consumers.
    #
    # - +rows+         : Array of { model_result: Result, judge_verdict: Verdict }
    # - +summary+      : Array of per-model aggregate Hashes, best-quality first
    # - +needs_review+ : Array of Hashes for rows flagged for human eyeball
    Report = Struct.new(:rows, :summary, :needs_review, :csv_string, :markdown_string, keyword_init: true) do
      # Returns the +csv_string+ member.
      def to_csv = csv_string

      # Returns the +markdown_string+ member.
      def to_markdown = markdown_string
    end
  end
end