llm_conductor 1.7.1 → 1.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +10 -0
- data/README.md +63 -0
- data/examples/model_eval_usage.rb +86 -0
- data/lib/llm_conductor/eval/json_parser.rb +76 -0
- data/lib/llm_conductor/eval/judge.rb +188 -0
- data/lib/llm_conductor/eval/model_runner.rb +95 -0
- data/lib/llm_conductor/eval/report.rb +22 -0
- data/lib/llm_conductor/eval/report_builder.rb +258 -0
- data/lib/llm_conductor/eval/result.rb +30 -0
- data/lib/llm_conductor/eval/runner.rb +148 -0
- data/lib/llm_conductor/eval/spec.rb +78 -0
- data/lib/llm_conductor/eval/store/base.rb +58 -0
- data/lib/llm_conductor/eval/store/file_store.rb +94 -0
- data/lib/llm_conductor/eval/store/in_memory.rb +76 -0
- data/lib/llm_conductor/eval/verdict.rb +31 -0
- data/lib/llm_conductor/eval.rb +75 -0
- data/lib/llm_conductor/version.rb +1 -1
- metadata +30 -2
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'csv'
|
|
4
|
+
require_relative 'verdict'
|
|
5
|
+
require_relative 'report'
|
|
6
|
+
|
|
7
|
+
module LlmConductor
|
|
8
|
+
module Eval
|
|
9
|
+
# Pure aggregation: turns a run's per-(input, model) rows into a Report
|
|
10
|
+
# (CSV string + decision-aid markdown + per-model summary + needs-review
|
|
11
|
+
# list). Unlike the Rails prototype's ReportBuilder it writes no files;
|
|
12
|
+
# persistence is the caller's / Store's job.
|
|
13
|
+
#
|
|
14
|
+
# +rows+ is an Array of { model_result: Result, judge_verdict: Verdict }.
|
|
15
|
+
class ReportBuilder
  # CSV columns emitted first for every row, ahead of the per-dimension
  # judge columns.
  BASE_CSV_COLUMNS = %w[
    input_id input_label model vendor status
    latency_ms input_tokens output_tokens total_tokens estimated_cost_usd
    parsed_score parsed_bucket
    judge_quality_score
  ].freeze

  # CSV columns emitted after the per-dimension judge columns and before any
  # extra columns contributed by the rows themselves.
  JUDGE_TAIL_COLUMNS = %w[
    judge_verdict_one_line judge_issues judge_error
    self_judge needs_human_review review_reasons
    raw_output_ref parsed_output_ref error
  ].freeze

  # @param rows [Array<Hash>] one +{ model_result:, judge_verdict: }+ hash per
  #   (input, model) pair; +judge_verdict+ may be nil
  # @param run_id [Object] identifier interpolated into the markdown title
  # @param judge_model [Object] compared with each row's model to set +self_judge+
  # @param spec [#judge_dimensions] supplies the judge dimension keys
  def initialize(rows:, run_id:, judge_model:, spec:)
    @rows = rows
    @run_id = run_id
    @judge_model = judge_model
    @spec = spec
  end

  # Aggregates the rows into a Report. Writes nothing anywhere; the CSV and
  # markdown are returned as strings on the Report.
  #
  # @return [Report]
  def build
    bucket_disagreement = compute_bucket_disagreement
    summary = build_summary
    Report.new(
      rows: @rows,
      summary:,
      needs_review: build_needs_review(bucket_disagreement),
      csv_string: build_csv(bucket_disagreement),
      markdown_string: build_markdown(bucket_disagreement, summary)
    )
  end

  private

  attr_reader :rows, :run_id, :judge_model, :spec

  # One "judge_<key>" header per dimension declared by the spec.
  def judge_dimension_columns
    @judge_dimension_columns ||= spec.judge_dimensions.map { |d| "judge_#{d[:key]}" }
  end

  # Union of the extra-column keys seen on any row, in first-seen order.
  def extra_csv_keys
    @extra_csv_keys ||= rows.flat_map { |r| (r[:model_result].extra_columns || {}).keys }.uniq
  end

  # Full, ordered CSV header list.
  def csv_columns
    BASE_CSV_COLUMNS + judge_dimension_columns + JUDGE_TAIL_COLUMNS + extra_csv_keys
  end

  # Maps input_id => sorted list of distinct buckets when the 'ok' rows for
  # that input produced more than one bucket, or [] when they all agree.
  # Inputs with no 'ok' row carrying a bucket are absent from the result.
  #
  # Distinct buckets are collected with Array#uniq so this file does not
  # depend on Set being loaded.
  def compute_bucket_disagreement
    buckets_by_input = Hash.new { |hash, key| hash[key] = [] }
    rows.each do |row|
      mr = row[:model_result]
      next unless mr.status == 'ok' && mr.parsed_bucket

      buckets_by_input[mr.input_id] << mr.parsed_bucket
    end
    buckets_by_input.transform_values do |buckets|
      distinct = buckets.uniq
      distinct.size > 1 ? distinct.sort : []
    end
  end

  # Renders every row as CSV with a header line.
  def build_csv(bucket_disagreement)
    columns = csv_columns
    CSV.generate(write_headers: true, headers: columns) do |csv|
      rows.each { |row| csv << csv_row(row, bucket_disagreement, columns) }
    end
  end

  # Values for one CSV line, ordered to match +columns+. Columns a row has no
  # value for come out as nil.
  def csv_row(row, bucket_disagreement, columns)
    mr = row[:model_result]
    jv = row[:judge_verdict]
    base = base_csv_values(mr, jv, flag_reasons(mr, jv, bucket_disagreement))
    spec.judge_dimensions.each { |d| base["judge_#{d[:key]}"] = dimension_score(jv, d[:key]) }
    (mr.extra_columns || {}).each { |k, v| base[k] = v }
    columns.map { |c| base[c] }
  end

  # Looks up one judge dimension on a verdict. The exact key is tried first;
  # when that misses on a Hash-like +dimensions+, the String and Symbol forms
  # of the key are tried as well, so a verdict whose nested keys became
  # strings in a JSON round trip still matches a spec that declares symbols.
  def dimension_score(verdict, key)
    dimensions = verdict&.dimensions
    return nil unless dimensions

    exact = dimensions.dig(key)
    return exact unless exact.nil? && dimensions.respond_to?(:key?)

    alternate = [key.to_s, key.to_s.to_sym].find { |candidate| dimensions.key?(candidate) }
    alternate && dimensions[alternate]
  end

  # String-keyed values for the fixed (non-dimension, non-extra) columns.
  def base_csv_values(result, verdict, review_reasons)
    {
      'input_id' => result.input_id, 'input_label' => result.input_label, 'model' => result.model,
      'vendor' => result.vendor, 'status' => result.status, 'latency_ms' => result.latency_ms,
      'input_tokens' => result.input_tokens, 'output_tokens' => result.output_tokens,
      'total_tokens' => result.total_tokens, 'estimated_cost_usd' => result.estimated_cost_usd,
      'parsed_score' => result.parsed_score, 'parsed_bucket' => result.parsed_bucket,
      'judge_quality_score' => verdict&.quality_score, 'judge_verdict_one_line' => verdict&.verdict_one_line,
      'judge_issues' => Array(verdict&.issues).join(' | '), 'judge_error' => verdict&.judge_error,
      'self_judge' => (result.model == judge_model), 'needs_human_review' => review_reasons.any?,
      'review_reasons' => review_reasons.join('; '), 'raw_output_ref' => result.raw_output_ref,
      'parsed_output_ref' => result.parsed_output_ref, 'error' => result.error
    }
  end

  # Rows that have at least one review reason, reduced to their identifying
  # fields plus the reasons.
  def build_needs_review(bucket_disagreement)
    rows.filter_map do |row|
      mr = row[:model_result]
      reasons = flag_reasons(mr, row[:judge_verdict], bucket_disagreement)
      next if reasons.empty?

      { input_id: mr.input_id, input_label: mr.input_label, model: mr.model, reasons: }
    end
  end

  # Reasons a row should be looked at by a human; empty when there are none.
  # Every non-'ok' status that Result documents is flagged, including
  # 'exception', so the CSV flag agrees with the summary's review count for
  # failed rows.
  def flag_reasons(result, verdict, bucket_disagreement)
    reasons = []
    disagree = bucket_disagreement[result.input_id]
    reasons << "buckets_disagree(#{disagree.join(',')})" if disagree&.any?
    reasons << 'judge_borderline' if verdict && Verdict.borderline?(verdict.quality_score)
    reasons << 'parse_failed' if result.status == 'parse_error'
    reasons << 'llm_error' if result.status == 'llm_error'
    reasons << 'exception' if result.status == 'exception'
    reasons
  end

  # Per-model summaries, best mean judge quality first. Models with no judge
  # score sort after every scored model.
  def build_summary
    rows.group_by { |r| r[:model_result].model }
        .map { |model, model_rows| summarize_model(model, model_rows) }
        .sort_by { |s| -(s[:mean_quality] || -1) }
  end

  # Aggregate statistics for the rows of a single model.
  def summarize_model(model, model_rows)
    ok = model_rows.count { |r| r[:model_result].status == 'ok' }
    latencies = model_rows.filter_map { |r| r[:model_result].latency_ms }
    costs = model_rows.filter_map { |r| r[:model_result].estimated_cost_usd }.map(&:to_f)
    qualities = model_rows.filter_map { |r| r[:judge_verdict]&.quality_score }
    {
      model:, parse_ok_pct: pct(ok, model_rows.size), mean_quality: mean(qualities)&.round(1),
      median_latency_ms: percentile(latencies, 50), p95_latency_ms: percentile(latencies, 95),
      mean_cost: costs.empty? ? 0.0 : (costs.sum / costs.size).round(5), total_cost: costs.sum.round(4),
      review_pct: pct(review_count(model_rows), model_rows.size)
    }
  end

  # Number of rows that either did not finish 'ok' or got a borderline judge
  # score. Bucket disagreement is not counted here.
  def review_count(model_rows)
    model_rows.count do |r|
      jv = r[:judge_verdict]
      r[:model_result].status != 'ok' || (jv && Verdict.borderline?(jv.quality_score))
    end
  end

  # Decision-aid markdown: header, per-model table, per-input buckets,
  # disagreement cases and the top picks.
  def build_markdown(bucket_disagreement, summary)
    <<~MD
      # LLM Eval — #{run_id}

      - Inputs in this run: **#{rows.map { |r| r[:model_result].input_id }.uniq.size}**
      - Candidate models: **#{summary.size}**
      - Judge model: `#{judge_model}` (rows where the judged model == judge model are flagged `self_judge`)

      ## Per-model summary

      | Model | Parse OK% | Mean Judge | Median latency | P95 latency | Mean cost | Total cost | % needs review |
      |---|---:|---:|---:|---:|---:|---:|---:|
      #{summary.map { |s| markdown_summary_row(s) }.join("\n")}

      ## Recommendation buckets per input

      #{bucket_table}

      ## Bucket-disagreement cases (model choice matters here)

      #{bucket_disagreement_section(bucket_disagreement)}

      ## Pareto picks

      #{pareto_section(summary)}
    MD
  end

  # One markdown table line for a per-model summary hash.
  def markdown_summary_row(summary)
    "| `#{summary[:model]}` | #{summary[:parse_ok_pct]}% | #{summary[:mean_quality] || 'n/a'} | " \
      "#{summary[:median_latency_ms] || 'n/a'}ms | #{summary[:p95_latency_ms] || 'n/a'}ms | " \
      "$#{format('%.5f', summary[:mean_cost])} | $#{format('%.4f', summary[:total_cost])} | " \
      "#{summary[:review_pct]}% |"
  end

  # Markdown table listing every row, grouped by input.
  def bucket_table
    lines = ['| Input | Model | Bucket | Score | Judge | Status |', '|---|---|---|---:|---:|---|']
    rows.group_by { |r| r[:model_result].input_id }.each do |input_id, input_rows|
      label = input_rows.first[:model_result].input_label
      input_rows.each do |r|
        mr = r[:model_result]
        jv = r[:judge_verdict]
        lines << "| #{input_id} #{label} | `#{mr.model}` | #{mr.parsed_bucket || '-'} | " \
                 "#{mr.parsed_score || '-'} | #{jv&.quality_score || '-'} | #{mr.status} |"
      end
    end
    lines.join("\n")
  end

  # Markdown for the inputs whose models disagreed on the bucket, or a
  # one-line note when there are none.
  def bucket_disagreement_section(bucket_disagreement)
    cases = bucket_disagreement.reject { |_, v| v.empty? }
    return '_None — every input received the same bucket across all models._' if cases.empty?

    cases.flat_map { |input_id, buckets| disagreement_block(input_id, buckets) }.join("\n")
  end

  # Markdown lines (heading + table) for one disagreeing input. Only rows that
  # finished 'ok' are listed.
  def disagreement_block(input_id, buckets)
    input_rows = rows.select { |r| r[:model_result].input_id == input_id && r[:model_result].status == 'ok' }
    return [] if input_rows.empty?

    label = input_rows.first[:model_result].input_label
    lines = ["### #{input_id} — #{label}", '', "Buckets seen: **#{buckets.join(', ')}**", '',
             '| Model | Bucket | Score | Judge |', '|---|---|---:|---:|']
    input_rows.each do |r|
      mr = r[:model_result]
      jv = r[:judge_verdict]
      lines << "| `#{mr.model}` | #{mr.parsed_bucket || '-'} | #{mr.parsed_score || '-'} | #{jv&.quality_score || '-'} |"
    end
    lines << ''
    lines
  end

  # Markdown list of the first three summaries (already sorted by mean judge
  # quality). A missing median latency is shown as 'n/a', as in the summary
  # table.
  def pareto_section(summary)
    return '_No successful runs to rank._' if summary.empty?

    lines = ['Top 3 by mean judge quality:', '']
    summary.first(3).each_with_index do |s, i|
      lines << "#{i + 1}. **`#{s[:model]}`** — judge **#{s[:mean_quality] || 'n/a'}**, " \
               "median **#{s[:median_latency_ms] || 'n/a'}ms**, mean cost **$#{format('%.5f', s[:mean_cost])}**, " \
               "parse OK **#{s[:parse_ok_pct]}%**."
    end
    lines << ''
    lines << '_Rows flagged `needs_human_review=true` are the ones to sanity-check manually._'
    lines.join("\n")
  end

  # Percentage of +num+ over +den+ rounded to one decimal; 0 when +den+ is zero.
  def pct(num, den)
    return 0 if den.zero?

    ((num.to_f / den) * 100).round(1)
  end

  # Arithmetic mean as a Float, or nil for an empty list.
  def mean(values)
    return nil if values.empty?

    values.sum.to_f / values.size
  end

  # Element of the sorted list at the rounded position
  # (target_pct / 100) * (size - 1); nil for an empty list.
  def percentile(values, target_pct)
    return nil if values.empty?

    sorted = values.sort
    sorted[((target_pct / 100.0) * (sorted.size - 1)).round]
  end
end
|
|
257
|
+
end
|
|
258
|
+
end
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LlmConductor
|
|
4
|
+
module Eval
|
|
5
|
+
# Outcome of running ONE (input, model) pair through the engine.
|
|
6
|
+
#
|
|
7
|
+
# Ported from the Rails prototype's ModelRunner::Result struct, with the
|
|
8
|
+
# +record_*+ fields renamed to +input_*+ and the on-disk +*_path+ fields
|
|
9
|
+
# generalized to +*_ref+ (a Store handle — a filesystem path for FileStore,
|
|
10
|
+
# an opaque key for InMemory).
|
|
11
|
+
#
|
|
12
|
+
# +status+ is one of: 'ok', 'parse_error', 'llm_error', 'exception'.
|
|
13
|
+
Result = Struct.new(
  *%i[
    input_id input_label model vendor status latency_ms
    input_tokens output_tokens total_tokens estimated_cost_usd
    parsed_score parsed_bucket extra_columns
    raw_output_ref parsed_output_ref error
  ],
  keyword_init: true
) do
  # True only when the pair finished with status 'ok'.
  def ok? = status == 'ok'

  # Hash form with every member name converted to a String, suitable for
  # writing into the JSON manifest. Values are passed through untouched.
  def to_h
    super.to_h { |member, value| [member.to_s, value] }
  end
end
|
|
29
|
+
end
|
|
30
|
+
end
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'time'
|
|
4
|
+
require_relative 'model_runner'
|
|
5
|
+
require_relative 'judge'
|
|
6
|
+
require_relative 'report_builder'
|
|
7
|
+
require_relative 'result'
|
|
8
|
+
require_relative 'verdict'
|
|
9
|
+
|
|
10
|
+
module LlmConductor
|
|
11
|
+
module Eval
|
|
12
|
+
# Top-level orchestrator. For each input, builds the prompt data once, runs
|
|
13
|
+
# every candidate (input, model) pair through ModelRunner, judges it, and
|
|
14
|
+
# rewrites the manifest after each pair so the run stays resumable /
|
|
15
|
+
# reportable mid-flight.
|
|
16
|
+
#
|
|
17
|
+
# Unlike the Rails prototype it does NO data selection — the caller passes
|
|
18
|
+
# +inputs:+ directly. See LlmConductor::Eval.run for the public entrypoint.
|
|
19
|
+
class Runner
  # @param spec [Object] feature spec; this class calls #input_id and #build_data on it
  # @param inputs [#to_a] caller-selected inputs, materialised once
  # @param models [Array<Hash>] candidates, each read via +[:model]+ and +[:vendor]+
  # @param judge [Hash, nil] optional +{ model:, vendor: }+; see .normalize_judge
  # @param store [Object] persistence backend (see Store::Base)
  # @param logger [#info, #warn]
  # @param run_id [Object] identifier passed to the store and report
  def initialize(spec:, inputs:, models:, judge:, store:, logger:, run_id:)
    @spec = spec
    @inputs = inputs.to_a
    @models = models
    @judge_config = self.class.normalize_judge(judge)
    @store = store
    @logger = logger
    @run_id = run_id
  end

  # Runs every (input, model) pair, stamps the manifest with a finish time,
  # stores it, and returns the built Report.
  def run
    @logger.info("LLM eval run=#{@run_id} models=#{@models.map { |m| m[:model] }.join(',')} " \
                 "judge=#{@judge_config[:model]}")
    warn_self_judge
    manifest = base_manifest
    rows = run_all_pairs(manifest)
    manifest[:finished_at] = Time.now.utc.iso8601
    @store.write_manifest(@run_id, manifest)
    build_report(rows)
  end

  # Rebuild a Report from a stored manifest without recalling models or judge.
  #
  # @raise [ArgumentError] when the store has no manifest for +run_id+
  def self.report_only(run_id:, spec:, store:)
    manifest = load_manifest(store, run_id)
    rows = Array(manifest['rows']).map { |raw| restore_row(raw) }
    ReportBuilder.new(rows:, run_id:, judge_model: manifest['judge_model'], spec:).build
  end

  # Re-run the judge against stored candidate outputs (e.g. after changing
  # the judge model). Fully self-contained: input data is read from the store.
  # The manifest is written back with the new judge model and a
  # +rejudged_at+ timestamp.
  #
  # @raise [ArgumentError] when the store has no manifest for +run_id+
  def self.judge_only(run_id:, spec:, store:, judge:, logger:)
    config = normalize_judge(judge)
    manifest = load_manifest(store, run_id)
    judge_obj = Judge.new(spec:, store:, run_id:, logger:,
                          judge_model: config[:model], judge_vendor: config[:vendor])
    rows = Array(manifest['rows']).map { |raw| rejudge_row(raw, judge_obj, store, run_id) }
    manifest['judge_model'] = config[:model]
    manifest['rejudged_at'] = Time.now.utc.iso8601
    store.write_manifest(run_id, manifest)
    ReportBuilder.new(rows:, run_id:, judge_model: config[:model], spec:).build
  end

  # Fills in the default judge model/vendor for anything the caller left out
  # and coerces the vendor to a Symbol.
  def self.normalize_judge(judge)
    judge ||= {}
    { model: judge[:model] || Judge::DEFAULT_MODEL,
      vendor: (judge[:vendor] || Judge::DEFAULT_VENDOR).to_sym }
  end

  # Rebuilds a Result from its string-keyed manifest form.
  def self.restore_result(raw)
    Result.new(**raw.transform_keys(&:to_sym))
  end

  # Rebuilds a Verdict from its manifest form; nil stays nil. Only the
  # top-level keys are symbolised.
  def self.restore_verdict(raw)
    raw ? Verdict.new(**raw.transform_keys(&:to_sym)) : nil
  end

  # Rebuilds one report row from a serialized manifest row.
  def self.restore_row(raw)
    { model_result: restore_result(raw['model_result']),
      judge_verdict: restore_verdict(raw['judge_verdict']) }
  end

  # Judges one stored row again and records the new verdict on +raw+ in place,
  # so the caller's manifest picks it up.
  def self.rejudge_row(raw, judge_obj, store, run_id)
    result = restore_result(raw['model_result'])
    input_data = store.read_input_data(run_id, result.input_id)
    verdict = judge_obj.judge(model_result: result, input_data:)
    raw['judge_verdict'] = verdict&.to_h
    { model_result: result, judge_verdict: verdict }
  end

  # Reads the manifest for +run_id+ and returns a copy whose top-level keys
  # are Strings. #run builds the manifest with Symbol keys, so a store that
  # hands back the hash it was given (instead of a JSON round trip) would
  # otherwise make the string-keyed lookups in .report_only / .judge_only
  # miss.
  def self.load_manifest(store, run_id)
    manifest = store.read_manifest(run_id) || raise(ArgumentError, "No manifest for run_id=#{run_id}")
    manifest.transform_keys(&:to_s)
  end
  private_class_method :load_manifest

  private

  # Runs all candidates for each input in turn. The input's prompt data is
  # built and stored once per input, and the manifest is rewritten after
  # every pair.
  def run_all_pairs(manifest)
    rows = []
    @inputs.each_with_index do |input, idx|
      input_id = @spec.input_id(input)
      data = @spec.build_data(input)
      @store.write_input_data(@run_id, input_id, data)
      @models.each do |cand|
        row = run_pair(input, data, cand)
        rows << row
        manifest[:rows] << serialize_row(row)
        @store.write_manifest(@run_id, manifest)
        log_pair(idx, cand, row)
      end
    end
    rows
  end

  # Runs one candidate on one input, then judges the result.
  def run_pair(input, data, cand)
    result = ModelRunner.new(input, model: cand[:model], vendor: cand[:vendor], spec: @spec,
                                    store: @store, run_id: @run_id, logger: @logger, data:).run
    verdict = build_judge.judge(model_result: result, input_data: data)
    { model_result: result, judge_verdict: verdict }
  end

  # A Judge configured with this run's judge model and vendor.
  def build_judge
    Judge.new(spec: @spec, store: @store, run_id: @run_id, logger: @logger,
              judge_model: @judge_config[:model], judge_vendor: @judge_config[:vendor])
  end

  # Empty manifest for a fresh run (Symbol-keyed; rows are appended by
  # #run_all_pairs).
  def base_manifest
    { run_id: @run_id, started_at: Time.now.utc.iso8601, judge_model: @judge_config[:model],
      models: @models, rows: [] }
  end

  # String-keyed manifest form of one row.
  def serialize_row(row)
    { 'model_result' => row[:model_result].to_h, 'judge_verdict' => row[:judge_verdict]&.to_h }
  end

  def build_report(rows)
    ReportBuilder.new(rows:, run_id: @run_id, judge_model: @judge_config[:model], spec: @spec).build
  end

  # Logs a warning when the judge model is also one of the candidates.
  def warn_self_judge
    return unless @models.any? { |m| m[:model] == @judge_config[:model] }

    @logger.warn("[Eval] judge model #{@judge_config[:model]} also appears in candidates — " \
                 'those rows will be flagged self_judge=true and should be discounted when ranking.')
  end

  # Logs the outcome of one pair; the counter is the input's position.
  def log_pair(idx, cand, row)
    result = row[:model_result]
    verdict = row[:judge_verdict]
    @logger.info("  [#{idx + 1}/#{@inputs.size}] #{cand[:model]} -> status=#{result.status} " \
                 "latency=#{result.latency_ms}ms judge=#{verdict&.quality_score}")
  end
end
|
|
147
|
+
end
|
|
148
|
+
end
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'json_parser'
|
|
4
|
+
|
|
5
|
+
module LlmConductor
|
|
6
|
+
module Eval
|
|
7
|
+
# The public extension seam. Subclass (or duck-type) this to describe how to
|
|
8
|
+
# evaluate one LLM-powered feature: how to turn a caller-supplied input into
|
|
9
|
+
# a prompt payload, how to parse the output, and what the judge should grade.
|
|
10
|
+
#
|
|
11
|
+
# The engine itself is generic and feature-agnostic; everything
|
|
12
|
+
# feature-specific lives here. Unlike the Rails prototype's Feature::Base,
|
|
13
|
+
# there is no +select_cases+ — selecting which inputs to evaluate is the
|
|
14
|
+
# caller's job, done before calling LlmConductor::Eval.run and passed via
|
|
15
|
+
# +inputs:+. The engine never queries a database.
|
|
16
|
+
class Spec
  # Symbol passed to LlmConductor.generate as +type:+ (must match a
  # registered prompt). Return nil if instead you build a full prompt
  # string in #build_data, in which case the engine passes it as +prompt:+.
  #
  # @raise [NotImplementedError] unless overridden
  def prompt_type
    raise NotImplementedError, "#{self.class}##{__method__} must be implemented by the spec"
  end

  # Stable id for an input (was record.id). Used for output grouping/paths.
  #
  # @raise [NotImplementedError] unless overridden
  def input_id(_input)
    raise NotImplementedError, "#{self.class}##{__method__} must be implemented by the spec"
  end

  # Human label for an input (was record.name). Defaults to the id as a String.
  def input_label(input)
    input_id(input).to_s
  end

  # Build the prompt payload for one input. When #prompt_type is set this is
  # passed as +data:+; otherwise it must be a full prompt String passed as
  # +prompt:+ (was build_data(record)).
  #
  # @raise [NotImplementedError] unless overridden
  def build_data(_input)
    raise NotImplementedError, "#{self.class}##{__method__} must be implemented by the spec"
  end

  # Parse the LLM's raw text into a Hash, or nil on failure. Defaults to the
  # gem's conservative JsonParser; override for tuned/feature-specific parsing.
  def parse(raw)
    JsonParser.parse(raw)
  end

  # Vendor-specific generation params (e.g. a deterministic Ollama seed).
  # Return {} for vendors that don't expose one.
  # rubocop:disable Lint/UnusedMethodArgument
  def vendor_params(vendor:, input_id:)
    {}
  end
  # rubocop:enable Lint/UnusedMethodArgument

  # { score: Numeric|nil, bucket: String|nil } — powers CSV columns and the
  # bucket-disagreement detection. +bucket+ may be any discrete label.
  #
  # @raise [NotImplementedError] unless overridden
  def output_summary(_parsed)
    raise NotImplementedError, "#{self.class}##{__method__} must be implemented by the spec"
  end

  # Text inlined into the judge prompt describing the rubric the candidate
  # was asked to follow.
  #
  # @raise [NotImplementedError] unless overridden
  def judge_rubric_excerpt
    raise NotImplementedError, "#{self.class}##{__method__} must be implemented by the spec"
  end

  # [{ key:, description: }] — dimensions the judge scores 0-100 each.
  #
  # @raise [NotImplementedError] unless overridden
  def judge_dimensions
    raise NotImplementedError, "#{self.class}##{__method__} must be implemented by the spec"
  end

  # Extra per-row CSV columns beyond the base set. Keys become headers.
  def extra_columns(_parsed)
    {}
  end
end
|
|
77
|
+
end
|
|
78
|
+
end
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LlmConductor
|
|
4
|
+
module Eval
|
|
5
|
+
module Store
|
|
6
|
+
# Pluggable persistence interface for an eval run. Replaces the prototype's
|
|
7
|
+
# hard-coded Rails.root.join('tmp', ...) + File.read/write calls.
|
|
8
|
+
#
|
|
9
|
+
# Two implementations ship with the gem: InMemory (default; nothing hits
|
|
10
|
+
# disk) and FileStore (resumable, reproduces the prototype's tmp/<run_id>/
|
|
11
|
+
# layout). Implement this interface to persist anywhere else.
|
|
12
|
+
#
|
|
13
|
+
# Write methods return an opaque "ref" (a filesystem path for FileStore, a
|
|
14
|
+
# key for InMemory) recorded on the Result for the report's path columns.
|
|
15
|
+
class Base
  # Persist a candidate's raw text output.
  #
  # @raise [NotImplementedError] unless overridden
  def write_raw(_run_id, _input_id, _model_slug, _text)
    raise NotImplementedError, "#{self.class}##{__method__} is not implemented"
  end

  # Read back a candidate's raw text output.
  #
  # @raise [NotImplementedError] unless overridden
  def read_raw(_run_id, _input_id, _model_slug)
    raise NotImplementedError, "#{self.class}##{__method__} is not implemented"
  end

  # Persist a candidate's parsed output.
  #
  # @raise [NotImplementedError] unless overridden
  def write_parsed(_run_id, _input_id, _model_slug, _hash)
    raise NotImplementedError, "#{self.class}##{__method__} is not implemented"
  end

  # Returns the parsed Hash/Array (not the ref), or nil if absent.
  #
  # @raise [NotImplementedError] unless overridden
  def read_parsed(_run_id, _input_id, _model_slug)
    raise NotImplementedError, "#{self.class}##{__method__} is not implemented"
  end

  # Persist the prompt data built for one input.
  #
  # @raise [NotImplementedError] unless overridden
  def write_input_data(_run_id, _input_id, _hash)
    raise NotImplementedError, "#{self.class}##{__method__} is not implemented"
  end

  # Enables self-contained re-judge / report without the original inputs.
  #
  # @raise [NotImplementedError] unless overridden
  def read_input_data(_run_id, _input_id)
    raise NotImplementedError, "#{self.class}##{__method__} is not implemented"
  end

  # Persist the run manifest.
  #
  # @raise [NotImplementedError] unless overridden
  def write_manifest(_run_id, _manifest_hash)
    raise NotImplementedError, "#{self.class}##{__method__} is not implemented"
  end

  # Read back the run manifest.
  #
  # @raise [NotImplementedError] unless overridden
  def read_manifest(_run_id)
    raise NotImplementedError, "#{self.class}##{__method__} is not implemented"
  end

  # True when this (input, model) pair already has stored output — lets a
  # future restart skip already-completed pairs.
  #
  # @raise [NotImplementedError] unless overridden
  def completed?(_run_id, _input_id, _model_slug)
    raise NotImplementedError, "#{self.class}##{__method__} is not implemented"
  end
end
|
|
56
|
+
end
|
|
57
|
+
end
|
|
58
|
+
end
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
require 'fileutils'
|
|
5
|
+
require_relative 'base'
|
|
6
|
+
|
|
7
|
+
module LlmConductor
|
|
8
|
+
module Eval
|
|
9
|
+
module Store
|
|
10
|
+
# Resumable on-disk store. Reproduces the Rails prototype's layout:
|
|
11
|
+
#
|
|
12
|
+
# <base_dir>/<run_id>/manifest.json
|
|
13
|
+
# <base_dir>/<run_id>/<input_id>/_input.json
|
|
14
|
+
# <base_dir>/<run_id>/<input_id>/<model_slug>.raw.txt
|
|
15
|
+
# <base_dir>/<run_id>/<input_id>/<model_slug>.json
|
|
16
|
+
#
|
|
17
|
+
# The manifest is rewritten after every (input, model) pair, so a run is
|
|
18
|
+
# reportable / re-judgeable mid-flight (see Runner.report_only/judge_only).
|
|
19
|
+
class FileStore < Base
  attr_reader :base_dir

  # @param base_dir [#to_s] directory under which each run gets its own
  #   <run_id>/ subdirectory
  def initialize(base_dir)
    super()
    @base_dir = base_dir.to_s
  end

  # Writes +text+ (coerced with #to_s) to <model_slug>.raw.txt.
  #
  # @return [String] path of the written file
  def write_raw(run_id, input_id, model_slug, text)
    write_file(output_path(run_id, input_id, "#{model_slug}.raw.txt"), text.to_s)
  end

  # @return [String, nil] contents of <model_slug>.raw.txt, or nil when absent
  def read_raw(run_id, input_id, model_slug)
    read_file(output_path(run_id, input_id, "#{model_slug}.raw.txt"))
  end

  # Writes +hash+ as pretty-printed JSON to <model_slug>.json.
  #
  # @return [String] path of the written file
  def write_parsed(run_id, input_id, model_slug, hash)
    write_file(output_path(run_id, input_id, "#{model_slug}.json"), JSON.pretty_generate(hash))
  end

  # @return [Object, nil] parsed JSON from <model_slug>.json, or nil when the
  #   file is absent or not valid JSON
  def read_parsed(run_id, input_id, model_slug)
    read_json(output_path(run_id, input_id, "#{model_slug}.json"))
  end

  # Writes the input's prompt data as pretty-printed JSON to _input.json.
  #
  # @return [String] path of the written file
  def write_input_data(run_id, input_id, hash)
    write_file(output_path(run_id, input_id, '_input.json'), JSON.pretty_generate(hash))
  end

  # @return [Object, nil] parsed JSON from _input.json, or nil when the file
  #   is absent or not valid JSON
  def read_input_data(run_id, input_id)
    read_json(output_path(run_id, input_id, '_input.json'))
  end

  # Writes the manifest as pretty-printed JSON to <run_id>/manifest.json.
  #
  # @return [String] path of the written file
  def write_manifest(run_id, manifest_hash)
    write_file(manifest_path(run_id), JSON.pretty_generate(manifest_hash))
  end

  # @return [Object, nil] parsed manifest (String keys, as produced by
  #   JSON.parse), or nil when the file is absent or not valid JSON
  def read_manifest(run_id)
    read_json(manifest_path(run_id))
  end

  # True when either the parsed or the raw output file exists for the pair.
  def completed?(run_id, input_id, model_slug)
    File.exist?(output_path(run_id, input_id, "#{model_slug}.json")) ||
      File.exist?(output_path(run_id, input_id, "#{model_slug}.raw.txt"))
  end

  private

  # <base_dir>/<run_id>/<input_id>/<name>
  # NOTE(review): the ids are joined into the path as given; this looks like
  # it assumes they contain no path separators or '..' — confirm with callers.
  def output_path(run_id, input_id, name)
    File.join(@base_dir, run_id.to_s, input_id.to_s, name)
  end

  # <base_dir>/<run_id>/manifest.json
  def manifest_path(run_id)
    File.join(@base_dir, run_id.to_s, 'manifest.json')
  end

  # Creates the parent directory if needed and writes +content+ to +path+.
  # The content goes to a sibling temp file first and is then renamed over
  # +path+. This matters for the manifest, which is rewritten repeatedly: an
  # interrupted write no longer leaves a truncated file that #read_json
  # would report as nil.
  #
  # @return [String] +path+
  def write_file(path, content)
    FileUtils.mkdir_p(File.dirname(path))
    staging = "#{path}.#{Process.pid}.tmp"
    File.write(staging, content)
    File.rename(staging, path)
    path
  ensure
    # Only reached with the temp file still present when the write or the
    # rename failed; remove it so no stray temp file is left behind.
    File.delete(staging) if staging && File.exist?(staging)
  end

  # Contents of +path+, or nil when it does not exist.
  def read_file(path)
    File.exist?(path) ? File.read(path) : nil
  end

  # Parsed JSON from +path+; nil when the file does not exist or cannot be
  # parsed.
  def read_json(path)
    return nil unless File.exist?(path)

    JSON.parse(File.read(path))
  rescue JSON::ParserError
    nil
  end
end
|
|
92
|
+
end
|
|
93
|
+
end
|
|
94
|
+
end
|