llm_conductor 1.7.1 → 1.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,258 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'csv'
4
+ require_relative 'verdict'
5
+ require_relative 'report'
6
+
7
+ module LlmConductor
8
+ module Eval
9
# Pure aggregation: turns a run's per-(input, model) rows into a Report
# (CSV string + decision-aid markdown + per-model summary + needs-review
# list). Unlike the Rails prototype's ReportBuilder it writes no files;
# persistence is the caller's / Store's job.
#
# +rows+ is an Array of { model_result: Result, judge_verdict: Verdict };
# +judge_verdict+ may be nil, in which case the row is treated as unjudged.
class ReportBuilder
  BASE_CSV_COLUMNS = %w[
    input_id input_label model vendor status
    latency_ms input_tokens output_tokens total_tokens estimated_cost_usd
    parsed_score parsed_bucket
    judge_quality_score
  ].freeze

  JUDGE_TAIL_COLUMNS = %w[
    judge_verdict_one_line judge_issues judge_error
    self_judge needs_human_review review_reasons
    raw_output_ref parsed_output_ref error
  ].freeze

  # Non-ok Result statuses mapped to the review reason recorded for the row.
  # Every failure status is listed so the per-row flag agrees with
  # #review_count, which treats any non-ok status as needing review.
  STATUS_REVIEW_REASONS = {
    'parse_error' => 'parse_failed',
    'llm_error' => 'llm_error',
    'exception' => 'exception'
  }.freeze

  # rows        - Array of { model_result:, judge_verdict: } hashes.
  # run_id      - identifier printed in the markdown title.
  # judge_model - model name of the judge; used for the self_judge column.
  # spec        - object responding to #judge_dimensions ([{ key: ... }]).
  def initialize(rows:, run_id:, judge_model:, spec:)
    @rows = rows
    @run_id = run_id
    @judge_model = judge_model
    @spec = spec
  end

  # Assembles the Report. Bucket disagreement and the summary are computed
  # once and shared by the CSV, markdown and needs-review sections.
  def build
    bucket_disagreement = compute_bucket_disagreement
    summary = build_summary
    Report.new(
      rows: @rows,
      summary:,
      needs_review: build_needs_review(bucket_disagreement),
      csv_string: build_csv(bucket_disagreement),
      markdown_string: build_markdown(bucket_disagreement, summary)
    )
  end

  private

  attr_reader :rows, :run_id, :judge_model, :spec

  # One "judge_<key>" CSV column per judge dimension declared by the spec.
  def judge_dimension_columns
    @judge_dimension_columns ||= spec.judge_dimensions.map { |d| "judge_#{d[:key]}" }
  end

  # Union of every row's extra column keys, in first-seen order.
  def extra_csv_keys
    @extra_csv_keys ||= rows.flat_map { |r| (r[:model_result].extra_columns || {}).keys }.uniq
  end

  def csv_columns
    BASE_CSV_COLUMNS + judge_dimension_columns + JUDGE_TAIL_COLUMNS + extra_csv_keys
  end

  # Maps input_id => sorted list of distinct buckets when the ok rows for
  # that input disagree, or [] when they agree. Inputs without any ok row
  # carrying a bucket are absent from the result.
  #
  # Distinct buckets are collected with Array#uniq rather than Set so the
  # method does not depend on 'set' being loaded.
  def compute_bucket_disagreement
    buckets_by_input = Hash.new { |h, k| h[k] = [] }
    rows.each do |row|
      mr = row[:model_result]
      next unless mr.status == 'ok' && mr.parsed_bucket

      buckets_by_input[mr.input_id] << mr.parsed_bucket
    end
    buckets_by_input.transform_values do |buckets|
      distinct = buckets.uniq
      distinct.size > 1 ? distinct.sort : []
    end
  end

  # Renders all rows as a CSV string with a header line.
  def build_csv(bucket_disagreement)
    columns = csv_columns
    CSV.generate(write_headers: true, headers: columns) do |csv|
      rows.each { |row| csv << csv_row(row, bucket_disagreement, columns) }
    end
  end

  # Values for one CSV line, ordered like +columns+. Extra columns are
  # merged last, so an extra key equal to a base header overrides it.
  def csv_row(row, bucket_disagreement, columns)
    mr = row[:model_result]
    jv = row[:judge_verdict]
    base = base_csv_values(mr, jv, flag_reasons(mr, jv, bucket_disagreement))
    spec.judge_dimensions.each { |d| base["judge_#{d[:key]}"] = jv&.dimensions&.dig(d[:key]) }
    (mr.extra_columns || {}).each { |k, v| base[k] = v }
    columns.map { |c| base[c] }
  end

  def base_csv_values(result, verdict, review_reasons)
    {
      'input_id' => result.input_id, 'input_label' => result.input_label, 'model' => result.model,
      'vendor' => result.vendor, 'status' => result.status, 'latency_ms' => result.latency_ms,
      'input_tokens' => result.input_tokens, 'output_tokens' => result.output_tokens,
      'total_tokens' => result.total_tokens, 'estimated_cost_usd' => result.estimated_cost_usd,
      'parsed_score' => result.parsed_score, 'parsed_bucket' => result.parsed_bucket,
      'judge_quality_score' => verdict&.quality_score, 'judge_verdict_one_line' => verdict&.verdict_one_line,
      'judge_issues' => Array(verdict&.issues).join(' | '), 'judge_error' => verdict&.judge_error,
      'self_judge' => (result.model == judge_model), 'needs_human_review' => review_reasons.any?,
      'review_reasons' => review_reasons.join('; '), 'raw_output_ref' => result.raw_output_ref,
      'parsed_output_ref' => result.parsed_output_ref, 'error' => result.error
    }
  end

  # Rows that carry at least one review reason, as
  # { input_id:, input_label:, model:, reasons: } hashes.
  def build_needs_review(bucket_disagreement)
    rows.filter_map do |row|
      mr = row[:model_result]
      reasons = flag_reasons(mr, row[:judge_verdict], bucket_disagreement)
      next if reasons.empty?

      { input_id: mr.input_id, input_label: mr.input_label, model: mr.model, reasons: }
    end
  end

  # Reasons a row should be checked by a human: the models disagree on the
  # bucket for this input, the judge score is borderline, or the candidate
  # did not finish with status 'ok'.
  def flag_reasons(result, verdict, bucket_disagreement)
    reasons = []
    disagree = bucket_disagreement[result.input_id]
    reasons << "buckets_disagree(#{disagree.join(',')})" if disagree&.any?
    reasons << 'judge_borderline' if verdict && Verdict.borderline?(verdict.quality_score)
    status_reason = STATUS_REVIEW_REASONS[result.status]
    reasons << status_reason if status_reason
    reasons
  end

  # Per-model summaries, best mean judge quality first; models without any
  # judged row sort last.
  def build_summary
    rows.group_by { |r| r[:model_result].model }
        .map { |model, model_rows| summarize_model(model, model_rows) }
        .sort_by { |s| -(s[:mean_quality] || -1) }
  end

  def summarize_model(model, model_rows)
    ok = model_rows.count { |r| r[:model_result].status == 'ok' }
    latencies = model_rows.filter_map { |r| r[:model_result].latency_ms }
    costs = model_rows.filter_map { |r| r[:model_result].estimated_cost_usd }.map(&:to_f)
    qualities = model_rows.filter_map { |r| r[:judge_verdict]&.quality_score }
    {
      model:, parse_ok_pct: pct(ok, model_rows.size), mean_quality: mean(qualities)&.round(1),
      median_latency_ms: percentile(latencies, 50), p95_latency_ms: percentile(latencies, 95),
      mean_cost: costs.empty? ? 0.0 : (costs.sum / costs.size).round(5), total_cost: costs.sum.round(4),
      review_pct: pct(review_count(model_rows), model_rows.size)
    }
  end

  # Rows that failed or got a borderline judge score. Bucket disagreement
  # is not counted here; it is only reflected in the per-row flags.
  def review_count(model_rows)
    model_rows.count do |r|
      jv = r[:judge_verdict]
      r[:model_result].status != 'ok' || (jv && Verdict.borderline?(jv.quality_score))
    end
  end

  def build_markdown(bucket_disagreement, summary)
    <<~MD
      # LLM Eval — #{run_id}

      - Inputs in this run: **#{rows.map { |r| r[:model_result].input_id }.uniq.size}**
      - Candidate models: **#{summary.size}**
      - Judge model: `#{judge_model}` (rows where the judged model == judge model are flagged `self_judge`)

      ## Per-model summary

      | Model | Parse OK% | Mean Judge | Median latency | P95 latency | Mean cost | Total cost | % needs review |
      |---|---:|---:|---:|---:|---:|---:|---:|
      #{summary.map { |s| markdown_summary_row(s) }.join("\n")}

      ## Recommendation buckets per input

      #{bucket_table}

      ## Bucket-disagreement cases (model choice matters here)

      #{bucket_disagreement_section(bucket_disagreement)}

      ## Pareto picks

      #{pareto_section(summary)}
    MD
  end

  def markdown_summary_row(summary)
    "| `#{summary[:model]}` | #{summary[:parse_ok_pct]}% | #{summary[:mean_quality] || 'n/a'} | " \
      "#{summary[:median_latency_ms] || 'n/a'}ms | #{summary[:p95_latency_ms] || 'n/a'}ms | " \
      "$#{format('%.5f', summary[:mean_cost])} | $#{format('%.4f', summary[:total_cost])} | " \
      "#{summary[:review_pct]}% |"
  end

  # Markdown table with one line per (input, model) row, grouped by input.
  def bucket_table
    lines = ['| Input | Model | Bucket | Score | Judge | Status |', '|---|---|---|---:|---:|---|']
    rows.group_by { |r| r[:model_result].input_id }.each do |input_id, input_rows|
      label = input_rows.first[:model_result].input_label
      input_rows.each do |r|
        mr = r[:model_result]
        jv = r[:judge_verdict]
        lines << "| #{input_id} #{label} | `#{mr.model}` | #{mr.parsed_bucket || '-'} | " \
                 "#{mr.parsed_score || '-'} | #{jv&.quality_score || '-'} | #{mr.status} |"
      end
    end
    lines.join("\n")
  end

  def bucket_disagreement_section(bucket_disagreement)
    cases = bucket_disagreement.reject { |_, v| v.empty? }
    return '_None — every input received the same bucket across all models._' if cases.empty?

    cases.flat_map { |input_id, buckets| disagreement_block(input_id, buckets) }.join("\n")
  end

  # Markdown lines detailing the ok rows of one input whose buckets differ.
  def disagreement_block(input_id, buckets)
    input_rows = rows.select { |r| r[:model_result].input_id == input_id && r[:model_result].status == 'ok' }
    return [] if input_rows.empty?

    label = input_rows.first[:model_result].input_label
    lines = ["### #{input_id} — #{label}", '', "Buckets seen: **#{buckets.join(', ')}**", '',
             '| Model | Bucket | Score | Judge |', '|---|---|---:|---:|']
    input_rows.each do |r|
      mr = r[:model_result]
      jv = r[:judge_verdict]
      lines << "| `#{mr.model}` | #{mr.parsed_bucket || '-'} | #{mr.parsed_score || '-'} | #{jv&.quality_score || '-'} |"
    end
    lines << ''
    lines
  end

  # Top three models by mean judge quality (summary is already sorted).
  def pareto_section(summary)
    return '_No successful runs to rank._' if summary.empty?

    lines = ['Top 3 by mean judge quality:', '']
    summary.first(3).each_with_index do |s, i|
      # Same 'n/a' fallback as the summary table when a model has no latencies.
      lines << "#{i + 1}. **`#{s[:model]}`** — judge **#{s[:mean_quality] || 'n/a'}**, " \
               "median **#{s[:median_latency_ms] || 'n/a'}ms**, mean cost **$#{format('%.5f', s[:mean_cost])}**, " \
               "parse OK **#{s[:parse_ok_pct]}%**."
    end
    lines << ''
    lines << '_Rows flagged `needs_human_review=true` are the ones to sanity-check manually._'
    lines.join("\n")
  end

  # Percentage of num in den, rounded to one decimal; 0 for an empty denominator.
  def pct(num, den)
    return 0 if den.zero?

    ((num.to_f / den) * 100).round(1)
  end

  # Arithmetic mean as a Float, or nil for an empty list.
  def mean(values)
    return nil if values.empty?

    values.sum.to_f / values.size
  end

  # Picks the element at the rounded rank target_pct% of the way through
  # the sorted values (no interpolation); nil for an empty list.
  def percentile(values, target_pct)
    return nil if values.empty?

    sorted = values.sort
    sorted[((target_pct / 100.0) * (sorted.size - 1)).round]
  end
end
257
+ end
258
+ end
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
+ module LlmConductor
4
+ module Eval
5
# Outcome of running ONE (input, model) pair through the engine.
#
# Ported from the Rails prototype's ModelRunner::Result struct: the
# +record_*+ fields became +input_*+, and the on-disk +*_path+ fields became
# +*_ref+ (a Store handle — a filesystem path for FileStore, an opaque key
# for InMemory).
#
# +status+ is one of: 'ok', 'parse_error', 'llm_error', 'exception'.
Result = Struct.new(
  *%i[
    input_id input_label model vendor status latency_ms
    input_tokens output_tokens total_tokens estimated_cost_usd
    parsed_score parsed_bucket extra_columns
    raw_output_ref parsed_output_ref error
  ],
  keyword_init: true
) do
  # Same members as Struct#to_h, but keyed by String so the hash can be
  # persisted in the JSON manifest.
  def to_h
    super.to_h { |member, value| [member.to_s, value] }
  end

  # True when the pair completed with status 'ok'.
  def ok?
    status == 'ok'
  end
end
29
+ end
30
+ end
@@ -0,0 +1,148 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'time'
4
+ require_relative 'model_runner'
5
+ require_relative 'judge'
6
+ require_relative 'report_builder'
7
+ require_relative 'result'
8
+ require_relative 'verdict'
9
+
10
+ module LlmConductor
11
+ module Eval
12
# Top-level orchestrator. For each input, builds the prompt data once, runs
# every candidate (input, model) pair through ModelRunner, judges it, and
# rewrites the manifest after each pair so the run stays resumable /
# reportable mid-flight.
#
# Unlike the Rails prototype it does NO data selection — the caller passes
# +inputs:+ directly. See LlmConductor::Eval.run for the public entrypoint.
class Runner
  # spec    - Spec-like object (input_id, build_data, ...).
  # inputs  - enumerable of caller-supplied inputs; materialized with #to_a.
  # models  - Array of { model:, vendor: } candidate hashes.
  # judge   - { model:, vendor: } hash or nil (defaults come from Judge).
  # store   - Store used for inputs, outputs and the manifest.
  # logger  - object responding to #info / #warn.
  # run_id  - identifier under which everything is stored.
  def initialize(spec:, inputs:, models:, judge:, store:, logger:, run_id:)
    @spec = spec
    @inputs = inputs.to_a
    @models = models
    @judge_config = self.class.normalize_judge(judge)
    @store = store
    @logger = logger
    @run_id = run_id
  end

  # Runs every (input, model) pair, persists the final manifest and returns
  # the Report built from the collected rows.
  def run
    @logger.info("LLM eval run=#{@run_id} models=#{@models.map { |m| m[:model] }.join(',')} " \
                 "judge=#{@judge_config[:model]}")
    warn_self_judge
    manifest = base_manifest
    rows = run_all_pairs(manifest)
    manifest[:finished_at] = Time.now.utc.iso8601
    @store.write_manifest(@run_id, manifest)
    build_report(rows)
  end

  # Rebuild a Report from a stored manifest without recalling models or judge.
  # Raises ArgumentError when the store has no manifest for +run_id+.
  def self.report_only(run_id:, spec:, store:)
    manifest = load_manifest(store, run_id)
    rows = manifest['rows'].map { |raw| restore_row(raw) }
    ReportBuilder.new(rows:, run_id:, judge_model: manifest['judge_model'], spec:).build
  end

  # Re-run the judge against stored candidate outputs (e.g. after changing
  # the judge model). Fully self-contained: input data is read from the store.
  # Raises ArgumentError when the store has no manifest for +run_id+.
  def self.judge_only(run_id:, spec:, store:, judge:, logger:)
    config = normalize_judge(judge)
    manifest = load_manifest(store, run_id)
    judge_obj = Judge.new(spec:, store:, run_id:, logger:,
                          judge_model: config[:model], judge_vendor: config[:vendor])
    # rejudge_row updates each raw row in place, so the manifest written
    # below carries the new verdicts.
    rows = manifest['rows'].map { |raw| rejudge_row(raw, judge_obj, store, run_id) }
    manifest['judge_model'] = config[:model]
    manifest['rejudged_at'] = Time.now.utc.iso8601
    store.write_manifest(run_id, manifest)
    ReportBuilder.new(rows:, run_id:, judge_model: config[:model], spec:).build
  end

  # Fills in the default judge model / vendor and symbolizes the vendor.
  def self.normalize_judge(judge)
    judge ||= {}
    { model: judge[:model] || Judge::DEFAULT_MODEL,
      vendor: (judge[:vendor] || Judge::DEFAULT_VENDOR).to_sym }
  end

  def self.restore_result(raw)
    Result.new(**raw.transform_keys(&:to_sym))
  end

  def self.restore_verdict(raw)
    raw ? Verdict.new(**raw.transform_keys(&:to_sym)) : nil
  end

  def self.restore_row(raw)
    { model_result: restore_result(raw['model_result']),
      judge_verdict: restore_verdict(raw['judge_verdict']) }
  end

  def self.rejudge_row(raw, judge_obj, store, run_id)
    result = restore_result(raw['model_result'])
    input_data = store.read_input_data(run_id, result.input_id)
    verdict = judge_obj.judge(model_result: result, input_data:)
    raw['judge_verdict'] = verdict&.to_h
    { model_result: result, judge_verdict: verdict }
  end

  # Reads the manifest and normalizes its top-level keys to Strings.
  # #run writes the manifest with Symbol keys, so a store that hands the
  # hash back without a JSON round trip would otherwise return nil for
  # manifest['rows']. For JSON-backed stores this is a no-op.
  def self.load_manifest(store, run_id)
    manifest = store.read_manifest(run_id)
    raise ArgumentError, "No manifest for run_id=#{run_id}" unless manifest

    manifest.transform_keys(&:to_s)
  end
  private_class_method :load_manifest

  private

  # Runs all candidates for each input, rewriting the manifest after every
  # pair so a partially finished run can still be reported.
  def run_all_pairs(manifest)
    rows = []
    @inputs.each_with_index do |input, idx|
      input_id = @spec.input_id(input)
      data = @spec.build_data(input)
      @store.write_input_data(@run_id, input_id, data)
      @models.each do |cand|
        row = run_pair(input, data, cand)
        rows << row
        manifest[:rows] << serialize_row(row)
        @store.write_manifest(@run_id, manifest)
        log_pair(idx, cand, row)
      end
    end
    rows
  end

  def run_pair(input, data, cand)
    result = ModelRunner.new(input, model: cand[:model], vendor: cand[:vendor], spec: @spec,
                                    store: @store, run_id: @run_id, logger: @logger, data:).run
    verdict = build_judge.judge(model_result: result, input_data: data)
    { model_result: result, judge_verdict: verdict }
  end

  # The judge's configuration never changes during a run, so one instance
  # is shared by all pairs (judge_only reuses a single Judge the same way).
  def build_judge
    @build_judge ||= Judge.new(spec: @spec, store: @store, run_id: @run_id, logger: @logger,
                               judge_model: @judge_config[:model], judge_vendor: @judge_config[:vendor])
  end

  def base_manifest
    { run_id: @run_id, started_at: Time.now.utc.iso8601, judge_model: @judge_config[:model],
      models: @models, rows: [] }
  end

  def serialize_row(row)
    { 'model_result' => row[:model_result].to_h, 'judge_verdict' => row[:judge_verdict]&.to_h }
  end

  def build_report(rows)
    ReportBuilder.new(rows:, run_id: @run_id, judge_model: @judge_config[:model], spec: @spec).build
  end

  # Warns when the judge model is also one of the candidates.
  def warn_self_judge
    return unless @models.any? { |m| m[:model] == @judge_config[:model] }

    @logger.warn("[Eval] judge model #{@judge_config[:model]} also appears in candidates — " \
                 'those rows will be flagged self_judge=true and should be discounted when ranking.')
  end

  def log_pair(idx, cand, row)
    result = row[:model_result]
    verdict = row[:judge_verdict]
    @logger.info(" [#{idx + 1}/#{@inputs.size}] #{cand[:model]} -> status=#{result.status} " \
                 "latency=#{result.latency_ms}ms judge=#{verdict&.quality_score}")
  end
end
147
+ end
148
+ end
@@ -0,0 +1,78 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'json_parser'
4
+
5
+ module LlmConductor
6
+ module Eval
7
# The public extension seam. Subclass (or duck-type) this to describe how to
# evaluate one LLM-powered feature: how a caller-supplied input becomes a
# prompt payload, how the output is parsed, and what the judge should grade.
#
# The engine itself is generic and feature-agnostic; everything
# feature-specific lives here. Unlike the Rails prototype's Feature::Base,
# there is no +select_cases+ — choosing which inputs to evaluate is the
# caller's job, done before calling LlmConductor::Eval.run and passed via
# +inputs:+. The engine never queries a database.
class Spec
  # Symbol passed to LlmConductor.generate as +type:+ (must match a
  # registered prompt). Return nil if you instead build a full prompt
  # string in #build_data, in which case the engine passes it as +prompt:+.
  def prompt_type = raise(NotImplementedError)

  # Stable id for an input (was record.id). Used for output grouping/paths.
  def input_id(_input) = raise(NotImplementedError)

  # Human label for an input (was record.name). Falls back to the id as a String.
  def input_label(input) = input_id(input).to_s

  # Prompt payload for one input. With a #prompt_type it is passed as
  # +data:+; without one it must be a full prompt String passed as
  # +prompt:+ (was build_data(record)).
  def build_data(_input) = raise(NotImplementedError)

  # Parses the LLM's raw text into a Hash, or nil on failure. Delegates to
  # the gem's conservative JsonParser; override for feature-specific parsing.
  def parse(raw) = JsonParser.parse(raw)

  # Vendor-specific generation params (e.g. a deterministic Ollama seed).
  # Return {} for vendors that don't expose one.
  # rubocop:disable Lint/UnusedMethodArgument
  def vendor_params(vendor:, input_id:) = {}
  # rubocop:enable Lint/UnusedMethodArgument

  # { score: Numeric|nil, bucket: String|nil } — feeds the CSV columns and
  # the bucket-disagreement detection. +bucket+ may be any discrete label.
  def output_summary(_parsed) = raise(NotImplementedError)

  # Text inlined into the judge prompt describing the rubric the candidate
  # was asked to follow.
  def judge_rubric_excerpt = raise(NotImplementedError)

  # [{ key:, description: }] — dimensions the judge scores 0-100 each.
  def judge_dimensions = raise(NotImplementedError)

  # Extra per-row CSV columns beyond the base set. Keys become headers.
  def extra_columns(_parsed) = {}
end
77
+ end
78
+ end
@@ -0,0 +1,58 @@
1
+ # frozen_string_literal: true
2
+
3
+ module LlmConductor
4
+ module Eval
5
+ module Store
6
# Pluggable persistence interface for an eval run. Replaces the prototype's
# hard-coded Rails.root.join('tmp', ...) + File.read/write calls.
#
# The gem ships two implementations: InMemory (default; nothing hits disk)
# and FileStore (resumable, reproduces the prototype's tmp/<run_id>/
# layout). Implement this interface to persist anywhere else.
#
# Write methods return an opaque "ref" (a filesystem path for FileStore, a
# key for InMemory) recorded on the Result for the report's path columns.
# Every method here raises NotImplementedError until overridden.
class Base
  # Stores a candidate's raw text output.
  def write_raw(_run_id, _input_id, _model_slug, _text) = raise(NotImplementedError)

  # Reads back a candidate's raw text output.
  def read_raw(_run_id, _input_id, _model_slug) = raise(NotImplementedError)

  # Stores a candidate's parsed output.
  def write_parsed(_run_id, _input_id, _model_slug, _hash) = raise(NotImplementedError)

  # Returns the parsed Hash/Array (not the ref), or nil if absent.
  def read_parsed(_run_id, _input_id, _model_slug) = raise(NotImplementedError)

  # Stores the prompt data built for one input.
  def write_input_data(_run_id, _input_id, _hash) = raise(NotImplementedError)

  # Enables self-contained re-judge / report without the original inputs.
  def read_input_data(_run_id, _input_id) = raise(NotImplementedError)

  # Stores the run manifest.
  def write_manifest(_run_id, _manifest_hash) = raise(NotImplementedError)

  # Reads back the run manifest.
  def read_manifest(_run_id) = raise(NotImplementedError)

  # True when this (input, model) pair already has stored output — lets a
  # future restart skip already-completed pairs.
  def completed?(_run_id, _input_id, _model_slug) = raise(NotImplementedError)
end
56
+ end
57
+ end
58
+ end
@@ -0,0 +1,94 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+ require 'fileutils'
5
+ require_relative 'base'
6
+
7
+ module LlmConductor
8
+ module Eval
9
+ module Store
10
# Resumable on-disk store. Reproduces the Rails prototype's layout:
#
#   <base_dir>/<run_id>/manifest.json
#   <base_dir>/<run_id>/<input_id>/_input.json
#   <base_dir>/<run_id>/<input_id>/<model_slug>.raw.txt
#   <base_dir>/<run_id>/<input_id>/<model_slug>.json
#
# The manifest is rewritten after every (input, model) pair, so a run is
# reportable / re-judgeable mid-flight (see Runner.report_only/judge_only).
# Files are replaced atomically (temp file + rename) so a reader or a
# crash never leaves a half-written manifest behind.
class FileStore < Base
  attr_reader :base_dir

  # base_dir - root directory for all runs; coerced with #to_s.
  def initialize(base_dir)
    super()
    @base_dir = base_dir.to_s
  end

  # Writes the raw model output; returns the file path (the "ref").
  def write_raw(run_id, input_id, model_slug, text)
    write_file(output_path(run_id, input_id, "#{model_slug}.raw.txt"), text.to_s)
  end

  # Returns the raw model output, or nil when the file does not exist.
  def read_raw(run_id, input_id, model_slug)
    read_file(output_path(run_id, input_id, "#{model_slug}.raw.txt"))
  end

  # Writes the parsed output as pretty-printed JSON; returns the file path.
  def write_parsed(run_id, input_id, model_slug, hash)
    write_file(output_path(run_id, input_id, "#{model_slug}.json"), JSON.pretty_generate(hash))
  end

  # Returns the parsed output, or nil when missing or not valid JSON.
  def read_parsed(run_id, input_id, model_slug)
    read_json(output_path(run_id, input_id, "#{model_slug}.json"))
  end

  # Writes the input's prompt data as pretty-printed JSON; returns the file path.
  def write_input_data(run_id, input_id, hash)
    write_file(output_path(run_id, input_id, '_input.json'), JSON.pretty_generate(hash))
  end

  # Returns the input's prompt data, or nil when missing or not valid JSON.
  def read_input_data(run_id, input_id)
    read_json(output_path(run_id, input_id, '_input.json'))
  end

  # Writes the manifest as pretty-printed JSON; returns the file path.
  def write_manifest(run_id, manifest_hash)
    write_file(manifest_path(run_id), JSON.pretty_generate(manifest_hash))
  end

  # Returns the manifest with String keys (JSON.parse), or nil when missing
  # or not valid JSON.
  def read_manifest(run_id)
    read_json(manifest_path(run_id))
  end

  # True when either the parsed or the raw output file exists for the pair.
  def completed?(run_id, input_id, model_slug)
    File.exist?(output_path(run_id, input_id, "#{model_slug}.json")) ||
      File.exist?(output_path(run_id, input_id, "#{model_slug}.raw.txt"))
  end

  private

  def output_path(run_id, input_id, name)
    File.join(@base_dir, run_id.to_s, input_id.to_s, name)
  end

  def manifest_path(run_id)
    File.join(@base_dir, run_id.to_s, 'manifest.json')
  end

  # Creates the parent directory, then replaces +path+ atomically: the
  # content goes to a sibling temp file that is renamed over the target,
  # so readers see either the old or the new file, never a partial one.
  # Returns +path+.
  def write_file(path, content)
    FileUtils.mkdir_p(File.dirname(path))
    tmp_path = "#{path}.#{Process.pid}.tmp"
    File.write(tmp_path, content)
    File.rename(tmp_path, path)
    path
  ensure
    # No-op after a successful rename; removes the temp file if the write
    # or rename failed.
    FileUtils.rm_f(tmp_path) if tmp_path
  end

  def read_file(path)
    File.exist?(path) ? File.read(path) : nil
  end

  # Parsed JSON content of +path+; nil when the file is missing or its
  # content is not valid JSON.
  def read_json(path)
    return nil unless File.exist?(path)

    JSON.parse(File.read(path))
  rescue JSON::ParserError
    nil
  end
end
92
+ end
93
+ end
94
+ end