llm_conductor 1.7.1 → 1.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +10 -0
- data/README.md +63 -0
- data/examples/model_eval_usage.rb +86 -0
- data/lib/llm_conductor/eval/json_parser.rb +76 -0
- data/lib/llm_conductor/eval/judge.rb +188 -0
- data/lib/llm_conductor/eval/model_runner.rb +95 -0
- data/lib/llm_conductor/eval/report.rb +22 -0
- data/lib/llm_conductor/eval/report_builder.rb +258 -0
- data/lib/llm_conductor/eval/result.rb +30 -0
- data/lib/llm_conductor/eval/runner.rb +148 -0
- data/lib/llm_conductor/eval/spec.rb +78 -0
- data/lib/llm_conductor/eval/store/base.rb +58 -0
- data/lib/llm_conductor/eval/store/file_store.rb +94 -0
- data/lib/llm_conductor/eval/store/in_memory.rb +76 -0
- data/lib/llm_conductor/eval/verdict.rb +31 -0
- data/lib/llm_conductor/eval.rb +75 -0
- data/lib/llm_conductor/version.rb +1 -1
- metadata +30 -2
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
require_relative 'base'
|
|
5
|
+
|
|
6
|
+
module LlmConductor
|
|
7
|
+
module Eval
|
|
8
|
+
module Store
|
|
9
|
+
# Default store: everything lives in process memory, nothing hits disk.
# Ideal for tests and ephemeral runs. Parsed outputs, input data and
# manifests are all round-tripped through JSON on write so reads return
# string-keyed hashes, matching FileStore.
class InMemory < Base
  def initialize
    super
    @raw = {}
    @parsed = {}
    @inputs = {}
    @manifests = {}
  end

  # Stores the raw model output (coerced with #to_s) for one
  # (run, input, model) triple. Returns a "memory://….raw" locator string.
  def write_raw(run_id, input_id, model_slug, text)
    key = output_key(run_id, input_id, model_slug)
    @raw[key] = text.to_s
    "memory://#{key}.raw"
  end

  # Returns the stored raw text, or nil when nothing was written for the triple.
  def read_raw(run_id, input_id, model_slug)
    @raw[output_key(run_id, input_id, model_slug)]
  end

  # Stores the parsed output for one (run, input, model) triple and returns a
  # "memory://….json" locator string.
  #
  # The value is normalized through JSON before it is kept — the same
  # treatment #write_input_data and #write_manifest apply — so #read_parsed
  # hands back string-keyed data that is detached from the caller's object
  # (later mutation of +hash+ by the caller cannot alter what was stored).
  def write_parsed(run_id, input_id, model_slug, hash)
    key = output_key(run_id, input_id, model_slug)
    @parsed[key] = json_round_trip(hash)
    "memory://#{key}.json"
  end

  # Returns the stored parsed output, or nil when nothing was written.
  def read_parsed(run_id, input_id, model_slug)
    @parsed[output_key(run_id, input_id, model_slug)]
  end

  # Round-trip through JSON so reads return string-keyed hashes, matching
  # FileStore — keeps judge_only/report_only behavior identical across stores.
  # Returns the normalized copy that was stored.
  def write_input_data(run_id, input_id, hash)
    @inputs[input_key(run_id, input_id)] = json_round_trip(hash)
  end

  # Returns the stored input data, or nil when nothing was written.
  def read_input_data(run_id, input_id)
    @inputs[input_key(run_id, input_id)]
  end

  # Stores the manifest under the stringified run id, JSON-normalized.
  # Returns the normalized copy that was stored.
  def write_manifest(run_id, manifest_hash)
    @manifests[run_id.to_s] = json_round_trip(manifest_hash)
  end

  # Returns the stored manifest for the run, or nil when there is none.
  def read_manifest(run_id)
    @manifests[run_id.to_s]
  end

  # True once either a parsed or a raw output exists for the triple.
  def completed?(run_id, input_id, model_slug)
    key = output_key(run_id, input_id, model_slug)
    @parsed.key?(key) || @raw.key?(key)
  end

  private

  # Serializes and re-parses +value+ so the result contains only JSON-native
  # types with string keys and shares no objects with the caller's value.
  def json_round_trip(value)
    JSON.parse(JSON.generate(value))
  end

  # Lookup key shared by the raw and parsed output tables.
  def output_key(run_id, input_id, model_slug)
    "#{run_id}/#{input_id}/#{model_slug}"
  end

  # Lookup key for the per-input data table.
  def input_key(run_id, input_id)
    "#{run_id}/#{input_id}"
  end
end
|
|
74
|
+
end
|
|
75
|
+
end
|
|
76
|
+
end
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LlmConductor
|
|
4
|
+
module Eval
|
|
5
|
+
# Inclusive band of quality scores treated as "borderline": the judge is
# uncertain enough that the row gets flagged for human review. The bounds
# were tuned in the Rails prototype.
BORDERLINE_RANGE = (50..70)

# What the LLM-as-judge concluded about one candidate (input, model) output.
# Ported verbatim from the prototype's Judge::Verdict struct.
Verdict = Struct.new(
  :quality_score,
  :dimensions,
  :issues,
  :verdict_one_line,
  :judge_model,
  :judge_latency_ms,
  :judge_input_tokens,
  :judge_output_tokens,
  :judge_estimated_cost_usd,
  :judge_error,
  keyword_init: true
) do
  # Classifies a bare score without needing a Verdict instance.
  # Anything that is not Numeric (nil included) is never borderline.
  def self.borderline?(score)
    return false unless score.is_a?(Numeric)

    BORDERLINE_RANGE.cover?(score)
  end

  # Whether this verdict's own quality_score falls in the borderline band.
  def borderline?
    Verdict.borderline?(quality_score)
  end

  # Member hash with String keys instead of Symbols, for JSON manifest
  # persistence.
  def to_h
    symbol_keyed = super
    symbol_keyed.transform_keys(&:to_s)
  end
end
|
|
30
|
+
end
|
|
31
|
+
end
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'logger'
|
|
4
|
+
require 'time'
|
|
5
|
+
|
|
6
|
+
require 'llm_conductor'
|
|
7
|
+
|
|
8
|
+
require_relative 'eval/json_parser'
|
|
9
|
+
require_relative 'eval/result'
|
|
10
|
+
require_relative 'eval/verdict'
|
|
11
|
+
require_relative 'eval/spec'
|
|
12
|
+
require_relative 'eval/store/base'
|
|
13
|
+
require_relative 'eval/store/in_memory'
|
|
14
|
+
require_relative 'eval/store/file_store'
|
|
15
|
+
require_relative 'eval/model_runner'
|
|
16
|
+
require_relative 'eval/judge'
|
|
17
|
+
require_relative 'eval/report'
|
|
18
|
+
require_relative 'eval/report_builder'
|
|
19
|
+
require_relative 'eval/runner'
|
|
20
|
+
|
|
21
|
+
module LlmConductor
|
|
22
|
+
# Opt-in model-evaluation harness. `require 'llm_conductor/eval'` to load it;
|
|
23
|
+
# core `require 'llm_conductor'` users pay nothing.
|
|
24
|
+
#
|
|
25
|
+
# Runs the same prompt across N (model, vendor) pairs over M caller-supplied
|
|
26
|
+
# inputs, then compares them on cost, latency, tokens, and LLM-judged quality.
|
|
27
|
+
# The engine is feature-agnostic; everything feature-specific lives in a Spec.
|
|
28
|
+
#
|
|
29
|
+
# require 'llm_conductor/eval'
|
|
30
|
+
#
|
|
31
|
+
# report = LlmConductor::Eval.run(
|
|
32
|
+
# spec: MyFeatureSpec.new,
|
|
33
|
+
# inputs: my_inputs, # any enumerable; engine never selects/queries
|
|
34
|
+
# models: [{ model: 'gpt-4o-mini', vendor: :openai },
|
|
35
|
+
# { model: 'gemini-2.5-flash', vendor: :gemini }],
|
|
36
|
+
# judge: { model: 'llama-3.3-70b-versatile', vendor: :groq }
|
|
37
|
+
# )
|
|
38
|
+
# report.summary # per-model aggregates
|
|
39
|
+
# report.to_markdown # decision-aid report (caller persists)
|
|
40
|
+
# report.to_csv # per-row data
|
|
41
|
+
# report.needs_review # rows flagged for human eyeball
|
|
42
|
+
module Eval
  module_function

  # The single entrypoint. +spec+ implements Eval::Spec; +inputs+ is any
  # enumerable of opaque objects the spec knows how to interpret; +models+ is
  # the caller-owned list of { model:, vendor: } candidate pairs.
  #
  # Optional arguments fall back as follows: +store+ to a fresh
  # Store::InMemory, +logger+ to #default_logger, +run_id+ to
  # #generate_run_id. Returns whatever Runner#run returns.
  def run(spec:, inputs:, models:, judge: {}, store: nil, logger: nil, run_id: nil)
    Runner.new(
      spec:, inputs:, models:, judge:,
      store: store || Store::InMemory.new,
      logger: logger || default_logger,
      run_id: run_id || generate_run_id
    ).run
  end

  # Re-judge stored candidate outputs without recalling the candidate models.
  # Delegates to Runner.judge_only, supplying #default_logger when no logger
  # is given.
  def judge_only(run_id:, spec:, store:, judge: {}, logger: nil)
    Runner.judge_only(run_id:, spec:, store:, judge:, logger: logger || default_logger)
  end

  # Rebuild the Report from a stored manifest, no model or judge calls.
  # Delegates to Runner.report_only.
  def report_only(run_id:, spec:, store:)
    Runner.report_only(run_id:, spec:, store:)
  end

  # The logger configured on LlmConductor when one is set, otherwise a new
  # Logger writing to $stdout.
  def default_logger
    LlmConductor.configuration.logger || Logger.new($stdout)
  end

  # Builds a run id of the form "run_YYYYMMDD_HHMMSS_xxxxxxxx" (UTC).
  #
  # The timestamp alone only has one-second resolution, so two runs started
  # in the same second (parallel processes, fast test suites) would share an
  # id and overlap in any store they share. The 8-hex-digit random suffix
  # keeps ids distinct while the timestamp prefix keeps them sortable by
  # start time.
  def generate_run_id
    stamp = Time.now.utc.strftime('%Y%m%d_%H%M%S')
    suffix = Random.urandom(4).unpack1('H*')
    "run_#{stamp}_#{suffix}"
  end
end
|
|
75
|
+
end
|
metadata
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: llm_conductor
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.7.1
|
|
4
|
+
version: 1.8.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ben Zheng
|
|
8
8
|
bindir: exe
|
|
9
9
|
cert_chain: []
|
|
10
|
-
date: 2026-
|
|
10
|
+
date: 2026-06-10 00:00:00.000000000 Z
|
|
11
11
|
dependencies:
|
|
12
12
|
- !ruby/object:Gem::Dependency
|
|
13
13
|
name: activesupport
|
|
@@ -37,6 +37,20 @@ dependencies:
|
|
|
37
37
|
- - "~>"
|
|
38
38
|
- !ruby/object:Gem::Version
|
|
39
39
|
version: '1.7'
|
|
40
|
+
- !ruby/object:Gem::Dependency
|
|
41
|
+
name: csv
|
|
42
|
+
requirement: !ruby/object:Gem::Requirement
|
|
43
|
+
requirements:
|
|
44
|
+
- - "~>"
|
|
45
|
+
- !ruby/object:Gem::Version
|
|
46
|
+
version: '3.0'
|
|
47
|
+
type: :runtime
|
|
48
|
+
prerelease: false
|
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
50
|
+
requirements:
|
|
51
|
+
- - "~>"
|
|
52
|
+
- !ruby/object:Gem::Version
|
|
53
|
+
version: '3.0'
|
|
40
54
|
- !ruby/object:Gem::Dependency
|
|
41
55
|
name: gemini-ai
|
|
42
56
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -162,6 +176,7 @@ files:
|
|
|
162
176
|
- examples/gemini_vision_usage.rb
|
|
163
177
|
- examples/gpt_vision_usage.rb
|
|
164
178
|
- examples/groq_usage.rb
|
|
179
|
+
- examples/model_eval_usage.rb
|
|
165
180
|
- examples/ollama_params_usage.rb
|
|
166
181
|
- examples/openrouter_vision_usage.rb
|
|
167
182
|
- examples/prompt_registration.rb
|
|
@@ -181,6 +196,19 @@ files:
|
|
|
181
196
|
- lib/llm_conductor/clients/zai_client.rb
|
|
182
197
|
- lib/llm_conductor/configuration.rb
|
|
183
198
|
- lib/llm_conductor/data_builder.rb
|
|
199
|
+
- lib/llm_conductor/eval.rb
|
|
200
|
+
- lib/llm_conductor/eval/json_parser.rb
|
|
201
|
+
- lib/llm_conductor/eval/judge.rb
|
|
202
|
+
- lib/llm_conductor/eval/model_runner.rb
|
|
203
|
+
- lib/llm_conductor/eval/report.rb
|
|
204
|
+
- lib/llm_conductor/eval/report_builder.rb
|
|
205
|
+
- lib/llm_conductor/eval/result.rb
|
|
206
|
+
- lib/llm_conductor/eval/runner.rb
|
|
207
|
+
- lib/llm_conductor/eval/spec.rb
|
|
208
|
+
- lib/llm_conductor/eval/store/base.rb
|
|
209
|
+
- lib/llm_conductor/eval/store/file_store.rb
|
|
210
|
+
- lib/llm_conductor/eval/store/in_memory.rb
|
|
211
|
+
- lib/llm_conductor/eval/verdict.rb
|
|
184
212
|
- lib/llm_conductor/patches/gemini_vertex_api_key.rb
|
|
185
213
|
- lib/llm_conductor/prompt_manager.rb
|
|
186
214
|
- lib/llm_conductor/prompts.rb
|