llm_conductor 1.7.1 → 1.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,76 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+ require_relative 'base'
5
+
6
+ module LlmConductor
7
+ module Eval
8
+ module Store
9
# Default store: everything lives in process memory, nothing hits disk.
# Ideal for tests and ephemeral runs. Parsed outputs, input data, and
# manifests are round-tripped through JSON on write so reads return
# string-keyed hashes, matching FileStore.
class InMemory < Base
  def initialize
    super
    @raw = {}
    @parsed = {}
    @inputs = {}
    @manifests = {}
  end

  # Stores the raw output text for one (run, input, model) triple.
  # The text is coerced with #to_s before being stored.
  #
  # @return [String] a memory:// pseudo-location for the stored text
  def write_raw(run_id, input_id, model_slug, text)
    key = output_key(run_id, input_id, model_slug)
    @raw[key] = text.to_s
    "memory://#{key}.raw"
  end

  # @return [String, nil] the stored raw text, or nil when nothing was written
  def read_raw(run_id, input_id, model_slug)
    @raw[output_key(run_id, input_id, model_slug)]
  end

  # Stores the parsed output for one (run, input, model) triple.
  #
  # The value is round-tripped through JSON, like input data and manifests,
  # so a later read returns string-keyed data that is detached from the
  # caller's object instead of whatever key style the caller passed in.
  # NOTE(review): assumes FileStore persists parsed outputs as JSON as well
  # (the ".json" pseudo-path suggests so) — confirm against FileStore.
  #
  # @return [String] a memory:// pseudo-location for the stored value
  def write_parsed(run_id, input_id, model_slug, hash)
    key = output_key(run_id, input_id, model_slug)
    @parsed[key] = json_round_trip(hash)
    "memory://#{key}.json"
  end

  # @return [Object, nil] the stored parsed output, or nil when nothing was written
  def read_parsed(run_id, input_id, model_slug)
    @parsed[output_key(run_id, input_id, model_slug)]
  end

  # Stores the input data for one (run, input) pair.
  #
  # Round-trip through JSON so reads return string-keyed hashes, matching
  # FileStore — keeps judge_only/report_only behavior identical across stores.
  def write_input_data(run_id, input_id, hash)
    @inputs[input_key(run_id, input_id)] = json_round_trip(hash)
  end

  # @return [Object, nil] the stored input data, or nil when nothing was written
  def read_input_data(run_id, input_id)
    @inputs[input_key(run_id, input_id)]
  end

  # Stores the manifest for a run, keyed by the stringified run id.
  def write_manifest(run_id, manifest_hash)
    @manifests[run_id.to_s] = json_round_trip(manifest_hash)
  end

  # @return [Object, nil] the stored manifest, or nil when nothing was written
  def read_manifest(run_id)
    @manifests[run_id.to_s]
  end

  # True when either a parsed or a raw output has been written for the triple.
  def completed?(run_id, input_id, model_slug)
    key = output_key(run_id, input_id, model_slug)
    @parsed.key?(key) || @raw.key?(key)
  end

  private

  # Serializes +value+ to JSON and parses it back, yielding a detached copy
  # with string keys.
  def json_round_trip(value)
    JSON.parse(JSON.generate(value))
  end

  def output_key(run_id, input_id, model_slug)
    "#{run_id}/#{input_id}/#{model_slug}"
  end

  def input_key(run_id, input_id)
    "#{run_id}/#{input_id}"
  end
end
74
+ end
75
+ end
76
+ end
@@ -0,0 +1,31 @@
1
+ # frozen_string_literal: true
2
+
3
+ module LlmConductor
4
+ module Eval
5
# Quality scores inside this inclusive range count as "borderline": the judge
# is uncertain enough that the row is flagged for human review. Tuned in the
# Rails prototype.
BORDERLINE_RANGE = 50..70

# One LLM-as-judge verdict for a single candidate (input, model) output.
# Ported verbatim from the prototype's Judge::Verdict struct.
Verdict = Struct.new(
  :quality_score,
  :dimensions,
  :issues,
  :verdict_one_line,
  :judge_model,
  :judge_latency_ms,
  :judge_input_tokens,
  :judge_output_tokens,
  :judge_estimated_cost_usd,
  :judge_error,
  keyword_init: true
) do
  # Whether an arbitrary score falls in BORDERLINE_RANGE. Anything that is
  # not a Numeric (nil, strings, ...) is never borderline.
  def self.borderline?(score)
    return false unless score.is_a?(Numeric)

    BORDERLINE_RANGE.cover?(score)
  end

  # Whether this verdict's own quality_score is borderline.
  def borderline?
    Verdict.borderline?(quality_score)
  end

  # Member/value pairs with the member names as strings, for JSON manifest
  # persistence.
  def to_h
    symbol_keyed = super
    symbol_keyed.each_with_object({}) do |(member, value), stringified|
      stringified[member.to_s] = value
    end
  end
end
30
+ end
31
+ end
@@ -0,0 +1,75 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'logger'
4
+ require 'time'
5
+
6
+ require 'llm_conductor'
7
+
8
+ require_relative 'eval/json_parser'
9
+ require_relative 'eval/result'
10
+ require_relative 'eval/verdict'
11
+ require_relative 'eval/spec'
12
+ require_relative 'eval/store/base'
13
+ require_relative 'eval/store/in_memory'
14
+ require_relative 'eval/store/file_store'
15
+ require_relative 'eval/model_runner'
16
+ require_relative 'eval/judge'
17
+ require_relative 'eval/report'
18
+ require_relative 'eval/report_builder'
19
+ require_relative 'eval/runner'
20
+
21
+ module LlmConductor
22
+ # Opt-in model-evaluation harness. `require 'llm_conductor/eval'` to load it;
23
+ # core `require 'llm_conductor'` users pay nothing.
24
+ #
25
+ # Runs the same prompt across N (model, vendor) pairs over M caller-supplied
26
+ # inputs, then compares them on cost, latency, tokens, and LLM-judged quality.
27
+ # The engine is feature-agnostic; everything feature-specific lives in a Spec.
28
+ #
29
+ # require 'llm_conductor/eval'
30
+ #
31
+ # report = LlmConductor::Eval.run(
32
+ # spec: MyFeatureSpec.new,
33
+ # inputs: my_inputs, # any enumerable; engine never selects/queries
34
+ # models: [{ model: 'gpt-4o-mini', vendor: :openai },
35
+ # { model: 'gemini-2.5-flash', vendor: :gemini }],
36
+ # judge: { model: 'llama-3.3-70b-versatile', vendor: :groq }
37
+ # )
38
+ # report.summary # per-model aggregates
39
+ # report.to_markdown # decision-aid report (caller persists)
40
+ # report.to_csv # per-row data
41
+ # report.needs_review # rows flagged for human eyeball
42
module Eval
  module_function

  # The single entrypoint. +spec+ implements Eval::Spec; +inputs+ is any
  # enumerable of opaque objects the spec knows how to interpret; +models+ is
  # the caller-owned list of { model:, vendor: } candidate pairs.
  #
  # Optional collaborators fall back to defaults when omitted: +store+ to a
  # fresh Store::InMemory, +logger+ to #default_logger, and +run_id+ to
  # #generate_run_id. Returns whatever Runner#run returns.
  def run(spec:, inputs:, models:, judge: {}, store: nil, logger: nil, run_id: nil)
    Runner.new(
      spec:, inputs:, models:, judge:,
      store: store || Store::InMemory.new,
      logger: logger || default_logger,
      run_id: run_id || generate_run_id
    ).run
  end

  # Re-judge stored candidate outputs without recalling the candidate models.
  # +logger+ falls back to #default_logger when omitted.
  def judge_only(run_id:, spec:, store:, judge: {}, logger: nil)
    Runner.judge_only(run_id:, spec:, store:, judge:, logger: logger || default_logger)
  end

  # Rebuild the Report from a stored manifest, no model or judge calls.
  def report_only(run_id:, spec:, store:)
    Runner.report_only(run_id:, spec:, store:)
  end

  # The logger set on LlmConductor.configuration, or a new stdout Logger
  # when none is configured.
  def default_logger
    LlmConductor.configuration.logger || Logger.new($stdout)
  end

  # Builds a run id of the form "run_YYYYMMDD_HHMMSS_xxxxxx" (UTC).
  #
  # The timestamp alone only has one-second resolution, so two runs started
  # in the same second would share an id and therefore share stored
  # artifacts. The 6-hex-digit random suffix keeps such runs apart while the
  # timestamp prefix still sorts ids chronologically.
  def generate_run_id
    timestamp = Time.now.utc.strftime('%Y%m%d_%H%M%S')
    suffix = format('%06x', rand(0x1000000))
    "run_#{timestamp}_#{suffix}"
  end
end
75
+ end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module LlmConductor
4
- VERSION = '1.7.1'
4
+ VERSION = '1.8.0'
5
5
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: llm_conductor
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.7.1
4
+ version: 1.8.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben Zheng
8
8
  bindir: exe
9
9
  cert_chain: []
10
- date: 2026-05-16 00:00:00.000000000 Z
10
+ date: 2026-06-10 00:00:00.000000000 Z
11
11
  dependencies:
12
12
  - !ruby/object:Gem::Dependency
13
13
  name: activesupport
@@ -37,6 +37,20 @@ dependencies:
37
37
  - - "~>"
38
38
  - !ruby/object:Gem::Version
39
39
  version: '1.7'
40
+ - !ruby/object:Gem::Dependency
41
+ name: csv
42
+ requirement: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - "~>"
45
+ - !ruby/object:Gem::Version
46
+ version: '3.0'
47
+ type: :runtime
48
+ prerelease: false
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - "~>"
52
+ - !ruby/object:Gem::Version
53
+ version: '3.0'
40
54
  - !ruby/object:Gem::Dependency
41
55
  name: gemini-ai
42
56
  requirement: !ruby/object:Gem::Requirement
@@ -162,6 +176,7 @@ files:
162
176
  - examples/gemini_vision_usage.rb
163
177
  - examples/gpt_vision_usage.rb
164
178
  - examples/groq_usage.rb
179
+ - examples/model_eval_usage.rb
165
180
  - examples/ollama_params_usage.rb
166
181
  - examples/openrouter_vision_usage.rb
167
182
  - examples/prompt_registration.rb
@@ -181,6 +196,19 @@ files:
181
196
  - lib/llm_conductor/clients/zai_client.rb
182
197
  - lib/llm_conductor/configuration.rb
183
198
  - lib/llm_conductor/data_builder.rb
199
+ - lib/llm_conductor/eval.rb
200
+ - lib/llm_conductor/eval/json_parser.rb
201
+ - lib/llm_conductor/eval/judge.rb
202
+ - lib/llm_conductor/eval/model_runner.rb
203
+ - lib/llm_conductor/eval/report.rb
204
+ - lib/llm_conductor/eval/report_builder.rb
205
+ - lib/llm_conductor/eval/result.rb
206
+ - lib/llm_conductor/eval/runner.rb
207
+ - lib/llm_conductor/eval/spec.rb
208
+ - lib/llm_conductor/eval/store/base.rb
209
+ - lib/llm_conductor/eval/store/file_store.rb
210
+ - lib/llm_conductor/eval/store/in_memory.rb
211
+ - lib/llm_conductor/eval/verdict.rb
184
212
  - lib/llm_conductor/patches/gemini_vertex_api_key.rb
185
213
  - lib/llm_conductor/prompt_manager.rb
186
214
  - lib/llm_conductor/prompts.rb