RubyGems - llm_conductor - Versions diffs - 1.7.1 → 1.8.0 - Mend

llm_conductor 1.7.1 → 1.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

checksums.yaml +4 -4
data/.rubocop.yml +10 -0
data/README.md +63 -0
data/examples/model_eval_usage.rb +86 -0
data/lib/llm_conductor/eval/json_parser.rb +76 -0
data/lib/llm_conductor/eval/judge.rb +188 -0
data/lib/llm_conductor/eval/model_runner.rb +95 -0
data/lib/llm_conductor/eval/report.rb +22 -0
data/lib/llm_conductor/eval/report_builder.rb +258 -0
data/lib/llm_conductor/eval/result.rb +30 -0
data/lib/llm_conductor/eval/runner.rb +148 -0
data/lib/llm_conductor/eval/spec.rb +78 -0
data/lib/llm_conductor/eval/store/base.rb +58 -0
data/lib/llm_conductor/eval/store/file_store.rb +94 -0
data/lib/llm_conductor/eval/store/in_memory.rb +76 -0
data/lib/llm_conductor/eval/verdict.rb +31 -0
data/lib/llm_conductor/eval.rb +75 -0
data/lib/llm_conductor/version.rb +1 -1
metadata +30 -2

data/lib/llm_conductor/eval/store/in_memory.rb ADDED Viewed

@@ -0,0 +1,76 @@
+# frozen_string_literal: true
+require 'json'
+require_relative 'base'
+module LlmConductor
+  module Eval
+    module Store
+      # Default store: everything lives in process memory, nothing hits disk.
+      # Ideal for tests and ephemeral runs. Manifests are round-tripped through
+      # JSON on write so reads return string-keyed hashes, matching FileStore.
+      class InMemory < Base
+        def initialize
+          super
+          @raw = {}
+          @parsed = {}
+          @inputs = {}
+          @manifests = {}
+        end
+        def write_raw(run_id, input_id, model_slug, text)
+          key = output_key(run_id, input_id, model_slug)
+          @raw[key] = text.to_s
+          "memory://#{key}.raw"
+        end
+        def read_raw(run_id, input_id, model_slug)
+          @raw[output_key(run_id, input_id, model_slug)]
+        end
+        def write_parsed(run_id, input_id, model_slug, hash)
+          key = output_key(run_id, input_id, model_slug)
+          @parsed[key] = hash
+          "memory://#{key}.json"
+        end
+        def read_parsed(run_id, input_id, model_slug)
+          @parsed[output_key(run_id, input_id, model_slug)]
+        end
+        def write_input_data(run_id, input_id, hash)
+          # Round-trip through JSON so reads return string-keyed hashes, matching
+          # FileStore — keeps judge_only/report_only behavior identical across stores.
+          @inputs[input_key(run_id, input_id)] = JSON.parse(JSON.generate(hash))
+        end
+        def read_input_data(run_id, input_id)
+          @inputs[input_key(run_id, input_id)]
+        end
+        def write_manifest(run_id, manifest_hash)
+          @manifests[run_id.to_s] = JSON.parse(JSON.generate(manifest_hash))
+        end
+        def read_manifest(run_id)
+          @manifests[run_id.to_s]
+        end
+        def completed?(run_id, input_id, model_slug)
+          key = output_key(run_id, input_id, model_slug)
+          @parsed.key?(key) || @raw.key?(key)
+        end
+        private
+        def output_key(run_id, input_id, model_slug)
+          "#{run_id}/#{input_id}/#{model_slug}"
+        end
+        def input_key(run_id, input_id)
+          "#{run_id}/#{input_id}"
+        end
+      end
+    end
+  end
+end

data/lib/llm_conductor/eval/verdict.rb ADDED Viewed

@@ -0,0 +1,31 @@
+# frozen_string_literal: true
+module LlmConductor
+  module Eval
+    # Scores in this range are "borderline" — the judge is uncertain enough that
+    # the row is flagged for human review. Tuned in the Rails prototype.
+    BORDERLINE_RANGE = (50..70)
+    # The LLM-as-judge's verdict for one candidate (input, model) output.
+    # Ported verbatim from the prototype's Judge::Verdict struct.
+    Verdict = Struct.new(
+      :quality_score, :dimensions, :issues, :verdict_one_line,
+      :judge_model, :judge_latency_ms, :judge_input_tokens, :judge_output_tokens,
+      :judge_estimated_cost_usd, :judge_error,
+      keyword_init: true
+    ) do
+      # String-keyed hash for JSON manifest persistence.
+      def to_h
+        super.transform_keys(&:to_s)
+      end
+      def borderline?
+        Verdict.borderline?(quality_score)
+      end
+      def self.borderline?(score)
+        score.is_a?(Numeric) && BORDERLINE_RANGE.cover?(score)
+      end
+    end
+  end
+end

data/lib/llm_conductor/eval.rb ADDED Viewed

@@ -0,0 +1,75 @@
+# frozen_string_literal: true
+require 'logger'
+require 'time'
+require 'llm_conductor'
+require_relative 'eval/json_parser'
+require_relative 'eval/result'
+require_relative 'eval/verdict'
+require_relative 'eval/spec'
+require_relative 'eval/store/base'
+require_relative 'eval/store/in_memory'
+require_relative 'eval/store/file_store'
+require_relative 'eval/model_runner'
+require_relative 'eval/judge'
+require_relative 'eval/report'
+require_relative 'eval/report_builder'
+require_relative 'eval/runner'
+module LlmConductor
+  # Opt-in model-evaluation harness. `require 'llm_conductor/eval'` to load it;
+  # core `require 'llm_conductor'` users pay nothing.
+  #
+  # Runs the same prompt across N (model, vendor) pairs over M caller-supplied
+  # inputs, then compares them on cost, latency, tokens, and LLM-judged quality.
+  # The engine is feature-agnostic; everything feature-specific lives in a Spec.
+  #
+  #   require 'llm_conductor/eval'
+  #
+  #   report = LlmConductor::Eval.run(
+  #     spec:   MyFeatureSpec.new,
+  #     inputs: my_inputs,                       # any enumerable; engine never selects/queries
+  #     models: [{ model: 'gpt-4o-mini', vendor: :openai },
+  #              { model: 'gemini-2.5-flash', vendor: :gemini }],
+  #     judge:  { model: 'llama-3.3-70b-versatile', vendor: :groq }
+  #   )
+  #   report.summary       # per-model aggregates
+  #   report.to_markdown   # decision-aid report (caller persists)
+  #   report.to_csv        # per-row data
+  #   report.needs_review  # rows flagged for human eyeball
+  module Eval
+    module_function
+    # The single entrypoint. +spec+ implements Eval::Spec; +inputs+ is any
+    # enumerable of opaque objects the spec knows how to interpret; +models+ is
+    # the caller-owned list of { model:, vendor: } candidate pairs.
+    def run(spec:, inputs:, models:, judge: {}, store: nil, logger: nil, run_id: nil)
+      Runner.new(
+        spec:, inputs:, models:, judge:,
+        store: store || Store::InMemory.new,
+        logger: logger || default_logger,
+        run_id: run_id || generate_run_id
+      ).run
+    end
+    # Re-judge stored candidate outputs without recalling the candidate models.
+    def judge_only(run_id:, spec:, store:, judge: {}, logger: nil)
+      Runner.judge_only(run_id:, spec:, store:, judge:, logger: logger || default_logger)
+    end
+    # Rebuild the Report from a stored manifest, no model or judge calls.
+    def report_only(run_id:, spec:, store:)
+      Runner.report_only(run_id:, spec:, store:)
+    end
+    def default_logger
+      LlmConductor.configuration.logger || Logger.new($stdout)
+    end
+    def generate_run_id
+      "run_#{Time.now.utc.strftime('%Y%m%d_%H%M%S')}"
+    end
+  end
+end

data/lib/llm_conductor/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module LlmConductor
-  VERSION = '1.7.1'
+  VERSION = '1.8.0'
 end

metadata CHANGED Viewed

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: llm_conductor
 version: !ruby/object:Gem::Version
-  version: 1.7.1
+  version: 1.8.0
 platform: ruby
 authors:
 - Ben Zheng
 bindir: exe
 cert_chain: []
-date: 2026-05-16 00:00:00.000000000 Z
+date: 2026-06-10 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: activesupport
@@ -37,6 +37,20 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '1.7'
+- !ruby/object:Gem::Dependency
+  name: csv
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.0'
 - !ruby/object:Gem::Dependency
   name: gemini-ai
   requirement: !ruby/object:Gem::Requirement
@@ -162,6 +176,7 @@ files:
 - examples/gemini_vision_usage.rb
 - examples/gpt_vision_usage.rb
 - examples/groq_usage.rb
+- examples/model_eval_usage.rb
 - examples/ollama_params_usage.rb
 - examples/openrouter_vision_usage.rb
 - examples/prompt_registration.rb
@@ -181,6 +196,19 @@ files:
 - lib/llm_conductor/clients/zai_client.rb
 - lib/llm_conductor/configuration.rb
 - lib/llm_conductor/data_builder.rb
+- lib/llm_conductor/eval.rb
+- lib/llm_conductor/eval/json_parser.rb
+- lib/llm_conductor/eval/judge.rb
+- lib/llm_conductor/eval/model_runner.rb
+- lib/llm_conductor/eval/report.rb
+- lib/llm_conductor/eval/report_builder.rb
+- lib/llm_conductor/eval/result.rb
+- lib/llm_conductor/eval/runner.rb
+- lib/llm_conductor/eval/spec.rb
+- lib/llm_conductor/eval/store/base.rb
+- lib/llm_conductor/eval/store/file_store.rb
+- lib/llm_conductor/eval/store/in_memory.rb
+- lib/llm_conductor/eval/verdict.rb
 - lib/llm_conductor/patches/gemini_vertex_api_key.rb
 - lib/llm_conductor/prompt_manager.rb
 - lib/llm_conductor/prompts.rb