llm_bench 0.1.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.claude/settings.local.json +2 -1
- data/.rubocop.yml +57 -0
- data/Dockerfile +35 -0
- data/README.md +68 -13
- data/Rakefile +3 -1
- data/exe/llm_bench +93 -48
- data/lib/llm_bench/benchmark.rb +162 -183
- data/lib/llm_bench/benchmark_factory.rb +39 -0
- data/lib/llm_bench/colors.rb +50 -0
- data/lib/llm_bench/configuration_manager.rb +66 -0
- data/lib/llm_bench/parallel_benchmark.rb +37 -111
- data/lib/llm_bench/results_formatter.rb +168 -0
- data/lib/llm_bench/tracker.rb +69 -111
- data/lib/llm_bench/version.rb +4 -2
- data/lib/llm_bench.rb +6 -2
- data/llm_bench.gemspec +12 -3
- metadata +28 -6
data/lib/llm_bench/parallel_benchmark.rb
CHANGED
@@ -1,130 +1,56 @@
-
- class ParallelBenchmark
-   def initialize(config, print_result = false)
-     @config = config
-     @print_result = print_result
-   end
-
-   def run_all
-     puts "=== LLM Benchmark ==="
-     puts "Running benchmarks on all configured models..."
-     puts "Starting at #{Time.now.strftime('%Y-%m-%d %H:%M:%S.%3N')}"
-     puts
-
-     benchmarks = create_benchmarks
-     results = run_parallel(benchmarks)
-
-     display_results_table(results)
-     display_summary(results)
-   end
-
-   def run_silent
-     benchmarks = create_benchmarks
-     run_parallel(benchmarks)
-   end
-
-   private
+ # frozen_string_literal: true

-     benchmarks = []
+ require_relative "colors"

+ module LLMBench
+   class ParallelBenchmark
+     def initialize(config_manager:, print_result: false)
+       @config_manager = config_manager
+       @config = config_manager.config
+       @print_result = print_result
+       @benchmark_factory = BenchmarkFactory.new(config_manager:, print_result:)
+       @results_formatter = ResultsFormatter.new(print_result:)
      end

+     def run_all
+       puts Colors.header("=== LLM Benchmark ===")
+       puts Colors.info("Running benchmarks on all configured models...")
+       puts Colors.border("Starting at #{Time.now.strftime("%Y-%m-%d %H:%M:%S.%3N")}")
+       puts

-     mutex = Mutex.new
+       benchmarks = create_benchmarks
+       results = run_parallel(benchmarks:)

-       result = benchmark.run_benchmark_for_results
-       mutex.synchronize { results << result }
-     end
+       results_formatter.display_results_table(results)
+       results_formatter.display_summary(results)
      end

-   def display_results_table(results)
-     sorted_results = results.sort_by { |r| -r[:tokens_per_second] }
-
-     provider_width = sorted_results.map { |r| r[:provider].length }.max
-     model_width = sorted_results.map { |r| r[:model].length }.max
-     tokens_width = 12
-     tps_width = 15
-
-     if @print_result
-       header = "| #{"Provider".ljust(provider_width)} | #{"Model".ljust(model_width)} | #{"Total Tokens".rjust(tokens_width)} | #{"Tokens/sec".rjust(tps_width)} | Message Content"
-       separator = "| #{'-' * provider_width} | #{'-' * model_width} | #{'-' * tokens_width} | #{'-' * tps_width} | #{'-' * 80}"
-     else
-       header = "| #{"Provider".ljust(provider_width)} | #{"Model".ljust(model_width)} | #{"Total Tokens".rjust(tokens_width)} | #{"Tokens/sec".rjust(tps_width)} |"
-       separator = "| #{'-' * provider_width} | #{'-' * model_width} | #{'-' * tokens_width} | #{'-' * tps_width} |"
+     def run_silent
+       benchmarks = create_benchmarks
+       run_parallel(benchmarks:)
      end

-     puts separator
+     private

-       provider_col = result[:provider].ljust(provider_width)
-       model_col = result[:model].ljust(model_width)
+     attr_reader :print_result, :config, :config_manager, :benchmark_factory, :results_formatter

+     def create_benchmarks
+       benchmark_factory.create_all_benchmarks
+     end

-       else
-         puts "| #{provider_col} | #{model_col} | #{tokens_col} | #{tps_col} |"
-       end
-     else
-       tokens_col = "ERROR".rjust(tokens_width)
-       tps_col = "FAILED".rjust(tps_width)
+     def run_parallel(benchmarks:)
+       results = []
+       mutex = Mutex.new

+       threads = benchmarks.map do |benchmark|
+         Thread.new do
+           result = benchmark.run_benchmark_for_results
+           mutex.synchronize { results << result }
          end
        end
-     end
-
-     puts
-   end

-     failed = results.select { |r| !r[:success] }
-
-     puts "=== Summary ==="
-     puts "Total benchmarks: #{results.length}"
-     puts "Successful: #{successful.length}"
-     puts "Failed: #{failed.length}"
-
-     if successful.any?
-       avg_tps = successful.map { |r| r[:tokens_per_second] }.sum / successful.length
-       fastest = successful.max_by { |r| r[:tokens_per_second] }
-       slowest = successful.min_by { |r| r[:tokens_per_second] }
-
-       puts "Average tokens/sec: #{avg_tps.round(2)}"
-       puts "Fastest: #{fastest[:provider]}/#{fastest[:model]} (#{fastest[:tokens_per_second]} tokens/sec)"
-       puts "Slowest: #{slowest[:provider]}/#{slowest[:model]} (#{slowest[:tokens_per_second]} tokens/sec)"
+       threads.each(&:join)
+       results
      end
-
-     return unless failed.any?
-
-     puts "\nFailed benchmarks:"
-     failed.each do |result|
-       puts " #{result[:provider]}/#{result[:model]}: #{result[:error]}"
-     end
-   end
    end
- end
+ end
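The hunk above replaces the positional `ParallelBenchmark.new(config, print_result)` constructor with keyword arguments and delegates benchmark creation and output to the new `BenchmarkFactory` and `ResultsFormatter` collaborators. A minimal usage sketch follows; it assumes `ConfigurationManager` can be built with no arguments, which this diff does not show.

```ruby
require "llm_bench"

# Assumption: ConfigurationManager.new with no arguments loads the default config;
# its real signature lives in configuration_manager.rb, which is not shown in this diff.
config_manager = LLMBench::ConfigurationManager.new

runner = LLMBench::ParallelBenchmark.new(config_manager: config_manager, print_result: true)
runner.run_all                # prints the results table and summary
results = runner.run_silent   # returns the raw results array without printing
```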
data/lib/llm_bench/results_formatter.rb
ADDED
@@ -0,0 +1,168 @@
+ # frozen_string_literal: true
+
+ require_relative "colors"
+
+ module LLMBench
+   class ResultsFormatter
+     def initialize(print_result: false)
+       @print_result = print_result
+     end
+
+     def display_results_table(results)
+       sorted_results = results.sort_by { |r| -r[:tokens_per_second] }
+
+       provider_width = calculate_column_width(sorted_results, :provider)
+       model_width = calculate_column_width(sorted_results, :model)
+       tokens_width = 12
+       tps_width = 15
+
+       header, separator = build_table_header(provider_width:, model_width:, tokens_width:, tps_width:)
+
+       puts Colors.header(header)
+       puts Colors.border(separator)
+
+       display_table_rows(sorted_results, provider_width:, model_width:, tokens_width:, tps_width:)
+       puts
+     end
+
+     def display_summary(results)
+       successful = results.select { |r| r[:success] }
+       failed = results.reject { |r| r[:success] }
+
+       puts Colors.header("=== Summary ===")
+       puts Colors.info("Total benchmarks: #{results.length}")
+       puts Colors.success("Successful: #{successful.length}")
+       puts Colors.error("Failed: #{failed.length}")
+
+       display_performance_metrics(successful) if successful.any?
+
+       display_failed_benchmarks(failed) if failed.any?
+     end
+
+     def display_cycle_summary(results)
+       successful = results.select { |r| r[:success] }
+       failed = results.reject { |r| r[:success] }
+
+       puts " #{Colors.success("Completed: #{successful.length} successful")}, #{Colors.error("#{failed.length} failed")}"
+
+       if successful.any?
+         avg_tps = successful.map { |r| r[:tokens_per_second] }.sum / successful.length
+         puts " #{Colors.metric("Average tokens/sec: #{avg_tps.round(2)}")}"
+       end
+
+       puts " #{Colors.error("Failed: #{failed.map { |f| "#{f[:provider]}/#{f[:model]}" }.join(", ")}")}" if failed.any?
+
+       display_individual_results(results) if results.any?
+     end
+
+     private
+
+     attr_reader :print_result
+
+     def calculate_column_width(results, column)
+       results.map { |r| r[column].length }.max
+     end
+
+     def build_table_header(provider_width:, model_width:, tokens_width:, tps_width:)
+       if print_result
+         header = "| #{"Provider".ljust(provider_width)} | #{"Model".ljust(model_width)} | #{"Total Tokens".rjust(tokens_width)} | #{"Tokens/sec".rjust(tps_width)} | Message Content"
+         separator = "| #{"-" * provider_width} | #{"-" * model_width} | #{"-" * tokens_width} | #{"-" * tps_width} | #{"-" * 80}"
+       else
+         header = "| #{"Provider".ljust(provider_width)} | #{"Model".ljust(model_width)} | #{"Total Tokens".rjust(tokens_width)} | #{"Tokens/sec".rjust(tps_width)} |"
+         separator = "| #{"-" * provider_width} | #{"-" * model_width} | #{"-" * tokens_width} | #{"-" * tps_width} |"
+       end
+       [header, separator]
+     end
+
+     def display_table_rows(results, provider_width:, model_width:, tokens_width:, tps_width:)
+       results.each do |result|
+         provider_col = result[:provider].ljust(provider_width)
+         model_col = result[:model].ljust(model_width)
+
+         if result[:success]
+           display_successful_row(result, provider_col:, model_col:, tokens_width:, tps_width:)
+         else
+           display_failed_row(result, provider_col:, model_col:, tokens_width:, tps_width:)
+         end
+       end
+     end
+
+     def display_successful_row(result, provider_col:, model_col:, tokens_width:, tps_width:)
+       tokens_col = result[:total_tokens].to_s.rjust(tokens_width)
+       tps_col = result[:tokens_per_second].to_s.rjust(tps_width)
+
+       if print_result
+         message_content = result[:message_content][0..79]
+         puts "| #{Colors.success(provider_col)} | #{Colors.success(model_col)} | #{Colors.metric(tokens_col)} | #{Colors.success(tps_col)} | #{Colors.border(message_content)}"
+       else
+         puts "| #{Colors.success(provider_col)} | #{Colors.success(model_col)} | #{Colors.metric(tokens_col)} | #{Colors.success(tps_col)} |"
+       end
+     end
+
+     def display_failed_row(result, provider_col:, model_col:, tokens_width:, tps_width:)
+       tokens_col = Colors.error("ERROR".rjust(tokens_width))
+       tps_col = Colors.error("FAILED".rjust(tps_width))
+
+       if print_result
+         puts "| #{Colors.error(provider_col)} | #{Colors.error(model_col)} | #{tokens_col} | #{tps_col} | #{Colors.border(result[:error][0..79])}"
+       else
+         puts "| #{Colors.error(provider_col)} | #{Colors.error(model_col)} | #{tokens_col} | #{tps_col} |"
+       end
+     end
+
+     def display_performance_metrics(successful)
+       avg_tps = successful.map { |r| r[:tokens_per_second] }.sum / successful.length
+       fastest = successful.max_by { |r| r[:tokens_per_second] }
+       slowest = successful.min_by { |r| r[:tokens_per_second] }
+
+       puts Colors.metric("Average tokens/sec: #{avg_tps.round(2)}")
+       puts Colors.success("Fastest: #{fastest[:provider]}/#{fastest[:model]} (#{fastest[:tokens_per_second]} tokens/sec)")
+       puts Colors.warning("Slowest: #{slowest[:provider]}/#{slowest[:model]} (#{slowest[:tokens_per_second]} tokens/sec)")
+     end
+
+     def display_failed_benchmarks(failed)
+       puts "\n#{Colors.error("Failed benchmarks:")}"
+       failed.each do |result|
+         puts " #{Colors.error("#{result[:provider]}/#{result[:model]}")}: #{Colors.warning(result[:error])}"
+       end
+     end
+
+     def display_individual_results(results)
+       puts "\n #{Colors.header('=== Individual Model Results ===')}"
+
+       sorted_results = results.sort_by { |r| -r[:tokens_per_second] }
+
+       provider_width = calculate_column_width(sorted_results, :provider)
+       model_width = calculate_column_width(sorted_results, :model)
+       tokens_width = 12
+       tps_width = 15
+       duration_width = 12
+
+       header = " | #{"Provider".ljust(provider_width)} | #{"Model".ljust(model_width)} | " \
+                "#{"Tokens/sec".rjust(tps_width)} | #{"Total Tokens".rjust(tokens_width)} | " \
+                "#{"Duration".rjust(duration_width)} |"
+       separator = " | #{"-" * provider_width} | #{"-" * model_width} | " \
+                   "#{"-" * tps_width} | #{"-" * tokens_width} | " \
+                   "#{"-" * duration_width} |"
+
+       puts Colors.header(header)
+       puts Colors.border(separator)
+
+       sorted_results.each do |result|
+         provider_col = result[:provider].ljust(provider_width)
+         model_col = result[:model].ljust(model_width)
+
+         if result[:success]
+           tps_col = Colors.success(result[:tokens_per_second].to_s.rjust(tps_width))
+           tokens_col = Colors.metric(result[:total_tokens].to_s.rjust(tokens_width))
+           duration_col = Colors.info("#{result[:duration]}s".rjust(duration_width))
+         else
+           tps_col = Colors.error("FAILED".rjust(tps_width))
+           tokens_col = Colors.error("ERROR".rjust(tokens_width))
+           duration_col = Colors.warning("N/A".rjust(duration_width))
+         end
+         puts " | #{Colors.info(provider_col)} | #{Colors.info(model_col)} | #{tps_col} | #{tokens_col} | #{duration_col} |"
+       end
+     end
+   end
+ end
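`ResultsFormatter` only needs an array of result hashes, so it can be exercised on its own. A hedged example: the hash keys are inferred from what the formatter reads (:provider, :model, :tokens_per_second, :total_tokens, :duration, :success, :error), and the values are invented for illustration.

```ruby
require "llm_bench"

results = [
  { provider: "openai", model: "gpt-4o-mini", tokens_per_second: 92.3,
    total_tokens: 512, duration: 5.5, success: true },
  { provider: "anthropic", model: "claude-3-haiku", tokens_per_second: 0,
    total_tokens: 0, duration: 0, success: false, error: "timeout" }
]

formatter = LLMBench::ResultsFormatter.new(print_result: false)
formatter.display_results_table(results)  # sorted by tokens/sec, failed rows shown as ERROR/FAILED
formatter.display_summary(results)        # totals, averages, fastest/slowest, failure list
```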
data/lib/llm_bench/tracker.rb
CHANGED
@@ -1,136 +1,94 @@
+ # frozen_string_literal: true
+
+ require_relative "colors"
+
  module LLMBench
    class Tracker
+     def initialize(config_manager:, interval: 600, output_file: nil)
+       @config_manager = config_manager
+       @config = config_manager.config
+       @csv_file = output_file || "llm_benchmark_results_#{Time.now.strftime("%Y%m%d_%H%M%S")}.csv"
+       @running = true
+       @next_run_time = Time.now
+       @interval = interval
+       @results_formatter = ResultsFormatter.new(print_result: false)
+       setup_signal_handlers
+     end

+     def start_tracking
+       puts Colors.header("=== LLM Performance Tracker ===")
+       puts Colors.info("Tracking all models every #{interval} seconds")
+       puts Colors.info("Results will be saved to: #{csv_file}")
+       puts Colors.highlight("Press Ctrl+C to stop tracking")
+       puts

+       initialize_csv

+       run_tracking_cycle

+       while running
+         time_until_next_run = next_run_time - Time.now

+         if time_until_next_run.positive?
+           sleep_time = [time_until_next_run, 1.0].min
+           sleep(sleep_time)
+         else
+           run_tracking_cycle
+           @next_run_time = Time.now + interval
+         end
        end
-       end
-
-       puts "\nTracking stopped by user"
-       puts "Results saved to: #{@csv_file}"
-     end
-
-     private
-
-     def setup_signal_handlers
-       Signal.trap('INT') do
-         @running = false
-         puts "\nStopping tracking..."
-       end
-
-       Signal.trap('TERM') do
-         @running = false
-         puts "\nStopping tracking..."
-       end
-     end

-         file.write("timestamp,provider_model,tokens_per_second,total_tokens,duration_seconds\n")
+       puts "\n#{Colors.warning('Tracking stopped by user')}"
+       puts Colors.info("Results saved to: #{csv_file}")
      end
-     end
-
-     def run_tracking_cycle
-       timestamp = Time.now
-       puts "[#{timestamp.strftime('%Y-%m-%d %H:%M:%S')}] Running benchmark cycle..."

-       results = parallel_benchmark.run_silent
+     private

-       display_cycle_summary(results)
-     end
+     attr_reader :csv_file, :running, :next_run_time, :config, :config_manager, :results_formatter, :interval

-           provider_model = "#{result[:provider]}+#{result[:model]}"
-           csv_line = [
-             timestamp.strftime('%Y-%m-%d %H:%M:%S'),
-             provider_model,
-             result[:tokens_per_second],
-             result[:total_tokens],
-             result[:duration]
-           ].join(',') + "\n"
-           file.write(csv_line)
+     def setup_signal_handlers
+       Signal.trap("INT") do
+         puts "\n#{Colors.warning('Received interrupt signal, exiting immediately...')}"
+         exit 0
        end
-         end
-       end
-
-     def display_cycle_summary(results)
-       successful = results.select { |r| r[:success] }
-       failed = results.select { |r| !r[:success] }
-
-       puts " Completed: #{successful.length} successful, #{failed.length} failed"

+       Signal.trap("TERM") do
+         puts "\n#{Colors.warning('Received termination signal, exiting immediately...')}"
+         exit 0
+       end
      end

+     def initialize_csv
+       File.write(csv_file, "timestamp,provider_model,tokens_per_second,total_tokens,duration_seconds\n")
      end

-       provider_width = sorted_results.map { |r| r[:provider].length }.max
-       model_width = sorted_results.map { |r| r[:model].length }.max
-       tokens_width = 12
-       tps_width = 15
-       duration_width = 12
+     def run_tracking_cycle
+       timestamp = Time.now
+       puts "#{Colors.border("[#{timestamp.strftime('%Y-%m-%d %H:%M:%S')}]")} #{Colors.highlight('Running benchmark cycle...')}"

+       parallel_benchmark = ParallelBenchmark.new(config_manager:, print_result: false)
+       results = parallel_benchmark.run_silent

-       sorted_results.each do |result|
-         provider_col = result[:provider].ljust(provider_width)
-         model_col = result[:model].ljust(model_width)
+       write_results_to_csv(timestamp:, results:)
+       results_formatter.display_cycle_summary(results)
+     end

+     def write_results_to_csv(timestamp:, results:)
+       File.open(csv_file, "a") do |file|
+         results.each do |result|
+           next unless result[:success]
+
+           provider_model = "#{result[:provider]}: #{result[:model]}"
+           csv_line = [
+             timestamp.strftime("%Y-%m-%d %H:%M:%S"),
+             provider_model,
+             result[:tokens_per_second],
+             result[:total_tokens],
+             result[:duration]
+           ].join(",") << "\n"
+           file.write(csv_line)
+         end
        end
      end
    end
-
- end
+ end
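The tracker now rebuilds a `ParallelBenchmark` for every cycle and appends only successful results to the CSV, whose header row is `timestamp,provider_model,tokens_per_second,total_tokens,duration_seconds`. A hedged sketch of starting it, again assuming `ConfigurationManager.new` takes no arguments (not shown in this diff):

```ruby
require "llm_bench"

config_manager = LLMBench::ConfigurationManager.new  # assumed default constructor
tracker = LLMBench::Tracker.new(
  config_manager: config_manager,
  interval: 300,               # seconds between benchmark cycles (default is 600)
  output_file: "results.csv"   # defaults to a timestamped llm_benchmark_results_*.csv
)
tracker.start_tracking         # runs until interrupted with Ctrl+C
```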
data/lib/llm_bench/version.rb
CHANGED
data/lib/llm_bench.rb
CHANGED
@@ -1,9 +1,13 @@
+ # frozen_string_literal: true
+
  require_relative "llm_bench/version"
+ require_relative "llm_bench/configuration_manager"
+ require_relative "llm_bench/results_formatter"
+ require_relative "llm_bench/benchmark_factory"
  require_relative "llm_bench/benchmark"
  require_relative "llm_bench/parallel_benchmark"
  require_relative "llm_bench/tracker"

  module LLMBench
    class Error < StandardError; end
-
- end
+ end
data/llm_bench.gemspec
CHANGED
@@ -1,3 +1,5 @@
+ # frozen_string_literal: true
+
  lib = File.expand_path('lib', __dir__)
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
  require 'llm_bench/version'
@@ -9,14 +11,20 @@ Gem::Specification.new do |spec|
    spec.email = []

    spec.summary = "A tool for benchmarking LLM performance across different providers and models"
-   spec.description =
+   spec.description = <<~DESC
+     LLM Bench is a Ruby gem that allows you to benchmark and compare the performance
+     of different Large Language Model providers and APIs. It supports both OpenAI and
+     Anthropic-compatible API formats, provides parallel execution, and includes
+     continuous tracking capabilities with CSV export.
+   DESC
    spec.homepage = "https://github.com/vitobotta/llm-bench"
    spec.license = "MIT"
-   spec.required_ruby_version = ">= 2
+   spec.required_ruby_version = ">= 3.2"

    spec.metadata["homepage_uri"] = spec.homepage
    spec.metadata["source_code_uri"] = spec.homepage
    spec.metadata["changelog_uri"] = "#{spec.homepage}/blob/main/CHANGELOG.md"
+   spec.metadata['rubygems_mfa_required'] = 'true'

    spec.files = Dir.chdir(__dir__) do
      `git ls-files -z`.split("\x0").reject { |f| f.match(%r{\A(?:test|spec|features)/}) }
@@ -26,5 +34,6 @@ Gem::Specification.new do |spec|
    spec.executables = ["llm_bench"]
    spec.require_paths = ["lib"]

-   #
+   # Color support for enhanced output
+   spec.add_dependency "colorize", "~> 1.1"
  end
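One likely reason the Ruby floor jumps from the old `>= 2.x` line to `>= 3.2`: the refactored classes lean on the keyword-argument value-omission shorthand introduced in Ruby 3.1 (`config_manager:` instead of `config_manager: config_manager`). A small illustration; the variable names are just examples.

```ruby
config_manager = LLMBench::ConfigurationManager.new  # assumed no-arg constructor, as above
print_result   = true

# On Ruby >= 3.1 these two calls are equivalent; the gem's new code uses the short form.
LLMBench::ParallelBenchmark.new(config_manager:, print_result:)
LLMBench::ParallelBenchmark.new(config_manager: config_manager, print_result: print_result)
```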