llm_bench 0.1.0

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA256:
+   metadata.gz: d90ed99b03730fd89c2fd93d62ec728c4b474f9cc6fefc4b4030f635fdf6effd
+   data.tar.gz: '054129a5c38f180e2bd46ba6372bc61474d41b8e5a74f9e2e21aa335ab35278a'
+ SHA512:
+   metadata.gz: eabdee5f9298517c6b9617fbf6ccd346e49a64021a81f7d35150439262a5587fcfed197c6710049402d2e2a7908fbfbf5005cb71d54bf73e6466be84e540be19
+   data.tar.gz: 6631ef5c989762cdbe86baf86e29682baaf78ddc9b005db1bc9f4e4c37a8c07bffa5864b590f0340323b88d50dd885b3adcca2b5e130084d6bf252d86fc3b95e
data/.claude/settings.local.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "permissions": {
+     "allow": [
+       "Bash(gem build:*)",
+       "Bash(gem install:*)"
+     ],
+     "deny": [],
+     "ask": []
+   }
+ }
data/.gitignore ADDED
@@ -0,0 +1,57 @@
+ # Configuration files with API keys
+ models.yaml
+
+ # Benchmark results CSV files
+ *.csv
+
+ # Ruby cache files
+ *.rbc
+ *.gem
+ *.rbo
+
+ # Ruby bundler
+ Gemfile.lock
+ .bundle
+ /.bundle
+
+ # Ruby temporary files
+ *.tmp
+ *.temp
+
+ # IDE files
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+ *~
+
+ # OS generated files
+ .DS_Store
+ .DS_Store?
+ ._*
+ .Spotlight-V100
+ .Trashes
+ ehthumbs.db
+ Thumbs.db
+
+ # Logs
+ *.log
+
+ # Coverage reports
+ coverage/
+
+ # Documentation
+ /doc/
+ /.yardoc/
+ /_yardoc/
+ /.rdoc/
+ /rdoc/
+
+ # Environment files
+ .env
+ .env.*
+ !.env.example
+
+ # Temporary test files
+ test_*.rb
+ verify_*.rb
data/README.md ADDED
@@ -0,0 +1,104 @@
+ # LLMBench
+
+ A Ruby gem for benchmarking and comparing the performance of different Large Language Model providers and APIs.
+
+ ## Features
+
+ - Support for both OpenAI and Anthropic-compatible API formats
+ - Parallel execution across multiple models and providers
+ - Continuous tracking with CSV export functionality
+ - Clean, modular architecture with proper gem structure
+ - No external dependencies - uses only Ruby standard library
+
+ ## Installation
+
+ Add this line to your application's Gemfile:
+
+ ```ruby
+ gem 'llm_bench'
+ ```
+
+ And then execute:
+ ```bash
+ bundle install
+ ```
+
+ Or install it yourself as:
+ ```bash
+ gem install llm_bench
+ ```
+
+ ## Usage
+
+ ### Configuration
+
+ Create a configuration file named `models.yaml` in your current directory, or specify a custom path with the `--config` argument:
+
+ ```yaml
+ prompt: "Explain the concept of machine learning in simple terms in exactly 300 words..."
+
+ providers:
+   - name: "openai"
+     base_url: "https://api.openai.com/v1"
+     api_key: "your-api-key-here"
+     models:
+       - nickname: "gpt-4"
+         id: "gpt-4"
+         api_format: "openai"
+
+   - name: "anthropic"
+     base_url: "https://api.anthropic.com"
+     api_key: "your-api-key-here"
+     models:
+       - nickname: "claude"
+         id: "claude-3-sonnet-20240229"
+         api_format: "anthropic"
+ ```
+
+ ### Commands
+
+ #### Benchmark a single model:
+ ```bash
+ llm_bench --config ./my-config.yaml --provider openai --model gpt-4
+ ```
+
+ #### Benchmark all configured models:
+ ```bash
+ llm_bench --all
+ ```
+
+ #### Benchmark all models with custom config:
+ ```bash
+ llm_bench --config ./my-config.yaml --all
+ ```
+
+ #### Enable continuous tracking:
+ ```bash
+ llm_bench --config ./my-config.yaml --all --track
+ ```
+
+ #### Print full responses:
+ ```bash
+ llm_bench --config ./my-config.yaml --provider openai --model gpt-4 --print-result
+ ```
+
+ **Note**: If no `--config` argument is provided, `llm_bench` will look for `models.yaml` in the current directory. If the configuration file is not found, an error will be displayed.
+
+ ## Development
+
+ After checking out the repo, run `bin/setup` to install dependencies. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
+
+ To build and install the gem locally:
+
+ ```bash
+ gem build llm_bench.gemspec
+ gem install ./llm_bench-0.1.0.gem
+ ```
+
+ ## Contributing
+
+ Bug reports and pull requests are welcome on GitHub at https://github.com/vito/llm-bench.
+
+ ## License
+
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
data/Rakefile ADDED
@@ -0,0 +1,2 @@
+ require "bundler/gem_tasks"
+ task :default => :spec
data/exe/llm_bench ADDED
@@ -0,0 +1,106 @@
+ #!/usr/bin/env ruby
+
+ # Add the lib directory to the load path when running from source
+ if __FILE__ == $PROGRAM_NAME
+   lib_path = File.expand_path('../../lib', __FILE__)
+   $LOAD_PATH.unshift(lib_path) if File.directory?(lib_path)
+ end
+
+ begin
+   require 'llm_bench'
+ rescue LoadError
+   # If we can't load the gem, try to load from source
+   require_relative '../lib/llm_bench'
+ end
+
+ require 'yaml'
+ require 'optparse'
+
+ def parse_arguments
+   options = {}
+   OptionParser.new do |opts|
+     opts.banner = "Usage: llm_bench --config CONFIG --provider PROVIDER --model NICKNAME [--print-result]"
+     opts.banner += "\n llm_bench --config CONFIG --all [--track] [--print-result]"
+
+     opts.on('--config CONFIG', 'Path to configuration file (default: models.yaml)') do |config|
+       options[:config] = config
+     end
+
+     opts.on('--provider PROVIDER', 'Provider name from config file') do |provider|
+       options[:provider] = provider
+     end
+
+     opts.on('--model NICKNAME', 'Model nickname from config file') do |model|
+       options[:model] = model
+     end
+
+     opts.on('--all', 'Run benchmark on all configured models') do
+       options[:all] = true
+     end
+
+     opts.on('--track', 'Enable continuous tracking with CSV output (requires --all)') do
+       options[:track] = true
+     end
+
+     opts.on('--print-result', 'Print the full message returned by each LLM') do
+       options[:print_result] = true
+     end
+
+     opts.on('--help', 'Display help') do
+       puts opts
+       exit
+     end
+   end.parse!
+
+   if options[:track] && !options[:all]
+     puts "Error: --track requires --all"
+     puts "Use --help for usage information"
+     exit 1
+   end
+
+   if options[:all]
+     options
+   elsif options[:provider] && options[:model]
+     options
+   else
+     puts "Error: Either --provider and --model, or --all is required"
+     puts "Use --help for usage information"
+     exit 1
+   end
+
+   options
+ end
+
+ def main
+   options = parse_arguments
+
+   # Determine config file path
+   config_path = options[:config] || './models.yaml'
+
+   # Validate config file exists
+   unless File.exist?(config_path)
+     puts "Error: Configuration file not found at #{config_path}"
+     exit 1
+   end
+
+   # Load configuration
+   config = YAML.load_file(config_path)
+
+   if options[:all]
+     if options[:track]
+       tracker = LLMBench::Tracker.new(config)
+       tracker.start_tracking
+     else
+       parallel_benchmark = LLMBench::ParallelBenchmark.new(config, options[:print_result])
+       parallel_benchmark.run_all
+     end
+   else
+     benchmark = LLMBench::Benchmark.new(options[:provider], options[:model], options[:print_result], config)
+     benchmark.run_benchmark
+   end
+ rescue StandardError => e
+   puts "Error: #{e.message}"
+   exit 1
+ end
+
+ main
data/lib/llm_bench/benchmark.rb ADDED
@@ -0,0 +1,225 @@
+ require 'yaml'
+ require 'json'
+ require 'net/http'
+ require 'uri'
+ require 'time'
+
+ module LLMBench
+   class Benchmark
+     attr_reader :config, :provider, :model, :start_time, :end_time
+
+     def initialize(provider_name, model_nickname, print_result = false, config = nil)
+       @provider_name = provider_name
+       @model_nickname = model_nickname
+       @print_result = print_result
+       @config = config || load_config
+       validate_provider_and_model!
+     end
+
+     def load_config
+       config_path = File.join(__dir__, '..', 'models.yaml')
+       unless File.exist?(config_path)
+         raise "Configuration file models.yaml not found"
+       end
+
+       YAML.load_file(config_path)
+     end
+
+     def validate_provider_and_model!
+       provider_config = @config['providers'].find { |p| p['name'] == @provider_name }
+       unless provider_config
+         raise "Provider '#{@provider_name}' not found in configuration"
+       end
+
+       model_config = provider_config['models'].find { |m| m['nickname'] == @model_nickname }
+       unless model_config
+         raise "Model '#{@model_nickname}' not found for provider '#{@provider_name}'"
+       end
+
+       model_config['api_format'] ||= 'openai'
+
+       unless ['openai', 'anthropic'].include?(model_config['api_format'])
+         raise "Invalid API format '#{model_config['api_format']}' for model '#{@model_nickname}'. Must be 'openai' or 'anthropic'"
+       end
+
+       @provider = provider_config
+       @model = model_config
+     end
+
+     def run_benchmark
+       puts "=== LLM Benchmark ==="
+       puts "Provider: #{@provider_name}"
+       puts "Model: #{@model_nickname} (#{@model['id']})"
+       puts "Starting benchmark..."
+
+       @start_time = Time.now
+       puts "Start time: #{@start_time.strftime('%Y-%m-%d %H:%M:%S.%3N')}"
+
+       response = make_api_call
+
+       @end_time = Time.now
+       puts "End time: #{@end_time.strftime('%Y-%m-%d %H:%M:%S.%3N')}"
+
+       calculate_and_display_metrics(response)
+     end
+
+     def anthropic_format?
+       @model['api_format'] == 'anthropic'
+     end
+
+     def api_endpoint
+       anthropic_format? ? "#{@provider['base_url']}/v1/messages" : "#{@provider['base_url']}/chat/completions"
+     end
+
+     def build_request_headers
+       headers = { 'Content-Type' => 'application/json' }
+       if anthropic_format?
+         headers['x-api-key'] = @provider['api_key']
+         headers['anthropic-version'] = '2023-06-01'
+       else
+         headers['Authorization'] = "Bearer #{@provider['api_key']}"
+       end
+       headers
+     end
+
+     def build_request_body
+       base_body = {
+         model: @model['id'],
+         messages: [{ role: 'user', content: @config['prompt'] }]
+       }
+
+       if anthropic_format?
+         base_body.merge(max_tokens: 1000)
+       else
+         base_body.merge(max_tokens: 1000, temperature: 0.7)
+       end
+     end
+
+     def extract_response_content(response)
+       if anthropic_format?
+         extract_anthropic_content(response)
+       else
+         response.dig('choices', 0, 'message', 'content') || ''
+       end
+     end
+
+     def extract_token_counts(response, message_content)
+       if anthropic_format?
+         input_tokens = response.dig('usage', 'input_tokens') || estimate_tokens(@config['prompt'])
+         output_tokens = response.dig('usage', 'output_tokens') || estimate_tokens(message_content)
+       else
+         input_tokens = response.dig('usage', 'prompt_tokens') || estimate_tokens(@config['prompt'])
+         output_tokens = response.dig('usage', 'completion_tokens') || estimate_tokens(message_content)
+       end
+       [input_tokens, output_tokens]
+     end
+
+     def make_api_call
+       uri = URI.parse(api_endpoint)
+       request = Net::HTTP::Post.new(uri)
+       request['Content-Type'] = 'application/json'
+
+       build_request_headers.each { |key, value| request[key] = value }
+       request.body = build_request_body.to_json
+
+       http = Net::HTTP.new(uri.host, uri.port)
+       http.use_ssl = uri.scheme == 'https'
+
+       response = http.request(request)
+
+       handle_api_error(response) unless response.is_a?(Net::HTTPSuccess)
+
+       JSON.parse(response.body)
+     end
+
+     def handle_api_error(response)
+       error_response = JSON.parse(response.body)
+       error_msg = error_response['msg'] || error_response['message'] ||
+                   error_response.dig('error', 'message') || response.message
+       raise "API request failed: #{response.code} - #{error_msg}"
+     rescue JSON::ParserError
+       raise "API request failed: #{response.code} #{response.message}"
+     end
+
+     def calculate_metrics(response)
+       duration = @end_time - @start_time
+       message_content = extract_response_content(response)
+       input_tokens, output_tokens = extract_token_counts(response, message_content)
+
+       total_tokens = input_tokens + output_tokens
+       tokens_per_second = total_tokens / duration if duration.positive?
+
+       {
+         duration: duration,
+         input_tokens: input_tokens,
+         output_tokens: output_tokens,
+         total_tokens: total_tokens,
+         tokens_per_second: tokens_per_second,
+         message_content: message_content
+       }
+     end
+
+     def calculate_and_display_metrics(response)
+       metrics = calculate_metrics(response)
+
+       puts "\n=== Results ==="
+       puts "Duration: #{metrics[:duration].round(3)} seconds"
+       puts "Input tokens: #{metrics[:input_tokens]}"
+       puts "Output tokens: #{metrics[:output_tokens]}"
+       puts "Total tokens: #{metrics[:total_tokens]}"
+       puts "Tokens per second: #{metrics[:tokens_per_second].round(2)}"
+
+       puts "\n=== Message Content ==="
+       puts metrics[:message_content] if @print_result
+     end
+
+     def extract_anthropic_content(response)
+       if response.key?('code') && response.key?('msg') && response.key?('success')
+         return "Error: #{response['msg']}"
+       end
+
+       content_blocks = response.dig('content')
+
+       if content_blocks.is_a?(Array) && !content_blocks.empty?
+         text_block = content_blocks.find { |block| block.is_a?(Hash) && block['type'] == 'text' }
+         text_block ? text_block['text'] : nil
+       elsif response.dig('content', 0, 'text')
+         response.dig('content', 0, 'text')
+       else
+         nil
+       end
+     end
+
+     def estimate_tokens(text)
+       (text.length / 4.0).round
+     end
+
+     def run_benchmark_for_results
+       @start_time = Time.now
+       response = make_api_call
+       @end_time = Time.now
+
+       metrics = calculate_metrics(response)
+       {
+         provider: @provider_name,
+         model: @model_nickname,
+         total_tokens: metrics[:total_tokens],
+         tokens_per_second: metrics[:tokens_per_second].round(2),
+         duration: metrics[:duration].round(3),
+         success: true,
+         message_content: metrics[:message_content]
+       }
+     rescue StandardError => e
+       {
+         provider: @provider_name,
+         model: @model_nickname,
+         total_tokens: 0,
+         tokens_per_second: 0,
+         duration: 0,
+         success: false,
+         error: e.message,
+         message_content: ''
+       }
+     end
+   end
+ end
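
For orientation, a minimal sketch (not part of the gem's files) of driving `LLMBench::Benchmark` programmatically, using the constructor and `run_benchmark_for_results` shown above; the `"openai"` provider and `"gpt-4"` nickname are assumed to exist in whatever `models.yaml` you load:

```ruby
# Illustrative only -- not shipped with the gem. Assumes a models.yaml in the
# current directory whose providers include "openai" with a model nicknamed
# "gpt-4" (substitute names from your own config).
require "llm_bench"
require "yaml"

config = YAML.load_file("models.yaml")
bench  = LLMBench::Benchmark.new("openai", "gpt-4", false, config)
result = bench.run_benchmark_for_results # returns a Hash; StandardError is captured in :error

if result[:success]
  puts "#{result[:provider]}/#{result[:model]}: #{result[:tokens_per_second]} tokens/sec in #{result[:duration]}s"
else
  puts "Benchmark failed: #{result[:error]}"
end
```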
data/lib/llm_bench/parallel_benchmark.rb ADDED
@@ -0,0 +1,130 @@
+ module LLMBench
+   class ParallelBenchmark
+     def initialize(config, print_result = false)
+       @config = config
+       @print_result = print_result
+     end
+
+     def run_all
+       puts "=== LLM Benchmark ==="
+       puts "Running benchmarks on all configured models..."
+       puts "Starting at #{Time.now.strftime('%Y-%m-%d %H:%M:%S.%3N')}"
+       puts
+
+       benchmarks = create_benchmarks
+       results = run_parallel(benchmarks)
+
+       display_results_table(results)
+       display_summary(results)
+     end
+
+     def run_silent
+       benchmarks = create_benchmarks
+       run_parallel(benchmarks)
+     end
+
+     private
+
+     def create_benchmarks
+       benchmarks = []
+
+       @config['providers'].each do |provider|
+         provider['models'].each do |model|
+           benchmarks << Benchmark.new(provider['name'], model['nickname'], @print_result, @config)
+         end
+       end
+
+       benchmarks
+     end
+
+     def run_parallel(benchmarks)
+       results = []
+       mutex = Mutex.new
+
+       threads = benchmarks.map do |benchmark|
+         Thread.new do
+           result = benchmark.run_benchmark_for_results
+           mutex.synchronize { results << result }
+         end
+       end
+
+       threads.each(&:join)
+       results
+     end
+
+     def display_results_table(results)
+       sorted_results = results.sort_by { |r| -r[:tokens_per_second] }
+
+       provider_width = sorted_results.map { |r| r[:provider].length }.max
+       model_width = sorted_results.map { |r| r[:model].length }.max
+       tokens_width = 12
+       tps_width = 15
+
+       if @print_result
+         header = "| #{"Provider".ljust(provider_width)} | #{"Model".ljust(model_width)} | #{"Total Tokens".rjust(tokens_width)} | #{"Tokens/sec".rjust(tps_width)} | Message Content"
+         separator = "| #{'-' * provider_width} | #{'-' * model_width} | #{'-' * tokens_width} | #{'-' * tps_width} | #{'-' * 80}"
+       else
+         header = "| #{"Provider".ljust(provider_width)} | #{"Model".ljust(model_width)} | #{"Total Tokens".rjust(tokens_width)} | #{"Tokens/sec".rjust(tps_width)} |"
+         separator = "| #{'-' * provider_width} | #{'-' * model_width} | #{'-' * tokens_width} | #{'-' * tps_width} |"
+       end
+
+       puts header
+       puts separator
+
+       sorted_results.each do |result|
+         provider_col = result[:provider].ljust(provider_width)
+         model_col = result[:model].ljust(model_width)
+
+         if result[:success]
+           tokens_col = result[:total_tokens].to_s.rjust(tokens_width)
+           tps_col = result[:tokens_per_second].to_s.rjust(tps_width)
+
+           if @print_result
+             message_content = result[:message_content][0..79]
+             puts "| #{provider_col} | #{model_col} | #{tokens_col} | #{tps_col} | #{message_content}"
+           else
+             puts "| #{provider_col} | #{model_col} | #{tokens_col} | #{tps_col} |"
+           end
+         else
+           tokens_col = "ERROR".rjust(tokens_width)
+           tps_col = "FAILED".rjust(tps_width)
+
+           if @print_result
+             puts "| #{provider_col} | #{model_col} | #{tokens_col} | #{tps_col} | #{result[:error][0..79]}"
+           else
+             puts "| #{provider_col} | #{model_col} | #{tokens_col} | #{tps_col} |"
+           end
+         end
+       end
+
+       puts
+     end
+
+     def display_summary(results)
+       successful = results.select { |r| r[:success] }
+       failed = results.select { |r| !r[:success] }
+
+       puts "=== Summary ==="
+       puts "Total benchmarks: #{results.length}"
+       puts "Successful: #{successful.length}"
+       puts "Failed: #{failed.length}"
+
+       if successful.any?
+         avg_tps = successful.map { |r| r[:tokens_per_second] }.sum / successful.length
+         fastest = successful.max_by { |r| r[:tokens_per_second] }
+         slowest = successful.min_by { |r| r[:tokens_per_second] }
+
+         puts "Average tokens/sec: #{avg_tps.round(2)}"
+         puts "Fastest: #{fastest[:provider]}/#{fastest[:model]} (#{fastest[:tokens_per_second]} tokens/sec)"
+         puts "Slowest: #{slowest[:provider]}/#{slowest[:model]} (#{slowest[:tokens_per_second]} tokens/sec)"
+       end
+
+       return unless failed.any?
+
+       puts "\nFailed benchmarks:"
+       failed.each do |result|
+         puts " #{result[:provider]}/#{result[:model]}: #{result[:error]}"
+       end
+     end
+   end
+ end
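
Similarly, a hypothetical sketch (not part of the gem) of collecting results for every configured model without the printed table, via `run_silent`, which returns the same per-model result hashes that the table and summary consume:

```ruby
# Illustrative only -- not shipped with the gem. Assumes a models.yaml in the current directory.
require "llm_bench"
require "yaml"

config  = YAML.load_file("models.yaml")
results = LLMBench::ParallelBenchmark.new(config).run_silent

# Sort fastest-first and print a one-line status per model.
results.sort_by { |r| -r[:tokens_per_second] }.each do |r|
  status = r[:success] ? "#{r[:tokens_per_second]} tokens/sec" : "FAILED (#{r[:error]})"
  puts "#{r[:provider]}/#{r[:model]}: #{status}"
end
```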
data/lib/llm_bench/tracker.rb ADDED
@@ -0,0 +1,136 @@
+ module LLMBench
+   class Tracker
+     def initialize(config)
+       @config = config
+       @csv_file = "llm_benchmark_results_#{Time.now.strftime('%Y%m%d_%H%M%S')}.csv"
+       @running = true
+       @next_run_time = Time.now
+       setup_signal_handlers
+     end
+
+     def start_tracking
+       puts "=== LLM Performance Tracker ==="
+       puts "Tracking all models every 60 seconds"
+       puts "Results will be saved to: #{@csv_file}"
+       puts "Press Ctrl+C to stop tracking"
+       puts
+
+       initialize_csv
+
+       run_tracking_cycle
+
+       while @running
+         time_until_next_run = @next_run_time - Time.now
+
+         if time_until_next_run.positive?
+           sleep_time = [time_until_next_run, 1.0].min
+           sleep(sleep_time)
+         else
+           run_tracking_cycle
+           @next_run_time = Time.now + 60
+         end
+       end
+
+       puts "\nTracking stopped by user"
+       puts "Results saved to: #{@csv_file}"
+     end
+
+     private
+
+     def setup_signal_handlers
+       Signal.trap('INT') do
+         @running = false
+         puts "\nStopping tracking..."
+       end
+
+       Signal.trap('TERM') do
+         @running = false
+         puts "\nStopping tracking..."
+       end
+     end
+
+     def initialize_csv
+       File.open(@csv_file, 'w') do |file|
+         file.write("timestamp,provider_model,tokens_per_second,total_tokens,duration_seconds\n")
+       end
+     end
+
+     def run_tracking_cycle
+       timestamp = Time.now
+       puts "[#{timestamp.strftime('%Y-%m-%d %H:%M:%S')}] Running benchmark cycle..."
+
+       parallel_benchmark = ParallelBenchmark.new(@config)
+       results = parallel_benchmark.run_silent
+
+       write_results_to_csv(timestamp, results)
+       display_cycle_summary(results)
+     end
+
+     def write_results_to_csv(timestamp, results)
+       File.open(@csv_file, 'a') do |file|
+         results.each do |result|
+           next unless result[:success]
+
+           provider_model = "#{result[:provider]}+#{result[:model]}"
+           csv_line = [
+             timestamp.strftime('%Y-%m-%d %H:%M:%S'),
+             provider_model,
+             result[:tokens_per_second],
+             result[:total_tokens],
+             result[:duration]
+           ].join(',') + "\n"
+           file.write(csv_line)
+         end
+       end
+     end
+
+     def display_cycle_summary(results)
+       successful = results.select { |r| r[:success] }
+       failed = results.select { |r| !r[:success] }
+
+       puts " Completed: #{successful.length} successful, #{failed.length} failed"
+
+       if successful.any?
+         avg_tps = successful.map { |r| r[:tokens_per_second] }.sum / successful.length
+         puts " Average tokens/sec: #{avg_tps.round(2)}"
+       end
+
+       if failed.any?
+         puts " Failed: #{failed.map { |f| "#{f[:provider]}/#{f[:model]}" }.join(', ')}"
+       end
+
+       puts "\n === Individual Model Results ==="
+
+       sorted_results = results.sort_by { |r| -r[:tokens_per_second] }
+
+       provider_width = sorted_results.map { |r| r[:provider].length }.max
+       model_width = sorted_results.map { |r| r[:model].length }.max
+       tokens_width = 12
+       tps_width = 15
+       duration_width = 12
+
+       header = " | #{"Provider".ljust(provider_width)} | #{"Model".ljust(model_width)} | #{"Tokens/sec".rjust(tps_width)} | #{"Total Tokens".rjust(tokens_width)} | #{"Duration".rjust(duration_width)} |"
+       separator = " | #{'-' * provider_width} | #{'-' * model_width} | #{'-' * tps_width} | #{'-' * tokens_width} | #{'-' * duration_width} |"
+
+       puts header
+       puts separator
+
+       sorted_results.each do |result|
+         provider_col = result[:provider].ljust(provider_width)
+         model_col = result[:model].ljust(model_width)
+
+         if result[:success]
+           tps_col = result[:tokens_per_second].to_s.rjust(tps_width)
+           tokens_col = result[:total_tokens].to_s.rjust(tokens_width)
+           duration_col = "#{result[:duration]}s".rjust(duration_width)
+           puts " | #{provider_col} | #{model_col} | #{tps_col} | #{tokens_col} | #{duration_col} |"
+         else
+           tps_col = "FAILED".rjust(tps_width)
+           tokens_col = "ERROR".rjust(tokens_width)
+           duration_col = "N/A".rjust(duration_width)
+           puts " | #{provider_col} | #{model_col} | #{tps_col} | #{tokens_col} | #{duration_col} |"
+         end
+       end
+     end
+   end
+ end
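
Finally, a short sketch (again not part of the gem) of starting the tracker from Ruby rather than through `llm_bench --all --track`; it benchmarks every configured model roughly once per minute and appends rows (timestamp, provider+model, tokens/sec, total tokens, duration) to a timestamped CSV until interrupted with Ctrl+C:

```ruby
# Illustrative only -- not shipped with the gem. Assumes a models.yaml in the current directory.
require "llm_bench"
require "yaml"

config = YAML.load_file("models.yaml")
LLMBench::Tracker.new(config).start_tracking # blocks until Ctrl+C (SIGINT) or SIGTERM
```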
data/lib/llm_bench/version.rb ADDED
@@ -0,0 +1,3 @@
+ module LLMBench
+   VERSION = "0.1.0"
+ end
data/lib/llm_bench.rb ADDED
@@ -0,0 +1,9 @@
+ require_relative "llm_bench/version"
+ require_relative "llm_bench/benchmark"
+ require_relative "llm_bench/parallel_benchmark"
+ require_relative "llm_bench/tracker"
+
+ module LLMBench
+   class Error < StandardError; end
+   # Your code goes here...
+ end
data/llm_bench.gemspec ADDED
@@ -0,0 +1,30 @@
+ lib = File.expand_path('lib', __dir__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'llm_bench/version'
+
+ Gem::Specification.new do |spec|
+   spec.name = "llm_bench"
+   spec.version = LLMBench::VERSION
+   spec.authors = ["Vito"]
+   spec.email = []
+
+   spec.summary = "A tool for benchmarking LLM performance across different providers and models"
+   spec.description = "LLM Bench is a Ruby gem that allows you to benchmark and compare the performance of different Large Language Model providers and APIs. It supports both OpenAI and Anthropic-compatible API formats, provides parallel execution, and includes continuous tracking capabilities with CSV export."
+   spec.homepage = "https://github.com/vitobotta/llm-bench"
+   spec.license = "MIT"
+   spec.required_ruby_version = ">= 2.7.0"
+
+   spec.metadata["homepage_uri"] = spec.homepage
+   spec.metadata["source_code_uri"] = spec.homepage
+   spec.metadata["changelog_uri"] = "#{spec.homepage}/blob/main/CHANGELOG.md"
+
+   spec.files = Dir.chdir(__dir__) do
+     `git ls-files -z`.split("\x0").reject { |f| f.match(%r{\A(?:test|spec|features)/}) }
+   end
+
+   spec.bindir = "exe"
+   spec.executables = ["llm_bench"]
+   spec.require_paths = ["lib"]
+
+   # Standard library dependencies - no external gems required
+ end
data/models.yaml.example ADDED
@@ -0,0 +1,20 @@
+ # LLM Benchmark Configuration
+ prompt: "Explain the concept of machine learning in simple terms in exactly 300 words or as close to it as possible, covering the main types of learning approaches and providing examples of real-world applications."
+
+ providers:
+   - name: "z-ai"
+     base_url: "https://api.z.ai/api/coding/paas/v4"
+     api_key: "YOUR_API_KEY_HERE"
+     models:
+       - nickname: "glm"
+         id: "glm-4.5"
+         # api_format can be 'openai' (default) or 'anthropic'
+         api_format: "openai"
+   - name: "z-ai-anthropic"
+     base_url: "https://api.z.ai/api/anthropic"
+     api_key: "YOUR_API_KEY_HERE"
+     models:
+       - nickname: "glm-anthropic"
+         id: "glm-4.5"
+         # api_format can be 'openai' (default) or 'anthropic'
+         api_format: "anthropic"
metadata ADDED
@@ -0,0 +1,58 @@
+ --- !ruby/object:Gem::Specification
+ name: llm_bench
+ version: !ruby/object:Gem::Version
+   version: 0.1.0
+ platform: ruby
+ authors:
+ - Vito
+ bindir: exe
+ cert_chain: []
+ date: 1980-01-02 00:00:00.000000000 Z
+ dependencies: []
+ description: LLM Bench is a Ruby gem that allows you to benchmark and compare the
+   performance of different Large Language Model providers and APIs. It supports both
+   OpenAI and Anthropic-compatible API formats, provides parallel execution, and includes
+   continuous tracking capabilities with CSV export.
+ email: []
+ executables:
+ - llm_bench
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - ".claude/settings.local.json"
+ - ".gitignore"
+ - README.md
+ - Rakefile
+ - exe/llm_bench
+ - lib/llm_bench.rb
+ - lib/llm_bench/benchmark.rb
+ - lib/llm_bench/parallel_benchmark.rb
+ - lib/llm_bench/tracker.rb
+ - lib/llm_bench/version.rb
+ - llm_bench.gemspec
+ - models.yaml.example
+ homepage: https://github.com/vitobotta/llm-bench
+ licenses:
+ - MIT
+ metadata:
+   homepage_uri: https://github.com/vitobotta/llm-bench
+   source_code_uri: https://github.com/vitobotta/llm-bench
+   changelog_uri: https://github.com/vitobotta/llm-bench/blob/main/CHANGELOG.md
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: 2.7.0
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubygems_version: 3.6.9
+ specification_version: 4
+ summary: A tool for benchmarking LLM performance across different providers and models
+ test_files: []