llm_bench 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.claude/settings.local.json +10 -0
- data/.gitignore +57 -0
- data/README.md +104 -0
- data/Rakefile +2 -0
- data/exe/llm_bench +106 -0
- data/lib/llm_bench/benchmark.rb +225 -0
- data/lib/llm_bench/parallel_benchmark.rb +130 -0
- data/lib/llm_bench/tracker.rb +136 -0
- data/lib/llm_bench/version.rb +3 -0
- data/lib/llm_bench.rb +9 -0
- data/llm_bench.gemspec +30 -0
- data/models.yaml.example +20 -0
- metadata +58 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: d90ed99b03730fd89c2fd93d62ec728c4b474f9cc6fefc4b4030f635fdf6effd
+  data.tar.gz: '054129a5c38f180e2bd46ba6372bc61474d41b8e5a74f9e2e21aa335ab35278a'
+SHA512:
+  metadata.gz: eabdee5f9298517c6b9617fbf6ccd346e49a64021a81f7d35150439262a5587fcfed197c6710049402d2e2a7908fbfbf5005cb71d54bf73e6466be84e540be19
+  data.tar.gz: 6631ef5c989762cdbe86baf86e29682baaf78ddc9b005db1bc9f4e4c37a8c07bffa5864b590f0340323b88d50dd885b3adcca2b5e130084d6bf252d86fc3b95e
data/.gitignore
ADDED
@@ -0,0 +1,57 @@
+# Configuration files with API keys
+models.yaml
+
+# Benchmark results CSV files
+*.csv
+
+# Ruby cache files
+*.rbc
+*.gem
+*.rbo
+
+# Ruby bundler
+Gemfile.lock
+.bundle
+/.bundle
+
+# Ruby temporary files
+*.tmp
+*.temp
+
+# IDE files
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+
+# OS generated files
+.DS_Store
+.DS_Store?
+._*
+.Spotlight-V100
+.Trashes
+ehthumbs.db
+Thumbs.db
+
+# Logs
+*.log
+
+# Coverage reports
+coverage/
+
+# Documentation
+/doc/
+/.yardoc/
+/_yardoc/
+/.rdoc/
+/rdoc/
+
+# Environment files
+.env
+.env.*
+!.env.example
+
+# Temporary test files
+test_*.rb
+verify_*.rb
data/README.md
ADDED
@@ -0,0 +1,104 @@
+# LLMBench
+
+A Ruby gem for benchmarking and comparing the performance of different Large Language Model providers and APIs.
+
+## Features
+
+- Support for both OpenAI and Anthropic-compatible API formats
+- Parallel execution across multiple models and providers
+- Continuous tracking with CSV export functionality
+- Clean, modular architecture with proper gem structure
+- No external dependencies - uses only Ruby standard library
+
+## Installation
+
+Add this line to your application's Gemfile:
+
+```ruby
+gem 'llm_bench'
+```
+
+And then execute:
+```bash
+bundle install
+```
+
+Or install it yourself as:
+```bash
+gem install llm_bench
+```
+
+## Usage
+
+### Configuration
+
+Create a configuration file named `models.yaml` in your current directory, or specify a custom path with the `--config` argument:
+
+```yaml
+prompt: "Explain the concept of machine learning in simple terms in exactly 300 words..."
+
+providers:
+  - name: "openai"
+    base_url: "https://api.openai.com/v1"
+    api_key: "your-api-key-here"
+    models:
+      - nickname: "gpt-4"
+        id: "gpt-4"
+        api_format: "openai"
+
+  - name: "anthropic"
+    base_url: "https://api.anthropic.com"
+    api_key: "your-api-key-here"
+    models:
+      - nickname: "claude"
+        id: "claude-3-sonnet-20240229"
+        api_format: "anthropic"
+```
+
+### Commands
+
+#### Benchmark a single model:
+```bash
+llm_bench --config ./my-config.yaml --provider openai --model gpt-4
+```
+
+#### Benchmark all configured models:
+```bash
+llm_bench --all
+```
+
+#### Benchmark all models with custom config:
+```bash
+llm_bench --config ./my-config.yaml --all
+```
+
+#### Enable continuous tracking:
+```bash
+llm_bench --config ./my-config.yaml --all --track
+```
+
+#### Print full responses:
+```bash
+llm_bench --config ./my-config.yaml --provider openai --model gpt-4 --print-result
+```
+
+**Note**: If no `--config` argument is provided, `llm_bench` will look for `models.yaml` in the current directory. If the configuration file is not found, an error will be displayed.
+
+## Development
+
+After checking out the repo, run `bin/setup` to install dependencies. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
+
+To build and install the gem locally:
+
+```bash
+gem build llm_bench.gemspec
+gem install ./llm_bench-0.1.0.gem
+```
+
+## Contributing
+
+Bug reports and pull requests are welcome on GitHub at https://github.com/vito/llm-bench.
+
+## License
+
+The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
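The README above only documents the CLI. As an illustrative sketch (not a file shipped in this gem), the same single-model run can be driven from Ruby the way `exe/llm_bench` does it; the `'openai'` / `'gpt-4'` names are taken from the README's example config and are assumptions about your own `models.yaml`.

```ruby
# Hedged sketch: programmatic equivalent of `llm_bench --provider openai --model gpt-4`.
require 'yaml'
require 'llm_bench'

config = YAML.load_file('./models.yaml')  # same default path the CLI falls back to
LLMBench::Benchmark.new('openai', 'gpt-4', false, config).run_benchmark
```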
data/Rakefile
ADDED
data/exe/llm_bench
ADDED
@@ -0,0 +1,106 @@
+#!/usr/bin/env ruby
+
+# Add the lib directory to the load path when running from source
+if __FILE__ == $PROGRAM_NAME
+  lib_path = File.expand_path('../../lib', __FILE__)
+  $LOAD_PATH.unshift(lib_path) if File.directory?(lib_path)
+end
+
+begin
+  require 'llm_bench'
+rescue LoadError
+  # If we can't load the gem, try to load from source
+  require_relative '../lib/llm_bench'
+end
+
+require 'yaml'
+require 'optparse'
+
+def parse_arguments
+  options = {}
+  OptionParser.new do |opts|
+    opts.banner = "Usage: llm_bench --config CONFIG --provider PROVIDER --model NICKNAME [--print-result]"
+    opts.banner += "\n llm_bench --config CONFIG --all [--track] [--print-result]"
+
+    opts.on('--config CONFIG', 'Path to configuration file (default: models.yaml)') do |config|
+      options[:config] = config
+    end
+
+    opts.on('--provider PROVIDER', 'Provider name from config file') do |provider|
+      options[:provider] = provider
+    end
+
+    opts.on('--model NICKNAME', 'Model nickname from config file') do |model|
+      options[:model] = model
+    end
+
+    opts.on('--all', 'Run benchmark on all configured models') do
+      options[:all] = true
+    end
+
+    opts.on('--track', 'Enable continuous tracking with CSV output (requires --all)') do
+      options[:track] = true
+    end
+
+    opts.on('--print-result', 'Print the full message returned by each LLM') do
+      options[:print_result] = true
+    end
+
+    opts.on('--help', 'Display help') do
+      puts opts
+      exit
+    end
+  end.parse!
+
+  if options[:track] && !options[:all]
+    puts "Error: --track requires --all"
+    puts "Use --help for usage information"
+    exit 1
+  end
+
+  if options[:all]
+    options
+  elsif options[:provider] && options[:model]
+    options
+  else
+    puts "Error: Either --provider and --model, or --all is required"
+    puts "Use --help for usage information"
+    exit 1
+  end
+
+  options
+end
+
+def main
+  options = parse_arguments
+
+  # Determine config file path
+  config_path = options[:config] || './models.yaml'
+
+  # Validate config file exists
+  unless File.exist?(config_path)
+    puts "Error: Configuration file not found at #{config_path}"
+    exit 1
+  end
+
+  # Load configuration
+  config = YAML.load_file(config_path)
+
+  if options[:all]
+    if options[:track]
+      tracker = LLMBench::Tracker.new(config)
+      tracker.start_tracking
+    else
+      parallel_benchmark = LLMBench::ParallelBenchmark.new(config, options[:print_result])
+      parallel_benchmark.run_all
+    end
+  else
+    benchmark = LLMBench::Benchmark.new(options[:provider], options[:model], options[:print_result], config)
+    benchmark.run_benchmark
+  end
+rescue StandardError => e
+  puts "Error: #{e.message}"
+  exit 1
+end
+
+main
data/lib/llm_bench/benchmark.rb
ADDED
@@ -0,0 +1,225 @@
+require 'yaml'
+require 'json'
+require 'net/http'
+require 'uri'
+require 'time'
+
+module LLMBench
+  class Benchmark
+    attr_reader :config, :provider, :model, :start_time, :end_time
+
+    def initialize(provider_name, model_nickname, print_result = false, config = nil)
+      @provider_name = provider_name
+      @model_nickname = model_nickname
+      @print_result = print_result
+      @config = config || load_config
+      validate_provider_and_model!
+    end
+
+    def load_config
+      config_path = File.join(__dir__, '..', 'models.yaml')
+      unless File.exist?(config_path)
+        raise "Configuration file models.yaml not found"
+      end
+
+      YAML.load_file(config_path)
+    end
+
+    def validate_provider_and_model!
+      provider_config = @config['providers'].find { |p| p['name'] == @provider_name }
+      unless provider_config
+        raise "Provider '#{@provider_name}' not found in configuration"
+      end
+
+      model_config = provider_config['models'].find { |m| m['nickname'] == @model_nickname }
+      unless model_config
+        raise "Model '#{@model_nickname}' not found for provider '#{@provider_name}'"
+      end
+
+      model_config['api_format'] ||= 'openai'
+
+      unless ['openai', 'anthropic'].include?(model_config['api_format'])
+        raise "Invalid API format '#{model_config['api_format']}' for model '#{@model_nickname}'. Must be 'openai' or 'anthropic'"
+      end
+
+      @provider = provider_config
+      @model = model_config
+    end
+
+    def run_benchmark
+      puts "=== LLM Benchmark ==="
+      puts "Provider: #{@provider_name}"
+      puts "Model: #{@model_nickname} (#{@model['id']})"
+      puts "Starting benchmark..."
+
+      @start_time = Time.now
+      puts "Start time: #{@start_time.strftime('%Y-%m-%d %H:%M:%S.%3N')}"
+
+      response = make_api_call
+
+      @end_time = Time.now
+      puts "End time: #{@end_time.strftime('%Y-%m-%d %H:%M:%S.%3N')}"
+
+      calculate_and_display_metrics(response)
+    end
+
+    def anthropic_format?
+      @model['api_format'] == 'anthropic'
+    end
+
+    def api_endpoint
+      anthropic_format? ? "#{@provider['base_url']}/v1/messages" : "#{@provider['base_url']}/chat/completions"
+    end
+
+    def build_request_headers
+      headers = { 'Content-Type' => 'application/json' }
+      if anthropic_format?
+        headers['x-api-key'] = @provider['api_key']
+        headers['anthropic-version'] = '2023-06-01'
+      else
+        headers['Authorization'] = "Bearer #{@provider['api_key']}"
+      end
+      headers
+    end
+
+    def build_request_body
+      base_body = {
+        model: @model['id'],
+        messages: [{ role: 'user', content: @config['prompt'] }]
+      }
+
+      if anthropic_format?
+        base_body.merge(max_tokens: 1000)
+      else
+        base_body.merge(max_tokens: 1000, temperature: 0.7)
+      end
+    end
+
+    def extract_response_content(response)
+      if anthropic_format?
+        extract_anthropic_content(response)
+      else
+        response.dig('choices', 0, 'message', 'content') || ''
+      end
+    end
+
+    def extract_token_counts(response, message_content)
+      if anthropic_format?
+        input_tokens = response.dig('usage', 'input_tokens') || estimate_tokens(@config['prompt'])
+        output_tokens = response.dig('usage', 'output_tokens') || estimate_tokens(message_content)
+      else
+        input_tokens = response.dig('usage', 'prompt_tokens') || estimate_tokens(@config['prompt'])
+        output_tokens = response.dig('usage', 'completion_tokens') || estimate_tokens(message_content)
+      end
+      [input_tokens, output_tokens]
+    end
+
+    def make_api_call
+      uri = URI.parse(api_endpoint)
+      request = Net::HTTP::Post.new(uri)
+      request['Content-Type'] = 'application/json'
+
+      build_request_headers.each { |key, value| request[key] = value }
+      request.body = build_request_body.to_json
+
+      http = Net::HTTP.new(uri.host, uri.port)
+      http.use_ssl = uri.scheme == 'https'
+
+      response = http.request(request)
+
+      handle_api_error(response) unless response.is_a?(Net::HTTPSuccess)
+
+      JSON.parse(response.body)
+    end
+
+    def handle_api_error(response)
+      error_response = JSON.parse(response.body)
+      error_msg = error_response['msg'] || error_response['message'] ||
+                  error_response.dig('error', 'message') || response.message
+      raise "API request failed: #{response.code} - #{error_msg}"
+    rescue JSON::ParserError
+      raise "API request failed: #{response.code} #{response.message}"
+    end
+
+    def calculate_metrics(response)
+      duration = @end_time - @start_time
+      message_content = extract_response_content(response)
+      input_tokens, output_tokens = extract_token_counts(response, message_content)
+
+      total_tokens = input_tokens + output_tokens
+      tokens_per_second = total_tokens / duration if duration.positive?
+
+      {
+        duration: duration,
+        input_tokens: input_tokens,
+        output_tokens: output_tokens,
+        total_tokens: total_tokens,
+        tokens_per_second: tokens_per_second,
+        message_content: message_content
+      }
+    end
+
+    def calculate_and_display_metrics(response)
+      metrics = calculate_metrics(response)
+
+      puts "\n=== Results ==="
+      puts "Duration: #{metrics[:duration].round(3)} seconds"
+      puts "Input tokens: #{metrics[:input_tokens]}"
+      puts "Output tokens: #{metrics[:output_tokens]}"
+      puts "Total tokens: #{metrics[:total_tokens]}"
+      puts "Tokens per second: #{metrics[:tokens_per_second].round(2)}"
+
+      puts "\n=== Message Content ==="
+      puts metrics[:message_content] if @print_result
+    end
+
+    def extract_anthropic_content(response)
+      if response.key?('code') && response.key?('msg') && response.key?('success')
+        return "Error: #{response['msg']}"
+      end
+
+      content_blocks = response.dig('content')
+
+      if content_blocks.is_a?(Array) && !content_blocks.empty?
+        text_block = content_blocks.find { |block| block.is_a?(Hash) && block['type'] == 'text' }
+        text_block ? text_block['text'] : nil
+      elsif response.dig('content', 0, 'text')
+        response.dig('content', 0, 'text')
+      else
+        nil
+      end
+    end
+
+    def estimate_tokens(text)
+      (text.length / 4.0).round
+    end
+
+    def run_benchmark_for_results
+      @start_time = Time.now
+      response = make_api_call
+      @end_time = Time.now
+
+      metrics = calculate_metrics(response)
+      {
+        provider: @provider_name,
+        model: @model_nickname,
+        total_tokens: metrics[:total_tokens],
+        tokens_per_second: metrics[:tokens_per_second].round(2),
+        duration: metrics[:duration].round(3),
+        success: true,
+        message_content: metrics[:message_content]
+      }
+    rescue StandardError => e
+      {
+        provider: @provider_name,
+        model: @model_nickname,
+        total_tokens: 0,
+        tokens_per_second: 0,
+        duration: 0,
+        success: false,
+        error: e.message,
+        message_content: ''
+      }
+    end
+  end
+end
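Besides the printing `run_benchmark`, the class above exposes `run_benchmark_for_results`, which returns a plain hash of metrics. A hedged usage sketch (not shipped with the gem; config path and nicknames are assumptions):

```ruby
# Hedged sketch: consuming the result hash returned by run_benchmark_for_results.
require 'yaml'
require 'llm_bench'

config = YAML.load_file('./models.yaml')
result = LLMBench::Benchmark.new('openai', 'gpt-4', false, config).run_benchmark_for_results

if result[:success]
  puts "#{result[:provider]}/#{result[:model]}: #{result[:tokens_per_second]} tok/s in #{result[:duration]}s"
else
  warn "Benchmark failed: #{result[:error]}"
end
```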
data/lib/llm_bench/parallel_benchmark.rb
ADDED
@@ -0,0 +1,130 @@
+module LLMBench
+  class ParallelBenchmark
+    def initialize(config, print_result = false)
+      @config = config
+      @print_result = print_result
+    end
+
+    def run_all
+      puts "=== LLM Benchmark ==="
+      puts "Running benchmarks on all configured models..."
+      puts "Starting at #{Time.now.strftime('%Y-%m-%d %H:%M:%S.%3N')}"
+      puts
+
+      benchmarks = create_benchmarks
+      results = run_parallel(benchmarks)
+
+      display_results_table(results)
+      display_summary(results)
+    end
+
+    def run_silent
+      benchmarks = create_benchmarks
+      run_parallel(benchmarks)
+    end
+
+    private
+
+    def create_benchmarks
+      benchmarks = []
+
+      @config['providers'].each do |provider|
+        provider['models'].each do |model|
+          benchmarks << Benchmark.new(provider['name'], model['nickname'], @print_result, @config)
+        end
+      end
+
+      benchmarks
+    end
+
+    def run_parallel(benchmarks)
+      results = []
+      mutex = Mutex.new
+
+      threads = benchmarks.map do |benchmark|
+        Thread.new do
+          result = benchmark.run_benchmark_for_results
+          mutex.synchronize { results << result }
+        end
+      end
+
+      threads.each(&:join)
+      results
+    end
+
+    def display_results_table(results)
+      sorted_results = results.sort_by { |r| -r[:tokens_per_second] }
+
+      provider_width = sorted_results.map { |r| r[:provider].length }.max
+      model_width = sorted_results.map { |r| r[:model].length }.max
+      tokens_width = 12
+      tps_width = 15
+
+      if @print_result
+        header = "| #{"Provider".ljust(provider_width)} | #{"Model".ljust(model_width)} | #{"Total Tokens".rjust(tokens_width)} | #{"Tokens/sec".rjust(tps_width)} | Message Content"
+        separator = "| #{'-' * provider_width} | #{'-' * model_width} | #{'-' * tokens_width} | #{'-' * tps_width} | #{'-' * 80}"
+      else
+        header = "| #{"Provider".ljust(provider_width)} | #{"Model".ljust(model_width)} | #{"Total Tokens".rjust(tokens_width)} | #{"Tokens/sec".rjust(tps_width)} |"
+        separator = "| #{'-' * provider_width} | #{'-' * model_width} | #{'-' * tokens_width} | #{'-' * tps_width} |"
+      end
+
+      puts header
+      puts separator
+
+      sorted_results.each do |result|
+        provider_col = result[:provider].ljust(provider_width)
+        model_col = result[:model].ljust(model_width)
+
+        if result[:success]
+          tokens_col = result[:total_tokens].to_s.rjust(tokens_width)
+          tps_col = result[:tokens_per_second].to_s.rjust(tps_width)
+
+          if @print_result
+            message_content = result[:message_content][0..79]
+            puts "| #{provider_col} | #{model_col} | #{tokens_col} | #{tps_col} | #{message_content}"
+          else
+            puts "| #{provider_col} | #{model_col} | #{tokens_col} | #{tps_col} |"
+          end
+        else
+          tokens_col = "ERROR".rjust(tokens_width)
+          tps_col = "FAILED".rjust(tps_width)
+
+          if @print_result
+            puts "| #{provider_col} | #{model_col} | #{tokens_col} | #{tps_col} | #{result[:error][0..79]}"
+          else
+            puts "| #{provider_col} | #{model_col} | #{tokens_col} | #{tps_col} |"
+          end
+        end
+      end
+
+      puts
+    end
+
+    def display_summary(results)
+      successful = results.select { |r| r[:success] }
+      failed = results.select { |r| !r[:success] }
+
+      puts "=== Summary ==="
+      puts "Total benchmarks: #{results.length}"
+      puts "Successful: #{successful.length}"
+      puts "Failed: #{failed.length}"
+
+      if successful.any?
+        avg_tps = successful.map { |r| r[:tokens_per_second] }.sum / successful.length
+        fastest = successful.max_by { |r| r[:tokens_per_second] }
+        slowest = successful.min_by { |r| r[:tokens_per_second] }
+
+        puts "Average tokens/sec: #{avg_tps.round(2)}"
+        puts "Fastest: #{fastest[:provider]}/#{fastest[:model]} (#{fastest[:tokens_per_second]} tokens/sec)"
+        puts "Slowest: #{slowest[:provider]}/#{slowest[:model]} (#{slowest[:tokens_per_second]} tokens/sec)"
+      end
+
+      return unless failed.any?
+
+      puts "\nFailed benchmarks:"
+      failed.each do |result|
+        puts " #{result[:provider]}/#{result[:model]}: #{result[:error]}"
+      end
+    end
+  end
+end
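`run_silent` is what the tracker (next file) uses: it runs every configured model in its own thread and returns the raw result hashes without printing the table. A hedged sketch of calling it directly (not shipped with the gem; the config path is an assumption):

```ruby
# Hedged sketch: programmatic equivalent of `llm_bench --all`, minus the printed table.
require 'yaml'
require 'llm_bench'

config  = YAML.load_file('./models.yaml')
results = LLMBench::ParallelBenchmark.new(config).run_silent

results.select { |r| r[:success] }
       .sort_by { |r| -r[:tokens_per_second] }
       .each { |r| puts "#{r[:provider]}/#{r[:model]}: #{r[:tokens_per_second]} tok/s" }
```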
data/lib/llm_bench/tracker.rb
ADDED
@@ -0,0 +1,136 @@
+module LLMBench
+  class Tracker
+    def initialize(config)
+      @config = config
+      @csv_file = "llm_benchmark_results_#{Time.now.strftime('%Y%m%d_%H%M%S')}.csv"
+      @running = true
+      @next_run_time = Time.now
+      setup_signal_handlers
+    end
+
+    def start_tracking
+      puts "=== LLM Performance Tracker ==="
+      puts "Tracking all models every 60 seconds"
+      puts "Results will be saved to: #{@csv_file}"
+      puts "Press Ctrl+C to stop tracking"
+      puts
+
+      initialize_csv
+
+      run_tracking_cycle
+
+      while @running
+        time_until_next_run = @next_run_time - Time.now
+
+        if time_until_next_run.positive?
+          sleep_time = [time_until_next_run, 1.0].min
+          sleep(sleep_time)
+        else
+          run_tracking_cycle
+          @next_run_time = Time.now + 60
+        end
+      end
+
+      puts "\nTracking stopped by user"
+      puts "Results saved to: #{@csv_file}"
+    end
+
+    private
+
+    def setup_signal_handlers
+      Signal.trap('INT') do
+        @running = false
+        puts "\nStopping tracking..."
+      end
+
+      Signal.trap('TERM') do
+        @running = false
+        puts "\nStopping tracking..."
+      end
+    end
+
+    def initialize_csv
+      File.open(@csv_file, 'w') do |file|
+        file.write("timestamp,provider_model,tokens_per_second,total_tokens,duration_seconds\n")
+      end
+    end
+
+    def run_tracking_cycle
+      timestamp = Time.now
+      puts "[#{timestamp.strftime('%Y-%m-%d %H:%M:%S')}] Running benchmark cycle..."
+
+      parallel_benchmark = ParallelBenchmark.new(@config)
+      results = parallel_benchmark.run_silent
+
+      write_results_to_csv(timestamp, results)
+      display_cycle_summary(results)
+    end
+
+    def write_results_to_csv(timestamp, results)
+      File.open(@csv_file, 'a') do |file|
+        results.each do |result|
+          next unless result[:success]
+
+          provider_model = "#{result[:provider]}+#{result[:model]}"
+          csv_line = [
+            timestamp.strftime('%Y-%m-%d %H:%M:%S'),
+            provider_model,
+            result[:tokens_per_second],
+            result[:total_tokens],
+            result[:duration]
+          ].join(',') + "\n"
+          file.write(csv_line)
+        end
+      end
+    end
+
+    def display_cycle_summary(results)
+      successful = results.select { |r| r[:success] }
+      failed = results.select { |r| !r[:success] }
+
+      puts " Completed: #{successful.length} successful, #{failed.length} failed"
+
+      if successful.any?
+        avg_tps = successful.map { |r| r[:tokens_per_second] }.sum / successful.length
+        puts " Average tokens/sec: #{avg_tps.round(2)}"
+      end
+
+      if failed.any?
+        puts " Failed: #{failed.map { |f| "#{f[:provider]}/#{f[:model]}" }.join(', ')}"
+      end
+
+      puts "\n === Individual Model Results ==="
+
+      sorted_results = results.sort_by { |r| -r[:tokens_per_second] }
+
+      provider_width = sorted_results.map { |r| r[:provider].length }.max
+      model_width = sorted_results.map { |r| r[:model].length }.max
+      tokens_width = 12
+      tps_width = 15
+      duration_width = 12
+
+      header = " | #{"Provider".ljust(provider_width)} | #{"Model".ljust(model_width)} | #{"Tokens/sec".rjust(tps_width)} | #{"Total Tokens".rjust(tokens_width)} | #{"Duration".rjust(duration_width)} |"
+      separator = " | #{'-' * provider_width} | #{'-' * model_width} | #{'-' * tps_width} | #{'-' * tokens_width} | #{'-' * duration_width} |"
+
+      puts header
+      puts separator
+
+      sorted_results.each do |result|
+        provider_col = result[:provider].ljust(provider_width)
+        model_col = result[:model].ljust(model_width)
+
+        if result[:success]
+          tps_col = result[:tokens_per_second].to_s.rjust(tps_width)
+          tokens_col = result[:total_tokens].to_s.rjust(tokens_width)
+          duration_col = "#{result[:duration]}s".rjust(duration_width)
+          puts " | #{provider_col} | #{model_col} | #{tps_col} | #{tokens_col} | #{duration_col} |"
+        else
+          tps_col = "FAILED".rjust(tps_width)
+          tokens_col = "ERROR".rjust(tokens_width)
+          duration_col = "N/A".rjust(duration_width)
+          puts " | #{provider_col} | #{model_col} | #{tps_col} | #{tokens_col} | #{duration_col} |"
+        end
+      end
+    end
+  end
+end
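The tracker appends one row per successful benchmark under the header `timestamp,provider_model,tokens_per_second,total_tokens,duration_seconds`. A hedged sketch of post-processing that file with Ruby's standard `csv` library (not shipped with the gem; the glob pattern only assumes the filename format `Tracker` generates, and the aggregation shown is illustrative):

```ruby
# Hedged sketch: average tokens/sec per provider+model from a tracker CSV.
require 'csv'

latest = Dir.glob('llm_benchmark_results_*.csv').max  # timestamped names sort chronologically
rows   = CSV.read(latest, headers: true)

rows.group_by { |r| r['provider_model'] }.each do |key, samples|
  avg = samples.sum { |s| s['tokens_per_second'].to_f } / samples.length
  puts "#{key}: #{avg.round(2)} avg tokens/sec over #{samples.length} cycles"
end
```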
data/lib/llm_bench.rb
ADDED
data/llm_bench.gemspec
ADDED
@@ -0,0 +1,30 @@
+lib = File.expand_path('lib', __dir__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'llm_bench/version'
+
+Gem::Specification.new do |spec|
+  spec.name = "llm_bench"
+  spec.version = LLMBench::VERSION
+  spec.authors = ["Vito"]
+  spec.email = []
+
+  spec.summary = "A tool for benchmarking LLM performance across different providers and models"
+  spec.description = "LLM Bench is a Ruby gem that allows you to benchmark and compare the performance of different Large Language Model providers and APIs. It supports both OpenAI and Anthropic-compatible API formats, provides parallel execution, and includes continuous tracking capabilities with CSV export."
+  spec.homepage = "https://github.com/vitobotta/llm-bench"
+  spec.license = "MIT"
+  spec.required_ruby_version = ">= 2.7.0"
+
+  spec.metadata["homepage_uri"] = spec.homepage
+  spec.metadata["source_code_uri"] = spec.homepage
+  spec.metadata["changelog_uri"] = "#{spec.homepage}/blob/main/CHANGELOG.md"
+
+  spec.files = Dir.chdir(__dir__) do
+    `git ls-files -z`.split("\x0").reject { |f| f.match(%r{\A(?:test|spec|features)/}) }
+  end
+
+  spec.bindir = "exe"
+  spec.executables = ["llm_bench"]
+  spec.require_paths = ["lib"]
+
+  # Standard library dependencies - no external gems required
+end
data/models.yaml.example
ADDED
@@ -0,0 +1,20 @@
+# LLM Benchmark Configuration
+prompt: "Explain the concept of machine learning in simple terms in exactly 300 words or as close to it as possible, covering the main types of learning approaches and providing examples of real-world applications."
+
+providers:
+  - name: "z-ai"
+    base_url: "https://api.z.ai/api/coding/paas/v4"
+    api_key: "YOUR_API_KEY_HERE"
+    models:
+      - nickname: "glm"
+        id: "glm-4.5"
+        # api_format can be 'openai' (default) or 'anthropic'
+        api_format: "openai"
+  - name: "z-ai-anthropic"
+    base_url: "https://api.z.ai/api/anthropic"
+    api_key: "YOUR_API_KEY_HERE"
+    models:
+      - nickname: "glm-anthropic"
+        id: "glm-4.5"
+        # api_format can be 'openai' (default) or 'anthropic'
+        api_format: "anthropic"
metadata
ADDED
@@ -0,0 +1,58 @@
+--- !ruby/object:Gem::Specification
+name: llm_bench
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+platform: ruby
+authors:
+- Vito
+bindir: exe
+cert_chain: []
+date: 1980-01-02 00:00:00.000000000 Z
+dependencies: []
+description: LLM Bench is a Ruby gem that allows you to benchmark and compare the
+  performance of different Large Language Model providers and APIs. It supports both
+  OpenAI and Anthropic-compatible API formats, provides parallel execution, and includes
+  continuous tracking capabilities with CSV export.
+email: []
+executables:
+- llm_bench
+extensions: []
+extra_rdoc_files: []
+files:
+- ".claude/settings.local.json"
+- ".gitignore"
+- README.md
+- Rakefile
+- exe/llm_bench
+- lib/llm_bench.rb
+- lib/llm_bench/benchmark.rb
+- lib/llm_bench/parallel_benchmark.rb
+- lib/llm_bench/tracker.rb
+- lib/llm_bench/version.rb
+- llm_bench.gemspec
+- models.yaml.example
+homepage: https://github.com/vitobotta/llm-bench
+licenses:
+- MIT
+metadata:
+  homepage_uri: https://github.com/vitobotta/llm-bench
+  source_code_uri: https://github.com/vitobotta/llm-bench
+  changelog_uri: https://github.com/vitobotta/llm-bench/blob/main/CHANGELOG.md
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: 2.7.0
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubygems_version: 3.6.9
+specification_version: 4
+summary: A tool for benchmarking LLM performance across different providers and models
+test_files: []