llm_bench 0.1.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.claude/settings.local.json +2 -1
- data/.rubocop.yml +57 -0
- data/Dockerfile +35 -0
- data/README.md +68 -13
- data/Rakefile +3 -1
- data/exe/llm_bench +93 -48
- data/lib/llm_bench/benchmark.rb +162 -183
- data/lib/llm_bench/benchmark_factory.rb +39 -0
- data/lib/llm_bench/colors.rb +50 -0
- data/lib/llm_bench/configuration_manager.rb +66 -0
- data/lib/llm_bench/parallel_benchmark.rb +37 -111
- data/lib/llm_bench/results_formatter.rb +168 -0
- data/lib/llm_bench/tracker.rb +69 -111
- data/lib/llm_bench/version.rb +4 -2
- data/lib/llm_bench.rb +6 -2
- data/llm_bench.gemspec +12 -3
- metadata +28 -6
data/lib/llm_bench/benchmark.rb
CHANGED
@@ -1,225 +1,204 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
require
|
4
|
-
require
|
5
|
-
require
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "json"
|
4
|
+
require "net/http"
|
5
|
+
require "uri"
|
6
|
+
require "time"
|
7
|
+
require_relative "colors"
|
6
8
|
|
7
9
|
module LLMBench
  # Sends one chat request to a configured provider/model pair and reports
  # timing and token-throughput metrics.
  #
  # The model's "api_format" entry selects the wire format: "anthropic" posts to
  # <base_url>/v1/messages with an x-api-key header; any other value posts to
  # <base_url>/chat/completions with a Bearer token.
  class Benchmark
    # Completion budget sent with every request.
    DEFAULT_MAX_TOKENS = 1000
    # Sampling temperature, sent only with non-Anthropic requests.
    DEFAULT_TEMPERATURE = 0.7
    # Value of the anthropic-version header.
    ANTHROPIC_VERSION = "2023-06-01"
    # strftime pattern used for the start/end timestamps printed by #run_benchmark.
    TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S.%3N"

    attr_reader :config, :provider, :model, :start_time, :end_time, :provider_name, :model_nickname, :print_result

    # @param provider_name [String] provider name to look up in the configuration
    # @param model_nickname [String] model nickname to look up under that provider
    # @param config_manager [#config, #validate_provider_and_model!] source of the
    #   parsed configuration and of the resolved provider/model hashes
    # @param print_result [Boolean] when true, #run_benchmark also prints the reply text
    def initialize(provider_name:, model_nickname:, config_manager:, print_result: false)
      @provider_name = provider_name
      @model_nickname = model_nickname
      @print_result = print_result

      @config_manager = config_manager
      @config = config_manager.config

      @provider, @model = config_manager.validate_provider_and_model!(
        provider_name:,
        model_nickname:
      )
    end

    # Runs the benchmark interactively: prints a banner, performs the request,
    # then prints the metrics. Errors from the request propagate to the caller.
    def run_benchmark
      puts Colors.header("=== LLM Benchmark ===")
      puts Colors.info("Provider: #{provider_name}")
      puts Colors.info("Model: #{model_nickname} (#{model["id"]})")
      puts Colors.highlight("Starting benchmark...")

      @start_time = Time.now
      puts Colors.border("Start time: #{start_time.strftime(TIMESTAMP_FORMAT)}")

      response = make_api_call

      @end_time = Time.now
      puts Colors.border("End time: #{end_time.strftime(TIMESTAMP_FORMAT)}")

      calculate_and_display_metrics(response:)
    end

    # @return [Boolean] true when the model is configured with api_format "anthropic"
    def anthropic_format?
      model["api_format"] == "anthropic"
    end

    # @return [String] full request URL for the configured API format
    def api_endpoint
      # Tolerate a trailing slash in base_url so the path never contains "//".
      base_url = provider["base_url"].to_s.chomp("/")
      anthropic_format? ? "#{base_url}/v1/messages" : "#{base_url}/chat/completions"
    end

    # @return [Hash{String => String}] HTTP headers, including authentication
    def build_request_headers
      headers = { "Content-Type" => "application/json" }
      if anthropic_format?
        headers["x-api-key"] = provider["api_key"]
        headers["anthropic-version"] = ANTHROPIC_VERSION
      else
        headers["Authorization"] = "Bearer #{provider["api_key"]}"
      end
      headers
    end

    # @return [Hash] request payload; temperature is included only for the
    #   non-Anthropic format
    def build_request_body
      body = {
        model: model["id"],
        messages: [{ role: "user", content: config["prompt"] }],
        max_tokens: DEFAULT_MAX_TOKENS
      }
      body[:temperature] = DEFAULT_TEMPERATURE unless anthropic_format?
      body
    end

    # @param response [Hash] parsed JSON response body
    # @return [String] the reply text, or "" when the response carries none
    def extract_response_content(response)
      if anthropic_format?
        extract_anthropic_content(response:)
      else
        response.dig("choices", 0, "message", "content") || ""
      end
    end

    # @param response [Hash] parsed JSON response body
    # @return [Array(Integer, Integer), Array(nil, nil)] input and output token
    #   counts; either is nil when the response has no matching usage entry
    def extract_token_counts(response:)
      input_key, output_key =
        anthropic_format? ? %w[input_tokens output_tokens] : %w[prompt_tokens completion_tokens]

      [response.dig("usage", input_key), response.dig("usage", output_key)]
    end

    # Performs the HTTP POST and returns the parsed JSON body.
    #
    # @return [Hash] parsed response body
    # @raise [RuntimeError] when the server answers with a non-2xx status
    # @raise [JSON::ParserError] when a successful response is not valid JSON
    def make_api_call
      uri = URI.parse(api_endpoint)
      request = Net::HTTP::Post.new(uri)
      # build_request_headers already carries Content-Type, so it is not set separately.
      build_request_headers.each { |key, value| request[key] = value }
      request.body = build_request_body.to_json

      http = Net::HTTP.new(uri.host, uri.port)
      http.use_ssl = uri.scheme == "https"

      response = http.request(request)

      handle_api_error(response:) unless response.is_a?(Net::HTTPSuccess)

      JSON.parse(response.body)
    end

    # Always raises, turning a failed HTTP response into a readable error.
    #
    # @param response [#code, #message, #body] the failed HTTP response
    # @raise [RuntimeError] "API request failed: <code> - <detail>", or
    #   "API request failed: <code> <message>" when the body is not JSON
    def handle_api_error(response:)
      detail = error_detail(response:)
      raise "API request failed: #{response.code} - #{detail}"
    rescue JSON::ParserError
      raise "API request failed: #{response.code} #{response.message}"
    end

    # Derives duration and token metrics from a response. Requires @start_time
    # and @end_time to be set.
    #
    # @param response [Hash] parsed JSON response body
    # @return [Hash] :duration, :input_tokens, :output_tokens, :total_tokens,
    #   :tokens_per_second, :message_content. Token fields are nil when usage
    #   data is missing; :tokens_per_second is also nil for a non-positive duration.
    def calculate_metrics(response:)
      duration = end_time - start_time
      message_content = extract_response_content(response)
      input_tokens, output_tokens = extract_token_counts(response:)

      total_tokens = (input_tokens + output_tokens if input_tokens && output_tokens)
      tokens_per_second = (total_tokens / duration if total_tokens && duration.positive?)

      {
        duration:,
        input_tokens:,
        output_tokens:,
        total_tokens:,
        tokens_per_second:,
        message_content:
      }
    end

    # Prints the metrics for a response, plus the reply text when print_result is set.
    #
    # @param response [Hash] parsed JSON response body
    def calculate_and_display_metrics(response:)
      metrics = calculate_metrics(response:)

      puts "\n#{Colors.header("=== Results ===")}"
      puts Colors.metric("Duration: #{metrics[:duration].round(3)} seconds")

      if metrics[:input_tokens] && metrics[:output_tokens]
        puts Colors.metric("Input tokens: #{metrics[:input_tokens]}")
        puts Colors.metric("Output tokens: #{metrics[:output_tokens]}")
        puts Colors.success("Total tokens: #{metrics[:total_tokens]}")
        # tokens_per_second is nil when the measured duration is not positive.
        rate = metrics[:tokens_per_second]
        puts Colors.success("Tokens per second: #{rate ? rate.round(2) : "N/A"}")
      else
        puts Colors.warning("Token usage data not available in API response")
      end

      return unless print_result

      puts "\n#{Colors.header("=== Message Content ===")}"
      # to_s guards against non-String content (e.g. structured content parts).
      puts Colors.border(metrics[:message_content].to_s)
    end

    # Pulls the reply text out of an Anthropic-format response.
    #
    # @param response [Hash] parsed JSON response body
    # @return [String] "Error: <msg>" for a code/msg/success error envelope;
    #   otherwise the text of the first "text" block (falling back to the first
    #   block's "text"), the content itself when it is a plain String, or ""
    def extract_anthropic_content(response:)
      if response.key?("code") && response.key?("msg") && response.key?("success")
        return "Error: #{response["msg"]}"
      end

      content = response["content"]
      return content if content.is_a?(String)
      return "" unless content.is_a?(Array)

      text_block = content.find { |block| block.is_a?(Hash) && block["type"] == "text" }
      # Untyped first block: still honour its "text" entry if present.
      text_block ||= content.first if content.first.is_a?(Hash)
      text_block ? text_block["text"].to_s : ""
    end

    # Runs the benchmark silently and returns a result row instead of printing.
    # Any StandardError is captured into the row rather than raised.
    #
    # @return [Hash] :provider, :model, :total_tokens, :tokens_per_second,
    #   :duration, :success, :message_content, and :error when success is false
    def run_benchmark_for_results
      @start_time = Time.now
      response = make_api_call
      @end_time = Time.now

      metrics = calculate_metrics(response:)

      {
        provider: provider_name,
        model: model_nickname,
        total_tokens: metrics[:total_tokens] || 0,
        tokens_per_second: metrics[:tokens_per_second]&.round(2) || 0,
        duration: metrics[:duration].round(3),
        success: true,
        message_content: metrics[:message_content]
      }
    rescue StandardError => e
      {
        provider: provider_name,
        model: model_nickname,
        total_tokens: 0,
        tokens_per_second: 0,
        duration: 0,
        success: false,
        error: e.message,
        message_content: ""
      }
    end

    private

    # Picks the most specific error text from a failed response body. Handles
    # "msg", "message", {"error": {"message": ...}} and {"error": "..."}, and
    # falls back to the HTTP status message.
    #
    # @raise [JSON::ParserError] when the body is not valid JSON
    def error_detail(response:)
      payload = JSON.parse(response.body.to_s)
      return response.message unless payload.is_a?(Hash)

      nested = payload["error"]
      nested = nested["message"] if nested.is_a?(Hash)

      payload["msg"] || payload["message"] || nested || response.message
    end
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module LLMBench
  # Builds Benchmark instances for the provider/model pairs listed in the
  # configuration held by a configuration manager.
  class BenchmarkFactory
    # @param config_manager [#config] object exposing the parsed configuration;
    #   it is also handed to every Benchmark that gets created
    # @param print_result [Boolean] forwarded to every Benchmark
    def initialize(config_manager:, print_result: false)
      @config_manager = config_manager
      @config = config_manager.config
      @print_result = print_result
    end

    # Creates one Benchmark per configured model, in configuration order
    # (providers first, then each provider's models).
    #
    # @return [Array<Benchmark>] empty when no provider lists any model
    def create_all_benchmarks
      config["providers"].flat_map do |provider|
        # A provider entry without a "models" list contributes no benchmarks.
        (provider["models"] || []).map do |model|
          create_benchmark(
            provider_name: provider["name"],
            model_nickname: model["nickname"]
          )
        end
      end
    end

    private

    attr_reader :print_result, :config, :config_manager

    # Instantiates a single Benchmark sharing this factory's settings.
    def create_benchmark(provider_name:, model_nickname:)
      Benchmark.new(
        provider_name:,
        model_nickname:,
        print_result:,
        config_manager:
      )
    end
  end
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "colorize"
|
4
|
+
|
5
|
+
module LLMBench
  # Terminal colour helpers. Each helper wraps its argument in the colour
  # assigned to one semantic role, using String#colorize.
  module Colors
    # Colors for different elements
    HEADER = :cyan
    SUCCESS = :green
    ERROR = :red
    WARNING = :yellow
    INFO = :blue
    METRIC = :magenta
    HIGHLIGHT = :light_blue
    BORDER = :white

    # Predefined color methods. Each accepts any object; it is converted with
    # #to_s first so nil or numeric values do not raise NoMethodError.

    # @param text [#to_s] text to render as a section header
    # @return [String] the text wrapped in the HEADER colour
    def self.header(text)
      paint(text, HEADER)
    end

    # @param text [#to_s] text to render as a success message
    # @return [String] the text wrapped in the SUCCESS colour
    def self.success(text)
      paint(text, SUCCESS)
    end

    # @param text [#to_s] text to render as an error message
    # @return [String] the text wrapped in the ERROR colour
    def self.error(text)
      paint(text, ERROR)
    end

    # @param text [#to_s] text to render as a warning
    # @return [String] the text wrapped in the WARNING colour
    def self.warning(text)
      paint(text, WARNING)
    end

    # @param text [#to_s] text to render as informational output
    # @return [String] the text wrapped in the INFO colour
    def self.info(text)
      paint(text, INFO)
    end

    # @param text [#to_s] text to render as a metric line
    # @return [String] the text wrapped in the METRIC colour
    def self.metric(text)
      paint(text, METRIC)
    end

    # @param text [#to_s] text to render highlighted
    # @return [String] the text wrapped in the HIGHLIGHT colour
    def self.highlight(text)
      paint(text, HIGHLIGHT)
    end

    # @param text [#to_s] text to render as border/secondary output
    # @return [String] the text wrapped in the BORDER colour
    def self.border(text)
      paint(text, BORDER)
    end

    # Single point where colouring is applied.
    def self.paint(text, color)
      text.to_s.colorize(color)
    end
    private_class_method :paint
  end
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "yaml"
|
4
|
+
require_relative "colors"
|
5
|
+
|
6
|
+
module LLMBench
  # Loads the YAML configuration and resolves provider/model entries from it.
  #
  # Lookup and validation failures are reported on stderr and terminate the
  # process with exit status 1 instead of raising to the caller.
  class ConfigurationManager
    # API formats a model entry may declare.
    VALID_API_FORMATS = %w[openai anthropic].freeze
    # Format assigned to model entries that declare none.
    DEFAULT_API_FORMAT = "openai"

    attr_reader :config

    # @param config_path [String, nil] path to the YAML file; defaults to
    #   models.yaml two directories above this file
    def initialize(config_path: nil)
      @config_path = config_path || File.join(__dir__, "..", "..", "models.yaml")
      @config = load_config_from_file
    end

    # Reads and parses the configuration file. Prints an error and exits with
    # status 1 when the file does not exist.
    #
    # @return [Object] whatever YAML.load_file yields for the file
    def load_config_from_file
      unless File.exist?(config_path)
        warn Colors.error("Error: Configuration file not found at #{config_path}")
        exit 1
      end

      YAML.load_file(config_path)
    end

    # Resolves the provider and model entries, filling in the default API
    # format on the model entry when it declares none.
    #
    # @param provider_name [String] value matched against each provider's "name"
    # @param model_nickname [String] value matched against each model's "nickname"
    # @return [Array(Hash, Hash)] the provider entry and the model entry
    def validate_provider_and_model!(provider_name:, model_nickname:)
      provider_config = find_provider(provider_name:)
      model_config = find_model(provider_config:, model_nickname:)

      validate_api_format!(model_config:)

      [provider_config, model_config]
    rescue StandardError => e
      # `exit` raises SystemExit, which is not a StandardError, so the exits
      # issued by the helpers pass straight through this handler.
      warn Colors.error("Error: #{e.message}")
      exit 1
    end

    private

    attr_reader :config_path

    # Finds the provider entry by name, or prints an error and exits.
    def find_provider(provider_name:)
      provider_config = configured_providers.find { |p| p.is_a?(Hash) && p["name"] == provider_name }
      return provider_config if provider_config

      warn Colors.error("Error: Provider '#{provider_name}' not found in configuration")
      exit 1
    end

    # Finds the model entry by nickname under a provider, or prints an error and exits.
    def find_model(provider_config:, model_nickname:)
      models = provider_config["models"]
      # A provider without a usable "models" list is reported as "model not found".
      models = [] unless models.is_a?(Array)

      model_config = models.find { |m| m.is_a?(Hash) && m["nickname"] == model_nickname }
      return model_config if model_config

      warn Colors.error("Error: Model '#{model_nickname}' not found for provider '#{provider_config["name"]}'")
      exit 1
    end

    # Defaults the model's "api_format" in place, then checks it is supported.
    def validate_api_format!(model_config:)
      model_config["api_format"] ||= DEFAULT_API_FORMAT

      return if VALID_API_FORMATS.include?(model_config["api_format"])

      warn Colors.error("Error: Invalid API format '#{model_config["api_format"]}' for model '#{model_config["nickname"]}'. Must be 'openai' or 'anthropic'")
      exit 1
    end

    # The "providers" list, or an empty list when the configuration is empty or
    # has no such list, so lookups fail with the regular "not found" message.
    def configured_providers
      providers = config.is_a?(Hash) ? config["providers"] : nil
      providers.is_a?(Array) ? providers : []
    end
  end
end
|