llm_bench 0.1.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.claude/settings.local.json +2 -1
- data/.rubocop.yml +57 -0
- data/Dockerfile +35 -0
- data/README.md +68 -13
- data/Rakefile +3 -1
- data/exe/llm_bench +93 -48
- data/lib/llm_bench/benchmark.rb +162 -183
- data/lib/llm_bench/benchmark_factory.rb +39 -0
- data/lib/llm_bench/colors.rb +50 -0
- data/lib/llm_bench/configuration_manager.rb +66 -0
- data/lib/llm_bench/parallel_benchmark.rb +37 -111
- data/lib/llm_bench/results_formatter.rb +168 -0
- data/lib/llm_bench/tracker.rb +69 -111
- data/lib/llm_bench/version.rb +4 -2
- data/lib/llm_bench.rb +6 -2
- data/llm_bench.gemspec +12 -3
- metadata +28 -6
data/lib/llm_bench/parallel_benchmark.rb
CHANGED
@@ -1,130 +1,56 @@
-
- class ParallelBenchmark
-   def initialize(config, print_result = false)
-     @config = config
-     @print_result = print_result
-   end
-
-   def run_all
-     puts "=== LLM Benchmark ==="
-     puts "Running benchmarks on all configured models..."
-     puts "Starting at #{Time.now.strftime('%Y-%m-%d %H:%M:%S.%3N')}"
-     puts
-
-     benchmarks = create_benchmarks
-     results = run_parallel(benchmarks)
-
-     display_results_table(results)
-     display_summary(results)
-   end
-
-   def run_silent
-     benchmarks = create_benchmarks
-     run_parallel(benchmarks)
-   end
-
-   private
+ # frozen_string_literal: true

-     benchmarks = []
+ require_relative "colors"

+ module LLMBench
+   class ParallelBenchmark
+     def initialize(config_manager:, print_result: false)
+       @config_manager = config_manager
+       @config = config_manager.config
+       @print_result = print_result
+       @benchmark_factory = BenchmarkFactory.new(config_manager:, print_result:)
+       @results_formatter = ResultsFormatter.new(print_result:)
      end

+     def run_all
+       puts Colors.header("=== LLM Benchmark ===")
+       puts Colors.info("Running benchmarks on all configured models...")
+       puts Colors.border("Starting at #{Time.now.strftime("%Y-%m-%d %H:%M:%S.%3N")}")
+       puts

-     mutex = Mutex.new
+       benchmarks = create_benchmarks
+       results = run_parallel(benchmarks:)

-       result = benchmark.run_benchmark_for_results
-       mutex.synchronize { results << result }
-     end
+       results_formatter.display_results_table(results)
+       results_formatter.display_summary(results)
      end

-   def display_results_table(results)
-     sorted_results = results.sort_by { |r| -r[:tokens_per_second] }
-
-     provider_width = sorted_results.map { |r| r[:provider].length }.max
-     model_width = sorted_results.map { |r| r[:model].length }.max
-     tokens_width = 12
-     tps_width = 15
-
-     if @print_result
-       header = "| #{"Provider".ljust(provider_width)} | #{"Model".ljust(model_width)} | #{"Total Tokens".rjust(tokens_width)} | #{"Tokens/sec".rjust(tps_width)} | Message Content"
-       separator = "| #{'-' * provider_width} | #{'-' * model_width} | #{'-' * tokens_width} | #{'-' * tps_width} | #{'-' * 80}"
-     else
-       header = "| #{"Provider".ljust(provider_width)} | #{"Model".ljust(model_width)} | #{"Total Tokens".rjust(tokens_width)} | #{"Tokens/sec".rjust(tps_width)} |"
-       separator = "| #{'-' * provider_width} | #{'-' * model_width} | #{'-' * tokens_width} | #{'-' * tps_width} |"
+     def run_silent
+       benchmarks = create_benchmarks
+       run_parallel(benchmarks:)
      end

-     puts separator
+     private

-       provider_col = result[:provider].ljust(provider_width)
-       model_col = result[:model].ljust(model_width)
+     attr_reader :print_result, :config, :config_manager, :benchmark_factory, :results_formatter

+     def create_benchmarks
+       benchmark_factory.create_all_benchmarks
+     end

-       else
-         puts "| #{provider_col} | #{model_col} | #{tokens_col} | #{tps_col} |"
-       end
-     else
-       tokens_col = "ERROR".rjust(tokens_width)
-       tps_col = "FAILED".rjust(tps_width)
+     def run_parallel(benchmarks:)
+       results = []
+       mutex = Mutex.new

+       threads = benchmarks.map do |benchmark|
+         Thread.new do
+           result = benchmark.run_benchmark_for_results
+           mutex.synchronize { results << result }
          end
        end
-     end
-
-     puts
-   end

-     failed = results.select { |r| !r[:success] }
-
-     puts "=== Summary ==="
-     puts "Total benchmarks: #{results.length}"
-     puts "Successful: #{successful.length}"
-     puts "Failed: #{failed.length}"
-
-     if successful.any?
-       avg_tps = successful.map { |r| r[:tokens_per_second] }.sum / successful.length
-       fastest = successful.max_by { |r| r[:tokens_per_second] }
-       slowest = successful.min_by { |r| r[:tokens_per_second] }
-
-       puts "Average tokens/sec: #{avg_tps.round(2)}"
-       puts "Fastest: #{fastest[:provider]}/#{fastest[:model]} (#{fastest[:tokens_per_second]} tokens/sec)"
-       puts "Slowest: #{slowest[:provider]}/#{slowest[:model]} (#{slowest[:tokens_per_second]} tokens/sec)"
+       threads.each(&:join)
+       results
      end
-
-     return unless failed.any?
-
-     puts "\nFailed benchmarks:"
-     failed.each do |result|
-       puts " #{result[:provider]}/#{result[:model]}: #{result[:error]}"
-     end
-   end
    end
- end
+ end
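The hunk above replaces the positional `ParallelBenchmark.new(config, print_result)` constructor with keyword arguments and delegates benchmark creation and output to the new `BenchmarkFactory` and `ResultsFormatter` collaborators. A minimal usage sketch follows; it assumes `ConfigurationManager` can be built with no arguments, which this diff does not show.

```ruby
require "llm_bench"

# Assumption: ConfigurationManager.new with no arguments loads the default config;
# its real signature lives in configuration_manager.rb, which is not shown in this diff.
config_manager = LLMBench::ConfigurationManager.new

runner = LLMBench::ParallelBenchmark.new(config_manager: config_manager, print_result: true)
runner.run_all                # prints the results table and summary
results = runner.run_silent   # returns the raw results array without printing
```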
data/lib/llm_bench/results_formatter.rb
ADDED
@@ -0,0 +1,168 @@
+ # frozen_string_literal: true
+
+ require_relative "colors"
+
+ module LLMBench
+   class ResultsFormatter
+     def initialize(print_result: false)
+       @print_result = print_result
+     end
+
+     def display_results_table(results)
+       sorted_results = results.sort_by { |r| -r[:tokens_per_second] }
+
+       provider_width = calculate_column_width(sorted_results, :provider)
+       model_width = calculate_column_width(sorted_results, :model)
+       tokens_width = 12
+       tps_width = 15
+
+       header, separator = build_table_header(provider_width:, model_width:, tokens_width:, tps_width:)
+
+       puts Colors.header(header)
+       puts Colors.border(separator)
+
+       display_table_rows(sorted_results, provider_width:, model_width:, tokens_width:, tps_width:)
+       puts
+     end
+
+     def display_summary(results)
+       successful = results.select { |r| r[:success] }
+       failed = results.reject { |r| r[:success] }
+
+       puts Colors.header("=== Summary ===")
+       puts Colors.info("Total benchmarks: #{results.length}")
+       puts Colors.success("Successful: #{successful.length}")
+       puts Colors.error("Failed: #{failed.length}")
+
+       display_performance_metrics(successful) if successful.any?
+
+       display_failed_benchmarks(failed) if failed.any?
+     end
+
+     def display_cycle_summary(results)
+       successful = results.select { |r| r[:success] }
+       failed = results.reject { |r| r[:success] }
+
+       puts " #{Colors.success("Completed: #{successful.length} successful")}, #{Colors.error("#{failed.length} failed")}"
+
+       if successful.any?
+         avg_tps = successful.map { |r| r[:tokens_per_second] }.sum / successful.length
+         puts " #{Colors.metric("Average tokens/sec: #{avg_tps.round(2)}")}"
+       end
+
+       puts " #{Colors.error("Failed: #{failed.map { |f| "#{f[:provider]}/#{f[:model]}" }.join(", ")}")}" if failed.any?
+
+       display_individual_results(results) if results.any?
+     end
+
+     private
+
+     attr_reader :print_result
+
+     def calculate_column_width(results, column)
+       results.map { |r| r[column].length }.max
+     end
+
+     def build_table_header(provider_width:, model_width:, tokens_width:, tps_width:)
+       if print_result
+         header = "| #{"Provider".ljust(provider_width)} | #{"Model".ljust(model_width)} | #{"Total Tokens".rjust(tokens_width)} | #{"Tokens/sec".rjust(tps_width)} | Message Content"
+         separator = "| #{"-" * provider_width} | #{"-" * model_width} | #{"-" * tokens_width} | #{"-" * tps_width} | #{"-" * 80}"
+       else
+         header = "| #{"Provider".ljust(provider_width)} | #{"Model".ljust(model_width)} | #{"Total Tokens".rjust(tokens_width)} | #{"Tokens/sec".rjust(tps_width)} |"
+         separator = "| #{"-" * provider_width} | #{"-" * model_width} | #{"-" * tokens_width} | #{"-" * tps_width} |"
+       end
+       [header, separator]
+     end
+
+     def display_table_rows(results, provider_width:, model_width:, tokens_width:, tps_width:)
+       results.each do |result|
+         provider_col = result[:provider].ljust(provider_width)
+         model_col = result[:model].ljust(model_width)
+
+         if result[:success]
+           display_successful_row(result, provider_col:, model_col:, tokens_width:, tps_width:)
+         else
+           display_failed_row(result, provider_col:, model_col:, tokens_width:, tps_width:)
+         end
+       end
+     end
+
+     def display_successful_row(result, provider_col:, model_col:, tokens_width:, tps_width:)
+       tokens_col = result[:total_tokens].to_s.rjust(tokens_width)
+       tps_col = result[:tokens_per_second].to_s.rjust(tps_width)
+
+       if print_result
+         message_content = result[:message_content][0..79]
+         puts "| #{Colors.success(provider_col)} | #{Colors.success(model_col)} | #{Colors.metric(tokens_col)} | #{Colors.success(tps_col)} | #{Colors.border(message_content)}"
+       else
+         puts "| #{Colors.success(provider_col)} | #{Colors.success(model_col)} | #{Colors.metric(tokens_col)} | #{Colors.success(tps_col)} |"
+       end
+     end
+
+     def display_failed_row(result, provider_col:, model_col:, tokens_width:, tps_width:)
+       tokens_col = Colors.error("ERROR".rjust(tokens_width))
+       tps_col = Colors.error("FAILED".rjust(tps_width))
+
+       if print_result
+         puts "| #{Colors.error(provider_col)} | #{Colors.error(model_col)} | #{tokens_col} | #{tps_col} | #{Colors.border(result[:error][0..79])}"
+       else
+         puts "| #{Colors.error(provider_col)} | #{Colors.error(model_col)} | #{tokens_col} | #{tps_col} |"
+       end
+     end
+
+     def display_performance_metrics(successful)
+       avg_tps = successful.map { |r| r[:tokens_per_second] }.sum / successful.length
+       fastest = successful.max_by { |r| r[:tokens_per_second] }
+       slowest = successful.min_by { |r| r[:tokens_per_second] }
+
+       puts Colors.metric("Average tokens/sec: #{avg_tps.round(2)}")
+       puts Colors.success("Fastest: #{fastest[:provider]}/#{fastest[:model]} (#{fastest[:tokens_per_second]} tokens/sec)")
+       puts Colors.warning("Slowest: #{slowest[:provider]}/#{slowest[:model]} (#{slowest[:tokens_per_second]} tokens/sec)")
+     end
+
+     def display_failed_benchmarks(failed)
+       puts "\n#{Colors.error("Failed benchmarks:")}"
+       failed.each do |result|
+         puts " #{Colors.error("#{result[:provider]}/#{result[:model]}")}: #{Colors.warning(result[:error])}"
+       end
+     end
+
+     def display_individual_results(results)
+       puts "\n #{Colors.header('=== Individual Model Results ===')}"
+
+       sorted_results = results.sort_by { |r| -r[:tokens_per_second] }
+
+       provider_width = calculate_column_width(sorted_results, :provider)
+       model_width = calculate_column_width(sorted_results, :model)
+       tokens_width = 12
+       tps_width = 15
+       duration_width = 12
+
+       header = " | #{"Provider".ljust(provider_width)} | #{"Model".ljust(model_width)} | " \
+                "#{"Tokens/sec".rjust(tps_width)} | #{"Total Tokens".rjust(tokens_width)} | " \
+                "#{"Duration".rjust(duration_width)} |"
+       separator = " | #{"-" * provider_width} | #{"-" * model_width} | " \
+                   "#{"-" * tps_width} | #{"-" * tokens_width} | " \
+                   "#{"-" * duration_width} |"
+
+       puts Colors.header(header)
+       puts Colors.border(separator)
+
+       sorted_results.each do |result|
+         provider_col = result[:provider].ljust(provider_width)
+         model_col = result[:model].ljust(model_width)
+
+         if result[:success]
+           tps_col = Colors.success(result[:tokens_per_second].to_s.rjust(tps_width))
+           tokens_col = Colors.metric(result[:total_tokens].to_s.rjust(tokens_width))
+           duration_col = Colors.info("#{result[:duration]}s".rjust(duration_width))
+         else
+           tps_col = Colors.error("FAILED".rjust(tps_width))
+           tokens_col = Colors.error("ERROR".rjust(tokens_width))
+           duration_col = Colors.warning("N/A".rjust(duration_width))
+         end
+         puts " | #{Colors.info(provider_col)} | #{Colors.info(model_col)} | #{tps_col} | #{tokens_col} | #{duration_col} |"
+       end
+     end
+   end
+ end
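`ResultsFormatter` only needs an array of result hashes, so it can be exercised on its own. A hedged example: the hash keys are inferred from what the formatter reads (:provider, :model, :tokens_per_second, :total_tokens, :duration, :success, :error), and the values are invented for illustration.

```ruby
require "llm_bench"

results = [
  { provider: "openai", model: "gpt-4o-mini", tokens_per_second: 92.3,
    total_tokens: 512, duration: 5.5, success: true },
  { provider: "anthropic", model: "claude-3-haiku", tokens_per_second: 0,
    total_tokens: 0, duration: 0, success: false, error: "timeout" }
]

formatter = LLMBench::ResultsFormatter.new(print_result: false)
formatter.display_results_table(results)  # sorted by tokens/sec, failed rows shown as ERROR/FAILED
formatter.display_summary(results)        # totals, averages, fastest/slowest, failure list
```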
data/lib/llm_bench/tracker.rb
CHANGED
@@ -1,136 +1,94 @@
+ # frozen_string_literal: true
+
+ require_relative "colors"
+
  module LLMBench
    class Tracker
+     def initialize(config_manager:, interval: 600, output_file: nil)
+       @config_manager = config_manager
+       @config = config_manager.config
+       @csv_file = output_file || "llm_benchmark_results_#{Time.now.strftime("%Y%m%d_%H%M%S")}.csv"
+       @running = true
+       @next_run_time = Time.now
+       @interval = interval
+       @results_formatter = ResultsFormatter.new(print_result: false)
+       setup_signal_handlers
+     end

+     def start_tracking
+       puts Colors.header("=== LLM Performance Tracker ===")
+       puts Colors.info("Tracking all models every #{interval} seconds")
+       puts Colors.info("Results will be saved to: #{csv_file}")
+       puts Colors.highlight("Press Ctrl+C to stop tracking")
+       puts

+       initialize_csv

+       run_tracking_cycle

+       while running
+         time_until_next_run = next_run_time - Time.now

+         if time_until_next_run.positive?
+           sleep_time = [time_until_next_run, 1.0].min
+           sleep(sleep_time)
+         else
+           run_tracking_cycle
+           @next_run_time = Time.now + interval
+         end
        end
-       end
-
-       puts "\nTracking stopped by user"
-       puts "Results saved to: #{@csv_file}"
-     end
-
-     private
-
-     def setup_signal_handlers
-       Signal.trap('INT') do
-         @running = false
-         puts "\nStopping tracking..."
-       end
-
-       Signal.trap('TERM') do
-         @running = false
-         puts "\nStopping tracking..."
-       end
-     end

-         file.write("timestamp,provider_model,tokens_per_second,total_tokens,duration_seconds\n")
+       puts "\n#{Colors.warning('Tracking stopped by user')}"
+       puts Colors.info("Results saved to: #{csv_file}")
      end
-     end
-
-     def run_tracking_cycle
-       timestamp = Time.now
-       puts "[#{timestamp.strftime('%Y-%m-%d %H:%M:%S')}] Running benchmark cycle..."

-       results = parallel_benchmark.run_silent
+     private

-       display_cycle_summary(results)
-     end
+     attr_reader :csv_file, :running, :next_run_time, :config, :config_manager, :results_formatter, :interval

-           provider_model = "#{result[:provider]}+#{result[:model]}"
-           csv_line = [
-             timestamp.strftime('%Y-%m-%d %H:%M:%S'),
-             provider_model,
-             result[:tokens_per_second],
-             result[:total_tokens],
-             result[:duration]
-           ].join(',') + "\n"
-           file.write(csv_line)
+     def setup_signal_handlers
+       Signal.trap("INT") do
+         puts "\n#{Colors.warning('Received interrupt signal, exiting immediately...')}"
+         exit 0
        end
-         end
-       end
-
-     def display_cycle_summary(results)
-       successful = results.select { |r| r[:success] }
-       failed = results.select { |r| !r[:success] }
-
-       puts " Completed: #{successful.length} successful, #{failed.length} failed"

+       Signal.trap("TERM") do
+         puts "\n#{Colors.warning('Received termination signal, exiting immediately...')}"
+         exit 0
+       end
      end

+     def initialize_csv
+       File.write(csv_file, "timestamp,provider_model,tokens_per_second,total_tokens,duration_seconds\n")
      end

-       provider_width = sorted_results.map { |r| r[:provider].length }.max
-       model_width = sorted_results.map { |r| r[:model].length }.max
-       tokens_width = 12
-       tps_width = 15
-       duration_width = 12
+     def run_tracking_cycle
+       timestamp = Time.now
+       puts "#{Colors.border("[#{timestamp.strftime('%Y-%m-%d %H:%M:%S')}]")} #{Colors.highlight('Running benchmark cycle...')}"

+       parallel_benchmark = ParallelBenchmark.new(config_manager:, print_result: false)
+       results = parallel_benchmark.run_silent

-       sorted_results.each do |result|
-         provider_col = result[:provider].ljust(provider_width)
-         model_col = result[:model].ljust(model_width)
+       write_results_to_csv(timestamp:, results:)
+       results_formatter.display_cycle_summary(results)
+     end

+     def write_results_to_csv(timestamp:, results:)
+       File.open(csv_file, "a") do |file|
+         results.each do |result|
+           next unless result[:success]
+
+           provider_model = "#{result[:provider]}: #{result[:model]}"
+           csv_line = [
+             timestamp.strftime("%Y-%m-%d %H:%M:%S"),
+             provider_model,
+             result[:tokens_per_second],
+             result[:total_tokens],
+             result[:duration]
+           ].join(",") << "\n"
+           file.write(csv_line)
+         end
        end
      end
    end
-
- end
+ end
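The tracker now rebuilds a `ParallelBenchmark` for every cycle and appends only successful results to the CSV, whose header row is `timestamp,provider_model,tokens_per_second,total_tokens,duration_seconds`. A hedged sketch of starting it, again assuming `ConfigurationManager.new` takes no arguments (not shown in this diff):

```ruby
require "llm_bench"

config_manager = LLMBench::ConfigurationManager.new  # assumed default constructor
tracker = LLMBench::Tracker.new(
  config_manager: config_manager,
  interval: 300,               # seconds between benchmark cycles (default is 600)
  output_file: "results.csv"   # defaults to a timestamped llm_benchmark_results_*.csv
)
tracker.start_tracking         # runs until interrupted with Ctrl+C
```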
data/lib/llm_bench/version.rb
CHANGED
data/lib/llm_bench.rb
CHANGED
@@ -1,9 +1,13 @@
+ # frozen_string_literal: true
+
  require_relative "llm_bench/version"
+ require_relative "llm_bench/configuration_manager"
+ require_relative "llm_bench/results_formatter"
+ require_relative "llm_bench/benchmark_factory"
  require_relative "llm_bench/benchmark"
  require_relative "llm_bench/parallel_benchmark"
  require_relative "llm_bench/tracker"

  module LLMBench
    class Error < StandardError; end
-
- end
+ end
data/llm_bench.gemspec
CHANGED
@@ -1,3 +1,5 @@
+ # frozen_string_literal: true
+
  lib = File.expand_path('lib', __dir__)
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
  require 'llm_bench/version'
@@ -9,14 +11,20 @@ Gem::Specification.new do |spec|
    spec.email = []

    spec.summary = "A tool for benchmarking LLM performance across different providers and models"
-   spec.description =
+   spec.description = <<~DESC
+     LLM Bench is a Ruby gem that allows you to benchmark and compare the performance
+     of different Large Language Model providers and APIs. It supports both OpenAI and
+     Anthropic-compatible API formats, provides parallel execution, and includes
+     continuous tracking capabilities with CSV export.
+   DESC
    spec.homepage = "https://github.com/vitobotta/llm-bench"
    spec.license = "MIT"
-   spec.required_ruby_version = ">= 2
+   spec.required_ruby_version = ">= 3.2"

    spec.metadata["homepage_uri"] = spec.homepage
    spec.metadata["source_code_uri"] = spec.homepage
    spec.metadata["changelog_uri"] = "#{spec.homepage}/blob/main/CHANGELOG.md"
+   spec.metadata['rubygems_mfa_required'] = 'true'

    spec.files = Dir.chdir(__dir__) do
      `git ls-files -z`.split("\x0").reject { |f| f.match(%r{\A(?:test|spec|features)/}) }
@@ -26,5 +34,6 @@ Gem::Specification.new do |spec|
    spec.executables = ["llm_bench"]
    spec.require_paths = ["lib"]

-   #
+   # Color support for enhanced output
+   spec.add_dependency "colorize", "~> 1.1"
  end
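One likely reason the Ruby floor jumps from the old `>= 2.x` line to `>= 3.2`: the refactored classes lean on the keyword-argument value-omission shorthand introduced in Ruby 3.1 (`config_manager:` instead of `config_manager: config_manager`). A small illustration; the variable names are just examples.

```ruby
config_manager = LLMBench::ConfigurationManager.new  # assumed no-arg constructor, as above
print_result   = true

# On Ruby >= 3.1 these two calls are equivalent; the gem's new code uses the short form.
LLMBench::ParallelBenchmark.new(config_manager:, print_result:)
LLMBench::ParallelBenchmark.new(config_manager: config_manager, print_result: print_result)
```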