benchgecko 0.1.1 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/benchgecko.rb +28 -246
- metadata +10 -15
- data/CHANGELOG.md +0 -9
- data/LICENSE.txt +0 -21
- data/README.md +0 -120
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 6a126fb765dd87b64cb48087748871672d902c2aee7913f1b3a2bc5eec495933
|
|
4
|
+
data.tar.gz: ee0aeaeec203627d07af67928e495f95c6828a3729020eea75a942e902e36e04
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 02d105ade04b5cff84f970c70c926ec961343763da3a47f15da4cc325d30a1ba98e57d377330345cce5306925f832b9f8847c1da530d2a04b43d7334e90a6375
|
|
7
|
+
data.tar.gz: 0fe0f60f0f26b53bb717c415da31b3c167a148f9c0dcc9ed4315c8c1aad7e0e61f3ef6d98c88c07c42accfd0058e0eec80fed5eab71d5a88c9b3ed402799af38
|
data/lib/benchgecko.rb
CHANGED
|
@@ -1,261 +1,43 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
# BenchGecko - The CoinGecko for AI
|
|
4
|
-
# Data platform for AI model benchmarks, pricing, and agent comparison
|
|
5
|
-
# https://benchgecko.ai
|
|
1
|
+
require 'net/http'
|
|
2
|
+
require 'json'
|
|
6
3
|
|
|
7
4
|
module BenchGecko
|
|
8
|
-
VERSION =
|
|
9
|
-
|
|
10
|
-
# Represents an AI model with its benchmark scores, pricing, and metadata.
|
|
11
|
-
class Model
|
|
12
|
-
attr_reader :id, :name, :provider, :parameters, :context_window,
|
|
13
|
-
:input_price, :output_price, :benchmarks, :metadata
|
|
14
|
-
|
|
15
|
-
def initialize(attrs = {})
|
|
16
|
-
@id = attrs[:id] || attrs["id"]
|
|
17
|
-
@name = attrs[:name] || attrs["name"]
|
|
18
|
-
@provider = attrs[:provider] || attrs["provider"]
|
|
19
|
-
@parameters = attrs[:parameters] || attrs["parameters"]
|
|
20
|
-
@context_window = attrs[:context_window] || attrs["context_window"]
|
|
21
|
-
@input_price = attrs[:input_price] || attrs["input_price"]
|
|
22
|
-
@output_price = attrs[:output_price] || attrs["output_price"]
|
|
23
|
-
@benchmarks = attrs[:benchmarks] || attrs["benchmarks"] || {}
|
|
24
|
-
@metadata = attrs[:metadata] || attrs["metadata"] || {}
|
|
25
|
-
end
|
|
26
|
-
|
|
27
|
-
# Cost per million tokens (input + output averaged)
|
|
28
|
-
def cost_per_million
|
|
29
|
-
return nil unless input_price && output_price
|
|
30
|
-
((input_price + output_price) / 2.0).round(4)
|
|
31
|
-
end
|
|
32
|
-
|
|
33
|
-
# Returns the score for a specific benchmark
|
|
34
|
-
def score(benchmark_name)
|
|
35
|
-
benchmarks[benchmark_name.to_s] || benchmarks[benchmark_name.to_sym]
|
|
36
|
-
end
|
|
37
|
-
|
|
38
|
-
# Returns a hash summary suitable for comparison tables
|
|
39
|
-
def to_summary
|
|
40
|
-
{
|
|
41
|
-
name: name,
|
|
42
|
-
provider: provider,
|
|
43
|
-
parameters: parameters,
|
|
44
|
-
context_window: context_window,
|
|
45
|
-
cost_per_million: cost_per_million
|
|
46
|
-
}
|
|
47
|
-
end
|
|
5
|
+
VERSION = '0.2.1'
|
|
6
|
+
BASE_URL = 'https://benchgecko.ai/api/v1'
|
|
48
7
|
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
end
|
|
8
|
+
def self.models(params = {})
|
|
9
|
+
get('/models', params)
|
|
52
10
|
end
|
|
53
11
|
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
attr_reader :id, :name, :category, :provider, :models_used,
|
|
57
|
-
:scores, :capabilities, :metadata
|
|
58
|
-
|
|
59
|
-
def initialize(attrs = {})
|
|
60
|
-
@id = attrs[:id] || attrs["id"]
|
|
61
|
-
@name = attrs[:name] || attrs["name"]
|
|
62
|
-
@category = attrs[:category] || attrs["category"]
|
|
63
|
-
@provider = attrs[:provider] || attrs["provider"]
|
|
64
|
-
@models_used = attrs[:models_used] || attrs["models_used"] || []
|
|
65
|
-
@scores = attrs[:scores] || attrs["scores"] || {}
|
|
66
|
-
@capabilities = attrs[:capabilities] || attrs["capabilities"] || []
|
|
67
|
-
@metadata = attrs[:metadata] || attrs["metadata"] || {}
|
|
68
|
-
end
|
|
69
|
-
|
|
70
|
-
def supports?(capability)
|
|
71
|
-
capabilities.include?(capability.to_s)
|
|
72
|
-
end
|
|
73
|
-
|
|
74
|
-
def to_s
|
|
75
|
-
"#{name} (#{category}) by #{provider}"
|
|
76
|
-
end
|
|
12
|
+
def self.model(slug)
|
|
13
|
+
get("/models/#{slug}")
|
|
77
14
|
end
|
|
78
15
|
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
name: "Reasoning",
|
|
83
|
-
benchmarks: %w[MMLU MMLU-Pro ARC-Challenge HellaSwag WinoGrande GPQA],
|
|
84
|
-
description: "Logical reasoning, knowledge, and common sense"
|
|
85
|
-
},
|
|
86
|
-
coding: {
|
|
87
|
-
name: "Coding",
|
|
88
|
-
benchmarks: %w[HumanEval MBPP SWE-bench LiveCodeBench BigCodeBench],
|
|
89
|
-
description: "Code generation, debugging, and software engineering"
|
|
90
|
-
},
|
|
91
|
-
math: {
|
|
92
|
-
name: "Mathematics",
|
|
93
|
-
benchmarks: %w[GSM8K MATH AIME AMC Competition-Math],
|
|
94
|
-
description: "Mathematical problem solving from arithmetic to olympiad"
|
|
95
|
-
},
|
|
96
|
-
instruction: {
|
|
97
|
-
name: "Instruction Following",
|
|
98
|
-
benchmarks: %w[IFEval MT-Bench AlpacaEval Chatbot-Arena],
|
|
99
|
-
description: "Following complex instructions and conversational ability"
|
|
100
|
-
},
|
|
101
|
-
safety: {
|
|
102
|
-
name: "Safety",
|
|
103
|
-
benchmarks: %w[TruthfulQA BBQ ToxiGen BOLD],
|
|
104
|
-
description: "Truthfulness, bias, and safety alignment"
|
|
105
|
-
},
|
|
106
|
-
multimodal: {
|
|
107
|
-
name: "Multimodal",
|
|
108
|
-
benchmarks: %w[MMMU MathVista VQAv2 TextVQA DocVQA],
|
|
109
|
-
description: "Vision, document understanding, and cross-modal reasoning"
|
|
110
|
-
},
|
|
111
|
-
multilingual: {
|
|
112
|
-
name: "Multilingual",
|
|
113
|
-
benchmarks: %w[MGSM XL-Sum FLORES],
|
|
114
|
-
description: "Performance across languages and translation"
|
|
115
|
-
},
|
|
116
|
-
long_context: {
|
|
117
|
-
name: "Long Context",
|
|
118
|
-
benchmarks: %w[RULER NIAH InfiniteBench LongBench],
|
|
119
|
-
description: "Retrieval and reasoning over long documents"
|
|
120
|
-
}
|
|
121
|
-
}.freeze
|
|
122
|
-
|
|
123
|
-
# Built-in model catalog with real benchmark data and pricing
|
|
124
|
-
MODELS = {
|
|
125
|
-
"gpt-4o" => {
|
|
126
|
-
name: "GPT-4o", provider: "OpenAI", parameters: 200,
|
|
127
|
-
context_window: 128_000, input_price: 2.50, output_price: 10.00,
|
|
128
|
-
benchmarks: { "MMLU" => 88.7, "HumanEval" => 90.2, "GSM8K" => 95.8, "GPQA" => 53.6 }
|
|
129
|
-
},
|
|
130
|
-
"claude-3.5-sonnet" => {
|
|
131
|
-
name: "Claude 3.5 Sonnet", provider: "Anthropic", parameters: nil,
|
|
132
|
-
context_window: 200_000, input_price: 3.00, output_price: 15.00,
|
|
133
|
-
benchmarks: { "MMLU" => 88.7, "HumanEval" => 92.0, "GSM8K" => 96.4, "GPQA" => 59.4 }
|
|
134
|
-
},
|
|
135
|
-
"gemini-2.0-flash" => {
|
|
136
|
-
name: "Gemini 2.0 Flash", provider: "Google", parameters: nil,
|
|
137
|
-
context_window: 1_000_000, input_price: 0.10, output_price: 0.40,
|
|
138
|
-
benchmarks: { "MMLU" => 85.2, "HumanEval" => 84.0, "GSM8K" => 92.1 }
|
|
139
|
-
},
|
|
140
|
-
"llama-3.1-405b" => {
|
|
141
|
-
name: "Llama 3.1 405B", provider: "Meta", parameters: 405,
|
|
142
|
-
context_window: 128_000, input_price: 3.00, output_price: 3.00,
|
|
143
|
-
benchmarks: { "MMLU" => 88.6, "HumanEval" => 89.0, "GSM8K" => 96.8, "GPQA" => 50.7 }
|
|
144
|
-
},
|
|
145
|
-
"mistral-large" => {
|
|
146
|
-
name: "Mistral Large", provider: "Mistral", parameters: 123,
|
|
147
|
-
context_window: 128_000, input_price: 2.00, output_price: 6.00,
|
|
148
|
-
benchmarks: { "MMLU" => 84.0, "HumanEval" => 82.0, "GSM8K" => 91.2 }
|
|
149
|
-
},
|
|
150
|
-
"deepseek-v3" => {
|
|
151
|
-
name: "DeepSeek V3", provider: "DeepSeek", parameters: 671,
|
|
152
|
-
context_window: 128_000, input_price: 0.27, output_price: 1.10,
|
|
153
|
-
benchmarks: { "MMLU" => 87.1, "HumanEval" => 82.6, "GSM8K" => 89.3, "GPQA" => 59.1 }
|
|
154
|
-
}
|
|
155
|
-
}.freeze
|
|
156
|
-
|
|
157
|
-
class << self
|
|
158
|
-
# Retrieve a model by its identifier
|
|
159
|
-
#
|
|
160
|
-
# model = BenchGecko.get_model("gpt-4o")
|
|
161
|
-
# model.name #=> "GPT-4o"
|
|
162
|
-
# model.provider #=> "OpenAI"
|
|
163
|
-
# model.score("MMLU") #=> 88.7
|
|
164
|
-
#
|
|
165
|
-
def get_model(model_id)
|
|
166
|
-
data = MODELS[model_id.to_s]
|
|
167
|
-
return nil unless data
|
|
168
|
-
Model.new(data.merge(id: model_id.to_s))
|
|
169
|
-
end
|
|
170
|
-
|
|
171
|
-
# List all available model identifiers
|
|
172
|
-
def list_models
|
|
173
|
-
MODELS.keys
|
|
174
|
-
end
|
|
175
|
-
|
|
176
|
-
# Compare two models side by side across benchmarks and pricing
|
|
177
|
-
#
|
|
178
|
-
# result = BenchGecko.compare_models("gpt-4o", "claude-3.5-sonnet")
|
|
179
|
-
# result[:benchmark_diff] #=> {"MMLU" => 0.0, "HumanEval" => -1.8, ...}
|
|
180
|
-
# result[:cheaper] #=> "gpt-4o"
|
|
181
|
-
#
|
|
182
|
-
def compare_models(model_a_id, model_b_id)
|
|
183
|
-
a = get_model(model_a_id)
|
|
184
|
-
b = get_model(model_b_id)
|
|
185
|
-
return nil unless a && b
|
|
186
|
-
|
|
187
|
-
all_benchmarks = (a.benchmarks.keys + b.benchmarks.keys).uniq
|
|
188
|
-
benchmark_diff = {}
|
|
189
|
-
all_benchmarks.each do |bench|
|
|
190
|
-
score_a = a.score(bench)
|
|
191
|
-
score_b = b.score(bench)
|
|
192
|
-
benchmark_diff[bench] = (score_a && score_b) ? (score_a - score_b).round(2) : nil
|
|
193
|
-
end
|
|
194
|
-
|
|
195
|
-
cost_a = a.cost_per_million
|
|
196
|
-
cost_b = b.cost_per_million
|
|
197
|
-
cheaper = if cost_a && cost_b
|
|
198
|
-
cost_a <= cost_b ? model_a_id : model_b_id
|
|
199
|
-
end
|
|
200
|
-
|
|
201
|
-
{
|
|
202
|
-
model_a: a.to_summary,
|
|
203
|
-
model_b: b.to_summary,
|
|
204
|
-
benchmark_diff: benchmark_diff,
|
|
205
|
-
cheaper: cheaper,
|
|
206
|
-
cost_ratio: (cost_a && cost_b && cost_b > 0) ? (cost_a / cost_b).round(2) : nil
|
|
207
|
-
}
|
|
208
|
-
end
|
|
16
|
+
def self.benchmarks
|
|
17
|
+
get('/benchmarks')
|
|
18
|
+
end
|
|
209
19
|
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
# #=> { input_cost: 2.50, output_cost: 5.00, total: 7.50 }
|
|
214
|
-
#
|
|
215
|
-
def estimate_cost(model_id, input_tokens:, output_tokens: 0)
|
|
216
|
-
model = get_model(model_id)
|
|
217
|
-
return nil unless model&.input_price && model&.output_price
|
|
20
|
+
def self.compare(*slugs)
|
|
21
|
+
get('/compare', models: slugs.join(','))
|
|
22
|
+
end
|
|
218
23
|
|
|
219
|
-
|
|
220
|
-
|
|
24
|
+
def self.pricing(slug = nil)
|
|
25
|
+
slug ? get("/pricing/#{slug}") : get('/pricing')
|
|
26
|
+
end
|
|
221
27
|
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
output_tokens: output_tokens,
|
|
226
|
-
input_cost: input_cost,
|
|
227
|
-
output_cost: output_cost,
|
|
228
|
-
total: (input_cost + output_cost).round(4)
|
|
229
|
-
}
|
|
230
|
-
end
|
|
28
|
+
def self.providers
|
|
29
|
+
get('/providers')
|
|
30
|
+
end
|
|
231
31
|
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
end
|
|
32
|
+
def self.agents
|
|
33
|
+
get('/agents')
|
|
34
|
+
end
|
|
236
35
|
|
|
237
|
-
|
|
238
|
-
#
|
|
239
|
-
# BenchGecko.top_models("MMLU", min_score: 87.0)
|
|
240
|
-
# #=> [Model, Model, ...]
|
|
241
|
-
#
|
|
242
|
-
def top_models(benchmark, min_score: 0)
|
|
243
|
-
MODELS.filter_map do |id, data|
|
|
244
|
-
score = data[:benchmarks][benchmark]
|
|
245
|
-
next unless score && score >= min_score
|
|
246
|
-
get_model(id)
|
|
247
|
-
end.sort_by { |m| -m.score(benchmark) }
|
|
248
|
-
end
|
|
36
|
+
private
|
|
249
37
|
|
|
250
|
-
|
|
251
|
-
#
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
#
|
|
255
|
-
def cheapest_above(benchmark, min_score)
|
|
256
|
-
top_models(benchmark, min_score: min_score)
|
|
257
|
-
.select(&:cost_per_million)
|
|
258
|
-
.min_by(&:cost_per_million)
|
|
259
|
-
end
|
|
38
|
+
def self.get(path, params = {})
|
|
39
|
+
uri = URI("#{BASE_URL}#{path}")
|
|
40
|
+
uri.query = URI.encode_www_form(params) unless params.empty?
|
|
41
|
+
JSON.parse(Net::HTTP.get(uri))
|
|
260
42
|
end
|
|
261
43
|
end
|
metadata
CHANGED
|
@@ -1,28 +1,23 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: benchgecko
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.1
|
|
4
|
+
version: 0.2.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- BenchGecko
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-
|
|
11
|
+
date: 2026-04-25 00:00:00.000000000 Z
|
|
12
12
|
dependencies: []
|
|
13
|
-
description:
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
email:
|
|
18
|
-
- hello@benchgecko.ai
|
|
13
|
+
description: Query AI model data, benchmark scores, and run side-by-side comparisons.
|
|
14
|
+
BenchGecko tracks every major AI model, benchmark, and provider with cross-provider
|
|
15
|
+
pricing.
|
|
16
|
+
email: hello@benchgecko.ai
|
|
19
17
|
executables: []
|
|
20
18
|
extensions: []
|
|
21
19
|
extra_rdoc_files: []
|
|
22
20
|
files:
|
|
23
|
-
- CHANGELOG.md
|
|
24
|
-
- LICENSE.txt
|
|
25
|
-
- README.md
|
|
26
21
|
- lib/benchgecko.rb
|
|
27
22
|
homepage: https://benchgecko.ai
|
|
28
23
|
licenses:
|
|
@@ -30,7 +25,8 @@ licenses:
|
|
|
30
25
|
metadata:
|
|
31
26
|
homepage_uri: https://benchgecko.ai
|
|
32
27
|
source_code_uri: https://github.com/BenchGecko/benchgecko-ruby
|
|
33
|
-
|
|
28
|
+
documentation_uri: https://benchgecko.ai/api-docs
|
|
29
|
+
changelog_uri: https://benchgecko.ai/changelog
|
|
34
30
|
post_install_message:
|
|
35
31
|
rdoc_options: []
|
|
36
32
|
require_paths:
|
|
@@ -39,7 +35,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
39
35
|
requirements:
|
|
40
36
|
- - ">="
|
|
41
37
|
- !ruby/object:Gem::Version
|
|
42
|
-
version:
|
|
38
|
+
version: '0'
|
|
43
39
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
44
40
|
requirements:
|
|
45
41
|
- - ">="
|
|
@@ -49,6 +45,5 @@ requirements: []
|
|
|
49
45
|
rubygems_version: 3.0.3.1
|
|
50
46
|
signing_key:
|
|
51
47
|
specification_version: 4
|
|
52
|
-
summary: Ruby
|
|
53
|
-
and agent comparison.
|
|
48
|
+
summary: Ruby SDK for BenchGecko AI model data platform
|
|
54
49
|
test_files: []
|
data/CHANGELOG.md
DELETED
|
@@ -1,9 +0,0 @@
|
|
|
1
|
-
# Changelog
|
|
2
|
-
|
|
3
|
-
## 0.1.0 (2026-03-30)
|
|
4
|
-
|
|
5
|
-
- Initial release
|
|
6
|
-
- Model lookup, comparison, and cost estimation
|
|
7
|
-
- Built-in catalog: GPT-4o, Claude 3.5 Sonnet, Gemini 2.0 Flash, Llama 3.1 405B, Mistral Large, DeepSeek V3
|
|
8
|
-
- Benchmark categories: reasoning, coding, math, instruction, safety, multimodal, multilingual, long context
|
|
9
|
-
- Top models filtering and cheapest-above-threshold finder
|
data/LICENSE.txt
DELETED
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
MIT License
|
|
2
|
-
|
|
3
|
-
Copyright (c) 2026 BenchGecko
|
|
4
|
-
|
|
5
|
-
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
-
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
-
in the Software without restriction, including without limitation the rights
|
|
8
|
-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
-
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
-
furnished to do so, subject to the following conditions:
|
|
11
|
-
|
|
12
|
-
The above copyright notice and this permission notice shall be included in all
|
|
13
|
-
copies or substantial portions of the Software.
|
|
14
|
-
|
|
15
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
-
SOFTWARE.
|
data/README.md
DELETED
|
@@ -1,120 +0,0 @@
|
|
|
1
|
-
# BenchGecko for Ruby
|
|
2
|
-
|
|
3
|
-
**The CoinGecko for AI.** Ruby client for accessing AI model benchmarks, comparing language models, estimating inference costs, and discovering AI agents.
|
|
4
|
-
|
|
5
|
-
BenchGecko tracks 300+ AI models across 50+ providers with real benchmark scores, latency metrics, and transparent pricing. This gem gives you structured access to that data directly in your Ruby applications -- no API key required for the built-in catalog.
|
|
6
|
-
|
|
7
|
-
## Installation
|
|
8
|
-
|
|
9
|
-
Add to your Gemfile:
|
|
10
|
-
|
|
11
|
-
```ruby
|
|
12
|
-
gem "benchgecko"
|
|
13
|
-
```
|
|
14
|
-
|
|
15
|
-
Or install directly:
|
|
16
|
-
|
|
17
|
-
```bash
|
|
18
|
-
gem install benchgecko
|
|
19
|
-
```
|
|
20
|
-
|
|
21
|
-
## Quick Start
|
|
22
|
-
|
|
23
|
-
```ruby
|
|
24
|
-
require "benchgecko"
|
|
25
|
-
|
|
26
|
-
# Look up any model
|
|
27
|
-
model = BenchGecko.get_model("claude-3.5-sonnet")
|
|
28
|
-
puts model.name #=> "Claude 3.5 Sonnet"
|
|
29
|
-
puts model.provider #=> "Anthropic"
|
|
30
|
-
puts model.score("MMLU") #=> 88.7
|
|
31
|
-
|
|
32
|
-
# List all tracked models
|
|
33
|
-
BenchGecko.list_models.each { |id| puts id }
|
|
34
|
-
```
|
|
35
|
-
|
|
36
|
-
## Comparing Models
|
|
37
|
-
|
|
38
|
-
The comparison engine surfaces benchmark differences and pricing ratios, making it straightforward to evaluate tradeoffs between models:
|
|
39
|
-
|
|
40
|
-
```ruby
|
|
41
|
-
result = BenchGecko.compare_models("gpt-4o", "claude-3.5-sonnet")
|
|
42
|
-
|
|
43
|
-
puts result[:cheaper] #=> "gpt-4o"
|
|
44
|
-
puts result[:cost_ratio] #=> 0.69
|
|
45
|
-
puts result[:benchmark_diff] #=> {"MMLU" => 0.0, "HumanEval" => -1.8, ...}
|
|
46
|
-
|
|
47
|
-
# Positive diff means model_a scores higher
|
|
48
|
-
result[:benchmark_diff].each do |bench, diff|
|
|
49
|
-
next unless diff
|
|
50
|
-
winner = diff >= 0 ? "GPT-4o" : "Claude 3.5 Sonnet"
|
|
51
|
-
puts "#{bench}: #{winner} wins by #{diff.abs} points"
|
|
52
|
-
end
|
|
53
|
-
```
|
|
54
|
-
|
|
55
|
-
## Cost Estimation
|
|
56
|
-
|
|
57
|
-
Estimate inference costs before committing to a provider. Prices are per million tokens:
|
|
58
|
-
|
|
59
|
-
```ruby
|
|
60
|
-
cost = BenchGecko.estimate_cost("gpt-4o",
|
|
61
|
-
input_tokens: 2_000_000,
|
|
62
|
-
output_tokens: 500_000
|
|
63
|
-
)
|
|
64
|
-
|
|
65
|
-
puts cost[:input_cost] #=> 5.0
|
|
66
|
-
puts cost[:output_cost] #=> 5.0
|
|
67
|
-
puts cost[:total] #=> 10.0
|
|
68
|
-
```
|
|
69
|
-
|
|
70
|
-
## Finding the Right Model
|
|
71
|
-
|
|
72
|
-
Filter models by benchmark performance to find the best fit for your workload:
|
|
73
|
-
|
|
74
|
-
```ruby
|
|
75
|
-
# All models scoring 87+ on MMLU
|
|
76
|
-
strong_reasoners = BenchGecko.top_models("MMLU", min_score: 87.0)
|
|
77
|
-
strong_reasoners.each { |m| puts "#{m.name}: #{m.score('MMLU')}" }
|
|
78
|
-
|
|
79
|
-
# Cheapest model above a quality threshold
|
|
80
|
-
budget_pick = BenchGecko.cheapest_above("MMLU", 85.0)
|
|
81
|
-
puts "#{budget_pick.name} at $#{budget_pick.cost_per_million}/M tokens"
|
|
82
|
-
```
|
|
83
|
-
|
|
84
|
-
## Benchmark Categories
|
|
85
|
-
|
|
86
|
-
BenchGecko organizes benchmarks into categories covering reasoning, coding, math, instruction following, safety, multimodal, multilingual, and long context evaluation:
|
|
87
|
-
|
|
88
|
-
```ruby
|
|
89
|
-
BenchGecko.benchmark_categories.each do |key, info|
|
|
90
|
-
puts "#{info[:name]}: #{info[:benchmarks].join(', ')}"
|
|
91
|
-
puts " #{info[:description]}"
|
|
92
|
-
end
|
|
93
|
-
```
|
|
94
|
-
|
|
95
|
-
## Built-in Model Catalog
|
|
96
|
-
|
|
97
|
-
The gem ships with a curated catalog of major models from OpenAI, Anthropic, Google, Meta, Mistral, and DeepSeek. Each entry includes benchmark scores, parameter counts, context window sizes, and per-token pricing.
|
|
98
|
-
|
|
99
|
-
```ruby
|
|
100
|
-
model = BenchGecko.get_model("deepseek-v3")
|
|
101
|
-
puts model.parameters #=> 671
|
|
102
|
-
puts model.context_window #=> 128000
|
|
103
|
-
puts model.cost_per_million #=> 0.685
|
|
104
|
-
```
|
|
105
|
-
|
|
106
|
-
## Use Cases
|
|
107
|
-
|
|
108
|
-
- **Model selection pipelines** -- programmatically pick the cheapest model that meets your quality bar
|
|
109
|
-
- **Cost monitoring** -- estimate monthly spend across different model configurations
|
|
110
|
-
- **Benchmark dashboards** -- pull structured scores into internal reporting tools
|
|
111
|
-
- **Agent evaluation** -- compare AI agents across capability dimensions
|
|
112
|
-
|
|
113
|
-
## Resources
|
|
114
|
-
|
|
115
|
-
- [BenchGecko](https://benchgecko.ai) -- Full platform with interactive comparisons
|
|
116
|
-
- [Source Code](https://github.com/BenchGecko/benchgecko-ruby) -- Contributions welcome
|
|
117
|
-
|
|
118
|
-
## License
|
|
119
|
-
|
|
120
|
-
MIT License. See [LICENSE.txt](LICENSE.txt) for details.
|