benchgecko 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/README.md +77 -47
- data/lib/benchgecko.rb +232 -83
- metadata +14 -23
- /data/{LICENSE → LICENSE.txt} +0 -0
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 2e3c24fda70edf90737896315311a7fd3dadf39bdc5b72c0f80fa8d70aa794d5
|
|
4
|
+
data.tar.gz: 63f534363f7724060ff2f430a65cda55b1d4d432ec3d566e7aa38527a8e5a14e
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 11141fb93c9345a50e484fe6597b89c205fb2cba4a81451929e19af888c6de94e1a71a8b7413073c8c990fb7a211a14ca498dcdce2b58b49743cfe6d35aa0867
|
|
7
|
+
data.tar.gz: 4b6ef5a4f8d420710122b3a903e4e3d185298463bd8983b8911d1c00f9aede1c1a77a9cb1f13bdffc349448ab5e615b087b929c67cf8c5f55164ab006e09b5c5
|
data/CHANGELOG.md
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## 0.1.0 (2026-03-30)
|
|
4
|
+
|
|
5
|
+
- Initial release
|
|
6
|
+
- Model lookup, comparison, and cost estimation
|
|
7
|
+
- Built-in catalog: GPT-4o, Claude 3.5 Sonnet, Gemini 2.0 Flash, Llama 3.1 405B, Mistral Large, DeepSeek V3
|
|
8
|
+
- Benchmark categories: reasoning, coding, math, instruction, safety, multimodal, multilingual, long context
|
|
9
|
+
- Top models filtering and cheapest-above-threshold finder
|
data/README.md
CHANGED
|
@@ -1,90 +1,120 @@
|
|
|
1
|
-
# BenchGecko Ruby
|
|
1
|
+
# BenchGecko for Ruby
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
**The CoinGecko for AI.** Ruby client for accessing AI model benchmarks, comparing language models, estimating inference costs, and discovering AI agents.
|
|
4
4
|
|
|
5
|
-
BenchGecko tracks
|
|
5
|
+
BenchGecko tracks 300+ AI models across 50+ providers with real benchmark scores, latency metrics, and transparent pricing. This gem gives you structured access to that data directly in your Ruby applications -- no API key required for the built-in catalog.
|
|
6
6
|
|
|
7
7
|
## Installation
|
|
8
8
|
|
|
9
|
-
|
|
10
|
-
gem install benchgecko
|
|
11
|
-
```
|
|
12
|
-
|
|
13
|
-
Or add to your Gemfile:
|
|
9
|
+
Add to your Gemfile:
|
|
14
10
|
|
|
15
11
|
```ruby
|
|
16
12
|
gem "benchgecko"
|
|
17
13
|
```
|
|
18
14
|
|
|
19
|
-
|
|
15
|
+
Or install directly:
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
gem install benchgecko
|
|
19
|
+
```
|
|
20
20
|
|
|
21
21
|
## Quick Start
|
|
22
22
|
|
|
23
23
|
```ruby
|
|
24
24
|
require "benchgecko"
|
|
25
25
|
|
|
26
|
-
|
|
26
|
+
# Look up any model
|
|
27
|
+
model = BenchGecko.get_model("claude-3.5-sonnet")
|
|
28
|
+
puts model.name #=> "Claude 3.5 Sonnet"
|
|
29
|
+
puts model.provider #=> "Anthropic"
|
|
30
|
+
puts model.score("MMLU") #=> 88.7
|
|
27
31
|
|
|
28
|
-
# List all tracked
|
|
29
|
-
|
|
30
|
-
|
|
32
|
+
# List all tracked models
|
|
33
|
+
BenchGecko.list_models.each { |id| puts id }
|
|
34
|
+
```
|
|
31
35
|
|
|
32
|
-
|
|
33
|
-
benchmarks = client.benchmarks
|
|
34
|
-
benchmarks.first(5).each { |b| puts b["name"] }
|
|
36
|
+
## Comparing Models
|
|
35
37
|
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
38
|
+
The comparison engine surfaces benchmark differences and pricing ratios, making it straightforward to evaluate tradeoffs between models:
|
|
39
|
+
|
|
40
|
+
```ruby
|
|
41
|
+
result = BenchGecko.compare_models("gpt-4o", "claude-3.5-sonnet")
|
|
42
|
+
|
|
43
|
+
puts result[:cheaper] #=> "gpt-4o"
|
|
44
|
+
puts result[:cost_ratio] #=> 0.69
|
|
45
|
+
puts result[:benchmark_diff] #=> {"MMLU" => 0.0, "HumanEval" => -1.8, ...}
|
|
46
|
+
|
|
47
|
+
# Positive diff means model_a scores higher
|
|
48
|
+
result[:benchmark_diff].each do |bench, diff|
|
|
49
|
+
next unless diff
|
|
50
|
+
winner = diff >= 0 ? "GPT-4o" : "Claude 3.5 Sonnet"
|
|
51
|
+
puts "#{bench}: #{winner} wins by #{diff.abs} points"
|
|
40
52
|
end
|
|
41
53
|
```
|
|
42
54
|
|
|
43
|
-
##
|
|
55
|
+
## Cost Estimation
|
|
44
56
|
|
|
45
|
-
|
|
57
|
+
Estimate inference costs before committing to a provider. Prices are per million tokens:
|
|
46
58
|
|
|
47
|
-
|
|
59
|
+
```ruby
|
|
60
|
+
cost = BenchGecko.estimate_cost("gpt-4o",
|
|
61
|
+
input_tokens: 2_000_000,
|
|
62
|
+
output_tokens: 500_000
|
|
63
|
+
)
|
|
64
|
+
|
|
65
|
+
puts cost[:input_cost] #=> 5.0
|
|
66
|
+
puts cost[:output_cost] #=> 5.0
|
|
67
|
+
puts cost[:total] #=> 10.0
|
|
68
|
+
```
|
|
48
69
|
|
|
49
|
-
|
|
50
|
-
|-----------|------|---------|-------------|
|
|
51
|
-
| `base_url` | String | `https://benchgecko.ai` | API base URL |
|
|
52
|
-
| `timeout` | Integer | `30` | HTTP timeout in seconds |
|
|
70
|
+
## Finding the Right Model
|
|
53
71
|
|
|
54
|
-
|
|
72
|
+
Filter models by benchmark performance to find the best fit for your workload:
|
|
55
73
|
|
|
56
|
-
|
|
74
|
+
```ruby
|
|
75
|
+
# All models scoring 87+ on MMLU
|
|
76
|
+
strong_reasoners = BenchGecko.top_models("MMLU", min_score: 87.0)
|
|
77
|
+
strong_reasoners.each { |m| puts "#{m.name}: #{m.score('MMLU')}" }
|
|
57
78
|
|
|
58
|
-
|
|
79
|
+
# Cheapest model above a quality threshold
|
|
80
|
+
budget_pick = BenchGecko.cheapest_above("MMLU", 85.0)
|
|
81
|
+
puts "#{budget_pick.name} at $#{budget_pick.cost_per_million}/M tokens"
|
|
82
|
+
```
|
|
59
83
|
|
|
60
|
-
|
|
84
|
+
## Benchmark Categories
|
|
61
85
|
|
|
62
|
-
|
|
86
|
+
BenchGecko organizes benchmarks into categories covering reasoning, coding, math, instruction following, safety, multimodal, multilingual, and long context evaluation:
|
|
63
87
|
|
|
64
|
-
|
|
88
|
+
```ruby
|
|
89
|
+
BenchGecko.benchmark_categories.each do |key, info|
|
|
90
|
+
puts "#{info[:name]}: #{info[:benchmarks].join(', ')}"
|
|
91
|
+
puts " #{info[:description]}"
|
|
92
|
+
end
|
|
93
|
+
```
|
|
65
94
|
|
|
66
|
-
##
|
|
95
|
+
## Built-in Model Catalog
|
|
67
96
|
|
|
68
|
-
|
|
97
|
+
The gem ships with a curated catalog of major models from OpenAI, Anthropic, Google, Meta, Mistral, and DeepSeek. Each entry includes benchmark scores, parameter counts (where available), context window sizes, and pricing per million tokens.
|
|
69
98
|
|
|
70
99
|
```ruby
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
end
|
|
100
|
+
model = BenchGecko.get_model("deepseek-v3")
|
|
101
|
+
puts model.parameters #=> 671
|
|
102
|
+
puts model.context_window #=> 128000
|
|
103
|
+
puts model.cost_per_million #=> 0.685
|
|
76
104
|
```
|
|
77
105
|
|
|
78
|
-
##
|
|
106
|
+
## Use Cases
|
|
79
107
|
|
|
80
|
-
|
|
108
|
+
- **Model selection pipelines** -- programmatically pick the cheapest model that meets your quality bar
|
|
109
|
+
- **Cost monitoring** -- estimate monthly spend across different model configurations
|
|
110
|
+
- **Benchmark dashboards** -- pull structured scores into internal reporting tools
|
|
111
|
+
- **Agent evaluation** -- compare AI agents across capability dimensions
|
|
81
112
|
|
|
82
|
-
##
|
|
113
|
+
## Resources
|
|
83
114
|
|
|
84
|
-
- [BenchGecko](https://benchgecko.ai)
|
|
85
|
-
- [
|
|
86
|
-
- [GitHub Repository](https://github.com/BenchGecko/benchgecko-ruby)
|
|
115
|
+
- [BenchGecko](https://benchgecko.ai) -- Full platform with interactive comparisons
|
|
116
|
+
- [Source Code](https://github.com/BenchGecko/benchgecko-ruby) -- Contributions welcome
|
|
87
117
|
|
|
88
118
|
## License
|
|
89
119
|
|
|
90
|
-
MIT
|
|
120
|
+
MIT License. See [LICENSE.txt](LICENSE.txt) for details.
|
data/lib/benchgecko.rb
CHANGED
|
@@ -1,112 +1,261 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
# BenchGecko - Official Ruby SDK for the BenchGecko API.
|
|
8
|
-
#
|
|
9
|
-
# Query AI model data, benchmark scores, and run side-by-side
|
|
10
|
-
# model comparisons programmatically.
|
|
11
|
-
#
|
|
12
|
-
# @example Basic usage
|
|
13
|
-
# client = BenchGecko::Client.new
|
|
14
|
-
# models = client.models
|
|
15
|
-
# puts "Tracking #{models.length} models"
|
|
16
|
-
#
|
|
3
|
+
# BenchGecko - The CoinGecko for AI
|
|
4
|
+
# Data platform for AI model benchmarks, pricing, and agent comparison
|
|
5
|
+
# https://benchgecko.ai
|
|
6
|
+
|
|
17
7
|
module BenchGecko
|
|
18
|
-
VERSION = "0.1.0"
|
|
19
|
-
|
|
8
|
+
VERSION = "0.1.1"
|
|
9
|
+
|
|
10
|
+
# Represents an AI model with its benchmark scores, pricing, and metadata.
|
|
11
|
+
class Model
|
|
12
|
+
attr_reader :id, :name, :provider, :parameters, :context_window,
|
|
13
|
+
:input_price, :output_price, :benchmarks, :metadata
|
|
20
14
|
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
15
|
+
def initialize(attrs = {})
|
|
16
|
+
@id = attrs[:id] || attrs["id"]
|
|
17
|
+
@name = attrs[:name] || attrs["name"]
|
|
18
|
+
@provider = attrs[:provider] || attrs["provider"]
|
|
19
|
+
@parameters = attrs[:parameters] || attrs["parameters"]
|
|
20
|
+
@context_window = attrs[:context_window] || attrs["context_window"]
|
|
21
|
+
@input_price = attrs[:input_price] || attrs["input_price"]
|
|
22
|
+
@output_price = attrs[:output_price] || attrs["output_price"]
|
|
23
|
+
@benchmarks = attrs[:benchmarks] || attrs["benchmarks"] || {}
|
|
24
|
+
@metadata = attrs[:metadata] || attrs["metadata"] || {}
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
# Blended cost per million tokens: the simple average of the input and output prices, rounded to 4 decimals (nil if either price is missing)
|
|
28
|
+
def cost_per_million
|
|
29
|
+
return nil unless input_price && output_price
|
|
30
|
+
((input_price + output_price) / 2.0).round(4)
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
# Returns the score for a specific benchmark
|
|
34
|
+
def score(benchmark_name)
|
|
35
|
+
benchmarks[benchmark_name.to_s] || benchmarks[benchmark_name.to_sym]
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
# Returns a hash summary suitable for comparison tables
|
|
39
|
+
def to_summary
|
|
40
|
+
{
|
|
41
|
+
name: name,
|
|
42
|
+
provider: provider,
|
|
43
|
+
parameters: parameters,
|
|
44
|
+
context_window: context_window,
|
|
45
|
+
cost_per_million: cost_per_million
|
|
46
|
+
}
|
|
47
|
+
end
|
|
25
48
|
|
|
26
|
-
def
|
|
27
|
-
|
|
28
|
-
super(message)
|
|
49
|
+
def to_s
|
|
50
|
+
"#{name} (#{provider}) - #{parameters}B params"
|
|
29
51
|
end
|
|
30
52
|
end
|
|
31
53
|
|
|
32
|
-
#
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
@
|
|
43
|
-
@
|
|
54
|
+
# Represents an AI agent with capabilities and scores.
|
|
55
|
+
class Agent
|
|
56
|
+
attr_reader :id, :name, :category, :provider, :models_used,
|
|
57
|
+
:scores, :capabilities, :metadata
|
|
58
|
+
|
|
59
|
+
def initialize(attrs = {})
|
|
60
|
+
@id = attrs[:id] || attrs["id"]
|
|
61
|
+
@name = attrs[:name] || attrs["name"]
|
|
62
|
+
@category = attrs[:category] || attrs["category"]
|
|
63
|
+
@provider = attrs[:provider] || attrs["provider"]
|
|
64
|
+
@models_used = attrs[:models_used] || attrs["models_used"] || []
|
|
65
|
+
@scores = attrs[:scores] || attrs["scores"] || {}
|
|
66
|
+
@capabilities = attrs[:capabilities] || attrs["capabilities"] || []
|
|
67
|
+
@metadata = attrs[:metadata] || attrs["metadata"] || {}
|
|
44
68
|
end
|
|
45
69
|
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
# @return [Array<Hash>] Array of model hashes with metadata,
|
|
49
|
-
# benchmark scores, and pricing information.
|
|
50
|
-
#
|
|
51
|
-
# @example
|
|
52
|
-
# models = client.models
|
|
53
|
-
# models.each { |m| puts m["name"] }
|
|
54
|
-
def models
|
|
55
|
-
request("/api/v1/models")
|
|
70
|
+
def supports?(capability)
|
|
71
|
+
capabilities.include?(capability.to_s)
|
|
56
72
|
end
|
|
57
73
|
|
|
58
|
-
|
|
74
|
+
def to_s
|
|
75
|
+
"#{name} (#{category}) by #{provider}"
|
|
76
|
+
end
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
# Benchmark categories tracked by BenchGecko
|
|
80
|
+
BENCHMARK_CATEGORIES = {
|
|
81
|
+
reasoning: {
|
|
82
|
+
name: "Reasoning",
|
|
83
|
+
benchmarks: %w[MMLU MMLU-Pro ARC-Challenge HellaSwag WinoGrande GPQA],
|
|
84
|
+
description: "Logical reasoning, knowledge, and common sense"
|
|
85
|
+
},
|
|
86
|
+
coding: {
|
|
87
|
+
name: "Coding",
|
|
88
|
+
benchmarks: %w[HumanEval MBPP SWE-bench LiveCodeBench BigCodeBench],
|
|
89
|
+
description: "Code generation, debugging, and software engineering"
|
|
90
|
+
},
|
|
91
|
+
math: {
|
|
92
|
+
name: "Mathematics",
|
|
93
|
+
benchmarks: %w[GSM8K MATH AIME AMC Competition-Math],
|
|
94
|
+
description: "Mathematical problem solving from arithmetic to olympiad"
|
|
95
|
+
},
|
|
96
|
+
instruction: {
|
|
97
|
+
name: "Instruction Following",
|
|
98
|
+
benchmarks: %w[IFEval MT-Bench AlpacaEval Chatbot-Arena],
|
|
99
|
+
description: "Following complex instructions and conversational ability"
|
|
100
|
+
},
|
|
101
|
+
safety: {
|
|
102
|
+
name: "Safety",
|
|
103
|
+
benchmarks: %w[TruthfulQA BBQ ToxiGen BOLD],
|
|
104
|
+
description: "Truthfulness, bias, and safety alignment"
|
|
105
|
+
},
|
|
106
|
+
multimodal: {
|
|
107
|
+
name: "Multimodal",
|
|
108
|
+
benchmarks: %w[MMMU MathVista VQAv2 TextVQA DocVQA],
|
|
109
|
+
description: "Vision, document understanding, and cross-modal reasoning"
|
|
110
|
+
},
|
|
111
|
+
multilingual: {
|
|
112
|
+
name: "Multilingual",
|
|
113
|
+
benchmarks: %w[MGSM XL-Sum FLORES],
|
|
114
|
+
description: "Performance across languages and translation"
|
|
115
|
+
},
|
|
116
|
+
long_context: {
|
|
117
|
+
name: "Long Context",
|
|
118
|
+
benchmarks: %w[RULER NIAH InfiniteBench LongBench],
|
|
119
|
+
description: "Retrieval and reasoning over long documents"
|
|
120
|
+
}
|
|
121
|
+
}.freeze
|
|
122
|
+
|
|
123
|
+
# Built-in model catalog with real benchmark data and pricing
|
|
124
|
+
MODELS = {
|
|
125
|
+
"gpt-4o" => {
|
|
126
|
+
name: "GPT-4o", provider: "OpenAI", parameters: 200,
|
|
127
|
+
context_window: 128_000, input_price: 2.50, output_price: 10.00,
|
|
128
|
+
benchmarks: { "MMLU" => 88.7, "HumanEval" => 90.2, "GSM8K" => 95.8, "GPQA" => 53.6 }
|
|
129
|
+
},
|
|
130
|
+
"claude-3.5-sonnet" => {
|
|
131
|
+
name: "Claude 3.5 Sonnet", provider: "Anthropic", parameters: nil,
|
|
132
|
+
context_window: 200_000, input_price: 3.00, output_price: 15.00,
|
|
133
|
+
benchmarks: { "MMLU" => 88.7, "HumanEval" => 92.0, "GSM8K" => 96.4, "GPQA" => 59.4 }
|
|
134
|
+
},
|
|
135
|
+
"gemini-2.0-flash" => {
|
|
136
|
+
name: "Gemini 2.0 Flash", provider: "Google", parameters: nil,
|
|
137
|
+
context_window: 1_000_000, input_price: 0.10, output_price: 0.40,
|
|
138
|
+
benchmarks: { "MMLU" => 85.2, "HumanEval" => 84.0, "GSM8K" => 92.1 }
|
|
139
|
+
},
|
|
140
|
+
"llama-3.1-405b" => {
|
|
141
|
+
name: "Llama 3.1 405B", provider: "Meta", parameters: 405,
|
|
142
|
+
context_window: 128_000, input_price: 3.00, output_price: 3.00,
|
|
143
|
+
benchmarks: { "MMLU" => 88.6, "HumanEval" => 89.0, "GSM8K" => 96.8, "GPQA" => 50.7 }
|
|
144
|
+
},
|
|
145
|
+
"mistral-large" => {
|
|
146
|
+
name: "Mistral Large", provider: "Mistral", parameters: 123,
|
|
147
|
+
context_window: 128_000, input_price: 2.00, output_price: 6.00,
|
|
148
|
+
benchmarks: { "MMLU" => 84.0, "HumanEval" => 82.0, "GSM8K" => 91.2 }
|
|
149
|
+
},
|
|
150
|
+
"deepseek-v3" => {
|
|
151
|
+
name: "DeepSeek V3", provider: "DeepSeek", parameters: 671,
|
|
152
|
+
context_window: 128_000, input_price: 0.27, output_price: 1.10,
|
|
153
|
+
benchmarks: { "MMLU" => 87.1, "HumanEval" => 82.6, "GSM8K" => 89.3, "GPQA" => 59.1 }
|
|
154
|
+
}
|
|
155
|
+
}.freeze
|
|
156
|
+
|
|
157
|
+
class << self
|
|
158
|
+
# Retrieve a model by its identifier
|
|
59
159
|
#
|
|
60
|
-
#
|
|
61
|
-
#
|
|
160
|
+
# model = BenchGecko.get_model("gpt-4o")
|
|
161
|
+
# model.name #=> "GPT-4o"
|
|
162
|
+
# model.provider #=> "OpenAI"
|
|
163
|
+
# model.score("MMLU") #=> 88.7
|
|
62
164
|
#
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
165
|
+
def get_model(model_id)
|
|
166
|
+
data = MODELS[model_id.to_s]
|
|
167
|
+
return nil unless data
|
|
168
|
+
Model.new(data.merge(id: model_id.to_s))
|
|
169
|
+
end
|
|
170
|
+
|
|
171
|
+
# List all available model identifiers
|
|
172
|
+
def list_models
|
|
173
|
+
MODELS.keys
|
|
68
174
|
end
|
|
69
175
|
|
|
70
|
-
# Compare two
|
|
176
|
+
# Compare two models side by side across benchmarks and pricing
|
|
71
177
|
#
|
|
72
|
-
#
|
|
73
|
-
#
|
|
74
|
-
#
|
|
178
|
+
# result = BenchGecko.compare_models("gpt-4o", "claude-3.5-sonnet")
|
|
179
|
+
# result[:benchmark_diff] #=> {"MMLU" => 0.0, "HumanEval" => -1.8, ...}
|
|
180
|
+
# result[:cheaper] #=> "gpt-4o"
|
|
75
181
|
#
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
raise ArgumentError, "At least 2 models are required" if model_slugs.length < 2
|
|
182
|
+
def compare_models(model_a_id, model_b_id)
|
|
183
|
+
a = get_model(model_a_id)
|
|
184
|
+
b = get_model(model_b_id)
|
|
185
|
+
return nil unless a && b
|
|
81
186
|
|
|
82
|
-
|
|
83
|
-
|
|
187
|
+
all_benchmarks = (a.benchmarks.keys + b.benchmarks.keys).uniq
|
|
188
|
+
benchmark_diff = {}
|
|
189
|
+
all_benchmarks.each do |bench|
|
|
190
|
+
score_a = a.score(bench)
|
|
191
|
+
score_b = b.score(bench)
|
|
192
|
+
benchmark_diff[bench] = (score_a && score_b) ? (score_a - score_b).round(2) : nil
|
|
193
|
+
end
|
|
84
194
|
|
|
85
|
-
|
|
195
|
+
cost_a = a.cost_per_million
|
|
196
|
+
cost_b = b.cost_per_million
|
|
197
|
+
cheaper = if cost_a && cost_b
|
|
198
|
+
cost_a <= cost_b ? model_a_id : model_b_id
|
|
199
|
+
end
|
|
86
200
|
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
201
|
+
{
|
|
202
|
+
model_a: a.to_summary,
|
|
203
|
+
model_b: b.to_summary,
|
|
204
|
+
benchmark_diff: benchmark_diff,
|
|
205
|
+
cheaper: cheaper,
|
|
206
|
+
cost_ratio: (cost_a && cost_b && cost_b > 0) ? (cost_a / cost_b).round(2) : nil
|
|
207
|
+
}
|
|
208
|
+
end
|
|
90
209
|
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
210
|
+
# Estimate cost for a given number of tokens
|
|
211
|
+
#
|
|
212
|
+
# BenchGecko.estimate_cost("gpt-4o", input_tokens: 1_000_000, output_tokens: 500_000)
|
|
213
|
+
# #=> { model: "GPT-4o", input_tokens: 1_000_000, output_tokens: 500_000, input_cost: 2.5, output_cost: 5.0, total: 7.5 }
|
|
214
|
+
#
|
|
215
|
+
def estimate_cost(model_id, input_tokens:, output_tokens: 0)
|
|
216
|
+
model = get_model(model_id)
|
|
217
|
+
return nil unless model&.input_price && model&.output_price
|
|
95
218
|
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
req["Accept"] = "application/json"
|
|
219
|
+
input_cost = (model.input_price * input_tokens / 1_000_000.0).round(4)
|
|
220
|
+
output_cost = (model.output_price * output_tokens / 1_000_000.0).round(4)
|
|
99
221
|
|
|
100
|
-
|
|
222
|
+
{
|
|
223
|
+
model: model.name,
|
|
224
|
+
input_tokens: input_tokens,
|
|
225
|
+
output_tokens: output_tokens,
|
|
226
|
+
input_cost: input_cost,
|
|
227
|
+
output_cost: output_cost,
|
|
228
|
+
total: (input_cost + output_cost).round(4)
|
|
229
|
+
}
|
|
230
|
+
end
|
|
101
231
|
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
232
|
+
# List all benchmark categories
|
|
233
|
+
def benchmark_categories
|
|
234
|
+
BENCHMARK_CATEGORIES
|
|
235
|
+
end
|
|
236
|
+
|
|
237
|
+
# Find models that score at or above a threshold on a given benchmark, sorted from highest to lowest score
|
|
238
|
+
#
|
|
239
|
+
# BenchGecko.top_models("MMLU", min_score: 87.0)
|
|
240
|
+
# #=> [Model, Model, ...]
|
|
241
|
+
#
|
|
242
|
+
def top_models(benchmark, min_score: 0)
|
|
243
|
+
MODELS.filter_map do |id, data|
|
|
244
|
+
score = data[:benchmarks][benchmark]
|
|
245
|
+
next unless score && score >= min_score
|
|
246
|
+
get_model(id)
|
|
247
|
+
end.sort_by { |m| -m.score(benchmark) }
|
|
248
|
+
end
|
|
108
249
|
|
|
109
|
-
|
|
250
|
+
# Find the cheapest model that meets a minimum score on a benchmark
|
|
251
|
+
#
|
|
252
|
+
# BenchGecko.cheapest_above("MMLU", 85.0)
|
|
253
|
+
# #=> Model (Gemini 2.0 Flash)
|
|
254
|
+
#
|
|
255
|
+
def cheapest_above(benchmark, min_score)
|
|
256
|
+
top_models(benchmark, min_score: min_score)
|
|
257
|
+
.select(&:cost_per_million)
|
|
258
|
+
.min_by(&:cost_per_million)
|
|
110
259
|
end
|
|
111
260
|
end
|
|
112
261
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: benchgecko
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.0
|
|
4
|
+
version: 0.1.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- BenchGecko
|
|
@@ -9,38 +9,28 @@ autorequire:
|
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
11
|
date: 2026-03-31 00:00:00.000000000 Z
|
|
12
|
-
dependencies:
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
version: '2.0'
|
|
20
|
-
type: :runtime
|
|
21
|
-
prerelease: false
|
|
22
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
23
|
-
requirements:
|
|
24
|
-
- - ">="
|
|
25
|
-
- !ruby/object:Gem::Version
|
|
26
|
-
version: '2.0'
|
|
27
|
-
description: Query AI model data, benchmark scores, and run side-by-side comparisons.
|
|
28
|
-
BenchGecko tracks every major AI model, benchmark, and provider.
|
|
29
|
-
email: hello@benchgecko.ai
|
|
12
|
+
dependencies: []
|
|
13
|
+
description: BenchGecko is the CoinGecko for AI. This gem provides a Ruby interface
|
|
14
|
+
for accessing AI model benchmarks, comparing language models, estimating inference
|
|
15
|
+
costs, and discovering AI agents. Query structured data on 300+ models across 50+
|
|
16
|
+
providers with real benchmark scores, latency metrics, and transparent pricing.
|
|
17
|
+
email:
|
|
18
|
+
- hello@benchgecko.ai
|
|
30
19
|
executables: []
|
|
31
20
|
extensions: []
|
|
32
21
|
extra_rdoc_files: []
|
|
33
22
|
files:
|
|
34
|
-
-
|
|
23
|
+
- CHANGELOG.md
|
|
24
|
+
- LICENSE.txt
|
|
35
25
|
- README.md
|
|
36
26
|
- lib/benchgecko.rb
|
|
37
27
|
homepage: https://benchgecko.ai
|
|
38
28
|
licenses:
|
|
39
29
|
- MIT
|
|
40
30
|
metadata:
|
|
31
|
+
homepage_uri: https://benchgecko.ai
|
|
41
32
|
source_code_uri: https://github.com/BenchGecko/benchgecko-ruby
|
|
42
|
-
|
|
43
|
-
documentation_uri: https://benchgecko.ai/api-docs
|
|
33
|
+
changelog_uri: https://github.com/BenchGecko/benchgecko-ruby/blob/main/CHANGELOG.md
|
|
44
34
|
post_install_message:
|
|
45
35
|
rdoc_options: []
|
|
46
36
|
require_paths:
|
|
@@ -59,5 +49,6 @@ requirements: []
|
|
|
59
49
|
rubygems_version: 3.0.3.1
|
|
60
50
|
signing_key:
|
|
61
51
|
specification_version: 4
|
|
62
|
-
summary:
|
|
52
|
+
summary: Ruby client for BenchGecko - the data platform for AI model benchmarks, pricing,
|
|
53
|
+
and agent comparison.
|
|
63
54
|
test_files: []
|
/data/{LICENSE → LICENSE.txt}
RENAMED
|
File without changes
|