benchgecko 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (6) hide show
  1. checksums.yaml +4 -4
  2. data/lib/benchgecko.rb +28 -246
  3. metadata +10 -16
  4. data/CHANGELOG.md +0 -15
  5. data/LICENSE.txt +0 -21
  6. data/README.md +0 -129
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d8d1081a88ea9b84bd0ce328125cca300ae5b50a4f958936531683a22291f342
4
- data.tar.gz: 34aa28808170063b21ebb3dc1a5bbcb4ee7a8696f8697154044db7b57446e0ed
3
+ metadata.gz: 6a126fb765dd87b64cb48087748871672d902c2aee7913f1b3a2bc5eec495933
4
+ data.tar.gz: ee0aeaeec203627d07af67928e495f95c6828a3729020eea75a942e902e36e04
5
5
  SHA512:
6
- metadata.gz: 9d8501cda7ce337d38df5c93918300a69fb739ba10c3032ee1880133aeffc97dfccaf1b88ab660fbb1ae9f8f2bebc70af2bdfef0911aa98c05e77d7fd0f50d19
7
- data.tar.gz: ba861f4d31368d4a95017c561306ccee24de87dfa60cc79af246809acd1073d711427be4c31467b5d8192f9545f9702d857fbc6a416dff19f8f30662928054c0
6
+ metadata.gz: 02d105ade04b5cff84f970c70c926ec961343763da3a47f15da4cc325d30a1ba98e57d377330345cce5306925f832b9f8847c1da530d2a04b43d7334e90a6375
7
+ data.tar.gz: 0fe0f60f0f26b53bb717c415da31b3c167a148f9c0dcc9ed4315c8c1aad7e0e61f3ef6d98c88c07c42accfd0058e0eec80fed5eab71d5a88c9b3ed402799af38
data/lib/benchgecko.rb CHANGED
@@ -1,261 +1,43 @@
1
- # frozen_string_literal: true
2
-
3
- # BenchGecko - The data layer of the AI economy.
4
- # Every model. Every agent. Everything AI. Tracked.
5
- # https://benchgecko.ai
1
+ require 'net/http'
2
+ require 'json'
6
3
 
7
4
  module BenchGecko
8
- VERSION = "0.2.0"
9
-
10
- # Represents an AI model with its benchmark scores, pricing, and metadata.
11
- class Model
12
- attr_reader :id, :name, :provider, :parameters, :context_window,
13
- :input_price, :output_price, :benchmarks, :metadata
14
-
15
- def initialize(attrs = {})
16
- @id = attrs[:id] || attrs["id"]
17
- @name = attrs[:name] || attrs["name"]
18
- @provider = attrs[:provider] || attrs["provider"]
19
- @parameters = attrs[:parameters] || attrs["parameters"]
20
- @context_window = attrs[:context_window] || attrs["context_window"]
21
- @input_price = attrs[:input_price] || attrs["input_price"]
22
- @output_price = attrs[:output_price] || attrs["output_price"]
23
- @benchmarks = attrs[:benchmarks] || attrs["benchmarks"] || {}
24
- @metadata = attrs[:metadata] || attrs["metadata"] || {}
25
- end
26
-
27
- # Cost per million tokens (input + output averaged)
28
- def cost_per_million
29
- return nil unless input_price && output_price
30
- ((input_price + output_price) / 2.0).round(4)
31
- end
32
-
33
- # Returns the score for a specific benchmark
34
- def score(benchmark_name)
35
- benchmarks[benchmark_name.to_s] || benchmarks[benchmark_name.to_sym]
36
- end
37
-
38
- # Returns a hash summary suitable for comparison tables
39
- def to_summary
40
- {
41
- name: name,
42
- provider: provider,
43
- parameters: parameters,
44
- context_window: context_window,
45
- cost_per_million: cost_per_million
46
- }
47
- end
5
+ VERSION = '0.2.1'
6
+ BASE_URL = 'https://benchgecko.ai/api/v1'
48
7
 
49
- def to_s
50
- "#{name} (#{provider}) - #{parameters}B params"
51
- end
8
+ def self.models(params = {})
9
+ get('/models', params)
52
10
  end
53
11
 
54
- # Represents an AI agent with capabilities and scores.
55
- class Agent
56
- attr_reader :id, :name, :category, :provider, :models_used,
57
- :scores, :capabilities, :metadata
58
-
59
- def initialize(attrs = {})
60
- @id = attrs[:id] || attrs["id"]
61
- @name = attrs[:name] || attrs["name"]
62
- @category = attrs[:category] || attrs["category"]
63
- @provider = attrs[:provider] || attrs["provider"]
64
- @models_used = attrs[:models_used] || attrs["models_used"] || []
65
- @scores = attrs[:scores] || attrs["scores"] || {}
66
- @capabilities = attrs[:capabilities] || attrs["capabilities"] || []
67
- @metadata = attrs[:metadata] || attrs["metadata"] || {}
68
- end
69
-
70
- def supports?(capability)
71
- capabilities.include?(capability.to_s)
72
- end
73
-
74
- def to_s
75
- "#{name} (#{category}) by #{provider}"
76
- end
12
+ def self.model(slug)
13
+ get("/models/#{slug}")
77
14
  end
78
15
 
79
- # Benchmark categories tracked by BenchGecko
80
- BENCHMARK_CATEGORIES = {
81
- reasoning: {
82
- name: "Reasoning",
83
- benchmarks: %w[MMLU MMLU-Pro ARC-Challenge HellaSwag WinoGrande GPQA],
84
- description: "Logical reasoning, knowledge, and common sense"
85
- },
86
- coding: {
87
- name: "Coding",
88
- benchmarks: %w[HumanEval MBPP SWE-bench LiveCodeBench BigCodeBench],
89
- description: "Code generation, debugging, and software engineering"
90
- },
91
- math: {
92
- name: "Mathematics",
93
- benchmarks: %w[GSM8K MATH AIME AMC Competition-Math],
94
- description: "Mathematical problem solving from arithmetic to olympiad"
95
- },
96
- instruction: {
97
- name: "Instruction Following",
98
- benchmarks: %w[IFEval MT-Bench AlpacaEval Chatbot-Arena],
99
- description: "Following complex instructions and conversational ability"
100
- },
101
- safety: {
102
- name: "Safety",
103
- benchmarks: %w[TruthfulQA BBQ ToxiGen BOLD],
104
- description: "Truthfulness, bias, and safety alignment"
105
- },
106
- multimodal: {
107
- name: "Multimodal",
108
- benchmarks: %w[MMMU MathVista VQAv2 TextVQA DocVQA],
109
- description: "Vision, document understanding, and cross-modal reasoning"
110
- },
111
- multilingual: {
112
- name: "Multilingual",
113
- benchmarks: %w[MGSM XL-Sum FLORES],
114
- description: "Performance across languages and translation"
115
- },
116
- long_context: {
117
- name: "Long Context",
118
- benchmarks: %w[RULER NIAH InfiniteBench LongBench],
119
- description: "Retrieval and reasoning over long documents"
120
- }
121
- }.freeze
122
-
123
- # Built-in model catalog with real benchmark data and pricing
124
- MODELS = {
125
- "gpt-4o" => {
126
- name: "GPT-4o", provider: "OpenAI", parameters: 200,
127
- context_window: 128_000, input_price: 2.50, output_price: 10.00,
128
- benchmarks: { "MMLU" => 88.7, "HumanEval" => 90.2, "GSM8K" => 95.8, "GPQA" => 53.6 }
129
- },
130
- "claude-3.5-sonnet" => {
131
- name: "Claude 3.5 Sonnet", provider: "Anthropic", parameters: nil,
132
- context_window: 200_000, input_price: 3.00, output_price: 15.00,
133
- benchmarks: { "MMLU" => 88.7, "HumanEval" => 92.0, "GSM8K" => 96.4, "GPQA" => 59.4 }
134
- },
135
- "gemini-2.0-flash" => {
136
- name: "Gemini 2.0 Flash", provider: "Google", parameters: nil,
137
- context_window: 1_000_000, input_price: 0.10, output_price: 0.40,
138
- benchmarks: { "MMLU" => 85.2, "HumanEval" => 84.0, "GSM8K" => 92.1 }
139
- },
140
- "llama-3.1-405b" => {
141
- name: "Llama 3.1 405B", provider: "Meta", parameters: 405,
142
- context_window: 128_000, input_price: 3.00, output_price: 3.00,
143
- benchmarks: { "MMLU" => 88.6, "HumanEval" => 89.0, "GSM8K" => 96.8, "GPQA" => 50.7 }
144
- },
145
- "mistral-large" => {
146
- name: "Mistral Large", provider: "Mistral", parameters: 123,
147
- context_window: 128_000, input_price: 2.00, output_price: 6.00,
148
- benchmarks: { "MMLU" => 84.0, "HumanEval" => 82.0, "GSM8K" => 91.2 }
149
- },
150
- "deepseek-v3" => {
151
- name: "DeepSeek V3", provider: "DeepSeek", parameters: 671,
152
- context_window: 128_000, input_price: 0.27, output_price: 1.10,
153
- benchmarks: { "MMLU" => 87.1, "HumanEval" => 82.6, "GSM8K" => 89.3, "GPQA" => 59.1 }
154
- }
155
- }.freeze
156
-
157
- class << self
158
- # Retrieve a model by its identifier
159
- #
160
- # model = BenchGecko.get_model("gpt-4o")
161
- # model.name #=> "GPT-4o"
162
- # model.provider #=> "OpenAI"
163
- # model.score("MMLU") #=> 88.7
164
- #
165
- def get_model(model_id)
166
- data = MODELS[model_id.to_s]
167
- return nil unless data
168
- Model.new(data.merge(id: model_id.to_s))
169
- end
170
-
171
- # List all available model identifiers
172
- def list_models
173
- MODELS.keys
174
- end
175
-
176
- # Compare two models side by side across benchmarks and pricing
177
- #
178
- # result = BenchGecko.compare_models("gpt-4o", "claude-3.5-sonnet")
179
- # result[:benchmark_diff] #=> {"MMLU" => 0.0, "HumanEval" => -1.8, ...}
180
- # result[:cheaper] #=> "gpt-4o"
181
- #
182
- def compare_models(model_a_id, model_b_id)
183
- a = get_model(model_a_id)
184
- b = get_model(model_b_id)
185
- return nil unless a && b
186
-
187
- all_benchmarks = (a.benchmarks.keys + b.benchmarks.keys).uniq
188
- benchmark_diff = {}
189
- all_benchmarks.each do |bench|
190
- score_a = a.score(bench)
191
- score_b = b.score(bench)
192
- benchmark_diff[bench] = (score_a && score_b) ? (score_a - score_b).round(2) : nil
193
- end
194
-
195
- cost_a = a.cost_per_million
196
- cost_b = b.cost_per_million
197
- cheaper = if cost_a && cost_b
198
- cost_a <= cost_b ? model_a_id : model_b_id
199
- end
200
-
201
- {
202
- model_a: a.to_summary,
203
- model_b: b.to_summary,
204
- benchmark_diff: benchmark_diff,
205
- cheaper: cheaper,
206
- cost_ratio: (cost_a && cost_b && cost_b > 0) ? (cost_a / cost_b).round(2) : nil
207
- }
208
- end
16
+ def self.benchmarks
17
+ get('/benchmarks')
18
+ end
209
19
 
210
- # Estimate cost for a given number of tokens
211
- #
212
- # BenchGecko.estimate_cost("gpt-4o", input_tokens: 1_000_000, output_tokens: 500_000)
213
- # #=> { input_cost: 2.50, output_cost: 5.00, total: 7.50 }
214
- #
215
- def estimate_cost(model_id, input_tokens:, output_tokens: 0)
216
- model = get_model(model_id)
217
- return nil unless model&.input_price && model&.output_price
20
+ def self.compare(*slugs)
21
+ get('/compare', models: slugs.join(','))
22
+ end
218
23
 
219
- input_cost = (model.input_price * input_tokens / 1_000_000.0).round(4)
220
- output_cost = (model.output_price * output_tokens / 1_000_000.0).round(4)
24
+ def self.pricing(slug = nil)
25
+ slug ? get("/pricing/#{slug}") : get('/pricing')
26
+ end
221
27
 
222
- {
223
- model: model.name,
224
- input_tokens: input_tokens,
225
- output_tokens: output_tokens,
226
- input_cost: input_cost,
227
- output_cost: output_cost,
228
- total: (input_cost + output_cost).round(4)
229
- }
230
- end
28
+ def self.providers
29
+ get('/providers')
30
+ end
231
31
 
232
- # List all benchmark categories
233
- def benchmark_categories
234
- BENCHMARK_CATEGORIES
235
- end
32
+ def self.agents
33
+ get('/agents')
34
+ end
236
35
 
237
- # Find models that score above a threshold on a given benchmark
238
- #
239
- # BenchGecko.top_models("MMLU", min_score: 87.0)
240
- # #=> [Model, Model, ...]
241
- #
242
- def top_models(benchmark, min_score: 0)
243
- MODELS.filter_map do |id, data|
244
- score = data[:benchmarks][benchmark]
245
- next unless score && score >= min_score
246
- get_model(id)
247
- end.sort_by { |m| -m.score(benchmark) }
248
- end
36
+ private
249
37
 
250
- # Find the cheapest model that meets a minimum score on a benchmark
251
- #
252
- # BenchGecko.cheapest_above("MMLU", 85.0)
253
- # #=> Model (Gemini 2.0 Flash)
254
- #
255
- def cheapest_above(benchmark, min_score)
256
- top_models(benchmark, min_score: min_score)
257
- .select(&:cost_per_million)
258
- .min_by(&:cost_per_million)
259
- end
38
+ def self.get(path, params = {})
39
+ uri = URI("#{BASE_URL}#{path}")
40
+ uri.query = URI.encode_www_form(params) unless params.empty?
41
+ JSON.parse(Net::HTTP.get(uri))
260
42
  end
261
43
  end
metadata CHANGED
@@ -1,29 +1,23 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: benchgecko
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - BenchGecko
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2026-04-11 00:00:00.000000000 Z
11
+ date: 2026-04-25 00:00:00.000000000 Z
12
12
  dependencies: []
13
- description: Official Ruby SDK for BenchGecko, the data layer of the AI economy. Query
14
- thousands of AI models with cross-provider pricing and daily price history. Track
15
- company valuations, funding timelines, and revenue estimates. Pull benchmark scores,
16
- agent leaderboards, and a live changelog of every price drop, every launch, every
17
- deprecation. If it moved in AI today, it's already on BenchGecko.
18
- email:
19
- - hello@benchgecko.ai
13
+ description: Query AI model data, benchmark scores, and run side-by-side comparisons.
14
+ BenchGecko tracks every major AI model, benchmark, and provider with cross-provider
15
+ pricing.
16
+ email: hello@benchgecko.ai
20
17
  executables: []
21
18
  extensions: []
22
19
  extra_rdoc_files: []
23
20
  files:
24
- - CHANGELOG.md
25
- - LICENSE.txt
26
- - README.md
27
21
  - lib/benchgecko.rb
28
22
  homepage: https://benchgecko.ai
29
23
  licenses:
@@ -31,7 +25,8 @@ licenses:
31
25
  metadata:
32
26
  homepage_uri: https://benchgecko.ai
33
27
  source_code_uri: https://github.com/BenchGecko/benchgecko-ruby
34
- changelog_uri: https://github.com/BenchGecko/benchgecko-ruby/blob/main/CHANGELOG.md
28
+ documentation_uri: https://benchgecko.ai/api-docs
29
+ changelog_uri: https://benchgecko.ai/changelog
35
30
  post_install_message:
36
31
  rdoc_options: []
37
32
  require_paths:
@@ -40,7 +35,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
40
35
  requirements:
41
36
  - - ">="
42
37
  - !ruby/object:Gem::Version
43
- version: 2.7.0
38
+ version: '0'
44
39
  required_rubygems_version: !ruby/object:Gem::Requirement
45
40
  requirements:
46
41
  - - ">="
@@ -50,6 +45,5 @@ requirements: []
50
45
  rubygems_version: 3.0.3.1
51
46
  signing_key:
52
47
  specification_version: 4
53
- summary: The data layer of the AI economy. Every model. Every agent. Everything AI.
54
- Tracked.
48
+ summary: Ruby SDK for BenchGecko AI model data platform
55
49
  test_files: []
data/CHANGELOG.md DELETED
@@ -1,15 +0,0 @@
1
- # Changelog
2
-
3
- ## 0.2.0 (2026-03-27)
4
-
5
- - Rewrite gem description, summary, and README with the official BenchGecko brand voice
6
- - Remove hardcoded model and provider counts in favor of evergreen language
7
- - Reframe the SDK around the full BenchGecko data layer: models, companies, benchmarks, agents, and the live changelog
8
-
9
- ## 0.1.0 (2026-03-30)
10
-
11
- - Initial release
12
- - Model lookup, comparison, and cost estimation
13
- - Built-in catalog: GPT-4o, Claude 3.5 Sonnet, Gemini 2.0 Flash, Llama 3.1 405B, Mistral Large, DeepSeek V3
14
- - Benchmark categories: reasoning, coding, math, instruction, safety, multimodal, multilingual, long context
15
- - Top models filtering and cheapest-above-threshold finder
data/LICENSE.txt DELETED
@@ -1,21 +0,0 @@
1
- MIT License
2
-
3
- Copyright (c) 2026 BenchGecko
4
-
5
- Permission is hereby granted, free of charge, to any person obtaining a copy
6
- of this software and associated documentation files (the "Software"), to deal
7
- in the Software without restriction, including without limitation the rights
8
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
- copies of the Software, and to permit persons to whom the Software is
10
- furnished to do so, subject to the following conditions:
11
-
12
- The above copyright notice and this permission notice shall be included in all
13
- copies or substantial portions of the Software.
14
-
15
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
- SOFTWARE.
data/README.md DELETED
@@ -1,129 +0,0 @@
1
- # BenchGecko for Ruby
2
-
3
- **The data layer of the AI economy.** Official Ruby SDK for querying thousands of AI models with cross-provider pricing and daily price history, company valuations, funding timelines, revenue estimates, benchmark scores, agent leaderboards, and a live changelog of every price drop, every launch, every deprecation.
4
-
5
- If it moved in AI today, it's already on BenchGecko.
6
-
7
- ## What's Tracked
8
-
9
- - **Models.** Thousands of AI models with cross-provider pricing and daily price history.
10
- - **Companies.** Hundreds of AI companies with valuations, funding timelines, and revenue estimates.
11
- - **Benchmarks.** Reasoning, coding, math, instruction following, safety, multimodal, multilingual, long context.
12
- - **Agents.** Developer adoption signals and agent leaderboards.
13
- - **Changelog.** Every price drop, every launch, every deprecation, as it happens.
14
-
15
- ## Installation
16
-
17
- Add to your Gemfile:
18
-
19
- ```ruby
20
- gem "benchgecko"
21
- ```
22
-
23
- Or install directly:
24
-
25
- ```bash
26
- gem install benchgecko
27
- ```
28
-
29
- ## Quick Start
30
-
31
- ```ruby
32
- require "benchgecko"
33
-
34
- # Look up any model
35
- model = BenchGecko.get_model("claude-3.5-sonnet")
36
- puts model.name #=> "Claude 3.5 Sonnet"
37
- puts model.provider #=> "Anthropic"
38
- puts model.score("MMLU") #=> 88.7
39
-
40
- # List all tracked models
41
- BenchGecko.list_models.each { |id| puts id }
42
- ```
43
-
44
- ## Comparing Models
45
-
46
- The comparison engine surfaces benchmark differences and pricing ratios, making it straightforward to evaluate tradeoffs between models:
47
-
48
- ```ruby
49
- result = BenchGecko.compare_models("gpt-4o", "claude-3.5-sonnet")
50
-
51
- puts result[:cheaper] #=> "gpt-4o"
52
- puts result[:cost_ratio] #=> 0.69
53
- puts result[:benchmark_diff] #=> {"MMLU" => 0.0, "HumanEval" => -1.8, ...}
54
-
55
- # Positive diff means model_a scores higher
56
- result[:benchmark_diff].each do |bench, diff|
57
- next unless diff
58
- winner = diff >= 0 ? "GPT-4o" : "Claude 3.5 Sonnet"
59
- puts "#{bench}: #{winner} wins by #{diff.abs} points"
60
- end
61
- ```
62
-
63
- ## Cost Estimation
64
-
65
- Estimate inference costs before committing to a provider. Prices are per million tokens:
66
-
67
- ```ruby
68
- cost = BenchGecko.estimate_cost("gpt-4o",
69
- input_tokens: 2_000_000,
70
- output_tokens: 500_000
71
- )
72
-
73
- puts cost[:input_cost] #=> 5.0
74
- puts cost[:output_cost] #=> 5.0
75
- puts cost[:total] #=> 10.0
76
- ```
77
-
78
- ## Finding the Right Model
79
-
80
- Filter models by benchmark performance to find the best fit for your workload:
81
-
82
- ```ruby
83
- # All models scoring 87+ on MMLU
84
- strong_reasoners = BenchGecko.top_models("MMLU", min_score: 87.0)
85
- strong_reasoners.each { |m| puts "#{m.name}: #{m.score('MMLU')}" }
86
-
87
- # Cheapest model above a quality threshold
88
- budget_pick = BenchGecko.cheapest_above("MMLU", 85.0)
89
- puts "#{budget_pick.name} at $#{budget_pick.cost_per_million}/M tokens"
90
- ```
91
-
92
- ## Benchmark Categories
93
-
94
- BenchGecko organizes benchmarks into categories covering reasoning, coding, math, instruction following, safety, multimodal, multilingual, and long context evaluation:
95
-
96
- ```ruby
97
- BenchGecko.benchmark_categories.each do |key, info|
98
- puts "#{info[:name]}: #{info[:benchmarks].join(', ')}"
99
- puts " #{info[:description]}"
100
- end
101
- ```
102
-
103
- ## Built-in Model Catalog
104
-
105
- The gem ships with a curated catalog of major models from OpenAI, Anthropic, Google, Meta, Mistral, and DeepSeek. Each entry includes benchmark scores, parameter counts, context window sizes, and per-token pricing.
106
-
107
- ```ruby
108
- model = BenchGecko.get_model("deepseek-v3")
109
- puts model.parameters #=> 671
110
- puts model.context_window #=> 128000
111
- puts model.cost_per_million #=> 0.685
112
- ```
113
-
114
- ## Use Cases
115
-
116
- - **Model selection pipelines.** Programmatically pick the cheapest model that meets your quality bar.
117
- - **Cost monitoring.** Estimate monthly spend across different model configurations.
118
- - **Benchmark dashboards.** Pull structured scores into internal reporting tools.
119
- - **Agent evaluation.** Compare AI agents across capability dimensions.
120
- - **Pricing intelligence.** Track every price drop and launch through the live changelog.
121
-
122
- ## Resources
123
-
124
- - [BenchGecko](https://benchgecko.ai). The data layer of the AI economy.
125
- - [Source Code](https://github.com/BenchGecko/benchgecko-ruby). Contributions welcome.
126
-
127
- ## License
128
-
129
- MIT License. See [LICENSE.txt](LICENSE.txt) for details.