inferencebench-leaderboard 0.0.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inferencebench_leaderboard-0.0.2/.gitignore +137 -0
- inferencebench_leaderboard-0.0.2/PKG-INFO +112 -0
- inferencebench_leaderboard-0.0.2/README.md +88 -0
- inferencebench_leaderboard-0.0.2/pyproject.toml +47 -0
- inferencebench_leaderboard-0.0.2/src/inferencebench_leaderboard/__init__.py +39 -0
- inferencebench_leaderboard-0.0.2/src/inferencebench_leaderboard/__main__.py +8 -0
- inferencebench_leaderboard-0.0.2/src/inferencebench_leaderboard/cli.py +62 -0
- inferencebench_leaderboard-0.0.2/src/inferencebench_leaderboard/data.py +147 -0
- inferencebench_leaderboard-0.0.2/src/inferencebench_leaderboard/render.py +346 -0
- inferencebench_leaderboard-0.0.2/src/inferencebench_leaderboard/static/filter.js +46 -0
- inferencebench_leaderboard-0.0.2/src/inferencebench_leaderboard/static/site.css +143 -0
- inferencebench_leaderboard-0.0.2/src/inferencebench_leaderboard/static/sort.js +41 -0
- inferencebench_leaderboard-0.0.2/src/inferencebench_leaderboard/templates/base.html +26 -0
- inferencebench_leaderboard-0.0.2/src/inferencebench_leaderboard/templates/category.html +52 -0
- inferencebench_leaderboard-0.0.2/src/inferencebench_leaderboard/templates/entry.html +67 -0
- inferencebench_leaderboard-0.0.2/src/inferencebench_leaderboard/templates/index.html +30 -0
- inferencebench_leaderboard-0.0.2/tests/conftest.py +161 -0
- inferencebench_leaderboard-0.0.2/tests/test_load_envelopes.py +29 -0
- inferencebench_leaderboard-0.0.2/tests/test_pareto.py +54 -0
- inferencebench_leaderboard-0.0.2/tests/test_render_smoke.py +100 -0
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.so
|
|
6
|
+
.Python
|
|
7
|
+
build/
|
|
8
|
+
develop-eggs/
|
|
9
|
+
dist/
|
|
10
|
+
downloads/
|
|
11
|
+
eggs/
|
|
12
|
+
.eggs/
|
|
13
|
+
lib/
|
|
14
|
+
lib64/
|
|
15
|
+
parts/
|
|
16
|
+
sdist/
|
|
17
|
+
var/
|
|
18
|
+
wheels/
|
|
19
|
+
share/python-wheels/
|
|
20
|
+
*.egg-info/
|
|
21
|
+
.installed.cfg
|
|
22
|
+
*.egg
|
|
23
|
+
MANIFEST
|
|
24
|
+
|
|
25
|
+
# uv / virtualenv
|
|
26
|
+
.venv/
|
|
27
|
+
venv/
|
|
28
|
+
env/
|
|
29
|
+
ENV/
|
|
30
|
+
uv.lock.tmp
|
|
31
|
+
.python-version
|
|
32
|
+
|
|
33
|
+
# Testing / coverage
|
|
34
|
+
.tox/
|
|
35
|
+
.nox/
|
|
36
|
+
.coverage
|
|
37
|
+
.coverage.*
|
|
38
|
+
.cache
|
|
39
|
+
nosetests.xml
|
|
40
|
+
coverage.xml
|
|
41
|
+
*.cover
|
|
42
|
+
*.py,cover
|
|
43
|
+
.hypothesis/
|
|
44
|
+
.pytest_cache/
|
|
45
|
+
cover/
|
|
46
|
+
htmlcov/
|
|
47
|
+
|
|
48
|
+
# Type checking
|
|
49
|
+
.mypy_cache/
|
|
50
|
+
.dmypy.json
|
|
51
|
+
dmypy.json
|
|
52
|
+
.pyre/
|
|
53
|
+
.pytype/
|
|
54
|
+
|
|
55
|
+
# Ruff
|
|
56
|
+
.ruff_cache/
|
|
57
|
+
|
|
58
|
+
# IDE / editor
|
|
59
|
+
.idea/
|
|
60
|
+
.vscode/
|
|
61
|
+
*.swp
|
|
62
|
+
*.swo
|
|
63
|
+
*~
|
|
64
|
+
.DS_Store
|
|
65
|
+
|
|
66
|
+
# OS
|
|
67
|
+
Thumbs.db
|
|
68
|
+
desktop.ini
|
|
69
|
+
|
|
70
|
+
# Secrets / env
|
|
71
|
+
.env
|
|
72
|
+
.env.*
|
|
73
|
+
!.env.example
|
|
74
|
+
.envrc
|
|
75
|
+
|
|
76
|
+
# Bench-specific local caches
|
|
77
|
+
~/.cache/inferencebench/
|
|
78
|
+
.cache/inferencebench/
|
|
79
|
+
.inferencebench/
|
|
80
|
+
|
|
81
|
+
# Sigstore dev keys (never commit private keys)
|
|
82
|
+
cosign.key
|
|
83
|
+
cosign-*.key
|
|
84
|
+
cosign-*.pub
|
|
85
|
+
.bench/*.key
|
|
86
|
+
# Local benchmark working dirs (kept local; published outputs land under validation-runs/)
|
|
87
|
+
envelopes-voice/
|
|
88
|
+
envelopes-*/
|
|
89
|
+
*.pem
|
|
90
|
+
!tests/fixtures/**/*.pem
|
|
91
|
+
|
|
92
|
+
# Real-GPU validation artifacts (kept locally, never pushed)
|
|
93
|
+
# Use slash-star (not trailing slash) so individual subpaths can be re-included below.
|
|
94
|
+
validation-runs/*
|
|
95
|
+
# ...except the canonical published marathon corpus — small, public, used by docs + CI
|
|
96
|
+
!validation-runs/2026-05-18-multi-vendor-marathon
|
|
97
|
+
validation-runs/2026-05-18-multi-vendor-marathon/*
|
|
98
|
+
!validation-runs/2026-05-18-multi-vendor-marathon/marathon
|
|
99
|
+
validation-runs/2026-05-18-multi-vendor-marathon/marathon/*
|
|
100
|
+
!validation-runs/2026-05-18-multi-vendor-marathon/marathon/all
|
|
101
|
+
!validation-runs/2026-05-18-multi-vendor-marathon/marathon/all/*.json
|
|
102
|
+
# Voice ASR validation envelopes (small, public, used by leaderboard build)
|
|
103
|
+
!validation-runs/2026-05-25-voice-rtx4000ada
|
|
104
|
+
!validation-runs/2026-05-25-voice-rtx4000ada/*.json
|
|
105
|
+
!validation-runs/2026-05-29-voice-testbm-h100
|
|
106
|
+
!validation-runs/2026-05-29-voice-testbm-h100/*.json
|
|
107
|
+
|
|
108
|
+
# Model weights / datasets (use Git LFS or S3)
|
|
109
|
+
*.bin
|
|
110
|
+
*.safetensors
|
|
111
|
+
*.pt
|
|
112
|
+
*.pth
|
|
113
|
+
*.gguf
|
|
114
|
+
*.onnx
|
|
115
|
+
*.parquet
|
|
116
|
+
!tests/fixtures/**/*.parquet
|
|
117
|
+
|
|
118
|
+
# Logs
|
|
119
|
+
*.log
|
|
120
|
+
logs/
|
|
121
|
+
|
|
122
|
+
# Documentation build
|
|
123
|
+
docs/_build/
|
|
124
|
+
site/
|
|
125
|
+
|
|
126
|
+
# Internal-only files (Claude Code context + planning) — kept locally, not pushed
|
|
127
|
+
/CLAUDE.md
|
|
128
|
+
/INDEX.md
|
|
129
|
+
/PROJECT_PLAN.md
|
|
130
|
+
/CONVENTIONS.md
|
|
131
|
+
/HUMAN_REVIEW_GATES.md
|
|
132
|
+
**/CLAUDE.md
|
|
133
|
+
memory/
|
|
134
|
+
skills/
|
|
135
|
+
agents/
|
|
136
|
+
.claude/
|
|
137
|
+
TICKETS/
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: inferencebench-leaderboard
|
|
3
|
+
Version: 0.0.2
|
|
4
|
+
Summary: Static-site leaderboard renderer for signed InferenceBench envelopes
|
|
5
|
+
Project-URL: Homepage, https://github.com/yobitelcomm/bench
|
|
6
|
+
Project-URL: Documentation, https://yobitelcomm.github.io/bench
|
|
7
|
+
Author-email: Yobitel Communications <bench@yobitel.com>
|
|
8
|
+
License: Apache-2.0
|
|
9
|
+
Keywords: ai,benchmark,inference,leaderboard,llm,ml,static-site
|
|
10
|
+
Classifier: Development Status :: 2 - Pre-Alpha
|
|
11
|
+
Classifier: Environment :: Console
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
18
|
+
Requires-Python: >=3.12
|
|
19
|
+
Requires-Dist: inferencebench-envelope
|
|
20
|
+
Requires-Dist: jinja2~=3.1
|
|
21
|
+
Requires-Dist: pydantic~=2.9
|
|
22
|
+
Requires-Dist: typer~=0.12
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
|
|
25
|
+
# inferencebench-leaderboard
|
|
26
|
+
|
|
27
|
+
Static-site renderer that turns a directory of signed InferenceBench envelope
|
|
28
|
+
JSONs into a plain HTML+CSS+JSON leaderboard suitable for GitHub Pages
|
|
29
|
+
(`https://yobitelcomm.github.io/bench`). Vendor-neutral, no JavaScript
|
|
30
|
+
frameworks; the only client-side code is a pair of ~40-line vanilla scripts (`static/sort.js`, `static/filter.js`) for the
|
|
31
|
+
tables.
|
|
32
|
+
|
|
33
|
+
## Install (workspace)
|
|
34
|
+
|
|
35
|
+
This package is a `uv` workspace member of the root `bench/` monorepo:
|
|
36
|
+
|
|
37
|
+
```toml
|
|
38
|
+
# bench/pyproject.toml
|
|
39
|
+
[tool.uv.workspace]
|
|
40
|
+
members = [..., "tools/leaderboard"]
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
Then `uv sync` from the repo root.
|
|
44
|
+
|
|
45
|
+
## Build a site
|
|
46
|
+
|
|
47
|
+
```
|
|
48
|
+
python -m inferencebench_leaderboard build envelopes/ site/
|
|
49
|
+
# or
|
|
50
|
+
inferencebench-leaderboard build envelopes/ site/ --base-url /bench/
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
`envelopes/` must contain `*.json` files that parse against the canonical
|
|
54
|
+
`inferencebench.envelope.Envelope` Pydantic model. Files that don't parse
|
|
55
|
+
are logged and skipped; the rest of the site still renders.
|
|
56
|
+
|
|
57
|
+
## Output layout
|
|
58
|
+
|
|
59
|
+
```
|
|
60
|
+
site/
|
|
61
|
+
index.html — category index
|
|
62
|
+
static/site.css — Hacker News-style table CSS
|
|
63
|
+
static/sort.js — vanilla sort for tables
|
|
64
|
+
envelopes/<file>.json — verbatim copies for `bench verify`
|
|
65
|
+
<suite_id>/index.html — per-category table
|
|
66
|
+
<suite_id>/<run_id>.html — per-entry detail (verify snippet)
|
|
67
|
+
data/leaderboard.json — machine-readable index
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
## Public API
|
|
71
|
+
|
|
72
|
+
```python
|
|
73
|
+
from inferencebench_leaderboard import (
|
|
74
|
+
render_site, # main entry point
|
|
75
|
+
SiteRenderResult, # return type
|
|
76
|
+
load_envelopes, # directory -> [LoadedEnvelope]
|
|
77
|
+
compute_pareto, # Pareto-frontier classifier
|
|
78
|
+
LoadedEnvelope,
|
|
79
|
+
PARETO_DIRECTIONS,
|
|
80
|
+
)
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
`compute_pareto` accepts per-axis `x_direction` / `y_direction` keywords (each `"min"` or `"max"`) so the same
|
|
84
|
+
function works for throughput-vs-latency, latency-vs-cost, etc.
|
|
85
|
+
|
|
86
|
+
## Tests
|
|
87
|
+
|
|
88
|
+
```
|
|
89
|
+
pytest tools/leaderboard/tests/
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
Covers: smoke render, Pareto math on synthetic data, schema-validation
|
|
93
|
+
skipping.
|
|
94
|
+
|
|
95
|
+
## Hosted at GitHub Pages
|
|
96
|
+
|
|
97
|
+
The marathon corpus committed at
|
|
98
|
+
`validation-runs/2026-05-18-multi-vendor-marathon/marathon/all/` is rendered
|
|
99
|
+
and deployed to **<https://yobitelcomm.github.io/bench/>** by
|
|
100
|
+
`.github/workflows/deploy-pages.yml`. The workflow re-runs on push to main
|
|
101
|
+
(when leaderboard code or corpus envelopes change), weekly on schedule, and on
|
|
102
|
+
manual dispatch from the Actions tab.
|
|
103
|
+
|
|
104
|
+
To preview locally before pushing:
|
|
105
|
+
|
|
106
|
+
```
|
|
107
|
+
uv run bench leaderboard --build \
|
|
108
|
+
--envelopes validation-runs/2026-05-18-multi-vendor-marathon/marathon/all \
|
|
109
|
+
--out _site --base-url /bench/
|
|
110
|
+
python3 -m http.server --directory _site 8080
|
|
111
|
+
# open http://localhost:8080/bench/ (note the /bench/ prefix matches --base-url)
|
|
112
|
+
```
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
# inferencebench-leaderboard
|
|
2
|
+
|
|
3
|
+
Static-site renderer that turns a directory of signed InferenceBench envelope
|
|
4
|
+
JSONs into a plain HTML+CSS+JSON leaderboard suitable for GitHub Pages
|
|
5
|
+
(`https://yobitelcomm.github.io/bench`). Vendor-neutral, no JavaScript
|
|
6
|
+
frameworks; the only client-side code is a pair of ~40-line vanilla scripts (`static/sort.js`, `static/filter.js`) for the
|
|
7
|
+
tables.
|
|
8
|
+
|
|
9
|
+
## Install (workspace)
|
|
10
|
+
|
|
11
|
+
This package is a `uv` workspace member of the root `bench/` monorepo:
|
|
12
|
+
|
|
13
|
+
```toml
|
|
14
|
+
# bench/pyproject.toml
|
|
15
|
+
[tool.uv.workspace]
|
|
16
|
+
members = [..., "tools/leaderboard"]
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
Then `uv sync` from the repo root.
|
|
20
|
+
|
|
21
|
+
## Build a site
|
|
22
|
+
|
|
23
|
+
```
|
|
24
|
+
python -m inferencebench_leaderboard build envelopes/ site/
|
|
25
|
+
# or
|
|
26
|
+
inferencebench-leaderboard build envelopes/ site/ --base-url /bench/
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
`envelopes/` must contain `*.json` files that parse against the canonical
|
|
30
|
+
`inferencebench.envelope.Envelope` Pydantic model. Files that don't parse
|
|
31
|
+
are logged and skipped; the rest of the site still renders.
|
|
32
|
+
|
|
33
|
+
## Output layout
|
|
34
|
+
|
|
35
|
+
```
|
|
36
|
+
site/
|
|
37
|
+
index.html — category index
|
|
38
|
+
static/site.css — Hacker News-style table CSS
|
|
39
|
+
static/sort.js — vanilla sort for tables
|
|
40
|
+
envelopes/<file>.json — verbatim copies for `bench verify`
|
|
41
|
+
<suite_id>/index.html — per-category table
|
|
42
|
+
<suite_id>/<run_id>.html — per-entry detail (verify snippet)
|
|
43
|
+
data/leaderboard.json — machine-readable index
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## Public API
|
|
47
|
+
|
|
48
|
+
```python
|
|
49
|
+
from inferencebench_leaderboard import (
|
|
50
|
+
render_site, # main entry point
|
|
51
|
+
SiteRenderResult, # return type
|
|
52
|
+
load_envelopes, # directory -> [LoadedEnvelope]
|
|
53
|
+
compute_pareto, # Pareto-frontier classifier
|
|
54
|
+
LoadedEnvelope,
|
|
55
|
+
PARETO_DIRECTIONS,
|
|
56
|
+
)
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
`compute_pareto` accepts per-axis `x_direction` / `y_direction` keywords (each `"min"` or `"max"`) so the same
|
|
60
|
+
function works for throughput-vs-latency, latency-vs-cost, etc.
|
|
61
|
+
|
|
62
|
+
## Tests
|
|
63
|
+
|
|
64
|
+
```
|
|
65
|
+
pytest tools/leaderboard/tests/
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
Covers: smoke render, Pareto math on synthetic data, schema-validation
|
|
69
|
+
skipping.
|
|
70
|
+
|
|
71
|
+
## Hosted at GitHub Pages
|
|
72
|
+
|
|
73
|
+
The marathon corpus committed at
|
|
74
|
+
`validation-runs/2026-05-18-multi-vendor-marathon/marathon/all/` is rendered
|
|
75
|
+
and deployed to **<https://yobitelcomm.github.io/bench/>** by
|
|
76
|
+
`.github/workflows/deploy-pages.yml`. The workflow re-runs on push to main
|
|
77
|
+
(when leaderboard code or corpus envelopes change), weekly on schedule, and on
|
|
78
|
+
manual dispatch from the Actions tab.
|
|
79
|
+
|
|
80
|
+
To preview locally before pushing:
|
|
81
|
+
|
|
82
|
+
```
|
|
83
|
+
uv run bench leaderboard --build \
|
|
84
|
+
--envelopes validation-runs/2026-05-18-multi-vendor-marathon/marathon/all \
|
|
85
|
+
--out _site --base-url /bench/
|
|
86
|
+
python3 -m http.server --directory _site 8080
|
|
87
|
+
# open http://localhost:8080/bench/ (note the /bench/ prefix matches --base-url)
|
|
88
|
+
```
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "inferencebench-leaderboard"
|
|
7
|
+
version = "0.0.2"
|
|
8
|
+
description = "Static-site leaderboard renderer for signed InferenceBench envelopes"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.12"
|
|
11
|
+
license = { text = "Apache-2.0" }
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Yobitel Communications", email = "bench@yobitel.com" },
|
|
14
|
+
]
|
|
15
|
+
keywords = ["benchmark", "ai", "ml", "llm", "inference", "leaderboard", "static-site"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 2 - Pre-Alpha",
|
|
18
|
+
"Environment :: Console",
|
|
19
|
+
"Intended Audience :: Developers",
|
|
20
|
+
"Intended Audience :: Science/Research",
|
|
21
|
+
"License :: OSI Approved :: Apache Software License",
|
|
22
|
+
"Programming Language :: Python :: 3",
|
|
23
|
+
"Programming Language :: Python :: 3.12",
|
|
24
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
25
|
+
]
|
|
26
|
+
dependencies = [
|
|
27
|
+
"inferencebench-envelope",
|
|
28
|
+
"jinja2~=3.1",
|
|
29
|
+
"typer~=0.12",
|
|
30
|
+
"pydantic~=2.9",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
[project.scripts]
|
|
34
|
+
inferencebench-leaderboard = "inferencebench_leaderboard.cli:app"
|
|
35
|
+
|
|
36
|
+
[project.urls]
|
|
37
|
+
Homepage = "https://github.com/yobitelcomm/bench"
|
|
38
|
+
Documentation = "https://yobitelcomm.github.io/bench"
|
|
39
|
+
|
|
40
|
+
[tool.uv.sources]
|
|
41
|
+
inferencebench-envelope = { workspace = true }
|
|
42
|
+
|
|
43
|
+
[tool.hatch.build.targets.wheel]
|
|
44
|
+
packages = ["src/inferencebench_leaderboard"]
|
|
45
|
+
# templates/ and static/ live inside src/inferencebench_leaderboard so they are
|
|
46
|
+
# already picked up by the package include. A separate force-include would
|
|
47
|
+
# duplicate every entry in the wheel zip — PyPI rejects with 400.
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Static-site leaderboard renderer for InferenceBench signed envelopes.
|
|
2
|
+
|
|
3
|
+
Reads a directory of canonical envelope JSONs, groups them by ``suite_id``,
|
|
4
|
+
and emits a static HTML + CSS + JSON site suitable for hosting on GitHub Pages
|
|
5
|
+
(target: https://yobitelcomm.github.io/bench).
|
|
6
|
+
|
|
7
|
+
Public surface:
|
|
8
|
+
|
|
9
|
+
render_site(envelopes_dir, out_dir, *, base_url="/") -> SiteRenderResult
|
|
10
|
+
SiteRenderResult — summary of a render pass (counts, skipped, paths)
|
|
11
|
+
LoadedEnvelope — parsed envelope plus its source filename
|
|
12
|
+
load_envelopes(dir) — collect parseable envelopes from a directory
|
|
13
|
+
compute_pareto(...) — Pareto-frontier classifier for arbitrary axes
|
|
14
|
+
|
|
15
|
+
The renderer is deliberately framework-free: plain HTML + a small static CSS
|
|
16
|
+
file, no client-side bundler, no JS frameworks. A tiny vanilla sort script
|
|
17
|
+
ships with the site to make the tables sortable.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
from inferencebench_leaderboard.data import (
|
|
23
|
+
PARETO_DIRECTIONS,
|
|
24
|
+
LoadedEnvelope,
|
|
25
|
+
compute_pareto,
|
|
26
|
+
load_envelopes,
|
|
27
|
+
)
|
|
28
|
+
from inferencebench_leaderboard.render import SiteRenderResult, render_site
|
|
29
|
+
|
|
30
|
+
# Names re-exported as the package's public surface.  Each one is imported
# above from ``inferencebench_leaderboard.data`` or
# ``inferencebench_leaderboard.render``.
__all__ = [
    "PARETO_DIRECTIONS",
    "LoadedEnvelope",
    "SiteRenderResult",
    "compute_pareto",
    "load_envelopes",
    "render_site",
]
|
|
38
|
+
|
|
39
|
+
# Keep in sync with ``version`` in pyproject.toml (the released distribution
# metadata says 0.0.2; the previous "0.0.0" here was stale).
__version__ = "0.0.2"
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""Typer CLI entry point for the static leaderboard renderer.
|
|
2
|
+
|
|
3
|
+
Invoke with::
|
|
4
|
+
|
|
5
|
+
python -m inferencebench_leaderboard build envelopes/ site/
|
|
6
|
+
inferencebench-leaderboard build envelopes/ site/
|
|
7
|
+
|
|
8
|
+
The ``build`` subcommand is the only operation today; future subcommands
|
|
9
|
+
(e.g. ``validate``, ``diff``) plug in here.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
|
|
16
|
+
import typer
|
|
17
|
+
|
|
18
|
+
from inferencebench_leaderboard.render import render_site
|
|
19
|
+
|
|
20
|
+
# Root Typer application.  It is the object referenced by the
# ``inferencebench-leaderboard`` console-script entry point and the one the
# ``build`` command below registers itself on.
app = typer.Typer(
    add_completion=False,  # shell-completion support is switched off
    no_args_is_help=True,  # running with no arguments shows help
    help="Render the static InferenceBench leaderboard site.",
)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@app.command("build")
def build(
    envelopes_dir: Path = typer.Argument(
        ...,
        exists=True,
        file_okay=False,
        dir_okay=True,
        readable=True,
        help="Directory containing signed envelope JSON files.",
    ),
    out_dir: Path = typer.Argument(
        ...,
        file_okay=False,
        dir_okay=True,
        help="Destination directory for the generated static site.",
    ),
    base_url: str = typer.Option(
        "/",
        "--base-url",
        help="URL prefix the site will be served from (e.g. '/bench/').",
    ),
) -> None:
    """Build the static leaderboard under ``out_dir`` and print a summary.

    Args:
        envelopes_dir: Existing, readable directory holding the envelope JSONs.
        out_dir: Directory the generated site is written to.
        base_url: URL prefix forwarded unchanged to :func:`render_site`.
    """
    outcome = render_site(envelopes_dir, out_dir, base_url=base_url)

    # One-line overview first, then one indented line per suite.
    summary_parts = [
        f"Rendered {outcome.envelopes_loaded} envelope(s) ",
        f"across {len(outcome.categories)} category(ies) ",
        f"to {outcome.out_dir} ",
        f"({outcome.envelopes_skipped} skipped, {outcome.pages_written} files written).",
    ]
    typer.echo("".join(summary_parts))

    per_suite = sorted(outcome.categories.items())
    for name, how_many in per_suite:
        typer.echo(f" - {name}: {how_many}")
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
# Run the Typer app when this module is executed directly as a script.
if __name__ == "__main__":  # pragma: no cover
    app()
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
"""Envelope loading and Pareto-frontier math for the static leaderboard.
|
|
2
|
+
|
|
3
|
+
Both helpers are pure: they do not touch disk except via the directory passed
|
|
4
|
+
in to :func:`load_envelopes`. Malformed JSON files and JSON blobs that fail
|
|
5
|
+
Pydantic validation are *skipped* rather than raised, because the leaderboard is
|
|
6
|
+
populated from community contributions and one bad envelope must not break
|
|
7
|
+
the whole site.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import json
|
|
13
|
+
import logging
|
|
14
|
+
from dataclasses import dataclass
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from typing import Literal
|
|
17
|
+
|
|
18
|
+
from inferencebench.envelope import Envelope
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
Direction = Literal["min", "max"]
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# Convention for the leaderboard's headline metrics, keyed by metric name.
# The value says which way is "better": "min" for the latency, price and
# energy entries, "max" for the throughput entries.  The renderer uses this
# map to decide which axis direction to feed :func:`compute_pareto`.
PARETO_DIRECTIONS: dict[str, Direction] = {
    "ttft_p50_ms": "min",
    "ttft_p99_ms": "min",
    "itl_p50_ms": "min",
    "itl_p99_ms": "min",
    "throughput_tok_per_s": "max",
    "goodput_tok_per_s": "max",
    "cost_per_m_tokens_usd": "min",
    "joules_per_token": "min",
    "energy_per_token_j": "min",
}
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass(frozen=True, slots=True)
class LoadedEnvelope:
    """A parsed envelope and the source filename it came from.

    Instances are immutable (``frozen=True``).  The filename is kept so the
    rendered site can link to a stable JSON path (``/envelopes/<filename>``)
    for ``bench verify`` to re-download and check.
    """

    # Base name of the JSON file (``Path.name``), not a full path.
    source_filename: str
    # The validated ``inferencebench.envelope.Envelope`` model instance.
    envelope: Envelope
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def load_envelopes(envelopes_dir: Path) -> list[LoadedEnvelope]:
    """Load every ``*.json`` directly under ``envelopes_dir`` that parses as an Envelope.

    Files that cannot be read (I/O error or not valid UTF-8), are not valid
    JSON, or parse as JSON but fail Pydantic validation against
    :class:`inferencebench.envelope.Envelope`, are logged at WARNING level and
    skipped.  This is intentional: the leaderboard aggregates
    community-submitted envelopes and one malformed file must not take down
    the rest of the site.

    Args:
        envelopes_dir: Directory containing envelope JSON files.  Only its
            immediate children are considered (no recursion).

    Returns:
        Successfully parsed envelopes, ordered by source path.  Empty when
        ``envelopes_dir`` is missing or is not a directory.
    """
    # ``is_dir()`` is already False for a path that does not exist.
    if not envelopes_dir.is_dir():
        logger.warning("envelopes_dir does not exist or is not a directory: %s", envelopes_dir)
        return []

    loaded: list[LoadedEnvelope] = []
    for path in sorted(envelopes_dir.glob("*.json")):
        try:
            raw = path.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError) as exc:
            # UnicodeDecodeError is a ValueError, not an OSError: without it a
            # single non-UTF-8 file would abort the whole load.
            logger.warning("could not read envelope file %s: %s", path, exc)
            continue

        try:
            data = json.loads(raw)
        except (json.JSONDecodeError, RecursionError) as exc:
            # RecursionError: pathologically deep nesting in an untrusted file.
            logger.warning("skipping non-JSON file %s: %s", path, exc)
            continue

        try:
            envelope = Envelope.model_validate(data)
        except (ValueError, TypeError) as exc:
            logger.warning("skipping invalid envelope %s: %s", path, exc)
            continue

        loaded.append(LoadedEnvelope(source_filename=path.name, envelope=envelope))
    return loaded
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def compute_pareto(
    entries: list[tuple[float | None, float | None]],
    *,
    x_direction: Direction = "max",
    y_direction: Direction = "min",
) -> list[bool]:
    """Classify each ``(x, y)`` pair as on the Pareto frontier or dominated.

    A point ``p`` is *dominated* by another point ``q`` when ``q`` is at least
    as good as ``p`` on both axes and strictly better on at least one.  The
    direction parameters say whether higher or lower is "better" per axis:

    - ``x_direction="max", y_direction="min"`` (default): higher x and lower y
      are both improvements.  Matches the canonical throughput-vs-latency
      plot.
    - ``x_direction="min", y_direction="min"``: both axes minimize (e.g.
      latency vs. cost).
    - Any other combination is supported analogously.

    Entries with ``None`` or NaN on either axis are treated as missing data:
    they are never on the frontier (marked ``False``) and never dominate
    another entry.  Identical points do not dominate each other, so exact
    duplicates of a frontier point are all marked ``True``.

    Args:
        entries: List of ``(x, y)`` coordinate tuples; ``None`` denotes missing.
        x_direction: ``"max"`` if higher x is better, ``"min"`` if lower.
        y_direction: ``"max"`` if higher y is better, ``"min"`` if lower.

    Returns:
        A list of booleans, one per input, ``True`` iff the entry is on the
        Pareto frontier.
    """
    from math import isnan

    def is_present(value: float | None) -> bool:
        # NaN compares False against everything, so a NaN point could never be
        # dominated and would wrongly land on the frontier; treat it as missing.
        return value is not None and not isnan(value)

    def better_or_equal(a: float, b: float, direction: Direction) -> bool:
        return a >= b if direction == "max" else a <= b

    def strictly_better(a: float, b: float, direction: Direction) -> bool:
        return a > b if direction == "max" else a < b

    def dominates(xq: float, yq: float, xp: float, yp: float) -> bool:
        # q dominates p: no worse on both axes, strictly better on one.
        return (
            better_or_equal(xq, xp, x_direction)
            and better_or_equal(yq, yp, y_direction)
            and (strictly_better(xq, xp, x_direction) or strictly_better(yq, yp, y_direction))
        )

    # Only complete points take part in the comparison, on either side.
    candidates = [
        (index, x, y)
        for index, (x, y) in enumerate(entries)
        if is_present(x) and is_present(y)
    ]

    result = [False] * len(entries)
    for i, xi, yi in candidates:
        result[i] = not any(
            j != i and dominates(xj, yj, xi, yi) for j, xj, yj in candidates
        )
    return result
|