benchmaker 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- benchmaker-0.1.0/PKG-INFO +214 -0
- benchmaker-0.1.0/README.md +195 -0
- benchmaker-0.1.0/benchmaker/__init__.py +152 -0
- benchmaker-0.1.0/benchmaker/bundle.py +193 -0
- benchmaker-0.1.0/benchmaker/cli.py +382 -0
- benchmaker-0.1.0/benchmaker/collect.py +178 -0
- benchmaker-0.1.0/benchmaker/config.py +448 -0
- benchmaker-0.1.0/benchmaker/env.py +87 -0
- benchmaker-0.1.0/benchmaker/load.py +326 -0
- benchmaker-0.1.0/benchmaker/metrics.py +234 -0
- benchmaker-0.1.0/benchmaker/monitors.py +228 -0
- benchmaker-0.1.0/benchmaker/runner.py +275 -0
- benchmaker-0.1.0/benchmaker/trace.py +217 -0
- benchmaker-0.1.0/benchmaker/types.py +98 -0
- benchmaker-0.1.0/benchmaker/workloads/__init__.py +53 -0
- benchmaker-0.1.0/benchmaker/workloads/agent.py +308 -0
- benchmaker-0.1.0/benchmaker/workloads/base.py +79 -0
- benchmaker-0.1.0/benchmaker/workloads/datasets.py +156 -0
- benchmaker-0.1.0/benchmaker/workloads/eval.py +504 -0
- benchmaker-0.1.0/benchmaker/workloads/hf.py +382 -0
- benchmaker-0.1.0/benchmaker/workloads/http.py +77 -0
- benchmaker-0.1.0/benchmaker/workloads/llm.py +258 -0
- benchmaker-0.1.0/benchmaker/workloads/sandbox.py +470 -0
- benchmaker-0.1.0/benchmaker.egg-info/PKG-INFO +214 -0
- benchmaker-0.1.0/benchmaker.egg-info/SOURCES.txt +36 -0
- benchmaker-0.1.0/benchmaker.egg-info/dependency_links.txt +1 -0
- benchmaker-0.1.0/benchmaker.egg-info/entry_points.txt +2 -0
- benchmaker-0.1.0/benchmaker.egg-info/requires.txt +13 -0
- benchmaker-0.1.0/benchmaker.egg-info/top_level.txt +1 -0
- benchmaker-0.1.0/pyproject.toml +32 -0
- benchmaker-0.1.0/setup.cfg +4 -0
- benchmaker-0.1.0/tests/test_agent.py +185 -0
- benchmaker-0.1.0/tests/test_bundle.py +179 -0
- benchmaker-0.1.0/tests/test_coding_agent.py +352 -0
- benchmaker-0.1.0/tests/test_eval.py +531 -0
- benchmaker-0.1.0/tests/test_hf.py +366 -0
- benchmaker-0.1.0/tests/test_smoke.py +628 -0
- benchmaker-0.1.0/tests/test_trace.py +161 -0
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: benchmaker
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Async HTTP benchmarking utility with pluggable workloads and load models.
|
|
5
|
+
Author: Xiaozhe Yao
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.10
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Requires-Dist: aiohttp>=3.9
|
|
10
|
+
Requires-Dist: click>=8.1
|
|
11
|
+
Requires-Dist: pyyaml>=6.0
|
|
12
|
+
Provides-Extra: rich
|
|
13
|
+
Requires-Dist: rich>=13; extra == "rich"
|
|
14
|
+
Provides-Extra: hf
|
|
15
|
+
Requires-Dist: datasets>=2.18; extra == "hf"
|
|
16
|
+
Provides-Extra: dev
|
|
17
|
+
Requires-Dist: pytest>=7; extra == "dev"
|
|
18
|
+
Requires-Dist: pytest-asyncio>=0.23; extra == "dev"
|
|
19
|
+
|
|
20
|
+
# bench-maker
|
|
21
|
+
|
|
22
|
+
Async HTTP benchmarking with pluggable workload-types (protocols), workloads
|
|
23
|
+
(datasets), load models, hooks, and optional periodic monitors.
|
|
24
|
+
|
|
25
|
+
```text
|
|
26
|
+
+--------+ item +---------------+ request +-----------+ +---------+
|
|
27
|
+
|workload|--------->| workload-type |------------>| pre-hooks |-->| aiohttp |
|
|
28
|
+
|(dataset| | (protocol) | +-----------+ +---------+
|
|
29
|
+
| / log) | | make_request | |
|
|
30
|
+
+--------+ | make_sample | +------------+ v
|
|
31
|
+
^ +---------------+ | post-hooks |<----+
|
|
32
|
+
| +------------+
|
|
33
|
+
+-- load model decides WHEN to fire ----+ v
|
|
34
|
+
| +----------+
|
|
35
|
+
monitors run alongside ------+------->| metrics |
|
|
36
|
+
(Prometheus, NVML, ...) | aggregator|
|
|
37
|
+
+----------+
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Install
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
pip install -e .
|
|
44
|
+
pip install -e ".[dev]" # for tests
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
This installs the `benchmaker` Python package and the `bench-maker` CLI.
|
|
48
|
+
|
|
49
|
+
## 30-second tour
|
|
50
|
+
|
|
51
|
+
```python
|
|
52
|
+
import asyncio
|
|
53
|
+
from benchmaker import BenchConfig, BenchRunner, ConstantRPS, HttpWorkloadType
|
|
54
|
+
|
|
55
|
+
async def main():
|
|
56
|
+
cfg = BenchConfig(
|
|
57
|
+
workload_type=HttpWorkloadType(url="https://httpbin.org/get"),
|
|
58
|
+
load=ConstantRPS(rps=50, duration_s=10),
|
|
59
|
+
)
|
|
60
|
+
result = await BenchRunner(cfg).run()
|
|
61
|
+
print(result.summary)
|
|
62
|
+
|
|
63
|
+
asyncio.run(main())
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
Or via the CLI:
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
bench-maker quick --url https://httpbin.org/get --rate poisson:50 --duration 10s
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
## Walkthrough: benchmarking an LLM endpoint with ShareGPT
|
|
73
|
+
|
|
74
|
+
A realistic LLM benchmark needs a real prompt distribution.
|
|
75
|
+
[ShareGPT V3](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered)
|
|
76
|
+
is a common choice — multi-turn human/assistant conversations scraped from real
|
|
77
|
+
ChatGPT users. A cleaned, benchmark-ready copy is published at
|
|
78
|
+
[`researchcomputer/llmsys-bench`](https://huggingface.co/datasets/researchcomputer/llmsys-bench)
|
|
79
|
+
(`split="sharegpt"`), with one row per conversation:
|
|
80
|
+
|
|
81
|
+
```json
|
|
82
|
+
{"id": "...", "messages": [{"role": "user", "content": "..."},
|
|
83
|
+
{"role": "assistant", "content": "..."},
|
|
84
|
+
{"role": "user", "content": "..."}]}
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
`messages` is the only content field — it's everything a chat benchmark needs.
|
|
88
|
+
Each row is truncated to end on a **user** turn, so it's a valid generation
|
|
89
|
+
request: the server completes the final assistant reply given the prior
|
|
90
|
+
history. Short source conversations collapse to a single user turn (a plain
|
|
91
|
+
single-turn prompt); longer ones carry multi-turn context.
|
|
92
|
+
|
|
93
|
+
### Load it directly from the Hub
|
|
94
|
+
|
|
95
|
+
Pull the published split and feed each row's `messages` list straight into the
|
|
96
|
+
chat workload-type (`pip install -e ".[hf]"`):
|
|
97
|
+
|
|
98
|
+
```python
|
|
99
|
+
import asyncio
|
|
100
|
+
from datasets import load_dataset
|
|
101
|
+
from benchmaker import (
|
|
102
|
+
BenchConfig, BenchRunner, OpenAIChatWorkloadType,
|
|
103
|
+
IterableWorkload, parse_rate_spec,
|
|
104
|
+
)
|
|
105
|
+
|
|
106
|
+
async def main():
|
|
107
|
+
ds = load_dataset("researchcomputer/llmsys-bench", split="sharegpt")
|
|
108
|
+
cfg = BenchConfig(
|
|
109
|
+
workload_type=OpenAIChatWorkloadType(
|
|
110
|
+
url="http://localhost:8000/v1/chat/completions",
|
|
111
|
+
model="meta-llama/Llama-3.1-8B-Instruct",
|
|
112
|
+
max_tokens=256,
|
|
113
|
+
),
|
|
114
|
+
workload=IterableWorkload(row["messages"] for row in ds),
|
|
115
|
+
load=parse_rate_spec("poisson:8", duration_s=60),
|
|
116
|
+
timeout_s=600,
|
|
117
|
+
)
|
|
118
|
+
result = await BenchRunner(cfg).run()
|
|
119
|
+
print(result.summary)
|
|
120
|
+
|
|
121
|
+
asyncio.run(main())
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
`OpenAIChatWorkloadType` receives the message list as-is, so single-turn rows
|
|
125
|
+
send one user message and multi-turn rows replay the full history before the
|
|
126
|
+
server generates the final assistant turn. TTFT, inter-token latency, and
|
|
127
|
+
tokens/sec are captured the same way in both cases. URL / model / API key can
|
|
128
|
+
also come from `.env` via `OpenAIChatWorkloadType.from_env(...)`.
|
|
129
|
+
|
|
130
|
+
### Rebuild or customize it yourself
|
|
131
|
+
|
|
132
|
+
The published split is produced by `tools/prepare_sharegpt.py`, which downloads
|
|
133
|
+
the upstream JSON once into `.local/` (gitignored) and converts it to the JSONL
|
|
134
|
+
shape above. Run it when you want a subset, different filtering, or a refresh:
|
|
135
|
+
|
|
136
|
+
```bash
|
|
137
|
+
# Defaults: .local/sharegpt_v3_raw.json -> .local/sharegpt_v3.jsonl
|
|
138
|
+
python tools/prepare_sharegpt.py
|
|
139
|
+
|
|
140
|
+
# A quick subset for smoke tests:
|
|
141
|
+
python tools/prepare_sharegpt.py --max-items 2000
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
The raw download is ~700 MB. Use `--min-chars` / `--max-chars` to drop empty or
|
|
145
|
+
pathologically long conversations (measured over total message content per
|
|
146
|
+
row). Load the local file as a workload with `JsonlWorkload(path=...,
|
|
147
|
+
field="messages")`, or on the CLI:
|
|
148
|
+
|
|
149
|
+
```bash
|
|
150
|
+
bench-maker llm \
|
|
151
|
+
--url http://localhost:8000/v1/chat/completions \
|
|
152
|
+
--model meta-llama/Llama-3.1-8B-Instruct \
|
|
153
|
+
--prompts-jsonl .local/sharegpt_v3.jsonl \
|
|
154
|
+
--prompt-field messages \
|
|
155
|
+
--max-tokens 256 \
|
|
156
|
+
--rate poisson:8 --duration 60s \
|
|
157
|
+
--out-dir ./runs --label dataset=sharegpt
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
To re-publish after regenerating, `tools/upload_sharegpt_hf.py` pushes the
|
|
161
|
+
JSONL back to the Hub (needs a write token).
|
|
162
|
+
|
|
163
|
+
## Documentation
|
|
164
|
+
|
|
165
|
+
Full docs live in [`docs/`](docs/):
|
|
166
|
+
|
|
167
|
+
- [Quickstart](docs/quickstart.md)
|
|
168
|
+
- [Concepts](docs/concepts.md) — WorkloadType, Workload, LoadModel, Monitor
|
|
169
|
+
- [Load models](docs/load-models.md) — rate-spec syntax, open vs closed loop
|
|
170
|
+
- [Workloads & workload-types](docs/workloads.md) — built-ins and custom subclasses
|
|
171
|
+
- [Hooks](docs/hooks.md) — pre/post request processing
|
|
172
|
+
- [Monitors](docs/monitors.md) — vLLM `/metrics`, GPU telemetry, custom samplers
|
|
173
|
+
- [Metrics & output](docs/metrics.md) — summary structure, JSONL dumps
|
|
174
|
+
- [Correctness / accuracy eval](docs/eval.md) — grade responses against references
|
|
175
|
+
- [CLI & YAML reference](docs/cli-and-yaml.md)
|
|
176
|
+
- [ShareGPT benchmark](docs/sharegpt-benchmark.md) — self-contained end-to-end walkthrough
|
|
177
|
+
|
|
178
|
+
## Examples
|
|
179
|
+
|
|
180
|
+
Under [`examples/`](examples/):
|
|
181
|
+
|
|
182
|
+
- `simple_get.py` — minimal library usage
|
|
183
|
+
- `custom_hooks.py` — request signing + response parsing
|
|
184
|
+
- `llm_chat.py` — OpenAI-compatible LLM endpoint with streaming
|
|
185
|
+
- `vllm_with_monitor.py` — LLM benchmark with concurrent vLLM `/metrics` scrape
|
|
186
|
+
- `sandbox_exec.py` — Flash Sandbox `/exec` latency benchmark
|
|
187
|
+
- `sandbox_lifecycle.py` — full create → exec → delete cold-start benchmark
|
|
188
|
+
- `llm_eval.py` — LLM benchmark + accuracy grading (exact/regex/judge)
|
|
189
|
+
- `gsm8k_eval.py` — GSM8K from HuggingFace + integer-match scorer
|
|
190
|
+
- `config.yaml` — generic HTTP YAML config
|
|
191
|
+
- `config_llm.yaml` — LLM YAML config with a Prometheus monitor
|
|
192
|
+
|
|
193
|
+
Helper scripts under [`tools/`](tools/):
|
|
194
|
+
|
|
195
|
+
- `prepare_sharegpt.py` — fetch ShareGPT V3 and convert to a generic JSONL
|
|
196
|
+
- `upload_sharegpt_hf.py` — push the converted JSONL to the HF Hub (write token)
|
|
197
|
+
- `start_local_llm.sh` — example local SGLang launch command
|
|
198
|
+
|
|
199
|
+
## Project layout
|
|
200
|
+
|
|
201
|
+
```
|
|
202
|
+
benchmaker/ # library code
|
|
203
|
+
entrypoints/ # CLI (bench-maker)
|
|
204
|
+
examples/ # runnable examples
|
|
205
|
+
tools/ # one-off helper scripts (dataset prep, etc.)
|
|
206
|
+
tests/ # pytest smoke tests
|
|
207
|
+
docs/ # reference docs
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
## Run the tests
|
|
211
|
+
|
|
212
|
+
```bash
|
|
213
|
+
pytest -q
|
|
214
|
+
```
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
# bench-maker
|
|
2
|
+
|
|
3
|
+
Async HTTP benchmarking with pluggable workload-types (protocols), workloads
|
|
4
|
+
(datasets), load models, hooks, and optional periodic monitors.
|
|
5
|
+
|
|
6
|
+
```text
|
|
7
|
+
+--------+ item +---------------+ request +-----------+ +---------+
|
|
8
|
+
|workload|--------->| workload-type |------------>| pre-hooks |-->| aiohttp |
|
|
9
|
+
|(dataset| | (protocol) | +-----------+ +---------+
|
|
10
|
+
| / log) | | make_request | |
|
|
11
|
+
+--------+ | make_sample | +------------+ v
|
|
12
|
+
^ +---------------+ | post-hooks |<----+
|
|
13
|
+
| +------------+
|
|
14
|
+
+-- load model decides WHEN to fire ----+ v
|
|
15
|
+
| +----------+
|
|
16
|
+
monitors run alongside ------+------->| metrics |
|
|
17
|
+
(Prometheus, NVML, ...) | aggregator|
|
|
18
|
+
+----------+
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Install
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
pip install -e .
|
|
25
|
+
pip install -e ".[dev]" # for tests
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
This installs the `benchmaker` Python package and the `bench-maker` CLI.
|
|
29
|
+
|
|
30
|
+
## 30-second tour
|
|
31
|
+
|
|
32
|
+
```python
|
|
33
|
+
import asyncio
|
|
34
|
+
from benchmaker import BenchConfig, BenchRunner, ConstantRPS, HttpWorkloadType
|
|
35
|
+
|
|
36
|
+
async def main():
|
|
37
|
+
cfg = BenchConfig(
|
|
38
|
+
workload_type=HttpWorkloadType(url="https://httpbin.org/get"),
|
|
39
|
+
load=ConstantRPS(rps=50, duration_s=10),
|
|
40
|
+
)
|
|
41
|
+
result = await BenchRunner(cfg).run()
|
|
42
|
+
print(result.summary)
|
|
43
|
+
|
|
44
|
+
asyncio.run(main())
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Or via the CLI:
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
bench-maker quick --url https://httpbin.org/get --rate poisson:50 --duration 10s
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Walkthrough: benchmarking an LLM endpoint with ShareGPT
|
|
54
|
+
|
|
55
|
+
A realistic LLM benchmark needs a real prompt distribution.
|
|
56
|
+
[ShareGPT V3](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered)
|
|
57
|
+
is a common choice — multi-turn human/assistant conversations scraped from real
|
|
58
|
+
ChatGPT users. A cleaned, benchmark-ready copy is published at
|
|
59
|
+
[`researchcomputer/llmsys-bench`](https://huggingface.co/datasets/researchcomputer/llmsys-bench)
|
|
60
|
+
(`split="sharegpt"`), with one row per conversation:
|
|
61
|
+
|
|
62
|
+
```json
|
|
63
|
+
{"id": "...", "messages": [{"role": "user", "content": "..."},
|
|
64
|
+
{"role": "assistant", "content": "..."},
|
|
65
|
+
{"role": "user", "content": "..."}]}
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
`messages` is the only content field — it's everything a chat benchmark needs.
|
|
69
|
+
Each row is truncated to end on a **user** turn, so it's a valid generation
|
|
70
|
+
request: the server completes the final assistant reply given the prior
|
|
71
|
+
history. Short source conversations collapse to a single user turn (a plain
|
|
72
|
+
single-turn prompt); longer ones carry multi-turn context.
|
|
73
|
+
|
|
74
|
+
### Load it directly from the Hub
|
|
75
|
+
|
|
76
|
+
Pull the published split and feed each row's `messages` list straight into the
|
|
77
|
+
chat workload-type (`pip install -e ".[hf]"`):
|
|
78
|
+
|
|
79
|
+
```python
|
|
80
|
+
import asyncio
|
|
81
|
+
from datasets import load_dataset
|
|
82
|
+
from benchmaker import (
|
|
83
|
+
BenchConfig, BenchRunner, OpenAIChatWorkloadType,
|
|
84
|
+
IterableWorkload, parse_rate_spec,
|
|
85
|
+
)
|
|
86
|
+
|
|
87
|
+
async def main():
|
|
88
|
+
ds = load_dataset("researchcomputer/llmsys-bench", split="sharegpt")
|
|
89
|
+
cfg = BenchConfig(
|
|
90
|
+
workload_type=OpenAIChatWorkloadType(
|
|
91
|
+
url="http://localhost:8000/v1/chat/completions",
|
|
92
|
+
model="meta-llama/Llama-3.1-8B-Instruct",
|
|
93
|
+
max_tokens=256,
|
|
94
|
+
),
|
|
95
|
+
workload=IterableWorkload(row["messages"] for row in ds),
|
|
96
|
+
load=parse_rate_spec("poisson:8", duration_s=60),
|
|
97
|
+
timeout_s=600,
|
|
98
|
+
)
|
|
99
|
+
result = await BenchRunner(cfg).run()
|
|
100
|
+
print(result.summary)
|
|
101
|
+
|
|
102
|
+
asyncio.run(main())
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
`OpenAIChatWorkloadType` receives the message list as-is, so single-turn rows
|
|
106
|
+
send one user message and multi-turn rows replay the full history before the
|
|
107
|
+
server generates the final assistant turn. TTFT, inter-token latency, and
|
|
108
|
+
tokens/sec are captured the same way in both cases. URL / model / API key can
|
|
109
|
+
also come from `.env` via `OpenAIChatWorkloadType.from_env(...)`.
|
|
110
|
+
|
|
111
|
+
### Rebuild or customize it yourself
|
|
112
|
+
|
|
113
|
+
The published split is produced by `tools/prepare_sharegpt.py`, which downloads
|
|
114
|
+
the upstream JSON once into `.local/` (gitignored) and converts it to the JSONL
|
|
115
|
+
shape above. Run it when you want a subset, different filtering, or a refresh:
|
|
116
|
+
|
|
117
|
+
```bash
|
|
118
|
+
# Defaults: .local/sharegpt_v3_raw.json -> .local/sharegpt_v3.jsonl
|
|
119
|
+
python tools/prepare_sharegpt.py
|
|
120
|
+
|
|
121
|
+
# A quick subset for smoke tests:
|
|
122
|
+
python tools/prepare_sharegpt.py --max-items 2000
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
The raw download is ~700 MB. Use `--min-chars` / `--max-chars` to drop empty or
|
|
126
|
+
pathologically long conversations (measured over total message content per
|
|
127
|
+
row). Load the local file as a workload with `JsonlWorkload(path=...,
|
|
128
|
+
field="messages")`, or on the CLI:
|
|
129
|
+
|
|
130
|
+
```bash
|
|
131
|
+
bench-maker llm \
|
|
132
|
+
--url http://localhost:8000/v1/chat/completions \
|
|
133
|
+
--model meta-llama/Llama-3.1-8B-Instruct \
|
|
134
|
+
--prompts-jsonl .local/sharegpt_v3.jsonl \
|
|
135
|
+
--prompt-field messages \
|
|
136
|
+
--max-tokens 256 \
|
|
137
|
+
--rate poisson:8 --duration 60s \
|
|
138
|
+
--out-dir ./runs --label dataset=sharegpt
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
To re-publish after regenerating, `tools/upload_sharegpt_hf.py` pushes the
|
|
142
|
+
JSONL back to the Hub (needs a write token).
|
|
143
|
+
|
|
144
|
+
## Documentation
|
|
145
|
+
|
|
146
|
+
Full docs live in [`docs/`](docs/):
|
|
147
|
+
|
|
148
|
+
- [Quickstart](docs/quickstart.md)
|
|
149
|
+
- [Concepts](docs/concepts.md) — WorkloadType, Workload, LoadModel, Monitor
|
|
150
|
+
- [Load models](docs/load-models.md) — rate-spec syntax, open vs closed loop
|
|
151
|
+
- [Workloads & workload-types](docs/workloads.md) — built-ins and custom subclasses
|
|
152
|
+
- [Hooks](docs/hooks.md) — pre/post request processing
|
|
153
|
+
- [Monitors](docs/monitors.md) — vLLM `/metrics`, GPU telemetry, custom samplers
|
|
154
|
+
- [Metrics & output](docs/metrics.md) — summary structure, JSONL dumps
|
|
155
|
+
- [Correctness / accuracy eval](docs/eval.md) — grade responses against references
|
|
156
|
+
- [CLI & YAML reference](docs/cli-and-yaml.md)
|
|
157
|
+
- [ShareGPT benchmark](docs/sharegpt-benchmark.md) — self-contained end-to-end walkthrough
|
|
158
|
+
|
|
159
|
+
## Examples
|
|
160
|
+
|
|
161
|
+
Under [`examples/`](examples/):
|
|
162
|
+
|
|
163
|
+
- `simple_get.py` — minimal library usage
|
|
164
|
+
- `custom_hooks.py` — request signing + response parsing
|
|
165
|
+
- `llm_chat.py` — OpenAI-compatible LLM endpoint with streaming
|
|
166
|
+
- `vllm_with_monitor.py` — LLM benchmark with concurrent vLLM `/metrics` scrape
|
|
167
|
+
- `sandbox_exec.py` — Flash Sandbox `/exec` latency benchmark
|
|
168
|
+
- `sandbox_lifecycle.py` — full create → exec → delete cold-start benchmark
|
|
169
|
+
- `llm_eval.py` — LLM benchmark + accuracy grading (exact/regex/judge)
|
|
170
|
+
- `gsm8k_eval.py` — GSM8K from HuggingFace + integer-match scorer
|
|
171
|
+
- `config.yaml` — generic HTTP YAML config
|
|
172
|
+
- `config_llm.yaml` — LLM YAML config with a Prometheus monitor
|
|
173
|
+
|
|
174
|
+
Helper scripts under [`tools/`](tools/):
|
|
175
|
+
|
|
176
|
+
- `prepare_sharegpt.py` — fetch ShareGPT V3 and convert to a generic JSONL
|
|
177
|
+
- `upload_sharegpt_hf.py` — push the converted JSONL to the HF Hub (write token)
|
|
178
|
+
- `start_local_llm.sh` — example local SGLang launch command
|
|
179
|
+
|
|
180
|
+
## Project layout
|
|
181
|
+
|
|
182
|
+
```
|
|
183
|
+
benchmaker/ # library code
|
|
184
|
+
entrypoints/ # CLI (bench-maker)
|
|
185
|
+
examples/ # runnable examples
|
|
186
|
+
tools/ # one-off helper scripts (dataset prep, etc.)
|
|
187
|
+
tests/ # pytest smoke tests
|
|
188
|
+
docs/ # reference docs
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
## Run the tests
|
|
192
|
+
|
|
193
|
+
```bash
|
|
194
|
+
pytest -q
|
|
195
|
+
```
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
"""benchmaker: async HTTP benchmarking with pluggable workload-types + workloads (datasets)."""
|
|
2
|
+
|
|
3
|
+
from benchmaker.types import (
|
|
4
|
+
Request,
|
|
5
|
+
Response,
|
|
6
|
+
Sample,
|
|
7
|
+
PreRequestHook,
|
|
8
|
+
PostResponseHook,
|
|
9
|
+
)
|
|
10
|
+
from benchmaker.workloads.base import WorkloadType
|
|
11
|
+
from benchmaker.workloads.datasets import (
|
|
12
|
+
Workload,
|
|
13
|
+
StaticWorkload,
|
|
14
|
+
JsonlWorkload,
|
|
15
|
+
CallableWorkload,
|
|
16
|
+
IterableWorkload,
|
|
17
|
+
)
|
|
18
|
+
from benchmaker.workloads.http import HttpWorkloadType
|
|
19
|
+
from benchmaker.workloads.llm import OpenAIChatWorkloadType
|
|
20
|
+
from benchmaker.workloads.sandbox import SandboxWorkloadType
|
|
21
|
+
from benchmaker.workloads.hf import HFDatasetWorkload
|
|
22
|
+
from benchmaker.workloads.agent import (
|
|
23
|
+
Agent,
|
|
24
|
+
AgentContext,
|
|
25
|
+
AgentResult,
|
|
26
|
+
AgentWorkloadType,
|
|
27
|
+
CallableAgent,
|
|
28
|
+
)
|
|
29
|
+
from benchmaker.workloads.eval import (
|
|
30
|
+
EvalWorkloadType,
|
|
31
|
+
Scorer,
|
|
32
|
+
correctness_hook,
|
|
33
|
+
extract_openai_text,
|
|
34
|
+
extract_raw_text,
|
|
35
|
+
extract_text,
|
|
36
|
+
exact_match,
|
|
37
|
+
contains,
|
|
38
|
+
regex_match,
|
|
39
|
+
json_valid,
|
|
40
|
+
multiple_choice,
|
|
41
|
+
judge_llm,
|
|
42
|
+
openai_chat_judge,
|
|
43
|
+
)
|
|
44
|
+
from benchmaker.load import (
|
|
45
|
+
LoadModel,
|
|
46
|
+
ConstantRPS,
|
|
47
|
+
PoissonRPS,
|
|
48
|
+
ClosedLoop,
|
|
49
|
+
Sweep,
|
|
50
|
+
Ramp,
|
|
51
|
+
parse_rate_spec,
|
|
52
|
+
)
|
|
53
|
+
from benchmaker.env import interpolate, load_dotenv
|
|
54
|
+
from benchmaker.monitors import (
|
|
55
|
+
Monitor,
|
|
56
|
+
FunctionMonitor,
|
|
57
|
+
PrometheusMonitor,
|
|
58
|
+
parse_prometheus,
|
|
59
|
+
)
|
|
60
|
+
from benchmaker.runner import BenchRunner, BenchConfig, BenchResult
|
|
61
|
+
from benchmaker.trace import (
|
|
62
|
+
ReplayWorkloadType,
|
|
63
|
+
TracePacedLoad,
|
|
64
|
+
TraceRecorder,
|
|
65
|
+
TraceWorkload,
|
|
66
|
+
load_trace,
|
|
67
|
+
)
|
|
68
|
+
from benchmaker.bundle import (
|
|
69
|
+
BUNDLE_VERSION,
|
|
70
|
+
RunMeta,
|
|
71
|
+
default_run_id,
|
|
72
|
+
is_bundle_dir,
|
|
73
|
+
iter_jsonl,
|
|
74
|
+
read_bundle,
|
|
75
|
+
write_bundle,
|
|
76
|
+
)
|
|
77
|
+
|
|
78
|
+
__all__ = [
|
|
79
|
+
"Request",
|
|
80
|
+
"Response",
|
|
81
|
+
"Sample",
|
|
82
|
+
"PreRequestHook",
|
|
83
|
+
"PostResponseHook",
|
|
84
|
+
# workload-types (protocols)
|
|
85
|
+
"WorkloadType",
|
|
86
|
+
"HttpWorkloadType",
|
|
87
|
+
"OpenAIChatWorkloadType",
|
|
88
|
+
"SandboxWorkloadType",
|
|
89
|
+
"HFDatasetWorkload",
|
|
90
|
+
# agent workload (pluggable user-defined agents)
|
|
91
|
+
"Agent",
|
|
92
|
+
"AgentContext",
|
|
93
|
+
"AgentResult",
|
|
94
|
+
"AgentWorkloadType",
|
|
95
|
+
"CallableAgent",
|
|
96
|
+
# eval / correctness
|
|
97
|
+
"EvalWorkloadType",
|
|
98
|
+
"Scorer",
|
|
99
|
+
"correctness_hook",
|
|
100
|
+
"extract_openai_text",
|
|
101
|
+
"extract_raw_text",
|
|
102
|
+
"extract_text",
|
|
103
|
+
"exact_match",
|
|
104
|
+
"contains",
|
|
105
|
+
"regex_match",
|
|
106
|
+
"json_valid",
|
|
107
|
+
"multiple_choice",
|
|
108
|
+
"judge_llm",
|
|
109
|
+
"openai_chat_judge",
|
|
110
|
+
# workloads (datasets / input sources)
|
|
111
|
+
"Workload",
|
|
112
|
+
"StaticWorkload",
|
|
113
|
+
"JsonlWorkload",
|
|
114
|
+
"CallableWorkload",
|
|
115
|
+
"IterableWorkload",
|
|
116
|
+
# load models
|
|
117
|
+
"LoadModel",
|
|
118
|
+
"ConstantRPS",
|
|
119
|
+
"PoissonRPS",
|
|
120
|
+
"ClosedLoop",
|
|
121
|
+
"Sweep",
|
|
122
|
+
"Ramp",
|
|
123
|
+
"parse_rate_spec",
|
|
124
|
+
# monitors
|
|
125
|
+
"Monitor",
|
|
126
|
+
"FunctionMonitor",
|
|
127
|
+
"PrometheusMonitor",
|
|
128
|
+
"parse_prometheus",
|
|
129
|
+
# env
|
|
130
|
+
"load_dotenv",
|
|
131
|
+
"interpolate",
|
|
132
|
+
# runner
|
|
133
|
+
"BenchRunner",
|
|
134
|
+
"BenchConfig",
|
|
135
|
+
"BenchResult",
|
|
136
|
+
# trace: record & replay
|
|
137
|
+
"TraceRecorder",
|
|
138
|
+
"ReplayWorkloadType",
|
|
139
|
+
"TraceWorkload",
|
|
140
|
+
"TracePacedLoad",
|
|
141
|
+
"load_trace",
|
|
142
|
+
# bundle / output layout
|
|
143
|
+
"BUNDLE_VERSION",
|
|
144
|
+
"RunMeta",
|
|
145
|
+
"default_run_id",
|
|
146
|
+
"is_bundle_dir",
|
|
147
|
+
"iter_jsonl",
|
|
148
|
+
"read_bundle",
|
|
149
|
+
"write_bundle",
|
|
150
|
+
]
|
|
151
|
+
|
|
152
|
+
__version__ = "0.1.0"
|