benchmaker 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. benchmaker-0.1.0/PKG-INFO +214 -0
  2. benchmaker-0.1.0/README.md +195 -0
  3. benchmaker-0.1.0/benchmaker/__init__.py +152 -0
  4. benchmaker-0.1.0/benchmaker/bundle.py +193 -0
  5. benchmaker-0.1.0/benchmaker/cli.py +382 -0
  6. benchmaker-0.1.0/benchmaker/collect.py +178 -0
  7. benchmaker-0.1.0/benchmaker/config.py +448 -0
  8. benchmaker-0.1.0/benchmaker/env.py +87 -0
  9. benchmaker-0.1.0/benchmaker/load.py +326 -0
  10. benchmaker-0.1.0/benchmaker/metrics.py +234 -0
  11. benchmaker-0.1.0/benchmaker/monitors.py +228 -0
  12. benchmaker-0.1.0/benchmaker/runner.py +275 -0
  13. benchmaker-0.1.0/benchmaker/trace.py +217 -0
  14. benchmaker-0.1.0/benchmaker/types.py +98 -0
  15. benchmaker-0.1.0/benchmaker/workloads/__init__.py +53 -0
  16. benchmaker-0.1.0/benchmaker/workloads/agent.py +308 -0
  17. benchmaker-0.1.0/benchmaker/workloads/base.py +79 -0
  18. benchmaker-0.1.0/benchmaker/workloads/datasets.py +156 -0
  19. benchmaker-0.1.0/benchmaker/workloads/eval.py +504 -0
  20. benchmaker-0.1.0/benchmaker/workloads/hf.py +382 -0
  21. benchmaker-0.1.0/benchmaker/workloads/http.py +77 -0
  22. benchmaker-0.1.0/benchmaker/workloads/llm.py +258 -0
  23. benchmaker-0.1.0/benchmaker/workloads/sandbox.py +470 -0
  24. benchmaker-0.1.0/benchmaker.egg-info/PKG-INFO +214 -0
  25. benchmaker-0.1.0/benchmaker.egg-info/SOURCES.txt +36 -0
  26. benchmaker-0.1.0/benchmaker.egg-info/dependency_links.txt +1 -0
  27. benchmaker-0.1.0/benchmaker.egg-info/entry_points.txt +2 -0
  28. benchmaker-0.1.0/benchmaker.egg-info/requires.txt +13 -0
  29. benchmaker-0.1.0/benchmaker.egg-info/top_level.txt +1 -0
  30. benchmaker-0.1.0/pyproject.toml +32 -0
  31. benchmaker-0.1.0/setup.cfg +4 -0
  32. benchmaker-0.1.0/tests/test_agent.py +185 -0
  33. benchmaker-0.1.0/tests/test_bundle.py +179 -0
  34. benchmaker-0.1.0/tests/test_coding_agent.py +352 -0
  35. benchmaker-0.1.0/tests/test_eval.py +531 -0
  36. benchmaker-0.1.0/tests/test_hf.py +366 -0
  37. benchmaker-0.1.0/tests/test_smoke.py +628 -0
  38. benchmaker-0.1.0/tests/test_trace.py +161 -0
@@ -0,0 +1,214 @@
1
+ Metadata-Version: 2.4
2
+ Name: benchmaker
3
+ Version: 0.1.0
4
+ Summary: Async HTTP benchmarking utility with pluggable workloads and load models.
5
+ Author: Xiaozhe Yao
6
+ License: MIT
7
+ Requires-Python: >=3.10
8
+ Description-Content-Type: text/markdown
9
+ Requires-Dist: aiohttp>=3.9
10
+ Requires-Dist: click>=8.1
11
+ Requires-Dist: pyyaml>=6.0
12
+ Provides-Extra: rich
13
+ Requires-Dist: rich>=13; extra == "rich"
14
+ Provides-Extra: hf
15
+ Requires-Dist: datasets>=2.18; extra == "hf"
16
+ Provides-Extra: dev
17
+ Requires-Dist: pytest>=7; extra == "dev"
18
+ Requires-Dist: pytest-asyncio>=0.23; extra == "dev"
19
+
20
+ # bench-maker
21
+
22
+ Async HTTP benchmarking with pluggable workload-types (protocols), workloads
23
+ (datasets), load models, hooks, and optional periodic monitors.
24
+
25
+ ```text
26
+ +--------+ item +---------------+ request +-----------+ +---------+
27
+ |workload|--------->| workload-type |------------>| pre-hooks |-->| aiohttp |
28
+ |(dataset| | (protocol) | +-----------+ +---------+
29
+ | / log) | | make_request | |
30
+ +--------+ | make_sample | +------------+ v
31
+ ^ +---------------+ | post-hooks |<----+
32
+ | +------------+
33
+ +-- load model decides WHEN to fire ----+ v
34
+ | +----------+
35
+ monitors run alongside ------+------->| metrics |
36
+ (Prometheus, NVML, ...) | aggregator|
37
+ +----------+
38
+ ```
39
+
40
+ ## Install
41
+
42
+ ```bash
43
+ pip install -e .
44
+ pip install -e ".[dev]" # for tests (quotes keep zsh from globbing the brackets)
45
+ ```
46
+
47
+ This installs the `benchmaker` Python package and the `bench-maker` CLI.
48
+
49
+ ## 30-second tour
50
+
51
+ ```python
52
+ import asyncio
53
+ from benchmaker import BenchConfig, BenchRunner, ConstantRPS, HttpWorkloadType
54
+
55
+ async def main():
56
+ cfg = BenchConfig(
57
+ workload_type=HttpWorkloadType(url="https://httpbin.org/get"),
58
+ load=ConstantRPS(rps=50, duration_s=10),
59
+ )
60
+ result = await BenchRunner(cfg).run()
61
+ print(result.summary)
62
+
63
+ asyncio.run(main())
64
+ ```
65
+
66
+ Or via the CLI (this example uses Poisson arrivals rather than a constant rate):
67
+
68
+ ```bash
69
+ bench-maker quick --url https://httpbin.org/get --rate poisson:50 --duration 10s
70
+ ```
71
+
72
+ ## Walkthrough: benchmarking an LLM endpoint with ShareGPT
73
+
74
+ A realistic LLM benchmark needs a real prompt distribution.
75
+ [ShareGPT V3](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered)
76
+ is a common choice — multi-turn human/assistant conversations scraped from real
77
+ ChatGPT users. A cleaned, benchmark-ready copy is published at
78
+ [`researchcomputer/llmsys-bench`](https://huggingface.co/datasets/researchcomputer/llmsys-bench)
79
+ (`split="sharegpt"`), with one row per conversation:
80
+
81
+ ```json
82
+ {"id": "...", "messages": [{"role": "user", "content": "..."},
83
+ {"role": "assistant", "content": "..."},
84
+ {"role": "user", "content": "..."}]}
85
+ ```
86
+
87
+ `messages` is the only content field — it's everything a chat benchmark needs.
88
+ Each row is truncated to end on a **user** turn, so it's a valid generation
89
+ request: the server completes the final assistant reply given the prior
90
+ history. Short source conversations collapse to a single user turn (a plain
91
+ single-turn prompt); longer ones carry multi-turn context.
92
+
93
+ ### Load it directly from the Hub
94
+
95
+ Pull the published split and feed each row's `messages` list straight into the
96
+ chat workload-type (needs the `hf` extra: `pip install -e ".[hf]"`):
97
+
98
+ ```python
99
+ import asyncio
100
+ from datasets import load_dataset
101
+ from benchmaker import (
102
+ BenchConfig, BenchRunner, OpenAIChatWorkloadType,
103
+ IterableWorkload, parse_rate_spec,
104
+ )
105
+
106
+ async def main():
107
+ ds = load_dataset("researchcomputer/llmsys-bench", split="sharegpt")
108
+ cfg = BenchConfig(
109
+ workload_type=OpenAIChatWorkloadType(
110
+ url="http://localhost:8000/v1/chat/completions",
111
+ model="meta-llama/Llama-3.1-8B-Instruct",
112
+ max_tokens=256,
113
+ ),
114
+ workload=IterableWorkload(row["messages"] for row in ds),
115
+ load=parse_rate_spec("poisson:8", duration_s=60),
116
+ timeout_s=600,
117
+ )
118
+ result = await BenchRunner(cfg).run()
119
+ print(result.summary)
120
+
121
+ asyncio.run(main())
122
+ ```
123
+
124
+ `OpenAIChatWorkloadType` receives the message list as-is, so single-turn rows
125
+ send one user message and multi-turn rows replay the full history before the
126
+ server generates the final assistant turn. TTFT, inter-token latency, and
127
+ tokens/sec are captured the same way in both cases. URL / model / API key can
128
+ also come from `.env` via `OpenAIChatWorkloadType.from_env(...)`.
129
+
130
+ ### Rebuild or customize it yourself
131
+
132
+ The published split is produced by `tools/prepare_sharegpt.py`, which downloads
133
+ the upstream JSON once into `.local/` (gitignored) and converts it to the JSONL
134
+ shape above. Run it when you want a subset, different filtering, or a refresh:
135
+
136
+ ```bash
137
+ # Defaults: .local/sharegpt_v3_raw.json -> .local/sharegpt_v3.jsonl
138
+ python tools/prepare_sharegpt.py
139
+
140
+ # A quick subset for smoke tests:
141
+ python tools/prepare_sharegpt.py --max-items 2000
142
+ ```
143
+
144
+ The raw download is ~700 MB. Use `--min-chars` / `--max-chars` to drop empty or
145
+ pathologically long conversations (measured over total message content per
146
+ row). Point any workload at the local file with `JsonlWorkload(path=...,
147
+ field="messages")`, or on the CLI:
148
+
149
+ ```bash
150
+ bench-maker llm \
151
+ --url http://localhost:8000/v1/chat/completions \
152
+ --model meta-llama/Llama-3.1-8B-Instruct \
153
+ --prompts-jsonl .local/sharegpt_v3.jsonl \
154
+ --prompt-field messages \
155
+ --max-tokens 256 \
156
+ --rate poisson:8 --duration 60s \
157
+ --out-dir ./runs --label dataset=sharegpt
158
+ ```
159
+
160
+ To re-publish after regenerating, `tools/upload_sharegpt_hf.py` pushes the
161
+ JSONL back to the Hub (needs a write token).
162
+
163
+ ## Documentation
164
+
165
+ Full docs live in [`docs/`](docs/):
166
+
167
+ - [Quickstart](docs/quickstart.md)
168
+ - [Concepts](docs/concepts.md) — WorkloadType, Workload, LoadModel, Monitor
169
+ - [Load models](docs/load-models.md) — rate-spec syntax, open vs closed loop
170
+ - [Workloads & workload-types](docs/workloads.md) — built-ins and custom subclasses
171
+ - [Hooks](docs/hooks.md) — pre/post request processing
172
+ - [Monitors](docs/monitors.md) — vLLM `/metrics`, GPU telemetry, custom samplers
173
+ - [Metrics & output](docs/metrics.md) — summary structure, JSONL dumps
174
+ - [Correctness / accuracy eval](docs/eval.md) — grade responses against references
175
+ - [CLI & YAML reference](docs/cli-and-yaml.md)
176
+ - [ShareGPT benchmark](docs/sharegpt-benchmark.md) — self-contained end-to-end walkthrough
177
+
178
+ ## Examples
179
+
180
+ Under [`examples/`](examples/):
181
+
182
+ - `simple_get.py` — minimal library usage
183
+ - `custom_hooks.py` — request signing + response parsing
184
+ - `llm_chat.py` — OpenAI-compatible LLM endpoint with streaming
185
+ - `vllm_with_monitor.py` — LLM benchmark with concurrent vLLM `/metrics` scrape
186
+ - `sandbox_exec.py` — Flash Sandbox `/exec` latency benchmark
187
+ - `sandbox_lifecycle.py` — full create → exec → delete cold-start benchmark
188
+ - `llm_eval.py` — LLM benchmark + accuracy grading (exact/regex/judge)
189
+ - `gsm8k_eval.py` — GSM8K from HuggingFace + integer-match scorer
190
+ - `config.yaml` — generic HTTP YAML config
191
+ - `config_llm.yaml` — LLM YAML config with a Prometheus monitor
192
+
193
+ Helper scripts under [`tools/`](tools/):
194
+
195
+ - `prepare_sharegpt.py` — fetch ShareGPT V3 and convert to a generic JSONL
196
+ - `upload_sharegpt_hf.py` — push the converted JSONL to the HF Hub (write token)
197
+ - `start_local_llm.sh` — example local SGLang launch command
198
+
199
+ ## Project layout
200
+
201
+ ```
202
+ benchmaker/ # library code
203
+ entrypoints/ # CLI (bench-maker)
204
+ examples/ # runnable examples
205
+ tools/ # one-off helper scripts (dataset prep, etc.)
206
+ tests/ # pytest smoke tests
207
+ docs/ # reference docs
208
+ ```
209
+
210
+ ## Run the tests
211
+
212
+ ```bash
213
+ pytest -q
214
+ ```
@@ -0,0 +1,195 @@
1
+ # bench-maker
2
+
3
+ Async HTTP benchmarking with pluggable workload-types (protocols), workloads
4
+ (datasets), load models, hooks, and optional periodic monitors.
5
+
6
+ ```text
7
+ +--------+ item +---------------+ request +-----------+ +---------+
8
+ |workload|--------->| workload-type |------------>| pre-hooks |-->| aiohttp |
9
+ |(dataset| | (protocol) | +-----------+ +---------+
10
+ | / log) | | make_request | |
11
+ +--------+ | make_sample | +------------+ v
12
+ ^ +---------------+ | post-hooks |<----+
13
+ | +------------+
14
+ +-- load model decides WHEN to fire ----+ v
15
+ | +----------+
16
+ monitors run alongside ------+------->| metrics |
17
+ (Prometheus, NVML, ...) | aggregator|
18
+ +----------+
19
+ ```
20
+
21
+ ## Install
22
+
23
+ ```bash
24
+ pip install -e .
25
+ pip install -e ".[dev]" # for tests (quotes keep zsh from globbing the brackets)
26
+ ```
27
+
28
+ This installs the `benchmaker` Python package and the `bench-maker` CLI.
29
+
30
+ ## 30-second tour
31
+
32
+ ```python
33
+ import asyncio
34
+ from benchmaker import BenchConfig, BenchRunner, ConstantRPS, HttpWorkloadType
35
+
36
+ async def main():
37
+ cfg = BenchConfig(
38
+ workload_type=HttpWorkloadType(url="https://httpbin.org/get"),
39
+ load=ConstantRPS(rps=50, duration_s=10),
40
+ )
41
+ result = await BenchRunner(cfg).run()
42
+ print(result.summary)
43
+
44
+ asyncio.run(main())
45
+ ```
46
+
47
+ Or via the CLI (this example uses Poisson arrivals rather than a constant rate):
48
+
49
+ ```bash
50
+ bench-maker quick --url https://httpbin.org/get --rate poisson:50 --duration 10s
51
+ ```
52
+
53
+ ## Walkthrough: benchmarking an LLM endpoint with ShareGPT
54
+
55
+ A realistic LLM benchmark needs a real prompt distribution.
56
+ [ShareGPT V3](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered)
57
+ is a common choice — multi-turn human/assistant conversations scraped from real
58
+ ChatGPT users. A cleaned, benchmark-ready copy is published at
59
+ [`researchcomputer/llmsys-bench`](https://huggingface.co/datasets/researchcomputer/llmsys-bench)
60
+ (`split="sharegpt"`), with one row per conversation:
61
+
62
+ ```json
63
+ {"id": "...", "messages": [{"role": "user", "content": "..."},
64
+ {"role": "assistant", "content": "..."},
65
+ {"role": "user", "content": "..."}]}
66
+ ```
67
+
68
+ `messages` is the only content field — it's everything a chat benchmark needs.
69
+ Each row is truncated to end on a **user** turn, so it's a valid generation
70
+ request: the server completes the final assistant reply given the prior
71
+ history. Short source conversations collapse to a single user turn (a plain
72
+ single-turn prompt); longer ones carry multi-turn context.
73
+
74
+ ### Load it directly from the Hub
75
+
76
+ Pull the published split and feed each row's `messages` list straight into the
77
+ chat workload-type (needs the `hf` extra: `pip install -e ".[hf]"`):
78
+
79
+ ```python
80
+ import asyncio
81
+ from datasets import load_dataset
82
+ from benchmaker import (
83
+ BenchConfig, BenchRunner, OpenAIChatWorkloadType,
84
+ IterableWorkload, parse_rate_spec,
85
+ )
86
+
87
+ async def main():
88
+ ds = load_dataset("researchcomputer/llmsys-bench", split="sharegpt")
89
+ cfg = BenchConfig(
90
+ workload_type=OpenAIChatWorkloadType(
91
+ url="http://localhost:8000/v1/chat/completions",
92
+ model="meta-llama/Llama-3.1-8B-Instruct",
93
+ max_tokens=256,
94
+ ),
95
+ workload=IterableWorkload(row["messages"] for row in ds),
96
+ load=parse_rate_spec("poisson:8", duration_s=60),
97
+ timeout_s=600,
98
+ )
99
+ result = await BenchRunner(cfg).run()
100
+ print(result.summary)
101
+
102
+ asyncio.run(main())
103
+ ```
104
+
105
+ `OpenAIChatWorkloadType` receives the message list as-is, so single-turn rows
106
+ send one user message and multi-turn rows replay the full history before the
107
+ server generates the final assistant turn. TTFT, inter-token latency, and
108
+ tokens/sec are captured the same way in both cases. URL / model / API key can
109
+ also come from `.env` via `OpenAIChatWorkloadType.from_env(...)`.
110
+
111
+ ### Rebuild or customize it yourself
112
+
113
+ The published split is produced by `tools/prepare_sharegpt.py`, which downloads
114
+ the upstream JSON once into `.local/` (gitignored) and converts it to the JSONL
115
+ shape above. Run it when you want a subset, different filtering, or a refresh:
116
+
117
+ ```bash
118
+ # Defaults: .local/sharegpt_v3_raw.json -> .local/sharegpt_v3.jsonl
119
+ python tools/prepare_sharegpt.py
120
+
121
+ # A quick subset for smoke tests:
122
+ python tools/prepare_sharegpt.py --max-items 2000
123
+ ```
124
+
125
+ The raw download is ~700 MB. Use `--min-chars` / `--max-chars` to drop empty or
126
+ pathologically long conversations (measured over total message content per
127
+ row). Point any workload at the local file with `JsonlWorkload(path=...,
128
+ field="messages")`, or on the CLI:
129
+
130
+ ```bash
131
+ bench-maker llm \
132
+ --url http://localhost:8000/v1/chat/completions \
133
+ --model meta-llama/Llama-3.1-8B-Instruct \
134
+ --prompts-jsonl .local/sharegpt_v3.jsonl \
135
+ --prompt-field messages \
136
+ --max-tokens 256 \
137
+ --rate poisson:8 --duration 60s \
138
+ --out-dir ./runs --label dataset=sharegpt
139
+ ```
140
+
141
+ To re-publish after regenerating, `tools/upload_sharegpt_hf.py` pushes the
142
+ JSONL back to the Hub (needs a write token).
143
+
144
+ ## Documentation
145
+
146
+ Full docs live in [`docs/`](docs/):
147
+
148
+ - [Quickstart](docs/quickstart.md)
149
+ - [Concepts](docs/concepts.md) — WorkloadType, Workload, LoadModel, Monitor
150
+ - [Load models](docs/load-models.md) — rate-spec syntax, open vs closed loop
151
+ - [Workloads & workload-types](docs/workloads.md) — built-ins and custom subclasses
152
+ - [Hooks](docs/hooks.md) — pre/post request processing
153
+ - [Monitors](docs/monitors.md) — vLLM `/metrics`, GPU telemetry, custom samplers
154
+ - [Metrics & output](docs/metrics.md) — summary structure, JSONL dumps
155
+ - [Correctness / accuracy eval](docs/eval.md) — grade responses against references
156
+ - [CLI & YAML reference](docs/cli-and-yaml.md)
157
+ - [ShareGPT benchmark](docs/sharegpt-benchmark.md) — self-contained end-to-end walkthrough
158
+
159
+ ## Examples
160
+
161
+ Under [`examples/`](examples/):
162
+
163
+ - `simple_get.py` — minimal library usage
164
+ - `custom_hooks.py` — request signing + response parsing
165
+ - `llm_chat.py` — OpenAI-compatible LLM endpoint with streaming
166
+ - `vllm_with_monitor.py` — LLM benchmark with concurrent vLLM `/metrics` scrape
167
+ - `sandbox_exec.py` — Flash Sandbox `/exec` latency benchmark
168
+ - `sandbox_lifecycle.py` — full create → exec → delete cold-start benchmark
169
+ - `llm_eval.py` — LLM benchmark + accuracy grading (exact/regex/judge)
170
+ - `gsm8k_eval.py` — GSM8K from HuggingFace + integer-match scorer
171
+ - `config.yaml` — generic HTTP YAML config
172
+ - `config_llm.yaml` — LLM YAML config with a Prometheus monitor
173
+
174
+ Helper scripts under [`tools/`](tools/):
175
+
176
+ - `prepare_sharegpt.py` — fetch ShareGPT V3 and convert to a generic JSONL
177
+ - `upload_sharegpt_hf.py` — push the converted JSONL to the HF Hub (write token)
178
+ - `start_local_llm.sh` — example local SGLang launch command
179
+
180
+ ## Project layout
181
+
182
+ ```
183
+ benchmaker/ # library code
184
+ entrypoints/ # CLI (bench-maker)
185
+ examples/ # runnable examples
186
+ tools/ # one-off helper scripts (dataset prep, etc.)
187
+ tests/ # pytest smoke tests
188
+ docs/ # reference docs
189
+ ```
190
+
191
+ ## Run the tests
192
+
193
+ ```bash
194
+ pytest -q
195
+ ```
@@ -0,0 +1,152 @@
1
+ """benchmaker: async HTTP benchmarking with pluggable workload-types + workloads (datasets)."""
2
+
3
+ from benchmaker.types import (
4
+ Request,
5
+ Response,
6
+ Sample,
7
+ PreRequestHook,
8
+ PostResponseHook,
9
+ )
10
+ from benchmaker.workloads.base import WorkloadType
11
+ from benchmaker.workloads.datasets import (
12
+ Workload,
13
+ StaticWorkload,
14
+ JsonlWorkload,
15
+ CallableWorkload,
16
+ IterableWorkload,
17
+ )
18
+ from benchmaker.workloads.http import HttpWorkloadType
19
+ from benchmaker.workloads.llm import OpenAIChatWorkloadType
20
+ from benchmaker.workloads.sandbox import SandboxWorkloadType
21
+ from benchmaker.workloads.hf import HFDatasetWorkload
22
+ from benchmaker.workloads.agent import (
23
+ Agent,
24
+ AgentContext,
25
+ AgentResult,
26
+ AgentWorkloadType,
27
+ CallableAgent,
28
+ )
29
+ from benchmaker.workloads.eval import (
30
+ EvalWorkloadType,
31
+ Scorer,
32
+ correctness_hook,
33
+ extract_openai_text,
34
+ extract_raw_text,
35
+ extract_text,
36
+ exact_match,
37
+ contains,
38
+ regex_match,
39
+ json_valid,
40
+ multiple_choice,
41
+ judge_llm,
42
+ openai_chat_judge,
43
+ )
44
+ from benchmaker.load import (
45
+ LoadModel,
46
+ ConstantRPS,
47
+ PoissonRPS,
48
+ ClosedLoop,
49
+ Sweep,
50
+ Ramp,
51
+ parse_rate_spec,
52
+ )
53
+ from benchmaker.env import interpolate, load_dotenv
54
+ from benchmaker.monitors import (
55
+ Monitor,
56
+ FunctionMonitor,
57
+ PrometheusMonitor,
58
+ parse_prometheus,
59
+ )
60
+ from benchmaker.runner import BenchRunner, BenchConfig, BenchResult
61
+ from benchmaker.trace import (
62
+ ReplayWorkloadType,
63
+ TracePacedLoad,
64
+ TraceRecorder,
65
+ TraceWorkload,
66
+ load_trace,
67
+ )
68
+ from benchmaker.bundle import (
69
+ BUNDLE_VERSION,
70
+ RunMeta,
71
+ default_run_id,
72
+ is_bundle_dir,
73
+ iter_jsonl,
74
+ read_bundle,
75
+ write_bundle,
76
+ )
77
+
78
+ __all__ = [
79
+ "Request",
80
+ "Response",
81
+ "Sample",
82
+ "PreRequestHook",
83
+ "PostResponseHook",
84
+ # workload-types (protocols)
85
+ "WorkloadType",
86
+ "HttpWorkloadType",
87
+ "OpenAIChatWorkloadType",
88
+ "SandboxWorkloadType",
89
+ "HFDatasetWorkload",
90
+ # agent workload (pluggable user-defined agents)
91
+ "Agent",
92
+ "AgentContext",
93
+ "AgentResult",
94
+ "AgentWorkloadType",
95
+ "CallableAgent",
96
+ # eval / correctness
97
+ "EvalWorkloadType",
98
+ "Scorer",
99
+ "correctness_hook",
100
+ "extract_openai_text",
101
+ "extract_raw_text",
102
+ "extract_text",
103
+ "exact_match",
104
+ "contains",
105
+ "regex_match",
106
+ "json_valid",
107
+ "multiple_choice",
108
+ "judge_llm",
109
+ "openai_chat_judge",
110
+ # workloads (datasets / input sources)
111
+ "Workload",
112
+ "StaticWorkload",
113
+ "JsonlWorkload",
114
+ "CallableWorkload",
115
+ "IterableWorkload",
116
+ # load models
117
+ "LoadModel",
118
+ "ConstantRPS",
119
+ "PoissonRPS",
120
+ "ClosedLoop",
121
+ "Sweep",
122
+ "Ramp",
123
+ "parse_rate_spec",
124
+ # monitors
125
+ "Monitor",
126
+ "FunctionMonitor",
127
+ "PrometheusMonitor",
128
+ "parse_prometheus",
129
+ # env
130
+ "load_dotenv",
131
+ "interpolate",
132
+ # runner
133
+ "BenchRunner",
134
+ "BenchConfig",
135
+ "BenchResult",
136
+ # trace: record & replay
137
+ "TraceRecorder",
138
+ "ReplayWorkloadType",
139
+ "TraceWorkload",
140
+ "TracePacedLoad",
141
+ "load_trace",
142
+ # bundle / output layout
143
+ "BUNDLE_VERSION",
144
+ "RunMeta",
145
+ "default_run_id",
146
+ "is_bundle_dir",
147
+ "iter_jsonl",
148
+ "read_bundle",
149
+ "write_bundle",
150
+ ]
151
+
152
+ __version__ = "0.1.0"