benchmaker 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- benchmaker/__init__.py +152 -0
- benchmaker/bundle.py +193 -0
- benchmaker/cli.py +382 -0
- benchmaker/collect.py +178 -0
- benchmaker/config.py +448 -0
- benchmaker/env.py +87 -0
- benchmaker/load.py +326 -0
- benchmaker/metrics.py +234 -0
- benchmaker/monitors.py +228 -0
- benchmaker/runner.py +275 -0
- benchmaker/trace.py +217 -0
- benchmaker/types.py +98 -0
- benchmaker/workloads/__init__.py +53 -0
- benchmaker/workloads/agent.py +308 -0
- benchmaker/workloads/base.py +79 -0
- benchmaker/workloads/datasets.py +156 -0
- benchmaker/workloads/eval.py +504 -0
- benchmaker/workloads/hf.py +382 -0
- benchmaker/workloads/http.py +77 -0
- benchmaker/workloads/llm.py +258 -0
- benchmaker/workloads/sandbox.py +470 -0
- benchmaker-0.1.0.dist-info/METADATA +214 -0
- benchmaker-0.1.0.dist-info/RECORD +26 -0
- benchmaker-0.1.0.dist-info/WHEEL +5 -0
- benchmaker-0.1.0.dist-info/entry_points.txt +2 -0
- benchmaker-0.1.0.dist-info/top_level.txt +1 -0
benchmaker/__init__.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
"""benchmaker: async HTTP benchmarking with pluggable workload-types + workloads (datasets)."""
|
|
2
|
+
|
|
3
|
+
from benchmaker.types import (
|
|
4
|
+
Request,
|
|
5
|
+
Response,
|
|
6
|
+
Sample,
|
|
7
|
+
PreRequestHook,
|
|
8
|
+
PostResponseHook,
|
|
9
|
+
)
|
|
10
|
+
from benchmaker.workloads.base import WorkloadType
|
|
11
|
+
from benchmaker.workloads.datasets import (
|
|
12
|
+
Workload,
|
|
13
|
+
StaticWorkload,
|
|
14
|
+
JsonlWorkload,
|
|
15
|
+
CallableWorkload,
|
|
16
|
+
IterableWorkload,
|
|
17
|
+
)
|
|
18
|
+
from benchmaker.workloads.http import HttpWorkloadType
|
|
19
|
+
from benchmaker.workloads.llm import OpenAIChatWorkloadType
|
|
20
|
+
from benchmaker.workloads.sandbox import SandboxWorkloadType
|
|
21
|
+
from benchmaker.workloads.hf import HFDatasetWorkload
|
|
22
|
+
from benchmaker.workloads.agent import (
|
|
23
|
+
Agent,
|
|
24
|
+
AgentContext,
|
|
25
|
+
AgentResult,
|
|
26
|
+
AgentWorkloadType,
|
|
27
|
+
CallableAgent,
|
|
28
|
+
)
|
|
29
|
+
from benchmaker.workloads.eval import (
|
|
30
|
+
EvalWorkloadType,
|
|
31
|
+
Scorer,
|
|
32
|
+
correctness_hook,
|
|
33
|
+
extract_openai_text,
|
|
34
|
+
extract_raw_text,
|
|
35
|
+
extract_text,
|
|
36
|
+
exact_match,
|
|
37
|
+
contains,
|
|
38
|
+
regex_match,
|
|
39
|
+
json_valid,
|
|
40
|
+
multiple_choice,
|
|
41
|
+
judge_llm,
|
|
42
|
+
openai_chat_judge,
|
|
43
|
+
)
|
|
44
|
+
from benchmaker.load import (
|
|
45
|
+
LoadModel,
|
|
46
|
+
ConstantRPS,
|
|
47
|
+
PoissonRPS,
|
|
48
|
+
ClosedLoop,
|
|
49
|
+
Sweep,
|
|
50
|
+
Ramp,
|
|
51
|
+
parse_rate_spec,
|
|
52
|
+
)
|
|
53
|
+
from benchmaker.env import interpolate, load_dotenv
|
|
54
|
+
from benchmaker.monitors import (
|
|
55
|
+
Monitor,
|
|
56
|
+
FunctionMonitor,
|
|
57
|
+
PrometheusMonitor,
|
|
58
|
+
parse_prometheus,
|
|
59
|
+
)
|
|
60
|
+
from benchmaker.runner import BenchRunner, BenchConfig, BenchResult
|
|
61
|
+
from benchmaker.trace import (
|
|
62
|
+
ReplayWorkloadType,
|
|
63
|
+
TracePacedLoad,
|
|
64
|
+
TraceRecorder,
|
|
65
|
+
TraceWorkload,
|
|
66
|
+
load_trace,
|
|
67
|
+
)
|
|
68
|
+
from benchmaker.bundle import (
|
|
69
|
+
BUNDLE_VERSION,
|
|
70
|
+
RunMeta,
|
|
71
|
+
default_run_id,
|
|
72
|
+
is_bundle_dir,
|
|
73
|
+
iter_jsonl,
|
|
74
|
+
read_bundle,
|
|
75
|
+
write_bundle,
|
|
76
|
+
)
|
|
77
|
+
|
|
78
|
+
__all__ = [
|
|
79
|
+
"Request",
|
|
80
|
+
"Response",
|
|
81
|
+
"Sample",
|
|
82
|
+
"PreRequestHook",
|
|
83
|
+
"PostResponseHook",
|
|
84
|
+
# workload-types (protocols)
|
|
85
|
+
"WorkloadType",
|
|
86
|
+
"HttpWorkloadType",
|
|
87
|
+
"OpenAIChatWorkloadType",
|
|
88
|
+
"SandboxWorkloadType",
|
|
89
|
+
"HFDatasetWorkload",
|
|
90
|
+
# agent workload (pluggable user-defined agents)
|
|
91
|
+
"Agent",
|
|
92
|
+
"AgentContext",
|
|
93
|
+
"AgentResult",
|
|
94
|
+
"AgentWorkloadType",
|
|
95
|
+
"CallableAgent",
|
|
96
|
+
# eval / correctness
|
|
97
|
+
"EvalWorkloadType",
|
|
98
|
+
"Scorer",
|
|
99
|
+
"correctness_hook",
|
|
100
|
+
"extract_openai_text",
|
|
101
|
+
"extract_raw_text",
|
|
102
|
+
"extract_text",
|
|
103
|
+
"exact_match",
|
|
104
|
+
"contains",
|
|
105
|
+
"regex_match",
|
|
106
|
+
"json_valid",
|
|
107
|
+
"multiple_choice",
|
|
108
|
+
"judge_llm",
|
|
109
|
+
"openai_chat_judge",
|
|
110
|
+
# workloads (datasets / input sources)
|
|
111
|
+
"Workload",
|
|
112
|
+
"StaticWorkload",
|
|
113
|
+
"JsonlWorkload",
|
|
114
|
+
"CallableWorkload",
|
|
115
|
+
"IterableWorkload",
|
|
116
|
+
# load models
|
|
117
|
+
"LoadModel",
|
|
118
|
+
"ConstantRPS",
|
|
119
|
+
"PoissonRPS",
|
|
120
|
+
"ClosedLoop",
|
|
121
|
+
"Sweep",
|
|
122
|
+
"Ramp",
|
|
123
|
+
"parse_rate_spec",
|
|
124
|
+
# monitors
|
|
125
|
+
"Monitor",
|
|
126
|
+
"FunctionMonitor",
|
|
127
|
+
"PrometheusMonitor",
|
|
128
|
+
"parse_prometheus",
|
|
129
|
+
# env
|
|
130
|
+
"load_dotenv",
|
|
131
|
+
"interpolate",
|
|
132
|
+
# runner
|
|
133
|
+
"BenchRunner",
|
|
134
|
+
"BenchConfig",
|
|
135
|
+
"BenchResult",
|
|
136
|
+
# trace: record & replay
|
|
137
|
+
"TraceRecorder",
|
|
138
|
+
"ReplayWorkloadType",
|
|
139
|
+
"TraceWorkload",
|
|
140
|
+
"TracePacedLoad",
|
|
141
|
+
"load_trace",
|
|
142
|
+
# bundle / output layout
|
|
143
|
+
"BUNDLE_VERSION",
|
|
144
|
+
"RunMeta",
|
|
145
|
+
"default_run_id",
|
|
146
|
+
"is_bundle_dir",
|
|
147
|
+
"iter_jsonl",
|
|
148
|
+
"read_bundle",
|
|
149
|
+
"write_bundle",
|
|
150
|
+
]
|
|
151
|
+
|
|
152
|
+
__version__ = "0.1.0"
|
benchmaker/bundle.py
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
"""Per-run output bundle.
|
|
2
|
+
|
|
3
|
+
Each benchmark run writes a directory:
|
|
4
|
+
|
|
5
|
+
<out_dir>/<run_id>/
|
|
6
|
+
meta.json # run identifiers, timestamps, resolved config
|
|
7
|
+
summary.json # aggregated metrics
|
|
8
|
+
samples.jsonl # one record per request
|
|
9
|
+
monitors.jsonl # one record per monitor tick
|
|
10
|
+
|
|
11
|
+
The bundle is the unit of result interchange. `bench-maker collect` walks
|
|
12
|
+
one or more such directories and pivots the summaries into a table.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import json
|
|
18
|
+
import os
|
|
19
|
+
import platform
|
|
20
|
+
import socket
|
|
21
|
+
import time
|
|
22
|
+
from dataclasses import dataclass, field
|
|
23
|
+
from datetime import datetime, timezone
|
|
24
|
+
from typing import Any, Optional
|
|
25
|
+
|
|
26
|
+
from benchmaker.metrics import MetricsAggregator, _safe_meta
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
BUNDLE_VERSION = 1
|
|
30
|
+
META_FILENAME = "meta.json"
|
|
31
|
+
SUMMARY_FILENAME = "summary.json"
|
|
32
|
+
SAMPLES_FILENAME = "samples.jsonl"
|
|
33
|
+
MONITORS_FILENAME = "monitors.jsonl"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def default_run_id(prefix: Optional[str] = None) -> str:
    """Build a filesystem-safe run identifier from the current UTC time.

    The stamp has the form ``YYYYMMDDTHHMMSSZ`` (e.g. ``20260526T142233Z``).
    A truthy ``prefix`` is prepended with a hyphen, giving
    ``prefix-20260526T142233Z``; ``None`` and ``""`` yield the bare stamp.
    """
    stamp = f"{datetime.now(timezone.utc):%Y%m%dT%H%M%SZ}"
    if not prefix:
        return stamp
    return f"{prefix}-{stamp}"
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@dataclass
class RunMeta:
    """Provenance record stored alongside a run's results.

    ``to_dict`` produces the mapping that ``write_bundle`` serializes into the
    bundle's meta file.
    """

    # Required identifiers and timing.
    run_id: str
    started_at: str  # ISO-8601 UTC
    ended_at: str  # ISO-8601 UTC
    wall_time_s: float
    workload_type: str  # e.g. "http", "openai-chat"
    workload: str  # e.g. "static", "jsonl"
    # Environment details, captured when the instance is created.
    hostname: str = field(default_factory=socket.gethostname)
    python_version: str = field(default_factory=platform.python_version)
    bench_maker_version: str = ""  # filled in by write_bundle
    bundle_version: int = BUNDLE_VERSION
    # Free-form context supplied by the caller.
    source_config: dict = field(default_factory=dict)
    labels: dict = field(default_factory=dict)
    notes: str = ""

    def to_dict(self) -> dict:
        """Return the fields as a plain dict, in the key order used on disk.

        ``source_config`` is passed through ``_safe_meta``; every other field
        is emitted unchanged.
        """
        passthrough = (
            "run_id",
            "bundle_version",
            "bench_maker_version",
            "started_at",
            "ended_at",
            "wall_time_s",
            "hostname",
            "python_version",
            "workload_type",
            "workload",
            "labels",
            "notes",
        )
        payload = {name: getattr(self, name) for name in passthrough}
        # Kept last so the serialized key order matches the literal layout.
        payload["source_config"] = _safe_meta(self.source_config)
        return payload
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def write_bundle(
    out_dir: str,
    metrics: MetricsAggregator,
    *,
    workload_type_name: str,
    workload_name: str,
    run_id: Optional[str] = None,
    source_config: Optional[dict] = None,
    labels: Optional[dict] = None,
    notes: str = "",
    started_wall: Optional[float] = None,
    ended_wall: Optional[float] = None,
) -> str:
    """Write the bundle for a finished run. Returns the absolute run directory path.

    `out_dir` is treated as the parent: the actual run lives in `<out_dir>/<run_id>/`.
    If `out_dir` itself already looks like a run dir (already has `<run_id>` baked in),
    pass `run_id=""` to skip the extra nesting.

    Args:
        out_dir: Parent directory (or the run directory itself when `run_id=""`).
            Created if missing.
        metrics: Aggregator holding the run's results. `finalize()` is called on
            it first when its `end_time` is still `None`.
        workload_type_name: Stored as `workload_type` in the meta file.
        workload_name: Stored as `workload` in the meta file.
        run_id: Name of the run subdirectory. `None` generates one with
            `default_run_id()`; `""` writes directly into `out_dir` and records
            that directory's base name as the run id.
        source_config: Stored in the meta file; `None` is treated as `{}`.
        labels: Stored in the meta file; `None` is treated as `{}`.
        notes: Free-form text stored in the meta file.
        started_wall: Start time in epoch seconds. Defaults to
            `metrics.start_wall`, falling back to the end time minus the
            measured duration.
        ended_wall: End time in epoch seconds. Defaults to `metrics.end_wall`,
            falling back to the current time.

    Returns:
        Absolute path of the directory the bundle was written to.
    """
    # Imported here rather than at module level: benchmaker/__init__.py imports
    # this module before it defines __version__.
    from benchmaker import __version__ as bm_version

    if metrics.end_time is None:
        metrics.finalize()

    run_id = run_id if run_id is not None else default_run_id()
    target = os.path.join(out_dir, run_id) if run_id else out_dir
    os.makedirs(target, exist_ok=True)

    duration = metrics.end_time - metrics.start_time if metrics.end_time else 0.0
    end_wall = ended_wall if ended_wall is not None else (metrics.end_wall or time.time())
    start_wall = started_wall if started_wall is not None else (metrics.start_wall or (end_wall - duration))

    meta = RunMeta(
        # With run_id == "" the target directory's own name identifies the run.
        run_id=run_id or os.path.basename(os.path.abspath(target)),
        started_at=_iso(start_wall),
        ended_at=_iso(end_wall),
        wall_time_s=duration,
        workload_type=workload_type_name,
        workload=workload_name,
        bench_maker_version=bm_version,
        source_config=source_config or {},
        labels=labels or {},
        notes=notes,
    )

    # JSON interchange files are written as UTF-8 explicitly so the result does
    # not depend on the platform's locale encoding.
    with open(os.path.join(target, META_FILENAME), "w", encoding="utf-8") as f:
        json.dump(meta.to_dict(), f, indent=2, default=_json_default)

    with open(os.path.join(target, SUMMARY_FILENAME), "w", encoding="utf-8") as f:
        json.dump(metrics.summary(), f, indent=2, default=_json_default)

    metrics.dump_samples_jsonl(os.path.join(target, SAMPLES_FILENAME))
    # The monitors file is only produced when monitor samples were recorded.
    if metrics.monitor_samples:
        metrics.dump_monitor_jsonl(os.path.join(target, MONITORS_FILENAME))

    return os.path.abspath(target)
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def read_bundle(run_dir: str) -> dict:
    """Read a run directory and return its parsed metadata and data-file paths.

    Samples and monitor ticks are returned as file paths rather than loaded, so
    a `collect` over many runs stays cheap. Use `iter_jsonl` to stream rows.

    Args:
        run_dir: Directory containing the bundle files.

    Returns:
        A dict with keys `meta` and `summary` (decoded JSON), `samples_path`
        and `monitors_path` (a path, or `None` when that file is absent), and
        `dir` (absolute path of `run_dir`).

    Raises:
        FileNotFoundError: If the meta or summary file is missing.
    """
    meta_path = os.path.join(run_dir, META_FILENAME)
    summary_path = os.path.join(run_dir, SUMMARY_FILENAME)
    if not (os.path.isfile(meta_path) and os.path.isfile(summary_path)):
        raise FileNotFoundError(
            f"{run_dir!r} is not a run bundle (missing meta.json or summary.json)"
        )
    # Decode as UTF-8 explicitly; relying on the locale encoding breaks on
    # bundles containing non-ASCII text when the locale is not UTF-8.
    with open(meta_path, encoding="utf-8") as f:
        meta = json.load(f)
    with open(summary_path, encoding="utf-8") as f:
        summary = json.load(f)
    samples = os.path.join(run_dir, SAMPLES_FILENAME)
    monitors = os.path.join(run_dir, MONITORS_FILENAME)
    return {
        "meta": meta,
        "summary": summary,
        "samples_path": samples if os.path.isfile(samples) else None,
        "monitors_path": monitors if os.path.isfile(monitors) else None,
        "dir": os.path.abspath(run_dir),
    }
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def is_bundle_dir(path: str) -> bool:
    """Return True when `path` is a directory holding both required bundle files.

    The required files are the meta and summary files; the samples and
    monitors files are not checked.
    """
    if not os.path.isdir(path):
        return False
    required = (META_FILENAME, SUMMARY_FILENAME)
    return all(os.path.isfile(os.path.join(path, name)) for name in required)
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def iter_jsonl(path: str):
    """Yield decoded JSON values from a JSONL file. Skips blank lines.

    The file is read lazily, one line at a time, and decoded as UTF-8
    regardless of the platform's locale encoding. Lines containing only
    whitespace are skipped.

    Args:
        path: Path of the JSONL file to read.

    Yields:
        The value decoded from each non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened (raised on first iteration).
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                yield json.loads(line)
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def _iso(wall_seconds: float) -> str:
    """Format epoch seconds as an ISO-8601 string in UTC (``+00:00`` offset)."""
    moment = datetime.fromtimestamp(wall_seconds, timezone.utc)
    return moment.isoformat()
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def _json_default(obj: Any) -> Any:
    """Fallback encoder passed as ``default=`` to ``json.dump``.

    Conversion rules, in order:

    * ``set`` -> sorted list. If the members cannot be compared with each
      other (e.g. ``{1, "a"}``), they are ordered by their ``repr`` instead so
      the output stays deterministic and the dump does not abort.
    * any object with a ``to_dict`` attribute -> the result of ``obj.to_dict()``.
    * everything else, or an object whose ``to_dict()`` raises -> ``repr(obj)``.
    """
    if isinstance(obj, set):
        try:
            return sorted(obj)
        except TypeError:
            # Unorderable members: fall back to a stable repr-based ordering.
            return sorted(obj, key=repr)
    if hasattr(obj, "to_dict"):
        try:
            return obj.to_dict()
        except Exception:
            # Deliberate best-effort: a failing to_dict() must not abort the
            # whole dump, so fall through to the repr below.
            pass
    return repr(obj)
|