benchmaker 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
benchmaker/config.py ADDED
@@ -0,0 +1,448 @@
1
+ """YAML config loading: build a BenchConfig from a dict.
2
+
3
+ Config shape:
4
+
5
+ workload_type: # how to talk
6
+ type: http | openai | ...
7
+ ...kwargs...
8
+ workload: # what to send (optional; defaults to one None item)
9
+ type: static | jsonl | callable
10
+ ...kwargs...
11
+ load: <rate spec> # when to fire
12
+ duration: 30s
13
+ pre_hooks: [module:fn, ...]
14
+ post_hooks: [module:fn, ...]
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import importlib
20
+ from typing import Any, Callable, Optional
21
+
22
+ from benchmaker.env import interpolate, load_dotenv
23
+ from benchmaker.load import parse_duration, parse_rate_spec
24
+ from benchmaker.monitors import FunctionMonitor, Monitor, PrometheusMonitor
25
+ from benchmaker.runner import BenchConfig
26
+ from benchmaker.workloads.base import WorkloadType
27
+ from benchmaker.workloads.datasets import (
28
+ CallableWorkload,
29
+ JsonlWorkload,
30
+ StaticWorkload,
31
+ Workload,
32
+ )
33
+ from benchmaker.workloads.hf import HFDatasetWorkload
34
+ from benchmaker.workloads.http import HttpWorkloadType
35
+ from benchmaker.workloads.llm import OpenAIChatWorkloadType
36
+ from benchmaker.workloads.sandbox import SandboxWorkloadType
37
+ from benchmaker.workloads.agent import Agent, AgentWorkloadType
38
+ from benchmaker.workloads.eval import (
39
+ EvalWorkloadType,
40
+ Scorer,
41
+ contains,
42
+ correctness_hook,
43
+ exact_match,
44
+ json_valid,
45
+ judge_llm,
46
+ multiple_choice,
47
+ openai_chat_judge,
48
+ regex_match,
49
+ )
50
+ from benchmaker.trace import (
51
+ ReplayWorkloadType,
52
+ TracePacedLoad,
53
+ TraceRecorder,
54
+ TraceWorkload,
55
+ load_trace,
56
+ )
57
+
58
+
59
def resolve_callable(ref: str) -> Callable:
    """Resolve a reference string to a callable object.

    Two spellings are accepted:

    * ``"package.module:attr.path"`` -- the text before the first ``:`` is
      imported as a module and the rest is followed as an attribute path.
    * ``"package.module.attr"`` -- the text after the last ``.`` is the
      attribute and everything before it is the module.

    Args:
        ref: the reference string.

    Returns:
        The resolved object, guaranteed to be callable.

    Raises:
        ValueError: if ``ref`` lacks a module part or an attribute part.
        TypeError: if ``ref`` is not a string or the resolved object is not
            callable.
        ImportError, AttributeError: propagated unchanged when the module or
            attribute does not exist.
    """
    if not isinstance(ref, str):
        raise TypeError(
            f"callable reference must be a string, got {type(ref).__name__}"
        )

    if ":" in ref:
        modname, attr = ref.split(":", 1)
    else:
        modname, _, attr = ref.rpartition(".")
    # Validate both halves for both spellings so a malformed reference fails
    # with one clear error instead of an unrelated import/getattr failure.
    if not modname or not attr:
        raise ValueError(f"Cannot resolve callable {ref!r}")

    mod = importlib.import_module(modname)
    obj: Any = mod
    for part in attr.split("."):
        obj = getattr(obj, part)
    if not callable(obj):
        raise TypeError(f"{ref!r} is not callable")
    return obj
73
+
74
+
75
def build_workload_type(spec: dict) -> WorkloadType:
    """Construct the WorkloadType described by ``spec``.

    A ``factory`` key takes precedence: it is resolved with
    `resolve_callable`, called with every other key as keyword arguments, and
    must return a ``WorkloadType``. Otherwise ``type`` (default ``"http"``,
    case-insensitive) selects a built-in class, and the remaining keys become
    its constructor arguments.

    Raises:
        TypeError: if the factory returns something that is not a WorkloadType.
        ValueError: if ``type`` names no known workload-type.
    """
    if "factory" in spec:
        options = dict(spec)
        factory = resolve_callable(options.pop("factory"))
        built = factory(**options)
        if isinstance(built, WorkloadType):
            return built
        raise TypeError(f"factory must return a WorkloadType, got {type(built)}")

    kind = (spec.get("type") or "http").lower()
    options = {name: val for name, val in spec.items() if name != "type"}

    if kind == "http":
        return HttpWorkloadType(**options)
    if kind in ("openai", "openai-chat", "llm-chat", "llm"):
        return OpenAIChatWorkloadType(**options)
    if kind in ("sandbox", "flash-sandbox"):
        return SandboxWorkloadType(**options)
    if kind != "agent":
        raise ValueError(f"Unknown workload_type {kind!r}")
    # The agent builder takes the option dict itself, not keyword arguments.
    return _build_agent_workload_type(options)
96
+
97
+
98
def _build_agent_workload_type(spec: dict) -> AgentWorkloadType:
    """Build an `AgentWorkloadType` from YAML.

    Accepts ``agent: 'module:ClassOrCallable'`` (``class`` is an alias), which
    is resolved via `resolve_callable` when it is a string and used as-is
    otherwise. Optional ``agent_kwargs`` are forwarded as the
    ``agent_kwargs`` argument. All remaining keys are passed through to
    ``AgentWorkloadType`` as keyword arguments (documented upstream as
    ``reference_key``, ``extra_meta_keys``, ``name``).

    The caller's dict is not modified.

    Raises:
        ValueError: if neither ``agent`` nor ``class`` is provided.
    """
    # Work on a copy so the caller's spec is left untouched.
    options = dict(spec)
    # Always remove both aliases: leaving ``class`` behind when ``agent`` is
    # set would forward it to AgentWorkloadType as an unexpected keyword.
    agent_ref = options.pop("agent", None)
    class_ref = options.pop("class", None)
    ref = agent_ref or class_ref
    if not ref:
        raise ValueError(
            "agent workload-type requires 'agent: <module:ClassOrCallable>'"
        )
    obj: Any = resolve_callable(ref) if isinstance(ref, str) else ref
    agent_kwargs = options.pop("agent_kwargs", None) or {}
    return AgentWorkloadType(obj, agent_kwargs=agent_kwargs, **options)
114
+
115
+
116
def build_workload(spec: Any) -> Workload:
    """Turn a workload spec into a Workload (the dataset of items to send).

    Accepted shapes:

    * ``None`` -- a default ``StaticWorkload()``.
    * a list -- a ``StaticWorkload`` over those items.
    * a string -- a ``JsonlWorkload`` if it ends in ``.jsonl``, otherwise a
      ``StaticWorkload`` holding that single string.
    * a dict -- either ``factory: 'module:fn'`` plus kwargs, or ``type``
      (default ``"static"``, case-insensitive) plus constructor kwargs.

    Raises:
        TypeError: for any other spec type, or when a factory returns
            something that is not a Workload.
        ValueError: when ``type`` is not recognised.
    """
    # Shorthand (non-dict) forms.
    if spec is None:
        return StaticWorkload()  # one None item, cycled
    if isinstance(spec, list):
        return StaticWorkload(items=spec)
    if isinstance(spec, str):
        if not spec.endswith(".jsonl"):
            # Any other bare string is a single static item.
            return StaticWorkload(items=[spec])
        return JsonlWorkload(path=spec)
    if not isinstance(spec, dict):
        raise TypeError(f"workload spec must be dict|list|str|None, got {type(spec)}")

    # User-supplied factory.
    if "factory" in spec:
        options = dict(spec)
        factory = resolve_callable(options.pop("factory"))
        built = factory(**options)
        if isinstance(built, Workload):
            return built
        raise TypeError(f"factory must return a Workload, got {type(built)}")

    # Built-in workload kinds.
    kind = (spec.get("type") or "static").lower()
    options = {name: val for name, val in spec.items() if name != "type"}
    if kind == "static":
        return StaticWorkload(**options)
    if kind == "jsonl":
        return JsonlWorkload(**options)
    if kind == "callable":
        item_fn = resolve_callable(options.pop("fn"))
        return CallableWorkload(fn=item_fn, **options)
    if kind in ("hf", "huggingface"):
        return HFDatasetWorkload(**options)
    raise ValueError(f"Unknown workload type {kind!r}")
155
+
156
+
157
def build_monitor(spec: dict) -> Monitor:
    """Construct a Monitor from its config dict.

    Supported forms:
        type: prometheus   remaining keys go to ``PrometheusMonitor``; a
                           non-None ``metric_names`` is converted to a set
        type: function     ``fn: 'module:func'`` is resolved and passed with
                           the remaining keys to ``FunctionMonitor``
        factory: 'module:fn'   called with the remaining keys; must return a
                               ``Monitor``

    ``type`` defaults to ``"function"`` and is matched case-insensitively.

    Raises:
        TypeError: if the factory does not return a Monitor.
        ValueError: if ``type`` is not recognised.
    """
    if "factory" in spec:
        options = dict(spec)
        factory = resolve_callable(options.pop("factory"))
        built = factory(**options)
        if isinstance(built, Monitor):
            return built
        raise TypeError(f"monitor factory must return a Monitor, got {type(built)}")

    kind = (spec.get("type") or "function").lower()
    options = {name: val for name, val in spec.items() if name != "type"}

    if kind == "prometheus":
        names = options.get("metric_names")
        if names is not None:
            options["metric_names"] = set(names)
        return PrometheusMonitor(**options)
    if kind == "function":
        probe = resolve_callable(options.pop("fn"))
        return FunctionMonitor(fn=probe, **options)
    raise ValueError(f"Unknown monitor type {kind!r}")
183
+
184
+
185
# Maps a scorer ``type`` name to a builder taking the kwargs dict from the
# spec. ``exact`` and ``regex`` are short aliases. The lambdas look the scorer
# functions up when called, not when this table is created.
_SCORER_BUILDERS: dict[str, Any] = dict(
    exact_match=lambda opts: exact_match(**opts),
    exact=lambda opts: exact_match(**opts),
    contains=lambda opts: contains(**opts),
    regex_match=lambda opts: regex_match(**opts),
    regex=lambda opts: regex_match(**opts),
    json_valid=lambda opts: json_valid(**opts),
    multiple_choice=lambda opts: multiple_choice(**opts),
)
194
+
195
+
196
def build_scorer(spec: Any) -> tuple[Scorer, Optional[Callable[[], Any]]]:
    """Build a ``(scorer, optional_aclose)`` pair from a YAML spec.

    Spec forms:
        type: exact_match | contains | regex | json_valid | multiple_choice | judge_llm
        # ...type-specific kwargs

    OR:
        factory: 'module:fn'
        # ...kwargs forwarded to the factory

    A bare string such as ``"exact_match"`` is treated as ``{type: <string>}``.

    A factory may return either a callable scorer or a 2-tuple whose first
    element is callable; the tuple is returned as-is. Table-driven scorer
    types always come back with ``None`` as the second element. When the
    second element is not None it is a cleanup callable for resources held by
    the scorer (e.g. a judge LLM's session).

    Raises:
        ValueError: if ``spec`` is None or the type is unknown.
        TypeError: if ``spec`` is neither dict nor str, or the factory result
            is not a callable / (callable, aclose) tuple.
    """
    if spec is None:
        raise ValueError("correctness.scorer must be set")
    if isinstance(spec, str):
        spec = {"type": spec}
    elif not isinstance(spec, dict):
        raise TypeError(f"scorer spec must be dict|str, got {type(spec).__name__}")

    if "factory" in spec:
        options = dict(spec)
        factory = resolve_callable(options.pop("factory"))
        produced = factory(**options)
        # Factories may return either (scorer, aclose) or just scorer.
        looks_like_pair = (
            isinstance(produced, tuple)
            and len(produced) == 2
            and callable(produced[0])
        )
        if looks_like_pair:
            return produced  # type: ignore[return-value]
        if callable(produced):
            return produced, None
        raise TypeError(
            f"scorer factory {spec['factory']!r} must return a callable "
            f"(or (callable, aclose) tuple), got {type(produced).__name__}"
        )

    kind = (spec.get("type") or "").lower()
    if kind in _SCORER_BUILDERS:
        options = {name: val for name, val in spec.items() if name != "type"}
        make_scorer = _SCORER_BUILDERS[kind]
        return make_scorer(options), None
    if kind in ("judge_llm", "judge"):
        return _build_judge_scorer(spec)
    raise ValueError(f"unknown scorer type {kind!r}")
243
+
244
+
245
def _build_judge_scorer(spec: dict) -> tuple[Scorer, Optional[Callable[[], Any]]]:
    """Construct a ``judge_llm`` scorer and its optional cleanup callable.

    The transport comes from exactly one of:

    * ``send_factory: 'module:fn'`` -- called with ``send_kwargs`` (if any);
      it returns either a ``send`` callable or a ``(send, aclose)`` 2-tuple.
    * ``openai_chat: {...}`` -- the mapping is passed as keyword arguments to
      ``openai_chat_judge``, which yields ``(send, aclose)``.

    Optional keys: ``template``, ``max_concurrency`` (default 4, coerced with
    ``int``) and ``parse_factory`` (a reference resolved to the ``parse``
    callable). ``template`` and ``parse`` are only forwarded to ``judge_llm``
    when set.

    Raises:
        ValueError: if neither ``send_factory`` nor ``openai_chat`` is given.
    """
    template = spec.get("template")
    concurrency = int(spec.get("max_concurrency", 4))
    parse = resolve_callable(spec["parse_factory"]) if "parse_factory" in spec else None

    aclose: Optional[Callable[[], Any]] = None

    if "send_factory" in spec:
        make_send = resolve_callable(spec["send_factory"])
        send_options = spec.get("send_kwargs", {}) or {}
        produced = make_send(**send_options)
        if isinstance(produced, tuple) and len(produced) == 2:
            send, aclose = produced
        else:
            send = produced
    elif "openai_chat" in spec:
        send, aclose = openai_chat_judge(**dict(spec["openai_chat"]))
    else:
        raise ValueError(
            "judge_llm scorer requires either 'send_factory' or 'openai_chat'"
        )

    judge_options: dict[str, Any] = {"max_concurrency": concurrency}
    if template is not None:
        judge_options["template"] = template
    if parse is not None:
        judge_options["parse"] = parse
    return judge_llm(send, **judge_options), aclose
281
+
282
+
283
def apply_correctness(workload_type: WorkloadType, spec: dict
                      ) -> tuple[WorkloadType, list]:
    """Attach correctness grading to a workload-type.

    A workload-type whose ``handles_reference`` attribute is truthy is used
    as-is; any other is wrapped in ``EvalWorkloadType`` with the configured
    ``reference_key`` and ``extra_meta_keys``. In both cases a post-hook built
    by ``correctness_hook`` is returned alongside it.

    Recognised ``spec`` keys and defaults: ``reference_key`` ("reference"),
    ``extra_meta_keys`` (empty), ``gate_key`` ("correct"; the strings "",
    "null" and "none" mean None), ``prefix`` (""), ``require_reference``
    (True), ``max_prediction_chars`` (2048; the strings "", "null", "none",
    "all" and "full", case-insensitively, mean None) and ``scorer`` (passed to
    `build_scorer`).

    If the scorer comes with a cleanup callable, the returned workload-type's
    ``aclose`` is replaced by a coroutine that awaits the previous ``aclose``
    and then runs the scorer cleanup, even if the former raised.

    Returns:
        ``(workload_type_to_use, [hook])``.
    """
    reference_key = spec.get("reference_key", "reference")
    extra_meta_keys = tuple(spec.get("extra_meta_keys") or ())

    gate_key = spec.get("gate_key", "correct")
    gate_key = None if gate_key in ("", "null", "none") else gate_key

    prefix = spec.get("prefix", "")
    require_reference = bool(spec.get("require_reference", True))

    max_prediction_chars: Any = spec.get("max_prediction_chars", 2048)
    unlimited_markers = ("", "null", "none", "all", "full")
    if (
        isinstance(max_prediction_chars, str)
        and max_prediction_chars.lower() in unlimited_markers
    ):
        max_prediction_chars = None

    scorer, scorer_aclose = build_scorer(spec.get("scorer"))

    if getattr(workload_type, "handles_reference", False):
        target = workload_type
    else:
        target = EvalWorkloadType(
            workload_type,
            reference_key=reference_key,
            extra_meta_keys=extra_meta_keys,
        )

    if scorer_aclose is not None:
        inner_aclose = target.aclose

        async def _close_both() -> None:
            try:
                await inner_aclose()
            finally:
                # The scorer cleanup may be sync or async; await only when
                # it hands back an awaitable.
                pending = scorer_aclose()
                if hasattr(pending, "__await__"):
                    await pending

        target.aclose = _close_both  # type: ignore[method-assign]

    hook = correctness_hook(
        scorer,
        reference_key=reference_key,
        gate_key=gate_key,
        prefix=prefix,
        require_reference=require_reference,
        max_prediction_chars=max_prediction_chars,
    )
    return target, [hook]
343
+
344
+
345
def build_config(cfg: dict, dotenv_path: Optional[str] = ".env",
                 interpolate_env: bool = True) -> BenchConfig:
    """Build a BenchConfig from a (typically YAML-loaded) dict.

    Two modes are supported. With a ``replay`` key, the workload-type,
    workload and load model all come from `_build_replay`. Otherwise
    ``workload_type``, ``workload`` and ``load`` are built individually, and
    ``duration`` / ``duration_s`` and ``max_requests`` are handed to
    ``parse_rate_spec``.

    Args:
        cfg: the config dict.
        dotenv_path: if truthy, passed to ``load_dotenv`` before anything else
            is read.
        interpolate_env: if True, ``cfg`` is passed through ``interpolate``
            before use.

    Returns:
        The assembled ``BenchConfig``.

    Raises:
        ValueError: if neither ``workload_type`` (or a legacy-style
            ``workload``) nor ``replay`` is defined, or ``load`` is missing in
            non-replay mode.
        TypeError: if ``correctness`` is set but is not a dict.
    """
    if dotenv_path:
        load_dotenv(dotenv_path)
    if interpolate_env:
        cfg = interpolate(cfg)

    replay_spec = cfg.get("replay")
    if replay_spec is not None:
        workload_type, workload, load_model = _build_replay(replay_spec)
    else:
        wt_spec = cfg.get("workload_type")
        if not wt_spec:
            # Back-compat shim: if old 'workload' key looks like a workload-type
            # config (has 'type: http' or 'type: openai'), accept it.
            legacy = cfg.get("workload")
            # `or "http"` (not a .get default) so an explicit `type: null`
            # falls back to http instead of crashing on None.lower(), matching
            # build_workload_type.
            if isinstance(legacy, dict) and (legacy.get("type") or "http").lower() in (
                "http", "openai", "openai-chat", "llm", "llm-chat"
            ):
                wt_spec = legacy
                # The legacy block was consumed as the workload-type, so no
                # dataset spec is left.
                cfg = {**cfg, "workload": None}
            else:
                raise ValueError("config must define 'workload_type' or 'replay'")

        workload_type = build_workload_type(wt_spec)
        workload = build_workload(cfg.get("workload"))

        load_spec = cfg.get("load")
        if load_spec is None:
            raise ValueError("config must define 'load'")
        duration = cfg.get("duration") or cfg.get("duration_s")
        # Only string durations are parsed; other values pass through as-is.
        if duration is not None and isinstance(duration, str):
            duration = parse_duration(duration)
        load_model = parse_rate_spec(load_spec, duration_s=duration,
                                     max_requests=cfg.get("max_requests"))

    pre_hooks = [resolve_callable(h) for h in (cfg.get("pre_hooks") or [])]
    post_hooks = [resolve_callable(h) for h in (cfg.get("post_hooks") or [])]
    monitors = [build_monitor(m) for m in (cfg.get("monitors") or [])]

    # Correctness block. In normal mode it wraps the workload-type (to split
    # references out of items); in replay mode the recorded Request already
    # carries the reference under `meta`, so we just install the post-hook.
    correctness_spec = cfg.get("correctness")
    if correctness_spec:
        if not isinstance(correctness_spec, dict):
            raise TypeError("'correctness' must be a dict")
        workload_type, extra_post = apply_correctness(workload_type, correctness_spec)
        post_hooks = list(post_hooks) + list(extra_post)

    recorder = _build_recorder(cfg.get("record"))

    return BenchConfig(
        workload_type=workload_type,
        workload=workload,
        load=load_model,
        pre_hooks=pre_hooks,
        post_hooks=post_hooks,
        monitors=monitors,
        recorder=recorder,
        connection_limit=int(cfg.get("connection_limit", 1000)),
        timeout_s=float(cfg.get("timeout_s", 60.0)),
        max_in_flight=int(cfg.get("max_in_flight", 10000)),
        progress_every_s=float(cfg.get("progress_every_s", 1.0)),
    )
419
+
420
+
421
def _build_recorder(spec: Any) -> Optional[TraceRecorder]:
    """Resolve a ``record:`` block into a TraceRecorder, or None if absent.

    Args:
        spec: ``None`` (recording disabled), a bare string path, or a dict
            with a ``path`` key.

    Raises:
        TypeError: if ``spec`` is neither None, a string nor a dict.
        ValueError: if no non-empty path is given.
    """
    if spec is None:
        return None
    # Reject other shapes up front (mirrors `replay` handling) instead of
    # failing with an AttributeError on `.get`.
    if not isinstance(spec, (str, dict)):
        raise TypeError(f"'record' must be dict|str, got {type(spec).__name__}")
    path = spec if isinstance(spec, str) else spec.get("path")
    if not path:
        raise ValueError("'record' must specify a 'path' (or be a bare string path)")
    return TraceRecorder(path)
428
+
429
+
430
def _build_replay(spec: Any) -> tuple[WorkloadType, Workload, Any]:
    """Resolve a `replay:` block into (workload_type, workload, load_model).

    ``spec`` is either a bare string path or a dict with ``path`` and the
    optional keys ``speed`` (default 1.0, coerced with ``float``) and
    ``streaming`` (default False, coerced with ``bool``). The trace is read
    with ``load_trace`` and shared by the returned ``TraceWorkload`` and
    ``TracePacedLoad``.

    Raises:
        TypeError: if ``spec`` is neither a dict nor a string.
        ValueError: if no non-empty ``path`` is given.
    """
    options = {"path": spec} if isinstance(spec, str) else spec
    if not isinstance(options, dict):
        raise TypeError(f"'replay' must be dict|str, got {type(options).__name__}")

    trace_path = options.get("path")
    if not trace_path:
        raise ValueError("'replay' must specify a 'path'")

    # Coerce options before touching the trace file, so bad values fail first.
    playback_speed = float(options.get("speed", 1.0))
    use_streaming = bool(options.get("streaming", False))

    trace = load_trace(trace_path)
    replay_type = ReplayWorkloadType(streaming=use_streaming)
    replay_items = TraceWorkload(trace)
    pacing = TracePacedLoad(trace, speed=playback_speed)
    return (replay_type, replay_items, pacing)
447
+
448
+
benchmaker/env.py ADDED
@@ -0,0 +1,87 @@
1
+ """Lightweight .env loading and ${VAR} interpolation for YAML configs."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ import re
7
+ from typing import Any, Mapping, Optional
8
+
9
+
10
def load_dotenv(path: str = ".env", override: bool = False) -> dict[str, str]:
    """Parse a .env file and inject KEY=VALUE pairs into `os.environ`.

    Minimal parser -- handles:
        KEY=value
        KEY="quoted value"
        KEY='single-quoted'
        KEY="quoted value"  # trailing comment
        # comment lines
        export KEY=value          (the `export` prefix is stripped)
        KEY=value with spaces     (unquoted; trailing whitespace stripped)
        KEY=value # comment       (unquoted; inline comment stripped)

    Lines without ``=`` or with an empty key are skipped. The file is read as
    UTF-8, and a leading byte-order mark is ignored.

    Args:
        path: location of the .env file.
        override: if True, values from the file replace existing env vars;
            by default existing env vars are NOT overwritten.

    Returns:
        The dict of values parsed from the file (including those not injected
        because the variable already existed). Returns ``{}`` if ``path`` is
        not an existing regular file.
    """
    # isfile rather than exists: a directory at `path` means "nothing to
    # load", not a crash in open().
    if not os.path.isfile(path):
        return {}

    out: dict[str, str] = {}
    # Decode explicitly so parsing does not depend on the platform locale;
    # "utf-8-sig" also drops a BOM that would otherwise corrupt the first key.
    with open(path, encoding="utf-8-sig") as f:
        for raw in f:
            line = raw.strip()
            if not line or line.startswith("#"):
                continue
            if line.startswith("export "):
                line = line[len("export "):].lstrip()
            key, sep, value = line.partition("=")
            key = key.strip()
            # An empty key cannot be stored in os.environ, so skip it like
            # any other malformed line.
            if not sep or not key:
                continue
            value = _dotenv_value(value)
            out[key] = value
            if override or key not in os.environ:
                os.environ[key] = value
    return out


def _dotenv_value(raw: str) -> str:
    """Return a .env value with surrounding quotes or an inline comment removed."""
    value = raw.strip()
    quote = value[:1]
    if len(value) >= 2 and quote in ("'", '"'):
        closing = value.find(quote, 1)
        if closing != -1:
            tail = value[closing + 1:]
            # Quoted value, optionally followed by whitespace and a comment.
            if not tail or (tail[0].isspace() and tail.lstrip().startswith("#")):
                return value[1:closing]
        # Quotes at both ends with stray quotes inside: strip the outer pair.
        if value[-1] == quote:
            return value[1:-1]
    # Strip inline comments only for unquoted values (rough heuristic).
    if " #" in value:
        return value.split(" #", 1)[0].rstrip()
    return value
53
+
54
+
55
# Matches ${NAME} and ${NAME:-default}; group 1 is the variable name, group 2
# the default text (None when no ":-" part is present).
_VAR_RE = re.compile(r"\$\{([A-Za-z_][A-Za-z_0-9]*)(?::-([^}]*))?\}")


def interpolate(value: Any, env: Optional[Mapping[str, str]] = None) -> Any:
    """Return a copy of `value` with ${VAR} / ${VAR:-default} expanded.

    Strings are expanded; dicts (values only) and lists are rebuilt
    recursively; anything else is returned unchanged. Variables are looked up
    in `env` when it is given, otherwise in `os.environ`.

    Raises:
        KeyError: if a referenced variable is missing and has no default.
    """
    lookup: Mapping[str, str] = os.environ if env is None else env
    return _walk(value, lookup)


def _walk(value: Any, env: Mapping[str, str]) -> Any:
    """Recursively expand variables inside strings, dict values and lists."""
    if isinstance(value, str):
        return _substitute(value, env)
    if isinstance(value, dict):
        rebuilt = {}
        for key, item in value.items():
            rebuilt[key] = _walk(item, env)
        return rebuilt
    if isinstance(value, list):
        expanded = []
        for item in value:
            expanded.append(_walk(item, env))
        return expanded
    return value


def _substitute(s: str, env: Mapping[str, str]) -> str:
    """Expand every ${VAR} / ${VAR:-default} occurrence in a single string."""

    def _expand(match: re.Match) -> str:
        var_name = match.group(1)
        fallback = match.group(2)
        # A variable that is present wins even when its value is empty.
        if var_name not in env:
            if fallback is None:
                raise KeyError(
                    f"environment variable {var_name!r} is not set (used in config)"
                )
            return fallback
        return env[var_name]

    return _VAR_RE.sub(_expand, s)