benchmaker 0.1.3__tar.gz → 0.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {benchmaker-0.1.3 → benchmaker-0.1.4}/PKG-INFO +2 -1
- {benchmaker-0.1.3 → benchmaker-0.1.4}/README.md +1 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/__init__.py +5 -2
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/config.py +71 -12
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/core/metrics.py +95 -57
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/core/runner.py +120 -19
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/recipes/swebench_replay.py +27 -1
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/swebench/harbor_eval.py +16 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/workloads/__init__.py +2 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/workloads/llm.py +7 -0
- benchmaker-0.1.4/benchmaker/workloads/rag.py +188 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker.egg-info/PKG-INFO +2 -1
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker.egg-info/SOURCES.txt +5 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/pyproject.toml +1 -1
- benchmaker-0.1.4/tests/test_collect_sweep_data.py +24 -0
- benchmaker-0.1.4/tests/test_mix.py +86 -0
- benchmaker-0.1.4/tests/test_qos_job_config.py +56 -0
- benchmaker-0.1.4/tests/test_rag.py +120 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/cli.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/core/__init__.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/core/load.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/core/monitors.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/core/trace.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/core/types.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/env.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/io/__init__.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/io/bundle.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/io/collect.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/recipes/__init__.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/recipes/_cli_shared.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/recipes/_factory.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/recipes/base.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/recipes/http.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/recipes/llm.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/recipes/sandbox.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/recipes/sglang.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/recipes/swebench.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/recipes/trajectory_replay.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/swebench/__init__.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/swebench/_flash_hardening.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/swebench/agent.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/swebench/grading.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/swebench/harbor_agent.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/swebench/observability.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/swebench/pi_agent.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/swebench/pi_ext/max_turns.js +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/swebench/pi_ext/register_provider.js +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/swebench/pi_ext/remote_exec.js +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/swebench/pi_ext/remote_exec_all.js +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/swebench/replay_server.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/swebench/timeout_load.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/swebench/trajectory.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/workloads/agent.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/workloads/base.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/workloads/datasets.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/workloads/eval.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/workloads/hf.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/workloads/http.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/workloads/sandbox.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/workloads/sglang.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker/workloads/trajectory.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker.egg-info/dependency_links.txt +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker.egg-info/entry_points.txt +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker.egg-info/requires.txt +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/benchmaker.egg-info/top_level.txt +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/setup.cfg +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_agent.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_agent_warmup.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_backfill_trajectory_status.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_bundle.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_coding_agent.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_collect_trajectories.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_dedupe_trajectories.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_eval.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_flash_hardening.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_hf.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_observability.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_passthrough_meta.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_pi_agent.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_pi_agent_timeout_injection.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_recipes_cli.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_replay_server.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_sandbox_duration.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_sglang.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_smoke.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_swebench_replay_recipe.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_timeout_load.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_trace.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_trajectory.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_trajectory_interleave.py +0 -0
- {benchmaker-0.1.3 → benchmaker-0.1.4}/tests/test_trajectory_replay.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: benchmaker
|
|
3
|
-
Version: 0.1.3
|
|
3
|
+
Version: 0.1.4
|
|
4
4
|
Summary: Async HTTP benchmarking utility with pluggable workloads and load models.
|
|
5
5
|
Author: Xiaozhe Yao
|
|
6
6
|
License: MIT
|
|
@@ -187,6 +187,7 @@ Full docs live in [`docs/`](docs/):
|
|
|
187
187
|
- [Correctness / accuracy eval](docs/eval.md) — grade responses against references
|
|
188
188
|
- [CLI & YAML reference](docs/cli-and-yaml.md)
|
|
189
189
|
- [ShareGPT benchmark](docs/sharegpt-benchmark.md) — self-contained end-to-end walkthrough
|
|
190
|
+
- [DeepRAG and mixed lanes](docs/deeprag-mix.md) — prefill-heavy RAG and phase-swinging dataset lanes
|
|
190
191
|
- [SGLang benchmark](docs/sglang.md) — native SGLang `/generate` benchmark
|
|
191
192
|
- [Trajectory replay](docs/trajectory-replay.md) — multi-turn prefix-cache parity replay
|
|
192
193
|
|
|
@@ -157,6 +157,7 @@ Full docs live in [`docs/`](docs/):
|
|
|
157
157
|
- [Correctness / accuracy eval](docs/eval.md) — grade responses against references
|
|
158
158
|
- [CLI & YAML reference](docs/cli-and-yaml.md)
|
|
159
159
|
- [ShareGPT benchmark](docs/sharegpt-benchmark.md) — self-contained end-to-end walkthrough
|
|
160
|
+
- [DeepRAG and mixed lanes](docs/deeprag-mix.md) — prefill-heavy RAG and phase-swinging dataset lanes
|
|
160
161
|
- [SGLang benchmark](docs/sglang.md) — native SGLang `/generate` benchmark
|
|
161
162
|
- [Trajectory replay](docs/trajectory-replay.md) — multi-turn prefix-cache parity replay
|
|
162
163
|
|
|
@@ -19,6 +19,7 @@ from benchmaker.workloads.http import HttpWorkloadType
|
|
|
19
19
|
from benchmaker.workloads.llm import OpenAIChatWorkloadType
|
|
20
20
|
from benchmaker.workloads.sandbox import SandboxWorkloadType
|
|
21
21
|
from benchmaker.workloads.hf import HFDatasetWorkload
|
|
22
|
+
from benchmaker.workloads.rag import DeepRAGWorkload
|
|
22
23
|
from benchmaker.workloads.sglang import SGLangGenerateWorkloadType
|
|
23
24
|
from benchmaker.workloads.trajectory import TrajectoryReplayWorkload
|
|
24
25
|
from benchmaker.workloads.agent import (
|
|
@@ -59,7 +60,7 @@ from benchmaker.core.monitors import (
|
|
|
59
60
|
PrometheusMonitor,
|
|
60
61
|
parse_prometheus,
|
|
61
62
|
)
|
|
62
|
-
from benchmaker.core.runner import BenchRunner, BenchConfig, BenchResult
|
|
63
|
+
from benchmaker.core.runner import BenchLane, BenchRunner, BenchConfig, BenchResult
|
|
63
64
|
from benchmaker.core.trace import (
|
|
64
65
|
ReplayWorkloadType,
|
|
65
66
|
TracePacedLoad,
|
|
@@ -89,6 +90,7 @@ __all__ = [
|
|
|
89
90
|
"OpenAIChatWorkloadType",
|
|
90
91
|
"SandboxWorkloadType",
|
|
91
92
|
"HFDatasetWorkload",
|
|
93
|
+
"DeepRAGWorkload",
|
|
92
94
|
"SGLangGenerateWorkloadType",
|
|
93
95
|
"TrajectoryReplayWorkload",
|
|
94
96
|
# agent workload (pluggable user-defined agents)
|
|
@@ -136,6 +138,7 @@ __all__ = [
|
|
|
136
138
|
# runner
|
|
137
139
|
"BenchRunner",
|
|
138
140
|
"BenchConfig",
|
|
141
|
+
"BenchLane",
|
|
139
142
|
"BenchResult",
|
|
140
143
|
# trace: record & replay
|
|
141
144
|
"TraceRecorder",
|
|
@@ -153,4 +156,4 @@ __all__ = [
|
|
|
153
156
|
"write_bundle",
|
|
154
157
|
]
|
|
155
158
|
|
|
156
|
-
__version__ = "0.1.3"
|
|
159
|
+
__version__ = "0.1.4"
|
|
@@ -22,7 +22,7 @@ from typing import Any, Callable, Optional
|
|
|
22
22
|
from benchmaker.env import interpolate, load_dotenv
|
|
23
23
|
from benchmaker.core.load import parse_duration, parse_rate_spec
|
|
24
24
|
from benchmaker.core.monitors import FunctionMonitor, Monitor, PrometheusMonitor
|
|
25
|
-
from benchmaker.core.runner import BenchConfig
|
|
25
|
+
from benchmaker.core.runner import BenchConfig, BenchLane
|
|
26
26
|
from benchmaker.workloads.base import WorkloadType
|
|
27
27
|
from benchmaker.workloads.datasets import (
|
|
28
28
|
CallableWorkload,
|
|
@@ -31,6 +31,7 @@ from benchmaker.workloads.datasets import (
|
|
|
31
31
|
Workload,
|
|
32
32
|
)
|
|
33
33
|
from benchmaker.workloads.hf import HFDatasetWorkload
|
|
34
|
+
from benchmaker.workloads.rag import DeepRAGWorkload
|
|
34
35
|
from benchmaker.workloads.http import HttpWorkloadType
|
|
35
36
|
from benchmaker.workloads.llm import OpenAIChatWorkloadType
|
|
36
37
|
from benchmaker.workloads.sandbox import SandboxWorkloadType
|
|
@@ -154,6 +155,8 @@ def build_workload(spec: Any) -> Workload:
|
|
|
154
155
|
return CallableWorkload(fn=fn, **kwargs)
|
|
155
156
|
if t in ("hf", "huggingface"):
|
|
156
157
|
return HFDatasetWorkload(**kwargs)
|
|
158
|
+
if t in ("deeprag", "deep-rag", "rag"):
|
|
159
|
+
return DeepRAGWorkload(**kwargs)
|
|
157
160
|
if t == "trajectory":
|
|
158
161
|
from benchmaker.workloads.trajectory import TrajectoryReplayWorkload
|
|
159
162
|
return TrajectoryReplayWorkload(**kwargs)
|
|
@@ -365,8 +368,12 @@ def build_config(cfg: dict, dotenv_path: Optional[str] = ".env",
|
|
|
365
368
|
cfg = interpolate(cfg)
|
|
366
369
|
|
|
367
370
|
replay_spec = cfg.get("replay")
|
|
371
|
+
mix_spec = cfg.get("mix")
|
|
372
|
+
if replay_spec is not None and mix_spec is not None:
|
|
373
|
+
raise ValueError("'replay' and 'mix' are mutually exclusive")
|
|
368
374
|
if replay_spec is not None:
|
|
369
375
|
workload_type, workload, load_model = _build_replay(replay_spec)
|
|
376
|
+
lanes: list[BenchLane] = []
|
|
370
377
|
else:
|
|
371
378
|
wt_spec = cfg.get("workload_type")
|
|
372
379
|
if not wt_spec:
|
|
@@ -382,16 +389,27 @@ def build_config(cfg: dict, dotenv_path: Optional[str] = ".env",
|
|
|
382
389
|
raise ValueError("config must define 'workload_type' or 'replay'")
|
|
383
390
|
|
|
384
391
|
workload_type = build_workload_type(wt_spec)
|
|
385
|
-
workload = build_workload(cfg.get("workload"))
|
|
386
|
-
|
|
387
|
-
load_spec = cfg.get("load")
|
|
388
|
-
if load_spec is None:
|
|
389
|
-
raise ValueError("config must define 'load'")
|
|
390
392
|
duration = cfg.get("duration") or cfg.get("duration_s")
|
|
391
393
|
if duration is not None and isinstance(duration, str):
|
|
392
394
|
duration = parse_duration(duration)
|
|
393
|
-
|
|
394
|
-
|
|
395
|
+
if mix_spec is not None:
|
|
396
|
+
if cfg.get("load") is not None:
|
|
397
|
+
raise ValueError("a mixed config cannot also define top-level 'load'")
|
|
398
|
+
workload = StaticWorkload()
|
|
399
|
+
load_model = None
|
|
400
|
+
lanes = _build_lanes(
|
|
401
|
+
mix_spec,
|
|
402
|
+
duration_s=duration,
|
|
403
|
+
max_requests=cfg.get("max_requests"),
|
|
404
|
+
)
|
|
405
|
+
else:
|
|
406
|
+
workload = build_workload(cfg.get("workload"))
|
|
407
|
+
load_spec = cfg.get("load")
|
|
408
|
+
if load_spec is None:
|
|
409
|
+
raise ValueError("config must define 'load' or 'mix.lanes'")
|
|
410
|
+
load_model = parse_rate_spec(load_spec, duration_s=duration,
|
|
411
|
+
max_requests=cfg.get("max_requests"))
|
|
412
|
+
lanes = []
|
|
395
413
|
|
|
396
414
|
pre_hooks = [resolve_callable(h) for h in (cfg.get("pre_hooks") or [])]
|
|
397
415
|
post_hooks = [resolve_callable(h) for h in (cfg.get("post_hooks") or [])]
|
|
@@ -410,9 +428,11 @@ def build_config(cfg: dict, dotenv_path: Optional[str] = ".env",
|
|
|
410
428
|
# A workload that schedules on per-request completion (e.g. interleaved
|
|
411
429
|
# trajectory replay) declares the post-hook it needs; install it so a YAML
|
|
412
430
|
# config can't silently stall waiting for a signal it never wired up.
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
431
|
+
workloads = [lane.workload for lane in lanes] if lanes else [workload]
|
|
432
|
+
for lane_workload in workloads:
|
|
433
|
+
wl_hook = lane_workload.completion_hook()
|
|
434
|
+
if wl_hook is not None and wl_hook not in post_hooks:
|
|
435
|
+
post_hooks = list(post_hooks) + [wl_hook]
|
|
416
436
|
|
|
417
437
|
recorder = _build_recorder(cfg.get("record"))
|
|
418
438
|
|
|
@@ -420,6 +440,7 @@ def build_config(cfg: dict, dotenv_path: Optional[str] = ".env",
|
|
|
420
440
|
workload_type=workload_type,
|
|
421
441
|
workload=workload,
|
|
422
442
|
load=load_model,
|
|
443
|
+
lanes=lanes,
|
|
423
444
|
pre_hooks=pre_hooks,
|
|
424
445
|
post_hooks=post_hooks,
|
|
425
446
|
monitors=monitors,
|
|
@@ -428,9 +449,48 @@ def build_config(cfg: dict, dotenv_path: Optional[str] = ".env",
|
|
|
428
449
|
timeout_s=float(cfg.get("timeout_s", 60.0)),
|
|
429
450
|
max_in_flight=int(cfg.get("max_in_flight", 10000)),
|
|
430
451
|
progress_every_s=float(cfg.get("progress_every_s", 1.0)),
|
|
452
|
+
stop_on_exhausted=bool(cfg.get("stop_on_exhausted", True)),
|
|
431
453
|
)
|
|
432
454
|
|
|
433
455
|
|
|
456
|
+
def _build_lanes(spec: Any, *, duration_s: Optional[float],
|
|
457
|
+
max_requests: Optional[int]) -> list[BenchLane]:
|
|
458
|
+
"""Build independent workload/load pairs from a ``mix:`` YAML block."""
|
|
459
|
+
if not isinstance(spec, dict):
|
|
460
|
+
raise TypeError("'mix' must be a mapping with a 'lanes' list")
|
|
461
|
+
lane_specs = spec.get("lanes")
|
|
462
|
+
if not isinstance(lane_specs, list) or not lane_specs:
|
|
463
|
+
raise ValueError("'mix.lanes' must be a non-empty list")
|
|
464
|
+
|
|
465
|
+
lanes: list[BenchLane] = []
|
|
466
|
+
for index, lane_spec in enumerate(lane_specs):
|
|
467
|
+
if not isinstance(lane_spec, dict):
|
|
468
|
+
raise TypeError(f"mix.lanes[{index}] must be a mapping")
|
|
469
|
+
name = lane_spec.get("name")
|
|
470
|
+
if not isinstance(name, str) or not name.strip():
|
|
471
|
+
raise ValueError(f"mix.lanes[{index}].name must be a non-empty string")
|
|
472
|
+
if "workload" not in lane_spec:
|
|
473
|
+
raise ValueError(f"mix.lanes[{index}] must define a workload")
|
|
474
|
+
rate = lane_spec.get("rate", lane_spec.get("load"))
|
|
475
|
+
if rate is None:
|
|
476
|
+
raise ValueError(f"mix.lanes[{index}] must define rate (or load)")
|
|
477
|
+
|
|
478
|
+
lane_duration = lane_spec.get("duration", duration_s)
|
|
479
|
+
if isinstance(lane_duration, str):
|
|
480
|
+
lane_duration = parse_duration(lane_duration)
|
|
481
|
+
lane_max_requests = lane_spec.get("max_requests", max_requests)
|
|
482
|
+
lanes.append(BenchLane(
|
|
483
|
+
name=name,
|
|
484
|
+
workload=build_workload(lane_spec["workload"]),
|
|
485
|
+
load=parse_rate_spec(
|
|
486
|
+
rate,
|
|
487
|
+
duration_s=lane_duration,
|
|
488
|
+
max_requests=lane_max_requests,
|
|
489
|
+
),
|
|
490
|
+
))
|
|
491
|
+
return lanes
|
|
492
|
+
|
|
493
|
+
|
|
434
494
|
def _build_recorder(spec: Any) -> Optional[TraceRecorder]:
|
|
435
495
|
if spec is None:
|
|
436
496
|
return None
|
|
@@ -458,4 +518,3 @@ def _build_replay(spec: Any) -> tuple[WorkloadType, Workload, Any]:
|
|
|
458
518
|
TracePacedLoad(trace, speed=speed),
|
|
459
519
|
)
|
|
460
520
|
|
|
461
|
-
|
|
@@ -52,64 +52,22 @@ class MetricsAggregator:
|
|
|
52
52
|
def summary(self) -> dict:
|
|
53
53
|
end = self.end_time or time.monotonic()
|
|
54
54
|
wall_s = max(end - self.start_time, 1e-9)
|
|
55
|
-
ok = [s for s in self.samples if s.ok]
|
|
56
|
-
fail = [s for s in self.samples if not s.ok]
|
|
57
|
-
# Split fail into transport failures vs. delivered-but-graded-wrong.
|
|
58
|
-
wrong = [s for s in fail if s.request_ok]
|
|
59
|
-
request_failed = [s for s in fail if not s.request_ok]
|
|
60
|
-
latencies = [s.latency_s for s in ok]
|
|
61
|
-
|
|
62
|
-
status_counts = Counter(s.status for s in self.samples)
|
|
63
|
-
error_counts = Counter(s.error for s in fail if s.error)
|
|
64
|
-
|
|
65
|
-
out: dict = {
|
|
66
|
-
"wall_time_s": wall_s,
|
|
67
|
-
"total_requests": len(self.samples),
|
|
68
|
-
"success": len(ok),
|
|
69
|
-
"failed": len(fail),
|
|
70
|
-
"request_failed": len(request_failed),
|
|
71
|
-
"wrong_output": len(wrong),
|
|
72
|
-
"error_rate": (len(fail) / len(self.samples)) if self.samples else 0.0,
|
|
73
|
-
"request_failure_rate": (
|
|
74
|
-
(len(request_failed) / len(self.samples)) if self.samples else 0.0
|
|
75
|
-
),
|
|
76
|
-
"throughput_rps": len(self.samples) / wall_s,
|
|
77
|
-
"goodput_rps": len(ok) / wall_s,
|
|
78
|
-
"bytes_sent": sum(s.bytes_sent for s in self.samples),
|
|
79
|
-
"bytes_recv": sum(s.bytes_recv for s in self.samples),
|
|
80
|
-
"status_codes": dict(status_counts),
|
|
81
|
-
"errors": dict(error_counts),
|
|
82
|
-
}
|
|
83
|
-
if latencies:
|
|
84
|
-
out["latency_s"] = {
|
|
85
|
-
"mean": statistics.mean(latencies),
|
|
86
|
-
"min": min(latencies),
|
|
87
|
-
"max": max(latencies),
|
|
88
|
-
"p50": _pct(latencies, 50),
|
|
89
|
-
"p90": _pct(latencies, 90),
|
|
90
|
-
"p95": _pct(latencies, 95),
|
|
91
|
-
"p99": _pct(latencies, 99),
|
|
92
|
-
"p999": _pct(latencies, 99.9),
|
|
93
|
-
}
|
|
55
|
+
out = _summary_for_samples(self.samples, wall_s)
|
|
94
56
|
|
|
95
|
-
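# Aggregate workload-specific `extra` metrics generically: mean + percentiles.
|
|
96
|
-
extras: dict[str, list[float]] = defaultdict(list)
|
|
97
|
-
for s in ok:
|
|
98
|
-
for k, v in s.extra.items():
|
|
99
|
-
if isinstance(v, (int, float)):
|
|
100
|
-
extras[k].append(float(v))
|
|
101
|
-
if extras:
|
|
102
|
-
ext_summary = {}
|
|
103
|
-
for k, vals in extras.items():
|
|
104
|
-
ext_summary[k] = {
|
|
105
|
-
"mean": statistics.mean(vals),
|
|
106
|
-
"p50": _pct(vals, 50),
|
|
107
|
-
"p90": _pct(vals, 90),
|
|
108
|
-
"p99": _pct(vals, 99),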
|
|
109
|
-
"min": min(vals),
|
|
110
|
-
"max": max(vals),
|
|
111
|
-
}
|
|
112
|
-
out["workload_metrics"] = ext_summary
|
|
57
|
+
# A mixed benchmark needs each lane's SLO signal independently. Use
|
|
58
|
+
# the same wall-clock interval as the aggregate so lane throughput is
|
|
59
|
+
# directly comparable to the total, while latency and workload metrics
|
|
60
|
+
# remain scoped to that lane's samples.
|
|
61
|
+
lanes: dict[str, list[Sample]] = defaultdict(list)
|
|
62
|
+
for sample in self.samples:
|
|
63
|
+
lane = sample.meta.get("lane")
|
|
64
|
+
if isinstance(lane, str) and lane:
|
|
65
|
+
lanes[lane].append(sample)
|
|
66
|
+
if lanes:
|
|
67
|
+
out["lanes"] = {
|
|
68
|
+
name: _summary_for_samples(samples, wall_s)
|
|
69
|
+
for name, samples in sorted(lanes.items())
|
|
70
|
+
}
|
|
113
71
|
|
|
114
72
|
# Monitor time-series: summarize each metric per monitor.
|
|
115
73
|
if self.monitor_samples:
|
|
@@ -181,6 +139,22 @@ class MetricsAggregator:
|
|
|
181
139
|
lines.append(f" {k}")
|
|
182
140
|
for kk in ("mean", "p50", "p90", "p99", "max"):
|
|
183
141
|
lines.append(f" {kk:<6}: {v[kk]:.4f}")
|
|
142
|
+
if s.get("lanes"):
|
|
143
|
+
lines.append("")
|
|
144
|
+
lines.append(" lanes")
|
|
145
|
+
for name, lane in s["lanes"].items():
|
|
146
|
+
lines.append(
|
|
147
|
+
f" {name}: {lane['total_requests']} requests, "
|
|
148
|
+
f"{lane['throughput_rps']:.2f} req/s, "
|
|
149
|
+
f"{lane['success']} success"
|
|
150
|
+
)
|
|
151
|
+
for metric in ("ttft_s", "itl_ms_mean", "tokens_per_s"):
|
|
152
|
+
values = lane.get("workload_metrics", {}).get(metric)
|
|
153
|
+
if values:
|
|
154
|
+
lines.append(
|
|
155
|
+
f" {metric}: p50={values['p50']:.4f}, "
|
|
156
|
+
f"p99={values['p99']:.4f}"
|
|
157
|
+
)
|
|
184
158
|
if s.get("monitors"):
|
|
185
159
|
for mon_name, mon in s["monitors"].items():
|
|
186
160
|
lines.append("")
|
|
@@ -223,6 +197,70 @@ class MetricsAggregator:
|
|
|
223
197
|
}) + "\n")
|
|
224
198
|
|
|
225
199
|
|
|
200
|
+
def _summary_for_samples(samples: list[Sample], wall_s: float) -> dict:
|
|
201
|
+
"""Summarize a sample subset over a shared benchmark wall-clock interval."""
|
|
202
|
+
ok = [s for s in samples if s.ok]
|
|
203
|
+
fail = [s for s in samples if not s.ok]
|
|
204
|
+
# Split fail into transport failures vs. delivered-but-graded-wrong.
|
|
205
|
+
wrong = [s for s in fail if s.request_ok]
|
|
206
|
+
request_failed = [s for s in fail if not s.request_ok]
|
|
207
|
+
latencies = [s.latency_s for s in ok]
|
|
208
|
+
|
|
209
|
+
status_counts = Counter(s.status for s in samples)
|
|
210
|
+
error_counts = Counter(s.error for s in fail if s.error)
|
|
211
|
+
|
|
212
|
+
out: dict = {
|
|
213
|
+
"wall_time_s": wall_s,
|
|
214
|
+
"total_requests": len(samples),
|
|
215
|
+
"success": len(ok),
|
|
216
|
+
"failed": len(fail),
|
|
217
|
+
"request_failed": len(request_failed),
|
|
218
|
+
"wrong_output": len(wrong),
|
|
219
|
+
"error_rate": (len(fail) / len(samples)) if samples else 0.0,
|
|
220
|
+
"request_failure_rate": (
|
|
221
|
+
(len(request_failed) / len(samples)) if samples else 0.0
|
|
222
|
+
),
|
|
223
|
+
"throughput_rps": len(samples) / wall_s,
|
|
224
|
+
"goodput_rps": len(ok) / wall_s,
|
|
225
|
+
"bytes_sent": sum(s.bytes_sent for s in samples),
|
|
226
|
+
"bytes_recv": sum(s.bytes_recv for s in samples),
|
|
227
|
+
"status_codes": dict(status_counts),
|
|
228
|
+
"errors": dict(error_counts),
|
|
229
|
+
}
|
|
230
|
+
if latencies:
|
|
231
|
+
out["latency_s"] = {
|
|
232
|
+
"mean": statistics.mean(latencies),
|
|
233
|
+
"min": min(latencies),
|
|
234
|
+
"max": max(latencies),
|
|
235
|
+
"p50": _pct(latencies, 50),
|
|
236
|
+
"p90": _pct(latencies, 90),
|
|
237
|
+
"p95": _pct(latencies, 95),
|
|
238
|
+
"p99": _pct(latencies, 99),
|
|
239
|
+
"p999": _pct(latencies, 99.9),
|
|
240
|
+
}
|
|
241
|
+
|
|
242
|
+
# Aggregate workload-specific `extra` metrics generically: mean + percentiles.
|
|
243
|
+
extras: dict[str, list[float]] = defaultdict(list)
|
|
244
|
+
for s in ok:
|
|
245
|
+
for k, v in s.extra.items():
|
|
246
|
+
if isinstance(v, (int, float)):
|
|
247
|
+
extras[k].append(float(v))
|
|
248
|
+
if extras:
|
|
249
|
+
ext_summary = {}
|
|
250
|
+
for k, vals in extras.items():
|
|
251
|
+
ext_summary[k] = {
|
|
252
|
+
"mean": statistics.mean(vals),
|
|
253
|
+
"p50": _pct(vals, 50),
|
|
254
|
+
"p90": _pct(vals, 90),
|
|
255
|
+
"p99": _pct(vals, 99),
|
|
256
|
+
"min": min(vals),
|
|
257
|
+
"max": max(vals),
|
|
258
|
+
}
|
|
259
|
+
out["workload_metrics"] = ext_summary
|
|
260
|
+
|
|
261
|
+
return out
|
|
262
|
+
|
|
263
|
+
|
|
226
264
|
def _safe_meta(meta: dict) -> dict:
|
|
227
265
|
out = {}
|
|
228
266
|
for k, v in meta.items():
|
|
@@ -30,11 +30,31 @@ from benchmaker.workloads.base import WorkloadType
|
|
|
30
30
|
from benchmaker.workloads.datasets import Workload, StaticWorkload
|
|
31
31
|
|
|
32
32
|
|
|
33
|
+
@dataclass
|
|
34
|
+
class BenchLane:
|
|
35
|
+
"""One independently scheduled input lane in a mixed benchmark.
|
|
36
|
+
|
|
37
|
+
All lanes share the enclosing :class:`BenchConfig`'s workload type and
|
|
38
|
+
endpoint, but each owns its own data source and load model. This keeps
|
|
39
|
+
OpenAI/HTTP protocol configuration centralized while preserving independent
|
|
40
|
+
arrival processes for phase-swing experiments.
|
|
41
|
+
"""
|
|
42
|
+
|
|
43
|
+
name: str
|
|
44
|
+
workload: Workload
|
|
45
|
+
load: LoadModel
|
|
46
|
+
|
|
47
|
+
def __post_init__(self) -> None:
|
|
48
|
+
if not self.name or not self.name.strip():
|
|
49
|
+
raise ValueError("lane name must be a non-empty string")
|
|
50
|
+
|
|
51
|
+
|
|
33
52
|
@dataclass
|
|
34
53
|
class BenchConfig:
|
|
35
54
|
workload_type: WorkloadType # how to talk to the service
|
|
36
|
-
load: LoadModel
|
|
55
|
+
load: Optional[LoadModel] = None # when to fire (single workload)
|
|
37
56
|
workload: Workload = field(default_factory=StaticWorkload) # what to send
|
|
57
|
+
lanes: list[BenchLane] = field(default_factory=list)
|
|
38
58
|
pre_hooks: list[PreRequestHook] = field(default_factory=list)
|
|
39
59
|
post_hooks: list[PostResponseHook] = field(default_factory=list)
|
|
40
60
|
monitors: list[Monitor] = field(default_factory=list) # optional periodic samplers
|
|
@@ -48,6 +68,16 @@ class BenchConfig:
|
|
|
48
68
|
progress_every_s: float = 1.0
|
|
49
69
|
stop_on_exhausted: bool = True
|
|
50
70
|
|
|
71
|
+
def __post_init__(self) -> None:
|
|
72
|
+
if self.lanes:
|
|
73
|
+
if self.load is not None:
|
|
74
|
+
raise ValueError("configure either load/workload or lanes, not both")
|
|
75
|
+
names = [lane.name for lane in self.lanes]
|
|
76
|
+
if len(names) != len(set(names)):
|
|
77
|
+
raise ValueError("mixed benchmark lane names must be unique")
|
|
78
|
+
elif self.load is None:
|
|
79
|
+
raise ValueError("BenchConfig requires a load model or at least one lane")
|
|
80
|
+
|
|
51
81
|
|
|
52
82
|
@dataclass
|
|
53
83
|
class BenchResult:
|
|
@@ -73,7 +103,7 @@ class BenchRunner:
|
|
|
73
103
|
try:
|
|
74
104
|
await self._drive(session)
|
|
75
105
|
finally:
|
|
76
|
-
await self.cfg.workload.aclose()
|
|
106
|
+
await self._aclose_workloads()
|
|
77
107
|
await self.cfg.workload_type.aclose()
|
|
78
108
|
if self.cfg.recorder is not None:
|
|
79
109
|
self.cfg.recorder.close()
|
|
@@ -96,19 +126,10 @@ class BenchRunner:
|
|
|
96
126
|
))
|
|
97
127
|
|
|
98
128
|
try:
|
|
99
|
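-
async for _ in self.cfg.load.tickets():
|
|
100
|
-
try:
|
|
101
|
-
item = await self.cfg.workload.next_item()
|
|
102
|
-
except StopAsyncIteration: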
|
|
103
|
-
if self.cfg.stop_on_exhausted:
|
|
104
|
-
break
|
|
105
|
-
else:
|
|
106
|
-
continue
|
|
107
|
-
|
|
108
|
-
await sem.acquire()
|
|
109
|
-
task = asyncio.create_task(self._fire(session, item, sem))
|
|
110
|
-
tasks.add(task)
|
|
111
|
-
task.add_done_callback(tasks.discard)
|
|
129
|
+
if self.cfg.lanes:
|
|
130
|
+
await self._drive_lanes(session, sem, tasks)
|
|
131
|
+
else:
|
|
132
|
+
await self._drive_single(session, sem, tasks)
|
|
112
133
|
finally:
|
|
113
134
|
progress_task.cancel()
|
|
114
135
|
try:
|
|
@@ -124,11 +145,71 @@ class BenchRunner:
|
|
|
124
145
|
if monitor_tasks:
|
|
125
146
|
await asyncio.gather(*monitor_tasks, return_exceptions=True)
|
|
126
147
|
|
|
148
|
+
async def _drive_single(self, session: aiohttp.ClientSession,
|
|
149
|
+
sem: asyncio.Semaphore,
|
|
150
|
+
tasks: set[asyncio.Task]) -> None:
|
|
151
|
+
assert self.cfg.load is not None
|
|
152
|
+
async for _ in self.cfg.load.tickets():
|
|
153
|
+
try:
|
|
154
|
+
item = await self.cfg.workload.next_item()
|
|
155
|
+
except StopAsyncIteration:
|
|
156
|
+
if self.cfg.stop_on_exhausted:
|
|
157
|
+
break
|
|
158
|
+
continue
|
|
159
|
+
|
|
160
|
+
await sem.acquire()
|
|
161
|
+
task = asyncio.create_task(
|
|
162
|
+
self._fire(session, item, sem, self.cfg.load)
|
|
163
|
+
)
|
|
164
|
+
tasks.add(task)
|
|
165
|
+
task.add_done_callback(tasks.discard)
|
|
166
|
+
|
|
167
|
+
async def _drive_lanes(self, session: aiohttp.ClientSession,
|
|
168
|
+
sem: asyncio.Semaphore,
|
|
169
|
+
tasks: set[asyncio.Task]) -> None:
|
|
170
|
+
"""Run each lane's admission iterator concurrently.
|
|
171
|
+
|
|
172
|
+
A finite workload ends only its own lane. Other lanes keep producing
|
|
173
|
+
tickets, which is required when one dataset is short or intentionally
|
|
174
|
+
bursty and another drives a long complementary phase.
|
|
175
|
+
"""
|
|
176
|
+
|
|
177
|
+
async def produce(lane: BenchLane) -> None:
|
|
178
|
+
async for _ in lane.load.tickets():
|
|
179
|
+
try:
|
|
180
|
+
item = await lane.workload.next_item()
|
|
181
|
+
except StopAsyncIteration:
|
|
182
|
+
if self.cfg.stop_on_exhausted:
|
|
183
|
+
break
|
|
184
|
+
continue
|
|
185
|
+
|
|
186
|
+
await sem.acquire()
|
|
187
|
+
task = asyncio.create_task(
|
|
188
|
+
self._fire(session, item, sem, lane.load, lane.name)
|
|
189
|
+
)
|
|
190
|
+
tasks.add(task)
|
|
191
|
+
task.add_done_callback(tasks.discard)
|
|
192
|
+
|
|
193
|
+
producers = [asyncio.create_task(produce(lane)) for lane in self.cfg.lanes]
|
|
194
|
+
try:
|
|
195
|
+
await asyncio.gather(*producers)
|
|
196
|
+
finally:
|
|
197
|
+
for producer in producers:
|
|
198
|
+
if not producer.done():
|
|
199
|
+
producer.cancel()
|
|
200
|
+
await asyncio.gather(*producers, return_exceptions=True)
|
|
201
|
+
|
|
127
202
|
async def _fire(self, session: aiohttp.ClientSession, item: Any,
|
|
128
|
-
sem: asyncio.Semaphore) -> None:
|
|
203
|
+
sem: asyncio.Semaphore, load: LoadModel,
|
|
204
|
+
lane_name: Optional[str] = None) -> None:
|
|
129
205
|
start_mono = time.monotonic()
|
|
130
206
|
try:
|
|
131
207
|
async def fire(req: Request) -> Response:
|
|
208
|
+
if lane_name is not None:
|
|
209
|
+
# The config-defined lane is authoritative: callers may
|
|
210
|
+
# still attach arbitrary metadata, but cannot accidentally
|
|
211
|
+
# collapse a mixed run into an incorrect lane.
|
|
212
|
+
req.meta["lane"] = lane_name
|
|
132
213
|
for hook in self.cfg.pre_hooks:
|
|
133
214
|
req = await maybe_await(hook(req))
|
|
134
215
|
fire_start = time.monotonic()
|
|
@@ -145,15 +226,30 @@ class BenchRunner:
|
|
|
145
226
|
workload_name=self.cfg.workload_type.name,
|
|
146
227
|
)
|
|
147
228
|
sample = await self.cfg.workload_type.run_ticket(ctx)
|
|
229
|
+
if lane_name is not None:
|
|
230
|
+
sample.meta["lane"] = lane_name
|
|
148
231
|
self.metrics.add(sample)
|
|
149
232
|
except Exception as e:
|
|
150
233
|
self.metrics.add(_failure_sample(
|
|
151
234
|
f"{type(e).__name__}: {e}",
|
|
152
235
|
self.cfg.workload_type.name,
|
|
236
|
+
lane_name,
|
|
153
237
|
))
|
|
154
238
|
finally:
|
|
155
239
|
sem.release()
|
|
156
|
-
|
|
240
|
+
load.on_complete()
|
|
241
|
+
|
|
242
|
+
async def _aclose_workloads(self) -> None:
|
|
243
|
+
workloads = (
|
|
244
|
+
[lane.workload for lane in self.cfg.lanes]
|
|
245
|
+
if self.cfg.lanes
|
|
246
|
+
else [self.cfg.workload]
|
|
247
|
+
)
|
|
248
|
+
closed: set[int] = set()
|
|
249
|
+
for workload in workloads:
|
|
250
|
+
if id(workload) not in closed:
|
|
251
|
+
closed.add(id(workload))
|
|
252
|
+
await workload.aclose()
|
|
157
253
|
|
|
158
254
|
async def _execute(self, session: aiohttp.ClientSession, req: Request,
|
|
159
255
|
start_mono: float) -> Response:
|
|
@@ -254,12 +350,16 @@ class BenchRunner:
|
|
|
254
350
|
out_dir,
|
|
255
351
|
self.metrics,
|
|
256
352
|
workload_type_name=self.cfg.workload_type.name,
|
|
257
|
-
workload_name=
|
|
353
|
+
workload_name=(
|
|
354
|
+
"mix:" + ",".join(lane.name for lane in self.cfg.lanes)
|
|
355
|
+
if self.cfg.lanes else self.cfg.workload.name
|
|
356
|
+
),
|
|
258
357
|
**kwargs,
|
|
259
358
|
)
|
|
260
359
|
|
|
261
360
|
|
|
262
|
-
def _failure_sample(error: str, workload: str
|
|
361
|
+
def _failure_sample(error: str, workload: str,
|
|
362
|
+
lane_name: Optional[str] = None) -> Sample:
|
|
263
363
|
return Sample(
|
|
264
364
|
start_ts=time.monotonic(),
|
|
265
365
|
latency_s=0.0,
|
|
@@ -268,6 +368,7 @@ def _failure_sample(error: str, workload: str) -> Sample:
|
|
|
268
368
|
request_ok=False,
|
|
269
369
|
error=error,
|
|
270
370
|
workload=workload,
|
|
371
|
+
meta={"lane": lane_name} if lane_name is not None else {},
|
|
271
372
|
)
|
|
272
373
|
|
|
273
374
|
|
|
@@ -157,6 +157,26 @@ class SWEBenchReplayRecipe(Recipe):
|
|
|
157
157
|
"trajectory store recorded with tool_results."),
|
|
158
158
|
click.option("--utilization-interval-sec", "utilization_interval_sec",
|
|
159
159
|
type=float, default=5.0, show_default=True),
|
|
160
|
+
click.option("--qos-enabled/--no-qos-enabled", "qos_enabled",
|
|
161
|
+
default=False, show_default=True,
|
|
162
|
+
help="Demote verifier-phase containers to best_effort "
|
|
163
|
+
"cpu.weight (and apply the QoS verifier-timeout "
|
|
164
|
+
"multiplier)."),
|
|
165
|
+
click.option("--on-demand-cpu-weight", "on_demand_cpu_weight",
|
|
166
|
+
type=int, default=10000, show_default=True,
|
|
167
|
+
help="cpu.weight for on-demand (agent-phase) containers "
|
|
168
|
+
"when --qos-enabled. Ignored when --no-qos-enabled."),
|
|
169
|
+
click.option("--best-effort-cpu-weight", "best_effort_cpu_weight",
|
|
170
|
+
type=int, default=10, show_default=True,
|
|
171
|
+
help="cpu.weight for best-effort (verifier-phase) "
|
|
172
|
+
"containers when --qos-enabled. Ignored when "
|
|
173
|
+
"--no-qos-enabled."),
|
|
174
|
+
click.option("--qos-verifier-timeout-multiplier",
|
|
175
|
+
"qos_verifier_timeout_multiplier",
|
|
176
|
+
type=float, default=2.0, show_default=True,
|
|
177
|
+
help="Verifier timeout multiplier applied only when "
|
|
178
|
+
"--qos-enabled (QoS demotes verifier CPU, so the "
|
|
179
|
+
"verifier needs more wall-clock time)."),
|
|
160
180
|
]
|
|
161
181
|
|
|
162
182
|
def run(self, shared: SharedOpts, *, job, trajectories, concurrency,
|
|
@@ -164,7 +184,9 @@ class SWEBenchReplayRecipe(Recipe):
|
|
|
164
184
|
dataset, exec_timeout_sec, n_tasks, task, exclude_task, n_attempts,
|
|
165
185
|
timeout_multiplier, backend_type, request_timeout_sec,
|
|
166
186
|
agent_ready_timeout_sec, jobs_dir, timeline,
|
|
167
|
-
utilization_interval_sec, validate_observations
|
|
187
|
+
utilization_interval_sec, validate_observations,
|
|
188
|
+
qos_enabled, on_demand_cpu_weight, best_effort_cpu_weight,
|
|
189
|
+
qos_verifier_timeout_multiplier) -> Optional[int]:
|
|
168
190
|
from benchmaker.swebench import harbor_eval as he
|
|
169
191
|
from benchmaker.swebench import trajectory as T
|
|
170
192
|
|
|
@@ -250,6 +272,10 @@ class SWEBenchReplayRecipe(Recipe):
|
|
|
250
272
|
request_timeout_sec=request_timeout_sec,
|
|
251
273
|
agent_ready_timeout_sec=agent_ready_timeout_sec,
|
|
252
274
|
jobs_dir=jobs_dir,
|
|
275
|
+
qos_enabled=qos_enabled,
|
|
276
|
+
on_demand_cpu_weight=on_demand_cpu_weight,
|
|
277
|
+
best_effort_cpu_weight=best_effort_cpu_weight,
|
|
278
|
+
qos_verifier_timeout_multiplier=qos_verifier_timeout_multiplier,
|
|
253
279
|
)
|
|
254
280
|
|
|
255
281
|
results: list[tuple] = []
|
|
@@ -219,12 +219,28 @@ def _build_job_config(args: argparse.Namespace) -> JobConfig:
|
|
|
219
219
|
if jobs_dir:
|
|
220
220
|
job_kwargs["jobs_dir"] = Path(jobs_dir)
|
|
221
221
|
|
|
222
|
+
# QoS: when enabled, splat the cpu.weight knobs into the environment kwargs
|
|
223
|
+
# (consumed by FlashSandboxEnvironment.__init__) and couple the verifier
|
|
224
|
+
# timeout — QoS demotes verifier-phase CPU, so the verifier needs more
|
|
225
|
+
# wall-clock time. Left untouched (None) when QoS is off.
|
|
226
|
+
verifier_timeout_multiplier = None
|
|
227
|
+
# QoS is wired only via the swebench-replay recipe CLI; harbor_eval's own
|
|
228
|
+
# _parse_args does not expose these flags, so this guard no-ops there.
|
|
229
|
+
if getattr(args, "qos_enabled", False):
|
|
230
|
+
environment.kwargs.update(
|
|
231
|
+
qos_enabled=True,
|
|
232
|
+
on_demand_cpu_weight=args.on_demand_cpu_weight,
|
|
233
|
+
best_effort_cpu_weight=args.best_effort_cpu_weight,
|
|
234
|
+
)
|
|
235
|
+
verifier_timeout_multiplier = args.qos_verifier_timeout_multiplier
|
|
236
|
+
|
|
222
237
|
return JobConfig(
|
|
223
238
|
job_name=args.job_name or "",
|
|
224
239
|
n_attempts=args.n_attempts,
|
|
225
240
|
n_concurrent_trials=args.concurrency,
|
|
226
241
|
quiet=False,
|
|
227
242
|
timeout_multiplier=args.timeout_multiplier,
|
|
243
|
+
verifier_timeout_multiplier=verifier_timeout_multiplier,
|
|
228
244
|
environment=environment,
|
|
229
245
|
agents=[agent],
|
|
230
246
|
datasets=[dataset],
|