PyPI - benchmaker - Versions diffs - 0.1.0__tar.gz → 0.1.2__tar.gz - Mend

benchmaker 0.1.0__tar.gz → 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (81) hide show

{benchmaker-0.1.0/benchmaker.egg-info → benchmaker-0.1.2}/PKG-INFO RENAMED Viewed

@@ -1,23 +1,32 @@
 Metadata-Version: 2.4
 Name: benchmaker
-Version: 0.1.0
+Version: 0.1.2
 Summary: Async HTTP benchmarking utility with pluggable workloads and load models.
 Author: Xiaozhe Yao
 License: MIT
-Requires-Python: >=3.10
+Requires-Python: >=3.12
 Description-Content-Type: text/markdown
 Requires-Dist: aiohttp>=3.9
 Requires-Dist: click>=8.1
+Requires-Dist: datasets>=4.8.5
+Requires-Dist: huggingface-hub>=1.16.4
+Requires-Dist: pyarrow>=24.0.0
 Requires-Dist: pyyaml>=6.0
+Requires-Dist: swebench>=4.1.0
 Provides-Extra: rich
 Requires-Dist: rich>=13; extra == "rich"
 Provides-Extra: hf
 Requires-Dist: datasets>=2.18; extra == "hf"
+Requires-Dist: transformers>=4.40; extra == "hf"
 Provides-Extra: dev
 Requires-Dist: pytest>=7; extra == "dev"
 Requires-Dist: pytest-asyncio>=0.23; extra == "dev"
+Provides-Extra: plot
+Requires-Dist: ipykernel>=7.2.0; extra == "plot"
+Requires-Dist: matplotlib>=3.10.9; extra == "plot"
+Requires-Dist: seaborn>=0.13.2; extra == "plot"
-# bench-maker
+# benchmaker
 Async HTTP benchmarking with pluggable workload-types (protocols), workloads
 (datasets), load models, hooks, and optional periodic monitors.
@@ -44,7 +53,7 @@ pip install -e .
 pip install -e .[dev]   # for tests
 ```
-This installs the `benchmaker` Python package and the `bench-maker` CLI.
+This installs the `benchmaker` Python package and the `benchmaker` CLI.
 ## 30-second tour
@@ -63,10 +72,12 @@ async def main():
 asyncio.run(main())
 ```
-Or via the CLI:
+Or via the CLI. Workload-specific benchmarks are exposed as **recipes** —
+`benchmaker <recipe> --args` (`http`, `llm`, `sandbox`, `swebench`, `sglang`,
+`trajectory-replay`):
 ```bash
-bench-maker quick --url https://httpbin.org/get --rate poisson:50 --duration 10s
+benchmaker http --url https://httpbin.org/get --rate poisson:50 --duration 10s
 ```
 ## Walkthrough: benchmarking an LLM endpoint with ShareGPT
@@ -129,16 +140,16 @@ also come from `.env` via `OpenAIChatWorkloadType.from_env(...)`.
 ### Rebuild or customize it yourself
-The published split is produced by `tools/prepare_sharegpt.py`, which downloads
+The published split is produced by `tools/sharegpt/prepare.py`, which downloads
 the upstream JSON once into `.local/` (gitignored) and converts it to the JSONL
 shape above. Run it when you want a subset, different filtering, or a refresh:
 ```bash
 # Defaults: .local/sharegpt_v3_raw.json  ->  .local/sharegpt_v3.jsonl
-python tools/prepare_sharegpt.py
+python tools/sharegpt/prepare.py
 # A quick subset for smoke tests:
-python tools/prepare_sharegpt.py --max-items 2000
+python tools/sharegpt/prepare.py --max-items 2000
 ```
 The raw download is ~700 MB. Use `--min-chars` / `--max-chars` to drop empty or
@@ -147,7 +158,7 @@ row). Point any workload at the local file with `JsonlWorkload(path=...,
 field="messages")`, or on the CLI:
 ```bash
-bench-maker llm \
+benchmaker llm \
     --url   http://localhost:8000/v1/chat/completions \
     --model meta-llama/Llama-3.1-8B-Instruct \
     --prompts-jsonl .local/sharegpt_v3.jsonl \
@@ -157,7 +168,7 @@ bench-maker llm \
     --out-dir ./runs --label dataset=sharegpt
 ```
-To re-publish after regenerating, `tools/upload_sharegpt_hf.py` pushes the
+To re-publish after regenerating, `tools/sharegpt/upload_hf.py` pushes the
 JSONL back to the Hub (needs a write token).
 ## Documentation
@@ -174,6 +185,41 @@ Full docs live in [`docs/`](docs/):
 - [Correctness / accuracy eval](docs/eval.md) — grade responses against references
 - [CLI & YAML reference](docs/cli-and-yaml.md)
 - [ShareGPT benchmark](docs/sharegpt-benchmark.md) — self-contained end-to-end walkthrough
+- `benchmaker sglang` — native SGLang `/generate` benchmark (see [`docs/sglang.md`](docs/sglang.md)).
+- `benchmaker trajectory-replay` — multi-turn prefix-cache parity replay of
+  trajectory datasets like SWE-smith (see [`docs/trajectory-replay.md`](docs/trajectory-replay.md)).
+## Deterministic replay (`swebench-replay`)
+Re-run a recorded SWE-bench job with the LLM **mocked from its own logs** — the
+real pi + sandbox + verifier pipeline still runs, only the model is served back
+from recorded outputs, so re-runs are deterministic and free of model
+cost/variance. Vary `--concurrency` (or `--sweep`) to study the rest of the
+pipeline without the model's stochasticity as a confound. Still needs
+`FLASH_SANDBOX_URL` (the sandbox + verifier are real).
+```bash
+# 1) (optional) convert a job's pi logs to a replay store — the recipe can also
+#    do this inline via --job.
+python -m benchmaker.swebench.trajectory jobs/2026-06-08__05-24-01_b352cb \
+    -o replay-trajectories.jsonl
+# 2) replay (host mode, localhost) across a concurrency sweep
+FLASH_SANDBOX_URL=http://localhost:8080 \
+  benchmaker swebench-replay --trajectories replay-trajectories.jsonl \
+    --mode pi-host --sweep 1,5,25
+# container mode: bind 0.0.0.0 and tell the sandbox how to reach the server
+FLASH_SANDBOX_URL=http://localhost:8080 \
+  benchmaker swebench-replay --job jobs/2026-06-08__05-24-01_b352cb \
+    --mode pi-container --host 0.0.0.0 --reachable-host "$(hostname -I | awk '{print $1}')"
+```
+The replay server is stateless: it picks each response by the task's identity
+(the `# Task:` line, falling back to a hash of the full prompt when the recorded
+run lacked an instance id) plus the count of assistant messages already in the
+request — so it is correct at any concurrency. A `MISSES` column in the summary
+flags any divergence (a request beyond the recorded turns).
 ## Examples
@@ -190,19 +236,29 @@ Under [`examples/`](examples/):
 - `config.yaml`           — generic HTTP YAML config
 - `config_llm.yaml`       — LLM YAML config with a Prometheus monitor
-Helper scripts under [`tools/`](tools/):
+Helper tooling under [`tools/`](tools/), grouped by purpose:
-- `prepare_sharegpt.py`   — fetch ShareGPT V3 and convert to a generic JSONL
-- `upload_sharegpt_hf.py` — push the converted JSONL to the HF Hub (write token)
-- `start_local_llm.sh`    — example local SGLang launch command
+- `sharegpt/`     — `prepare.py` (fetch ShareGPT V3 → JSONL) + `upload_hf.py`
+  (push to the HF Hub with a write token)
+- `swe_images/`   — mirror SWE-bench/R2E-Gym container images to ghcr
+  (`publish.py`) and list the published refs (`pull.py`)
+- `agent_warmup/` — build the agent-warmup SFT dataset
+  (`python -m tools.agent_warmup.cli`)
+- `start_local_llm.sh` — example local SGLang launch command
 ## Project layout
 ```
 benchmaker/          # library code
-entrypoints/         # CLI (bench-maker)
-examples/            # runnable examples
-tools/               # one-off helper scripts (dataset prep, etc.)
+  __init__.py        #   public API (re-exports); cli.py — the `benchmaker` CLI
+  config.py  env.py  #   YAML config loading + .env interpolation
+  core/              #   engine: types, load models, runner, metrics, monitors, trace
+  io/                #   run output: per-run bundle + cross-run collection
+  workloads/         #   workload-types (http, llm, sandbox, agent, hf, eval)
+  recipes/           #   CLI recipes (http, llm, sandbox, swebench, swebench-replay) + registry
+  swebench/          #   SWE-bench coding agent + grading + harbor adapters
+examples/            # runnable examples (incl. swebench/ coding-agent config)
+tools/               # out-of-tree tooling: sharegpt/, swe_images/, agent_warmup/
 tests/               # pytest smoke tests
 docs/                # reference docs
 ```

benchmaker-0.1.0/PKG-INFO → benchmaker-0.1.2/README.md RENAMED Viewed

@@ -1,23 +1,4 @@
-Metadata-Version: 2.4
-Name: benchmaker
-Version: 0.1.0
-Summary: Async HTTP benchmarking utility with pluggable workloads and load models.
-Author: Xiaozhe Yao
-License: MIT
-Requires-Python: >=3.10
-Description-Content-Type: text/markdown
-Requires-Dist: aiohttp>=3.9
-Requires-Dist: click>=8.1
-Requires-Dist: pyyaml>=6.0
-Provides-Extra: rich
-Requires-Dist: rich>=13; extra == "rich"
-Provides-Extra: hf
-Requires-Dist: datasets>=2.18; extra == "hf"
-Provides-Extra: dev
-Requires-Dist: pytest>=7; extra == "dev"
-Requires-Dist: pytest-asyncio>=0.23; extra == "dev"
-# bench-maker
+# benchmaker
 Async HTTP benchmarking with pluggable workload-types (protocols), workloads
 (datasets), load models, hooks, and optional periodic monitors.
@@ -44,7 +25,7 @@ pip install -e .
 pip install -e .[dev]   # for tests
 ```
-This installs the `benchmaker` Python package and the `bench-maker` CLI.
+This installs the `benchmaker` Python package and the `benchmaker` CLI.
 ## 30-second tour
@@ -63,10 +44,12 @@ async def main():
 asyncio.run(main())
 ```
-Or via the CLI:
+Or via the CLI. Workload-specific benchmarks are exposed as **recipes** —
+`benchmaker <recipe> --args` (`http`, `llm`, `sandbox`, `swebench`, `sglang`,
+`trajectory-replay`):
 ```bash
-bench-maker quick --url https://httpbin.org/get --rate poisson:50 --duration 10s
+benchmaker http --url https://httpbin.org/get --rate poisson:50 --duration 10s
 ```
 ## Walkthrough: benchmarking an LLM endpoint with ShareGPT
@@ -129,16 +112,16 @@ also come from `.env` via `OpenAIChatWorkloadType.from_env(...)`.
 ### Rebuild or customize it yourself
-The published split is produced by `tools/prepare_sharegpt.py`, which downloads
+The published split is produced by `tools/sharegpt/prepare.py`, which downloads
 the upstream JSON once into `.local/` (gitignored) and converts it to the JSONL
 shape above. Run it when you want a subset, different filtering, or a refresh:
 ```bash
 # Defaults: .local/sharegpt_v3_raw.json  ->  .local/sharegpt_v3.jsonl
-python tools/prepare_sharegpt.py
+python tools/sharegpt/prepare.py
 # A quick subset for smoke tests:
-python tools/prepare_sharegpt.py --max-items 2000
+python tools/sharegpt/prepare.py --max-items 2000
 ```
 The raw download is ~700 MB. Use `--min-chars` / `--max-chars` to drop empty or
@@ -147,7 +130,7 @@ row). Point any workload at the local file with `JsonlWorkload(path=...,
 field="messages")`, or on the CLI:
 ```bash
-bench-maker llm \
+benchmaker llm \
     --url   http://localhost:8000/v1/chat/completions \
     --model meta-llama/Llama-3.1-8B-Instruct \
     --prompts-jsonl .local/sharegpt_v3.jsonl \
@@ -157,7 +140,7 @@ bench-maker llm \
     --out-dir ./runs --label dataset=sharegpt
 ```
-To re-publish after regenerating, `tools/upload_sharegpt_hf.py` pushes the
+To re-publish after regenerating, `tools/sharegpt/upload_hf.py` pushes the
 JSONL back to the Hub (needs a write token).
 ## Documentation
@@ -174,6 +157,41 @@ Full docs live in [`docs/`](docs/):
 - [Correctness / accuracy eval](docs/eval.md) — grade responses against references
 - [CLI & YAML reference](docs/cli-and-yaml.md)
 - [ShareGPT benchmark](docs/sharegpt-benchmark.md) — self-contained end-to-end walkthrough
+- `benchmaker sglang` — native SGLang `/generate` benchmark (see [`docs/sglang.md`](docs/sglang.md)).
+- `benchmaker trajectory-replay` — multi-turn prefix-cache parity replay of
+  trajectory datasets like SWE-smith (see [`docs/trajectory-replay.md`](docs/trajectory-replay.md)).
+## Deterministic replay (`swebench-replay`)
+Re-run a recorded SWE-bench job with the LLM **mocked from its own logs** — the
+real pi + sandbox + verifier pipeline still runs, only the model is served back
+from recorded outputs, so re-runs are deterministic and free of model
+cost/variance. Vary `--concurrency` (or `--sweep`) to study the rest of the
+pipeline without the model's stochasticity as a confound. Still needs
+`FLASH_SANDBOX_URL` (the sandbox + verifier are real).
+```bash
+# 1) (optional) convert a job's pi logs to a replay store — the recipe can also
+#    do this inline via --job.
+python -m benchmaker.swebench.trajectory jobs/2026-06-08__05-24-01_b352cb \
+    -o replay-trajectories.jsonl
+# 2) replay (host mode, localhost) across a concurrency sweep
+FLASH_SANDBOX_URL=http://localhost:8080 \
+  benchmaker swebench-replay --trajectories replay-trajectories.jsonl \
+    --mode pi-host --sweep 1,5,25
+# container mode: bind 0.0.0.0 and tell the sandbox how to reach the server
+FLASH_SANDBOX_URL=http://localhost:8080 \
+  benchmaker swebench-replay --job jobs/2026-06-08__05-24-01_b352cb \
+    --mode pi-container --host 0.0.0.0 --reachable-host "$(hostname -I | awk '{print $1}')"
+```
+The replay server is stateless: it picks each response by the task's identity
+(the `# Task:` line, falling back to a hash of the full prompt when the recorded
+run lacked an instance id) plus the count of assistant messages already in the
+request — so it is correct at any concurrency. A `MISSES` column in the summary
+flags any divergence (a request beyond the recorded turns).
 ## Examples
@@ -190,19 +208,29 @@ Under [`examples/`](examples/):
 - `config.yaml`           — generic HTTP YAML config
 - `config_llm.yaml`       — LLM YAML config with a Prometheus monitor
-Helper scripts under [`tools/`](tools/):
+Helper tooling under [`tools/`](tools/), grouped by purpose:
-- `prepare_sharegpt.py`   — fetch ShareGPT V3 and convert to a generic JSONL
-- `upload_sharegpt_hf.py` — push the converted JSONL to the HF Hub (write token)
-- `start_local_llm.sh`    — example local SGLang launch command
+- `sharegpt/`     — `prepare.py` (fetch ShareGPT V3 → JSONL) + `upload_hf.py`
+  (push to the HF Hub with a write token)
+- `swe_images/`   — mirror SWE-bench/R2E-Gym container images to ghcr
+  (`publish.py`) and list the published refs (`pull.py`)
+- `agent_warmup/` — build the agent-warmup SFT dataset
+  (`python -m tools.agent_warmup.cli`)
+- `start_local_llm.sh` — example local SGLang launch command
 ## Project layout
 ```
 benchmaker/          # library code
-entrypoints/         # CLI (bench-maker)
-examples/            # runnable examples
-tools/               # one-off helper scripts (dataset prep, etc.)
+  __init__.py        #   public API (re-exports); cli.py — the `benchmaker` CLI
+  config.py  env.py  #   YAML config loading + .env interpolation
+  core/              #   engine: types, load models, runner, metrics, monitors, trace
+  io/                #   run output: per-run bundle + cross-run collection
+  workloads/         #   workload-types (http, llm, sandbox, agent, hf, eval)
+  recipes/           #   CLI recipes (http, llm, sandbox, swebench, swebench-replay) + registry
+  swebench/          #   SWE-bench coding agent + grading + harbor adapters
+examples/            # runnable examples (incl. swebench/ coding-agent config)
+tools/               # out-of-tree tooling: sharegpt/, swe_images/, agent_warmup/
 tests/               # pytest smoke tests
 docs/                # reference docs
 ```

{benchmaker-0.1.0 → benchmaker-0.1.2}/benchmaker/__init__.py RENAMED Viewed

@@ -1,6 +1,6 @@
 """benchmaker: async HTTP benchmarking with pluggable workload-types + workloads (datasets)."""
-from benchmaker.types import (
+from benchmaker.core.types import (
     Request,
     Response,
     Sample,
@@ -19,6 +19,8 @@ from benchmaker.workloads.http import HttpWorkloadType
 from benchmaker.workloads.llm import OpenAIChatWorkloadType
 from benchmaker.workloads.sandbox import SandboxWorkloadType
 from benchmaker.workloads.hf import HFDatasetWorkload
+from benchmaker.workloads.sglang import SGLangGenerateWorkloadType
+from benchmaker.workloads.trajectory import TrajectoryReplayWorkload
 from benchmaker.workloads.agent import (
     Agent,
     AgentContext,
@@ -41,7 +43,7 @@ from benchmaker.workloads.eval import (
     judge_llm,
     openai_chat_judge,
 )
-from benchmaker.load import (
+from benchmaker.core.load import (
     LoadModel,
     ConstantRPS,
     PoissonRPS,
@@ -51,21 +53,21 @@ from benchmaker.load import (
     parse_rate_spec,
 )
 from benchmaker.env import interpolate, load_dotenv
-from benchmaker.monitors import (
+from benchmaker.core.monitors import (
     Monitor,
     FunctionMonitor,
     PrometheusMonitor,
     parse_prometheus,
 )
-from benchmaker.runner import BenchRunner, BenchConfig, BenchResult
-from benchmaker.trace import (
+from benchmaker.core.runner import BenchRunner, BenchConfig, BenchResult
+from benchmaker.core.trace import (
     ReplayWorkloadType,
     TracePacedLoad,
     TraceRecorder,
     TraceWorkload,
     load_trace,
 )
-from benchmaker.bundle import (
+from benchmaker.io.bundle import (
     BUNDLE_VERSION,
     RunMeta,
     default_run_id,
@@ -87,6 +89,8 @@ __all__ = [
     "OpenAIChatWorkloadType",
     "SandboxWorkloadType",
     "HFDatasetWorkload",
+    "SGLangGenerateWorkloadType",
+    "TrajectoryReplayWorkload",
     # agent workload (pluggable user-defined agents)
     "Agent",
     "AgentContext",
@@ -149,4 +153,4 @@ __all__ = [
     "write_bundle",
 ]
-__version__ = "0.1.0"
+__version__ = "0.1.1"

benchmaker-0.1.2/benchmaker/cli.py ADDED Viewed

@@ -0,0 +1,215 @@
+"""benchmaker CLI."""
+from __future__ import annotations
+import asyncio
+import json
+import logging
+import sys
+import click
+import yaml
+from benchmaker.config import build_config
+from benchmaker.core.runner import BenchRunner
+from benchmaker.recipes import all_recipes
+from benchmaker.recipes._cli_shared import (
+    output_options as _output_options,
+    parse_headers as _parse_headers,
+    write_bundle_if_requested as _write_bundle_if_requested,
+)
+from benchmaker.recipes._factory import make_command
+# ---------------------------------------------------------------- main
+@click.group()
+@click.option("--log-level", default="INFO",
+              type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR"], case_sensitive=False),
+              help="Logging level (default: INFO).")
+def main(log_level: str) -> None:
+    """[benchmaker]: async HTTP benchmarking with pluggable workloads."""
+    level = log_level.upper()
+    logging.basicConfig(
+        level=level,
+        format="%(asctime)s [%(name)s] %(message)s",
+        datefmt="%H:%M:%S",
+    )
+    # Chatty third-party loggers (one INFO line per HTTP request / hub fetch)
+    # drown out our own output. Pin them to WARNING unless DEBUG was requested.
+    if level != "DEBUG":
+        for noisy in ("httpx", "httpcore", "urllib3", "huggingface_hub",
+                      "filelock", "fsspec", "datasets", "aiohttp"):
+            logging.getLogger(noisy).setLevel(logging.WARNING)
+@main.command()
+@click.argument("config_path", type=click.Path(exists=True, dir_okay=False))
+@_output_options
+@click.option("--dotenv", type=click.Path(), default=".env",
+              help="Path to .env file to load (default: .env). "
+                   "Use --dotenv '' to disable.")
+@click.option("--record", "record_path", type=click.Path(), default=None,
+              help="Write a JSONL request trace (with relative timestamps) to "
+                   "this path. A later run can replay it deterministically via "
+                   "a 'replay:' config block. Overrides any 'record:' in YAML.")
+@click.option("--replay", "replay_path", type=click.Path(exists=True, dir_okay=False),
+              default=None,
+              help="Replay a previously recorded trace at the same relative "
+                   "timings. Overrides 'workload_type' / 'workload' / 'load' "
+                   "(and any 'replay:' in YAML).")
+@click.option("--replay-speed", type=float, default=None,
+              help="Speed multiplier for --replay (default 1.0).")
+@click.option("--quiet", is_flag=True, help="Suppress progress output.")
+def run(config_path: str, out_dir: str | None, run_id: str | None,
+        labels: tuple[str, ...], notes: str, dotenv: str,
+        record_path: str | None, replay_path: str | None,
+        replay_speed: float | None, quiet: bool) -> None:
+    """Run a benchmark from a YAML config file.
+    Environment variables (loaded from `.env` by default) are interpolated
+    into the YAML using `${VAR}` or `${VAR:-default}` syntax.
+    """
+    with open(config_path) as f:
+        raw_cfg = yaml.safe_load(f)
+    if record_path is not None:
+        raw_cfg = {**raw_cfg, "record": {"path": record_path}}
+    if replay_path is not None:
+        replay_cfg: dict = {"path": replay_path}
+        if replay_speed is not None:
+            replay_cfg["speed"] = replay_speed
+        raw_cfg = {**raw_cfg, "replay": replay_cfg}
+    bench_cfg = build_config(raw_cfg, dotenv_path=(dotenv or None))
+    if quiet:
+        bench_cfg.progress_every_s = 0.0
+    runner = BenchRunner(bench_cfg)
+    asyncio.run(runner.run())
+    runner.metrics.render(sys.stdout)
+    _write_bundle_if_requested(runner, raw_cfg, out_dir, run_id, labels, notes)
+@main.command()
+@click.option("--url", required=True, help="Target URL.")
+@click.option("--method", default="GET")
+@click.option("--header", "-H", multiple=True, help="Header 'Name: value'.")
+@click.option("--json-body", default=None, help="JSON body string.")
+@click.option("--data", default=None, help="Raw body string.")
+@click.option("--rate", default="10", help="Load spec, e.g. '100', 'poisson:100', "
+              "'closed:32', 'ramp:10..500:30s'.")
+@click.option("--duration", default="10s", help="Run duration (e.g. '30s', '2m').")
+@click.option("--max-requests", type=int, default=None)
+@click.option("--timeout", "timeout_s", default=60.0, type=float)
+@click.option("--connection-limit", default=1000, type=int)
+@_output_options
+@click.option("--quiet", is_flag=True)
+def quick(url: str, method: str, header: tuple[str, ...], json_body: str | None,
+          data: str | None, rate: str, duration: str, max_requests: int | None,
+          timeout_s: float, connection_limit: int,
+          out_dir: str | None, run_id: str | None,
+          labels: tuple[str, ...], notes: str, quiet: bool) -> None:
+    """[deprecated] One-liner HTTP benchmark — use `benchmaker http` instead."""
+    sys.stderr.write(
+        "[benchmaker] 'quick' is deprecated; use 'benchmaker http'.\n"
+    )
+    cfg: dict = {
+        "workload_type": {
+            "type": "http",
+            "url": url,
+            "method": method,
+            "headers": _parse_headers(header),
+            "timeout_s": timeout_s,
+        },
+        "load": rate,
+        "duration": duration,
+        "max_requests": max_requests,
+        "timeout_s": timeout_s,
+        "connection_limit": connection_limit,
+    }
+    if json_body is not None:
+        cfg["workload"] = {"type": "static", "items": [json.loads(json_body)]}
+    elif data is not None:
+        cfg["workload"] = {"type": "static", "items": [data.encode("utf-8")]}
+    bench_cfg = build_config(cfg)
+    if quiet:
+        bench_cfg.progress_every_s = 0.0
+    runner = BenchRunner(bench_cfg)
+    asyncio.run(runner.run())
+    runner.metrics.render(sys.stdout)
+    _write_bundle_if_requested(runner, cfg, out_dir, run_id, labels, notes)
+# ---------------------------------------------------------------- collect
+@main.command()
+@click.argument("paths", nargs=-1, required=True,
+                type=click.Path(exists=True, file_okay=False))
+@click.option("--format", "fmt", type=click.Choice(["md", "csv", "json"]),
+              default="md", show_default=True,
+              help="Output format. 'md' is a Markdown table, 'csv' is comma-separated, "
+                   "'json' is a JSON array of row dicts.")
+@click.option("--metric", "metrics", multiple=True,
+              help="Extra dotted-path metric to add as a column "
+                   "(e.g. 'workload_metrics.ttft_s.p50'). Repeatable.")
+@click.option("--columns", default=None,
+              help="Comma-separated list of column names to keep (after metrics are added). "
+                   "Overrides the default column set.")
+@click.option("--sort-by", default=None,
+              help="Column name to sort rows by (ascending).")
+@click.option("--label", "label_keys", multiple=True,
+              help="Promote a meta.labels[<key>] entry into its own column. Repeatable.")
+@click.option("--recursive/--no-recursive", default=True,
+              help="When a path is a directory of run-dirs, descend one level to find them.")
+def collect(paths: tuple[str, ...], fmt: str, metrics: tuple[str, ...],
+            columns: str | None, sort_by: str | None,
+            label_keys: tuple[str, ...], recursive: bool) -> None:
+    """Collect summaries from one or more run-dirs into a table.
+    Each PATH may be a run directory (containing meta.json + summary.json) or a
+    directory of such run-dirs. With --recursive (default), a non-bundle
+    directory is scanned for immediate subdirectories that are bundles.
+    """
+    from benchmaker.io.collect import collect_table, format_table, find_bundles
+    bundle_dirs: list[str] = []
+    for p in paths:
+        bundle_dirs.extend(find_bundles(p, recursive=recursive))
+    if not bundle_dirs:
+        raise click.UsageError(
+            f"No run bundles found under: {', '.join(paths)}. "
+            "Run bundles must contain meta.json and summary.json."
+        )
+    rows, column_names = collect_table(
+        bundle_dirs,
+        extra_metrics=list(metrics),
+        label_keys=list(label_keys),
+    )
+    if columns:
+        column_names = [c.strip() for c in columns.split(",") if c.strip()]
+    if sort_by:
+        rows.sort(key=lambda r: (r.get(sort_by) is None, r.get(sort_by)))
+    sys.stdout.write(format_table(rows, column_names, fmt))
+    if fmt != "json":
+        sys.stdout.write("\n")
+# ---------------------------------------------------------------- recipes
+#
+# Each registered recipe (http, llm, sandbox, swebench, ...) is exposed as a
+# `benchmaker <recipe> --args` subcommand, built from the recipe's options plus
+# the shared load/output options. See benchmaker/recipes/.
+for _recipe in all_recipes():
+    main.add_command(make_command(_recipe))
+if __name__ == "__main__":
+    main()

{benchmaker-0.1.0 → benchmaker-0.1.2}/benchmaker/config.py RENAMED Viewed

@@ -20,9 +20,9 @@ import importlib
 from typing import Any, Callable, Optional
 from benchmaker.env import interpolate, load_dotenv
-from benchmaker.load import parse_duration, parse_rate_spec
-from benchmaker.monitors import FunctionMonitor, Monitor, PrometheusMonitor
-from benchmaker.runner import BenchConfig
+from benchmaker.core.load import parse_duration, parse_rate_spec
+from benchmaker.core.monitors import FunctionMonitor, Monitor, PrometheusMonitor
+from benchmaker.core.runner import BenchConfig
 from benchmaker.workloads.base import WorkloadType
 from benchmaker.workloads.datasets import (
     CallableWorkload,
@@ -47,7 +47,7 @@ from benchmaker.workloads.eval import (
     openai_chat_judge,
     regex_match,
 )
-from benchmaker.trace import (
+from benchmaker.core.trace import (
     ReplayWorkloadType,
     TracePacedLoad,
     TraceRecorder,
@@ -88,6 +88,9 @@ def build_workload_type(spec: dict) -> WorkloadType:
         return HttpWorkloadType(**kwargs)
     if t in ("openai", "openai-chat", "llm-chat", "llm"):
         return OpenAIChatWorkloadType(**kwargs)
+    if t in ("sglang", "sglang-generate"):
+        from benchmaker.workloads.sglang import SGLangGenerateWorkloadType
+        return SGLangGenerateWorkloadType(**kwargs)
     if t in ("sandbox", "flash-sandbox"):
         return SandboxWorkloadType(**kwargs)
     if t == "agent":
@@ -151,6 +154,9 @@ def build_workload(spec: Any) -> Workload:
         return CallableWorkload(fn=fn, **kwargs)
     if t in ("hf", "huggingface"):
         return HFDatasetWorkload(**kwargs)
+    if t == "trajectory":
+        from benchmaker.workloads.trajectory import TrajectoryReplayWorkload
+        return TrajectoryReplayWorkload(**kwargs)
     raise ValueError(f"Unknown workload type {t!r}")

benchmaker-0.1.2/benchmaker/core/__init__.py ADDED Viewed

@@ -0,0 +1,2 @@
+"""benchmaker engine internals: data types, load models, the run loop,
+metrics aggregation, periodic monitors, and trace record/replay."""

{benchmaker-0.1.0/benchmaker → benchmaker-0.1.2/benchmaker/core}/metrics.py RENAMED Viewed

@@ -9,7 +9,7 @@ from collections import Counter, defaultdict
 from dataclasses import dataclass, field
 from typing import Optional, TextIO
-from benchmaker.types import Sample
+from benchmaker.core.types import Sample
 def _pct(xs: list[float], p: float) -> float:

benchmaker 0.1.0__tar.gz → 0.1.2__tar.gz

benchmaker 0.1.0__tar.gz → 0.1.2__tar.gz