pytest-benchmem 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pytest_benchmem/__init__.py +38 -0
- pytest_benchmem/cli.py +104 -0
- pytest_benchmem/compare.py +38 -0
- pytest_benchmem/memray.py +59 -0
- pytest_benchmem/plotting.py +352 -0
- pytest_benchmem/py.typed +0 -0
- pytest_benchmem/pytest_plugin.py +263 -0
- pytest_benchmem/snapshot.py +141 -0
- pytest_benchmem/sweep.py +147 -0
- pytest_benchmem-0.1.0.dist-info/METADATA +152 -0
- pytest_benchmem-0.1.0.dist-info/RECORD +14 -0
- pytest_benchmem-0.1.0.dist-info/WHEEL +4 -0
- pytest_benchmem-0.1.0.dist-info/entry_points.txt +5 -0
- pytest_benchmem-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""pytest-benchmem — the memory companion to pytest-benchmark.
|
|
2
|
+
|
|
3
|
+
pytest-benchmark times your code; pytest-benchmem adds a memray peak-memory pass to the
|
|
4
|
+
same test via the ``benchmark_memory`` fixture, stored in the same run, plus
|
|
5
|
+
dims-aware plots and cross-version sweeps it has no answer for.
|
|
6
|
+
|
|
7
|
+
Light to import: this re-exports only the engine (``measure_peak``) and the
|
|
8
|
+
readers/loader that turn pytest-benchmark JSON into a tidy frame. ``pytest_benchmem.plotting``
|
|
9
|
+
pulls numpy/plotly and ``pytest_benchmem.sweep`` shells out to ``uv`` — import those
|
|
10
|
+
submodules directly when needed. The ``benchmark_memory`` fixture is delivered by
|
|
11
|
+
the pytest plugin (entry point), not by import.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from pytest_benchmem.memray import measure_peak
|
|
17
|
+
from pytest_benchmem.snapshot import (
|
|
18
|
+
DimValue,
|
|
19
|
+
Metric,
|
|
20
|
+
Sample,
|
|
21
|
+
discover_runs,
|
|
22
|
+
from_pytest_benchmark,
|
|
23
|
+
load_long_df,
|
|
24
|
+
load_samples,
|
|
25
|
+
memory_from_pytest_benchmark,
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
__all__ = [
|
|
29
|
+
"DimValue",
|
|
30
|
+
"Metric",
|
|
31
|
+
"Sample",
|
|
32
|
+
"discover_runs",
|
|
33
|
+
"from_pytest_benchmark",
|
|
34
|
+
"load_long_df",
|
|
35
|
+
"load_samples",
|
|
36
|
+
"measure_peak",
|
|
37
|
+
"memory_from_pytest_benchmark",
|
|
38
|
+
]
|
pytest_benchmem/cli.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
"""pytest-benchmem CLI — ``plot`` and ``compare`` over pytest-benchmark JSON runs.
|
|
2
|
+
|
|
3
|
+
Both commands read the JSON pytest-benchmark writes (``.benchmarks/…``) and pick
|
|
4
|
+
a ``--metric`` (``time`` from ``stats``, ``memory`` from ``extra_info.peak_mib``).
|
|
5
|
+
Timing comparison/histograms are pytest-benchmark's own job; these commands are
|
|
6
|
+
the memory-aware, dims-aware views on top.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import importlib.util
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Annotated
|
|
14
|
+
|
|
15
|
+
import typer
|
|
16
|
+
|
|
17
|
+
from pytest_benchmem.snapshot import Metric, discover_runs
|
|
18
|
+
|
|
19
|
+
app = typer.Typer(help="pytest-benchmem — plot and compare benchmark runs.", no_args_is_help=True)
|
|
20
|
+
|
|
21
|
+
MetricOpt = Annotated[Metric, typer.Option(help="Which metric to read: time | memory.")]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _need_plotly() -> None:
    """Abort the command with exit code 2 when ``plotly`` cannot be found.

    Only probes for the package with ``importlib.util.find_spec``; nothing is
    imported here. When the probe fails, an install hint is written to stderr
    in red before ``typer.Exit`` is raised.
    """
    plotly_found = importlib.util.find_spec("plotly") is not None
    if plotly_found:
        return
    hint = "plotting needs extras: pip install 'pytest-benchmem[plot]'"
    typer.secho(hint, fg=typer.colors.RED, err=True)
    raise typer.Exit(code=2)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@app.command()
def plot(
    runs: Annotated[list[Path], typer.Argument(help="pytest-benchmark JSON file(s).")],
    metric: MetricOpt = "time",
    view: Annotated[
        str | None,
        typer.Option(help="compare | scatter | sweep | scaling (default: by count)."),
    ] = None,
    facet: Annotated[str | None, typer.Option(help="Dim to facet by.")] = None,
    x: Annotated[str | None, typer.Option(help="scaling: dim for the x-axis.")] = None,
    clip: Annotated[float | None, typer.Option(help="Clamp the colour scale.")] = None,
    output: Annotated[Path | None, typer.Option("--output", "-o", help="HTML out.")] = None,
    open_browser: Annotated[bool, typer.Option("--open/--no-open")] = False,
) -> None:
    """Render an interactive plotly view from one or more pytest-benchmark runs."""
    # NOTE: the docstring above doubles as the command's --help text; keep it stable.

    # Refuse to continue when any input file is absent, listing whatever
    # discover_runs() can find as a hint.
    absent = [str(path) for path in runs if not path.exists()]
    if absent:
        typer.secho(f"missing: {absent}", fg=typer.colors.RED, err=True)
        candidates = discover_runs()
        if candidates:
            listing = "\n ".join(str(path) for path in candidates)
            typer.echo("available:\n " + listing, err=True)
        raise typer.Exit(code=2)

    # Default view is picked by the number of runs: 1 → scaling, 2 → scatter, 3+ → sweep.
    if view:
        chosen = view
    elif len(runs) == 1:
        chosen = "scaling"
    elif len(runs) == 2:
        chosen = "scatter"
    else:
        chosen = "sweep"

    _need_plotly()
    from pytest_benchmem import plotting

    # Each view takes a slightly different subset of the options, hence the thunks.
    renderers = {
        "compare": lambda: plotting.plot_compare(runs, metric=metric, facet=facet, clip=clip),
        "scatter": lambda: plotting.plot_scatter(runs, metric=metric, facet=facet, clip=clip),
        "sweep": lambda: plotting.plot_sweep(runs, metric=metric, clip=clip),
        "scaling": lambda: plotting.plot_scaling(runs, metric=metric, x=x, facet=facet),
    }
    render = renderers.get(chosen)
    if render is None:
        typer.secho(f"unknown view {chosen!r}", fg=typer.colors.RED, err=True)
        raise typer.Exit(code=2)

    try:
        fig, n = render()
    except ValueError as exc:
        # The plotting layer reports unusable input via ValueError; show the message only.
        typer.secho(str(exc), fg=typer.colors.RED, err=True)
        raise typer.Exit(code=1) from exc

    if output is None:
        output = Path(".benchmarks") / "plots" / f"{chosen}-{metric}.html"
    output.parent.mkdir(parents=True, exist_ok=True)
    fig.write_html(output)
    typer.secho(f"{chosen} ({metric}): {n} ids → {output}", fg=typer.colors.GREEN)

    if open_browser:
        import webbrowser

        webbrowser.open(output.resolve().as_uri())
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
@app.command()
def compare(
    a: Annotated[Path, typer.Argument(help="Baseline run.")],
    b: Annotated[Path, typer.Argument(help="Candidate run.")],
    metric: MetricOpt = "time",
) -> None:
    """Print a per-id delta table for two pytest-benchmark runs."""
    # NOTE: the docstring above doubles as the command's --help text; keep it stable.

    # Report the first absent input (baseline is checked before candidate) and exit 2.
    absent = next((path for path in (a, b) if not path.exists()), None)
    if absent is not None:
        typer.secho(f"missing: {absent}", fg=typer.colors.RED, err=True)
        raise typer.Exit(code=2)

    from pytest_benchmem.compare import compare_runs

    try:
        compare_runs(a, b, metric=metric)
    except ValueError as exc:
        # Show the message only and exit 1 instead of dumping a traceback.
        typer.secho(str(exc), fg=typer.colors.RED, err=True)
        raise typer.Exit(code=1) from exc
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""Text comparison of two pytest-benchmark runs — keyed on the benchmark id."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import sys
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import TextIO
|
|
8
|
+
|
|
9
|
+
from pytest_benchmem.snapshot import Metric, load_long_df
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def compare_runs(
    a: str | Path, b: str | Path, *, metric: Metric = "time", out: TextIO | None = None
) -> None:
    """Print a per-id table of ``a`` vs ``b`` (for ``metric``) with percent change.

    Reads the chosen metric (timing or memory) from two pytest-benchmark files.
    Ids present in only one run show ``—``. The percent change is shown only
    when both values are present and the baseline is strictly positive; a
    candidate value of exactly ``0`` is a real measurement and yields ``-100.0%``.

    Args:
        a: Baseline run file.
        b: Candidate run file.
        metric: Which metric to load (passed through to ``load_long_df``).
        out: Stream to print to; defaults to ``sys.stdout``.

    Raises:
        ValueError: if the loaded frame does not contain two distinct snapshot
            labels, so there is nothing to compare. The CLI turns ``ValueError``
            into a clean error message, which a bare ``IndexError`` would not get.
    """
    out = out or sys.stdout
    df, unit = load_long_df([Path(a), Path(b)], metric=metric)
    labels = df["snapshot"].drop_duplicates().tolist()
    if len(labels) < 2:
        raise ValueError(
            f"compare needs two distinct snapshots with {metric} data; loaded {labels}"
        )
    la, lb = labels[0], labels[1]
    # Both labels come from the "snapshot" column, so both are columns of the pivot.
    wide = df.pivot(index="id", columns="snapshot", values="value")

    print(f"\n{'id':<70} {la:>12} {lb:>12} {'change':>10} ({unit})", file=out)
    print("-" * 108, file=out)
    for test_id in sorted(wide.index):
        va = wide.at[test_id, la]
        vb = wide.at[test_id, lb]
        # NaN (the only value unequal to itself) marks an id missing from that run.
        va = None if va != va else va  # noqa: PLR0124 — NaN check
        vb = None if vb != vb else vb  # noqa: PLR0124 — NaN check
        a_str = f"{va:.4g}" if va is not None else "—"
        b_str = f"{vb:.4g}" if vb is not None else "—"
        # Test presence explicitly: truthiness would discard a genuine 0.0 candidate.
        comparable = va is not None and vb is not None and va > 0
        change = f"{(vb - va) / va * 100:+.1f}%" if comparable else "—"
        short = test_id.split("::")[-1]
        print(f"{short:<70} {a_str:>12} {b_str:>12} {change:>10}", file=out)
    print(file=out)
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""Peak-memory measurement via ``memray`` — the core pytest-benchmem engine.
|
|
2
|
+
|
|
3
|
+
:func:`measure_peak` runs one action under a ``memray.Tracker`` and returns the
|
|
4
|
+
peak in MiB. It's what the ``benchmark_memory`` pytest fixture calls for its
|
|
5
|
+
memory pass, and it doubles as a standalone one-liner for a REPL or notebook::
|
|
6
|
+
|
|
7
|
+
from pytest_benchmem import measure_peak
|
|
8
|
+
|
|
9
|
+
peak = measure_peak(lambda: build_model(1000)) # MiB
|
|
10
|
+
|
|
11
|
+
Why memray and not RSS sampling: peak RSS (what ASV's ``peakmem`` and naive
|
|
12
|
+
samplers use) misses the numpy/C-allocation detail that's usually the point of
|
|
13
|
+
profiling a numeric library. memray tracks the allocator directly.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import gc
|
|
19
|
+
import platform
|
|
20
|
+
import tempfile
|
|
21
|
+
from collections.abc import Callable
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
|
|
24
|
+
#: A zero-arg callable doing the work that gets memory-tracked.
|
|
25
|
+
Action = Callable[[], object]
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _require_memray() -> None:
|
|
29
|
+
"""Raise if memory measurement isn't supported — memray runs only on Linux/macOS."""
|
|
30
|
+
if platform.system() not in ("Linux", "Darwin"):
|
|
31
|
+
raise RuntimeError(
|
|
32
|
+
"memory measurement requires memray, which supports only Linux and macOS "
|
|
33
|
+
f"(this is {platform.system()}). Timing still works; run memory benchmarks "
|
|
34
|
+
"on Linux or macOS."
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def measure_peak(action: Action, repeats: int = 1) -> float:
    """Return the peak memory of ``action()`` in MiB, as recorded by memray.

    The action runs ``repeats`` times (at least once), each time under its own
    ``memray.Tracker`` writing to a fresh temporary file, and the smallest of
    the per-run peaks is returned: peak memory is noisy (GC timing, lazy
    imports, page cache), so min-of-N is the cleanest "floor this can hit".
    Each peak is rounded to three decimals.
    """
    _require_memray()

    import memray

    def _single_pass() -> float:
        # One tracked run in a throw-away directory; returns that run's peak in MiB.
        with tempfile.TemporaryDirectory(prefix="pytest-benchmem-") as scratch:
            capture = Path(scratch) / "track.bin"  # memray must create the file itself
            with memray.Tracker(capture):
                action()
            peak_bytes = memray.FileReader(capture).metadata.peak_memory
            return round(peak_bytes / (1024**2), 3)

    rounds = repeats if repeats > 1 else 1
    peaks: list[float] = []
    for _ in range(rounds):
        peaks.append(_single_pass())
        gc.collect()  # collect garbage between passes

    return min(peaks)
|
|
@@ -0,0 +1,352 @@
|
|
|
1
|
+
"""Interactive plotly views over pytest-benchmark runs — dims-driven.
|
|
2
|
+
|
|
3
|
+
Each view reads one ``metric`` (``"time"`` or ``"memory"``) out of the
|
|
4
|
+
pytest-benchmark JSON files and returns ``(figure, n_rendered)``:
|
|
5
|
+
|
|
6
|
+
- :func:`plot_compare` (2 runs) — bar chart of per-id delta.
|
|
7
|
+
- :func:`plot_scatter` (2+) — baseline cost vs ratio; top-right = the regressed one.
|
|
8
|
+
- :func:`plot_sweep` (3+) — heatmap of per-id fold-change (log2 ratio) vs the first.
|
|
9
|
+
- :func:`plot_scaling` (1) — cost vs a numeric dim, faceted/coloured by others.
|
|
10
|
+
|
|
11
|
+
The sweep/scatter/compare views key on the opaque ``id``; only ``scaling`` reads
|
|
12
|
+
``dims`` — and which dim is x / colour / facet is auto-inferred from dtype
|
|
13
|
+
(numeric → x) but overridable. plotly is imported lazily.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
from typing import TYPE_CHECKING, Literal
|
|
20
|
+
|
|
21
|
+
import numpy as np
|
|
22
|
+
|
|
23
|
+
from pytest_benchmem.snapshot import Metric, load_long_df
|
|
24
|
+
|
|
25
|
+
if TYPE_CHECKING:
|
|
26
|
+
import numpy.typing as npt
|
|
27
|
+
import pandas as pd
|
|
28
|
+
from plotly.graph_objects import Figure
|
|
29
|
+
|
|
30
|
+
SortMode = Literal["absolute", "relative"]
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _value_label(unit: str) -> str:
|
|
34
|
+
return {"s": "time", "MiB": "peak"}.get(unit, "value")
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _diverging_kwargs(midpoint: float = 0.0) -> dict[str, object]:
|
|
38
|
+
"""green→white→red continuous colour scale centred on ``midpoint``."""
|
|
39
|
+
return {
|
|
40
|
+
"color_continuous_scale": ["green", "white", "red"],
|
|
41
|
+
"color_continuous_midpoint": midpoint,
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _symmetric_clip(
|
|
46
|
+
magnitudes: npt.NDArray[np.float64], override: float | None, pct: float = 95.0
|
|
47
|
+
) -> float:
|
|
48
|
+
"""Symmetric colour bound: ``override`` if given, else the ``pct`` percentile
|
|
49
|
+
of ``|magnitudes|`` so outliers don't wash the rest to the midpoint.
|
|
50
|
+
"""
|
|
51
|
+
if override is not None:
|
|
52
|
+
return float(override)
|
|
53
|
+
mags = np.abs(np.asarray(magnitudes, dtype=float))
|
|
54
|
+
mags = mags[np.isfinite(mags)]
|
|
55
|
+
if mags.size == 0:
|
|
56
|
+
return 1.0
|
|
57
|
+
bound = float(np.percentile(mags, pct))
|
|
58
|
+
return bound if bound > 0 else (float(mags.max()) or 1e-9)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _fold_label(fold: float) -> str:
|
|
62
|
+
"""Format a fold-change for a colourbar tick: ``2×``, ``1/4×``, ``1.5×``."""
|
|
63
|
+
if abs(fold - 1.0) < 1e-9:
|
|
64
|
+
return "1×"
|
|
65
|
+
return f"{fold:.3g}×" if fold > 1.0 else f"1/{1.0 / fold:.3g}×"
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _fold_ticks(bound: float) -> tuple[list[float], list[str]]:
    """Build colourbar ticks (log2 positions and fold labels) mirrored around ``1×``.

    For ``bound >= 1`` the folds are the powers of two up to ``2**int(bound)``.
    For a tighter range they are whichever of 1.1/1.25/1.5 fit inside the
    bound, falling back to ``2**(bound/2)`` and ``2**bound`` when fewer than
    two fit. Every fold ``f`` contributes a tick at ``+log2(f)`` and ``-log2(f)``.
    """
    if bound >= 1.0:
        folds = [2.0**exponent for exponent in range(1, int(bound) + 1)]
    else:
        folds = [c for c in (1.1, 1.25, 1.5) if float(np.log2(c)) <= bound + 1e-9]
        if len(folds) < 2:
            folds = [2.0 ** (bound / 2), 2.0**bound]

    ascending = sorted(folds)
    logs = [float(np.log2(f)) for f in ascending]

    # Mirror around the centre tick: largest shrink first, largest growth last.
    vals = [-lv for lv in reversed(logs)] + [0.0] + logs
    text = (
        [_fold_label(1.0 / f) for f in reversed(ascending)]
        + ["1×"]
        + [_fold_label(f) for f in ascending]
    )
    return vals, text
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _axis_kwargs(unit: str) -> dict[str, object]:
|
|
87
|
+
if unit == "s":
|
|
88
|
+
return {"tickformat": ".2s", "ticksuffix": "s"}
|
|
89
|
+
return {"ticksuffix": f" {unit}"}
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _dim_columns(df: pd.DataFrame) -> list[str]:
|
|
93
|
+
"""The dim columns (everything but the fixed snapshot/id/value)."""
|
|
94
|
+
return [c for c in df.columns if c not in ("snapshot", "id", "value")]
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def plot_compare(
    snapshots: list[Path],
    *,
    metric: Metric = "time",
    sort: SortMode = "absolute",
    facet: str | None = None,
    clip: float | None = None,
) -> tuple[Figure, int]:
    """Bar chart of per-id delta, sorted by the chosen Δ (biggest regressions on top).

    Only the first two ``snapshots`` are read: the first is the baseline, the
    second the candidate.

    ``sort`` picks the bar dimension: ``absolute`` plots ``b - a`` in the native
    unit, ``relative`` plots percent change. ``facet`` splits into subplots by any
    dim (ignored when it is not a column). ``clip`` clamps the colour scale
    (default symmetric p95).

    Returns:
        ``(figure, n)`` where ``n`` is the number of ids present in both runs.

    Raises:
        ValueError: fewer than two snapshots were given, the loaded data holds
            fewer than two distinct snapshot labels, or the two runs share no ids.
    """
    # Validate before any heavy import, mirroring plot_scatter: the CLI reports
    # ValueError cleanly, whereas the IndexError from labels[1] was a traceback.
    if len(snapshots) < 2:
        raise ValueError("compare needs 2 snapshots (baseline + candidate)")

    import sys

    import plotly.express as px

    df_long, unit = load_long_df(snapshots[:2], metric=metric)
    vlabel = _value_label(unit)
    labels = df_long["snapshot"].drop_duplicates().tolist()
    if len(labels) < 2:
        raise ValueError(
            f"compare needs two distinct snapshots with {metric} data; loaded {labels}"
        )
    a_label, b_label = labels[0], labels[1]

    # One row per (id, dims...) with one value column per snapshot.
    dims = _dim_columns(df_long)
    wide = (
        df_long.pivot(index=["id", *dims], columns="snapshot", values="value")
        .reset_index()
        .rename_axis(columns=None)
    )
    only_a = wide[wide[a_label].notna() & wide[b_label].isna()]
    only_b = wide[wide[a_label].isna() & wide[b_label].notna()]
    df = wide.dropna(subset=[a_label, b_label]).copy()
    if df.empty:
        raise ValueError("no ids in common between the two snapshots")
    if len(only_a) or len(only_b):
        # Ids missing from one side are dropped from the chart; say so on stderr.
        print(
            f"compare: {len(only_a)} only in {a_label}, {len(only_b)} only in "
            f"{b_label} (intersection: {len(df)}).",
            file=sys.stderr,
        )

    df["delta_abs"] = df[b_label] - df[a_label]
    df["delta_pct"] = (df["delta_abs"] / df[a_label]) * 100.0
    x_col = "delta_abs" if sort == "absolute" else "delta_pct"
    df = df.sort_values(x_col).reset_index(drop=True)

    x_label = f"{vlabel} delta ({unit})" if sort == "absolute" else f"{vlabel} delta %"
    text_fmt = (".2s" if unit == "s" else ".2f") if sort == "absolute" else ".1f"
    direction = "slower" if unit == "s" else "more memory"
    title = f"{vlabel} delta ({sort}): {a_label} → {b_label} (positive = {direction})"

    facet_kwargs: dict[str, object] = {}
    if facet is not None and facet in df.columns:
        facet_kwargs = {"facet_col": facet, "facet_col_wrap": 3}

    # _symmetric_clip ignores non-finite deltas (e.g. percent change off a zero baseline).
    color_clip = _symmetric_clip(df[x_col].to_numpy(), clip)
    fig = px.bar(
        df,
        x=x_col,
        y="id",
        orientation="h",
        color=x_col,
        **_diverging_kwargs(),
        range_color=[-color_clip, color_clip],
        title=title,
        labels={x_col: x_label, "id": ""},
        text_auto=text_fmt,
        **facet_kwargs,
    )
    if sort == "absolute":
        fig.update_xaxes(**_axis_kwargs(unit))
    fig.update_traces(textposition="outside", cliponaxis=False)
    fig.update_layout(height=max(500, len(df) * 22), showlegend=False)
    return fig, len(df)
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def plot_scatter(
    snapshots: list[Path],
    *,
    metric: Metric = "time",
    facet: str | None = None,
    clip: float | None = None,
) -> tuple[Figure, int]:
    """Scatter of baseline cost (log-x) against candidate/baseline ratio (log-y).

    The first snapshot is the baseline. Every loaded row — the baseline's own
    rows included — is joined to its baseline value by ``id``; rows whose
    baseline or ratio is not positive are dropped. With three or more
    snapshots the figure animates over the snapshot label. Colour encodes the
    absolute Δ, clamped by ``clip`` (default symmetric p95); ``facet`` splits
    by a dim when that column exists. Top-right is the regressed corner.

    Returns ``(figure, number of distinct ids plotted)``; raises ``ValueError``
    for fewer than two snapshots or when nothing can be matched to the baseline.
    """
    import plotly.express as px

    if len(snapshots) < 2:
        raise ValueError("scatter needs at least 2 snapshots (baseline + 1)")

    long_df, unit = load_long_df(snapshots, metric=metric)
    noun = _value_label(unit)
    versions = long_df["snapshot"].drop_duplicates().tolist()
    baseline_label = versions[0]

    # Attach each row's baseline value, keyed on the benchmark id.
    from_baseline = long_df["snapshot"] == baseline_label
    baseline_values = long_df.loc[from_baseline, ["id", "value"]].rename(
        columns={"value": "baseline"}
    )
    points = long_df.merge(baseline_values, on="id", how="inner")
    points = points[points["baseline"] > 0].copy()
    if points.empty:
        raise ValueError(f"no ids shared with baseline ({baseline_label})")

    points = points.rename(columns={"snapshot": "version", "value": "candidate"})
    points["ratio"] = points["candidate"] / points["baseline"]
    points["delta_abs"] = points["candidate"] - points["baseline"]
    points = points[points["ratio"] > 0]  # a log axis cannot show non-positive ratios

    # Symmetric (in log space) y-range with a little headroom, never tighter than 1.1×.
    ratios = points["ratio"]
    bound = max(ratios.max(), 1.0 / ratios.min(), 1.1) ** 1.05
    color_clip = _symmetric_clip(points["delta_abs"].to_numpy(), clip)

    options: dict[str, object] = {
        "x": "baseline",
        "y": "ratio",
        "color": "delta_abs",
        **_diverging_kwargs(),
        "range_color": [-color_clip, color_clip],
        "log_x": True,
        "log_y": True,
        "range_y": [1.0 / bound, bound],
        "hover_name": "id",
        "title": f"{noun} scatter vs baseline ({baseline_label}) — top-right = regressed",
        "labels": {
            "baseline": f"baseline {noun} ({unit}, log)",
            "ratio": "ratio (candidate / baseline, log)",
        },
    }
    if len(snapshots) >= 3:
        options["animation_frame"] = "version"
        options["category_orders"] = {"version": versions}
    if facet is not None and facet in points.columns:
        options["facet_col"] = facet
        options["facet_col_wrap"] = 3

    fig = px.scatter(points, **options)
    fig.add_hline(y=1.0, line_dash="dash", line_color="grey")
    fig.update_traces(marker={"size": 8, "line": {"width": 0.5, "color": "DarkSlateGrey"}})
    fig.update_layout(height=600)
    return fig, int(points["id"].nunique())
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def plot_sweep(
    snapshots: list[Path], *, metric: Metric = "time", clip: float | None = None
) -> tuple[Figure, int]:
    """Heatmap of per-id fold-change (log2 ratio) against the first snapshot.

    Rows are ids that have a baseline value, columns are snapshots in load
    order. Cells are coloured by ``log2(value / baseline)`` and annotated with
    the plain ratio. ``clip`` is a fold (not a log) bound for the colour
    scale; when unset, the symmetric p95 of the log ratios is used.

    Returns ``(figure, number of rows)``; raises ``ValueError`` when no id has
    a baseline value.
    """
    import plotly.express as px

    long_df, unit = load_long_df(snapshots, metric=metric)
    noun = _value_label(unit)
    versions = long_df["snapshot"].drop_duplicates().tolist()
    baseline = versions[0]

    table = long_df.pivot(index="id", columns="snapshot", values="value")
    table = table.reindex(columns=versions).dropna(subset=[baseline])
    if table.empty:
        raise ValueError(f"no overlap with baseline snapshot {baseline}")

    fold = table.div(table[baseline], axis=0)
    log_fold = np.log2(fold.where(fold > 0))  # non-positive ratios become NaN cells
    for frame in (table, fold, log_fold):
        frame.index.name = "id"

    log_clip = float(np.log2(clip)) if clip else None
    bound = _symmetric_clip(log_fold.values, log_clip)
    fig = px.imshow(
        log_fold,
        **_diverging_kwargs(),
        zmin=-bound,
        zmax=bound,
        aspect="auto",
        title=f"{noun} fold-change vs baseline ({baseline})",
        labels={"x": "snapshot", "y": "id", "color": "fold"},
    )

    # Colour is in log2 space; relabel the colourbar in plain folds.
    tickvals, ticktext = _fold_ticks(bound)
    fig.update_coloraxes(colorbar={"tickvals": tickvals, "ticktext": ticktext, "title": "fold"})

    hover = (
        "id: %{y}<br>snapshot: %{x}<br>fold: %{text}×<br>"
        f"{noun}: %{{customdata:.4g}}{unit}<extra></extra>"
    )
    fig.update_traces(
        text=fold.round(2).values,
        texttemplate="%{text}×",
        customdata=table.values,
        hovertemplate=hover,
    )
    n_rows = len(log_fold)
    fig.update_layout(height=max(500, n_rows * 22))
    return fig, n_rows
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def _infer_roles(
    df: pd.DataFrame, x: str | None, color: str | None, facet: str | None
) -> tuple[str, str | None, str | None]:
    """Pick (x, color, facet) dim columns, auto-filling unset ones.

    x defaults to the lone *numeric* dim; color/facet to leftover dims, in
    column order. A dim counts as numeric when every non-null value is an
    ``int`` or ``float`` — but not a ``bool``: True/False dims are categories
    and must not compete with the real numeric dim for the x-axis.

    Args:
        df: Long frame whose non-fixed columns are the dims.
        x: Explicit x dim, or ``None`` to infer it.
        color: Explicit colour dim, or ``None`` to take the first leftover dim.
        facet: Explicit facet dim, or ``None`` to take the next leftover dim.

    Raises:
        ValueError: ``x`` is unset and there is not exactly one numeric dim.
    """

    def _is_number(value: object) -> bool:
        # bool is a subclass of int, so it has to be excluded explicitly.
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    dims = _dim_columns(df)
    numeric = [d for d in dims if df[d].dropna().map(_is_number).all()]
    if x is None:
        if len(numeric) != 1:
            raise ValueError(
                f"scaling needs exactly one numeric dim for x (found {numeric}); pass x= explicitly"
            )
        x = numeric[0]
    leftover = [d for d in dims if d != x]
    if color is None:
        color = leftover[0] if leftover else None
    if facet is None:
        facet = next((d for d in leftover if d != color), None)
    return x, color, facet
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
def plot_scaling(
    snapshots: list[Path],
    *,
    metric: Metric = "time",
    x: str | None = None,
    color: str | None = None,
    facet: str | None = None,
    log: bool | Literal["auto"] = "auto",
) -> tuple[Figure, int]:
    """Line plot of cost against one dim, coloured and faceted by other dims.

    Only the first snapshot is read. Unset ``x``/``color``/``facet`` are filled
    in by ``_infer_roles``. Rows without an x value are dropped. With
    ``log="auto"`` both axes go logarithmic when every x is strictly positive;
    otherwise ``log`` is taken as a plain boolean.

    Returns ``(figure, number of plotted rows)``; raises ``ValueError`` when no
    row has the x dim set.
    """
    import plotly.express as px

    long_df, unit = load_long_df(snapshots[:1], metric=metric)
    noun = _value_label(unit)
    x, color, facet = _infer_roles(long_df, x, color, facet)

    # Sort by facet, then colour, then x so each line is drawn left to right.
    sort_keys = [column for column in (facet, color, x) if column]
    df = long_df.dropna(subset=[x]).sort_values(sort_keys)
    if df.empty:
        raise ValueError("no rows with the x dim set")

    if log == "auto":
        use_log = bool((df[x] > 0).all())
    else:
        use_log = bool(log)

    fig = px.line(
        df,
        x=x,
        y="value",
        color=color,
        facet_col=facet,
        facet_col_wrap=3 if facet else None,
        log_x=use_log,
        log_y=use_log,
        markers=True,
        labels={"value": f"{noun} ({unit})", x: x},
        title=f"Scaling: {noun} ({unit}) vs {x} ({snapshots[0].stem})",
    )

    # Facets wrap three per row; allow 350px per row with a 400px floor.
    n_facets = df[facet].nunique() if facet else 1
    facet_rows = (n_facets + 2) // 3
    fig.update_layout(height=max(400, facet_rows * 350))
    return fig, len(df)
|
pytest_benchmem/py.typed
ADDED
|
File without changes
|
|
@@ -0,0 +1,263 @@
|
|
|
1
|
+
"""pytest plugin — peak-memory alongside pytest-benchmark timing.
|
|
2
|
+
|
|
3
|
+
pytest-benchmark times your code but can't measure memory. This plugin adds a
|
|
4
|
+
memray peak-memory pass *to the same test, in the same run*, stored in
|
|
5
|
+
pytest-benchmark's own JSON (``extra_info.peak_mib``) so timing and memory share
|
|
6
|
+
one node id. Two ways in:
|
|
7
|
+
|
|
8
|
+
- The ``benchmark_memory`` fixture — explicit, per-test::
|
|
9
|
+
|
|
10
|
+
def test_build(benchmark_memory):
|
|
11
|
+
benchmark_memory(build_model, 1000)
|
|
12
|
+
|
|
13
|
+
- The ``--benchmark-memory`` flag — augments the stock ``benchmark`` fixture, so
|
|
14
|
+
an *existing* pytest-benchmark suite records memory with no code change::
|
|
15
|
+
|
|
16
|
+
pytest --benchmark-only --benchmark-memory
|
|
17
|
+
|
|
18
|
+
Either way the two passes never overlap: pytest-benchmark times the action with
|
|
19
|
+
no tracker installed, then memray measures peak on a separate, untimed call — so
|
|
20
|
+
the allocator hooks cost the timing numbers nothing. Memory comes back out via
|
|
21
|
+
:func:`pytest_benchmem.memory_from_pytest_benchmark` into the unit-aware
|
|
22
|
+
``plot`` / ``compare``.
|
|
23
|
+
|
|
24
|
+
Registered via the ``pytest11`` entry point — installing pytest-benchmem is enough.
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from __future__ import annotations
|
|
28
|
+
|
|
29
|
+
import inspect
|
|
30
|
+
from collections.abc import Callable, Mapping
|
|
31
|
+
from typing import TYPE_CHECKING, Any
|
|
32
|
+
|
|
33
|
+
import pytest
|
|
34
|
+
|
|
35
|
+
from pytest_benchmem.memray import measure_peak
|
|
36
|
+
|
|
37
|
+
if TYPE_CHECKING:
|
|
38
|
+
from pytest_benchmark.fixture import BenchmarkFixture
|
|
39
|
+
|
|
40
|
+
#: Key under which the peak (MiB) is written into pytest-benchmark ``extra_info``.
|
|
41
|
+
PEAK_KEY = "peak_mib"
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _record_peak(
    benchmark: BenchmarkFixture, action: Callable[[], Any], *, repeats: int = 1
) -> float:
    """Store the peak memory of ``action`` under ``benchmark.extra_info[PEAK_KEY]``.

    Measures at most once per benchmark: when a peak is already stored, that
    value is returned (as ``float``) and ``action`` is not run.
    """
    cached = benchmark.extra_info.get(PEAK_KEY)
    if cached is None:
        measured = measure_peak(action, repeats=repeats)
        benchmark.extra_info[PEAK_KEY] = measured
        return measured
    return float(cached)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class MemoryBenchmark:
    """Wrap a pytest-benchmark fixture so each benchmark also gets a memory pass.

    Exposes the same two entry points as the ``benchmark`` fixture
    (``__call__`` and ``pedantic``). Each first delegates timing to the
    wrapped fixture, then records the peak via ``_record_peak``; the result is
    kept on :attr:`peak_mib` and in the benchmark's ``extra_info``.
    """

    def __init__(self, benchmark: BenchmarkFixture, *, repeats: int = 1) -> None:
        self.peak_mib: float | None = None  # set once a benchmark call has run
        self._repeats = repeats
        self._benchmark = benchmark

    @property
    def extra_info(self) -> dict[str, Any]:
        """The wrapped benchmark's ``extra_info`` dict (the same object, not a copy).

        Set scalars here to attach analysis dims — ``benchmark_memory.extra_info["op"]
        = "sort"``. The peak is stored here too, under ``peak_mib``.
        """
        return self._benchmark.extra_info

    def __call__(self, function_to_benchmark: Callable[..., Any], *args: Any, **kwargs: Any) -> Any:
        """Time ``function_to_benchmark(*args, **kwargs)``, then record its peak memory.

        Returns whatever the wrapped benchmark call returned.
        """
        timed_result = self._benchmark(function_to_benchmark, *args, **kwargs)

        def replay() -> Any:
            # Same call again, used only for the memory pass.
            return function_to_benchmark(*args, **kwargs)

        self.peak_mib = _record_peak(self._benchmark, replay, repeats=self._repeats)
        return timed_result

    def pedantic(
        self,
        target: Callable[..., Any],
        args: tuple[Any, ...] = (),
        kwargs: Mapping[str, Any] | None = None,
        setup: Callable[[], Any] | None = None,
        rounds: int = 1,
        warmup_rounds: int = 0,
        iterations: int = 1,
    ) -> Any:
        """Run the wrapped fixture's ``pedantic`` with these options, then a memory pass.

        The memory pass measures the action built by ``_pedantic_action``.
        That action calls ``setup`` itself (using the ``(args, kwargs)`` it
        returns, if any) right before ``target``, so ``setup`` executes as part
        of the measured call.
        """
        call_kwargs = dict(kwargs) if kwargs else {}
        timed_result = self._benchmark.pedantic(  # type: ignore[no-untyped-call]
            target,
            args=args,
            kwargs=call_kwargs,
            setup=setup,
            rounds=rounds,
            warmup_rounds=warmup_rounds,
            iterations=iterations,
        )
        memory_action = _pedantic_action(target, args, call_kwargs, setup)
        self.peak_mib = _record_peak(self._benchmark, memory_action, repeats=self._repeats)
        return timed_result
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _pedantic_action(
|
|
125
|
+
target: Callable[..., Any],
|
|
126
|
+
args: tuple[Any, ...],
|
|
127
|
+
kwargs: Mapping[str, Any],
|
|
128
|
+
setup: Callable[[], Any] | None,
|
|
129
|
+
) -> Callable[[], Any]:
|
|
130
|
+
"""Build the zero-arg measured action for a pedantic call, honoring ``setup``."""
|
|
131
|
+
|
|
132
|
+
def action() -> Any:
|
|
133
|
+
call_args, call_kwargs = args, dict(kwargs)
|
|
134
|
+
if setup is not None:
|
|
135
|
+
produced = setup()
|
|
136
|
+
if produced is not None:
|
|
137
|
+
call_args, call_kwargs = produced
|
|
138
|
+
return target(*call_args, **call_kwargs)
|
|
139
|
+
|
|
140
|
+
return action
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
@pytest.fixture
def benchmark_memory(benchmark: BenchmarkFixture) -> MemoryBenchmark:
    """Fixture that times a callable and records its peak memory in the same test.

    It builds on the ``benchmark`` fixture, so timing is entirely
    pytest-benchmark's; the memory pass is layered on top and lands in the same
    benchmark entry. For a suite that already uses ``benchmark``, the
    ``--benchmark-memory`` flag avoids having to rewrite the tests.
    """
    wrapper = MemoryBenchmark(benchmark)
    return wrapper
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
# --- --benchmark-memory: augment the stock `benchmark` fixture --------------------
|
|
155
|
+
|
|
156
|
+
_PATCHED: tuple[Any, Any] | None = None
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
#: pedantic kwargs the patch passes through — guard against a version that drops one.
|
|
160
|
+
_PEDANTIC_PARAMS = frozenset({"args", "kwargs", "setup", "rounds", "warmup_rounds", "iterations"})
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def _patch_compatible(fixture_cls: Any) -> bool:
|
|
164
|
+
"""True if ``BenchmarkFixture.pedantic`` still takes the kwargs the patch passes.
|
|
165
|
+
|
|
166
|
+
(``__call__`` is invoked positionally, so it needs no signature guard.)
|
|
167
|
+
"""
|
|
168
|
+
try:
|
|
169
|
+
pedantic_params = set(inspect.signature(fixture_cls.pedantic).parameters)
|
|
170
|
+
except (TypeError, ValueError, AttributeError):
|
|
171
|
+
return False
|
|
172
|
+
return pedantic_params >= _PEDANTIC_PARAMS
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def _install_auto_memory() -> None:
    """Wrap ``BenchmarkFixture.__call__``/``.pedantic`` to also record peak memory.

    Does nothing when the patch is already installed. Otherwise both methods are
    replaced on the class with wrappers that first run the original (timing) and
    then record the peak for the same callable; the originals are kept in
    ``_PATCHED`` so they can be restored later.

    Raises:
        pytest.UsageError: if the installed pytest-benchmark's ``pedantic``
            does not accept the keyword arguments the wrapper forwards.
    """
    global _PATCHED
    if _PATCHED is not None:
        return
    import pytest_benchmark
    from pytest_benchmark.fixture import BenchmarkFixture

    if not _patch_compatible(BenchmarkFixture):
        raise pytest.UsageError(
            f"--benchmark-memory: pytest-benchmark {pytest_benchmark.__version__} has a "
            f"BenchmarkFixture.pedantic() signature pytest-benchmem doesn't recognize, so "
            f"the auto-memory patch can't apply safely. Either:\n"
            f" - install a pytest-benchmark version pytest-benchmem supports, or\n"
            f" - drop --benchmark-memory (timing still runs normally), or\n"
            f" - report it so we add support: "
            f"https://github.com/fluxopt/pytest-benchmem/issues"
        )

    orig_call = BenchmarkFixture.__call__
    orig_pedantic = BenchmarkFixture.pedantic

    def call(
        self: BenchmarkFixture, function_to_benchmark: Callable[..., Any], *a: Any, **k: Any
    ) -> Any:
        result = orig_call(self, function_to_benchmark, *a, **k)  # type: ignore[no-untyped-call]
        _record_peak(self, lambda: function_to_benchmark(*a, **k))
        return result

    def pedantic(
        self: BenchmarkFixture,
        target: Callable[..., Any],
        args: tuple[Any, ...] = (),
        kwargs: Mapping[str, Any] | None = None,
        setup: Callable[[], Any] | None = None,
        rounds: int = 1,
        warmup_rounds: int = 0,
        iterations: int = 1,
        **extra: Any,
    ) -> Any:
        kwargs = dict(kwargs or {})
        # ``extra`` carries keywords this wrapper doesn't name (e.g. ``teardown``
        # in newer pytest-benchmark). Forwarding them keeps existing tests from
        # failing with a TypeError just because the flag is on; an unknown
        # keyword is still rejected, by the original ``pedantic``.
        # NOTE(review): the memory pass below does not act on ``extra``.
        result = orig_pedantic(  # type: ignore[no-untyped-call]
            self,
            target,
            args=args,
            kwargs=kwargs,
            setup=setup,
            rounds=rounds,
            warmup_rounds=warmup_rounds,
            iterations=iterations,
            **extra,
        )
        _record_peak(self, _pedantic_action(target, args, kwargs, setup))
        return result

    BenchmarkFixture.__call__ = call  # type: ignore[method-assign]
    BenchmarkFixture.pedantic = pedantic  # type: ignore[method-assign,assignment]
    _PATCHED = (orig_call, orig_pedantic)
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def _remove_auto_memory() -> None:
    """Put the saved ``__call__``/``pedantic`` back on ``BenchmarkFixture``.

    Does nothing when the auto-memory patch is not currently installed.
    """
    global _PATCHED
    if _PATCHED is not None:
        from pytest_benchmark.fixture import BenchmarkFixture

        saved_call, saved_pedantic = _PATCHED
        BenchmarkFixture.__call__ = saved_call  # type: ignore[method-assign]
        BenchmarkFixture.pedantic = saved_pedantic  # type: ignore[method-assign]
        _PATCHED = None
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def pytest_addoption(parser: pytest.Parser) -> None:
    """Add the opt-in ``--benchmark-memory`` switch to pytest's command line."""
    help_text = (
        "Record peak memory (memray) for every benchmark() call, "
        "not only benchmark_memory — no test changes needed."
    )
    options = parser.getgroup("pytest-benchmem", "peak-memory benchmarking")
    options.addoption(
        "--benchmark-memory",
        action="store_true",
        default=False,
        help=help_text,
    )
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def pytest_configure(config: pytest.Config) -> None:
    """Install the auto-memory patch when the run was started with ``--benchmark-memory``."""
    if not config.getoption("--benchmark-memory"):
        return
    _install_auto_memory()
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def pytest_unconfigure(config: pytest.Config) -> None:
    """Undo the auto-memory patch, if one was installed, when pytest shuts down."""
    _remove_auto_memory()
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
"""The reading spine — normalize pytest-benchmark JSON into a tidy frame for plots.
|
|
2
|
+
|
|
3
|
+
pytest-benchmem defines no on-disk format of its own. The source of truth is the JSON
|
|
4
|
+
pytest-benchmark writes under ``.benchmarks/<machine>/NNNN_*.json``; a single
|
|
5
|
+
file carries *both* metrics for each benchmark id:
|
|
6
|
+
|
|
7
|
+
- ``stats`` — timing (min/mean/median/…), in seconds.
|
|
8
|
+
- ``extra_info.peak_mib`` — peak memory in MiB, written by the
|
|
9
|
+
``benchmark_memory`` fixture.
|
|
10
|
+
|
|
11
|
+
So reads are *per metric*: :func:`from_pytest_benchmark` pulls timing,
|
|
12
|
+
:func:`memory_from_pytest_benchmark` pulls memory, each yielding a list of
|
|
13
|
+
:class:`Sample` — an opaque ``id``, a ``value``, and analysis ``dims``. Dims come
|
|
14
|
+
from the benchmark's ``params`` (``@pytest.mark.parametrize``) plus any scalar
|
|
15
|
+
``extra_info`` you set in the test. :func:`load_long_df` stacks several files
|
|
16
|
+
(e.g. one per version from a sweep) into the single frame every plot pivots.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import json
|
|
22
|
+
from collections.abc import Mapping, Sequence
|
|
23
|
+
from pathlib import Path
|
|
24
|
+
from typing import TYPE_CHECKING, Any, Literal, NamedTuple
|
|
25
|
+
|
|
26
|
+
#: A value of a dim axis — categorical (str) or numeric (int/float).
|
|
27
|
+
DimValue = str | int | float
|
|
28
|
+
|
|
29
|
+
#: Which metric to read out of a pytest-benchmark file.
|
|
30
|
+
Metric = Literal["time", "memory"]
|
|
31
|
+
|
|
32
|
+
#: Default key under which ``benchmark_memory`` stores the peak (MiB).
|
|
33
|
+
PEAK_KEY = "peak_mib"
|
|
34
|
+
|
|
35
|
+
if TYPE_CHECKING:
|
|
36
|
+
import pandas as pd
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class Sample(NamedTuple):
    """A single measurement: which benchmark, what number, and its analysis dims."""

    # Opaque benchmark identifier; the readers in this module use ``fullname``.
    id: str
    # The measured number; its unit is reported by the reader, not stored here.
    value: float
    # Analysis dimensions attached to this benchmark, keyed by dim name.
    dims: Mapping[str, DimValue]
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _read_benchmarks(path: str | Path) -> tuple[Path, list[dict[str, Any]]]:
|
|
48
|
+
"""Read a pytest-benchmark file, returning ``(path, benchmarks)`` or raising."""
|
|
49
|
+
p = Path(path)
|
|
50
|
+
try:
|
|
51
|
+
data = json.loads(p.read_text())
|
|
52
|
+
except (OSError, json.JSONDecodeError) as exc:
|
|
53
|
+
raise ValueError(f"{path}: not a readable JSON file ({exc})") from exc
|
|
54
|
+
if "benchmarks" not in data:
|
|
55
|
+
raise ValueError(f"{path}: not a pytest-benchmark file (no 'benchmarks' key)")
|
|
56
|
+
return p, data["benchmarks"]
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _dims(bm: Mapping[str, Any]) -> dict[str, DimValue]:
    """Analysis dims for one benchmark — its parametrize ``params`` plus any
    scalar ``extra_info`` (minus the reserved ``peak_mib``). No id parsing.

    ``extra_info`` entries win over ``params`` entries of the same name, since
    they are applied second. ``extra_info`` values that are not ``str``, ``int``
    or ``float`` (lists, dicts, ``None``, …) are not dims and are left out.
    """
    dims: dict[str, DimValue] = {}
    params = bm.get("params")
    # ``params`` is absent or null for benchmarks that are not parametrized.
    if isinstance(params, Mapping):
        dims.update(params)
    extra = bm.get("extra_info")
    if isinstance(extra, Mapping):
        dims.update(
            {
                k: v
                for k, v in extra.items()
                # Enforce the documented "scalar only" contract: free-form
                # metadata kept in extra_info must not turn into a dim column.
                if k != PEAK_KEY and isinstance(v, (str, int, float))
            }
        )
    return dims
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def from_pytest_benchmark(
    path: str | Path, *, metric: str = "min"
) -> tuple[str, list[Sample], str]:
    """Extract timing samples from a pytest-benchmark file → ``(label, samples, "s")``.

    ``metric`` names the entry of each benchmark's ``stats`` to use (``min`` /
    ``median`` / …). The label is the file's stem; each sample's id is the
    benchmark's ``fullname`` and its dims come from ``params`` and ``extra_info``.
    """
    run_file, entries = _read_benchmarks(path)
    collected: list[Sample] = []
    for entry in entries:
        collected.append(
            Sample(id=entry["fullname"], value=entry["stats"][metric], dims=_dims(entry))
        )
    return run_file.stem, collected, "s"
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def memory_from_pytest_benchmark(
    path: str | Path, *, key: str = PEAK_KEY
) -> tuple[str, list[Sample], str]:
    """Extract peak-memory samples from a pytest-benchmark file → ``(label, samples, "MiB")``.

    The peak is read from each benchmark's ``extra_info[key]`` and converted to
    ``float``. Benchmarks whose ``extra_info`` is missing, is not a mapping, or
    has no non-null value under ``key`` (timing-only tests) are skipped. The
    label is the file's stem; dims come from ``params`` and ``extra_info``.
    """
    run_file, entries = _read_benchmarks(path)
    collected: list[Sample] = []
    for entry in entries:
        extra = entry.get("extra_info")
        if not isinstance(extra, Mapping) or extra.get(key) is None:
            continue
        collected.append(
            Sample(id=entry["fullname"], value=float(extra[key]), dims=_dims(entry))
        )
    return run_file.stem, collected, "MiB"
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def load_samples(
    path: str | Path, *, metric: Metric = "time", stat: str = "min"
) -> tuple[str, list[Sample], str]:
    """Dispatch to the reader for ``metric`` → ``(label, samples, unit)``.

    ``stat`` is only used for ``metric="time"``, where it selects the timing
    statistic. Any metric other than ``"time"`` or ``"memory"`` raises
    ``ValueError``.
    """
    if metric not in ("time", "memory"):
        raise ValueError(f"metric must be 'time' or 'memory', got {metric!r}")
    if metric == "time":
        return from_pytest_benchmark(path, metric=stat)
    return memory_from_pytest_benchmark(path)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def discover_runs(root: str | Path = ".benchmarks") -> list[Path]:
    """List every ``*.json`` file below ``root``, sorted (for CLI suggestions).

    Returns an empty list when ``root`` does not exist.
    """
    base = Path(root)
    if not base.exists():
        return []
    found = list(base.rglob("*.json"))
    found.sort()
    return found
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def load_long_df(
    runs: Sequence[str | Path], *, metric: Metric = "time", stat: str = "min"
) -> tuple[pd.DataFrame, str]:
    """Combine several pytest-benchmark files into one long frame → ``(df, unit)``.

    Each sample of each run becomes one row with the columns ``snapshot`` (the
    run's label, i.e. its file stem), ``id`` and ``value``, followed by one
    column per dim key; a dim a row doesn't have is ``NaN``. ``unit`` is the
    unit reported for the last run read, or ``""`` when ``runs`` is empty.
    """
    import pandas as pd

    unit = ""
    records: list[dict[str, object]] = []
    for run in runs:
        label, samples, unit = load_samples(run, metric=metric, stat=stat)
        records.extend(
            {"snapshot": label, "id": sample.id, "value": sample.value, **sample.dims}
            for sample in samples
        )
    return pd.DataFrame(records), unit
|
pytest_benchmem/sweep.py
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
"""Cross-version sweep — run benchmarks against several installed versions of a
|
|
2
|
+
package in fresh, isolated ``uv`` venvs.
|
|
3
|
+
|
|
4
|
+
Generic: the package install spec, the extra pins, the directory to copy for
|
|
5
|
+
import isolation, and *what to run* in each venv are all injected. A consumer
|
|
6
|
+
(e.g. linopy) wires ``install_spec=lambda v: f"linopy=={v}"`` and a ``run``
|
|
7
|
+
callback that invokes its pytest / memory command.
|
|
8
|
+
|
|
9
|
+
Isolation: if the repo root contains a dev copy of the package under test, it
|
|
10
|
+
would shadow the venv's installed version on ``sys.path``. So each venv runs
|
|
11
|
+
with ``cwd`` set to a fresh tempdir holding only a *copy* of ``copy_dir`` — the
|
|
12
|
+
benchmark code imports from there, while the package-under-test resolves to the
|
|
13
|
+
venv. A preflight asserts that resolution held.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import os
|
|
19
|
+
import re
|
|
20
|
+
import shutil
|
|
21
|
+
import subprocess
|
|
22
|
+
import sys
|
|
23
|
+
import tempfile
|
|
24
|
+
from collections.abc import Callable, Iterator, Sequence
|
|
25
|
+
from dataclasses import dataclass
|
|
26
|
+
from pathlib import Path
|
|
27
|
+
|
|
28
|
+
_PLAIN_VERSION_RE = re.compile(r"^\d+(\.\d+)*([a-z]+\d*)?$")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def version_label(version: str) -> str:
    """Turn a version or pip spec into a filesystem-safe label.

    Only the text after the last ``@`` is used (the whole string when there is
    none). Runs of characters other than ASCII letters, digits, ``.``, ``_``
    and ``-`` collapse to a single ``-``, and leading/trailing ``-``, ``.``
    and ``_`` are trimmed. An empty result becomes ``"spec"``.
    """
    _, _, tail = version.rpartition("@")
    cleaned = re.sub(r"[^0-9A-Za-z._-]+", "-", tail).strip("-._")
    return cleaned if cleaned else "spec"
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _venv_python(venv: Path) -> Path:
|
|
38
|
+
return venv / ("Scripts/python.exe" if os.name == "nt" else "bin/python")
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass(frozen=True)
class Venv:
    """Result of provisioning one version: a usable venv, or a recorded failure."""

    # The version / spec string this entry was provisioned for.
    version: str
    # Interpreter inside the venv; ``None`` when provisioning failed.
    python: Path | None
    # Environment to run subprocesses with; ``None`` when provisioning failed.
    env: dict[str, str] | None
    # Working directory to run in; ``None`` when provisioning failed.
    cwd: Path | None
    # Stage that failed, or ``None`` on success.
    failed_at: str | None  # "venv" | "install" | "isolation" | None
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def provision(
    versions: Sequence[str],
    *,
    install_spec: Callable[[str], str] = lambda v: v,
    pins: Sequence[str] = (),
    copy_dir: str | Path | None = None,
    import_check: str | None = None,
    as_of: str | None = None,
    tmp_prefix: str = "pytest-benchmem-",
) -> Iterator[Venv]:
    """Yield one fresh ``uv`` venv per version.

    ``install_spec(v)`` → the pip spec for the package under test (default:
    ``v`` verbatim, so ``"pkg==1.2"`` works). It is applied only to entries that
    look like a plain version number; anything else is installed as given.
    ``pins`` are extra specs installed alongside. ``copy_dir`` is copied into
    each venv's cwd for import isolation. ``import_check`` is a module name
    asserted to resolve to the venv (preflight). ``as_of`` (``YYYY-MM-DD``)
    freezes the whole resolution via ``--exclude-newer``.

    This is a generator: nothing runs until it is iterated, and each venv lives
    in a temporary directory that is removed once the consumer asks for the
    next item. A version that fails at some stage is still yielded, with
    ``failed_at`` set and the other fields ``None``.

    Raises:
        RuntimeError: on first iteration, if ``uv`` is not on ``PATH``.
    """
    if shutil.which("uv") is None:
        raise RuntimeError("uv not found on PATH — https://docs.astral.sh/uv/")

    for version in versions:
        with tempfile.TemporaryDirectory(prefix=tmp_prefix) as tmp:
            venv = Path(tmp) / "venv"
            r = subprocess.run(["uv", "venv", "--python", sys.executable, str(venv)], check=False)
            if r.returncode != 0:
                yield Venv(version, None, None, None, "venv")
                continue

            vpy = _venv_python(venv)
            spec = install_spec(version) if _PLAIN_VERSION_RE.match(version) else version
            install = [
                "uv",
                "pip",
                "install",
                "--python",
                str(vpy),
                *(["--exclude-newer", as_of] if as_of else []),
                *pins,
                spec,
            ]
            if subprocess.run(install, check=False).returncode != 0:
                yield Venv(version, None, None, None, "install")
                continue

            # Run from a directory that holds only the copied benchmark code, so a
            # dev checkout in the caller's cwd cannot shadow the installed package.
            cwd = Path(tmp) / "iso"
            cwd.mkdir()
            if copy_dir is not None:
                src = Path(copy_dir)
                shutil.copytree(
                    src,
                    cwd / src.name,
                    ignore=shutil.ignore_patterns("__pycache__", "*.ipynb", ".DS_Store"),
                )
            env = os.environ.copy()
            # An inherited PYTHONPATH would put outside packages back on sys.path.
            env.pop("PYTHONPATH", None)

            if import_check is not None:
                pre = subprocess.run(
                    [
                        str(vpy),
                        "-c",
                        f"import {import_check} as _m; "
                        f"assert {str(venv)!r} in _m.__file__, _m.__file__",
                    ],
                    cwd=str(cwd),
                    env=env,
                    capture_output=True,
                    text=True,
                    check=False,
                )
                if pre.returncode != 0:
                    # The preflight's output was captured; pass its stderr on so
                    # the reason (ImportError, or the path that was resolved
                    # instead) is visible rather than silently dropped.
                    if pre.stderr:
                        sys.stderr.write(pre.stderr)
                    yield Venv(version, None, None, None, "isolation")
                    continue

            yield Venv(version, vpy, env, cwd, None)
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def sweep(
    versions: Sequence[str],
    run: Callable[[Venv], None],
    **provision_kwargs: object,
) -> list[str]:
    """Provision each version in turn and hand every usable venv to ``run``.

    ``run`` is the consumer's hook (typically invoking pytest or a memory
    command with ``venv.python`` and ``cwd=venv.cwd``). Extra keyword arguments
    go to :func:`provision` unchanged. Versions that could not be provisioned
    are not run; they are returned as a list, in the order encountered.
    """
    failures: list[str] = []
    for venv in provision(versions, **provision_kwargs):  # type: ignore[arg-type]
        if not venv.failed_at:
            run(venv)
        else:
            failures.append(venv.version)
    return failures
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pytest-benchmem
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: The memory companion to pytest-benchmark: a memray peak-memory pass on the same test, plus dims-aware plots and cross-version sweeps.
|
|
5
|
+
Project-URL: Homepage, https://github.com/fluxopt/pytest-benchmem
|
|
6
|
+
Project-URL: Documentation, https://fluxopt.github.io/pytest-benchmem/
|
|
7
|
+
Project-URL: Repository, https://github.com/fluxopt/pytest-benchmem
|
|
8
|
+
Project-URL: Issues, https://github.com/fluxopt/pytest-benchmem/issues
|
|
9
|
+
Author: Felix Bumann
|
|
10
|
+
License-Expression: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: benchmark,memory,memray,performance,pytest,pytest-benchmark
|
|
13
|
+
Classifier: Development Status :: 3 - Alpha
|
|
14
|
+
Classifier: Framework :: Pytest
|
|
15
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
16
|
+
Classifier: Topic :: Software Development :: Testing
|
|
17
|
+
Classifier: Typing :: Typed
|
|
18
|
+
Requires-Python: >=3.11
|
|
19
|
+
Requires-Dist: memray>=1.11; platform_system == 'Linux' or platform_system == 'Darwin'
|
|
20
|
+
Requires-Dist: pytest-benchmark<6,>=4
|
|
21
|
+
Provides-Extra: plot
|
|
22
|
+
Requires-Dist: pandas; extra == 'plot'
|
|
23
|
+
Requires-Dist: plotly>=5; extra == 'plot'
|
|
24
|
+
Requires-Dist: typer>=0.12; extra == 'plot'
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
|
|
27
|
+
# pytest-benchmem
|
|
28
|
+
|
|
29
|
+
[CI](https://github.com/fluxopt/pytest-benchmem/actions/workflows/ci.yaml)
|
|
30
|
+

|
|
31
|
+
[License: MIT](LICENSE)
|
|
32
|
+
|
|
33
|
+
**The memory companion to [pytest-benchmark].** It times your code; pytest-benchmem
|
|
34
|
+
adds a memray **peak-memory** pass to the *same test, in the same run* — one node
|
|
35
|
+
id, one JSON file, both metrics. Plus dims-aware plots and cross-version sweeps.
|
|
36
|
+
|
|
37
|
+
## Quickstart
|
|
38
|
+
|
|
39
|
+
Write a normal pytest-benchmark test; swap `benchmark` for `benchmark_memory`:
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
import pytest
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@pytest.mark.parametrize("n", [10_000, 100_000, 1_000_000])
|
|
46
|
+
def test_sort(benchmark_memory, n):
|
|
47
|
+
data = list(range(n, 0, -1))
|
|
48
|
+
benchmark_memory(sorted, data)
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
pytest --benchmark-only --benchmark-json=run.json
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
One run produces one `run.json` that holds both metrics for each benchmark id, under a single node id:
|
|
56
|
+
|
|
57
|
+
- `stats: {min, mean, median, …}` — **timing**, from pytest-benchmark.
|
|
58
|
+
- `extra_info: {"peak_mib": 3.81}` — **peak memory**, from pytest-benchmem.
|
|
59
|
+
|
|
60
|
+
The two passes never overlap: pytest-benchmark times the action untracked, then
|
|
61
|
+
memray measures peak on a *separate, untimed* call — so the allocator hooks cost
|
|
62
|
+
the timing nothing. The parametrize `params` become the analysis dims the plots
|
|
63
|
+
scale by.
|
|
64
|
+
|
|
65
|
+
## Already have a pytest-benchmark suite?
|
|
66
|
+
|
|
67
|
+
Don't rewrite a thing — add `--benchmark-memory` and every `benchmark(...)` call
|
|
68
|
+
also records peak memory:
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
def test_sort(benchmark): # unchanged
|
|
72
|
+
benchmark(sorted, list(range(1_000_000, 0, -1)))
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
pytest --benchmark-only --benchmark-memory # timing + peak_mib for the whole suite
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
It's opt-in at the run level: without the flag, plain `benchmark` tests are
|
|
80
|
+
untouched. (Reach for the `benchmark_memory` fixture when you want memory on
|
|
81
|
+
specific tests only, or `pedantic` control.)
|
|
82
|
+
|
|
83
|
+
## Reading it back
|
|
84
|
+
|
|
85
|
+
Timing rides pytest-benchmark's own tooling (`pytest-benchmark compare`,
|
|
86
|
+
`--benchmark-histogram`) — pytest-benchmem doesn't reimplement it. For **memory**,
|
|
87
|
+
and dims-aware views over either metric:
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
benchmem compare base.json head.json --metric memory # per-id delta table
|
|
91
|
+
benchmem plot base.json head.json --metric memory # interactive plotly view
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
```
|
|
95
|
+
id base.json head.json change (MiB)
|
|
96
|
+
--------------------------------------------------------------------
|
|
97
|
+
test_sort[10000] 0.076 0.078 +2.6%
|
|
98
|
+
test_sort[100000] 0.76 0.74 -2.6%
|
|
99
|
+
test_sort[1000000] 7.63 9.155 +20.0%
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
Or pull the numbers into your own analysis:
|
|
103
|
+
|
|
104
|
+
```python
|
|
105
|
+
from pytest_benchmem import from_pytest_benchmark, memory_from_pytest_benchmark
|
|
106
|
+
|
|
107
|
+
_, timing, _ = from_pytest_benchmark("run.json") # seconds, from stats
|
|
108
|
+
_, memory, _ = memory_from_pytest_benchmark("run.json") # MiB, from extra_info
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
Outside pytest, `measure_peak(lambda: build_model(1000))` is the bare memray
|
|
112
|
+
engine — a one-liner peak number for a REPL or notebook.
|
|
113
|
+
|
|
114
|
+
## Where it sits
|
|
115
|
+
|
|
116
|
+
Its reason to exist is the gap nothing else fills cleanly: **memray-precision
|
|
117
|
+
memory benchmarking** of your own code, right where you already benchmark. ASV's
|
|
118
|
+
`peakmem` is coarse RSS sampling that misses numpy/C-allocation detail; CodSpeed
|
|
119
|
+
covers CI timing.
|
|
120
|
+
|
|
121
|
+
| Need | Reach for | pytest-benchmem |
|
|
122
|
+
|---|---|---|
|
|
123
|
+
| CI regression, per-PR dashboard | **CodSpeed** | — (don't rebuild it) |
|
|
124
|
+
| Local timing + A/B compare | **pytest-benchmark** | rides it (timing is its job) |
|
|
125
|
+
| Rigorous perf history across commits | **ASV** | — (heavier, RSS memory) |
|
|
126
|
+
| **Precise local peak memory (numpy/C allocs)** | **memray** | ⭐ the core |
|
|
127
|
+
| Memory *in your pytest-benchmark tests* | — | ⭐ fixture **or** `--benchmark-memory` |
|
|
128
|
+
| Same runs across installed versions | — | ⭐ `sweep` |
|
|
129
|
+
|
|
130
|
+
> **Not** a CI dashboard (use [CodSpeed]) and **not** a rigorous perf-history
|
|
131
|
+
> system (use [ASV]). If your core need is *precise local memory* over the
|
|
132
|
+
> benchmarks you already write — timing/sweeps/plots in one vocabulary — that's
|
|
133
|
+
> pytest-benchmem.
|
|
134
|
+
|
|
135
|
+
## Install
|
|
136
|
+
|
|
137
|
+
```bash
|
|
138
|
+
uv add pytest-benchmem # the fixture + flag + memray engine
|
|
139
|
+
uv add "pytest-benchmem[plot]" # + the plot/compare CLI (pandas, plotly, typer)
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
pytest-benchmark and memray are core deps; memray is Linux/macOS only, so Windows
|
|
143
|
+
installs cleanly but is timing-only (the memory pass raises a clear error there).
|
|
144
|
+
|
|
145
|
+
## Status
|
|
146
|
+
|
|
147
|
+
Early. Extracted from the linopy internal benchmark suite, where it's the local
|
|
148
|
+
memory-profiling layer. API may move before 1.0.
|
|
149
|
+
|
|
150
|
+
[pytest-benchmark]: https://pytest-benchmark.readthedocs.io
|
|
151
|
+
[CodSpeed]: https://codspeed.io
|
|
152
|
+
[ASV]: https://asv.readthedocs.io
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
pytest_benchmem/__init__.py,sha256=yUI5fVb4Neo6KACY3EsU5fauu0uolIsNbvmq_16Q6Cc,1173
|
|
2
|
+
pytest_benchmem/cli.py,sha256=nElgZ7wK8V9dFZEvvhd79PnUh6xv5KXA9LlW_YalS-Q,4076
|
|
3
|
+
pytest_benchmem/compare.py,sha256=yzTeOJFEUMxQQ2l60XF9piJsupbZcsgbBwDNkrQU9eE,1579
|
|
4
|
+
pytest_benchmem/memray.py,sha256=2dW1jNZRqctsnjUcS07FEo8aWfQ0ffBh4SAJqrV_Ow8,2180
|
|
5
|
+
pytest_benchmem/plotting.py,sha256=Lg9cLbpvegdKxngxn4a2QJ5lKZqu3xwR5tSsRzVrgSw,12662
|
|
6
|
+
pytest_benchmem/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
7
|
+
pytest_benchmem/pytest_plugin.py,sha256=--m9UB8zgCav3M-VM1exVGtJvjjoSrFY5GSTCQ9ndz0,9681
|
|
8
|
+
pytest_benchmem/snapshot.py,sha256=TB3SeuGYt0ePYp0vuqlkSMvcFO9ZX7DTkVE3UhnmyME,5429
|
|
9
|
+
pytest_benchmem/sweep.py,sha256=VAFB64rmyMbjdRlI7R3A7vsiQKcqQzS9u5Kaz5NSlnY,5260
|
|
10
|
+
pytest_benchmem-0.1.0.dist-info/METADATA,sha256=DfiMmjdVOPKU0FG2pogVF7YDSoqkHwfnaNmNqZh8bsg,5966
|
|
11
|
+
pytest_benchmem-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
12
|
+
pytest_benchmem-0.1.0.dist-info/entry_points.txt,sha256=v8fZRWsQTl2Tk6opcZ3BDKbqRhJzOBWsco5Isc8FN3M,106
|
|
13
|
+
pytest_benchmem-0.1.0.dist-info/licenses/LICENSE,sha256=jRwycONJR6wDwPOtHRIFKIU7X-EqCXRyOnktVz2UaYs,1069
|
|
14
|
+
pytest_benchmem-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Felix Bumann
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|