pytest-benchmem 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,38 @@
1
+ """pytest-benchmem — the memory companion to pytest-benchmark.
2
+
3
+ pytest-benchmark times your code; pytest-benchmem adds a memray peak-memory pass to the
4
+ same test via the ``benchmark_memory`` fixture, stored in the same run, plus
5
+ dims-aware plots and cross-version sweeps it has no answer for.
6
+
7
+ Light to import: this re-exports only the engine (``measure_peak``) and the
8
+ readers/loader that turn pytest-benchmark JSON into a tidy frame. ``pytest_benchmem.plotting``
9
+ pulls numpy/plotly and ``pytest_benchmem.sweep`` shells out to ``uv`` — import those
10
+ submodules directly when needed. The ``benchmark_memory`` fixture is delivered by
11
+ the pytest plugin (entry point), not by import.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ from pytest_benchmem.memray import measure_peak
17
+ from pytest_benchmem.snapshot import (
18
+ DimValue,
19
+ Metric,
20
+ Sample,
21
+ discover_runs,
22
+ from_pytest_benchmark,
23
+ load_long_df,
24
+ load_samples,
25
+ memory_from_pytest_benchmark,
26
+ )
27
+
28
+ __all__ = [
29
+ "DimValue",
30
+ "Metric",
31
+ "Sample",
32
+ "discover_runs",
33
+ "from_pytest_benchmark",
34
+ "load_long_df",
35
+ "load_samples",
36
+ "measure_peak",
37
+ "memory_from_pytest_benchmark",
38
+ ]
pytest_benchmem/cli.py ADDED
@@ -0,0 +1,104 @@
1
+ """pytest-benchmem CLI — ``plot`` and ``compare`` over pytest-benchmark JSON runs.
2
+
3
+ Both commands read the JSON pytest-benchmark writes (``.benchmarks/…``) and pick
4
+ a ``--metric`` (``time`` from ``stats``, ``memory`` from ``extra_info.peak_mib``).
5
+ Timing comparison/histograms are pytest-benchmark's own job; these commands are
6
+ the memory-aware, dims-aware views on top.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import importlib.util
12
+ from pathlib import Path
13
+ from typing import Annotated
14
+
15
+ import typer
16
+
17
+ from pytest_benchmem.snapshot import Metric, discover_runs
18
+
19
# Root Typer application; ``no_args_is_help`` makes a bare invocation print the help text.
app = typer.Typer(help="pytest-benchmem — plot and compare benchmark runs.", no_args_is_help=True)

# Shared ``--metric`` option type, reused by the ``plot`` and ``compare`` commands.
MetricOpt = Annotated[Metric, typer.Option(help="Which metric to read: time | memory.")]
22
+
23
+
24
def _need_plotly() -> None:
    """Abort with exit code 2 and an install hint when ``plotly`` is not importable."""
    plotly_spec = importlib.util.find_spec("plotly")
    if plotly_spec is not None:
        return
    typer.secho(
        "plotting needs extras: pip install 'pytest-benchmem[plot]'",
        fg=typer.colors.RED,
        err=True,
    )
    raise typer.Exit(code=2)
32
+
33
+
34
@app.command()
def plot(
    runs: Annotated[list[Path], typer.Argument(help="pytest-benchmark JSON file(s).")],
    metric: MetricOpt = "time",
    view: Annotated[
        str | None,
        typer.Option(help="compare | scatter | sweep | scaling (default: by count)."),
    ] = None,
    facet: Annotated[str | None, typer.Option(help="Dim to facet by.")] = None,
    x: Annotated[str | None, typer.Option(help="scaling: dim for the x-axis.")] = None,
    clip: Annotated[float | None, typer.Option(help="Clamp the colour scale.")] = None,
    output: Annotated[Path | None, typer.Option("--output", "-o", help="HTML out.")] = None,
    open_browser: Annotated[bool, typer.Option("--open/--no-open")] = False,
) -> None:
    """Render an interactive plotly view from one or more pytest-benchmark runs."""
    # Fail early (exit 2) on paths that don't exist, listing what discover_runs() finds.
    absent = [run for run in runs if not run.exists()]
    if absent:
        typer.secho(f"missing: {[str(run) for run in absent]}", fg=typer.colors.RED, err=True)
        known = discover_runs()
        if known:
            typer.echo("available:\n " + "\n ".join(str(run) for run in known), err=True)
        raise typer.Exit(code=2)

    # Without an explicit --view, the number of runs picks the view.
    if view:
        chosen = view
    elif len(runs) == 1:
        chosen = "scaling"
    elif len(runs) == 2:
        chosen = "scatter"
    else:
        chosen = "sweep"

    _need_plotly()
    from pytest_benchmem import plotting

    # One thunk per view; each forwards only the options that view accepts.
    renderers = {
        "compare": lambda: plotting.plot_compare(runs, metric=metric, facet=facet, clip=clip),
        "scatter": lambda: plotting.plot_scatter(runs, metric=metric, facet=facet, clip=clip),
        "sweep": lambda: plotting.plot_sweep(runs, metric=metric, clip=clip),
        "scaling": lambda: plotting.plot_scaling(runs, metric=metric, x=x, facet=facet),
    }
    render = renderers.get(chosen)
    if render is None:
        typer.secho(f"unknown view {chosen!r}", fg=typer.colors.RED, err=True)
        raise typer.Exit(code=2)

    try:
        fig, n = render()
    except ValueError as exc:
        # Data problems reported by the plotting layer become a clean exit 1.
        typer.secho(str(exc), fg=typer.colors.RED, err=True)
        raise typer.Exit(code=1) from exc

    if output is None:
        output = Path(".benchmarks") / "plots" / f"{chosen}-{metric}.html"
    output.parent.mkdir(parents=True, exist_ok=True)
    fig.write_html(output)
    typer.secho(f"{chosen} ({metric}): {n} ids → {output}", fg=typer.colors.GREEN)

    if not open_browser:
        return
    import webbrowser

    webbrowser.open(output.resolve().as_uri())
85
+
86
+
87
@app.command()
def compare(
    a: Annotated[Path, typer.Argument(help="Baseline run.")],
    b: Annotated[Path, typer.Argument(help="Candidate run.")],
    metric: MetricOpt = "time",
) -> None:
    """Print a per-id delta table for two pytest-benchmark runs."""
    # Report the first path (baseline first) that doesn't exist and exit 2.
    absent = next((run for run in (a, b) if not run.exists()), None)
    if absent is not None:
        typer.secho(f"missing: {absent}", fg=typer.colors.RED, err=True)
        raise typer.Exit(code=2)

    from pytest_benchmem.compare import compare_runs

    try:
        compare_runs(a, b, metric=metric)
    except ValueError as exc:
        # Data problems from the comparison become a clean exit 1.
        typer.secho(str(exc), fg=typer.colors.RED, err=True)
        raise typer.Exit(code=1) from exc
@@ -0,0 +1,38 @@
1
+ """Text comparison of two pytest-benchmark runs — keyed on the benchmark id."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import sys
6
+ from pathlib import Path
7
+ from typing import TextIO
8
+
9
+ from pytest_benchmem.snapshot import Metric, load_long_df
10
+
11
+
12
def compare_runs(
    a: str | Path, b: str | Path, *, metric: Metric = "time", out: TextIO | None = None
) -> None:
    """Print a per-id table of ``a`` vs ``b`` (for ``metric``) with percent change.

    Reads the chosen metric (timing or memory) from two pytest-benchmark files.
    Ids present in only one run show ``—``; so does the change column when the
    baseline value is missing or not positive.

    Args:
        a: Baseline run (pytest-benchmark JSON path).
        b: Candidate run (pytest-benchmark JSON path).
        metric: Which metric to read out of both files.
        out: Stream the table is written to; defaults to ``sys.stdout``.

    Raises:
        ValueError: If the loaded frame does not contain two distinct runs for
            ``metric`` (e.g. the same run given twice, or one run has no data).
    """
    out = out or sys.stdout
    df, unit = load_long_df([Path(a), Path(b)], metric=metric)
    labels = df["snapshot"].drop_duplicates().tolist()
    if len(labels) < 2:
        # Raise ValueError (not IndexError) so callers that already handle
        # ValueError report it cleanly.
        raise ValueError(f"compare needs {metric} data from two distinct runs (found: {labels})")
    la, lb = labels[0], labels[1]
    wide = df.pivot(index="id", columns="snapshot", values="value")

    print(f"\n{'id':<70} {la:>12} {lb:>12} {'change':>10} ({unit})", file=out)
    print("-" * 108, file=out)
    for test_id in sorted(wide.index):
        va = wide.at[test_id, la]
        vb = wide.at[test_id, lb]
        # The pivot fills ids missing from a run with NaN; normalise those to None.
        va = None if va != va else va  # noqa: PLR0124 — NaN check
        vb = None if vb != vb else vb  # noqa: PLR0124 — NaN check
        a_str = f"{va:.4g}" if va is not None else "—"
        b_str = f"{vb:.4g}" if vb is not None else "—"
        # Test for None explicitly: a candidate value of 0 is a real -100% change,
        # not a missing measurement. Only a non-positive baseline has no percent.
        if va is not None and vb is not None and va > 0:
            change = f"{(vb - va) / va * 100:+.1f}%"
        else:
            change = "—"
        short = test_id.split("::")[-1]
        print(f"{short:<70} {a_str:>12} {b_str:>12} {change:>10}", file=out)
    print(file=out)
@@ -0,0 +1,59 @@
1
+ """Peak-memory measurement via ``memray`` — the core pytest-benchmem engine.
2
+
3
+ :func:`measure_peak` runs one action under a ``memray.Tracker`` and returns the
4
+ peak in MiB. It's what the ``benchmark_memory`` pytest fixture calls for its
5
+ memory pass, and it doubles as a standalone one-liner for a REPL or notebook::
6
+
7
+ from pytest_benchmem import measure_peak
8
+
9
+ peak = measure_peak(lambda: build_model(1000)) # MiB
10
+
11
+ Why memray and not RSS sampling: peak RSS (what ASV's ``peakmem`` and naive
12
+ samplers use) misses the numpy/C-allocation detail that's usually the point of
13
+ profiling a numeric library. memray tracks the allocator directly.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import gc
19
+ import platform
20
+ import tempfile
21
+ from collections.abc import Callable
22
+ from pathlib import Path
23
+
24
+ #: A zero-arg callable doing the work that gets memory-tracked.
25
+ Action = Callable[[], object]
26
+
27
+
28
+ def _require_memray() -> None:
29
+ """Raise if memory measurement isn't supported — memray runs only on Linux/macOS."""
30
+ if platform.system() not in ("Linux", "Darwin"):
31
+ raise RuntimeError(
32
+ "memory measurement requires memray, which supports only Linux and macOS "
33
+ f"(this is {platform.system()}). Timing still works; run memory benchmarks "
34
+ "on Linux or macOS."
35
+ )
36
+
37
+
38
def measure_peak(action: Action, repeats: int = 1) -> float:
    """Return the peak memory, in MiB, of ``action()`` run under ``memray.Tracker``.

    Each of the ``max(1, repeats)`` passes uses a fresh tracker writing to a
    fresh temporary directory, and the *smallest* peak across passes is
    returned — peak memory is noisy (GC timing, lazy imports, page cache), so
    min-of-N is the cleanest "floor this can hit". Each peak is rounded to
    three decimals.
    """
    _require_memray()

    import memray

    def _one_pass() -> float:
        with tempfile.TemporaryDirectory(prefix="pytest-benchmem-") as scratch:
            capture = Path(scratch) / "track.bin"  # memray must create the file itself
            with memray.Tracker(capture):
                action()
            # Read the peak back while the capture file still exists.
            raw_peak = memray.FileReader(capture).metadata.peak_memory
        return round(raw_peak / (1024**2), 3)

    observed: list[float] = []
    passes = max(1, repeats)
    for _ in range(passes):
        observed.append(_one_pass())
        gc.collect()

    return min(observed)
@@ -0,0 +1,352 @@
1
+ """Interactive plotly views over pytest-benchmark runs — dims-driven.
2
+
3
+ Each view reads one ``metric`` (``"time"`` or ``"memory"``) out of the
4
+ pytest-benchmark JSON files and returns ``(figure, n_rendered)``:
5
+
6
+ - :func:`plot_compare` (2 runs) — bar chart of per-id delta.
7
+ - :func:`plot_scatter` (2+) — baseline cost vs ratio; top-right = the regressed one.
8
+ - :func:`plot_sweep` (3+) — heatmap of per-id fold-change (log2 ratio) vs the first.
9
+ - :func:`plot_scaling` (1) — cost vs a numeric dim, faceted/coloured by others.
10
+
11
+ The sweep/scatter/compare views key on the opaque ``id``; only ``scaling`` reads
12
+ ``dims`` — and which dim is x / colour / facet is auto-inferred from dtype
13
+ (numeric → x) but overridable. plotly is imported lazily.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ from pathlib import Path
19
+ from typing import TYPE_CHECKING, Literal
20
+
21
+ import numpy as np
22
+
23
+ from pytest_benchmem.snapshot import Metric, load_long_df
24
+
25
+ if TYPE_CHECKING:
26
+ import numpy.typing as npt
27
+ import pandas as pd
28
+ from plotly.graph_objects import Figure
29
+
30
#: Which delta ``plot_compare`` plots and sorts by: native-unit or percent change.
SortMode = Literal["absolute", "relative"]
31
+
32
+
33
+ def _value_label(unit: str) -> str:
34
+ return {"s": "time", "MiB": "peak"}.get(unit, "value")
35
+
36
+
37
+ def _diverging_kwargs(midpoint: float = 0.0) -> dict[str, object]:
38
+ """green→white→red continuous colour scale centred on ``midpoint``."""
39
+ return {
40
+ "color_continuous_scale": ["green", "white", "red"],
41
+ "color_continuous_midpoint": midpoint,
42
+ }
43
+
44
+
45
+ def _symmetric_clip(
46
+ magnitudes: npt.NDArray[np.float64], override: float | None, pct: float = 95.0
47
+ ) -> float:
48
+ """Symmetric colour bound: ``override`` if given, else the ``pct`` percentile
49
+ of ``|magnitudes|`` so outliers don't wash the rest to the midpoint.
50
+ """
51
+ if override is not None:
52
+ return float(override)
53
+ mags = np.abs(np.asarray(magnitudes, dtype=float))
54
+ mags = mags[np.isfinite(mags)]
55
+ if mags.size == 0:
56
+ return 1.0
57
+ bound = float(np.percentile(mags, pct))
58
+ return bound if bound > 0 else (float(mags.max()) or 1e-9)
59
+
60
+
61
+ def _fold_label(fold: float) -> str:
62
+ """Format a fold-change for a colourbar tick: ``2×``, ``1/4×``, ``1.5×``."""
63
+ if abs(fold - 1.0) < 1e-9:
64
+ return "1×"
65
+ return f"{fold:.3g}×" if fold > 1.0 else f"1/{1.0 / fold:.3g}×"
66
+
67
+
68
def _fold_ticks(bound: float) -> tuple[list[float], list[str]]:
    """Build colourbar ticks for a ``±bound`` log2 range.

    Returns ``(positions, labels)``: positions are log2 values mirrored around
    0, labels are the matching fold-changes. A bound of at least 1 gets powers
    of two; a tighter bound gets the sub-2× folds 1.1/1.25/1.5 that fit, or,
    when fewer than two fit, the half-way point and the bound itself.
    """
    if bound >= 1.0:
        folds = [2.0**exponent for exponent in range(1, int(bound) + 1)]
    else:
        folds = [c for c in (1.1, 1.25, 1.5) if float(np.log2(c)) <= bound + 1e-9]
        if len(folds) < 2:
            folds = [2.0 ** (bound / 2), 2.0**bound]

    folds = sorted(folds)
    upper = [float(np.log2(fold)) for fold in folds]
    # Mirror the positive side around the centre tick at log2(1) = 0.
    vals = [-level for level in reversed(upper)] + [0.0] + upper
    text = (
        [_fold_label(1.0 / fold) for fold in reversed(folds)]
        + ["1×"]
        + [_fold_label(fold) for fold in folds]
    )
    return vals, text
84
+
85
+
86
+ def _axis_kwargs(unit: str) -> dict[str, object]:
87
+ if unit == "s":
88
+ return {"tickformat": ".2s", "ticksuffix": "s"}
89
+ return {"ticksuffix": f" {unit}"}
90
+
91
+
92
+ def _dim_columns(df: pd.DataFrame) -> list[str]:
93
+ """The dim columns (everything but the fixed snapshot/id/value)."""
94
+ return [c for c in df.columns if c not in ("snapshot", "id", "value")]
95
+
96
+
97
def plot_compare(
    snapshots: list[Path],
    *,
    metric: Metric = "time",
    sort: SortMode = "absolute",
    facet: str | None = None,
    clip: float | None = None,
) -> tuple[Figure, int]:
    """Bar chart of per-id delta, sorted by the chosen Δ (biggest regressions on top).

    Only the first two ``snapshots`` are read: baseline, then candidate.

    ``sort`` picks the bar dimension: ``absolute`` plots ``b - a`` in the native
    unit, ``relative`` plots percent change. ``facet`` splits into subplots by any
    dim (ignored when it is not a column of the frame). ``clip`` clamps the colour
    scale (default symmetric p95).

    Returns:
        ``(figure, n)`` where ``n`` is the number of ids present in both snapshots.

    Raises:
        ValueError: With fewer than two snapshots, when the loaded data does not
            contain two distinct snapshot labels, or when the two share no ids.
    """
    import sys

    import plotly.express as px

    # Guard up front so a single run gives a ValueError instead of an
    # IndexError on ``labels[1]`` below.
    if len(snapshots) < 2:
        raise ValueError("compare needs 2 snapshots (baseline + candidate)")

    df_long, unit = load_long_df(snapshots[:2], metric=metric)
    vlabel = _value_label(unit)
    labels = df_long["snapshot"].drop_duplicates().tolist()
    if len(labels) < 2:
        raise ValueError(f"compare needs {metric} data from 2 distinct snapshots (found: {labels})")
    a_label, b_label = labels[0], labels[1]

    # One row per id (plus its dims), one column per snapshot.
    dims = _dim_columns(df_long)
    wide = (
        df_long.pivot(index=["id", *dims], columns="snapshot", values="value")
        .reset_index()
        .rename_axis(columns=None)
    )
    only_a = wide[wide[a_label].notna() & wide[b_label].isna()]
    only_b = wide[wide[a_label].isna() & wide[b_label].notna()]
    df = wide.dropna(subset=[a_label, b_label]).copy()
    if df.empty:
        raise ValueError("no ids in common between the two snapshots")
    if len(only_a) or len(only_b):
        # Report ids dropped from the comparison on stderr.
        print(
            f"compare: {len(only_a)} only in {a_label}, {len(only_b)} only in "
            f"{b_label} (intersection: {len(df)}).",
            file=sys.stderr,
        )

    df["delta_abs"] = df[b_label] - df[a_label]
    df["delta_pct"] = (df["delta_abs"] / df[a_label]) * 100.0
    x_col = "delta_abs" if sort == "absolute" else "delta_pct"
    df = df.sort_values(x_col).reset_index(drop=True)

    x_label = f"{vlabel} delta ({unit})" if sort == "absolute" else f"{vlabel} delta %"
    text_fmt = (".2s" if unit == "s" else ".2f") if sort == "absolute" else ".1f"
    direction = "slower" if unit == "s" else "more memory"
    title = f"{vlabel} delta ({sort}): {a_label} → {b_label} (positive = {direction})"

    facet_kwargs: dict[str, object] = {}
    if facet is not None and facet in df.columns:
        facet_kwargs = {"facet_col": facet, "facet_col_wrap": 3}

    color_clip = _symmetric_clip(df[x_col].to_numpy(), clip)
    fig = px.bar(
        df,
        x=x_col,
        y="id",
        orientation="h",
        color=x_col,
        **_diverging_kwargs(),
        range_color=[-color_clip, color_clip],
        title=title,
        labels={x_col: x_label, "id": ""},
        text_auto=text_fmt,
        **facet_kwargs,
    )
    if sort == "absolute":
        fig.update_xaxes(**_axis_kwargs(unit))
    fig.update_traces(textposition="outside", cliponaxis=False)
    # Roughly one 22px row per bar, with a 500px floor.
    fig.update_layout(height=max(500, len(df) * 22), showlegend=False)
    return fig, len(df)
171
+
172
+
173
def plot_scatter(
    snapshots: list[Path],
    *,
    metric: Metric = "time",
    facet: str | None = None,
    clip: float | None = None,
) -> tuple[Figure, int]:
    """Baseline cost (log-x) vs candidate/baseline ratio (log-y).

    Top-right = slow *and* slower (the regressed corner). The first snapshot is
    the baseline; with 3+, the rest animate. Colour encodes absolute Δ; ``clip``
    clamps it (default p95). ``facet`` splits by any dim (ignored when it is not
    a column of the frame).

    Returns:
        ``(figure, n)`` where ``n`` is the number of distinct ids plotted.

    Raises:
        ValueError: With fewer than two snapshots, when no candidate shares an
            id (with a positive baseline value) with the baseline, or when no
            candidate value is positive.
    """
    import plotly.express as px

    if len(snapshots) < 2:
        raise ValueError("scatter needs at least 2 snapshots (baseline + 1)")

    df_long, unit = load_long_df(snapshots, metric=metric)
    vlabel = _value_label(unit)
    labels = df_long["snapshot"].drop_duplicates().tolist()
    baseline_label = labels[0]

    base = df_long.loc[df_long["snapshot"] == baseline_label, ["id", "value"]].rename(
        columns={"value": "baseline"}
    )
    df = df_long.merge(base, on="id", how="inner")
    # Drop the baseline's own rows: they always merge with themselves (ratio 1),
    # which would hide a genuine "no overlap" situation and draw the baseline as
    # if it were a candidate. Non-positive baselines can't go on a log axis.
    df = df[(df["snapshot"] != baseline_label) & (df["baseline"] > 0)].copy()
    if df.empty:
        raise ValueError(f"no ids shared with baseline ({baseline_label})")

    df = df.rename(columns={"snapshot": "version", "value": "candidate"})
    df["ratio"] = df["candidate"] / df["baseline"]
    df["delta_abs"] = df["candidate"] - df["baseline"]
    # Keep only ratios that can go on a log axis (this also drops NaN candidates).
    df = df[df["ratio"] > 0]
    if df.empty:
        # Without this the axis bounds below would be computed from NaN.
        raise ValueError(f"no positive candidate values to compare with baseline ({baseline_label})")
    y_lo, y_hi = df["ratio"].min(), df["ratio"].max()
    # Symmetric log-y range around 1 with a little headroom.
    bound = max(y_hi, 1.0 / y_lo, 1.1) ** 1.05
    color_clip = _symmetric_clip(df["delta_abs"].to_numpy(), clip)

    extra: dict[str, object] = {}
    if len(snapshots) >= 3:
        extra["animation_frame"] = "version"
        # Frames follow the input order of the candidates (baseline excluded).
        extra["category_orders"] = {"version": labels[1:]}
    if facet is not None and facet in df.columns:
        extra["facet_col"] = facet
        extra["facet_col_wrap"] = 3

    fig = px.scatter(
        df,
        x="baseline",
        y="ratio",
        color="delta_abs",
        **_diverging_kwargs(),
        range_color=[-color_clip, color_clip],
        log_x=True,
        log_y=True,
        range_y=[1.0 / bound, bound],
        hover_name="id",
        title=f"{vlabel} scatter vs baseline ({baseline_label}) — top-right = regressed",
        labels={
            "baseline": f"baseline {vlabel} ({unit}, log)",
            "ratio": "ratio (candidate / baseline, log)",
        },
        **extra,
    )
    fig.add_hline(y=1.0, line_dash="dash", line_color="grey")
    fig.update_traces(marker={"size": 8, "line": {"width": 0.5, "color": "DarkSlateGrey"}})
    fig.update_layout(height=600)
    return fig, int(df["id"].nunique())
242
+
243
+
244
def plot_sweep(
    snapshots: list[Path], *, metric: Metric = "time", clip: float | None = None
) -> tuple[Figure, int]:
    """Heatmap of per-id fold-change (log2 ratio) vs the first snapshot.

    ``clip`` is a fold-change bounding the colour scale symmetrically: ``4`` and
    ``0.25`` both mean "1/4× … 4×". ``None``, ``0`` and ``1`` (a zero-width
    range) fall back to the data-derived default.

    Returns:
        ``(figure, n)`` where ``n`` is the number of ids that have a baseline value.

    Raises:
        ValueError: If ``clip`` is negative, or no id has a value in the
            baseline (first) snapshot.
    """
    import plotly.express as px

    if clip is not None and clip < 0:
        # log2 of a negative number is NaN, which would poison the colour range.
        raise ValueError(f"clip must be a positive fold-change, got {clip}")

    df_long, unit = load_long_df(snapshots, metric=metric)
    vlabel = _value_label(unit)
    versions = df_long["snapshot"].drop_duplicates().tolist()
    baseline = versions[0]

    # ids × snapshots, columns kept in input order; ids without a baseline are dropped.
    abs_df = df_long.pivot(index="id", columns="snapshot", values="value").reindex(columns=versions)
    abs_df = abs_df.dropna(subset=[baseline])
    if abs_df.empty:
        raise ValueError(f"no overlap with baseline snapshot {baseline}")
    ratio = abs_df.div(abs_df[baseline], axis=0)
    # Non-positive ratios have no log; mask them to NaN.
    logr = np.log2(ratio.where(ratio > 0))
    abs_df.index.name = ratio.index.name = logr.index.name = "id"

    # Use the magnitude so a sub-1 fold (e.g. 0.5) bounds the scale like its
    # reciprocal instead of yielding a negative bound (zmin > zmax).
    override = abs(float(np.log2(clip))) if clip else None
    bound = _symmetric_clip(logr.values, override or None)
    fig = px.imshow(
        logr,
        color_continuous_scale=["green", "white", "red"],
        color_continuous_midpoint=0.0,
        zmin=-bound,
        zmax=bound,
        aspect="auto",
        title=f"{vlabel} fold-change vs baseline ({baseline})",
        labels={"x": "snapshot", "y": "id", "color": "fold"},
    )
    tickvals, ticktext = _fold_ticks(bound)
    fig.update_coloraxes(colorbar={"tickvals": tickvals, "ticktext": ticktext, "title": "fold"})
    fig.update_traces(
        # Cells show the plain ratio; hover adds the absolute value in its unit.
        text=ratio.round(2).values,
        texttemplate="%{text}×",
        customdata=abs_df.values,
        hovertemplate=(
            "id: %{y}<br>snapshot: %{x}<br>fold: %{text}×<br>"
            f"{vlabel}: %{{customdata:.4g}}{unit}<extra></extra>"
        ),
    )
    fig.update_layout(height=max(500, len(logr) * 22))
    return fig, len(logr)
287
+
288
+
289
def _infer_roles(
    df: pd.DataFrame, x: str | None, color: str | None, facet: str | None
) -> tuple[str, str | None, str | None]:
    """Pick (x, color, facet) dim columns, auto-filling unset ones.

    x defaults to the lone *numeric* dim (every non-null value an int/float);
    color defaults to the first leftover dim, facet to the next one that isn't
    the colour.

    Raises:
        ValueError: If ``x`` is unset and there isn't exactly one numeric dim,
            or if an explicit ``x`` is not a column of ``df``.
    """
    dims = _dim_columns(df)
    if x is None:
        numeric = [
            d for d in dims if df[d].dropna().map(lambda v: isinstance(v, (int, float))).all()
        ]
        if len(numeric) != 1:
            raise ValueError(
                f"scaling needs exactly one numeric dim for x (found {numeric}); pass x= explicitly"
            )
        x = numeric[0]
    elif x not in df.columns:
        # Reject a mistyped dim here with a ValueError; otherwise it surfaces
        # later as a KeyError when the caller indexes the frame by ``x``.
        raise ValueError(f"unknown x dim {x!r}; available dims: {dims}")
    leftover = [d for d in dims if d != x]
    if color is None:
        color = leftover[0] if leftover else None
    if facet is None:
        facet = next((d for d in leftover if d != color), None)
    return x, color, facet
310
+
311
+
312
def plot_scaling(
    snapshots: list[Path],
    *,
    metric: Metric = "time",
    x: str | None = None,
    color: str | None = None,
    facet: str | None = None,
    log: bool | Literal["auto"] = "auto",
) -> tuple[Figure, int]:
    """Cost vs a numeric dim, coloured/faceted by other dims.

    Only the first snapshot is read. ``x``/``color``/``facet`` default to
    inference from the dims (the lone numeric dim → x); pass them to override.
    ``log="auto"`` log-scales both axes when x is numeric and strictly
    positive; a non-numeric x stays on linear axes.

    Returns:
        ``(figure, n)`` where ``n`` is the number of rows plotted.

    Raises:
        ValueError: If the roles can't be inferred, or no row has the x dim set.
    """
    import plotly.express as px

    df_long, unit = load_long_df(snapshots[:1], metric=metric)
    vlabel = _value_label(unit)
    x, color, facet = _infer_roles(df_long, x, color, facet)
    # Sort so each facet/colour group draws its line in x order.
    df = df_long.dropna(subset=[x]).sort_values([c for c in (facet, color, x) if c])
    if df.empty:
        raise ValueError("no rows with the x dim set")

    if log == "auto":
        try:
            use_log = bool((df[x] > 0).all())
        except TypeError:
            # An explicitly chosen categorical x can't be compared to 0;
            # it simply isn't log-scalable.
            use_log = False
    else:
        use_log = bool(log)
    fig = px.line(
        df,
        x=x,
        y="value",
        color=color,
        facet_col=facet,
        facet_col_wrap=3 if facet else None,
        log_x=use_log,
        log_y=use_log,
        markers=True,
        labels={"value": f"{vlabel} ({unit})", x: x},
        title=f"Scaling: {vlabel} ({unit}) vs {x} ({snapshots[0].stem})",
    )
    n_facets = df[facet].nunique() if facet else 1
    # Three facets per row (facet_col_wrap=3), about 350px per row.
    fig.update_layout(height=max(400, ((n_facets + 2) // 3) * 350))
    return fig, len(df)
File without changes
@@ -0,0 +1,263 @@
1
+ """pytest plugin — peak-memory alongside pytest-benchmark timing.
2
+
3
+ pytest-benchmark times your code but can't measure memory. This plugin adds a
4
+ memray peak-memory pass *to the same test, in the same run*, stored in
5
+ pytest-benchmark's own JSON (``extra_info.peak_mib``) so timing and memory share
6
+ one node id. Two ways in:
7
+
8
+ - The ``benchmark_memory`` fixture — explicit, per-test::
9
+
10
+ def test_build(benchmark_memory):
11
+ benchmark_memory(build_model, 1000)
12
+
13
+ - The ``--benchmark-memory`` flag — augments the stock ``benchmark`` fixture, so
14
+ an *existing* pytest-benchmark suite records memory with no code change::
15
+
16
+ pytest --benchmark-only --benchmark-memory
17
+
18
+ Either way the two passes never overlap: pytest-benchmark times the action with
19
+ no tracker installed, then memray measures peak on a separate, untimed call — so
20
+ the allocator hooks cost the timing numbers nothing. Memory comes back out via
21
+ :func:`pytest_benchmem.memory_from_pytest_benchmark` into the unit-aware
22
+ ``plot`` / ``compare``.
23
+
24
+ Registered via the ``pytest11`` entry point — installing pytest-benchmem is enough.
25
+ """
26
+
27
+ from __future__ import annotations
28
+
29
+ import inspect
30
+ from collections.abc import Callable, Mapping
31
+ from typing import TYPE_CHECKING, Any
32
+
33
+ import pytest
34
+
35
+ from pytest_benchmem.memray import measure_peak
36
+
37
+ if TYPE_CHECKING:
38
+ from pytest_benchmark.fixture import BenchmarkFixture
39
+
40
+ #: Key under which the peak (MiB) is written into pytest-benchmark ``extra_info``.
41
+ PEAK_KEY = "peak_mib"
42
+
43
+
44
def _record_peak(
    benchmark: BenchmarkFixture, action: Callable[[], Any], *, repeats: int = 1
) -> float:
    """Store the peak memory of ``action`` in ``benchmark.extra_info[PEAK_KEY]``.

    Measures at most once per benchmark: when a peak is already recorded (e.g.
    both the ``--benchmark-memory`` patch and the explicit fixture fire), that
    value is returned as a float and ``action`` is not run again.
    """
    cached = benchmark.extra_info.get(PEAK_KEY)
    if cached is None:
        measured = measure_peak(action, repeats=repeats)
        benchmark.extra_info[PEAK_KEY] = measured
        return measured
    return float(cached)
59
+
60
+
61
class MemoryBenchmark:
    """Wrap a pytest-benchmark fixture so each benchmark is timed, then memory-profiled.

    Offers the same entry points as the ``benchmark`` fixture (``__call__`` and
    ``pedantic``), so tests that want memory too can swap it in. Once a
    benchmark has run, its peak is available as :attr:`peak_mib` and in the
    wrapped benchmark's ``extra_info``.
    """

    def __init__(self, benchmark: BenchmarkFixture, *, repeats: int = 1) -> None:
        self.peak_mib: float | None = None
        self._repeats = repeats
        self._benchmark = benchmark

    @property
    def extra_info(self) -> dict[str, Any]:
        """The wrapped benchmark's ``extra_info`` metadata dict.

        Scalars set here — ``benchmark_memory.extra_info["op"] = "sort"`` — are
        picked up by pytest-benchmem's readers as analysis dims alongside the
        parametrize params. The peak lives in the same dict under ``peak_mib``.
        """
        return self._benchmark.extra_info

    def __call__(self, function_to_benchmark: Callable[..., Any], *args: Any, **kwargs: Any) -> Any:
        """Time ``function_to_benchmark(*args, **kwargs)``, then record its peak memory."""
        timed_result = self._benchmark(function_to_benchmark, *args, **kwargs)

        def replay() -> Any:
            # Separate, untimed call that the memory pass measures.
            return function_to_benchmark(*args, **kwargs)

        self.peak_mib = _record_peak(self._benchmark, replay, repeats=self._repeats)
        return timed_result

    def pedantic(
        self,
        target: Callable[..., Any],
        args: tuple[Any, ...] = (),
        kwargs: Mapping[str, Any] | None = None,
        setup: Callable[[], Any] | None = None,
        rounds: int = 1,
        warmup_rounds: int = 0,
        iterations: int = 1,
    ) -> Any:
        """Run :meth:`pytest_benchmark.fixture.BenchmarkFixture.pedantic`, then a memory pass.

        In the memory pass ``setup`` is called before ``target`` and, when it
        returns ``(args, kwargs)``, those replace the ones given here — the same
        contract pytest-benchmark applies to the timed rounds.
        """
        # Copy so the timed run and the memory pass share one plain dict.
        kwargs = {} if not kwargs else dict(kwargs)
        timed_result = self._benchmark.pedantic(  # type: ignore[no-untyped-call]
            target,
            args=args,
            kwargs=kwargs,
            setup=setup,
            rounds=rounds,
            warmup_rounds=warmup_rounds,
            iterations=iterations,
        )
        memory_action = _pedantic_action(target, args, kwargs, setup)
        self.peak_mib = _record_peak(self._benchmark, memory_action, repeats=self._repeats)
        return timed_result
122
+
123
+
124
+ def _pedantic_action(
125
+ target: Callable[..., Any],
126
+ args: tuple[Any, ...],
127
+ kwargs: Mapping[str, Any],
128
+ setup: Callable[[], Any] | None,
129
+ ) -> Callable[[], Any]:
130
+ """Build the zero-arg measured action for a pedantic call, honoring ``setup``."""
131
+
132
+ def action() -> Any:
133
+ call_args, call_kwargs = args, dict(kwargs)
134
+ if setup is not None:
135
+ produced = setup()
136
+ if produced is not None:
137
+ call_args, call_kwargs = produced
138
+ return target(*call_args, **call_kwargs)
139
+
140
+ return action
141
+
142
+
143
@pytest.fixture
def benchmark_memory(benchmark: BenchmarkFixture) -> MemoryBenchmark:
    """Fixture: time *and* peak-memory-profile a callable in one pytest-benchmark test.

    Built on the ``benchmark`` fixture, so timing is handled entirely by
    pytest-benchmark; the memray pass is layered on top and lands in the same
    entry. For an existing suite, the ``--benchmark-memory`` flag avoids
    rewriting tests.
    """
    wrapper = MemoryBenchmark(benchmark)
    return wrapper
152
+
153
+
154
+ # --- --benchmark-memory: augment the stock `benchmark` fixture --------------------
155
+
156
# The original ``(BenchmarkFixture.__call__, BenchmarkFixture.pedantic)`` pair while the
# --benchmark-memory patch is installed; ``None`` when the stock methods are in place.
_PATCHED: tuple[Any, Any] | None = None
157
+
158
+
159
+ #: pedantic kwargs the patch passes through — guard against a version that drops one.
160
+ _PEDANTIC_PARAMS = frozenset({"args", "kwargs", "setup", "rounds", "warmup_rounds", "iterations"})
161
+
162
+
163
def _patch_compatible(fixture_cls: Any) -> bool:
    """Report whether ``fixture_cls.pedantic`` accepts every kwarg the patch forwards.

    An unreadable or missing signature counts as incompatible. (``__call__`` is
    invoked positionally, so it needs no signature guard.)
    """
    try:
        signature = inspect.signature(fixture_cls.pedantic)
    except (TypeError, ValueError, AttributeError):
        return False
    accepted = set(signature.parameters)
    return _PEDANTIC_PARAMS.issubset(accepted)
173
+
174
+
175
def _install_auto_memory() -> None:
    """Wrap ``BenchmarkFixture.__call__``/``.pedantic`` to also record peak memory.

    Does nothing when the patch is already installed. The original methods are
    saved in ``_PATCHED`` so ``_remove_auto_memory`` can restore them.

    Raises:
        pytest.UsageError: If ``BenchmarkFixture.pedantic`` no longer accepts
            the keyword arguments the wrapper forwards.
    """
    global _PATCHED
    if _PATCHED is not None:
        return
    # Imported lazily, only when the patch is actually being installed.
    import pytest_benchmark
    from pytest_benchmark.fixture import BenchmarkFixture

    if not _patch_compatible(BenchmarkFixture):
        raise pytest.UsageError(
            f"--benchmark-memory: pytest-benchmark {pytest_benchmark.__version__} has a "
            f"BenchmarkFixture.pedantic() signature pytest-benchmem doesn't recognize, so "
            f"the auto-memory patch can't apply safely. Either:\n"
            f" - install a pytest-benchmark version pytest-benchmem supports, or\n"
            f" - drop --benchmark-memory (timing still runs normally), or\n"
            f" - report it so we add support: "
            f"https://github.com/fluxopt/pytest-benchmem/issues"
        )

    # Capture the stock methods before replacing them; the wrappers delegate to these.
    orig_call = BenchmarkFixture.__call__
    orig_pedantic = BenchmarkFixture.pedantic

    def call(
        self: BenchmarkFixture, function_to_benchmark: Callable[..., Any], *a: Any, **k: Any
    ) -> Any:
        """Run the stock timed call, then record peak memory for the same invocation."""
        result = orig_call(self, function_to_benchmark, *a, **k)  # type: ignore[no-untyped-call]
        _record_peak(self, lambda: function_to_benchmark(*a, **k))
        return result

    def pedantic(
        self: BenchmarkFixture,
        target: Callable[..., Any],
        args: tuple[Any, ...] = (),
        kwargs: Mapping[str, Any] | None = None,
        setup: Callable[[], Any] | None = None,
        rounds: int = 1,
        warmup_rounds: int = 0,
        iterations: int = 1,
    ) -> Any:
        """Run the stock ``pedantic``, then record peak memory via the same setup/target."""
        kwargs = dict(kwargs or {})
        result = orig_pedantic(  # type: ignore[no-untyped-call]
            self,
            target,
            args=args,
            kwargs=kwargs,
            setup=setup,
            rounds=rounds,
            warmup_rounds=warmup_rounds,
            iterations=iterations,
        )
        _record_peak(self, _pedantic_action(target, args, kwargs, setup))
        return result

    # Patch at class level so every ``benchmark`` fixture instance is affected.
    BenchmarkFixture.__call__ = call  # type: ignore[method-assign]
    BenchmarkFixture.pedantic = pedantic  # type: ignore[method-assign,assignment]
    _PATCHED = (orig_call, orig_pedantic)
231
+
232
+
233
def _remove_auto_memory() -> None:
    """Put the saved ``BenchmarkFixture`` methods back; a no-op when nothing is patched."""
    global _PATCHED
    saved = _PATCHED
    if saved is None:
        return
    from pytest_benchmark.fixture import BenchmarkFixture

    stock_call, stock_pedantic = saved
    BenchmarkFixture.__call__ = stock_call  # type: ignore[method-assign]
    BenchmarkFixture.pedantic = stock_pedantic  # type: ignore[method-assign]
    _PATCHED = None
241
+
242
+
243
def pytest_addoption(parser: pytest.Parser) -> None:
    """Add the ``--benchmark-memory`` switch (off by default) to its own option group."""
    help_text = (
        "Record peak memory (memray) for every benchmark() call, "
        "not only benchmark_memory — no test changes needed."
    )
    benchmem_group = parser.getgroup("pytest-benchmem", "peak-memory benchmarking")
    benchmem_group.addoption(
        "--benchmark-memory", action="store_true", default=False, help=help_text
    )
253
+
254
+
255
def pytest_configure(config: pytest.Config) -> None:
    """Install the auto-memory patch on ``benchmark`` when ``--benchmark-memory`` is given."""
    if not config.getoption("--benchmark-memory"):
        return
    _install_auto_memory()
259
+
260
+
261
def pytest_unconfigure(config: pytest.Config) -> None:
    """Restore the stock ``benchmark`` fixture after the session.

    Called unconditionally; ``config`` is not consulted here.
    """
    _remove_auto_memory()
@@ -0,0 +1,141 @@
1
+ """The reading spine — normalize pytest-benchmark JSON into a tidy frame for plots.
2
+
3
+ pytest-benchmem defines no on-disk format of its own. The source of truth is the JSON
4
+ pytest-benchmark writes under ``.benchmarks/<machine>/NNNN_*.json``; a single
5
+ file carries *both* metrics for each benchmark id:
6
+
7
+ - ``stats`` — timing (min/mean/median/…), in seconds.
8
+ - ``extra_info.peak_mib`` — peak memory in MiB, written by the
9
+ ``benchmark_memory`` fixture.
10
+
11
+ So reads are *per metric*: :func:`from_pytest_benchmark` pulls timing,
12
+ :func:`memory_from_pytest_benchmark` pulls memory, each yielding a list of
13
+ :class:`Sample` — an opaque ``id``, a ``value``, and analysis ``dims``. Dims come
14
+ from the benchmark's ``params`` (``@pytest.mark.parametrize``) plus any scalar
15
+ ``extra_info`` you set in the test. :func:`load_long_df` stacks several files
16
+ (e.g. one per version from a sweep) into the single frame every plot pivots.
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import json
22
+ from collections.abc import Mapping, Sequence
23
+ from pathlib import Path
24
+ from typing import TYPE_CHECKING, Any, Literal, NamedTuple
25
+
26
+ #: A value of a dim axis — categorical (str) or numeric (int/float).
27
+ DimValue = str | int | float
28
+
29
+ #: Which metric to read out of a pytest-benchmark file.
30
+ Metric = Literal["time", "memory"]
31
+
32
+ #: Default key under which ``benchmark_memory`` stores the peak (MiB).
33
+ PEAK_KEY = "peak_mib"
34
+
35
+ if TYPE_CHECKING:
36
+ import pandas as pd
37
+
38
+
39
class Sample(NamedTuple):
    """A single measurement read from a benchmark file.

    Fields, in positional order: the benchmark's opaque ``id``, the measured
    ``value``, and the ``dims`` mapping that plots group and pivot by.
    """

    # Opaque benchmark identifier; never parsed.
    id: str
    # The measured number for the chosen metric.
    value: float
    # Analysis dimensions, keyed by dim name.
    dims: Mapping[str, DimValue]
45
+
46
+
47
def _read_benchmarks(path: str | Path) -> tuple[Path, list[dict[str, Any]]]:
    """Read a pytest-benchmark JSON file and return ``(path, benchmarks)``.

    ``path`` is returned as a :class:`~pathlib.Path` alongside the value stored
    under the file's top-level ``"benchmarks"`` key.

    Raises:
        ValueError: if the file cannot be read, is not valid UTF-8 JSON, or is
            not a JSON object carrying a ``"benchmarks"`` key.
    """
    p = Path(path)
    try:
        # JSON is UTF-8 by specification; don't depend on the locale's default
        # encoding (which is not UTF-8 on every platform).
        data = json.loads(p.read_text(encoding="utf-8"))
    except (OSError, UnicodeDecodeError, json.JSONDecodeError) as exc:
        raise ValueError(f"{path}: not a readable JSON file ({exc})") from exc
    # Valid JSON need not be an object (``5``, ``null``, ...). Check the type
    # first so such files get the same clear error instead of a ``TypeError``.
    if not isinstance(data, dict) or "benchmarks" not in data:
        raise ValueError(f"{path}: not a pytest-benchmark file (no 'benchmarks' key)")
    return p, data["benchmarks"]
57
+
58
+
59
def _dims(bm: Mapping[str, Any]) -> dict[str, DimValue]:
    """Analysis dims for one benchmark — its parametrize ``params`` plus any
    scalar ``extra_info`` (minus the reserved ``peak_mib``). No id parsing.

    ``extra_info`` entries are applied after ``params``, so on a key clash the
    ``extra_info`` value wins. Only ``str`` / ``int`` / ``float`` values are
    taken from ``extra_info``; anything else (lists, dicts, ``None``) is not a
    dim value and is left out. A ``params`` or ``extra_info`` that is not a
    mapping is ignored.
    """
    dims: dict[str, DimValue] = {}
    params = bm.get("params")
    if isinstance(params, Mapping):
        dims.update(params)
    extra = bm.get("extra_info")
    if isinstance(extra, Mapping):
        dims.update(
            {
                key: value
                for key, value in extra.items()
                # Keep the documented contract: scalars only, and never the
                # reserved peak-memory entry.
                if key != PEAK_KEY and isinstance(value, (str, int, float))
            }
        )
    return dims
71
+
72
+
73
def from_pytest_benchmark(
    path: str | Path, *, metric: str = "min"
) -> tuple[str, list[Sample], str]:
    """Extract timing samples from a pytest-benchmark file → ``(label, samples, "s")``.

    ``label`` is the file stem. ``metric`` names the entry of each benchmark's
    ``stats`` to use as the sample value (``min`` / ``median`` / …). Each
    sample's dims are derived from the benchmark's ``params`` and
    ``extra_info``.
    """
    source, entries = _read_benchmarks(path)
    timings: list[Sample] = []
    for entry in entries:
        timings.append(
            Sample(id=entry["fullname"], value=entry["stats"][metric], dims=_dims(entry))
        )
    return source.stem, timings, "s"
86
+
87
+
88
def memory_from_pytest_benchmark(
    path: str | Path, *, key: str = PEAK_KEY
) -> tuple[str, list[Sample], str]:
    """Extract peak-memory samples from a pytest-benchmark file → ``(label, samples, "MiB")``.

    ``label`` is the file stem. A benchmark contributes a sample only when its
    ``extra_info`` is a mapping holding a non-``None`` value under ``key``;
    all other benchmarks (e.g. timing-only tests) are skipped. The value is
    converted with ``float``. Dims are derived from the benchmark's ``params``
    and ``extra_info``.
    """
    source, entries = _read_benchmarks(path)
    peaks: list[Sample] = []
    for entry in entries:
        info = entry.get("extra_info")
        if not isinstance(info, Mapping):
            continue
        if info.get(key) is None:
            continue
        peaks.append(Sample(id=entry["fullname"], value=float(info[key]), dims=_dims(entry)))
    return source.stem, peaks, "MiB"
105
+
106
+
107
+ def load_samples(
108
+ path: str | Path, *, metric: Metric = "time", stat: str = "min"
109
+ ) -> tuple[str, list[Sample], str]:
110
+ """Read one pytest-benchmark file for the chosen ``metric`` → ``(label, samples, unit)``."""
111
+ if metric == "time":
112
+ return from_pytest_benchmark(path, metric=stat)
113
+ if metric == "memory":
114
+ return memory_from_pytest_benchmark(path)
115
+ raise ValueError(f"metric must be 'time' or 'memory', got {metric!r}")
116
+
117
+
118
def discover_runs(root: str | Path = ".benchmarks") -> list[Path]:
    """List every ``*.json`` file anywhere below ``root``, sorted by path.

    Returns an empty list when ``root`` does not exist (used for CLI
    suggestions, so a missing directory is not an error).
    """
    base = Path(root)
    if not base.exists():
        return []
    return sorted(base.rglob("*.json"))
122
+
123
+
124
def load_long_df(
    runs: Sequence[str | Path], *, metric: Metric = "time", stat: str = "min"
) -> tuple[pd.DataFrame, str]:
    """Stack pytest-benchmark files into one long frame → ``(df, unit)``.

    One row per ``(run, id)`` for the chosen ``metric``. Columns: ``snapshot``
    (the file stem / version label), ``id``, ``value``, then one column per dim
    key seen (missing dims are ``NaN``). Every plot view pivots this frame.

    A dim whose name collides with one of the three reserved columns (e.g. a
    parametrize argument called ``value``) is stored as ``dim_<name>`` so it
    can never overwrite the measurement or the row identity. When there are no
    rows at all, the frame is empty but still has the three reserved columns.
    ``unit`` is the unit reported for the last file read, or ``""`` when
    ``runs`` is empty.
    """
    import pandas as pd

    reserved = ("snapshot", "id", "value")
    rows: list[dict[str, object]] = []
    unit = ""
    for path in runs:
        label, samples, unit = load_samples(path, metric=metric, stat=stat)
        for s in samples:
            row: dict[str, object] = {"snapshot": label, "id": s.id, "value": s.value}
            for key, dim in s.dims.items():
                # Reserved columns always hold the run label / id / measurement;
                # a clashing dim is kept under a prefixed name instead.
                row[f"dim_{key}" if key in reserved else key] = dim
            rows.append(row)
    if not rows:
        # Keep the documented schema so callers can select columns safely.
        return pd.DataFrame(columns=list(reserved)), unit
    return pd.DataFrame(rows), unit
@@ -0,0 +1,147 @@
1
+ """Cross-version sweep — run benchmarks against several installed versions of a
2
+ package in fresh, isolated ``uv`` venvs.
3
+
4
+ Generic: the package install spec, the extra pins, the directory to copy for
5
+ import isolation, and *what to run* in each venv are all injected. A consumer
6
+ (e.g. linopy) wires ``install_spec=lambda v: f"linopy=={v}"`` and a ``run``
7
+ callback that invokes its pytest / memory command.
8
+
9
+ Isolation: if the repo root contains a dev copy of the package under test, it
10
+ would shadow the venv's installed version on ``sys.path``. So each venv runs
11
+ with ``cwd`` set to a fresh tempdir holding only a *copy* of ``copy_dir`` — the
12
+ benchmark code imports from there, while the package-under-test resolves to the
13
+ venv. A preflight asserts that resolution held.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import os
19
+ import re
20
+ import shutil
21
+ import subprocess
22
+ import sys
23
+ import tempfile
24
+ from collections.abc import Callable, Iterator, Sequence
25
+ from dataclasses import dataclass
26
+ from pathlib import Path
27
+
28
+ _PLAIN_VERSION_RE = re.compile(r"^\d+(\.\d+)*([a-z]+\d*)?$")
29
+
30
+
31
def version_label(version: str) -> str:
    """Turn a version or pip spec into a filesystem-safe label.

    Only the text after the last ``@`` is used (the whole string when there is
    no ``@``). Each run of characters outside ``[0-9A-Za-z._-]`` collapses to a
    single ``-``, leading/trailing ``-``, ``.`` and ``_`` are trimmed, and an
    empty result falls back to ``"spec"``.
    """
    tail = version.rpartition("@")[2]
    cleaned = re.sub(r"[^0-9A-Za-z._-]+", "-", tail).strip("-._")
    return cleaned if cleaned else "spec"
35
+
36
+
37
def _venv_python(venv: Path) -> Path:
    """Return the path of the interpreter inside ``venv`` for the current OS."""
    if os.name == "nt":
        return venv / "Scripts/python.exe"
    return venv / "bin/python"
39
+
40
+
41
@dataclass(frozen=True)
class Venv:
    """Outcome of provisioning the venv for one version.

    On success ``python``, ``env`` and ``cwd`` are set and ``failed_at`` is
    ``None``; on failure those three are ``None`` and ``failed_at`` names the
    step that failed.
    """

    # The version string exactly as requested by the caller.
    version: str
    # Interpreter inside the venv.
    python: Path | None
    # Environment variables to run subprocesses with.
    env: dict[str, str] | None
    # Working directory to run subprocesses in.
    cwd: Path | None
    failed_at: str | None  # "venv" | "install" | "isolation" | None
50
+
51
+
52
def provision(
    versions: Sequence[str],
    *,
    install_spec: Callable[[str], str] = lambda v: v,
    pins: Sequence[str] = (),
    copy_dir: str | Path | None = None,
    import_check: str | None = None,
    as_of: str | None = None,
    tmp_prefix: str = "pytest-benchmem-",
) -> Iterator[Venv]:
    """Yield one fresh ``uv`` venv per version.

    ``install_spec(v)`` → the pip spec for the package under test. It is applied
    only to plain version strings (``1.2.3``, ``1.0rc1``); anything else is
    passed to ``uv pip install`` verbatim, so ``"pkg==1.2"`` works. ``pins`` are
    extra specs installed alongside. ``copy_dir`` is copied into each venv's cwd
    for import isolation. ``import_check`` is a module name asserted to resolve
    to the venv (preflight). ``as_of`` (``YYYY-MM-DD``) freezes the whole
    resolution via ``--exclude-newer``.

    Each yielded :class:`Venv` lives in a temporary directory that is removed
    as soon as the generator is advanced past it, so use it before asking for
    the next one. A failed step yields a :class:`Venv` with ``failed_at`` set
    (``"venv"``, ``"install"`` or ``"isolation"``) instead of raising.

    Raises:
        RuntimeError: when iteration starts and ``uv`` is not on ``PATH``.
    """
    if shutil.which("uv") is None:
        raise RuntimeError("uv not found on PATH — https://docs.astral.sh/uv/")

    for version in versions:
        with tempfile.TemporaryDirectory(prefix=tmp_prefix) as tmp:
            venv = Path(tmp) / "venv"
            r = subprocess.run(["uv", "venv", "--python", sys.executable, str(venv)], check=False)
            if r.returncode != 0:
                yield Venv(version, None, None, None, "venv")
                continue

            vpy = _venv_python(venv)
            # Bare version numbers go through ``install_spec``; full specs
            # (URLs, ``pkg==x``, ``pkg @ git+…``) are used as given.
            spec = install_spec(version) if _PLAIN_VERSION_RE.match(version) else version
            install = [
                "uv",
                "pip",
                "install",
                "--python",
                str(vpy),
                *(["--exclude-newer", as_of] if as_of else []),
                *pins,
                spec,
            ]
            if subprocess.run(install, check=False).returncode != 0:
                yield Venv(version, None, None, None, "install")
                continue

            cwd = Path(tmp) / "iso"
            cwd.mkdir()
            if copy_dir is not None:
                src = Path(copy_dir)
                shutil.copytree(
                    src,
                    cwd / src.name,
                    ignore=shutil.ignore_patterns("__pycache__", "*.ipynb", ".DS_Store"),
                )
            env = os.environ.copy()
            # An inherited PYTHONPATH could point back at a dev checkout.
            env.pop("PYTHONPATH", None)

            if import_check is not None:
                # Exit explicitly rather than with ``assert``: an inherited
                # PYTHONOPTIMIZE would compile the assert away and let a
                # shadowed import pass the preflight unnoticed.
                pre = subprocess.run(
                    [
                        str(vpy),
                        "-c",
                        f"import {import_check} as _m; "
                        f"raise SystemExit("
                        f"None if {str(venv)!r} in _m.__file__ else str(_m.__file__))",
                    ],
                    cwd=str(cwd),
                    env=env,
                    capture_output=True,
                    text=True,
                    check=False,
                )
                if pre.returncode != 0:
                    yield Venv(version, None, None, None, "isolation")
                    continue

            yield Venv(version, vpy, env, cwd, None)
128
+
129
+
130
def sweep(
    versions: Sequence[str],
    run: Callable[[Venv], None],
    **provision_kwargs: object,
) -> list[str]:
    """Run ``run(venv)`` once in a freshly provisioned venv for each version.

    Extra keyword arguments are forwarded to :func:`provision`. ``run`` is only
    called for venvs that provisioned successfully; what it does with
    ``venv.python`` / ``venv.cwd`` is up to the consumer. Returns the versions
    whose provisioning failed, in the order they were attempted.
    """
    unprovisioned: list[str] = []
    for venv in provision(versions, **provision_kwargs):  # type: ignore[arg-type]
        if not venv.failed_at:
            run(venv)
        else:
            unprovisioned.append(venv.version)
    return unprovisioned
@@ -0,0 +1,152 @@
1
+ Metadata-Version: 2.4
2
+ Name: pytest-benchmem
3
+ Version: 0.1.0
4
+ Summary: The memory companion to pytest-benchmark: a memray peak-memory pass on the same test, plus dims-aware plots and cross-version sweeps.
5
+ Project-URL: Homepage, https://github.com/fluxopt/pytest-benchmem
6
+ Project-URL: Documentation, https://fluxopt.github.io/pytest-benchmem/
7
+ Project-URL: Repository, https://github.com/fluxopt/pytest-benchmem
8
+ Project-URL: Issues, https://github.com/fluxopt/pytest-benchmem/issues
9
+ Author: Felix Bumann
10
+ License-Expression: MIT
11
+ License-File: LICENSE
12
+ Keywords: benchmark,memory,memray,performance,pytest,pytest-benchmark
13
+ Classifier: Development Status :: 3 - Alpha
14
+ Classifier: Framework :: Pytest
15
+ Classifier: Programming Language :: Python :: 3 :: Only
16
+ Classifier: Topic :: Software Development :: Testing
17
+ Classifier: Typing :: Typed
18
+ Requires-Python: >=3.11
19
+ Requires-Dist: memray>=1.11; platform_system == 'Linux' or platform_system == 'Darwin'
20
+ Requires-Dist: pytest-benchmark<6,>=4
21
+ Provides-Extra: plot
22
+ Requires-Dist: pandas; extra == 'plot'
23
+ Requires-Dist: plotly>=5; extra == 'plot'
24
+ Requires-Dist: typer>=0.12; extra == 'plot'
25
+ Description-Content-Type: text/markdown
26
+
27
+ # pytest-benchmem
28
+
29
+ [![CI](https://github.com/fluxopt/pytest-benchmem/actions/workflows/ci.yaml/badge.svg)](https://github.com/fluxopt/pytest-benchmem/actions/workflows/ci.yaml)
30
+ ![Python](https://img.shields.io/badge/python-3.11%2B-blue)
31
+ [![License: MIT](https://img.shields.io/badge/license-MIT-green)](LICENSE)
32
+
33
+ **The memory companion to [pytest-benchmark].** pytest-benchmark times your code; pytest-benchmem
34
+ adds a memray **peak-memory** pass to the *same test, in the same run* — one node
35
+ id, one JSON file, both metrics. Plus dims-aware plots and cross-version sweeps.
36
+
37
+ ## Quickstart
38
+
39
+ Write a normal pytest-benchmark test; swap `benchmark` for `benchmark_memory`:
40
+
41
+ ```python
42
+ import pytest
43
+
44
+
45
+ @pytest.mark.parametrize("n", [10_000, 100_000, 1_000_000])
46
+ def test_sort(benchmark_memory, n):
47
+ data = list(range(n, 0, -1))
48
+ benchmark_memory(sorted, data)
49
+ ```
50
+
51
+ ```bash
52
+ pytest --benchmark-only --benchmark-json=run.json
53
+ ```
54
+
55
+ One run writes one `run.json`; each benchmark id in it carries both metrics under a single node id:
56
+
57
+ - `stats: {min, mean, median, …}` — **timing**, from pytest-benchmark.
58
+ - `extra_info: {"peak_mib": 3.81}` — **peak memory**, from pytest-benchmem.
59
+
60
+ The two passes never overlap: pytest-benchmark times the action untracked, then
61
+ memray measures peak on a *separate, untimed* call — so the allocator hooks cost
62
+ the timing nothing. The parametrize `params` become the analysis dims the plots
63
+ scale by.
64
+
65
+ ## Already have a pytest-benchmark suite?
66
+
67
+ Don't rewrite a thing — add `--benchmark-memory` and every `benchmark(...)` call
68
+ also records peak memory:
69
+
70
+ ```python
71
+ def test_sort(benchmark): # unchanged
72
+ benchmark(sorted, list(range(1_000_000, 0, -1)))
73
+ ```
74
+
75
+ ```bash
76
+ pytest --benchmark-only --benchmark-memory # timing + peak_mib for the whole suite
77
+ ```
78
+
79
+ It's opt-in at the run level: without the flag, plain `benchmark` tests are
80
+ untouched. (Reach for the `benchmark_memory` fixture when you want memory on
81
+ specific tests only, or `pedantic` control.)
82
+
83
+ ## Reading it back
84
+
85
+ Timing rides pytest-benchmark's own tooling (`pytest-benchmark compare`,
86
+ `--benchmark-histogram`) — pytest-benchmem doesn't reimplement it. For **memory**,
87
+ and dims-aware views over either metric:
88
+
89
+ ```bash
90
+ benchmem compare base.json head.json --metric memory # per-id delta table
91
+ benchmem plot base.json head.json --metric memory # interactive plotly view
92
+ ```
93
+
94
+ ```
95
+ id base.json head.json change (MiB)
96
+ --------------------------------------------------------------------
97
+ test_sort[10000] 0.076 0.078 +2.6%
98
+ test_sort[100000] 0.76 0.74 -2.6%
99
+ test_sort[1000000] 7.63 9.155 +20.0%
100
+ ```
101
+
102
+ Or pull the numbers into your own analysis:
103
+
104
+ ```python
105
+ from pytest_benchmem import from_pytest_benchmark, memory_from_pytest_benchmark
106
+
107
+ _, timing, _ = from_pytest_benchmark("run.json") # seconds, from stats
108
+ _, memory, _ = memory_from_pytest_benchmark("run.json") # MiB, from extra_info
109
+ ```
110
+
111
+ Outside pytest, `measure_peak(lambda: build_model(1000))` is the bare memray
112
+ engine — a one-liner peak number for a REPL or notebook.
113
+
114
+ ## Where it sits
115
+
116
+ Its reason to exist is the gap nothing else fills cleanly: **memray-precision
117
+ memory benchmarking** of your own code, right where you already benchmark. ASV's
118
+ `peakmem` is coarse RSS sampling that misses numpy/C-allocation detail; CodSpeed
119
+ covers CI timing.
120
+
121
+ | Need | Reach for | pytest-benchmem |
122
+ |---|---|---|
123
+ | CI regression, per-PR dashboard | **CodSpeed** | — (don't rebuild it) |
124
+ | Local timing + A/B compare | **pytest-benchmark** | rides it (timing is its job) |
125
+ | Rigorous perf history across commits | **ASV** | — (heavier, RSS memory) |
126
+ | **Precise local peak memory (numpy/C allocs)** | **memray** | ⭐ the core |
127
+ | Memory *in your pytest-benchmark tests* | — | ⭐ fixture **or** `--benchmark-memory` |
128
+ | Same runs across installed versions | — | ⭐ `sweep` |
129
+
130
+ > **Not** a CI dashboard (use [CodSpeed]) and **not** a rigorous perf-history
131
+ > system (use [ASV]). If your core need is *precise local memory* over the
132
+ > benchmarks you already write — timing/sweeps/plots in one vocabulary — that's
133
+ > pytest-benchmem.
134
+
135
+ ## Install
136
+
137
+ ```bash
138
+ uv add pytest-benchmem # the fixture + flag + memray engine
139
+ uv add "pytest-benchmem[plot]" # + the plot/compare CLI (pandas, plotly, typer)
140
+ ```
141
+
142
+ pytest-benchmark and memray are core deps; memray is Linux/macOS only, so on Windows
143
+ the package installs cleanly but is timing-only (the memory pass raises a clear error there).
144
+
145
+ ## Status
146
+
147
+ Early. Extracted from the linopy internal benchmark suite, where it's the local
148
+ memory-profiling layer. API may move before 1.0.
149
+
150
+ [pytest-benchmark]: https://pytest-benchmark.readthedocs.io
151
+ [CodSpeed]: https://codspeed.io
152
+ [ASV]: https://asv.readthedocs.io
@@ -0,0 +1,14 @@
1
+ pytest_benchmem/__init__.py,sha256=yUI5fVb4Neo6KACY3EsU5fauu0uolIsNbvmq_16Q6Cc,1173
2
+ pytest_benchmem/cli.py,sha256=nElgZ7wK8V9dFZEvvhd79PnUh6xv5KXA9LlW_YalS-Q,4076
3
+ pytest_benchmem/compare.py,sha256=yzTeOJFEUMxQQ2l60XF9piJsupbZcsgbBwDNkrQU9eE,1579
4
+ pytest_benchmem/memray.py,sha256=2dW1jNZRqctsnjUcS07FEo8aWfQ0ffBh4SAJqrV_Ow8,2180
5
+ pytest_benchmem/plotting.py,sha256=Lg9cLbpvegdKxngxn4a2QJ5lKZqu3xwR5tSsRzVrgSw,12662
6
+ pytest_benchmem/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
+ pytest_benchmem/pytest_plugin.py,sha256=--m9UB8zgCav3M-VM1exVGtJvjjoSrFY5GSTCQ9ndz0,9681
8
+ pytest_benchmem/snapshot.py,sha256=TB3SeuGYt0ePYp0vuqlkSMvcFO9ZX7DTkVE3UhnmyME,5429
9
+ pytest_benchmem/sweep.py,sha256=VAFB64rmyMbjdRlI7R3A7vsiQKcqQzS9u5Kaz5NSlnY,5260
10
+ pytest_benchmem-0.1.0.dist-info/METADATA,sha256=DfiMmjdVOPKU0FG2pogVF7YDSoqkHwfnaNmNqZh8bsg,5966
11
+ pytest_benchmem-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
12
+ pytest_benchmem-0.1.0.dist-info/entry_points.txt,sha256=v8fZRWsQTl2Tk6opcZ3BDKbqRhJzOBWsco5Isc8FN3M,106
13
+ pytest_benchmem-0.1.0.dist-info/licenses/LICENSE,sha256=jRwycONJR6wDwPOtHRIFKIU7X-EqCXRyOnktVz2UaYs,1069
14
+ pytest_benchmem-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,5 @@
1
+ [console_scripts]
2
+ benchmem = pytest_benchmem.cli:app
3
+
4
+ [pytest11]
5
+ benchmem = pytest_benchmem.pytest_plugin
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Felix Bumann
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.