PyPI - flopscope - Versions diffs - 0.2.0__py3-none-any.whl - Mend

flopscope 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (115) hide show

benchmarks/__init__.py +1 -0
benchmarks/__main__.py +6 -0
benchmarks/_baseline.py +171 -0
benchmarks/_bitwise.py +231 -0
benchmarks/_complex.py +176 -0
benchmarks/_contractions.py +291 -0
benchmarks/_fft.py +198 -0
benchmarks/_impl_urls.py +139 -0
benchmarks/_linalg.py +197 -0
benchmarks/_linalg_delegates.py +407 -0
benchmarks/_metadata.py +141 -0
benchmarks/_misc.py +653 -0
benchmarks/_perf.py +321 -0
benchmarks/_perm_group_calibration.py +175 -0
benchmarks/_pointwise.py +372 -0
benchmarks/_polynomial.py +193 -0
benchmarks/_random.py +209 -0
benchmarks/_reductions.py +136 -0
benchmarks/_sorting.py +289 -0
benchmarks/_stats.py +137 -0
benchmarks/_window.py +92 -0
benchmarks/accumulation/__init__.py +0 -0
benchmarks/accumulation/bench_cost_compute.py +138 -0
benchmarks/dashboard.py +312 -0
benchmarks/runner.py +636 -0
flopscope/__init__.py +273 -0
flopscope/_accumulation/__init__.py +13 -0
flopscope/_accumulation/_bipartite.py +121 -0
flopscope/_accumulation/_burnside.py +51 -0
flopscope/_accumulation/_cache.py +146 -0
flopscope/_accumulation/_components.py +153 -0
flopscope/_accumulation/_cost.py +1414 -0
flopscope/_accumulation/_cost_descriptions.py +63 -0
flopscope/_accumulation/_detection.py +318 -0
flopscope/_accumulation/_ladder.py +191 -0
flopscope/_accumulation/_output_orbit.py +104 -0
flopscope/_accumulation/_partition.py +290 -0
flopscope/_accumulation/_path_info.py +211 -0
flopscope/_accumulation/_public.py +169 -0
flopscope/_accumulation/_reduction.py +310 -0
flopscope/_accumulation/_regimes.py +303 -0
flopscope/_accumulation/_shape.py +33 -0
flopscope/_accumulation/_wreath.py +209 -0
flopscope/_budget.py +1027 -0
flopscope/_config.py +118 -0
flopscope/_counting_ops.py +451 -0
flopscope/_display.py +478 -0
flopscope/_docstrings.py +59 -0
flopscope/_dtypes.py +20 -0
flopscope/_einsum.py +717 -0
flopscope/_errstate.py +25 -0
flopscope/_flops.py +282 -0
flopscope/_free_ops.py +2654 -0
flopscope/_ndarray.py +1126 -0
flopscope/_opt_einsum/LICENSE +21 -0
flopscope/_opt_einsum/NOTICE +59 -0
flopscope/_opt_einsum/__init__.py +209 -0
flopscope/_opt_einsum/_contract.py +1478 -0
flopscope/_opt_einsum/_helpers.py +164 -0
flopscope/_opt_einsum/_hsluv.py +273 -0
flopscope/_opt_einsum/_path_random.py +462 -0
flopscope/_opt_einsum/_paths.py +1653 -0
flopscope/_opt_einsum/_subgraph_symmetry.py +544 -0
flopscope/_opt_einsum/_symmetry.py +140 -0
flopscope/_opt_einsum/_typing.py +37 -0
flopscope/_perm_group.py +717 -0
flopscope/_pointwise.py +2522 -0
flopscope/_polynomial.py +278 -0
flopscope/_registry.py +3216 -0
flopscope/_sorting_ops.py +571 -0
flopscope/_symmetric.py +812 -0
flopscope/_symmetry_transport.py +510 -0
flopscope/_symmetry_utils.py +669 -0
flopscope/_type_info.py +12 -0
flopscope/_unwrap.py +70 -0
flopscope/_validation.py +83 -0
flopscope/_version_check.py +46 -0
flopscope/_weights.py +195 -0
flopscope/_window.py +177 -0
flopscope/accounting.py +565 -0
flopscope/data/default_weights.json +462 -0
flopscope/data/weights.csv +509 -0
flopscope/errors.py +197 -0
flopscope/numpy/__init__.py +878 -0
flopscope/numpy/fft/__init__.py +55 -0
flopscope/numpy/fft/_free.py +51 -0
flopscope/numpy/fft/_transforms.py +695 -0
flopscope/numpy/linalg/__init__.py +105 -0
flopscope/numpy/linalg/_aliases.py +126 -0
flopscope/numpy/linalg/_compound.py +161 -0
flopscope/numpy/linalg/_decompositions.py +353 -0
flopscope/numpy/linalg/_properties.py +533 -0
flopscope/numpy/linalg/_solvers.py +444 -0
flopscope/numpy/linalg/_svd.py +122 -0
flopscope/numpy/random/__init__.py +684 -0
flopscope/numpy/random/_cost_formulas.py +115 -0
flopscope/numpy/random/_counted_classes.py +241 -0
flopscope/numpy/testing/__init__.py +13 -0
flopscope/numpy/typing/__init__.py +30 -0
flopscope/py.typed +0 -0
flopscope/stats/__init__.py +84 -0
flopscope/stats/_base.py +77 -0
flopscope/stats/_cauchy.py +146 -0
flopscope/stats/_erf.py +190 -0
flopscope/stats/_expon.py +146 -0
flopscope/stats/_laplace.py +150 -0
flopscope/stats/_logistic.py +148 -0
flopscope/stats/_lognorm.py +160 -0
flopscope/stats/_ndtri.py +133 -0
flopscope/stats/_norm.py +149 -0
flopscope/stats/_truncnorm.py +186 -0
flopscope/stats/_uniform.py +141 -0
flopscope-0.2.0.dist-info/METADATA +23 -0
flopscope-0.2.0.dist-info/RECORD +115 -0
flopscope-0.2.0.dist-info/WHEEL +4 -0

benchmarks/_linalg.py ADDED Viewed

@@ -0,0 +1,197 @@
+"""Benchmark linear algebra operations."""
+from __future__ import annotations
+import statistics
+from benchmarks._perf import measure_flops
+# Operations benchmarked by this module, in execution order.
+LINALG_OPS: list[str] = [
+    f"linalg.{short}"
+    for short in (
+        "cholesky",
+        "qr",
+        "eig",
+        "eigh",
+        "eigvals",
+        "eigvalsh",
+        "svd",
+        "svdvals",
+        "solve",
+        "inv",
+        "lstsq",
+        "pinv",
+        "det",
+        "slogdet",
+    )
+]
+# Ops that the benchmark feeds symmetric positive-definite inputs.
+_SPD_OPS = {f"linalg.{short}" for short in ("cholesky", "eigh", "eigvalsh")}
+# Human-readable cost formula per op: the QR/SVD-style ops are reported as
+# m*n*min(m,n), every other op as n^3.
+_FORMULA_STRINGS: dict[str, str] = {
+    op: (
+        "m*n*min(m,n)"
+        if op
+        in (
+            "linalg.qr",
+            "linalg.svd",
+            "linalg.svdvals",
+            "linalg.lstsq",
+            "linalg.pinv",
+        )
+        else "n^3"
+    )
+    for op in LINALG_OPS
+}
+def _analytical_cost(op_name: str, n: int) -> int:
+    """Look up the textbook FLOP count of *op_name* for a square input.
+
+    Parameters
+    ----------
+    op_name : str
+        Operation name such as ``"linalg.cholesky"``; only the part after
+        the last dot is used for the lookup.
+    n : int
+        Side length of the (n, n) matrix.
+
+    Returns
+    -------
+    int
+        ``n**3`` for the cubic ops, ``m * n * min(m, n)`` with ``m == n``
+        for the QR/SVD-style ops.
+
+    Raises
+    ------
+    KeyError
+        If the short op name is not one of the known operations.
+    """
+    cubic_ops = (
+        "cholesky",
+        "eig",
+        "eigh",
+        "eigvals",
+        "eigvalsh",
+        "solve",
+        "inv",
+        "det",
+        "slogdet",
+    )
+    rectangular_ops = ("qr", "svd", "svdvals", "lstsq", "pinv")
+    rows = cols = n  # inputs are always square here
+    short = op_name.rsplit(".", 1)[-1]
+    if short in cubic_ops:
+        return n**3
+    if short in rectangular_ops:
+        return rows * cols * min(rows, cols)
+    raise KeyError(short)
+def _setup_variants(n: int, dtype: str, spd: bool) -> list[str]:
+    """Return the three setup scripts that define the input matrix ``A``."""
+    head = "import numpy as np; rng = np.random.default_rng(42); "
+    normal = f"rng.standard_normal(({n}, {n})).astype(np.{dtype})"
+    eye = f"np.eye({n}, dtype=np.{dtype})"
+    if spd:
+        # SPD matrices: A@A.T + n*I (the last variant uses a 100x larger shift).
+        uniform = f"rng.uniform(0.1, 1.0, size=({n}, {n})).astype(np.{dtype})"
+        return [
+            f"{head}_A = {normal}; A = _A @ _A.T + {n} * {eye}",
+            f"{head}_A = {uniform}; A = _A @ _A.T + {n} * {eye}",
+            f"{head}_A = {normal}; A = _A @ _A.T + {n * 100} * {eye}",
+        ]
+    # General, well-conditioned (diagonal shift), ill-conditioned (columns
+    # scaled by a logspace from 1 down to 1e-10).
+    return [
+        f"{head}A = {normal}",
+        f"{head}A = {normal}; A = A + {n} * {eye}",
+        (
+            f"{head}_u = {normal}; "
+            f"_s = np.logspace(0, -10, {n}, dtype=np.{dtype}); "
+            f"A = _u * _s @ _u.T"
+        ),
+    ]
+def benchmark_linalg(
+    n: int = 1024,
+    dtype: str = "float64",
+    repeats: int = 10,
+) -> tuple[dict[str, float], dict[str, dict]]:
+    """Benchmark every op in ``LINALG_OPS`` on (n, n) inputs.
+
+    Each op is measured on three input distributions with ``measure_flops``;
+    the per-repeat measurement is divided by the analytical FLOP count and
+    the median over the distributions is reported.  In perf mode this is
+    actual FP ops / analytical FLOPs (a correction factor); in timing mode
+    it is nanoseconds / analytical FLOPs, which the runner normalizes
+    against a baseline to obtain relative weights.
+
+    Parameters
+    ----------
+    n : int
+        Matrix dimension (n x n).
+    dtype : str
+        NumPy dtype name, interpolated into the scripts as ``np.<dtype>``.
+    repeats : int
+        Number of repetitions per measurement.
+
+    Returns
+    -------
+    tuple[dict[str, float], dict[str, dict]]
+        ``(alphas, details)``.  ``alphas`` maps op name to the median raw
+        measurement per analytical FLOP; ``details`` maps op name to the
+        raw benchmark metadata.  Ops for which every measurement raised
+        ``RuntimeError`` appear in neither dict.
+    """
+    alphas_by_op: dict[str, float] = {}
+    details_by_op: dict[str, dict] = {}
+    for op in LINALG_OPS:
+        variants = _setup_variants(n, dtype, spd=op in _SPD_OPS)
+        # solve / lstsq additionally need a right-hand side vector ``b``.
+        takes_rhs = op in ("linalg.solve", "linalg.lstsq")
+        rhs_setup = f"; b = np.ones({n}, dtype=np.{dtype})" if takes_rhs else ""
+        if op == "linalg.solve":
+            bench = "np.linalg.solve(A, b)"
+        elif op == "linalg.lstsq":
+            bench = "np.linalg.lstsq(A, b, rcond=None)"
+        else:
+            bench = f"np.{op}(A)"
+        analytical = _analytical_cost(op, n)
+        alphas: list[float] = []
+        raw_totals: list[int] = []
+        for variant in variants:
+            try:
+                outcome = measure_flops(variant + rhs_setup, bench, repeats=repeats)
+            except RuntimeError:
+                # Best effort: a failed distribution is simply left out.
+                continue
+            per_call = outcome.total_flops / repeats
+            alphas.append(per_call / analytical if analytical else 0.0)
+            raw_totals.append(outcome.total_flops)
+        if not alphas:
+            continue
+        alphas_by_op[op] = statistics.median(alphas)
+        size_label = f"A: ({n},{n}), b: ({n},)" if takes_rhs else f"A: ({n},{n})"
+        details_by_op[op] = {
+            "category": "counted_custom",
+            "measurement_mode": "blas",
+            "analytical_formula": _FORMULA_STRINGS.get(op, ""),
+            "analytical_flops": analytical,
+            "benchmark_size": size_label,
+            "bench_code": bench,
+            "repeats": repeats,
+            "perf_instructions_total": raw_totals,
+            "distribution_alphas": alphas,
+        }
+    return alphas_by_op, details_by_op

benchmarks/_linalg_delegates.py ADDED Viewed

@@ -0,0 +1,407 @@
+"""Benchmark linalg namespace delegate operations.
+These 15 ops live under ``numpy.linalg.*`` and typically delegate to a
+primary operation (matmul, SVD, solve, ...).  We benchmark them directly
+with perf counters to capture any wrapper overhead.
+"""
+from __future__ import annotations
+import statistics
+from benchmarks._perf import measure_flops
+# Delegate ops benchmarked by this module, named relative to ``numpy``.
+LINALG_DELEGATE_OPS: list[str] = [
+    "linalg.cond",
+    "linalg.cross",
+    "linalg.matmul",
+    "linalg.matrix_norm",
+    "linalg.matrix_power",
+    "linalg.matrix_rank",
+    "linalg.multi_dot",
+    "linalg.norm",
+    "linalg.outer",
+    "linalg.tensordot",
+    "linalg.tensorinv",
+    "linalg.tensorsolve",
+    "linalg.trace",
+    "linalg.vecdot",
+    "linalg.vector_norm",
+]
+# Human-readable cost formula reported for each op in the benchmark details.
+# NOTE(review): several entries differ from the counts hard-coded in
+# ``_analytical_cost`` (e.g. "numel" here vs. 2*numel there for the norms,
+# "MNK" vs. 2*M*N*K - M*N for matmul) -- confirm which convention is intended.
+_FORMULA_STRINGS: dict[str, str] = {
+    "linalg.cond": "m*n*min(m,n)",
+    "linalg.cross": "6*n",
+    "linalg.matmul": "MNK",
+    "linalg.matrix_norm": "numel",
+    "linalg.matrix_power": "(ceil(log2(k))+popcount(k)-1)*n^3",
+    "linalg.matrix_rank": "m*n*min(m,n)",
+    "linalg.multi_dot": "sum of chain MNK costs",
+    "linalg.norm": "numel",
+    "linalg.outer": "M*N",
+    "linalg.tensordot": "product of free * contracted dims",
+    "linalg.tensorinv": "n^3",
+    "linalg.tensorsolve": "n^3",
+    "linalg.trace": "min(m,n)",
+    "linalg.vecdot": "batch*K",
+    "linalg.vector_norm": "numel",
+}
+# NumPy 2.x-only ops — skip gracefully on older versions.
+# ``linalg.trace`` belongs here too: ``numpy.linalg.trace`` was introduced
+# in NumPy 2.0 together with the other array-API additions listed below.
+_NUMPY2_OPS = {
+    "linalg.cross",
+    "linalg.matrix_norm",
+    "linalg.vector_norm",
+    "linalg.outer",
+    "linalg.vecdot",
+    "linalg.matmul",
+    "linalg.tensordot",
+    "linalg.trace",
+}
+def _analytical_cost(op_name: str) -> int:
+    """Return the hard-coded analytical FLOP count for a delegate op.
+
+    Every delegate op is benchmarked at a single fixed problem size, so its
+    cost is a constant rather than a function of an input shape.  The sizes
+    assumed here are the ones used by the setup scripts in ``_op_config``.
+
+    Parameters
+    ----------
+    op_name : str
+        Fully-qualified operation name, e.g. ``"linalg.cond"``; only the
+        part after the last dot is used for the lookup.
+
+    Returns
+    -------
+    int
+        Analytical FLOP count at the op's benchmark size.
+
+    Raises
+    ------
+    KeyError
+        If the short op name has no entry in the cost table.
+    """
+    svd_512 = 512 * 512 * 512  # m*n*min(m,n) via SVD on a (512, 512) input
+    cube_64 = 64**3  # n^3 for a 64x64 problem
+    l2_norm_10m = 2 * 10_000_000  # 2*numel (FMA=2) on 10M elements
+    table: dict[str, int] = {
+        "cond": svd_512,
+        "cross": 6 * 1_000_000,  # 6*n
+        # 2*M*N*K - M*N (FMA=2); = 268,173,312
+        "matmul": 2 * 512 * 512 * 512 - 512 * 512,
+        "matrix_norm": 2 * 512 * 512,  # 2*numel (Frobenius)
+        "matrix_power": 3 * cube_64,  # 3 matmuls for exponent 5
+        "matrix_rank": svd_512,
+        # optimal chain (FMA=2); coincidentally same as FMA=1 value = 2,097,152
+        "multi_dot": 128 * 64 * 128 + 128 * 128 * 64,
+        "norm": l2_norm_10m,  # vector L2
+        "outer": 5000 * 5000,  # M*N
+        # d^5 (FMA=2 textbook; matches flopscope charge = 1,073,741,824)
+        "tensordot": 64**5,
+        "tensorinv": cube_64,  # n^3 after reshape
+        "tensorsolve": cube_64,  # n^3 after reshape
+        "trace": 10_000,  # min(m,n)
+        "vecdot": 1000 * 512,  # batch*K
+        "vector_norm": l2_norm_10m,
+    }
+    return table[op_name.rsplit(".", 1)[-1]]
+# ---------------------------------------------------------------------------
+# Per-op setup / bench code builders
+# ---------------------------------------------------------------------------
+def _op_config(op: str, dtype: str) -> tuple[list[str], str, str]:
+    """Build the setup scripts, benchmark expression and size label for *op*.
+
+    Every op gets three setup scripts (different input distributions or
+    scalings) so that the caller can take a median over them.
+
+    Parameters
+    ----------
+    op : str
+        Delegate op name such as ``"linalg.cond"``; only the part after the
+        last dot selects the configuration.
+    dtype : str
+        NumPy dtype name, interpolated into the scripts as ``np.<dtype>``.
+
+    Returns
+    -------
+    tuple[list[str], str, str]
+        (list of setup strings, benchmark expression, human-readable size)
+
+    Raises
+    ------
+    ValueError
+        If *op* is not a known delegate op.
+    """
+    d = dtype
+    head = "import numpy as np; rng = np.random.default_rng(42); "
+    sq512 = "(512, 512)"
+    sq64 = "(64, 64)"
+
+    def normal(shape: str) -> str:
+        # Expression drawing a standard-normal array of the given shape.
+        return f"rng.standard_normal({shape}).astype(np.{d})"
+
+    def uniform(shape: str, lo: str = "0.1", hi: str = "1.0") -> str:
+        # Expression drawing a uniform array of the given shape.
+        return f"rng.uniform({lo}, {hi}, {shape}).astype(np.{d})"
+
+    def eye(scale: int, size: int) -> str:
+        # Expression for a scaled identity used as a diagonal shift.
+        return f"{scale} * np.eye({size}, dtype=np.{d})"
+
+    def script(*statements: str) -> str:
+        # Full setup script: shared prefix plus "; "-joined statements.
+        return head + "; ".join(statements)
+
+    def single(name: str, shape: str) -> list[str]:
+        # One operand: normal, uniform, normal scaled by 100.
+        return [
+            script(f"{name} = {normal(shape)}"),
+            script(f"{name} = {uniform(shape)}"),
+            script(f"{name} = {normal(shape)} * 100"),
+        ]
+
+    def pair(
+        first: str, second: str, shape: str, lo: str = "0.1", hi: str = "1.0"
+    ) -> list[str]:
+        # Two same-shaped operands: normal, uniform, mismatched scales.
+        return [
+            script(f"{first} = {normal(shape)}", f"{second} = {normal(shape)}"),
+            script(
+                f"{first} = {uniform(shape, lo, hi)}",
+                f"{second} = {uniform(shape, lo, hi)}",
+            ),
+            script(
+                f"{first} = {normal(shape)} * 100",
+                f"{second} = {normal(shape)} * 0.01",
+            ),
+        ]
+
+    def shifted_512() -> list[str]:
+        # (512, 512) input: normal, uniform, normal with a diagonal shift.
+        return [
+            script(f"A = {normal(sq512)}"),
+            script(f"A = {uniform(sq512)}"),
+            script(f"A = {normal(sq512)} + {eye(512, 512)}"),
+        ]
+
+    def power_setups() -> list[str]:
+        # (64, 64) input with a diagonal shift of 64, 64 and 640.
+        return [
+            script(f"A = {normal(sq64)} + {eye(64, 64)}"),
+            script(f"A = {uniform(sq64)} + {eye(64, 64)}"),
+            script(f"A = {normal(sq64)} + {eye(640, 64)}"),
+        ]
+
+    def multi_dot_setups() -> list[str]:
+        outer_shape, inner_shape = "(128, 64)", "(64, 128)"
+        return [
+            script(
+                f"A = {normal(outer_shape)}",
+                f"B = {normal(inner_shape)}",
+                f"C = {normal(outer_shape)}",
+            ),
+            script(
+                f"A = {uniform(outer_shape)}",
+                f"B = {uniform(inner_shape)}",
+                f"C = {uniform(outer_shape)}",
+            ),
+            script(
+                f"A = {normal(outer_shape)} * 100",
+                f"B = {normal(inner_shape)} * 0.01",
+                f"C = {normal(outer_shape)}",
+            ),
+        ]
+
+    def spd_tensor_statements() -> list[list[str]]:
+        # Build an invertible (64,64) matrix via A@A.T + c*I, then reshape
+        # to (8,8,8,8).
+        reshape = "A = _M.reshape(8, 8, 8, 8)"
+        return [
+            [f"_A = {normal(sq64)}", f"_M = _A @ _A.T + {eye(64, 64)}", reshape],
+            [f"_A = {uniform(sq64)}", f"_M = _A @ _A.T + {eye(64, 64)}", reshape],
+            [f"_A = {normal(sq64)}", f"_M = _A @ _A.T + {eye(640, 64)}", reshape],
+        ]
+
+    def tensorinv_setups() -> list[str]:
+        return [script(*stmts) for stmts in spd_tensor_statements()]
+
+    def tensorsolve_setups() -> list[str]:
+        # Same system as tensorinv plus a right-hand side b of shape (8,8).
+        rhs = [
+            f"b = {normal('(8, 8)')}",
+            f"b = {uniform('(8, 8)')}",
+            f"b = {normal('(8, 8)')}",
+        ]
+        return [
+            script(*stmts, b_stmt)
+            for stmts, b_stmt in zip(spd_tensor_statements(), rhs)
+        ]
+
+    def trace_setups() -> list[str]:
+        # Use np.ones instead of random arrays to avoid the setup's random
+        # number generation dominating the measurement. Trace just sums the
+        # diagonal, so the values don't affect the FP instruction count.
+        ones = f"import numpy as np; A = np.ones((10000, 10000), dtype=np.{d})"
+        return [ones, ones + " * 2.5", ones + " * 100"]
+
+    builders = {
+        "cond": lambda: (shifted_512(), "np.linalg.cond(A)", "A: (512,512)"),
+        "cross": lambda: (
+            pair("a", "b", "(1000000, 3)", "-1", "1"),
+            "np.linalg.cross(a, b)",
+            "a: (1000000,3), b: (1000000,3)",
+        ),
+        "matmul": lambda: (
+            pair("A", "B", sq512),
+            "np.linalg.matmul(A, B)",
+            "A: (512,512), B: (512,512)",
+        ),
+        "matrix_norm": lambda: (
+            single("A", sq512),
+            "np.linalg.matrix_norm(A)",
+            "A: (512,512)",
+        ),
+        "matrix_power": lambda: (
+            power_setups(),
+            "np.linalg.matrix_power(A, 5)",
+            "A: (64,64), n=5",
+        ),
+        "matrix_rank": lambda: (
+            shifted_512(),
+            "np.linalg.matrix_rank(A)",
+            "A: (512,512)",
+        ),
+        "multi_dot": lambda: (
+            multi_dot_setups(),
+            "np.linalg.multi_dot([A, B, C])",
+            "A: (128,64), B: (64,128), C: (128,64)",
+        ),
+        "norm": lambda: (
+            single("x", "10000000"),
+            "np.linalg.norm(x)",
+            "x: (10000000,)",
+        ),
+        "outer": lambda: (
+            pair("a", "b", "5000"),
+            "np.linalg.outer(a, b)",
+            "a: (5000,), b: (5000,)",
+        ),
+        "tensordot": lambda: (
+            pair("A", "B", "(64, 64, 64)"),
+            "np.linalg.tensordot(A, B, axes=1)",
+            "A: (64,64,64), B: (64,64,64)",
+        ),
+        "tensorinv": lambda: (
+            tensorinv_setups(),
+            "np.linalg.tensorinv(A, ind=2)",
+            "A: (8,8,8,8)",
+        ),
+        "tensorsolve": lambda: (
+            tensorsolve_setups(),
+            "np.linalg.tensorsolve(A, b)",
+            "A: (8,8,8,8), b: (8,8)",
+        ),
+        "trace": lambda: (trace_setups(), "np.linalg.trace(A)", "A: (10000,10000)"),
+        "vecdot": lambda: (
+            pair("A", "B", "(1000, 512)"),
+            "np.linalg.vecdot(A, B)",
+            "A: (1000,512), B: (1000,512)",
+        ),
+        "vector_norm": lambda: (
+            single("x", "10000000"),
+            "np.linalg.vector_norm(x)",
+            "x: (10000000,)",
+        ),
+    }
+    build = builders.get(op.split(".")[-1])
+    if build is None:
+        raise ValueError(f"Unknown delegate op: {op}")
+    return build()
+# ---------------------------------------------------------------------------
+# Main benchmark entry point
+# ---------------------------------------------------------------------------
+def benchmark_linalg_delegates(
+    dtype: str = "float64",
+    repeats: int = 10,
+) -> tuple[dict[str, float], dict[str, dict]]:
+    """Benchmark every op in ``LINALG_DELEGATE_OPS`` with ``measure_flops``.
+
+    Ops listed in ``_NUMPY2_OPS`` are skipped when the installed NumPy does
+    not provide them.  Each remaining op is measured on the setup scripts
+    from ``_op_config``; the per-repeat measurement is divided by the op's
+    analytical FLOP count and the median over the setups is reported.
+
+    Parameters
+    ----------
+    dtype : str
+        NumPy dtype name passed through to ``_op_config``.
+    repeats : int
+        Number of repetitions per measurement.
+
+    Returns
+    -------
+    tuple[dict[str, float], dict[str, dict]]
+        (alphas, details) — same schema as ``benchmark_linalg``.  Ops for
+        which every measurement raised ``RuntimeError`` appear in neither.
+    """
+    alphas_by_op: dict[str, float] = {}
+    details_by_op: dict[str, dict] = {}
+    for op in LINALG_DELEGATE_OPS:
+        if op in _NUMPY2_OPS:
+            # Resolve the attribute path under numpy.linalg; a missing
+            # attribute means this NumPy version lacks the op.
+            try:
+                import numpy as np  # noqa: F811
+                target = np.linalg
+                for attr in op.split(".")[1:]:
+                    target = getattr(target, attr)
+            except AttributeError:
+                continue
+        variants, bench, size_label = _op_config(op, dtype)
+        analytical = _analytical_cost(op)
+        alphas: list[float] = []
+        raw_totals: list[int] = []
+        for variant in variants:
+            try:
+                outcome = measure_flops(variant, bench, repeats=repeats)
+            except RuntimeError:
+                # Best effort: a failed distribution is simply left out.
+                continue
+            per_call = outcome.total_flops / repeats
+            alphas.append(per_call / analytical if analytical else 0.0)
+            raw_totals.append(outcome.total_flops)
+        if not alphas:
+            continue
+        alphas_by_op[op] = statistics.median(alphas)
+        details_by_op[op] = {
+            "category": "counted_custom",
+            "measurement_mode": "blas",
+            "analytical_formula": _FORMULA_STRINGS.get(op, ""),
+            "analytical_flops": analytical,
+            "benchmark_size": size_label,
+            "bench_code": bench,
+            "repeats": repeats,
+            "perf_instructions_total": raw_totals,
+            "distribution_alphas": alphas,
+        }
+    return alphas_by_op, details_by_op