flopscope 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. benchmarks/__init__.py +1 -0
  2. benchmarks/__main__.py +6 -0
  3. benchmarks/_baseline.py +171 -0
  4. benchmarks/_bitwise.py +231 -0
  5. benchmarks/_complex.py +176 -0
  6. benchmarks/_contractions.py +291 -0
  7. benchmarks/_fft.py +198 -0
  8. benchmarks/_impl_urls.py +139 -0
  9. benchmarks/_linalg.py +197 -0
  10. benchmarks/_linalg_delegates.py +407 -0
  11. benchmarks/_metadata.py +141 -0
  12. benchmarks/_misc.py +653 -0
  13. benchmarks/_perf.py +321 -0
  14. benchmarks/_perm_group_calibration.py +175 -0
  15. benchmarks/_pointwise.py +372 -0
  16. benchmarks/_polynomial.py +193 -0
  17. benchmarks/_random.py +209 -0
  18. benchmarks/_reductions.py +136 -0
  19. benchmarks/_sorting.py +289 -0
  20. benchmarks/_stats.py +137 -0
  21. benchmarks/_window.py +92 -0
  22. benchmarks/accumulation/__init__.py +0 -0
  23. benchmarks/accumulation/bench_cost_compute.py +138 -0
  24. benchmarks/dashboard.py +312 -0
  25. benchmarks/runner.py +636 -0
  26. flopscope/__init__.py +273 -0
  27. flopscope/_accumulation/__init__.py +13 -0
  28. flopscope/_accumulation/_bipartite.py +121 -0
  29. flopscope/_accumulation/_burnside.py +51 -0
  30. flopscope/_accumulation/_cache.py +146 -0
  31. flopscope/_accumulation/_components.py +153 -0
  32. flopscope/_accumulation/_cost.py +1414 -0
  33. flopscope/_accumulation/_cost_descriptions.py +63 -0
  34. flopscope/_accumulation/_detection.py +318 -0
  35. flopscope/_accumulation/_ladder.py +191 -0
  36. flopscope/_accumulation/_output_orbit.py +104 -0
  37. flopscope/_accumulation/_partition.py +290 -0
  38. flopscope/_accumulation/_path_info.py +211 -0
  39. flopscope/_accumulation/_public.py +169 -0
  40. flopscope/_accumulation/_reduction.py +310 -0
  41. flopscope/_accumulation/_regimes.py +303 -0
  42. flopscope/_accumulation/_shape.py +33 -0
  43. flopscope/_accumulation/_wreath.py +209 -0
  44. flopscope/_budget.py +1027 -0
  45. flopscope/_config.py +118 -0
  46. flopscope/_counting_ops.py +451 -0
  47. flopscope/_display.py +478 -0
  48. flopscope/_docstrings.py +59 -0
  49. flopscope/_dtypes.py +20 -0
  50. flopscope/_einsum.py +717 -0
  51. flopscope/_errstate.py +25 -0
  52. flopscope/_flops.py +282 -0
  53. flopscope/_free_ops.py +2654 -0
  54. flopscope/_ndarray.py +1126 -0
  55. flopscope/_opt_einsum/LICENSE +21 -0
  56. flopscope/_opt_einsum/NOTICE +59 -0
  57. flopscope/_opt_einsum/__init__.py +209 -0
  58. flopscope/_opt_einsum/_contract.py +1478 -0
  59. flopscope/_opt_einsum/_helpers.py +164 -0
  60. flopscope/_opt_einsum/_hsluv.py +273 -0
  61. flopscope/_opt_einsum/_path_random.py +462 -0
  62. flopscope/_opt_einsum/_paths.py +1653 -0
  63. flopscope/_opt_einsum/_subgraph_symmetry.py +544 -0
  64. flopscope/_opt_einsum/_symmetry.py +140 -0
  65. flopscope/_opt_einsum/_typing.py +37 -0
  66. flopscope/_perm_group.py +717 -0
  67. flopscope/_pointwise.py +2522 -0
  68. flopscope/_polynomial.py +278 -0
  69. flopscope/_registry.py +3216 -0
  70. flopscope/_sorting_ops.py +571 -0
  71. flopscope/_symmetric.py +812 -0
  72. flopscope/_symmetry_transport.py +510 -0
  73. flopscope/_symmetry_utils.py +669 -0
  74. flopscope/_type_info.py +12 -0
  75. flopscope/_unwrap.py +70 -0
  76. flopscope/_validation.py +83 -0
  77. flopscope/_version_check.py +46 -0
  78. flopscope/_weights.py +195 -0
  79. flopscope/_window.py +177 -0
  80. flopscope/accounting.py +565 -0
  81. flopscope/data/default_weights.json +462 -0
  82. flopscope/data/weights.csv +509 -0
  83. flopscope/errors.py +197 -0
  84. flopscope/numpy/__init__.py +878 -0
  85. flopscope/numpy/fft/__init__.py +55 -0
  86. flopscope/numpy/fft/_free.py +51 -0
  87. flopscope/numpy/fft/_transforms.py +695 -0
  88. flopscope/numpy/linalg/__init__.py +105 -0
  89. flopscope/numpy/linalg/_aliases.py +126 -0
  90. flopscope/numpy/linalg/_compound.py +161 -0
  91. flopscope/numpy/linalg/_decompositions.py +353 -0
  92. flopscope/numpy/linalg/_properties.py +533 -0
  93. flopscope/numpy/linalg/_solvers.py +444 -0
  94. flopscope/numpy/linalg/_svd.py +122 -0
  95. flopscope/numpy/random/__init__.py +684 -0
  96. flopscope/numpy/random/_cost_formulas.py +115 -0
  97. flopscope/numpy/random/_counted_classes.py +241 -0
  98. flopscope/numpy/testing/__init__.py +13 -0
  99. flopscope/numpy/typing/__init__.py +30 -0
  100. flopscope/py.typed +0 -0
  101. flopscope/stats/__init__.py +84 -0
  102. flopscope/stats/_base.py +77 -0
  103. flopscope/stats/_cauchy.py +146 -0
  104. flopscope/stats/_erf.py +190 -0
  105. flopscope/stats/_expon.py +146 -0
  106. flopscope/stats/_laplace.py +150 -0
  107. flopscope/stats/_logistic.py +148 -0
  108. flopscope/stats/_lognorm.py +160 -0
  109. flopscope/stats/_ndtri.py +133 -0
  110. flopscope/stats/_norm.py +149 -0
  111. flopscope/stats/_truncnorm.py +186 -0
  112. flopscope/stats/_uniform.py +141 -0
  113. flopscope-0.2.0.dist-info/METADATA +23 -0
  114. flopscope-0.2.0.dist-info/RECORD +115 -0
  115. flopscope-0.2.0.dist-info/WHEEL +4 -0
benchmarks/_linalg.py ADDED
@@ -0,0 +1,197 @@
1
+ """Benchmark linear algebra operations."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import statistics
6
+
7
+ from benchmarks._perf import measure_flops
8
+
9
# Every benchmarked op, in the order in which it is measured and reported.
LINALG_OPS: list[str] = [
    f"linalg.{short}"
    for short in (
        "cholesky",
        "qr",
        "eig",
        "eigh",
        "eigvals",
        "eigvalsh",
        "svd",
        "svdvals",
        "solve",
        "inv",
        "lstsq",
        "pinv",
        "det",
        "slogdet",
    )
]

# Ops that are benchmarked on symmetric positive-definite inputs.
_SPD_OPS = {"linalg." + short for short in ("cholesky", "eigh", "eigvalsh")}

# Human-readable cost formula per op: the rectangular ops use m*n*min(m,n),
# every other op uses n^3.  Keys follow the order of LINALG_OPS.
_FORMULA_STRINGS: dict[str, str] = {
    op: (
        "m*n*min(m,n)"
        if op.split(".")[-1] in ("qr", "svd", "svdvals", "lstsq", "pinv")
        else "n^3"
    )
    for op in LINALG_OPS
}
45
+
46
+
47
def _analytical_cost(op_name: str, n: int) -> int:
    """Compute the textbook FLOP count of *op_name* for a square (n, n) matrix.

    Parameters
    ----------
    op_name : str
        Operation name (e.g. ``"linalg.cholesky"``); only the part after the
        last dot is used for the lookup.
    n : int
        Matrix dimension.

    Returns
    -------
    int
        Analytical FLOP count.

    Raises
    ------
    KeyError
        If the last component of *op_name* is not one of the known ops.
    """
    leaf = op_name.rsplit(".", 1)[-1]

    cubic = n**3
    # The rectangular formula m*n*min(m,n), evaluated for a square input.
    rectangular = n * n * min(n, n)

    table: dict[str, int] = dict.fromkeys(
        (
            "cholesky",
            "eig",
            "eigh",
            "eigvals",
            "eigvalsh",
            "solve",
            "inv",
            "det",
            "slogdet",
        ),
        cubic,
    )
    table.update(
        dict.fromkeys(("qr", "svd", "svdvals", "lstsq", "pinv"), rectangular)
    )
    return table[leaf]
81
+
82
+
83
def _spd_setups(n: int, dtype: str) -> list[str]:
    """Return the three setup snippets that define ``A`` as ``_A @ _A.T`` plus a diagonal shift."""
    return [
        (
            "import numpy as np; rng = np.random.default_rng(42); "
            f"_A = rng.standard_normal(({n}, {n})).astype(np.{dtype}); "
            f"A = _A @ _A.T + {n} * np.eye({n}, dtype=np.{dtype})"
        ),
        (
            "import numpy as np; rng = np.random.default_rng(42); "
            f"_A = rng.uniform(0.1, 1.0, size=({n}, {n})).astype(np.{dtype}); "
            f"A = _A @ _A.T + {n} * np.eye({n}, dtype=np.{dtype})"
        ),
        (
            "import numpy as np; rng = np.random.default_rng(42); "
            f"_A = rng.standard_normal(({n}, {n})).astype(np.{dtype}); "
            f"A = _A @ _A.T + {n * 100} * np.eye({n}, dtype=np.{dtype})"
        ),
    ]


def _general_setups(n: int, dtype: str) -> list[str]:
    """Return the general, well-conditioned and ill-conditioned setup snippets for ``A``."""
    return [
        (
            "import numpy as np; rng = np.random.default_rng(42); "
            f"A = rng.standard_normal(({n}, {n})).astype(np.{dtype})"
        ),
        (
            "import numpy as np; rng = np.random.default_rng(42); "
            f"A = rng.standard_normal(({n}, {n})).astype(np.{dtype}); "
            f"A = A + {n} * np.eye({n}, dtype=np.{dtype})"
        ),
        (
            "import numpy as np; rng = np.random.default_rng(42); "
            f"_u = rng.standard_normal(({n}, {n})).astype(np.{dtype}); "
            f"_s = np.logspace(0, -10, {n}, dtype=np.{dtype}); "
            # ``*`` and ``@`` share one precedence level and group left to
            # right, so this evaluates as ``(_u * _s) @ _u.T``.
            "A = _u * _s @ _u.T"
        ),
    ]


def benchmark_linalg(
    n: int = 1024,
    dtype: str = "float64",
    repeats: int = 10,
) -> tuple[dict[str, float], dict[str, dict]]:
    """Benchmark linalg ops, returning raw measurement per analytical FLOP.

    In perf mode this is actual FP ops / analytical FLOPs (correction factor).
    In timing mode this is nanoseconds / analytical FLOPs (same units as
    pointwise — the runner normalizes against baseline to get relative weights).

    Each op is measured on three input variants; a variant whose measurement
    raises ``RuntimeError`` is skipped, and an op for which every variant
    failed is left out of both returned dicts.

    Parameters
    ----------
    n : int
        Matrix dimension (n x n).
    dtype : str
        NumPy dtype string, interpolated as ``np.<dtype>`` into the setup code.
    repeats : int
        Number of repetitions per measurement. Must be at least 1.

    Returns
    -------
    tuple[dict[str, float], dict[str, dict]]
        A pair of (alphas, details). ``alphas`` maps op name to median
        raw measurement per analytical FLOP. ``details`` maps op name to
        a dict of raw benchmark metadata.

    Raises
    ------
    ValueError
        If *repeats* is smaller than 1.
    """
    # Fail early and clearly: the per-repeat average below divides by
    # ``repeats``, which would otherwise surface as a ZeroDivisionError only
    # after a measurement had already been taken.
    if repeats < 1:
        raise ValueError(f"repeats must be at least 1, got {repeats!r}")

    # The input variants depend only on (n, dtype), so build them once
    # instead of once per op.
    spd_setups = _spd_setups(n, dtype)
    general_setups = _general_setups(n, dtype)
    rhs_suffix = f"; b = np.ones({n}, dtype=np.{dtype})"

    results: dict[str, float] = {}
    details: dict[str, dict] = {}

    for op in LINALG_OPS:
        setups = spd_setups if op in _SPD_OPS else general_setups

        # solve/lstsq additionally need a right-hand side defined by the setup.
        if op == "linalg.solve":
            bench_suffix = rhs_suffix
            bench = "np.linalg.solve(A, b)"
        elif op == "linalg.lstsq":
            bench_suffix = rhs_suffix
            bench = "np.linalg.lstsq(A, b, rcond=None)"
        else:
            bench_suffix = ""
            bench = f"np.{op}(A)"

        analytical = _analytical_cost(op, n)

        dist_values: list[float] = []
        dist_raw_totals: list[int] = []
        for setup in setups:
            try:
                result = measure_flops(setup + bench_suffix, bench, repeats=repeats)
            except RuntimeError:
                # Best effort: drop this variant and keep the remaining ones.
                continue
            measured = result.total_flops / repeats
            # An analytical cost of 0 (n == 0) would divide by zero.
            dist_values.append(measured / analytical if analytical else 0.0)
            dist_raw_totals.append(result.total_flops)

        if not dist_values:
            continue

        results[op] = statistics.median(dist_values)
        if op in ("linalg.solve", "linalg.lstsq"):
            bm_size = f"A: ({n},{n}), b: ({n},)"
        else:
            bm_size = f"A: ({n},{n})"
        details[op] = {
            "category": "counted_custom",
            "measurement_mode": "blas",
            "analytical_formula": _FORMULA_STRINGS.get(op, ""),
            "analytical_flops": analytical,
            "benchmark_size": bm_size,
            "bench_code": bench,
            "repeats": repeats,
            "perf_instructions_total": dist_raw_totals,
            "distribution_alphas": dist_values,
        }

    return results, details
benchmarks/_linalg_delegates.py ADDED
@@ -0,0 +1,407 @@
1
+ """Benchmark linalg namespace delegate operations.
2
+
3
+ These 15 ops live under ``numpy.linalg.*`` and typically delegate to a
4
+ primary operation (matmul, SVD, solve, ...). We benchmark them directly
5
+ with perf counters to capture any wrapper overhead.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import statistics
11
+
12
+ from benchmarks._perf import measure_flops
13
+
14
# Human-readable cost formula for every delegate op, in benchmark order.
# NOTE(review): the matmul, norm, matrix_norm and vector_norm strings omit the
# factor 2 that ``_analytical_cost`` applies, and the matrix_power formula
# evaluates to 4 matmuls for k=5 where ``_analytical_cost`` uses 3 — confirm
# which convention is intended before relying on these strings.
_FORMULA_STRINGS: dict[str, str] = {
    f"linalg.{short}": formula
    for short, formula in (
        ("cond", "m*n*min(m,n)"),
        ("cross", "6*n"),
        ("matmul", "MNK"),
        ("matrix_norm", "numel"),
        ("matrix_power", "(ceil(log2(k))+popcount(k)-1)*n^3"),
        ("matrix_rank", "m*n*min(m,n)"),
        ("multi_dot", "sum of chain MNK costs"),
        ("norm", "numel"),
        ("outer", "M*N"),
        ("tensordot", "product of free * contracted dims"),
        ("tensorinv", "n^3"),
        ("tensorsolve", "n^3"),
        ("trace", "min(m,n)"),
        ("vecdot", "batch*K"),
        ("vector_norm", "numel"),
    )
}

# The ops to benchmark: exactly the keys above, in the same order.
LINALG_DELEGATE_OPS: list[str] = list(_FORMULA_STRINGS)
49
+
50
# NumPy 2.x-only ops — skip gracefully on older versions.
# ``linalg.trace`` belongs here as well: like the other entries it was added
# to the ``numpy.linalg`` namespace in NumPy 2.0 and is absent from NumPy 1.x.
_NUMPY2_OPS = {
    "linalg.cross",
    "linalg.matrix_norm",
    "linalg.vector_norm",
    "linalg.outer",
    "linalg.vecdot",
    "linalg.matmul",
    "linalg.tensordot",
    "linalg.trace",
}
60
+
61
+
62
def _analytical_cost(op_name: str) -> int:
    """Return the analytical FLOP count for *op_name* at its fixed benchmark size.

    The counts are hard-coded for the input sizes that ``_op_config`` builds
    for each op.

    Parameters
    ----------
    op_name : str
        Fully-qualified operation name, e.g. ``"linalg.cond"``; only the part
        after the last dot is used for the lookup.

    Returns
    -------
    int
        Analytical FLOP count.

    Raises
    ------
    KeyError
        If the last component of *op_name* is not a known delegate op.
    """
    leaf = op_name.rsplit(".", 1)[-1]

    svd_512 = 512 * 512 * 512  # m*n*min(m,n) via SVD
    vector_l2 = 2 * 10_000_000  # 2*numel (FMA=2)
    reshaped_64 = 64**3  # n^3 after reshape

    table: dict[str, int] = {
        "cond": svd_512,
        "cross": 6 * 1_000_000,  # 6*n
        # 2*M*N*K - M*N (FMA=2); = 268,173,312
        "matmul": 2 * 512 * 512 * 512 - 512 * 512,
        "matrix_norm": 2 * 512 * 512,  # 2*numel (Frobenius)
        "matrix_power": 3 * 64**3,  # 3 matmuls for n=5
        "matrix_rank": svd_512,
        # optimal chain (FMA=2); coincidentally same as FMA=1 value = 2,097,152
        "multi_dot": 128 * 64 * 128 + 128 * 128 * 64,
        "norm": vector_l2,
        "outer": 5000 * 5000,  # M*N
        # d^5 (FMA=2 textbook; matches flopscope charge = 1,073,741,824)
        "tensordot": 64**5,
        "tensorinv": reshaped_64,
        "tensorsolve": reshaped_64,
        "trace": 10_000,  # min(m,n)
        "vecdot": 1000 * 512,  # batch*K
        "vector_norm": vector_l2,
    }
    return table[leaf]
102
+
103
+
104
+ # ---------------------------------------------------------------------------
105
+ # Per-op setup / bench code builders
106
+ # ---------------------------------------------------------------------------
107
+
108
+
109
def _op_config(op: str, dtype: str) -> tuple[list[str], str, str]:
    """Build the setup snippets, benchmark expression and size label for *op*.

    Each op gets 3 setup variants (distributions) to take the median over.

    Parameters
    ----------
    op : str
        Delegate op name such as ``"linalg.cond"``; only the part after the
        last dot selects the configuration.
    dtype : str
        NumPy dtype name, interpolated as ``np.<dtype>`` into the setup code.

    Returns
    -------
    tuple[list[str], str, str]
        (list of setup strings, benchmark expression, human-readable size)

    Raises
    ------
    ValueError
        If *op* does not name a known delegate op.
    """
    leaf = op.rsplit(".", 1)[-1]
    seeded = "import numpy as np; rng = np.random.default_rng(42); "
    cast = f".astype(np.{dtype})"

    # --- expression fragments -------------------------------------------
    def gauss(shape: str) -> str:
        return f"rng.standard_normal({shape}){cast}"

    def unif(shape: str, bounds: str = "0.1, 1.0") -> str:
        return f"rng.uniform({bounds}, {shape}){cast}"

    def eye(scale: int, size: int) -> str:
        return f"{scale} * np.eye({size}, dtype=np.{dtype})"

    # --- families of three setup variants --------------------------------
    def scaled(name: str, shape: str) -> list[str]:
        # One operand: normal, uniform, normal scaled by 100.
        return [
            f"{seeded}{name} = {gauss(shape)}",
            f"{seeded}{name} = {unif(shape)}",
            f"{seeded}{name} = {gauss(shape)} * 100",
        ]

    def paired(lhs: str, rhs: str, shape: str, bounds: str = "0.1, 1.0") -> list[str]:
        # Two same-shaped operands: normal, uniform, mismatched magnitudes.
        return [
            f"{seeded}{lhs} = {gauss(shape)}; {rhs} = {gauss(shape)}",
            f"{seeded}{lhs} = {unif(shape, bounds)}; {rhs} = {unif(shape, bounds)}",
            f"{seeded}{lhs} = {gauss(shape)} * 100; {rhs} = {gauss(shape)} * 0.01",
        ]

    def shifted_512() -> list[str]:
        # (512, 512) matrix: normal, uniform, normal plus 512 * I.
        shape = "(512, 512)"
        return [
            f"{seeded}A = {gauss(shape)}",
            f"{seeded}A = {unif(shape)}",
            f"{seeded}A = {gauss(shape)} + {eye(512, 512)}",
        ]

    def power_inputs() -> list[str]:
        # (64, 64) matrix, always with a diagonal shift.
        shape = "(64, 64)"
        return [
            f"{seeded}A = {gauss(shape)} + {eye(64, 64)}",
            f"{seeded}A = {unif(shape)} + {eye(64, 64)}",
            f"{seeded}A = {gauss(shape)} + {eye(640, 64)}",
        ]

    def chain_inputs() -> list[str]:
        # Three-matrix chain; in the last variant C is left unscaled.
        outer_shape, inner_shape = "(128, 64)", "(64, 128)"
        return [
            f"{seeded}A = {gauss(outer_shape)}; B = {gauss(inner_shape)}; "
            f"C = {gauss(outer_shape)}",
            f"{seeded}A = {unif(outer_shape)}; B = {unif(inner_shape)}; "
            f"C = {unif(outer_shape)}",
            f"{seeded}A = {gauss(outer_shape)} * 100; "
            f"B = {gauss(inner_shape)} * 0.01; C = {gauss(outer_shape)}",
        ]

    def reshaped_spd(with_rhs: bool) -> list[str]:
        # Build an invertible (64,64) matrix via A@A.T + shift*I, reshape it
        # to (8,8,8,8) and optionally add a right-hand side b of shape (8,8).
        variants = []
        for sampler, shift in ((gauss, 64), (unif, 64), (gauss, 640)):
            code = (
                f"{seeded}_A = {sampler('(64, 64)')}; "
                f"_M = _A @ _A.T + {eye(shift, 64)}; "
                "A = _M.reshape(8, 8, 8, 8)"
            )
            if with_rhs:
                code += f"; b = {sampler('(8, 8)')}"
            variants.append(code)
        return variants

    def ones_inputs() -> list[str]:
        # Use np.ones instead of random arrays to avoid the setup's random
        # number generation dominating the measurement. Trace just sums the
        # diagonal — the values don't affect the FP instruction count.
        base = f"import numpy as np; A = np.ones((10000, 10000), dtype=np.{dtype})"
        return [base, base + " * 2.5", base + " * 100"]

    # --- dispatch ----------------------------------------------------------
    # Lambdas keep construction lazy: only the requested op's strings are built.
    builders = {
        "cond": lambda: (shifted_512(), "np.linalg.cond(A)", "A: (512,512)"),
        "cross": lambda: (
            paired("a", "b", "(1000000, 3)", "-1, 1"),
            "np.linalg.cross(a, b)",
            "a: (1000000,3), b: (1000000,3)",
        ),
        "matmul": lambda: (
            paired("A", "B", "(512, 512)"),
            "np.linalg.matmul(A, B)",
            "A: (512,512), B: (512,512)",
        ),
        "matrix_norm": lambda: (
            scaled("A", "(512, 512)"),
            "np.linalg.matrix_norm(A)",
            "A: (512,512)",
        ),
        "matrix_power": lambda: (
            power_inputs(),
            "np.linalg.matrix_power(A, 5)",
            "A: (64,64), n=5",
        ),
        "matrix_rank": lambda: (
            shifted_512(),
            "np.linalg.matrix_rank(A)",
            "A: (512,512)",
        ),
        "multi_dot": lambda: (
            chain_inputs(),
            "np.linalg.multi_dot([A, B, C])",
            "A: (128,64), B: (64,128), C: (128,64)",
        ),
        "norm": lambda: (
            scaled("x", "10000000"),
            "np.linalg.norm(x)",
            "x: (10000000,)",
        ),
        "outer": lambda: (
            paired("a", "b", "5000"),
            "np.linalg.outer(a, b)",
            "a: (5000,), b: (5000,)",
        ),
        "tensordot": lambda: (
            paired("A", "B", "(64, 64, 64)"),
            "np.linalg.tensordot(A, B, axes=1)",
            "A: (64,64,64), B: (64,64,64)",
        ),
        "tensorinv": lambda: (
            reshaped_spd(False),
            "np.linalg.tensorinv(A, ind=2)",
            "A: (8,8,8,8)",
        ),
        "tensorsolve": lambda: (
            reshaped_spd(True),
            "np.linalg.tensorsolve(A, b)",
            "A: (8,8,8,8), b: (8,8)",
        ),
        "trace": lambda: (ones_inputs(), "np.linalg.trace(A)", "A: (10000,10000)"),
        "vecdot": lambda: (
            paired("A", "B", "(1000, 512)"),
            "np.linalg.vecdot(A, B)",
            "A: (1000,512), B: (1000,512)",
        ),
        "vector_norm": lambda: (
            scaled("x", "10000000"),
            "np.linalg.vector_norm(x)",
            "x: (10000000,)",
        ),
    }

    build = builders.get(leaf)
    if build is None:
        raise ValueError(f"Unknown delegate op: {op}")
    return build()
345
+
346
+
347
+ # ---------------------------------------------------------------------------
348
+ # Main benchmark entry point
349
+ # ---------------------------------------------------------------------------
350
+
351
+
352
def benchmark_linalg_delegates(
    dtype: str = "float64",
    repeats: int = 10,
) -> tuple[dict[str, float], dict[str, dict]]:
    """Benchmark linalg delegate ops via perf counters.

    Each op is measured on the three setup variants returned by
    ``_op_config``. A variant whose measurement raises ``RuntimeError`` is
    skipped; an op with no successful variant, or one that the installed
    NumPy does not provide, is left out of both returned dicts.

    Parameters
    ----------
    dtype : str
        NumPy dtype string, forwarded to ``_op_config``.
    repeats : int
        Number of repetitions per measurement. Must be at least 1.

    Returns
    -------
    tuple[dict[str, float], dict[str, dict]]
        (alphas, details) — same schema as ``benchmark_linalg``.

    Raises
    ------
    ValueError
        If *repeats* is smaller than 1.
    """
    # Fail early and clearly: the per-repeat average below divides by
    # ``repeats``, which would otherwise surface as a ZeroDivisionError only
    # after a measurement had already been taken.
    if repeats < 1:
        raise ValueError(f"repeats must be at least 1, got {repeats!r}")

    # Imported once here instead of on every loop iteration; it is only used
    # to probe which ops the installed NumPy provides.
    import numpy as np

    results: dict[str, float] = {}
    details: dict[str, dict] = {}

    for op in LINALG_DELEGATE_OPS:
        # Skip ops that don't exist in this NumPy version.  Every op is
        # probed, not just those in a hand-maintained list, so a NumPy 2.x
        # addition that the list omits (e.g. ``linalg.trace``) is also
        # skipped cleanly on older versions.
        if not hasattr(np.linalg, op.split(".")[-1]):
            continue

        setups, bench, bm_size = _op_config(op, dtype)
        analytical = _analytical_cost(op)

        dist_values: list[float] = []
        dist_raw_totals: list[int] = []
        for setup in setups:
            try:
                result = measure_flops(setup, bench, repeats=repeats)
            except RuntimeError:
                # Best effort: drop this variant and keep the remaining ones.
                continue
            measured = result.total_flops / repeats
            dist_values.append(measured / analytical if analytical else 0.0)
            dist_raw_totals.append(result.total_flops)

        if not dist_values:
            continue

        results[op] = statistics.median(dist_values)
        details[op] = {
            "category": "counted_custom",
            "measurement_mode": "blas",
            "analytical_formula": _FORMULA_STRINGS.get(op, ""),
            "analytical_flops": analytical,
            "benchmark_size": bm_size,
            "bench_code": bench,
            "repeats": repeats,
            "perf_instructions_total": dist_raw_totals,
            "distribution_alphas": dist_values,
        }

    return results, details
+ return results, details