flopscope 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- benchmarks/__init__.py +1 -0
- benchmarks/__main__.py +6 -0
- benchmarks/_baseline.py +171 -0
- benchmarks/_bitwise.py +231 -0
- benchmarks/_complex.py +176 -0
- benchmarks/_contractions.py +291 -0
- benchmarks/_fft.py +198 -0
- benchmarks/_impl_urls.py +139 -0
- benchmarks/_linalg.py +197 -0
- benchmarks/_linalg_delegates.py +407 -0
- benchmarks/_metadata.py +141 -0
- benchmarks/_misc.py +653 -0
- benchmarks/_perf.py +321 -0
- benchmarks/_perm_group_calibration.py +175 -0
- benchmarks/_pointwise.py +372 -0
- benchmarks/_polynomial.py +193 -0
- benchmarks/_random.py +209 -0
- benchmarks/_reductions.py +136 -0
- benchmarks/_sorting.py +289 -0
- benchmarks/_stats.py +137 -0
- benchmarks/_window.py +92 -0
- benchmarks/accumulation/__init__.py +0 -0
- benchmarks/accumulation/bench_cost_compute.py +138 -0
- benchmarks/dashboard.py +312 -0
- benchmarks/runner.py +636 -0
- flopscope/__init__.py +273 -0
- flopscope/_accumulation/__init__.py +13 -0
- flopscope/_accumulation/_bipartite.py +121 -0
- flopscope/_accumulation/_burnside.py +51 -0
- flopscope/_accumulation/_cache.py +146 -0
- flopscope/_accumulation/_components.py +153 -0
- flopscope/_accumulation/_cost.py +1414 -0
- flopscope/_accumulation/_cost_descriptions.py +63 -0
- flopscope/_accumulation/_detection.py +318 -0
- flopscope/_accumulation/_ladder.py +191 -0
- flopscope/_accumulation/_output_orbit.py +104 -0
- flopscope/_accumulation/_partition.py +290 -0
- flopscope/_accumulation/_path_info.py +211 -0
- flopscope/_accumulation/_public.py +169 -0
- flopscope/_accumulation/_reduction.py +310 -0
- flopscope/_accumulation/_regimes.py +303 -0
- flopscope/_accumulation/_shape.py +33 -0
- flopscope/_accumulation/_wreath.py +209 -0
- flopscope/_budget.py +1027 -0
- flopscope/_config.py +118 -0
- flopscope/_counting_ops.py +451 -0
- flopscope/_display.py +478 -0
- flopscope/_docstrings.py +59 -0
- flopscope/_dtypes.py +20 -0
- flopscope/_einsum.py +717 -0
- flopscope/_errstate.py +25 -0
- flopscope/_flops.py +282 -0
- flopscope/_free_ops.py +2654 -0
- flopscope/_ndarray.py +1126 -0
- flopscope/_opt_einsum/LICENSE +21 -0
- flopscope/_opt_einsum/NOTICE +59 -0
- flopscope/_opt_einsum/__init__.py +209 -0
- flopscope/_opt_einsum/_contract.py +1478 -0
- flopscope/_opt_einsum/_helpers.py +164 -0
- flopscope/_opt_einsum/_hsluv.py +273 -0
- flopscope/_opt_einsum/_path_random.py +462 -0
- flopscope/_opt_einsum/_paths.py +1653 -0
- flopscope/_opt_einsum/_subgraph_symmetry.py +544 -0
- flopscope/_opt_einsum/_symmetry.py +140 -0
- flopscope/_opt_einsum/_typing.py +37 -0
- flopscope/_perm_group.py +717 -0
- flopscope/_pointwise.py +2522 -0
- flopscope/_polynomial.py +278 -0
- flopscope/_registry.py +3216 -0
- flopscope/_sorting_ops.py +571 -0
- flopscope/_symmetric.py +812 -0
- flopscope/_symmetry_transport.py +510 -0
- flopscope/_symmetry_utils.py +669 -0
- flopscope/_type_info.py +12 -0
- flopscope/_unwrap.py +70 -0
- flopscope/_validation.py +83 -0
- flopscope/_version_check.py +46 -0
- flopscope/_weights.py +195 -0
- flopscope/_window.py +177 -0
- flopscope/accounting.py +565 -0
- flopscope/data/default_weights.json +462 -0
- flopscope/data/weights.csv +509 -0
- flopscope/errors.py +197 -0
- flopscope/numpy/__init__.py +878 -0
- flopscope/numpy/fft/__init__.py +55 -0
- flopscope/numpy/fft/_free.py +51 -0
- flopscope/numpy/fft/_transforms.py +695 -0
- flopscope/numpy/linalg/__init__.py +105 -0
- flopscope/numpy/linalg/_aliases.py +126 -0
- flopscope/numpy/linalg/_compound.py +161 -0
- flopscope/numpy/linalg/_decompositions.py +353 -0
- flopscope/numpy/linalg/_properties.py +533 -0
- flopscope/numpy/linalg/_solvers.py +444 -0
- flopscope/numpy/linalg/_svd.py +122 -0
- flopscope/numpy/random/__init__.py +684 -0
- flopscope/numpy/random/_cost_formulas.py +115 -0
- flopscope/numpy/random/_counted_classes.py +241 -0
- flopscope/numpy/testing/__init__.py +13 -0
- flopscope/numpy/typing/__init__.py +30 -0
- flopscope/py.typed +0 -0
- flopscope/stats/__init__.py +84 -0
- flopscope/stats/_base.py +77 -0
- flopscope/stats/_cauchy.py +146 -0
- flopscope/stats/_erf.py +190 -0
- flopscope/stats/_expon.py +146 -0
- flopscope/stats/_laplace.py +150 -0
- flopscope/stats/_logistic.py +148 -0
- flopscope/stats/_lognorm.py +160 -0
- flopscope/stats/_ndtri.py +133 -0
- flopscope/stats/_norm.py +149 -0
- flopscope/stats/_truncnorm.py +186 -0
- flopscope/stats/_uniform.py +141 -0
- flopscope-0.2.0.dist-info/METADATA +23 -0
- flopscope-0.2.0.dist-info/RECORD +115 -0
- flopscope-0.2.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,291 @@
|
|
|
1
|
+
"""Benchmark BLAS contraction operations."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import statistics
|
|
6
|
+
|
|
7
|
+
from benchmarks._perf import measure_flops
|
|
8
|
+
|
|
9
|
+
CONTRACTION_OPS: list[str] = [
|
|
10
|
+
"dot",
|
|
11
|
+
"matmul",
|
|
12
|
+
"inner",
|
|
13
|
+
"vdot",
|
|
14
|
+
"vecdot",
|
|
15
|
+
"outer",
|
|
16
|
+
"tensordot",
|
|
17
|
+
"kron",
|
|
18
|
+
"einsum",
|
|
19
|
+
]
|
|
20
|
+
|
|
21
|
+
_FORMULA_STRINGS: dict[str, str] = {
|
|
22
|
+
"dot": "MNK",
|
|
23
|
+
"matmul": "MNK",
|
|
24
|
+
"inner": "N (a.size)",
|
|
25
|
+
"vdot": "N (a.size)",
|
|
26
|
+
"vecdot": "batch * K (output_size * contracted_axis)",
|
|
27
|
+
"outer": "M*N",
|
|
28
|
+
"tensordot": "product of free * contracted dims",
|
|
29
|
+
"kron": "numel(output)",
|
|
30
|
+
"einsum": "α/M model (FMA=2 textbook)",
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
_BENCHMARK_SIZE_STRINGS: dict[str, str] = {
|
|
34
|
+
"dot": "A: (512,512), B: (512,512)",
|
|
35
|
+
"matmul": "A: (512,512), B: (512,512)",
|
|
36
|
+
"inner": "a: (1000000,), b: (1000000,)",
|
|
37
|
+
"vdot": "a: (1000000,), b: (1000000,)",
|
|
38
|
+
"vecdot": "A: (1000,512), B: (1000,512)",
|
|
39
|
+
"outer": "a: (5000,), b: (5000,)",
|
|
40
|
+
"tensordot": "A: (64,64,64), B: (64,64,64), axes=1",
|
|
41
|
+
"kron": "A: (64,64), B: (64,64)",
|
|
42
|
+
"einsum": "A: (512,512), B: (512,512), subscripts='ij,jk->ik'",
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _analytical_cost(op: str, **kwargs: int) -> int:
|
|
47
|
+
"""Return analytical FLOP count for the benchmark configuration.
|
|
48
|
+
|
|
49
|
+
Parameters
|
|
50
|
+
----------
|
|
51
|
+
op : str
|
|
52
|
+
Operation name (e.g. ``"dot"``).
|
|
53
|
+
**kwargs : int
|
|
54
|
+
Shape parameters used by each formula.
|
|
55
|
+
|
|
56
|
+
Returns
|
|
57
|
+
-------
|
|
58
|
+
int
|
|
59
|
+
Analytical FLOP count.
|
|
60
|
+
"""
|
|
61
|
+
costs: dict[str, int] = {
|
|
62
|
+
# dot: 2D matrix multiply A(512,512) @ B(512,512), FMA=2 textbook
|
|
63
|
+
"dot": 512 * 512 * 512,
|
|
64
|
+
# matmul: identical to dot for 2D
|
|
65
|
+
"matmul": 512 * 512 * 512,
|
|
66
|
+
# inner: dot product of two 1M-element vectors.
|
|
67
|
+
# Runtime charges a.size — matches flopscope's convention (FMA=2, but
|
|
68
|
+
# a.size is pointwise-shaped so the FMA off-by-one doesn't apply here).
|
|
69
|
+
"inner": 1_000_000,
|
|
70
|
+
# vdot: same as inner for 1D real inputs.
|
|
71
|
+
# Runtime charges a.size (FMA=2, pointwise-shaped — no off-by-one).
|
|
72
|
+
"vdot": 1_000_000,
|
|
73
|
+
# vecdot: batched dot product A(1000,512) . B(1000,512)
|
|
74
|
+
# Output (1000,) with contracted axis 512.
|
|
75
|
+
# Runtime charges result.size * contracted = 1000 * 512 (FMA=2 textbook).
|
|
76
|
+
"vecdot": 1000 * 512,
|
|
77
|
+
# outer: outer product of two 5000-element vectors
|
|
78
|
+
"outer": 5000 * 5000,
|
|
79
|
+
# tensordot: A(64,64,64) . B(64,64,64) axes=1 -> contract last of A with first of B
|
|
80
|
+
"tensordot": 64**5,
|
|
81
|
+
# kron: Kronecker product A(64,64) x B(64,64)
|
|
82
|
+
"kron": 64**4,
|
|
83
|
+
# einsum: 'ij,jk->ik' is matrix multiply (512,512)x(512,512), FMA=2 textbook
|
|
84
|
+
"einsum": 512 * 512 * 512,
|
|
85
|
+
}
|
|
86
|
+
return costs[op]
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def benchmark_contractions(
    dtype: str = "float64",
    repeats: int = 10,
) -> tuple[dict[str, float], dict[str, dict]]:
    """Measure each op in ``CONTRACTION_OPS`` relative to its analytical cost.

    Every op is measured on three input distributions (standard normal,
    uniform on [0.01, 100], uniform on [-1000, 1000]). The per-call
    measurement is divided by the analytical FLOP count, and the median of
    those ratios is reported.

    What the ratio means depends on what ``measure_flops`` reports: counted
    FP operations give a correction factor, while timings give nanoseconds
    per analytical FLOP, which the runner normalizes against a baseline.

    Parameters
    ----------
    dtype : str
        NumPy dtype name, interpolated into the setup code as ``np.<dtype>``.
    repeats : int
        Number of repetitions per measurement.

    Returns
    -------
    tuple[dict[str, float], dict[str, dict]]
        ``(alphas, details)``. ``alphas`` maps op name to the median raw
        measurement per analytical FLOP. ``details`` maps op name to a dict
        of raw benchmark metadata. Ops for which every measurement failed
        appear in neither mapping.
    """
    alphas: dict[str, float] = {}
    details: dict[str, dict] = {}

    for op in CONTRACTION_OPS:
        plan = _contraction_plan(op, dtype)
        if plan is None:
            continue  # pragma: no cover
        setups, bench = plan
        analytical = _analytical_cost(op)

        # vecdot is NumPy 2.x only, so a missing attribute is tolerated for
        # that op on top of the usual measurement failure.
        if op == "vecdot":
            tolerated: tuple[type[BaseException], ...] = (RuntimeError, AttributeError)
        else:
            tolerated = (RuntimeError,)

        ratios: list[float] = []
        raw_totals: list[int] = []
        for setup in setups:
            try:
                outcome = measure_flops(setup, bench, repeats=repeats)
            except tolerated:
                continue
            per_call = outcome.total_flops / repeats
            ratios.append(per_call / analytical if analytical else 0.0)
            raw_totals.append(outcome.total_flops)

        if not ratios:
            continue

        alphas[op] = statistics.median(ratios)
        details[op] = {
            "category": "counted_custom",
            "measurement_mode": "blas",
            "analytical_formula": _FORMULA_STRINGS.get(op, ""),
            "analytical_flops": analytical,
            "benchmark_size": _BENCHMARK_SIZE_STRINGS.get(op, ""),
            "bench_code": bench,
            "repeats": repeats,
            "perf_instructions_total": raw_totals,
            "distribution_alphas": ratios,
        }

    return alphas, details


def _contraction_plan(op: str, dtype: str) -> tuple[list[str], str] | None:
    """Return ``(setups, bench)`` source strings for *op*, or ``None`` if unknown.

    Each entry gives the operand shape as it appears in the generated code,
    the two operand variable names, and the statement to benchmark.
    """
    layouts: dict[str, tuple[str, str, str, str]] = {
        # Two 512x512 matrices.
        "dot": ("(512, 512)", "A", "B", "np.dot(A, B)"),
        "matmul": ("(512, 512)", "A", "B", "np.matmul(A, B)"),
        "einsum": ("(512, 512)", "A", "B", "np.einsum('ij,jk->ik', A, B)"),
        # Two 1M-element vectors: large enough that per-call overhead does
        # not dominate (10K was too small and inflated alpha).
        "inner": ("1000000", "a", "b", "np.inner(a, b)"),
        "vdot": ("1000000", "a", "b", "np.vdot(a, b)"),
        # Batched dot over the last axis.
        "vecdot": ("(1000, 512)", "A", "B", "np.vecdot(A, B)"),
        # Two 5000-element vectors.
        "outer": ("5000", "a", "b", "np.outer(a, b)"),
        # Two (64,64,64) tensors contracted over one axis.
        "tensordot": ("(64, 64, 64)", "A", "B", "np.tensordot(A, B, axes=1)"),
        # Two (64,64) matrices.
        "kron": ("(64, 64)", "A", "B", "np.kron(A, B)"),
    }
    layout = layouts.get(op)
    if layout is None:
        return None
    shape, lhs, rhs, bench = layout

    # The three input distributions every op is measured on.
    draws = (
        f"rng.standard_normal({shape})",
        f"rng.uniform(0.01, 100, size={shape})",
        f"rng.uniform(-1000, 1000, size={shape})",
    )
    setups = [
        "import numpy as np; rng = np.random.default_rng(42); "
        f"{lhs} = {draw}.astype(np.{dtype}); "
        f"{rhs} = {draw}.astype(np.{dtype})"
        for draw in draws
    ]
    return setups, bench
|
benchmarks/_fft.py
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
"""Benchmark FFT operations."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import math
|
|
6
|
+
import statistics
|
|
7
|
+
|
|
8
|
+
from benchmarks._perf import measure_flops
|
|
9
|
+
|
|
10
|
+
FFT_OPS: list[str] = [
|
|
11
|
+
"fft.fft",
|
|
12
|
+
"fft.ifft",
|
|
13
|
+
"fft.rfft",
|
|
14
|
+
"fft.irfft",
|
|
15
|
+
"fft.fft2",
|
|
16
|
+
"fft.ifft2",
|
|
17
|
+
"fft.rfft2",
|
|
18
|
+
"fft.irfft2",
|
|
19
|
+
"fft.fftn",
|
|
20
|
+
"fft.ifftn",
|
|
21
|
+
"fft.rfftn",
|
|
22
|
+
"fft.irfftn",
|
|
23
|
+
"fft.hfft",
|
|
24
|
+
"fft.ihfft",
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
_RFFT_OPS = {
|
|
28
|
+
"fft.rfft",
|
|
29
|
+
"fft.irfft",
|
|
30
|
+
"fft.rfft2",
|
|
31
|
+
"fft.irfft2",
|
|
32
|
+
"fft.rfftn",
|
|
33
|
+
"fft.irfftn",
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
_FORMULA_STRINGS: dict[str, str] = {
|
|
37
|
+
"fft.fft": "5*n*ceil(log2(n))",
|
|
38
|
+
"fft.ifft": "5*n*ceil(log2(n))",
|
|
39
|
+
"fft.rfft": "5*(n/2)*ceil(log2(n))",
|
|
40
|
+
"fft.irfft": "5*(n/2)*ceil(log2(n))",
|
|
41
|
+
"fft.fft2": "5*n*ceil(log2(n))",
|
|
42
|
+
"fft.ifft2": "5*n*ceil(log2(n))",
|
|
43
|
+
"fft.rfft2": "5*(n/2)*ceil(log2(n))",
|
|
44
|
+
"fft.irfft2": "5*(n/2)*ceil(log2(n))",
|
|
45
|
+
"fft.fftn": "5*n*ceil(log2(n))",
|
|
46
|
+
"fft.ifftn": "5*n*ceil(log2(n))",
|
|
47
|
+
"fft.rfftn": "5*(n/2)*ceil(log2(n))",
|
|
48
|
+
"fft.irfftn": "5*(n/2)*ceil(log2(n))",
|
|
49
|
+
"fft.hfft": "5*n*ceil(log2(n))",
|
|
50
|
+
"fft.ihfft": "5*(n/2)*ceil(log2(n))",
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _ceil_log2(n: int) -> int:
|
|
55
|
+
"""Return ceil(log2(n)), minimum 1."""
|
|
56
|
+
if n <= 1:
|
|
57
|
+
return 1
|
|
58
|
+
return math.ceil(math.log2(n))
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _analytical_cost(op_name: str, n: int) -> int:
    """Compute the analytical FLOP count charged for one FFT call.

    The model is ``5 * points * ceil(log2(n))``, where ``points`` is
    ``n // 2`` for members of ``_RFFT_OPS`` and ``n`` for everything else.

    NOTE(review): ``"fft.ihfft"`` is not in ``_RFFT_OPS`` and is therefore
    charged at full size here, while ``_FORMULA_STRINGS`` describes it with
    the half-size formula. Confirm which of the two is intended.

    Parameters
    ----------
    op_name : str
        Operation name (e.g. ``"fft.fft"``).
    n : int
        Input size (total number of elements).

    Returns
    -------
    int
        Analytical FLOP count.
    """
    points = n // 2 if op_name in _RFFT_OPS else n
    return 5 * points * _ceil_log2(n)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def benchmark_fft(
    n: int = 2**20,
    dtype: str = "float64",
    repeats: int = 10,
) -> tuple[dict[str, float], dict[str, dict]]:
    """Measure each op in ``FFT_OPS`` relative to its analytical cost.

    One-dimensional ops run on a length-``n`` input. The ``*2`` and ``*n``
    variants run on a ``(side, side)`` input with ``side = isqrt(n)``, and
    their analytical cost is computed for ``side * side`` elements. Each op
    is measured on three different inputs and the median ratio of per-call
    measurement to analytical cost is reported.

    Parameters
    ----------
    n : int
        Input size (element count for 1-D ops).
    dtype : str
        NumPy dtype name, interpolated into the setup code as ``np.<dtype>``.
    repeats : int
        Number of repetitions per measurement.

    Returns
    -------
    tuple[dict[str, float], dict[str, dict]]
        ``(alphas, details)``. ``alphas`` maps op name to the median
        correction factor. ``details`` maps op name to a dict of raw
        benchmark metadata. Ops for which every measurement raised
        ``RuntimeError`` appear in neither mapping.
    """
    alphas: dict[str, float] = {}
    details: dict[str, dict] = {}

    # Source fragments shared by all generated setup snippets.
    prelude = "import numpy as np; "
    cast = f".astype(np.{dtype})"

    def normal(seed: int, shape: str) -> str:
        """Expression drawing standard-normal data of *shape*, cast to dtype."""
        return f"np.random.default_rng({seed}).standard_normal({shape}){cast}"

    def uniform(seed: int, shape: str) -> str:
        """Expression drawing uniform[-1, 1) data of *shape*, cast to dtype."""
        return f"np.random.default_rng({seed}).uniform(-1, 1, size={shape}){cast}"

    # 2D/nD ops use a sqrt(n) x sqrt(n) input.
    side = math.isqrt(n)

    for op in FFT_OPS:
        short = op.split(".")[-1]

        # A "2" in the function name or a trailing "n" marks a multi-dim op.
        if "2" in short or op.endswith("n"):
            shape_str = f"({side}, {side})"
            effective_n = side * side
            benchmark_size = f"x: ({side},{side})"
        else:
            shape_str = f"({n},)"
            effective_n = n
            benchmark_size = f"x: ({n},)"

        # Deterministic ramp over one period, used by the sinusoidal inputs.
        ramp = (
            f"t = np.linspace(0, 2*np.pi, {effective_n})"
            f".reshape({shape_str}){cast}; "
        )

        if short in ("irfft", "irfft2", "irfftn"):
            # Inverse real transforms consume the output of the matching
            # forward transform (irfft -> rfft), applied to real data.
            forward = short[1:]
            finish = f"; x = np.fft.{forward}(_r)"
            setups = [
                f"{prelude}_r = {normal(42, shape_str)}{finish}",
                f"{prelude}_r = {uniform(43, shape_str)}{finish}",
                f"{prelude}_r = {normal(99, shape_str)} * 100{finish}",
            ]
        elif short in ("ifft", "ifft2", "ifftn", "hfft"):
            # Inverse complex transforms and hfft get complex input.
            setups = [
                f"{prelude}x = {normal(42, shape_str)} + 1j * {normal(43, shape_str)}",
                f"{prelude}{ramp}x = np.sin(t) + 1j * np.cos(t)",
                f"{prelude}x = ({uniform(99, shape_str)} + 1j * {uniform(100, shape_str)})",
            ]
        else:
            # Real input: fft, rfft and their 2-D / n-D forms, plus ihfft.
            setups = [
                f"{prelude}x = {normal(42, shape_str)}",
                f"{prelude}{ramp}x = np.sin(5*t) + 0.5*np.sin(13*t)",
                f"{prelude}x = {uniform(99, shape_str)}",
            ]

        bench = f"np.{op}(x)"
        analytical = _analytical_cost(op, effective_n)

        ratios: list[float] = []
        raw_totals: list[int] = []
        for setup in setups:
            try:
                outcome = measure_flops(setup, bench, repeats=repeats)
            except RuntimeError:
                continue
            per_call = outcome.total_flops / repeats
            ratios.append(per_call / analytical if analytical else 0.0)
            raw_totals.append(outcome.total_flops)

        if not ratios:
            continue

        alphas[op] = statistics.median(ratios)
        details[op] = {
            "category": "counted_custom",
            "measurement_mode": "custom",
            "analytical_formula": _FORMULA_STRINGS.get(op, ""),
            "analytical_flops": analytical,
            "benchmark_size": benchmark_size,
            "bench_code": bench,
            "repeats": repeats,
            "perf_instructions_total": raw_totals,
            "distribution_alphas": ratios,
        }

    return alphas, details
|
benchmarks/_impl_urls.py
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
"""Map operation names to GitHub source code URLs.
|
|
2
|
+
|
|
3
|
+
For each operation in flopscope, finds where the FLOP cost is charged in the
|
|
4
|
+
source tree and constructs a GitHub permalink.
|
|
5
|
+
|
|
6
|
+
Strategy (tried in order for each op):
|
|
7
|
+
1. Literal ``budget.deduct("op_name", ...)`` call — works for hand-written ops.
|
|
8
|
+
2. Factory registration like ``sin = _counted_unary(_np.sin, "sin")`` — works
|
|
9
|
+
for pointwise ops created via factory helpers.
|
|
10
|
+
3. Falls back to file-level URL if only the file is identified.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import re
|
|
16
|
+
import subprocess
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
|
|
19
|
+
# Base of the GitHub permalinks produced by this module (branch "main").
REPO_URL = "https://github.com/AIcrowd/flopscope/blob/main"

# Directory that is searched for cost-charging code: <parent of this file's
# directory>/src/flopscope.
# NOTE(review): this assumes a repository checkout with a src/ layout; confirm
# it is never needed from an installed package, where that path may not exist.
SRC_ROOT = Path(__file__).resolve().parent.parent.joinpath("src", "flopscope")
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _grep(pattern: str, path: str | None = None) -> list[tuple[str, int, str]]:
|
|
24
|
+
"""Run grep -rn and return list of (abs_path, line_no, line_text)."""
|
|
25
|
+
target = path or str(SRC_ROOT)
|
|
26
|
+
try:
|
|
27
|
+
result = subprocess.run(
|
|
28
|
+
["grep", "-rn", "-E", pattern, target],
|
|
29
|
+
capture_output=True,
|
|
30
|
+
text=True,
|
|
31
|
+
timeout=10,
|
|
32
|
+
)
|
|
33
|
+
except (subprocess.TimeoutExpired, FileNotFoundError):
|
|
34
|
+
return []
|
|
35
|
+
|
|
36
|
+
hits: list[tuple[str, int, str]] = []
|
|
37
|
+
if result.returncode == 0 and result.stdout.strip():
|
|
38
|
+
for line in result.stdout.strip().splitlines():
|
|
39
|
+
parts = line.split(":", 2)
|
|
40
|
+
if len(parts) >= 3:
|
|
41
|
+
try:
|
|
42
|
+
hits.append((parts[0], int(parts[1]), parts[2]))
|
|
43
|
+
except ValueError:
|
|
44
|
+
pass
|
|
45
|
+
return hits
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _to_rel_path(abs_path: str) -> str:
    """Express *abs_path* relative to the repository root.

    The repository root is taken to be two levels above ``SRC_ROOT``. A path
    that does not lie under that root is returned unchanged (as a string).
    """
    checkout_root = SRC_ROOT.parent.parent  # up from src/flopscope
    try:
        relative = Path(abs_path).relative_to(checkout_root)
    except ValueError:
        return str(abs_path)
    return str(relative)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _find_deduct_line(op_name: str) -> tuple[str, int | None]:
    """Locate the source line where the FLOP cost of *op_name* is charged.

    Three searches are tried in order, each with double quotes first and then
    single quotes; the first hit of the first successful search wins:

    1. a direct ``deduct("op_name"`` call;
    2. a factory registration such as ``_counted_unary(..., "op_name"``;
    3. any occurrence of the quoted name in a file under ``/src/flopscope/``,
       ignoring ``__pycache__``, ``/data/``, ``_registry.py`` and
       ``_docstrings.py``.

    Returns ``(repo_relative_path, line_number)`` or ``("", None)`` when
    nothing is found.
    """
    needle = re.escape(op_name)
    quotes = ('"', "'")

    # --- Searches 1 and 2: the name appears inside a recognisable call -----
    call_templates = (
        "deduct\\({q}{name}{q}",
        "_counted_\\w+\\([^)]*,\\s*{q}{name}{q}",
    )
    for template in call_templates:
        for q in quotes:
            matches = _grep(template.format(q=q, name=needle))
            if matches:
                found_path, found_line, _ = matches[0]
                return (_to_rel_path(found_path), found_line)

    # --- Search 3: bare quoted name, restricted to implementation files ----
    skipped = ("__pycache__", "/data/", "_registry.py", "_docstrings.py")
    for q in quotes:
        candidates = (
            hit
            for hit in _grep(f"{q}{needle}{q}")
            if "/src/flopscope/" in hit[0]
            and not any(token in hit[0] for token in skipped)
        )
        first = next(candidates, None)
        if first is not None:
            found_path, found_line, _ = first
            return (_to_rel_path(found_path), found_line)

    return ("", None)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def map_op_to_url(op_name: str) -> str:
    """Build the GitHub URL pointing at the cost implementation of *op_name*.

    The URL is ``REPO_URL`` joined with the repo-relative path found by
    ``_find_deduct_line``, plus a ``#L<n>`` anchor when a line number is
    known. An empty string is returned if the source cannot be located.
    """
    rel_path, line_no = _find_deduct_line(op_name)
    if not rel_path:
        return ""
    anchor = "" if line_no is None else f"#L{line_no}"
    return f"{REPO_URL}/{rel_path}{anchor}"
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def build_url_map(op_names: list[str] | None = None) -> dict[str, str]:
    """Build the URL map for all given operation names.

    Parameters
    ----------
    op_names : list of str, optional
        Operations to resolve. If *None*, the names are the keys of the
        ``"weights"`` object in ``SRC_ROOT / "data" / "weights.json"``.

    Returns
    -------
    dict mapping operation name to GitHub URL (empty string if not found).

    Raises
    ------
    OSError
        If *op_names* is *None* and the weights file cannot be opened.

    Notes
    -----
    NOTE(review): the published package ships ``data/default_weights.json``
    and ``data/weights.csv`` rather than ``data/weights.json`` -- confirm the
    default file name is still correct for the repository layout.
    """
    if op_names is None:
        import json

        weights_path = SRC_ROOT / "data" / "weights.json"
        # JSON is UTF-8 by specification; do not depend on the locale's
        # default encoding when reading it.
        with open(weights_path, encoding="utf-8") as f:
            data = json.load(f)
        op_names = list(data["weights"].keys())

    return {op: map_op_to_url(op) for op in op_names}
|