flopscope 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- benchmarks/__init__.py +1 -0
- benchmarks/__main__.py +6 -0
- benchmarks/_baseline.py +171 -0
- benchmarks/_bitwise.py +231 -0
- benchmarks/_complex.py +176 -0
- benchmarks/_contractions.py +291 -0
- benchmarks/_fft.py +198 -0
- benchmarks/_impl_urls.py +139 -0
- benchmarks/_linalg.py +197 -0
- benchmarks/_linalg_delegates.py +407 -0
- benchmarks/_metadata.py +141 -0
- benchmarks/_misc.py +653 -0
- benchmarks/_perf.py +321 -0
- benchmarks/_perm_group_calibration.py +175 -0
- benchmarks/_pointwise.py +372 -0
- benchmarks/_polynomial.py +193 -0
- benchmarks/_random.py +209 -0
- benchmarks/_reductions.py +136 -0
- benchmarks/_sorting.py +289 -0
- benchmarks/_stats.py +137 -0
- benchmarks/_window.py +92 -0
- benchmarks/accumulation/__init__.py +0 -0
- benchmarks/accumulation/bench_cost_compute.py +138 -0
- benchmarks/dashboard.py +312 -0
- benchmarks/runner.py +636 -0
- flopscope/__init__.py +273 -0
- flopscope/_accumulation/__init__.py +13 -0
- flopscope/_accumulation/_bipartite.py +121 -0
- flopscope/_accumulation/_burnside.py +51 -0
- flopscope/_accumulation/_cache.py +146 -0
- flopscope/_accumulation/_components.py +153 -0
- flopscope/_accumulation/_cost.py +1414 -0
- flopscope/_accumulation/_cost_descriptions.py +63 -0
- flopscope/_accumulation/_detection.py +318 -0
- flopscope/_accumulation/_ladder.py +191 -0
- flopscope/_accumulation/_output_orbit.py +104 -0
- flopscope/_accumulation/_partition.py +290 -0
- flopscope/_accumulation/_path_info.py +211 -0
- flopscope/_accumulation/_public.py +169 -0
- flopscope/_accumulation/_reduction.py +310 -0
- flopscope/_accumulation/_regimes.py +303 -0
- flopscope/_accumulation/_shape.py +33 -0
- flopscope/_accumulation/_wreath.py +209 -0
- flopscope/_budget.py +1027 -0
- flopscope/_config.py +118 -0
- flopscope/_counting_ops.py +451 -0
- flopscope/_display.py +478 -0
- flopscope/_docstrings.py +59 -0
- flopscope/_dtypes.py +20 -0
- flopscope/_einsum.py +717 -0
- flopscope/_errstate.py +25 -0
- flopscope/_flops.py +282 -0
- flopscope/_free_ops.py +2654 -0
- flopscope/_ndarray.py +1126 -0
- flopscope/_opt_einsum/LICENSE +21 -0
- flopscope/_opt_einsum/NOTICE +59 -0
- flopscope/_opt_einsum/__init__.py +209 -0
- flopscope/_opt_einsum/_contract.py +1478 -0
- flopscope/_opt_einsum/_helpers.py +164 -0
- flopscope/_opt_einsum/_hsluv.py +273 -0
- flopscope/_opt_einsum/_path_random.py +462 -0
- flopscope/_opt_einsum/_paths.py +1653 -0
- flopscope/_opt_einsum/_subgraph_symmetry.py +544 -0
- flopscope/_opt_einsum/_symmetry.py +140 -0
- flopscope/_opt_einsum/_typing.py +37 -0
- flopscope/_perm_group.py +717 -0
- flopscope/_pointwise.py +2522 -0
- flopscope/_polynomial.py +278 -0
- flopscope/_registry.py +3216 -0
- flopscope/_sorting_ops.py +571 -0
- flopscope/_symmetric.py +812 -0
- flopscope/_symmetry_transport.py +510 -0
- flopscope/_symmetry_utils.py +669 -0
- flopscope/_type_info.py +12 -0
- flopscope/_unwrap.py +70 -0
- flopscope/_validation.py +83 -0
- flopscope/_version_check.py +46 -0
- flopscope/_weights.py +195 -0
- flopscope/_window.py +177 -0
- flopscope/accounting.py +565 -0
- flopscope/data/default_weights.json +462 -0
- flopscope/data/weights.csv +509 -0
- flopscope/errors.py +197 -0
- flopscope/numpy/__init__.py +878 -0
- flopscope/numpy/fft/__init__.py +55 -0
- flopscope/numpy/fft/_free.py +51 -0
- flopscope/numpy/fft/_transforms.py +695 -0
- flopscope/numpy/linalg/__init__.py +105 -0
- flopscope/numpy/linalg/_aliases.py +126 -0
- flopscope/numpy/linalg/_compound.py +161 -0
- flopscope/numpy/linalg/_decompositions.py +353 -0
- flopscope/numpy/linalg/_properties.py +533 -0
- flopscope/numpy/linalg/_solvers.py +444 -0
- flopscope/numpy/linalg/_svd.py +122 -0
- flopscope/numpy/random/__init__.py +684 -0
- flopscope/numpy/random/_cost_formulas.py +115 -0
- flopscope/numpy/random/_counted_classes.py +241 -0
- flopscope/numpy/testing/__init__.py +13 -0
- flopscope/numpy/typing/__init__.py +30 -0
- flopscope/py.typed +0 -0
- flopscope/stats/__init__.py +84 -0
- flopscope/stats/_base.py +77 -0
- flopscope/stats/_cauchy.py +146 -0
- flopscope/stats/_erf.py +190 -0
- flopscope/stats/_expon.py +146 -0
- flopscope/stats/_laplace.py +150 -0
- flopscope/stats/_logistic.py +148 -0
- flopscope/stats/_lognorm.py +160 -0
- flopscope/stats/_ndtri.py +133 -0
- flopscope/stats/_norm.py +149 -0
- flopscope/stats/_truncnorm.py +186 -0
- flopscope/stats/_uniform.py +141 -0
- flopscope-0.2.0.dist-info/METADATA +23 -0
- flopscope-0.2.0.dist-info/RECORD +115 -0
- flopscope-0.2.0.dist-info/WHEEL +4 -0
benchmarks/_pointwise.py
ADDED
|
@@ -0,0 +1,372 @@
|
|
|
1
|
+
"""Benchmark pointwise (element-wise) unary and binary operations.
|
|
2
|
+
|
|
3
|
+
Most benchmarks pre-allocate output arrays and use ``out=`` to eliminate
|
|
4
|
+
memory allocation overhead from measurements, isolating pure compute cost.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import statistics
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
|
|
13
|
+
from benchmarks._perf import measure_flops
|
|
14
|
+
|
|
15
|
+
# Names of the single-input NumPy functions to benchmark, in measurement order.
# Each entry is looked up as ``np.<name>``.
UNARY_OPS: list[str] = [
    # sign / identity
    "abs", "negative", "positive",
    # exponentials and logarithms
    "exp", "exp2", "expm1", "log", "log2", "log10", "log1p",
    # roots and powers
    "sqrt", "cbrt", "square", "reciprocal",
    # trigonometric
    "sin", "cos", "tan", "arcsin", "arccos", "arctan",
    # hyperbolic
    "sinh", "cosh", "tanh", "arcsinh", "arccosh", "arctanh",
    # rounding
    "ceil", "floor", "trunc", "rint",
    # sign inspection
    "sign", "signbit", "fabs",
    # angle conversion
    "deg2rad", "rad2deg", "degrees", "radians",
    # logical
    "logical_not",
    # --- added in Step 2.3 ---
    "frexp", "modf", "sinc", "i0", "spacing", "nan_to_num", "isneginf", "isposinf",
]
|
|
64
|
+
|
|
65
|
+
# Names of the two-input NumPy functions to benchmark, in measurement order.
# Each entry is looked up as ``np.<name>``.
BINARY_OPS: list[str] = [
    # arithmetic
    "add", "subtract", "multiply", "divide", "true_divide", "floor_divide",
    "power", "float_power",
    # remainders
    "mod", "remainder", "fmod",
    # element-wise extrema
    "maximum", "minimum", "fmax", "fmin",
    # comparisons
    "greater", "greater_equal", "less", "less_equal", "equal", "not_equal",
    # logical
    "logical_and", "logical_or", "logical_xor",
    # miscellaneous floating-point helpers
    "arctan2", "hypot", "copysign", "nextafter", "logaddexp", "logaddexp2", "ldexp",
]
|
|
98
|
+
|
|
99
|
+
# Pointwise ops whose call shape differs from the plain unary/binary pattern
# (extra scalar arguments, tolerance-based comparison, ...).
SPECIAL_OPS: list[str] = ["isclose", "heaviside", "clip"]
|
|
105
|
+
|
|
106
|
+
# Ops that produce boolean results: their ``out=`` buffer has to be allocated
# with dtype bool rather than the benchmark dtype.
_BOOL_UNARY = frozenset(("signbit", "logical_not", "isneginf", "isposinf"))
_BOOL_BINARY = frozenset(
    (
        # comparisons
        "greater",
        "greater_equal",
        "less",
        "less_equal",
        "equal",
        "not_equal",
        # logical connectives
        "logical_and",
        "logical_or",
        "logical_xor",
    )
)

# Ops returning a tuple of arrays; they are benchmarked without ``out=``.
_TUPLE_RETURN_OPS = frozenset(("frexp", "modf"))

# Ops that are fed non-negative input only.
_POSITIVE_INPUT_OPS = frozenset(("i0",))

# Ops for which NaN/inf values are injected into the input.
_NAN_INPUT_OPS = frozenset(("nan_to_num",))
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def _make_inputs_unary(n: int, dtype: str) -> list[np.ndarray]:
    """Build three length-``n`` input arrays, one per input distribution.

    All arrays are drawn from a single generator seeded with 42, in this
    order: standard normal, uniform on [0.01, 100), uniform on [-1000, 1000).
    Each is converted to ``dtype`` after sampling.
    """
    gen = np.random.default_rng(42)
    # The draw order is part of the contract: it fixes the generated values.
    normal = gen.standard_normal(n).astype(dtype)
    positive = gen.uniform(0.01, 100, size=n).astype(dtype)
    wide = gen.uniform(-1000, 1000, size=n).astype(dtype)
    return [normal, positive, wide]
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def _make_inputs_binary(n: int, dtype: str) -> list[tuple[np.ndarray, np.ndarray]]:
    """Build three ``(a, b)`` operand pairs, one per input distribution.

    A single generator seeded with 42 is used throughout.  For each
    distribution (standard normal, uniform on [0.01, 100), uniform on
    [-1000, 1000)) ``a`` is drawn first and ``b`` second, and both are
    converted to ``dtype``.
    """
    gen = np.random.default_rng(42)
    samplers = (
        lambda: gen.standard_normal(n),
        lambda: gen.uniform(0.01, 100, size=n),
        lambda: gen.uniform(-1000, 1000, size=n),
    )
    pairs: list[tuple[np.ndarray, np.ndarray]] = []
    for draw in samplers:
        # Left operand first, then right: keeps the random stream order fixed.
        lhs = draw().astype(dtype)
        rhs = draw().astype(dtype)
        pairs.append((lhs, rhs))
    return pairs
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
# For small arguments (near 0) sin/cos take a fast path that skips range
# reduction.  For these ops distribution 0 is therefore drawn from
# uniform(-100, 100) rather than standard_normal, so that every one of the
# three distributions exercises the full code path.
_WIDE_INPUT_OPS = frozenset(("sin", "cos"))
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def _unary_setup(n: int, dtype: str, op: str, dist_idx: int) -> str:
    """Return the setup snippet for benchmarking unary ``op``.

    The snippet is a single line of ``; ``-separated statements that imports
    NumPy, creates the input ``x`` (length ``n``, cast to ``np.<dtype>``) from
    the distribution selected by ``dist_idx`` and, unless ``op`` returns a
    tuple, pre-allocates the output buffer ``_out``.
    """
    # Distribution 0 is widened for ops listed in _WIDE_INPUT_OPS.
    if op in _WIDE_INPUT_OPS and dist_idx == 0:
        first_draw = f"uniform(-100, 100, size={n})"
    else:
        first_draw = f"standard_normal({n})"
    draws = (
        first_draw,
        f"uniform(0.01, 100, size={n})",
        f"uniform(-1000, 1000, size={n})",
    )
    statements = [
        "import numpy as np",
        f"x = np.random.default_rng(42).{draws[dist_idx]}.astype(np.{dtype})",
    ]

    # Ops restricted to non-negative input get |x|.
    if op in _POSITIVE_INPUT_OPS:
        statements.append("x = np.abs(x)")

    # Seed distribution 0 with NaN, +inf and -inf runs (about 1% of n each).
    if op in _NAN_INPUT_OPS and dist_idx == 0:
        statements.extend(
            (
                f"x[:{n}//100] = np.nan",
                f"x[{n}//100:{n}//50] = np.inf",
                f"x[{n}//50:{n}//50+{n}//100] = -np.inf",
            )
        )

    # Tuple-returning ops are benchmarked without out=, so no buffer is needed.
    if op not in _TUPLE_RETURN_OPS:
        buffer_dtype = "bool" if op in _BOOL_UNARY else f"np.{dtype}"
        statements.append(f"_out = np.empty({n}, dtype={buffer_dtype})")

    return "; ".join(statements)
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def _binary_setup(n: int, dtype: str, op: str, dist_idx: int) -> str:
    """Return the setup snippet for benchmarking binary ``op``.

    The snippet is a single line of ``; ``-separated statements that imports
    NumPy, seeds a generator with 42, draws the operands ``a`` and ``b``
    (length ``n``, cast to ``np.<dtype>``) from the distribution selected by
    ``dist_idx`` and pre-allocates the output buffer ``_out``.

    Parameters
    ----------
    n : int
        Number of elements in each operand and in the output buffer.
    dtype : str
        NumPy dtype name; interpolated into the snippet as ``np.<dtype>``.
    op : str
        Name of the op being benchmarked.  It selects the dtype of ``_out``
        (bool for ops in ``_BOOL_BINARY``) and the exponent cast for ``ldexp``.
    dist_idx : int
        Index of the input distribution: 0 = standard normal,
        1 = uniform(0.01, 100), 2 = uniform(-1000, 1000).

    Returns
    -------
    str
        Setup code suitable for passing to the measurement helper.

    Raises
    ------
    IndexError
        If ``dist_idx`` is outside the three available distributions.
    """
    draws = (
        f"standard_normal({n})",
        f"uniform(0.01, 100, size={n})",
        f"uniform(-1000, 1000, size={n})",
    )
    draw = draws[dist_idx]
    statements = [
        "import numpy as np",
        "rng = np.random.default_rng(42)",
        f"a = rng.{draw}.astype(np.{dtype})",
        f"b = rng.{draw}.astype(np.{dtype})",
    ]

    if op == "ldexp":
        # np.ldexp only has loops with an integer exponent; a floating-point
        # ``b`` cannot be safely cast and the call raises TypeError.  Use the
        # drawn values truncated to int32 as exponents instead.
        statements.append("b = b.astype(np.int32)")

    out_dtype = "bool" if op in _BOOL_BINARY else f"np.{dtype}"
    statements.append(f"_out = np.empty({n}, dtype={out_dtype})")
    return "; ".join(statements)
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
# np.sinc, np.i0 and np.nan_to_num are ordinary Python-level NumPy functions,
# not ufuncs, and none of them accepts an ``out=`` keyword.  Calling them with
# ``out=_out`` fails, so they are benchmarked without it.
_NO_OUT_KWARG_OPS = frozenset({"sinc", "i0", "nan_to_num"})

# The setup builders (_unary_setup / _binary_setup) define exactly this many
# input distributions.
_MAX_DISTRIBUTIONS = 3


def _measure_over_distributions(
    setup_fn,
    op: str,
    bench: str,
    n: int,
    dtype: str,
    repeats: int,
    distributions: int,
) -> tuple[list[float], list[int]]:
    """Measure ``bench`` once per input distribution.

    ``setup_fn`` is called as ``setup_fn(n, dtype, op, dist_idx)`` to build the
    setup snippet.  Returns ``(alphas, raw_totals)`` where each alpha is
    ``total_flops / (n * repeats)``.  Distributions for which ``measure_flops``
    raises ``RuntimeError`` are skipped.
    """
    alphas: list[float] = []
    raw_totals: list[int] = []
    for dist_idx in range(distributions):
        setup = setup_fn(n, dtype, op, dist_idx)
        try:
            result = measure_flops(setup, bench, repeats=repeats)
        except RuntimeError:
            # Best effort: a failed measurement drops this distribution only.
            continue
        alphas.append(result.total_flops / (n * repeats))
        raw_totals.append(result.total_flops)
    return alphas, raw_totals


def _pointwise_details(
    category: str,
    benchmark_size: str,
    bench: str,
    n: int,
    repeats: int,
    raw_totals: list[int],
    alphas: list[float],
) -> dict:
    """Assemble the per-op metadata record stored in ``details``."""
    return {
        "category": category,
        "measurement_mode": (
            "ufunc_unary" if category == "counted_unary" else "ufunc_binary"
        ),
        "analytical_formula": "numel(output)",
        "analytical_flops": n,
        "benchmark_size": benchmark_size,
        "bench_code": bench,
        "repeats": repeats,
        "perf_instructions_total": raw_totals,
        "distribution_alphas": alphas,
    }


def benchmark_pointwise(
    n: int = 10_000_000,
    dtype: str = "float64",
    repeats: int = 10,
    distributions: int = 3,
) -> tuple[dict[str, float], dict[str, dict]]:
    """Benchmark all pointwise ops, returning raw measurement per element.

    Ops in ``UNARY_OPS`` and ``BINARY_OPS`` write into a pre-allocated output
    (``out=``) so that allocation overhead stays out of the measurement.  The
    exceptions are called without ``out=``: tuple-returning ops
    (``_TUPLE_RETURN_OPS``), NumPy functions that have no ``out=`` keyword
    (``sinc``, ``i0``, ``nan_to_num``), and the ops in ``SPECIAL_OPS``.

    Parameters
    ----------
    n : int
        Array size.
    dtype : str
        NumPy dtype string.
    repeats : int
        Number of repetitions per measurement.
    distributions : int
        Number of input distributions to measure (median is taken).  Only
        three distributions are defined, so larger values are clamped to 3.

    Returns
    -------
    tuple[dict[str, float], dict[str, dict]]
        A pair of (alphas, details).  ``alphas`` maps op name to median
        measurement per element.  ``details`` maps op name to a dict of
        raw benchmark metadata.  Ops for which every measurement failed are
        absent from both dicts.
    """
    results: dict[str, float] = {}
    details: dict[str, dict] = {}

    # Asking for more distributions than exist would index past the setup
    # builders' tables and raise IndexError part-way through the run.
    num_dists = min(distributions, _MAX_DISTRIBUTIONS)

    unary_size = f"x: ({n},)"
    binary_size = f"a: ({n},), b: ({n},)"

    # Each plan entry: (op, setup builder, bench code, category, benchmark_size).
    plan: list[tuple] = []

    # --- Unary ops ---
    for op in UNARY_OPS:
        if op in _TUPLE_RETURN_OPS or op in _NO_OUT_KWARG_OPS:
            # The setup may still allocate ``_out``; it is simply left unused.
            bench = f"np.{op}(x)"
        else:
            bench = f"np.{op}(x, out=_out)"
        plan.append((op, _unary_setup, bench, "counted_unary", unary_size))

    # --- Binary ops ---
    for op in BINARY_OPS:
        bench = f"np.{op}(a, b, out=_out)"
        plan.append((op, _binary_setup, bench, "counted_binary", binary_size))

    # --- Special ops (non-standard patterns) ---
    special_cases = {
        # Binary comparison returning bool.
        "isclose": (_binary_setup, "np.isclose(a, b)", "counted_binary", binary_size),
        # Binary with scalar second argument.
        "heaviside": (
            _unary_setup,
            "np.heaviside(x, 0.5)",
            "counted_binary",
            f"x: ({n},), h=0.5",
        ),
        # Ternary: clip(x, min, max).
        "clip": (
            _unary_setup,
            "np.clip(x, -1.0, 1.0)",
            "counted_unary",
            f"x: ({n},), a_min=-1.0, a_max=1.0",
        ),
    }
    for op in SPECIAL_OPS:
        # Names without a known call pattern are skipped.
        if op in special_cases:
            plan.append((op, *special_cases[op]))

    for op, setup_fn, bench, category, bm_size in plan:
        alphas, raw_totals = _measure_over_distributions(
            setup_fn, op, bench, n, dtype, repeats, num_dists
        )
        if not alphas:
            continue
        results[op] = statistics.median(alphas)
        details[op] = _pointwise_details(
            category, bm_size, bench, n, repeats, raw_totals, alphas
        )

    return results, details
|
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
"""Benchmark polynomial operations."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import statistics
|
|
6
|
+
|
|
7
|
+
from benchmarks._perf import measure_flops
|
|
8
|
+
|
|
9
|
+
# Names of the ``np.<name>`` polynomial routines to benchmark, in measurement
# order.
POLYNOMIAL_OPS: list[str] = [
    # evaluation and fitting
    "polyval", "polyfit",
    # coefficient arithmetic
    "polyadd", "polysub", "polymul", "polydiv",
    # calculus
    "polyder", "polyint",
    # roots <-> coefficients
    "poly", "roots",
]
|
|
21
|
+
|
|
22
|
+
# Human-readable analytical cost formula for each polynomial op, keyed by op
# name.  Ops sharing a formula are grouped together.
_FORMULA_STRINGS: dict[str, str] = {
    "polyval": "2 * n * degree (FMA=2)",
    "polyfit": "2 * n * (degree+1)^2",
    "roots": "degree^3",
    # quadratic in the number of coefficients
    **dict.fromkeys(("polymul", "polydiv"), "(degree+1)^2"),
    # linear in the number of coefficients
    **dict.fromkeys(("polyadd", "polysub", "polyder", "polyint"), "degree + 1"),
    "poly": "degree^2",
}
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _analytical_cost(op: str, n: int, degree: int) -> int:
    """Compute the analytical FLOP count charged for polynomial op ``op``.

    The formulas are intended to mirror flopscope's runtime cost model, so
    the benchmark denominator and the budget deduction agree.

    ``n`` is the number of evaluation/sample points (used by polyval and
    polyfit only); ``degree`` is the polynomial degree.  Raises ``ValueError``
    for an op name without a formula.
    """
    # Thunks keep evaluation lazy: only the selected formula is computed.
    formulas = {
        # FMA=2 unification (spec 2026-05-20): doubled from m*deg to 2*m*deg.
        "polyval": lambda: 2 * n * degree,
        "polyfit": lambda: 2 * n * (degree + 1) ** 2,
        "roots": lambda: degree**3,
        "polymul": lambda: (degree + 1) ** 2,
        "polydiv": lambda: (degree + 1) ** 2,
        "polyadd": lambda: degree + 1,
        "polysub": lambda: degree + 1,
        # runtime charges len(c) = degree + 1
        "polyder": lambda: degree + 1,
        "polyint": lambda: degree + 1,
        "poly": lambda: degree**2,
    }
    compute = formulas.get(op)
    if compute is None:
        raise ValueError(f"Unknown polynomial op: {op!r}")
    return compute()
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _polynomial_case(
    op: str, base_setup: str, n: int, dtype: str, degree: int
) -> tuple[str, str]:
    """Return ``(setup, bench)`` code for one polynomial op.

    ``base_setup`` must already define ``np``, ``rng`` and the coefficient
    array ``c``; op-specific operands are appended to it.
    """
    if op == "polyval":
        setup = base_setup + f"; x = rng.standard_normal({n}).astype(np.{dtype})"
        return setup, "np.polyval(c, x)"
    if op == "polyfit":
        setup = (
            base_setup
            + f"; x = np.linspace(-1, 1, {n}).astype(np.{dtype})"
            + f"; y = np.polyval(c, x) + rng.standard_normal({n}).astype(np.{dtype}) * 0.01"
        )
        return setup, f"np.polyfit(x, y, {degree})"
    if op == "poly":
        setup = base_setup + f"; r = rng.standard_normal({degree}).astype(np.{dtype})"
        return setup, "np.poly(r)"
    if op in ("polyadd", "polysub", "polymul", "polydiv"):
        # Second operand has the same number of coefficients as ``c``.
        setup = base_setup + f"; d = rng.standard_normal({degree + 1}).astype(np.{dtype})"
        return setup, f"np.{op}(c, d)"
    # roots, polyder, polyint (and any other op) take the coefficients alone.
    return base_setup, f"np.{op}(c)"


def _polynomial_benchmark_size(op: str, n: int, degree: int) -> str:
    """Describe the operand shapes used to benchmark ``op``."""
    if op == "polyval":
        return f"c: ({degree + 1},), x: ({n},)"
    if op == "polyfit":
        return f"x: ({n},), y: ({n},), degree={degree}"
    if op in ("polymul", "polydiv", "polyadd", "polysub"):
        return f"c: ({degree + 1},), d: ({degree + 1},)"
    if op in ("polyder", "polyint", "roots"):
        return f"c: ({degree + 1},)"
    if op == "poly":
        return f"r: ({degree},)"
    return f"n={n}, degree={degree}"


def benchmark_polynomial(
    n: int = 1_000_000,
    dtype: str = "float64",
    repeats: int = 10,
    degree: int = 100,
) -> tuple[dict[str, float], dict[str, dict]]:
    """Benchmark polynomial ops, returning raw measurement per element.

    Each op is normalized by its analytical FLOP cost from
    ``_analytical_cost(op, n, degree)`` so the returned value
    represents raw perf-counter FLOPs per analytical FLOP.

    Every op is measured on three coefficient distributions (standard normal
    scaled by 1, 100 and 0.01, seeded 42, 43 and 44) and the median is
    reported.  Measurements for which ``measure_flops`` raises
    ``RuntimeError`` are skipped.  Ops whose analytical cost is zero (for
    example ``polyval`` with ``degree=0``) cannot be normalized and are
    skipped as well.

    Parameters
    ----------
    n : int
        Array size for polyval/polyfit.
    dtype : str
        NumPy dtype string.
    repeats : int
        Number of repetitions per measurement.
    degree : int
        Polynomial degree (higher = less overhead-dominated for coeff ops).

    Returns
    -------
    tuple[dict[str, float], dict[str, dict]]
        ``(alphas, details)`` where *alphas* maps op name to median alpha
        and *details* maps op name to a dict of per-op measurement metadata.
    """
    results: dict[str, float] = {}
    details: dict[str, dict] = {}

    # 3 distributions with varying coefficient magnitudes
    coeff_setups = [
        f"c = rng.standard_normal({degree + 1}).astype(np.{dtype})",
        f"c = (rng.standard_normal({degree + 1}) * 100).astype(np.{dtype})",
        f"c = (rng.standard_normal({degree + 1}) * 0.01).astype(np.{dtype})",
    ]

    for op in POLYNOMIAL_OPS:
        # The cost depends only on (op, n, degree): compute it once per op.
        analytical = _analytical_cost(op, n, degree)
        if analytical <= 0:
            # Dividing by a zero cost would raise ZeroDivisionError after the
            # (expensive) measurement had already run.
            continue

        dist_values: list[float] = []
        perf_instructions: list[int] = []
        bench = ""

        for ci, c_setup in enumerate(coeff_setups):
            seed = 42 + ci
            base_setup = (
                f"import numpy as np; rng = np.random.default_rng({seed}); {c_setup}"
            )
            setup, bench = _polynomial_case(op, base_setup, n, dtype, degree)
            try:
                result = measure_flops(setup, bench, repeats=repeats)
            except RuntimeError:
                # Best effort: drop this distribution, keep the others.
                continue
            perf_instructions.append(result.total_flops)
            dist_values.append(result.total_flops / (analytical * repeats))

        if not dist_values:
            continue

        results[op] = statistics.median(dist_values)
        details[op] = {
            "category": "counted_custom",
            "measurement_mode": "custom",
            "analytical_formula": _FORMULA_STRINGS.get(op, "n"),
            "analytical_flops": analytical,
            "benchmark_size": _polynomial_benchmark_size(op, n, degree),
            "bench_code": bench,
            "repeats": repeats,
            "perf_instructions_total": perf_instructions,
            "distribution_alphas": dist_values,
        }

    return results, details
|