flopscope 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- benchmarks/__init__.py +1 -0
- benchmarks/__main__.py +6 -0
- benchmarks/_baseline.py +171 -0
- benchmarks/_bitwise.py +231 -0
- benchmarks/_complex.py +176 -0
- benchmarks/_contractions.py +291 -0
- benchmarks/_fft.py +198 -0
- benchmarks/_impl_urls.py +139 -0
- benchmarks/_linalg.py +197 -0
- benchmarks/_linalg_delegates.py +407 -0
- benchmarks/_metadata.py +141 -0
- benchmarks/_misc.py +653 -0
- benchmarks/_perf.py +321 -0
- benchmarks/_perm_group_calibration.py +175 -0
- benchmarks/_pointwise.py +372 -0
- benchmarks/_polynomial.py +193 -0
- benchmarks/_random.py +209 -0
- benchmarks/_reductions.py +136 -0
- benchmarks/_sorting.py +289 -0
- benchmarks/_stats.py +137 -0
- benchmarks/_window.py +92 -0
- benchmarks/accumulation/__init__.py +0 -0
- benchmarks/accumulation/bench_cost_compute.py +138 -0
- benchmarks/dashboard.py +312 -0
- benchmarks/runner.py +636 -0
- flopscope/__init__.py +273 -0
- flopscope/_accumulation/__init__.py +13 -0
- flopscope/_accumulation/_bipartite.py +121 -0
- flopscope/_accumulation/_burnside.py +51 -0
- flopscope/_accumulation/_cache.py +146 -0
- flopscope/_accumulation/_components.py +153 -0
- flopscope/_accumulation/_cost.py +1414 -0
- flopscope/_accumulation/_cost_descriptions.py +63 -0
- flopscope/_accumulation/_detection.py +318 -0
- flopscope/_accumulation/_ladder.py +191 -0
- flopscope/_accumulation/_output_orbit.py +104 -0
- flopscope/_accumulation/_partition.py +290 -0
- flopscope/_accumulation/_path_info.py +211 -0
- flopscope/_accumulation/_public.py +169 -0
- flopscope/_accumulation/_reduction.py +310 -0
- flopscope/_accumulation/_regimes.py +303 -0
- flopscope/_accumulation/_shape.py +33 -0
- flopscope/_accumulation/_wreath.py +209 -0
- flopscope/_budget.py +1027 -0
- flopscope/_config.py +118 -0
- flopscope/_counting_ops.py +451 -0
- flopscope/_display.py +478 -0
- flopscope/_docstrings.py +59 -0
- flopscope/_dtypes.py +20 -0
- flopscope/_einsum.py +717 -0
- flopscope/_errstate.py +25 -0
- flopscope/_flops.py +282 -0
- flopscope/_free_ops.py +2654 -0
- flopscope/_ndarray.py +1126 -0
- flopscope/_opt_einsum/LICENSE +21 -0
- flopscope/_opt_einsum/NOTICE +59 -0
- flopscope/_opt_einsum/__init__.py +209 -0
- flopscope/_opt_einsum/_contract.py +1478 -0
- flopscope/_opt_einsum/_helpers.py +164 -0
- flopscope/_opt_einsum/_hsluv.py +273 -0
- flopscope/_opt_einsum/_path_random.py +462 -0
- flopscope/_opt_einsum/_paths.py +1653 -0
- flopscope/_opt_einsum/_subgraph_symmetry.py +544 -0
- flopscope/_opt_einsum/_symmetry.py +140 -0
- flopscope/_opt_einsum/_typing.py +37 -0
- flopscope/_perm_group.py +717 -0
- flopscope/_pointwise.py +2522 -0
- flopscope/_polynomial.py +278 -0
- flopscope/_registry.py +3216 -0
- flopscope/_sorting_ops.py +571 -0
- flopscope/_symmetric.py +812 -0
- flopscope/_symmetry_transport.py +510 -0
- flopscope/_symmetry_utils.py +669 -0
- flopscope/_type_info.py +12 -0
- flopscope/_unwrap.py +70 -0
- flopscope/_validation.py +83 -0
- flopscope/_version_check.py +46 -0
- flopscope/_weights.py +195 -0
- flopscope/_window.py +177 -0
- flopscope/accounting.py +565 -0
- flopscope/data/default_weights.json +462 -0
- flopscope/data/weights.csv +509 -0
- flopscope/errors.py +197 -0
- flopscope/numpy/__init__.py +878 -0
- flopscope/numpy/fft/__init__.py +55 -0
- flopscope/numpy/fft/_free.py +51 -0
- flopscope/numpy/fft/_transforms.py +695 -0
- flopscope/numpy/linalg/__init__.py +105 -0
- flopscope/numpy/linalg/_aliases.py +126 -0
- flopscope/numpy/linalg/_compound.py +161 -0
- flopscope/numpy/linalg/_decompositions.py +353 -0
- flopscope/numpy/linalg/_properties.py +533 -0
- flopscope/numpy/linalg/_solvers.py +444 -0
- flopscope/numpy/linalg/_svd.py +122 -0
- flopscope/numpy/random/__init__.py +684 -0
- flopscope/numpy/random/_cost_formulas.py +115 -0
- flopscope/numpy/random/_counted_classes.py +241 -0
- flopscope/numpy/testing/__init__.py +13 -0
- flopscope/numpy/typing/__init__.py +30 -0
- flopscope/py.typed +0 -0
- flopscope/stats/__init__.py +84 -0
- flopscope/stats/_base.py +77 -0
- flopscope/stats/_cauchy.py +146 -0
- flopscope/stats/_erf.py +190 -0
- flopscope/stats/_expon.py +146 -0
- flopscope/stats/_laplace.py +150 -0
- flopscope/stats/_logistic.py +148 -0
- flopscope/stats/_lognorm.py +160 -0
- flopscope/stats/_ndtri.py +133 -0
- flopscope/stats/_norm.py +149 -0
- flopscope/stats/_truncnorm.py +186 -0
- flopscope/stats/_uniform.py +141 -0
- flopscope-0.2.0.dist-info/METADATA +23 -0
- flopscope-0.2.0.dist-info/RECORD +115 -0
- flopscope-0.2.0.dist-info/WHEEL +4 -0
benchmarks/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Benchmark suite for measuring empirical FLOP weights."""
|
benchmarks/__main__.py
ADDED
benchmarks/_baseline.py
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
"""Baseline measurements for overhead-subtracted weight normalization.
|
|
2
|
+
|
|
3
|
+
Measures three baselines:
|
|
4
|
+
|
|
5
|
+
1. **alpha(add)** — raw FP instructions per element for ``np.add`` (includes
|
|
6
|
+
ufunc overhead). Used to derive the binary ufunc overhead.
|
|
7
|
+
2. **alpha(abs)** — raw FP instructions per element for ``np.abs``. Since abs
|
|
8
|
+
on float64 is a bitwise sign-bit clear (NOT an FP instruction), all
|
|
9
|
+
measured FP instructions are pure **unary ufunc overhead**.
|
|
10
|
+
3. **Binary ufunc overhead** = ``alpha(add) - 1.0`` (since one add = exactly
|
|
11
|
+
one FP instruction; the rest is overhead).
|
|
12
|
+
|
|
13
|
+
The runner subtracts the appropriate overhead from each counted operation's
|
|
14
|
+
raw alpha before storing it as the weight::
|
|
15
|
+
|
|
16
|
+
weight(op) = max(alpha_raw(op) - overhead_for_category, 0.0)
|
|
17
|
+
|
|
18
|
+
Known analytical zero-FLOP operations are stored separately with
|
|
19
|
+
``weight(op) = 0.0`` so the published artifacts surface them as free rather
|
|
20
|
+
than as unit-cost operations.
|
|
21
|
+
|
|
22
|
+
This replaces the old ``weight(op) = alpha(op) / alpha(add)`` formula which
|
|
23
|
+
penalized BLAS ops (that bypass the ufunc layer) with ufunc overhead they
|
|
24
|
+
don't have.
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from __future__ import annotations
|
|
28
|
+
|
|
29
|
+
import statistics
|
|
30
|
+
from dataclasses import dataclass
|
|
31
|
+
|
|
32
|
+
from benchmarks._perf import measure_flops
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass(frozen=True)
class BaselineResult:
    """Immutable bundle of the baseline alphas used for overhead subtraction.

    Stores the two raw measurements and exposes the per-category ufunc
    overheads that are derived from them.
    """

    #: Raw alpha for np.add (FP instructions per element, overhead included).
    alpha_add: float

    #: Raw alpha for np.abs; treated as pure unary ufunc overhead.
    alpha_abs: float

    @property
    def overhead_ufunc_unary(self) -> float:
        """Per-element unary ufunc overhead, taken from the abs measurement."""
        return self.alpha_abs

    @property
    def overhead_ufunc_binary(self) -> float:
        """Per-element binary ufunc overhead.

        Computed as ``alpha(add) - 1.0`` (one add counts as one FP
        instruction) and floored at zero.
        """
        excess = self.alpha_add - 1.0
        return max(excess, 0.0)

    @property
    def overhead_ufunc_reduction(self) -> float:
        """Per-element reduction overhead; reuses the unary (abs) value."""
        return self.alpha_abs

    def overhead_for_mode(self, mode: str) -> float:
        """Look up the overhead to subtract for measurement mode *mode*.

        Modes that are not listed fall back to ``0.0``.
        """
        table = {
            "ufunc_unary": self.overhead_ufunc_unary,
            "ufunc_binary": self.overhead_ufunc_binary,
            "ufunc_reduction": self.overhead_ufunc_reduction,
        }
        # These modes carry no ufunc overhead at all.
        table.update(
            dict.fromkeys(("blas", "linalg", "custom", "instructions"), 0.0)
        )
        return table.get(mode, 0.0)

    def to_dict(self) -> dict:
        """Return the measurements and derived overheads as a plain dict."""
        payload = {
            "alpha_add_raw": self.alpha_add,
            "alpha_abs_raw": self.alpha_abs,
        }
        for field_name in (
            "overhead_ufunc_unary",
            "overhead_ufunc_binary",
            "overhead_ufunc_reduction",
        ):
            payload[field_name] = getattr(self, field_name)
        payload["normalization"] = (
            "subtract per-category ufunc overhead; known zero-FLOP ops use "
            "weight 0.0"
        )
        return payload
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _measure_alpha(setups: list[str], bench: str, n: int, repeats: int) -> float:
    """Return the median per-element alpha of *bench* over all *setups*.

    Each setup string is measured once with ``measure_flops``; the reported
    total is divided by ``n * repeats`` to get a per-element figure.
    """
    per_setup_alphas = [
        measure_flops(setup_code, bench, repeats=repeats).total_flops
        / (n * repeats)
        for setup_code in setups
    ]
    return statistics.median(per_setup_alphas)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _unary_setups(n: int, dtype: str) -> list[str]:
    """Build the three setup snippets that define a length-*n* input ``x``.

    The snippets draw, in order, from a standard normal, a uniform on
    (0.01, 100) and a uniform on (-1000, 1000), each cast to ``np.<dtype>``.
    """
    cast = f".astype(np.{dtype})"
    seeded_rng = "rng = np.random.default_rng(42); "
    normal = f"x = np.random.default_rng(42).standard_normal({n})" + cast
    positive = seeded_rng + f"x = rng.uniform(0.01, 100, size={n})" + cast
    symmetric = seeded_rng + f"x = rng.uniform(-1000, 1000, size={n})" + cast
    return [normal, positive, symmetric]
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _binary_setups(n: int, dtype: str) -> list[str]:
    """Build the three setup snippets that define ``x``, ``y`` and ``_out``.

    Inputs are drawn from a standard normal (separate seeds 42 and 43), a
    uniform on (0.01, 100) and a uniform on (-1000, 1000); ``_out`` is an
    uninitialised length-*n* buffer of the same dtype.
    """
    cast = f".astype(np.{dtype})"
    out_buffer = f"_out = np.empty({n}, dtype=np.{dtype})"

    def uniform_pair(low: str, high: str) -> str:
        # Both operands come from the same seeded generator, drawn in turn.
        draw = f"rng.uniform({low}, {high}, size={n}){cast}"
        return "; ".join(
            ["rng = np.random.default_rng(42)", f"x = {draw}", f"y = {draw}", out_buffer]
        )

    normal_pair = "; ".join(
        [
            f"x = np.random.default_rng(42).standard_normal({n}){cast}",
            f"y = np.random.default_rng(43).standard_normal({n}){cast}",
            out_buffer,
        ]
    )
    return [normal_pair, uniform_pair("0.01", "100"), uniform_pair("-1000", "1000")]
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def measure_baseline(
    n: int = 10_000_000, dtype: str = "float64", repeats: int = 10
) -> float:
    """Measure and return only alpha(add); kept for backwards compatibility.

    :func:`measure_baselines` is the preferred entry point: it returns the
    complete :class:`BaselineResult`, overhead values included.
    """
    add_setups = _binary_setups(n, dtype)
    add_statement = "np.add(x, y, out=_out)"
    return _measure_alpha(add_setups, add_statement, n, repeats)
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def measure_baselines(
    n: int = 10_000_000, dtype: str = "float64", repeats: int = 10
) -> BaselineResult:
    """Measure every baseline required for overhead-subtracted normalization.

    A short summary of the measured alphas and derived overheads is printed
    to stdout before returning.

    Returns
    -------
    BaselineResult
        Holds alpha(add), alpha(abs) and the overheads derived from them.
    """
    add_alpha = _measure_alpha(
        _binary_setups(n, dtype), "np.add(x, y, out=_out)", n, repeats
    )
    abs_alpha = _measure_alpha(_unary_setups(n, dtype), "np.abs(x)", n, repeats)
    baselines = BaselineResult(alpha_add=add_alpha, alpha_abs=abs_alpha)

    summary_lines = [
        f" alpha(add) = {add_alpha:.4f}",
        f" alpha(abs) = {abs_alpha:.4f} (pure unary ufunc overhead)",
        " Derived overheads:",
        f" ufunc_unary: {baselines.overhead_ufunc_unary:.4f}",
        f" ufunc_binary: {baselines.overhead_ufunc_binary:.4f}",
        f" ufunc_reduction: {baselines.overhead_ufunc_reduction:.4f}",
        " blas/linalg: 0.0000",
    ]
    for summary_line in summary_lines:
        print(summary_line)
    return baselines
|
benchmarks/_bitwise.py
ADDED
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
"""Benchmark bitwise and integer operations via ``instructions`` counter.
|
|
2
|
+
|
|
3
|
+
These ops operate on integers, so ``fp_arith_inst_retired`` perf counters
|
|
4
|
+
read 0. We use ``perf stat -e instructions`` (total retired instructions)
|
|
5
|
+
as the hardware-counter fallback — more stable and deterministic than
|
|
6
|
+
wall-clock timing. Falls back to timing if ``perf`` is unavailable.
|
|
7
|
+
|
|
8
|
+
Also includes ``isnat`` which operates on datetime64 arrays.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import statistics
|
|
14
|
+
|
|
15
|
+
from benchmarks._perf import measure_instructions
|
|
16
|
+
|
|
17
|
+
# --- Operation lists -------------------------------------------------------
|
|
18
|
+
|
|
19
|
+
# Single-operand integer ops.
UNARY_OPS: list[str] = ["bitwise_not", "bitwise_invert", "bitwise_count", "invert"]

# Two-operand integer ops (gcd/lcm get their own positive-only setup).
BINARY_OPS: list[str] = ["bitwise_and", "bitwise_or", "bitwise_xor", "gcd", "lcm"]

# Shift ops; the second operand is a small shift count.
SHIFT_OPS: list[str] = [
    "bitwise_left_shift",
    "bitwise_right_shift",
    "left_shift",
    "right_shift",
]

# Ops that need a non-integer input (isnat takes datetime64).
SPECIAL_OPS: list[str] = ["isnat"]

BITWISE_OPS: list[str] = [*UNARY_OPS, *BINARY_OPS, *SHIFT_OPS, *SPECIAL_OPS]

# --- Analytical formula strings (all cost = n) ----------------------------

_FORMULA_STRINGS: dict[str, str] = {op_name: "n" for op_name in BITWISE_OPS}
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _analytical_cost(op: str, n: int) -> int:  # noqa: ARG001
    """Give the analytical cost of *op* applied to length-*n* arrays.

    Every op in this module is charged one unit per element, so *op* does
    not influence the result.
    """
    cost = n
    return cost
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
# --- Setup helpers ----------------------------------------------------------
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _unary_setup(n: int, dist_idx: int) -> str:
    """Return setup code defining ``x``, a length-*n* int64 input array.

    *dist_idx* selects which of the three fixed RNG seeds is used.
    """
    rng_seed = [42, 123, 7][dist_idx]
    statements = [
        "import numpy as np",
        f"x = np.random.default_rng({rng_seed}).integers(-1_000_000, 1_000_000, "
        f"size={n}, dtype=np.int64)",
    ]
    return "; ".join(statements)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _binary_setup(n: int, dist_idx: int) -> str:
    """Return setup code defining ``a`` and ``b``, two length-*n* int64 arrays.

    Both arrays are drawn in turn from one generator whose seed is picked
    by *dist_idx*.
    """
    rng_seed = [42, 123, 7][dist_idx]
    draw = f"rng.integers(-1_000_000, 1_000_000, size={n}, dtype=np.int64)"
    statements = [
        "import numpy as np",
        f"rng = np.random.default_rng({rng_seed})",
        f"a = {draw}",
        f"b = {draw}",
    ]
    return "; ".join(statements)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _gcd_lcm_setup(n: int, dist_idx: int) -> str:
    """Return setup code for gcd/lcm: ``a`` and ``b`` hold positive int64 values.

    Values are drawn from ``[1, 1_000_000)`` with the seed chosen by
    *dist_idx*.
    """
    rng_seed = [42, 123, 7][dist_idx]
    draw = f"rng.integers(1, 1_000_000, size={n}, dtype=np.int64)"
    statements = [
        "import numpy as np",
        f"rng = np.random.default_rng({rng_seed})",
        f"a = {draw}",
        f"b = {draw}",
    ]
    return "; ".join(statements)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _shift_setup(n: int, dist_idx: int) -> str:
    """Return setup code for shift ops.

    ``a`` holds the values to shift; ``b`` holds shift counts drawn from
    ``[0, 11)``. The RNG seed is chosen by *dist_idx*.
    """
    rng_seed = [42, 123, 7][dist_idx]
    statements = [
        "import numpy as np",
        f"rng = np.random.default_rng({rng_seed})",
        f"a = rng.integers(-1_000_000, 1_000_000, size={n}, dtype=np.int64)",
        f"b = rng.integers(0, 11, size={n}, dtype=np.int64)",
    ]
    return "; ".join(statements)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def _isnat_setup(n: int, dist_idx: int) -> str:  # noqa: ARG001
    """Build setup code for isnat: ``x`` is a datetime64 array with ~1/3 NaT.

    The array is the pattern ``[date, NaT, date]`` repeated and then trimmed
    to exactly *n* elements, so that callers dividing a measurement by *n*
    use the true element count even when *n* is not a multiple of 3.

    Parameters
    ----------
    n : int
        Number of elements in the generated array.
    dist_idx : int
        Accepted for signature parity with the other setup helpers. The
        input is deterministic, so the value is not used.

    Returns
    -------
    str
        Python source that imports numpy and defines ``x``.
    """
    pattern = "['2020-01-01', 'NaT', '2020-06-15']"
    # One extra repetition guarantees at least n items before slicing.
    return (
        f"import numpy as np; "
        f"x = np.array(({pattern} * ({n} // 3 + 1))[:{n}], "
        f"dtype='datetime64')"
    )
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
# --- Main benchmark function -----------------------------------------------
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def benchmark_bitwise(
    n: int = 10_000_000,
    dtype: str = "int64",
    repeats: int = 10,
) -> tuple[dict[str, float], dict[str, dict]]:
    """Benchmark bitwise/integer ops with the ``instructions`` counter.

    Every op is measured with ``measure_instructions`` over three input
    distributions; the per-element median becomes the op's alpha. A
    distribution whose measurement raises ``RuntimeError`` is skipped, and
    an op with no successful measurement is left out of both result dicts.

    Parameters
    ----------
    n : int
        Array size.
    dtype : str
        Ignored (the setup helpers always build int64 inputs, or datetime64
        for ``isnat``). Kept for interface consistency with other benchmark
        modules.
    repeats : int
        Number of repetitions per measurement.

    Returns
    -------
    tuple[dict[str, float], dict[str, dict]]
        A pair of (alphas, details). ``alphas`` maps op name to the median,
        over distributions, of the measured total divided by
        ``n * repeats``. ``details`` maps op name to a dict of raw
        benchmark metadata.
    """
    distributions = 3
    results: dict[str, float] = {}
    details: dict[str, dict] = {}

    def _bench_op(
        op: str, setup_fn, bench_code: str, category: str, size_desc: str
    ) -> None:
        """Benchmark a single op across distributions using instructions counter."""
        dist_values: list[float] = []
        dist_raw_totals: list[int] = []
        for di in range(distributions):
            setup = setup_fn(n, di)
            try:
                result = measure_instructions(setup, bench_code, repeats=repeats)
            except RuntimeError:
                # Best effort: a failed measurement drops this distribution only.
                continue
            dist_values.append(result.total_flops / (n * repeats))
            dist_raw_totals.append(result.total_flops)
        if not dist_values:
            return
        results[op] = statistics.median(dist_values)
        details[op] = {
            "category": category,
            "measurement_mode": "instructions",
            "analytical_formula": _FORMULA_STRINGS[op],
            "analytical_flops": n,
            "benchmark_size": size_desc,
            "bench_code": bench_code,
            "repeats": repeats,
            "perf_instructions_total": dist_raw_totals,
            "distribution_alphas": dist_values,
        }

    # --- Unary ops ---
    for op in UNARY_OPS:
        _bench_op(op, _unary_setup, f"np.{op}(x)", "instructions_unary", f"x: ({n},)")

    # --- Binary ops ---
    for op in BINARY_OPS:
        # gcd/lcm are fed positive operands; the rest take signed values.
        setup_fn = _gcd_lcm_setup if op in ("gcd", "lcm") else _binary_setup
        _bench_op(
            op,
            setup_fn,
            f"np.{op}(a, b)",
            "instructions_binary",
            f"a: ({n},), b: ({n},)",
        )

    # --- Shift ops ---
    for op in SHIFT_OPS:
        _bench_op(
            op,
            _shift_setup,
            f"np.{op}(a, b)",
            "instructions_shift",
            f"a: ({n},), b: ({n},) (values 0-10)",
        )

    # --- Special ops ---
    # isnat: operates on datetime64 arrays; shares the generic measurement
    # path instead of duplicating it.
    _bench_op(
        "isnat",
        _isnat_setup,
        "np.isnat(x)",
        "instructions_special",
        f"x: ({n},) datetime64 with NaTs",
    )

    return results, details
|
benchmarks/_complex.py
ADDED
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
"""Benchmark complex-number operations.
|
|
2
|
+
|
|
3
|
+
Most ops use perf mode with complex128 input (they DO retire FP instructions
|
|
4
|
+
on complex data). Two type-check ops (``iscomplexobj``, ``isrealobj``) use
|
|
5
|
+
the ``instructions`` counter because they inspect the dtype, not the array
|
|
6
|
+
elements (so ``fp_arith_inst_retired`` reads 0).
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import statistics
|
|
12
|
+
|
|
13
|
+
from benchmarks._perf import measure_flops, measure_instructions
|
|
14
|
+
|
|
15
|
+
COMPLEX_OPS: list[str] = [
    # element-wise ops
    "angle", "conj", "conjugate", "imag", "real", "real_if_close",
    "iscomplex", "isreal", "sort_complex",
    # dtype-level type checks
    "iscomplexobj", "isrealobj",
]

# Ops that use the ``instructions`` counter instead of ``fp_arith_inst_retired``
# because they inspect the dtype, not the array elements.
_INSTRUCTIONS_OPS: frozenset[str] = frozenset(("iscomplexobj", "isrealobj"))

# Every complex op shares the same analytical cost formula.
_FORMULA_STRINGS: dict[str, str] = dict.fromkeys(COMPLEX_OPS, "numel(output)")

# Seeds for the 3 input distributions, as (real seed, imaginary seed) pairs.
_DIST_SEEDS: list[tuple[int, int]] = [
    (real_seed, real_seed + 1) for real_seed in (42, 100, 200)
]
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _complex_setup(n: int, seed_real: int, seed_imag: int) -> str:
    """Return setup code defining ``x`` as a length-*n* complex array.

    The real and imaginary parts are standard-normal float64 draws from two
    independently seeded generators.
    """

    def component(seed: int) -> str:
        return f"np.random.default_rng({seed}).standard_normal({n}).astype(np.float64)"

    real_part = component(seed_real)
    imag_part = component(seed_imag)
    return f"import numpy as np; x = {real_part} + 1j * {imag_part}"
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _real_if_close_setup(n: int, dist_idx: int) -> str:
    """Return setup code for ``real_if_close``.

    Index 0 scales the imaginary part by 1e-15, exercising the "close to
    real" path. Any other index uses the ordinary complex setup with the
    seed pair stored at that index.
    """
    if dist_idx != 0:
        seed_pair = _DIST_SEEDS[dist_idx]
        seed_r, seed_i = seed_pair
        return _complex_setup(n, seed_r, seed_i)

    # Negligible imaginary part — real_if_close may strip it.
    real_part = f"np.random.default_rng(42).standard_normal({n}).astype(np.float64)"
    imag_part = f"np.random.default_rng(43).standard_normal({n}).astype(np.float64)"
    return f"import numpy as np; x = {real_part} + 1j * {imag_part} * 1e-15"
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _timing_setup(n: int) -> str:
    """Return setup code for the type-check ops (``iscomplexobj`` / ``isrealobj``).

    Defines ``x`` as a length-*n* complex array built from two seeded
    standard-normal draws.
    """
    real_part = f"np.random.default_rng(42).standard_normal({n})"
    imag_part = f"np.random.default_rng(43).standard_normal({n})"
    return "; ".join(["import numpy as np", f"x = {real_part} + 1j * {imag_part}"])
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _bench_code(op: str) -> str:
    """Build the statement that applies ``np.<op>`` to the setup's ``x``."""
    statement = "np.{}(x)".format(op)
    return statement
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def benchmark_complex(
    n: int = 10_000_000,
    dtype: str = "complex128",
    repeats: int = 10,
    distributions: int = 3,
) -> tuple[dict[str, float], dict[str, dict]]:
    """Benchmark complex-number ops, returning raw measurement per element.

    Ops listed in ``_INSTRUCTIONS_OPS`` are measured with
    ``measure_instructions``; all others use ``measure_flops``. A
    distribution whose measurement raises ``RuntimeError`` is skipped, and
    an op with no successful measurement is left out of both result dicts.

    Parameters
    ----------
    n : int
        Array size (element count). ``sort_complex`` always uses 1_000_000.
    dtype : str
        NumPy dtype string (unused — always complex128, kept for API parity).
    repeats : int
        Number of repetitions per measurement.
    distributions : int
        Number of input distributions to measure (median is taken). Must
        not exceed the number of seed pairs in ``_DIST_SEEDS``.

    Returns
    -------
    tuple[dict[str, float], dict[str, dict]]
        ``(alphas, details)`` — *alphas* maps op name to median measurement
        per analytical FLOP; *details* maps op name to benchmark metadata.

    Raises
    ------
    ValueError
        If *distributions* asks for more distributions than there are seed
        pairs.
    """
    # Fail before any measurement runs rather than with an IndexError
    # partway through the benchmark.
    if distributions > len(_DIST_SEEDS):
        raise ValueError(
            f"distributions={distributions} exceeds the "
            f"{len(_DIST_SEEDS)} available seed pairs"
        )

    results: dict[str, float] = {}
    details: dict[str, dict] = {}

    for op in COMPLEX_OPS:
        # --- Determine n for this op ---
        op_n = 1_000_000 if op == "sort_complex" else n

        # --- Choose measurement function ---
        use_instructions = op in _INSTRUCTIONS_OPS
        measure_fn = measure_instructions if use_instructions else measure_flops

        dist_values: list[float] = []
        dist_raw_totals: list[int] = []
        bench = _bench_code(op)

        for di in range(distributions):
            # --- Build setup code ---
            if use_instructions:
                setup = _timing_setup(op_n)
            elif op == "real_if_close":
                setup = _real_if_close_setup(op_n, di)
            else:
                seed_r, seed_i = _DIST_SEEDS[di]
                setup = _complex_setup(op_n, seed_r, seed_i)

            try:
                result = measure_fn(setup, bench, repeats=repeats)
            except RuntimeError:
                # Best effort: a failed measurement drops this distribution only.
                continue

            # Analytical cost = numel(output) = op_n for all complex ops.
            analytical = op_n
            dist_values.append(result.total_flops / (analytical * repeats))
            dist_raw_totals.append(result.total_flops)

        if not dist_values:
            continue

        results[op] = statistics.median(dist_values)

        if use_instructions:
            bm_size = f"x: ({op_n},) complex128 (instructions counter)"
            mm = "instructions"
        else:
            bm_size = f"x: ({op_n},) complex128"
            mm = "ufunc_unary"

        details[op] = {
            "category": "counted_complex",
            "measurement_mode": mm,
            "analytical_formula": _FORMULA_STRINGS[op],
            "analytical_flops": op_n,
            "benchmark_size": bm_size,
            "bench_code": bench,
            "repeats": repeats,
            "perf_instructions_total": dist_raw_totals,
            "distribution_alphas": dist_values,
        }

    return results, details
|