kernelmeter 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kernelmeter/cli.py ADDED
@@ -0,0 +1,304 @@
1
+ """kernelmeter command line interface.
2
+
3
+ kernelmeter info dump every device attribute + derived peaks
4
+ kernelmeter bench file.py run all @kernelmeter.benchmark specs in file
5
+ kernelmeter roofline draw the device roofline, place your kernel on it
6
+ kernelmeter occupancy theoretical occupancy from block/regs/smem
7
+ kernelmeter ceiling measure real achievable bandwidth and FP32
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import argparse
13
+ import importlib.util
14
+ import json
15
+ import sys
16
+
17
+ from . import attrs as _attrs
18
+ from . import occupancy as _occupancy
19
+ from . import peaks as _peaks
20
+ from . import roofline as _roofline
21
+ from .cudadrv import CudaNotAvailableError, Driver
22
+
23
+
24
+ def _fmt(value: float | None, suffix: str = "", nd: int = 1) -> str:
25
+ return f"{value:.{nd}f}{suffix}" if value is not None else "-"
26
+
27
+
28
def _device_attrs(ordinal: int = 0) -> dict[str, int]:
    """Open the driver and return ``attrs.query_all`` for device ``ordinal``."""
    drv = Driver()
    handle = drv.device(ordinal)
    return _attrs.query_all(drv, handle)
31
+
32
+
33
+ # ---------------------------------------------------------------------------
34
+ # info
35
+ # ---------------------------------------------------------------------------
36
+
37
def gather_info(driver: Driver) -> dict:
    """Collect the driver version and one summary dict per device.

    The result has two keys: ``driver_version`` (a ``"major.minor"``
    string) and ``devices``, a list ordered by device ordinal. Each device
    entry carries the ordinal, name, total memory in bytes, the derived
    peaks (``Peaks.as_dict()``) and the raw attribute dict.
    """

    def describe(ordinal: int) -> dict:
        # Summarise a single device; the attribute dict is queried once and
        # reused for both the derived peaks and the raw dump.
        handle = driver.device(ordinal)
        attributes = _attrs.query_all(driver, handle)
        return {
            "ordinal": ordinal,
            "name": handle.name,
            "total_memory_bytes": handle.total_mem_bytes,
            "derived": _peaks.derive(attributes).as_dict(),
            "attributes": attributes,
        }

    major, minor = driver.driver_version()
    devices = [describe(ordinal) for ordinal in range(driver.device_count())]
    return {"driver_version": f"{major}.{minor}", "devices": devices}
54
+
55
+
56
def cmd_info(args: argparse.Namespace) -> int:
    """Handle ``kernelmeter info``.

    Prints the ``gather_info`` result either as indented JSON (``--json``)
    or as a per-device text report. Returns 1 when the driver cannot be
    opened (the reason goes to stderr), otherwise 0.
    """
    try:
        driver = Driver()
    except CudaNotAvailableError as exc:
        print(f"error: {exc}", file=sys.stderr)
        return 1

    info = gather_info(driver)
    if args.json:
        print(json.dumps(info, indent=2))
        return 0

    print(f"CUDA driver version : {info['driver_version']}")
    for entry in info["devices"]:
        size_gib = entry["total_memory_bytes"] / 2**30
        print(f"\nDevice {entry['ordinal']}: {entry['name']} ({size_gib:.1f} GiB)")

        peaks = entry["derived"]
        print(f" compute capability : {peaks['compute_capability']}")
        bandwidth_text = _fmt(peaks["theoretical_mem_bandwidth_gb_s"], " GB/s")
        print(" theoretical mem bandwidth : " + bandwidth_text)
        fp32_text = _fmt(peaks["theoretical_fp32_tflops"], " TFLOP/s", nd=2)
        print(" theoretical FP32 peak : " + fp32_text)

        # Raw attribute dump, one "name value" row per attribute.
        print(f"\n {'attribute':<48} value")
        print(f" {'-' * 48} {'-' * 12}")
        for attr_name, attr_value in entry["attributes"].items():
            print(f" {attr_name:<48} {attr_value}")
    return 0
87
+
88
+
89
+ # ---------------------------------------------------------------------------
90
+ # bench
91
+ # ---------------------------------------------------------------------------
92
+
93
+ def _load_module(path: str) -> None:
94
+ spec = importlib.util.spec_from_file_location("kernelmeter_bench_target", path)
95
+ if spec is None or spec.loader is None:
96
+ raise SystemExit(f"error: cannot import {path}")
97
+ module = importlib.util.module_from_spec(spec)
98
+ spec.loader.exec_module(module)
99
+
100
+
101
def cmd_bench(args: argparse.Namespace) -> int:
    """Handle ``kernelmeter bench``.

    Loads ``args.file``, runs every registered benchmark, optionally saves
    the results (``--save``), prints them as JSON or as a table, and
    optionally compares them against a saved baseline (``--compare``).

    The baseline is read before any benchmark runs and before ``--save``
    writes. A bad baseline path therefore fails immediately, and
    ``--save X --compare X`` compares against the previous contents of X
    instead of the file this run just wrote.

    Returns 0 when every benchmark ran without error, none failed its
    correctness check and no regression was reported; otherwise 1.
    """
    from . import bench as _bench

    _load_module(args.file)
    if not _bench.REGISTRY:
        print(
            f"error: {args.file} registered no benchmarks. "
            "Decorate your kernels with @kernelmeter.benchmark(...).",
            file=sys.stderr,
        )
        return 1

    baseline = None
    if args.compare:
        try:
            with open(args.compare) as fh:
                baseline = json.load(fh)
        except (OSError, ValueError) as exc:
            # ValueError covers json.JSONDecodeError and decoding errors.
            print(f"error: cannot read baseline {args.compare}: {exc}", file=sys.stderr)
            return 1

    results = _bench.run_registry(flush_l2=not args.no_flush_l2)

    if args.save:
        with open(args.save, "w") as fh:
            json.dump([r.as_dict() for r in results], fh, indent=2)

    if args.json:
        print(json.dumps([r.as_dict() for r in results], indent=2))
    else:
        header = (
            f"{'kernel':<24} {'median ms':>10} {'GB/s':>9} {'TFLOP/s':>9} "
            f"{'bound':>6} {'%roof':>7} {'vs ref':>8} {'correct':>8}"
        )
        print(header)
        print("-" * len(header))
        for r in results:
            if r.error:
                print(f"{r.name:<24} failed: {r.error}")
                continue
            correct = "-" if r.correct is None else ("PASS" if r.correct else "FAIL")
            speedup = f"{r.speedup_vs_ref:.2f}x" if r.speedup_vs_ref else "-"
            print(
                f"{r.name:<24} {r.ms_median:>10.4f} {_fmt(r.gbps):>9} "
                f"{_fmt(r.tflops, nd=2):>9} {r.bound or '-':>6} "
                f"{_fmt(r.pct_roofline, '%'):>7} {speedup:>8} {correct:>8}"
            )

    # A benchmark without a reference (correct is None) does not count as a failure.
    ok = all(r.error is None and r.correct in (None, True) for r in results)

    if args.compare:
        rows, regressions = _bench.diff_results(baseline, results)
        if rows:
            print(f"\n{'kernel':<24} {'baseline ms':>12} {'now ms':>10} {'delta':>8}")
            for name, old_ms, new_ms, delta in rows:
                print(f"{name:<24} {old_ms:>12.4f} {new_ms:>10.4f} {delta:>+7.1f}%")
        for name in regressions:
            print(f"regression: {name} is more than 5% slower than the baseline")
        if regressions:
            ok = False

    return 0 if ok else 1
156
+
157
+
158
+ # ---------------------------------------------------------------------------
159
+ # roofline
160
+ # ---------------------------------------------------------------------------
161
+
162
def cmd_roofline(args: argparse.Namespace) -> int:
    """Handle ``kernelmeter roofline``.

    Peak bandwidth and compute come from ``--peak-bw`` / ``--peak-tflops``
    when given; anything missing is derived from device ``args.device``.
    With ``--ai`` the kernel is also placed on the roofline.

    Explicit overrides must be positive. The error text names the actual
    cause: a bad override, no CUDA device, or a device whose attributes do
    not allow the peaks to be derived.

    Returns 0 on success, 1 when no usable peaks are available.
    """
    peak_bw, peak_tf = args.peak_bw, args.peak_tflops
    for flag, given in (("--peak-bw", peak_bw), ("--peak-tflops", peak_tf)):
        if given is not None and given <= 0:
            print(f"error: {flag} must be a positive number", file=sys.stderr)
            return 1

    name = None
    device_found = False
    if peak_bw is None or peak_tf is None:
        try:
            driver = Driver()
            dev = driver.device(args.device)
            peaks = _peaks.derive(_attrs.query_all(driver, dev))
        except CudaNotAvailableError:
            # No driver or GPU: fall through to the "pass both overrides" hint.
            pass
        else:
            device_found = True
            name = dev.name
            if peak_bw is None:
                peak_bw = peaks.mem_bandwidth_gbs
            if peak_tf is None:
                peak_tf = peaks.fp32_tflops

    if not peak_bw or not peak_tf:
        if device_found:
            print(
                "error: could not derive the peak bandwidth and FP32 throughput "
                "from the device attributes. Pass --peak-bw GB/s and "
                "--peak-tflops explicitly.",
                file=sys.stderr,
            )
        else:
            print(
                "error: no CUDA device found. Pass --peak-bw GB/s and "
                "--peak-tflops to draw a roofline for any card.",
                file=sys.stderr,
            )
        return 1

    ridge = _roofline.ridge_point(peak_tf, peak_bw)
    if name:
        print(f"Device {args.device}: {name}")
    print(f" peak bandwidth : {peak_bw:.1f} GB/s")
    print(f" peak compute : {peak_tf:.2f} TFLOP/s")
    print(f" ridge point : {ridge:.1f} flop/byte\n")
    for line in _roofline.render(peak_tf, peak_bw, ai=args.ai):
        print(line)
    if args.ai is not None:
        attainable = _roofline.attainable_tflops(args.ai, peak_tf, peak_bw)
        which = _roofline.bound(args.ai, peak_tf, peak_bw)
        kind = "memory" if which == "mem" else "compute"
        print(
            f"\nat {args.ai:g} flop/byte the kernel is {kind}-bound; "
            f"attainable: {attainable:.2f} TFLOP/s"
        )
    return 0
200
+
201
+
202
+ # ---------------------------------------------------------------------------
203
+ # occupancy
204
+ # ---------------------------------------------------------------------------
205
+
206
def cmd_occupancy(args: argparse.Namespace) -> int:
    """Handle ``kernelmeter occupancy``.

    Per-SM limits come from the ``--cc`` table entry when ``--cc`` is given,
    otherwise from the attributes of device ``args.device``. Prints the
    occupancy for the requested launch configuration followed by a sweep
    over block sizes.

    Returns 0 on success. Returns 1, with the reason on stderr, when
    ``--cc`` is malformed or unknown, when no CUDA device is available, or
    when the occupancy model rejects the launch configuration.
    """
    if args.cc:
        try:
            major_text, minor_text = args.cc.split(".")
            major, minor = int(major_text), int(minor_text)
        except ValueError:
            print(
                f"error: invalid compute capability {args.cc!r}; "
                "expected MAJOR.MINOR, e.g. 8.6",
                file=sys.stderr,
            )
            return 1
        try:
            limits = _occupancy.limits_for_cc(major, minor)
        except ValueError as exc:
            print(f"error: {exc}", file=sys.stderr)
            return 1
        source = f"compute capability {args.cc}"
    else:
        try:
            limits = _occupancy.limits_from_attrs(_device_attrs(args.device))
            source = f"device {args.device}"
        except CudaNotAvailableError:
            print(
                "error: no CUDA device found. Pass --cc (e.g. --cc 8.6) to "
                "model a card you don't have.",
                file=sys.stderr,
            )
            return 1

    try:
        occ = _occupancy.compute(args.block, args.regs, args.smem, limits)
    except ValueError as exc:
        # e.g. a block size outside 1..1024: report it instead of a traceback.
        print(f"error: {exc}", file=sys.stderr)
        return 1

    print(f"occupancy for {source}")
    print(f" block={args.block} regs/thread={args.regs} smem/block={args.smem}\n")
    print(f" occupancy : {occ.pct:.1f}% ({occ.active_warps}/{occ.max_warps} warps per SM)")
    print(f" blocks per SM: {occ.blocks_per_sm}")
    print(f" limited by : {', '.join(occ.limited_by)}\n")

    sweep = _occupancy.block_size_sweep(args.regs, args.smem, limits)
    print(" block size " + "".join(f"{s:>7}" for s, _ in sweep))
    print(" occupancy " + "".join(f"{p:>6.0f}%" for _, p in sweep))
    return 0
238
+
239
+
240
+ # ---------------------------------------------------------------------------
241
+ # ceiling
242
+ # ---------------------------------------------------------------------------
243
+
244
def cmd_ceiling(args: argparse.Namespace) -> int:
    """Handle ``kernelmeter ceiling``.

    Runs ``ceiling.measure`` with the requested working-set and matmul
    sizes, then prints the results as indented JSON (``--json``) or as the
    lines produced by ``ceiling.format_table``. Always returns 0.
    """
    from . import ceiling as _ceiling

    measured = _ceiling.measure(mb=args.mb, matmul_n=args.matmul_n)
    if args.json:
        payload = [item.as_dict() for item in measured]
        print(json.dumps(payload, indent=2))
        return 0

    for row in _ceiling.format_table(measured):
        print(row)
    return 0
254
+
255
+
256
def main(argv: list[str] | None = None) -> int:
    """Parse ``argv`` and dispatch to the selected sub-command.

    ``argv`` defaults to ``sys.argv[1:]`` (argparse's behaviour for
    ``None``). Returns the exit status of the sub-command handler.
    """
    # The module docstring is a hand-aligned command table; the raw formatter
    # keeps argparse from re-flowing it into one paragraph in --help.
    parser = argparse.ArgumentParser(
        prog="kernelmeter",
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    sub = parser.add_subparsers(dest="command", required=True)

    p_info = sub.add_parser("info", help="dump all CUDA device attributes and derived peaks")
    p_info.add_argument("--json", action="store_true", help="machine-readable output")
    p_info.set_defaults(func=cmd_info)

    p_bench = sub.add_parser("bench", help="run @kernelmeter.benchmark specs from a file")
    p_bench.add_argument("file", help="python file that registers benchmarks")
    p_bench.add_argument("--json", action="store_true", help="machine-readable output")
    p_bench.add_argument("--save", metavar="FILE", help="write results to a json file")
    p_bench.add_argument(
        "--compare", metavar="FILE", help="compare against results saved with --save"
    )
    p_bench.add_argument(
        "--no-flush-l2",
        action="store_true",
        help="skip the L2 flush between iterations (lets small workloads run hot in cache)",
    )
    p_bench.set_defaults(func=cmd_bench)

    p_roof = sub.add_parser("roofline", help="draw the device roofline")
    p_roof.add_argument("--ai", type=float, help="mark a kernel at this arithmetic intensity")
    p_roof.add_argument("--device", type=int, default=0)
    p_roof.add_argument("--peak-bw", type=float, help="override bandwidth in GB/s")
    p_roof.add_argument("--peak-tflops", type=float, help="override compute in TFLOP/s")
    p_roof.set_defaults(func=cmd_roofline)

    p_occ = sub.add_parser("occupancy", help="theoretical occupancy calculator")
    p_occ.add_argument("--block", type=int, required=True, help="threads per block")
    p_occ.add_argument("--regs", type=int, default=0, help="registers per thread (from ptxas/ncu)")
    p_occ.add_argument("--smem", type=int, default=0, help="shared memory per block in bytes")
    p_occ.add_argument("--cc", help="compute capability, e.g. 8.6 (default: ask the device)")
    p_occ.add_argument("--device", type=int, default=0)
    p_occ.set_defaults(func=cmd_occupancy)

    p_ceil = sub.add_parser("ceiling", help="measure real achievable bandwidth and FP32")
    p_ceil.add_argument("--mb", type=int, default=256, help="working-set size per array")
    p_ceil.add_argument("--matmul-n", type=int, default=4096, help="matmul size for the FP32 test")
    p_ceil.add_argument("--json", action="store_true", help="machine-readable output")
    p_ceil.set_defaults(func=cmd_ceiling)

    args = parser.parse_args(argv)
    return args.func(args)
301
+
302
+
303
# Script entry point: the status returned by main() becomes the exit code.
if __name__ == "__main__":
    sys.exit(main())
kernelmeter/cudadrv.py ADDED
@@ -0,0 +1,129 @@
1
+ """Thin ctypes wrapper around the CUDA driver API (libcuda).
2
+
3
+ The driver API is used instead of the runtime API (libcudart) on purpose:
4
+ libcuda ships with the NVIDIA driver itself, so this works on any machine
5
+ that can run CUDA programs -- no CUDA toolkit installation required. Its C
6
+ ABI is stable, unlike ``cudaDeviceProp`` whose struct layout changes
7
+ between toolkit versions.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import ctypes
13
+ import sys
14
+ from dataclasses import dataclass
15
+
16
# CUresult status codes that this module inspects.
CUDA_SUCCESS = 0  # every other code is treated as a failure by Driver._check()
CUDA_ERROR_INVALID_VALUE = 1  # Driver.attribute() maps this code to None
CUDA_ERROR_NO_DEVICE = 100  # Driver._check() raises CudaNotAvailableError for this code
19
+
20
+
21
class CudaDriverError(RuntimeError):
    """Raised when a driver entry point returns an unexpected CUresult.

    ``func`` holds the name of the failing call and ``code`` the raw
    CUresult value; both also appear in the exception message.
    """

    def __init__(self, func: str, code: int):
        message = f"{func} failed with CUresult {code}"
        super().__init__(message)
        self.func = func
        self.code = code
28
+
29
+
30
class CudaNotAvailableError(RuntimeError):
    """libcuda could not be loaded or no NVIDIA device is present.

    Raised by ``load_library()`` when the platform has no candidate library
    or none of the candidates can be loaded, and by ``Driver._check()`` when
    a driver call returns ``CUDA_ERROR_NO_DEVICE``.
    """
32
+
33
+
34
# Shared-library names to try, in order, keyed by ``sys.platform``. An empty
# tuple marks a platform that load_library() rejects outright; platforms not
# listed here fall back to the "linux" names.
_LIB_CANDIDATES = {
    "linux": ("libcuda.so.1", "libcuda.so"),
    "win32": ("nvcuda.dll",),
    "darwin": (),  # NVIDIA dropped macOS support after CUDA 10.2
}
39
+
40
+
41
def load_library() -> ctypes.CDLL:
    """Locate and load the CUDA driver library for the current platform.

    The candidate names for ``sys.platform`` are tried in order and the
    first one that loads is returned. Platforms missing from the table use
    the Linux names.

    Raises:
        CudaNotAvailableError: if the platform has no candidates, or if
            every candidate fails to load (the message lists each attempt).
    """
    names = _LIB_CANDIDATES.get(sys.platform, _LIB_CANDIDATES["linux"])
    if not names:
        raise CudaNotAvailableError(
            "CUDA is not supported on this platform (no NVIDIA driver for "
            f"{sys.platform!r}). Run kernelmeter on a machine with an NVIDIA GPU."
        )

    on_windows = sys.platform == "win32"
    failures = []
    for name in names:
        try:
            if on_windows:
                return ctypes.WinDLL(name)  # pragma: no cover
            return ctypes.CDLL(name)
        except OSError as exc:  # library missing
            failures.append(f"{name}: {exc}")

    tried = "; ".join(failures)
    raise CudaNotAvailableError(
        "Could not load the CUDA driver library. Is the NVIDIA driver "
        "installed?\nTried: " + tried
    )
62
+
63
+
64
@dataclass
class DeviceHandle:
    """One CUDA device as described by ``Driver.device()``."""

    ordinal: int  # index that was passed to cuDeviceGet
    handle: int  # device value written by cuDeviceGet
    name: str  # cuDeviceGetName output decoded as UTF-8
    total_mem_bytes: int  # value written by cuDeviceTotalMem(_v2)
70
+
71
+
72
class Driver:
    """Small, injectable facade over the handful of libcuda calls we use.

    Tests hand in a fake ``lib`` object; production code calls ``Driver()``
    with no arguments, which loads the real libcuda. Construction calls
    ``cuInit(0)`` and raises if that fails.
    """

    def __init__(self, lib=None):
        if lib is None:
            lib = load_library()
        self._lib = lib
        self._check("cuInit", self._lib.cuInit(0))

    def _check(self, func: str, code: int) -> None:
        """Raise for any non-success CUresult returned by ``func``.

        ``CUDA_ERROR_NO_DEVICE`` becomes ``CudaNotAvailableError``; every
        other failure becomes ``CudaDriverError``.
        """
        if code == CUDA_SUCCESS:
            return
        if code == CUDA_ERROR_NO_DEVICE:
            raise CudaNotAvailableError(
                "The NVIDIA driver loaded but reported no CUDA-capable device."
            )
        raise CudaDriverError(func, code)

    def driver_version(self) -> tuple[int, int]:
        """Return the driver's CUDA version as ``(major, minor)``."""
        raw = ctypes.c_int(0)
        status = self._lib.cuDriverGetVersion(ctypes.byref(raw))
        self._check("cuDriverGetVersion", status)
        # The raw value is split as major * 1000 + minor * 10.
        major, remainder = divmod(raw.value, 1000)
        return major, remainder // 10

    def device_count(self) -> int:
        """Return the number of devices reported by ``cuDeviceGetCount``."""
        count = ctypes.c_int(0)
        status = self._lib.cuDeviceGetCount(ctypes.byref(count))
        self._check("cuDeviceGetCount", status)
        return count.value

    def device(self, ordinal: int) -> DeviceHandle:
        """Look up device ``ordinal`` and return its handle, name and memory size."""
        lib = self._lib

        cu_device = ctypes.c_int(0)
        self._check("cuDeviceGet", lib.cuDeviceGet(ctypes.byref(cu_device), ordinal))

        name_buf = ctypes.create_string_buffer(256)
        self._check("cuDeviceGetName", lib.cuDeviceGetName(name_buf, 256, cu_device))

        total_mem = ctypes.c_size_t(0)
        # cuDeviceTotalMem_v2 is the modern 64-bit symbol; fall back for
        # exotic builds that only export the unsuffixed name.
        total_mem_fn = getattr(lib, "cuDeviceTotalMem_v2", None)
        if not total_mem_fn:
            total_mem_fn = lib.cuDeviceTotalMem
        self._check("cuDeviceTotalMem", total_mem_fn(ctypes.byref(total_mem), cu_device))

        return DeviceHandle(
            ordinal=ordinal,
            handle=cu_device.value,
            name=name_buf.value.decode("utf-8", errors="replace"),
            total_mem_bytes=total_mem.value,
        )

    def attribute(self, device: DeviceHandle, attr_id: int) -> int | None:
        """Query a single device attribute.

        Returns None when the driver answers ``CUDA_ERROR_INVALID_VALUE``,
        i.e. it does not know the attribute id (older driver than the probe
        range). Any other failure raises through ``_check``.
        """
        value = ctypes.c_int(0)
        status = self._lib.cuDeviceGetAttribute(ctypes.byref(value), attr_id, device.handle)
        if status != CUDA_ERROR_INVALID_VALUE:
            self._check("cuDeviceGetAttribute", status)
            return value.value
        return None
@@ -0,0 +1,102 @@
1
+ """Theoretical occupancy from block size, registers and shared memory.
2
+
3
+ This reimplements the model behind NVIDIA's old occupancy calculator
4
+ spreadsheet. It is a model, not a measurement: the real limiter can also
5
+ be launch bounds or the driver, but for the everyday question ("why is my
6
+ occupancy 50% and what do I change?") the model is what you want.
7
+
8
+ Registers are allocated per warp in units of 256. Shared memory is
9
+ allocated per block in architecture-specific units, and on Ampere and
10
+ newer the runtime reserves about 1 KiB per block on top of what you ask
11
+ for.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ from dataclasses import dataclass
17
+
18
REG_ALLOC_UNIT = 256  # registers, rounded up per warp

# Per-SM limits by compute capability. Values from the CUDA C programming
# guide's compute-capability table.
#
# Keys of each entry, as consumed by compute():
#   max_warps      resident warps per SM
#   max_blocks     resident blocks per SM
#   regs           registers per SM
#   smem           shared memory per SM
#   smem_gran      unit the per-block shared-memory request is rounded up to
#   reserved_smem  extra shared memory added to every non-zero per-block request
ARCH_LIMITS: dict[tuple[int, int], dict] = {
    (7, 0): dict(max_warps=64, max_blocks=32, regs=65536, smem=98304, smem_gran=256, reserved_smem=0),
    (7, 5): dict(max_warps=32, max_blocks=16, regs=65536, smem=65536, smem_gran=256, reserved_smem=0),
    (8, 0): dict(max_warps=64, max_blocks=32, regs=65536, smem=167936, smem_gran=128, reserved_smem=1024),
    (8, 6): dict(max_warps=48, max_blocks=16, regs=65536, smem=102400, smem_gran=128, reserved_smem=1024),
    (8, 7): dict(max_warps=48, max_blocks=16, regs=65536, smem=167936, smem_gran=128, reserved_smem=1024),
    (8, 9): dict(max_warps=48, max_blocks=24, regs=65536, smem=102400, smem_gran=128, reserved_smem=1024),
    (9, 0): dict(max_warps=64, max_blocks=32, regs=65536, smem=233472, smem_gran=128, reserved_smem=1024),
    (12, 0): dict(max_warps=48, max_blocks=24, regs=65536, smem=102400, smem_gran=128, reserved_smem=1024),
}
32
+
33
+
34
def limits_for_cc(major: int, minor: int) -> dict:
    """Return the ``ARCH_LIMITS`` entry for compute capability ``major.minor``.

    A capability missing from the table uses the entry of the closest
    older capability that is listed.

    Raises:
        ValueError: if the capability is older than every table entry.
    """
    wanted = (major, minor)
    if wanted in ARCH_LIMITS:
        return ARCH_LIMITS[wanted]

    nearest = max((cc for cc in ARCH_LIMITS if cc <= wanted), default=None)
    if nearest is None:
        raise ValueError(f"no occupancy data for compute capability {major}.{minor}")
    return ARCH_LIMITS[nearest]
41
+
42
+
43
def limits_from_attrs(attrs: dict[str, int]) -> dict:
    """Build a limits dict from live device attributes.

    Every limit prefers the value the driver reported and falls back to the
    arch table entry for the device's compute capability. The
    shared-memory allocation granularity always comes from the arch table.
    """
    arch = limits_for_cc(
        attrs["compute_capability_major"], attrs["compute_capability_minor"]
    )
    threads_per_sm = attrs.get("max_threads_per_multiprocessor", arch["max_warps"] * 32)
    return {
        "max_warps": threads_per_sm // 32,
        "max_blocks": attrs.get("max_blocks_per_multiprocessor", arch["max_blocks"]),
        "regs": attrs.get("max_registers_per_multiprocessor", arch["regs"]),
        "smem": attrs.get("max_shared_memory_per_multiprocessor", arch["smem"]),
        "smem_gran": arch["smem_gran"],
        "reserved_smem": attrs.get("reserved_shared_memory_per_block", arch["reserved_smem"]),
    }
56
+
57
+
58
+ def _ceil_to(value: int, unit: int) -> int:
59
+ return (value + unit - 1) // unit * unit
60
+
61
+
62
@dataclass
class Occupancy:
    """Result of the occupancy model for one launch configuration."""

    blocks_per_sm: int
    active_warps: int
    max_warps: int
    pct: float
    limited_by: list[str]

    def as_dict(self) -> dict:
        """Return a shallow copy of the instance's fields as a dict."""
        return dict(vars(self))
72
+
73
+
74
def compute(block: int, regs_per_thread: int, smem_per_block: int, limits: dict) -> Occupancy:
    """Model the theoretical occupancy of one launch configuration.

    Args:
        block: threads per block, 1..1024.
        regs_per_thread: registers per thread; 0 skips the register limit.
        smem_per_block: shared memory requested per block; 0 skips the
            shared-memory limit.
        limits: per-SM limits with the keys ``max_warps``, ``max_blocks``,
            ``regs``, ``smem``, ``smem_gran`` and ``reserved_smem``.

    Returns:
        An ``Occupancy`` holding the resident blocks per SM, the active
        warps that implies, the percentage of ``max_warps``, and the name
        of every limiter that ties for the minimum.

    Raises:
        ValueError: if ``block`` is outside 1..1024, or if
            ``regs_per_thread`` or ``smem_per_block`` is negative. Negative
            resources would otherwise give negative block counts.
    """
    if block < 1 or block > 1024:
        raise ValueError("block size must be 1..1024")
    if regs_per_thread < 0:
        raise ValueError("registers per thread must not be negative")
    if smem_per_block < 0:
        raise ValueError("shared memory per block must not be negative")
    warps_per_block = (block + 31) // 32

    # Blocks per SM allowed by each resource; the smallest one wins.
    by = {"blocks": limits["max_blocks"], "threads": limits["max_warps"] // warps_per_block}
    if regs_per_thread:
        # Registers are allocated per warp, rounded up to the allocation unit.
        regs_per_warp = _ceil_to(regs_per_thread * 32, REG_ALLOC_UNIT)
        by["registers"] = (limits["regs"] // regs_per_warp) // warps_per_block
    # The reserved amount only applies to blocks that request shared memory.
    smem_total = smem_per_block + limits["reserved_smem"] if smem_per_block else 0
    if smem_total:
        by["shared memory"] = limits["smem"] // _ceil_to(smem_total, limits["smem_gran"])

    blocks = min(by.values())
    warps = blocks * warps_per_block
    return Occupancy(
        blocks_per_sm=blocks,
        active_warps=warps,
        max_warps=limits["max_warps"],
        pct=100.0 * warps / limits["max_warps"],
        limited_by=[k for k, v in by.items() if v == blocks],
    )
96
+
97
+
98
def block_size_sweep(
    regs_per_thread: int, smem_per_block: int, limits: dict,
    sizes: tuple = (64, 128, 192, 256, 384, 512, 768, 1024),
) -> list[tuple[int, float]]:
    """Return ``(block_size, occupancy_pct)`` for every size in ``sizes``."""
    sweep = []
    for size in sizes:
        result = compute(size, regs_per_thread, smem_per_block, limits)
        sweep.append((size, result.pct))
    return sweep
kernelmeter/peaks.py ADDED
@@ -0,0 +1,83 @@
1
+ """Derive theoretical peak throughput ("speed of light") from device attributes.
2
+
3
+ These numbers are upper bounds computed from the max boost clock the driver
4
+ reports; sustained clocks under load are usually lower, so treat a kernel
5
+ at 85%+ of these peaks as effectively saturating the machine.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from dataclasses import dataclass
11
+
12
# FP32 CUDA cores per SM, keyed by compute capability. The fallback for an
# unknown capability is the value of the closest older architecture.
# fp32_cores_per_sm() returns 64 for a capability older than every key here.
_FP32_CORES_PER_SM: dict[tuple[int, int], int] = {
    (3, 0): 192, (3, 5): 192, (3, 7): 192,
    (5, 0): 128, (5, 2): 128, (5, 3): 128,
    (6, 0): 64, (6, 1): 128, (6, 2): 128,
    (7, 0): 64, (7, 2): 64, (7, 5): 64,
    (8, 0): 64, (8, 6): 128, (8, 7): 128, (8, 9): 128,
    (9, 0): 128,
    (10, 0): 128, (10, 3): 128,
    (12, 0): 128, (12, 1): 128,
}
24
+
25
+
26
def fp32_cores_per_sm(major: int, minor: int) -> int:
    """Return the FP32 core count per SM for compute capability ``major.minor``.

    A capability missing from the table uses the closest older entry; one
    older than every entry yields 64.
    """
    wanted = (major, minor)
    if wanted in _FP32_CORES_PER_SM:
        return _FP32_CORES_PER_SM[wanted]

    nearest = max((cc for cc in _FP32_CORES_PER_SM if cc <= wanted), default=None)
    if nearest is None:
        return 64
    return _FP32_CORES_PER_SM[nearest]
33
+
34
+
35
@dataclass
class Peaks:
    """Theoretical ceilings for one device, derived from driver attributes.

    Any field is None when the attributes needed to compute it were missing.
    """

    mem_bandwidth_gbs: float | None
    fp32_tflops: float | None
    compute_capability: tuple[int, int] | None

    def as_dict(self) -> dict:
        """Return the peaks under their report keys.

        The compute capability is rendered as a ``"major.minor"`` string,
        or None when it is unknown.
        """
        cc_text = None
        if self.compute_capability:
            major = self.compute_capability[0]
            minor = self.compute_capability[1]
            cc_text = f"{major}.{minor}"
        return {
            "theoretical_mem_bandwidth_gb_s": self.mem_bandwidth_gbs,
            "theoretical_fp32_tflops": self.fp32_tflops,
            "compute_capability": cc_text,
        }
53
+
54
+
55
def mem_bandwidth_gbs(memory_clock_khz: int, bus_width_bits: int) -> float:
    """Return the theoretical memory bandwidth in GB/s.

    DDR: two transfers per clock, so the result is
    clock(kHz) * 1e3 * width(bytes) * 2 / 1e9.
    """
    transfers_per_second = 2.0 * memory_clock_khz * 1e3
    bytes_per_transfer = bus_width_bits / 8.0
    return transfers_per_second * bytes_per_transfer / 1e9
58
+
59
+
60
def fp32_tflops(sm_count: int, clock_khz: int, major: int, minor: int) -> float:
    """Return the theoretical FP32 peak in TFLOP/s.

    Each core counts as one FMA per clock, i.e. 2 FLOPs.
    """
    cores_per_sm = fp32_cores_per_sm(major, minor)
    flops_per_cycle = 2.0 * sm_count * cores_per_sm
    return flops_per_cycle * clock_khz * 1e3 / 1e12
64
+
65
+
66
def derive(attrs: dict[str, int]) -> Peaks:
    """Compute peaks from a query_all() attribute dict, tolerating gaps.

    Bandwidth needs the memory clock and bus width. The FP32 peak needs the
    compute capability plus the SM count and core clock. Whatever cannot be
    computed is left as None.
    """

    def has(*keys: str) -> bool:
        # True only when every listed attribute was reported.
        return all(key in attrs for key in keys)

    bandwidth = None
    if has("memory_clock_rate_khz", "global_memory_bus_width_bits"):
        bandwidth = mem_bandwidth_gbs(
            attrs["memory_clock_rate_khz"], attrs["global_memory_bus_width_bits"]
        )

    capability = None
    tflops = None
    if has("compute_capability_major", "compute_capability_minor"):
        capability = (attrs["compute_capability_major"], attrs["compute_capability_minor"])
        if has("multiprocessor_count", "clock_rate_khz"):
            tflops = fp32_tflops(
                attrs["multiprocessor_count"],
                attrs["clock_rate_khz"],
                capability[0],
                capability[1],
            )

    return Peaks(mem_bandwidth_gbs=bandwidth, fp32_tflops=tflops, compute_capability=capability)