kernelmeter 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kernelmeter/__init__.py +28 -0
- kernelmeter/attrs.py +149 -0
- kernelmeter/bench.py +292 -0
- kernelmeter/ceiling.py +111 -0
- kernelmeter/cli.py +304 -0
- kernelmeter/cudadrv.py +129 -0
- kernelmeter/occupancy.py +102 -0
- kernelmeter/peaks.py +83 -0
- kernelmeter/roofline.py +89 -0
- kernelmeter-0.2.0.dist-info/METADATA +309 -0
- kernelmeter-0.2.0.dist-info/RECORD +15 -0
- kernelmeter-0.2.0.dist-info/WHEEL +5 -0
- kernelmeter-0.2.0.dist-info/entry_points.txt +2 -0
- kernelmeter-0.2.0.dist-info/licenses/LICENSE +21 -0
- kernelmeter-0.2.0.dist-info/top_level.txt +1 -0
kernelmeter/cli.py
ADDED
|
@@ -0,0 +1,304 @@
|
|
|
1
|
+
"""kernelmeter command line interface.
|
|
2
|
+
|
|
3
|
+
kernelmeter info dump every device attribute + derived peaks
|
|
4
|
+
kernelmeter bench file.py run all @kernelmeter.benchmark specs in file
|
|
5
|
+
kernelmeter roofline draw the device roofline, place your kernel on it
|
|
6
|
+
kernelmeter occupancy theoretical occupancy from block/regs/smem
|
|
7
|
+
kernelmeter ceiling measure real achievable bandwidth and FP32
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import argparse
|
|
13
|
+
import importlib.util
|
|
14
|
+
import json
|
|
15
|
+
import sys
|
|
16
|
+
|
|
17
|
+
from . import attrs as _attrs
|
|
18
|
+
from . import occupancy as _occupancy
|
|
19
|
+
from . import peaks as _peaks
|
|
20
|
+
from . import roofline as _roofline
|
|
21
|
+
from .cudadrv import CudaNotAvailableError, Driver
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _fmt(value: float | None, suffix: str = "", nd: int = 1) -> str:
    """Render ``value`` with ``nd`` decimals followed by ``suffix``.

    A missing value (``None``) is rendered as a single dash so table
    columns stay populated.
    """
    if value is None:
        return "-"
    return f"{value:.{nd}f}{suffix}"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _device_attrs(ordinal: int = 0) -> dict[str, int]:
    """Open the CUDA driver and return the attribute dict of device ``ordinal``."""
    drv = Driver()
    handle = drv.device(ordinal)
    return _attrs.query_all(drv, handle)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# ---------------------------------------------------------------------------
|
|
34
|
+
# info
|
|
35
|
+
# ---------------------------------------------------------------------------
|
|
36
|
+
|
|
37
|
+
def gather_info(driver: Driver) -> dict:
    """Collect the driver version and one report per visible device.

    The result has two keys: ``driver_version`` (a ``"major.minor"``
    string) and ``devices``, a list with one dict per ordinal holding the
    device name, total memory, derived peaks and the raw attribute dict.
    """
    major, minor = driver.driver_version()

    def describe(ordinal: int) -> dict:
        # One entry of the "devices" list.
        dev = driver.device(ordinal)
        attributes = _attrs.query_all(driver, dev)
        derived = _peaks.derive(attributes)
        return {
            "ordinal": ordinal,
            "name": dev.name,
            "total_memory_bytes": dev.total_mem_bytes,
            "derived": derived.as_dict(),
            "attributes": attributes,
        }

    devices = [describe(ordinal) for ordinal in range(driver.device_count())]
    return {"driver_version": f"{major}.{minor}", "devices": devices}
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def cmd_info(args: argparse.Namespace) -> int:
    """Handle ``kernelmeter info``.

    Prints the gathered report either as JSON (``args.json``) or as a
    human-readable listing. Returns 0 on success and 1 when the CUDA
    driver is not available.
    """
    try:
        driver = Driver()
    except CudaNotAvailableError as exc:
        print(f"error: {exc}", file=sys.stderr)
        return 1

    report = gather_info(driver)
    if args.json:
        print(json.dumps(report, indent=2))
        return 0

    print(f"CUDA driver version : {report['driver_version']}")
    for device in report["devices"]:
        size_gib = device["total_memory_bytes"] / 2**30
        print(f"\nDevice {device['ordinal']}: {device['name']} ({size_gib:.1f} GiB)")

        peaks = device["derived"]
        print(f" compute capability : {peaks['compute_capability']}")
        bandwidth = _fmt(peaks["theoretical_mem_bandwidth_gb_s"], " GB/s")
        print(" theoretical mem bandwidth : " + bandwidth)
        fp32 = _fmt(peaks["theoretical_fp32_tflops"], " TFLOP/s", nd=2)
        print(" theoretical FP32 peak : " + fp32)

        # Raw attribute table: header, rule, then one row per attribute.
        print(f"\n {'attribute':<48} value")
        print(f" {'-' * 48} {'-' * 12}")
        for key, val in device["attributes"].items():
            print(f" {key:<48} {val}")
    return 0
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
# ---------------------------------------------------------------------------
|
|
90
|
+
# bench
|
|
91
|
+
# ---------------------------------------------------------------------------
|
|
92
|
+
|
|
93
|
+
def _load_module(path: str) -> None:
    """Import the Python file at ``path`` for its side effects.

    The file is executed as a module named ``kernelmeter_bench_target``.
    The module is registered in ``sys.modules`` before it runs, because
    code in the target may look its own module up there (dataclasses do
    this when resolving string annotations). If execution fails, the
    previous ``sys.modules`` state is restored and the error propagates.

    Raises:
        SystemExit: when no import spec/loader can be built for ``path``.
    """
    name = "kernelmeter_bench_target"
    spec = importlib.util.spec_from_file_location(name, path)
    if spec is None or spec.loader is None:
        raise SystemExit(f"error: cannot import {path}")
    module = importlib.util.module_from_spec(spec)

    previous = sys.modules.get(name)
    sys.modules[name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Don't leave a half-initialised module behind.
        if previous is None:
            sys.modules.pop(name, None)
        else:
            sys.modules[name] = previous
        raise
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def cmd_bench(args: argparse.Namespace) -> int:
    """Handle ``kernelmeter bench``.

    Loads ``args.file``, runs every registered benchmark, optionally saves
    the results (``args.save``), prints them as JSON or as a table, and
    optionally compares them with a saved baseline (``args.compare``).

    Returns 0 when every benchmark ran without error, none failed its
    correctness check and no regression was reported; otherwise 1.
    """
    from . import bench as _bench

    _load_module(args.file)
    if not _bench.REGISTRY:
        message = (
            f"error: {args.file} registered no benchmarks. "
            "Decorate your kernels with @kernelmeter.benchmark(...)."
        )
        print(message, file=sys.stderr)
        return 1

    results = _bench.run_registry(flush_l2=not args.no_flush_l2)

    if args.save:
        with open(args.save, "w") as fh:
            json.dump([res.as_dict() for res in results], fh, indent=2)

    if args.json:
        print(json.dumps([res.as_dict() for res in results], indent=2))
    else:
        columns = (
            f"{'kernel':<24} {'median ms':>10} {'GB/s':>9} {'TFLOP/s':>9} "
            f"{'bound':>6} {'%roof':>7} {'vs ref':>8} {'correct':>8}"
        )
        print(columns)
        print("-" * len(columns))
        for res in results:
            if res.error:
                print(f"{res.name:<24} failed: {res.error}")
                continue

            if res.correct is None:
                verdict = "-"
            elif res.correct:
                verdict = "PASS"
            else:
                verdict = "FAIL"

            speedup = "-"
            if res.speedup_vs_ref:
                speedup = f"{res.speedup_vs_ref:.2f}x"

            print(
                f"{res.name:<24} {res.ms_median:>10.4f} {_fmt(res.gbps):>9} "
                f"{_fmt(res.tflops, nd=2):>9} {res.bound or '-':>6} "
                f"{_fmt(res.pct_roofline, '%'):>7} {speedup:>8} {verdict:>8}"
            )

    # A run is clean when nothing errored and nothing failed verification.
    ok = not any(
        res.error is not None or res.correct not in (None, True) for res in results
    )

    if args.compare:
        with open(args.compare) as fh:
            baseline = json.load(fh)
        rows, regressions = _bench.diff_results(baseline, results)
        if rows:
            print(f"\n{'kernel':<24} {'baseline ms':>12} {'now ms':>10} {'delta':>8}")
            for kernel, before_ms, after_ms, change in rows:
                print(f"{kernel:<24} {before_ms:>12.4f} {after_ms:>10.4f} {change:>+7.1f}%")
        for kernel in regressions:
            print(f"regression: {kernel} is more than 5% slower than the baseline")
        if regressions:
            ok = False

    if ok:
        return 0
    return 1
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
# ---------------------------------------------------------------------------
|
|
159
|
+
# roofline
|
|
160
|
+
# ---------------------------------------------------------------------------
|
|
161
|
+
|
|
162
|
+
def cmd_roofline(args: argparse.Namespace) -> int:
    """Handle ``kernelmeter roofline``.

    Peaks are taken from ``--peak-bw`` / ``--peak-tflops`` when given. Any
    peak that is missing (or passed as 0) is derived from the attributes
    of CUDA device ``args.device``. With ``--ai`` the kernel's bound and
    attainable throughput are printed below the plot.

    Returns 0 on success. Returns 1 when an override is negative, when no
    CUDA device is available to fill in a missing peak, or when the device
    does not expose enough attributes to derive one.
    """
    peak_bw, peak_tf = args.peak_bw, args.peak_tflops
    for flag, value in (("--peak-bw", peak_bw), ("--peak-tflops", peak_tf)):
        if value is not None and value < 0:
            print(f"error: {flag} must be a positive number", file=sys.stderr)
            return 1

    name = None
    # A 0 override means "ask the device", matching the ``or`` fallbacks
    # below, so the device is queried whenever either peak is unusable.
    if not peak_bw or not peak_tf:
        try:
            driver = Driver()
            dev = driver.device(args.device)
            name = dev.name
            peaks = _peaks.derive(_attrs.query_all(driver, dev))
            peak_bw = peak_bw or peaks.mem_bandwidth_gbs
            peak_tf = peak_tf or peaks.fp32_tflops
        except CudaNotAvailableError:
            pass
    if not peak_bw or not peak_tf:
        if name is None:
            print(
                "error: no CUDA device found. Pass --peak-bw GB/s and "
                "--peak-tflops to draw a roofline for any card.",
                file=sys.stderr,
            )
        else:
            # The device exists, but derive() could not produce a peak.
            print(
                f"error: could not derive the peaks of device {args.device} "
                f"({name}) from its attributes. Pass --peak-bw GB/s and "
                "--peak-tflops explicitly.",
                file=sys.stderr,
            )
        return 1

    ridge = _roofline.ridge_point(peak_tf, peak_bw)
    if name:
        print(f"Device {args.device}: {name}")
    print(f" peak bandwidth : {peak_bw:.1f} GB/s")
    print(f" peak compute : {peak_tf:.2f} TFLOP/s")
    print(f" ridge point : {ridge:.1f} flop/byte\n")
    for line in _roofline.render(peak_tf, peak_bw, ai=args.ai):
        print(line)
    if args.ai is not None:
        attainable = _roofline.attainable_tflops(args.ai, peak_tf, peak_bw)
        which = _roofline.bound(args.ai, peak_tf, peak_bw)
        kind = "memory" if which == "mem" else "compute"
        print(
            f"\nat {args.ai:g} flop/byte the kernel is {kind}-bound; "
            f"attainable: {attainable:.2f} TFLOP/s"
        )
    return 0
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
# ---------------------------------------------------------------------------
|
|
203
|
+
# occupancy
|
|
204
|
+
# ---------------------------------------------------------------------------
|
|
205
|
+
|
|
206
|
+
def cmd_occupancy(args: argparse.Namespace) -> int:
    """Handle ``kernelmeter occupancy``.

    Per-SM limits come from the arch table when ``--cc`` is given and from
    the live device ``args.device`` otherwise. Prints the occupancy for
    the requested launch configuration followed by a block-size sweep.

    Returns 0 on success. Every invalid-input case (negative ``--regs`` or
    ``--smem``, malformed or unsupported ``--cc``, out-of-range ``--block``,
    no CUDA device) prints a one-line error to stderr and returns 1 instead
    of surfacing a traceback.
    """
    if args.regs < 0 or args.smem < 0:
        print("error: --regs and --smem must not be negative", file=sys.stderr)
        return 1

    if args.cc:
        try:
            major, minor = (int(x) for x in args.cc.split("."))
        except ValueError:
            # Covers both non-numeric parts and a wrong number of parts.
            print(
                f"error: --cc must look like MAJOR.MINOR (e.g. 8.6), got {args.cc!r}",
                file=sys.stderr,
            )
            return 1
        try:
            limits = _occupancy.limits_for_cc(major, minor)
        except ValueError as exc:
            print(f"error: {exc}", file=sys.stderr)
            return 1
        source = f"compute capability {args.cc}"
    else:
        try:
            limits = _occupancy.limits_from_attrs(_device_attrs(args.device))
        except CudaNotAvailableError:
            print(
                "error: no CUDA device found. Pass --cc (e.g. --cc 8.6) to "
                "model a card you don't have.",
                file=sys.stderr,
            )
            return 1
        except ValueError as exc:
            # limits_for_cc has no data for this device's capability.
            print(f"error: {exc}", file=sys.stderr)
            return 1
        source = f"device {args.device}"

    try:
        occ = _occupancy.compute(args.block, args.regs, args.smem, limits)
    except ValueError as exc:
        # compute() rejects block sizes outside 1..1024.
        print(f"error: {exc}", file=sys.stderr)
        return 1

    print(f"occupancy for {source}")
    print(f" block={args.block} regs/thread={args.regs} smem/block={args.smem}\n")
    print(f" occupancy : {occ.pct:.1f}% ({occ.active_warps}/{occ.max_warps} warps per SM)")
    print(f" blocks per SM: {occ.blocks_per_sm}")
    print(f" limited by : {', '.join(occ.limited_by)}\n")

    sweep = _occupancy.block_size_sweep(args.regs, args.smem, limits)
    print(" block size " + "".join(f"{s:>7}" for s, _ in sweep))
    print(" occupancy " + "".join(f"{p:>6.0f}%" for _, p in sweep))
    return 0
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
# ---------------------------------------------------------------------------
|
|
241
|
+
# ceiling
|
|
242
|
+
# ---------------------------------------------------------------------------
|
|
243
|
+
|
|
244
|
+
def cmd_ceiling(args: argparse.Namespace) -> int:
    """Handle ``kernelmeter ceiling``.

    Runs the ceiling measurement with the requested working-set and matmul
    sizes and prints the results as JSON (``args.json``) or as the table
    produced by ``format_table``. Always returns 0.
    """
    from . import ceiling as _ceiling

    measurements = _ceiling.measure(mb=args.mb, matmul_n=args.matmul_n)
    if not args.json:
        for row in _ceiling.format_table(measurements):
            print(row)
        return 0

    payload = [m.as_dict() for m in measurements]
    print(json.dumps(payload, indent=2))
    return 0
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
def main(argv: list[str] | None = None) -> int:
    """Parse the command line and dispatch to the chosen subcommand.

    ``argv`` is handed to ``ArgumentParser.parse_args``; ``None`` makes
    argparse read ``sys.argv``. Returns the exit status of the subcommand
    handler.
    """
    parser = argparse.ArgumentParser(
        prog="kernelmeter",
        description=__doc__,
        # The module docstring is a hand-laid-out command table; the
        # default formatter would re-wrap it into a single paragraph.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    sub = parser.add_subparsers(dest="command", required=True)

    p_info = sub.add_parser("info", help="dump all CUDA device attributes and derived peaks")
    p_info.add_argument("--json", action="store_true", help="machine-readable output")
    p_info.set_defaults(func=cmd_info)

    p_bench = sub.add_parser("bench", help="run @kernelmeter.benchmark specs from a file")
    p_bench.add_argument("file", help="python file that registers benchmarks")
    p_bench.add_argument("--json", action="store_true", help="machine-readable output")
    p_bench.add_argument("--save", metavar="FILE", help="write results to a json file")
    p_bench.add_argument(
        "--compare", metavar="FILE", help="compare against results saved with --save"
    )
    p_bench.add_argument(
        "--no-flush-l2",
        action="store_true",
        help="skip the L2 flush between iterations (lets small workloads run hot in cache)",
    )
    p_bench.set_defaults(func=cmd_bench)

    p_roof = sub.add_parser("roofline", help="draw the device roofline")
    p_roof.add_argument("--ai", type=float, help="mark a kernel at this arithmetic intensity")
    p_roof.add_argument("--device", type=int, default=0)
    p_roof.add_argument("--peak-bw", type=float, help="override bandwidth in GB/s")
    p_roof.add_argument("--peak-tflops", type=float, help="override compute in TFLOP/s")
    p_roof.set_defaults(func=cmd_roofline)

    p_occ = sub.add_parser("occupancy", help="theoretical occupancy calculator")
    p_occ.add_argument("--block", type=int, required=True, help="threads per block")
    p_occ.add_argument("--regs", type=int, default=0, help="registers per thread (from ptxas/ncu)")
    p_occ.add_argument("--smem", type=int, default=0, help="shared memory per block in bytes")
    p_occ.add_argument("--cc", help="compute capability, e.g. 8.6 (default: ask the device)")
    p_occ.add_argument("--device", type=int, default=0)
    p_occ.set_defaults(func=cmd_occupancy)

    p_ceil = sub.add_parser("ceiling", help="measure real achievable bandwidth and FP32")
    p_ceil.add_argument("--mb", type=int, default=256, help="working-set size per array")
    p_ceil.add_argument("--matmul-n", type=int, default=4096, help="matmul size for the FP32 test")
    p_ceil.add_argument("--json", action="store_true", help="machine-readable output")
    p_ceil.set_defaults(func=cmd_ceiling)

    args = parser.parse_args(argv)
    return args.func(args)
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
# Script entry point: exit with whatever status main() returns.
if __name__ == "__main__":
    sys.exit(main())
|
kernelmeter/cudadrv.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
"""Thin ctypes wrapper around the CUDA driver API (libcuda).
|
|
2
|
+
|
|
3
|
+
The driver API is used instead of the runtime API (libcudart) on purpose:
|
|
4
|
+
libcuda ships with the NVIDIA driver itself, so this works on any machine
|
|
5
|
+
that can run CUDA programs -- no CUDA toolkit installation required. Its C
|
|
6
|
+
ABI is stable, unlike ``cudaDeviceProp`` whose struct layout changes
|
|
7
|
+
between toolkit versions.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import ctypes
|
|
13
|
+
import sys
|
|
14
|
+
from dataclasses import dataclass
|
|
15
|
+
|
|
16
|
+
# CUresult codes this module compares driver return values against.
CUDA_SUCCESS = 0
# Treated by Driver.attribute as "the driver does not know this attribute id".
CUDA_ERROR_INVALID_VALUE = 1
# Mapped by Driver._check to CudaNotAvailableError.
CUDA_ERROR_NO_DEVICE = 100
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class CudaDriverError(RuntimeError):
    """A CUDA driver call returned an unexpected CUresult.

    ``func`` is the name of the entry point that failed and ``code`` the
    raw CUresult value it returned.
    """

    def __init__(self, func: str, code: int):
        message = f"{func} failed with CUresult {code}"
        super().__init__(message)
        self.func = func
        self.code = code
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class CudaNotAvailableError(RuntimeError):
    """libcuda could not be loaded or no NVIDIA device is present.

    Raised by ``load_library`` when the platform has no candidate library
    or none of the candidates can be loaded, and by ``Driver._check`` when
    a driver call returns ``CUDA_ERROR_NO_DEVICE``.
    """
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# Driver library names to try, keyed by ``sys.platform``. An empty tuple
# marks a platform with no NVIDIA driver at all; ``load_library`` uses the
# "linux" entry for any platform that is not listed here.
_LIB_CANDIDATES = {
    "linux": ("libcuda.so.1", "libcuda.so"),
    "win32": ("nvcuda.dll",),
    "darwin": (),  # NVIDIA dropped macOS support after CUDA 10.2
}
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def load_library() -> ctypes.CDLL:
    """Locate and load the CUDA driver library for the current platform.

    Tries each candidate name for ``sys.platform`` in order and returns
    the first that loads. Platforms without an entry use the Linux names.

    Raises:
        CudaNotAvailableError: the platform has no candidates, or none of
            them could be loaded (the message lists every attempt).
    """
    names = _LIB_CANDIDATES.get(sys.platform, _LIB_CANDIDATES["linux"])
    if not names:
        raise CudaNotAvailableError(
            "CUDA is not supported on this platform (no NVIDIA driver for "
            f"{sys.platform!r}). Run kernelmeter on a machine with an NVIDIA GPU."
        )

    on_windows = sys.platform == "win32"
    failures = []
    for name in names:
        try:
            if on_windows:
                return ctypes.WinDLL(name)  # pragma: no cover
            return ctypes.CDLL(name)
        except OSError as exc:  # library missing
            failures.append(f"{name}: {exc}")

    tried = "; ".join(failures)
    raise CudaNotAvailableError(
        "Could not load the CUDA driver library. Is the NVIDIA driver "
        "installed?\nTried: " + tried
    )
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
@dataclass
class DeviceHandle:
    """One CUDA device as described by ``Driver.device``."""

    # Index that was passed to cuDeviceGet.
    ordinal: int
    # Device handle value that cuDeviceGet wrote back.
    handle: int
    # Name from cuDeviceGetName, decoded as UTF-8 with replacement.
    name: str
    # Value written by cuDeviceTotalMem(_v2).
    total_mem_bytes: int
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class Driver:
    """Small, injectable wrapper around the driver entry points we use.

    Tests can hand in a fake ``lib`` object; ``Driver()`` with no argument
    loads the real libcuda. Construction calls ``cuInit(0)``.
    """

    def __init__(self, lib=None):
        if lib is None:
            lib = load_library()
        self._lib = lib
        self._check("cuInit", self._lib.cuInit(0))

    def _check(self, func: str, code: int) -> None:
        """Raise for any CUresult other than success.

        ``CUDA_ERROR_NO_DEVICE`` becomes ``CudaNotAvailableError``; every
        other failure becomes ``CudaDriverError(func, code)``.
        """
        if code == CUDA_SUCCESS:
            return
        if code == CUDA_ERROR_NO_DEVICE:
            raise CudaNotAvailableError(
                "The NVIDIA driver loaded but reported no CUDA-capable device."
            )
        raise CudaDriverError(func, code)

    def driver_version(self) -> tuple[int, int]:
        """Return ``(major, minor)`` decoded from cuDriverGetVersion."""
        raw = ctypes.c_int(0)
        status = self._lib.cuDriverGetVersion(ctypes.byref(raw))
        self._check("cuDriverGetVersion", status)
        # The raw value is split as 1000 * major + 10 * minor.
        major, remainder = divmod(raw.value, 1000)
        return major, remainder // 10

    def device_count(self) -> int:
        """Return the number of devices reported by cuDeviceGetCount."""
        count = ctypes.c_int(0)
        status = self._lib.cuDeviceGetCount(ctypes.byref(count))
        self._check("cuDeviceGetCount", status)
        return count.value

    def device(self, ordinal: int) -> DeviceHandle:
        """Look up device ``ordinal`` and return its handle, name and memory size."""
        cu_dev = ctypes.c_int(0)
        status = self._lib.cuDeviceGet(ctypes.byref(cu_dev), ordinal)
        self._check("cuDeviceGet", status)

        name_buf = ctypes.create_string_buffer(256)
        status = self._lib.cuDeviceGetName(name_buf, 256, cu_dev)
        self._check("cuDeviceGetName", status)

        total = ctypes.c_size_t(0)
        # cuDeviceTotalMem_v2 is the modern 64-bit symbol; fall back for
        # exotic builds that only export the unsuffixed name.
        total_mem = getattr(self._lib, "cuDeviceTotalMem_v2", None) or self._lib.cuDeviceTotalMem
        status = total_mem(ctypes.byref(total), cu_dev)
        self._check("cuDeviceTotalMem", status)

        device_name = name_buf.value.decode("utf-8", errors="replace")
        return DeviceHandle(
            ordinal=ordinal,
            handle=cu_dev.value,
            name=device_name,
            total_mem_bytes=total.value,
        )

    def attribute(self, device: DeviceHandle, attr_id: int) -> int | None:
        """Query one device attribute.

        Returns None when the driver answers ``CUDA_ERROR_INVALID_VALUE``,
        i.e. it does not know the attribute id (older driver than the
        probe range). Any other failure raises through ``_check``.
        """
        value = ctypes.c_int(0)
        status = self._lib.cuDeviceGetAttribute(ctypes.byref(value), attr_id, device.handle)
        if status != CUDA_ERROR_INVALID_VALUE:
            self._check("cuDeviceGetAttribute", status)
            return value.value
        return None
|
kernelmeter/occupancy.py
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
"""Theoretical occupancy from block size, registers and shared memory.
|
|
2
|
+
|
|
3
|
+
This reimplements the model behind NVIDIA's old occupancy calculator
|
|
4
|
+
spreadsheet. It is a model, not a measurement: the real limiter can also
|
|
5
|
+
be launch bounds or the driver, but for the everyday question ("why is my
|
|
6
|
+
occupancy 50% and what do I change?") the model is what you want.
|
|
7
|
+
|
|
8
|
+
Registers are allocated per warp in units of 256. Shared memory is
|
|
9
|
+
allocated per block in architecture-specific units, and on Ampere and
|
|
10
|
+
newer the runtime reserves about 1 KiB per block on top of what you ask
|
|
11
|
+
for.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from dataclasses import dataclass
|
|
17
|
+
|
|
18
|
+
# compute() rounds a warp's register demand (regs per thread * 32) up to a
# multiple of this unit.
REG_ALLOC_UNIT = 256  # registers, rounded up per warp
|
|
19
|
+
|
|
20
|
+
# Per-SM limits by compute capability. Values from the CUDA C programming
# guide's compute-capability table.
ARCH_LIMITS: dict[tuple[int, int], dict] = {
    (7, 0): {
        "max_warps": 64, "max_blocks": 32, "regs": 65536,
        "smem": 98304, "smem_gran": 256, "reserved_smem": 0,
    },
    (7, 5): {
        "max_warps": 32, "max_blocks": 16, "regs": 65536,
        "smem": 65536, "smem_gran": 256, "reserved_smem": 0,
    },
    (8, 0): {
        "max_warps": 64, "max_blocks": 32, "regs": 65536,
        "smem": 167936, "smem_gran": 128, "reserved_smem": 1024,
    },
    (8, 6): {
        "max_warps": 48, "max_blocks": 16, "regs": 65536,
        "smem": 102400, "smem_gran": 128, "reserved_smem": 1024,
    },
    (8, 7): {
        "max_warps": 48, "max_blocks": 16, "regs": 65536,
        "smem": 167936, "smem_gran": 128, "reserved_smem": 1024,
    },
    (8, 9): {
        "max_warps": 48, "max_blocks": 24, "regs": 65536,
        "smem": 102400, "smem_gran": 128, "reserved_smem": 1024,
    },
    (9, 0): {
        "max_warps": 64, "max_blocks": 32, "regs": 65536,
        "smem": 233472, "smem_gran": 128, "reserved_smem": 1024,
    },
    (12, 0): {
        "max_warps": 48, "max_blocks": 24, "regs": 65536,
        "smem": 102400, "smem_gran": 128, "reserved_smem": 1024,
    },
}


def limits_for_cc(major: int, minor: int) -> dict:
    """Return the ``ARCH_LIMITS`` entry for compute capability ``major.minor``.

    An unlisted capability uses the entry of the closest older listed one.
    The returned dict is the table entry itself, not a copy.

    Raises:
        ValueError: the capability is older than every table entry.
    """
    wanted = (major, minor)
    try:
        return ARCH_LIMITS[wanted]
    except KeyError:
        pass

    not_newer = [cc for cc in ARCH_LIMITS if cc <= wanted]
    if not_newer:
        return ARCH_LIMITS[max(not_newer)]
    raise ValueError(f"no occupancy data for compute capability {major}.{minor}")
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def limits_from_attrs(attrs: dict[str, int]) -> dict:
    """Build the limits dict from live device attributes.

    Values the driver reports take precedence; anything missing comes from
    the arch table entry for the device's compute capability. The
    shared-memory allocation granularity always comes from the table.
    """
    major = attrs["compute_capability_major"]
    minor = attrs["compute_capability_minor"]
    table = limits_for_cc(major, minor)

    # The driver reports threads per SM; the model works in warps of 32.
    threads_per_sm = attrs.get("max_threads_per_multiprocessor", table["max_warps"] * 32)
    return {
        "max_warps": threads_per_sm // 32,
        "max_blocks": attrs.get("max_blocks_per_multiprocessor", table["max_blocks"]),
        "regs": attrs.get("max_registers_per_multiprocessor", table["regs"]),
        "smem": attrs.get("max_shared_memory_per_multiprocessor", table["smem"]),
        "smem_gran": table["smem_gran"],
        "reserved_smem": attrs.get("reserved_shared_memory_per_block", table["reserved_smem"]),
    }
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _ceil_to(value: int, unit: int) -> int:
    """Round ``value`` up to the next multiple of ``unit``."""
    units_needed = (value + unit - 1) // unit
    return units_needed * unit
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
@dataclass
class Occupancy:
    """Result of the occupancy model for one launch configuration."""

    blocks_per_sm: int
    active_warps: int
    max_warps: int
    pct: float
    limited_by: list[str]

    def as_dict(self) -> dict:
        """Return a shallow copy of the fields as a plain dict."""
        return vars(self).copy()
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def compute(block: int, regs_per_thread: int, smem_per_block: int, limits: dict) -> Occupancy:
    """Model how many blocks of a kernel fit on one SM.

    Each resource yields its own cap on resident blocks; the smallest cap
    wins and every resource that hits it is listed in ``limited_by``.
    A ``regs_per_thread`` or ``smem_per_block`` of 0 means "not a
    constraint" and leaves that resource out of the comparison.

    Args:
        block: threads per block, 1..1024.
        regs_per_thread: registers per thread, >= 0.
        smem_per_block: requested shared memory per block in bytes, >= 0.
        limits: per-SM limits as built by ``limits_for_cc`` or
            ``limits_from_attrs``.

    Raises:
        ValueError: ``block`` is out of range, or a resource amount is
            negative (a negative amount would otherwise divide by zero or
            produce a negative block count).
    """
    if block < 1 or block > 1024:
        raise ValueError("block size must be 1..1024")
    if regs_per_thread < 0:
        raise ValueError("registers per thread must not be negative")
    if smem_per_block < 0:
        raise ValueError("shared memory per block must not be negative")
    warps_per_block = (block + 31) // 32

    by = {"blocks": limits["max_blocks"], "threads": limits["max_warps"] // warps_per_block}
    if regs_per_thread:
        # Registers are handed out per warp, rounded up to the allocation unit.
        regs_per_warp = _ceil_to(regs_per_thread * 32, REG_ALLOC_UNIT)
        by["registers"] = (limits["regs"] // regs_per_warp) // warps_per_block
    # The per-block reservation only applies when the kernel asks for
    # shared memory at all.
    smem_total = smem_per_block + limits["reserved_smem"] if smem_per_block else 0
    if smem_total:
        by["shared memory"] = limits["smem"] // _ceil_to(smem_total, limits["smem_gran"])

    blocks = min(by.values())
    warps = blocks * warps_per_block
    return Occupancy(
        blocks_per_sm=blocks,
        active_warps=warps,
        max_warps=limits["max_warps"],
        pct=100.0 * warps / limits["max_warps"],
        limited_by=[k for k, v in by.items() if v == blocks],
    )
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def block_size_sweep(
    regs_per_thread: int, smem_per_block: int, limits: dict,
    sizes: tuple = (64, 128, 192, 256, 384, 512, 768, 1024),
) -> list[tuple[int, float]]:
    """Return ``(block size, occupancy %)`` for each size in ``sizes``."""
    sweep = []
    for size in sizes:
        result = compute(size, regs_per_thread, smem_per_block, limits)
        sweep.append((size, result.pct))
    return sweep
|
kernelmeter/peaks.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"""Derive theoretical peak throughput ("speed of light") from device attributes.
|
|
2
|
+
|
|
3
|
+
These numbers are upper bounds computed from the max boost clock the driver
|
|
4
|
+
reports; sustained clocks under load are usually lower, so treat a kernel
|
|
5
|
+
at 85%+ of these peaks as effectively saturating the machine.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
|
|
12
|
+
# FP32 CUDA cores per SM, keyed by compute capability. The fallback for an
# unknown capability is the value of the closest older architecture.
_FP32_CORES_PER_SM: dict[tuple[int, int], int] = {
    (3, 0): 192,
    (3, 5): 192,
    (3, 7): 192,
    (5, 0): 128,
    (5, 2): 128,
    (5, 3): 128,
    (6, 0): 64,
    (6, 1): 128,
    (6, 2): 128,
    (7, 0): 64,
    (7, 2): 64,
    (7, 5): 64,
    (8, 0): 64,
    (8, 6): 128,
    (8, 7): 128,
    (8, 9): 128,
    (9, 0): 128,
    (10, 0): 128,
    (10, 3): 128,
    (12, 0): 128,
    (12, 1): 128,
}


def fp32_cores_per_sm(major: int, minor: int) -> int:
    """Return the FP32 core count per SM for ``major.minor``.

    Unlisted capabilities use the closest older listed one; anything older
    than the whole table yields 64.
    """
    wanted = (major, minor)
    exact = _FP32_CORES_PER_SM.get(wanted)
    if exact is not None:
        return exact

    closest = max((cc for cc in _FP32_CORES_PER_SM if cc <= wanted), default=None)
    if closest is None:
        return 64
    return _FP32_CORES_PER_SM[closest]
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass
class Peaks:
    """Theoretical per-device ceilings derived from driver attributes."""

    mem_bandwidth_gbs: float | None
    fp32_tflops: float | None
    compute_capability: tuple[int, int] | None

    def as_dict(self) -> dict:
        """Return the peaks under their report keys.

        The compute capability is rendered as a ``"major.minor"`` string,
        or ``None`` when it is unknown.
        """
        cc = self.compute_capability
        cc_text = f"{cc[0]}.{cc[1]}" if cc else None
        return {
            "theoretical_mem_bandwidth_gb_s": self.mem_bandwidth_gbs,
            "theoretical_fp32_tflops": self.fp32_tflops,
            "compute_capability": cc_text,
        }
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def mem_bandwidth_gbs(memory_clock_khz: int, bus_width_bits: int) -> float:
    """Return the theoretical memory bandwidth in GB/s.

    DDR: two transfers per clock, so the result is
    clock(kHz) * 1e3 * width(bytes) * 2 / 1e9.
    """
    bytes_per_transfer = bus_width_bits / 8.0
    transfers_per_second = 2.0 * memory_clock_khz * 1e3
    return transfers_per_second * bytes_per_transfer / 1e9
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def fp32_tflops(sm_count: int, clock_khz: int, major: int, minor: int) -> float:
    """Return the theoretical FP32 throughput in TFLOP/s.

    Counts one FMA (2 FLOPs) per core per clock, with the core count per
    SM looked up from the compute capability.
    """
    cores_per_sm = fp32_cores_per_sm(major, minor)
    flops_per_second = 2.0 * sm_count * cores_per_sm * clock_khz * 1e3
    return flops_per_second / 1e12
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def derive(attrs: dict[str, int]) -> Peaks:
    """Compute peaks from a query_all() attribute dict, tolerating gaps.

    Each peak is ``None`` unless every attribute it needs is present. The
    FP32 peak also requires the compute capability.
    """
    bandwidth_keys = ("memory_clock_rate_khz", "global_memory_bus_width_bits")
    cc_keys = ("compute_capability_major", "compute_capability_minor")
    compute_keys = ("multiprocessor_count", "clock_rate_khz")

    bandwidth = None
    if all(key in attrs for key in bandwidth_keys):
        mem_clock, bus_width = (attrs[key] for key in bandwidth_keys)
        bandwidth = mem_bandwidth_gbs(mem_clock, bus_width)

    capability = None
    tflops = None
    if all(key in attrs for key in cc_keys):
        capability = (attrs[cc_keys[0]], attrs[cc_keys[1]])
        if all(key in attrs for key in compute_keys):
            sm_count, core_clock = (attrs[key] for key in compute_keys)
            tflops = fp32_tflops(sm_count, core_clock, capability[0], capability[1])

    return Peaks(
        mem_bandwidth_gbs=bandwidth,
        fp32_tflops=tflops,
        compute_capability=capability,
    )
|