kernelmeter 0.2.0__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kernelmeter-0.2.0/src/kernelmeter.egg-info → kernelmeter-0.3.0}/PKG-INFO +56 -11
- {kernelmeter-0.2.0 → kernelmeter-0.3.0}/README.md +55 -10
- {kernelmeter-0.2.0 → kernelmeter-0.3.0}/pyproject.toml +1 -1
- {kernelmeter-0.2.0 → kernelmeter-0.3.0}/src/kernelmeter/__init__.py +1 -1
- {kernelmeter-0.2.0 → kernelmeter-0.3.0}/src/kernelmeter/attrs.py +22 -0
- {kernelmeter-0.2.0 → kernelmeter-0.3.0}/src/kernelmeter/bench.py +50 -0
- {kernelmeter-0.2.0 → kernelmeter-0.3.0}/src/kernelmeter/cli.py +68 -2
- kernelmeter-0.3.0/src/kernelmeter/nvml.py +172 -0
- {kernelmeter-0.2.0 → kernelmeter-0.3.0}/src/kernelmeter/peaks.py +52 -6
- {kernelmeter-0.2.0 → kernelmeter-0.3.0/src/kernelmeter.egg-info}/PKG-INFO +56 -11
- {kernelmeter-0.2.0 → kernelmeter-0.3.0}/src/kernelmeter.egg-info/SOURCES.txt +4 -1
- {kernelmeter-0.2.0 → kernelmeter-0.3.0}/tests/test_attrs.py +7 -0
- {kernelmeter-0.2.0 → kernelmeter-0.3.0}/tests/test_cli_new_commands.py +7 -0
- kernelmeter-0.3.0/tests/test_nvml.py +91 -0
- kernelmeter-0.3.0/tests/test_tensor_peaks.py +50 -0
- {kernelmeter-0.2.0 → kernelmeter-0.3.0}/LICENSE +0 -0
- {kernelmeter-0.2.0 → kernelmeter-0.3.0}/setup.cfg +0 -0
- {kernelmeter-0.2.0 → kernelmeter-0.3.0}/src/kernelmeter/ceiling.py +0 -0
- {kernelmeter-0.2.0 → kernelmeter-0.3.0}/src/kernelmeter/cudadrv.py +0 -0
- {kernelmeter-0.2.0 → kernelmeter-0.3.0}/src/kernelmeter/occupancy.py +0 -0
- {kernelmeter-0.2.0 → kernelmeter-0.3.0}/src/kernelmeter/roofline.py +0 -0
- {kernelmeter-0.2.0 → kernelmeter-0.3.0}/src/kernelmeter.egg-info/dependency_links.txt +0 -0
- {kernelmeter-0.2.0 → kernelmeter-0.3.0}/src/kernelmeter.egg-info/entry_points.txt +0 -0
- {kernelmeter-0.2.0 → kernelmeter-0.3.0}/src/kernelmeter.egg-info/requires.txt +0 -0
- {kernelmeter-0.2.0 → kernelmeter-0.3.0}/src/kernelmeter.egg-info/top_level.txt +0 -0
- {kernelmeter-0.2.0 → kernelmeter-0.3.0}/tests/test_bench_math.py +0 -0
- {kernelmeter-0.2.0 → kernelmeter-0.3.0}/tests/test_bench_roofline.py +0 -0
- {kernelmeter-0.2.0 → kernelmeter-0.3.0}/tests/test_cli.py +0 -0
- {kernelmeter-0.2.0 → kernelmeter-0.3.0}/tests/test_occupancy.py +0 -0
- {kernelmeter-0.2.0 → kernelmeter-0.3.0}/tests/test_peaks.py +0 -0
- {kernelmeter-0.2.0 → kernelmeter-0.3.0}/tests/test_roofline.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: kernelmeter
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Query every CUDA device attribute without profiling a kernel, and benchmark your kernels against the hardware's speed of light.
|
|
5
5
|
Author: nuemaan
|
|
6
6
|
License: MIT
|
|
@@ -24,6 +24,11 @@ Dynamic: license-file
|
|
|
24
24
|
|
|
25
25
|
# kernelmeter
|
|
26
26
|
|
|
27
|
+
[](https://pypi.org/project/kernelmeter/)
|
|
28
|
+
[](https://github.com/nuemaan/kernelmeter/actions/workflows/ci.yml)
|
|
29
|
+
[](https://pypi.org/project/kernelmeter/)
|
|
30
|
+
[](LICENSE)
|
|
31
|
+
|
|
27
32
|
Small tools for one question: **is my GPU kernel actually good, and if
|
|
28
33
|
not, what exactly is holding it back?** All in one package with zero
|
|
29
34
|
required dependencies.
|
|
@@ -36,7 +41,9 @@ required dependencies.
|
|
|
36
41
|
reference, and scores it against the roofline: the best your card could
|
|
37
42
|
possibly do for that kernel's mix of math and memory traffic. 240 GB/s
|
|
38
43
|
means nothing on its own; "76% of attainable" tells you how much room
|
|
39
|
-
is left.
|
|
44
|
+
is left. While the kernel runs it also samples the real clocks, power
|
|
45
|
+
and temperature through NVML, and re-scores against the ceiling the
|
|
46
|
+
card actually held.
|
|
40
47
|
* `kernelmeter roofline` draws your card's roofline in the terminal and
|
|
41
48
|
shows where a kernel sits on it.
|
|
42
49
|
* `kernelmeter occupancy` answers "why is my occupancy 50%?" from block
|
|
@@ -137,7 +144,7 @@ kernelmeter bench mybench.py
|
|
|
137
144
|
```text
|
|
138
145
|
kernel median ms GB/s TFLOP/s bound %roof vs ref correct
|
|
139
146
|
------------------------------------------------------------------------------------------
|
|
140
|
-
my_add 3.3393 241.2 - mem 75.3% 1.01x
|
|
147
|
+
my_add 3.2725 246.1 - mem 76.9% 1.03x PASS
|
|
141
148
|
```
|
|
142
149
|
|
|
143
150
|
* **correct** - your output matched the reference. If this says FAIL,
|
|
@@ -151,8 +158,39 @@ my_add 3.3393 241.2 - mem 75.3% 1.01x
|
|
|
151
158
|
|
|
152
159
|
Pass `flops_per_call` too and the roofline model places your kernel
|
|
153
160
|
precisely; pass `peak_tflops=...` if your kernel runs on tensor cores so
|
|
154
|
-
it gets judged against the right ceiling
|
|
155
|
-
|
|
161
|
+
it gets judged against the right ceiling (`kernelmeter info` prints the
|
|
162
|
+
derived fp16/tf32 tensor peaks for your card). Raw `%peak bw` and
|
|
163
|
+
`%fp32` numbers are always in the `--json` output.
|
|
164
|
+
|
|
165
|
+
When NVML is available (it ships with the driver) a second table follows
|
|
166
|
+
with what the card was doing during each measurement:
|
|
167
|
+
|
|
168
|
+
```text
|
|
169
|
+
telemetry sm MHz mem MHz temp power %roof@clk
|
|
170
|
+
-----------------------------------------------------------------------
|
|
171
|
+
my_add 1062/1590 5000 42C 53.1W 76.9%
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
`%roof@clk` is the same roofline score, but against the ceiling at the
|
|
175
|
+
clocks the card actually held. If `%roof` looks bad but `%roof@clk` is
|
|
176
|
+
high, your kernel is fine: the card is thermal or power limited, and no
|
|
177
|
+
amount of kernel work will change that. A real example, cuBLAS fp32
|
|
178
|
+
matmul on a 70 W T4:
|
|
179
|
+
|
|
180
|
+
```text
|
|
181
|
+
kernel median ms GB/s TFLOP/s bound %roof vs ref correct
|
|
182
|
+
----------------------------------------------------------------------------------------
|
|
183
|
+
fp32_matmul 32.0354 6.3 4.29 comp 52.7% - -
|
|
184
|
+
|
|
185
|
+
telemetry sm MHz mem MHz temp power %roof@clk
|
|
186
|
+
-----------------------------------------------------------------------
|
|
187
|
+
fp32_matmul 877/1590 5000 46C 70.4W 95.5%
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
53% of peak looks like a kernel problem. The telemetry shows it is not:
|
|
191
|
+
the card hit its 70 W power limit and dropped to 877 MHz, and at those
|
|
192
|
+
clocks the kernel was at 95.5% of what the silicon could deliver. cuBLAS
|
|
193
|
+
was never the problem.
|
|
156
194
|
|
|
157
195
|
Timing uses CUDA events with warmup, and the L2 cache is flushed between
|
|
158
196
|
iterations so small workloads can't fake huge bandwidth numbers from
|
|
@@ -194,6 +232,9 @@ The `o` is your kernel, the `x` is the ridge point. Left of the ridge,
|
|
|
194
232
|
more FLOPs are free: the memory traffic is the bill you are paying
|
|
195
233
|
anyway. That is the whole argument for kernel fusion, in one picture.
|
|
196
234
|
No GPU around? `--peak-bw` and `--peak-tflops` let you draw any card.
|
|
235
|
+
`--tensor` swaps in the fp16 tensor-core roof, which moves the ridge
|
|
236
|
+
point far to the left; that picture explains why tensor-core kernels
|
|
237
|
+
are almost always memory-bound.
|
|
197
238
|
|
|
198
239
|
## Why is my occupancy low?
|
|
199
240
|
|
|
@@ -280,15 +321,15 @@ whether your kernels are any good:
|
|
|
280
321
|
## Caveats
|
|
281
322
|
|
|
282
323
|
* Theoretical peaks are computed from the max boost clock the driver
|
|
283
|
-
reports. Sustained clocks under load are lower;
|
|
284
|
-
|
|
285
|
-
* The
|
|
286
|
-
tensor
|
|
287
|
-
|
|
324
|
+
reports. Sustained clocks under load are lower; the telemetry table
|
|
325
|
+
and `kernelmeter ceiling` both show what you can actually reach.
|
|
326
|
+
* The tensor-core peaks are dense rates with fp16 accumulate. GeForce
|
|
327
|
+
cards run tensor cores at half rate when accumulating in fp32, and
|
|
328
|
+
sparse rates are double; pass `peak_tflops=...` when those apply.
|
|
288
329
|
* The occupancy command implements the standard calculator model. Real
|
|
289
330
|
occupancy can differ (launch bounds, driver decisions); confirm with
|
|
290
331
|
Nsight Compute when it matters.
|
|
291
|
-
* Attribute names above id
|
|
332
|
+
* Attribute names above id 143 are best-effort against the CUDA 12.x
|
|
292
333
|
headers. Values are always read live from your driver. PRs that extend
|
|
293
334
|
the name table are welcome.
|
|
294
335
|
|
|
@@ -304,6 +345,10 @@ them on plain GitHub runners. For an end-to-end check on a real GPU there
|
|
|
304
345
|
is a [Modal](https://modal.com) script: `modal run scripts/modal_gpu_test.py`.
|
|
305
346
|
The numbers in this README come from that script on a T4.
|
|
306
347
|
|
|
348
|
+
Releases are tag-driven: bump the version in `pyproject.toml`, add a
|
|
349
|
+
[CHANGELOG.md](CHANGELOG.md) entry, push a `v*` tag. CI tests, builds and
|
|
350
|
+
publishes to PyPI through trusted publishing.
|
|
351
|
+
|
|
307
352
|
## License
|
|
308
353
|
|
|
309
354
|
MIT
|
|
@@ -1,5 +1,10 @@
|
|
|
1
1
|
# kernelmeter
|
|
2
2
|
|
|
3
|
+
[](https://pypi.org/project/kernelmeter/)
|
|
4
|
+
[](https://github.com/nuemaan/kernelmeter/actions/workflows/ci.yml)
|
|
5
|
+
[](https://pypi.org/project/kernelmeter/)
|
|
6
|
+
[](LICENSE)
|
|
7
|
+
|
|
3
8
|
Small tools for one question: **is my GPU kernel actually good, and if
|
|
4
9
|
not, what exactly is holding it back?** All in one package with zero
|
|
5
10
|
required dependencies.
|
|
@@ -12,7 +17,9 @@ required dependencies.
|
|
|
12
17
|
reference, and scores it against the roofline: the best your card could
|
|
13
18
|
possibly do for that kernel's mix of math and memory traffic. 240 GB/s
|
|
14
19
|
means nothing on its own; "76% of attainable" tells you how much room
|
|
15
|
-
is left.
|
|
20
|
+
is left. While the kernel runs it also samples the real clocks, power
|
|
21
|
+
and temperature through NVML, and re-scores against the ceiling the
|
|
22
|
+
card actually held.
|
|
16
23
|
* `kernelmeter roofline` draws your card's roofline in the terminal and
|
|
17
24
|
shows where a kernel sits on it.
|
|
18
25
|
* `kernelmeter occupancy` answers "why is my occupancy 50%?" from block
|
|
@@ -113,7 +120,7 @@ kernelmeter bench mybench.py
|
|
|
113
120
|
```text
|
|
114
121
|
kernel median ms GB/s TFLOP/s bound %roof vs ref correct
|
|
115
122
|
------------------------------------------------------------------------------------------
|
|
116
|
-
my_add 3.3393 241.2 - mem 75.3% 1.01x
|
|
123
|
+
my_add 3.2725 246.1 - mem 76.9% 1.03x PASS
|
|
117
124
|
```
|
|
118
125
|
|
|
119
126
|
* **correct** - your output matched the reference. If this says FAIL,
|
|
@@ -127,8 +134,39 @@ my_add 3.3393 241.2 - mem 75.3% 1.01x
|
|
|
127
134
|
|
|
128
135
|
Pass `flops_per_call` too and the roofline model places your kernel
|
|
129
136
|
precisely; pass `peak_tflops=...` if your kernel runs on tensor cores so
|
|
130
|
-
it gets judged against the right ceiling
|
|
131
|
-
|
|
137
|
+
it gets judged against the right ceiling (`kernelmeter info` prints the
|
|
138
|
+
derived fp16/tf32 tensor peaks for your card). Raw `%peak bw` and
|
|
139
|
+
`%fp32` numbers are always in the `--json` output.
|
|
140
|
+
|
|
141
|
+
When NVML is available (it ships with the driver) a second table follows
|
|
142
|
+
with what the card was doing during each measurement:
|
|
143
|
+
|
|
144
|
+
```text
|
|
145
|
+
telemetry sm MHz mem MHz temp power %roof@clk
|
|
146
|
+
-----------------------------------------------------------------------
|
|
147
|
+
my_add 1062/1590 5000 42C 53.1W 76.9%
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
`%roof@clk` is the same roofline score, but against the ceiling at the
|
|
151
|
+
clocks the card actually held. If `%roof` looks bad but `%roof@clk` is
|
|
152
|
+
high, your kernel is fine: the card is thermal or power limited, and no
|
|
153
|
+
amount of kernel work will change that. A real example, cuBLAS fp32
|
|
154
|
+
matmul on a 70 W T4:
|
|
155
|
+
|
|
156
|
+
```text
|
|
157
|
+
kernel median ms GB/s TFLOP/s bound %roof vs ref correct
|
|
158
|
+
----------------------------------------------------------------------------------------
|
|
159
|
+
fp32_matmul 32.0354 6.3 4.29 comp 52.7% - -
|
|
160
|
+
|
|
161
|
+
telemetry sm MHz mem MHz temp power %roof@clk
|
|
162
|
+
-----------------------------------------------------------------------
|
|
163
|
+
fp32_matmul 877/1590 5000 46C 70.4W 95.5%
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
53% of peak looks like a kernel problem. The telemetry shows it is not:
|
|
167
|
+
the card hit its 70 W power limit and dropped to 877 MHz, and at those
|
|
168
|
+
clocks the kernel was at 95.5% of what the silicon could deliver. cuBLAS
|
|
169
|
+
was never the problem.
|
|
132
170
|
|
|
133
171
|
Timing uses CUDA events with warmup, and the L2 cache is flushed between
|
|
134
172
|
iterations so small workloads can't fake huge bandwidth numbers from
|
|
@@ -170,6 +208,9 @@ The `o` is your kernel, the `x` is the ridge point. Left of the ridge,
|
|
|
170
208
|
more FLOPs are free: the memory traffic is the bill you are paying
|
|
171
209
|
anyway. That is the whole argument for kernel fusion, in one picture.
|
|
172
210
|
No GPU around? `--peak-bw` and `--peak-tflops` let you draw any card.
|
|
211
|
+
`--tensor` swaps in the fp16 tensor-core roof, which moves the ridge
|
|
212
|
+
point far to the left; that picture explains why tensor-core kernels
|
|
213
|
+
are almost always memory-bound.
|
|
173
214
|
|
|
174
215
|
## Why is my occupancy low?
|
|
175
216
|
|
|
@@ -256,15 +297,15 @@ whether your kernels are any good:
|
|
|
256
297
|
## Caveats
|
|
257
298
|
|
|
258
299
|
* Theoretical peaks are computed from the max boost clock the driver
|
|
259
|
-
reports. Sustained clocks under load are lower;
|
|
260
|
-
|
|
261
|
-
* The
|
|
262
|
-
tensor
|
|
263
|
-
|
|
300
|
+
reports. Sustained clocks under load are lower; the telemetry table
|
|
301
|
+
and `kernelmeter ceiling` both show what you can actually reach.
|
|
302
|
+
* The tensor-core peaks are dense rates with fp16 accumulate. GeForce
|
|
303
|
+
cards run tensor cores at half rate when accumulating in fp32, and
|
|
304
|
+
sparse rates are double; pass `peak_tflops=...` when those apply.
|
|
264
305
|
* The occupancy command implements the standard calculator model. Real
|
|
265
306
|
occupancy can differ (launch bounds, driver decisions); confirm with
|
|
266
307
|
Nsight Compute when it matters.
|
|
267
|
-
* Attribute names above id
|
|
308
|
+
* Attribute names above id 143 are best-effort against the CUDA 12.x
|
|
268
309
|
headers. Values are always read live from your driver. PRs that extend
|
|
269
310
|
the name table are welcome.
|
|
270
311
|
|
|
@@ -280,6 +321,10 @@ them on plain GitHub runners. For an end-to-end check on a real GPU there
|
|
|
280
321
|
is a [Modal](https://modal.com) script: `modal run scripts/modal_gpu_test.py`.
|
|
281
322
|
The numbers in this README come from that script on a T4.
|
|
282
323
|
|
|
324
|
+
Releases are tag-driven: bump the version in `pyproject.toml`, add a
|
|
325
|
+
[CHANGELOG.md](CHANGELOG.md) entry, push a `v*` tag. CI tests, builds and
|
|
326
|
+
publishes to PyPI through trusted publishing.
|
|
327
|
+
|
|
283
328
|
## License
|
|
284
329
|
|
|
285
330
|
MIT
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "kernelmeter"
|
|
7
|
-
version = "0.2.0"
|
|
7
|
+
version = "0.3.0"
|
|
8
8
|
description = "Query every CUDA device attribute without profiling a kernel, and benchmark your kernels against the hardware's speed of light."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = { text = "MIT" }
|
|
@@ -132,6 +132,28 @@ KNOWN_ATTRS: dict[int, str] = {
|
|
|
132
132
|
119: "mempool_supported_handle_types",
|
|
133
133
|
120: "cluster_launch",
|
|
134
134
|
121: "deferred_mapping_cuda_array_supported",
|
|
135
|
+
122: "can_use_64_bit_stream_mem_ops",
|
|
136
|
+
123: "can_use_stream_wait_value_nor",
|
|
137
|
+
124: "dma_buf_supported",
|
|
138
|
+
125: "ipc_event_supported",
|
|
139
|
+
126: "mem_sync_domain_count",
|
|
140
|
+
127: "tensor_map_access_supported",
|
|
141
|
+
128: "handle_type_fabric_supported",
|
|
142
|
+
129: "unified_function_pointers",
|
|
143
|
+
130: "numa_config",
|
|
144
|
+
131: "numa_id",
|
|
145
|
+
132: "multicast_supported",
|
|
146
|
+
133: "mps_enabled",
|
|
147
|
+
134: "host_numa_id",
|
|
148
|
+
135: "d3d12_cig_supported",
|
|
149
|
+
136: "mem_decompress_algorithm_mask",
|
|
150
|
+
137: "mem_decompress_maximum_length",
|
|
151
|
+
138: "vulkan_cig_supported",
|
|
152
|
+
139: "gpu_pci_device_id",
|
|
153
|
+
140: "gpu_pci_subsystem_id",
|
|
154
|
+
141: "host_numa_virtual_memory_management_supported",
|
|
155
|
+
142: "host_numa_memory_pools_supported",
|
|
156
|
+
143: "host_numa_multinode_ipc_supported",
|
|
135
157
|
}
|
|
136
158
|
|
|
137
159
|
|
|
@@ -46,8 +46,14 @@ class BenchResult:
|
|
|
46
46
|
intensity: float | None = None
|
|
47
47
|
bound: str | None = None
|
|
48
48
|
pct_roofline: float | None = None
|
|
49
|
+
pct_roof_sustained: float | None = None
|
|
49
50
|
pct_peak_bw: float | None = None
|
|
50
51
|
pct_peak_fp32: float | None = None
|
|
52
|
+
sm_clock_mhz: float | None = None
|
|
53
|
+
max_sm_clock_mhz: int | None = None
|
|
54
|
+
mem_clock_mhz: float | None = None
|
|
55
|
+
temperature_c: int | None = None
|
|
56
|
+
power_w: float | None = None
|
|
51
57
|
ref_ms_median: float | None = None
|
|
52
58
|
speedup_vs_ref: float | None = None
|
|
53
59
|
correct: bool | None = None
|
|
@@ -159,6 +165,21 @@ def roofline_score(
|
|
|
159
165
|
return None, None, None
|
|
160
166
|
|
|
161
167
|
|
|
168
|
+
def sustained_peaks(peaks: _peaks.Peaks, telemetry, peak_tflops_override: float | None = None) -> _peaks.Peaks:
|
|
169
|
+
"""Scale the theoretical peaks down to the clocks the card actually
|
|
170
|
+
held while the kernel ran."""
|
|
171
|
+
tf = peak_tflops_override or peaks.fp32_tflops
|
|
172
|
+
return _peaks.Peaks(
|
|
173
|
+
mem_bandwidth_gbs=(
|
|
174
|
+
peaks.mem_bandwidth_gbs * telemetry.mem_clock_fraction
|
|
175
|
+
if peaks.mem_bandwidth_gbs
|
|
176
|
+
else None
|
|
177
|
+
),
|
|
178
|
+
fp32_tflops=tf * telemetry.sm_clock_fraction if tf else None,
|
|
179
|
+
compute_capability=peaks.compute_capability,
|
|
180
|
+
)
|
|
181
|
+
|
|
182
|
+
|
|
162
183
|
def diff_results(baseline: list[dict], results: list["BenchResult"], threshold_pct: float = 5.0):
|
|
163
184
|
"""Compare a run against a saved baseline. Returns (rows, regressions)
|
|
164
185
|
where rows are (name, old_ms, new_ms, delta_pct) and regressions lists
|
|
@@ -238,7 +259,25 @@ def run(spec: BenchSpec, peaks: _peaks.Peaks | None = None, flush_l2: bool = Tru
|
|
|
238
259
|
if spec.ref is not None:
|
|
239
260
|
correct, max_err = _check_correctness(spec, args)
|
|
240
261
|
|
|
262
|
+
monitor = None
|
|
263
|
+
try:
|
|
264
|
+
from . import nvml as _nvml
|
|
265
|
+
|
|
266
|
+
monitor = _nvml.Monitor()
|
|
267
|
+
monitor.start()
|
|
268
|
+
except Exception:
|
|
269
|
+
monitor = None
|
|
270
|
+
|
|
241
271
|
times = _time_fn(spec.fn, args, spec.warmup, spec.iters, flush_l2)
|
|
272
|
+
|
|
273
|
+
telemetry = None
|
|
274
|
+
if monitor is not None:
|
|
275
|
+
try:
|
|
276
|
+
telemetry = monitor.stop()
|
|
277
|
+
monitor.close()
|
|
278
|
+
except Exception:
|
|
279
|
+
telemetry = None
|
|
280
|
+
|
|
242
281
|
ms_mean, ms_median, ms_min = summarize_times(times)
|
|
243
282
|
|
|
244
283
|
nbytes = _resolve(spec.bytes_per_call, args)
|
|
@@ -249,6 +288,11 @@ def run(spec: BenchSpec, peaks: _peaks.Peaks | None = None, flush_l2: bool = Tru
|
|
|
249
288
|
nbytes, nflops, gbps, tflops, peaks, spec.peak_tflops
|
|
250
289
|
)
|
|
251
290
|
|
|
291
|
+
pct_sustained = None
|
|
292
|
+
if telemetry is not None:
|
|
293
|
+
scaled = sustained_peaks(peaks, telemetry, spec.peak_tflops)
|
|
294
|
+
pct_sustained = roofline_score(nbytes, nflops, gbps, tflops, scaled)[2]
|
|
295
|
+
|
|
252
296
|
ref_ms = speedup = None
|
|
253
297
|
if spec.ref is not None:
|
|
254
298
|
ref_times = _time_fn(spec.ref, args, spec.warmup, spec.iters, flush_l2)
|
|
@@ -265,8 +309,14 @@ def run(spec: BenchSpec, peaks: _peaks.Peaks | None = None, flush_l2: bool = Tru
|
|
|
265
309
|
intensity=ai,
|
|
266
310
|
bound=kernel_bound,
|
|
267
311
|
pct_roofline=pct_roof,
|
|
312
|
+
pct_roof_sustained=pct_sustained,
|
|
268
313
|
pct_peak_bw=pct_of_peak(gbps, peaks.mem_bandwidth_gbs) if gbps else None,
|
|
269
314
|
pct_peak_fp32=pct_of_peak(tflops, peaks.fp32_tflops) if tflops else None,
|
|
315
|
+
sm_clock_mhz=telemetry.sm_clock_mhz if telemetry else None,
|
|
316
|
+
max_sm_clock_mhz=telemetry.max_sm_clock_mhz if telemetry else None,
|
|
317
|
+
mem_clock_mhz=telemetry.mem_clock_mhz if telemetry else None,
|
|
318
|
+
temperature_c=telemetry.temperature_c if telemetry else None,
|
|
319
|
+
power_w=telemetry.power_w if telemetry else None,
|
|
270
320
|
ref_ms_median=ref_ms,
|
|
271
321
|
speedup_vs_ref=speedup,
|
|
272
322
|
correct=correct,
|
|
@@ -30,6 +30,28 @@ def _device_attrs(ordinal: int = 0) -> dict[str, int]:
|
|
|
30
30
|
return _attrs.query_all(driver, driver.device(ordinal))
|
|
31
31
|
|
|
32
32
|
|
|
33
|
+
def _print_live_telemetry(ordinal: int) -> None:
|
|
34
|
+
"""Current clocks/temp/power via NVML; quietly skipped when absent."""
|
|
35
|
+
try:
|
|
36
|
+
from . import nvml as _nvml
|
|
37
|
+
|
|
38
|
+
n = _nvml.Nvml()
|
|
39
|
+
except Exception:
|
|
40
|
+
return
|
|
41
|
+
try:
|
|
42
|
+
h = n.device(ordinal)
|
|
43
|
+
print(
|
|
44
|
+
f" live: sm {n.sm_clock_mhz(h)}/{n.max_sm_clock_mhz(h)} MHz, "
|
|
45
|
+
f"mem {n.mem_clock_mhz(h)}/{n.max_mem_clock_mhz(h)} MHz, "
|
|
46
|
+
f"{n.temperature_c(h)}C, "
|
|
47
|
+
f"{n.power_w(h):.1f}/{n.power_limit_w(h):.0f}W"
|
|
48
|
+
)
|
|
49
|
+
except Exception:
|
|
50
|
+
pass
|
|
51
|
+
finally:
|
|
52
|
+
n.close()
|
|
53
|
+
|
|
54
|
+
|
|
33
55
|
# ---------------------------------------------------------------------------
|
|
34
56
|
# info
|
|
35
57
|
# ---------------------------------------------------------------------------
|
|
@@ -79,6 +101,17 @@ def cmd_info(args: argparse.Namespace) -> int:
|
|
|
79
101
|
" theoretical FP32 peak : "
|
|
80
102
|
+ _fmt(derived["theoretical_fp32_tflops"], " TFLOP/s", nd=2)
|
|
81
103
|
)
|
|
104
|
+
if derived.get("theoretical_fp16_tensor_tflops"):
|
|
105
|
+
print(
|
|
106
|
+
" theoretical fp16 tensor : "
|
|
107
|
+
+ _fmt(derived["theoretical_fp16_tensor_tflops"], " TFLOP/s (dense)", nd=2)
|
|
108
|
+
)
|
|
109
|
+
if derived.get("theoretical_tf32_tensor_tflops"):
|
|
110
|
+
print(
|
|
111
|
+
" theoretical tf32 tensor : "
|
|
112
|
+
+ _fmt(derived["theoretical_tf32_tensor_tflops"], " TFLOP/s (dense)", nd=2)
|
|
113
|
+
)
|
|
114
|
+
_print_live_telemetry(dev["ordinal"])
|
|
82
115
|
print(f"\n {'attribute':<48} value")
|
|
83
116
|
print(f" {'-' * 48} {'-' * 12}")
|
|
84
117
|
for name, value in dev["attributes"].items():
|
|
@@ -137,6 +170,27 @@ def cmd_bench(args: argparse.Namespace) -> int:
|
|
|
137
170
|
f"{_fmt(r.pct_roofline, '%'):>7} {speedup:>8} {correct:>8}"
|
|
138
171
|
)
|
|
139
172
|
|
|
173
|
+
if any(r.sm_clock_mhz for r in results):
|
|
174
|
+
print()
|
|
175
|
+
theader = (
|
|
176
|
+
f"{'telemetry':<24} {'sm MHz':>10} {'mem MHz':>9} "
|
|
177
|
+
f"{'temp':>6} {'power':>7} {'%roof@clk':>10}"
|
|
178
|
+
)
|
|
179
|
+
print(theader)
|
|
180
|
+
print("-" * len(theader))
|
|
181
|
+
for r in results:
|
|
182
|
+
if not r.sm_clock_mhz:
|
|
183
|
+
continue
|
|
184
|
+
print(
|
|
185
|
+
f"{r.name:<24} {r.sm_clock_mhz:>5.0f}/{r.max_sm_clock_mhz:<4} "
|
|
186
|
+
f"{_fmt(r.mem_clock_mhz, nd=0):>9} {r.temperature_c or '-':>5}C "
|
|
187
|
+
f"{_fmt(r.power_w, 'W'):>7} {_fmt(r.pct_roof_sustained, '%'):>10}"
|
|
188
|
+
)
|
|
189
|
+
print(
|
|
190
|
+
"%roof@clk scores against the ceiling at the clocks the card "
|
|
191
|
+
"actually held during the run"
|
|
192
|
+
)
|
|
193
|
+
|
|
140
194
|
ok = all(r.error is None and r.correct in (None, True) for r in results)
|
|
141
195
|
|
|
142
196
|
if args.compare:
|
|
@@ -169,7 +223,17 @@ def cmd_roofline(args: argparse.Namespace) -> int:
|
|
|
169
223
|
name = dev.name
|
|
170
224
|
peaks = _peaks.derive(_attrs.query_all(driver, dev))
|
|
171
225
|
peak_bw = peak_bw or peaks.mem_bandwidth_gbs
|
|
172
|
-
|
|
226
|
+
if args.tensor:
|
|
227
|
+
if peaks.fp16_tensor_tflops is None:
|
|
228
|
+
print(
|
|
229
|
+
"error: no tensor-core rate known for this card; "
|
|
230
|
+
"pass --peak-tflops instead.",
|
|
231
|
+
file=sys.stderr,
|
|
232
|
+
)
|
|
233
|
+
return 1
|
|
234
|
+
peak_tf = peak_tf or peaks.fp16_tensor_tflops
|
|
235
|
+
else:
|
|
236
|
+
peak_tf = peak_tf or peaks.fp32_tflops
|
|
173
237
|
except CudaNotAvailableError:
|
|
174
238
|
pass
|
|
175
239
|
if not peak_bw or not peak_tf:
|
|
@@ -183,8 +247,9 @@ def cmd_roofline(args: argparse.Namespace) -> int:
|
|
|
183
247
|
ridge = _roofline.ridge_point(peak_tf, peak_bw)
|
|
184
248
|
if name:
|
|
185
249
|
print(f"Device {args.device}: {name}")
|
|
250
|
+
roof_kind = "fp16 tensor" if args.tensor else "fp32"
|
|
186
251
|
print(f" peak bandwidth : {peak_bw:.1f} GB/s")
|
|
187
|
-
print(f" peak compute : {peak_tf:.2f} TFLOP/s")
|
|
252
|
+
print(f" peak compute : {peak_tf:.2f} TFLOP/s ({roof_kind})")
|
|
188
253
|
print(f" ridge point : {ridge:.1f} flop/byte\n")
|
|
189
254
|
for line in _roofline.render(peak_tf, peak_bw, ai=args.ai):
|
|
190
255
|
print(line)
|
|
@@ -278,6 +343,7 @@ def main(argv: list[str] | None = None) -> int:
|
|
|
278
343
|
p_roof = sub.add_parser("roofline", help="draw the device roofline")
|
|
279
344
|
p_roof.add_argument("--ai", type=float, help="mark a kernel at this arithmetic intensity")
|
|
280
345
|
p_roof.add_argument("--device", type=int, default=0)
|
|
346
|
+
p_roof.add_argument("--tensor", action="store_true", help="use the fp16 tensor-core roof")
|
|
281
347
|
p_roof.add_argument("--peak-bw", type=float, help="override bandwidth in GB/s")
|
|
282
348
|
p_roof.add_argument("--peak-tflops", type=float, help="override compute in TFLOP/s")
|
|
283
349
|
p_roof.set_defaults(func=cmd_roofline)
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
"""Live telemetry through NVML (the library behind nvidia-smi).
|
|
2
|
+
|
|
3
|
+
libnvidia-ml ships with the driver, so this costs no extra dependency.
|
|
4
|
+
The point: theoretical peaks assume the max boost clock, but cards
|
|
5
|
+
downclock under load. Sampling the actual SM and memory clocks while a
|
|
6
|
+
kernel runs lets the bench report what the ceiling really was during the
|
|
7
|
+
measurement, not what the spec sheet promised.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import ctypes
|
|
13
|
+
import statistics
|
|
14
|
+
import sys
|
|
15
|
+
import threading
|
|
16
|
+
from dataclasses import dataclass
|
|
17
|
+
|
|
18
|
+
NVML_SUCCESS = 0
|
|
19
|
+
NVML_CLOCK_SM = 1
|
|
20
|
+
NVML_CLOCK_MEM = 2
|
|
21
|
+
NVML_TEMPERATURE_GPU = 0
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class NvmlError(RuntimeError):
|
|
25
|
+
def __init__(self, func: str, code: int):
|
|
26
|
+
super().__init__(f"{func} failed with NVML code {code}")
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class NvmlNotAvailableError(RuntimeError):
|
|
30
|
+
pass
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def load_library() -> ctypes.CDLL:
|
|
34
|
+
if sys.platform == "darwin":
|
|
35
|
+
raise NvmlNotAvailableError("NVML is not available on macOS")
|
|
36
|
+
names = ("nvml.dll",) if sys.platform == "win32" else ("libnvidia-ml.so.1", "libnvidia-ml.so")
|
|
37
|
+
for name in names:
|
|
38
|
+
try:
|
|
39
|
+
if sys.platform == "win32":
|
|
40
|
+
return ctypes.WinDLL(name) # pragma: no cover
|
|
41
|
+
return ctypes.CDLL(name)
|
|
42
|
+
except OSError:
|
|
43
|
+
continue
|
|
44
|
+
raise NvmlNotAvailableError("could not load libnvidia-ml; is the NVIDIA driver installed?")
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class Nvml:
|
|
48
|
+
"""Minimal wrapper. Like cudadrv.Driver, the lib is injectable so the
|
|
49
|
+
tests can run on machines with no NVIDIA driver."""
|
|
50
|
+
|
|
51
|
+
def __init__(self, lib=None):
|
|
52
|
+
self._lib = lib if lib is not None else load_library()
|
|
53
|
+
self._check("nvmlInit_v2", self._lib.nvmlInit_v2())
|
|
54
|
+
|
|
55
|
+
def _check(self, func: str, code: int) -> None:
|
|
56
|
+
if code != NVML_SUCCESS:
|
|
57
|
+
raise NvmlError(func, code)
|
|
58
|
+
|
|
59
|
+
def close(self) -> None:
|
|
60
|
+
self._lib.nvmlShutdown()
|
|
61
|
+
|
|
62
|
+
def device(self, index: int = 0) -> ctypes.c_void_p:
|
|
63
|
+
handle = ctypes.c_void_p()
|
|
64
|
+
self._check(
|
|
65
|
+
"nvmlDeviceGetHandleByIndex",
|
|
66
|
+
self._lib.nvmlDeviceGetHandleByIndex_v2(index, ctypes.byref(handle)),
|
|
67
|
+
)
|
|
68
|
+
return handle
|
|
69
|
+
|
|
70
|
+
def _uint_query(self, func_name: str, handle, *args) -> int:
|
|
71
|
+
out = ctypes.c_uint(0)
|
|
72
|
+
fn = getattr(self._lib, func_name)
|
|
73
|
+
self._check(func_name, fn(handle, *args, ctypes.byref(out)))
|
|
74
|
+
return out.value
|
|
75
|
+
|
|
76
|
+
def sm_clock_mhz(self, handle) -> int:
|
|
77
|
+
return self._uint_query("nvmlDeviceGetClockInfo", handle, NVML_CLOCK_SM)
|
|
78
|
+
|
|
79
|
+
def mem_clock_mhz(self, handle) -> int:
|
|
80
|
+
return self._uint_query("nvmlDeviceGetClockInfo", handle, NVML_CLOCK_MEM)
|
|
81
|
+
|
|
82
|
+
def max_sm_clock_mhz(self, handle) -> int:
|
|
83
|
+
return self._uint_query("nvmlDeviceGetMaxClockInfo", handle, NVML_CLOCK_SM)
|
|
84
|
+
|
|
85
|
+
def max_mem_clock_mhz(self, handle) -> int:
|
|
86
|
+
return self._uint_query("nvmlDeviceGetMaxClockInfo", handle, NVML_CLOCK_MEM)
|
|
87
|
+
|
|
88
|
+
def temperature_c(self, handle) -> int:
|
|
89
|
+
return self._uint_query("nvmlDeviceGetTemperature", handle, NVML_TEMPERATURE_GPU)
|
|
90
|
+
|
|
91
|
+
def power_w(self, handle) -> float:
|
|
92
|
+
return self._uint_query("nvmlDeviceGetPowerUsage", handle) / 1000.0
|
|
93
|
+
|
|
94
|
+
def power_limit_w(self, handle) -> float:
|
|
95
|
+
return self._uint_query("nvmlDeviceGetEnforcedPowerLimit", handle) / 1000.0
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
@dataclass
|
|
99
|
+
class Telemetry:
|
|
100
|
+
sm_clock_mhz: float
|
|
101
|
+
mem_clock_mhz: float
|
|
102
|
+
max_sm_clock_mhz: int
|
|
103
|
+
max_mem_clock_mhz: int
|
|
104
|
+
temperature_c: int
|
|
105
|
+
power_w: float
|
|
106
|
+
|
|
107
|
+
@property
|
|
108
|
+
def sm_clock_fraction(self) -> float:
|
|
109
|
+
return self.sm_clock_mhz / self.max_sm_clock_mhz if self.max_sm_clock_mhz else 1.0
|
|
110
|
+
|
|
111
|
+
@property
|
|
112
|
+
def mem_clock_fraction(self) -> float:
|
|
113
|
+
return self.mem_clock_mhz / self.max_mem_clock_mhz if self.max_mem_clock_mhz else 1.0
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def summarize_samples(
|
|
117
|
+
sm: list[int], mem: list[int], temp: list[int], power: list[float],
|
|
118
|
+
max_sm: int, max_mem: int,
|
|
119
|
+
) -> Telemetry:
|
|
120
|
+
return Telemetry(
|
|
121
|
+
sm_clock_mhz=statistics.fmean(sm),
|
|
122
|
+
mem_clock_mhz=statistics.fmean(mem),
|
|
123
|
+
max_sm_clock_mhz=max_sm,
|
|
124
|
+
max_mem_clock_mhz=max_mem,
|
|
125
|
+
temperature_c=max(temp),
|
|
126
|
+
power_w=statistics.fmean(power),
|
|
127
|
+
)
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
class Monitor:
    """Samples clocks/temperature/power on a background thread while a
    kernel benchmark runs in the main thread.

    Call ``start()`` before the measurement and ``stop()`` after it. Every
    ``start()`` begins a fresh set of samples, so one monitor can be reused
    for several measurements without the readings bleeding into each other.
    ``close()`` (or leaving a ``with`` block) stops sampling and closes the
    NVML wrapper.
    """

    def __init__(self, device_index: int = 0, interval_s: float = 0.02, nvml: Nvml | None = None):
        """Bind to device ``device_index``.

        ``interval_s`` is the pause between background samples. ``nvml`` is
        an already-open wrapper to use; when omitted a new ``Nvml`` is
        created.
        """
        owns_nvml = nvml is None
        self._nvml = Nvml() if owns_nvml else nvml
        try:
            self._handle = self._nvml.device(device_index)
        except Exception:
            # Do not leak a wrapper we created ourselves when the device
            # lookup fails; a caller-supplied one is left untouched.
            if owns_nvml:
                self._nvml.close()
            raise
        self._interval = interval_s
        self._stop = threading.Event()
        self._thread: threading.Thread | None = None
        self._sm: list[int] = []
        self._mem: list[int] = []
        self._temp: list[int] = []
        self._power: list[float] = []

    def __enter__(self) -> "Monitor":
        return self

    def __exit__(self, *exc_info) -> None:
        self.close()

    def _sample(self) -> None:
        """Append one reading of each metric to the sample lists."""
        self._sm.append(self._nvml.sm_clock_mhz(self._handle))
        self._mem.append(self._nvml.mem_clock_mhz(self._handle))
        self._temp.append(self._nvml.temperature_c(self._handle))
        self._power.append(self._nvml.power_w(self._handle))

    def _loop(self) -> None:
        """Sample every ``interval_s`` seconds until the stop event is set."""
        # Event.wait() returns False on timeout (keep sampling) and True once
        # the event is set (leave the loop).
        while not self._stop.wait(self._interval):
            self._sample()

    def _halt(self) -> None:
        """Signal the sampling thread, if any, and wait for it to exit."""
        self._stop.set()
        thread, self._thread = self._thread, None
        if thread is not None:
            thread.join()

    def start(self) -> None:
        """Begin a new measurement: reset the samples and start sampling."""
        # A previous run may still be sampling; never leave two threads
        # appending to the same lists.
        self._halt()
        for series in (self._sm, self._mem, self._temp, self._power):
            series.clear()
        self._stop.clear()
        self._sample()  # always have at least one sample
        self._thread = threading.Thread(target=self._loop, daemon=True)
        self._thread.start()

    def stop(self) -> Telemetry:
        """Stop sampling and return the summary of the collected samples."""
        self._halt()
        return summarize_samples(
            self._sm, self._mem, self._temp, self._power,
            self._nvml.max_sm_clock_mhz(self._handle),
            self._nvml.max_mem_clock_mhz(self._handle),
        )

    def close(self) -> None:
        """Stop any running sampler, then close the NVML wrapper."""
        # The background thread must not query NVML after it is closed.
        self._halt()
        self._nvml.close()
|
|
@@ -32,6 +32,41 @@ def fp32_cores_per_sm(major: int, minor: int) -> int:
|
|
|
32
32
|
return 64
|
|
33
33
|
|
|
34
34
|
|
|
35
|
+
# Dense tensor-core FLOPs per SM per clock, fp16 inputs with fp16
|
|
36
|
+
# accumulate. Derived from published board specs (V100 125 TF, T4 65 TF,
|
|
37
|
+
# A100 312 TF, 4090 330 TF, H100 SXM 989 TF, ...), which all divide out
|
|
38
|
+
# to clean powers of two per SM per clock. GeForce parts run at half
|
|
39
|
+
# this rate when accumulating in fp32.
|
|
40
|
+
_FP16_TENSOR_FLOPS_PER_SM: dict[tuple[int, int], int] = {
|
|
41
|
+
(7, 0): 1024, (7, 2): 1024, (7, 5): 1024,
|
|
42
|
+
(8, 0): 2048, (8, 6): 1024, (8, 7): 1024, (8, 9): 1024,
|
|
43
|
+
(9, 0): 4096,
|
|
44
|
+
(12, 0): 1024, (12, 1): 1024,
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
# Same idea for tf32 (only exists on Ampere and newer).
|
|
48
|
+
_TF32_TENSOR_FLOPS_PER_SM: dict[tuple[int, int], int] = {
|
|
49
|
+
(8, 0): 1024, (8, 6): 256, (8, 7): 256, (8, 9): 256,
|
|
50
|
+
(9, 0): 2048,
|
|
51
|
+
(12, 0): 256, (12, 1): 256,
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _tensor_tflops(table: dict, sm_count: int, clock_khz: int, major: int, minor: int) -> float | None:
    """Return the tensor-core peak in TFLOP/s, or ``None`` for unknown parts.

    ``table`` maps ``(major, minor)`` compute capability to FLOPs per SM
    per clock. The peak is that rate times the SM count times the clock
    (given in kHz), scaled to TFLOP/s.
    """
    per_sm_per_clock = table.get((major, minor))
    if per_sm_per_clock is not None:
        flops_per_second = per_sm_per_clock * sm_count * clock_khz * 1e3
        return flops_per_second / 1e12
    return None
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def fp16_tensor_tflops(sm_count: int, clock_khz: int, major: int, minor: int) -> float | None:
    """Return the fp16 tensor-core peak in TFLOP/s.

    Looks the compute capability up in ``_FP16_TENSOR_FLOPS_PER_SM``;
    returns ``None`` when it has no entry there.
    """
    table = _FP16_TENSOR_FLOPS_PER_SM
    return _tensor_tflops(table, sm_count, clock_khz, major, minor)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def tf32_tensor_tflops(sm_count: int, clock_khz: int, major: int, minor: int) -> float | None:
    """Return the tf32 tensor-core peak in TFLOP/s.

    Looks the compute capability up in ``_TF32_TENSOR_FLOPS_PER_SM``;
    returns ``None`` when it has no entry there.
    """
    table = _TF32_TENSOR_FLOPS_PER_SM
    return _tensor_tflops(table, sm_count, clock_khz, major, minor)
|
|
68
|
+
|
|
69
|
+
|
|
35
70
|
@dataclass
|
|
36
71
|
class Peaks:
|
|
37
72
|
"""Theoretical per-device ceilings derived from driver attributes."""
|
|
@@ -39,11 +74,15 @@ class Peaks:
|
|
|
39
74
|
mem_bandwidth_gbs: float | None
|
|
40
75
|
fp32_tflops: float | None
|
|
41
76
|
compute_capability: tuple[int, int] | None
|
|
77
|
+
fp16_tensor_tflops: float | None = None
|
|
78
|
+
tf32_tensor_tflops: float | None = None
|
|
42
79
|
|
|
43
80
|
def as_dict(self) -> dict:
|
|
44
81
|
return {
|
|
45
82
|
"theoretical_mem_bandwidth_gb_s": self.mem_bandwidth_gbs,
|
|
46
83
|
"theoretical_fp32_tflops": self.fp32_tflops,
|
|
84
|
+
"theoretical_fp16_tensor_tflops": self.fp16_tensor_tflops,
|
|
85
|
+
"theoretical_tf32_tensor_tflops": self.tf32_tensor_tflops,
|
|
47
86
|
"compute_capability": (
|
|
48
87
|
f"{self.compute_capability[0]}.{self.compute_capability[1]}"
|
|
49
88
|
if self.compute_capability
|
|
@@ -72,12 +111,19 @@ def derive(attrs: dict[str, int]) -> Peaks:
|
|
|
72
111
|
)
|
|
73
112
|
|
|
74
113
|
cc = None
|
|
75
|
-
flops = None
|
|
114
|
+
flops = fp16 = tf32 = None
|
|
76
115
|
if "compute_capability_major" in attrs and "compute_capability_minor" in attrs:
|
|
77
116
|
cc = (attrs["compute_capability_major"], attrs["compute_capability_minor"])
|
|
78
117
|
if "multiprocessor_count" in attrs and "clock_rate_khz" in attrs:
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
)
|
|
82
|
-
|
|
83
|
-
|
|
118
|
+
sm, clk = attrs["multiprocessor_count"], attrs["clock_rate_khz"]
|
|
119
|
+
flops = fp32_tflops(sm, clk, cc[0], cc[1])
|
|
120
|
+
fp16 = fp16_tensor_tflops(sm, clk, cc[0], cc[1])
|
|
121
|
+
tf32 = tf32_tensor_tflops(sm, clk, cc[0], cc[1])
|
|
122
|
+
|
|
123
|
+
return Peaks(
|
|
124
|
+
mem_bandwidth_gbs=bw,
|
|
125
|
+
fp32_tflops=flops,
|
|
126
|
+
compute_capability=cc,
|
|
127
|
+
fp16_tensor_tflops=fp16,
|
|
128
|
+
tf32_tensor_tflops=tf32,
|
|
129
|
+
)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: kernelmeter
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Query every CUDA device attribute without profiling a kernel, and benchmark your kernels against the hardware's speed of light.
|
|
5
5
|
Author: nuemaan
|
|
6
6
|
License: MIT
|
|
@@ -24,6 +24,11 @@ Dynamic: license-file
|
|
|
24
24
|
|
|
25
25
|
# kernelmeter
|
|
26
26
|
|
|
27
|
+
[](https://pypi.org/project/kernelmeter/)
|
|
28
|
+
[](https://github.com/nuemaan/kernelmeter/actions/workflows/ci.yml)
|
|
29
|
+
[](https://pypi.org/project/kernelmeter/)
|
|
30
|
+
[](LICENSE)
|
|
31
|
+
|
|
27
32
|
Small tools for one question: **is my GPU kernel actually good, and if
|
|
28
33
|
not, what exactly is holding it back?** All in one package with zero
|
|
29
34
|
required dependencies.
|
|
@@ -36,7 +41,9 @@ required dependencies.
|
|
|
36
41
|
reference, and scores it against the roofline: the best your card could
|
|
37
42
|
possibly do for that kernel's mix of math and memory traffic. 240 GB/s
|
|
38
43
|
means nothing on its own; "76% of attainable" tells you how much room
|
|
39
|
-
is left.
|
|
44
|
+
is left. While the kernel runs it also samples the real clocks, power
|
|
45
|
+
and temperature through NVML, and re-scores against the ceiling the
|
|
46
|
+
card actually held.
|
|
40
47
|
* `kernelmeter roofline` draws your card's roofline in the terminal and
|
|
41
48
|
shows where a kernel sits on it.
|
|
42
49
|
* `kernelmeter occupancy` answers "why is my occupancy 50%?" from block
|
|
@@ -137,7 +144,7 @@ kernelmeter bench mybench.py
|
|
|
137
144
|
```text
|
|
138
145
|
kernel median ms GB/s TFLOP/s bound %roof vs ref correct
|
|
139
146
|
------------------------------------------------------------------------------------------
|
|
140
|
-
my_add 3.3393 241.2 - mem 75.3% 1.01x PASS
|
|
147
|
+
my_add 3.2725 246.1 - mem 76.9% 1.03x PASS
|
|
141
148
|
```
|
|
142
149
|
|
|
143
150
|
* **correct** - your output matched the reference. If this says FAIL,
|
|
@@ -151,8 +158,39 @@ my_add 3.3393 241.2 - mem 75.3% 1.01x
|
|
|
151
158
|
|
|
152
159
|
Pass `flops_per_call` too and the roofline model places your kernel
|
|
153
160
|
precisely; pass `peak_tflops=...` if your kernel runs on tensor cores so
|
|
154
|
-
it gets judged against the right ceiling
|
|
155
|
-
|
|
161
|
+
it gets judged against the right ceiling (`kernelmeter info` prints the
|
|
162
|
+
derived fp16/tf32 tensor peaks for your card). Raw `%peak bw` and
|
|
163
|
+
`%fp32` numbers are always in the `--json` output.
|
|
164
|
+
|
|
165
|
+
When NVML is available (it ships with the driver) a second table follows
|
|
166
|
+
with what the card was doing during each measurement:
|
|
167
|
+
|
|
168
|
+
```text
|
|
169
|
+
telemetry sm MHz mem MHz temp power %roof@clk
|
|
170
|
+
-----------------------------------------------------------------------
|
|
171
|
+
my_add 1062/1590 5000 42C 53.1W 76.9%
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
`%roof@clk` is the same roofline score, but against the ceiling at the
|
|
175
|
+
clocks the card actually held. If `%roof` looks bad but `%roof@clk` is
|
|
176
|
+
high, your kernel is fine: the card is thermal or power limited, and no
|
|
177
|
+
amount of kernel work will change that. A real example, cuBLAS fp32
|
|
178
|
+
matmul on a 70 W T4:
|
|
179
|
+
|
|
180
|
+
```text
|
|
181
|
+
kernel median ms GB/s TFLOP/s bound %roof vs ref correct
|
|
182
|
+
----------------------------------------------------------------------------------------
|
|
183
|
+
fp32_matmul 32.0354 6.3 4.29 comp 52.7% - -
|
|
184
|
+
|
|
185
|
+
telemetry sm MHz mem MHz temp power %roof@clk
|
|
186
|
+
-----------------------------------------------------------------------
|
|
187
|
+
fp32_matmul 877/1590 5000 46C 70.4W 95.5%
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
53% of peak looks like a kernel problem. The telemetry shows it is not:
|
|
191
|
+
the card hit its 70 W power limit and dropped to 877 MHz, and at those
|
|
192
|
+
clocks the kernel was at 95.5% of what the silicon could deliver. cuBLAS
|
|
193
|
+
was never the problem.
|
|
156
194
|
|
|
157
195
|
Timing uses CUDA events with warmup, and the L2 cache is flushed between
|
|
158
196
|
iterations so small workloads can't fake huge bandwidth numbers from
|
|
@@ -194,6 +232,9 @@ The `o` is your kernel, the `x` is the ridge point. Left of the ridge,
|
|
|
194
232
|
more FLOPs are free: the memory traffic is the bill you are paying
|
|
195
233
|
anyway. That is the whole argument for kernel fusion, in one picture.
|
|
196
234
|
No GPU around? `--peak-bw` and `--peak-tflops` let you draw any card.
|
|
235
|
+
`--tensor` swaps in the fp16 tensor-core roof, which moves the ridge
|
|
236
|
+
point far to the left; that picture explains why tensor-core kernels
|
|
237
|
+
are almost always memory-bound.
|
|
197
238
|
|
|
198
239
|
## Why is my occupancy low?
|
|
199
240
|
|
|
@@ -280,15 +321,15 @@ whether your kernels are any good:
|
|
|
280
321
|
## Caveats
|
|
281
322
|
|
|
282
323
|
* Theoretical peaks are computed from the max boost clock the driver
|
|
283
|
-
reports. Sustained clocks under load are lower;
|
|
284
|
-
|
|
285
|
-
* The
|
|
286
|
-
tensor
|
|
287
|
-
|
|
324
|
+
reports. Sustained clocks under load are lower; the telemetry table
|
|
325
|
+
and `kernelmeter ceiling` both show what you can actually reach.
|
|
326
|
+
* The tensor-core peaks are dense rates with fp16 accumulate. GeForce
|
|
327
|
+
cards run tensor cores at half rate when accumulating in fp32, and
|
|
328
|
+
sparse rates are double; pass `peak_tflops=...` when those apply.
|
|
288
329
|
* The occupancy command implements the standard calculator model. Real
|
|
289
330
|
occupancy can differ (launch bounds, driver decisions); confirm with
|
|
290
331
|
Nsight Compute when it matters.
|
|
291
|
-
* Attribute names above id
|
|
332
|
+
* Attribute names above id 143 are best-effort against the CUDA 12.x
|
|
292
333
|
headers. Values are always read live from your driver. PRs that extend
|
|
293
334
|
the name table are welcome.
|
|
294
335
|
|
|
@@ -304,6 +345,10 @@ them on plain GitHub runners. For an end-to-end check on a real GPU there
|
|
|
304
345
|
is a [Modal](https://modal.com) script: `modal run scripts/modal_gpu_test.py`.
|
|
305
346
|
The numbers in this README come from that script on a T4.
|
|
306
347
|
|
|
348
|
+
Releases are tag-driven: bump the version in `pyproject.toml`, add a
|
|
349
|
+
[CHANGELOG.md](CHANGELOG.md) entry, push a `v*` tag. CI tests, builds and
|
|
350
|
+
publishes to PyPI through trusted publishing.
|
|
351
|
+
|
|
307
352
|
## License
|
|
308
353
|
|
|
309
354
|
MIT
|
|
@@ -7,6 +7,7 @@ src/kernelmeter/bench.py
|
|
|
7
7
|
src/kernelmeter/ceiling.py
|
|
8
8
|
src/kernelmeter/cli.py
|
|
9
9
|
src/kernelmeter/cudadrv.py
|
|
10
|
+
src/kernelmeter/nvml.py
|
|
10
11
|
src/kernelmeter/occupancy.py
|
|
11
12
|
src/kernelmeter/peaks.py
|
|
12
13
|
src/kernelmeter/roofline.py
|
|
@@ -21,6 +22,8 @@ tests/test_bench_math.py
|
|
|
21
22
|
tests/test_bench_roofline.py
|
|
22
23
|
tests/test_cli.py
|
|
23
24
|
tests/test_cli_new_commands.py
|
|
25
|
+
tests/test_nvml.py
|
|
24
26
|
tests/test_occupancy.py
|
|
25
27
|
tests/test_peaks.py
|
|
26
|
-
tests/test_roofline.py
|
|
28
|
+
tests/test_roofline.py
|
|
29
|
+
tests/test_tensor_peaks.py
|
|
@@ -24,6 +24,13 @@ def test_unknown_but_supported_ids_get_generic_names(fake_driver):
|
|
|
24
24
|
assert result["attribute_150"] == 7
|
|
25
25
|
|
|
26
26
|
|
|
27
|
+
def test_cuda12_range_names(fake_driver):
|
|
28
|
+
dev = fake_driver.device(0)
|
|
29
|
+
result = attrs.query_all(fake_driver, dev)
|
|
30
|
+
assert result["numa_id"] == -1
|
|
31
|
+
assert result["gpu_pci_device_id"] == 0x1EB810DE
|
|
32
|
+
|
|
33
|
+
|
|
27
34
|
def test_device_metadata(fake_driver):
|
|
28
35
|
dev = fake_driver.device(0)
|
|
29
36
|
assert dev.name == "NVIDIA GeForce RTX 3090"
|
|
@@ -20,6 +20,13 @@ def test_roofline_from_device(patched_driver, capsys):
|
|
|
20
20
|
assert "*" in out # the chart got drawn
|
|
21
21
|
|
|
22
22
|
|
|
23
|
+
def test_roofline_tensor_roof(patched_driver, capsys):
|
|
24
|
+
assert cli.main(["roofline", "--tensor"]) == 0
|
|
25
|
+
out = capsys.readouterr().out
|
|
26
|
+
assert "fp16 tensor" in out
|
|
27
|
+
assert "142.33 TFLOP/s" in out
|
|
28
|
+
|
|
29
|
+
|
|
23
30
|
def test_roofline_manual_peaks_need_no_device(monkeypatch, capsys):
|
|
24
31
|
from kernelmeter.cudadrv import CudaNotAvailableError
|
|
25
32
|
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
import time
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
|
|
5
|
+
from kernelmeter import nvml
|
|
6
|
+
|
|
7
|
+
NVML_SUCCESS = 0
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class FakeNvmlLib:
|
|
11
|
+
"""Duck-types the libnvidia-ml entry points the wrapper uses.
|
|
12
|
+
Simulates a card boosting to 1590 MHz but holding 1530 under load."""
|
|
13
|
+
|
|
14
|
+
def __init__(self):
|
|
15
|
+
self.shutdown_called = False
|
|
16
|
+
|
|
17
|
+
def nvmlInit_v2(self):
|
|
18
|
+
return NVML_SUCCESS
|
|
19
|
+
|
|
20
|
+
def nvmlShutdown(self):
|
|
21
|
+
self.shutdown_called = True
|
|
22
|
+
return NVML_SUCCESS
|
|
23
|
+
|
|
24
|
+
def nvmlDeviceGetHandleByIndex_v2(self, index, ptr):
|
|
25
|
+
ptr._obj.value = 42
|
|
26
|
+
return NVML_SUCCESS
|
|
27
|
+
|
|
28
|
+
def nvmlDeviceGetClockInfo(self, handle, clock_type, ptr):
|
|
29
|
+
ptr._obj.value = 1530 if clock_type == nvml.NVML_CLOCK_SM else 4985
|
|
30
|
+
return NVML_SUCCESS
|
|
31
|
+
|
|
32
|
+
def nvmlDeviceGetMaxClockInfo(self, handle, clock_type, ptr):
|
|
33
|
+
ptr._obj.value = 1590 if clock_type == nvml.NVML_CLOCK_SM else 5001
|
|
34
|
+
return NVML_SUCCESS
|
|
35
|
+
|
|
36
|
+
def nvmlDeviceGetTemperature(self, handle, sensor, ptr):
|
|
37
|
+
ptr._obj.value = 63
|
|
38
|
+
return NVML_SUCCESS
|
|
39
|
+
|
|
40
|
+
def nvmlDeviceGetPowerUsage(self, handle, ptr):
|
|
41
|
+
ptr._obj.value = 45200 # milliwatts
|
|
42
|
+
return NVML_SUCCESS
|
|
43
|
+
|
|
44
|
+
def nvmlDeviceGetEnforcedPowerLimit(self, handle, ptr):
|
|
45
|
+
ptr._obj.value = 70000
|
|
46
|
+
return NVML_SUCCESS
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def test_wrapper_reads_values():
|
|
50
|
+
n = nvml.Nvml(lib=FakeNvmlLib())
|
|
51
|
+
h = n.device(0)
|
|
52
|
+
assert n.sm_clock_mhz(h) == 1530
|
|
53
|
+
assert n.max_sm_clock_mhz(h) == 1590
|
|
54
|
+
assert n.mem_clock_mhz(h) == 4985
|
|
55
|
+
assert n.temperature_c(h) == 63
|
|
56
|
+
assert n.power_w(h) == pytest.approx(45.2)
|
|
57
|
+
assert n.power_limit_w(h) == pytest.approx(70.0)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def test_error_code_raises():
|
|
61
|
+
class Broken(FakeNvmlLib):
|
|
62
|
+
def nvmlDeviceGetTemperature(self, handle, sensor, ptr):
|
|
63
|
+
return 999
|
|
64
|
+
|
|
65
|
+
n = nvml.Nvml(lib=Broken())
|
|
66
|
+
with pytest.raises(nvml.NvmlError):
|
|
67
|
+
n.temperature_c(n.device(0))
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def test_summarize_samples():
|
|
71
|
+
t = nvml.summarize_samples(
|
|
72
|
+
sm=[1500, 1560], mem=[4985, 4985], temp=[60, 63], power=[44.0, 46.0],
|
|
73
|
+
max_sm=1590, max_mem=5001,
|
|
74
|
+
)
|
|
75
|
+
assert t.sm_clock_mhz == pytest.approx(1530)
|
|
76
|
+
assert t.temperature_c == 63
|
|
77
|
+
assert t.power_w == pytest.approx(45.0)
|
|
78
|
+
assert t.sm_clock_fraction == pytest.approx(1530 / 1590)
|
|
79
|
+
assert t.mem_clock_fraction == pytest.approx(4985 / 5001)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def test_monitor_collects_while_running():
|
|
83
|
+
lib = FakeNvmlLib()
|
|
84
|
+
mon = nvml.Monitor(nvml=nvml.Nvml(lib=lib), interval_s=0.001)
|
|
85
|
+
mon.start()
|
|
86
|
+
time.sleep(0.02)
|
|
87
|
+
t = mon.stop()
|
|
88
|
+
mon.close()
|
|
89
|
+
assert t.sm_clock_mhz == pytest.approx(1530)
|
|
90
|
+
assert t.max_sm_clock_mhz == 1590
|
|
91
|
+
assert lib.shutdown_called
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
|
|
3
|
+
from kernelmeter import attrs, peaks
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def test_t4_fp16_tensor():
|
|
7
|
+
# T4: 40 SMs, 1590 MHz, CC 7.5 -> spec sheet says 65 TFLOPS fp16
|
|
8
|
+
tf = peaks.fp16_tensor_tflops(40, 1_590_000, 7, 5)
|
|
9
|
+
assert tf == pytest.approx(65.1, rel=0.01)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def test_a100_fp16_and_tf32():
|
|
13
|
+
# A100: 108 SMs, 1410 MHz -> 312 TFLOPS fp16, 156 TFLOPS tf32
|
|
14
|
+
assert peaks.fp16_tensor_tflops(108, 1_410_000, 8, 0) == pytest.approx(312, rel=0.01)
|
|
15
|
+
assert peaks.tf32_tensor_tflops(108, 1_410_000, 8, 0) == pytest.approx(156, rel=0.01)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def test_h100_fp16_tensor():
|
|
19
|
+
# H100 SXM: 132 SMs, 1830 MHz -> 989 TFLOPS dense fp16
|
|
20
|
+
assert peaks.fp16_tensor_tflops(132, 1_830_000, 9, 0) == pytest.approx(989, rel=0.01)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def test_pre_tensor_core_cards_return_none():
|
|
24
|
+
assert peaks.fp16_tensor_tflops(20, 1_700_000, 6, 1) is None
|
|
25
|
+
assert peaks.tf32_tensor_tflops(40, 1_590_000, 7, 5) is None # Turing has no tf32
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def test_derive_includes_tensor_peaks(fake_driver):
|
|
29
|
+
# fake is a 3090: CC 8.6, 82 SMs, 1695 MHz -> 1024 flops/sm/clk = 142 TF
|
|
30
|
+
dev = fake_driver.device(0)
|
|
31
|
+
p = peaks.derive(attrs.query_all(fake_driver, dev))
|
|
32
|
+
assert p.fp16_tensor_tflops == pytest.approx(142.3, rel=0.01)
|
|
33
|
+
assert p.tf32_tensor_tflops == pytest.approx(35.6, rel=0.01)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def test_sustained_peaks_scaling():
|
|
37
|
+
from kernelmeter import bench
|
|
38
|
+
from kernelmeter.nvml import Telemetry
|
|
39
|
+
|
|
40
|
+
base = peaks.Peaks(mem_bandwidth_gbs=320.0, fp32_tflops=8.0, compute_capability=(7, 5))
|
|
41
|
+
t = Telemetry(
|
|
42
|
+
sm_clock_mhz=1431, mem_clock_mhz=4500, max_sm_clock_mhz=1590,
|
|
43
|
+
max_mem_clock_mhz=5000, temperature_c=70, power_w=60.0,
|
|
44
|
+
)
|
|
45
|
+
scaled = bench.sustained_peaks(base, t)
|
|
46
|
+
assert scaled.fp32_tflops == pytest.approx(8.0 * 1431 / 1590)
|
|
47
|
+
assert scaled.mem_bandwidth_gbs == pytest.approx(320.0 * 0.9)
|
|
48
|
+
# with an override the override gets scaled instead
|
|
49
|
+
scaled2 = bench.sustained_peaks(base, t, peak_tflops_override=65.0)
|
|
50
|
+
assert scaled2.fp32_tflops == pytest.approx(65.0 * 1431 / 1590)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|