kernelmeter 0.2.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. {kernelmeter-0.2.0/src/kernelmeter.egg-info → kernelmeter-0.3.0}/PKG-INFO +56 -11
  2. {kernelmeter-0.2.0 → kernelmeter-0.3.0}/README.md +55 -10
  3. {kernelmeter-0.2.0 → kernelmeter-0.3.0}/pyproject.toml +1 -1
  4. {kernelmeter-0.2.0 → kernelmeter-0.3.0}/src/kernelmeter/__init__.py +1 -1
  5. {kernelmeter-0.2.0 → kernelmeter-0.3.0}/src/kernelmeter/attrs.py +22 -0
  6. {kernelmeter-0.2.0 → kernelmeter-0.3.0}/src/kernelmeter/bench.py +50 -0
  7. {kernelmeter-0.2.0 → kernelmeter-0.3.0}/src/kernelmeter/cli.py +68 -2
  8. kernelmeter-0.3.0/src/kernelmeter/nvml.py +172 -0
  9. {kernelmeter-0.2.0 → kernelmeter-0.3.0}/src/kernelmeter/peaks.py +52 -6
  10. {kernelmeter-0.2.0 → kernelmeter-0.3.0/src/kernelmeter.egg-info}/PKG-INFO +56 -11
  11. {kernelmeter-0.2.0 → kernelmeter-0.3.0}/src/kernelmeter.egg-info/SOURCES.txt +4 -1
  12. {kernelmeter-0.2.0 → kernelmeter-0.3.0}/tests/test_attrs.py +7 -0
  13. {kernelmeter-0.2.0 → kernelmeter-0.3.0}/tests/test_cli_new_commands.py +7 -0
  14. kernelmeter-0.3.0/tests/test_nvml.py +91 -0
  15. kernelmeter-0.3.0/tests/test_tensor_peaks.py +50 -0
  16. {kernelmeter-0.2.0 → kernelmeter-0.3.0}/LICENSE +0 -0
  17. {kernelmeter-0.2.0 → kernelmeter-0.3.0}/setup.cfg +0 -0
  18. {kernelmeter-0.2.0 → kernelmeter-0.3.0}/src/kernelmeter/ceiling.py +0 -0
  19. {kernelmeter-0.2.0 → kernelmeter-0.3.0}/src/kernelmeter/cudadrv.py +0 -0
  20. {kernelmeter-0.2.0 → kernelmeter-0.3.0}/src/kernelmeter/occupancy.py +0 -0
  21. {kernelmeter-0.2.0 → kernelmeter-0.3.0}/src/kernelmeter/roofline.py +0 -0
  22. {kernelmeter-0.2.0 → kernelmeter-0.3.0}/src/kernelmeter.egg-info/dependency_links.txt +0 -0
  23. {kernelmeter-0.2.0 → kernelmeter-0.3.0}/src/kernelmeter.egg-info/entry_points.txt +0 -0
  24. {kernelmeter-0.2.0 → kernelmeter-0.3.0}/src/kernelmeter.egg-info/requires.txt +0 -0
  25. {kernelmeter-0.2.0 → kernelmeter-0.3.0}/src/kernelmeter.egg-info/top_level.txt +0 -0
  26. {kernelmeter-0.2.0 → kernelmeter-0.3.0}/tests/test_bench_math.py +0 -0
  27. {kernelmeter-0.2.0 → kernelmeter-0.3.0}/tests/test_bench_roofline.py +0 -0
  28. {kernelmeter-0.2.0 → kernelmeter-0.3.0}/tests/test_cli.py +0 -0
  29. {kernelmeter-0.2.0 → kernelmeter-0.3.0}/tests/test_occupancy.py +0 -0
  30. {kernelmeter-0.2.0 → kernelmeter-0.3.0}/tests/test_peaks.py +0 -0
  31. {kernelmeter-0.2.0 → kernelmeter-0.3.0}/tests/test_roofline.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kernelmeter
3
- Version: 0.2.0
3
+ Version: 0.3.0
4
4
  Summary: Query every CUDA device attribute without profiling a kernel, and benchmark your kernels against the hardware's speed of light.
5
5
  Author: nuemaan
6
6
  License: MIT
@@ -24,6 +24,11 @@ Dynamic: license-file
24
24
 
25
25
  # kernelmeter
26
26
 
27
+ [![PyPI](https://img.shields.io/pypi/v/kernelmeter)](https://pypi.org/project/kernelmeter/)
28
+ [![CI](https://github.com/nuemaan/kernelmeter/actions/workflows/ci.yml/badge.svg)](https://github.com/nuemaan/kernelmeter/actions/workflows/ci.yml)
29
+ [![Python](https://img.shields.io/pypi/pyversions/kernelmeter)](https://pypi.org/project/kernelmeter/)
30
+ [![License: MIT](https://img.shields.io/badge/license-MIT-green)](LICENSE)
31
+
27
32
  Small tools for one question: **is my GPU kernel actually good, and if
28
33
  not, what exactly is holding it back?** All in one package with zero
29
34
  required dependencies.
@@ -36,7 +41,9 @@ required dependencies.
36
41
  reference, and scores it against the roofline: the best your card could
37
42
  possibly do for that kernel's mix of math and memory traffic. 240 GB/s
38
43
  means nothing on its own; "76% of attainable" tells you how much room
39
- is left.
44
+ is left. While the kernel runs it also samples the real clocks, power
45
+ and temperature through NVML, and re-scores against the ceiling the
46
+ card actually held.
40
47
  * `kernelmeter roofline` draws your card's roofline in the terminal and
41
48
  shows where a kernel sits on it.
42
49
  * `kernelmeter occupancy` answers "why is my occupancy 50%?" from block
@@ -137,7 +144,7 @@ kernelmeter bench mybench.py
137
144
  ```text
138
145
  kernel median ms GB/s TFLOP/s bound %roof vs ref correct
139
146
  ------------------------------------------------------------------------------------------
140
- my_add 3.3393 241.2 - mem 75.3% 1.01x PASS
147
+ my_add 3.2725 246.1 - mem 76.9% 1.03x PASS
141
148
  ```
142
149
 
143
150
  * **correct** - your output matched the reference. If this says FAIL,
@@ -151,8 +158,39 @@ my_add 3.3393 241.2 - mem 75.3% 1.01x
151
158
 
152
159
  Pass `flops_per_call` too and the roofline model places your kernel
153
160
  precisely; pass `peak_tflops=...` if your kernel runs on tensor cores so
154
- it gets judged against the right ceiling. Raw `%peak bw` and `%fp32`
155
- numbers are always in the `--json` output.
161
+ it gets judged against the right ceiling (`kernelmeter info` prints the
162
+ derived fp16/tf32 tensor peaks for your card). Raw `%peak bw` and
163
+ `%fp32` numbers are always in the `--json` output.
164
+
165
+ When NVML is available (it ships with the driver) a second table follows
166
+ with what the card was doing during each measurement:
167
+
168
+ ```text
169
+ telemetry sm MHz mem MHz temp power %roof@clk
170
+ -----------------------------------------------------------------------
171
+ my_add 1062/1590 5000 42C 53.1W 76.9%
172
+ ```
173
+
174
+ `%roof@clk` is the same roofline score, but against the ceiling at the
175
+ clocks the card actually held. If `%roof` looks bad but `%roof@clk` is
176
+ high, your kernel is fine: the card is thermal or power limited, and no
177
+ amount of kernel work will change that. A real example, cuBLAS fp32
178
+ matmul on a 70 W T4:
179
+
180
+ ```text
181
+ kernel median ms GB/s TFLOP/s bound %roof vs ref correct
182
+ ----------------------------------------------------------------------------------------
183
+ fp32_matmul 32.0354 6.3 4.29 comp 52.7% - -
184
+
185
+ telemetry sm MHz mem MHz temp power %roof@clk
186
+ -----------------------------------------------------------------------
187
+ fp32_matmul 877/1590 5000 46C 70.4W 95.5%
188
+ ```
189
+
190
+ 53% of peak looks like a kernel problem. The telemetry shows it is not:
191
+ the card hit its 70 W power limit and dropped to 877 MHz, and at those
192
+ clocks the kernel was at 95.5% of what the silicon could deliver. cuBLAS
193
+ was never the problem.
156
194
 
157
195
  Timing uses CUDA events with warmup, and the L2 cache is flushed between
158
196
  iterations so small workloads can't fake huge bandwidth numbers from
@@ -194,6 +232,9 @@ The `o` is your kernel, the `x` is the ridge point. Left of the ridge,
194
232
  more FLOPs are free: the memory traffic is the bill you are paying
195
233
  anyway. That is the whole argument for kernel fusion, in one picture.
196
234
  No GPU around? `--peak-bw` and `--peak-tflops` let you draw any card.
235
+ `--tensor` swaps in the fp16 tensor-core roof, which moves the ridge
236
+ point far to the right; that picture explains why tensor-core kernels
237
+ are almost always memory-bound.
197
238
 
198
239
  ## Why is my occupancy low?
199
240
 
@@ -280,15 +321,15 @@ whether your kernels are any good:
280
321
  ## Caveats
281
322
 
282
323
  * Theoretical peaks are computed from the max boost clock the driver
283
- reports. Sustained clocks under load are lower; `kernelmeter ceiling`
284
- measures what you can actually reach.
285
- * The derived compute peak is for plain FP32 on CUDA cores. For
286
- tensor-core kernels pass `peak_tflops=...` to the benchmark decorator
287
- so the roofline uses the right roof.
324
+ reports. Sustained clocks under load are lower; the telemetry table
325
+ and `kernelmeter ceiling` both show what you can actually reach.
326
+ * The tensor-core peaks are dense rates with fp16 accumulate. GeForce
327
+ cards run tensor cores at half rate when accumulating in fp32, and
328
+ sparse rates are double; pass `peak_tflops=...` when those apply.
288
329
  * The occupancy command implements the standard calculator model. Real
289
330
  occupancy can differ (launch bounds, driver decisions); confirm with
290
331
  Nsight Compute when it matters.
291
- * Attribute names above id 121 are best-effort against the CUDA 12.x
332
+ * Attribute names above id 143 are best-effort against the CUDA 12.x
292
333
  headers. Values are always read live from your driver. PRs that extend
293
334
  the name table are welcome.
294
335
 
@@ -304,6 +345,10 @@ them on plain GitHub runners. For an end-to-end check on a real GPU there
304
345
  is a [Modal](https://modal.com) script: `modal run scripts/modal_gpu_test.py`.
305
346
  The numbers in this README come from that script on a T4.
306
347
 
348
+ Releases are tag-driven: bump the version in `pyproject.toml`, add a
349
+ [CHANGELOG.md](CHANGELOG.md) entry, push a `v*` tag. CI tests, builds and
350
+ publishes to PyPI through trusted publishing.
351
+
307
352
  ## License
308
353
 
309
354
  MIT
@@ -1,5 +1,10 @@
1
1
  # kernelmeter
2
2
 
3
+ [![PyPI](https://img.shields.io/pypi/v/kernelmeter)](https://pypi.org/project/kernelmeter/)
4
+ [![CI](https://github.com/nuemaan/kernelmeter/actions/workflows/ci.yml/badge.svg)](https://github.com/nuemaan/kernelmeter/actions/workflows/ci.yml)
5
+ [![Python](https://img.shields.io/pypi/pyversions/kernelmeter)](https://pypi.org/project/kernelmeter/)
6
+ [![License: MIT](https://img.shields.io/badge/license-MIT-green)](LICENSE)
7
+
3
8
  Small tools for one question: **is my GPU kernel actually good, and if
4
9
  not, what exactly is holding it back?** All in one package with zero
5
10
  required dependencies.
@@ -12,7 +17,9 @@ required dependencies.
12
17
  reference, and scores it against the roofline: the best your card could
13
18
  possibly do for that kernel's mix of math and memory traffic. 240 GB/s
14
19
  means nothing on its own; "76% of attainable" tells you how much room
15
- is left.
20
+ is left. While the kernel runs it also samples the real clocks, power
21
+ and temperature through NVML, and re-scores against the ceiling the
22
+ card actually held.
16
23
  * `kernelmeter roofline` draws your card's roofline in the terminal and
17
24
  shows where a kernel sits on it.
18
25
  * `kernelmeter occupancy` answers "why is my occupancy 50%?" from block
@@ -113,7 +120,7 @@ kernelmeter bench mybench.py
113
120
  ```text
114
121
  kernel median ms GB/s TFLOP/s bound %roof vs ref correct
115
122
  ------------------------------------------------------------------------------------------
116
- my_add 3.3393 241.2 - mem 75.3% 1.01x PASS
123
+ my_add 3.2725 246.1 - mem 76.9% 1.03x PASS
117
124
  ```
118
125
 
119
126
  * **correct** - your output matched the reference. If this says FAIL,
@@ -127,8 +134,39 @@ my_add 3.3393 241.2 - mem 75.3% 1.01x
127
134
 
128
135
  Pass `flops_per_call` too and the roofline model places your kernel
129
136
  precisely; pass `peak_tflops=...` if your kernel runs on tensor cores so
130
- it gets judged against the right ceiling. Raw `%peak bw` and `%fp32`
131
- numbers are always in the `--json` output.
137
+ it gets judged against the right ceiling (`kernelmeter info` prints the
138
+ derived fp16/tf32 tensor peaks for your card). Raw `%peak bw` and
139
+ `%fp32` numbers are always in the `--json` output.
140
+
141
+ When NVML is available (it ships with the driver) a second table follows
142
+ with what the card was doing during each measurement:
143
+
144
+ ```text
145
+ telemetry sm MHz mem MHz temp power %roof@clk
146
+ -----------------------------------------------------------------------
147
+ my_add 1062/1590 5000 42C 53.1W 76.9%
148
+ ```
149
+
150
+ `%roof@clk` is the same roofline score, but against the ceiling at the
151
+ clocks the card actually held. If `%roof` looks bad but `%roof@clk` is
152
+ high, your kernel is fine: the card is thermal or power limited, and no
153
+ amount of kernel work will change that. A real example, cuBLAS fp32
154
+ matmul on a 70 W T4:
155
+
156
+ ```text
157
+ kernel median ms GB/s TFLOP/s bound %roof vs ref correct
158
+ ----------------------------------------------------------------------------------------
159
+ fp32_matmul 32.0354 6.3 4.29 comp 52.7% - -
160
+
161
+ telemetry sm MHz mem MHz temp power %roof@clk
162
+ -----------------------------------------------------------------------
163
+ fp32_matmul 877/1590 5000 46C 70.4W 95.5%
164
+ ```
165
+
166
+ 53% of peak looks like a kernel problem. The telemetry shows it is not:
167
+ the card hit its 70 W power limit and dropped to 877 MHz, and at those
168
+ clocks the kernel was at 95.5% of what the silicon could deliver. cuBLAS
169
+ was never the problem.
132
170
 
133
171
  Timing uses CUDA events with warmup, and the L2 cache is flushed between
134
172
  iterations so small workloads can't fake huge bandwidth numbers from
@@ -170,6 +208,9 @@ The `o` is your kernel, the `x` is the ridge point. Left of the ridge,
170
208
  more FLOPs are free: the memory traffic is the bill you are paying
171
209
  anyway. That is the whole argument for kernel fusion, in one picture.
172
210
  No GPU around? `--peak-bw` and `--peak-tflops` let you draw any card.
211
+ `--tensor` swaps in the fp16 tensor-core roof, which moves the ridge
212
+ point far to the right; that picture explains why tensor-core kernels
213
+ are almost always memory-bound.
173
214
 
174
215
  ## Why is my occupancy low?
175
216
 
@@ -256,15 +297,15 @@ whether your kernels are any good:
256
297
  ## Caveats
257
298
 
258
299
  * Theoretical peaks are computed from the max boost clock the driver
259
- reports. Sustained clocks under load are lower; `kernelmeter ceiling`
260
- measures what you can actually reach.
261
- * The derived compute peak is for plain FP32 on CUDA cores. For
262
- tensor-core kernels pass `peak_tflops=...` to the benchmark decorator
263
- so the roofline uses the right roof.
300
+ reports. Sustained clocks under load are lower; the telemetry table
301
+ and `kernelmeter ceiling` both show what you can actually reach.
302
+ * The tensor-core peaks are dense rates with fp16 accumulate. GeForce
303
+ cards run tensor cores at half rate when accumulating in fp32, and
304
+ sparse rates are double; pass `peak_tflops=...` when those apply.
264
305
  * The occupancy command implements the standard calculator model. Real
265
306
  occupancy can differ (launch bounds, driver decisions); confirm with
266
307
  Nsight Compute when it matters.
267
- * Attribute names above id 121 are best-effort against the CUDA 12.x
308
+ * Attribute names above id 143 are best-effort against the CUDA 12.x
268
309
  headers. Values are always read live from your driver. PRs that extend
269
310
  the name table are welcome.
270
311
 
@@ -280,6 +321,10 @@ them on plain GitHub runners. For an end-to-end check on a real GPU there
280
321
  is a [Modal](https://modal.com) script: `modal run scripts/modal_gpu_test.py`.
281
322
  The numbers in this README come from that script on a T4.
282
323
 
324
+ Releases are tag-driven: bump the version in `pyproject.toml`, add a
325
+ [CHANGELOG.md](CHANGELOG.md) entry, push a `v*` tag. CI tests, builds and
326
+ publishes to PyPI through trusted publishing.
327
+
283
328
  ## License
284
329
 
285
330
  MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "kernelmeter"
7
- version = "0.2.0"
7
+ version = "0.3.0"
8
8
  description = "Query every CUDA device attribute without profiling a kernel, and benchmark your kernels against the hardware's speed of light."
9
9
  readme = "README.md"
10
10
  license = { text = "MIT" }
@@ -7,7 +7,7 @@ from . import occupancy, roofline
7
7
  from .occupancy import Occupancy
8
8
  from .peaks import Peaks
9
9
 
10
- __version__ = "0.2.0"
10
+ __version__ = "0.3.0"
11
11
 
12
12
  __all__ = [
13
13
  "BenchResult",
@@ -132,6 +132,28 @@ KNOWN_ATTRS: dict[int, str] = {
132
132
  119: "mempool_supported_handle_types",
133
133
  120: "cluster_launch",
134
134
  121: "deferred_mapping_cuda_array_supported",
135
+ 122: "can_use_64_bit_stream_mem_ops",
136
+ 123: "can_use_stream_wait_value_nor",
137
+ 124: "dma_buf_supported",
138
+ 125: "ipc_event_supported",
139
+ 126: "mem_sync_domain_count",
140
+ 127: "tensor_map_access_supported",
141
+ 128: "handle_type_fabric_supported",
142
+ 129: "unified_function_pointers",
143
+ 130: "numa_config",
144
+ 131: "numa_id",
145
+ 132: "multicast_supported",
146
+ 133: "mps_enabled",
147
+ 134: "host_numa_id",
148
+ 135: "d3d12_cig_supported",
149
+ 136: "mem_decompress_algorithm_mask",
150
+ 137: "mem_decompress_maximum_length",
151
+ 138: "vulkan_cig_supported",
152
+ 139: "gpu_pci_device_id",
153
+ 140: "gpu_pci_subsystem_id",
154
+ 141: "host_numa_virtual_memory_management_supported",
155
+ 142: "host_numa_memory_pools_supported",
156
+ 143: "host_numa_multinode_ipc_supported",
135
157
  }
136
158
 
137
159
 
@@ -46,8 +46,14 @@ class BenchResult:
46
46
  intensity: float | None = None
47
47
  bound: str | None = None
48
48
  pct_roofline: float | None = None
49
+ pct_roof_sustained: float | None = None
49
50
  pct_peak_bw: float | None = None
50
51
  pct_peak_fp32: float | None = None
52
+ sm_clock_mhz: float | None = None
53
+ max_sm_clock_mhz: int | None = None
54
+ mem_clock_mhz: float | None = None
55
+ temperature_c: int | None = None
56
+ power_w: float | None = None
51
57
  ref_ms_median: float | None = None
52
58
  speedup_vs_ref: float | None = None
53
59
  correct: bool | None = None
@@ -159,6 +165,21 @@ def roofline_score(
159
165
  return None, None, None
160
166
 
161
167
 
168
def sustained_peaks(peaks: _peaks.Peaks, telemetry, peak_tflops_override: float | None = None) -> _peaks.Peaks:
    """Derate the theoretical peaks to the clocks seen during the run.

    Bandwidth is multiplied by ``telemetry.mem_clock_fraction`` and the
    compute roof by ``telemetry.sm_clock_fraction``. The compute roof is
    ``peak_tflops_override`` when that is truthy, otherwise
    ``peaks.fp32_tflops``; the scaled value is returned in the
    ``fp32_tflops`` field either way. A missing (falsy) peak stays ``None``.
    """
    compute_roof = peak_tflops_override or peaks.fp32_tflops

    sustained_bw = None
    if peaks.mem_bandwidth_gbs:
        sustained_bw = peaks.mem_bandwidth_gbs * telemetry.mem_clock_fraction

    sustained_tf = None
    if compute_roof:
        sustained_tf = compute_roof * telemetry.sm_clock_fraction

    return _peaks.Peaks(
        mem_bandwidth_gbs=sustained_bw,
        fp32_tflops=sustained_tf,
        compute_capability=peaks.compute_capability,
    )
181
+
182
+
162
183
  def diff_results(baseline: list[dict], results: list["BenchResult"], threshold_pct: float = 5.0):
163
184
  """Compare a run against a saved baseline. Returns (rows, regressions)
164
185
  where rows are (name, old_ms, new_ms, delta_pct) and regressions lists
@@ -238,7 +259,25 @@ def run(spec: BenchSpec, peaks: _peaks.Peaks | None = None, flush_l2: bool = Tru
238
259
  if spec.ref is not None:
239
260
  correct, max_err = _check_correctness(spec, args)
240
261
 
262
+ monitor = None
263
+ try:
264
+ from . import nvml as _nvml
265
+
266
+ monitor = _nvml.Monitor()
267
+ monitor.start()
268
+ except Exception:
269
+ monitor = None
270
+
241
271
  times = _time_fn(spec.fn, args, spec.warmup, spec.iters, flush_l2)
272
+
273
+ telemetry = None
274
+ if monitor is not None:
275
+ try:
276
+ telemetry = monitor.stop()
277
+ monitor.close()
278
+ except Exception:
279
+ telemetry = None
280
+
242
281
  ms_mean, ms_median, ms_min = summarize_times(times)
243
282
 
244
283
  nbytes = _resolve(spec.bytes_per_call, args)
@@ -249,6 +288,11 @@ def run(spec: BenchSpec, peaks: _peaks.Peaks | None = None, flush_l2: bool = Tru
249
288
  nbytes, nflops, gbps, tflops, peaks, spec.peak_tflops
250
289
  )
251
290
 
291
+ pct_sustained = None
292
+ if telemetry is not None:
293
+ scaled = sustained_peaks(peaks, telemetry, spec.peak_tflops)
294
+ pct_sustained = roofline_score(nbytes, nflops, gbps, tflops, scaled)[2]
295
+
252
296
  ref_ms = speedup = None
253
297
  if spec.ref is not None:
254
298
  ref_times = _time_fn(spec.ref, args, spec.warmup, spec.iters, flush_l2)
@@ -265,8 +309,14 @@ def run(spec: BenchSpec, peaks: _peaks.Peaks | None = None, flush_l2: bool = Tru
265
309
  intensity=ai,
266
310
  bound=kernel_bound,
267
311
  pct_roofline=pct_roof,
312
+ pct_roof_sustained=pct_sustained,
268
313
  pct_peak_bw=pct_of_peak(gbps, peaks.mem_bandwidth_gbs) if gbps else None,
269
314
  pct_peak_fp32=pct_of_peak(tflops, peaks.fp32_tflops) if tflops else None,
315
+ sm_clock_mhz=telemetry.sm_clock_mhz if telemetry else None,
316
+ max_sm_clock_mhz=telemetry.max_sm_clock_mhz if telemetry else None,
317
+ mem_clock_mhz=telemetry.mem_clock_mhz if telemetry else None,
318
+ temperature_c=telemetry.temperature_c if telemetry else None,
319
+ power_w=telemetry.power_w if telemetry else None,
270
320
  ref_ms_median=ref_ms,
271
321
  speedup_vs_ref=speedup,
272
322
  correct=correct,
@@ -30,6 +30,28 @@ def _device_attrs(ordinal: int = 0) -> dict[str, int]:
30
30
  return _attrs.query_all(driver, driver.device(ordinal))
31
31
 
32
32
 
33
def _print_live_telemetry(ordinal: int) -> None:
    """Print one line with the device's current clocks, temperature and power.

    Best-effort: when the NVML wrapper cannot be imported or initialised,
    or any query fails, nothing is printed and no exception escapes.
    """
    try:
        from . import nvml as _nvml

        session = _nvml.Nvml()
    except Exception:
        return

    try:
        handle = session.device(ordinal)
        # Build the whole line before printing so a failing query prints nothing.
        sm_part = f"sm {session.sm_clock_mhz(handle)}/{session.max_sm_clock_mhz(handle)} MHz, "
        mem_part = f"mem {session.mem_clock_mhz(handle)}/{session.max_mem_clock_mhz(handle)} MHz, "
        temp_part = f"{session.temperature_c(handle)}C, "
        power_part = f"{session.power_w(handle):.1f}/{session.power_limit_w(handle):.0f}W"
        print(f" live: {sm_part}{mem_part}{temp_part}{power_part}")
    except Exception:
        pass
    finally:
        # The session initialised NVML above, so always shut it down.
        session.close()
53
+
54
+
33
55
  # ---------------------------------------------------------------------------
34
56
  # info
35
57
  # ---------------------------------------------------------------------------
@@ -79,6 +101,17 @@ def cmd_info(args: argparse.Namespace) -> int:
79
101
  " theoretical FP32 peak : "
80
102
  + _fmt(derived["theoretical_fp32_tflops"], " TFLOP/s", nd=2)
81
103
  )
104
+ if derived.get("theoretical_fp16_tensor_tflops"):
105
+ print(
106
+ " theoretical fp16 tensor : "
107
+ + _fmt(derived["theoretical_fp16_tensor_tflops"], " TFLOP/s (dense)", nd=2)
108
+ )
109
+ if derived.get("theoretical_tf32_tensor_tflops"):
110
+ print(
111
+ " theoretical tf32 tensor : "
112
+ + _fmt(derived["theoretical_tf32_tensor_tflops"], " TFLOP/s (dense)", nd=2)
113
+ )
114
+ _print_live_telemetry(dev["ordinal"])
82
115
  print(f"\n {'attribute':<48} value")
83
116
  print(f" {'-' * 48} {'-' * 12}")
84
117
  for name, value in dev["attributes"].items():
@@ -137,6 +170,27 @@ def cmd_bench(args: argparse.Namespace) -> int:
137
170
  f"{_fmt(r.pct_roofline, '%'):>7} {speedup:>8} {correct:>8}"
138
171
  )
139
172
 
173
+ if any(r.sm_clock_mhz for r in results):
174
+ print()
175
+ theader = (
176
+ f"{'telemetry':<24} {'sm MHz':>10} {'mem MHz':>9} "
177
+ f"{'temp':>6} {'power':>7} {'%roof@clk':>10}"
178
+ )
179
+ print(theader)
180
+ print("-" * len(theader))
181
+ for r in results:
182
+ if not r.sm_clock_mhz:
183
+ continue
184
+ print(
185
+ f"{r.name:<24} {r.sm_clock_mhz:>5.0f}/{r.max_sm_clock_mhz:<4} "
186
+ f"{_fmt(r.mem_clock_mhz, nd=0):>9} {r.temperature_c or '-':>5}C "
187
+ f"{_fmt(r.power_w, 'W'):>7} {_fmt(r.pct_roof_sustained, '%'):>10}"
188
+ )
189
+ print(
190
+ "%roof@clk scores against the ceiling at the clocks the card "
191
+ "actually held during the run"
192
+ )
193
+
140
194
  ok = all(r.error is None and r.correct in (None, True) for r in results)
141
195
 
142
196
  if args.compare:
@@ -169,7 +223,17 @@ def cmd_roofline(args: argparse.Namespace) -> int:
169
223
  name = dev.name
170
224
  peaks = _peaks.derive(_attrs.query_all(driver, dev))
171
225
  peak_bw = peak_bw or peaks.mem_bandwidth_gbs
172
- peak_tf = peak_tf or peaks.fp32_tflops
226
+ if args.tensor:
227
+ if peaks.fp16_tensor_tflops is None:
228
+ print(
229
+ "error: no tensor-core rate known for this card; "
230
+ "pass --peak-tflops instead.",
231
+ file=sys.stderr,
232
+ )
233
+ return 1
234
+ peak_tf = peak_tf or peaks.fp16_tensor_tflops
235
+ else:
236
+ peak_tf = peak_tf or peaks.fp32_tflops
173
237
  except CudaNotAvailableError:
174
238
  pass
175
239
  if not peak_bw or not peak_tf:
@@ -183,8 +247,9 @@ def cmd_roofline(args: argparse.Namespace) -> int:
183
247
  ridge = _roofline.ridge_point(peak_tf, peak_bw)
184
248
  if name:
185
249
  print(f"Device {args.device}: {name}")
250
+ roof_kind = "fp16 tensor" if args.tensor else "fp32"
186
251
  print(f" peak bandwidth : {peak_bw:.1f} GB/s")
187
- print(f" peak compute : {peak_tf:.2f} TFLOP/s")
252
+ print(f" peak compute : {peak_tf:.2f} TFLOP/s ({roof_kind})")
188
253
  print(f" ridge point : {ridge:.1f} flop/byte\n")
189
254
  for line in _roofline.render(peak_tf, peak_bw, ai=args.ai):
190
255
  print(line)
@@ -278,6 +343,7 @@ def main(argv: list[str] | None = None) -> int:
278
343
  p_roof = sub.add_parser("roofline", help="draw the device roofline")
279
344
  p_roof.add_argument("--ai", type=float, help="mark a kernel at this arithmetic intensity")
280
345
  p_roof.add_argument("--device", type=int, default=0)
346
+ p_roof.add_argument("--tensor", action="store_true", help="use the fp16 tensor-core roof")
281
347
  p_roof.add_argument("--peak-bw", type=float, help="override bandwidth in GB/s")
282
348
  p_roof.add_argument("--peak-tflops", type=float, help="override compute in TFLOP/s")
283
349
  p_roof.set_defaults(func=cmd_roofline)
@@ -0,0 +1,172 @@
1
+ """Live telemetry through NVML (the library behind nvidia-smi).
2
+
3
+ libnvidia-ml ships with the driver, so this costs no extra dependency.
4
+ The point: theoretical peaks assume the max boost clock, but cards
5
+ downclock under load. Sampling the actual SM and memory clocks while a
6
+ kernel runs lets the bench report what the ceiling really was during the
7
+ measurement, not what the spec sheet promised.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import ctypes
13
+ import statistics
14
+ import sys
15
+ import threading
16
+ from dataclasses import dataclass
17
+
18
+ NVML_SUCCESS = 0
19
+ NVML_CLOCK_SM = 1
20
+ NVML_CLOCK_MEM = 2
21
+ NVML_TEMPERATURE_GPU = 0
22
+
23
+
24
class NvmlError(RuntimeError):
    """Raised when an NVML entry point returns a non-success status.

    Attributes:
        func: Name of the NVML function that failed.
        code: Raw status code that function returned.
    """

    def __init__(self, func: str, code: int):
        super().__init__(f"{func} failed with NVML code {code}")
        # Keep the parts so callers can branch on the status code instead
        # of parsing the message text.
        self.func = func
        self.code = code
27
+
28
+
29
class NvmlNotAvailableError(RuntimeError):
    """Raised when the NVML shared library cannot be loaded on this machine."""
31
+
32
+
33
def load_library() -> ctypes.CDLL:
    """Load the NVML shared library and return the ctypes handle.

    Raises:
        NvmlNotAvailableError: on macOS, or when none of the candidate
            library names can be loaded.
    """
    platform = sys.platform
    if platform == "darwin":
        raise NvmlNotAvailableError("NVML is not available on macOS")

    on_windows = platform == "win32"
    if on_windows:
        candidates = ("nvml.dll",)
    else:
        # Versioned soname first, then the unversioned name.
        candidates = ("libnvidia-ml.so.1", "libnvidia-ml.so")

    for candidate in candidates:
        try:
            if on_windows:
                return ctypes.WinDLL(candidate)  # pragma: no cover
            return ctypes.CDLL(candidate)
        except OSError:
            continue
    raise NvmlNotAvailableError("could not load libnvidia-ml; is the NVIDIA driver installed?")
45
+
46
+
47
class Nvml:
    """Minimal wrapper. Like cudadrv.Driver, the lib is injectable so the
    tests can run on machines with no NVIDIA driver.

    Constructing an instance calls ``nvmlInit_v2``; ``close()`` calls
    ``nvmlShutdown``. Instances are also context managers, so the shutdown
    cannot be skipped on an error path::

        with Nvml() as n:
            handle = n.device(0)
    """

    def __init__(self, lib=None):
        """Initialise NVML on ``lib``, loading the system library when omitted.

        Raises:
            NvmlNotAvailableError: if the library has to be loaded and cannot be.
            NvmlError: if ``nvmlInit_v2`` reports a failure.
        """
        self._lib = lib if lib is not None else load_library()
        self._check("nvmlInit_v2", self._lib.nvmlInit_v2())

    def __enter__(self) -> "Nvml":
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        # Always shut down; returning None lets any exception propagate.
        self.close()

    def _check(self, func: str, code: int) -> None:
        """Raise NvmlError unless ``code`` is NVML_SUCCESS."""
        if code != NVML_SUCCESS:
            raise NvmlError(func, code)

    def close(self) -> None:
        """Call ``nvmlShutdown``; its return code is ignored."""
        self._lib.nvmlShutdown()

    def device(self, index: int = 0) -> ctypes.c_void_p:
        """Return the NVML device handle for ``index``.

        Raises:
            NvmlError: if the handle lookup fails.
        """
        handle = ctypes.c_void_p()
        self._check(
            "nvmlDeviceGetHandleByIndex",
            self._lib.nvmlDeviceGetHandleByIndex_v2(index, ctypes.byref(handle)),
        )
        return handle

    def _uint_query(self, func_name: str, handle, *args) -> int:
        """Call ``func_name(handle, *args, &out)`` and return the unsigned int
        it wrote to ``out``; raises NvmlError on a non-success status."""
        out = ctypes.c_uint(0)
        fn = getattr(self._lib, func_name)
        self._check(func_name, fn(handle, *args, ctypes.byref(out)))
        return out.value

    def sm_clock_mhz(self, handle) -> int:
        """Current SM clock as reported by ``nvmlDeviceGetClockInfo``."""
        return self._uint_query("nvmlDeviceGetClockInfo", handle, NVML_CLOCK_SM)

    def mem_clock_mhz(self, handle) -> int:
        """Current memory clock as reported by ``nvmlDeviceGetClockInfo``."""
        return self._uint_query("nvmlDeviceGetClockInfo", handle, NVML_CLOCK_MEM)

    def max_sm_clock_mhz(self, handle) -> int:
        """Maximum SM clock as reported by ``nvmlDeviceGetMaxClockInfo``."""
        return self._uint_query("nvmlDeviceGetMaxClockInfo", handle, NVML_CLOCK_SM)

    def max_mem_clock_mhz(self, handle) -> int:
        """Maximum memory clock as reported by ``nvmlDeviceGetMaxClockInfo``."""
        return self._uint_query("nvmlDeviceGetMaxClockInfo", handle, NVML_CLOCK_MEM)

    def temperature_c(self, handle) -> int:
        """GPU temperature from ``nvmlDeviceGetTemperature`` (GPU sensor)."""
        return self._uint_query("nvmlDeviceGetTemperature", handle, NVML_TEMPERATURE_GPU)

    def power_w(self, handle) -> float:
        """Current power draw; the raw NVML value is divided by 1000."""
        return self._uint_query("nvmlDeviceGetPowerUsage", handle) / 1000.0

    def power_limit_w(self, handle) -> float:
        """Enforced power limit; the raw NVML value is divided by 1000."""
        return self._uint_query("nvmlDeviceGetEnforcedPowerLimit", handle) / 1000.0
96
+
97
+
98
@dataclass
class Telemetry:
    """Summary of the clock, temperature and power samples for one run."""

    sm_clock_mhz: float
    mem_clock_mhz: float
    max_sm_clock_mhz: int
    max_mem_clock_mhz: int
    temperature_c: int
    power_w: float

    @property
    def sm_clock_fraction(self) -> float:
        """SM clock divided by its maximum; 1.0 when the maximum is falsy."""
        if not self.max_sm_clock_mhz:
            return 1.0
        return self.sm_clock_mhz / self.max_sm_clock_mhz

    @property
    def mem_clock_fraction(self) -> float:
        """Memory clock divided by its maximum; 1.0 when the maximum is falsy."""
        if not self.max_mem_clock_mhz:
            return 1.0
        return self.mem_clock_mhz / self.max_mem_clock_mhz
114
+
115
+
116
def summarize_samples(
    sm: list[int], mem: list[int], temp: list[int], power: list[float],
    max_sm: int, max_mem: int,
) -> Telemetry:
    """Collapse raw samples into a single Telemetry record.

    Clocks and power are averaged over the samples; temperature is the
    highest sample. The maximum clocks are passed through unchanged.
    """
    mean_sm = statistics.fmean(sm)
    mean_mem = statistics.fmean(mem)
    hottest = max(temp)
    mean_power = statistics.fmean(power)
    return Telemetry(
        sm_clock_mhz=mean_sm,
        mem_clock_mhz=mean_mem,
        max_sm_clock_mhz=max_sm,
        max_mem_clock_mhz=max_mem,
        temperature_c=hottest,
        power_w=mean_power,
    )
128
+
129
+
130
class Monitor:
    """Samples clocks/temperature/power on a background thread while a
    kernel benchmark runs in the main thread.

    Typical use is ``start()``, run the workload, ``stop()`` to get the
    summary, then ``close()``.
    """

    def __init__(self, device_index: int = 0, interval_s: float = 0.02, nvml: Nvml | None = None):
        """Open the device handle and prepare empty sample buffers.

        Args:
            device_index: NVML index of the device to sample.
            interval_s: Seconds the background thread waits between samples.
            nvml: Wrapper to use; a fresh ``Nvml()`` is created when omitted.
        """
        created_here = nvml is None
        self._nvml = Nvml() if created_here else nvml
        try:
            self._handle = self._nvml.device(device_index)
        except Exception:
            # Nvml() already initialised the library. No Monitor is handed
            # back on this path, so nobody else could shut it down.
            if created_here:
                self._nvml.close()
            raise
        self._interval = interval_s
        self._stop = threading.Event()
        self._thread: threading.Thread | None = None
        self._sm: list[int] = []
        self._mem: list[int] = []
        self._temp: list[int] = []
        self._power: list[float] = []

    def _sample(self) -> None:
        """Take one reading of all four metrics and record it."""
        # Read everything before appending so a failing query cannot leave
        # the four lists with different lengths.
        sm = self._nvml.sm_clock_mhz(self._handle)
        mem = self._nvml.mem_clock_mhz(self._handle)
        temp = self._nvml.temperature_c(self._handle)
        power = self._nvml.power_w(self._handle)
        self._sm.append(sm)
        self._mem.append(mem)
        self._temp.append(temp)
        self._power.append(power)

    def _loop(self) -> None:
        """Thread body: sample every ``interval_s`` until asked to stop."""
        while not self._stop.wait(self._interval):
            try:
                self._sample()
            except Exception:
                # Telemetry is best-effort. End sampling quietly rather than
                # letting the thread die with a traceback in the middle of
                # the benchmark output; samples taken so far are kept.
                return

    def start(self) -> None:
        """Take an initial sample and launch the sampling thread."""
        self._stop.clear()
        self._sample()  # always have at least one sample
        self._thread = threading.Thread(target=self._loop, daemon=True)
        self._thread.start()

    def stop(self) -> Telemetry:
        """Stop the sampling thread and return the summarised samples."""
        self._stop.set()
        if self._thread is not None:
            self._thread.join()
        return summarize_samples(
            self._sm, self._mem, self._temp, self._power,
            self._nvml.max_sm_clock_mhz(self._handle),
            self._nvml.max_mem_clock_mhz(self._handle),
        )

    def close(self) -> None:
        """Close the underlying NVML wrapper."""
        self._nvml.close()
@@ -32,6 +32,41 @@ def fp32_cores_per_sm(major: int, minor: int) -> int:
32
32
  return 64
33
33
 
34
34
 
35
# Peak dense tensor-core throughput for fp16 inputs with fp16 accumulate,
# in FLOPs per SM per clock, keyed by (major, minor) compute capability.
# The figures come from published board specs (V100 125 TF, T4 65 TF,
# A100 312 TF, 4090 330 TF, H100 SXM 989 TF, ...), each of which divides
# out to a clean power of two per SM per clock. GeForce parts run at half
# this rate when accumulating in fp32.
_FP16_TENSOR_FLOPS_PER_SM: dict[tuple[int, int], int] = {
    (7, 0): 1024,
    (7, 2): 1024,
    (7, 5): 1024,
    (8, 0): 2048,
    (8, 6): 1024,
    (8, 7): 1024,
    (8, 9): 1024,
    (9, 0): 4096,
    (12, 0): 1024,
    (12, 1): 1024,
}

# The tf32 counterpart, in the same units. tf32 only exists on Ampere and
# newer, so earlier compute capabilities have no entry here.
_TF32_TENSOR_FLOPS_PER_SM: dict[tuple[int, int], int] = {
    (8, 0): 1024,
    (8, 6): 256,
    (8, 7): 256,
    (8, 9): 256,
    (9, 0): 2048,
    (12, 0): 256,
    (12, 1): 256,
}
53
+
54
+
55
def _tensor_tflops(table: dict, sm_count: int, clock_khz: int, major: int, minor: int) -> float | None:
    """Turn a per-SM, per-clock rate from ``table`` into a device TFLOP/s peak.

    ``table`` is looked up by ``(major, minor)``; a missing entry yields
    ``None``. Otherwise the rate is scaled by the SM count and the clock
    (given in kHz) and expressed in units of 1e12 FLOP/s.
    """
    per_sm_per_clock = table.get((major, minor))
    return (
        None
        if per_sm_per_clock is None
        else per_sm_per_clock * sm_count * clock_khz * 1e3 / 1e12
    )
60
+
61
+
62
def fp16_tensor_tflops(sm_count: int, clock_khz: int, major: int, minor: int) -> float | None:
    """Return the dense fp16 tensor-core peak in TFLOP/s.

    Looks up compute capability ``(major, minor)`` in the fp16 rate table
    and scales by ``sm_count`` and ``clock_khz``. Returns ``None`` when the
    compute capability has no entry in that table.
    """
    return _tensor_tflops(_FP16_TENSOR_FLOPS_PER_SM, sm_count, clock_khz, major, minor)
64
+
65
+
66
def tf32_tensor_tflops(sm_count: int, clock_khz: int, major: int, minor: int) -> float | None:
    """Return the dense tf32 tensor-core peak in TFLOP/s.

    Looks up compute capability ``(major, minor)`` in the tf32 rate table
    and scales by ``sm_count`` and ``clock_khz``. Returns ``None`` when the
    compute capability has no entry in that table.
    """
    return _tensor_tflops(_TF32_TENSOR_FLOPS_PER_SM, sm_count, clock_khz, major, minor)
68
+
69
+
35
70
  @dataclass
36
71
  class Peaks:
37
72
  """Theoretical per-device ceilings derived from driver attributes."""
@@ -39,11 +74,15 @@ class Peaks:
39
74
  mem_bandwidth_gbs: float | None
40
75
  fp32_tflops: float | None
41
76
  compute_capability: tuple[int, int] | None
77
+ fp16_tensor_tflops: float | None = None
78
+ tf32_tensor_tflops: float | None = None
42
79
 
43
80
  def as_dict(self) -> dict:
44
81
  return {
45
82
  "theoretical_mem_bandwidth_gb_s": self.mem_bandwidth_gbs,
46
83
  "theoretical_fp32_tflops": self.fp32_tflops,
84
+ "theoretical_fp16_tensor_tflops": self.fp16_tensor_tflops,
85
+ "theoretical_tf32_tensor_tflops": self.tf32_tensor_tflops,
47
86
  "compute_capability": (
48
87
  f"{self.compute_capability[0]}.{self.compute_capability[1]}"
49
88
  if self.compute_capability
@@ -72,12 +111,19 @@ def derive(attrs: dict[str, int]) -> Peaks:
72
111
  )
73
112
 
74
113
  cc = None
75
- flops = None
114
+ flops = fp16 = tf32 = None
76
115
  if "compute_capability_major" in attrs and "compute_capability_minor" in attrs:
77
116
  cc = (attrs["compute_capability_major"], attrs["compute_capability_minor"])
78
117
  if "multiprocessor_count" in attrs and "clock_rate_khz" in attrs:
79
- flops = fp32_tflops(
80
- attrs["multiprocessor_count"], attrs["clock_rate_khz"], cc[0], cc[1]
81
- )
82
-
83
- return Peaks(mem_bandwidth_gbs=bw, fp32_tflops=flops, compute_capability=cc)
118
+ sm, clk = attrs["multiprocessor_count"], attrs["clock_rate_khz"]
119
+ flops = fp32_tflops(sm, clk, cc[0], cc[1])
120
+ fp16 = fp16_tensor_tflops(sm, clk, cc[0], cc[1])
121
+ tf32 = tf32_tensor_tflops(sm, clk, cc[0], cc[1])
122
+
123
+ return Peaks(
124
+ mem_bandwidth_gbs=bw,
125
+ fp32_tflops=flops,
126
+ compute_capability=cc,
127
+ fp16_tensor_tflops=fp16,
128
+ tf32_tensor_tflops=tf32,
129
+ )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kernelmeter
3
- Version: 0.2.0
3
+ Version: 0.3.0
4
4
  Summary: Query every CUDA device attribute without profiling a kernel, and benchmark your kernels against the hardware's speed of light.
5
5
  Author: nuemaan
6
6
  License: MIT
@@ -24,6 +24,11 @@ Dynamic: license-file
24
24
 
25
25
  # kernelmeter
26
26
 
27
+ [![PyPI](https://img.shields.io/pypi/v/kernelmeter)](https://pypi.org/project/kernelmeter/)
28
+ [![CI](https://github.com/nuemaan/kernelmeter/actions/workflows/ci.yml/badge.svg)](https://github.com/nuemaan/kernelmeter/actions/workflows/ci.yml)
29
+ [![Python](https://img.shields.io/pypi/pyversions/kernelmeter)](https://pypi.org/project/kernelmeter/)
30
+ [![License: MIT](https://img.shields.io/badge/license-MIT-green)](LICENSE)
31
+
27
32
  Small tools for one question: **is my GPU kernel actually good, and if
28
33
  not, what exactly is holding it back?** All in one package with zero
29
34
  required dependencies.
@@ -36,7 +41,9 @@ required dependencies.
36
41
  reference, and scores it against the roofline: the best your card could
37
42
  possibly do for that kernel's mix of math and memory traffic. 240 GB/s
38
43
  means nothing on its own; "76% of attainable" tells you how much room
39
- is left.
44
+ is left. While the kernel runs it also samples the real clocks, power
45
+ and temperature through NVML, and re-scores against the ceiling the
46
+ card actually held.
40
47
  * `kernelmeter roofline` draws your card's roofline in the terminal and
41
48
  shows where a kernel sits on it.
42
49
  * `kernelmeter occupancy` answers "why is my occupancy 50%?" from block
@@ -137,7 +144,7 @@ kernelmeter bench mybench.py
137
144
  ```text
138
145
  kernel median ms GB/s TFLOP/s bound %roof vs ref correct
139
146
  ------------------------------------------------------------------------------------------
140
- my_add 3.3393 241.2 - mem 75.3% 1.01x PASS
147
+ my_add 3.2725 246.1 - mem 76.9% 1.03x PASS
141
148
  ```
142
149
 
143
150
  * **correct** - your output matched the reference. If this says FAIL,
@@ -151,8 +158,39 @@ my_add 3.3393 241.2 - mem 75.3% 1.01x
151
158
 
152
159
  Pass `flops_per_call` too and the roofline model places your kernel
153
160
  precisely; pass `peak_tflops=...` if your kernel runs on tensor cores so
154
- it gets judged against the right ceiling. Raw `%peak bw` and `%fp32`
155
- numbers are always in the `--json` output.
161
+ it gets judged against the right ceiling (`kernelmeter info` prints the
162
+ derived fp16/tf32 tensor peaks for your card). Raw `%peak bw` and
163
+ `%fp32` numbers are always in the `--json` output.
164
+
165
+ When NVML is available (it ships with the driver) a second table follows
166
+ with what the card was doing during each measurement:
167
+
168
+ ```text
169
+ telemetry sm MHz mem MHz temp power %roof@clk
170
+ -----------------------------------------------------------------------
171
+ my_add 1062/1590 5000 42C 53.1W 76.9%
172
+ ```
173
+
174
+ `%roof@clk` is the same roofline score, but against the ceiling at the
175
+ clocks the card actually held. If `%roof` looks bad but `%roof@clk` is
176
+ high, your kernel is fine: the card is thermal or power limited, and no
177
+ amount of kernel work will change that. A real example, cuBLAS fp32
178
+ matmul on a 70 W T4:
179
+
180
+ ```text
181
+ kernel median ms GB/s TFLOP/s bound %roof vs ref correct
182
+ ----------------------------------------------------------------------------------------
183
+ fp32_matmul 32.0354 6.3 4.29 comp 52.7% - -
184
+
185
+ telemetry sm MHz mem MHz temp power %roof@clk
186
+ -----------------------------------------------------------------------
187
+ fp32_matmul 877/1590 5000 46C 70.4W 95.5%
188
+ ```
189
+
190
+ 53% of peak looks like a kernel problem. The telemetry shows it is not:
191
+ the card hit its 70 W power limit and dropped to 877 MHz, and at those
192
+ clocks the kernel was at 95.5% of what the silicon could deliver. cuBLAS
193
+ was never the problem.
156
194
 
157
195
  Timing uses CUDA events with warmup, and the L2 cache is flushed between
158
196
  iterations so small workloads can't fake huge bandwidth numbers from
@@ -194,6 +232,9 @@ The `o` is your kernel, the `x` is the ridge point. Left of the ridge,
194
232
  more FLOPs are free: the memory traffic is the bill you are paying
195
233
  anyway. That is the whole argument for kernel fusion, in one picture.
196
234
  No GPU around? `--peak-bw` and `--peak-tflops` let you draw any card.
235
+ `--tensor` swaps in the fp16 tensor-core roof, which moves the ridge
236
+ point far to the right; that picture explains why tensor-core kernels
237
+ are almost always memory-bound.
197
238
 
198
239
  ## Why is my occupancy low?
199
240
 
@@ -280,15 +321,15 @@ whether your kernels are any good:
280
321
  ## Caveats
281
322
 
282
323
  * Theoretical peaks are computed from the max boost clock the driver
283
- reports. Sustained clocks under load are lower; `kernelmeter ceiling`
284
- measures what you can actually reach.
285
- * The derived compute peak is for plain FP32 on CUDA cores. For
286
- tensor-core kernels pass `peak_tflops=...` to the benchmark decorator
287
- so the roofline uses the right roof.
324
+ reports. Sustained clocks under load are lower; the telemetry table
325
+ and `kernelmeter ceiling` both show what you can actually reach.
326
+ * The tensor-core peaks are dense rates with fp16 accumulate. GeForce
327
+ cards run tensor cores at half rate when accumulating in fp32, and
328
+ sparse rates are double; pass `peak_tflops=...` when those apply.
288
329
  * The occupancy command implements the standard calculator model. Real
289
330
  occupancy can differ (launch bounds, driver decisions); confirm with
290
331
  Nsight Compute when it matters.
291
- * Attribute names above id 121 are best-effort against the CUDA 12.x
332
+ * Attribute names above id 143 are best-effort against the CUDA 12.x
292
333
  headers. Values are always read live from your driver. PRs that extend
293
334
  the name table are welcome.
294
335
 
@@ -304,6 +345,10 @@ them on plain GitHub runners. For an end-to-end check on a real GPU there
304
345
  is a [Modal](https://modal.com) script: `modal run scripts/modal_gpu_test.py`.
305
346
  The numbers in this README come from that script on a T4.
306
347
 
348
+ Releases are tag-driven: bump the version in `pyproject.toml`, add a
349
+ [CHANGELOG.md](CHANGELOG.md) entry, push a `v*` tag. CI tests, builds and
350
+ publishes to PyPI through trusted publishing.
351
+
307
352
  ## License
308
353
 
309
354
  MIT
@@ -7,6 +7,7 @@ src/kernelmeter/bench.py
7
7
  src/kernelmeter/ceiling.py
8
8
  src/kernelmeter/cli.py
9
9
  src/kernelmeter/cudadrv.py
10
+ src/kernelmeter/nvml.py
10
11
  src/kernelmeter/occupancy.py
11
12
  src/kernelmeter/peaks.py
12
13
  src/kernelmeter/roofline.py
@@ -21,6 +22,8 @@ tests/test_bench_math.py
21
22
  tests/test_bench_roofline.py
22
23
  tests/test_cli.py
23
24
  tests/test_cli_new_commands.py
25
+ tests/test_nvml.py
24
26
  tests/test_occupancy.py
25
27
  tests/test_peaks.py
26
- tests/test_roofline.py
28
+ tests/test_roofline.py
29
+ tests/test_tensor_peaks.py
@@ -24,6 +24,13 @@ def test_unknown_but_supported_ids_get_generic_names(fake_driver):
24
24
  assert result["attribute_150"] == 7
25
25
 
26
26
 
27
def test_cuda12_range_names(fake_driver):
    """Attributes from the CUDA 12 range are reported under real names."""
    queried = attrs.query_all(fake_driver, fake_driver.device(0))
    assert queried["numa_id"] == -1
    assert queried["gpu_pci_device_id"] == 0x1EB810DE
32
+
33
+
27
34
  def test_device_metadata(fake_driver):
28
35
  dev = fake_driver.device(0)
29
36
  assert dev.name == "NVIDIA GeForce RTX 3090"
@@ -20,6 +20,13 @@ def test_roofline_from_device(patched_driver, capsys):
20
20
  assert "*" in out # the chart got drawn
21
21
 
22
22
 
23
def test_roofline_tensor_roof(patched_driver, capsys):
    """``roofline --tensor`` succeeds and reports the fp16 tensor roof."""
    exit_code = cli.main(["roofline", "--tensor"])
    assert exit_code == 0
    printed = capsys.readouterr().out
    assert "fp16 tensor" in printed
    assert "142.33 TFLOP/s" in printed
28
+
29
+
23
30
  def test_roofline_manual_peaks_need_no_device(monkeypatch, capsys):
24
31
  from kernelmeter.cudadrv import CudaNotAvailableError
25
32
 
@@ -0,0 +1,91 @@
1
+ import time
2
+
3
+ import pytest
4
+
5
+ from kernelmeter import nvml
6
+
7
+ NVML_SUCCESS = 0
8
+
9
+
10
class FakeNvmlLib:
    """In-memory stand-in for the libnvidia-ml entry points the wrapper calls.

    Models a card whose maximum SM clock is 1590 MHz but which reports
    1530 MHz as its current SM clock.
    """

    def __init__(self):
        self.shutdown_called = False

    @staticmethod
    def _emit(ptr, value):
        # Write through the out-parameter and report success. `ptr._obj` is
        # presumably the object wrapped by ctypes.byref() - confirm against
        # the wrapper.
        ptr._obj.value = value
        return NVML_SUCCESS

    def nvmlInit_v2(self):
        return NVML_SUCCESS

    def nvmlShutdown(self):
        self.shutdown_called = True
        return NVML_SUCCESS

    def nvmlDeviceGetHandleByIndex_v2(self, index, ptr):
        return self._emit(ptr, 42)

    def nvmlDeviceGetClockInfo(self, handle, clock_type, ptr):
        current = 1530 if clock_type == nvml.NVML_CLOCK_SM else 4985
        return self._emit(ptr, current)

    def nvmlDeviceGetMaxClockInfo(self, handle, clock_type, ptr):
        ceiling = 1590 if clock_type == nvml.NVML_CLOCK_SM else 5001
        return self._emit(ptr, ceiling)

    def nvmlDeviceGetTemperature(self, handle, sensor, ptr):
        return self._emit(ptr, 63)

    def nvmlDeviceGetPowerUsage(self, handle, ptr):
        return self._emit(ptr, 45200)  # milliwatts

    def nvmlDeviceGetEnforcedPowerLimit(self, handle, ptr):
        return self._emit(ptr, 70000)
47
+
48
+
49
def test_wrapper_reads_values():
    """Every accessor returns what the fake library wrote for it."""
    wrapper = nvml.Nvml(lib=FakeNvmlLib())
    handle = wrapper.device(0)
    assert wrapper.sm_clock_mhz(handle) == 1530
    assert wrapper.max_sm_clock_mhz(handle) == 1590
    assert wrapper.mem_clock_mhz(handle) == 4985
    assert wrapper.temperature_c(handle) == 63
    # power readings come back as floats, hence the approximate comparison
    assert wrapper.power_w(handle) == pytest.approx(45.2)
    assert wrapper.power_limit_w(handle) == pytest.approx(70.0)
58
+
59
+
60
def test_error_code_raises():
    """A non-success return code from the library raises ``NvmlError``."""

    class Broken(FakeNvmlLib):
        def nvmlDeviceGetTemperature(self, handle, sensor, ptr):
            return 999

    wrapper = nvml.Nvml(lib=Broken())
    with pytest.raises(nvml.NvmlError):
        wrapper.temperature_c(wrapper.device(0))
68
+
69
+
70
def test_summarize_samples():
    """Clocks and power are averaged, temperature is the maximum."""
    readings = {
        "sm": [1500, 1560],
        "mem": [4985, 4985],
        "temp": [60, 63],
        "power": [44.0, 46.0],
    }
    summary = nvml.summarize_samples(max_sm=1590, max_mem=5001, **readings)
    assert summary.sm_clock_mhz == pytest.approx(1530)
    assert summary.temperature_c == 63
    assert summary.power_w == pytest.approx(45.0)
    assert summary.sm_clock_fraction == pytest.approx(1530 / 1590)
    assert summary.mem_clock_fraction == pytest.approx(4985 / 5001)
80
+
81
+
82
def test_monitor_collects_while_running():
    """A started monitor yields telemetry and shuts the library down on close."""
    fake_lib = FakeNvmlLib()
    monitor = nvml.Monitor(nvml=nvml.Nvml(lib=fake_lib), interval_s=0.001)
    monitor.start()
    time.sleep(0.02)
    telemetry = monitor.stop()
    monitor.close()
    assert telemetry.sm_clock_mhz == pytest.approx(1530)
    assert telemetry.max_sm_clock_mhz == 1590
    assert fake_lib.shutdown_called
@@ -0,0 +1,50 @@
1
+ import pytest
2
+
3
+ from kernelmeter import attrs, peaks
4
+
5
+
6
def test_t4_fp16_tensor():
    """The derived T4 fp16 peak matches the 65 TFLOPS spec-sheet figure."""
    # T4: 40 SMs, 1590 MHz, CC 7.5
    derived = peaks.fp16_tensor_tflops(40, 1_590_000, 7, 5)
    assert derived == pytest.approx(65.1, rel=0.01)
10
+
11
+
12
def test_a100_fp16_and_tf32():
    """The derived A100 peaks are 312 TFLOPS fp16 and 156 TFLOPS tf32."""
    # A100: 108 SMs, 1410 MHz, CC 8.0
    sm_count, clock_khz = 108, 1_410_000
    fp16 = peaks.fp16_tensor_tflops(sm_count, clock_khz, 8, 0)
    assert fp16 == pytest.approx(312, rel=0.01)
    tf32 = peaks.tf32_tensor_tflops(sm_count, clock_khz, 8, 0)
    assert tf32 == pytest.approx(156, rel=0.01)
16
+
17
+
18
def test_h100_fp16_tensor():
    """The derived H100 SXM dense fp16 peak is 989 TFLOPS."""
    # H100 SXM: 132 SMs, 1830 MHz, CC 9.0
    derived = peaks.fp16_tensor_tflops(132, 1_830_000, 9, 0)
    assert derived == pytest.approx(989, rel=0.01)
21
+
22
+
23
def test_pre_tensor_core_cards_return_none():
    """Compute capabilities missing from a rate table produce ``None``."""
    cc_6_1_fp16 = peaks.fp16_tensor_tflops(20, 1_700_000, 6, 1)
    assert cc_6_1_fp16 is None
    # Turing has no tf32
    cc_7_5_tf32 = peaks.tf32_tensor_tflops(40, 1_590_000, 7, 5)
    assert cc_7_5_tf32 is None
26
+
27
+
28
def test_derive_includes_tensor_peaks(fake_driver):
    """``derive`` fills in both tensor peaks from the queried attributes."""
    # fake is a 3090: CC 8.6, 82 SMs, 1695 MHz -> 1024 flops/sm/clk = 142 TF
    device = fake_driver.device(0)
    derived = peaks.derive(attrs.query_all(fake_driver, device))
    assert derived.fp16_tensor_tflops == pytest.approx(142.3, rel=0.01)
    assert derived.tf32_tensor_tflops == pytest.approx(35.6, rel=0.01)
34
+
35
+
36
def test_sustained_peaks_scaling():
    """Peaks scale by the ratio of held clocks to maximum clocks."""
    from kernelmeter import bench
    from kernelmeter.nvml import Telemetry

    theoretical = peaks.Peaks(
        mem_bandwidth_gbs=320.0, fp32_tflops=8.0, compute_capability=(7, 5),
    )
    held = Telemetry(
        sm_clock_mhz=1431,
        mem_clock_mhz=4500,
        max_sm_clock_mhz=1590,
        max_mem_clock_mhz=5000,
        temperature_c=70,
        power_w=60.0,
    )

    sustained = bench.sustained_peaks(theoretical, held)
    assert sustained.fp32_tflops == pytest.approx(8.0 * 1431 / 1590)
    assert sustained.mem_bandwidth_gbs == pytest.approx(320.0 * 0.9)

    # with an override the override gets scaled instead
    overridden = bench.sustained_peaks(theoretical, held, peak_tflops_override=65.0)
    assert overridden.fp32_tflops == pytest.approx(65.0 * 1431 / 1590)
File without changes
File without changes