kernelmeter 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 nuemaan
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,309 @@
1
+ Metadata-Version: 2.4
2
+ Name: kernelmeter
3
+ Version: 0.2.0
4
+ Summary: Query every CUDA device attribute without profiling a kernel, and benchmark your kernels against the hardware's speed of light.
5
+ Author: nuemaan
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/nuemaan/kernelmeter
8
+ Project-URL: Issues, https://github.com/nuemaan/kernelmeter/issues
9
+ Keywords: cuda,gpu,benchmark,profiling,device-attributes,speed-of-light
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Topic :: Software Development :: Debuggers
15
+ Classifier: Topic :: System :: Hardware
16
+ Requires-Python: >=3.9
17
+ Description-Content-Type: text/markdown
18
+ License-File: LICENSE
19
+ Provides-Extra: bench
20
+ Requires-Dist: torch; extra == "bench"
21
+ Provides-Extra: dev
22
+ Requires-Dist: pytest>=7; extra == "dev"
23
+ Dynamic: license-file
24
+
25
+ # kernelmeter
26
+
27
+ Small tools for one question: **is my GPU kernel actually good, and if
28
+ not, what exactly is holding it back?** All in one package with zero
29
+ required dependencies.
30
+
31
+ * `kernelmeter info` prints every device attribute your GPU driver knows,
32
+ plus the card's theoretical peak bandwidth and FP32 throughput. No CUDA
33
+ toolkit, no torch, no kernel launch. It reads straight from `libcuda`,
34
+ which is part of the NVIDIA driver.
35
+ * `kernelmeter bench` times your kernel, checks the output against a
36
+ reference, and scores it against the roofline: the best your card could
37
+ possibly do for that kernel's mix of math and memory traffic. 240 GB/s
38
+ means nothing on its own; "76% of attainable" tells you how much room
39
+ is left.
40
+ * `kernelmeter roofline` draws your card's roofline in the terminal and
41
+ shows where a kernel sits on it.
42
+ * `kernelmeter occupancy` answers "why is my occupancy 50%?" from block
43
+ size, registers and shared memory, and shows which block sizes fix it.
44
+ * `kernelmeter ceiling` measures what the card *really* delivers
45
+ (STREAM-style bandwidth tests plus a big FP32 matmul), because spec
46
+ sheet numbers are never fully reachable.
47
+
48
+ ## Install
49
+
50
+ ```bash
51
+ pip install kernelmeter # info only, no dependencies
52
+ pip install "kernelmeter[bench]" # adds torch for the bench harness
53
+ ```
54
+
55
+ Or from source:
56
+
57
+ ```bash
58
+ git clone https://github.com/nuemaan/kernelmeter
59
+ cd kernelmeter
60
+ pip install -e ".[bench]"
61
+ ```
62
+
63
+ ## Querying your GPU
64
+
65
+ ```bash
66
+ kernelmeter info
67
+ ```
68
+
69
+ Output from a Tesla T4:
70
+
71
+ ```text
72
+ CUDA driver version : 13.0
73
+
74
+ Device 0: Tesla T4 (14.6 GiB)
75
+ compute capability : 7.5
76
+ theoretical mem bandwidth : 320.1 GB/s
77
+ theoretical FP32 peak : 8.14 TFLOP/s
78
+
79
+ attribute value
80
+ ------------------------------------------------ ------------
81
+ max_threads_per_block 1024
82
+ max_block_dim_x 1024
83
+ max_shared_memory_per_block 49152
84
+ warp_size 32
85
+ clock_rate_khz 1590000
86
+ ... (147 attributes total)
87
+ ```
88
+
89
+ These are the same values Nsight Compute shows as `device__attribute_*`,
90
+ except you don't need to profile a kernel to see them. Add `--json` for
91
+ machine-readable output.
92
+
93
+ Every attribute id is probed against the live driver, so the output always
94
+ matches the machine you run it on. Ids newer than the bundled name table
95
+ still show up, just under a generic `attribute_<id>` name.
96
+
97
+ ## Benchmarking a kernel
98
+
99
+ Three steps.
100
+
101
+ **1. Write your kernel in a file and decorate it.** Anything callable from
102
+ Python works: Triton kernels, custom CUDA extensions, `torch.compile`
103
+ output, CuPy. Here is a complete file you can copy:
104
+
105
+ ```python
106
+ # mybench.py
107
+ import torch
108
+ import kernelmeter as km
109
+
110
+ N = 1 << 26 # work on big inputs so you measure memory, not cache
111
+
112
+ def make_args():
113
+ return (torch.randn(N, device="cuda"), torch.randn(N, device="cuda"))
114
+
115
+ @km.benchmark(
116
+ "my_add",
117
+ args=make_args, # builds fresh inputs for the run
118
+ ref=torch.add, # trusted implementation to compare with
119
+ bytes_per_call=lambda x, y: 3 * x.numel() * x.element_size(),
120
+ )
121
+ def my_add(x, y):
122
+ return x + y # <- replace with your kernel
123
+ ```
124
+
125
+ `bytes_per_call` is the number of bytes the algorithm has to move (here: read
126
+ x, read y, write the result). The tool divides it by the measured time to get
127
+ your effective bandwidth.
128
+
129
+ **2. Run it.**
130
+
131
+ ```bash
132
+ kernelmeter bench mybench.py
133
+ ```
134
+
135
+ **3. Read the result.** From a T4, with the add written as a Triton kernel:
136
+
137
+ ```text
138
+ kernel median ms GB/s TFLOP/s bound %roof vs ref correct
139
+ ------------------------------------------------------------------------------------------
140
+ my_add 3.3393 241.2 - mem 75.3% 1.01x PASS
141
+ ```
142
+
143
+ * **correct** - your output matched the reference. If this says FAIL,
144
+ nothing else on the line matters.
145
+ * **bound** - whether the memory system (`mem`) or the ALUs (`comp`) limit
146
+ this kernel, decided by its arithmetic intensity (flops per byte).
147
+ * **%roof** - how close you are to the best this card could possibly do
148
+ for that intensity. This is the score to improve. Above ~80% there is
149
+ little left to win.
150
+ * **vs ref** - speedup over the reference implementation.
151
+
152
+ Pass `flops_per_call` too and the roofline model places your kernel
153
+ precisely; pass `peak_tflops=...` if your kernel runs on tensor cores so
154
+ it gets judged against the right ceiling. Raw `%peak bw` and `%fp32`
155
+ numbers are always in the `--json` output.
156
+
157
+ Timing uses CUDA events with warmup, and the L2 cache is flushed between
158
+ iterations so small workloads can't fake huge bandwidth numbers from
159
+ cache hits. Pass `--no-flush-l2` if you want cache-hot numbers.
160
+
161
+ The [examples](examples/) folder has ready-to-run starting points: two
162
+ Triton kernels (vector add, fused softmax) and a compute-bound matmul.
163
+
164
+ ## Seeing the roofline
165
+
166
+ ```bash
167
+ kernelmeter roofline --ai 0.33 # mark a kernel at 0.33 flop/byte
168
+ ```
169
+
170
+ ```text
171
+ peak bandwidth : 320.0 GB/s
172
+ peak compute : 8.14 TFLOP/s
173
+ ridge point : 25.4 flop/byte
174
+
175
+ 8.14 TF/s | **x*****************
176
+ | ***
177
+ | ****
178
+ | ***
179
+ | ****
180
+ | ****
181
+ | ***
182
+ | ****
183
+ | ***
184
+ | *o**
185
+ | ****
186
+ |**
187
+ +----------------------------------------------------------
188
+ 2^-3 2^0 2^3 2^6
189
+
190
+ at 0.33 flop/byte the kernel is memory-bound; attainable: 0.11 TFLOP/s
191
+ ```
192
+
193
+ The `o` is your kernel, the `x` is the ridge point. Left of the ridge,
194
+ more FLOPs are free: the memory traffic is the bill you are paying
195
+ anyway. That is the whole argument for kernel fusion, in one picture.
196
+ No GPU around? `--peak-bw` and `--peak-tflops` let you draw any card.
197
+
198
+ ## Why is my occupancy low?
199
+
200
+ Feed it what `ptxas -v` or Nsight Compute tells you about your kernel:
201
+
202
+ ```bash
203
+ kernelmeter occupancy --block 256 --regs 64 --smem 8192 --cc 8.6
204
+ ```
205
+
206
+ ```text
207
+ occupancy for compute capability 8.6
208
+ block=256 regs/thread=64 smem/block=8192
209
+
210
+ occupancy : 66.7% (32/48 warps per SM)
211
+ blocks per SM: 4
212
+ limited by : registers
213
+
214
+ block size 64 128 192 256 384 512 768 1024
215
+ occupancy 46% 67% 62% 67% 50% 67% 50% 67%
216
+ ```
217
+
218
+ It names the resource that is capping you and sweeps block sizes so you
219
+ can see if a different launch shape helps. Works with no GPU present:
220
+ pass `--cc` for any architecture from 7.0 (Volta) to 12.x (Blackwell).
221
+
222
+ ## What can the card really do?
223
+
224
+ Theoretical peaks assume the max boost clock, which the card cannot hold under sustained load.
225
+ Measure the real ceilings once and judge your kernels against those:
226
+
227
+ ```bash
228
+ kernelmeter ceiling
229
+ ```
230
+
231
+ This runs the four STREAM kernels (copy, scale, add, triad) and a large
232
+ FP32 matmul with TF32 disabled. On the same T4:
233
+
234
+ ```text
235
+ test median ms GB/s TFLOP/s % of theoretical
236
+ ---------------------------------------------------------------
237
+ copy 1.1495 233.5 - 73.0%
238
+ scale 1.1674 230.0 - 71.8%
239
+ add 1.6903 238.2 - 74.4%
240
+ triad 1.6878 238.6 - 74.5%
241
+ fp32 matmul 3.5563 - 4.83 59.3%
242
+
243
+ measured bandwidth ceiling: 238.6 GB/s (use this as the honest 100%
244
+ for memory-bound kernels)
245
+ ```
246
+
247
+ This reframes the bench results above: the vector add that scored "75.3%
248
+ of theoretical" was moving 241 GB/s on a card whose memory system tops
249
+ out at 238.6 GB/s in practice. It was already saturated. Without the
250
+ measured ceiling you would have kept optimizing a finished kernel.
251
+
252
+ ## Catching regressions
253
+
254
+ ```bash
255
+ kernelmeter bench mykernels.py --save baseline.json
256
+ # ...edit your kernels...
257
+ kernelmeter bench mykernels.py --compare baseline.json
258
+ ```
259
+
260
+ The compare run prints a delta column per kernel and exits non-zero if
261
+ anything got more than 5% slower, so it slots straight into CI.
262
+
263
+ ## A workflow that works
264
+
265
+ If you are learning CUDA (say, working through the PMPP book) and wondering
266
+ whether your kernels are any good:
267
+
268
+ 1. Run `kernelmeter info` and `kernelmeter ceiling` once. Now you know
269
+ your card's real limits.
270
+ 2. Benchmark your kernel with `bytes_per_call` and `flops_per_call` set.
271
+ The `bound` column tells you which resource you are fighting.
272
+ 3. `%roof` under ~60%? If the kernel is memory-bound, check `occupancy`
273
+ first: too few warps in flight cannot hide memory latency. Then open
274
+ Nsight Compute. Now you know what you are looking for, instead of
275
+ staring at forty unfamiliar counters.
276
+ 4. `%roof` above ~80%? Stop optimizing this kernel. The next win is
277
+ algorithmic (fuse it with a neighbor, move less data), and the
278
+ roofline chart shows why: left of the ridge, FLOPs are free.
279
+
280
+ ## Caveats
281
+
282
+ * Theoretical peaks are computed from the max boost clock the driver
283
+ reports. Sustained clocks under load are lower; `kernelmeter ceiling`
284
+ measures what you can actually reach.
285
+ * The derived compute peak is for plain FP32 on CUDA cores. For
286
+ tensor-core kernels pass `peak_tflops=...` to the benchmark decorator
287
+ so the roofline uses the right roof.
288
+ * The occupancy command implements the standard calculator model. Real
289
+ occupancy can differ (launch bounds, driver decisions); confirm with
290
+ Nsight Compute when it matters.
291
+ * Attribute names above id 121 are best-effort against the CUDA 12.x
292
+ headers. Values are always read live from your driver. PRs that extend
293
+ the name table are welcome.
294
+
295
+ ## Development
296
+
297
+ ```bash
298
+ pip install -e ".[dev]"
299
+ pytest
300
+ ```
301
+
302
+ The tests fake the driver, so they run anywhere; no GPU is needed. CI runs
303
+ them on plain GitHub runners. For an end-to-end check on a real GPU there
304
+ is a [Modal](https://modal.com) script: `modal run scripts/modal_gpu_test.py`.
305
+ The numbers in this README come from that script on a T4.
306
+
307
+ ## License
308
+
309
+ MIT
@@ -0,0 +1,285 @@
1
+ # kernelmeter
2
+
3
+ Small tools for one question: **is my GPU kernel actually good, and if
4
+ not, what exactly is holding it back?** All in one package with zero
5
+ required dependencies.
6
+
7
+ * `kernelmeter info` prints every device attribute your GPU driver knows,
8
+ plus the card's theoretical peak bandwidth and FP32 throughput. No CUDA
9
+ toolkit, no torch, no kernel launch. It reads straight from `libcuda`,
10
+ which is part of the NVIDIA driver.
11
+ * `kernelmeter bench` times your kernel, checks the output against a
12
+ reference, and scores it against the roofline: the best your card could
13
+ possibly do for that kernel's mix of math and memory traffic. 240 GB/s
14
+ means nothing on its own; "76% of attainable" tells you how much room
15
+ is left.
16
+ * `kernelmeter roofline` draws your card's roofline in the terminal and
17
+ shows where a kernel sits on it.
18
+ * `kernelmeter occupancy` answers "why is my occupancy 50%?" from block
19
+ size, registers and shared memory, and shows which block sizes fix it.
20
+ * `kernelmeter ceiling` measures what the card *really* delivers
21
+ (STREAM-style bandwidth tests plus a big FP32 matmul), because spec
22
+ sheet numbers are never fully reachable.
23
+
24
+ ## Install
25
+
26
+ ```bash
27
+ pip install kernelmeter # info only, no dependencies
28
+ pip install "kernelmeter[bench]" # adds torch for the bench harness
29
+ ```
30
+
31
+ Or from source:
32
+
33
+ ```bash
34
+ git clone https://github.com/nuemaan/kernelmeter
35
+ cd kernelmeter
36
+ pip install -e ".[bench]"
37
+ ```
38
+
39
+ ## Querying your GPU
40
+
41
+ ```bash
42
+ kernelmeter info
43
+ ```
44
+
45
+ Output from a Tesla T4:
46
+
47
+ ```text
48
+ CUDA driver version : 13.0
49
+
50
+ Device 0: Tesla T4 (14.6 GiB)
51
+ compute capability : 7.5
52
+ theoretical mem bandwidth : 320.1 GB/s
53
+ theoretical FP32 peak : 8.14 TFLOP/s
54
+
55
+ attribute value
56
+ ------------------------------------------------ ------------
57
+ max_threads_per_block 1024
58
+ max_block_dim_x 1024
59
+ max_shared_memory_per_block 49152
60
+ warp_size 32
61
+ clock_rate_khz 1590000
62
+ ... (147 attributes total)
63
+ ```
64
+
65
+ These are the same values Nsight Compute shows as `device__attribute_*`,
66
+ except you don't need to profile a kernel to see them. Add `--json` for
67
+ machine-readable output.
68
+
69
+ Every attribute id is probed against the live driver, so the output always
70
+ matches the machine you run it on. Ids newer than the bundled name table
71
+ still show up, just under a generic `attribute_<id>` name.
72
+
73
+ ## Benchmarking a kernel
74
+
75
+ Three steps.
76
+
77
+ **1. Write your kernel in a file and decorate it.** Anything callable from
78
+ Python works: Triton kernels, custom CUDA extensions, `torch.compile`
79
+ output, CuPy. Here is a complete file you can copy:
80
+
81
+ ```python
82
+ # mybench.py
83
+ import torch
84
+ import kernelmeter as km
85
+
86
+ N = 1 << 26 # work on big inputs so you measure memory, not cache
87
+
88
+ def make_args():
89
+ return (torch.randn(N, device="cuda"), torch.randn(N, device="cuda"))
90
+
91
+ @km.benchmark(
92
+ "my_add",
93
+ args=make_args, # builds fresh inputs for the run
94
+ ref=torch.add, # trusted implementation to compare with
95
+ bytes_per_call=lambda x, y: 3 * x.numel() * x.element_size(),
96
+ )
97
+ def my_add(x, y):
98
+ return x + y # <- replace with your kernel
99
+ ```
100
+
101
+ `bytes_per_call` is the number of bytes the algorithm has to move (here: read
101
+ x, read y, write the result). The tool divides it by the measured time to get
102
+ your effective bandwidth.
104
+
105
+ **2. Run it.**
106
+
107
+ ```bash
108
+ kernelmeter bench mybench.py
109
+ ```
110
+
111
+ **3. Read the result.** From a T4, with the add written as a Triton kernel:
112
+
113
+ ```text
114
+ kernel median ms GB/s TFLOP/s bound %roof vs ref correct
115
+ ------------------------------------------------------------------------------------------
116
+ my_add 3.3393 241.2 - mem 75.3% 1.01x PASS
117
+ ```
118
+
119
+ * **correct** - your output matched the reference. If this says FAIL,
120
+ nothing else on the line matters.
121
+ * **bound** - whether the memory system (`mem`) or the ALUs (`comp`) limit
122
+ this kernel, decided by its arithmetic intensity (flops per byte).
123
+ * **%roof** - how close you are to the best this card could possibly do
124
+ for that intensity. This is the score to improve. Above ~80% there is
125
+ little left to win.
126
+ * **vs ref** - speedup over the reference implementation.
127
+
128
+ Pass `flops_per_call` too and the roofline model places your kernel
129
+ precisely; pass `peak_tflops=...` if your kernel runs on tensor cores so
130
+ it gets judged against the right ceiling. Raw `%peak bw` and `%fp32`
131
+ numbers are always in the `--json` output.
132
+
133
+ Timing uses CUDA events with warmup, and the L2 cache is flushed between
134
+ iterations so small workloads can't fake huge bandwidth numbers from
135
+ cache hits. Pass `--no-flush-l2` if you want cache-hot numbers.
136
+
137
+ The [examples](examples/) folder has ready-to-run starting points: two
138
+ Triton kernels (vector add, fused softmax) and a compute-bound matmul.
139
+
140
+ ## Seeing the roofline
141
+
142
+ ```bash
143
+ kernelmeter roofline --ai 0.33 # mark a kernel at 0.33 flop/byte
144
+ ```
145
+
146
+ ```text
147
+ peak bandwidth : 320.0 GB/s
148
+ peak compute : 8.14 TFLOP/s
149
+ ridge point : 25.4 flop/byte
150
+
151
+ 8.14 TF/s | **x*****************
152
+ | ***
153
+ | ****
154
+ | ***
155
+ | ****
156
+ | ****
157
+ | ***
158
+ | ****
159
+ | ***
160
+ | *o**
161
+ | ****
162
+ |**
163
+ +----------------------------------------------------------
164
+ 2^-3 2^0 2^3 2^6
165
+
166
+ at 0.33 flop/byte the kernel is memory-bound; attainable: 0.11 TFLOP/s
167
+ ```
168
+
169
+ The `o` is your kernel, the `x` is the ridge point. Left of the ridge,
170
+ more FLOPs are free: the memory traffic is the bill you are paying
171
+ anyway. That is the whole argument for kernel fusion, in one picture.
172
+ No GPU around? `--peak-bw` and `--peak-tflops` let you draw any card.
173
+
174
+ ## Why is my occupancy low?
175
+
176
+ Feed it what `ptxas -v` or Nsight Compute tells you about your kernel:
177
+
178
+ ```bash
179
+ kernelmeter occupancy --block 256 --regs 64 --smem 8192 --cc 8.6
180
+ ```
181
+
182
+ ```text
183
+ occupancy for compute capability 8.6
184
+ block=256 regs/thread=64 smem/block=8192
185
+
186
+ occupancy : 66.7% (32/48 warps per SM)
187
+ blocks per SM: 4
188
+ limited by : registers
189
+
190
+ block size 64 128 192 256 384 512 768 1024
191
+ occupancy 46% 67% 62% 67% 50% 67% 50% 67%
192
+ ```
193
+
194
+ It names the resource that is capping you and sweeps block sizes so you
195
+ can see if a different launch shape helps. Works with no GPU present:
196
+ pass `--cc` for any architecture from 7.0 (Volta) to 12.x (Blackwell).
197
+
198
+ ## What can the card really do?
199
+
200
+ Theoretical peaks assume the max boost clock, which the card cannot hold under sustained load.
201
+ Measure the real ceilings once and judge your kernels against those:
202
+
203
+ ```bash
204
+ kernelmeter ceiling
205
+ ```
206
+
207
+ This runs the four STREAM kernels (copy, scale, add, triad) and a large
208
+ FP32 matmul with TF32 disabled. On the same T4:
209
+
210
+ ```text
211
+ test median ms GB/s TFLOP/s % of theoretical
212
+ ---------------------------------------------------------------
213
+ copy 1.1495 233.5 - 73.0%
214
+ scale 1.1674 230.0 - 71.8%
215
+ add 1.6903 238.2 - 74.4%
216
+ triad 1.6878 238.6 - 74.5%
217
+ fp32 matmul 3.5563 - 4.83 59.3%
218
+
219
+ measured bandwidth ceiling: 238.6 GB/s (use this as the honest 100%
220
+ for memory-bound kernels)
221
+ ```
222
+
223
+ This reframes the bench results above: the vector add that scored "75.3%
224
+ of theoretical" was moving 241 GB/s on a card whose memory system tops
225
+ out at 238.6 GB/s in practice. It was already saturated. Without the
226
+ measured ceiling you would have kept optimizing a finished kernel.
227
+
228
+ ## Catching regressions
229
+
230
+ ```bash
231
+ kernelmeter bench mykernels.py --save baseline.json
232
+ # ...edit your kernels...
233
+ kernelmeter bench mykernels.py --compare baseline.json
234
+ ```
235
+
236
+ The compare run prints a delta column per kernel and exits non-zero if
237
+ anything got more than 5% slower, so it slots straight into CI.
238
+
239
+ ## A workflow that works
240
+
241
+ If you are learning CUDA (say, working through the PMPP book) and wondering
242
+ whether your kernels are any good:
243
+
244
+ 1. Run `kernelmeter info` and `kernelmeter ceiling` once. Now you know
245
+ your card's real limits.
246
+ 2. Benchmark your kernel with `bytes_per_call` and `flops_per_call` set.
247
+ The `bound` column tells you which resource you are fighting.
248
+ 3. `%roof` under ~60%? If the kernel is memory-bound, check `occupancy`
249
+ first: too few warps in flight cannot hide memory latency. Then open
250
+ Nsight Compute. Now you know what you are looking for, instead of
251
+ staring at forty unfamiliar counters.
252
+ 4. `%roof` above ~80%? Stop optimizing this kernel. The next win is
253
+ algorithmic (fuse it with a neighbor, move less data), and the
254
+ roofline chart shows why: left of the ridge, FLOPs are free.
255
+
256
+ ## Caveats
257
+
258
+ * Theoretical peaks are computed from the max boost clock the driver
259
+ reports. Sustained clocks under load are lower; `kernelmeter ceiling`
260
+ measures what you can actually reach.
261
+ * The derived compute peak is for plain FP32 on CUDA cores. For
262
+ tensor-core kernels pass `peak_tflops=...` to the benchmark decorator
263
+ so the roofline uses the right roof.
264
+ * The occupancy command implements the standard calculator model. Real
265
+ occupancy can differ (launch bounds, driver decisions); confirm with
266
+ Nsight Compute when it matters.
267
+ * Attribute names above id 121 are best-effort against the CUDA 12.x
268
+ headers. Values are always read live from your driver. PRs that extend
269
+ the name table are welcome.
270
+
271
+ ## Development
272
+
273
+ ```bash
274
+ pip install -e ".[dev]"
275
+ pytest
276
+ ```
277
+
278
+ The tests fake the driver, so they run anywhere; no GPU is needed. CI runs
279
+ them on plain GitHub runners. For an end-to-end check on a real GPU there
280
+ is a [Modal](https://modal.com) script: `modal run scripts/modal_gpu_test.py`.
281
+ The numbers in this README come from that script on a T4.
282
+
283
+ ## License
284
+
285
+ MIT
@@ -0,0 +1,39 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "kernelmeter"
7
+ version = "0.2.0"
8
+ description = "Query every CUDA device attribute without profiling a kernel, and benchmark your kernels against the hardware's speed of light."
9
+ readme = "README.md"
10
+ license = { text = "MIT" }
11
+ authors = [{ name = "nuemaan" }]
12
+ requires-python = ">=3.9"
13
+ classifiers = [
14
+ "Development Status :: 4 - Beta",
15
+ "Intended Audience :: Developers",
16
+ "License :: OSI Approved :: MIT License",
17
+ "Programming Language :: Python :: 3",
18
+ "Topic :: Software Development :: Debuggers",
19
+ "Topic :: System :: Hardware",
20
+ ]
21
+ keywords = ["cuda", "gpu", "benchmark", "profiling", "device-attributes", "speed-of-light"]
22
+
23
+ [project.optional-dependencies]
24
+ # The bench harness times kernels with CUDA events via torch.
25
+ bench = ["torch"]
26
+ dev = ["pytest>=7"]
27
+
28
+ [project.scripts]
29
+ kernelmeter = "kernelmeter.cli:main"
30
+
31
+ [project.urls]
32
+ Homepage = "https://github.com/nuemaan/kernelmeter"
33
+ Issues = "https://github.com/nuemaan/kernelmeter/issues"
34
+
35
+ [tool.setuptools.packages.find]
36
+ where = ["src"]
37
+
38
+ [tool.pytest.ini_options]
39
+ testpaths = ["tests"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,28 @@
1
+ """kernelmeter: CUDA device attributes without profiling, and kernel
2
+ benchmarks measured against the hardware's speed of light."""
3
+
4
+ from .bench import REGISTRY, BenchResult, BenchSpec, benchmark, device_peaks, run, run_registry
5
+ from .cudadrv import CudaDriverError, CudaNotAvailableError, Driver
6
+ from . import occupancy, roofline
7
+ from .occupancy import Occupancy
8
+ from .peaks import Peaks
9
+
10
+ __version__ = "0.2.0"
11
+
12
+ __all__ = [
13
+ "BenchResult",
14
+ "BenchSpec",
15
+ "CudaDriverError",
16
+ "CudaNotAvailableError",
17
+ "Driver",
18
+ "Occupancy",
19
+ "Peaks",
20
+ "REGISTRY",
21
+ "benchmark",
22
+ "device_peaks",
23
+ "occupancy",
24
+ "roofline",
25
+ "run",
26
+ "run_registry",
27
+ "__version__",
28
+ ]