PyPI - kernelmeter - Versions diffs - 0.2.0__py3-none-any.whl - Mend

kernelmeter 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

kernelmeter/__init__.py +28 -0
kernelmeter/attrs.py +149 -0
kernelmeter/bench.py +292 -0
kernelmeter/ceiling.py +111 -0
kernelmeter/cli.py +304 -0
kernelmeter/cudadrv.py +129 -0
kernelmeter/occupancy.py +102 -0
kernelmeter/peaks.py +83 -0
kernelmeter/roofline.py +89 -0
kernelmeter-0.2.0.dist-info/METADATA +309 -0
kernelmeter-0.2.0.dist-info/RECORD +15 -0
kernelmeter-0.2.0.dist-info/WHEEL +5 -0
kernelmeter-0.2.0.dist-info/entry_points.txt +2 -0
kernelmeter-0.2.0.dist-info/licenses/LICENSE +21 -0
kernelmeter-0.2.0.dist-info/top_level.txt +1 -0

kernelmeter/roofline.py ADDED Viewed

@@ -0,0 +1,89 @@
+"""Roofline model: how fast a kernel *can* go at a given arithmetic intensity.
+A kernel that does f FLOPs while moving b bytes has intensity f/b. Below
+the ridge point the memory system is the limit, above it the ALUs are.
+The attainable ceiling at intensity I is min(peak_flops, I * peak_bw),
+and a kernel's quality is best judged against that, not against whichever
+single peak happens to look more flattering.
+"""
+from __future__ import annotations
+import math
def intensity(flops: int, nbytes: int) -> float:
    """Return the arithmetic intensity of a kernel in flop/byte.

    Args:
        flops: Floating-point operations performed per call.
        nbytes: Bytes moved per call.

    Returns:
        ``flops / nbytes``. When ``nbytes`` is zero the ratio has no finite
        value, so the limit is returned instead of raising
        ``ZeroDivisionError``: ``inf`` for a kernel that does math without
        moving any bytes, ``0.0`` for one that does no work at all.
    """
    if nbytes == 0:
        # An infinite intensity is never below any finite ridge point, so a
        # kernel with no memory traffic is treated as compute-limited.
        return float("inf") if flops > 0 else 0.0
    return flops / nbytes
def ridge_point(peak_tflops: float, peak_bw_gbs: float) -> float:
    """Intensity (flop/byte) where the memory roof meets the compute roof.

    Args:
        peak_tflops: Compute peak in TFLOP/s.
        peak_bw_gbs: Memory bandwidth peak in GB/s.

    Returns:
        ``1000 * peak_tflops / peak_bw_gbs``; the factor 1000 converts
        TFLOP/s divided by GB/s into flop/byte. A zero bandwidth yields
        ``inf`` instead of raising ``ZeroDivisionError``: with no bandwidth
        the memory roof never reaches the compute roof.
    """
    if peak_bw_gbs == 0:
        return float("inf")
    return 1000.0 * peak_tflops / peak_bw_gbs
def attainable_tflops(ai: float, peak_tflops: float, peak_bw_gbs: float) -> float:
    """Return the roofline ceiling, in TFLOP/s, at arithmetic intensity *ai*.

    The memory roof is ``ai * peak_bw_gbs`` (GFLOP/s) scaled down by 1000 to
    TFLOP/s; the result is whichever of that and ``peak_tflops`` is lower.
    """
    memory_roof = ai * peak_bw_gbs / 1000.0
    if memory_roof < peak_tflops:
        return memory_roof
    return peak_tflops
def bound(ai: float, peak_tflops: float, peak_bw_gbs: float) -> str:
    """Classify a kernel as memory- or compute-limited.

    Returns ``"mem"`` when *ai* lies strictly below the ridge point of the
    given peaks, and ``"comp"`` otherwise (including exactly at the ridge).
    """
    ridge = ridge_point(peak_tflops, peak_bw_gbs)
    if ai < ridge:
        return "mem"
    return "comp"
def render(
    peak_tflops: float,
    peak_bw_gbs: float,
    ai: float | None = None,
    width: int = 58,
    height: int = 12,
) -> list[str]:
    """Draw the roofline as text. Log-log, x from 1/8 to 256 flop/byte.

    The roof is drawn with '*', the ridge point is marked 'x' when it falls
    inside the plotted intensity range, and the optional ai argument puts an
    'o' where the caller's kernel sits.

    Args:
        peak_tflops: Compute peak in TFLOP/s; also labels the top row.
        peak_bw_gbs: Memory bandwidth peak in GB/s.
        ai: Arithmetic intensity (flop/byte) of a kernel to mark. Values
            outside the axis are clamped onto its nearest edge.
        width: Number of plot columns.
        height: Number of plot rows.

    Returns:
        The chart as a list of lines: ``height`` plot rows, the x-axis
        rule, and the tick-label line.

    Raises:
        ValueError: If a peak is not a positive finite number, or if
            ``width`` or ``height`` is below 1.
    """
    if width < 1 or height < 1:
        raise ValueError(
            f"width and height must be at least 1, got width={width}, height={height}"
        )
    for name, peak in (("peak_tflops", peak_tflops), ("peak_bw_gbs", peak_bw_gbs)):
        # Both axes are logarithmic, so a zero, negative or non-finite peak
        # has no position on the chart; fail with a message that names it
        # rather than with a bare "math domain error".
        if not (math.isfinite(peak) and peak > 0):
            raise ValueError(f"{name} must be a positive finite number, got {peak!r}")

    lo, hi = -3.0, 8.0  # log2 of the intensity axis
    # A one-column chart has no horizontal extent; a span of 1 keeps the
    # column <-> intensity mapping from dividing by zero.
    span = max(width - 1, 1)
    ridge = ridge_point(peak_tflops, peak_bw_gbs)

    def col_to_ai(c: int) -> float:
        return 2.0 ** (lo + (hi - lo) * c / span)

    def ai_to_col(a: float) -> int:
        c = round((math.log2(a) - lo) / (hi - lo) * span)
        return min(max(c, 0), width - 1)

    ys = [attainable_tflops(col_to_ai(c), peak_tflops, peak_bw_gbs) for c in range(width)]
    # The y axis runs from the roof at the leftmost column up to the peak.
    ymin, ymax = math.log10(ys[0]), math.log10(peak_tflops)

    def y_to_row(y: float) -> int:
        if ymax == ymin:
            # Flat roof across the whole axis: everything sits on the top row.
            return height - 1
        frac = (math.log10(y) - ymin) / (ymax - ymin)
        return min(max(round(frac * (height - 1)), 0), height - 1)

    grid = [[" "] * width for _ in range(height)]

    def mark(a: float, ch: str) -> None:
        # Row 0 of the grid is the top of the chart, hence the flip.
        y = attainable_tflops(a, peak_tflops, peak_bw_gbs)
        grid[height - 1 - y_to_row(y)][ai_to_col(a)] = ch

    for c, y in enumerate(ys):
        grid[height - 1 - y_to_row(y)][c] = "*"
    # Only mark the ridge when it is actually on the axis. Clamping an
    # off-chart ridge to an edge column would draw 'x' at peak height where
    # the roof has not reached the peak.
    if 2.0**lo <= ridge <= 2.0**hi:
        mark(ridge, "x")
    if ai is not None:
        mark(min(max(ai, 2.0**lo), 2.0**hi), "o")

    top_label = f"{peak_tflops:.2f} TF/s "
    pad = len(top_label)
    lines = []
    for r, row in enumerate(grid):
        label = top_label if r == 0 else " " * pad
        lines.append(label + "|" + "".join(row))
    lines.append(" " * pad + "+" + "-" * width)

    ticks = {ai_to_col(2.0**e): f"2^{e}" for e in (-3, 0, 3, 6)}
    axis = [" "] * (width + 1)
    for col, text in ticks.items():
        for i, ch in enumerate(text):
            if col + i < len(axis):
                axis[col + i] = ch
    lines.append(" " * pad + " " + "".join(axis) + " flop/byte")
    return lines

kernelmeter-0.2.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,309 @@
+Metadata-Version: 2.4
+Name: kernelmeter
+Version: 0.2.0
+Summary: Query every CUDA device attribute without profiling a kernel, and benchmark your kernels against the hardware's speed of light.
+Author: nuemaan
+License: MIT
+Project-URL: Homepage, https://github.com/nuemaan/kernelmeter
+Project-URL: Issues, https://github.com/nuemaan/kernelmeter/issues
+Keywords: cuda,gpu,benchmark,profiling,device-attributes,speed-of-light
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Topic :: Software Development :: Debuggers
+Classifier: Topic :: System :: Hardware
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Provides-Extra: bench
+Requires-Dist: torch; extra == "bench"
+Provides-Extra: dev
+Requires-Dist: pytest>=7; extra == "dev"
+Dynamic: license-file
+# kernelmeter
+Small tools for one question: **is my GPU kernel actually good, and if
+not, what exactly is holding it back?** All in one package with zero
+required dependencies.
+* `kernelmeter info` prints every device attribute your GPU driver knows,
+  plus the card's theoretical peak bandwidth and FP32 throughput. No CUDA
+  toolkit, no torch, no kernel launch. It reads straight from `libcuda`,
+  which is part of the NVIDIA driver.
+* `kernelmeter bench` times your kernel, checks the output against a
+  reference, and scores it against the roofline: the best your card could
+  possibly do for that kernel's mix of math and memory traffic. 240 GB/s
+  means nothing on its own; "76% of attainable" tells you how much room
+  is left.
+* `kernelmeter roofline` draws your card's roofline in the terminal and
+  shows where a kernel sits on it.
+* `kernelmeter occupancy` answers "why is my occupancy 50%?" from block
+  size, registers and shared memory, and shows which block sizes fix it.
+* `kernelmeter ceiling` measures what the card *really* delivers
+  (STREAM-style bandwidth tests plus a big FP32 matmul), because spec
+  sheet numbers are never fully reachable.
+## Install
+```bash
+pip install kernelmeter           # info only, no dependencies
+pip install "kernelmeter[bench]"  # adds torch for the bench harness
+```
+Or from source:
+```bash
+git clone https://github.com/nuemaan/kernelmeter
+cd kernelmeter
+pip install -e ".[bench]"
+```
+## Querying your GPU
+```bash
+kernelmeter info
+```
+Output from a Tesla T4:
+```text
+CUDA driver version : 13.0
+Device 0: Tesla T4 (14.6 GiB)
+  compute capability        : 7.5
+  theoretical mem bandwidth : 320.1 GB/s
+  theoretical FP32 peak     : 8.14 TFLOP/s
+  attribute                                        value
+  ------------------------------------------------ ------------
+  max_threads_per_block                            1024
+  max_block_dim_x                                  1024
+  max_shared_memory_per_block                      49152
+  warp_size                                        32
+  clock_rate_khz                                   1590000
+  ...                                              (147 attributes total)
+```
+These are the same values Nsight Compute shows as `device__attribute_*`,
+except you don't need to profile a kernel to see them. Add `--json` for
+machine-readable output.
+Every attribute id is probed against the live driver, so the output always
+matches the machine you run it on. Ids newer than the bundled name table
+still show up, just under a generic `attribute_<id>` name.
+## Benchmarking a kernel
+Three steps.
+**1. Write your kernel in a file and decorate it.** Anything callable from
+Python works: Triton kernels, custom CUDA extensions, `torch.compile`
+output, CuPy. Here is a complete file you can copy:
+```python
+# mybench.py
+import torch
+import kernelmeter as km
+N = 1 << 26  # work on big inputs so you measure memory, not cache
+def make_args():
+    return (torch.randn(N, device="cuda"), torch.randn(N, device="cuda"))
+@km.benchmark(
+    "my_add",
+    args=make_args,                 # builds fresh inputs for the run
+    ref=torch.add,                  # trusted implementation to compare with
+    bytes_per_call=lambda x, y: 3 * x.numel() * x.element_size(),
+)
+def my_add(x, y):
+    return x + y                    # <- replace with your kernel
+```
+`bytes_per_call` is how much memory the algorithm has to move (here: read
+x, read y, write the result). The tool divides it by measured time to get
+your effective bandwidth.
+**2. Run it.**
+```bash
+kernelmeter bench mybench.py
+```
+**3. Read the result.** From a T4, with the add written as a Triton kernel:
+```text
+kernel                    median ms      GB/s   TFLOP/s  bound    %roof   vs ref  correct
+------------------------------------------------------------------------------------------
+my_add                       3.3393     241.2         -    mem    75.3%    1.01x     PASS
+```
+* **correct** - your output matched the reference. If this says FAIL,
+  nothing else on the line matters.
+* **bound** - whether the memory system (`mem`) or the ALUs (`comp`) limit
+  this kernel, decided by its arithmetic intensity (flops per byte).
+* **%roof** - how close you are to the best this card could possibly do
+  for that intensity. This is the score to improve. Above ~80% there is
+  little left to win.
+* **vs ref** - speedup over the reference implementation.
+Pass `flops_per_call` too and the roofline model places your kernel
+precisely; pass `peak_tflops=...` if your kernel runs on tensor cores so
+it gets judged against the right ceiling. Raw `%peak bw` and `%fp32`
+numbers are always in the `--json` output.
+Timing uses CUDA events with warmup, and the L2 cache is flushed between
+iterations so small workloads can't fake huge bandwidth numbers from
+cache hits. Pass `--no-flush-l2` if you want cache-hot numbers.
+The [examples](examples/) folder has ready-to-run starting points: two
+Triton kernels (vector add, fused softmax) and a compute-bound matmul.
+## Seeing the roofline
+```bash
+kernelmeter roofline --ai 0.33        # mark a kernel at 0.33 flop/byte
+```
+```text
+  peak bandwidth : 320.0 GB/s
+  peak compute   : 8.14 TFLOP/s
+  ridge point    : 25.4 flop/byte
+8.14 TF/s |                                      **x*****************
+          |                                   ***
+          |                               ****
+          |                            ***
+          |                        ****
+          |                    ****
+          |                 ***
+          |             ****
+          |          ***
+          |      *o**
+          |  ****
+          |**
+          +----------------------------------------------------------
+           2^-3            2^0            2^3             2^6
+at 0.33 flop/byte the kernel is memory-bound; attainable: 0.11 TFLOP/s
+```
+The `o` is your kernel, the `x` is the ridge point. Left of the ridge,
+more FLOPs are free: the memory traffic is the bill you are paying
+anyway. That is the whole argument for kernel fusion, in one picture.
+No GPU around? `--peak-bw` and `--peak-tflops` let you draw any card.
+## Why is my occupancy low?
+Feed it what `ptxas -v` or Nsight Compute tells you about your kernel:
+```bash
+kernelmeter occupancy --block 256 --regs 64 --smem 8192 --cc 8.6
+```
+```text
+occupancy for compute capability 8.6
+  block=256 regs/thread=64 smem/block=8192
+  occupancy    : 66.7% (32/48 warps per SM)
+  blocks per SM: 4
+  limited by   : registers
+  block size      64    128    192    256    384    512    768   1024
+  occupancy      46%    67%    62%    67%    50%    67%    50%    67%
+```
+It names the resource that is capping you and sweeps block sizes so you
+can see if a different launch shape helps. Works with no GPU present:
+pass `--cc` for any architecture from 7.0 (Volta) to 12.x (Blackwell).
+## What can the card really do?
+Theoretical peaks assume the max boost clock, which the card cannot hold.
+Measure the real ceilings once and judge your kernels against those:
+```bash
+kernelmeter ceiling
+```
+This runs the four STREAM kernels (copy, scale, add, triad) and a large
+TF32-disabled matmul. On the same T4:
+```text
+test            median ms      GB/s   TFLOP/s  % of theoretical
+---------------------------------------------------------------
+copy               1.1495     233.5         -             73.0%
+scale              1.1674     230.0         -             71.8%
+add                1.6903     238.2         -             74.4%
+triad              1.6878     238.6         -             74.5%
+fp32 matmul        3.5563         -      4.83             59.3%
+measured bandwidth ceiling: 238.6 GB/s (use this as the honest 100%
+for memory-bound kernels)
+```
+This reframes the bench results above: the vector add that scored 75.3%
+in the `%roof` column (which is measured against the theoretical peak)
+was moving 241 GB/s on a card whose memory system tops out at 238.6 GB/s
+in practice. It was already saturated. Without the measured ceiling you
+would have kept optimizing a finished kernel.
+## Catching regressions
+```bash
+kernelmeter bench mykernels.py --save baseline.json
+# ...edit your kernels...
+kernelmeter bench mykernels.py --compare baseline.json
+```
+The compare run prints a delta column per kernel and exits non-zero if
+anything got more than 5% slower, so it slots straight into CI.
+## A workflow that works
+If you are learning CUDA (say, working through the PMPP book) and wondering
+whether your kernels are any good:
+1. Run `kernelmeter info` and `kernelmeter ceiling` once. Now you know
+   your card's real limits.
+2. Benchmark your kernel with `bytes_per_call` and `flops_per_call` set.
+   The `bound` column tells you which resource you are fighting.
+3. `%roof` under ~60%? If the kernel is memory-bound, check `occupancy`
+   first: too few warps in flight cannot hide memory latency. Then open
+   Nsight Compute. Now you know what you are looking for, instead of
+   staring at forty unfamiliar counters.
+4. `%roof` above ~80%? Stop optimizing this kernel. The next win is
+   algorithmic (fuse it with a neighbor, move less data), and the
+   roofline chart shows why: left of the ridge, FLOPs are free.
+## Caveats
+* Theoretical peaks are computed from the max boost clock the driver
+  reports. Sustained clocks under load are lower; `kernelmeter ceiling`
+  measures what you can actually reach.
+* The derived compute peak is for plain FP32 on CUDA cores. For
+  tensor-core kernels pass `peak_tflops=...` to the benchmark decorator
+  so the roofline uses the right roof.
+* The occupancy command implements the standard calculator model. Real
+  occupancy can differ (launch bounds, driver decisions); confirm with
+  Nsight Compute when it matters.
+* Attribute names above id 121 are best-effort against the CUDA 12.x
+  headers. Values are always read live from your driver. PRs that extend
+  the name table are welcome.
+## Development
+```bash
+pip install -e ".[dev]"
+pytest
+```
+The tests fake the driver, so they run anywhere, no GPU needed. CI runs
+them on plain GitHub runners. For an end-to-end check on a real GPU there
+is a [Modal](https://modal.com) script: `modal run scripts/modal_gpu_test.py`.
+The numbers in this README come from that script on a T4.
+## License
+MIT

kernelmeter-0.2.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,15 @@
+kernelmeter/__init__.py,sha256=OpiOgyZfN4cRYJNB_FffmaOWjtaa0F5Fd88UuxxR2Hw,694
+kernelmeter/attrs.py,sha256=_ecmq5ogS56zlq1kdR7lwQQyKiJYyr3719WSNs5QOas,5530
+kernelmeter/bench.py,sha256=i963oZTAi7qitfMnZ_3_v6sD5CJHxogzlqAUxVUJMl4,9526
+kernelmeter/ceiling.py,sha256=82MfdDAvANk3sd4enjJw9Pkz8wVnfPUw6UBJX-bpZYs,3710
+kernelmeter/cli.py,sha256=TlSPGpaZLMOxAf95Or2v7gfjiK3j0ME2uA6enm-qGyU,11812
+kernelmeter/cudadrv.py,sha256=lN2KjBdD1sxrfQTGZ0DKTuuGjyedy5rLo5FJdtIK4vQ,4487
+kernelmeter/occupancy.py,sha256=P6GqCePxRPdEdy0Ndocv1I_yZUAOI4xht-uID3EZwjA,4390
+kernelmeter/peaks.py,sha256=HNf_vTJzDzu593ftSy-cIDRYzC5KpDelG0EIdZTNYn8,3062
+kernelmeter/roofline.py,sha256=wX1-qnoijW9vYe7RBLED_ORpipMvG4GtIIzpaPn5_H4,3132
+kernelmeter-0.2.0.dist-info/licenses/LICENSE,sha256=EASEg34VB7YUVHkVvKfhQaKhmBPZimhHiuK3xKtZeOo,1064
+kernelmeter-0.2.0.dist-info/METADATA,sha256=V7swYSC6tXGqTMazV14LhpGrQUDFXYEAWI1-Rb-4K8Y,11183
+kernelmeter-0.2.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
+kernelmeter-0.2.0.dist-info/entry_points.txt,sha256=3HIIMm9LY52xXVl64iOtmjp4dgsUHYobn3l9dX0uMQ8,53
+kernelmeter-0.2.0.dist-info/top_level.txt,sha256=PKQS7ycXwZD23SKjrKVcKCRk6KIxVGdBOmOgKyhQUyQ,12
+kernelmeter-0.2.0.dist-info/RECORD,,

kernelmeter-0.2.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: setuptools (82.0.1)
+Root-Is-Purelib: true
+Tag: py3-none-any

kernelmeter-0.2.0.dist-info/entry_points.txt ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ [console_scripts]
2	+ kernelmeter = kernelmeter.cli:main

kernelmeter-0.2.0.dist-info/licenses/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 nuemaan
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

kernelmeter-0.2.0.dist-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ kernelmeter