kernelmeter 0.3.1__tar.gz → 0.4.1__tar.gz

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (33)
  1. {kernelmeter-0.3.1/src/kernelmeter.egg-info → kernelmeter-0.4.1}/PKG-INFO +21 -9
  2. {kernelmeter-0.3.1 → kernelmeter-0.4.1}/README.md +20 -8
  3. {kernelmeter-0.3.1 → kernelmeter-0.4.1}/pyproject.toml +1 -1
  4. {kernelmeter-0.3.1 → kernelmeter-0.4.1}/src/kernelmeter/__init__.py +5 -2
  5. {kernelmeter-0.3.1 → kernelmeter-0.4.1}/src/kernelmeter/attrs.py +10 -3
  6. {kernelmeter-0.3.1 → kernelmeter-0.4.1}/src/kernelmeter/cli.py +33 -0
  7. kernelmeter-0.4.1/src/kernelmeter/extras.py +93 -0
  8. {kernelmeter-0.3.1 → kernelmeter-0.4.1}/src/kernelmeter/nvml.py +102 -0
  9. {kernelmeter-0.3.1 → kernelmeter-0.4.1/src/kernelmeter.egg-info}/PKG-INFO +21 -9
  10. {kernelmeter-0.3.1 → kernelmeter-0.4.1}/src/kernelmeter.egg-info/SOURCES.txt +2 -0
  11. {kernelmeter-0.3.1 → kernelmeter-0.4.1}/tests/test_attrs.py +8 -5
  12. {kernelmeter-0.3.1 → kernelmeter-0.4.1}/tests/test_cli.py +40 -0
  13. kernelmeter-0.4.1/tests/test_extras.py +54 -0
  14. {kernelmeter-0.3.1 → kernelmeter-0.4.1}/tests/test_nvml.py +73 -0
  15. {kernelmeter-0.3.1 → kernelmeter-0.4.1}/LICENSE +0 -0
  16. {kernelmeter-0.3.1 → kernelmeter-0.4.1}/setup.cfg +0 -0
  17. {kernelmeter-0.3.1 → kernelmeter-0.4.1}/src/kernelmeter/bench.py +0 -0
  18. {kernelmeter-0.3.1 → kernelmeter-0.4.1}/src/kernelmeter/ceiling.py +0 -0
  19. {kernelmeter-0.3.1 → kernelmeter-0.4.1}/src/kernelmeter/cudadrv.py +0 -0
  20. {kernelmeter-0.3.1 → kernelmeter-0.4.1}/src/kernelmeter/occupancy.py +0 -0
  21. {kernelmeter-0.3.1 → kernelmeter-0.4.1}/src/kernelmeter/peaks.py +0 -0
  22. {kernelmeter-0.3.1 → kernelmeter-0.4.1}/src/kernelmeter/roofline.py +0 -0
  23. {kernelmeter-0.3.1 → kernelmeter-0.4.1}/src/kernelmeter.egg-info/dependency_links.txt +0 -0
  24. {kernelmeter-0.3.1 → kernelmeter-0.4.1}/src/kernelmeter.egg-info/entry_points.txt +0 -0
  25. {kernelmeter-0.3.1 → kernelmeter-0.4.1}/src/kernelmeter.egg-info/requires.txt +0 -0
  26. {kernelmeter-0.3.1 → kernelmeter-0.4.1}/src/kernelmeter.egg-info/top_level.txt +0 -0
  27. {kernelmeter-0.3.1 → kernelmeter-0.4.1}/tests/test_bench_math.py +0 -0
  28. {kernelmeter-0.3.1 → kernelmeter-0.4.1}/tests/test_bench_roofline.py +0 -0
  29. {kernelmeter-0.3.1 → kernelmeter-0.4.1}/tests/test_cli_new_commands.py +0 -0
  30. {kernelmeter-0.3.1 → kernelmeter-0.4.1}/tests/test_occupancy.py +0 -0
  31. {kernelmeter-0.3.1 → kernelmeter-0.4.1}/tests/test_peaks.py +0 -0
  32. {kernelmeter-0.3.1 → kernelmeter-0.4.1}/tests/test_roofline.py +0 -0
  33. {kernelmeter-0.3.1 → kernelmeter-0.4.1}/tests/test_tensor_peaks.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kernelmeter
3
- Version: 0.3.1
3
+ Version: 0.4.1
4
4
  Summary: Query every CUDA device attribute without profiling a kernel, and benchmark your kernels against the hardware's speed of light.
5
5
  Author: nuemaan
6
6
  License: MIT
@@ -82,6 +82,12 @@ Device 0: Tesla T4 (14.6 GiB)
82
82
  compute capability : 7.5
83
83
  theoretical mem bandwidth : 320.1 GB/s
84
84
  theoretical FP32 peak : 8.14 TFLOP/s
85
+ theoretical fp16 tensor : 65.13 TFLOP/s (dense)
86
+ architecture (nvml) : Turing, 2560 CUDA cores
87
+ pcie link (nvml) : gen1/3 x8/16
88
+ memory in use (nvml) : 450 / 15360 MiB
89
+ ecc (nvml) : on
90
+ vbios (nvml) : 90.04.96.00.02
85
91
 
86
92
  attribute value
87
93
  ------------------------------------------------ ------------
@@ -90,16 +96,22 @@ Device 0: Tesla T4 (14.6 GiB)
90
96
  max_shared_memory_per_block 49152
91
97
  warp_size 32
92
98
  clock_rate_khz 1590000
93
- ... (147 attributes total)
99
+ ... (148 attributes total)
94
100
  ```
95
101
 
96
- These are the same values Nsight Compute shows as `device__attribute_*`,
97
- except you don't need to profile a kernel to see them. Add `--json` for
98
- machine-readable output.
99
-
100
- Every attribute id is probed against the live driver, so the output always
101
- matches the machine you run it on. Ids newer than the bundled name table
102
- still show up, just under a generic `attribute_<id>` name.
102
+ The `attribute` table is read straight from the driver via
103
+ `cuDeviceGetAttribute`, the same values Nsight Compute shows as
104
+ `device__attribute_*`, but you don't need to profile a kernel to see them.
105
+ Every id is probed live, so the output matches the machine you run it on;
106
+ ids newer than the bundled name table show up as `attribute_<id>`.
107
+
108
+ The `(nvml)` lines come from a second source: NVML, the library behind
109
+ `nvidia-smi`, also shipped with the driver. They surface facts the driver
110
+ attribute enum doesn't have (architecture name, real CUDA core count,
111
+ PCIe link, live memory use, ECC, VBIOS) and are skipped silently if NVML
112
+ isn't present. (The `gen1/3 x8/16` above is the live link: an idle T4
113
+ drops to a lower PCIe state and ramps up under load.) Add `--json` for
114
+ machine-readable output; the NVML block lands under `devices[].nvml`.
103
115
 
104
116
  ## Benchmarking a kernel
105
117
 
@@ -58,6 +58,12 @@ Device 0: Tesla T4 (14.6 GiB)
58
58
  compute capability : 7.5
59
59
  theoretical mem bandwidth : 320.1 GB/s
60
60
  theoretical FP32 peak : 8.14 TFLOP/s
61
+ theoretical fp16 tensor : 65.13 TFLOP/s (dense)
62
+ architecture (nvml) : Turing, 2560 CUDA cores
63
+ pcie link (nvml) : gen1/3 x8/16
64
+ memory in use (nvml) : 450 / 15360 MiB
65
+ ecc (nvml) : on
66
+ vbios (nvml) : 90.04.96.00.02
61
67
 
62
68
  attribute value
63
69
  ------------------------------------------------ ------------
@@ -66,16 +72,22 @@ Device 0: Tesla T4 (14.6 GiB)
66
72
  max_shared_memory_per_block 49152
67
73
  warp_size 32
68
74
  clock_rate_khz 1590000
69
- ... (147 attributes total)
75
+ ... (148 attributes total)
70
76
  ```
71
77
 
72
- These are the same values Nsight Compute shows as `device__attribute_*`,
73
- except you don't need to profile a kernel to see them. Add `--json` for
74
- machine-readable output.
75
-
76
- Every attribute id is probed against the live driver, so the output always
77
- matches the machine you run it on. Ids newer than the bundled name table
78
- still show up, just under a generic `attribute_<id>` name.
78
+ The `attribute` table is read straight from the driver via
79
+ `cuDeviceGetAttribute`, the same values Nsight Compute shows as
80
+ `device__attribute_*`, but you don't need to profile a kernel to see them.
81
+ Every id is probed live, so the output matches the machine you run it on;
82
+ ids newer than the bundled name table show up as `attribute_<id>`.
83
+
84
+ The `(nvml)` lines come from a second source: NVML, the library behind
85
+ `nvidia-smi`, also shipped with the driver. They surface facts the driver
86
+ attribute enum doesn't have (architecture name, real CUDA core count,
87
+ PCIe link, live memory use, ECC, VBIOS) and are skipped silently if NVML
88
+ isn't present. (The `gen1/3 x8/16` above is the live link: an idle T4
89
+ drops to a lower PCIe state and ramps up under load.) Add `--json` for
90
+ machine-readable output; the NVML block lands under `devices[].nvml`.
79
91
 
80
92
  ## Benchmarking a kernel
81
93
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "kernelmeter"
7
- version = "0.3.1"
7
+ version = "0.4.1"
8
8
  description = "Query every CUDA device attribute without profiling a kernel, and benchmark your kernels against the hardware's speed of light."
9
9
  readme = "README.md"
10
10
  license = { text = "MIT" }
@@ -1,25 +1,28 @@
1
1
  """kernelmeter: CUDA device attributes without profiling, and kernel
2
2
  benchmarks measured against the hardware's speed of light."""
3
3
 
4
+ from . import extras, occupancy, roofline
4
5
  from .bench import REGISTRY, BenchResult, BenchSpec, benchmark, device_peaks, run, run_registry
5
6
  from .cudadrv import CudaDriverError, CudaNotAvailableError, Driver
6
- from . import occupancy, roofline
7
+ from .extras import DeviceExtras
7
8
  from .occupancy import Occupancy
8
9
  from .peaks import Peaks
9
10
 
10
- __version__ = "0.3.1"
11
+ __version__ = "0.4.1"
11
12
 
12
13
  __all__ = [
13
14
  "BenchResult",
14
15
  "BenchSpec",
15
16
  "CudaDriverError",
16
17
  "CudaNotAvailableError",
18
+ "DeviceExtras",
17
19
  "Driver",
18
20
  "Occupancy",
19
21
  "Peaks",
20
22
  "REGISTRY",
21
23
  "benchmark",
22
24
  "device_peaks",
25
+ "extras",
23
26
  "occupancy",
24
27
  "roofline",
25
28
  "run",
@@ -159,9 +159,16 @@ KNOWN_ATTRS: dict[int, str] = {
159
159
  146: "host_alloc_dma_buf_supported",
160
160
  147: "only_partial_host_native_atomic_supported",
161
161
  148: "atomic_reduction_supported",
162
- # 149 is CU_DEVICE_ATTRIBUTE_MAX, a sentinel rather than a real
163
- # attribute, so it stops here. Anything the driver adds beyond this is
164
- # still reported generically as attribute_<id> by the probe below.
162
+ 149: "d3d12_cig_streams_supported",
163
+ 150: "dma_buf_mmap_supported",
164
+ 151: "logical_endpoint_unicast_supported",
165
+ 152: "logical_endpoint_multicast_supported",
166
+ 153: "logical_endpoint_counted_ops_supported",
167
+ 154: "logical_endpoint_unicast_access_on_owner_device_supported",
168
+ # CU_DEVICE_ATTRIBUTE_MAX (155 as of CUDA 13.x) is a sentinel that
169
+ # moves up with each toolkit release, not a real attribute. Names stop
170
+ # at the last defined value; anything newer the driver reports is still
171
+ # surfaced generically as attribute_<id> by the probe below.
165
172
  }
166
173
 
167
174
 
@@ -57,24 +57,55 @@ def _print_live_telemetry(ordinal: int) -> None:
57
57
  # ---------------------------------------------------------------------------
58
58
 
59
59
  def gather_info(driver: Driver) -> dict:
60
+ from . import extras as _extras
61
+
60
62
  major, minor = driver.driver_version()
61
63
  devices = []
62
64
  for ordinal in range(driver.device_count()):
63
65
  dev = driver.device(ordinal)
64
66
  attributes = _attrs.query_all(driver, dev)
65
67
  peaks = _peaks.derive(attributes)
68
+ nvml_extras = _extras.gather(ordinal)
66
69
  devices.append(
67
70
  {
68
71
  "ordinal": ordinal,
69
72
  "name": dev.name,
70
73
  "total_memory_bytes": dev.total_mem_bytes,
71
74
  "derived": peaks.as_dict(),
75
+ "nvml": nvml_extras.as_dict() if nvml_extras else None,
72
76
  "attributes": attributes,
73
77
  }
74
78
  )
75
79
  return {"driver_version": f"{major}.{minor}", "devices": devices}
76
80
 
77
81
 
82
+ def _print_nvml_extras(nvml: dict) -> None:
83
+ """Print the NVML-sourced facts, skipping fields the card didn't report."""
84
+ arch = nvml.get("architecture")
85
+ cores = nvml.get("num_gpu_cores")
86
+ if arch or cores:
87
+ bits = []
88
+ if arch:
89
+ bits.append(arch)
90
+ if cores:
91
+ bits.append(f"{cores} CUDA cores")
92
+ print(" architecture (nvml) : " + ", ".join(bits))
93
+ gen, gen_max = nvml.get("pcie_gen_current"), nvml.get("pcie_gen_max")
94
+ w, w_max = nvml.get("pcie_width_current"), nvml.get("pcie_width_max")
95
+ if gen and w:
96
+ print(f" pcie link (nvml) : gen{gen}/{gen_max} x{w}/{w_max}")
97
+ total, used = nvml.get("memory_total_bytes"), nvml.get("memory_used_bytes")
98
+ if total:
99
+ print(
100
+ f" memory in use (nvml) : {used / 2**20:.0f} / "
101
+ f"{total / 2**20:.0f} MiB"
102
+ )
103
+ if nvml.get("ecc_enabled") is not None:
104
+ print(f" ecc (nvml) : {'on' if nvml['ecc_enabled'] else 'off'}")
105
+ if nvml.get("vbios_version"):
106
+ print(f" vbios (nvml) : {nvml['vbios_version']}")
107
+
108
+
78
109
  def cmd_info(args: argparse.Namespace) -> int:
79
110
  try:
80
111
  driver = Driver()
@@ -111,6 +142,8 @@ def cmd_info(args: argparse.Namespace) -> int:
111
142
  " theoretical tf32 tensor : "
112
143
  + _fmt(derived["theoretical_tf32_tensor_tflops"], " TFLOP/s (dense)", nd=2)
113
144
  )
145
+ if dev.get("nvml"):
146
+ _print_nvml_extras(dev["nvml"])
114
147
  _print_live_telemetry(dev["ordinal"])
115
148
  print(f"\n {'attribute':<48} value")
116
149
  print(f" {'-' * 48} {'-' * 12}")
@@ -0,0 +1,93 @@
1
+ """Device facts from NVML, the second data source the driver attribute
2
+ enum can't give you.
3
+
4
+ ``kernelmeter info`` reports ``cuDeviceGetAttribute`` values. Tools like
5
+ Nsight Compute show more (architecture name, real core count, PCIe link,
6
+ memory breakdown) because they pull from their own device database and
7
+ from NVML. NVML ships with the driver, so this module adds those facts
8
+ without a toolkit -- the same ctypes approach as the rest of kernelmeter.
9
+
10
+ It does not invent ncu-internal metrics (sass_level, ram_type, ...): those
11
+ aren't exposed by either the driver or NVML, so they would have to be
12
+ hardcoded per board and would go stale. Everything here is read live.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ from dataclasses import dataclass
18
+
19
+ from . import nvml as _nvml
20
+
21
+
22
+ @dataclass
23
+ class DeviceExtras:
24
+ architecture: str | None
25
+ brand: str | None
26
+ num_gpu_cores: int | None
27
+ memory_total_bytes: int | None
28
+ memory_used_bytes: int | None
29
+ memory_free_bytes: int | None
30
+ pcie_gen_current: int | None
31
+ pcie_gen_max: int | None
32
+ pcie_width_current: int | None
33
+ pcie_width_max: int | None
34
+ ecc_enabled: bool | None
35
+ vbios_version: str | None
36
+ driver_version: str | None
37
+
38
+ def as_dict(self) -> dict:
39
+ return dict(self.__dict__)
40
+
41
+
42
+ def from_nvml(n: "_nvml.Nvml", index: int = 0) -> DeviceExtras:
43
+ """Build the extras for one device from an open NVML handle. Each
44
+ query is individually tolerant: an unsupported field becomes None
45
+ rather than failing the whole gather."""
46
+ h = n.device(index)
47
+
48
+ def safe(fn, *args):
49
+ try:
50
+ return fn(*args)
51
+ except Exception:
52
+ return None
53
+
54
+ arch_id = safe(n.architecture, h)
55
+ brand_id = safe(n.brand, h)
56
+ mem = safe(n.memory_info, h) or (None, None, None)
57
+ pcie = safe(n.pcie_link, h) or (None, None, None, None)
58
+
59
+ return DeviceExtras(
60
+ architecture=_nvml.ARCH_NAMES.get(arch_id) if arch_id is not None else None,
61
+ brand=_nvml.BRAND_NAMES.get(brand_id) if brand_id is not None else None,
62
+ num_gpu_cores=safe(n.num_gpu_cores, h),
63
+ memory_total_bytes=mem[0],
64
+ memory_free_bytes=mem[1],
65
+ memory_used_bytes=mem[2],
66
+ pcie_gen_current=pcie[0],
67
+ pcie_gen_max=pcie[1],
68
+ pcie_width_current=pcie[2],
69
+ pcie_width_max=pcie[3],
70
+ ecc_enabled=safe(n.ecc_enabled, h),
71
+ vbios_version=safe(n.vbios_version, h),
72
+ driver_version=safe(n.driver_version),
73
+ )
74
+
75
+
76
+ def gather(index: int = 0, nvml_obj: "_nvml.Nvml | None" = None) -> DeviceExtras | None:
77
+ """Open NVML (if not given one), read the extras, clean up. Returns
78
+ None when NVML isn't available so callers can skip the section."""
79
+ owns = nvml_obj is None
80
+ try:
81
+ n = nvml_obj if nvml_obj is not None else _nvml.Nvml()
82
+ except Exception:
83
+ return None
84
+ try:
85
+ return from_nvml(n, index)
86
+ except Exception:
87
+ return None
88
+ finally:
89
+ if owns:
90
+ try:
91
+ n.close()
92
+ except Exception:
93
+ pass
@@ -16,9 +16,40 @@ import threading
16
16
  from dataclasses import dataclass
17
17
 
18
18
  NVML_SUCCESS = 0
19
+ NVML_ERROR_NOT_SUPPORTED = 3
19
20
  NVML_CLOCK_SM = 1
20
21
  NVML_CLOCK_MEM = 2
21
22
  NVML_TEMPERATURE_GPU = 0
23
+ NVML_FEATURE_ENABLED = 1
24
+
25
+ # nvmlDeviceArchitecture_t
26
+ ARCH_NAMES = {
27
+ 2: "Kepler",
28
+ 3: "Maxwell",
29
+ 4: "Pascal",
30
+ 5: "Volta",
31
+ 6: "Turing",
32
+ 7: "Ampere",
33
+ 8: "Ada Lovelace",
34
+ 9: "Hopper",
35
+ 10: "Blackwell",
36
+ }
37
+
38
+ # nvmlBrandType_t (common entries)
39
+ BRAND_NAMES = {
40
+ 0: "Unknown",
41
+ 1: "Quadro",
42
+ 2: "Tesla",
43
+ 3: "NVS",
44
+ 4: "GRID",
45
+ 5: "GeForce",
46
+ 6: "Titan",
47
+ 7: "NVIDIA vApps",
48
+ 8: "NVIDIA vPC",
49
+ 9: "NVIDIA vCS",
50
+ 10: "NVIDIA vWS",
51
+ 11: "NVIDIA Cloud Gaming",
52
+ }
22
53
 
23
54
 
24
55
  class NvmlError(RuntimeError):
@@ -30,6 +61,14 @@ class NvmlNotAvailableError(RuntimeError):
30
61
  pass
31
62
 
32
63
 
64
+ class _Memory(ctypes.Structure):
65
+ _fields_ = [
66
+ ("total", ctypes.c_ulonglong),
67
+ ("free", ctypes.c_ulonglong),
68
+ ("used", ctypes.c_ulonglong),
69
+ ]
70
+
71
+
33
72
  def load_library() -> ctypes.CDLL:
34
73
  if sys.platform == "darwin":
35
74
  raise NvmlNotAvailableError("NVML is not available on macOS")
@@ -73,6 +112,23 @@ class Nvml:
73
112
  self._check(func_name, fn(handle, *args, ctypes.byref(out)))
74
113
  return out.value
75
114
 
115
+ def _uint_query_opt(self, func_name: str, handle, *args) -> int | None:
116
+ """Like _uint_query but returns None when the card doesn't support
117
+ the query (e.g. consumer cards have no ECC), instead of raising."""
118
+ out = ctypes.c_uint(0)
119
+ fn = getattr(self._lib, func_name)
120
+ code = fn(handle, *args, ctypes.byref(out))
121
+ if code == NVML_ERROR_NOT_SUPPORTED:
122
+ return None
123
+ self._check(func_name, code)
124
+ return out.value
125
+
126
+ def _str_query(self, func_name: str, *args, length: int = 96) -> str:
127
+ buf = ctypes.create_string_buffer(length)
128
+ fn = getattr(self._lib, func_name)
129
+ self._check(func_name, fn(*args, buf, ctypes.c_uint(length)))
130
+ return buf.value.decode("utf-8", errors="replace")
131
+
76
132
  def sm_clock_mhz(self, handle) -> int:
77
133
  return self._uint_query("nvmlDeviceGetClockInfo", handle, NVML_CLOCK_SM)
78
134
 
@@ -94,6 +150,52 @@ class Nvml:
94
150
  def power_limit_w(self, handle) -> float:
95
151
  return self._uint_query("nvmlDeviceGetEnforcedPowerLimit", handle) / 1000.0
96
152
 
153
+ # ---- static device facts the driver attribute enum does not expose ----
154
+
155
+ def architecture(self, handle) -> int | None:
156
+ return self._uint_query_opt("nvmlDeviceGetArchitecture", handle)
157
+
158
+ def brand(self, handle) -> int | None:
159
+ return self._uint_query_opt("nvmlDeviceGetBrand", handle)
160
+
161
+ def num_gpu_cores(self, handle) -> int | None:
162
+ # NVML 11.8+. Older drivers don't have it -> AttributeError on the symbol.
163
+ if not hasattr(self._lib, "nvmlDeviceGetNumGpuCores"):
164
+ return None
165
+ return self._uint_query_opt("nvmlDeviceGetNumGpuCores", handle)
166
+
167
+ def memory_info(self, handle) -> tuple[int, int, int]:
168
+ mem = _Memory()
169
+ self._check(
170
+ "nvmlDeviceGetMemoryInfo",
171
+ self._lib.nvmlDeviceGetMemoryInfo(handle, ctypes.byref(mem)),
172
+ )
173
+ return mem.total, mem.free, mem.used
174
+
175
+ def pcie_link(self, handle) -> tuple[int | None, int | None, int | None, int | None]:
176
+ """(current gen, max gen, current width, max width)."""
177
+ return (
178
+ self._uint_query_opt("nvmlDeviceGetCurrPcieLinkGeneration", handle),
179
+ self._uint_query_opt("nvmlDeviceGetMaxPcieLinkGeneration", handle),
180
+ self._uint_query_opt("nvmlDeviceGetCurrPcieLinkWidth", handle),
181
+ self._uint_query_opt("nvmlDeviceGetMaxPcieLinkWidth", handle),
182
+ )
183
+
184
+ def ecc_enabled(self, handle) -> bool | None:
185
+ cur = ctypes.c_uint(0)
186
+ pend = ctypes.c_uint(0)
187
+ code = self._lib.nvmlDeviceGetEccMode(handle, ctypes.byref(cur), ctypes.byref(pend))
188
+ if code == NVML_ERROR_NOT_SUPPORTED:
189
+ return None
190
+ self._check("nvmlDeviceGetEccMode", code)
191
+ return cur.value == NVML_FEATURE_ENABLED
192
+
193
+ def vbios_version(self, handle) -> str:
194
+ return self._str_query("nvmlDeviceGetVbiosVersion", handle)
195
+
196
+ def driver_version(self) -> str:
197
+ return self._str_query("nvmlSystemGetDriverVersion")
198
+
97
199
 
98
200
  @dataclass
99
201
  class Telemetry:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kernelmeter
3
- Version: 0.3.1
3
+ Version: 0.4.1
4
4
  Summary: Query every CUDA device attribute without profiling a kernel, and benchmark your kernels against the hardware's speed of light.
5
5
  Author: nuemaan
6
6
  License: MIT
@@ -82,6 +82,12 @@ Device 0: Tesla T4 (14.6 GiB)
82
82
  compute capability : 7.5
83
83
  theoretical mem bandwidth : 320.1 GB/s
84
84
  theoretical FP32 peak : 8.14 TFLOP/s
85
+ theoretical fp16 tensor : 65.13 TFLOP/s (dense)
86
+ architecture (nvml) : Turing, 2560 CUDA cores
87
+ pcie link (nvml) : gen1/3 x8/16
88
+ memory in use (nvml) : 450 / 15360 MiB
89
+ ecc (nvml) : on
90
+ vbios (nvml) : 90.04.96.00.02
85
91
 
86
92
  attribute value
87
93
  ------------------------------------------------ ------------
@@ -90,16 +96,22 @@ Device 0: Tesla T4 (14.6 GiB)
90
96
  max_shared_memory_per_block 49152
91
97
  warp_size 32
92
98
  clock_rate_khz 1590000
93
- ... (147 attributes total)
99
+ ... (148 attributes total)
94
100
  ```
95
101
 
96
- These are the same values Nsight Compute shows as `device__attribute_*`,
97
- except you don't need to profile a kernel to see them. Add `--json` for
98
- machine-readable output.
99
-
100
- Every attribute id is probed against the live driver, so the output always
101
- matches the machine you run it on. Ids newer than the bundled name table
102
- still show up, just under a generic `attribute_<id>` name.
102
+ The `attribute` table is read straight from the driver via
103
+ `cuDeviceGetAttribute`, the same values Nsight Compute shows as
104
+ `device__attribute_*`, but you don't need to profile a kernel to see them.
105
+ Every id is probed live, so the output matches the machine you run it on;
106
+ ids newer than the bundled name table show up as `attribute_<id>`.
107
+
108
+ The `(nvml)` lines come from a second source: NVML, the library behind
109
+ `nvidia-smi`, also shipped with the driver. They surface facts the driver
110
+ attribute enum doesn't have (architecture name, real CUDA core count,
111
+ PCIe link, live memory use, ECC, VBIOS) and are skipped silently if NVML
112
+ isn't present. (The `gen1/3 x8/16` above is the live link: an idle T4
113
+ drops to a lower PCIe state and ramps up under load.) Add `--json` for
114
+ machine-readable output; the NVML block lands under `devices[].nvml`.
103
115
 
104
116
  ## Benchmarking a kernel
105
117
 
@@ -7,6 +7,7 @@ src/kernelmeter/bench.py
7
7
  src/kernelmeter/ceiling.py
8
8
  src/kernelmeter/cli.py
9
9
  src/kernelmeter/cudadrv.py
10
+ src/kernelmeter/extras.py
10
11
  src/kernelmeter/nvml.py
11
12
  src/kernelmeter/occupancy.py
12
13
  src/kernelmeter/peaks.py
@@ -22,6 +23,7 @@ tests/test_bench_math.py
22
23
  tests/test_bench_roofline.py
23
24
  tests/test_cli.py
24
25
  tests/test_cli_new_commands.py
26
+ tests/test_extras.py
25
27
  tests/test_nvml.py
26
28
  tests/test_occupancy.py
27
29
  tests/test_peaks.py
@@ -20,8 +20,8 @@ def test_unsupported_ids_are_skipped(fake_driver):
20
20
  def test_unknown_but_supported_ids_get_generic_names(fake_driver):
21
21
  dev = fake_driver.device(0)
22
22
  result = attrs.query_all(fake_driver, dev)
23
- # id 155 succeeds in the fake but has no name in our table
24
- assert result["attribute_155"] == 7
23
+ # id 160 succeeds in the fake but has no name in our table
24
+ assert result["attribute_160"] == 7
25
25
 
26
26
 
27
27
  def test_cuda12_range_names(fake_driver):
@@ -29,13 +29,16 @@ def test_cuda12_range_names(fake_driver):
29
29
  result = attrs.query_all(fake_driver, dev)
30
30
  assert result["numa_id"] == -1
31
31
  assert result["gpu_pci_device_id"] == 0x1EB810DE
32
- # last named driver attribute before the CU_DEVICE_ATTRIBUTE_MAX sentinel
33
32
  assert result["atomic_reduction_supported"] == 1
33
+ # a CUDA 13.x attribute past the old 0.3.1 table
34
+ assert result["dma_buf_mmap_supported"] == 1
34
35
 
35
36
 
36
37
  def test_max_sentinel_is_not_named():
37
- # 149 is CU_DEVICE_ATTRIBUTE_MAX, not a real attribute
38
- assert 149 not in attrs.KNOWN_ATTRS
38
+ # 155 is CU_DEVICE_ATTRIBUTE_MAX as of CUDA 13.x, not a real attribute
39
+ assert 155 not in attrs.KNOWN_ATTRS
40
+ # the last real attribute we name
41
+ assert attrs.KNOWN_ATTRS[154] == "logical_endpoint_unicast_access_on_owner_device_supported"
39
42
 
40
43
 
41
44
  def test_device_metadata(fake_driver):
@@ -31,6 +31,46 @@ def test_info_human_readable(patched_driver, capsys):
31
31
  assert "GB/s" in out
32
32
 
33
33
 
34
+ @pytest.fixture
35
+ def patched_nvml(monkeypatch):
36
+ from kernelmeter import extras, nvml
37
+
38
+ from test_nvml import FakeNvmlLib
39
+
40
+ real_nvml = nvml.Nvml # capture before patching to avoid self-recursion
41
+ monkeypatch.setattr(
42
+ extras._nvml, "Nvml", lambda *a, **k: real_nvml(lib=FakeNvmlLib())
43
+ )
44
+
45
+
46
+ def test_info_json_includes_nvml(patched_driver, patched_nvml, capsys):
47
+ assert cli.main(["info", "--json"]) == 0
48
+ dev = json.loads(capsys.readouterr().out)["devices"][0]
49
+ assert dev["nvml"]["architecture"] == "Turing"
50
+ assert dev["nvml"]["num_gpu_cores"] == 2560
51
+ assert dev["nvml"]["pcie_gen_max"] == 3
52
+
53
+
54
+ def test_info_human_shows_nvml(patched_driver, patched_nvml, capsys):
55
+ assert cli.main(["info"]) == 0
56
+ out = capsys.readouterr().out
57
+ assert "Turing" in out
58
+ assert "2560 CUDA cores" in out
59
+ assert "pcie link" in out
60
+
61
+
62
+ def test_info_json_nvml_null_without_nvml(patched_driver, monkeypatch, capsys):
63
+ from kernelmeter import extras, nvml
64
+
65
+ def boom(*_a, **_k):
66
+ raise nvml.NvmlNotAvailableError("no driver")
67
+
68
+ monkeypatch.setattr(extras._nvml, "Nvml", boom)
69
+ assert cli.main(["info", "--json"]) == 0
70
+ dev = json.loads(capsys.readouterr().out)["devices"][0]
71
+ assert dev["nvml"] is None
72
+
73
+
34
74
  def test_info_without_driver(monkeypatch, capsys):
35
75
  from kernelmeter.cudadrv import CudaNotAvailableError
36
76
 
@@ -0,0 +1,54 @@
1
+ from kernelmeter import extras, nvml
2
+
3
+ from test_nvml import FakeNvmlLib
4
+
5
+
6
+ def _fake_nvml():
7
+ return nvml.Nvml(lib=FakeNvmlLib())
8
+
9
+
10
+ def test_from_nvml_builds_extras():
11
+ ex = extras.from_nvml(_fake_nvml(), 0)
12
+ assert ex.architecture == "Turing"
13
+ assert ex.brand == "Tesla"
14
+ assert ex.num_gpu_cores == 2560
15
+ assert ex.memory_total_bytes == 15843721216
16
+ assert ex.pcie_gen_current == 3
17
+ assert ex.pcie_width_max == 16
18
+ assert ex.ecc_enabled is True
19
+ assert ex.vbios_version == "90.04.38.00.03"
20
+ assert ex.driver_version == "535.104.05"
21
+
22
+
23
+ def test_gather_uses_injected_nvml():
24
+ ex = extras.gather(0, nvml_obj=_fake_nvml())
25
+ assert ex is not None
26
+ assert ex.architecture == "Turing"
27
+
28
+
29
+ def test_gather_returns_none_when_nvml_missing(monkeypatch):
30
+ # simulate a machine with no driver: Nvml() construction raises
31
+ def boom(*_a, **_k):
32
+ raise nvml.NvmlNotAvailableError("no driver")
33
+
34
+ monkeypatch.setattr(extras._nvml, "Nvml", boom)
35
+ assert extras.gather(0) is None
36
+
37
+
38
+ def test_individual_field_failure_is_tolerated():
39
+ # a card that doesn't report cores shouldn't sink the whole gather
40
+ class NoCores(FakeNvmlLib):
41
+ def nvmlDeviceGetNumGpuCores(self, handle, ptr):
42
+ return nvml.NVML_ERROR_NOT_SUPPORTED
43
+
44
+ ex = extras.from_nvml(nvml.Nvml(lib=NoCores()), 0)
45
+ assert ex.num_gpu_cores is None
46
+ assert ex.architecture == "Turing" # the rest still came through
47
+
48
+
49
+ def test_as_dict_roundtrips():
50
+ ex = extras.from_nvml(_fake_nvml(), 0)
51
+ d = ex.as_dict()
52
+ assert d["architecture"] == "Turing"
53
+ assert d["num_gpu_cores"] == 2560
54
+ assert set(d) == set(ex.__dict__)
@@ -45,6 +45,54 @@ class FakeNvmlLib:
45
45
  ptr._obj.value = 70000
46
46
  return NVML_SUCCESS
47
47
 
48
+ # static device facts (modelled on a Tesla T4)
49
+ def nvmlDeviceGetArchitecture(self, handle, ptr):
50
+ ptr._obj.value = 6 # Turing
51
+ return NVML_SUCCESS
52
+
53
+ def nvmlDeviceGetBrand(self, handle, ptr):
54
+ ptr._obj.value = 2 # Tesla
55
+ return NVML_SUCCESS
56
+
57
+ def nvmlDeviceGetNumGpuCores(self, handle, ptr):
58
+ ptr._obj.value = 2560
59
+ return NVML_SUCCESS
60
+
61
+ def nvmlDeviceGetMemoryInfo(self, handle, ptr):
62
+ ptr._obj.total = 15843721216
63
+ ptr._obj.free = 15500000000
64
+ ptr._obj.used = 343721216
65
+ return NVML_SUCCESS
66
+
67
+ def nvmlDeviceGetCurrPcieLinkGeneration(self, handle, ptr):
68
+ ptr._obj.value = 3
69
+ return NVML_SUCCESS
70
+
71
+ def nvmlDeviceGetMaxPcieLinkGeneration(self, handle, ptr):
72
+ ptr._obj.value = 3
73
+ return NVML_SUCCESS
74
+
75
+ def nvmlDeviceGetCurrPcieLinkWidth(self, handle, ptr):
76
+ ptr._obj.value = 16
77
+ return NVML_SUCCESS
78
+
79
+ def nvmlDeviceGetMaxPcieLinkWidth(self, handle, ptr):
80
+ ptr._obj.value = 16
81
+ return NVML_SUCCESS
82
+
83
+ def nvmlDeviceGetEccMode(self, handle, cur, pend):
84
+ cur._obj.value = 1 # enabled
85
+ pend._obj.value = 1
86
+ return NVML_SUCCESS
87
+
88
+ def nvmlDeviceGetVbiosVersion(self, handle, buf, length):
89
+ buf.value = b"90.04.38.00.03"
90
+ return NVML_SUCCESS
91
+
92
+ def nvmlSystemGetDriverVersion(self, buf, length):
93
+ buf.value = b"535.104.05"
94
+ return NVML_SUCCESS
95
+
48
96
 
49
97
  def test_wrapper_reads_values():
50
98
  n = nvml.Nvml(lib=FakeNvmlLib())
@@ -67,6 +115,31 @@ def test_error_code_raises():
67
115
  n.temperature_c(n.device(0))
68
116
 
69
117
 
118
+ def test_static_device_facts():
119
+ n = nvml.Nvml(lib=FakeNvmlLib())
120
+ h = n.device(0)
121
+ assert nvml.ARCH_NAMES[n.architecture(h)] == "Turing"
122
+ assert nvml.BRAND_NAMES[n.brand(h)] == "Tesla"
123
+ assert n.num_gpu_cores(h) == 2560
124
+ total, free, used = n.memory_info(h)
125
+ assert total == 15843721216
126
+ assert used == 343721216
127
+ assert n.pcie_link(h) == (3, 3, 16, 16)
128
+ assert n.ecc_enabled(h) is True
129
+ assert n.vbios_version(h) == "90.04.38.00.03"
130
+ assert n.driver_version() == "535.104.05"
131
+
132
+
133
+ def test_unsupported_field_returns_none():
134
+ # consumer cards return NOT_SUPPORTED (3) for ECC
135
+ class NoEcc(FakeNvmlLib):
136
+ def nvmlDeviceGetEccMode(self, handle, cur, pend):
137
+ return nvml.NVML_ERROR_NOT_SUPPORTED
138
+
139
+ n = nvml.Nvml(lib=NoEcc())
140
+ assert n.ecc_enabled(n.device(0)) is None
141
+
142
+
70
143
  def test_summarize_samples():
71
144
  t = nvml.summarize_samples(
72
145
  sm=[1500, 1560], mem=[4985, 4985], temp=[60, 63], power=[44.0, 46.0],
File without changes
File without changes