kernelmeter 0.3.0__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kernelmeter-0.3.0/src/kernelmeter.egg-info → kernelmeter-0.4.0}/PKG-INFO +21 -9
- {kernelmeter-0.3.0 → kernelmeter-0.4.0}/README.md +20 -8
- {kernelmeter-0.3.0 → kernelmeter-0.4.0}/pyproject.toml +1 -1
- {kernelmeter-0.3.0 → kernelmeter-0.4.0}/src/kernelmeter/__init__.py +5 -2
- {kernelmeter-0.3.0 → kernelmeter-0.4.0}/src/kernelmeter/attrs.py +8 -0
- {kernelmeter-0.3.0 → kernelmeter-0.4.0}/src/kernelmeter/cli.py +33 -0
- kernelmeter-0.4.0/src/kernelmeter/extras.py +93 -0
- {kernelmeter-0.3.0 → kernelmeter-0.4.0}/src/kernelmeter/nvml.py +102 -0
- {kernelmeter-0.3.0 → kernelmeter-0.4.0/src/kernelmeter.egg-info}/PKG-INFO +21 -9
- {kernelmeter-0.3.0 → kernelmeter-0.4.0}/src/kernelmeter.egg-info/SOURCES.txt +2 -0
- {kernelmeter-0.3.0 → kernelmeter-0.4.0}/tests/test_attrs.py +9 -2
- {kernelmeter-0.3.0 → kernelmeter-0.4.0}/tests/test_cli.py +40 -0
- kernelmeter-0.4.0/tests/test_extras.py +54 -0
- {kernelmeter-0.3.0 → kernelmeter-0.4.0}/tests/test_nvml.py +73 -0
- {kernelmeter-0.3.0 → kernelmeter-0.4.0}/LICENSE +0 -0
- {kernelmeter-0.3.0 → kernelmeter-0.4.0}/setup.cfg +0 -0
- {kernelmeter-0.3.0 → kernelmeter-0.4.0}/src/kernelmeter/bench.py +0 -0
- {kernelmeter-0.3.0 → kernelmeter-0.4.0}/src/kernelmeter/ceiling.py +0 -0
- {kernelmeter-0.3.0 → kernelmeter-0.4.0}/src/kernelmeter/cudadrv.py +0 -0
- {kernelmeter-0.3.0 → kernelmeter-0.4.0}/src/kernelmeter/occupancy.py +0 -0
- {kernelmeter-0.3.0 → kernelmeter-0.4.0}/src/kernelmeter/peaks.py +0 -0
- {kernelmeter-0.3.0 → kernelmeter-0.4.0}/src/kernelmeter/roofline.py +0 -0
- {kernelmeter-0.3.0 → kernelmeter-0.4.0}/src/kernelmeter.egg-info/dependency_links.txt +0 -0
- {kernelmeter-0.3.0 → kernelmeter-0.4.0}/src/kernelmeter.egg-info/entry_points.txt +0 -0
- {kernelmeter-0.3.0 → kernelmeter-0.4.0}/src/kernelmeter.egg-info/requires.txt +0 -0
- {kernelmeter-0.3.0 → kernelmeter-0.4.0}/src/kernelmeter.egg-info/top_level.txt +0 -0
- {kernelmeter-0.3.0 → kernelmeter-0.4.0}/tests/test_bench_math.py +0 -0
- {kernelmeter-0.3.0 → kernelmeter-0.4.0}/tests/test_bench_roofline.py +0 -0
- {kernelmeter-0.3.0 → kernelmeter-0.4.0}/tests/test_cli_new_commands.py +0 -0
- {kernelmeter-0.3.0 → kernelmeter-0.4.0}/tests/test_occupancy.py +0 -0
- {kernelmeter-0.3.0 → kernelmeter-0.4.0}/tests/test_peaks.py +0 -0
- {kernelmeter-0.3.0 → kernelmeter-0.4.0}/tests/test_roofline.py +0 -0
- {kernelmeter-0.3.0 → kernelmeter-0.4.0}/tests/test_tensor_peaks.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: kernelmeter
|
|
3
|
-
Version: 0.3.0
|
|
3
|
+
Version: 0.4.0
|
|
4
4
|
Summary: Query every CUDA device attribute without profiling a kernel, and benchmark your kernels against the hardware's speed of light.
|
|
5
5
|
Author: nuemaan
|
|
6
6
|
License: MIT
|
|
@@ -82,6 +82,12 @@ Device 0: Tesla T4 (14.6 GiB)
|
|
|
82
82
|
compute capability : 7.5
|
|
83
83
|
theoretical mem bandwidth : 320.1 GB/s
|
|
84
84
|
theoretical FP32 peak : 8.14 TFLOP/s
|
|
85
|
+
theoretical fp16 tensor : 65.13 TFLOP/s (dense)
|
|
86
|
+
architecture (nvml) : Turing, 2560 CUDA cores
|
|
87
|
+
pcie link (nvml) : gen1/3 x8/16
|
|
88
|
+
memory in use (nvml) : 450 / 15360 MiB
|
|
89
|
+
ecc (nvml) : on
|
|
90
|
+
vbios (nvml) : 90.04.96.00.02
|
|
85
91
|
|
|
86
92
|
attribute value
|
|
87
93
|
------------------------------------------------ ------------
|
|
@@ -90,16 +96,22 @@ Device 0: Tesla T4 (14.6 GiB)
|
|
|
90
96
|
max_shared_memory_per_block 49152
|
|
91
97
|
warp_size 32
|
|
92
98
|
clock_rate_khz 1590000
|
|
93
|
-
... (
|
|
99
|
+
... (148 attributes total)
|
|
94
100
|
```
|
|
95
101
|
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
102
|
+
The `attribute` table is read straight from the driver via
|
|
103
|
+
`cuDeviceGetAttribute`, the same values Nsight Compute shows as
|
|
104
|
+
`device__attribute_*`, but you don't need to profile a kernel to see them.
|
|
105
|
+
Every id is probed live, so the output matches the machine you run it on;
|
|
106
|
+
ids newer than the bundled name table show up as `attribute_<id>`.
|
|
107
|
+
|
|
108
|
+
The `(nvml)` lines come from a second source: NVML, the library behind
|
|
109
|
+
`nvidia-smi`, also shipped with the driver. They surface facts the driver
|
|
110
|
+
attribute enum doesn't have (architecture name, real CUDA core count,
|
|
111
|
+
PCIe link, live memory use, ECC, VBIOS) and are skipped silently if NVML
|
|
112
|
+
isn't present. (The `gen1/3 x8/16` above is the live link: an idle T4
|
|
113
|
+
drops to a lower PCIe state and ramps up under load.) Add `--json` for
|
|
114
|
+
machine-readable output; the NVML block lands under `devices[].nvml`.
|
|
103
115
|
|
|
104
116
|
## Benchmarking a kernel
|
|
105
117
|
|
|
@@ -58,6 +58,12 @@ Device 0: Tesla T4 (14.6 GiB)
|
|
|
58
58
|
compute capability : 7.5
|
|
59
59
|
theoretical mem bandwidth : 320.1 GB/s
|
|
60
60
|
theoretical FP32 peak : 8.14 TFLOP/s
|
|
61
|
+
theoretical fp16 tensor : 65.13 TFLOP/s (dense)
|
|
62
|
+
architecture (nvml) : Turing, 2560 CUDA cores
|
|
63
|
+
pcie link (nvml) : gen1/3 x8/16
|
|
64
|
+
memory in use (nvml) : 450 / 15360 MiB
|
|
65
|
+
ecc (nvml) : on
|
|
66
|
+
vbios (nvml) : 90.04.96.00.02
|
|
61
67
|
|
|
62
68
|
attribute value
|
|
63
69
|
------------------------------------------------ ------------
|
|
@@ -66,16 +72,22 @@ Device 0: Tesla T4 (14.6 GiB)
|
|
|
66
72
|
max_shared_memory_per_block 49152
|
|
67
73
|
warp_size 32
|
|
68
74
|
clock_rate_khz 1590000
|
|
69
|
-
... (
|
|
75
|
+
... (148 attributes total)
|
|
70
76
|
```
|
|
71
77
|
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
78
|
+
The `attribute` table is read straight from the driver via
|
|
79
|
+
`cuDeviceGetAttribute`, the same values Nsight Compute shows as
|
|
80
|
+
`device__attribute_*`, but you don't need to profile a kernel to see them.
|
|
81
|
+
Every id is probed live, so the output matches the machine you run it on;
|
|
82
|
+
ids newer than the bundled name table show up as `attribute_<id>`.
|
|
83
|
+
|
|
84
|
+
The `(nvml)` lines come from a second source: NVML, the library behind
|
|
85
|
+
`nvidia-smi`, also shipped with the driver. They surface facts the driver
|
|
86
|
+
attribute enum doesn't have (architecture name, real CUDA core count,
|
|
87
|
+
PCIe link, live memory use, ECC, VBIOS) and are skipped silently if NVML
|
|
88
|
+
isn't present. (The `gen1/3 x8/16` above is the live link: an idle T4
|
|
89
|
+
drops to a lower PCIe state and ramps up under load.) Add `--json` for
|
|
90
|
+
machine-readable output; the NVML block lands under `devices[].nvml`.
|
|
79
91
|
|
|
80
92
|
## Benchmarking a kernel
|
|
81
93
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "kernelmeter"
|
|
7
|
-
version = "0.3.0"
|
|
7
|
+
version = "0.4.0"
|
|
8
8
|
description = "Query every CUDA device attribute without profiling a kernel, and benchmark your kernels against the hardware's speed of light."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = { text = "MIT" }
|
|
@@ -1,25 +1,28 @@
|
|
|
1
1
|
"""kernelmeter: CUDA device attributes without profiling, and kernel
|
|
2
2
|
benchmarks measured against the hardware's speed of light."""
|
|
3
3
|
|
|
4
|
+
from . import extras, occupancy, roofline
|
|
4
5
|
from .bench import REGISTRY, BenchResult, BenchSpec, benchmark, device_peaks, run, run_registry
|
|
5
6
|
from .cudadrv import CudaDriverError, CudaNotAvailableError, Driver
|
|
6
|
-
from . import occupancy, roofline
|
|
7
|
+
from .extras import DeviceExtras
|
|
7
8
|
from .occupancy import Occupancy
|
|
8
9
|
from .peaks import Peaks
|
|
9
10
|
|
|
10
|
-
__version__ = "0.3.0"
|
|
11
|
+
__version__ = "0.4.0"
|
|
11
12
|
|
|
12
13
|
__all__ = [
|
|
13
14
|
"BenchResult",
|
|
14
15
|
"BenchSpec",
|
|
15
16
|
"CudaDriverError",
|
|
16
17
|
"CudaNotAvailableError",
|
|
18
|
+
"DeviceExtras",
|
|
17
19
|
"Driver",
|
|
18
20
|
"Occupancy",
|
|
19
21
|
"Peaks",
|
|
20
22
|
"REGISTRY",
|
|
21
23
|
"benchmark",
|
|
22
24
|
"device_peaks",
|
|
25
|
+
"extras",
|
|
23
26
|
"occupancy",
|
|
24
27
|
"roofline",
|
|
25
28
|
"run",
|
|
@@ -154,6 +154,14 @@ KNOWN_ATTRS: dict[int, str] = {
|
|
|
154
154
|
141: "host_numa_virtual_memory_management_supported",
|
|
155
155
|
142: "host_numa_memory_pools_supported",
|
|
156
156
|
143: "host_numa_multinode_ipc_supported",
|
|
157
|
+
144: "host_memory_pools_supported",
|
|
158
|
+
145: "host_virtual_memory_management_supported",
|
|
159
|
+
146: "host_alloc_dma_buf_supported",
|
|
160
|
+
147: "only_partial_host_native_atomic_supported",
|
|
161
|
+
148: "atomic_reduction_supported",
|
|
162
|
+
# 149 is CU_DEVICE_ATTRIBUTE_MAX, a sentinel rather than a real
|
|
163
|
+
# attribute, so it stops here. Anything the driver adds beyond this is
|
|
164
|
+
# still reported generically as attribute_<id> by the probe below.
|
|
157
165
|
}
|
|
158
166
|
|
|
159
167
|
|
|
@@ -57,24 +57,55 @@ def _print_live_telemetry(ordinal: int) -> None:
|
|
|
57
57
|
# ---------------------------------------------------------------------------
|
|
58
58
|
|
|
59
59
|
def gather_info(driver: Driver) -> dict:
    """Collect the driver version and one report dict per enumerated device.

    Each device record holds the ordinal, name and total memory, the raw
    attribute table from ``_attrs.query_all``, the peaks derived from that
    table, and the NVML extras as a dict (``None`` when ``extras.gather``
    returns nothing for the device).
    """
    from . import extras as _extras

    major, minor = driver.driver_version()

    def _describe(ordinal: int) -> dict:
        dev = driver.device(ordinal)
        attributes = _attrs.query_all(driver, dev)
        derived = _peaks.derive(attributes)
        # NOTE(review): the CUDA ordinal is handed to NVML as its device
        # index; presumably the two enumerations agree on this machine, but
        # that is not established here -- confirm (e.g. under
        # CUDA_VISIBLE_DEVICES).
        nvml_extras = _extras.gather(ordinal)
        return {
            "ordinal": ordinal,
            "name": dev.name,
            "total_memory_bytes": dev.total_mem_bytes,
            "derived": derived.as_dict(),
            "nvml": None if not nvml_extras else nvml_extras.as_dict(),
            "attributes": attributes,
        }

    devices = [_describe(ordinal) for ordinal in range(driver.device_count())]
    return {"driver_version": f"{major}.{minor}", "devices": devices}
|
|
76
80
|
|
|
77
81
|
|
|
82
|
+
def _print_nvml_extras(nvml: dict) -> None:
    """Print the NVML-sourced facts, skipping fields the card didn't report.

    ``nvml`` is the mapping stored under ``devices[].nvml``; any key may be
    missing or ``None``. The four PCIe values are queried one by one, so a
    maximum can be absent while the current value is present; in that case
    only the current value is printed instead of a literal ``None``. The
    memory line needs both the total and the used figure and is skipped
    otherwise, rather than raising on a missing ``memory_used_bytes``.
    """
    arch = nvml.get("architecture")
    cores = nvml.get("num_gpu_cores")
    headline = []
    if arch:
        headline.append(arch)
    if cores:
        headline.append(f"{cores} CUDA cores")
    if headline:
        print(" architecture (nvml) : " + ", ".join(headline))

    gen, gen_max = nvml.get("pcie_gen_current"), nvml.get("pcie_gen_max")
    w, w_max = nvml.get("pcie_width_current"), nvml.get("pcie_width_max")
    if gen and w:
        # Show "current/max" only when the maximum is actually known.
        gen_text = f"gen{gen}/{gen_max}" if gen_max else f"gen{gen}"
        width_text = f"x{w}/{w_max}" if w_max else f"x{w}"
        print(f" pcie link (nvml) : {gen_text} {width_text}")

    total, used = nvml.get("memory_total_bytes"), nvml.get("memory_used_bytes")
    # used == 0 is a legitimate reading, so test it against None explicitly.
    if total and used is not None:
        print(
            f" memory in use (nvml) : {used / 2**20:.0f} / "
            f"{total / 2**20:.0f} MiB"
        )

    ecc = nvml.get("ecc_enabled")
    if ecc is not None:
        print(f" ecc (nvml) : {'on' if ecc else 'off'}")

    vbios = nvml.get("vbios_version")
    if vbios:
        print(f" vbios (nvml) : {vbios}")
|
|
107
|
+
|
|
108
|
+
|
|
78
109
|
def cmd_info(args: argparse.Namespace) -> int:
|
|
79
110
|
try:
|
|
80
111
|
driver = Driver()
|
|
@@ -111,6 +142,8 @@ def cmd_info(args: argparse.Namespace) -> int:
|
|
|
111
142
|
" theoretical tf32 tensor : "
|
|
112
143
|
+ _fmt(derived["theoretical_tf32_tensor_tflops"], " TFLOP/s (dense)", nd=2)
|
|
113
144
|
)
|
|
145
|
+
if dev.get("nvml"):
|
|
146
|
+
_print_nvml_extras(dev["nvml"])
|
|
114
147
|
_print_live_telemetry(dev["ordinal"])
|
|
115
148
|
print(f"\n {'attribute':<48} value")
|
|
116
149
|
print(f" {'-' * 48} {'-' * 12}")
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
"""Device facts from NVML, the second data source the driver attribute
|
|
2
|
+
enum can't give you.
|
|
3
|
+
|
|
4
|
+
``kernelmeter info`` reports ``cuDeviceGetAttribute`` values. Tools like
|
|
5
|
+
Nsight Compute show more (architecture name, real core count, PCIe link,
|
|
6
|
+
memory breakdown) because they pull from their own device database and
|
|
7
|
+
from NVML. NVML ships with the driver, so this module adds those facts
|
|
8
|
+
without a toolkit -- the same ctypes approach as the rest of kernelmeter.
|
|
9
|
+
|
|
10
|
+
It does not invent ncu-internal metrics (sass_level, ram_type, ...): those
|
|
11
|
+
aren't exposed by either the driver or NVML, so they would have to be
|
|
12
|
+
hardcoded per board and would go stale. Everything here is read live.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
from dataclasses import dataclass
|
|
18
|
+
|
|
19
|
+
from . import nvml as _nvml
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class DeviceExtras:
    """Facts about one device gathered from NVML.

    Every field is optional: ``None`` means the value was not obtained.
    """

    # Names resolved from the NVML architecture / brand ids.
    architecture: str | None
    brand: str | None
    num_gpu_cores: int | None
    # Device memory figures.
    memory_total_bytes: int | None
    memory_used_bytes: int | None
    memory_free_bytes: int | None
    # PCIe link: current and maximum generation, then current and maximum width.
    pcie_gen_current: int | None
    pcie_gen_max: int | None
    pcie_width_current: int | None
    pcie_width_max: int | None
    ecc_enabled: bool | None
    vbios_version: str | None
    driver_version: str | None

    def as_dict(self) -> dict:
        """Return a shallow copy of the instance attributes as a plain dict."""
        return vars(self).copy()
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def from_nvml(n: "_nvml.Nvml", index: int = 0) -> DeviceExtras:
    """Read the extras of device ``index`` through an already-open ``Nvml``.

    Each query runs under its own guard: when one raises, only the field(s)
    it feeds become ``None``. Looking up the device handle itself is not
    guarded, so a failure there propagates to the caller.
    """
    handle = n.device(index)

    def attempt(query, *query_args):
        # Degrade any single failing query to None instead of aborting.
        try:
            result = query(*query_args)
        except Exception:
            result = None
        return result

    arch_id = attempt(n.architecture, handle)
    brand_id = attempt(n.brand, handle)

    mem = attempt(n.memory_info, handle)
    if not mem:
        mem = (None, None, None)
    pcie = attempt(n.pcie_link, handle)
    if not pcie:
        pcie = (None, None, None, None)

    # Ids without an entry in the name tables resolve to None as well.
    arch_name = None if arch_id is None else _nvml.ARCH_NAMES.get(arch_id)
    brand_name = None if brand_id is None else _nvml.BRAND_NAMES.get(brand_id)

    cores = attempt(n.num_gpu_cores, handle)
    ecc = attempt(n.ecc_enabled, handle)
    vbios = attempt(n.vbios_version, handle)
    driver = attempt(n.driver_version)

    return DeviceExtras(
        architecture=arch_name,
        brand=brand_name,
        num_gpu_cores=cores,
        # memory_info yields (total, free, used), in that order.
        memory_total_bytes=mem[0],
        memory_free_bytes=mem[1],
        memory_used_bytes=mem[2],
        # pcie_link yields (current gen, max gen, current width, max width).
        pcie_gen_current=pcie[0],
        pcie_gen_max=pcie[1],
        pcie_width_current=pcie[2],
        pcie_width_max=pcie[3],
        ecc_enabled=ecc,
        vbios_version=vbios,
        driver_version=driver,
    )
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def gather(index: int = 0, nvml_obj: "_nvml.Nvml | None" = None) -> DeviceExtras | None:
    """Return the extras for device ``index``, or ``None`` if they can't be read.

    When ``nvml_obj`` is supplied it is used as-is and left open. Otherwise
    an ``Nvml`` is created for this call and closed afterwards. Any exception
    while creating it or while reading the extras results in ``None`` so
    callers can simply skip the section.
    """
    if nvml_obj is not None:
        # Borrowed wrapper: the caller remains responsible for closing it.
        try:
            return from_nvml(nvml_obj, index)
        except Exception:
            return None

    try:
        session = _nvml.Nvml()
    except Exception:
        return None

    try:
        return from_nvml(session, index)
    except Exception:
        return None
    finally:
        # Best-effort cleanup; a failing close must not mask the result.
        try:
            session.close()
        except Exception:
            pass
|
|
@@ -16,9 +16,40 @@ import threading
|
|
|
16
16
|
from dataclasses import dataclass
|
|
17
17
|
|
|
18
18
|
NVML_SUCCESS = 0
|
|
19
|
+
NVML_ERROR_NOT_SUPPORTED = 3
|
|
19
20
|
NVML_CLOCK_SM = 1
|
|
20
21
|
NVML_CLOCK_MEM = 2
|
|
21
22
|
NVML_TEMPERATURE_GPU = 0
|
|
23
|
+
NVML_FEATURE_ENABLED = 1
|
|
24
|
+
|
|
25
|
+
# nvmlDeviceArchitecture_t: architecture id -> architecture name.
ARCH_NAMES = {
    2: "Kepler",
    3: "Maxwell",
    4: "Pascal",
    5: "Volta",
    6: "Turing",
    7: "Ampere",
    8: "Ada Lovelace",
    9: "Hopper",
    10: "Blackwell",
}

# nvmlBrandType_t: brand id -> product-line name. Ids 12-16 are the RTX-era
# and plain "NVIDIA" brands from newer NVML headers; without them such boards
# would have no brand name at all.
BRAND_NAMES = {
    0: "Unknown",
    1: "Quadro",
    2: "Tesla",
    3: "NVS",
    4: "GRID",
    5: "GeForce",
    6: "Titan",
    7: "NVIDIA vApps",
    8: "NVIDIA vPC",
    9: "NVIDIA vCS",
    10: "NVIDIA vWS",
    11: "NVIDIA Cloud Gaming",
    12: "Quadro RTX",
    13: "NVIDIA RTX",
    14: "NVIDIA",
    15: "GeForce RTX",
    16: "Titan RTX",
}
|
|
22
53
|
|
|
23
54
|
|
|
24
55
|
class NvmlError(RuntimeError):
|
|
@@ -30,6 +61,14 @@ class NvmlNotAvailableError(RuntimeError):
|
|
|
30
61
|
pass
|
|
31
62
|
|
|
32
63
|
|
|
64
|
+
class _Memory(ctypes.Structure):
    """Struct passed by reference to ``nvmlDeviceGetMemoryInfo``.

    Three consecutive unsigned 64-bit counters: total, free, used.
    """

    _fields_ = [
        (field, ctypes.c_ulonglong) for field in ("total", "free", "used")
    ]
|
|
70
|
+
|
|
71
|
+
|
|
33
72
|
def load_library() -> ctypes.CDLL:
|
|
34
73
|
if sys.platform == "darwin":
|
|
35
74
|
raise NvmlNotAvailableError("NVML is not available on macOS")
|
|
@@ -73,6 +112,23 @@ class Nvml:
|
|
|
73
112
|
self._check(func_name, fn(handle, *args, ctypes.byref(out)))
|
|
74
113
|
return out.value
|
|
75
114
|
|
|
115
|
+
def _uint_query_opt(self, func_name: str, handle, *args) -> int | None:
    """Run an unsigned-int NVML query that a card may legitimately lack.

    Calls ``func_name`` on the loaded library with ``handle``, any extra
    ``args`` and a trailing out-pointer. A ``NVML_ERROR_NOT_SUPPORTED``
    status yields ``None`` (e.g. consumer cards have no ECC); every other
    status goes through ``self._check`` before the value is returned.
    """
    result = ctypes.c_uint(0)
    status = getattr(self._lib, func_name)(handle, *args, ctypes.byref(result))
    if status != NVML_ERROR_NOT_SUPPORTED:
        self._check(func_name, status)
        return result.value
    return None
|
|
125
|
+
|
|
126
|
+
def _str_query(self, func_name: str, *args, length: int = 96) -> str:
    """Run an NVML query that writes a string into a caller-owned buffer.

    ``func_name`` is called with ``args`` followed by a ``length``-byte
    buffer and that length as an unsigned int. The status is validated by
    ``self._check``; the buffer content is decoded as UTF-8 with undecodable
    bytes replaced.
    """
    scratch = ctypes.create_string_buffer(length)
    status = getattr(self._lib, func_name)(*args, scratch, ctypes.c_uint(length))
    self._check(func_name, status)
    raw = scratch.value
    return raw.decode("utf-8", errors="replace")
|
|
131
|
+
|
|
76
132
|
def sm_clock_mhz(self, handle) -> int:
    """Query ``nvmlDeviceGetClockInfo`` for the ``NVML_CLOCK_SM`` clock of ``handle``."""
    sm_clock = self._uint_query("nvmlDeviceGetClockInfo", handle, NVML_CLOCK_SM)
    return sm_clock
|
|
78
134
|
|
|
@@ -94,6 +150,52 @@ class Nvml:
|
|
|
94
150
|
def power_limit_w(self, handle) -> float:
    """Return the ``nvmlDeviceGetEnforcedPowerLimit`` reading divided by 1000.

    Presumably this converts milliwatts to watts -- confirm against the
    NVML reference.
    """
    raw_limit = self._uint_query("nvmlDeviceGetEnforcedPowerLimit", handle)
    return raw_limit / 1000.0
|
|
96
152
|
|
|
153
|
+
# ---- static device facts the driver attribute enum does not expose ----
|
|
154
|
+
|
|
155
|
+
def architecture(self, handle) -> int | None:
    """Return the raw ``nvmlDeviceGetArchitecture`` id, or ``None`` if the optional query yields none."""
    arch_id = self._uint_query_opt("nvmlDeviceGetArchitecture", handle)
    return arch_id
|
|
157
|
+
|
|
158
|
+
def brand(self, handle) -> int | None:
    """Return the raw ``nvmlDeviceGetBrand`` id, or ``None`` if the optional query yields none."""
    brand_id = self._uint_query_opt("nvmlDeviceGetBrand", handle)
    return brand_id
|
|
160
|
+
|
|
161
|
+
def num_gpu_cores(self, handle) -> int | None:
    """Return the ``nvmlDeviceGetNumGpuCores`` reading, or ``None`` when unavailable.

    ``None`` is returned both when the loaded library has no such symbol
    and when the optional query itself yields none.
    """
    symbol = "nvmlDeviceGetNumGpuCores"
    # NVML 11.8+. Older drivers don't export the symbol, so probe for it
    # before trying to call it.
    if hasattr(self._lib, symbol):
        return self._uint_query_opt(symbol, handle)
    return None
|
|
166
|
+
|
|
167
|
+
def memory_info(self, handle) -> tuple[int, int, int]:
    """Return ``(total, free, used)`` as filled in by ``nvmlDeviceGetMemoryInfo``.

    The status code is validated with ``self._check`` before the struct
    fields are read.
    """
    info = _Memory()
    status = self._lib.nvmlDeviceGetMemoryInfo(handle, ctypes.byref(info))
    self._check("nvmlDeviceGetMemoryInfo", status)
    return (info.total, info.free, info.used)
|
|
174
|
+
|
|
175
|
+
def pcie_link(self, handle) -> tuple[int | None, int | None, int | None, int | None]:
    """(current gen, max gen, current width, max width).

    Each element comes from its own optional query, so any of them can be
    ``None`` independently of the others.
    """
    symbols = (
        "nvmlDeviceGetCurrPcieLinkGeneration",
        "nvmlDeviceGetMaxPcieLinkGeneration",
        "nvmlDeviceGetCurrPcieLinkWidth",
        "nvmlDeviceGetMaxPcieLinkWidth",
    )
    return tuple(self._uint_query_opt(symbol, handle) for symbol in symbols)
|
|
183
|
+
|
|
184
|
+
def ecc_enabled(self, handle) -> bool | None:
    """Report the current ECC mode of ``handle``.

    Returns ``None`` when ``nvmlDeviceGetEccMode`` answers
    ``NVML_ERROR_NOT_SUPPORTED``; otherwise the status is validated with
    ``self._check`` and the current mode is compared against
    ``NVML_FEATURE_ENABLED``. The pending mode is fetched because the call
    takes two out-pointers, but it is not used.
    """
    current = ctypes.c_uint(0)
    pending = ctypes.c_uint(0)
    status = self._lib.nvmlDeviceGetEccMode(
        handle, ctypes.byref(current), ctypes.byref(pending)
    )
    if status != NVML_ERROR_NOT_SUPPORTED:
        self._check("nvmlDeviceGetEccMode", status)
        return current.value == NVML_FEATURE_ENABLED
    return None
|
|
192
|
+
|
|
193
|
+
def vbios_version(self, handle) -> str:
    """Return the string produced by ``nvmlDeviceGetVbiosVersion`` for ``handle``."""
    version = self._str_query("nvmlDeviceGetVbiosVersion", handle)
    return version
|
|
195
|
+
|
|
196
|
+
def driver_version(self) -> str:
    """Return the string produced by ``nvmlSystemGetDriverVersion`` (no device handle involved)."""
    version = self._str_query("nvmlSystemGetDriverVersion")
    return version
|
|
198
|
+
|
|
97
199
|
|
|
98
200
|
@dataclass
|
|
99
201
|
class Telemetry:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: kernelmeter
|
|
3
|
-
Version: 0.3.0
|
|
3
|
+
Version: 0.4.0
|
|
4
4
|
Summary: Query every CUDA device attribute without profiling a kernel, and benchmark your kernels against the hardware's speed of light.
|
|
5
5
|
Author: nuemaan
|
|
6
6
|
License: MIT
|
|
@@ -82,6 +82,12 @@ Device 0: Tesla T4 (14.6 GiB)
|
|
|
82
82
|
compute capability : 7.5
|
|
83
83
|
theoretical mem bandwidth : 320.1 GB/s
|
|
84
84
|
theoretical FP32 peak : 8.14 TFLOP/s
|
|
85
|
+
theoretical fp16 tensor : 65.13 TFLOP/s (dense)
|
|
86
|
+
architecture (nvml) : Turing, 2560 CUDA cores
|
|
87
|
+
pcie link (nvml) : gen1/3 x8/16
|
|
88
|
+
memory in use (nvml) : 450 / 15360 MiB
|
|
89
|
+
ecc (nvml) : on
|
|
90
|
+
vbios (nvml) : 90.04.96.00.02
|
|
85
91
|
|
|
86
92
|
attribute value
|
|
87
93
|
------------------------------------------------ ------------
|
|
@@ -90,16 +96,22 @@ Device 0: Tesla T4 (14.6 GiB)
|
|
|
90
96
|
max_shared_memory_per_block 49152
|
|
91
97
|
warp_size 32
|
|
92
98
|
clock_rate_khz 1590000
|
|
93
|
-
... (
|
|
99
|
+
... (148 attributes total)
|
|
94
100
|
```
|
|
95
101
|
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
102
|
+
The `attribute` table is read straight from the driver via
|
|
103
|
+
`cuDeviceGetAttribute`, the same values Nsight Compute shows as
|
|
104
|
+
`device__attribute_*`, but you don't need to profile a kernel to see them.
|
|
105
|
+
Every id is probed live, so the output matches the machine you run it on;
|
|
106
|
+
ids newer than the bundled name table show up as `attribute_<id>`.
|
|
107
|
+
|
|
108
|
+
The `(nvml)` lines come from a second source: NVML, the library behind
|
|
109
|
+
`nvidia-smi`, also shipped with the driver. They surface facts the driver
|
|
110
|
+
attribute enum doesn't have (architecture name, real CUDA core count,
|
|
111
|
+
PCIe link, live memory use, ECC, VBIOS) and are skipped silently if NVML
|
|
112
|
+
isn't present. (The `gen1/3 x8/16` above is the live link: an idle T4
|
|
113
|
+
drops to a lower PCIe state and ramps up under load.) Add `--json` for
|
|
114
|
+
machine-readable output; the NVML block lands under `devices[].nvml`.
|
|
103
115
|
|
|
104
116
|
## Benchmarking a kernel
|
|
105
117
|
|
|
@@ -7,6 +7,7 @@ src/kernelmeter/bench.py
|
|
|
7
7
|
src/kernelmeter/ceiling.py
|
|
8
8
|
src/kernelmeter/cli.py
|
|
9
9
|
src/kernelmeter/cudadrv.py
|
|
10
|
+
src/kernelmeter/extras.py
|
|
10
11
|
src/kernelmeter/nvml.py
|
|
11
12
|
src/kernelmeter/occupancy.py
|
|
12
13
|
src/kernelmeter/peaks.py
|
|
@@ -22,6 +23,7 @@ tests/test_bench_math.py
|
|
|
22
23
|
tests/test_bench_roofline.py
|
|
23
24
|
tests/test_cli.py
|
|
24
25
|
tests/test_cli_new_commands.py
|
|
26
|
+
tests/test_extras.py
|
|
25
27
|
tests/test_nvml.py
|
|
26
28
|
tests/test_occupancy.py
|
|
27
29
|
tests/test_peaks.py
|
|
@@ -20,8 +20,8 @@ def test_unsupported_ids_are_skipped(fake_driver):
|
|
|
20
20
|
def test_unknown_but_supported_ids_get_generic_names(fake_driver):
|
|
21
21
|
dev = fake_driver.device(0)
|
|
22
22
|
result = attrs.query_all(fake_driver, dev)
|
|
23
|
-
# id
|
|
24
|
-
assert result["
|
|
23
|
+
# id 155 succeeds in the fake but has no name in our table
|
|
24
|
+
assert result["attribute_155"] == 7
|
|
25
25
|
|
|
26
26
|
|
|
27
27
|
def test_cuda12_range_names(fake_driver):
|
|
@@ -29,6 +29,13 @@ def test_cuda12_range_names(fake_driver):
|
|
|
29
29
|
result = attrs.query_all(fake_driver, dev)
|
|
30
30
|
assert result["numa_id"] == -1
|
|
31
31
|
assert result["gpu_pci_device_id"] == 0x1EB810DE
|
|
32
|
+
# last named driver attribute before the CU_DEVICE_ATTRIBUTE_MAX sentinel
|
|
33
|
+
assert result["atomic_reduction_supported"] == 1
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def test_max_sentinel_is_not_named():
    """The name table must not list 149: it is CU_DEVICE_ATTRIBUTE_MAX, not a real attribute."""
    sentinel_id = 149
    assert sentinel_id not in attrs.KNOWN_ATTRS
|
|
32
39
|
|
|
33
40
|
|
|
34
41
|
def test_device_metadata(fake_driver):
|
|
@@ -31,6 +31,46 @@ def test_info_human_readable(patched_driver, capsys):
|
|
|
31
31
|
assert "GB/s" in out
|
|
32
32
|
|
|
33
33
|
|
|
34
|
+
@pytest.fixture
def patched_nvml(monkeypatch):
    """Make ``extras`` construct its ``Nvml`` wrapper on top of ``FakeNvmlLib``."""
    from kernelmeter import extras, nvml

    from test_nvml import FakeNvmlLib

    # Keep a reference to the real class: the replacement installed below
    # would otherwise end up calling itself.
    original_cls = nvml.Nvml

    def _fake_backed(*_args, **_kwargs):
        return original_cls(lib=FakeNvmlLib())

    monkeypatch.setattr(extras._nvml, "Nvml", _fake_backed)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def test_info_json_includes_nvml(patched_driver, patched_nvml, capsys):
    """``info --json`` exposes the NVML facts under ``devices[].nvml``."""
    exit_code = cli.main(["info", "--json"])
    assert exit_code == 0
    payload = json.loads(capsys.readouterr().out)
    first_device = payload["devices"][0]
    assert first_device["nvml"]["architecture"] == "Turing"
    assert first_device["nvml"]["num_gpu_cores"] == 2560
    assert first_device["nvml"]["pcie_gen_max"] == 3
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def test_info_human_shows_nvml(patched_driver, patched_nvml, capsys):
    """The human-readable ``info`` output contains the NVML lines."""
    assert cli.main(["info"]) == 0
    printed = capsys.readouterr().out
    for expected in ("Turing", "2560 CUDA cores", "pcie link"):
        assert expected in printed
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def test_info_json_nvml_null_without_nvml(patched_driver, monkeypatch, capsys):
    """When ``Nvml`` cannot be constructed, ``devices[].nvml`` is JSON null."""
    from kernelmeter import extras, nvml

    def _unavailable(*_args, **_kwargs):
        raise nvml.NvmlNotAvailableError("no driver")

    monkeypatch.setattr(extras._nvml, "Nvml", _unavailable)
    assert cli.main(["info", "--json"]) == 0
    first_device = json.loads(capsys.readouterr().out)["devices"][0]
    assert first_device["nvml"] is None
|
|
72
|
+
|
|
73
|
+
|
|
34
74
|
def test_info_without_driver(monkeypatch, capsys):
|
|
35
75
|
from kernelmeter.cudadrv import CudaNotAvailableError
|
|
36
76
|
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
from kernelmeter import extras, nvml
|
|
2
|
+
|
|
3
|
+
from test_nvml import FakeNvmlLib
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def _fake_nvml():
    """Build an ``Nvml`` wrapper backed by the in-memory ``FakeNvmlLib``."""
    fake_lib = FakeNvmlLib()
    return nvml.Nvml(lib=fake_lib)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def test_from_nvml_builds_extras():
    """Every fact the fake library reports ends up in the matching field.

    The memory triple is checked field by field: NVML orders it
    (total, free, used) while ``DeviceExtras`` declares used before free, so
    a swapped mapping would otherwise go unnoticed.
    """
    ex = extras.from_nvml(_fake_nvml(), 0)
    assert ex.architecture == "Turing"
    assert ex.brand == "Tesla"
    assert ex.num_gpu_cores == 2560
    assert ex.memory_total_bytes == 15843721216
    assert ex.memory_free_bytes == 15500000000
    assert ex.memory_used_bytes == 343721216
    assert ex.pcie_gen_current == 3
    assert ex.pcie_gen_max == 3
    assert ex.pcie_width_current == 16
    assert ex.pcie_width_max == 16
    assert ex.ecc_enabled is True
    assert ex.vbios_version == "90.04.38.00.03"
    assert ex.driver_version == "535.104.05"
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def test_gather_uses_injected_nvml():
    """``gather`` reads through the ``Nvml`` object it is handed."""
    result = extras.gather(0, nvml_obj=_fake_nvml())
    assert result is not None
    assert result.architecture == "Turing"
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def test_gather_returns_none_when_nvml_missing(monkeypatch):
    """``gather`` yields ``None`` when constructing ``Nvml`` raises."""

    # Simulate a machine with no driver.
    def _unavailable(*_args, **_kwargs):
        raise nvml.NvmlNotAvailableError("no driver")

    monkeypatch.setattr(extras._nvml, "Nvml", _unavailable)
    result = extras.gather(0)
    assert result is None
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def test_individual_field_failure_is_tolerated():
    """A card that doesn't report its core count still yields the other fields."""

    class NoCores(FakeNvmlLib):
        def nvmlDeviceGetNumGpuCores(self, handle, ptr):
            return nvml.NVML_ERROR_NOT_SUPPORTED

    wrapper = nvml.Nvml(lib=NoCores())
    ex = extras.from_nvml(wrapper, 0)
    assert ex.num_gpu_cores is None
    # The rest still came through.
    assert ex.architecture == "Turing"
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def test_as_dict_roundtrips():
    """``as_dict`` exposes every field and the dict rebuilds an equal object."""
    ex = extras.from_nvml(_fake_nvml(), 0)
    d = ex.as_dict()
    assert d["architecture"] == "Turing"
    assert d["num_gpu_cores"] == 2560
    assert set(d) == set(ex.__dict__)
    # The actual round trip: dict -> DeviceExtras must compare equal.
    assert extras.DeviceExtras(**d) == ex
|
|
@@ -45,6 +45,54 @@ class FakeNvmlLib:
|
|
|
45
45
|
ptr._obj.value = 70000
|
|
46
46
|
return NVML_SUCCESS
|
|
47
47
|
|
|
48
|
+
# Static device facts (modelled on a Tesla T4). Each stub writes its canned
# value through the out-pointer it is given and reports NVML_SUCCESS.
def nvmlDeviceGetArchitecture(self, handle, ptr):
    """Report architecture id 6 (Turing)."""
    ptr._obj.value = 6
    return NVML_SUCCESS

def nvmlDeviceGetBrand(self, handle, ptr):
    """Report brand id 2 (Tesla)."""
    ptr._obj.value = 2
    return NVML_SUCCESS

def nvmlDeviceGetNumGpuCores(self, handle, ptr):
    """Report 2560 cores."""
    ptr._obj.value = 2560
    return NVML_SUCCESS

def nvmlDeviceGetMemoryInfo(self, handle, ptr):
    """Fill the total / free / used fields of the memory struct."""
    info = ptr._obj
    info.total = 15843721216
    info.free = 15500000000
    info.used = 343721216
    return NVML_SUCCESS

def nvmlDeviceGetCurrPcieLinkGeneration(self, handle, ptr):
    """Report a current PCIe generation of 3."""
    ptr._obj.value = 3
    return NVML_SUCCESS

def nvmlDeviceGetMaxPcieLinkGeneration(self, handle, ptr):
    """Report a maximum PCIe generation of 3."""
    ptr._obj.value = 3
    return NVML_SUCCESS

def nvmlDeviceGetCurrPcieLinkWidth(self, handle, ptr):
    """Report a current PCIe width of 16."""
    ptr._obj.value = 16
    return NVML_SUCCESS

def nvmlDeviceGetMaxPcieLinkWidth(self, handle, ptr):
    """Report a maximum PCIe width of 16."""
    ptr._obj.value = 16
    return NVML_SUCCESS

def nvmlDeviceGetEccMode(self, handle, cur, pend):
    """Report ECC as enabled (1) for both the current and the pending mode."""
    cur._obj.value = 1
    pend._obj.value = 1
    return NVML_SUCCESS

def nvmlDeviceGetVbiosVersion(self, handle, buf, length):
    """Write the canned VBIOS version into the caller's buffer."""
    buf.value = b"90.04.38.00.03"
    return NVML_SUCCESS

def nvmlSystemGetDriverVersion(self, buf, length):
    """Write the canned driver version into the caller's buffer."""
    buf.value = b"535.104.05"
    return NVML_SUCCESS
|
|
95
|
+
|
|
48
96
|
|
|
49
97
|
def test_wrapper_reads_values():
|
|
50
98
|
n = nvml.Nvml(lib=FakeNvmlLib())
|
|
@@ -67,6 +115,31 @@ def test_error_code_raises():
|
|
|
67
115
|
n.temperature_c(n.device(0))
|
|
68
116
|
|
|
69
117
|
|
|
118
|
+
def test_static_device_facts():
    """The wrapper returns each static fact exactly as the fake library reports it."""
    wrapper = nvml.Nvml(lib=FakeNvmlLib())
    handle = wrapper.device(0)
    assert nvml.ARCH_NAMES[wrapper.architecture(handle)] == "Turing"
    assert nvml.BRAND_NAMES[wrapper.brand(handle)] == "Tesla"
    assert wrapper.num_gpu_cores(handle) == 2560
    total, _free, used = wrapper.memory_info(handle)
    assert total == 15843721216
    assert used == 343721216
    assert wrapper.pcie_link(handle) == (3, 3, 16, 16)
    assert wrapper.ecc_enabled(handle) is True
    assert wrapper.vbios_version(handle) == "90.04.38.00.03"
    assert wrapper.driver_version() == "535.104.05"
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def test_unsupported_field_returns_none():
    """A NOT_SUPPORTED (3) status for ECC, as consumer cards return, maps to ``None``."""

    class NoEcc(FakeNvmlLib):
        def nvmlDeviceGetEccMode(self, handle, cur, pend):
            return nvml.NVML_ERROR_NOT_SUPPORTED

    wrapper = nvml.Nvml(lib=NoEcc())
    handle = wrapper.device(0)
    assert wrapper.ecc_enabled(handle) is None
|
|
141
|
+
|
|
142
|
+
|
|
70
143
|
def test_summarize_samples():
|
|
71
144
|
t = nvml.summarize_samples(
|
|
72
145
|
sm=[1500, 1560], mem=[4985, 4985], temp=[60, 63], power=[44.0, 46.0],
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|