kernels 0.13.0__tar.gz → 0.14.0.dev0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kernels-0.13.0 → kernels-0.14.0.dev0}/PKG-INFO +3 -3
- {kernels-0.13.0 → kernels-0.14.0.dev0}/pyproject.toml +9 -8
- {kernels-0.13.0 → kernels-0.14.0.dev0}/src/kernels/__init__.py +2 -0
- kernels-0.14.0.dev0/src/kernels/_versions.py +72 -0
- {kernels-0.13.0 → kernels-0.14.0.dev0}/src/kernels/backends.py +2 -6
- {kernels-0.13.0 → kernels-0.14.0.dev0}/src/kernels/benchmarks/attention.py +11 -33
- {kernels-0.13.0 → kernels-0.14.0.dev0}/src/kernels/benchmarks/layer_norm.py +3 -9
- {kernels-0.13.0 → kernels-0.14.0.dev0}/src/kernels/cli/__init__.py +8 -119
- {kernels-0.13.0 → kernels-0.14.0.dev0}/src/kernels/cli/benchmark.py +8 -23
- {kernels-0.13.0 → kernels-0.14.0.dev0}/src/kernels/cli/benchmark_graphics.py +22 -58
- {kernels-0.13.0 → kernels-0.14.0.dev0}/src/kernels/cli/check.py +7 -1
- {kernels-0.13.0 → kernels-0.14.0.dev0}/src/kernels/cli/versions.py +2 -3
- {kernels-0.13.0 → kernels-0.14.0.dev0}/src/kernels/deps.py +6 -19
- {kernels-0.13.0 → kernels-0.14.0.dev0}/src/kernels/layer/device.py +2 -8
- {kernels-0.13.0 → kernels-0.14.0.dev0}/src/kernels/layer/func.py +5 -10
- {kernels-0.13.0 → kernels-0.14.0.dev0}/src/kernels/layer/globals.py +1 -3
- {kernels-0.13.0 → kernels-0.14.0.dev0}/src/kernels/layer/kernelize.py +4 -12
- {kernels-0.13.0 → kernels-0.14.0.dev0}/src/kernels/layer/layer.py +16 -43
- {kernels-0.13.0 → kernels-0.14.0.dev0}/src/kernels/layer/repos.py +10 -28
- {kernels-0.13.0 → kernels-0.14.0.dev0}/src/kernels/lockfile.py +21 -17
- kernels-0.14.0.dev0/src/kernels/metadata.py +44 -0
- {kernels-0.13.0 → kernels-0.14.0.dev0}/src/kernels/status.py +7 -5
- {kernels-0.13.0 → kernels-0.14.0.dev0}/src/kernels/utils.py +75 -39
- {kernels-0.13.0 → kernels-0.14.0.dev0}/src/kernels/variants.py +19 -53
- {kernels-0.13.0 → kernels-0.14.0.dev0}/src/kernels.egg-info/PKG-INFO +3 -3
- {kernels-0.13.0 → kernels-0.14.0.dev0}/src/kernels.egg-info/SOURCES.txt +0 -4
- {kernels-0.13.0 → kernels-0.14.0.dev0}/src/kernels.egg-info/requires.txt +1 -1
- {kernels-0.13.0 → kernels-0.14.0.dev0}/tests/test_basic.py +22 -14
- {kernels-0.13.0 → kernels-0.14.0.dev0}/tests/test_deps.py +1 -3
- {kernels-0.13.0 → kernels-0.14.0.dev0}/tests/test_doctest.py +1 -3
- {kernels-0.13.0 → kernels-0.14.0.dev0}/tests/test_func.py +1 -1
- {kernels-0.13.0 → kernels-0.14.0.dev0}/tests/test_interval_tree.py +3 -9
- kernels-0.14.0.dev0/tests/test_kernel_locking.py +100 -0
- {kernels-0.13.0 → kernels-0.14.0.dev0}/tests/test_layer.py +27 -160
- {kernels-0.13.0 → kernels-0.14.0.dev0}/tests/test_status.py +4 -8
- {kernels-0.13.0 → kernels-0.14.0.dev0}/tests/test_tvm_ffi.py +1 -1
- kernels-0.13.0/src/kernels/_versions.py +0 -111
- kernels-0.13.0/src/kernels/cli/_vendored/convert_rst_to_mdx.py +0 -751
- kernels-0.13.0/src/kernels/cli/skills.py +0 -113
- kernels-0.13.0/src/kernels/cli/upload.py +0 -163
- kernels-0.13.0/src/kernels/metadata.py +0 -40
- kernels-0.13.0/tests/test_kernel_locking.py +0 -208
- kernels-0.13.0/tests/test_kernel_upload.py +0 -309
- {kernels-0.13.0 → kernels-0.14.0.dev0}/README.md +0 -0
- {kernels-0.13.0 → kernels-0.14.0.dev0}/setup.cfg +0 -0
- {kernels-0.13.0 → kernels-0.14.0.dev0}/src/kernels/_system.py +0 -0
- {kernels-0.13.0 → kernels-0.14.0.dev0}/src/kernels/_windows.py +0 -0
- {kernels-0.13.0 → kernels-0.14.0.dev0}/src/kernels/benchmark.py +0 -0
- {kernels-0.13.0 → kernels-0.14.0.dev0}/src/kernels/benchmarks/__init__.py +0 -0
- {kernels-0.13.0 → kernels-0.14.0.dev0}/src/kernels/benchmarks/activation.py +0 -0
- {kernels-0.13.0 → kernels-0.14.0.dev0}/src/kernels/compat.py +0 -0
- {kernels-0.13.0 → kernels-0.14.0.dev0}/src/kernels/layer/__init__.py +0 -0
- {kernels-0.13.0 → kernels-0.14.0.dev0}/src/kernels/layer/_interval_tree.py +0 -0
- {kernels-0.13.0 → kernels-0.14.0.dev0}/src/kernels/layer/mode.py +0 -0
- {kernels-0.13.0 → kernels-0.14.0.dev0}/src/kernels/python_depends.json +0 -0
- {kernels-0.13.0 → kernels-0.14.0.dev0}/src/kernels.egg-info/dependency_links.txt +0 -0
- {kernels-0.13.0 → kernels-0.14.0.dev0}/src/kernels.egg-info/entry_points.txt +0 -0
- {kernels-0.13.0 → kernels-0.14.0.dev0}/src/kernels.egg-info/top_level.txt +0 -0
- {kernels-0.13.0 → kernels-0.14.0.dev0}/tests/test_benchmarks.py +0 -0
- {kernels-0.13.0 → kernels-0.14.0.dev0}/tests/test_user_agent.py +0 -0
- {kernels-0.13.0 → kernels-0.14.0.dev0}/tests/test_variants.py +0 -0
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: kernels
|
|
3
|
-
Version: 0.13.0
|
|
3
|
+
Version: 0.14.0.dev0
|
|
4
4
|
Summary: Download compute kernels
|
|
5
5
|
Author-email: Daniel de Kok <daniel@huggingface.co>, David Holtz <david@huggingface.co>
|
|
6
6
|
License: Apache-2.0
|
|
7
|
-
Requires-Python: >=3.
|
|
7
|
+
Requires-Python: >=3.10
|
|
8
8
|
Description-Content-Type: text/markdown
|
|
9
|
-
Requires-Dist:
|
|
9
|
+
Requires-Dist: huggingface-hub>=1.10.0
|
|
10
10
|
Requires-Dist: packaging>=20.0
|
|
11
11
|
Requires-Dist: pyyaml>=6
|
|
12
12
|
Requires-Dist: tomli>=2.0; python_version < "3.11"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "kernels"
|
|
3
|
-
version = "0.13.0"
|
|
3
|
+
version = "0.14.0.dev0"
|
|
4
4
|
description = "Download compute kernels"
|
|
5
5
|
authors = [
|
|
6
6
|
{ name = "Daniel de Kok", email = "daniel@huggingface.co" },
|
|
@@ -8,9 +8,9 @@ authors = [
|
|
|
8
8
|
]
|
|
9
9
|
license = { text = "Apache-2.0" }
|
|
10
10
|
readme = "README.md"
|
|
11
|
-
requires-python = ">= 3.
|
|
11
|
+
requires-python = ">= 3.10"
|
|
12
12
|
dependencies = [
|
|
13
|
-
"
|
|
13
|
+
"huggingface-hub>=1.10.0",
|
|
14
14
|
"packaging>=20.0",
|
|
15
15
|
"pyyaml>=6",
|
|
16
16
|
"tomli>=2.0; python_version<'3.11'",
|
|
@@ -56,11 +56,10 @@ kernels = "kernels.cli:main"
|
|
|
56
56
|
[tool.setuptools.package-data]
|
|
57
57
|
kernels = ["python_depends.json"]
|
|
58
58
|
|
|
59
|
-
[tool.isort]
|
|
60
|
-
profile = "black"
|
|
61
|
-
line_length = 119
|
|
62
|
-
|
|
63
59
|
[tool.ruff]
|
|
60
|
+
# If the version is changed, apply the change in the Nix overlay
|
|
61
|
+
# as well.
|
|
62
|
+
required-version = "==0.15.10"
|
|
64
63
|
exclude = [
|
|
65
64
|
".eggs",
|
|
66
65
|
".git",
|
|
@@ -85,4 +84,6 @@ line-length = 119
|
|
|
85
84
|
# Ignored rules:
|
|
86
85
|
# "E501" -> line length violation
|
|
87
86
|
lint.ignore = ["E501"]
|
|
88
|
-
lint.select = ["E", "F", "W"]
|
|
87
|
+
lint.select = ["E", "F", "I", "W"]
|
|
88
|
+
|
|
89
|
+
[tool.ruff.format]
|
|
@@ -23,6 +23,7 @@ from kernels.layer import (
|
|
|
23
23
|
)
|
|
24
24
|
from kernels.utils import (
|
|
25
25
|
get_kernel,
|
|
26
|
+
get_loaded_kernels,
|
|
26
27
|
get_local_kernel,
|
|
27
28
|
get_locked_kernel,
|
|
28
29
|
has_kernel,
|
|
@@ -45,6 +46,7 @@ __all__ = [
|
|
|
45
46
|
"LockedLayerRepository",
|
|
46
47
|
"Mode",
|
|
47
48
|
"get_kernel",
|
|
49
|
+
"get_loaded_kernels",
|
|
48
50
|
"get_local_kernel",
|
|
49
51
|
"get_locked_kernel",
|
|
50
52
|
"has_kernel",
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import warnings
|
|
3
|
+
|
|
4
|
+
from huggingface_hub.hf_api import GitRefInfo
|
|
5
|
+
|
|
6
|
+
logger = logging.getLogger(__name__)
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _get_available_versions(repo_id: str) -> dict[int, GitRefInfo]:
|
|
10
|
+
"""Get kernel versions that are available in the repository."""
|
|
11
|
+
from kernels.utils import _get_hf_api
|
|
12
|
+
|
|
13
|
+
refs = _get_hf_api().list_repo_refs(repo_id=repo_id, repo_type="kernel")
|
|
14
|
+
|
|
15
|
+
versions = {}
|
|
16
|
+
for branch in refs.branches:
|
|
17
|
+
if not branch.name.startswith("v"):
|
|
18
|
+
continue
|
|
19
|
+
try:
|
|
20
|
+
versions[int(branch.name[1:])] = branch
|
|
21
|
+
except ValueError:
|
|
22
|
+
continue
|
|
23
|
+
|
|
24
|
+
return versions
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def resolve_version_spec_as_ref(repo_id: str, version_spec: int) -> GitRefInfo:
|
|
28
|
+
"""
|
|
29
|
+
Get the ref for a kernel with the given version.
|
|
30
|
+
"""
|
|
31
|
+
versions = _get_available_versions(repo_id)
|
|
32
|
+
|
|
33
|
+
ref = versions.get(version_spec, None)
|
|
34
|
+
if ref is None:
|
|
35
|
+
raise ValueError(
|
|
36
|
+
f"Version {version_spec} not found, available versions: {', '.join(sorted(str(v) for v in versions.keys()))}"
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
latest_version = max(versions.keys())
|
|
40
|
+
if version_spec < latest_version:
|
|
41
|
+
logger.warning(
|
|
42
|
+
"You are using version %d of '%s', but version %d is available.",
|
|
43
|
+
version_spec,
|
|
44
|
+
repo_id,
|
|
45
|
+
latest_version,
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
return ref
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def select_revision_or_version(
|
|
52
|
+
repo_id: str,
|
|
53
|
+
*,
|
|
54
|
+
revision: str | None,
|
|
55
|
+
version: int | None,
|
|
56
|
+
) -> str:
|
|
57
|
+
if revision is not None and version is not None:
|
|
58
|
+
raise ValueError("Only one of `revision` or `version` must be specified.")
|
|
59
|
+
|
|
60
|
+
if revision is not None:
|
|
61
|
+
return revision
|
|
62
|
+
elif version is not None:
|
|
63
|
+
return resolve_version_spec_as_ref(repo_id, version).target_commit
|
|
64
|
+
|
|
65
|
+
warnings.warn(
|
|
66
|
+
"Future versions of `kernels` (>=0.15) will require specifying a kernel version or revision. "
|
|
67
|
+
"See: https://huggingface.co/docs/kernels/migration",
|
|
68
|
+
FutureWarning,
|
|
69
|
+
stacklevel=2,
|
|
70
|
+
)
|
|
71
|
+
|
|
72
|
+
return "main"
|
|
@@ -241,9 +241,7 @@ def _select_backend(backend: str | None) -> Backend:
|
|
|
241
241
|
if backend in supported:
|
|
242
242
|
return supported[backend]
|
|
243
243
|
|
|
244
|
-
raise ValueError(
|
|
245
|
-
f"Invalid backend '{backend}', system supported backends: {', '.join(sorted(supported.keys()))}"
|
|
246
|
-
)
|
|
244
|
+
raise ValueError(f"Invalid backend '{backend}', system supported backends: {', '.join(sorted(supported.keys()))}")
|
|
247
245
|
|
|
248
246
|
|
|
249
247
|
def _supported_backends() -> dict[str, Backend]:
|
|
@@ -267,9 +265,7 @@ def _get_cuda() -> Optional[CUDA]:
|
|
|
267
265
|
runtime_version = ctypes.c_int(0)
|
|
268
266
|
result = libcudart.cudaRuntimeGetVersion(ctypes.byref(runtime_version))
|
|
269
267
|
if result != 0:
|
|
270
|
-
warnings.warn(
|
|
271
|
-
"System has CUDA runtime library, but cannot get runtime version."
|
|
272
|
-
)
|
|
268
|
+
warnings.warn("System has CUDA runtime library, but cannot get runtime version.")
|
|
273
269
|
return None
|
|
274
270
|
|
|
275
271
|
# cudaRuntimeGetVersion encodes the version as (major * 1000 + minor * 10).
|
|
@@ -14,9 +14,7 @@ def _reference_attention(query, key, value, causal=False):
|
|
|
14
14
|
"""Reference implementation using PyTorch SDPA."""
|
|
15
15
|
query, key, value = (x.transpose(1, 2).contiguous() for x in (query, key, value))
|
|
16
16
|
with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.MATH):
|
|
17
|
-
out = torch.nn.functional.scaled_dot_product_attention(
|
|
18
|
-
query, key, value, is_causal=causal
|
|
19
|
-
)
|
|
17
|
+
out = torch.nn.functional.scaled_dot_product_attention(query, key, value, is_causal=causal)
|
|
20
18
|
return out.transpose(1, 2).contiguous()
|
|
21
19
|
|
|
22
20
|
|
|
@@ -24,9 +22,7 @@ def _varlen_reference_attention(q, k, v, cu_seqlens_q, cu_seqlens_k, causal=Fals
|
|
|
24
22
|
"""Reference implementation for variable length attention."""
|
|
25
23
|
batch_size = cu_seqlens_q.shape[0] - 1
|
|
26
24
|
total_tokens_q = q.shape[0]
|
|
27
|
-
out = torch.zeros(
|
|
28
|
-
(total_tokens_q, q.shape[1], q.shape[2]), device=q.device, dtype=q.dtype
|
|
29
|
-
)
|
|
25
|
+
out = torch.zeros((total_tokens_q, q.shape[1], q.shape[2]), device=q.device, dtype=q.dtype)
|
|
30
26
|
|
|
31
27
|
for b in range(batch_size):
|
|
32
28
|
start_q, end_q = cu_seqlens_q[b], cu_seqlens_q[b + 1]
|
|
@@ -54,9 +50,7 @@ class FlashAttentionBenchmark(Benchmark):
|
|
|
54
50
|
self.out = torch.empty(B, S, H, D, device="cuda", dtype=torch.float16)
|
|
55
51
|
|
|
56
52
|
def benchmark_small(self):
|
|
57
|
-
self.out = _extract_output(
|
|
58
|
-
self.kernel.flash_attn_func(self.q, self.k, self.v, causal=False)
|
|
59
|
-
)
|
|
53
|
+
self.out = _extract_output(self.kernel.flash_attn_func(self.q, self.k, self.v, causal=False))
|
|
60
54
|
|
|
61
55
|
def verify_small(self) -> torch.Tensor:
|
|
62
56
|
return _reference_attention(self.q, self.k, self.v, causal=False)
|
|
@@ -70,9 +64,7 @@ class FlashAttentionBenchmark(Benchmark):
|
|
|
70
64
|
self.out = torch.empty(B, S, H, D, device="cuda", dtype=torch.float16)
|
|
71
65
|
|
|
72
66
|
def benchmark_medium(self):
|
|
73
|
-
self.out = _extract_output(
|
|
74
|
-
self.kernel.flash_attn_func(self.q, self.k, self.v, causal=False)
|
|
75
|
-
)
|
|
67
|
+
self.out = _extract_output(self.kernel.flash_attn_func(self.q, self.k, self.v, causal=False))
|
|
76
68
|
|
|
77
69
|
def verify_medium(self) -> torch.Tensor:
|
|
78
70
|
return _reference_attention(self.q, self.k, self.v, causal=False)
|
|
@@ -86,9 +78,7 @@ class FlashAttentionBenchmark(Benchmark):
|
|
|
86
78
|
self.out = torch.empty(B, S, H, D, device="cuda", dtype=torch.float16)
|
|
87
79
|
|
|
88
80
|
def benchmark_large(self):
|
|
89
|
-
self.out = _extract_output(
|
|
90
|
-
self.kernel.flash_attn_func(self.q, self.k, self.v, causal=False)
|
|
91
|
-
)
|
|
81
|
+
self.out = _extract_output(self.kernel.flash_attn_func(self.q, self.k, self.v, causal=False))
|
|
92
82
|
|
|
93
83
|
def verify_large(self) -> torch.Tensor:
|
|
94
84
|
return _reference_attention(self.q, self.k, self.v, causal=False)
|
|
@@ -106,9 +96,7 @@ class FlashAttentionCausalBenchmark(Benchmark):
|
|
|
106
96
|
self.out = torch.empty(B, S, H, D, device="cuda", dtype=torch.float16)
|
|
107
97
|
|
|
108
98
|
def benchmark_small(self):
|
|
109
|
-
self.out = _extract_output(
|
|
110
|
-
self.kernel.flash_attn_func(self.q, self.k, self.v, causal=True)
|
|
111
|
-
)
|
|
99
|
+
self.out = _extract_output(self.kernel.flash_attn_func(self.q, self.k, self.v, causal=True))
|
|
112
100
|
|
|
113
101
|
def verify_small(self) -> torch.Tensor:
|
|
114
102
|
return _reference_attention(self.q, self.k, self.v, causal=True)
|
|
@@ -122,9 +110,7 @@ class FlashAttentionCausalBenchmark(Benchmark):
|
|
|
122
110
|
self.out = torch.empty(B, S, H, D, device="cuda", dtype=torch.float16)
|
|
123
111
|
|
|
124
112
|
def benchmark_medium(self):
|
|
125
|
-
self.out = _extract_output(
|
|
126
|
-
self.kernel.flash_attn_func(self.q, self.k, self.v, causal=True)
|
|
127
|
-
)
|
|
113
|
+
self.out = _extract_output(self.kernel.flash_attn_func(self.q, self.k, self.v, causal=True))
|
|
128
114
|
|
|
129
115
|
def verify_medium(self) -> torch.Tensor:
|
|
130
116
|
return _reference_attention(self.q, self.k, self.v, causal=True)
|
|
@@ -138,9 +124,7 @@ class FlashAttentionCausalBenchmark(Benchmark):
|
|
|
138
124
|
self.out = torch.empty(B, S, H, D, device="cuda", dtype=torch.float16)
|
|
139
125
|
|
|
140
126
|
def benchmark_large(self):
|
|
141
|
-
self.out = _extract_output(
|
|
142
|
-
self.kernel.flash_attn_func(self.q, self.k, self.v, causal=True)
|
|
143
|
-
)
|
|
127
|
+
self.out = _extract_output(self.kernel.flash_attn_func(self.q, self.k, self.v, causal=True))
|
|
144
128
|
|
|
145
129
|
def verify_large(self) -> torch.Tensor:
|
|
146
130
|
return _reference_attention(self.q, self.k, self.v, causal=True)
|
|
@@ -180,9 +164,7 @@ class FlashAttentionVarlenBenchmark(Benchmark):
|
|
|
180
164
|
)
|
|
181
165
|
|
|
182
166
|
def verify_small(self) -> torch.Tensor:
|
|
183
|
-
return _varlen_reference_attention(
|
|
184
|
-
self.q, self.k, self.v, self.cu_seqlens, self.cu_seqlens, causal=False
|
|
185
|
-
)
|
|
167
|
+
return _varlen_reference_attention(self.q, self.k, self.v, self.cu_seqlens, self.cu_seqlens, causal=False)
|
|
186
168
|
|
|
187
169
|
# Workload: medium (5 sequences, max_seqlen=256)
|
|
188
170
|
def setup_medium(self):
|
|
@@ -214,9 +196,7 @@ class FlashAttentionVarlenBenchmark(Benchmark):
|
|
|
214
196
|
)
|
|
215
197
|
|
|
216
198
|
def verify_medium(self) -> torch.Tensor:
|
|
217
|
-
return _varlen_reference_attention(
|
|
218
|
-
self.q, self.k, self.v, self.cu_seqlens, self.cu_seqlens, causal=False
|
|
219
|
-
)
|
|
199
|
+
return _varlen_reference_attention(self.q, self.k, self.v, self.cu_seqlens, self.cu_seqlens, causal=False)
|
|
220
200
|
|
|
221
201
|
# Workload: large (8 sequences, max_seqlen=512)
|
|
222
202
|
def setup_large(self):
|
|
@@ -248,6 +228,4 @@ class FlashAttentionVarlenBenchmark(Benchmark):
|
|
|
248
228
|
)
|
|
249
229
|
|
|
250
230
|
def verify_large(self) -> torch.Tensor:
|
|
251
|
-
return _varlen_reference_attention(
|
|
252
|
-
self.q, self.k, self.v, self.cu_seqlens, self.cu_seqlens, causal=False
|
|
253
|
-
)
|
|
231
|
+
return _varlen_reference_attention(self.q, self.k, self.v, self.cu_seqlens, self.cu_seqlens, causal=False)
|
|
@@ -129,9 +129,7 @@ class LayerNormBenchmark(Benchmark):
|
|
|
129
129
|
)[0].view(self.B, self.S, self.D)
|
|
130
130
|
|
|
131
131
|
def verify_small(self) -> torch.Tensor:
|
|
132
|
-
return torch.nn.functional.layer_norm(
|
|
133
|
-
self.x, [self.D], self.weight, eps=self.eps
|
|
134
|
-
)
|
|
132
|
+
return torch.nn.functional.layer_norm(self.x, [self.D], self.weight, eps=self.eps)
|
|
135
133
|
|
|
136
134
|
# Workload: medium (B=4, S=512, D=2048)
|
|
137
135
|
def setup_medium(self):
|
|
@@ -160,9 +158,7 @@ class LayerNormBenchmark(Benchmark):
|
|
|
160
158
|
)[0].view(self.B, self.S, self.D)
|
|
161
159
|
|
|
162
160
|
def verify_medium(self) -> torch.Tensor:
|
|
163
|
-
return torch.nn.functional.layer_norm(
|
|
164
|
-
self.x, [self.D], self.weight, eps=self.eps
|
|
165
|
-
)
|
|
161
|
+
return torch.nn.functional.layer_norm(self.x, [self.D], self.weight, eps=self.eps)
|
|
166
162
|
|
|
167
163
|
# Workload: large (B=8, S=1024, D=4096)
|
|
168
164
|
def setup_large(self):
|
|
@@ -191,6 +187,4 @@ class LayerNormBenchmark(Benchmark):
|
|
|
191
187
|
)[0].view(self.B, self.S, self.D)
|
|
192
188
|
|
|
193
189
|
def verify_large(self) -> torch.Tensor:
|
|
194
|
-
return torch.nn.functional.layer_norm(
|
|
195
|
-
self.x, [self.D], self.weight, eps=self.eps
|
|
196
|
-
)
|
|
190
|
+
return torch.nn.functional.layer_norm(self.x, [self.D], self.weight, eps=self.eps)
|
|
@@ -4,8 +4,6 @@ import json
|
|
|
4
4
|
import sys
|
|
5
5
|
from pathlib import Path
|
|
6
6
|
|
|
7
|
-
from kernels.cli.skills import add_skill
|
|
8
|
-
from kernels.cli.upload import upload_kernels_dir
|
|
9
7
|
from kernels.cli.versions import print_kernel_versions
|
|
10
8
|
from kernels.compat import tomllib
|
|
11
9
|
from kernels.lockfile import KernelLock, get_kernel_locks
|
|
@@ -16,9 +14,7 @@ from kernels.utils import (
|
|
|
16
14
|
|
|
17
15
|
|
|
18
16
|
def main():
|
|
19
|
-
parser = argparse.ArgumentParser(
|
|
20
|
-
prog="kernel", description="Manage compute kernels"
|
|
21
|
-
)
|
|
17
|
+
parser = argparse.ArgumentParser(prog="kernel", description="Manage compute kernels")
|
|
22
18
|
subparsers = parser.add_subparsers(required=True)
|
|
23
19
|
|
|
24
20
|
check_parser = subparsers.add_parser("check", help="Check a kernel for compliance")
|
|
@@ -30,12 +26,8 @@ def main():
|
|
|
30
26
|
help="The kernel revision (branch, tag, or commit SHA, defaults to 'main')",
|
|
31
27
|
)
|
|
32
28
|
check_parser.add_argument("--macos", type=str, help="macOS version", default="15.0")
|
|
33
|
-
check_parser.add_argument(
|
|
34
|
-
"--manylinux", type=str, help="Manylinux version", default="manylinux_2_28"
|
|
35
|
-
)
|
|
36
|
-
check_parser.add_argument(
|
|
37
|
-
"--python-abi", type=str, help="Python ABI version", default="3.9"
|
|
38
|
-
)
|
|
29
|
+
check_parser.add_argument("--manylinux", type=str, help="Manylinux version", default="manylinux_2_28")
|
|
30
|
+
check_parser.add_argument("--python-abi", type=str, help="Python ABI version", default="3.9")
|
|
39
31
|
check_parser.set_defaults(
|
|
40
32
|
func=lambda args: check_kernel(
|
|
41
33
|
macos=args.macos,
|
|
@@ -63,80 +55,6 @@ def main():
|
|
|
63
55
|
versions_parser.add_argument("repo_id", type=str, help="The kernel repo ID")
|
|
64
56
|
versions_parser.set_defaults(func=kernel_versions)
|
|
65
57
|
|
|
66
|
-
upload_parser = subparsers.add_parser(
|
|
67
|
-
"upload",
|
|
68
|
-
help="(Deprecated) Upload kernels to the Hub. Use `kernel-builder upload` instead.",
|
|
69
|
-
)
|
|
70
|
-
upload_parser.add_argument(
|
|
71
|
-
"kernel_dir",
|
|
72
|
-
type=Path,
|
|
73
|
-
help="Directory of the kernel build",
|
|
74
|
-
)
|
|
75
|
-
upload_parser.add_argument(
|
|
76
|
-
"--repo-id",
|
|
77
|
-
type=str,
|
|
78
|
-
required=True,
|
|
79
|
-
help="Repository ID to use to upload to the Hugging Face Hub",
|
|
80
|
-
)
|
|
81
|
-
upload_parser.add_argument(
|
|
82
|
-
"--branch",
|
|
83
|
-
type=str,
|
|
84
|
-
default=None,
|
|
85
|
-
help="If set, the upload will be made to a particular branch of the provided `repo-id`.",
|
|
86
|
-
)
|
|
87
|
-
upload_parser.add_argument(
|
|
88
|
-
"--private",
|
|
89
|
-
action="store_true",
|
|
90
|
-
help="If the repository should be private.",
|
|
91
|
-
)
|
|
92
|
-
upload_parser.set_defaults(func=upload_kernels)
|
|
93
|
-
|
|
94
|
-
skills_parser = subparsers.add_parser(
|
|
95
|
-
"skills",
|
|
96
|
-
help="Install kernels specific skills for agents like Claude, Codex, and OpenCode",
|
|
97
|
-
)
|
|
98
|
-
skills_subparsers = skills_parser.add_subparsers(required=True)
|
|
99
|
-
skills_add_parser = skills_subparsers.add_parser(
|
|
100
|
-
"add",
|
|
101
|
-
help="Install the cuda-kernels skill for an AI assistant",
|
|
102
|
-
)
|
|
103
|
-
skills_add_parser.add_argument(
|
|
104
|
-
"--claude",
|
|
105
|
-
action="store_true",
|
|
106
|
-
help="Install for Claude.",
|
|
107
|
-
)
|
|
108
|
-
skills_add_parser.add_argument(
|
|
109
|
-
"--codex",
|
|
110
|
-
action="store_true",
|
|
111
|
-
help="Install for Codex.",
|
|
112
|
-
)
|
|
113
|
-
skills_add_parser.add_argument(
|
|
114
|
-
"--opencode",
|
|
115
|
-
action="store_true",
|
|
116
|
-
help="Install for OpenCode.",
|
|
117
|
-
)
|
|
118
|
-
skills_add_parser.add_argument(
|
|
119
|
-
"--global",
|
|
120
|
-
"-g",
|
|
121
|
-
dest="global_",
|
|
122
|
-
action="store_true",
|
|
123
|
-
help=(
|
|
124
|
-
"Install globally (user-level) instead of in the current project directory."
|
|
125
|
-
),
|
|
126
|
-
)
|
|
127
|
-
skills_add_parser.add_argument(
|
|
128
|
-
"--dest",
|
|
129
|
-
type=Path,
|
|
130
|
-
default=None,
|
|
131
|
-
help="Install into a custom destination (path to skills directory).",
|
|
132
|
-
)
|
|
133
|
-
skills_add_parser.add_argument(
|
|
134
|
-
"--force",
|
|
135
|
-
action="store_true",
|
|
136
|
-
help="Overwrite existing skills in the destination.",
|
|
137
|
-
)
|
|
138
|
-
skills_add_parser.set_defaults(func=add_skill)
|
|
139
|
-
|
|
140
58
|
lock_parser = subparsers.add_parser("lock", help="Lock kernel revisions")
|
|
141
59
|
lock_parser.add_argument(
|
|
142
60
|
"project_dir",
|
|
@@ -154,12 +72,8 @@ def main():
|
|
|
154
72
|
type=str,
|
|
155
73
|
help="Kernel repo ID (e.g., kernels-community/activation)",
|
|
156
74
|
)
|
|
157
|
-
benchmark_parser.add_argument(
|
|
158
|
-
"--branch", type=str, help="Kernel branch to benchmark"
|
|
159
|
-
)
|
|
160
|
-
benchmark_parser.add_argument(
|
|
161
|
-
"--version", type=int, help="Kernel version to benchmark"
|
|
162
|
-
)
|
|
75
|
+
benchmark_parser.add_argument("--branch", type=str, help="Kernel branch to benchmark")
|
|
76
|
+
benchmark_parser.add_argument("--version", type=int, help="Kernel version to benchmark")
|
|
163
77
|
benchmark_parser.add_argument(
|
|
164
78
|
"--output",
|
|
165
79
|
type=str,
|
|
@@ -211,14 +125,14 @@ def download_kernels(args):
|
|
|
211
125
|
if args.all_variants:
|
|
212
126
|
install_kernel_all_variants(
|
|
213
127
|
kernel_lock.repo_id,
|
|
214
|
-
kernel_lock.sha,
|
|
128
|
+
revision=kernel_lock.sha,
|
|
215
129
|
variant_locks=kernel_lock.variants,
|
|
216
130
|
)
|
|
217
131
|
else:
|
|
218
132
|
try:
|
|
219
133
|
install_kernel(
|
|
220
134
|
kernel_lock.repo_id,
|
|
221
|
-
kernel_lock.sha,
|
|
135
|
+
revision=kernel_lock.sha,
|
|
222
136
|
variant_locks=kernel_lock.variants,
|
|
223
137
|
)
|
|
224
138
|
except FileNotFoundError as e:
|
|
@@ -247,29 +161,6 @@ def lock_kernels(args):
|
|
|
247
161
|
json.dump(all_locks, f, cls=_JSONEncoder, indent=2)
|
|
248
162
|
|
|
249
163
|
|
|
250
|
-
def upload_kernels(args):
|
|
251
|
-
import warnings
|
|
252
|
-
|
|
253
|
-
warnings.warn(
|
|
254
|
-
"`kernels upload` is deprecated and will be removed in version 0.14. "
|
|
255
|
-
"Please use `kernel-builder upload` instead.",
|
|
256
|
-
DeprecationWarning,
|
|
257
|
-
stacklevel=1,
|
|
258
|
-
)
|
|
259
|
-
# Also print to stderr for visibility in CLI usage
|
|
260
|
-
print(
|
|
261
|
-
"Warning: `kernels upload` is deprecated and will be removed in version 0.14.\n"
|
|
262
|
-
"Please use `kernel-builder upload` instead.\n",
|
|
263
|
-
file=sys.stderr,
|
|
264
|
-
)
|
|
265
|
-
upload_kernels_dir(
|
|
266
|
-
Path(args.kernel_dir).resolve(),
|
|
267
|
-
repo_id=args.repo_id,
|
|
268
|
-
branch=args.branch,
|
|
269
|
-
private=args.private,
|
|
270
|
-
)
|
|
271
|
-
|
|
272
|
-
|
|
273
164
|
class _JSONEncoder(json.JSONEncoder):
|
|
274
165
|
def default(self, o):
|
|
275
166
|
if dataclasses.is_dataclass(o):
|
|
@@ -277,9 +168,7 @@ class _JSONEncoder(json.JSONEncoder):
|
|
|
277
168
|
return super().default(o)
|
|
278
169
|
|
|
279
170
|
|
|
280
|
-
def check_kernel(
|
|
281
|
-
*, macos: str, manylinux: str, python_abi: str, repo_id: str, revision: str
|
|
282
|
-
):
|
|
171
|
+
def check_kernel(*, macos: str, manylinux: str, python_abi: str, repo_id: str, revision: str):
|
|
283
172
|
try:
|
|
284
173
|
from kernels.cli import check
|
|
285
174
|
except ImportError:
|
|
@@ -91,9 +91,7 @@ class TimingResults:
|
|
|
91
91
|
|
|
92
92
|
def validate_timing_range(self):
|
|
93
93
|
if self.min_ms > self.max_ms:
|
|
94
|
-
raise ValueError(
|
|
95
|
-
f"min_ms ({self.min_ms}) must be <= max_ms ({self.max_ms})"
|
|
96
|
-
)
|
|
94
|
+
raise ValueError(f"min_ms ({self.min_ms}) must be <= max_ms ({self.max_ms})")
|
|
97
95
|
|
|
98
96
|
|
|
99
97
|
@strict
|
|
@@ -302,9 +300,7 @@ def _get_macos_gpu() -> tuple[str | None, int | None]:
|
|
|
302
300
|
from ctypes import POINTER, byref, c_char_p, c_int, c_int64, c_uint32, c_void_p
|
|
303
301
|
|
|
304
302
|
iokit = ctypes.CDLL("/System/Library/Frameworks/IOKit.framework/IOKit")
|
|
305
|
-
cf = ctypes.CDLL(
|
|
306
|
-
"/System/Library/Frameworks/CoreFoundation.framework/CoreFoundation"
|
|
307
|
-
)
|
|
303
|
+
cf = ctypes.CDLL("/System/Library/Frameworks/CoreFoundation.framework/CoreFoundation")
|
|
308
304
|
|
|
309
305
|
iokit.IOServiceMatching.restype = c_void_p
|
|
310
306
|
iokit.IOServiceMatching.argtypes = [c_char_p]
|
|
@@ -365,9 +361,7 @@ def _get_macos_gpu() -> tuple[str | None, int | None]:
|
|
|
365
361
|
cf.CFRelease(key)
|
|
366
362
|
|
|
367
363
|
# Get GPU core count
|
|
368
|
-
key = cf.CFStringCreateWithCString(
|
|
369
|
-
None, b"gpu-core-count", kCFStringEncodingUTF8
|
|
370
|
-
)
|
|
364
|
+
key = cf.CFStringCreateWithCString(None, b"gpu-core-count", kCFStringEncodingUTF8)
|
|
371
365
|
if key:
|
|
372
366
|
prop = iokit.IORegistryEntryCreateCFProperty(service, key, None, 0)
|
|
373
367
|
if prop:
|
|
@@ -408,9 +402,7 @@ def collect_machine_info() -> MachineInfo:
|
|
|
408
402
|
if hasattr(torch.version, "hip") and torch.version.hip:
|
|
409
403
|
backend_type = f"ROCm {torch.version.hip}"
|
|
410
404
|
else:
|
|
411
|
-
backend_type = (
|
|
412
|
-
f"CUDA {torch.version.cuda}" if torch.version.cuda else "CUDA"
|
|
413
|
-
)
|
|
405
|
+
backend_type = f"CUDA {torch.version.cuda}" if torch.version.cuda else "CUDA"
|
|
414
406
|
elif backend_name == "xpu":
|
|
415
407
|
gpu = torch.xpu.get_device_name(0)
|
|
416
408
|
backend_type = "XPU"
|
|
@@ -462,9 +454,7 @@ def run_benchmark_class(
|
|
|
462
454
|
|
|
463
455
|
# Find all benchmark_* methods
|
|
464
456
|
benchmark_methods = [
|
|
465
|
-
name
|
|
466
|
-
for name in dir(benchmark_cls)
|
|
467
|
-
if name.startswith("benchmark_") and callable(getattr(benchmark_cls, name))
|
|
457
|
+
name for name in dir(benchmark_cls) if name.startswith("benchmark_") and callable(getattr(benchmark_cls, name))
|
|
468
458
|
]
|
|
469
459
|
|
|
470
460
|
if not benchmark_methods:
|
|
@@ -646,9 +636,7 @@ def run_benchmark_script(
|
|
|
646
636
|
raise RuntimeError(f"No Benchmark subclasses found in {script_path}")
|
|
647
637
|
|
|
648
638
|
machine_info = collect_machine_info()
|
|
649
|
-
gpu_cores_str = (
|
|
650
|
-
f" ({machine_info.gpu_cores} cores)" if machine_info.gpu_cores else ""
|
|
651
|
-
)
|
|
639
|
+
gpu_cores_str = f" ({machine_info.gpu_cores} cores)" if machine_info.gpu_cores else ""
|
|
652
640
|
print(file=sys.stderr)
|
|
653
641
|
print(f" GPU {machine_info.gpu}{gpu_cores_str}", file=sys.stderr)
|
|
654
642
|
print(f" CPU {machine_info.cpu}", file=sys.stderr)
|
|
@@ -721,8 +709,7 @@ def run_benchmark(
|
|
|
721
709
|
if is_local:
|
|
722
710
|
if repo_id.count("/") == 1 and not repo_id.startswith(("./", "../")):
|
|
723
711
|
warnings.warn(
|
|
724
|
-
f"'{repo_id}' exists locally but looks like a repo_id. "
|
|
725
|
-
f"Use './{repo_id}' to be explicit.",
|
|
712
|
+
f"'{repo_id}' exists locally but looks like a repo_id. Use './{repo_id}' to be explicit.",
|
|
726
713
|
stacklevel=2,
|
|
727
714
|
)
|
|
728
715
|
branch = "local"
|
|
@@ -753,9 +740,7 @@ def run_benchmark(
|
|
|
753
740
|
if is_local:
|
|
754
741
|
repo_path = repo_id_path.resolve()
|
|
755
742
|
else:
|
|
756
|
-
repo_path = Path(
|
|
757
|
-
str(_get_hf_api().snapshot_download(repo_id=repo_id, revision=revision))
|
|
758
|
-
)
|
|
743
|
+
repo_path = Path(str(_get_hf_api().snapshot_download(repo_id=repo_id, revision=revision)))
|
|
759
744
|
|
|
760
745
|
scripts = discover_benchmark_scripts(repo_id, repo_path)
|
|
761
746
|
|