kernels 0.13.0__tar.gz → 0.14.0.dev1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. {kernels-0.13.0 → kernels-0.14.0.dev1}/PKG-INFO +3 -3
  2. {kernels-0.13.0 → kernels-0.14.0.dev1}/pyproject.toml +9 -8
  3. {kernels-0.13.0 → kernels-0.14.0.dev1}/src/kernels/__init__.py +2 -0
  4. kernels-0.14.0.dev1/src/kernels/_versions.py +72 -0
  5. {kernels-0.13.0 → kernels-0.14.0.dev1}/src/kernels/backends.py +2 -6
  6. {kernels-0.13.0 → kernels-0.14.0.dev1}/src/kernels/benchmarks/attention.py +11 -33
  7. {kernels-0.13.0 → kernels-0.14.0.dev1}/src/kernels/benchmarks/layer_norm.py +3 -9
  8. {kernels-0.13.0 → kernels-0.14.0.dev1}/src/kernels/cli/__init__.py +8 -119
  9. {kernels-0.13.0 → kernels-0.14.0.dev1}/src/kernels/cli/benchmark.py +8 -23
  10. {kernels-0.13.0 → kernels-0.14.0.dev1}/src/kernels/cli/benchmark_graphics.py +22 -58
  11. {kernels-0.13.0 → kernels-0.14.0.dev1}/src/kernels/cli/check.py +7 -1
  12. {kernels-0.13.0 → kernels-0.14.0.dev1}/src/kernels/cli/versions.py +2 -3
  13. {kernels-0.13.0 → kernels-0.14.0.dev1}/src/kernels/deps.py +6 -19
  14. {kernels-0.13.0 → kernels-0.14.0.dev1}/src/kernels/layer/device.py +2 -8
  15. {kernels-0.13.0 → kernels-0.14.0.dev1}/src/kernels/layer/func.py +5 -10
  16. {kernels-0.13.0 → kernels-0.14.0.dev1}/src/kernels/layer/globals.py +1 -3
  17. {kernels-0.13.0 → kernels-0.14.0.dev1}/src/kernels/layer/kernelize.py +4 -12
  18. {kernels-0.13.0 → kernels-0.14.0.dev1}/src/kernels/layer/layer.py +16 -43
  19. {kernels-0.13.0 → kernels-0.14.0.dev1}/src/kernels/layer/repos.py +10 -28
  20. {kernels-0.13.0 → kernels-0.14.0.dev1}/src/kernels/lockfile.py +21 -17
  21. kernels-0.14.0.dev1/src/kernels/metadata.py +44 -0
  22. {kernels-0.13.0 → kernels-0.14.0.dev1}/src/kernels/status.py +7 -5
  23. {kernels-0.13.0 → kernels-0.14.0.dev1}/src/kernels/utils.py +110 -39
  24. {kernels-0.13.0 → kernels-0.14.0.dev1}/src/kernels/variants.py +19 -53
  25. {kernels-0.13.0 → kernels-0.14.0.dev1}/src/kernels.egg-info/PKG-INFO +3 -3
  26. {kernels-0.13.0 → kernels-0.14.0.dev1}/src/kernels.egg-info/SOURCES.txt +1 -4
  27. {kernels-0.13.0 → kernels-0.14.0.dev1}/src/kernels.egg-info/requires.txt +1 -1
  28. {kernels-0.13.0 → kernels-0.14.0.dev1}/tests/test_basic.py +26 -18
  29. {kernels-0.13.0 → kernels-0.14.0.dev1}/tests/test_deps.py +1 -3
  30. {kernels-0.13.0 → kernels-0.14.0.dev1}/tests/test_doctest.py +1 -3
  31. {kernels-0.13.0 → kernels-0.14.0.dev1}/tests/test_func.py +1 -1
  32. {kernels-0.13.0 → kernels-0.14.0.dev1}/tests/test_interval_tree.py +3 -9
  33. kernels-0.14.0.dev1/tests/test_kernel_locking.py +100 -0
  34. {kernels-0.13.0 → kernels-0.14.0.dev1}/tests/test_layer.py +30 -163
  35. kernels-0.14.0.dev1/tests/test_loaded_kernels.py +102 -0
  36. {kernels-0.13.0 → kernels-0.14.0.dev1}/tests/test_status.py +4 -8
  37. {kernels-0.13.0 → kernels-0.14.0.dev1}/tests/test_tvm_ffi.py +1 -1
  38. kernels-0.13.0/src/kernels/_versions.py +0 -111
  39. kernels-0.13.0/src/kernels/cli/_vendored/convert_rst_to_mdx.py +0 -751
  40. kernels-0.13.0/src/kernels/cli/skills.py +0 -113
  41. kernels-0.13.0/src/kernels/cli/upload.py +0 -163
  42. kernels-0.13.0/src/kernels/metadata.py +0 -40
  43. kernels-0.13.0/tests/test_kernel_locking.py +0 -208
  44. kernels-0.13.0/tests/test_kernel_upload.py +0 -309
  45. {kernels-0.13.0 → kernels-0.14.0.dev1}/README.md +0 -0
  46. {kernels-0.13.0 → kernels-0.14.0.dev1}/setup.cfg +0 -0
  47. {kernels-0.13.0 → kernels-0.14.0.dev1}/src/kernels/_system.py +0 -0
  48. {kernels-0.13.0 → kernels-0.14.0.dev1}/src/kernels/_windows.py +0 -0
  49. {kernels-0.13.0 → kernels-0.14.0.dev1}/src/kernels/benchmark.py +0 -0
  50. {kernels-0.13.0 → kernels-0.14.0.dev1}/src/kernels/benchmarks/__init__.py +0 -0
  51. {kernels-0.13.0 → kernels-0.14.0.dev1}/src/kernels/benchmarks/activation.py +0 -0
  52. {kernels-0.13.0 → kernels-0.14.0.dev1}/src/kernels/compat.py +0 -0
  53. {kernels-0.13.0 → kernels-0.14.0.dev1}/src/kernels/layer/__init__.py +0 -0
  54. {kernels-0.13.0 → kernels-0.14.0.dev1}/src/kernels/layer/_interval_tree.py +0 -0
  55. {kernels-0.13.0 → kernels-0.14.0.dev1}/src/kernels/layer/mode.py +0 -0
  56. {kernels-0.13.0 → kernels-0.14.0.dev1}/src/kernels/python_depends.json +0 -0
  57. {kernels-0.13.0 → kernels-0.14.0.dev1}/src/kernels.egg-info/dependency_links.txt +0 -0
  58. {kernels-0.13.0 → kernels-0.14.0.dev1}/src/kernels.egg-info/entry_points.txt +0 -0
  59. {kernels-0.13.0 → kernels-0.14.0.dev1}/src/kernels.egg-info/top_level.txt +0 -0
  60. {kernels-0.13.0 → kernels-0.14.0.dev1}/tests/test_benchmarks.py +0 -0
  61. {kernels-0.13.0 → kernels-0.14.0.dev1}/tests/test_user_agent.py +0 -0
  62. {kernels-0.13.0 → kernels-0.14.0.dev1}/tests/test_variants.py +0 -0
@@ -1,12 +1,12 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kernels
3
- Version: 0.13.0
3
+ Version: 0.14.0.dev1
4
4
  Summary: Download compute kernels
5
5
  Author-email: Daniel de Kok <daniel@huggingface.co>, David Holtz <david@huggingface.co>
6
6
  License: Apache-2.0
7
- Requires-Python: >=3.9
7
+ Requires-Python: >=3.10
8
8
  Description-Content-Type: text/markdown
9
- Requires-Dist: huggingface_hub<2.0,>=1.3.0
9
+ Requires-Dist: huggingface-hub>=1.10.0
10
10
  Requires-Dist: packaging>=20.0
11
11
  Requires-Dist: pyyaml>=6
12
12
  Requires-Dist: tomli>=2.0; python_version < "3.11"
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "kernels"
3
- version = "0.13.0"
3
+ version = "0.14.0.dev1"
4
4
  description = "Download compute kernels"
5
5
  authors = [
6
6
  { name = "Daniel de Kok", email = "daniel@huggingface.co" },
@@ -8,9 +8,9 @@ authors = [
8
8
  ]
9
9
  license = { text = "Apache-2.0" }
10
10
  readme = "README.md"
11
- requires-python = ">= 3.9"
11
+ requires-python = ">= 3.10"
12
12
  dependencies = [
13
- "huggingface_hub>=1.3.0,<2.0",
13
+ "huggingface-hub>=1.10.0",
14
14
  "packaging>=20.0",
15
15
  "pyyaml>=6",
16
16
  "tomli>=2.0; python_version<'3.11'",
@@ -56,11 +56,10 @@ kernels = "kernels.cli:main"
56
56
  [tool.setuptools.package-data]
57
57
  kernels = ["python_depends.json"]
58
58
 
59
- [tool.isort]
60
- profile = "black"
61
- line_length = 119
62
-
63
59
  [tool.ruff]
60
+ # If the version is changed, apply the change in the Nix overlay
61
+ # as well.
62
+ required-version = "==0.15.10"
64
63
  exclude = [
65
64
  ".eggs",
66
65
  ".git",
@@ -85,4 +84,6 @@ line-length = 119
85
84
  # Ignored rules:
86
85
  # "E501" -> line length violation
87
86
  lint.ignore = ["E501"]
88
- lint.select = ["E", "F", "W"]
87
+ lint.select = ["E", "F", "I", "W"]
88
+
89
+ [tool.ruff.format]
@@ -23,6 +23,7 @@ from kernels.layer import (
23
23
  )
24
24
  from kernels.utils import (
25
25
  get_kernel,
26
+ get_loaded_kernels,
26
27
  get_local_kernel,
27
28
  get_locked_kernel,
28
29
  has_kernel,
@@ -45,6 +46,7 @@ __all__ = [
45
46
  "LockedLayerRepository",
46
47
  "Mode",
47
48
  "get_kernel",
49
+ "get_loaded_kernels",
48
50
  "get_local_kernel",
49
51
  "get_locked_kernel",
50
52
  "has_kernel",
@@ -0,0 +1,72 @@
1
+ import logging
2
+ import warnings
3
+
4
+ from huggingface_hub.hf_api import GitRefInfo
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+
9
+ def _get_available_versions(repo_id: str) -> dict[int, GitRefInfo]:
10
+ """Get kernel versions that are available in the repository."""
11
+ from kernels.utils import _get_hf_api
12
+
13
+ refs = _get_hf_api().list_repo_refs(repo_id=repo_id, repo_type="kernel")
14
+
15
+ versions = {}
16
+ for branch in refs.branches:
17
+ if not branch.name.startswith("v"):
18
+ continue
19
+ try:
20
+ versions[int(branch.name[1:])] = branch
21
+ except ValueError:
22
+ continue
23
+
24
+ return versions
25
+
26
+
27
+ def resolve_version_spec_as_ref(repo_id: str, version_spec: int) -> GitRefInfo:
28
+ """
29
+ Get the ref for a kernel with the given version.
30
+ """
31
+ versions = _get_available_versions(repo_id)
32
+
33
+ ref = versions.get(version_spec, None)
34
+ if ref is None:
35
+ raise ValueError(
36
+ f"Version {version_spec} not found, available versions: {', '.join(sorted(str(v) for v in versions.keys()))}"
37
+ )
38
+
39
+ latest_version = max(versions.keys())
40
+ if version_spec < latest_version:
41
+ logger.warning(
42
+ "You are using version %d of '%s', but version %d is available.",
43
+ version_spec,
44
+ repo_id,
45
+ latest_version,
46
+ )
47
+
48
+ return ref
49
+
50
+
51
+ def select_revision_or_version(
52
+ repo_id: str,
53
+ *,
54
+ revision: str | None,
55
+ version: int | None,
56
+ ) -> str:
57
+ if revision is not None and version is not None:
58
+ raise ValueError("Only one of `revision` or `version` must be specified.")
59
+
60
+ if revision is not None:
61
+ return revision
62
+ elif version is not None:
63
+ return resolve_version_spec_as_ref(repo_id, version).target_commit
64
+
65
+ warnings.warn(
66
+ "Future versions of `kernels` (>=0.15) will require specifying a kernel version or revision. "
67
+ "See: https://huggingface.co/docs/kernels/migration",
68
+ FutureWarning,
69
+ stacklevel=2,
70
+ )
71
+
72
+ return "main"
@@ -241,9 +241,7 @@ def _select_backend(backend: str | None) -> Backend:
241
241
  if backend in supported:
242
242
  return supported[backend]
243
243
 
244
- raise ValueError(
245
- f"Invalid backend '{backend}', system supported backends: {', '.join(sorted(supported.keys()))}"
246
- )
244
+ raise ValueError(f"Invalid backend '{backend}', system supported backends: {', '.join(sorted(supported.keys()))}")
247
245
 
248
246
 
249
247
  def _supported_backends() -> dict[str, Backend]:
@@ -267,9 +265,7 @@ def _get_cuda() -> Optional[CUDA]:
267
265
  runtime_version = ctypes.c_int(0)
268
266
  result = libcudart.cudaRuntimeGetVersion(ctypes.byref(runtime_version))
269
267
  if result != 0:
270
- warnings.warn(
271
- "System has CUDA runtime library, but cannot get runtime version."
272
- )
268
+ warnings.warn("System has CUDA runtime library, but cannot get runtime version.")
273
269
  return None
274
270
 
275
271
  # cudaRuntimeGetVersion encodes the version as (major * 1000 + minor * 10).
@@ -14,9 +14,7 @@ def _reference_attention(query, key, value, causal=False):
14
14
  """Reference implementation using PyTorch SDPA."""
15
15
  query, key, value = (x.transpose(1, 2).contiguous() for x in (query, key, value))
16
16
  with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.MATH):
17
- out = torch.nn.functional.scaled_dot_product_attention(
18
- query, key, value, is_causal=causal
19
- )
17
+ out = torch.nn.functional.scaled_dot_product_attention(query, key, value, is_causal=causal)
20
18
  return out.transpose(1, 2).contiguous()
21
19
 
22
20
 
@@ -24,9 +22,7 @@ def _varlen_reference_attention(q, k, v, cu_seqlens_q, cu_seqlens_k, causal=Fals
24
22
  """Reference implementation for variable length attention."""
25
23
  batch_size = cu_seqlens_q.shape[0] - 1
26
24
  total_tokens_q = q.shape[0]
27
- out = torch.zeros(
28
- (total_tokens_q, q.shape[1], q.shape[2]), device=q.device, dtype=q.dtype
29
- )
25
+ out = torch.zeros((total_tokens_q, q.shape[1], q.shape[2]), device=q.device, dtype=q.dtype)
30
26
 
31
27
  for b in range(batch_size):
32
28
  start_q, end_q = cu_seqlens_q[b], cu_seqlens_q[b + 1]
@@ -54,9 +50,7 @@ class FlashAttentionBenchmark(Benchmark):
54
50
  self.out = torch.empty(B, S, H, D, device="cuda", dtype=torch.float16)
55
51
 
56
52
  def benchmark_small(self):
57
- self.out = _extract_output(
58
- self.kernel.flash_attn_func(self.q, self.k, self.v, causal=False)
59
- )
53
+ self.out = _extract_output(self.kernel.flash_attn_func(self.q, self.k, self.v, causal=False))
60
54
 
61
55
  def verify_small(self) -> torch.Tensor:
62
56
  return _reference_attention(self.q, self.k, self.v, causal=False)
@@ -70,9 +64,7 @@ class FlashAttentionBenchmark(Benchmark):
70
64
  self.out = torch.empty(B, S, H, D, device="cuda", dtype=torch.float16)
71
65
 
72
66
  def benchmark_medium(self):
73
- self.out = _extract_output(
74
- self.kernel.flash_attn_func(self.q, self.k, self.v, causal=False)
75
- )
67
+ self.out = _extract_output(self.kernel.flash_attn_func(self.q, self.k, self.v, causal=False))
76
68
 
77
69
  def verify_medium(self) -> torch.Tensor:
78
70
  return _reference_attention(self.q, self.k, self.v, causal=False)
@@ -86,9 +78,7 @@ class FlashAttentionBenchmark(Benchmark):
86
78
  self.out = torch.empty(B, S, H, D, device="cuda", dtype=torch.float16)
87
79
 
88
80
  def benchmark_large(self):
89
- self.out = _extract_output(
90
- self.kernel.flash_attn_func(self.q, self.k, self.v, causal=False)
91
- )
81
+ self.out = _extract_output(self.kernel.flash_attn_func(self.q, self.k, self.v, causal=False))
92
82
 
93
83
  def verify_large(self) -> torch.Tensor:
94
84
  return _reference_attention(self.q, self.k, self.v, causal=False)
@@ -106,9 +96,7 @@ class FlashAttentionCausalBenchmark(Benchmark):
106
96
  self.out = torch.empty(B, S, H, D, device="cuda", dtype=torch.float16)
107
97
 
108
98
  def benchmark_small(self):
109
- self.out = _extract_output(
110
- self.kernel.flash_attn_func(self.q, self.k, self.v, causal=True)
111
- )
99
+ self.out = _extract_output(self.kernel.flash_attn_func(self.q, self.k, self.v, causal=True))
112
100
 
113
101
  def verify_small(self) -> torch.Tensor:
114
102
  return _reference_attention(self.q, self.k, self.v, causal=True)
@@ -122,9 +110,7 @@ class FlashAttentionCausalBenchmark(Benchmark):
122
110
  self.out = torch.empty(B, S, H, D, device="cuda", dtype=torch.float16)
123
111
 
124
112
  def benchmark_medium(self):
125
- self.out = _extract_output(
126
- self.kernel.flash_attn_func(self.q, self.k, self.v, causal=True)
127
- )
113
+ self.out = _extract_output(self.kernel.flash_attn_func(self.q, self.k, self.v, causal=True))
128
114
 
129
115
  def verify_medium(self) -> torch.Tensor:
130
116
  return _reference_attention(self.q, self.k, self.v, causal=True)
@@ -138,9 +124,7 @@ class FlashAttentionCausalBenchmark(Benchmark):
138
124
  self.out = torch.empty(B, S, H, D, device="cuda", dtype=torch.float16)
139
125
 
140
126
  def benchmark_large(self):
141
- self.out = _extract_output(
142
- self.kernel.flash_attn_func(self.q, self.k, self.v, causal=True)
143
- )
127
+ self.out = _extract_output(self.kernel.flash_attn_func(self.q, self.k, self.v, causal=True))
144
128
 
145
129
  def verify_large(self) -> torch.Tensor:
146
130
  return _reference_attention(self.q, self.k, self.v, causal=True)
@@ -180,9 +164,7 @@ class FlashAttentionVarlenBenchmark(Benchmark):
180
164
  )
181
165
 
182
166
  def verify_small(self) -> torch.Tensor:
183
- return _varlen_reference_attention(
184
- self.q, self.k, self.v, self.cu_seqlens, self.cu_seqlens, causal=False
185
- )
167
+ return _varlen_reference_attention(self.q, self.k, self.v, self.cu_seqlens, self.cu_seqlens, causal=False)
186
168
 
187
169
  # Workload: medium (5 sequences, max_seqlen=256)
188
170
  def setup_medium(self):
@@ -214,9 +196,7 @@ class FlashAttentionVarlenBenchmark(Benchmark):
214
196
  )
215
197
 
216
198
  def verify_medium(self) -> torch.Tensor:
217
- return _varlen_reference_attention(
218
- self.q, self.k, self.v, self.cu_seqlens, self.cu_seqlens, causal=False
219
- )
199
+ return _varlen_reference_attention(self.q, self.k, self.v, self.cu_seqlens, self.cu_seqlens, causal=False)
220
200
 
221
201
  # Workload: large (8 sequences, max_seqlen=512)
222
202
  def setup_large(self):
@@ -248,6 +228,4 @@ class FlashAttentionVarlenBenchmark(Benchmark):
248
228
  )
249
229
 
250
230
  def verify_large(self) -> torch.Tensor:
251
- return _varlen_reference_attention(
252
- self.q, self.k, self.v, self.cu_seqlens, self.cu_seqlens, causal=False
253
- )
231
+ return _varlen_reference_attention(self.q, self.k, self.v, self.cu_seqlens, self.cu_seqlens, causal=False)
@@ -129,9 +129,7 @@ class LayerNormBenchmark(Benchmark):
129
129
  )[0].view(self.B, self.S, self.D)
130
130
 
131
131
  def verify_small(self) -> torch.Tensor:
132
- return torch.nn.functional.layer_norm(
133
- self.x, [self.D], self.weight, eps=self.eps
134
- )
132
+ return torch.nn.functional.layer_norm(self.x, [self.D], self.weight, eps=self.eps)
135
133
 
136
134
  # Workload: medium (B=4, S=512, D=2048)
137
135
  def setup_medium(self):
@@ -160,9 +158,7 @@ class LayerNormBenchmark(Benchmark):
160
158
  )[0].view(self.B, self.S, self.D)
161
159
 
162
160
  def verify_medium(self) -> torch.Tensor:
163
- return torch.nn.functional.layer_norm(
164
- self.x, [self.D], self.weight, eps=self.eps
165
- )
161
+ return torch.nn.functional.layer_norm(self.x, [self.D], self.weight, eps=self.eps)
166
162
 
167
163
  # Workload: large (B=8, S=1024, D=4096)
168
164
  def setup_large(self):
@@ -191,6 +187,4 @@ class LayerNormBenchmark(Benchmark):
191
187
  )[0].view(self.B, self.S, self.D)
192
188
 
193
189
  def verify_large(self) -> torch.Tensor:
194
- return torch.nn.functional.layer_norm(
195
- self.x, [self.D], self.weight, eps=self.eps
196
- )
190
+ return torch.nn.functional.layer_norm(self.x, [self.D], self.weight, eps=self.eps)
@@ -4,8 +4,6 @@ import json
4
4
  import sys
5
5
  from pathlib import Path
6
6
 
7
- from kernels.cli.skills import add_skill
8
- from kernels.cli.upload import upload_kernels_dir
9
7
  from kernels.cli.versions import print_kernel_versions
10
8
  from kernels.compat import tomllib
11
9
  from kernels.lockfile import KernelLock, get_kernel_locks
@@ -16,9 +14,7 @@ from kernels.utils import (
16
14
 
17
15
 
18
16
  def main():
19
- parser = argparse.ArgumentParser(
20
- prog="kernel", description="Manage compute kernels"
21
- )
17
+ parser = argparse.ArgumentParser(prog="kernel", description="Manage compute kernels")
22
18
  subparsers = parser.add_subparsers(required=True)
23
19
 
24
20
  check_parser = subparsers.add_parser("check", help="Check a kernel for compliance")
@@ -30,12 +26,8 @@ def main():
30
26
  help="The kernel revision (branch, tag, or commit SHA, defaults to 'main')",
31
27
  )
32
28
  check_parser.add_argument("--macos", type=str, help="macOS version", default="15.0")
33
- check_parser.add_argument(
34
- "--manylinux", type=str, help="Manylinux version", default="manylinux_2_28"
35
- )
36
- check_parser.add_argument(
37
- "--python-abi", type=str, help="Python ABI version", default="3.9"
38
- )
29
+ check_parser.add_argument("--manylinux", type=str, help="Manylinux version", default="manylinux_2_28")
30
+ check_parser.add_argument("--python-abi", type=str, help="Python ABI version", default="3.9")
39
31
  check_parser.set_defaults(
40
32
  func=lambda args: check_kernel(
41
33
  macos=args.macos,
@@ -63,80 +55,6 @@ def main():
63
55
  versions_parser.add_argument("repo_id", type=str, help="The kernel repo ID")
64
56
  versions_parser.set_defaults(func=kernel_versions)
65
57
 
66
- upload_parser = subparsers.add_parser(
67
- "upload",
68
- help="(Deprecated) Upload kernels to the Hub. Use `kernel-builder upload` instead.",
69
- )
70
- upload_parser.add_argument(
71
- "kernel_dir",
72
- type=Path,
73
- help="Directory of the kernel build",
74
- )
75
- upload_parser.add_argument(
76
- "--repo-id",
77
- type=str,
78
- required=True,
79
- help="Repository ID to use to upload to the Hugging Face Hub",
80
- )
81
- upload_parser.add_argument(
82
- "--branch",
83
- type=str,
84
- default=None,
85
- help="If set, the upload will be made to a particular branch of the provided `repo-id`.",
86
- )
87
- upload_parser.add_argument(
88
- "--private",
89
- action="store_true",
90
- help="If the repository should be private.",
91
- )
92
- upload_parser.set_defaults(func=upload_kernels)
93
-
94
- skills_parser = subparsers.add_parser(
95
- "skills",
96
- help="Install kernels specific skills for agents like Claude, Codex, and OpenCode",
97
- )
98
- skills_subparsers = skills_parser.add_subparsers(required=True)
99
- skills_add_parser = skills_subparsers.add_parser(
100
- "add",
101
- help="Install the cuda-kernels skill for an AI assistant",
102
- )
103
- skills_add_parser.add_argument(
104
- "--claude",
105
- action="store_true",
106
- help="Install for Claude.",
107
- )
108
- skills_add_parser.add_argument(
109
- "--codex",
110
- action="store_true",
111
- help="Install for Codex.",
112
- )
113
- skills_add_parser.add_argument(
114
- "--opencode",
115
- action="store_true",
116
- help="Install for OpenCode.",
117
- )
118
- skills_add_parser.add_argument(
119
- "--global",
120
- "-g",
121
- dest="global_",
122
- action="store_true",
123
- help=(
124
- "Install globally (user-level) instead of in the current project directory."
125
- ),
126
- )
127
- skills_add_parser.add_argument(
128
- "--dest",
129
- type=Path,
130
- default=None,
131
- help="Install into a custom destination (path to skills directory).",
132
- )
133
- skills_add_parser.add_argument(
134
- "--force",
135
- action="store_true",
136
- help="Overwrite existing skills in the destination.",
137
- )
138
- skills_add_parser.set_defaults(func=add_skill)
139
-
140
58
  lock_parser = subparsers.add_parser("lock", help="Lock kernel revisions")
141
59
  lock_parser.add_argument(
142
60
  "project_dir",
@@ -154,12 +72,8 @@ def main():
154
72
  type=str,
155
73
  help="Kernel repo ID (e.g., kernels-community/activation)",
156
74
  )
157
- benchmark_parser.add_argument(
158
- "--branch", type=str, help="Kernel branch to benchmark"
159
- )
160
- benchmark_parser.add_argument(
161
- "--version", type=int, help="Kernel version to benchmark"
162
- )
75
+ benchmark_parser.add_argument("--branch", type=str, help="Kernel branch to benchmark")
76
+ benchmark_parser.add_argument("--version", type=int, help="Kernel version to benchmark")
163
77
  benchmark_parser.add_argument(
164
78
  "--output",
165
79
  type=str,
@@ -211,14 +125,14 @@ def download_kernels(args):
211
125
  if args.all_variants:
212
126
  install_kernel_all_variants(
213
127
  kernel_lock.repo_id,
214
- kernel_lock.sha,
128
+ revision=kernel_lock.sha,
215
129
  variant_locks=kernel_lock.variants,
216
130
  )
217
131
  else:
218
132
  try:
219
133
  install_kernel(
220
134
  kernel_lock.repo_id,
221
- kernel_lock.sha,
135
+ revision=kernel_lock.sha,
222
136
  variant_locks=kernel_lock.variants,
223
137
  )
224
138
  except FileNotFoundError as e:
@@ -247,29 +161,6 @@ def lock_kernels(args):
247
161
  json.dump(all_locks, f, cls=_JSONEncoder, indent=2)
248
162
 
249
163
 
250
- def upload_kernels(args):
251
- import warnings
252
-
253
- warnings.warn(
254
- "`kernels upload` is deprecated and will be removed in version 0.14. "
255
- "Please use `kernel-builder upload` instead.",
256
- DeprecationWarning,
257
- stacklevel=1,
258
- )
259
- # Also print to stderr for visibility in CLI usage
260
- print(
261
- "Warning: `kernels upload` is deprecated and will be removed in version 0.14.\n"
262
- "Please use `kernel-builder upload` instead.\n",
263
- file=sys.stderr,
264
- )
265
- upload_kernels_dir(
266
- Path(args.kernel_dir).resolve(),
267
- repo_id=args.repo_id,
268
- branch=args.branch,
269
- private=args.private,
270
- )
271
-
272
-
273
164
  class _JSONEncoder(json.JSONEncoder):
274
165
  def default(self, o):
275
166
  if dataclasses.is_dataclass(o):
@@ -277,9 +168,7 @@ class _JSONEncoder(json.JSONEncoder):
277
168
  return super().default(o)
278
169
 
279
170
 
280
- def check_kernel(
281
- *, macos: str, manylinux: str, python_abi: str, repo_id: str, revision: str
282
- ):
171
+ def check_kernel(*, macos: str, manylinux: str, python_abi: str, repo_id: str, revision: str):
283
172
  try:
284
173
  from kernels.cli import check
285
174
  except ImportError:
@@ -91,9 +91,7 @@ class TimingResults:
91
91
 
92
92
  def validate_timing_range(self):
93
93
  if self.min_ms > self.max_ms:
94
- raise ValueError(
95
- f"min_ms ({self.min_ms}) must be <= max_ms ({self.max_ms})"
96
- )
94
+ raise ValueError(f"min_ms ({self.min_ms}) must be <= max_ms ({self.max_ms})")
97
95
 
98
96
 
99
97
  @strict
@@ -302,9 +300,7 @@ def _get_macos_gpu() -> tuple[str | None, int | None]:
302
300
  from ctypes import POINTER, byref, c_char_p, c_int, c_int64, c_uint32, c_void_p
303
301
 
304
302
  iokit = ctypes.CDLL("/System/Library/Frameworks/IOKit.framework/IOKit")
305
- cf = ctypes.CDLL(
306
- "/System/Library/Frameworks/CoreFoundation.framework/CoreFoundation"
307
- )
303
+ cf = ctypes.CDLL("/System/Library/Frameworks/CoreFoundation.framework/CoreFoundation")
308
304
 
309
305
  iokit.IOServiceMatching.restype = c_void_p
310
306
  iokit.IOServiceMatching.argtypes = [c_char_p]
@@ -365,9 +361,7 @@ def _get_macos_gpu() -> tuple[str | None, int | None]:
365
361
  cf.CFRelease(key)
366
362
 
367
363
  # Get GPU core count
368
- key = cf.CFStringCreateWithCString(
369
- None, b"gpu-core-count", kCFStringEncodingUTF8
370
- )
364
+ key = cf.CFStringCreateWithCString(None, b"gpu-core-count", kCFStringEncodingUTF8)
371
365
  if key:
372
366
  prop = iokit.IORegistryEntryCreateCFProperty(service, key, None, 0)
373
367
  if prop:
@@ -408,9 +402,7 @@ def collect_machine_info() -> MachineInfo:
408
402
  if hasattr(torch.version, "hip") and torch.version.hip:
409
403
  backend_type = f"ROCm {torch.version.hip}"
410
404
  else:
411
- backend_type = (
412
- f"CUDA {torch.version.cuda}" if torch.version.cuda else "CUDA"
413
- )
405
+ backend_type = f"CUDA {torch.version.cuda}" if torch.version.cuda else "CUDA"
414
406
  elif backend_name == "xpu":
415
407
  gpu = torch.xpu.get_device_name(0)
416
408
  backend_type = "XPU"
@@ -462,9 +454,7 @@ def run_benchmark_class(
462
454
 
463
455
  # Find all benchmark_* methods
464
456
  benchmark_methods = [
465
- name
466
- for name in dir(benchmark_cls)
467
- if name.startswith("benchmark_") and callable(getattr(benchmark_cls, name))
457
+ name for name in dir(benchmark_cls) if name.startswith("benchmark_") and callable(getattr(benchmark_cls, name))
468
458
  ]
469
459
 
470
460
  if not benchmark_methods:
@@ -646,9 +636,7 @@ def run_benchmark_script(
646
636
  raise RuntimeError(f"No Benchmark subclasses found in {script_path}")
647
637
 
648
638
  machine_info = collect_machine_info()
649
- gpu_cores_str = (
650
- f" ({machine_info.gpu_cores} cores)" if machine_info.gpu_cores else ""
651
- )
639
+ gpu_cores_str = f" ({machine_info.gpu_cores} cores)" if machine_info.gpu_cores else ""
652
640
  print(file=sys.stderr)
653
641
  print(f" GPU {machine_info.gpu}{gpu_cores_str}", file=sys.stderr)
654
642
  print(f" CPU {machine_info.cpu}", file=sys.stderr)
@@ -721,8 +709,7 @@ def run_benchmark(
721
709
  if is_local:
722
710
  if repo_id.count("/") == 1 and not repo_id.startswith(("./", "../")):
723
711
  warnings.warn(
724
- f"'{repo_id}' exists locally but looks like a repo_id. "
725
- f"Use './{repo_id}' to be explicit.",
712
+ f"'{repo_id}' exists locally but looks like a repo_id. Use './{repo_id}' to be explicit.",
726
713
  stacklevel=2,
727
714
  )
728
715
  branch = "local"
@@ -753,9 +740,7 @@ def run_benchmark(
753
740
  if is_local:
754
741
  repo_path = repo_id_path.resolve()
755
742
  else:
756
- repo_path = Path(
757
- str(_get_hf_api().snapshot_download(repo_id=repo_id, revision=revision))
758
- )
743
+ repo_path = Path(str(_get_hf_api().snapshot_download(repo_id=repo_id, revision=revision)))
759
744
 
760
745
  scripts = discover_benchmark_scripts(repo_id, repo_path)
761
746