wafer-cli 0.2.9__py3-none-any.whl → 0.2.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wafer/GUIDE.md +18 -7
- wafer/api_client.py +4 -0
- wafer/cli.py +1177 -278
- wafer/corpus.py +158 -32
- wafer/evaluate.py +75 -6
- wafer/kernel_scope.py +132 -31
- wafer/nsys_analyze.py +903 -73
- wafer/nsys_profile.py +511 -0
- wafer/output.py +241 -0
- wafer/skills/wafer-guide/SKILL.md +13 -0
- wafer/ssh_keys.py +261 -0
- wafer/targets_ops.py +718 -0
- wafer/wevin_cli.py +127 -18
- wafer/workspaces.py +232 -184
- {wafer_cli-0.2.9.dist-info → wafer_cli-0.2.11.dist-info}/METADATA +1 -1
- {wafer_cli-0.2.9.dist-info → wafer_cli-0.2.11.dist-info}/RECORD +19 -15
- {wafer_cli-0.2.9.dist-info → wafer_cli-0.2.11.dist-info}/WHEEL +0 -0
- {wafer_cli-0.2.9.dist-info → wafer_cli-0.2.11.dist-info}/entry_points.txt +0 -0
- {wafer_cli-0.2.9.dist-info → wafer_cli-0.2.11.dist-info}/top_level.txt +0 -0
wafer/corpus.py
CHANGED
|
@@ -15,7 +15,16 @@ import httpx
|
|
|
15
15
|
|
|
16
16
|
CACHE_DIR = Path.home() / ".cache" / "wafer" / "corpora"
|
|
17
17
|
|
|
18
|
-
CorpusName = Literal["cuda", "cutlass", "hip"]
|
|
18
|
+
CorpusName = Literal["cuda", "cutlass", "hip", "amd"]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class RepoSource:
    """One GitHub repository that contributes files to a multi-repo corpus."""

    # Repository identifier in "owner/name" form (e.g. "ROCm/rocWMMA").
    repo: str
    # Repo-relative path prefixes whose files should be downloaded.
    paths: list[str]
    # Branch whose tarball is fetched.
    branch: str = "main"
|
|
19
28
|
|
|
20
29
|
|
|
21
30
|
@dataclass
|
|
@@ -24,10 +33,11 @@ class CorpusConfig:
|
|
|
24
33
|
|
|
25
34
|
name: CorpusName
|
|
26
35
|
description: str
|
|
27
|
-
source_type: Literal["nvidia_md", "github_repo"]
|
|
36
|
+
source_type: Literal["nvidia_md", "github_repo", "github_multi_repo"]
|
|
28
37
|
urls: list[str] | None = None
|
|
29
38
|
repo: str | None = None
|
|
30
39
|
repo_paths: list[str] | None = None
|
|
40
|
+
repos: list[RepoSource] | None = None # For multi-repo corpora
|
|
31
41
|
|
|
32
42
|
|
|
33
43
|
CORPORA: dict[CorpusName, CorpusConfig] = {
|
|
@@ -69,6 +79,74 @@ CORPORA: dict[CorpusName, CorpusConfig] = {
|
|
|
69
79
|
repo="ROCm/HIP",
|
|
70
80
|
repo_paths=["docs"],
|
|
71
81
|
),
|
|
82
|
+
"amd": CorpusConfig(
|
|
83
|
+
name="amd",
|
|
84
|
+
description="AMD GPU kernel development (rocWMMA, CK, AITER, rocBLAS, HipKittens, vLLM)",
|
|
85
|
+
source_type="github_multi_repo",
|
|
86
|
+
repos=[
|
|
87
|
+
# rocWMMA - wave matrix multiply-accumulate (WMMA) intrinsics
|
|
88
|
+
RepoSource(
|
|
89
|
+
repo="ROCm/rocWMMA",
|
|
90
|
+
paths=["docs", "samples", "library/include"],
|
|
91
|
+
branch="develop",
|
|
92
|
+
),
|
|
93
|
+
# Composable Kernel - tile-based GPU programming
|
|
94
|
+
RepoSource(
|
|
95
|
+
repo="ROCm/composable_kernel",
|
|
96
|
+
paths=["docs", "example", "tutorial", "include/ck_tile"],
|
|
97
|
+
branch="develop",
|
|
98
|
+
),
|
|
99
|
+
# AITER - AI Tensor Engine for ROCm
|
|
100
|
+
RepoSource(
|
|
101
|
+
repo="ROCm/aiter",
|
|
102
|
+
paths=["docs", "aiter/ops"],
|
|
103
|
+
),
|
|
104
|
+
# MIOpen - deep learning primitives (deprecated, use rocm-libraries)
|
|
105
|
+
RepoSource(
|
|
106
|
+
repo="ROCm/MIOpen",
|
|
107
|
+
paths=["docs"],
|
|
108
|
+
branch="develop_deprecated",
|
|
109
|
+
),
|
|
110
|
+
# rocBLAS - BLAS library (deprecated, use rocm-libraries)
|
|
111
|
+
RepoSource(
|
|
112
|
+
repo="ROCm/rocBLAS",
|
|
113
|
+
paths=["docs"],
|
|
114
|
+
branch="develop_deprecated",
|
|
115
|
+
),
|
|
116
|
+
# hipBLASLt - lightweight BLAS (deprecated, use rocm-libraries)
|
|
117
|
+
RepoSource(
|
|
118
|
+
repo="ROCm/hipBLASLt",
|
|
119
|
+
paths=["docs"],
|
|
120
|
+
branch="develop_deprecated",
|
|
121
|
+
),
|
|
122
|
+
# Tensile - GEMM code generator (deprecated, use rocm-libraries)
|
|
123
|
+
RepoSource(
|
|
124
|
+
repo="ROCm/Tensile",
|
|
125
|
+
paths=["docs"],
|
|
126
|
+
branch="develop_deprecated",
|
|
127
|
+
),
|
|
128
|
+
# HipKittens - high-performance AMD kernels
|
|
129
|
+
RepoSource(
|
|
130
|
+
repo="HazyResearch/HipKittens",
|
|
131
|
+
paths=["docs", "kernels", "include"],
|
|
132
|
+
),
|
|
133
|
+
# vLLM AMD kernels
|
|
134
|
+
RepoSource(
|
|
135
|
+
repo="vllm-project/vllm",
|
|
136
|
+
paths=["csrc/rocm"],
|
|
137
|
+
),
|
|
138
|
+
# SGLang AMD kernels
|
|
139
|
+
RepoSource(
|
|
140
|
+
repo="sgl-project/sglang",
|
|
141
|
+
paths=["3rdparty/amd"],
|
|
142
|
+
),
|
|
143
|
+
# HuggingFace ROCm kernels
|
|
144
|
+
RepoSource(
|
|
145
|
+
repo="huggingface/hf-rocm-kernels",
|
|
146
|
+
paths=["csrc", "hf_rocm_kernels", "docs"],
|
|
147
|
+
),
|
|
148
|
+
],
|
|
149
|
+
),
|
|
72
150
|
}
|
|
73
151
|
|
|
74
152
|
|
|
@@ -113,41 +191,87 @@ def _download_nvidia_md(config: CorpusConfig, dest: Path, verbose: bool = True)
|
|
|
113
191
|
return downloaded
|
|
114
192
|
|
|
115
193
|
|
|
194
|
+
def _extract_matching_files(
    tar: tarfile.TarFile,
    repo_paths: list[str],
    dest: Path,
    verbose: bool,
) -> int:
    """Extract regular files whose repo-relative path starts with a wanted prefix.

    Args:
        tar: Open tarball to read members from.
        repo_paths: Path prefixes (relative to the repository root) to keep.
        dest: Directory under which matching files are written, preserving
            their repo-relative layout.
        verbose: When true, print one line per extracted file.

    Returns:
        Number of files written.

    Members whose resolved target would fall outside ``dest`` (for example a
    name containing ``..`` components) are skipped, so a crafted archive
    cannot write elsewhere on disk.
    """
    prefixes = tuple(repo_paths)
    dest_root = dest.resolve()
    downloaded = 0
    for member in tar.getmembers():
        if not member.isfile():
            continue
        # Drop the first path component (the archive's top-level directory).
        rel_path = "/".join(member.name.split("/")[1:])
        if not rel_path.startswith(prefixes):
            continue
        target = dest / rel_path
        # Path-traversal guard: the file must land strictly inside dest.
        if dest_root not in target.resolve().parents:
            continue
        target.parent.mkdir(parents=True, exist_ok=True)
        src = tar.extractfile(member)
        if src:
            target.write_bytes(src.read())
            downloaded += 1
            if verbose:
                print(f" ✓ {rel_path}")
    return downloaded
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def _download_single_github_repo(
    client: httpx.Client,
    repo: str,
    repo_paths: list[str],
    dest: Path,
    branch: str = "main",
    verbose: bool = True,
) -> int:
    """Fetch one repository's tarball and unpack the requested paths.

    Args:
        client: HTTP client used for the request.
        repo: Repository in "owner/name" form.
        repo_paths: Repo-relative path prefixes to extract.
        dest: Directory that receives the extracted files.
        branch: Branch whose tarball is requested.
        verbose: When true, print progress lines.

    Returns:
        Number of files extracted.

    Raises:
        httpx.HTTPStatusError: If the tarball request returns an error status.
    """
    if verbose:
        print(f" Fetching {repo}...")
    response = client.get(f"https://api.github.com/repos/{repo}/tarball/{branch}")
    response.raise_for_status()

    # Spool the archive to disk so tarfile can open it by path.
    spool = tempfile.NamedTemporaryFile(suffix=".tar.gz", delete=False)
    with spool:
        spool.write(response.content)
        archive_path = Path(spool.name)

    try:
        with tarfile.open(archive_path, "r:gz") as archive:
            extracted = _extract_matching_files(archive, repo_paths, dest, verbose)
    finally:
        # The temp file was created with delete=False, so remove it ourselves.
        archive_path.unlink()
    return extracted
|
|
241
|
+
|
|
242
|
+
|
|
116
243
|
def _download_github_repo(config: CorpusConfig, dest: Path, verbose: bool = True) -> int:
    """Download the configured paths of a single-repo corpus.

    Requires ``config.repo`` and ``config.repo_paths`` to be set. Returns the
    number of files extracted into ``dest``.
    """
    repo, repo_paths = config.repo, config.repo_paths
    assert repo is not None
    assert repo_paths is not None
    with httpx.Client(timeout=60.0, follow_redirects=True) as http:
        file_count = _download_single_github_repo(
            http, repo, repo_paths, dest, verbose=verbose
        )
    return file_count
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def _download_github_multi_repo(config: CorpusConfig, dest: Path, verbose: bool = True) -> int:
    """Download every repository listed in a multi-repo corpus.

    Each repository is unpacked into its own sub-directory of ``dest`` named
    after the last component of its "owner/name" identifier. An HTTP failure
    for one repository is reported (when ``verbose``) and does not stop the
    remaining downloads.

    Returns:
        Total number of files extracted across all repositories.
    """
    assert config.repos is not None
    total = 0
    with httpx.Client(timeout=120.0, follow_redirects=True) as http:
        for source in config.repos:
            target_dir = dest / source.repo.rsplit("/", 1)[-1]
            target_dir.mkdir(parents=True, exist_ok=True)
            try:
                total += _download_single_github_repo(
                    http,
                    source.repo,
                    source.paths,
                    target_dir,
                    branch=source.branch,
                    verbose=verbose,
                )
            except httpx.HTTPError as e:
                # Best effort: keep going with the other repositories.
                if verbose:
                    print(f" ✗ {source.repo}: {e}")
    return total
|
|
152
276
|
|
|
153
277
|
|
|
@@ -185,6 +309,8 @@ def download_corpus(name: CorpusName, force: bool = False, verbose: bool = True)
|
|
|
185
309
|
count = _download_nvidia_md(config, dest, verbose)
|
|
186
310
|
elif config.source_type == "github_repo":
|
|
187
311
|
count = _download_github_repo(config, dest, verbose)
|
|
312
|
+
elif config.source_type == "github_multi_repo":
|
|
313
|
+
count = _download_github_multi_repo(config, dest, verbose)
|
|
188
314
|
else:
|
|
189
315
|
raise ValueError(f"Unknown source type: {config.source_type}")
|
|
190
316
|
if verbose:
|
wafer/evaluate.py
CHANGED
|
@@ -22,6 +22,30 @@ from wafer_core.utils.kernel_utils.targets.config import (
|
|
|
22
22
|
)
|
|
23
23
|
|
|
24
24
|
|
|
25
|
+
# Map AMD compute capability to ROCm architecture
|
|
26
|
+
# Used to set PYTORCH_ROCM_ARCH for faster compilation (compile only for target arch)
|
|
27
|
+
AMD_CC_TO_ARCH = {
|
|
28
|
+
"9.4": "gfx942", # MI300X
|
|
29
|
+
"9.0a": "gfx90a", # MI200 series
|
|
30
|
+
"9.08": "gfx908", # MI100
|
|
31
|
+
"9.06": "gfx906", # MI50/60
|
|
32
|
+
"10.30": "gfx1030", # RDNA2
|
|
33
|
+
"11.0": "gfx1100", # RDNA3
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _get_rocm_arch(compute_capability: str) -> str | None:
|
|
38
|
+
"""Get ROCm architecture string from compute capability.
|
|
39
|
+
|
|
40
|
+
Returns gfx* string for PYTORCH_ROCM_ARCH, or None if not found.
|
|
41
|
+
"""
|
|
42
|
+
# Already a gfx string
|
|
43
|
+
if compute_capability.startswith("gfx"):
|
|
44
|
+
return compute_capability
|
|
45
|
+
# Map from numeric CC
|
|
46
|
+
return AMD_CC_TO_ARCH.get(compute_capability)
|
|
47
|
+
|
|
48
|
+
|
|
25
49
|
def _build_docker_run_command(
|
|
26
50
|
image: str,
|
|
27
51
|
command: str,
|
|
@@ -162,6 +186,7 @@ class KernelBenchEvaluateArgs:
|
|
|
162
186
|
inputs: Path | None = None # Custom inputs file to override get_inputs()
|
|
163
187
|
seed: int = 42 # Random seed for reproducibility
|
|
164
188
|
defensive: bool = False
|
|
189
|
+
backend: str | None = None # Kernel backend for static validation
|
|
165
190
|
sync_artifacts: bool = True
|
|
166
191
|
gpu_id: int | None = None
|
|
167
192
|
|
|
@@ -2743,6 +2768,17 @@ import torch
|
|
|
2743
2768
|
import torch.nn as nn
|
|
2744
2769
|
from pathlib import Path
|
|
2745
2770
|
|
|
2771
|
+
# Use a unique per-run PyTorch extension cache directory to ensure fresh compilation.
|
|
2772
|
+
# This prevents stale cached extensions from being loaded when the pod is reused.
|
|
2773
|
+
# Without this, if a kernel is modified but uses the same extension name,
|
|
2774
|
+
# PyTorch would load the old cached .so instead of recompiling.
|
|
2775
|
+
# We use a UUID-based directory instead of clearing the cache to avoid race conditions
|
|
2776
|
+
# with other processes that might be using the cache.
|
|
2777
|
+
import uuid
|
|
2778
|
+
unique_cache_dir = f"/tmp/torch_extensions_{uuid.uuid4().hex[:8]}"
|
|
2779
|
+
os.environ["TORCH_EXTENSIONS_DIR"] = unique_cache_dir
|
|
2780
|
+
print(f"[KernelBench] Using unique extension cache: {unique_cache_dir}")
|
|
2781
|
+
|
|
2746
2782
|
# Clear any stale GPU memory from previous runs at startup
|
|
2747
2783
|
# NOTE: empty_cache only frees memory from THIS process's PyTorch allocator.
|
|
2748
2784
|
# It won't free memory from dead/zombie processes - rocm-smi --showpids can show
|
|
@@ -3376,6 +3412,27 @@ def _validate_kernelbench_files(args: KernelBenchEvaluateArgs) -> str | None:
|
|
|
3376
3412
|
" KernelBench format requires: 'class Model', 'get_inputs()', 'get_init_inputs()'"
|
|
3377
3413
|
)
|
|
3378
3414
|
|
|
3415
|
+
# Static kernel validation if backend specified
|
|
3416
|
+
if args.backend:
|
|
3417
|
+
from wafer_core.utils.kernel_utils.static_checker import validate_kernel_static
|
|
3418
|
+
|
|
3419
|
+
code = args.implementation.read_text()
|
|
3420
|
+
valid, errors, warnings = validate_kernel_static(code, backend=args.backend)
|
|
3421
|
+
|
|
3422
|
+
# Print warnings (don't fail)
|
|
3423
|
+
for warning in warnings:
|
|
3424
|
+
logger.warning(f"Static check warning: {warning}")
|
|
3425
|
+
|
|
3426
|
+
# Fail on errors
|
|
3427
|
+
if not valid:
|
|
3428
|
+
error_list = "\n - ".join(errors)
|
|
3429
|
+
return (
|
|
3430
|
+
f"Static kernel validation failed for backend '{args.backend}':\n"
|
|
3431
|
+
f" - {error_list}\n\n"
|
|
3432
|
+
f"The implementation must use {args.backend.upper()} kernel primitives.\n"
|
|
3433
|
+
"See KernelBench documentation for valid kernel patterns."
|
|
3434
|
+
)
|
|
3435
|
+
|
|
3379
3436
|
return None
|
|
3380
3437
|
|
|
3381
3438
|
|
|
@@ -3819,14 +3876,20 @@ async def run_evaluate_kernelbench_digitalocean(
|
|
|
3819
3876
|
full_cmd = f"cd {container_run_path} && {eval_cmd}"
|
|
3820
3877
|
|
|
3821
3878
|
# Build Docker command for AMD
|
|
3879
|
+
# PYTORCH_ROCM_ARCH: compile only for target arch (5-7x faster compile)
|
|
3880
|
+
rocm_arch = _get_rocm_arch(target.compute_capability)
|
|
3881
|
+
env_dict = {
|
|
3882
|
+
"HIP_VISIBLE_DEVICES": str(gpu_id),
|
|
3883
|
+
"PYTHONUNBUFFERED": "1",
|
|
3884
|
+
}
|
|
3885
|
+
if rocm_arch:
|
|
3886
|
+
env_dict["PYTORCH_ROCM_ARCH"] = rocm_arch
|
|
3887
|
+
|
|
3822
3888
|
docker_cmd = _build_docker_run_command_amd(
|
|
3823
3889
|
image=docker_image,
|
|
3824
3890
|
command=full_cmd,
|
|
3825
3891
|
working_dir=container_run_path,
|
|
3826
|
-
env=
|
|
3827
|
-
"HIP_VISIBLE_DEVICES": str(gpu_id),
|
|
3828
|
-
"PYTHONUNBUFFERED": "1",
|
|
3829
|
-
},
|
|
3892
|
+
env=env_dict,
|
|
3830
3893
|
volumes={workspace_path: CONTAINER_WORKSPACE},
|
|
3831
3894
|
)
|
|
3832
3895
|
|
|
@@ -4065,7 +4128,10 @@ async def run_evaluate_kernelbench_runpod(
|
|
|
4065
4128
|
eval_cmd = " ".join(python_cmd_parts)
|
|
4066
4129
|
|
|
4067
4130
|
# Set environment for AMD GPU and run
|
|
4068
|
-
|
|
4131
|
+
# PYTORCH_ROCM_ARCH: compile only for target arch (5-7x faster compile)
|
|
4132
|
+
rocm_arch = _get_rocm_arch(target.compute_capability)
|
|
4133
|
+
arch_env = f"PYTORCH_ROCM_ARCH={rocm_arch}" if rocm_arch else ""
|
|
4134
|
+
env_vars = f"HIP_VISIBLE_DEVICES={gpu_id} ROCM_PATH=/opt/rocm PYTHONUNBUFFERED=1 {arch_env}"
|
|
4069
4135
|
full_cmd = f"cd {run_path} && {env_vars} {eval_cmd}"
|
|
4070
4136
|
|
|
4071
4137
|
# Run and stream output
|
|
@@ -4299,7 +4365,10 @@ async def run_evaluate_kernelbench_baremetal_amd(
|
|
|
4299
4365
|
eval_cmd = " ".join(python_cmd_parts)
|
|
4300
4366
|
|
|
4301
4367
|
# Set environment for AMD GPU and run
|
|
4302
|
-
|
|
4368
|
+
# PYTORCH_ROCM_ARCH: compile only for target arch (5-7x faster compile)
|
|
4369
|
+
rocm_arch = _get_rocm_arch(target.compute_capability)
|
|
4370
|
+
arch_env = f"PYTORCH_ROCM_ARCH={rocm_arch}" if rocm_arch else ""
|
|
4371
|
+
env_vars = f"HIP_VISIBLE_DEVICES={gpu_id} ROCM_PATH=/opt/rocm PYTHONUNBUFFERED=1 {arch_env}"
|
|
4303
4372
|
full_cmd = f"cd {run_path} && {env_vars} {eval_cmd}"
|
|
4304
4373
|
|
|
4305
4374
|
# Run and stream output
|
wafer/kernel_scope.py
CHANGED
|
@@ -1,12 +1,13 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""Unified ISA Analyzer - CLI for static ISA analysis of AMD GPU kernels.
|
|
2
2
|
|
|
3
|
-
This module provides the CLI wrapper for the `wafer amd
|
|
3
|
+
This module provides the CLI wrapper for the `wafer amd isa` command.
|
|
4
4
|
It supports analysis of:
|
|
5
|
-
-
|
|
6
|
-
-
|
|
7
|
-
-
|
|
5
|
+
- AMD GPU code objects (.co) - Via API server with ROCm tools
|
|
6
|
+
- AMDGCN ISA files (.s, .gcn, .asm) - Local parsing
|
|
7
|
+
- LLVM-IR files (.ll) - Local parsing
|
|
8
|
+
- TTGIR files (.ttgir, .ttir, .mlir) - Local parsing
|
|
8
9
|
|
|
9
|
-
Design: Wafer-436 - AMD Kernel Scope
|
|
10
|
+
Design: Wafer-436 - AMD Kernel Scope / ISA Analyzer
|
|
10
11
|
"""
|
|
11
12
|
|
|
12
13
|
import json
|
|
@@ -17,13 +18,19 @@ from typing import Optional
|
|
|
17
18
|
|
|
18
19
|
def print_usage() -> None:
|
|
19
20
|
"""Print CLI usage information."""
|
|
20
|
-
print("Usage: wafer amd
|
|
21
|
+
print("Usage: wafer amd isa <subcommand> [options]", file=sys.stderr)
|
|
21
22
|
print("", file=sys.stderr)
|
|
22
23
|
print("Subcommands:", file=sys.stderr)
|
|
23
|
-
print(" analyze <file|directory> Analyze ISA
|
|
24
|
+
print(" analyze <file|directory> Analyze ISA files (.co, .s, .ll, .ttgir)", file=sys.stderr)
|
|
24
25
|
print(" metrics List available metrics", file=sys.stderr)
|
|
25
26
|
print(" targets List supported GPU targets", file=sys.stderr)
|
|
26
27
|
print("", file=sys.stderr)
|
|
28
|
+
print("Supported File Types:", file=sys.stderr)
|
|
29
|
+
print(" .co AMD GPU code objects (requires API authentication)", file=sys.stderr)
|
|
30
|
+
print(" .s, .gcn, .asm AMDGCN ISA assembly (local parsing)", file=sys.stderr)
|
|
31
|
+
print(" .ll, .bc LLVM-IR (local parsing)", file=sys.stderr)
|
|
32
|
+
print(" .ttgir, .ttir, .mlir TTGIR / Triton IR (local parsing)", file=sys.stderr)
|
|
33
|
+
print("", file=sys.stderr)
|
|
27
34
|
print("Analyze Options:", file=sys.stderr)
|
|
28
35
|
print(" --json Output as JSON", file=sys.stderr)
|
|
29
36
|
print(" --csv Output as CSV", file=sys.stderr)
|
|
@@ -33,12 +40,13 @@ def print_usage() -> None:
|
|
|
33
40
|
print(" --kernel INDEX Kernel index if multiple in file", file=sys.stderr)
|
|
34
41
|
print("", file=sys.stderr)
|
|
35
42
|
print("Examples:", file=sys.stderr)
|
|
36
|
-
print(" wafer amd
|
|
37
|
-
print(" wafer amd
|
|
38
|
-
print(" wafer amd
|
|
39
|
-
print(" wafer amd
|
|
40
|
-
print(" wafer amd
|
|
41
|
-
print(" wafer amd
|
|
43
|
+
print(" wafer amd isa analyze kernel.co # Analyze code object (requires login)", file=sys.stderr)
|
|
44
|
+
print(" wafer amd isa analyze kernel.s # Analyze ISA assembly", file=sys.stderr)
|
|
45
|
+
print(" wafer amd isa analyze kernel.s --json # Output as JSON", file=sys.stderr)
|
|
46
|
+
print(" wafer amd isa analyze ~/.triton/cache/ --filter 'spills > 0'", file=sys.stderr)
|
|
47
|
+
print(" wafer amd isa analyze . -r --csv -o metrics.csv", file=sys.stderr)
|
|
48
|
+
print(" wafer amd isa metrics # List available metrics", file=sys.stderr)
|
|
49
|
+
print(" wafer amd isa targets # List supported GPU targets", file=sys.stderr)
|
|
42
50
|
|
|
43
51
|
|
|
44
52
|
def analyze_command(
|
|
@@ -49,8 +57,10 @@ def analyze_command(
|
|
|
49
57
|
filter_expr: Optional[str] = None,
|
|
50
58
|
output_file: Optional[str] = None,
|
|
51
59
|
kernel_index: int = 0,
|
|
60
|
+
api_url: Optional[str] = None,
|
|
61
|
+
auth_headers: Optional[dict[str, str]] = None,
|
|
52
62
|
) -> str:
|
|
53
|
-
"""Analyze ISA/LLVM-IR/TTGIR file or directory.
|
|
63
|
+
"""Analyze ISA/LLVM-IR/TTGIR/.co file or directory.
|
|
54
64
|
|
|
55
65
|
Args:
|
|
56
66
|
path: Path to file or directory
|
|
@@ -60,12 +70,15 @@ def analyze_command(
|
|
|
60
70
|
filter_expr: Filter expression (e.g., "spills > 0")
|
|
61
71
|
output_file: Write output to file
|
|
62
72
|
kernel_index: Kernel index for multi-kernel files
|
|
73
|
+
api_url: API URL for .co file analysis (required for .co files)
|
|
74
|
+
auth_headers: Auth headers for .co file analysis
|
|
63
75
|
|
|
64
76
|
Returns:
|
|
65
77
|
Analysis output string
|
|
66
78
|
"""
|
|
67
79
|
from wafer_core.lib.kernel_scope import (
|
|
68
80
|
analyze_isa_file,
|
|
81
|
+
analyze_code_object,
|
|
69
82
|
analyze_directory,
|
|
70
83
|
analyze_file,
|
|
71
84
|
)
|
|
@@ -79,11 +92,19 @@ def analyze_command(
|
|
|
79
92
|
if target_path.is_file():
|
|
80
93
|
suffix = target_path.suffix.lower()
|
|
81
94
|
|
|
82
|
-
#
|
|
83
|
-
if suffix
|
|
95
|
+
# Code object files (.co) - need API
|
|
96
|
+
if suffix == ".co":
|
|
97
|
+
if not api_url or not auth_headers:
|
|
98
|
+
raise RuntimeError(
|
|
99
|
+
"API authentication required for .co file analysis. "
|
|
100
|
+
"Run 'wafer login' first."
|
|
101
|
+
)
|
|
102
|
+
result = analyze_code_object(target_path, api_url, auth_headers)
|
|
103
|
+
# ISA files - use kernel_index parameter
|
|
104
|
+
elif suffix in (".s", ".gcn", ".asm"):
|
|
84
105
|
result = analyze_isa_file(target_path, kernel_index=kernel_index)
|
|
85
106
|
else:
|
|
86
|
-
result = analyze_file(target_path)
|
|
107
|
+
result = analyze_file(target_path, api_url=api_url, auth_headers=auth_headers)
|
|
87
108
|
|
|
88
109
|
if not result.success:
|
|
89
110
|
raise RuntimeError(f"Analysis failed: {result.error}")
|
|
@@ -92,7 +113,12 @@ def analyze_command(
|
|
|
92
113
|
|
|
93
114
|
# Directory analysis
|
|
94
115
|
else:
|
|
95
|
-
batch_result = analyze_directory(
|
|
116
|
+
batch_result = analyze_directory(
|
|
117
|
+
target_path,
|
|
118
|
+
recursive=recursive,
|
|
119
|
+
api_url=api_url,
|
|
120
|
+
auth_headers=auth_headers,
|
|
121
|
+
)
|
|
96
122
|
|
|
97
123
|
# Apply filter if specified
|
|
98
124
|
if filter_expr:
|
|
@@ -217,11 +243,53 @@ def _result_to_text(result) -> str:
|
|
|
217
243
|
"""Format single result as human-readable text."""
|
|
218
244
|
lines = []
|
|
219
245
|
|
|
220
|
-
if result.
|
|
246
|
+
if result.code_object_analysis:
|
|
247
|
+
# .co file analysis (via API)
|
|
248
|
+
a = result.code_object_analysis
|
|
249
|
+
lines.extend([
|
|
250
|
+
f"Kernel: {a.kernel_name}",
|
|
251
|
+
f"Architecture: {a.architecture}",
|
|
252
|
+
f"Source: Code Object (.co)",
|
|
253
|
+
"",
|
|
254
|
+
"=== Registers ===",
|
|
255
|
+
f" VGPRs: {a.vgpr_count}",
|
|
256
|
+
f" SGPRs: {a.sgpr_count}",
|
|
257
|
+
f" AGPRs: {a.agpr_count}",
|
|
258
|
+
])
|
|
259
|
+
|
|
260
|
+
if a.vgpr_spill_count > 0 or a.sgpr_spill_count > 0:
|
|
261
|
+
lines.extend([
|
|
262
|
+
"",
|
|
263
|
+
"!!! SPILLS DETECTED !!!",
|
|
264
|
+
f" VGPR spills: {a.vgpr_spill_count}",
|
|
265
|
+
f" SGPR spills: {a.sgpr_spill_count}",
|
|
266
|
+
])
|
|
267
|
+
else:
|
|
268
|
+
lines.append(" Spills: None (good)")
|
|
269
|
+
|
|
270
|
+
lines.extend([
|
|
271
|
+
"",
|
|
272
|
+
"=== Memory ===",
|
|
273
|
+
f" LDS: {a.lds_bytes} bytes",
|
|
274
|
+
f" Global loads: {a.global_loads}",
|
|
275
|
+
f" Global stores: {a.global_stores}",
|
|
276
|
+
f" LDS ops: {a.lds_ops}",
|
|
277
|
+
"",
|
|
278
|
+
"=== Instructions ===",
|
|
279
|
+
f" MFMA: {a.mfma_count}",
|
|
280
|
+
f" FMA: {a.fma_count}",
|
|
281
|
+
f" Packed (v_pk_*): {a.packed_ops_count}",
|
|
282
|
+
f" Full stalls (waitcnt 0): {a.waitcnt_full_stalls}",
|
|
283
|
+
f" Barriers: {a.barriers}",
|
|
284
|
+
])
|
|
285
|
+
|
|
286
|
+
elif result.isa_analysis:
|
|
287
|
+
# .s/.gcn/.asm file analysis (local parsing)
|
|
221
288
|
a = result.isa_analysis
|
|
222
289
|
lines.extend([
|
|
223
290
|
f"Kernel: {a.kernel_name}",
|
|
224
291
|
f"Architecture: {a.architecture}",
|
|
292
|
+
f"Source: ISA Assembly (.s)",
|
|
225
293
|
"",
|
|
226
294
|
"=== Registers ===",
|
|
227
295
|
f" VGPRs: {a.vgpr_count}",
|
|
@@ -330,10 +398,16 @@ def _result_to_text(result) -> str:
|
|
|
330
398
|
|
|
331
399
|
def _result_to_csv(result) -> str:
    """Format single result as CSV.

    Produces a header line and one data row, separated by a newline with no
    trailing newline. Fields are written through the ``csv`` module so that
    values containing commas, quotes or newlines (for example templated
    kernel names) are quoted instead of shifting columns.

    Returns the literal ``"# Unsupported format for CSV"`` when the result
    carries neither a code-object analysis nor an ISA analysis.
    """
    import csv
    import io

    header = [
        "kernel_name",
        "architecture",
        "source_type",
        "vgpr_count",
        "sgpr_count",
        "vgpr_spills",
        "sgpr_spills",
        "mfma_count",
        "lds_bytes",
        "global_loads",
        "global_stores",
    ]

    if result.code_object_analysis:
        a = result.code_object_analysis
        row = [
            a.kernel_name,
            a.architecture,
            "code_object",
            a.vgpr_count,
            a.sgpr_count,
            a.vgpr_spill_count,
            a.sgpr_spill_count,
            a.mfma_count,
            a.lds_bytes,
            a.global_loads,
            a.global_stores,
        ]
    elif result.isa_analysis:
        a = result.isa_analysis
        row = [
            a.kernel_name,
            a.architecture,
            "isa_assembly",
            a.vgpr_count,
            a.sgpr_count,
            a.vgpr_spill_count,
            a.sgpr_spill_count,
            a.mfma_count,
            a.lds_size,
            a.global_load_count,
            a.global_store_count,
        ]
    else:
        return "# Unsupported format for CSV"

    buf = io.StringIO()
    writer = csv.writer(buf, lineterminator="\n")
    writer.writerow(header)
    # Stringify first so every value renders exactly as an f-string would.
    writer.writerow([f"{value}" for value in row])
    return buf.getvalue().removesuffix("\n")
|
|
@@ -362,7 +436,15 @@ def _batch_to_text(batch_result) -> str:
|
|
|
362
436
|
|
|
363
437
|
# Show individual results
|
|
364
438
|
for result in batch_result.results:
|
|
365
|
-
if result.success and result.
|
|
439
|
+
if result.success and result.code_object_analysis:
|
|
440
|
+
a = result.code_object_analysis
|
|
441
|
+
spills = a.vgpr_spill_count + a.sgpr_spill_count
|
|
442
|
+
status = "⚠️" if spills > 0 else "✓"
|
|
443
|
+
lines.append(
|
|
444
|
+
f" {status} {result.file_path}: "
|
|
445
|
+
f"VGPRs={a.vgpr_count}, spills={spills}, MFMA={a.mfma_count}"
|
|
446
|
+
)
|
|
447
|
+
elif result.success and result.isa_analysis:
|
|
366
448
|
a = result.isa_analysis
|
|
367
449
|
status = "⚠️" if a.spill_count > 0 else "✓"
|
|
368
450
|
lines.append(
|
|
@@ -377,15 +459,22 @@ def _batch_to_text(batch_result) -> str:
|
|
|
377
459
|
|
|
378
460
|
def _batch_to_csv(batch_result) -> str:
    """Format batch results as CSV.

    Emits a header line followed by one row per successful result that has a
    code-object or ISA analysis; other results are omitted. Lines are joined
    with newlines and there is no trailing newline. Fields go through the
    ``csv`` module so file paths or kernel names containing commas, quotes or
    newlines are quoted instead of corrupting the column layout.
    """
    import csv
    import io

    buf = io.StringIO()
    writer = csv.writer(buf, lineterminator="\n")
    writer.writerow([
        "file_path",
        "kernel_name",
        "architecture",
        "source_type",
        "vgpr_count",
        "sgpr_count",
        "vgpr_spills",
        "sgpr_spills",
        "mfma_count",
        "lds_bytes",
    ])

    for result in batch_result.results:
        if not result.success:
            continue
        if result.code_object_analysis:
            a = result.code_object_analysis
            source_type, lds = "code_object", a.lds_bytes
        elif result.isa_analysis:
            a = result.isa_analysis
            source_type, lds = "isa_assembly", a.lds_size
        else:
            continue
        row = [
            result.file_path,
            a.kernel_name,
            a.architecture,
            source_type,
            a.vgpr_count,
            a.sgpr_count,
            a.vgpr_spill_count,
            a.sgpr_spill_count,
            a.mfma_count,
            lds,
        ]
        # Stringify first so every value renders exactly as an f-string would.
        writer.writerow([f"{value}" for value in row])

    return buf.getvalue().removesuffix("\n")
|
|
@@ -416,12 +505,24 @@ def _apply_filter(batch_result, filter_expr: str):
|
|
|
416
505
|
}
|
|
417
506
|
metric = metric_map.get(metric, metric)
|
|
418
507
|
|
|
419
|
-
# Filter function
|
|
508
|
+
# Filter function - supports both isa_analysis and code_object_analysis
|
|
420
509
|
def passes_filter(result):
|
|
421
|
-
if not result.success
|
|
510
|
+
if not result.success:
|
|
422
511
|
return False
|
|
423
512
|
|
|
424
|
-
|
|
513
|
+
# Try to get metric from either analysis type
|
|
514
|
+
actual = None
|
|
515
|
+
if result.isa_analysis:
|
|
516
|
+
actual = getattr(result.isa_analysis, metric, None)
|
|
517
|
+
elif result.code_object_analysis:
|
|
518
|
+
# Map isa_analysis metric names to code_object_analysis equivalents
|
|
519
|
+
co_metric_map = {
|
|
520
|
+
"spill_count": "vgpr_spill_count", # Use vgpr_spill_count as proxy
|
|
521
|
+
"lds_size": "lds_bytes",
|
|
522
|
+
}
|
|
523
|
+
co_metric = co_metric_map.get(metric, metric)
|
|
524
|
+
actual = getattr(result.code_object_analysis, co_metric, None)
|
|
525
|
+
|
|
425
526
|
if actual is None:
|
|
426
527
|
return False
|
|
427
528
|
|