wafer-cli 0.2.9__py3-none-any.whl → 0.2.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
wafer/corpus.py CHANGED
@@ -15,7 +15,16 @@ import httpx
15
15
 
16
16
  CACHE_DIR = Path.home() / ".cache" / "wafer" / "corpora"
17
17
 
18
- CorpusName = Literal["cuda", "cutlass", "hip"]
18
+ CorpusName = Literal["cuda", "cutlass", "hip", "amd"]
19
+
20
+
21
+ @dataclass
22
+ class RepoSource:
23
+ """A single GitHub repo source within a corpus: which repo to fetch, which paths inside it to keep, and which branch to download."""
24
+
25
+ repo: str # GitHub "owner/name" slug; interpolated into the tarball API URL
26
+ paths: list[str] # Repo-relative path prefixes; only files whose path starts with one of these are extracted
27
+ branch: str = "main" # Branch whose tarball is downloaded
19
28
 
20
29
 
21
30
  @dataclass
@@ -24,10 +33,11 @@ class CorpusConfig:
24
33
 
25
34
  name: CorpusName
26
35
  description: str
27
- source_type: Literal["nvidia_md", "github_repo"]
36
+ source_type: Literal["nvidia_md", "github_repo", "github_multi_repo"]
28
37
  urls: list[str] | None = None
29
38
  repo: str | None = None
30
39
  repo_paths: list[str] | None = None
40
+ repos: list[RepoSource] | None = None # For multi-repo corpora
31
41
 
32
42
 
33
43
  CORPORA: dict[CorpusName, CorpusConfig] = {
@@ -69,6 +79,74 @@ CORPORA: dict[CorpusName, CorpusConfig] = {
69
79
  repo="ROCm/HIP",
70
80
  repo_paths=["docs"],
71
81
  ),
82
+ "amd": CorpusConfig(
83
+ name="amd",
84
+ description="AMD GPU kernel development (rocWMMA, CK, AITER, rocBLAS, HipKittens, vLLM)",
85
+ source_type="github_multi_repo",
86
+ repos=[
87
+ # rocWMMA - wave matrix multiply-accumulate (WMMA) intrinsics
88
+ RepoSource(
89
+ repo="ROCm/rocWMMA",
90
+ paths=["docs", "samples", "library/include"],
91
+ branch="develop",
92
+ ),
93
+ # Composable Kernel - tile-based GPU programming
94
+ RepoSource(
95
+ repo="ROCm/composable_kernel",
96
+ paths=["docs", "example", "tutorial", "include/ck_tile"],
97
+ branch="develop",
98
+ ),
99
+ # AITER - AMD inference tensor runtime
100
+ RepoSource(
101
+ repo="ROCm/aiter",
102
+ paths=["docs", "aiter/ops"],
103
+ ),
104
+ # MIOpen - deep learning primitives (deprecated, use rocm-libraries)
105
+ RepoSource(
106
+ repo="ROCm/MIOpen",
107
+ paths=["docs"],
108
+ branch="develop_deprecated",
109
+ ),
110
+ # rocBLAS - BLAS library (deprecated, use rocm-libraries)
111
+ RepoSource(
112
+ repo="ROCm/rocBLAS",
113
+ paths=["docs"],
114
+ branch="develop_deprecated",
115
+ ),
116
+ # hipBLASLt - lightweight BLAS (deprecated, use rocm-libraries)
117
+ RepoSource(
118
+ repo="ROCm/hipBLASLt",
119
+ paths=["docs"],
120
+ branch="develop_deprecated",
121
+ ),
122
+ # Tensile - GEMM code generator (deprecated, use rocm-libraries)
123
+ RepoSource(
124
+ repo="ROCm/Tensile",
125
+ paths=["docs"],
126
+ branch="develop_deprecated",
127
+ ),
128
+ # HipKittens - high-performance AMD kernels
129
+ RepoSource(
130
+ repo="HazyResearch/HipKittens",
131
+ paths=["docs", "kernels", "include"],
132
+ ),
133
+ # vLLM AMD kernels
134
+ RepoSource(
135
+ repo="vllm-project/vllm",
136
+ paths=["csrc/rocm"],
137
+ ),
138
+ # SGLang AMD kernels
139
+ RepoSource(
140
+ repo="sgl-project/sglang",
141
+ paths=["3rdparty/amd"],
142
+ ),
143
+ # HuggingFace ROCm kernels
144
+ RepoSource(
145
+ repo="huggingface/hf-rocm-kernels",
146
+ paths=["csrc", "hf_rocm_kernels", "docs"],
147
+ ),
148
+ ],
149
+ ),
72
150
  }
73
151
 
74
152
 
@@ -113,41 +191,87 @@ def _download_nvidia_md(config: CorpusConfig, dest: Path, verbose: bool = True)
113
191
  return downloaded
114
192
 
115
193
 
194
+ def _extract_matching_files(
195
+ tar: tarfile.TarFile,
196
+ repo_paths: list[str],
197
+ dest: Path,
198
+ verbose: bool,
199
+ ) -> int:
200
+ """Extract files matching repo_paths from tarball."""
201
+ downloaded = 0
202
+ for member in tar.getmembers():
203
+ if not member.isfile():
204
+ continue
205
+ rel_path = "/".join(member.name.split("/")[1:])
206
+ if not any(rel_path.startswith(rp) for rp in repo_paths):
207
+ continue
208
+ target = dest / rel_path
209
+ target.parent.mkdir(parents=True, exist_ok=True)
210
+ src = tar.extractfile(member)
211
+ if src:
212
+ target.write_bytes(src.read())
213
+ downloaded += 1
214
+ if verbose:
215
+ print(f" ✓ {rel_path}")
216
+ return downloaded
217
+
218
+
219
def _download_single_github_repo(
    client: httpx.Client,
    repo: str,
    repo_paths: list[str],
    dest: Path,
    branch: str = "main",
    verbose: bool = True,
) -> int:
    """Download one GitHub repo tarball and extract the requested paths.

    Args:
        client: HTTP client used for the request.
        repo: GitHub ``owner/name`` slug.
        repo_paths: Repo-relative paths to keep from the archive.
        dest: Directory the selected files are written under.
        branch: Branch whose tarball is requested.
        verbose: Print progress lines.

    Returns:
        Number of files extracted.

    Raises:
        httpx.HTTPStatusError: If GitHub answers with an error status.
    """
    import io

    tarball_url = f"https://api.github.com/repos/{repo}/tarball/{branch}"
    if verbose:
        print(f" Fetching {repo}...")
    resp = client.get(tarball_url)
    resp.raise_for_status()
    # The body is already fully in memory, so read the archive straight from
    # it. This avoids a delete=False temp file that was left on disk whenever
    # writing it failed part-way.
    with tarfile.open(fileobj=io.BytesIO(resp.content), mode="r:gz") as tar:
        return _extract_matching_files(tar, repo_paths, dest, verbose)
241
+
242
+
116
243
  def _download_github_repo(config: CorpusConfig, dest: Path, verbose: bool = True) -> int:
117
244
  """Download specific paths from GitHub repo."""
118
245
  assert config.repo is not None
119
246
  assert config.repo_paths is not None
120
- downloaded = 0
121
247
  with httpx.Client(timeout=60.0, follow_redirects=True) as client:
122
- tarball_url = f"https://api.github.com/repos/{config.repo}/tarball/main"
123
- if verbose:
124
- print(f" Fetching {config.repo}...")
125
- resp = client.get(tarball_url)
126
- resp.raise_for_status()
127
- with tempfile.NamedTemporaryFile(suffix=".tar.gz", delete=False) as tmp:
128
- tmp.write(resp.content)
129
- tmp_path = Path(tmp.name)
130
- try:
131
- with tarfile.open(tmp_path, "r:gz") as tar:
132
- members = tar.getmembers()
133
- root_prefix = members[0].name.split("/")[0] if members else ""
134
- for member in members:
135
- if not member.isfile():
136
- continue
137
- rel_path = "/".join(member.name.split("/")[1:])
138
- for repo_path in config.repo_paths:
139
- if rel_path.startswith(repo_path):
140
- target = dest / rel_path
141
- target.parent.mkdir(parents=True, exist_ok=True)
142
- with tar.extractfile(member) as src:
143
- if src:
144
- target.write_bytes(src.read())
145
- downloaded += 1
146
- if verbose:
147
- print(f" ✓ {rel_path}")
148
- break
149
- finally:
150
- tmp_path.unlink()
248
+ return _download_single_github_repo(
249
+ client, config.repo, config.repo_paths, dest, verbose=verbose
250
+ )
251
+
252
+
253
def _download_github_multi_repo(config: CorpusConfig, dest: Path, verbose: bool = True) -> int:
    """Fetch every repo listed in ``config.repos`` into its own folder under *dest*.

    Each repo is extracted to ``dest/<repo name>``. A repo whose download
    raises ``httpx.HTTPError`` is skipped (and reported when *verbose*), so
    one failing repo does not abort the rest.

    Returns:
        Total number of files extracted across all repos.
    """
    assert config.repos is not None
    total = 0
    with httpx.Client(timeout=120.0, follow_redirects=True) as client:
        for source in config.repos:
            # Use the part after the owner as the sub-directory name.
            target_dir = dest / source.repo.split("/")[-1]
            target_dir.mkdir(parents=True, exist_ok=True)
            try:
                total += _download_single_github_repo(
                    client,
                    source.repo,
                    source.paths,
                    target_dir,
                    branch=source.branch,
                    verbose=verbose,
                )
            except httpx.HTTPError as err:
                if not verbose:
                    continue
                print(f" ✗ {source.repo}: {err}")
    return total
152
276
 
153
277
 
@@ -185,6 +309,8 @@ def download_corpus(name: CorpusName, force: bool = False, verbose: bool = True)
185
309
  count = _download_nvidia_md(config, dest, verbose)
186
310
  elif config.source_type == "github_repo":
187
311
  count = _download_github_repo(config, dest, verbose)
312
+ elif config.source_type == "github_multi_repo":
313
+ count = _download_github_multi_repo(config, dest, verbose)
188
314
  else:
189
315
  raise ValueError(f"Unknown source type: {config.source_type}")
190
316
  if verbose:
wafer/evaluate.py CHANGED
@@ -22,6 +22,30 @@ from wafer_core.utils.kernel_utils.targets.config import (
22
22
  )
23
23
 
24
24
 
25
+ # Map AMD compute capability to ROCm architecture
26
+ # Used to set PYTORCH_ROCM_ARCH for faster compilation (compile only for target arch)
27
+ AMD_CC_TO_ARCH = {
28
+ "9.4": "gfx942", # MI300X
29
+ "9.0a": "gfx90a", # MI200 series
30
+ "9.08": "gfx908", # MI100
31
+ "9.06": "gfx906", # MI50/60
32
+ "10.30": "gfx1030", # RDNA2
33
+ "11.0": "gfx1100", # RDNA3
34
+ }
35
+
36
+
37
+ def _get_rocm_arch(compute_capability: str) -> str | None:
38
+ """Get ROCm architecture string from compute capability.
39
+
40
+ Returns gfx* string for PYTORCH_ROCM_ARCH, or None if not found.
41
+ """
42
+ # Already a gfx string
43
+ if compute_capability.startswith("gfx"):
44
+ return compute_capability
45
+ # Map from numeric CC
46
+ return AMD_CC_TO_ARCH.get(compute_capability)
47
+
48
+
25
49
  def _build_docker_run_command(
26
50
  image: str,
27
51
  command: str,
@@ -162,6 +186,7 @@ class KernelBenchEvaluateArgs:
162
186
  inputs: Path | None = None # Custom inputs file to override get_inputs()
163
187
  seed: int = 42 # Random seed for reproducibility
164
188
  defensive: bool = False
189
+ backend: str | None = None # Kernel backend for static validation
165
190
  sync_artifacts: bool = True
166
191
  gpu_id: int | None = None
167
192
 
@@ -2743,6 +2768,17 @@ import torch
2743
2768
  import torch.nn as nn
2744
2769
  from pathlib import Path
2745
2770
 
2771
+ # Use a unique per-run PyTorch extension cache directory to ensure fresh compilation.
2772
+ # This prevents stale cached extensions from being loaded when the pod is reused.
2773
+ # Without this, if a kernel is modified but uses the same extension name,
2774
+ # PyTorch would load the old cached .so instead of recompiling.
2775
+ # We use a UUID-based directory instead of clearing the cache to avoid race conditions
2776
+ # with other processes that might be using the cache.
2777
+ import uuid
2778
+ unique_cache_dir = f"/tmp/torch_extensions_{uuid.uuid4().hex[:8]}"
2779
+ os.environ["TORCH_EXTENSIONS_DIR"] = unique_cache_dir
2780
+ print(f"[KernelBench] Using unique extension cache: {unique_cache_dir}")
2781
+
2746
2782
  # Clear any stale GPU memory from previous runs at startup
2747
2783
  # NOTE: empty_cache only frees memory from THIS process's PyTorch allocator.
2748
2784
  # It won't free memory from dead/zombie processes - rocm-smi --showpids can show
@@ -3376,6 +3412,27 @@ def _validate_kernelbench_files(args: KernelBenchEvaluateArgs) -> str | None:
3376
3412
  " KernelBench format requires: 'class Model', 'get_inputs()', 'get_init_inputs()'"
3377
3413
  )
3378
3414
 
3415
+ # Static kernel validation if backend specified
3416
+ if args.backend:
3417
+ from wafer_core.utils.kernel_utils.static_checker import validate_kernel_static
3418
+
3419
+ code = args.implementation.read_text()
3420
+ valid, errors, warnings = validate_kernel_static(code, backend=args.backend)
3421
+
3422
+ # Print warnings (don't fail)
3423
+ for warning in warnings:
3424
+ logger.warning(f"Static check warning: {warning}")
3425
+
3426
+ # Fail on errors
3427
+ if not valid:
3428
+ error_list = "\n - ".join(errors)
3429
+ return (
3430
+ f"Static kernel validation failed for backend '{args.backend}':\n"
3431
+ f" - {error_list}\n\n"
3432
+ f"The implementation must use {args.backend.upper()} kernel primitives.\n"
3433
+ "See KernelBench documentation for valid kernel patterns."
3434
+ )
3435
+
3379
3436
  return None
3380
3437
 
3381
3438
 
@@ -3819,14 +3876,20 @@ async def run_evaluate_kernelbench_digitalocean(
3819
3876
  full_cmd = f"cd {container_run_path} && {eval_cmd}"
3820
3877
 
3821
3878
  # Build Docker command for AMD
3879
+ # PYTORCH_ROCM_ARCH: compile only for target arch (5-7x faster compile)
3880
+ rocm_arch = _get_rocm_arch(target.compute_capability)
3881
+ env_dict = {
3882
+ "HIP_VISIBLE_DEVICES": str(gpu_id),
3883
+ "PYTHONUNBUFFERED": "1",
3884
+ }
3885
+ if rocm_arch:
3886
+ env_dict["PYTORCH_ROCM_ARCH"] = rocm_arch
3887
+
3822
3888
  docker_cmd = _build_docker_run_command_amd(
3823
3889
  image=docker_image,
3824
3890
  command=full_cmd,
3825
3891
  working_dir=container_run_path,
3826
- env={
3827
- "HIP_VISIBLE_DEVICES": str(gpu_id),
3828
- "PYTHONUNBUFFERED": "1",
3829
- },
3892
+ env=env_dict,
3830
3893
  volumes={workspace_path: CONTAINER_WORKSPACE},
3831
3894
  )
3832
3895
 
@@ -4065,7 +4128,10 @@ async def run_evaluate_kernelbench_runpod(
4065
4128
  eval_cmd = " ".join(python_cmd_parts)
4066
4129
 
4067
4130
  # Set environment for AMD GPU and run
4068
- env_vars = f"HIP_VISIBLE_DEVICES={gpu_id} ROCM_PATH=/opt/rocm PYTHONUNBUFFERED=1"
4131
+ # PYTORCH_ROCM_ARCH: compile only for target arch (5-7x faster compile)
4132
+ rocm_arch = _get_rocm_arch(target.compute_capability)
4133
+ arch_env = f"PYTORCH_ROCM_ARCH={rocm_arch}" if rocm_arch else ""
4134
+ env_vars = f"HIP_VISIBLE_DEVICES={gpu_id} ROCM_PATH=/opt/rocm PYTHONUNBUFFERED=1 {arch_env}"
4069
4135
  full_cmd = f"cd {run_path} && {env_vars} {eval_cmd}"
4070
4136
 
4071
4137
  # Run and stream output
@@ -4299,7 +4365,10 @@ async def run_evaluate_kernelbench_baremetal_amd(
4299
4365
  eval_cmd = " ".join(python_cmd_parts)
4300
4366
 
4301
4367
  # Set environment for AMD GPU and run
4302
- env_vars = f"HIP_VISIBLE_DEVICES={gpu_id} ROCM_PATH=/opt/rocm PYTHONUNBUFFERED=1"
4368
+ # PYTORCH_ROCM_ARCH: compile only for target arch (5-7x faster compile)
4369
+ rocm_arch = _get_rocm_arch(target.compute_capability)
4370
+ arch_env = f"PYTORCH_ROCM_ARCH={rocm_arch}" if rocm_arch else ""
4371
+ env_vars = f"HIP_VISIBLE_DEVICES={gpu_id} ROCM_PATH=/opt/rocm PYTHONUNBUFFERED=1 {arch_env}"
4303
4372
  full_cmd = f"cd {run_path} && {env_vars} {eval_cmd}"
4304
4373
 
4305
4374
  # Run and stream output
wafer/kernel_scope.py CHANGED
@@ -1,12 +1,13 @@
1
- """Kernel Scope - CLI for static ISA analysis of Triton kernels.
1
+ """Unified ISA Analyzer - CLI for static ISA analysis of AMD GPU kernels.
2
2
 
3
- This module provides the CLI wrapper for the `wafer amd kernel-scope` command.
3
+ This module provides the CLI wrapper for the `wafer amd isa` command.
4
4
  It supports analysis of:
5
- - AMDGCN ISA files (.s, .gcn, .asm)
6
- - LLVM-IR files (.ll)
7
- - TTGIR files (.ttgir, .ttir, .mlir)
5
+ - AMD GPU code objects (.co) - Via API server with ROCm tools
6
+ - AMDGCN ISA files (.s, .gcn, .asm) - Local parsing
7
+ - LLVM-IR files (.ll) - Local parsing
8
+ - TTGIR files (.ttgir, .ttir, .mlir) - Local parsing
8
9
 
9
- Design: Wafer-436 - AMD Kernel Scope
10
+ Design: Wafer-436 - AMD Kernel Scope / ISA Analyzer
10
11
  """
11
12
 
12
13
  import json
@@ -17,13 +18,19 @@ from typing import Optional
17
18
 
18
19
  def print_usage() -> None:
19
20
  """Print CLI usage information."""
20
- print("Usage: wafer amd kernel-scope <subcommand> [options]", file=sys.stderr)
21
+ print("Usage: wafer amd isa <subcommand> [options]", file=sys.stderr)
21
22
  print("", file=sys.stderr)
22
23
  print("Subcommands:", file=sys.stderr)
23
- print(" analyze <file|directory> Analyze ISA/LLVM-IR/TTGIR files", file=sys.stderr)
24
+ print(" analyze <file|directory> Analyze ISA files (.co, .s, .ll, .ttgir)", file=sys.stderr)
24
25
  print(" metrics List available metrics", file=sys.stderr)
25
26
  print(" targets List supported GPU targets", file=sys.stderr)
26
27
  print("", file=sys.stderr)
28
+ print("Supported File Types:", file=sys.stderr)
29
+ print(" .co AMD GPU code objects (requires API authentication)", file=sys.stderr)
30
+ print(" .s, .gcn, .asm AMDGCN ISA assembly (local parsing)", file=sys.stderr)
31
+ print(" .ll, .bc LLVM-IR (local parsing)", file=sys.stderr)
32
+ print(" .ttgir, .ttir, .mlir TTGIR / Triton IR (local parsing)", file=sys.stderr)
33
+ print("", file=sys.stderr)
27
34
  print("Analyze Options:", file=sys.stderr)
28
35
  print(" --json Output as JSON", file=sys.stderr)
29
36
  print(" --csv Output as CSV", file=sys.stderr)
@@ -33,12 +40,13 @@ def print_usage() -> None:
33
40
  print(" --kernel INDEX Kernel index if multiple in file", file=sys.stderr)
34
41
  print("", file=sys.stderr)
35
42
  print("Examples:", file=sys.stderr)
36
- print(" wafer amd kernel-scope analyze kernel.s", file=sys.stderr)
37
- print(" wafer amd kernel-scope analyze kernel.s --json", file=sys.stderr)
38
- print(" wafer amd kernel-scope analyze ~/.triton/cache/ --filter 'spills > 0'", file=sys.stderr)
39
- print(" wafer amd kernel-scope analyze . -r --csv -o metrics.csv", file=sys.stderr)
40
- print(" wafer amd kernel-scope metrics", file=sys.stderr)
41
- print(" wafer amd kernel-scope targets", file=sys.stderr)
43
+ print(" wafer amd isa analyze kernel.co # Analyze code object (requires login)", file=sys.stderr)
44
+ print(" wafer amd isa analyze kernel.s # Analyze ISA assembly", file=sys.stderr)
45
+ print(" wafer amd isa analyze kernel.s --json # Output as JSON", file=sys.stderr)
46
+ print(" wafer amd isa analyze ~/.triton/cache/ --filter 'spills > 0'", file=sys.stderr)
47
+ print(" wafer amd isa analyze . -r --csv -o metrics.csv", file=sys.stderr)
48
+ print(" wafer amd isa metrics # List available metrics", file=sys.stderr)
49
+ print(" wafer amd isa targets # List supported GPU targets", file=sys.stderr)
42
50
 
43
51
 
44
52
  def analyze_command(
@@ -49,8 +57,10 @@ def analyze_command(
49
57
  filter_expr: Optional[str] = None,
50
58
  output_file: Optional[str] = None,
51
59
  kernel_index: int = 0,
60
+ api_url: Optional[str] = None,
61
+ auth_headers: Optional[dict[str, str]] = None,
52
62
  ) -> str:
53
- """Analyze ISA/LLVM-IR/TTGIR file or directory.
63
+ """Analyze ISA/LLVM-IR/TTGIR/.co file or directory.
54
64
 
55
65
  Args:
56
66
  path: Path to file or directory
@@ -60,12 +70,15 @@ def analyze_command(
60
70
  filter_expr: Filter expression (e.g., "spills > 0")
61
71
  output_file: Write output to file
62
72
  kernel_index: Kernel index for multi-kernel files
73
+ api_url: API URL for .co file analysis (required for .co files)
74
+ auth_headers: Auth headers for .co file analysis
63
75
 
64
76
  Returns:
65
77
  Analysis output string
66
78
  """
67
79
  from wafer_core.lib.kernel_scope import (
68
80
  analyze_isa_file,
81
+ analyze_code_object,
69
82
  analyze_directory,
70
83
  analyze_file,
71
84
  )
@@ -79,11 +92,19 @@ def analyze_command(
79
92
  if target_path.is_file():
80
93
  suffix = target_path.suffix.lower()
81
94
 
82
- # For ISA files, use kernel_index parameter
83
- if suffix in (".s", ".gcn", ".asm"):
95
+ # Code object files (.co) - need API
96
+ if suffix == ".co":
97
+ if not api_url or not auth_headers:
98
+ raise RuntimeError(
99
+ "API authentication required for .co file analysis. "
100
+ "Run 'wafer login' first."
101
+ )
102
+ result = analyze_code_object(target_path, api_url, auth_headers)
103
+ # ISA files - use kernel_index parameter
104
+ elif suffix in (".s", ".gcn", ".asm"):
84
105
  result = analyze_isa_file(target_path, kernel_index=kernel_index)
85
106
  else:
86
- result = analyze_file(target_path)
107
+ result = analyze_file(target_path, api_url=api_url, auth_headers=auth_headers)
87
108
 
88
109
  if not result.success:
89
110
  raise RuntimeError(f"Analysis failed: {result.error}")
@@ -92,7 +113,12 @@ def analyze_command(
92
113
 
93
114
  # Directory analysis
94
115
  else:
95
- batch_result = analyze_directory(target_path, recursive=recursive)
116
+ batch_result = analyze_directory(
117
+ target_path,
118
+ recursive=recursive,
119
+ api_url=api_url,
120
+ auth_headers=auth_headers,
121
+ )
96
122
 
97
123
  # Apply filter if specified
98
124
  if filter_expr:
@@ -217,11 +243,53 @@ def _result_to_text(result) -> str:
217
243
  """Format single result as human-readable text."""
218
244
  lines = []
219
245
 
220
- if result.isa_analysis:
246
+ if result.code_object_analysis:
247
+ # .co file analysis (via API)
248
+ a = result.code_object_analysis
249
+ lines.extend([
250
+ f"Kernel: {a.kernel_name}",
251
+ f"Architecture: {a.architecture}",
252
+ f"Source: Code Object (.co)",
253
+ "",
254
+ "=== Registers ===",
255
+ f" VGPRs: {a.vgpr_count}",
256
+ f" SGPRs: {a.sgpr_count}",
257
+ f" AGPRs: {a.agpr_count}",
258
+ ])
259
+
260
+ if a.vgpr_spill_count > 0 or a.sgpr_spill_count > 0:
261
+ lines.extend([
262
+ "",
263
+ "!!! SPILLS DETECTED !!!",
264
+ f" VGPR spills: {a.vgpr_spill_count}",
265
+ f" SGPR spills: {a.sgpr_spill_count}",
266
+ ])
267
+ else:
268
+ lines.append(" Spills: None (good)")
269
+
270
+ lines.extend([
271
+ "",
272
+ "=== Memory ===",
273
+ f" LDS: {a.lds_bytes} bytes",
274
+ f" Global loads: {a.global_loads}",
275
+ f" Global stores: {a.global_stores}",
276
+ f" LDS ops: {a.lds_ops}",
277
+ "",
278
+ "=== Instructions ===",
279
+ f" MFMA: {a.mfma_count}",
280
+ f" FMA: {a.fma_count}",
281
+ f" Packed (v_pk_*): {a.packed_ops_count}",
282
+ f" Full stalls (waitcnt 0): {a.waitcnt_full_stalls}",
283
+ f" Barriers: {a.barriers}",
284
+ ])
285
+
286
+ elif result.isa_analysis:
287
+ # .s/.gcn/.asm file analysis (local parsing)
221
288
  a = result.isa_analysis
222
289
  lines.extend([
223
290
  f"Kernel: {a.kernel_name}",
224
291
  f"Architecture: {a.architecture}",
292
+ f"Source: ISA Assembly (.s)",
225
293
  "",
226
294
  "=== Registers ===",
227
295
  f" VGPRs: {a.vgpr_count}",
@@ -330,10 +398,16 @@ def _result_to_text(result) -> str:
330
398
 
331
399
  def _result_to_csv(result) -> str:
332
400
  """Format single result as CSV."""
401
+ header = "kernel_name,architecture,source_type,vgpr_count,sgpr_count,vgpr_spills,sgpr_spills,mfma_count,lds_bytes,global_loads,global_stores"
402
+
403
+ if result.code_object_analysis:
404
+ a = result.code_object_analysis
405
+ row = f"{a.kernel_name},{a.architecture},code_object,{a.vgpr_count},{a.sgpr_count},{a.vgpr_spill_count},{a.sgpr_spill_count},{a.mfma_count},{a.lds_bytes},{a.global_loads},{a.global_stores}"
406
+ return f"{header}\n{row}"
407
+
333
408
  if result.isa_analysis:
334
409
  a = result.isa_analysis
335
- header = "kernel_name,architecture,vgpr_count,sgpr_count,spill_count,mfma_count,mfma_density_pct,occupancy"
336
- row = f"{a.kernel_name},{a.architecture},{a.vgpr_count},{a.sgpr_count},{a.spill_count},{a.mfma_count},{a.mfma_density_pct:.2f},{a.theoretical_occupancy}"
410
+ row = f"{a.kernel_name},{a.architecture},isa_assembly,{a.vgpr_count},{a.sgpr_count},{a.vgpr_spill_count},{a.sgpr_spill_count},{a.mfma_count},{a.lds_size},{a.global_load_count},{a.global_store_count}"
337
411
  return f"{header}\n{row}"
338
412
 
339
413
  return "# Unsupported format for CSV"
@@ -362,7 +436,15 @@ def _batch_to_text(batch_result) -> str:
362
436
 
363
437
  # Show individual results
364
438
  for result in batch_result.results:
365
- if result.success and result.isa_analysis:
439
+ if result.success and result.code_object_analysis:
440
+ a = result.code_object_analysis
441
+ spills = a.vgpr_spill_count + a.sgpr_spill_count
442
+ status = "⚠️" if spills > 0 else "✓"
443
+ lines.append(
444
+ f" {status} {result.file_path}: "
445
+ f"VGPRs={a.vgpr_count}, spills={spills}, MFMA={a.mfma_count}"
446
+ )
447
+ elif result.success and result.isa_analysis:
366
448
  a = result.isa_analysis
367
449
  status = "⚠️" if a.spill_count > 0 else "✓"
368
450
  lines.append(
@@ -377,15 +459,22 @@ def _batch_to_text(batch_result) -> str:
377
459
 
378
460
  def _batch_to_csv(batch_result) -> str:
379
461
  """Format batch results as CSV."""
380
- lines = ["file_path,kernel_name,architecture,vgpr_count,sgpr_count,spill_count,mfma_count,mfma_density_pct,occupancy"]
462
+ lines = ["file_path,kernel_name,architecture,source_type,vgpr_count,sgpr_count,vgpr_spills,sgpr_spills,mfma_count,lds_bytes"]
381
463
 
382
464
  for result in batch_result.results:
383
- if result.success and result.isa_analysis:
465
+ if result.success and result.code_object_analysis:
466
+ a = result.code_object_analysis
467
+ lines.append(
468
+ f"{result.file_path},{a.kernel_name},{a.architecture},code_object,"
469
+ f"{a.vgpr_count},{a.sgpr_count},{a.vgpr_spill_count},{a.sgpr_spill_count},"
470
+ f"{a.mfma_count},{a.lds_bytes}"
471
+ )
472
+ elif result.success and result.isa_analysis:
384
473
  a = result.isa_analysis
385
474
  lines.append(
386
- f"{result.file_path},{a.kernel_name},{a.architecture},"
387
- f"{a.vgpr_count},{a.sgpr_count},{a.spill_count},"
388
- f"{a.mfma_count},{a.mfma_density_pct:.2f},{a.theoretical_occupancy}"
475
+ f"{result.file_path},{a.kernel_name},{a.architecture},isa_assembly,"
476
+ f"{a.vgpr_count},{a.sgpr_count},{a.vgpr_spill_count},{a.sgpr_spill_count},"
477
+ f"{a.mfma_count},{a.lds_size}"
389
478
  )
390
479
 
391
480
  return "\n".join(lines)
@@ -416,12 +505,24 @@ def _apply_filter(batch_result, filter_expr: str):
416
505
  }
417
506
  metric = metric_map.get(metric, metric)
418
507
 
419
- # Filter function
508
+ # Filter function - supports both isa_analysis and code_object_analysis
420
509
  def passes_filter(result):
421
- if not result.success or not result.isa_analysis:
510
+ if not result.success:
422
511
  return False
423
512
 
424
- actual = getattr(result.isa_analysis, metric, None)
513
+ # Try to get metric from either analysis type
514
+ actual = None
515
+ if result.isa_analysis:
516
+ actual = getattr(result.isa_analysis, metric, None)
517
+ elif result.code_object_analysis:
518
+ # Map isa_analysis metric names to code_object_analysis equivalents
519
+ co_metric_map = {
520
+ "spill_count": "vgpr_spill_count", # Use vgpr_spill_count as proxy
521
+ "lds_size": "lds_bytes",
522
+ }
523
+ co_metric = co_metric_map.get(metric, metric)
524
+ actual = getattr(result.code_object_analysis, co_metric, None)
525
+
425
526
  if actual is None:
426
527
  return False
427
528