PyPI - wafer-cli - Versions diffs - 0.2.9__py3-none-any.whl → 0.2.11__py3-none-any.whl - Mend

wafer-cli 0.2.9__py3-none-any.whl → 0.2.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

wafer/GUIDE.md +18 -7
wafer/api_client.py +4 -0
wafer/cli.py +1177 -278
wafer/corpus.py +158 -32
wafer/evaluate.py +75 -6
wafer/kernel_scope.py +132 -31
wafer/nsys_analyze.py +903 -73
wafer/nsys_profile.py +511 -0
wafer/output.py +241 -0
wafer/skills/wafer-guide/SKILL.md +13 -0
wafer/ssh_keys.py +261 -0
wafer/targets_ops.py +718 -0
wafer/wevin_cli.py +127 -18
wafer/workspaces.py +232 -184
{wafer_cli-0.2.9.dist-info → wafer_cli-0.2.11.dist-info}/METADATA +1 -1
{wafer_cli-0.2.9.dist-info → wafer_cli-0.2.11.dist-info}/RECORD +19 -15
{wafer_cli-0.2.9.dist-info → wafer_cli-0.2.11.dist-info}/WHEEL +0 -0
{wafer_cli-0.2.9.dist-info → wafer_cli-0.2.11.dist-info}/entry_points.txt +0 -0
{wafer_cli-0.2.9.dist-info → wafer_cli-0.2.11.dist-info}/top_level.txt +0 -0

wafer/corpus.py CHANGED Viewed

@@ -15,7 +15,16 @@ import httpx
 CACHE_DIR = Path.home() / ".cache" / "wafer" / "corpora"
-CorpusName = Literal["cuda", "cutlass", "hip"]
+CorpusName = Literal["cuda", "cutlass", "hip", "amd"]
+@dataclass
+class RepoSource:
+    """A single GitHub repo source within a corpus."""
+    repo: str
+    paths: list[str]
+    branch: str = "main"
 @dataclass
@@ -24,10 +33,11 @@ class CorpusConfig:
     name: CorpusName
     description: str
-    source_type: Literal["nvidia_md", "github_repo"]
+    source_type: Literal["nvidia_md", "github_repo", "github_multi_repo"]
     urls: list[str] | None = None
     repo: str | None = None
     repo_paths: list[str] | None = None
+    repos: list[RepoSource] | None = None  # For multi-repo corpora
 CORPORA: dict[CorpusName, CorpusConfig] = {
@@ -69,6 +79,74 @@ CORPORA: dict[CorpusName, CorpusConfig] = {
         repo="ROCm/HIP",
         repo_paths=["docs"],
     ),
+    "amd": CorpusConfig(
+        name="amd",
+        description="AMD GPU kernel development (rocWMMA, CK, AITER, rocBLAS, HipKittens, vLLM)",
+        source_type="github_multi_repo",
+        repos=[
+            # rocWMMA - wave matrix multiply-accumulate (WMMA) intrinsics
+            RepoSource(
+                repo="ROCm/rocWMMA",
+                paths=["docs", "samples", "library/include"],
+                branch="develop",
+            ),
+            # Composable Kernel - tile-based GPU programming
+            RepoSource(
+                repo="ROCm/composable_kernel",
+                paths=["docs", "example", "tutorial", "include/ck_tile"],
+                branch="develop",
+            ),
+            # AITER - AMD inference tensor runtime
+            RepoSource(
+                repo="ROCm/aiter",
+                paths=["docs", "aiter/ops"],
+            ),
+            # MIOpen - deep learning primitives (deprecated, use rocm-libraries)
+            RepoSource(
+                repo="ROCm/MIOpen",
+                paths=["docs"],
+                branch="develop_deprecated",
+            ),
+            # rocBLAS - BLAS library (deprecated, use rocm-libraries)
+            RepoSource(
+                repo="ROCm/rocBLAS",
+                paths=["docs"],
+                branch="develop_deprecated",
+            ),
+            # hipBLASLt - lightweight BLAS (deprecated, use rocm-libraries)
+            RepoSource(
+                repo="ROCm/hipBLASLt",
+                paths=["docs"],
+                branch="develop_deprecated",
+            ),
+            # Tensile - GEMM code generator (deprecated, use rocm-libraries)
+            RepoSource(
+                repo="ROCm/Tensile",
+                paths=["docs"],
+                branch="develop_deprecated",
+            ),
+            # HipKittens - high-performance AMD kernels
+            RepoSource(
+                repo="HazyResearch/HipKittens",
+                paths=["docs", "kernels", "include"],
+            ),
+            # vLLM AMD kernels
+            RepoSource(
+                repo="vllm-project/vllm",
+                paths=["csrc/rocm"],
+            ),
+            # SGLang AMD kernels
+            RepoSource(
+                repo="sgl-project/sglang",
+                paths=["3rdparty/amd"],
+            ),
+            # HuggingFace ROCm kernels
+            RepoSource(
+                repo="huggingface/hf-rocm-kernels",
+                paths=["csrc", "hf_rocm_kernels", "docs"],
+            ),
+        ],
+    ),
 }
@@ -113,41 +191,87 @@ def _download_nvidia_md(config: CorpusConfig, dest: Path, verbose: bool = True)
     return downloaded
+def _extract_matching_files(
+    tar: tarfile.TarFile,
+    repo_paths: list[str],
+    dest: Path,
+    verbose: bool,
+) -> int:
+    """Extract files matching repo_paths from tarball."""
+    downloaded = 0
+    for member in tar.getmembers():
+        if not member.isfile():
+            continue
+        rel_path = "/".join(member.name.split("/")[1:])
+        if not any(rel_path.startswith(rp) for rp in repo_paths):
+            continue
+        target = dest / rel_path
+        target.parent.mkdir(parents=True, exist_ok=True)
+        src = tar.extractfile(member)
+        if src:
+            target.write_bytes(src.read())
+            downloaded += 1
+            if verbose:
+                print(f"  ✓ {rel_path}")
+    return downloaded
+def _download_single_github_repo(
+    client: httpx.Client,
+    repo: str,
+    repo_paths: list[str],
+    dest: Path,
+    branch: str = "main",
+    verbose: bool = True,
+) -> int:
+    """Download specific paths from a single GitHub repo."""
+    tarball_url = f"https://api.github.com/repos/{repo}/tarball/{branch}"
+    if verbose:
+        print(f"  Fetching {repo}...")
+    resp = client.get(tarball_url)
+    resp.raise_for_status()
+    with tempfile.NamedTemporaryFile(suffix=".tar.gz", delete=False) as tmp:
+        tmp.write(resp.content)
+        tmp_path = Path(tmp.name)
+    try:
+        with tarfile.open(tmp_path, "r:gz") as tar:
+            return _extract_matching_files(tar, repo_paths, dest, verbose)
+    finally:
+        tmp_path.unlink()
 def _download_github_repo(config: CorpusConfig, dest: Path, verbose: bool = True) -> int:
     """Download specific paths from GitHub repo."""
     assert config.repo is not None
     assert config.repo_paths is not None
-    downloaded = 0
     with httpx.Client(timeout=60.0, follow_redirects=True) as client:
-        tarball_url = f"https://api.github.com/repos/{config.repo}/tarball/main"
-        if verbose:
-            print(f"  Fetching {config.repo}...")
-        resp = client.get(tarball_url)
-        resp.raise_for_status()
-        with tempfile.NamedTemporaryFile(suffix=".tar.gz", delete=False) as tmp:
-            tmp.write(resp.content)
-            tmp_path = Path(tmp.name)
-        try:
-            with tarfile.open(tmp_path, "r:gz") as tar:
-                members = tar.getmembers()
-                root_prefix = members[0].name.split("/")[0] if members else ""
-                for member in members:
-                    if not member.isfile():
-                        continue
-                    rel_path = "/".join(member.name.split("/")[1:])
-                    for repo_path in config.repo_paths:
-                        if rel_path.startswith(repo_path):
-                            target = dest / rel_path
-                            target.parent.mkdir(parents=True, exist_ok=True)
-                            with tar.extractfile(member) as src:
-                                if src:
-                                    target.write_bytes(src.read())
-                                    downloaded += 1
-                                    if verbose:
-                                        print(f"  ✓ {rel_path}")
-                            break
-        finally:
-            tmp_path.unlink()
+        return _download_single_github_repo(
+            client, config.repo, config.repo_paths, dest, verbose=verbose
+        )
+def _download_github_multi_repo(config: CorpusConfig, dest: Path, verbose: bool = True) -> int:
+    """Download specific paths from multiple GitHub repos."""
+    assert config.repos is not None
+    downloaded = 0
+    with httpx.Client(timeout=120.0, follow_redirects=True) as client:
+        for repo_source in config.repos:
+            repo_name = repo_source.repo.split("/")[-1]
+            repo_dest = dest / repo_name
+            repo_dest.mkdir(parents=True, exist_ok=True)
+            try:
+                count = _download_single_github_repo(
+                    client,
+                    repo_source.repo,
+                    repo_source.paths,
+                    repo_dest,
+                    branch=repo_source.branch,
+                    verbose=verbose,
+                )
+                downloaded += count
+            except httpx.HTTPError as e:
+                if verbose:
+                    print(f"  ✗ {repo_source.repo}: {e}")
     return downloaded
@@ -185,6 +309,8 @@ def download_corpus(name: CorpusName, force: bool = False, verbose: bool = True)
         count = _download_nvidia_md(config, dest, verbose)
     elif config.source_type == "github_repo":
         count = _download_github_repo(config, dest, verbose)
+    elif config.source_type == "github_multi_repo":
+        count = _download_github_multi_repo(config, dest, verbose)
     else:
         raise ValueError(f"Unknown source type: {config.source_type}")
     if verbose:

wafer/evaluate.py CHANGED Viewed

@@ -22,6 +22,30 @@ from wafer_core.utils.kernel_utils.targets.config import (
 )
+# Map AMD compute capability to ROCm architecture
+# Used to set PYTORCH_ROCM_ARCH for faster compilation (compile only for target arch)
+AMD_CC_TO_ARCH = {
+    "9.4": "gfx942",  # MI300X
+    "9.0a": "gfx90a",  # MI200 series
+    "9.08": "gfx908",  # MI100
+    "9.06": "gfx906",  # MI50/60
+    "10.30": "gfx1030",  # RDNA2
+    "11.0": "gfx1100",  # RDNA3
+}
+def _get_rocm_arch(compute_capability: str) -> str | None:
+    """Get ROCm architecture string from compute capability.
+    Returns gfx* string for PYTORCH_ROCM_ARCH, or None if not found.
+    """
+    # Already a gfx string
+    if compute_capability.startswith("gfx"):
+        return compute_capability
+    # Map from numeric CC
+    return AMD_CC_TO_ARCH.get(compute_capability)
 def _build_docker_run_command(
     image: str,
     command: str,
@@ -162,6 +186,7 @@ class KernelBenchEvaluateArgs:
     inputs: Path | None = None  # Custom inputs file to override get_inputs()
     seed: int = 42  # Random seed for reproducibility
     defensive: bool = False
+    backend: str | None = None  # Kernel backend for static validation
     sync_artifacts: bool = True
     gpu_id: int | None = None
@@ -2743,6 +2768,17 @@ import torch
 import torch.nn as nn
 from pathlib import Path
+# Use a unique per-run PyTorch extension cache directory to ensure fresh compilation.
+# This prevents stale cached extensions from being loaded when the pod is reused.
+# Without this, if a kernel is modified but uses the same extension name,
+# PyTorch would load the old cached .so instead of recompiling.
+# We use a UUID-based directory instead of clearing the cache to avoid race conditions
+# with other processes that might be using the cache.
+import uuid
+unique_cache_dir = f"/tmp/torch_extensions_{uuid.uuid4().hex[:8]}"
+os.environ["TORCH_EXTENSIONS_DIR"] = unique_cache_dir
+print(f"[KernelBench] Using unique extension cache: {unique_cache_dir}")
 # Clear any stale GPU memory from previous runs at startup
 # NOTE: empty_cache only frees memory from THIS process's PyTorch allocator.
 # It won't free memory from dead/zombie processes - rocm-smi --showpids can show
@@ -3376,6 +3412,27 @@ def _validate_kernelbench_files(args: KernelBenchEvaluateArgs) -> str | None:
             "  KernelBench format requires: 'class Model', 'get_inputs()', 'get_init_inputs()'"
         )
+    # Static kernel validation if backend specified
+    if args.backend:
+        from wafer_core.utils.kernel_utils.static_checker import validate_kernel_static
+        code = args.implementation.read_text()
+        valid, errors, warnings = validate_kernel_static(code, backend=args.backend)
+        # Print warnings (don't fail)
+        for warning in warnings:
+            logger.warning(f"Static check warning: {warning}")
+        # Fail on errors
+        if not valid:
+            error_list = "\n  - ".join(errors)
+            return (
+                f"Static kernel validation failed for backend '{args.backend}':\n"
+                f"  - {error_list}\n\n"
+                f"The implementation must use {args.backend.upper()} kernel primitives.\n"
+                "See KernelBench documentation for valid kernel patterns."
+            )
     return None
@@ -3819,14 +3876,20 @@ async def run_evaluate_kernelbench_digitalocean(
                 full_cmd = f"cd {container_run_path} && {eval_cmd}"
                 # Build Docker command for AMD
+                # PYTORCH_ROCM_ARCH: compile only for target arch (5-7x faster compile)
+                rocm_arch = _get_rocm_arch(target.compute_capability)
+                env_dict = {
+                    "HIP_VISIBLE_DEVICES": str(gpu_id),
+                    "PYTHONUNBUFFERED": "1",
+                }
+                if rocm_arch:
+                    env_dict["PYTORCH_ROCM_ARCH"] = rocm_arch
                 docker_cmd = _build_docker_run_command_amd(
                     image=docker_image,
                     command=full_cmd,
                     working_dir=container_run_path,
-                    env={
-                        "HIP_VISIBLE_DEVICES": str(gpu_id),
-                        "PYTHONUNBUFFERED": "1",
-                    },
+                    env=env_dict,
                     volumes={workspace_path: CONTAINER_WORKSPACE},
                 )
@@ -4065,7 +4128,10 @@ async def run_evaluate_kernelbench_runpod(
                 eval_cmd = " ".join(python_cmd_parts)
                 # Set environment for AMD GPU and run
-                env_vars = f"HIP_VISIBLE_DEVICES={gpu_id} ROCM_PATH=/opt/rocm PYTHONUNBUFFERED=1"
+                # PYTORCH_ROCM_ARCH: compile only for target arch (5-7x faster compile)
+                rocm_arch = _get_rocm_arch(target.compute_capability)
+                arch_env = f"PYTORCH_ROCM_ARCH={rocm_arch}" if rocm_arch else ""
+                env_vars = f"HIP_VISIBLE_DEVICES={gpu_id} ROCM_PATH=/opt/rocm PYTHONUNBUFFERED=1 {arch_env}"
                 full_cmd = f"cd {run_path} && {env_vars} {eval_cmd}"
                 # Run and stream output
@@ -4299,7 +4365,10 @@ async def run_evaluate_kernelbench_baremetal_amd(
         eval_cmd = " ".join(python_cmd_parts)
         # Set environment for AMD GPU and run
-        env_vars = f"HIP_VISIBLE_DEVICES={gpu_id} ROCM_PATH=/opt/rocm PYTHONUNBUFFERED=1"
+        # PYTORCH_ROCM_ARCH: compile only for target arch (5-7x faster compile)
+        rocm_arch = _get_rocm_arch(target.compute_capability)
+        arch_env = f"PYTORCH_ROCM_ARCH={rocm_arch}" if rocm_arch else ""
+        env_vars = f"HIP_VISIBLE_DEVICES={gpu_id} ROCM_PATH=/opt/rocm PYTHONUNBUFFERED=1 {arch_env}"
         full_cmd = f"cd {run_path} && {env_vars} {eval_cmd}"
         # Run and stream output

wafer/kernel_scope.py CHANGED Viewed

@@ -1,12 +1,13 @@
-"""Kernel Scope - CLI for static ISA analysis of Triton kernels.
+"""Unified ISA Analyzer - CLI for static ISA analysis of AMD GPU kernels.
-This module provides the CLI wrapper for the `wafer amd kernel-scope` command.
+This module provides the CLI wrapper for the `wafer amd isa` command.
 It supports analysis of:
-- AMDGCN ISA files (.s, .gcn, .asm)
-- LLVM-IR files (.ll)
-- TTGIR files (.ttgir, .ttir, .mlir)
+- AMD GPU code objects (.co) - Via API server with ROCm tools
+- AMDGCN ISA files (.s, .gcn, .asm) - Local parsing
+- LLVM-IR files (.ll) - Local parsing
+- TTGIR files (.ttgir, .ttir, .mlir) - Local parsing
-Design: Wafer-436 - AMD Kernel Scope
+Design: Wafer-436 - AMD Kernel Scope / ISA Analyzer
 """
 import json
@@ -17,13 +18,19 @@ from typing import Optional
 def print_usage() -> None:
     """Print CLI usage information."""
-    print("Usage: wafer amd kernel-scope <subcommand> [options]", file=sys.stderr)
+    print("Usage: wafer amd isa <subcommand> [options]", file=sys.stderr)
     print("", file=sys.stderr)
     print("Subcommands:", file=sys.stderr)
-    print("  analyze <file|directory>   Analyze ISA/LLVM-IR/TTGIR files", file=sys.stderr)
+    print("  analyze <file|directory>   Analyze ISA files (.co, .s, .ll, .ttgir)", file=sys.stderr)
     print("  metrics                    List available metrics", file=sys.stderr)
     print("  targets                    List supported GPU targets", file=sys.stderr)
     print("", file=sys.stderr)
+    print("Supported File Types:", file=sys.stderr)
+    print("  .co                        AMD GPU code objects (requires API authentication)", file=sys.stderr)
+    print("  .s, .gcn, .asm             AMDGCN ISA assembly (local parsing)", file=sys.stderr)
+    print("  .ll, .bc                   LLVM-IR (local parsing)", file=sys.stderr)
+    print("  .ttgir, .ttir, .mlir       TTGIR / Triton IR (local parsing)", file=sys.stderr)
+    print("", file=sys.stderr)
     print("Analyze Options:", file=sys.stderr)
     print("  --json                     Output as JSON", file=sys.stderr)
     print("  --csv                      Output as CSV", file=sys.stderr)
@@ -33,12 +40,13 @@ def print_usage() -> None:
     print("  --kernel INDEX             Kernel index if multiple in file", file=sys.stderr)
     print("", file=sys.stderr)
     print("Examples:", file=sys.stderr)
-    print("  wafer amd kernel-scope analyze kernel.s", file=sys.stderr)
-    print("  wafer amd kernel-scope analyze kernel.s --json", file=sys.stderr)
-    print("  wafer amd kernel-scope analyze ~/.triton/cache/ --filter 'spills > 0'", file=sys.stderr)
-    print("  wafer amd kernel-scope analyze . -r --csv -o metrics.csv", file=sys.stderr)
-    print("  wafer amd kernel-scope metrics", file=sys.stderr)
-    print("  wafer amd kernel-scope targets", file=sys.stderr)
+    print("  wafer amd isa analyze kernel.co           # Analyze code object (requires login)", file=sys.stderr)
+    print("  wafer amd isa analyze kernel.s            # Analyze ISA assembly", file=sys.stderr)
+    print("  wafer amd isa analyze kernel.s --json     # Output as JSON", file=sys.stderr)
+    print("  wafer amd isa analyze ~/.triton/cache/ --filter 'spills > 0'", file=sys.stderr)
+    print("  wafer amd isa analyze . -r --csv -o metrics.csv", file=sys.stderr)
+    print("  wafer amd isa metrics                     # List available metrics", file=sys.stderr)
+    print("  wafer amd isa targets                     # List supported GPU targets", file=sys.stderr)
 def analyze_command(
@@ -49,8 +57,10 @@ def analyze_command(
     filter_expr: Optional[str] = None,
     output_file: Optional[str] = None,
     kernel_index: int = 0,
+    api_url: Optional[str] = None,
+    auth_headers: Optional[dict[str, str]] = None,
 ) -> str:
-    """Analyze ISA/LLVM-IR/TTGIR file or directory.
+    """Analyze ISA/LLVM-IR/TTGIR/.co file or directory.
     Args:
         path: Path to file or directory
@@ -60,12 +70,15 @@ def analyze_command(
         filter_expr: Filter expression (e.g., "spills > 0")
         output_file: Write output to file
         kernel_index: Kernel index for multi-kernel files
+        api_url: API URL for .co file analysis (required for .co files)
+        auth_headers: Auth headers for .co file analysis
     Returns:
         Analysis output string
     """
     from wafer_core.lib.kernel_scope import (
         analyze_isa_file,
+        analyze_code_object,
         analyze_directory,
         analyze_file,
     )
@@ -79,11 +92,19 @@ def analyze_command(
     if target_path.is_file():
         suffix = target_path.suffix.lower()
-        # For ISA files, use kernel_index parameter
-        if suffix in (".s", ".gcn", ".asm"):
+        # Code object files (.co) - need API
+        if suffix == ".co":
+            if not api_url or not auth_headers:
+                raise RuntimeError(
+                    "API authentication required for .co file analysis. "
+                    "Run 'wafer login' first."
+                )
+            result = analyze_code_object(target_path, api_url, auth_headers)
+        # ISA files - use kernel_index parameter
+        elif suffix in (".s", ".gcn", ".asm"):
             result = analyze_isa_file(target_path, kernel_index=kernel_index)
         else:
-            result = analyze_file(target_path)
+            result = analyze_file(target_path, api_url=api_url, auth_headers=auth_headers)
         if not result.success:
             raise RuntimeError(f"Analysis failed: {result.error}")
@@ -92,7 +113,12 @@ def analyze_command(
     # Directory analysis
     else:
-        batch_result = analyze_directory(target_path, recursive=recursive)
+        batch_result = analyze_directory(
+            target_path,
+            recursive=recursive,
+            api_url=api_url,
+            auth_headers=auth_headers,
+        )
         # Apply filter if specified
         if filter_expr:
@@ -217,11 +243,53 @@ def _result_to_text(result) -> str:
     """Format single result as human-readable text."""
     lines = []
-    if result.isa_analysis:
+    if result.code_object_analysis:
+        # .co file analysis (via API)
+        a = result.code_object_analysis
+        lines.extend([
+            f"Kernel: {a.kernel_name}",
+            f"Architecture: {a.architecture}",
+            f"Source: Code Object (.co)",
+            "",
+            "=== Registers ===",
+            f"  VGPRs: {a.vgpr_count}",
+            f"  SGPRs: {a.sgpr_count}",
+            f"  AGPRs: {a.agpr_count}",
+        ])
+        if a.vgpr_spill_count > 0 or a.sgpr_spill_count > 0:
+            lines.extend([
+                "",
+                "!!! SPILLS DETECTED !!!",
+                f"  VGPR spills: {a.vgpr_spill_count}",
+                f"  SGPR spills: {a.sgpr_spill_count}",
+            ])
+        else:
+            lines.append("  Spills: None (good)")
+        lines.extend([
+            "",
+            "=== Memory ===",
+            f"  LDS: {a.lds_bytes} bytes",
+            f"  Global loads: {a.global_loads}",
+            f"  Global stores: {a.global_stores}",
+            f"  LDS ops: {a.lds_ops}",
+            "",
+            "=== Instructions ===",
+            f"  MFMA: {a.mfma_count}",
+            f"  FMA: {a.fma_count}",
+            f"  Packed (v_pk_*): {a.packed_ops_count}",
+            f"  Full stalls (waitcnt 0): {a.waitcnt_full_stalls}",
+            f"  Barriers: {a.barriers}",
+        ])
+    elif result.isa_analysis:
+        # .s/.gcn/.asm file analysis (local parsing)
         a = result.isa_analysis
         lines.extend([
             f"Kernel: {a.kernel_name}",
             f"Architecture: {a.architecture}",
+            f"Source: ISA Assembly (.s)",
             "",
             "=== Registers ===",
             f"  VGPRs: {a.vgpr_count}",
@@ -330,10 +398,16 @@ def _result_to_text(result) -> str:
 def _result_to_csv(result) -> str:
     """Format single result as CSV."""
+    header = "kernel_name,architecture,source_type,vgpr_count,sgpr_count,vgpr_spills,sgpr_spills,mfma_count,lds_bytes,global_loads,global_stores"
+    if result.code_object_analysis:
+        a = result.code_object_analysis
+        row = f"{a.kernel_name},{a.architecture},code_object,{a.vgpr_count},{a.sgpr_count},{a.vgpr_spill_count},{a.sgpr_spill_count},{a.mfma_count},{a.lds_bytes},{a.global_loads},{a.global_stores}"
+        return f"{header}\n{row}"
     if result.isa_analysis:
         a = result.isa_analysis
-        header = "kernel_name,architecture,vgpr_count,sgpr_count,spill_count,mfma_count,mfma_density_pct,occupancy"
-        row = f"{a.kernel_name},{a.architecture},{a.vgpr_count},{a.sgpr_count},{a.spill_count},{a.mfma_count},{a.mfma_density_pct:.2f},{a.theoretical_occupancy}"
+        row = f"{a.kernel_name},{a.architecture},isa_assembly,{a.vgpr_count},{a.sgpr_count},{a.vgpr_spill_count},{a.sgpr_spill_count},{a.mfma_count},{a.lds_size},{a.global_load_count},{a.global_store_count}"
         return f"{header}\n{row}"
     return "# Unsupported format for CSV"
@@ -362,7 +436,15 @@ def _batch_to_text(batch_result) -> str:
     # Show individual results
     for result in batch_result.results:
-        if result.success and result.isa_analysis:
+        if result.success and result.code_object_analysis:
+            a = result.code_object_analysis
+            spills = a.vgpr_spill_count + a.sgpr_spill_count
+            status = "⚠️" if spills > 0 else "✓"
+            lines.append(
+                f"  {status} {result.file_path}: "
+                f"VGPRs={a.vgpr_count}, spills={spills}, MFMA={a.mfma_count}"
+            )
+        elif result.success and result.isa_analysis:
             a = result.isa_analysis
             status = "⚠️" if a.spill_count > 0 else "✓"
             lines.append(
@@ -377,15 +459,22 @@ def _batch_to_text(batch_result) -> str:
 def _batch_to_csv(batch_result) -> str:
     """Format batch results as CSV."""
-    lines = ["file_path,kernel_name,architecture,vgpr_count,sgpr_count,spill_count,mfma_count,mfma_density_pct,occupancy"]
+    lines = ["file_path,kernel_name,architecture,source_type,vgpr_count,sgpr_count,vgpr_spills,sgpr_spills,mfma_count,lds_bytes"]
     for result in batch_result.results:
-        if result.success and result.isa_analysis:
+        if result.success and result.code_object_analysis:
+            a = result.code_object_analysis
+            lines.append(
+                f"{result.file_path},{a.kernel_name},{a.architecture},code_object,"
+                f"{a.vgpr_count},{a.sgpr_count},{a.vgpr_spill_count},{a.sgpr_spill_count},"
+                f"{a.mfma_count},{a.lds_bytes}"
+            )
+        elif result.success and result.isa_analysis:
             a = result.isa_analysis
             lines.append(
-                f"{result.file_path},{a.kernel_name},{a.architecture},"
-                f"{a.vgpr_count},{a.sgpr_count},{a.spill_count},"
-                f"{a.mfma_count},{a.mfma_density_pct:.2f},{a.theoretical_occupancy}"
+                f"{result.file_path},{a.kernel_name},{a.architecture},isa_assembly,"
+                f"{a.vgpr_count},{a.sgpr_count},{a.vgpr_spill_count},{a.sgpr_spill_count},"
+                f"{a.mfma_count},{a.lds_size}"
             )
     return "\n".join(lines)
@@ -416,12 +505,24 @@ def _apply_filter(batch_result, filter_expr: str):
     }
     metric = metric_map.get(metric, metric)
-    # Filter function
+    # Filter function - supports both isa_analysis and code_object_analysis
     def passes_filter(result):
-        if not result.success or not result.isa_analysis:
+        if not result.success:
             return False
-        actual = getattr(result.isa_analysis, metric, None)
+        # Try to get metric from either analysis type
+        actual = None
+        if result.isa_analysis:
+            actual = getattr(result.isa_analysis, metric, None)
+        elif result.code_object_analysis:
+            # Map isa_analysis metric names to code_object_analysis equivalents
+            co_metric_map = {
+                "spill_count": "vgpr_spill_count",  # Use vgpr_spill_count as proxy
+                "lds_size": "lds_bytes",
+            }
+            co_metric = co_metric_map.get(metric, metric)
+            actual = getattr(result.code_object_analysis, co_metric, None)
         if actual is None:
             return False

wafer-cli 0.2.9__py3-none-any.whl → 0.2.11__py3-none-any.whl

wafer-cli 0.2.9__py3-none-any.whl → 0.2.11__py3-none-any.whl