wafer-cli 0.2.24-py3-none-any.whl → 0.2.25-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
wafer/cli.py CHANGED
@@ -194,11 +194,16 @@ def complete_target_name(incomplete: str) -> list[str]:

  # =============================================================================
  # Core subcommand groups (visible in --help)
+ #
+ # TODO: Further consolidate top-level commands to reduce --help surface area.
+ # Candidates:
+ # - compare → wafer nvidia compare or keep top-level (cross-platform)
+ # - guide/skill/demo → wafer onboard {guide,skill,demo}
  # =============================================================================

  # Config management (includes targets as nested subcommand)
  config_app = typer.Typer(help="Manage CLI configuration and local GPU targets")
- app.add_typer(config_app, name="config")
+ app.add_typer(config_app, name="config", rich_help_panel="Configuration")

  # Target management - nested under config
  targets_app = typer.Typer(
@@ -218,7 +223,7 @@ config_app.add_typer(targets_app, name="targets")
  workspaces_app = typer.Typer(
  help="""Manage cloud GPU workspaces for remote development.

- Workspaces are on-demand cloud GPU environments. Requires authentication (wafer login).
+ Workspaces are on-demand cloud GPU environments. Requires authentication (wafer auth login).

  Available GPUs:
  MI300X AMD Instinct MI300X (192GB HBM3, ROCm)
@@ -231,21 +236,21 @@ Commands:
  wafer workspaces sync dev ./project # Sync files
  wafer workspaces delete dev # Clean up"""
  )
- app.add_typer(workspaces_app, name="workspaces")
+ app.add_typer(workspaces_app, name="workspaces", rich_help_panel="Infrastructure")

- # SSH Key management (BYOK - Bring Your Own Key)
+ # SSH Key management (BYOK - Bring Your Own Key) - nested under config
  ssh_keys_app = typer.Typer(
  help="""Manage SSH public keys for workspace access.

  Register your SSH public keys here. These keys are installed in all workspaces
  you provision, enabling SSH access from any machine with your private key.

- wafer ssh-keys list # List registered keys
- wafer ssh-keys add # Add key (auto-detects ~/.ssh/id_ed25519.pub)
- wafer ssh-keys add ~/.ssh/id_rsa.pub --name laptop # Add specific key
- wafer ssh-keys remove <key-id> # Remove a key"""
+ wafer config ssh-keys list # List registered keys
+ wafer config ssh-keys add # Add key (auto-detects ~/.ssh/id_ed25519.pub)
+ wafer config ssh-keys add ~/.ssh/id_rsa.pub --name laptop # Add specific key
+ wafer config ssh-keys remove <key-id> # Remove a key"""
  )
- app.add_typer(ssh_keys_app, name="ssh-keys")
+ config_app.add_typer(ssh_keys_app, name="ssh-keys")

  # Target operations (exec/ssh/sync on configured targets)
  targets_ops_app = typer.Typer(
@@ -261,22 +266,22 @@ Useful for exploratory work, debugging, or custom scripts.
  Supports: RunPod, DigitalOcean (auto-provisions), SSH targets (baremetal/vm).
  Configure targets with: wafer config targets init ..."""
  )
- app.add_typer(targets_ops_app, name="targets")
+ app.add_typer(targets_ops_app, name="targets", rich_help_panel="Infrastructure")

- # Billing management
+ # Billing management - nested under config
  billing_app = typer.Typer(help="Manage billing, credits, and subscription")
- app.add_typer(billing_app, name="billing")
+ config_app.add_typer(billing_app, name="billing")

  # Corpus management
  corpus_app = typer.Typer(help="Download and manage GPU documentation")
- app.add_typer(corpus_app, name="corpus")
+ app.add_typer(corpus_app, name="corpus", rich_help_panel="Kernel Development")

  # Evaluate (supports multiple kernel formats)
  evaluate_app = typer.Typer(
  help="Test kernel correctness and performance",
  invoke_without_command=True,
  )
- app.add_typer(evaluate_app, name="evaluate")
+ app.add_typer(evaluate_app, name="evaluate", rich_help_panel="Kernel Development")

  # Nested subcommand for kernelbench format
  kernelbench_app = typer.Typer(
@@ -305,7 +310,7 @@ app.add_typer(dev_app, name="dev")
  # =============================================================================

  nvidia_app = typer.Typer(help="NVIDIA GPU profiling and analysis tools")
- app.add_typer(nvidia_app, name="nvidia")
+ app.add_typer(nvidia_app, name="nvidia", rich_help_panel="Profiling")

  # NCU analysis - under nvidia
  ncu_app = typer.Typer(help="Nsight Compute profile analysis")
@@ -328,18 +333,25 @@ nvidia_app.add_typer(tracelens_app, name="tracelens")
  # =============================================================================

  amd_app = typer.Typer(help="AMD GPU profiling and analysis tools")
- app.add_typer(amd_app, name="amd")
+ app.add_typer(amd_app, name="amd", rich_help_panel="Profiling")

  # Unified ISA Analyzer - supports both .co files and Triton artifacts
  isa_app = typer.Typer(help="ISA analysis for AMD GPU kernels (.co, .s, .ll, .ttgir files)")
  amd_app.add_typer(isa_app, name="isa")

+ # =============================================================================
+ # Trace comparison (wafer compare)
+ # =============================================================================
+
+ compare_app = typer.Typer(help="Compare GPU traces across platforms (AMD vs NVIDIA)")
+ app.add_typer(compare_app, name="compare", rich_help_panel="Profiling")
+
  # =============================================================================
  # Roofline analysis (wafer roofline)
  # =============================================================================


- @app.command("roofline")
+ @app.command("roofline", rich_help_panel="Kernel Development")
  def roofline_cmd(
  gpu: str | None = typer.Option(
  None, "--gpu", "-g", help="GPU name (e.g., H100, B200, MI300X, A100)"
@@ -430,7 +442,7 @@ def roofline_cmd(
  # =============================================================================

  skill_app = typer.Typer(help="Manage AI coding assistant skills (Claude Code, Codex)")
- app.add_typer(skill_app, name="skill")
+ app.add_typer(skill_app, name="skill", rich_help_panel="Onboarding")


  @skill_app.command("install")
@@ -594,14 +606,17 @@ def skill_status() -> None:


  # =============================================================================
- # Provider auth management (wafer auth ...)
+ # Authentication (wafer auth ...)
  # =============================================================================

- provider_auth_app = typer.Typer(help="Manage API keys for cloud GPU providers")
- app.add_typer(provider_auth_app, name="auth")
+ auth_app = typer.Typer(help="Authenticate with Wafer and cloud GPU providers")
+ app.add_typer(auth_app, name="auth", rich_help_panel="Configuration")

+ providers_app = typer.Typer(help="Manage API keys for cloud GPU providers (RunPod, DigitalOcean, etc.)")
+ auth_app.add_typer(providers_app, name="providers")

- @provider_auth_app.command("login")
+
+ @providers_app.command("login")
  def provider_auth_login(
  provider: str = typer.Argument(
  ...,
@@ -620,10 +635,10 @@ def provider_auth_login(
  (e.g., ANTHROPIC_API_KEY) take precedence over stored keys.

  Examples:
- wafer auth login anthropic --api-key sk-ant-xxx
- wafer auth login runpod --api-key rp_xxx
- wafer auth login openai --api-key sk-xxx
- echo $API_KEY | wafer auth login anthropic
+ wafer auth providers login anthropic --api-key sk-ant-xxx
+ wafer auth providers login runpod --api-key rp_xxx
+ wafer auth providers login openai --api-key sk-xxx
+ echo $API_KEY | wafer auth providers login anthropic
  """
  import sys

@@ -653,7 +668,7 @@ def provider_auth_login(
  typer.echo("Stored in: ~/.wafer/auth.json")


- @provider_auth_app.command("logout")
+ @providers_app.command("logout")
  def provider_auth_logout(
  provider: str = typer.Argument(
  ...,
@@ -663,8 +678,8 @@ def provider_auth_logout(
  """Remove stored API key for a cloud GPU provider.

  Examples:
- wafer auth logout runpod
- wafer auth logout digitalocean
+ wafer auth providers logout runpod
+ wafer auth providers logout digitalocean
  """
  from wafer_core.auth import PROVIDERS, remove_api_key

@@ -680,7 +695,7 @@ def provider_auth_logout(
  typer.echo(f"No stored API key found for {PROVIDERS[provider]['display_name']}")


- @provider_auth_app.command("status")
+ @providers_app.command("status")
  def provider_auth_status() -> None:
  """Show authentication status for all cloud GPU providers.

@@ -688,7 +703,7 @@ def provider_auth_status() -> None:
  the keys are coming from (environment variable or auth.json).

  Example:
- wafer auth status
+ wafer auth providers status
  """
  from wafer_core.auth import get_all_auth_status

@@ -703,7 +718,7 @@ def provider_auth_status() -> None:
  typer.echo(f" {status.display_name}: ✓ {status.key_preview} {source_str}")
  else:
  typer.echo(f" {status.display_name}: ✗ Not configured")
- typer.echo(f" Run: wafer auth login {status.provider}")
+ typer.echo(f" Run: wafer auth providers login {status.provider}")
  typer.echo(f" Or set: {status.key_url}")

  typer.echo("")
@@ -1248,7 +1263,7 @@ def config_show_legacy() -> None:
  config_show_new()


- @app.command()
+ @app.command(rich_help_panel="Kernel Development")
  def agent( # noqa: PLR0913
  prompt: str | None = typer.Argument(
  None,
@@ -1318,7 +1333,7 @@ def agent( # noqa: PLR0913
  None,
  "--model",
  "-m",
- help="Model override (default: claude-sonnet-4-5)",
+ help="Model override (default: claude-opus-4-5)",
  ),
  json_output: bool = typer.Option(
  False,
@@ -1347,6 +1362,11 @@ def agent( # noqa: PLR0913
  "--no-sandbox",
  help="Disable OS-level sandboxing (YOU accept liability for any damage caused by the agent)",
  ),
+ no_proxy: bool = typer.Option(
+ False,
+ "--no-proxy",
+ help="Skip wafer proxy, use ANTHROPIC_API_KEY directly",
+ ),
  ) -> None:
  """AI assistant for GPU kernel development.

@@ -1453,6 +1473,7 @@ def agent( # noqa: PLR0913
  template_args=parsed_template_args,
  corpus_path=corpus_path,
  no_sandbox=no_sandbox,
+ no_proxy=no_proxy,
  )


@@ -1527,7 +1548,11 @@ def evaluate( # noqa: PLR0913
  None, "--reference", help="Path to reference kernel file"
  ),
  test_cases: Path | None = typer.Option(
- None, "--test-cases", help="Path to test cases JSON file"
+ None,
+ "--test-cases",
+ help="Path to test cases JSON file. "
+ 'Format: [{"name": "small", "n": 1024, "seed": 42}, ...]. '
+ "Run 'wafer evaluate make-template' to generate an example.",
  ),
  target: str | None = typer.Option(
  None,
@@ -1557,20 +1582,20 @@ def evaluate( # noqa: PLR0913

  Examples:
  # Basic correctness check
- wafer evaluate --impl kernel.py --reference ref.py --test-cases tests.json
+ wafer evaluate gpumode --impl kernel.py --reference ref.py --test-cases tests.json

  # With benchmarking on a specific target
- wafer evaluate --impl kernel.py --reference ref.py --test-cases tests.json \\
+ wafer evaluate gpumode --impl kernel.py --reference ref.py --test-cases tests.json \\
  --target vultr-b200 --benchmark

  # Full evaluation with defensive timing (detects cheating)
- wafer evaluate --impl kernel.py --reference ref.py --test-cases tests.json \\
+ wafer evaluate gpumode --impl kernel.py --reference ref.py --test-cases tests.json \\
  --benchmark --defensive

  Subcommands:
  gpumode Use GPUMode format (functional) - RECOMMENDED
  kernelbench Use KernelBench format (ModelNew class)
- make-template Generate template files for this format (deprecated)
+ make-template Generate template files for this format
  """
  # If a subcommand is being invoked, skip the main evaluation logic
  if ctx.invoked_subcommand is not None:
@@ -1724,7 +1749,7 @@ def evaluate_make_template(
  typer.echo(f" 2. Edit {output_dir / 'reference.py'} with the ground truth + input generator")
  typer.echo(f" 3. Edit {output_dir / 'test_cases.json'} with your test parameters")
  typer.echo(" 4. Run:")
- typer.echo(f" wafer evaluate --impl {output_dir / 'kernel.py'} \\")
+ typer.echo(f" wafer evaluate gpumode --impl {output_dir / 'kernel.py'} \\")
  typer.echo(f" --reference {output_dir / 'reference.py'} \\")
  typer.echo(f" --test-cases {output_dir / 'test_cases.json'} --benchmark")

@@ -2275,7 +2300,11 @@ def gpumode_evaluate( # noqa: PLR0913, PLR0915
  None, "--reference", help="Path to reference kernel file"
  ),
  test_cases: Path | None = typer.Option(
- None, "--test-cases", help="Path to test cases JSON file"
+ None,
+ "--test-cases",
+ help="Path to test cases JSON file. "
+ 'Format: [{"name": "small", "n": 1024, "seed": 42}, ...]. '
+ "Run 'wafer evaluate make-template' to generate an example.",
  ),
  target: str | None = typer.Option(
  None,
@@ -2343,6 +2372,13 @@ def gpumode_evaluate( # noqa: PLR0913, PLR0915
  err=True,
  )
  typer.echo("", err=True)
+ if "--test-cases" in missing_args:
+ typer.echo(
+ "Tip: Run 'wafer evaluate make-template' to generate template files "
+ "including test_cases.json.",
+ err=True,
+ )
+ typer.echo("", err=True)
  typer.echo("Run 'wafer evaluate gpumode --help' for full options.", err=True)
  typer.echo("Run 'wafer evaluate gpumode download' to download problem sets.", err=True)
  raise typer.Exit(1)
@@ -2749,7 +2785,7 @@ def remote_run( # noqa: PLR0913
  # =============================================================================


- @app.command("login")
+ @auth_app.command("login")
  def login(
  token: str | None = typer.Option(
  None, "--token", "-t", help="Access token (skip browser OAuth)"
@@ -2774,7 +2810,7 @@ def login(
  Uses the API environment from config (see 'wafer config show').

  SSH Users (Easiest):
- - Just run: wafer login
+ - Just run: wafer auth login
  - Visit the URL and enter the code shown
  - No port forwarding needed!

@@ -2784,17 +2820,17 @@ def login(

  Manual token option:
  - Visit auth.wafer.ai, authenticate, copy token from URL
- - Run: wafer login --token <paste-token>
+ - Run: wafer auth login --token <paste-token>

  Examples:
- wafer login # device code on SSH, browser on local
- wafer login --no-device-code # force browser (needs port forwarding on SSH)
- wafer login --port 9000 # custom port for browser flow
- wafer login --token xyz # manual token (no browser)
+ wafer auth login # device code on SSH, browser on local
+ wafer auth login --no-device-code # force browser (needs port forwarding on SSH)
+ wafer auth login --port 9000 # custom port for browser flow
+ wafer auth login --token xyz # manual token (no browser)

  # Change environment:
  wafer config set api.environment staging
- wafer login
+ wafer auth login
  """
  import httpx

@@ -2878,7 +2914,7 @@ def login(
  typer.echo("Token saved to ~/.wafer/credentials.json")


- @app.command("logout")
+ @auth_app.command("logout")
  def logout() -> None:
  """Remove stored credentials."""
  from . import analytics
@@ -2895,7 +2931,7 @@ def logout() -> None:
  typer.echo("Not logged in (no credentials found).")


- @app.command("whoami")
+ @auth_app.command("whoami")
  def whoami(
  verify: bool = typer.Option(False, "--verify", "-v", help="Verify token with API"),
  refresh: bool = typer.Option(False, "--refresh", "-r", help="Refresh token if expired"),
@@ -2909,7 +2945,7 @@ def whoami(

  creds = load_credentials()
  if creds is None:
- typer.echo("Not logged in. Run: wafer login")
+ typer.echo("Not logged in. Run: wafer auth login")
  raise typer.Exit(1)

  if verify or refresh:
@@ -2917,7 +2953,7 @@ def whoami(
  # Try to get valid token with auto-refresh
  token = get_valid_token()
  if token is None:
- typer.echo("Token expired and refresh failed. Run: wafer login", err=True)
+ typer.echo("Token expired and refresh failed. Run: wafer auth login", err=True)
  raise typer.Exit(1)
  if token != creds.access_token:
  typer.echo("Token refreshed successfully")
@@ -2930,10 +2966,10 @@ def whoami(
  except Exception as e:
  if creds.refresh_token and not refresh:
  typer.echo(f"Token expired: {e}", err=True)
- typer.echo("Try: wafer whoami --refresh", err=True)
+ typer.echo("Try: wafer auth whoami --refresh", err=True)
  else:
  typer.echo(f"Token invalid or expired: {e}", err=True)
- typer.echo("Run: wafer login", err=True)
+ typer.echo("Run: wafer auth login", err=True)
  raise typer.Exit(1) from None
  elif creds.email:
  typer.echo(creds.email)
@@ -2941,7 +2977,7 @@ def whoami(
  typer.echo("Logged in (email not available)")


- @app.command("guide")
+ @app.command("guide", rich_help_panel="Onboarding")
  def guide() -> None:
  """Show the Wafer CLI usage guide.

@@ -2972,7 +3008,7 @@ demo_app = typer.Typer(
  wafer demo trace Analyze a sample performance trace
  wafer demo eval Run kernel evaluation on cloud GPU (requires login)"""
  )
- app.add_typer(demo_app, name="demo")
+ app.add_typer(demo_app, name="demo", rich_help_panel="Onboarding")

  DEMO_TRACES_URL = "https://github.com/wafer-ai/wafer/raw/main/apps/wafer-cli/wafer/demo_data"
  DEMO_DIR = Path.home() / ".cache" / "wafer" / "demo"
@@ -3192,7 +3228,7 @@ def demo_eval(
  """Demo: Evaluate a kernel on a cloud GPU.

  Creates a workspace, runs a sample Triton kernel evaluation, and cleans up.
- Requires authentication (wafer login).
+ Requires authentication (wafer auth login).

  Example:
  wafer demo eval
@@ -3207,7 +3243,7 @@ def demo_eval(
  # Check auth first
  creds = load_credentials()
  if not creds:
- typer.echo("Error: Not authenticated. Run: wafer login")
+ typer.echo("Error: Not authenticated. Run: wafer auth login")
  raise typer.Exit(1)

  if not yes:
@@ -3856,12 +3892,16 @@ def targets_add(

  @targets_app.command("list")
  def targets_list() -> None:
- """List all configured targets.
+ """List all configured targets with live provider status.

  Example:
  wafer config targets list
  """
- from .targets import get_default_target, list_targets
+ import socket
+
+ import trio
+
+ from .targets import get_default_target, list_targets, load_target, remove_target

  targets = list_targets()
  default = get_default_target()
@@ -3871,10 +3911,146 @@ def targets_list() -> None:
  typer.echo("Add one with: wafer config targets add <path/to/target.toml>")
  return

+ def _parse_ssh_target(ssh_target: str) -> tuple[str, int]:
+ """Extract (host, port) from user@host:port string."""
+ parts = ssh_target.rsplit(":", 1)
+ host_part = parts[0]
+ port = int(parts[1]) if len(parts) > 1 else 22
+ if "@" in host_part:
+ host = host_part.split("@", 1)[1]
+ else:
+ host = host_part
+ return (host, port)
+
+ async def _get_live_provider_endpoints() -> set[tuple[str, int]]:
+ """Query RunPod + DO APIs. Returns set of live (ip, port) endpoints."""
+ from wafer_core.targets.digitalocean import list_running_droplets
+ from wafer_core.targets.runpod import sync_pods_from_api
+
+ live_endpoints: set[tuple[str, int]] = set()
+
+ async def _fetch_runpod() -> None:
+ try:
+ pods = await sync_pods_from_api()
+ for p in pods:
+ live_endpoints.add((p.public_ip, p.ssh_port))
+ except Exception:
+ pass
+
+ async def _fetch_do() -> None:
+ try:
+ droplets = await list_running_droplets()
+ for d in droplets:
+ live_endpoints.add((d.public_ip, d.ssh_port))
+ except Exception:
+ pass
+
+ async with trio.open_nursery() as nursery:
+ nursery.start_soon(_fetch_runpod)
+ nursery.start_soon(_fetch_do)
+
+ return live_endpoints
+
+ async def _get_target_status(
+ name: str,
+ live_endpoints: set[tuple[str, int]],
+ ) -> tuple[str, str, str]:
+ """Returns (name, status, ssh_info)."""
+ from wafer_core.targets.digitalocean import (
+ _remove_droplet_from_state,
+ check_droplet_running,
+ get_droplet_state,
+ )
+ from wafer_core.targets.runpod import (
+ _remove_pod_from_state,
+ check_pod_running,
+ get_pod_state,
+ )
+ from wafer_core.utils.kernel_utils.targets.config import (
+ BaremetalTarget,
+ DigitalOceanTarget,
+ ModalTarget,
+ RunPodTarget,
+ )
+
+ try:
+ target = load_target(name)
+ except (FileNotFoundError, ValueError, AssertionError, TypeError):
+ return (name, "error", "")
+
+ if isinstance(target, RunPodTarget):
+ pod = get_pod_state(name)
+ if not pod:
+ return (name, "no instance", "")
+ if await check_pod_running(pod.pod_id):
+ return (name, "running", f"{pod.ssh_username}@{pod.public_ip}:{pod.ssh_port}")
+ _remove_pod_from_state(name)
+ return (name, "stopped", "")
+
+ if isinstance(target, DigitalOceanTarget):
+ droplet = get_droplet_state(name)
+ if not droplet:
+ return (name, "no instance", "")
+ if await check_droplet_running(droplet.droplet_id):
+ return (
+ name,
+ "running",
+ f"{droplet.ssh_username}@{droplet.public_ip}:{droplet.ssh_port}",
+ )
+ _remove_droplet_from_state(name)
+ return (name, "stopped", "")
+
+ if isinstance(target, BaremetalTarget):
+ ssh_target = target.ssh_target
+ host, port = _parse_ssh_target(ssh_target)
+
+ def _tcp_check() -> bool:
+ try:
+ sock = socket.create_connection((host, port), timeout=2)
+ sock.close()
+ return True
+ except OSError:
+ return False
+
+ reachable = await trio.to_thread.run_sync(_tcp_check)
+ if reachable:
+ return (name, "reachable", ssh_target)
+
+ # Unreachable + has a provider = backed by an ephemeral instance.
+ # If not in the live provider listing, the instance is gone — remove config.
+ if target.provider and (host, port) not in live_endpoints:
+ remove_target(name)
+ return (name, "removed (dead pod)", ssh_target)
+
+ return (name, "unreachable", ssh_target)
+
+ if isinstance(target, ModalTarget):
+ return (name, "serverless", "")
+
+ # Unknown target type
+ return (name, "unknown", "")
+
+ async def _gather_statuses() -> list[tuple[str, str, str]]:
+ live_endpoints = await _get_live_provider_endpoints()
+ results: list[tuple[str, str, str]] = [("", "", "")] * len(targets)
+
+ async def _check(i: int, name: str) -> None:
+ results[i] = await _get_target_status(name, live_endpoints)
+
+ async with trio.open_nursery() as nursery:
+ for i, name in enumerate(targets):
+ nursery.start_soon(_check, i, name)
+
+ return results
+
+ statuses = trio.run(_gather_statuses)
+
  typer.echo("Configured targets:")
- for name in targets:
+ for name, status, ssh_info in statuses:
  marker = " (default)" if name == default else ""
- typer.echo(f" {name}{marker}")
+ label = f" {name}{marker}"
+ detail = f" {ssh_info}" if ssh_info else ""
+ typer.echo(f"{label:<40}{status}{detail}")


  @targets_app.command("show")
@@ -4089,10 +4265,19 @@ def targets_cleanup(
  # Known libraries that can be installed on targets
  # TODO: Consider adding HipKittens to the default RunPod/DO Docker images
  # so this install step isn't needed. For now, this command handles it.
+ # Architecture → branch mapping for libraries that ship per-arch branches.
+ # "default" is used when the detected arch has no explicit entry.
+ _ARCH_BRANCHES: dict[str, dict[str, str]] = {
+ "hipkittens": {
+ "gfx942": "cdna3", # MI300X, MI325X
+ "default": "main", # MI350X, MI355X, and future CDNA4+
+ },
+ }
+
  INSTALLABLE_LIBRARIES: dict[str, dict[str, object]] = {
  "hipkittens": {
- "description": "HipKittens - AMD port of ThunderKittens for MI300X",
- "git_url": "https://github.com/HazyResearch/hipkittens.git",
+ "description": "HipKittens - AMD port of ThunderKittens",
+ "git_url": "https://github.com/HazyResearch/HipKittens.git",
  "install_path": "/opt/hipkittens",
  "requires_amd": True,
  },
@@ -4105,6 +4290,38 @@ INSTALLABLE_LIBRARIES: dict[str, dict[str, object]] = {
  }


+ def _resolve_gfx_arch(target: object, ssh_cmd: list[str]) -> str | None:
+ """Return the gfx architecture string for *target*.
+
+ 1. If the target config already carries a compute_capability, map it.
+ 2. Otherwise SSH in and probe with ``rocminfo``.
+ Returns None only if detection fails entirely.
+ """
+ import subprocess
+
+ from .evaluate import AMD_CC_TO_ARCH
+
+ cc = getattr(target, "compute_capability", None)
+ if cc and cc in AMD_CC_TO_ARCH:
+ return AMD_CC_TO_ARCH[cc]
+
+ typer.echo(" Detecting GPU architecture via rocminfo...")
+ probe_script = "rocminfo 2>/dev/null | grep -oP 'gfx\\d+' | head -1"
+ result = subprocess.run(
+ ssh_cmd + [probe_script],
+ capture_output=True,
+ text=True,
+ timeout=30,
+ )
+ arch = result.stdout.strip()
+ if result.returncode == 0 and arch.startswith("gfx"):
+ typer.echo(f" Detected: {arch}")
+ return arch
+
+ typer.echo(" Warning: could not detect GPU architecture", err=True)
+ return None
+
+
  @targets_app.command("install")
  def targets_install(
  name: str = typer.Argument(..., help="Target name"),
@@ -4115,6 +4332,9 @@ def targets_install(
  Installs header-only libraries like HipKittens on remote targets.
  Safe to run multiple times - will skip if already installed.

+ For libraries with per-architecture branches (e.g. HipKittens), the
+ correct branch is selected automatically based on the target's GPU.
+
  Available libraries:
  hipkittens - HipKittens (AMD ThunderKittens port)
  repair-headers - Fix ROCm thrust headers (after hipify corruption)
@@ -4188,14 +4408,22 @@ def targets_install(
  install_path = lib_info["install_path"]
  git_url = lib_info["git_url"]

- # Idempotent install script
+ # Resolve the branch for arch-aware libraries
+ branch = "main"
+ arch_map = _ARCH_BRANCHES.get(library)
+ if arch_map:
+ gfx = await trio.to_thread.run_sync(lambda: _resolve_gfx_arch(target, ssh_cmd))
+ branch = arch_map.get(gfx, arch_map["default"]) if gfx else arch_map["default"]
+ typer.echo(f" Branch: {branch} (arch={gfx or 'unknown'})")
+
+ # Idempotent: if already cloned, ensure correct branch & pull
  install_script = f"""
  if [ -d "{install_path}" ]; then
  echo "ALREADY_INSTALLED: {install_path} exists"
- cd {install_path} && git pull --quiet 2>/dev/null || true
+ cd {install_path} && git fetch --quiet origin && git checkout {branch} --quiet && git pull --quiet origin {branch}
  else
  echo "INSTALLING: cloning to {install_path}"
- git clone --quiet {git_url} {install_path}
+ git clone --quiet --branch {branch} {git_url} {install_path}
  fi
  echo "DONE"
  """
@@ -4373,8 +4601,8 @@ def billing_usage(
  """Show current billing usage and subscription info.

  Example:
- wafer billing
- wafer billing --json
+ wafer config billing
+ wafer config billing --json
  """
  # Only show usage if no subcommand was invoked
  if ctx.invoked_subcommand is not None:
@@ -4402,9 +4630,9 @@ def billing_topup(
  Opens a Stripe checkout page to add credits. Default amount is $25.

  Example:
- wafer billing topup # Add $25
- wafer billing topup 100 # Add $100
- wafer billing topup --no-browser # Print URL instead
+ wafer config billing topup # Add $25
+ wafer config billing topup 100 # Add $100
+ wafer config billing topup --no-browser # Print URL instead
  """
  import webbrowser

@@ -4450,8 +4678,8 @@ def billing_portal(
  Manage your subscription, update payment method, or view invoices.

  Example:
- wafer billing portal
- wafer billing portal --no-browser
+ wafer config billing portal
+ wafer config billing portal --no-browser
  """
  import webbrowser

@@ -4488,8 +4716,8 @@ def ssh_keys_list(
  """List all registered SSH public keys.

  Example:
- wafer ssh-keys list
- wafer ssh-keys list --json
+ wafer config ssh-keys list
+ wafer config ssh-keys list --json
  """
  from .ssh_keys import list_ssh_keys

@@ -4515,9 +4743,9 @@ def ssh_keys_add(
  id_ed25519.pub, id_rsa.pub, id_ecdsa.pub.

  Example:
- wafer ssh-keys add # Auto-detect
- wafer ssh-keys add ~/.ssh/id_rsa.pub # Specific file
- wafer ssh-keys add ~/.ssh/id_ed25519.pub --name laptop
+ wafer config ssh-keys add # Auto-detect
+ wafer config ssh-keys add ~/.ssh/id_rsa.pub # Specific file
+ wafer config ssh-keys add ~/.ssh/id_ed25519.pub --name laptop
  """
  from .ssh_keys import add_ssh_key

@@ -4536,10 +4764,10 @@ def ssh_keys_remove(
  ) -> None:
  """Remove an SSH public key.

- Get the key ID from 'wafer ssh-keys list'.
+ Get the key ID from 'wafer config ssh-keys list'.

  Example:
- wafer ssh-keys remove abc123-def456-...
+ wafer config ssh-keys remove abc123-def456-...
  """
  from .ssh_keys import remove_ssh_key

@@ -4978,7 +5206,9 @@ def workspaces_sync(
  @workspaces_app.command("pull")
  def workspaces_pull(
  workspace: str = typer.Argument(..., help="Workspace name or ID"),
- remote_path: str = typer.Argument(..., help="Remote path in workspace (relative to /workspace or absolute)"),
+ remote_path: str = typer.Argument(
+ ..., help="Remote path in workspace (relative to /workspace or absolute)"
+ ),
  local_path: Path = typer.Argument(
  Path("."), help="Local destination path (default: current directory)"
  ),
@@ -5782,7 +6012,7 @@ def ncu_analyze(
  compute/memory throughput, and optimization recommendations.

  By default, uses local NCU if available, otherwise runs analysis
- remotely via wafer-api (requires authentication: wafer login).
+ remotely via wafer-api (requires authentication: wafer auth login).

  Use --target for direct SSH mode (like wafer remote-run --direct).
  Use --include-source to fetch SASS assembly with register/instruction data.
@@ -5877,7 +6107,7 @@ def nsys_analyze(
  Returns timeline events, kernel information, memory usage, and diagnostics.

  By default, uses local nsys if available, otherwise runs analysis
- remotely via wafer-api (requires authentication: wafer login).
+ remotely via wafer-api (requires authentication: wafer auth login).

  Supports multiple execution modes:
  - Local: Uses local nsys CLI (no GPU required for analysis)
@@ -6862,7 +7092,7 @@ def autotuner_results(
  raise typer.Exit(1) from None


- @app.command("capture")
+ @app.command("capture", rich_help_panel="Kernel Development")
  def capture_command( # noqa: PLR0915
  label: str = typer.Argument(
  ..., help="Label for this capture (e.g., 'baseline', 'optimized-v2')"
@@ -7527,6 +7757,144 @@ def isa_targets() -> None:
  typer.echo(output)


+ # =============================================================================
+ # Trace comparison commands
+ # =============================================================================
+
+
+ @compare_app.command("analyze")
+ def compare_analyze(
+ trace1: Path = typer.Argument(..., help="First trace file (AMD or NVIDIA)", exists=True),
+ trace2: Path = typer.Argument(..., help="Second trace file (AMD or NVIDIA)", exists=True),
+ format: str = typer.Option(
+ "text",
+ "--format",
+ "-f",
+ help="Output format: text, text-layers, csv, csv-layers, json",
+ ),
+ output: Path | None = typer.Option(
+ None, "--output", "-o", help="Output file (default: stdout)"
+ ),
+ phase: str = typer.Option(
+ "all",
+ "--phase",
+ help="Filter by phase: all, prefill, decode",
+ ),
+ layers: bool = typer.Option(False, "--layers", help="Show layer-wise performance breakdown"),
+ all: bool = typer.Option(
+ False, "--all", help="Show all items (no truncation for layers, operations, kernels)"
+ ),
+ stack_traces: bool = typer.Option(
+ False, "--stack-traces", help="Show Python stack traces for operations"
+ ),
+ json: bool = typer.Option(
+ False, "--json", hidden=True, help="Ignored (for compatibility with cliExecutor)"
+ ),
+ ) -> None:
+ """Compare GPU traces from AMD and NVIDIA platforms.
+
+ Analyzes performance differences between traces, identifying which operations
+ are faster/slower on each platform and providing kernel-level details.
+
+ Examples:
+ # Basic comparison (stdout)
+ wafer compare analyze amd_trace.json nvidia_trace.json
+
+ # Show layer-wise breakdown
+ wafer compare analyze amd_trace.json nvidia_trace.json --layers
+ wafer compare analyze amd_trace.json nvidia_trace.json --format text-layers
+
+ # Show all layers without truncation
+ wafer compare analyze amd_trace.json nvidia_trace.json --layers --all
+
+ # Show Python stack traces
+ wafer compare analyze amd_trace.json nvidia_trace.json --stack-traces
+
+ # Show all stack traces without truncation
+ wafer compare analyze amd_trace.json nvidia_trace.json --stack-traces --all
+
+ # Save to file
+ wafer compare analyze amd_trace.json nvidia_trace.json -o report.txt
+
+ # CSV output (operations) to file
+ wafer compare analyze amd_trace.json nvidia_trace.json --format csv -o operations.csv
+
+ # CSV output (layers) to file
+ wafer compare analyze amd_trace.json nvidia_trace.json --format csv-layers -o layers.csv
+
+ # JSON output to file
+ wafer compare analyze amd_trace.json nvidia_trace.json --format json -o report.json
+
+ # Analyze only prefill phase
+ wafer compare analyze amd_trace.json nvidia_trace.json --phase prefill
+ """
+ from .trace_compare import compare_traces
+
+ compare_traces(
+ trace1=trace1,
+ trace2=trace2,
+ output=output,
+ output_format=format,
+ phase=phase,
+ show_layers=layers,
+ show_all=all,
+ show_stack_traces=stack_traces,
+ )
+ _mark_command_success()
+
+
+ @compare_app.command("fusion")
+ def compare_fusion_cmd(
+ trace1: Path = typer.Argument(..., help="First trace file (AMD or NVIDIA)", exists=True),
+ trace2: Path = typer.Argument(..., help="Second trace file (AMD or NVIDIA)", exists=True),
+ format: str = typer.Option(
+ "text",
+ "--format",
+ "-f",
+ help="Output format: text, csv, json",
+ ),
+ output: Path | None = typer.Option(
+ None, "--output", "-o", help="Output file (default: stdout)"
+ ),
+ min_group_size: int = typer.Option(
+ 50,
+ "--min-group-size",
+ help="Minimum correlation group size to analyze",
+ ),
+ json: bool = typer.Option(
+ False, "--json", hidden=True, help="Ignored (for compatibility with cliExecutor)"
+ ),
+ ) -> None:
+ """Analyze kernel fusion differences between AMD and NVIDIA traces.
+
+ Detects which operations are fused differently on each platform by analyzing
+ how many kernel launches each platform uses for the same logical operations.
+
+ Examples:
+ # Basic fusion analysis (stdout)
+ wafer compare fusion amd_trace.json nvidia_trace.json
+
+ # Save to file
+ wafer compare fusion amd_trace.json nvidia_trace.json -o fusion_report.txt
+
+ # JSON output to file
+ wafer compare fusion amd_trace.json nvidia_trace.json --format json -o fusion.json
+
+ # CSV output to file
+ wafer compare fusion amd_trace.json nvidia_trace.json --format csv -o fusion.csv
+ """
+ from .trace_compare import compare_fusion
+
+ compare_fusion(
+ trace1=trace1,
+ trace2=trace2,
+ output=output,
+ format_type=format,
+ min_group_size=min_group_size,
+ )
+ _mark_command_success()
+
+
  def main() -> None:
  """Entry point for wafer CLI."""
  app()
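
Most of the changes above follow one pattern: top-level Typer sub-apps and commands gain a rich_help_panel so that wafer --help groups them into panels (Configuration, Infrastructure, Kernel Development, Profiling, Onboarding), and several commands move under nested sub-apps (wafer auth providers ..., wafer config ssh-keys ..., wafer config billing ...). The sketch below is not taken from the package; it is a minimal illustration of that Typer pattern, using made-up app and command names, assuming only the rich_help_panel and add_typer behavior the diff itself relies on.

# sketch.py - illustrative only; names do not come from wafer-cli
import typer

app = typer.Typer()

config_app = typer.Typer(help="Manage CLI configuration")
auth_app = typer.Typer(help="Authenticate with the service and cloud providers")
providers_app = typer.Typer(help="Manage provider API keys")

# Sub-apps grouped into named panels in --help output.
app.add_typer(config_app, name="config", rich_help_panel="Configuration")
app.add_typer(auth_app, name="auth", rich_help_panel="Configuration")

# Nested sub-app: `sketch auth providers login ...`
auth_app.add_typer(providers_app, name="providers")


@app.command("roofline", rich_help_panel="Kernel Development")
def roofline() -> None:
    """Plain commands can join a panel the same way."""
    typer.echo("roofline analysis placeholder")


@providers_app.command("login")
def providers_login(provider: str) -> None:
    """Nested command, e.g. `sketch auth providers login runpod`."""
    typer.echo(f"would store an API key for {provider}")


if __name__ == "__main__":
    app()

Running `python sketch.py --help` renders each panel as its own section of the help output, which is presumably why commands such as `wafer login` become `wafer auth login` throughout the updated help text in this release.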